def main():
    """Ingest the Springer Journals List 2019 spreadsheet into Orpheus.

    Registers the spreadsheet as an Orpheus source, matches/creates the
    'Springer' publisher node, then, for each journal row of the CSV:
    matches/creates a journal node, resolves the preferred-name node,
    records the journal's OA status and attaches a 'SPRINGER COMPACT'
    institutional deal policy.
    """
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    source_parser = generic_parser.SourceParser(
        'Springer Journals List 2019',
        url='https://resource-cms.springernature.com/springer-cms/rest/v1/content/829308/data/v3')
    SOURCE_ID = source_parser.match_or_create_source()
    # Match or create the publisher node for Springer; act_on_orpheus_match
    # returns the resolved node id/record pair.
    p_parser = generic_parser.NodeParser(name='Springer', type='PUBLISHER')
    p_node_id, p_node_record, p_match_type = p_parser.match2node()
    p_parser.node_id, p_parser.node_record = generic_parser.act_on_orpheus_match(
        p_parser, p_node_id, p_node_record, p_match_type)
    inputfile = os.path.join(BASE_DIR, 'datasets',
                             'tabula-Eligible Open Choice Journals CCBY.csv')
    with open(inputfile, encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        row_counter = 0
        for row in reader:
            row_counter += 1
            issn = row['ISSN print'].strip()
            eissn = row['ISSN electronic'].strip()
            # NOTE(review): the second replace substitutes a space with a
            # space (a no-op); presumably it was meant to collapse doubled
            # spaces left by the newline replacement — confirm against the
            # original dataset.
            jname = row['Title'].replace('\n', ' ').replace(' ', ' ').strip()
            oastatus = row['Open Access'].strip()
            logger.info('-------({}) Working on {}; oastatus: {}; '
                        'issn: {}; eissn: {}'.format(row_counter, jname,
                                                     oastatus, issn, eissn))
            j_parser = generic_parser.NodeParser(name=jname, issn=issn,
                                                 eissn=eissn,
                                                 publisher='Springer',
                                                 source=SOURCE_ID,
                                                 publisher_node_id=p_parser.node_id)
            # Map the spreadsheet's wording onto Orpheus OA status constants.
            j_parser.oa_status = oastatus.replace(
                'Fully Open Access', 'FULLY_OA').replace(
                'Hybrid (Open Choice)', 'HYBRID').upper()
            # Attempt to find a match in Orpheus
            node_id, node_record, match_type = j_parser.match2node()
            j_parser.node_id, j_parser.node_record = generic_parser.act_on_orpheus_match(
                j_parser, node_id, node_record, match_type)
            logger.debug('j_parser.node_record: {}'.format(j_parser.node_record))
            # Determine Orpheus id of preferred name: policies are attached to
            # the primary-name node, so follow the synonym link if needed.
            if j_parser.node_record['name_status'] not in ['PRIMARY']:
                preferred_node_id = j_parser.node_record['synonym_of']
            else:
                preferred_node_id = j_parser.node_id
            logger.debug('preferred_node_id: {}'.format(preferred_node_id))
            logger.debug('j_parser: {}'.format(vars(j_parser)))
            # Attach policies to preferred name node
            logger.debug('Calling j_parser.PolicyMatcher(j_parser, policy_type=oa_status).match()')
            j_parser.PolicyMatcher(j_parser, policy_type='oa_status').match()
            logger.debug('Calling j_parser.PolicyMatcher(j_parser, policy_type=deal).match()')
            # supersede_existing=False: keep any manually vetted deal records.
            j_parser.PolicyMatcher(j_parser, policy_type='deal').match(
                supersede_existing=False,
                **{'applies_to': 'INSTITUTIONS', 'type': 'SPRINGER COMPACT'})
def main():
    """Ingest the Clarivate Analytics Master Journal List CSV files.

    Registers the dataset as an Orpheus source, then walks every ``.csv``
    file in ``datasets/clarivate_analytics`` and matches each journal row
    to an Orpheus node (creating nodes via ``act_on_orpheus_match`` as
    needed).
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    input_folder = os.path.join(base_dir, 'datasets', 'clarivate_analytics')
    source_parser = generic_parser.SourceParser(
        'Clarivate Analytics Master Journal List',
        url='http://mjl.clarivate.com/#journal_lists')
    WOS_SOURCE_ID = source_parser.match_or_create_source()
    journal_counter = 0
    for filename in os.listdir(input_folder):
        if not filename.endswith('.csv'):
            continue
        logger.info('------------- Working on file {}'.format(filename))
        csv_path = os.path.join(input_folder, filename)
        with open(csv_path, encoding='utf-8') as csvfile:
            for row in csv.DictReader(csvfile):
                journal_counter += 1
                title = row['Journal Title'].strip().replace('\n', '').title()
                publisher_name = row['Publisher'].strip().replace('\n', '').title()
                print_issn = row['ISSN'].strip()
                electronic_issn = row['E-ISSN'].strip()
                # Repeated header rows inside the concatenated CSVs surface
                # as a data row whose title equals the column name; skip them
                # (the counter still advances, matching previous behaviour).
                if title == 'Journal Title':
                    continue
                logger.info('-------------- ({}) Working on journal: {}'.
                            format(journal_counter, title))
                journal_parser = generic_parser.NodeParser(
                    name=title, issn=print_issn, eissn=electronic_issn,
                    publisher=publisher_name, source=WOS_SOURCE_ID)
                journal_parser.match2romeo_publisher()
                # Attempt to find a match in Orpheus
                node_id, node_record, match_type = journal_parser.match2node()
                journal_parser.node_id, journal_parser.node_record = \
                    generic_parser.act_on_orpheus_match(
                        journal_parser, node_id, node_record, match_type)
def main():
    """Ingest the Europe PMC journal list (jlist.csv) into Orpheus.

    For every journal row: matches/creates the publisher node, then the
    journal node; registers the NLM abbreviation as a SYNONYM of the
    preferred journal name; finally attaches an 'epmc' policy translated
    from the spreadsheet's vocabulary via the lookup tables below.
    """
    # Translation tables: Europe PMC spreadsheet wording -> Orpheus values.
    epmc_embargo2months = {
        "": None,
        "Immediate": 0,
        "0 months or more": 0,
        "1 month": 1,
        "2 months": 2,
        "2 months or less": 2,
        "3 months": 3,
        "3 months or more": 3,
        "6 months": 6,
        "6 months or less": 6,
        "12 months": 12,
        "12 months or less": 12,
        "24 months": 24,
        "24 months or less": 24,
        "36 months": 36,
        "36 months or less": 36,
    }
    # NOTE: keys deliberately keep the spreadsheet's leading/trailing
    # spaces — the raw CSV fields are looked up without stripping.
    participation_level2orpheus = {" Full ": "FULL", " NIH Portfolio ": "NIH"}
    open_licence2orpheus = {"All": "ALL", "No": "NO", "Some": "SOME"}
    deposit_status2orpheus = {
        " ": None,
        " No New Content ": "NO_NEW",
        " Now Select ": "NOW_SELECT",
        " Predecessor ": "PRE"
    }
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    source_parser = generic_parser.SourceParser(
        'PMC Journal List', url='https://europepmc.org/journalList')
    SOURCE_ID = source_parser.match_or_create_source()
    inputfile = os.path.join(BASE_DIR, 'datasets', 'jlist.csv')
    with open(inputfile, encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        row_counter = 0
        for row in reader:
            row_counter += 1
            # if row_counter < 1439:  # Don't forget to remove this after testing
            #     continue
            issn = row['pISSN'].strip()
            eissn = row['eISSN'].strip()
            # NOTE(review): as elsewhere in these scripts, the second
            # replace is a space-for-space no-op; probably meant to collapse
            # doubled spaces — confirm.
            jname = row['Journal title'].replace('\n', ' ').replace(' ', ' ').strip()
            jsynonym = row['NLM TA'].replace('\n', ' ').replace(' ', ' ').strip()
            jpublisher = row['Publisher'].replace('\n', ' ').replace(' ', ' ').strip()
            embargo = row['Free access']
            open_licence = row['Open access']
            participation_level = row['Participation level']
            # These two column headers really do start with a space in the
            # source CSV.
            deposit_status = row[' Deposit status']
            epmc_url = row[' Journal URL']
            logger.info('-------({}) Working on {}; '
                        'issn: {}; eissn: {}'.format(row_counter, jname, issn,
                                                     eissn))
            # Predecessor titles are no longer published; skip them entirely.
            if deposit_status == " Predecessor ":
                logger.info(
                    'Skipped {} (title no longer in publication)'.format(
                        jname))
                continue
            logger.info('--- Parsing publisher info')
            p_parser = generic_parser.NodeParser(name=jpublisher,
                                                 type='PUBLISHER')
            p_node_id, p_node_record, p_match_type = p_parser.match2node()
            p_parser.node_id, p_parser.node_record = generic_parser.act_on_orpheus_match(
                p_parser, p_node_id, p_node_record, p_match_type)
            # Resolve the publisher's preferred-name node id.
            if p_parser.node_record['name_status'] not in ['PRIMARY']:
                p_preferred_node_id = p_parser.node_record['synonym_of']
            else:
                p_preferred_node_id = p_parser.node_id
            logger.info('--- Parsing journal info')
            j_parser = generic_parser.NodeParser(
                name=jname, issn=issn, eissn=eissn, publisher=jpublisher,
                source=SOURCE_ID, publisher_node_id=p_preferred_node_id,
                epmc_url=epmc_url)
            # Attempt to find a match in Orpheus
            node_id, node_record, match_type = j_parser.match2node()
            j_parser.node_id, j_parser.node_record = generic_parser.act_on_orpheus_match(
                j_parser, node_id, node_record, match_type,
                prompt_responses=PROMPT_RESPONSES)
            logger.debug('j_parser.node_record: {}'.format(
                j_parser.node_record))
            # Determine Orpheus id of preferred name.
            if j_parser.node_record['name_status'] not in ['PRIMARY']:
                preferred_node_id = j_parser.node_record['synonym_of']
            else:
                preferred_node_id = j_parser.node_id
            # Register the NLM abbreviation as a synonym of the preferred name.
            if jsynonym:
                logger.debug(
                    '--- Processing alternative title "{}"'.format(jsynonym))
                syn_parser = generic_parser.NodeParser(
                    name=jsynonym, issn=issn, eissn=eissn,
                    publisher=j_parser.publisher,
                    publisher_node_record=j_parser.publisher_node_record,
                    publisher_node_id=j_parser.publisher_node_id,
                    source=SOURCE_ID)
                syn_parser.name_status = 'SYNONYM'
                syn_parser.synonym_of = preferred_node_id
                syn_id, syn_record, match_type = syn_parser.match2node()
                generic_parser.act_on_orpheus_match(syn_parser, syn_id,
                                                    syn_record, match_type)
            logger.debug('preferred_node_id: {}'.format(preferred_node_id))
            logger.debug('j_parser: {}'.format(vars(j_parser)))
            # Attach policies to preferred name node
            logger.debug(
                '--- Calling j_parser.PolicyMatcher(j_parser, policy_type=epmc).match()'
            )
            # supersede_existing=False keeps previously vetted EPMC records.
            j_parser.PolicyMatcher(j_parser, policy_type='epmc').match(
                supersede_existing=False,
                **{
                    'participation_level':
                    participation_level2orpheus[participation_level],
                    'embargo_months': epmc_embargo2months[embargo],
                    'open_licence': open_licence2orpheus[open_licence],
                    'deposit_status': deposit_status2orpheus[deposit_status],
                })
def main():
    """Scrape the OUP accepted-manuscript embargo table into Orpheus.

    Fetches the embargo-periods page, walks its HTML table, and for each
    journal row attaches green OA policies (AM, preprint, personal-website
    and VoR) to the journal's preferred-name node. Table cells alternate:
    odd <td>s carry the journal name, even <td>s the AM embargo value.
    """
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    oup_url = 'https://academic.oup.com/journals/pages/access_purchase/rights_and_permissions/embargo_periods'
    source_parser = generic_parser.SourceParser(
        'OUP website: Accepted Manuscript Embargo Periods', url=oup_url)
    SOURCE_ID = source_parser.match_or_create_source()
    website_parser = generic_parser.SourceParser(
        'OUP website: Author self-archiving policy',
        url=
        'https://academic.oup.com/journals/pages/access_purchase/rights_and_permissions/'
        'author_self_archiving_policy')
    OUP_WEBSITE_ID = website_parser.match_or_create_source()
    p_parser = generic_parser.NodeParser(name='Oxford University Press',
                                         type='PUBLISHER', source=SOURCE_ID)
    p_node_id, p_node_record, p_match_type = p_parser.match2node()
    p_parser.node_id, p_parser.node_record = generic_parser.act_on_orpheus_match(
        p_parser, p_node_id, p_node_record, p_match_type)
    r = urllib.request.urlopen(oup_url).read()
    s = BeautifulSoup(r, 'html.parser')
    journal_counter = 0
    # NOTE(review): embargo_data_values is logged at the end but nothing in
    # this view ever appends to it — the final log always prints [] unless
    # an append was lost; confirm against version history.
    embargo_data_values = []
    for tr in s.find_all('tr'):
        j_parser = generic_parser.NodeParser(
            publisher='Oxford University Press', source=SOURCE_ID,
            publisher_node_id=p_parser.node_id)
        preferred_node_id = None
        td_counter = 0
        for td in tr.find_all('td'):
            td_counter += 1
            # Plain-text cell: journal name.
            if td.string:
                jname = td.string.replace('\n', '').strip()
            # Linked cell: either an embargo value (multi-line or 'Custom')
            # or a linked journal name.
            if td.a:
                if ('\n' in td.a.string) or ('Custom' in td.a.string):
                    embargo_data = ' '.join(
                        td.a.string.replace(
                            'months',
                            '').split())  # https://stackoverflow.com/a/1546251
                else:
                    jname = td.a.string.replace('\n', '').strip()
            if td_counter % 2 == 0:  # Even (This td contains AM embargo_data)
                if embargo_data in ['Full Open Access', 'Fully Open Access']:
                    j_parser.oa_status = 'FULLY_OA'
                    logger.debug(
                        '{} is an open access journal. Calling j_parser.PolicyMatcher(j_parser, '
                        'policy_type=oa_status).match()'.format(j_parser.name))
                    j_parser.PolicyMatcher(j_parser,
                                           policy_type='oa_status').match()
                # NOTE(review): `in 'Custom'` is a substring membership test
                # (e.g. 'Cus' would also match); presumably `== 'Custom'`
                # was intended — confirm before changing.
                elif embargo_data in 'Custom':
                    logger.debug(
                        'Skipping custom AM self-archiving policy ({})'.format(
                            j_parser.name))
                else:
                    # Numeric embargo: the journal must already have been
                    # matched by the preceding odd cell.
                    if not preferred_node_id:
                        sys.exit('preferred_node_id not set for {}'.format(
                            j_parser.name))
                    am_policy = generic_parser.GreenPolicyInstance()
                    am_policy.node = preferred_node_id
                    am_policy.outlet = [INST_REPO_ID, SUBJ_REPO_ID, PUBMED_ID]
                    am_policy.version = [AM_ID]
                    am_policy.version_embargo_months = int(embargo_data)
                    am_policy.version_green_licence = CUSTOM_ID
                    am_policy.source = SOURCE_ID
                    am_policy.verbatim = am_verbatim
                    logger.debug(
                        'Calling j_parser.server_data_match_green_policy(**am_policy.as_dict())'
                    )
                    j_parser.PolicyMatcher(
                        j_parser,
                        policy_type='green').match(**am_policy.as_dict())
                # Preprint policy: applies regardless of the AM outcome above.
                preprint_policy = generic_parser.GreenPolicyInstance()
                preprint_policy.node = preferred_node_id
                preprint_policy.outlet = ALL_OUTLETS
                preprint_policy.version = [PREPRINT_ID]
                preprint_policy.version_embargo_months = 0
                preprint_policy.version_green_licence = CUSTOM_ID
                preprint_policy.source = OUP_WEBSITE_ID
                preprint_policy.verbatim = preprint_verbatim
                logger.debug(
                    'Calling j_parser.server_data_match_green_policy(**preprint_policy.as_dict())'
                )
                j_parser.PolicyMatcher(
                    j_parser,
                    policy_type='green').match(**preprint_policy.as_dict())
                # Personal-website AM policy (no embargo).
                website_policy = generic_parser.GreenPolicyInstance()
                website_policy.node = preferred_node_id
                website_policy.outlet = [WEBSITE_ID]
                website_policy.version = [AM_ID]
                website_policy.version_embargo_months = 0
                website_policy.version_green_licence = CUSTOM_ID
                website_policy.source = OUP_WEBSITE_ID
                website_policy.verbatim = 'Authors may make their AM available on their non-commercial homepage or blog. They may also privately share their work within their institution for the purposes of research or education, and make copies available to colleagues or students for their personal use providing that the AM is not made publicly available until after the embargo period.'
                logger.debug(
                    'Calling j_parser.server_data_match_green_policy(**website_policy.as_dict())'
                )
                j_parser.PolicyMatcher(
                    j_parser,
                    policy_type='green').match(**website_policy.as_dict())
                # VoR deposit is disallowed everywhere except fully OA titles.
                if j_parser.oa_status != 'FULLY_OA':
                    vor_policy = generic_parser.GreenPolicyInstance()
                    vor_policy.node = preferred_node_id
                    vor_policy.outlet = ALL_OUTLETS
                    vor_policy.deposit_allowed = False
                    vor_policy.version = [VOR_ID]
                    vor_policy.source = OUP_WEBSITE_ID
                    vor_policy.verbatim = vor_verbatim
                    logger.debug(
                        'Calling j_parser.server_data_match_green_policy(**vor_policy.as_dict())'
                    )
                    j_parser.PolicyMatcher(
                        j_parser,
                        policy_type='green').match(**vor_policy.as_dict())
            else:  # Odd (This td contains the journal name)
                journal_counter += 1
                logger.info('------------({}) Working on journal {}'.format(
                    journal_counter, jname))
                j_parser.name = jname
                j_parser.get_issn_from_romeo()
                # Attempt to find a match in Orpheus
                node_id, node_record, match_type = j_parser.match2node()
                j_parser.node_id, j_parser.node_record = generic_parser.act_on_orpheus_match(
                    j_parser, node_id, node_record, match_type)
                logger.debug('j_parser.node_record: {}'.format(
                    j_parser.node_record))
                # Determine Orpheus id of preferred name.
                if j_parser.node_record['name_status'] not in ['PRIMARY']:
                    preferred_node_id = j_parser.node_record['synonym_of']
                else:
                    preferred_node_id = j_parser.node_id
    logger.info('embargo_data_values: {}'.format(sorted(embargo_data_values)))
def main():
    """Ingest the Wiley Author Compliance Tool HTML dump into Orpheus.

    Parses the saved compliance-tool page: the journal <select> list gives
    the journal names, and parallel JavaScript arrays (JOAP, JL, JAPC, JSV,
    JAV) give OA status, gold licences, APCs, preprint and AM embargoes.
    Each journal is matched to an Orpheus node and gold/preprint/AM
    policies are attached to its preferred-name node.
    """
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    source_parser = generic_parser.SourceParser(
        'Wiley Author Compliance Tool',
        url=
        'https://authorservices.wiley.com/author-resources/Journal-Authors/licensing-open-access/open-access/author-compliance-tool.html'
    )
    WILEY_SOURCE_ID = source_parser.match_or_create_source()
    p_parser = generic_parser.NodeParser(name='Wiley', type='PUBLISHER',
                                         source=WILEY_SOURCE_ID)
    p_node_id, p_node_record, p_match_type = p_parser.match2node()
    p_parser.node_id, p_parser.node_record = generic_parser.act_on_orpheus_match(
        p_parser, p_node_id, p_node_record, p_match_type)
    f = open(os.path.join(BASE_DIR, 'datasets', 'wiley_compliance_tool.html'))
    data = f.read()
    f.close()
    # translation dicts
    oa_status_dict = {
        'Offers OnlineOpen': 'HYBRID',
        'No OA option': 'SUBSCRIPTION',
        'Fully Open Access': 'FULLY_OA'
    }
    default_licence_options = [CCBY_ID, CCBYNC_ID, CCBYNCND_ID]
    # Maps the tool's licence wording to lists of Orpheus licence ids; many
    # distinct spellings collapse onto the default three-licence choice.
    licence_choices_dict = {
        'Choice of CC BY, CC BY-NC, or CC BY-NC-ND<br />CC BY for mandated authors':
        default_licence_options,
        '--': [],
        'Choice of CC BY-NC or CC BY-NC-ND<br />CC BY for mandated authors':
        default_licence_options,
        'No CC license offered': [CUSTOM_ID],
        'CC BY': [CCBY_ID],
        'CC BY for mandated authors': [CCBY_ID],
        'CC-BY': [CCBY_ID],
        'CC BY-NC-ND<br />CC BY for mandated authors': [CCBY_ID, CCBYNCND_ID],
        'Choice of CC BY, CC BY-NC, or CC BY-NC-ND': default_licence_options,
        'Choice of CC BY, CC BY-NC or CC BY-NC-ND': default_licence_options,
        'Choice of CC BY-NC or CC BY-ND-ND<br />CC-BY mandate only':
        default_licence_options,
        'CC BY, CC BY-NC, or CC BY-NC-ND<br />CC BY for mandated authors':
        default_licence_options,
        'CC BY-NC-ND': [CCBYNCND_ID],
        'CC BY, CC BY-NC, CC BY-NC-ND ': default_licence_options,
        'CC BY (mandated only), CC BY NC, CC BY NC ND':
        default_licence_options,
        'CC BY-NC<br />CC BY for mandated authors': [CCBY_ID, CCBYNC_ID],
        'CC BY NC ND': [CCBYNCND_ID]
    }
    # journal_block: grab everything inside the journal <select> element.
    t = re.compile(
        r'''<select class="journal" id="journal"(.*)</select>\n</form>''',
        re.DOTALL)
    m = t.search(data)
    # One journal title spans two lines in the dump; normalise it so the
    # per-line option regex below can match it.
    journal_block = m.group(1).replace(
        '''"<option value='1132'>Journal of World Intellectual Property - The\n</option>"''',
        "<option value='1132'>Journal of World Intellectual Property - The</option>"
    )
    journals = []
    t = re.compile(r'''^<option value=['"]([0-9]+)['"]>(.+)</option>$''',
                   re.MULTILINE)
    m = t.findall(journal_block)
    for id, journal_name in m:
        journals.append(journal_name)
    # attributes: the page stores one parallel JavaScript array per field.
    oa_stata = javascript_variable('JOAP')
    oa_stata.parse_values(data)
    gold_licences = javascript_variable('JL')
    gold_licences.parse_values(data)
    apcs = javascript_variable('JAPC')
    apcs.parse_values(data)
    preprint_embargos = javascript_variable('JSV')
    preprint_embargos.parse_values(data)
    am_embargos = javascript_variable('JAV')
    am_embargos.parse_values(data)
    # check that the number of values of each attribute matches the number of journals
    for a in [
            oa_stata.values, gold_licences.values, apcs.values,
            preprint_embargos.values, am_embargos.values
    ]:
        if len(a) != len(journals):
            error_msg = 'Number of values of variable ({}) does not match number of journals ({}). This could be because ' \
                        '`Journal of World Intellectual Property` spans more than 1 line in input dataset. Check and, ' \
                        'if so, edit the input to fix that. First 5 values of variable: {}'.format(len(a), len(journals), a[0:5])
            sys.exit(error_msg)
    # # print list of values for each variable in input file
    # print_possible_values_of_wiley_variables()
    t_apc_value = re.compile(r'[0-9,]+')
    # parse information for each journal and add to Orpheus.
    # journals[0] is the select-box placeholder; since counter starts at 1
    # for the first real journal, values[counter] stays aligned with it.
    counter = 0
    for j in journals[1:]:
        counter += 1
        logger.info('---------{} Working on journal {}'.format(counter, j))
        j_parser = generic_parser.NodeParser(
            name=j, publisher='Wiley', source=WILEY_SOURCE_ID,
            publisher_node_id=p_parser.node_id)
        j_parser.oa_status = oa_status_dict[oa_stata.values[counter]]
        logger.debug('OA status: {}'.format(j_parser.oa_status))
        # obtain issn from romeo; identify romeo_publisher and its node in Orpheus
        j_parser.get_issn_from_romeo()
        # Attempt to find a match in Orpheus
        node_id, node_record, match_type = j_parser.match2node()
        j_parser.node_id, j_parser.node_record = generic_parser.act_on_orpheus_match(
            j_parser, node_id, node_record, match_type)
        logger.debug('j_parser.node_record: {}'.format(j_parser.node_record))
        # Determine Orpheus id of preferred name.
        if j_parser.node_record['name_status'] not in ['PRIMARY']:
            preferred_node_id = j_parser.node_record['synonym_of']
        else:
            preferred_node_id = j_parser.node_id
        # Parsing gold policy info
        licence_options_raw = gold_licences.values[counter]
        logger.debug('licence_options_raw: {}'.format(licence_options_raw))
        apc_raw_str = apcs.values[counter]
        logger.debug('apc_raw_str: {}'.format(apc_raw_str))
        # A subscription journal with '--' in both licence and APC columns
        # has no gold policy at all.
        if (j_parser.oa_status == 'SUBSCRIPTION') and (
                licence_options_raw == '--') and (apc_raw_str == '--'):
            has_gold_policy = False
        else:
            has_gold_policy = True
        logger.debug('has_gold_policy: {}'.format(has_gold_policy))
        gp = generic_parser.GoldPolicyInstance()
        gp.node = preferred_node_id
        gp.source = WILEY_SOURCE_ID
        gp.licence_options = licence_choices_dict[licence_options_raw]
        logger.debug('gp.licence_options: {}'.format(gp.licence_options))
        if apc_raw_str in ['', '--']:
            pass
        elif apc_raw_str in [
                'No APC', 'Inquire Directly ', 'Contact journal',
                '$50 per PU', 'waived 2016-18',
                '$1,800 for research article $900 for technical report'
        ]:
            # NOTE(review): the 900/1800 min/max pair only makes sense for
            # the last entry ('$1,800 ... $900 ...') yet is applied to every
            # note-style APC string, including 'No APC' — looks wrong;
            # confirm whether downstream ignores values when apc_note is set.
            gp.apc_note = apc_raw_str
            gp.apc_value_min = 900
            gp.apc_value_max = 1800
        elif apc_raw_str.strip() == '3500':
            # Bare number with no currency symbol; currency known to be USD.
            gp.apc_currency = 'USD'
            gp.apc_value_min = 3500
            gp.apc_value_max = 3500
        else:
            # Generic '<symbol><amount>' case: extract digits, then infer
            # the currency from the symbol.
            m_apc_value = t_apc_value.search(apc_raw_str)
            gp.apc_value_min = int(m_apc_value.group().replace(',', ''))
            gp.apc_value_max = gp.apc_value_min
            if '$' in apc_raw_str:
                gp.apc_currency = 'USD'
            elif '€' in apc_raw_str:
                gp.apc_currency = 'EUR'
            else:
                logger.warning(
                    'Currency of APC {} could not be recognised. Journal {}'
                    .format(apc_raw_str, j))
        logger.debug('gp.apc_currency: {}'.format(gp.apc_currency))
        logger.debug('gp.apc_value_min: {}'.format(gp.apc_value_min))
        logger.debug('gp.apc_value_max: {}'.format(gp.apc_value_max))
        # parsing preprint policy
        preprint_embargo_raw = preprint_embargos.values[counter]
        if preprint_embargo_raw in [
                'Refer to copyright or contact managing editor', '--'
        ]:
            has_preprint_policy = False
        else:
            has_preprint_policy = True
            preprint_policy = generic_parser.GreenPolicyInstance()
            preprint_policy.node = preferred_node_id
            preprint_policy.outlet = [WEBSITE_ID, INST_REPO_ID, SUBJ_REPO_ID]
            preprint_policy.version = [PREPRINT_ID]
            preprint_policy.version_embargo_months = 0
            preprint_policy.version_green_licence = CUSTOM_ID
            preprint_policy.source = WILEY_SOURCE_ID
            preprint_policy.verbatim = preprint_embargo_raw
            logger.debug('has_preprint_policy: {}'.format(has_preprint_policy))
            logger.debug('preprint_policy.verbatim: {}'.format(
                preprint_policy.verbatim))
        # parsing AM policy
        am_embargo_raw = am_embargos.values[counter]
        logger.debug('am_embargo_raw: {}'.format(am_embargo_raw))
        if am_embargo_raw.strip() in [
                'Refer to copyright or contact managing editor', '--',
                'Refer to copyright', 'Does not publish unsolicited manuscripts'
        ]:
            has_am_policy = False
        else:
            has_am_policy = True
            am_policy = generic_parser.GreenPolicyInstance()
            am_policy.node = preferred_node_id
            am_policy.outlet = [WEBSITE_ID, INST_REPO_ID, SUBJ_REPO_ID]
            am_policy.version = [AM_ID]
            am_policy.version_green_licence = CUSTOM_ID
            am_policy.source = WILEY_SOURCE_ID
            am_policy.verbatim = am_embargo_raw.strip()
            # Translate the tool's free-text embargo phrases.
            if am_embargo_raw.strip() in ['Final version on publication']:
                am_policy.version = [VOR_ID]
                am_policy.version_embargo_months = 0
            elif am_embargo_raw.strip() in ['On submission']:
                am_policy.version_embargo_months = 0
            elif am_embargo_raw.strip() in ['6mo embargo']:
                am_policy.version_embargo_months = 6
            elif am_embargo_raw.strip() in [
                    'Institutional repository after 6 month embargo'
            ]:
                am_policy.outlet = [INST_REPO_ID]
                am_policy.version_embargo_months = 6
            elif am_embargo_raw.strip() in [
                    '12mo embargo', '12 months', '12-24mo embargo'
            ]:
                # NOTE(review): '12-24mo embargo' is recorded as 12 months —
                # the most permissive reading; confirm intended.
                am_policy.version_embargo_months = 12
            elif am_embargo_raw.strip() in ['18mo embargo']:
                am_policy.version_embargo_months = 18
            elif am_embargo_raw.strip() in ['24mo embargo']:
                am_policy.version_embargo_months = 24
            elif am_embargo_raw.strip() in [
                    'Not permitted', 'Fully Open Access'
            ]:
                am_policy.deposit_allowed = False
            else:
                logger.error(
                    'Failed to parse embargo info ({}) for journal {}'.format(
                        am_embargo_raw, j))
            logger.debug('has_am_policy: {}'.format(has_am_policy))
            logger.debug('am_policy.verbatim: {}'.format(am_policy.verbatim))
            logger.debug('am_policy.version_embargo_months: {}'.format(
                am_policy.version_embargo_months))
        # Attach policies to preferred name node
        logger.debug(
            'Calling j_parser.PolicyMatcher(j_parser, policy_type=oa_status).match()'
        )
        j_parser.PolicyMatcher(j_parser, policy_type='oa_status').match()
        if has_gold_policy:
            logger.debug(
                "Calling j_parser.PolicyMatcher(j_parser, policy_type='gold').match(**gp.as_dict())"
            )
            j_parser.PolicyMatcher(j_parser,
                                   policy_type='gold').match(**gp.as_dict())
        if has_preprint_policy:
            logger.debug(
                'Calling j_parser.server_data_match_green_policy(**preprint_policy.as_dict())'
            )
            j_parser.PolicyMatcher(
                j_parser,
                policy_type='green').match(**preprint_policy.as_dict())
        if has_am_policy:
            logger.debug(
                'Calling j_parser.server_data_match_green_policy(**am_policy.as_dict())'
            )
            j_parser.PolicyMatcher(
                j_parser, policy_type='green').match(**am_policy.as_dict())
def main():
    """Ingest the Elsevier Open Access Price List CSV into Orpheus.

    Registers the price-list PDF (extracted to CSV via tabula) and the
    Elsevier licences web page as sources, matches/creates the Elsevier
    publisher node, then records each journal's OA status and a gold
    policy carrying its APC.
    """
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    source_parser = generic_parser.SourceParser(
        'Elsevier Open Access Price List',
        url='https://www.elsevier.com/__data/promis_misc/j.custom97.pdf')
    ELSEVIER_SOURCE_ID = source_parser.match_or_create_source()
    elsevier_website_parser = generic_parser.SourceParser(
        'Elsevier website: Open access licenses',
        url='https://www.elsevier.com/about/policies/open-access-licenses')
    ELSEVIER_WEBSITE_ID = elsevier_website_parser.match_or_create_source()
    p_parser = generic_parser.NodeParser(name='Elsevier', type='PUBLISHER',
                                         source=ELSEVIER_WEBSITE_ID)
    p_node_id, p_node_record, p_match_type = p_parser.match2node()
    p_parser.node_id, p_parser.node_record = generic_parser.act_on_orpheus_match(
        p_parser, p_node_id, p_node_record, p_match_type)
    # gold policy: a single instance is reused and mutated for every row;
    # licence options are the same for all Elsevier journals.
    gp = generic_parser.GoldPolicyInstance()
    gp.licence_options = [CCBY_ID, CCBYNCND_ID, CUSTOM_ID]
    inputfile = os.path.join(BASE_DIR, 'datasets', 'tabula-j.custom97.csv')
    with open(inputfile, encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        row_counter = 0
        for row in reader:
            row_counter += 1
            issn = row['ISSN'].strip()
            # NOTE(review): the second replace swaps a space for a space
            # (no-op); probably meant to collapse doubled spaces — confirm.
            jname = row['Journal title'].replace('\n', ' ').replace(' ', ' ').strip()
            oastatus = row['OA model'].strip()
            currency = row['Currency'].strip()
            price = row['Price'].strip()
            logger.info('-------({}) Working on {}; oastatus: {}; '
                        'issn: {}; price: {} {}'.format(
                            row_counter, jname, oastatus, issn, currency,
                            price))
            j_parser = generic_parser.NodeParser(
                name=jname, publisher='Elsevier', source=ELSEVIER_SOURCE_ID,
                publisher_node_id=p_parser.node_id)
            j_parser.issn = issn
            # 'Open Access' -> FULLY_OA; anything else (e.g. 'Hybrid') is
            # just upper-cased.
            j_parser.oa_status = oastatus.replace('Open Access',
                                                  'FULLY_OA').upper()
            # # identify romeo_publisher and its node in Orpheus
            # j_parser.match2romeo_publisher(test_mode=TEST_MODE)
            # Attempt to find a match in Orpheus
            node_id, node_record, match_type = j_parser.match2node()
            j_parser.node_id, j_parser.node_record = generic_parser.act_on_orpheus_match(
                j_parser, node_id, node_record, match_type)
            logger.debug('j_parser.node_record: {}'.format(
                j_parser.node_record))
            # Determine Orpheus id of preferred name.
            if j_parser.node_record['name_status'] not in ['PRIMARY']:
                preferred_node_id = j_parser.node_record['synonym_of']
            else:
                preferred_node_id = j_parser.node_id
            logger.debug('preferred_node_id: {}'.format(preferred_node_id))
            logger.debug('j_parser: {}'.format(vars(j_parser)))
            # parsing gold policy
            gp.node = preferred_node_id
            gp.apc_currency = currency
            # NOTE(review): price is the raw CSV string, not converted to a
            # number here (other scripts use int values) — presumably
            # handled downstream; confirm.
            gp.apc_value_min = price
            gp.apc_value_max = price
            gp.source = ELSEVIER_SOURCE_ID
            # Attach policies to preferred name node
            logger.debug(
                'Calling j_parser.PolicyMatcher(j_parser, policy_type=oa_status).match()'
            )
            j_parser.PolicyMatcher(j_parser, policy_type='oa_status').match()
            logger.debug(
                'Calling j_parser.PolicyMatcher(j_parser, policy_type=gold).match()'
            )
            j_parser.PolicyMatcher(j_parser,
                                   policy_type='gold').match(**gp.as_dict())
def main():
    """Ingest the SHERPA/RoMEO all-publishers dataset into Orpheus.

    For every RoMEO publisher record: creates a publisher node, derives
    green policies from the pre/post/pdf archiving flags (grouping article
    versions that share the same flag value into a single policy), then
    matches each of the publisher's journals to an Orpheus node.
    """

    def process_policy(generic_parser_instance, version_list, deposit_allowed):
        # Attach a green policy for the given article versions. Reads
        # SOURCE_ID and restrictions_and_conditions from the enclosing
        # scope at call time (late binding), so it must only be called
        # after restrictions_and_conditions has been built for the
        # current publisher. Policies are flagged problematic/unvetted
        # because RoMEO data needs manual review.
        generic_parser_instance.PolicyMatcher(
            generic_parser_instance, policy_type='green').match(
                **{
                    'outlet': [PUBMED_ID, INST_REPO_ID, WEBSITE_ID],
                    'version': version_list,
                    'deposit_allowed': deposit_allowed,
                    'source': SOURCE_ID,
                    'verbatim': restrictions_and_conditions,
                    'problematic': True,
                    'vetted': False,
                })

    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    source_parser = generic_parser.SourceParser(
        'SHERPA/RoMEO', url='http://www.sherpa.ac.uk/romeo/index.php')
    SOURCE_ID = source_parser.match_or_create_source()
    offline_file = os.path.join(BASE_DIR, 'romeo_all_publishers.xml')
    romeo = romeo_client.parser(offline_dataset=offline_file)
    # romeo = romeo_client.parser('?all=yes&showfunder=none&ak=', save_dataset=offline_file)
    romeo.parse_response()
    romeo.convert_restrictions()
    publisher_counter = 0
    for k, v in romeo.output_dict.items():
        publisher_counter += 1
        logger.info('-------------- ({}) Working on romeo_id {}'.format(
            publisher_counter, k))
        # output_dict mixes publisher entries with bookkeeping keys; skip
        # the latter.
        if not k in [
                'outcome', 'apicontrol', 'romeo_id_list', 'romeo_issn_list',
                'romeo_publisher_list', 'journals_dicts'
        ]:
            # Prefer the alias when the name contains HTML entities ('&#').
            # NOTE(review): this block mixes the module-level `logger` with
            # the root `logging` module — presumably unintentional; confirm.
            if ('&#' in v['name']) and v['alias']:
                logging.info(
                    'romeo_parser: using {} instead of {} as name for romeo_id {}'
                    .format(v['alias'], v['name'], k))
                name = v['alias']
            else:
                name = v['name']
            parser = generic_parser.NodeParser(name=name, romeo_id=k,
                                               source=SOURCE_ID,
                                               type='PUBLISHER')
            node_id, node_record = parser.create_node(force_creation=True)
            if not node_id:
                logging.error('Failed to parse romeo id {}'.format(k))
            else:
                # Build the verbatim text from RoMEO's restrictions and
                # conditions; process_policy reads this via closure.
                restrictions_and_conditions = ''
                if v['prerestriction']:
                    for r in v['prerestriction']:
                        restrictions_and_conditions += 'Restriction on preprint deposit: ' + str(
                            r) + '\n'
                if v['postrestriction']:
                    for r in v['postrestriction']:
                        restrictions_and_conditions += 'Restriction on AAM deposit: ' + str(
                            r) + '\n'
                if v['pdfrestriction']:
                    for r in v['pdfrestriction']:
                        restrictions_and_conditions += 'Restriction on VoR deposit: ' + str(
                            r) + '\n'
                if v['condition']:
                    restrictions_and_conditions += 'Conditions:\n'
                    for c in v['condition']:
                        restrictions_and_conditions += c + '\n'
                # 'can'/'restricted' count as allowed; 'restricted' is still
                # a deposit right, just with conditions (captured above).
                if v['prearchiving'].lower() in ['can', 'restricted']:
                    allowed = True
                elif v['prearchiving'].lower() in [
                        'cannot', 'unclear', 'unknown'
                ]:
                    allowed = False
                else:
                    logging.error(
                        'Value of romeo prearchiving field unrecognised: {}'.
                        format(v['prearchiving']))
                    allowed = False
                # Group versions sharing the same archiving flag into one
                # policy; recompute `allowed` for each differing flag.
                if v['prearchiving'] == v['postarchiving'] == v['pdfarchiving']:
                    process_policy(parser, [AM_ID, PREPRINT_ID, VOR_ID],
                                   allowed)
                elif (v['prearchiving'] == v['postarchiving']):
                    process_policy(parser, [AM_ID, PREPRINT_ID], allowed)
                    if v['pdfarchiving'].lower() in ['can', 'restricted']:
                        allowed = True
                    elif v['pdfarchiving'].lower() in [
                            'cannot', 'unclear', 'unknown'
                    ]:
                        allowed = False
                    else:
                        logging.error(
                            'Value of romeo pdfarchiving field unrecognised: {}'
                            .format(v['pdfarchiving']))
                        allowed = False
                    process_policy(parser, [VOR_ID], allowed)
                elif (v['prearchiving'] == v['pdfarchiving']):
                    process_policy(parser, [PREPRINT_ID, VOR_ID], allowed)
                    if v['postarchiving'].lower() in ['can', 'restricted']:
                        allowed = True
                    elif v['postarchiving'].lower() in [
                            'cannot', 'unclear', 'unknown'
                    ]:
                        allowed = False
                    else:
                        logging.error(
                            'Value of romeo postarchiving field unrecognised: {}'
                            .format(v['postarchiving']))
                        allowed = False
                    process_policy(parser, [AM_ID], allowed)
                elif (v['pdfarchiving'] == v['postarchiving']):
                    process_policy(parser, [PREPRINT_ID], allowed)
                    if v['postarchiving'].lower() in ['can', 'restricted']:
                        allowed = True
                    elif v['postarchiving'].lower() in [
                            'cannot', 'unclear', 'unknown'
                    ]:
                        allowed = False
                    else:
                        logging.error(
                            'Value of romeo postarchiving field unrecognised: {}'
                            .format(v['postarchiving']))
                        allowed = False
                    process_policy(parser, [AM_ID, VOR_ID], allowed)
                else:
                    # All three flags differ: one policy per version.
                    for archiving, version_id in [('prearchiving',
                                                   PREPRINT_ID),
                                                  ('postarchiving', AM_ID),
                                                  ('pdfarchiving', VOR_ID)]:
                        if v[archiving].lower() in ['can', 'restricted']:
                            allowed = True
                        elif v[archiving].lower() in [
                                'cannot', 'unclear', 'unknown'
                        ]:
                            allowed = False
                        else:
                            logging.error(
                                'Value of romeo {} field unrecognised: {}'.
                                format(archiving, v[archiving]))
                            allowed = False
                        process_policy(parser, [version_id], allowed)
                # now process each journal of this publisher; j is expected
                # to be a (name, issn, eissn) tuple.
                k_journals = get_journals4id(k)
                journal_counter = 0
                for j in k_journals:
                    logger.debug('j: {}'.format(j))
                    j_parser = generic_parser.NodeParser(
                        name=j[0], issn=j[1], eissn=j[2],
                        publisher_node_id=node_id, source=SOURCE_ID)
                    journal_counter += 1
                    logger.info('------------({} {}) Working on journal {}'.format(
                        name, journal_counter, j[0]))
                    # Attempt to find a match in Orpheus
                    j_node_id, j_node_record, match_type = j_parser.match2node()
                    j_parser.node_id, j_parser.node_record = generic_parser.act_on_orpheus_match(
                        j_parser, j_node_id, j_node_record, match_type)
def main():
    """Load Elsevier journal data and attach OA status and green policies in Orpheus.

    Reads the Elsevier embargo-periods CSV (one row per journal), matches or
    creates each journal node in Orpheus, then attaches:

    * an OA status policy (``HYBRID``, or ``FULLY_OA`` for zero-embargo titles);
    * a preprint green policy (no embargo, all listed outlets);
    * an AM policy for personal websites (no embargo);
    * an AM policy for repositories/PubMed (embargo taken from the spreadsheet);
    * a VoR policy forbidding deposit everywhere (hybrid titles only).

    Relies on module-level constants (outlet/version/licence IDs, verbatim
    texts) and the ``generic_parser`` helpers; performs network/database I/O
    via those helpers and returns nothing.
    """
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))

    # Register the two data sources in Orpheus and obtain their ids.
    source_parser = generic_parser.SourceParser(
        'Gray, A. 2018. Elsevier embargo periods, 2013-2018',
        url='https://doi.org/10.6084/m9.figshare.1554748.v14')
    ELSEVIER_SOURCE_ID = source_parser.match_or_create_source()

    elsevier_website_parser = generic_parser.SourceParser(
        'Elsevier website: Article Sharing',
        url='https://www.elsevier.com/about/policies/sharing')
    ELSEVIER_WEBSITE_ID = elsevier_website_parser.match_or_create_source()

    # Match or create the publisher node.
    p_parser = generic_parser.NodeParser(name='Elsevier', type='PUBLISHER',
                                         source=ELSEVIER_WEBSITE_ID)
    p_node_id, p_node_record, p_match_type = p_parser.match2node()
    p_parser.node_id, p_parser.node_record = generic_parser.act_on_orpheus_match(
        p_parser, p_node_id, p_node_record, p_match_type)

    # preprint policy: no embargo, any of the listed outlets
    preprint_policy = generic_parser.GreenPolicyInstance()
    preprint_policy.outlet = [INST_REPO_ID, SUBJ_REPO_ID, WEBSITE_ID,
                              COMMERCIAL_ID, PUBMED_ID, SOCIAL_ID]
    preprint_policy.version = [PREPRINT_ID]
    preprint_policy.version_embargo_months = 0
    preprint_policy.version_green_licence = CCBYNCND_ID
    preprint_policy.source = ELSEVIER_WEBSITE_ID
    preprint_policy.verbatim = preprint_verbatim

    # AM policy for personal websites: no embargo
    am_policy1 = generic_parser.GreenPolicyInstance()
    am_policy1.outlet = [WEBSITE_ID]
    am_policy1.version = [AM_ID]
    am_policy1.version_embargo_months = 0
    am_policy1.version_green_licence = CCBYNCND_ID
    am_policy1.source = ELSEVIER_WEBSITE_ID
    am_policy1.verbatim = am_verbatim

    # AM policy for non-commercial hosting platforms;
    # version_embargo_months is set per journal inside the row loop below.
    am_policy2 = generic_parser.GreenPolicyInstance()
    am_policy2.outlet = [INST_REPO_ID, SUBJ_REPO_ID, PUBMED_ID]
    am_policy2.version = [AM_ID]
    am_policy2.version_green_licence = CCBYNCND_ID
    am_policy2.source = ELSEVIER_SOURCE_ID
    am_policy2.verbatim = am_verbatim

    # VoR deposit is not allowed anywhere (applied to hybrid titles only).
    vor_policy = generic_parser.GreenPolicyInstance()
    vor_policy.outlet = ALL_OUTLETS
    vor_policy.deposit_allowed = False
    vor_policy.version = [VOR_ID]
    vor_policy.source = ELSEVIER_WEBSITE_ID
    vor_policy.verbatim = vor_verbatim

    inputfile = os.path.join(
        BASE_DIR, 'datasets',
        'Elsevier_embargo_periods_by_journal_2013-2018_v_1.14_sheet_UK-2018.csv')
    with open(inputfile, encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        row_counter = 0
        for row in reader:
            row_counter += 1
            oastatus = 'HYBRID'
            issn = row['ISSN'].strip()
            jname = row['Journal Name'].strip()
            aam_embargo = row['Embargo Period (months)'].strip()
            if aam_embargo in ['0 / 12', '0 / 24']:
                # 0/12 month titles are now OA (no embargo) but 12 months
                # for pre-OA papers
                aam_embargo = '0'
            if aam_embargo == '0':
                oastatus = 'FULLY_OA'
            logger.info('-------({}) Working on journal {}; oastatus: {}; issn: {}; '
                        'AM embargo: {}'.format(row_counter, jname, oastatus,
                                                issn, aam_embargo))

            j_parser = generic_parser.NodeParser(name=jname, publisher='Elsevier',
                                                 source=ELSEVIER_SOURCE_ID,
                                                 publisher_node_id=p_parser.node_id)
            j_parser.issn = issn
            j_parser.oa_status = oastatus
            # No need to match2romeo_publisher for big publisher datasets.

            # Attempt to find a match in Orpheus
            node_id, node_record, match_type = j_parser.match2node()
            j_parser.node_id, j_parser.node_record = generic_parser.act_on_orpheus_match(
                j_parser, node_id, node_record, match_type)
            logger.debug('j_parser.node_record: {}'.format(j_parser.node_record))

            # determine Orpheus id of preferred name
            if j_parser.node_record['name_status'] not in ['PRIMARY']:
                preferred_node_id = j_parser.node_record['synonym_of']
            else:
                preferred_node_id = j_parser.node_id

            # parsing green policies
            preprint_policy.node = preferred_node_id
            am_policy1.node = preferred_node_id
            am_policy2.node = preferred_node_id
            # BUGFIX: the spreadsheet value is a string; every other
            # version_embargo_months in this function is an int, so convert
            # for consistency. Non-numeric values (if any remain after the
            # '0 / 12' mapping above) are passed through unchanged —
            # TODO confirm the dataset only contains plain integers here.
            am_policy2.version_embargo_months = (
                int(aam_embargo) if aam_embargo.isdigit() else aam_embargo)

            # Attach policies to preferred name node
            logger.debug('Calling j_parser.PolicyMatcher(j_parser, policy_type=oa_status).match()')
            j_parser.PolicyMatcher(j_parser, policy_type='oa_status').match()
            logger.debug('Calling j_parser.server_data_match_green_policy(**preprint_policy.as_dict())')
            j_parser.PolicyMatcher(j_parser, policy_type='green').match(**preprint_policy.as_dict())
            logger.debug('Calling j_parser.server_data_match_green_policy(**am_policy1.as_dict())')
            j_parser.PolicyMatcher(j_parser, policy_type='green').match(**am_policy1.as_dict())
            logger.debug('Calling j_parser.server_data_match_green_policy(**am_policy2.as_dict())')
            j_parser.PolicyMatcher(j_parser, policy_type='green').match(**am_policy2.as_dict())
            if j_parser.oa_status != 'FULLY_OA':
                vor_policy.node = preferred_node_id
                logger.debug('Calling j_parser.server_data_match_green_policy(**vor_policy.as_dict())')
                j_parser.PolicyMatcher(j_parser, policy_type='green').match(**vor_policy.as_dict())
def main():
    """Load the CUP APC price list and attach OA status, green and gold policies in Orpheus.

    For each journal row: maps CUP's OA-status and embargo wordings to Orpheus
    codes, matches or creates the journal node, then attaches an OA status
    policy, a zero-embargo preprint policy, the reduced set of green policies
    computed by ``optimal_green_policies()``, and a gold policy with parsed
    APC values and licence options.

    Side output: ``cup_green_combinations.csv`` listing every distinct
    embargo combination seen in the dataset. Relies on module-level constants
    (outlet/version/licence IDs, ``CUP_GREEN_VERBATIM``) and the
    ``generic_parser`` helpers; returns nothing.
    """
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))

    # Register data sources in Orpheus and obtain their ids.
    source_parser = generic_parser.SourceParser('CUP APC Price List 2019.04 24.i.2019',
                                                url=None)
    SOURCE_ID = source_parser.match_or_create_source()
    cup_website_parser = generic_parser.SourceParser(
        'CUP website: Green Open Access Policy for Journals',
        url='https://www.cambridge.org/core/services/open-access-policies/open-access-journals/'
            'green-open-access-policy-for-journals')
    CUP_WEBSITE_ID = cup_website_parser.match_or_create_source()

    # Match or create the publisher node.
    p_parser = generic_parser.NodeParser(name='Cambridge University Press',
                                         type='PUBLISHER', source=SOURCE_ID)
    p_node_id, p_node_record, p_match_type = p_parser.match2node()
    p_parser.node_id, p_parser.node_record = generic_parser.act_on_orpheus_match(
        p_parser, p_node_id, p_node_record, p_match_type)

    inputfile = os.path.join(BASE_DIR, 'datasets',
                             'Cambridge-Journals-APC-price-list-2019.04.csv')

    # CUP spreadsheet wording -> Orpheus OA status code.
    cup2orpheus_status = {
        '': 'SUBSCRIPTION',
        'Hybrid OA': 'HYBRID',
        'No OA': 'SUBSCRIPTION',
        'Full OA': 'FULLY_OA'
    }
    # CUP embargo wording -> months; 999 is a sentinel meaning deposit DISALLOWED.
    cup_embargo2months = {
        'On acceptance': 0,
        'On Acceptance': 0,
        'On acceptance (SSRN deposit permitted)': 0,
        'On publication': 0,
        "Publisher's version pdf, no sooner than first publication of the article": 0,
        '5 months after publication': 5,
        '6 months after publication': 6,
        '6months after publication': 6,
        "Publisher's version pdf, no sooner than six months after first publication of the article": 6,
        '12 months after acceptance': 12,
        '12 months after publication': 12,
        '13 months after publication': 13,
        'Abstract only plus link to Cambridge site': 999,  # DISALLOWED
        'Abstract only in PDF or HTML, no sooner than publication of full article': 999  # DISALLOWED
    }
    cup_licences2orpheus_ids = {
        'CC-BY': CCBY_ID,
        'CC-BY-NC': CCBYNC_ID,
        'CC-BY-NC-SA': CCBYNCSA_ID,
        'CC-BY-NC-ND': CCBYNCND_ID
    }

    # APC amounts look like "£1,570" / "$2,025".
    # BUGFIX: raw string for the dollar pattern — '\$' in a plain string is an
    # invalid escape (DeprecationWarning); the compiled pattern is unchanged.
    tpounds = re.compile('£[0-9,]+')
    tdollars = re.compile(r'\$[0-9,]+')

    green_combinations = []  # distinct embargo combinations, for the CSV report
    # NOTE(review): embargo_strings is never appended to anywhere in this
    # function, so the pprint() below always prints [] — confirm intent.
    embargo_strings = []

    with open(inputfile, encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        row_counter = 0
        for row in reader:
            row_counter += 1
            # NOTE(review): the replace() below may have lost its original
            # first argument (double space or NBSP) in formatting — confirm
            # against the file history.
            jname = row['Journal'].replace(' ', ' ').strip()
            logger.info('-------({}) Working on journal {}'.format(row_counter, jname))
            jurl = row['URL'].strip()
            issn = row['ISSN'].strip()
            eissn = row['eISSN'].strip()
            oastatus = cup2orpheus_status[row['Open Access status'].strip()]
            apc_data = row['Gold OA APC (plus tax, where applied)'].strip()
            mpounds = tpounds.findall(apc_data)
            mdollars = tdollars.findall(apc_data)
            # Trailing space in the column name is present in the dataset.
            licence_data = row['Gold OA CC licence options '].strip().split(' / ')

            # Each triple is [embargo months, outlet ids, version ids].
            webpage_AAM = [cup_embargo2months[row["Author's personal web page Accepted Manuscript"].strip()],
                           [WEBSITE_ID], [AM_ID]]
            webpage_VoR = [cup_embargo2months[row["Author's personal web page Version of Record"].strip()],
                           [WEBSITE_ID], [VOR_ID]]
            inst_repo_AAM = [cup_embargo2months[row['Departmental web page / Institutional Repository Accepted Manuscript'].strip()],
                             [INST_REPO_ID], [AM_ID]]
            inst_repo_VoR = [cup_embargo2months[row['Departmental web page / Institutional Repository Version of Record'].strip()],
                             [INST_REPO_ID], [VOR_ID]]
            pmc_AAM = [cup_embargo2months[row['Non-commercial Repository / Subject Repository Accepted Manuscript'].strip()],
                       [SUBJ_REPO_ID, PUBMED_ID], [AM_ID]]
            pmc_VoR = [cup_embargo2months[row['Non-commercial Repository / Subject Repository Version of Record'].strip()],
                       [SUBJ_REPO_ID, PUBMED_ID], [VOR_ID]]
            social_AAM = [cup_embargo2months[row['Commercial Repository / Social Media Site Accepted Manuscript'].strip()],
                          [SOCIAL_ID], [AM_ID]]
            social_VoR = [cup_embargo2months[row['Commercial Repository / Social Media Site Version of Record'].strip()],
                          [SOCIAL_ID], [VOR_ID]]

            # collect all green policy combinations that appear in the dataset
            green_comb = [webpage_AAM[0], webpage_VoR[0], inst_repo_AAM[0], inst_repo_VoR[0],
                          pmc_AAM[0], pmc_VoR[0], social_AAM[0], social_VoR[0]]
            if green_comb not in green_combinations:
                green_combinations.append(green_comb)
            # continue  # uncomment this to only produce the spreadsheet of combinations

            # Merge equivalent outlet/version pairs into the minimal policy set.
            policies_array = optimal_green_policies(green_comb)

            # region apc data parsing
            apc_list = []
            apc_currency = None
            apc_value_min = None
            apc_value_max = None
            if mpounds:
                apc_currency = 'GBP'
                for apc in mpounds:
                    apc_list.append(int(apc.replace(',', '').replace('£', '').strip()))
            elif mdollars:
                apc_currency = 'USD'
                for apc in mdollars:
                    apc_list.append(int(apc.replace(',', '').replace('$', '').strip()))
            apc_list.sort()
            if apc_list:
                apc_value_min = apc_list[0]
                apc_value_max = apc_list[-1]
            # endregion

            licence_options = []
            for l in licence_data:
                if l:
                    licence_options.append(cup_licences2orpheus_ids[l])

            j_parser = generic_parser.NodeParser(name=jname,
                                                 publisher='Cambridge University Press',
                                                 source=SOURCE_ID, issn=issn, eissn=eissn,
                                                 url=jurl,
                                                 publisher_node_id=p_parser.node_id)
            j_parser.oa_status = oastatus

            # Attempt to find a match in Orpheus
            node_id, node_record, match_type = j_parser.match2node()
            j_parser.node_id, j_parser.node_record = generic_parser.act_on_orpheus_match(
                j_parser, node_id, node_record, match_type)
            logger.debug('j_parser.node_record: {}'.format(j_parser.node_record))

            # determine Orpheus id of preferred name
            if j_parser.node_record['name_status'] not in ['PRIMARY']:
                preferred_node_id = j_parser.node_record['synonym_of']
            else:
                preferred_node_id = j_parser.node_id

            # parsing OA status
            logger.debug('Calling j_parser.PolicyMatcher(j_parser, policy_type=oa_status).match()')
            j_parser.PolicyMatcher(j_parser, policy_type='oa_status').match()

            # parsing green policies: preprints may be deposited anywhere, immediately
            preprint = generic_parser.GreenPolicyInstance()
            preprint.outlet = ALL_OUTLETS
            preprint.version = [PREPRINT_ID]
            preprint.version_embargo_months = 0
            preprint.version_green_licence = CUSTOM_ID
            preprint.source = CUP_WEBSITE_ID
            preprint.verbatim = CUP_GREEN_VERBATIM
            preprint.node = preferred_node_id
            logger.debug('Calling j_parser.server_data_match_green_policy(**preprint.as_dict())')
            j_parser.PolicyMatcher(j_parser, policy_type='green').match(**preprint.as_dict())

            for gp in policies_array:
                green = generic_parser.GreenPolicyInstance()
                green.outlet = gp[1]
                green.version = gp[2]
                if gp[0] == 999:  # sentinel: deposit disallowed
                    green.deposit_allowed = False
                else:
                    green.version_embargo_months = gp[0]
                green.version_green_licence = CUSTOM_ID
                green.verbatim = CUP_GREEN_VERBATIM
                green.source = SOURCE_ID
                green.node = preferred_node_id
                logger.debug('Calling j_parser.server_data_match_green_policy(**green.as_dict())')
                j_parser.PolicyMatcher(j_parser, policy_type='green').match(**green.as_dict())

            # parsing gold policy
            # NOTE(review): licence_data is always truthy (''.split(' / ')
            # yields ['']), so this branch runs for every row — confirm
            # whether `licence_options` was the intended check.
            if apc_list or licence_data:
                gold = generic_parser.GoldPolicyInstance()
                gold.apc_currency = apc_currency
                gold.apc_value_min = apc_value_min
                gold.apc_value_max = apc_value_max
                gold.source = SOURCE_ID
                gold.licence_options = licence_options
                gold.apc_note = apc_data
                gold.node = preferred_node_id
                logger.debug('Calling j_parser.PolicyMatcher(j_parser, policy_type=gold).match(**gold.as_dict())')
                j_parser.PolicyMatcher(j_parser, policy_type='gold').match(**gold.as_dict())

    pprint(embargo_strings)

    # Report every distinct embargo combination found in the dataset.
    with open('cup_green_combinations.csv', 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['webpage_AAM', 'webpage_VoR', 'inst_repo_AAM', 'inst_repo_VoR',
                         'pmc_AAM', 'pmc_VoR', 'social_AAM', 'social_VoR'])
        for c in green_combinations:
            writer.writerow(c)