def test_normalize_id_with_cache(self):
    identifiers = ['doi:10.1123/ijatt']
    output_data = list()
    csv_manager = CSVManager()
    csv_manager.data = {'10.1123/ijatt.2015-0070': {'v'}}
    for id in identifiers:
        output_data.append(
            Cleaner(id).normalize_id(valid_dois_cache=csv_manager))
    expected_data = [None]
    expected_cache = {
        '10.1123/ijatt.2015-0070': {'v'},
        '10.1123/ijatt': {'i'}
    }
    output = (csv_manager.data, output_data)
    expected_output = (expected_cache, expected_data)
    self.assertEqual(output, expected_output)
def test_get_agents_strings_list_overlapping_surnames(self):
    # The surname of one author is included in the surname of another.
    authors_list = [
        {"given": "Puvaneswari", "family": "Paravamsivam", "sequence": "first", "affiliation": []},
        {"given": "Chua Kek", "family": "Heng", "sequence": "additional", "affiliation": []},
        {"given": "Sri Nurestri Abdul", "family": "Malek", "sequence": "additional", "affiliation": []},
        {"given": "Vikineswary", "family": "Sabaratnam", "sequence": "additional", "affiliation": []},
        {"given": "Ravishankar Ram", "family": "M", "sequence": "additional", "affiliation": []},
        {"given": "Umah Rani", "family": "Kuppusamy", "sequence": "additional", "affiliation": []}
    ]
    crossref_processor = CrossrefProcessing(None, None)
    csv_manager = CSVManager()
    csv_manager.data = {'10.9799/ksfan.2012.25.1.105': {'Malek, Sri Nurestri Abdul [orcid:0000-0001-6278-8559]'}}
    crossref_processor.orcid_index = csv_manager
    authors_strings_list = crossref_processor.get_agents_strings_list(
        '10.9799/ksfan.2012.25.1.105', authors_list)
    expected_authors_list = [
        'Paravamsivam, Puvaneswari',
        'Heng, Chua Kek',
        'Malek, Sri Nurestri Abdul [orcid:0000-0001-6278-8559]',
        'Sabaratnam, Vikineswary',
        'M, Ravishankar Ram',
        'Kuppusamy, Umah Rani'
    ]
    self.assertEqual(authors_strings_list, expected_authors_list)
def __init__(self, valid_doi=None, use_api_service=True):
    if valid_doi is None:
        valid_doi = CSVManager()
    self.api = "https://doi.org/api/handles/"
    self.valid_doi = valid_doi
    self.use_api_service = use_api_service
    self.p = "doi:"
    super(DOIManager, self).__init__()
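# Usage sketch for DOIManager (illustrative; the package imports are assumed
# to be in scope, module paths omitted as they depend on the repo layout).
# With use_api_service=False no request is sent to
# https://doi.org/api/handles/, so normalise() works purely syntactically,
# as in the Index_orcid_doi class below.
cache = CSVManager()  # shared validity cache, as in the constructor above
doi_manager = DOIManager(valid_doi=cache, use_api_service=False)
normalised_doi = doi_manager.normalise('10.1123/ijatt.2015-0070')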
def test_normalize_id(self):
    identifiers = [
        'doi:10.1123/ijatt.2015-0070', 'doi:1',
        'orcid:0000-0003-0530-4305', 'orcid:0000-0000',
        'issn:1479-6708', 'issn:0000-0000',
        'isbn:9783319403120', 'isbn:0000-0000'
    ]
    output = list()
    csv_manager = CSVManager()
    for id in identifiers:
        output.append(
            Cleaner(id).normalize_id(valid_dois_cache=csv_manager))
    expected_output = [
        'doi:10.1123/ijatt.2015-0070', None,
        'orcid:0000-0003-0530-4305', None,
        'issn:1479-6708', None,
        'isbn:9783319403120', None
    ]
    self.assertEqual(output, expected_output)
def __init__(self, config: str):
    with open(config, encoding='utf-8') as file:
        settings = yaml.full_load(file)
    # Mandatory settings
    self.triplestore_url = settings['triplestore_url']
    self.input_csv_dir = normalize_path(settings['input_csv_dir'])
    self.base_output_dir = normalize_path(settings['base_output_dir'])
    self.resp_agent = settings['resp_agent']
    self.info_dir = os.path.join(self.base_output_dir, 'info_dir')
    self.output_csv_dir = os.path.join(self.base_output_dir, 'csv')
    self.output_rdf_dir = os.path.join(self.base_output_dir, f'rdf{os.sep}')
    self.indexes_dir = os.path.join(self.base_output_dir, 'indexes')
    self.cache_path = os.path.join(self.base_output_dir, 'cache.txt')
    self.errors_path = os.path.join(self.base_output_dir, 'errors.txt')
    # Optional settings
    self.base_iri = settings['base_iri']
    self.context_path = settings['context_path']
    self.dir_split_number = settings['dir_split_number']
    self.items_per_file = settings['items_per_file']
    self.default_dir = settings['default_dir']
    self.rdf_output_in_chunks = settings['rdf_output_in_chunks']
    self.source = settings['source']
    self.valid_dois_cache = CSVManager() \
        if bool(settings['use_doi_api_service']) else None
    self.workers_number = int(settings['workers_number'])
    supplier_prefix: str = settings['supplier_prefix']
    self.supplier_prefix = supplier_prefix[:-1] \
        if supplier_prefix.endswith('0') else supplier_prefix
    self.verbose = settings['verbose']
    # Time-Agnostic_library integration
    self.time_agnostic_library_config = os.path.join(
        os.path.dirname(config), 'time_agnostic_library_config.json')
    if not os.path.exists(self.time_agnostic_library_config):
        generate_config_file(
            config_path=self.time_agnostic_library_config,
            dataset_urls=[self.triplestore_url],
            dataset_dirs=list(),
            provenance_urls=settings['provenance_endpoints'],
            provenance_dirs=list(),
            blazegraph_full_text_search=settings['blazegraph_full_text_search'],
            graphdb_connector_name=settings['graphdb_connector_name'],
            cache_endpoint=settings['cache_endpoint'],
            cache_update_endpoint=settings['cache_update_endpoint'])
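# A minimal configuration sketch for the constructor above. Every key the
# constructor reads must be present, since the "optional" settings are
# accessed unconditionally; all values below are placeholders, not a
# tested setup.
import yaml

example_settings = {
    # Mandatory settings
    'triplestore_url': 'http://localhost:9999/blazegraph/sparql',
    'input_csv_dir': './input_csv',
    'base_output_dir': './output',
    'resp_agent': 'https://orcid.org/0000-0000-0000-0000',
    # Optional settings
    'base_iri': 'https://w3id.org/oc/meta/',
    'context_path': None,
    'dir_split_number': 10000,
    'items_per_file': 1000,
    'default_dir': '_',
    'rdf_output_in_chunks': False,
    'source': None,
    'use_doi_api_service': False,
    'workers_number': 1,
    'supplier_prefix': '060',
    'verbose': True,
    # Time-Agnostic_library integration
    'provenance_endpoints': [],
    'blazegraph_full_text_search': False,
    'graphdb_connector_name': None,
    'cache_endpoint': None,
    'cache_update_endpoint': None,
}

with open('config.yaml', 'w', encoding='utf-8') as f:
    yaml.dump(example_settings, f)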
class Index_orcid_doi:
    def __init__(self, output_path: str, threshold: int = 10000,
                 low_memory: bool = False, verbose: bool = False):
        self.file_counter = 0
        self.threshold = 10000 if not threshold else int(threshold)
        self.verbose = verbose
        if self.verbose:
            print("[INFO: CSVManager] Loading existing csv file")
        self.doimanager = DOIManager(use_api_service=False)
        self.csvstorage = CSVManager(output_path=output_path,
                                     line_threshold=threshold,
                                     low_memory=low_memory)
        # ORCID iDs are extracted from an existing CSV at the first reading,
        # so that the corresponding summary files can be skipped.
        self.cache = set(
            el.split("[")[1][:-1].strip()
            for _, v in self.csvstorage.data.items() for el in v)

    def explorer(self, summaries_path: str) -> None:
        if self.verbose:
            print("[INFO: Index_orcid_doi] Counting files to process")
        files_to_process = [
            os.path.join(fold, file)
            for fold, _, files in os.walk(summaries_path)
            for file in files
            if file.replace('.xml', '') not in self.cache
        ]
        processed_files = len(self.cache)
        del self.cache
        if self.verbose:
            pbar = tqdm(total=len(files_to_process))
        for file in files_to_process:
            self.finder(file)
            self.file_counter += 1
            cur_file = self.file_counter + processed_files
            if self.file_counter % self.threshold == 0:
                self.csvstorage.dump_data(
                    f'{cur_file-self.threshold+1}-{cur_file}.csv')
            if self.verbose:
                pbar.update(1)
        cur_file = self.file_counter + processed_files
        self.csvstorage.dump_data(
            f'{cur_file + 1 - (cur_file % self.threshold)}-{cur_file}.csv')
        if self.verbose:
            pbar.close()

    def finder(self, file: str):
        orcid = file.replace('.xml', '')[-19:]
        valid_doi = False
        if file.endswith('.xml'):
            with open(file, 'r', encoding='utf-8') as xml_file:
                xml_soup = BeautifulSoup(xml_file, 'xml')
                ids = xml_soup.findAll('common:external-id')
                if ids:
                    for el in ids:
                        id_type = el.find('common:external-id-type')
                        rel = el.find('common:external-id-relationship')
                        if id_type and rel:
                            if id_type.get_text().lower() == 'doi' \
                                    and rel.get_text().lower() == 'self':
                                doi = el.find(
                                    'common:external-id-value').get_text()
                                doi = self.doimanager.normalise(doi)
                                if doi:
                                    g_name = xml_soup.find(
                                        'personal-details:given-names')
                                    f_name = xml_soup.find(
                                        'personal-details:family-name')
                                    if f_name:
                                        f_name = f_name.get_text()
                                        if g_name:
                                            g_name = g_name.get_text()
                                            name = f_name + ', ' + g_name
                                        else:
                                            name = f_name
                                        auto = name + ' [' + orcid + ']'
                                        valid_doi = True
                                        self.csvstorage.add_value(doi, auto)
        if not valid_doi:
            # Save file names where nothing was found, to skip them during
            # the next run
            self.csvstorage.add_value('None', f'[{orcid}]')
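# Usage sketch for Index_orcid_doi (illustrative; both paths are
# hypothetical and the package imports are assumed to be in scope).
# explorer() walks a directory of ORCID summary XML files, extracts
# DOI -> 'Family, Given [ORCID]' pairs via finder(), and dumps a CSV chunk
# every `threshold` processed files; ORCID iDs already recorded in an
# existing CSV at output_path are skipped thanks to self.cache.
indexer = Index_orcid_doi(output_path='./orcid_doi_index',
                          threshold=10000, low_memory=False, verbose=True)
indexer.explorer(summaries_path='./orcid_summaries')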
class CrossrefProcessing:

    def __init__(self, orcid_index: str = None, doi_csv: str = None,
                 publishers_filepath: str = None):
        self.doi_set = CSVManager.load_csv_column_as_set(
            doi_csv, 'doi') if doi_csv else None
        self.publishers_mapping = self.load_publishers_mapping(
            publishers_filepath) if publishers_filepath else None
        self.orcid_index = CSVManager(orcid_index)

    def csv_creator(self, data: dict) -> list:
        data = data['items']
        output = list()
        for x in data:
            if 'DOI' not in x:
                continue
            if isinstance(x['DOI'], list):
                doi = DOIManager().normalise(str(x['DOI'][0]))
            else:
                doi = DOIManager().normalise(str(x['DOI']))
            # Process the record only if no DOI filter was given
            # or the DOI belongs to it
            if doi and (not self.doi_set or doi in self.doi_set):
                # Create an empty row
                row = dict()
                keys = ['id', 'title', 'author', 'pub_date', 'venue',
                        'volume', 'issue', 'page', 'type', 'publisher',
                        'editor']
                for k in keys:
                    row[k] = ''
                if 'type' in x:
                    if x['type']:
                        row['type'] = x['type'].replace('-', ' ')
                # row['id']
                idlist = list()
                idlist.append(str('doi:' + doi))
                if 'ISBN' in x:
                    if row['type'] in {'book', 'dissertation', 'edited book',
                                       'monograph', 'reference book',
                                       'report', 'standard'}:
                        self.id_worker(x['ISBN'], idlist, self.isbn_worker)
                if 'ISSN' in x:
                    if row['type'] in {'book series', 'book set', 'journal',
                                       'proceedings series', 'series',
                                       'standard series'}:
                        self.id_worker(x['ISSN'], idlist, self.issn_worker)
                    elif row['type'] == 'report series':
                        br_id = True
                        if 'container-title' in x:
                            if x['container-title']:
                                br_id = False
                        if br_id:
                            self.id_worker(x['ISSN'], idlist,
                                           self.issn_worker)
                row['id'] = ' '.join(idlist)
                # row['title']
                if 'title' in x:
                    if x['title']:
                        if isinstance(x['title'], list):
                            text_title = x['title'][0]
                        else:
                            text_title = x['title']
                        soup = BeautifulSoup(text_title, 'html.parser')
                        title_soup = soup.get_text().replace('\n', '')
                        title = html.unescape(title_soup)
                        row['title'] = title
                # row['author']
                if 'author' in x:
                    autlist = self.get_agents_strings_list(doi, x['author'])
                    row['author'] = '; '.join(autlist)
                # row['pub_date']
                if 'issued' in x:
                    if x['issued']['date-parts'][0][0]:
                        row['pub_date'] = '-'.join(
                            [str(y) for y in x['issued']['date-parts'][0]])
                    else:
                        row['pub_date'] = ''
                # row['venue']
                row['venue'] = self.get_venue_name(x, row)
                if 'volume' in x:
                    row['volume'] = x['volume']
                if 'issue' in x:
                    row['issue'] = x['issue']
                if 'page' in x:
                    row['page'] = self.get_pages(x)
                row['publisher'] = self.get_publisher_name(doi, x)
                if 'editor' in x:
                    editlist = self.get_agents_strings_list(doi, x['editor'])
                    row['editor'] = '; '.join(editlist)
                output.append(row)
        return output

    def orcid_finder(self, doi: str) -> dict:
        found = dict()
        doi = doi.lower()
        people: List[str] = self.orcid_index.get_value(doi)
        if people:
            for person in people:
                orcid = re.search(orcid_pattern, person).group(0)
                name: str = person[:person.find(orcid) - 1]
                found[orcid] = name.strip().lower()
        return found

    def get_pages(self, item: dict) -> str:
        '''
        This function returns the pages interval.

        :params item: the item's dictionary
        :type item: dict
        :returns: str -- The output is a string in the format 'START-END', for example, '583-584'. If there are no pages, the output is an empty string.
        '''
        roman_letters = {'I', 'V', 'X', 'L', 'C', 'D', 'M'}
        pages_list = re.split(pages_separator, item['page'])
        clean_pages_list = list()
        for page in pages_list:
            # e.g. 583-584
            if all(c.isdigit() for c in page):
                clean_pages_list.append(page)
            # e.g. G27. It is a born-digital document: PeerJ uses this
            # approach, where G27 identifies the whole document, since it
            # has no pages.
            elif len(pages_list) == 1:
                clean_pages_list.append(page)
            # e.g. iv-vii. This syntax is used in prefaces.
            elif all(c.upper() in roman_letters for c in page):
                clean_pages_list.append(page)
            # e.g. 583b-584. It is an error: the 'b' must be removed.
            elif any(c.isdigit() for c in page):
                page_without_letters = ''.join(
                    [c for c in page if c.isdigit()])
                clean_pages_list.append(page_without_letters)
        pages = '-'.join(clean_pages_list)
        return pages

    def get_publisher_name(self, doi: str, item: dict) -> str:
        '''
        This function aims to return a publisher's name and id. If a mapping was provided, it is used to find the publisher's standardized name from its id or DOI prefix.

        :params doi: the item's DOI
        :type doi: str
        :params item: the item's dictionary
        :type item: dict
        :returns: str -- The output is a string in the format 'NAME [SCHEMA:ID]', for example, 'American Medical Association (AMA) [crossref:10]'. If the id does not exist, the output is only the name. Finally, if there is no publisher, the output is an empty string.
        '''
        data = {'publisher': '', 'member': None,
                'prefix': doi.split('/')[0]}
        for field in {'publisher', 'member', 'prefix'}:
            if field in item:
                if item[field]:
                    data[field] = item[field]
        publisher = data['publisher']
        member = data['member']
        prefix = data['prefix']
        relevant_member = False
        if self.publishers_mapping and member:
            if member in self.publishers_mapping:
                relevant_member = True
        if self.publishers_mapping:
            if relevant_member:
                name = self.publishers_mapping[member]['name']
                name_and_id = f'{name} [crossref:{member}]'
            else:
                member_dict = next(
                    ({member: data} for member, data
                     in self.publishers_mapping.items()
                     if prefix in data['prefixes']), None)
                if member_dict:
                    member = list(member_dict.keys())[0]
                    name_and_id = f"{member_dict[member]['name']} [crossref:{member}]"
                else:
                    name_and_id = publisher
        else:
            name_and_id = f'{publisher} [crossref:{member}]' \
                if member else publisher
        return name_and_id

    def get_venue_name(self, item: dict, row: dict) -> str:
        '''
        This method deals with generating the venue's name, followed by the id in square brackets, separated by spaces. HTML tags are deleted and HTML entities escaped. In addition, any ISBN and ISSN are validated. Finally, the square brackets in the venue name are replaced by round brackets to avoid conflicts with the ids' enclosures.

        :params item: the item's dictionary
        :type item: dict
        :params row: a CSV row
        :type row: dict
        :returns: str -- The output is a string in the format 'NAME [SCHEMA:ID]', for example, 'Nutrition & Food Science [issn:0034-6659]'. If the id does not exist, the output is only the name. Finally, if there is no venue, the output is an empty string.
        '''
        name_and_id = ''
        if 'container-title' in item:
            if item['container-title']:
                if isinstance(item['container-title'], list):
                    ventit = str(item['container-title'][0]).replace('\n', '')
                else:
                    ventit = str(item['container-title']).replace('\n', '')
                ven_soup = BeautifulSoup(ventit, 'html.parser')
                ventit = html.unescape(ven_soup.get_text())
                ambiguous_brackets = re.search(
                    ids_inside_square_brackets, ventit)
                if ambiguous_brackets:
                    match = ambiguous_brackets.group(1)
                    open_bracket = ventit.find(match) - 1
                    close_bracket = ventit.find(match) + len(match)
                    ventit = ventit[:open_bracket] + '(' \
                        + ventit[open_bracket + 1:]
                    ventit = ventit[:close_bracket] + ')' \
                        + ventit[close_bracket + 1:]
                venidlist = list()
                if 'ISBN' in item:
                    if row['type'] in {'book chapter', 'book part',
                                       'book section', 'book track',
                                       'reference entry'}:
                        self.id_worker(item['ISBN'], venidlist,
                                       self.isbn_worker)
                if 'ISSN' in item:
                    if row['type'] in {'book', 'data file', 'dataset',
                                       'edited book', 'journal article',
                                       'journal volume', 'journal issue',
                                       'monograph', 'proceedings',
                                       'peer review', 'reference book',
                                       'reference entry', 'report'}:
                        self.id_worker(item['ISSN'], venidlist,
                                       self.issn_worker)
                    elif row['type'] == 'report series':
                        if 'container-title' in item:
                            if item['container-title']:
                                self.id_worker(item['ISSN'], venidlist,
                                               self.issn_worker)
                if venidlist:
                    name_and_id = ventit + ' [' + ' '.join(venidlist) + ']'
                else:
                    name_and_id = ventit
        return name_and_id

    def get_agents_strings_list(self, doi: str,
                                agents_list: List[dict]) -> list:
        agents_strings_list = list()
        dict_orcid = None
        if not all('ORCID' in agent for agent in agents_list):
            dict_orcid = self.orcid_finder(doi)
        agents_list = [
            {k: Cleaner(v).remove_unwanted_characters()
             if k in {'family', 'given', 'name'} else v
             for k, v in agent_dict.items()}
            for agent_dict in agents_list]
        for agent in agents_list:
            f_name = None
            g_name = None
            agent_string = None
            if 'family' in agent:
                f_name = agent['family']
                if 'given' in agent:
                    g_name = agent['given']
                    agent_string = f_name + ', ' + g_name
                else:
                    agent_string = f_name + ', '
            elif 'name' in agent:
                agent_string = agent['name']
                f_name = agent_string.split()[-1] \
                    if ' ' in agent_string else None
            elif 'given' in agent and 'family' not in agent:
                agent_string = ', ' + agent['given']
            orcid = None
            if 'ORCID' in agent:
                if isinstance(agent['ORCID'], list):
                    orcid = str(agent['ORCID'][0])
                else:
                    orcid = str(agent['ORCID'])
                orcid = ORCIDManager().normalise(orcid) \
                    if ORCIDManager().is_valid(orcid) else None
            elif dict_orcid and f_name:
                for ori in dict_orcid:
                    orc_n: List[str] = dict_orcid[ori].split(', ')
                    orc_f = orc_n[0].lower()
                    orc_g = orc_n[1] if len(orc_n) == 2 else None
                    if f_name.lower() in orc_f or orc_f in f_name.lower():
                        # If there are several authors with the same surname
                        if len([person for person in agents_list
                                if 'family' in person if person['family']
                                if person['family'].lower() in orc_f
                                or orc_f in person['family'].lower()
                                ]) > 1 and g_name and orc_g:
                            # If there are several authors with the same
                            # surname and the same given name's initial
                            if len([person for person in agents_list
                                    if 'given' in person if person['given']
                                    if person['given'][0].lower()
                                    == orc_g[0].lower()]) > 1:
                                # If there are no homonyms
                                if not len([person for person in agents_list
                                            if 'given' in person
                                            if person['given']
                                            if person['given'].lower()
                                            == orc_g.lower()]) > 1:
                                    if orc_g.lower() == g_name.lower():
                                        orcid = ori
                            elif orc_g[0].lower() == g_name[0].lower():
                                orcid = ori
                        else:
                            orcid = ori
            if agent_string and orcid:
                agent_string += ' [' + 'orcid:' + str(orcid) + ']'
            if agent_string:
                agents_strings_list.append(agent_string)
        return agents_strings_list

    @staticmethod
    def id_worker(field, idlist: list, func) -> None:
        if isinstance(field, list):
            for i in field:
                func(str(i), idlist)
        else:
            id = str(field)
            func(id, idlist)

    @staticmethod
    def issn_worker(issnid, idlist):
        if ISSNManager().is_valid(issnid):
            issnid = ISSNManager().normalise(issnid, include_prefix=True)
            idlist.append(issnid)

    @staticmethod
    def isbn_worker(isbnid, idlist):
        if ISBNManager().is_valid(isbnid):
            isbnid = ISBNManager().normalise(isbnid, include_prefix=True)
            idlist.append(isbnid)

    @staticmethod
    def load_publishers_mapping(publishers_filepath: str) -> dict:
        publishers_mapping: Dict[str, Dict[str, set]] = dict()
        with open(publishers_filepath, 'r', encoding='utf-8') as f:
            data = DictReader(f)
            for row in data:
                id = row['id']
                publishers_mapping.setdefault(id, dict())
                publishers_mapping[id]['name'] = row['name']
                publishers_mapping[id].setdefault(
                    'prefixes', set()).add(row['prefix'])
        return publishers_mapping
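# End-to-end usage sketch for CrossrefProcessing (illustrative; the file
# names are hypothetical and the package imports are assumed to be in
# scope). csv_creator() expects one decoded page of a Crossref dump with
# the {'items': [...]} shape read above; the publishers mapping, if given,
# is a CSV with 'id', 'name' and 'prefix' columns, as consumed by
# load_publishers_mapping().
import json

processor = CrossrefProcessing(orcid_index='doi_orcid_index.csv',
                               doi_csv=None,
                               publishers_filepath='publishers.csv')
with open('crossref_dump_page.json', encoding='utf-8') as f:
    page = json.load(f)
rows = processor.csv_creator(page)
# Each row is a dict with the keys 'id', 'title', 'author', 'pub_date',
# 'venue', 'volume', 'issue', 'page', 'type', 'publisher' and 'editor'.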