def __init__(self, valid_doi=None, use_api_service=True):
    if valid_doi is None:
        valid_doi = CSVManager()
    self.api = "https://doi.org/api/handles/"
    # In-memory cache of already-checked DOIs ('v' = valid, 'i' = invalid)
    self.valid_doi = valid_doi
    self.use_api_service = use_api_service
    self.p = "doi:"
    super().__init__()
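A minimal usage sketch of the initialiser above: sharing one CSVManager between manager instances lets validation results accumulate in a single in-memory cache (the 'v'/'i' markers follow the convention visible in test_normalize_id_with_cache below). Only the constructor shown above is used.

shared_cache = CSVManager()
# Validation outcomes obtained through this manager are recorded in shared_cache
doi_manager = DOIManager(valid_doi=shared_cache, use_api_service=True)
# An offline manager never calls https://doi.org/api/handles/ and relies on the cache only
offline_manager = DOIManager(valid_doi=shared_cache, use_api_service=False)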
def __init__(self, orcid_index: str = None, doi_csv: str = None, publishers_filepath: str = None):
    self.doi_set = CSVManager.load_csv_column_as_set(doi_csv, 'doi') if doi_csv else None
    self.publishers_mapping = self.load_publishers_mapping(publishers_filepath) if publishers_filepath else None
    # Normalise a falsy value (e.g. an empty string) to None before handing it to CSVManager
    self.orcid_index = CSVManager(orcid_index if orcid_index else None)
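All three parameters are optional, as the overlapping-surnames test further below exercises with CrossrefProcessing(None, None); a hedged sketch, assuming this initialiser belongs to CrossrefProcessing:

# No DOI filter and no publishers mapping; the ORCID index is an empty CSVManager
processor = CrossrefProcessing(orcid_index=None, doi_csv=None)
assert processor.doi_set is None and processor.publishers_mapping is None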
def test_normalize_id_with_cache(self):
    identifiers = ['doi:10.1123/ijatt']
    output_data = list()
    csv_manager = CSVManager()
    # Pre-populate the cache: this DOI has already been checked and marked valid ('v')
    csv_manager.data = {'10.1123/ijatt.2015-0070': {'v'}}
    for identifier in identifiers:
        output_data.append(Cleaner(identifier).normalize_id(valid_dois_cache=csv_manager))
    expected_data = [None]
    # The truncated DOI is rejected and recorded in the cache as invalid ('i')
    expected_cache = {
        '10.1123/ijatt.2015-0070': {'v'},
        '10.1123/ijatt': {'i'}
    }
    output = (csv_manager.data, output_data)
    expected_output = (expected_cache, expected_data)
    self.assertEqual(output, expected_output)
def test_normalize_id(self):
    identifiers = [
        'doi:10.1123/ijatt.2015-0070', 'doi:1',
        'orcid:0000-0003-0530-4305', 'orcid:0000-0000',
        'issn:1479-6708', 'issn:0000-0000',
        'isbn:9783319403120', 'isbn:0000-0000'
    ]
    output = list()
    csv_manager = CSVManager()
    for identifier in identifiers:
        output.append(Cleaner(identifier).normalize_id(valid_dois_cache=csv_manager))
    expected_output = [
        'doi:10.1123/ijatt.2015-0070', None,
        'orcid:0000-0003-0530-4305', None,
        'issn:1479-6708', None,
        'isbn:9783319403120', None
    ]
    self.assertEqual(output, expected_output)
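Both tests rely on the cache convention of CSVManager: a checked identifier maps to {'v'} when found valid and to {'i'} when found invalid. A hypothetical helper (the name is an assumption, not part of the codebase) that consults the cache before any API call would look like this:

def is_known_valid(doi: str, cache: CSVManager) -> bool:
    # True only if a previous check already marked this DOI as valid
    return 'v' in cache.data.get(doi, set())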
def test_get_agents_strings_list_overlapping_surnames(self):
    # The surname of one author is included in the surname of another.
    authors_list = [
        {"given": "Puvaneswari", "family": "Paravamsivam", "sequence": "first", "affiliation": []},
        {"given": "Chua Kek", "family": "Heng", "sequence": "additional", "affiliation": []},
        {"given": "Sri Nurestri Abdul", "family": "Malek", "sequence": "additional", "affiliation": []},
        {"given": "Vikineswary", "family": "Sabaratnam", "sequence": "additional", "affiliation": []},
        {"given": "Ravishankar Ram", "family": "M", "sequence": "additional", "affiliation": []},
        {"given": "Umah Rani", "family": "Kuppusamy", "sequence": "additional", "affiliation": []}
    ]
    crossref_processor = CrossrefProcessing(None, None)
    csv_manager = CSVManager()
    csv_manager.data = {'10.9799/ksfan.2012.25.1.105': {'Malek, Sri Nurestri Abdul [orcid:0000-0001-6278-8559]'}}
    crossref_processor.orcid_index = csv_manager
    authors_strings_list = crossref_processor.get_agents_strings_list('10.9799/ksfan.2012.25.1.105', authors_list)
    expected_authors_list = [
        'Paravamsivam, Puvaneswari',
        'Heng, Chua Kek',
        'Malek, Sri Nurestri Abdul [orcid:0000-0001-6278-8559]',
        'Sabaratnam, Vikineswary',
        'M, Ravishankar Ram',
        'Kuppusamy, Umah Rani'
    ]
    self.assertEqual(authors_strings_list, expected_authors_list)
def __init__(self, config: str):
    with open(config, encoding='utf-8') as file:
        settings = yaml.full_load(file)
    # Mandatory settings
    self.triplestore_url = settings['triplestore_url']
    self.input_csv_dir = normalize_path(settings['input_csv_dir'])
    self.base_output_dir = normalize_path(settings['base_output_dir'])
    self.resp_agent = settings['resp_agent']
    self.info_dir = os.path.join(self.base_output_dir, 'info_dir')
    self.output_csv_dir = os.path.join(self.base_output_dir, 'csv')
    self.output_rdf_dir = os.path.join(self.base_output_dir, f'rdf{os.sep}')
    self.indexes_dir = os.path.join(self.base_output_dir, 'indexes')
    self.cache_path = os.path.join(self.base_output_dir, 'cache.txt')
    self.errors_path = os.path.join(self.base_output_dir, 'errors.txt')
    # Optional settings
    self.base_iri = settings['base_iri']
    self.context_path = settings['context_path']
    self.dir_split_number = settings['dir_split_number']
    self.items_per_file = settings['items_per_file']
    self.default_dir = settings['default_dir']
    self.rdf_output_in_chunks = settings['rdf_output_in_chunks']
    self.source = settings['source']
    # A DOI validation cache is only needed if the DOI API service is enabled
    self.valid_dois_cache = CSVManager() if settings['use_doi_api_service'] else None
    self.workers_number = int(settings['workers_number'])
    supplier_prefix: str = settings['supplier_prefix']
    # Drop a trailing zero from the supplier prefix, if present
    self.supplier_prefix = supplier_prefix[:-1] if supplier_prefix.endswith('0') else supplier_prefix
    self.verbose = settings['verbose']
    # time-agnostic-library integration
    self.time_agnostic_library_config = os.path.join(os.path.dirname(config), 'time_agnostic_library_config.json')
    if not os.path.exists(self.time_agnostic_library_config):
        generate_config_file(
            config_path=self.time_agnostic_library_config,
            dataset_urls=[self.triplestore_url],
            dataset_dirs=list(),
            provenance_urls=settings['provenance_endpoints'],
            provenance_dirs=list(),
            blazegraph_full_text_search=settings['blazegraph_full_text_search'],
            graphdb_connector_name=settings['graphdb_connector_name'],
            cache_endpoint=settings['cache_endpoint'],
            cache_update_endpoint=settings['cache_update_endpoint'])
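A minimal sketch of a configuration file that satisfies every key read above, serialised through PyYAML; all values are placeholders chosen here for illustration, only the keys are taken from the initialiser.

import yaml

sample_settings = {
    'triplestore_url': 'http://localhost:9999/sparql',       # placeholder endpoint
    'input_csv_dir': './input_csv',
    'base_output_dir': './output',
    'resp_agent': 'https://orcid.org/0000-0000-0000-0000',   # placeholder agent IRI
    'base_iri': 'https://example.org/',                      # placeholder
    'context_path': None,
    'dir_split_number': 10000,
    'items_per_file': 1000,
    'default_dir': '_',
    'rdf_output_in_chunks': False,
    'source': None,
    'use_doi_api_service': True,
    'workers_number': 1,
    'supplier_prefix': '060',
    'verbose': True,
    'provenance_endpoints': [],
    'blazegraph_full_text_search': False,
    'graphdb_connector_name': None,
    'cache_endpoint': None,
    'cache_update_endpoint': None,
}
with open('config.yaml', 'w', encoding='utf-8') as out:
    yaml.dump(sample_settings, out)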
def __init__(self, output_path: str, threshold: int = 10000, low_memory: bool = False, verbose: bool = False):
    self.file_counter = 0
    # Guard against an explicit None or 0 being passed for threshold
    self.threshold = 10000 if not threshold else int(threshold)
    self.verbose = verbose
    if self.verbose:
        print("[INFO: CSVManager] Loading existing csv file")
    self.doimanager = DOIManager(use_api_service=False)
    self.csvstorage = CSVManager(output_path=output_path, line_threshold=threshold, low_memory=low_memory)
    # ORCIDs are extracted to skip the corresponding files at the first reading of an existing CSV.
    self.cache = set(
        el.split("[")[1][:-1].strip()
        for v in self.csvstorage.data.values()
        for el in v)
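The generator above assumes each stored value has the shape 'Surname, Given [orcid:...]', as in the index used by the overlapping-surnames test; a quick check of the parsing expression:

value = 'Malek, Sri Nurestri Abdul [orcid:0000-0001-6278-8559]'
# split at '[', take the bracketed part, and drop the closing ']'
assert value.split("[")[1][:-1].strip() == 'orcid:0000-0001-6278-8559'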