Example #1
    def __init__(self, valid_doi=None, use_api_service=True):
        if valid_doi is None:
            valid_doi = CSVManager()

        self.api = "https://doi.org/api/handles/"
        self.valid_doi = valid_doi
        self.use_api_service = use_api_service
        self.p = "doi:"
        super(DOIManager, self).__init__()
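A note on the sentinel above: valid_doi defaults to None and is replaced with a fresh CSVManager() inside the body because Python evaluates default argument values once, at definition time, so a mutable default would be shared across instances. A minimal sketch of the failure mode (the class below is hypothetical, for illustration only):

    class Bad:
        def __init__(self, cache={}):  # evaluated once, shared by all calls
            self.cache = cache

    a, b = Bad(), Bad()
    a.cache['k'] = 'v'
    print(b.cache)  # {'k': 'v'} -- b silently sees a's data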
Example #2
 def __init__(self,
              orcid_index: str = None,
              doi_csv: str = None,
              publishers_filepath: str = None):
     self.doi_set = CSVManager.load_csv_column_as_set(
         doi_csv, 'doi') if doi_csv else None
     self.publishers_mapping = self.load_publishers_mapping(
         publishers_filepath) if publishers_filepath else None
     # Normalize a falsy value (e.g. an empty string) to None before
     # handing it to CSVManager
     orcid_index = orcid_index if orcid_index else None
     self.orcid_index = CSVManager(orcid_index)
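All three inputs are optional here, and each is guarded before use. A hedged usage sketch, assuming this is the CrossrefProcessing constructor exercised in Example #5 (the index path is hypothetical):

    processor = CrossrefProcessing(
        orcid_index='index/orcid_doi.csv',  # hypothetical index path
        doi_csv=None,                       # no DOI filter: doi_set stays None
        publishers_filepath=None)           # no publisher mapping loaded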
Example #3
 def test_normalize_id_with_cache(self):
     identifiers = ['doi:10.1123/ijatt']
     output_data = list()
     csv_manager = CSVManager()
     # Pre-seed the cache: 'v' appears to mark an already-validated DOI
     csv_manager.data = {'10.1123/ijatt.2015-0070': {'v'}}
     for identifier in identifiers:  # avoid shadowing the builtin id()
         output_data.append(
             Cleaner(identifier).normalize_id(valid_dois_cache=csv_manager))
     expected_data = [None]
     expected_cache = {
         '10.1123/ijatt.2015-0070': {'v'},
         '10.1123/ijatt': {'i'}
     }
     output = (csv_manager.data, output_data)
     expected_output = (expected_cache, expected_data)
     self.assertEqual(output, expected_output)
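The expected values imply the cache convention (inferred from this test, not from documentation): a bare DOI maps to {'v'} once validated and to {'i'} once checked and found invalid. Here the malformed identifier yields None and its verdict is recorded, so a later run can reuse it instead of re-querying the DOI API:

    cache = CSVManager()
    cache.data = {'10.1123/ijatt': {'i'}}  # verdict recorded by the run above
    # Cleaner('doi:10.1123/ijatt').normalize_id(valid_dois_cache=cache)
    # can now return None straight from the cache.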
Example #4
 def test_normalize_id(self):
     identifiers = [
         'doi:10.1123/ijatt.2015-0070', 'doi:1',
         'orcid:0000-0003-0530-4305', 'orcid:0000-0000', 'issn:1479-6708',
         'issn:0000-0000', 'isbn:9783319403120', 'isbn:0000-0000'
     ]
     output = list()
     csv_manager = CSVManager()
     for identifier in identifiers:  # avoid shadowing the builtin id()
         output.append(
             Cleaner(identifier).normalize_id(valid_dois_cache=csv_manager))
     expected_output = [
         'doi:10.1123/ijatt.2015-0070', None, 'orcid:0000-0003-0530-4305',
         None, 'issn:1479-6708', None, 'isbn:9783319403120', None
     ]
     self.assertEqual(output, expected_output)
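Taken together, the pairs show the contract of normalize_id: a well-formed identifier comes back unchanged, prefix included, while each malformed one (doi:1, orcid:0000-0000, issn:0000-0000, isbn:0000-0000) comes back as None, across all four supported schemes.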
Example #5
 def test_get_agents_strings_list_overlapping_surnames(self):
     # The surname of one author is included in the surname of another.
     authors_list = [
         {
             "given": "Puvaneswari",
             "family": "Paravamsivam",
             "sequence": "first",
             "affiliation": []
         },
         {
             "given": "Chua Kek",
             "family": "Heng",
             "sequence": "additional",
             "affiliation": []
         },
         {
             "given": "Sri Nurestri Abdul",
             "family": "Malek",
             "sequence": "additional",
             "affiliation": []
         },
         {
             "given": "Vikineswary",
             "family": "Sabaratnam",
             "sequence": "additional",
             "affiliation": []
         },
         {
             "given": "Ravishankar Ram",
             "family": "M",
             "sequence": "additional",
             "affiliation": []
         },
         {
             "given": "Umah Rani",
             "family": "Kuppusamy",
             "sequence": "additional",
             "affiliation": []
         }
     ]
     crossref_processor = CrossrefProcessing(None, None)
     csv_manager = CSVManager()
     csv_manager.data = {
         '10.9799/ksfan.2012.25.1.105':
             {'Malek, Sri Nurestri Abdul [orcid:0000-0001-6278-8559]'}
     }
     crossref_processor.orcid_index = csv_manager
     authors_strings_list = crossref_processor.get_agents_strings_list(
         '10.9799/ksfan.2012.25.1.105', authors_list)
     expected_authors_list = [
         'Paravamsivam, Puvaneswari', 'Heng, Chua Kek',
         'Malek, Sri Nurestri Abdul [orcid:0000-0001-6278-8559]',
         'Sabaratnam, Vikineswary', 'M, Ravishankar Ram',
         'Kuppusamy, Umah Rani'
     ]
     self.assertEqual(authors_strings_list, expected_authors_list)
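The point of this test is in its name: the family name M is a substring of Malek, and the expected list shows that the ORCID from the index is attached only to the exact match (Malek, Sri Nurestri Abdul), never to the author whose surname merely overlaps it.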
Example #6
 def __init__(self, config: str):
     with open(config, encoding='utf-8') as file:
         settings = yaml.full_load(file)
     # Mandatory settings
     self.triplestore_url = settings['triplestore_url']
     self.input_csv_dir = normalize_path(settings['input_csv_dir'])
     self.base_output_dir = normalize_path(settings['base_output_dir'])
     self.resp_agent = settings['resp_agent']
     self.info_dir = os.path.join(self.base_output_dir, 'info_dir')
     self.output_csv_dir = os.path.join(self.base_output_dir, 'csv')
     self.output_rdf_dir = os.path.join(self.base_output_dir,
                                        f'rdf{os.sep}')
     self.indexes_dir = os.path.join(self.base_output_dir, 'indexes')
     self.cache_path = os.path.join(self.base_output_dir, 'cache.txt')
     self.errors_path = os.path.join(self.base_output_dir, 'errors.txt')
     # Optional settings
     self.base_iri = settings['base_iri']
     self.context_path = settings['context_path']
     self.dir_split_number = settings['dir_split_number']
     self.items_per_file = settings['items_per_file']
     self.default_dir = settings['default_dir']
     self.rdf_output_in_chunks = settings['rdf_output_in_chunks']
     self.source = settings['source']
     self.valid_dois_cache = CSVManager() if settings[
         'use_doi_api_service'] else None
     self.workers_number = int(settings['workers_number'])
     supplier_prefix: str = settings['supplier_prefix']
     self.supplier_prefix = supplier_prefix[:-1] if supplier_prefix.endswith(
         '0') else supplier_prefix
     self.verbose = settings['verbose']
     # Time-Agnostic_library integration
     self.time_agnostic_library_config = os.path.join(
         os.path.dirname(config), 'time_agnostic_library_config.json')
     if not os.path.exists(self.time_agnostic_library_config):
         generate_config_file(
             config_path=self.time_agnostic_library_config,
             dataset_urls=[self.triplestore_url],
             dataset_dirs=list(),
             provenance_urls=settings['provenance_endpoints'],
             provenance_dirs=list(),
             blazegraph_full_text_search=settings[
                 'blazegraph_full_text_search'],
             graphdb_connector_name=settings['graphdb_connector_name'],
             cache_endpoint=settings['cache_endpoint'],
             cache_update_endpoint=settings['cache_update_endpoint'])
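Note that every key, including those under the "Optional settings" comment, is read with direct indexing (settings[...]), so each must be present in the YAML file; "optional" here can only mean the value may be empty. A sketch of the expected file shape, with entirely hypothetical values (the keys mirror the lookups above):

    # config.yaml -- hypothetical values, for illustration only
    triplestore_url: http://localhost:9999/blazegraph/sparql
    input_csv_dir: ./input_csv
    base_output_dir: ./output
    resp_agent: https://orcid.org/0000-0002-0000-0000
    base_iri: https://example.org/meta/
    context_path: null
    dir_split_number: 10000
    items_per_file: 1000
    default_dir: _
    rdf_output_in_chunks: false
    source: null
    use_doi_api_service: true
    workers_number: 1
    supplier_prefix: "060"   # a trailing '0' is stripped by the constructor
    verbose: true
    provenance_endpoints: []
    blazegraph_full_text_search: false
    graphdb_connector_name: null
    cache_endpoint: null
    cache_update_endpoint: null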
Example #7
 def __init__(self,
              output_path: str,
              threshold: int = 10000,
              low_memory: bool = False,
              verbose: bool = False):
     self.file_counter = 0
     self.threshold = 10000 if not threshold else int(threshold)
     self.verbose = verbose
     if self.verbose:
         print("[INFO: CSVManager] Loading existing csv file")
     self.doimanager = DOIManager(use_api_service=False)
     self.csvstorage = CSVManager(output_path=output_path,
                                  line_threshold=threshold,
                                  low_memory=low_memory)
     # ORCIDs are extracted to skip the corresponding files at the first reading of an existing CSV.
     self.cache = set(
         el.split("[")[1][:-1].strip()
         for _, v in self.csvstorage.data.items() for el in v)
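The cache comprehension parses each stored row back out of the CSVManager data. A worked example of what el.split("[")[1][:-1].strip() yields, using the row format seen in Example #5:

    el = 'Malek, Sri Nurestri Abdul [orcid:0000-0001-6278-8559]'
    inner = el.split("[")[1][:-1].strip()  # text between '[' and the final ']'
    print(inner)  # orcid:0000-0001-6278-8559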