def create_csv(doi_file, date_file, orcid_file, issn_file):
    """Build the four CSV-backed stores used by the index and return them
    as the tuple (valid_doi, id_date, id_orcid, id_issn)."""
    csv_paths = (doi_file, date_file, orcid_file, issn_file)
    return tuple(CSVManager(csv_path=path) for path in csv_paths)
def __init__(self, date=None, orcid=None, issn=None, doi=None, **params):
    """Initialise the finder with its CSV stores and identifier managers.

    Each of `date`, `orcid`, `issn` and `doi` defaults to a non-persistent
    CSVManager when not provided. Every extra keyword in `params` is attached
    to the instance as an attribute; in particular, a `use_api_service` kwarg
    is forwarded to the DOIManager when present.
    """
    for attr_name, attr_value in params.items():
        setattr(self, attr_name, attr_value)

    self.date = date if date is not None else CSVManager(store_new=False)
    self.orcid = orcid if orcid is not None else CSVManager(store_new=False)
    self.issn = issn if issn is not None else CSVManager(store_new=False)
    if doi is None:
        doi = CSVManager(store_new=False)

    # Forward the API toggle only when the caller supplied one.
    if hasattr(self, 'use_api_service'):
        self.dm = DOIManager(doi, self.use_api_service)
    else:
        self.dm = DOIManager(doi)
    self.im = ISSNManager()
    self.om = ORCIDManager()
    self.headers = {
        "User-Agent": "ResourceFinder / OpenCitations Indexes "
                      "(http://opencitations.net; mailto:[email protected])"
    }
def test_get_value(self):
    """get_value returns the stored set for a known id and None otherwise."""
    manager = CSVManager(self.initial_path)
    self.assertEqual({"2015-03-09"},
                     manager.get_value("doi:10.1108/jd-12-2013-0166"))
    self.assertIsNone(manager.get_value("doi:10.1108/jd-12-2013-0167"))
def test_datacite_get_orcid(self):
    """ORCID lookup via DataCite: API only, support files only, neither."""
    doi = "10.5065/d6b8565d"

    # Do not use support files, only APIs
    finder = DataCiteResourceFinder()
    self.assertIn("0000-0001-7734-8388", finder.get_orcid(doi))
    self.assertNotIn("0000-0001-5506-523X", finder.get_orcid(doi))

    # Do use support files, but avoid using APIs
    finder = DataCiteResourceFinder(orcid=CSVManager(self.orcid_path),
                                    doi=CSVManager(self.doi_path),
                                    use_api_service=False)
    self.assertIn("0000-0001-7734-8388", finder.get_orcid(doi))
    self.assertNotIn("0000-0001-5506-523X", finder.get_orcid(doi))

    # Do not use support files neither APIs
    finder = DataCiteResourceFinder(use_api_service=False)
    self.assertIsNone(finder.get_orcid(doi))
def test_orcid_get_orcid(self):
    """ORCID lookup via the ORCID service: API only, files only, neither."""
    doi = "10.1108/jd-12-2013-0166"

    # Do not use support files, only APIs
    finder = ORCIDResourceFinder()
    self.assertIn("0000-0003-0530-4305", finder.get_orcid(doi))
    self.assertNotIn("0000-0001-5506-523X", finder.get_orcid(doi))

    # Do use support files, but avoid using APIs
    finder = ORCIDResourceFinder(orcid=CSVManager(self.orcid_path),
                                 doi=CSVManager(self.doi_path),
                                 use_api_service=False)
    self.assertIn("0000-0003-0530-4305", finder.get_orcid(doi))
    self.assertNotIn("0000-0001-5506-523X", finder.get_orcid(doi))

    # Do not use support files neither APIs
    finder = ORCIDResourceFinder(use_api_service=False)
    self.assertIsNone(finder.get_orcid(doi))
def test_crossref_get_pub_date(self):
    """Publication date via Crossref: API only, files only, neither."""
    doi = "10.1007/s11192-018-2988-z"

    # Do not use support files, only APIs
    finder = CrossrefResourceFinder()
    self.assertIn("2019-01-02", finder.get_pub_date(doi))
    self.assertNotEqual("2019", finder.get_pub_date(doi))

    # Do use support files, but avoid using APIs
    finder = CrossrefResourceFinder(date=CSVManager(self.date_path),
                                    doi=CSVManager(self.doi_path),
                                    use_api_service=False)
    self.assertIn("2019-01-02", finder.get_pub_date(doi))
    self.assertNotEqual("2018-01-02", finder.get_pub_date(doi))

    # Do not use support files neither APIs
    finder = CrossrefResourceFinder(use_api_service=False)
    self.assertIsNone(finder.get_pub_date(doi))
def test_crossref_get_issn(self):
    """Container ISSN via Crossref: API only, files only, neither."""
    doi = "10.1007/s11192-018-2988-z"

    # Do not use support files, only APIs
    finder = CrossrefResourceFinder()
    self.assertIn("0138-9130", finder.get_container_issn(doi))
    self.assertNotIn("0138-9000", finder.get_container_issn(doi))

    # Do use support files, but avoid using APIs
    finder = CrossrefResourceFinder(issn=CSVManager(self.issn_path),
                                    doi=CSVManager(self.doi_path),
                                    use_api_service=False)
    self.assertIn("1588-2861", finder.get_container_issn(doi))
    self.assertNotIn("0138-9000", finder.get_container_issn(doi))

    # Do not use support files neither APIs
    finder = CrossrefResourceFinder(use_api_service=False)
    self.assertIsNone(finder.get_container_issn(doi))
def test_crossref_get_orcid(self):
    """ORCID lookup via Crossref: API only, files only, neither."""
    doi = "10.1007/s11192-018-2988-z"

    # Do not use support files, only APIs
    finder = CrossrefResourceFinder()
    self.assertIn("0000-0003-0530-4305", finder.get_orcid(doi))
    self.assertNotIn("0000-0001-5506-523X", finder.get_orcid(doi))

    # Do use support files, but avoid using APIs
    finder = CrossrefResourceFinder(orcid=CSVManager(self.orcid_path),
                                    doi=CSVManager(self.doi_path),
                                    use_api_service=False)
    self.assertIn("0000-0003-0530-4305", finder.get_orcid(doi))
    self.assertNotIn("0000-0001-5506-523X", finder.get_orcid(doi))

    # Do not use support files neither APIs
    finder = CrossrefResourceFinder(use_api_service=False)
    self.assertIsNone(finder.get_orcid(doi))
def test_datacite_get_pub_date(self):
    """Publication date via DataCite: API only, files only, neither."""
    doi = "10.6092/issn.2532-8816/8555"

    # Do not use support files, only APIs
    finder = DataCiteResourceFinder()
    self.assertIn("2019-05-27", finder.get_pub_date(doi))
    self.assertNotEqual("2019", finder.get_pub_date(doi))

    # Do use support files, but avoid using APIs
    finder = DataCiteResourceFinder(date=CSVManager(self.date_path),
                                    doi=CSVManager(self.doi_path),
                                    use_api_service=False)
    self.assertIn("2019-05-27", finder.get_pub_date(doi))
    self.assertNotEqual("2018-01-02", finder.get_pub_date(doi))

    # Do not use support files neither APIs
    finder = DataCiteResourceFinder(use_api_service=False)
    self.assertIsNone(finder.get_pub_date(doi))
def test_datacite_get_issn(self):
    """Container ISSN via DataCite: API only, files only, neither."""
    doi = "10.14763/2019.1.1389"

    # Do not use support files, only APIs
    finder = DataCiteResourceFinder()
    self.assertIn("2197-6775", finder.get_container_issn(doi))
    self.assertNotIn("1588-2861", finder.get_container_issn(doi))

    # Do use support files, but avoid using APIs
    finder = DataCiteResourceFinder(issn=CSVManager(self.issn_path),
                                    doi=CSVManager(self.doi_path),
                                    use_api_service=False)
    self.assertIn("2197-6775", finder.get_container_issn(doi))
    self.assertNotIn("1588-2861", finder.get_container_issn(doi))

    # Do not use support files neither APIs
    finder = DataCiteResourceFinder(use_api_service=False)
    self.assertIsNone(finder.get_container_issn(doi))
def test_creation(self):
    """A CSVManager loaded from file exposes its rows as a dict of sets."""
    expected = {
        "doi:10.1108/jd-12-2013-0166": {"2015-03-09"},
        "doi:10.7717/peerj.4375": {"2018-02-13"},
    }
    self.assertDictEqual(CSVManager(self.initial_path).data, expected)
def __init__(self, valid_doi=None, use_api_service=True):
    """Set up the DOI manager.

    `valid_doi` is the CSV store of known-valid DOIs (a non-persistent
    CSVManager is created when omitted); `use_api_service` toggles lookups
    against the doi.org handle API.
    """
    self.api = "https://doi.org/api/handles/"
    self.valid_doi = CSVManager(store_new=False) if valid_doi is None else valid_doi
    self.use_api_service = use_api_service
    self.p = "doi:"  # prefix prepended to normalised identifiers
    super(DOIManager, self).__init__()
def test_load_csv_column_as_set(self):
    """load_csv_column_as_set gathers the 'oci' column into a set."""
    expected = {
        "02001000308362819371213133704040001020809-020010009063615193700006300030306151914",
        "02001000002361927283705040000-02001000002361927283705030002",
        "02001000002361927283705040000-020010003093612062710020603000720",
        "02001000308362819371213133704040001020804-02001000308362819371213133704040000030707",
        "020010000023625242110370100030001-02001010009361222251430273701090809370903040403",
        "020010103003602000105370205010358000059-02001010304362801000208030304330009000400020107",
    }
    loaded = CSVManager.load_csv_column_as_set(self.citation_path, "oci", 4)
    self.assertSetEqual(loaded, expected)
def test_doi_is_valid(self):
    """is_valid behaviour with API access, with a support file, and with neither."""
    # API-backed manager, no support file
    manager = DOIManager()
    self.assertTrue(manager.is_valid(self.valid_doi_1))
    self.assertTrue(manager.is_valid(self.valid_doi_2))
    self.assertFalse(manager.is_valid(self.invalid_doi))

    # File-backed manager, API disabled
    manager = DOIManager(valid_doi=CSVManager(self.valid_doi_path),
                         use_api_service=False)
    self.assertTrue(manager.is_valid(self.valid_doi_1))
    self.assertFalse(manager.is_valid(self.invalid_doi))

    # Neither file nor API: nothing can be validated
    manager = DOIManager(use_api_service=False)
    self.assertFalse(manager.is_valid(self.valid_doi_1))
    self.assertFalse(manager.is_valid(self.invalid_doi))
def test_add_value(self):
    """add_value accumulates values for the same id into a set."""
    # Start from a clean file so previous runs do not interfere.
    if exists(self.addition_path):
        remove(self.addition_path)
    manager = CSVManager(self.addition_path)

    additions = (
        ("doi:10.1108/jd-12-2013-0166", "orcid:0000-0003-0530-4305"),
        ("doi:10.7717/peerj.4375", "orcid:0000-0003-1613-5981"),
        ("doi:10.1108/jd-12-2013-0166", "orcid:0000-0001-5506-523X"),
    )
    for id_string, value in additions:
        manager.add_value(id_string, value)

    self.assertDictEqual(manager.data, {
        "doi:10.1108/jd-12-2013-0166": {"orcid:0000-0003-0530-4305",
                                        "orcid:0000-0001-5506-523X"},
        "doi:10.7717/peerj.4375": {"orcid:0000-0003-1613-5981"},
    })
def test_glob(self):
    """process() regenerates, in output_dir, the same four CSV indexes
    that ship with input_dir."""
    # Recreate an empty output directory for a clean run.
    if exists(self.output_dir):
        rmtree(self.output_dir)
    makedirs(self.output_dir)

    process(self.input_dir, self.output_dir)

    for file_name in ("valid_doi.csv", "id_date.csv", "id_issn.csv", "id_orcid.csv"):
        reference = CSVManager(self.input_dir + sep + file_name)
        produced = CSVManager(self.output_dir + sep + file_name)
        self.assertDictEqual(reference.data, produced.data)
def process(input_dir, output_dir):
    """Scan a Crossref dump in `input_dir` and populate four CSV indexes in
    `output_dir`: valid_doi.csv, id_date.csv, id_issn.csv and id_orcid.csv.

    The dump is traversed twice: the first pass registers every citing DOI
    together with its publication date, journal ISSNs and author ORCIDs; the
    second pass recovers dates for cited DOIs from the reference lists. DOIs
    that are still dateless at the end are stored with an empty date.

    :param input_dir: directory (or archive) containing the Crossref JSON dump
    :param output_dir: directory where the CSV indexes are written (created
        if missing)
    """
    if not exists(output_dir):
        makedirs(output_dir)

    citing_doi_with_no_date = set()
    valid_doi = CSVManager(output_dir + sep + "valid_doi.csv")
    id_date = CSVManager(output_dir + sep + "id_date.csv")
    id_issn = CSVManager(output_dir + sep + "id_issn.csv")
    id_orcid = CSVManager(output_dir + sep + "id_orcid.csv")

    doi_manager = DOIManager(valid_doi)
    issn_manager = ISSNManager()
    orcid_manager = ORCIDManager()

    all_files, opener = get_all_files(input_dir)
    len_all_files = len(all_files)

    def _load_json(f):
        # When using a tar.gz file or zip file, a stream of bytes is returned
        # by the opener. Thus, it must be converted into an utf-8 string
        # before loading it into a JSON.
        try:
            return load(f)
        except TypeError:
            utf8reader = codecs.getreader("utf-8")
            return load(utf8reader(f))

    # Read all the JSON files in the Crossref dump to create the main
    # information of all the indexes
    print("\n\n# Add valid DOIs from Crossref metadata")
    for file_idx, file in enumerate(all_files, 1):
        with opener(file) as f:
            print("Open file %s of %s" % (file_idx, len_all_files))
            data = _load_json(f)
            if "items" in data:
                for obj in data['items']:
                    if "DOI" in obj:
                        citing_doi = doi_manager.normalise(obj["DOI"], True)
                        doi_manager.set_valid(citing_doi)

                        if id_date.get_value(citing_doi) is None:
                            citing_date = Citation.check_date(build_pubdate(obj))
                            if citing_date is not None:
                                id_date.add_value(citing_doi, citing_date)
                                if citing_doi in citing_doi_with_no_date:
                                    citing_doi_with_no_date.remove(citing_doi)
                            else:
                                # Remember it so an empty date can be stored
                                # at the end if no reference provides one.
                                citing_doi_with_no_date.add(citing_doi)

                        if id_issn.get_value(citing_doi) is None:
                            if "type" in obj:
                                cur_type = obj["type"]
                                if cur_type is not None and "journal" in cur_type and "ISSN" in obj:
                                    cur_issn = obj["ISSN"]
                                    if cur_issn is not None:
                                        for issn in [issn_manager.normalise(i) for i in cur_issn]:
                                            if issn is not None:
                                                id_issn.add_value(citing_doi, issn)

                        if id_orcid.get_value(citing_doi) is None:
                            if "author" in obj:
                                cur_author = obj['author']
                                if cur_author is not None:
                                    for author in cur_author:
                                        if "ORCID" in author:
                                            orcid = orcid_manager.normalise(author["ORCID"])
                                            if orcid is not None:
                                                id_orcid.add_value(citing_doi, orcid)

    # Do it again for updating the dates of the cited DOIs, if these are valid
    print("\n\n# Check cited DOIs from Crossref reference field")
    doi_date = {}
    for file_idx, file in enumerate(all_files, 1):
        with opener(file) as f:
            print("Open file %s of %s" % (file_idx, len_all_files))
            # BUGFIX: the original second pass called load(f) directly,
            # without the byte-stream fallback used in the first pass, so
            # compressed dumps crashed here with a TypeError.
            data = _load_json(f)
            if "items" in data:
                for obj in data['items']:
                    if "DOI" in obj and "reference" in obj:
                        for ref in obj['reference']:
                            if "DOI" in ref:
                                cited_doi = doi_manager.normalise(ref["DOI"], True)
                                if doi_manager.is_valid(cited_doi) and id_date.get_value(cited_doi) is None:
                                    if cited_doi not in doi_date:
                                        doi_date[cited_doi] = []
                                    cited_date = Citation.check_date(build_pubdate(ref))
                                    if cited_date is not None:
                                        doi_date[cited_doi].append(cited_date)
                                        if cited_doi in citing_doi_with_no_date:
                                            citing_doi_with_no_date.remove(cited_doi)

    # Add the date to the DOI if such date is the most adopted one in the
    # various references. In case two distinct dates are used the most,
    # select the older one.
    for doi in doi_date:
        count = Counter(doi_date[doi])
        if len(count):
            top_value = count.most_common(1)[0][1]
            selected_dates = []
            for date in count:
                if count[date] == top_value:
                    selected_dates.append(date)
            best_date = sorted(selected_dates)[0]
            id_date.add_value(doi, best_date)
        else:
            id_date.add_value(doi, "")

    # Add empty dates for the remaining DOIs
    for doi in citing_doi_with_no_date:
        id_date.add_value(doi, "")
def extract_citations(idbaseurl, baseurl, python, pclass, input, lookup,
                      data, prefix, agent, source, service, verbose,
                      doi_manager, rf_handler, oci_to_do=None):
    """Pull citation data from a citation source and store each new citation.

    Iterates over `cs.get_next_citation_data()` until it returns None. For
    every citation whose OCI is not already present (and, when `oci_to_do`
    is given, is among the OCIs still to process), validates both DOIs via
    `doi_manager`, fills in missing dates / journal-self-citation /
    author-self-citation values through `rf_handler`, builds a Citation and
    stores it with a CitationStorer rooted at `data`.

    :param idbaseurl: base URL prepended (with quoting) to citing/cited DOIs
    :param baseurl: dataset base URL; a trailing "/" is appended if missing
    :param python, pclass, input: arguments for import_citation_source, which
        yields the citation stream
    :param lookup: lookup file for the OCIManager
    :param data: directory holding existing citation data and new output
    :param prefix: OCI prefix passed to oci_manager.get_oci
    :param agent, source, service: provenance values stored on each Citation
    :param verbose: when truthy, print a line per created/skipped citation
    :param doi_manager: validator exposing is_valid(doi)
    :param rf_handler: resolver exposing get_date / share_issn / share_orcid
    :param oci_to_do: optional set of OCIs (without "oci:" prefix) to restrict
        processing to; processed entries are removed from it in place
    :return: tuple (new_citations_added, citations_already_present,
        error_in_dois_existence)
    """
    BASE_URL = idbaseurl
    DATASET_URL = baseurl + "/" if not baseurl.endswith("/") else baseurl

    oci_manager = OCIManager(lookup_file=lookup)
    # OCIs already materialised on disk, read from the "oci" column.
    exi_ocis = CSVManager.load_csv_column_as_set(
        data + sep + "data", "oci"
    )  # TODO: we need to specify carefully the dir, eg by adding an additional flag to distinguish between the files belonging to a particular process, and it should be aligned with the storer.
    if oci_to_do is not None:
        # Drop from the worklist anything that already exists.
        oci_to_do.difference_update(exi_ocis)
    cit_storer = CitationStorer(data, DATASET_URL)

    citations_already_present = 0
    new_citations_added = 0
    error_in_dois_existence = 0

    cs = import_citation_source(python, pclass, input)

    next_citation = cs.get_next_citation_data()
    while next_citation is not None:
        citing, cited, created, timespan, journal_sc, author_sc = next_citation

        oci = oci_manager.get_oci(citing, cited, prefix)
        oci_noprefix = oci.replace("oci:", "")
        if oci_noprefix not in exi_ocis and (oci_to_do is None or oci_noprefix in oci_to_do):
            if doi_manager.is_valid(citing) and doi_manager.is_valid(cited):
                # The source-provided creation date wins over the resolver.
                if created is None:
                    citing_date = rf_handler.get_date(citing)
                else:
                    citing_date = created
                cited_date = rf_handler.get_date(cited)
                # Resolve self-citation flags only when not given as booleans.
                if journal_sc is None or type(journal_sc) is not bool:
                    journal_sc = rf_handler.share_issn(citing, cited)
                if author_sc is None or type(author_sc) is not bool:
                    author_sc = rf_handler.share_orcid(citing, cited)

                if created is not None and timespan is not None:
                    # Creation date and timespan already known: dates are not
                    # passed again as citing/cited dates.
                    cit = Citation(
                        oci,
                        BASE_URL + quote(citing), None,
                        BASE_URL + quote(cited), None,
                        created, timespan,
                        1, agent, source, datetime.now().strftime('%Y-%m-%dT%H:%M:%S'),
                        service, "doi", BASE_URL + "([[XXX__decode]])", "reference",
                        journal_sc, author_sc,
                        None, "Creation of the citation", None)
                else:
                    # Let Citation derive creation/timespan from the two dates.
                    cit = Citation(
                        oci,
                        BASE_URL + quote(citing), citing_date,
                        BASE_URL + quote(cited), cited_date,
                        None, None,
                        1, agent, source, datetime.now().strftime('%Y-%m-%dT%H:%M:%S'),
                        service, "doi", BASE_URL + "([[XXX__decode]])", "reference",
                        journal_sc, author_sc,
                        None, "Creation of the citation", None)

                cit_storer.store_citation(cit)

                if verbose:
                    print("Create citation data for '%s' between DOI '%s' and DOI '%s'" % (oci, citing, cited))
                new_citations_added += 1
                exi_ocis.add(oci_noprefix)
            else:
                if verbose:
                    print("WARNING: some DOIs, among '%s' and '%s', do not exist" % (citing, cited))
                error_in_dois_existence += 1
            # Mark this OCI as handled whether stored or rejected.
            if oci_to_do is not None:
                oci_to_do.remove(oci_noprefix)
        else:
            if verbose:
                print("WARNING: the citation between DOI '%s' and DOI '%s' has been already processed" % (citing, cited))
            citations_already_present += 1

        next_citation = cs.get_next_citation_data()

    return new_citations_added, citations_already_present, error_in_dois_existence