def __load_citations(self, data, prov):
    """Load citations from a data file and its provenance file.

    Delegates to ``CitationStorer.load_citations_from_file`` with the DOI
    settings used throughout this file (``http://dx.doi.org/`` base URL,
    ``doi`` identifier type) and the service name held in ``self.service``.

    Args:
        data: forwarded as the first positional argument (citation data).
        prov: forwarded as the second positional argument (provenance data).

    Returns:
        Whatever ``CitationStorer.load_citations_from_file`` returns.
    """
    doi_base_url = "http://dx.doi.org/"
    doi_shape = "http://dx.doi.org/([[XXX__decode]])"
    return CitationStorer.load_citations_from_file(
        data,
        prov,
        baseurl=doi_base_url,
        service_name=self.service,
        id_type="doi",
        id_shape=doi_shape,
        citation_type=None,
    )
def load_and_store_citations(self, data_path, prov_path, ext):
    """Load citations, store them in a clean temp directory, and read them back.

    The working directory is ``self.tmp_path + "_load"``; it is removed first
    if it already exists. Citations are loaded from ``data_path`` /
    ``prov_path``, stored one by one through a ``CitationStorer``, and then
    reloaded from the ``data`` sub-directory selected by
    ``self.ext_local_dir[ext]``.

    Args:
        data_path: citation data file to load.
        prov_path: provenance file matching ``data_path``.
        ext: file extension of the stored files to read back; also the key
            used to look up the format directory in ``self.ext_local_dir``.

    Returns:
        A pair ``(origin_citation_list, stored_citation_list)``: the citations
        as loaded from the input files and as reloaded from the stored files.
    """
    work_dir = self.tmp_path + "_load"
    # Start from an empty directory so files from earlier runs are not re-read.
    if exists(work_dir):
        rmtree(work_dir)

    loaded = CitationStorer.load_citations_from_file(
        data_path,
        prov_path,
        baseurl="http://dx.doi.org/",
        service_name="OpenCitations Index: COCI",
        id_type="doi",
        id_shape="http://dx.doi.org/([[XXX__decode]])",
        citation_type=None,
    )

    storer = CitationStorer(work_dir, self.baseurl)
    for item in loaded:
        storer.store_citation(item)

    # <work_dir>/data/<format dir>/ -- the trailing separator is required by
    # the glob pattern built in get_stored_citation_list.
    stored_dir = sep.join((work_dir, "data", self.ext_local_dir[ext])) + sep
    reloaded = CitationStorerTest.get_stored_citation_list(stored_dir, ext)
    return loaded, reloaded
def setUp(self):
    """Prepare the fixtures shared by the tests.

    Removes a leftover ``.dir_citation_source`` info file from the
    ``tmp_store`` data directory (if present), creates the ``OCIManager``
    bound to the full lookup file, and loads the reference citation list
    from the test CSV data/provenance files.
    """
    stale_info_file = "index%stest_data%stmp_store%sdata%s.dir_citation_source" % (
        (sep,) * 4)
    if exists(stale_info_file):
        remove(stale_info_file)

    lookup_csv = "index%stest_data%slookup_full.csv" % (sep, sep)
    self.oci = OCIManager(lookup_file=lookup_csv)

    data_csv = "index%stest_data%scitations_data.csv" % (sep, sep)
    prov_csv = "index%stest_data%scitations_prov.csv" % (sep, sep)
    self.citation_list = CitationStorer.load_citations_from_file(
        data_csv,
        prov_csv,
        baseurl="http://dx.doi.org/",
        service_name="OpenCitations Index: COCI",
        id_type="doi",
        id_shape="http://dx.doi.org/([[XXX__decode]])",
        citation_type=None,
    )
def get_stored_citation_list(data_path, ext):
    """Load all citations stored in ``*.<ext>`` files found under ``data_path``.

    Files are discovered recursively with the glob pattern
    ``data_path + "**/*." + ext``. For each data file, the provenance file
    path is derived by replacing the ``<sep>data<sep>`` path segment with
    ``<sep>prov<sep>``.

    Args:
        data_path: directory prefix to search; it is concatenated directly
            with the glob pattern, so it should end with a path separator.
        ext: file extension (without the leading dot) of the files to load.

    Returns:
        A list with the citations loaded from every matching file, in the
        order the files are returned by ``glob``; empty if nothing matches.
    """
    data_segment = "%sdata%s" % (sep, sep)
    prov_segment = "%sprov%s" % (sep, sep)

    collected = []
    for data_file in glob(data_path + "**/*." + ext, recursive=True):
        prov_file = data_file.replace(data_segment, prov_segment)
        collected += CitationStorer.load_citations_from_file(
            data_file,
            prov_file,
            baseurl="http://dx.doi.org/",
            service_name="OpenCitations Index: COCI",
            id_type="doi",
            id_shape="http://dx.doi.org/([[XXX__decode]])",
            citation_type=None,
        )
    return collected
def test_store_citation(self):
    """Store the reference citations twice and verify the files produced.

    A ``CitationStorer`` is created on a clean ``self.tmp_path + "_store"``
    directory with per-file limits of 4 (CSV), 2 (RDF) and 3 (Scholix)
    citations. After the first pass the test checks that the expected
    directories exist, that the expected number of files was written for
    each format, and that the stored content matches the source citations.
    The same citations are then stored a second time and the file counts
    are checked again.
    """
    tmp_subpath = "_store"
    out_dir = self.tmp_path + tmp_subpath
    if exists(out_dir):
        rmtree(out_dir)

    source_citations = CitationStorer.load_citations_from_file(
        self.citation_data_csv_path,
        self.citation_prov_csv_path,
        baseurl="http://dx.doi.org/",
        service_name="OpenCitations Index: COCI",
        id_type="doi",
        id_shape="http://dx.doi.org/([[XXX__decode]])",
        citation_type=None,
    )

    storer = CitationStorer(
        out_dir,
        self.baseurl,
        n_citations_csv_file=4,
        n_citations_rdf_file=2,
        n_citations_slx_file=3,
    )
    for item in source_citations:
        storer.store_citation(item)

    data_root = out_dir + sep + "data"
    csv_data_dir = data_root + sep + "csv" + sep
    rdf_data_dir = data_root + sep + "rdf" + sep
    slx_data_dir = data_root + sep + "slx" + sep
    # The provenance root already ends with a separator, so the paths below
    # contain a doubled separator; this is kept exactly as in the original.
    prov_root = out_dir + sep + "prov" + sep
    csv_prov_dir = prov_root + sep + "csv" + sep
    rdf_prov_dir = prov_root + sep + "rdf" + sep

    def count_files(folder, pattern):
        # Number of files matching `pattern` anywhere below `folder`.
        return len(glob(folder + "**" + sep + pattern, recursive=True))

    # All the format-specific directories must have been created.
    expected_dirs = (
        csv_data_dir, rdf_data_dir, slx_data_dir, csv_prov_dir, rdf_prov_dir)
    self.assertTrue(all(exists(folder) for folder in expected_dirs))

    # The expected number of files must exist after the first pass.
    self.assertEqual(count_files(csv_data_dir, "*.csv"), 2)
    self.assertEqual(count_files(csv_prov_dir, "*.csv"), 2)
    self.assertEqual(count_files(rdf_data_dir, "*.ttl"), 3)
    self.assertEqual(count_files(rdf_prov_dir, "*.ttl"), 3)
    self.assertEqual(count_files(slx_data_dir, "*.scholix"), 2)

    # The stored files must contain the same citations as the source files.
    stored_csv = CitationStorerTest.get_stored_citation_list(
        csv_data_dir, "csv")
    self.citations_csv(source_citations, stored_csv)

    stored_rdf = CitationStorerTest.get_stored_citation_list(
        rdf_data_dir, "ttl")
    self.citations_rdf(source_citations, stored_rdf)

    stored_slx = CitationStorerTest.get_stored_citation_list(
        slx_data_dir, "scholix")
    self.citations_slx(source_citations, stored_slx)

    # Store the same citations once more and check that they are added
    # correctly alongside the existing files.
    for item in source_citations:
        storer.store_citation(item)

    self.assertEqual(count_files(csv_data_dir, "*.csv"), 3)
    self.assertEqual(count_files(csv_prov_dir, "*.csv"), 3)
    self.assertEqual(count_files(rdf_data_dir, "*.ttl"), 6)
    self.assertEqual(count_files(rdf_prov_dir, "*.ttl"), 6)
    self.assertEqual(count_files(slx_data_dir, "*.scholix"), 4)
def extract_citations(idbaseurl, baseurl, python, pclass, input, lookup, data,
                      prefix, agent, source, service, verbose, doi_manager,
                      rf_handler, oci_to_do=None):
    """Read citations from a citation source and store the new, valid ones.

    For every record returned by the citation source, an OCI is computed;
    records whose OCI is already known (or is outside ``oci_to_do`` when that
    is given) are skipped, records whose citing or cited DOI is not valid are
    counted as errors, and all the others are turned into ``Citation``
    objects and stored through a ``CitationStorer``.

    Args:
        idbaseurl: prefix prepended to the URL-quoted citing/cited
            identifiers and to the identifier shape pattern.
        baseurl: dataset base URL; a trailing "/" is appended if missing
            before it is handed to ``CitationStorer``.
        python, pclass, input: forwarded to ``import_citation_source`` to
            obtain the citation source.
        lookup: lookup file given to ``OCIManager``.
        data: storage directory given to ``CitationStorer``; already stored
            OCIs are read from its ``data`` sub-directory.
        prefix: forwarded to ``OCIManager.get_oci``.
        agent, source, service: forwarded to the ``Citation`` constructor.
        verbose: when true, print one message per processed record.
        doi_manager: object exposing ``is_valid(doi)``.
        rf_handler: object exposing ``get_date``, ``share_issn`` and
            ``share_orcid``.
        oci_to_do: optional set of OCIs (without the ``oci:`` prefix) to
            restrict processing to. It is modified in place: already stored
            OCIs are removed up front and each handled OCI is removed after
            it has been processed.

    Returns:
        A tuple ``(new_citations_added, citations_already_present,
        error_in_dois_existence)``.
    """
    id_base = idbaseurl
    dataset_url = baseurl if baseurl.endswith("/") else baseurl + "/"

    oci_manager = OCIManager(lookup_file=lookup)
    # TODO: we need to specify carefully the dir, e.g. by adding an additional
    # flag to distinguish between the files belonging to a particular process,
    # and it should be aligned with the storer.
    known_ocis = CSVManager.load_csv_column_as_set(data + sep + "data", "oci")
    if oci_to_do is not None:
        oci_to_do.difference_update(known_ocis)
    storer = CitationStorer(data, dataset_url)

    added = 0
    already_present = 0
    invalid_dois = 0

    citation_source = import_citation_source(python, pclass, input)
    while True:
        record = citation_source.get_next_citation_data()
        if record is None:
            break

        citing, cited, created, timespan, journal_sc, author_sc = record
        oci = oci_manager.get_oci(citing, cited, prefix)
        oci_noprefix = oci.replace("oci:", "")

        # Skip anything already stored, or outside the requested subset.
        if oci_noprefix in known_ocis or (
                oci_to_do is not None and oci_noprefix not in oci_to_do):
            if verbose:
                print(
                    "WARNING: the citation between DOI '%s' and DOI '%s' has been already processed"
                    % (citing, cited))
            already_present += 1
            continue

        if doi_manager.is_valid(citing) and doi_manager.is_valid(cited):
            citing_date = rf_handler.get_date(citing) if created is None else created
            cited_date = rf_handler.get_date(cited)

            # Self-citation flags supplied by the source are used only when
            # they are real booleans; otherwise they are computed here.
            if not isinstance(journal_sc, bool):
                journal_sc = rf_handler.share_issn(citing, cited)
            if not isinstance(author_sc, bool):
                author_sc = rf_handler.share_orcid(citing, cited)

            if created is not None and timespan is not None:
                # Creation date and timespan are passed as given and the two
                # date arguments are left unset.
                citing_date_arg, cited_date_arg = None, None
                creation_arg, timespan_arg = created, timespan
            else:
                # The dates are passed instead, leaving creation/timespan unset.
                citing_date_arg, cited_date_arg = citing_date, cited_date
                creation_arg, timespan_arg = None, None

            cit = Citation(
                oci,
                id_base + quote(citing), citing_date_arg,
                id_base + quote(cited), cited_date_arg,
                creation_arg, timespan_arg,
                1, agent, source,
                datetime.now().strftime('%Y-%m-%dT%H:%M:%S'),
                service, "doi", id_base + "([[XXX__decode]])", "reference",
                journal_sc, author_sc,
                None, "Creation of the citation", None)
            storer.store_citation(cit)

            if verbose:
                print(
                    "Create citation data for '%s' between DOI '%s' and DOI '%s'"
                    % (oci, citing, cited))
            added += 1
            known_ocis.add(oci_noprefix)
        else:
            if verbose:
                print(
                    "WARNING: some DOIs, among '%s' and '%s', do not exist"
                    % (citing, cited))
            invalid_dois += 1

        # The OCI has been handled (stored or rejected): drop it from the
        # to-do set. It is guaranteed to be in the set on this path.
        if oci_to_do is not None:
            oci_to_do.remove(oci_noprefix)

    return added, already_present, invalid_dois