示例#1
0
 def __load_citations(self, data, prov):
     """Load citations from a data file and its provenance file.

     Thin wrapper around ``CitationStorer.load_citations_from_file`` that
     fixes the DOI-related settings and takes the service name from
     ``self.service``.

     :param data: forwarded as the first positional argument of the loader.
     :param prov: forwarded as the second positional argument of the loader.
     :return: whatever ``CitationStorer.load_citations_from_file`` returns.
     """
     # Settings shared by every load performed through this wrapper.
     loader_settings = dict(
         baseurl="http://dx.doi.org/",
         service_name=self.service,
         id_type="doi",
         id_shape="http://dx.doi.org/([[XXX__decode]])",
         citation_type=None,
     )
     return CitationStorer.load_citations_from_file(data, prov,
                                                    **loader_settings)
示例#2
0
    def load_and_store_citations(self, data_path, prov_path, ext):
        """Load citations, store them in a scratch directory and read them back.

        :param data_path: data file handed to the citation loader.
        :param prov_path: provenance file handed to the citation loader.
        :param ext: file extension to read back; also the key used to look
            up the storage sub-directory in ``self.ext_local_dir``.
        :return: a ``(origin_citation_list, stored_citation_list)`` pair:
            the citations as loaded from the input files and as re-loaded
            from the files written by the storer.
        """
        work_dir = self.tmp_path + "_load"

        # Always start from an empty scratch directory.
        if exists(work_dir):
            rmtree(work_dir)

        loaded = CitationStorer.load_citations_from_file(
            data_path,
            prov_path,
            baseurl="http://dx.doi.org/",
            service_name="OpenCitations Index: COCI",
            id_type="doi",
            id_shape="http://dx.doi.org/([[XXX__decode]])",
            citation_type=None)

        storer = CitationStorer(work_dir, self.baseurl)
        for item in loaded:
            storer.store_citation(item)

        # <work_dir>/data/<sub-directory for ext>/ (trailing separator kept).
        stored_dir = sep.join([work_dir, "data", self.ext_local_dir[ext], ""])
        reloaded = CitationStorerTest.get_stored_citation_list(stored_dir, ext)

        return loaded, reloaded
示例#3
0
 def setUp(self):
     """Prepare the fixtures used by the tests.

     Removes the ``.dir_citation_source`` file under
     ``index/test_data/tmp_store/data`` when it exists, then builds the
     ``OCIManager`` (``self.oci``) and loads the sample citations
     (``self.citation_list``) from the CSV files in ``index/test_data``.
     """
     info_file_path = (
         "index%stest_data%stmp_store%sdata%s.dir_citation_source" %
         ((sep, ) * 4))
     lookup_csv = "index%stest_data%slookup_full.csv" % (sep, sep)
     data_csv = "index%stest_data%scitations_data.csv" % (sep, sep)
     prov_csv = "index%stest_data%scitations_prov.csv" % (sep, sep)

     # Drop the file possibly left behind by a previous run.
     if exists(info_file_path):
         remove(info_file_path)

     self.oci = OCIManager(lookup_file=lookup_csv)
     self.citation_list = CitationStorer.load_citations_from_file(
         data_csv,
         prov_csv,
         baseurl="http://dx.doi.org/",
         service_name="OpenCitations Index: COCI",
         id_type="doi",
         id_shape="http://dx.doi.org/([[XXX__decode]])",
         citation_type=None)
示例#4
0
    def get_stored_citation_list(data_path, ext):
        """Load the citations from every ``*.<ext>`` file under ``data_path``.

        The provenance file paired with each data file is obtained by
        replacing the ``<sep>data<sep>`` path segment with ``<sep>prov<sep>``.

        :param data_path: directory prefix searched recursively; it is
            concatenated directly with the glob pattern, so it is expected
            to end with a path separator.
        :param ext: file extension (without the leading dot) to look for.
        :return: a list with the citations loaded from all matching files;
            empty when no file matches.
        """
        data_segment = "%sdata%s" % (sep, sep)
        prov_segment = "%sprov%s" % (sep, sep)

        loaded = []
        for data_file in glob(data_path + "**/*." + ext, recursive=True):
            prov_file = data_file.replace(data_segment, prov_segment)
            loaded += CitationStorer.load_citations_from_file(
                data_file,
                prov_file,
                baseurl="http://dx.doi.org/",
                service_name="OpenCitations Index: COCI",
                id_type="doi",
                id_shape="http://dx.doi.org/([[XXX__decode]])",
                citation_type=None)

        return loaded
示例#5
0
    def test_store_citation(self):
        """Store the sample citations and verify the files written on disk.

        The test:

        1. loads the citations from ``self.citation_data_csv_path`` and
           ``self.citation_prov_csv_path``;
        2. stores them with a ``CitationStorer`` configured with small
           per-file limits (4 for CSV, 2 for RDF, 3 for Scholix);
        3. checks that the expected directories and number of files exist;
        4. re-loads the stored files and compares them with the originals
           through ``self.citations_csv`` / ``citations_rdf`` /
           ``citations_slx``;
        5. stores the same citations a second time and checks the new
           number of files.
        """
        tmp_path = self.tmp_path + "_store"

        # Always start from an empty scratch directory.
        if exists(tmp_path):
            rmtree(tmp_path)

        origin_citation_list = CitationStorer.load_citations_from_file(
            self.citation_data_csv_path,
            self.citation_prov_csv_path,
            baseurl="http://dx.doi.org/",
            service_name="OpenCitations Index: COCI",
            id_type="doi",
            id_shape="http://dx.doi.org/([[XXX__decode]])",
            citation_type=None)

        cs = CitationStorer(tmp_path,
                            self.baseurl,
                            n_citations_csv_file=4,
                            n_citations_rdf_file=2,
                            n_citations_slx_file=3)
        for citation in origin_citation_list:
            cs.store_citation(citation)

        # Both roots end with exactly one separator, so the sub-directory
        # paths below never contain a doubled separator.
        data_path = tmp_path + sep + "data" + sep
        csv_data_path = data_path + "csv" + sep
        rdf_data_path = data_path + "rdf" + sep
        slx_data_path = data_path + "slx" + sep

        prov_path = tmp_path + sep + "prov" + sep
        csv_prov_path = prov_path + "csv" + sep
        rdf_prov_path = prov_path + "rdf" + sep

        def count_files(directory, extension):
            """Count the ``*.<extension>`` files found recursively under ``directory``."""
            return len(
                glob(directory + "**" + sep + "*." + extension,
                     recursive=True))

        def assert_file_counts(n_csv, n_rdf, n_slx):
            """Check how many CSV, RDF and Scholix files are on disk."""
            self.assertEqual(count_files(csv_data_path, "csv"), n_csv)
            self.assertEqual(count_files(csv_prov_path, "csv"), n_csv)
            self.assertEqual(count_files(rdf_data_path, "ttl"), n_rdf)
            self.assertEqual(count_files(rdf_prov_path, "ttl"), n_rdf)
            self.assertEqual(count_files(slx_data_path, "scholix"), n_slx)

        # Check if directories exist (one assertion per directory, so that a
        # failure names the missing one)
        for directory in (csv_data_path, rdf_data_path, slx_data_path,
                          csv_prov_path, rdf_prov_path):
            self.assertTrue(exists(directory),
                            "Missing directory: %s" % directory)

        # Check if files exist
        assert_file_counts(n_csv=2, n_rdf=3, n_slx=2)

        # Check if the new stored files contain the same citations as the
        # original ones
        stored_citation_list_csv = CitationStorerTest.get_stored_citation_list(
            csv_data_path, "csv")
        self.citations_csv(origin_citation_list, stored_citation_list_csv)
        stored_citation_list_rdf = CitationStorerTest.get_stored_citation_list(
            rdf_data_path, "ttl")
        self.citations_rdf(origin_citation_list, stored_citation_list_rdf)
        stored_citation_list_slx = CitationStorerTest.get_stored_citation_list(
            slx_data_path, "scholix")
        self.citations_slx(origin_citation_list, stored_citation_list_slx)

        # Store again all the citations previously stored and check that they
        # are correctly added to the existing files
        for citation in origin_citation_list:
            cs.store_citation(citation)
        assert_file_counts(n_csv=3, n_rdf=6, n_slx=4)
示例#6
0
def extract_citations(idbaseurl,
                      baseurl,
                      python,
                      pclass,
                      input,
                      lookup,
                      data,
                      prefix,
                      agent,
                      source,
                      service,
                      verbose,
                      doi_manager,
                      rf_handler,
                      oci_to_do=None):
    """Create and store the citations provided by a citation source.

    Every citation returned by the source is given an OCI; citations whose
    OCI is already stored (or is not listed in ``oci_to_do``, when given)
    are skipped, and the remaining ones are stored if both DOIs are valid.

    :param idbaseurl: prefix prepended to the URL-quoted DOIs and to the
        ``([[XXX__decode]])`` identifier shape.
    :param baseurl: dataset URL passed to the ``CitationStorer``; a trailing
        ``/`` is appended when missing.
    :param python: forwarded to ``import_citation_source``.
    :param pclass: forwarded to ``import_citation_source``.
    :param input: forwarded to ``import_citation_source``.
    :param lookup: lookup file used to build the ``OCIManager``.
    :param data: directory given to the ``CitationStorer``; the OCIs already
        stored are read from the ``oci`` column found under ``<data>/data``.
    :param prefix: forwarded to ``OCIManager.get_oci``.
    :param agent: forwarded to the ``Citation`` constructor.
    :param source: forwarded to the ``Citation`` constructor.
    :param service: forwarded to the ``Citation`` constructor.
    :param verbose: when true, print a message for every citation handled.
    :param doi_manager: object exposing ``is_valid(doi)``.
    :param rf_handler: object exposing ``get_date(doi)``,
        ``share_issn(citing, cited)`` and ``share_orcid(citing, cited)``.
    :param oci_to_do: optional set of OCIs (without the ``oci:`` prefix) to
        restrict the processing to. It is modified in place: the OCIs
        already stored are removed up front, and each processed OCI is
        removed as it is handled.
    :return: the tuple ``(new_citations_added, citations_already_present,
        error_in_dois_existence)``.
    """
    BASE_URL = idbaseurl
    DATASET_URL = baseurl if baseurl.endswith("/") else baseurl + "/"

    oci_manager = OCIManager(lookup_file=lookup)
    exi_ocis = CSVManager.load_csv_column_as_set(
        data + sep + "data", "oci"
    )  # TODO: we need to specify carefully the dir, eg by adding an additional flag to distinguish between the files belonging to a particular process, and it should be aligned with the storer.
    if oci_to_do is not None:
        oci_to_do.difference_update(exi_ocis)
    cit_storer = CitationStorer(data, DATASET_URL)

    citations_already_present = 0
    new_citations_added = 0
    error_in_dois_existence = 0

    cs = import_citation_source(python, pclass, input)
    next_citation = cs.get_next_citation_data()

    while next_citation is not None:
        citing, cited, created, timespan, journal_sc, author_sc = next_citation
        oci = oci_manager.get_oci(citing, cited, prefix)
        oci_noprefix = oci.replace("oci:", "")
        if oci_noprefix not in exi_ocis and (oci_to_do is None
                                             or oci_noprefix in oci_to_do):
            if doi_manager.is_valid(citing) and doi_manager.is_valid(cited):
                if created is not None and timespan is not None:
                    # The source already provides creation date and timespan:
                    # the publication dates are not passed to the citation, so
                    # no date lookup is needed.
                    citing_date = cited_date = None
                    creation, duration = created, timespan
                else:
                    # Creation/timespan are not passed; the citation receives
                    # the two publication dates instead.
                    if created is None:
                        citing_date = rf_handler.get_date(citing)
                    else:
                        citing_date = created
                    cited_date = rf_handler.get_date(cited)
                    creation = duration = None

                # Only genuine booleans coming from the source are trusted;
                # anything else (None included) is computed via the handler.
                if not isinstance(journal_sc, bool):
                    journal_sc = rf_handler.share_issn(citing, cited)
                if not isinstance(author_sc, bool):
                    author_sc = rf_handler.share_orcid(citing, cited)

                cit = Citation(
                    oci, BASE_URL + quote(citing), citing_date,
                    BASE_URL + quote(cited), cited_date, creation, duration,
                    1, agent, source,
                    datetime.now().strftime('%Y-%m-%dT%H:%M:%S'), service,
                    "doi", BASE_URL + "([[XXX__decode]])", "reference",
                    journal_sc, author_sc, None,
                    "Creation of the citation", None)

                cit_storer.store_citation(cit)

                if verbose:
                    print(
                        "Create citation data for '%s' between DOI '%s' and DOI '%s'"
                        % (oci, citing, cited))
                new_citations_added += 1
                # Remember the OCI so that a duplicate later in the same
                # source is not stored twice.
                exi_ocis.add(oci_noprefix)
            else:
                if verbose:
                    print(
                        "WARNING: some DOIs, among '%s' and '%s', do not exist"
                        % (citing, cited))
                error_in_dois_existence += 1
            if oci_to_do is not None:
                # Safe: this branch is reached only when the OCI is in the set.
                oci_to_do.remove(oci_noprefix)
        else:
            # NOTE(review): this branch is also taken when the OCI is simply
            # not listed in oci_to_do, yet it is reported and counted as
            # "already processed" - confirm this is intended.
            if verbose:
                print(
                    "WARNING: the citation between DOI '%s' and DOI '%s' has been already processed"
                    % (citing, cited))
            citations_already_present += 1

        next_citation = cs.get_next_citation_data()

    return new_citations_added, citations_already_present, error_in_dois_existence