Пример #1
0
def create_csv(doi_file, date_file, orcid_file, issn_file):
    """Open one ``CSVManager`` for each of the four support files.

    Args:
        doi_file: CSV path for the valid-DOI store.
        date_file: CSV path for the identifier-to-date store.
        orcid_file: CSV path for the identifier-to-ORCID store.
        issn_file: CSV path for the identifier-to-ISSN store.

    Returns:
        A ``(valid_doi, id_date, id_orcid, id_issn)`` tuple of managers.
    """
    # The managers are created in the same order as they are returned.
    return tuple(
        CSVManager(csv_path=path)
        for path in (doi_file, date_file, orcid_file, issn_file)
    )
Пример #2
0
    def __init__(self, date=None, orcid=None, issn=None, doi=None, **params):
        """Set up the support stores, the identifier managers and the HTTP headers.

        Each of ``date``, ``orcid``, ``issn`` and ``doi`` falls back to a
        ``CSVManager(store_new=False)`` when it is ``None``. Every extra
        keyword in ``params`` is copied onto the instance as an attribute.
        When the instance exposes a ``use_api_service`` attribute, its value
        is forwarded to ``DOIManager`` together with the ``doi`` store.
        """
        date = CSVManager(store_new=False) if date is None else date
        orcid = CSVManager(store_new=False) if orcid is None else orcid
        issn = CSVManager(store_new=False) if issn is None else issn
        doi = CSVManager(store_new=False) if doi is None else doi

        # Extra keywords are applied first, so the fixed attributes assigned
        # below take precedence whenever a name clashes.
        for name, value in params.items():
            setattr(self, name, value)

        self.issn = issn
        self.date = date
        self.orcid = orcid

        # hasattr() also finds an attribute defined on the class, not only one
        # that arrived through params.
        dm_args = (doi, self.use_api_service) if hasattr(self, 'use_api_service') else (doi,)
        self.dm = DOIManager(*dm_args)
        self.im = ISSNManager()
        self.om = ORCIDManager()

        user_agent = (
            "ResourceFinder / OpenCitations Indexes "
            "(http://opencitations.net; mailto:[email protected])"
        )
        self.headers = {"User-Agent": user_agent}
Пример #3
0
    def test_get_value(self):
        # A key stored in the CSV returns its set of values; a key that is
        # not stored returns None.
        manager = CSVManager(self.initial_path)

        self.assertEqual(
            {"2015-03-09"}, manager.get_value("doi:10.1108/jd-12-2013-0166"))
        self.assertIsNone(manager.get_value("doi:10.1108/jd-12-2013-0167"))
Пример #4
0
    def test_datacite_get_orcid(self):
        doi = "10.5065/d6b8565d"
        expected = "0000-0001-7734-8388"
        unexpected = "0000-0001-5506-523X"

        # API only, no support files.
        api_only = DataCiteResourceFinder()
        self.assertIn(expected, api_only.get_orcid(doi))
        self.assertNotIn(unexpected, api_only.get_orcid(doi))

        # Support files only, API disabled.
        files_only = DataCiteResourceFinder(
            orcid=CSVManager(self.orcid_path),
            doi=CSVManager(self.doi_path),
            use_api_service=False)
        self.assertIn(expected, files_only.get_orcid(doi))
        self.assertNotIn(unexpected, files_only.get_orcid(doi))

        # Neither support files nor API: the lookup must return None.
        no_source = DataCiteResourceFinder(use_api_service=False)
        self.assertIsNone(no_source.get_orcid(doi))
Пример #5
0
    def test_orcid_get_orcid(self):
        doi = "10.1108/jd-12-2013-0166"
        expected = "0000-0003-0530-4305"
        unexpected = "0000-0001-5506-523X"

        # API only, no support files.
        api_only = ORCIDResourceFinder()
        self.assertIn(expected, api_only.get_orcid(doi))
        self.assertNotIn(unexpected, api_only.get_orcid(doi))

        # Support files only, API disabled.
        files_only = ORCIDResourceFinder(
            orcid=CSVManager(self.orcid_path),
            doi=CSVManager(self.doi_path),
            use_api_service=False)
        self.assertIn(expected, files_only.get_orcid(doi))
        self.assertNotIn(unexpected, files_only.get_orcid(doi))

        # Neither support files nor API: the lookup must return None.
        no_source = ORCIDResourceFinder(use_api_service=False)
        self.assertIsNone(no_source.get_orcid(doi))
Пример #6
0
    def test_crossref_get_pub_date(self):
        doi = "10.1007/s11192-018-2988-z"
        full_date = "2019-01-02"

        # API only, no support files.
        api_only = CrossrefResourceFinder()
        self.assertIn(full_date, api_only.get_pub_date(doi))
        self.assertNotEqual("2019", api_only.get_pub_date(doi))

        # Support files only, API disabled.
        files_only = CrossrefResourceFinder(
            date=CSVManager(self.date_path),
            doi=CSVManager(self.doi_path),
            use_api_service=False)
        self.assertIn(full_date, files_only.get_pub_date(doi))
        self.assertNotEqual("2018-01-02", files_only.get_pub_date(doi))

        # Neither support files nor API: the lookup must return None.
        no_source = CrossrefResourceFinder(use_api_service=False)
        self.assertIsNone(no_source.get_pub_date(doi))
Пример #7
0
    def test_crossref_get_issn(self):
        doi = "10.1007/s11192-018-2988-z"
        unexpected = "0138-9000"

        # API only, no support files.
        api_only = CrossrefResourceFinder()
        self.assertIn("0138-9130", api_only.get_container_issn(doi))
        self.assertNotIn(unexpected, api_only.get_container_issn(doi))

        # Support files only, API disabled.
        files_only = CrossrefResourceFinder(
            issn=CSVManager(self.issn_path),
            doi=CSVManager(self.doi_path),
            use_api_service=False)
        self.assertIn("1588-2861", files_only.get_container_issn(doi))
        self.assertNotIn(unexpected, files_only.get_container_issn(doi))

        # Neither support files nor API: the lookup must return None.
        no_source = CrossrefResourceFinder(use_api_service=False)
        self.assertIsNone(no_source.get_container_issn(doi))
Пример #8
0
    def test_crossref_get_orcid(self):
        doi = "10.1007/s11192-018-2988-z"
        expected = "0000-0003-0530-4305"
        unexpected = "0000-0001-5506-523X"

        # API only, no support files.
        api_only = CrossrefResourceFinder()
        self.assertIn(expected, api_only.get_orcid(doi))
        self.assertNotIn(unexpected, api_only.get_orcid(doi))

        # Support files only, API disabled.
        files_only = CrossrefResourceFinder(
            orcid=CSVManager(self.orcid_path),
            doi=CSVManager(self.doi_path),
            use_api_service=False)
        self.assertIn(expected, files_only.get_orcid(doi))
        self.assertNotIn(unexpected, files_only.get_orcid(doi))

        # Neither support files nor API: the lookup must return None.
        no_source = CrossrefResourceFinder(use_api_service=False)
        self.assertIsNone(no_source.get_orcid(doi))
Пример #9
0
    def test_datacite_get_pub_date(self):
        doi = "10.6092/issn.2532-8816/8555"
        full_date = "2019-05-27"

        # API only, no support files.
        api_only = DataCiteResourceFinder()
        self.assertIn(full_date, api_only.get_pub_date(doi))
        self.assertNotEqual("2019", api_only.get_pub_date(doi))

        # Support files only, API disabled.
        files_only = DataCiteResourceFinder(
            date=CSVManager(self.date_path),
            doi=CSVManager(self.doi_path),
            use_api_service=False)
        self.assertIn(full_date, files_only.get_pub_date(doi))
        self.assertNotEqual("2018-01-02", files_only.get_pub_date(doi))

        # Neither support files nor API: the lookup must return None.
        no_source = DataCiteResourceFinder(use_api_service=False)
        self.assertIsNone(no_source.get_pub_date(doi))
Пример #10
0
    def test_datacite_get_issn(self):
        doi = "10.14763/2019.1.1389"
        expected = "2197-6775"
        unexpected = "1588-2861"

        # API only, no support files.
        api_only = DataCiteResourceFinder()
        self.assertIn(expected, api_only.get_container_issn(doi))
        self.assertNotIn(unexpected, api_only.get_container_issn(doi))

        # Support files only, API disabled.
        files_only = DataCiteResourceFinder(
            issn=CSVManager(self.issn_path),
            doi=CSVManager(self.doi_path),
            use_api_service=False)
        self.assertIn(expected, files_only.get_container_issn(doi))
        self.assertNotIn(unexpected, files_only.get_container_issn(doi))

        # Neither support files nor API: the lookup must return None.
        no_source = DataCiteResourceFinder(use_api_service=False)
        self.assertIsNone(no_source.get_container_issn(doi))
Пример #11
0
 def test_creation(self):
     # Loading the initial CSV must expose exactly these key -> value-set pairs.
     manager = CSVManager(self.initial_path)
     expected = {
         "doi:10.1108/jd-12-2013-0166": {"2015-03-09"},
         "doi:10.7717/peerj.4375": {"2018-02-13"},
     }
     self.assertDictEqual(manager.data, expected)
Пример #12
0
    def __init__(self, valid_doi=None, use_api_service=True):
        """Configure the DOI manager.

        Args:
            valid_doi: store of DOIs; a ``CSVManager(store_new=False)`` is
                created when ``None`` is given.
            use_api_service: flag kept on the instance as ``use_api_service``.
        """
        store = CSVManager(store_new=False) if valid_doi is None else valid_doi

        self.api = "https://doi.org/api/handles/"
        self.valid_doi = store
        self.use_api_service = use_api_service
        self.p = "doi:"
        # The parent initialiser runs last, after all attributes are in place.
        super(DOIManager, self).__init__()
Пример #13
0
 def test_load_csv_column_as_set(self):
     # The "oci" column of the citation data must load as exactly this set.
     expected = {
         "02001000308362819371213133704040001020809-020010009063615193700006300030306151914",
         "02001000002361927283705040000-02001000002361927283705030002",
         "02001000002361927283705040000-020010003093612062710020603000720",
         "02001000308362819371213133704040001020804-02001000308362819371213133704040000030707",
         "020010000023625242110370100030001-02001010009361222251430273701090809370903040403",
         "020010103003602000105370205010358000059-02001010304362801000208030304330009000400020107",
     }
     loaded = CSVManager.load_csv_column_as_set(self.citation_path, "oci", 4)
     self.assertSetEqual(loaded, expected)
Пример #14
0
    def test_doi_is_valid(self):
        # Default manager: no DOI file given, API left enabled.
        api_only = DOIManager()
        for doi in (self.valid_doi_1, self.valid_doi_2):
            self.assertTrue(api_only.is_valid(doi))
        self.assertFalse(api_only.is_valid(self.invalid_doi))

        # DOI file given, API disabled.
        file_only = DOIManager(
            valid_doi=CSVManager(self.valid_doi_path), use_api_service=False)
        self.assertTrue(file_only.is_valid(self.valid_doi_1))
        self.assertFalse(file_only.is_valid(self.invalid_doi))

        # No DOI file and no API: every DOI must be reported as not valid.
        no_source = DOIManager(use_api_service=False)
        for doi in (self.valid_doi_1, self.invalid_doi):
            self.assertFalse(no_source.is_valid(doi))
Пример #15
0
    def test_add_value(self):
        # Remove any file left at addition_path before creating the manager.
        if exists(self.addition_path):
            remove(self.addition_path)

        manager = CSVManager(self.addition_path)
        additions = (
            ("doi:10.1108/jd-12-2013-0166", "orcid:0000-0003-0530-4305"),
            ("doi:10.7717/peerj.4375", "orcid:0000-0003-1613-5981"),
            ("doi:10.1108/jd-12-2013-0166", "orcid:0000-0001-5506-523X"),
        )
        for key, value in additions:
            manager.add_value(key, value)

        # Values added under the same key must be collected in one set.
        expected = {
            "doi:10.1108/jd-12-2013-0166": {
                "orcid:0000-0003-0530-4305",
                "orcid:0000-0001-5506-523X",
            },
            "doi:10.7717/peerj.4375": {"orcid:0000-0003-1613-5981"},
        }
        self.assertDictEqual(manager.data, expected)
Пример #16
0
    def test_glob(self):
        # Start from a freshly created, empty output directory.
        if exists(self.output_dir):
            rmtree(self.output_dir)
        makedirs(self.output_dir)

        process(self.input_dir, self.output_dir)

        # Every CSV written to output_dir must hold the same data as the file
        # with the same name in input_dir.
        for csv_name in ("valid_doi.csv", "id_date.csv", "id_issn.csv", "id_orcid.csv"):
            orig = CSVManager(self.input_dir + sep + csv_name)
            new = CSVManager(self.output_dir + sep + csv_name)
            self.assertDictEqual(orig.data, new.data)
Пример #17
0
def _load_json_stream(stream):
    """Parse the JSON document held by an already opened file-like object.

    Openers for tar.gz or zip archives hand back byte streams. When ``load``
    rejects such a stream with ``TypeError``, the stream is wrapped in a
    UTF-8 reader and parsed again.
    """
    try:
        return load(stream)
    except TypeError:
        return load(codecs.getreader("utf-8")(stream))


def _most_adopted_date(dates):
    """Return the most frequent value in *dates*, or ``None`` if it is empty.

    When several dates share the highest frequency, the smallest one (the
    oldest, for ISO-like date strings) is returned.
    """
    count = Counter(dates)
    if not count:
        return None
    top_value = count.most_common(1)[0][1]
    return min(date for date, freq in count.items() if freq == top_value)


def process(input_dir, output_dir):
    """Build the DOI, date, ISSN and ORCID support CSVs from a Crossref dump.

    The files found in ``input_dir`` are read twice. The first pass marks
    every item's DOI as valid and stores its publication date, journal ISSNs
    and author ORCIDs when the respective store has no value for that DOI yet.
    The second pass walks the ``reference`` lists and, for each cited DOI
    that is valid and still has no date, collects the dates given by the
    references; the most adopted date (the oldest on ties) is then stored.
    DOIs left without any date are stored with an empty string.

    Args:
        input_dir: location passed to ``get_all_files`` to list the dump files.
        output_dir: directory receiving ``valid_doi.csv``, ``id_date.csv``,
            ``id_issn.csv`` and ``id_orcid.csv``; created if missing.
    """
    if not exists(output_dir):
        makedirs(output_dir)

    citing_doi_with_no_date = set()
    valid_doi = CSVManager(output_dir + sep + "valid_doi.csv")
    id_date = CSVManager(output_dir + sep + "id_date.csv")
    id_issn = CSVManager(output_dir + sep + "id_issn.csv")
    id_orcid = CSVManager(output_dir + sep + "id_orcid.csv")

    doi_manager = DOIManager(valid_doi)
    issn_manager = ISSNManager()
    orcid_manager = ORCIDManager()

    all_files, opener = get_all_files(input_dir)
    len_all_files = len(all_files)

    # Read all the JSON files in the Crossref dump to create the main
    # information of all the indexes.
    print("\n\n# Add valid DOIs from Crossref metadata")
    for file_idx, file in enumerate(all_files, 1):
        with opener(file) as f:
            print("Open file %s of %s" % (file_idx, len_all_files))
            data = _load_json_stream(f)

            if "items" not in data:
                continue

            for obj in data['items']:
                if "DOI" not in obj:
                    continue

                citing_doi = doi_manager.normalise(obj["DOI"], True)
                doi_manager.set_valid(citing_doi)

                if id_date.get_value(citing_doi) is None:
                    citing_date = Citation.check_date(build_pubdate(obj))
                    if citing_date is not None:
                        id_date.add_value(citing_doi, citing_date)
                        # The DOI may have been seen without a date earlier.
                        citing_doi_with_no_date.discard(citing_doi)
                    else:
                        citing_doi_with_no_date.add(citing_doi)

                if id_issn.get_value(citing_doi) is None and "type" in obj:
                    cur_type = obj["type"]
                    if cur_type is not None and "journal" in cur_type and "ISSN" in obj:
                        cur_issn = obj["ISSN"]
                        if cur_issn is not None:
                            # Normalise every ISSN before storing any of them.
                            normalised = [issn_manager.normalise(raw) for raw in cur_issn]
                            for issn in normalised:
                                if issn is not None:
                                    id_issn.add_value(citing_doi, issn)

                if id_orcid.get_value(citing_doi) is None and "author" in obj:
                    cur_author = obj['author']
                    if cur_author is not None:
                        for author in cur_author:
                            if "ORCID" in author:
                                orcid = orcid_manager.normalise(author["ORCID"])
                                if orcid is not None:
                                    id_orcid.add_value(citing_doi, orcid)

    # Do it again for updating the dates of the cited DOIs, if these are valid.
    print("\n\n# Check cited DOIs from Crossref reference field")
    doi_date = {}
    for file_idx, file in enumerate(all_files, 1):
        with opener(file) as f:
            print("Open file %s of %s" % (file_idx, len_all_files))
            # Same loader as the first pass, so byte streams are handled
            # identically in both passes.
            data = _load_json_stream(f)

            if "items" not in data:
                continue

            for obj in data['items']:
                if "DOI" not in obj or "reference" not in obj:
                    continue

                for ref in obj['reference']:
                    if "DOI" not in ref:
                        continue

                    cited_doi = doi_manager.normalise(ref["DOI"], True)
                    if not doi_manager.is_valid(cited_doi) or id_date.get_value(cited_doi) is not None:
                        continue

                    # The entry is created even when no usable date follows,
                    # so the DOI still gets an (empty) date at the end.
                    dates = doi_date.setdefault(cited_doi, [])
                    cited_date = Citation.check_date(build_pubdate(ref))
                    if cited_date is not None:
                        dates.append(cited_date)
                        citing_doi_with_no_date.discard(cited_doi)

    # Add the date to the DOI if such date is the most adopted one in the
    # various references. In case two distinct dates are used the most,
    # select the older one.
    for doi, dates in doi_date.items():
        best_date = _most_adopted_date(dates)
        id_date.add_value(doi, "" if best_date is None else best_date)

    # Add empty dates for the remaining DOIs.
    for doi in citing_doi_with_no_date:
        id_date.add_value(doi, "")
Пример #18
0
def extract_citations(idbaseurl,
                      baseurl,
                      python,
                      pclass,
                      input,
                      lookup,
                      data,
                      prefix,
                      agent,
                      source,
                      service,
                      verbose,
                      doi_manager,
                      rf_handler,
                      oci_to_do=None):
    """Create and store citation data for every citation a source provides.

    Citations are pulled one at a time from the source returned by
    ``import_citation_source(python, pclass, input)`` until it yields
    ``None``. A citation is skipped when its OCI is already among the stored
    ones or, if ``oci_to_do`` is given, when its OCI is not in that set. For
    the others, a ``Citation`` is built and stored provided that both DOIs
    are valid according to ``doi_manager``.

    Args:
        idbaseurl: prefix prepended to the quoted citing and cited DOIs.
        baseurl: dataset URL; a trailing "/" is appended when missing.
        python, pclass, input: arguments forwarded to
            ``import_citation_source``.
        lookup: lookup file handed to ``OCIManager``.
        data: base directory of the citation data.
        prefix: prefix passed to ``OCIManager.get_oci``.
        agent, source, service: values forwarded to ``Citation``.
        verbose: when true, print a line for each processed citation.
        doi_manager: object whose ``is_valid`` checks the two DOIs.
        rf_handler: object queried for dates and for shared ISSN/ORCID.
        oci_to_do: optional set of OCIs to process; it is updated in place
            (stored OCIs and handled OCIs are removed from it).

    Returns:
        A ``(new_citations_added, citations_already_present,
        error_in_dois_existence)`` tuple of counters.
    """
    BASE_URL = idbaseurl
    DATASET_URL = baseurl if baseurl.endswith("/") else baseurl + "/"

    oci_manager = OCIManager(lookup_file=lookup)
    # TODO: we need to specify carefully the dir, eg by adding an additional
    # flag to distinguish between the files belonging to a particular process,
    # and it should be aligned with the storer.
    exi_ocis = CSVManager.load_csv_column_as_set(data + sep + "data", "oci")
    if oci_to_do is not None:
        oci_to_do.difference_update(exi_ocis)
    cit_storer = CitationStorer(data, DATASET_URL)

    new_citations_added = 0
    citations_already_present = 0
    error_in_dois_existence = 0

    cs = import_citation_source(python, pclass, input)

    while True:
        next_citation = cs.get_next_citation_data()
        if next_citation is None:
            break

        citing, cited, created, timespan, journal_sc, author_sc = next_citation
        oci = oci_manager.get_oci(citing, cited, prefix)
        oci_noprefix = oci.replace("oci:", "")

        # Skip citations already stored or outside the requested subset.
        if oci_noprefix in exi_ocis or (oci_to_do is not None
                                        and oci_noprefix not in oci_to_do):
            if verbose:
                print(
                    "WARNING: the citation between DOI '%s' and DOI '%s' has been already processed"
                    % (citing, cited))
            citations_already_present += 1
            continue

        if doi_manager.is_valid(citing) and doi_manager.is_valid(cited):
            citing_date = rf_handler.get_date(citing) if created is None else created
            cited_date = rf_handler.get_date(cited)
            # Self-citation flags are recomputed unless they are real booleans.
            if not isinstance(journal_sc, bool):
                journal_sc = rf_handler.share_issn(citing, cited)
            if not isinstance(author_sc, bool):
                author_sc = rf_handler.share_orcid(citing, cited)

            # Either the creation date and timespan given by the source are
            # used, or the two publication dates are -- never both.
            if created is not None and timespan is not None:
                date_fields = (None, None, created, timespan)
            else:
                date_fields = (citing_date, cited_date, None, None)
            citing_pub, cited_pub, creation, duration = date_fields

            cit = Citation(
                oci, BASE_URL + quote(citing), citing_pub,
                BASE_URL + quote(cited), cited_pub, creation, duration, 1,
                agent, source,
                datetime.now().strftime('%Y-%m-%dT%H:%M:%S'), service,
                "doi", BASE_URL + "([[XXX__decode]])", "reference",
                journal_sc, author_sc, None,
                "Creation of the citation", None)
            cit_storer.store_citation(cit)

            if verbose:
                print(
                    "Create citation data for '%s' between DOI '%s' and DOI '%s'"
                    % (oci, citing, cited))
            new_citations_added += 1
            exi_ocis.add(oci_noprefix)
        else:
            if verbose:
                print(
                    "WARNING: some DOIs, among '%s' and '%s', do not exist"
                    % (citing, cited))
            error_in_dois_existence += 1

        # Reaching this point with a subset means the OCI was in it.
        if oci_to_do is not None:
            oci_to_do.remove(oci_noprefix)

    return new_citations_added, citations_already_present, error_in_dois_existence