Пример #1
0
def biblio_to_release(biblio: dict) -> ReleaseEntity:
    """
    Helper for close_fuzzy_biblio_matches() et al

    Converts a loosely-structured bibliographic dict into a partial
    ReleaseEntity.

    Contributors come from the first populated key among 'authors' (list of
    dicts with 'name', 'given_name', 'surname'), 'author_names' (list of name
    strings) and 'first_author' (a single name string).

    The container name is stored under extra['container_name'], taken from
    'journal' or, failing that, 'conference'. The release year is read from
    'year' (an int, or a string starting with four digits); 'date' is only
    consulted when 'year' is missing or empty.
    """
    contribs = []
    if biblio.get('authors'):
        contribs = [
            ReleaseContrib(
                raw_name=a.get('name'),
                given_name=a.get('given_name'),
                surname=a.get('surname'),
            )
            for a in biblio['authors']
        ]
    elif biblio.get('author_names'):
        contribs = [ReleaseContrib(raw_name=a) for a in biblio['author_names']]
    elif biblio.get('first_author'):
        contribs = [ReleaseContrib(raw_name=biblio['first_author'])]
    release = ReleaseEntity(
        title=biblio.get("title"),
        # the contributor list was previously built and then discarded; it
        # has to be attached here for author information to reach callers
        contribs=contribs,
        ext_ids=ReleaseExtIds(
            doi=clean_doi(biblio.get("doi")),
            pmid=biblio.get("pmid"),
            pmcid=biblio.get("pmcid"),
            arxiv=biblio.get("arxiv_id"),
        ),
        volume=biblio.get("volume"),
        issue=biblio.get("issue"),
        pages=biblio.get("pages") or biblio.get("first_page"),
        publisher=biblio.get("publisher"),
        release_stage=biblio.get("release_stage"),
        release_type=biblio.get("release_type"),
        extra=dict(),
    )
    if biblio.get('journal'):
        release.extra['container_name'] = biblio['journal']
    elif biblio.get('conference'):
        release.extra['container_name'] = biblio['conference']
    if biblio.get('year'):
        year = biblio['year']
        # string years may carry trailing junk; only the 4-digit prefix is used
        if isinstance(year, str) and len(year) >= 4 and year[0:4].isdigit():
            release.release_year = int(year[0:4])
        elif isinstance(year, int):
            release.release_year = year
    elif biblio.get('date'):
        date = biblio['date']
        if isinstance(date, str) and len(date) >= 4 and date[0:4].isdigit():
            release.release_year = int(date[0:4])
    return release
Пример #2
0
def grobid_ref_to_release(ref: dict) -> ReleaseEntity:
    """
    Converts the dict produced by transform_grobid_ref_xml() into a partial
    ReleaseEntity object (for use with fuzzycat).

    Only fields present in the reference are populated; the journal name, if
    any, ends up in extra['container_name'], and the release year is taken
    from the leading four digits of 'date'.
    """
    authors = [
        ReleaseContrib(
            raw_name=entry.get("name"),
            given_name=entry.get("given_name"),
            surname=entry.get("surname"),
        )
        for entry in (ref.get("authors") or [])
    ]
    identifiers = ReleaseExtIds(
        doi=clean_doi(ref.get("doi")),
        pmid=ref.get("pmid"),
        pmcid=ref.get("pmcid"),
        arxiv=ref.get("arxiv_id"),
    )
    release = ReleaseEntity(
        title=ref.get("title"),
        contribs=authors,
        volume=ref.get("volume"),
        issue=ref.get("issue"),
        pages=ref.get("pages"),
        ext_ids=identifiers,
    )

    journal = ref.get("journal")
    if journal:
        release.extra = {"container_name": journal}

    raw_date = ref.get("date")
    if raw_date and len(raw_date) >= 4 and raw_date[0:4].isdigit():
        release.release_year = int(raw_date[0:4])
    # TODO: try to parse 'date' into an ISO date format, and assign to release_date?
    return release
def ref_to_release(ref: GrobidBiblio) -> ReleaseEntity:
    """
    Builds a partial ReleaseEntity from a parsed GROBID reference.

    Authors are copied into ReleaseContrib objects, external identifiers
    (DOI, PMID, PMCID, arXiv) into ReleaseExtIds, and the journal name (if
    any) into extra['container_name']. The release year is set when 'date'
    starts with four digits.
    """
    contribs = [
        ReleaseContrib(
            raw_name=author.full_name,
            given_name=author.given_name,
            surname=author.surname,
        )
        for author in ref.authors or []
    ]
    release = ReleaseEntity(
        title=ref.title,
        contribs=contribs,
        volume=ref.volume,
        issue=ref.issue,
        pages=ref.pages,
        ext_ids=ReleaseExtIds(
            doi=ref.doi,
            pmid=ref.pmid,
            pmcid=ref.pmcid,
            arxiv=ref.arxiv_id,
        ),
    )
    if ref.journal:
        release.extra = {"container_name": ref.journal}
    # Accept dates longer than a bare year (eg, "2010-05-01"): requiring an
    # exact length of 4 dropped the year for any date with month/day parts.
    if ref.date and len(ref.date) >= 4 and ref.date[0:4].isdigit():
        release.release_year = int(ref.date[0:4])
    return release
def ref_to_release(ref: dict) -> ReleaseEntity:
    """
    Builds a partial ReleaseEntity from a reference dict.

    Reads 'authors' (list of dicts with 'name', 'given_name', 'surname'),
    'title', 'volume', 'issue', 'pages', and the identifiers 'doi', 'pmid',
    'pmcid', 'arxiv_id'. The journal name (if any) is stored in
    extra['container_name']. The release year is set when 'date' starts with
    four digits.
    """
    contribs = [
        ReleaseContrib(
            raw_name=author.get("name"),
            given_name=author.get("given_name"),
            surname=author.get("surname"),
        )
        for author in ref.get("authors") or []
    ]
    release = ReleaseEntity(
        title=ref.get("title"),
        contribs=contribs,
        volume=ref.get("volume"),
        issue=ref.get("issue"),
        pages=ref.get("pages"),
        ext_ids=ReleaseExtIds(
            doi=ref.get("doi"),
            pmid=ref.get("pmid"),
            pmcid=ref.get("pmcid"),
            arxiv=ref.get("arxiv_id"),
        ),
    )
    if ref.get("journal"):
        release.extra = {"container_name": ref.get("journal")}
    date = ref.get("date")
    # Accept dates longer than a bare year (eg, "2010-05-01"): requiring an
    # exact length of 4 dropped the year for any date with month/day parts.
    if date and len(date) >= 4 and date[0:4].isdigit():
        release.release_year = int(date[0:4])
    return release
Пример #5
0
def test_fuzzy_match_different(entity_importer, mocker) -> None:
    """
    Fuzzycat-mocked test of match_existing_release_fuzzy().

    match_release_fuzzy() is patched to return fixed candidate lists, and the
    importer's response is checked for three outcomes: a "STRONG" result, an
    "EXACT" result, and no result at all (None).
    """

    query = ReleaseEntity(
        title="example title: novel work",
        contribs=[ReleaseContrib(raw_name="robin hood")],
        ext_ids=ReleaseExtIds(doi="10.1234/abcdefg"),
    )
    near_duplicate = ReleaseEntity(
        title="Example Title: Novel Work?",
        contribs=[ReleaseContrib(raw_name="robin hood")],
        ext_ids=ReleaseExtIds(),
    )
    unrelated = ReleaseEntity(
        title="entirely different",
        contribs=[ReleaseContrib(raw_name="king tut")],
        ext_ids=ReleaseExtIds(),
    )

    fuzzy_mock = mocker.patch(
        "fatcat_tools.importers.common.match_release_fuzzy")

    def lookup_with(candidates):
        # each call consumes exactly one side_effect entry: the candidate list
        fuzzy_mock.side_effect = [candidates]
        return entity_importer.match_existing_release_fuzzy(query)

    outcome = lookup_with([unrelated, near_duplicate, unrelated, near_duplicate])
    assert (outcome[0], outcome[2]) == ("STRONG", near_duplicate)

    outcome = lookup_with([near_duplicate, near_duplicate, unrelated, query])
    assert (outcome[0], outcome[2]) == ("EXACT", query)

    outcome = lookup_with([unrelated])
    assert outcome is None

    outcome = lookup_with([])
    assert outcome is None
Пример #6
0
 def do_contribs(obj_list: List[Dict[str, Any]], ctype: str) -> List[ReleaseContrib]:
     """
     Turns a list of contributor dicts into ReleaseContrib objects with
     role `ctype`.

     Reads the keys "ORCID", "given", "family", "name", "affiliation" and
     "sequence" from each dict. Only contributors of type "author" get a
     positional index; all other roles get index None.
     """
     results = []
     for position, person in enumerate(obj_list):
         linked_creator = None
         if "ORCID" in person:
             # the lookup is keyed on the last path segment of the ORCID value
             linked_creator = self.lookup_orcid(person["ORCID"].split("/")[-1])

         given = person.get("given")
         family = person.get("family")
         # Sorry humans :(
         display_name: Optional[str]
         if given and family:
             display_name = f"{person['given']} {person['family']}"
         elif family:
             display_name = person["family"]
         else:
             # TODO: can end up empty
             display_name = person.get("name") or given

         extra: Dict[str, Any] = {}
         index: Optional[int] = position if ctype == "author" else None

         # TODO: currently requiring a "name" in all affiliations. Could
         # add ROR support (via identifier) in the near future
         named_affiliations = [
             aff for aff in (person.get("affiliation") or []) if "name" in aff
         ]
         primary_affiliation = None
         if named_affiliations:
             primary_affiliation = named_affiliations[0]["name"]
             remaining = named_affiliations[1:]
             if remaining:
                 # note: affiliation => more_affiliations
                 extra["more_affiliations"] = [
                     clean_str(aff["name"]) for aff in remaining
                 ]

         sequence = person.get("sequence")
         if sequence and sequence != "additional":
             extra["seq"] = clean_str(sequence)

         assert ctype in ("author", "editor", "translator")
         display_name = clean_str(display_name)
         # TODO: what if 'raw_name' is None?
         results.append(
             ReleaseContrib(
                 creator_id=linked_creator,
                 index=index,
                 raw_name=display_name,
                 given_name=clean_str(given),
                 surname=clean_str(family),
                 raw_affiliation=clean_str(primary_affiliation),
                 role=ctype,
                 extra=extra or None,
             ))
     return results
Пример #7
0
    def update_entity(self, re: ReleaseEntity) -> None:
        """
        Copies this form's values onto the given release entity, modifying
        the entity in place.

        The form must already have been validated when this is called.
        """

        def form_value(field_name):
            # blank strings from the form are stored as None on the entity
            value = getattr(self, field_name).data
            return None if value == "" else value

        for field_name in RELEASE_SIMPLE_ATTRS:
            setattr(re, field_name, form_value(field_name))
        for field_name in RELEASE_EXTID_ATTRS:
            setattr(re.ext_ids, field_name, form_value(field_name))

        chosen_date = self.release_date.data
        if chosen_date:
            re.release_year = chosen_date.year

        # Contribs on the form only carry role and raw_name. To keep older
        # contrib metadata (eg, affiliation and extra), entries that point
        # back at a previous position re-use the existing contrib object.
        # TODO: this may be broken; either way needs tests
        previous = re.contribs.copy() if re.contribs else []
        re.contribs = []
        for entry in self.contribs:
            prev_position = entry.prev_index.data
            if prev_position in ("", None):
                contrib = ReleaseContrib(
                    role=entry.role.data or None,
                    raw_name=entry.raw_name.data or None,
                )
            else:
                contrib = previous[int(prev_position)]
                contrib.role = entry.role.data or None
                contrib.raw_name = entry.raw_name.data or None
            re.contribs.append(contrib)

        description = self.edit_description.data
        if description:
            re.edit_extra = dict(description=description)
Пример #8
0
def parse_jalc_persons(raw_persons: List[Any]) -> List[ReleaseContrib]:
    """
    Converts JALC DC person elements into a list of author contribs.

    JALC names are mostly either japanese or english. Two patterns dominate:
    a list alternating between the two languages (the names are translations
    of each other), or a list entirely in one language. Dublin core throws
    away most context, so anything else is hard to disambiguate (Korean and
    other languages also show up). This crude approach only aims to get the
    two common patterns right. Sorry humans!

    When an english and a japanese name are adjacent, they are merged into a
    single english contrib with the japanese name stored under
    extra["original_name"].

    Edge cases for this function:
    - 10.11316/jpsgaiyo.56.1.4.0_757_3 <= all english, some japanese, works
    - 10.14988/pa.2017.0000013531 <= complex, not japanese/english, mixed
    - 10.15036/arerugi.62.1407_1 <= one japanese, two english; fails
    - 10.14988/pa.2017.0000007327 <= ambiguous; translator in jpn/eng
    """

    def child_text(node, tag):
        # cleaned, single-line text of the named child element (None if absent)
        found = node.find(tag) or None
        if found:
            return clean_str(found.get_text().replace("\n", " "))
        return None

    # first pass: language-agnostic parse of every person element
    parsed = []
    for element in raw_persons:
        full = child_text(element, "name")
        family = child_text(element, "familyName")
        given = child_text(element, "givenName")
        language = "ja" if is_cjk(full) else "en"
        if language == "en" and family and given:
            # english names order is flipped
            full = "{} {}".format(given, family)
        contrib = ReleaseContrib(raw_name=full,
                                 surname=family,
                                 given_name=given,
                                 role="author")
        # add an extra hint field; won't end up in serialized object
        contrib._lang = language
        parsed.append(contrib)

    if not parsed:
        return []

    # every entry is tagged either "en" or "ja"; a single distinct tag means
    # all english names, or all japanese names
    if len({c._lang for c in parsed}) == 1:
        return parsed

    # for debugging
    # if len([1 for p in persons if p._lang == 'en']) != len([1 for p in persons if p._lang == 'ja']):
    #    print("INTERESTING: {}".format(persons[0]))

    leading_lang = parsed[0]._lang
    merged = []
    for current in parsed:
        if current._lang == leading_lang:
            merged.append(current)
            continue

        previous = merged[-1]
        pairing = (current._lang, previous._lang)
        if pairing == ("en", "ja"):
            english, japanese = current, previous
        elif pairing == ("ja", "en"):
            english, japanese = previous, current
        else:
            # give up and just add as another author
            merged.append(current)
            continue

        english.extra = {
            "original_name": {
                "lang": japanese._lang,
                "raw_name": japanese.raw_name,
                "given_name": japanese.given_name,
                "surname": japanese.surname,
            },
        }
        merged[-1] = english
    return merged