Example #1
 def doaj_contribs(
         self,
         authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]:
     """
     bibjson.author {
         affiliation (string, optional),
         name (string),
         orcid_id (string, optional)
     }
     """
     contribs = []
     index = 0
     for author in authors:
         if not author.get("name"):
             continue
         creator_id = None
         orcid = clean_orcid(author.get("orcid_id"))
         if orcid:
             creator_id = self.lookup_orcid(orcid)
         contribs.append(
             fatcat_openapi_client.ReleaseContrib(
                 raw_name=author.get("name"),
                 role="author",
                 index=index,
                 creator_id=creator_id,
                 raw_affiliation=clean_str(author.get("affiliation")),
             ))
         index += 1
     return contribs
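
A minimal usage sketch of the method above (assuming an importer instance imp that provides doaj_contribs and a working lookup_orcid; the author dicts follow the bibjson.author shape from the docstring, with illustrative values):

authors = [
    {"name": "Jane Doe", "orcid_id": "0000-0002-1825-0097",
     "affiliation": "Example University"},
    {"name": "Taro Yamada"},
    {"affiliation": "no name, so this entry is skipped"},
]
contribs = imp.doaj_contribs(authors)
# contribs[0].raw_name == "Jane Doe", contribs[0].index == 0, role == "author"
# contribs[1].raw_name == "Taro Yamada", index == 1; the third dict is dropped
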
Example #2
    def dblp_contrib_single(self, elem: Any) -> fatcat_openapi_client.ReleaseContrib:
        """
        In the future, might try to implement creator key-ification and lookup here.

        Example rows:

            <author>Michael H. B&ouml;hlen</author>
            <author orcid="0000-0002-4354-9138">Nicolas Heist</author>
            <author orcid="0000-0001-9108-4278">Jens Lehmann 0001</author>
        """

        creator_id = None
        extra = None
        raw_name = clean_str(elem.text)

        # remove trailing numeric disambiguation in author name, if present
        if raw_name and raw_name.split()[-1].isdigit():
            raw_name = ' '.join(raw_name.split()[:-1])

        if elem.get('orcid'):
            orcid = clean_orcid(elem['orcid'])
            if orcid:
                creator_id = self.lookup_orcid(orcid)
                if not creator_id:
                    extra = dict(orcid=orcid)
        return fatcat_openapi_client.ReleaseContrib(
            raw_name=raw_name,
            creator_id=creator_id,
            extra=extra,
        )
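
A rough usage sketch for the row format shown in the docstring (assuming an importer instance imp with this method and bs4 installed; whether creator_id or the extra orcid hint gets set depends on what lookup_orcid returns):

from bs4 import BeautifulSoup

row = '<author orcid="0000-0001-9108-4278">Jens Lehmann 0001</author>'
elem = BeautifulSoup(row, "html.parser").author
contrib = imp.dblp_contrib_single(elem)
# contrib.raw_name == "Jens Lehmann"  (trailing disambiguation number stripped)
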
Example #3
def test_contributor_list_contains_contributor():
    Case = collections.namedtuple("Case", "contrib_list contrib want")
    cases = [
        Case([], fatcat_openapi_client.ReleaseContrib(raw_name="Paul Katz"),
             False),
    ]
    for c in cases:
        got = contributor_list_contains_contributor(c.contrib_list, c.contrib)
        assert got == c.want
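
The contributor_list_contains_contributor helper itself is not shown in this collection; a plausible sketch consistent with the test above (matching on raw_name and role, with a missing role treated as "author") could look like the following. This is an assumption, not necessarily the project's actual implementation:

def contributor_list_contains_contributor(contributor_list, contributor):
    # assumed matching rule: same raw_name and same effective role
    for cc in contributor_list:
        if cc.raw_name != contributor.raw_name:
            continue
        if (cc.role or "author") != (contributor.role or "author"):
            continue
        return True
    return False
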
Example #4
 def do_contribs(obj_list, ctype):
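     # note: excerpt of a nested helper; self is presumably captured from the enclosing importer method's scope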
     contribs = []
     for i, am in enumerate(obj_list):
         creator_id = None
         if 'ORCID' in am.keys():
             creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1])
         # Sorry humans :(
         if am.get('given') and am.get('family'):
             raw_name = "{} {}".format(am['given'], am['family'])
         elif am.get('family'):
             raw_name = am['family']
         else:
             # TODO: can end up empty
             raw_name = am.get('name') or am.get('given')
         extra = dict()
         if ctype == "author":
             index = i
         else:
             index = None
         raw_affiliation = None
         if am.get('affiliation'):
             if len(am.get('affiliation')) > 0:
                 raw_affiliation = am.get('affiliation')[0]['name']
             if len(am.get('affiliation')) > 1:
                 # note: affiliation => more_affiliations
                 extra['more_affiliations'] = [
                     clean(a['name']) for a in am.get('affiliation')[1:]
                 ]
         if am.get('sequence') and am.get('sequence') != "additional":
             extra['seq'] = clean(am.get('sequence'))
         if not extra:
             extra = None
         assert ctype in ("author", "editor", "translator")
         raw_name = clean(raw_name)
         contribs.append(
             fatcat_openapi_client.ReleaseContrib(
                 creator_id=creator_id,
                 index=index,
                 raw_name=raw_name,
                 given_name=clean(am.get('given')),
                 surname=clean(am.get('family')),
                 raw_affiliation=clean(raw_affiliation),
                 role=ctype,
                 extra=extra))
     return contribs
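
For reference, a Crossref-style author object of the shape this code reads (field names as accessed above; values illustrative):

am = {
    "given": "Jane",
    "family": "Doe",
    "ORCID": "http://orcid.org/0000-0002-1825-0097",
    "sequence": "first",
    "affiliation": [{"name": "Example University"},
                    {"name": "Second Example Institute"}],
}
# do_contribs([am], "author") would yield a single ReleaseContrib with
# raw_name="Jane Doe", index=0, raw_affiliation="Example University", and
# extra={"more_affiliations": ["Second Example Institute"], "seq": "first"}
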
Example #5
    def parse_grobid_json(self, obj):

        if not obj.get('title'):
            return None

        extra_grobid = dict()

        abstract = obj.get('abstract')
        if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(
                abstract) > 10:
            abobj = fatcat_openapi_client.ReleaseAbstract(
                mimetype="text/plain", content=clean(obj.get('abstract')))
            abstracts = [abobj]
        else:
            abstracts = None

        contribs = []
        for i, a in enumerate(obj.get('authors', [])):
            contribs.append(
                fatcat_openapi_client.ReleaseContrib(
                    index=i,
                    raw_name=clean(a['name']),
                    given_name=clean(a.get('given_name')),
                    surname=clean(a.get('surname')),
                    role="author",
                    extra=None))

        refs = []
        for raw in obj.get('citations', []):
            cite_extra = dict()
            year = None
            if raw.get('date'):
                try:
                    year = int(raw['date'].strip()[:4])
                except ValueError:
                    pass
            for key in ('volume', 'url', 'issue', 'publisher'):
                if raw.get(key):
                    cite_extra[key] = clean(raw[key])
            if raw.get('authors'):
                cite_extra['authors'] = [
                    clean(a['name']) for a in raw['authors']
                ]

            if not cite_extra:
                cite_extra = None
            refs.append(
                fatcat_openapi_client.ReleaseRef(key=clean(raw.get('id')),
                                                 year=year,
                                                 title=clean(raw['title']),
                                                 extra=cite_extra))

        release_date = None
        release_year = None
        if obj.get('date'):
            # only returns year, ever?
            release_year = int(obj['date'][:4])

        extra = dict()
        journal = obj.get('journal') or {}
        if obj.get('doi'):
            extra['doi'] = obj['doi']
        if journal.get('name'):
            extra['container_name'] = clean(journal['name'])

        # TODO: ISSN/eISSN handling? or just journal name lookup?

        if extra_grobid:
            extra['grobid'] = extra_grobid
        if self.longtail_oa:
            extra['longtail_oa'] = True
        if not extra:
            extra = None

        title = clean(obj['title'], force_xml=True)
        if not title or len(title) < 2:
            return None

        re = fatcat_openapi_client.ReleaseEntity(
            title=title,
            release_type="article-journal",
            release_date=release_date,
            release_year=release_year,
            contribs=contribs,
            refs=refs,
            publisher=clean(journal.get('publisher')),
            volume=clean(journal.get('volume')),
            issue=clean(journal.get('issue')),
            abstracts=abstracts,
            ext_ids=fatcat_openapi_client.ReleaseExtIds(),
            extra=extra)
        return re
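
The GROBID-derived JSON this parser consumes looks roughly like the following (key names as read by the code above; values illustrative):

obj = {
    "title": "An Example Article Title",
    "authors": [
        {"name": "Jane Doe", "given_name": "Jane", "surname": "Doe"},
    ],
    "journal": {"name": "Journal of Examples", "publisher": "Example Press",
                "volume": "12", "issue": "3"},
    "date": "2019-05-01",
    "doi": "10.1234/example.doi",
    "abstract": "A sufficiently long plain-text abstract goes here ...",
    "citations": [
        {"id": "b0", "title": "A Cited Work", "date": "2001",
         "authors": [{"name": "John Smith"}]},
    ],
}
release = importer.parse_grobid_json(obj)  # importer: any instance exposing parse_grobid_json
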
Example #6
    def parse_record(self, a):

        medline = a.MedlineCitation
        # PubmedData isn't required by DTD, but seems to always be present
        pubmed = a.PubmedData
        extra = dict()
        extra_pubmed = dict()

        identifiers = pubmed.ArticleIdList
        pmid = medline.PMID.string.strip()
        doi = identifiers.find("ArticleId", IdType="doi")
        if doi and doi.string:
            doi = clean_doi(doi.string)
        else:
            doi = None

        pmcid = identifiers.find("ArticleId", IdType="pmc")
        if pmcid:
            pmcid = clean_pmcid(pmcid.string.strip().upper())

        release_type = None
        pub_types = []
        for pub_type in medline.Article.PublicationTypeList.find_all("PublicationType"):
            pub_types.append(pub_type.string)
            if pub_type.string in PUBMED_RELEASE_TYPE_MAP:
                release_type = PUBMED_RELEASE_TYPE_MAP[pub_type.string]
                break
        if pub_types:
            extra_pubmed['pub_types'] = pub_types
        if medline.Article.PublicationTypeList.find(string="Retraction of Publication"):
            release_type = "retraction"
            retraction_of = medline.find("CommentsCorrections", RefType="RetractionOf")
            if retraction_of:
                if retraction_of.RefSource:
                    extra_pubmed['retraction_of_raw'] = retraction_of.RefSource.string
                if retraction_of.PMID:
                    extra_pubmed['retraction_of_pmid'] = retraction_of.PMID.string

        # everything in medline is published
        release_stage = "published"
        if medline.Article.PublicationTypeList.find(string="Corrected and Republished Article"):
            release_stage = "updated"
        if medline.Article.PublicationTypeList.find(string="Retraction of Publication"):
            release_stage = "retraction"

        withdrawn_status = None
        if medline.Article.PublicationTypeList.find(string="Retracted Publication"):
            withdrawn_status = "retracted"
        elif medline.find("CommentsCorrections", RefType="ExpressionOfConcernIn"):
            withdrawn_status = "concern"

        pages = medline.find('MedlinePgn')
        if pages:
            pages = pages.string

        title = medline.Article.ArticleTitle.string # always present
        if title:
            if title.endswith('.'):
                title = title[:-1]
            # this hides some "special" titles, but the vast majority are
            # translations; translations don't always include the original_title
            if title.startswith('[') and title.endswith(']'):
                title = title[1:-1]
        else:
            # will filter out later
            title = None

        original_title = medline.Article.find("VernacularTitle", recurse=False)
        if original_title:
            original_title = original_title.string or None
            if original_title and original_title.endswith('.'):
                original_title = original_title[:-1]

        # TODO: happening in alpha order, not handling multi-language well.
        language = medline.Article.Language
        if language:
            language = language.string
            if language in ("und", "un"):
                # "undetermined"
                language = None
            else:
                language = LANG_MAP_MARC.get(language)
                if not language and medline.Article.Language.string not in LANG_MAP_MARC:
                    warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string))

        ### Journal/Issue Metadata
        # MedlineJournalInfo is always present
        issnl = None
        container_id = None
        container_name = None
        container_extra = dict()
        mji = medline.MedlineJournalInfo
        if mji.find("Country"):
            country_name = mji.Country.string.strip()
            country_code = COUNTRY_NAME_MAP.get(country_name)
            if country_code:
                container_extra['country'] = country_code
            elif country_name:
                container_extra['country_name'] = country_name
        if mji.find("ISSNLinking"):
            issnl = mji.ISSNLinking.string

        journal = medline.Article.Journal
        issnp = journal.find("ISSN", IssnType="Print")
        if issnp:
            container_extra['issnp'] = issnp.string
        if not issnl and issnp:
            issnl = self.issn2issnl(issnp.string)

        if issnl:
            container_id = self.lookup_issnl(issnl)

        pub_date = medline.Article.find('ArticleDate')
        if not pub_date:
            pub_date = journal.PubDate
        if not pub_date:
            pub_date = journal.JournalIssue.PubDate
        release_date = None
        release_year = None
        if pub_date.Year:
            release_year = int(pub_date.Year.string)
            if pub_date.find("Day") and pub_date.find("Month"):
                try:
                    release_date = datetime.date(
                        release_year,
                        MONTH_ABBR_MAP[pub_date.Month.string],
                        int(pub_date.Day.string))
                    release_date = release_date.isoformat()
                except ValueError as ve:
                    print("bad date, skipping: {}".format(ve), file=sys.stderr)
                    release_date = None
        elif pub_date.MedlineDate:
            medline_date = pub_date.MedlineDate.string.strip()
            if len(medline_date) >= 4 and medline_date[:4].isdigit():
                release_year = int(medline_date[:4])
                if release_year < 1300 or release_year > 2040:
                    print("bad medline year, skipping: {}".format(release_year), file=sys.stderr)
                    release_year = None
            else:
                print("unparsable medline date, skipping: {}".format(medline_date), file=sys.stderr)

        if journal.find("Title"):
            container_name = journal.Title.string

        if (container_id is None and self.create_containers and (issnl is not None)
                and container_name):
            # name, type, publisher, issnl
            # extra: issnp, issne, original_name, languages, country
            ce = fatcat_openapi_client.ContainerEntity(
                name=container_name,
                container_type='journal',
                #NOTE: publisher not included
                issnl=issnl,
                extra=(container_extra or None))
            ce_edit = self.create_container(ce)
            container_id = ce_edit.ident
            self._issnl_id_map[issnl] = container_id
       
        ji = journal.JournalIssue
        volume = None
        if ji.find("Volume"):
            volume = ji.Volume.string
        issue = None
        if ji.find("Issue"):
            issue = ji.Issue.string

        ### Abstracts
        # "All abstracts are in English"
        abstracts = []
        primary_abstract = medline.find("Abstract")
        if primary_abstract and primary_abstract.AbstractText.get('NlmCategory'):
            joined = "\n".join([m.get_text() for m in primary_abstract.find_all("AbstractText")])
            abst = fatcat_openapi_client.ReleaseAbstract(
                content=joined,
                mimetype="text/plain",
                lang="en",
            )
            if abst.content:
                abstracts.append(abst)
        elif primary_abstract:
            for abstract in primary_abstract.find_all("AbstractText"):
                abst = fatcat_openapi_client.ReleaseAbstract(
                    content=abstract.get_text().strip(),
                    mimetype="text/plain",
                    lang="en",
                )
                if abst.content:
                    abstracts.append(abst)
                if abstract.find('math'):
                    abst = fatcat_openapi_client.ReleaseAbstract(
                        # strip the <AbstractText> tags
                        content=str(abstract)[14:-15],
                        mimetype="application/mathml+xml",
                        lang="en",
                    )
                    if abst.content:
                        abstracts.append(abst)
        other_abstracts = medline.find_all("OtherAbstract")
        for other in other_abstracts:
            lang = "en"
            if other.get('Language'):
                lang = LANG_MAP_MARC.get(other['Language'])
            abst = fatcat_openapi_client.ReleaseAbstract(
                content=other.AbstractText.get_text().strip(),
                mimetype="text/plain",
                lang=lang,
            )
            if abst.content:
                abstracts.append(abst)
        if not abstracts:
            abstracts = None

        ### Contribs
        contribs = []
        if medline.AuthorList:
            for author in medline.AuthorList.find_all("Author"):
                creator_id = None
                given_name = None
                surname = None
                raw_name = None
                if author.ForeName:
                    given_name = author.ForeName.string
                if author.LastName:
                    surname = author.LastName.string
                if given_name and surname:
                    raw_name = "{} {}".format(given_name, surname)
                elif surname:
                    raw_name = surname
                if not raw_name and author.CollectiveName and author.CollectiveName.string:
                    raw_name = author.CollectiveName.string
                contrib_extra = dict()
                orcid = author.find("Identifier", Source="ORCID")
                if orcid:
                    # needs re-formatting from, eg, "0000000179841889"
                    orcid = orcid.string
                    if orcid.startswith("http://orcid.org/"):
                        orcid = orcid.replace("http://orcid.org/", "")
                    elif orcid.startswith("https://orcid.org/"):
                        orcid = orcid.replace("https://orcid.org/", "")
                    elif '-' not in orcid:
                        orcid = "{}-{}-{}-{}".format(
                            orcid[0:4],
                            orcid[4:8],
                            orcid[8:12],
                            orcid[12:16],
                        )
                    creator_id = self.lookup_orcid(orcid)
                    contrib_extra['orcid'] = orcid
                affiliations = author.find_all("Affiliation")
                raw_affiliation = None
                if affiliations:
                    raw_affiliation = affiliations[0].string
                    if len(affiliations) > 1:
                        contrib_extra['more_affiliations'] = [ra.string for ra in affiliations[1:]]
                if author.find("EqualContrib"):
                    # TODO: schema for this?
                    contrib_extra['equal'] = True
                contribs.append(fatcat_openapi_client.ReleaseContrib(
                    raw_name=raw_name,
                    given_name=given_name,
                    surname=surname,
                    role="author",
                    raw_affiliation=raw_affiliation,
                    creator_id=creator_id,
                    extra=contrib_extra,
                ))

            if medline.AuthorList['CompleteYN'] == 'N':
                contribs.append(fatcat_openapi_client.ReleaseContrib(raw_name="et al."))

        for i, contrib in enumerate(contribs):
            if contrib.raw_name != "et al.":
                contrib.index = i
        if not contribs:
            contribs = None

        ### References
        refs = []
        if pubmed.ReferenceList:
            for ref in pubmed.ReferenceList.find_all('Reference'):
                ref_extra = dict()
                ref_doi = ref.find("ArticleId", IdType="doi")
                if ref_doi:
                    ref_doi = clean_doi(ref_doi.string)
                ref_pmid = ref.find("ArticleId", IdType="pubmed")
                if ref_pmid:
                    ref_pmid = clean_pmid(ref_pmid.string)
                ref_release_id = None
                if ref_doi:
                    ref_extra['doi'] = ref_doi
                    if self.lookup_refs:
                        ref_release_id = self.lookup_doi(ref_doi)
                if ref_pmid:
                    ref_extra['pmid'] = ref_pmid
                    if self.lookup_refs:
                        ref_release_id = self.lookup_pmid(ref_pmid)
                ref_raw = ref.Citation
                if ref_raw:
                    ref_extra['unstructured'] = ref_raw.string
                if not ref_extra:
                    ref_extra = None
                refs.append(fatcat_openapi_client.ReleaseRef(
                    target_release_id=ref_release_id,
                    extra=ref_extra,
                ))
        if not refs:
            refs = None

        # extra:
        #   translation_of
        #   aliases
        #   container_name
        #   group-title
        #   pubmed: retraction refs
        if extra_pubmed:
            extra['pubmed'] = extra_pubmed
        if not extra:
            extra = None

        title = clean(title)
        if not title:
            return None

        re = fatcat_openapi_client.ReleaseEntity(
            work_id=None,
            title=title,
            original_title=clean(original_title),
            release_type=release_type,
            release_stage=release_stage,
            release_date=release_date,
            release_year=release_year,
            withdrawn_status=withdrawn_status,
            ext_ids=fatcat_openapi_client.ReleaseExtIds(
                doi=doi,
                pmid=pmid,
                pmcid=pmcid,
                #isbn13     # never in Article
            ),
            volume=volume,
            issue=issue,
            pages=pages,
            #publisher  # not included?
            language=language,
            #license_slug   # not in MEDLINE
            abstracts=abstracts,
            contribs=contribs,
            refs=refs,
            container_id=container_id,
            extra=extra,
        )
        return re
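
The bare-digits ORCID re-formatting in the contrib loop above works like this (using the example value from the source comment):

raw = "0000000179841889"
orcid = "{}-{}-{}-{}".format(raw[0:4], raw[4:8], raw[8:12], raw[12:16])
# orcid == "0000-0001-7984-1889"
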
Example #7
    def parse_record(self, article):

        journal_meta = article.front.find("journal-meta")
        article_meta = article.front.find("article-meta")

        extra = dict()
        extra_jstor = dict()

        release_type = JSTOR_TYPE_MAP.get(article['article-type'])
        title = article_meta.find("article-title")
        if title and title.string:
            title = title.string.strip()
        elif title and not title.string:
            title = None

        if (not title and release_type and release_type.startswith('review')
                and article_meta.product and article_meta.product.source):
            title = "Review: {}".format(article_meta.product.source.string)

        if not title:
            return None

        if title.endswith('.'):
            title = title[:-1]

        if "[Abstract]" in title:
            # TODO: strip the "[Abstract]" bit?
            release_type = "abstract"
        elif "[Editorial" in title:
            release_type = "editorial"
        elif "[Letter" in title:
            release_type = "letter"
        elif "[Poem" in title or "[Photograph" in title:
            release_type = None

        if title.startswith("[") and title.endswith("]"):
            # strip brackets if that is all that is there (eg, translation or non-english)
            title = title[1:-1]

        # JSTOR journal-id
        journal_ids = [j.string for j in journal_meta.find_all('journal-id')]
        if journal_ids:
            extra_jstor['journal_ids'] = journal_ids

        journal_title = journal_meta.find("journal-title").string
        publisher = journal_meta.find("publisher-name").string
        issn = journal_meta.find("issn")
        if issn:
            issn = issn.string
            if len(issn) == 8:
                issn = "{}-{}".format(issn[0:4], issn[4:8])
            else:
                assert len(issn) == 9

        issnl = self.issn2issnl(issn)
        container_id = None
        if issnl:
            container_id = self.lookup_issnl(issnl)

        # create container if it doesn't exist
        if (container_id is None and self.create_containers
                and (issnl is not None) and journal_title):
            ce = fatcat_openapi_client.ContainerEntity(
                issnl=issnl,
                publisher=publisher,
                container_type=self.map_container_type(release_type),
                name=clean(journal_title, force_xml=True))
            ce_edit = self.create_container(ce)
            container_id = ce_edit.ident
            self._issnl_id_map[issnl] = container_id

        doi = article_meta.find("article-id", {"pub-id-type": "doi"})
        if doi:
            doi = doi.string.lower().strip()

        jstor_id = article_meta.find("article-id", {"pub-id-type": "jstor"})
        if jstor_id:
            jstor_id = jstor_id.string.strip()
        if not jstor_id and doi:
            assert doi.startswith('10.2307/')
            jstor_id = doi.replace('10.2307/', '')
        assert jstor_id and int(jstor_id)

        contribs = []
        cgroup = article_meta.find("contrib-group")
        if cgroup:
            for c in cgroup.find_all("contrib"):
                given = c.find("given-names")
                if given:
                    given = clean(given.string)
                surname = c.find("surname")
                if surname:
                    surname = clean(surname.string)
                raw_name = c.find("string-name")
                if raw_name:
                    raw_name = clean(raw_name.string)

                if not raw_name:
                    if given and surname:
                        raw_name = "{} {}".format(given, surname)
                    elif surname:
                        raw_name = surname

                role = JSTOR_CONTRIB_MAP.get(c.get('contrib-type', 'author'))
                if not role and c.get('contrib-type'):
                    sys.stderr.write("NOT IN JSTOR_CONTRIB_MAP: {}\n".format(
                        c['contrib-type']))
                contribs.append(
                    fatcat_openapi_client.ReleaseContrib(
                        role=role,
                        raw_name=raw_name,
                        given_name=given,
                        surname=surname,
                    ))

        for i, contrib in enumerate(contribs):
            if contrib.raw_name != "et al.":
                contrib.index = i

        release_year = None
        release_date = None
        pub_date = article_meta.find('pub-date')
        if pub_date and pub_date.year:
            release_year = int(pub_date.year.string)
            if pub_date.month and pub_date.day:
                release_date = datetime.date(release_year,
                                             int(pub_date.month.string),
                                             int(pub_date.day.string))
                if release_date.day == 1 and release_date.month == 1:
                    # suspect jan 1st dates get set by JSTOR when actual
                    # date not known (citation needed), so drop them
                    release_date = None

        volume = None
        if article_meta.volume:
            volume = article_meta.volume.string or None

        issue = None
        if article_meta.issue:
            issue = article_meta.issue.string or None

        pages = None
        if article_meta.find("page-range"):
            pages = article_meta.find("page-range").string
        elif article_meta.fpage:
            pages = article_meta.fpage.string

        language = None
        cm = article_meta.find("custom-meta")
        if cm and cm.find("meta-name") and cm.find("meta-name").string == "lang":
            language = cm.find("meta-value").string.split()[0]
            language = LANG_MAP_MARC.get(language)
            if not language:
                warnings.warn("MISSING MARC LANG: {}".format(
                    cm.find("meta-value").string))

        # JSTOR issue-id
        if article_meta.find('issue-id'):
            issue_id = clean(article_meta.find('issue-id').string)
            if issue_id:
                extra_jstor['issue_id'] = issue_id

        # everything in JSTOR is published
        release_stage = "published"

        # extra:
        #   withdrawn_date
        #   translation_of
        #   subtitle
        #   aliases
        #   container_name
        #   group-title
        #   pubmed: retraction refs
        if extra_jstor:
            extra['jstor'] = extra_jstor
        if not extra:
            extra = None

        re = fatcat_openapi_client.ReleaseEntity(
            #work_id
            title=title,
            #original_title
            release_type=release_type,
            release_stage=release_stage,
            release_date=release_date,
            release_year=release_year,
            ext_ids=fatcat_openapi_client.ReleaseExtIds(
                doi=doi,
                jstor=jstor_id,
            ),
            volume=volume,
            issue=issue,
            pages=pages,
            publisher=publisher,
            language=language,
            #license_slug

            # content, mimetype, lang
            #abstracts=abstracts,
            contribs=contribs,

            # key, year, container_name, title, locator
            # extra: volume, authors, issue, publisher, identifiers
            #refs=refs,

            #   name, type, publisher, issnl
            #   extra: issnp, issne, original_name, languages, country
            container_id=container_id,
            extra=extra,
        )
        return re
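
Two of the small normalizations above, illustrated with made-up values:

issn = "12345678"
issn = "{}-{}".format(issn[0:4], issn[4:8])   # "1234-5678"

doi = "10.2307/1234567"
jstor_id = doi.replace("10.2307/", "")        # "1234567"
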
Example #8
    def parse_datacite_creators(self,
                                creators,
                                role='author',
                                set_index=True,
                                doi=None):
        """
        Parses a list of creators into a list of ReleaseContrib objects. Set
        set_index to False if the index contrib field should be left blank.
        The doi parameter is only used for debugging.
        """
        # Contributors. Many nameIdentifierSchemes that we do not use (yet):
        # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme":
        # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID",
        # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"].
        contribs = []

        # Names that should be ignored right away.
        name_blacklist = set(('Occdownload Gbif.Org', ))

        for i, c in enumerate(creators):
            if not set_index:
                i = None
            nameType = c.get('nameType', '') or ''
            if nameType in ('', 'Personal'):
                creator_id = None
                for nid in c.get('nameIdentifiers', []):
                    name_scheme = nid.get('nameIdentifierScheme', '') or ''
                    if not name_scheme.lower() == "orcid":
                        continue
                    orcid = nid.get('nameIdentifier',
                                    '').replace('https://orcid.org/', '')
                    if not orcid:
                        continue
                    creator_id = self.lookup_orcid(orcid)
                    # TODO(martin): If creator_id is None, should we create creators?

                # If there are multiple affiliation strings, use the first one.
                affiliations = c.get('affiliation', []) or []
                raw_affiliation = None
                if len(affiliations) == 0:
                    raw_affiliation = None
                else:
                    raw_affiliation = clean(affiliations[0])

                name = c.get('name')
                given_name = c.get('givenName')
                surname = c.get('familyName')

                if name:
                    name = clean(name)
                if not any((name, given_name, surname)):
                    continue
                if not name:
                    name = "{} {}".format(given_name or '', surname
                                          or '').strip()
                if name in name_blacklist:
                    continue
                if name.lower() in UNKNOWN_MARKERS_LOWER:
                    continue
                # Unpack name if we have an index form (e.g. 'Razis, Panos A' becomes 'Panos A Razis').
                if name:
                    name = index_form_to_display_name(name)

                if given_name:
                    given_name = clean(given_name)
                if surname:
                    surname = clean(surname)
                if raw_affiliation == '':
                    continue

                extra = None

                # "DataManager", "DataCurator", "ContactPerson", "Distributor",
                # "RegistrationAgency", "Sponsor", "Researcher",
                # "RelatedPerson", "ProjectLeader", "Editor", "Other",
                # "ProjectMember", "Funder", "RightsHolder", "DataCollector",
                # "Supervisor", "Producer", "HostingInstitution", "ResearchGroup"
                contributorType = c.get('contributorType', '') or ''

                if contributorType:
                    extra = {'type': contributorType}

                contribs.append(
                    fatcat_openapi_client.ReleaseContrib(
                        creator_id=creator_id,
                        index=i,
                        raw_name=name,
                        given_name=given_name,
                        surname=surname,
                        role=role,
                        raw_affiliation=raw_affiliation,
                        extra=extra,
                    ))
            elif nameType == 'Organizational':
                name = c.get('name', '') or ''
                if name in UNKNOWN_MARKERS:
                    continue
                if len(name) < 3:
                    continue
                extra = {'organization': name}
                contribs.append(
                    fatcat_openapi_client.ReleaseContrib(index=i, extra=extra))
            else:
                print('[{}] unknown name type: {}'.format(doi, nameType),
                      file=sys.stderr)

        return contribs
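
A usage sketch (assuming an importer instance imp providing this method and lookup_orcid; the creator dicts mimic DataCite attributes.creators entries with illustrative values):

creators = [
    {
        "name": "Doe, Jane",
        "givenName": "Jane",
        "familyName": "Doe",
        "nameType": "Personal",
        "nameIdentifiers": [
            {"nameIdentifier": "https://orcid.org/0000-0002-1825-0097",
             "nameIdentifierScheme": "ORCID"},
        ],
        "affiliation": ["Example University"],
    },
    {"name": "Example Research Group", "nameType": "Organizational"},
]
contribs = imp.parse_datacite_creators(creators)
# contribs[0]: personal contrib, raw_name "Jane Doe" (index form unpacked), index 0
# contribs[1]: organizational contrib carried via extra={"organization": ...}, index 1
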
Example #9
def parse_jalc_persons(raw_persons):
    """
    For the most part, JALC DC names are in either japanese or english. The
    two common patterns are a list alternating between the two (in which case
    the names are translations), or all in one language or the other.

    Because dublin core is a projection tossing away a bunch of context, the
    other cases are hard to disambiguate. There are also some cases with Korean
    and other languages mixed in. This crude method doesn't handle everything
    right; it tries to just get the two common patterns correct. Sorry humans!

    Edge cases for this function:
    - 10.11316/jpsgaiyo.56.1.4.0_757_3 <= all english, some japanese, works
    - 10.14988/pa.2017.0000013531 <= complex, not japanese/english, mixed
    - 10.15036/arerugi.62.1407_1 <= one japanese, two english; fails
    - 10.14988/pa.2017.0000007327 <= ambiguous; translator in jpn/eng
    """

    persons = []

    # first parse out into language-agnostic contrib objects
    for raw in raw_persons:
        name = raw.find('name') or None
        if name:
            name = clean(name.string)
        surname = raw.find('familyName') or None
        if surname:
            surname = clean(surname.string)
        given_name = raw.find('givenName') or None
        if given_name:
            given_name = clean(given_name.string)
        lang = 'en'
        if is_cjk(name):
            lang = 'ja'
        if lang == 'en' and surname and given_name:
            # english names order is flipped
            name = "{} {}".format(given_name, surname)
        rc = fatcat_openapi_client.ReleaseContrib(raw_name=name,
                                                  surname=surname,
                                                  given_name=given_name,
                                                  role="author")
        # add an extra hint field; won't end up in serialized object
        rc._lang = lang
        persons.append(rc)

    if not persons:
        return []

    if all([p._lang == 'en'
            for p in persons]) or all([p._lang == 'ja' for p in persons]):
        # all english names, or all japanese names
        return persons

    # for debugging
    #if len([1 for p in persons if p._lang == 'en']) != len([1 for p in persons if p._lang == 'ja']):
    #    print("INTERESTING: {}".format(persons[0]))

    start_lang = persons[0]._lang
    contribs = []
    for p in persons:
        if p._lang == start_lang:
            contribs.append(p)
        else:
            if p._lang == 'en' and contribs[-1]._lang == 'ja':
                eng = p
                jpn = contribs[-1]
            elif p._lang == 'ja' and contribs[-1]._lang == 'en':
                eng = contribs[-1]
                jpn = p
            else:
                # give up and just add as another author
                contribs.append(p)
                continue
            eng.extra = {
                'original_name': {
                    'lang': jpn._lang,
                    'raw_name': jpn.raw_name,
                    'given_name': jpn.given_name,
                    'surname': jpn.surname,
                },
            }
            contribs[-1] = eng
    return contribs
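
A rough sketch of the alternating-language case (assuming bs4 with the lxml-backed "xml" parser plus the module's clean and is_cjk helpers are importable; the wrapping element names here are only illustrative, the function just needs elements with name/familyName/givenName children):

from bs4 import BeautifulSoup

xml = """
<creators>
  <creator><name>山田 太郎</name></creator>
  <creator><name>Yamada, Taro</name><familyName>Yamada</familyName><givenName>Taro</givenName></creator>
</creators>
"""
soup = BeautifulSoup(xml, "xml")
contribs = parse_jalc_persons(soup.find_all("creator"))
# a single ReleaseContrib comes back: the English form ("Taro Yamada"),
# with the Japanese name kept under extra["original_name"]
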
Example #10
    def parse_record(self, record: Any) -> Optional[List[ReleaseEntity]]:

        if not record:
            return None
        metadata = record.arXivRaw
        if not metadata:
            return None
        extra: Dict[str, Any] = dict()
        extra_arxiv: Dict[str, Any] = dict()

        # don't know!
        release_type = "article"

        base_id = metadata.id.string
        doi = None
        if metadata.doi and metadata.doi.string:
            doi = clean_doi(metadata.doi.string.lower().split()[0].strip())
            if doi and not (doi.startswith("10.") and "/" in doi
                            and doi.split("/")[1]):
                sys.stderr.write("BOGUS DOI: {}\n".format(doi))
                doi = None
        title = latex_to_text(metadata.title.get_text().replace("\n", " "))
        authors = parse_arxiv_authors(metadata.authors.get_text().replace(
            "\n", " "))
        contribs = [
            fatcat_openapi_client.ReleaseContrib(index=i,
                                                 raw_name=a,
                                                 role="author")
            for i, a in enumerate(authors)
        ]

        lang: Optional[str] = "en"  # the vast majority in english
        if metadata.comments and metadata.comments.get_text():
            comments = metadata.comments.get_text().replace("\n", " ").strip()
            extra_arxiv["comments"] = comments
            if "in french" in comments.lower():
                lang = "fr"
            elif "in spanish" in comments.lower():
                lang = "es"
            elif "in portuguese" in comments.lower():
                lang = "pt"
            elif "in hindi" in comments.lower():
                lang = "hi"
            elif "in japanese" in comments.lower():
                lang = "ja"
            elif "in german" in comments.lower():
                lang = "de"
            elif "simplified chinese" in comments.lower():
                lang = "zh"
            elif "in russian" in comments.lower():
                lang = "ru"
            # more languages?

        number = None
        if metadata.find("journal-ref") and metadata.find(
                "journal-ref").get_text():
            journal_ref = metadata.find("journal-ref").get_text().replace(
                "\n", " ").strip()
            extra_arxiv["journal_ref"] = journal_ref
            if "conf." in journal_ref.lower() or "proc." in journal_ref.lower(
            ):
                release_type = "paper-conference"
        if metadata.find("report-no") and metadata.find("report-no").string:
            number = metadata.find("report-no").string.strip()
            # at least some people plop extra metadata in here. hrmf!
            if "ISSN " in number or "ISBN " in number or len(
                    number.split()) > 2:
                extra_arxiv["report-no"] = number
                number = None
            else:
                release_type = "report"
        if metadata.find("acm-class") and metadata.find("acm-class").string:
            extra_arxiv["acm_class"] = metadata.find(
                "acm-class").string.strip()
        if metadata.categories and metadata.categories.get_text():
            extra_arxiv["categories"] = metadata.categories.get_text().split()
        license_slug = None
        if metadata.license and metadata.license.get_text():
            license_slug = lookup_license_slug(metadata.license.get_text())
        abstracts = None
        if metadata.abstract:
            # TODO: test for this multi-abstract code path
            abstracts = []
            abst = metadata.abstract.get_text().strip()
            orig = None
            if "-----" in abst:
                both = abst.split("-----")
                abst = both[0].strip()
                orig = both[1].strip()
            if "$" in abst or "{" in abst:
                mime = "application/x-latex"
                abst_plain = latex_to_text(abst)
                abstracts.append(
                    fatcat_openapi_client.ReleaseAbstract(
                        content=abst_plain, mimetype="text/plain", lang="en"))
            else:
                mime = "text/plain"
            abstracts.append(
                fatcat_openapi_client.ReleaseAbstract(content=abst,
                                                      mimetype=mime,
                                                      lang="en"))
            if orig:
                abstracts.append(
                    fatcat_openapi_client.ReleaseAbstract(content=orig,
                                                          mimetype=mime))
                # indicates that fulltext probably isn't english either
                if lang == "en":
                    lang = None

        # extra:
        #   withdrawn_date
        #   translation_of
        #   subtitle
        #   aliases
        #   container_name
        #   group-title
        #   arxiv: comments, categories, etc
        extra_arxiv["base_id"] = base_id
        extra["superceded"] = True
        extra["arxiv"] = extra_arxiv

        versions = []
        for version in metadata.find_all("version"):
            arxiv_id = base_id + version["version"]
            release_date = version.date.string.strip()
            release_date = datetime.datetime.strptime(
                release_date, "%a, %d %b %Y %H:%M:%S %Z").date()
            # TODO: source_type?
            versions.append(
                ReleaseEntity(
                    work_id=None,
                    title=title,
                    # original_title
                    version=version["version"],
                    release_type=release_type,
                    release_stage="submitted",
                    release_date=release_date.isoformat(),
                    release_year=release_date.year,
                    ext_ids=fatcat_openapi_client.ReleaseExtIds(
                        arxiv=arxiv_id, ),
                    number=number,
                    language=lang,
                    license_slug=license_slug,
                    abstracts=abstracts,
                    contribs=contribs,
                    extra=extra.copy(),
                ))
        # TODO: assert that versions are actually in order?
        assert versions

        versions[-1].extra.pop("superceded")

        # only apply DOI to most recent version (HACK)
        if doi:
            versions[-1].ext_ids.doi = doi
            if len(versions) > 1:
                versions[-1].release_stage = "accepted"
        return versions
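
The per-version date strings parsed above are RFC-2822-style timestamps; a quick illustration of the strptime format in isolation:

import datetime

d = datetime.datetime.strptime("Mon, 2 Apr 2007 19:18:42 GMT",
                               "%a, %d %b %Y %H:%M:%S %Z").date()
# d == datetime.date(2007, 4, 2); d.isoformat() == "2007-04-02"
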
Example #11
    def parse_record(self, record):

        if not record:
            return None
        metadata = record.arXivRaw
        if not metadata:
            return None
        extra = dict()
        extra_arxiv = dict()

        # don't know!
        release_type = "article"

        base_id = metadata.id.string
        doi = None
        if metadata.doi and metadata.doi.string:
            doi = metadata.doi.string.lower().split()[0].strip()
            if not (doi.startswith('10.') and '/' in doi
                    and doi.split('/')[1]):
                sys.stderr.write("BOGUS DOI: {}\n".format(doi))
                doi = None
        title = latex_to_text(metadata.title.get_text().replace('\n', ' '))
        authors = parse_arxiv_authors(metadata.authors.get_text().replace(
            '\n', ' '))
        contribs = [
            fatcat_openapi_client.ReleaseContrib(index=i,
                                                 raw_name=a,
                                                 role='author')
            for i, a in enumerate(authors)
        ]

        lang = "en"  # the vast majority in english
        if metadata.comments and metadata.comments.get_text():
            comments = metadata.comments.get_text().replace('\n', ' ').strip()
            extra_arxiv['comments'] = comments
            if 'in french' in comments.lower():
                lang = 'fr'
            elif 'in spanish' in comments.lower():
                lang = 'es'
            elif 'in portuguese' in comments.lower():
                lang = 'pt'
            elif 'in hindi' in comments.lower():
                lang = 'hi'
            elif 'in japanese' in comments.lower():
                lang = 'ja'
            elif 'in german' in comments.lower():
                lang = 'de'
            elif 'simplified chinese' in comments.lower():
                lang = 'zh'
            elif 'in russian' in comments.lower():
                lang = 'ru'
            # more languages?

        number = None
        if metadata.find('journal-ref') and metadata.find(
                'journal-ref').get_text():
            journal_ref = metadata.find('journal-ref').get_text().replace(
                '\n', ' ').strip()
            extra_arxiv['journal_ref'] = journal_ref
            if "conf." in journal_ref.lower() or "proc." in journal_ref.lower(
            ):
                release_type = "paper-conference"
        if metadata.find('report-no') and metadata.find('report-no').string:
            number = metadata.find('report-no').string.strip()
            # at least some people plop extra metadata in here. hrmf!
            if 'ISSN ' in number or 'ISBN ' in number or len(
                    number.split()) > 2:
                extra_arxiv['report-no'] = number
                number = None
            else:
                release_type = "report"
        if metadata.find('acm-class') and metadata.find('acm-class').string:
            extra_arxiv['acm_class'] = metadata.find(
                'acm-class').string.strip()
        if metadata.categories and metadata.categories.get_text():
            extra_arxiv['categories'] = metadata.categories.get_text().split()
        license_slug = None
        if metadata.license and metadata.license.get_text():
            license_slug = lookup_license_slug(metadata.license.get_text())
        abstracts = None
        if metadata.abstract:
            # TODO: test for this multi-abstract code path
            abstracts = []
            abst = metadata.abstract.get_text().strip()
            orig = None
            if '-----' in abst:
                both = abst.split('-----')
                abst = both[0].strip()
                orig = both[1].strip()
            if '$' in abst or '{' in abst:
                mime = "application/x-latex"
                abst_plain = latex_to_text(abst)
                abstracts.append(
                    fatcat_openapi_client.ReleaseAbstract(
                        content=abst_plain, mimetype="text/plain", lang="en"))
            else:
                mime = "text/plain"
            abstracts.append(
                fatcat_openapi_client.ReleaseAbstract(content=abst,
                                                      mimetype=mime,
                                                      lang="en"))
            if orig:
                abstracts.append(
                    fatcat_openapi_client.ReleaseAbstract(content=orig,
                                                          mimetype=mime))
                # indicates that fulltext probably isn't english either
                if lang == 'en':
                    lang = None

        # extra:
        #   withdrawn_date
        #   translation_of
        #   subtitle
        #   aliases
        #   container_name
        #   group-title
        #   arxiv: comments, categories, etc
        extra_arxiv['base_id'] = base_id
        extra['superceded'] = True
        extra['arxiv'] = extra_arxiv

        versions = []
        for version in metadata.find_all('version'):
            arxiv_id = base_id + version['version']
            release_date = version.date.string.strip()
            release_date = datetime.datetime.strptime(
                release_date, "%a, %d %b %Y %H:%M:%S %Z").date()
            # TODO: source_type?
            versions.append(
                fatcat_openapi_client.ReleaseEntity(
                    work_id=None,
                    title=title,
                    #original_title
                    version=version['version'],
                    release_type=release_type,
                    release_stage='submitted',
                    release_date=release_date.isoformat(),
                    release_year=release_date.year,
                    ext_ids=fatcat_openapi_client.ReleaseExtIds(
                        arxiv=arxiv_id, ),
                    number=number,
                    language=lang,
                    license_slug=license_slug,
                    abstracts=abstracts,
                    contribs=contribs,
                    extra=extra.copy(),
                ))
        # TODO: assert that versions are actually in order?
        assert versions

        versions[-1].extra.pop('superceded')

        # only apply DOI to most recent version (HACK)
        if doi:
            versions[-1].ext_ids.doi = doi
            if len(versions) > 1:
                versions[-1].release_stage = "accepted"
        return versions
Example #12
    def parse_datacite_creators(
        self,
        creators: List[Dict[str, Any]],
        role: str = "author",
        set_index: bool = True,
        doi: Optional[str] = None,
    ) -> List[ReleaseContrib]:
        """
        Parses a list of creators into a list of ReleaseContrib objects. Set
        set_index to False if the index contrib field should be left blank.
        The doi parameter is only used for debugging.
        """
        # Contributors. Many nameIdentifierSchemes that we do not use (yet):
        # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme":
        # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID",
        # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"].
        contribs: List[ReleaseContrib] = []

        # Names that should be ignored right away.
        name_blocklist = set(("Occdownload Gbif.Org", ))

        i: Optional[int] = 0
        for c in creators:
            if not set_index:
                i = None
            nameType = c.get("nameType", "") or ""
            if nameType in ("", "Personal"):
                creator_id = None
                for nid in c.get("nameIdentifiers", []) or []:
                    if not isinstance(nid, dict):
                        # see: fatcat-workers/issues/44035/
                        print(
                            "unexpected nameIdentifiers, expected list of dicts, got: {}"
                            .format(nid),
                            file=sys.stderr,
                        )
                        continue
                    name_scheme = nid.get("nameIdentifierScheme", "") or ""
                    if not name_scheme.lower() == "orcid":
                        continue
                    orcid = nid.get("nameIdentifier") or ""
                    orcid = orcid.replace("https://orcid.org/", "")
                    if not orcid:
                        continue
                    creator_id = self.lookup_orcid(orcid)
                    # TODO(martin): If creator_id is None, should we create creators?

                # If there are multiple affiliation strings, use the first one.
                affiliations = c.get("affiliation", []) or []
                raw_affiliation = None
                if len(affiliations) == 0:
                    raw_affiliation = None
                else:
                    raw_affiliation = clean_str(affiliations[0])

                name = c.get("name")
                given_name = c.get("givenName")
                surname = c.get("familyName")

                if name:
                    name = clean_str(name)
                if not any((name, given_name, surname)):
                    continue
                if not name:
                    name = "{} {}".format(given_name or "", surname
                                          or "").strip()
                if name in name_blocklist:
                    continue
                if name.lower() in UNKNOWN_MARKERS_LOWER:
                    continue
                # Unpack name if we have an index form (e.g. 'Razis, Panos A' becomes 'Panos A Razis').
                if name:
                    name = index_form_to_display_name(name)

                if given_name:
                    given_name = clean_str(given_name)
                surname = clean_str(surname)

                # Perform a final assertion that name does not reduce to zero
                # (e.g. whitespace only name).
                if name:
                    name = name.strip()
                if not name:
                    continue

                if raw_affiliation == "":
                    continue

                extra = None

                # "DataManager", "DataCurator", "ContactPerson", "Distributor",
                # "RegistrationAgency", "Sponsor", "Researcher",
                # "RelatedPerson", "ProjectLeader", "Editor", "Other",
                # "ProjectMember", "Funder", "RightsHolder", "DataCollector",
                # "Supervisor", "Producer", "HostingInstitution", "ResearchGroup"
                contributorType = c.get("contributorType", "") or ""

                if contributorType:
                    extra = {"type": contributorType}

                rc = fatcat_openapi_client.ReleaseContrib(
                    creator_id=creator_id,
                    index=i,
                    raw_name=name,
                    given_name=given_name,
                    surname=surname,
                    role=role,
                    raw_affiliation=raw_affiliation,
                    extra=extra,
                )
                # Filter out duplicates early.
                if not contributor_list_contains_contributor(contribs, rc):
                    contribs.append(rc)
                    if i is not None:
                        i += 1
            elif nameType == "Organizational":
                name = c.get("name", "") or ""
                if name in UNKNOWN_MARKERS:
                    continue
                if len(name) < 3:
                    continue
                extra = {"organization": name}
                contribs.append(
                    fatcat_openapi_client.ReleaseContrib(index=i, extra=extra))
                if i is not None:
                    i += 1
            else:
                print("[{}] unknown name type: {}".format(doi, nameType),
                      file=sys.stderr)

        return contribs
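
Compared with the earlier variant, this version also filters duplicates via contributor_list_contains_contributor before appending. A sketch of the effect (same imp assumption as above, and assuming the helper treats same-name, same-role entries as duplicates):

creators = [{"name": "Doe, Jane"}, {"name": "Doe, Jane"}]
contribs = imp.parse_datacite_creators(creators)
# only one ReleaseContrib survives; the duplicate is dropped and the
# running index counter is not advanced for it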