Пример #1
0
    def parse_grobid_json(self, obj):
        """
        Build a ReleaseEntity from GROBID-extracted metadata.

        obj is a python dict (parsed from json). The keys read are 'title',
        'abstract', 'authors', 'citations', 'date', 'doi' and 'journal'.

        Returns a fatcat_client.ReleaseEntity, or None if there is no usable
        title (missing/empty, or shorter than 2 characters once cleaned).
        """

        if not obj.get('title'):
            return None

        # Only keep abstracts of a plausible size; the length is checked on
        # the raw (un-cleaned) text.
        abstract = obj.get('abstract')
        if abstract and 10 < len(abstract) < MAX_ABSTRACT_BYTES:
            abstracts = [
                fatcat_client.ReleaseEntityAbstracts(mimetype="text/plain",
                                                     content=clean(abstract))
            ]
        else:
            abstracts = None

        contribs = [
            fatcat_client.ReleaseContrib(index=i,
                                         raw_name=clean(a['name']),
                                         role="author",
                                         extra=None)
            for i, a in enumerate(obj.get('authors') or [])
        ]

        refs = []
        for raw in obj.get('citations') or []:
            year = None
            if raw.get('date'):
                try:
                    year = int(raw['date'].strip()[:4])
                except (AttributeError, TypeError, ValueError):
                    # best-effort: leave the year unset when none can be
                    # parsed from the date
                    pass

            cite_extra = dict()
            for key in ('volume', 'url', 'issue', 'publisher'):
                if raw.get(key):
                    cite_extra[key] = clean(raw[key])
            if raw.get('authors'):
                cite_extra['authors'] = [
                    clean(a['name']) for a in raw['authors']
                ]

            refs.append(
                fatcat_client.ReleaseRef(
                    key=clean(raw.get('id')),
                    year=year,
                    # citations are not guaranteed to carry a title
                    title=clean(raw.get('title')),
                    extra=cite_extra or None))

        release_date = None
        release_year = None
        if obj.get('date'):
            # only returns year, ever?
            try:
                release_year = int(obj['date'][:4])
            except (TypeError, ValueError):
                # same best-effort policy as for citation years
                release_year = None

        # 'journal' may be absent or null; treat both as "no journal info"
        journal = obj.get('journal') or {}

        extra = dict()
        if obj.get('doi'):
            extra['doi'] = obj['doi']
        if journal.get('name'):
            extra['container_name'] = clean(journal['name'])

        # TODO: ISSN/eISSN handling? or just journal name lookup?

        if self.longtail_oa:
            extra['longtail_oa'] = True

        title = clean(obj['title'], force_xml=True)
        if not title or len(title) < 2:
            return None

        release = fatcat_client.ReleaseEntity(
            title=title,
            release_type="article-journal",
            release_date=release_date,
            release_year=release_year,
            contribs=contribs,
            refs=refs,
            publisher=clean(journal.get('publisher')),
            volume=clean(journal.get('volume')),
            issue=clean(journal.get('issue')),
            abstracts=abstracts,
            extra=extra or None)
        return release
Пример #2
0
    def parse_grobid_json(self, obj):
        """
        Build a ReleaseEntity from GROBID-extracted metadata.

        obj is a python dict (parsed from json). The keys read are 'title',
        'abstract', 'authors', 'citations', 'doi' and 'journal'.

        Returns a fatcat_client.ReleaseEntity, or None if obj has no title.
        Abstracts and references are passed as plain dicts; contributors as
        fatcat_client.ReleaseContrib objects.
        """

        if not obj.get('title'):
            return None

        abstract = obj.get('abstract')
        if abstract and len(abstract) < MAX_ABSTRACT_BYTES:
            abstracts = [
                dict(mimetype="text/plain",
                     language=None,
                     content=abstract.strip())
            ]
        else:
            abstracts = None

        contribs = [
            fatcat_client.ReleaseContrib(index=i,
                                         raw_name=a['name'],
                                         role="author",
                                         extra=None)
            for i, a in enumerate(obj.get('authors') or [])
        ]

        refs = []
        for raw in obj.get('citations') or []:
            ref = dict()
            ref['key'] = raw.get('id')
            if raw.get('title'):
                ref['title'] = raw['title'].strip()
            if raw.get('date'):
                try:
                    ref['year'] = int(raw['date'].strip()[:4])
                except (AttributeError, TypeError, ValueError):
                    # best-effort: omit the year when none can be parsed
                    pass

            cite_extra = dict()
            for key in ('volume', 'url', 'issue', 'publisher'):
                if raw.get(key):
                    cite_extra[key] = raw[key].strip()
            if raw.get('authors'):
                cite_extra['authors'] = [a['name'] for a in raw['authors']]
            # GROBID-derived fields are namespaced under 'grobid'
            ref['extra'] = dict(grobid=cite_extra) if cite_extra else None
            refs.append(ref)

        # TODO: obj['date'] (only a year, ever?) is not forwarded to the
        # ReleaseEntity yet; decide how it should be represented.

        # 'journal' may be absent or null; treat both as "no journal info"
        journal = obj.get('journal') or {}

        grobid_extra = dict()
        if obj.get('doi'):
            grobid_extra['doi'] = obj['doi']
        if journal.get('name'):
            grobid_extra['container_name'] = journal['name']
        grobid_extra['is_longtail_oa'] = True

        # TODO: ISSN/eISSN handling? or just journal name lookup?

        # never empty: 'is_longtail_oa' is always set above
        extra = dict(grobid=grobid_extra)

        release = fatcat_client.ReleaseEntity(
            title=obj['title'].strip(),
            contribs=contribs,
            refs=refs,
            publisher=journal.get('publisher'),
            volume=journal.get('volume'),
            issue=journal.get('issue'),
            abstracts=abstracts,
            extra=extra)
        return release
Пример #3
0
    def parse_crossref_dict(self, obj):
        """
        obj is a python dict (parsed from json).

        Returns a (ReleaseEntity, ContainerEntity-or-None) tuple. The second
        element is only set when no existing container was found for the
        ISSN-L, self.create_containers is true and a container title is
        available.

        Returns None when the work is out of scope (no 'author', 'title' or
        'type' key) or when it has more than 750 contribs, refs or abstracts.

        obj itself is not modified.
        """

        # This work is out of scope if it doesn't have authors and a title
        if 'author' not in obj or 'title' not in obj:
            return None

        # Other ways to be out of scope (provisionally)
        if 'type' not in obj:
            return None

        # contribs
        def do_contribs(obj_list, ctype):
            contribs = []
            for i, am in enumerate(obj_list):
                creator_id = None
                if 'ORCID' in am:
                    # ORCID is given as a URL; keep only the trailing id
                    creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1])
                # Sorry humans :(
                if am.get('given') and am.get('family'):
                    raw_name = "{} {}".format(am['given'], am['family'])
                elif am.get('family'):
                    raw_name = am['family']
                else:
                    # TODO: defaults back to a pseudo-null value
                    raw_name = am.get('given', '<blank>')
                # only authors get a positional index
                index = i if ctype == "author" else None
                extra = dict()
                if am.get('affiliation'):
                    # note: affiliation => affiliations
                    extra['affiliations'] = am.get('affiliation')
                if am.get('sequence') and am.get('sequence') != "additional":
                    extra['sequence'] = am.get('sequence')
                contribs.append(
                    fatcat_client.ReleaseContrib(creator_id=creator_id,
                                                 index=index,
                                                 raw_name=raw_name,
                                                 role=ctype,
                                                 extra=extra or None))
            return contribs

        contribs = do_contribs(obj['author'], "author")
        contribs.extend(do_contribs(obj.get('editor', []), "editor"))
        contribs.extend(do_contribs(obj.get('translator', []), "translator"))

        # container
        # 'ISSN' may be missing or an empty list; both mean "no ISSN"
        issn = (obj.get('ISSN') or [None])[0]
        issnl = self.issn2issnl(issn)
        container_id = None
        if issnl:
            container_id = self.lookup_issnl(issnl)
        publisher = obj.get('publisher')

        ce = None
        if (container_id is None and self.create_containers
                and issnl is not None and obj.get('container-title')):
            ce = fatcat_client.ContainerEntity(issnl=issnl,
                                               publisher=publisher,
                                               name=obj['container-title'][0])

        # references
        refs = []
        for i, rm in enumerate(obj.get('reference', [])):
            try:
                year = int(rm.get('year'))
                # NOTE: will need to update/config in the future!
                # NOTE: are there crossref works with year < 100?
                if year > 2025 or year < 100:
                    year = None
            except (TypeError, ValueError):
                # missing or non-numeric year
                year = None
            extra = rm.copy()
            if rm.get('DOI'):
                extra['doi'] = rm.get('DOI').lower()
            key = rm.get('key')
            if key and key.startswith(obj['DOI'].upper()):
                # drop the citing work's DOI from the reference key
                key = key.replace(obj['DOI'].upper() + "-", '')
                key = key.replace(obj['DOI'].upper(), '')
            container_name = rm.get('volume-title')
            if not container_name:
                container_name = rm.get('journal-title')
            # Remove fields that are promoted to first-class attributes (or
            # are uninteresting) from the leftover 'extra' blob.
            # NOTE(review): 'volume-name' is popped although the value is read
            # from 'volume-title' above -- possibly a typo; behaviour kept,
            # confirm intent.
            for dropped in ('DOI', 'key', 'year', 'volume-name',
                            'journal-title', 'title', 'first-page',
                            'doi-asserted-by'):
                extra.pop(dropped, None)
            if extra:
                extra = dict(crossref=extra)
            else:
                extra = None
            refs.append(
                fatcat_client.ReleaseRef(
                    index=i,
                    # doing lookups would be a second import pass
                    target_release_id=None,
                    key=key,
                    year=year,
                    container_name=container_name,
                    title=rm.get('title'),
                    locator=rm.get('first-page'),
                    # TODO: just dump JSON somewhere here?
                    extra=extra))

        # abstracts
        abstracts = []
        if obj.get('abstract') is not None:
            abstracts.append(
                fatcat_client.ReleaseEntityAbstracts(
                    mimetype="application/xml+jats",
                    content=obj.get('abstract')))

        # extra fields
        extra = dict()
        for key in ('subject', 'type', 'license', 'alternative-id',
                    'container-title', 'original-title', 'subtitle', 'archive',
                    'funder', 'group-title'):
            # TODO: unpack "container-title" array
            val = obj.get(key)
            if val:
                extra[key] = val
        if extra.get('license'):
            # Flatten each license's 'start' to its 'date-time' value. Work on
            # copies: extra['license'] is the very list held by obj, and
            # rewriting it in place would alter the caller's record (and make
            # a second pass over the same obj fail).
            extra['license'] = [
                dict(lic, start=lic['start']['date-time'])
                if 'start' in lic else lic
                for lic in extra['license']
            ]
        if len(obj['title']) > 1:
            extra['other-titles'] = obj['title'][1:]
        # TODO: this should be top-level
        extra['is_kept'] = len(obj.get('archive', [])) > 0

        # ISBN
        isbn13 = None
        for raw in obj.get('ISBN', []):
            # TODO: convert if not ISBN-13 format
            if len(raw) == 17:
                isbn13 = raw
                break

        # release status
        if obj['type'] in ('journal-article', 'conference-proceeding', 'book',
                           'dissertation', 'book-chapter'):
            release_status = "published"
        else:
            # unknown
            release_status = None

        # external identifiers
        extids = self.lookup_ext_ids(doi=obj['DOI'].lower())

        # TODO: filter out huge releases; we'll get them later (and fix bug in
        # fatcatd)
        if max(len(contribs), len(refs), len(abstracts)) > 750:
            return None

        # release date parsing is amazingly complex
        date_parts = obj['issued']['date-parts'][0]
        if not date_parts or not date_parts[0]:
            # got some NoneType, even though at least year is supposed to be set
            release_date = None
        elif len(date_parts) == 3:
            release_date = datetime.datetime(year=date_parts[0],
                                             month=date_parts[1],
                                             day=date_parts[2])
        else:
            # only the year is actually required; mangle to first day for date
            # (TODO: something better?)
            release_date = datetime.datetime(year=date_parts[0],
                                             month=1,
                                             day=1)
        # convert to string ISO datetime format (if not null)
        if release_date:
            release_date = release_date.isoformat() + "Z"

        release = fatcat_client.ReleaseEntity(
            work_id=None,
            title=obj['title'][0],
            contribs=contribs,
            refs=refs,
            container_id=container_id,
            publisher=publisher,
            release_type=obj['type'],
            release_status=release_status,
            doi=obj['DOI'].lower(),
            isbn13=isbn13,
            core_id=extids['core_id'],
            pmid=extids['pmid'],
            pmcid=extids['pmcid'],
            wikidata_qid=extids['wikidata_qid'],
            release_date=release_date,
            issue=obj.get('issue'),
            volume=obj.get('volume'),
            pages=obj.get('page'),
            abstracts=abstracts,
            extra=dict(crossref=extra))
        return (release, ce)
Пример #4
0
    def parse_record(self, obj):
        """
        obj is a python dict (parsed from json).

        Returns a ReleaseEntity, or None when the record is skipped: missing
        or excluded 'type', missing/empty 'title', a cleaned title of at most
        one character, more than 100 abstracts, or more than 2000 references.

        When no container exists for the ISSN-L and self.create_containers is
        true, a container is created via self.create_container() as a side
        effect. obj itself is not modified.
        """

        # Ways to be out of scope (provisionally)
        # journal-issue and journal-volume map to None, but allowed for now
        if obj.get('type') in (None, 'journal', 'proceedings',
                'standard-series', 'report-series', 'book-series', 'book-set',
                'book-track', 'proceedings-series'):
            return None

        # Do require the 'title' keys to exist, as release entities do
        if not obj.get('title'):
            return None

        release_type = self.map_release_type(obj['type'])

        # contribs
        def do_contribs(obj_list, ctype):
            contribs = []
            for i, am in enumerate(obj_list):
                creator_id = None
                if 'ORCID' in am:
                    # ORCID is given as a URL; keep only the trailing id
                    creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1])
                # Sorry humans :(
                if am.get('given') and am.get('family'):
                    raw_name = "{} {}".format(am['given'], am['family'])
                elif am.get('family'):
                    raw_name = am['family']
                else:
                    # TODO: can end up empty
                    raw_name = am.get('given')
                # only authors get a positional index
                index = i if ctype == "author" else None
                extra = dict()
                raw_affiliation = None
                affiliations = am.get('affiliation')
                if affiliations:
                    raw_affiliation = affiliations[0]['name']
                    if len(affiliations) > 1:
                        # note: affiliation => more_affiliations
                        extra['more_affiliations'] = [
                            clean(a['name']) for a in affiliations[1:]]
                if am.get('sequence') and am.get('sequence') != "additional":
                    extra['seq'] = clean(am.get('sequence'))
                # internal invariant: only called with these three roles
                assert ctype in ("author", "editor", "translator")
                contribs.append(fatcat_client.ReleaseContrib(
                    creator_id=creator_id,
                    index=index,
                    raw_name=clean(raw_name),
                    raw_affiliation=clean(raw_affiliation),
                    role=ctype,
                    extra=extra or None))
            return contribs
        contribs = do_contribs(obj.get('author', []), "author")
        contribs.extend(do_contribs(obj.get('editor', []), "editor"))
        contribs.extend(do_contribs(obj.get('translator', []), "translator"))

        # container
        # 'ISSN' may be missing or an empty list; both mean "no ISSN"
        issn = (obj.get('ISSN') or [None])[0]
        issnl = self.issn2issnl(issn)
        container_id = None
        if issnl:
            container_id = self.lookup_issnl(issnl)
        publisher = clean(obj.get('publisher'))

        if (container_id is None and self.create_containers
                and issnl is not None and obj.get('container-title')):
            ce = fatcat_client.ContainerEntity(
                issnl=issnl,
                publisher=publisher,
                container_type=self.map_container_type(release_type),
                name=clean(obj['container-title'][0], force_xml=True))
            ce_edit = self.create_container(ce)
            container_id = ce_edit.ident

        # license slug
        license_slug = None
        license_extra = []
        for lic in obj.get('license', []):
            if lic['content-version'] not in ('vor', 'unspecified'):
                continue
            slug = lookup_license_slug(lic['URL'])
            if slug:
                license_slug = slug
            if 'start' in lic:
                # Flatten 'start' to its 'date-time' value on a copy, so the
                # caller's record is left untouched (an in-place rewrite
                # would also make a second pass over the same obj fail).
                lic = dict(lic, start=lic['start']['date-time'])
            license_extra.append(lic)

        # references
        refs = []
        for i, rm in enumerate(obj.get('reference', [])):
            try:
                year = int(rm.get('year'))
                # TODO: will need to update/config in the future!
                # NOTE: are there crossref works with year < 100?
                if year > 2025 or year < 100:
                    year = None
            except (TypeError, ValueError):
                # missing or non-numeric year
                year = None
            ref_extra = dict()
            key = rm.get('key')
            if key and key.startswith(obj['DOI'].upper()):
                # drop the citing work's DOI from the reference key
                key = key.replace(obj['DOI'].upper() + "-", '')
                key = key.replace(obj['DOI'].upper(), '')
            container_name = rm.get('volume-title')
            if not container_name:
                container_name = rm.get('journal-title')
            elif rm.get('journal-title'):
                # both present: volume-title wins, keep journal-title aside
                ref_extra['journal-title'] = rm['journal-title']
            if rm.get('DOI'):
                ref_extra['doi'] = rm.get('DOI').lower()
            author = clean(rm.get('author'))
            if author:
                ref_extra['authors'] = [author]
            for k in ('editor', 'edition', 'authority', 'version', 'genre',
                    'url', 'event', 'issue', 'volume', 'date', 'accessed_date',
                    'issued', 'page', 'medium', 'collection_title', 'chapter_number',
                    'unstructured', 'series-title', 'volume-title'):
                cleaned = clean(rm.get(k))
                if cleaned:
                    ref_extra[k] = cleaned
            refs.append(fatcat_client.ReleaseRef(
                index=i,
                # doing lookups would be a second import pass
                target_release_id=None,
                key=key,
                year=year,
                container_name=clean(container_name),
                title=clean(rm.get('article-title')),
                locator=clean(rm.get('first-page')),
                # TODO: just dump JSON somewhere here?
                extra=ref_extra or None))

        # abstracts
        abstracts = []
        abstract = clean(obj.get('abstract'))
        if abstract and len(abstract) > 10:
            abstracts.append(fatcat_client.ReleaseEntityAbstracts(
                mimetype="application/xml+jats",
                content=abstract))

        # extra fields
        extra = dict()
        extra_crossref = dict()
        # top-level extra keys
        if not container_id and obj.get('container-title'):
            extra['container_name'] = clean(obj['container-title'][0])
        for key in ('group-title', 'subtitle'):
            val = obj.get(key)
            if val:
                if isinstance(val, list):
                    val = val[0]
                extra[key] = clean(val) if isinstance(val, str) else val
        # crossref-nested extra keys
        for key in ('subject', 'type', 'alternative-id', 'archive', 'funder'):
            val = obj.get(key)
            if val:
                extra_crossref[key] = clean(val) if isinstance(val, str) else val
        if license_extra:
            extra_crossref['license'] = license_extra

        if len(obj['title']) > 1:
            # additional titles are kept as aliases (empty ones dropped)
            aliases = [clean(t) for t in obj['title'][1:]]
            aliases = [t for t in aliases if t]
            if aliases:
                extra['aliases'] = aliases

        # ISBN
        isbn13 = None
        for raw in obj.get('ISBN', []):
            # TODO: convert if not ISBN-13 format
            if len(raw) == 17:
                isbn13 = raw
                break

        # release status
        if obj['type'] in ('journal-article', 'conference-proceeding', 'book',
                'dissertation', 'book-chapter'):
            release_status = "published"
        else:
            # unknown
            release_status = None

        # external identifiers
        extids = self.lookup_ext_ids(doi=obj['DOI'].lower())

        # filter out unreasonably huge releases
        if len(abstracts) > 100:
            return None
        if len(refs) > 2000:
            return None

        # release date parsing is amazingly complex
        raw_date = obj['issued']['date-parts'][0]
        if not raw_date or not raw_date[0]:
            # got some NoneType, even though at least year is supposed to be set
            release_year = None
            release_date = None
        elif len(raw_date) == 3:
            release_year = raw_date[0]
            release_date = datetime.date(year=raw_date[0], month=raw_date[1], day=raw_date[2])
        else:
            # sometimes only the year is included, not the full date
            release_year = raw_date[0]
            release_date = None

        original_title = None
        if obj.get('original-title'):
            original_title = clean(obj.get('original-title')[0], force_xml=True)

        # 'title' is known to be non-empty (checked at the top)
        title = clean(obj['title'][0], force_xml=True)
        if not title or len(title) <= 1:
            # title can't be just a single character
            return None

        if extra_crossref:
            extra['crossref'] = extra_crossref
        if not extra:
            extra = None

        release = fatcat_client.ReleaseEntity(
            work_id=None,
            container_id=container_id,
            title=title,
            original_title=original_title,
            release_type=release_type,
            release_status=release_status,
            release_date=release_date,
            release_year=release_year,
            publisher=publisher,
            doi=obj['DOI'].lower(),
            pmid=extids['pmid'],
            pmcid=extids['pmcid'],
            wikidata_qid=extids['wikidata_qid'],
            isbn13=isbn13,
            core_id=extids['core_id'],
            arxiv_id=extids['arxiv_id'],
            jstor_id=extids['jstor_id'],
            volume=clean(obj.get('volume')),
            issue=clean(obj.get('issue')),
            pages=clean(obj.get('page')),
            language=clean(obj.get('language')),
            license_slug=license_slug,
            extra=extra,
            abstracts=abstracts,
            contribs=contribs,
            refs=refs,
        )
        return release