def parse_grobid_json(self, obj):
    """Convert a GROBID-extracted metadata dict into a ReleaseEntity.

    Args:
        obj: dict parsed from GROBID JSON output. Expected keys (all
            optional unless noted): 'title' (required), 'abstract',
            'authors', 'citations', 'date', 'doi', 'journal'.
            NOTE(review): schema inferred from the accesses below —
            confirm against the GROBID metadata producer.

    Returns:
        fatcat_client.ReleaseEntity, or None when there is no title (or
        the cleaned title is shorter than 2 characters).
    """
    if not obj.get('title'):
        return None

    extra_grobid = dict()

    # Only keep abstracts in a sane size range (longer than 10 chars,
    # shorter than the configured byte cap).
    abstract = obj.get('abstract')
    if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10:
        abobj = fatcat_client.ReleaseEntityAbstracts(
            mimetype="text/plain",
            content=clean(obj.get('abstract')))
        abstracts = [abobj]
    else:
        abstracts = None

    contribs = []
    for i, a in enumerate(obj.get('authors', [])):
        contribs.append(
            fatcat_client.ReleaseContrib(index=i,
                                         raw_name=clean(a['name']),
                                         role="author",
                                         extra=None))

    refs = []
    for raw in obj.get('citations', []):
        cite_extra = dict()
        year = None
        if raw.get('date'):
            # GROBID dates are free-form; take the leading 4 digits as a
            # year when possible.
            # FIX: narrowed bare `except:` — only int() parse failures
            # should be swallowed here.
            try:
                year = int(raw['date'].strip()[:4])
            except (TypeError, ValueError):
                pass
        for key in ('volume', 'url', 'issue', 'publisher'):
            if raw.get(key):
                cite_extra[key] = clean(raw[key])
        if raw.get('authors'):
            cite_extra['authors'] = [
                clean(a['name']) for a in raw['authors']
            ]
        if not cite_extra:
            cite_extra = None
        # NOTE(review): raw['title'] raises KeyError for title-less
        # citations; assumed GROBID always emits one — confirm upstream.
        refs.append(
            fatcat_client.ReleaseRef(key=clean(raw.get('id')),
                                     year=year,
                                     title=clean(raw['title']),
                                     extra=cite_extra))

    release_date = None
    release_year = None
    if obj.get('date'):
        # only returns year, ever?
        # FIX: guard the int() parse — a malformed date string should
        # not abort the whole record.
        try:
            release_year = int(obj['date'][:4])
        except (TypeError, ValueError):
            pass

    # FIX: 'journal' may be absent or None; every access below uses
    # .get(), so normalize to an empty dict instead of crashing.
    journal = obj.get('journal') or {}

    extra = dict()
    if obj.get('doi'):
        extra['doi'] = obj['doi']
    if journal.get('name'):
        extra['container_name'] = clean(journal['name'])
    # TODO: ISSN/eISSN handling? or just journal name lookup?
    if extra_grobid:
        extra['grobid'] = extra_grobid
    if self.longtail_oa:
        extra['longtail_oa'] = True
    if not extra:
        extra = None

    title = clean(obj['title'], force_xml=True)
    if not title or len(title) < 2:
        return None

    re = fatcat_client.ReleaseEntity(
        title=title,
        release_type="article-journal",
        release_date=release_date,
        release_year=release_year,
        contribs=contribs,
        refs=refs,
        publisher=clean(journal.get('publisher')),
        volume=clean(journal.get('volume')),
        issue=clean(journal.get('issue')),
        abstracts=abstracts,
        extra=extra)
    return re
def parse_grobid_json(self, obj):
    """Convert a GROBID-extracted metadata dict into a ReleaseEntity.

    Older/longtail-OA variant: abstracts and refs are built as plain
    dicts rather than fatcat_client model objects.

    Args:
        obj: dict parsed from GROBID JSON output ('title' required;
            'abstract', 'authors', 'citations', 'date', 'doi',
            'journal' optional).

    Returns:
        fatcat_client.ReleaseEntity, or None when there is no title.
    """
    if not obj.get('title'):
        return None

    extra = dict()

    if obj.get('abstract') and len(
            obj.get('abstract')) < MAX_ABSTRACT_BYTES:
        abobj = dict(mimetype="text/plain",
                     language=None,
                     content=obj.get('abstract').strip())
        abstracts = [abobj]
    else:
        abstracts = None

    contribs = []
    for i, a in enumerate(obj.get('authors', [])):
        # FIX: removed unused local `c` (a dict built and discarded).
        contribs.append(
            fatcat_client.ReleaseContrib(index=i,
                                         raw_name=a['name'],
                                         role="author",
                                         extra=None))

    refs = []
    for raw in obj.get('citations', []):
        cite_extra = dict()
        ref = dict()
        ref['key'] = raw.get('id')
        if raw.get('title'):
            ref['title'] = raw['title'].strip()
        if raw.get('date'):
            # FIX: narrowed bare `except:` to the int() parse failures.
            try:
                year = int(raw['date'].strip()[:4])
                ref['year'] = year
            except (TypeError, ValueError):
                pass
        for key in ('volume', 'url', 'issue', 'publisher'):
            if raw.get(key):
                cite_extra[key] = raw[key].strip()
        if raw.get('authors'):
            cite_extra['authors'] = [a['name'] for a in raw['authors']]
        if cite_extra:
            cite_extra = dict(grobid=cite_extra)
        else:
            cite_extra = None
        ref['extra'] = cite_extra
        refs.append(ref)

    release_type = "journal-article"
    release_date = None
    if obj.get('date'):
        # TODO: only returns year, ever? how to handle?
        release_date = datetime.datetime(year=int(obj['date'][:4]),
                                         month=1,
                                         day=1)

    if obj.get('doi'):
        extra['doi'] = obj['doi']

    # FIX: 'journal' may be absent or None; normalize to an empty dict
    # so the .get() accesses below don't crash.
    journal = obj.get('journal') or {}
    if journal.get('name'):
        extra['container_name'] = journal['name']
    # NOTE(review): reconstructed as unconditional (this importer is
    # longtail-OA specific) — confirm placement against original.
    extra['is_longtail_oa'] = True

    # TODO: ISSN/eISSN handling? or just journal name lookup?

    if extra:
        extra = dict(grobid=extra)
    else:
        extra = None

    re = fatcat_client.ReleaseEntity(
        title=obj['title'].strip(),
        # FIX: release_type and release_date were computed above but
        # silently dropped; pass them through.
        # NOTE(review): vocabulary here is "journal-article" while the
        # sibling importer uses "article-journal" — confirm which the
        # fatcat schema expects.
        release_type=release_type,
        release_date=release_date,
        contribs=contribs,
        refs=refs,
        publisher=journal.get('publisher'),
        volume=journal.get('volume'),
        issue=journal.get('issue'),
        abstracts=abstracts,
        extra=extra)
    return re
def parse_crossref_dict(self, obj):
    """Convert a Crossref work record into release (+ container) entities.

    Args:
        obj: python dict parsed from Crossref API JSON. Must contain
            'author', 'title', 'type', and (implicitly, below) 'DOI'
            and 'issued'.

    Returns:
        A tuple (ReleaseEntity, ContainerEntity-or-None), or None when
        the record is out of scope (no authors/title/type, or
        unreasonably large).
    """
    # This work is out of scope if it doesn't have authors and a title
    if (not 'author' in obj) or (not 'title' in obj):
        return None
    # Other ways to be out of scope (provisionally)
    if (not 'type' in obj):
        return None

    # contribs
    def do_contribs(obj_list, ctype):
        # Build ReleaseContrib objects for one contributor role
        # (author/editor/translator); only authors get an index.
        contribs = []
        for i, am in enumerate(obj_list):
            creator_id = None
            if 'ORCID' in am.keys():
                creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1])
            # Sorry humans :(
            if am.get('given') and am.get('family'):
                raw_name = "{} {}".format(am['given'], am['family'])
            elif am.get('family'):
                raw_name = am['family']
            else:
                # TODO: defaults back to a pseudo-null value
                raw_name = am.get('given', '<blank>')
            extra = dict()
            if ctype == "author":
                index = i
            else:
                index = None
            if am.get('affiliation'):
                # note: affiliation => affiliations
                extra['affiliations'] = am.get('affiliation')
            if am.get('sequence') and am.get('sequence') != "additional":
                extra['sequence'] = am.get('sequence')
            if not extra:
                extra = None
            contribs.append(
                fatcat_client.ReleaseContrib(creator_id=creator_id,
                                             index=index,
                                             raw_name=raw_name,
                                             role=ctype,
                                             extra=extra))
        return contribs

    contribs = do_contribs(obj['author'], "author")
    contribs.extend(do_contribs(obj.get('editor', []), "editor"))
    contribs.extend(do_contribs(obj.get('translator', []), "translator"))

    # container
    issn = obj.get('ISSN', [None])[0]
    issnl = self.issn2issnl(issn)
    container_id = None
    if issnl:
        container_id = self.lookup_issnl(issnl)
    publisher = obj.get('publisher')
    ce = None
    # Only propose a new container when configured to, the ISSN-L is
    # known, and a container title exists.
    # FIX: `issnl != None` -> `is not None` (identity check for None).
    if (container_id is None and self.create_containers
            and issnl is not None and obj.get('container-title')
            and len(obj['container-title']) > 0):
        ce = fatcat_client.ContainerEntity(issnl=issnl,
                                           publisher=publisher,
                                           name=obj['container-title'][0])

    # references
    refs = []
    for i, rm in enumerate(obj.get('reference', [])):
        # FIX: narrowed bare `except:`; int(None) raises TypeError and a
        # non-numeric string raises ValueError — both mean "no year".
        try:
            year = int(rm.get('year'))
            # NOTE: will need to update/config in the future!
            # NOTE: are there crossref works with year < 100?
            if year > 2025 or year < 100:
                year = None
        except (TypeError, ValueError):
            year = None
        # Keep the whole raw reference as extra, minus fields promoted
        # to first-class ReleaseRef attributes (popped below).
        extra = rm.copy()
        if rm.get('DOI'):
            extra['doi'] = rm.get('DOI').lower()
        # Strip a redundant DOI prefix from the reference key.
        key = rm.get('key')
        if key and key.startswith(obj['DOI'].upper()):
            key = key.replace(obj['DOI'].upper() + "-", '')
            key = key.replace(obj['DOI'].upper(), '')
        container_name = rm.get('volume-title')
        if not container_name:
            container_name = rm.get('journal-title')
        extra.pop('DOI', None)
        extra.pop('key', None)
        extra.pop('year', None)
        extra.pop('volume-name', None)
        extra.pop('journal-title', None)
        extra.pop('title', None)
        extra.pop('first-page', None)
        extra.pop('doi-asserted-by', None)
        if extra:
            extra = dict(crossref=extra)
        else:
            extra = None
        refs.append(
            fatcat_client.ReleaseRef(
                index=i,
                # doing lookups would be a second import pass
                target_release_id=None,
                key=key,
                year=year,
                container_name=container_name,
                title=rm.get('title'),
                locator=rm.get('first-page'),
                # TODO: just dump JSON somewhere here?
                extra=extra))

    # abstracts
    abstracts = []
    # FIX: `!= None` -> `is not None`.
    if obj.get('abstract') is not None:
        abstracts.append(
            fatcat_client.ReleaseEntityAbstracts(
                mimetype="application/xml+jats",
                content=obj.get('abstract')))

    # extra fields
    extra = dict()
    for key in ('subject', 'type', 'license', 'alternative-id',
                'container-title', 'original-title', 'subtitle', 'archive',
                'funder', 'group-title'):
        # TODO: unpack "container-title" array
        val = obj.get(key)
        if val:
            extra[key] = val
    # Flatten license 'start' objects down to their date-time string.
    if 'license' in extra and extra['license']:
        for i in range(len(extra['license'])):
            if 'start' in extra['license'][i]:
                extra['license'][i]['start'] = extra['license'][i][
                    'start']['date-time']
    if len(obj['title']) > 1:
        extra['other-titles'] = obj['title'][1:]
    # TODO: this should be top-level
    extra['is_kept'] = len(obj.get('archive', [])) > 0

    # ISBN
    isbn13 = None
    for raw in obj.get('ISBN', []):
        # TODO: convert if not ISBN-13 format
        if len(raw) == 17:
            isbn13 = raw
            break

    # release status
    if obj['type'] in ('journal-article', 'conference-proceeding', 'book',
                       'dissertation', 'book-chapter'):
        release_status = "published"
    else:
        # unknown
        release_status = None

    # external identifiers
    extids = self.lookup_ext_ids(doi=obj['DOI'].lower())

    # TODO: filter out huge releases; we'll get them later (and fix bug in
    # fatcatd)
    if max(len(contribs), len(refs), len(abstracts)) > 750:
        return None

    # release date parsing is amazingly complex
    # NOTE(review): assumes 'issued'/'date-parts' are always present on
    # Crossref works — confirm; missing keys would raise KeyError here.
    release_date = obj['issued']['date-parts'][0]
    if not release_date or not release_date[0]:
        # got some NoneType, even though at least year is supposed to be set
        release_date = None
    elif len(release_date) == 3:
        release_date = datetime.datetime(year=release_date[0],
                                         month=release_date[1],
                                         day=release_date[2])
    else:
        # only the year is actually required; mangle to first day for date
        # (TODO: something better?)
        release_date = datetime.datetime(year=release_date[0],
                                         month=1,
                                         day=1)
    # convert to string ISO datetime format (if not null)
    if release_date:
        release_date = release_date.isoformat() + "Z"

    re = fatcat_client.ReleaseEntity(work_id=None,
                                     title=obj['title'][0],
                                     contribs=contribs,
                                     refs=refs,
                                     container_id=container_id,
                                     publisher=publisher,
                                     release_type=obj['type'],
                                     release_status=release_status,
                                     doi=obj['DOI'].lower(),
                                     isbn13=isbn13,
                                     core_id=extids['core_id'],
                                     pmid=extids['pmid'],
                                     pmcid=extids['pmcid'],
                                     wikidata_qid=extids['wikidata_qid'],
                                     release_date=release_date,
                                     issue=obj.get('issue'),
                                     volume=obj.get('volume'),
                                     pages=obj.get('page'),
                                     abstracts=abstracts,
                                     extra=dict(crossref=extra))
    return (re, ce)
def parse_record(self, obj):
    """Convert a Crossref work record into a ReleaseEntity.

    Unlike parse_crossref_dict(), this variant also creates the
    container entity as a side effect (via self.create_container) and
    cleans all free-text fields with clean().

    Args:
        obj: python dict parsed from Crossref API JSON; must contain a
            non-empty 'title' list, a scope-eligible 'type', and
            (implicitly) 'DOI' and 'issued'.

    Returns:
        fatcat_client.ReleaseEntity, or None when the record is out of
        scope or unreasonably large.
    """
    # Ways to be out of scope (provisionally)
    # journal-issue and journal-volume map to None, but allowed for now
    if obj.get('type') in (None, 'journal', 'proceedings',
                           'standard-series', 'report-series',
                           'book-series', 'book-set', 'book-track',
                           'proceedings-series'):
        return None

    # Do require the 'title' keys to exsit, as release entities do
    if (not 'title' in obj) or (not obj['title']):
        return None

    release_type = self.map_release_type(obj['type'])

    # contribs
    def do_contribs(obj_list, ctype):
        # Build ReleaseContrib objects for one contributor role; only
        # authors get an index; first affiliation is promoted, the rest
        # go into extra.
        contribs = []
        for i, am in enumerate(obj_list):
            creator_id = None
            if 'ORCID' in am.keys():
                creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1])
            # Sorry humans :(
            if am.get('given') and am.get('family'):
                raw_name = "{} {}".format(am['given'], am['family'])
            elif am.get('family'):
                raw_name = am['family']
            else:
                # TODO: can end up empty
                raw_name = am.get('given')
            extra = dict()
            if ctype == "author":
                index = i
            else:
                index = None
            raw_affiliation = None
            if am.get('affiliation'):
                if len(am.get('affiliation')) > 0:
                    raw_affiliation = am.get('affiliation')[0]['name']
                if len(am.get('affiliation')) > 1:
                    # note: affiliation => more_affiliations
                    extra['more_affiliations'] = [
                        clean(a['name']) for a in am.get('affiliation')[1:]]
            if am.get('sequence') and am.get('sequence') != "additional":
                extra['seq'] = clean(am.get('sequence'))
            if not extra:
                extra = None
            assert ctype in ("author", "editor", "translator")
            raw_name = clean(raw_name)
            contribs.append(fatcat_client.ReleaseContrib(
                creator_id=creator_id,
                index=index,
                raw_name=raw_name,
                raw_affiliation=clean(raw_affiliation),
                role=ctype,
                extra=extra))
        return contribs

    contribs = do_contribs(obj.get('author', []), "author")
    contribs.extend(do_contribs(obj.get('editor', []), "editor"))
    contribs.extend(do_contribs(obj.get('translator', []), "translator"))

    # container
    issn = obj.get('ISSN', [None])[0]
    issnl = self.issn2issnl(issn)
    container_id = None
    if issnl:
        container_id = self.lookup_issnl(issnl)
    publisher = clean(obj.get('publisher'))
    # Create the container entity on the fly when configured and not
    # already known by ISSN-L lookup.
    if (container_id is None and self.create_containers and
            (issnl is not None) and obj.get('container-title') and
            len(obj['container-title']) > 0):
        ce = fatcat_client.ContainerEntity(
            issnl=issnl,
            publisher=publisher,
            container_type=self.map_container_type(release_type),
            name=clean(obj['container-title'][0], force_xml=True))
        ce_edit = self.create_container(ce)
        container_id = ce_edit.ident

    # license slug
    license_slug = None
    license_extra = []
    for l in obj.get('license', []):
        # Only consider version-of-record (or unspecified) licenses.
        if l['content-version'] not in ('vor', 'unspecified'):
            continue
        slug = lookup_license_slug(l['URL'])
        if slug:
            license_slug = slug
        if 'start' in l:
            # Flatten the 'start' object down to its date-time string.
            l['start'] = l['start']['date-time']
        license_extra.append(l)

    # references
    refs = []
    for i, rm in enumerate(obj.get('reference', [])):
        # FIX: narrowed bare `except:`; int(None) raises TypeError and a
        # non-numeric string raises ValueError — both mean "no year".
        try:
            year = int(rm.get('year'))
            # TODO: will need to update/config in the future!
            # NOTE: are there crossref works with year < 100?
            if year > 2025 or year < 100:
                year = None
        except (TypeError, ValueError):
            year = None
        ref_extra = dict()
        # Strip a redundant DOI prefix from the reference key.
        key = rm.get('key')
        if key and key.startswith(obj['DOI'].upper()):
            key = key.replace(obj['DOI'].upper() + "-", '')
            key = key.replace(obj['DOI'].upper(), '')
        container_name = rm.get('volume-title')
        if not container_name:
            container_name = rm.get('journal-title')
        elif rm.get('journal-title'):
            # volume-title won; preserve journal-title in extra.
            ref_extra['journal-title'] = rm['journal-title']
        if rm.get('DOI'):
            ref_extra['doi'] = rm.get('DOI').lower()
        author = clean(rm.get('author'))
        if author:
            ref_extra['authors'] = [author]
        for k in ('editor', 'edition', 'authority', 'version', 'genre',
                  'url', 'event', 'issue', 'volume', 'date',
                  'accessed_date', 'issued', 'page', 'medium',
                  'collection_title', 'chapter_number', 'unstructured',
                  'series-title', 'volume-title'):
            if clean(rm.get(k)):
                ref_extra[k] = clean(rm[k])
        if not ref_extra:
            ref_extra = None
        refs.append(fatcat_client.ReleaseRef(
            index=i,
            # doing lookups would be a second import pass
            target_release_id=None,
            key=key,
            year=year,
            container_name=clean(container_name),
            title=clean(rm.get('article-title')),
            locator=clean(rm.get('first-page')),
            # TODO: just dump JSON somewhere here?
            extra=ref_extra))

    # abstracts
    abstracts = []
    abstract = clean(obj.get('abstract'))
    if abstract and len(abstract) > 10:
        abstracts.append(fatcat_client.ReleaseEntityAbstracts(
            mimetype="application/xml+jats",
            content=abstract))

    # extra fields
    extra = dict()
    extra_crossref = dict()
    # top-level extra keys
    if not container_id:
        if obj.get('container-title'):
            extra['container_name'] = clean(obj['container-title'][0])
    for key in ('group-title', 'subtitle'):
        val = obj.get(key)
        if val:
            if type(val) == list:
                val = val[0]
            if type(val) == str:
                extra[key] = clean(val)
            else:
                extra[key] = val
    # crossref-nested extra keys
    for key in ('subject', 'type', 'alternative-id', 'archive', 'funder'):
        val = obj.get(key)
        if val:
            if type(val) == str:
                extra_crossref[key] = clean(val)
            else:
                extra_crossref[key] = val
    if license_extra:
        extra_crossref['license'] = license_extra

    # Secondary titles become aliases (cleaned, empties dropped).
    if len(obj['title']) > 1:
        aliases = [clean(t) for t in obj['title'][1:]]
        aliases = [t for t in aliases if t]
        if aliases:
            extra['aliases'] = aliases

    # ISBN
    isbn13 = None
    for raw in obj.get('ISBN', []):
        # TODO: convert if not ISBN-13 format
        if len(raw) == 17:
            isbn13 = raw
            break

    # release status
    if obj['type'] in ('journal-article', 'conference-proceeding', 'book',
                       'dissertation', 'book-chapter'):
        release_status = "published"
    else:
        # unknown
        release_status = None

    # external identifiers
    extids = self.lookup_ext_ids(doi=obj['DOI'].lower())

    # filter out unreasonably huge releases
    if len(abstracts) > 100:
        return None
    if len(refs) > 2000:
        return None
    # FIX: removed unreachable `if len(refs) > 5000: return None` — the
    # `> 2000` check above already returns for any such value.

    # release date parsing is amazingly complex
    # NOTE(review): assumes 'issued'/'date-parts' are always present on
    # Crossref works — confirm; missing keys would raise KeyError here.
    raw_date = obj['issued']['date-parts'][0]
    if not raw_date or not raw_date[0]:
        # got some NoneType, even though at least year is supposed to be set
        release_year = None
        release_date = None
    elif len(raw_date) == 3:
        release_year = raw_date[0]
        release_date = datetime.date(year=raw_date[0],
                                     month=raw_date[1],
                                     day=raw_date[2])
    else:
        # sometimes only the year is included, not the full date
        release_year = raw_date[0]
        release_date = None

    original_title = None
    if obj.get('original-title'):
        original_title = clean(obj.get('original-title')[0],
                               force_xml=True)

    title = None
    if obj.get('title'):
        title = clean(obj.get('title')[0], force_xml=True)
        if not title or len(title) <= 1:
            # title can't be just a single character
            return None

    if extra_crossref:
        extra['crossref'] = extra_crossref
    if not extra:
        extra = None

    re = fatcat_client.ReleaseEntity(
        work_id=None,
        container_id=container_id,
        title=title,
        original_title=original_title,
        release_type=release_type,
        release_status=release_status,
        release_date=release_date,
        release_year=release_year,
        publisher=publisher,
        doi=obj['DOI'].lower(),
        pmid=extids['pmid'],
        pmcid=extids['pmcid'],
        wikidata_qid=extids['wikidata_qid'],
        isbn13=isbn13,
        core_id=extids['core_id'],
        arxiv_id=extids['arxiv_id'],
        jstor_id=extids['jstor_id'],
        volume=clean(obj.get('volume')),
        issue=clean(obj.get('issue')),
        pages=clean(obj.get('page')),
        language=clean(obj.get('language')),
        license_slug=license_slug,
        extra=extra,
        abstracts=abstracts,
        contribs=contribs,
        refs=refs,
    )
    return re