def publication_info(self, key, value):
    """Populate the ``publication_info`` key.

    Builds a single entry from the subfields of ``value``. The ``hidden``
    key is ``True`` when ``key`` starts with ``7731`` and ``None`` otherwise.
    """
    def _first(code, default=None):
        # Collapse a possibly repeated subfield into a single element.
        return force_single_element(value.get(code, default))

    def _linked_record(code, endpoint):
        # Turn the recid stored in ``code`` into a reference for ``endpoint``.
        return get_record_ref(maybe_int(_first(code)), endpoint)

    def _cnum():
        # Slashes become dashes and the result is upper-cased.
        return _first('w', '').replace('/', '-').upper()

    def _material():
        # Only values listed in the material schema enum are kept.
        allowed = load_schema('elements/material')['enum']
        candidate = _first('m', '').lower()
        return candidate if candidate in allowed else None

    def _parent_isbn():
        isbn = _first('z', '')
        return normalize_isbn(isbn) if isbn else None

    def _freetext():
        # Freetext flagged with a leading ``#DONE`` is dropped.
        text = _first('x', '')
        return None if text.startswith('#DONE') else text

    page_start, page_end, artid = split_page_artid(value.get('c'))
    parent_record = _linked_record('0', 'literature')
    journal_record = _linked_record('1', 'journals')
    conference_record = _linked_record('2', 'conferences')

    return {
        'artid': artid,
        'cnum': _cnum(),
        'conf_acronym': _first('q'),
        'conference_record': conference_record,
        'hidden': key.startswith('7731') or None,
        'journal_issue': _first('n'),
        'journal_record': journal_record,
        'journal_title': _first('p'),
        'journal_volume': _first('v'),
        'material': _material(),
        'page_end': page_end,
        'page_start': page_start,
        'parent_isbn': _parent_isbn(),
        'parent_record': parent_record,
        'parent_report_number': _first('r'),
        'pubinfo_freetext': _freetext(),
        'year': maybe_int(_first('y')),
    }
def positions(self, key, value):
    """Build a position entry from the subfields of ``value``.

    Each ``z`` element is either the literal ``current`` (any case), which
    marks the position as current, or a recid that is turned into an
    institution reference. The ``institution`` key is ``None`` when the
    ``a`` subfield is empty.
    """
    is_current = False
    institution_ref = None
    for marker in force_list(value.get('z')):
        if marker.lower() != 'current':
            institution_ref = get_record_ref(maybe_int(marker), 'institutions')
        else:
            is_current = True

    institution_name = value.get('a')
    institution = None
    if institution_name:
        institution = {
            'name': institution_name,
            'record': institution_ref,
            'curated_relation': institution_ref is not None,
        }

    raw_rank = value.get('r')

    return {
        'institution': institution,
        'emails': list(force_list(value.get('m'))),
        'old_emails': list(force_list(value.get('o'))),
        '_rank': raw_rank,
        'rank': normalize_rank(raw_rank),
        'start_date': normalize_date(value.get('s')),
        'end_date': normalize_date(value.get('t')),
        'current': is_current,
    }
def related_records(self, key, value):
    """Build a related-record entry pointing at an institution.

    The ``w`` subfield selects the relation. Returns ``None`` when either
    the record reference or the relation cannot be resolved.
    """
    relation_by_code = {
        'a': 'predecessor',
        'r': 'other',
        't': 'parent',
    }

    record = get_record_ref(maybe_int(value.get('0')), 'institutions')
    relation = relation_by_code.get(value.get('w'))
    if not (record and relation):
        return None

    # ``record`` is truthy past the guard, so the relation is always curated.
    entry = {
        'curated_relation': True,
        'record': record,
    }
    # An ``other`` relation is stored under the freetext key instead.
    if relation == 'other':
        entry['relation_freetext'] = relation
    else:
        entry['relation'] = relation
    return entry
def related_records(self, key, value):
    """Build a related-record entry pointing at a journal.

    The ``w`` subfield selects the relation; for an ``other`` relation the
    content of the ``i`` subfield is stored as freetext instead. Returns
    ``None`` when either the record reference or the relation is missing.
    """
    relation_by_code = {
        'a': 'predecessor',
        'b': 'other',
        'r': 'other',
    }

    record = get_record_ref(maybe_int(value.get('0')), 'journals')
    relation = relation_by_code.get(value.get('w'))
    freetext = value.get('i')
    if not (record and relation):
        return None

    # ``record`` is truthy past the guard, so the relation is always curated.
    entry = {
        'curated_relation': True,
        'record': record,
    }
    if relation == 'other':
        entry['relation_freetext'] = freetext
    else:
        entry['relation'] = relation
    return entry
def get_recid_from_ref(ref_obj):
    """Retrieve recid from jsonref reference object.

    The recid is read from the last ``/``-separated segment of the ``$ref``
    value. If no recid can be parsed, returns None.

    Args:
        ref_obj: a reference object, expected to be a dict with a ``$ref`` key.

    Returns:
        The result of ``maybe_int`` on the last segment of ``$ref``, or
        ``None`` when ``ref_obj`` is not a dict or its ``$ref`` is missing,
        empty, or not a string-like value.
    """
    if not isinstance(ref_obj, dict):
        return None

    # A default passed to ``.get`` only covers a missing key; an explicit
    # null or empty ``$ref`` has to be rejected separately.
    url = ref_obj.get("$ref")
    if not url:
        return None

    try:
        last_segment = url.split("/")[-1]
    except AttributeError:
        # ``$ref`` holds something that is not a string: nothing to parse.
        return None

    return maybe_int(last_segment)
def collaboration(self, key, value):
    """Build a collaboration entry from the ``0`` and ``g`` subfields.

    The recid in ``0`` is turned into an experiment reference, and the entry
    is flagged as curated whenever that reference is not ``None``.
    """
    experiment_ref = get_record_ref(maybe_int(value.get('0')), 'experiments')
    is_curated = experiment_ref is not None

    return {
        'curated_relation': is_curated,
        'record': experiment_ref,
        'value': force_single_element(value.get('g')),
    }
def _external_system_identifiers(self, key, value):
    """Return one ``SPIRES`` identifier per element of the ``a`` subfield.

    As a side effect, stores a ``new_record`` reference on ``self`` when
    the ``d`` subfield holds a truthy recid. ``endpoint`` is taken from the
    enclosing scope.
    """
    replacement_recid = maybe_int(value.get('d'))
    if replacement_recid:
        self['new_record'] = get_record_ref(replacement_recid, endpoint)

    identifiers = []
    for spires_id in force_list(value.get('a')):
        identifiers.append({
            'schema': 'SPIRES',
            'value': spires_id,
        })
    return identifiers
def related_records_78708(self, key, value):
    """Populate the ``related_records`` key.

    Returns ``None`` when the ``w`` subfield does not yield a record
    reference; otherwise the ``i`` subfield is kept as the relation freetext.
    """
    literature_ref = get_record_ref(maybe_int(value.get('w')), 'literature')
    if not literature_ref:
        return None

    return {
        # The reference is truthy at this point, so it is always curated.
        'curated_relation': True,
        'record': literature_ref,
        'relation_freetext': value.get('i'),
    }
def related_records_78502(self, key, value):
    """Populate the ``related_records`` key.

    Returns a ``successor`` relation to the literature record referenced by
    the ``w`` subfield, or ``None`` when no reference can be built.
    """
    literature_ref = get_record_ref(maybe_int(value.get('w')), 'literature')
    if not literature_ref:
        return None

    return {
        # The reference is truthy at this point, so it is always curated.
        'curated_relation': True,
        'record': literature_ref,
        'relation': 'successor',
    }
def number_of_pages(self):
    """Return the page count found in the ``comments`` text nodes.

    All comment texts are joined with ``'; '`` and searched with
    ``RE_PAGES``; the first captured group is passed to ``maybe_int``.
    Returns ``None`` when the pattern does not match.
    """
    comment_texts = self.root.xpath('.//comments/text()').extract()
    match = RE_PAGES.search('; '.join(comment_texts))
    if not match:
        return None

    return maybe_int(match.group(1))
def extract_journal_info(obj, eng):
    """Extract the journal information from ``pubinfo_freetext``.

    Runs ``extract_journal_reference`` on the ``pubinfo_freetext`` key of each
    ``publication_info``, if it exists, and uses the extracted information to
    populate the other keys.

    Args:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        None

    """
    entries = obj.data.get('publication_info')
    if not entries:
        return

    for entry in entries:
        try:
            with local_refextract_kbs_path() as kbs_path:
                extracted = extract_journal_reference(
                    entry['pubinfo_freetext'],
                    override_kbs_files=kbs_path,
                )

            if not extracted:
                continue

            title = extracted.get('title')
            if title:
                entry['journal_title'] = title

            volume = extracted.get('volume')
            if volume:
                entry['journal_volume'] = volume

            page = extracted.get('page')
            if page:
                page_start, page_end, artid = split_page_artid(page)
                # Only overwrite the parts that were actually recognised.
                for field, found in (
                    ('page_start', page_start),
                    ('page_end', page_end),
                    ('artid', artid),
                ):
                    if found:
                        entry[field] = found

            raw_year = extracted.get('year')
            if raw_year:
                year = maybe_int(raw_year)
                if year:
                    entry['year'] = year
        except KeyError:
            # Entries without ``pubinfo_freetext`` are left untouched.
            pass

    obj.data['publication_info'] = convert_old_publication_info_to_new(
        obj.data['publication_info'])
def _get_json_experiments(marc_dict):
    """Yield one experiment entry per name in the ``e`` subfield.

    Names are paired positionally with the recids in the ``0`` subfield;
    when there are no recids every name is paired with ``None``. The start
    and end years (``s`` and ``d``) are shared by all yielded entries.
    """
    start_year = maybe_int(marc_dict.get('s'))
    end_year = maybe_int(marc_dict.get('d'))

    experiment_names = force_list(marc_dict.get('e'))
    experiment_recids = force_list(marc_dict.get('0'))
    if not experiment_recids:
        experiment_recids = [None] * len(experiment_names)

    for experiment_name, experiment_recid in zip(experiment_names, experiment_recids):
        experiment_ref = get_record_ref(experiment_recid, 'experiments')
        yield {
            'curated_relation': experiment_ref is not None,
            'current': marc_dict.get('z', '').lower() == 'current',
            'end_year': end_year,
            'name': experiment_name,
            'record': experiment_ref,
            'start_year': start_year,
        }
def extract_journal_info(obj, eng):
    """Extract the journal information from ``pubinfo_freetext``.

    Runs ``extract_journal_reference`` on the ``pubinfo_freetext`` key of each
    ``publication_info``, if it exists, and uses the extracted information to
    populate the other keys.

    Args:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        None

    """
    publication_infos = obj.data.get('publication_info')
    if not publication_infos:
        return

    # Extracted keys that are copied over unchanged when they are non-empty.
    copied_keys = (
        ('title', 'journal_title'),
        ('volume', 'journal_volume'),
    )

    for pub_info in publication_infos:
        try:
            with local_refextract_kbs_path() as kbs_path:
                found = extract_journal_reference(
                    pub_info['pubinfo_freetext'],
                    override_kbs_files=kbs_path,
                )

            if not found:
                continue

            for source_key, target_key in copied_keys:
                if found.get(source_key):
                    pub_info[target_key] = found[source_key]

            if found.get('page'):
                page_parts = split_page_artid(found['page'])
                page_start, page_end, artid = page_parts
                if page_start:
                    pub_info['page_start'] = page_start
                if page_end:
                    pub_info['page_end'] = page_end
                if artid:
                    pub_info['artid'] = artid

            if found.get('year'):
                parsed_year = maybe_int(found['year'])
                if parsed_year:
                    pub_info['year'] = parsed_year
        except KeyError:
            # No ``pubinfo_freetext`` on this entry: nothing to extract.
            pass

    obj.data['publication_info'] = convert_old_publication_info_to_new(
        obj.data['publication_info'])
def related_records_78002(self, key, values):
    """Append ``predecessor`` relations to the ``related_records`` list.

    Starts from the list already stored on ``self`` (or an empty one) and
    adds an entry for every field whose ``w`` subfield yields a literature
    reference.
    """
    related = self.get('related_records', [])

    for field in force_list(values):
        literature_ref = get_record_ref(maybe_int(field.get('w')), 'literature')
        if not literature_ref:
            continue
        related.append({
            # The reference is truthy at this point, so it is always curated.
            'curated_relation': True,
            'record': literature_ref,
            'relation': 'predecessor',
        })

    return related
def related_records_78708(self, key, values):
    """Append freetext relations to the ``related_records`` list.

    Starts from the list already stored on ``self`` (or an empty one) and
    adds an entry for every field whose ``w`` subfield yields a literature
    reference, keeping the ``i`` subfield as the relation freetext.
    """
    related = self.get('related_records', [])

    for field in force_list(values):
        literature_ref = get_record_ref(maybe_int(field.get('w')), 'literature')
        if not literature_ref:
            continue
        related.append({
            # The reference is truthy at this point, so it is always curated.
            'curated_relation': True,
            'record': literature_ref,
            'relation_freetext': field.get('i'),
        })

    return related
def collaborations(self, key, value):
    """Populate the ``collaborations`` key.

    The experiment reference from the ``0`` subfield is attached only when
    the ``g`` subfield normalizes to exactly one collaboration.
    """
    names = normalize_collaboration(value.get('g'))

    if len(names) != 1:
        return [{'value': name} for name in names]

    single_entry = {
        'record': get_record_ref(maybe_int(value.get('0')), 'experiments'),
        'value': names[0],
    }
    return [single_entry]
def copyright(self, key, value):
    """Build a copyright entry from the subfields of ``value``.

    The material label is read from the ``e`` subfield, falling back to
    ``3``, and is mapped to ``publication`` for the known labels; any other
    label gives ``None``.
    """
    material_by_label = {
        'Article': 'publication',
        'Published thesis as a book': 'publication',
    }
    label = value.get('e') or value.get('3')

    return {
        'holder': value.get('d'),
        'material': material_by_label.get(label),
        'statement': value.get('f'),
        'url': value.get('u'),
        'year': maybe_int(value.get('g')),
    }
def related_records(self, key, value):
    """Build a related-record entry pointing at an experiment.

    Only the ``a`` code of the ``w`` subfield is recognised, and it maps to
    ``predecessor``. Returns ``None`` when either the record reference or
    the relation is missing.
    """
    relation_by_code = {'a': 'predecessor'}

    record = get_record_ref(maybe_int(value.get('0')), 'experiments')
    relation = relation_by_code.get(value.get('w'))
    if not (record and relation):
        return None

    return {
        # ``record`` is truthy past the guard, so the relation is always curated.
        'curated_relation': True,
        'record': record,
        'relation': relation,
    }
def collaborations(self, key, value):
    """Populate the ``collaborations`` key.

    Every element of the ``g`` subfield is normalized on its own. The
    experiment reference from the ``0`` subfield is attached only to
    elements that normalize to exactly one collaboration.
    """
    entries = []

    for raw_name in force_list(value.get('g')):
        names = normalize_collaboration(raw_name)
        if len(names) != 1:
            entries.extend({'value': name} for name in names)
            continue
        entries.append({
            'record': get_record_ref(maybe_int(value.get('0')), 'experiments'),
            'value': names[0],
        })

    return entries
def extract_journal_info(obj, eng):
    """Extract journal, volume etc. from any freetext publication info.

    Each non-empty publication note is updated in place with whatever
    ``extract_journal_reference`` finds in its ``pubinfo_freetext``. Notes
    whose values are all falsy are dropped from the resulting list.
    """
    publication_info = get_value(obj.data, "publication_info")
    if not publication_info:
        return

    # Extracted keys that are copied over whenever they are present.
    copied_keys = (
        ("volume", "journal_volume"),
        ("title", "journal_title"),
    )

    kept_pubnotes = []
    for pubnote in publication_info:
        if not pubnote:
            continue

        freetext = pubnote.get("pubinfo_freetext")
        if freetext:
            if isinstance(freetext, (list, tuple)):
                text = ". ".join(freetext)
            else:
                text = freetext

            # NOTE: a journals knowledge-base override for this call was
            # sketched here in the past and is currently disabled.
            extracted = extract_journal_reference(text)

            if extracted:
                for source_key, target_key in copied_keys:
                    if source_key in extracted:
                        pubnote[target_key] = extracted.get(source_key)

                if "year" in extracted:
                    year = maybe_int(extracted.get('year'))
                    if year is not None:
                        pubnote['year'] = year

                if "page" in extracted:
                    page_start, page_end, artid = split_page_artid(
                        extracted.get("page"))
                    if page_start:
                        pubnote["page_start"] = page_start
                    if page_end:
                        pubnote["page_end"] = page_end
                    if artid:
                        pubnote["artid"] = artid

        if any(pubnote.values()):
            kept_pubnotes.append(pubnote)

    obj.data["publication_info"] = kept_pubnotes
def positions(self, key, value):
    """Populate the positions field.

    Also populates the email_addresses field by side effect.

    Returns ``None`` when ``value`` has no ``a`` subfield; the email
    addresses are stored on ``self`` in either case.
    """
    known_addresses = self.get("email_addresses", [])

    is_current = None
    institution_ref = None
    for marker in force_list(value.get('z')):
        if marker.lower() != 'current':
            institution_ref = get_record_ref(maybe_int(marker), 'institutions')
        else:
            # Only flag the position as current when an institution is named.
            is_current = bool(value.get('a')) or None

    rank = normalize_rank(value.get('r'))

    address_groups = (
        (force_list(value.get('m')), True),
        (force_list(value.get('o')), False),
    )
    for addresses, address_is_current in address_groups:
        known_addresses.extend([
            {'value': address, 'current': address_is_current}
            for address in addresses
        ])
    self['email_addresses'] = known_addresses

    if 'a' not in value:
        return None

    return {
        'institution': value['a'],
        'record': institution_ref,
        'curated_relation': (institution_ref is not None) or None,
        'rank': rank,
        'start_date': normalize_date(value.get('s')),
        'end_date': normalize_date(value.get('t')),
        'current': is_current,
    }
def _fft(self, key, value):
    """Build a file entry from the subfields of ``value``.

    Returns ``None`` for fields whose ``f`` subfield ends with ``context``.
    The ``s`` subfield, when present, is parsed as
    ``%Y-%m-%d %H:%M:%S`` and emitted in ISO format.
    """
    if value.get('f', '').endswith('context'):
        return None

    created = value.get('s')
    if created:
        creation_datetime = datetime.strptime(
            created, '%Y-%m-%d %H:%M:%S').isoformat()
    else:
        creation_datetime = None

    return {
        'creation_datetime': creation_datetime,
        'description': value.get('d'),
        'filename': value.get('n'),
        'flags': force_list(value.get('o')),
        'format': value.get('f'),
        'path': value.get('a'),
        'status': value.get('z'),
        'type': value.get('t'),
        'version': maybe_int(value.get('v')),
    }
def experiment(self, key, values):
    """Populate the ``experiment`` key.

    Also populates the ``legacy_name``, the ``accelerator``, and the
    ``institutions`` keys through side effects.
    """
    result = self.get('experiment', {})
    legacy_name = self.get('legacy_name', '')
    accelerator = self.get('accelerator', {})
    institutions = self.get('institutions', [])

    for field in force_list(values):
        full_name = field.get('c')
        if full_name:
            result['value'] = full_name

        short_name = field.get('d')
        if short_name:
            result['short_name'] = short_name

        name = field.get('a')
        if name:
            legacy_name = name

        accelerator_name = field.get('b')
        if accelerator_name:
            accelerator['value'] = accelerator_name

        institution = {}
        affiliation = field.get('u')
        if affiliation:
            institution['value'] = affiliation

        institution_recid = field.get('z')
        if institution_recid:
            record = get_record_ref(maybe_int(institution_recid), 'institutions')
            if record:
                institution['curated_relation'] = True
                institution['record'] = record

        # NOTE(review): one entry is appended per field, even when it ends
        # up empty -- confirm this is the intended nesting.
        institutions.append(institution)

    self['legacy_name'] = legacy_name
    self['accelerator'] = accelerator
    self['institutions'] = institutions

    return result
def _get_number(value):
    """Return the ``n`` subfield of ``value`` passed through ``maybe_int``."""
    raw_number = force_single_element(value.get('n'))
    return maybe_int(raw_number)
def number_of_pages(self):
    """Return the first ``page-count/@count`` value via ``maybe_int``."""
    count_nodes = self.root.xpath('./front/article-meta//page-count/@count')
    return maybe_int(count_nodes.extract_first())
def copyright_year(self):
    """Return the first ``copyright-year`` text under ``front`` via ``maybe_int``."""
    year_nodes = self.root.xpath('./front//copyright-year/text()')
    return maybe_int(year_nodes.extract_first())
def _set_record(el):
    """Set a literature reference built from ``el`` on the enclosing ``rb``."""
    literature_ref = get_record_ref(maybe_int(el), 'literature')
    rb.set_record(literature_ref)
def copyright_year(self):
    """Return the copyright year found under ``front``.

    The first matching ``copyright-year`` text node is handed to
    ``maybe_int``.
    """
    raw_year = self.root.xpath('./front//copyright-year/text()').extract_first()
    parsed_year = maybe_int(raw_year)
    return parsed_year
def number_of_pages(self):
    """Return the page count declared in the article metadata.

    The first ``page-count/@count`` attribute under ``front/article-meta``
    is handed to ``maybe_int``.
    """
    raw_count = self.root.xpath(
        './front/article-meta//page-count/@count').extract_first()
    return maybe_int(raw_count)
def test_maybe_int_returns_none_otherwise():
    """``maybe_int`` gives ``None`` for a string that is not a plain integer."""
    result = maybe_int('216+337')

    assert result is None
def copyright_year(self):
    """Return the ``year`` attribute of the first typed ``copyright`` element.

    The attribute is looked up under ``item-info`` and handed to
    ``maybe_int``.
    """
    year_nodes = self.root.xpath("./*/item-info/copyright[@type]/@year")
    return maybe_int(year_nodes.extract_first())
def _get_record(value):
    """Return an ``authors`` reference built from the ``x`` subfield."""
    author_recid = maybe_int(force_single_element(value.get('x')))
    return get_record_ref(author_recid, 'authors')
def _deleted_records(self, key, value):
    """Return a reference to the record whose recid is in the ``a`` subfield.

    Returns ``None`` when the subfield does not hold a truthy recid.
    ``endpoint`` is taken from the enclosing scope.
    """
    recid = maybe_int(value.get('a'))
    if not recid:
        return None

    return get_record_ref(recid, endpoint)