def _get_ids(value):
    """Collect the identifiers carried by the ``i``, ``j`` and ``w`` subfields.

    Returns a list of ``{"type": ..., "value": ...}`` dicts, in the order
    ``i`` values, then ``j`` values, then ``w`` values. ``j`` values that match
    none of the recognised shapes are dropped.
    """
    def _classify(j_value):
        # The order of the checks matters: the first matching shape wins.
        if j_value.upper().startswith("JACOW-"):
            return {"type": "JACOW", "value": "JACoW-" + j_value[6:]}
        if j_value.upper().startswith("ORCID:") and len(j_value) > 6:
            return {"type": "ORCID", "value": j_value[6:]}
        if ORCID.match(j_value):
            # An ORCID written without the "ORCID:" prefix.
            return {"type": "ORCID", "value": j_value}
        if j_value.startswith("CCID-"):
            return {"type": "CERN", "value": "CERN-" + j_value[5:]}
        return None

    ids = [
        {"type": "INSPIRE ID", "value": inspire_id}
        for inspire_id in force_force_list(value.get("i"))
    ]

    for j_value in force_force_list(value.get("j")):
        classified = _classify(j_value)
        if classified is not None:
            ids.append(classified)

    ids.extend(
        {"type": "INSPIRE BAI", "value": bai}
        for bai in force_force_list(value.get("w"))
    )

    return ids
def persistent_identifiers(self, key, value):
    """Persistent Standard Identifiers.

    Splits the incoming field(s) into DOIs and other persistent identifiers.
    Entries whose ``2`` subfield equals ``doi`` (case-insensitively) are
    appended to ``self['dois']``; every other entry is appended to the
    returned list, which starts from any ``persistent_identifiers`` already
    stored on ``self``.
    """
    dois = self.get('dois', [])
    persistent_identifiers = self.get('persistent_identifiers', [])

    for val in force_force_list(value):
        if not val:
            continue

        source = val.get('9')
        # Collapse the ``2`` subfield once and use the same collapsed value
        # both for the DOI test and for the stored ``type``, so a repeated
        # subfield never ends up as a list in the output.
        id_type = force_single_element(val.get('2'))
        is_doi = bool(id_type) and id_type.lower() == 'doi'

        for item in force_force_list(val.get('a')):
            if is_doi:
                dois.append({
                    'value': item,
                    'source': source,
                })
            else:
                persistent_identifiers.append({
                    'value': item,
                    'source': source,
                    'type': id_type,
                })

    self['dois'] = dois
    return persistent_identifiers
def acronym(self, key, value):
    """Conference acronym.

    Besides returning the ``e`` subfield, this copies the date and cnum
    subfields onto ``self`` and appends any titles (``a``) and parsed
    addresses (``c``) to ``self['titles']`` / ``self['address']``.
    """
    for target, subfield in (
        ('date', 'd'),
        ('opening_date', 'x'),
        ('closing_date', 'y'),
        ('cnum', 'g'),
    ):
        self[target] = value.get(subfield)

    if value.get('a'):
        self.setdefault('titles', [])
        for raw_title in force_force_list(value.get('a')):
            self['titles'].append({
                'title': raw_title,
                'subtitle': value.get('b'),
                'source': value.get('9'),
            })

    if value.get('c'):
        self.setdefault('address', [])
        for raw_address in force_force_list(value.get('c')):
            self['address'].append(parse_conference_address(raw_address))

    return value.get('e')
def _get_acc_exp_json(acc_exp_data):
    """Yield one accelerator/experiment dict per ``e`` subfield.

    When the ``0`` subfield provides exactly one integer recid per experiment
    name, each yielded dict carries a ``record`` reference and is marked as a
    curated relation. Otherwise the names are yielded without a record and
    with ``curated_relation`` set to ``False``.
    """
    parsed_recids = []
    if '0' in acc_exp_data:
        try:
            parsed_recids = [
                int(raw) for raw in force_force_list(acc_exp_data.get('0'))
            ]
        except (TypeError, ValueError, AttributeError):
            # Unparseable recids: behave as if none were given.
            parsed_recids = []

    names = force_force_list(acc_exp_data.get('e'))

    # Only pair names with recids when the counts agree; otherwise a name
    # could be matched with the wrong recid.
    if len(parsed_recids) != len(names):
        for name in names:
            yield {
                'accelerator': acc_exp_data.get('a'),
                'experiment': name,
                'curated_relation': False,
            }
        return

    for parsed_recid, name in zip(parsed_recids, names):
        yield {
            'record': get_record_ref(parsed_recid, 'experiments'),
            'accelerator': acc_exp_data.get('a'),
            'experiment': name,
            'curated_relation': True
        }
def persistent_identifiers(self, key, value):
    """Persistent Standard Identifiers.

    Splits the incoming field(s) into DOIs and other persistent identifiers.
    Entries whose ``2`` subfield is ``DOI`` (case-insensitively) are appended
    to ``self['dois']``; every other entry is appended to the returned list,
    which starts from any ``persistent_identifiers`` already on ``self``.
    The recorded source is taken from the ``9`` subfield, ignoring values
    equal to ``CURATOR`` (case-insensitively).
    """
    def _first_non_curator_source(sources):
        sources = force_force_list(sources)
        # Build a real list rather than using ``filter``: on Python 3
        # ``filter`` returns a lazy iterator, which is not a sequence and
        # presumably would not be unwrapped by ``force_single_element``.
        without_curator = [el for el in sources if el.upper() != 'CURATOR']

        return force_single_element(without_curator)

    def _is_doi(type_):
        return type_ and type_.upper() == 'DOI'

    dois = self.get('dois', [])
    persistent_identifiers = self.get('persistent_identifiers', [])

    # A distinct loop name keeps the ``value`` parameter from being rebound.
    for field in force_force_list(value):
        if not field:
            continue

        ids = force_force_list(field.get('a'))
        type_ = force_single_element(field.get('2'))
        source = _first_non_curator_source(field.get('9'))

        if _is_doi(type_):
            dois.extend([{
                'source': source,
                'value': id_,
            } for id_ in ids])
        else:
            persistent_identifiers.extend([{
                'source': source,
                'type': type_,
                'value': id_,
            } for id_ in ids])

    self['dois'] = dois
    return persistent_identifiers
def _get_affiliations(value):
    """Build the affiliation dicts from the ``u`` (name) and ``z`` subfields.

    When there is one ``z`` value per ``u`` value, each affiliation gets a
    ``record`` built from the integer ``z`` value (``None`` if it cannot be
    converted) and ``curated_relation`` reflects whether a record was
    obtained. Otherwise only the names are returned, uncurated.
    """
    names = force_force_list(value.get('u'))
    recids = force_force_list(value.get('z'))

    # Only pair names with recids when the counts agree; otherwise a name
    # could be matched with the wrong recid.
    if len(names) != len(recids):
        return [
            {
                'curated_relation': False,
                'value': name,
            }
            for name in names
        ]

    affiliations = []
    for name, recid in zip(names, recids):
        try:
            record = get_record_ref(int(recid), 'institutions')
        except (TypeError, ValueError):
            record = None

        affiliations.append({
            'curated_relation': record is not None,
            'record': record,
            'value': name,
        })

    return affiliations
def institutions(self, key, value):
    """Institutions info.

    Appends one entry per ``a`` subfield to the institutions already stored
    on ``self`` and returns the resulting list. A record reference built from
    the ``z`` subfield is attached only when there is exactly one ``z`` value
    per ``a`` value.
    """
    collected = self.get('institutions', [])

    names = force_force_list(value.get('a'))
    recids = force_force_list(value.get('z'))

    # Only pair names with recids when the counts agree; otherwise a name
    # could be matched with the wrong recid.
    if len(names) != len(recids):
        collected.extend(
            {
                'curated_relation': False,
                'name': name,
            }
            for name in names
        )
        return collected

    for name, recid in zip(names, recids):
        record = get_record_ref(recid, 'institutions')
        collected.append({
            'curated_relation': record is not None,
            'name': name,
            'record': record,
        })

    return collected
def parse_institution_address(address, city, state_province,
                              country, postal_code, country_code):
    """Parse an institution address.

    Normalises the individual address components and returns a dict with the
    keys ``original_address``, ``city``, ``state``, ``country``,
    ``postal_code`` and ``country_code``.

    Multi-valued ``postal_code``, ``city`` and ``country`` inputs are joined
    with ``', '``; duplicate countries are removed while keeping the order in
    which they were given. When no country code can be matched directly it is
    looked up from the country name, and finally inferred as ``'US'`` from a
    state value starting with ``'US-'``.
    """
    original_address = force_force_list(address)
    state_province = match_us_state(state_province) or state_province
    postal_code = force_force_list(postal_code)
    country = force_force_list(country)
    country_code = match_country_code(country_code)

    if isinstance(postal_code, (tuple, list)):
        postal_code = ', '.join(postal_code)

    if isinstance(city, (tuple, list)):
        city = ', '.join(city)

    if isinstance(country, (tuple, list)):
        # De-duplicate in input order: joining a ``set`` would make the
        # resulting string depend on arbitrary set iteration order.
        unique_countries = []
        for country_name in country:
            if country_name not in unique_countries:
                unique_countries.append(country_name)
        country = ', '.join(unique_countries)

    if not country_code and country:
        country_code = match_country_name_to_its_code(country)

    if not country_code and state_province and state_province.startswith('US-'):
        country_code = 'US'

    return {
        'original_address': original_address,
        'city': city,
        'state': state_province,
        'country': country,
        'postal_code': postal_code,
        'country_code': country_code,
    }
def _collection_in_record(record, collection):
    """Return True if ``collection`` is among the record's ``980__`` ``a`` values.

    The ``a`` values are lower-cased before the comparison; ``collection``
    itself is compared as given.
    """
    for field in force_force_list(record.get("980__", [])):
        lowered_names = [
            name.lower() for name in force_force_list(field.get('a', []))
        ]
        if collection in lowered_names:
            return True

    return False
def name_variants(self, key, value):
    """Variants of the name.

    Appends the ``a`` values (with their ``9`` source) to the name variants
    already on ``self`` and returns the list. Any ``g`` values are added to
    ``self["extra_words"]`` as a side effect.
    """
    extra_words = value.get("g")
    if extra_words:
        self.setdefault("extra_words", []).extend(force_force_list(extra_words))

    variants = self.get("name_variants", [])
    variant = {
        "source": value.get("9"),
        "value": force_force_list(value.get("a", [])),
    }
    variants.append(variant)

    return variants
def ranks(self, key, value):
    """Ranks.

    Stores every ``a`` value in ``self['_ranks']`` and its classified form
    (via ``classify_rank``) in ``self['ranks']``. Nothing is returned.
    """
    self.setdefault('_ranks', [])
    self.setdefault('ranks', [])

    for field in force_force_list(value):
        for raw_rank in force_force_list(field.get('a')):
            self['_ranks'].append(raw_rank)
            self['ranks'].append(classify_rank(raw_rank))
def _get_ids(value):
    """Collect the identifiers carried by the ``i``, ``j`` and ``w`` subfields.

    Returns a list of ``{'type': ..., 'value': ...}`` dicts, in the order
    ``i`` values, then ``j`` values, then ``w`` values. ``j`` values that match
    none of the recognised shapes are dropped.
    """
    def _from_j(j_value):
        """Return ``(type, value)`` for a ``j`` value, or None if unknown."""
        uppercased = j_value.upper()
        # The order of the checks matters: the first matching shape wins.
        if uppercased.startswith('JACOW-'):
            return 'JACOW', 'JACoW-' + j_value[6:]
        if uppercased.startswith('ORCID:') and len(j_value) > 6:
            return 'ORCID', j_value[6:]
        if ORCID.match(j_value):
            # An ORCID written without the 'ORCID:' prefix.
            return 'ORCID', j_value
        if j_value.startswith('CCID-'):
            return 'CERN', 'CERN-' + j_value[5:]
        return None

    ids = []

    for inspire_id in force_force_list(value.get('i')):
        ids.append({'type': 'INSPIRE ID', 'value': inspire_id})

    for j_value in force_force_list(value.get('j')):
        classified = _from_j(j_value)
        if classified is None:
            continue
        id_type, id_value = classified
        ids.append({'type': id_type, 'value': id_value})

    for bai in force_force_list(value.get('w')):
        ids.append({'type': 'INSPIRE BAI', 'value': bai})

    return ids
def collaboration(self, key, value):
    """Collaboration of experiment.

    Merges the collaboration names already stored on ``self`` (the primary
    ``collaboration`` and any ``collaboration_alternative_names``) with the
    ``g`` subfields of the incoming field(s). The shortest name is returned
    as the primary collaboration; the remaining ones, ordered by length, are
    stored in ``self['collaboration_alternative_names']``. Returns ``None``
    when no name is available.
    """
    values = force_force_list(self.get('collaboration'))
    values.extend(self.get('collaboration_alternative_names', []))
    # Skip elements without a ``g`` subfield: a ``None`` entry would make
    # ``sorted(..., key=len)`` below raise a TypeError.
    values.extend(
        el.get('g') for el in force_force_list(value) if el.get('g')
    )

    collaborations = sorted(values, key=len)

    if len(collaborations) > 1:
        self['collaboration_alternative_names'] = collaborations[1:]

    if collaborations:
        return collaborations[0]
def _get_ids(value):
    """Collect the identifiers carried by the ``i``, ``j`` and ``w`` subfields.

    Returns a list of ``{'type': ..., 'value': ...}`` dicts, in the order
    ``i`` values, then ``j`` values, then ``w`` values. ``j`` values that match
    none of the recognised shapes are dropped.
    """
    # (predicate, resulting type, value normaliser); rules are tried in
    # order and the first matching one wins.
    j_rules = (
        (lambda j: j.upper().startswith('JACOW-'),
         'JACOW',
         lambda j: 'JACoW-' + j[6:]),
        (lambda j: j.upper().startswith('ORCID:') and len(j) > 6,
         'ORCID',
         lambda j: j[6:]),
        (lambda j: ORCID.match(j),
         'ORCID',
         lambda j: j),
        (lambda j: j.startswith('CCID-'),
         'CERN',
         lambda j: 'CERN-' + j[5:]),
    )

    found = [
        {
            'type': 'INSPIRE ID',
            'value': inspire_id,
        }
        for inspire_id in force_force_list(value.get('i'))
    ]

    for j_value in force_force_list(value.get('j')):
        for matches, id_type, normalise in j_rules:
            if matches(j_value):
                found.append({
                    'type': id_type,
                    'value': normalise(j_value),
                })
                break

    for bai in force_force_list(value.get('w')):
        found.append({
            'type': 'INSPIRE BAI',
            'value': bai,
        })

    return found
def name_variants(self, key, value):
    """Variants of the name.

    Returns the name variants already on ``self`` with one more entry built
    from the ``a`` values and the ``9`` source. Any ``g`` values are added to
    ``self['extra_words']`` as a side effect.
    """
    if value.get('g'):
        extra_words = self.setdefault('extra_words', [])
        extra_words.extend(force_force_list(value.get('g')))

    known_variants = self.get('name_variants', [])
    known_variants.append(dict([
        ('source', value.get('9')),
        ('value', force_force_list(value.get('a', []))),
    ]))

    return known_variants
def add_book_info(record, blob):
    """Add link to the appropriate book record.

    Only acts when one of the record's primary collections is
    ``bookchapter``: for each ``773__`` entry of ``blob`` with a ``0``
    subfield, ``record['book']`` is set to a literature record reference
    (a later entry overwrites an earlier one).
    """
    primary_collections = []
    if 'collections' in record:
        primary_collections = [
            coll.get('primary').lower()
            for coll in record.get('collections', '')
            if coll.get('primary', '')
        ]

    if 'bookchapter' not in primary_collections:
        return

    for pubinfo in force_force_list(blob.get("773__", [])):
        if not pubinfo.get('0'):
            continue

        book_recid = int(force_force_list(pubinfo.get('0'))[0])
        record['book'] = {
            'record': get_record_ref(book_recid, 'literature')
        }
def report_numbers(self, key, value):
    """Report numbers and arXiv numbers from 037.

    Elements whose ``9`` subfield is ``arXiv`` and that carry a ``c``
    subfield are stored in ``self['arxiv_eprints']``; all others are appended
    to the returned report-number list.
    """
    def _as_report_number(element):
        return {
            'source': element.get('9'),
            'value': element.get('a', element.get('z')),
        }

    def _as_arxiv_eprint(element):
        return {
            'value': element.get('a'),
            'categories': force_force_list(element.get('c')),
        }

    report_numbers_ = self.get('report_numbers', [])
    eprints = self.get('arxiv_eprints', [])

    for element in force_force_list(value):
        is_arxiv = element.get('9') == 'arXiv' and 'c' in element
        if is_arxiv:
            eprints.append(_as_arxiv_eprint(element))
        else:
            report_numbers_.append(_as_report_number(element))

    self['arxiv_eprints'] = eprints
    return report_numbers_
def _get_affiliations(value):
    """Build the affiliation dicts from the ``u`` and ``z`` subfields.

    A ``record`` reference built from ``z`` is included only when there is
    exactly one ``z`` value per ``u`` value; otherwise each affiliation
    carries just its ``value``.
    """
    names = force_force_list(value.get("u"))
    recids = force_force_list(value.get("z"))

    # Only pair names with recids when the counts agree; otherwise a name
    # could be matched with the wrong recid.
    if len(names) != len(recids):
        return [{"value": name} for name in names]

    return [
        {
            "record": get_record_ref(recid, "institutions"),
            "value": name,
        }
        for name, recid in zip(names, recids)
    ]
def _get_full_name(value):
    """Return the first ``a`` subfield, or None when there is none.

    A warning is logged when more than one ``a`` value is present.
    """
    names = force_force_list(value.get("a"))
    if not names:
        return None

    if len(names) > 1:
        logger.warning(
            "Record with mashed up authors list. Taking first author: %s",
            names[0],
        )

    return names[0]
def field_categories(self, key, value):
    """Field categories.

    Appends one entry per ``a`` term to ``self['field_categories']``, keeping
    both the raw term/scheme (``_term``, ``_scheme``) and the classified term
    with its scheme (``INSPIRE`` when classification succeeded). A source
    containing ``automatically`` is recorded as ``INSPIRE``. Nothing is
    returned.
    """
    self.setdefault('field_categories', [])

    for raw_term in force_force_list(value.get('a')):
        classified = classify_field(raw_term)

        raw_scheme = value.get('2')
        if isinstance(raw_scheme, (list, tuple)):
            # Repeated subfield: keep only the first scheme.
            raw_scheme = raw_scheme[0]

        source = value.get('9')
        if source and 'automatically' in source:
            source = 'INSPIRE'

        self['field_categories'].append({
            'source': source,
            '_scheme': raw_scheme,
            'scheme': 'INSPIRE' if classified else None,
            '_term': raw_term,
            'term': classified,
        })
def regions(self, key, value):
    """Regions.

    Splits every ``a`` value with ``COMMA_OR_SLASH`` and maps each piece to
    its canonical region name. Pieces missing from the map yield ``None``.
    """
    # Keys include abbreviations and known misspellings seen in the input.
    canonical_names = {
        'AF': 'Africa',
        'Africa': 'Africa',
        'Asia': 'Asia',
        'Australia': 'Australasia',
        'Australasia': 'Australasia',
        'eu': 'Europe',
        'Europe': 'Europe',
        'Middle East': 'Middle East',
        'na': 'North America',
        'United States': 'North America',
        'Noth America': 'North America',
        'North America': 'North America',
        'North Americsa': 'North America',
        'South America': 'South America',
    }

    return [
        canonical_names.get(piece)
        for raw in force_force_list(value.get('a'))
        for piece in COMMA_OR_SLASH.split(raw)
    ]
def get_subject(record):
    """Return the first non-empty ``term`` among the record's
    ``inspire_categories``, or None when there is none."""
    categories = force_force_list(get_value(record, 'inspire_categories'))

    non_empty_terms = []
    for category in categories:
        if category.get('term'):
            non_empty_terms.append(category['term'])

    if not non_empty_terms:
        return None

    return non_empty_terms[0]
def name(self, key, value):
    """Name information.

    Only the first incoming field is used. Its ``a`` subfield is also stored
    as ``self["breadcrumb_title"]`` and its ``d`` subfield as
    ``self["dates"]``, unless those keys are already set.

    Notes carried over from the original documentation:

    The MARC field for an author's name is split into two fields,
    `last_name` and `first_name`. The same applies to the date fields, which
    in JSON are split into `birth_year` and `death_year`.

    Admissible string values for `100__g`:
    + active
    + departed
    + retired
    + deceased

    The only accepted value in the `100__c` field is:
    + Sir

    Values accepted for `100__b`:
    + Jr.
    + Sr.
    + roman numbers (like VII)
    """
    first = force_force_list(value)[0]

    self.setdefault("breadcrumb_title", first.get("a"))
    self.setdefault("dates", first.get("d"))

    return {
        "value": first.get("a"),
        "numeration": first.get("b"),
        "title": first.get("c"),
        "status": first.get("g"),
        "preferred_name": first.get("q"),
    }
def split_page_artid(page_artid):
    """Split page_artid into page_start/end and artid.

    Returns a ``(page_start, page_end, artid)`` tuple; each element is None
    when it could not be determined. Later values in a multi-valued input
    can overwrite what earlier ones set.
    """
    if not page_artid:
        return None, None, None

    page_start = page_end = artid = None

    for candidate in force_force_list(page_artid):
        if not candidate:
            continue

        if '-' in candidate:
            # A dash suggests a page range, but only when it splits into
            # exactly two parts; anything else is kept as an article ID.
            parts = candidate.split('-')
            if len(parts) == 2:
                page_start, page_end = parts
            else:
                artid = candidate
        elif _RE_2_CHARS.search(candidate) or len(candidate) >= 5:
            # Either the pattern matched (presumably two or more letters)
            # or the value has at least 5 characters: an article ID.
            artid = candidate
        else:
            # Short value: it may be both an article ID and a start page,
            # so fill whichever of the two is still unset.
            if artid is None:
                artid = candidate
            if page_start is None:
                page_start = candidate

    return page_start, page_end, artid
def get_int_value(val):
    """Return the first element of ``val`` as an int when it is all digits.

    A first element that is not all digits is returned unchanged; a falsy
    ``val`` gives None.
    """
    if not val:
        return None

    first = force_force_list(val)[0]
    return int(first) if first.isdigit() else first
def phd_advisors(self, key, value):
    """Build the advisor dict from the ``i``, ``a`` and ``g`` subfields.

    The first ``g`` value is normalised to ``PhD`` / ``Master`` when it
    matches (case-insensitively); any other value is kept as given.
    """
    degree_type = None

    if value.get("g"):
        raw_degree = force_force_list(value.get("g"))[0]
        lowered = raw_degree.lower()
        if lowered == "phd":
            degree_type = "PhD"
        elif lowered == "master":
            degree_type = "Master"
        else:
            degree_type = raw_degree

    return {
        "id": value.get("i"),
        "name": value.get("a"),
        "degree_type": degree_type,
    }
def _get_source(value):
    """Return the ``a`` subfield as a single element, after discarding
    values that start with ``inspire:uid:``."""
    kept = []
    for candidate in force_force_list(value.get('a')):
        if candidate.startswith('inspire:uid:'):
            continue
        kept.append(candidate)

    return force_single_element(kept)
def titles(self, key, value):
    """Populate ``self['titles']`` (and ``self['title_translations']``).

    The handling depends on the MARC ``key``: keys starting with ``245`` put
    the title at the front of the list, keys starting with ``242`` are
    treated as translated titles, and any other key appends to the list.
    The ``titles`` list stored on ``self`` is returned.
    """
    def is_main_title(key):
        return key.startswith('245')

    def is_translated_title(key):
        return key.startswith('242')

    titles = self.setdefault('titles', [])
    values = force_force_list(value)
    for val in values:
        title_obj = {
            'title': val.get('a'),
            'subtitle': force_single_element(val.get('b')),  # FIXME: #1484
            'source': val.get('9'),
        }
        if is_main_title(key):
            # The main title always goes first.
            titles.insert(0, title_obj)
        elif is_translated_title(key):
            title = val.get('a')
            if title:
                lang = langdetect.detect(title)
                if lang:
                    # NOTE(review): nesting reconstructed from a flattened
                    # source; a translation is recorded only when a language
                    # was detected - confirm against the original layout.
                    title_obj['language'] = lang
                    self.setdefault('title_translations', []).append(title_obj)
        else:
            titles.append(title_obj)

    return titles
def authors2marc(self, key, value):
    """Main Entry-Personal Name.

    The first author is returned as the main entry; when there are further
    authors they are stored, in order, in ``self["700"]``.
    """
    def _to_marc(author):
        affiliation_values = []
        for affiliation in author.get('affiliations', []):
            affiliation_values.append(affiliation.get('value'))

        return {
            'a': author.get('full_name'),
            'e': author.get('role'),
            'q': author.get('alternative_names'),
            'i': author.get('inspire_id'),
            'j': author.get('orcid'),
            'm': author.get('emails'),
            'u': affiliation_values,
            'x': get_recid_from_ref(author.get('record')),
            'y': author.get('curated_relation')
        }

    authors = force_force_list(value)

    if len(authors) > 1:
        self["700"] = []
        self["700"].extend(_to_marc(author) for author in authors[1:])

    return _to_marc(authors[0])
def inspire_categories(self, key, value):
    """Inspire categories.

    Appends one ``{'term', 'source'}`` entry to ``self['inspire_categories']``
    for every ``a`` term that ``classify_field`` recognises. A source that is
    not among the schema's allowed values is mapped to ``curator``, ``user``
    or ``undefined``. Nothing is returned.
    """
    allowed_sources = load_schema(
        'elements/inspire_field')['properties']['source']['enum']

    raw_terms = force_force_list(value.get('a'))

    source = value.get('9')
    if source not in allowed_sources:
        if source == 'automatically added based on DCC, PPF, DK':
            source = 'curator'
        else:
            source = 'user' if source == 'submitter' else 'undefined'

    self.setdefault('inspire_categories', [])

    for raw_term in raw_terms:
        classified = classify_field(raw_term)
        if not classified:
            continue

        self['inspire_categories'].append({
            'term': classified,
            'source': source,
        })
def references(self, key, value):
    """Produce list of references.

    Each incoming field is converted with a ``ReferenceBuilder`` and the
    results are appended to the references already stored on ``self``.
    """
    def _build_reference(field):
        # Retrieve fields as described here:
        # https://twiki.cern.ch/twiki/bin/view/Inspire/DevelopmentRecordMarkup.
        builder = ReferenceBuilder()

        # (subfield code, builder method fed with each non-empty value),
        # applied in this order.
        handlers = (
            ('o', builder.set_number),
            ('m', builder.add_misc),
            ('x', partial(builder.add_raw_reference, source='dojson')),
            ('1', builder.set_texkey),
            ('u', builder.add_url),
            ('r', builder.add_report_number),
            ('s', builder.set_pubnote),
            ('p', builder.set_publisher),
            ('y', builder.set_year),
            ('i', builder.add_uid),
            ('b', builder.add_uid),
            ('a', builder.add_uid),
            ('c', builder.add_collaboration),
            ('q', builder.add_title),
            ('t', builder.add_title),
            ('h', builder.add_refextract_authors_str),
            ('e', partial(builder.add_author, role='ed.')),
        )

        for code, handler in handlers:
            for item in force_force_list(field.get(code)):
                if item:
                    handler(item)

        if '0' in field:
            builder.set_record(
                get_record_ref(get_int_value(field, '0'), 'literature'))

        return builder.obj

    fields = force_force_list(value)

    collected = self.get('references', [])
    collected.extend(_build_reference(field) for field in fields)

    return collected
def experiments2marc(self, key, values):
    """Information about experiments.

    Appends one MARC dict per non-empty experiment to the ``693`` values
    already on ``self`` and returns the list.

    FIXME: use the flatten decorator once DoJSON 1.3.0 is released.
    """
    def _to_marc(experiment):
        marc = {
            'e': experiment.get('name'),
            's': experiment.get('start_year'),
            'd': experiment.get('end_year'),
        }

        # ``z`` is only emitted for experiments flagged as current.
        if experiment.get('current'):
            marc['z'] = 'current'

        recid = get_recid_from_ref(experiment.get('record', None))
        if recid:
            marc['0'] = recid

        return marc

    marc_experiments = self.get('693', [])

    for experiment in force_force_list(values):
        if not experiment:
            continue
        marc_experiments.append(_to_marc(experiment))

    return marc_experiments
def authors2marc(self, key, value):
    """Main Entry-Personal Name.

    Returns the MARC dict of the first author; any further authors are
    written, in order, to ``self["700"]``.
    """
    def _marc_author(author):
        return {
            'a': author.get('full_name'),
            'e': author.get('role'),
            'q': author.get('alternative_name'),
            'i': author.get('inspire_id'),
            'j': author.get('orcid'),
            'm': author.get('email'),
            'u': [
                affiliation.get('value')
                for affiliation in author.get('affiliations', [])
            ],
            'x': get_recid_from_ref(author.get('record')),
            'y': author.get('curated_relation')
        }

    all_authors = force_force_list(value)
    additional_authors = all_authors[1:]

    if additional_authors:
        self["700"] = []
        for additional_author in additional_authors:
            self["700"].append(_marc_author(additional_author))

    return _marc_author(all_authors[0])
def name(self, key, value):
    """Name information.

    Only the first incoming field is used. Its ``d`` subfield is also stored
    as ``self['dates']`` unless that key is already set.

    Notes carried over from the original documentation:

    The MARC field for an author's name is split into two fields,
    `last_name` and `first_name`. The same applies to the date fields, which
    in JSON are split into `birth_year` and `death_year`.

    Admissible string values for `100__g`:
    + active
    + departed
    + retired
    + deceased

    The only accepted value in the `100__c` field is:
    + Sir

    Values accepted for `100__b`:
    + Jr.
    + Sr.
    + roman numbers (like VII)
    """
    main_field = force_force_list(value)[0]

    self.setdefault('dates', main_field.get('d'))

    result = {}
    result['value'] = main_field.get('a')
    result['numeration'] = main_field.get('b')
    result['title'] = main_field.get('c')
    result['status'] = main_field.get('g')
    result['preferred_name'] = main_field.get('q')
    return result
def get_value(value):
    """Convert one MARC reference field into a reference object.

    Every non-empty value of each known subfield is passed to the matching
    ``ReferenceBuilder`` method; a ``0`` subfield additionally sets the
    literature record reference. Returns the builder's ``obj``.
    """
    # Retrieve fields as described here:
    # https://twiki.cern.ch/twiki/bin/view/Inspire/DevelopmentRecordMarkup.
    builder = ReferenceBuilder()

    add_raw_reference = partial(builder.add_raw_reference, source='dojson')
    add_editor = partial(builder.add_author, role='ed.')

    # (subfield code, builder method), applied in this order.
    subfield_handlers = (
        ('o', builder.set_number),
        ('m', builder.add_misc),
        ('x', add_raw_reference),
        ('1', builder.set_texkey),
        ('u', builder.add_url),
        ('r', builder.add_report_number),
        ('s', builder.set_pubnote),
        ('p', builder.set_publisher),
        ('y', builder.set_year),
        ('i', builder.add_uid),
        ('b', builder.add_uid),
        ('a', builder.add_uid),
        ('c', builder.add_collaboration),
        ('q', builder.add_title),
        ('t', builder.add_title),
        ('h', builder.add_refextract_authors_str),
        ('e', add_editor),
    )

    for code, handler in subfield_handlers:
        for item in force_force_list(value.get(code)):
            if not item:
                continue
            handler(item)

    if '0' in value:
        literature_recid = get_int_value(value, '0')
        builder.set_record(get_record_ref(literature_recid, 'literature'))

    return builder.obj
def spires_sysnos2marc(self, key, value):
    """970 SPIRES number and new recid.

    Appends a ``{'d': recid}`` entry to the ``970`` values already on
    ``self`` for every reference that yields a truthy recid, and returns the
    list.
    """
    refs = force_force_list(value)
    marc_values = self.get('970', [])

    # Resolve every recid first, then append the truthy ones.
    recids = [get_recid_from_ref(ref) for ref in refs]
    for recid in recids:
        if recid:
            marc_values.append({'d': recid})

    return marc_values
def name_variants(self, key, value):
    """Variants of the name.

    Fields whose ``9`` source is set but is not one of ``DESY_AFF``, ``ADS``
    or ``INSPIRE`` are ignored and the current variants are returned
    untouched. Otherwise the ``a`` values are appended as a new variant and
    any ``g`` values are added to ``self['extra_words']``.
    """
    source = value.get('9')
    if source and source not in ["DESY_AFF", "ADS", "INSPIRE"]:
        return self.get('name_variants', [])

    extra_words = value.get('g')
    if extra_words:
        self.setdefault('extra_words', []).extend(
            force_force_list(extra_words))

    variants = self.get('name_variants', [])
    variants.append({
        'source': source,
        'value': force_force_list(value.get('a', [])),
    })

    return variants
def _was_not_published(json):
    """Return True when no ``publication_info`` entry has a ``page_start``
    or an ``artid`` key (vacuously True when there are no entries)."""
    publication_infos = force_force_list(get_value(json, 'publication_info'))

    return all(
        not ('page_start' in info or 'artid' in info)
        for info in publication_infos
    )
def collaboration(self, key, value):
    """Collaboration of experiment.

    Returns the shortest ``g`` name among the incoming fields (None when
    there is none). Any remaining names, ordered by length, are stored in
    ``self['collaboration_alternative_names']``.
    """
    names = [
        field["g"] for field in force_force_list(value) if 'g' in field
    ]
    names.sort(key=len)

    if not names:
        return None

    alternatives = names[1:]
    if alternatives:
        self['collaboration_alternative_names'] = alternatives

    return names[0]
def other_names(self, key, value):
    """Other variation of names.

    Usually a different form of writing the primary name. The ``a`` values
    are added to the names already stored on ``self`` and the combined list
    is returned.
    """
    known_names = self.get('other_names', [])
    for variation in force_force_list(value.get('a')):
        known_names.append(variation)

    return known_names
def _get_full_name(value):
    """Return the first ``a`` subfield, or None when it is absent.

    Logs a warning when several ``a`` values are present, since only the
    first one is kept.
    """
    candidates = force_force_list(value.get('a'))

    if len(candidates) > 1:
        logger.warning(
            'Record with mashed up authors list. Taking first author: %s',
            candidates[0])

    return candidates[0] if candidates else None