def populate_affiliation_suggest(record):
    """Populate the ``affiliation_suggest`` field of Institution records.

    Gathers every name the institution is known by (ICNs, acronyms,
    hierarchy names, name variants, postal codes, bare UMR numbers) into
    the ``input`` list consumed by the completion suggester.
    """
    ICN = record.get('ICN', [])
    institution_acronyms = get_value(record, 'institution_hierarchy.acronym', default=[])
    institution_names = get_value(record, 'institution_hierarchy.name', default=[])
    legacy_ICN = record.get('legacy_ICN', '')
    name_variants = force_list(get_value(record, 'name_variants.value', default=[]))
    postal_codes = force_list(get_value(record, 'addresses.postal_code', default=[]))

    # XXX: this is needed by the curators so they can search using only
    # the number of a "UMR ..." name variant.
    extract_numbers_from_umr = []
    for name in name_variants:
        match = re.match(r'UMR\s', name, re.IGNORECASE)
        if match:
            # Slice off the matched prefix instead of str.replace, which
            # would also strip later occurrences of the same substring.
            umr_number = name[match.end():]
            extract_numbers_from_umr.append(umr_number)

    input_values = []
    input_values.extend(ICN)
    input_values.extend(institution_acronyms)
    input_values.extend(institution_names)
    input_values.append(legacy_ICN)
    input_values.extend(name_variants)
    input_values.extend(postal_codes)
    input_values.extend(extract_numbers_from_umr)
    # Drop empty strings/None so the suggester never indexes blanks.
    input_values = [el for el in input_values if el]

    record['affiliation_suggest'] = {
        'input': input_values,
    }
def populate_bookautocomplete(sender, json, *args, **kwargs):
    """Populate the ``bookautocomplete`` field of Literature records.

    Only applies to HEP records whose ``document_type`` includes ``book``.
    """
    if not is_hep(json):
        return

    if 'book' not in json.get('document_type', []):
        return

    paths = [
        'imprints.date',
        'imprints.publisher',
        'isbns.value',
    ]

    authors = force_list(get_value(json, 'authors.full_name', default=[]))
    titles = force_list(get_value(json, 'titles.title', default=[]))

    # Collect imprint/ISBN values first, then authors and titles,
    # dropping empty entries at the end.
    collected = []
    for path in paths:
        collected.extend(force_list(get_value(json, path, default=[])))
    collected.extend(authors)
    collected.extend(titles)
    collected = [entry for entry in collected if entry]

    ref = get_value(json, 'self.$ref')

    json.update({
        'bookautocomplete': {
            'input': collected,
            'payload': {
                'authors': authors,
                'id': ref,
                'title': titles,
            },
        },
    })
def populate_affiliation_suggest(sender, json, *args, **kwargs):
    """Populate the ``affiliation_suggest`` field of Institution records.

    Signal receiver variant: mutates ``json`` in place, adding the
    suggester ``input``/``output``/``payload`` structure.
    """
    # ``$schema`` may be missing: default to '' so the membership test does
    # not raise ``TypeError: argument of type 'NoneType' is not iterable``.
    if 'institutions.json' not in json.get('$schema', ''):
        return

    ICN = json.get('ICN', [])
    institution_acronyms = get_value(json, 'institution_hierarchy.acronym', default=[])
    institution_names = get_value(json, 'institution_hierarchy.name', default=[])
    legacy_ICN = json.get('legacy_ICN', '')
    name_variants = force_list(get_value(json, 'name_variants.value', default=[]))
    postal_codes = force_list(get_value(json, 'addresses.postal_code', default=[]))

    input_values = []
    input_values.extend(ICN)
    input_values.extend(institution_acronyms)
    input_values.extend(institution_names)
    input_values.append(legacy_ICN)
    input_values.extend(name_variants)
    input_values.extend(postal_codes)
    # Drop empty strings/None so the suggester never indexes blanks.
    input_values = [el for el in input_values if el]

    json.update({
        'affiliation_suggest': {
            'input': input_values,
            'output': legacy_ICN,
            'payload': {
                '$ref': get_value(json, 'self.$ref'),
                'ICN': ICN,
                'institution_acronyms': institution_acronyms,
                'institution_names': institution_names,
                'legacy_ICN': legacy_ICN,
            },
        },
    })
def get_dois(self, data):
    """Normalize ``dois`` into a list of ``{'value': ...}`` dicts.

    Records without a ``control_number`` carry bare DOI strings, records
    with one carry ``{'value': ...}`` dicts already.
    """
    raw_dois = data.get('dois', None)
    if raw_dois:
        # Bare strings when there is no control_number, dicts otherwise.
        path = 'dois[0].value' if data.get('control_number') else 'dois[0]'
        data['dois'] = force_list(
            {'value': get_value(data, path, default=missing)})
    return data.get('dois', missing)
def get_arxiv_eprints(self, data):
    """Normalize arXiv identifiers into the ``arxiv_eprint`` key.

    Accepts either a bare ``arxiv_eprint`` value or an
    ``arxiv_eprints`` list; the latter is removed after conversion.
    """
    bare_eprint = data.pop('arxiv_eprint', None)
    eprint_list = data.get('arxiv_eprints')

    if bare_eprint:
        data['arxiv_eprint'] = force_list({'value': bare_eprint})
    elif eprint_list:
        first_value = get_value(data, 'arxiv_eprints[0].value', default=missing)
        data['arxiv_eprint'] = force_list({'value': first_value})

    data.pop('arxiv_eprints', None)
    return data.get('arxiv_eprint', missing)
def get_collection(marc_record):
    """Return the primary collection of a legacy MARC record.

    ``DELETED`` wins over everything; otherwise the first collection found
    in ``REAL_COLLECTIONS`` is returned, falling back to ``HEP``.
    """
    collections = {
        entry.upper().strip()
        for field in force_list(marc_record.get('980__'))
        for subfield_values in field.values()
        for entry in force_list(subfield_values)
    }

    if 'DELETED' in collections:
        return 'DELETED'

    for collection in collections:
        if collection in REAL_COLLECTIONS:
            return collection

    return 'HEP'
def is_published(record):
    """Return whether a record is published.

    A record counts as published if it is citeable (enough information in
    ``publication_info``) or if it has a DOI together with a
    ``journal_title``, which means it is in press.

    Args:
        record(InspireRecord): a record.

    Returns:
        bool: whether the record is published.

    Examples:
        >>> record = {
        ...     'dois': [
        ...         {'value': '10.1016/0029-5582(61)90469-2'},
        ...     ],
        ...     'publication_info': [
        ...         {'journal_title': 'Nucl.Phys.'},
        ...     ],
        ... }
        >>> is_published(record)
        True

    """
    has_citeable_pubinfo = (
        'publication_info' in record and is_citeable(record['publication_info'])
    )
    is_in_press = 'dois' in record and any(
        'journal_title' in entry
        for entry in force_list(record.get('publication_info'))
    )
    return has_citeable_pubinfo or is_in_press
def get_linked_records_in_field(record, field_path):
    """Get all linked records in a given field.

    Args:
        record (dict): the record containing the links
        field_path (string): a dotted field path specification understandable
            by ``get_value``, containing a json reference to another record.

    Returns:
        Iterator[dict]: an iterator on the linked records.

    Warning:
        Currently, the order in which the linked records are yielded is
        different from the order in which they appear in the record.

    Example:
        >>> record = {'references': [
        ...     {'record': {'$ref': 'https://labs.inspirehep.net/api/literature/1234'}},
        ...     {'record': {'$ref': 'https://labs.inspirehep.net/api/data/421'}},
        ... ]}
        >>> get_linked_records_in_field(record, 'references.record')
        [...]
    """
    # Resolve every ``$ref`` URI under the given path to a PID, then fetch
    # all the referenced records from the database in a single call.
    full_path = '.'.join([field_path, '$ref'])
    pids = force_list([get_pid_from_record_uri(rec) for rec in get_value(record, full_path, [])])
    return get_db_records(pids)
def map_refextract_to_schema(extracted_references, source=None):
    """Convert refextract output to the schema using the builder."""
    mapped = []

    for extracted in extracted_references:
        builder = ReferenceBuilder()
        # (refextract key, builder method) pairs, applied in this order.
        field_handlers = [
            ('author', builder.add_refextract_authors_str),
            ('collaboration', builder.add_collaboration),
            ('doi', builder.add_uid),
            ('hdl', builder.add_uid),
            ('isbn', builder.add_uid),
            ('journal_reference', builder.set_pubnote),
            ('linemarker', builder.set_label),
            ('misc', builder.add_misc),
            ('publisher', builder.set_publisher),
            ('raw_ref', lambda raw_ref: builder.add_raw_reference(raw_ref, source=source)),
            ('reportnumber', builder.add_report_number),
            ('texkey', builder.set_texkey),
            ('title', builder.add_title),
            ('url', builder.add_url),
            ('year', builder.set_year),
        ]
        for field, handler in field_handlers:
            for element in force_list(extracted.get(field)):
                if element:
                    handler(element)
        # refextract may emit the same URL more than once.
        if get_value(builder.obj, 'reference.urls'):
            builder.obj['reference']['urls'] = dedupe_list_of_dicts(builder.obj['reference']['urls'])
        mapped.append(builder.obj)

    return mapped
def get_resolved_references_by_control_number(self, data):
    """Map control numbers to their resolved reference records."""
    references = force_list(data)
    resolved = get_linked_records_in_field(
        {'references': references}, 'references.record')
    return {
        resolved_record['control_number']: resolved_record
        for resolved_record in resolved
    }
def get_control_numbers_to_resolved_experiments_map(self, data):
    """Map control numbers to their resolved experiment records."""
    experiments = force_list(data)
    resolved = get_linked_records_in_field(
        {'accelerator_experiments': experiments},
        'accelerator_experiments.record'
    )
    return {
        resolved_record['control_number']: resolved_record
        for resolved_record in resolved
    }
def populate_bookautocomplete(record):
    """Populate the ``bookautocomplete`` field of Literature records."""
    paths = [
        'imprints.date',
        'imprints.publisher',
        'isbns.value',
    ]

    # Imprint/ISBN values first, then author names, then titles.
    suggestions = []
    for path in paths:
        suggestions.extend(force_list(get_value(record, path, default=[])))
    suggestions.extend(force_list(get_value(record, 'authors.full_name', default=[])))
    suggestions.extend(force_list(get_value(record, 'titles.title', default=[])))

    record['bookautocomplete'] = {
        'input': [entry for entry in suggestions if entry],
    }
def get_and_format_references(record):
    """Format references.

    Renders each reference of ``record`` through the references template,
    pairing it with the citation count of the resolved record.

    .. deprecated:: 2018-06-07
    """
    out = []
    references = record.get('references')
    if references:
        # Only references that were resolved to a recid can be fetched.
        reference_recids = [
            str(ref['recid']) for ref in references if ref.get('recid')
        ]

        resolved_references = get_es_records(
            'lit',
            reference_recids,
            _source=[
                'authors',
                'citation_count',
                'collaboration',
                'control_number',
                'corporate_author',
                'earliest_date',
                'publication_info',
                'titles',
            ]
        )

        # Create mapping to keep reference order
        recid_to_reference = {
            ref['control_number']: ref for ref in resolved_references
        }
        for reference in references:
            row = []
            # Unresolved references render against an empty record.
            ref_record = recid_to_reference.get(
                reference.get('recid'), {}
            )
            # Flatten the nested ``reference`` dict into the top level
            # so the template can access its keys directly.
            if 'reference' in reference:
                reference.update(reference['reference'])
                del reference['reference']
            if 'publication_info' in reference:
                reference['publication_info'] = force_list(
                    reference['publication_info']
                )
            row.append(render_template_to_string(
                'inspirehep_theme/references.html',
                record=ref_record,
                reference=reference
            ))
            row.append(ref_record.get('citation_count', ''))
            out.append(row)

    return out
def populate_author_suggest(record, *args, **kwargs):
    """Populate the ``author_suggest`` field of Authors records."""
    author_paths = [
        'name.preferred_name',
        'name.previous_names',
        'name.name_variants',
        'name.native_names',
        'name.value',
    ]

    # Concatenate every name found under the paths above, in order.
    input_values = []
    for path in author_paths:
        input_values.extend(force_list(get_value(record, path)))

    record['author_suggest'] = {
        'input': input_values
    }
def populate_experiment_suggest(record):
    """Populates experiment_suggest field of experiment records."""
    experiment_paths = [
        'accelerator.value',
        'collaboration.value',
        'experiment.short_name',
        'experiment.value',
        'institutions.value',
        'legacy_name',
        'long_name',
        'name_variants',
    ]

    collected = []
    for path in experiment_paths:
        collected.extend(force_list(get_value(record, path)))

    record['experiment_suggest'] = {
        # Drop empty entries before indexing.
        'input': [entry for entry in collected if entry],
    }
def populate_earliest_date(sender, json, *args, **kwargs):
    """Populate the ``earliest_date`` field of Literature records."""
    if not is_hep(json):
        return

    date_paths = [
        'preprint_date',
        'thesis_info.date',
        'thesis_info.defense_date',
        'publication_info.year',
        'legacy_creation_date',
        'imprints.date',
    ]

    # Collect every date-like value as a string, in path order.
    dates = []
    for path in date_paths:
        dates.extend(str(entry) for entry in force_list(get_value(json, path)))

    if dates:
        result = earliest_date(dates)
        if result:
            json['earliest_date'] = result
def populate_earliest_date(record):
    """Populate the ``earliest_date`` field of Literature records."""
    date_paths = [
        'preprint_date',
        'thesis_info.date',
        'thesis_info.defense_date',
        'publication_info.year',
        'legacy_creation_date',
        'imprints.date',
    ]

    # Collect every date-like value as a string, in path order.
    collected_dates = []
    for path in date_paths:
        collected_dates.extend(
            str(entry) for entry in force_list(get_value(record, path)))

    if not collected_dates:
        return

    best = earliest_date(collected_dates)
    if best:
        record['earliest_date'] = best
def visit_less_equal_than_op(self, node, fieldnames):
    """Translate a ``<=`` operator node into ES range queries."""
    upper_bound = {'lte': node.op.value}
    return self._generate_range_queries(force_list(fieldnames), upper_bound)
def no_arxiv_in_dois(obj):
    """Return True if ``obj`` has DOIs and at least one non-arXiv source."""
    if 'dois' not in obj:
        return False
    sources = force_list(get_value(obj, 'dois.source'))
    return any(source.lower() != 'arxiv' for source in sources)
def get_titles(self, data):
    """Move a bare ``title`` into the ``titles`` list."""
    bare_title = data.pop('title', None)
    if bare_title:
        data['titles'] = force_list(bare_title)
    return data.get('titles', missing)
def _authors(key, value):
    """Build the list of ``authors`` entries from a 100/700/701 MARC field.

    A single full name yields a fully-populated author entry; multiple
    names in one field yield reduced entries sharing the field's
    affiliations and roles.
    """
    def _get_affiliations(value):
        result = []

        u_values = force_list(value.get('u'))
        z_values = force_list(value.get('z'))

        # XXX: we zip only when they have the same length, otherwise
        #      we might match a value with the wrong recid.
        if len(u_values) == len(z_values):
            for u_value, z_value in zip(u_values, z_values):
                result.append({
                    'record': get_record_ref(z_value, 'institutions'),
                    'value': u_value,
                })
        else:
            for u_value in u_values:
                result.append({'value': u_value})

        return dedupe_list(result)

    def _get_affiliations_identifiers(value):
        # ``t`` subfields look like ``SCHEMA:identifier``.
        t_values = (t_value.split(':') for t_value in dedupe_list(force_list(value.get('t'))))

        return [{
            'schema': schema.upper(),
            'value': identifier
        } for schema, identifier in t_values]

    def _get_curated_relation(value):
        # Only ``True`` or ``None``; ``None`` keeps the key out of the record.
        return value.get('y') == '1' or None

    def _get_emails(value):
        # Strip a leading ``email:`` prefix (6 characters) when present.
        return [
            el[6:] if el.startswith('email:') else el
            for el in force_list(value.get('m'))
        ]

    def _get_full_names(value):
        return force_list(value.get('a'))

    def _get_ids(value):
        def _is_jacow(j_value):
            return j_value.upper().startswith('JACOW-')

        def _is_orcid(j_value):
            return j_value.upper().startswith('ORCID:') and len(j_value) > 6

        def _is_naked_orcid(j_value):
            return ORCID.match(j_value)

        def _is_cern(j_value):
            return j_value.startswith('CCID-')

        result = []

        i_values = force_list(value.get('i'))
        for i_value in i_values:
            result.append({
                'schema': 'INSPIRE ID',
                'value': i_value,
            })

        j_values = force_list(value.get('j'))
        for j_value in j_values:
            if _is_jacow(j_value):
                result.append({
                    'schema': 'JACOW',
                    'value': 'JACoW-' + j_value[6:],
                })
            elif _is_orcid(j_value):
                # Drop the ``ORCID:`` prefix and any stray dots.
                result.append({
                    'schema': 'ORCID',
                    'value': j_value[6:].replace('.', ''),
                })
            elif _is_naked_orcid(j_value):
                result.append({
                    'schema': 'ORCID',
                    'value': j_value,
                })
            elif _is_cern(j_value):
                result.append({
                    'schema': 'CERN',
                    'value': 'CERN-' + j_value[5:],
                })

        w_values = force_list(value.get('w'))
        for w_value in w_values:
            result.append({
                'schema': 'INSPIRE BAI',
                'value': w_value,
            })

        return dedupe_list(result)

    def _get_inspire_roles(value):
        result = []

        e_values = force_list(value.get('e'))
        if any(el.lower().startswith('ed') for el in e_values):
            result.append('editor')

        # The 701 field is reserved for supervisors.
        if key.startswith('701'):
            result.append('supervisor')

        return result

    def _get_raw_affiliations(value):
        return dedupe_list([{
            'value': el
        } for el in force_list(value.get('v'))])

    def _get_record(value):
        return get_record_ref(maybe_int(force_single_element(value.get('x'))), 'authors')

    full_names = _get_full_names(value)
    if len(full_names) == 1:
        return [
            {
                'affiliations': _get_affiliations(value),
                'affiliations_identifiers': _get_affiliations_identifiers(value),
                'alternative_names': force_list(value.get('q')),
                'curated_relation': _get_curated_relation(value),
                'emails': _get_emails(value),
                'full_name': full_names[0],
                'ids': _get_ids(value),
                'inspire_roles': _get_inspire_roles(value),
                'raw_affiliations': _get_raw_affiliations(value),
                'record': _get_record(value),
            },
        ]
    else:
        # Multiple names in one field: per-author details (ids, emails,
        # record link) cannot be attributed, so emit reduced entries.
        return [{
            'affiliations': _get_affiliations(value),
            'affiliations_identifiers': _get_affiliations_identifiers(value),
            'full_name': full_name,
            'inspire_roles': _get_inspire_roles(value),
            'raw_affiliations': _get_raw_affiliations(value),
        } for full_name in full_names]
def authors2marc(self, key, value):
    """Populate the ``100`` MARC field.

    Also populates the ``700`` and the ``701`` MARC fields through side effects.
    """
    value = force_list(value)

    def _get_ids(value):
        # Split identifiers into MARC subfields: ``i`` for INSPIRE IDs,
        # ``j`` for ORCID/JACoW/CERN identifiers.
        ids = {
            'i': [],
            'j': [],
        }
        if value.get('ids'):
            for _id in value.get('ids'):
                if _id.get('schema') == 'INSPIRE ID':
                    ids['i'].append(_id.get('value'))
                elif _id.get('schema') == 'ORCID':
                    ids['j'].append('ORCID:' + _id.get('value'))
                elif _id.get('schema') == 'JACOW':
                    ids['j'].append(_id.get('value'))
                elif _id.get('schema') == 'CERN':
                    # ``CERN-`` prefix is replaced by legacy ``CCID-``.
                    ids['j'].append('CCID-' + _id.get('value')[5:])
        return ids

    def _get_affiliations(value):
        return [aff.get('value') for aff in value.get('affiliations', [])]

    def _get_affiliations_identifiers(value):
        return [
            u'{}:{}'.format(aff.get('schema'), aff.get('value'))
            for aff in value.get('affiliations_identifiers', [])
        ]

    def _get_inspire_roles(value):
        values = force_list(value.get('inspire_roles'))
        return ['ed.' for role in values if role == 'editor']

    def _get_raw_affiliations(value):
        return [aff.get('value') for aff in value.get('raw_affiliations', [])]

    def get_value_100_700(value):
        ids = _get_ids(value)
        return {
            'a': value.get('full_name'),
            'e': _get_inspire_roles(value),
            'q': value.get('alternative_names'),
            'i': ids.get('i'),
            'j': ids.get('j'),
            'm': value.get('emails'),
            't': _get_affiliations_identifiers(value),
            'u': _get_affiliations(value),
            'v': _get_raw_affiliations(value),
        }

    def get_value_701(value):
        # Supervisors get a reduced subfield set (no roles/emails).
        ids = _get_ids(value)
        return {
            'a': value.get('full_name'),
            'q': value.get('alternative_names'),
            'i': ids.get('i'),
            'j': ids.get('j'),
            'u': _get_affiliations(value),
            'v': _get_raw_affiliations(value),
        }

    if len(value) > 1:
        self["700"] = []
        self["701"] = []

    # The first author goes to 100 (returned); supervisors to 701,
    # all other additional authors to 700.
    for author in value[1:]:
        is_supervisor = 'supervisor' in author.get('inspire_roles', [])
        if is_supervisor:
            self["701"].append(get_value_701(author))
        else:
            self["700"].append(get_value_100_700(author))

    return get_value_100_700(value[0])
def get_pid_values(self):
    """Return the set of PID values found at ``pid_value_path``."""
    values = get_value(self.data, self.pid_value_path, default=[])
    if isinstance(values, (tuple, list)):
        return set(values)
    # A scalar match: wrap it before building the set.
    return set(force_list(values))
def authors(self):
    """Return the parsed authors of the record."""
    raw_authors = force_list(self.record.get("author"))
    return [self.get_author(raw_author) for raw_author in raw_authors]
def _get_emails(value):
    """Strip the ``email:`` prefix from each ``m`` subfield value."""
    emails = []
    for raw in force_list(value.get('m')):
        # The prefix, when present, is exactly 6 characters long.
        emails.append(raw[6:] if raw.startswith('email:') else raw)
    return emails
def _get_affiliations_identifiers(value):
    """Parse deduplicated ``t`` subfields of the form ``SCHEMA:identifier``."""
    result = []
    for t_value in dedupe_list(force_list(value.get('t'))):
        # Split on the first colon only: identifiers may contain colons.
        schema, identifier = t_value.split(':', 1)
        result.append({'schema': schema.upper(), 'value': identifier})
    return result
def ranks(self, key, value):
    """Populate the ``ranks`` key."""
    raw_ranks = force_list(value.get('a'))
    return [normalize_rank(raw_rank) for raw_rank in raw_ranks]
def force_single_element(obj):
    """Force an object to a list and return the first element."""
    elements = force_list(obj)
    return elements[0] if elements else None
def serialize(self, pid, record, links_factory=None):
    """Return different metrics for a given author recid.

    :param pid: Persistent identifier instance.
    :param record: Record instance.
    :param links_factory: Factory function for the link generation,
        which are added to the response.

    Returns a JSON string with citation/publication counts, per-type
    counts, h-index, i10-index, fields and top keywords.
    """
    author_pid = pid.pid_value
    fields = set()
    keywords = []
    statistics = {}
    statistics['citations'] = 0
    statistics['publications'] = 0
    statistics['types'] = {}
    statistics_citations = {}

    query = Q('match', authors__recid=author_pid)
    search = LiteratureSearch().query('nested', path='authors', query=query)\
        .params(_source=[
            'citation_count',
            'control_number',
            'facet_inspire_doc_type',
            'facet_inspire_categories',
            'keywords',
        ])

    for result in search.scan():
        result_source = result.to_dict()

        # Increment the count of the total number of publications.
        statistics['publications'] += 1

        # Increment the count of citations.
        citation_count = result_source.get('citation_count', 0)
        statistics['citations'] += citation_count
        statistics_citations[result_source['control_number']] = \
            citation_count

        # Count how many times each type of publication was published.
        # Reset on every iteration: previously the value leaked from the
        # prior record (or was undefined on the first one) when
        # ``facet_inspire_doc_type`` was empty.
        publication_type = None
        doc_types = result_source.get('facet_inspire_doc_type', [])
        if doc_types:
            publication_type = doc_types[0]
        if publication_type:
            statistics['types'][publication_type] = \
                statistics['types'].get(publication_type, 0) + 1

        # Get fields.
        for field in result_source.get('facet_inspire_categories', []):
            fields.add(field)

        # Get keywords, ignoring the automatic-keywords marker.
        keywords.extend([
            k for k in force_list(
                get_value(result_source, 'keywords.value'))
            if k != '* Automatic Keywords *'])

    # Calculate h-index together with i10-index.
    statistics['hindex'] = calculate_h_index(statistics_citations)
    statistics['i10index'] = calculate_i10_index(statistics_citations)

    if fields:
        statistics['fields'] = list(fields)

    # Return the top 25 keywords.
    if keywords:
        counter = Counter(keywords)
        statistics['keywords'] = [{
            'count': count,
            'keyword': keyword
        } for keyword, count in counter.most_common(25)]

    return json.dumps(statistics)
def _is_not_for_hal(value):
    """Return True when a ``c`` subfield opts the record out of HAL."""
    return any(el.upper() == 'NOT HAL' for el in force_list(value.get('c')))
def _is_for_cds(value):
    """Return True when a ``c`` subfield marks the record for CDS."""
    return any(el.upper() == 'CDS' for el in force_list(value.get('c')))
def historical_data(self, key, value):
    """Populate the ``historical_data`` key from the ``a`` subfields."""
    a_values = value.get('a')
    return force_list(a_values)
def formdata_to_model(obj, formdata):
    """Manipulate form data to match literature data model.

    Builds a literature record out of the submission form fields using
    ``LiteratureBuilder``, stashes a non-arXiv fulltext URL into
    ``obj.extra_data['submission_pdf']``, validates and returns the record.
    """
    def _is_arxiv_url(url):
        return 'arxiv.org' in url

    form_fields = copy.deepcopy(formdata)
    filter_empty_elements(
        form_fields, ['authors', 'supervisors', 'report_numbers'])

    builder = LiteratureBuilder(source='submitter')

    for author in form_fields.get('authors', []):
        builder.add_author(
            builder.make_author(
                author['full_name'],
                affiliations=force_list(author['affiliation'])
                if author['affiliation'] else None,
                roles=['author']))

    for supervisor in form_fields.get('supervisors', []):
        # Fix: test the supervisor's own affiliation; previously this read
        # ``author['affiliation']``, the stale variable from the loop above.
        builder.add_author(
            builder.make_author(
                supervisor['full_name'],
                affiliations=force_list(supervisor['affiliation'])
                if supervisor['affiliation'] else None,
                roles=['supervisor']))

    builder.add_title(title=form_fields.get('title'))

    document_type = 'conference paper' if form_fields.get('conf_name') \
        else form_fields.get('type_of_doc', [])
    if document_type == 'chapter':
        document_type = 'book chapter'
    builder.add_document_type(document_type=document_type)

    builder.add_abstract(
        abstract=form_fields.get('abstract'),
        source='arXiv' if form_fields.get('categories') else None)

    if form_fields.get('arxiv_id') and form_fields.get('categories'):
        builder.add_arxiv_eprint(
            arxiv_id=form_fields.get('arxiv_id'),
            arxiv_categories=form_fields.get('categories').split())

    builder.add_doi(doi=form_fields.get('doi'))

    builder.add_inspire_categories(
        subject_terms=form_fields.get('subject_term'),
        source='user')

    for key in ('extra_comments', 'nonpublic_note', 'hidden_notes',
                'conf_name', 'references'):
        builder.add_private_note(private_notes=form_fields.get(key))

    year = form_fields.get('year')
    try:
        year = int(year)
    except (TypeError, ValueError):
        year = None

    builder.add_preprint_date(
        preprint_date=form_fields.get('preprint_created'))

    if form_fields.get('type_of_doc') == 'thesis':
        builder.add_thesis(
            defense_date=form_fields.get('defense_date'),
            degree_type=form_fields.get('degree_type'),
            institution=form_fields.get('institution'),
            date=form_fields.get('thesis_date'))

    if form_fields.get('type_of_doc') == 'chapter':
        if not form_fields.get('journal_title'):
            builder.add_book_series(title=form_fields.get('series_title'))

    if form_fields.get('type_of_doc') == 'book':
        if form_fields.get('journal_title'):
            form_fields['volume'] = form_fields.get('series_volume')
        else:
            builder.add_book_series(title=form_fields.get('series_title'),
                                    volume=form_fields.get('series_volume'))
        builder.add_book(
            publisher=form_fields.get('publisher_name'),
            place=form_fields.get('publication_place'),
            date=form_fields.get('publication_date'))

    builder.add_publication_info(
        year=year,
        cnum=form_fields.get('conference_id'),
        journal_issue=form_fields.get('issue'),
        journal_title=form_fields.get('journal_title'),
        journal_volume=form_fields.get('volume'),
        page_start=form_fields.get('start_page'),
        page_end=form_fields.get('end_page'),
        artid=form_fields.get('artid'),
        parent_record=form_fields.get('parent_book'))

    builder.add_accelerator_experiments_legacy_name(
        legacy_name=form_fields.get('experiment'))

    language = form_fields.get('other_language') \
        if form_fields.get('language') == 'oth' \
        else form_fields.get('language')
    builder.add_language(language=language)

    builder.add_title_translation(title=form_fields.get('title_translation'))

    builder.add_title(title=form_fields.get('title_arXiv'), source='arXiv')
    builder.add_title(title=form_fields.get('title_crossref'),
                      source='crossref')

    builder.add_license(url=form_fields.get('license_url'))

    builder.add_public_note(public_note=form_fields.get('public_notes'))
    builder.add_public_note(
        public_note=form_fields.get('note'),
        source='arXiv' if form_fields.get('categories') else 'CrossRef')

    form_url = form_fields.get('url')
    form_additional_url = form_fields.get('additional_url')
    if form_url and not _is_arxiv_url(form_url):
        # Keep the fulltext URL for the workflow; only record it on the
        # record itself when no additional URL was given.
        obj.extra_data['submission_pdf'] = form_url
        if not form_additional_url:
            builder.add_url(url=form_url)
    if form_additional_url and not _is_arxiv_url(form_additional_url):
        builder.add_url(url=form_additional_url)

    # Plain loop instead of a side-effect-only list comprehension.
    for report_number in form_fields.get('report_numbers', []):
        builder.add_report_number(
            report_number=report_number.get('report_number'))

    builder.add_collaboration(collaboration=form_fields.get('collaboration'))

    builder.add_acquisition_source(
        datetime=datetime.datetime.utcnow().isoformat(),
        submission_number=obj.id,
        internal_uid=int(obj.id_user),
        email=form_fields.get('email'),
        orcid=form_fields.get('orcid'),
        method='submitter')

    builder.validate_record()

    return builder.record
def public_notes_500(self, key, value):
    """Populate the ``public_notes`` key."""
    source = value.get('9')
    return [
        {'source': source, 'value': public_note}
        for public_note in force_list(value.get('a'))
    ]
def license(self):
    """Return the parsed licenses of the record."""
    raw_licenses = force_list(self.record.get("license"))
    # Loop variable renamed so it no longer shadows the builtin ``license``.
    return [self.get_license(raw_license) for raw_license in raw_licenses]
def _private_notes_595(self, key, value):
    """Populate the ``_private_notes`` key."""
    source = value.get('9')
    return [
        {'source': source, 'value': _private_note}
        for _private_note in force_list(value.get('a'))
    ]
def _get_raw_affiliations(value):
    """Wrap each ``v`` subfield in a ``{'value': ...}`` dict, deduplicated."""
    wrapped = [{'value': el} for el in force_list(value.get('v'))]
    return dedupe_list(wrapped)
def formdata_to_model(obj, formdata):
    """Manipulate form data to match literature data model.

    Builds a literature record out of the submission form fields using
    ``LiteratureBuilder``, stashes a non-arXiv fulltext URL into
    ``obj.extra_data['submission_pdf']`` and returns the record.
    """
    def _is_arxiv_url(url):
        return 'arxiv.org' in url

    form_fields = copy.deepcopy(formdata)
    filter_empty_elements(
        form_fields, ['authors', 'supervisors', 'report_numbers']
    )

    builder = LiteratureBuilder(source='submitter')

    for author in form_fields.get('authors', []):
        builder.add_author(builder.make_author(
            author['full_name'],
            affiliations=force_list(author['affiliation'])
            if author['affiliation'] else None,
            roles=['author']
        ))

    for supervisor in form_fields.get('supervisors', []):
        # Fix: test the supervisor's own affiliation; previously this read
        # ``author['affiliation']``, the stale variable from the loop above.
        builder.add_author(builder.make_author(
            supervisor['full_name'],
            affiliations=force_list(supervisor['affiliation'])
            if supervisor['affiliation'] else None,
            roles=['supervisor']
        ))

    builder.add_title(title=form_fields.get('title'))

    document_type = 'conference paper' if form_fields.get('conf_name') \
        else form_fields.get('type_of_doc', [])
    if document_type == 'chapter':
        document_type = 'book chapter'
    builder.add_document_type(
        document_type=document_type
    )

    builder.add_abstract(
        abstract=form_fields.get('abstract'),
        source='arXiv' if form_fields.get('categories') else None
    )

    if form_fields.get('arxiv_id') and form_fields.get('categories'):
        builder.add_arxiv_eprint(
            arxiv_id=form_fields.get('arxiv_id'),
            arxiv_categories=form_fields.get('categories').split()
        )

    builder.add_doi(doi=form_fields.get('doi'))

    builder.add_inspire_categories(
        subject_terms=form_fields.get('subject_term'),
        source='user'
    )

    for key in ('extra_comments', 'nonpublic_note', 'hidden_notes',
                'conf_name', 'references'):
        builder.add_private_note(
            private_notes=form_fields.get(key)
        )

    year = form_fields.get('year')
    try:
        year = int(year)
    except (TypeError, ValueError):
        year = None

    builder.add_preprint_date(
        preprint_date=form_fields.get('preprint_created')
    )

    if form_fields.get('type_of_doc') == 'thesis':
        builder.add_thesis(
            defense_date=form_fields.get('defense_date'),
            degree_type=form_fields.get('degree_type'),
            institution=form_fields.get('institution'),
            date=form_fields.get('thesis_date')
        )

    if form_fields.get('type_of_doc') == 'chapter':
        if not form_fields.get('journal_title'):
            builder.add_book_series(title=form_fields.get('series_title'))

    if form_fields.get('type_of_doc') == 'book':
        if form_fields.get('journal_title'):
            form_fields['volume'] = form_fields.get('series_volume')
        else:
            builder.add_book_series(title=form_fields.get('series_title'),
                                    volume=form_fields.get('series_volume')
                                    )
        builder.add_book(
            publisher=form_fields.get('publisher_name'),
            place=form_fields.get('publication_place'),
            date=form_fields.get('publication_date'))

    builder.add_publication_info(
        year=year,
        cnum=form_fields.get('conference_id'),
        journal_issue=form_fields.get('issue'),
        journal_title=form_fields.get('journal_title'),
        journal_volume=form_fields.get('volume'),
        page_start=form_fields.get('start_page'),
        page_end=form_fields.get('end_page'),
        artid=form_fields.get('artid'),
        parent_record=form_fields.get('parent_book')
    )

    builder.add_accelerator_experiments_legacy_name(
        legacy_name=form_fields.get('experiment')
    )

    language = form_fields.get('other_language') \
        if form_fields.get('language') == 'oth' \
        else form_fields.get('language')
    builder.add_language(language=language)

    if form_fields.get('title_translation'):
        builder.add_title_translation(
            title=form_fields['title_translation'],
            language='en',
        )

    builder.add_title(
        title=form_fields.get('title_arXiv'),
        source='arXiv'
    )
    builder.add_title(
        title=form_fields.get('title_crossref'),
        source='crossref'
    )

    builder.add_license(url=form_fields.get('license_url'))

    builder.add_public_note(public_note=form_fields.get('public_notes'))
    builder.add_public_note(
        public_note=form_fields.get('note'),
        source='arXiv' if form_fields.get('categories') else 'CrossRef'
    )

    form_url = form_fields.get('url')
    form_additional_url = form_fields.get('additional_url')
    if form_url and not _is_arxiv_url(form_url):
        # Keep the fulltext URL for the workflow; only record it on the
        # record itself when no additional URL was given.
        obj.extra_data['submission_pdf'] = form_url
        if not form_additional_url:
            builder.add_url(url=form_url)
    if form_additional_url and not _is_arxiv_url(form_additional_url):
        builder.add_url(url=form_additional_url)

    # Plain loop instead of a side-effect-only list comprehension.
    for report_number in form_fields.get('report_numbers', []):
        builder.add_report_number(
            report_number=report_number.get('report_number')
        )

    builder.add_collaboration(collaboration=form_fields.get('collaboration'))

    builder.add_acquisition_source(
        datetime=datetime.datetime.utcnow().isoformat(),
        submission_number=obj.id,
        internal_uid=int(obj.id_user),
        email=form_fields.get('email'),
        orcid=form_fields.get('orcid'),
        method='submitter'
    )

    return builder.record
def _get_inspire_roles(value):
    """Emit one MARC ``ed.`` marker per 'editor' role."""
    editor_count = sum(
        1 for role in force_list(value.get('inspire_roles'))
        if role == 'editor'
    )
    return ['ed.'] * editor_count
def visit_greater_than_op(self, node, fieldnames):
    """Translate a ``>`` operator node into ES range queries."""
    lower_bound = {'gt': node.op.value}
    return self._generate_range_queries(force_list(fieldnames), lower_bound)
def get_titles(self, data):
    """Move a bare ``title`` into the ``titles`` list."""
    bare_title = data.pop('title', None)
    if not bare_title:
        return data.get('titles', missing)
    data['titles'] = force_list(bare_title)
    return data['titles']
def dois(self, key, value):
    """Populate the ``dois`` key.

    Also populates the ``persistent_identifiers`` key through side effects.
    """
    def _get_first_non_curator_source(sources):
        # Curator-added entries should not be credited as the source.
        sources_without_curator = [
            el for el in sources if el.upper() != 'CURATOR'
        ]
        return force_single_element(sources_without_curator)

    def _get_material(value):
        # Map the ``q`` subfield onto the schema's material vocabulary;
        # unknown values yield None.
        MATERIAL_MAP = {
            'addendum': 'addendum',
            'ebook': 'publication',
            'erratum': 'erratum',
            'preprint': 'preprint',
            'publication': 'publication',
            'reprint': 'reprint',
            'translation': 'translation',
        }

        q_value = force_single_element(value.get('q', ''))
        normalized_q_value = q_value.lower()
        return MATERIAL_MAP.get(normalized_q_value)

    def _is_doi(id_, type_):
        return (not type_ or type_.upper() == 'DOI') and is_doi(id_)

    def _is_handle(id_, type_):
        return (not type_ or type_.upper() == 'HDL') and is_handle(id_)

    # Accumulate on top of anything a previous field already produced.
    dois = self.get('dois', [])
    persistent_identifiers = self.get('persistent_identifiers', [])

    values = force_list(value)
    for value in values:
        id_ = force_single_element(value.get('a', ''))
        material = _get_material(value)
        schema = force_single_element(value.get('2', ''))
        sources = force_list(value.get('9'))
        source = _get_first_non_curator_source(sources)

        if _is_doi(id_, schema):
            dois.append({
                'material': material,
                'source': source,
                'value': normalize_doi(id_),
            })
        else:
            # Anything that is not a DOI ends up in
            # ``persistent_identifiers``, normalizing handles to HDL.
            schema = 'HDL' if _is_handle(id_, schema) else schema
            persistent_identifiers.append({
                'material': material,
                'schema': schema,
                'source': source,
                'value': id_,
            })

    self['persistent_identifiers'] = persistent_identifiers

    return dois
def _is_hidden(value):
    """Return True when the document is hidden, None otherwise.

    A document is hidden when an ``o`` subfield says ``HIDDEN`` or its
    source is arXiv; returning None keeps the key out of the record.
    """
    o_values = [val.upper() for val in force_list(value.get('o'))]
    if 'HIDDEN' in o_values or _get_source(value) == 'arxiv':
        return True
    return None
def _has_curator_flag(value):
    """Return True if any ``9`` subfield equals CURATOR (case-insensitive)."""
    return any(el.upper() == 'CURATOR' for el in force_list(value.get('9')))
def visit_value(self, node, fieldnames=None):
    """Translate a simple value node into the matching Elasticsearch query.

    Dispatches on the (already keyword-mapped) *fieldnames*: dates become
    range queries, authors get BAI/nested handling, journals get nested
    queries, and unknown keywords fall through to a combined texkey/_all
    search.  Defaults to searching ``_all`` when no field was given.
    """
    if not fieldnames:
        fieldnames = '_all'

    if node.contains_wildcard:
        # Wildcarded dates need special parsing before building the query.
        if ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME['date'] == fieldnames:
            return self._generate_date_with_wildcard_query(node.value)

        bai_fieldnames = self._generate_fieldnames_if_bai_query(
            node.value,
            bai_field_variation=FieldVariations.search,
            query_bai_field_if_dots_in_name=True
        )

        query = self._generate_query_string_query(
            node.value,
            fieldnames=bai_fieldnames or fieldnames,
            analyze_wildcard=True
        )

        # Author fields live under a nested mapping, so wrap accordingly.
        if ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME['author'] == fieldnames:
            return generate_nested_query(ElasticSearchVisitor.AUTHORS_NESTED_QUERY_PATH, query)
        return query
    else:
        if isinstance(fieldnames, list):
            if ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME['date'] == fieldnames:
                # Date queries with simple values are transformed into range queries, among the given and the exact
                # next date, according to the granularity of the given date.
                return self._generate_range_queries(force_list(fieldnames), {ES_RANGE_EQ_OPERATOR: node.value})

            if ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME['journal'] == fieldnames:
                return self._generate_journal_nested_queries(node.value)

            # Generic multi-field keyword: match the value across all of them.
            return {
                'multi_match': {
                    'fields': fieldnames,
                    'query': node.value,
                }
            }
        else:
            if ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME['author'] == fieldnames:
                bai_fieldnames = self._generate_fieldnames_if_bai_query(
                    node.value,
                    bai_field_variation=FieldVariations.search,
                    query_bai_field_if_dots_in_name=True
                )

                if bai_fieldnames:
                    if len(bai_fieldnames) == 1:
                        # Exact BAI: a single match query inside the nested path.
                        query = {"match": {bai_fieldnames[0]: node.value}}
                        return generate_nested_query(ElasticSearchVisitor.AUTHORS_NESTED_QUERY_PATH, query)
                    else:
                        # Not an exact BAI pattern match, but node's value looks like BAI (no spaces and dots),
                        # e.g. `S.Mele`. In this case generate a partial match query.
                        return self.visit_partial_match_value(node, bai_fieldnames)

                return self._generate_author_query(node.value)

            elif ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME['exact-author'] == fieldnames:
                return self._generate_exact_author_query(node.value)

            elif ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME['irn'] == fieldnames:
                # Legacy IRNs are stored with a 'SPIRES-' prefix.
                return {'term': {fieldnames: ''.join(('SPIRES-', node.value))}}

            elif ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME['title'] == fieldnames:
                return self._generate_title_queries(node.value)

            elif ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME['type-code'] == fieldnames:
                return self._generate_type_code_query(node.value)

            elif fieldnames not in ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME.values():
                # Unknown keyword: search the named field, the texkeys (the
                # raw 'field:value' string, boosted), and _all, as a union.
                colon_value = ':'.join([fieldnames, node.value])
                given_field_query = generate_match_query(fieldnames, node.value, with_operator_and=True)
                texkey_query = self._generate_term_query('texkeys.raw', colon_value, boost=2.0)
                _all_field_query = generate_match_query('_all', colon_value, with_operator_and=True)
                query = wrap_queries_in_bool_clauses_if_more_than_one([given_field_query, texkey_query, _all_field_query], use_must_clause=False)
                return wrap_query_in_nested_if_field_is_nested(query, fieldnames, ElasticSearchVisitor.NESTED_FIELDS)

            return generate_match_query(fieldnames, node.value, with_operator_and=True)
def references2marc(self, key, value):
    """Populate the ``999C5`` MARC field."""
    reference = value.get('reference', {})

    # ``a``: identifiers, each tagged with its scheme prefix.
    pids = force_list(reference.get('persistent_identifiers'))
    a_values = ['doi:' + el for el in force_list(reference.get('dois'))]
    a_values.extend(
        ['hdl:' + el['value'] for el in pids if el.get('schema') == 'HDL'])
    a_values.extend(
        ['urn:' + el['value'] for el in pids if el.get('schema') == 'URN'])

    # ``u``: plain URLs plus CDS/ADS links rebuilt from their record ids.
    external_ids = force_list(reference.get('external_system_identifiers'))
    u_values = force_list(get_value(reference, 'urls.value'))
    u_values.extend(
        CDS_RECORD_FORMAT.format(el['value']) for el in external_ids
        if el.get('schema') == 'CDS')
    u_values.extend(
        ADS_RECORD_FORMAT.format(el['value']) for el in external_ids
        if el.get('schema') == 'ADS')

    # Editors go to ``e``; every other author role goes to ``h``.
    authors = force_list(reference.get('authors'))
    e_values = [
        el['full_name'] for el in authors
        if el.get('inspire_role') == 'editor'
    ]
    h_values = [
        el['full_name'] for el in authors
        if el.get('inspire_role') != 'editor'
    ]

    # ``r``: report numbers; a recent arXiv eprint gets the 'arXiv:' prefix.
    r_values = force_list(reference.get('report_numbers'))
    if reference.get('arxiv_eprint'):
        arxiv_eprint = reference['arxiv_eprint']
        r_values.append(
            'arXiv:' + arxiv_eprint
            if is_arxiv_post_2007(arxiv_eprint) else arxiv_eprint)

    # NOTE: mutates ``reference`` in place — publication_info is rewritten
    # into the legacy shape before the pubnote is built from it.
    if reference.get('publication_info'):
        reference['publication_info'] = convert_new_publication_info_to_old(
            [reference['publication_info']])[0]
    journal_title = get_value(reference, 'publication_info.journal_title')
    journal_volume = get_value(reference, 'publication_info.journal_volume')
    page_start = get_value(reference, 'publication_info.page_start')
    page_end = get_value(reference, 'publication_info.page_end')
    artid = get_value(reference, 'publication_info.artid')
    s_value = build_pubnote(
        journal_title, journal_volume, page_start, page_end, artid)

    m_value = ' / '.join(force_list(reference.get('misc')))

    return {
        '0': get_recid_from_ref(value.get('record')),
        # ``9`` marks references that were curated on legacy.
        '9': 'CURATOR' if value.get('legacy_curated') else None,
        'a': a_values,
        'b': get_value(reference, 'publication_info.cnum'),
        'c': reference.get('collaborations'),
        'e': e_values,
        'h': h_values,
        'i': reference.get('isbn'),
        'k': reference.get('texkey'),
        'm': m_value,
        'o': reference.get('label'),
        'p': get_value(reference, 'imprint.publisher'),
        'q': get_value(reference, 'publication_info.parent_title'),
        'r': r_values,
        's': s_value,
        't': get_value(reference, 'title.title'),
        'u': u_values,
        'x': get_value(value, 'raw_refs.value'),
        'y': get_value(reference, 'publication_info.year'),
        'z': 1 if value.get('curated_relation') else 0,
    }
def write(data, _id=None, xpath=None, link_text=None):
    """Send *data* as keystrokes to the element found by one locator.

    Exactly one of ``_id``, ``xpath`` or ``link_text`` should identify the
    target element; ``wait_for`` blocks until it is present.
    """
    keystrokes = force_list(data)
    target = wait_for(_id=_id, xpath=xpath, link_text=link_text)
    return target.send_keys(*keystrokes)
def _get_collections(marcjson):
    """Return the lowercased collection names from the ``980__.a`` fields."""
    raw_values = force_list(get_value(marcjson, '980__.a'))
    # Each value may itself be a list, so flatten one level before lowering.
    flattened = chain.from_iterable([force_list(el) for el in raw_values])
    return [collection.lower() for collection in flattened]
def _get_full_names(value):
    """Return the ``a`` subfield (author full names) as a list."""
    full_names = value.get('a')
    return force_list(full_names)
def serialize(self, pid, record, links_factory=None):
    """Return aggregated citation metrics for a given author recid.

    Scans all Literature records authored by the author behind *pid* and
    returns a JSON string with publication/citation counts, per-type
    counts, h-index, i10-index, fields and the top-25 keywords.

    :param pid: Persistent identifier instance.
    :param record: Record instance.
    :param links_factory: Factory function for the link generation,
        which are added to the response.
    """
    author_pid = pid.pid_value
    fields = set()
    keywords = []
    statistics = {}
    statistics['citations'] = 0
    statistics['publications'] = 0
    statistics['types'] = {}
    statistics_citations = {}

    query = Q('match', authors__recid=author_pid)
    search = LiteratureSearch().query('nested', path='authors', query=query)\
        .params(_source=[
            'citation_count',
            'control_number',
            'facet_inspire_doc_type',
            'facet_inspire_categories',
            'keywords',
        ])

    for result in search.scan():
        result_source = result.to_dict()

        # Increment the count of the total number of publications.
        statistics['publications'] += 1

        # Increment the count of citations.
        citation_count = result_source.get('citation_count', 0)
        statistics['citations'] += citation_count
        statistics_citations[result_source['control_number']] = \
            citation_count

        # Count how many times certain type of publication was published.
        # BUGFIX: previously ``publication_type`` was assigned inside a
        # ``try``/``except IndexError: pass``, so a record with an empty
        # ``facet_inspire_doc_type`` raised NameError on the first result
        # and silently reused the previous result's type on later ones,
        # inflating the wrong counter.  Reset it on every iteration.
        doc_types = result_source.get('facet_inspire_doc_type', [])
        publication_type = doc_types[0] if doc_types else None
        if publication_type:
            statistics['types'][publication_type] = \
                statistics['types'].get(publication_type, 0) + 1

        # Get fields.
        for field in result_source.get('facet_inspire_categories', []):
            fields.add(field)

        # Get keywords, skipping the machine-generated marker entry.
        keywords.extend([
            k for k in force_list(get_value(result_source, 'keywords.value'))
            if k != '* Automatic Keywords *'
        ])

    # Calculate h-index together with i10-index.
    statistics['hindex'] = calculate_h_index(statistics_citations)
    statistics['i10index'] = calculate_i10_index(statistics_citations)

    if fields:
        statistics['fields'] = list(fields)

    # Return the top 25 keywords.
    if keywords:
        counter = Counter(keywords)
        statistics['keywords'] = [{
            'count': count,
            'keyword': keyword,
        } for keyword, count in counter.most_common(25)]

    return json.dumps(statistics)