def test_get_value(double_title, single_title, empty_title):
    """Test get_value utility"""
    # A dotted path traverses into each element of the 'titles' list.
    assert len(get_value(double_title, "titles.title")) == 2
    # An explicit index selects a single element instead of a list.
    assert get_value(double_title, "titles.title[0]") == "Parton distributions with LHC data"
    assert get_value(single_title, "titles.title") == ["The Large Hadron Collider"]
    assert get_value(empty_title, "titles.title") == []
    # The third argument is returned as default when the path is missing.
    assert get_value(empty_title, "foo", {}) == {}
def _match_with_invenio_matcher(obj, eng):
    """Match ``obj.data`` against existing records via invenio-matcher.

    ``queries``, ``index`` and ``doc_type`` come from the enclosing
    scope (closure).  On any match, the matches are stored in
    ``obj.extra_data["record_matches"]`` and True is returned;
    otherwise False.
    """
    from invenio_matcher.api import match as _match

    # Default to exact matching on DOIs and arXiv ids when the caller
    # did not supply explicit matcher queries.
    if queries is None:
        queries_ = [{"type": "exact", "match": "dois.value"}, {"type": "exact", "match": "arxiv_eprints.value"}]
    else:
        queries_ = queries

    record_matches = {
        "recids": [],
        "records": [],
        # Base URL consumers use to build links to matched records.
        "base_url": os.path.join(current_app.config["SERVER_NAME"], "record"),
    }

    # Minimal projection of the record: only the fields the queries use.
    record = {}
    record["dois.value"] = get_value(obj.data, "dois.value")
    record["arxiv_eprints.value"] = get_value(obj.data, "arxiv_eprints.value")

    for matched_record in _match(record, queries=queries_, index=index, doc_type=doc_type):
        matched_recid = matched_record.record.get("id")
        record_matches["recids"].append(matched_recid)
        record_matches["records"].append({"source": matched_record.record.dumps(), "score": matched_record.score})

    if len(record_matches["recids"]) > 0:
        obj.extra_data["record_matches"] = record_matches
        return True
    return False
def curation_ticket_context(user, obj):
    """Build the template context for a literature curation ticket.

    :param user: user the ticket is created for (``user.email`` is read).
    :param obj: workflow object; reads ``obj.data`` and ``obj.extra_data``.
    :returns: dict with recid, record_url, link_to_pdf, email,
        references, user_comment and subject keys.
    """
    recid = obj.extra_data.get('recid')
    record_url = obj.extra_data.get('url')

    arxiv_ids = get_value(obj.data, 'arxiv_eprints.value') or []
    for index, arxiv_id in enumerate(arxiv_ids):
        # Post-2007 arXiv ids lack the category prefix, so add one.
        if arxiv_id and is_arxiv_post_2007(arxiv_id):
            arxiv_ids[index] = 'arXiv:{0}'.format(arxiv_id)

    report_numbers = get_value(obj.data, 'report_numbers.value') or []
    dois = [
        "doi:{0}".format(doi)
        for doi in get_value(obj.data, 'dois.value') or []
    ]

    # Guard with a default so a missing 'submission_data' key does not
    # raise AttributeError on the chained .get() calls.
    submission_data = obj.extra_data.get('submission_data') or {}
    link_to_pdf = submission_data.get('pdf')

    subject = ' '.join(
        filter(lambda x: x is not None,
               arxiv_ids + dois + report_numbers + ['(#{0})'.format(recid)]))

    references = submission_data.get('references')
    user_comment = submission_data.get('extra_comments')

    return dict(recid=recid,
                record_url=record_url,
                link_to_pdf=link_to_pdf,
                email=user.email,
                references=references,
                user_comment=user_comment,
                subject=subject)
def get_description(obj):
    """Get the description column part.

    Renders the Holding Pen description template from the record's
    identifiers, subject categories, abstract and authors.
    """
    if not isinstance(obj.data, dict):
        return "No description found."

    abstract = ""
    authors = []
    categories = []
    final_identifiers = []

    # Get identifiers
    dois = get_value(obj.data, "dois.value", [])
    if dois:
        final_identifiers.extend(dois)

    system_no = get_value(obj.data, "external_system_numbers.value", [])
    if system_no:
        final_identifiers.extend(system_no)

    # Get subject categories, adding main one first. Order matters here.
    record_categories = get_value(obj.data, "arxiv_eprints.categories", []) + \
        get_value(obj.data, "subject_terms.term", [])
    for category_list in record_categories:
        # arXiv category entries are lists; subject terms are strings.
        if isinstance(category_list, list):
            categories.extend(category_list)
        else:
            categories.append(category_list)
    # OrderedDict.fromkeys deduplicates while preserving first-seen order.
    categories = list(OrderedDict.fromkeys(categories))  # Unique only

    abstract = get_value(obj.data, "abstracts.value", [""])[0]
    authors = obj.data.get("authors", [])
    return render_template('inspire_workflows/styles/harvesting_record.html',
                           object=obj,
                           authors=authors,
                           categories=categories,
                           abstract=abstract,
                           identifiers=final_identifiers)
def curation_ticket_context(user, obj):
    """Build the template context for a curation ticket (form data flavour).

    :param user: user the ticket is created for (``user.email`` is read).
    :param obj: workflow object; reads ``obj.data`` and ``obj.extra_data``.
    :returns: dict with recid, record_url, link_to_pdf, email,
        references, user_comment and subject keys.
    """
    recid = obj.extra_data.get('recid')
    record_url = obj.extra_data.get('url')

    arxiv_ids = get_value(obj.data, 'arxiv_eprints.value') or []
    for index, arxiv_id in enumerate(arxiv_ids):
        # Post-2007 arXiv ids lack the category prefix, so add one.
        if arxiv_id and is_arxiv_post_2007(arxiv_id):
            arxiv_ids[index] = 'arXiv:{0}'.format(arxiv_id)

    report_numbers = get_value(obj.data, 'report_numbers.value') or []
    dois = [
        "doi:{0}".format(doi)
        for doi in get_value(obj.data, 'dois.value') or []
    ]

    # Resolve 'formdata' once with a default so a missing key cannot
    # raise AttributeError (the original references lookup had no default).
    formdata = obj.extra_data.get('formdata', {})
    link_to_pdf = formdata.get('url')

    subject = ' '.join(filter(
        lambda x: x is not None,
        arxiv_ids + dois + report_numbers + ['(#{0})'.format(recid)]
    ))

    references = formdata.get('references')
    user_comment = formdata.get('extra_comments', '')

    return dict(
        recid=recid,
        record_url=record_url,
        link_to_pdf=link_to_pdf,
        email=user.email,
        references=references,
        user_comment=user_comment,
        subject=subject
    )
def prepare_magpie_payload(record, corpus):
    """Prepare payload to send to Magpie API.

    :param record: record (dict-like) providing titles and abstracts.
    :param corpus: name of the Magpie corpus to classify against.
    :returns: dict with 'text' and 'corpus' keys.
    """
    payload = dict(text="", corpus=corpus)
    # list() so the results are concatenable with '+' on Python 3,
    # where filter() returns a lazy iterator instead of a list.
    titles = list(filter(None, get_value(record, "titles.title", [])))
    abstracts = list(filter(None, get_value(record, "abstracts.value", [])))
    # NOTE(review): .encode('utf-8') produces bytes; joining with a str
    # separator is Python-2-only behaviour — confirm before porting.
    payload["text"] = ". ".join(
        [part.encode('utf-8') for part in titles + abstracts])
    return payload
def _structure_data(struct):
    """Map an INSPIRE institution record onto HAL structure fields."""
    return {
        'type': get_value(struct, "collections[1].primary", "").lower(),
        # ^^ FIXME: This may not be one of the HAL accepted values:
        # institution, department, laboratory or researchteam
        'name': get_value(struct, "institution[0]", ""),
        # NOTE(review): default here is [] while the sibling fields
        # default to "" — confirm downstream consumers expect a list.
        'address': get_value(struct, "address[0].original_address", []),
        'country': get_value(struct, "address[0].country_code", ""),
        'recid': get_recid_from_ref(struct['self']),
    }
def _get_wfs_same_source(obj, eng):
    """Return True if a stored workflow shares obj's acquisition source.

    Reads the workflow id list from ``obj.extra_data[extra_data_key]``
    (``extra_data_key`` comes from the enclosing scope).  The source
    comparison is case-insensitive.
    """
    # Default to '' so a record without an acquisition source does not
    # raise AttributeError on .lower().  Note: two sourceless records
    # will then compare equal ('' == '').
    current_source = get_value(obj.data, 'acquisition_source.source', '').lower()

    # dict.get with a default instead of try/except KeyError.
    workflows = obj.extra_data.get(extra_data_key, [])

    for wf_id in workflows:
        wf = workflow_object_class.get(wf_id)
        wf_source = get_value(wf.data, 'acquisition_source.source', '').lower()
        if wf_source == current_source:
            return True
    return False
def is_experimental_paper(obj, eng):
    """Check if the record is an experimental paper.

    Collects the primary arXiv categories and the legacy field
    categories and tests them against the known experimental ones.
    """
    categories = list(get_value(obj.data, "arxiv_eprints.categories", [[]])[0]) + \
        list(get_value(obj.data, "field_categories.term", []))
    experimental_categories = {
        "hep-ex", "nucl-ex", "astro-ph", "astro-ph.IM", "astro-ph.CO",
        "astro-ph.EP", "astro-ph.GA", "astro-ph.HE", "astro-ph.SR",
        "physics.ins-det", "Experiment-HEP", "Experiment-Nucl",
        "Astrophysics", "Instrumentation",
    }
    # Set intersection: one pass over the record's categories instead of
    # scanning the list once per candidate category.
    return bool(experimental_categories.intersection(categories))
def populate_inspire_subjects(sender, json, *args, **kwargs):
    """Populate the INSPIRE subjects before indexing.

    Adds the `facet_inspire_subjects` key to the record, to be used for
    faceting in the search interface.

    Signal receiver signature: ``sender`` is the signal emitter and
    ``json`` is the record payload mutated in place.
    """
    json['facet_inspire_subjects'] = get_value(json, 'inspire_categories.term')
def get_subject(record):
    """Return the first non-empty INSPIRE category term, if any."""
    categories = force_force_list(get_value(record, 'inspire_categories'))
    # Short-circuit on the first truthy 'term'; fall through to an
    # implicit None when there is none.
    for category in categories:
        term = category.get('term')
        if term:
            return term
def _conference_data(conf): ref = replace_refs(conf, 'db') # FIXME: Add conference city, country, and country code fields if ref: return {'type': "conference", 'name': get_value(ref, "titles[0].title", ""), 'acronym': get_value(ref, "acronym[0]", ""), 'opening_date': get_value(ref, "opening_date", ""), 'closing_date': get_value(ref, "closing_date", "")} else: return {'type': "conference", 'name': "", 'acronym': "", 'opening_date': "", 'closing_date': ""}
def test_get_value_returns_single_title():
    """get_value on an empty 'titles' list yields an empty list."""
    # NOTE(review): name says 'single title' but the fixture is empty —
    # consider renaming.
    empty_titles = InspireRecord({'titles': []})

    expected = []
    result = get_value(empty_titles, "titles.title")

    assert expected == result
def prepare_payload(record):
    """Prepare payload to send to Beard API.

    :param record: record (dict-like) providing titles, abstracts and
        arXiv categories.
    :returns: dict with 'title', 'abstract' and 'categories' keys.
    """
    payload = dict(title="", abstract="", categories=[])
    # list() so indexing and truthiness work on Python 3, where
    # filter() returns a lazy iterator.
    titles = list(filter(None, get_value(record, "titles.title", [])))
    # FIXME May have to normalize categories in the future
    # Each arxiv_eprints.categories entry is a list; keep only the
    # primary (first) category of each eprint.  A comprehension instead
    # of map-over-filter: a map object is always truthy on Python 3,
    # which would defeat the `if arxiv_categories` check below.
    arxiv_categories = [
        categories[0]
        for categories in get_value(record, "arxiv_eprints.categories", [])
        if categories
    ]
    if titles:
        payload['title'] = titles[0]
    abstracts = list(filter(None, get_value(record, "abstracts.value", [])))
    if abstracts:
        payload['abstract'] = abstracts[0]
    if arxiv_categories:
        payload['categories'] = arxiv_categories
    return payload
def test_get_value_returns_empty_dic_when_there_are_no_titles():
    """A missing path with no explicit default yields an empty dict."""
    empty_titles = InspireRecord({'titles': []})

    expected = {}
    result = get_value(empty_titles, "foo")

    assert expected == result
def newreview():
    """View for INSPIRE author new form review by a cataloger.

    Expects an ``objectid`` request parameter identifying the Holding
    Pen workflow object; aborts with 400 when it is missing.
    """
    objectid = request.values.get('objectid', 0, type=int)
    if not objectid:
        abort(400)

    workflow_metadata = WorkflowUIRecord.get_record(objectid)['metadata']

    # Converting json to populate form
    workflow_metadata['extra_comments'] = get_value(
        workflow_metadata,
        '_private_notes[0].value'
    )
    convert_for_form(workflow_metadata)

    form = AuthorUpdateForm(
        data=workflow_metadata,
        is_review=True)
    ctx = {
        "action": url_for('.reviewhandler', objectid=objectid),
        "name": "authorUpdateForm",
        "id": "authorUpdateForm",
        "objectid": objectid
    }

    return render_template('authors/forms/review_form.html', form=form, **ctx)
def test_get_value_returns_single_title():
    """get_value on an empty 'titles' list yields an empty list."""
    # NOTE(review): name says 'single title' but the fixture is empty —
    # consider renaming.
    empty_titles = Record({'titles': []})

    expected = []
    result = get_value(empty_titles, "titles.title")

    assert expected == result
def test_get_value_returns_empty_dic_when_there_are_no_titles():
    """A missing path with no explicit default yields an empty dict."""
    empty_titles = Record({'titles': []})

    expected = {}
    result = get_value(empty_titles, "foo")

    assert expected == result
def references(self, key, value):
    """Produce list of references.

    dojson rule: converts each MARC reference field into a reference
    dict via ReferenceBuilder and appends to any references already
    accumulated on ``self``.
    """
    value = force_force_list(value)

    def get_value(value):
        # NOTE: intentionally shadows the module-level ``get_value``
        # within this rule; it builds one reference dict from one MARC
        # field.
        # Retrieve fields as described here:
        # https://twiki.cern.ch/twiki/bin/view/Inspire/DevelopmentRecordMarkup.
        rb = ReferenceBuilder()
        # Map each MARC subfield code to the builder method that
        # consumes it.
        mapping = [('o', rb.set_number),
                   ('m', rb.add_misc),
                   ('x', partial(rb.add_raw_reference, source='dojson')),
                   ('1', rb.set_texkey),
                   ('u', rb.add_url),
                   ('r', rb.add_report_number),
                   ('s', rb.set_pubnote),
                   ('p', rb.set_publisher),
                   ('y', rb.set_year),
                   ('i', rb.add_uid),
                   ('b', rb.add_uid),
                   ('a', rb.add_uid),
                   ('c', rb.add_collaboration),
                   ('q', rb.add_title),
                   ('t', rb.add_title),
                   ('h', rb.add_refextract_authors_str),
                   ('e', partial(rb.add_author, role='ed.'))]
        for field, method in mapping:
            # Subfields may repeat; feed every non-empty occurrence.
            for element in force_force_list(value.get(field)):
                if element:
                    method(element)
        if '0' in value:
            # Subfield 0 carries the recid of the referenced record.
            recid = get_int_value(value, '0')
            rb.set_record(get_record_ref(recid, 'literature'))
        return rb.obj

    references = self.get('references', [])
    references.extend(get_value(v) for v in value)
    return references
def populate_experiment_suggest(sender, json, *args, **kwargs):
    """Populates experiment_suggest field of experiment records.

    Signal receiver: mutates ``json`` in place, adding the completion
    suggester payload built from experiment names and title variants.
    """
    # FIXME: Use a dedicated method when #1355 will be resolved.
    # Default to '' so records without a $schema do not raise TypeError
    # on the containment check.
    if 'experiments.json' in (json.get('$schema') or ''):
        experiment_names = get_value(json, 'experiment_names.title')
        title_variants = force_list(
            get_value(json, 'title_variants.title'))

        json.update({
            'experiment_suggest': {
                'input': experiment_names + title_variants,
                'output': experiment_names[0],
                'payload': {'$ref': get_value(json, 'self.$ref')},
            },
        })
def is_arxiv_paper(obj, *args, **kwargs):
    """Check if the record is from arXiv."""
    # Either an arXiv id or any arXiv category marks the record as
    # coming from arXiv.
    has_arxiv_id = bool(get_arxiv_id(obj.data))
    has_arxiv_categories = bool(get_value(obj.data, 'arxiv_eprints.categories'))
    return has_arxiv_id or has_arxiv_categories
def _was_not_published(json):
    """True when no publication_info entry carries page_start or artid.

    Vacuously True when there is no publication_info at all.
    """
    def _published(publication_info):
        return 'page_start' in publication_info or 'artid' in publication_info

    publication_infos = force_force_list(get_value(json, 'publication_info'))
    # all(not p) is equivalent to not any(p).
    return not any(_published(info) for info in publication_infos)
def is_arxiv_paper(obj, *args, **kwargs):
    """Check if the record is from arXiv."""
    # Guard clauses: either signal is enough to answer True.
    if get_clean_arXiv_id(obj.data):
        return True
    if get_value(obj.data, 'arxiv_eprints.categories'):
        return True
    return False
def thesis_supervisors2marc(self, key, value):
    """Thesis supervisors.

    FIXME: handle recids to 701__z.
    """
    # 701__a: supervisor name; 701__u: affiliation value(s).
    return {
        'a': value.get('full_name'),
        'u': get_value(value, 'affiliations.value'),
    }
def prepare_payload(record):
    """Prepare payload to send to Beard API.

    :param record: record (dict-like) providing titles, abstracts and
        arXiv categories.
    :returns: dict with 'title', 'abstract' and 'categories' keys.
    """
    payload = dict(title="", abstract="", categories=[])
    # list() so indexing and truthiness work on Python 3, where
    # filter() returns a lazy iterator.
    titles = list(filter(None, get_value(record, "titles.title", [])))
    # FIXME May have to normalize categories in the future
    # Each arxiv_eprints.categories entry is a list; keep only the
    # primary (first) category of each eprint.  A comprehension instead
    # of map-over-filter: a map object is always truthy on Python 3,
    # which would defeat the `if arxiv_categories` check below.
    arxiv_categories = [
        categories[0]
        for categories in get_value(record, "arxiv_eprints.categories", [])
        if categories
    ]
    if titles:
        payload['title'] = titles[0]
    abstracts = list(filter(None, get_value(record, "abstracts.value", [])))
    if abstracts:
        payload['abstract'] = abstracts[0]
    if arxiv_categories:
        payload['categories'] = arxiv_categories
    return payload
def get_title(cls, obj, **kwargs):
    """Return the value to put in the title column of Holding Pen."""
    if isinstance(obj.data, dict):
        # Show the first title that evaluates to True.
        for title in get_value(obj.data, "titles.title", []):
            if title:
                return title
    return "No title available"
def populate_experiment_suggest(sender, json, *args, **kwargs):
    """Populates experiment_suggest field of experiment records.

    Signal receiver: mutates ``json`` in place, adding the completion
    suggester payload built from experiment names and title variants.
    """
    # FIXME: Use a dedicated method when #1355 will be resolved.
    # Default to '' so records without a $schema do not raise TypeError
    # on the containment check.
    if 'experiments.json' in (json.get('$schema') or ''):
        experiment_names = get_value(json, 'experiment_names.title')
        title_variants = force_force_list(
            get_value(json, 'title_variants.title'))

        json.update({
            'experiment_suggest': {
                'input': experiment_names + title_variants,
                'output': experiment_names[0],
                'payload': {
                    '$ref': get_value(json, 'self.$ref')
                },
            },
        })
def _get_author(self):
    """Return list of name(s) of the author(s).

    Supervisors are excluded; corporate authors are appended after the
    personal ones.
    """
    def _is_supervisor(author):
        # An author is a supervisor when any contributor role value
        # equals 'Supervision'.
        contributor_roles = force_force_list(get_value(author, "contributor_roles.value"))
        return "Supervision" in contributor_roles

    result = []
    # Insert a space after abbreviated initials, e.g. "J.X." -> "J. X."
    spacinginitials = re.compile(r"([A-Z][a-z]{0,1}[}]?\.)(\b|[-\{])")
    authors = force_force_list(get_value(self.record, "authors"))
    non_supervisors = [el for el in authors if not _is_supervisor(el)]
    result.extend([spacinginitials.sub(r"\1 \2", el["full_name"]) for el in non_supervisors if "full_name" in el])

    corporate_authors = force_force_list(get_value(self.record, "corporate_author"))
    result.extend(corporate_authors)
    return result
def _parse_authors(record):
    """Attach a HumanName 'parsed_name' to each author of the record.

    Mutates the author dicts in place and returns them; authors
    without a 'full_name' key are left untouched.
    """
    authors = get_value(record, "authors", [])
    for author in authors:
        try:
            parsed = HumanName(author['full_name'])
            author['parsed_name'] = parsed
        except KeyError:
            # No 'full_name' on this author: skip it.
            continue
    return authors
def test_get_value_returns_none_on_index_error():
    """An out-of-range index in the path yields None, not IndexError."""
    single_title = InspireRecord({
        'titles': [{
            'title': 'Importance of a consistent choice of alpha(s) in the matching of AlpGen and Pythia',
        }],
    })

    assert get_value(single_title, 'titles.title[1]') is None
def add_arxiv_categories(record, blob):
    """Merge arXiv categories from MARC field 65017 into the record.

    :param record: partially-built record; mutated in place.
    :param blob: raw MARC blob the record was built from.
    :returns: the (possibly updated) record.
    """
    if not record.get('arxiv_eprints') or not blob.get('65017'):
        return record

    for category in force_list(get_value(blob, '65017')):
        if category.get('2') == 'arXiv' and category.get('a'):
            # setdefault guards against the first eprint missing a
            # 'categories' key, which would otherwise raise KeyError.
            record['arxiv_eprints'][0].setdefault('categories', []).append(category['a'])

    return record
def _lookup(record, value):
    """Searches a key in a record.

    Uses `get_value` to lookup a key in a JSON record, and raises a
    `KeyError` if it wasn't found.
    """
    result = get_value(record, value)
    if not result:
        # Include the failing path in the exception to ease debugging;
        # callers catching KeyError are unaffected.
        raise KeyError(value)
    return result
def test_get_value_returns_none_on_index_error():
    """An out-of-range index in the path yields None, not IndexError."""
    single_title = Record({
        'titles': [
            {
                'title': 'Importance of a consistent choice of alpha(s) in the matching of AlpGen and Pythia',
            }
        ],
    })

    assert get_value(single_title, 'titles.title[1]') is None
def match_by_doi(record):
    """Match by DOIs."""
    # Accumulate recids from an exact 0247 query per DOI; a set
    # deduplicates hits shared between DOIs.
    matches = set()
    for doi in get_value(record, 'dois.value', []):
        matches.update(search('0247:"{0}"'.format(doi)))
    return list(matches)
def was_already_harvested(record):
    """Return True if the record was already harvested.

    We use the following heuristic: if the record belongs to one of the
    CORE categories then it was probably ingested in some other way.
    """
    # Hoist the config lookup out of the loop.
    accepted = current_app.config.get('INSPIRE_ACCEPTED_CATEGORIES', [])
    for category in get_value(record, 'inspire_categories.term', []):
        if category.lower() in accepted:
            return True
    # Explicit False instead of the implicit None (still falsy for
    # existing truthiness-based callers).
    return False
def match_by_doi(record):
    """Match by DOIs."""
    # Build one exact 0247 query per DOI, then union all hits.
    queries = ('0247:"{0}"'.format(doi)
               for doi in get_value(record, "dois.value", []))
    result = set()
    for query in queries:
        result.update(search(query))
    return list(result)
def was_already_harvested(record):
    """Return True if the record was already harvested.

    We use the following heuristic: if the record belongs to one of the
    CORE categories then it was probably ingested in some other way.
    """
    # Hoist the config lookup out of the loop.
    accepted = current_app.config.get("INSPIRE_ACCEPTED_CATEGORIES", [])
    for category in get_value(record, "field_categories.term", []):
        if category.lower() in accepted:
            return True
    # Explicit False instead of the implicit None (still falsy for
    # existing truthiness-based callers).
    return False
def _get_hep_record_brief(hep_record):
    """Build a brief summary dict of a HEP record.

    Always contains 'control_number' and 'title'; the remaining keys
    are added only when present on the record.
    """
    brief = {
        'control_number': hep_record['control_number'],
        'title': get_value(hep_record, 'titles[0].title'),
    }

    abstract = get_value(hep_record, 'abstracts[0].value')
    if abstract is not None:
        brief['abstract'] = abstract

    arxiv_eprint = get_value(hep_record, 'arxiv_eprints[0].value')
    if arxiv_eprint is not None:
        brief['arxiv_eprint'] = arxiv_eprint

    number_of_pages = get_value(hep_record, 'number_of_pages')
    if number_of_pages is not None:
        brief['number_of_pages'] = number_of_pages

    earliest_date = get_value(hep_record, 'earliest_date')
    if earliest_date is not None:
        brief['earliest_date'] = earliest_date

    authors = hep_record.get('authors')
    if authors is not None:
        # Keep the full count but only the first three author names.
        brief['authors_count'] = len(authors)
        author_briefs = []
        for author in authors[:3]:
            author_briefs.append({'full_name': author['full_name']})
        brief['authors'] = author_briefs

    public_notes = hep_record.get('public_notes')
    if public_notes is not None:
        public_notes_value = []
        for public_note in public_notes:
            public_notes_value.append({'value': public_note['value']})
        brief['public_notes'] = public_notes_value

    publication_info = hep_record.get('publication_info')
    if publication_info is not None:
        brief['publication_info'] = publication_info

    return brief
def _match_with_invenio_matcher(obj, eng):
    """Match ``obj.data`` against existing records via invenio-matcher.

    ``queries``, ``index`` and ``doc_type`` come from the enclosing
    scope (closure).  On any match, the matches are stored in
    ``obj.extra_data["record_matches"]`` and True is returned;
    otherwise False.
    """
    from invenio_matcher.api import match as _match

    # Default to exact matching on DOIs and arXiv ids when the caller
    # did not supply explicit matcher queries.
    if queries is None:
        queries_ = [
            {'type': 'exact', 'match': 'dois.value'},
            {'type': 'exact', 'match': 'arxiv_eprints.value'}
        ]
    else:
        queries_ = queries

    record_matches = {
        "recids": [],
        "records": [],
        # Base URL consumers use to build links to matched records.
        "base_url": os.path.join(
            current_app.config["SERVER_NAME"],
            'record'
        )
    }

    # Minimal projection of the record: only the fields the queries use.
    record = {}
    record['dois.value'] = get_value(obj.data, 'dois.value')
    record['arxiv_eprints.value'] = get_value(
        obj.data, 'arxiv_eprints.value'
    )

    for matched_record in _match(
        record,
        queries=queries_,
        index=index,
        doc_type=doc_type
    ):
        matched_recid = matched_record.record.get('id')
        record_matches['recids'].append(matched_recid)
        record_matches['records'].append({
            "source": matched_record.record.dumps(),
            "score": matched_record.score
        })

    if len(record_matches['recids']) > 0:
        obj.extra_data["record_matches"] = record_matches
        return True
    return False
def _conference_data(conf):
    """Extract HAL conference metadata from a conference record reference."""
    # Start from the empty defaults and overwrite whatever the resolved
    # reference provides.
    data = {
        'type': "conference",
        'name': "",
        'acronym': "",
        'opening_date': "",
        'closing_date': "",
    }
    ref = replace_refs(conf, 'db')
    # FIXME: Add conference city, country, and country code fields
    if ref:
        data['name'] = get_value(ref, "titles[0].title", "")
        data['acronym'] = get_value(ref, "acronym[0]", "")
        data['opening_date'] = get_value(ref, "opening_date", "")
        data['closing_date'] = get_value(ref, "closing_date", "")
    return data
def thesis_supervisors2marc(self, key, value):
    """Thesis supervisors.

    FIXME: handle recids to 701__z.
    """
    # contributor_roles is a list of {'value': ...} objects, so test the
    # extracted role values rather than the raw dicts: the original
    # `'Supervision' in value.get('contributor_roles', [])` compared a
    # string against dicts and could never be True.
    roles = get_value(value, 'contributor_roles.value') or []
    _is_supervisor = 'Supervision' in roles
    if _is_supervisor:
        return {
            'a': value.get('full_name'),
            'u': get_value(value, 'affiliations.value'),
        }
def test_get_value_allows_indexes_in_paths():
    """A trailing [n] index selects the n-th element of the list path."""
    record = {
        'titles': [
            {'title': 'first title'},
            {'title': 'second title'},
        ],
    }

    expected = 'second title'
    result = get_value(record, 'titles.title[1]')

    assert expected == result
def _get_author(self):
    """Return list of name(s) of the author(s).

    Supervisors are excluded; corporate authors are appended after the
    personal ones.
    """
    def _is_supervisor(author):
        # An author is a supervisor when any contributor role value
        # equals 'Supervision'.
        contributor_roles = force_force_list(
            get_value(author, 'contributor_roles.value'))
        return 'Supervision' in contributor_roles

    result = []
    # Insert a space after abbreviated initials, e.g. "J.X." -> "J. X."
    spacinginitials = re.compile(r'([A-Z][a-z]{0,1}[}]?\.)(\b|[-\{])')
    authors = force_force_list(get_value(self.record, 'authors'))
    non_supervisors = [el for el in authors if not _is_supervisor(el)]
    result.extend(
        [spacinginitials.sub(r'\1 \2', el['full_name'])
         for el in non_supervisors if 'full_name' in el])

    corporate_authors = force_force_list(
        get_value(self.record, 'corporate_author'))
    result.extend(corporate_authors)
    return result
def _get_author(self):
    """Return list of name(s) of the author(s).

    Supervisors are excluded; corporate authors are appended after the
    personal ones.
    """
    def _is_supervisor(author):
        # An author is a supervisor when any contributor role value
        # equals 'Supervision'.
        contributor_roles = force_list(
            get_value(author, 'contributor_roles.value'))
        return 'Supervision' in contributor_roles

    result = []
    # Insert a space after abbreviated initials, e.g. "J.X." -> "J. X."
    spacinginitials = re.compile(r'([A-Z][a-z]{0,1}[}]?\.)(\b|[-\{])')
    authors = force_list(get_value(self.record, 'authors'))
    non_supervisors = [el for el in authors if not _is_supervisor(el)]
    result.extend(
        [spacinginitials.sub(r'\1 \2', el['full_name'])
         for el in non_supervisors if 'full_name' in el])

    corporate_authors = force_list(
        get_value(self.record, 'corporate_author'))
    result.extend(corporate_authors)
    return result
def new_ticket_context(user, obj):
    """Context for literature new tickets.

    :param user: user the ticket is created for (``user.email`` is read).
    :param obj: workflow object; reads ``obj.data`` and ``obj.extra_data``.
    :returns: dict used to render the ticket template.
    """
    title = get_title(obj.data)
    subject = "Your suggestion to INSPIRE: {0}".format(title)
    # Resolve 'submission_data' once with a default so a missing key
    # does not raise AttributeError (the original extra_comments lookup
    # had no default, unlike the references lookup).
    submission_data = obj.extra_data.get('submission_data', {})
    user_comment = submission_data.get('extra_comments')
    identifiers = get_value(obj.data, "external_system_numbers.value") or []
    return dict(
        email=user.email,
        title=title,
        identifier=identifiers or "",
        user_comment=user_comment,
        references=submission_data.get("references"),
        object=obj,
        subject=subject)
def test_get_value_returns_the_two_titles():
    """A dotted path over a two-element list yields both titles."""
    double_title = Record({
        "titles": [{
            "title": "Importance of a consistent choice of alpha(s) in the matching of AlpGen and Pythia"
        }, {
            "title": "Monte Carlo tuning in the presence of Matching"
        }],
    })

    expected = 2
    result = len(get_value(double_title, "titles.title"))

    assert expected == result
def test_get_value_returns_the_selected_title():
    """An explicit [0] index selects the first title only."""
    double_title = InspireRecord({
        "titles": [{
            "title": "Importance of a consistent choice of alpha(s) in the matching of AlpGen and Pythia"
        }, {
            "title": "Monte Carlo tuning in the presence of Matching"
        }],
    })

    expected = 'Importance of a consistent choice of alpha(s) in the matching of AlpGen and Pythia'
    result = get_value(double_title, "titles.title[0]")

    assert expected == result
def test_references_can_be_updated(app, records_to_be_merged):
    """After merging 222 into 111, $refs pointing at 222 are rewritten."""
    merged_record = get_db_record('lit', 111)
    deleted_record = get_db_record('lit', 222)
    deleted_record.merge(merged_record)

    # Rewrite every reference from the deleted record's URL to the
    # merged record's URL.
    update_refs.delay('http://localhost:5000/api/literature/222',
                      'http://localhost:5000/api/literature/111')

    pointing_record = get_db_record('lit', 333)

    expected = 'http://localhost:5000/api/literature/111'
    result = get_value(pointing_record, 'accelerator_experiments[0].record.$ref')

    assert expected == result
def add_inspire_category(record, blob):
    """Derive inspire_categories from arXiv categories when absent.

    No-op when the record has no arXiv eprints or already carries
    INSPIRE categories.  Mutates and returns ``record``.
    """
    if not record.get('arxiv_eprints') or record.get('inspire_categories'):
        return record

    record.setdefault('inspire_categories', [])
    # NOTE(review): elsewhere in this codebase 'arxiv_eprints.categories'
    # yields one *list* of categories per eprint — confirm classify_field
    # accepts a list here rather than a single category string.
    for arxiv_category in get_value(record, 'arxiv_eprints.categories', default=[]):
        inspire_category = classify_field(arxiv_category)
        if inspire_category:
            record['inspire_categories'].append({
                'term': inspire_category,
                'source': 'arxiv',
            })

    return record
def pending_in_holding_pen(obj, eng):
    """Check if a record exists in HP by looking in given KB.

    Searches the Holding Pen index for records sharing any identifier
    with ``obj`` (per HOLDING_PEN_MATCH_MAPPING), excluding ``obj``
    itself.  Returns True only when at least one match is still not
    COMPLETED; match ids are stored in ``obj.extra_data``.
    """
    from elasticsearch_dsl import Q
    from invenio_db import db
    from invenio_search import RecordsSearch
    from invenio_workflows.models import WorkflowObjectModel, ObjectStatus

    config = current_app.config['WORKFLOWS_UI_REST_ENDPOINT']
    index = config.get('search_index')
    doc_type = config.get('search_type')
    searcher = RecordsSearch(
        index=index, doc_type=doc_type
    ).params(version=True)

    identifiers = []
    for field, lookup in six.iteritems(
            current_app.config.get("HOLDING_PEN_MATCH_MAPPING", {})):
        # Add quotes around to make the search exact
        identifiers += ['{0}:"{1}"'.format(field, i)
                        for i in get_value(obj.data, lookup, [])]

    # Search for any existing record in Holding Pen, exclude self
    if identifiers:
        search = searcher.query(Q('query_string',
                                  query=" OR ".join(identifiers),
                                  allow_leading_wildcard=False))
        search_result = search.execute()
        id_list = [int(hit.id) for hit in search_result.hits]
        matches_excluding_self = set(id_list) - set([obj.id])
        if matches_excluding_self:
            obj.extra_data["holdingpen_ids"] = list(matches_excluding_self)
            # Of the matches, keep only workflows that are still pending
            # (not COMPLETED).
            pending_records = db.session.query(
                WorkflowObjectModel
            ).with_entities(WorkflowObjectModel.id).filter(
                WorkflowObjectModel.status != ObjectStatus.COMPLETED,
                WorkflowObjectModel.id.in_(matches_excluding_self)
            ).all()
            if pending_records:
                pending_ids = [o[0] for o in pending_records]
                obj.extra_data['pending_holdingpen_ids'] = pending_ids
                obj.log.info(
                    "Pending records already found in Holding Pen ({0})"
                    .format(
                        pending_ids
                    )
                )
                return True
    return False