def refextract(obj, eng):
    """Extract references from various sources and add them to the workflow.

    Runs ``refextract`` on both the PDF attached to the workflow and the
    references provided by the submitter, if any, then chooses the one that
    generated the most references and attaches them to the workflow object.

    Args:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        None

    """
    if 'references' in obj.data:
        extracted_raw_references = dedupe_list(
            extract_references_from_raw_refs(obj.data['references']))
        obj.log.info('Extracted %d references from raw refs.',
                     len(extracted_raw_references))
        obj.data['references'] = match_references_based_on_flag(
            extracted_raw_references)
        return

    matched_pdf_references, matched_text_references = [], []
    source = LiteratureReader(obj.data).source

    with get_document_in_workflow(obj) as tmp_document:
        if tmp_document:
            pdf_references = dedupe_list(
                extract_references_from_pdf(tmp_document, source))
            matched_pdf_references = match_references_based_on_flag(
                pdf_references)

    text = get_value(obj.extra_data, 'formdata.references')
    if text:
        text_references = dedupe_list(
            extract_references_from_text(text, source))
        matched_text_references = match_references_based_on_flag(
            text_references)

    if len(matched_pdf_references) == len(matched_text_references) == 0:
        obj.log.info('No references extracted.')
    elif len(matched_pdf_references) > len(matched_text_references):
        obj.log.info('Extracted %d references from PDF.',
                     len(matched_pdf_references))
        obj.data['references'] = matched_pdf_references
    elif len(matched_text_references) >= len(matched_pdf_references):
        obj.log.info('Extracted %d references from text.',
                     len(matched_text_references))
        obj.data['references'] = matched_text_references
def parse(self):
    """Extract an arXiv record into an Inspire HEP record.

    Returns:
        dict: the same record in the Inspire Literature schema.

    """
    self.builder.add_abstract(abstract=self.abstract, source=self.source)
    self.builder.add_title(title=self.title, source=self.source)
    for license in self.licenses:
        self.builder.add_license(**license)
    for author in self.authors:
        self.builder.add_author(author)
    self.builder.add_number_of_pages(self.number_of_pages)
    self.builder.add_publication_info(**self.publication_info)
    for collab in self.collaborations:
        self.builder.add_collaboration(collab)
    for doi in self.dois:
        self.builder.add_doi(**doi)
    self.builder.add_preprint_date(self.preprint_date)
    if self.public_note:
        self.builder.add_public_note(self.public_note, self.source)
    for rep_number in self.report_numbers:
        self.builder.add_report_number(rep_number, self.source)
    self.builder.add_arxiv_eprint(self.arxiv_eprint, self.arxiv_categories)
    self.builder.add_private_note(self.private_note)
    self.builder.add_document_type(self.document_type)
    normalized_categories = [
        classify_field(arxiv_cat) for arxiv_cat in self.arxiv_categories
    ]
    self.builder.add_inspire_categories(dedupe_list(normalized_categories), 'arxiv')

    return self.builder.record
def dedupe_all_lists(obj, exclude_keys=()):
    """Recursively remove duplicates from all lists.

    Args:
        obj: collection to deduplicate
        exclude_keys (Container[str]): key names to ignore for deduplication
    """
    squared_dedupe_len = 10
    if isinstance(obj, dict):
        new_obj = {}
        for key, value in obj.items():
            if key in exclude_keys:
                new_obj[key] = value
            else:
                new_obj[key] = dedupe_all_lists(value)
        return new_obj
    elif isinstance(obj, (list, tuple, set)):
        new_elements = [dedupe_all_lists(v) for v in obj]
        if len(new_elements) < squared_dedupe_len:
            new_obj = dedupe_list(new_elements)
        else:
            new_obj = dedupe_list_of_dicts(new_elements)
        return type(obj)(new_obj)
    else:
        return obj
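# Illustrative sketch (not part of the original module): how dedupe_all_lists is
# expected to behave, assuming dedupe_list keeps the first occurrence of each
# element and that values under excluded keys are returned untouched. The record
# and the key names below are made up for the example.
record = {
    'keywords': ['qcd', 'qcd', 'lattice'],
    'raw_refs': ['same', 'same'],  # hypothetical key we choose to exclude
}
cleaned = dedupe_all_lists(record, exclude_keys=('raw_refs',))
assert cleaned == {
    'keywords': ['qcd', 'lattice'],  # deduplicated, order preserved
    'raw_refs': ['same', 'same'],    # left alone because the key is excluded
}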
def refextract_url():
    """Run refextract on a URL."""
    if current_app.config.get("FEATURE_FLAG_ENABLE_REFEXTRACT_SERVICE"):
        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json",
        }
        data = {
            "journal_kb_data": create_journal_dict(),
            "url": request.json["url"],
        }
        response = requests.post(
            f"{current_app.config['REFEXTRACT_SERVICE_URL']}/extract_references_from_url",
            headers=headers,
            data=orjson.dumps(data),
        )
        if response.status_code != 200:
            # return the error payload with a 500 status code
            return jsonify({"message": "Can not extract references"}), 500
        extracted_references = response.json()["extracted_references"]
    else:
        extracted_references = extract_references_from_url(
            request.json["url"],
            override_kbs_files={"journals": create_journal_dict()},
            reference_format="{title},{volume},{page}",
        )
    deduplicated_extracted_references = dedupe_list(extracted_references)
    references = map_refextract_to_schema(deduplicated_extracted_references)
    match_result = match_references(references)
    return jsonify(match_result.get("matched_references"))
def test_dedupe_list():
    list_with_duplicates = ['foo', 'bar', 'foo']

    expected = ['foo', 'bar']
    result = dedupe_list(list_with_duplicates)

    assert expected == result
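# Minimal sketch of an order-preserving dedupe_list that would satisfy the test
# above; the real helper is imported from a shared utilities module, so treat
# this as an illustration of the expected behaviour, not the actual
# implementation. Membership is checked against the output list (instead of a
# set) so the same approach also works for unhashable items such as the dicts
# deduplicated elsewhere in this file.
def dedupe_list_sketch(sequence):
    result = []
    for item in sequence:
        if item not in result:  # keep only the first occurrence
            result.append(item)
    return result

assert dedupe_list_sketch(['foo', 'bar', 'foo']) == ['foo', 'bar']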
def match_references_by_uuids(literature_uuids):
    record_json = type_coerce(RecordMetadata.json, JSONB)
    has_references = record_json.has_key("references")  # noqa: W601
    selected_uuids = RecordMetadata.id.in_(literature_uuids)
    not_deleted = or_(
        # exclude deleted records in case some were deleted after the uuids
        # were fetched by the caller
        not_(record_json.has_key("deleted")),  # noqa: W601
        not_(record_json["deleted"] == cast(True, JSONB)),
    )
    with_references_query = RecordMetadata.query.filter(
        selected_uuids, has_references, not_deleted
    )

    for record_metadata in with_references_query.all():
        references = record_metadata.json["references"]
        match_result = match_references(references)
        if not match_result["any_link_modified"]:
            continue

        literature = LiteratureRecord(record_metadata.json, model=record_metadata)
        literature["references"] = dedupe_list(match_result["matched_references"])
        literature.update(dict(literature))
        db.session.commit()

        added_recids = match_result["added_recids"]
        removed_recids = match_result["removed_recids"]
        LOGGER.info(
            "References are matched",
            uuid=record_metadata.id,
            recid=record_metadata.json["control_number"],
            added_recids=added_recids,
            added_recid_count=len(added_recids),
            removed_recids=removed_recids,
            removed_recid_count=len(removed_recids),
        )
def fuzzy_match(obj, eng):
    """Return ``True`` if a similar record is found in the system.

    Uses a custom configuration for ``inspire-matcher`` to find records
    similar to the current workflow object's payload in the system.

    Also sets the ``matches.fuzzy`` property in ``extra_data`` to the list of
    briefs of the first 5 records that matched.

    Arguments:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        bool: ``True`` if the workflow object has a duplicate in the system,
        ``False`` otherwise.

    """
    if not current_app.config.get('FEATURE_FLAG_ENABLE_FUZZY_MATCHER'):
        return False

    fuzzy_match_config = current_app.config['FUZZY_MATCH']
    matches = dedupe_list(match(obj.data, fuzzy_match_config))
    record_ids = [_get_hep_record_brief(el['_source']) for el in matches]
    obj.extra_data.setdefault('matches', {})['fuzzy'] = record_ids[0:5]

    return bool(record_ids)
def article_exists(obj, eng):
    """Return ``True`` if the record is already present in the system.

    Uses the default configuration of the ``inspire-matcher`` to find
    duplicates of the current workflow object in the system.

    Also sets the ``record_matches`` property in ``extra_data`` to the list of
    control numbers that matched.

    Arguments:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        bool: ``True`` if the workflow object has a duplicate in the system,
        ``False`` otherwise.

    """
    matches = dedupe_list(match(obj.data))
    record_ids = [el['_source']['control_number'] for el in matches]
    if record_ids:
        obj.extra_data['record_matches'] = record_ids
        return True

    obj.extra_data['record_matches'] = []
    return False
def already_pending_in_holdingpen_validator(property_name, value):
    """Check if there's a submission in the holdingpen with the same arXiv ID or DOI."""
    if property_name == 'arXiv ID':
        query_should = {
            'metadata.arxiv_eprints.value.raw': value,
        }
    elif property_name == 'DOI':
        query_should = {
            'metadata.dois.value.raw': value,
        }

    query = {
        "query": {
            "bool": {
                "filter": [
                    {
                        "term": {
                            "metadata.acquisition_source.source": "submitter"
                        },
                    },
                    {
                        "bool": {
                            "must_not": {
                                "term": {
                                    "_workflow.status": "COMPLETED"
                                }
                            }
                        }
                    }
                ],
                "must": [
                    {
                        "term": query_should,
                    }
                ]
            }
        },
        "_source": {
            "includes": [
                "_id"
            ]
        }
    }

    hits = es.search(
        index='holdingpen-hep',
        doc_type='hep',
        body=query,
    )['hits']['hits']
    matches = dedupe_list(hits)

    holdingpen_ids = [int(el['_id']) for el in matches]
    if holdingpen_ids:
        raise ValidationError(
            'There exists already a pending suggestion with the same %s '
            '"%s", it will be attended to shortly.' % (property_name, value)
        )
def add_citation_counts(chunk_size=500, request_timeout=120):
    def _get_records_to_update_generator(citations_lookup):
        with click.progressbar(citations_lookup.iteritems()) as bar:
            for uuid, citation_count in bar:
                yield {
                    '_op_type': 'update',
                    '_index': index,
                    '_type': doc_type,
                    '_id': str(uuid),
                    'doc': {'citation_count': citation_count}
                }

    index, doc_type = schema_to_index('records/hep.json')
    citations_lookup = Counter()

    click.echo('Extracting all citations...')
    with click.progressbar(
            es_scan(
                es,
                query={
                    '_source': 'references.recid',
                    'filter': {
                        'exists': {
                            'field': 'references.recid'
                        }
                    },
                    'size': LARGE_CHUNK_SIZE
                },
                scroll=u'2m',
                index=index,
                doc_type=doc_type)) as records:
        for record in records:
            unique_refs_ids = dedupe_list(
                list(
                    chain.from_iterable(
                        map(force_list,
                            get_value(record, '_source.references.recid')))))

            for unique_refs_id in unique_refs_ids:
                citations_lookup[unique_refs_id] += 1
    click.echo('... DONE.')

    click.echo('Mapping recids to UUIDs...')
    citations_lookup = _build_recid_to_uuid_map(citations_lookup)
    click.echo('... DONE.')

    click.echo('Adding citation numbers...')
    success, failed = es_bulk(
        es,
        _get_records_to_update_generator(citations_lookup),
        chunk_size=chunk_size,
        raise_on_exception=False,
        raise_on_error=False,
        request_timeout=request_timeout,
        stats_only=True,
    )
    click.echo(
        '... DONE: {} records updated with success. {} failures.'.format(
            success, failed))
def _get_affiliations_identifiers(value):
    t_values = (
        t_value.split(':')
        for t_value in dedupe_list(force_list(value.get('t')))
    )

    return [
        {'schema': schema.upper(), 'value': identifier}
        for schema, identifier in t_values
    ]
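# Illustrative only: the ``t`` subfield is assumed to hold ``SCHEMA:identifier``
# strings, so a made-up MARC value like the one below would map as shown, with
# the duplicate dropped before splitting.
assert _get_affiliations_identifiers({'t': ['GRID:grid.9132.9', 'GRID:grid.9132.9']}) == [
    {'schema': 'GRID', 'value': 'grid.9132.9'},
]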
def arxiv_categories(self):
    categories = self.root.xpath('.//categories/text()').extract_first(default='[]')
    categories = categories.split()
    categories_without_old = [
        normalize_arxiv_category(arxiv_cat) for arxiv_cat in categories
    ]

    return dedupe_list(categories_without_old)
def extract_references_from_file(path,
                                 recid=None,
                                 reference_format=u"{title} {volume} ({year}) {page}",
                                 linker_callback=None,
                                 override_kbs_files=None):
    """Extract references from a local pdf file.

    The first parameter is the path to the file.
    It returns a list of parsed references.

    It raises FullTextNotAvailableError if the file does not exist, and
    UnknownDocumentTypeError if it is neither a PDF nor plain text.

    The standard reference format is: {title} {volume} ({year}) {page}.
    You can change it by passing ``reference_format``, e.g.:

    >>> extract_references_from_file(path, reference_format=u"{title},{volume},{page}")

    If you want to also link each reference to some other resource (like a
    record), you can provide a linker_callback function to be executed for
    every reference element found.

    To override KBs for journal names etc., use ``override_kbs_files``:

    >>> extract_references_from_file(path, override_kbs_files={'journals': 'my/path/to.kb'})

    """
    if not os.path.isfile(path):
        raise FullTextNotAvailableError(u"File not found: '{0}'".format(path))

    docbody = get_plaintext_document_body(path)
    reflines, dummy, dummy = extract_references_from_fulltext(docbody)
    if not reflines:
        docbody = get_plaintext_document_body(path, keep_layout=True)
        reflines, dummy, dummy = extract_references_from_fulltext(docbody)

    parsed_refs, stats = parse_references(
        reflines,
        recid=recid,
        reference_format=reference_format,
        linker_callback=linker_callback,
        override_kbs_files=override_kbs_files,
    )

    if magic.from_file(path, mime=True) == "application/pdf":
        extracted_texkeys_urls = extract_texkeys_and_urls_from_pdf(path)
        if len(extracted_texkeys_urls) == len(parsed_refs):
            parsed_refs_updated = []
            for ref, ref_texkey_urls in zip(parsed_refs, extracted_texkeys_urls):
                update_reference_with_urls(ref, ref_texkey_urls.get('urls', []))
                if ref.get('url'):
                    ref['url'] = dedupe_list(ref['url'])
                parsed_refs_updated.append(
                    dict(ref, texkey=[ref_texkey_urls['texkey']]))

            return parsed_refs_updated
    return parsed_refs
def refextract(obj, eng):
    """Extract references from various sources and add them to the workflow.

    Runs ``refextract`` on both the PDF attached to the workflow and the
    references provided by the submitter, if any, then chooses the one that
    generated the most references and attaches them to the workflow object.

    Args:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        None

    """
    if 'references' in obj.data:
        extracted_raw_references = dedupe_list(
            extract_references_from_raw_refs(obj.data['references']))
        obj.log.info('Extracted %d references from raw refs.',
                     len(extracted_raw_references))
        obj.data['references'] = match_references(extracted_raw_references)
        return

    matched_pdf_references, matched_text_references = [], []
    source = LiteratureReader(obj.data).source

    with get_document_in_workflow(obj) as tmp_document:
        if tmp_document:
            pdf_references = dedupe_list(
                extract_references_from_pdf(tmp_document, source))
            matched_pdf_references = match_references(pdf_references)

    text = get_value(obj.extra_data, 'formdata.references')
    if text:
        text_references = dedupe_list(
            extract_references_from_text(text, source))
        matched_text_references = match_references(text_references)

    if len(matched_pdf_references) == len(matched_text_references) == 0:
        obj.log.info('No references extracted.')
    elif len(matched_pdf_references) > len(matched_text_references):
        obj.log.info('Extracted %d references from PDF.',
                     len(matched_pdf_references))
        obj.data['references'] = matched_pdf_references
    elif len(matched_text_references) >= len(matched_pdf_references):
        obj.log.info('Extracted %d references from text.',
                     len(matched_text_references))
        obj.data['references'] = matched_text_references
def references(self, key, value):
    """Populate the ``references`` key."""
    def _has_curator_flag(value):
        normalized_nine_values = [el.upper() for el in force_list(value.get('9'))]
        return 'CURATOR' in normalized_nine_values

    def _is_curated(value):
        return force_single_element(value.get('z')) == '1' and _has_curator_flag(value)

    def _set_record(el):
        recid = maybe_int(el)
        record = get_record_ref(recid, 'literature')
        rb.set_record(record)

    rb = ReferenceBuilder()
    mapping = [
        ('0', _set_record),
        ('a', rb.add_uid),
        ('b', rb.add_uid),
        ('c', rb.add_collaboration),
        ('e', partial(rb.add_author, role='ed.')),
        ('h', rb.add_refextract_authors_str),
        ('i', rb.add_uid),
        ('k', rb.set_texkey),
        ('m', rb.add_misc),
        ('o', rb.set_label),
        ('p', rb.set_publisher),
        ('q', rb.add_parent_title),
        ('r', rb.add_report_number),
        ('s', rb.set_pubnote),
        ('t', rb.add_title),
        ('x', rb.add_raw_reference),
        ('y', rb.set_year),
    ]

    for field, method in mapping:
        for el in force_list(value.get(field)):
            if el:
                method(el)

    for el in dedupe_list(force_list(value.get('u'))):
        if el:
            rb.add_url(el)

    if _is_curated(value):
        rb.curate()

    if _has_curator_flag(value):
        rb.obj['legacy_curated'] = True

    return rb.obj
def _get_ids(value):
    def _is_jacow(j_value):
        return j_value.upper().startswith('JACOW-')

    def _is_orcid(j_value):
        return j_value.upper().startswith('ORCID:') and len(j_value) > 6

    def _is_naked_orcid(j_value):
        return ORCID.match(j_value)

    def _is_cern(j_value):
        return j_value.startswith('CCID-')

    result = []

    i_values = force_list(value.get('i'))
    for i_value in i_values:
        result.append({
            'schema': 'INSPIRE ID',
            'value': i_value,
        })

    j_values = force_list(value.get('j'))
    for j_value in j_values:
        if _is_jacow(j_value):
            result.append({
                'schema': 'JACOW',
                'value': 'JACoW-' + j_value[6:],
            })
        elif _is_orcid(j_value):
            result.append({
                'schema': 'ORCID',
                'value': j_value[6:].replace('.', ''),
            })
        elif _is_naked_orcid(j_value):
            result.append({
                'schema': 'ORCID',
                'value': j_value,
            })
        elif _is_cern(j_value):
            result.append({
                'schema': 'CERN',
                'value': 'CERN-' + j_value[5:],
            })

    w_values = force_list(value.get('w'))
    for w_value in w_values:
        result.append({
            'schema': 'INSPIRE BAI',
            'value': w_value,
        })

    return dedupe_list(result)
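# Illustrative only: made-up ``i``/``j`` subfield values showing how the
# prefixes above are normalized before the deduplication step.
assert _get_ids({'i': ['INSPIRE-00123456'], 'j': ['ORCID:0000-0002-1825-0097']}) == [
    {'schema': 'INSPIRE ID', 'value': 'INSPIRE-00123456'},
    {'schema': 'ORCID', 'value': '0000-0002-1825-0097'},
]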
def pending_in_holding_pen(obj, eng):
    """Return ``True`` if the record is already present in the Holding Pen.

    Uses a custom configuration of the ``inspire-matcher`` to find duplicates
    of the current workflow object in the Holding Pen.

    Also sets ``holdingpen_matches`` in ``extra_data`` to the list of ids that
    matched.

    Arguments:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        bool: ``True`` if the workflow object has a duplicate in the Holding
        Pen, ``False`` otherwise.

    """
    config = {
        'algorithm': [
            {
                'queries': [
                    {
                        'path': 'arxiv_eprints.value',
                        'search_path': 'metadata.arxiv_eprints.value.raw',
                        'type': 'exact',
                    },
                    {
                        'path': 'dois.value',
                        'search_path': 'metadata.dois.value.raw',
                        'type': 'exact',
                    },
                ],
            },
        ],
        'doc_type': 'hep',
        'index': 'holdingpen-hep',
    }
    matches = dedupe_list(match(obj.data, config))
    holdingpen_ids = [int(el['_id']) for el in matches if int(el['_id']) != obj.id]

    if holdingpen_ids:
        obj.extra_data['holdingpen_matches'] = holdingpen_ids
        return True

    return False
def duplicated_validator(property_name, property_value):
    def _is_not_deleted(base_record, match_result):
        return not get_value(match_result, '_source.deleted', default=False)

    config = {
        'algorithm': [
            {
                'queries': [
                    {
                        'path': 'arxiv_id',
                        'search_path': 'arxiv_eprints.value.raw',
                        'type': 'exact',
                    },
                    {
                        'path': 'doi',
                        'search_path': 'dois.value.raw',
                        'type': 'exact',
                    },
                ],
                'validator': _is_not_deleted,
            },
        ],
        'doc_type': 'hep',
        'index': 'records-hep',
    }

    if property_name == 'arXiv ID':
        data = {
            'arxiv_id': property_value,
        }
    if property_name == 'DOI':
        data = {
            'doi': property_value,
        }

    matches = dedupe_list(match(data, config))
    matched_ids = [int(el['_source']['control_number']) for el in matches]

    if matched_ids:
        url = url_for(
            'invenio_records_ui.literature',
            pid_value=matched_ids[0],
        )
        raise ValidationError(
            'There exists already an item with the same %s. '
            '<a target="_blank" href="%s">See the record.</a>'
            % (property_name, url)
        )
def dedupe_all_lists(obj):
    """Recursively remove duplicates from all lists."""
    squared_dedupe_len = 10
    if isinstance(obj, dict):
        new_obj = {}
        for key, value in obj.items():
            new_obj[key] = dedupe_all_lists(value)
        return new_obj
    elif isinstance(obj, (list, tuple, set)):
        new_elements = [dedupe_all_lists(v) for v in obj]
        if len(new_elements) < squared_dedupe_len:
            new_obj = dedupe_list(new_elements)
        else:
            new_obj = dedupe_list_of_dicts(new_elements)
        return type(obj)(new_obj)
    else:
        return obj
def _get_affiliations(value):
    result = []

    u_values = force_list(value.get('u'))
    z_values = force_list(value.get('z'))

    # XXX: we zip only when they have the same length, otherwise
    #      we might match a value with the wrong recid.
    if len(u_values) == len(z_values):
        for u_value, z_value in zip(u_values, z_values):
            result.append({
                'record': get_record_ref(z_value, 'institutions'),
                'value': u_value,
            })
    else:
        for u_value in u_values:
            result.append({'value': u_value})

    return dedupe_list(result)
def match_reference_with_config(reference, config, previous_matched_recid=None):
    """Match a reference using inspire-matcher given the config.

    Args:
        reference (dict): the metadata of the reference.
        config (dict): the list of inspire-matcher configurations for queries.
        previous_matched_recid (int): the record id of the last matched
            reference from the list of references.

    Returns:
        dict: the matched reference.
    """
    # XXX: avoid this type casting.
    try:
        reference['reference']['publication_info']['year'] = str(
            reference['reference']['publication_info']['year'])
    except KeyError:
        pass

    matched_recids = [
        matched_record['_source']['control_number']
        for matched_record in match(reference, config)
    ]
    matched_recids = dedupe_list(matched_recids)
    same_as_previous = any(
        matched_recid == previous_matched_recid
        for matched_recid in matched_recids
    )
    if len(matched_recids) == 1:
        _add_match_to_reference(reference, matched_recids[0], config['index'])
    elif same_as_previous:
        _add_match_to_reference(reference, previous_matched_recid, config['index'])

    # XXX: avoid this type casting.
    try:
        reference['reference']['publication_info']['year'] = int(
            reference['reference']['publication_info']['year'])
    except KeyError:
        pass

    return reference
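# Hypothetical config, shaped like the inspire-matcher configs shown elsewhere
# in this file, to illustrate what ``match_reference_with_config`` expects; the
# query paths and index name below are made up for the example.
example_config = {
    'algorithm': [
        {
            'queries': [
                {
                    'path': 'reference.arxiv_eprint',
                    'search_path': 'arxiv_eprints.value.raw',
                    'type': 'exact',
                },
            ],
        },
    ],
    'doc_type': 'hep',
    'index': 'records-hep',
}
# matched = match_reference_with_config(reference, example_config)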
def _pending_in_holding_pen(obj, validation_func):
    """Return the list of matching workflows in the holdingpen.

    Matches the holdingpen records by their ``arxiv_eprint``, their ``doi``,
    and by a custom validator function.

    Args:
        obj: a workflow object.
        validation_func: a function used to filter the matched records.

    Returns:
        (list): the ids matching the current ``obj`` that satisfy
        ``validation_func``.
    """
    config = {
        'algorithm': [
            {
                'queries': [
                    {
                        'path': 'arxiv_eprints.value',
                        'search_path': 'metadata.arxiv_eprints.value.raw',
                        'type': 'exact',
                    },
                    {
                        'path': 'dois.value',
                        'search_path': 'metadata.dois.value.raw',
                        'type': 'exact',
                    },
                ],
                'validator': validation_func,
            },
        ],
        'doc_type': 'hep',
        'index': 'holdingpen-hep',
    }
    matches = dedupe_list(match(obj.data, config))
    return [int(el['_id']) for el in matches if int(el['_id']) != obj.id]
def exact_match(obj, eng):
    """Return ``True`` if the record is already present in the system.

    Uses the default configuration of the ``inspire-matcher`` to find
    duplicates of the current workflow object in the system.

    Also sets the ``matches.exact`` property in ``extra_data`` to the list of
    control numbers that matched.

    Arguments:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        bool: ``True`` if the workflow object has a duplicate in the system,
        ``False`` otherwise.

    """
    exact_match_config = current_app.config['EXACT_MATCH']
    matches = dedupe_list(match(obj.data, exact_match_config))
    record_ids = [el['_source']['control_number'] for el in matches]
    obj.extra_data.setdefault('matches', {})['exact'] = record_ids
    return bool(record_ids)
def _get_raw_affiliations(value):
    return dedupe_list([
        {'value': el} for el in force_list(value.get('v'))
    ])
def remove_duplicates_from_list(l):
    return dedupe_list(l)