def serialize(self, pid, record, links_factory=None):
    """Serialize a citesummary for the papers affiliated with an institution.

    :param pid: Persistent identifier instance.
    :param record: Record instance (the institution record).
    :param links_factory: Factory function for link generation (unused).
    """
    institution_id = get_id(record)

    # First pass: fetch only the control numbers of all papers whose
    # authors are affiliated with this institution.
    affiliated = LiteratureSearch().query(
        'match', authors__affiliations__recid=institution_id
    ).params(_source=['control_number'])

    recids = [get_id(hit.to_dict()) for hit in affiliated.scan()]

    # Second pass: fetch the fields the citesummary builder needs.
    papers = LiteratureSearch().filter(
        'terms', control_number=recids
    ).params(
        _source=[
            'control_number',
            'authors.recid',
            'collaborations.value',
            'earliest_date',
            'facet_inspire_doc_type',
            'inspire_categories',
            'titles.title',
        ],
    )

    return json.dumps(build_citesummary(papers))
def test_creating_deleted_record_and_undeleting_created_record_in_es(app):
    """A record created with ``deleted: True`` only appears in ES once undeleted."""
    search = LiteratureSearch()

    json = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        'document_type': [
            'article',
        ],
        'titles': [
            {'title': 'foo'},
        ],
        'deleted': True,
        '_collections': ['Literature']
    }

    # When a record is created in the DB with deleted flag True, it is not created in ES.
    record = InspireRecord.create(json)
    record.commit()
    db.session.commit()
    with pytest.raises(NotFoundError):
        search.get_source(record.id)

    # When a record is undeleted, it is created in ES.
    record['deleted'] = False
    record.commit()
    db.session.commit()
    search.get_source(record.id)

    # Clean up the test record.
    record._delete(force=True)
def test_that_db_changes_are_mirrored_in_es(app):
    """Create/update/delete of a record in the DB is mirrored to ES."""
    search = LiteratureSearch()

    json = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        'document_type': [
            'article',
        ],
        'titles': [
            {'title': 'foo'},
        ],
    }

    # When a record is created in the DB, it is also created in ES.
    record = InspireRecord.create(json)
    es_record = search.get_source(record.id)
    assert get_title(es_record) == 'foo'

    # When a record is updated in the DB, it is also updated in ES.
    record['titles'][0]['title'] = 'bar'
    record.commit()
    es_record = search.get_source(record.id)
    assert get_title(es_record) == 'bar'

    # When a record is deleted in the DB, it is also deleted in ES.
    record._delete(force=True)
    with pytest.raises(NotFoundError):
        es_record = search.get_source(record.id)
def test_deleting_record_triggers_delete_in_es(app):
    """Setting the ``deleted`` flag on a record removes it from ES."""
    search = LiteratureSearch()

    json = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        'document_type': [
            'article',
        ],
        'titles': [
            {'title': 'foo'},
        ],
        '_collections': ['Literature']
    }

    # When a record is created in the DB, it is also created in ES.
    record = InspireRecord.create(json)
    record.commit()
    db.session.commit()
    search.get_source(record.id)

    # When a record is updated with deleted flag true, it is deleted in ES
    record['deleted'] = True
    record.commit()
    db.session.commit()
    with pytest.raises(NotFoundError):
        search.get_source(record.id)
def get_experiment_publications(experiment_name):
    """
    Get paper count for a given experiment.

    :param experiment_name: canonical name of the experiment.
    :type experiment_name: string
    """
    query = {
        "term": {"accelerator_experiments.experiment": experiment_name}
    }
    search = LiteratureSearch().query(query)
    # NOTE(review): search_type="count" was removed in Elasticsearch 2.x;
    # `search.count()` is the modern equivalent -- verify against the ES
    # version this deployment targets.
    search = search.params(search_type="count")
    return search.execute().hits.total
def get_experiment_publications(experiment_name):
    """
    Get paper count for a given experiment.

    :param experiment_name: canonical name of the experiment.
    :type experiment_name: string
    """
    query = {
        "term": {"accelerator_experiments.experiment": experiment_name}
    }
    search = LiteratureSearch().query(query)
    # FIXME: search_type=count is deprecated, but the whole function doesn't
    # work anymore. NOTE(review): `search.count()` is the likely replacement
    # once the function is revived -- verify against the ES version in use.
    search = search.params(search_type="count")
    return search.execute().hits.total
def serialize(self, pid, record, links_factory=None):
    """Return a JSON list of co-authors for a given author recid.

    :param pid: Persistent identifier instance.
    :param record: Record instance.
    :param links_factory: Factory function for the link generation,
        which are added to the response.
    """
    author_pid = pid.pid_value
    coauthors = {}

    search = LiteratureSearch().query({
        "match": {
            "authors.recid": author_pid
        }
    }).params(
        _source=[
            "authors.full_name",
            "authors.recid",
            "authors.record",
        ]
    )

    for result in search.scan():
        result_source = result.to_dict()['authors']

        for author in result_source:
            try:
                # Don't add the reference author.
                if author['recid'] != author_pid:
                    if author['recid'] in coauthors:
                        coauthors[author['recid']]['count'] += 1
                    else:
                        coauthors[author['recid']] = dict(
                            count=1,
                            full_name=author['full_name'],
                            id=author['recid'],
                            record=author['record'],
                        )
            except KeyError:
                # Signatures missing recid/full_name/record are skipped.
                pass

    # BUGFIX: wrap in list() -- on Python 3 dict.values() is a view that
    # json.dumps cannot serialize. The output is unchanged on Python 2.
    return json.dumps(list(coauthors.values()))
def build_citesummary(search):
    """Build a citesummary payload from a literature search.

    For every record matched by ``search``, collects its summary fields and
    the list of records that cite it (matched via ``references.recid``).

    :param search: elasticsearch-dsl ``Search`` over literature records.
    :returns: list of dicts, one per matched record, each with a
        ``citations`` list of citing-record summaries.
    """
    citesummary = []

    for i, el in enumerate(search.scan()):
        result = el.to_dict()

        citesummary.append({
            'citations': [],
            'collaboration': is_collaboration(result),
            'core': is_core(result),
            'date': get_date(result),
            'document_type': get_document_type(result),
            'id': get_id(result),
            'subject': get_subject(result),
            'title': get_title(result),
        })

        # Second query: every record whose references point at this one.
        search_by_literature = LiteratureSearch().query(
            'match', references__recid=get_id(result)
        ).params(
            _source=[
                'authors.recid',
                'collaborations.value',
                'control_number',
                'earliest_date',
                'facet_inspire_doc_type',
                'inspire_categories',
                'titles.title',
            ]
        )

        # NOTE: the loop variable name `el` is reused here; the outer value
        # is no longer needed at this point.
        for el in search_by_literature.scan():
            literature_result = el.to_dict()

            citesummary[i]['citations'].append({
                'collaboration': is_collaboration(literature_result),
                'core': is_core(literature_result),
                'date': get_date(literature_result),
                'document_type': get_document_type(literature_result),
                'id': get_id(literature_result),
                'selfcite': is_selfcite(
                    result, literature_result),
                'subject': get_subject(literature_result),
                'title': get_title(literature_result),
            })

    return citesummary
def suggest():
    """Power typeahead.js search bar suggestions.

    Reads ``field`` and ``query`` from the request and returns completion
    suggestions from ES. Author suggestions are deduplicated by BAI.
    """
    field = request.values.get('field')
    query = request.values.get('query')

    search = LiteratureSearch()
    search = search.suggest(
        'suggestions', query, completion={"field": field}
    )
    suggestions = search.execute_suggest()

    if field == "authors.name_suggest":
        # Group suggested name variants by their BAI so each author
        # appears only once in the results.
        bai_name_map = {}
        for suggestion in suggestions['suggestions'][0]['options']:
            bai = suggestion['payload']['bai']
            if bai in bai_name_map:
                bai_name_map[bai].append(
                    suggestion['text']
                )
            else:
                bai_name_map[bai] = [suggestion['text']]

        result = []
        for key, value in six.iteritems(bai_name_map):
            result.append(
                {
                    # Use the longest name variant as the display name.
                    'name': max(value, key=len),
                    'value': key,
                    'template': 'author'
                }
            )
        return jsonify({
            'results': result
        })

    return jsonify({
        'results': [
            {'value': s['text']}
            for s in suggestions['suggestions'][0]['options']
        ]
    })
def test_find_author_or_author():
    """An ORed `find a` query becomes a bool-should of two author clauses."""
    query = IQ('find a gersdorff, g or a von gersdorff, g', LiteratureSearch())

    # Each branch carries both a "must" and a "should" key on the same
    # bool -- intentional: name variations must match, full name boosts.
    expected = {
        "bool": {
            "should": [{
                "bool": {
                    "must": [{
                        "bool": {
                            "should": [{
                                "match": {
                                    "authors.name_variations": "gersdorff, g"
                                }
                            }, {
                                "term": {
                                    "authors.ids.value": "gersdorff, g"
                                }
                            }]
                        }
                    }],
                    "should": [{
                        "match": {
                            "authors.full_name": "gersdorff, g"
                        }
                    }]
                }
            }, {
                "bool": {
                    "must": [{
                        "bool": {
                            "should": [{
                                "match": {
                                    "authors.name_variations": "von gersdorff, g"
                                }
                            }, {
                                "term": {
                                    "authors.ids.value": "von gersdorff, g"
                                }
                            }]
                        }
                    }],
                    "should": [{
                        "match": {
                            "authors.full_name": "von gersdorff, g"
                        }
                    }]
                }
            }]
        }
    }

    result = query.to_dict()

    assert expected == result
def ajax_references():
    """Serve the references of a record for the datatables view."""
    record_id = request.args.get('recid', '')
    endpoint = request.args.get('endpoint', '')

    pid = PersistentIdentifier.get(
        get_pid_type_from_endpoint(endpoint), record_id)
    source = LiteratureSearch().get_source(pid.object_uuid)

    return jsonify({'data': get_and_format_references(source)})
def index():
    """View for literature collection landing page."""
    # The labs cover is shown unless the full theme is enabled.
    if not current_app.config['INSPIRE_FULL_THEME']:
        return render_template('inspirehep_theme/inspire_labs_cover.html')

    return render_template(
        'inspirehep_theme/search/collection_literature.html',
        collection='hep',
        number_of_records=LiteratureSearch().count(),
    )
def test_refersto_colon_recid_colon():
    """A `refersto:recid:` query maps onto the references.recid field."""
    expected = {
        'multi_match': {
            'query': '1286113',
            'fields': ['references.recid']
        }
    }

    result = IQ('refersto:recid:1286113', LiteratureSearch()).to_dict()

    assert result == expected
def ajax_citations():
    """Serve the citations of a record for the datatables view."""
    record_id = request.args.get('recid', '')
    endpoint = request.args.get('endpoint', '')

    pid = PersistentIdentifier.get(
        get_pid_type_from_endpoint(endpoint), record_id)
    source = LiteratureSearch().get_source(pid.object_uuid)

    return jsonify({"data": Citation(source).citations()})
def test_that_db_changes_are_mirrored_in_es(app):
    """Create/update/delete of a committed record is mirrored to ES."""
    search = LiteratureSearch()

    json = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        'document_type': [
            'article',
        ],
        'titles': [
            {
                'title': 'foo'
            },
        ],
        '_collections': ['Literature']
    }

    # When a record is created in the DB, it is also created in ES.
    record = InspireRecord.create(json)
    record.commit()
    db.session.commit()
    es_record = search.get_source(record.id)
    assert get_title(es_record) == 'foo'

    # When a record is updated in the DB, it is also updated in ES.
    record['titles'][0]['title'] = 'bar'
    record.commit()
    db.session.commit()
    es_record = search.get_source(record.id)
    assert get_title(es_record) == 'bar'

    # When a record is deleted in the DB, it is also deleted in ES.
    record._delete(force=True)
    db.session.commit()
    with pytest.raises(NotFoundError):
        es_record = search.get_source(record.id)
def test_deleting_record_triggers_delete_in_es(app):
    """Setting the ``deleted`` flag on a committed record removes it from ES."""
    search = LiteratureSearch()

    json = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        'document_type': [
            'article',
        ],
        'titles': [
            {
                'title': 'foo'
            },
        ],
        '_collections': ['Literature']
    }

    # When a record is created in the DB, it is also created in ES.
    record = InspireRecord.create(json)
    record.commit()
    db.session.commit()
    search.get_source(record.id)

    # When a record is updated with deleted flag true, it is deleted in ES
    record['deleted'] = True
    record.commit()
    db.session.commit()
    with pytest.raises(NotFoundError):
        search.get_source(record.id)
def test_creating_deleted_record_and_undeleting_created_record_in_es(app):
    """A record created with ``deleted: True`` only appears in ES once undeleted."""
    search = LiteratureSearch()

    json = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        'document_type': [
            'article',
        ],
        'titles': [
            {
                'title': 'foo'
            },
        ],
        'deleted': True,
        '_collections': ['Literature']
    }

    # When a record is created in the DB with deleted flag True, it is not created in ES.
    record = InspireRecord.create(json)
    record.commit()
    db.session.commit()
    with pytest.raises(NotFoundError):
        search.get_source(record.id)

    # When a record is undeleted, it is created in ES.
    record['deleted'] = False
    record.commit()
    db.session.commit()
    search.get_source(record.id)

    # Clean up the test record.
    record._delete(force=True)
def serialize(self, pid, record, links_factory=None):
    """Serialize a citesummary for the papers written by an author.

    :param pid: Persistent identifier instance.
    :param record: Record instance (the author record).
    :param links_factory: Factory function for link generation (unused).
    """
    author_papers = LiteratureSearch().query(
        'match', authors__recid=get_id(record)
    ).params(
        _source=[
            'authors.recid',
            'collaborations.value',
            'control_number',
            'earliest_date',
            'facet_inspire_doc_type',
            'inspire_categories',
            'titles.title',
        ],
    )

    return json.dumps(build_citesummary(author_papers))
def test_abstract_colon_with_star_wildcard():
    """An abstract query with `*` becomes a wildcard query_string."""
    expected = {
        'query_string': {
            'query': 'part*',
            'default_field': 'abstracts.value',
            'analyze_wildcard': True
        }
    }

    result = IQ('abstract: part*', LiteratureSearch()).to_dict()

    assert result == expected
def test_find_journal():
    """A `find j` query targets the pubnote field with wildcard analysis."""
    expected = {
        'query_string': {
            'query': '"Phys.Rev.Lett.,105*"',
            'default_field': 'publication_info.pubnote',
            'analyze_wildcard': True
        }
    }

    result = IQ('find j "Phys.Rev.Lett.,105*"', LiteratureSearch()).to_dict()

    assert result == expected
def serialize(self, pid, record, links_factory=None):
    """Serialize a citesummary for the papers produced at an experiment.

    :param pid: Persistent identifier instance.
    :param record: Record instance (the experiment record).
    :param links_factory: Factory function for link generation (unused).
    """
    experiment_id = get_id(record)

    # First pass: fetch only the control numbers of papers linked to
    # this experiment.
    by_experiment = LiteratureSearch().query(
        'match', accelerator_experiments__recid=experiment_id
    ).params(_source=['control_number'])

    recids = [get_id(hit.to_dict()) for hit in by_experiment.scan()]

    # Second pass: fetch the fields the citesummary builder needs.
    by_recids = LiteratureSearch().filter(
        'terms', control_number=recids
    ).params(
        _source=[
            'authors.recid',
            'collaborations.value',
            'control_number',
            'earliest_date',
            'facet_inspire_doc_type',
            'inspire_categories',
            'titles.title',
        ],
    )

    return json.dumps(build_citesummary(by_recids))
def ajax_citations():
    """Serve the citations of a record for the datatables view.

    .. deprecated:: 2018-08-23
    """
    record_id = request.args.get('recid', '')
    endpoint = request.args.get('endpoint', '')

    pid = PersistentIdentifier.get(
        get_pid_type_from_endpoint(endpoint), record_id)
    source = LiteratureSearch().get_source(pid.object_uuid)

    return jsonify({'data': get_and_format_citations(source)})
def test_exactauthor_colon_bai():
    """An `exactauthor:` BAI query fans out over the exact author fields."""
    expected = {
        "multi_match": {
            "query": "J.Serra.3",
            "fields": [
                "exactauthor.raw",
                "authors.full_name",
                "authors.alternative_names",
                "authors.ids.value"
            ]
        }
    }

    result = IQ('exactauthor:J.Serra.3', LiteratureSearch()).to_dict()

    assert result == expected
def test_find_exactauthor():
    """A `find ea` query fans out over the exact author fields."""
    expected = {
        "multi_match": {
            "query": "witten, edward",
            "fields": [
                "exactauthor.raw",
                "authors.full_name",
                "authors.alternative_names",
                "authors.ids.value"
            ]
        }
    }

    result = IQ('find ea witten, edward', LiteratureSearch()).to_dict()

    assert result == expected
def get_institution_papers_from_es(recid):
    """
    Get papers where some author is affiliated with institution.

    :param recid: id of the institution.
    :type recid: string
    """
    source_fields = [
        'control_number',
        'earliest_date',
        'titles',
        'authors',
        'publication_info',
        'citation_count',
        'collaboration',
    ]

    search = LiteratureSearch().query_from_iq(
        'authors.affiliations.recid:{}'.format(recid)
    ).sort('-earliest_date').params(size=100, _source=source_fields)

    return search.execute().hits
def get_citations_from_es(record, page=1, size=10):
    """Fetch one page of records citing ``record`` from ES, newest first.

    :param record: record metadata dict; must carry ``control_number``.
    :param page: 1-based page number.
    :param size: number of hits per page.
    :returns: the ES hits, or ``None`` when the record has no control number.
    """
    if 'control_number' not in record:
        return None

    offset = (page - 1) * size

    search = LiteratureSearch().query(
        'match',
        references__recid=record['control_number'],
    ).params(
        _source=[
            'authors',
            'control_number',
            'earliest_date',
            'titles',
            'publication_info'
        ],
        from_=offset,
        size=size,
    ).sort('-earliest_date')

    return search.execute().hits
def test_or_of_exactauthor_colon_queries():
    """Two ORed `exactauthor:` queries.

    NOTE(review): the expected payload below (single multi_match with
    query "J.Serra.3") looks copy-pasted from the exactauthor:J.Serra.3
    test and does not correspond to the OR query under test -- confirm
    the intended expectation against the IQ implementation.
    """
    query = IQ('exactauthor:X.Yin.1 or exactauthor:"Yin, Xi"',
               LiteratureSearch())

    expected = {
        "multi_match": {
            "query": "J.Serra.3",
            "fields": [
                "exactauthor.raw",
                "authors.full_name",
                "authors.alternative_names",
                "authors.inspire_bai"
            ]
        }
    }

    result = query.to_dict()

    assert expected == result
def conferences_contributions_from_es(cnum):
    """Query ES for the contributions presented at a given conference.

    :param cnum: conference number (cnum) identifying the conference.
    :returns: up to 100 hits sorted by descending citation count.
    """
    source_fields = [
        'control_number',
        'earliest_date',
        'titles',
        'authors',
        'publication_info',
        'citation_count',
        'collaboration',
    ]

    search = LiteratureSearch().query_from_iq(
        'cnum:"{}"'.format(cnum)
    ).params(
        size=100,
        _source=source_fields
    ).sort('-citation_count')

    return search.execute().hits
def test_google_style():
    """A bare keyword query becomes a boosted multi_match over core fields."""
    expected = {
        'multi_match': {
            'zero_terms_query': 'all',
            'query': 'kudenko',
            'fields': [
                'title^3',
                'title.raw^10',
                'abstract^2',
                'abstract.raw^4',
                'author^10',
                'author.raw^15',
                'reportnumber^10',
                'eprint^10',
                'doi^10'
            ]
        }
    }

    result = IQ('kudenko', LiteratureSearch()).to_dict()

    assert result == expected
def test_author_colon_bai_with_double_quotes_and_collection_colon_and_cited_colon(
):
    """ANDed author/collection/cited query maps onto a bool must + should."""
    query = IQ(
        'author:"E.Witten.1" AND collection:citeable AND cited:500->1000000',
        LiteratureSearch())

    expected = {
        'bool': {
            'must': [{
                'bool': {
                    'should': [{
                        'match': {
                            u'authors.name_variations': 'E.Witten.1'
                        }
                    }, {
                        'term': {
                            u'authors.ids.value': 'E.Witten.1'
                        }
                    }]
                }
            }, {
                'multi_match': {
                    'fields': ['collections.primary'],
                    'query': 'citeable'
                }
            }, {
                'range': {
                    'citation_count': {
                        'gte': '500',
                        'lte': '1000000'
                    }
                }
            }],
            'should': [{
                'match': {
                    u'authors.full_name': 'E.Witten.1'
                }
            }]
        }
    }

    result = query.to_dict()

    assert expected == result
def test_author_colon_bai_with_double_quotes_and_collection_colon_and_cited_colon(
):
    """ANDed author/collection/cited query maps onto a bool must + should."""
    query = IQ(
        'author:"E.Witten.1" AND collection:citeable AND cited:500->1000000',
        LiteratureSearch())

    expected = {
        "bool": {
            "must": [{
                "bool": {
                    "should": [{
                        "match": {
                            "authors.name_variations": "E.Witten.1"
                        }
                    }, {
                        "term": {
                            "authors.inspire_bai": "E.Witten.1"
                        }
                    }]
                }
            }, {
                "multi_match": {
                    "query": "citeable",
                    "fields": ["collections.primary"]
                }
            }, {
                "range": {
                    "citation_count": {
                        "gte": "500",
                        "lte": "1000000"
                    }
                }
            }],
            "should": [{
                "match": {
                    "authors.full_name": "E.Witten.1"
                }
            }]
        }
    }

    result = query.to_dict()

    assert expected == result
def get(self, pid, record):
    """Return a paginated list of the records citing ``record``.

    Reads ``page`` and ``size`` from the request; rejects non-positive
    values with a 400.
    """
    page = request.values.get('page', 1, type=int)
    size = request.values.get('size', 10, type=int)

    if page < 1 or size < 1:
        abort(400)

    results = LiteratureSearch.citations(record, page, size)

    payload = {
        'citations': [hit.to_dict() for hit in results],
        'citation_count': results.total,
    }

    return self.make_response(pid, payload)
def test_author_colon_bai():
    """An `author:` BAI query expands to a bool-should over name fields."""
    expected = {
        "bool": {
            "should": [{
                "match": {
                    "authors.name_variations": "Y.Nomura.1"
                }
            }, {
                "match": {
                    "authors.full_name": "Y.Nomura.1"
                }
            }, {
                "match": {
                    "authors.inspire_bai": "Y.Nomura.1"
                }
            }]
        }
    }

    result = IQ('author:Y.Nomura.1', LiteratureSearch()).to_dict()

    assert result == expected
def test_find_author_with_hash_wildcard():
    """A `#` wildcard is translated to `*` across both author name fields."""
    def wildcard_clause(field):
        # Both branches only differ in the targeted field.
        return {
            'query_string': {
                'analyze_wildcard': True,
                'default_field': field,
                'query': 'chkv*'
            }
        }

    expected = {
        'bool': {
            'should': [
                wildcard_clause('authors.full_name'),
                wildcard_clause('authors.alternative_names'),
            ]
        }
    }

    result = IQ('find a chkv#', LiteratureSearch()).to_dict()

    assert result == expected
def generate_booktitle(record):
    """Build a human-readable booktitle from a record's publication info.

    Preference order:
    1. a ``reportnumber``, rendered as ``<rn>: <acronym>`` when an acronym
       is present, otherwise resolved to the title(s) of the matching
       report record in ES;
    2. the concatenated ``pubinfo_freetext`` entries.

    :param record: record metadata dict.
    :returns: the booktitle string, or '' when none can be derived.
    """
    booktitle = ''
    pubinfo = ''
    if 'publication_info' in record:
        pubinfo = record['publication_info']
        for field in pubinfo:
            if 'reportnumber' in field:
                rn = field['reportnumber']
                if rn:
                    # BUGFIX: 'acronym' is optional -- field['acronym']
                    # raised KeyError when the key was absent.
                    acronym = field.get('acronym')
                    if acronym:
                        booktitle = "%s: %s" % (
                            rn,
                            acronym,
                        )
                    else:
                        # No acronym: resolve the report number to the
                        # titles of the matching record in ES.
                        records = LiteratureSearch().query_from_iq(
                            "reportnumber:%s" % (rn, )).execute()
                        if records:
                            rec = records.hits[0]
                            for title in rec['titles']:
                                booktitle = title.get('title', "")
                                if title.get('subtitle'):
                                    booktitle += ': ' + title.get('subtitle')
        if not booktitle:
            # Fall back to free-text publication info.
            result = []
            for field in pubinfo:
                if 'pubinfo_freetext' in field:
                    result.append(field['pubinfo_freetext'])
            if result:
                # Flatten one level of nesting before joining.
                if any(isinstance(i, list) for i in result):
                    nested_list = list(traverse(result))
                    booktitle = ', '.join(str(title) for title in nested_list)
                else:
                    booktitle = ', '.join(str(title) for title in result)
    return booktitle
def proceedings_link(record):
    """Render HTML link(s) to the proceedings record(s) of a conference.

    :param record: conference record dict; its ``cnum`` drives the lookup.
    :returns: an HTML snippet, or '' when there is no cnum or no
        proceedings record.
    """
    cnum = record.get('cnum', '')
    if not cnum:
        return ''

    records = LiteratureSearch().query_from_iq(
        'cnum:%s and 980__a:proceedings' % cnum).execute()

    if not len(records):
        return ''

    if len(records) == 1:
        return '<a href="/record/{recid}">Proceedings</a>'.format(
            recid=records[0]['control_number'])

    proceedings = []
    # BUGFIX: the loop variable used to shadow the `record` parameter;
    # renamed to `hit` so the parameter stays intact.
    for i, hit in enumerate(records.hits, start=1):
        try:
            dois = hit['dois']
            proceedings.append(
                '<a href="/record/{recid}">#{i}</a> (DOI: <a '
                'href="http://dx.doi.org/{doi}">{doi}</a>'.format(
                    recid=hit['control_number'],
                    doi=dois[0]['value'],
                    i=i))
        except KeyError:
            # Guards both against records not having a "dois" field
            # and doi values not having a "value" field.
            proceedings.append(
                '<a href="/record/{recid}">#{i}</a>'.format(
                    recid=hit['control_number'], i=i))

    return 'Proceedings: ' + ', '.join(proceedings)
def serialize(self, pid, record, links_factory=None):
    """Return a list of citations for a given author recid.

    :param pid: Persistent identifier instance.
    :param record: Record instance.
    :param links_factory: Factory function for the link generation,
        which are added to the response.
    """
    author_pid = pid.pid_value
    citations = {}

    search = LiteratureSearch().query({
        "match": {
            "authors.recid": author_pid
        }
    }).params(
        _source=[
            "authors.recid",
            "control_number",
            "self",
        ]
    )

    # For each publication co-authored by a given author...
    for result in search.scan():
        result_source = result.to_dict()

        recid = result_source['control_number']
        authors = set([i['recid'] for i in result_source['authors']])
        citations[recid] = {}

        # ...find every record whose references point at it.
        nested_search = LiteratureSearch().query({
            "match": {
                "references.recid": recid
            }
        }).params(
            _source=[
                "authors.recid",
                "collections",
                "control_number",
                "earliest_date",
                "self",
            ]
        )

        # The source record that is being cited.
        citations[recid]['citee'] = dict(
            id=recid,
            record=result_source['self'],
        )
        citations[recid]['citers'] = []

        # Check all publications, which cite the parent record.
        for nested_result in nested_search.scan():
            nested_result_source = nested_result.to_dict()

            # Not every signature has a recid (at least for demo records).
            try:
                nested_authors = set(
                    [i['recid'] for i in nested_result_source['authors']]
                )
            except KeyError:
                nested_authors = set()

            citation = dict(
                citer=dict(
                    id=int(nested_result_source['control_number']),
                    record=nested_result_source['self']
                ),
                # If at least one author is shared, it's a self-citation.
                self_citation=len(authors & nested_authors) > 0,
            )

            # Get the earliest date of a citer.
            try:
                citation['date'] = nested_result_source['earliest_date']
            except KeyError:
                pass

            # Get status if a citer is published.
            # FIXME: As discussed with Sam, we should have a boolean flag
            # for this type of information.
            try:
                citation['published_paper'] = "Published" in [
                    i['primary'] for i in nested_result_source[
                        'collections']]
            except KeyError:
                citation['published_paper'] = False

            citations[recid]['citers'].append(citation)

    # NOTE(review): dict.values() is not JSON serializable on Python 3;
    # wrap in list() if this code ever moves off Python 2.
    return json.dumps(citations.values())
def get_publications():
    """Return the publications, keywords and collaborations of an author.

    The author is selected by the ``recid`` request parameter. The response
    is a JSON object with ``publications``, ``keywords`` and
    ``collaborations`` keys.
    """
    recid = request.values.get('recid', 0, type=int)

    publications = []
    collaborations = set()
    keywords = set()

    search = LiteratureSearch().query(
        {"match": {"authors.recid": recid}}
    ).params(
        _source=[
            'accelerator_experiments',
            'control_number',
            'earliest_date',
            'facet_inspire_doc_type',
            'publication_info',
            'titles',
            'keywords'
        ]
    )

    for result in search.scan():
        try:
            result_source = result.to_dict()

            publication = {}
            # Get publication title (required).
            publication['title'] = get_title(result_source)
            # Get publication recid (required).
            publication['recid'] = result_source['control_number']
        except (IndexError, KeyError):
            # Records missing a required field are skipped entirely.
            continue

        # Get publication type.
        try:
            publication['type'] = result_source.get(
                'facet_inspire_doc_type', [])[0]
        except IndexError:
            publication['type'] = "Not defined"

        # Get journal title.
        try:
            publication['journal_title'] = result_source.get(
                'publication_info', [])[0]['journal_title']

            # Get journal recid.
            try:
                publication['journal_recid'] = result_source.get(
                    'publication_info', [])[0]['journal_recid']
            except KeyError:
                pass
        except (IndexError, KeyError):
            pass

        # Get publication year.
        try:
            publication['year'] = result_source.get(
                'publication_info', [])[0]['year']
        except (IndexError, KeyError):
            pass

        # Get keywords.
        for keyword in result_source.get('keywords', []):
            # BUGFIX: compare strings with `!=`, not `is not` -- identity
            # comparison against a literal is implementation-dependent
            # (and a SyntaxWarning on Python >= 3.8), so automatic
            # keywords were never actually filtered out.
            if keyword.get('keyword') != "* Automatic Keywords *" \
                    and keyword.get('keyword'):
                keywords.add(keyword.get('keyword'))

        # Get collaborations.
        for experiment in result_source.get(
                'accelerator_experiments', []):
            collaborations.add(experiment.get('experiment'))

        # Append to the list.
        publications.append(publication)

    response = {}
    response['publications'] = publications
    response['keywords'] = list(keywords)
    response['collaborations'] = list(collaborations)
    return jsonify(response)
def serialize(self, pid, record, links_factory=None):
    """Return a list of publications for a given author recid.

    :param pid: Persistent identifier instance.
    :param record: Record instance.
    :param links_factory: Factory function for the link generation,
        which are added to the response.
    """
    author_pid = pid.pid_value
    publications = []

    search = LiteratureSearch().query({
        "match": {
            "authors.recid": author_pid
        }
    }).params(
        _source=[
            "accelerator_experiments",
            "earliest_date",
            "citation_count",
            "control_number",
            "facet_inspire_doc_type",
            "publication_info",
            "self",
            "keywords",
            "titles",
        ]
    )

    for result in search.scan():
        result_source = result.to_dict()

        publication = {}
        publication['id'] = int(result_source['control_number'])
        publication['record'] = result_source['self']
        publication['title'] = get_title(result_source)

        # Get the earliest date.
        try:
            publication['date'] = result_source['earliest_date']
        except KeyError:
            pass

        # Get publication type.
        try:
            publication['type'] = result_source.get(
                'facet_inspire_doc_type', [])[0]
        except IndexError:
            pass

        # Get citation count.
        try:
            publication['citations'] = result_source['citation_count']
        except KeyError:
            pass

        # Get journal.
        try:
            publication['journal'] = {}
            publication['journal']['title'] = result_source.get(
                'publication_info', [])[0]['journal_title']

            # Get journal id and $self.
            try:
                publication['journal']['id'] = result_source.get(
                    'publication_info', [])[0]['journal_recid']
                publication['journal']['record'] = result_source.get(
                    'publication_info', [])[0]['journal_record']
            except KeyError:
                pass
        except (IndexError, KeyError):
            # No journal title: drop the partially-built journal entry.
            del publication['journal']

        # Get collaborations.
        collaborations = set()
        for experiment in result_source.get('accelerator_experiments', []):
            collaborations.add(experiment.get('experiment'))
        if collaborations:
            publication['collaborations'] = list(collaborations)

        publications.append(publication)

    return json.dumps(publications)
def test_edit_article_workflow_deleting(workflow_app, mocked_external_services):
    """Deleting a record via the edit_article workflow removes it from ES."""
    app_client = workflow_app.test_client()

    user = User.query.filter_by(email='*****@*****.**').one()
    login_user_via_session(app_client, user=user)

    record = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        'arxiv_eprints': [
            {
                'categories': [
                    'nucl-th'
                ],
                'value': '1802.03287'
            }
        ],
        'control_number': 123,
        'document_type': ['article'],
        'titles': [{'title': 'Resource Pooling in Large-Scale Content Delivery Systems'}],
        'self': {'$ref': 'http://localhost:5000/schemas/records/hep.json'},
        '_collections': ['Literature']
    }
    factory = TestRecordMetadata.create_from_kwargs(json=record)

    eng_uuid = start('edit_article', data=factory.record_metadata.json)

    obj = WorkflowEngine.from_uuid(eng_uuid).objects[0]
    obj.id_user = user.get_id()

    assert obj.status == ObjectStatus.WAITING
    assert obj.extra_data['callback_url']

    # The record exists in ES before the deletion.
    record = get_db_record('lit', 123)
    search = LiteratureSearch()
    search.get_source(record.id)

    # simulate changes in the editor and save
    obj.data['deleted'] = True
    payload = {
        'id': obj.id,
        'metadata': obj.data,
        '_extra_data': obj.extra_data
    }
    app_client.put(
        obj.extra_data['callback_url'],
        data=json.dumps(payload),
        content_type='application/json'
    )

    obj = WorkflowEngine.from_uuid(eng_uuid).objects[0]
    assert obj.status == ObjectStatus.WAITING  # waiting for robot_upload
    assert obj.data['deleted'] is True

    do_robotupload_callback(
        app=workflow_app,
        workflow_id=obj.id,
        recids=[obj.data['control_number']],
    )

    record = get_db_record('lit', 123)
    assert record['deleted'] is True

    # The deleted record must be gone from ES.
    with pytest.raises(NotFoundError):
        search.get_source(record.id)

    obj = WorkflowEngine.from_uuid(eng_uuid).objects[0]
    assert obj.status == ObjectStatus.COMPLETED

    pending_records = WorkflowsPendingRecord.query.filter_by(workflow_id=obj.id).all()
    assert not pending_records
def test_index_after_commit_indexes_also_cites_record_when_new_citation_is_added(
    mocked_indexing_task,
    mocked_permission_check,
    app,
):
    """Adding a citation to a record updates the cited record's index."""
    # this test doesn't use the isolated_app because it needs to commit to
    # the DB in order to create records versions.
    json_data = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        'document_type': ['article'],
        'titles': [{'title': 'This is the record being cited'}],
        'control_number': 9999,
        '_collections': ['Literature']
    }

    cited = InspireRecord.create(data=json_data, skip_files=True)
    db.session.commit()
    es.indices.refresh('records-hep')

    expected_args = 'lit', cited['control_number'], 1
    mocked_indexing_task.assert_called_with(*expected_args)
    # execute mocked task
    index_modified_citations_from_record(*expected_args)

    es_rec = get_es_record('lit', 9999)
    assert es_rec['citation_count'] == 0
    assert LiteratureSearch.citations(es_rec).total == 0

    # A citing record whose reference is not yet linked to 9999.
    citing_json = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        'document_type': ['article'],
        'titles': [{'title': 'Record citing the first one'}],
        '_collections': ['Literature'],
        'control_number': 8888,
        'references': [
            {"reference": {'authors': [{'full_name': 'Smith, J.'}]}}
        ]
    }

    record = InspireRecord.create(data=citing_json, skip_files=True)
    db.session.commit()
    es.indices.refresh('records-hep')

    expected_args = 'lit', record['control_number'], 1
    mocked_indexing_task.assert_called_with(*expected_args)
    # execute mocked task
    index_modified_citations_from_record(*expected_args)

    # Still no citation: the reference carries no record link.
    es_rec = get_es_record('lit', 9999)
    assert es_rec['citation_count'] == 0
    assert LiteratureSearch.citations(es_rec).total == 0

    # Now link the reference to record 9999.
    references = {
        'references': [
            {
                "curated_relation": False,
                "record": {
                    "$ref": "http://localhost:5000/api/literature/9999"
                },
                "reference": {
                    'authors': [{'full_name': 'Smith, J.'}],
                }
            }
        ]
    }

    citing_json.update(references)
    record.clear()
    record.update(citing_json)
    record.commit()
    db.session.commit()
    es.indices.refresh('records-hep')

    expected_args = 'lit', record['control_number'], 2
    mocked_indexing_task.assert_called_with(*expected_args)
    # execute mocked task
    index_modified_citations_from_record(*expected_args)

    # The cited record now counts one citation.
    es_rec = get_es_record('lit', 9999)
    assert es_rec['citation_count'] == 1
    assert LiteratureSearch.citations(es_rec).total == 1

    # Clean up the test records.
    _delete_record('lit', 8888)
    _delete_record('lit', 9999)
def get_institution_people_datatables_rows(recid):
    """
    Datatable rows to render people working in an institution.

    :param recid: id of the institution.
    :type recid: string
    :returns: list of ``[author_html_link, paper_count]`` rows.
    """
    query = LiteratureSearch().query(
        "term", authors__affiliations__recid=recid
    )
    query = query.params(search_type="count")

    # Aggregate the papers per author affiliated with the institution.
    query.aggs.bucket("authors", "nested", path="authors")\
        .bucket("affiliated", "filter", term={
            "authors.affiliations.recid": recid
        })\
        .bucket('byrecid', 'terms', field='authors.recid')
    records_from_es = query.execute().to_dict()

    # Extract all the record ids from the aggregation
    papers_per_author = records_from_es[
        'aggregations'
    ]['authors']['affiliated']['byrecid']['buckets']
    recids = [int(paper['key']) for paper in papers_per_author]

    # Generate query to retrieve records from author index.
    # (join produces the same "recid:X OR recid:Y" string as the old
    # quadratic += loop.)
    query = " OR ".join("recid:{}".format(r) for r in recids)

    results = AuthorsSearch().query_from_iq(
        query
    ).params(
        size=9999,
        _source=['control_number', 'name']
    ).execute()

    recid_map = dict(
        [(result.control_number, result.name) for result in results]
    )

    result = []
    author_html_link = u"<a href='/authors/{recid}'>{name}</a>"
    for author in papers_per_author:
        row = []
        try:
            row.append(
                author_html_link.format(
                    recid=author['key'],
                    name=recid_map[author['key']].preferred_name
                )
            )
        # BUGFIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit; only a missing map entry or a
        # missing preferred name is expected here.
        except (KeyError, AttributeError):
            # No preferred name, use value
            row.append(
                author_html_link.format(
                    recid=author['key'],
                    name=recid_map[author['key']].value
                )
            )
        row.append(author['doc_count'])
        result.append(row)

    return result
def test_regression_index_after_commit_retries_for_new_record_not_yet_in_db(
    mocked_indexing_task,
    mocked_permission_check,
    app,
):
    """Indexing a citer that is missing from the DB raises RecordGetterError."""
    # this test doesn't use the isolated_app because it needs to commit to
    # the DB in order to create records versions.
    json_data = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        'document_type': ['article'],
        'titles': [{
            'title': 'This is the record being cited'
        }],
        'control_number': 9999,
        '_collections': ['Literature']
    }

    cited = InspireRecord.create(data=json_data, skip_files=True)
    cited.commit()
    db.session.commit()
    current_search.flush_and_refresh('records-hep')

    expected_args = ('lit', 9999, 2)
    mocked_indexing_task.assert_called_with(*expected_args)
    # execute mocked task
    index_modified_citations_from_record(*expected_args)

    es_rec = get_es_record('lit', 9999)
    assert es_rec['citation_count'] == 0
    assert LiteratureSearch.citations(es_rec).total == 0

    citing_json = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        'document_type': ['article'],
        'titles': [{
            'title': 'Record citing the first one'
        }],
        '_collections': ['Literature'],
        'control_number': 8888,
        'references': [{
            'record': {
                '$ref': 'http://localhost:5000/api/literature/9999'
            },
            'reference': {
                'authors': [{
                    'full_name': 'Smith, J.'
                }],
            }
        }]
    }

    record = InspireRecord.create(data=citing_json, skip_files=True)
    record.commit()
    db.session.commit()
    current_search.flush_and_refresh('records-hep')

    expected_args = ('lit', record['control_number'], 2)
    mocked_indexing_task.assert_called_with(*expected_args)

    # execute mocked task pretending record is not committed yet to DB
    _delete_record('lit', record['control_number'])

    with pytest.raises(RecordGetterError):
        # XXX: celery in eager mode does not retry, so it raises the first time
        index_modified_citations_from_record(*expected_args)

    # Clean up the remaining test record.
    _delete_record('lit', cited['control_number'])
def test_index_after_commit_indexes_also_cites_two_records(
    mocked_indexing_task,
    mocked_permission_check,
    app,
):
    """Check that linking two references reindexes both cited records.

    NOTE(review): assumes ``mocked_indexing_task`` patches the indexing celery
    task and ``mocked_permission_check`` bypasses ACLs -- confirm in fixtures.
    """
    # this test doesn't use the isolated_app because it needs to commit to
    # the DB in order to create records versions.
    json1 = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        'document_type': ['article'],
        'titles': [{
            'title': 'This is the record being cited'
        }],
        'control_number': 9999,
        '_collections': ['Literature']
    }

    # First cited record: create, commit, and make it searchable in ES.
    cited1 = InspireRecord.create(data=json1, skip_files=True)
    cited1.commit()
    db.session.commit()
    current_search.flush_and_refresh('records-hep')

    expected_args = ('lit', cited1['control_number'], 2)
    mocked_indexing_task.assert_called_with(*expected_args)
    # execute mocked task
    index_modified_citations_from_record(*expected_args)

    json2 = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        'document_type': ['article'],
        'titles': [{
            'title': 'This also is the record being cited'
        }],
        'control_number': 9998,
        '_collections': ['Literature']
    }

    # Second cited record, same steps.
    cited2 = InspireRecord.create(data=json2, skip_files=True)
    cited2.commit()
    db.session.commit()
    current_search.flush_and_refresh('records-hep')

    expected_args = ('lit', cited2['control_number'], 2)
    mocked_indexing_task.assert_called_with(*expected_args)
    # execute mocked task
    index_modified_citations_from_record(*expected_args)

    # Neither record is cited yet.
    es_rec1 = get_es_record('lit', 9999)
    es_rec2 = get_es_record('lit', 9998)
    assert es_rec1['citation_count'] == 0
    assert es_rec2['citation_count'] == 0
    assert LiteratureSearch.citations(es_rec1).total == 0
    assert LiteratureSearch.citations(es_rec2).total == 0

    # The citing record initially has a reference with no linked `record`,
    # so it must not affect any citation counts.
    citing_json = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        'document_type': ['article'],
        'titles': [{
            'title': 'Record citing the first one'
        }],
        '_collections': ['Literature'],
        'control_number': 8888,
        'references': [{
            'reference': {
                'authors': [{
                    'full_name': 'Smith, J.'
                }],
            }
        }]
    }

    record = InspireRecord.create(data=citing_json, skip_files=True)
    record.commit()
    db.session.commit()
    current_search.flush_and_refresh('records-hep')

    expected_args = ('lit', record['control_number'], 2)
    mocked_indexing_task.assert_called_with(*expected_args)
    # execute mocked task
    index_modified_citations_from_record(*expected_args)

    # Still no citations: the reference carries no record link.
    es_rec1 = get_es_record('lit', 9999)
    es_rec2 = get_es_record('lit', 9998)
    assert es_rec1['citation_count'] == 0
    assert es_rec2['citation_count'] == 0
    assert LiteratureSearch.citations(es_rec1).total == 0
    assert LiteratureSearch.citations(es_rec2).total == 0

    # Now point the references at both cited records.
    references = {
        'references': [{
            'record': {
                '$ref': 'http://localhost:5000/api/literature/9998'
            },
        }, {
            'record': {
                '$ref': 'http://localhost:5000/api/literature/9999'
            },
        }]
    }
    citing_json.update(references)

    record.clear()
    record.update(citing_json)
    record.commit()
    db.session.commit()
    current_search.flush_and_refresh('records-hep')

    # Version bumped to 3 after the update.
    expected_args = ('lit', record['control_number'], 3)
    mocked_indexing_task.assert_called_with(*expected_args)
    # execute mocked task
    index_modified_citations_from_record(*expected_args)

    # Both cited records now have exactly one citation.
    es_rec1 = get_es_record('lit', 9999)
    es_rec2 = get_es_record('lit', 9998)
    assert es_rec1['citation_count'] == 1
    assert es_rec2['citation_count'] == 1
    assert LiteratureSearch.citations(es_rec1).total == 1
    assert LiteratureSearch.citations(es_rec2).total == 1

    # Cleanup.
    _delete_record('lit', record['control_number'])
    _delete_record('lit', cited1['control_number'])
    _delete_record('lit', cited2['control_number'])
def test_index_after_commit_indexes_also_cites_two_records(
    mocked_indexing_task,
    mocked_permission_check,
    app,
):
    """Check that linking two references reindexes both cited records.

    NOTE(review): assumes ``mocked_indexing_task`` patches the indexing celery
    task and ``mocked_permission_check`` bypasses ACLs -- confirm in fixtures.
    """
    # this test doesn't use the isolated_app because it needs to commit to
    # the DB in order to create records versions.
    json1 = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        'document_type': ['article'],
        'titles': [{'title': 'This is the record being cited'}],
        'control_number': 9999,
        '_collections': ['Literature']
    }

    # First cited record: create, commit, and make it searchable in ES.
    cited1 = InspireRecord.create(data=json1, skip_files=True)
    cited1.commit()
    db.session.commit()
    es.indices.refresh('records-hep')

    expected_args = ('lit', cited1['control_number'], 2)
    mocked_indexing_task.assert_called_with(*expected_args)
    # execute mocked task
    index_modified_citations_from_record(*expected_args)

    json2 = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        'document_type': ['article'],
        'titles': [{'title': 'This also is the record being cited'}],
        'control_number': 9998,
        '_collections': ['Literature']
    }

    # Second cited record, same steps.
    cited2 = InspireRecord.create(data=json2, skip_files=True)
    cited2.commit()
    db.session.commit()
    es.indices.refresh('records-hep')

    expected_args = ('lit', cited2['control_number'], 2)
    mocked_indexing_task.assert_called_with(*expected_args)
    # execute mocked task
    index_modified_citations_from_record(*expected_args)

    # Neither record is cited yet.
    es_rec1 = get_es_record('lit', 9999)
    es_rec2 = get_es_record('lit', 9998)
    assert es_rec1['citation_count'] == 0
    assert es_rec2['citation_count'] == 0
    assert LiteratureSearch.citations(es_rec1).total == 0
    assert LiteratureSearch.citations(es_rec2).total == 0

    # The citing record initially has a reference with no linked `record`,
    # so it must not affect any citation counts.
    citing_json = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        'document_type': ['article'],
        'titles': [{'title': 'Record citing the first one'}],
        '_collections': ['Literature'],
        'control_number': 8888,
        'references': [
            {
                'reference': {
                    'authors': [{'full_name': 'Smith, J.'}],
                }
            }
        ]
    }

    record = InspireRecord.create(data=citing_json, skip_files=True)
    record.commit()
    db.session.commit()
    es.indices.refresh('records-hep')

    expected_args = ('lit', record['control_number'], 2)
    mocked_indexing_task.assert_called_with(*expected_args)
    # execute mocked task
    index_modified_citations_from_record(*expected_args)

    # Still no citations: the reference carries no record link.
    es_rec1 = get_es_record('lit', 9999)
    es_rec2 = get_es_record('lit', 9998)
    assert es_rec1['citation_count'] == 0
    assert es_rec2['citation_count'] == 0
    assert LiteratureSearch.citations(es_rec1).total == 0
    assert LiteratureSearch.citations(es_rec2).total == 0

    # Now point the references at both cited records.
    references = {
        'references': [
            {
                'record': {
                    '$ref': 'http://localhost:5000/api/literature/9998'
                },
            },
            {
                'record': {
                    '$ref': 'http://localhost:5000/api/literature/9999'
                },
            }
        ]
    }
    citing_json.update(references)

    record.clear()
    record.update(citing_json)
    record.commit()
    db.session.commit()
    es.indices.refresh('records-hep')

    # Version bumped to 3 after the update.
    expected_args = ('lit', record['control_number'], 3)
    mocked_indexing_task.assert_called_with(*expected_args)
    # execute mocked task
    index_modified_citations_from_record(*expected_args)

    # Both cited records now have exactly one citation.
    es_rec1 = get_es_record('lit', 9999)
    es_rec2 = get_es_record('lit', 9998)
    assert es_rec1['citation_count'] == 1
    assert es_rec2['citation_count'] == 1
    assert LiteratureSearch.citations(es_rec1).total == 1
    assert LiteratureSearch.citations(es_rec2).total == 1

    # Cleanup.
    _delete_record('lit', record['control_number'])
    _delete_record('lit', cited1['control_number'])
    _delete_record('lit', cited2['control_number'])
def test_regression_index_after_commit_retries_for_new_record_not_yet_in_db(
    mocked_indexing_task,
    mocked_permission_check,
    app,
):
    """Regression test: indexing citations of a citer missing from the DB raises.

    NOTE(review): assumes ``mocked_indexing_task`` patches the indexing celery
    task and ``mocked_permission_check`` bypasses ACLs -- confirm in fixtures.
    """
    # this test doesn't use the isolated_app because it needs to commit to
    # the DB in order to create records versions.
    json_data = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        'document_type': ['article'],
        'titles': [{'title': 'This is the record being cited'}],
        'control_number': 9999,
        '_collections': ['Literature']
    }

    # Create and commit the cited record, then make it searchable in ES.
    cited = InspireRecord.create(data=json_data, skip_files=True)
    cited.commit()
    db.session.commit()
    es.indices.refresh('records-hep')

    # Committing must have enqueued the indexing task with
    # (pid_type, recid, version) == ('lit', 9999, 2).
    expected_args = ('lit', 9999, 2)
    mocked_indexing_task.assert_called_with(*expected_args)
    # execute mocked task
    index_modified_citations_from_record(*expected_args)

    # The freshly created record has no citations yet.
    es_rec = get_es_record('lit', 9999)
    assert es_rec['citation_count'] == 0
    assert LiteratureSearch.citations(es_rec).total == 0

    # A second record citing the first one via a resolved `record` $ref.
    citing_json = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        'document_type': ['article'],
        'titles': [{'title': 'Record citing the first one'}],
        '_collections': ['Literature'],
        'control_number': 8888,
        'references': [
            {
                'record': {
                    '$ref': 'http://localhost:5000/api/literature/9999'
                },
                'reference': {
                    'authors': [{'full_name': 'Smith, J.'}],
                }
            }
        ]
    }

    record = InspireRecord.create(data=citing_json, skip_files=True)
    record.commit()
    db.session.commit()
    es.indices.refresh('records-hep')

    expected_args = ('lit', record['control_number'], 2)
    mocked_indexing_task.assert_called_with(*expected_args)

    # execute mocked task pretending record is not committed yet to DB
    _delete_record('lit', record['control_number'])

    # Running the task for the now-missing citer must fail with
    # RecordGetterError instead of silently succeeding.
    with pytest.raises(RecordGetterError):
        # XXX: celery in eager mode does not retry, so it raises the first time
        index_modified_citations_from_record(*expected_args)

    # Cleanup.
    _delete_record('lit', cited['control_number'])
def serialize(self, pid, record, links_factory=None):
    """Return citation metrics for a given author recid as a JSON string.

    :param pid: Persistent identifier instance.
    :param record: Record instance.
    :param links_factory: Factory function for the link generation,
        which are added to the response.
    """
    author_pid = pid.pid_value
    fields = set()
    keywords = []
    statistics = {}
    statistics['citations'] = 0
    statistics['publications'] = 0
    statistics['types'] = {}
    statistics_citations = {}

    # Fetch only the fields needed to compute the metrics.
    search = LiteratureSearch().query({
        "match": {
            "authors.recid": author_pid
        }
    }).params(
        _source=[
            "citation_count",
            "control_number",
            "facet_inspire_doc_type",
            "facet_inspire_subjects",
            "thesaurus_terms",
        ]
    )

    for result in search.scan():
        result_source = result.to_dict()

        # Increment the count of the total number of publications.
        statistics['publications'] += 1

        # Increment the count of citations.
        citation_count = result_source.get('citation_count', 0)
        statistics['citations'] += citation_count
        statistics_citations[int(result_source['control_number'])] = \
            citation_count

        # Count how many times certain type of publication was published.
        # FIX: publication_type was previously assigned only inside the
        # try block, so a paper with no doc type reused the value from the
        # previous iteration (or raised NameError on the first one).
        try:
            publication_type = result_source.get(
                'facet_inspire_doc_type', [])[0]
        except IndexError:
            publication_type = None

        if publication_type:
            if publication_type in statistics['types']:
                statistics['types'][publication_type] += 1
            else:
                statistics['types'][publication_type] = 1

        # Get fields.
        for field in result_source.get('facet_inspire_subjects', []):
            fields.add(field)

        # Get keywords, skipping the marker added by automatic keyword
        # extraction.
        keywords.extend([
            k for k in force_force_list(
                get_value(result_source, 'thesaurus_terms.keyword'))
            if k != '* Automatic Keywords *'])

    # Calculate h-index together with i10-index.
    statistics['hindex'] = calculate_h_index(statistics_citations)
    statistics['i10index'] = calculate_i10_index(statistics_citations)

    if fields:
        statistics['fields'] = list(fields)

    # Return the top 25 keywords.
    if keywords:
        counter = Counter(keywords)
        statistics['keywords'] = [{
            'count': i[1],
            'keyword': i[0]
        } for i in counter.most_common(25)]

    return json.dumps(statistics)
def get_institution_people_datatables_rows(recid):
    """Build datatable rows listing the people working in an institution.

    :param recid: id of the institution.
    :type recid: string
    """
    literature_query = LiteratureSearch().query(
        "term", authors__affiliations__recid=recid
    )
    # FIXME: search_type=count is deprecated, but the whole function doesn't work anymore
    literature_query = literature_query.params(search_type="count")
    literature_query.aggs \
        .bucket("authors", "nested", path="authors") \
        .bucket("affiliated", "filter", term={
            "authors.affiliations.recid": recid
        }) \
        .bucket('byrecid', 'terms', field='authors.recid')
    response = literature_query.execute().to_dict()

    # Pull the per-author buckets out of the nested aggregation.
    buckets = (
        response['aggregations']['authors']['affiliated']['byrecid']['buckets']
    )
    author_recids = [int(bucket['key']) for bucket in buckets]

    # Assemble the OR-joined query against the authors index.
    authors_query = " OR ".join(
        "recid:{}".format(author_recid) for author_recid in author_recids
    )
    author_hits = AuthorsSearch().query_from_iq(
        authors_query
    ).params(
        size=9999,
        _source=['control_number', 'name']
    ).execute()

    names_by_recid = {hit.control_number: hit.name for hit in author_hits}

    author_html_link = u"<a href='/authors/{recid}'>{name}</a>"
    rows = []
    for bucket in buckets:
        try:
            link_cell = author_html_link.format(
                recid=bucket['key'],
                name=names_by_recid[bucket['key']].preferred_name
            )
        except Exception:
            # No preferred name, use value
            link_cell = author_html_link.format(
                recid=bucket['key'],
                name=names_by_recid[bucket['key']].value
            )
        rows.append([link_cell, bucket['doc_count']])
    return rows