def jobs_similar(id):
    out = ''
    es_query = RecordsSearch(index='records-jobs', doc_type='jobs')
    es_query = es_query.query(
        {
            "more_like_this": {
                "docs": [
                    {
                        "_id": id
                    }
                ],
                "min_term_freq": 0,
                "min_doc_freq": 0,
            }
        }
    )[0:2]
    similar_jobs = es_query.execute()
    for job in similar_jobs:
        out = out + (render_template_to_string(
            "inspirehep_theme/similar_jobs.html",
            record=job))
    return out
def test_es_preference_param_no_request():
    """Test that the preference param is not added when not in a request."""
    RecordsSearch.__bases__ = (SpySearch, )
    rs = RecordsSearch()
    new_rs = rs.with_preference_param()
    assert new_rs.exposed_params == {}
def test_cleanup_indexed_deposits(app, db, es, locations, users,
                                  deposit_metadata, sip_metadata_types):
    with app.test_request_context():
        datastore = app.extensions['security'].datastore
        login_user(datastore.get_user(users[0]['email']))
        id_ = uuid4()
        depid = zenodo_deposit_minter(id_, deposit_metadata)
        ZenodoDeposit.create(deposit_metadata, id_=id_)

        # Emulate a database "failure", which would wipe any models in the
        # session
        db.session.remove()
        current_search.flush_and_refresh(index='deposits')

        # Deposit has been indexed in ES, but not committed in the DB
        assert PersistentIdentifier.query.filter(
            PersistentIdentifier.pid_type == depid.pid_type,
            PersistentIdentifier.pid_value == depid.pid_value).count() == 0
        assert (RecordsSearch(index='deposits').get_record(id_).execute()
                [0]._deposit.id == depid.pid_value)

        cleanup_indexed_deposits.apply()
        current_search.flush_and_refresh(index='deposits')
        assert PersistentIdentifier.query.filter(
            PersistentIdentifier.pid_type == depid.pid_type,
            PersistentIdentifier.pid_value == depid.pid_value).count() == 0
        assert len(
            RecordsSearch(index='deposits').get_record(id_).execute()) == 0
def test_es_preference_param(app):
    """Test the preference param is correctly added in a request."""
    BaseRecordsSearch.__bases__ = (SpySearch, )
    with app.test_request_context('/', headers={'User-Agent': 'Chrome'},
                                  environ_base={'REMOTE_ADDR': '212.54.1.8'}):
        rs = RecordsSearch()
        new_rs = rs.with_preference_param()

        alg = hashlib.md5()
        encoded_user_agent = 'Chrome'.encode('utf8')
        encoded_user_string = '{ip}-{ua}'.format(ip=request.remote_addr,
                                                 ua=encoded_user_agent)
        alg.update(encoded_user_string.encode('utf8'))
        digest = alg.hexdigest()
        assert new_rs.exposed_params == dict(preference=digest)

    # Note: V2 does not require a request context
    BaseRecordsSearchV2.__bases__ = (SpySearch, )
    rs = RecordsSearchV2()
    new_rs = rs.with_preference_param(preference=1234)
    assert new_rs.exposed_params == {'preference': 1234}
def cleanup_indexed_deposits():
    """Delete indexed deposits that do not exist in the database.

    .. note::

        This task exists because deposit REST API calls sometimes fail after
        the deposit has already been sent for indexing to ES, leaving an
        inconsistent state where a deposit exists in ES but not in the
        database. It should be removed once a proper signal mechanism has
        been implemented in the ``invenio-records-rest`` and
        ``invenio-deposit`` modules.
    """
    search = RecordsSearch(index='deposits')
    q = (search.query('term', **{'_deposit.status': 'draft'})
         .source(['_deposit.id']))
    res = q.scan()
    es_depids_info = [(d.to_dict().get('_deposit.id', [None])[0], d.meta.id,
                       d.meta.index, d.meta.doc_type) for d in res]
    es_depids = {p[0] for p in es_depids_info}

    db_depids_query = PersistentIdentifier.query.filter(
        PersistentIdentifier.pid_type == 'depid',
        PersistentIdentifier.pid_value.in_(es_depids))
    db_depids = {d.pid_value for d in db_depids_query}

    missing_db_depids = filter(lambda d: d[0] not in db_depids,
                               es_depids_info)
    indexer = RecordIndexer()
    for _, deposit_id, index, doc_type in missing_db_depids:
        indexer.client.delete(id=str(deposit_id), index=index,
                              doc_type=doc_type)
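# A minimal scheduling sketch (not from the source) for running the ES/DB
# consistency sweep above periodically via Celery beat. The dotted task path
# below is an assumption; adjust it to wherever the task is actually defined.
from datetime import timedelta

CELERY_BEAT_SCHEDULE = {
    'cleanup-indexed-deposits': {
        # Hypothetical module path for the cleanup_indexed_deposits task.
        'task': 'zenodo.modules.deposit.tasks.cleanup_indexed_deposits',
        'schedule': timedelta(hours=1),
    },
}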
def test_elasticsearch_query_min_score(app):
    """Test building a query with min_score."""
    app.config.update(SEARCH_RESULTS_MIN_SCORE=0.1)

    q = RecordsSearch()
    q = q.query(Q('match', title='Higgs'))

    search_dict = q.to_dict()
    assert 'min_score' in search_dict
    assert search_dict['min_score'] == app.config['SEARCH_RESULTS_MIN_SCORE']
def perform_es_search(q, index, start=0, size=10, sort=None, fields=None):
    """Helper to use elasticsearch_dsl with Spires/Invenio syntax."""
    query = IQ(q)

    search = RecordsSearch(index=index).query(query)
    if sort:
        search = search.sort(sort)

    if fields and isinstance(fields, list):
        search = search.extra(_source={'include': fields})

    return search[start:start + size].execute()
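# Hypothetical usage sketch for perform_es_search (not from the source): fetch
# the second page of HEP records matching a Spires-style query, returning only
# the title field. The query string, index name, and sort field are
# illustrative assumptions.
results = perform_es_search('find t higgs boson', 'records-hep',
                            start=10, size=10, sort='-earliest_date',
                            fields=['titles.title'])
for hit in results:
    print(hit.meta.id)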
def _build_query(id_):
    result = RecordsSearch(index='records-jobs', doc_type='jobs')
    return result.query({
        'more_like_this': {
            'docs': [
                {
                    '_id': id_,
                },
            ],
            'min_term_freq': 0,
            'min_doc_freq': 0,
        }
    })[0:2]
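# Hypothetical companion to _build_query (not in the source): execute the
# more_like_this query and collect the ES ids of the two most similar jobs.
def get_similar_job_ids(id_):
    return [hit.meta.id for hit in _build_query(id_).execute()]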
def get_experiment_publications(experiment_name):
    """Get paper count for a given experiment.

    :param experiment_name: canonical name of the experiment.
    :type experiment_name: string
    """
    query = {
        "term": {"accelerator_experiments.experiment": experiment_name}
    }
    search = RecordsSearch(index="records-hep").query(query)
    search = search.params(search_type="count")
    return search.execute().hits.total
def get_expired_embargos(cls):
    """Get records for which the embargo period has expired."""
    endpoint = current_app.config['RECORDS_REST_ENDPOINTS']['recid']
    s = RecordsSearch(
        using=current_search_client,
        index=endpoint['search_index']
    ).query(
        'query_string',
        query='access_right:{0} AND embargo_date:{{* TO {1}}}'.format(
            cls.EMBARGOED,
            # Uses timestamp instead of date on purpose.
            datetime.utcnow().isoformat()
        ),
        allow_leading_wildcard=False
    ).fields([])
    return [hit.meta.id for hit in s.scan()]
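# Usage sketch, assuming get_expired_embargos is a classmethod on an
# AccessRight-like class (the class name here is an assumption, inferred from
# the cls.EMBARGOED reference above): collect the ids of records whose
# embargo has lapsed, e.g. for re-indexing in a periodic task.
expired_ids = AccessRight.get_expired_embargos()
for record_id in expired_ids:
    print('embargo expired for record', record_id)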
def directly_list_v2_record_ids():
    size = 100
    page = 1
    while True:
        search = RecordsSearch().params(version=True)
        search = search[(page - 1) * size:page * size]
        search_result = search.execute()
        for record in search_result.hits.hits:
            if record.get('_index') == 'records-records':
                yield record
        if size * page < search_result.hits.total:
            page += 1
        else:
            break
def delete_group_relations(group_id):
    """Delete all relations for given group ID from ES."""
    q = RecordsSearch(index='relationships').query(
        'term', Source__ID=group_id)
    # Ignore versioning conflicts when deleting
    q.params(conflicts='proceed').delete()
    q = RecordsSearch(index='relationships').query(
        'term', Target__ID=group_id)
    q.params(conflicts='proceed').delete()
def delete(user_id):
    """Delete spam."""
    # Only admin can access this view
    if not Permission(ActionNeed('admin-access')).can():
        abort(403)
    user = User.query.get(user_id)
    deleteform = DeleteSpamForm()
    communities = Community.query.filter_by(id_user=user.id)
    rs = RecordsSearch(index='records').query(
        Q('query_string', query="owners: {0}".format(user.id)))
    rec_count = rs.count()

    ctx = {
        'user': user,
        'form': deleteform,
        'is_new': False,
        'communities': communities,
        'rec_count': rec_count,
    }

    if deleteform.validate_on_submit():
        if deleteform.remove_all_communities.data:
            for c in communities:
                if not c.deleted_at:
                    if not c.description.startswith('--SPAM--'):
                        c.description = '--SPAM--' + c.description
                    if c.oaiset:
                        db.session.delete(c.oaiset)
                    c.delete()
            db.session.commit()
        if deleteform.deactivate_user.data:
            _datastore.deactivate_user(user)
            db.session.commit()
        # The delete_record function commits the session internally
        # for each deleted record
        if deleteform.remove_all_records.data:
            for r in rs.scan():
                delete_record(r.meta.id, 'spam', int(current_user.get_id()))
        flash("Spam removed", category='success')
        return redirect(url_for('.delete', user_id=user.id))
    else:
        records = islice(rs.scan(), 10)
        ctx.update(records=records)
        return render_template('zenodo_spam/delete.html', **ctx)
def test_deposit_index(db, es):
    """Test update embargoed records."""
    deposit_index_name = 'deposits-records-record-v1.0.0'
    rec1 = Record.create({
        'title': 'One',
        '_deposit': {
            'status': 'published',
            'pid': {
                'type': 'recid',
                'value': '1'
            }
        }
    })
    PersistentIdentifier.create(pid_type='recid', pid_value='1',
                                status=PIDStatus.REGISTERED,
                                object_uuid=rec1.id, object_type='rec')
    Deposit.create({
        '_deposit': {
            'status': 'published',
            'pid': {
                'type': 'recid',
                'value': '1'
            }
        }
    })
    db.session.commit()
    current_search.flush_and_refresh(deposit_index_name)
    res = RecordsSearch(index=deposit_index_name).execute()
    # Make sure the 'title' was indexed from the record
    assert res['hits']['hits'][0]['_source']['title'] == 'One'
def loans_of_transaction_library_by_item_location(self, libraries_map,
                                                  library_pid, trigger):
    """Number of circulation operations during the specified timeframe.

    Number of loans of items by location when the transaction location is
    equal to any of the library locations.

    :param libraries_map: dict - map of library pid and name
    :param library_pid: string - the library to filter with
    :param trigger: string - action name (checkin, checkout)
    :return: checkin/checkout counts keyed by item library and location
    :rtype: dict
    """
    location_pids = self._get_locations_pid(library_pid)
    search = RecordsSearch(index=LoanOperationLog.index_name)\
        .filter('range', date=self.date_range)\
        .filter('terms', loan__trigger=trigger)\
        .filter('terms', loan__transaction_location__pid=location_pids)\
        .source('loan').scan()
    stats = {}
    for s in search:
        item_library_pid = s.loan.item.library_pid
        item_library_name = libraries_map[item_library_pid]
        location_name = s.loan.item.holding.location_name
        key = f'{item_library_pid}: {item_library_name} - {location_name}'
        stats.setdefault(key, {
            'location_name': location_name,
            'checkin': 0,
            'checkout': 0
        })
        stats[key][s.loan.trigger] += 1
    return stats
def get_institution_experiments_from_es(icn):
    """Get experiments from a given institution.

    To avoid killing ElasticSearch the number of experiments is limited.

    :param icn: Institution canonical name.
    :type icn: string
    """
    query = {
        "term": {"affiliation": icn}
    }
    search = RecordsSearch(index="records-experiments").query(query)[:100]
    search = search.sort('-earliest_date')
    return search.execute().hits
def check_and_handle_spam(community=None, deposit=None):
    """Checks community/deposit metadata for spam."""
    try:
        if current_app.config.get('ZENODO_SPAM_MODEL_LOCATION'):
            if community:
                task = check_metadata_for_spam.delay(
                    community_id=community.id)
            if deposit:
                task = check_metadata_for_spam.delay(dep_id=str(deposit.id))
            spam_proba = task.get(
                timeout=current_app.config['ZENODO_SPAM_CHECK_TIMEOUT'])
        else:
            spam_proba = 0
        if spam_proba > current_app.config['ZENODO_SPAM_THRESHOLD']:
            if not Permission(ActionNeed('admin-access')).can():
                has_records = RecordsSearch(index='records').query(
                    Q('query_string',
                      query="owners:{}".format(community.id_user))).count()
                has_communities = Community.query.filter_by(
                    id_user=community.id_user).count() - 1
                if not (has_records or has_communities):
                    current_app.config['ZENODO_SPAM_HANDLING_ACTIONS'](
                        community=community, deposit=deposit)
    except HTTPException:
        raise
    except Exception:
        current_app.logger.exception(u'Could not check for spam')
def delete_group_relations(group_ids: Iterable[str]):
    """Delete all relations for given group IDs from ES."""
    RecordsSearch(index='relationships').query(
        'bool',
        should=[
            Q('terms', Source__ID=list(group_ids)),
            Q('terms', Target__ID=list(group_ids)),
        ]).params(conflicts='proceed').delete()  # ignore versioning conflicts
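# Hypothetical call site (not from the source): purge every relationship
# touching a batch of group ids in a single delete-by-query round trip,
# instead of issuing two requests per id as in the single-id variant above.
delete_group_relations({'group-1', 'group-2', 'group-3'})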
def dump_operation_logs(outfile_name, year):
    """Dump operation log records into a given file.

    :param outfile_name: JSON operation log output file name.
    :param year: year of the index to dump, or None for the base index.
    """
    click.secho('Dumping operation log records:', fg='green')
    index_name = OperationLog.index_name
    if year is not None:
        index_name = f'{index_name}-{year}'
    search = RecordsSearch(index=index_name)
    index_count = 0
    outfile = JsonWriter(outfile_name)
    with click.progressbar(search.scan(), length=search.count()) as bar:
        for oplg in bar:
            outfile.write(str(oplg.to_dict()))
            index_count += 1
    click.echo(f'created {index_count} operation logs.')
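# A test-style invocation sketch, assuming dump_operation_logs is registered
# as a click command with an OUTFILE_NAME argument and a --year option; both
# the registration and the option spelling are assumptions, since the
# decorators are not shown in the snippet above.
from click.testing import CliRunner

runner = CliRunner()
result = runner.invoke(dump_operation_logs,
                       ['/tmp/oplogs.json', '--year', '2022'])
assert result.exit_code == 0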
def check_and_handle_spam(community=None, deposit=None, retry=True):
    """Checks community/deposit metadata for spam."""
    try:
        if current_app.config.get('ZENODO_SPAM_MODEL_LOCATION'):
            if community:
                task = check_metadata_for_spam.delay(
                    community_id=community.id)
                user_id = community.id_user
            if deposit:
                task = check_metadata_for_spam.delay(dep_id=str(deposit.id))
                user_id = deposit['owners'][0]
            spam_proba = task.get(
                timeout=current_app.config['ZENODO_SPAM_CHECK_TIMEOUT'])
        else:
            spam_proba = 0
        if spam_proba > current_app.config['ZENODO_SPAM_THRESHOLD']:
            if not Permission(ActionNeed('admin-access')).can():
                user_records = RecordsSearch(index='records').query(
                    Q('query_string',
                      query="owners:{}".format(user_id))).count()
                user_communities = Community.query.filter_by(
                    id_user=user_id).count()
                if community:
                    # Ignore the newly created community
                    user_communities = user_communities - 1
                current_app.logger.warning(
                    u'Found spam upload',
                    extra={
                        'depid': deposit.id if deposit else None,
                        'comid': community.id if community else None
                    })
                if not (user_records + user_communities >
                        current_app.config['ZENODO_SPAM_SKIP_CHECK_NUM']):
                    current_app.config['ZENODO_SPAM_HANDLING_ACTIONS'](
                        community=community, deposit=deposit)
    except HTTPException:
        raise
    except TimeoutError:
        if retry:
            check_and_handle_spam(community=community, deposit=deposit,
                                  retry=False)
        else:
            current_app.logger.exception(
                u'Could not check for spam',
                extra={
                    'depid': deposit.id if deposit else None,
                    'comid': community.id if community else None
                })
    except Exception:
        current_app.logger.exception(
            u'Could not check for spam',
            extra={
                'depid': deposit.id if deposit else None,
                'comid': community.id if community else None
            })
def test_filter_by_patron(app, patron_pid, qs, should_raise):
    """Test the function filter_by_patron."""
    search = RecordsSearch()
    if should_raise:
        with pytest.raises(UnauthorizedSearchError):
            _filter_by_patron(patron_pid, search, qs)
    else:
        _search, _qs = _filter_by_patron(patron_pid, search, qs)
        term = _search.to_dict()["query"]["bool"]["filter"][0]["term"]
        assert term == {"patron_pid": patron_pid}
def test():
    # This function renders the test.html page. The HTML file is a Jinja
    # template; its content is generated dynamically by creating sections
    # from the passed records. The records to display are retrieved with
    # the query made by the "RecordsSearch" class.
    return render_template(
        "gkhext/test.html",
        invenio_records=RecordsSearch().sort("-created").execute()
    )
def test_prefix_index_from_kwargs(app):
    """Test that the index is prefixed when passed through kwargs."""
    prefix_value = 'myprefix-'
    index_value = 'myindex'
    app.config['SEARCH_INDEX_PREFIX'] = prefix_value
    prefixed_index = ['{}{}'.format(prefix_value, index_value)]

    q = RecordsSearch(index=index_value)

    _test_original_index_is_stored_when_prefixing(q, prefixed_index,
                                                  [index_value])
def suggest():
    """Power typeahead.js search bar suggestions."""
    field = request.values.get('field')
    query = request.values.get('query')

    search = RecordsSearch(index='records-hep', doc_type='hep')
    search = search.suggest('suggestions', query,
                            completion={"field": field})
    suggestions = search.execute_suggest()

    if field == "authors.name_suggest":
        bai_name_map = {}
        for suggestion in suggestions['suggestions'][0]['options']:
            bai = suggestion['payload']['bai']
            if bai in bai_name_map:
                bai_name_map[bai].append(suggestion['text'])
            else:
                bai_name_map[bai] = [suggestion['text']]

        result = []
        for key, value in six.iteritems(bai_name_map):
            result.append({
                'name': max(value, key=len),
                'value': key,
                'template': 'author'
            })

        return jsonify({'results': result})

    return jsonify({
        'results': [
            {'value': s['text']}
            for s in suggestions['suggestions'][0]['options']
        ]
    })
def spam_check(self):
    """Checks deposit metadata for spam content."""
    try:
        if current_app.config.get('ZENODO_SPAM_MODEL_LOCATION'):
            task = check_metadata_for_spam.delay(str(self.id))
            spam_proba = task.get(
                timeout=current_app.config['ZENODO_SPAM_CHECK_TIMEOUT'])
        else:
            spam_proba = 0
        if spam_proba > current_app.config['ZENODO_SPAM_THRESHOLD']:
            if not Permission(ActionNeed('admin-access')).can():
                rs = RecordsSearch(index='records').query(
                    Q('query_string',
                      query="owners:{}".format(self['owners'][0])))
                if not rs.count():
                    current_app.config['ZENODO_SPAM_HANDLING_ACTIONS'](self)
    except HTTPException:
        raise
    except Exception:
        current_app.logger.exception(u'Could not check deposit for spam')
def get_record_stats(recordid, throws=True):
    """Fetch record statistics from Elasticsearch."""
    try:
        res = (RecordsSearch()
               .source(include='_stats')  # only include the "_stats" field
               .get_record(recordid)
               .execute())
        return res[0]._stats.to_dict() if res else None
    except Exception:
        if throws:
            raise
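# Hypothetical usage of get_record_stats (not from the source): tolerate a
# missing "_stats" document by passing throws=False and defaulting to {};
# record_uuid is a placeholder for a real record id.
stats = get_record_stats(record_uuid, throws=False) or {}
print(stats.get('views', 0))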
def test_prefix_index_list(app):
    """Test that indices are prefixed when passed as a list."""
    prefix_value = 'myprefix-'
    index_value = ['myindex', 'myanotherindex']
    app.config['SEARCH_INDEX_PREFIX'] = prefix_value
    prefixed_index = [
        '{}{}'.format(prefix_value, _index) for _index in index_value
    ]

    q = RecordsSearch(index=index_value)

    _test_original_index_is_stored_when_prefixing(q, prefixed_index,
                                                  index_value)
def number_of_circ_operations(self, library_pid, trigger):
    """Number of circulation operations during the specified timeframe.

    :param library_pid: string - the library to filter with
    :param trigger: string - action name
    :return: the number of matched circulation operations
    :rtype: integer
    """
    return RecordsSearch(index=LoanOperationLog.index_name)\
        .filter('range', date=self.date_range)\
        .filter('term', loan__trigger=trigger)\
        .filter('term', loan__item__library_pid=library_pid)\
        .count()
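# Illustrative assumption (not from the source) about the shape of
# self.date_range shared by the stats methods above and below: an
# Elasticsearch range-query body bounding the reporting timeframe.
date_range = {'gte': '2023-01-01T00:00:00', 'lte': '2023-01-31T23:59:59'}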
def new_documents(self, library_pid):
    """Number of new documents per library for given time interval.

    :param library_pid: string - the library to filter with
    :return: the number of matched documents
    :rtype: integer
    """
    return RecordsSearch(index=LoanOperationLog.index_name)\
        .filter('range', date=self.date_range)\
        .filter('term', operation='create')\
        .filter('term', record__type='doc')\
        .filter('term', library__value=library_pid)\
        .count()
def number_of_deleted_items(self, library_pid):
    """Number of deleted items during the specified timeframe.

    :param library_pid: string - the library to filter with
    :return: the number of matched deleted items
    :rtype: integer
    """
    return RecordsSearch(index=LoanOperationLog.index_name)\
        .filter('range', date=self.date_range)\
        .filter('term', operation='delete')\
        .filter('term', record__type='item')\
        .filter('term', library__pid=library_pid)\
        .count()
def test_large_stats(app, db, es, locations, event_queues, minimal_record):
    """Test record page view event import."""
    search = Search(using=es)
    records = create_stats_fixtures(
        # (3 * 4) -> 12 records and (3 * 4 * 2) -> 24 files
        metadata=minimal_record, n_records=3, n_versions=4, n_files=2,
        event_data={'user_id': '1'},
        # (31 + 30) * 2 -> 122 event timestamps (61 days and 2 events/day)
        start_date=datetime(2018, 3, 1),
        end_date=datetime(2018, 5, 1),
        interval=timedelta(hours=12))

    # Events indices
    # 4 versions * 3 records * 2 files * 122 events -> 2928
    assert search.index('events-stats-file-download').count() == 2928
    # 4 versions * 3 records * 122 events -> 1464
    assert search.index('events-stats-record-view').count() == 1464

    # Aggregations indices
    # (4 versions + 1 concept) * 3 records -> 15 documents + 2 bookmarks
    q = search.index('stats-file-download')
    q = q.doc_type('file-download-day-aggregation')
    assert q.count() == 915  # 61 days * 15 records

    q = search.index('stats-record-view')
    q = q.doc_type('record-view-day-aggregation')
    assert q.count() == 915  # 61 days * 15 records

    # Records index
    for _, record, _ in records:
        doc = (RecordsSearch().get_record(record.id)
               .source(include='_stats').execute()[0])
        assert doc['_stats'] == {
            # 122 view events per record (61 days * 2 events/day)
            'views': 122.0,
            'version_views': 488.0,  # 122 views * 4 versions
            # each view event falls in a distinct hour
            'unique_views': 122.0,
            'version_unique_views': 122.0,
            # 122 download events * 2 files
            'downloads': 244.0,
            'version_downloads': 976.0,  # 244 downloads * 4 versions
            # unique downloads are deduplicated like unique views
            'unique_downloads': 122.0,
            'version_unique_downloads': 122.0,
            # 244 downloads * 10 bytes
            'volume': 2440.0,
            'version_volume': 9760.0,
        }
def renewals(self, library_pid, trigger):
    """Number of items with loan extended.

    Number of items with loan extended per library for given time interval.

    :param library_pid: string - the library to filter with
    :param trigger: string - action name extend
    :return: the number of matched documents
    :rtype: integer
    """
    return RecordsSearch(index=LoanOperationLog.index_name)\
        .filter('range', date=self.date_range)\
        .filter('terms', loan__trigger=trigger)\
        .filter('term', loan__item__library_pid=library_pid)\
        .count()
def checkouts_for_owning_library(self, library_pid, trigger):
    """Number of circulation operations during the specified timeframe.

    Number of loans of items per library when the item is owned by the
    library.

    :param library_pid: string - the library to filter with
    :param trigger: string - action name (checkout)
    :return: the number of matched circulation operations
    :rtype: integer
    """
    return RecordsSearch(index=LoanOperationLog.index_name)\
        .filter('range', date=self.date_range)\
        .filter('terms', loan__trigger=trigger)\
        .filter('term', loan__item__library_pid=library_pid)\
        .count()
def test_prefix_multi_index_string(app):
    """Test that each index is prefixed when passed as a comma-separated string."""
    prefix_value = 'myprefix-'
    index_value = 'myindex,myanotherindex'
    app.config['SEARCH_INDEX_PREFIX'] = prefix_value
    prefixed_index = [
        ','.join([
            '{}{}'.format(prefix_value, _index)
            for _index in index_value.split(',')
        ])
    ]

    q = RecordsSearch(index=index_value)

    _test_original_index_is_stored_when_prefixing(q, prefixed_index,
                                                  [index_value])
def assert_es_equals_db():
    """Assert that the relationships in ES match the GroupRelationships in DB.

    NOTE: This test takes the state of the DB as the reference for
    comparison.
    """
    # Wait for ES to be available
    current_search.flush_and_refresh('relationships')
    # Fetch all DB objects and all ES objects
    es_q = list(RecordsSearch(index='relationships').query().scan())
    db_q = GroupRelationship.query.all()
    # Normalize and compare the two sets
    es_norm_q = list(map(normalize_es_result, es_q))
    db_norm_q = list(map(normalize_db_result, db_q))
    assert set(es_norm_q) == set(db_norm_q)
def validated_requests(self, library_pid, trigger):
    """Number of validated requests.

    Number of validated requests per library for given time interval.
    The match is done on the library of the librarian.
    Note: trigger is 'validate' and not 'validate_request'.

    :param library_pid: string - the library to filter with
    :param trigger: string - action name validate
    :return: the number of matched documents
    :rtype: integer
    """
    return RecordsSearch(index=LoanOperationLog.index_name)\
        .filter('range', date=self.date_range)\
        .filter('terms', loan__trigger=trigger)\
        .filter('term', library__value=library_pid)\
        .count()
def test_empty_query(app):
    """Test building an empty query."""
    with app.app_context():
        q = RecordsSearch()
        assert q.to_dict()['query'] == {'match_all': {}}

        q = RecordsSearch.faceted_search('')
        assert q._s.to_dict()['query'] == {'match_all': {}}

        q = RecordsSearch()[10]
        assert q.to_dict()['from'] == 10
        assert q.to_dict()['size'] == 1

        q = q[10:20]
        assert q.to_dict()['from'] == 10
        assert q.to_dict()['size'] == 10

        q = q.sort({'field1': {'order': 'asc'}})
        assert q.to_dict()['sort'][0] == {'field1': {'order': 'asc'}}

        q = q.sort()
        assert 'sort' not in q.to_dict()

        q = q.sort('-field1')
        assert q.to_dict()['sort'][0] == {'field1': {'order': 'desc'}}

        q = q.sort('field2', {'field3': {'order': 'asc'}})
        assert q.to_dict()['sort'][0] == 'field2'
        assert q.to_dict()['sort'][1] == {'field3': {'order': 'asc'}}
        q.sort()

        q = RecordsSearch()
        q = q.highlight('field1', index_options='offsets')
        assert len(q.to_dict()['highlight']['fields']) == 1
        assert q.to_dict()['highlight']['fields']['field1'] == {
            'index_options': 'offsets'
        }

        q = q.highlight('field2')
        assert len(q.to_dict()['highlight']['fields']) == 2
        assert q.to_dict()['highlight']['fields']['field1'] == {
            'index_options': 'offsets'
        }
        assert q.to_dict()['highlight']['fields']['field2'] == {}

        q = q.highlight()
        assert 'highlight' not in q.to_dict()