def index_mapping(self, **kwargs):
    cherrypy.log(str(kwargs))
    output = {'success': False}
    index = kwargs.get('index')
    if index is None:
        msg = "Error: 'index' argument is required"
        output['msg'] = msg
        return output
    idx = Index(index)
    if not idx.exists():
        msg = "Error: index {} does not exist".format(index)
        output['msg'] = msg
        return output
    try:
        mapping = idx.get_mapping()
    except Exception as e:
        msg = "Error: index mapping: reason {}".format(e)
        output['msg'] = msg
        return output
    output['success'] = True
    output['mapping'] = mapping
    return output
        '_type': 'document',
        'path': f,
        'text': text
    })

client = Elasticsearch()

# Tokenizers: whitespace classic standard letter
my_analyzer = analyzer('default',
                       type='custom',
                       tokenizer=tokenizer(args.token),
                       filter=args.filter)

try:
    # Drop index if it exists
    ind = Index(index, using=client)
    ind.delete()
except NotFoundError:
    pass

# then create it
ind.settings(number_of_shards=1)
ind.create()

ind = Index(index, using=client)

# configure default analyzer
ind.close()  # index must be closed for configuring analyzer
ind.analyzer(my_analyzer)

# configure the path field so it is not tokenized and we can do exact match search
client.indices.put_mapping(
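# Hedged completion sketch for the truncated put_mapping call above (not from the
# original source): assuming a pre-7.x cluster that still uses the 'document' mapping
# type, the path field could be declared as a keyword so exact-match search works,
# and the index reopened afterwards.
client.indices.put_mapping(
    index=index,
    doc_type='document',
    body={'properties': {'path': {'type': 'keyword'}}},
)
ind.open()  # the index was closed above while configuring the analyzer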
def construct_query(self, system=None, file_path=None, **kwargs):
    project_query_fields = [
        "projectId", "title", "description", "doi", "publications", "pis", "name"
    ]

    published_index_name = list(
        Index(settings.ES_INDEX_PREFIX.format(
            'publications')).get_alias().keys())[0]
    legacy_index_name = list(
        Index(settings.ES_INDEX_PREFIX.format(
            'publications-legacy')).get_alias().keys())[0]

    filter_queries = []
    if kwargs.get('type_filters'):
        for type_filter in kwargs['type_filters']:
            if type_filter == 'nees':
                type_query = Q({'term': {'_index': legacy_index_name}})
            else:
                type_query = Q(
                    'term',
                    **{'project.value.projectType._exact': type_filter})
            filter_queries.append(type_query)

    ds_user_query = Q({
        "nested": {
            "path": "users",
            "ignore_unmapped": True,
            "query": {
                "query_string": {
                    "query": self.query_string,
                    "fields": [
                        "users.first_name", "users.last_name", "users.username"
                    ],
                    "lenient": True
                }
            }
        }
    })
    nees_pi_query = Q({
        "nested": {
            "path": "pis",
            "ignore_unmapped": True,
            "query": {
                "query_string": {
                    "query": self.query_string,
                    "fields": ["pis.firstName", "pis.lastName"],
                    "lenient": True
                }
            }
        }
    })
    pub_query = Q('query_string',
                  query=self.query_string,
                  default_operator='and',
                  fields=project_query_fields)

    published_query = Q(
        'bool',
        must=[
            Q('bool', should=[ds_user_query, nees_pi_query, pub_query]),
            Q({'term': {'_index': legacy_index_name}}),
        ],
        must_not=[
            Q('term', status='unpublished'),
            Q('term', status='saved')
        ])

    return published_query
def exists():
    return Index(APIDoc.Index.name).exists()
def delete():
    Index(APIDoc.Index.name).delete()
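# Minimal usage sketch for the two helpers above (assumes APIDoc is an
# elasticsearch_dsl Document whose inner Index class names the target index):
# drop any stale index, then let Document.init() recreate it with the declared mapping.
if exists():
    delete()
APIDoc.init()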
# coding:utf-8
'''
@author = super_fazai
@File : search.py
@Time : 2017/8/11 10:41
@connect : [email protected]
'''

from elasticsearch_dsl import (
    DocType,
    Index,
)
from scrapy import Field


class Post():
    id = Field()


posts = Index('posts')


@posts.doc_type
class PostDocument(DocType):
    class Meta:
        model = Post
        fields = [
            'id',
        ]
def delete():
    try:
        Index(INDEX_NAME).delete()
    except elasticsearch.exceptions.NotFoundError:
        log.info(
            'Could not delete non-existent index, creating new index...')
import os

from elasticsearch_dsl import (Index, tokenizer, analyzer)
from pprint import pprint

movie_index: Index = Index(os.environ.get('ES_INDEX', 'moovie'))
movie_index.settings(number_of_shards=5, number_of_replicas=1)

completion_analyzer = analyzer('completion_analyzer',
                               tokenizer=tokenizer('trigram', 'nGram',
                                                   min_gram=3, max_gram=3),
                               filter=['lowercase'])

normalization_analyzer = analyzer('normalization_analyzer',
                                  tokenizer="standard",
                                  filter=["lowercase", "stop", "snowball"],
                                  char_filter=["html_strip"])

movie_index.analyzer(normalization_analyzer)


def init_index():
    if not movie_index.exists():
        movie_index.create()


def destroy_index():
    if movie_index.exists():
        movie_index.delete(ignore=404)
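# Hedged note, not from the original: completion_analyzer is defined above but never
# attached to the index. If the trigram analyzer is meant to back suggestions, it can
# be registered the same way as normalization_analyzer before the index is created.
movie_index.analyzer(completion_analyzer)
init_index()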
def get_fields_meta_index():
    fields_meta = Index(constants.FIELDS_INDEX, using=ElasticInstance.get())
    add_analyzer(fields_meta)
    return fields_meta
from django.template.loader import render_to_string
from elasticsearch_dsl import Keyword
from django.urls import reverse

from abstract.elastic_models import (
    BASIC_INDEX_SETTINGS,
    AbstractDatasetMapping,
    namesAutocompleteAnalyzer,
    namesAutocompleteSearchAnalyzer,
    ukrainianAddressesStopwordsAnalyzer,
)
from elasticsearch_dsl import DocType, Index

LETS_PARTY_INDEX = "ragoogle_lets_party"
lets_party_idx = Index(LETS_PARTY_INDEX)
lets_party_idx.settings(**BASIC_INDEX_SETTINGS)

lets_party_idx.analyzer(namesAutocompleteAnalyzer)
lets_party_idx.analyzer(namesAutocompleteSearchAnalyzer)
lets_party_idx.analyzer(ukrainianAddressesStopwordsAnalyzer)


@lets_party_idx.doc_type
class ElasticLetsPartyModel(AbstractDatasetMapping):
    start_date = Keyword()
    end_date = Keyword()

    def render_infocard(self):
        from .apps import LetsPartyConfig as AppConfig

        return render_to_string(
def index(name):
    if settings.ELASTIC_SEARCH["index_prefix"]:
        name = "%s-%s" % (settings.ELASTIC_SEARCH["index_prefix"], name)
    return Index(name)
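# Minimal usage sketch (hypothetical index name, not from the original): the helper
# prepends the configured prefix so several deployments can share one cluster without
# index-name collisions.
articles = index("articles")
if not articles.exists():
    articles.create()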
def elasticsearch_status(request):
    client = get_es_client()

    disk_fields = ['node', 'disk.avail', 'disk.used', 'disk.percent']
    disk_status = [{
        _to_camel_case(field.replace('.', '_')): disk[field]
        for field in disk_fields
    } for disk in client.cat.allocation(format="json", h=','.join(disk_fields))]

    index_fields = ['index', 'docs.count', 'store.size', 'creation.date.string']
    indices = [{
        _to_camel_case(field.replace('.', '_')): index[field]
        for field in index_fields
    } for index in client.cat.indices(format="json", h=','.join(index_fields))
      if all(not index['index'].startswith(omit_prefix)
             for omit_prefix in ['.', 'index_operations_log'])]

    aliases = defaultdict(list)
    for alias in client.cat.aliases(format="json", h='alias,index'):
        aliases[alias['alias']].append(alias['index'])

    mappings = Index('_all', using=client).get_mapping(doc_type='variant')

    active_samples = Sample.objects.filter(
        dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS,
        is_active=True,
        elasticsearch_index__isnull=False,
    ).prefetch_related('individual', 'individual__family')
    prefetch_related_objects(active_samples, 'individual__family__project')

    seqr_index_projects = defaultdict(lambda: defaultdict(set))
    es_projects = set()
    for sample in active_samples:
        for index_name in sample.elasticsearch_index.split(','):
            project = sample.individual.family.project
            es_projects.add(project)
            if index_name in aliases:
                for aliased_index_name in aliases[index_name]:
                    seqr_index_projects[aliased_index_name][project].add(
                        sample.individual.guid)
            else:
                seqr_index_projects[index_name.rstrip('*')][project].add(
                    sample.individual.guid)

    for index in indices:
        index_name = index['index']
        index_mapping = mappings[index_name]['mappings']['variant']
        index.update(index_mapping.get('_meta', {}))

        projects_for_index = []
        for index_prefix in list(seqr_index_projects.keys()):
            if index_name.startswith(index_prefix):
                projects_for_index += seqr_index_projects.pop(index_prefix).keys()
        index['projects'] = [{
            'projectGuid': project.guid,
            'projectName': project.name
        } for project in projects_for_index]

    errors = [
        '{} does not exist and is used by project(s) {}'.format(
            index, ', '.join([
                '{} ({} samples)'.format(p.name, len(indivs))
                for p, indivs in project_individuals.items()
            ])) for index, project_individuals in seqr_index_projects.items()
        if project_individuals
    ]

    return create_json_response({
        'indices': indices,
        'diskStats': disk_status,
        'elasticsearchHost': ELASTICSEARCH_SERVER,
        'errors': errors,
    })
def setup_teardown(es_url, es_object):
    os.environ["ES_URL"] = es_url
    yield
    Index("store", using=es_object.connection).delete()
def main():
    # Disable warning about not verifying certificates
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    args = parse_args()
    es = Elasticsearch([args.es], verify_certs=args.verify_certs, timeout=35)
    if args.es_out:
        es_out = Elasticsearch([args.es_out], verify_certs=args.verify_certs,
                               timeout=35)
    else:
        es_out = es
    es_index = args.es_index
    es_index_out = args.es_index_out

    index = Index(name=es_index, using=es)
    mappings = index.get_field_mapping(doc_type='items',
                                       fields='is_git_commit,is_gerrit_review')
    for mapping in mappings.values():
        fields = mapping['mappings']['items']
        if 'is_git_commit' in fields:
            src = 'git'
        elif 'is_gerrit_review' in fields:
            src = 'gerrit'
        else:
            print("I couldn't identify data source for index, exiting")
            exit(0)
    print("Identified data source for index: " + src)

    src_fields = {
        'git': {
            'date': 'author_date',
            'repo': 'repo_name',
            'cardinal': 'hash'
        },
        'gerrit': {
            'date': 'opened',
            'repo': 'repository',
            'cardinal': 'number'
        }
    }

    def get_authors(es, es_index, src):
        date_field = src_fields[src]['date']
        repo_field = src_fields[src]['repo']
        cardinal_field = src_fields[src]['cardinal']

        # Buckets by author name, finding first commit for each of them
        s = Search(using=es, index=es_index)
        s.aggs.bucket('by_authors', 'terms', field='author_name', size=100000) \
            .bucket('repos', 'terms', field=repo_field, size=10000) \
            .metric('first_item', 'top_hits',
                    _source=[date_field, 'author_org_name', 'author_uuid', 'project'],
                    size=1, sort=[{date_field: {"order": "asc"}}]) \
            .metric('last_item', 'max', field=date_field) \
            .metric('contribs', 'cardinality', field=cardinal_field)
        s = s.sort(date_field)
        result = s.execute()

        # Get a dataframe with each author and their first commit
        buckets_result = result['aggregations']['by_authors']['buckets']
        buckets = []
        for bucket_author in buckets_result:
            author = bucket_author['key']
            first_all = None
            last_all = None
            buckets_author = []
            for bucket_repo in bucket_author['repos']['buckets']:
                first_item = bucket_repo['first_item']['hits']['hits'][0]
                first_date = first_item['sort'][0] / 1000
                first = datetime.utcfromtimestamp(first_date)
                if first_all is None or first < first_all:
                    first_all = first
                last_date = bucket_repo['last_item']['value'] / 1000
                last = datetime.utcfromtimestamp(last_date)
                if last_all is None or last > last_all:
                    last_all = last
                contribs = bucket_repo['contribs']['value']
                org_name = first_item['_source']['author_org_name']
                project = first_item['_source']['project']
                uuid = first_item['_source']['author_uuid']
                buckets_author.append(
                    {'first': first, 'last': last, 'author_name': author,
                     'contribs': contribs, 'uuid': uuid,
                     'author_org_name': org_name,
                     'repo_name': bucket_repo['key'], 'project': project}
                )
            for bucket in buckets_author:
                bucket['first_all'] = first_all
                bucket['last_all'] = last_all
                buckets.append(bucket)

        authors_repos = pd.DataFrame.from_records(buckets)
        authors_repos.sort_values(by='first', ascending=False, inplace=True)
        return authors_repos

    authors_repos = get_authors(es, es_index, src=src)
    authors = authors_repos.groupby('author_name').last().reset_index()
    authors.sort_values(by='first', ascending=False, inplace=True)

    print("Creating CSV for first date for authors: " + new_file)
    authors.to_csv(new_file,
                   columns=['first', 'last', 'author_name', 'contribs',
                            'author_org_name', 'repo_name', 'project'],
                   index=False)

    def mapping_es(es, es_index):
        mapping = Mapping('items')
        mapping.field('author_name', String(index='not_analyzed'))
        mapping.field('first', Date())
        mapping.field('last', Date())
        mapping.field('first_all', Date())
        mapping.field('last_all', Date())
        mapping.field('contribs', 'integer')
        mapping.field('author_org_name', String(index='not_analyzed'))
        mapping.field('repo_name', String(index='not_analyzed'))
        mapping.field('project', String(index='not_analyzed'))
        mapping.field('uuid', String(index='not_analyzed'))

        print("Uploading mapping to ElasticSearch")
        mapping.save(es_index, using=es)

    def upload_es(es, es_index, df, columns):
        es_type = 'items'

        actions = []
        for row in df[columns].to_dict(orient='records'):
            id_src = row['uuid'] + row['repo_name']
            id = hashlib.sha1(
                id_src.encode('utf-8', errors='surrogateescape')).hexdigest()
            to_write = {
                '_op_type': 'index',
                '_index': es_index,
                '_type': es_type,
                '_id': id
            }
            to_write.update(row)
            actions.append(to_write)

        print("Uploading to ElasticSearch")
        result = elasticsearch.helpers.bulk(es, actions, raise_on_error=True,
                                            stats_only=True)
        print("Bulk upload result (successful / errors): ", result)

    mapping_es(es_out, es_index_out)
    upload_es(es_out, es_index_out, authors_repos,
              ['first', 'last', 'first_all', 'last_all', 'author_name', 'uuid',
               'contribs', 'author_org_name', 'repo_name', 'project'])
def test_conflicting_analyzer_raises_error():
    i = Index('i')
    i.analyzer('my_analyzer', tokenizer='whitespace',
               filter=['lowercase', 'stop'])

    with raises(ValueError):
        i.analyzer('my_analyzer', tokenizer='keyword',
                   filter=['lowercase', 'stop'])
def __init__(self):
    movies = Index('imdb', using=es)
    movies.doc_type(Movie)
    movies.delete(ignore=404)
    movies.create()
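# Hedged usage sketch (the field names are assumptions, not from the original): once
# the index has been recreated, documents go in through the registered Movie doc_type.
movie = Movie(title='Heat', year=1995, meta={'id': 1})
movie.save(using=es)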
from django.conf import settings
from elasticsearch_dsl import (
    DocType,
    Float,
    Index,
    Keyword,
    Text,
    analyzer,
    token_filter,
)

edge_ngram_analyzer = analyzer(
    'edge_ngram_analyzer',
    type='custom',
    tokenizer='standard',
    filter=[
        'lowercase',
        token_filter('edge_ngram_filter', type='edgeNGram',
                     min_gram=1, max_gram=20)
    ]
)


class TitleDoc(DocType):
    id = Keyword()
    domain = Keyword(required=True)
    url = Keyword(required=True, index=False)
    title = Text(required=True, analyzer=edge_ngram_analyzer,
                 search_analyzer='standard')
    popularity = Float()
    group = Keyword()


# create an index and register the doc types
index = Index(settings.ES_INDEX)
index.settings(**settings.ES_INDEX_SETTINGS)
index.doc_type(TitleDoc)
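# Hedged usage sketch (query text and printed fields are illustrative, not from the
# original): edge n-grams at index time plus the plain standard search_analyzer make
# prefix-style matches work, so a partial word already hits full titles.
index.create(ignore=400)  # 400 = "index already exists"
results = TitleDoc.search().query('match', title='goo').execute()
for hit in results:
    print(hit.domain, hit.title)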
def index(self):
    return Index(self.index_name)
def index_doc(doc, index_name="wiki-dumps"):
    global es
    index = Index(index_name)
    index.create()
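# Hedged continuation sketch (the original snippet stops after create()): guard the
# create call so reruns do not fail, then index the document through the global client;
# "doc" is assumed to be a plain dict.
def index_doc_safe(doc, index_name="wiki-dumps"):
    global es
    index = Index(index_name)
    if not index.exists():
        index.create()
    es.index(index=index_name, body=doc)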
if address.get("streetNumber"): address["streetAddress"] += ", " + address["streetNumber"] address["fullAddress"] += ", " + address["streetNumber"] del address["streetNumber"] except (ValueError, KeyError, IndexError) as e: print(e) return address return address class Meta: index = ADDRESSES_INDEX addresses_idx = Index(ADDRESSES_INDEX) addresses_idx.settings(number_of_shards=settings.NUM_THREADS, number_of_replicas=0) addresses_idx.doc_type(Address) shingle_analyzer = analyzer( "shingleAnalyzer", tokenizer=tokenizer( "ukrainianTokenizer", type="pattern", pattern="[А-ЯЄІЇҐа-яєіїґA-Za-z0-9']+" ), filter=[ token_filter( "shingleFilter", type="shingle", max_shingle_size=5,
def exists():
    return Index(INDEX_NAME).exists()
import os

import faker
from elasticsearch_dsl import Index, Search
from django.test import TestCase
from django_datajsonar.tasks import read_datajson
from django_datajsonar.models import ReadDataJsonTask, Node, Field as datajsonar_Field
from elasticsearch_dsl.connections import connections

from series_tiempo_ar_api.apps.metadata.indexer.catalog_meta_indexer import CatalogMetadataIndexer
from series_tiempo_ar_api.apps.metadata.indexer.index import add_analyzer
from series_tiempo_ar_api.apps.metadata.models import IndexMetadataTask
from series_tiempo_ar_api.apps.management import meta_keys

SAMPLES_DIR = os.path.join(os.path.dirname(__file__), 'samples')

fake = faker.Faker()

fake_index = Index(fake.pystr(max_chars=50).lower())
add_analyzer(fake_index)


class IndexerTests(TestCase):

    def setUp(self):
        self.task = ReadDataJsonTask.objects.create()
        self.meta_task = IndexMetadataTask.objects.create()

    def test_index(self):
        index_ok = self._index(catalog_id='test_catalog',
                               catalog_url='single_distribution.json')
        search = Search(
            index=fake_index._name,
        ).filter('term', catalog_id='test_catalog')
        Should return list of family member names
        """
        family = getattr(self.general, "family", None)
        if family:
            for member in family:
                if hasattr(member, "family_name"):
                    yield member.family_name
        else:
            for member in parse_raw_family_string(
                getattr(self.general, "family_raw", "")
            ):
                if "family_name" in member:
                    yield member["family_name"]


declarations_idx = Index(OLD_DECLARATION_INDEX)
declarations_idx.settings(
    number_of_shards=NUMBER_OF_SHARDS,
    number_of_replicas=NUMBER_OF_REPLICAS
)

declarations_idx.analyzer(namesAutocompleteAnalyzer)
declarations_idx.analyzer(namesAutocompleteSearchAnalyzer)


@declarations_idx.doc_type
class Declaration(DocType, AbstractDeclaration):
    """Declaration document.
    Assumes there's a dynamic mapping with all fields not indexed by default."""

    persons = Text(analyzer="ukrainian", copy_to="all")
    countries = Text(analyzer="ukrainian", copy_to="all")
    companies = Text(analyzer="ukrainian", copy_to="all")
def setUp(self):
    m = get_user_model()
    self.user1 = m.objects.create_user('test1', '*****@*****.**',
                                       'super_password')
    self.user2 = m.objects.create_user('test2', '*****@*****.**',
                                       'super_password')

    self.user1_actions = [
        CustomAction.objects.create(
            owner=self.user1,
            name="Custom action %s" % i,
            description="Custom action %s description" % i
        ) for i in range(0, 10)
    ]
    self.user2_actions = [
        CustomAction.objects.create(
            owner=self.user2,
            name="Custom action %s" % i,
            description="Custom action %s description" % i
        ) for i in range(0, 25)
    ]
    self.user1_campaigns = [
        Campaign.objects.create(
            owner=self.user1,
            name="Campaign %s" % i,
            description="Campaign %s description" % i,
            start=timezone.now()
        ) for i in range(0, 10)
    ]
    self.user2_campaigns = [
        Campaign.objects.create(
            owner=self.user2,
            name="Campaign %s" % i,
            description="Campaign %s description" % i,
            start=timezone.now()
        ) for i in range(0, 25)
    ]
    self.other_users = [
        m.objects.create_user('other_test_%s' % k, '*****@*****.**' % k,
                              'super_password')
        for k in range(0, 10)
    ]

    self.es = Elasticsearch(settings.ES_CLUSTER)

    c_a_idx_data = [
        {
            'user_id': u.id,
            'page_id': 0,
            'subscription_uuid': self.user1_actions[0].uuid,
            'start': None,
            'end': None,
            'created': timezone.now(),
            'modified': timezone.now(),
            'payload': {
                'data': 'Super-awesome'
            },
            'status': 'active'
        } for u in self.other_users
    ]
    c_idx_data = [
        {
            'user_id': u.id,
            'page_id': 0,
            'subscription_uuid': self.user1_campaigns[0].uuid,
            'start': None,
            'end': None,
            'created': timezone.now(),
            'modified': timezone.now(),
            'payload': {
                'data': 'Super-awesome'
            },
            'status': 'active'
        } for u in self.other_users
    ]

    self.campaign_idx = Index(settings.ES_CAMPAIGN_SUBSCRIBERS_IDX,
                              using=self.es)
    self.custom_action_idx = Index(settings.ES_CUSTOM_ACTION_SUBSCRIBERS_IDX,
                                   using=self.es)
    self.campaign_idx.doc_type(CampaignSubscriptionObj)
    self.custom_action_idx.doc_type(CustomActionSubscriptionObj)

    for u in self.other_users:
        c = CampaignSubscriptionObj(
            user_id=u.id,
            page_id=0,
            subscription_uuid=self.user1_campaigns[0].uuid,
            start=None,
            end=None,
            created=timezone.now(),
            modified=timezone.now(),
            status='active'
        )
        c.save(using=self.es)
        c = CustomActionSubscriptionObj(
            user_id=u.id,
            page_id=0,
            subscription_uuid=self.user1_actions[0].uuid,
            start=None,
            end=None,
            created=timezone.now(),
            modified=timezone.now(),
            status='active'
        )
        c.save(using=self.es)

    sleep(1)
def refresh():
    index = Index(APIDoc.Index.name)
    index.refresh()
import json
import os

import requests
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Index, Mapping, Search
from urllib import parse
from os.path import splitext, basename

res = requests.get('http://localhost:9200')
print(res.content)

es = Elasticsearch([{'host': 'localhost', 'port': '9200'}])
client = Elasticsearch()

fimgin = open('./image_data.json', "r")
img_list = json.load(fimgin)

m = Mapping()
m.field('image_id', 'long')

idx1 = Index('idx1')
idx1.mapping(m)

for CDict in img_list:
    image_id = CDict["image_id"]
    url = CDict["url"]
    imagefile = os.path.basename(url)
    CDict['imagefile'] = imagefile
    print("Imagefile= ", imagefile)
    img_str = json.dumps(CDict)
    es.index(index='idx1', body=json.loads(img_str))


def get_imgfile(img_id):
    s = Search(index='idx1').query('match', image_id=img_id)
def test_search_is_limited_to_index_name():
    i = Index('my-index')
    s = i.search()

    assert s._index == ['my-index']
from elasticsearch_dsl import DocType, Index, Integer
from elasticsearch_dsl.connections import connections

from paginatify_elasticsearch_dsl import Pagination

conn = connections.create_connection(hosts=['localhost'])

index = Index('test-paginatify-elasticsearch-dsl')


@index.doc_type
class Item(DocType):
    id = Integer(index='not_analyzed')


def paginate(count, page=1):
    if conn.indices.exists(index._name):
        index.delete()
    index.create()
    try:
        Item.init()
        for i in range(1, count + 1):
            Item(id=i, meta={'id': i}).save(refresh=True)
        return Pagination(Item.search().sort('id'), page=page,
                          map_=lambda x: x.id, per_page=3, per_nav=3)
    finally:
        index.delete()
def get(self, request):
    """GET handler."""
    q = request.GET.get('query_string')
    offset = int(request.GET.get('offset', 0))
    limit = int(request.GET.get('limit', 10))
    if limit > 500:
        return HttpResponseBadRequest("limit must not exceed 500")
    type_filter = request.GET.get('type_filter', 'all')

    doc_type_map = {
        list(Index(settings.ES_INDEX_PREFIX.format('publications'))
             .get_alias().keys())[0]: 'publication',
        list(Index(settings.ES_INDEX_PREFIX.format('publications-legacy'))
             .get_alias().keys())[0]: 'publication',
        list(Index(settings.ES_INDEX_PREFIX.format('files'))
             .get_alias().keys())[0]: 'file',
        list(Index(settings.ES_INDEX_PREFIX.format('cms'))
             .get_alias().keys())[0]: 'modelresult'
    }

    public_files_query = (CommunityDataSearchManager(request).construct_query()
                          | PublishedDataSearchManager(request).construct_query())
    publications_query = PublicationsSearchManager(request).construct_query()
    cms_query = es_query = CMSSearchManager(request).construct_query()

    if type_filter == 'public_files':
        es_query = Search().query(public_files_query)
    elif type_filter == 'published':
        es_query = Search().query(publications_query)
    elif type_filter == 'cms':
        es_query = Search().query(cms_query).highlight(
            'body',
            fragment_size=100).highlight_options(pre_tags=["<b>"],
                                                 post_tags=["</b>"],
                                                 require_field_match=False)
    elif type_filter == 'all':
        es_query = Search().query(public_files_query
                                  | publications_query
                                  | cms_query).highlight(
                                      'body',
                                      fragment_size=100).highlight_options(
                                          pre_tags=["<b>"],
                                          post_tags=["</b>"],
                                          require_field_match=False)

    try:
        res = es_query.execute()
    except (TransportError, ConnectionTimeout) as err:
        if getattr(err, 'status_code', 500) == 404:
            raise
        res = es_query.execute()

    out = {}
    hits = []

    for r in res:
        d = r.to_dict()
        d["doc_type"] = doc_type_map[r.meta.index]
        if hasattr(r.meta, 'highlight'):
            highlight = r.meta.highlight.to_dict()
            d["highlight"] = highlight
        if r.meta.doc_type == 'publication' and hasattr(r, 'users'):
            users = r.users
            pi = r.project.value.pi
            pi_user = [u for u in users if u.username == pi][0]
            d["piLabel"] = "{}, {}".format(pi_user.last_name,
                                           pi_user.first_name)
        hits.append(d)

    out['total_hits'] = res.hits.total.value
    out['hits'] = hits
    out['all_total'] = Search().query(public_files_query
                                      | publications_query
                                      | cms_query).count()
    out['public_files_total'] = Search().query(public_files_query).count()
    out['published_total'] = Search().query(publications_query).count()
    out['cms_total'] = Search().query(cms_query).count()

    return JsonResponse(out, safe=False)
def update_record_statistics(start_date=None, end_date=None):
    """Update "_stats" field of affected records."""
    start_date = dateutil_parse(start_date) if start_date else None
    end_date = dateutil_parse(end_date) if end_date else None
    aggr_configs = {}

    if not start_date and not end_date:
        start_date = datetime.utcnow()
        end_date = datetime.utcnow()

        for aggr_name in current_stats.enabled_aggregations:
            aggr_cfg = current_stats.aggregations[aggr_name]
            aggr = aggr_cfg.aggregator_class(name=aggr_cfg.name,
                                             **aggr_cfg.aggregator_config)

            if not Index(aggr.aggregation_alias, using=aggr.client).exists():
                if not Index(aggr.event_index, using=aggr.client).exists():
                    start_date = min(start_date, datetime.utcnow())
                else:
                    start_date = min(start_date,
                                     aggr._get_oldest_event_timestamp())

            # Retrieve the last two bookmarks
            bookmarks = Search(
                using=aggr.client,
                index=aggr.aggregation_alias,
                doc_type=aggr.bookmark_doc_type
            )[0:2].sort({'date': {'order': 'desc'}}).execute()

            if len(bookmarks) >= 1:
                end_date = max(
                    end_date,
                    datetime.strptime(bookmarks[0].date, aggr.doc_id_suffix))
            if len(bookmarks) == 2:
                start_date = min(
                    start_date,
                    datetime.strptime(bookmarks[1].date, aggr.doc_id_suffix))
            aggr_configs[aggr.aggregation_alias] = aggr
    elif start_date and end_date:
        for aggr_name in current_stats.enabled_aggregations:
            aggr_cfg = current_stats.aggregations[aggr_name]
            aggr = aggr_cfg.aggregator_class(name=aggr_cfg.name,
                                             **aggr_cfg.aggregator_config)
            aggr_configs[aggr.aggregation_alias] = aggr
    else:
        return

    # Get conceptrecids for all the affected records between the two dates
    conceptrecids = set()
    for aggr_alias, aggr in aggr_configs.items():
        query = Search(
            using=aggr.client,
            index=aggr.aggregation_alias,
            doc_type=aggr.aggregation_doc_type,
        ).filter(
            'range',
            timestamp={
                'gte': start_date.replace(microsecond=0).isoformat() + '||/d',
                'lte': end_date.replace(microsecond=0).isoformat() + '||/d'
            }
        ).extra(_source=False)
        query.aggs.bucket('ids', 'terms', field='conceptrecid', size=0)
        conceptrecids |= {
            b.key for b in query.execute().aggregations.ids.buckets
        }

    indexer = RecordIndexer()
    for concpetrecid_val in conceptrecids:
        conceptrecid = PersistentIdentifier.get('recid', concpetrecid_val)
        pv = PIDVersioning(parent=conceptrecid)
        children_recids = pv.children.all()
        indexer.bulk_index([str(p.object_uuid) for p in children_recids])