def get_flow_batch(start_ts, end_ts):
    es_client = Elasticsearch(ES_CONF, **ES_OPTS)
    indices = ','.join([
        INDEX_NAME + '__0_*_' + it
        for it in timestamps_to_index_dates(start_ts, end_ts)
    ])
    query_body = {
        'query': {
            'range': {
                TIME_FIELD: {
                    'gte': start_ts,
                    'lt': end_ts
                }
            }
        },
        'stored_fields': STORED_FIELDS
    }
    ret = es_client.search(index=indices, body=query_body,
                           scroll=SCROLL_TIMEOUT, size=SCROLL_SIZE)
    while True:
        sid = ret['_scroll_id']
        size = len(ret['hits']['hits'])
        if size == 0:
            break
        yield ret['hits']['hits']
        ret = es_client.scroll(scroll_id=sid, scroll=SCROLL_TIMEOUT)
    es_client.clear_scroll(scroll_id=sid)
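
A minimal driver for the generator above might look like the sketch below. The module-level constants (ES_CONF, ES_OPTS, INDEX_NAME, TIME_FIELD, STORED_FIELDS, SCROLL_TIMEOUT, SCROLL_SIZE) and timestamps_to_index_dates() are assumed to be defined as in the original module; the timestamp values are illustrative only.

# Hypothetical usage: iterate over one window of flow records in
# scroll-sized batches. The timestamp values are placeholders and must
# use whatever unit TIME_FIELD is indexed with.
start_ts = 1600000000000
end_ts = start_ts + 3600000

total = 0
for batch in get_flow_batch(start_ts, end_ts):
    # each batch is the raw 'hits' list from one scroll page
    total += len(batch)
print('fetched', total, 'flow documents')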
def scroll_hits(es: Elasticsearch, query: dict, index: str,
                doc_type: str = '_doc', size: int = 100,
                scroll: str = '2m') -> Iterator[dict]:
    response = es.search(index=index, doc_type=doc_type, scroll=scroll,
                         size=size, body=query)
    sid = response['_scroll_id']
    total = response['hits']['total']
    # ES 7+ reports the total as {'value': ..., 'relation': ...}
    if isinstance(total, dict):
        total = total['value']
    print('total hits:', total,
          '\thits per scroll:', len(response['hits']['hits']))
    scroll_size = total
    # Start scrolling
    while scroll_size > 0:
        for hit in response['hits']['hits']:
            yield hit
        response = es.scroll(scroll_id=sid, scroll=scroll)
        # Update the scroll ID
        sid = response['_scroll_id']
        # Number of results returned by the last scroll
        scroll_size = len(response['hits']['hits'])
    # Remove the scroll context once the result set is exhausted
    es.clear_scroll(scroll_id=sid)
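
A usage sketch for scroll_hits, assuming a local cluster and an index name; the generator yields raw hit dicts, so callers usually unwrap _source:

# Hypothetical usage: stream every document from an assumed index.
from elasticsearch import Elasticsearch

es = Elasticsearch(['http://localhost:9200'])
query = {'query': {'match_all': {}}}

docs = []
for hit in scroll_hits(es, query, index='my-index', size=500):
    docs.append(hit['_source'])  # the indexed document body
print('streamed', len(docs), 'documents')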
def export(s, sindex, size):
    es = Elasticsearch(s)
    page = es.search(index=sindex, scroll='5m', size=size, body={
        "query": {
            "bool": {
                "must": [{
                    "range": {
                        "bizType": {
                            "gte": 30,
                            "lte": 40
                        }
                    }
                }],
                "must_not": [{
                    "exists": {
                        "field": "seriesColumnId"
                    }
                }]
            }
        },
        "sort": [{
            "bizId": {
                "order": "asc"
            }
        }]
    })
    try:
        sid = page['_scroll_id']
    except (KeyError, TypeError):
        print("unexpected response, missing _scroll_id:")
        print(page)
        sys.exit(2)
    scroll_size = page['hits']['total']
    allsize = len(page['hits']['hits'])
    bulkdata(es, page['hits']['hits'], size)
    while scroll_size > 0:
        printSth("Scrolling...")
        page = getpage(es, sid)
        # Update the scroll ID
        sid = page['_scroll_id']
        scroll_size = len(page['hits']['hits'])
        allsize += scroll_size
        printSth("scroll size: " + str(scroll_size) +
                 " all size: " + str(allsize) + " \r\n")
        bulkdata(es, page['hits']['hits'], size)
    print("\n" + sid)
    es.clear_scroll(scroll_id=sid)
def collect(self):
    hosts = split_and_strip(self.hosts, ",")
    es = Elasticsearch(hosts)
    query = json.loads(self.query) if self.query else {}
    query["sort"] = "_doc"
    if self.slice_max > 1:
        query["slice"] = {"id": self.slice_id, "max": self.slice_max}
    # initial search
    resp = es.search(index=self.indexes, body=query, scroll=self.scroll,
                     size=self.DEFAULT_SIZE)
    scroll_id = resp.get("_scroll_id")
    if scroll_id is None:
        return self._build_collection_task_result(CollectionTaskStatus.SUCCESS)
    try:
        first_run = True
        while True:
            # unless search_type is 'scan', the initial search already contains data
            if first_run:
                first_run = False
            else:
                resp = es.scroll(scroll_id, scroll=self.scroll)
            self._write_docs_to_aliyun_log(resp["hits"])
            # check if we have any errors
            if resp["_shards"]["successful"] < resp["_shards"]["total"]:
                msg = "Scroll request has only succeeded on %d shards out of %d." % \
                    (resp["_shards"]["successful"], resp["_shards"]["total"])
                logging.warning(msg)
                # return self._build_collection_task_result(CollectionTaskStatus.FAIL_NO_RETRY, msg)
            scroll_id = resp.get("_scroll_id")
            # end of scroll
            if scroll_id is None or not resp["hits"]["hits"]:
                break
        return self._build_collection_task_result(CollectionTaskStatus.SUCCESS)
    finally:
        if scroll_id:
            es.clear_scroll(body={"scroll_id": [scroll_id]}, ignore=(404, ))
def export(s, sindex, d, dindex, size):
    es = Elasticsearch([s])
    es_des = Elasticsearch([d])
    page = es.search(index=sindex, scroll='2m', size=size)
    try:
        sid = page['_scroll_id']
    except (KeyError, TypeError):
        print("unexpected response, missing _scroll_id:")
        print(page)
        sys.exit(2)
    scroll_size = page['hits']['total']
    allsize = len(page['hits']['hits'])
    bulkdata(es_des, page['hits']['hits'], dindex, size)
    while scroll_size > 0:
        printSth("Scrolling...")
        page = getpage(es, sid)
        # Update the scroll ID
        sid = page['_scroll_id']
        scroll_size = len(page['hits']['hits'])
        allsize += scroll_size
        printSth("scroll size: " + str(scroll_size) +
                 " all size: " + str(allsize) + " \r\n")
        bulkdata(es_des, page['hits']['hits'], dindex, size)
    print("\n" + sid)
    es.clear_scroll(scroll_id=sid)
class ElasticsearchReader(object):

    def __init__(self, index, hosts=None,
                 source='{"query":{"match_all":{}}}',
                 max_docs=0, scroll_size=10, scroll_time='5m',
                 request_timeout=600, report=1000):
        if hosts is None:
            hosts = ['http://localhost:9200']
        self.index = index
        self.source = json.loads(source) if isinstance(source, str) else source
        self.max_docs = max_docs
        self.scroll_time = scroll_time
        self.scroll_size = scroll_size
        self.request_timeout = request_timeout
        self.es = Elasticsearch(hosts=hosts)
        self.report = report
        self.scroll_id = None

    def __iter__(self):
        self.scroll_id = None
        counter = 0
        running = True
        try:
            while running:
                if self.scroll_id is None:
                    response = self.es.search(
                        index=self.index, body=self.source,
                        params={"request_timeout": self.request_timeout,
                                "scroll": self.scroll_time,
                                "size": self.scroll_size})
                    logger.info(u'{0} docs exist.'.format(response['hits']['total']))
                else:
                    response = self.es.scroll(
                        scroll_id=self.scroll_id,
                        params={"request_timeout": self.request_timeout,
                                "scroll": self.scroll_time})
                if len(response['hits']['hits']) == 0:
                    self.scroll_id = None
                    running = False
                    break
                self.scroll_id = response['_scroll_id']
                for hit in response['hits']['hits']:
                    if '_source' in hit:
                        counter += 1
                        if self.max_docs > 0 and counter >= self.max_docs:
                            logger.info(u'%d docs are loaded, but it exceeded %d docs.',
                                        counter, self.max_docs)
                            running = False
                            break
                        if counter % self.report == 0:
                            logger.info(u'%d docs are loaded.', counter)
                        yield hit['_source']
        except NotFoundError:
            logger.exception(u'NotFoundError(Loaded %d docs)', counter)
        except Exception:
            logger.exception(u"Failed to load documents from Elasticsearch(Loaded %d docs).",
                             counter)
        logger.info('Loaded %d documents.', counter)

    def __enter__(self):
        return self

    def __exit__(self, exception_type, exception_value, traceback):
        self.close()

    def close(self):
        if self.scroll_id is not None:
            self.es.clear_scroll(scroll_id=self.scroll_id,
                                 params={"request_timeout": self.request_timeout})
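
Because the reader only releases its scroll context in close(), it is most naturally used as a context manager. A minimal sketch, assuming a local cluster and an existing index:

# Hypothetical usage of ElasticsearchReader; the with-block guarantees
# clear_scroll is called even if iteration stops early.
docs = []
with ElasticsearchReader('my-index',
                         hosts=['http://localhost:9200'],
                         scroll_size=500,
                         max_docs=10000) as reader:
    for source in reader:  # each item is one hit's _source dict
        docs.append(source)
print('loaded', len(docs), 'documents')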
class ElasticSearchSeqSource(base.DataSource): """ Data source which executes arbitrary queries on ElasticSearch This is the tabular reader: will return dataframes. Nested return items will become dict-like objects in the output. Parameters ---------- query: str Query to execute. Can either be in Lucene single-line format, or a JSON structured query (presented as text) qargs: dict Further parameters to pass to the query, such as set of indexes to consider, filtering, ordering. See http://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search es_kwargs: dict Settings for the ES connection, e.g., a simple local connection may be ``{'host': 'localhost', 'port': 9200}``. Other keywords to the Plugin that end up here and are material: scroll: str how long the query is live for, default ``'100m'`` size: int the paging size when downloading, default 1000. metadata: dict Extra information for this source. """ name = 'elasticsearch_seq' container = 'python' version = __version__ partition_access = False def __init__(self, query, qargs={}, metadata={}, **es_kwargs): from elasticsearch import Elasticsearch self._query = query self._qargs = qargs self._scroll = es_kwargs.pop('scroll', '100m') self._size = es_kwargs.pop('size', 1000) # default page size self._es_kwargs = es_kwargs self._dataframe = None self.es = Elasticsearch([es_kwargs]) # maybe should be (more) global? super(ElasticSearchSeqSource, self).__init__(metadata=metadata) def _run_query(self, size=None, end=None): if size is None: size = self._size if end is not None: size = min(end, size) try: q = json.loads(self._query) if 'query' not in q: q = {'query': q} s = self.es.search(body=q, size=size, scroll=self._scroll, **self._qargs) except (JSONDecodeError, TypeError): s = self.es.search(q=self._query, size=size, scroll=self._scroll, **self._qargs) sid = s['_scroll_id'] scroll_size = s['hits']['total'] while scroll_size > len(s['hits']['hits']): page = self.es.scroll(scroll_id=sid, scroll=self._scroll) sid = page['_scroll_id'] s['hits']['hits'].extend(page['hits']['hits']) if end is not None and len(s['hits']['hits']) > end: break self.es.clear_scroll(scroll_id=sid) return s def _get_schema(self, retry=2): """Get schema from first 10 hits or cached dataframe""" return base.Schema(datashape=None, dtype=None, shape=None, npartitions=1, extra_metadata={}) def _get_partition(self, _): """Downloads all data ES has a hard maximum of 10000 items to fetch. Otherwise need to implement paging, known to ES as "scroll" https://stackoverflow.com/questions/41655913/elk-how-do-i-retrieve-more-than-10000-results-events-in-elastic-search """ results = self._run_query() return [r['_source'] for r in results['hits']['hits']]
class ElasticsearchClient(): def __init__(self, config): self.config = config self.client = Elasticsearch(config.elasticsearch_url) self.scroll_id = None def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): self._try_clear_scroll() def next_page_of_records(self): result = self._retrieve_page_of_data() self.scroll_id = self._get_scroll_id(result) site_map_entries = self._get_site_map_entries(result) LOGGER.info('Retrieved {} records from elasticsearch'.format( len(site_map_entries))) return site_map_entries def _retrieve_page_of_data(self): try: if self.scroll_id: result = self._scroll(self.scroll_id) else: result = self._search() return result except Exception as e: raise Exception( 'Failed to retrieve a page of data from elasticsearch', e) def _get_site_map_entries(self, result): try: address_page = self._get_addresses(result) site_map_entries = self._convert_to_site_map_entries(address_page) return site_map_entries except Exception as e: raise Exception( 'Failed to convert elasticsearch result to a list of site map entries', e) def _scroll(self, scroll_id): return self.client.scroll(scroll_id=scroll_id, params={ 'scroll': self.config.scroll_expiry, 'search_type': 'scan', 'size': self.config.page_size, 'timeout': self.config.request_timeout, }) def _search(self): return self.client.search(self.config.es_index, self.config.es_doc_type, body=None, params={ 'size': self.config.page_size, 'scroll': self.config.scroll_expiry, 'timeout': self.config.request_timeout, }) def _try_clear_scroll(self): try: self.client.clear_scroll(self.scroll_id) LOGGER.info('Cleared elasticsearch scroll') except Exception as e: LOGGER.warn('Failed to clear scroll in elasticsearch', e) def _get_scroll_id(self, search_result): try: return search_result['_scroll_id'] except Exception as e: raise Exception( 'Failed to extract scroll ID from the elasticsearch result', e) def _get_addresses(self, search_result): return search_result['hits']['hits'] def _convert_to_site_map_entries(self, address_page): return [self._get_site_map_entry(address) for address in address_page] def _get_page_url(self, address): data = address['_source'] postcode = data['postcode'] address_key = data['addressKey'] address_url_segment = address_key[:len(address_key) - len(postcode) - 1] return '{}/{}/{}'.format(self.config.base_page_url, postcode.replace(' ', '_'), address_url_segment) def _get_site_map_entry(self, address): entry_datetime = datetime.strptime(address['_source']['entryDatetime'], '%Y-%m-%dT%H:%M:%S+00') return SiteMapUrl( location=self._get_page_url(address), last_modified=entry_datetime.strftime('%Y-%m-%dT%H:%M+00:00'), change_frequency=self.config.url_change_frequency, )
class Elastic: def __init__(self, dbname=None): # Fetch config config = PonymailConfig() self.dbname = dbname or config.get("elasticsearch", "dbname") ssl = config.get("elasticsearch", "ssl", fallback="false").lower() == 'true' uri = config.get("elasticsearch", "uri", fallback="") auth = None if config.has_option('elasticsearch', 'user'): auth = (config.get('elasticsearch', 'user'), config.get('elasticsearch', 'password')) # elasticsearch logs lots of warnings on retries/connection failure logging.getLogger("elasticsearch").setLevel(logging.ERROR) # # add debug # trace = logging.getLogger("elasticsearch.trace") # trace.setLevel(logging.DEBUG) # # create console handler # consoleHandler = logging.StreamHandler() # trace.addHandler(consoleHandler) self.es = Elasticsearch( [{ 'host': config.get("elasticsearch", "hostname"), 'port': int(config.get("elasticsearch", "port")), 'use_ssl': ssl, 'url_prefix': uri, 'auth': auth, 'ca_certs': certifi.where() }], max_retries=5, retry_on_timeout=True) self.dbVersion = None # Mimic ES hierarchy: es.indices.xyz() self.indices = _indices_wrap(self) def libraryVersion(self): return ES_VERSION def libraryMajor(self): return ES_VERSION[0] def engineVersion(self): if not self.dbVersion: try: self.dbVersion = self.info()['version']['number'] except ES_ConnectionError: # default if cannot connect; allows retry return '0.0.0' return self.dbVersion def engineMajor(self): return int(self.engineVersion().split('.')[0]) def getdbname(self): return self.dbname def search(self, doc_type='mbox', **kwargs): return self.es.search(index=self.dbname, doc_type=doc_type, **kwargs) def index(self, **kwargs): return self.es.index(index=self.dbname, **kwargs) def update(self, **kwargs): return self.es.update(index=self.dbname, **kwargs) def scan(self, doc_type='mbox', scroll='3m', size=100, **kwargs): return self.es.search(index=self.dbname, doc_type=doc_type, search_type='scan', size=size, scroll=scroll, **kwargs) def get(self, **kwargs): return self.es.get(index=self.dbname, **kwargs) def scroll(self, **kwargs): return self.es.scroll(**kwargs) def info(self, **kwargs): return self.es.info(**kwargs) def bulk(self, actions, **kwargs): return helpers.bulk(self.es, actions, **kwargs) def clear_scroll(self, *args, **kwargs): """ Call this to release the scroll id and its resources It looks like the Python library already releases the SID if the caller scrolls to the end of the results, so only need to call this when terminating scrolling early. """ return self.es.clear_scroll(*args, **kwargs)
class SuricateFDW(ForeignDataWrapper): """ Elastic Search Foreign Data Wrapper """ @property def rowid_column(self): """ Returns a column name which will act as a rowid column for delete/update operations. This can be either an existing column name, or a made-up one. This column name should be subsequently present in every returned resultset. """ return self._rowid_column def __init__(self, options, columns): super(SuricateFDW, self).__init__(options, columns) self.index = options.pop("index", "") self.query_column = options.pop("query_column", None) self.response_column = options.pop("response_column", None) self.pg_id_column = options.pop("pg_id_column", None) self.size = int(options.pop("size", 10)) self.explain = (options.pop("explain", "false").lower() == "true") self._rowid_column = options.pop("rowid_column", "id") username = options.pop("username", None) password = options.pop("password", None) # self.score_column = options.pop("score_column", None) # self.default_sort = options.pop("default_sort", "") # self.sort_column = options.pop("sort_column", None) # self.scroll_size = int(options.pop("scroll_size", "1000")) # self.scroll_duration = options.pop("scroll_duration", "10m") self.path = "/{index}".format(index=self.index) if (username is None) != (password is None): raise ValueError("Must provide both username and password") if username is not None: auth = (username, password) else: auth = None host = options.pop("host", "localhost") port = int(options.pop("port", "9200")) timeout = int(options.pop("timeout", "10")) self.client = Elasticsearch([{ "host": host, "port": port }], http_auth=auth, timeout=timeout, **options) self.scroll_id = None def get_rel_size(self, quals, columns): """ Helps the planner by returning costs. Returns a tuple of the form (number of rows, average row width) """ try: query = self._get_query(quals) q_dict = json.loads(query.encode('utf-8')) response = self.client.count(body=q_dict, index=self.index) return (response["count"], len(columns) * 100) except Exception as exception: log2pg( "COUNT for {path} failed: {exception}".format( path=self.path, exception=exception), logging.ERROR, ) return (0, 0) def execute(self, quals, columns): """ Execute the query """ try: query = self._get_query(quals) q_dict = json.loads(query.encode('utf-8')) pg_id = self._get_pg_id(quals) response = self.client.search(body=q_dict, index=self.index, size=self.size, explain=self.explain) while True: for result in response["hits"]["hits"]: yield self._format_out(result, pg_id=pg_id, query=query) return except Exception as exception: log2pg( "SEARCH for {path} failed: {exception}".format( path=self.path, exception=exception), logging.ERROR, ) return def _get_pg_id(self, quals): if not self.query_column: return None return next( (qualifier.value for qualifier in quals if qualifier.field_name == self.pg_id_column), None, ) def end_scan(self): """ Hook called at the end of a foreign scan. 
""" if self.scroll_id: self.client.clear_scroll(scroll_id=self.scroll_id) self.scroll_id = None def _format_out(self, response, pg_id, query): result_dict = { self.response_column: json.dumps(response), self.pg_id_column: pg_id, self.query_column: query } return result_dict def _get_query(self, quals): return next( (qualifier.value for qualifier in quals if qualifier.field_name == self.query_column), None, ) def _convert_response_row(self, row_data, columns, query, sort): return_dict = { column: self._convert_response_column(column, row_data) for column in columns if column in row_data["_source"] or column == self.rowid_column or column == self.score_column } if query: return_dict[self.query_column] = query return_dict[self.sort_column] = sort return return_dict def _read_by_id(self, row_id): try: results = self.client.search( body={"query": { "ids": { "values": [row_id] } }}, index=self.index)["hits"]["hits"] if results: return self._convert_response_row(results[0], self.columns, None, None) log2pg( "SEARCH for {path} row_id {row_id} returned nothing".format( path=self.path, row_id=row_id), logging.WARNING, ) return {self.rowid_column: row_id} except Exception as exception: log2pg( "SEARCH for {path} row_id {row_id} failed: {exception}".format( path=self.path, row_id=row_id, exception=exception), logging.ERROR, ) return {}
class ESManager: def __init__(self) -> None: config = create_config() es_aws = config.get('DB-Section', 'es-aws') elk_credentials = config.get('Secrets-Section', 'elk-secret').strip('"').split(' ') self.elk_repo_name = config.get('General-Section', 'elk-repo-name') es_host_config = { 'host': config.get('DB-Section', 'es-host', fallback='localhost'), 'port': config.get('DB-Section', 'es-port', fallback='9200') } if es_aws == 'True': self.es = Elasticsearch(hosts=[es_host_config], http_auth=(elk_credentials[0], elk_credentials[1]), scheme='https') else: self.es = Elasticsearch(hosts=[es_host_config]) def create_index(self, index: ESIndices): """ Create Elasticsearch index with given name. Argument: :param index_name (str) name of the index to be created """ index_name = index.value index_json_name = f'initialize_{index_name}_index.json' index_json_path = os.path.join(os.environ['BACKEND'], 'elasticsearchIndexing/json/', index_json_name) with open(index_json_path, encoding='utf-8') as reader: index_config = json.load(reader) create_result = None try: create_result = self.es.indices.create(index=index_name, body=index_config, ignore=400) except AuthorizationException: # https://discuss.elastic.co/t/forbidden-12-index-read-only-allow-delete-api/110282/4 read_only_query = {'index': {'blocks': {'read_only_allow_delete': 'false'}}} self.es.indices.put_settings(index=index_name, body=read_only_query) create_result = self.es.indices.create(index=index_name, body=index_config, ignore=400) return create_result def index_exists(self, index: ESIndices) -> bool: """ Check if the index already exists. """ return self.es.indices.exists(index=index.value) def autocomplete(self, index: ESIndices, keyword: KeywordsNames, searched_term: str) -> list: """ Get list of the modules which will be returned as autocomplete after entering the "search_term" by the user. 
Arguments: :param keyword (KeywordsNames) :param searched_term (str) """ autocomplete_json_path = os.path.join(os.environ['BACKEND'], 'elasticsearchIndexing/json/completion.json') with open(autocomplete_json_path, encoding='utf-8') as reader: autocomplete_query = json.load(reader) autocomplete_query['query']['bool']['must'][0]['term'] = {keyword.value: searched_term.lower()} autocomplete_query['aggs']['groupby_module']['terms']['field'] = f'{keyword.value}.keyword' rows = self.es.search(index=index.value, body=autocomplete_query) hits = rows['aggregations']['groupby_module']['buckets'] result = [hit['key'] for hit in hits] return result def delete_from_index(self, index: ESIndices, module: dict): delete_module_query = self._get_name_revision_query(index, module) return self.es.delete_by_query(index=index.value, body=delete_module_query, conflicts='proceed') def delete_from_indices(self, module: dict): for index in ESIndices: self.delete_from_index(index, module) def index_module(self, index: ESIndices, document: dict): # TODO: Remove this after reindexing and unification of both indices if index == ESIndices.MODULES: path = document['path'] del document['path'] document['dir'] = path name = document['name'] del document['name'] document['module'] = name return self.es.index(index=index.value, body=document, request_timeout=40) def match_all(self, index: ESIndices): def _store_hits(hits: list, all_results: dict): for hit in hits: name = '' path = '' if index == ESIndices.AUTOCOMPLETE: name = hit['_source']['name'] path = hit['_source']['path'] if index == ESIndices.MODULES: name = hit['_source']['module'] path = hit['_source']['dir'] mod = { 'name': name, 'revision': hit['_source']['revision'], 'organization': hit['_source']['organization'], 'path': path } key = '{}@{}/{}'.format(mod.get('name'), mod.get('revision'), mod.get('organization')) if key not in all_results: all_results[key] = mod else: print('{} already in all results'.format(key)) all_results = {} match_all_query = { 'query': { 'match_all': {} } } total_index_docs = 0 es_result = self.es.search(index=index.value, body=match_all_query, scroll=u'10s', size=250) scroll_id = es_result.get('_scroll_id') hits = es_result['hits']['hits'] _store_hits(hits, all_results) total_index_docs += len(hits) while es_result['hits']['hits']: es_result = self.es.scroll( scroll_id=scroll_id, scroll=u'10s' ) scroll_id = es_result.get('_scroll_id') hits = es_result['hits']['hits'] _store_hits(hits, all_results) total_index_docs += len(hits) self.es.clear_scroll(scroll_id=scroll_id, ignore=(404, )) return all_results def get_module_by_name_revision(self, index: ESIndices, module: dict) -> bool: get_module_query = self._get_name_revision_query(index, module) es_result = self.es.search(index=index.value, body=get_module_query) return es_result['hits']['hits'] def get_latest_module_revision(self, index: ESIndices, name: str): query_path = os.path.join(os.environ['BACKEND'], 'elasticsearchIndexing/json/latest_revision_query.json') with open(query_path, encoding='utf-8') as reader: latest_revision_query = json.load(reader) # TODO: Remove this after reindexing and unification of both indices if index == ESIndices.AUTOCOMPLETE: del latest_revision_query['query']['bool']['must'][0]['match_phrase']['module.keyword'] latest_revision_query['query']['bool']['must'][0]['match_phrase'] = { 'name.keyword': { 'query': name } } else: latest_revision_query['query']['bool']['must'][0]['match_phrase']['module.keyword']['query'] = name es_result = 
self.es.search(index=index.value, body=latest_revision_query) return es_result['hits']['hits'] def document_exists(self, index: ESIndices, module: dict) -> bool: get_module_query = self._get_name_revision_query(index, module) es_count = self.es.count(index=index.value, body=get_module_query) return es_count['count'] > 0 def create_snapshot_repository(self, compress): body = { 'type': 'fs', 'settings': { 'location': self.elk_repo_name, 'compress': compress } } es_result = self.es.snapshot.create_repository(repository=self.elk_repo_name, body=body) return es_result def create_snapshot(self, snapshot_name: str): index_body = { 'indices': '_all' } return self.es.snapshot.create(repository=self.elk_repo_name, snapshot=snapshot_name, body=index_body) def get_sorted_snapshots(self) -> list: snapshots = self.es.snapshot.get(repository=self.elk_repo_name, snapshot='_all')['snapshots'] return sorted(snapshots, key=itemgetter('start_time_in_millis')) def restore_snapshot(self, snapshot_name: str): index_body = { 'indices': '_all' } return self.es.snapshot.restore(repository=self.elk_repo_name, snapshot=snapshot_name, body=index_body) def delete_snapshot(self, snapshot_name: str): return self.es.snapshot.delete(repository=self.elk_repo_name, snapshot=snapshot_name) def _get_name_revision_query(self, index: ESIndices, module: dict): module_search_path = os.path.join(os.environ['BACKEND'], 'elasticsearchIndexing/json/module_search.json') with open(module_search_path, encoding='utf-8') as reader: name_revision_query = json.load(reader) # TODO: Remove this after reindexing and unification of both indices if index == ESIndices.AUTOCOMPLETE: del name_revision_query['query']['bool']['must'][0]['match_phrase']['module.keyword'] name_revision_query['query']['bool']['must'][0]['match_phrase'] = { 'name.keyword': { 'query': module['name'] } } else: name_revision_query['query']['bool']['must'][0]['match_phrase']['module.keyword']['query'] = module['name'] name_revision_query['query']['bool']['must'][1]['match_phrase']['revision']['query'] = module['revision'] return name_revision_query
results_list = []
# Loop until every page has been consumed; the surrounding scaffolding
# (first_query, queries_complete, count, start_time, es, index_name,
# es_query, ES_SCROLL_TIMEOUT, ES_DOC_COUNT) is defined earlier in the script.
while not queries_complete:
    if first_query:
        es_response = es.search(index=index_name, body={'query': es_query},
                                scroll=ES_SCROLL_TIMEOUT, size=ES_DOC_COUNT)
        es_scroll_id = es_response['_scroll_id']
        total_docs = es_response['hits']['total']
        print('Total docs: %s' % total_docs)
        first_query = False
    else:
        es_response = es.scroll(scroll_id=es_scroll_id, scroll=ES_SCROLL_TIMEOUT)
        es_scroll_id = es_response['_scroll_id']

    hits = es_response['hits']['hits']
    if hits:
        # collect this page of hits and report progress
        results_list.extend(hits)
        count += len(hits)
        print('Retrieved %s of %s docs' % (count, total_docs))
    else:
        queries_complete = True
        print('Creating report....')

print('done.')
#gc.collect()
print(es.clear_scroll(scroll_id=es_scroll_id))
runtime = datetime.now() - start_time
print('Total runtime %s' % runtime)
class Elastic: def __init__(self, dbname=None, **kwargs): # Fetch config config = configparser.RawConfigParser() config.read('ponymail.cfg') self.dbname = dbname or config.get("elasticsearch", "dbname") ssl = config.get("elasticsearch", "ssl", fallback="false").lower() == 'true' uri = config.get("elasticsearch", "uri", fallback="") auth = None if config.has_option('elasticsearch', 'user'): auth = (config.get('elasticsearch', 'user'), config.get('elasticsearch', 'password')) # elasticsearch logs lots of warnings on retries/connection failure logging.getLogger("elasticsearch").setLevel(logging.ERROR) # # add debug # trace = logging.getLogger("elasticsearch.trace") # trace.setLevel(logging.DEBUG) # # create console handler # consoleHandler = logging.StreamHandler() # trace.addHandler(consoleHandler) self.es = Elasticsearch( [{ 'host': config.get("elasticsearch", "hostname"), 'port': int(config.get("elasticsearch", "port")), 'use_ssl': ssl, 'url_prefix': uri, 'auth': auth }], max_retries=5, retry_on_timeout=True) def search(self, doc_type='mbox', **kwargs): return self.es.search(index=self.dbname, doc_type=doc_type, **kwargs) def index(self, **kwargs): return self.es.index(index=self.dbname, **kwargs) def update(self, **kwargs): return self.es.update(index=self.dbname, **kwargs) def scan(self, doc_type='mbox', scroll='3m', size=100, **kwargs): return self.es.search(index=self.dbname, doc_type=doc_type, search_type='scan', size=size, scroll=scroll, **kwargs) def scroll(self, **kwargs): return self.es.scroll(**kwargs) def bulk(self, actions, **kwargs): return helpers.bulk(self.es, actions, **kwargs) """ Call this to release the scroll id and its resources It looks like the Python library already releases the SID if the caller scrolls to the end of the results, so only need to call this when terminating scrolling early. """ def clear_scroll(self, *args, **kwargs): return self.es.clear_scroll(*args, **kwargs)
class ESSearch: def __init__(self, connection, es_index='covid_tweets'): aws_auth = AWS4Auth(connection['ACCESS_KEY'], connection['SECRET_KEY'], 'us-east-2', 'es') self.es = ES(hosts=[{ 'host': connection['AWS_HOST'], 'port': 443 }], http_auth=aws_auth, use_ssl=True, verify_certs=True, connection_class=RequestsHttpConnection, timeout=60) self.es_index = es_index def format_query(self, keywords, startDateString=None, endDateString=None, tweettype=None, user=None): if len(keywords) == 0: keywords = None queries = [] time_range = {} if startDateString: startDate = datetime.strptime(startDateString, '%m/%d/%Y') time_range['gte'] = startDate.strftime('00-%d-%m-%Y') if endDateString: endDate = datetime.strptime(endDateString, '%m/%d/%Y') time_range['lte'] = endDate.strftime('23-%d-%m-%Y') if time_range: queries.append({'range': {'date': time_range}}) if keywords: queries.append({ "query_string": { "query": keywords, "fields": ["text"], "default_operator": "or" } }) if user: queries.append({"match": {"user_name": user}}) if tweettype: queries.append({"terms": { "tweet_type": tweettype, }}) print(queries) return queries def get_doc(self, tweet_id): retval = self.es.get(index=self.es_index, id=tweet_id, doc_type='document') return retval def get_user_tweet(self, keywords, startDateString=None, endDateString=None, tweettype=None, user=None): queries = self.format_query(keywords, startDateString=startDateString, endDateString=endDateString, tweettype=tweettype, user=user) total_qry = {"size": 250, 'query': {'bool': {'must': queries}}} retval = self.es.search(index=self.es_index, doc_type='document', body=total_qry) return retval def count(self, keywords, startDateString=None, endDateString=None, tweettype=None, user=False): queries = self.format_query(keywords, startDateString, endDateString, tweettype) total_qry = {} retval = {} if user: total_qry = { "size": 0, 'query': { 'bool': { 'must': queries } }, "aggs": { "users_count": { "cardinality": { "field": "user_name" } } } } retval = self.es.search(index=self.es_index, doc_type='document', body=total_qry) else: total_qry = {'query': {'bool': {'must': queries}}} retval = self.es.count(index=self.es_index, body=total_qry) return retval def agg_qry(self, keywords, startDateString=None, endDateString=None, tweettype=None): return retval def query(self, keywords, startDateString=None, endDateString=None, size=None, tweettype=None): queries = self.format_query(keywords, startDateString, endDateString, tweettype) if size: retval = self.es.search(index=self.es_index, scroll='1m', doc_type='document', body={ 'size': size, 'query': { 'bool': { 'must': queries } } }) for tw in retval['hits']['hits']: yield tw else: retval = self.es.search(index=self.es_index, scroll='1m', doc_type='document', body={ 'size': 1000, 'query': { 'bool': { 'must': queries } } }) total = retval["hits"]["total"] print(total) sid = retval['_scroll_id'] scroll_size = len(retval['hits']['hits']) while scroll_size > 0: print("Scrolling...") for tw in retval['hits']['hits']: yield tw if not size: retval = self.es.scroll(scroll_id=sid, scroll='1m') sid = retval['_scroll_id'] scroll_size = len(retval['hits']['hits']) self.es.clear_scroll(scroll_id=sid) def sizequery(self, keywords, startDateString=None, endDateString=None, size=None, tweettype=None, random=True): queries = self.format_query(keywords, startDateString, endDateString, tweettype) queries = {'bool': {'must': queries}} print(queries) if random: queries = { "function_score": { "query": queries, "random_score": {}, } } if size < 
1000: retval = self.es.search(index=self.es_index, scroll='1m', doc_type='document', body={ 'size': size, 'query': queries }) for tw in retval['hits']['hits']: yield tw else: retval = self.es.search(index=self.es_index, scroll='1m', doc_type='document', body={ 'size': 1000, 'query': queries }) total = retval["hits"]["total"] so_far = len(retval['hits']['hits']) if size == None: size = total sid = retval['_scroll_id'] scroll_size = len(retval['hits']['hits']) while scroll_size > 0 and so_far <= size: print(so_far) print(size) print(total) print("Scrolling...") for tw in retval['hits']['hits']: yield tw retval = self.es.scroll(scroll_id=sid, scroll='1m') sid = retval['_scroll_id'] scroll_size = len(retval['hits']['hits']) so_far = so_far + scroll_size self.es.clear_scroll(scroll_id=sid) """count = 0
def query_index_by_time_range(
    index: str,
    es: Elasticsearch,
    min_time: Optional[datetime] = None,
    max_time: Optional[datetime] = None,
    step: int = 10000,
) -> Iterator[List[Dict]]:
    body = {
        "query": {"bool": {"must": [], "must_not": [], "should": []}},
        "sort": ['_doc'],
        "aggs": {}
    }
    time_range_body = {
        "range": {
            "startTimeMillis": {
                "format": "epoch_millis"
            }
        }
    }
    if min_time is not None:
        time_range_body["range"]["startTimeMillis"]["gte"] = int(
            min_time.timestamp() * 1000)
    if max_time is not None:
        time_range_body["range"]["startTimeMillis"]["lte"] = int(
            max_time.timestamp() * 1000)
    body["query"]["bool"]["must"].append(time_range_body)
    logger.debug(f"query_index_by_time_range body: {body}")

    rsp = None
    retry_counts = 0
    while rsp is None:
        try:
            rsp = es.search(index=index, body=dict(**body, size=step),
                            scroll='5m', timeout='5m')
        except Exception as e:
            logger.warning(f"Exception in search: {e}. Retry")
            rsp = None
            retry_counts += 1
            time.sleep(random.randint(1, 10))
            if retry_counts > 100:
                raise RuntimeError("get search error for too many times")

    total = rsp['hits']['total']["value"]
    scroll_id = rsp['_scroll_id']
    scroll_size = total
    rets = rsp["hits"]["hits"]
    yield rets
    total -= len(rets)
    del rets, rsp

    with tqdm(total=total, desc=f"{index=} {min_time}-{max_time}") as pbar:
        while scroll_size > 0:
            try:
                _rsp = es.scroll(scroll_id=scroll_id, scroll='5m')
            except Exception as e:
                logger.warning(f"Exception in scroll: {e}. Retry")
                continue
            scroll_id = _rsp['_scroll_id']
            scroll_size = len(_rsp['hits']['hits'])
            pbar.update(scroll_size)
            total -= scroll_size
            _rets = _rsp["hits"]["hits"]
            yield _rets
    es.clear_scroll(scroll_id=scroll_id)
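
A sketch of driving the generator above for a bounded window, assuming an index whose documents carry a startTimeMillis field; each yielded item is one page of raw hits, so the caller flattens them:

# Hypothetical usage: pull one day of documents page by page.
from datetime import datetime, timedelta
from elasticsearch import Elasticsearch

es = Elasticsearch(['http://localhost:9200'])
day_end = datetime(2023, 1, 2)
day_start = day_end - timedelta(days=1)

docs = []
for page in query_index_by_time_range('traces-index', es,
                                      min_time=day_start,
                                      max_time=day_end,
                                      step=5000):
    docs.extend(hit['_source'] for hit in page)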
class ElasticsearchFDW(ForeignDataWrapper): """ Elastic Search Foreign Data Wrapper """ @property def rowid_column(self): """Returns a column name which will act as a rowid column for delete/update operations. This can be either an existing column name, or a made-up one. This column name should be subsequently present in every returned resultset.""" return self._rowid_column def __init__(self, options, columns): super(ElasticsearchFDW, self).__init__(options, columns) self.index = options.pop("index", "") self.doc_type = options.pop("type", "") self.query_column = options.pop("query_column", None) self.score_column = options.pop("score_column", None) self.scroll_size = int(options.pop("scroll_size", "1000")) self.scroll_duration = options.pop("scroll_duration", "10m") self._rowid_column = options.pop("rowid_column", "id") username = options.pop("username", None) password = options.pop("password", None) if ELASTICSEARCH_VERSION[0] >= 7: self.path = "/{index}".format(index=self.index) self.arguments = {"index": self.index} else: self.path = "/{index}/{doc_type}".format(index=self.index, doc_type=self.doc_type) self.arguments = {"index": self.index, "doc_type": self.doc_type} if (username is None) != (password is None): raise ValueError("Must provide both username and password") if username is not None: auth = (username, password) else: auth = None host = options.pop("host", "localhost") port = int(options.pop("port", "9200")) timeout = int(options.pop("timeout", "10")) self.client = Elasticsearch([{ "host": host, "port": port }], http_auth=auth, timeout=timeout, **options) self.columns = columns self.json_columns = { column.column_name for column in columns.values() if column.base_type_name.upper() in {"JSON", "JSONB"} } self.scroll_id = None def get_rel_size(self, quals, columns): """Helps the planner by returning costs. 
Returns a tuple of the form (number of rows, average row width)""" try: query, _ = self._get_query(quals) if query: response = self.client.count(body=query, **self.arguments) else: response = self.client.count(**self.arguments) return (response["count"], len(columns) * 100) except Exception as exception: log2pg( "COUNT for {path} failed: {exception}".format( path=self.path, exception=exception), logging.ERROR, ) return (0, 0) def can_pushdown_upperrel(self): return { "groupby_supported": True, "agg_functions": list(_PG_TO_ES_AGG_FUNCS), "operators_supported": _OPERATORS_SUPPORTED, } def explain( self, quals, columns, sortkeys=None, aggs=None, group_clauses=None, verbose=False, ): query, _ = self._get_query(quals, aggs=aggs, group_clauses=group_clauses) return [ "Elasticsearch query to %s" % self.client, "Query: %s" % json.dumps(query, indent=4), ] def execute(self, quals, columns, aggs=None, group_clauses=None): """ Execute the query """ try: query, query_string = self._get_query(quals, aggs=aggs, group_clauses=group_clauses) is_aggregation = aggs or group_clauses if query: response = self.client.search( size=self.scroll_size if not is_aggregation else 0, scroll=self.scroll_duration if not is_aggregation else None, body=query, **self.arguments) else: response = self.client.search(size=self.scroll_size, scroll=self.scroll_duration, **self.arguments) if not response["hits"]["hits"] and not is_aggregation: return if is_aggregation: yield from self._handle_aggregation_response( query, response, aggs, group_clauses) return while True: self.scroll_id = response["_scroll_id"] for result in response["hits"]["hits"]: yield self._convert_response_row(result, columns, query_string) if len(response["hits"]["hits"]) < self.scroll_size: return response = self.client.scroll(scroll_id=self.scroll_id, scroll=self.scroll_duration) except Exception as exception: log2pg( "SEARCH for {path} failed: {exception}".format( path=self.path, exception=exception), logging.ERROR, ) return def end_scan(self): if self.scroll_id: self.client.clear_scroll(scroll_id=self.scroll_id) self.scroll_id = None def insert(self, new_values): """ Insert new documents into Elastic Search """ if self.rowid_column not in new_values: log2pg( 'INSERT requires "{rowid}" column. Missing in: {values}'. 
format(rowid=self.rowid_column, values=new_values), logging.ERROR, ) return (0, 0) document_id = new_values[self.rowid_column] new_values.pop(self.rowid_column, None) for key in self.json_columns.intersection(new_values.keys()): new_values[key] = json.loads(new_values[key]) try: response = self.client.index(id=document_id, body=new_values, **self.arguments) return response except Exception as exception: log2pg( "INDEX for {path}/{document_id} and document {document} failed: {exception}" .format( path=self.path, document_id=document_id, document=new_values, exception=exception, ), logging.ERROR, ) return (0, 0) def update(self, document_id, new_values): """ Update existing documents in Elastic Search """ new_values.pop(self.rowid_column, None) for key in self.json_columns.intersection(new_values.keys()): new_values[key] = json.loads(new_values[key]) try: response = self.client.index(id=document_id, body=new_values, **self.arguments) return response except Exception as exception: log2pg( "INDEX for {path}/{document_id} and document {document} failed: {exception}" .format( path=self.path, document_id=document_id, document=new_values, exception=exception, ), logging.ERROR, ) return (0, 0) def delete(self, document_id): """ Delete documents from Elastic Search """ try: response = self.client.delete(id=document_id, **self.arguments) return response except Exception as exception: log2pg( "DELETE for {path}/{document_id} failed: {exception}".format( path=self.path, document_id=document_id, exception=exception), logging.ERROR, ) return (0, 0) def _get_query(self, quals, aggs=None, group_clauses=None): ignore_columns = [] if self.query_column: ignore_columns.append(self.query_column) if self.score_column: ignore_columns.append(self.score_column) query = quals_to_es( quals, aggs=aggs, group_clauses=group_clauses, ignore_columns=ignore_columns, column_map={self._rowid_column: "_id"} if self._rowid_column else None, ) if group_clauses is not None: # Configure pagination for GROUP BY's query["aggs"]["group_buckets"]["composite"][ "size"] = self.scroll_size if not self.query_column: return query, None query_string = next( (qualifier.value for qualifier in quals if qualifier.field_name == self.query_column), None, ) if query_string: query["query"]["bool"]["must"].append( {"query_string": { "query": query_string }}) return query, query_string def _convert_response_row(self, row_data, columns, query): if query: # Postgres checks the query after too, so the query column needs to be present return dict( [(column, self._convert_response_column(column, row_data)) for column in columns if column in row_data["_source"] or column == self.rowid_column or column == self.score_column] + [(self.query_column, query)]) return { column: self._convert_response_column(column, row_data) for column in columns if column in row_data["_source"] or column == self.rowid_column or column == self.score_column } def _convert_response_column(self, column, row_data): if column == self.rowid_column: return row_data["_id"] if column == self.score_column: return row_data["_score"] value = row_data["_source"][column] if isinstance(value, (list, dict)): return json.dumps(value) return value def _handle_aggregation_response(self, query, response, aggs, group_clauses): if group_clauses is None: result = {} for agg_name in aggs: if agg_name == "count.*": # COUNT(*) is a special case, since it doesn't have a # corresponding aggregation primitive in ES result[agg_name] = response["hits"]["total"]["value"] continue result[agg_name] = 
response["aggregations"][agg_name]["value"] yield result else: while True: for bucket in response["aggregations"]["group_buckets"][ "buckets"]: result = {} for column in group_clauses: result[column] = bucket["key"][column] if aggs is not None: for agg_name in aggs: if agg_name == "count.*": # In general case with GROUP BY clauses COUNT(*) # is taken from the bucket's doc_count field result[agg_name] = bucket["doc_count"] continue result[agg_name] = bucket[agg_name]["value"] yield result # Check if we need to paginate results if "after_key" not in response["aggregations"][ "group_buckets"]: break query["aggs"]["group_buckets"]["composite"][ "after"] = response["aggregations"]["group_buckets"][ "after_key"] response = self.client.search(size=0, body=query, **self.arguments)
class Elastic: db_mbox: str db_source: str db_attachment: str db_account: str db_session: str db_notification: str db_auditlog: str dbname: str def __init__(self, logger_level=None, trace_level=None): # Fetch config config = ponymailconfig.PonymailConfig() # Set default names for all indices we use dbname = config.get('elasticsearch', 'dbname', fallback='ponymail') self.dbname = dbname self.db_mbox = dbname + '-mbox' self.db_source = dbname + '-source' self.db_account = dbname + '-account' self.db_attachment = dbname + '-attachment' self.db_session = dbname + '-session' self.db_notification = dbname + '-notification' self.db_auditlog = dbname + '-auditlog' self.db_version = 0 dburl = config.get('elasticsearch', 'dburl', fallback=None) if not dburl: ssl = config.get('elasticsearch', 'ssl', fallback=False) uri = config.get('elasticsearch', 'uri', fallback='') auth = None if config.has_option('elasticsearch', 'user'): auth = (config.get('elasticsearch', 'user'), config.get('elasticsearch', 'password')) dburl = { "host": config.get('elasticsearch', 'hostname', fallback='localhost'), "port": config.get('elasticsearch', 'port', fallback=9200), "use_ssl": ssl, "url_prefix": uri, "auth": auth, "ca_certs": certifi.where(), } # Always allow this to be set; will be replaced as necessary by wait_for_active_shards self.consistency = config.get("elasticsearch", "write", fallback="quorum") if logger_level: eslog = logging.getLogger("elasticsearch") eslog.setLevel(logger_level) eslog.addHandler(logging.StreamHandler()) else: # elasticsearch logs lots of warnings on retries/connection failure logging.getLogger("elasticsearch").setLevel(logging.ERROR) if trace_level: trace = logging.getLogger("elasticsearch.trace") trace.setLevel(trace_level) trace.addHandler(logging.StreamHandler()) self.es = Elasticsearch( [dburl], max_retries=5, retry_on_timeout=True, ) es_engine_major = self.engineMajor() if es_engine_major in [7, 8]: self.wait_for_active_shards = config.get("elasticsearch", "wait", fallback=1) else: raise Exception("Unexpected elasticsearch version ", es_engine_major) # Mimic ES hierarchy: es.indices.xyz() self.indices = _indices_wrap(self) # convert index type to index name def index_name(self, index): return self.dbname + "-" + index @staticmethod def libraryVersion(): return ES_VERSION @staticmethod def libraryMajor(): return ES_VERSION[0] def engineVersion(self): if not self.db_version: try: self.db_version = self.es.info()["version"]["number"] except ES_ConnectionError: # default if cannot connect; allows retry return "0.0.0" return self.db_version def engineMajor(self): return int(self.engineVersion().split(".")[0]) def search(self, **kwargs): return self.es.search(**kwargs) def index(self, **kwargs): kwargs["wait_for_active_shards"] = self.wait_for_active_shards kwargs["doc_type"] = "_doc" return self.es.index(**kwargs) def create(self, **kwargs): return self.es.create(**kwargs) def info(self, **kwargs): return self.es.info(**kwargs) def update(self, **kwargs): return self.es.update(**kwargs) # TODO: is this used? Does it make sense for ES7 ? 
def scan(self, scroll="3m", size=100, **kwargs): return self.es.search(search_type="scan", size=size, scroll=scroll, **kwargs) def get(self, **kwargs): return self.es.get(**kwargs) def scroll(self, **kwargs): return self.es.scroll(**kwargs) def info(self, **kwargs): return self.es.info(**kwargs) def bulk(self, actions, **kwargs): return helpers.bulk(self.es, actions, **kwargs) def streaming_bulk(self, actions, **kwargs): return helpers.streaming_bulk(self.es, actions, **kwargs) def clear_scroll(self, *args, **kwargs): """ Call this to release the scroll id and its resources It looks like the Python library already releases the SID if the caller scrolls to the end of the results, so only need to call this when terminating scrolling early. """ return self.es.clear_scroll(*args, **kwargs)
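
The scan() wrapper above still passes search_type="scan", which was removed in Elasticsearch 5.x, while this class only accepts ES 7/8 engines. On those versions the usual replacement is the helpers.scan generator from elasticsearch-py, which pages through the scroll and clears it automatically. A hedged sketch, not part of the original class, with an assumed index name:

# Hypothetical alternative to the legacy scan() wrapper, using
# elasticsearch.helpers.scan to manage the scroll cursor.
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch(['http://localhost:9200'])
docs = []
for hit in helpers.scan(es,
                        index='ponymail-mbox',  # assumed: dbname + '-mbox'
                        query={'query': {'match_all': {}}},
                        size=100,
                        scroll='3m'):
    docs.append(hit['_source'])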
class ElasticQuery(object): """ Class with pre-built ElasticSearch queries to download/query data from server or local """ # File path for storing data pickles DISK_PATH = '/media/jerry/RecordedFuture/Data' # Minimum file window size FILE_TIME_DELTA = timedelta(minutes=5) def __init__(self, es_server, es_index, username, password): """ Initiates class by signing authenticating on es_server with username and password :param es_server: es server addr :param index: es index on server """ self.QUERY_SIZE = 10000 self.es_index = es_index self.es_server = es_server try: logger.debug('Initializing connection.') self.client = Elasticsearch(self.es_server, http_auth=(username, password), timeout=600) except exceptions.AuthenticationException as e: logger.error('Client Authorization Failed.') raise e logger.debug('Connection established.') # Features of interest from the Netflow ElasticSearch self.col_time = ['timestamp'] self.col_flow = ['src_addr', 'src_port', 'dst_addr', 'dst_port', 'ip_protocol', 'packets', 'bytes'] self.col_node = ['ipaddr'] self.columns = self.col_time + self.col_flow + self.col_node # Construct actual feature name for extraction from ElasitcSearch self.response_columns = ['hits.hits._source.flow.' + _ for _ in self.col_flow] + \ ['hits.hits._source.node.' + _ for _ in self.col_node] self.response_filter = ['_scroll_id', 'hits.total.value', 'hits.hits._source.@timestamp'] + self.response_columns def query_unique(self, field): """ Finds number of unique feature values for given field from ElasticSearch :param field: field following 'hits.hits._source' [examples: flow.ip_protocol, node.hostname] :return: dataframe """ query = \ { 'aggs': { 'nodes': { 'terms': { 'field': field, } } } } logger.debug('Querying uniques for field %s' % field) response = self._search(query, filter_response=False) if response['timed_out']: logger.warning('Query timed out') return pd.DataFrame() logger.debug('%i flows processed in %.2f seconds' % (response['hits']['total']['value'], response['took']/1000)) return pd.DataFrame().from_dict(response['aggregations']['nodes']['buckets']) def get_first_last(self): """ Query ElasticSearch for the first and last timestamp of the records in current index :return: (date_last, date_first, total_hits) """ dates = [] for order in ['desc', 'asc']: query = \ { "query": { "match_all": {} }, "sort": [ { "@timestamp": { "order": order } } ] } response = self._search(query, filter_response=False, size=1) dates.append(datetime.strptime(response['hits']['hits'][0]['_source']['@timestamp'], '%Y-%m-%dT%H:%M:%S.%fZ')) total_hits = response['hits']['total']['value'] # (dates[0] - dates[1]).total_seconds()/(60*60*24) return dates[0], dates[1], total_hits def query_time(self, start_time: datetime, window_size: timedelta, from_disk: bool = True): """ Queries ElasticSearch server starting at start_time :param start_time: datetime to start search at :param window_size: lookup window size in timedelta :param from_disk: check if file exists on disk and load it else download the file :return: dataframe containing data in the time window if any """ # Time parameters time_current = start_time time_change = window_size logger.debug('Querying time %s' % time_current.isoformat()) # Try loading from disk if file(s) is available, else download file(s) to disk and load it if from_disk: if not os.path.exists(self._get_pp(time_current)): while start_time < start_time + time_change: self.download_pickle(start_time) start_time += self.FILE_TIME_DELTA return self.load_pickle(time_current, 
time_change) query = \ {'query': {'bool': {'filter': {'range': {'@timestamp': {'gte': time_current.isoformat(), 'lt': (time_current + time_change).isoformat()} } } } } } return self._query_data(query) def query_ip(self, ip, start_time: datetime, end_time: datetime, src=True): """ Queries ElasticSearch server for src/dst ip/cidr in given time range :param ip: ip, cidr notation acceptable [ex. 192.168.1.1/16] :param start_time: start time in range :param end_time: end time in range :param src: lookup in src_addr/dst_addr :return: dataframe with results """ time_start = start_time time_end = end_time flow_feature = 'flow.src_addr' if src else 'flow.dst_addr' query = \ {'query': {'bool': {'filter': [{'term': {flow_feature: ip} }, {'range': {'@timestamp': {'gte': time_start.isoformat(), 'lt': time_end.isoformat()} } } ] } } } logger.debug('Querying ip %s in time %s' % (ip, time_start.isoformat())) return self._query_data(query) def load_pickle(self, start_time: datetime, window_size: timedelta): """ Load saved pickle files from disk instead of query. """ windows = int(window_size.total_seconds()/(60 * 5)) # Number of windows logger.debug('Loading time %s ' % start_time.isoformat()) df_lst = [] for _ in range(windows): pp = self._get_pp(start_time) df_lst.append(pd.read_pickle(pp)) start_time += self.FILE_TIME_DELTA return pd.concat(df_lst, sort=False, ignore_index=True) def download_pickle(self, start_time: datetime): """ Download pickle file recursively by calling query_time and save the results :param start_time: """ logger.debug('Downloading %s ' % start_time.isoformat()) df = eq.query_time(start_time, self.FILE_TIME_DELTA, from_disk=False) pp = self._get_pp(start_time) if not os.path.exists(os.path.dirname(pp)): os.makedirs(os.path.dirname(pp)) df.to_pickle(pp) def _get_pp(self, current_date): """ :return: pickle file path for given date """ pickle_path = os.path.join(self.DISK_PATH, str(current_date.month), str(current_date.day), '%02d%02d.pickle' % (current_date.hour, current_date.minute)) return pickle_path def _query_data(self, query): """ Queries ElasticSearch server with given query body, saves result to file if a path is given :param query: query body given to elastic search :return: dataframe """ df_lst = [] df_tmp = pd.DataFrame(columns=self.columns) response = self._search(query) scroll_id = response['_scroll_id'] n_flows = response['hits']['total']['value'] if n_flows == 0: logger.warning('Entries not found.\n') return df_tmp lines_skipped = 0 batches = int(np.ceil(n_flows/self.QUERY_SIZE)) logger.debug('Processing %i flows.' % n_flows) for batch in range(max(batches-1, 1)): rows = [] for hit in response['hits']['hits']: row = hit['_source'].get('flow', None) if not row: lines_skipped += 1 continue row.update(hit['_source']['node']) row.update({'timestamp': hit['_source']['@timestamp']}) rows.append(row) df_lst.append(df_tmp.from_dict(rows)) response = self._scroll(scroll_id) self.client.clear_scroll(scroll_id) # Clear Scroll after Finish logger.debug('Processed %i batches, skipped %i lines.' 
% (batches, lines_skipped)) return pd.concat(df_lst, sort=False, ignore_index=True) def _search(self, query, filter_response=True, size=None): """ Wrapper for ElasticSearch search function :param query: query body :param filter_response: :return: """ response_filter = self.response_filter if filter_response else None size = self.QUERY_SIZE if not size else size return self.client.search(index=self.es_index, body=query, size=size, scroll='1m', filter_path=response_filter) def _scroll(self, scroll_id, filter_response=True): """ Wrapper for ElasticSearch scroll function :param scroll_id: :param filter_response: :return: """ response_filter = self.response_filter if filter_response else None return self.client.scroll(scroll_id=scroll_id, scroll='1m', filter_path=response_filter)
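
A usage sketch for the query helper above; the server address, index name, and credentials are placeholders, and the index is assumed to follow the netflow layout the class expects:

# Hypothetical usage: pull a five-minute window of netflow records into
# a pandas DataFrame.
from datetime import datetime, timedelta

eq = ElasticQuery('https://es.example.org:9200', 'netflow-index',
                  username='reader', password='secret')
window_start = datetime(2023, 1, 1, 12, 0)
df = eq.query_time(window_start, timedelta(minutes=5), from_disk=False)
print(df.shape)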
class ElasticSearchSeqSource(base.DataSource): """ Data source which executes arbitrary queries on ElasticSearch This is the sequential reader: will return a list of dictionaries. Parameters ---------- query: str Query to execute. Can either be in Lucene single-line format, or a JSON structured query (presented as text) npartitions: int Split query into this many sections. If one, will not split. qargs: dict Further parameters to pass to the query, such as set of indexes to consider, filtering, ordering. See http://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search es_kwargs: dict Settings for the ES connection, e.g., a simple local connection may be ``{'host': 'localhost', 'port': 9200}``. Other keywords to the Plugin that end up here and are material: scroll: str how long the query is live for, default ``'100m'`` size: int the paging size when downloading, default 1000. metadata: dict Extra information for this source. """ name = 'elasticsearch_seq' container = 'python' version = __version__ partition_access = False def __init__(self, query, npartitions=1, qargs={}, metadata={}, **es_kwargs): from elasticsearch import Elasticsearch self._query = query self._qargs = qargs self._scroll = es_kwargs.pop('scroll', '100m') self._size = es_kwargs.pop('size', 1000) # default page size self._es_kwargs = es_kwargs self._dataframe = None self.es = Elasticsearch([es_kwargs]) # maybe should be (more) global? self.es_version = tuple( int(v) if v.isdigit() else -1 for v in self.es.info()['version']['number'].strip().split(".")) super(ElasticSearchSeqSource, self).__init__(metadata=metadata) self.npartitions = npartitions def _run_query(self, size=None, end=None, slice_id=None, slice_max=None): """Execute query on ES Parameters ---------- size: int Number of objects per page end: int Cut query down to this number of results, useful for getting a sample slice_id, slice_max: int If given, this is one of slice_max partitions. 
""" if size is None: size = self._size if end is not None: size = min(end, size) slice_dict = None if slice_id is not None: slice_dict = {'slice': {'id': slice_id, 'max': slice_max}} try: q = json.loads(self._query) if 'query' not in q: q = {'query': q} if slice_dict: q.update(slice_dict) s = self.es.search(body=q, size=size, scroll=self._scroll, **self._qargs) except (JSONDecodeError, TypeError): s = self.es.search(body=slice_dict, q=self._query, size=size, scroll=self._scroll, **self._qargs) sid = s['_scroll_id'] if self.es_version[0] >= 7: scroll_size = s['hits']['total']['value'] else: scroll_size = s['hits']['total'] while scroll_size > len(s['hits']['hits']): page = self.es.scroll(scroll_id=sid, scroll=self._scroll) sid = page['_scroll_id'] s['hits']['hits'].extend(page['hits']['hits']) if end is not None and len(s['hits']['hits']) > end: break self.es.clear_scroll(scroll_id=sid) return s def read(self): """Read all data in one go""" return self._get_partition() def to_dask(self): """Form partitions into a dask.bag""" import dask.bag as db from dask import delayed self.discover() parts = [] if self.npartitions == 1: part = delayed(self._get_partition)() return db.from_delayed([part]) for slice_id in range(self.npartitions): parts.append(delayed(self._get_partition)(slice_id)) return db.from_delayed(parts) def _get_schema(self, retry=2): return base.Schema(datashape=None, dtype=None, shape=None, npartitions=self.npartitions, extra_metadata={}) def _get_partition(self, partition=None): """ Downloads all data or get specific partion slice of the query Parameters ---------- partition: int or None If None, get all data; otherwise, get specific partition """ slice_id = partition results = self._run_query(slice_id=slice_id, slice_max=self.npartitions) return [r['_source'] for r in results['hits']['hits']]
class ElasticsearchFDW(ForeignDataWrapper): """ Elastic Search Foreign Data Wrapper """ @property def rowid_column(self): """ Returns a column name which will act as a rowid column for delete/update operations. This can be either an existing column name, or a made-up one. This column name should be subsequently present in every returned resultset. """ return self._rowid_column def __init__(self, options, columns): super(ElasticsearchFDW, self).__init__(options, columns) self.index = options.pop("index", "") self.doc_type = options.pop("type", "") self.query_column = options.pop("query_column", None) self.is_json_query = options.pop("query_dsl", "false").lower() == "true" self.score_column = options.pop("score_column", None) self.default_sort = options.pop("default_sort", "") self.sort_column = options.pop("sort_column", None) self.scroll_size = int(options.pop("scroll_size", "1000")) self.scroll_duration = options.pop("scroll_duration", "10m") self._rowid_column = options.pop("rowid_column", "id") username = options.pop("username", None) password = options.pop("password", None) self.refresh = options.pop("refresh", "false").lower() if self.refresh not in {"true", "false", "wait_for"}: raise ValueError( "refresh option must be one of true, false, or wait_for") self.complete_returning = (options.pop("complete_returning", "false").lower() == "true") if ELASTICSEARCH_VERSION[0] >= 7: self.path = "/{index}".format(index=self.index) self.arguments = {"index": self.index} else: self.path = "/{index}/{doc_type}".format(index=self.index, doc_type=self.doc_type) self.arguments = {"index": self.index, "doc_type": self.doc_type} if (username is None) != (password is None): raise ValueError("Must provide both username and password") if username is not None: auth = (username, password) else: auth = None host = options.pop("host", "localhost") port = int(options.pop("port", "9200")) timeout = int(options.pop("timeout", "10")) self.client = Elasticsearch([{ "host": host, "port": port }], http_auth=auth, timeout=timeout, **options) self.columns = columns self.json_columns = { column.column_name for column in columns.values() if column.base_type_name.upper() in {"JSON", "JSONB"} } self.scroll_id = None def get_rel_size(self, quals, columns): """ Helps the planner by returning costs. 
Returns a tuple of the form (number of rows, average row width) """ try: query = self._get_query(quals) if query: if self.is_json_query: response = self.client.count(body=json.loads(query), **self.arguments) else: response = self.client.count(q=query, **self.arguments) else: response = self.client.count(**self.arguments) return (response["count"], len(columns) * 100) except Exception as exception: log2pg( "COUNT for {path} failed: {exception}".format( path=self.path, exception=exception), logging.ERROR, ) return (0, 0) def execute(self, quals, columns): """ Execute the query """ try: arguments = dict(self.arguments) arguments["sort"] = self._get_sort(quals) sort = arguments["sort"] query = self._get_query(quals) if query: if self.is_json_query: response = self.client.search(size=self.scroll_size, scroll=self.scroll_duration, body=json.loads(query), **self.arguments) else: response = self.client.search(size=self.scroll_size, scroll=self.scroll_duration, q=query, **self.arguments) else: response = self.client.search(size=self.scroll_size, scroll=self.scroll_duration, **arguments) while True: self.scroll_id = response["_scroll_id"] for result in response["hits"]["hits"]: yield self._convert_response_row(result, columns, query, sort) if len(response["hits"]["hits"]) < self.scroll_size: return response = self.client.scroll(scroll_id=self.scroll_id, scroll=self.scroll_duration) except Exception as exception: log2pg( "SEARCH for {path} failed: {exception}".format( path=self.path, exception=exception), logging.ERROR, ) return def end_scan(self): """ Hook called at the end of a foreign scan. """ if self.scroll_id: self.client.clear_scroll(scroll_id=self.scroll_id) self.scroll_id = None def insert(self, new_values): """ Insert new documents into Elastic Search """ if self.rowid_column not in new_values: log2pg( 'INSERT requires "{rowid}" column. Missing in: {values}'. 
format(rowid=self.rowid_column, values=new_values), logging.ERROR, ) return (0, 0) document_id = new_values[self.rowid_column] new_values.pop(self.rowid_column, None) for key in self.json_columns.intersection(new_values.keys()): new_values[key] = json.loads(new_values[key]) try: response = self.client.index(id=document_id, body=new_values, refresh=self.refresh, **self.arguments) if self.complete_returning: return self._read_by_id(response["_id"]) return {self.rowid_column: response["_id"]} except Exception as exception: log2pg( "INDEX for {path}/{document_id} and document {document} failed: {exception}" .format( path=self.path, document_id=document_id, document=new_values, exception=exception, ), logging.ERROR, ) return (0, 0) def update(self, document_id, new_values): """ Update existing documents in Elastic Search """ new_values.pop(self.rowid_column, None) for key in self.json_columns.intersection(new_values.keys()): new_values[key] = json.loads(new_values[key]) try: response = self.client.index(id=document_id, body=new_values, refresh=self.refresh, **self.arguments) if self.complete_returning: return self._read_by_id(response["_id"]) return {self.rowid_column: response["_id"]} except Exception as exception: log2pg( "INDEX for {path}/{document_id} and document {document} failed: {exception}" .format( path=self.path, document_id=document_id, document=new_values, exception=exception, ), logging.ERROR, ) return (0, 0) def delete(self, document_id): """ Delete documents from Elastic Search """ if self.complete_returning: document = self._read_by_id(document_id) else: document = {self.rowid_column: document_id} try: self.client.delete(id=document_id, refresh=self.refresh, **self.arguments) return document except Exception as exception: log2pg( "DELETE for {path}/{document_id} failed: {exception}".format( path=self.path, document_id=document_id, exception=exception), logging.ERROR, ) return (0, 0) def _get_query(self, quals): if not self.query_column: return None return next( (qualifier.value for qualifier in quals if qualifier.field_name == self.query_column), None, ) def _get_sort(self, quals): if not self.sort_column: return self.default_sort return next( (qualifier.value for qualifier in quals if qualifier.field_name == self.sort_column and qualifier.value), self.default_sort, ) def _convert_response_row(self, row_data, columns, query, sort): return_dict = { column: self._convert_response_column(column, row_data) for column in columns if column in row_data["_source"] or column == self.rowid_column or column == self.score_column } if query: return_dict[self.query_column] = query return_dict[self.sort_column] = sort return return_dict def _convert_response_column(self, column, row_data): if column == self.rowid_column: return row_data["_id"] if column == self.score_column: return row_data["_score"] value = row_data["_source"][column] if isinstance(value, (list, dict)): return json.dumps(value) return value def _read_by_id(self, row_id): try: arguments = dict(self.arguments) results = self.client.search( body={"query": { "ids": { "values": [row_id] } }}, **arguments)["hits"]["hits"] if results: return self._convert_response_row(results[0], self.columns, None, None) log2pg( "SEARCH for {path} row_id {row_id} returned nothing".format( path=self.path, row_id=row_id), logging.WARNING, ) return {self.rowid_column: row_id} except Exception as exception: log2pg( "SEARCH for {path} row_id {row_id} failed: {exception}".format( path=self.path, row_id=row_id, exception=exception), logging.ERROR, ) 
return {}
# Initialize the scroll
res = es.search(index=sourceIndexName, scroll="90s", size=10, body=query)
scroll_id = res['_scroll_id']
scroll_size = res['hits']['total']
hits_left = scroll_size

# Start scrolling
while scroll_size > 0:
    # print("Queried %d hits: %d left" % (res['hits']['total'], hits_left))
    for hit in res['hits']['hits']:
        result = formatResult(hit)
        # print(result)
        csv_writer.writerow(result)
    res = es.scroll(scroll_id=scroll_id, scroll='2m')
    # Update the scroll ID
    scroll_id = res['_scroll_id']
    # Get the number of results returned in the last scroll
    scroll_size = len(res['hits']['hits'])
    hits_left = hits_left - scroll_size
    # print("scroll size: " + str(scroll_size))
    # Do something with the obtained page

es.clear_scroll(scroll_id=scroll_id)
csv_file.close()

# for hit in res['hits']['hits']:
#     print("%(MessageUUID)s" % hit["_source"])
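# The same export can be written against elasticsearch.helpers.scan, which tracks
# the scroll id and clears the context when the iterator is exhausted. A sketch
# under the same assumptions as the snippet above (an es client, an open
# csv_writer, a query dict and a formatResult helper); the index name stays the
# placeholder sourceIndexName.
from elasticsearch.helpers import scan

# scan() issues the initial search, follows the scroll and clears it when done
for hit in scan(es, index=sourceIndexName, query=query, size=10, scroll='90s'):
    csv_writer.writerow(formatResult(hit))
csv_file.close()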
def main(): config = create_config() es_aws = config.get('DB-Section', 'es-aws') elk_credentials = config.get('Secrets-Section', 'elk-secret').strip('"').split(' ') # ------------------------------------------------------------------------------------------------------------------ # INIT ES CONNECTION # ------------------------------------------------------------------------------------------------------------------ es_host_config = { 'host': config.get('DB-Section', 'es-host', fallback='localhost'), 'port': config.get('DB-Section', 'es-port', fallback='9200') } if es_aws == 'True': es = Elasticsearch(hosts=[es_host_config], http_auth=(elk_credentials[0], elk_credentials[1]), scheme='https') else: es = Elasticsearch(hosts=[es_host_config]) # ------------------------------------------------------------------------------------------------------------------ # INIT ALL INDICES # ------------------------------------------------------------------------------------------------------------------ es_manager = ESManager() for index in ESIndices: if not es_manager.index_exists(index): create_result = es_manager.create_index(index) print(create_result) # ------------------------------------------------------------------------------------------------------------------ # GET ALL MODULES FROM 'modules' INDEX # ------------------------------------------------------------------------------------------------------------------ all_results = {} match_all_query = {'query': {'match_all': {}}} total_index_docs = 0 es_result = es.search(index=ESIndices.MODULES.value, body=match_all_query, scroll=u'10s', size=250) scroll_id = es_result.get('_scroll_id') hits = es_result['hits']['hits'] _store_hits(hits, all_results) total_index_docs += len(hits) while len(es_result['hits']['hits']): es_result = es.scroll(scroll_id=scroll_id, scroll=u'10s') scroll_id = es_result.get('_scroll_id') hits = es_result['hits']['hits'] _store_hits(hits, all_results) total_index_docs += len(hits) es.clear_scroll(scroll_id=scroll_id, ignore=(404, )) print('Total number of modules retreived from "modules" index: {}'.format( total_index_docs)) # ------------------------------------------------------------------------------------------------------------------ # FILL 'autocomplete' INDEX # ------------------------------------------------------------------------------------------------------------------ for query in all_results.values(): es_manager.delete_from_index(ESIndices.AUTOCOMPLETE, query) index_result = es_manager.index_module(ESIndices.AUTOCOMPLETE, query) if index_result['result'] != 'created': print(index_result)
class Elastic: db_mbox: str db_source: str db_attachment: str db_account: str db_session: str db_notification: str db_mailinglist: str def __init__(self, dbname=None): # Fetch config config = plugins.ponymailconfig.PonymailConfig() # Set default names for all indices we use self.dbname = config.get('elasticsearch', 'dbname', fallback='ponymail') self.db_mbox = self.dbname + '-mbox' self.db_source = self.dbname + '-source' self.db_account = self.dbname + '-account' self.db_attachment = self.dbname + '-attachment' self.db_session = self.dbname + '-session' self.db_notification = self.dbname + '-notification' self.db_mailinglist = self.dbname + '-mailinglist' self.db_version = 0 ssl = config.get('elasticsearch', 'ssl', fallback=False) uri = config.get('elasticsearch', 'uri', fallback='') auth = None if config.has_option('elasticsearch', 'user'): auth = ( config.get('elasticsearch', 'user'), config.get('elasticsearch', 'password') ) # Always allow this to be set; will be replaced as necessary by wait_for_active_shards self.consistency = config.get("elasticsearch", "write", fallback="quorum") # elasticsearch logs lots of warnings on retries/connection failure logging.getLogger("elasticsearch").setLevel(logging.ERROR) # # add debug # trace = logging.getLogger("elasticsearch.trace") # trace.setLevel(logging.DEBUG) # # create console handler # consoleHandler = logging.StreamHandler() # trace.addHandler(consoleHandler) self.es = Elasticsearch( [ { "host": config.get('elasticsearch', 'hostname', fallback='localhost'), "port": config.get('elasticsearch', 'port', fallback=9200), "use_ssl": ssl, "url_prefix": uri, "auth": auth, "ca_certs": certifi.where(), } ], max_retries=5, retry_on_timeout=True, ) es_engine_major = self.engineMajor() if es_engine_major in [7, 8]: self.wait_for_active_shards = config.get("elasticsearch", "wait", fallback=1) else: raise Exception("Unexpected elasticsearch version ", es_engine_major) # Mimic ES hierarchy: es.indices.xyz() self.indices = _indices_wrap(self) def libraryVersion(self): return ES_VERSION def libraryMajor(self): return ES_VERSION[0] def engineVersion(self): if not self.db_version: try: self.db_version = self.es.info()["version"]["number"] except ES_ConnectionError: # default if cannot connect; allows retry return "0.0.0" return self.db_version def engineMajor(self): return int(self.engineVersion().split(".")[0]) def getdbname(self): return self.dbname def search(self, **kwargs): return self.es.search(index=self.dbname, **kwargs) def index(self, **kwargs): kwargs["wait_for_active_shards"] = self.wait_for_active_shards kwargs["doc_type"] = "_doc" return self.es.index(**kwargs) def update(self, **kwargs): return self.es.update(index=self.dbname, **kwargs) def scan(self, scroll="3m", size=100, **kwargs): return self.es.search( index=self.dbname, search_type="scan", size=size, scroll=scroll, **kwargs ) def scan_and_scroll(self, scroll="3m", size=100, **kwargs): """ Run a backwards compatible scan/scroll, passing an iterator that returns one page of hits per iteration. This incorporates es.scroll for continuous iteration, and thus the scroll() does NOT need to be called at all by the calling process. """ results = self.es.search(index=self.dbname, size=size, scroll=scroll, **kwargs) if results["hits"].get("hits", []): # Might not be there in 2.x? yield results # While we have hits waiting, scroll... 
scroll_size = results["hits"]["total"] while scroll_size > 0: results = self.scroll(scroll_id=results["_scroll_id"], scroll=scroll) scroll_size = len( results["hits"]["hits"] ) # If >0, try another scroll next. yield results def get(self, **kwargs): return self.es.get(index=self.dbname, **kwargs) def scroll(self, **kwargs): return self.es.scroll(**kwargs) def info(self, **kwargs): return self.es.info(**kwargs) def bulk(self, actions, **kwargs): return helpers.bulk(self.es, actions, **kwargs) def clear_scroll(self, *args, **kwargs): """ Call this to release the scroll id and its resources It looks like the Python library already releases the SID if the caller scrolls to the end of the results, so only need to call this when terminating scrolling early. """ return self.es.clear_scroll(*args, **kwargs)
# make a search() request to get all docs in the index
res = elastic_client.search(index="test_index", body=filter, scroll='2m')

# Print the number of hits in the first batch
print("total hits:", len(res["hits"]["hits"]))

# Get scroll id for the scroll api
sid = res['_scroll_id']
scroll_size = len(res['hits']['hits'])

all_hits = res['hits']['hits']
for num, doc in enumerate(all_hits):
    print("DOC ID:", doc["_id"])
    for key, value in doc.items():
        print(key, "-->", value)
    print("\n\n")

while scroll_size > 0:
    print("Scrolling...")
    res = elastic_client.scroll(scroll_id=sid, scroll='2m')
    # Update the scroll ID
    sid = res['_scroll_id']
    # Get the number of results that we returned in the last scroll
    scroll_size = len(res['hits']['hits'])
    all_hits = res['hits']['hits']
    for num, doc in enumerate(all_hits):
        print("DOC ID:", doc["_id"])
        for key, value in doc.items():
            print(key, "-->", value)
        print("\n\n")
    print("scroll size: " + str(scroll_size))

# Release the scroll context once all batches are consumed
elastic_client.clear_scroll(body={'scroll_id': [sid]})
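# Note that the shape of hits.total changed in Elasticsearch 7: older servers
# return a plain integer, while 7.x returns an object with 'value' and 'relation'
# keys. That is why the loop counter above is derived from len(res['hits']['hits']).
# If the total itself is needed, a version-agnostic read looks roughly like this:
total = res['hits']['total']
total_count = total['value'] if isinstance(total, dict) else total
print("total matching docs:", total_count)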
iname = "" dtype = "" # Initialize the scroll page = es.search( index = iname, doc_type = "", scroll = '2m', search_type = 'scan', size = 1000, body = { # your query's body} ) sid = page['_scroll_id'] scroll_size = page['hits']['total'] # start scrolling while (scroll_size > 0): print "scrolling.." page = es.scroll(scroll_id = sid, scroll = '2m') # update the scroll ID sid = page['_scroll_id'] # Get the number of results that we returned in the last scroll scroll_size = len(page['hits']['hits']) print "scroll size: " + str(scroll_size) # Do something with the obtained page # clear scroll es.clear_scroll(body={'scroll_id': [sid]}, ignore=(404, ))