Example #1
def get_flow_batch(start_ts, end_ts):
    es_client = Elasticsearch(ES_CONF, **ES_OPTS)
    indices = ','.join([
        INDEX_NAME + '__0_*_' + it
        for it in timestamps_to_index_dates(start_ts, end_ts)
    ])
    query_body = {
        'query': {
            'range': {
                TIME_FIELD: {
                    'gte': start_ts,
                    'lt': end_ts
                }
            }
        },
        'stored_fields': STORED_FIELDS
    }
    ret = es_client.search(index=indices,
                           body=query_body,
                           scroll=SCROLL_TIMEOUT,
                           size=SCROLL_SIZE)
    while True:
        sid = ret['_scroll_id']
        size = len(ret['hits']['hits'])
        if size == 0:
            break
        yield ret['hits']['hits']
        ret = es_client.scroll(scroll_id=sid, scroll=SCROLL_TIMEOUT)
    es_client.clear_scroll(scroll_id=sid)
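A minimal consumption sketch for the generator above. It assumes the module-level constants (ES_CONF, ES_OPTS, INDEX_NAME, TIME_FIELD, STORED_FIELDS, SCROLL_TIMEOUT, SCROLL_SIZE) and timestamps_to_index_dates() are defined in the same module, and that TIME_FIELD holds epoch milliseconds:

import time

# Hypothetical driver: pull the last hour of flow records page by page.
end_ts = int(time.time() * 1000)          # epoch millis (assumed unit of TIME_FIELD)
start_ts = end_ts - 3600 * 1000

total = 0
for hits in get_flow_batch(start_ts, end_ts):
    total += len(hits)                    # each item is one scroll page of raw hit dicts
print('fetched', total, 'documents')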
Example #2
def scroll_hits(es: Elasticsearch,
                query: dict,
                index: str,
                doc_type: str = '_doc',
                size: int = 100,
                scroll: str = '2m') -> iter:
    response = es.search(index=index,
                         doc_type=doc_type,
                         scroll=scroll,
                         size=size,
                         body=query)
    sid = response['_scroll_id']
    scroll_size = response['hits']['total']
    # Elasticsearch 7+ wraps the total as {'value': N, 'relation': ...}
    if isinstance(scroll_size, dict):
        scroll_size = scroll_size['value']
    print('total hits:', scroll_size, "\thits per scroll:",
          len(response['hits']['hits']))
    # Start scrolling
    while scroll_size > 0:
        for hit in response['hits']['hits']:
            yield hit
        response = es.scroll(scroll_id=sid, scroll=scroll)
        # Update the scroll ID
        sid = response['_scroll_id']
        # Get the number of results that we returned in the last scroll
        scroll_size = len(response['hits']['hits'])
        # Do something with the obtained page
    # remove scroll context
    es.clear_scroll(scroll_id=sid)
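For comparison, the same search/scroll/clear_scroll sequence can be delegated to the scan() helper bundled with elasticsearch-py, which tracks the scroll ID and clears it when the generator is exhausted. A minimal sketch, assuming a reachable local cluster and a hypothetical index name:

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch(['http://localhost:9200'])

# helpers.scan() wraps the manual scroll loop shown above into one generator.
for hit in helpers.scan(es,
                        query={'query': {'match_all': {}}},
                        index='my-index',   # hypothetical index name
                        scroll='2m',
                        size=100):
    print(hit['_id'])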
Example #3
def export(s, sindex, size):
    es = Elasticsearch(s)
    page = es.search(index=sindex,
                     scroll='5m',
                     size=size,
                     body={
                         "query": {
                             "bool": {
                                 "must": [{
                                     "range": {
                                         "bizType": {
                                             "gte": 30,
                                             "lte": 40
                                         }
                                     }
                                 }],
                                 "must_not": [{
                                     "exists": {
                                         "field": "seriesColumnId"
                                     }
                                 }]
                             }
                         },
                         "sort": [{
                             "bizId": {
                                 "order": "asc"
                             }
                         }]
                     })

    try:
        sid = page['_scroll_id']
    except TypeError:
        print "type Error"
        print page
        sys.exit(2)

    scroll_size = page['hits']['total']

    allsize = len(page['hits']['hits'])

    bulkdata(es, page['hits']['hits'], size)

    while scroll_size > 0:
        printSth("Scrolling...")
        page = getpage(es, sid)
        # Update the scroll ID
        sid = page['_scroll_id']

        scroll_size = len(page['hits']['hits'])

        allsize += scroll_size
        printSth("scroll size: " + str(scroll_size) + "  all size: " +
                 str(allsize) + "  \r\n")

        bulkdata(es, page['hits']['hits'], size)

    print("\n" + sid)

    es.clear_scroll(sid)
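Example #4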
    def collect(self):
        hosts = split_and_strip(self.hosts, ",")
        es = Elasticsearch(hosts)

        query = json.loads(self.query) if self.query else {}
        query["sort"] = "_doc"
        if self.slice_max > 1:
            query["slice"] = {"id": self.slice_id, "max": self.slice_max}

        # initial search
        resp = es.search(index=self.indexes,
                         body=query,
                         scroll=self.scroll,
                         size=self.DEFAULT_SIZE)

        scroll_id = resp.get("_scroll_id")
        if scroll_id is None:
            return self._build_collection_task_result(
                CollectionTaskStatus.SUCCESS)

        try:
            first_run = True
            while True:
                # unless search_type was set to scan, the initial search response already contains hits
                if first_run:
                    first_run = False
                else:
                    resp = es.scroll(scroll_id, scroll=self.scroll)

                self._write_docs_to_aliyun_log(resp["hits"])

                # check if we have any errors
                if resp["_shards"]["successful"] < resp["_shards"]["total"]:
                    msg = "Scroll request has only succeeded on %d shards out of %d." % \
                          (resp["_shards"]["successful"], resp["_shards"]["total"])
                    logging.warning(msg)
                    # return self._build_collection_task_result(CollectionTaskStatus.FAIL_NO_RETRY, msg)
                scroll_id = resp.get("_scroll_id")
                # end of scroll
                if scroll_id is None or not resp["hits"]["hits"]:
                    break

            return self._build_collection_task_result(
                CollectionTaskStatus.SUCCESS)
        finally:
            if scroll_id:
                es.clear_scroll(body={"scroll_id": [scroll_id]},
                                ignore=(404, ))
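Example #5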
def export(s, sindex, d, dindex, size):
    es = Elasticsearch([s])
    es_des = Elasticsearch([d])

    page = es.search(index=sindex, scroll='2m', size=size)

    try:
        sid = page['_scroll_id']
    except TypeError:
        print "type Error"
        print page
        sys.exit(2)

    scroll_size = page['hits']['total']

    allsize = len(page['hits']['hits'])

    bulkdata(es_des, page['hits']['hits'], dindex, size)

    while scroll_size > 0:
        printSth("Scrolling...")
        page = getpage(es, sid)
        # Update the scroll ID
        sid = page['_scroll_id']

        scroll_size = len(page['hits']['hits'])

        allsize += scroll_size
        printSth("scroll size: " + str(scroll_size) + "  all size: " +
                 str(allsize) + "  \r\n")

        bulkdata(es_des, page['hits']['hits'], dindex, size)

    print("\n" + sid)

    es.clear_scroll(sid)
Example #6
class ElasticsearchReader(object):

    def __init__(self,
                 index,
                 hosts=None,
                 source='{"query":{"match_all":{}}}',
                 max_docs=0,
                 scroll_size=10,
                 scroll_time='5m',
                 request_timeout=600,
                 report=1000
                 ):
        if hosts is None:
            hosts = ['http://localhost:9200']
        self.index = index
        self.source = json.loads(source) if isinstance(source, str) else source
        self.max_docs = max_docs
        self.scroll_time = scroll_time
        self.scroll_size = scroll_size
        self.request_timeout = request_timeout
        self.es = Elasticsearch(hosts=hosts)
        self.report = report
        self.scroll_id = None

    def __iter__(self):
        self.scroll_id = None
        counter = 0
        running = True
        try:
            while(running):
                if self.scroll_id is None:
                    response = self.es.search(index=self.index,
                                              body=self.source,
                                              params={"request_timeout": self.request_timeout,
                                                      "scroll": self.scroll_time,
                                                      "size": self.scroll_size})
                    logger.info(u'{0} docs exist.'.format(response['hits']['total']))
                else:
                    response = self.es.scroll(scroll_id=self.scroll_id,
                                              params={"request_timeout": self.request_timeout,
                                                      "scroll": self.scroll_time})
                if len(response['hits']['hits']) == 0:
                    self.scroll_id = None
                    running = False
                    break
                self.scroll_id = response['_scroll_id']
                for hit in response['hits']['hits']:
                    if '_source' in hit:
                        counter += 1
                        if self.max_docs > 0 and counter >= self.max_docs:
                            logger.info(u'%d docs are loaded, but it exceeded %d docs.',
                                        counter,
                                        self.max_docs)
                            running = False
                            break
                        if counter % self.report == 0:
                            logger.info(u'%d docs are loaded.', counter)
                        yield hit['_source']
        except NotFoundError:
            logger.exception(u'NotFoundError(Loaded %d docs)', counter)
        except Exception:
            logger.exception(u"Failed to load documents from Elasticsearch(Loaded %d docs).",
                             counter)

        logger.info('Loaded %d documents.', counter)

    def __enter__(self):
        return self

    def __exit__(self, exception_type, exception_value, traceback):
        self.close()

    def close(self):
        if self.scroll_id is not None:
            self.es.clear_scroll(scroll_id=self.scroll_id,
                                 params={"request_timeout": self.request_timeout})
Example #7
class ElasticSearchSeqSource(base.DataSource):
    """
    Data source which executes arbitrary queries on ElasticSearch

    This is the tabular reader: will return dataframes. Nested return items
    will become dict-like objects in the output.

    Parameters
    ----------
    query: str
       Query to execute. Can either be in Lucene single-line format, or a
       JSON structured query (presented as text)
    qargs: dict
        Further parameters to pass to the query, such as set of indexes to
        consider, filtering, ordering. See
        http://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search
    es_kwargs: dict
        Settings for the ES connection, e.g., a simple local connection may be
        ``{'host': 'localhost', 'port': 9200}``.
        Other keywords to the Plugin that end up here and are material:

        scroll: str
            how long the query is live for, default ``'100m'``
        size: int
            the paging size when downloading, default 1000.
    metadata: dict
        Extra information for this source.
    """
    name = 'elasticsearch_seq'
    container = 'python'
    version = __version__
    partition_access = False

    def __init__(self, query, qargs={}, metadata={}, **es_kwargs):
        from elasticsearch import Elasticsearch
        self._query = query
        self._qargs = qargs
        self._scroll = es_kwargs.pop('scroll', '100m')
        self._size = es_kwargs.pop('size', 1000)  # default page size
        self._es_kwargs = es_kwargs
        self._dataframe = None
        self.es = Elasticsearch([es_kwargs])  # maybe should be (more) global?

        super(ElasticSearchSeqSource, self).__init__(metadata=metadata)

    def _run_query(self, size=None, end=None):
        if size is None:
            size = self._size
        if end is not None:
            size = min(end, size)
        try:
            q = json.loads(self._query)
            if 'query' not in q:
                q = {'query': q}
            s = self.es.search(body=q,
                               size=size,
                               scroll=self._scroll,
                               **self._qargs)
        except (JSONDecodeError, TypeError):
            s = self.es.search(q=self._query,
                               size=size,
                               scroll=self._scroll,
                               **self._qargs)
        sid = s['_scroll_id']
        scroll_size = s['hits']['total']
        while scroll_size > len(s['hits']['hits']):
            page = self.es.scroll(scroll_id=sid, scroll=self._scroll)
            sid = page['_scroll_id']
            s['hits']['hits'].extend(page['hits']['hits'])
            if end is not None and len(s['hits']['hits']) > end:
                break
        self.es.clear_scroll(scroll_id=sid)
        return s

    def _get_schema(self, retry=2):
        """Get schema from first 10 hits or cached dataframe"""
        return base.Schema(datashape=None,
                           dtype=None,
                           shape=None,
                           npartitions=1,
                           extra_metadata={})

    def _get_partition(self, _):
        """Downloads all data

        ES has a hard maximum of 10000 items to fetch. Otherwise need to
        implement paging, known to ES as "scroll"
        https://stackoverflow.com/questions/41655913/elk-how-do-i-retrieve-more-than-10000-results-events-in-elastic-search
        """
        results = self._run_query()
        return [r['_source'] for r in results['hits']['hits']]
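A construction sketch for the source documented above, assuming the intake base class is importable and an Elasticsearch version whose hits.total is a plain integer (pre-7.x, which _run_query relies on). Host, port, and index are placeholders:

# Build the source with a structured JSON query and download every matching record.
source = ElasticSearchSeqSource(
    query='{"query": {"match_all": {}}}',
    qargs={'index': 'my-index'},        # hypothetical index, forwarded to es.search()
    host='localhost', port=9200,        # remaining kwargs become the ES connection spec
    scroll='10m', size=500)

records = source._get_partition(0)      # list of _source dicts
print(len(records), 'records downloaded')

Example #8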
class ElasticsearchClient():
    def __init__(self, config):
        self.config = config
        self.client = Elasticsearch(config.elasticsearch_url)
        self.scroll_id = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self._try_clear_scroll()

    def next_page_of_records(self):
        result = self._retrieve_page_of_data()
        self.scroll_id = self._get_scroll_id(result)
        site_map_entries = self._get_site_map_entries(result)
        LOGGER.info('Retrieved {} records from elasticsearch'.format(
            len(site_map_entries)))
        return site_map_entries

    def _retrieve_page_of_data(self):
        try:
            if self.scroll_id:
                result = self._scroll(self.scroll_id)
            else:
                result = self._search()

            return result
        except Exception as e:
            raise Exception(
                'Failed to retrieve a page of data from elasticsearch', e)

    def _get_site_map_entries(self, result):
        try:
            address_page = self._get_addresses(result)
            site_map_entries = self._convert_to_site_map_entries(address_page)
            return site_map_entries
        except Exception as e:
            raise Exception(
                'Failed to convert elasticsearch result to a list of site map entries',
                e)

    def _scroll(self, scroll_id):
        return self.client.scroll(scroll_id=scroll_id,
                                  params={
                                      'scroll': self.config.scroll_expiry,
                                      'search_type': 'scan',
                                      'size': self.config.page_size,
                                      'timeout': self.config.request_timeout,
                                  })

    def _search(self):
        return self.client.search(self.config.es_index,
                                  self.config.es_doc_type,
                                  body=None,
                                  params={
                                      'size': self.config.page_size,
                                      'scroll': self.config.scroll_expiry,
                                      'timeout': self.config.request_timeout,
                                  })

    def _try_clear_scroll(self):
        try:
            self.client.clear_scroll(self.scroll_id)
            LOGGER.info('Cleared elasticsearch scroll')
        except Exception as e:
            LOGGER.warning('Failed to clear scroll in elasticsearch: %s', e)

    def _get_scroll_id(self, search_result):
        try:
            return search_result['_scroll_id']
        except Exception as e:
            raise Exception(
                'Failed to extract scroll ID from the elasticsearch result', e)

    def _get_addresses(self, search_result):
        return search_result['hits']['hits']

    def _convert_to_site_map_entries(self, address_page):
        return [self._get_site_map_entry(address) for address in address_page]

    def _get_page_url(self, address):
        data = address['_source']
        postcode = data['postcode']
        address_key = data['addressKey']
        address_url_segment = address_key[:len(address_key) - len(postcode) -
                                          1]

        return '{}/{}/{}'.format(self.config.base_page_url,
                                 postcode.replace(' ', '_'),
                                 address_url_segment)

    def _get_site_map_entry(self, address):
        entry_datetime = datetime.strptime(address['_source']['entryDatetime'],
                                           '%Y-%m-%dT%H:%M:%S+00')

        return SiteMapUrl(
            location=self._get_page_url(address),
            last_modified=entry_datetime.strftime('%Y-%m-%dT%H:%M+00:00'),
            change_frequency=self.config.url_change_frequency,
        )
Example #9
class ElasticsearchReader(object):
    def __init__(self,
                 index,
                 hosts=None,
                 source='{"query":{"match_all":{}}}',
                 max_docs=0,
                 scroll_size=10,
                 scroll_time='5m',
                 request_timeout=600,
                 report=1000):
        if hosts is None:
            hosts = ['http://localhost:9200']
        self.index = index
        self.source = json.loads(source) if isinstance(source, str) else source
        self.max_docs = max_docs
        self.scroll_time = scroll_time
        self.scroll_size = scroll_size
        self.request_timeout = request_timeout
        self.es = Elasticsearch(hosts=hosts)
        self.report = report
        self.scroll_id = None

    def __iter__(self):
        self.scroll_id = None
        counter = 0
        running = True
        try:
            while (running):
                if self.scroll_id is None:
                    response = self.es.search(index=self.index,
                                              body=self.source,
                                              params={
                                                  "request_timeout":
                                                  self.request_timeout,
                                                  "scroll": self.scroll_time,
                                                  "size": self.scroll_size
                                              })
                    logger.info(u'{0} docs exist.'.format(
                        response['hits']['total']))
                else:
                    response = self.es.scroll(scroll_id=self.scroll_id,
                                              params={
                                                  "request_timeout":
                                                  self.request_timeout,
                                                  "scroll": self.scroll_time
                                              })
                if len(response['hits']['hits']) == 0:
                    self.scroll_id = None
                    running = False
                    break
                self.scroll_id = response['_scroll_id']
                for hit in response['hits']['hits']:
                    if '_source' in hit:
                        counter += 1
                        if self.max_docs > 0 and counter >= self.max_docs:
                            logger.info(
                                u'%d docs are loaded, but it exceeded %d docs.',
                                counter, self.max_docs)
                            running = False
                            break
                        if counter % self.report == 0:
                            logger.info(u'%d docs are loaded.', counter)
                        yield hit['_source']
        except NotFoundError:
            logger.exception(u'NotFoundError(Loaded %d docs)', counter)
        except Exception:
            logger.exception(
                u"Failed to load documents from Elasticsearch(Loaded %d docs).",
                counter)

        logger.info('Loaded %d documents.', counter)

    def __enter__(self):
        return self

    def __exit__(self, exception_type, exception_value, traceback):
        self.close()

    def close(self):
        if self.scroll_id is not None:
            self.es.clear_scroll(
                scroll_id=self.scroll_id,
                params={"request_timeout": self.request_timeout})
Example #10
class Elastic:
    def __init__(self, dbname=None):
        # Fetch config
        config = PonymailConfig()
        self.dbname = dbname or config.get("elasticsearch", "dbname")
        ssl = config.get("elasticsearch", "ssl",
                         fallback="false").lower() == 'true'
        uri = config.get("elasticsearch", "uri", fallback="")
        auth = None
        if config.has_option('elasticsearch', 'user'):
            auth = (config.get('elasticsearch',
                               'user'), config.get('elasticsearch',
                                                   'password'))

        # elasticsearch logs lots of warnings on retries/connection failure
        logging.getLogger("elasticsearch").setLevel(logging.ERROR)

        #         # add debug
        #         trace = logging.getLogger("elasticsearch.trace")
        #         trace.setLevel(logging.DEBUG)
        #         # create console handler
        #         consoleHandler = logging.StreamHandler()
        #         trace.addHandler(consoleHandler)

        self.es = Elasticsearch(
            [{
                'host': config.get("elasticsearch", "hostname"),
                'port': int(config.get("elasticsearch", "port")),
                'use_ssl': ssl,
                'url_prefix': uri,
                'auth': auth,
                'ca_certs': certifi.where()
            }],
            max_retries=5,
            retry_on_timeout=True)
        self.dbVersion = None
        # Mimic ES hierarchy: es.indices.xyz()
        self.indices = _indices_wrap(self)

    def libraryVersion(self):
        return ES_VERSION

    def libraryMajor(self):
        return ES_VERSION[0]

    def engineVersion(self):
        if not self.dbVersion:
            try:
                self.dbVersion = self.info()['version']['number']
            except ES_ConnectionError:
                # default if cannot connect; allows retry
                return '0.0.0'
        return self.dbVersion

    def engineMajor(self):
        return int(self.engineVersion().split('.')[0])

    def getdbname(self):
        return self.dbname

    def search(self, doc_type='mbox', **kwargs):
        return self.es.search(index=self.dbname, doc_type=doc_type, **kwargs)

    def index(self, **kwargs):
        return self.es.index(index=self.dbname, **kwargs)

    def update(self, **kwargs):
        return self.es.update(index=self.dbname, **kwargs)

    def scan(self, doc_type='mbox', scroll='3m', size=100, **kwargs):
        return self.es.search(index=self.dbname,
                              doc_type=doc_type,
                              search_type='scan',
                              size=size,
                              scroll=scroll,
                              **kwargs)

    def get(self, **kwargs):
        return self.es.get(index=self.dbname, **kwargs)

    def scroll(self, **kwargs):
        return self.es.scroll(**kwargs)

    def info(self, **kwargs):
        return self.es.info(**kwargs)

    def bulk(self, actions, **kwargs):
        return helpers.bulk(self.es, actions, **kwargs)

    def clear_scroll(self, *args, **kwargs):
        """
            Call this to release the scroll id and its resources

            It looks like the Python library already releases the SID
            if the caller scrolls to the end of the results, so only need to call this
            when terminating scrolling early.
        """
        return self.es.clear_scroll(*args, **kwargs)
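A scroll-consumption sketch against the wrapper above, mirroring the note in the clear_scroll() docstring: the explicit clear is only required when scrolling stops before the last page. Connection details come from the ponymail configuration file, so none are shown here:

elastic = Elastic()   # reads the [elasticsearch] section of the config

# Page through the default 'mbox' documents 100 at a time.
page = elastic.search(body={'query': {'match_all': {}}}, scroll='3m', size=100)
sid = page['_scroll_id']
while page['hits']['hits']:
    for hit in page['hits']['hits']:
        print(hit['_id'])
    page = elastic.scroll(scroll_id=sid, scroll='3m')
    sid = page['_scroll_id']
elastic.clear_scroll(scroll_id=sid)

Example #11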
class SuricateFDW(ForeignDataWrapper):
    """ Elastic Search Foreign Data Wrapper """
    @property
    def rowid_column(self):
        """ Returns a column name which will act as a rowid column for
            delete/update operations.

            This can be either an existing column name, or a made-up one. This
            column name should be subsequently present in every returned
            resultset. """

        return self._rowid_column

    def __init__(self, options, columns):
        super(SuricateFDW, self).__init__(options, columns)

        self.index = options.pop("index", "")
        self.query_column = options.pop("query_column", None)
        self.response_column = options.pop("response_column", None)
        self.pg_id_column = options.pop("pg_id_column", None)
        self.size = int(options.pop("size", 10))
        self.explain = (options.pop("explain", "false").lower() == "true")
        self._rowid_column = options.pop("rowid_column", "id")
        username = options.pop("username", None)
        password = options.pop("password", None)
        # self.score_column = options.pop("score_column", None)
        # self.default_sort = options.pop("default_sort", "")
        # self.sort_column = options.pop("sort_column", None)
        # self.scroll_size = int(options.pop("scroll_size", "1000"))
        # self.scroll_duration = options.pop("scroll_duration", "10m")

        self.path = "/{index}".format(index=self.index)

        if (username is None) != (password is None):
            raise ValueError("Must provide both username and password")
        if username is not None:
            auth = (username, password)
        else:
            auth = None

        host = options.pop("host", "localhost")
        port = int(options.pop("port", "9200"))
        timeout = int(options.pop("timeout", "10"))
        self.client = Elasticsearch([{
            "host": host,
            "port": port
        }],
                                    http_auth=auth,
                                    timeout=timeout,
                                    **options)
        self.scroll_id = None

    def get_rel_size(self, quals, columns):
        """ Helps the planner by returning costs.
            Returns a tuple of the form (number of rows, average row width) """

        try:
            query = self._get_query(quals)
            q_dict = json.loads(query.encode('utf-8'))
            response = self.client.count(body=q_dict, index=self.index)
            return (response["count"], len(columns) * 100)
        except Exception as exception:
            log2pg(
                "COUNT for {path} failed: {exception}".format(
                    path=self.path, exception=exception),
                logging.ERROR,
            )
            return (0, 0)

    def execute(self, quals, columns):
        """ Execute the query """

        try:
            query = self._get_query(quals)
            q_dict = json.loads(query.encode('utf-8'))
            pg_id = self._get_pg_id(quals)
            response = self.client.search(body=q_dict,
                                          index=self.index,
                                          size=self.size,
                                          explain=self.explain)
            while True:
                for result in response["hits"]["hits"]:
                    yield self._format_out(result, pg_id=pg_id, query=query)

                return
        except Exception as exception:
            log2pg(
                "SEARCH for {path} failed: {exception}".format(
                    path=self.path, exception=exception),
                logging.ERROR,
            )
            return

    def _get_pg_id(self, quals):
        if not self.query_column:
            return None

        return next(
            (qualifier.value for qualifier in quals
             if qualifier.field_name == self.pg_id_column),
            None,
        )

    def end_scan(self):
        """ Hook called at the end of a foreign scan. """
        if self.scroll_id:
            self.client.clear_scroll(scroll_id=self.scroll_id)
            self.scroll_id = None

    def _format_out(self, response, pg_id, query):
        result_dict = {
            self.response_column: json.dumps(response),
            self.pg_id_column: pg_id,
            self.query_column: query
        }
        return result_dict

    def _get_query(self, quals):
        return next(
            (qualifier.value for qualifier in quals
             if qualifier.field_name == self.query_column),
            None,
        )

    def _convert_response_row(self, row_data, columns, query, sort):
        return_dict = {
            column: self._convert_response_column(column, row_data)
            for column in columns if column in row_data["_source"]
            or column == self.rowid_column or column == self.score_column
        }
        if query:
            return_dict[self.query_column] = query
        return_dict[self.sort_column] = sort
        return return_dict

    def _read_by_id(self, row_id):
        try:
            results = self.client.search(
                body={"query": {
                    "ids": {
                        "values": [row_id]
                    }
                }},
                index=self.index)["hits"]["hits"]
            if results:
                return self._convert_response_row(results[0], self.columns,
                                                  None, None)
            log2pg(
                "SEARCH for {path} row_id {row_id} returned nothing".format(
                    path=self.path, row_id=row_id),
                logging.WARNING,
            )
            return {self.rowid_column: row_id}
        except Exception as exception:
            log2pg(
                "SEARCH for {path} row_id {row_id} failed: {exception}".format(
                    path=self.path, row_id=row_id, exception=exception),
                logging.ERROR,
            )
            return {}
Example #12
class ESManager:
    def __init__(self) -> None:
        config = create_config()
        es_aws = config.get('DB-Section', 'es-aws')
        elk_credentials = config.get('Secrets-Section', 'elk-secret').strip('"').split(' ')
        self.elk_repo_name = config.get('General-Section', 'elk-repo-name')
        es_host_config = {
            'host': config.get('DB-Section', 'es-host', fallback='localhost'),
            'port': config.get('DB-Section', 'es-port', fallback='9200')
        }
        if es_aws == 'True':
            self.es = Elasticsearch(hosts=[es_host_config], http_auth=(elk_credentials[0], elk_credentials[1]), scheme='https')
        else:
            self.es = Elasticsearch(hosts=[es_host_config])

    def create_index(self, index: ESIndices):
        """ Create Elasticsearch index with given name.

        Argument:
            :param index_name   (str) name of the index to be created
        """
        index_name = index.value
        index_json_name = f'initialize_{index_name}_index.json'
        index_json_path = os.path.join(os.environ['BACKEND'], 'elasticsearchIndexing/json/', index_json_name)
        with open(index_json_path, encoding='utf-8') as reader:
            index_config = json.load(reader)

        create_result = None
        try:
            create_result = self.es.indices.create(index=index_name, body=index_config, ignore=400)
        except AuthorizationException:
            # https://discuss.elastic.co/t/forbidden-12-index-read-only-allow-delete-api/110282/4
            read_only_query = {'index': {'blocks': {'read_only_allow_delete': 'false'}}}
            self.es.indices.put_settings(index=index_name, body=read_only_query)
            create_result = self.es.indices.create(index=index_name, body=index_config, ignore=400)
        return create_result

    def index_exists(self, index: ESIndices) -> bool:
        """ Check if the index already exists. """
        return self.es.indices.exists(index=index.value)

    def autocomplete(self, index: ESIndices, keyword: KeywordsNames, searched_term: str) -> list:
        """ Get list of the modules which will be returned as autocomplete
        after entering the "search_term" by the user.
        Arguments:
            :param keyword          (KeywordsNames)
            :param searched_term    (str)
        """
        autocomplete_json_path = os.path.join(os.environ['BACKEND'], 'elasticsearchIndexing/json/completion.json')
        with open(autocomplete_json_path, encoding='utf-8') as reader:
            autocomplete_query = json.load(reader)

        autocomplete_query['query']['bool']['must'][0]['term'] = {keyword.value: searched_term.lower()}
        autocomplete_query['aggs']['groupby_module']['terms']['field'] = f'{keyword.value}.keyword'
        rows = self.es.search(index=index.value, body=autocomplete_query)
        hits = rows['aggregations']['groupby_module']['buckets']

        result = [hit['key'] for hit in hits]

        return result

    def delete_from_index(self, index: ESIndices, module: dict):
        delete_module_query = self._get_name_revision_query(index, module)

        return self.es.delete_by_query(index=index.value, body=delete_module_query, conflicts='proceed')

    def delete_from_indices(self, module: dict):
        for index in ESIndices:
            self.delete_from_index(index, module)

    def index_module(self, index: ESIndices, document: dict):
        # TODO: Remove this after reindexing and unification of both indices
        if index == ESIndices.MODULES:
            path = document['path']
            del document['path']
            document['dir'] = path

            name = document['name']
            del document['name']
            document['module'] = name

        return self.es.index(index=index.value, body=document, request_timeout=40)

    def match_all(self, index: ESIndices):
        def _store_hits(hits: list, all_results: dict):
            for hit in hits:
                name = ''
                path = ''
                if index == ESIndices.AUTOCOMPLETE:
                    name = hit['_source']['name']
                    path = hit['_source']['path']
                if index == ESIndices.MODULES:
                    name = hit['_source']['module']
                    path = hit['_source']['dir']
                mod = {
                    'name': name,
                    'revision': hit['_source']['revision'],
                    'organization': hit['_source']['organization'],
                    'path': path
                }
                key = '{}@{}/{}'.format(mod.get('name'), mod.get('revision'), mod.get('organization'))
                if key not in all_results:
                    all_results[key] = mod
                else:
                    print('{} already in all results'.format(key))

        all_results = {}
        match_all_query = {
            'query': {
                'match_all': {}
            }
        }
        total_index_docs = 0
        es_result = self.es.search(index=index.value, body=match_all_query, scroll=u'10s', size=250)
        scroll_id = es_result.get('_scroll_id')
        hits = es_result['hits']['hits']
        _store_hits(hits, all_results)
        total_index_docs += len(hits)

        while es_result['hits']['hits']:
            es_result = self.es.scroll(
                scroll_id=scroll_id,
                scroll=u'10s'
            )

            scroll_id = es_result.get('_scroll_id')
            hits = es_result['hits']['hits']
            _store_hits(hits, all_results)
            total_index_docs += len(hits)

        self.es.clear_scroll(scroll_id=scroll_id, ignore=(404, ))
        return all_results

    def get_module_by_name_revision(self, index: ESIndices, module: dict) -> bool:
        get_module_query = self._get_name_revision_query(index, module)

        es_result = self.es.search(index=index.value, body=get_module_query)

        return es_result['hits']['hits']

    def get_latest_module_revision(self, index: ESIndices, name: str):
        query_path = os.path.join(os.environ['BACKEND'], 'elasticsearchIndexing/json/latest_revision_query.json')
        with open(query_path, encoding='utf-8') as reader:
            latest_revision_query = json.load(reader)

        # TODO: Remove this after reindexing and unification of both indices
        if index == ESIndices.AUTOCOMPLETE:
            del latest_revision_query['query']['bool']['must'][0]['match_phrase']['module.keyword']
            latest_revision_query['query']['bool']['must'][0]['match_phrase'] = {
                'name.keyword': {
                    'query': name
                }
            }
        else:
            latest_revision_query['query']['bool']['must'][0]['match_phrase']['module.keyword']['query'] = name

        es_result = self.es.search(index=index.value, body=latest_revision_query)

        return es_result['hits']['hits']

    def document_exists(self, index: ESIndices, module: dict) -> bool:
        get_module_query = self._get_name_revision_query(index, module)

        es_count = self.es.count(index=index.value, body=get_module_query)

        return es_count['count'] > 0

    def create_snapshot_repository(self, compress):
        body = {
            'type': 'fs',
            'settings': {
                'location': self.elk_repo_name,
                'compress': compress
            }
        }
        es_result = self.es.snapshot.create_repository(repository=self.elk_repo_name, body=body)

        return es_result

    def create_snapshot(self, snapshot_name: str):
        index_body = {
            'indices': '_all'
        }
        return self.es.snapshot.create(repository=self.elk_repo_name, snapshot=snapshot_name, body=index_body)

    def get_sorted_snapshots(self) -> list:
        snapshots = self.es.snapshot.get(repository=self.elk_repo_name, snapshot='_all')['snapshots']
        return sorted(snapshots, key=itemgetter('start_time_in_millis'))

    def restore_snapshot(self, snapshot_name: str):
        index_body = {
            'indices': '_all'
        }
        return self.es.snapshot.restore(repository=self.elk_repo_name, snapshot=snapshot_name, body=index_body)

    def delete_snapshot(self, snapshot_name: str):
        return self.es.snapshot.delete(repository=self.elk_repo_name, snapshot=snapshot_name)

    def _get_name_revision_query(self, index: ESIndices, module: dict):
        module_search_path = os.path.join(os.environ['BACKEND'], 'elasticsearchIndexing/json/module_search.json')
        with open(module_search_path, encoding='utf-8') as reader:
            name_revision_query = json.load(reader)

        # TODO: Remove this after reindexing and unification of both indices
        if index == ESIndices.AUTOCOMPLETE:
            del name_revision_query['query']['bool']['must'][0]['match_phrase']['module.keyword']
            name_revision_query['query']['bool']['must'][0]['match_phrase'] = {
                'name.keyword': {
                    'query': module['name']
                }
            }
        else:
            name_revision_query['query']['bool']['must'][0]['match_phrase']['module.keyword']['query'] = module['name']
        name_revision_query['query']['bool']['must'][1]['match_phrase']['revision']['query'] = module['revision']

        return name_revision_query
Example #13
        results_list = []

        if first_query is True:
            es_response = es.search(index=index_name, body={'query': es_query}, scroll=ES_SCROLL_TIMEOUT, size=ES_DOC_COUNT)
            es_scroll_id = es_response['_scroll_id']
            total_docs = es_response['hits']['total']
            print(total_docs)
            sys.exit()
            first_query = False
        else:
            es_response = es.scroll(scroll_id=es_scroll_id, scroll=ES_SCROLL_TIMEOUT)
            es_scroll_id = es_response['_scroll_id']
        for entry in es_response:
            if (len(es_response['hits']['hits']) != 0) and (entry not in results_list):
                results_list.append(es_response)

        count += len(es_response['hits']['hits'])
        print('Retrieved %s of %s docs' % (count, total_docs))

        if len(es_response['hits']['hits']) == 0:
            queries_complete = True

        print('Creating report....')
        print('done.')

        #gc.collect()

    print(es.clear_scroll(scroll_id=es_scroll_id))
    runtime = datetime.now() - start_time
    print('Total runtime %s' % runtime)
Example #14
class Elastic:
    def __init__(self, dbname=None, **kwargs):
        # Fetch config
        config = configparser.RawConfigParser()
        config.read('ponymail.cfg')
        self.dbname = dbname or config.get("elasticsearch", "dbname")
        ssl = config.get("elasticsearch", "ssl",
                         fallback="false").lower() == 'true'
        uri = config.get("elasticsearch", "uri", fallback="")
        auth = None
        if config.has_option('elasticsearch', 'user'):
            auth = (config.get('elasticsearch',
                               'user'), config.get('elasticsearch',
                                                   'password'))

        # elasticsearch logs lots of warnings on retries/connection failure
        logging.getLogger("elasticsearch").setLevel(logging.ERROR)

        #         # add debug
        #         trace = logging.getLogger("elasticsearch.trace")
        #         trace.setLevel(logging.DEBUG)
        #         # create console handler
        #         consoleHandler = logging.StreamHandler()
        #         trace.addHandler(consoleHandler)

        self.es = Elasticsearch(
            [{
                'host': config.get("elasticsearch", "hostname"),
                'port': int(config.get("elasticsearch", "port")),
                'use_ssl': ssl,
                'url_prefix': uri,
                'auth': auth
            }],
            max_retries=5,
            retry_on_timeout=True)

    def search(self, doc_type='mbox', **kwargs):
        return self.es.search(index=self.dbname, doc_type=doc_type, **kwargs)

    def index(self, **kwargs):
        return self.es.index(index=self.dbname, **kwargs)

    def update(self, **kwargs):
        return self.es.update(index=self.dbname, **kwargs)

    def scan(self, doc_type='mbox', scroll='3m', size=100, **kwargs):
        return self.es.search(index=self.dbname,
                              doc_type=doc_type,
                              search_type='scan',
                              size=size,
                              scroll=scroll,
                              **kwargs)

    def scroll(self, **kwargs):
        return self.es.scroll(**kwargs)

    def bulk(self, actions, **kwargs):
        return helpers.bulk(self.es, actions, **kwargs)

    """ 
        Call this to release the scroll id and its resources

        It looks like the Python library already releases the SID
        if the caller scrolls to the end of the results, so only need to call this
        when terminating scrolling early.
    """

    def clear_scroll(self, *args, **kwargs):
        return self.es.clear_scroll(*args, **kwargs)
Example #15
class ESSearch:
    def __init__(self, connection, es_index='covid_tweets'):
        aws_auth = AWS4Auth(connection['ACCESS_KEY'], connection['SECRET_KEY'],
                            'us-east-2', 'es')
        self.es = ES(hosts=[{
            'host': connection['AWS_HOST'],
            'port': 443
        }],
                     http_auth=aws_auth,
                     use_ssl=True,
                     verify_certs=True,
                     connection_class=RequestsHttpConnection,
                     timeout=60)
        self.es_index = es_index

    def format_query(self,
                     keywords,
                     startDateString=None,
                     endDateString=None,
                     tweettype=None,
                     user=None):
        if len(keywords) == 0:
            keywords = None
        queries = []
        time_range = {}
        if startDateString:
            startDate = datetime.strptime(startDateString, '%m/%d/%Y')
            time_range['gte'] = startDate.strftime('00-%d-%m-%Y')
        if endDateString:
            endDate = datetime.strptime(endDateString, '%m/%d/%Y')
            time_range['lte'] = endDate.strftime('23-%d-%m-%Y')
        if time_range:
            queries.append({'range': {'date': time_range}})
        if keywords:
            queries.append({
                "query_string": {
                    "query": keywords,
                    "fields": ["text"],
                    "default_operator": "or"
                }
            })
        if user:
            queries.append({"match": {"user_name": user}})
        if tweettype:
            queries.append({"terms": {
                "tweet_type": tweettype,
            }})
        print(queries)
        return queries

    def get_doc(self, tweet_id):
        retval = self.es.get(index=self.es_index,
                             id=tweet_id,
                             doc_type='document')
        return retval

    def get_user_tweet(self,
                       keywords,
                       startDateString=None,
                       endDateString=None,
                       tweettype=None,
                       user=None):
        queries = self.format_query(keywords,
                                    startDateString=startDateString,
                                    endDateString=endDateString,
                                    tweettype=tweettype,
                                    user=user)
        total_qry = {"size": 250, 'query': {'bool': {'must': queries}}}
        retval = self.es.search(index=self.es_index,
                                doc_type='document',
                                body=total_qry)
        return retval

    def count(self,
              keywords,
              startDateString=None,
              endDateString=None,
              tweettype=None,
              user=False):
        queries = self.format_query(keywords, startDateString, endDateString,
                                    tweettype)
        total_qry = {}
        retval = {}
        if user:
            total_qry = {
                "size": 0,
                'query': {
                    'bool': {
                        'must': queries
                    }
                },
                "aggs": {
                    "users_count": {
                        "cardinality": {
                            "field": "user_name"
                        }
                    }
                }
            }
            retval = self.es.search(index=self.es_index,
                                    doc_type='document',
                                    body=total_qry)
        else:
            total_qry = {'query': {'bool': {'must': queries}}}
            retval = self.es.count(index=self.es_index, body=total_qry)
        return retval

    def agg_qry(self,
                keywords,
                startDateString=None,
                endDateString=None,
                tweettype=None):
        # Aggregation query is not implemented yet.
        raise NotImplementedError('agg_qry is not implemented')

    def query(self,
              keywords,
              startDateString=None,
              endDateString=None,
              size=None,
              tweettype=None):
        queries = self.format_query(keywords, startDateString, endDateString,
                                    tweettype)
        if size:
            retval = self.es.search(index=self.es_index,
                                    scroll='1m',
                                    doc_type='document',
                                    body={
                                        'size': size,
                                        'query': {
                                            'bool': {
                                                'must': queries
                                            }
                                        }
                                    })
            for tw in retval['hits']['hits']:
                yield tw
        else:
            retval = self.es.search(index=self.es_index,
                                    scroll='1m',
                                    doc_type='document',
                                    body={
                                        'size': 1000,
                                        'query': {
                                            'bool': {
                                                'must': queries
                                            }
                                        }
                                    })
            total = retval["hits"]["total"]
            print(total)
            sid = retval['_scroll_id']
            scroll_size = len(retval['hits']['hits'])
            while scroll_size > 0:
                print("Scrolling...")
                for tw in retval['hits']['hits']:
                    yield tw
                if not size:
                    retval = self.es.scroll(scroll_id=sid, scroll='1m')
                    sid = retval['_scroll_id']
                    scroll_size = len(retval['hits']['hits'])
            self.es.clear_scroll(scroll_id=sid)

    def sizequery(self,
                  keywords,
                  startDateString=None,
                  endDateString=None,
                  size=None,
                  tweettype=None,
                  random=True):
        queries = self.format_query(keywords, startDateString, endDateString,
                                    tweettype)
        queries = {'bool': {'must': queries}}
        print(queries)
        if random:
            queries = {
                "function_score": {
                    "query": queries,
                    "random_score": {},
                }
            }
        if size is not None and size < 1000:
            retval = self.es.search(index=self.es_index,
                                    scroll='1m',
                                    doc_type='document',
                                    body={
                                        'size': size,
                                        'query': queries
                                    })
            for tw in retval['hits']['hits']:
                yield tw
        else:
            retval = self.es.search(index=self.es_index,
                                    scroll='1m',
                                    doc_type='document',
                                    body={
                                        'size': 1000,
                                        'query': queries
                                    })
            total = retval["hits"]["total"]
            so_far = len(retval['hits']['hits'])
            if size is None:
                size = total
            sid = retval['_scroll_id']
            scroll_size = len(retval['hits']['hits'])
            while scroll_size > 0 and so_far <= size:
                print(so_far)
                print(size)
                print(total)
                print("Scrolling...")
                for tw in retval['hits']['hits']:
                    yield tw
                retval = self.es.scroll(scroll_id=sid, scroll='1m')
                sid = retval['_scroll_id']
                scroll_size = len(retval['hits']['hits'])
                so_far = so_far + scroll_size
            self.es.clear_scroll(scroll_id=sid)
        """count = 0
Example #16
def query_index_by_time_range(
    index: str,
    es: Elasticsearch,
    min_time: Optional[datetime] = None,
    max_time: Optional[datetime] = None,
    step: int = 10000,
) -> Iterator[List[Dict]]:
    body = {
        "query": {
            "bool": {
                "must": [],
                "must_not": [],
                "should": []
            }
        },
        "sort": ['_doc'],
        "aggs": {}
    }

    time_range_body = {
        "range": {
            "startTimeMillis": {
                "format": "epoch_millis"
            }
        }
    }
    if min_time is not None:
        time_range_body["range"]["startTimeMillis"]["gte"] = int(
            min_time.timestamp() * 1000)
    if max_time is not None:
        time_range_body["range"]["startTimeMillis"]["lte"] = int(
            max_time.timestamp() * 1000)
    body["query"]["bool"]["must"].append(time_range_body)

    logger.debug(f"query_index_by_time_range body: {body}")
    rsp = None
    retry_counts = 0
    while rsp is None:
        try:
            rsp = es.search(index=index,
                            body=dict(**body, size=step),
                            scroll='5m',
                            timeout='5m')
        except Exception as e:
            logger.warning(f"Exception in search: {e}. Retry")
            rsp = None
            retry_counts += 1
            time.sleep(random.randint(1, 10))
            if retry_counts > 100:
                raise RuntimeError("get search error for too many times")
    total = rsp['hits']['total']["value"]
    scroll_id = rsp['_scroll_id']
    scroll_size = total

    rets = rsp["hits"]["hits"]
    yield rets
    total -= len(rets)
    del rets, rsp

    with tqdm(total=total, desc=f"{index=} {min_time}-{max_time}") as pbar:
        while scroll_size > 0:
            try:
                _rsp = es.scroll(scroll_id=scroll_id, scroll='5m')
            except Exception as e:
                logger.warning(f"Exception in scroll: {e}. Retry")
                continue
            scroll_id = _rsp['_scroll_id']
            scroll_size = len(_rsp['hits']['hits'])
            pbar.update(scroll_size)
            total -= scroll_size
            _rets = _rsp["hits"]["hits"]
            yield _rets

    es.clear_scroll(scroll_id=scroll_id)
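A usage sketch for the time-range pager above; the index name and the 24-hour window are placeholders, and startTimeMillis is assumed to hold epoch milliseconds as in the query body:

from datetime import datetime, timedelta

from elasticsearch import Elasticsearch

es = Elasticsearch(['http://localhost:9200'])
now = datetime.utcnow()

# Each yielded item is one scroll page (a list of raw hit dicts).
seen = 0
for page in query_index_by_time_range('jaeger-span-*',   # hypothetical index
                                      es,
                                      min_time=now - timedelta(days=1),
                                      max_time=now,
                                      step=5000):
    seen += len(page)
print('fetched', seen, 'documents')

Example #17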
class ElasticsearchFDW(ForeignDataWrapper):
    """ Elastic Search Foreign Data Wrapper """
    @property
    def rowid_column(self):
        """Returns a column name which will act as a rowid column for
        delete/update operations.

        This can be either an existing column name, or a made-up one. This
        column name should be subsequently present in every returned
        resultset."""

        return self._rowid_column

    def __init__(self, options, columns):
        super(ElasticsearchFDW, self).__init__(options, columns)

        self.index = options.pop("index", "")
        self.doc_type = options.pop("type", "")
        self.query_column = options.pop("query_column", None)
        self.score_column = options.pop("score_column", None)
        self.scroll_size = int(options.pop("scroll_size", "1000"))
        self.scroll_duration = options.pop("scroll_duration", "10m")
        self._rowid_column = options.pop("rowid_column", "id")
        username = options.pop("username", None)
        password = options.pop("password", None)

        if ELASTICSEARCH_VERSION[0] >= 7:
            self.path = "/{index}".format(index=self.index)
            self.arguments = {"index": self.index}
        else:
            self.path = "/{index}/{doc_type}".format(index=self.index,
                                                     doc_type=self.doc_type)
            self.arguments = {"index": self.index, "doc_type": self.doc_type}

        if (username is None) != (password is None):
            raise ValueError("Must provide both username and password")
        if username is not None:
            auth = (username, password)
        else:
            auth = None

        host = options.pop("host", "localhost")
        port = int(options.pop("port", "9200"))
        timeout = int(options.pop("timeout", "10"))
        self.client = Elasticsearch([{
            "host": host,
            "port": port
        }],
                                    http_auth=auth,
                                    timeout=timeout,
                                    **options)

        self.columns = columns
        self.json_columns = {
            column.column_name
            for column in columns.values()
            if column.base_type_name.upper() in {"JSON", "JSONB"}
        }

        self.scroll_id = None

    def get_rel_size(self, quals, columns):
        """Helps the planner by returning costs.
        Returns a tuple of the form (number of rows, average row width)"""

        try:
            query, _ = self._get_query(quals)

            if query:
                response = self.client.count(body=query, **self.arguments)
            else:
                response = self.client.count(**self.arguments)
            return (response["count"], len(columns) * 100)
        except Exception as exception:
            log2pg(
                "COUNT for {path} failed: {exception}".format(
                    path=self.path, exception=exception),
                logging.ERROR,
            )
            return (0, 0)

    def can_pushdown_upperrel(self):
        return {
            "groupby_supported": True,
            "agg_functions": list(_PG_TO_ES_AGG_FUNCS),
            "operators_supported": _OPERATORS_SUPPORTED,
        }

    def explain(
        self,
        quals,
        columns,
        sortkeys=None,
        aggs=None,
        group_clauses=None,
        verbose=False,
    ):
        query, _ = self._get_query(quals,
                                   aggs=aggs,
                                   group_clauses=group_clauses)
        return [
            "Elasticsearch query to %s" % self.client,
            "Query: %s" % json.dumps(query, indent=4),
        ]

    def execute(self, quals, columns, aggs=None, group_clauses=None):
        """ Execute the query """

        try:
            query, query_string = self._get_query(quals,
                                                  aggs=aggs,
                                                  group_clauses=group_clauses)

            is_aggregation = aggs or group_clauses

            if query:
                response = self.client.search(
                    size=self.scroll_size if not is_aggregation else 0,
                    scroll=self.scroll_duration
                    if not is_aggregation else None,
                    body=query,
                    **self.arguments)
            else:
                response = self.client.search(size=self.scroll_size,
                                              scroll=self.scroll_duration,
                                              **self.arguments)

            if not response["hits"]["hits"] and not is_aggregation:
                return

            if is_aggregation:
                yield from self._handle_aggregation_response(
                    query, response, aggs, group_clauses)
                return

            while True:
                self.scroll_id = response["_scroll_id"]

                for result in response["hits"]["hits"]:
                    yield self._convert_response_row(result, columns,
                                                     query_string)

                if len(response["hits"]["hits"]) < self.scroll_size:
                    return
                response = self.client.scroll(scroll_id=self.scroll_id,
                                              scroll=self.scroll_duration)
        except Exception as exception:
            log2pg(
                "SEARCH for {path} failed: {exception}".format(
                    path=self.path, exception=exception),
                logging.ERROR,
            )
            return

    def end_scan(self):
        if self.scroll_id:
            self.client.clear_scroll(scroll_id=self.scroll_id)
            self.scroll_id = None

    def insert(self, new_values):
        """ Insert new documents into Elastic Search """

        if self.rowid_column not in new_values:
            log2pg(
                'INSERT requires "{rowid}" column. Missing in: {values}'.
                format(rowid=self.rowid_column, values=new_values),
                logging.ERROR,
            )
            return (0, 0)

        document_id = new_values[self.rowid_column]
        new_values.pop(self.rowid_column, None)

        for key in self.json_columns.intersection(new_values.keys()):
            new_values[key] = json.loads(new_values[key])

        try:
            response = self.client.index(id=document_id,
                                         body=new_values,
                                         **self.arguments)
            return response
        except Exception as exception:
            log2pg(
                "INDEX for {path}/{document_id} and document {document} failed: {exception}"
                .format(
                    path=self.path,
                    document_id=document_id,
                    document=new_values,
                    exception=exception,
                ),
                logging.ERROR,
            )
            return (0, 0)

    def update(self, document_id, new_values):
        """ Update existing documents in Elastic Search """

        new_values.pop(self.rowid_column, None)

        for key in self.json_columns.intersection(new_values.keys()):
            new_values[key] = json.loads(new_values[key])

        try:
            response = self.client.index(id=document_id,
                                         body=new_values,
                                         **self.arguments)
            return response
        except Exception as exception:
            log2pg(
                "INDEX for {path}/{document_id} and document {document} failed: {exception}"
                .format(
                    path=self.path,
                    document_id=document_id,
                    document=new_values,
                    exception=exception,
                ),
                logging.ERROR,
            )
            return (0, 0)

    def delete(self, document_id):
        """ Delete documents from Elastic Search """

        try:
            response = self.client.delete(id=document_id, **self.arguments)
            return response
        except Exception as exception:
            log2pg(
                "DELETE for {path}/{document_id} failed: {exception}".format(
                    path=self.path,
                    document_id=document_id,
                    exception=exception),
                logging.ERROR,
            )
            return (0, 0)

    def _get_query(self, quals, aggs=None, group_clauses=None):
        ignore_columns = []
        if self.query_column:
            ignore_columns.append(self.query_column)
        if self.score_column:
            ignore_columns.append(self.score_column)

        query = quals_to_es(
            quals,
            aggs=aggs,
            group_clauses=group_clauses,
            ignore_columns=ignore_columns,
            column_map={self._rowid_column: "_id"}
            if self._rowid_column else None,
        )

        if group_clauses is not None:
            # Configure pagination for GROUP BY's
            query["aggs"]["group_buckets"]["composite"][
                "size"] = self.scroll_size

        if not self.query_column:
            return query, None

        query_string = next(
            (qualifier.value for qualifier in quals
             if qualifier.field_name == self.query_column),
            None,
        )

        if query_string:
            query["query"]["bool"]["must"].append(
                {"query_string": {
                    "query": query_string
                }})

        return query, query_string

    def _convert_response_row(self, row_data, columns, query):
        if query:
            # Postgres checks the query after too, so the query column needs to be present
            return dict(
                [(column, self._convert_response_column(column, row_data))
                 for column in columns if column in row_data["_source"] or
                 column == self.rowid_column or column == self.score_column] +
                [(self.query_column, query)])
        return {
            column: self._convert_response_column(column, row_data)
            for column in columns if column in row_data["_source"]
            or column == self.rowid_column or column == self.score_column
        }

    def _convert_response_column(self, column, row_data):
        if column == self.rowid_column:
            return row_data["_id"]
        if column == self.score_column:
            return row_data["_score"]
        value = row_data["_source"][column]
        if isinstance(value, (list, dict)):
            return json.dumps(value)
        return value

    def _handle_aggregation_response(self, query, response, aggs,
                                     group_clauses):
        if group_clauses is None:
            result = {}

            for agg_name in aggs:
                if agg_name == "count.*":
                    # COUNT(*) is a special case, since it doesn't have a
                    # corresponding aggregation primitive in ES
                    result[agg_name] = response["hits"]["total"]["value"]
                    continue

                result[agg_name] = response["aggregations"][agg_name]["value"]
            yield result
        else:
            while True:
                for bucket in response["aggregations"]["group_buckets"][
                        "buckets"]:
                    result = {}

                    for column in group_clauses:
                        result[column] = bucket["key"][column]

                    if aggs is not None:
                        for agg_name in aggs:
                            if agg_name == "count.*":
                                # In general case with GROUP BY clauses COUNT(*)
                                # is taken from the bucket's doc_count field
                                result[agg_name] = bucket["doc_count"]
                                continue

                            result[agg_name] = bucket[agg_name]["value"]

                    yield result

                # Check if we need to paginate results
                if "after_key" not in response["aggregations"][
                        "group_buckets"]:
                    break

                query["aggs"]["group_buckets"]["composite"][
                    "after"] = response["aggregations"]["group_buckets"][
                        "after_key"]

                response = self.client.search(size=0,
                                              body=query,
                                              **self.arguments)
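
# A standalone, hedged sketch of the composite-aggregation pagination pattern
# that _handle_aggregation_response above relies on; the index name ("logs")
# and the grouping field ("status") are made up for illustration.
from elasticsearch import Elasticsearch

es = Elasticsearch([{"host": "localhost", "port": 9200}])
query = {
    "aggs": {
        "group_buckets": {
            "composite": {
                "size": 1000,
                "sources": [{"status": {"terms": {"field": "status"}}}],
            }
        }
    }
}
while True:
    response = es.search(index="logs", size=0, body=query)
    buckets = response["aggregations"]["group_buckets"]["buckets"]
    for bucket in buckets:
        print(bucket["key"], bucket["doc_count"])
    # No after_key means there are no further pages to fetch
    if "after_key" not in response["aggregations"]["group_buckets"]:
        break
    query["aggs"]["group_buckets"]["composite"]["after"] = \
        response["aggregations"]["group_buckets"]["after_key"]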
Example #18
class Elastic:
    db_mbox: str
    db_source: str
    db_attachment: str
    db_account: str
    db_session: str
    db_notification: str
    db_auditlog: str
    dbname: str

    def __init__(self, logger_level=None, trace_level=None):
        # Fetch config
        config = ponymailconfig.PonymailConfig()

        # Set default names for all indices we use
        dbname = config.get('elasticsearch', 'dbname', fallback='ponymail')
        self.dbname = dbname
        self.db_mbox = dbname + '-mbox'
        self.db_source = dbname + '-source'
        self.db_account = dbname + '-account'
        self.db_attachment = dbname + '-attachment'
        self.db_session = dbname + '-session'
        self.db_notification = dbname + '-notification'
        self.db_auditlog = dbname + '-auditlog'
        self.db_version = 0

        dburl = config.get('elasticsearch', 'dburl', fallback=None)
        if not dburl:
            ssl = config.get('elasticsearch', 'ssl', fallback=False)
            uri = config.get('elasticsearch', 'uri', fallback='')
            auth = None
            if config.has_option('elasticsearch', 'user'):
                auth = (config.get('elasticsearch', 'user'),
                        config.get('elasticsearch', 'password'))
            dburl = {
                "host":
                config.get('elasticsearch', 'hostname', fallback='localhost'),
                "port":
                config.get('elasticsearch', 'port', fallback=9200),
                "use_ssl":
                ssl,
                "url_prefix":
                uri,
                "auth":
                auth,
                "ca_certs":
                certifi.where(),
            }

        # Always allow this to be set; will be replaced as necessary by wait_for_active_shards
        self.consistency = config.get("elasticsearch",
                                      "write",
                                      fallback="quorum")

        if logger_level:
            eslog = logging.getLogger("elasticsearch")
            eslog.setLevel(logger_level)
            eslog.addHandler(logging.StreamHandler())
        else:
            # elasticsearch logs lots of warnings on retries/connection failure
            logging.getLogger("elasticsearch").setLevel(logging.ERROR)

        if trace_level:
            trace = logging.getLogger("elasticsearch.trace")
            trace.setLevel(trace_level)
            trace.addHandler(logging.StreamHandler())

        self.es = Elasticsearch(
            [dburl],
            max_retries=5,
            retry_on_timeout=True,
        )

        es_engine_major = self.engineMajor()
        if es_engine_major in [7, 8]:
            self.wait_for_active_shards = config.get("elasticsearch",
                                                     "wait",
                                                     fallback=1)
        else:
            raise Exception("Unexpected elasticsearch version ",
                            es_engine_major)

        # Mimic ES hierarchy: es.indices.xyz()
        self.indices = _indices_wrap(self)

    # convert index type to index name
    def index_name(self, index):
        return self.dbname + "-" + index

    @staticmethod
    def libraryVersion():
        return ES_VERSION

    @staticmethod
    def libraryMajor():
        return ES_VERSION[0]

    def engineVersion(self):
        if not self.db_version:
            try:
                self.db_version = self.es.info()["version"]["number"]
            except ES_ConnectionError:
                # default if cannot connect; allows retry
                return "0.0.0"
        return self.db_version

    def engineMajor(self):
        return int(self.engineVersion().split(".")[0])

    def search(self, **kwargs):
        return self.es.search(**kwargs)

    def index(self, **kwargs):
        kwargs["wait_for_active_shards"] = self.wait_for_active_shards
        kwargs["doc_type"] = "_doc"
        return self.es.index(**kwargs)

    def create(self, **kwargs):
        return self.es.create(**kwargs)

    def info(self, **kwargs):
        return self.es.info(**kwargs)

    def update(self, **kwargs):
        return self.es.update(**kwargs)

    # TODO: is this used? Does it make sense for ES7 ?
    def scan(self, scroll="3m", size=100, **kwargs):
        return self.es.search(search_type="scan",
                              size=size,
                              scroll=scroll,
                              **kwargs)

    def get(self, **kwargs):
        return self.es.get(**kwargs)

    def scroll(self, **kwargs):
        return self.es.scroll(**kwargs)

    def bulk(self, actions, **kwargs):
        return helpers.bulk(self.es, actions, **kwargs)

    def streaming_bulk(self, actions, **kwargs):
        return helpers.streaming_bulk(self.es, actions, **kwargs)

    def clear_scroll(self, *args, **kwargs):
        """
            Call this to release the scroll id and its resources

            It looks like the Python library already releases the SID
            if the caller scrolls to the end of the results, so only need to call this
            when terminating scrolling early.
        """
        return self.es.clear_scroll(*args, **kwargs)
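
# Hedged usage sketch for the Elastic wrapper above; it assumes a working
# Pony Mail configuration so the constructor can locate the cluster, and
# handle() is a hypothetical callback.
elastic = Elastic()
page = elastic.search(index=elastic.db_mbox,
                      body={"query": {"match_all": {}}},
                      size=100,
                      scroll="2m")
sid = page["_scroll_id"]
while page["hits"]["hits"]:
    for hit in page["hits"]["hits"]:
        handle(hit)
    page = elastic.scroll(scroll_id=sid, scroll="2m")
    sid = page["_scroll_id"]
elastic.clear_scroll(scroll_id=sid)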
Example #19
class ElasticQuery(object):
    """
    Class with pre-built ElasticSearch queries to download/query data from server or local
    """

    # File path for storing data pickles
    DISK_PATH = '/media/jerry/RecordedFuture/Data'

    # Minimum file window size
    FILE_TIME_DELTA = timedelta(minutes=5)

    def __init__(self, es_server, es_index, username, password):
        """
        Initiates class by signing authenticating on es_server with username and password

        :param es_server: es server addr
        :param index: es index on server
        """
        self.QUERY_SIZE = 10000
        self.es_index = es_index
        self.es_server = es_server

        try:
            logger.debug('Initializing connection.')
            self.client = Elasticsearch(self.es_server, http_auth=(username, password), timeout=600)
        except exceptions.AuthenticationException as e:
            logger.error('Client Authorization Failed.')
            raise e
        logger.debug('Connection established.')

        # Features of interest from the Netflow ElasticSearch
        self.col_time = ['timestamp']
        self.col_flow = ['src_addr', 'src_port', 'dst_addr', 'dst_port', 'ip_protocol', 'packets', 'bytes']
        self.col_node = ['ipaddr']
        self.columns = self.col_time + self.col_flow + self.col_node

        # Construct actual feature names for extraction from ElasticSearch
        self.response_columns = ['hits.hits._source.flow.' + _ for _ in self.col_flow] + \
                                ['hits.hits._source.node.' + _ for _ in self.col_node]
        self.response_filter = ['_scroll_id', 'hits.total.value', 'hits.hits._source.@timestamp'] + self.response_columns

    def query_unique(self, field):
        """
        Finds number of unique feature values for given field from ElasticSearch

        :param field: field following 'hits.hits._source' [examples: flow.ip_protocol, node.hostname]
        :return: dataframe
        """
        query = \
            {
                'aggs': {
                    'nodes': {
                        'terms': {
                            'field': field,
                        }
                    }
                }
            }

        logger.debug('Querying uniques for field %s' % field)
        response = self._search(query, filter_response=False)

        if response['timed_out']:
            logger.warning('Query timed out')
            return pd.DataFrame()
        logger.debug('%i flows processed in %.2f seconds' % (response['hits']['total']['value'], response['took']/1000))
        return pd.DataFrame().from_dict(response['aggregations']['nodes']['buckets'])

    def get_first_last(self):
        """
        Query ElasticSearch for the first and last timestamp of the records in current index

        :return: (date_last, date_first, total_hits)
        """
        dates = []
        for order in ['desc', 'asc']:
            query = \
                {
                    "query": {
                        "match_all": {}
                    },
                    "sort": [
                        {
                            "@timestamp": {
                                "order": order
                            }
                        }
                    ]
                }
            response = self._search(query, filter_response=False, size=1)
            dates.append(datetime.strptime(response['hits']['hits'][0]['_source']['@timestamp'],
                                           '%Y-%m-%dT%H:%M:%S.%fZ'))
            total_hits = response['hits']['total']['value']
            # (dates[0] - dates[1]).total_seconds()/(60*60*24)
        return dates[0], dates[1], total_hits

    def query_time(self, start_time: datetime, window_size: timedelta, from_disk: bool = True):
        """
        Queries ElasticSearch server starting at start_time

        :param start_time: datetime to start search at
        :param window_size: lookup window size in timedelta
        :param from_disk: check if file exists on disk and load it else download the file
        :return: dataframe containing data in the time window if any
        """
        # Time parameters
        time_current = start_time
        time_change = window_size

        logger.debug('Querying time %s' % time_current.isoformat())
        # Try loading from disk if file(s) is available, else download file(s) to disk and load it
        if from_disk:
            if not os.path.exists(self._get_pp(time_current)):
                # Download each FILE_TIME_DELTA-sized slice covering the window
                end_time = start_time + time_change
                while start_time < end_time:
                    self.download_pickle(start_time)
                    start_time += self.FILE_TIME_DELTA

            return self.load_pickle(time_current, time_change)

        query = \
            {'query':
                 {'bool':
                      {'filter':
                           {'range':
                                {'@timestamp':
                                     {'gte': time_current.isoformat(),
                                      'lt': (time_current + time_change).isoformat()}
                                 }
                            }
                       }
                  }
             }

        return self._query_data(query)

    def query_ip(self, ip, start_time: datetime, end_time: datetime, src=True):
        """
        Queries ElasticSearch server for src/dst ip/cidr in given time range

        :param ip: ip, cidr notation acceptable [ex. 192.168.1.1/16]
        :param start_time: start time in range
        :param end_time: end time in range
        :param src: lookup in src_addr/dst_addr
        :return: dataframe with results
        """
        time_start = start_time
        time_end = end_time

        flow_feature = 'flow.src_addr' if src else 'flow.dst_addr'

        query = \
            {'query':
                 {'bool':
                      {'filter':
                           [{'term':
                                {flow_feature: ip}
                           },
                               {'range':
                                    {'@timestamp':
                                         {'gte': time_start.isoformat(),
                                          'lt': time_end.isoformat()}
                                     }
                                }
                           ]
                      }
                  }
             }

        logger.debug('Querying ip %s in time %s' % (ip, time_start.isoformat()))
        return self._query_data(query)

    def load_pickle(self, start_time: datetime, window_size: timedelta):
        """
        Load saved pickle files from disk instead of query.
        """
        windows = int(window_size.total_seconds()/(60 * 5))     # Number of windows

        logger.debug('Loading time %s ' % start_time.isoformat())
        df_lst = []
        for _ in range(windows):
            pp = self._get_pp(start_time)
            df_lst.append(pd.read_pickle(pp))
            start_time += self.FILE_TIME_DELTA
        return pd.concat(df_lst, sort=False, ignore_index=True)

    def download_pickle(self, start_time: datetime):
        """
        Download pickle file recursively by calling query_time and save the results

        :param start_time:
        """
        logger.debug('Downloading %s ' % start_time.isoformat())

        df = self.query_time(start_time, self.FILE_TIME_DELTA, from_disk=False)
        pp = self._get_pp(start_time)
        if not os.path.exists(os.path.dirname(pp)):
            os.makedirs(os.path.dirname(pp))
        df.to_pickle(pp)

    def _get_pp(self, current_date):
        """
        :return: pickle file path for given date
        """
        pickle_path = os.path.join(self.DISK_PATH,
                                   str(current_date.month),
                                   str(current_date.day),
                                   '%02d%02d.pickle' % (current_date.hour, current_date.minute))
        return pickle_path

    def _query_data(self, query):
        """
        Queries ElasticSearch server with given query body, saves result to file if a path is given

        :param query: query body given to elastic search
        :return: dataframe
        """
        df_lst = []
        df_tmp = pd.DataFrame(columns=self.columns)

        response = self._search(query)
        scroll_id = response['_scroll_id']
        n_flows = response['hits']['total']['value']
        if n_flows == 0:
            logger.warning('Entries not found.\n')
            return df_tmp

        lines_skipped = 0
        batches = int(np.ceil(n_flows/self.QUERY_SIZE))
        logger.debug('Processing %i flows.' % n_flows)
        for _ in range(batches):  # one scroll page per iteration, including the first response
            rows = []
            for hit in response['hits']['hits']:
                row = hit['_source'].get('flow', None)
                if not row:
                    lines_skipped += 1
                    continue
                row.update(hit['_source']['node'])
                row.update({'timestamp': hit['_source']['@timestamp']})
                rows.append(row)
            df_lst.append(df_tmp.from_dict(rows))
            response = self._scroll(scroll_id)
        self.client.clear_scroll(scroll_id=scroll_id)     # Clear scroll when finished

        logger.debug('Processed %i batches, skipped %i lines.' % (batches, lines_skipped))
        return pd.concat(df_lst, sort=False, ignore_index=True)

    def _search(self, query, filter_response=True, size=None):
        """
        Wrapper for ElasticSearch search function

        :param query: query body
        :param filter_response:
        :return:
        """
        response_filter = self.response_filter if filter_response else None
        size = self.QUERY_SIZE if not size else size
        return self.client.search(index=self.es_index,
                                  body=query,
                                  size=size,
                                  scroll='1m',
                                  filter_path=response_filter)

    def _scroll(self, scroll_id, filter_response=True):
        """
        Wrapper for ElasticSearch scroll function

        :param scroll_id:
        :param filter_response:
        :return:
        """
        response_filter = self.response_filter if filter_response else None
        return self.client.scroll(scroll_id=scroll_id,
                                  scroll='1m',
                                  filter_path=response_filter)
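
# Hedged usage sketch for ElasticQuery; the server address, index pattern and
# credentials below are placeholders.
from datetime import timedelta

eq = ElasticQuery('https://es.example.org:9200', 'netflow-*', 'user', 'secret')
date_last, date_first, total_hits = eq.get_first_last()
# Pull one five-minute window straight from the server (bypassing the disk cache)
df = eq.query_time(date_first, timedelta(minutes=5), from_disk=False)
print(df.shape)
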
class ElasticSearchSeqSource(base.DataSource):
    """
    Data source which executes arbitrary queries on ElasticSearch

    This is the sequential reader: will return a list of dictionaries.

    Parameters
    ----------
    query: str
       Query to execute. Can either be in Lucene single-line format, or a
       JSON structured query (presented as text)
    npartitions: int
        Split query into this many sections. If one, will not split.
    qargs: dict
        Further parameters to pass to the query, such as set of indexes to
        consider, filtering, ordering. See
        http://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search
    es_kwargs: dict
        Settings for the ES connection, e.g., a simple local connection may be
        ``{'host': 'localhost', 'port': 9200}``.
        Other keywords passed to the plugin end up here; the ones that matter are:

        scroll: str
            how long the query is live for, default ``'100m'``
        size: int
            the paging size when downloading, default 1000.
    metadata: dict
        Extra information for this source.
    """
    name = 'elasticsearch_seq'
    container = 'python'
    version = __version__
    partition_access = False

    def __init__(self,
                 query,
                 npartitions=1,
                 qargs={},
                 metadata={},
                 **es_kwargs):
        from elasticsearch import Elasticsearch
        self._query = query
        self._qargs = qargs
        self._scroll = es_kwargs.pop('scroll', '100m')
        self._size = es_kwargs.pop('size', 1000)  # default page size
        self._es_kwargs = es_kwargs
        self._dataframe = None
        self.es = Elasticsearch([es_kwargs])  # maybe should be (more) global?
        self.es_version = tuple(
            int(v) if v.isdigit() else -1
            for v in self.es.info()['version']['number'].strip().split("."))

        super(ElasticSearchSeqSource, self).__init__(metadata=metadata)
        self.npartitions = npartitions

    def _run_query(self, size=None, end=None, slice_id=None, slice_max=None):
        """Execute query on ES

        Parameters
        ----------
        size: int
            Number of objects per page
        end: int
            Cut query down to this number of results, useful for getting a
            sample
        slice_id, slice_max: int
            If given, this is one of slice_max partitions.
        """
        if size is None:
            size = self._size
        if end is not None:
            size = min(end, size)

        slice_dict = None
        if slice_id is not None:
            slice_dict = {'slice': {'id': slice_id, 'max': slice_max}}
        try:
            q = json.loads(self._query)
            if 'query' not in q:
                q = {'query': q}

            if slice_dict:
                q.update(slice_dict)
            s = self.es.search(body=q,
                               size=size,
                               scroll=self._scroll,
                               **self._qargs)
        except (JSONDecodeError, TypeError):
            s = self.es.search(body=slice_dict,
                               q=self._query,
                               size=size,
                               scroll=self._scroll,
                               **self._qargs)
        sid = s['_scroll_id']
        if self.es_version[0] >= 7:
            scroll_size = s['hits']['total']['value']
        else:
            scroll_size = s['hits']['total']
        while scroll_size > len(s['hits']['hits']):
            page = self.es.scroll(scroll_id=sid, scroll=self._scroll)
            sid = page['_scroll_id']
            s['hits']['hits'].extend(page['hits']['hits'])
            if end is not None and len(s['hits']['hits']) > end:
                break
        self.es.clear_scroll(scroll_id=sid)
        return s

    def read(self):
        """Read all data in one go"""
        return self._get_partition()

    def to_dask(self):
        """Form partitions into a dask.bag"""
        import dask.bag as db
        from dask import delayed
        self.discover()
        parts = []
        if self.npartitions == 1:
            part = delayed(self._get_partition)()
            return db.from_delayed([part])

        for slice_id in range(self.npartitions):
            parts.append(delayed(self._get_partition)(slice_id))
        return db.from_delayed(parts)

    def _get_schema(self, retry=2):
        return base.Schema(datashape=None,
                           dtype=None,
                           shape=None,
                           npartitions=self.npartitions,
                           extra_metadata={})

    def _get_partition(self, partition=None):
        """
        Downloads all data or gets a specific partition slice of the query

        Parameters
        ----------
        partition: int or None
            If None, get all data; otherwise, get specific partition
        """
        slice_id = partition
        results = self._run_query(slice_id=slice_id,
                                  slice_max=self.npartitions)
        return [r['_source'] for r in results['hits']['hits']]
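
# Hedged usage sketch for ElasticSearchSeqSource; the host, port and index name
# are placeholders and a reachable cluster is assumed.
source = ElasticSearchSeqSource('{"match_all": {}}',
                                npartitions=1,
                                qargs={"index": "my-index"},
                                host="localhost",
                                port=9200)
records = source.read()   # list of _source dicts
bag = source.to_dask()    # or a dask.bag, if dask is installed
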
class ElasticsearchFDW(ForeignDataWrapper):
    """ Elastic Search Foreign Data Wrapper """
    @property
    def rowid_column(self):
        """ Returns a column name which will act as a rowid column for
            delete/update operations.

            This can be either an existing column name, or a made-up one. This
            column name should be subsequently present in every returned
            resultset. """

        return self._rowid_column

    def __init__(self, options, columns):
        super(ElasticsearchFDW, self).__init__(options, columns)

        self.index = options.pop("index", "")
        self.doc_type = options.pop("type", "")
        self.query_column = options.pop("query_column", None)
        self.is_json_query = options.pop("query_dsl",
                                         "false").lower() == "true"
        self.score_column = options.pop("score_column", None)
        self.default_sort = options.pop("default_sort", "")
        self.sort_column = options.pop("sort_column", None)
        self.scroll_size = int(options.pop("scroll_size", "1000"))
        self.scroll_duration = options.pop("scroll_duration", "10m")
        self._rowid_column = options.pop("rowid_column", "id")
        username = options.pop("username", None)
        password = options.pop("password", None)

        self.refresh = options.pop("refresh", "false").lower()
        if self.refresh not in {"true", "false", "wait_for"}:
            raise ValueError(
                "refresh option must be one of true, false, or wait_for")
        self.complete_returning = (options.pop("complete_returning",
                                               "false").lower() == "true")

        if ELASTICSEARCH_VERSION[0] >= 7:
            self.path = "/{index}".format(index=self.index)
            self.arguments = {"index": self.index}
        else:
            self.path = "/{index}/{doc_type}".format(index=self.index,
                                                     doc_type=self.doc_type)
            self.arguments = {"index": self.index, "doc_type": self.doc_type}

        if (username is None) != (password is None):
            raise ValueError("Must provide both username and password")
        if username is not None:
            auth = (username, password)
        else:
            auth = None

        host = options.pop("host", "localhost")
        port = int(options.pop("port", "9200"))
        timeout = int(options.pop("timeout", "10"))
        self.client = Elasticsearch([{
            "host": host,
            "port": port
        }],
                                    http_auth=auth,
                                    timeout=timeout,
                                    **options)

        self.columns = columns
        self.json_columns = {
            column.column_name
            for column in columns.values()
            if column.base_type_name.upper() in {"JSON", "JSONB"}
        }
        self.scroll_id = None

    def get_rel_size(self, quals, columns):
        """ Helps the planner by returning costs.
            Returns a tuple of the form (number of rows, average row width) """

        try:
            query = self._get_query(quals)
            if query:
                if self.is_json_query:
                    response = self.client.count(body=json.loads(query),
                                                 **self.arguments)
                else:
                    response = self.client.count(q=query, **self.arguments)
            else:
                response = self.client.count(**self.arguments)
            return (response["count"], len(columns) * 100)
        except Exception as exception:
            log2pg(
                "COUNT for {path} failed: {exception}".format(
                    path=self.path, exception=exception),
                logging.ERROR,
            )
            return (0, 0)

    def execute(self, quals, columns):
        """ Execute the query """

        try:
            arguments = dict(self.arguments)
            arguments["sort"] = self._get_sort(quals)
            sort = arguments["sort"]
            query = self._get_query(quals)

            if query:
                if self.is_json_query:
                    response = self.client.search(size=self.scroll_size,
                                                  scroll=self.scroll_duration,
                                                  body=json.loads(query),
                                                  **self.arguments)
                else:
                    response = self.client.search(size=self.scroll_size,
                                                  scroll=self.scroll_duration,
                                                  q=query,
                                                  **self.arguments)
            else:
                response = self.client.search(size=self.scroll_size,
                                              scroll=self.scroll_duration,
                                              **arguments)

            while True:
                self.scroll_id = response["_scroll_id"]

                for result in response["hits"]["hits"]:
                    yield self._convert_response_row(result, columns, query,
                                                     sort)

                if len(response["hits"]["hits"]) < self.scroll_size:
                    return
                response = self.client.scroll(scroll_id=self.scroll_id,
                                              scroll=self.scroll_duration)
        except Exception as exception:
            log2pg(
                "SEARCH for {path} failed: {exception}".format(
                    path=self.path, exception=exception),
                logging.ERROR,
            )
            return

    def end_scan(self):
        """ Hook called at the end of a foreign scan. """
        if self.scroll_id:
            self.client.clear_scroll(scroll_id=self.scroll_id)
            self.scroll_id = None

    def insert(self, new_values):
        """ Insert new documents into Elastic Search """

        if self.rowid_column not in new_values:
            log2pg(
                'INSERT requires "{rowid}" column. Missing in: {values}'.
                format(rowid=self.rowid_column, values=new_values),
                logging.ERROR,
            )
            return (0, 0)

        document_id = new_values[self.rowid_column]
        new_values.pop(self.rowid_column, None)

        for key in self.json_columns.intersection(new_values.keys()):
            new_values[key] = json.loads(new_values[key])

        try:
            response = self.client.index(id=document_id,
                                         body=new_values,
                                         refresh=self.refresh,
                                         **self.arguments)
            if self.complete_returning:
                return self._read_by_id(response["_id"])
            return {self.rowid_column: response["_id"]}
        except Exception as exception:
            log2pg(
                "INDEX for {path}/{document_id} and document {document} failed: {exception}"
                .format(
                    path=self.path,
                    document_id=document_id,
                    document=new_values,
                    exception=exception,
                ),
                logging.ERROR,
            )
            return (0, 0)

    def update(self, document_id, new_values):
        """ Update existing documents in Elastic Search """

        new_values.pop(self.rowid_column, None)

        for key in self.json_columns.intersection(new_values.keys()):
            new_values[key] = json.loads(new_values[key])

        try:
            response = self.client.index(id=document_id,
                                         body=new_values,
                                         refresh=self.refresh,
                                         **self.arguments)
            if self.complete_returning:
                return self._read_by_id(response["_id"])
            return {self.rowid_column: response["_id"]}
        except Exception as exception:
            log2pg(
                "INDEX for {path}/{document_id} and document {document} failed: {exception}"
                .format(
                    path=self.path,
                    document_id=document_id,
                    document=new_values,
                    exception=exception,
                ),
                logging.ERROR,
            )
            return (0, 0)

    def delete(self, document_id):
        """ Delete documents from Elastic Search """

        if self.complete_returning:
            document = self._read_by_id(document_id)
        else:
            document = {self.rowid_column: document_id}

        try:
            self.client.delete(id=document_id,
                               refresh=self.refresh,
                               **self.arguments)
            return document
        except Exception as exception:
            log2pg(
                "DELETE for {path}/{document_id} failed: {exception}".format(
                    path=self.path,
                    document_id=document_id,
                    exception=exception),
                logging.ERROR,
            )
            return (0, 0)

    def _get_query(self, quals):
        if not self.query_column:
            return None

        return next(
            (qualifier.value for qualifier in quals
             if qualifier.field_name == self.query_column),
            None,
        )

    def _get_sort(self, quals):
        if not self.sort_column:
            return self.default_sort

        return next(
            (qualifier.value for qualifier in quals
             if qualifier.field_name == self.sort_column and qualifier.value),
            self.default_sort,
        )

    def _convert_response_row(self, row_data, columns, query, sort):
        return_dict = {
            column: self._convert_response_column(column, row_data)
            for column in columns if column in row_data["_source"]
            or column == self.rowid_column or column == self.score_column
        }
        if query:
            return_dict[self.query_column] = query
        return_dict[self.sort_column] = sort
        return return_dict

    def _convert_response_column(self, column, row_data):
        if column == self.rowid_column:
            return row_data["_id"]
        if column == self.score_column:
            return row_data["_score"]
        value = row_data["_source"][column]
        if isinstance(value, (list, dict)):
            return json.dumps(value)
        return value

    def _read_by_id(self, row_id):
        try:
            arguments = dict(self.arguments)
            results = self.client.search(
                body={"query": {
                    "ids": {
                        "values": [row_id]
                    }
                }}, **arguments)["hits"]["hits"]
            if results:
                return self._convert_response_row(results[0], self.columns,
                                                  None, None)
            log2pg(
                "SEARCH for {path} row_id {row_id} returned nothing".format(
                    path=self.path, row_id=row_id),
                logging.WARNING,
            )
            return {self.rowid_column: row_id}
        except Exception as exception:
            log2pg(
                "SEARCH for {path} row_id {row_id} failed: {exception}".format(
                    path=self.path, row_id=row_id, exception=exception),
                logging.ERROR,
            )
            return {}
Example #22

# Initialize the scroll
res = es.search(index=sourceIndexName, scroll="90s", size=10, body=query)

scroll_id = res['_scroll_id']
scroll_size = res['hits']['total']
if isinstance(scroll_size, dict):  # ES 7+ returns total as a dict
    scroll_size = scroll_size['value']
hits_left = scroll_size

# Start scrolling
while (scroll_size > 0):
    #print("Queried %d hits: %d left" % (scroll_size, hits_left))
    for hit in res['hits']['hits']:
        result = formatResult(hit)
        #print(result)
        csv_writer.writerow(result)

    res = es.scroll(scroll_id=scroll_id, scroll='2m')
    # Update the scroll ID
    scroll_id = res['_scroll_id']
    # Get the number of results that we returned in the last scroll
    scroll_size = len(res['hits']['hits'])
    hits_left = hits_left - scroll_size
    # print ("scroll size: " + str(scroll_size))
    # Do something with the obtained page

es.clear_scroll(scroll_id=scroll_id)
csv_file.close()
#for hit in res['hits']['hits']:
#    print("%(MessageUUID)s" % hit["_source"])
Example #23
def main():
    config = create_config()
    es_aws = config.get('DB-Section', 'es-aws')
    elk_credentials = config.get('Secrets-Section',
                                 'elk-secret').strip('"').split(' ')

    # ------------------------------------------------------------------------------------------------------------------
    # INIT ES CONNECTION
    # ------------------------------------------------------------------------------------------------------------------
    es_host_config = {
        'host': config.get('DB-Section', 'es-host', fallback='localhost'),
        'port': config.get('DB-Section', 'es-port', fallback='9200')
    }
    if es_aws == 'True':
        es = Elasticsearch(hosts=[es_host_config],
                           http_auth=(elk_credentials[0], elk_credentials[1]),
                           scheme='https')
    else:
        es = Elasticsearch(hosts=[es_host_config])
    # ------------------------------------------------------------------------------------------------------------------
    # INIT ALL INDICES
    # ------------------------------------------------------------------------------------------------------------------
    es_manager = ESManager()
    for index in ESIndices:
        if not es_manager.index_exists(index):
            create_result = es_manager.create_index(index)
            print(create_result)
    # ------------------------------------------------------------------------------------------------------------------
    # GET ALL MODULES FROM 'modules' INDEX
    # ------------------------------------------------------------------------------------------------------------------
    all_results = {}
    match_all_query = {'query': {'match_all': {}}}

    total_index_docs = 0
    es_result = es.search(index=ESIndices.MODULES.value,
                          body=match_all_query,
                          scroll=u'10s',
                          size=250)
    scroll_id = es_result.get('_scroll_id')
    hits = es_result['hits']['hits']
    _store_hits(hits, all_results)
    total_index_docs += len(hits)

    while len(es_result['hits']['hits']):
        es_result = es.scroll(scroll_id=scroll_id, scroll=u'10s')
        scroll_id = es_result.get('_scroll_id')
        hits = es_result['hits']['hits']
        _store_hits(hits, all_results)
        total_index_docs += len(hits)

    es.clear_scroll(scroll_id=scroll_id, ignore=(404, ))
    print('Total number of modules retrieved from "modules" index: {}'.format(
        total_index_docs))
    # ------------------------------------------------------------------------------------------------------------------
    # FILL 'autocomplete' INDEX
    # ------------------------------------------------------------------------------------------------------------------
    for query in all_results.values():
        es_manager.delete_from_index(ESIndices.AUTOCOMPLETE, query)
        index_result = es_manager.index_module(ESIndices.AUTOCOMPLETE, query)
        if index_result['result'] != 'created':
            print(index_result)
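
# _store_hits() is referenced above but not included in the snippet; a hedged
# guess at its shape, based on how all_results is consumed afterwards (each
# value is re-indexed into the 'autocomplete' index), might look like this:
def _store_hits(hits: list, all_results: dict) -> None:
    for hit in hits:
        # Key on the document id so duplicates across scroll pages collapse
        all_results[hit['_id']] = hit['_source']
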
class Elastic:
    db_mbox:            str
    db_source:          str
    db_attachment:      str
    db_account:         str
    db_session:         str
    db_notification:    str
    db_mailinglist:     str

    def __init__(self, dbname=None):
        # Fetch config
        config = plugins.ponymailconfig.PonymailConfig()

        # Set default names for all indices we use
        self.dbname = config.get('elasticsearch', 'dbname', fallback='ponymail')
        self.db_mbox = self.dbname + '-mbox'
        self.db_source = self.dbname + '-source'
        self.db_account = self.dbname + '-account'
        self.db_attachment = self.dbname + '-attachment'
        self.db_session = self.dbname + '-session'
        self.db_notification = self.dbname + '-notification'
        self.db_mailinglist = self.dbname + '-mailinglist'
        self.db_version = 0

        ssl = config.get('elasticsearch', 'ssl', fallback=False)
        uri = config.get('elasticsearch', 'uri', fallback='')
        auth = None
        if config.has_option('elasticsearch', 'user'):
            auth = (
                config.get('elasticsearch', 'user'),
                config.get('elasticsearch', 'password')
            )

        # Always allow this to be set; will be replaced as necessary by wait_for_active_shards
        self.consistency = config.get("elasticsearch", "write", fallback="quorum")

        # elasticsearch logs lots of warnings on retries/connection failure
        logging.getLogger("elasticsearch").setLevel(logging.ERROR)

        #         # add debug
        #         trace = logging.getLogger("elasticsearch.trace")
        #         trace.setLevel(logging.DEBUG)
        #         # create console handler
        #         consoleHandler = logging.StreamHandler()
        #         trace.addHandler(consoleHandler)

        self.es = Elasticsearch(
            [
                {
                    "host": config.get('elasticsearch', 'hostname', fallback='localhost'),
                    "port": config.get('elasticsearch', 'port', fallback=9200),
                    "use_ssl": ssl,
                    "url_prefix": uri,
                    "auth": auth,
                    "ca_certs": certifi.where(),
                }
            ],
            max_retries=5,
            retry_on_timeout=True,
        )

        es_engine_major = self.engineMajor()
        if es_engine_major in [7, 8]:
            self.wait_for_active_shards = config.get("elasticsearch", "wait", fallback=1)
        else:
            raise Exception("Unexpected elasticsearch version ", es_engine_major)

        # Mimic ES hierarchy: es.indices.xyz()
        self.indices = _indices_wrap(self)

    def libraryVersion(self):
        return ES_VERSION

    def libraryMajor(self):
        return ES_VERSION[0]

    def engineVersion(self):
        if not self.db_version:
            try:
                self.db_version = self.es.info()["version"]["number"]
            except ES_ConnectionError:
                # default if cannot connect; allows retry
                return "0.0.0"
        return self.db_version

    def engineMajor(self):
        return int(self.engineVersion().split(".")[0])

    def getdbname(self):
        return self.dbname

    def search(self, **kwargs):
        return self.es.search(index=self.dbname, **kwargs)

    def index(self, **kwargs):
        kwargs["wait_for_active_shards"] = self.wait_for_active_shards
        kwargs["doc_type"] = "_doc"
        return self.es.index(**kwargs)

    def update(self, **kwargs):
        return self.es.update(index=self.dbname, **kwargs)

    def scan(self, scroll="3m", size=100, **kwargs):
        return self.es.search(
            index=self.dbname, search_type="scan", size=size, scroll=scroll, **kwargs
        )

    def scan_and_scroll(self, scroll="3m", size=100, **kwargs):
        """ Run a backwards compatible scan/scroll, passing an iterator
            that returns one page of hits per iteration. This
            incorporates es.scroll for continuous iteration, and thus the
            scroll() does NOT need to be called at all by the calling
            process. """
        results = self.es.search(index=self.dbname, size=size, scroll=scroll, **kwargs)
        if results["hits"].get("hits", []):  # Might not be there in 2.x?
            yield results

        # While we have hits waiting, scroll...
        scroll_size = results["hits"]["total"]
        if isinstance(scroll_size, dict):  # ES 7+ returns total as a dict
            scroll_size = scroll_size["value"]
        while scroll_size > 0:
            results = self.scroll(scroll_id=results["_scroll_id"], scroll=scroll)
            scroll_size = len(
                results["hits"]["hits"]
            )  # If >0, try another scroll next.
            yield results

    def get(self, **kwargs):
        return self.es.get(index=self.dbname, **kwargs)

    def scroll(self, **kwargs):
        return self.es.scroll(**kwargs)

    def info(self, **kwargs):
        return self.es.info(**kwargs)

    def bulk(self, actions, **kwargs):
        return helpers.bulk(self.es, actions, **kwargs)

    def clear_scroll(self, *args, **kwargs):
        """
            Call this to release the scroll id and its resources

            It looks like the Python library already releases the SID
            if the caller scrolls to the end of the results, so only need to call this
            when terminating scrolling early.
        """
        return self.es.clear_scroll(*args, **kwargs)
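
# Hedged usage sketch for scan_and_scroll() above; it assumes a working
# Pony Mail configuration, and handle() is a hypothetical callback.
elastic = Elastic()
for page in elastic.scan_and_scroll(scroll="3m", size=500,
                                    body={"query": {"match_all": {}}}):
    for hit in page["hits"]["hits"]:
        handle(hit)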
Example #25
# make a search() request to get all docs in the index
res = elastic_client.search(index="test_index", body=filter, scroll='2m')
# Print records of first batch
print("total hits:", len(res["hits"]["hits"]))
# Get scroll id for scroll api
sid = res['_scroll_id']
scroll_size = len(res['hits']['hits'])
all_hits = res['hits']['hits']
for num, doc in enumerate(all_hits):
    print("DOC ID:", doc["_id"])
    for key, value in doc.items():
        print(key, "-->", value)
    print("\n\n")

while (scroll_size > 0):
    print("Scrolling...")
    res = elastic_client.scroll(scroll_id=sid, scroll='2m')
    # Update the scroll ID
    sid = res['_scroll_id']
    # Get the number of results that we returned in the last scroll
    scroll_size = len(res['hits']['hits'])
    all_hits = res['hits']['hits']
    for num, doc in enumerate(all_hits):
        print("DOC ID:", doc["_id"])
        for key, value in doc.items():
            print(key, "-->", value)
        print("\n\n")
    print("scroll size: " + str(scroll_size))
    # Clear memory for this batch
    elastic_client.clear_scroll(body={'scroll_id': [sid]})
Example #26
iname = ""
dtype = ""

# Initialize the scroll
page = es.search(
    index = iname,
    doc_type = "",
    scroll = '2m',
    search_type = 'scan',
    size = 1000,
    body = {}  # your query's body
    )

sid = page['_scroll_id']
scroll_size = page['hits']['total']

# start scrolling

while (scroll_size > 0):
    print "scrolling.."
    page = es.scroll(scroll_id = sid, scroll = '2m')
    # update the scroll ID
    sid = page['_scroll_id']
    # Get the number of results that we returned in the last scroll
    scroll_size = len(page['hits']['hits'])
    print "scroll size: " + str(scroll_size)
    # Do something with the obtained page

# clear the scroll context once the loop has finished
es.clear_scroll(body={'scroll_id': [sid]}, ignore=(404, ))
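
# For reference: elasticsearch.helpers.scan wraps the search/scroll/clear_scroll
# pattern shown throughout these examples; the index name and query below are
# placeholders.
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch([{"host": "localhost", "port": 9200}])
for hit in helpers.scan(es,
                        index="my-index",
                        query={"query": {"match_all": {}}},
                        scroll="2m",
                        size=1000):
    print(hit["_id"])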