Example #1
def export_emails_archive(data_set_id, email_ids=["f9c9c59a-7fe8-11e5-bb05-08002705cb99"]):
    cherrypy.log("export_emails_archive(index=%s, email_ids=%s)" % (data_set_id, email_ids))
    if not data_set_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing index")
    # if not email:
    #     return tangelo.HTTPStatusCode(400, "invalid service call - missing attachment_id")

    # elasticsearch.exceptions.ConnectionTimeout: ConnectionTimeout caused by - ReadTimeoutError(HTTPConnectionPool(host='10.1.70.143', port=9200): Read timed out. (read timeout=10))
    es = Elasticsearch([{"host" : "10.1.70.143", "port" : 9200}], request_timeout=60)
    # TODO can implement with multiple doc_types and combine attachments in
    emails = es.mget(index=data_set_id, doc_type="emails", body={"docs":[{"_id":id} for id in email_ids]})


    # TODO filename
    filename= "export.tar.gz"
    tangelo.content_type("application/x-gzip")
    header("Content-Disposition", 'attachment; filename="{}"'.format(filename))

    string_buffer = cStringIO.StringIO()
    tar = tarfile.open(mode='w:gz', fileobj=string_buffer)

    # Add each email to the tar
    for email_source in emails["docs"]:

        email = email_source["_source"]

        tarinfo_parent= tarfile.TarInfo(name = email["id"])
        tarinfo_parent.type = tarfile.DIRTYPE
        tarinfo_parent.mode = 0755
        tarinfo_parent.mtime = time.time()
        tar.addfile(tarinfo_parent)

        tarinfo = tarfile.TarInfo(email["id"]+"/"+email["id"]+".json")
        # TODO -- email transformation
        data_string = json.dumps(email)
        fobj = cStringIO.StringIO(data_string)

        tarinfo.size = len(data_string)
        tarinfo.mode = 0644
        tarinfo.mtime = time.time()
        tar.addfile(tarinfo, fobj)

        # Get the attachments
        if email["attachments"]:
            attachments = es.mget(index=data_set_id, doc_type="attachments", body={"docs":[{"_id":attch["guid"]} for attch in email["attachments"]]})
            for attachment_source in attachments["docs"]:
                attachment = attachment_source["_source"]
                filename = attachment["filename"]
                attch_data = str(base64.b64decode(attachment["contents64"]))

                tarinfo_attch = tarfile.TarInfo(email["id"]+"/"+filename)
                tarinfo_attch.size = len(attch_data)
                tarinfo_attch.mode = 0644
                tarinfo_attch.mtime = time.time()
                tar.addfile(tarinfo_attch, cStringIO.StringIO(attch_data))
    tar.close()

    return string_buffer.getvalue()
Example #2
def esidfileconsumegenerator(host=None,port=9200,index=None,type=None,body=None,source=True,source_exclude=None,source_include=None,idfile=None,headless=False,chunksize=1000,timeout=10):
    if os.path.isfile(idfile):
        ids=list()
        notfound_ids=set()
        with open(idfile,"r") as inp:
            for ppn in inp:
                _id=ppn.rstrip()
                ids.append(_id)
        if not source:
            source=True
        tracer = logging.getLogger('elasticsearch')
        tracer.setLevel(logging.WARNING)
        tracer.addHandler(logging.FileHandler('errors.txt'))
        es=Elasticsearch([{'host':host}],port=port,timeout=timeout, max_retries=10, retry_on_timeout=True)
        success=False
        _ids=set()
        try:
            # consume the id list from the end so that, if an exception occurs,
            # any unprocessed ids are still in "ids" and get written back to idfile
            while ids:
                _ids.add(ids.pop())
                if len(_ids)>=chunksize:
                    for doc in es.mget(index=index,doc_type=type,body={'ids':list(_ids)},_source_include=source_include,_source_exclude=source_exclude,_source=source).get("docs"):
                        if headless:
                            yield doc.get("_source")
                        else:
                            yield doc
                    _ids.clear()
            if len(_ids)>0:
                for doc in es.mget(index=index,doc_type=type,body={'ids':list(_ids)},_source_include=source_include,_source_exclude=source_exclude,_source=source).get("docs"):
                    if headless:
                        yield doc.get("_source")
                    else:
                        yield doc
                _ids.clear()
                ids.clear()
        except exceptions.NotFoundError:
            notfound_ids.update(_ids)
        else:
            os.remove(idfile)
        finally:
            ids+=notfound_ids
            with open(idfile,"w") as outp:
                for _id in ids:
                    print(_id,file=outp)
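
A minimal usage sketch for the generator above, assuming an ids.txt file with one document id per line and a local Elasticsearch node; the index and type names are placeholders, not taken from the original code:

# Hypothetical example: stream documents whose ids are listed in ids.txt.
# "finc-main" and "mrc" are placeholder index/type names.
for record in esidfileconsumegenerator(host="localhost",
                                       port=9200,
                                       index="finc-main",
                                       type="mrc",
                                       idfile="ids.txt",
                                       headless=True,
                                       chunksize=500):
    # each record is the bare "_source" dict because headless=True
    print(record)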
Example #3
def _get_es_docs_by_ids(self, docs_ids: List[str]):
    """
    Retrieve several documents from an Elasticsearch index.
    :param docs_ids: ids of the documents to retrieve
    :return: a list of tuples <doc_uri: document_dict>
    """
    if not docs_ids:
        return []
    elastic = Elasticsearch(self._config.es_host)
    return [(doc['_id'], doc['_source'])
            for doc in elastic.mget(body={'ids': docs_ids},
                                    index='dbpedia')['docs']
            if '_source' in doc]
Example #4
class ESRetrieve(BatchStage):
    def __init__(self, es_hosts: str, es_indices: str):
        super().__init__(size=10, timeout=5)
        self._es_indices = es_indices.strip().split(",")
        self._es_hosts = es_hosts
        self._es_client = None
        self._retrieve = None

    def on_start(self):
        self._es_client = Elasticsearch(self._es_hosts)
        # use Elasticsearch mget when a single index is specified
        if len(
                self._es_indices
        ) == 1 and not self._es_client.indices.exists_alias(self._es_indices):
            self._retrieve = self._mget
        else:
            self._retrieve = self._search

    def _mget(self, items: Sequence[DataItem]) -> Sequence[DataItem]:
        body = {"docs": [{"_id": item.payload["_id"]} for item in items]}
        resp = self._es_client.mget(body=body, index=self._es_indices)
        for i, doc in enumerate(resp["docs"]):
            if "error" not in doc:
                items[i].payload.update(doc)
        return items

    def _search(self, items: Sequence[DataItem]) -> Sequence[DataItem]:
        query = {
            "query": {
                "ids": {
                    "values": [item.payload["_id"] for item in items]
                }
            }
        }
        resp = self._es_client.search(body=query, index=self._es_indices)
        for i, doc in enumerate(resp["hits"]["hits"]):
            items[i].payload.update(doc)
        return items

    def process_batch(self, items: Sequence[DataItem]) -> Sequence[DataItem]:
        return self._retrieve(items)
Example #5
class SearchEngine(object):
    def __init__(self, prefix=settings.ELASTICSEARCH_PREFIX):
        #
        serializer = JSONSerializer()
        serializer.mimetype = 'application/json'
        serializer.dumps = serializer.serialize
        serializer.loads = JSONDeserializer().deserialize
        self.es = Elasticsearch(hosts=settings.ELASTICSEARCH_HOSTS,
                                serializer=serializer,
                                **settings.ELASTICSEARCH_CONNECTION_OPTIONS)
        self.logger = logging.getLogger(__name__)
        self.prefix = prefix.lower()

    def _add_prefix(self, *args, **kwargs):
        if args:
            index = args[0].strip()
        else:
            index = kwargs.get('index', '').strip()
        if index is None or index == '':
            raise NotImplementedError("Elasticsearch index not specified.")

        prefix = '%s_' % self.prefix.strip(
        ) if self.prefix and self.prefix.strip() != '' else ''
        index = '%s%s' % (prefix, index)
        if args:
            return index
        else:
            return dict(kwargs, index=index)

    def delete(self, **kwargs):
        """
        Deletes a document from the index
        Pass an index, doc_type, and id to delete a specific document
        Pass a body with a query dsl to delete by query

        """

        kwargs = self._add_prefix(**kwargs)
        body = kwargs.pop('body', None)
        if body is not None:
            try:
                data = []
                refresh = kwargs.pop('refresh', False)
                for hit in helpers.scan(self.es, query=body, **kwargs):
                    hit['_op_type'] = 'delete'
                    data.append(hit)

                return helpers.bulk(self.es, data, refresh=refresh, **kwargs)
            except Exception as detail:
                try:
                    # ignore 404 errors (index_not_found_exception)
                    if detail.status_code == 404:
                        pass
                except:
                    self.logger.warning(
                        '%s: WARNING: failed to delete document by query: %s \nException detail: %s\n'
                        % (datetime.now(), body, detail))
                    raise detail
        else:
            try:
                return self.es.delete(ignore=[404], **kwargs)
            except Exception as detail:
                self.logger.warning(
                    '%s: WARNING: failed to delete document: %s \nException detail: %s\n'
                    % (datetime.now(), body, detail))
                raise detail

    def delete_index(self, **kwargs):
        """
        Deletes an entire index

        """

        kwargs = self._add_prefix(**kwargs)
        print 'deleting index : %s' % kwargs.get('index')
        return self.es.indices.delete(ignore=[400, 404], **kwargs)

    def search(self, **kwargs):
        """
        Search for an item in the index.
        Pass an index, doc_type, and id to get a specific document
        Pass a body with a query dsl to perform a search

        """

        kwargs = self._add_prefix(**kwargs)
        body = kwargs.get('body', None)
        id = kwargs.get('id', None)

        if id:
            if isinstance(id, list):
                kwargs.setdefault('body', {'ids': kwargs.pop('id')})
                return self.es.mget(**kwargs)
            else:
                return self.es.get(**kwargs)

        ret = None
        try:
            ret = self.es.search(**kwargs)
        except Exception as detail:
            self.logger.warning(
                '%s: WARNING: search failed for query: %s \nException detail: %s\n'
                % (datetime.now(), body, detail))
            pass

        return ret

    def create_mapping(self,
                       index,
                       doc_type,
                       fieldname='',
                       fieldtype='string',
                       fieldindex=None,
                       body=None):
        """
        Creates an Elasticsearch body for a single field given an index name and type name

        """

        index = self._add_prefix(index)
        if not body:
            if fieldtype == 'geo_shape':
                body = {
                    doc_type: {
                        'properties': {
                            fieldname: {
                                'type': 'geo_shape',
                                'tree': 'geohash',
                                'precision': '1m'
                            }
                        }
                    }
                }
            else:
                fn = {'type': fieldtype}
                if fieldindex:
                    fn['index'] = fieldindex
                body = {doc_type: {'properties': {fieldname: fn}}}

        self.es.indices.create(index=index, ignore=400)
        self.es.indices.put_mapping(index=index, doc_type=doc_type, body=body)
        print 'creating index : %s/%s' % (index, doc_type)

    def create_index(self, **kwargs):
        kwargs = self._add_prefix(**kwargs)
        self.es.indices.create(**kwargs)
        print 'creating index : %s' % kwargs.get('index', '')

    def index_data(self,
                   index=None,
                   doc_type=None,
                   body=None,
                   idfield=None,
                   id=None,
                   **kwargs):
        """
        Indexes a document or list of documents into Elasticsearch

        If "id" is supplied then will use that as the id of the document

        If "idfield" is supplied then will try to find that property in the
            document itself and use the value found for the id of the document

        """

        index = self._add_prefix(index)
        if not isinstance(body, list):
            body = [body]

        for document in body:
            if idfield is not None:
                if isinstance(document, dict):
                    id = document[idfield]
                else:
                    id = getattr(document, idfield)

            try:
                self.es.index(index=index,
                              doc_type=doc_type,
                              body=document,
                              id=id)
            except Exception as detail:
                self.logger.warning(
                    '%s: WARNING: failed to index document: %s \nException detail: %s\n'
                    % (datetime.now(), document, detail))
                raise detail

    def bulk_index(self, data, **kwargs):
        return helpers.bulk(self.es, data, **kwargs)

    def create_bulk_item(self,
                         op_type='index',
                         index=None,
                         doc_type=None,
                         id=None,
                         data=None):
        return {
            '_op_type': op_type,
            '_index': self._add_prefix(index),
            '_type': doc_type,
            '_id': id,
            '_source': data
        }

    def count(self, **kwargs):
        kwargs = self._add_prefix(**kwargs)
        count = self.es.count(**kwargs)
        if count is not None:
            return count['count']
        else:
            return None

    def BulkIndexer(outer_self, batch_size=500, **kwargs):
        class _BulkIndexer(object):
            def __init__(self, **kwargs):
                self.queue = []
                self.batch_size = kwargs.pop('batch_size', 500)
                self.kwargs = kwargs

            def add(self,
                    op_type='index',
                    index=None,
                    doc_type=None,
                    id=None,
                    data=None):
                doc = {
                    '_op_type': op_type,
                    '_index': outer_self._add_prefix(index),
                    '_type': doc_type,
                    '_id': id,
                    '_source': data
                }
                self.queue.append(doc)

                if len(self.queue) >= self.batch_size:
                    outer_self.bulk_index(self.queue, **self.kwargs)
                    del self.queue[:]  #clear out the array

            def close(self):
                outer_self.bulk_index(self.queue, **self.kwargs)

            def __enter__(self, **kwargs):
                return self

            def __exit__(self, type, value, traceback):
                return self.close()

        return _BulkIndexer(batch_size=batch_size, **kwargs)
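
A short sketch of how the BulkIndexer helper above might be used, assuming settings.* is configured; the index name, doc_type and documents are made up for illustration:

# Hypothetical usage of SearchEngine.BulkIndexer as a context manager.
# "resources" and the generated documents are placeholders, not from the original project.
se = SearchEngine()
with se.BulkIndexer(batch_size=200, raise_on_error=True) as indexer:
    for i in range(1000):
        indexer.add(op_type='index',
                    index='resources',
                    doc_type='resource',
                    id=i,
                    data={'title': 'doc %s' % i})
# on exit, close() flushes whatever is still queued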
Example #6
class ElasticsearchAPI:
    """
    Each query will have its own index based on query name.
    index_name = query.name
    Doc type = query_name to make it possible to set mapping. Mapping is set per doc_type.

    All rows from a Query should look the same no matter the source.

    This makes all the data from all the servers in the same index.
        Comparable.
        Less indexes.
    """
    def __init__(self, host, port, user, password):
        logger.info("Connecting to ES %s..." % host)
        self.es = Elasticsearch(hosts=[
            {'host': host, 'port': port}, ])
        logger.debug(self.es.info())

    @staticmethod
    def from_config_manager(config_manager):
        config = config_manager.get_config('Elasticsearch')

        return ElasticsearchAPI(config['host'],
                                config['port'],
                                config['username'],
                                config['password'])

    def consume_all(self, items, doc_type, index_name, id_column_name):
        print('Pushing %s docs to index: %s' % (len(items), index_name))
        actions = []
        for doc in items:
            action = {
                "_id": doc[id_column_name],
                "_index": index_name,
                "_type": doc_type,
                "_source": doc,
                }
            actions.append(action)
        helpers.bulk(self.es, actions)
        self.es.indices.refresh()

        return len(items)

    def find_ids(self, ids, doc_type, index_name):
        body = {"ids": ids}
        result = self.es.mget(index=index_name, doc_type=doc_type, body=body)
        # print(result)
        if len(result) > 0:
            return [r['_id'] for r in result['docs'] if r['found'] is True]
        return []

    def init_indexes_for(self, sources):
        for source in sources:
            self.init_index_for_source(source)

    def set_mapping(self, doc_type, index_name, mapping):
        self.es.indices.put_mapping(
            index=index_name,
            doc_type=doc_type,
            body=mapping)

    def delete_index(self, index_name):
        print('Truncating data in index: %s' % index_name)
        self.es.indices.delete(index=index_name, ignore=404)

    def create_index(self, index_name):
        print('Creating index %s' % index_name)
        self.es.indices.create(index_name, ignore=400)
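
A hedged sketch of pushing rows and checking which ids actually landed in the index, using the class above; the credentials, doc_type and index name are illustrative only:

# Hypothetical usage: host, credentials and the "server_metrics" index are placeholders.
api = ElasticsearchAPI('localhost', 9200, 'elastic', 'changeme')
rows = [{'id': 1, 'cpu': 0.42}, {'id': 2, 'cpu': 0.13}]
api.consume_all(rows, doc_type='metrics', index_name='server_metrics', id_column_name='id')
# find_ids returns only the ids that were actually found in the index
found = api.find_ids([1, 2, 3], doc_type='metrics', index_name='server_metrics')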
Example #7
def esidfilegenerator(host=None,port=9200,index=None,type=None,body=None,source=True,source_exclude=None,source_include=None,idfile=None,headless=False,chunksize=1000,timeout=10):
    if os.path.isfile(idfile):
        if not source:
            source=True
        tracer = logging.getLogger('elasticsearch')
        tracer.setLevel(logging.WARNING)
        tracer.addHandler(logging.FileHandler('errors.txt'))
        es=Elasticsearch([{'host':host}],port=port,timeout=timeout, max_retries=10, retry_on_timeout=True)
        ids=set()
        with open(idfile,"r") as inp:
            for ppn in inp:
                _id=ppn.rstrip()
                ids.add(_id)
                if len(ids)>=chunksize:
                    if body and "query" in body and "match" in body["query"]:
                        searchbody={"query":{"bool":{"must":[{"match":body["query"]["match"]},{}]}}}
                        for _id in ids:
                            searchbody["query"]["bool"]["must"][1]={"match":{"_id":_id}}
                            #eprint(json.dumps(searchbody))
                            for doc in esgenerator(host=host,port=port,index=index,type=type,body=searchbody,source=source,source_exclude=source_exclude,source_include=source_include,headless=False,timeout=timeout,verbose=False):
                                if headless:
                                    yield doc.get("_source")
                                else:
                                    yield doc
                        ids.clear()
                    else:
                        searchbody={'ids':list(ids)}
                        try:
                            for doc in es.mget(index=index,doc_type=type,body=searchbody,_source_include=source_include,_source_exclude=source_exclude,_source=source).get("docs"):
                                if headless:
                                    yield doc.get("_source")
                                else:
                                    yield doc
                            ids.clear()
                        except exceptions.NotFoundError:
                            continue
        if len(ids)>0:
            if body and "query" in body and "match" in body["query"]:
                searchbody={"query":{"bool":{"must":[{"match":body["query"]["match"]},{}]}}}
                for _id in ids:
                    searchbody["query"]["bool"]["must"][1]={"match":{"_id":_id}}
                    #eprint(json.dumps(searchbody))
                    for doc in esgenerator(host=host,port=port,index=index,type=type,body=searchbody,source=source,source_exclude=source_exclude,source_include=source_include,headless=False,timeout=timeout,verbose=False):
                        if headless:
                            yield doc.get("_source")
                        else:
                            yield doc
                ids.clear()
            else:
                searchbody={'ids':list(ids)}
                try:
                    for doc in es.mget(index=index,doc_type=type,body=searchbody,_source_include=source_include,_source_exclude=source_exclude,_source=source).get("docs"):
                        if headless:
                            yield doc.get("_source")
                        else:
                            yield doc
                    ids.clear()
                except exceptions.NotFoundError:
                    pass
Example #8
class ES(object):
    def __init__(self, es_url, http_auth=None):
        self.es_url = es_url
        self.es = Elasticsearch([es_url],
                                show_ssl_warnings=False,
                                http_auth=http_auth,
                                retry_on_timeout=True)

    def load_data(self, index, doc_type, doc, doc_id):
        # import certifi
        #
        # es = Elasticsearch(
        #     ['localhost', 'otherhost'],
        #     http_auth=('user', 'secret'),
        #     port=443,
        #     use_ssl=True
        # )
        try:
            return self.es.index(index=index,
                                 doc_type=doc_type,
                                 body=doc,
                                 id=doc_id)
        except Exception as e:
            # try once more
            try:
                return self.load_data(index, doc_type, doc, doc_id)
            except Exception as e:
                print e
                return None

    def create_index(self, index_name, es_mapping):
        command = self.es_url + "/" + index_name
        return requests.put(command, data=es_mapping, verify=False)

    def create_alias(self, alias_name, indices):
        url = self.es_url + "/_aliases"
        command = {
            "actions": [{
                "remove": {
                    "index": "*",
                    "alias": alias_name
                }
            }, {
                "add": {
                    "indices": indices,
                    "alias": alias_name
                }
            }]
        }
        return requests.post(url, data=json.dumps(command))

    def load_bulk(self, index, doc_type, doc_id, docs):
        actions = [{
            "_index": index,
            "_type": doc_type,
            "_id": doc[doc_id],
            "_source": {
                json.dumps(doc),
            }
        } for doc in docs]

        helpers.bulk(self.es, actions)

    def retrieve_doc(self, index, doc_type, ids):
        if not isinstance(ids, list):
            ids = [ids]
        query = "{\"query\": {\"ids\": {\"values\":" + json.dumps(ids) + "}}}"
        print query
        try:
            return self.es.search(index=index,
                                  doc_type=doc_type,
                                  body=query,
                                  filter_path=['hits.hits._source'])
        except:
            # try once more
            try:
                return self.es.search(index=index,
                                      doc_type=doc_type,
                                      body=query,
                                      filter_path=['hits.hits._source'])
            except Exception as e:
                print e
                return None

    def search(self,
               index,
               doc_type,
               query,
               ignore_no_index=False,
               **other_params):
        # print query
        try:
            return self.es.search(index=index,
                                  doc_type=doc_type,
                                  body=query,
                                  **other_params)
        except TransportError as e:
            if e.error != 'index_not_found_exception' and ignore_no_index:
                print e
        except Exception as e:
            print e

    def mget(self, index, doc_type, body):
        try:
            return self.es.mget(index=index, doc_type=doc_type, body=body)
        except TransportError as e:
            if e.error != 'index_not_found_exception':
                print e
        except Exception as e:
            print e
Example #9
class SearchEngine(object):
    def __init__(self, **kwargs):
        #
        serializer = JSONSerializer()
        serializer.mimetype = "application/json"
        serializer.dumps = serializer.serialize
        serializer.loads = JSONDeserializer().deserialize
        self.prefix = kwargs.pop("prefix", "").lower()
        self.es = Elasticsearch(serializer=serializer, **kwargs)
        self.logger = logging.getLogger(__name__)

    def _add_prefix(self, *args, **kwargs):
        if args:
            index = args[0].strip()
        else:
            index = kwargs.get("index", "").strip()
        if index is None or index == "":
            raise NotImplementedError("Elasticsearch index not specified.")

        prefix = "%s_" % self.prefix.strip(
        ) if self.prefix and self.prefix.strip() != "" else ""
        ret = []
        for idx in index.split(","):
            ret.append("%s%s" % (prefix, idx))

        index = ",".join(ret)
        if args:
            return index
        else:
            return dict(kwargs, index=index)

    def delete(self, **kwargs):
        """
        Deletes a document from the index
        Pass an index and id to delete a specific document
        Pass a body with a query dsl to delete by query

        """

        kwargs = self._add_prefix(**kwargs)
        body = kwargs.pop("body", None)
        if body is not None:
            try:
                data = []
                refresh = kwargs.pop("refresh", False)
                for hit in helpers.scan(self.es, query=body, **kwargs):
                    hit["_op_type"] = "delete"
                    data.append(hit)

                return helpers.bulk(self.es, data, refresh=refresh, **kwargs)
            except Exception as detail:
                try:
                    # ignore 404 errors (index_not_found_exception)
                    if detail.status_code == 404:
                        pass
                except:
                    self.logger.warning(
                        "%s: WARNING: failed to delete document by query: %s \nException detail: %s\n"
                        % (datetime.now(), body, detail))
                    raise detail
        else:
            try:
                return self.es.delete(ignore=[404], **kwargs)
            except Exception as detail:
                self.logger.warning(
                    "%s: WARNING: failed to delete document: %s \nException detail: %s\n"
                    % (datetime.now(), body, detail))
                raise detail

    def delete_index(self, **kwargs):
        """
        Deletes an entire index

        """

        kwargs = self._add_prefix(**kwargs)
        print("deleting index : %s" % kwargs.get("index"))
        return self.es.indices.delete(ignore=[400, 404], **kwargs)

    def search(self, **kwargs):
        """
        Search for an item in the index.
        Pass an index and id to get a specific document
        Pass a body with a query dsl to perform a search

        """

        kwargs = self._add_prefix(**kwargs)
        body = kwargs.get("body", None)
        id = kwargs.get("id", None)

        if id:
            if isinstance(id, list):
                kwargs.setdefault("body", {"ids": kwargs.pop("id")})
                return self.es.mget(**kwargs)
            else:
                return self.es.get(**kwargs)

        ret = None
        try:
            ret = self.es.search(**kwargs)
        except Exception as detail:
            self.logger.warning(
                "%s: WARNING: search failed for query: %s \nException detail: %s\n"
                % (datetime.now(), body, detail))
            pass

        return ret

    def create_mapping(self,
                       index,
                       fieldname="",
                       fieldtype="string",
                       fieldindex=None,
                       body=None):
        """
        Creates an Elasticsearch body for a single field given an index name and type name

        """

        index = self._add_prefix(index)
        if not body:
            if fieldtype == "geo_shape":
                body = {
                    "_doc": {
                        "properties": {
                            fieldname: {
                                "type": "geo_shape",
                                "tree": "geohash",
                                "precision": "1m"
                            }
                        }
                    }
                }
            else:
                fn = {"type": fieldtype}
                if fieldindex:
                    fn["index"] = fieldindex
                body = {"_doc": {"properties": {fieldname: fn}}}

        self.es.indices.create(index=index, ignore=400)
        self.es.indices.put_mapping(index=index,
                                    doc_type="_doc",
                                    body=body,
                                    include_type_name=True)
        print("creating index : %s" % (index))

    def create_index(self, **kwargs):
        kwargs = self._add_prefix(**kwargs)
        kwargs["include_type_name"] = True
        self.es.indices.create(ignore=400, **kwargs)
        print("creating index : %s" % kwargs.get("index", ""))

    def index_data(self,
                   index=None,
                   body=None,
                   idfield=None,
                   id=None,
                   **kwargs):
        """
        Indexes a document or list of documents into Elasticsearch

        If "id" is supplied then will use that as the id of the document

        If "idfield" is supplied then will try to find that property in the
            document itself and use the value found for the id of the document

        """

        index = self._add_prefix(index)
        if not isinstance(body, list):
            body = [body]

        for document in body:
            if idfield is not None:
                if isinstance(document, dict):
                    id = document[idfield]
                else:
                    id = getattr(document, idfield)

            try:
                self.es.index(index=index,
                              doc_type="_doc",
                              body=document,
                              id=id)
            except Exception as detail:
                self.logger.warning(
                    "%s: WARNING: failed to index document: %s \nException detail: %s\n"
                    % (datetime.now(), document, detail))
                raise detail

    def bulk_index(self, data, **kwargs):
        return helpers.bulk(self.es, data, **kwargs)

    def create_bulk_item(self,
                         op_type="index",
                         index=None,
                         id=None,
                         data=None):
        return {
            "_op_type": op_type,
            "_index": self._add_prefix(index),
            "_type": "_doc",
            "_id": id,
            "_source": data
        }

    def count(self, **kwargs):
        kwargs = self._add_prefix(**kwargs)
        kwargs["doc_type"] = kwargs.pop("doc_type", "_doc")
        body = kwargs.pop("body", None)

        # need to only pass in the query key as other keys (eg: _source) are not allowed
        if body:
            query = body.pop("query", None)
            if query:
                kwargs["body"] = {"query": query}

        count = self.es.count(**kwargs)
        if count is not None:
            return count["count"]
        else:
            return None

    def BulkIndexer(outer_self, batch_size=500, **kwargs):
        class _BulkIndexer(object):
            def __init__(self, **kwargs):
                self.queue = []
                self.batch_size = kwargs.pop("batch_size", 500)
                self.kwargs = kwargs

            def add(self, op_type="index", index=None, id=None, data=None):
                doc = {
                    "_op_type": op_type,
                    "_index": outer_self._add_prefix(index),
                    "_type": "_doc",
                    "_id": id,
                    "_source": data
                }
                self.queue.append(doc)

                if len(self.queue) >= self.batch_size:
                    outer_self.bulk_index(self.queue, **self.kwargs)
                    del self.queue[:]  # clear out the array

            def close(self):
                outer_self.bulk_index(self.queue, **self.kwargs)

            def __enter__(self, **kwargs):
                return self

            def __exit__(self, type, value, traceback):
                return self.close()

        return _BulkIndexer(batch_size=batch_size, **kwargs)
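
Sketch of the delete-by-query path described in the delete() docstring above; the prefix, index name and term filter are placeholder values:

# Hypothetical delete-by-query call: scans for matching docs, then bulk-deletes them.
se = SearchEngine(hosts=[{"host": "localhost", "port": 9200}], prefix="test")
se.delete(index="resources",
          body={"query": {"term": {"status": "archived"}}},
          refresh=True)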
Example #10
class ElasticsearchClient(object):
    """
    Class ElasticsearchClient represents an Elasticsearch client;
    it implements additional features on top of elasticsearch.Elasticsearch.
    """

    automatic_syn_data_flag = {}
    automatic_thread_name_counter = 0

    def __init__(self):
        self.client = None

    def from_normal(self, hosts=default.ELASTICSEARCH_HOSTS, **kwargs):
        """
        Initialize an Elasticsearch client from the specified hosts list.

        :param hosts: list of nodes we should connect to. Node should be a
            dictionary ({"host": "localhost", "port": 9200}), the entire dictionary
            will be passed to the :class:`~elasticsearch.Connection` class as
            kwargs, or a string in the format of ``host[:port]`` which will be
            translated to a dictionary automatically.  If no value is given the
            :class:`~elasticsearch.Urllib3HttpConnection` class defaults will be used

        :return: void
        """
        self.client = Elasticsearch(hosts=hosts, **kwargs)
        logger.info('Initialize normal Elasticsearch Client: %s.' %
                    self.client)

    def from_sniffing(self,
                      active_nodes,
                      sniff_on_start=True,
                      sniff_on_connection_fail=True,
                      sniffer_timeout=60,
                      **kwargs):
        """
        Initialize an Elasticsearch client that sniffs the cluster on startup to
        inspect it and load balance across all nodes.
        The client can be configured to inspect the cluster state to get
        a list of nodes upon startup, periodically and/or on failure.

        :param active_nodes: the list of active nodes
        :param sniff_on_start: flag indicating whether to obtain a list of nodes
            from the cluster at startup time
        :param sniff_on_connection_fail: flag controlling if connection failure triggers a sniff
        :param sniffer_timeout: number of seconds between automatic sniffs
        :return: void
        """
        self.client = Elasticsearch(
            active_nodes,
            sniff_on_start=sniff_on_start,
            sniff_on_connection_fail=sniff_on_connection_fail,
            sniffer_timeout=sniffer_timeout,
            **kwargs)
        logger.info('Initialize sniffing Elasticsearch Client: %s.' %
                    self.client)

    def from_ssl(self,
                 ca_certs,
                 client_cert,
                 client_key,
                 hosts=default.ELASTICSEARCH_HOSTS,
                 use_ssl=True,
                 verify_certs=True,
                 **kwargs):
        """
        Initialize an Elasticsearch client over SSL.

        :param ca_certs: optional path to CA bundle. See
        https://urllib3.readthedocs.io/en/latest/security.html#using-certifi-with-urllib3
        :param client_cert: path to the file containing the private key and the
        certificate, or cert only if using client_key
        :param client_key: path to the file containing the private key if using
        separate cert and key files (client_cert will contain only the cert)
        :param hosts: hostname of the node
        :param use_ssl: use ssl for the connection if `True`
        :param verify_certs: whether to verify SSL certificates
        :return: void
        """
        self.client = Elasticsearch(hosts=hosts,
                                    use_ssl=use_ssl,
                                    verify_certs=verify_certs,
                                    ca_certs=ca_certs,
                                    client_cert=client_cert,
                                    client_key=client_key,
                                    **kwargs)
        logger.info('Initialize SSL Elasticsearch Client: %s.' % self.client)

    def transfer_data_from_mongo(self,
                                 index,
                                 doc_type,
                                 use_mongo_id=False,
                                 indexed_flag_field_name='',
                                 mongo_query_params={},
                                 mongo_host=default.MONGO_HOST,
                                 mongo_port=default.MONGO_PORT,
                                 mongo_db=default.MONGO_DB,
                                 mongo_collection=default.MONGO_COLLECTION):
        """
        Transfer data from MongoDB into Elasticsearch; the MongoDB hostname, port, database and
        collection name default to the values loaded from default.py

        :param index: The name of the index
        :param doc_type: The type of the document
        :param use_mongo_id: if True, reuse the MongoDB id in Elasticsearch; otherwise let Elasticsearch generate one
        :param indexed_flag_field_name: the name of a field on the document;
                    documents whose value for this field is False will be synchronized
        :param mongo_client_params: The dictionary for client params of MongoDB
        :param mongo_query_params: The dictionary for query params of MongoDB
        :param mongo_host: The name of the hostname from MongoDB
        :param mongo_port: The number of the port from MongoDB
        :param mongo_db: The name of the database from MongoDB
        :param mongo_collection: The name of the collection from MongoDB
        :return: void
        """
        mongo_client = MongoClient(host=mongo_host, port=int(mongo_port))
        try:
            collection = mongo_client[mongo_db][mongo_collection]
            if indexed_flag_field_name != '':
                mongo_query_params.update({indexed_flag_field_name: False})
            mongo_docs = collection.find(mongo_query_params)
        finally:
            mongo_client.close()
        # Joint actions of Elasticsearch for execute bulk api
        actions = []
        id_array = []
        for doc in mongo_docs:
            action = {'_op_type': 'index', '_index': index, '_type': doc_type}
            id_array.append(doc['_id'])
            if not use_mongo_id:
                doc.pop('_id')
            else:
                doc['id'] = str(doc['_id'])
                doc.pop('_id')
            action['_source'] = doc
            actions.append(action)
        success, failed = es_helpers.bulk(self.client,
                                          actions,
                                          request_timeout=60 * 60)
        logger.info(
            'Transfer data from MongoDB(%s:%s) into the Elasticsearch(%s) success: %s, failed: %s'
            % (mongo_host, mongo_port, self.client, success, failed))

        # Back update flag
        if indexed_flag_field_name != '':
            t = threading.Thread(target=ElasticsearchClient._back_update_mongo,
                                 args=(self, mongo_host, mongo_port, mongo_db,
                                       mongo_collection, id_array, {
                                           indexed_flag_field_name: True
                                       }),
                                 name='mongodb_back_update')
            t.start()
        return success, failed

    def _back_update_mongo(self, mongo_host, mongo_port, mongo_db,
                           mongo_collection, id_array, update):
        client = MongoClient(host=mongo_host, port=mongo_port)
        try:
            collection = client[mongo_db][mongo_collection]
            for id in id_array:
                collection.update({'_id': id}, {'$set': update})
        finally:
            client.close()

    def create(self, index, doc_type, id, body, params={}, **kwargs):
        result = self.client.create(index,
                                    doc_type,
                                    id,
                                    body,
                                    params=params,
                                    **kwargs)
        logger.info(
            'Create[index: %s, doc type: %s, id: %s] is done body: \n %s' %
            (index, doc_type, id, body))
        logger.debug(
            '<Verbose message> operation: %s, version: %s shards: %s' %
            (result['result'], result['_version'], result['_shards']))
        return result

    def index(self, index, doc_type, body, id=None, params={}, **kwargs):
        result = self.client.index(index,
                                   doc_type,
                                   body,
                                   id,
                                   params=params,
                                   **kwargs)
        if id is None:
            id_message = 'Automatic Generation'
        else:
            id_message = id
        logger.info(
            'Index[index: %s, doc type: %s, id: %s] is done body: \n %s' %
            (index, doc_type, id_message, body))
        logger.debug('<Verbose message> operation: %s version: %s shards: %s' %
                     (result['result'], result['_version'], result['_shards']))
        return result

    def delete(self, index, doc_type, id, params={}, **kwargs):
        result = self.client.delete(index,
                                    doc_type,
                                    id,
                                    params=params,
                                    **kwargs)
        logger.info('Delete[index: %s, doc type: %s, id: %s] is done' %
                    (index, doc_type, id))
        logger.debug('<Verbose message> operation: %s version: %s shards: %s' %
                     (result['result'], result['_version'], result['_shards']))
        return result

    def search(self,
               index=None,
               doc_type=None,
               body=None,
               params={},
               **kwargs):
        result = self.client.search(index,
                                    doc_type,
                                    body,
                                    params=params,
                                    **kwargs)
        if index is None and doc_type is None and body is None:
            logger.info('Search[all mode] is done')
            return result
        logger.info('Search[index: %s, doc type: %s] is done body: \n %s' %
                    (index, doc_type, body))
        logger.debug(
            '<Verbose message> took: %s shards: %s hits: %s' %
            (result['took'], result['_shards'], result['hits']['total']))
        return result

    def count(self, index=None, doc_type=None, body=None, params={}, **kwargs):
        result = self.client.count(index,
                                   doc_type,
                                   body,
                                   params=params,
                                   **kwargs)
        if index is None and doc_type is None and body is None:
            logger.info('Count[all mode] is done')
            return result
        logger.info('Count[index: %s, doc type: %s] is done body: \n %s' %
                    (index, doc_type, body))
        logger.debug('<Verbose message> count: %s shards: %s' %
                     (result['count'], result['_shards']))
        return result

    def update(self, index, doc_type, id, body=None, params={}, **kwargs):
        result = self.client.update(index,
                                    doc_type,
                                    id,
                                    body,
                                    params=params,
                                    **kwargs)
        logger.info(
            'Update[index: %s, doc type: %s, id: %s] is done body: \n %s' %
            (index, doc_type, id, body))
        logger.debug('<Verbose message> operation: %s version: %s shards: %s' %
                     (result['result'], result['_version'], result['_shards']))
        return result

    def bulk(self, actions, stats_only=False, **kwargs):
        """
        Executes bulk api by elasticsearch.helpers.bulk.

        :param actions: iterator containing the actions
        :param stats_only: if `True`, only report the number of successful/failed
        operations instead of the number of successful operations and a list of error responses
        Any additional keyword arguments will be passed to
        :func:`~elasticsearch.helpers.streaming_bulk` which is used to execute
        the operation, see :func:`~elasticsearch.helpers.streaming_bulk` for more
        accepted parameters.
        """
        success, failed = es_helpers.bulk(self.client, actions, stats_only,
                                          **kwargs)
        logger.info('Bulk is done success %s failed %s actions: \n %s' %
                    (success, failed, actions))

    def mget(self, body, index=None, doc_type=None, params={}, **kwargs):
        result = self.client.mget(body,
                                  index,
                                  doc_type,
                                  params=params,
                                  **kwargs)
        logger.info('Mget[index: %s, doc type: %s] is done body: \n %s' %
                    (index, doc_type, body))
        return result

    def get_client(self):
        if self.client is None:
            logger.warning('Elasticsearch Client is None')
        return self.client

    # TODO: Use more effective solution
    def automatic_syn_data_from_mongo(
            self,
            index,
            doc_type,
            indexed_flag_field_name,
            thread_name='automatic_syn_data_thread',
            interval=60,
            use_mongo_id=False,
            mongo_query_params={},
            mongo_host=default.MONGO_HOST,
            mongo_port=default.MONGO_PORT,
            mongo_db=default.MONGO_DB,
            mongo_collection=default.MONGO_COLLECTION):
        """
        Automatically synchronize data from MongoDB into Elasticsearch via a scheduled task;
        a document is synchronized when its indexed_flag_field_name field is False.
        Note that this function may not be robust, so use it with caution.

        :param indexed_flag_field_name: the name of the field of the document,
                    if associated value is False will synchronize data for it
        :param thread_name: the name of the schedule task thread
        :param interval: the time that executes interval of the scheduled task every time (unit second)
        :return: the thread id, you can use this id to cancel associated task
        """
        thread_id = self._generate_thread_id(thread_name)
        if thread_id in ElasticsearchClient.automatic_syn_data_flag:
            lock.acquire()
            try:
                thread_name = thread_name + '-%s' % ElasticsearchClient.automatic_thread_name_counter
                ElasticsearchClient.automatic_thread_name_counter += 1
                thread_id = self._generate_thread_id(thread_name)
            finally:
                lock.release()
        ElasticsearchClient.automatic_syn_data_flag[thread_id] = True

        t = threading.Thread(
            target=ElasticsearchClient._automatic_syn_data_from_mongo_worker,
            args=(self, thread_id, index, doc_type, indexed_flag_field_name,
                  interval, use_mongo_id, mongo_query_params, mongo_host,
                  mongo_port, mongo_db, mongo_collection),
            name=thread_name)

        t.start()
        return thread_id

    def _generate_thread_id(self, thread_name):
        return str(hash(thread_name))

    def stop_automatic_syn_data(self, thread_id):
        lock.acquire()
        try:
            ElasticsearchClient.automatic_syn_data_flag[thread_id] = False
        finally:
            lock.release()

    def _automatic_syn_data_from_mongo_worker(
            self,
            thread_id,
            index,
            doc_type,
            indexed_flag_field_name,
            interval=60,
            use_mongo_id=False,
            mongo_query_params={},
            mongo_host=default.MONGO_HOST,
            mongo_port=default.MONGO_PORT,
            mongo_db=default.MONGO_DB,
            mongo_collection=default.MONGO_COLLECTION):
        current_thread__name = threading.current_thread().name
        while ElasticsearchClient.automatic_syn_data_flag[thread_id]:
            logger.info(
                '[%s]: synchronize data work start %s:%s -----> %s' %
                (current_thread__name, mongo_host, mongo_port, self.client))
            success, failed = self.transfer_data_from_mongo(
                index=index,
                doc_type=doc_type,
                use_mongo_id=use_mongo_id,
                indexed_flag_field_name=indexed_flag_field_name,
                mongo_query_params=mongo_query_params,
                mongo_host=mongo_host,
                mongo_port=mongo_port,
                mongo_db=mongo_db,
                mongo_collection=mongo_collection)
            logger.info(
                '[%s]: synchronize data work done %s:%s -----> %s [success=%s, failed=%s]'
                % (current_thread__name, mongo_host, mongo_port, self.client,
                   success, failed))
            time.sleep(interval)
        logger.info('[%s]: synchronize data work is shutdown ' %
                    current_thread__name)

    def open_index(self, index, params={}, **kwargs):
        result = self.client.indices.open(index, params=params, **kwargs)
        logger.info('Index %s is opened' % index)
        return result

    def close_index(self, index, params={}, **kwargs):
        result = self.client.indices.close(index, params=params, **kwargs)
        logger.info('Index %s is closed' % index)
        return result

    def indices_stats_info(self, index=None, metric=None, params={}, **kwargs):
        result = self.client.indices.stats(index=index,
                                           metric=metric,
                                           params=params,
                                           **kwargs)
        logger.info('Acquire indices status information is done')
        return result

    def get_simple_info_for_index(self, index=None, params={}, **kwargs):
        """
        Return a list of simple info for the specified index (default: all); each element is a dictionary
        such as
        {
            'health' : 'green', 'status' : 'open',
            'index' : 'xxxx', 'uuid' : 'xxxx',
            'pri' : 1, 'rep' : 1,
            'docs_count' : 4, 'docs_deleted' : 0,
            'store_size' : 10kb, 'pri_store_size' : 10kb
        }
        """
        raw = self.client.cat.indices(index, params=params,
                                      **kwargs).split('\n')
        list = []
        for r in raw:
            alter = r.split(' ')
            if len(alter) < 10: continue
            dict = {
                'health': alter[0],
                'status': alter[1],
                'index': alter[2],
            }
            if len(alter) == 11:
                # The split may produce an empty field (alter[3] is an empty string)
                dict['uuid'] = alter[4]
                i = 5
            else:
                dict['uuid'] = alter[3]
                i = 4
            dict['pri'] = alter[i]
            i += 1
            dict['rep'] = alter[i]
            i += 1
            dict['docs_count'] = alter[i]
            i += 1
            dict['docs_deleted'] = alter[i]
            i += 1
            dict['store_size'] = alter[i]
            i += 1
            dict['pri_store_size'] = alter[i]
            list.append(dict)
        logger.info(
            'Acquire simple information of the index is done succeeded: %s' %
            len(list))
        return list

    def cluster_health(self, index=None, params={}, **kwargs):
        result = self.client.cluster.health(index, params=params, **kwargs)
        message = 'Acquire cluster health information is done index: %s'
        if index is None:
            message = message % 'all'
        else:
            message = message % index
        logger.info(message)
        return result

    def cluster_health_for_indices(self, index=None, params={}, **kwargs):
        """
        Return a list with the cluster health of the specified indices (default: all);
        the first element is a dictionary with global information about the cluster,
        such as "cluster_name", "number_of_nodes"...
        the second element is a list of per-index information, where each element is a dictionary for one index,
        such as [{'index' : 'a', 'status' : 'yellow', ...} , {'index' : 'b', 'status' : 'yellow', ...}, ....]
        """
        params['level'] = 'indices'
        result = self.cluster_health(index, params, **kwargs)
        return self._process_cluster_health_info(result)

    def cluster_health_for_shards(self, index=None, params={}, **kwargs):
        """
        Return a list with the cluster health of the specified indices (default: all),
        including the shard information of each index;
        the first element is a dictionary with global information about the cluster,
        the second element describes the indices and their shards, where each element is a dictionary,
        such as [{'index' : 'a', 'status' : 'yellow', ..., 'shards' : {'0' : {...}, '1' : {...}, ...}, ...]
        """
        params['level'] = 'shards'
        result = self.cluster_health(index, params, **kwargs)
        return self._process_cluster_health_info(result)

    def cluster_status_info(self, node_id=None, params={}, **kwargs):
        result = self.client.cluster.stats(node_id=node_id,
                                           params=params,
                                           **kwargs)
        logger.info('Acquire cluster status information is done')
        return result

    def _process_cluster_health_info(self, info):
        list = []
        first = {}
        second = []
        for k, v in info.items():
            if k == 'indices':
                for k2, v2 in v.items():
                    index = {}
                    index['index'] = k2
                    index.update(v2)
                    second.append(index)
            else:
                first[k] = v
        list.append(first)
        list.append(second)
        return list

    def nodes_status_info(self,
                          node_id=None,
                          metric=None,
                          index_metric=None,
                          params={},
                          **kwargs):
        result = self.client.nodes.stats(node_id=node_id,
                                         metric=metric,
                                         index_metric=index_metric,
                                         params=params,
                                         **kwargs)
        logger.info('Acquire nodes status information is done')
        return result

    def nodes_info(self, node_id=None, metric=None, params={}, **kwargs):
        result = self.client.nodes.info(node_id=node_id,
                                        metric=metric,
                                        params=params,
                                        **kwargs)
        logger.info('Acquire nodes info is done')
        return result

    def nodes_simple_info(self, params={}, **kwargs):
        """
        Return a list of dictionaries with simple node info, where each key is a column name,
        such as [{"http_address": "192.111.111.111", "name" : "test", ...}, ...]
        """
        h = [
            'name', 'pid', 'http_address', 'version', 'jdk', 'disk.total',
            'disk.used_percent', 'heap.current', 'heap.percent', 'ram.current',
            'ram.percent', 'uptime', 'node.role'
        ]
        result = self.client.cat.nodes(v=True, h=h, **kwargs, params=params)
        result = [x.strip().split(' ') for x in result.split('\n')]
        # Clean up the space
        result.remove(result[-1])
        for i in range(len(result)):
            result[i] = list(filter(lambda x: x != '', result[i]))
        # Packing into the dictionary
        dicts = []
        for i in range(len(result) - 1):
            dict = {}
            for k, v in zip(result[0], result[i + 1]):
                dict[k] = v
            dicts.append(dict)

        logger.info(
            'Acquire simple information of the nodes is done succeeded: %s' %
            len(dicts))
        return dicts
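
The helpers above are thin wrappers over the stock cluster and cat APIs. A minimal stand-alone sketch of the same calls, assuming a cluster is reachable on localhost (host and column names are placeholders):

from elasticsearch import Elasticsearch

es = Elasticsearch([{"host": "localhost", "port": 9200}])
# Per-index health, the raw form that _process_cluster_health_info() splits
# into a cluster-level dict plus a per-index list.
health = es.cluster.health(level="indices")
for index_name, index_health in health.get("indices", {}).items():
    print(index_name, index_health["status"])
# Tabular node overview, the raw form that nodes_simple_info() parses.
print(es.cat.nodes(v=True, h="name,heap.percent,node.role"))
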
Example No. 11
class ElasticsearchConnector:
    def __init__(self):
        self.es = Elasticsearch()

    def execute_search(self, index, body):
        try:
            response = self.es.search(index=index, body=body)
            return response['hits']['hits']
        except exceptions.RequestError:
            # print("Request error")
            # print(body)
            return []

    def execute_multiget(self, index, body):
        try:
            response = self.es.mget(index=index, body=body)
            return response['docs']
        except exceptions.RequestError:
            # print("Request error")
            # print(body)
            return []

    def execute_aggregation(self, index, body, aggregation):
        response = self.es.search(index=index, body=body)
        return response['aggregations'][aggregation]

    def execute_search_with_scroll(self, index, body):
        response = self.es.search(index=index, scroll='2m', body=body)
        return response['_scroll_id'], response['hits']['total'], response

    def scroll(self, sid, scroll):
        return self.es.scroll(scroll_id=sid, scroll=scroll)
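
    # Usage sketch for the scroll pair above (the index name and page size are
    # placeholders; pages are pulled until a scroll returns no hits):
    #
    #   connector = ElasticsearchConnector()
    #   sid, total, page = connector.execute_search_with_scroll(
    #       "articles", {"query": {"match_all": {}}, "size": 100})
    #   while page["hits"]["hits"]:
    #       handle(page["hits"]["hits"])   # hypothetical per-page callback
    #       page = connector.scroll(sid, "2m")
    #       sid = page.get("_scroll_id", sid)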

    # add document to the specified elastic index
    def add_document(self, index, doc_type, body):
        try:
            self.es.index(index=index, doc_type=doc_type, body=body)
        except exceptions.RequestError as e:
            print(e)
            print(body)

    # add multiple documents at once
    def add_bulk(self, index, bodies):
        actions = []
        for body in bodies:
            dump = json.dumps(body)
            if 'id' in body:
                actions.append({
                    "_id": body['id'],
                    "_index": index,
                    "_source": dump
                })
            else:
                actions.append({"_index": index, "_source": body})

        helpers.bulk(self.es, actions)

    def update_bulk(self, index, bodies):
        actions = [{
            "_id": body['id'],
            "_index": index,
            "_type": '_doc',
            "_source": {
                'doc': body
            },
            '_op_type': 'update'
        } for body in bodies]

        helpers.bulk(self.es, actions)

    # update a small part of the given document
    def update_document(self, index, docid, body):
        try:
            self.es.update(index=index, id=docid, body=body)
        except (exceptions.RequestError, exceptions.TransportError) as e:
            print(e)
            print(body)

    # retrieve the term vector for a given document
    def get_term_vector(self, index, docid):
        return self.es.termvectors(index=index,
                                   id=docid,
                                   positions=True,
                                   term_statistics=True)

    def clear_index(self, index):
        self.es.indices.delete(index=index, ignore=[400, 404])

    def clear_all(self):
        self.clear_index('aggregate_articles')
        self.clear_index('users')
        self.clear_index('recommendations')
        self.clear_index('occupation')
        self.clear_index('personalization')

    def delete(self, index, docid):
        self.es.delete(index, docid)
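
A short end-to-end sketch of the connector above, assuming a local cluster and a throwaway index ("demo_articles" is a placeholder name):

connector = ElasticsearchConnector()
connector.add_bulk("demo_articles", [{"id": 1, "title": "first"},
                                     {"id": 2, "title": "second"}])
connector.es.indices.refresh(index="demo_articles")   # make the docs searchable
docs = connector.execute_multiget("demo_articles", {"ids": [1, 2]})
print([d.get("found") for d in docs])
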
Example No. 12
class ES(object):
    def __init__(self, es_url, http_auth=None):
        self.es_url = es_url
        self.es = Elasticsearch([es_url], show_ssl_warnings=False, http_auth=http_auth,retry_on_timeout=True)

    def load_data(self, index, doc_type, doc, doc_id):
        # import certifi
        #
        # es = Elasticsearch(
        #     ['localhost', 'otherhost'],
        #     http_auth=('user', 'secret'),
        #     port=443,
        #     use_ssl=True
        # )
        try:
            return self.es.index(index=index, doc_type=doc_type, body=doc, id=doc_id)
        except Exception:
            # retry the index call once, then give up
            try:
                return self.es.index(index=index, doc_type=doc_type, body=doc, id=doc_id)
            except Exception as e:
                print e
                return None

    def create_index(self, index_name, es_mapping):
        command = self.es_url + "/" + index_name
        return requests.put(command, data=es_mapping, verify=False)

    def create_alias(self, alias_name, indices):
        url = self.es_url + "/_aliases"
        command = {"actions": [
            {"remove": {"index": "*", "alias": alias_name}},
            {"add": {"indices": indices, "alias": alias_name}}
        ]}
        return requests.post(url, data=json.dumps(command))

    def load_bulk(self, index, doc_type, doc_id, docs):
        actions = [
            {
                "_index": index,
                "_type": doc_type,
                "_id": doc[doc_id],
                # the document dict itself is the bulk source
                "_source": doc,
            }
            for doc in docs
        ]

        helpers.bulk(self.es, actions)

    def retrieve_doc(self, index, doc_type, ids):
        if not isinstance(ids, list):
            ids = [ids]
        query = "{\"query\": {\"ids\": {\"values\":" + json.dumps(ids) + "}}}"
        print query
        try:
            return self.es.search(index=index, doc_type=doc_type, body=query, filter_path=['hits.hits._source'])
        except:
            # try once more
            try:
                return self.es.search(index=index, doc_type=doc_type, body=query, filter_path=['hits.hits._source'])
            except Exception as e:
                print e
                return None

    def search(self, index, doc_type, query, ignore_no_index=False, **other_params):
        # print query
        try:
            return self.es.search(index=index, doc_type=doc_type, body=query, **other_params)
        except TransportError as e:
            # Only swallow missing-index errors when the caller asked to ignore them.
            if not (ignore_no_index and e.error == 'index_not_found_exception'):
                print e
        except Exception as e:
            print e

    def es_search(self, index, doc_type, query, scroll, ignore_no_index=False, **other_params):
        # print query
        if not scroll:
            try:
                return self.es.search(index=index, doc_type=doc_type, body=query, **other_params)
            except Exception as e:
                return e
        else:
            #Initiating scroll
            try:
                total_docs = query['size']
                query['size'] = 0
                query['from'] = 0
                data = self.es.search(index=index, doc_type=doc_type, body=query,scroll='1m',size=1000, **other_params)
                docs = []
                docs = data['hits']['hits']
                docs_count = len(data['hits']['hits'])
                sid = data['_scroll_id']
                scroll_size = len(data['hits']['hits'])
                # Keep scrolling while pages still return hits and fewer than
                # the requested number of documents have been collected.
                while scroll_size > 0 and docs_count < total_docs:
                    new_data = self.es.scroll(sid, scroll='1m')
                    sid = new_data['_scroll_id']
                    for doc in new_data['hits']['hits']:
                        docs.append(doc)
                    scroll_size = len(new_data['hits']['hits'])
                    docs_count = docs_count + scroll_size

                data['hits']['hits'] = docs[:docs_count]
                data['hits']['total'] = docs_count
                print "scroll complete with " + str(docs_count)
                return data
            except Exception as e:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
                lines = ''.join(lines)
                print lines
                return e
            


    def mget(self,index,doc_type,body):
        try:
            return self.es.mget(index=index,doc_type=doc_type,body=body)
        except TransportError as e:
            if e.error != 'index_not_found_exception':
                print e
        except Exception as e:
            print e
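
A minimal sketch of this wrapper (Python 2, like the class itself); the URL, index name, and doc_type are placeholders:

es_wrapper = ES("http://localhost:9200")
es_wrapper.load_data("demo_index", "doc", {"title": "hello"}, "1")
result = es_wrapper.retrieve_doc("demo_index", "doc", ["1"])
if result:
    print result
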
Example No. 13
class ElasticStore(object):
    '''A feature collection store on ElasticSearch.

    Feature collections are maps from feature names to features.
    The representation of each feature is unspecified by this
    interface.

    This class exposes a similar interface to the regular ``Store``
    class, with a few additions:

      1. Canopy scans are implemented natively with ElasticSearch,
         so they are provided as methods here.
      2. On all retrieval methods, the caller can pass a list of
         feature names (or feature name wildcards) to retrieve.
         If your FCs have lots of features, this is useful when
         you only need to retrieve a small fraction of them.

    .. automethod:: __init__
    .. automethod:: configured

    **CRUD operations**

    .. automethod:: get
    .. automethod:: get_many
    .. automethod:: put
    .. automethod:: delete
    .. automethod:: delete_all
    .. automethod:: delete_index

    **Keyword scanning**

    .. automethod:: keyword_scan
    .. automethod:: keyword_scan_ids

    **Scanning ids in lexicographic order**

    Note that these operations may be inefficient because of
    how ElasticSearch handles sorting.

    .. automethod:: scan
    .. automethod:: scan_ids
    .. automethod:: scan_prefix
    .. automethod:: scan_prefix_ids

    **Low-level**

    .. automethod:: sync
    .. automethod:: index_scan_ids
    .. automethod:: index_names
    '''
    config_name = 'dossier.store'

    @classmethod
    def configured(cls):
        '''Create a new instance from the global configuration.

        In order to use this, you must make sure that
        :class:`ElasticStore` has been configured by :mod:`yakonfig`,
        usually by passing the class to ``yakonfig.parse_args``.
        '''
        return cls(**yakonfig.get_global_config('dossier.store'))

    def __init__(self, hosts=None, namespace=None, type='fc',
                 feature_indexes=None, shards=10, replicas=0,
                 fulltext_indexes=None):
        '''Create a new store or connect to an existing one.

        :param hosts:
          Passed directly to ``elasticsearch.Elasticsearch``
          constructor. Required.
        :param str namespace:
          Used as the ES index name, prefixed by ``fcs_``. Required.
        :param str type:
          The ES type to use. If this is set to ``None``, then a random
          unique string is used.
        :param [str] feature_indexes:
          A list of names of features to index.
        :param [str] fulltext_indexes:
          A list of names of features to index for fulltext search.
        :param int shards:
          The number of shards to use for this index. This only has an
          effect if the ES index didn't previously exist.
        :param int replicas:
          The number of replicas to use for this index. This only has
          an effect if the ES index didn't previously exist.
        :rtype: :class:`ElasticStore`
        '''
        if hosts is None:
            raise yakonfig.ProgrammerError(
                'ElasticStore needs at least one host specified.')
        if namespace is None:
            raise yakonfig.ProgrammerError(
                'ElasticStore needs a namespace defined.')
        if type is None:
            type = unicode(uuid.uuid4())
        self.conn = Elasticsearch(hosts=hosts, timeout=60, request_timeout=60)
        self.index = 'fcs_%s' % namespace
        self.type = type
        self.shards = shards
        self.replicas = replicas
        self.indexes = OrderedDict()
        self.fulltext_indexes = OrderedDict()
        self.indexed_features = set()
        self.fulltext_indexed_features = set()

        self._normalize_feature_indexes(feature_indexes)
        self._normalize_fulltext_feature_indexes(fulltext_indexes)
        if not self.conn.indices.exists(index=self.index):
            # This can race, but that should be OK.
            # Worst case, we initialize with the same settings more than
            # once.
            self._create_index()
        mapping = self.conn.indices.get_mapping(
            index=self.index, doc_type=self.type)
        if len(mapping) == 0:
            self._create_mappings()

    def get(self, content_id, feature_names=None):
        '''Retrieve a feature collection.

        If a feature collection with the given id does not
        exist, then ``None`` is returned.

        :param str content_id: Content identifier.
        :param [str] feature_names:
          A list of feature names to retrieve. When ``None``, all
          features are retrieved. Wildcards are allowed.
        :rtype: :class:`dossier.fc.FeatureCollection` or ``None``
        '''
        try:
            resp = self.conn.get(index=self.index, doc_type=self.type,
                                 id=eid(content_id),
                                 _source=self._source(feature_names))
            return self.fc_from_dict(resp['_source']['fc'])
        except NotFoundError:
            return None

    def get_many(self, content_ids, feature_names=None):
        '''Returns an iterable of feature collections.

        This efficiently retrieves multiple FCs corresponding to the
        list of ids given. Tuples of identifier and feature collection
        are yielded. If the feature collection for a given id does not
        exist, then ``None`` is returned as the second element of the
        tuple.

        :param [str] content_ids: List of content ids.
        :param [str] feature_names:
          A list of feature names to retrieve. When ``None``, all
          features are retrieved. Wildcards are allowed.
        :rtype: Iterable of ``(content_id, FC)``
        '''
        try:
            resp = self.conn.mget(index=self.index, doc_type=self.type,
                                  _source=self._source(feature_names),
                                  body={'ids': map(eid, content_ids)})
        except TransportError:
            return
        for doc in resp['docs']:
            fc = None
            if doc['found']:
                fc = self.fc_from_dict(doc['_source']['fc'])
            yield did(doc['_id']), fc
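
    # Usage sketch for get()/get_many() (identifiers and feature names are
    # illustrative):
    #
    #   fc = store.get('doc-1', feature_names=['NAME', 'phone*'])
    #   for cid, fc in store.get_many(['doc-1', 'doc-2']):
    #       if fc is None:
    #           continue   # no FC stored under this id
    #       ...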

    def put(self, items, indexes=True):
        '''Adds feature collections to the store.

        This efficiently adds multiple FCs to the store. The iterable
        of ``items`` given should yield tuples of ``(content_id, FC)``.

        :param items: Iterable of ``(content_id, FC)``.
        :param bool indexes:
          When ``True`` (the default), keyword and fulltext index fields
          are written alongside the serialized FC; when ``False``, only
          the FC itself is stored.
        '''
        actions = []
        for cid, fc in items:
            # TODO: If we store features in a columnar order, then we
            # could tell ES to index the feature values directly. ---AG
            # (But is problematic because we want to preserve the ability
            # to selectively index FCs. So we'd probably need two distinct
            # doc types.)
            idxs = defaultdict(list)
            if indexes:
                for fname in self.indexed_features:
                    if fname in fc:
                        idxs[fname_to_idx_name(fname)].extend(fc[fname])
                for fname in self.fulltext_indexed_features:
                    if fname not in fc:
                        continue
                    if isinstance(fc[fname], basestring):
                        idxs[fname_to_full_idx_name(fname)] = fc[fname]
                    else:
                        idxs[fname_to_full_idx_name(fname)].extend(fc[fname])
            actions.append({
                '_index': self.index,
                '_type': self.type,
                '_id': eid(cid),
                '_op_type': 'index',
                '_source': dict(idxs, **{
                    'fc': self.fc_to_dict(fc),
                }),
            })
        bulk(self.conn, actions, timeout=60, request_timeout=60)
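
    # Usage sketch for put() (fc1/fc2 are assumed to be
    # dossier.fc.FeatureCollection instances):
    #
    #   store.put([('doc-1', fc1), ('doc-2', fc2)])
    #   store.sync()   # only needed if the FCs must be searchable immediately
    #
    # With indexes=False only the serialized 'fc' payload is written and the
    # keyword/fulltext index fields are skipped.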

    def delete(self, content_id):
        '''Deletes the corresponding feature collection.

        If the FC does not exist, then this is a no-op.
        '''
        try:
            self.conn.delete(index=self.index, doc_type=self.type,
                             id=eid(content_id))
        except NotFoundError:
            pass

    def delete_all(self):
        '''Deletes all feature collections.

        This does not destroy the ES index, but instead only
        deletes all FCs with the configured document type
        (defaults to ``fc``).
        '''
        try:
            self.conn.indices.delete_mapping(
                index=self.index, doc_type=self.type)
        except TransportError:
            logger.warn('type %r in index %r already deleted',
                        self.type, self.index, exc_info=True)

    def delete_index(self):
        '''Deletes the underlying ES index.

        Only use this if you know what you're doing. This destroys
        the entire underlying ES index, which could be shared by
        multiple distinct ElasticStore instances.
        '''
        if self.conn.indices.exists(index=self.index):
            self.conn.indices.delete(index=self.index)

    def sync(self):
        '''Tells ES to tell Lucene to do an fsync.

        This guarantees that any previous calls to ``put`` will be
        flushed to disk and available in subsequent searches.

        Generally, this should only be used in test code.
        '''
        self.conn.indices.refresh(index=self.index)

    def scan(self, *key_ranges, **kwargs):
        '''Scan for FCs in the given id ranges.

        :param key_ranges:
          ``key_ranges`` should be a list of pairs of ranges. The first
          value is the lower bound id and the second value is the
          upper bound id. Use ``()`` in either position to leave it
          unbounded. If no ``key_ranges`` are given, then all FCs in
          the store are returned.
        :param [str] feature_names:
          A list of feature names to retrieve. When ``None``, all
          features are retrieved. Wildcards are allowed.
        :rtype: Iterable of ``(content_id, FC)``
        '''
        for hit in self._scan(*key_ranges, **kwargs):
            yield did(hit['_id']), self.fc_from_dict(hit['_source']['fc'])

    def scan_ids(self, *key_ranges, **kwargs):
        '''Scan for ids only in the given id ranges.

        :param key_ranges:
          ``key_ranges`` should be a list of pairs of ranges. The first
          value is the lower bound id and the second value is the
          upper bound id. Use ``()`` in either position to leave it
          unbounded. If no ``key_ranges`` are given, then all FCs in
          the store are returned.
        :param [str] feature_names:
          A list of feature names to retrieve. When ``None``, all
          features are retrieved. Wildcards are allowed.
        :rtype: Iterable of ``content_id``
        '''
        kwargs['feature_names'] = False
        for hit in self._scan(*key_ranges, **kwargs):
            yield did(hit['_id'])

    def scan_prefix(self, prefix, feature_names=None):
        '''Scan for FCs with a given prefix.

        :param str prefix: Identifier prefix.
        :param [str] feature_names:
          A list of feature names to retrieve. When ``None``, all
          features are retrieved. Wildcards are allowed.
        :rtype: Iterable of ``(content_id, FC)``
        '''
        resp = self._scan_prefix(prefix, feature_names=feature_names)
        for hit in resp:
            yield did(hit['_id']), self.fc_from_dict(hit['_source']['fc'])

    def scan_prefix_ids(self, prefix):
        '''Scan for ids with a given prefix.

        :param str prefix: Identifier prefix.
        :rtype: Iterable of ``content_id``
        '''
        resp = self._scan_prefix(prefix, feature_names=False)
        for hit in resp:
            yield did(hit['_id'])

    def fulltext_scan(self, query_id=None, query_fc=None, feature_names=None,
                      preserve_order=True):
        '''Fulltext search.

        Yields an iterable of triples (score, identifier, FC)
        corresponding to the results of a fulltext search built from
        the query feature collection. Only text indexed under the
        configured fulltext-indexed features is searched.

        Note that, unless ``preserve_order`` is set to True, the
        ``score`` will always be 0.0, and the results will be
        unordered. ``preserve_order`` set to True will cause the
        results to be scored and be ordered by score, but you should
        expect to see a decrease in performance.

        :param str query_id: Optional query id.
        :param query_fc: Optional query feature collection.
        :type query_fc: :class:`dossier.fc.FeatureCollection`
        :param [str] feature_names:
          A list of feature names to retrieve. When ``None``, all
          features are retrieved. Wildcards are allowed.
        :rtype: Iterable of ``(score, content_id, FC)``
        '''
        it = self._fulltext_scan(query_id, query_fc,
                                 feature_names=feature_names,
                                 preserve_order=preserve_order)
        for hit in it:
            fc = self.fc_from_dict(hit['_source']['fc'])
            yield hit['_score'], did(hit['_id']), fc

    def fulltext_scan_ids(self, query_id=None, query_fc=None,
                          preserve_order=True):
        '''Fulltext search for identifiers.

        Yields an iterable of pairs (score, identifier)
        corresponding to the results of a fulltext search built from
        the query feature collection. Only text indexed under the
        configured fulltext-indexed features is searched.

        Note that, unless ``preserve_order`` is set to True, the
        ``score`` will always be 0.0, and the results will be
        unordered. ``preserve_order`` set to True will cause the
        results to be scored and be ordered by score, but you should
        expect to see a decrease in performance.

        :param str query_id: Optional query id.
        :param query_fc: Optional query feature collection.
        :type query_fc: :class:`dossier.fc.FeatureCollection`
        :rtype: Iterable of ``(score, content_id)``
        '''
        it = self._fulltext_scan(query_id, query_fc, feature_names=False,
                                 preserve_order=preserve_order)
        for hit in it:
            yield hit['_score'], did(hit['_id'])

    def keyword_scan(self, query_id=None, query_fc=None, feature_names=None):
        '''Keyword scan for feature collections.

        This performs a keyword scan using the query given. A keyword
        scan searches for FCs with terms in each of the query's indexed
        fields.

        At least one of ``query_id`` or ``query_fc`` must be provided.
        If ``query_fc`` is ``None``, then the query is retrieved
        automatically corresponding to ``query_id``.

        :param str query_id: Optional query id.
        :param query_fc: Optional query feature collection.
        :type query_fc: :class:`dossier.fc.FeatureCollection`
        :param [str] feature_names:
          A list of feature names to retrieve. When ``None``, all
          features are retrieved. Wildcards are allowed.
        :rtype: Iterable of ``(content_id, FC)``
        '''
        it = self._keyword_scan(query_id, query_fc,
                                feature_names=feature_names)
        for hit in it:
            fc = self.fc_from_dict(hit['_source']['fc'])
            yield did(hit['_id']), fc

    def keyword_scan_ids(self, query_id=None, query_fc=None):
        '''Keyword scan for ids.

        This performs a keyword scan using the query given. A keyword
        scan searches for FCs with terms in each of the query's indexed
        fields.

        At least one of ``query_id`` or ``query_fc`` must be provided.
        If ``query_fc`` is ``None``, then the query is retrieved
        automatically corresponding to ``query_id``.

        :param str query_id: Optional query id.
        :param query_fc: Optional query feature collection.
        :type query_fc: :class:`dossier.fc.FeatureCollection`
        :rtype: Iterable of ``content_id``
        '''
        it = self._keyword_scan(query_id, query_fc, feature_names=False)
        for hit in it:
            yield did(hit['_id'])

    def index_scan_ids(self, fname, val):
        '''Low-level keyword index scan for ids.

        Retrieves identifiers of FCs that have a feature value
        ``val`` in the feature named ``fname``. Note that
        ``fname`` must be indexed.

        :param str fname: Feature name.
        :param str val: Feature value.
        :rtype: Iterable of ``content_id``
        '''
        disj = []
        for fname2 in self.indexes[fname]['feature_names']:
            disj.append({'term': {fname_to_idx_name(fname2): val}})
        query = {
            'constant_score': {
                'filter': {'or': disj},
            },
        }
        hits = scan(self.conn, index=self.index, doc_type=self.type, query={
            '_source': False,
            'query': query,
        })
        for hit in hits:
            yield did(hit['_id'])

    def index_names(self):
        '''Returns a list of all defined index names.

        Note that this only includes boolean based indexes.

        :rtype: list of ``unicode``
        '''
        return map(unicode, self.indexes.iterkeys())

    def fulltext_index_names(self):
        '''Returns a list of all defined fulltext index names.

        :rtype: list of ``unicode``
        '''
        return map(unicode, self.fulltext_indexes.iterkeys())

    def _fulltext_scan(self, query_id, query_fc, preserve_order=True,
                       feature_names=None):
        query_fc = self.get_query_fc(query_id, query_fc)
        ids = set([] if query_id is None else [eid(query_id)])
        for fname, features in self.fulltext_indexes.iteritems():
            qvals = map(unicode, query_fc.get(fname, {}).keys())
            if len(qvals) == 0:
                continue
            qmatches = []
            qfields = map(fname_to_full_idx_name, features)
            for qval in qvals:
                # Values containing punctuation or other symbols are matched as phrases.
                if re.search(r'[^\w\s]', qval, re.UNICODE):
                    match_type = 'phrase'
                else:
                    match_type = 'best_fields'
                qmatches.append({
                    'multi_match': {
                        'type': match_type,
                        'query': qval,
                        'fields': qfields,
                    }
                })
            query = {
                'filtered': {
                    'query': {
                        'bool': {
                            'should': qmatches,
                        },
                    },
                    'filter': {
                        'not': {
                            'ids': {
                                'values': list(ids),
                            },
                        },
                    },
                },
            }

            logger.info('fulltext scanning index: %s, query: %r', fname, qvals)
            hits = scan(
                self.conn, index=self.index, doc_type=self.type,
                preserve_order=preserve_order,
                query={
                    '_source': self._source(feature_names),
                    'query': query,
                })
            for hit in hits:
                ids.add(eid(hit['_id']))
                yield hit

    def _keyword_scan(self, query_id, query_fc, feature_names=None):
        # Why are we running multiple scans? Why are we deduplicating?
        #
        # It turns out that, in our various systems, it can be important to
        # prioritize the order of results returned in a keyword scan based on
        # the feature index that is being searched. For example, we typically
        # want to start a keyword scan with the results from a search on
        # `NAME`, which we don't want to be mingled with the results from a
        # search on some other feature.
        #
        # The simplest way to guarantee this type of prioritization is to run
        # a query for each index in the order in which they were defined.
        #
        # This has some downsides:
        #
        # 1. We return *all* results for the first index before ever returning
        #    results for the second.
        # 2. Since we're running multiple queries, we could get back results
        #    we've already retrieved in a previous query.
        #
        # We accept (1) for now.
        #
        # To fix (2), we keep track of all ids we've seen and include them
        # as a filter in subsequent queries.
        query_fc = self.get_query_fc(query_id, query_fc)
        ids = set([] if query_id is None else [eid(query_id)])
        for fname in self.indexes:
            term_disj = self._fc_index_disjunction_from_query(query_fc, fname)
            if len(term_disj) == 0:
                continue
            query = {
                'constant_score': {
                    'filter': {
                        'and': [{
                            'not': {
                                'ids': {
                                    'values': list(ids),
                                },
                            },
                        }, {
                            'or': term_disj,
                        }],
                    },
                },
            }

            logger.info('keyword scanning index: %s', fname)
            hits = scan(
                self.conn, index=self.index, doc_type=self.type,
                query={
                    '_source': self._source(feature_names),
                    'query': query,
                })
            for hit in hits:
                ids.add(eid(hit['_id']))
                yield hit

    def _scan(self, *key_ranges, **kwargs):
        feature_names = kwargs.get('feature_names')
        range_filters = self._range_filters(*key_ranges)
        return scan(self.conn, index=self.index, doc_type=self.type,
                    _source=self._source(feature_names),
                    preserve_order=True,
                    query={
                        # Sorting by `_id` seems to fail spuriously and
                        # I have no idea why. ---AG
                        'sort': {'_uid': {'order': 'asc'}},
                        'query': {
                            'constant_score': {
                                'filter': {
                                    'and': range_filters,
                                },
                            },
                        },
                    })

    def _scan_prefix(self, prefix, feature_names=None):
        query = {
            'constant_score': {
                'filter': {
                    'and': [{
                        'prefix': {
                            '_id': eid(prefix),
                        },
                    }],
                },
            },
        }
        return scan(self.conn, index=self.index, doc_type=self.type,
                    _source=self._source(feature_names),
                    preserve_order=True,
                    query={
                        # Sorting by `_id` seems to fail spuriously and
                        # I have no idea why. ---AG
                        'sort': {'_uid': {'order': 'asc'}},
                        'query': query,
                    })

    def _source(self, feature_names):
        '''Maps feature names to ES's "_source" field.'''
        if feature_names is None:
            return True
        elif isinstance(feature_names, bool):
            return feature_names
        else:
            return map(lambda n: 'fc.' + n, feature_names)
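
    # For example, _source(['NAME', 'phone*']) -> ['fc.NAME', 'fc.phone*'],
    # which ES treats as a _source include filter; _source(None) -> True
    # (fetch everything) and _source(False) -> False (ids only).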

    def _range_filters(self, *key_ranges):
        'Creates ES filters for key ranges used in scanning.'
        filters = []
        for s, e in key_ranges:
            if isinstance(s, basestring):
                s = eid(s)
            if isinstance(e, basestring):
                # Make the range inclusive.
                # We need a valid codepoint, so use the max.
                e += u'\U0010FFFF'
                e = eid(e)

            if s == () and e == ():
                filters.append({'match_all': {}})
            elif e == ():
                filters.append({'range': {'_id': {'gte': s}}})
            elif s == ():
                filters.append({'range': {'_id': {'lte': e}}})
            else:
                filters.append({'range': {'_id': {'gte': s, 'lte': e}}})
        if len(filters) == 0:
            return [{'match_all': {}}]
        else:
            return filters

    def _create_index(self):
        'Create the index'
        try:
            self.conn.indices.create(
                index=self.index, timeout=60, request_timeout=60, body={
                    'settings': {
                        'number_of_shards': self.shards,
                        'number_of_replicas': self.replicas,
                    },
                })
        except TransportError:
            # Hope that this is an "index already exists" error...
            logger.warn('index already exists? OK', exc_info=True)

    def _create_mappings(self):
        'Create the field type mapping.'
        self.conn.indices.put_mapping(
            index=self.index, doc_type=self.type,
            timeout=60, request_timeout=60,
            body={
                self.type: {
                    'dynamic_templates': [{
                        'default_no_analyze_fc': {
                            'match': 'fc.*',
                            'mapping': {'index': 'no'},
                        },
                    }],
                    '_all': {
                        'enabled': False,
                    },
                    '_id': {
                        'index': 'not_analyzed',  # allows range queries
                    },
                    'properties': self._get_index_mappings(),
                },
            })
        # It is possible to create an index and quickly launch a request
        # that will fail because the index hasn't been set up yet. Usually,
        # you'll get a "no active shards available" error.
        #
        # Since index creation is a very rare operation (it only happens
        # when the index doesn't already exist), we sit and wait for the
        # cluster to become healthy.
        self.conn.cluster.health(index=self.index, wait_for_status='yellow')

    def _get_index_mappings(self):
        'Retrieve the field mappings. Useful for debugging.'
        maps = {}
        for fname in self.indexed_features:
            config = self.indexes.get(fname, {})
            logger.debug('index mapping for %s: %r', fname, config)
            maps[fname_to_idx_name(fname)] = {
                'type': config.get('es_index_type', 'integer'),
                'store': False,
                'index': 'not_analyzed',
            }
        for fname in self.fulltext_indexed_features:
            maps[fname_to_full_idx_name(fname)] = {
                'type': 'string',
                'store': False,
                'index': 'analyzed',
            }
        return maps

    def _get_field_types(self):
        'Retrieve the field types. Useful for debugging.'
        mapping = self.conn.indices.get_mapping(
            index=self.index, doc_type=self.type)
        return mapping[self.index]['mappings'][self.type]['properties']

    def _normalize_fulltext_feature_indexes(self, fulltext_indexes):
        for x in fulltext_indexes or []:
            if isinstance(x, Mapping):
                assert len(x) == 1, 'only one mapping per index entry allowed'
                name = x.keys()[0]
                features = x[name]
            else:
                name = x
                features = [x]
            self.fulltext_indexes[name] = features
            for fname in features:
                self.fulltext_indexed_features.add(fname)

    def _normalize_feature_indexes(self, feature_indexes):
        for x in feature_indexes or []:
            if isinstance(x, Mapping):
                assert len(x) == 1, 'only one mapping per index entry allowed'
                name = x.keys()[0]
                if isinstance(x[name], Mapping):
                    index_type = x[name]['es_index_type']
                    features = x[name]['feature_names']
                else:
                    index_type = 'integer'
                    features = x[name]
            else:
                name = x
                features = [x]
                index_type = 'integer'
            self.indexes[name] = {
                'feature_names': features,
                'es_index_type': index_type,
            }
            for fname in features:
                self.indexed_features.add(fname)

    def _fc_index_disjunction_from_query(self, query_fc, fname):
        'Creates a disjunction for keyword scan queries.'
        if len(query_fc.get(fname, [])) == 0:
            return []
        terms = query_fc[fname].keys()

        disj = []
        for fname in self.indexes[fname]['feature_names']:
            disj.append({'terms': {fname_to_idx_name(fname): terms}})
        return disj

    def fc_to_dict(self, fc):
        d = {}
        for name, feat in fc.to_dict().iteritems():
            # This is a hack to drop the clean_visible feature because it
            # is not necessary to store it and it is large. We simply need
            # to index it.
            if name == '#clean_visible':
                continue
            d[name] = base64.b64encode(cbor.dumps(feat))
        return d

    def fc_from_dict(self, fc_dict):
        d = {}
        for name, feat in fc_dict.iteritems():
            d[name] = cbor.loads(base64.b64decode(feat))
        return FC(d)
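
    # Round-trip sketch: every feature value is CBOR-encoded and then
    # base64-encoded before being stored under the 'fc' field:
    #
    #   d = store.fc_to_dict(fc)      # {'NAME': '<base64(cbor)>', ...}
    #   fc2 = store.fc_from_dict(d)   # back to a FeatureCollection
    #
    # ('#clean_visible' is intentionally dropped on the way in).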

    def get_query_fc(self, query_id, query_fc):
        if query_fc is None:
            if query_id is None:
                raise ValueError(
                    'one of query_id or query_fc must not be None')
            query_fc = self.get(query_id)
        if query_fc is None:
            raise KeyError(query_id)
        return query_fc
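
A configuration sketch for the store above; the hosts, namespace, and feature names are placeholders, and it assumes the dossier.fc / yakonfig stack plus the ES version this class targets (1.x-style filters and string mappings):

from dossier.fc import FeatureCollection

store = ElasticStore(hosts=[{'host': 'localhost', 'port': 9200}],
                     namespace='demo',
                     feature_indexes=[{'NAME': {'es_index_type': 'string',
                                                'feature_names': ['NAME']}}],
                     fulltext_indexes=['NAME'])
fc = FeatureCollection({u'NAME': {u'tom': 1}})
store.put([('doc-1', fc), ('doc-2', fc)])
store.sync()
# Keyword scan seeded from doc-1; expected to yield 'doc-2' (the query id
# itself is excluded from results).
print list(store.keyword_scan_ids(query_id='doc-1'))
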
Example No. 14
class _ES(object):
    def __init__(self, index, doc_type, host, port, timeout=300, **args):
        self.host = host
        self.port = port
        self.index = index
        self.doc_type = doc_type
        self.es = Elasticsearch(hosts=[
            {
                "host": self.host,
                "port": self.port
            },
        ],
                                timeout=timeout,
                                **args)

    def check_properties(self, properties):
        """
        Check if all properties are known (e.g. have mappings), and creates mappings as needed
        """
        # Compute the missing properties once and add mappings only for those.
        to_add = set(properties) - self.get_properties()
        if to_add:
            self.add_properties(to_add)

    def add_properties(self, to_add):
        """
        Add the named properties, setting mapping depending on suffix
        """
        mappings = {}
        for name in to_add:
            ftype = name.rsplit("_", 1)[1] if "_" in name else 'default'
            mappings[name] = settings.ES_MAPPING_TYPES[ftype]
        self.es.indices.put_mapping(index=self.index,
                                    doc_type=self.doc_type,
                                    body={"properties": mappings})

    def get_mapping(self):
        m = self.es.indices.get_mapping(self.index, self.doc_type)
        return m[self.index]['mappings'][self.doc_type]['properties']

    def get_properties(self):
        self.check_index()
        return set(self.get_mapping().keys())

    def refresh(self):
        self.es.indices.refresh()

    def highlight_article(self, aid: int, query: str) -> dict:
        """Highlight article given by an article id using a Lucene query. The resulting strings
        are safe to insert into an HTML document even if the original document contained malicious
        constructs.

        If you need the original article including HTML, call html.unescape on this output."""
        from amcat.tools.amcates_queryset import ESQuerySet

        qs = ESQuerySet().filter(id=aid).only("text",
                                              "title").highlight(query,
                                                                 mark="em")

        try:
            return next(iter(qs)).to_dict()
        except StopIteration:
            raise ValueError(
                "Article(id={}) not found in elastic index.".format(aid))

    def clear_cache(self):
        self.es.indices.clear_cache()

    def delete_index(self):
        try:
            self.es.indices.delete(self.index)
        except NotFoundError:
            pass
        except Exception as e:
            if 'IndexMissingException' in str(e):
                return
            raise

    def create_index(self, shards=5, replicas=1):
        es_settings = settings.ES_SETTINGS.copy()
        es_settings.update({
            "number_of_shards": shards,
            "number_of_replicas": replicas
        })

        body = {
            "settings": es_settings,
            "mappings": {
                settings.ES_ARTICLE_DOCTYPE: settings.ES_MAPPING
            }
        }

        self.es.indices.create(self.index, body)

    def check_index(self):
        """
        Check whether the server is up and the index exists.
        If the server is down, raise an exception.
        If the index does not exist, try to create it.
        """
        if not self.es.ping():
            raise Exception("Elastic server cannot be reached")
        if not self.es.indices.exists(self.index):
            log.info("Index {self.index} does not exist, creating".format(
                **locals()))
            self.create_index()
        return self.es.cluster.health(self.index, wait_for_status='yellow')

    def exists_type(self, doc_type, **kargs):
        return self.es.indices.exists_type(index=self.index,
                                           doc_type=doc_type,
                                           **kargs)

    def put_mapping(self, doc_type, body, **kargs):
        return self.es.indices.put_mapping(index=self.index,
                                           doc_type=doc_type,
                                           body=body,
                                           **kargs)

    def status(self):
        nodes = self.es.nodes.info()['nodes'].values()
        return {
            "ping": self.es.ping(),
            "nodes": [n['name'] for n in nodes],
            "index": self.index,
            "index_health": self.es.cluster.health(self.index),
            "transport_hosts": self.es.transport.hosts,
        }

    def get(self, id, **options):
        """
        Get a single article from the index
        """
        kargs = dict(index=self.index, doc_type=self.doc_type)
        kargs.update(options)
        return self.es.get_source(id=id, **kargs)

    def mget(self, ids, doc_type=None, parents=None):
        """
        Get multiple articles from the index.
        If parents is given, it should be a sequence of the same length as ids
        """
        if parents is None: parents = [None] * len(ids)
        if doc_type is None: doc_type = self.doc_type
        getdocs = [{
            "_index": self.index,
            "_id": id,
            "_parent": parent,
            "_type": doc_type
        } for (id, parent) in zip(ids, parents)]
        return self.es.mget({"docs": getdocs})['docs']
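
    # Usage sketch (article ids are placeholders; parents only matter when the
    # mapping uses parent/child relations):
    #
    #   docs = es.mget([1, 2, 3])
    #   found = [d['_source'] for d in docs if d.get('found')]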

    def search(self, body, **options):
        """
        Perform a 'raw' search on the underlying ES index
        """
        kargs = dict(index=self.index, doc_type=self.doc_type)
        kargs.update(options)
        return self.es.search(body=body, **kargs)

    def scan(self, query, **kargs):
        """
        Perform a scan query on the es index
        See: http://elasticsearch-py.readthedocs.org/en/latest/helpers.html#elasticsearch.helpers.scan
        """
        return scan(self.es,
                    index=self.index,
                    doc_type=self.doc_type,
                    query=query,
                    **kargs)

    def query_ids(self,
                  query=None,
                  filters=EMPTY_RO_DICT,
                  body=None,
                  limit=None,
                  **kwargs):
        """
        Query the index, returning a sequence of article ids for the matched articles

        @param query: an elastic query string (i.e. lucene syntax, e.g. 'piet AND (ja* OR klaas)')
        @param filter: field filter DSL query dict
        @param body: if given, use this instead of constructing from query/filters
        @param filters: if filter is None, build filter from filters as accepted by build_query, e.g. sets=12345

        Note that query and filters can be combined in a single call
        """
        if body is None:
            body = dict(build_body(query, filters, query_as_filter=True))
        for i, a in enumerate(
                scan(self.es,
                     query=body,
                     index=self.index,
                     doc_type=self.doc_type,
                     size=(limit or 1000),
                     fields="")):
            if limit and i >= limit:
                return
            yield int(a['_id'])
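
    # Usage sketch (the set id and query string are illustrative):
    #
    #   ids = list(es.query_ids(query='piet AND (ja* OR klaas)',
    #                           filters=dict(sets=[12345]), limit=100))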

    def query(self,
              query=None,
              filters=EMPTY_RO_DICT,
              highlight=False,
              lead=False,
              fields=(),
              score=True,
              **kwargs):
        """
        Execute a query for the given fields with the given query and filter
        @param query: an elastic query string (i.e. lucene syntax, e.g. 'piet AND (ja* OR klaas)')
        @param filter: field filter DSL query dict, defaults to build_filter(**filters)
        @param kwargs: additional keyword arguments to pass to es.search, eg fields, sort, from_, etc
        @return: a list of named tuples containing id, score, and the requested fields
        """
        body = dict(
            build_body(query,
                       filters,
                       query_as_filter=(not (highlight or score))))
        if highlight and not score:
            body['query'] = {'constant_score': {'query': body['query']}}

        if 'sort' in kwargs:
            body['track_scores'] = True

        if highlight and query:
            if isinstance(highlight, dict):
                body['highlight'] = highlight
            else:
                body['highlight'] = HIGHLIGHT_OPTIONS
        if lead or False and query == "" and highlight:
            body['script_fields'] = {
                "lead": {
                    "script": {
                        "file": LEAD_SCRIPT_FIELD
                    }
                }
            }

        result = self.search(body, fields=fields, **kwargs)
        return SearchResult(result, fields, score, body, query=query)

    def query_all(self, *args, **kargs):
        kargs.update({"from_": 0})
        size = kargs.setdefault('size', 10000)
        result = self.query(*args, **kargs)
        total = result.total
        for offset in range(size, total, size):
            kargs['from_'] = offset
            result2 = self.query(*args, **kargs)
            result.hits += result2.hits

        return result

    def _get_used_properties(self, body__prop):
        body, prop = body__prop
        body["query"]["bool"]["must"][1]["exists"]["field"] = prop
        return bool(
            self.es.count(index=self.index, doc_type=self.doc_type,
                          body=body)['count'])

    def get_used_properties(self, set_ids=None, article_ids=None, **filters):
        """
        Returns a sequence of property names in use in the specified set(s) (or set ids)
        """
        if set_ids is not None:
            filters["sets"] = set_ids

        if article_ids is not None:
            filters["ids"] = article_ids

        all_properties = self.get_properties()
        flexible_properties = set(all_properties) - set(ALL_FIELDS)

        body = {
            "query": {
                "bool": {
                    "must": [
                        build_filter(**filters), {
                            "exists": {
                                "field": "fakeprop"
                            }
                        }
                    ]
                }
            }
        }

        bodies = (copy.deepcopy(body) for _ in range(len(flexible_properties)))
        pool = ThreadPool()
        results = pool.imap(self._get_used_properties,
                            zip(bodies, flexible_properties))

        try:
            for found, prop in zip(results, flexible_properties):
                if found:
                    yield prop
        finally:
            pool.close()

    def add_articles(self, article_ids, batch_size=1000):
        """
        Add the given article_ids to the index. This is done in batches, so there
        is no limit on the length of article_ids (which can be a generator).
        """
        #WvA: remove redundancy with create_articles
        if not article_ids: return
        from amcat.models import Article, ArticleSetArticle

        n = len(article_ids) // batch_size  # integer batch count, used only in the progress log
        for i, batch in enumerate(
                splitlist(article_ids, itemsperbatch=batch_size)):
            log.info("Adding batch {i}/{n}".format(**locals()))
            all_sets = multidict(
                (aa.article_id, aa.articleset_id)
                for aa in ArticleSetArticle.objects.filter(article__in=batch))
            dicts = (get_article_dict(article,
                                      list(all_sets.get(article.id, [])))
                     for article in Article.objects.filter(pk__in=batch))
            self.bulk_insert(dicts, batch_size=None)

    def remove_from_set(self, setid, article_ids, flush=True):
        """Remove the given articles from the given set. This is done in batches, so there
        is no limit on the length of article_ids (which can be a generator)."""
        if not article_ids: return
        for batch in splitlist(article_ids, itemsperbatch=1000):
            self.bulk_update(batch,
                             UPDATE_SCRIPT_REMOVE_FROM_SET,
                             params={'set': setid})

    def add_to_set(self, setid, article_ids, monitor=NullMonitor()):
        """Add the given articles to the given set. This is done in batches, so there
        is no limit on the length of article_ids (which can be a generator)."""

        if not article_ids:
            if monitor:
                monitor.update()
            return

        batches = list(splitlist(article_ids, itemsperbatch=1000))
        monitor = monitor.submonitor(total=len(batches))

        nbatches = len(batches)
        for i, batch in enumerate(batches):
            monitor.update(message="Adding batch {iplus}/{nbatches}..".format(
                iplus=i + 1, nbatches=nbatches))
            self.bulk_update(batch,
                             UPDATE_SCRIPT_ADD_TO_SET,
                             params={'set': setid})

    def get_tokens(self, aid: int, fields=["text", "title"]):
        """
        Get a list of all tokens (words and their positions) in the given document
        :param aid: Article ID
        :param fields: List of fields to get the terms for
        :return: a sequence of (field, position, term) tuples
        """
        fieldstr = ",".join(fields)
        data = self.es.termvectors(self.index,
                                   self.doc_type,
                                   aid,
                                   fields=fieldstr,
                                   field_statistics=False,
                                   payloads=False,
                                   offsets=False)
        for field in fields:
            if field in data['term_vectors']:
                for term, info in data['term_vectors'][field]['terms'].items():
                    for token in info['tokens']:
                        yield field, token['position'], term
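
    # Each yielded tuple is (field, position, term), e.g. for a title
    # "rain in spain" (illustrative): ('title', 0, 'rain'), ('title', 1, 'in'),
    # ('title', 2, 'spain') -- subject to the analyzer configured for the field.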

    def bulk_insert(self, dicts, batch_size=1000, monitor=NullMonitor()):
        """
        Bulk insert the given articles in batches of batch_size
        """
        batches = list(toolkit.splitlist(
            dicts, itemsperbatch=batch_size)) if batch_size else [dicts]
        monitor = monitor.submonitor(total=len(batches))
        nbatches = len(batches)
        for i, batch in enumerate(batches):
            monitor.update(
                1, "Adding batch {iplus}/{nbatches}".format(iplus=i + 1,
                                                            **locals()))
            props, articles = set(), {}
            for d in batch:
                props |= (set(d.keys()) - ALL_FIELDS)
                articles[d["id"]] = serialize(d)
            self.check_properties(props)
            body = get_bulk_body(articles)
            resp = self.es.bulk(body=body,
                                index=self.index,
                                doc_type=settings.ES_ARTICLE_DOCTYPE)
            if resp["errors"]:
                raise ElasticSearchError(resp)

    def update_values(self, article_id, values):
        """Update properties of existing article.

        @param values: mapping from field name to (new) value
        @type values: dict"""
        return self.bulk_update_values({article_id: values})

    def bulk_update_values(self, articles):
        """Updates set of articles in bulk.
        """
        body = get_bulk_body(
            {aid: serialize({"doc": a})
             for aid, a in articles.items()},
            action="update")
        resp = self.es.bulk(body=body,
                            index=self.index,
                            doc_type=settings.ES_ARTICLE_DOCTYPE)

        if resp["errors"]:
            raise ElasticSearchError(resp)

    def bulk_update(self, article_ids, script, params):
        """
        Execute a bulk update script with the given params on the given article ids.
        """
        payload = serialize({"script": {"file": script, "params": params}})
        body = get_bulk_body({aid: payload
                              for aid in article_ids},
                             action="update")
        resp = self.es.bulk(body=body,
                            index=self.index,
                            doc_type=settings.ES_ARTICLE_DOCTYPE)

        if resp["errors"]:
            raise ElasticSearchError(resp)

    def synchronize_articleset(self, aset, full_refresh=False):
        """
        Make sure the given articleset is correctly stored in the index
        @param full_refresh: if true, re-add all articles to the index. Use this
                             after changing properties of articles
        """
        self.check_index()  # make sure index exists and is at least 'yellow'

        log.debug("Getting SOLR ids from set")
        solr_set_ids = set(self.query_ids(filters=dict(sets=[aset.id])))
        log.debug("Getting DB ids")
        db_ids = aset.get_article_ids()
        log.debug("Getting SOLR ids")
        solr_ids = set(self.in_index(db_ids))

        to_remove = solr_set_ids - db_ids
        if full_refresh:
            to_add_docs = db_ids
            to_add_set = set()
        else:
            to_add_docs = db_ids - solr_ids
            to_add_set = (db_ids & solr_ids) - solr_set_ids

        log.warning(
            "Refreshing index, full_refresh={full_refresh},"
            "|solr_set_ids|={nsolrset}, |db_set_ids|={ndb}, |solr_ids|={nsolr} "
            "|to_add| = {nta}, |to_add_set|={ntas}, |to_remove_set|={ntr}".
            format(nsolr=len(solr_ids),
                   nsolrset=len(solr_set_ids),
                   ndb=len(db_ids),
                   nta=len(to_add_docs),
                   ntas=len(to_add_set),
                   ntr=len(to_remove),
                   **locals()))

        log.info("Removing {} articles".format(len(to_remove)))
        self.remove_from_set(aset.id, to_remove)
        log.info("Adding {} articles to set".format(len(to_add_set)))
        self.add_to_set(aset.id, to_add_set)
        log.info("Adding {} articles to index".format(len(to_add_docs)))
        self.add_articles(to_add_docs)
        log.info("Refreshing")
        self.refresh()

    def _count(self, body):
        """Raw version of count directly passing given query to elastic, while setting the index and doc_type"""
        return self.es.count(index=self.index,
                             doc_type=settings.ES_ARTICLE_DOCTYPE,
                             body=body)

    def count(self, query=None, filters=None):
        """
        Compute the number of items matching the given query / filter
        """
        filters = dict(build_body(query, filters, query_as_filter=True))
        body = {"query": {"constant_score": filters}}
        return self._count(body)["count"]

    def search_aggregate(self,
                         aggregation,
                         query=None,
                         filters=None,
                         **options):
        """
        Run an aggregate search query and return the aggregation results
        @param aggregation: raw elastic query, e.g. {"terms" : {"field" : "medium"}}
        """
        body = dict(query={
            "filtered":
            dict(build_body(query, filters, query_as_filter=True))
        },
                    aggregations={"aggregation": aggregation})
        result = self.search(body, size=0, search_type="count", **options)
        return result['aggregations']['aggregation']

    def _parse_terms_aggregate(self, aggregate, group_by, terms, sets):
        if not group_by:
            for term in terms:
                yield term, aggregate[term.label]['doc_count']
        else:
            for term in terms:
                yield term, self._parse_aggregate(aggregate[term.label],
                                                  list(group_by), terms, sets)

    def _parse_other_aggregate(self, aggregate, group_by, group, terms, sets):
        buckets = aggregate[group]["buckets"]
        if not group_by:
            return ((b['key'], b['doc_count']) for b in buckets)
        return ((b['key'], self._parse_aggregate(b, list(group_by), terms,
                                                 sets)) for b in buckets)

    def _parse_aggregate(self, aggregate, group_by, terms, sets):
        """Parse a aggregation result to (nested) namedtuples."""
        group = group_by.pop(0)

        if group == "terms":
            result = self._parse_terms_aggregate(aggregate, group_by, terms,
                                                 sets)
        else:
            result = self._parse_other_aggregate(aggregate, group_by, group,
                                                 terms, sets)
            if group == "sets" and sets is not None:
                # Filter sets if 'sets' is given
                result = ((aset_id, res) for aset_id, res in result
                          if aset_id in set(sets))
            elif group == "date":
                # Parse timestamps as datetime objects
                result = ((get_date(stamp), aggr) for stamp, aggr in result)

        # Return results as namedtuples
        ntuple = namedtuple("Aggr",
                            [group, "buckets" if group_by else "count"])
        return [ntuple(*r) for r in result]

    def _build_aggregate(self, group_by, date_interval, terms, sets):
        """Build nested aggregation query for list of groups"""
        group = group_by.pop(0)

        if group == 'date':
            aggregation = {
                group: {
                    'date_histogram': {
                        'field': group,
                        'interval': date_interval,
                        "min_doc_count": 1
                    }
                }
            }
        elif group == 'terms':
            aggregation = {
                term.label: {
                    'filter': dict(build_body(term.query))
                }
                for term in terms
            }
        else:
            aggregation = {
                group: {
                    'terms': {
                        # Default size is too small, we want to return all results
                        'size': 999999,
                        'field': group
                    }
                }
            }

        # We need to nest the other aggregations, see:
        # http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/search-aggregations.html
        if group_by:
            nested = self._build_aggregate(group_by, date_interval, terms,
                                           sets)
            for aggr in aggregation.values():
                aggr["aggregations"] = nested

        return aggregation

    def aggregate_query(self,
                        query=None,
                        filters=None,
                        group_by=None,
                        terms=None,
                        sets=None,
                        date_interval='month'):
        """
        Compute an aggregate query, e.g. select count(*) where <filters> group by <group_by>. If
        date is used as a group_by variable, uses date_interval to bin it. It does support multiple
        values for group_by.

        You can group_by on terms by supplying "terms" to group_by. In addition, you will need to
        supply terms as a parameter, which consists of a list of SearchQuery's. Query is then used
        as a global filter, while terms are 'local'.

        @param query: an elastic query string (i.e. lucene syntax, e.g. 'piet AND (ja* OR klaas)')
        @type group_by: list / tuple
        """
        if isinstance(group_by, str):
            log.warning(
                "Passing strings to aggregate_query(group_by) is deprecated.")
            group_by = [group_by]

        if "terms" in group_by and terms is None:
            raise ValueError(
                "You should pass a list of terms if aggregating on it.")

        filters = dict(build_body(query, filters, query_as_filter=True))
        aggregations = self._build_aggregate(list(group_by), date_interval,
                                             terms, sets)

        body = {
            "query": {
                "constant_score": filters
            },
            "aggregations": aggregations
        }

        log.debug("es.search(body={body})".format(**locals()))
        result = self.search(body)
        result = self._parse_aggregate(result["aggregations"], list(group_by),
                                       terms, sets)
        return result

    def statistics(self, query=None, filters=None):
        """Compute and return a Result object with n, start_date and end_date for the selection"""
        body = {
            "query": {
                "constant_score":
                dict(build_body(query, filters, query_as_filter=True))
            },
            'aggregations': {
                'stats': {
                    'stats': {
                        'field': 'date'
                    }
                }
            }
        }

        stats = self.search(body, size=0)['aggregations']['stats']
        result = Result()
        result.n = stats['count']
        if result.n == 0:
            result.start_date, result.end_date = None, None
        else:
            result.start_date = get_date(stats['min'])
            result.end_date = get_date(stats['max'])
        return result

    def list_dates(self, query=None, filters=None, interval="day"):
        from amcat.tools.aggregate_es import aggregate, IntervalCategory
        for date, count in aggregate(query,
                                     filters, [IntervalCategory(interval)],
                                     es=self):
            yield date

    def in_index(self, ids):
        """
        Check whether the given ids are already indexed.
        @return: a sequence of ids that are in the index
        """
        if not isinstance(ids, list): ids = list(ids)
        log.info(
            "Checking existence of {nids} documents".format(nids=len(ids)))
        if not ids: return
        for batch in splitlist(ids, itemsperbatch=10000):
            result = self.es.mget(index=self.index,
                                  doc_type=settings.ES_ARTICLE_DOCTYPE,
                                  body={"ids": batch},
                                  fields=[])
            for doc in result['docs']:
                if doc['found']: yield int(doc['_id'])

    def duplicate_exists(self, article):
        """
        Check whether a duplicate of the given article already exists.
        If so, returns the sets that the duplicate is a member of.
        Duplication is checked using the get_hash function, so article
        should be an object with the appropriate attributes (.title etc)
        @return: A (possibly empty) sequence of results with .id and .sets
        """
        hash = get_article_dict(article).hash
        return self.query(filters={'hashes': hash},
                          fields=["sets"],
                          score=False)

    def _get_purge_actions(self, query):
        for id in self.query_ids(body=query):
            yield {
                "_op_type": "delete",
                "_id": id,
                "_index": self.index,
                "_type": settings.ES_ARTICLE_DOCTYPE
            }

    def purge_orphans(self):
        """Remove all articles without set from the index"""
        query = {
            "query": {
                "constant_score": {
                    "filter": {
                        "missing": {
                            "field": "sets"
                        }
                    }
                }
            }
        }
        return bulk(self.es, self._get_purge_actions(query))

    def get_child_type_counts(self, **filters):
        """Get the number of child documents per type"""
        filters = dict(build_body(filters=filters))
        filter = {
            "has_parent": {
                "parent_type": self.doc_type,
                "filter": filters['filter']
            }
        }
        aggs = {"module": {"terms": {"field": "_type"}}}
        body = {"aggs": {"prep": {"filter": filter, "aggs": aggs}}}
        r = self.es.search(index=self.index, search_type="count", body=body)
        for b in r['aggregations']['prep']['module']['buckets']:
            yield b['key'], b['doc_count']

    def get_articles_without_child(self, child_doctype, limit=None, **filters):
        """Return the ids of all articles without a child of the given doctype"""
        nochild = {
            "not": {
                "has_child": {
                    "type": child_doctype,
                    "query": {
                        "match_all": {}
                    }
                }
            }
        }
        filter = dict(build_body(filters=filters))['filter']
        body = {"filter": {"bool": {"must": [filter, nochild]}}}
        return self.query_ids(body=body, limit=limit)
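The in_index() helper above is essentially a batched mget that only asks whether each document exists. A minimal standalone sketch of that pattern with elasticsearch-py (the index name, doc type and ids are placeholders, and a local node on port 9200 is assumed):

from elasticsearch import Elasticsearch


def ids_in_index(es, index, doc_type, ids, batch_size=10000):
    """Yield the subset of ids that already exist in the index, checked in batches."""
    ids = [str(i) for i in ids]
    for start in range(0, len(ids), batch_size):
        batch = ids[start:start + batch_size]
        # _source=False keeps the response small: we only need the 'found' flag
        result = es.mget(index=index, doc_type=doc_type,
                         body={"ids": batch}, _source=False)
        for doc in result["docs"]:
            if doc["found"]:
                yield doc["_id"]


es = Elasticsearch()  # assumes localhost:9200
existing = list(ids_in_index(es, "amcat", "article", [1, 2, 3]))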
Example #15
class Elastic_Search:
    def __init__(self, index='iis-logs-', aws_secret_id=None):
        self.timestamp = datetime.datetime.utcnow()
        self.index = index
        self._setup_Elastic_on_localhost()  # default to localhost
        self._result = None

        if index and aws_secret_id:
            self._setup_Elastic_on_cloud_via_AWS_Secret(index, aws_secret_id)

    def _setup_Elastic_on_localhost(self):
        self.host = 'localhost'
        self.port = 9200
        self.scheme = 'http'
        self.es = Elasticsearch([{'host': self.host, 'port': self.port}])

    def _setup_Elastic_on_cloud_via_AWS_Secret(self, index, secret_id):
        credentials = json.loads(Secrets(secret_id).value())
        self.host = credentials['host']
        self.username = credentials['username']
        self.password = credentials['password']
        self.port = credentials['port']
        self.index = index
        self._setup_Elastic_on_cloud(self.host, self.port, self.username,
                                     self.password)
        return self

    def _setup_Elastic_on_cloud(self, host, port, username, password):
        self.host = host
        self.port = port
        self.username = username
        self.password = password
        self.scheme = 'https'
        self.es = Elasticsearch([host],
                                http_auth=(username, password),
                                scheme="https",
                                port=port)
        return self

    def add_data_with_timestamp(self, data):
        data["@timestamp"] = self.timestamp
        return self.es.index(index=self.index, doc_type='item', body=data)

    def add(self, data, id_key=None):
        try:
            if id_key is not None:
                return self.es.index(index=self.index,
                                     doc_type='item',
                                     body=data,
                                     id=data[id_key])
            else:
                return self.es.index(index=self.index,
                                     doc_type='item',
                                     body=data)
        except Exception as error:
            print("elk-error", error)
            return {"elk-error": "{0}".format(error)}

    def add_bulk(self, data, id_key=None, pipeline=None):
        ok = 0
        if data:
            actions = []
            for item in data:
                item_data = {
                    "_index": self.index,
                    "_type": 'item',
                    "_source": item,
                }
                if id_key is not None:
                    item_data["_id"] = item[id_key]
                actions.append(item_data)

            if pipeline is None:
                ok, _ = helpers.bulk(self.es, actions, index=self.index)
            else:
                ok, _ = helpers.bulk(self.es,
                                     actions,
                                     index=self.index,
                                     pipeline=pipeline)
        return ok

    def create_index(self, body=None):
        if self.exists() is False:
            # avoid a mutable default argument; None means "use index defaults"
            self._result = self.es.indices.create(index=self.index, body=body)
        return self

    def create_index_with_location_geo_point(self, field="location"):
        body = {
            "mappings": {
                "item": {
                    "properties": {
                        field: {
                            "type": "geo_point"
                        }
                    }
                }
            }
        }
        self.create_index(body)
        return self

    def create_index_pattern(self, add_time_field=True):
        if add_time_field:
            payload = {
                "type": "index-pattern",
                "index-pattern": {
                    "title": self.index + '*',
                    "timeFieldName": "date"
                }
            }
        else:
            print('creating index without index pattern')
            payload = {
                "type": "index-pattern",
                "index-pattern": {
                    "title": self.index + '*'
                }
            }
        data = json.dumps(payload)
        headers = {'Content-Type': 'application/json'}

        if self.host == 'localhost':
            url = 'http://{0}:{1}/.kibana/doc/index-pattern:{2}'.format(
                self.host, self.port, self.index)
            self._result = json.loads(PUT(url, data, headers))

        else:
            url = 'https://{0}:{1}/.kibana/doc/index-pattern:{2}'.format(
                self.host, self.port, self.index)
            response = requests.put(url,
                                    data,
                                    headers=headers,
                                    auth=HTTPBasicAuth(self.username,
                                                       self.password))
            self._result = json.loads(response.text)

        return self

    def delete_index_pattern(self):
        try:
            if self.host == 'localhost':
                url = 'http://{0}:{1}/.kibana/doc/index-pattern:{2}'.format(
                    self.host, self.port, self.index)
                self._result = json.loads(DELETE(url))
            else:
                url = 'https://{0}:{1}/.kibana/doc/index-pattern:{2}'.format(
                    self.host, self.port, self.index)
                response = requests.delete(url,
                                           auth=HTTPBasicAuth(
                                               self.username, self.password))
                self._result = json.loads(response.text)
        except Exception as error:
            self._result = {'error': error}
        return self

    def delete_data_by_id(self, id):
        return self.es.delete(index=self.index, doc_type='item', id=id)

    def get_data(self, id):
        try:
            return self.es.get(index=self.index, doc_type='item', id=id)
        except NotFoundError:
            return None

    def get_many(self, ids):
        data = self.es.mget(index=self.index,
                            doc_type='item',
                            body={'ids': ids})
        results = {}
        for item in data['docs']:
            _id = item['_id']
            if item['found'] is False:
                results[_id] = None
            else:
                results[_id] = item['_source']
        return results

    def get_data_First_10(self):
        results = self.es.search(index=self.index,
                                 body={"query": {
                                     "match_all": {}
                                 }})
        for result in results['hits']['hits']:
            yield result['_source']

    def get_index_settings(self):
        url = 'https://{0}:{1}/{2}/_settings'.format(self.host, self.port,
                                                     self.index)
        response = requests.get(url,
                                auth=HTTPBasicAuth(self.username,
                                                   self.password))
        return json.loads(response.text)

    def search_using_lucene(
        self,
        query,
        size=10000,
        sort=None
    ):  # for syntax and examples of lucene queries see https://www.elastic.co/guide/en/elasticsearch/reference/6.4/query-dsl-query-string-query.html#query-string-syntax
        query = query.replace('“', '"').replace(
            '”', '"')  # fix the quotes we receive from Slack
        results = self.es.search(index=self.index,
                                 q=query,
                                 size=size,
                                 sort=sort)
        for result in results['hits']['hits']:
            yield result['_source']

    def search_using_lucene_index_by_id(
        self,
        query,
        size=10000,
        sort=None
    ):  # for syntax and examples of lucene queries see https://www.elastic.co/guide/en/elasticsearch/reference/6.4/query-dsl-query-string-query.html#query-string-syntax
        query = query.replace('“', '"').replace(
            '”', '"')  # fix the quotes we receive from Slack
        elk_results = self.es.search(index=self.index,
                                     q=query,
                                     size=size,
                                     sort=sort)
        results = {}
        for result in elk_results['hits']['hits']:
            id = result['_id']
            value = result['_source']
            results[id] = value
        return results

    def search_using_lucene_sort_by_date(
        self,
        query,
        size=10000
    ):  # for syntax and examples of lucene queries see https://www.elastic.co/guide/en/elasticsearch/reference/6.4/query-dsl-query-string-query.html#query-string-syntax
        query = query.replace('“', '"').replace(
            '”', '"')  # fix the quotes we receive from Slack
        elk_results = self.es.search(index=self.index,
                                     q=query,
                                     size=size,
                                     sort="date:desc")
        results = []
        for result in elk_results['hits']['hits']:
            id = result['_id']
            value = result['_source']
            item = {"id": id, "value": value}
            results.append(item)
        return results

    def search_using_query(self, query, size=10000):
        results = self.es.search(index=self.index, body=query, size=size)
        for result in results['hits']['hits']:
            yield result['_source']

    def search_on_field_for_value(self, field, value, size=10000):
        query = {"query": {"match": {field: {"query": value}}}}
        return self.search_using_query(query, size=size)

    def search_on_field_for_values(self, field, values):
        query = {
            "query": {
                "constant_score": {
                    "filter": {
                        "terms": {
                            field: values
                        }
                    }
                }
            }
        }
        return self.search_using_query(query)

    def search_get_unique_field_values(self, field, size=10000):
        # reworked from an earlier commented-out version that passed the literal
        # string 'field' and routed the call through search_using_query, which
        # only yields hit sources and drops the aggregation buckets.
        # Note: on ES 5+, 'field' usually needs to be a keyword (not text) field.
        query = {
            "size": 0,
            "aggs": {
                "unique_values": {
                    "terms": {
                        "field": field,
                        "size": size
                    }
                }
            }
        }
        results = self.es.search(index=self.index, body=query)
        return [bucket['key']
                for bucket in results['aggregations']['unique_values']['buckets']]

    def set_index_settings(self, settings):
        headers = {'Content-Type': 'application/json'}
        url = 'https://{0}:{1}/{2}/_settings'.format(self.host, self.port,
                                                     self.index)
        response = requests.put(url,
                                json.dumps(settings),
                                headers=headers,
                                auth=HTTPBasicAuth(self.username,
                                                   self.password))
        return response.text

    def set_index_settings_total_fields(self, value):
        self.set_index_settings({"index.mapping.total_fields.limit": value})
        return self

    def delete_using_query(self, query):
        results = self.es.delete_by_query(index=self.index, body=query)
        return results

    def delete_index(self):
        if self.exists():
            self._result = self.es.indices.delete(self.index)
        return self

    def index_list(self):
        return set(self.es.indices.get_alias())

    def exists(self):
        return self.es.indices.exists(self.index)

    def set_index(self, index):
        self.index = index
        return self
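A minimal usage sketch for the Elastic_Search wrapper above, assuming a local Elasticsearch node; the index name and documents are made up:

elk = Elastic_Search(index='iis-logs-dev')
elk.create_index()
elk.add({'id': '42', 'path': '/index.html', 'status': 200}, id_key='id')
elk.add_bulk([{'id': '43', 'path': '/about.html', 'status': 404}], id_key='id')
print(elk.get_many(['42', '43', 'does-not-exist']))  # missing ids map to None
elk.delete_index()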
Example #16
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 22 17:00:46 2018

@author: Ajanta
"""

import json
from elasticsearch import Elasticsearch

INDEX_NAME = 'products'
TYPE_NAME = 'snapdeal'

# mget fetches documents by id, so it needs an "ids" body rather than a query;
# the ids below are placeholders for real document ids in the index
doc_ids = {'ids': ['1', '2', '3']}

elastic_obj = Elasticsearch()  # index/type are not constructor arguments

with open('products.json', 'w') as f:
    f.write(json.dumps(elastic_obj.mget(index=INDEX_NAME, doc_type=TYPE_NAME,
                                        body=doc_ids)))
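The original snippet built a match_all query, which mget cannot use; if the intent was to dump every document in the index, a search call (which does accept a query body) is probably closer. A sketch under the same placeholder index and type:

results = elastic_obj.search(index=INDEX_NAME, doc_type=TYPE_NAME,
                             body={'query': {'match_all': {}}}, size=100)
with open('products_search.json', 'w') as f:
    f.write(json.dumps(results['hits']['hits']))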
Example #17
class SearchEngine(object):

    def __init__(self, prefix=settings.ELASTICSEARCH_PREFIX):
        #
        serializer = JSONSerializer()
        serializer.mimetype = 'application/json'
        serializer.dumps = serializer.serialize
        serializer.loads = JSONDeserializer().deserialize
        self.es = Elasticsearch(hosts=settings.ELASTICSEARCH_HOSTS, serializer=serializer, **settings.ELASTICSEARCH_CONNECTION_OPTIONS)
        self.logger = logging.getLogger(__name__)
        self.prefix = prefix.lower()

    def _add_prefix(self, *args, **kwargs):
        if args:
            index = args[0].strip()
        else:
            index = kwargs.get('index', '').strip()
        if index is None or index == '':
            raise NotImplementedError("Elasticsearch index not specified.")

        prefix = '%s_' % self.prefix.strip() if self.prefix and self.prefix.strip() != '' else ''
        index = '%s%s' % (prefix, index)
        if args:
            return index
        else:
            return dict(kwargs, index=index)

    def delete(self, **kwargs):
        """
        Deletes a document from the index
        Pass an index, doc_type, and id to delete a specific document
        Pass a body with a query dsl to delete by query

        """

        kwargs = self._add_prefix(**kwargs)
        body = kwargs.pop('body', None)
        if body is not None:
            try:
                data = []
                refresh = kwargs.pop('refresh', False)
                for hit in helpers.scan(self.es, query=body, **kwargs):
                    hit['_op_type'] = 'delete'
                    data.append(hit)

                return helpers.bulk(self.es, data, refresh=refresh, **kwargs)
            except Exception as detail:
                try:
                    # ignore 404 errors (index_not_found_exception)
                    if detail.status_code == 404:
                        pass
                except:
                    self.logger.warning('%s: WARNING: failed to delete document by query: %s \nException detail: %s\n' % (datetime.now(), body, detail))
                    raise detail
        else:
            try:
                return self.es.delete(ignore=[404], **kwargs)
            except Exception as detail:
                self.logger.warning('%s: WARNING: failed to delete document: %s \nException detail: %s\n' % (datetime.now(), body, detail))
                raise detail

    def delete_index(self, **kwargs):
        """
        Deletes an entire index

        """

        kwargs = self._add_prefix(**kwargs)
        print('deleting index : %s' % kwargs.get('index'))
        return self.es.indices.delete(ignore=[400, 404], **kwargs)

    def search(self, **kwargs):
        """
        Search for an item in the index.
        Pass an index, doc_type, and id to get a specific document
        Pass a body with a query dsl to perform a search

        """

        kwargs = self._add_prefix(**kwargs)
        body = kwargs.get('body', None)
        id = kwargs.get('id', None)
        
        if id:
            if isinstance(id, list):
                kwargs.setdefault('body', {'ids': kwargs.pop('id')})
                return self.es.mget(**kwargs)
            else:
                return self.es.get(**kwargs)

        ret = None
        try:
            ret = self.es.search(**kwargs)
        except Exception as detail:
            self.logger.warning('%s: WARNING: search failed for query: %s \nException detail: %s\n' % (datetime.now(), body, detail))
            pass

        return ret

    def create_mapping(self, index, doc_type, fieldname='', fieldtype='string', fieldindex=None, body=None):
        """
        Creates an Elasticsearch body for a single field given an index name and type name

        """

        index = self._add_prefix(index)
        if not body:
            if fieldtype == 'geo_shape':
                body =  {
                    doc_type : {
                        'properties' : {
                            fieldname : { 'type' : 'geo_shape', 'tree' : 'geohash', 'precision': '1m' }
                        }
                    }
                }
            else:
                fn = { 'type' : fieldtype }
                if fieldindex:
                    fn['index'] = fieldindex
                body =  {
                    doc_type : {
                        'properties' : {
                            fieldname : fn
                        }
                    }
                }

        self.es.indices.create(index=index, ignore=400)
        self.es.indices.put_mapping(index=index, doc_type=doc_type, body=body)
        print('creating index : %s/%s' % (index, doc_type))

    def create_index(self, **kwargs):
        kwargs = self._add_prefix(**kwargs)
        self.es.indices.create(**kwargs)
        print('creating index : %s' % kwargs.get('index', ''))

    def index_data(self, index=None, doc_type=None, body=None, idfield=None, id=None, **kwargs):
        """
        Indexes a document or list of documents into Elasticsearch

        If "id" is supplied then will use that as the id of the document

        If "idfield" is supplied then will try to find that property in the
            document itself and use the value found for the id of the document

        """

        index = self._add_prefix(index)
        if not isinstance(body, list):
            body = [body]

        for document in body:
            if idfield is not None:
                if isinstance(document, dict):
                    id = document[idfield]
                else:
                    id = getattr(document,idfield)

            try:
                self.es.index(index=index, doc_type=doc_type, body=document, id=id)
            except Exception as detail:
                self.logger.warning('%s: WARNING: failed to index document: %s \nException detail: %s\n' % (datetime.now(), document, detail))
                raise detail


    def bulk_index(self, data, **kwargs):
        return helpers.bulk(self.es, data, **kwargs)

    def create_bulk_item(self, op_type='index', index=None, doc_type=None, id=None, data=None):
        return {
            '_op_type': op_type,
            '_index': self._add_prefix(index),
            '_type': doc_type,
            '_id': id,
            '_source': data
        }

    def count(self, **kwargs):
        kwargs = self._add_prefix(**kwargs)
        count = self.es.count(**kwargs)
        if count is not None:
            return count['count']
        else:
            return None

    def BulkIndexer(outer_self, batch_size=500, **kwargs):

        class _BulkIndexer(object):
            def __init__(self, **kwargs):
                self.queue = []
                self.batch_size = kwargs.pop('batch_size', 500)
                self.kwargs = kwargs

            def add(self, op_type='index', index=None, doc_type=None, id=None, data=None):
                doc = {
                    '_op_type': op_type,
                    '_index': outer_self._add_prefix(index),
                    '_type': doc_type,
                    '_id': id,
                    '_source': data
                }
                self.queue.append(doc)

                if len(self.queue) >= self.batch_size:
                    outer_self.bulk_index(self.queue, **self.kwargs)
                    del self.queue[:]  #clear out the array
            
            def close(self):
                outer_self.bulk_index(self.queue, **self.kwargs)

            def __enter__(self, **kwargs):
                return self

            def __exit__(self, type, value, traceback):
                return self.close()

        return _BulkIndexer(batch_size=batch_size, **kwargs)
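A short usage sketch for the SearchEngine wrapper above; it assumes settings.ELASTICSEARCH_HOSTS points at a reachable cluster, and the index name, doc type and documents are illustrative:

se = SearchEngine()
se.create_index(index='resources')
with se.BulkIndexer(batch_size=2) as indexer:
    for i in range(5):
        indexer.add(index='resources', doc_type='resource', id=i, data={'n': i})
# a single id is fetched with es.get, a list of ids with es.mget
one = se.search(index='resources', doc_type='resource', id=0)
many = se.search(index='resources', doc_type='resource', id=[1, 2, 3])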
Example #18
class Kidash:
    def __init__(self, url, index='.kibana', doc_type=None):
        self.es = Elasticsearch(url, verify_certs=True)
        self.index = index
        self.doc_type = doc_type

    def search_request(self, body, size, filter_path):
        """Make a search in ES based on the given parameters.

        :param body: Body of the query to send
        :param filter_path: Filter for the parameters to retrieve
        :param size: length of the elements to retrieve
        :returns: an Object with the elements retrieved
        """
        request = self.es.search(index=self.index,
                                 doc_type=self.doc_type,
                                 body=body,
                                 filter_path=filter_path,
                                 size=size)
        return request

    def get_number_of_items(self, body):
        """Retrieve the number of items for a given search.

        :param body: Body of the query to send
        :returns: A counter of the total number of ids
        """
        filter_path = ['hits.total']
        request = self.search_request(body, 1, filter_path)
        t_ids = request['hits']['total']
        return t_ids

    def list_item_ids(self, body):
        """Retrieve the list of items for a given search.

        :param body: Body of the query to send
        :returns: an id's list
        """
        filter_path = ['hits.hits._id']
        size = self.get_number_of_items(body)
        request = self.search_request(body, size, filter_path)
        ids_list = request['hits']['hits']
        return ids_list

    def retrieve_items_by_list(self, ids_list):
        """Retrieve items based in a given id's list.

        :param body: Body of the query to send
        :returns: The list of elements retrieved
        """
        body_docs = {'docs': ids_list}
        request = self.es.mget(index=self.index,
                               doc_type=self.doc_type,
                               body=json.dumps(body_docs))
        elements_list = request['docs']
        return elements_list

    def retrieve_items_by_query(self, body=ALL):
        """Retrieve items based in a given query.

        By default it launches the query 'match_all'

        :param body: Body of the query to send
        :returns: The list of elements retrieved
        """
        filter_path = ['hits.hits._*']
        size = self.get_number_of_items(body)
        request = self.search_request(body, size, filter_path)
        return request['hits']['hits']

    def stream_items(self, query):
        """Scan the items of a given query, retrieve it and adds the delete operation.

        :param doc_type: Type of document to search
        :param query: Body of the query to send
        :yields: The elements retrieved
        """
        for item in scan(self.es,
                         query=query,
                         index=self.index,
                         doc_type=self.doc_type,
                         scroll='1m',
                         _source=False):

            del item['_score']
            item['_op_type'] = 'delete'
            yield item

    def load_items(self, list_of_elements):
        """Load a list of given items into ElasticSearch.

        :param list_of_elements: List of the elements to load
        """
        bulk_items = []
        for element in list_of_elements:
            item = {
                "_index": self.index,
                "_type": element['_type'],
                "_id": element['_id'],
                "_source": element['_source'],
            }
            bulk_items.append(item)
        bulk(self.es, bulk_items)

    def delete_items(self, query):
        """Remove the elements of a given query by using Bulk operations.

        :param query: Body of the query to send
        """
        bulk(self.es, self.stream_items(query), chunk_size=CHUNK_SIZE)

    def import_items(self, filepath):
        """Import a set of elements given a file.

        :param filepath: Path of the file to load
        """
        list_of_elements = json.loads(open(filepath).read())
        self.load_items(list_of_elements)

    def export_items(self, output_file, query):
        """Export a set of elements based on the parameters given.

        :param output_file: File where to export the items
        :param query: Body of the query to send
        """
        items = self.retrieve_items_by_query(query)
        try:
            output_file.write(json.dumps(items, indent=2, sort_keys=True))
            output_file.write('\n')
        except IOError as e:
            raise RuntimeError(str(e))
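A hedged usage sketch for the Kidash helper above; the URL, doc_type and query are placeholders, and the module-level ALL and CHUNK_SIZE constants are assumed to be defined as in the original project:

kidash = Kidash('http://localhost:9200', index='.kibana', doc_type='doc')
query = {'query': {'match': {'type': 'index-pattern'}}}

# export everything matching the query to a file
with open('index-patterns.json', 'w') as fh:
    kidash.export_items(fh, query)

# or fetch the same documents by id list, which goes through es.mget
ids = kidash.list_item_ids(query)
items = kidash.retrieve_items_by_list(ids)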
Example #19
class _ES(object):
    def __init__(self, index, doc_type, host, port, timeout=300, **args):
        self.host = host
        self.port = port
        self.index = index
        self.doc_type = doc_type
        self.es = Elasticsearch(hosts=[{"host": self.host, "port": self.port}, ], timeout=timeout, **args)

    def check_properties(self, properties):
        """
        Check if all properties are known (e.g. have mappings), and creates mappings as needed
        """
        properties = set(properties)
        if not (properties - self.get_properties()):
            return
        to_add = properties - self.get_properties()
        if to_add:
            self.add_properties(to_add)

    def add_properties(self, to_add):
        """
        Add the named properties, setting mapping depending on suffix
        """
        mappings = {}
        for name in to_add:
            ftype = name.rsplit("_", 1)[1] if "_" in name else 'default'
            mappings[name] = settings.ES_MAPPING_TYPES[ftype]
        self.es.indices.put_mapping(index=self.index, doc_type=self.doc_type,
                                    body={"properties": mappings})

    def get_mapping(self):
        m = self.es.indices.get_mapping(self.index, self.doc_type)
        return m[self.index]['mappings'][self.doc_type]['properties']

    def get_properties(self):
        self.check_index()
        return set(self.get_mapping().keys())

    def refresh(self):
        self.es.indices.refresh()

    def highlight_article(self, aid: int, query: str) -> dict:
        """Highlight article given by an article id using a Lucene query. The resulting strings
        are safe to insert into an HTML document even if the original document contained malicious
        constructs.

        If you need the original article including HTML, call html.unescape on this output."""
        from amcat.tools.amcates_queryset import ESQuerySet

        qs = ESQuerySet().filter(id=aid).only("text", "title").highlight(query, mark="em")

        try:
            return next(iter(qs)).to_dict()
        except StopIteration:
            raise ValueError("Article(id={}) not found in elastic index.".format(aid))

    def clear_cache(self):
        self.es.indices.clear_cache()

    def delete_index(self):
        try:
            self.es.indices.delete(self.index)
        except NotFoundError:
            pass
        except Exception as e:
            if 'IndexMissingException' in str(e):
                return
            raise

    def create_index(self, shards=5, replicas=1):
        es_settings = settings.ES_SETTINGS.copy()
        es_settings.update({"number_of_shards": shards,
                            "number_of_replicas": replicas})

        body = {
            "settings": es_settings,
            "mappings": {
                settings.ES_ARTICLE_DOCTYPE: settings.ES_MAPPING
            }
        }

        self.es.indices.create(self.index, body)

    def check_index(self):
        """
        Check whether the server is up and the index exists.
        If the server is down, raise an exception.
        If the index does not exist, try to create it.
        """
        if not self.es.ping():
            raise Exception("Elastic server cannot be reached")
        if not self.es.indices.exists(self.index):
            log.info("Index {self.index} does not exist, creating".format(**locals()))
            self.create_index()
        return self.es.cluster.health(self.index, wait_for_status='yellow')

    def exists_type(self, doc_type, **kargs):
        return self.es.indices.exists_type(index=self.index, doc_type=doc_type, **kargs)

    def put_mapping(self, doc_type, body, **kargs):
        return self.es.indices.put_mapping(index=self.index, doc_type=doc_type, body=body, **kargs)

    def status(self):
        nodes = self.es.nodes.info()['nodes'].values()
        return {"ping": self.es.ping(),
                "nodes": [n['name'] for n in nodes],
                "index": self.index,
                "index_health": self.es.cluster.health(self.index),
                "transport_hosts": self.es.transport.hosts,
                }

    def get(self, id, **options):
        """
        Get a single article from the index
        """
        kargs = dict(index=self.index, doc_type=self.doc_type)
        kargs.update(options)
        return self.es.get_source(id=id, **kargs)

    def mget(self, ids, doc_type=None, parents=None):
        """
        Get multiple articles from the index.
        If parents is given, it should be a sequence of the same length as ids
        """
        if parents is None: parents = [None] * len(ids)
        if doc_type is None: doc_type = self.doc_type
        getdocs = [{"_index": self.index, "_id": id, "_parent": parent, "_type": doc_type}
                   for (id, parent) in zip(ids, parents)]
        return self.es.mget({"docs": getdocs})['docs']

    def search(self, body, **options):
        """
        Perform a 'raw' search on the underlying ES index
        """
        kargs = dict(index=self.index, doc_type=self.doc_type)
        kargs.update(options)
        if log.isEnabledFor(logging.DEBUG):
            # pprint can be expensive
            log.debug("Search with body:\n {}".format(pprint.pformat(body)))
        return self.es.search(body=body, **kargs)

    def scan(self, query, **kargs):
        """
        Perform a scan query on the es index
        See: http://elasticsearch-py.readthedocs.org/en/latest/helpers.html#elasticsearch.helpers.scan
        """
        return scan(self.es, index=self.index, doc_type=self.doc_type, query=query, **kargs)

    def query_ids(self, query=None, filters=EMPTY_RO_DICT, body=None, limit=None, **kwargs):
        """
        Query the index returning a sequence of article ids for the matched articles

        @param query: an elastic query string (i.e. lucene syntax, e.g. 'piet AND (ja* OR klaas)')
        @param filter: field filter DSL query dict
        @param body: if given, use this instead of constructing from query/filters
        @param filters: if filter is None, build filter from filters as accepted by build_query, e.g. sets=12345

        Note that query and filters can be combined in a single call
        """
        if body is None:
            body = dict(build_body(query, filters, query_as_filter=True))

        log.debug("query_ids with body:\n {}".format(pprint.pformat(body)))
        for i, a in enumerate(scan(self.es, query=body, index=self.index, doc_type=self.doc_type,
                                   size=(limit or 1000), _source=False)):
            if limit and i >= limit:
                return
            yield int(a['_id'])

    def query(self, query=None, filters=EMPTY_RO_DICT, highlight=False, lead=False, _source=(), score=True, **kwargs):
        """
        Execute a query for the given fields with the given query and filter
        @param query: an elastic query string (i.e. lucene syntax, e.g. 'piet AND (ja* OR klaas)')
        @param filter: field filter DSL query dict, defaults to build_filter(**filters)
        @param kwargs: additional keyword arguments to pass to es.search, eg fields, sort, from_, etc
        @return: a list of named tuples containing id, score, and the requested fields
        """
        body = dict(build_body(query, filters, query_as_filter=(not (highlight or score))))
        if highlight and not score:
            body['query'] = {'constant_score': {'query': body['query']}}

        if 'sort' in kwargs:
            body['track_scores'] = True

        if highlight and query:
            if isinstance(highlight, dict):
                body['highlight'] = highlight
            else:
                body['highlight'] = HIGHLIGHT_OPTIONS
        if lead or False and query == "" and highlight:
            body['script_fields'] = {"lead": {"script": LEAD_SCRIPT_FIELD}}

        result = self.search(body, _source=_source, **kwargs)
        return SearchResult(result, _source, score, body, query=query)

    def query_all(self, *args, **kargs):
        kargs.update({"from_": 0})
        size = kargs.setdefault('size', 10000)
        result = self.query(*args, **kargs)
        total = result.total
        for offset in range(size, total, size):
            kargs['from_'] = offset
            result2 = self.query(*args, **kargs)
            result.hits += result2.hits

        return result

    def _get_used_properties(self, body__prop):
        body, prop = body__prop
        body["query"]["bool"]["must"][1]["exists"]["field"] = prop
        return bool(self.es.count(index=self.index, doc_type=self.doc_type, body=body)['count'])

    def get_used_properties(self, set_ids=None, article_ids=None, **filters):
        """
        Returns a sequence of property names in use in the specified set(s) (or set ids)
        """
        if set_ids is not None:
            filters["sets"] = set_ids

        if article_ids is not None:
            filters["ids"] = article_ids

        all_properties = self.get_properties()
        flexible_properties = set(all_properties) - set(ALL_FIELDS)

        body = {"query": {"bool": {"must": [
            build_filter(**filters),
            {"exists": {"field": "fakeprop"}}
        ]}}}

        bodies = (copy.deepcopy(body) for _ in range(len(flexible_properties)))
        pool = ThreadPool()
        results = pool.imap(self._get_used_properties, zip(bodies, flexible_properties))

        try:
            for found, prop in zip(results, flexible_properties):
                if found:
                    yield prop
        finally:
            pool.close()

    def add_articles(self, article_ids, batch_size=1000):
        """
        Add the given article_ids to the index. This is done in batches, so there
        is no limit on the length of article_ids (which can be a generator).
        """
        # WvA: remove redundancy with create_articles
        if not article_ids: return
        from amcat.models import Article, ArticleSetArticle

        n = len(article_ids) // batch_size
        for i, batch in enumerate(splitlist(article_ids, itemsperbatch=batch_size)):
            log.info("Adding batch {i}/{n}".format(**locals()))
            all_sets = multidict((aa.article_id, aa.articleset_id)
                                 for aa in ArticleSetArticle.objects.filter(article__in=batch))
            dicts = (get_article_dict(article, list(all_sets.get(article.id, [])))
                     for article in Article.objects.filter(pk__in=batch))
            self.bulk_insert(dicts, batch_size=None)

    def remove_from_set(self, setid, article_ids, flush=True):
        """Remove the given articles from the given set. This is done in batches, so there
        is no limit on the length of article_ids (which can be a generator)."""
        if not article_ids: return
        for batch in splitlist(article_ids, itemsperbatch=1000):
            self.bulk_update(batch, UPDATE_SCRIPT_REMOVE_FROM_SET, params={'set': setid})

    def add_to_set(self, setid, article_ids, monitor=NullMonitor()):
        """Add the given articles to the given set. This is done in batches, so there
        is no limit on the length of article_ids (which can be a generator)."""

        if not article_ids:
            if monitor:
                monitor.update()
            return

        batches = [set(batch) for batch in splitlist(article_ids, itemsperbatch=1000)]
        monitor = monitor.submonitor(total=len(batches))

        nbatches = len(batches)
        for i, batch in enumerate(batches):
            monitor.update(message="Adding batch {iplus}/{nbatches}..".format(iplus=i + 1, nbatches=nbatches))
            missing = batch - set(self.in_index(batch))
            if missing:
                logging.warning("Adding {} missing articles to elastic".format(len(missing)))
                self.add_articles(missing)
            if batch - missing:
                self.bulk_update(batch - missing, UPDATE_SCRIPT_ADD_TO_SET, params={'set': setid})

    def get_tokens(self, aid: int, fields=["text", "title"]):
        """
        Get a list of all tokens (words and their positions) in the given document
        :param aid: Article ID
        :param fields: List of fields to get the terms for
        :return: a sequence of (field, position, term) tuples
        """
        fieldstr = ",".join(fields)
        data = self.es.termvectors(self.index, self.doc_type, aid, fields=fieldstr, field_statistics=False,
                                   payloads=False, offsets=False)
        for field in fields:
            if field in data['term_vectors']:
                for term, info in data['term_vectors'][field]['terms'].items():
                    for token in info['tokens']:
                        yield field, token['position'], term

    def bulk_insert(self, dicts, batch_size=1000, monitor=NullMonitor()):
        """
        Bulk insert the given articles in batches of batch_size
        """
        batches = list(toolkit.splitlist(dicts, itemsperbatch=batch_size)) if batch_size else [dicts]
        monitor = monitor.submonitor(total=len(batches))
        nbatches = len(batches)
        for i, batch in enumerate(batches):
            monitor.update(1, "Adding batch {iplus}/{nbatches}".format(iplus=i + 1, **locals()))
            props, articles = set(), {}
            for d in batch:
                props |= (set(d.keys()) - ALL_FIELDS)
                articles[d["id"]] = serialize(d)
            self.check_properties(props)
            body = get_bulk_body(articles)
            resp = self.es.bulk(body=body, index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE)
            if resp["errors"]:
                raise ElasticSearchError(resp)

    def update_values(self, article_id, values):
        """Update properties of existing article.

        @param values: mapping from field name to (new) value
        @type values: dict"""
        return self.bulk_update_values({article_id: values})

    def bulk_update_values(self, articles):
        """Updates set of articles in bulk.
        """
        body = get_bulk_body({aid: serialize({"doc": a}) for aid, a in articles.items()}, action="update")
        resp = self.es.bulk(body=body, index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE)

        if resp["errors"]:
            raise ElasticSearchError(resp)

    def bulk_update(self, article_ids, script, params):
        """
        Execute a bulk update script with the given params on the given article ids.
        """
        payload = serialize({"script": dict(script, params=params)})
        body = get_bulk_body({aid: payload for aid in article_ids}, action="update")
        resp = self.es.bulk(body=body, index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE)

        if resp["errors"]:
            raise ElasticSearchError(resp)

    def synchronize_articleset(self, aset, full_refresh=False):
        """
        Make sure the given articleset is correctly stored in the index
        @param full_refresh: if true, re-add all articles to the index. Use this
                             after changing properties of articles
        """
        self.check_index()  # make sure index exists and is at least 'yellow'

        log.debug("Getting SOLR ids from set")
        solr_set_ids = set(self.query_ids(filters=dict(sets=[aset.id])))
        log.debug("Getting DB ids")
        db_ids = aset.get_article_ids()
        log.debug("Getting SOLR ids")
        solr_ids = set(self.in_index(db_ids))

        to_remove = solr_set_ids - db_ids
        if full_refresh:
            to_add_docs = db_ids
            to_add_set = set()
        else:
            to_add_docs = db_ids - solr_ids
            to_add_set = (db_ids & solr_ids) - solr_set_ids

        log.warning("Refreshing index, full_refresh={full_refresh},"
                    "|solr_set_ids|={nsolrset}, |db_set_ids|={ndb}, |solr_ids|={nsolr} "
                    "|to_add| = {nta}, |to_add_set|={ntas}, |to_remove_set|={ntr}"
                    .format(nsolr=len(solr_ids), nsolrset=len(solr_set_ids), ndb=len(db_ids),
                            nta=len(to_add_docs), ntas=len(to_add_set), ntr=len(to_remove), **locals()))

        log.info("Removing {} articles".format(len(to_remove)))
        self.remove_from_set(aset.id, to_remove)
        log.info("Adding {} articles to set".format(len(to_add_set)))
        self.add_to_set(aset.id, to_add_set)
        log.info("Adding {} articles to index".format(len(to_add_docs)))
        self.add_articles(to_add_docs)
        log.info("Refreshing")
        self.refresh()

    def _count(self, body):
        """Raw version of count directly passing given query to elastic, while setting the index and doc_type"""
        return self.es.count(index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE, body=body)

    def count(self, query=None, filters=None):
        """
        Compute the number of items matching the given query / filter
        """
        filters = dict(build_body(query, filters, query_as_filter=True))
        body = {"query": {"constant_score": filters}}
        return self._count(body)["count"]

    def search_aggregate(self, aggregation, query=None, filters=None, **options):
        """
        Run an aggregate search query and return the aggregation results
        @param aggregation: raw elastic query, e.g. {"terms" : {"field" : "medium"}}
        """
        body = dict(query={"filtered": dict(build_body(query, filters, query_as_filter=True))},
                    aggregations={"aggregation": aggregation})
        result = self.search(body, size=0, **options)
        return result['aggregations']['aggregation']

    def _parse_terms_aggregate(self, aggregate, group_by, terms, sets):
        if not group_by:
            for term in terms:
                yield term, aggregate[term.label]['doc_count']
        else:
            for term in terms:
                yield term, self._parse_aggregate(aggregate[term.label], list(group_by), terms, sets)

    def _parse_other_aggregate(self, aggregate, group_by, group, terms, sets):
        buckets = aggregate[group]["buckets"]
        if not group_by:
            return ((b['key'], b['doc_count']) for b in buckets)
        return ((b['key'], self._parse_aggregate(b, list(group_by), terms, sets)) for b in buckets)

    def _parse_aggregate(self, aggregate, group_by, terms, sets):
        """Parse a aggregation result to (nested) namedtuples."""
        group = group_by.pop(0)

        if group == "terms":
            result = self._parse_terms_aggregate(aggregate, group_by, terms, sets)
        else:
            result = self._parse_other_aggregate(aggregate, group_by, group, terms, sets)
            if group == "sets" and sets is not None:
                # Filter sets if 'sets' is given
                result = ((aset_id, res) for aset_id, res in result if aset_id in set(sets))
            elif group == "date":
                # Parse timestamps as datetime objects
                result = ((get_date(stamp), aggr) for stamp, aggr in result)

        # Return results as namedtuples
        ntuple = namedtuple("Aggr", [safe_identifier(group), "buckets" if group_by else "count"])
        return [ntuple(*r) for r in result]

    def _build_aggregate(self, group_by, date_interval, terms, sets):
        """Build nested aggregation query for list of groups"""
        group = group_by.pop(0)

        if group == 'date':
            aggregation = {
                group: {
                    'date_histogram': {
                        'field': group,
                        'interval': date_interval,
                        "min_doc_count": 1
                    }
                }
            }
        elif group == 'terms':
            aggregation = {
                term.label: {
                    'filter': dict(build_body(term.query))['query']
                } for term in terms
            }
        else:
            aggregation = {
                group: {
                    'terms': {
                        # Default size is too small, we want to return all results
                        'size': 999999,
                        'field': group
                    }
                }
            }

        # We need to nest the other aggregations, see:
        # http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/search-aggregations.html
        if group_by:
            nested = self._build_aggregate(group_by, date_interval, terms, sets)
            for aggr in aggregation.values():
                aggr["aggregations"] = nested

        return aggregation

    def aggregate_query(self, query=None, filters=None, group_by=None, terms=None, sets=None, date_interval='month'):
        """
        Compute an aggregate query, e.g. select count(*) where <filters> group by <group_by>. If
        date is used as a group_by variable, date_interval is used to bin it. Multiple group_by
        values are supported and yield nested aggregations.

        You can group by terms by supplying "terms" to group_by. In addition, you will need to
        supply terms as a parameter, which consists of a list of SearchQuery's. Query is then used
        as a global filter, while terms are 'local'.

        @param query: an elastic query string (i.e. lucene syntax, e.g. 'piet AND (ja* OR klaas)')
        @param filters: field filters to restrict the selection
        @type group_by: list / tuple
        @param terms: list of SearchQuery objects; required when grouping on "terms"
        @param sets: if given, restrict the "sets" buckets to these set ids
        @param date_interval: interval used to bin the date group (default: 'month')
        """
        if isinstance(group_by, str):
            log.warning("Passing strings to aggregate_query(group_by) is deprecated.")
            group_by = [group_by]

        if "terms" in group_by and terms is None:
            raise ValueError("You should pass a list of terms if aggregating on it.")

        filters = dict(build_body(query, filters, query_as_filter=True))
        aggregations = self._build_aggregate(list(group_by), date_interval, terms, sets)

        body = {
            "query": {"constant_score": filters},
            "aggregations": aggregations
        }

        log.debug("es.search(body={body})".format(**locals()))
        result = self.search(body)
        result = self._parse_aggregate(result["aggregations"], list(group_by), terms, sets)
        return result
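    # Hedged usage sketch (not from the original source; assumes an instance `es` of this class):
    #
    #   counts = es.aggregate_query(query="piet", group_by=["medium", "date"],
    #                               date_interval="month")
    #   # counts is a list of Aggr(medium=..., buckets=[Aggr(date=datetime(...), count=n), ...])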

    def statistics(self, query=None, filters=None):
        """Compute and return a Result object with n, start_date and end_date for the selection"""
        body = {
            "query": {
                "constant_score": dict(
                    build_body(query, filters, query_as_filter=True)
                )
            },
            'aggregations': {
                'stats': {
                    'stats': {'field': 'date'}
                }
            }
        }

        stats = self.search(body, size=0)['aggregations']['stats']
        result = Result()
        result.n = stats['count']
        if result.n == 0:
            result.start_date, result.end_date = None, None
        else:
            result.start_date = get_date(stats['min'])
            result.end_date = get_date(stats['max'])
        return result
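    # Hedged usage sketch (assumes an instance `es` of this class):
    #
    #   stats = es.statistics(query="piet")
    #   # stats.n is the number of matching documents; stats.start_date / stats.end_date
    #   # are datetimes, or None if nothing matched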

    def list_dates(self, query=None, filters=None, interval="day"):
        from amcat.tools.aggregate_es import aggregate, IntervalCategory
        for date, count in aggregate(query, filters, [IntervalCategory(interval)], es=self):
            yield date

    def in_index(self, ids):
        """
        Check whether the given ids are already indexed.
        @return: a sequence of ids that are in the index
        """
        if not isinstance(ids, list): ids = list(ids)
        log.info("Checking existence of {nids} documents".format(nids=len(ids)))
        if not ids: return
        for batch in splitlist(ids, itemsperbatch=10000):
            result = self.es.mget(index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE,
                                  body={"ids": batch}, _source=[])
            for doc in result['docs']:
                if doc['found']: yield int(doc['_id'])

    def duplicate_exists(self, article):
        """
        Check whether a duplicate of the given article already exists.
        If so, returns the sets that the duplicate is a member of.
        Duplication is checked using the get_hash function, so article
        should be an object with the appropriate attributes (.title etc)
        @return: A (possibly empty) sequence of results with .id and .sets
        """
        hash = get_article_dict(article).hash
        return self.query(filters={'hashes': hash}, _source=["sets"], score=False)

    def _get_purge_actions(self, query):
        for id in self.query_ids(body=query):
            yield {
                "_op_type": "delete",
                "_id": id,
                "_index": self.index,
                "_type": settings.ES_ARTICLE_DOCTYPE
            }

    def purge_orphans(self):
        """Remove all articles without set from the index"""
        query = {"query": {"bool": {"must_not": {"exists": {"field": "sets"}}}}}
        return bulk(self.es, self._get_purge_actions(query))

    def get_child_type_counts(self, **filters):
        """Get the number of child documents per type"""
        filters = dict(build_body(filters=filters))
        filter = {"has_parent": {"parent_type": self.doc_type, "filter": filters['filter']}}
        aggs = {"module": {"terms": {"field": "_type"}}}
        body = {"aggs": {"prep": {"filter": filter, "aggs": aggs}}}
        r = self.es.search(index=self.index, size=0, body=body)
        for b in r['aggregations']['prep']['module']['buckets']:
            yield b['key'], b['doc_count']

    def get_articles_without_child(self, child_doctype, limit=None, **filters):
        """Return the ids of all articles without a child of the given doctype"""
        nochild = {"not": {"has_child": {"type": child_doctype,
                                         "query": {"match_all": {}}}}}
        filter = dict(build_body(filters=filters))['filter']
        body = {"filter": {"bool": {"must": [filter, nochild]}}}
        return self.query_ids(body=body, limit=limit)
Exemplo n.º 20
0
class SearchEngine(object):

    def __init__(self):
        #
        serializer = JSONSerializer()
        serializer.mimetype = 'application/json'
        serializer.dumps = serializer.serialize
        serializer.loads = JSONDeserializer().deserialize
        self.es = Elasticsearch(hosts=settings.ELASTICSEARCH_HOSTS, serializer=serializer, **settings.ELASTICSEARCH_CONNECTION_OPTIONS)
        self.logger = logging.getLogger(__name__)

    def delete(self, **kwargs):
        """
        Deletes a document from the index
        Pass an index, doc_type, and id to delete a specific document
        Pass a body with a query dsl to delete by query

        """

        body = kwargs.pop('body', None)
        if body != None:
            try:
                data = []
                refresh = kwargs.pop('refresh', False)
                for hit in helpers.scan(self.es, query=body, **kwargs):
                    hit['_op_type'] = 'delete'
                    data.append(hit)

                return helpers.bulk(self.es, data, refresh=refresh, **kwargs)
            except Exception as detail:
                self.logger.warning('%s: WARNING: failed to delete document by query: %s \nException detail: %s\n' % (datetime.now(), body, detail))
                raise detail
        else:
            try:
                return self.es.delete(ignore=[404], **kwargs)
            except Exception as detail:
                self.logger.warning('%s: WARNING: failed to delete document: %s \nException detail: %s\n' % (datetime.now(), body, detail))
                raise detail
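    # Hedged usage sketch (index/doc_type/id values below are placeholders):
    #
    #   se = SearchEngine()
    #   se.delete(index='resource', doc_type='entity', id='some-id')       # delete one document
    #   se.delete(index='resource', body={'query': {'match_all': {}}})     # delete by query via scan + bulk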

    def delete_index(self, **kwargs):
        """
        Deletes an entire index

        """

        index = kwargs.get('index', '').strip()
        print 'deleting index : %s' % index
        return self.es.indices.delete(index=index, ignore=[400, 404])

    def search(self, **kwargs):
        """
        Search for an item in the index.
        Pass an index, doc_type, and id to get a specific document
        Pass a body with a query dsl to perform a search

        """

        body = kwargs.get('body', None)
        index = kwargs.get('index', None)
        id = kwargs.get('id', None)

        if index is None:
            raise NotImplementedError("You must specify an 'index' in your call to search")

        if id:
            if isinstance(id, list):
                kwargs.setdefault('body', {'ids': kwargs.pop('id')})
                return self.es.mget(**kwargs)
            else:
                return self.es.get(**kwargs)

        ret = None
        try:
            ret = self.es.search(**kwargs)
        except Exception as detail:
            self.logger.warning('%s: WARNING: search failed for query: %s \nException detail: %s\n' % (datetime.now(), body, detail))
            pass

        return ret

    def create_mapping(self, index, doc_type, fieldname='', fieldtype='string', fieldindex=None, body=None):
        """
        Creates an Elasticsearch body for a single field given an index name and type name

        """

        if not body:
            if fieldtype == 'geo_shape':
                body =  {
                    doc_type : {
                        'properties' : {
                            fieldname : { 'type' : 'geo_shape', 'tree' : 'geohash', 'precision': '1m' }
                        }
                    }
                }
            else:
                fn = { 'type' : fieldtype }
                if fieldindex:
                    fn['index'] = fieldindex
                body =  {
                    doc_type : {
                        'properties' : {
                            fieldname : fn
                        }
                    }
                }

        self.es.indices.create(index=index, ignore=400)
        self.es.indices.put_mapping(index=index, doc_type=doc_type, body=body)
        print 'creating index : %s/%s' % (index, doc_type)
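    # Hedged usage sketch (index and field names below are placeholders, not from the source):
    #
    #   se.create_mapping('resource', 'entity', fieldname='geometry', fieldtype='geo_shape')
    #   se.create_mapping('resource', 'entity', fieldname='label',
    #                     fieldtype='string', fieldindex='not_analyzed')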

    def create_index(self, **kwargs):
        self.es.indices.create(**kwargs)
        print 'creating index : %s' % kwargs.get('index', '')

    def index_data(self, index=None, doc_type=None, body=None, idfield=None, id=None, **kwargs):
        """
        Indexes a document or list of documents into Elasticsearch

        If "id" is supplied then will use that as the id of the document

        If "idfield" is supplied then will try to find that property in the
            document itself and use the value found for the id of the document

        """

        if not isinstance(body, list):
            body = [body]

        for document in body:
            if idfield is not None:
                if isinstance(document, dict):
                    id = document[idfield]
                else:
                    id = getattr(document,idfield)

            try:
                self.es.index(index=index, doc_type=doc_type, body=document, id=id)
            except Exception as detail:
                self.logger.warning('%s: WARNING: failed to index document: %s \nException detail: %s\n' % (datetime.now(), document, detail))
                raise detail
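    # Hedged usage sketch (documents and names below are placeholders):
    #
    #   se.index_data(index='resource', doc_type='entity',
    #                 body={'pk': 1, 'label': 'foo'}, idfield='pk')    # id taken from each document
    #   se.index_data(index='resource', doc_type='entity',
    #                 body={'label': 'bar'}, id='fixed-id')            # id supplied explicitly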


    def bulk_index(self, data):
        return helpers.bulk(self.es, data, chunk_size=500, raise_on_error=True)

    def create_bulk_item(self, op_type='index', index=None, doc_type=None, id=None, data=None):
        return {
            '_op_type': op_type,
            '_index': index,
            '_type': doc_type,
            '_id': id,
            '_source': data
        }

    def count(self, **kwargs):
        count = self.es.count(**kwargs)
        if count is not None:
            return count['count']
        else:
            return None
Exemplo n.º 21
0
class CreateIndex(object):
    def __init__(self):
        self.db = cx_Oracle.connect("dmatdmp/D_Matdmp#[email protected]:1521/dmat")
        self.cursor = self.db.cursor()
        self.es = Elasticsearch("localhost:9200")  # 本地测试

    def create_index(self, index, doc_type):
        '''
        Define the index structure es_body, create the index from it, and report whether creation succeeded
        :param index: name of the index to create
        :param doc_type: document type to create
        :return:
        '''
        es_body = {
            "settings": {
                "number_of_shards": 5,
                "number_of_replicas": 0,
                "analysis": {
                    "analyzer": {
                        "english_standard_analyzer": {
                            "type": "standard",  # 标准分词器
                            "stopwords": "_english_",  # 去除英文停止词
                            "tokenizer": "standard",  # 使用标准分词器(以非字符、下划线的字符进行分割)
                            "filter": ["lowercase"]  # 小写,可以在后面增加任何token filter
                        },
                        "english_comma_pat_analyzer": {
                            "type": "pattern",  # 模式分词器
                            "pattern": ",",  # 使用逗号分隔模式(英文逗号)
                            "stopwords": "_english_",  # 去除英文停止词
                            "lowercase": "true"  # 小写
                        }
                    }
                }
            },
            "mappings": {
                doc_type: {
                    "properties": {
                        "tbl_id": {
                            "type": "keyword"
                        },
                        "sys_id": {
                            "type": "keyword"
                        },
                        "sys_name": {
                            "type": "text",
                            "analyzer": "ik_max_word",
                            "fields": {
                                "raw": {
                                    "type": "keyword"
                                }
                            }
                        },
                        "owner": {
                            "type": "keyword"
                        },
                        "tbl_name": {
                            "analyzer": "standard",  # TODO: 测试
                            "type": "text",
                            "fields": {
                                "raw": {
                                    "type": "keyword"
                                }   
                            }
                        },
                        "col_names": {
                            "analyzer": "english_comma_pat_analyzer",
                            "type": "text"
                        },
                        "col_comments": {
                            "analyzer": "ik_smart",
                            "type": "text"
                        },
                        "sys_name_alias": {
                            "analyzer": "ik_max_word",
                            "type": "text"
                        }
                    }
                }
            }
        }
        if not self.es.indices.exists(index=index):
            try:
                self.es.indices.create(index=index, body=es_body)
                print("索引创建成功...")
                return True
            except Exception as e:
                print(e, "索引创建失败!")
                return False
        else:
            return False
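    # Hedged usage sketch (index/doc_type names below are placeholders):
    #
    #   ci = CreateIndex()
    #   if ci.create_index("table_metadata", "doc"):
    #       print("index is ready")
    #   # returns False if the index already exists or creation failed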

    def _gen_data(self, index, doc_type, batch_chunk_size):
        '''
        Generator that yields batches of bulk actions
        :param index: index to insert data into
        :param doc_type: document type of the index
        :param batch_chunk_size: number of documents per bulk batch
        :return:
        '''
        sql = """select * from tem_search_engine_1 """  # TODO: 提取sql语句作为参数
        self.cursor.execute(sql)
        col_name_list = [col[0].lower() for col in self.cursor.description]
        col_name_len = len(col_name_list)
        actions = []

        start = time.time()
        for row in self.cursor:
            source = {}
            tbl_id = ""
            for i in range(col_name_len):
                source.update({col_name_list[i]: str(row[i])})
                if col_name_list[i] == "tbl_id":
                    tbl_id = row[i]
            action = {
                "_index": index,
                "_type": doc_type,
                "_id": tbl_id,  # TODO:判空
                "_source": source
            }
            actions.append(action)
            if len(actions) == batch_chunk_size:
                print("actions增加数据用时:", time.time()-start)
                yield actions
                actions = []
        print("for总用时:", time.time()-start)
        yield actions

    def _gen_parallel_data(self, index, doc_type):
        sql = """select * from tem_search_engine_1"""  # TODO: 提取sql语句作为参数
        self.cursor.execute(sql)
        col_name_list = [col[0].lower() for col in self.cursor.description]
        col_name_len = len(col_name_list)
        for row in self.cursor:
            source = {}
            tbl_id = ""
            for i in range(col_name_len):
                source.update({col_name_list[i]: str(row[i])})
                if col_name_list[i] == "tbl_id":
                    tbl_id = row[i]
            action = {
                "_index": index,
                "_type": doc_type,
                "_id": tbl_id,  # TODO:判空
                "_source": source
            }
            yield action

    def bulk_data(self, index, doc_type, is_parallel=True, batch_chunk_size=5000, threads_counts=8):
        '''
        Bulk-insert data
        :param index: index to insert data into
        :param doc_type: document type of the index
        :param batch_chunk_size: bulk batch size; only used for non-parallel insertion
        :param is_parallel: whether to insert in parallel (default: parallel)
        :param threads_counts: number of threads (default: 8); only effective when inserting in parallel
        :return:
        '''
        if is_parallel is None or is_parallel == True:
            gen_action = self._gen_parallel_data(index, doc_type)
            print("正在并行插入数据...")
            start = time.time()
            for success, info in helpers.parallel_bulk(client=self.es, actions=gen_action, thread_count=threads_counts, chunk_size=1000):
                if not success:
                    print("Insert failed: ", info)
            print("插入数据成功... ", time.time()-start)
        elif is_parallel == False:
            gen_action = self._gen_data(index, doc_type, batch_chunk_size)
            try:
                print("正在插入数据...")
                t3 = time.time()
                helpers.bulk(client=self.es, actions=gen_action, chunk_size=500)
                print("插入成功....", time.time() - t3)
            except Exception as e:
                print(e, "Insert failed!")
        else:
            raise ValueError("is_parallel应该为True或False")

    def exists_doc(self, index, doc_type, doc_id, source=False):
        '''
        Check whether a document exists in the index
        :param index:
        :param doc_type:
        :param doc_id:
        :param source:
        :return:
        '''
        return self.es.exists(index=index, doc_type=doc_type, id=doc_id, _source=source)

    def get_doc(self, index, doc_type, id):
        '''

        :param index:
        :param doc_type:
        :param id:
        :return:
        '''
        return self.es.get(index=index, doc_type=doc_type, id=id)

    def get_docs(self, index, doc_type, body, source=False):
        '''

        ============================EXAMPLE===================================
        createindex = CreateIndex()
        body = {
             "docs": [
                {"_id": "7970C657B49BA14AE050A8C0EBA07C72"},
                {"_id": "7970C657B49EA14AE050A8C0EBA07C72"}
            ]
        }
        print(createindex.get_docs("example_index", "examplecase", body))

        body = {
        "ids": [ "7970C657B49BA14AE050A8C0EBA07C72" "7970C657B49EA14AE050A8C0EBA07C72"]
        }
        print(createindex.get_docs("example_index", "examplecase", body))
        ======================================================================
        :param index:
        :param doc_type:
        :param body: 根据"docs"或"ids"获取多条文档信息
        :param source: 是否返回展示原始数据,默认为False
        :return:
        '''
        return self.es.mget(index=index, doc_type=doc_type, body=body, _source=source)

    def update_doc(self, index, doc_type, id, body):
        '''

        :param index:
        :param doc_type:
        :param id:
        :param body:
        :return:
        '''
        self.es.update(index=index, doc_type=doc_type, id=id, body=body)

    def delete_index(self, index):
        '''

        :param index:
        :return:
        '''
        return self.es.indices.delete(index=index)

    def delete_docs(self, index, doc_type, doc_id):
        '''

        :param index:
        :param doc_type:
        :param doc_id:
        :return:
        '''
        return self.es.delete(index=index, doc_type=doc_type, id=doc_id)

    def delete_by_query(self, index, doc_type, body, source):
        '''
        Delete documents matching a query
        :param index:
        :param doc_type:
        :param body:
        :return:
        '''
        return self.es.delete_by_query(index=index, doc_type=doc_type, body=body, _source=source)

    def get_info(self, **kwargs):
        return self.es.info(**kwargs)
Exemplo n.º 22
0
class ThreadingTests(TestCase):
    def setUp(self):
        self.es = Elasticsearch(ES_NODES)
        print GAME_QUEUE, Tasks.redis.llen(GAME_QUEUE)
        print USER_QUEUE, Tasks.redis.llen(USER_QUEUE)
        print GAME_SET, Tasks.redis.scard(GAME_SET)
        print USER_SET, Tasks.redis.scard(USER_SET)
        print TO_CRUNCHER, Tasks.redis.llen(TO_CRUNCHER)
        Tasks.new_games = 0
        print "Deleting the above-listed Redis keys."
        for key in GAME_QUEUE, USER_QUEUE, GAME_SET, USER_SET, TO_CRUNCHER:
            Tasks.redis.delete(key)
        self.es.delete_by_query(index=TEST_ES_INDEX, doc_type=GAME_DOCTYPE, body={"query": {"match_all": {}}})
        print "Be patient (10s) - making sure API is available"
        sleep(10)
        print "Ready!"

    def test_games_make_it_to_elasticsearch_in_reasonable_time(self):
        Tasks.add(TEST_GAMES, [])
        wt = WatcherThread(TEST_KEY, cycles=1)
        wt.start()
        REASONABLE_TIME = 20  # seconds
        with timeout(REASONABLE_TIME):
            while True:
                try:
                    # TODO - assert that the all items made it to ES
                    docs = self.es.mget(index=TEST_ES_INDEX, doc_type=GAME_DOCTYPE, body={'ids': TEST_GAMES})['docs']
                    assert all([d['found'] for d in docs])
                    break
                except:
                    pass
                sleep(0.1)
        wt.join()

        # 1. check that the game queue is now empty
        ONE_SHITLOAD = 10000
        self.assertGreater(ONE_SHITLOAD, Tasks.redis.llen(GAME_QUEUE))
        newly_queued_games = Tasks.redis._bulk_rpop(GAME_QUEUE, ONE_SHITLOAD)
        self.assertEquals(len(set(newly_queued_games)), 0)

        # 2. check that processed games made it to the GAME_SET
        self.assertEquals(Tasks.redis.scard(GAME_SET), len(set(TEST_GAMES)))
        items, is_old = zip(*Tasks.redis._intersect(GAME_SET, TEST_GAMES, insert=False))
        self.assertTrue(all(is_old))

    def test_games_and_users_properly_queued(self):
        # Init with 10 games and 5 users
        Tasks.add(TEST_GAMES, TEST_USERS)
        wt = WatcherThread(TEST_KEY, cycles=1)
        wt.run()

        # 1. check that none of the test games are now currently queued
        ONE_SHITLOAD = 10000
        newly_queued_games = Tasks.redis._bulk_rpop(GAME_QUEUE, ONE_SHITLOAD)
        self.assertEquals(len(set(newly_queued_games) & set(TEST_GAMES)), 0)

        # 2. check that seeded TEST_GAMEs are still in GAME_SET after the second iteration
        items, is_old = zip(*Tasks.redis._intersect(GAME_SET, TEST_GAMES, insert=False))
        self.assertTrue(all(is_old))

        # 3. check that some new users got added
        self.assertNotEqual(Tasks.redis.scard(USER_SET), 0)

        # 4. check that some new games got added
        self.assertNotEqual(Tasks.redis.scard(GAME_SET), 0)

        # 5. check that game counts are accurate
        self.assertEquals(Tasks.new_games, len(TEST_GAMES) + len(newly_queued_games))

    def test_multi_thread(self):
        Tasks.add(TEST_MANY_GAMES, TEST_USERS)
        wt1 = WatcherThread(TEST_KEY, cycles=1)
        wt2 = WatcherThread(TEST_KEY2, cycles=1)

        wt1.start()
        wt2.start()

        wt1.join()
        wt2.join()

        # 1. check that the game counts are accurate
        self.assertEquals(Tasks.new_games, len(TEST_MANY_GAMES) + Tasks.redis.llen(GAME_QUEUE))
Exemplo n.º 23
0
def export_attachments(data_set_id,
                       outfile,
                       sender='',
                       attachment_extension='jpg',
                       date_bounds=None):
    print(
        "email.get_attachments_sender(index=%s, sender=%s, attachment_type=%s, date_bounds=%s)"
        % (data_set_id, sender, attachment_extension, date_bounds))
    if not data_set_id:
        print "invalid service call - missing index"
        return 1
    # elasticsearch.exceptions.ConnectionTimeout: ConnectionTimeout caused by - ReadTimeoutError(HTTPConnectionPool(host='10.1.70.143', port=9200): Read timed out. (read timeout=10))
    es = Elasticsearch([{"host": "10.1.70.143", "port": 9200}], timeout=60)

    # TODO get accurate count -- this is not strictly needed as attachments will be accessed as inner docs on the email_address
    max_inner_attachments_returned = 100000

    # Get all attachments by extension
    rows = []
    body = _attch_nested__ext_query(
        sender,
        attachment_extension,
        date_bounds,
        max_inner_attachments_returned=max_inner_attachments_returned)
    print body
    addresses_count = es.count(index=data_set_id,
                               doc_type="email_address",
                               body=body)["count"]
    print "total addresses: " + str(addresses_count)
    addresses = es.search(index=data_set_id,
                          doc_type="email_address",
                          body=body,
                          size=addresses_count)
    for address in addresses["hits"]["hits"]:
        rows += [[
            address["_source"]["addr"], attachment["_source"]["guid"],
            attachment["_source"]["filename"],
            attachment["_source"]["datetime"]
        ] for attachment in address["inner_hits"]["sender_attachments"]["hits"]
                 ["hits"]]

    print "total attachments: " + str(len(rows))

    #  start tar.gz
    # tar = tarfile.open(mode='w:gz', name="big-export.tar.gz")
    # Start tar
    tar = tarfile.open(mode='w', name=outfile)

    csv_string_buffer = cStringIO.StringIO()
    csv_file = csv.writer(csv_string_buffer)

    # Add all rows to attachment csv
    csv_file.writerows(rows)
    tarinfo = tarfile.TarInfo("attachments.csv")

    tarinfo.size = csv_string_buffer.tell()
    tarinfo.mode = 0644
    tarinfo.mtime = time.time()
    csv_string_buffer.seek(0)

    tar.addfile(tarinfo, csv_string_buffer)

    # This is the buffer size of how many attachments to pull from ES at each iteration
    num_returned = 3
    index = 0
    # Paging
    while index < len(rows):
        # Get num_returned attachments from ES
        attachments = es.mget(index=data_set_id,
                              doc_type="attachments",
                              body={
                                  "docs":
                                  [{
                                      "_id": row[1]
                                  }
                                   for row in rows[index:index + num_returned]]
                              })
        index += num_returned

        # Add all attachments to the archive
        for attachment_source in attachments["docs"]:
            attachment = attachment_source["_source"]
            filename = attachment["filename"]
            attch_data = str(base64.b64decode(attachment["contents64"]))

            tarinfo_attch = tarfile.TarInfo(attachment["guid"] + "/" +
                                            filename)
            tarinfo_attch.size = len(attch_data)
            tarinfo_attch.mode = 0644
            tarinfo_attch.mtime = time.time()
            tar.addfile(tarinfo_attch, cStringIO.StringIO(attch_data))

    tar.close()
Exemplo n.º 24
0
class SearchEngine(object):
    def __init__(self):
        #
        serializer = JSONSerializer()
        serializer.mimetype = 'application/json'
        serializer.dumps = serializer.serialize
        serializer.loads = JSONDeserializer().deserialize
        self.es = Elasticsearch(hosts=settings.ELASTICSEARCH_HOSTS,
                                serializer=serializer,
                                **settings.ELASTICSEARCH_CONNECTION_OPTIONS)
        self.logger = logging.getLogger(__name__)

    def delete(self, **kwargs):
        """
        Deletes a document from the index
        Pass an index, doc_type, and id to delete a specific document
        Pass a body with a query dsl to delete by query

        """

        body = kwargs.pop('body', None)
        if body != None:
            try:
                data = []
                refresh = kwargs.pop('refresh', False)
                for hit in helpers.scan(self.es, query=body, **kwargs):
                    hit['_op_type'] = 'delete'
                    data.append(hit)

                return helpers.bulk(self.es, data, refresh=refresh, **kwargs)
            except Exception as detail:
                self.logger.warning(
                    '%s: WARNING: failed to delete document by query: %s \nException detail: %s\n'
                    % (datetime.now(), body, detail))
                raise detail
        else:
            try:
                return self.es.delete(ignore=[404], **kwargs)
            except Exception as detail:
                self.logger.warning(
                    '%s: WARNING: failed to delete document: %s \nException detail: %s\n'
                    % (datetime.now(), body, detail))
                raise detail

    def delete_index(self, **kwargs):
        """
        Deletes an entire index

        """

        index = kwargs.get('index', '').strip()
        print 'deleting index : %s' % index
        return self.es.indices.delete(index=index, ignore=[400, 404])

    def search(self, **kwargs):
        """
        Search for an item in the index.
        Pass an index, doc_type, and id to get a specific document
        Pass a body with a query dsl to perform a search

        """

        body = kwargs.get('body', None)
        index = kwargs.get('index', None)
        id = kwargs.get('id', None)

        if index is None:
            raise NotImplementedError(
                "You must specify an 'index' in your call to search")

        if id:
            if isinstance(id, list):
                kwargs.setdefault('body', {'ids': kwargs.pop('id')})
                return self.es.mget(**kwargs)
            else:
                return self.es.get(**kwargs)

        ret = None
        try:
            ret = self.es.search(**kwargs)
        except Exception as detail:
            self.logger.warning(
                '%s: WARNING: search failed for query: %s \nException detail: %s\n'
                % (datetime.now(), body, detail))
            pass

        return ret

    def create_mapping(self,
                       index,
                       doc_type,
                       fieldname='',
                       fieldtype='string',
                       fieldindex=None,
                       body=None):
        """
        Creates an Elasticsearch body for a single field given an index name and type name

        """

        if not body:
            if fieldtype == 'geo_shape':
                body = {
                    doc_type: {
                        'properties': {
                            fieldname: {
                                'type': 'geo_shape',
                                'tree': 'geohash',
                                'precision': '1m'
                            }
                        }
                    }
                }
            else:
                fn = {'type': fieldtype}
                if fieldindex:
                    fn['index'] = fieldindex
                body = {doc_type: {'properties': {fieldname: fn}}}

        self.es.indices.create(index=index, ignore=400)
        self.es.indices.put_mapping(index=index, doc_type=doc_type, body=body)
        print 'creating index : %s/%s' % (index, doc_type)

    def create_index(self, **kwargs):
        self.es.indices.create(**kwargs)
        print 'creating index : %s' % kwargs.get('index', '')

    def index_data(self,
                   index=None,
                   doc_type=None,
                   body=None,
                   idfield=None,
                   id=None,
                   **kwargs):
        """
        Indexes a document or list of documents into Elasticsearch

        If "id" is supplied then will use that as the id of the document

        If "idfield" is supplied then will try to find that property in the
            document itself and use the value found for the id of the document

        """

        if not isinstance(body, list):
            body = [body]

        for document in body:
            if idfield is not None:
                if isinstance(document, dict):
                    id = document[idfield]
                else:
                    id = getattr(document, idfield)

            try:
                self.es.index(index=index,
                              doc_type=doc_type,
                              body=document,
                              id=id)
            except Exception as detail:
                self.logger.warning(
                    '%s: WARNING: failed to index document: %s \nException detail: %s\n'
                    % (datetime.now(), document, detail))
                raise detail

    def bulk_index(self, data):
        return helpers.bulk(self.es, data, chunk_size=500, raise_on_error=True)

    def create_bulk_item(self,
                         op_type='index',
                         index=None,
                         doc_type=None,
                         id=None,
                         data=None):
        return {
            '_op_type': op_type,
            '_index': index,
            '_type': doc_type,
            '_id': id,
            '_source': data
        }

    def count(self, **kwargs):
        count = self.es.count(**kwargs)
        if count is not None:
            return count['count']
        else:
            return None
Exemplo n.º 25
0
if __name__ == "__main__":
	ids_path = sys.argv[1]  # path to the file containing document ids
	documents_path = sys.argv[2]  # path where the fetched documents are written
	es_address = sys.argv[3]  # Elasticsearch address
	index_name = sys.argv[4]  # index name
	doc_type = sys.argv[5]  # document type
	batch_num = 1000

	es = Elasticsearch(hosts=[es_address], timeout=5000)

	batch = []
	body = {
		'ids': batch
	}
	with open(ids_path, mode='r') as source, open(documents_path, mode='w') as dest:
		for doc_id in source:
			batch.append(doc_id.strip())
			if len(batch) > batch_num:
				body['ids'] = batch
				docs = es.mget(index=index_name, doc_type=doc_type, body=body)
				for doc in translator(docs):
					dest.write(json.dumps(doc) + os.linesep)
				del batch[0:]

		if len(batch) > 0:
			body['ids'] = batch
			docs = es.mget(index=index_name, doc_type=doc_type, body=body)
			for doc in translator(docs):
				dest.write(json.dumps(doc) + os.linesep)
Exemplo n.º 26
0
    for uid in uid_list:
        if not result_data.has_key(uid):
            result_data[uid] = TOPIC_DICT
            uid_topic[uid] = ['life']

    return result_data, uid_topic


if __name__ == '__main__':
    from elasticsearch import Elasticsearch
    import json
    ES_CLUSTER_HOST = ['219.224.134.213:9205', '219.224.134.214:9205',\
                       '219.224.134.215:9205']
    es = Elasticsearch(ES_CLUSTER_HOST, timeout=600)
    index_name = 'fb_user_portrait'
    index_type = 'user'
    ids = ['544481513', '100010212181419']

    uid_list = []
    uid_weibo = {}
    res = es.mget(index=index_name, doc_type=index_type, body={'ids':
                                                               ids})['docs']
    for r in res:
        uid = r['_id']
        keywords = json.loads(r['_source']['filter_keywords'])
        uid_list.append(uid)
        uid_weibo[uid] = keywords
    result_data, uid_topic = topic_classfiy(uid_list, uid_weibo)
    print result_data
    print uid_topic
Exemplo n.º 27
0
class SearchEngine(object):
    def __init__(self):
        #
        serializer = JSONSerializer()
        serializer.mimetype = "application/json"
        serializer.dumps = serializer.serialize
        serializer.loads = JSONDeserializer().deserialize
        self.es = Elasticsearch(
            hosts=settings.ELASTICSEARCH_HOSTS, serializer=serializer, **settings.ELASTICSEARCH_CONNECTION_OPTIONS
        )
        self.logger = logging.getLogger(__name__)

    def delete(self, **kwargs):
        """
        Deletes a document from the index
        Pass an index, doc_type, and id to delete a specific document
        Pass a body with a query dsl to delete by query

        """

        body = kwargs.pop("body", None)
        if body != None:
            try:
                data = []
                refresh = kwargs.pop("refresh", False)
                for hit in helpers.scan(self.es, query=body, **kwargs):
                    hit["_op_type"] = "delete"
                    data.append(hit)

                return helpers.bulk(self.es, data, refresh=refresh, **kwargs)
            except Exception as detail:
                self.logger.warning(
                    "%s: WARNING: failed to delete document by query: %s \nException detail: %s\n"
                    % (datetime.now(), body, detail)
                )
                raise detail
        else:
            try:
                return self.es.delete(ignore=[404], **kwargs)
            except Exception as detail:
                self.logger.warning(
                    "%s: WARNING: failed to delete document: %s \nException detail: %s\n"
                    % (datetime.now(), body, detail)
                )
                raise detail

    def delete_index(self, **kwargs):
        """
        Deletes an entire index

        """

        index = kwargs.get("index", "").strip()
        print "deleting index : %s" % index
        return self.es.indices.delete(index=index, ignore=[400, 404])

    def search(self, **kwargs):
        """
        Search for an item in the index.
        Pass an index, doc_type, and id to get a specific document
        Pass a body with a query dsl to perform a search

        """

        body = kwargs.get("body", None)
        index = kwargs.get("index", None)
        id = kwargs.get("id", None)

        if index is None:
            raise NotImplementedError("You must specify an 'index' in your call to search")

        if id:
            if isinstance(id, list):
                kwargs.setdefault("body", {"ids": kwargs.pop("id")})
                return self.es.mget(**kwargs)
            else:
                return self.es.get(**kwargs)

        ret = None
        try:
            ret = self.es.search(**kwargs)
        except Exception as detail:
            self.logger.warning(
                "%s: WARNING: search failed for query: %s \nException detail: %s\n" % (datetime.now(), body, detail)
            )
            pass

        return ret

    def index_term(self, term, id, context="", options={}):
        """
        If the term is already indexed, then simply increment the count and add the id of the term to the existing index.
        If the term isn't indexed then add the index.

        id: a unique id associated with the term
        context: a uuid of a concept to associate with the term to render in the ui
        options: any additional information to associate with the term

        """

        if term.strip(" \t\n\r") != "":
            already_indexed = False
            count = 1
            ids = [id]

            try:
                # _id = unicode(term, errors='ignore').decode('utf-8').encode('ascii')
                _id = uuid.uuid3(uuid.NAMESPACE_DNS, "%s%s" % (hash(term), hash(context)))
                result = self.es.get(index="term", doc_type="value", id=_id, ignore=404)

                # print 'result: %s' % result
                if result["found"] == True:
                    ids = result["_source"]["ids"]
                    if id not in ids:
                        ids.append(id)
                else:
                    ids = [id]

                self.index_data(
                    "term",
                    "value",
                    {"term": term, "context": context, "options": options, "count": len(ids), "ids": ids},
                    id=_id,
                )

            except Exception as detail:
                self.logger.warning(
                    "%s: WARNING: search failed to index term: %s \nException detail: %s\n"
                    % (datetime.now(), term, detail)
                )
                raise detail
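    # Hedged usage sketch (term/id/context values below are placeholders):
    #
    #   se.index_term("well", id="resource-1", context="concept-uuid")
    #   se.index_term("well", id="resource-2", context="concept-uuid")   # same term/context: count becomes 2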

    def delete_terms(self, ids):
        """
        If the term is referenced more than once, simply decrement the
        count and remove the id of the deleted term from the existing index.

        If the term is only referenced once then delete the index

        """

        if not isinstance(ids, list):
            ids = [ids]

        for id in ids:
            result = self.es.search(
                index="term",
                doc_type="value",
                body={
                    "query": {"filtered": {"filter": {"terms": {"ids": [id]}}, "query": {"match_all": {}}}},
                    "from": 0,
                    "size": 10,
                },
                ignore=404,
            )

            if "hits" in result:
                for document in result["hits"]["hits"]:
                    document["_source"]["ids"].remove(id)
                    count = len(document["_source"]["ids"])
                    if count > 0:
                        document["_source"]["count"] = count
                        self.index_data("term", "value", document["_source"], id=document["_id"])
                        self.es.indices.refresh(index="term")
                    else:
                        self.delete(index="term", doc_type="value", id=document["_id"])

    def create_mapping(self, index, doc_type, fieldname="", fieldtype="string", fieldindex=None, body=None):
        """
        Creates an Elasticsearch body for a single field given an index name and type name

        """

        if not body:
            if fieldtype == "geo_shape":
                body = {
                    doc_type: {"properties": {fieldname: {"type": "geo_shape", "tree": "geohash", "precision": "1m"}}}
                }
            else:
                fn = {"type": fieldtype}
                if fieldindex:
                    fn["index"] = fieldindex
                body = {doc_type: {"properties": {fieldname: fn}}}

        self.create_index(index=index, ignore=400)
        self.es.indices.put_mapping(index=index, doc_type=doc_type, body=body)

    def create_index(self, **kwargs):
        self.es.indices.create(**kwargs)

    def index_data(self, index=None, doc_type=None, body=None, idfield=None, id=None, **kwargs):
        """
        Indexes a document or list of documents into Elasticsearch

        If "id" is supplied then will use that as the id of the document

        If "idfield" is supplied then will try to find that property in the
            document itself and use the value found for the id of the document

        """

        if not isinstance(body, list):
            body = [body]

        for document in body:
            if idfield is not None:
                if isinstance(document, dict):
                    id = document[idfield]
                else:
                    id = getattr(document, idfield)

            try:
                self.es.index(index=index, doc_type=doc_type, body=document, id=id, **kwargs)
            except Exception as detail:
                self.logger.warning(
                    "%s: WARNING: failed to index document: %s \nException detail: %s\n"
                    % (datetime.now(), document, detail)
                )
                raise detail

    def bulk_index(self, data):
        return helpers.bulk(self.es, data, chunk_size=500, raise_on_error=True)

    def create_bulk_item(self, index, type, id, data):
        if not (self.isempty_or_none(index) or self.isempty_or_none(type) or self.isempty_or_none(id)):
            return [{"index": {"_index": index, "_type": type, "_id": id}}, data]
        else:
            return False
Exemplo n.º 28
0
class NameStore(object):
    def __init__(self):
        """what should this do as we will only populate from es?"""
        pass

    def configure_index(self, configuration):
        self.es_config = configuration
        self.es_index = configuration['_index']
        self.es = Elasticsearch([{
            "host": self.es_config['host'],
            "port": self.es_config['port']
        }])
        if not self.es.indices.exists(index=self.es_index):
            self.es.indices.create(index=self.es_index)
        self.needs_refresh = False
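    # Hedged usage sketch (host/port/index/page_size values below are placeholders):
    #
    #   store = NameStore()
    #   store.configure_index({"host": "localhost", "port": 9200,
    #                          "_index": "names", "page_size": 20})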

    def index_needs_refresh(self):
        return self.needs_refresh

    def index_refresh(self):
        self.es.indices.refresh(index=self.es_index)
        self.needs_refresh = False

    def set_index_needs_refresh(self):
        self.needs_refresh = True

    def check_index_is_fresh(self):
        # check index is up to date, refresh if needed
        if self.index_needs_refresh():
            self.index_refresh()

    def get_name_es(self, name_id, params):
        if "action" not in params:
            params["action"] = "see"
        if "username" not in params:
            params["username"] = None
        # get Name from index
        Name = self.get_from_index(name_id,
                                   action=params["action"],
                                   name_type="naam")
        return Name.to_clean_json(params)

    def get_names_es(self, params):
        # check index is up to date, refresh if needed
        self.check_index_is_fresh()
        response = self.get_from_index_by_filters(params, name_type="naam")
        Names = [Name(hit) for hit in response["hits"]["hits"]]
        return {
            "total": response["hits"]["total"],
            "names": [Name.base() for Name in Names]
        }

    def get_names_by_id_es(self, name_ids, params):
        # check index is up to date, refresh if needed
        self.check_index_is_fresh()
        response = self.es.mget(index=self.es_index,
                                doc_type="Name",
                                body={"ids": name_ids})
        return [hit["_source"] for hit in response["hits"]["hits"]]

#    def get_collection_es(self, collection_id, params):
#        if "action" not in params:
#            params["action"] = "see"
#        if "username" not in params:
#            params["username"] = None
#        return collection.to_clean_json(params)

#    def get_collections_es(self, params):
#        # check index is up to date, refresh if needed
#        self.check_index_is_fresh()
#        response = self.get_from_index_by_filters(params, name_type="NameCollection")
#        collections = NameCollection(response["hits"]["hits"])
#        return {
#            "total": response["hits"]["total"],
#            "collections": collection.to_clean_json(params)
#        }

####################
# Helper functions #
####################

    """old helpers were superfluous, but we may add some similarity stuff here later """

    ###################
    # ES interactions #
    ###################
    """all adding stuff is delegated to helpers as we do not foresee writing interactions
    with the index as yet. Keep as placeholders though"""

    #    def add_to_index(self, Name, name_type):
    #        self.should_have_target_list(Name)
    #        self.should_have_permissions(Name)
    #        self.should_not_exist(Name['id'], name_type)
    #        return self.es.index(index=self.es_index, doc_type=name_type, id=Name['id'], body=Name)
    #
    #    def add_bulk_to_index(self, Names, name_type):
    #        raise ValueError("Function not yet implemented")

    #    def get_from_index(self, id, action, name_type="naam"):
    #        """for now we only have naam, but probably extend with institution
    #        and geonames later
    #        """
    #        # check index is up to date, refresh if needed
    #        self.check_index_is_fresh()
    #        # check that Name exists (and is not deleted)
    #        self.should_exist(id, name_type)
    #        return Name

    def get_from_index_by_id(self, name_id, name_type="naam"):
        self.should_exist(name_id, name_type)
        return self.es.get(index=self.es_index, doc_type=name_type,
                           id=name_id)['_source']

    def get_from_index_by_filters(self, params, name_type="naam"):
        filter_queries = query_helper.make_param_filter_queries(params)
        #        filter_queries += [query_helper.make_permission_see_query(params)]
        query = {
            "from": params["page"] * self.es_config["page_size"],
            "size": self.es_config["page_size"],
            "query": query_helper.bool_must(filter_queries)
        }
        return self.es.search(index=self.es_index,
                              doc_type=name_type,
                              body=query)

#    def remove_from_index(self, name_id, name_type):
#        self.should_exist(name_id, name_type)
#        return self.es.delete(index=self.es_index, doc_type=name_type, id=name_id)

#    def remove_from_index_if_allowed(self, name_id, params, name_type="_all"):
#        if "username" not in params:
#            params["username"] = None
#        # check index is up to date, refresh if needed
#        self.check_index_is_fresh()
#        # check that Name exists (and is not deleted)
#        self.should_exist(name_id, name_type)
#        # get original Name json
#        name_json = self.get_from_index_by_id(name_id, name_type)
#        # check if user has appropriate permissions
#        if not permissions.is_allowed_action(params["username"], "edit", Name(name_json)):
#            raise PermissionError(message="Unauthorized access - no permission to {a} Name".format(a=params["action"]))
#        return self.remove_from_index(name_id, "Name")
#
#    def is_deleted(self, name_id, name_type="_all"):
#        if self.es.exists(index=self.es_index, doc_type=name_type, id=name_id):
#            res = self.es.get(index=self.es_index, doc_type=name_type, id=name_id)
#            if "status" in res["_source"] and res["_source"]["status"] == "deleted":
#                return True
#        return False

    def should_exist(self, name_id, name_type="_all"):
        if self.es.exists(index=self.es_index, doc_type=name_type, id=name_id):
            if not self.is_deleted(name_id, name_type):
                return True
        raise NaamError(message="Name with id %s does not exist" % (name_id),
                        status_code=404)

    def should_not_exist(self, name_id, name_type="_all"):
        if self.es.exists(index=self.es_index, doc_type=name_type, id=name_id):
            raise NaamError(message="Name with id %s already exists" %
                            (name_id))
        else:
            return True

    def get_objects_from_hits(self, hits, doc_type="naam"):
        objects = []
        for hit in hits:
            if hit["_source"]["type"] == doc_type:
                objects += [Name(hit)]
#            elif hit["_source"]["type"] == "NameCollection":
#                objects += [NameCollection(hit["_source"])]

    def list_name_ids(self):
        return list(self.name_index.keys())

    def list_Names(self, ids=None):
        if not ids:
            ids = self.list_name_ids()
        return [Name for id, Name in self.name_index.items() if id in ids]

    def list_Names_as_json(self, ids=None):
        if not ids:
            ids = self.list_name_ids()
        return [
            Name.to_json() for id, Name in self.name_index.items() if id in ids
        ]
            mid_set.add(item['_source']["mid"])
        else:
            try:
                mid_set.add(item['_source']["root_mid"])
            except Exception, r:
                print Exception, r
    print len(mid_set)

    # Fetch the original and retweeted weibo posts and store them in ES; they should be from the last two days
    index_list = []
    index_list.append(index_name)
    index_list.append("flow_text_"+ts2datetime(ts-3600*24))
    mid_list = list(mid_set)
    bulk_action = []
    non_exist_list = [] # weibo posts not yet monitored
    exist_results = es_user_portrait.mget(index=monitor_index_name, doc_type=monitor_index_type, body={"ids":mid_list})["docs"]
    for item in exist_results:
        if not item["found"]:
            non_exist_list.append(item["_id"])
    
    # Bring the not-yet-monitored weibo posts under monitoring
    if non_exist_list:
        count = 0
        classify_text_dict = dict() # texts to classify
        classify_uid_list = []
        #f = open("text.txt", "a")
        lenth = len(non_exist_list)
        dividion = lenth/1000
        weibo_results = []
        for i in range(0,dividion+1):
            tmp_mid_list = non_exist_list[i*1000:(i+1)*1000]
Exemplo n.º 30
0
def export_emails_archive(data_set_id,
                          email_ids=["f9c9c59a-7fe8-11e5-bb05-08002705cb99"]):
    cherrypy.log("email.get_attachments_sender(index=%s, attachment_id=%s)" %
                 (data_set_id, email_ids))
    if not data_set_id:
        return tangelo.HTTPStatusCode(400,
                                      "invalid service call - missing index")
    # if not email:
    #     return tangelo.HTTPStatusCode(400, "invalid service call - missing attachment_id")

    # elasticsearch.exceptions.ConnectionTimeout: ConnectionTimeout caused by - ReadTimeoutError(HTTPConnectionPool(host='10.1.70.143', port=9200): Read timed out. (read timeout=10))
    es = Elasticsearch([{
        "host": "10.1.70.143",
        "port": 9200
    }],
                       request_timeout=60)
    # TODO can implement with multiple doc_types and combine attachments in
    emails = es.mget(index=data_set_id,
                     doc_type="emails",
                     body={"docs": [{
                         "_id": id
                     } for id in email_ids]})

    # TODO filename
    filename = "export.tar.gz"
    tangelo.content_type("application/x-gzip")
    header("Content-Disposition", 'attachment; filename="{}"'.format(filename))

    string_buffer = cStringIO.StringIO()
    tar = tarfile.open(mode='w:gz', fileobj=string_buffer)

    # Add each email to the tar
    for email_source in emails["docs"]:

        email = email_source["_source"]

        tarinfo_parent = tarfile.TarInfo(name=email["id"])
        tarinfo_parent.type = tarfile.DIRTYPE
        tarinfo_parent.mode = 0755
        tarinfo_parent.mtime = time.time()
        tar.addfile(tarinfo_parent)

        tarinfo = tarfile.TarInfo(email["id"] + "/" + email["id"] + ".json")
        # TODO -- email transformation
        data_string = json.dumps(email)
        fobj = cStringIO.StringIO(data_string)

        tarinfo.size = len(data_string)
        tarinfo.mode = 0644
        tarinfo.mtime = time.time()
        tar.addfile(tarinfo, fobj)

        # Get the attachments
        if email["attachments"]:
            attachments = es.mget(index=data_set_id,
                                  doc_type="attachments",
                                  body={
                                      "docs": [{
                                          "_id": attch["guid"]
                                      } for attch in email["attachments"]]
                                  })
            for attachment_source in attachments["docs"]:
                attachment = attachment_source["_source"]
                filename = attachment["filename"]
                attch_data = str(base64.b64decode(attachment["contents64"]))

                tarinfo_attch = tarfile.TarInfo(email["id"] + "/" + filename)
                tarinfo_attch.size = len(attch_data)
                tarinfo_attch.mode = 0644
                tarinfo_attch.mtime = time.time()
                tar.addfile(tarinfo_attch, cStringIO.StringIO(attch_data))
    tar.close()

    return string_buffer.getvalue()
Exemplo n.º 31
0
class ElasticsearchBackend(object):
    def __init__(self, settings=None):
        if settings is None:
            settings = {}
        self.es = Elasticsearch(**settings)

    def set_variable(self, key, value):
        res = self.es.index("insee", doc_type="variable",
                            id=key.lower(), body=value)
        return res['created']

    def get_variable(self, key):
        res = self.es.get(index="insee", doc_type='variable',
                          id=key.lower())
        return res['_source']

    def get_variables(self, keys):
        res = self.es.mget(index="insee", doc_type="variable", body={
            "ids": [k.lower() for k in keys]
        })
        results = [d['_source'] for d in res['docs'] if '_source' in d]
        return results
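    # Hedged usage sketch (connection settings and keys below are placeholders):
    #
    #   backend = ElasticsearchBackend({"hosts": ["localhost:9200"]})
    #   backend.set_variable("POP_2012", {"label": "Population 2012"})
    #   backend.get_variables(["POP_2012", "missing_key"])   # docs without _source are skipped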

    def search_variables(self, query):
        res = self.es.search(
            index="insee", doc_type="variable",
            body={"query": {"match": {"_all": query.lower()}}}
        )
        return [hit['_source'] for hit in res['hits']['hits']]

    def set_commune(self, key, value):
        res = self.es.index("insee", doc_type="commune",
                            id=key.lower(), body=value)
        return res['created']

    def get_commune(self, key):
        res = self.es.get(index="insee", doc_type='commune',
                          id=key.lower())
        return res['_source']

    def get_communes(self, keys):
        res = self.es.mget(index="insee", doc_type="commune", body={
            "ids": [k.lower() for k in keys]
        })
        results = [d['_source'] for d in res['docs'] if '_source' in d]
        return results

    def search_communes(self, query):
        res = self.es.search(
            index="insee", doc_type="commune",
            body={"query": {"match": {"_all": query.lower()}}}
        )
        return [hit['_source'] for hit in res['hits']['hits']]

    def set_data(self, var_lib, codgeo, value):
        res = self.es.index("insee", doc_type="data",
                            id="%s_%s" % (var_lib.lower(), codgeo), body=value)
        return res['created']

    def get_data(self, var_lib, codgeo):
        res = self.es.get(index="insee", doc_type='data',
                          id="%s_%s" % (var_lib.lower(), codgeo))
        return res['_source']
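A brief, hedged usage sketch of the backend above; the connection settings, keys, and values are illustrative only.

# Hypothetical usage: store and look up INSEE variables by key (ids are lower-cased internally).
backend = ElasticsearchBackend(settings={"hosts": ["localhost:9200"]})
backend.set_variable("POP_2015", {"label": "Population 2015"})
print(backend.get_variables(["POP_2015", "REV_2015"]))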
Example No. 32
0
def export_attachments(data_set_id, outfile, sender='', attachment_extension='jpg', date_bounds=None):
    print("email.get_attachments_sender(index=%s, sender=%s, attachment_type=%s, date_bounds=%s)" % (data_set_id, sender, attachment_extension, date_bounds))
    if not data_set_id:
        print "invalid service call - missing index"
        return 1
    # elasticsearch.exceptions.ConnectionTimeout: ConnectionTimeout caused by - ReadTimeoutError(HTTPConnectionPool(host='10.1.70.143', port=9200): Read timed out. (read timeout=10))
    es = Elasticsearch([{"host" : "10.1.70.143", "port" : 9200}], timeout=60)

    # TODO get accurate count -- this is not strictly needed as attachments will be accessed as inner docs on the email_address
    max_inner_attachments_returned = 100000

    # Get all attachments by extension
    rows = []
    body = _attch_nested__ext_query(sender, attachment_extension, date_bounds, max_inner_attachments_returned=max_inner_attachments_returned)
    print body
    addresses_count = es.count(index=data_set_id, doc_type="email_address", body=body)["count"]
    print "total addresses: " + str(addresses_count)
    addresses = es.search(index=data_set_id, doc_type="email_address", body=body, size=addresses_count)
    for address in addresses["hits"]["hits"]:
        rows += [[address["_source"]["addr"], attachment["_source"]["guid"], attachment["_source"]["filename"], attachment["_source"]["datetime"]] for attachment in address["inner_hits"]["sender_attachments"]["hits"]["hits"]]

    print "total attachments: " + str(len(rows))

    #  start tar.gz
    # tar = tarfile.open(mode='w:gz', name="big-export.tar.gz")
    # Start tar
    tar = tarfile.open(mode='w', name=outfile)

    csv_string_buffer = cStringIO.StringIO()
    csv_file = csv.writer(csv_string_buffer)

    # Add all rows to attachment csv
    csv_file.writerows(rows)
    tarinfo = tarfile.TarInfo("attachments.csv")

    tarinfo.size = csv_string_buffer.tell()
    tarinfo.mode = 0644
    tarinfo.mtime = time.time()
    csv_string_buffer.seek(0)

    tar.addfile(tarinfo, csv_string_buffer)


    # This is the buffer size of how many attachments to pull from ES at each iteration
    num_returned=3
    index=0
    # Paging
    while index < len(rows):
        # Get num_returned attachments from ES
        attachments = es.mget(index=data_set_id, doc_type="attachments", body={"docs":[{"_id":row[1]} for row in rows[index: index+num_returned]]})
        index+=num_returned

        # Add all attachments to the archive
        for attachment_source in attachments["docs"]:
            attachment = attachment_source["_source"]
            filename = attachment["filename"]
            attch_data = str(base64.b64decode(attachment["contents64"]))

            tarinfo_attch = tarfile.TarInfo(attachment["guid"]+"/"+filename)
            tarinfo_attch.size = len(attch_data)
            tarinfo_attch.mode = 0644
            tarinfo_attch.mtime = time.time()
            tar.addfile(tarinfo_attch, cStringIO.StringIO(attch_data))

    tar.close()
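A hedged invocation sketch for the exporter above; the index name, output path, and sender address are placeholders, and it assumes the _attch_nested__ext_query helper it relies on is importable from the surrounding module.

# Hypothetical call: dump every jpg attachment (plus attachments.csv) into a tar file.
export_attachments("sample_dataset", "attachments-export.tar", sender="user@example.com", attachment_extension="jpg")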
Example No. 33
0
class Elasticsearch(Datastore):

    engine = None
    index = 'aleph-samples'
    tracking_index = 'aleph-tracking'
    doc_type = 'sample'
    cache = None

    def __init__(self):

        self.engine = ES()
        self.cache = SimpleCache()

    def all(self, page=1, size=DEFAULT_PAGE_SIZE):

        body = {
                'query': {
                    'match_all': {},
                },
                "sort": {
                    "timestamp": {
                        'order': 'desc'
                    },
                }
            }

        start = ((page - 1) * size)

        res = self.raw_search(body, start=start, size=size)

        total = res['hits']['total']
        entries = res['hits']['hits']

        return (total, self.entries_to_samples(entries))

    def entries_to_samples(self, entries):

        rv = []
        if not entries:
            return rv

        entry_table = {}

        for entry in entries:
            sample_id = entry['_id']
            entry_table[sample_id] = {'metadata': entry, 'tracking_data': {}}

        # Get tracking data for retrieved ids
        tracking_data = self._mget(list(entry_table.keys()), index=self.tracking_index)

        for td in tracking_data:
            sample_id = td['_id']
            entry_table[sample_id]['tracking_data'] = td

        # Add return values
        for sample_id, sample_data in entry_table.items():
            if '_source' in sample_data['metadata'] and '_source' in sample_data['tracking_data']:
                rv.append(Sample(sample_data['metadata'], sample_data['tracking_data']))

        return rv

    def _mget(self, ids, index=None):

        if not index:
            index = self.index

        if not isinstance(ids, list):
            raise ValueError("ids is not a list")

        body = { 'ids': ids }

        result = self.engine.mget(index=index, doc_type=self.doc_type, body=body, ignore=404)

        if 'docs' not in result:
            return None

        return result['docs']


    def _get(self, sample_id, index=None):

        if not index:
            index = self.index

        result = self.engine.get(index=index, doc_type=self.doc_type, id=sample_id, ignore=404)

        if result['found'] == False:
            return None

        return OrderedDict(sorted(result.items()))

    def mget(self, sample_ids):

        if not sample_ids:
            return None

        metadata = {s['_id']:s for s in self._mget(sample_ids)}
        tracking_data = {s['_id']:s for s in self._mget(sample_ids, index=self.tracking_index)}

        if not metadata or not tracking_data:
            return None

        entries = []

        for sample_id, v in metadata.items():
            if '_source' in metadata[sample_id].keys() and '_source' in tracking_data[sample_id].keys():
                entries.append(Sample(metadata[sample_id], tracking_data[sample_id]))

        return entries

    def get(self, sample_id):

        metadata = self._get(sample_id)
        tracking_data = self._get(sample_id, index=self.tracking_index)

        if not metadata or not tracking_data:
            return None

        return Sample(metadata, tracking_data)

    def get_parents(self, sample_id):

        rv = self.cache.get('get-parents-%s' % sample_id)

        if not rv:
            
            tracking_data = self._get(sample_id, index=self.tracking_index)

            if not tracking_data:
                return []

            rv = tracking_data['_source']['parents']

        return rv

    def get_children(self, sample_id):

        rv = self.cache.get('get-children-%s' % sample_id)

        if not rv:

            search_body = {
                "query": {
                    "bool": {
                        "must": [
                            { "term": { "parents": sample_id } }
                        ]
                    }
                }
            }

            result = self.raw_search(search_body, index=self.tracking_index)

            rv = result['hits']['hits']

        return rv

    def raw_search(self, body, q=None, start=0, size=DEFAULT_PAGE_SIZE, index=None):

        if not index:
            index = self.index

        # Default to an empty result set so a missing index does not leave
        # `hits` unbound on the return below.
        hits = {'hits': {'total': 0, 'hits': []}}

        try:
            hits = self.engine.search(index=index, doc_type=self.doc_type, q=q, from_=start, size=size, body=body)
        except NotFoundError:
            pass

        return hits
    

    def search(self, query, page=1, size=DEFAULT_PAGE_SIZE):

        start = ((page - 1) * size)

        body = {
            "sort": {
                "timestamp": {
                    'order': 'desc'
                },
            }
        }

        hits = self.raw_search(body, start=start, size=size, q=query)

        total = hits['hits']['total']
        entries = hits['hits']['hits']

        return (total, self.entries_to_samples(entries))

    def count(self, body):

        return self.engine.count(index=self.tracking_index, doc_type=self.doc_type, body=body)['count']

    # Aux Methods 

    # Counters
    def count_all(self):

        body = {
            "query": {
                "match_all" : {}
            }
        }
        return self.count(body)

    def count_processing_samples(self):
        body = {
            "query": {
                "bool" : {
                    "filter" : [
                        {"script" : {"script" : {"source": "!doc['processors_completed'].containsAll(doc['processors_dispatched'])", "lang": "painless"}}},
                    ]
                }
            }
        }

        return self.count(body)

    def count_analyzing_samples(self):
        body = {
            "query": {
                "bool" : {
                    "filter" : [
                        {"script" : {"script" : {"source": "!doc['analyzers_completed'].containsAll(doc['analyzers_dispatched'])", "lang": "painless"}}},
                    ]
                }
            }
        }

        return self.count(body)

    # Graph Data
    def sample_histogram(self, size=24, interval="1h"):

        histogram = {}

        hist_body = {
            "aggs" : {
                "samples_over_time" : {
                    "date_histogram" : {
                        "field" : "timestamp",
                        "interval" : interval,
                        "min_doc_count": 0
                    }
                }
            }
        }
        hist_result = self.raw_search(hist_body)['aggregations']

        for h in hist_result['samples_over_time']['buckets']:
            histogram[h['key_as_string']] = h['doc_count']

        return histogram
    
    def sample_diversity(self):

        diversity = {}

        div_body = {
            "aggs" : {
                "genres" : {
                    "terms" : { "field" : "filetype" }
                }
            }
        }
        div_result = self.raw_search(div_body)['aggregations']

        for d in div_result['genres']['buckets']:
            diversity[d['key']] = d['doc_count']

        return diversity
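A short usage sketch for the datastore above, assuming Datastore, Sample, SimpleCache, ES and DEFAULT_PAGE_SIZE are importable from the surrounding project and a cluster with the aleph-samples/aleph-tracking indices is reachable.

# Hypothetical usage: page through samples and print a couple of counters.
store = Elasticsearch()
total, samples = store.all(page=1, size=25)
print("%d samples total, %d on this page" % (total, len(samples)))
print("processing: %d" % store.count_processing_samples())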
Example No. 34
0
class VWCollection(VWCallback):
    
    def __init__(self, items=None, **kwargs):
        self.bulk_chunk_size = kwargs.get('bulk_chunk_size',
            config.bulk_chunk_size)
        self._sort = []
        self.results_per_page = kwargs.get('results_per_page',
            config.results_per_page)
        self._querybody = querybuilder.QueryBody() # sets up the new query bodies

        if kwargs.get('base_obj'):
            self.base_obj = kwargs.get('base_obj')
        else:
            try:
                self.base_obj = self.__class__.__model__
            except AttributeError:
                raise AttributeError('Base object must contain a model or pass base_obj')

        self._es = Elasticsearch(config.dsn)
        self._esc = client.IndicesClient(self._es)

        if '__index__' in dir(self.base_obj):
            idx = self.base_obj.__index__
        else:
            idx = config.default_index

        self._search_params = []
        self._raw = {}
        self.idx = idx
        self.type = self.base_obj.__type__
        self._special_body = {}
        
        # special list of items that can be committed in bulk
        self._items = items if items is not None else []

    def search(self,q):
        self._search_params.append(q)
        return self

    # setup a raw request
    def raw(self, raw_request):
        self._raw = raw_request
        return self

    def filter_by(self, condition = 'and',**kwargs):
        if kwargs.get('condition'):
            condition=kwargs.get('condition')
            del kwargs['condition']

        condition = self._translate_bool_condition(condition)

        for k,v in kwargs.iteritems():
            if k == 'id' or k == 'ids':
                id_filter = v
                if not isinstance(id_filter, list):
                    id_filter = [id_filter]

                self._querybody.chain(qdsl.ids(id_filter), condition=condition)
            else:
                try:
                    analyzed = is_analyzed(getattr(self.base_obj, k))
                except AttributeError:
                    analyzed = is_analyzed(v)

                q_type = 'filter'
                if analyzed:
                    q_type = 'query'

                if isinstance(v, list):
                    # lists are treated like "OR" (terms() on not_analyzed fields, bool/should of match queries on analyzed fields)
                    if analyzed:
                        match_queries = []
                        for item in v:
                            match_queries.append( qdsl.match(k,item) )
                        self._querybody.chain( qdsl.bool(qdsl.should(match_queries)), condition=condition,type=q_type )
                    else:
                        self._querybody.chain( qdsl.terms(k,v),condition=condition,
                            type=q_type)
                else:
                    #search_value = unicode(v)
                    if analyzed:
                        self._querybody.chain(qdsl.match(unicode(k), v), condition=condition,type=q_type)
                    else:
                        self._querybody.chain(qdsl.term(unicode(k), v), condition=condition,type=q_type)

        return self

    def multi_match(self, fields, query, **kwargs):
        self._querybody.chain(qdsl.multi_match(query, fields), condition=kwargs.get('condition', None), type='query')
        return self

    def exact(self, field, value,**kwargs):
        try:
            field_template = getattr( self.base_obj, field)

            if type(field_template) != ESType:
                field_template = create_es_type(field_template)

            for estype in [String,IP,Attachment]:
                if isinstance(field_template, estype) and field_template.analyzed == True:
                    logger.warn('%s types may not exact match correctly if they are analyzed' % unicode(estype.__class__.__name__))

        except AttributeError:
            logger.warn('%s is not in the base model.' % unicode(field))

        kwargs['type'] = 'filter'
        if isinstance(value, list):
            self._querybody.chain(qdsl.terms(field,value), **kwargs)
        else:
            self._querybody.chain(qdsl.term(field, value), **kwargs)

        return self


    def or_(self,*args):
        return ' OR '.join(args)

    def and_(self,*args):
        return ' AND '.join(args)

    def get(self,id, **kwargs):
        try:
            params = {'index':self.idx, 'doc_type':self.type, 'id':id}
            params.update(kwargs)
            doc = self._es.get(**params)
            if doc:
                return VWCollectionGen(self.base_obj, {'docs':[doc]})[0]

            return None

        except:
            # TODO. Discuss this. Should get() return None even on exceptions?
            return None

    def refresh(self, **kwargs):
        self._esc.refresh(index=self.idx, **kwargs)

    def get_in(self, ids,**kwargs):
        if len(ids) > 0: # check for ids. empty list returns an empty list (instead of exception)
            params = {'index':self.idx, 'doc_type':self.type, 'body':{'ids':ids}}
            params.update(kwargs)
            res = self._es.mget(**params)
            if res and res.get('docs'):
                return VWCollectionGen(self.base_obj, res)

        return []

    def get_like_this(self,doc_id,**kwargs):
        params = {'index':self.idx,'doc_type':self.type,'id':doc_id}
        params.update(kwargs)
        res = self._es.mlt(**params)

        if res and res.get('docs'):
            return VWCollectionGen(self.base_obj, res)
        else:
            return []

    def sort(self, **kwargs):
        for k,v in kwargs.iteritems():
            v = v.lower()
            if v not in ['asc','desc']:
                v = 'asc'

            self._sort.append('%s:%s' % (k,v))
        return self

    def clear_previous_search(self):
        self._raw = {}
        self._search_params = []
        self._special_body = {}
        self._querybody = querybuilder.QueryBody()

    def _create_search_params( self, **kwargs ):
        # before_query_build() is allowed to manipulate the object's internal state before we do stuff
        self._querybody = self.execute_callbacks('before_query_build', self._querybody )

        q = {
            'index': self.idx,
            'doc_type': self.type
        }

        if self._raw:
            q['body'] = self._raw
        elif len(self._search_params) > 0:
            kwargs['type'] = 'query'
            self._querybody.chain(qdsl.query_string(self.and_(*self._search_params)), **kwargs)
        else:
            q['body'] = qdsl.query(qdsl.match_all())

        if self._querybody.is_filtered() or self._querybody.is_query():
            q['body'] = self._querybody.build()

        # after_query_build() can manipulate the final query before being sent to ES
        # this is generally considered a bad idea but might be useful for logging
        q = self.execute_callbacks( 'after_query_build', q )

        logger.debug(json.dumps(q))
        return q

    def count(self):
        params = self._create_search_params()
        resp = self._es.count(**params)
        return resp.get('count')

    def __len__(self):
        return self.count()

    def limit(self,count):
        self.results_per_page = count
        return self

    def all(self,**kwargs):

        params = self._create_search_params()
        if not params.get('size'):
            params['size'] = self.results_per_page

        if kwargs.get('results_per_page') != None:
            kwargs['size'] = kwargs.get('results_per_page')
            del kwargs['results_per_page']

        if kwargs.get('start') != None:
            kwargs['from_'] = kwargs.get('start')
            del kwargs['start']

        logger.debug(json.dumps(self._sort))

        params.update(kwargs)
        if len(self._sort) > 0:
            if params.get('sort') and isinstance(params['sort'], list):
                params['sort'].extend(self._sort)
            else:
                params['sort'] = self._sort

        if params.get('sort'):
            if isinstance(params['sort'], list):
                params['sort'] = ','.join(params.get('sort'))
            else:
                raise TypeError('"sort" argument must be a list')

        logger.debug(json.dumps(params))
        results = self._es.search(**params)

        return VWCollectionGen(self.base_obj,results)

    def one(self,**kwargs):
        kwargs['results_per_page'] = 1
        results = self.all(**kwargs)
        try:
            return results[0]
        except IndexError:
            raise NoResultsFound('No result found for one()')

    # this is for legacy purposes in filter_by
    def _translate_bool_condition(self,_bool_condition):
        if _bool_condition == 'and':
            _bool_condition = 'must'
        elif _bool_condition == 'or':
            _bool_condition = 'should'
        elif _bool_condition == 'not':
            _bool_condition = 'must_not'

        # this is for things like geo_distance where we explicitly want the true and/or/not
        elif _bool_condition == 'explicit_and':
            _bool_condition = 'and'
        elif _bool_condition == 'explicit_or':
            _bool_condition = 'or'
        elif _bool_condition == 'explicit_not':
            _bool_condition = 'not'

        return _bool_condition

    def range(self, field, **kwargs):
        search_options = {}
        for opt in ['condition','minimum_should_match']:
            if opt in kwargs:
                search_options[opt] = kwargs.get(opt)
                del kwargs[opt]

        q = qdsl.range(field, **kwargs)
        if self._querybody.is_filtered():
            d = {'filter': q}
        else:
            d = {'query': q}

        if search_options:
            d.update(search_options)

        self._querybody.chain(d)

        return self

    def search_geo(self, field, distance, lat, lon,**kwargs):
        condition = kwargs.get('condition', 'and')
        if 'condition' in kwargs:
            del kwargs['condition']

        self._querybody.chain(qdsl.filter_(qdsl.geo_distance(field, [lon,lat], distance, **kwargs)), condition=condition)
        return self

    def missing( self, field, **kwargs):
        self._querybody.chain(qdsl.filter_(qdsl.missing(field)))
        return self

    def exists( self, field, **kwargs):
        self._querybody.chain(qdsl.filter_(qdsl.exists(field, **kwargs)))
        return self

    def delete(self, **kwargs):
        params = self._create_search_params()
        params.update(kwargs)
        self._es.delete_by_query(**params)

    def delete_in(self, ids):
        if not isinstance(ids, list):
            raise TypeError('argument to delete_in must be a list.')

        bulk_docs = []
        for i in ids:
            this_id = i
            this_type = self.base_obj.__type__
            this_idx = self.idx
            if isinstance(i, VWBase):
                this_id = i.id
                this_type = i.__type__
                try:
                    this_idx = i.__index__
                except AttributeError:
                    pass

            bulk_docs.append({'_op_type': 'delete', '_type': this_type, '_index': this_idx, '_id': this_id })

        return helpers.bulk( self._es, bulk_docs, chunk_size=self.bulk_chunk_size)

    # commits items in bulk
    def commit(self, callback=None):
        bulk_docs = []

        if callback:
            if not callable(callback):
                raise TypeError('Argument 2 to commit() must be callable')

        # allow a commit over search results if there are no _items
        if len(self._items) == 0:
            items = self.all()
        else:
            items = self._items

        for i in items:
            if callback:
                i = callback(i)

            i = self.execute_callbacks('on_bulk_commit', i)

            this_dict = {}
            this_id = ''
            this_idx = self.idx
            this_type = self.base_obj.__type__
            if isinstance(i, VWBase):
                this_dict = i._create_source_document()
                this_type = i.__type__
                this_id = i.id
                try:
                    this_idx = i.__index__
                except AttributeError:
                    pass

            elif isinstance(i,dict):
                this_dict = i
                this_id = i.get('id')

            else:
                raise TypeError('Elements passed to the collection must be of type "dict" or "VWBase"')

            if not this_id:
                this_id = str(uuid4())

            bulk_docs.append({'_op_type': 'index', '_type': this_type, '_index': this_idx, '_id': this_id, '_source': this_dict})

        return helpers.bulk(self._es,bulk_docs,chunk_size=self.bulk_chunk_size)
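A hedged sketch of how a collection like the one above might be used; Widget is a hypothetical VWBase subclass, and the qdsl/config plumbing is assumed to be set up as the library expects.

# Hypothetical model-bound collection.
class WidgetCollection(VWCollection):
    __model__ = Widget  # assumed VWBase subclass providing __type__ (and optionally __index__)

# Chain filters and sorting, then execute; also fetch a couple of documents by id.
widgets = WidgetCollection().filter_by(status='active').sort(created_at='desc').limit(10).all()
recent = WidgetCollection().get_in(['id-1', 'id-2'])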
Example No. 35
0
class SearchEngine(object):

    def __init__(self):
        self.es = Elasticsearch(hosts=settings.ELASTICSEARCH_HOSTS, **settings.ELASTICSEARCH_CONNECTION_OPTIONS)
        self.logger = logging.getLogger(__name__)

    def delete(self, **kwargs):
        """
        Deletes a document from the index
        Pass an index, doc_type, and id to delete a specific document
        Pass a body with a query dsl to delete by query

        """

        body = kwargs.get('body', None)
        if body != None:
            try:
                return self.es.delete_by_query(ignore=[404], **kwargs)
            except Exception as detail:
                self.logger.warning('%s: WARNING: failed to delete document by query: %s \nException detail: %s\n' % (datetime.now(), body, detail))
                raise detail   
        else:
            try:
                return self.es.delete(ignore=[404], **kwargs)
            except Exception as detail:
                self.logger.warning('%s: WARNING: failed to delete document: %s \nException detail: %s\n' % (datetime.now(), body, detail))
                raise detail   

    def delete_index(self, **kwargs):
        """
        Deletes an entire index

        """

        index = kwargs.get('index', '').strip()
        print 'deleting index : %s' % index
        return self.es.indices.delete(index=index, ignore=[400, 404])

    def search(self, **kwargs):
        """
        Search for an item in the index.
        Pass an index, doc_type, and id to get a specific document
        Pass a body with a query dsl to perform a search

        """

        body = kwargs.get('body', None)
        index = kwargs.get('index', None)
        id = kwargs.get('id', None)

        if index is None:
            raise NotImplementedError("You must specify an 'index' in your call to search")

        if id:
            if isinstance(id, list):
                kwargs.setdefault('body', {'ids': kwargs.pop('id')})
                return self.es.mget(**kwargs)
            else:
                return self.es.get(**kwargs)
        
        ret = None
        try: 
            ret = self.es.search(**kwargs)
        except Exception as detail:
            self.logger.warning('%s: WARNING: search failed for query: %s \nException detail: %s\n' % (datetime.now(), body, detail))
            pass   

        return ret

    def index_term(self, term, id, context='', ewstatus='', options={}):
        """
        If the term is already indexed, then simply increment the count and add the id of the term to the existing index.
        If the term isn't indexed then add the index.

        id: a unique id associated with the term
        context: a uuid of a concept to associate with the term to render in the ui
        options: any additional information to associate with the term

        """

        if term.strip(' \t\n\r') != '':
            already_indexed = False
            count = 1
            ids = [id]
            
            try:
                #_id = unicode(term, errors='ignore').decode('utf-8').encode('ascii')
                _id = uuid.uuid3(uuid.NAMESPACE_DNS, '%s%s' % (hash(term), hash(context)))
                result = self.es.get(index='term', doc_type='value', id=_id, ignore=404)

                #print 'result: %s' % result
                if result['found'] == True:
                    ids = result['_source']['ids']
                    if id not in ids:
                        ids.append(id)
                else:
                    ids = [id]
                # ewstatus is indexed only if it's not a dict
                if (type(ewstatus) is dict):
                    self.index_data('term', 'value', {'term': term, 'context': context, 'options': options, 'count': len(ids), 'ids': ids}, id=_id)
                else:
                    self.index_data('term', 'value', {'term': term, 'context': context, 'ewstatus': ewstatus, 'options': options, 'count': len(ids), 'ids': ids}, id=_id)
                
            except Exception as detail:
                self.logger.warning('%s: WARNING: search failed to index term: %s \nException detail: %s\n' % (datetime.now(), term, detail))
                raise detail   
                  
    def delete_terms(self, ids):
        """
        If the term is referenced more than once, simply decrement the
        count and remove the id of the deleted term from the existing index.

        If the term is only referenced once then delete the index  

        """

        if not isinstance(ids, list):
            ids = [ids]

        for id in ids:
            result = self.es.search(index='term', doc_type='value', body={
                "query": {
                    "filtered": {
                        "filter":{
                            "terms": {
                                "ids": [id]
                            }
                        }, 
                        "query": {
                            "match_all": {}
                        }
                    }
                }, 
                "from": 0, 
                "size": 10
            }, ignore=404)

            if 'hits' in result:
                for document in result['hits']['hits']:
                    document['_source']['ids'].remove(id)
                    count = len(document['_source']['ids'])
                    if count > 0:
                        document['_source']['count'] = count
                        self.index_data('term', 'value', document['_source'], id=document['_id'])
                        self.es.indices.refresh(index='term')
                    else:
                        self.delete(index='term', doc_type='value', id=document['_id'])

    def create_mapping(self, index, doc_type, fieldname='', fieldtype='string', fieldindex='analyzed', body=None):
        """
        Creates an Elasticsearch body for a single field given an index name and type name

        """

        if not body:
            if fieldtype == 'geo_shape':
                body =  { 
                    doc_type : {
                        'properties' : {
                            fieldname : { 'type' : 'geo_shape', 'tree' : 'geohash', 'precision': '1m' }
                        }
                    }
                } 
            else:           
                body =  { 
                    doc_type : {
                        'properties' : {
                            fieldname : { 'type' : fieldtype, 'index' : fieldindex }
                        }
                    }
                }

        self.create_index(index=index, ignore=400)
        self.es.indices.put_mapping(index=index, doc_type=doc_type, body=body)

    def create_index(self, **kwargs):
        self.es.indices.create(**kwargs)

    def index_data(self, index=None, doc_type=None, body=None, idfield=None, id=None, **kwargs):
        """
        Indexes a document or list of documents into Elasticsearch

        If "id" is supplied then will use that as the id of the document

        If "idfield" is supplied then will try to find that property in the 
            document itself and use the value found for the id of the document

        """

        if not isinstance(body, list):
            body = [body]

        for document in body:
            if idfield is not None:
                if isinstance(document, dict):
                    id = document[idfield]
                else:
                    id = getattr(document,idfield)

            try:
                self.es.index(index=index, doc_type=doc_type, body=document, id=id, **kwargs)
            except Exception as detail:
                self.logger.warning('%s: WARNING: failed to index document: %s \nException detail: %s\n' % (datetime.now(), document, detail))
                raise detail


    def bulk_index(self, data):
        return helpers.bulk(self.es, data, chunk_size=500, raise_on_error=True)

    def create_bulk_item(self, index, type, id, data):
        if not(self.isempty_or_none(index) or self.isempty_or_none(type) or self.isempty_or_none(id)):
            return[
                { "index" : { "_index" : index, "_type" : type, "_id" : id } },
                data
            ]
        else:
            return False
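A brief usage sketch for the search engine above, assuming settings.ELASTICSEARCH_HOSTS and the related connection options are configured; the index, field, and id values are placeholders.

# Hypothetical term indexing and lookup.
se = SearchEngine()
se.create_mapping('term', 'value', fieldname='term', fieldtype='string')
se.index_term('example term', id='record-1', context='concept-uuid')
print(se.search(index='term', doc_type='value', body={'query': {'match_all': {}}}))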
Example No. 36
0
class TMMonoLing:
  DOC_TYPE = 'tm'

  def __init__(self, **kwargs):
    self.es = Elasticsearch(**kwargs)  # pass connection options (e.g. hosts) through to the client
    # Put default index template
    self.es.indices.put_template(name='tm_template', body = self._index_template())
    self.refresh()

    #self.preprocessors = dict()
    self.tokenizers = dict()
    self.regex = dict()

  # Add new segment
  def add_segment(self, segment, ftype):
    # Add segment source and target texts to the correspondent index of ElasticSearch
    id = getattr(segment, ftype + '_id')
    index = TMUtils.lang2es_index(getattr(segment, ftype + '_language'))
    s_result = self.es.index(index=index,
                             doc_type=self.DOC_TYPE,
                             id=id,
                             body = self._segment2doc(segment, ftype))
    return id

  # Bulk segment addition
  def add_segments(self, segments, ftype):
    # Bulk insert
    return self._segment2es_bulk(segments, ftype, 'update', self._segment2doc_upsert)

  # Search for top matching segments
  def query(self, lang, qstring, filter = None):
    index = TMUtils.lang2es_index(lang)
    if not self.index_exists(index): return
    # Query source ES for the text
    query = TMDbQuery(es=self.es,
                      index = index,
                      q=qstring,
                      filter=filter)
    for response,q in query():
      for hit in response:
        yield hit,q

  # Search for top matching segments
  def mquery(self, lang, limit, q_list, filter=None):
    index = TMUtils.lang2es_index(lang)
    if not self.index_exists(index): return
    # Query source ES for the text
    query = TMDbQuery(es=self.es,
                      index=index,
                      q=q_list,
                      filter=filter,
                      limit=limit)
    for response, q in query():
      yield response
      #for hit in response:
      #  yield hit

  # Get segment by id
  def get(self, lang, id):
    index = TMUtils.lang2es_index(lang)
    if not self.index_exists(index): return

    hit = self.es.get(index=index, id=id)
    if not hit: return None
    return hit['_source']

  # Get multiple segments by id
  def mget(self, ids_lang):
    if not ids_lang: return []
    body = [{
        '_index': TMUtils.lang2es_index(lang),
        '_id' : id
      } for lang,id in ids_lang]
    hits = self.es.mget(body={'docs' : body})
    if not hits: return None
    return [hit.get('_source',None) for hit in hits['docs']]


  # Scan matching segments
  def scan(self, lang, filter = None):
    index = TMUtils.lang2es_index(lang)
    if not self.index_exists(index): return

    query = TMDbQuery(es=self.es, index = index, filter=filter)
    for hit in query.scan():
      # Build segment by querying map and target index
      yield hit

  # Scan all pivot segments
  def scan_pivot(self, pivot_lang, langs):
    index = TMUtils.lang2es_index(pivot_lang)
    if not self.index_exists(index): return

    search = Search(using=self.es, index=index)
    for lang in langs:
      search = search.query('match', target_language=lang)
    for result in search.scan():
      yield result.meta.id

  # Bulk delete segments by id
  def delete(self, lang, ids):
    index = TMUtils.lang2es_index(lang)

    actions = [{'_op_type': 'delete',
                '_id': id,
                '_index' : index,
                '_type': self.DOC_TYPE,
                } for id in ids]
    # Bulk delete
    try:
      status = helpers.bulk(self.es, actions)
    except Exception as e:
      logging.warning(e)
      return str(e)
    return status

  # Should be called after modifying the index
  def refresh(self):
    #self.indexes = self.es.indices.get_aliases() #not supported anymore
    self.indexes = self.es.indices.get_alias("*")

  def index_exists(self, index):
    return self.es.indices.exists(index)

  def get_langs(self):
    return [TMUtils.es_index2lang(l) for l in self.indexes if re.search('^tm_\w{2}$', l)]

  ############### Helper methods ###################
  def _segment2es_bulk(self, segments, ftype, op_type, f_action):
    # Add segment source and target texts to the correspondent index of ElasticSearch in a batch
    actions = []
    added_ids = set()
    for segment in segments:
      id = getattr(segment, ftype + '_id')
      if id in added_ids: continue # avoid duplicates in the same batch
      added_ids.add(id)
      index = TMUtils.lang2es_index(getattr(segment, ftype + '_language'))
      action = {'_id': id,
                '_index' : index,
                '_type' : self.DOC_TYPE,
                '_op_type': op_type,
                '_source' : f_action(segment, ftype) #self._segment2doc(segment, ftype)
                }
      actions.append(action)
    # Bulk insert
    logging.info("Bulk upsert: {}".format(actions))
    s_result = helpers.bulk(self.es, actions)
    self.refresh() # refresh list of indexes (could have been created during insert)
    return s_result

  def _segment2doc(self, segment, ftype):
    doc = {'text': getattr(segment, ftype + '_text')}
    # Optional fields (POS, tokenized)
    if hasattr(segment, ftype + '_pos'):
      doc['pos'] = getattr(segment, ftype + '_pos')

    op_ftype = 'source' if ftype == 'target' else 'target'
    # Auxiliary field to facilitate language matrix generation
    doc['target_language'] = [TMUtils.lang2short(TMUtils.str2list(getattr(segment, op_ftype + '_language'))[0])]
    doc['token_cnt'] = self.token_count(getattr(segment, ftype + '_text'), getattr(segment, ftype + '_language'))
    return doc

  def _segment2doc_upsert(self, segment, ftype):
    doc = self._segment2doc(segment, ftype)
    upsert_body = {'upsert': doc, # insert doc as is if it doesn't exist yet
            # If the doc exists, then execute this Painless script:
            # - add the target language to the list and keep only unique values (via distinct())
            'script' : 'ctx._source.target_language.add(params.language); ctx._source.target_language = ctx._source.target_language.stream().distinct().filter(Objects::nonNull).collect(Collectors.toList()); \
             if (params.pos != null) { ctx._source.pos = params.pos; }',
#             ',
            # parameters to the script
            'params' : { 'language' :  doc['target_language'],
                         'pos' : doc['pos']}
    }
    #return {'doc': doc, 'doc_as_upsert' : True }
    return upsert_body

  # Applied regular expression. tokenize and count the total of words
  def token_count(self, text, lang):

    lang = lang.split('-')[0].upper()
    if not lang in self.regex:
      try:
        self.regex[lang] = TMRegExpPreprocessor(lang)
        logging.info("Loading Regex for {}".format(lang))
      except Exception as e:
        logging.info("Unsupported Regex for {} ".format(lang))
        self.regex[lang] = lang
    if not lang in self.tokenizers:
        try:
          self.tokenizers[lang] = TMTokenizer(lang)
          logging.info("Loading Tokenizer for {}".format(lang))
        except Exception as e:
          self.tokenizers[lang] = lang
          logging.info("Unsupported Tokenizer for {}".format(lang))

    if self.regex[lang] != lang: text = TMRegexMatch.simplified_name(self.regex[lang].process(text))
    if self.tokenizers[lang] != lang: token_cnt = len((self.tokenizers[lang].tokenizer.process(text)).split(' '))
    else:
      if ' ' in text: token_cnt = len(text.split(' '))
      else: token_cnt = 1

    return token_cnt  # len((self.tokenizers[lang].tokenizer.process(TMRegexMatch.simplified_name(self.regex[lang].process(text)))).split(' '))

  def _index_template(self):
    template =  {
      "template": "tm_*",
      "settings": {
        "analysis": {
          "analyzer": {
            "folding": {
              "tokenizer": "standard",
              "filter": ["lowercase", "asciifolding"]
            }
          }
        }
      },
      "mappings" : {
        self.DOC_TYPE: {
          "properties": {
            # Field text should analyzed, text.raw shouldn't
            "text": {
              "type": "text",
              "analyzer": "folding"
            },
            "target_language": {
              "type": "keyword",
              "index": "true"
            },
            "pos": {
              "type": "keyword",
              "index": "true"
            },
            "token_cnt": {
              "type": "integer",
              "index": "true"
            }
          }
        }
      }
    }
    print(json.dumps(template))
    return template
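A hedged usage sketch for the translation-memory store above, run against a live cluster and assuming TMUtils, TMDbQuery and the tokenizer/regex helpers are importable from the surrounding project; the language codes and ids are placeholders.

# Hypothetical lookups: a single get by language/id, and a cross-language mget.
tm = TMMonoLing(hosts=['localhost:9200'])
doc = tm.get('en-GB', 'segment-id-1')
docs = tm.mget([('en-GB', 'segment-id-1'), ('es-ES', 'segment-id-2')])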
Example No. 37
0
class ESClient(object):
    def __init__(self, hosts, batchSize=1000, **kwargs):
        self.esConn = Elasticsearch(hosts, **kwargs)
        self.bulker = ListBulker()
        self.batchSize = batchSize
        self.ID_FIELD = "_id"

    def _isOk(self, response):
        return response.get('acknowledged', False)

    def createIndex(self,
                    indexName="test",
                    body=None,
                    mappings=None,
                    settings=None):
        if self.esConn.indices.exists(indexName):
            self.deleteIndex(indexName)
        return self._createIndex(indexName, body, mappings, settings)

    def createIndexIfNotExist(self,
                              indexName="test",
                              body=None,
                              mappings=None,
                              settings=None):
        if not self.esConn.indices.exists(indexName):
            return self._createIndex(indexName, body, mappings, settings)
        return True

    def _createIndex(self, indexName, body, mappings, settings):
        logging.info('Create index %s ...', indexName)
        body = self._createIndexConfig(body, mappings, settings)
        logging.debug(json.dumps(body, ensure_ascii=False, indent=4))

        response = self.esConn.indices.create(index=indexName, body=body)
        return self._isOk(response)

    def _createIndexConfig(self, body, mappings, settings):
        if not body:
            body = {}
            if settings:
                if 'settings' in settings:
                    body.update(settings)
                else:
                    body['settings'] = settings

            if mappings:
                if 'mappings' in mappings:
                    body.update(mappings)
                else:
                    body['mappings'] = mappings
        return body

    def closeIndex(self, indexName="test"):
        response = self.esConn.indices.close(index=indexName)
        return self._isOk(response)

    def openIndex(self, indexName="test"):
        response = self.esConn.indices.open(index=indexName)
        return self._isOk(response)

    def updateSetting(self, indexName="test", settings={}):
        logging.info('Update setting for index %s ...', indexName)
        self.esConn.indices.put_settings(index=indexName, body=settings)

    def deleteIndex(self, indexName="test"):
        logging.info('Delete index %s ...', indexName)
        response = self.esConn.indices.delete(indexName)
        return self._isOk(response)

    def getDocById(self, indexName, indexType, docid):
        return self.esConn.get(index=indexName, doc_type=indexType,
                               id=docid).get(
                                   '_source',
                                   self.esConn.get(index=indexName,
                                                   doc_type=indexType,
                                                   id=docid))

    def getKeysAndDocsByIds(self, indexName, indexType, docids):
        response = self.esConn.mget(index=indexName,
                                    doc_type=indexType,
                                    body={"ids": docids})
        # mget returns {'docs': [...]}; yield (id, doc) pairs, with None for misses
        for doc in response.get('docs', []):
            yield doc['_id'], doc if doc.get('found') else None

    def getDocsByIds(self, indexName, indexType, docids):
        for _, doc in self.getKeysAndDocsByIds(indexName, indexType, docids):
            yield doc

    def indexDoc(self, indexName, indexType, doc, docid=None, bulk=False):
        if bulk:
            action = {
                '_op_type': 'index',
                '_index': indexName,
                '_type': indexType,
                '_source': doc
            }
            if docid:
                action['_id'] = docid

            self.bulker.add(action)
            return self.force_bulk()
        else:
            response = self.esConn.index(index=indexName,
                                         doc_type=indexType,
                                         id=docid,
                                         body=doc)
            return 'created' in response

    def deleteDoc(self, indexName, indexType, docid, bulk=False):
        if bulk:
            self.bulker.add({
                '_op_type': 'delete',
                '_index': indexName,
                '_type': indexType,
                '_id': docid
            })

            self.force_bulk()
        else:
            self.esConn.delete(index=indexName, doc_type=indexType, id=docid)

    def deleteDocs(self, indexName, indexType, docids):
        self.delete_batch(indexName, indexType, docids)

    def delete_batch(self, indexName, indexType, docids):
        actions = self._buildDeleteActions(indexName, indexType, docids)
        success, errors = helpers.bulk(self.esConn, actions)  # @UnusedVariable
        if errors:
            logging.error("Delete batch: there are some errors %s", errors)

    def upsert_batch(self,
                     indexName,
                     indexType,
                     docs,
                     batchSize=1000,
                     idField=None):
        actions = self._buildIndexActions(indexName, indexType, docs, idField)
        success, errors = helpers.bulk(self.esConn,
                                       actions,
                                       chunk_size=batchSize)  # @UnusedVariable
        if errors:
            logging.error("Upsert batch: there are some errors %s", errors)

    @command('return_bool')
    def force_bulk(self, bulk=False):
        # Flush when explicitly forced or when the pending batch has reached batchSize
        if bulk or len(self.bulker) >= self.batchSize:
            success, errors = helpers.bulk(
                self.esConn, self.bulker.pop_all(),
                chunk_size=self.batchSize)  # @UnusedVariable
            if errors:
                logging.error("Force bulk: there are some errors %s", errors)
                return False

        return True

    def _buildDeleteActions(self, indexName, indexType, docids):
        actions = []
        for docid in docids:
            actions.append({
                '_op_type': 'delete',
                '_index': indexName,
                '_type': indexType,
                '_id': docid
            })

        return actions

    def _buildIndexActions(self, indexName, indexType, docs, idField=None):
        if type(docs) == list:
            return self._buildIndexActionsFromList(indexName, indexType, docs,
                                                   idField)
        elif type(docs) == dict:
            return self._buildIndexActionsFromDict(indexName, indexType, docs)
        else:
            return []

    def _buildIndexActionsFromList(self, indexName, indexType, docs, idField):
        actions = []
        for doc in docs:
            _id = doc[idField]
            del doc[idField]
            action = {
                '_op_type': 'index',
                '_index': indexName,
                '_type': indexType,
                '_id': _id,
                '_source': doc
            }

            actions.append(action)

        return actions

    def _buildIndexActionsFromDict(self, indexName, indexType, docs):
        actions = []
        for docid, doc in docs.items():
            actions.append({
                '_op_type': 'index',
                '_index': indexName,
                '_type': indexType,
                '_id': docid,
                '_source': doc
            })

        return actions

    def countIndexDocs(self, indexName, typeName=None):
        time.sleep(3)
        return self.esConn.count(index=indexName, doc_type=typeName)

    def search(self, indexName, typeName=None, query=None, params=None):
        ''' 
        :param indexName: list or string of indices
        :param typeName: list or string of types
        '''
        if type(indexName) == list:
            indexName = ','.join(indexName)
        if type(typeName) == list:
            typeName = ','.join(typeName)

        if params:
            return self.esConn.search(index=indexName,
                                      doc_type=typeName,
                                      body=query,
                                      params=params)
        else:
            return self.esConn.search(index=indexName,
                                      doc_type=typeName,
                                      body=query)

    def scroll(self,
               indexName,
               typeName=None,
               query=None,
               scroll='10m',
               size=1000):
        return helpers.scan(self.esConn,
                            index=indexName,
                            doc_type=typeName,
                            query=query,
                            scroll=scroll,
                            size=size)

    def existsType(self, indexName, typeName=None):
        return self.esConn.indices.exists_type(index=indexName,
                                               doc_type=typeName)

    def existsIndex(self, indexName):
        return self.esConn.indices.exists(index=indexName)

    def putMapping(self, indexName, typeName, mapping={}):
        response = self.esConn.indices.put_mapping(index=indexName,
                                                   doc_type=typeName,
                                                   body=mapping)
        return self._isOk(response)
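A short, hedged sketch of the client above in use; it assumes ListBulker and the command decorator are importable from the surrounding module, and the index/type/id values are placeholders.

# Hypothetical index-and-fetch round trip.
client = ESClient(['localhost:9200'])
client.createIndexIfNotExist('test')
client.indexDoc('test', 'doc', {'title': 'hello'}, docid='1')
print(client.getDocById('test', 'doc', '1'))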
Example No. 38
0
class EntityManager(object):
    @staticmethod
    def entity_not_found_message(en_type, ids):
        return 'Entities: "{type}" with ids: {ids} not found.'.format(
            type=en_type, ids=ids)

    def __init__(self, index='default', es_settings=None):
        if es_settings:
            self.es = Elasticsearch(**es_settings)
        else:
            self.es = Elasticsearch()
        self._index = index
        self._registry = {}

    def persist(self, entity):
        if not hasattr(entity, 'to_storage') or not hasattr(
                entity, '__getitem__') or not hasattr(entity, 'type'):
            raise TypeError(
                'entity object must have to_storage and type attributes and behave like a dict'
            )
        self._persist(entity, state=ADD)

    def remove(self, entity):
        self._persist(entity, state=REMOVE)

    def flush(self):
        actions = []
        for persisted_entity in six.itervalues(self._registry):
            if persisted_entity.is_action_needed():
                actions.append(persisted_entity)
        self._execute_callbacks(actions, 'pre')
        bulk_results = helpers.streaming_bulk(self.es,
                                              [a.stmt for a in actions])
        # TODO: checking exceptions in bulk_results
        for persisted_entity, result in zip(actions, bulk_results):
            if 'create' in result[1]:
                persisted_entity.set_id(result[1]['create']['_id'])
        for action in actions:
            action.reset_state()
        self._execute_callbacks(actions, 'post')

    def find(self, _id, _type, scope=None, **kwargs):
        params = {
            'id': _id,
            'index': self._index,
            'doc_type': _type.get_type()
        }
        if scope:
            params['_source'] = _type.get_fields(scope)
        params.update(kwargs)
        try:
            _data = self.es.get(**params)
        except TransportError as e:  # TODO: there might be other errors, like server unavailable
            raise EntityNotFound(
                self.entity_not_found_message(_type.get_type(), _id), e)
        if not _data['found']:
            raise EntityNotFound(
                self.entity_not_found_message(_type.get_type(), _id))
        source = _data['_source']
        source['id'] = _data['_id']
        entity = _type(source, scope)
        self._persist(entity, state=UPDATE)
        return entity

    def find_many(self, _ids, _type, scope=None, complete_data=True, **kwargs):
        params = {
            'body': {
                'ids': _ids
            },
            'index': self._index,
            'doc_type': _type.get_type()
        }
        if scope:
            params['_source'] = _type.get_fields(scope)
        params.update(kwargs)
        try:
            _data = self.es.mget(**params)
        except TransportError as e:  # TODO: there might be other errors, like server unavailable
            raise EntityNotFound(
                self.entity_not_found_message(_type.get_type(),
                                              ', '.join(_ids)), e)
        entities = []
        if complete_data:
            invalid_items = [
                elem['_id'] for elem in _data['docs'] if not elem['found']
            ]
            if invalid_items:
                raise EntityNotFound(
                    self.entity_not_found_message(_type.get_type(),
                                                  ', '.join(invalid_items)))
        for _entity in _data['docs']:
            if _entity['found']:
                source = _entity['_source']
                source['id'] = _entity['_id']
                entity = _type(source, scope)
                self._persist(entity, state=UPDATE)
                entities.append(entity)
        return entities

    def query(self, query, _type, scope=None, **kwargs):
        params = {}
        if scope:
            params['_source'] = _type.get_fields(scope)
        try:
            data = self.es.search(index=self._index,
                                  doc_type=_type.get_type(),
                                  body=query,
                                  **kwargs)
        except TransportError as e:
            raise RepositoryError('Transport returned error', cause=e)
        entities = []
        for record in data['hits']['hits']:
            source = record['_source']
            source['id'] = record['_id']
            source['_score'] = record['_score']
            if '_explanation' in record:
                source['_explanation'] = record['_explanation']
            entity = _type(source, scope, record.get('highlight'))
            self._persist(entity, state=UPDATE)
            entities.append(entity)
        return entities, without(['hits'],
                                 data,
                                 move_up={'hits': ['max_score', 'total']})

    def query_one(self, query, _type, scope=None, **kwargs):
        entities, meta = self.query(query, _type, scope, **kwargs)
        if len(entities) == 1:
            return entities[0]
        raise RepositoryError(
            'Expected one result, found {num}'.format(num=len(entities)))

    def clear(self):
        self._registry = {}

    def get_repository(self, repository):
        app, repository_class_name = repository.split(':')
        if app not in settings.INSTALLED_APPS:
            founded_app = [
                _app for _app in settings.INSTALLED_APPS if _app.endswith(app)
            ]
            if not founded_app:
                raise RepositoryError(
                    'Given application {app} is not in INSTALLED_APPS'.format(
                        app=app))
            app = founded_app[0]
        try:
            module = import_module(app + '.' + 'repositories')
        except ImportError:
            raise RepositoryError(
                'Given application {app} has no repositories'.format(app=app))
        if not hasattr(module, repository_class_name):
            raise RepositoryError(
                'Given repository {repository_class_name} does not exist in application {app}'
                .format(repository_class_name=repository_class_name, app=app))
        repository_class = getattr(module, repository_class_name)
        if not issubclass(repository_class, BaseRepository):
            raise RepositoryError(
                'Custom repository must be subclass of BaseRepository')
        return repository_class(self)

    def get_client(self):
        return self.es

    def _persist(self, entity, state):
        if id(entity) in self._registry:
            self._registry[id(entity)].state = state
        else:
            self._registry[id(entity)] = PersistedEntity(entity,
                                                         state=state,
                                                         index=self._index)

    def _execute_callbacks(self, actions, type):
        for persisted_entity in actions:
            if type == 'pre':
                attr = 'state'
            else:
                attr = 'last_state'
            action = {
                ADD: 'create',
                UPDATE: 'update',
                REMOVE: 'delete'
            }[getattr(persisted_entity, attr)]
            callback_func_name = type + '_' + action
            if hasattr(persisted_entity._entity, callback_func_name):
                getattr(persisted_entity._entity, callback_func_name)(self)
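
A minimal usage sketch of the read side of this repository API, assuming the EntityManager shown above and its collaborators are importable from the same module. The Article entity type, the 'blog' index, the 'list' scope, and the field names are hypothetical; they only illustrate the hooks (get_type, get_fields, to_storage, dict behaviour) the manager expects.

class Article(dict):
    """Toy entity: dict-like, with the hooks EntityManager expects (illustrative only)."""

    type = 'article'

    def __init__(self, source, scope=None, highlight=None):
        super(Article, self).__init__(source)
        self.scope = scope
        self.highlight = highlight

    @classmethod
    def get_type(cls):
        return cls.type

    @staticmethod
    def get_fields(scope):
        # restrict the returned _source for the hypothetical 'list' scope
        return ['title'] if scope == 'list' else None

    def to_storage(self):
        return {k: v for k, v in self.items() if k != 'id'}


em = EntityManager(index='blog')
hits, meta = em.query({'query': {'match': {'title': 'elasticsearch'}}},
                      Article, scope='list')
article = em.query_one({'query': {'ids': {'values': ['post-1']}}},
                       Article)  # raises RepositoryError unless exactly one hit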
Example No. 39
0
class EntityManager(object):
    @staticmethod
    def entity_not_found_message(en_type, ids):
        return 'Entities: "{type}" with ids: {ids} not found.'.format(type=en_type, ids=ids)

    def __init__(self, index='default', es_settings=None):
        if es_settings:
            self.es = Elasticsearch(**es_settings)
        else:
            self.es = Elasticsearch()
        self._index = index
        self._registry = {}

    def persist(self, entity):
        if not hasattr(entity, 'to_storage') or not hasattr(entity, '__getitem__') or not hasattr(entity, 'type'):
            raise TypeError('entity object must have to_storage and type attributes and behave like a dict')
        self._persist(entity, state=ADD)

    def remove(self, entity):
        self._persist(entity, state=REMOVE)

    def flush(self, refresh=False):
        actions = []
        for persisted_entity in six.itervalues(self._registry):
            if persisted_entity.is_action_needed():
                actions.append(persisted_entity)
        self._execute_callbacks(actions, 'pre')
        bulk_results = helpers.streaming_bulk(self.es, [a.stmt for a in actions], refresh=refresh)
        # TODO: checking exceptions in bulk_results
        for persisted_entity, result in zip(actions, bulk_results):
            if 'create' in result[1]:
                persisted_entity.set_id(result[1]['create']['_id'])
        for action in actions:
            action.reset_state()
        self._execute_callbacks(actions, 'post')

    def find(self, _id, _type, scope=None, **kwargs):
        params = {'id': _id, 'index': self._index, 'doc_type': _type.get_type()}
        if scope:
            params['_source'] = _type.get_fields(scope)
        params.update(kwargs)
        try:
            _data = self.es.get(**params)
        except TransportError as e:  # TODO: there might be other errors, e.g. server unavailable
            raise EntityNotFound(self.entity_not_found_message(_type.get_type(), _id), e)
        if not _data['found']:
            raise EntityNotFound(self.entity_not_found_message(_type.get_type(), _id))
        source = _data['_source']
        source['id'] = _data['_id']
        entity = _type(source, scope)
        self._persist(entity, state=UPDATE)
        return entity

    def find_many(self, _ids, _type, scope=None, complete_data=True, **kwargs):
        try:
            _ids = list(_ids)
        except TypeError as e:
            raise RepositoryError('Variable _ids has to be iterable', cause=e)

        params = {'body': {'ids': _ids}, 'index': self._index, 'doc_type': _type.get_type()}
        if scope:
            params['_source'] = _type.get_fields(scope)
        params.update(kwargs)
        try:
            _data = self.es.mget(**params)
        except TransportError as e:  # TODO: there might be other errors, e.g. server unavailable
            raise EntityNotFound(self.entity_not_found_message(_type.get_type(), ', '.join(_ids)), e)
        entities = []
        if complete_data:
            invalid_items = [elem['_id'] for elem in _data['docs'] if not elem['found']]
            if invalid_items:
                raise EntityNotFound(self.entity_not_found_message(_type.get_type(), ', '.join(invalid_items)))
        for _entity in _data['docs']:
            if _entity['found']:
                source = _entity['_source']
                source['id'] = _entity['_id']
                entity = _type(source, scope)
                self._persist(entity, state=UPDATE)
                entities.append(entity)
        return entities

    def query(self, query, _type, scope=None, **kwargs):
        params = {}
        if scope:
            params['_source'] = _type.get_fields(scope)
        params.update(kwargs)
        try:
            data = self.es.search(index=self._index, doc_type=_type.get_type(), body=query, **params)
        except TransportError as e:
            raise RepositoryError('Transport returned error', cause=e)
        entities = []
        for record in data['hits']['hits']:
            source = record['_source']
            source['id'] = record['_id']
            source['_score'] = record['_score']
            if '_explanation' in record:
                source['_explanation'] = record['_explanation']
            entity = _type(source, scope, record.get('highlight'))
            self._persist(entity, state=UPDATE)
            entities.append(entity)
        return entities, without(['hits'], data, move_up={'hits': ['max_score', 'total']})

    def query_one(self, query, _type, scope=None, **kwargs):
        entities, meta = self.query(query, _type, scope, **kwargs)
        if len(entities) == 1:
            return entities[0]
        raise RepositoryError('Expected one result, found {num}'.format(num=len(entities)))

    def clear(self):
        self._registry = {}

    def get_repository(self, repository):
        app, repository_class_name = repository.split(':')
        if app not in settings.INSTALLED_APPS:
            founded_app = [_app for _app in settings.INSTALLED_APPS if _app.endswith(app)]
            if not founded_app:
                raise RepositoryError('Given application {app} is not in INSTALLED_APPS'.format(app=app))
            app = founded_app[0]
        try:
            module = import_module(app + '.' + 'repositories')
        except ImportError:
            raise RepositoryError('Given application {app} has no repositories'.format(app=app))
        if not hasattr(module, repository_class_name):
            raise RepositoryError(
                'Given repository {repository_class_name} does not exist in application {app}'.format(
                    repository_class_name=repository_class_name, app=app
                ))
        repository_class = getattr(module, repository_class_name)
        if not issubclass(repository_class, BaseRepository):
            raise RepositoryError('Custom repository must be subclass of BaseRepository')
        return repository_class(self)

    def get_client(self):
        return self.es

    def _persist(self, entity, state):
        if id(entity) in self._registry:
            self._registry[id(entity)].state = state
        else:
            self._registry[id(entity)] = PersistedEntity(entity, state=state, index=self._index)

    def _execute_callbacks(self, actions, type):
        for persisted_entity in actions:
            if type == 'pre':
                attr = 'state'
            else:
                attr = 'last_state'
            action = {ADD: 'create', UPDATE: 'update', REMOVE: 'delete'}[getattr(persisted_entity, attr)]
            callback_func_name = type + '_' + action
            if hasattr(persisted_entity._entity, callback_func_name):
                getattr(persisted_entity._entity, callback_func_name)(self)
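
To round out Example No. 39, a hedged lifecycle sketch reusing the toy Article type from the sketch further above: persist() registers an entity, flush() pushes every pending action through one helpers.streaming_bulk call, and find() / find_many() read documents back with a single es.get / es.mget. It assumes PersistedEntity (not shown in the listing) maps the entity's 'id' field to the Elasticsearch _id; the 'blog' index remains illustrative.

em = EntityManager(index='blog')

draft = Article({'id': 'post-1', 'title': 'hello', 'body': 'first post'})
em.persist(draft)                 # registered with state=ADD; pre/post callbacks fire on flush
em.flush(refresh=True)            # one streaming_bulk call for every pending action

one = em.find('post-1', Article)            # single es.get
batch = em.find_many(['post-1'], Article)   # single es.mget with body={'ids': [...]}

em.remove(one)                    # registered with state=REMOVE
em.flush()                        # bulk 'delete' action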
list_docs = \
{
   "docs" : [
      {
         "_index" : "megacorp",
         "_type" :  "employee",
         "_id" :    2
      },
      {
         "_index" : "megacorp",
         "_type" :  "employee",
         "_id" :    1,
         "_source": "last_name"
      }
   ]
}
pp.pprint( es.mget(body=list_docs) )

print 'mget with simplified format'
list_docs = \
{
   "docs" : [
      {
         "_id" :    2
      },
      {
         "_id" :    1,
         "_source": "last_name"
      }
   ]
}
pp.pprint( es.mget(index="megacorp", doc_type="employee", body=list_docs) )
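
Continuing the same snippet, the mget API also accepts the shortest body form, a bare list of ids, when the index and doc_type are supplied in the call itself. A sketch in the snippet's own Python 2 style:

print 'mget with ids-only format'
list_docs = {"ids": [1, 2]}
pp.pprint( es.mget(index="megacorp", doc_type="employee", body=list_docs) )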
                            print Exception, ":", r
                            es = Elasticsearch("219.224.135.93")
                            print "retry"
                    print count_index

                if count_index % 10000 == 0:
                    ts = time.time()
                    print "%s  per  %s  second" % (count_index, ts - tb)
                    tb = ts

            else:
                exist_uid_list.append(user_id)
                count_uid += 1
                if count_uid % 1000 == 0:
                    multi_items = es.mget(index="activity",
                                          doc_type="manage",
                                          body={"ids": exist_uid_list},
                                          _source=True)['docs']
                    exist_uid_list = []
                    for m_item in multi_items:
                        m_item = m_item['_source']
                        update_item = compare_activity(item, m_item)
                        xdata = expand_index_action(update_item)
                        bulk_action.extend([xdata[0], xdata[1]])
                        count_index += 1

                        if count_index % 2000 == 0:
                            while True:
                                try:
                                    es.bulk(bulk_action,
                                            index="activity",
                                            doc_type="manage",
Example No. 42
0
class SearchEngine(object):
    def __init__(self):
        #
        serializer = JSONSerializer()
        serializer.mimetype = 'application/json'
        serializer.dumps = serializer.serialize
        serializer.loads = JSONDeserializer().deserialize
        self.es = Elasticsearch(hosts=settings.ELASTICSEARCH_HOSTS,
                                serializer=serializer,
                                **settings.ELASTICSEARCH_CONNECTION_OPTIONS)
        self.logger = logging.getLogger(__name__)

    def delete(self, **kwargs):
        """
        Deletes a document from the index
        Pass an index, doc_type, and id to delete a specific document
        Pass a body with a query dsl to delete by query

        """

        body = kwargs.pop('body', None)
        if body is not None:
            try:
                data = []
                refresh = kwargs.pop('refresh', False)
                for hit in helpers.scan(self.es, query=body, **kwargs):
                    hit['_op_type'] = 'delete'
                    data.append(hit)

                return helpers.bulk(self.es, data, refresh=refresh, **kwargs)
            except Exception as detail:
                self.logger.warning(
                    '%s: WARNING: failed to delete document by query: %s \nException detail: %s\n'
                    % (datetime.now(), body, detail))
                raise detail
        else:
            try:
                return self.es.delete(ignore=[404], **kwargs)
            except Exception as detail:
                self.logger.warning(
                    '%s: WARNING: failed to delete document: %s \nException detail: %s\n'
                    % (datetime.now(), body, detail))
                raise detail

    def delete_index(self, **kwargs):
        """
        Deletes an entire index

        """

        index = kwargs.get('index', '').strip()
        print 'deleting index : %s' % index
        return self.es.indices.delete(index=index, ignore=[400, 404])

    def search(self, **kwargs):
        """
        Search for an item in the index.
        Pass an index, doc_type, and id to get a specific document
        Pass a body with a query dsl to perform a search

        """

        body = kwargs.get('body', None)
        index = kwargs.get('index', None)
        id = kwargs.get('id', None)

        if index is None:
            raise NotImplementedError(
                "You must specify an 'index' in your call to search")

        if id:
            if isinstance(id, list):
                kwargs.setdefault('body', {'ids': kwargs.pop('id')})
                return self.es.mget(**kwargs)
            else:
                return self.es.get(**kwargs)

        ret = None
        try:
            ret = self.es.search(**kwargs)
        except Exception as detail:
            self.logger.warning(
                '%s: WARNING: search failed for query: %s \nException detail: %s\n'
                % (datetime.now(), body, detail))
            pass

        return ret

    def index_term(self, term, id, context='', options={}):
        """
        If the term is already indexed, simply increment the count and add the id of the term to the existing entry.
        If the term isn't indexed, then create a new entry.

        id: a unique id associated with the term
        context: a uuid of a concept to associate with the term to render in the ui
        options: any additional information to associate with the term

        """

        if term.strip(' \t\n\r') != '':
            already_indexed = False
            count = 1
            ids = [id]

            try:
                #_id = unicode(term, errors='ignore').decode('utf-8').encode('ascii')
                _id = uuid.uuid3(uuid.NAMESPACE_DNS,
                                 '%s%s' % (hash(term), hash(context)))
                result = self.es.get(index='term',
                                     doc_type='value',
                                     id=_id,
                                     ignore=404)

                #print 'result: %s' % result
                if result['found']:
                    ids = result['_source']['ids']
                    if id not in ids:
                        ids.append(id)
                else:
                    ids = [id]

                self.index_data('term',
                                'value', {
                                    'term': term,
                                    'context': context,
                                    'options': options,
                                    'count': len(ids),
                                    'ids': ids
                                },
                                id=_id)

            except Exception as detail:
                self.logger.warning(
                    '%s: WARNING: search failed to index term: %s \nException detail: %s\n'
                    % (datetime.now(), term, detail))
                raise detail

    def delete_terms(self, ids):
        """
        If the term is referenced more than once, simply decrement the
        count and remove the id of the deleted term from the existing index.

        If the term is only referenced once, then delete the index entry.

        """

        if not isinstance(ids, list):
            ids = [ids]

        for id in ids:
            result = self.es.search(index='term',
                                    doc_type='value',
                                    body={
                                        "query": {
                                            "filtered": {
                                                "filter": {
                                                    "terms": {
                                                        "ids": [id]
                                                    }
                                                },
                                                "query": {
                                                    "match_all": {}
                                                }
                                            }
                                        },
                                        "from": 0,
                                        "size": 10
                                    },
                                    ignore=404)

            if 'hits' in result:
                for document in result['hits']['hits']:
                    document['_source']['ids'].remove(id)
                    count = len(document['_source']['ids'])
                    if count > 0:
                        document['_source']['count'] = count
                        self.index_data('term',
                                        'value',
                                        document['_source'],
                                        id=document['_id'])
                        self.es.indices.refresh(index='term')
                    else:
                        self.delete(index='term',
                                    doc_type='value',
                                    id=document['_id'])

    def create_mapping(self,
                       index,
                       doc_type,
                       fieldname='',
                       fieldtype='string',
                       fieldindex=None,
                       body=None):
        """
        Creates an Elasticsearch body for a single field given an index name and type name

        """

        if not body:
            if fieldtype == 'geo_shape':
                body = {
                    doc_type: {
                        'properties': {
                            fieldname: {
                                'type': 'geo_shape',
                                'tree': 'geohash',
                                'precision': '1m'
                            }
                        }
                    }
                }
            else:
                fn = {'type': fieldtype}
                if fieldindex:
                    fn['index'] = fieldindex
                body = {doc_type: {'properties': {fieldname: fn}}}

        self.create_index(index=index, ignore=400)
        self.es.indices.put_mapping(index=index, doc_type=doc_type, body=body)

    def create_index(self, **kwargs):
        self.es.indices.create(**kwargs)

    def index_data(self,
                   index=None,
                   doc_type=None,
                   body=None,
                   idfield=None,
                   id=None,
                   **kwargs):
        """
        Indexes a document or list of documents into Elasticsearch

        If "id" is supplied then will use that as the id of the document

        If "idfield" is supplied then will try to find that property in the 
            document itself and use the value found for the id of the document

        """

        if not isinstance(body, list):
            body = [body]

        for document in body:
            if idfield is not None:
                if isinstance(document, dict):
                    id = document[idfield]
                else:
                    id = getattr(document, idfield)

            try:
                self.es.index(index=index,
                              doc_type=doc_type,
                              body=document,
                              id=id,
                              **kwargs)
            except Exception as detail:
                self.logger.warning(
                    '%s: WARNING: failed to index document: %s \nException detail: %s\n'
                    % (datetime.now(), document, detail))
                raise detail

    def bulk_index(self, data):
        return helpers.bulk(self.es, data, chunk_size=500, raise_on_error=True)

    def create_bulk_item(self, index, type, id, data):
        if not (self.isempty_or_none(index) or self.isempty_or_none(type)
                or self.isempty_or_none(id)):
            return [{
                "index": {
                    "_index": index,
                    "_type": type,
                    "_id": id
                }
            }, data]
        else:
            return False
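
A hypothetical usage sketch for the SearchEngine wrapper above. It assumes settings.ELASTICSEARCH_HOSTS and ELASTICSEARCH_CONNECTION_OPTIONS are configured; the 'resource'/'entity' index and doc_type names and the field values are illustrative only.

se = SearchEngine()

se.create_mapping('resource', 'entity', fieldname='name', fieldtype='string')
se.index_data(index='resource', doc_type='entity',
              body={'name': 'alpha'}, id='1')
se.index_data(index='resource', doc_type='entity',
              body=[{'pk': '2', 'name': 'beta'}], idfield='pk')

one = se.search(index='resource', doc_type='entity', id='1')          # routed to es.get
many = se.search(index='resource', doc_type='entity', id=['1', '2'])  # routed to es.mget
hits = se.search(index='resource', body={'query': {'match_all': {}}})

se.delete(index='resource', doc_type='entity', id='1')                # delete by id
se.delete(index='resource', body={'query': {'match': {'name': 'beta'}}})  # delete by query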
Example No. 43
0
                        except Exception, r:
                            print Exception,":",r
                            es = Elasticsearch("219.224.135.93")
                            print "retry"
                    print count_index

                if count_index % 10000 == 0:
                    ts = time.time()
                    print "%s  per  %s  second"  %(count_index, ts-tb)
                    tb = ts

            else:
                exist_uid_list.append(user_id)
                count_uid += 1
                if count_uid % 1000 == 0:
                    multi_items = es.mget(index="activity", doc_type="manage", body={"ids": exist_uid_list}, _source=True)['docs']
                    exist_uid_list = []
                    for m_item in multi_items:
                        m_item = m_item['_source']
                        update_item = compare_activity(item, m_item)
                        xdata = expand_index_action(update_item)
                        bulk_action.extend([xdata[0], xdata[1]])
                        count_index += 1

                        if count_index % 2000 == 0:
                            while True:
                                try:
                                    es.bulk(bulk_action, index="activity", doc_type="manage", timeout=30)
                                    bulk_action=[]
                                    break
                                except Exception, r:
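
Both truncated loops above (this one and the identical fragment before Example No. 42) follow the same pattern: buffer user ids, fetch the existing documents with one es.mget call, merge old and new, and write the results back with one es.bulk call. Below is a hedged, self-contained sketch of that pattern; the 'activity'/'manage' names match the fragment, while merge_docs() and the 'uid' field are illustrative stand-ins for compare_activity/expand_index_action and the real id field.

from elasticsearch import Elasticsearch

es = Elasticsearch()   # connection details are an assumption

def merge_docs(new_doc, old_doc):
    # stand-in for compare_activity(): keep old fields, overwrite with new ones
    merged = dict(old_doc)
    merged.update(new_doc)
    return merged

def update_in_batches(new_docs, batch_size=1000):
    buffered = []
    for doc in new_docs:
        buffered.append(doc)
        if len(buffered) < batch_size:
            continue
        ids = [d['uid'] for d in buffered]
        existing = es.mget(index='activity', doc_type='manage',
                           body={'ids': ids})['docs']
        bulk_body = []
        for new_doc, hit in zip(buffered, existing):   # mget preserves request order
            merged = merge_docs(new_doc, hit.get('_source', {}))
            bulk_body.append({'index': {'_id': new_doc['uid']}})
            bulk_body.append(merged)
        es.bulk(bulk_body, index='activity', doc_type='manage')
        buffered = []
    # flushing any final partial batch is omitted for brevity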
Example No. 44
0
class ESearch():
    def __init__(self):
        """
        Initialize class parameters
        """
        # Connection object
        self._es = None
        self._index_name = "article_data"
        self._hash_field = "URL"
        self._dict_of_duplicate_docs = {}

    def connect_to_es(self, host_name=ELASTIC_SEARCH_ENDPOINT):
        """
        Establishes a connection to the Elastic search server.
        If server if pingable, returns connection object.
        Else return None
        :return: connection-object
        """
        self._es = Elasticsearch(hosts=[host_name], timeout=60)
        # Ping the connection to check if it's alive
        if self._es.ping():
            return self._es
        return None

    def index_exists(self, index_name=None):
        if not index_name:
            index_name = self._index_name
        return self._es.indices.exists(index_name)

    def _make_mapping(self):
        """
        Creates the index with the correct mapping
        :return:
        """
        m = Mapping()
        # add fields
        m.field('Title', 'text')
        m.field('Text', 'text')
        m.field('Publish_Date',
                'date')  # date type complicates matters across websites
        m.field('URL', 'text')
        m.field('Scrape_Date',
                'date')  # date type complicates matters across websites
        m.field('Source', 'text')
        m.field('Search_Keyword', 'text')  # save list as text?
        m.field('SE_Is_Risk', 'boolean')
        m.field('GP_Is_Risk', 'boolean')
        m.field('RG_Is_Risk', 'boolean')
        m.field('SE_Risk_Rating', 'float')
        m.field('GP_Risk_Rating', 'float')
        m.field('RG_Risk_Rating', 'float')
        m.field('SE_SnP_Open', 'float')
        m.field('SE_SnP_Close', 'float')
        m.field('SE_AbbV_Open', 'float')
        m.field('SE_AbbV_Close', 'float')
        m.field('SE_XBI_Open', 'float')
        m.field('SE_XBI_Close', 'float')
        m.field('SE_SnP_Open_Plus1', 'float')
        m.field('SE_SnP_Close_Plus1', 'float')
        m.field('SE_AbbV_Open_Plus1', 'float')
        m.field('SE_AbbV_Close_Plus1', 'float')
        m.field('SE_XBI_Open_Plus1', 'float')
        m.field('SE_XBI_Close_Plus1', 'float')
        m.field('SE_SentimentScore', 'float')
        m.field('SE_SentimentPolarity', 'float')
        m.field('CompositeScore', 'float')
        m.field('RG_FDA_Warning', 'boolean')
        m.field('GP_SentimentScore', 'float')
        m.field('GP_SentimentPolarity', 'float')
        m.field('GP_Location', 'text')
        m.field('GP_Country', 'text')
        m.field('Article_references', 'float')
        m.field('Is_source_type_RG', 'boolean')
        m.field('Is_source_type_SE', 'boolean')
        m.field('Is_source_type_GP', 'boolean')

        # save the mapping into index 'my-index'
        try:
            m.save(self._index_name)
        except Exception as e:
            print("Could not save schema!", e)

    def create_index(self):
        """
        Creates the index if it doesn't exist
        :return:
        """
        # create the index if it doesn't exist
        if not self.index_exists():
            try:
                self._es.indices.create(index=self._index_name)
                self._make_mapping()
                print("Index was created:", self.index_exists())
            except Exception as e:
                print("~~~Index exists error")
                print(e)
                return -1
        else:
            print("Index already exists", self._index_name)
        return 0

    def get_index_mapping(self):
        """
        Retrieves the index mapping
        :return: Index mapping JSON object if success, -1 if error
        """
        try:
            return self._es.indices.get_mapping(index=self._index_name)
        except Exception as e:
            print("~~~Get index mapping error")
            print(e)
            return -1

    def get_count(self, search_obj=None):
        return self._es.count(index=self._index_name, body=search_obj)

    def upload_dataframe(self, df):
        """
        Uploads a dataframe into the index
        :param df: Dataframe (pandas)
        :return: 0 if success, -1 if failure
        """
        def rec_to_actions(df):
            for record in df.to_dict(orient="records"):
                yield ('{ "index" : { "_index" : "%s", "_type" : "%s" }}' %
                       (self._index_name, "_doc"))
                yield (json.dumps(record, default=int))

        if not self.index_exists():
            print("!!!INDEX DOES NOT EXIST -- RETURNING!!!")
            return -1

        try:
            # make the bulk call, and get a response
            response = self._es.bulk(rec_to_actions(df))  # return a dict
            if not response["errors"]:
                print("Records uploaded")
            else:
                print("Could not upload data ")
                print(response)
                return -1
        except Exception as e:
            print("\nERROR:", e)
            return -1

        return 0

    # Process documents returned by the current search/scroll
    def _populate_dict_of_duplicate_docs(self, hits):
        for item in hits:
            combined_key = str(item['_source'][self._hash_field])

            _id = item["_id"]
            # _Title = item["_source"]["Title"]

            hashval = hashlib.md5(combined_key.encode('utf-8')).digest()

            # If the hashval is new, then we will create a new key
            # in the dict_of_duplicate_docs, which will be
            # assigned a value of an empty array.
            # We then immediately push the _id onto the array.
            # If hashval already exists, then
            # we will just push the new _id onto the existing array
            self._dict_of_duplicate_docs.setdefault(hashval, []).append(_id)

    # Loop over all documents in the index, and populate the
    # dict_of_duplicate_docs data structure.
    def _scroll_over_all_docs(self):
        data = self._es.search(index=self._index_name,
                               scroll='1m',
                               body={"query": {
                                   "match_all": {}
                               }})

        # Get the scroll ID
        sid = data['_scroll_id']
        scroll_size = len(data['hits']['hits'])

        # Before scroll, process current batch of hits
        self._populate_dict_of_duplicate_docs(data['hits']['hits'])

        while scroll_size > 0:
            data = self._es.scroll(scroll_id=sid, scroll='2m')

            # Process current batch of hits
            self._populate_dict_of_duplicate_docs(data['hits']['hits'])

            # Update the scroll ID
            sid = data['_scroll_id']

            # Get the number of results that returned in the last scroll
            scroll_size = len(data['hits']['hits'])

    def _loop_over_hashes_and_remove_duplicates(self):
        urls_to_delete = []
        ids_to_delete = []
        # Search through the hash of doc values to see if any
        # duplicate hashes have been found
        for hashval, array_of_ids in self._dict_of_duplicate_docs.items():
            if len(array_of_ids) > 1:
                # print("********** Duplicate docs hash=%s **********" % hashval)
                # Get the documents that map to the current hashval
                matching_docs = self._es.mget(index=self._index_name,
                                              body={"ids": array_of_ids})
                # Check if the URLs are truly the same URLs
                dict_url_ids = {}
                for doc in matching_docs['docs']:
                    dict_url_ids.setdefault(doc["_source"].get("URL"),
                                            []).append(doc["_id"])
                # keep the first ID for each URL; the rest are duplicates to delete
                dict_url_ids = {
                    key: value[1:]
                    for (key, value) in dict_url_ids.items()
                }
                for i in list(dict_url_ids.keys()):
                    urls_to_delete.append(i)
                # Delete all the IDs now
                for i in list(dict_url_ids.values()):
                    ids_to_delete.extend(i)

        for u in urls_to_delete:
            print(u)

        for idd in ids_to_delete:
            try:
                del_return = self._es.delete(index=self._index_name, id=idd)
            except Exception as e:
                print(e)
                break

    def remove_duplicates(self):
        self._scroll_over_all_docs()
        self._loop_over_hashes_and_remove_duplicates()
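
A hedged driver sketch for the ESearch helper above. It assumes ELASTIC_SEARCH_ENDPOINT is defined in the surrounding module and that the 'article_data' index already exists and holds documents with a URL field, as the class expects.

searcher = ESearch()
if searcher.connect_to_es() is None:
    raise RuntimeError('Elasticsearch endpoint is not reachable')

if searcher.index_exists():
    print(searcher.get_count({'query': {'match_all': {}}}))
    # scroll all documents, hash each URL, and delete every copy after the first per URL
    searcher.remove_duplicates()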
Example No. 45
0
class AKAGraph(object):
    def __init__(
        self,
        hosts=None,
        index_name=None,
        replicas=10,
        soft_selectors=None,
        hard_selectors=None,
        hyper_edge_scorer=None,
        shards=None,
        buffer_size=20,
        conn=None,
        num_identifier_downweight=0,
        popular_identifier_downweight=0,
    ):
        '''AKAGraph provides the interface to an elastic-search backed
        probabilistic graph proximity engine

        Its main operations are:
         * add a "record" containing various types of identifiers
         * query for those records "close" to a given identifier

        :param hosts: elasticsearch hosts

        :param index_name: the elasticsearch index name to use (or create)

        :param replicas: the number of monte-carlo samples to use

        :param soft_selectors: a list of identifiers to be considered
        not globally unique

        :param hard_selectors: a list of globally unique identifiers

        :param shards: number of elasticsearch shards

        :param buffer_size: how many updates to batch before
        committing them to elasticsearch

        :param conn: Elasticsearch connection object

        :param num_identifier_downweight: records with many
        identifiers should have their identifiers bind more loosely to
        others

        :param popular_identifier_downweight: identifiers in many
        records should bind loosely

        '''

        if conn is None:
            self.conn = Elasticsearch(hosts=hosts,
                                      retry_on_timeout=True,
                                      max_retries=5)
        else:
            self.conn = conn
        self.index = index_name
        self.shards = shards
        self.buffer_size = buffer_size
        self.record_buffer = []
        self.edge_buffer = []
        self.in_context = False
        if soft_selectors is None:
            soft_selectors = default_soft_selectors
        self.soft_selectors = set(soft_selectors)
        if hard_selectors is None:
            hard_selectors = default_hard_selectors
        self.hard_selectors = set(hard_selectors)
        if hyper_edge_scorer is not None:
            self.hyper_edge_scorer = hyper_edge_scorer
        else:
            unigrams, bigrams = load_ngrams()
            self.hyper_edge_scorer = \
                lambda s: prob_username(s, unigrams, bigrams)
        self.replica_list = range(replicas)
        self.score_cutoff = .001
        self.num_identifier_downweight = num_identifier_downweight
        self.popular_identifier_downweight = popular_identifier_downweight

    def __enter__(self):
        logger.debug('in context')
        self.in_context = True
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.in_context = False
        if exc_type is None and exc_value is None and traceback is None:
            self.flush()

    def add(self, rec, analyze_and_union=True):
        '''add `rec` to ES;  must be used inside a `with` statement
        '''
        assert self.in_context, 'must use "with" statement to add docs'
        self.record_buffer.append((rec, analyze_and_union))
        if len(self.record_buffer) >= self.buffer_size:
            self.flush_records()

    def add_edge(self, IDs, strength, evidence=None):
        '''Adds an edge between all identifiers in the iterable IDs with the given strength.
        This does not create entries of type RECORD; it only performs unions in the
        UNION_FIND type. However, if records are already ingested and these ids
        correspond to the urls of those records, ingesting this way links those records
        just as a full record-based ingest would.

        If evidence is None, each call to add_edge is treated as independent evidence of
        a relationship, so calling add_edge(["A", "B"], .5) twice is equivalent to
        calling it once with strength 1 - (1 - .5)**2 = .75. By contrast,
        add_edge(["A", "B"], .5, 'foo') is idempotent.

        add_edge must be used inside a `with` statement.

        :param IDs: An iterable of identifiers to union probabilistically. These can be
                    any strings, but if they match the urls of records, then querying
                    based on those records' fields will work as expected.
        :param strength: a probability 0 < strength <= 1 with which to union all edges
                         in the IDs set
        :param evidence: fine-grained control over whether repeated calls with
                         overlapping IDs sets are treated independently. For example, if
                         you link A and B because they share the username "foo", and then
                         link B and A again because they share "foo", you want the link
                         counted only once; supply "foo" as evidence and it will be. If
                         you later link A and B because they share an email address, that
                         is independent evidence and will increase their proximity.
        '''

        assert self.in_context, 'must use "with" statement to add docs'
        self.edge_buffer.append((IDs, strength, evidence))
        if len(self.edge_buffer) >= self.buffer_size:
            self.flush_edges()

    def flush(self):
        self.flush_records()
        self.flush_edges()

    def flush_edges(self):
        # purely an efficiency hack so we hit ES less redundantly
        local_union_find = MemoryUnionFind()
        for equivs, score, score_reason in self.edge_buffer:
            logger.debug('given equivs %r with %s strength and evidence %s',
                         equivs, score, score_reason)
            self.probabilistically_unite_edges(equivs, score, score_reason,
                                               local_union_find)
        self.edge_buffer = self.edge_buffer[:0]

    def flush_records(self):
        '''Actually do the work to ingest records gathered by calls to `add`.
        All vertexes are their own roots on initial ingest; so this
        sets size to 1 iff the doc has not been ingested before.

        '''
        if not self.conn.indices.exists(index=self.index):
            self.create_index()
        logger.debug('flushing ingest buffer (size: %d)',
                     len(self.record_buffer))
        actions = []
        for rec, _ in self.record_buffer:
            actions.append({
                '_index': self.index,
                '_type': RECORD_TYPE,
                '_id': rec['url'],
                '_source': rec,
            })
            #actions.append({
            #    '_index': self.index,
            #    '_type': ROOT_SIZE_TYPE,
            #    '_id': rec['url'],
            #    '_op_type': 'update',
            #    'doc': {'size': 1}, # set initial size to 1, TODO:
            # write tests to make sure this
            # doesn't change values when
            # re-ingesting
            #    'doc_as_upsert': True,
            #})
        bulk(self.conn, actions, timeout='60s')
        # next find equivalent records via exact match, and union them

        # uh oh...
        self.sync()

        # as an efficiency hack we make a local, one-off union find so we hit ES less redundantly
        # batches are likely to have a lot of the same records to union, and we do not want
        # to tell ES about each of a set of redundant unions.  If we catch them locally, we only
        # hit ES with new stuff
        local_union_find = MemoryUnionFind()

        # record_buffer holds tuples: element [0] is the record and element [1] is
        # whether or not to union from it. Only the records flagged for unioning are
        # processed here; this supports adding records and *explicit* edges separately.
        for rec, score, score_reason, equivs in self.find_equivs(
            [buf[0] for buf in self.record_buffer if buf[1]]):
            logger.debug('%s found %d (%f) equivs for %r --> %r', score_reason,
                         len(equivs), score, rec['url'], equivs)
            equivs.add(rec['url'])
            self.probabilistically_unite_edges(equivs, score, score_reason,
                                               local_union_find)
        self.record_buffer = self.record_buffer[:0]

    def probabilistically_unite_edges(self,
                                      equivs,
                                      score,
                                      score_reason,
                                      local_union_find=None):
        if score == 1:
            equivs_len = len(equivs)
            if local_union_find:
                equivs = local_union_find.find_all_and_union(*equivs)
                logger.debug('had %s equivs, now have %s', equivs_len,
                             len(equivs))
            if len(equivs) < 1:
                return
        if score_reason:

            def include_replica(replica):
                return score == 1 or pseudorandom(score_reason,
                                                  replica) < score
        else:
            # if no reason is given, make it random
            def include_replica(replica):
                del replica
                return score == 1 or uniform_random() < score

        for replica in self.replica_list:
            if include_replica(replica):
                self.unite(*[AKANode(url, replica) for url in equivs])
        self.sync()

    def sync(self):
        '''Forces data to disk, so that data from all calls to `put` will be
        available for getting and querying.  Generally, this should
        only be used in tests.

        '''
        self.conn.indices.refresh(index=self.index)

    def analyze_clusters(self, limit=None):
        '''hunt for clusters and return them sorted by size, together with an
        indication of their overlaps:

        .. code-block:: python

            {'clusters': [{'count': ..., 'records': {...}, 'overlaps': {...}}, ...],
             'aggregate_stats': {'largest': ..., 'median': ..., 'mean': ...,
                                 'smallest': ..., 'histogram': {...}}}

        '''
        #i_recs = islice(loader(path, hard_selectors=self.hard_selectors), limit)
        clusters = []
        # consider only clusters of at least two records
        for root_url, count in self.get_all_roots(size_limit=1,
                                                  candidates_limit=limit):
            del count
            cc = list(self.connected_component(root_url))
            # The sequence of steps up to this point scans all
            # records, gathers their roots with counts of how many
            # records are under that root, then gets the CC for the
            # root... which should be the exact same set, right?  This
            # may be the source of the big clusters in DIFFEO-2305
            #assert len(cc) == count, (count, len(cc), cc)
            logger.debug('found connected component of %d: %r', len(cc), cc)
            recs = list(self.get_recs(*cc))
            overlaps = find_overlaps(recs)
            _recs = {}
            for rec in recs:
                _recs[rec['url']] = rec
            clusters.append({
                "count": len(cc),
                "records": _recs,
                "overlaps": overlaps
            })
        clusters.sort(key=itemgetter('count'), reverse=True)
        cluster_sizes = Counter()
        for cluster in clusters:
            cluster_sizes[cluster['count']] += 1
        data = {
            'clusters': clusters,
            'aggregate_stats': {
                'largest': clusters[0]['count'],
                'median': clusters[len(clusters) // 2]['count'],
                'mean': sum(cluster['count'] for cluster in clusters) / len(clusters),
                'smallest': clusters[-1]['count'],
                'histogram': dict(cluster_sizes),
            }
        }
        return data

    def find_equivs(self, records):
        '''For an iterable of `records`, yield tuples of `(record, score,
        equivs)`, where a `record` from `records` might appear in
        multiple of the yielded.

        '''
        queries = []
        scores = []
        rec_pointers = []  # carries a pointer to a record for each query
        for rec in records:
            # compute score multiplies for this record
            weight = 1.0
            if self.num_identifier_downweight:
                count = sum([
                    len(values) for key, values in rec.iteritems() if
                    (key in self.hard_selectors or key in self.soft_selectors)
                ])
                weight = math.exp(-self.num_identifier_downweight *
                                  (count - 1))
                logger.debug('weight = %f, %d, %s', weight, count, rec['url'])

            # first we gather one query for all hard selectors
            hard_or_query = []
            for key, values in rec.iteritems():
                if key in self.hard_selectors:
                    for v in values:
                        hard_or_query.append({'term': {key: v}})
            if hard_or_query:
                query = {
                    "query": {
                        "constant_score": {
                            "filter": {
                                "bool": {
                                    "should": hard_or_query,
                                    "must_not": {
                                        "ids": {
                                            "values": [rec["url"]]
                                        }
                                    },
                                }
                            }
                        }
                    }
                }
                queries.append({
                    'index': self.index,
                    'type': RECORD_TYPE,
                    '_source_include': []
                })
                queries.append(query)
                scores.append((weight, json.dumps(hard_or_query)))
                rec_pointers.append(rec)
            else:
                logger.debug('skipping because no hard identifiers')
            # next, we make separate queries for each soft selector
            if not self.hyper_edge_scorer or len(self.replica_list) == 1:
                continue
            for key, values in rec.iteritems():
                if key not in self.soft_selectors: continue
                for v in values:
                    if not v:
                        continue
                    query = {
                        "query": {
                            "constant_score": {
                                "filter": {
                                    "bool": {
                                        "should": [{
                                            'term': {
                                                key: v
                                            }
                                        }],
                                        "must_not": {
                                            "ids": {
                                                "values": [rec["url"]]
                                            }
                                        },
                                    }
                                }
                            }
                        }
                    }
                    score = self.hyper_edge_scorer(v)
                    if score > self.score_cutoff:
                        logger.debug('soft selector score %.3f for %r', score,
                                     v)
                        queries.append({
                            'index': self.index,
                            'type': RECORD_TYPE,
                            '_source_include': []
                        })
                        queries.append(query)
                        scores.append((score * weight, v))
                        rec_pointers.append(rec)

        # helper function for stripping down to just the URL
        def hits_generator(hits):
            for hit in hits['hits']['hits']:
                yield hit['_id']

        # now loop until we get answers for all the queries
        cursor = 0
        while queries:
            res = self.conn.msearch(body=queries)
            for hits in res['responses']:
                # drop the header/body pair for this answered query; the matching
                # record and score are tracked via `cursor`
                queries.pop(0)
                queries.pop(0)
                record = rec_pointers[cursor]
                score, score_reason = scores[cursor]

                # revise_score
                cursor += 1
                if 'error' in hits:
                    # need to run msearch again, starting with the query after the failed one
                    if 'queue capacity' not in hits['error']:
                        logger.warn("Error getting equivs for %s: %s", record,
                                    hits['error'])
                    break
                else:
                    hits_set = set(hits_generator(hits))
                    if hits_set:
                        if self.score_cutoff < score < 1:
                            logger.debug("SOFT: %d, %s", score, score_reason)
                        if self.popular_identifier_downweight:
                            score = score * math.exp(
                                -self.popular_identifier_downweight *
                                (len(hits_set) - 1))

                        yield (record, score, score_reason, hits_set)

    def get_recs(self, *urls):
        '''get records one or more for `urls`
        '''
        if not urls:
            raise Exception('called get_recs with empty list')
        resp = self.conn.mget(index=self.index,
                              doc_type=RECORD_TYPE,
                              body={'ids': urls})
        for rec in resp['docs']:
            if not rec['found']:
                yield {"url": rec['_id']}
                #raise KeyError('missing: %r' % rec['_id'])
            else:
                yield rec['_source']

    def get_all_urls(self, limit=None):
        '''get all urls in the index
        '''
        res = scan(self.conn,
                   index=self.index,
                   doc_type=RECORD_TYPE,
                   _source_include=[],
                   query={'query': {
                       'match_all': {}
                   }})
        for item in islice(res, limit):
            yield item['_id']

    def find_urls_by_selector(self, selector, use_soft=True):
        if not self.conn.indices.exists(index=self.index):
            self.create_index()
        or_query = [{'term': {'url': selector}}]
        for key in self.hard_selectors:
            or_query.append({'term': {key: selector}})
        if use_soft:
            for key in self.soft_selectors:
                or_query.append({'term': {key: selector}})
            logger.debug('including soft_selectors: %r', self.soft_selectors)
        query = {
            "query": {
                "bool": {
                    "should": or_query,
                }
            }
        }
        # logger.debug(json.dumps(query, indent=4, sort_keys=True))
        try:
            res = self.conn.search(index=self.index,
                                   doc_type=RECORD_TYPE,
                                   _source_include=[],
                                   body=query)
            '''
            body={
                'query': {
                    'multi_match': {
                        'query': selector,
                        'type': 'cross_fields',
                        # TODO: blend soft_selectors into this
                        'fields': self.hard_selectors,
                        }
                    }
                })
            '''
            visited_urls = set()
            for hit in res['hits']['hits']:
                # logger.debug(hit['_score'])
                url = hit['_id']
                if url not in visited_urls:
                    visited_urls.add(url)
                    yield url
        except NotFoundError, exc:
            logger.warn('akagraph indexes do not exist yet: %s', exc)
            return