def export_emails_archive(data_set_id, email_ids=["f9c9c59a-7fe8-11e5-bb05-08002705cb99"]):
    cherrypy.log("email.get_attachments_sender(index=%s, attachment_id=%s)" % (data_set_id, email_ids))
    if not data_set_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing index")
    # if not email:
    #     return tangelo.HTTPStatusCode(400, "invalid service call - missing attachment_id")

    # elasticsearch.exceptions.ConnectionTimeout: ConnectionTimeout caused by -
    # ReadTimeoutError(HTTPConnectionPool(host='10.1.70.143', port=9200):
    # Read timed out. (read timeout=10))
    es = Elasticsearch([{"host": "10.1.70.143", "port": 9200}], request_timeout=60)

    # TODO can implement with multiple doc_types and combine attachments in
    emails = es.mget(index=data_set_id, doc_type="emails",
                     body={"docs": [{"_id": id} for id in email_ids]})

    # TODO filename
    filename = "export.tar.gz"
    tangelo.content_type("application/x-gzip")
    header("Content-Disposition", 'attachment; filename="{}"'.format(filename))

    string_buffer = cStringIO.StringIO()
    tar = tarfile.open(mode='w:gz', fileobj=string_buffer)

    # Add each email to the tar
    for email_source in emails["docs"]:
        email = email_source["_source"]

        tarinfo_parent = tarfile.TarInfo(name=email["id"])
        tarinfo_parent.type = tarfile.DIRTYPE
        tarinfo_parent.mode = 0755
        tarinfo_parent.mtime = time.time()
        tar.addfile(tarinfo_parent)

        tarinfo = tarfile.TarInfo(email["id"] + "/" + email["id"] + ".json")
        # TODO -- email transformation
        data_string = json.dumps(email)
        fobj = cStringIO.StringIO(data_string)
        tarinfo.size = len(data_string)
        tarinfo.mode = 0644
        tarinfo.mtime = time.time()
        tar.addfile(tarinfo, fobj)

        # Get the attachments
        if email["attachments"]:
            attachments = es.mget(index=data_set_id, doc_type="attachments",
                                  body={"docs": [{"_id": attch["guid"]} for attch in email["attachments"]]})
            for attachment_source in attachments["docs"]:
                attachment = attachment_source["_source"]
                filename = attachment["filename"]
                attch_data = str(base64.b64decode(attachment["contents64"]))

                tarinfo_attch = tarfile.TarInfo(email["id"] + "/" + filename)
                tarinfo_attch.size = len(attch_data)
                tarinfo_attch.mode = 0644
                tarinfo_attch.mtime = time.time()
                tar.addfile(tarinfo_attch, cStringIO.StringIO(attch_data))
    tar.close()
    return string_buffer.getvalue()
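# For reference, a minimal sketch of the mget request/response shape consumed
# above; host, index, and ids are placeholders, and documents that were not
# found come back with "found": false and no "_source".
from elasticsearch import Elasticsearch

es = Elasticsearch([{"host": "localhost", "port": 9200}])
resp = es.mget(index="emails", doc_type="emails",
               body={"docs": [{"_id": "id-1"}, {"_id": "id-2"}]})
found = {doc["_id"]: doc["_source"] for doc in resp["docs"] if doc.get("found")}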
def esidfileconsumegenerator(host=None, port=9200, index=None, type=None, body=None,
                             source=True, source_exclude=None, source_include=None,
                             idfile=None, headless=False, chunksize=1000, timeout=10):
    if os.path.isfile(idfile):
        ids = list()
        notfound_ids = set()
        with open(idfile, "r") as inp:
            for ppn in inp:
                _id = ppn.rstrip()
                ids.append(_id)
        if not source:
            source = True
        tracer = logging.getLogger('elasticsearch')
        tracer.setLevel(logging.WARNING)
        tracer.addHandler(logging.FileHandler('errors.txt'))
        es = Elasticsearch([{'host': host}], port=port, timeout=timeout,
                           max_retries=10, retry_on_timeout=True)
        _ids = set()
        try:
            # consume ids from the back of the list so that whatever is still
            # in `ids` on failure gets written back to the idfile below
            # (iterating the list while popping from it would skip half the ids)
            while ids:
                _ids.add(ids.pop())
                if len(_ids) >= chunksize:
                    for doc in es.mget(index=index, doc_type=type,
                                       body={'ids': list(_ids)},
                                       _source_include=source_include,
                                       _source_exclude=source_exclude,
                                       _source=source).get("docs"):
                        if headless:
                            yield doc.get("_source")
                        else:
                            yield doc
                    _ids.clear()
            if len(_ids) > 0:
                for doc in es.mget(index=index, doc_type=type,
                                   body={'ids': list(_ids)},
                                   _source_include=source_include,
                                   _source_exclude=source_exclude,
                                   _source=source).get("docs"):
                    if headless:
                        yield doc.get("_source")
                    else:
                        yield doc
                _ids.clear()
        except exceptions.NotFoundError:
            # a set cannot be add()-ed to a set; merge element-wise instead
            notfound_ids.update(_ids)
        else:
            os.remove(idfile)
        finally:
            ids += notfound_ids
            with open(idfile, "w") as outp:
                for _id in ids:
                    print(_id, file=outp)
def _get_es_docs_by_ids(self, docs_ids: List[str]):
    """
    Retrieve several documents from an ElasticSearch index.

    :param docs_ids: ids of the documents to retrieve
    :return: a list of tuples <doc_uri: document_dict>
    """
    if not docs_ids:
        return []
    elastic = Elasticsearch(self._config.es_host)
    return [(doc['_id'], doc['_source'])
            for doc in elastic.mget(body={'ids': docs_ids}, index='dbpedia')['docs']
            if '_source' in doc]
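# For reference: mget accepts both body forms seen across these examples --
# the "ids" shorthand used above, and the "docs" form, which allows
# per-document options such as source filtering. Ids below are placeholders.
body_ids = {"ids": ["doc-1", "doc-2"]}
body_docs = {"docs": [{"_id": "doc-1"}, {"_id": "doc-2", "_source": ["title"]}]}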
class ESRetrieve(BatchStage):
    def __init__(self, es_hosts: str, es_indices: str):
        super().__init__(size=10, timeout=5)
        self._es_indices = es_indices.strip().split(",")
        self._es_hosts = es_hosts
        self._es_client = None
        self._retrieve = None

    def on_start(self):
        self._es_client = Elasticsearch(self._es_hosts)
        # use Elasticsearch mget when a single concrete index (not an alias)
        # is specified; aliases may fan out to several indices
        if len(self._es_indices) == 1 and not self._es_client.indices.exists_alias(
                self._es_indices):
            self._retrieve = self._mget
        else:
            self._retrieve = self._search

    # regular bound methods: the original marked these @staticmethod while
    # still taking self, which forced callers to pass self explicitly
    def _mget(self, items: Sequence[DataItem]) -> Sequence[DataItem]:
        body = {"docs": [{"_id": item.payload["_id"]} for item in items]}
        resp = self._es_client.mget(body=body, index=self._es_indices)
        for i, doc in enumerate(resp["docs"]):
            if "error" not in doc:
                items[i].payload.update(doc)
        return items

    def _search(self, items: Sequence[DataItem]) -> Sequence[DataItem]:
        query = {"query": {"ids": {"values": [item.payload["_id"] for item in items]}}}
        resp = self._es_client.search(body=query, index=self._es_indices)
        for i, doc in enumerate(resp["hits"]["hits"]):
            items[i].payload.update(doc)
        return items

    def process_batch(self, items: Sequence[DataItem]) -> Sequence[DataItem]:
        return self._retrieve(items)
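# Hypothetical wiring of the stage above; BatchStage and DataItem come from
# the surrounding pipeline framework, so the call pattern here is an assumption.
stage = ESRetrieve(es_hosts="localhost:9200", es_indices="articles")
stage.on_start()  # picks _mget for a single concrete index, _search otherwise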
class SearchEngine(object):
    def __init__(self, prefix=settings.ELASTICSEARCH_PREFIX):
        serializer = JSONSerializer()
        serializer.mimetype = 'application/json'
        serializer.dumps = serializer.serialize
        serializer.loads = JSONDeserializer().deserialize
        self.es = Elasticsearch(hosts=settings.ELASTICSEARCH_HOSTS,
                                serializer=serializer,
                                **settings.ELASTICSEARCH_CONNECTION_OPTIONS)
        self.logger = logging.getLogger(__name__)
        self.prefix = prefix.lower()

    def _add_prefix(self, *args, **kwargs):
        if args:
            index = args[0].strip()
        else:
            index = kwargs.get('index', '').strip()
        if index is None or index == '':
            raise NotImplementedError("Elasticsearch index not specified.")
        prefix = '%s_' % self.prefix.strip() if self.prefix and self.prefix.strip() != '' else ''
        index = '%s%s' % (prefix, index)
        if args:
            return index
        else:
            return dict(kwargs, index=index)

    def delete(self, **kwargs):
        """
        Deletes a document from the index
        Pass an index, doc_type, and id to delete a specific document
        Pass a body with a query dsl to delete by query
        """
        kwargs = self._add_prefix(**kwargs)
        body = kwargs.pop('body', None)
        if body is not None:
            try:
                data = []
                refresh = kwargs.pop('refresh', False)
                for hit in helpers.scan(self.es, query=body, **kwargs):
                    hit['_op_type'] = 'delete'
                    data.append(hit)
                return helpers.bulk(self.es, data, refresh=refresh, **kwargs)
            except Exception as detail:
                try:
                    # ignore 404 errors (index_not_found_exception)
                    if detail.status_code == 404:
                        pass
                except:
                    self.logger.warning(
                        '%s: WARNING: failed to delete document by query: %s \nException detail: %s\n'
                        % (datetime.now(), body, detail))
                    raise detail
        else:
            try:
                return self.es.delete(ignore=[404], **kwargs)
            except Exception as detail:
                self.logger.warning(
                    '%s: WARNING: failed to delete document: %s \nException detail: %s\n'
                    % (datetime.now(), body, detail))
                raise detail

    def delete_index(self, **kwargs):
        """
        Deletes an entire index
        """
        kwargs = self._add_prefix(**kwargs)
        print 'deleting index : %s' % kwargs.get('index')
        return self.es.indices.delete(ignore=[400, 404], **kwargs)

    def search(self, **kwargs):
        """
        Search for an item in the index.
        Pass an index, doc_type, and id to get a specific document
        Pass a body with a query dsl to perform a search
        """
        kwargs = self._add_prefix(**kwargs)
        body = kwargs.get('body', None)
        id = kwargs.get('id', None)
        if id:
            if isinstance(id, list):
                kwargs.setdefault('body', {'ids': kwargs.pop('id')})
                return self.es.mget(**kwargs)
            else:
                return self.es.get(**kwargs)
        ret = None
        try:
            ret = self.es.search(**kwargs)
        except Exception as detail:
            self.logger.warning(
                '%s: WARNING: search failed for query: %s \nException detail: %s\n'
                % (datetime.now(), body, detail))
        return ret

    def create_mapping(self, index, doc_type, fieldname='', fieldtype='string',
                       fieldindex=None, body=None):
        """
        Creates an Elasticsearch body for a single field given an index name and type name
        """
        index = self._add_prefix(index)
        if not body:
            if fieldtype == 'geo_shape':
                body = {doc_type: {'properties': {fieldname: {
                    'type': 'geo_shape', 'tree': 'geohash', 'precision': '1m'}}}}
            else:
                fn = {'type': fieldtype}
                if fieldindex:
                    fn['index'] = fieldindex
                body = {doc_type: {'properties': {fieldname: fn}}}
        self.es.indices.create(index=index, ignore=400)
        self.es.indices.put_mapping(index=index, doc_type=doc_type, body=body)
        print 'creating index : %s/%s' % (index, doc_type)

    def create_index(self, **kwargs):
        kwargs = self._add_prefix(**kwargs)
        self.es.indices.create(**kwargs)
        print 'creating index : %s' % kwargs.get('index', '')

    def index_data(self, index=None, doc_type=None, body=None, idfield=None,
                   id=None, **kwargs):
        """
        Indexes a document or list of documents into Elasticsearch

        If "id" is supplied then will use that as the id of the document

        If "idfield" is supplied then will try to find that property in the
        document itself and use the value found for the id of the document
        """
        index = self._add_prefix(index)
        if not isinstance(body, list):
            body = [body]
        for document in body:
            if idfield is not None:
                if isinstance(document, dict):
                    id = document[idfield]
                else:
                    id = getattr(document, idfield)
            try:
                self.es.index(index=index, doc_type=doc_type, body=document, id=id)
            except Exception as detail:
                self.logger.warning(
                    '%s: WARNING: failed to index document: %s \nException detail: %s\n'
                    % (datetime.now(), document, detail))
                raise detail

    def bulk_index(self, data, **kwargs):
        return helpers.bulk(self.es, data, **kwargs)

    def create_bulk_item(self, op_type='index', index=None, doc_type=None,
                         id=None, data=None):
        return {
            '_op_type': op_type,
            '_index': self._add_prefix(index),
            '_type': doc_type,
            '_id': id,
            '_source': data
        }

    def count(self, **kwargs):
        kwargs = self._add_prefix(**kwargs)
        count = self.es.count(**kwargs)
        if count is not None:
            return count['count']
        else:
            return None

    def BulkIndexer(outer_self, batch_size=500, **kwargs):
        class _BulkIndexer(object):
            def __init__(self, **kwargs):
                self.queue = []
                self.batch_size = kwargs.pop('batch_size', 500)
                self.kwargs = kwargs

            def add(self, op_type='index', index=None, doc_type=None, id=None, data=None):
                doc = {
                    '_op_type': op_type,
                    '_index': outer_self._add_prefix(index),
                    '_type': doc_type,
                    '_id': id,
                    '_source': data
                }
                self.queue.append(doc)
                if len(self.queue) >= self.batch_size:
                    outer_self.bulk_index(self.queue, **self.kwargs)
                    del self.queue[:]  # clear out the array

            def close(self):
                outer_self.bulk_index(self.queue, **self.kwargs)

            def __enter__(self, **kwargs):
                return self

            def __exit__(self, type, value, traceback):
                # do not return a truthy value here: returning the bulk result
                # tuple would silently suppress exceptions
                self.close()

        return _BulkIndexer(batch_size=batch_size, **kwargs)
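# How search() above dispatches on the id argument, sketched (index and id
# values are placeholders; the settings module must already be configured):
engine = SearchEngine(prefix='demo')
single = engine.search(index='resource', doc_type='entity', id='abc')        # -> es.get
several = engine.search(index='resource', doc_type='entity', id=['a', 'b'])  # -> es.mget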
class ElasticsearchAPI:
    """
    Each query will have its own index based on the query name
    (index_name = query.name). Doc type = query_name, to make it possible to
    set a mapping; mappings are set per doc_type. All rows from a Query should
    look the same no matter the source, which puts the data from all the
    servers in the same index: comparable, and fewer indexes.
    """

    def __init__(self, host, port, user, password):
        logger.info("Connecting to ES %s..." % host)
        self.es = Elasticsearch(hosts=[
            {'host': host, 'port': port},
        ])
        logger.debug(self.es.info())

    @staticmethod
    def from_config_manager(config_manager):
        config = config_manager.get_config('Elasticsearch')
        # note argument order: the constructor expects (host, port, user, password)
        return ElasticsearchAPI(config['host'], config['port'],
                                config['username'], config['password'])

    def consume_all(self, items, doc_type, index_name, id_column_name):
        print('Pushing %s docs to index: %s' % (len(items), index_name))
        actions = []
        for doc in items:
            action = {
                "_id": doc[id_column_name],
                "_index": index_name,
                "_type": doc_type,
                "_source": doc,
            }
            actions.append(action)
        helpers.bulk(self.es, actions)
        self.es.indices.refresh()
        return len(items)

    def find_ids(self, ids, doc_type, index_name):
        body = {"ids": ids}
        result = self.es.mget(index=index_name, doc_type=doc_type, body=body)
        # print(result)
        if len(result) > 0:
            return [r['_id'] for r in result['docs'] if r['found'] is True]
        return []

    def init_indexes_for(self, sources):
        for source in sources:
            self.init_index_for_source(source)

    def set_mapping(self, doc_type, index_name, mapping):
        self.es.indices.put_mapping(index=index_name, doc_type=doc_type, body=mapping)

    def delete_index(self, index_name):
        print('Truncating data in index: %s' % index_name)
        self.es.indices.delete(index=index_name, ignore=404)

    def create_index(self, index_name):
        print('Creating index %s' % index_name)
        self.es.indices.create(index_name, ignore=400)
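# Hedged usage sketch for find_ids above: the ids-body form of mget returns a
# "docs" list whose entries carry a "found" flag. Connection values and names
# below are placeholders.
api = ElasticsearchAPI('localhost', 9200, 'user', 'secret')
existing = api.find_ids(['id-1', 'id-2'], doc_type='rows', index_name='my_query')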
def esidfilegenerator(host=None, port=9200, index=None, type=None, body=None,
                      source=True, source_exclude=None, source_include=None,
                      idfile=None, headless=False, chunksize=1000, timeout=10):
    if os.path.isfile(idfile):
        if not source:
            source = True
        tracer = logging.getLogger('elasticsearch')
        tracer.setLevel(logging.WARNING)
        tracer.addHandler(logging.FileHandler('errors.txt'))
        es = Elasticsearch([{'host': host}], port=port, timeout=timeout,
                           max_retries=10, retry_on_timeout=True)
        ids = set()
        with open(idfile, "r") as inp:
            for ppn in inp:
                _id = ppn.rstrip()
                ids.add(_id)
                if len(ids) >= chunksize:
                    if body and "query" in body and "match" in body["query"]:
                        searchbody = {"query": {"bool": {"must": [
                            {"match": body["query"]["match"]}, {}]}}}
                        for _id in ids:
                            searchbody["query"]["bool"]["must"][1] = {"match": {"_id": _id}}
                            # eprint(json.dumps(searchbody))
                            for doc in esgenerator(host=host, port=port, index=index,
                                                   type=type, body=searchbody,
                                                   source=source,
                                                   source_exclude=source_exclude,
                                                   source_include=source_include,
                                                   headless=False, timeout=timeout,
                                                   verbose=False):
                                if headless:
                                    yield doc.get("_source")
                                else:
                                    yield doc
                        ids.clear()
                    else:
                        searchbody = {'ids': list(ids)}
                        try:
                            for doc in es.mget(index=index, doc_type=type,
                                               body=searchbody,
                                               _source_include=source_include,
                                               _source_exclude=source_exclude,
                                               _source=source).get("docs"):
                                if headless:
                                    yield doc.get("_source")
                                else:
                                    yield doc
                            ids.clear()
                        except exceptions.NotFoundError:
                            continue
        if len(ids) > 0:
            if body and "query" in body and "match" in body["query"]:
                searchbody = {"query": {"bool": {"must": [
                    {"match": body["query"]["match"]}, {}]}}}
                for _id in ids:
                    searchbody["query"]["bool"]["must"][1] = {"match": {"_id": _id}}
                    # eprint(json.dumps(searchbody))
                    for doc in esgenerator(host=host, port=port, index=index,
                                           type=type, body=searchbody, source=source,
                                           source_exclude=source_exclude,
                                           source_include=source_include,
                                           headless=False, timeout=timeout,
                                           verbose=False):
                        if headless:
                            yield doc.get("_source")
                        else:
                            yield doc
                ids.clear()
            else:
                searchbody = {'ids': list(ids)}
                try:
                    for doc in es.mget(index=index, doc_type=type, body=searchbody,
                                       _source_include=source_include,
                                       _source_exclude=source_exclude,
                                       _source=source).get("docs"):
                        if headless:
                            yield doc.get("_source")
                        else:
                            yield doc
                    ids.clear()
                except exceptions.NotFoundError:
                    pass
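# Assumed invocation of the generator above: ids.txt holds one id per line and
# documents stream back in chunksize-sized mget batches; all values here are
# placeholders.
for doc in esidfilegenerator(host='localhost', index='kxp', type='mrc',
                             idfile='ids.txt', headless=True, chunksize=500):
    print(doc)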
class ES(object):
    def __init__(self, es_url, http_auth=None):
        self.es_url = es_url
        self.es = Elasticsearch([es_url], show_ssl_warnings=False,
                                http_auth=http_auth, retry_on_timeout=True)

    def load_data(self, index, doc_type, doc, doc_id):
        # import certifi
        #
        # es = Elasticsearch(
        #     ['localhost', 'otherhost'],
        #     http_auth=('user', 'secret'),
        #     port=443,
        #     use_ssl=True
        # )
        try:
            return self.es.index(index=index, doc_type=doc_type, body=doc, id=doc_id)
        except Exception:
            # try once more (a single direct retry rather than unbounded recursion)
            try:
                return self.es.index(index=index, doc_type=doc_type, body=doc, id=doc_id)
            except Exception as e:
                print e
                return None

    def create_index(self, index_name, es_mapping):
        command = self.es_url + "/" + index_name
        return requests.put(command, data=es_mapping, verify=False)

    def create_alias(self, alias_name, indices):
        url = self.es_url + "/_aliases"
        command = {
            "actions": [
                {"remove": {"index": "*", "alias": alias_name}},
                {"add": {"indices": indices, "alias": alias_name}}
            ]
        }
        return requests.post(url, data=json.dumps(command))

    def load_bulk(self, index, doc_type, doc_id, docs):
        actions = [{
            "_index": index,
            "_type": doc_type,
            "_id": doc[doc_id],
            # _source must be the document itself; a set containing its JSON
            # dump is not serializable
            "_source": doc,
        } for doc in docs]
        helpers.bulk(self.es, actions)

    def retrieve_doc(self, index, doc_type, ids):
        if not isinstance(ids, list):
            ids = [ids]
        query = {"query": {"ids": {"values": ids}}}
        print query
        try:
            return self.es.search(index=index, doc_type=doc_type, body=query,
                                  filter_path=['hits.hits._source'])
        except:
            # try once more
            try:
                return self.es.search(index=index, doc_type=doc_type, body=query,
                                      filter_path=['hits.hits._source'])
            except Exception as e:
                print e
                return None

    def search(self, index, doc_type, query, ignore_no_index=False, **other_params):
        # print query
        try:
            return self.es.search(index=index, doc_type=doc_type, body=query,
                                  **other_params)
        except TransportError as e:
            if e.error != 'index_not_found_exception' and ignore_no_index:
                print e
        except Exception as e:
            print e

    def mget(self, index, doc_type, body):
        try:
            return self.es.mget(index=index, doc_type=doc_type, body=body)
        except TransportError as e:
            if e.error != 'index_not_found_exception':
                print e
        except Exception as e:
            print e
class SearchEngine(object):
    def __init__(self, **kwargs):
        serializer = JSONSerializer()
        serializer.mimetype = "application/json"
        serializer.dumps = serializer.serialize
        serializer.loads = JSONDeserializer().deserialize
        self.prefix = kwargs.pop("prefix", "").lower()
        self.es = Elasticsearch(serializer=serializer, **kwargs)
        self.logger = logging.getLogger(__name__)

    def _add_prefix(self, *args, **kwargs):
        if args:
            index = args[0].strip()
        else:
            index = kwargs.get("index", "").strip()
        if index is None or index == "":
            raise NotImplementedError("Elasticsearch index not specified.")
        prefix = "%s_" % self.prefix.strip() if self.prefix and self.prefix.strip() != "" else ""
        ret = []
        for idx in index.split(","):
            ret.append("%s%s" % (prefix, idx))
        index = ",".join(ret)
        if args:
            return index
        else:
            return dict(kwargs, index=index)

    def delete(self, **kwargs):
        """
        Deletes a document from the index
        Pass an index and id to delete a specific document
        Pass a body with a query dsl to delete by query
        """
        kwargs = self._add_prefix(**kwargs)
        body = kwargs.pop("body", None)
        if body is not None:
            try:
                data = []
                refresh = kwargs.pop("refresh", False)
                for hit in helpers.scan(self.es, query=body, **kwargs):
                    hit["_op_type"] = "delete"
                    data.append(hit)
                return helpers.bulk(self.es, data, refresh=refresh, **kwargs)
            except Exception as detail:
                try:
                    # ignore 404 errors (index_not_found_exception)
                    if detail.status_code == 404:
                        pass
                except:
                    self.logger.warning(
                        "%s: WARNING: failed to delete document by query: %s \nException detail: %s\n"
                        % (datetime.now(), body, detail))
                    raise detail
        else:
            try:
                return self.es.delete(ignore=[404], **kwargs)
            except Exception as detail:
                self.logger.warning(
                    "%s: WARNING: failed to delete document: %s \nException detail: %s\n"
                    % (datetime.now(), body, detail))
                raise detail

    def delete_index(self, **kwargs):
        """
        Deletes an entire index
        """
        kwargs = self._add_prefix(**kwargs)
        print("deleting index : %s" % kwargs.get("index"))
        return self.es.indices.delete(ignore=[400, 404], **kwargs)

    def search(self, **kwargs):
        """
        Search for an item in the index.
        Pass an index and id to get a specific document
        Pass a body with a query dsl to perform a search
        """
        kwargs = self._add_prefix(**kwargs)
        body = kwargs.get("body", None)
        id = kwargs.get("id", None)
        if id:
            if isinstance(id, list):
                kwargs.setdefault("body", {"ids": kwargs.pop("id")})
                return self.es.mget(**kwargs)
            else:
                return self.es.get(**kwargs)
        ret = None
        try:
            ret = self.es.search(**kwargs)
        except Exception as detail:
            self.logger.warning(
                "%s: WARNING: search failed for query: %s \nException detail: %s\n"
                % (datetime.now(), body, detail))
        return ret

    def create_mapping(self, index, fieldname="", fieldtype="string",
                       fieldindex=None, body=None):
        """
        Creates an Elasticsearch body for a single field given an index name and type name
        """
        index = self._add_prefix(index)
        if not body:
            if fieldtype == "geo_shape":
                body = {"_doc": {"properties": {fieldname: {
                    "type": "geo_shape", "tree": "geohash", "precision": "1m"}}}}
            else:
                fn = {"type": fieldtype}
                if fieldindex:
                    fn["index"] = fieldindex
                body = {"_doc": {"properties": {fieldname: fn}}}
        self.es.indices.create(index=index, ignore=400)
        self.es.indices.put_mapping(index=index, doc_type="_doc", body=body,
                                    include_type_name=True)
        print("creating index : %s" % index)

    def create_index(self, **kwargs):
        kwargs = self._add_prefix(**kwargs)
        kwargs["include_type_name"] = True
        self.es.indices.create(ignore=400, **kwargs)
        print("creating index : %s" % kwargs.get("index", ""))

    def index_data(self, index=None, body=None, idfield=None, id=None, **kwargs):
        """
        Indexes a document or list of documents into Elasticsearch

        If "id" is supplied then will use that as the id of the document

        If "idfield" is supplied then will try to find that property in the
        document itself and use the value found for the id of the document
        """
        index = self._add_prefix(index)
        if not isinstance(body, list):
            body = [body]
        for document in body:
            if idfield is not None:
                if isinstance(document, dict):
                    id = document[idfield]
                else:
                    id = getattr(document, idfield)
            try:
                self.es.index(index=index, doc_type="_doc", body=document, id=id)
            except Exception as detail:
                self.logger.warning(
                    "%s: WARNING: failed to index document: %s \nException detail: %s\n"
                    % (datetime.now(), document, detail))
                raise detail

    def bulk_index(self, data, **kwargs):
        return helpers.bulk(self.es, data, **kwargs)

    def create_bulk_item(self, op_type="index", index=None, id=None, data=None):
        return {
            "_op_type": op_type,
            "_index": self._add_prefix(index),
            "_type": "_doc",
            "_id": id,
            "_source": data
        }

    def count(self, **kwargs):
        kwargs = self._add_prefix(**kwargs)
        kwargs["doc_type"] = kwargs.pop("doc_type", "_doc")
        body = kwargs.pop("body", None)
        # only pass in the query key; other keys (eg: _source) are not allowed
        if body:
            query = body.pop("query", None)
            if query:
                kwargs["body"] = {"query": query}
        count = self.es.count(**kwargs)
        if count is not None:
            return count["count"]
        else:
            return None

    def BulkIndexer(outer_self, batch_size=500, **kwargs):
        class _BulkIndexer(object):
            def __init__(self, **kwargs):
                self.queue = []
                self.batch_size = kwargs.pop("batch_size", 500)
                self.kwargs = kwargs

            def add(self, op_type="index", index=None, id=None, data=None):
                doc = {
                    "_op_type": op_type,
                    "_index": outer_self._add_prefix(index),
                    "_type": "_doc",
                    "_id": id,
                    "_source": data
                }
                self.queue.append(doc)
                if len(self.queue) >= self.batch_size:
                    outer_self.bulk_index(self.queue, **self.kwargs)
                    del self.queue[:]  # clear out the array

            def close(self):
                outer_self.bulk_index(self.queue, **self.kwargs)

            def __enter__(self, **kwargs):
                return self

            def __exit__(self, type, value, traceback):
                # do not return a truthy value here: returning the bulk result
                # tuple would silently suppress exceptions
                self.close()

        return _BulkIndexer(batch_size=batch_size, **kwargs)
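# The nested _BulkIndexer above is a context manager; a minimal sketch of the
# intended use (host, prefix, index name, and payload are placeholders):
engine = SearchEngine(prefix="demo", hosts=[{"host": "localhost", "port": 9200}])
with engine.BulkIndexer(batch_size=500) as indexer:
    indexer.add(index="things", id="1", data={"name": "example"})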
class ElasticsearchClient(object):
    """
    ElasticsearchClient represents an Elasticsearch client; it implements some
    features on top of elasticsearch.Elasticsearch.
    """

    automatic_syn_data_flag = {}
    automatic_thread_name_counter = 0

    def __init__(self):
        self.client = None

    def from_normal(self, hosts=default.ELASTICSEARCH_HOSTS, **kwargs):
        """
        Initialize an Elasticsearch client from the specified hosts list.

        :param hosts: list of nodes we should connect to. A node should be a
            dictionary ({"host": "localhost", "port": 9200}); the entire
            dictionary will be passed to the :class:`~elasticsearch.Connection`
            class as kwargs, or a string in the format of ``host[:port]`` which
            will be translated to a dictionary automatically. If no value is
            given, the :class:`~elasticsearch.Urllib3HttpConnection` class
            defaults will be used.
        :return: void
        """
        self.client = Elasticsearch(hosts=hosts, **kwargs)
        logger.info('Initialize normal Elasticsearch Client: %s.' % self.client)

    def from_sniffing(self, active_nodes, sniff_on_start=True,
                      sniff_on_connection_fail=True, sniffer_timeout=60, **kwargs):
        """
        Initialize an Elasticsearch client that sniffs on startup to inspect
        the cluster and load-balance across all nodes. The client can be
        configured to inspect the cluster state to get a list of nodes upon
        startup, periodically and/or on failure.

        :param active_nodes: the list of active nodes
        :param sniff_on_start: flag indicating whether to obtain a list of
            nodes from the cluster at startup time
        :param sniff_on_connection_fail: flag controlling if connection failure
            triggers a sniff
        :param sniffer_timeout: number of seconds between automatic sniffs
        :return: void
        """
        self.client = Elasticsearch(
            active_nodes,
            sniff_on_start=sniff_on_start,
            sniff_on_connection_fail=sniff_on_connection_fail,
            sniffer_timeout=sniffer_timeout,
            **kwargs)
        logger.info('Initialize sniffing Elasticsearch Client: %s.' % self.client)

    def from_ssl(self, ca_certs, client_cert, client_key,
                 hosts=default.ELASTICSEARCH_HOSTS, use_ssl=True,
                 verify_certs=True, **kwargs):
        """
        Initialize an Elasticsearch client over SSL.

        :param ca_certs: optional path to CA bundle. See
            https://urllib3.readthedocs.io/en/latest/security.html#using-certifi-with-urllib3
        :param client_cert: path to the file containing the private key and the
            certificate, or cert only if using client_key
        :param client_key: path to the file containing the private key if using
            separate cert and key files (client_cert will contain only the cert)
        :param hosts: hostname of the node
        :param use_ssl: use ssl for the connection if `True`
        :param verify_certs: whether to verify SSL certificates
        :return: void
        """
        self.client = Elasticsearch(hosts=hosts, use_ssl=use_ssl,
                                    verify_certs=verify_certs, ca_certs=ca_certs,
                                    client_cert=client_cert,
                                    client_key=client_key, **kwargs)
        logger.info('Initialize SSL Elasticsearch Client: %s.' % self.client)

    def transfer_data_from_mongo(self, index, doc_type, use_mongo_id=False,
                                 indexed_flag_field_name='',
                                 mongo_query_params={},
                                 mongo_host=default.MONGO_HOST,
                                 mongo_port=default.MONGO_PORT,
                                 mongo_db=default.MONGO_DB,
                                 mongo_collection=default.MONGO_COLLECTION):
        """
        Transfer data from MongoDB into Elasticsearch; the hostname, port,
        database, and collection name of MongoDB are loaded from default.py by
        default.

        :param index: The name of the index
        :param doc_type: The type of the document
        :param use_mongo_id: use the MongoDB id in Elasticsearch if true,
            otherwise let Elasticsearch generate ids
        :param indexed_flag_field_name: the name of a field of the document;
            documents whose value for it is False will be synchronized
        :param mongo_query_params: The dictionary of query params for MongoDB
        :param mongo_host: The hostname of MongoDB
        :param mongo_port: The port of MongoDB
        :param mongo_db: The database name in MongoDB
        :param mongo_collection: The collection name in MongoDB
        :return: void
        """
        mongo_client = MongoClient(host=mongo_host, port=int(mongo_port))
        try:
            collection = mongo_client[mongo_db][mongo_collection]
            if indexed_flag_field_name != '':
                mongo_query_params.update({indexed_flag_field_name: False})
            # materialize the cursor before the client is closed; a lazy
            # cursor cannot be iterated after close()
            mongo_docs = list(collection.find(mongo_query_params))
        finally:
            mongo_client.close()
        # Build the Elasticsearch actions for the bulk api
        actions = []
        id_array = []
        for doc in mongo_docs:
            action = {'_op_type': 'index', '_index': index, '_type': doc_type}
            id_array.append(doc['_id'])
            if not use_mongo_id:
                doc.pop('_id')
            else:
                doc['id'] = str(doc['_id'])
                doc.pop('_id')
            action['_source'] = doc
            actions.append(action)
        success, failed = es_helpers.bulk(self.client, actions,
                                          request_timeout=60 * 60)
        logger.info(
            'Transfer data from MongoDB(%s:%s) into the Elasticsearch(%s) success: %s, failed: %s'
            % (mongo_host, mongo_port, self.client, success, failed))
        # Back-update the indexed flag
        if indexed_flag_field_name != '':
            t = threading.Thread(target=ElasticsearchClient._back_update_mongo,
                                 args=(self, mongo_host, mongo_port, mongo_db,
                                       mongo_collection, id_array,
                                       {indexed_flag_field_name: True}),
                                 name='mongodb_back_update')
            t.start()
        return success, failed

    def _back_update_mongo(self, mongo_host, mongo_port, mongo_db,
                           mongo_collection, id_array, update):
        client = MongoClient(host=mongo_host, port=mongo_port)
        try:
            collection = client[mongo_db][mongo_collection]
            for id in id_array:
                collection.update({'_id': id}, {'$set': update})
        finally:
            client.close()

    def create(self, index, doc_type, id, body, params={}, **kwargs):
        result = self.client.create(index, doc_type, id, body, params=params, **kwargs)
        logger.info('Create[index: %s, doc type: %s, id: %s] is done body: \n %s'
                    % (index, doc_type, id, body))
        logger.debug('<Verbose message> operation: %s, version: %s shards: %s'
                     % (result['result'], result['_version'], result['_shards']))
        return result

    def index(self, index, doc_type, body, id=None, params={}, **kwargs):
        result = self.client.index(index, doc_type, body, id, params=params, **kwargs)
        if id is None:
            id_message = 'Automatic Generation'
        else:
            id_message = id
        logger.info('Index[index: %s, doc type: %s, id: %s] is done body: \n %s'
                    % (index, doc_type, id_message, body))
        logger.debug('<Verbose message> operation: %s version: %s shards: %s'
                     % (result['result'], result['_version'], result['_shards']))
        return result

    def delete(self, index, doc_type, id, params={}, **kwargs):
        result = self.client.delete(index, doc_type, id, params=params, **kwargs)
        logger.info('Delete[index: %s, doc type: %s, id: %s] is done'
                    % (index, doc_type, id))
        logger.debug('<Verbose message> operation: %s version: %s shards: %s'
                     % (result['result'], result['_version'], result['_shards']))
        return result

    def search(self, index=None, doc_type=None, body=None, params={}, **kwargs):
        result = self.client.search(index, doc_type, body, params=params, **kwargs)
        if index is None and doc_type is None and body is None:
            logger.info('Search[all mode] is done')
            return result
        logger.info('Search[index: %s, doc type: %s] is done body: \n %s'
                    % (index, doc_type, body))
        logger.debug('<Verbose message> took: %s shards: %s hits: %s'
                     % (result['took'], result['_shards'], result['hits']['total']))
        return result

    def count(self, index=None, doc_type=None, body=None, params={}, **kwargs):
        result = self.client.count(index, doc_type, body, params=params, **kwargs)
        if index is None and doc_type is None and body is None:
            logger.info('Count[all mode] is done')
            return result
        logger.info('Count[index: %s, doc type: %s] is done body: \n %s'
                    % (index, doc_type, body))
        logger.debug('<Verbose message> count: %s shards: %s'
                     % (result['count'], result['_shards']))
        return result

    def update(self, index, doc_type, id, body=None, params={}, **kwargs):
        result = self.client.update(index, doc_type, id, body, params=params, **kwargs)
        logger.info('Update[index: %s, doc type: %s, id: %s] is done body: \n %s'
                    % (index, doc_type, id, body))
        logger.debug('<Verbose message> operation: %s version: %s shards: %s'
                     % (result['result'], result['_version'], result['_shards']))
        return result

    def bulk(self, actions, stats_only=False, **kwargs):
        """
        Executes the bulk api via elasticsearch.helpers.bulk.

        :param actions: iterator containing the actions
        :param stats_only: if `True`, only report the number of
            successful/failed operations instead of the number of successful
            operations plus a list of error responses

        Any additional keyword arguments will be passed to
        :func:`~elasticsearch.helpers.streaming_bulk`, which is used to execute
        the operation; see :func:`~elasticsearch.helpers.streaming_bulk` for
        more accepted parameters.
        """
        success, failed = es_helpers.bulk(self.client, actions, stats_only, **kwargs)
        logger.info('Bulk is done success %s failed %s actions: \n %s'
                    % (success, failed, actions))

    def mget(self, body, index=None, doc_type=None, params={}, **kwargs):
        result = self.client.mget(body, index, doc_type, params=params, **kwargs)
        logger.info('Mget[index: %s, doc type: %s] is done body: \n %s'
                    % (index, doc_type, body))
        return result

    def get_client(self):
        if self.client is None:
            logger.warning('Elasticsearch Client is None')
        return self.client

    # TODO: Use a more effective solution
    def automatic_syn_data_from_mongo(
            self, index, doc_type, indexed_flag_field_name,
            thread_name='automatic_syn_data_thread', interval=60,
            use_mongo_id=False, mongo_query_params={},
            mongo_host=default.MONGO_HOST, mongo_port=default.MONGO_PORT,
            mongo_db=default.MONGO_DB,
            mongo_collection=default.MONGO_COLLECTION):
        """
        Automatically synchronize data from MongoDB into Elasticsearch with a
        scheduled task; a document is synchronized whenever the value of its
        indexed_flag_field_name field is False. Note that this function may be
        unreliable, so use it with caution.

        :param indexed_flag_field_name: the name of a field of the document;
            documents whose value for it is False will be synchronized
        :param thread_name: the name of the scheduled task thread
        :param interval: the interval in seconds between executions of the
            scheduled task
        :return: the thread id; you can use this id to cancel the associated task
        """
        thread_id = self._generate_thread_id(thread_name)
        if thread_id in ElasticsearchClient.automatic_syn_data_flag:
            lock.acquire()
            try:
                thread_name = thread_name + '-%s' % ElasticsearchClient.automatic_thread_name_counter
                ElasticsearchClient.automatic_thread_name_counter += 1
                thread_id = self._generate_thread_id(thread_name)
            finally:
                lock.release()
        ElasticsearchClient.automatic_syn_data_flag[thread_id] = True
        t = threading.Thread(
            target=ElasticsearchClient._automatic_syn_data_from_mongo_worker,
            args=(self, thread_id, index, doc_type, indexed_flag_field_name,
                  interval, use_mongo_id, mongo_query_params, mongo_host,
                  mongo_port, mongo_db, mongo_collection),
            name=thread_name)
        t.start()
        return thread_id

    def _generate_thread_id(self, thread_name):
        return str(hash(thread_name))

    def stop_automatic_syn_data(self, thread_id):
        lock.acquire()
        try:
            ElasticsearchClient.automatic_syn_data_flag[thread_id] = False
        finally:
            lock.release()

    def _automatic_syn_data_from_mongo_worker(
            self, thread_id, index, doc_type, indexed_flag_field_name,
            interval=60, use_mongo_id=False, mongo_query_params={},
            mongo_host=default.MONGO_HOST, mongo_port=default.MONGO_PORT,
            mongo_db=default.MONGO_DB,
            mongo_collection=default.MONGO_COLLECTION):
        current_thread_name = threading.current_thread().name
        while ElasticsearchClient.automatic_syn_data_flag[thread_id]:
            logger.info('[%s]: synchronize data work start %s:%s -----> %s'
                        % (current_thread_name, mongo_host, mongo_port, self.client))
            success, failed = self.transfer_data_from_mongo(
                index=index, doc_type=doc_type, use_mongo_id=use_mongo_id,
                indexed_flag_field_name=indexed_flag_field_name,
                mongo_query_params=mongo_query_params, mongo_host=mongo_host,
                mongo_port=mongo_port, mongo_db=mongo_db,
                mongo_collection=mongo_collection)
            logger.info('[%s]: synchronize data work done %s:%s -----> %s [success=%s, failed=%s]'
                        % (current_thread_name, mongo_host, mongo_port,
                           self.client, success, failed))
            time.sleep(interval)
        logger.info('[%s]: synchronize data work is shutdown' % current_thread_name)

    def open_index(self, index, params={}, **kwargs):
        result = self.client.indices.open(index, params=params, **kwargs)
        logger.info('Index %s is opened' % index)
        return result

    def close_index(self, index, params={}, **kwargs):
        result = self.client.indices.close(index, params=params, **kwargs)
        logger.info('Index %s is closed' % index)
        return result

    def indices_stats_info(self, index=None, metric=None, params={}, **kwargs):
        result = self.client.indices.stats(index=index, metric=metric,
                                           params=params, **kwargs)
        logger.info('Acquire indices status information is done')
        return result

    def get_simple_info_for_index(self, index=None, params={}, **kwargs):
        """
        Return a list of simple info for the specified index (default all);
        each element is a dictionary such as
        {'health': 'green', 'status': 'open', 'index': 'xxxx', 'uuid': 'xxxx',
         'pri': 1, 'rep': 1, 'docs_count': 4, 'docs_deleted': 0,
         'store_size': '10kb', 'pri_store_size': '10kb'}
        """
        raw = self.client.cat.indices(index, params=params, **kwargs).split('\n')
        results = []
        for r in raw:
            alter = r.split(' ')
            if len(alter) < 10:
                continue
            info = {
                'health': alter[0],
                'status': alter[1],
                'index': alter[2],
            }
            if len(alter) == 11:
                # the split may fail here (alter[3] can be an empty string)
                info['uuid'] = alter[4]
                i = 5
            else:
                info['uuid'] = alter[3]
                i = 4
            info['pri'] = alter[i]
            i += 1
            info['rep'] = alter[i]
            i += 1
            info['docs_count'] = alter[i]
            i += 1
            info['docs_deleted'] = alter[i]
            i += 1
            info['store_size'] = alter[i]
            i += 1
            info['pri_store_size'] = alter[i]
            results.append(info)
        logger.info('Acquire simple information of the index is done succeeded: %s'
                    % len(results))
        return results

    def cluster_health(self, index=None, params={}, **kwargs):
        result = self.client.cluster.health(index, params=params, **kwargs)
        message = 'Acquire cluster health information is done index: %s'
        if index is None:
            message = message % 'all'
        else:
            message = message % index
        logger.info(message)
        return result

    def cluster_health_for_indices(self, index=None, params={}, **kwargs):
        """
        Return the cluster health of the specified indices (default all).
        The first element is a dictionary with global information about the
        cluster, such as "cluster_name" and "number_of_nodes"; the second
        element is a list of per-index information where each element is a
        dictionary for one index, such as
        [{'index': 'a', 'status': 'yellow', ...},
         {'index': 'b', 'status': 'yellow', ...}, ...]
        """
        params['level'] = 'indices'
        result = self.cluster_health(index, params, **kwargs)
        return self._process_cluster_health_info(result)

    def cluster_health_for_shards(self, index=None, params={}, **kwargs):
        """
        Return the cluster health of the specified indices (default all) with
        shard information appended for each index. The first element is a
        dictionary with global information about the cluster; the second
        element describes the indices and their shards, where each element is
        a dictionary such as
        [{'index': 'a', 'status': 'yellow', ...,
          'shards': {'0': {...}, '1': {...}, ...}}, ...]
        """
        params['level'] = 'shards'
        result = self.cluster_health(index, params, **kwargs)
        return self._process_cluster_health_info(result)

    def cluster_status_info(self, node_id=None, params={}, **kwargs):
        result = self.client.cluster.stats(node_id=node_id, params=params, **kwargs)
        logger.info('Acquire cluster status information is done')
        return result

    def _process_cluster_health_info(self, info):
        ret = []
        first = {}
        second = []
        for k, v in info.items():
            if k == 'indices':
                for k2, v2 in v.items():
                    index = {'index': k2}
                    index.update(v2)
                    second.append(index)
            else:
                first[k] = v
        ret.append(first)
        ret.append(second)
        return ret

    def nodes_status_info(self, node_id=None, metric=None, index_metric=None,
                          params={}, **kwargs):
        result = self.client.nodes.stats(node_id=node_id, metric=metric,
                                         index_metric=index_metric,
                                         params=params, **kwargs)
        logger.info('Acquire nodes status information is done')
        return result

    def nodes_info(self, node_id=None, metric=None, params={}, **kwargs):
        result = self.client.nodes.info(node_id=node_id, metric=metric,
                                        params=params, **kwargs)
        logger.info('Acquire nodes info is done')
        return result

    def nodes_simple_info(self, params={}, **kwargs):
        """
        Return a list of dictionaries of simple node info, where each key is a
        column name, such as
        [{"http_address": "192.111.111.111", "name": "test", ...}, ...]
        """
        h = [
            'name', 'pid', 'http_address', 'version', 'jdk', 'disk.total',
            'disk.used_percent', 'heap.current', 'heap.percent', 'ram.current',
            'ram.percent', 'uptime', 'node.role'
        ]
        result = self.client.cat.nodes(v=True, h=h, params=params, **kwargs)
        result = [x.strip().split(' ') for x in result.split('\n')]
        # Clean up the whitespace columns
        result.remove(result[-1])
        for i in range(len(result)):
            result[i] = list(filter(lambda x: x != '', result[i]))
        # Pack the rows into dictionaries keyed by the header row
        dicts = []
        for i in range(len(result) - 1):
            d = {}
            for k, v in zip(result[0], result[i + 1]):
                d[k] = v
            dicts.append(d)
        logger.info('Acquire simple information of the nodes is done succeeded: %s'
                    % len(dicts))
        return dicts
class ElasticsearchConnector:
    def __init__(self):
        self.es = Elasticsearch()

    def execute_search(self, index, body):
        try:
            response = self.es.search(index=index, body=body)
            return response['hits']['hits']
        except exceptions.RequestError:
            # print("Request error")
            # print(body)
            return []

    def execute_multiget(self, index, body):
        try:
            response = self.es.mget(index=index, body=body)
            return response['docs']
        except exceptions.RequestError:
            # print("Request error")
            # print(body)
            return []

    def execute_aggregation(self, index, body, aggregation):
        response = self.es.search(index=index, body=body)
        return response['aggregations'][aggregation]

    def execute_search_with_scroll(self, index, body):
        response = self.es.search(index=index, scroll='2m', body=body)
        return response['_scroll_id'], response['hits']['total'], response

    def scroll(self, sid, scroll):
        return self.es.scroll(scroll_id=sid, scroll=scroll)

    # add a document to the specified elastic index
    def add_document(self, index, doc_type, body):
        try:
            self.es.index(index=index, doc_type=doc_type, body=body)
        except exceptions.RequestError as e:
            print(e)
            print(body)

    # add multiple documents at once
    def add_bulk(self, index, bodies):
        actions = []
        for body in bodies:
            # pass the dict itself as _source; the bulk helper serializes it
            if 'id' in body:
                actions.append({
                    "_id": body['id'],
                    "_index": index,
                    "_source": body
                })
            else:
                actions.append({"_index": index, "_source": body})
        helpers.bulk(self.es, actions)

    def update_bulk(self, index, bodies):
        actions = [{
            "_id": body['id'],
            "_index": index,
            "_type": '_doc',
            "_source": {'doc': body},
            '_op_type': 'update'
        } for body in bodies]
        helpers.bulk(self.es, actions)

    # update a small part of the given document
    def update_document(self, index, docid, body):
        try:
            self.es.update(index=index, id=docid, body=body)
        except (exceptions.RequestError, exceptions.TransportError) as e:
            print(e)
            print(body)

    # retrieve the term vector for a given document
    def get_term_vector(self, index, docid):
        return self.es.termvectors(index=index, id=docid, positions=True,
                                   term_statistics=True)

    def clear_index(self, index):
        self.es.indices.delete(index=index, ignore=[400, 404])

    def clear_all(self):
        self.clear_index('aggregate_articles')
        self.clear_index('users')
        self.clear_index('recommendations')
        self.clear_index('occupation')
        self.clear_index('personalization')

    def delete(self, index, docid):
        self.es.delete(index, docid)
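# Sketch of execute_multiget above: the body is the plain ids form, and docs
# that were not found come back with "found": false. Index and ids are
# placeholders.
connector = ElasticsearchConnector()
docs = connector.execute_multiget('articles', {'ids': ['a1', 'a2']})
sources = [d['_source'] for d in docs if d.get('found')]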
class ES(object):
    def __init__(self, es_url, http_auth=None):
        self.es_url = es_url
        self.es = Elasticsearch([es_url], show_ssl_warnings=False,
                                http_auth=http_auth, retry_on_timeout=True)

    def load_data(self, index, doc_type, doc, doc_id):
        # import certifi
        #
        # es = Elasticsearch(
        #     ['localhost', 'otherhost'],
        #     http_auth=('user', 'secret'),
        #     port=443,
        #     use_ssl=True
        # )
        try:
            return self.es.index(index=index, doc_type=doc_type, body=doc, id=doc_id)
        except Exception:
            # try once more (a single direct retry rather than unbounded recursion)
            try:
                return self.es.index(index=index, doc_type=doc_type, body=doc, id=doc_id)
            except Exception as e:
                print e
                return None

    def create_index(self, index_name, es_mapping):
        command = self.es_url + "/" + index_name
        return requests.put(command, data=es_mapping, verify=False)

    def create_alias(self, alias_name, indices):
        url = self.es_url + "/_aliases"
        command = {"actions": [
            {"remove": {"index": "*", "alias": alias_name}},
            {"add": {"indices": indices, "alias": alias_name}}
        ]}
        return requests.post(url, data=json.dumps(command))

    def load_bulk(self, index, doc_type, doc_id, docs):
        actions = [
            {
                "_index": index,
                "_type": doc_type,
                "_id": doc[doc_id],
                # _source must be the document itself; a set containing its
                # JSON dump is not serializable
                "_source": doc,
            }
            for doc in docs
        ]
        helpers.bulk(self.es, actions)

    def retrieve_doc(self, index, doc_type, ids):
        if not isinstance(ids, list):
            ids = [ids]
        query = {"query": {"ids": {"values": ids}}}
        print query
        try:
            return self.es.search(index=index, doc_type=doc_type, body=query,
                                  filter_path=['hits.hits._source'])
        except:
            # try once more
            try:
                return self.es.search(index=index, doc_type=doc_type, body=query,
                                      filter_path=['hits.hits._source'])
            except Exception as e:
                print e
                return None

    def search(self, index, doc_type, query, ignore_no_index=False, **other_params):
        # print query
        try:
            return self.es.search(index=index, doc_type=doc_type, body=query,
                                  **other_params)
        except TransportError as e:
            if e.error != 'index_not_found_exception' and ignore_no_index:
                print e
        except Exception as e:
            print e

    def es_search(self, index, doc_type, query, scroll, ignore_no_index=False,
                  **other_params):
        # print query
        if not scroll:
            try:
                return self.es.search(index=index, doc_type=doc_type, body=query,
                                      **other_params)
            except Exception as e:
                return e
        else:
            # Initiating scroll
            try:
                total_docs = query['size']
                query['size'] = 0
                query['from'] = 0
                data = self.es.search(index=index, doc_type=doc_type, body=query,
                                      scroll='1m', size=1000, **other_params)
                docs = data['hits']['hits']
                docs_count = len(data['hits']['hits'])
                sid = data['_scroll_id']
                scroll_size = len(data['hits']['hits'])
                # keep scrolling until the requested number of docs has been
                # collected or the scroll is exhausted (the original `or`
                # condition could loop forever once the scroll ran dry)
                while docs_count < total_docs and scroll_size > 0:
                    new_data = self.es.scroll(sid, scroll='1m')
                    # the scroll id can change between calls
                    sid = new_data['_scroll_id']
                    for doc in new_data['hits']['hits']:
                        docs.append(doc)
                    scroll_size = len(new_data['hits']['hits'])
                    docs_count = docs_count + scroll_size
                data['hits']['hits'] = docs[:docs_count]
                data['hits']['total'] = docs_count
                print "scroll complete with " + str(docs_count)
                return data
            except Exception as e:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
                lines = ''.join(lines)
                print lines
                return e

    def mget(self, index, doc_type, body):
        try:
            return self.es.mget(index=index, doc_type=doc_type, body=body)
        except TransportError as e:
            if e.error != 'index_not_found_exception':
                print e
        except Exception as e:
            print e
class ElasticStore(object): '''A feature collection store on ElasticSearch. Feature collections are maps from feature names to features. The representation of each feature is unspecified by this interface. This class exposes a similar interface to the regular ``Store`` class, with a few additions: 1. Canopy scans are implemented natively with ElasticSearch, so they are provided as methods here. 2. On all retrieval methods, the caller can pass a list of feature names (or feature name wildcards) to retrieve. If your FCs have lots of features, this is useful when you only need to retrieve a small fraction of them. .. automethod:: __init__ .. automethod:: configured **CRUD operations** .. automethod:: get .. automethod:: get_many .. automethod:: put .. automethod:: delete .. automethod:: delete_all .. automethod:: delete_index **Keyword scanning** .. automethod:: keyword_scan .. automethod:: keyword_scan_ids **Scanning ids in lexicographic order** Note that these operations may be inefficient because of how ElasticSearch handles sorting. .. automethod:: scan .. automethod:: scan_ids .. automethod:: scan_prefix .. automethod:: scan_prefix_ids **Low-level** .. automethod:: sync .. automethod:: index_scan_ids .. automethod:: index_names ''' config_name = 'dossier.store' @classmethod def configured(cls): '''Create a new instance from the global configuration. In order to use this, you must make sure that :class:`ElasticStore` has been configured by :mod:`yakonfig`, usually by passing the class to ``yakonfig.parse_args``. ''' return cls(**yakonfig.get_global_config('dossier.store')) def __init__(self, hosts=None, namespace=None, type='fc', feature_indexes=None, shards=10, replicas=0, fulltext_indexes=None): '''Create a new store or connect to an existing one. :param hosts: Passed directly to ``elasticsearch.Elasticsearch`` constructor. Required. :param str namespace: Used as the ES index name, prefixed by ``fcs_``. Required. :param str type: The ES type to use. If this is set to ``None``, then a random unique string is used. :param [str] feature_indexes: A list of names of features to index. :param int shards: The number of shards to use for this index. This only has an effect if the ES index didn't previous exist. :param int replicas: The number of replicas to use for this index. This only has an effect if the ES index didn't previous exist. :rtype: :class:`ElasticStore` ''' if hosts is None: raise yakonfig.ProgrammerError( 'ElasticStore needs at least one host specified.') if namespace is None: raise yakonfig.ProgrammerError( 'ElasticStore needs a namespace defined.') if type is None: type = unicode(uuid.uuid4()) self.conn = Elasticsearch(hosts=hosts, timeout=60, request_timeout=60) self.index = 'fcs_%s' % namespace self.type = type self.shards = shards self.replicas = replicas self.indexes = OrderedDict() self.fulltext_indexes = OrderedDict() self.indexed_features = set() self.fulltext_indexed_features = set() self._normalize_feature_indexes(feature_indexes) self._normalize_fulltext_feature_indexes(fulltext_indexes) if not self.conn.indices.exists(index=self.index): # This can race, but that should be OK. # Worst case, we initialize with the same settings more than # once. self._create_index() mapping = self.conn.indices.get_mapping( index=self.index, doc_type=self.type) if len(mapping) == 0: self._create_mappings() def get(self, content_id, feature_names=None): '''Retrieve a feature collection. If a feature collection with the given id does not exist, then ``None`` is returned. 
:param str content_id: Content identifier. :param [str] feature_names: A list of feature names to retrieve. When ``None``, all features are retrieved. Wildcards are allowed. :rtype: :class:`dossier.fc.FeatureCollection` or ``None`` ''' try: resp = self.conn.get(index=self.index, doc_type=self.type, id=eid(content_id), _source=self._source(feature_names)) return self.fc_from_dict(resp['_source']['fc']) except NotFoundError: return None except: raise def get_many(self, content_ids, feature_names=None): '''Returns an iterable of feature collections. This efficiently retrieves multiple FCs corresponding to the list of ids given. Tuples of identifier and feature collection are yielded. If the feature collection for a given id does not exist, then ``None`` is returned as the second element of the tuple. :param [str] content_ids: List of content ids. :param [str] feature_names: A list of feature names to retrieve. When ``None``, all features are retrieved. Wildcards are allowed. :rtype: Iterable of ``(content_id, FC)`` ''' try: resp = self.conn.mget(index=self.index, doc_type=self.type, _source=self._source(feature_names), body={'ids': map(eid, content_ids)}) except TransportError: return for doc in resp['docs']: fc = None if doc['found']: fc = self.fc_from_dict(doc['_source']['fc']) yield did(doc['_id']), fc def put(self, items, indexes=True): '''Adds feature collections to the store. This efficiently adds multiple FCs to the store. The iterable of ``items`` given should yield tuples of ``(content_id, FC)``. :param items: Iterable of ``(content_id, FC)``. :param [str] feature_names: A list of feature names to retrieve. When ``None``, all features are retrieved. Wildcards are allowed. ''' actions = [] for cid, fc in items: # TODO: If we store features in a columnar order, then we # could tell ES to index the feature values directly. ---AG # (But is problematic because we want to preserve the ability # to selectively index FCs. So we'd probably need two distinct # doc types.) idxs = defaultdict(list) if indexes: for fname in self.indexed_features: if fname in fc: idxs[fname_to_idx_name(fname)].extend(fc[fname]) for fname in self.fulltext_indexed_features: if fname not in fc: continue if isinstance(fc[fname], basestring): idxs[fname_to_full_idx_name(fname)] = fc[fname] else: idxs[fname_to_full_idx_name(fname)].extend(fc[fname]) actions.append({ '_index': self.index, '_type': self.type, '_id': eid(cid), '_op_type': 'index', '_source': dict(idxs, **{ 'fc': self.fc_to_dict(fc), }), }) bulk(self.conn, actions, timeout=60, request_timeout=60) def delete(self, content_id): '''Deletes the corresponding feature collection. If the FC does not exist, then this is a no-op. ''' try: self.conn.delete(index=self.index, doc_type=self.type, id=eid(content_id)) except NotFoundError: pass def delete_all(self): '''Deletes all feature collections. This does not destroy the ES index, but instead only deletes all FCs with the configured document type (defaults to ``fc``). ''' try: self.conn.indices.delete_mapping( index=self.index, doc_type=self.type) except TransportError: logger.warn('type %r in index %r already deleted', self.index, self.type, exc_info=True) def delete_index(self): '''Deletes the underlying ES index. Only use this if you know what you're doing. This destroys the entire underlying ES index, which could be shared by multiple distinct ElasticStore instances. 
''' if self.conn.indices.exists(index=self.index): self.conn.indices.delete(index=self.index) def sync(self): '''Tells ES to tell Lucene to do an fsync. This guarantees that any previous calls to ``put`` will be flushed to disk and available in subsequent searches. Generally, this should only be used in test code. ''' self.conn.indices.refresh(index=self.index) def scan(self, *key_ranges, **kwargs): '''Scan for FCs in the given id ranges. :param key_ranges: ``key_ranges`` should be a list of pairs of ranges. The first value is the lower bound id and the second value is the upper bound id. Use ``()`` in either position to leave it unbounded. If no ``key_ranges`` are given, then all FCs in the store are returned. :param [str] feature_names: A list of feature names to retrieve. When ``None``, all features are retrieved. Wildcards are allowed. :rtype: Iterable of ``(content_id, FC)`` ''' for hit in self._scan(*key_ranges, **kwargs): yield did(hit['_id']), self.fc_from_dict(hit['_source']['fc']) def scan_ids(self, *key_ranges, **kwargs): '''Scan for ids only in the given id ranges. :param key_ranges: ``key_ranges`` should be a list of pairs of ranges. The first value is the lower bound id and the second value is the upper bound id. Use ``()`` in either position to leave it unbounded. If no ``key_ranges`` are given, then all FCs in the store are returned. :param [str] feature_names: A list of feature names to retrieve. When ``None``, all features are retrieved. Wildcards are allowed. :rtype: Iterable of ``content_id`` ''' kwargs['feature_names'] = False for hit in self._scan(*key_ranges, **kwargs): yield did(hit['_id']) def scan_prefix(self, prefix, feature_names=None): '''Scan for FCs with a given prefix. :param str prefix: Identifier prefix. :param [str] feature_names: A list of feature names to retrieve. When ``None``, all features are retrieved. Wildcards are allowed. :rtype: Iterable of ``(content_id, FC)`` ''' resp = self._scan_prefix(prefix, feature_names=feature_names) for hit in resp: yield did(hit['_id']), self.fc_from_dict(hit['_source']['fc']) def scan_prefix_ids(self, prefix): '''Scan for ids with a given prefix. :param str prefix: Identifier prefix. :param [str] feature_names: A list of feature names to retrieve. When ``None``, all features are retrieved. Wildcards are allowed. :rtype: Iterable of ``content_id`` ''' resp = self._scan_prefix(prefix, feature_names=False) for hit in resp: yield did(hit['_id']) def fulltext_scan(self, query_id=None, query_fc=None, feature_names=None, preserve_order=True): '''Fulltext search. Yields an iterable of triples (score, identifier, FC) corresponding to the search results of the fulltext search in ``query``. This will only search text indexed under the given feature named ``fname``. Note that, unless ``preserve_order`` is set to True, the ``score`` will always be 0.0, and the results will be unordered. ``preserve_order`` set to True will cause the results to be scored and be ordered by score, but you should expect to see a decrease in performance. :param str fname: The feature to search. :param unicode query: The query. :param [str] feature_names: A list of feature names to retrieve. When ``None``, all features are retrieved. Wildcards are allowed. 
:rtype: Iterable of ``(score, content_id, FC)`` ''' it = self._fulltext_scan(query_id, query_fc, feature_names=feature_names, preserve_order=preserve_order) for hit in it: fc = self.fc_from_dict(hit['_source']['fc']) yield hit['_score'], did(hit['_id']), fc def fulltext_scan_ids(self, query_id=None, query_fc=None, preserve_order=True): '''Fulltext search for identifiers. Yields an iterable of triples (score, identifier) corresponding to the search results of the fulltext search in ``query``. This will only search text indexed under the given feature named ``fname``. Note that, unless ``preserve_order`` is set to True, the ``score`` will always be 0.0, and the results will be unordered. ``preserve_order`` set to True will cause the results to be scored and be ordered by score, but you should expect to see a decrease in performance. :param str fname: The feature to search. :param unicode query: The query. :rtype: Iterable of ``(score, content_id)`` ''' it = self._fulltext_scan(query_id, query_fc, feature_names=False, preserve_order=preserve_order) for hit in it: yield hit['_score'], did(hit['_id']) def keyword_scan(self, query_id=None, query_fc=None, feature_names=None): '''Keyword scan for feature collections. This performs a keyword scan using the query given. A keyword scan searches for FCs with terms in each of the query's indexed fields. At least one of ``query_id`` or ``query_fc`` must be provided. If ``query_fc`` is ``None``, then the query is retrieved automatically corresponding to ``query_id``. :param str query_id: Optional query id. :param query_fc: Optional query feature collection. :type query_fc: :class:`dossier.fc.FeatureCollection` :param [str] feature_names: A list of feature names to retrieve. When ``None``, all features are retrieved. Wildcards are allowed. :rtype: Iterable of ``(content_id, FC)`` ''' it = self._keyword_scan(query_id, query_fc, feature_names=feature_names) for hit in it: fc = self.fc_from_dict(hit['_source']['fc']) yield did(hit['_id']), fc def keyword_scan_ids(self, query_id=None, query_fc=None): '''Keyword scan for ids. This performs a keyword scan using the query given. A keyword scan searches for FCs with terms in each of the query's indexed fields. At least one of ``query_id`` or ``query_fc`` must be provided. If ``query_fc`` is ``None``, then the query is retrieved automatically corresponding to ``query_id``. :param str query_id: Optional query id. :param query_fc: Optional query feature collection. :type query_fc: :class:`dossier.fc.FeatureCollection` :rtype: Iterable of ``content_id`` ''' it = self._keyword_scan(query_id, query_fc, feature_names=False) for hit in it: yield did(hit['_id']) def index_scan_ids(self, fname, val): '''Low-level keyword index scan for ids. Retrieves identifiers of FCs that have a feature value ``val`` in the feature named ``fname``. Note that ``fname`` must be indexed. :param str fname: Feature name. :param str val: Feature value. :rtype: Iterable of ``content_id`` ''' disj = [] for fname2 in self.indexes[fname]['feature_names']: disj.append({'term': {fname_to_idx_name(fname2): val}}) query = { 'constant_score': { 'filter': {'or': disj}, }, } hits = scan(self.conn, index=self.index, doc_type=self.type, query={ '_source': False, 'query': query, }) for hit in hits: yield did(hit['_id']) def index_names(self): '''Returns a list of all defined index names. Note that this only includes boolean based indexes. 
:rtype: list of ``unicode`` ''' return map(unicode, self.indexes.iterkeys()) def fulltext_index_names(self): '''Returns a list of all defined fulltext index names. :rtype: list of ``unicode`` ''' return map(unicode, self.fulltext_indexes.iterkeys()) def _fulltext_scan(self, query_id, query_fc, preserve_order=True, feature_names=None): query_fc = self.get_query_fc(query_id, query_fc) ids = set([] if query_id is None else [eid(query_id)]) for fname, features in self.fulltext_indexes.iteritems(): qvals = map(unicode, query_fc.get(fname, {}).keys()) if len(qvals) == 0: continue qmatches = [] qfields = map(fname_to_full_idx_name, features) for qval in qvals: if re.search('\p{Punct}', qval): match_type = 'phrase' else: match_type = 'best_fields' qmatches.append({ 'multi_match': { 'type': match_type, 'query': qval, 'fields': qfields, } }) query = { 'filtered': { 'query': { 'bool': { 'should': qmatches, }, }, 'filter': { 'not': { 'ids': { 'values': list(ids), }, }, }, }, } logger.info('fulltext scanning index: %s, query: %r', fname, qvals) hits = scan( self.conn, index=self.index, doc_type=self.type, preserve_order=preserve_order, query={ '_source': self._source(feature_names), 'query': query, }) for hit in hits: ids.add(eid(hit['_id'])) yield hit def _keyword_scan(self, query_id, query_fc, feature_names=None): # Why are we running multiple scans? Why are we deduplicating? # # It turns out that, in our various systems, it can be important to # prioritize the order of results returned in a keyword scan based on # the feature index that is being searched. For example, we typically # want to start a keyword scan with the results from a search on # `NAME`, which we don't want to be mingled with the results from a # search on some other feature. # # The simplest way to guarantee this type of prioritization is to run # a query for each index in the order in which they were defined. # # This has some downsides: # # 1. We return *all* results for the first index before ever returning # results for the second. # 2. Since we're running multiple queries, we could get back results # we've already retrieved in a previous query. # # We accept (1) for now. # # To fix (2), we keep track of all ids we've seen and include them # as a filter in subsequent queries. query_fc = self.get_query_fc(query_id, query_fc) ids = set([] if query_id is None else [eid(query_id)]) for fname in self.indexes: term_disj = self._fc_index_disjunction_from_query(query_fc, fname) if len(term_disj) == 0: continue query = { 'constant_score': { 'filter': { 'and': [{ 'not': { 'ids': { 'values': list(ids), }, }, }, { 'or': term_disj, }], }, }, } logger.info('keyword scanning index: %s', fname) hits = scan( self.conn, index=self.index, doc_type=self.type, query={ '_source': self._source(feature_names), 'query': query, }) for hit in hits: ids.add(eid(hit['_id'])) yield hit def _scan(self, *key_ranges, **kwargs): feature_names = kwargs.get('feature_names') range_filters = self._range_filters(*key_ranges) return scan(self.conn, index=self.index, doc_type=self.type, _source=self._source(feature_names), preserve_order=True, query={ # Sorting by `_id` seems to fail spuriously and # I have no idea why. 
'sort': {'_uid': {'order': 'asc'}}, 'query': { 'constant_score': { 'filter': { 'and': range_filters, }, }, }, }) def _scan_prefix(self, prefix, feature_names=None): query = { 'constant_score': { 'filter': { 'and': [{ 'prefix': { '_id': eid(prefix), }, }], }, }, } return scan(self.conn, index=self.index, doc_type=self.type, _source=self._source(feature_names), preserve_order=True, query={ # Sorting by `_id` seems to fail spuriously and # I have no idea why. 'sort': {'_uid': {'order': 'asc'}}, 'query': query, }) def _source(self, feature_names): '''Maps feature names to ES's "_source" field.''' if feature_names is None: return True elif isinstance(feature_names, bool): return feature_names else: return map(lambda n: 'fc.' + n, feature_names) def _range_filters(self, *key_ranges): 'Creates ES filters for key ranges used in scanning.' filters = [] for s, e in key_ranges: if isinstance(s, basestring): s = eid(s) if isinstance(e, basestring): # Make the range inclusive. # We need a valid codepoint, so use the max. e += u'\U0010FFFF' e = eid(e) if s == () and e == (): filters.append({'match_all': {}}) elif e == (): filters.append({'range': {'_id': {'gte': s}}}) elif s == (): filters.append({'range': {'_id': {'lte': e}}}) else: filters.append({'range': {'_id': {'gte': s, 'lte': e}}}) if len(filters) == 0: return [{'match_all': {}}] else: return filters def _create_index(self): 'Create the index' try: self.conn.indices.create( index=self.index, timeout=60, request_timeout=60, body={ 'settings': { 'number_of_shards': self.shards, 'number_of_replicas': self.replicas, }, }) except TransportError: # Hope that this is an "index already exists" error... logger.warn('index already exists? OK', exc_info=True) pass def _create_mappings(self): 'Create the field type mapping.' self.conn.indices.put_mapping( index=self.index, doc_type=self.type, timeout=60, request_timeout=60, body={ self.type: { 'dynamic_templates': [{ 'default_no_analyze_fc': { 'match': 'fc.*', 'mapping': {'index': 'no'}, }, }], '_all': { 'enabled': False, }, '_id': { 'index': 'not_analyzed', # allows range queries }, 'properties': self._get_index_mappings(), }, }) # It is possible to create an index and quickly launch a request # that will fail because the index hasn't been set up yet. Usually, # you'll get a "no active shards available" error. # # Since index creation is a very rare operation (it only happens # when the index doesn't already exist), we sit and wait for the # cluster to become healthy. self.conn.cluster.health(index=self.index, wait_for_status='yellow') def _get_index_mappings(self): 'Retrieve the field mappings. Useful for debugging.' maps = {} for fname in self.indexed_features: config = self.indexes.get(fname, {}) maps[fname_to_idx_name(fname)] = { 'type': config.get('es_index_type', 'integer'), 'store': False, 'index': 'not_analyzed', } for fname in self.fulltext_indexed_features: maps[fname_to_full_idx_name(fname)] = { 'type': 'string', 'store': False, 'index': 'analyzed', } return maps def _get_field_types(self): 'Retrieve the field types. Useful for debugging.'
mapping = self.conn.indices.get_mapping( index=self.index, doc_type=self.type) return mapping[self.index]['mappings'][self.type]['properties'] def _normalize_fulltext_feature_indexes(self, fulltext_indexes): for x in fulltext_indexes or []: if isinstance(x, Mapping): assert len(x) == 1, 'only one mapping per index entry allowed' name = x.keys()[0] features = x[name] else: name = x features = [x] self.fulltext_indexes[name] = features for fname in features: self.fulltext_indexed_features.add(fname) def _normalize_feature_indexes(self, feature_indexes): for x in feature_indexes or []: if isinstance(x, Mapping): assert len(x) == 1, 'only one mapping per index entry allowed' name = x.keys()[0] if isinstance(x[name], Mapping): index_type = x[name]['es_index_type'] features = x[name]['feature_names'] else: index_type = 'integer' features = x[name] else: name = x features = [x] index_type = 'integer' self.indexes[name] = { 'feature_names': features, 'es_index_type': index_type, } for fname in features: self.indexed_features.add(fname) def _fc_index_disjunction_from_query(self, query_fc, fname): 'Creates a disjunction for keyword scan queries.' if len(query_fc.get(fname, [])) == 0: return [] terms = query_fc[fname].keys() disj = [] for fname in self.indexes[fname]['feature_names']: disj.append({'terms': {fname_to_idx_name(fname): terms}}) return disj def fc_to_dict(self, fc): d = {} for name, feat in fc.to_dict().iteritems(): # This is a hack to drop the clean_visible feature because it # is not necessary to store it and it is large. We simply need # to index it. if name == '#clean_visible': continue d[name] = base64.b64encode(cbor.dumps(feat)) return d def fc_from_dict(self, fc_dict): d = {} for name, feat in fc_dict.iteritems(): d[name] = cbor.loads(base64.b64decode(feat)) return FC(d) def get_query_fc(self, query_id, query_fc): if query_fc is None: if query_id is None: raise ValueError( 'one of query_id or query_fc must not be None') query_fc = self.get(query_id) if query_fc is None: raise KeyError(query_id) return query_fc
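A short usage sketch may help here, since the class above only shows the read side. It assumes `store` is an instance of the feature-collection store defined above, already pointed at a populated index; the ids, prefix, and `process` handler below are invented for illustration.

# `store` is assumed to be an instance of the FC store class above.
# Scan all FCs with ids between u'web|a' and u'web|b'; () leaves a bound open.
for content_id, fc in store.scan((u'web|a', u'web|b')):
    process(content_id, fc)  # hypothetical handler

# Ids only, restricted to an id prefix.
for content_id in store.scan_prefix_ids(u'web|'):
    print(content_id)

# Fulltext search seeded by an FC already in the store; yields scored hits
# because preserve_order defaults to True.
for score, content_id, fc in store.fulltext_scan(query_id=u'web|query',
                                                 feature_names=['NAME']):
    print(score, content_id)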
class _ES(object): def __init__(self, index, doc_type, host, port, timeout=300, **args): self.host = host self.port = port self.index = index self.doc_type = doc_type self.es = Elasticsearch(hosts=[ { "host": self.host, "port": self.port }, ], timeout=timeout, **args) def check_properties(self, properties): """ Check if all properties are known (e.g. have mappings), and creates mappings as needed """ properties = set(properties) if not (properties - self.get_properties()): return to_add = properties - self.get_properties() if to_add: self.add_properties(to_add) def add_properties(self, to_add): """ Add the named properties, setting mapping depending on suffix """ mappings = {} for name in to_add: ftype = name.rsplit("_", 1)[1] if "_" in name else 'default' mappings[name] = settings.ES_MAPPING_TYPES[ftype] self.es.indices.put_mapping(index=self.index, doc_type=self.doc_type, body={"properties": mappings}) def get_mapping(self): m = self.es.indices.get_mapping(self.index, self.doc_type) return m[self.index]['mappings'][self.doc_type]['properties'] def get_properties(self): self.check_index() return set(self.get_mapping().keys()) def refresh(self): self.es.indices.refresh() def highlight_article(self, aid: int, query: str) -> dict: """Highlight article given by an article id using a Lucene query. The resulting strings are safe to insert into an HTML document even if the original document contained malicious constructs. If you need the original article including HTML, call html.unescape on this output.""" from amcat.tools.amcates_queryset import ESQuerySet qs = ESQuerySet().filter(id=aid).only("text", "title").highlight(query, mark="em") try: return next(iter(qs)).to_dict() except StopIteration: raise ValueError( "Article(id={}) not found in elastic index.".format(aid)) def clear_cache(self): self.es.indices.clear_cache() def delete_index(self): try: self.es.indices.delete(self.index) except NotFoundError: pass except Exception as e: if 'IndexMissingException' in str(e): return raise def create_index(self, shards=5, replicas=1): es_settings = settings.ES_SETTINGS.copy() es_settings.update({ "number_of_shards": shards, "number_of_replicas": replicas }) body = { "settings": es_settings, "mappings": { settings.ES_ARTICLE_DOCTYPE: settings.ES_MAPPING } } self.es.indices.create(self.index, body) def check_index(self): """ Check whether the server is up and the index exists. If the server is down, raise an exception. If the index does not exist, try to create it. 
""" if not self.es.ping(): raise Exception("Elastic server cannot be reached") if not self.es.indices.exists(self.index): log.info("Index {self.index} does not exist, creating".format( **locals())) self.create_index() return self.es.cluster.health(self.index, wait_for_status='yellow') def exists_type(self, doc_type, **kargs): return self.es.indices.exists_type(index=self.index, doc_type=doc_type, **kargs) def put_mapping(self, doc_type, body, **kargs): return self.es.indices.put_mapping(index=self.index, doc_type=doc_type, body=body, **kargs) def status(self): nodes = self.es.nodes.info()['nodes'].values() return { "ping": self.es.ping(), "nodes": [n['name'] for n in nodes], "index": self.index, "index_health": self.es.cluster.health(self.index), "transport_hosts": self.es.transport.hosts, } def get(self, id, **options): """ Get a single article from the index """ kargs = dict(index=self.index, doc_type=self.doc_type) kargs.update(options) return self.es.get_source(id=id, **kargs) def mget(self, ids, doc_type=None, parents=None): """ Get multiple articles from the index. If paret is given, it should be a sequence of the same length as ids """ if parents is None: parents = [None] * len(ids) if doc_type is None: doc_type = self.doc_type getdocs = [{ "_index": self.index, "_id": id, "_parent": parent, "_type": doc_type } for (id, parent) in zip(ids, parents)] return self.es.mget({"docs": getdocs})['docs'] def search(self, body, **options): """ Perform a 'raw' search on the underlying ES index """ kargs = dict(index=self.index, doc_type=self.doc_type) kargs.update(options) return self.es.search(body=body, **kargs) def scan(self, query, **kargs): """ Perform a scan query on the es index See: http://elasticsearch-py.readthedocs.org/en/latest/helpers.html#elasticsearch.helpers.scan """ return scan(self.es, index=self.index, doc_type=self.doc_type, query=query, **kargs) def query_ids(self, query=None, filters=EMPTY_RO_DICT, body=None, limit=None, **kwargs): """ Query the index returning a sequence of article ids for the mathced articles @param query: a elastic query string (i.e. lucene syntax, e.g. 'piet AND (ja* OR klaas)') @param filter: field filter DSL query dict @param body: if given, use this instead of constructing from query/filters @param filters: if filter is None, build filter from filters as accepted by build_query, e.g. sets=12345 Note that query and filters can be combined in a single call """ if body is None: body = dict(build_body(query, filters, query_as_filter=True)) for i, a in enumerate( scan(self.es, query=body, index=self.index, doc_type=self.doc_type, size=(limit or 1000), fields="")): if limit and i >= limit: return yield int(a['_id']) def query(self, query=None, filters=EMPTY_RO_DICT, highlight=False, lead=False, fields=(), score=True, **kwargs): """ Execute a query for the given fields with the given query and filter @param query: a elastic query string (i.e. lucene syntax, e.g. 
'piet AND (ja* OR klaas)') @param filter: field filter DSL query dict, defaults to build_filter(**filters) @param kwargs: additional keyword arguments to pass to es.search, eg fields, sort, from_, etc @return: a list of named tuples containing id, score, and the requested fields """ body = dict( build_body(query, filters, query_as_filter=(not (highlight or score)))) if highlight and not score: body['query'] = {'constant_score': {'query': body['query']}} if 'sort' in kwargs: body['track_scores'] = True if highlight and query: if isinstance(highlight, dict): body['highlight'] = highlight else: body['highlight'] = HIGHLIGHT_OPTIONS if lead or False and query == "" and highlight: body['script_fields'] = { "lead": { "script": { "file": LEAD_SCRIPT_FIELD } } } result = self.search(body, fields=fields, **kwargs) return SearchResult(result, fields, score, body, query=query) def query_all(self, *args, **kargs): kargs.update({"from_": 0}) size = kargs.setdefault('size', 10000) result = self.query(*args, **kargs) total = result.total for offset in range(size, total, size): kargs['from_'] = offset result2 = self.query(*args, **kargs) result.hits += result2.hits return result def _get_used_properties(self, body__prop): body, prop = body__prop body["query"]["bool"]["must"][1]["exists"]["field"] = prop return bool( self.es.count(index=self.index, doc_type=self.doc_type, body=body)['count']) def get_used_properties(self, set_ids=None, article_ids=None, **filters): """ Returns a sequency of property names in use in the specified set(s) (or setids) """ if set_ids is not None: filters["sets"] = set_ids if article_ids is not None: filters["ids"] = article_ids all_properties = self.get_properties() flexible_properties = set(all_properties) - set(ALL_FIELDS) body = { "query": { "bool": { "must": [ build_filter(**filters), { "exists": { "field": "fakeprop" } } ] } } } bodies = (copy.deepcopy(body) for _ in range(len(flexible_properties))) pool = ThreadPool() results = pool.imap(self._get_used_properties, zip(bodies, flexible_properties)) try: for found, prop in zip(results, flexible_properties): if found: yield prop finally: pool.close() def add_articles(self, article_ids, batch_size=1000): """ Add the given article_ids to the index. This is done in batches, so there is no limit on the length of article_ids (which can be a generator). """ #WvA: remove redundancy with create_articles if not article_ids: return from amcat.models import Article, ArticleSetArticle n = len(article_ids) / batch_size for i, batch in enumerate( splitlist(article_ids, itemsperbatch=batch_size)): log.info("Adding batch {i}/{n}".format(**locals())) all_sets = multidict( (aa.article_id, aa.articleset_id) for aa in ArticleSetArticle.objects.filter(article__in=batch)) dicts = (get_article_dict(article, list(all_sets.get(article.id, []))) for article in Article.objects.filter(pk__in=batch)) self.bulk_insert(dicts, batch_size=None) def remove_from_set(self, setid, article_ids, flush=True): """Remove the given articles from the given set. This is done in batches, so there is no limit on the length of article_ids (which can be a generator).""" if not article_ids: return for batch in splitlist(article_ids, itemsperbatch=1000): self.bulk_update(batch, UPDATE_SCRIPT_REMOVE_FROM_SET, params={'set': setid}) def add_to_set(self, setid, article_ids, monitor=NullMonitor()): """Add the given articles to the given set. 
This is done in batches, so there is no limit on the length of article_ids (which can be a generator).""" if not article_ids: if monitor: monitor.update() return batches = list(splitlist(article_ids, itemsperbatch=1000)) monitor = monitor.submonitor(total=len(batches)) nbatches = len(batches) for i, batch in enumerate(batches): monitor.update(message="Adding batch {iplus}/{nbatches}..".format( iplus=i + 1, nbatches=nbatches)) self.bulk_update(batch, UPDATE_SCRIPT_ADD_TO_SET, params={'set': setid}) def get_tokens(self, aid: int, fields=["text", "title"]): """ Get a list of all tokens (words and their positions) in the given document :param aid: Article ID :param fields: List of fields to get the terms for :return: a sequence of (field, position, term) tuples """ fieldstr = ",".join(fields) data = self.es.termvectors(self.index, self.doc_type, aid, fields=fieldstr, field_statistics=False, payloads=False, offsets=False) for field in fields: if field in data['term_vectors']: for term, info in data['term_vectors'][field]['terms'].items(): for token in info['tokens']: yield field, token['position'], term def bulk_insert(self, dicts, batch_size=1000, monitor=NullMonitor()): """ Bulk insert the given articles in batches of batch_size """ batches = list(toolkit.splitlist( dicts, itemsperbatch=batch_size)) if batch_size else [dicts] monitor = monitor.submonitor(total=len(batches)) nbatches = len(batches) for i, batch in enumerate(batches): monitor.update( 1, "Adding batch {iplus}/{nbatches}".format(iplus=i + 1, **locals())) props, articles = set(), {} for d in batch: props |= (set(d.keys()) - ALL_FIELDS) articles[d["id"]] = serialize(d) self.check_properties(props) body = get_bulk_body(articles) resp = self.es.bulk(body=body, index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE) if resp["errors"]: raise ElasticSearchError(resp) def update_values(self, article_id, values): """Update properties of existing article. @param values: mapping from field name to (new) value @type values: dict""" return self.bulk_update_values({article_id: values}) def bulk_update_values(self, articles): """Updates set of articles in bulk. """ body = get_bulk_body( {aid: serialize({"doc": a}) for aid, a in articles.items()}, action="update") resp = self.es.bulk(body=body, index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE) if resp["errors"]: raise ElasticSearchError(resp) def bulk_update(self, article_ids, script, params): """ Execute a bulk update script with the given params on the given article ids. """ payload = serialize({"script": {"file": script, "params": params}}) body = get_bulk_body({aid: payload for aid in article_ids}, action="update") resp = self.es.bulk(body=body, index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE) if resp["errors"]: raise ElasticSearchError(resp) def synchronize_articleset(self, aset, full_refresh=False): """ Make sure the given articleset is correctly stored in the index @param full_refresh: if true, re-add all articles to the index. 
Use this after changing properties of articles """ self.check_index() # make sure index exists and is at least 'yellow' log.debug("Getting SOLR ids from set") solr_set_ids = set(self.query_ids(filters=dict(sets=[aset.id]))) log.debug("Getting DB ids") db_ids = aset.get_article_ids() log.debug("Getting SOLR ids") solr_ids = set(self.in_index(db_ids)) to_remove = solr_set_ids - db_ids if full_refresh: to_add_docs = db_ids to_add_set = set() else: to_add_docs = db_ids - solr_ids to_add_set = (db_ids & solr_ids) - solr_set_ids log.warning( "Refreshing index, full_refresh={full_refresh}," "|solr_set_ids|={nsolrset}, |db_set_ids|={ndb}, |solr_ids|={nsolr} " "|to_add| = {nta}, |to_add_set|={ntas}, |to_remove_set|={ntr}". format(nsolr=len(solr_ids), nsolrset=len(solr_set_ids), ndb=len(db_ids), nta=len(to_add_docs), ntas=len(to_add_set), ntr=len(to_remove), **locals())) log.info("Removing {} articles".format(len(to_remove))) self.remove_from_set(aset.id, to_remove) log.info("Adding {} articles to set".format(len(to_add_set))) self.add_to_set(aset.id, to_add_set) log.info("Adding {} articles to index".format(len(to_add_docs))) self.add_articles(to_add_docs) log.info("Refreshing") self.refresh() def _count(self, body): """Raw version of count directly passing given query to elastic, while setting the index and doc_type""" return self.es.count(index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE, body=body) def count(self, query=None, filters=None): """ Compute the number of items matching the given query / filter """ filters = dict(build_body(query, filters, query_as_filter=True)) body = {"query": {"constant_score": filters}} return self._count(body)["count"] def search_aggregate(self, aggregation, query=None, filters=None, **options): """ Run an aggregate search query and return the aggregation results @param aggregation: raw elastic query, e.g. 
{"terms" : {"field" : "medium"}} """ body = dict(query={ "filtered": dict(build_body(query, filters, query_as_filter=True)) }, aggregations={"aggregation": aggregation}) result = self.search(body, size=0, search_type="count", **options) return result['aggregations']['aggregation'] def _parse_terms_aggregate(self, aggregate, group_by, terms, sets): if not group_by: for term in terms: yield term, aggregate[term.label]['doc_count'] else: for term in terms: yield term, self._parse_aggregate(aggregate[term.label], list(group_by), terms, sets) def _parse_other_aggregate(self, aggregate, group_by, group, terms, sets): buckets = aggregate[group]["buckets"] if not group_by: return ((b['key'], b['doc_count']) for b in buckets) return ((b['key'], self._parse_aggregate(b, list(group_by), terms, sets)) for b in buckets) def _parse_aggregate(self, aggregate, group_by, terms, sets): """Parse a aggregation result to (nested) namedtuples.""" group = group_by.pop(0) if group == "terms": result = self._parse_terms_aggregate(aggregate, group_by, terms, sets) else: result = self._parse_other_aggregate(aggregate, group_by, group, terms, sets) if group == "sets" and sets is not None: # Filter sets if 'sets' is given result = ((aset_id, res) for aset_id, res in result if aset_id in set(sets)) elif group == "date": # Parse timestamps as datetime objects result = ((get_date(stamp), aggr) for stamp, aggr in result) # Return results as namedtuples ntuple = namedtuple("Aggr", [group, "buckets" if group_by else "count"]) return [ntuple(*r) for r in result] def _build_aggregate(self, group_by, date_interval, terms, sets): """Build nested aggregation query for list of groups""" group = group_by.pop(0) if group == 'date': aggregation = { group: { 'date_histogram': { 'field': group, 'interval': date_interval, "min_doc_count": 1 } } } elif group == 'terms': aggregation = { term.label: { 'filter': dict(build_body(term.query)) } for term in terms } else: aggregation = { group: { 'terms': { # Default size is too small, we want to return all results 'size': 999999, 'field': group } } } # We need to nest the other aggregations, see: # http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/search-aggregations.html if group_by: nested = self._build_aggregate(group_by, date_interval, terms, sets) for aggr in aggregation.values(): aggr["aggregations"] = nested return aggregation def aggregate_query(self, query=None, filters=None, group_by=None, terms=None, sets=None, date_interval='month'): """ Compute an aggregate query, e.g. select count(*) where <filters> group by <group_by>. If date is used as a group_by variable, uses date_interval to bin it. It does support multiple values for group_by. You can group_by on terms by supplying "terms" to group_by. In addition, you will need to supply terms as a parameter, which consists of a list of SearchQuery's. Query is then used as a global filter, while terms are 'local'. @param query: an elastic query string (i.e. lucene syntax, e.g. 
'piet AND (ja* OR klaas)') @type group_by: list / tuple @type mediums: bool @param mediums: return Medium objects, instead of ids """ if isinstance(group_by, str): log.warning( "Passing strings to aggregate_query(group_by) is deprecated.") group_by = [group_by] if "terms" in group_by and terms is None: raise ValueError( "You should pass a list of terms if aggregating on it.") filters = dict(build_body(query, filters, query_as_filter=True)) aggregations = self._build_aggregate(list(group_by), date_interval, terms, sets) body = { "query": { "constant_score": filters }, "aggregations": aggregations } log.debug("es.search(body={body})".format(**locals())) result = self.search(body) result = self._parse_aggregate(result["aggregations"], list(group_by), terms, sets) return result def statistics(self, query=None, filters=None): """Compute and return a Result object with n, start_date and end_date for the selection""" body = { "query": { "constant_score": dict(build_body(query, filters, query_as_filter=True)) }, 'aggregations': { 'stats': { 'stats': { 'field': 'date' } } } } stats = self.search(body, size=0)['aggregations']['stats'] result = Result() result.n = stats['count'] if result.n == 0: result.start_date, result.end_date = None, None else: result.start_date = get_date(stats['min']) result.end_date = get_date(stats['max']) return result def list_dates(self, query=None, filters=None, interval="day"): from amcat.tools.aggregate_es import aggregate, IntervalCategory for date, count in aggregate(query, filters, [IntervalCategory(interval)], es=self): yield date def in_index(self, ids): """ Check whether the given ids are already indexed. @return: a sequence of ids that are in the index """ if not isinstance(ids, list): ids = list(ids) log.info( "Checking existence of {nids} documents".format(nids=len(ids))) if not ids: return for batch in splitlist(ids, itemsperbatch=10000): result = self.es.mget(index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE, body={"ids": batch}, fields=[]) for doc in result['docs']: if doc['found']: yield int(doc['_id']) def duplicate_exists(self, article): """ Check whether a duplicate of the given article already exists. If so, returns the sets that the duplicate is a member of. 
Duplication is checked using the get_hash function, so article should be an object with the appropriate attributes (.title etc) @return: A (possibly empty) sequence of results with .id and .sets """ hash = get_article_dict(article).hash return self.query(filters={'hashes': hash}, fields=["sets"], score=False) def _get_purge_actions(self, query): for id in self.query_ids(body=query): yield { "_op_type": "delete", "_id": id, "_index": self.index, "_type": settings.ES_ARTICLE_DOCTYPE } def purge_orphans(self): """Remove all articles without a set from the index""" query = { "query": { "constant_score": { "filter": { "missing": { "field": "sets" } } } } } return bulk(self.es, self._get_purge_actions(query)) def get_child_type_counts(self, **filters): """Get the number of child documents per type""" filters = dict(build_body(filters=filters)) filter = { "has_parent": { "parent_type": self.doc_type, "filter": filters['filter'] } } aggs = {"module": {"terms": {"field": "_type"}}} body = {"aggs": {"prep": {"filter": filter, "aggs": aggs}}} r = self.es.search(index=self.index, search_type="count", body=body) for b in r['aggregations']['prep']['module']['buckets']: yield b['key'], b['doc_count'] def get_articles_without_child(self, child_doctype, limit=None, **filters): """Return the ids of all articles without a child of the given doctype""" nochild = { "not": { "has_child": { "type": child_doctype, "query": { "match_all": {} } } } } filter = dict(build_body(filters=filters))['filter'] body = {"filter": {"bool": {"must": [filter, nochild]}}} return self.query_ids(body=body, limit=limit)
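A sketch of how this wrapper is driven end to end; the host, index, and set ids below are invented, and in AmCAT the instance is normally obtained from the module rather than constructed by hand.

# Illustrative values only; the real index/doc_type come from Django settings.
es = _ES(index='amcat', doc_type='article', host='localhost', port=9200)
es.check_index()  # ping the server, creating the index if it is missing

# Lazily yield article ids for a lucene-style query within one article set.
for aid in es.query_ids(query='piet AND (ja* OR klaas)',
                        filters={'sets': [12345]}, limit=100):
    print(aid)

# Count the same selection, then aggregate it per medium per month.
n = es.count(query='piet', filters={'sets': [12345]})
buckets = es.aggregate_query(filters={'sets': [12345]},
                             group_by=['medium', 'date'],
                             date_interval='month')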
class Elastic_Search: def __init__(self, index='iis-logs-', aws_secret_id=None): self.timestamp = datetime.datetime.utcnow() self.index = index self._setup_Elastic_on_localhost() # default to localhost self._result = None if index and aws_secret_id: self._setup_Elastic_on_cloud_via_AWS_Secret(index, aws_secret_id) def _setup_Elastic_on_localhost(self): self.host = 'localhost' self.port = 9200 self.scheme = 'http' self.es = Elasticsearch([{'host': self.host, 'port': self.port}]) def _setup_Elastic_on_cloud_via_AWS_Secret(self, index, secret_id): credentials = json.loads(Secrets(secret_id).value()) self.host = credentials['host'] self.username = credentials['username'] self.password = credentials['password'] self.port = credentials['port'] self.index = index self._setup_Elastic_on_cloud(self.host, self.port, self.username, self.password) return self def _setup_Elastic_on_cloud(self, host, port, username, password): self.host = host self.port = port self.username = username self.password = password self.scheme = 'https' self.es = Elasticsearch([host], http_auth=(username, password), scheme="https", port=port) return self def add_data_with_timestamp(self, data): data["@timestamp"] = self.timestamp return self.es.index(index=self.index, doc_type='item', body=data) def add(self, data, id_key=None): try: if id_key is not None: return self.es.index(index=self.index, doc_type='item', body=data, id=data[id_key]) else: return self.es.index(index=self.index, doc_type='item', body=data) except Exception as error: print("elk-error", error) return {"elk-error": "{0}".format(error)} def add_bulk(self, data, id_key=None, pipeline=None): ok = 0 if data: actions = [] for item in data: item_data = { "_index": self.index, "_type": 'item', "_source": item, } if id_key is not None: item_data["_id"] = item[id_key] actions.append(item_data) if pipeline is None: ok, _ = helpers.bulk(self.es, actions, index=self.index) else: ok, _ = helpers.bulk(self.es, actions, index=self.index, pipeline=pipeline) return ok def create_index(self, body=None): if body is None: body = {} if self.exists() is False: self._result = self.es.indices.create(index=self.index, body=body) return self def create_index_with_location_geo_point(self, field="location"): body = { "mappings": { "item": { "properties": { field: { "type": "geo_point" } } } } } self.create_index(body) return self def create_index_pattern(self, add_time_field=True): if add_time_field: payload = { "type": "index-pattern", "index-pattern": { "title": self.index + '*', "timeFieldName": "date" } } else: print('creating index pattern without a time field') payload = { "type": "index-pattern", "index-pattern": { "title": self.index + '*' } } data = json.dumps(payload) headers = {'Content-Type': 'application/json'} if self.host == 'localhost': url = 'http://{0}:{1}/.kibana/doc/index-pattern:{2}'.format( self.host, self.port, self.index) self._result = json.loads(PUT(url, data, headers)) else: url = 'https://{0}:{1}/.kibana/doc/index-pattern:{2}'.format( self.host, self.port, self.index) response = requests.put(url, data, headers=headers, auth=HTTPBasicAuth(self.username, self.password)) self._result = json.loads(response.text) return self def delete_index_pattern(self): try: if self.host == 'localhost': url = 'http://{0}:{1}/.kibana/doc/index-pattern:{2}'.format( self.host, self.port, self.index) self._result = json.loads(DELETE(url)) else: url = 'https://{0}:{1}/.kibana/doc/index-pattern:{2}'.format( self.host, self.port, self.index) response = 
requests.delete(url, auth=HTTPBasicAuth( self.username, self.password)) self._result = json.loads(response.text) except Exception as error: self._result = {'error': error} return self def delete_data_by_id(self, id): return self.es.delete(index=self.index, doc_type='item', id=id) def get_data(self, id): try: return self.es.get(index=self.index, doc_type='item', id=id) except NotFoundError: return None def get_many(self, ids): data = self.es.mget(index=self.index, doc_type='item', body={'ids': ids}) results = {} for item in data['docs']: _id = item['_id'] if item['found'] is False: results[_id] = None else: results[_id] = item['_source'] return results def get_data_First_10(self): results = self.es.search(index=self.index, body={"query": { "match_all": {} }}) for result in results['hits']['hits']: yield result['_source'] def get_index_settings(self): url = 'https://{3}:{4}@{0}:{1}/{2}/_settings'.format( self.host, self.port, self.index, self.username, self.password) return json.loads(requests.get(url).text) def search_using_lucene( self, query, size=10000, sort=None ): # for syntax and examples of lucene queries see https://www.elastic.co/guide/en/elasticsearch/reference/6.4/query-dsl-query-string-query.html#query-string-syntax query = query.replace('“', '"').replace( '”', '"') # fix the quotes we receive from Slack results = self.es.search(index=self.index, q=query, size=size, sort=sort) for result in results['hits']['hits']: yield result['_source'] def search_using_lucene_index_by_id( self, query, size=10000, sort=None ): # for syntax and examples of lucene queries see https://www.elastic.co/guide/en/elasticsearch/reference/6.4/query-dsl-query-string-query.html#query-string-syntax query = query.replace('“', '"').replace( '”', '"') # fix the quotes we receive from Slack elk_results = self.es.search(index=self.index, q=query, size=size, sort=sort) results = {} for result in elk_results['hits']['hits']: id = result['_id'] value = result['_source'] results[id] = value return results def search_using_lucene_sort_by_date( self, query, size=10000 ): # for syntax and examples of lucene queries see https://www.elastic.co/guide/en/elasticsearch/reference/6.4/query-dsl-query-string-query.html#query-string-syntax query = query.replace('“', '"').replace( '”', '"') # fix the quotes we receive from Slack elk_results = self.es.search(index=self.index, q=query, size=size, sort="date:desc") results = [] for result in elk_results['hits']['hits']: id = result['_id'] value = result['_source'] item = {"id": id, "value": value} results.append(item) return results def search_using_query(self, query, size=10000): results = self.es.search(index=self.index, body=query, size=size) for result in results['hits']['hits']: yield result['_source'] def search_on_field_for_value(self, field, value, size=10000): query = {"query": {"match": {field: {"query": value}}}} return self.search_using_query(query, size=size) def search_on_field_for_values(self, field, values): query = { "query": { "constant_score": { "filter": { "terms": { field: values } } } } } return self.search_using_query(query) # this is not working # def search_get_unique_field_values(self, field,size = 10000): # query = { # "size": 0, # "aggs": { # "unique_ids": { # "terms": { # "field": 'field', # "size": size # } # } # } # } # return self.search_using_query(query) def set_index_settings(self, settings): headers = {'Content-Type': 'application/json'} url = 'https://{0}:{1}/{2}/_settings'.format(self.host, self.port, self.index) response = requests.put(url, 
json.dumps(settings), headers=headers, auth=HTTPBasicAuth(self.username, self.password)) return response.text def set_index_settings_total_fields(self, value): self.set_index_settings({"index.mapping.total_fields.limit": value}) return self def delete_using_query(self, query): results = self.es.delete_by_query(index=self.index, body=query) return results def delete_index(self): if self.exists(): self._result = self.es.indices.delete(self.index) return self def index_list(self): return set(self.es.indices.get_alias()) def exists(self): return self.es.indices.exists(self.index) def set_index(self, index): self.index = index return self
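A minimal sketch of the localhost path through this helper; the index name and documents are invented, and the explicit refresh is only there so the search can see the freshly loaded items.

# Invented index and documents; the defaults connect to localhost:9200.
elastic = Elastic_Search(index='iis-logs-demo').create_index()

elastic.add({'path': '/index.html', 'status': 200})
elastic.add_bulk([{'path': '/a', 'status': 200},
                  {'path': '/b', 'status': 404}])

elastic.es.indices.refresh(index=elastic.index)  # make new docs searchable
for hit in elastic.search_on_field_for_value('status', 404):
    print(hit)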
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 22 17:00:46 2018

@author: Ajanta
"""
import json

from elasticsearch import Elasticsearch

# The Elasticsearch constructor takes hosts, not index/type names; those are
# passed per request. A match_all query is a search, not an mget (mget needs ids).
INDEX_NAME = 'products'
TYPE_NAME = 'snapdeal'

query = {'query': {'match_all': {}}}
elastic_obj = Elasticsearch()

results = elastic_obj.search(index=INDEX_NAME, doc_type=TYPE_NAME, body=query)

with open('products.json', 'w') as f:
    f.write(json.dumps(results))
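A plain search like the one above only returns the first page of hits (10 by default), so for a full export the scan helper from elasticsearch.helpers is the usual route. A sketch under the same index and type names, writing one JSON document per line:

import json

from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

# Stream every document in products/snapdeal instead of one search page.
es = Elasticsearch()
with open('products_full.json', 'w') as f:
    for hit in scan(es, index='products', doc_type='snapdeal',
                    query={'query': {'match_all': {}}}):
        f.write(json.dumps(hit['_source']) + '\n')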
class SearchEngine(object): def __init__(self, prefix=settings.ELASTICSEARCH_PREFIX): # serializer = JSONSerializer() serializer.mimetype = 'application/json' serializer.dumps = serializer.serialize serializer.loads = JSONDeserializer().deserialize self.es = Elasticsearch(hosts=settings.ELASTICSEARCH_HOSTS, serializer=serializer, **settings.ELASTICSEARCH_CONNECTION_OPTIONS) self.logger = logging.getLogger(__name__) self.prefix = prefix.lower() def _add_prefix(self, *args, **kwargs): if args: index = args[0].strip() else: index = kwargs.get('index', '').strip() if index is None or index == '': raise NotImplementedError("Elasticsearch index not specified.") prefix = '%s_' % self.prefix.strip() if self.prefix and self.prefix.strip() != '' else '' index = '%s%s' % (prefix, index) if args: return index else: return dict(kwargs, index=index) def delete(self, **kwargs): """ Deletes a document from the index Pass an index, doc_type, and id to delete a specific document Pass a body with a query dsl to delete by query """ kwargs = self._add_prefix(**kwargs) body = kwargs.pop('body', None) if body != None: try: data = [] refresh = kwargs.pop('refresh', False) for hit in helpers.scan(self.es, query=body, **kwargs): hit['_op_type'] = 'delete' data.append(hit) return helpers.bulk(self.es, data, refresh=refresh, **kwargs) except Exception as detail: try: # ignore 404 errors (index_not_found_exception) if detail.status_code == 404: pass except: self.logger.warning('%s: WARNING: failed to delete document by query: %s \nException detail: %s\n' % (datetime.now(), body, detail)) raise detail else: try: return self.es.delete(ignore=[404], **kwargs) except Exception as detail: self.logger.warning('%s: WARNING: failed to delete document: %s \nException detail: %s\n' % (datetime.now(), body, detail)) raise detail def delete_index(self, **kwargs): """ Deletes an entire index """ kwargs = self._add_prefix(**kwargs) print 'deleting index : %s' % kwargs.get('index') return self.es.indices.delete(ignore=[400, 404], **kwargs) def search(self, **kwargs): """ Search for an item in the index. 
Pass an index, doc_type, and id to get a specific document Pass a body with a query dsl to perform a search """ kwargs = self._add_prefix(**kwargs) body = kwargs.get('body', None) id = kwargs.get('id', None) if id: if isinstance(id, list): kwargs.setdefault('body', {'ids': kwargs.pop('id')}) return self.es.mget(**kwargs) else: return self.es.get(**kwargs) ret = None try: ret = self.es.search(**kwargs) except Exception as detail: self.logger.warning('%s: WARNING: search failed for query: %s \nException detail: %s\n' % (datetime.now(), body, detail)) pass return ret def create_mapping(self, index, doc_type, fieldname='', fieldtype='string', fieldindex=None, body=None): """ Creates an Elasticsearch body for a single field given an index name and type name """ index = self._add_prefix(index) if not body: if fieldtype == 'geo_shape': body = { doc_type : { 'properties' : { fieldname : { 'type' : 'geo_shape', 'tree' : 'geohash', 'precision': '1m' } } } } else: fn = { 'type' : fieldtype } if fieldindex: fn['index'] = fieldindex body = { doc_type : { 'properties' : { fieldname : fn } } } self.es.indices.create(index=index, ignore=400) self.es.indices.put_mapping(index=index, doc_type=doc_type, body=body) print 'creating index : %s/%s' % (index, doc_type) def create_index(self, **kwargs): kwargs = self._add_prefix(**kwargs) self.es.indices.create(**kwargs) print 'creating index : %s' % kwargs.get('index', '') def index_data(self, index=None, doc_type=None, body=None, idfield=None, id=None, **kwargs): """ Indexes a document or list of documents into Elasticsearch If "id" is supplied then will use that as the id of the document If "idfield" is supplied then will try to find that property in the document itself and use the value found for the id of the document """ index = self._add_prefix(index) if not isinstance(body, list): body = [body] for document in body: if idfield is not None: if isinstance(document, dict): id = document[idfield] else: id = getattr(document,idfield) try: self.es.index(index=index, doc_type=doc_type, body=document, id=id) except Exception as detail: self.logger.warning('%s: WARNING: failed to index document: %s \nException detail: %s\n' % (datetime.now(), document, detail)) raise detail def bulk_index(self, data, **kwargs): return helpers.bulk(self.es, data, **kwargs) def create_bulk_item(self, op_type='index', index=None, doc_type=None, id=None, data=None): return { '_op_type': op_type, '_index': self._add_prefix(index), '_type': doc_type, '_id': id, '_source': data } def count(self, **kwargs): kwargs = self._add_prefix(**kwargs) count = self.es.count(**kwargs) if count is not None: return count['count'] else: return None def BulkIndexer(outer_self, batch_size=500, **kwargs): class _BulkIndexer(object): def __init__(self, **kwargs): self.queue = [] self.batch_size = kwargs.pop('batch_size', 500) self.kwargs = kwargs def add(self, op_type='index', index=None, doc_type=None, id=None, data=None): doc = { '_op_type': op_type, '_index': outer_self._add_prefix(index), '_type': doc_type, '_id': id, '_source': data } self.queue.append(doc) if len(self.queue) >= self.batch_size: outer_self.bulk_index(self.queue, **self.kwargs) del self.queue[:] #clear out the array def close(self): outer_self.bulk_index(self.queue, **self.kwargs) def __enter__(self, **kwargs): return self def __exit__(self, type, value, traceback): return self.close() return _BulkIndexer(batch_size=batch_size, **kwargs)
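The BulkIndexer above is intended to be used as a context manager: add() queues actions, flushes whenever the queue reaches batch_size, and close() flushes the remainder on exit. A sketch, assuming the Django-style settings consumed by the constructor are configured; the prefix, index, and documents are invented.

# Assumes settings.ELASTICSEARCH_* are configured; data below is invented.
se = SearchEngine(prefix='demo')

with se.BulkIndexer(batch_size=500) as indexer:
    for i, doc in enumerate([{'name': 'a'}, {'name': 'b'}]):
        # '_index' is expanded to 'demo_resources' by _add_prefix()
        indexer.add(index='resources', doc_type='entity', id=i, data=doc)

print se.count(index='resources')  # Python 2 print, as in the class above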
class Kidash: def __init__(self, url, index='.kibana', doc_type=None): self.es = Elasticsearch(url, verify_certs=True) self.index = index self.doc_type = doc_type def search_request(self, body, size, filter_path): """Make a search in ES based on the given parameters. :param body: Body of the query to send :param filter_path: Filter for the parameters to retrieve :param size: length of the elements to retrieve :returns: an Object with the elements retrieved """ request = self.es.search(index=self.index, doc_type=self.doc_type, body=body, filter_path=filter_path, size=size) return request def get_number_of_items(self, body): """Retrieve the number of items for a given search. :param body: Body of the query to send :returns: A counter of the total number of ids """ filter_path = ['hits.total'] request = self.search_request(body, 1, filter_path) t_ids = request['hits']['total'] return t_ids def list_item_ids(self, body): """Retrieve the list of items for a given search. :param body: Body of the query to send :returns: an id's list """ filter_path = ['hits.hits._id'] size = self.get_number_of_items(body) request = self.search_request(body, size, filter_path) ids_list = request['hits']['hits'] return ids_list def retrieve_items_by_list(self, ids_list): """Retrieve items based in a given id's list. :param body: Body of the query to send :returns: The list of elements retrieved """ body_docs = {'docs': ids_list} request = self.es.mget(index=self.index, doc_type=self.doc_type, body=json.dumps(body_docs)) elements_list = request['docs'] return elements_list def retrieve_items_by_query(self, body=ALL): """Retrieve items based in a given query. By default it launches the query 'match_all' :param body: Body of the query to send :returns: The list of elements retrieved """ filter_path = ['hits.hits._*'] size = self.get_number_of_items(body) request = self.search_request(body, size, filter_path) return request['hits']['hits'] def stream_items(self, query): """Scan the items of a given query, retrieve it and adds the delete operation. :param doc_type: Type of document to search :param query: Body of the query to send :yields: The elements retrieved """ for item in scan(self.es, query=query, index=self.index, doc_type=self.doc_type, scroll='1m', _source=False): del item['_score'] item['_op_type'] = 'delete' yield item def load_items(self, list_of_elements): """Load a list of given items into ElasticSearch. :param doc_type: Type of document to search :param list_of_elements: List of the elements to load """ bulk_items = [] for element in list_of_elements: item = { "_index": self.index, "_type": element['_type'], "_id": element['_id'], "_source": element['_source'], } bulk_items.append(item) bulk(self.es, bulk_items) def delete_items(self, query): """Remove the elements of a given query by using Bulk operations. :param doc_type: Type of document to search :param query: Body of the query to send """ bulk(self.es, self.stream_items(query), chunk_size=CHUNK_SIZE) def import_items(self, filepath): """Import a set of elements given a file. :param filepath: Path of the file to load """ list_of_elements = json.loads(open(filepath).read()) self.load_items(list_of_elements) def export_items(self, output_file, query): """Export a set of elements based on the parameters given. :param output_file: File where to export the items """ items = self.retrieve_items_by_query(query) try: output_file.write(json.dumps(items, indent=2, sort_keys=True)) output_file.write('\n') except IOError as e: raise RuntimeError(str(e))
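End to end, the class above can be driven like this; the URL, file name, and title filter are invented, and ALL and CHUNK_SIZE are module-level constants assumed to be defined alongside the class.

# Invented connection and query values.
kidash = Kidash('http://localhost:9200', index='.kibana', doc_type='dashboard')

# Export every object of this doc_type matching a match_all query.
with open('dashboards.json', 'w') as out:
    kidash.export_items(out, query={'query': {'match_all': {}}})

# Re-import the saved objects later, then bulk-delete the obsolete ones.
kidash.import_items('dashboards.json')
kidash.delete_items({'query': {'match': {'title': 'obsolete'}}})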
class _ES(object): def __init__(self, index, doc_type, host, port, timeout=300, **args): self.host = host self.port = port self.index = index self.doc_type = doc_type self.es = Elasticsearch(hosts=[{"host": self.host, "port": self.port}, ], timeout=timeout, **args) def check_properties(self, properties): """ Check if all properties are known (e.g. have mappings), and creates mappings as needed """ properties = set(properties) if not (properties - self.get_properties()): return to_add = properties - self.get_properties() if to_add: self.add_properties(to_add) def add_properties(self, to_add): """ Add the named properties, setting mapping depending on suffix """ mappings = {} for name in to_add: ftype = name.rsplit("_", 1)[1] if "_" in name else 'default' mappings[name] = settings.ES_MAPPING_TYPES[ftype] self.es.indices.put_mapping(index=self.index, doc_type=self.doc_type, body={"properties": mappings}) def get_mapping(self): m = self.es.indices.get_mapping(self.index, self.doc_type) return m[self.index]['mappings'][self.doc_type]['properties'] def get_properties(self): self.check_index() return set(self.get_mapping().keys()) def refresh(self): self.es.indices.refresh() def highlight_article(self, aid: int, query: str) -> dict: """Highlight article given by an article id using a Lucene query. The resulting strings are safe to insert into an HTML document even if the original document contained malicious constructs. If you need the original article including HTML, call html.unescape on this output.""" from amcat.tools.amcates_queryset import ESQuerySet qs = ESQuerySet().filter(id=aid).only("text", "title").highlight(query, mark="em") try: return next(iter(qs)).to_dict() except StopIteration: raise ValueError("Article(id={}) not found in elastic index.".format(aid)) def clear_cache(self): self.es.indices.clear_cache() def delete_index(self): try: self.es.indices.delete(self.index) except NotFoundError: pass except Exception as e: if 'IndexMissingException' in str(e): return raise def create_index(self, shards=5, replicas=1): es_settings = settings.ES_SETTINGS.copy() es_settings.update({"number_of_shards": shards, "number_of_replicas": replicas}) body = { "settings": es_settings, "mappings": { settings.ES_ARTICLE_DOCTYPE: settings.ES_MAPPING } } self.es.indices.create(self.index, body) def check_index(self): """ Check whether the server is up and the index exists. If the server is down, raise an exception. If the index does not exist, try to create it. 
""" if not self.es.ping(): raise Exception("Elastic server cannot be reached") if not self.es.indices.exists(self.index): log.info("Index {self.index} does not exist, creating".format(**locals())) self.create_index() return self.es.cluster.health(self.index, wait_for_status='yellow') def exists_type(self, doc_type, **kargs): return self.es.indices.exists_type(index=self.index, doc_type=doc_type, **kargs) def put_mapping(self, doc_type, body, **kargs): return self.es.indices.put_mapping(index=self.index, doc_type=doc_type, body=body, **kargs) def status(self): nodes = self.es.nodes.info()['nodes'].values() return {"ping": self.es.ping(), "nodes": [n['name'] for n in nodes], "index": self.index, "index_health": self.es.cluster.health(self.index), "transport_hosts": self.es.transport.hosts, } def get(self, id, **options): """ Get a single article from the index """ kargs = dict(index=self.index, doc_type=self.doc_type) kargs.update(options) return self.es.get_source(id=id, **kargs) def mget(self, ids, doc_type=None, parents=None): """ Get multiple articles from the index. If paret is given, it should be a sequence of the same length as ids """ if parents is None: parents = [None] * len(ids) if doc_type is None: doc_type = self.doc_type getdocs = [{"_index": self.index, "_id": id, "_parent": parent, "_type": doc_type} for (id, parent) in zip(ids, parents)] return self.es.mget({"docs": getdocs})['docs'] def search(self, body, **options): """ Perform a 'raw' search on the underlying ES index """ kargs = dict(index=self.index, doc_type=self.doc_type) kargs.update(options) if log.isEnabledFor(logging.DEBUG): # pprint can be expensive log.debug("Search with body:\n {}".format(pprint.pformat(body))) return self.es.search(body=body, **kargs) def scan(self, query, **kargs): """ Perform a scan query on the es index See: http://elasticsearch-py.readthedocs.org/en/latest/helpers.html#elasticsearch.helpers.scan """ return scan(self.es, index=self.index, doc_type=self.doc_type, query=query, **kargs) def query_ids(self, query=None, filters=EMPTY_RO_DICT, body=None, limit=None, **kwargs): """ Query the index returning a sequence of article ids for the mathced articles @param query: a elastic query string (i.e. lucene syntax, e.g. 'piet AND (ja* OR klaas)') @param filter: field filter DSL query dict @param body: if given, use this instead of constructing from query/filters @param filters: if filter is None, build filter from filters as accepted by build_query, e.g. sets=12345 Note that query and filters can be combined in a single call """ if body is None: body = dict(build_body(query, filters, query_as_filter=True)) log.debug("query_ids with body:\n {}".format(pprint.pformat(body))) for i, a in enumerate(scan(self.es, query=body, index=self.index, doc_type=self.doc_type, size=(limit or 1000), _source=False)): if limit and i >= limit: return yield int(a['_id']) def query(self, query=None, filters=EMPTY_RO_DICT, highlight=False, lead=False, _source=(), score=True, **kwargs): """ Execute a query for the given fields with the given query and filter @param query: a elastic query string (i.e. lucene syntax, e.g. 
'piet AND (ja* OR klaas)') @param filter: field filter DSL query dict, defaults to build_filter(**filters) @param kwargs: additional keyword arguments to pass to es.search, eg fields, sort, from_, etc @return: a list of named tuples containing id, score, and the requested fields """ body = dict(build_body(query, filters, query_as_filter=(not (highlight or score)))) if highlight and not score: body['query'] = {'constant_score': {'query': body['query']}} if 'sort' in kwargs: body['track_scores'] = True if highlight and query: if isinstance(highlight, dict): body['highlight'] = highlight else: body['highlight'] = HIGHLIGHT_OPTIONS if lead or False and query == "" and highlight: body['script_fields'] = {"lead": {"script": LEAD_SCRIPT_FIELD}} result = self.search(body, _source=_source, **kwargs) return SearchResult(result, _source, score, body, query=query) def query_all(self, *args, **kargs): kargs.update({"from_": 0}) size = kargs.setdefault('size', 10000) result = self.query(*args, **kargs) total = result.total for offset in range(size, total, size): kargs['from_'] = offset result2 = self.query(*args, **kargs) result.hits += result2.hits return result def _get_used_properties(self, body__prop): body, prop = body__prop body["query"]["bool"]["must"][1]["exists"]["field"] = prop return bool(self.es.count(index=self.index, doc_type=self.doc_type, body=body)['count']) def get_used_properties(self, set_ids=None, article_ids=None, **filters): """ Returns a sequency of property names in use in the specified set(s) (or setids) """ if set_ids is not None: filters["sets"] = set_ids if article_ids is not None: filters["ids"] = article_ids all_properties = self.get_properties() flexible_properties = set(all_properties) - set(ALL_FIELDS) body = {"query": {"bool": {"must": [ build_filter(**filters), {"exists": {"field": "fakeprop"}} ]}}} bodies = (copy.deepcopy(body) for _ in range(len(flexible_properties))) pool = ThreadPool() results = pool.imap(self._get_used_properties, zip(bodies, flexible_properties)) try: for found, prop in zip(results, flexible_properties): if found: yield prop finally: pool.close() def add_articles(self, article_ids, batch_size=1000): """ Add the given article_ids to the index. This is done in batches, so there is no limit on the length of article_ids (which can be a generator). """ # WvA: remove redundancy with create_articles if not article_ids: return from amcat.models import Article, ArticleSetArticle n = len(article_ids) // batch_size for i, batch in enumerate(splitlist(article_ids, itemsperbatch=batch_size)): log.info("Adding batch {i}/{n}".format(**locals())) all_sets = multidict((aa.article_id, aa.articleset_id) for aa in ArticleSetArticle.objects.filter(article__in=batch)) dicts = (get_article_dict(article, list(all_sets.get(article.id, []))) for article in Article.objects.filter(pk__in=batch)) self.bulk_insert(dicts, batch_size=None) def remove_from_set(self, setid, article_ids, flush=True): """Remove the given articles from the given set. This is done in batches, so there is no limit on the length of article_ids (which can be a generator).""" if not article_ids: return for batch in splitlist(article_ids, itemsperbatch=1000): self.bulk_update(batch, UPDATE_SCRIPT_REMOVE_FROM_SET, params={'set': setid}) def add_to_set(self, setid, article_ids, monitor=NullMonitor()): """Add the given articles to the given set. 
This is done in batches, so there is no limit on the length of article_ids (which can be a generator).""" if not article_ids: if monitor: monitor.update() return batches = [set(batch) for batch in splitlist(article_ids, itemsperbatch=1000)] monitor = monitor.submonitor(total=len(batches)) nbatches = len(batches) for i, batch in enumerate(batches): monitor.update(message="Adding batch {iplus}/{nbatches}..".format(iplus=i + 1, nbatches=nbatches)) missing = batch - set(self.in_index(batch)) if missing: logging.warning("Adding {} missing articles to elastic".format(len(missing))) self.add_articles(missing) if batch - missing: self.bulk_update(batch - missing, UPDATE_SCRIPT_ADD_TO_SET, params={'set': setid}) def get_tokens(self, aid: int, fields=["text", "title"]): """ Get a list of all tokens (words and their positions) in the given document :param aid: Article ID :param fields: List of fields to get the terms for :return: a sequence of (field, position, term) tuples """ fieldstr = ",".join(fields) data = self.es.termvectors(self.index, self.doc_type, aid, fields=fieldstr, field_statistics=False, payloads=False, offsets=False) for field in fields: if field in data['term_vectors']: for term, info in data['term_vectors'][field]['terms'].items(): for token in info['tokens']: yield field, token['position'], term def bulk_insert(self, dicts, batch_size=1000, monitor=NullMonitor()): """ Bulk insert the given articles in batches of batch_size """ batches = list(toolkit.splitlist(dicts, itemsperbatch=batch_size)) if batch_size else [dicts] monitor = monitor.submonitor(total=len(batches)) nbatches = len(batches) for i, batch in enumerate(batches): monitor.update(1, "Adding batch {iplus}/{nbatches}".format(iplus=i + 1, **locals())) props, articles = set(), {} for d in batch: props |= (set(d.keys()) - ALL_FIELDS) articles[d["id"]] = serialize(d) self.check_properties(props) body = get_bulk_body(articles) resp = self.es.bulk(body=body, index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE) if resp["errors"]: raise ElasticSearchError(resp) def update_values(self, article_id, values): """Update properties of existing article. @param values: mapping from field name to (new) value @type values: dict""" return self.bulk_update_values({article_id: values}) def bulk_update_values(self, articles): """Updates set of articles in bulk. """ body = get_bulk_body({aid: serialize({"doc": a}) for aid, a in articles.items()}, action="update") resp = self.es.bulk(body=body, index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE) if resp["errors"]: raise ElasticSearchError(resp) def bulk_update(self, article_ids, script, params): """ Execute a bulk update script with the given params on the given article ids. """ payload = serialize({"script": dict(script, params=params)}) body = get_bulk_body({aid: payload for aid in article_ids}, action="update") resp = self.es.bulk(body=body, index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE) if resp["errors"]: raise ElasticSearchError(resp) def synchronize_articleset(self, aset, full_refresh=False): """ Make sure the given articleset is correctly stored in the index @param full_refresh: if true, re-add all articles to the index. 
Use this after changing properties of articles """ self.check_index() # make sure index exists and is at least 'yellow' log.debug("Getting SOLR ids from set") solr_set_ids = set(self.query_ids(filters=dict(sets=[aset.id]))) log.debug("Getting DB ids") db_ids = aset.get_article_ids() log.debug("Getting SOLR ids") solr_ids = set(self.in_index(db_ids)) to_remove = solr_set_ids - db_ids if full_refresh: to_add_docs = db_ids to_add_set = set() else: to_add_docs = db_ids - solr_ids to_add_set = (db_ids & solr_ids) - solr_set_ids log.warning("Refreshing index, full_refresh={full_refresh}," "|solr_set_ids|={nsolrset}, |db_set_ids|={ndb}, |solr_ids|={nsolr} " "|to_add| = {nta}, |to_add_set|={ntas}, |to_remove_set|={ntr}" .format(nsolr=len(solr_ids), nsolrset=len(solr_set_ids), ndb=len(db_ids), nta=len(to_add_docs), ntas=len(to_add_set), ntr=len(to_remove), **locals())) log.info("Removing {} articles".format(len(to_remove))) self.remove_from_set(aset.id, to_remove) log.info("Adding {} articles to set".format(len(to_add_set))) self.add_to_set(aset.id, to_add_set) log.info("Adding {} articles to index".format(len(to_add_docs))) self.add_articles(to_add_docs) log.info("Refreshing") self.refresh() def _count(self, body): """Raw version of count directly passing given query to elastic, while setting the index and doc_type""" return self.es.count(index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE, body=body) def count(self, query=None, filters=None): """ Compute the number of items matching the given query / filter """ filters = dict(build_body(query, filters, query_as_filter=True)) body = {"query": {"constant_score": filters}} return self._count(body)["count"] def search_aggregate(self, aggregation, query=None, filters=None, **options): """ Run an aggregate search query and return the aggregation results @param aggregation: raw elastic query, e.g. 
{"terms" : {"field" : "medium"}} """ body = dict(query={"filtered": dict(build_body(query, filters, query_as_filter=True))}, aggregations={"aggregation": aggregation}) result = self.search(body, size=0, **options) return result['aggregations']['aggregation'] def _parse_terms_aggregate(self, aggregate, group_by, terms, sets): if not group_by: for term in terms: yield term, aggregate[term.label]['doc_count'] else: for term in terms: yield term, self._parse_aggregate(aggregate[term.label], list(group_by), terms, sets) def _parse_other_aggregate(self, aggregate, group_by, group, terms, sets): buckets = aggregate[group]["buckets"] if not group_by: return ((b['key'], b['doc_count']) for b in buckets) return ((b['key'], self._parse_aggregate(b, list(group_by), terms, sets)) for b in buckets) def _parse_aggregate(self, aggregate, group_by, terms, sets): """Parse a aggregation result to (nested) namedtuples.""" group = group_by.pop(0) if group == "terms": result = self._parse_terms_aggregate(aggregate, group_by, terms, sets) else: result = self._parse_other_aggregate(aggregate, group_by, group, terms, sets) if group == "sets" and sets is not None: # Filter sets if 'sets' is given result = ((aset_id, res) for aset_id, res in result if aset_id in set(sets)) elif group == "date": # Parse timestamps as datetime objects result = ((get_date(stamp), aggr) for stamp, aggr in result) # Return results as namedtuples ntuple = namedtuple("Aggr", [safe_identifier(group), "buckets" if group_by else "count"]) return [ntuple(*r) for r in result] def _build_aggregate(self, group_by, date_interval, terms, sets): """Build nested aggregation query for list of groups""" group = group_by.pop(0) if group == 'date': aggregation = { group: { 'date_histogram': { 'field': group, 'interval': date_interval, "min_doc_count": 1 } } } elif group == 'terms': aggregation = { term.label: { 'filter': dict(build_body(term.query))['query'] } for term in terms } else: aggregation = { group: { 'terms': { # Default size is too small, we want to return all results 'size': 999999, 'field': group } } } # We need to nest the other aggregations, see: # http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/search-aggregations.html if group_by: nested = self._build_aggregate(group_by, date_interval, terms, sets) for aggr in aggregation.values(): aggr["aggregations"] = nested return aggregation def aggregate_query(self, query=None, filters=None, group_by=None, terms=None, sets=None, date_interval='month'): """ Compute an aggregate query, e.g. select count(*) where <filters> group by <group_by>. If date is used as a group_by variable, uses date_interval to bin it. It does support multiple values for group_by. You can group_by on terms by supplying "terms" to group_by. In addition, you will need to supply terms as a parameter, which consists of a list of SearchQuery's. Query is then used as a global filter, while terms are 'local'. @param query: an elastic query string (i.e. lucene syntax, e.g. 
'piet AND (ja* OR klaas)') @type group_by: list / tuple @type mediums: bool @param mediums: return Medium objects, instead of ids """ if isinstance(group_by, str): log.warning("Passing strings to aggregate_query(group_by) is deprecated.") group_by = [group_by] if "terms" in group_by and terms is None: raise ValueError("You should pass a list of terms if aggregating on it.") filters = dict(build_body(query, filters, query_as_filter=True)) aggregations = self._build_aggregate(list(group_by), date_interval, terms, sets) body = { "query": {"constant_score": filters}, "aggregations": aggregations } log.debug("es.search(body={body})".format(**locals())) result = self.search(body) result = self._parse_aggregate(result["aggregations"], list(group_by), terms, sets) return result def statistics(self, query=None, filters=None): """Compute and return a Result object with n, start_date and end_date for the selection""" body = { "query": { "constant_score": dict( build_body(query, filters, query_as_filter=True) ) }, 'aggregations': { 'stats': { 'stats': {'field': 'date'} } } } stats = self.search(body, size=0)['aggregations']['stats'] result = Result() result.n = stats['count'] if result.n == 0: result.start_date, result.end_date = None, None else: result.start_date = get_date(stats['min']) result.end_date = get_date(stats['max']) return result def list_dates(self, query=None, filters=None, interval="day"): from amcat.tools.aggregate_es import aggregate, IntervalCategory for date, count in aggregate(query, filters, [IntervalCategory(interval)], es=self): yield date def in_index(self, ids): """ Check whether the given ids are already indexed. @return: a sequence of ids that are in the index """ if not isinstance(ids, list): ids = list(ids) log.info("Checking existence of {nids} documents".format(nids=len(ids))) if not ids: return for batch in splitlist(ids, itemsperbatch=10000): result = self.es.mget(index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE, body={"ids": batch}, _source=[]) for doc in result['docs']: if doc['found']: yield int(doc['_id']) def duplicate_exists(self, article): """ Check whether a duplicate of the given article already exists. If so, returns the sets that the duplicate is a member of. 
Duplication is checked using the get_hash function, so article should be an object with the appropriate attributes (.title etc) @return: A (possibly empty) sequence of results with .id and .sets """ hash = get_article_dict(article).hash return self.query(filters={'hashes': hash}, _source=["sets"], score=False) def _get_purge_actions(self, query): for id in self.query_ids(body=query): yield { "_op_type": "delete", "_id": id, "_index": self.index, "_type": settings.ES_ARTICLE_DOCTYPE } def purge_orphans(self): """Remove all articles without a set from the index""" query = {"query": {"bool": {"must_not": {"exists": {"field": "sets"}}}}} return bulk(self.es, self._get_purge_actions(query)) def get_child_type_counts(self, **filters): """Get the number of child documents per type""" filters = dict(build_body(filters=filters)) filter = {"has_parent": {"parent_type": self.doc_type, "filter": filters['filter']}} aggs = {"module": {"terms": {"field": "_type"}}} body = {"aggs": {"prep": {"filter": filter, "aggs": aggs}}} r = self.es.search(index=self.index, size=0, body=body) for b in r['aggregations']['prep']['module']['buckets']: yield b['key'], b['doc_count'] def get_articles_without_child(self, child_doctype, limit=None, **filters): """Return the ids of all articles without a child of the given doctype""" nochild = {"not": {"has_child": {"type": child_doctype, "query": {"match_all": {}}}}} filter = dict(build_body(filters=filters))['filter'] body = {"filter": {"bool": {"must": [filter, nochild]}}} return self.query_ids(body=body, limit=limit)
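query_all() above pages through a result set by re-issuing the same query with an increasing from_ offset until the reported total is reached. A minimal standalone sketch of that offset-paging pattern against a raw client; the index name, query body, and page size are illustrative placeholders, not part of the class above:

from elasticsearch import Elasticsearch

es = Elasticsearch()

def iter_hits(index, body, page_size=1000):
    # Re-issue the search with a growing offset until a page comes back empty.
    # Deep offsets get expensive; scroll or search_after scale better.
    offset = 0
    while True:
        page = es.search(index=index, body=body, from_=offset, size=page_size)
        hits = page["hits"]["hits"]
        if not hits:
            return
        for hit in hits:
            yield hit
        offset += page_size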
class SearchEngine(object): def __init__(self): # serializer = JSONSerializer() serializer.mimetype = 'application/json' serializer.dumps = serializer.serialize serializer.loads = JSONDeserializer().deserialize self.es = Elasticsearch(hosts=settings.ELASTICSEARCH_HOSTS, serializer=serializer, **settings.ELASTICSEARCH_CONNECTION_OPTIONS) self.logger = logging.getLogger(__name__) def delete(self, **kwargs): """ Deletes a document from the index Pass an index, doc_type, and id to delete a specific document Pass a body with a query dsl to delete by query """ body = kwargs.pop('body', None) if body != None: try: data = [] refresh = kwargs.pop('refresh', False) for hit in helpers.scan(self.es, query=body, **kwargs): hit['_op_type'] = 'delete' data.append(hit) return helpers.bulk(self.es, data, refresh=refresh, **kwargs) except Exception as detail: self.logger.warning('%s: WARNING: failed to delete document by query: %s \nException detail: %s\n' % (datetime.now(), body, detail)) raise detail else: try: return self.es.delete(ignore=[404], **kwargs) except Exception as detail: self.logger.warning('%s: WARNING: failed to delete document: %s \nException detail: %s\n' % (datetime.now(), body, detail)) raise detail def delete_index(self, **kwargs): """ Deletes an entire index """ index = kwargs.get('index', '').strip() print 'deleting index : %s' % index return self.es.indices.delete(index=index, ignore=[400, 404]) def search(self, **kwargs): """ Search for an item in the index. Pass an index, doc_type, and id to get a specific document Pass a body with a query dsl to perform a search """ body = kwargs.get('body', None) index = kwargs.get('index', None) id = kwargs.get('id', None) if index is None: raise NotImplementedError("You must specify an 'index' in your call to search") if id: if isinstance(id, list): kwargs.setdefault('body', {'ids': kwargs.pop('id')}) return self.es.mget(**kwargs) else: return self.es.get(**kwargs) ret = None try: ret = self.es.search(**kwargs) except Exception as detail: self.logger.warning('%s: WARNING: search failed for query: %s \nException detail: %s\n' % (datetime.now(), body, detail)) pass return ret def create_mapping(self, index, doc_type, fieldname='', fieldtype='string', fieldindex=None, body=None): """ Creates an Elasticsearch body for a single field given an index name and type name """ if not body: if fieldtype == 'geo_shape': body = { doc_type : { 'properties' : { fieldname : { 'type' : 'geo_shape', 'tree' : 'geohash', 'precision': '1m' } } } } else: fn = { 'type' : fieldtype } if fieldindex: fn['index'] = fieldindex body = { doc_type : { 'properties' : { fieldname : fn } } } self.es.indices.create(index=index, ignore=400) self.es.indices.put_mapping(index=index, doc_type=doc_type, body=body) print 'creating index : %s/%s' % (index, doc_type) def create_index(self, **kwargs): self.es.indices.create(**kwargs) print 'creating index : %s' % kwargs.get('index', '') def index_data(self, index=None, doc_type=None, body=None, idfield=None, id=None, **kwargs): """ Indexes a document or list of documents into Elasticsearch If "id" is supplied then will use that as the id of the document If "idfield" is supplied then will try to find that property in the document itself and use the value found for the id of the document """ if not isinstance(body, list): body = [body] for document in body: if idfield is not None: if isinstance(document, dict): id = document[idfield] else: id = getattr(document,idfield) try: self.es.index(index=index, doc_type=doc_type, body=document, 
id=id) except Exception as detail: self.logger.warning('%s: WARNING: failed to index document: %s \nException detail: %s\n' % (datetime.now(), document, detail)) raise detail def bulk_index(self, data): return helpers.bulk(self.es, data, chunk_size=500, raise_on_error=True) def create_bulk_item(self, op_type='index', index=None, doc_type=None, id=None, data=None): return { '_op_type': op_type, '_index': index, '_type': doc_type, '_id': id, '_source': data } def count(self, **kwargs): count = self.es.count(**kwargs) if count is not None: return count['count'] else: return None
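The delete() method above emulates delete-by-query client-side: scan all matching hits, mark each as a delete action, and hand the batch to the bulk helper. The pattern in isolation, with index and query as placeholders:

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch()

def delete_by_query(index, query, refresh=False):
    # Turn every matching hit into a bulk 'delete' action and execute them.
    actions = ({"_op_type": "delete",
                "_index": hit["_index"],
                "_type": hit["_type"],
                "_id": hit["_id"]}
               for hit in helpers.scan(es, query=query, index=index))
    return helpers.bulk(es, actions, refresh=refresh)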
class CreateIndex(object): def __init__(self): self.db = cx_Oracle.connect("dmatdmp/D_Matdmp#[email protected]:1521/dmat") self.cursor = self.db.cursor() self.es = Elasticsearch("localhost:9200") # local testing def create_index(self, index, doc_type): ''' Define the index structure es_body, create the index from it, and report whether creation succeeded :param index: name of the index to create :param doc_type: document type to create :return: ''' es_body = { "settings": { "number_of_shards": 5, "number_of_replicas": 0, "analysis": { "analyzer": { "english_standard_analyzer": { "type": "standard", # standard analyzer "stopwords": "_english_", # drop English stopwords "tokenizer": "standard", # standard tokenizer (splits on non-letter and underscore characters) "filter": ["lowercase"] # lowercase; further token filters can be appended here }, "english_comma_pat_analyzer": { "type": "pattern", # pattern analyzer "pattern": ",", # split on commas (ASCII comma) "stopwords": "_english_", # drop English stopwords "lowercase": "true" # lowercase } } } }, "mappings": { doc_type: { "properties": { "tbl_id": { "type": "keyword" }, "sys_id": { "type": "keyword" }, "sys_name": { "type": "text", "analyzer": "ik_max_word", "fields": { "raw": { "type": "keyword" } } }, "owner": { "type": "keyword" }, "tbl_name": { "analyzer": "standard", # TODO: test "type": "text", "fields": { "raw": { "type": "keyword" } } }, "col_names": { "analyzer": "english_comma_pat_analyzer", "type": "text" }, "col_comments": { "analyzer": "ik_smart", "type": "text" }, "sys_name_alias": { "analyzer": "ik_max_word", "type": "text" } } } } } if not self.es.indices.exists(index=index): try: self.es.indices.create(index=index, body=es_body) print("Index created successfully...") return True except Exception as e: print(e, "Index creation failed!") return False else: return False def _gen_data(self, index, doc_type, batch_chunk_size): ''' Generator that produces the bulk data :param index: index to insert data into :param doc_type: document type of the index :param chunk_size: size of each bulk batch :return: ''' sql = """select * from tem_search_engine_1 """ # TODO: take the SQL statement as a parameter self.cursor.execute(sql) col_name_list = [col[0].lower() for col in self.cursor.description] col_name_len = len(col_name_list) actions = [] start = time.time() for row in self.cursor: source = {} tbl_id = "" for i in range(col_name_len): source.update({col_name_list[i]: str(row[i])}) if col_name_list[i] == "tbl_id": tbl_id = row[i] action = { "_index": index, "_type": doc_type, "_id": tbl_id, # TODO: guard against an empty id "_source": source } actions.append(action) if len(actions) == batch_chunk_size: print("Time spent appending actions:", time.time()-start) yield actions actions = [] print("Total loop time:", time.time()-start) yield actions def _gen_parallel_data(self, index, doc_type): sql = """select * from tem_search_engine_1""" # TODO: take the SQL statement as a parameter self.cursor.execute(sql) col_name_list = [col[0].lower() for col in self.cursor.description] col_name_len = len(col_name_list) for row in self.cursor: source = {} tbl_id = "" for i in range(col_name_len): source.update({col_name_list[i]: str(row[i])}) if col_name_list[i] == "tbl_id": tbl_id = row[i] action = { "_index": index, "_type": doc_type, "_id": tbl_id, # TODO: guard against an empty id "_source": source } yield action def bulk_data(self, index, doc_type, is_parallel=True, batch_chunk_size=5000, threads_counts=8): ''' Bulk-insert the data :param index: index to insert data into :param doc_type: document type of the index :param chunk_size: bulk batch size, only used for non-parallel inserts :param is_parallel: whether to insert in parallel, defaults to parallel :param threads_counts: number of threads, defaults to 8; only effective when inserting in parallel :return: ''' if is_parallel is None or is_parallel == True: gen_action = self._gen_parallel_data(index, doc_type) print("Inserting data in parallel...") start = time.time() for success, info in helpers.parallel_bulk(client=self.es, actions=gen_action, thread_count=threads_counts, chunk_size=1000): if not success: print("Insert failed: ", info)
print("插入数据成功... ", time.time()-start) elif is_parallel == False: gen_action = self._gen_data(index, doc_type, batch_chunk_size) try: print("正在插入数据...") t3 = time.time() helpers.bulk(client=self.es, actions=gen_action, chunk_size=500) print("插入成功....", time.time() - t3) except Exception as e: print(e, "插入失败!") else: raise ValueError("is_parallel应该为True或False") def exists_doc(self, index, doc_type, doc_id, source=False): ''' 确定索引中的一个文档是否存在 :param index: :param doc_type: :param doc_id: :param source: :return: ''' return self.es.exists(index=index, doc_type=doc_type, id=doc_id, _source=source) def get_doc(self, index, doc_type, id): ''' :param index: :param doc_type: :param id: :return: ''' return self.es.get(index=index, doc_type=doc_type, id=id) def get_docs(self, index, doc_type, body, source=False): ''' ============================EXAMPLE=================================== createindex = CreateIndex() body = { "docs": [ {"_id": "7970C657B49BA14AE050A8C0EBA07C72"}, {"_id": "7970C657B49EA14AE050A8C0EBA07C72"} ] } print(createindex.get_docs("example_index", "examplecase", body)) body = { "ids": [ "7970C657B49BA14AE050A8C0EBA07C72" "7970C657B49EA14AE050A8C0EBA07C72"] } print(createindex.get_docs("example_index", "examplecase", body)) ====================================================================== :param index: :param doc_type: :param body: 根据"docs"或"ids"获取多条文档信息 :param source: 是否返回展示原始数据,默认为False :return: ''' return self.es.mget(index=index, doc_type=doc_type, body=body, _source=source) def update_doc(self, index, doc_type, id, body): ''' :param index: :param doc_type: :param id: :param body: :return: ''' self.es.update(index=index, doc_type=doc_type, id=id, body=body) def delete_index(self, index): ''' :param index: :return: ''' return self.es.indices.delete(index=index) def delete_docs(self, index, doc_type, doc_id): ''' :param index: :param doc_type: :param doc_id: :return: ''' return self.es.delete(index=index, doc_type=doc_type, id=doc_id) def delete_by_query(self, index, doc_type, body, source): ''' 使用删除语句对文档进行删除 :param index: :param doc_type: :param body: :return: ''' return self.es.delete_by_query(index=index, doc_type=doc_type, body=body, _source=source) def get_info(self, **kwargs): return self.es.info(**kwargs)
class ThreadingTests(TestCase): def setUp(self): self.es = Elasticsearch(ES_NODES) print GAME_QUEUE, Tasks.redis.llen(GAME_QUEUE) print USER_QUEUE, Tasks.redis.llen(USER_QUEUE) print GAME_SET, Tasks.redis.scard(GAME_SET) print USER_SET, Tasks.redis.scard(USER_SET) print TO_CRUNCHER, Tasks.redis.llen(TO_CRUNCHER) Tasks.new_games = 0 print "Deleting the above-listed Redis keys." for key in GAME_QUEUE, USER_QUEUE, GAME_SET, USER_SET, TO_CRUNCHER: Tasks.redis.delete(key) self.es.delete_by_query(index=TEST_ES_INDEX, doc_type=GAME_DOCTYPE, body={"query": {"match_all": {}}}) print "Be patient (10s) - making sure API is available" sleep(10) print "Ready!" def test_games_make_it_to_elasticsearch_in_reasonable_time(self): Tasks.add(TEST_GAMES, []) wt = WatcherThread(TEST_KEY, cycles=1) wt.start() REASONABLE_TIME = 20 # seconds with timeout(REASONABLE_TIME): while True: try: # TODO - assert that the all items made it to ES docs = self.es.mget(index=TEST_ES_INDEX, doc_type=GAME_DOCTYPE, body={'ids': TEST_GAMES})['docs'] assert all([d['found'] for d in docs]) break except: pass sleep(0.1) wt.join() # 1. check that the game queue is now empty ONE_SHITLOAD = 10000 self.assertGreater(ONE_SHITLOAD, Tasks.redis.llen(GAME_QUEUE)) newly_queued_games = Tasks.redis._bulk_rpop(GAME_QUEUE, ONE_SHITLOAD) self.assertEquals(len(set(newly_queued_games)), 0) # 2. check that processed games made it to the GAME_SET self.assertEquals(Tasks.redis.scard(GAME_SET), len(set(TEST_GAMES))) items, is_old = zip(*Tasks.redis._intersect(GAME_SET, TEST_GAMES, insert=False)) self.assertTrue(all(is_old)) def test_games_and_users_properly_queued(self): # Init with 10 games and 5 users Tasks.add(TEST_GAMES, TEST_USERS) wt = WatcherThread(TEST_KEY, cycles=1) wt.run() # 1. check that none of the test games are now currently queued ONE_SHITLOAD = 10000 newly_queued_games = Tasks.redis._bulk_rpop(GAME_QUEUE, ONE_SHITLOAD) self.assertEquals(len(set(newly_queued_games) & set(TEST_GAMES)), 0) # 2. check that seeded TEST_GAMEs are still in GAME_SET after the second iteration items, is_old = zip(*Tasks.redis._intersect(GAME_SET, TEST_GAMES, insert=False)) self.assertTrue(all(is_old)) # 3. check that some new users got added self.assertNotEqual(Tasks.redis.scard(USER_SET), 0) # 4. check that some new games got added self.assertNotEqual(Tasks.redis.scard(GAME_SET), 0) # 5. check that game counts are accurate self.assertEquals(Tasks.new_games, len(TEST_GAMES) + len(newly_queued_games)) def test_multi_thread(self): Tasks.add(TEST_MANY_GAMES, TEST_USERS) wt1 = WatcherThread(TEST_KEY, cycles=1) wt2 = WatcherThread(TEST_KEY2, cycles=1) wt1.start() wt2.start() wt1.join() wt2.join() # 1. check that the game counts are accurate self.assertEquals(Tasks.new_games, len(TEST_MANY_GAMES) + Tasks.redis.llen(GAME_QUEUE))
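test_games_make_it_to_elasticsearch_in_reasonable_time above polls mget until every requested id reports found. The same wait-until-indexed loop, reduced to its essentials; index, doc_type, and the deadline are placeholders:

import time
from elasticsearch import Elasticsearch

es = Elasticsearch()

def wait_for_docs(index, doc_type, ids, deadline=20.0, poll=0.1):
    # Poll mget until every id is found or the deadline expires.
    stop = time.time() + deadline
    while time.time() < stop:
        docs = es.mget(index=index, doc_type=doc_type, body={"ids": ids})["docs"]
        if all(d.get("found") for d in docs):
            return True
        time.sleep(poll)
    return False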
def export_attachments(data_set_id, outfile, sender='', attachment_extension='jpg', date_bounds=None): print( "email.get_attachments_sender(index=%s, sender=%s, attachment_type=%s, date_bounds=%s)" % (data_set_id, sender, attachment_extension, date_bounds)) if not data_set_id: print "invalid service call - missing index" return 1 # elasticsearch.exceptions.ConnectionTimeout: ConnectionTimeout caused by - ReadTimeoutError(HTTPConnectionPool(host='10.1.70.143', port=9200): Read timed out. (read timeout=10)) es = Elasticsearch([{"host": "10.1.70.143", "port": 9200}], timeout=60) # TODO get accurate count -- this is not strictly needed as attachments will be accessed as inner docs on the email_address max_inner_attachments_returned = 100000 # Get all attachments by extension rows = [] body = _attch_nested__ext_query( sender, attachment_extension, date_bounds, max_inner_attachments_returned=max_inner_attachments_returned) print body addresses_count = es.count(index=data_set_id, doc_type="email_address", body=body)["count"] print "total addresses: " + str(addresses_count) addresses = es.search(index=data_set_id, doc_type="email_address", body=body, size=addresses_count) for address in addresses["hits"]["hits"]: rows += [[ address["_source"]["addr"], attachment["_source"]["guid"], attachment["_source"]["filename"], attachment["_source"]["datetime"] ] for attachment in address["inner_hits"]["sender_attachments"]["hits"] ["hits"]] print "total attachments: " + str(len(rows)) # start tar.gz # tar = tarfile.open(mode='w:gz', name="big-export.tar.gz") # Start tar tar = tarfile.open(mode='w', name=outfile) csv_string_buffer = cStringIO.StringIO() csv_file = csv.writer(csv_string_buffer) # Add all rows to attachment csv csv_file.writerows(rows) tarinfo = tarfile.TarInfo("attachments.csv") tarinfo.size = csv_string_buffer.tell() tarinfo.mode = 0644 tarinfo.mtime = time.time() csv_string_buffer.seek(0) tar.addfile(tarinfo, csv_string_buffer) # This is the buffer size of how many attachments to pull from ES at each iteration num_returned = 3 index = 0 # Paging while index < len(rows): # Get num_returned attachments from ES attachments = es.mget(index=data_set_id, doc_type="attachments", body={ "docs": [{ "_id": row[1] } for row in rows[index:index + num_returned]] }) index += num_returned # Add all attachments to the archive for attachment_source in attachments["docs"]: attachment = attachment_source["_source"] filename = attachment["filename"] attch_data = str(base64.b64decode(attachment["contents64"])) tarinfo_attch = tarfile.TarInfo(attachment["guid"] + "/" + filename) tarinfo_attch.size = len(attch_data) tarinfo_attch.mode = 0644 tarinfo_attch.mtime = time.time() tar.addfile(tarinfo_attch, cStringIO.StringIO(attch_data)) tar.close()
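export_attachments() above writes the CSV into the tar by taking the member size from the buffer's write position and rewinding before addfile. That trick in isolation, as a Python 3 sketch; the archive name and row contents are placeholders:

import csv
import io
import tarfile
import time

def add_csv_to_tar(tar, name, rows):
    # Build the CSV in memory, size the tar member from the buffer's
    # final position, then rewind so addfile can read it from the start.
    buf = io.BytesIO()
    text = io.TextIOWrapper(buf, encoding="utf-8", newline="")
    csv.writer(text).writerows(rows)
    text.flush()
    info = tarfile.TarInfo(name)
    info.size = buf.tell()
    info.mtime = time.time()
    buf.seek(0)
    tar.addfile(info, buf)

with tarfile.open("export.tar", mode="w") as tar:
    add_csv_to_tar(tar, "attachments.csv", [["addr", "guid", "filename", "datetime"]])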
if __name__ == "__main__": ids_path = sys.argv[1] # 存放文档id的路径 documents_path = sys.argv[2] # 获取到的文档存放的路径 es_address = sys.argv[3] # es地址 index_name = sys.argv[4] # 索引名称 doc_type = sys.argv[5] # 文档类型 batch_num = 1000 es = Elasticsearch(hosts=[es_address], timeout=5000) batch = [] body = { 'ids': batch } with open(ids_path, mode='r') as source, open(documents_path, mode='w') as dest: for doc_id in source: batch.append(doc_id.strip()) if len(batch) > batch_num: body['ids'] = batch docs = es.mget(index=index_name, doc_type=doc_type, body=body) for doc in translator(docs): dest.write(json.dumps(doc) + os.linesep) del batch[0:] if len(batch) > 0: body['ids'] = batch docs = es.mget(index=index_name, doc_type=doc_type, body=body) for doc in translator(docs): dest.write(json.dumps(doc) + os.linesep)
for uid in uid_list: if not result_data.has_key(uid): result_data[uid] = TOPIC_DICT uid_topic[uid] = ['life'] return result_data, uid_topic if __name__ == '__main__': from elasticsearch import Elasticsearch import json ES_CLUSTER_HOST = ['219.224.134.213:9205', '219.224.134.214:9205',\ '219.224.134.215:9205'] es = Elasticsearch(ES_CLUSTER_HOST, timeout=600) index_name = 'fb_user_portrait' index_type = 'user' ids = ['544481513', '100010212181419'] uid_list = [] uid_weibo = {} res = es.mget(index=index_name, doc_type=index_type, body={'ids': ids})['docs'] for r in res: uid = r['_id'] keywords = json.loads(r['_source']['filter_keywords']) uid_list.append(uid) uid_weibo[uid] = keywords result_data, uid_topic = topic_classfiy(uid_list, uid_weibo) print result_data print uid_topic
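One hazard in the fallback above: result_data[uid] = TOPIC_DICT binds every uid to the same dict object, so a later mutation through one uid shows up for all of them. A safer variant copies the template per uid:

# Give each uid its own copy instead of aliasing one shared dict.
for uid in uid_list:
    if uid not in result_data:
        result_data[uid] = dict(TOPIC_DICT)
        uid_topic[uid] = ['life']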
class SearchEngine(object): def __init__(self): # serializer = JSONSerializer() serializer.mimetype = "application/json" serializer.dumps = serializer.serialize serializer.loads = JSONDeserializer().deserialize self.es = Elasticsearch( hosts=settings.ELASTICSEARCH_HOSTS, serializer=serializer, **settings.ELASTICSEARCH_CONNECTION_OPTIONS ) self.logger = logging.getLogger(__name__) def delete(self, **kwargs): """ Deletes a document from the index Pass an index, doc_type, and id to delete a specific document Pass a body with a query dsl to delete by query """ body = kwargs.pop("body", None) if body != None: try: data = [] refresh = kwargs.pop("refresh", False) for hit in helpers.scan(self.es, query=body, **kwargs): hit["_op_type"] = "delete" data.append(hit) return helpers.bulk(self.es, data, refresh=refresh, **kwargs) except Exception as detail: self.logger.warning( "%s: WARNING: failed to delete document by query: %s \nException detail: %s\n" % (datetime.now(), body, detail) ) raise detail else: try: return self.es.delete(ignore=[404], **kwargs) except Exception as detail: self.logger.warning( "%s: WARNING: failed to delete document: %s \nException detail: %s\n" % (datetime.now(), body, detail) ) raise detail def delete_index(self, **kwargs): """ Deletes an entire index """ index = kwargs.get("index", "").strip() print "deleting index : %s" % index return self.es.indices.delete(index=index, ignore=[400, 404]) def search(self, **kwargs): """ Search for an item in the index. Pass an index, doc_type, and id to get a specific document Pass a body with a query dsl to perform a search """ body = kwargs.get("body", None) index = kwargs.get("index", None) id = kwargs.get("id", None) if index is None: raise NotImplementedError("You must specify an 'index' in your call to search") if id: if isinstance(id, list): kwargs.setdefault("body", {"ids": kwargs.pop("id")}) return self.es.mget(**kwargs) else: return self.es.get(**kwargs) ret = None try: ret = self.es.search(**kwargs) except Exception as detail: self.logger.warning( "%s: WARNING: search failed for query: %s \nException detail: %s\n" % (datetime.now(), body, detail) ) pass return ret def index_term(self, term, id, context="", options={}): """ If the term is already indexed, then simply increment the count and add the id of the term to the existing index. If the term isn't indexed then add the index. id: a unique id associated with the term context: a uuid of a concept to associate with the term to render in the ui options: any additional information to associate with the term """ if term.strip(" \t\n\r") != "": already_indexed = False count = 1 ids = [id] try: # _id = unicode(term, errors='ignore').decode('utf-8').encode('ascii') _id = uuid.uuid3(uuid.NAMESPACE_DNS, "%s%s" % (hash(term), hash(context))) result = self.es.get(index="term", doc_type="value", id=_id, ignore=404) # print 'result: %s' % result if result["found"] == True: ids = result["_source"]["ids"] if id not in ids: ids.append(id) else: ids = [id] self.index_data( "term", "value", {"term": term, "context": context, "options": options, "count": len(ids), "ids": ids}, id=_id, ) except Exception as detail: self.logger.warning( "%s: WARNING: search failed to index term: %s \nException detail: %s\n" % (datetime.now(), term, detail) ) raise detail def delete_terms(self, ids): """ If the term is referenced more then once simply decrement the count and remove the id of the deleted term from the from the existing index. 
If the term is only referenced once then delete the index """ if not isinstance(ids, list): ids = [ids] for id in ids: result = self.es.search( index="term", doc_type="value", body={ "query": {"filtered": {"filter": {"terms": {"ids": [id]}}, "query": {"match_all": {}}}}, "from": 0, "size": 10, }, ignore=404, ) if "hits" in result: for document in result["hits"]["hits"]: document["_source"]["ids"].remove(id) count = len(document["_source"]["ids"]) if count > 0: document["_source"]["count"] = count self.index_data("term", "value", document["_source"], id=document["_id"]) self.es.indices.refresh(index="term") else: self.delete(index="term", doc_type="value", id=document["_id"]) def create_mapping(self, index, doc_type, fieldname="", fieldtype="string", fieldindex=None, body=None): """ Creates an Elasticsearch body for a single field given an index name and type name """ if not body: if fieldtype == "geo_shape": body = { doc_type: {"properties": {fieldname: {"type": "geo_shape", "tree": "geohash", "precision": "1m"}}} } else: fn = {"type": fieldtype} if fieldindex: fn["index"] = fieldindex body = {doc_type: {"properties": {fieldname: fn}}} self.create_index(index=index, ignore=400) self.es.indices.put_mapping(index=index, doc_type=doc_type, body=body) def create_index(self, **kwargs): self.es.indices.create(**kwargs) def index_data(self, index=None, doc_type=None, body=None, idfield=None, id=None, **kwargs): """ Indexes a document or list of documents into Elasticsearch If "id" is supplied then will use that as the id of the document If "idfield" is supplied then will try to find that property in the document itself and use the value found for the id of the document """ if not isinstance(body, list): body = [body] for document in body: if idfield is not None: if isinstance(document, dict): id = document[idfield] else: id = getattr(document, idfield) try: self.es.index(index=index, doc_type=doc_type, body=document, id=id, **kwargs) except Exception as detail: self.logger.warning( "%s: WARNING: failed to index document: %s \nException detail: %s\n" % (datetime.now(), document, detail) ) raise detail def bulk_index(self, data): return helpers.bulk(self.es, data, chunk_size=500, raise_on_error=True) def create_bulk_item(self, index, type, id, data): if not (self.isempty_or_none(index) or self.isempty_or_none(type) or self.isempty_or_none(id)): return [{"index": {"_index": index, "_type": type, "_id": id}}, data] else: return false
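index_term() above keeps term documents deduplicated by deriving a stable id from the (term, context) pair with uuid.uuid3, so re-indexing the same pair overwrites one document instead of accumulating duplicates. The id scheme in isolation, with a caveat:

import uuid

def term_doc_id(term, context=""):
    # Same (term, context) pair -> same id within one process. Caveat:
    # Python 3 randomizes str hash() per process, so for ids that are
    # reproducible across runs, hash the strings with hashlib instead.
    return uuid.uuid3(uuid.NAMESPACE_DNS, "%s%s" % (hash(term), hash(context)))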
class NameStore(object): def __init__(self): """what should this do as we will only populate from es?""" pass def configure_index(self, configuration): self.es_config = configuration self.es_index = configuration['_index'] self.es = Elasticsearch([{ "host": self.es_config['host'], "port": self.es_config['port'] }]) if not self.es.indices.exists(index=self.es_index): self.es.indices.create(index=self.es_index) self.needs_refresh = False def index_needs_refresh(self): return self.needs_refresh def index_refresh(self): self.es.indices.refresh(index=self.es_index) self.needs_refresh = False def set_index_needs_refresh(self): self.needs_refresh = True def check_index_is_fresh(self): # check index is up to date, refresh if needed if self.index_needs_refresh(): self.index_refresh() def get_name_es(self, name_id, params): if "action" not in params: params["action"] = "see" if "username" not in params: params["username"] = None # get Name from index Name = self.get_from_index(name_id, action=params["action"], name_type="naam") return Name.to_clean_json(params) def get_names_es(self, params): # check index is up to date, refresh if needed self.check_index_is_fresh() response = self.get_from_index_by_filters(params, name_type="naam") Names = [Name(hit) for hit in response["hits"]["hits"]] return { "total": response["hits"]["total"], "names": [Name.base() for Name in Names] } def get_names_by_id_es(self, name_ids, params): # check index is up to date, refresh if needed self.check_index_is_fresh() response = self.es.mget(index=self.es_index, doc_type="Name", body={"ids": name_ids}) return [hit["_source"] for hit in response["hits"]["hits"]] # def get_collection_es(self, collection_id, params): # if "action" not in params: # params["action"] = "see" # if "username" not in params: # params["username"] = None # return collection.to_clean_json(params) # def get_collections_es(self, params): # # check index is up to date, refresh if needed # self.check_index_is_fresh() # response = self.get_from_index_by_filters(params, name_type="NameCollection") # collections = NameCollection(response["hits"]["hits"]) # return { # "total": response["hits"]["total"], # "collections": collection.to_clean_json(params) # } #################### # Helper functions # #################### """old helpers were superfluous, but we may add some similarity stuff here later """ ################### # ES interactions # ################### """all adding stuff is delegated to helpers as we do not foresee writing interactions with the index as yet. 
Keep as placeholders though""" # def add_to_index(self, Name, name_type): # self.should_have_target_list(Name) # self.should_have_permissions(Name) # self.should_not_exist(Name['id'], name_type) # return self.es.index(index=self.es_index, doc_type=name_type, id=Name['id'], body=Name) # # def add_bulk_to_index(self, Names, name_type): # raise ValueError("Function not yet implemented") # def get_from_index(self, id, action, name_type="naam"): # """for now we only have naam, but probably extend with institution # and geonames later # """ # # check index is up to date, refresh if needed # self.check_index_is_fresh() # # check that Name exists (and is not deleted) # self.should_exist(id, name_type) # return Name def get_from_index_by_id(self, name_id, name_type="naam"): self.should_exist(name_id, name_type) return self.es.get(index=self.es_index, doc_type=name_type, id=name_id)['_source'] def get_from_index_by_filters(self, params, name_type="naam"): filter_queries = query_helper.make_param_filter_queries(params) # filter_queries += [query_helper.make_permission_see_query(params)] query = { "from": params["page"] * self.es_config["page_size"], "size": self.es_config["page_size"], "query": query_helper.bool_must(filter_queries) } return self.es.search(index=self.es_index, doc_type=name_type, body=query) # def remove_from_index(self, name_id, name_type): # self.should_exist(name_id, name_type) # return self.es.delete(index=self.es_index, doc_type=name_type, id=name_id) # def remove_from_index_if_allowed(self, name_id, params, name_type="_all"): # if "username" not in params: # params["username"] = None # # check index is up to date, refresh if needed # self.check_index_is_fresh() # # check that Name exists (and is not deleted) # self.should_exist(name_id, name_type) # # get original Name json # name_json = self.get_from_index_by_id(name_id, name_type) # # check if user has appropriate permissions # if not permissions.is_allowed_action(params["username"], "edit", Name(name_json)): # raise PermissionError(message="Unauthorized access - no permission to {a} Name".format(a=params["action"])) # return self.remove_from_index(name_id, "Name") # # def is_deleted(self, name_id, name_type="_all"): # if self.es.exists(index=self.es_index, doc_type=name_type, id=name_id): # res = self.es.get(index=self.es_index, doc_type=name_type, id=name_id) # if "status" in res["_source"] and res["_source"]["status"] == "deleted": # return True # return False def should_exist(self, name_id, name_type="_all"): if self.es.exists(index=self.es_index, doc_type=name_type, id=name_id): if not self.is_deleted(name_id, name_type): return True raise NaamError(message="Name with id %s does not exist" % (name_id), status_code=404) def should_not_exist(self, name_id, name_type="_all"): if self.es.exists(index=self.es_index, doc_type=name_type, id=name_id): raise NaamError(message="Name with id %s already exists" % (name_id)) else: return True def get_objects_from_hits(self, hits, doc_type="naam"): objects = [] for hit in hits: if hit["_source"]["type"] == doc_type: objects += [Name(hit)] # elif hit["_source"]["type"] == "NameCollection": # objects += [NameCollection(hit["_source"])] def list_name_ids(self): return list(self.name_index.keys()) def list_Names(self, ids=None): if not ids: ids = self.list_name_ids() return [Name for id, Name in self.name_index.items() if id in ids] def list_Names_as_json(self, ids=None): if not ids: ids = self.list_name_ids() return [ Name.to_json() for id, Name in self.name_index.items() if id in ids ]
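One caveat for get_names_by_id_es() above: es.mget returns its results under a top-level "docs" list (each entry carrying a "found" flag), not under response["hits"]["hits"] as search does, so that method would raise a KeyError as written. A corrected lookup might read:

def get_names_by_id(es, es_index, name_ids):
    # mget responses carry "docs", not "hits"; skip ids that were not found.
    response = es.mget(index=es_index, doc_type="Name", body={"ids": name_ids})
    return [doc["_source"] for doc in response["docs"] if doc.get("found")]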
mid_set.add(item['_source']["mid"]) else: try: mid_set.add(item['_source']["root_mid"]) except Exception, r: print Exception, r print len(mid_set) # Fetch the original and retweeted weibo posts and store them in es; they should fall within the last two days index_list = [] index_list.append(index_name) index_list.append("flow_text_"+ts2datetime(ts-3600*24)) mid_list = list(mid_set) bulk_action = [] non_exist_list = [] # weibos not yet monitored exist_results = es_user_portrait.mget(index=monitor_index_name, doc_type=monitor_index_type, body={"ids":mid_list})["docs"] for item in exist_results: if not item["found"]: non_exist_list.append(item["_id"]) # bring the not-yet-monitored weibos into monitoring scope if non_exist_list: count = 0 classify_text_dict = dict() # texts to classify classify_uid_list = [] #f = open("text.txt", "a") length = len(non_exist_list) division = length/1000 weibo_results = [] for i in range(0,division+1): tmp_mid_list = non_exist_list[i*1000:(i+1)*1000]
class ElasticsearchBackend(object): def __init__(self, settings=None): if settings is None: settings = {} self.es = Elasticsearch(**settings) def set_variable(self, key, value): res = self.es.index("insee", doc_type="variable", id=key.lower(), body=value) return res['created'] def get_variable(self, key): res = self.es.get(index="insee", doc_type='variable', id=key.lower()) return res['_source'] def get_variables(self, keys): res = self.es.mget(index="insee", doc_type="variable", body={ "ids": [k.lower() for k in keys] }) results = [d['_source'] for d in res['docs'] if '_source' in d] return results def search_variables(self, query): res = self.es.search( index="insee", doc_type="variable", body={"query": {"match": {"_all": query.lower()}}} ) return [hit['_source'] for hit in res['hits']['hits']] def set_commune(self, key, value): res = self.es.index("insee", doc_type="commune", id=key.lower(), body=value) return res['created'] def get_commune(self, key): res = self.es.get(index="insee", doc_type='commune', id=key.lower()) return res['_source'] def get_communes(self, keys): res = self.es.mget(index="insee", doc_type="commune", body={ "ids": [k.lower() for k in keys] }) results = [d['_source'] for d in res['docs'] if '_source' in d] return results def search_communes(self, query): res = self.es.search( index="insee", doc_type="commune", body={"query": {"match": {"_all": query.lower()}}} ) return [hit['_source'] for hit in res['hits']['hits']] def set_data(self, var_lib, codgeo, value): res = self.es.index("insee", doc_type="data", id="%s_%s" % (var_lib.lower(), codgeo), body=value) return res['created'] def get_data(self, var_lib, codgeo): res = self.es.get(index="insee", doc_type='data', id="%s_%s" % (var_lib.lower(), codgeo)) return res['_source']
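A short usage sketch for the backend above. Keys are lowercased both on write and on read, and get_variables() silently drops ids missing from the index because only entries with a _source survive the comprehension; the host settings and payload here are placeholders:

backend = ElasticsearchBackend({"hosts": ["localhost:9200"]})
backend.set_variable("POP_TOTALE", {"label": "total population"})
variables = backend.get_variables(["POP_TOTALE", "NO_SUCH_KEY"])
# -> [{'label': 'total population'}]; the unknown key is dropped, not None-padded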
class Elasticsearch(Datastore): engine = None index = 'aleph-samples' tracking_index = 'aleph-tracking' doc_type = 'sample' cache = None def __init__(self): self.engine = ES() self.cache = SimpleCache() def all(self, page=1, size=DEFAULT_PAGE_SIZE): body = { 'query': { 'match_all': {}, }, "sort": { "timestamp": { 'order': 'desc' }, } } start = ((page - 1) * size) res = self.raw_search(body, start=start, size=size) total = res['hits']['total'] entries = res['hits']['hits'] return (total, self.entries_to_samples(entries)) def entries_to_samples(self, entries): rv = [] if not entries: return rv entry_table = {} for entry in entries: sample_id = entry['_id'] entry_table[sample_id] = {'metadata': entry, 'tracking_data': {}} # Get tracking data for retrieved ids tracking_data = self._mget(list(entry_table.keys()), index=self.tracking_index) for td in tracking_data: sample_id = td['_id'] entry_table[sample_id]['tracking_data'] = td # Add return values for sample_id, sample_data in entry_table.items(): if '_source' in sample_data['metadata'] and '_source' in sample_data['tracking_data']: rv.append(Sample(sample_data['metadata'], sample_data['tracking_data'])) return rv def _mget(self, ids, index=None): if not index: index = self.index if not isinstance(ids, list): raise ValueError("ids is not a list") body = { 'ids': ids } result = self.engine.mget(index=index, doc_type=self.doc_type, body=body, ignore=404) if 'docs' not in result: return None return result['docs'] def _get(self, sample_id, index=None): if not index: index = self.index result = self.engine.get(index=index, doc_type=self.doc_type, id=sample_id, ignore=404) if result['found'] == False: return None return OrderedDict(sorted(result.items())) def mget(self, sample_ids): if not sample_ids: return None metadata = {s['_id']:s for s in self._mget(sample_ids)} tracking_data = {s['_id']:s for s in self._mget(sample_ids, index=self.tracking_index)} if not metadata or not tracking_data: return None entries = [] for sample_id, v in metadata.items(): if '_source' in metadata[sample_id].keys() and '_source' in tracking_data[sample_id].keys(): entries.append(Sample(metadata[sample_id], tracking_data[sample_id])) return entries def get(self, sample_id): metadata = self._get(sample_id) tracking_data = self._get(sample_id, index=self.tracking_index) if not metadata or not tracking_data: return None return Sample(metadata, tracking_data) def get_parents(self, sample_id): rv = self.cache.get('get-parents-%s' % sample_id) if not rv: tracking_data = self._get(sample_id, index=self.tracking_index) if not tracking_data: return [] rv = tracking_data['_source']['parents'] return rv def get_children(self, sample_id): rv = self.cache.get('get-children-%s' % sample_id) if not rv: search_body = { "query": { "bool": { "must": [ { "term": { "parents": sample_id } } ] } } } result = self.raw_search(search_body, index=self.tracking_index) rv = result['hits']['hits'] return rv def raw_search(self, body, q=None, start=0, size=DEFAULT_PAGE_SIZE, index=None): if not index: index = self.index result = [] try: hits = self.engine.search(index=index, doc_type=self.doc_type, q=q, from_=start, size=size, body=body) except NotFoundError: pass except Exception: raise return hits def search(self, query, page=1, size=DEFAULT_PAGE_SIZE): start = ((page - 1) * size) result = [] body = { "sort": { "timestamp": { 'order': 'desc' }, } } hits = self.raw_search(body, start=start, size=size, q=query) total = hits['hits']['total'] entries = hits['hits']['hits'] return (total, 
self.entries_to_samples(entries)) def count(self, body): return self.engine.count(index=self.tracking_index, doc_type=self.doc_type, body=body)['count'] # Aux Methods # Counters def count_all(self): body = { "query": { "match_all" : {} } } return self.count(body) def count_processing_samples(self): body = { "query": { "bool" : { "filter" : [ {"script" : {"script" : {"source": "!doc['processors_completed'].containsAll(doc['processors_dispatched'])", "lang": "painless"}}}, ] } } } return self.count(body) def count_analyzing_samples(self): body = { "query": { "bool" : { "filter" : [ {"script" : {"script" : {"source": "!doc['analyzers_completed'].containsAll(doc['analyzers_dispatched'])", "lang": "painless"}}}, ] } } } return self.count(body) # Graph Data def sample_histogram(self, size=24, interval="1h"): histogram = {} hist_body = { "aggs" : { "samples_over_time" : { "date_histogram" : { "field" : "timestamp", "interval" : interval, "min_doc_count": 0 } } } } hist_result = self.raw_search(hist_body)['aggregations'] for h in hist_result['samples_over_time']['buckets']: histogram[h['key_as_string']] = h['doc_count'] return histogram def sample_diversity(self): diversity = {} div_body = { "aggs" : { "genres" : { "terms" : { "field" : "filetype" } } } } div_result = self.raw_search(div_body)['aggregations'] for d in div_result['genres']['buckets']: diversity[d['key']] = d['doc_count'] return diversity
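A caveat in raw_search() above: result is initialized but hits is returned, so when the index does not exist yet the swallowed NotFoundError leaves hits unbound and the method fails with a NameError. A defensive variant returns an empty, well-formed result instead; the shape is chosen to match what the callers index into:

from elasticsearch.exceptions import NotFoundError

EMPTY_RESULT = {"hits": {"total": 0, "hits": []}}

def raw_search(engine, index, doc_type, body, q=None, start=0, size=25):
    # Swallow a missing index by returning an empty result of the
    # expected shape instead of leaving the return value unbound.
    try:
        return engine.search(index=index, doc_type=doc_type, q=q,
                             from_=start, size=size, body=body)
    except NotFoundError:
        return EMPTY_RESULT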
class VWCollection(VWCallback): def __init__(self,items=[],**kwargs): self.bulk_chunk_size = kwargs.get('bulk_chunk_size', config.bulk_chunk_size) self._sort = [] self.results_per_page = kwargs.get('results_per_page', config.results_per_page) self._querybody = querybuilder.QueryBody() # sets up the new query bodies if kwargs.get('base_obj'): self.base_obj = kwargs.get('base_obj') else: try: self.base_obj = self.__class__.__model__ except AttributeError: raise AttributeError('Base object must contain a model or pass base_obj') self._es = Elasticsearch(config.dsn) self._esc = client.IndicesClient(self._es) if '__index__' in dir(self.base_obj): idx = self.base_obj.__index__ else: idx = config.default_index self._search_params = [] self._raw = {} self.idx = idx self.type = self.base_obj.__type__ self._special_body = {} # special list of items that can be committed in bulk self._items = items def search(self,q): self._search_params.append(q) return self # setup a raw request def raw(self, raw_request): self._raw = raw_request return self def filter_by(self, condition = 'and',**kwargs): if kwargs.get('condition'): condition=kwargs.get('condition') del kwargs['condition'] condition = self._translate_bool_condition(condition) for k,v in kwargs.iteritems(): if k == 'id' or k == 'ids': id_filter = v if not isinstance(id_filter, list): id_filter = [id_filter] self._querybody.chain(qdsl.ids(id_filter), condition=condition) else: try: analyzed = is_analyzed(getattr(self.base_obj, k)) except AttributeError: analyzed = is_analyzed(v) q_type = 'filter' if analyzed: q_type = 'query' if isinstance(v, list): # lists are treat as like "OR" (using terms() on not_analyzed, bool/matched on analyzed) if analyzed: match_queries = [] for item in v: match_queries.append( qdsl.match(k,item) ) self._querybody.chain( qdsl.bool(qdsl.should(match_queries)), condition=condition,type=q_type ) else: self._querybody.chain( qdsl.terms(k,v),condition=condition, type=q_type) else: #search_value = unicode(v) if analyzed: self._querybody.chain(qdsl.match(unicode(k), v), condition=condition,type=q_type) else: self._querybody.chain(qdsl.term(unicode(k), v), condition=condition,type=q_type) return self def multi_match(self, fields, query, **kwargs): self._querybody.chain(qdsl.multi_match(query, fields), condition=kwargs.get('condition', None), type='query') return self def exact(self, field, value,**kwargs): try: field_template = getattr( self.base_obj, field) if type(field_template) != ESType: field_template = create_es_type(field_template) for estype in [String,IP,Attachment]: if isinstance(field_template, estype) and field_template.analyzed == True: logger.warn('%s types may not exact match correctly if they are analyzed' % unicode(estype.__class__.__name__)) except AttributeError: logger.warn('%s is not in the base model.' % unicode(field)) kwargs['type'] = 'filter' if isinstance(value, list): self._querybody.chain(qdsl.terms(field,value), **kwargs) else: self._querybody.chain(qdsl.term(field, value), **kwargs) return self def or_(self,*args): return ' OR '.join(args) def and_(self,*args): return ' AND '.join(args) def get(self,id, **kwargs): try: params = {'index':self.idx, 'doc_type':self.type, 'id':id} params.update(kwargs) doc = self._es.get(**params) if doc: return VWCollectionGen(self.base_obj, {'docs':[doc]})[0] return None except: # TODO. Discuss this. Should get() return None even on exceptions? 
return None def refresh(self, **kwargs): self._esc.refresh(index=self.idx, **kwargs) def get_in(self, ids,**kwargs): if len(ids) > 0: # check for ids. empty list returns an empty list (instead of exception) params = {'index':self.idx, 'doc_type':self.type, 'body':{'ids':ids}} params.update(kwargs); res = self._es.mget(**params) if res and res.get('docs'): return VWCollectionGen(self.base_obj, res) return [] def get_like_this(self,doc_id,**kwargs): params = {'index':self.idx,'doc_type':self.type,'id':doc_id} params.update(kwargs) res = self._es.mlt(**params) if res and res.get('docs'): return VWCollectionGen(self.base_obj, res) else: return [] def sort(self, **kwargs): for k,v in kwargs.iteritems(): v = v.lower() if v not in ['asc','desc']: v = 'asc' self._sort.append('%s:%s' % (k,v)) return self def clear_previous_search(self): self._raw = {} self._search_params = [] self._special_body = {} self._querybody = querybuilder.QueryBody() def _create_search_params( self, **kwargs ): # before_query_build() is allowed to manipulate the object's internal state before we do stuff self._querybody = self.execute_callbacks('before_query_build', self._querybody ) q = { 'index': self.idx, 'doc_type': self.type } if self._raw: q['body'] = self._raw elif len(self._search_params) > 0: kwargs['type'] = 'query' self._querybody.chain(qdsl.query_string(self.and_(*self._search_params)), **kwargs) else: q['body'] = qdsl.query(qdsl.match_all()) if self._querybody.is_filtered() or self._querybody.is_query(): q['body'] = self._querybody.build() # after_query_build() can manipulate the final query before being sent to ES # this is generally considered a bad idea but might be useful for logging q = self.execute_callbacks( 'after_query_build', q ) logger.debug(json.dumps(q)) return q def count(self): params = self._create_search_params() resp = self._es.count(**params) return resp.get('count') def __len__(self): return self.count() def limit(self,count): self.results_per_page = count return self def all(self,**kwargs): params = self._create_search_params() if not params.get('size'): params['size'] = self.results_per_page if kwargs.get('results_per_page') != None: kwargs['size'] = kwargs.get('results_per_page') del kwargs['results_per_page'] if kwargs.get('start') != None: kwargs['from_'] = kwargs.get('start') del kwargs['start'] logger.debug(json.dumps(self._sort)) params.update(kwargs) if len(self._sort) > 0: if params.get('sort') and isinstance(params['sort'], list): params['sort'].extend(self._sort) else: params['sort'] = self._sort if params.get('sort'): if isinstance(params['sort'], list): params['sort'] = ','.join(params.get('sort')) else: raise TypeError('"sort" argument must be a list') logger.debug(json.dumps(params)) results = self._es.search(**params) return VWCollectionGen(self.base_obj,results) def one(self,**kwargs): kwargs['results_per_page'] = 1 results = self.all(**kwargs) try: return results[0] except IndexError: raise NoResultsFound('No result found for one()') # this is for legacy purposes in filter_by def _translate_bool_condition(self,_bool_condition): if _bool_condition == 'and': _bool_condition = 'must' elif _bool_condition == 'or': _bool_condition = 'should' elif _bool_condition == 'not': _bool_condition = 'must_not' # this is for things like geo_distance where we explicitly want the true and/or/not elif _bool_condition == 'explicit_and': _bool_condition = 'and' elif _bool_condition == 'explicit_or': _bool_condition = 'or' elif _bool_condition == 'explicit_not': _bool_condition = 'not' return 
_bool_condition def range(self, field, **kwargs): search_options = {} for opt in ['condition','minimum_should_match']: if opt in kwargs: search_options[opt] = kwargs.get(opt) del kwargs[opt] q = qdsl.range(field, **kwargs) if self._querybody.is_filtered(): d = {'filter': q} else: d = {'query': q} if search_options: d.update(search_options) self._querybody.chain(d) return self def search_geo(self, field, distance, lat, lon,**kwargs): condition = kwargs.get('condition', 'and') if 'condition' in kwargs: del kwargs['condition'] self._querybody.chain(qdsl.filter_(qdsl.geo_distance(field, [lon,lat], distance, **kwargs)), condition=condition) return self def missing( self, field, **kwargs): self._querybody.chain(qdsl.filter_(qdsl.missing(field))) return self def exists( self, field, **kwargs): self._querybody.chain(qdsl.filter_(qdsl.exists(field, **kwargs))) return self def delete(self, **kwargs): params = self._create_search_params() params.update(kwargs) self._es.delete_by_query(**params) def delete_in(self, ids): if not isinstance(ids, list): raise TypeError('argument to delete in must be a list.') bulk_docs = [] for i in ids: this_id = i this_type = self.base_obj.__type__ this_idx = self.idx if isinstance(i, VWBase): this_id = i.id this_type = i.__type__ try: this_idx = i.__index__ except AttributeError: pass bulk_docs.append({'_op_type': 'delete', '_type': this_type, '_index': this_idx, '_id': this_id }) return helpers.bulk( self._es, bulk_docs, chunk_size=self.bulk_chunk_size) # commits items in bulk def commit(self, callback=None): bulk_docs = [] if callback: if not callable(callback): raise TypeError('Argument 2 to commit() must be callable') # allow for a search to work if there are no _items if len(self._items) == 0: items = self.all() else: items = self._items for i in items: if callback: i = callback(i) i = self.execute_callbacks('on_bulk_commit', i) this_dict = {} this_id = '' this_idx = self.idx this_type = self.base_obj.__type__ if isinstance(i, VWBase): this_dict = i._create_source_document() this_type = i.__type__ this_id = i.id try: this_idx = i.__index__ except AttributeError: pass elif isinstance(i,dict): this_dict = i this_id = i.get('id') else: raise TypeError('Elements passed to the collection must be type of "dict" or "VWBase"') if not this_id: this_id = str(uuid4()) bulk_docs.append({'_op_type': 'index', '_type': this_type, '_index': this_idx, '_id': this_id, '_source': this_dict}) return helpers.bulk(self._es,bulk_docs,chunk_size=self.bulk_chunk_size)
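# Hedged usage sketch for the VWCollection fluent interface above. The User model,
# its fields, and its __index__/__type__ attributes are hypothetical; every chained
# call returns self, so filters, sorting, and paging compose before all()/one()
# actually hit Elasticsearch:
users = VWCollection(base_obj=User)  # User is a hypothetical VWBase model
active = users.filter_by(status='active').sort(created_at='desc').limit(50).all()
by_ids = users.filter_by(ids=['a1', 'a2']).all()
single = users.filter_by(id='a1').one()  # raises NoResultsFound if nothing matches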
class SearchEngine(object): def __init__(self): self.es = Elasticsearch(hosts=settings.ELASTICSEARCH_HOSTS, **settings.ELASTICSEARCH_CONNECTION_OPTIONS) self.logger = logging.getLogger(__name__) def delete(self, **kwargs): """ Deletes a document from the index Pass an index, doc_type, and id to delete a specific document Pass a body with a query dsl to delete by query """ body = kwargs.get('body', None) if body != None: try: return self.es.delete_by_query(ignore=[404], **kwargs) except Exception as detail: self.logger.warning('%s: WARNING: failed to delete document by query: %s \nException detail: %s\n' % (datetime.now(), body, detail)) raise detail else: try: return self.es.delete(ignore=[404], **kwargs) except Exception as detail: self.logger.warning('%s: WARNING: failed to delete document: %s \nException detail: %s\n' % (datetime.now(), body, detail)) raise detail def delete_index(self, **kwargs): """ Deletes an entire index """ index = kwargs.get('index', '').strip() print 'deleting index : %s' % index return self.es.indices.delete(index=index, ignore=[400, 404]) def search(self, **kwargs): """ Search for an item in the index. Pass an index, doc_type, and id to get a specific document Pass a body with a query dsl to perform a search """ body = kwargs.get('body', None) index = kwargs.get('index', None) id = kwargs.get('id', None) if index is None: raise NotImplementedError("You must specify an 'index' in your call to search") if id: if isinstance(id, list): kwargs.setdefault('body', {'ids': kwargs.pop('id')}) return self.es.mget(**kwargs) else: return self.es.get(**kwargs) ret = None try: ret = self.es.search(**kwargs) except Exception as detail: self.logger.warning('%s: WARNING: search failed for query: %s \nException detail: %s\n' % (datetime.now(), body, detail)) pass return ret def index_term(self, term, id, context='', ewstatus='', options={}): """ If the term is already indexed, then simply increment the count and add the id of the term to the existing index. If the term isn't indexed then add the index. id: a unique id associated with the term context: a uuid of a concept to associate with the term to render in the ui options: any additional information to associate with the term """ if term.strip(' \t\n\r') != '': already_indexed = False count = 1 ids = [id] try: #_id = unicode(term, errors='ignore').decode('utf-8').encode('ascii') _id = uuid.uuid3(uuid.NAMESPACE_DNS, '%s%s' % (hash(term), hash(context))) result = self.es.get(index='term', doc_type='value', id=_id, ignore=404) #print 'result: %s' % result if result['found'] == True: ids = result['_source']['ids'] if id not in ids: ids.append(id) else: ids = [id] # ewstatus is indexed only if it's not a dict if (type(ewstatus) is dict): self.index_data('term', 'value', {'term': term, 'context': context, 'options': options, 'count': len(ids), 'ids': ids}, id=_id) else: self.index_data('term', 'value', {'term': term, 'context': context, 'ewstatus': ewstatus, 'options': options, 'count': len(ids), 'ids': ids}, id=_id) except Exception as detail: self.logger.warning('%s: WARNING: search failed to index term: %s \nException detail: %s\n' % (datetime.now(), term, detail)) raise detail def delete_terms(self, ids): """ If the term is referenced more than once, simply decrement the count and remove the id of the deleted term from the existing index.
If the term is only referenced once, then delete the index """ if not isinstance(ids, list): ids = [ids] for id in ids: result = self.es.search(index='term', doc_type='value', body={ "query": { "filtered": { "filter":{ "terms": { "ids": [id] } }, "query": { "match_all": {} } } }, "from": 0, "size": 10 }, ignore=404) if 'hits' in result: for document in result['hits']['hits']: document['_source']['ids'].remove(id) count = len(document['_source']['ids']) if count > 0: document['_source']['count'] = count self.index_data('term', 'value', document['_source'], id=document['_id']) self.es.indices.refresh(index='term') else: self.delete(index='term', doc_type='value', id=document['_id']) def create_mapping(self, index, doc_type, fieldname='', fieldtype='string', fieldindex='analyzed', body=None): """ Creates an Elasticsearch body for a single field given an index name and type name """ if not body: if fieldtype == 'geo_shape': body = { doc_type : { 'properties' : { fieldname : { 'type' : 'geo_shape', 'tree' : 'geohash', 'precision': '1m' } } } } else: body = { doc_type : { 'properties' : { fieldname : { 'type' : fieldtype, 'index' : fieldindex } } } } self.create_index(index=index, ignore=400) self.es.indices.put_mapping(index=index, doc_type=doc_type, body=body) def create_index(self, **kwargs): self.es.indices.create(**kwargs) def index_data(self, index=None, doc_type=None, body=None, idfield=None, id=None, **kwargs): """ Indexes a document or list of documents into Elasticsearch If "id" is supplied then will use that as the id of the document If "idfield" is supplied then will try to find that property in the document itself and use the value found for the id of the document """ if not isinstance(body, list): body = [body] for document in body: if idfield is not None: if isinstance(document, dict): id = document[idfield] else: id = getattr(document,idfield) try: self.es.index(index=index, doc_type=doc_type, body=document, id=id, **kwargs) except Exception as detail: self.logger.warning('%s: WARNING: failed to index document: %s \nException detail: %s\n' % (datetime.now(), document, detail)) raise detail def bulk_index(self, data): return helpers.bulk(self.es, data, chunk_size=500, raise_on_error=True) def create_bulk_item(self, index, type, id, data): if not(self.isempty_or_none(index) or self.isempty_or_none(type) or self.isempty_or_none(id)): return [ { "index" : { "_index" : index, "_type" : type, "_id" : id } }, data ] else: return False
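# A minimal sketch of the two delete paths SearchEngine.delete() above dispatches
# between; the index/doc_type/id values are placeholders. With a body it becomes a
# delete-by-query, without one it deletes a single document:
se = SearchEngine()
se.delete(index='term', doc_type='value', id='some-id')  # single document
se.delete(index='term', doc_type='value', body={'query': {'term': {'context': 'abc'}}})  # by query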
class TMMonoLing: DOC_TYPE = 'tm' def __init__(self, **kwargs): self.es = Elasticsearch(**kwargs) # Put default index template self.es.indices.put_template(name='tm_template', body = self._index_template()) self.refresh() #self.preprocessors = dict() self.tokenizers = dict() self.regex = dict() # Add new segment def add_segment(self, segment, ftype): # Add segment source and target texts to the corresponding index of ElasticSearch id = getattr(segment, ftype + '_id') index = TMUtils.lang2es_index(getattr(segment, ftype + '_language')) s_result = self.es.index(index=index, doc_type=self.DOC_TYPE, id=id, body = self._segment2doc(segment, ftype)) return id # Bulk segment addition def add_segments(self, segments, ftype): # Bulk insert return self._segment2es_bulk(segments, ftype, 'update', self._segment2doc_upsert) # Search for top matching segments def query(self, lang, qstring, filter = None): index = TMUtils.lang2es_index(lang) if not self.index_exists(index): return # Query source ES for the text query = TMDbQuery(es=self.es, index = index, q=qstring, filter=filter) for response,q in query(): for hit in response: yield hit,q # Search for top matching segments def mquery(self, lang, limit, q_list, filter=None): index = TMUtils.lang2es_index(lang) if not self.index_exists(index): return # Query source ES for the text query = TMDbQuery(es=self.es, index=index, q=q_list, filter=filter, limit=limit) for response, q in query(): yield response #for hit in response: # yield hit # Get segment by id def get(self, lang, id): index = TMUtils.lang2es_index(lang) if not self.index_exists(index): return hit = self.es.get(index=index, id=id) if not hit: return None return hit['_source'] # Get multiple segments by id def mget(self, ids_lang): if not ids_lang: return [] body = [{ '_index': TMUtils.lang2es_index(lang), '_id' : id } for lang,id in ids_lang] hits = self.es.mget(body={'docs' : body}) if not hits: return None return [hit.get('_source',None) for hit in hits['docs']] # Scan matching segments def scan(self, lang, filter = None): index = TMUtils.lang2es_index(lang) if not self.index_exists(index): return query = TMDbQuery(es=self.es, index = index, filter=filter) for hit in query.scan(): # Build segment by querying map and target index yield hit # Scan all pivot segments def scan_pivot(self, pivot_lang, langs): index = TMUtils.lang2es_index(pivot_lang) if not self.index_exists(index): return search = Search(using=self.es, index=index) for lang in langs: search = search.query('match', target_language=lang) for result in search.scan(): yield result.meta.id # Bulk delete segments by id def delete(self, lang, ids): index = TMUtils.lang2es_index(lang) actions = [{'_op_type': 'delete', '_id': id, '_index' : index, '_type': self.DOC_TYPE, } for id in ids] # Bulk delete try: status = helpers.bulk(self.es, actions) except Exception as e: logging.warning(e) return str(e) return status # Should be called after modifying the index def refresh(self): #self.indexes = self.es.indices.get_aliases() #not supported anymore self.indexes = self.es.indices.get_alias("*") def index_exists(self, index): return self.es.indices.exists(index) def get_langs(self): return [TMUtils.es_index2lang(l) for l in self.indexes if re.search('^tm_\w{2}$', l)] ############### Helper methods ################### def _segment2es_bulk(self, segments, ftype, op_type, f_action): # Add segment source and target texts to the corresponding index of ElasticSearch in a batch actions = [] added_ids = set() for segment in segments: id =
getattr(segment, ftype + '_id') if id in added_ids: continue # avoid duplicates in the same batch added_ids.add(id) index = TMUtils.lang2es_index(getattr(segment, ftype + '_language')) action = {'_id': id, '_index' : index, '_type' : self.DOC_TYPE, '_op_type': op_type, '_source' : f_action(segment, ftype) #self._segment2doc(segment, ftype) } actions.append(action) # Bulk insert logging.info("Bulk upsert: {}".format(actions)) s_result = helpers.bulk(self.es, actions) self.refresh() # refresh list of indexes (could have been created during insert) return s_result def _segment2doc(self, segment, ftype): doc = {'text': getattr(segment, ftype + '_text')} # Optional fields (POS, tokenized) if hasattr(segment, ftype + '_pos'): doc['pos'] = getattr(segment, ftype + '_pos') op_ftype = 'source' if ftype == 'target' else 'target' # Auxiliary field to facilitate language matrix generation doc['target_language'] = [TMUtils.lang2short(TMUtils.str2list(getattr(segment, op_ftype + '_language'))[0])] doc['token_cnt'] = self.token_count(getattr(segment, ftype + '_text'), getattr(segment, ftype + '_language')) return doc def _segment2doc_upsert(self, segment, ftype): doc = self._segment2doc(segment, ftype) upsert_body = {'upsert': doc, # insert doc as is if it doesn't exist yet # If doc exists, then execute this painless script: # - add target language to the list and filter unique values by converting to set 'script' : 'ctx._source.target_language.add(params.language); ctx._source.target_language = ctx._source.target_language.stream().distinct().filter(Objects::nonNull).collect(Collectors.toList()); \ if (params.pos != null) { ctx._source.pos = params.pos; }', # parameters to the script 'params' : { 'language' : doc['target_language'], 'pos' : doc.get('pos')} } #return {'doc': doc, 'doc_as_upsert' : True } return upsert_body # Apply regular expressions, tokenize, and count the total number of words def token_count(self, text, lang): lang = lang.split('-')[0].upper() if not lang in self.regex: try: self.regex[lang] = TMRegExpPreprocessor(lang) logging.info("Loading Regex for {}".format(lang)) except Exception as e: logging.info("Unsupported Regex for {} ".format(lang)) self.regex[lang] = lang if not lang in self.tokenizers: try: self.tokenizers[lang] = TMTokenizer(lang) logging.info("Loading Tokenizer for {}".format(lang)) except Exception as e: self.tokenizers[lang] = lang logging.info("Unsupported Tokenizer for {}".format(lang)) if self.regex[lang] != lang: text = TMRegexMatch.simplified_name(self.regex[lang].process(text)) if self.tokenizers[lang] != lang: token_cnt = len((self.tokenizers[lang].tokenizer.process(text)).split(' ')) else: if ' ' in text: token_cnt = len(text.split(' ')) else: token_cnt = 1 return token_cnt def _index_template(self): template = { "template": "tm_*", "settings": { "analysis": { "analyzer": { "folding": { "tokenizer": "standard", "filter": ["lowercase", "asciifolding"] } } } }, "mappings" : { self.DOC_TYPE: { "properties": { # Field text should be analyzed, text.raw shouldn't "text": { "type": "text", "analyzer": "folding" }, "target_language": { "type": "keyword", "index": "true" }, "pos": { "type": "keyword", "index": "true" }, "token_cnt": { "type": "integer", "index": "true" } } } } } print(json.dumps(template)) return template
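# Hedged sketch of the per-language routing TMMonoLing.mget() above relies on:
# each (lang, id) pair maps to its own 'tm_<lang>' index (per the template), so one
# mget body can mix indexes. `es` is assumed to be an Elasticsearch client, and the
# index names and ids here are illustrative:
docs = es.mget(body={'docs': [
    {'_index': 'tm_en', '_id': 'segment-1'},
    {'_index': 'tm_es', '_id': 'segment-2'},
]})
sources = [d.get('_source') for d in docs['docs']]  # None for ids that were not found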
class ESClient(object): def __init__(self, hosts, batchSize=1000, **kwargs): self.esConn = Elasticsearch(hosts, **kwargs) self.bulker = ListBulker() self.batchSize = batchSize self.ID_FIELD = "_id" def _isOk(self, response): return response.get('acknowledged', False) def createIndex(self, indexName="test", body=None, mappings=None, settings=None): if self.esConn.indices.exists(indexName): self.deleteIndex(indexName) return self._createIndex(indexName, body, mappings, settings) def createIndexIfNotExist(self, indexName="test", body=None, mappings=None, settings=None): if not self.esConn.indices.exists(indexName): return self._createIndex(indexName, body, mappings, settings) return True def _createIndex(self, indexName, body, mappings, settings): logging.info('Create index %s ...', indexName) body = self._createIndexConfig(body, mappings, settings) logging.debug(json.dumps(body, ensure_ascii=False, indent=4)) response = self.esConn.indices.create(index=indexName, body=body) return self._isOk(response) def _createIndexConfig(self, body, mappings, settings): if not body: body = {} if settings: if 'settings' in settings: body.update(settings) else: body['settings'] = settings if mappings: if 'mappings' in mappings: body.update(mappings) else: body['mappings'] = mappings return body def closeIndex(self, indexName="test"): response = self.esConn.indices.close(index=indexName) return self._isOk(response) def openIndex(self, indexName="test"): response = self.esConn.indices.open(index=indexName) return self._isOk(response) def updateSetting(self, indexName="test", settings={}): logging.info('Update setting for index %s ...', indexName) self.esConn.indices.put_settings(index=indexName, body=settings) def deleteIndex(self, indexName="test"): logging.info('Delete index %s ...', indexName) response = self.esConn.indices.delete(indexName) return self._isOk(response) def getDocById(self, indexName, indexType, docid): # fetch once; fall back to the raw response if there is no _source doc = self.esConn.get(index=indexName, doc_type=indexType, id=docid) return doc.get('_source', doc) def getKeysAndDocsByIds(self, indexName, indexType, docids): # mget returns {'docs': [...]}; yield (id, doc) pairs, with None for missing docs response = self.esConn.mget(index=indexName, doc_type=indexType, body={"ids": docids}) for doc in response.get('docs', []): yield doc['_id'], doc if doc.get('found') else None def getDocsByIds(self, indexName, indexType, docids): for _, doc in self.getKeysAndDocsByIds(indexName, indexType, docids): yield doc def indexDoc(self, indexName, indexType, doc, docid=None, bulk=False): if bulk: action = { '_op_type': 'index', '_index': indexName, '_type': indexType, '_source': doc } if docid: action['_id'] = docid self.bulker.add(action) return self.force_bulk() else: response = self.esConn.index(index=indexName, doc_type=indexType, id=docid, body=doc) return 'created' in response def deleteDoc(self, indexName, indexType, docid, bulk=False): if bulk: self.bulker.add({ '_op_type': 'delete', '_index': indexName, '_type': indexType, '_id': docid }) self.force_bulk() else: self.esConn.delete(index=indexName, doc_type=indexType, id=docid) def deleteDocs(self, indexName, indexType, docids): self.delete_batch(indexName, indexType, docids) def delete_batch(self, indexName, indexType, docids): actions = self._buildDeleteActions(indexName, indexType, docids) success, errors = helpers.bulk(self.esConn, actions) # @UnusedVariable if errors: logging.error("Delete batch: there are some errors %s", errors) def upsert_batch(self, indexName, indexType, docs, batchSize=1000, idField=None): actions = self._buildIndexActions(indexName,
indexType, docs, idField) success, errors = helpers.bulk(self.esConn, actions, chunk_size=batchSize) # @UnusedVariable if errors: logging.error("Upsert batch: there are some errors %s", errors) @command('return_bool') def force_bulk(self, bulk=False): if bulk or len(self.bulker) >= self.batchSize: success, errors = helpers.bulk( self.esConn, self.bulker.pop_all(), chunk_size=self.batchSize) # @UnusedVariable if errors: logging.error("Force bulk: there are some errors %s", errors) return False return True def _buildDeleteActions(self, indexName, indexType, docids): actions = [] for docid in docids: actions.append({ '_op_type': 'delete', '_index': indexName, '_type': indexType, '_id': docid }) return actions def _buildIndexActions(self, indexName, indexType, docs, idField=None): if type(docs) == list: return self._buildIndexActionsFromList(indexName, indexType, docs, idField) elif type(docs) == dict: return self._buildIndexActionsFromDict(indexName, indexType, docs) else: return [] def _buildIndexActionsFromList(self, indexName, indexType, docs, idField): actions = [] for doc in docs: _id = doc[idField] del doc[idField] action = { '_op_type': 'index', '_index': indexName, '_type': indexType, '_id': _id, '_source': doc } actions.append(action) return actions def _buildIndexActionsFromDict(self, indexName, indexType, docs): actions = [] for docid, doc in docs.items(): actions.append({ '_op_type': 'index', '_index': indexName, '_type': indexType, '_id': docid, '_source': doc }) return actions def countIndexDocs(self, indexName, typeName=None): time.sleep(3) return self.esConn.count(index=indexName, doc_type=typeName) def search(self, indexName, typeName=None, query=None, params=None): ''' :param indexName: list or string of indices :param typeName: list or string of types ''' if type(indexName) == list: indexName = ','.join(indexName) if type(typeName) == list: typeName = ','.join(typeName) if params: return self.esConn.search(index=indexName, doc_type=typeName, body=query, params=params) else: return self.esConn.search(index=indexName, doc_type=typeName, body=query) def scroll(self, indexName, typeName=None, query=None, scroll='10m', size=1000): return helpers.scan(self.esConn, index=indexName, doc_type=typeName, query=query, scroll=scroll, size=size) def existsType(self, indexName, typeName=None): return self.esConn.indices.exists_type(index=indexName, doc_type=typeName) def existsIndex(self, indexName): return self.esConn.indices.exists(indexName) def putMapping(self, indexName, typeName, mapping={}): response = self.esConn.indices.put_mapping(index=indexName, doc_type=typeName, body=mapping) return self._isOk(response)
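# Hedged usage sketch for ESClient above; the host and index names are placeholders.
# With bulk=True, indexDoc() only buffers the action; nothing is sent until the
# buffer reaches batchSize or force_bulk(bulk=True) flushes it explicitly:
client = ESClient(['localhost:9200'], batchSize=500)
client.createIndexIfNotExist('test')
for i in range(10):
    client.indexDoc('test', 'doc', {'n': i}, docid=str(i), bulk=True)
client.force_bulk(bulk=True)  # flush whatever is left in the buffer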
class EntityManager(object): @staticmethod def entity_not_found_message(en_type, ids): return 'Entities: "{type}" with ids: {ids} not found.'.format( type=en_type, ids=ids) def __init__(self, index='default', es_settings=None): if es_settings: self.es = Elasticsearch(**es_settings) else: self.es = Elasticsearch() self._index = index self._registry = {} def persist(self, entity): if not hasattr(entity, 'to_storage') or not hasattr( entity, '__getitem__') or not hasattr(entity, 'type'): raise TypeError( 'entity object must have to_storage, type and behave like a dict methods' ) self._persist(entity, state=ADD) def remove(self, entity): self._persist(entity, state=REMOVE) def flush(self): actions = [] for persisted_entity in six.itervalues(self._registry): if persisted_entity.is_action_needed(): actions.append(persisted_entity) self._execute_callbacks(actions, 'pre') bulk_results = helpers.streaming_bulk(self.es, [a.stmt for a in actions]) # TODO: checking exceptions in bulk_results for persisted_entity, result in zip(actions, bulk_results): if 'create' in result[1]: persisted_entity.set_id(result[1]['create']['_id']) for action in actions: action.reset_state() self._execute_callbacks(actions, 'post') def find(self, _id, _type, scope=None, **kwargs): params = { 'id': _id, 'index': self._index, 'doc_type': _type.get_type() } if scope: params['_source'] = _type.get_fields(scope) params.update(kwargs) try: _data = self.es.get(**params) except TransportError as e: # TODO: the might be other errors like server unavaliable raise EntityNotFound( self.entity_not_found_message(_type.get_type(), _id), e) if not _data['found']: raise EntityNotFound( self.entity_not_found_message(_type.get_type(), _id)) source = _data['_source'] source['id'] = _data['_id'] entity = _type(source, scope) self._persist(entity, state=UPDATE) return entity def find_many(self, _ids, _type, scope=None, complete_data=True, **kwargs): params = { 'body': { 'ids': _ids }, 'index': self._index, 'doc_type': _type.get_type() } if scope: params['_source'] = _type.get_fields(scope) params.update(kwargs) try: _data = self.es.mget(**params) except TransportError as e: # TODO: the might be other errors like server unavaliable raise EntityNotFound( self.entity_not_found_message(_type.get_type(), ', '.join(_ids)), e) entities = [] if complete_data: invalid_items = [ elem['_id'] for elem in _data['docs'] if not elem['found'] ] if invalid_items: raise EntityNotFound( self.entity_not_found_message(_type.get_type(), ', '.join(invalid_items))) for _entity in _data['docs']: if _entity['found']: source = _entity['_source'] source['id'] = _entity['_id'] entity = _type(source, scope) self._persist(entity, state=UPDATE) entities.append(entity) return entities def query(self, query, _type, scope=None, **kwargs): params = {} if scope: params['_source'] = _type.get_fields(scope) try: data = self.es.search(index=self._index, doc_type=_type.get_type(), body=query, **kwargs) except TransportError as e: raise RepositoryError('Transport returned error', cause=e) entities = [] for record in data['hits']['hits']: source = record['_source'] source['id'] = record['_id'] source['_score'] = record['_score'] if '_explanation' in record: source['_explanation'] = record['_explanation'] entity = _type(source, scope, record.get('highlight')) self._persist(entity, state=UPDATE) entities.append(entity) return entities, without(['hits'], data, move_up={'hits': ['max_score', 'total']}) def query_one(self, query, _type, scope=None, **kwargs): entities, meta = 
self.query(query, _type, scope, **kwargs) if len(entities) == 1: return entities[0] raise RepositoryError( 'Expected one result, found {num}'.format(num=len(entities))) def clear(self): self._registry = {} def get_repository(self, repository): app, repository_class_name = repository.split(':') if app not in settings.INSTALLED_APPS: founded_app = [ _app for _app in settings.INSTALLED_APPS if _app.endswith(app) ] if not founded_app: raise RepositoryError( 'Given application {app} is not in INSTALLED_APPS'.format( app=app)) app = founded_app[0] try: module = import_module(app + '.' + 'repositories') except ImportError: raise RepositoryError( 'Given application {app} has no repositories'.format(app=app)) if not hasattr(module, repository_class_name): raise RepositoryError( 'Given repository {repository_class_name} does not exist in application {app}' .format(repository_class_name=repository_class_name, app=app)) repository_class = getattr(module, repository_class_name) if not issubclass(repository_class, BaseRepository): raise RepositoryError( 'Custom repository must be subclass of BaseRepository') return repository_class(self) def get_client(self): return self.es def _persist(self, entity, state): if id(entity) in self._registry: self._registry[id(entity)].state = state else: self._registry[id(entity)] = PersistedEntity(entity, state=state, index=self._index) def _execute_callbacks(self, actions, type): for persisted_entity in actions: if type == 'pre': attr = 'state' else: attr = 'last_state' action = { ADD: 'create', UPDATE: 'update', REMOVE: 'delete' }[getattr(persisted_entity, attr)] callback_func_name = type + '_' + action if hasattr(persisted_entity._entity, callback_func_name): getattr(persisted_entity._entity, callback_func_name)(self)
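# Hedged sketch of the unit-of-work flow EntityManager implements above. The
# Article entity class is hypothetical; per persist(), it must expose to_storage,
# type, and dict-style access. Nothing is written until flush() runs one
# streaming_bulk over every registered entity that needs an action:
em = EntityManager(index='content')
article = Article({'title': 'hello'})  # hypothetical entity type
em.persist(article)                    # registered with state=ADD
em.flush()                             # pre_* callbacks, bulk write, post_* callbacks
same = em.find(article['id'], Article)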
class EntityManager(object): @staticmethod def entity_not_found_message(en_type, ids): return 'Entities: "{type}" with ids: {ids} not found.'.format(type=en_type, ids=ids) def __init__(self, index='default', es_settings=None): if es_settings: self.es = Elasticsearch(**es_settings) else: self.es = Elasticsearch() self._index = index self._registry = {} def persist(self, entity): if not hasattr(entity, 'to_storage') or not hasattr(entity, '__getitem__') or not hasattr(entity, 'type'): raise TypeError('entity object must have to_storage, type and behave like a dict methods') self._persist(entity, state=ADD) def remove(self, entity): self._persist(entity, state=REMOVE) def flush(self, refresh=False): actions = [] for persisted_entity in six.itervalues(self._registry): if persisted_entity.is_action_needed(): actions.append(persisted_entity) self._execute_callbacks(actions, 'pre') bulk_results = helpers.streaming_bulk(self.es, [a.stmt for a in actions], refresh=refresh) # TODO: checking exceptions in bulk_results for persisted_entity, result in zip(actions, bulk_results): if 'create' in result[1]: persisted_entity.set_id(result[1]['create']['_id']) for action in actions: action.reset_state() self._execute_callbacks(actions, 'post') def find(self, _id, _type, scope=None, **kwargs): params = {'id': _id, 'index': self._index, 'doc_type': _type.get_type()} if scope: params['_source'] = _type.get_fields(scope) params.update(kwargs) try: _data = self.es.get(**params) except TransportError as e: # TODO: the might be other errors like server unavaliable raise EntityNotFound(self.entity_not_found_message(_type.get_type(), _id), e) if not _data['found']: raise EntityNotFound(self.entity_not_found_message(_type.get_type(), _id)) source = _data['_source'] source['id'] = _data['_id'] entity = _type(source, scope) self._persist(entity, state=UPDATE) return entity def find_many(self, _ids, _type, scope=None, complete_data=True, **kwargs): try: _ids = list(_ids) except TypeError as e: raise RepositoryError('Variable _ids has to be iterable', cause=e) params = {'body': {'ids': _ids}, 'index': self._index, 'doc_type': _type.get_type()} if scope: params['_source'] = _type.get_fields(scope) params.update(kwargs) try: _data = self.es.mget(**params) except TransportError as e: # TODO: the might be other errors like server unavaliable raise EntityNotFound(self.entity_not_found_message(_type.get_type(), ', '.join(_ids)), e) entities = [] if complete_data: invalid_items = [elem['_id'] for elem in _data['docs'] if not elem['found']] if invalid_items: raise EntityNotFound(self.entity_not_found_message(_type.get_type(), ', '.join(invalid_items))) for _entity in _data['docs']: if _entity['found']: source = _entity['_source'] source['id'] = _entity['_id'] entity = _type(source, scope) self._persist(entity, state=UPDATE) entities.append(entity) return entities def query(self, query, _type, scope=None, **kwargs): params = {} if scope: params['_source'] = _type.get_fields(scope) params.update(kwargs) try: data = self.es.search(index=self._index, doc_type=_type.get_type(), body=query, **params) except TransportError as e: raise RepositoryError('Transport returned error', cause=e) entities = [] for record in data['hits']['hits']: source = record['_source'] source['id'] = record['_id'] source['_score'] = record['_score'] if '_explanation' in record: source['_explanation'] = record['_explanation'] entity = _type(source, scope, record.get('highlight')) self._persist(entity, state=UPDATE) entities.append(entity) return entities, 
without(['hits'], data, move_up={'hits': ['max_score', 'total']}) def query_one(self, query, _type, scope=None, **kwargs): entities, meta = self.query(query, _type, scope, **kwargs) if len(entities) == 1: return entities[0] raise RepositoryError('Expected one result, found {num}'.format(num=len(entities))) def clear(self): self._registry = {} def get_repository(self, repository): app, repository_class_name = repository.split(':') if app not in settings.INSTALLED_APPS: founded_app = [_app for _app in settings.INSTALLED_APPS if _app.endswith(app)] if not founded_app: raise RepositoryError('Given application {app} is not in INSTALLED_APPS'.format(app=app)) app = founded_app[0] try: module = import_module(app + '.' + 'repositories') except ImportError: raise RepositoryError('Given application {app} has no repositories'.format(app=app)) if not hasattr(module, repository_class_name): raise RepositoryError( 'Given repository {repository_class_name} does not exist in application {app}'.format( repository_class_name=repository_class_name, app=app )) repository_class = getattr(module, repository_class_name) if not issubclass(repository_class, BaseRepository): raise RepositoryError('Custom repository must be subclass of BaseRepository') return repository_class(self) def get_client(self): return self.es def _persist(self, entity, state): if id(entity) in self._registry: self._registry[id(entity)].state = state else: self._registry[id(entity)] = PersistedEntity(entity, state=state, index=self._index) def _execute_callbacks(self, actions, type): for persisted_entity in actions: if type == 'pre': attr = 'state' else: attr = 'last_state' action = {ADD: 'create', UPDATE: 'update', REMOVE: 'delete'}[getattr(persisted_entity, attr)] callback_func_name = type + '_' + action if hasattr(persisted_entity._entity, callback_func_name): getattr(persisted_entity._entity, callback_func_name)(self)
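# Hedged sketch of find_many() semantics in the revised EntityManager above:
# with complete_data=True (the default), a single missing id raises EntityNotFound
# rather than being silently dropped. Article and the ids are hypothetical:
em = EntityManager(index='content')
try:
    articles = em.find_many(['a1', 'a2'], Article)
except EntityNotFound:
    # fall back to whatever subset exists
    articles = em.find_many(['a1', 'a2'], Article, complete_data=False)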
{ "docs" : [ { "_index" : "megacorp", "_type" : "employee", "_id" : 2 }, { "_index" : "megacorp", "_type" : "employee", "_id" : 1, "_source": "last_name" } ] } pp.pprint( es.mget(body=list_docs) ) print 'mget with simplified format' list_docs = \ { "docs" : [ { "_id" : 2 }, { "_id" : 1, "_source": "last_name" } ] } pp.pprint( es.mget(index="megacorp", doc_type="employee", body=list_docs) )
print Exception, ":", r es = Elasticsearch("219.224.135.93") print "retry" print count_index if count_index % 10000 == 0: ts = time.time() print "%s per %s second" % (count_index, ts - tb) tb = ts else: exist_uid_list.append(user_id) count_uid += 1 if count_uid % 1000 == 0: multi_items = es.mget(index="activity", doc_type="manage", body={"ids": exist_uid_list}, _source=True)['docs'] exist_uid_list = [] for m_item in multi_items: m_item = m_item['_source'] update_item = compare_activity(item, m_item) xdata = expand_index_action(update_item) bulk_action.extend([xdata[0], xdata[1]]) count_index += 1 if count_index % 2000 == 0: while True: try: es.bulk(bulk_action, index="activity", doc_type="manage",
class SearchEngine(object): def __init__(self): serializer = JSONSerializer() serializer.mimetype = 'application/json' serializer.dumps = serializer.serialize serializer.loads = JSONDeserializer().deserialize self.es = Elasticsearch(hosts=settings.ELASTICSEARCH_HOSTS, serializer=serializer, **settings.ELASTICSEARCH_CONNECTION_OPTIONS) self.logger = logging.getLogger(__name__) def delete(self, **kwargs): """ Deletes a document from the index Pass an index, doc_type, and id to delete a specific document Pass a body with a query dsl to delete by query """ body = kwargs.pop('body', None) if body != None: try: data = [] refresh = kwargs.pop('refresh', False) for hit in helpers.scan(self.es, query=body, **kwargs): hit['_op_type'] = 'delete' data.append(hit) return helpers.bulk(self.es, data, refresh=refresh, **kwargs) except Exception as detail: self.logger.warning( '%s: WARNING: failed to delete document by query: %s \nException detail: %s\n' % (datetime.now(), body, detail)) raise detail else: try: return self.es.delete(ignore=[404], **kwargs) except Exception as detail: self.logger.warning( '%s: WARNING: failed to delete document: %s \nException detail: %s\n' % (datetime.now(), body, detail)) raise detail def delete_index(self, **kwargs): """ Deletes an entire index """ index = kwargs.get('index', '').strip() print 'deleting index : %s' % index return self.es.indices.delete(index=index, ignore=[400, 404]) def search(self, **kwargs): """ Search for an item in the index. Pass an index, doc_type, and id to get a specific document Pass a body with a query dsl to perform a search """ body = kwargs.get('body', None) index = kwargs.get('index', None) id = kwargs.get('id', None) if index is None: raise NotImplementedError( "You must specify an 'index' in your call to search") if id: if isinstance(id, list): kwargs.setdefault('body', {'ids': kwargs.pop('id')}) return self.es.mget(**kwargs) else: return self.es.get(**kwargs) ret = None try: ret = self.es.search(**kwargs) except Exception as detail: self.logger.warning( '%s: WARNING: search failed for query: %s \nException detail: %s\n' % (datetime.now(), body, detail)) pass return ret def index_term(self, term, id, context='', options={}): """ If the term is already indexed, then simply increment the count and add the id of the term to the existing index. If the term isn't indexed then add the index. id: a unique id associated with the term context: a uuid of a concept to associate with the term to render in the ui options: any additional information to associate with the term """ if term.strip(' \t\n\r') != '': already_indexed = False count = 1 ids = [id] try: #_id = unicode(term, errors='ignore').decode('utf-8').encode('ascii') _id = uuid.uuid3(uuid.NAMESPACE_DNS, '%s%s' % (hash(term), hash(context))) result = self.es.get(index='term', doc_type='value', id=_id, ignore=404) #print 'result: %s' % result if result['found'] == True: ids = result['_source']['ids'] if id not in ids: ids.append(id) else: ids = [id] self.index_data('term', 'value', { 'term': term, 'context': context, 'options': options, 'count': len(ids), 'ids': ids }, id=_id) except Exception as detail: self.logger.warning( '%s: WARNING: search failed to index term: %s \nException detail: %s\n' % (datetime.now(), term, detail)) raise detail def delete_terms(self, ids): """ If the term is referenced more than once, simply decrement the count and remove the id of the deleted term from the existing index.
If the term is only referenced once, then delete the index """ if not isinstance(ids, list): ids = [ids] for id in ids: result = self.es.search(index='term', doc_type='value', body={ "query": { "filtered": { "filter": { "terms": { "ids": [id] } }, "query": { "match_all": {} } } }, "from": 0, "size": 10 }, ignore=404) if 'hits' in result: for document in result['hits']['hits']: document['_source']['ids'].remove(id) count = len(document['_source']['ids']) if count > 0: document['_source']['count'] = count self.index_data('term', 'value', document['_source'], id=document['_id']) self.es.indices.refresh(index='term') else: self.delete(index='term', doc_type='value', id=document['_id']) def create_mapping(self, index, doc_type, fieldname='', fieldtype='string', fieldindex=None, body=None): """ Creates an Elasticsearch body for a single field given an index name and type name """ if not body: if fieldtype == 'geo_shape': body = { doc_type: { 'properties': { fieldname: { 'type': 'geo_shape', 'tree': 'geohash', 'precision': '1m' } } } } else: fn = {'type': fieldtype} if fieldindex: fn['index'] = fieldindex body = {doc_type: {'properties': {fieldname: fn}}} self.create_index(index=index, ignore=400) self.es.indices.put_mapping(index=index, doc_type=doc_type, body=body) def create_index(self, **kwargs): self.es.indices.create(**kwargs) def index_data(self, index=None, doc_type=None, body=None, idfield=None, id=None, **kwargs): """ Indexes a document or list of documents into Elasticsearch If "id" is supplied then will use that as the id of the document If "idfield" is supplied then will try to find that property in the document itself and use the value found for the id of the document """ if not isinstance(body, list): body = [body] for document in body: if idfield is not None: if isinstance(document, dict): id = document[idfield] else: id = getattr(document, idfield) try: self.es.index(index=index, doc_type=doc_type, body=document, id=id, **kwargs) except Exception as detail: self.logger.warning( '%s: WARNING: failed to index document: %s \nException detail: %s\n' % (datetime.now(), document, detail)) raise detail def bulk_index(self, data): return helpers.bulk(self.es, data, chunk_size=500, raise_on_error=True) def create_bulk_item(self, index, type, id, data): if not (self.isempty_or_none(index) or self.isempty_or_none(type) or self.isempty_or_none(id)): return [{ "index": { "_index": index, "_type": type, "_id": id } }, data] else: return False
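# Hedged usage sketch for create_mapping() above; the index, type, and field names
# are placeholders, and 'not_analyzed' assumes a pre-5.x string mapping. Passing
# fieldindex=None keeps the field's default indexing, while geo_shape fields get
# the geohash tree settings hardcoded above:
se = SearchEngine()
se.create_mapping('term', 'value', fieldname='term', fieldtype='string', fieldindex='not_analyzed')
se.create_mapping('resource', 'entity', fieldname='geom', fieldtype='geo_shape')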
except Exception, r: print Exception,":",r es = Elasticsearch("219.224.135.93") print "retry" print count_index if count_index % 10000 == 0: ts = time.time() print "%s per %s second" %(count_index, ts-tb) tb = ts else: exist_uid_list.append(user_id) count_uid += 1 if count_uid % 1000 == 0: multi_items = es.mget(index="activity", doc_type="manage", body={"ids": exist_uid_list}, _source=True)['docs'] exist_uid_list = [] for m_item in multi_items: m_item = m_item['_source'] update_item = compare_activity(item, m_item) xdata = expand_index_action(update_item) bulk_action.extend([xdata[0], xdata[1]]) count_index += 1 if count_index % 2000 == 0: while True: try: es.bulk(bulk_action, index="activity", doc_type="manage", timeout=30) bulk_action=[] break except Exception, r:
class ESearch(): def __init__(self): """ Initialize class parameters """ # Connection object self._es = None self._index_name = "article_data" self._hash_field = "URL" self._dict_of_duplicate_docs = {} def connect_to_es(self, host_name=ELASTIC_SEARCH_ENDPOINT): """ Establishes a connection to the Elasticsearch server. If the server is pingable, returns the connection object. Else returns None :return: connection-object """ self._es = Elasticsearch(hosts=[host_name], timeout=60) # Ping the connection to check if it's alive if self._es.ping(): return self._es return None def index_exists(self, index_name=None): if not index_name: index_name = self._index_name return self._es.indices.exists(index_name) def _make_mapping(self): """ Creates the index with the correct mapping :return: """ m = Mapping() # add fields m.field('Title', 'text') m.field('Text', 'text') m.field('Publish_Date', 'date') # date type complicates matters across websites m.field('URL', 'text') m.field('Scrape_Date', 'date') # date type complicates matters across websites m.field('Source', 'text') m.field('Search_Keyword', 'text') # save list as text? m.field('SE_Is_Risk', 'boolean') m.field('GP_Is_Risk', 'boolean') m.field('RG_Is_Risk', 'boolean') m.field('SE_Risk_Rating', 'float') m.field('GP_Risk_Rating', 'float') m.field('RG_Risk_Rating', 'float') m.field('SE_SnP_Open', 'float') m.field('SE_SnP_Close', 'float') m.field('SE_AbbV_Open', 'float') m.field('SE_AbbV_Close', 'float') m.field('SE_XBI_Open', 'float') m.field('SE_XBI_Close', 'float') m.field('SE_SnP_Open_Plus1', 'float') m.field('SE_SnP_Close_Plus1', 'float') m.field('SE_AbbV_Open_Plus1', 'float') m.field('SE_AbbV_Close_Plus1', 'float') m.field('SE_XBI_Open_Plus1', 'float') m.field('SE_XBI_Close_Plus1', 'float') m.field('SE_SentimentScore', 'float') m.field('SE_SentimentPolarity', 'float') m.field('CompositeScore', 'float') m.field('RG_FDA_Warning', 'boolean') m.field('GP_SentimentScore', 'float') m.field('GP_SentimentPolarity', 'float') m.field('GP_Location', 'text') m.field('GP_Country', 'text') m.field('Article_references', 'float') m.field('Is_source_type_RG', 'boolean') m.field('Is_source_type_SE', 'boolean') m.field('Is_source_type_GP', 'boolean') # save the mapping into the index try: m.save(self._index_name) except Exception as e: print("Could not save schema!", e) def create_index(self): """ Creates the index if it doesn't exist :return: """ # create the index if it doesn't exist if not self.index_exists(): try: self._es.indices.create(index=self._index_name) self._make_mapping() print("Index was created :", self.index_exists()) except Exception as e: print("~~~Index exists error") print(e) return -1 else: print("Index already exists", self._index_name) return 0 def get_index_mapping(self): """ Retrieves the index mapping :return: Index mapping JSON object if success, -1 if error """ try: return self._es.indices.get_mapping(index=self._index_name) except Exception as e: print("~~~Get index mapping error") print(e) return -1 def get_count(self, search_obj=None): return self._es.count(index=self._index_name, body=search_obj) def upload_dataframe(self, df): """ Uploads a dataframe into the index :param df: Dataframe (pandas) :return: 0 if success, -1 if failure """ def rec_to_actions(df): for record in df.to_dict(orient="records"): yield ('{ "index" : { "_index" : "%s", "_type" : "%s" }}' % (self._index_name, "_doc")) yield (json.dumps(record, default=int)) if not self.index_exists(): print("!!!INDEX DOES NOT EXIST -- RETURNING!!!") return -1 try: # make the bulk call, and get a response response
= self._es.bulk(rec_to_actions(df)) # return a dict if not response["errors"]: print("Records uploaded") else: print("Could not upload data ") print(response) return -1 except Exception as e: print("\nERROR:", e) return -1 return 0 # Process documents returned by the current search/scroll def _populate_dict_of_duplicate_docs(self, hits): for item in hits: combined_key = str(item['_source'][self._hash_field]) _id = item["_id"] # _Title = item["_source"]["Title"] hashval = hashlib.md5(combined_key.encode('utf-8')).digest() # If the hashval is new, then we will create a new key # in the dict_of_duplicate_docs, which will be # assigned a value of an empty array. # We then immediately push the _id onto the array. # If hashval already exists, then # we will just push the new _id onto the existing array self._dict_of_duplicate_docs.setdefault(hashval, []).append(_id) # Loop over all documents in the index, and populate the # dict_of_duplicate_docs data structure. def _scroll_over_all_docs(self): data = self._es.search(index=self._index_name, scroll='1m', body={"query": { "match_all": {} }}) # Get the scroll ID sid = data['_scroll_id'] scroll_size = len(data['hits']['hits']) # Before scroll, process current batch of hits self._populate_dict_of_duplicate_docs(data['hits']['hits']) while scroll_size > 0: data = self._es.scroll(scroll_id=sid, scroll='2m') # Process current batch of hits self._populate_dict_of_duplicate_docs(data['hits']['hits']) # Update the scroll ID sid = data['_scroll_id'] # Get the number of results that returned in the last scroll scroll_size = len(data['hits']['hits']) def _loop_over_hashes_and_remove_duplicates(self): urls_to_delete = [] ids_to_delete = [] # Search through the hash of doc values to see if any # duplicate hashes have been found for hashval, array_of_ids in self._dict_of_duplicate_docs.items(): if len(array_of_ids) > 1: # print("********** Duplicate docs hash=%s **********" % hashval) # Get the documents that have mapped to the current hasval matching_docs = self._es.mget(index=self._index_name, body={"ids": array_of_ids}) # Check if the URLs are truly the same URLs dict_url_ids = {} for doc in matching_docs['docs']: dict_url_ids.setdefault(doc["_source"].get("URL"), []).append(doc["_id"]) # remove only the first ID from the list dict_url_ids = { key: value[1:] for (key, value) in dict_url_ids.items() } for i in list(dict_url_ids.keys()): urls_to_delete.append(i) # Delete all the IDs now for i in list(dict_url_ids.values()): ids_to_delete.extend(i) for u in urls_to_delete: print(u) for idd in ids_to_delete: try: del_return = self._es.delete(index=self._index_name, id=idd) except Exception as e: print(e) break def remove_duplicates(self): self._scroll_over_all_docs() self._loop_over_hashes_and_remove_duplicates()
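# A standalone sketch of the dedup bookkeeping _populate_dict_of_duplicate_docs()
# above performs: documents are bucketed by the md5 digest of their URL, and any
# bucket holding more than one _id is a duplicate candidate. The hits are mocked:
import hashlib

buckets = {}
hits = [{'_id': '1', '_source': {'URL': 'http://a'}},
        {'_id': '2', '_source': {'URL': 'http://a'}},
        {'_id': '3', '_source': {'URL': 'http://b'}}]
for item in hits:
    hashval = hashlib.md5(str(item['_source']['URL']).encode('utf-8')).digest()
    buckets.setdefault(hashval, []).append(item['_id'])
duplicates = dict((h, ids) for h, ids in buckets.items() if len(ids) > 1)  # only 'http://a' qualifies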
class AKAGraph(object): def __init__( self, hosts=None, index_name=None, replicas=10, soft_selectors=None, hard_selectors=None, hyper_edge_scorer=None, shards=None, buffer_size=20, conn=None, num_identifier_downweight=0, popular_identifier_downweight=0, ): '''AKAGraph provides the interface to an elastic-search backed probabilistic graph proximity engine Its main operations are: * add a "record" containing various types of identifiers * query for those records "close" to a given identifier :param hosts: elasticsearch hosts :param index_name: the elasticsearch index name to use (or create) :param replicas: the number of monte-carlo samples to use :param soft_selectors: a list of identifiers to be considered not globally unique :param hard_selectors: a list of globally unique identifiers :param shards: number of elasticsearch shards :param buffer_size: how many updates to batch before committing them to elasticsearch :param conn: Elasticsearch connection object :param num_identifiers_downweight: records with many identifiers should have their identifiers bind more loosely to others :param popular_identifier_downweight: identifiers in many records should bind loosely ''' if conn is None: self.conn = Elasticsearch(hosts=hosts, retry_on_timeout=True, max_retries=5) else: self.conn = conn self.index = index_name self.shards = shards self.buffer_size = buffer_size self.record_buffer = [] self.edge_buffer = [] self.in_context = False if soft_selectors is None: soft_selectors = default_soft_selectors self.soft_selectors = set(soft_selectors) if hard_selectors is None: hard_selectors = default_hard_selectors self.hard_selectors = set(hard_selectors) if hyper_edge_scorer is not None: self.hyper_edge_scorer = hyper_edge_scorer else: unigrams, bigrams = load_ngrams() self.hyper_edge_scorer = \ lambda s: prob_username(s, unigrams, bigrams) self.replica_list = range(replicas) self.score_cutoff = .001 self.num_identifier_downweight = num_identifier_downweight self.popular_identifier_downweight = popular_identifier_downweight def __enter__(self): logger.debug('in context') self.in_context = True return self def __exit__(self, exc_type, exc_value, traceback): self.in_context = False if exc_type is None and exc_value is None and traceback is None: self.flush() def add(self, rec, analyze_and_union=True): '''add `rec` to ES; must be used inside a `with` statement ''' assert self.in_context, 'must use "with" statement to add docs' self.record_buffer.append((rec, analyze_and_union)) if len(self.record_buffer) >= self.buffer_size: self.flush_records() def add_edge(self, IDs, strength, evidence=None): ''' Adds an edge between all identifiers in the iterable IDs with the given strength. This does not create entries of type RECORD. It will simply do unions in the UNION_FIND type. However, if you have records ingested, and these id's correspond to the urls of those records, ingesting this way will link these records just as when full record-based ingest happens. If evidence is None, each call to add_edge should be thought of as offering independent evidence of a relationship. So if you call add_edge(["A", "B"], .5) twice, this is equivalent to calling it once with strength .75 However add_edge(["A", "B"], .5, 'foo') is idempotent add_edge must be used inside a `with` statement :param IDs: An iterable of identifiers to union probabilistically. These can be any string, but if the match urls of records, then querying based on those record's fields will work as expected. 
:param strength: 0 < strength <= 1 is a probability with which to union all edges in the IDs set :param evidence: evidence is used for fine-grained control over whether repeated calls with overlapping IDs sets are treated independently. For example, If you like A and B because they share a username "foo", and you like B and A because they share "foo", you want it to only link once. Supply "foo" as evidence and this will work as desired. If you later want to link A and B because they share an email address, this is independent evidence and will increase the proximity of A and B ''' assert self.in_context, 'must use "with" statement to add docs' self.edge_buffer.append((IDs, strength, evidence)) if len(self.edge_buffer) >= self.buffer_size: self.flush_edges() def flush(self): self.flush_records() self.flush_edges() def flush_edges(self): local_union_find = MemoryUnionFind( ) # this is purely an efficiency hack so we hit ES less redundantly for equivs, score, score_reason in self.edge_buffer: logger.debug('given equivs %r with %s strength and evidence %s', equivs, score, score_reason) self.probabilistically_unite_edges(equivs, score, score_reason, local_union_find) self.edge_buffer = self.edge_buffer[:0] def flush_records(self): '''Actually do the work to ingest records gathered by calls to `add`. All vertexes are their own roots on initial ingest; so this sets size to 1 iff the doc has not been ingested before. ''' if not self.conn.indices.exists(index=self.index): self.create_index() logger.debug('flushing ingest buffer (size: %d)', len(self.record_buffer)) actions = [] for rec, _ in self.record_buffer: actions.append({ '_index': self.index, '_type': RECORD_TYPE, '_id': rec['url'], '_source': rec, }) #actions.append({ # '_index': self.index, # '_type': ROOT_SIZE_TYPE, # '_id': rec['url'], # '_op_type': 'update', # 'doc': {'size': 1}, # set initial size to 1, TODO: # write tests to make sure this # doesn't change values when # re-ingesting # 'doc_as_upsert': True, #}) bulk(self.conn, actions, timeout='60s') # next find equivalent records via exact match, and union them # uh oh... self.sync() # as an efficiency hack we make a local, one-off union find so we hit ES less redundantly # batches are likely to have a lot of the same records to union, and we do not want # to tell ES about each of a set of redundant unions. If we catch them locally, we only # hit ES with new stuff local_union_find = MemoryUnionFind() # record_buffer has tuples where the [0] element is the record # and the [1] element is whether or not to union from it. 
Only process the records to union here # this supports adding records and *explicit* edges separately for rec, score, score_reason, equivs in self.find_equivs( [buf[0] for buf in self.record_buffer if buf[1]]): logger.debug('%s found %d (%f) equivs for %r --> %r', score_reason, len(equivs), score, rec['url'], equivs) equivs.add(rec['url']) self.probabilistically_unite_edges(equivs, score, score_reason, local_union_find) self.record_buffer = self.record_buffer[:0] def probabilistically_unite_edges(self, equivs, score, score_reason, local_union_find=None): if score == 1: equivs_len = len(equivs) if local_union_find: equivs = local_union_find.find_all_and_union(*equivs) logger.debug('had %s equivs, now have %s', equivs_len, len(equivs)) if len(equivs) < 1: return if score_reason: def include_replica(replica): return score == 1 or pseudorandom(score_reason, replica) < score else: # if no reason is given, make it random def include_replica(replica): del replica return score == 1 or uniform_random() < score for replica in self.replica_list: if include_replica(replica): self.unite(*[AKANode(url, replica) for url in equivs]) self.sync() def sync(self): '''Forces data to disk, so that data from all calls to `put` will be available for getting and querying. Generally, this should only be used in tests. ''' self.conn.indices.refresh(index=self.index) def analyze_clusters(self, limit=None): '''hunt for clusters and return a list of clusters sorted by size and an indication of their overlaps: .. code-block:: python [(size, [rec1, rec2, ...], {phone: ['+.....']}, ...] ''' #i_recs = islice(loader(path, hard_selectors=self.hard_selectors), limit) clusters = [] # consider only clusters of at least two records for root_url, count in self.get_all_roots(size_limit=1, candidates_limit=limit): del count cc = list(self.connected_component(root_url)) # The sequence of steps up to this point scans all # records, gathers their roots with counts of how many # records are under that root, then gets the CC for the # root... which should be the exact same set, right? This # may be the source of the big clusters in DIFFEO-2305 #assert len(cc) == count, (count, len(cc), cc) logger.debug('found connected component of %d: %r', len(cc), cc) recs = list(self.get_recs(*cc)) overlaps = find_overlaps(recs) _recs = {} for rec in recs: _recs[rec['url']] = rec clusters.append({ "count": len(cc), "records": _recs, "overlaps": overlaps }) clusters.sort(key=itemgetter('count'), reverse=True) cluster_sizes = Counter() for cluster in clusters: cluster_sizes[cluster['count']] += 1 data = { 'clusters': clusters, 'aggregate_stats': { 'largest': clusters[0]['count'], 'median': clusters[len(clusters) // 2]['count'], 'mean': sum([cluster['count'] for cluster in clusters]) / len(clusters), 'smallest': clusters[-1]['count'], 'histogram': dict(cluster_sizes), } } return data def find_equivs(self, records): '''For an iterable of `records`, yield tuples of `(record, score, score_reason, equivs)`, where a `record` from `records` might appear in more than one of the yielded tuples.
def analyze_clusters(self, limit=None):
    '''Hunt for clusters and return them sorted by size, with an
    indication of their overlaps:

    .. code-block:: python

        {'clusters': [{'count': size, 'records': {url: rec, ...},
                       'overlaps': {...}}, ...],
         'aggregate_stats': {...}}
    '''
    #i_recs = islice(loader(path, hard_selectors=self.hard_selectors), limit)
    clusters = []
    # consider only clusters of at least two records
    for root_url, count in self.get_all_roots(size_limit=1,
                                              candidates_limit=limit):
        del count
        cc = list(self.connected_component(root_url))
        # The sequence of steps up to this point scans all records,
        # gathers their roots with counts of how many records are under
        # each root, then gets the CC for the root... which should be
        # the exact same set, right?  This may be the source of the big
        # clusters in DIFFEO-2305.
        #assert len(cc) == count, (count, len(cc), cc)
        logger.debug('found connected component of %d: %r', len(cc), cc)
        recs = list(self.get_recs(*cc))
        overlaps = find_overlaps(recs)
        _recs = {}
        for rec in recs:
            _recs[rec['url']] = rec
        clusters.append({
            'count': len(cc),
            'records': _recs,
            'overlaps': overlaps,
        })
    clusters.sort(key=itemgetter('count'), reverse=True)
    cluster_sizes = Counter()
    for cluster in clusters:
        # clusters are dicts, so count sizes by key; the old
        # ``for size, _, _ in clusters`` unpacking only worked when
        # clusters were 3-tuples
        cluster_sizes[cluster['count']] += 1
    data = {
        'clusters': clusters,
        'aggregate_stats': {
            'largest': clusters[0]['count'],
            'median': clusters[len(clusters) // 2]['count'],
            'mean': sum([c['count'] for c in clusters]) / len(clusters),
            'smallest': clusters[-1]['count'],
            'histogram': dict(cluster_sizes),
        },
    }
    return data

def find_equivs(self, records):
    '''For an iterable of `records`, yield tuples of `(record, score,
    score_reason, equivs)`; a `record` from `records` may appear in more
    than one of the yielded tuples.
    '''
    queries = []
    scores = []
    rec_pointers = []  # carries a pointer to a record for each query
    for rec in records:
        # compute the score multiplier for this record
        weight = 1.0
        if self.num_identifier_downweight:
            count = sum([
                len(values) for key, values in rec.iteritems()
                if (key in self.hard_selectors
                    or key in self.soft_selectors)
            ])
            weight = math.exp(-self.num_identifier_downweight * (count - 1))
            logger.debug('weight = %f, %d, %s', weight, count, rec['url'])
        # first, gather one query covering all hard selectors
        hard_or_query = []
        for key, values in rec.iteritems():
            if key in self.hard_selectors:
                for v in values:
                    hard_or_query.append({'term': {key: v}})
        if hard_or_query:
            query = {
                'query': {
                    'constant_score': {
                        'filter': {
                            'bool': {
                                'should': hard_or_query,
                                'must_not': {
                                    'ids': {'values': [rec['url']]},
                                },
                            }
                        }
                    }
                }
            }
            queries.append({
                'index': self.index,
                'type': RECORD_TYPE,
                '_source_include': [],
            })
            queries.append(query)
            scores.append((weight, json.dumps(hard_or_query)))
            rec_pointers.append(rec)
        else:
            logger.debug('skipping because no hard identifiers')
        # next, make separate queries for each soft selector
        if not self.hyper_edge_scorer or len(self.replica_list) == 1:
            continue
        for key, values in rec.iteritems():
            if key not in self.soft_selectors:
                continue
            for v in values:
                if not v:
                    continue
                query = {
                    'query': {
                        'constant_score': {
                            'filter': {
                                'bool': {
                                    'should': [{'term': {key: v}}],
                                    'must_not': {
                                        'ids': {'values': [rec['url']]},
                                    },
                                }
                            }
                        }
                    }
                }
                score = self.hyper_edge_scorer(v)
                if score > self.score_cutoff:
                    logger.debug('soft selector score %.3f for %r', score, v)
                    queries.append({
                        'index': self.index,
                        'type': RECORD_TYPE,
                        '_source_include': [],
                    })
                    queries.append(query)
                    scores.append((score * weight, v))
                    rec_pointers.append(rec)

    # helper function for stripping hits down to just the URL
    def hits_generator(hits):
        for hit in hits['hits']['hits']:
            yield hit['_id']

    # now loop until we get answers for all the queries
    cursor = 0
    while queries:
        res = self.conn.msearch(body=queries)
        for hits in res['responses']:
            # drop the header and body rows for this query, then look up
            # the corresponding record and score
            queries.pop(0)
            queries.pop(0)
            record = rec_pointers[cursor]
            score, score_reason = scores[cursor]  # revise_score
            cursor += 1
            if 'error' in hits:
                # need to run msearch again, starting with the query
                # after the failed one
                if 'queue capacity' not in hits['error']:
                    logger.warn('Error getting equivs for %s: %s',
                                record, hits['error'])
                break
            else:
                hits_set = set(hits_generator(hits))
                if hits_set:
                    if self.score_cutoff < score < 1:
                        logger.debug('SOFT: %f, %s', score, score_reason)
                    if self.popular_identifier_downweight:
                        score = score * math.exp(
                            -self.popular_identifier_downweight
                            * (len(hits_set) - 1))
                    yield (record, score, score_reason, hits_set)

def get_recs(self, *urls):
    '''Get the records for one or more `urls`.
    '''
    if not urls:
        raise Exception('called get_recs with empty list')
    resp = self.conn.mget(index=self.index, doc_type=RECORD_TYPE,
                          body={'ids': urls})
    for rec in resp['docs']:
        if not rec['found']:
            yield {'url': rec['_id']}
            #raise KeyError('missing: %r' % rec['_id'])
        else:
            yield rec['_source']

def get_all_urls(self, limit=None):
    '''Get all URLs in the index.
    '''
    res = scan(self.conn, index=self.index, doc_type=RECORD_TYPE,
               _source_include=[],
               query={'query': {'match_all': {}}})
    for item in islice(res, limit):
        yield item['_id']
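# Hypothetical usage of the two getters above, assuming `graph` is an
# already-populated instance of the enclosing class:
urls = list(graph.get_all_urls(limit=10))
if urls:
    for rec in graph.get_recs(*urls):
        # get_recs yields a bare {'url': ...} stub for missing docs, so
        # 'url' is always present
        print(rec['url'])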
def find_urls_by_selector(self, selector, use_soft=True):
    if not self.conn.indices.exists(index=self.index):
        self.create_index()
    or_query = [{'term': {'url': selector}}]
    for key in self.hard_selectors:
        or_query.append({'term': {key: selector}})
    if use_soft:
        for key in self.soft_selectors:
            or_query.append({'term': {key: selector}})
        logger.debug('including soft_selectors: %r', self.soft_selectors)
    query = {
        'query': {
            'bool': {
                'should': or_query,
            }
        }
    }
    #logger.debug(json.dumps(query, indent=4, sort_keys=True))
    try:
        res = self.conn.search(index=self.index, doc_type=RECORD_TYPE,
                               _source_include=[], body=query)
        # commented-out alternative query body:
        #body={
        #    'query': {
        #        'multi_match': {
        #            'query': selector,
        #            'type': 'cross_fields',
        #            # TODO: blend soft_selectors into this
        #            'fields': self.hard_selectors,
        #        }
        #    }
        #})
        visited_urls = set()
        for hit in res['hits']['hits']:
            #logger.debug(hit['_score'])
            url = hit['_id']
            if url not in visited_urls:
                visited_urls.add(url)
                yield url
    except NotFoundError as exc:
        logger.warn('akagraph indexes do not exist yet: %s', exc)
        return
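# Hypothetical round trip with the selector search above: find every
# record sharing a selector value, then fetch the full records (`graph`
# as in the earlier sketch; the email address is made up):
hits = list(graph.find_urls_by_selector('foo@example.com'))
if hits:
    for rec in graph.get_recs(*hits):
        print(rec)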