}
mq_client = message_queue_client_from_config(mq_config)
mq_codec = JSONCodec()
processed_resource_queue = 'processed_resources'

# ElasticSearch
es = ES('localhost:9200', timeout=60)
es_index = 'topic_tracking'

# dequeue one resource
mq_client.connect()
message = mq_client.get_message(processed_resource_queue)
resource = mq_codec.decode(message.body, Resource)
mq_client.delete_message(processed_resource_queue, message.id)
mq_client.disconnect()

# save the resource to mongo
resource._id = makeIdFromURI(resource.uri)
resource_collection.insert_model(resource)

# index the resource
for boost in [1, 1000]:
    es_doc = {}
    es_doc['content'] = resource.content
    es_doc['title'] = resource.title
    es_doc['entities'] = build_payload_string(resource.entities, boost)
    es_doc['terms'] = build_payload_string(resource.terms, boost)
    id = '%s_%d' % (resource._id, boost)
    r = es.index(es_doc, es_index, 'resource', id)
    pprint(r)
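
# A minimal sketch of reading the boosted variants back, assuming the index,
# type, and '<resource id>_<boost>' id convention used above and pyes'
# ES.get(index, doc_type, id):
for boost in [1, 1000]:
    doc = es.get(es_index, 'resource', '%s_%d' % (resource._id, boost))
    pprint(doc)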
class ESDataTarget(base.DataTarget):
    """ElasticSearch data target stream."""

    def __init__(self, document_type, database="test", host="127.0.0.1",
                 port="9200", truncate=False, expand=False, **elasticsearch_args):
        """Creates an ElasticSearch data target stream.

        :Attributes:
            * document_type: ElasticSearch document type name
            * database: database (index) name
            * host: ElasticSearch server host, default is ``127.0.0.1``
            * port: ElasticSearch port, default is ``9200``
            * expand: expand dictionary values and treat children as top-level
              keys with dot '.' separated key path to the child
            * truncate: delete existing data in the document_type. Default: False
        """
        self.document_type = document_type
        self.database_name = database
        self.host = host
        self.port = port
        self.elasticsearch_args = elasticsearch_args
        self.expand = expand
        self.truncate = truncate
        self._fields = None

    def initialize(self):
        """Initialize the ElasticSearch target stream."""
        from pyes.es import ES
        from pyes.exceptions import IndexAlreadyExistsException

        args = self.elasticsearch_args.copy()
        server = ""
        if self.host:
            server = self.host
        if self.port:
            server += ":" + self.port
        create = args.pop("create", False)
        replace = args.pop("replace", False)

        self.connection = ES(server, **args)
        self.connection.default_indices = self.database_name
        self.connection.default_types = self.document_type

        created = False
        if create:
            try:
                self.connection.create_index(self.database_name)
                self.connection.refresh(self.database_name)
                created = True
            except IndexAlreadyExistsException:
                pass

        if replace and not created:
            self.connection.delete_index_if_exists(self.database_name)
            time.sleep(2)
            self.connection.create_index(self.database_name)
            self.connection.refresh(self.database_name)

        if self.truncate:
            self.connection.delete_mapping(self.database_name, self.document_type)
            self.connection.refresh(self.database_name)

    def append(self, obj):
        record = obj
        if not isinstance(obj, dict):
            record = dict(zip(self.fields.names(), obj))
        if self.expand:
            record = expand_record(record)
        id = record.get('id') or record.get('_id')
        self.connection.index(record, self.database_name, self.document_type,
                              id, bulk=True)

    def finalize(self):
        self.connection.flush_bulk(forced=True)
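
# A minimal usage sketch for the target above, assuming a running ElasticSearch
# on 127.0.0.1:9200. The `create` flag is consumed from elasticsearch_args by
# initialize(); records are plain dicts, so no field metadata is required.
target = ESDataTarget("event", database="test", create=True)
target.initialize()
target.append({"id": 1, "message": "hello"})
target.append({"id": 2, "message": "world"})
target.finalize()  # flushes the pyes bulk buffer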
class FullTextSearch(object):
    def __init__(self, server, settings=None):
        # These timeout and bulk_size parameters were determined through
        # trial and error to be necessary to avoid timeout errors when
        # generating indices on Sandbox. They should not be taken as gospel.
        self.conn = ES(server, timeout=120.0)  # Default timeout: 30.0
        self.conn.bulker.bulk_size = 25  # Default: 400
        if settings:
            self.settings = settings
        else:
            self.settings = {
                'index': {
                    'analysis': {
                        'analyzer': {
                            'ngram_analyzer': {
                                'tokenizer': 'keyword',
                                'filter': ['lowercase', 'filter_ngram'],
                                'type': 'custom'
                            }
                        },
                        'filter': {
                            'filter_ngram': {
                                'type': 'nGram',
                                'max_gram': 30,
                                'min_gram': 1
                            }
                        }
                    }
                }
            }
        self.refresh_index_cache()

    def search_index_text(self, query_string, fields="_all", **args):
        q = query.MatchQuery(fields, query_string)
        return self.search_index(q, **args)

    def search_index(self, query, indices=None, num_results=None, node_type=None):
        results = self.conn.search(query=query, indices=indices, doc_types=node_type)
        meta_list = [r.get_meta() for r in results[0:num_results]]

        node_dict = {}
        # fetch nodes grouped by type to reduce number of db calls
        key = itemgetter('type')
        for t, grouped_list in groupby(sorted(meta_list, key=key), key=key):
            ids = [meta['id'] for meta in grouped_list]
            for node in self.datastore.get_nodes(t, ids):
                node_dict[(node.type, node.key)] = node

        # return nodes in original order
        nodelist = [node_dict[(meta['type'], meta['id'])] for meta in meta_list]
        return nodelist

    def create_index(self, type, indexed_variables, index_name):
        self.conn.indices.create_index_if_missing(index_name, self.settings)
        mapping = {}
        for arg in indexed_variables:
            mapping[arg] = {'boost': 1.0,
                            'analyzer': 'ngram_analyzer',
                            'type': 'string',
                            'term_vector': 'with_positions_offsets'}
        index_settings = {'index_analyzer': 'ngram_analyzer',
                          'search_analyzer': 'standard',
                          'properties': mapping}
        self.conn.indices.put_mapping(str(type), index_settings, [index_name])
        self.refresh_index_cache()
        self.populate_index(type, index_name)

    def refresh_index_cache(self):
        try:
            indices = self.conn.indices.get_mapping(raw=True)
        except exceptions.IndexMissingException:
            indices = {}
        else:
            indices = dict((k, v.get('mappings', {})) for k, v in indices.items())
        self.indices = indices

    def delete_index(self, index_name):
        self.conn.indices.delete_index_if_exists(index_name)
        self.refresh_index_cache()

    def populate_index(self, type, index_name):
        # add all the currently existing nodes into the index
        ref_node = self.datastore.get_reference_node(type)
        node_list = [rel.target_node for rel in ref_node.instance.outgoing]

        for node in node_list:
            key = node.key
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.delete(index_name, type, key)
            except ELASTIC_SEARCH_EXCEPTIONS as err:
                log.exception(err)
            self.conn.index(index_dict, index_name, type, key, bulk=True)
        self.conn.indices.refresh([index_name])

    def on_create(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.index(index_dict, index_name, node.type, node.key, bulk=True)
                self.conn.indices.refresh([index_name])
            except ELASTIC_SEARCH_EXCEPTIONS as err:
                log.exception(err)

    def on_delete(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            try:
                self.conn.delete(index_name, node.type, node.key, bulk=True)
                self.conn.indices.refresh([index_name])
            except ELASTIC_SEARCH_EXCEPTIONS as err:
                log.exception(err)

    def on_modify(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.delete(index_name, node.type, node.key)
                self.conn.index(index_dict, index_name, node.type, node.key, bulk=True)
                self.conn.indices.refresh([index_name])
            except ELASTIC_SEARCH_EXCEPTIONS as err:
                log.exception(err)

    def get_indices_of_type(self, type):
        type_indices = [
            key for key, value in self.indices.items()
            if type in value
        ]
        return type_indices

    def populate_index_document(self, node, index_name):
        indexed_variables = self.indices[index_name][node.type]['properties'].keys()
        index_dict = {field: node[field] for field in indexed_variables}
        return index_dict
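
# A minimal usage sketch for the class above, assuming an ES node on
# 127.0.0.1:9200 and a `datastore` attribute providing get_nodes() and
# get_reference_node() (the class references self.datastore but never
# constructs it); `my_graph_datastore` is a hypothetical such object.
fts = FullTextSearch('127.0.0.1:9200')
fts.datastore = my_graph_datastore
fts.create_index('person', ['name', 'bio'], 'person-index')
people = fts.search_index_text('ali', indices=['person-index'],
                               node_type='person', num_results=10)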
class FullTextSearch(object):
    def __init__(self, server, settings=None):
        self.conn = ES(server)
        self.indices = {}
        if settings:
            self.settings = settings
        else:
            self.settings = {
                'index': {
                    'analysis': {
                        'analyzer': {
                            'ngram_analyzer': {
                                'tokenizer': 'keyword',
                                'filter': ['lowercase', 'filter_ngram'],
                                'type': 'custom'
                            }
                        },
                        'filter': {
                            'filter_ngram': {
                                'type': 'nGram',
                                'max_gram': 30,
                                'min_gram': 1
                            }
                        }
                    }
                }
            }

    def search_index(self, type, index_names, query_string, num_results=-1):
        ns_index_names = [str(type) + "-_-" + index_name for index_name in index_names]
        q = WildcardQuery('_all', lower(query_string))
        results = self.conn.search(query=q, indices=ns_index_names, doc_types=type)
        num_found = len(results)
        if num_results > num_found:
            num_results = num_found
        nodelist = [self.datastore.get_node(type, r['_id'])
                    for r in results['hits']['hits']]
        if num_results != -1:
            return nodelist[0:num_results]
        else:
            return nodelist

    def create_index(self, type, indexed_variables, index_name):
        ns_index_name = str(type) + "-_-" + index_name
        self.conn.delete_index_if_exists(ns_index_name)
        self.conn.create_index(ns_index_name, self.settings)
        mapping = {}
        for arg in indexed_variables:
            mapping[arg] = {'boost': 1.0,
                            'analyzer': 'ngram_analyzer',
                            'type': u'string',
                            'term_vector': 'with_positions_offsets'}
        index_settings = {'index_analyzer': 'ngram_analyzer',
                          'search_analyzer': 'standard',
                          'properties': mapping}
        self.conn.put_mapping(str(type), index_settings, [ns_index_name])
        self.refresh_index_cache()
        self.populate_index(type, index_name)

    def refresh_index_cache(self):
        self.indices = self.conn.get_indices()

    def delete_index(self, type, index_name):
        ns_index_name = str(type) + "-_-" + index_name
        self.conn.delete_index_if_exists(ns_index_name)
        self.refresh_index_cache()

    def populate_index(self, type, index_name):
        # add all the currently existing nodes into the index
        ns_index_name = str(type) + "-_-" + index_name
        ref_node = self.datastore.get_reference_node(type)
        node_list = [rel.target_node for rel in ref_node.instance.outgoing]
        mapping = self.conn.get_mapping(type, ns_index_name)

        for node in node_list:
            key = node.key
            index_dict = self.populate_index_document(type, ns_index_name,
                                                      node.attributes, mapping)
            try:
                self.conn.delete(ns_index_name, type, key)
            except exceptions.NotFoundException:
                pass
            try:
                self.conn.index(index_dict, ns_index_name, type, key)
            except exceptions.ElasticSearchParseException:
                pass
        self.conn.refresh([ns_index_name])

    def on_create(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for ns_index_name in type_indices:
            mapping = self.conn.get_mapping(node.type, ns_index_name)
            index_dict = self.populate_index_document(node.type, ns_index_name,
                                                      node.attributes, mapping)
            self.conn.index(index_dict, ns_index_name, node.type, node.key)
            self.conn.refresh([ns_index_name])

    def on_delete(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for ns_index_name in type_indices:
            try:
                self.conn.delete(ns_index_name, node.type, node.key)
                self.conn.refresh([ns_index_name])
            except exceptions.NotFoundException:
                pass

    def on_modify(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for ns_index_name in type_indices:
            mapping = self.conn.get_mapping(node.type, ns_index_name)
            index_dict = self.populate_index_document(node.type, ns_index_name,
                                                      node.attributes, mapping)
            try:
                self.conn.delete(ns_index_name, node.type, node.key)
                self.conn.index(index_dict, ns_index_name, node.type, node.key)
                self.conn.refresh([ns_index_name])
            except exceptions.NotFoundException:
                pass

    def get_indices_of_type(self, type):
        type_indices = []
        for index in self.indices.keys():
            if index.startswith(type + "-_-"):
                type_indices.append(index)
        return type_indices

    def populate_index_document(self, type, ns_index_name, attributes, mapping):
        indexed_variables = mapping[type]['properties'].keys()
        index_dict = {}
        for arg in indexed_variables:
            try:
                index_dict[arg] = attributes[arg]
            except KeyError:
                # if this attribute doesn't exist for this node, just pass
                pass
        return index_dict
class ESDataTarget(DataTarget):
    """ElasticSearch data target stream."""

    def __init__(self, document_type, index="test", host="127.0.0.1",
                 port="9200", truncate=False, expand=False, **elasticsearch_args):
        """Creates an ElasticSearch data target stream.

        :Attributes:
            * document_type: ElasticSearch document type name
            * index: index (database) name
            * host: ElasticSearch server host, default is ``127.0.0.1``
            * port: ElasticSearch port, default is ``9200``
            * expand: expand dictionary values and treat children as top-level
              keys with dot '.' separated key path to the child
            * truncate: delete existing data in the document_type. Default: False
        """
        super(ESDataTarget, self).__init__()
        self.document_type = document_type
        self.index = index
        self.host = host
        self.port = port
        self.elasticsearch_args = elasticsearch_args
        self.expand = expand
        self.truncate = truncate
        self._fields = None

    def initialize(self):
        """Initialize the ElasticSearch target stream."""
        from pyes.es import ES
        from pyes.exceptions import IndexAlreadyExistsException, TypeMissingException

        args = self.elasticsearch_args.copy()
        server = ""
        if self.host:
            server = self.host
        if self.port:
            server += ":" + self.port
        create = args.pop("create", False)
        replace = args.pop("replace", False)

        self.connection = ES(server, **args)
        self.connection.default_indices = self.index
        self.connection.default_types = self.document_type

        created = False
        if create:
            try:
                self.connection.create_index(self.index)
                self.connection.refresh(self.index)
                created = True
            except IndexAlreadyExistsException:
                pass

        if replace and not created:
            self.connection.delete_index_if_exists(self.index)
            self.connection.refresh(self.index)
            self.connection.create_index(self.index)
            self.connection.refresh(self.index)

        if self.truncate:
            self.connection.delete_mapping(self.index, self.document_type)
            self.connection.refresh(self.index)

        # check mapping
        try:
            self.connection.get_mapping(self.document_type, self.index)
        except TypeMissingException:
            self.connection.put_mapping(self.document_type, self._get_mapping(), self.index)

    def _get_mapping(self):
        """Build an ES optimized mapping for the given fields."""
        from pyes.mappings import DocumentObjectField, IntegerField, StringField, \
            BooleanField, FloatField, DateField

        document = DocumentObjectField(name=self.document_type)
        for field in self.fields:
            st = field.storage_type
            if st == "unknown":
                # let es detect the type
                continue
            elif st in ["string", "text"]:
                document.add_property(StringField(name=field.name))
            elif st == "integer":
                document.add_property(IntegerField(name=field.name))
            elif st == "boolean":
                document.add_property(BooleanField(name=field.name))
            elif st == "date":
                document.add_property(DateField(name=field.name))
            elif st == "float":
                document.add_property(FloatField(name=field.name))
        return document

    def append(self, obj):
        record = obj
        if not isinstance(obj, dict):
            record = dict(zip(self.field_names, obj))
        if self.expand:
            record = expand_record(record)
        id = record.get('id') or record.get('_id')
        self.connection.index(record, self.index, self.document_type, id, bulk=True)

    def finalize(self):
        self.connection.flush_bulk(forced=True)
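
# A sketch of how _get_mapping() turns field metadata into a pyes mapping.
# The Field namedtuple is a hypothetical stand-in exposing only the two
# attributes the method reads (name, storage_type); a real pipeline would
# supply its own field objects through the DataTarget's `fields`.
from collections import namedtuple

Field = namedtuple("Field", ["name", "storage_type"])

target = ESDataTarget("person", index="people")
target.fields = [Field("name", "string"),
                 Field("age", "integer"),
                 Field("active", "boolean")]
mapping = target._get_mapping()
# initialize() passes this object to connection.put_mapping() when the
# "person" type has no mapping yet.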
dataset = shelve.open("samples.shelve")

mapping = {u'description': {'boost': 1.0,
                            'index': 'analyzed',
                            'store': 'yes',
                            'type': u'string',
                            "term_vector": "with_positions_offsets"},
           u'name': {'boost': 1.0,
                     'index': 'analyzed',
                     'store': 'yes',
                     'type': u'string',
                     "term_vector": "with_positions_offsets"},
           u'age': {'store': 'yes',
                    'type': u'integer'},
           }

conn.create_index("test-index")
conn.put_mapping("test-type", {'properties': mapping}, ["test-index"])

start = datetime.now()
for k, userdata in dataset.items():
    # conn.index(userdata, "test-index", "test-type", k)
    conn.index(userdata, "test-index", "test-type", k, bulk=True)
conn.force_bulk()
end = datetime.now()
print "time:", end - start

dataset.close()
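
# A short follow-up sketch: querying the freshly bulk-indexed data. Index and
# type names match the script above; "joe" is just a made-up sample value, and
# refresh() makes the bulk writes visible to search before querying.
from pyes.query import TermQuery

conn.refresh(["test-index"])
results = conn.search(query=TermQuery("name", "joe"),
                      indices=["test-index"], doc_types=["test-type"])
# this era of pyes returns the raw response dict
for hit in results["hits"]["hits"]:
    print hit["_source"]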
from pyes.es import ES
import random
import datetime

now = datetime.datetime.now()
es = ES()
for i in range(1000):
    d = {'created_at': now - datetime.timedelta(seconds=i),
         'level': 'ERROR' if random.random() < 0.1 else 'WARN',
         'message': "Test message"}
    es.index(d, 'my_index', 'my_type')
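
# A hedged follow-up: counting the ERROR documents generated above. Index,
# type, and field names come from the loop; the query term is lowercased
# because the default string analyzer lowercases values at index time.
from pyes.query import TermQuery

es.refresh(['my_index'])  # newer pyes also exposes this as es.indices.refresh
errors = es.search(query=TermQuery('level', 'error'),
                   indices=['my_index'], doc_types=['my_type'])
# newer pyes returns a ResultSet with .total; older releases return the raw
# dict, where the count is errors['hits']['total']
print('matched %s ERROR documents' % errors.total)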
class FullTextSearch(object):
    def __init__(self, server, settings=None):
        self.conn = ES(server)
        if settings:
            self.settings = settings
        else:
            self.settings = {
                'index': {
                    'analysis': {
                        'analyzer': {
                            'ngram_analyzer': {
                                'tokenizer': 'keyword',
                                'filter': ['lowercase', 'filter_ngram'],
                                'type': 'custom'
                            }
                        },
                        'filter': {
                            'filter_ngram': {
                                'type': 'nGram',
                                'max_gram': 30,
                                'min_gram': 1
                            }
                        }
                    }
                }
            }
        self.refresh_index_cache()

    def search_index_text(self, query_string, fields="_all", **args):
        q = query.TextQuery(fields, query_string)
        return self.search_index(q, **args)

    def search_index(self, query, indices=None, num_results=None, node_type=None):
        results = self.conn.search(query=query, indices=indices, doc_types=node_type)
        meta_list = [r.get_meta() for r in results[0:num_results]]

        node_dict = {}
        # fetch nodes grouped by type to reduce number of db calls
        key = itemgetter('type')
        for t, grouped_list in groupby(sorted(meta_list, key=key), key=key):
            ids = [meta['id'] for meta in grouped_list]
            for node in self.datastore.get_nodes(t, ids):
                node_dict[(node.type, node.key)] = node

        # return nodes in original order
        nodelist = [node_dict[(meta['type'], meta['id'])] for meta in meta_list]
        return nodelist

    def create_index(self, type, indexed_variables, index_name):
        self.conn.create_index_if_missing(index_name, self.settings)
        mapping = {}
        for arg in indexed_variables:
            mapping[arg] = {'boost': 1.0,
                            'analyzer': 'ngram_analyzer',
                            'type': 'string',
                            'term_vector': 'with_positions_offsets'}
        index_settings = {'index_analyzer': 'ngram_analyzer',
                          'search_analyzer': 'standard',
                          'properties': mapping}
        self.conn.put_mapping(str(type), index_settings, [index_name])
        self.refresh_index_cache()
        self.populate_index(type, index_name)

    def refresh_index_cache(self):
        try:
            self.indices = self.conn.get_mapping()
        except exceptions.IndexMissingException:
            self.indices = {}

    def delete_index(self, index_name):
        self.conn.delete_index_if_exists(index_name)
        self.refresh_index_cache()

    def populate_index(self, type, index_name):
        # add all the currently existing nodes into the index
        ref_node = self.datastore.get_reference_node(type)
        node_list = [rel.target_node for rel in ref_node.instance.outgoing]

        for node in node_list:
            key = node.key
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.delete(index_name, type, key)
            except exceptions.NotFoundException:
                pass
            self.conn.index(index_dict, index_name, type, key)
        self.conn.refresh([index_name])

    def on_create(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            index_dict = self.populate_index_document(node, index_name)
            self.conn.index(index_dict, index_name, node.type, node.key)
            self.conn.refresh([index_name])

    def on_delete(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            try:
                self.conn.delete(index_name, node.type, node.key)
                self.conn.refresh([index_name])
            except exceptions.NotFoundException:
                pass

    def on_modify(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.delete(index_name, node.type, node.key)
                self.conn.index(index_dict, index_name, node.type, node.key)
                self.conn.refresh([index_name])
            except exceptions.NotFoundException:
                pass

    def get_indices_of_type(self, type):
        type_indices = [
            key for key, value in self.indices.items()
            if type in value
        ]
        return type_indices

    def populate_index_document(self, node, index_name):
        indexed_variables = self.indices[index_name][node.type]['properties'].keys()
        index_dict = {field: node[field] for field in indexed_variables}
        return index_dict