}
    mq_client = message_queue_client_from_config(mq_config)
    mq_codec = JSONCodec()
    processed_resource_queue = 'processed_resources'

    # ElasticSearch
    es = ES('localhost:9200', timeout=60)
    es_index = 'topic_tracking'

    # dequeue one resource
    mq_client.connect()
    message = mq_client.get_message(processed_resource_queue)
    resource = mq_codec.decode(message.body, Resource)
    mq_client.delete_message(processed_resource_queue, message.id)
    mq_client.disconnect()

    # save the resource to mongo
    resource._id = makeIdFromURI(resource.uri)
    resource_collection.insert_model(resource)

    # index the resource
    for boost in [1, 1000]:
        es_doc = {}
        es_doc['content'] = resource.content
        es_doc['title'] = resource.title
        es_doc['entities'] = build_payload_string(resource.entities, boost)
        es_doc['terms'] = build_payload_string(resource.terms, boost)
        id = '%s_%d' % (resource._id, boost)
        r = es.index(es_doc, es_index, 'resource', id)
        pprint(r)
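The helper build_payload_string is not shown above; here is a minimal, hypothetical sketch of what it might do, assuming the term|payload convention used by ElasticSearch's delimited payload token filter (the helper name and arguments come from the call sites above, the body is an assumption):

def build_payload_string(items, boost):
    # Hypothetical reconstruction: attach the numeric boost as a payload to each
    # term, e.g. ['python', 'search'] with boost=1000 -> "python|1000 search|1000".
    return ' '.join('%s|%d' % (item, boost) for item in items)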
Example #2
class ESDataTarget(base.DataTarget):
    """docstring for ClassName
    """
    def __init__(self, document_type, database="test", host="127.0.0.1", port="9200",
                 truncate=False, expand=False, **elasticsearch_args):
        """Creates a ElasticSearch data target stream.

        :Attributes:
            * document_ElasticSearch elasticsearch document_type name
            * database: database name
            * host: ElasticSearch database server host, default is ``localhost``
            * port: ElasticSearch port, default is ``9200``
            * expand: expand dictionary values and treat children as top-level keys with dot '.'
                separated key path to the child..
            * truncate: delete existing data in the document_type. Default: False
        """
        self.document_type = document_type
        self.database_name = database
        self.host = host
        self.port = port
        self.elasticsearch_args = elasticsearch_args
        self.expand = expand
        self.truncate = truncate
        self._fields = None

    def initialize(self):
        """Initialize ElasticSearch source stream:
        """
        from pyes.es import ES
        from pyes.exceptions import IndexAlreadyExistsException

        args = self.elasticsearch_args.copy()
        server = ""
        if self.host:
            server = self.host
        if self.port:
            server += ":" + self.port

        create = args.pop("create", False)
        replace = args.pop("replace", False)

        self.connection = ES(server, **args)
        self.connection.default_indices = self.database_name
        self.connection.default_types = self.document_type

        created = False
        if create:
            try:
                self.connection.create_index(self.database_name)
                self.connection.refresh(self.database_name)
                created = True
            except IndexAlreadyExistsException:
                pass

        if replace and not created:
            self.connection.delete_index_if_exists(self.database_name)
            time.sleep(2)
            self.connection.create_index(self.database_name)
            self.connection.refresh(self.database_name)

        if self.truncate:
            self.connection.delete_mapping(self.database_name, self.document_type)
            self.connection.refresh(self.database_name)

    def append(self, obj):
        record = obj
        if not isinstance(obj, dict):
            record = dict(zip(self.fields.names(), obj))

        if self.expand:
            record = expand_record(record)

        id = record.get('id') or record.get('_id')
        self.connection.index(record, self.database_name, self.document_type, id, bulk=True)

    def finalize(self):
        self.connection.flush_bulk(forced=True)
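A minimal usage sketch for this target, assuming the brewery DataTarget base class is importable and records arrive as plain dicts (the index name and record values below are illustrative):

target = ESDataTarget('user', database='people', host='127.0.0.1', port='9200', create=True)
target.initialize()
target.append({'id': 1, 'name': 'Ada', 'age': 36})
target.append({'id': 2, 'name': 'Grace', 'age': 45})
target.finalize()  # flush the pending bulk operations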
Example #3
class FullTextSearch(object):
    def __init__(self, server, settings=None):
        # These timeout and bulk_size parameters were determined through
        # trial and error to be necessary to avoid timeout errors when
        # generating indices on Sandbox. They should not be taken as gospel.
        self.conn = ES(server, timeout=120.0)  # Default timeout: 30.0
        self.conn.bulker.bulk_size = 25  # Default: 400
        if settings:
            self.settings = settings
        else:
            self.settings = {
                'index': {
                    'analysis': {
                        'analyzer': {
                            'ngram_analyzer': {
                                'tokenizer': 'keyword',
                                'filter': ['lowercase', 'filter_ngram'],
                                'type': 'custom'
                            }
                        },
                        'filter': {
                            'filter_ngram': {
                                'type': 'nGram',
                                'max_gram': 30,
                                'min_gram': 1
                            }
                        }
                    }
                }
            }
        self.refresh_index_cache()

    def search_index_text(self, query_string, fields="_all", **args):
        q = query.MatchQuery(fields, query_string)
        return self.search_index(q, **args)

    def search_index(self, query, indices=None, num_results=None, node_type=None):
        results = self.conn.search(
            query=query, indices=indices, doc_types=node_type)
        meta_list = [r.get_meta() for r in results[0:num_results]]
        node_dict = {}

        # fetch nodes grouped by type to reduce number of db calls
        key = itemgetter('type')
        for t, grouped_list in groupby(sorted(meta_list, key=key), key=key):
            ids = [meta['id'] for meta in grouped_list]
            for node in self.datastore.get_nodes(t, ids):
                node_dict[(node.type, node.key)] = node

        # return nodes in original order
        nodelist = [node_dict[(meta['type'], meta['id'])]
                    for meta in meta_list]

        return nodelist

    def create_index(self, type, indexed_variables, index_name):
        self.conn.indices.create_index_if_missing(index_name, self.settings)
        mapping = {}
        for arg in indexed_variables:
            mapping[arg] = {'boost': 1.0,
                            'analyzer': 'ngram_analyzer',
                            'type': 'string',
                            'term_vector': 'with_positions_offsets'}
        index_settings = {'index_analyzer': 'ngram_analyzer',
                          'search_analyzer': 'standard',
                          'properties': mapping}
        self.conn.indices.put_mapping(str(type), index_settings, [index_name])
        self.refresh_index_cache()
        self.populate_index(type, index_name)

    def refresh_index_cache(self):
        try:
            indices = self.conn.indices.get_mapping(raw=True)
        except exceptions.IndexMissingException:
            indices = {}
        else:
            indices = dict((k, v.get('mappings', {})) for k, v in indices.items())
        self.indices = indices

    def delete_index(self, index_name):
        self.conn.indices.delete_index_if_exists(index_name)
        self.refresh_index_cache()

    def populate_index(self, type, index_name):
        #add all the currently existing nodes into the index
        ref_node = self.datastore.get_reference_node(type)
        node_list = [rel.target_node for rel in ref_node.instance.outgoing]

        for node in node_list:
            key = node.key
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.delete(index_name, type, key)
            except ELASTIC_SEARCH_EXCEPTIONS as err:
                log.exception(err)
            self.conn.index(index_dict, index_name, type, key, bulk=True)
        self.conn.indices.refresh([index_name])

    def on_create(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.index(index_dict, index_name, node.type, node.key, bulk=True)
                self.conn.indices.refresh([index_name])
            except ELASTIC_SEARCH_EXCEPTIONS as err:
                log.exception(err)

    def on_delete(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            try:
                self.conn.delete(index_name, node.type, node.key, bulk=True)
                self.conn.indices.refresh([index_name])
            except ELASTIC_SEARCH_EXCEPTIONS as err:
                log.exception(err)

    def on_modify(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.delete(index_name, node.type, node.key)
                self.conn.index(index_dict, index_name, node.type, node.key, bulk=True)
                self.conn.indices.refresh([index_name])
            except ELASTIC_SEARCH_EXCEPTIONS as err:
                log.exception(err)

    def get_indices_of_type(self, type):
        type_indices = [
            key for key, value in self.indices.items()
            if type in value
        ]
        return type_indices

    def populate_index_document(self, node, index_name):
        indexed_variables = self.indices[index_name][node.type]['properties'].keys()
        index_dict = {
            field: node[field] for field in indexed_variables
        }
        return index_dict
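A minimal usage sketch, assuming a reachable ElasticSearch server and that the surrounding application attaches a graph datastore to the instance (the class reads self.datastore but never assigns it):

fts = FullTextSearch('127.0.0.1:9200')
fts.datastore = graph_datastore  # assumed external object providing get_nodes() and get_reference_node()
fts.create_index('person', ['name', 'bio'], 'person_index')
matches = fts.search_index_text('ada', fields='name', indices=['person_index'], node_type='person')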
Example #4
class FullTextSearch(object):
    def __init__(self, server, settings=None):
        self.conn = ES(server)
        self.indices = {}
        if settings:
            self.settings = settings
        else:
            self.settings = { 
                'index': {
                    'analysis' : {
                        'analyzer' : {                             
                            'ngram_analyzer' : {                   
                                'tokenizer' : 'keyword',
                                'filter' : ['lowercase', 'filter_ngram'],
                                'type' : 'custom'
                            }  
                        },
                        'filter' : {
                            'filter_ngram' : {                                 
                                'type' : 'nGram',
                                'max_gram' : 30,
                                'min_gram' : 1                                 
                            }                           
                        }
                    }
                }
            }

    def search_index(self, type, index_names, query_string, num_results=-1):
        ns_index_names = [str(type) + "-_-" + index_name for index_name in index_names]
        q = WildcardQuery('_all', lower(query_string))
        results = self.conn.search(query=q, indices=ns_index_names, doc_types=type)
        num_found = len(results)
        if num_results > num_found:
            num_results = num_found
        nodelist = [self.datastore.get_node(type, r['_id']) for r in results['hits']['hits']]
        if num_results != -1:
            return nodelist[0:num_results]
        else:
            return nodelist

    def create_index(self, type, indexed_variables, index_name):
        ns_index_name = str(type) + "-_-" + index_name
        self.conn.delete_index_if_exists(ns_index_name)
        self.conn.create_index(ns_index_name, self.settings)
        mapping = {}
        for arg in indexed_variables:
            mapping[arg] = {'boost': 1.0,
                            'analyzer': 'ngram_analyzer',
                            'type': u'string',
                            'term_vector': 'with_positions_offsets'}
        index_settings = {'index_analyzer': 'ngram_analyzer',
                          'search_analyzer': 'standard',
                          'properties': mapping}
        self.conn.put_mapping(str(type), index_settings, [ns_index_name])
        self.refresh_index_cache()
        self.populate_index(type, index_name)

    def refresh_index_cache(self):
        self.indices = self.conn.get_indices()

    def delete_index(self, type, index_name):
        ns_index_name = str(type) + "-_-" + index_name
        self.conn.delete_index_if_exists(ns_index_name)
        self.refresh_index_cache()

    def populate_index(self, type, index_name):
        #add all the currently existing nodes into the index
        ns_index_name = str(type) + "-_-" + index_name
        ref_node = self.datastore.get_reference_node(type)
        node_list = [rel.target_node for rel in ref_node.instance.outgoing]
        mapping = self.conn.get_mapping(type, ns_index_name)
        for node in node_list:
            key = node.key
            index_dict = self.populate_index_document(type, ns_index_name, node.attributes, mapping)
            try:
                self.conn.delete(ns_index_name, type, key)
            except exceptions.NotFoundException:
                pass
            try:
                self.conn.index(index_dict, ns_index_name, type, key)
            except exceptions.ElasticSearchParseException:
                pass
        self.conn.refresh([ns_index_name])

    def on_create(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for ns_index_name in type_indices:
            mapping = self.conn.get_mapping(node.type, ns_index_name)
            index_dict = self.populate_index_document(node.type, ns_index_name, node.attributes, mapping)
            self.conn.index(index_dict, ns_index_name, node.type, node.key)
            self.conn.refresh([ns_index_name])

    def on_delete(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for ns_index_name in type_indices:
            try:
                self.conn.delete(ns_index_name, node.type, node.key)
                self.conn.refresh([ns_index_name])
            except exceptions.NotFoundException:
                pass
           
    def on_modify(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for ns_index_name in type_indices:
            mapping = self.conn.get_mapping(node.type, ns_index_name)
            index_dict = self.populate_index_document(node.type, ns_index_name, node.attributes, mapping)
            try:
                self.conn.delete(ns_index_name, node.type, node.key)
                self.conn.index(index_dict, ns_index_name, node.type, node.key)
                self.conn.refresh([ns_index_name])
            except exceptions.NotFoundException:
                pass

    def get_indices_of_type(self, type):
        type_indices = []
        for index in self.indices.keys():
            if index.startswith(type + "-_-"):
                type_indices.append(index)
        return type_indices

    def populate_index_document(self, type, ns_index_name, attributes, mapping):
        indexed_variables = mapping[type]['properties'].keys()
        index_dict = {}
        for arg in indexed_variables:
            try:
                index_dict[arg] = attributes[arg]
            except KeyError:
                #if this attribute doesn't exist for this node, just pass
                pass
        return index_dict
Example #5
mapping = {
    u'description': {
        'boost': 1.0,
        'index': 'analyzed',
        'store': 'true',
        'type': u'string',
        "term_vector": "with_positions_offsets"
    },
    u'name': {
        'boost': 1.0,
        'index': 'analyzed',
        'store': 'true',
        'type': u'string',
        "term_vector": "with_positions_offsets"
    },
    u'age': {
        'store': 'true',
        'type': u'integer'
    },
}
conn.create_index("test-index")
conn.put_mapping("test-type", {'properties': mapping}, ["test-index"])

start = datetime.now()
for k, userdata in dataset.items():
    #    conn.index(userdata, "test-index", "test-type", k)
    conn.index(userdata, "test-index", "test-type", k, bulk=True)
conn.force_bulk()
end = datetime.now()

print "time:", end - start
dataset.close()
Example #6
class ESDataTarget(DataTarget):
    """docstring for ClassName
    """

    def __init__(self, document_type, index="test", host="127.0.0.1", port="9200", truncate=False, expand=False,
                 **elasticsearch_args):
        """Creates a ElasticSearch data target stream.

        :Attributes:
            * document_ElasticSearch elasticsearch document_type name
            * index: database name
            * host: ElasticSearch database server host, default is ``localhost``
            * port: ElasticSearch port, default is ``9200``
            * expand: expand dictionary values and treat children as top-level keys with dot '.'
                separated key path to the child..
            * truncate: delete existing data in the document_type. Default: False
        """
        super(ESDataTarget, self).__init__()
        self.document_type = document_type
        self.index = index
        self.host = host
        self.port = port
        self.elasticsearch_args = elasticsearch_args
        self.expand = expand
        self.truncate = truncate
        self._fields = None

    def initialize(self):
        """
        Initialize ElasticSearch source stream:
        """
        from pyes.es import ES
        from pyes.exceptions import IndexAlreadyExistsException

        args = self.elasticsearch_args.copy()
        server = ""
        if self.host:
            server = self.host
        if self.port:
            server += ":" + self.port

        create = args.pop("create", False)
        replace = args.pop("replace", False)

        self.connection = ES(server, **args)
        self.connection.default_indices = self.index
        self.connection.default_types = self.document_type

        created = False
        if create:
            try:
                self.connection.create_index(self.index)
                self.connection.refresh(self.index)
                created = True
            except IndexAlreadyExistsException:
                pass

        if replace and not created:
            self.connection.delete_index_if_exists(self.index)
            self.connection.refresh(self.index)
            self.connection.create_index(self.index)
            self.connection.refresh(self.index)

        if self.truncate:
            self.connection.delete_mapping(self.index, self.document_type)
            self.connection.refresh(self.index)
        # ensure a mapping exists for the document type; create one from the fields if missing
        try:
            self.connection.get_mapping(self.document_type, self.index)
        except TypeMissingException:
            self.connection.put_mapping(self.document_type, self._get_mapping(), self.index)

    def _get_mapping(self):
        """Build an ES optimized mapping for the given fields"""
        from pyes.mappings import DocumentObjectField, IntegerField, StringField, BooleanField, FloatField, DateField

        document = DocumentObjectField(name=self.document_type)
        for field in self.fields:
            st = field.storage_type
            if st == "unknown":
                # let ES detect the field type automatically
                continue
            elif st in ["string", "text"]:
                document.add_property(StringField(name=field.name))
            elif st == "integer":
                document.add_property(IntegerField(name=field.name))
            elif st == "boolean":
                document.add_property(BooleanField(name=field.name))
            elif st == "date":
                document.add_property(DateField(name=field.name))
            elif st == "float":
                document.add_property(FloatField(name=field.name))

        return document


    def append(self, obj):
        record = obj
        if not isinstance(obj, dict):
            record = dict(zip(self.field_names, obj))

        if self.expand:
            record = expand_record(record)

        id = record.get('id') or record.get('_id')
        self.connection.index(record, self.index, self.document_type, id, bulk=True)

    def finalize(self):
        self.connection.flush_bulk(forced=True)
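_get_mapping iterates over self.fields, which is never populated in this snippet; a hedged sketch of the minimal field descriptor it expects (only .name and .storage_type are read), with illustrative values:

from collections import namedtuple

# Assumed descriptor shape; the real brewery field objects carry more metadata.
Field = namedtuple('Field', ['name', 'storage_type'])

fields = [Field('name', 'string'), Field('age', 'integer'), Field('active', 'boolean')]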
Example #7
import shelve
from datetime import datetime

from pyes.es import ES

# Assumed local ElasticSearch connection; the original snippet does not show how `conn` is created.
conn = ES("127.0.0.1:9200")

dataset = shelve.open("samples.shelve")

mapping = {u'description': {'boost': 1.0,
                            'index': 'analyzed',
                            'store': 'yes',
                            'type': u'string',
                            "term_vector": "with_positions_offsets"
},
           u'name': {'boost': 1.0,
                     'index': 'analyzed',
                     'store': 'yes',
                     'type': u'string',
                     "term_vector": "with_positions_offsets"
           },
           u'age': {'store': 'yes',
                    'type': u'integer'},
           }
conn.create_index("test-index")
conn.put_mapping("test-type", {'properties': mapping}, ["test-index"])

start = datetime.now()
for k, userdata in dataset.items():
#    conn.index(userdata, "test-index", "test-type", k)
    conn.index(userdata, "test-index", "test-type", k, bulk=True)
conn.force_bulk()
end = datetime.now()

print "time:", end - start
dataset.close()
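The snippet assumes samples.shelve already exists; a hedged sketch of how such a sample dataset could be generated (shelve keys must be strings, and the values mirror the mapping above):

import shelve

db = shelve.open("samples.shelve")
for i in range(10000):
    db[str(i)] = {'name': 'user %d' % i,
                  'age': 20 + (i % 50),
                  'description': 'sample user number %d' % i}
db.close()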

Example #8
class ESDataTarget(DataTarget):
    """docstring for ClassName
    """
    def __init__(self,
                 document_type,
                 index="test",
                 host="127.0.0.1",
                 port="9200",
                 truncate=False,
                 expand=False,
                 **elasticsearch_args):
        """Creates a ElasticSearch data target stream.

        :Attributes:
            * document_ElasticSearch elasticsearch document_type name
            * index: database name
            * host: ElasticSearch database server host, default is ``localhost``
            * port: ElasticSearch port, default is ``9200``
            * expand: expand dictionary values and treat children as top-level keys with dot '.'
                separated key path to the child..
            * truncate: delete existing data in the document_type. Default: False
        """
        super(ESDataTarget, self).__init__()
        self.document_type = document_type
        self.index = index
        self.host = host
        self.port = port
        self.elasticsearch_args = elasticsearch_args
        self.expand = expand
        self.truncate = truncate
        self._fields = None

    def initialize(self):
        """
        Initialize ElasticSearch source stream:
        """
        from pyes.es import ES
        from pyes.exceptions import IndexAlreadyExistsException

        args = self.elasticsearch_args.copy()
        server = ""
        if self.host:
            server = self.host
        if self.port:
            server += ":" + self.port

        create = args.pop("create", False)
        replace = args.pop("replace", False)

        self.connection = ES(server, **args)
        self.connection.default_indices = self.index
        self.connection.default_types = self.document_type

        created = False
        if create:
            try:
                self.connection.create_index(self.index)
                self.connection.refresh(self.index)
                created = True
            except IndexAlreadyExistsException:
                pass

        if replace and not created:
            self.connection.delete_index_if_exists(self.index)
            self.connection.refresh(self.index)
            self.connection.create_index(self.index)
            self.connection.refresh(self.index)

        if self.truncate:
            self.connection.delete_mapping(self.index, self.document_type)
            self.connection.refresh(self.index)
        # ensure a mapping exists for the document type; create one from the fields if missing
        try:
            self.connection.get_mapping(self.document_type, self.index)
        except TypeMissingException:
            self.connection.put_mapping(self.document_type,
                                        self._get_mapping(), self.index)

    def _get_mapping(self):
        """Build an ES optimized mapping for the given fields"""
        from pyes.mappings import DocumentObjectField, IntegerField, StringField, BooleanField, FloatField, DateField

        document = DocumentObjectField(name=self.document_type)
        for field in self.fields:
            st = field.storage_type
            if st == "unknown":
                # let ES detect the field type automatically
                continue
            elif st in ["string", "text"]:
                document.add_property(StringField(name=field.name))
            elif st == "integer":
                document.add_property(IntegerField(name=field.name))
            elif st == "boolean":
                document.add_property(BooleanField(name=field.name))
            elif st == "date":
                document.add_property(DateField(name=field.name))
            elif st == "float":
                document.add_property(FloatField(name=field.name))

        return document

    def append(self, obj):
        record = obj
        if not isinstance(obj, dict):
            record = dict(zip(self.field_names, obj))

        if self.expand:
            record = expand_record(record)

        id = record.get('id') or record.get('_id')
        self.connection.index(record,
                              self.index,
                              self.document_type,
                              id,
                              bulk=True)

    def finalize(self):
        self.connection.flush_bulk(forced=True)
Example #9
class FullTextSearch(object):
    def __init__(self, server, settings=None):
        self.conn = ES(server)
        self.indices = {}
        if settings:
            self.settings = settings
        else:
            self.settings = {
                'index': {
                    'analysis': {
                        'analyzer': {
                            'ngram_analyzer': {
                                'tokenizer': 'keyword',
                                'filter': ['lowercase', 'filter_ngram'],
                                'type': 'custom'
                            }
                        },
                        'filter': {
                            'filter_ngram': {
                                'type': 'nGram',
                                'max_gram': 30,
                                'min_gram': 1
                            }
                        }
                    }
                }
            }

    def search_index(self, type, index_names, query_string, num_results=-1):
        ns_index_names = [
            str(type) + "-_-" + index_name for index_name in index_names
        ]
        q = WildcardQuery('_all', lower(query_string))
        results = self.conn.search(query=q,
                                   indices=ns_index_names,
                                   doc_types=type)
        num_found = len(results)
        if (num_results > num_found):
            num_results = num_found
        nodelist = [
            self.datastore.get_node(type, r['_id'])
            for r in results['hits']['hits']
        ]
        if (num_results != -1):
            return nodelist[0:num_results]
        else:
            return nodelist

    def create_index(self, type, indexed_variables, index_name):
        ns_index_name = str(type) + "-_-" + index_name
        self.conn.delete_index_if_exists(ns_index_name)
        self.conn.create_index(ns_index_name, self.settings)
        mapping = {}
        for arg in indexed_variables:
            mapping[arg] = {
                'boost': 1.0,
                'analyzer': 'ngram_analyzer',
                'type': u'string',
                'term_vector': 'with_positions_offsets'
            }
        index_settings = {
            'index_analyzer': 'ngram_analyzer',
            'search_analyzer': 'standard',
            'properties': mapping
        }
        self.conn.put_mapping(str(type), index_settings, [ns_index_name])
        self.refresh_index_cache()
        self.populate_index(type, index_name)

    def refresh_index_cache(self):
        self.indices = self.conn.get_indices()

    def delete_index(self, type, index_name):
        ns_index_name = str(type) + "-_-" + index_name
        self.conn.delete_index_if_exists(ns_index_name)
        self.refresh_index_cache()

    def populate_index(self, type, index_name):
        #add all the currently existing nodes into the index
        ns_index_name = str(type) + "-_-" + index_name
        ref_node = self.datastore.get_reference_node(type)
        node_list = [rel.target_node for rel in ref_node.instance.outgoing]
        mapping = self.conn.get_mapping(type, ns_index_name)
        for node in node_list:
            key = node.key
            index_dict = self.populate_index_document(type, ns_index_name,
                                                      node.attributes, mapping)
            try:
                self.conn.delete(ns_index_name, type, key)
            except exceptions.NotFoundException:
                pass
            try:
                self.conn.index(index_dict, ns_index_name, type, key)
            except exceptions.ElasticSearchParseException:
                pass
        self.conn.refresh([ns_index_name])

    def on_create(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for ns_index_name in type_indices:
            mapping = self.conn.get_mapping(node.type, ns_index_name)
            index_dict = self.populate_index_document(node.type, ns_index_name,
                                                      node.attributes, mapping)
            self.conn.index(index_dict, ns_index_name, node.type, node.key)
            self.conn.refresh([ns_index_name])

    def on_delete(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for ns_index_name in type_indices:
            try:
                self.conn.delete(ns_index_name, node.type, node.key)
                self.conn.refresh([ns_index_name])
            except exceptions.NotFoundException:
                pass

    def on_modify(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for ns_index_name in type_indices:
            mapping = self.conn.get_mapping(node.type, ns_index_name)
            index_dict = self.populate_index_document(node.type, ns_index_name,
                                                      node.attributes, mapping)
            try:
                self.conn.delete(ns_index_name, node.type, node.key)
                self.conn.index(index_dict, ns_index_name, node.type, node.key)
                self.conn.refresh([ns_index_name])
            except exceptions.NotFoundException:
                pass

    def get_indices_of_type(self, type):
        type_indices = []
        for index in self.indices.keys():
            if index.startswith(type + "-_-"):
                type_indices.append(index)
        return type_indices

    def populate_index_document(self, type, ns_index_name, attributes,
                                mapping):
        indexed_variables = mapping[type]['properties'].keys()
        index_dict = {}
        for arg in indexed_variables:
            try:
                index_dict[arg] = attributes[arg]
            except KeyError:
                #if this attribute doesn't exist for this node, just pass
                pass
        return index_dict
Example #10
from pyes.es import ES
import random
import datetime

now = datetime.datetime.now()
es = ES()
for i in range(1000):
    d = {'created_at': now - datetime.timedelta(seconds=i),
            'level': 'ERROR' if random.random() < 0.1 else 'WARN',
            'message': "Test message"}
    es.index(d, 'my_index', 'my_type')
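A hedged follow-up sketch showing how the indexed log entries could be queried back with pyes; the lowercase term assumes the default analyzer has lowercased the 'level' values:

from pyes.query import TermQuery

es.refresh('my_index')  # make the freshly indexed documents searchable
errors = es.search(query=TermQuery('level', 'error'), indices='my_index', doc_types='my_type')
print len(errors), "ERROR entries found"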
Example #11
class FullTextSearch(object):
    def __init__(self, server, settings=None):
        self.conn = ES(server)
        if settings:
            self.settings = settings
        else:
            self.settings = {
                'index': {
                    'analysis': {
                        'analyzer': {
                            'ngram_analyzer': {
                                'tokenizer': 'keyword',
                                'filter': ['lowercase', 'filter_ngram'],
                                'type': 'custom'
                            }
                        },
                        'filter': {
                            'filter_ngram': {
                                'type': 'nGram',
                                'max_gram': 30,
                                'min_gram': 1
                            }
                        }
                    }
                }
            }
        self.refresh_index_cache()

    def search_index_text(self, query_string, fields="_all", **args):
        q = query.TextQuery(fields, query_string)
        return self.search_index(q, **args)

    def search_index(self, query, indices=None, num_results=None, node_type=None):
        results = self.conn.search(
            query=query, indices=indices, doc_types=node_type)
        meta_list = [r.get_meta() for r in results[0:num_results]]
        node_dict = {}

        # fetch nodes grouped by type to reduce number of db calls
        key = itemgetter('type')
        for t, grouped_list in groupby(sorted(meta_list, key=key), key=key):
            ids = [meta['id'] for meta in grouped_list]
            for node in self.datastore.get_nodes(t, ids):
                node_dict[(node.type, node.key)] = node

        # return nodes in original order
        nodelist = [node_dict[(meta['type'], meta['id'])]
                    for meta in meta_list]

        return nodelist

    def create_index(self, type, indexed_variables, index_name):
        self.conn.create_index_if_missing(index_name, self.settings)
        mapping = {}
        for arg in indexed_variables:
            mapping[arg] = {'boost': 1.0,
                            'analyzer': 'ngram_analyzer',
                            'type': 'string',
                            'term_vector': 'with_positions_offsets'}
        index_settings = {'index_analyzer': 'ngram_analyzer',
                          'search_analyzer': 'standard',
                          'properties': mapping}
        self.conn.put_mapping(str(type), index_settings, [index_name])
        self.refresh_index_cache()
        self.populate_index(type, index_name)

    def refresh_index_cache(self):
        try:
            self.indices = self.conn.get_mapping()
        except exceptions.IndexMissingException:
            self.indices = {}

    def delete_index(self, index_name):
        self.conn.delete_index_if_exists(index_name)
        self.refresh_index_cache()

    def populate_index(self, type, index_name):
        #add all the currently existing nodes into the index
        ref_node = self.datastore.get_reference_node(type)
        node_list = [rel.target_node for rel in ref_node.instance.outgoing]

        for node in node_list:
            key = node.key
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.delete(index_name, type, key)
            except exceptions.NotFoundException:
                pass
            self.conn.index(index_dict, index_name, type, key)
        self.conn.refresh([index_name])

    def on_create(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            index_dict = self.populate_index_document(node, index_name)
            self.conn.index(index_dict, index_name, node.type, node.key)
            self.conn.refresh([index_name])

    def on_delete(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            try:
                self.conn.delete(index_name, node.type, node.key)
                self.conn.refresh([index_name])
            except exceptions.NotFoundException:
                pass

    def on_modify(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.delete(index_name, node.type, node.key)
                self.conn.index(index_dict, index_name, node.type, node.key)
                self.conn.refresh([index_name])
            except exceptions.NotFoundException:
                pass

    def get_indices_of_type(self, type):
        type_indices = [
            key for key, value in self.indices.items()
            if type in value
        ]
        return type_indices

    def populate_index_document(self, node, index_name):
        indexed_variables = self.indices[index_name][node.type]['properties'].keys()
        index_dict = {
            field: node[field] for field in indexed_variables
        }
        return index_dict
Example #12
class ESDataTarget(base.DataTarget):
    """docstring for ClassName
    """
    def __init__(self,
                 document_type,
                 database="test",
                 host="127.0.0.1",
                 port="9200",
                 truncate=False,
                 expand=False,
                 **elasticsearch_args):
        """Creates a ElasticSearch data target stream.

        :Attributes:
            * document_ElasticSearch elasticsearch document_type name
            * database: database name
            * host: ElasticSearch database server host, default is ``localhost``
            * port: ElasticSearch port, default is ``9200``
            * expand: expand dictionary values and treat children as top-level keys with dot '.'
                separated key path to the child..
            * truncate: delete existing data in the document_type. Default: False
        """
        self.document_type = document_type
        self.database_name = database
        self.host = host
        self.port = port
        self.elasticsearch_args = elasticsearch_args
        self.expand = expand
        self.truncate = truncate
        self._fields = None

    def initialize(self):
        """Initialize ElasticSearch source stream:
        """
        from pyes.es import ES
        from pyes.exceptions import IndexAlreadyExistsException

        args = self.elasticsearch_args.copy()
        server = ""
        if self.host:
            server = self.host
        if self.port:
            server += ":" + self.port

        create = args.pop("create", False)
        replace = args.pop("replace", False)

        self.connection = ES(server, **args)
        self.connection.default_indices = self.database_name
        self.connection.default_types = self.document_type

        created = False
        if create:
            try:
                self.connection.create_index(self.database_name)
                self.connection.refresh(self.database_name)
                created = True
            except IndexAlreadyExistsException:
                pass

        if replace and not created:
            self.connection.delete_index_if_exists(self.database_name)
            time.sleep(2)
            self.connection.create_index(self.database_name)
            self.connection.refresh(self.database_name)

        if self.truncate:
            self.connection.delete_mapping(self.database_name,
                                           self.document_type)
            self.connection.refresh(self.database_name)

    def append(self, obj):
        record = obj
        if not isinstance(obj, dict):
            record = dict(zip(self.fields.names(), obj))

        if self.expand:
            record = expand_record(record)

        id = record.get('id') or record.get('_id')
        self.connection.index(record,
                              self.database_name,
                              self.document_type,
                              id,
                              bulk=True)

    def finalize(self):
        self.connection.flush_bulk(forced=True)