Example #1
import sys
import os
import json
import gzip

from pyes import ES
from pyes.query import MatchAllQuery, FilteredQuery
from pyes.filters import RangeFilter
from pyes.utils import ESRange


def dump(start, end, backupdir, eshost):
    conn = ES(eshost)
    out = open('/tmp/out.json', 'w')
    _type = 'habakkuk'
    q = MatchAllQuery()
    q = FilteredQuery(q, RangeFilter(qrange=ESRange('created_at_date', start, end, include_upper=False)))
    q = q.search()
    # print json.dumps(json.loads(q.to_search_json()),indent=2)
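    # scan=True asks pyes for a scan/scroll search so every matching document
    # can be streamed back, not just the first page of results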
    resultset = conn.search(query=q, indices=_type + "-*", doc_types=[_type], scan=True)
    cnt = 0
    if not resultset.total:
        sys.stderr.write("no data for %s - %s\n" % (start, end))
        return

    try:
        sys.stderr.write("Will write %d lines to %s\n" % (resultset.total, out.name))
        while True:
            r = resultset.next()
            cnt += 1
            out.write(json.dumps(r) + '\n')
    except StopIteration:
        pass

    out.close()

    # gzip
    ext = start.strftime('%Y-%m-%d')
    backup = os.path.join(backupdir, "habakkuk-%s.json.gz" % ext)

    f_in = open(out.name, 'rb')
    f_out = gzip.open(backup, 'wb')
    f_out.writelines(f_in)
    f_out.close()
    f_in.close()
    sys.stderr.write("Created %s\n" % backup)
Example #2
    def get_entries(self):
        '''Get all entries for a team + their filter from ES/MozDef'''
        teamfilter = self.config['teamsetup'][self.team]['filter']
        es = ES((self.config['mozdef']['proto'], self.config['mozdef']['host'], self.config['mozdef']['port']))

        # Default filter - time period
        try:
            td = self.config['es'][teamfilter]['_time_period']
        except KeyError:
            debug('No _time_period defined, defaulting to 24h')
            td = 24
        begindateUTC = toUTC(datetime.now() - timedelta(hours=td))
        enddateUTC = toUTC(datetime.now())
        debug('Time range: {0} - {1}'.format(begindateUTC, enddateUTC))
        fDate = pyes.RangeQuery(qrange=pyes.ESRange('utctimestamp', from_value=begindateUTC, to_value=enddateUTC))

        # Load team queries from our json config.
        # Lists are "should" unless an item is negated with "!" then it's must_not
        # Single items are "must"
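        # A hypothetical teamfilter section illustrating the convention:
        #   "myteamfilter": {
        #       "_time_period": 48,
        #       "category": "event",
        #       "severity": ["INFO", "!DEBUG"]
        #   }
        # => must match category=event, should match severity=INFO,
        #    must_not match severity=DEBUG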
        query = pyes.query.BoolQuery()
        query.add_must(pyes.QueryStringQuery('asset.autogroup: "{}"'.format(self.team)))
        for item in self.config['es'][teamfilter]:
            # items starting with '_' are internal/reserved, like _time_period
            if item.startswith('_'):
                continue
            val = self.config['es'][teamfilter][item]
            if isinstance(val, list):
                for v in val:
                    if v.startswith("!"):
                        query.add_must_not(pyes.MatchQuery(item, v[1:]))
                    else:
                        query.add_should(pyes.MatchQuery(item, v))
            else:
                # strip the leading "!" marker, as in the list case above
                if val.startswith("!"):
                    query.add_must_not(pyes.MatchQuery(item, val[1:]))
                else:
                    query.add_must(pyes.MatchQuery(item, val))

        q = pyes.ConstantScoreQuery(query)
        q = pyes.FilteredQuery(q, pyes.BoolFilter(must=[fDate]))

        results = es.search(query=q, indices=self.config['es']['index'])

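        # fetch the complete raw response in one request: offset 0,
        # size = total hit count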
        raw = results._search_raw(0, results.count())
        # This doesn't do much, but pyes has no "close()" or similar functionality.
        es.force_bulk()

        if raw._shards.failed != 0:
            raise Exception("Some shards failed! {0}".format(raw._shards))

        # Nobody cares for the metadata past this point (all the goodies are in '_source')
        data = [hit._source for hit in raw.hits.hits]
        return data
Example #3
import sys
import os
import json
import gzip

from pyes import ES
from pyes.query import MatchAllQuery


def dump_topics(backupdir,
                eshost,
                _type,
                indices="topics-all"):
    conn = ES(eshost)
    out = open('/tmp/out.json', 'w')
    q = MatchAllQuery()
    q = q.search()

    resultset = conn.search(query=q, indices=indices, doc_types=[_type], scan=True)
    cnt = 0
    if not resultset.total:
        sys.stderr.write("no data\n")
        return

    try:
        sys.stderr.write("Will write %d lines to %s\n" % (resultset.total, out.name))
        while True:
            r = resultset.next()
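            # keep the document id alongside the _source fields so it
            # survives the dump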
            r['_id'] = r._meta.id
            cnt += 1
            out.write(json.dumps(r) + '\n')
    except StopIteration:
        pass

    out.close()

    # gzip
    backup = os.path.join(backupdir, "topics.{}.json.gz".format(_type))

    f_in = open(out.name, 'rb')
    f_out = gzip.open(backup, 'wb')
    f_out.writelines(f_in)
    f_out.close()
    f_in.close()
    sys.stderr.write("Created %s\n" % backup)
Example #4
class FullTextSearch(object):
    def __init__(self, server, settings=None):
        # These timeout and bulk_size parameters were determined through
        # trial and error to be necessary to avoid timeout errors when
        # generating indices on Sandbox. They should not be taken as gospel.
        self.conn = ES(server, timeout=120.0)  # Default timeout: 30.0
        self.conn.bulker.bulk_size = 25  # Default: 400
        if settings:
            self.settings = settings
        else:
            self.settings = {
                'index': {
                    'analysis': {
                        'analyzer': {
                            'ngram_analyzer': {
                                'tokenizer': 'keyword',
                                'filter': ['lowercase', 'filter_ngram'],
                                'type': 'custom'
                            }
                        },
                        'filter': {
                            'filter_ngram': {
                                'type': 'nGram',
                                'max_gram': 30,
                                'min_gram': 1
                            }
                        }
                    }
                }
            }
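        # With these settings, each indexed value is kept as a single keyword
        # token, lowercased, and expanded into 1-30 character ngrams, so
        # substring queries can match ngram-analyzed fields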
        self.refresh_index_cache()

    def search_index_text(self, query_string, fields="_all", **args):
        q = query.MatchQuery(fields, query_string)
        return self.search_index(q, **args)

    def search_index(self, query, indices=None, num_results=None, node_type=None):
        results = self.conn.search(
            query=query, indices=indices, doc_types=node_type)
        meta_list = [r.get_meta() for r in results[0:num_results]]
        node_dict = {}

        # fetch nodes grouped by type to reduce number of db calls
        key = itemgetter('type')
        for t, grouped_list in groupby(sorted(meta_list, key=key), key=key):
            ids = [meta['id'] for meta in grouped_list]
            for node in self.datastore.get_nodes(t, ids):
                node_dict[(node.type, node.key)] = node

        # return nodes in original order
        nodelist = [node_dict[(meta['type'], meta['id'])]
                    for meta in meta_list]

        return nodelist

    def create_index(self, type, indexed_variables, index_name):
        self.conn.indices.create_index_if_missing(index_name, self.settings)
        mapping = {}
        for arg in indexed_variables:
            mapping[arg] = {'boost': 1.0,
                            'analyzer': 'ngram_analyzer',
                            'type': 'string',
                            'term_vector': 'with_positions_offsets'}
        index_settings = {'index_analyzer': 'ngram_analyzer',
                          'search_analyzer': 'standard',
                          'properties': mapping}
        self.conn.indices.put_mapping(str(type), index_settings, [index_name])
        self.refresh_index_cache()
        self.populate_index(type, index_name)

    def refresh_index_cache(self):
        try:
            indices = self.conn.indices.get_mapping(raw=True)
        except exceptions.IndexMissingException:
            indices = {}
        else:
            indices = dict((k, v.get('mappings', {})) for k, v in indices.items())
        self.indices = indices

    def delete_index(self, index_name):
        self.conn.indices.delete_index_if_exists(index_name)
        self.refresh_index_cache()

    def populate_index(self, type, index_name):
        #add all the currently existing nodes into the index
        ref_node = self.datastore.get_reference_node(type)
        node_list = [rel.target_node for rel in ref_node.instance.outgoing]

        for node in node_list:
            key = node.key
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.delete(index_name, type, key)
            except ELASTIC_SEARCH_EXCEPTIONS as err:
                log.exception(err)
            self.conn.index(index_dict, index_name, type, key, bulk=True)
        self.conn.indices.refresh([index_name])

    def on_create(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.index(index_dict, index_name, node.type, node.key, bulk=True)
                self.conn.indices.refresh([index_name])
            except ELASTIC_SEARCH_EXCEPTIONS as err:
                log.exception(err)

    def on_delete(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            try:
                self.conn.delete(index_name, node.type, node.key, bulk=True)
                self.conn.indices.refresh([index_name])
            except ELASTIC_SEARCH_EXCEPTIONS as err:
                log.exception(err)

    def on_modify(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.delete(index_name, node.type, node.key)
                self.conn.index(index_dict, index_name, node.type, node.key, bulk=True)
                self.conn.indices.refresh([index_name])
            except ELASTIC_SEARCH_EXCEPTIONS as err:
                log.exception(err)

    def get_indices_of_type(self, type):
        type_indices = [
            key for key, value in self.indices.items()
            if type in value
        ]
        return type_indices

    def populate_index_document(self, node, index_name):
        indexed_variables = self.indices[index_name][node.type]['properties'].keys()
        index_dict = {
            field: node[field] for field in indexed_variables
        }
        return index_dict
Example #5
class ESDataSource(base.DataSource):
    """ElasticSearch data source stream."""
    def __init__(self, document_type, database=None, host=None, port=None,
                 expand=False, **elasticsearch_args):
        """Creates an ElasticSearch data source stream.

        :Attributes:
            * document_type: elasticsearch document type name
            * database: database (index) name
            * host: elasticsearch server host, default is ``localhost``
            * port: elasticsearch port, default is ``9200``
            * expand: expand dictionary values and treat children as top-level
                keys with dot '.' separated key path to the child.
        """
        self.document_type = document_type
        self.database_name = database
        self.host = host
        self.port = port
        self.elasticsearch_args = elasticsearch_args
        self.expand = expand
        self.connection = None
        self._fields = None

    def initialize(self):
        """Initialize ElasticSearch source stream:
        """
        args = self.elasticsearch_args.copy()
        server = ""
        if self.host:
            server = self.host
        if self.port:
            server += ":" + self.port

        self.connection = ES(server, **args)
        self.connection.default_indices = self.database_name
        self.connection.default_types = self.document_type

    def read_fields(self, limit=0):
        keys = []
        probes = {}
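        # Recursively walk each record (descending into nested dicts when
        # self.expand is set) and feed every value to a per-field type probe
        # so storage types can be inferred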

        def probe_record(record, parent=None):
            for key, value in record.items():
                if parent:
                    full_key = parent + "." + key
                else:
                    full_key = key

                if self.expand and type(value) == dict:
                    probe_record(value, full_key)
                    continue

                if not full_key in probes:
                    probe = dq.FieldTypeProbe(full_key)
                    probes[full_key] = probe
                    keys.append(full_key)
                else:
                    probe = probes[full_key]
                probe.probe(value)

        # document_type is a type name (string), so it has no .find(); sample
        # the index with a match-all query instead (assumption: a sample of
        # 200 documents when no limit is given)
        from itertools import islice
        from pyes.query import MatchAllQuery
        for record in islice(self.connection.search(MatchAllQuery()),
                             limit or 200):
            probe_record(record)

        fields = []

        for key in keys:
            probe = probes[key]
            field = base.Field(probe.field)

            storage_type = probe.unique_storage_type
            if not storage_type:
                field.storage_type = "unknown"
            elif storage_type == "unicode":
                field.storage_type = "string"
            else:
                field.storage_type = "unknown"
                field.concrete_storage_type = storage_type

            # FIXME: Set analytical type

            fields.append(field)

        self.fields = list(fields)
        return self.fields

    def rows(self):
        if not self.connection:
            raise RuntimeError("Stream is not initialized")
        from pyes.query import MatchAllQuery
        fields = self.fields.names()
        results = self.connection.search(MatchAllQuery(), search_type="scan", timeout="5m", size="200")
        return ESRowIterator(results, fields)

    def records(self):
        if not self.connection:
            raise RuntimeError("Stream is not initialized")
        from pyes.query import MatchAllQuery
        results = self.connection.search(MatchAllQuery(), search_type="scan", timeout="5m", size="200")
        return ESRecordIterator(results, self.expand)
Example #6

# NOTE: the header and opening of this snippet were lost in the listing;
# build_query is reconstructed from the surviving fragment, and the match on
# a 'content' field is a hypothetical placeholder.
def build_query(resource, explain=False):
    query = {
        'query': {'match': {'content': resource.content}},
        'size': 2,
        'explain': explain
    }

    return query


if __name__ == '__main__':

    explain = True

    # MongoDB
    host = 'localhost'
    port = 27017
    mcm = MongoConnectionManager(host, port, MongoCodec())
    database = 'processed'
    resource_collection = mcm.get_collection(database, 'resources', Resource)

    # ElasticSearch
    es = ES('localhost:9200', timeout=60)
    es_index = 'topic_tracking'

    # find the same resource
    resource = resource_collection.find_one_model()
    query = build_query(resource, explain)
    result = es.search(query, es_index, 'resource')

    for r in result['hits']['hits']:
        pprint(r)
    print('Tested resource %s: %s' % (resource._id, resource.uri))
Example #7
class FullTextSearch(object):
    def __init__(self, server, settings=None):
        self.conn = ES(server)
        self.indices = {}
        if settings:
            self.settings = settings
        else:
            self.settings = {
                'index': {
                    'analysis': {
                        'analyzer': {
                            'ngram_analyzer': {
                                'tokenizer': 'keyword',
                                'filter': ['lowercase', 'filter_ngram'],
                                'type': 'custom'
                            }
                        },
                        'filter': {
                            'filter_ngram': {
                                'type': 'nGram',
                                'max_gram': 30,
                                'min_gram': 1
                            }
                        }
                    }
                }
            }

    def search_index(self, type, index_names, query_string, num_results=-1):
        ns_index_names = [str(type) + "-_-" + index_name
                          for index_name in index_names]
        q = WildcardQuery('_all', query_string.lower())
        results = self.conn.search(query=q, indices=ns_index_names,
                                   doc_types=type)
        num_found = len(results)
        if num_results > num_found:
            num_results = num_found
        nodelist = [self.datastore.get_node(type, r['_id'])
                    for r in results['hits']['hits']]
        if num_results != -1:
            return nodelist[0:num_results]
        else:
            return nodelist

    def create_index(self, type, indexed_variables, index_name):
        ns_index_name = str(type) + "-_-" + index_name
        self.conn.delete_index_if_exists(ns_index_name)
        self.conn.create_index(ns_index_name, self.settings)
        mapping = {}
        for arg in indexed_variables:
            mapping[arg] = {'boost': 1.0,
                            'analyzer': 'ngram_analyzer',
                            'type': u'string',
                            'term_vector': 'with_positions_offsets'}
        index_settings = {'index_analyzer': 'ngram_analyzer',
                          'search_analyzer': 'standard',
                          'properties': mapping}
        self.conn.put_mapping(str(type), index_settings, [ns_index_name])
        self.refresh_index_cache()
        self.populate_index(type, index_name)

    def refresh_index_cache(self):
        self.indices = self.conn.get_indices()

    def delete_index(self, type, index_name):
        ns_index_name = str(type) + "-_-" + index_name
        self.conn.delete_index_if_exists(ns_index_name)
        self.refresh_index_cache()

    def populate_index(self, type, index_name):
        # add all the currently existing nodes into the index
        ns_index_name = str(type) + "-_-" + index_name
        ref_node = self.datastore.get_reference_node(type)
        node_list = [rel.target_node for rel in ref_node.instance.outgoing]
        mapping = self.conn.get_mapping(type, ns_index_name)
        for node in node_list:
            key = node.key
            index_dict = self.populate_index_document(type, ns_index_name,
                                                      node.attributes, mapping)
            try:
                self.conn.delete(ns_index_name, type, key)
            except exceptions.NotFoundException:
                pass
            try:
                self.conn.index(index_dict, ns_index_name, type, key)
            except exceptions.ElasticSearchParseException:
                pass
        self.conn.refresh([ns_index_name])

    def on_create(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for ns_index_name in type_indices:
            mapping = self.conn.get_mapping(node.type, ns_index_name)
            index_dict = self.populate_index_document(node.type, ns_index_name,
                                                      node.attributes, mapping)
            self.conn.index(index_dict, ns_index_name, node.type, node.key)
            self.conn.refresh([ns_index_name])

    def on_delete(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for ns_index_name in type_indices:
            try:
                self.conn.delete(ns_index_name, node.type, node.key)
                self.conn.refresh([ns_index_name])
            except exceptions.NotFoundException:
                pass

    def on_modify(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for ns_index_name in type_indices:
            mapping = self.conn.get_mapping(node.type, ns_index_name)
            index_dict = self.populate_index_document(node.type, ns_index_name,
                                                      node.attributes, mapping)
            try:
                self.conn.delete(ns_index_name, node.type, node.key)
                self.conn.index(index_dict, ns_index_name, node.type, node.key)
                self.conn.refresh([ns_index_name])
            except exceptions.NotFoundException:
                pass

    def get_indices_of_type(self, type):
        type_indices = []
        for index in self.indices.keys():
            if index.startswith(type + "-_-"):
                type_indices.append(index)
        return type_indices

    def populate_index_document(self, type, ns_index_name, attributes, mapping):
        indexed_variables = mapping[type]['properties'].keys()
        index_dict = {}
        for arg in indexed_variables:
            try:
                index_dict[arg] = attributes[arg]
            except KeyError:
                # this attribute doesn't exist for this node, so skip it
                pass
        return index_dict
Example #8
class ESDataSource(DataSource):
    """ElasticSearch data source stream."""
    def __init__(self,
                 document_type,
                 index=None,
                 host=None,
                 port=None,
                 expand=False,
                 **elasticsearch_args):
        """Creates an ElasticSearch data source stream.

        :Attributes:
            * document_type: elasticsearch document type name
            * index: index name, default is ``test``
            * host: elasticsearch server host, default is ``localhost``
            * port: elasticsearch port, default is ``9200``
            * expand: expand dictionary values and treat children as top-level
                keys with dot '.' separated key path to the child.
        """
        super(ESDataSource, self).__init__()
        self.document_type = document_type
        self.index = index or "test"
        self.host = host or "127.0.0.1"
        self.port = port or "9200"
        self.elasticsearch_args = elasticsearch_args
        self.expand = expand
        self.connection = None
        self._fields = None

    def initialize(self):
        """Initialize ElasticSearch source stream:
        """
        args = self.elasticsearch_args.copy()
        server = ""
        if self.host:
            server = self.host
        if self.port:
            server += ":" + self.port

        self.connection = ES(server, **args)
        self.connection.default_indices = self.index
        self.connection.default_types = self.document_type

    def read_fields(self, limit=0, collapse=False):
        keys = []
        probes = {}

        def probe_record(record, parent=None):
            for key, value in record.items():
                if parent:
                    full_key = parent + "." + key
                else:
                    full_key = key

                if self.expand and type(value) == dict:
                    probe_record(value, full_key)
                    continue

                if not full_key in probes:
                    probe = FieldTypeProbe(full_key)
                    probes[full_key] = probe
                    keys.append(full_key)
                else:
                    probe = probes[full_key]
                probe.probe(value)

        # document_type is a type name (string), so it has no .find(); sample
        # the index with a match-all query instead (assumption: a sample of
        # 200 documents when no limit is given)
        from itertools import islice
        from pyes.query import MatchAllQuery
        for record in islice(self.connection.search(MatchAllQuery()),
                             limit or 200):
            probe_record(record)

        fields = []

        for key in keys:
            probe = probes[key]
            field = Field(probe.field)

            storage_type = probe.unique_storage_type
            if not storage_type:
                field.storage_type = "unknown"
            elif storage_type == "unicode":
                field.storage_type = "string"
            else:
                field.storage_type = "unknown"
                field.concrete_storage_type = storage_type

            # FIXME: Set analytical type

            fields.append(field)

        self._fields = list(fields)
        return self._fields

    def rows(self):
        if not self.connection:
            raise RuntimeError("Stream is not initialized")
        from pyes.query import MatchAllQuery
        fields = self.field_names
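        # scan search streams the whole index in batches; size is the
        # per-shard batch size, not an overall result cap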
        results = self.connection.search(MatchAllQuery(),
                                         search_type="scan",
                                         timeout="5m",
                                         size="200")
        return ESRowIterator(results, fields)

    def records(self):
        if not self.connection:
            raise RuntimeError("Stream is not initialized")
        from pyes.query import MatchAllQuery
        results = self.connection.search(MatchAllQuery(),
                                         search_type="scan",
                                         timeout="5m",
                                         size="200")
        return ESRecordIterator(results, self.expand)
Example #9
class FullTextSearch(object):
    def __init__(self, server, settings=None):
        self.conn = ES(server)
        self.indices = {}
        if settings:
            self.settings = settings
        else:
            self.settings = {
                'index': {
                    'analysis': {
                        'analyzer': {
                            'ngram_analyzer': {
                                'tokenizer': 'keyword',
                                'filter': ['lowercase', 'filter_ngram'],
                                'type': 'custom'
                            }
                        },
                        'filter': {
                            'filter_ngram': {
                                'type': 'nGram',
                                'max_gram': 30,
                                'min_gram': 1
                            }
                        }
                    }
                }
            }

    def search_index(self, type, index_names, query_string, num_results=-1):
        ns_index_names = [
            str(type) + "-_-" + index_name for index_name in index_names
        ]
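        # lowercase the query to line up with the lowercase ngram analyzer,
        # then run a wildcard match against the catch-all _all field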
        q = WildcardQuery('_all', query_string.lower())
        results = self.conn.search(query=q,
                                   indices=ns_index_names,
                                   doc_types=type)
        num_found = len(results)
        if num_results > num_found:
            num_results = num_found
        nodelist = [
            self.datastore.get_node(type, r['_id'])
            for r in results['hits']['hits']
        ]
        if num_results != -1:
            return nodelist[0:num_results]
        else:
            return nodelist

    def create_index(self, type, indexed_variables, index_name):
        ns_index_name = str(type) + "-_-" + index_name
        self.conn.delete_index_if_exists(ns_index_name)
        self.conn.create_index(ns_index_name, self.settings)
        mapping = {}
        for arg in indexed_variables:
            mapping[arg] = {
                'boost': 1.0,
                'analyzer': 'ngram_analyzer',
                'type': u'string',
                'term_vector': 'with_positions_offsets'
            }
        index_settings = {
            'index_analyzer': 'ngram_analyzer',
            'search_analyzer': 'standard',
            'properties': mapping
        }
        self.conn.put_mapping(str(type), index_settings, [ns_index_name])
        self.refresh_index_cache()
        self.populate_index(type, index_name)

    def refresh_index_cache(self):
        self.indices = self.conn.get_indices()

    def delete_index(self, type, index_name):
        ns_index_name = str(type) + "-_-" + index_name
        self.conn.delete_index_if_exists(ns_index_name)
        self.refresh_index_cache()

    def populate_index(self, type, index_name):
        #add all the currently existing nodes into the index
        ns_index_name = str(type) + "-_-" + index_name
        ref_node = self.datastore.get_reference_node(type)
        node_list = [rel.target_node for rel in ref_node.instance.outgoing]
        mapping = self.conn.get_mapping(type, ns_index_name)
        for node in node_list:
            key = node.key
            index_dict = self.populate_index_document(type, ns_index_name,
                                                      node.attributes, mapping)
            try:
                self.conn.delete(ns_index_name, type, key)
            except exceptions.NotFoundException:
                pass
            try:
                self.conn.index(index_dict, ns_index_name, type, key)
            except exceptions.ElasticSearchParseException:
                pass
        self.conn.refresh([ns_index_name])

    def on_create(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for ns_index_name in type_indices:
            mapping = self.conn.get_mapping(node.type, ns_index_name)
            index_dict = self.populate_index_document(node.type, ns_index_name,
                                                      node.attributes, mapping)
            self.conn.index(index_dict, ns_index_name, node.type, node.key)
            self.conn.refresh([ns_index_name])

    def on_delete(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for ns_index_name in type_indices:
            try:
                self.conn.delete(ns_index_name, node.type, node.key)
                self.conn.refresh([ns_index_name])
            except exceptions.NotFoundException:
                pass

    def on_modify(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for ns_index_name in type_indices:
            mapping = self.conn.get_mapping(node.type, ns_index_name)
            index_dict = self.populate_index_document(node.type, ns_index_name,
                                                      node.attributes, mapping)
            try:
                self.conn.delete(ns_index_name, node.type, node.key)
                self.conn.index(index_dict, ns_index_name, node.type, node.key)
                self.conn.refresh([ns_index_name])
            except exceptions.NotFoundException:
                pass

    def get_indices_of_type(self, type):
        type_indices = []
        for index in self.indices.keys():
            if index.startswith(type + "-_-"):
                type_indices.append(index)
        return type_indices

    def populate_index_document(self, type, ns_index_name, attributes,
                                mapping):
        indexed_variables = mapping[type]['properties'].keys()
        index_dict = {}
        for arg in indexed_variables:
            try:
                index_dict[arg] = attributes[arg]
            except KeyError:
                # this attribute doesn't exist for this node, so skip it
                pass
        return index_dict
Example #10
class FullTextSearch(object):
    def __init__(self, server, settings=None):
        self.conn = ES(server)
        if settings:
            self.settings = settings
        else:
            self.settings = {
                'index': {
                    'analysis': {
                        'analyzer': {
                            'ngram_analyzer': {
                                'tokenizer': 'keyword',
                                'filter': ['lowercase', 'filter_ngram'],
                                'type': 'custom'
                            }
                        },
                        'filter': {
                            'filter_ngram': {
                                'type': 'nGram',
                                'max_gram': 30,
                                'min_gram': 1
                            }
                        }
                    }
                }
            }
        self.refresh_index_cache()

    def search_index_text(self, query_string, fields="_all", **args):
        q = query.TextQuery(fields, query_string)
        return self.search_index(q, **args)

    def search_index(self, query, indices=None, num_results=None, node_type=None):
        results = self.conn.search(
            query=query, indices=indices, doc_types=node_type)
        meta_list = [r.get_meta() for r in results[0:num_results]]
        node_dict = {}

        # fetch nodes grouped by type to reduce number of db calls
        key = itemgetter('type')
        for t, grouped_list in groupby(sorted(meta_list, key=key), key=key):
            ids = [meta['id'] for meta in grouped_list]
            for node in self.datastore.get_nodes(t, ids):
                node_dict[(node.type, node.key)] = node

        # return nodes in original order
        nodelist = [node_dict[(meta['type'], meta['id'])]
                    for meta in meta_list]

        return nodelist

    def create_index(self, type, indexed_variables, index_name):
        self.conn.create_index_if_missing(index_name, self.settings)
        mapping = {}
        for arg in indexed_variables:
            mapping[arg] = {'boost': 1.0,
                            'analyzer': 'ngram_analyzer',
                            'type': 'string',
                            'term_vector': 'with_positions_offsets'}
        index_settings = {'index_analyzer': 'ngram_analyzer',
                          'search_analyzer': 'standard',
                          'properties': mapping}
        self.conn.put_mapping(str(type), index_settings, [index_name])
        self.refresh_index_cache()
        self.populate_index(type, index_name)

    def refresh_index_cache(self):
        try:
            self.indices = self.conn.get_mapping()
        except exceptions.IndexMissingException:
            self.indices = {}

    def delete_index(self, index_name):
        self.conn.delete_index_if_exists(index_name)
        self.refresh_index_cache()

    def populate_index(self, type, index_name):
        #add all the currently existing nodes into the index
        ref_node = self.datastore.get_reference_node(type)
        node_list = [rel.target_node for rel in ref_node.instance.outgoing]

        for node in node_list:
            key = node.key
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.delete(index_name, type, key)
            except exceptions.NotFoundException:
                pass
            self.conn.index(index_dict, index_name, type, key)
        self.conn.refresh([index_name])

    def on_create(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            index_dict = self.populate_index_document(node, index_name)
            self.conn.index(index_dict, index_name, node.type, node.key)
            self.conn.refresh([index_name])

    def on_delete(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            try:
                self.conn.delete(index_name, node.type, node.key)
                self.conn.refresh([index_name])
            except exceptions.NotFoundException:
                pass

    def on_modify(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.delete(index_name, node.type, node.key)
                self.conn.index(index_dict, index_name, node.type, node.key)
                self.conn.refresh([index_name])
            except exceptions.NotFoundException:
                pass

    def get_indices_of_type(self, type):
        type_indices = [
            key for key, value in self.indices.items()
            if type in value
        ]
        return type_indices

    def populate_index_document(self, node, index_name):
        indexed_variables = self.indices[index_name][node.type]['properties'].keys()
        index_dict = {
            field: node[field] for field in indexed_variables
        }
        return index_dict