def dump(start, end, backupdir, eshost):
    """Dump one date range of ``habakkuk`` documents to a gzipped JSON-lines file.

    Runs a scan search over the ``habakkuk-*`` indices for documents whose
    ``created_at_date`` falls between ``start`` and ``end`` (upper bound
    excluded), writes one JSON document per line to ``/tmp/out.json`` and
    then compresses that file to ``<backupdir>/habakkuk-<YYYY-MM-DD>.json.gz``,
    where the date is taken from ``start``.

    :param start: lower bound of the range; also used to name the backup.
    :param end: upper bound of the range (not included).
    :param backupdir: directory that receives the ``.json.gz`` file.
    :param eshost: server specification handed to ``ES``.
    :returns: ``None``. Progress and the "no data" notice go to stderr.
    """
    _type = 'habakkuk'
    tmp_path = '/tmp/out.json'

    conn = ES(eshost)
    q = MatchAllQuery()
    q = FilteredQuery(q, RangeFilter(qrange=ESRange('created_at_date', start, end,
                                                    include_upper=False)))
    q = q.search()
    resultset = conn.search(query=q, indices=_type + "-*", doc_types=[_type], scan=True)

    # Check for data before touching the temp file so the early return
    # cannot leave an open handle behind.
    if not resultset.total:
        sys.stderr.write("no data for %s - %s\n" % (start, end))
        return

    with open(tmp_path, 'w') as out:
        sys.stderr.write("Will write %d lines to %s\n" % (resultset.total, out.name))
        try:
            while True:
                out.write(json.dumps(resultset.next()) + '\n')
        except StopIteration:
            # The result set signals exhaustion this way.
            pass

    # gzip
    ext = datetime.strftime(start, '%Y-%m-%d')
    backup = os.path.join(backupdir, "habakkuk-%s.json.gz" % ext)
    with open(tmp_path, 'rb') as f_in:
        f_out = gzip.open(backup, 'wb')
        try:
            f_out.writelines(f_in)
        finally:
            f_out.close()
    sys.stderr.write("Created %s\n" % backup)
def get_entries(self):
    """Get all entries for a team + their filter from ES/MozDef.

    Builds a boolean query from ``self.config['es'][<team filter>]``:

    * keys starting with ``_`` are reserved (e.g. ``_time_period``) and skipped;
    * list values become "should" clauses, except items prefixed with ``!``
      which become "must_not" clauses;
    * single values become "must" clauses, or "must_not" when prefixed
      with ``!``.

    The query is restricted to the last ``_time_period`` hours (24 when the
    setting is missing) on ``utctimestamp``.

    :returns: list of the ``_source`` of every hit.
    :raises Exception: when the raw response reports failed shards.
    """
    teamfilter = self.config['teamsetup'][self.team]['filter']
    mozdef = self.config['mozdef']
    es = ES((mozdef['proto'], mozdef['host'], mozdef['port']))

    # Default filter - time period
    try:
        td = self.config['es'][teamfilter]['_time_period']
    except KeyError:
        debug('No _time_period defined, defaulting to 24h')
        td = 24

    # Take one timestamp so both ends of the window share the same "now".
    now = datetime.now()
    begindateUTC = toUTC(now - timedelta(hours=td))
    enddateUTC = toUTC(now)
    print("%s %s" % (begindateUTC, enddateUTC))
    fDate = pyes.RangeQuery(qrange=pyes.ESRange('utctimestamp',
                                                from_value=begindateUTC,
                                                to_value=enddateUTC))

    # Load team queries from our json config.
    # Lists are "should" unless an item is negated with "!" then it's must_not
    # Single items are "must"
    query = pyes.query.BoolQuery()
    query.add_must(pyes.QueryStringQuery('asset.autogroup: "{}"'.format(self.team)))
    team_filters = self.config['es'][teamfilter]
    for item in team_filters:
        # items starting with '_' are internal/reserved, like _time_period
        if item.startswith('_'):
            continue
        val = team_filters[item]
        if isinstance(val, list):
            for v in val:
                if v.startswith("!"):
                    query.add_must_not(pyes.MatchQuery(item, v[1:]))
                else:
                    query.add_should(pyes.MatchQuery(item, v))
        elif val.startswith("!"):
            # Strip the "!" marker, as the list branch does; matching on the
            # literal "!value" would never exclude the intended documents.
            query.add_must_not(pyes.MatchQuery(item, val[1:]))
        else:
            query.add_must(pyes.MatchQuery(item, val))

    q = pyes.ConstantScoreQuery(query)
    q = pyes.FilteredQuery(q, pyes.BoolFilter(must=[fDate]))

    results = es.search(query=q, indices=self.config['es']['index'])
    raw = results._search_raw(0, results.count())
    # This doesn't do much, but pyes has no "close()" or similar functionality.
    es.force_bulk()

    if raw._shards.failed != 0:
        raise Exception("Some shards failed! {0}".format(str(raw._shards)))

    # Nobody cares for the metadata past this point (all the goodies are in '_source')
    return [hit._source for hit in raw.hits.hits]
def dump_topics(backupdir, eshost, _type, indices="topics-all"):
    """Dump every document of one type to a gzipped JSON-lines file.

    Runs a match-all scan search for ``_type`` on ``indices``, copies each
    hit's id into its ``_id`` key, writes one JSON document per line to
    ``/tmp/out.json`` and then compresses that file to
    ``<backupdir>/topics.<_type>.json.gz``.

    :param backupdir: directory that receives the ``.json.gz`` file.
    :param eshost: server specification handed to ``ES``.
    :param _type: document type to export; also used in the backup name.
    :param indices: index (or indices) to search.
    :returns: ``None``. Progress and the "no data" notice go to stderr.
    """
    tmp_path = '/tmp/out.json'

    conn = ES(eshost)
    q = MatchAllQuery()
    q = q.search()
    resultset = conn.search(query=q, indices=indices, doc_types=[_type], scan=True)

    # Check for data before touching the temp file so the early return
    # cannot leave an open handle behind.
    if not resultset.total:
        sys.stderr.write("no data\n")
        return

    with open(tmp_path, 'w') as out:
        sys.stderr.write("Will write %d lines to %s\n" % (resultset.total, out.name))
        try:
            while True:
                r = resultset.next()
                # Keep the document id inside the exported record.
                r['_id'] = r._meta.id
                out.write(json.dumps(r) + '\n')
        except StopIteration:
            # The result set signals exhaustion this way.
            pass

    # gzip
    backup = os.path.join(backupdir, "topics.{}.json.gz".format(_type))
    with open(tmp_path, 'rb') as f_in:
        f_out = gzip.open(backup, 'wb')
        try:
            f_out.writelines(f_in)
        finally:
            f_out.close()
    sys.stderr.write("Created %s\n" % backup)
class FullTextSearch(object):
    """Maintain ElasticSearch indices for datastore nodes and search them.

    ``self.indices`` caches the mappings as ``{index_name: {type: mapping}}``.
    ``self.datastore`` is read by several methods but never assigned here.
    NOTE(review): presumably the owner attaches it after construction; confirm.
    """

    def __init__(self, server, settings=None):
        """Connect to ``server`` and load the mapping cache.

        :param server: server specification handed to ``ES``.
        :param settings: index settings used by ``create_index``; when falsy
            a default n-gram analyzer configuration is used.
        """
        # These timeout and bulk_size parameters were determined through
        # trial and error to be necessary to avoid timeout errors when
        # generating indices on Sandbox. They should not be taken as gospel.
        self.conn = ES(server, timeout=120.0)  # Default timeout: 30.0
        self.conn.bulker.bulk_size = 25  # Default: 400
        if settings:
            self.settings = settings
        else:
            self.settings = {
                'index': {
                    'analysis': {
                        'analyzer': {
                            'ngram_analyzer': {
                                'tokenizer': 'keyword',
                                'filter': ['lowercase', 'filter_ngram'],
                                'type': 'custom'
                            }
                        },
                        'filter': {
                            'filter_ngram': {
                                'type': 'nGram',
                                'max_gram': 30,
                                'min_gram': 1
                            }
                        }
                    }
                }
            }
        self.refresh_index_cache()

    def search_index_text(self, query_string, fields="_all", **args):
        """Run a match query for ``query_string`` on ``fields``.

        Extra keyword arguments are forwarded to ``search_index``.
        """
        q = query.MatchQuery(fields, query_string)
        return self.search_index(q, **args)

    def search_index(self, query, indices=None, num_results=None, node_type=None):
        """Execute ``query`` and return the matching datastore nodes.

        :param query: query object passed to the connection's ``search``.
        :param indices: indices to search.
        :param num_results: upper slice bound applied to the result set.
        :param node_type: document type(s) to restrict the search to.
        :returns: list of nodes in the order the hits were returned.
        """
        results = self.conn.search(query=query, indices=indices, doc_types=node_type)
        meta_list = [r.get_meta() for r in results[0:num_results]]
        node_dict = {}

        # fetch nodes grouped by type to reduce number of db calls
        key = itemgetter('type')
        for t, grouped_list in groupby(sorted(meta_list, key=key), key=key):
            ids = [meta['id'] for meta in grouped_list]
            for node in self.datastore.get_nodes(t, ids):
                node_dict[(node.type, node.key)] = node

        # return nodes in original order
        return [node_dict[(meta['type'], meta['id'])] for meta in meta_list]

    def create_index(self, type, indexed_variables, index_name):
        """Create ``index_name`` if missing, map ``type`` on it and fill it.

        Every name in ``indexed_variables`` is mapped as an n-gram analyzed
        string field.
        """
        self.conn.indices.create_index_if_missing(index_name, self.settings)
        mapping = {}
        for arg in indexed_variables:
            mapping[arg] = {'boost': 1.0,
                            'analyzer': 'ngram_analyzer',
                            'type': 'string',
                            'term_vector': 'with_positions_offsets'}
        index_settings = {'index_analyzer': 'ngram_analyzer',
                          'search_analyzer': 'standard',
                          'properties': mapping}
        self.conn.indices.put_mapping(str(type), index_settings, [index_name])
        self.refresh_index_cache()
        self.populate_index(type, index_name)

    def refresh_index_cache(self):
        """Reload ``self.indices`` from the server's raw mappings.

        The cache is set to an empty dict when the index is reported missing.
        """
        try:
            indices = self.conn.indices.get_mapping(raw=True)
        except exceptions.IndexMissingException:
            indices = {}
        else:
            indices = dict((k, v.get('mappings', {})) for k, v in indices.items())
        self.indices = indices

    def delete_index(self, index_name):
        """Delete ``index_name`` if it exists and refresh the cache."""
        self.conn.indices.delete_index_if_exists(index_name)
        self.refresh_index_cache()

    def populate_index(self, type, index_name):
        """Index every existing node of ``type`` into ``index_name``."""
        # add all the currently existing nodes into the index
        ref_node = self.datastore.get_reference_node(type)
        node_list = [rel.target_node for rel in ref_node.instance.outgoing]
        for node in node_list:
            key = node.key
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.delete(index_name, type, key)
            except ELASTIC_SEARCH_EXCEPTIONS as err:
                # A failed delete must not prevent the document being written.
                log.exception(err)
            self.conn.index(index_dict, index_name, type, key, bulk=True)
        self.conn.indices.refresh([index_name])

    def on_create(self, node):
        """Index a newly created ``node`` in every index holding its type."""
        for index_name in self.get_indices_of_type(node.type):
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.index(index_dict, index_name, node.type, node.key, bulk=True)
                self.conn.indices.refresh([index_name])
            except ELASTIC_SEARCH_EXCEPTIONS as err:
                log.exception(err)

    def on_delete(self, node):
        """Remove ``node`` from every index holding its type."""
        for index_name in self.get_indices_of_type(node.type):
            try:
                self.conn.delete(index_name, node.type, node.key, bulk=True)
                self.conn.indices.refresh([index_name])
            except ELASTIC_SEARCH_EXCEPTIONS as err:
                log.exception(err)

    def on_modify(self, node):
        """Replace the indexed document of ``node`` in every relevant index."""
        for index_name in self.get_indices_of_type(node.type):
            index_dict = self.populate_index_document(node, index_name)
            # Guard the delete on its own, as populate_index does: with a
            # shared try block a failing delete (e.g. nothing indexed yet)
            # skipped the re-index and the modification was lost.
            try:
                self.conn.delete(index_name, node.type, node.key)
            except ELASTIC_SEARCH_EXCEPTIONS as err:
                log.exception(err)
            try:
                self.conn.index(index_dict, index_name, node.type, node.key, bulk=True)
                self.conn.indices.refresh([index_name])
            except ELASTIC_SEARCH_EXCEPTIONS as err:
                log.exception(err)

    def get_indices_of_type(self, type):
        """Return the names of cached indices whose mappings contain ``type``."""
        return [key for key, value in self.indices.items() if type in value]

    def populate_index_document(self, node, index_name):
        """Build the document for ``node``: one entry per mapped property.

        The property names come from the cached mapping of ``node.type`` in
        ``index_name``; the values are read with ``node[field]``.
        """
        indexed_variables = self.indices[index_name][node.type]['properties'].keys()
        return {field: node[field] for field in indexed_variables}
class ESDataSource(base.DataSource):
    """Data source stream that reads documents from ElasticSearch."""

    def __init__(self, document_type, database=None, host=None, port=None,
                 expand=False, **elasticsearch_args):
        """Creates a ElasticSearch data source stream.

        :Attributes:
            * document_type: elasticsearch document_type name
            * database: name used as the connection's default index
            * host: elasticsearch server host; when omitted no host is put
              into the server string
            * port: elasticsearch port (string or integer); when omitted no
              port is appended to the server string
            * expand: expand dictionary values and treat children as
              top-level keys with dot '.' separated key path to the child.
            * elasticsearch_args: extra keyword arguments passed to ``ES``
        """
        self.document_type = document_type
        self.database_name = database
        self.host = host
        self.port = port
        self.elasticsearch_args = elasticsearch_args
        self.expand = expand
        self.connection = None
        self._fields = None

    def initialize(self):
        """Initialize ElasticSearch source stream: open the connection and
        set its default index and document type."""
        args = self.elasticsearch_args.copy()
        server = ""
        if self.host:
            server = self.host
        if self.port:
            # str() so that an integer port does not raise TypeError.
            server += ":" + str(self.port)

        self.connection = ES(server, **args)
        self.connection.default_indices = self.database_name
        self.connection.default_types = self.document_type

    def read_fields(self, limit=0):
        """Probe records to derive the field list and store it on ``self.fields``.

        :param limit: passed as ``limit`` to the record lookup.
        :returns: list of fields, in first-seen key order.
        """
        keys = []
        probes = {}

        def probe_record(record, parent=None):
            # Feed every value of the record to the probe of its (dotted) key.
            for key, value in record.items():
                if parent:
                    full_key = parent + "." + key
                else:
                    full_key = key

                if self.expand and isinstance(value, dict):
                    probe_record(value, full_key)
                    continue

                if full_key not in probes:
                    probe = dq.FieldTypeProbe(full_key)
                    probes[full_key] = probe
                    keys.append(full_key)
                else:
                    probe = probes[full_key]
                probe.probe(value)

        # NOTE(review): document_type is documented as a type name, yet
        # .find() is called on it here - confirm what object callers pass.
        for record in self.document_type.find(limit=limit):
            probe_record(record)

        fields = []
        for key in keys:
            probe = probes[key]
            field = base.Field(probe.field)

            storage_type = probe.unique_storage_type
            if not storage_type:
                field.storage_type = "unknown"
            elif storage_type == "unicode":
                field.storage_type = "string"
            else:
                field.storage_type = "unknown"
                field.concrete_storage_type = storage_type

            # FIXME: Set analytical type

            fields.append(field)

        self.fields = list(fields)
        return self.fields

    def rows(self):
        """Return a row iterator over a match-all scan search.

        :raises RuntimeError: when ``initialize`` has not been called.
        """
        if not self.connection:
            raise RuntimeError("Stream is not initialized")
        from pyes.query import MatchAllQuery
        fields = self.fields.names()
        results = self.connection.search(MatchAllQuery(), search_type="scan",
                                         timeout="5m", size="200")
        return ESRowIterator(results, fields)

    def records(self):
        """Return a record iterator over a match-all scan search.

        :raises RuntimeError: when ``initialize`` has not been called.
        """
        if not self.connection:
            raise RuntimeError("Stream is not initialized")
        from pyes.query import MatchAllQuery
        results = self.connection.search(MatchAllQuery(), search_type="scan",
                                         timeout="5m", size="200")
        return ESRecordIterator(results, self.expand)
'size': 2, 'explain': explain } return query if __name__ == '__main__': explain = True # MongoDB host = 'localhost' port = 27017 mcm = MongoConnectionManager(host, port, MongoCodec()) database = 'processed' resource_collection = mcm.get_collection(database, 'resources', Resource) # ElasticSearch es = ES('localhost:9200', timeout=60) es_index = 'topic_tracking' # find the same resource resource = resource_collection.find_one_model() query = build_query(resource, explain) result = es.search(query, es_index, 'resource') for r in result['hits']['hits']: pprint(r) print('Tested resource %s: %s' % (resource._id, resource.uri))
class FullTextSearch(object):
    """Maintain per-type ElasticSearch indices for datastore nodes.

    Index names are namespaced as ``<type>-_-<index_name>``.
    ``self.indices`` caches the result of the connection's ``get_indices``.
    ``self.datastore`` is read by several methods but never assigned here.
    NOTE(review): presumably the owner attaches it after construction; confirm.
    """

    def __init__(self, server, settings=None):
        """Connect to ``server``.

        :param server: server specification handed to ``ES``.
        :param settings: index settings used by ``create_index``; when falsy
            a default n-gram analyzer configuration is used.
        """
        self.conn = ES(server)
        self.indices = {}
        if settings:
            self.settings = settings
        else:
            self.settings = {
                'index': {
                    'analysis': {
                        'analyzer': {
                            'ngram_analyzer': {
                                'tokenizer': 'keyword',
                                'filter': ['lowercase', 'filter_ngram'],
                                'type': 'custom'
                            }
                        },
                        'filter': {
                            'filter_ngram': {
                                'type': 'nGram',
                                'max_gram': 30,
                                'min_gram': 1
                            }
                        }
                    }
                }
            }

    def search_index(self, type, index_names, query_string, num_results=-1):
        """Wildcard-search the namespaced indices and return datastore nodes.

        :param type: node type; used as index prefix and document type.
        :param index_names: un-namespaced index names to search.
        :param query_string: pattern, lower-cased before searching ``_all``.
        :param num_results: upper slice bound for the hits; ``-1`` returns all.
        :returns: list of nodes fetched from the datastore, in hit order.
        """
        ns_index_names = [str(type) + "-_-" + index_name for index_name in index_names]
        q = WildcardQuery('_all', lower(query_string))
        results = self.conn.search(query=q, indices=ns_index_names, doc_types=type)
        hits = results['hits']['hits']
        if num_results != -1:
            # Slice the hit list itself: slicing already clamps to the number
            # of hits, and truncating first avoids fetching nodes that would
            # be thrown away.
            hits = hits[0:num_results]
        return [self.datastore.get_node(type, r['_id']) for r in hits]

    def create_index(self, type, indexed_variables, index_name):
        """(Re)create the namespaced index, map ``type`` on it and fill it.

        An existing index of the same name is deleted first. Every name in
        ``indexed_variables`` is mapped as an n-gram analyzed string field.
        """
        ns_index_name = str(type) + "-_-" + index_name
        self.conn.delete_index_if_exists(ns_index_name)
        self.conn.create_index(ns_index_name, self.settings)
        mapping = {}
        for arg in indexed_variables:
            mapping[arg] = {'boost': 1.0,
                            'analyzer': 'ngram_analyzer',
                            'type': u'string',
                            'term_vector': 'with_positions_offsets'}
        index_settings = {'index_analyzer': 'ngram_analyzer',
                          'search_analyzer': 'standard',
                          'properties': mapping}
        self.conn.put_mapping(str(type), index_settings, [ns_index_name])
        self.refresh_index_cache()
        self.populate_index(type, index_name)

    def refresh_index_cache(self):
        """Reload ``self.indices`` from the connection."""
        self.indices = self.conn.get_indices()

    def delete_index(self, type, index_name):
        """Delete the namespaced index if it exists and refresh the cache."""
        ns_index_name = str(type) + "-_-" + index_name
        self.conn.delete_index_if_exists(ns_index_name)
        self.refresh_index_cache()

    def populate_index(self, type, index_name):
        """Index every existing node of ``type`` into the namespaced index."""
        # add all the currently existing nodes into the index
        ns_index_name = str(type) + "-_-" + index_name
        ref_node = self.datastore.get_reference_node(type)
        node_list = [rel.target_node for rel in ref_node.instance.outgoing]
        mapping = self.conn.get_mapping(type, ns_index_name)
        for node in node_list:
            key = node.key
            index_dict = self.populate_index_document(type, ns_index_name,
                                                      node.attributes, mapping)
            try:
                self.conn.delete(ns_index_name, type, key)
            except exceptions.NotFoundException:
                pass
            try:
                self.conn.index(index_dict, ns_index_name, type, key)
            except exceptions.ElasticSearchParseException:
                # Best effort: a document that cannot be parsed is skipped.
                pass
        self.conn.refresh([ns_index_name])

    def on_create(self, node):
        """Index a newly created ``node`` in every index of its type."""
        for ns_index_name in self.get_indices_of_type(node.type):
            mapping = self.conn.get_mapping(node.type, ns_index_name)
            index_dict = self.populate_index_document(node.type, ns_index_name,
                                                      node.attributes, mapping)
            self.conn.index(index_dict, ns_index_name, node.type, node.key)
            self.conn.refresh([ns_index_name])

    def on_delete(self, node):
        """Remove ``node`` from every index of its type."""
        for ns_index_name in self.get_indices_of_type(node.type):
            try:
                self.conn.delete(ns_index_name, node.type, node.key)
                self.conn.refresh([ns_index_name])
            except exceptions.NotFoundException:
                pass

    def on_modify(self, node):
        """Replace the indexed document of ``node`` in every index of its type."""
        for ns_index_name in self.get_indices_of_type(node.type):
            mapping = self.conn.get_mapping(node.type, ns_index_name)
            index_dict = self.populate_index_document(node.type, ns_index_name,
                                                      node.attributes, mapping)
            # Guard the delete on its own, as populate_index does: with a
            # shared try block a NotFoundException from the delete skipped
            # the re-index and the modification was lost.
            try:
                self.conn.delete(ns_index_name, node.type, node.key)
            except exceptions.NotFoundException:
                pass
            try:
                self.conn.index(index_dict, ns_index_name, node.type, node.key)
                self.conn.refresh([ns_index_name])
            except exceptions.NotFoundException:
                # Kept from the original: not-found errors here are ignored.
                pass

    def get_indices_of_type(self, type):
        """Return cached index names that start with ``<type>-_-``."""
        prefix = type + "-_-"
        return [index for index in self.indices.keys() if index.startswith(prefix)]

    def populate_index_document(self, type, ns_index_name, attributes, mapping):
        """Build the document: one entry per mapped property found in ``attributes``.

        :param type: key into ``mapping`` whose ``properties`` are used.
        :param ns_index_name: unused; kept for the caller-facing signature.
        :param attributes: mapping of attribute name to value.
        :param mapping: mapping as returned by the connection's ``get_mapping``.
        """
        indexed_variables = mapping[type]['properties'].keys()
        index_dict = {}
        for arg in indexed_variables:
            try:
                index_dict[arg] = attributes[arg]
            except KeyError:
                # if this attribute doesn't exist for this node, just pass
                pass
        return index_dict
class ESDataSource(DataSource):
    """Data source stream that reads documents from ElasticSearch."""

    def __init__(self, document_type, index=None, host=None, port=None,
                 expand=False, **elasticsearch_args):
        """Creates a ElasticSearch data source stream.

        :Attributes:
            * document_type: elasticsearch document_type name
            * index: index name, default is ``test``
            * host: elasticsearch server host, default is ``127.0.0.1``
            * port: elasticsearch port (string or integer), default is
              ``9200``
            * expand: expand dictionary values and treat children as
              top-level keys with dot '.' separated key path to the child.
            * elasticsearch_args: extra keyword arguments passed to ``ES``
        """
        super(ESDataSource, self).__init__()
        self.document_type = document_type
        self.index = index or "test"
        self.host = host or "127.0.0.1"
        self.port = port or "9200"
        self.elasticsearch_args = elasticsearch_args
        self.expand = expand
        self.connection = None
        self._fields = None

    def initialize(self):
        """Initialize ElasticSearch source stream: open the connection and
        set its default index and document type."""
        args = self.elasticsearch_args.copy()
        server = ""
        if self.host:
            server = self.host
        if self.port:
            # str() so that an integer port does not raise TypeError.
            server += ":" + str(self.port)

        self.connection = ES(server, **args)
        self.connection.default_indices = self.index
        self.connection.default_types = self.document_type

    def read_fields(self, limit=0, collapse=False):
        """Probe records to derive the field list and store it on ``self._fields``.

        :param limit: passed as ``limit`` to the record lookup.
        :param collapse: accepted but not used by this implementation.
        :returns: list of fields, in first-seen key order.
        """
        keys = []
        probes = {}

        def probe_record(record, parent=None):
            # Feed every value of the record to the probe of its (dotted) key.
            for key, value in record.items():
                if parent:
                    full_key = parent + "." + key
                else:
                    full_key = key

                if self.expand and isinstance(value, dict):
                    probe_record(value, full_key)
                    continue

                if full_key not in probes:
                    probe = FieldTypeProbe(full_key)
                    probes[full_key] = probe
                    keys.append(full_key)
                else:
                    probe = probes[full_key]
                probe.probe(value)

        # NOTE(review): document_type is documented as a type name, yet
        # .find() is called on it here - confirm what object callers pass.
        for record in self.document_type.find(limit=limit):
            probe_record(record)

        fields = []
        for key in keys:
            probe = probes[key]
            field = Field(probe.field)

            storage_type = probe.unique_storage_type
            if not storage_type:
                field.storage_type = "unknown"
            elif storage_type == "unicode":
                field.storage_type = "string"
            else:
                field.storage_type = "unknown"
                field.concrete_storage_type = storage_type

            # FIXME: Set analytical type

            fields.append(field)

        self._fields = list(fields)
        return self._fields

    def rows(self):
        """Return a row iterator over a match-all scan search.

        :raises RuntimeError: when ``initialize`` has not been called.
        """
        if not self.connection:
            raise RuntimeError("Stream is not initialized")
        from pyes.query import MatchAllQuery
        fields = self.field_names
        results = self.connection.search(MatchAllQuery(), search_type="scan",
                                         timeout="5m", size="200")
        return ESRowIterator(results, fields)

    def records(self):
        """Return a record iterator over a match-all scan search.

        :raises RuntimeError: when ``initialize`` has not been called.
        """
        if not self.connection:
            raise RuntimeError("Stream is not initialized")
        from pyes.query import MatchAllQuery
        results = self.connection.search(MatchAllQuery(), search_type="scan",
                                         timeout="5m", size="200")
        return ESRecordIterator(results, self.expand)
class FullTextSearch(object):
    """Maintain per-type ElasticSearch indices for datastore nodes.

    Index names are namespaced as ``<type>-_-<index_name>``.
    ``self.indices`` caches the result of the connection's ``get_indices``.
    ``self.datastore`` is read by several methods but never assigned here.
    NOTE(review): presumably the owner attaches it after construction; confirm.
    """

    def __init__(self, server, settings=None):
        """Connect to ``server``.

        :param server: server specification handed to ``ES``.
        :param settings: index settings used by ``create_index``; when falsy
            a default n-gram analyzer configuration is used.
        """
        self.conn = ES(server)
        self.indices = {}
        if settings:
            self.settings = settings
        else:
            self.settings = {
                'index': {
                    'analysis': {
                        'analyzer': {
                            'ngram_analyzer': {
                                'tokenizer': 'keyword',
                                'filter': ['lowercase', 'filter_ngram'],
                                'type': 'custom'
                            }
                        },
                        'filter': {
                            'filter_ngram': {
                                'type': 'nGram',
                                'max_gram': 30,
                                'min_gram': 1
                            }
                        }
                    }
                }
            }

    def search_index(self, type, index_names, query_string, num_results=-1):
        """Wildcard-search the namespaced indices and return datastore nodes.

        :param type: node type; used as index prefix and document type.
        :param index_names: un-namespaced index names to search.
        :param query_string: pattern, lower-cased before searching ``_all``.
        :param num_results: upper slice bound for the hits; ``-1`` returns all.
        :returns: list of nodes fetched from the datastore, in hit order.
        """
        ns_index_names = [
            str(type) + "-_-" + index_name for index_name in index_names
        ]
        q = WildcardQuery('_all', lower(query_string))
        results = self.conn.search(query=q,
                                   indices=ns_index_names,
                                   doc_types=type)
        hits = results['hits']['hits']
        if num_results != -1:
            # Slice the hit list itself: slicing already clamps to the number
            # of hits, and truncating first avoids fetching nodes that would
            # be thrown away.
            hits = hits[0:num_results]
        return [self.datastore.get_node(type, r['_id']) for r in hits]

    def create_index(self, type, indexed_variables, index_name):
        """(Re)create the namespaced index, map ``type`` on it and fill it.

        An existing index of the same name is deleted first. Every name in
        ``indexed_variables`` is mapped as an n-gram analyzed string field.
        """
        ns_index_name = str(type) + "-_-" + index_name
        self.conn.delete_index_if_exists(ns_index_name)
        self.conn.create_index(ns_index_name, self.settings)
        mapping = {}
        for arg in indexed_variables:
            mapping[arg] = {
                'boost': 1.0,
                'analyzer': 'ngram_analyzer',
                'type': u'string',
                'term_vector': 'with_positions_offsets'
            }
        index_settings = {
            'index_analyzer': 'ngram_analyzer',
            'search_analyzer': 'standard',
            'properties': mapping
        }
        self.conn.put_mapping(str(type), index_settings, [ns_index_name])
        self.refresh_index_cache()
        self.populate_index(type, index_name)

    def refresh_index_cache(self):
        """Reload ``self.indices`` from the connection."""
        self.indices = self.conn.get_indices()

    def delete_index(self, type, index_name):
        """Delete the namespaced index if it exists and refresh the cache."""
        ns_index_name = str(type) + "-_-" + index_name
        self.conn.delete_index_if_exists(ns_index_name)
        self.refresh_index_cache()

    def populate_index(self, type, index_name):
        """Index every existing node of ``type`` into the namespaced index."""
        # add all the currently existing nodes into the index
        ns_index_name = str(type) + "-_-" + index_name
        ref_node = self.datastore.get_reference_node(type)
        node_list = [rel.target_node for rel in ref_node.instance.outgoing]
        mapping = self.conn.get_mapping(type, ns_index_name)
        for node in node_list:
            key = node.key
            index_dict = self.populate_index_document(
                type, ns_index_name, node.attributes, mapping)
            try:
                self.conn.delete(ns_index_name, type, key)
            except exceptions.NotFoundException:
                pass
            try:
                self.conn.index(index_dict, ns_index_name, type, key)
            except exceptions.ElasticSearchParseException:
                # Best effort: a document that cannot be parsed is skipped.
                pass
        self.conn.refresh([ns_index_name])

    def on_create(self, node):
        """Index a newly created ``node`` in every index of its type."""
        for ns_index_name in self.get_indices_of_type(node.type):
            mapping = self.conn.get_mapping(node.type, ns_index_name)
            index_dict = self.populate_index_document(
                node.type, ns_index_name, node.attributes, mapping)
            self.conn.index(index_dict, ns_index_name, node.type, node.key)
            self.conn.refresh([ns_index_name])

    def on_delete(self, node):
        """Remove ``node`` from every index of its type."""
        for ns_index_name in self.get_indices_of_type(node.type):
            try:
                self.conn.delete(ns_index_name, node.type, node.key)
                self.conn.refresh([ns_index_name])
            except exceptions.NotFoundException:
                pass

    def on_modify(self, node):
        """Replace the indexed document of ``node`` in every index of its type."""
        for ns_index_name in self.get_indices_of_type(node.type):
            mapping = self.conn.get_mapping(node.type, ns_index_name)
            index_dict = self.populate_index_document(
                node.type, ns_index_name, node.attributes, mapping)
            # Guard the delete on its own, as populate_index does: with a
            # shared try block a NotFoundException from the delete skipped
            # the re-index and the modification was lost.
            try:
                self.conn.delete(ns_index_name, node.type, node.key)
            except exceptions.NotFoundException:
                pass
            try:
                self.conn.index(index_dict, ns_index_name, node.type, node.key)
                self.conn.refresh([ns_index_name])
            except exceptions.NotFoundException:
                # Kept from the original: not-found errors here are ignored.
                pass

    def get_indices_of_type(self, type):
        """Return cached index names that start with ``<type>-_-``."""
        prefix = type + "-_-"
        return [
            index for index in self.indices.keys() if index.startswith(prefix)
        ]

    def populate_index_document(self, type, ns_index_name, attributes, mapping):
        """Build the document: one entry per mapped property found in ``attributes``.

        :param type: key into ``mapping`` whose ``properties`` are used.
        :param ns_index_name: unused; kept for the caller-facing signature.
        :param attributes: mapping of attribute name to value.
        :param mapping: mapping as returned by the connection's ``get_mapping``.
        """
        indexed_variables = mapping[type]['properties'].keys()
        index_dict = {}
        for arg in indexed_variables:
            try:
                index_dict[arg] = attributes[arg]
            except KeyError:
                # if this attribute doesn't exist for this node, just pass
                pass
        return index_dict
class FullTextSearch(object):
    """Maintain ElasticSearch indices for datastore nodes and search them.

    ``self.indices`` caches the connection's mappings and is read as
    ``{index_name: {type: mapping}}``.
    ``self.datastore`` is read by several methods but never assigned here.
    NOTE(review): presumably the owner attaches it after construction; confirm.
    """

    def __init__(self, server, settings=None):
        """Connect to ``server`` and load the mapping cache.

        :param server: server specification handed to ``ES``.
        :param settings: index settings used by ``create_index``; when falsy
            a default n-gram analyzer configuration is used.
        """
        self.conn = ES(server)
        if settings:
            self.settings = settings
        else:
            self.settings = {
                'index': {
                    'analysis': {
                        'analyzer': {
                            'ngram_analyzer': {
                                'tokenizer': 'keyword',
                                'filter': ['lowercase', 'filter_ngram'],
                                'type': 'custom'
                            }
                        },
                        'filter': {
                            'filter_ngram': {
                                'type': 'nGram',
                                'max_gram': 30,
                                'min_gram': 1
                            }
                        }
                    }
                }
            }
        self.refresh_index_cache()

    def search_index_text(self, query_string, fields="_all", **args):
        """Run a text query for ``query_string`` on ``fields``.

        Extra keyword arguments are forwarded to ``search_index``.
        """
        q = query.TextQuery(fields, query_string)
        return self.search_index(q, **args)

    def search_index(self, query, indices=None, num_results=None, node_type=None):
        """Execute ``query`` and return the matching datastore nodes.

        :param query: query object passed to the connection's ``search``.
        :param indices: indices to search.
        :param num_results: upper slice bound applied to the result set.
        :param node_type: document type(s) to restrict the search to.
        :returns: list of nodes in the order the hits were returned.
        """
        results = self.conn.search(query=query, indices=indices, doc_types=node_type)
        meta_list = [r.get_meta() for r in results[0:num_results]]
        node_dict = {}

        # fetch nodes grouped by type to reduce number of db calls
        key = itemgetter('type')
        for t, grouped_list in groupby(sorted(meta_list, key=key), key=key):
            ids = [meta['id'] for meta in grouped_list]
            for node in self.datastore.get_nodes(t, ids):
                node_dict[(node.type, node.key)] = node

        # return nodes in original order
        return [node_dict[(meta['type'], meta['id'])] for meta in meta_list]

    def create_index(self, type, indexed_variables, index_name):
        """Create ``index_name`` if missing, map ``type`` on it and fill it.

        Every name in ``indexed_variables`` is mapped as an n-gram analyzed
        string field.
        """
        self.conn.create_index_if_missing(index_name, self.settings)
        mapping = {}
        for arg in indexed_variables:
            mapping[arg] = {'boost': 1.0,
                            'analyzer': 'ngram_analyzer',
                            'type': 'string',
                            'term_vector': 'with_positions_offsets'}
        index_settings = {'index_analyzer': 'ngram_analyzer',
                          'search_analyzer': 'standard',
                          'properties': mapping}
        self.conn.put_mapping(str(type), index_settings, [index_name])
        self.refresh_index_cache()
        self.populate_index(type, index_name)

    def refresh_index_cache(self):
        """Reload ``self.indices`` from the connection's mappings.

        The cache is set to an empty dict when the index is reported missing.
        """
        try:
            self.indices = self.conn.get_mapping()
        except exceptions.IndexMissingException:
            self.indices = {}

    def delete_index(self, index_name):
        """Delete ``index_name`` if it exists and refresh the cache."""
        self.conn.delete_index_if_exists(index_name)
        self.refresh_index_cache()

    def populate_index(self, type, index_name):
        """Index every existing node of ``type`` into ``index_name``."""
        # add all the currently existing nodes into the index
        ref_node = self.datastore.get_reference_node(type)
        node_list = [rel.target_node for rel in ref_node.instance.outgoing]
        for node in node_list:
            key = node.key
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.delete(index_name, type, key)
            except exceptions.NotFoundException:
                pass
            self.conn.index(index_dict, index_name, type, key)
        self.conn.refresh([index_name])

    def on_create(self, node):
        """Index a newly created ``node`` in every index holding its type."""
        for index_name in self.get_indices_of_type(node.type):
            index_dict = self.populate_index_document(node, index_name)
            self.conn.index(index_dict, index_name, node.type, node.key)
            self.conn.refresh([index_name])

    def on_delete(self, node):
        """Remove ``node`` from every index holding its type."""
        for index_name in self.get_indices_of_type(node.type):
            try:
                self.conn.delete(index_name, node.type, node.key)
                self.conn.refresh([index_name])
            except exceptions.NotFoundException:
                pass

    def on_modify(self, node):
        """Replace the indexed document of ``node`` in every relevant index."""
        for index_name in self.get_indices_of_type(node.type):
            index_dict = self.populate_index_document(node, index_name)
            # Guard the delete on its own, as populate_index does: with a
            # shared try block a NotFoundException from the delete skipped
            # the re-index and the modification was lost.
            try:
                self.conn.delete(index_name, node.type, node.key)
            except exceptions.NotFoundException:
                pass
            try:
                self.conn.index(index_dict, index_name, node.type, node.key)
                self.conn.refresh([index_name])
            except exceptions.NotFoundException:
                # Kept from the original: not-found errors here are ignored.
                pass

    def get_indices_of_type(self, type):
        """Return the names of cached indices whose mappings contain ``type``."""
        return [key for key, value in self.indices.items() if type in value]

    def populate_index_document(self, node, index_name):
        """Build the document for ``node``: one entry per mapped property.

        The property names come from the cached mapping of ``node.type`` in
        ``index_name``; the values are read with ``node[field]``.
        """
        indexed_variables = self.indices[index_name][node.type]['properties'].keys()
        return {field: node[field] for field in indexed_variables}