def dump(start, end, backupdir, eshost):
    conn = ES(eshost)
    out = open('/tmp/out.json', 'w')
    _type = 'habakkuk'
    q = MatchAllQuery()
    q = FilteredQuery(q, RangeFilter(qrange=ESRange('created_at_date', start, end,
                                                    include_upper=False)))
    q = q.search()
    # print json.dumps(json.loads(q.to_search_json()), indent=2)
    resultset = conn.search(query=q, indices=_type + "-*", doc_types=[_type], scan=True)
    cnt = 0
    if not resultset.total:
        sys.stderr.write("no data for %s - %s\n" % (start, end))
        return

    try:
        sys.stderr.write("Will write %d lines to %s\n" % (resultset.total, out.name))
        while True:
            r = resultset.next()
            cnt += 1
            out.write(json.dumps(r) + '\n')
    except StopIteration:
        pass
    out.close()

    # gzip
    ext = datetime.strftime(start, '%Y-%m-%d')
    backup = os.path.join(backupdir, "habakkuk-%s.json.gz" % ext)
    f_in = open(out.name, 'rb')
    f_out = gzip.open(backup, 'wb')
    f_out.writelines(f_in)
    f_out.close()
    f_in.close()
    sys.stderr.write("Created %s\n" % backup)
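# Hedged usage sketch (not part of the original source): dumping a single day of
# documents with dump() above. The host, backup directory and date are hypothetical;
# start/end feed the created_at_date range filter inside dump().
from datetime import datetime, timedelta

if __name__ == '__main__':
    start = datetime(2014, 1, 1)
    end = start + timedelta(days=1)
    dump(start, end, backupdir='/tmp/backups', eshost='127.0.0.1:9200')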
def __init__(self, server, settings=None):
    self.conn = ES(server)
    self.indices = {}
    if settings:
        self.settings = settings
    else:
        self.settings = {
            'index': {
                'analysis': {
                    'analyzer': {
                        'ngram_analyzer': {
                            'tokenizer': 'keyword',
                            'filter': ['lowercase', 'filter_ngram'],
                            'type': 'custom'
                        }
                    },
                    'filter': {
                        'filter_ngram': {
                            'type': 'nGram',
                            'max_gram': 30,
                            'min_gram': 1
                        }
                    }
                }
            }
        }
def get_entries(self):
    '''Get all entries for a team + their filter from ES/MozDef'''
    teamfilter = self.config['teamsetup'][self.team]['filter']
    es = ES((self.config['mozdef']['proto'],
             self.config['mozdef']['host'],
             self.config['mozdef']['port']))

    # Default filter - time period
    try:
        td = self.config['es'][teamfilter]['_time_period']
    except KeyError:
        debug('No _time_period defined, defaulting to 24h')
        td = 24

    begindateUTC = toUTC(datetime.now() - timedelta(hours=td))
    enddateUTC = toUTC(datetime.now())
    print begindateUTC, enddateUTC
    fDate = pyes.RangeQuery(qrange=pyes.ESRange('utctimestamp',
                                                from_value=begindateUTC,
                                                to_value=enddateUTC))

    # Load team queries from our json config.
    # Lists are "should" unless an item is negated with "!" then it's must_not
    # Single items are "must"
    query = pyes.query.BoolQuery()
    query.add_must(pyes.QueryStringQuery('asset.autogroup: "{}"'.format(self.team)))
    for item in self.config['es'][teamfilter]:
        # items starting with '_' are internal/reserved, like _time_period
        if item.startswith('_'):
            continue
        val = self.config['es'][teamfilter][item]
        if type(val) == list:
            for v in val:
                if v.startswith("!"):
                    query.add_must_not(pyes.MatchQuery(item, v[1:]))
                else:
                    query.add_should(pyes.MatchQuery(item, v))
        else:
            if val.startswith("!"):
                query.add_must_not(pyes.MatchQuery(item, val[1:]))
            else:
                query.add_must(pyes.MatchQuery(item, val))

    q = pyes.ConstantScoreQuery(query)
    q = pyes.FilteredQuery(q, pyes.BoolFilter(must=[fDate]))
    results = es.search(query=q, indices=self.config['es']['index'])
    raw = results._search_raw(0, results.count())
    # This doesn't do much, but pyes has no "close()" or similar functionality.
    es.force_bulk()

    if raw._shards.failed != 0:
        raise Exception("Some shards failed! {0}".format(raw._shards.__str__()))

    # Nobody cares for the metadata past this point (all the goodies are in '_source')
    data = []
    for i in raw.hits.hits:
        data += [i._source]
    return data
def initialize(self):
    """Initialize ElasticSearch source stream."""
    args = self.elasticsearch_args.copy()
    server = ""
    if self.host:
        server = self.host
    if self.port:
        server += ":" + self.port
    self.connection = ES(server, **args)
    self.connection.default_indices = self.index
    self.connection.default_types = self.document_type
class ElasticSearchTestCase(unittest.TestCase):
    def search(self, query):
        return self.conn.search_raw(query, 'test-index')

    def setUp(self):
        self.conn = ElasticSearch('http://localhost:9200/')

    def tearDown(self):
        self.conn.delete_index("test-index")

    def assertResultContains(self, result, expected):
        for (key, value) in expected.items():
            self.assertEquals(value, result[key])
def test_servers(self):
    es = ES("127.0.0.1:9200")
    self.assertEquals(es.servers, [("http", "127.0.0.1", 9200)])
    es = ES("127.0.0.1:9500")
    self.assertEquals(es.servers, [("thrift", "127.0.0.1", 9500)])
    es = ES("http://127.0.0.1:9400")
    self.assertEquals(es.servers, [("http", "127.0.0.1", 9400)])
    es = ES("thrift://127.0.0.1:9100")
    self.assertEquals(es.servers, [("thrift", "127.0.0.1", 9100)])
    es = ES([
        "thrift://127.0.0.1:9100",
        "127.0.0.1:9200",
        ("thrift", "127.0.0.1", 9000)
    ])
    self.assertEquals(sorted(es.servers), [
        ("http", "127.0.0.1", 9200),
        ("thrift", "127.0.0.1", 9000),
        ("thrift", "127.0.0.1", 9100)
    ])
def connect_to_db():
    eshosts = settings.ES_SETTINGS['ES_HOSTS']
    index = settings.ES_SETTINGS['INDEX']
    timeout = settings.ES_SETTINGS.get('TIMEOUT', 60.0)
    return ES(eshosts, timeout=timeout)
def __init__(self, server, settings=None):
    # These timeout and bulk_size parameters were determined through
    # trial and error to be necessary to avoid timeout errors when
    # generating indices on Sandbox. They should not be taken as gospel.
    self.conn = ES(server, timeout=120.0)  # Default timeout: 30.0
    self.conn.bulker.bulk_size = 25        # Default: 400
    if settings:
        self.settings = settings
    else:
        self.settings = {
            'index': {
                'analysis': {
                    'analyzer': {
                        'ngram_analyzer': {
                            'tokenizer': 'keyword',
                            'filter': ['lowercase', 'filter_ngram'],
                            'type': 'custom'
                        }
                    },
                    'filter': {
                        'filter_ngram': {
                            'type': 'nGram',
                            'max_gram': 30,
                            'min_gram': 1
                        }
                    }
                }
            }
        }
    self.refresh_index_cache()
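# Hedged sketch (not from the original source): how the tuned bulk_size above plays
# out when indexing. Documents queued with bulk=True are sent once the bulker reaches
# bulk_size; flush_bulk(forced=True) pushes any remainder. The index/type names and
# documents here are hypothetical.
conn = ES('127.0.0.1:9200', timeout=120.0)
conn.bulker.bulk_size = 25
for i in range(100):
    conn.index({'title': 'doc %d' % i}, 'example-index', 'example-type', str(i), bulk=True)
conn.flush_bulk(forced=True)              # send whatever is still queued
conn.indices.refresh(['example-index'])   # make the documents searchable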
def test_servers(self):
    geturls = lambda servers: [server.geturl() for server in servers]
    es = ES("127.0.0.1:9200")
    self.assertEqual(geturls(es.servers), ["http://127.0.0.1:9200"])
    es = ES("127.0.0.1:9500")
    self.assertEqual(geturls(es.servers), ["thrift://127.0.0.1:9500"])
    es = ES(("http", "127.0.0.1", 9400))
    self.assertEqual(geturls(es.servers), ["http://127.0.0.1:9400"])
    es = ES(("thrift", "127.0.0.1", 9100))
    self.assertEqual(geturls(es.servers), ["thrift://127.0.0.1:9100"])
    es = ES([
        "http://127.0.0.1:9100",
        "127.0.0.1:9200",
        ("thrift", "127.0.0.1", 9000),
        "127.0.0.1:9500",
    ])
    self.assertEqual(geturls(sorted(es.servers)), [
        "http://127.0.0.1:9100",
        "http://127.0.0.1:9200",
        "thrift://127.0.0.1:9000",
        "thrift://127.0.0.1:9500"
    ])
def initialize(self):
    """Initialize ElasticSearch source stream."""
    args = self.elasticsearch_args.copy()
    server = ""
    if self.host:
        server = self.host
    if self.port:
        server += ":" + self.port
    self.connection = ES(server, **args)
    self.connection.default_indices = self.database_name
    self.connection.default_types = self.document_type
def setUp(self):
    """reads the "real" elasticsearch settings from SOURCE/elasticsearch/settings.json
    and uses it to configure an index for the unittests"""
    self.es_settings = {'ES_HOSTS': ['localhost:9200', ],
                        'INDEX': "unittest-binarypig",
                        'FACET_SIZE': 999999}
    query.settings.ES_SETTINGS = self.es_settings
    index_template_fn = os.path.join(settings.SOURCE_ROOT, 'elasticsearch', 'settings.json')
    self.index_settings = json.loads(open(index_template_fn).read())
    conn = ES(self.es_settings['ES_HOSTS'])
    self.createIndex(conn)
def dump_topics(backupdir, eshost, _type, indices="topics-all"):
    conn = ES(eshost)
    out = open('/tmp/out.json', 'w')
    q = MatchAllQuery()
    q = q.search()
    resultset = conn.search(query=q, indices=indices, doc_types=[_type], scan=True)
    cnt = 0
    if not resultset.total:
        sys.stderr.write("no data\n")
        return

    try:
        sys.stderr.write("Will write %d lines to %s\n" % (resultset.total, out.name))
        while True:
            r = resultset.next()
            r['_id'] = r._meta.id
            cnt += 1
            out.write(json.dumps(r) + '\n')
    except StopIteration:
        pass
    out.close()

    # gzip
    backup = os.path.join(backupdir, "topics.{}.json.gz".format(_type))
    f_in = open(out.name, 'rb')
    f_out = gzip.open(backup, 'wb')
    f_out.writelines(f_in)
    f_out.close()
    f_in.close()
    sys.stderr.write("Created %s\n" % backup)
def tearDown(self):
    """delete the index"""
    conn = ES(self.es_settings['ES_HOSTS'])
    self.deleteIndex(conn)
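# Hedged sketch (not in the original source): one plausible shape for the
# createIndex/deleteIndex helpers referenced by setUp/tearDown above, assuming the
# index name and settings loaded in setUp. The method bodies are hypothetical.
def createIndex(self, conn):
    """Create the unittest index using the settings loaded from settings.json."""
    conn.create_index_if_missing(self.es_settings['INDEX'], self.index_settings)
    conn.refresh(self.es_settings['INDEX'])

def deleteIndex(self, conn):
    """Drop the unittest index, ignoring the case where it never existed."""
    conn.delete_index_if_exists(self.es_settings['INDEX'])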
from utils import myyaml

app = Flask(__name__)

# ElasticSearch
escfg = Config.ES
# Wanna make sure test db is used if /test/ in url
try:
    if '/test/' in request.url:
        Config.TESTING = True
        escfg = Config.ES_TEST
except:
    pass

# ElasticSearch
es = ES(("http", escfg['host'], escfg['port']))
es.__dict__['index_name'] = escfg['name']
app.es = es

app.config.from_object(Config)

mongo = PyMongo()
app.mongo = mongo
mongo.init_app(app)

# add regex for routing
app.url_map.converters['regex'] = RegexConverter


##################### GET SEARCH
@app.route('/es', methods=['GET'])
def es():
class ESDataTarget(DataTarget):
    """docstring for ClassName"""

    def __init__(self, document_type, index="test", host="127.0.0.1", port="9200",
                 truncate=False, expand=False, **elasticsearch_args):
        """Creates a ElasticSearch data target stream.

        :Attributes:
            * document_type: ElasticSearch document_type name
            * index: database name
            * host: ElasticSearch database server host, default is ``localhost``
            * port: ElasticSearch port, default is ``9200``
            * expand: expand dictionary values and treat children as top-level keys
              with dot '.' separated key path to the child..
            * truncate: delete existing data in the document_type. Default: False
        """
        super(ESDataTarget, self).__init__()
        self.document_type = document_type
        self.index = index
        self.host = host
        self.port = port
        self.elasticsearch_args = elasticsearch_args
        self.expand = expand
        self.truncate = truncate
        self._fields = None

    def initialize(self):
        """Initialize ElasticSearch source stream."""
        from pyes.es import ES
        from pyes.exceptions import IndexAlreadyExistsException

        args = self.elasticsearch_args.copy()
        server = ""
        if self.host:
            server = self.host
        if self.port:
            server += ":" + self.port
        create = args.pop("create", False)
        replace = args.pop("replace", False)
        self.connection = ES(server, **args)
        self.connection.default_indices = self.index
        self.connection.default_types = self.document_type

        created = False
        if create:
            try:
                self.connection.create_index(self.index)
                self.connection.refresh(self.index)
                created = True
            except IndexAlreadyExistsException:
                pass

        if replace and not created:
            self.connection.delete_index_if_exists(self.index)
            self.connection.refresh(self.index)
            self.connection.create_index(self.index)
            self.connection.refresh(self.index)

        if self.truncate:
            self.connection.delete_mapping(self.index, self.document_type)
            self.connection.refresh(self.index)

        # check mapping
        try:
            self.connection.get_mapping(self.document_type, self.index)
        except TypeMissingException:
            self.connection.put_mapping(self.document_type, self._get_mapping(), self.index)

    def _get_mapping(self):
        """Build an ES optimized mapping for the given fields"""
        from pyes.mappings import (DocumentObjectField, IntegerField, StringField,
                                   BooleanField, FloatField, DateField)

        document = DocumentObjectField(name=self.document_type)
        for field in self.fields:
            st = field.storage_type
            if st == "unknown":
                # lets es detect the type
                continue
            elif st in ["string", "text"]:
                document.add_property(StringField(name=field.name))
            elif st == "integer":
                document.add_property(IntegerField(name=field.name))
            elif st == "boolean":
                document.add_property(BooleanField(name=field.name))
            elif st == "date":
                document.add_property(DateField(name=field.name))
            elif st == "float":
                document.add_property(FloatField(name=field.name))
        return document

    def append(self, obj):
        record = obj
        if not isinstance(obj, dict):
            record = dict(zip(self.field_names, obj))
        if self.expand:
            record = expand_record(record)
        id = record.get('id') or record.get('_id')
        self.connection.index(record, self.index, self.document_type, id, bulk=True)

    def finalize(self):
        self.connection.flush_bulk(forced=True)
def get_conn(*args, **kwargs):
    return ES(("http", "127.0.0.1", 9200), *args, **kwargs)
class FullTextSearch(object):
    def __init__(self, server, settings=None):
        self.conn = ES(server)
        if settings:
            self.settings = settings
        else:
            self.settings = {
                'index': {
                    'analysis': {
                        'analyzer': {
                            'ngram_analyzer': {
                                'tokenizer': 'keyword',
                                'filter': ['lowercase', 'filter_ngram'],
                                'type': 'custom'
                            }
                        },
                        'filter': {
                            'filter_ngram': {
                                'type': 'nGram',
                                'max_gram': 30,
                                'min_gram': 1
                            }
                        }
                    }
                }
            }
        self.refresh_index_cache()

    def search_index_text(self, query_string, fields="_all", **args):
        q = query.TextQuery(fields, query_string)
        return self.search_index(q, **args)

    def search_index(self, query, indices=None, num_results=None, node_type=None):
        results = self.conn.search(query=query, indices=indices, doc_types=node_type)
        meta_list = [r.get_meta() for r in results[0:num_results]]
        node_dict = {}
        # fetch nodes grouped by type to reduce number of db calls
        key = itemgetter('type')
        for t, grouped_list in groupby(sorted(meta_list, key=key), key=key):
            ids = [meta['id'] for meta in grouped_list]
            for node in self.datastore.get_nodes(t, ids):
                node_dict[(node.type, node.key)] = node
        # return nodes in original order
        nodelist = [node_dict[(meta['type'], meta['id'])] for meta in meta_list]
        return nodelist

    def create_index(self, type, indexed_variables, index_name):
        self.conn.create_index_if_missing(index_name, self.settings)
        mapping = {}
        for arg in indexed_variables:
            mapping[arg] = {'boost': 1.0,
                            'analyzer': 'ngram_analyzer',
                            'type': 'string',
                            'term_vector': 'with_positions_offsets'}
        index_settings = {'index_analyzer': 'ngram_analyzer',
                          'search_analyzer': 'standard',
                          'properties': mapping}
        self.conn.put_mapping(str(type), index_settings, [index_name])
        self.refresh_index_cache()
        self.populate_index(type, index_name)

    def refresh_index_cache(self):
        try:
            self.indices = self.conn.get_mapping()
        except exceptions.IndexMissingException:
            self.indices = {}

    def delete_index(self, index_name):
        self.conn.delete_index_if_exists(index_name)
        self.refresh_index_cache()

    def populate_index(self, type, index_name):
        # add all the currently existing nodes into the index
        ref_node = self.datastore.get_reference_node(type)
        node_list = [rel.target_node for rel in ref_node.instance.outgoing]
        for node in node_list:
            key = node.key
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.delete(index_name, type, key)
            except exceptions.NotFoundException:
                pass
            self.conn.index(index_dict, index_name, type, key)
        self.conn.refresh([index_name])

    def on_create(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            index_dict = self.populate_index_document(node, index_name)
            self.conn.index(index_dict, index_name, node.type, node.key)
            self.conn.refresh([index_name])

    def on_delete(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            try:
                self.conn.delete(index_name, node.type, node.key)
                self.conn.refresh([index_name])
            except exceptions.NotFoundException:
                pass

    def on_modify(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.delete(index_name, node.type, node.key)
                self.conn.index(index_dict, index_name, node.type, node.key)
                self.conn.refresh([index_name])
            except exceptions.NotFoundException:
                pass

    def get_indices_of_type(self, type):
        type_indices = [
            key for key, value in self.indices.items()
            if type in value
        ]
        return type_indices

    def populate_index_document(self, node, index_name):
        indexed_variables = self.indices[index_name][node.type]['properties'].keys()
        index_dict = {field: node[field] for field in indexed_variables}
        return index_dict
'''
Created on Jan 13, 2013

@author: Fang Jiaguo
'''
from pyes.es import ES
from pymongo.connection import Connection
import json

settings = json.load(open('settings.json', 'r'))
mongodb = Connection(settings['mongodb']['host'],
                     settings['mongodb']['port'])[settings['mongodb']['db']]
elasticsearch = ES(('http',
                    settings['elasticsearch']['host'],
                    settings['elasticsearch']['port']))
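# Hedged sketch (not part of the original module): one way the shared `mongodb` and
# `elasticsearch` connections above could be used together. The collection name
# 'movies', index name 'movies' and doc type 'movie' are hypothetical.
def copy_one_movie():
    doc = mongodb['movies'].find_one()
    if doc is None:
        return
    doc_id = str(doc.pop('_id'))  # ObjectId is not JSON-serializable; reuse it as the ES id
    elasticsearch.index(doc, 'movies', 'movie', doc_id)
    elasticsearch.refresh('movies')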
from pyes.es import ES
import random
import datetime

now = datetime.datetime.now()
es = ES()
for i in range(1000):
    d = {'created_at': now - datetime.timedelta(seconds=i),
         'level': 'ERROR' if random.random() < 0.1 else 'WARN',
         'message': "Test message"}
    es.index(d, 'my_index', 'my_type')
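# Hedged follow-up sketch (not in the original): querying the documents indexed by the
# loop above. The field and index names come from that loop; the query style is one
# option among several in pyes, and assumes the default analyzer lowercases 'ERROR'.
from pyes.query import TermQuery

es.refresh('my_index')  # make the freshly indexed documents visible to search
q = TermQuery('level', 'error')
results = es.search(query=q, indices='my_index', doc_types=['my_type'])
for hit in results[:10]:
    print hit['created_at'], hit['message']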
class FullTextSearch(object):
    def __init__(self, server, settings=None):
        self.conn = ES(server)
        self.indices = {}
        if settings:
            self.settings = settings
        else:
            self.settings = {
                'index': {
                    'analysis': {
                        'analyzer': {
                            'ngram_analyzer': {
                                'tokenizer': 'keyword',
                                'filter': ['lowercase', 'filter_ngram'],
                                'type': 'custom'
                            }
                        },
                        'filter': {
                            'filter_ngram': {
                                'type': 'nGram',
                                'max_gram': 30,
                                'min_gram': 1
                            }
                        }
                    }
                }
            }

    def search_index(self, type, index_names, query_string, num_results=-1):
        ns_index_names = [str(type) + "-_-" + index_name for index_name in index_names]
        q = WildcardQuery('_all', lower(query_string))
        results = self.conn.search(query=q, indices=ns_index_names, doc_types=type)
        num_found = len(results)
        if num_results > num_found:
            num_results = num_found
        nodelist = [self.datastore.get_node(type, r['_id'])
                    for r in results['hits']['hits']]
        if num_results != -1:
            return nodelist[0:num_results]
        else:
            return nodelist

    def create_index(self, type, indexed_variables, index_name):
        ns_index_name = str(type) + "-_-" + index_name
        self.conn.delete_index_if_exists(ns_index_name)
        self.conn.create_index(ns_index_name, self.settings)
        mapping = {}
        for arg in indexed_variables:
            mapping[arg] = {
                'boost': 1.0,
                'analyzer': 'ngram_analyzer',
                'type': u'string',
                'term_vector': 'with_positions_offsets'
            }
        index_settings = {
            'index_analyzer': 'ngram_analyzer',
            'search_analyzer': 'standard',
            'properties': mapping
        }
        self.conn.put_mapping(str(type), index_settings, [ns_index_name])
        self.refresh_index_cache()
        self.populate_index(type, index_name)

    def refresh_index_cache(self):
        self.indices = self.conn.get_indices()

    def delete_index(self, type, index_name):
        ns_index_name = str(type) + "-_-" + index_name
        self.conn.delete_index_if_exists(ns_index_name)
        self.refresh_index_cache()

    def populate_index(self, type, index_name):
        # add all the currently existing nodes into the index
        ns_index_name = str(type) + "-_-" + index_name
        ref_node = self.datastore.get_reference_node(type)
        node_list = [rel.target_node for rel in ref_node.instance.outgoing]
        mapping = self.conn.get_mapping(type, ns_index_name)
        for node in node_list:
            key = node.key
            index_dict = self.populate_index_document(type, ns_index_name,
                                                      node.attributes, mapping)
            try:
                self.conn.delete(ns_index_name, type, key)
            except exceptions.NotFoundException:
                pass
            try:
                self.conn.index(index_dict, ns_index_name, type, key)
            except exceptions.ElasticSearchParseException:
                pass
        self.conn.refresh([ns_index_name])

    def on_create(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for ns_index_name in type_indices:
            mapping = self.conn.get_mapping(node.type, ns_index_name)
            index_dict = self.populate_index_document(node.type, ns_index_name,
                                                      node.attributes, mapping)
            self.conn.index(index_dict, ns_index_name, node.type, node.key)
            self.conn.refresh([ns_index_name])

    def on_delete(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for ns_index_name in type_indices:
            try:
                self.conn.delete(ns_index_name, node.type, node.key)
                self.conn.refresh([ns_index_name])
            except exceptions.NotFoundException:
                pass

    def on_modify(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for ns_index_name in type_indices:
            mapping = self.conn.get_mapping(node.type, ns_index_name)
            index_dict = self.populate_index_document(node.type, ns_index_name,
                                                      node.attributes, mapping)
            try:
                self.conn.delete(ns_index_name, node.type, node.key)
                self.conn.index(index_dict, ns_index_name, node.type, node.key)
                self.conn.refresh([ns_index_name])
            except exceptions.NotFoundException:
                pass

    def get_indices_of_type(self, type):
        type_indices = []
        for index in self.indices.keys():
            if index.startswith(type + "-_-"):
                type_indices.append(index)
        return type_indices

    def populate_index_document(self, type, ns_index_name, attributes, mapping):
        indexed_variables = mapping[type]['properties'].keys()
        index_dict = {}
        for arg in indexed_variables:
            try:
                index_dict[arg] = attributes[arg]
            except KeyError:
                # if this attribute doesn't exist for this node, just pass
                pass
        return index_dict
database = 'processed'
resource_collection = mcm.get_collection(database, 'resources', Resource)

# message queue
mq_config = {
    'transport': 'socket',
    'protocol': 'binary',
    'host': 'localhost',
    'port': 9091
}
mq_client = message_queue_client_from_config(mq_config)
mq_codec = JSONCodec()
processed_resource_queue = 'processed_resources'

# ElasticSearch
es = ES('localhost:9200', timeout=60)
es_index = 'topic_tracking'

# dequeue one resource
mq_client.connect()
message = mq_client.get_message(processed_resource_queue)
resource = mq_codec.decode(message.body, Resource)
mq_client.delete_message(processed_resource_queue, message.id)
mq_client.disconnect()

# save the resource to mongo
resource._id = makeIdFromURI(resource.uri)
resource_collection.insert_model(resource)

# index the resource
for boost in [1, 1000]:
class ESDataSource(DataSource):
    """docstring for ClassName"""

    def __init__(self, document_type, index=None, host=None, port=None,
                 expand=False, **elasticsearch_args):
        """Creates a ElasticSearch data source stream.

        :Attributes:
            * document_type: elasticsearch document_type name
            * index: index name, default is test
            * host: elasticsearch database server host, default is ``localhost``
            * port: elasticsearch port, default is ``27017``
            * expand: expand dictionary values and treat children as top-level keys
              with dot '.' separated key path to the child..
        """
        super(ESDataSource, self).__init__()
        self.document_type = document_type
        self.index = index or "test"
        self.host = host or "127.0.0.1"
        self.port = port or "9200"
        self.elasticsearch_args = elasticsearch_args
        self.expand = expand
        self.connection = None
        self._fields = None

    def initialize(self):
        """Initialize ElasticSearch source stream."""
        args = self.elasticsearch_args.copy()
        server = ""
        if self.host:
            server = self.host
        if self.port:
            server += ":" + self.port
        self.connection = ES(server, **args)
        self.connection.default_indices = self.index
        self.connection.default_types = self.document_type

    def read_fields(self, limit=0, collapse=False):
        keys = []
        probes = {}

        def probe_record(record, parent=None):
            for key, value in record.items():
                if parent:
                    full_key = parent + "." + key
                else:
                    full_key = key
                if self.expand and type(value) == dict:
                    probe_record(value, full_key)
                    continue
                if not full_key in probes:
                    probe = FieldTypeProbe(full_key)
                    probes[full_key] = probe
                    keys.append(full_key)
                else:
                    probe = probes[full_key]
                probe.probe(value)

        for record in self.document_type.find(limit=limit):
            probe_record(record)

        fields = []
        for key in keys:
            probe = probes[key]
            field = Field(probe.field)
            storage_type = probe.unique_storage_type
            if not storage_type:
                field.storage_type = "unknown"
            elif storage_type == "unicode":
                field.storage_type = "string"
            else:
                field.storage_type = "unknown"
                field.concrete_storage_type = storage_type
            # FIXME: Set analytical type
            fields.append(field)

        self._fields = list(fields)
        return self._fields

    def rows(self):
        if not self.connection:
            raise RuntimeError("Stream is not initialized")
        from pyes.query import MatchAllQuery
        fields = self.field_names
        results = self.connection.search(MatchAllQuery(), search_type="scan",
                                         timeout="5m", size="200")
        return ESRowIterator(results, fields)

    def records(self):
        if not self.connection:
            raise RuntimeError("Stream is not initialized")
        from pyes.query import MatchAllQuery
        results = self.connection.search(MatchAllQuery(), search_type="scan",
                                         timeout="5m", size="200")
        return ESRecordIterator(results, self.expand)
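# Hedged usage sketch (not in the original source): constructing and draining an
# ESDataSource. The index/document_type values are hypothetical, and records()
# relies on the ESRecordIterator helper defined alongside the class.
ds = ESDataSource('my_type', index='my_index', host='127.0.0.1', port='9200')
ds.initialize()
for record in ds.records():  # iterates the scan/scroll result set as dicts
    print record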
import sys
#sys.path.insert(0, "../")
#from pyes import ES
from pyes.es import ES
from datetime import datetime
import shelve

conn = ES('127.0.0.1:9500')
#conn = ES('192.168.2.50:9200')
try:
    conn.delete_index("test-index")
except:
    pass

dataset = shelve.open("samples.shelve")

mapping = {u'description': {'boost': 1.0,
                            'index': 'analyzed',
                            'store': 'yes',
                            'type': u'string',
                            "term_vector": "with_positions_offsets"},
           u'name': {'boost': 1.0,
                     'index': 'analyzed',
                     'store': 'yes',
                     'type': u'string',
                     "term_vector": "with_positions_offsets"},
           u'age': {'store': 'yes',
                    'type': u'integer'},
class ESDataSource(base.DataSource):
    """docstring for ClassName"""

    def __init__(self, document_type, database=None, host=None, port=None,
                 expand=False, **elasticsearch_args):
        """Creates a ElasticSearch data source stream.

        :Attributes:
            * document_type: elasticsearch document_type name
            * database: database name
            * host: elasticsearch database server host, default is ``localhost``
            * port: elasticsearch port, default is ``27017``
            * expand: expand dictionary values and treat children as top-level keys
              with dot '.' separated key path to the child..
        """
        self.document_type = document_type
        self.database_name = database
        self.host = host
        self.port = port
        self.elasticsearch_args = elasticsearch_args
        self.expand = expand
        self.connection = None
        self._fields = None

    def initialize(self):
        """Initialize ElasticSearch source stream."""
        args = self.elasticsearch_args.copy()
        server = ""
        if self.host:
            server = self.host
        if self.port:
            server += ":" + self.port
        self.connection = ES(server, **args)
        self.connection.default_indices = self.database_name
        self.connection.default_types = self.document_type

    def read_fields(self, limit=0):
        keys = []
        probes = {}

        def probe_record(record, parent=None):
            for key, value in record.items():
                if parent:
                    full_key = parent + "." + key
                else:
                    full_key = key
                if self.expand and type(value) == dict:
                    probe_record(value, full_key)
                    continue
                if not full_key in probes:
                    probe = dq.FieldTypeProbe(full_key)
                    probes[full_key] = probe
                    keys.append(full_key)
                else:
                    probe = probes[full_key]
                probe.probe(value)

        for record in self.document_type.find(limit=limit):
            probe_record(record)

        fields = []
        for key in keys:
            probe = probes[key]
            field = base.Field(probe.field)
            storage_type = probe.unique_storage_type
            if not storage_type:
                field.storage_type = "unknown"
            elif storage_type == "unicode":
                field.storage_type = "string"
            else:
                field.storage_type = "unknown"
                field.concrete_storage_type = storage_type
            # FIXME: Set analytical type
            fields.append(field)

        self.fields = list(fields)
        return self.fields

    def rows(self):
        if not self.connection:
            raise RuntimeError("Stream is not initialized")
        from pyes.query import MatchAllQuery
        fields = self.fields.names()
        results = self.connection.search(MatchAllQuery(), search_type="scan",
                                         timeout="5m", size="200")
        return ESRowIterator(results, fields)

    def records(self):
        if not self.connection:
            raise RuntimeError("Stream is not initialized")
        from pyes.query import MatchAllQuery
        results = self.connection.search(MatchAllQuery(), search_type="scan",
                                         timeout="5m", size="200")
        return ESRecordIterator(results, self.expand)
        'size': 2,
        'explain': explain
    }
    return query


if __name__ == '__main__':
    explain = True

    # MongoDB
    host = 'localhost'
    port = 27017
    mcm = MongoConnectionManager(host, port, MongoCodec())
    database = 'processed'
    resource_collection = mcm.get_collection(database, 'resources', Resource)

    # ElasticSearch
    es = ES('localhost:9200', timeout=60)
    es_index = 'topic_tracking'

    # find the same resource
    resource = resource_collection.find_one_model()
    query = build_query(resource, explain)
    result = es.search(query, es_index, 'resource')
    for r in result['hits']['hits']:
        pprint(r)

    print('Tested resource %s: %s' % (resource._id, resource.uri))
class ESDataTarget(base.DataTarget):
    """docstring for ClassName"""

    def __init__(self, document_type, database="test", host="127.0.0.1", port="9200",
                 truncate=False, expand=False, **elasticsearch_args):
        """Creates a ElasticSearch data target stream.

        :Attributes:
            * document_type: ElasticSearch document_type name
            * database: database name
            * host: ElasticSearch database server host, default is ``localhost``
            * port: ElasticSearch port, default is ``9200``
            * expand: expand dictionary values and treat children as top-level keys
              with dot '.' separated key path to the child..
            * truncate: delete existing data in the document_type. Default: False
        """
        self.document_type = document_type
        self.database_name = database
        self.host = host
        self.port = port
        self.elasticsearch_args = elasticsearch_args
        self.expand = expand
        self.truncate = truncate
        self._fields = None

    def initialize(self):
        """Initialize ElasticSearch source stream."""
        from pyes.es import ES
        from pyes.exceptions import IndexAlreadyExistsException

        args = self.elasticsearch_args.copy()
        server = ""
        if self.host:
            server = self.host
        if self.port:
            server += ":" + self.port
        create = args.pop("create", False)
        replace = args.pop("replace", False)
        self.connection = ES(server, **args)
        self.connection.default_indices = self.database_name
        self.connection.default_types = self.document_type

        created = False
        if create:
            try:
                self.connection.create_index(self.database_name)
                self.connection.refresh(self.database_name)
                created = True
            except IndexAlreadyExistsException:
                pass

        if replace and not created:
            self.connection.delete_index_if_exists(self.database_name)
            time.sleep(2)
            self.connection.create_index(self.database_name)
            self.connection.refresh(self.database_name)

        if self.truncate:
            self.connection.delete_mapping(self.database_name, self.document_type)
            self.connection.refresh(self.database_name)

    def append(self, obj):
        record = obj
        if not isinstance(obj, dict):
            record = dict(zip(self.fields.names(), obj))
        if self.expand:
            record = expand_record(record)
        id = record.get('id') or record.get('_id')
        self.connection.index(record, self.database_name, self.document_type, id, bulk=True)

    def finalize(self):
        self.connection.flush_bulk(forced=True)
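# Hedged usage sketch (not in the original source): pushing a couple of records
# through ESDataTarget. The database name, document_type and record fields are
# hypothetical; create=True travels via **elasticsearch_args and is popped in
# initialize() before the remaining kwargs reach ES().
target = ESDataTarget('person', database='people-db', create=True)
target.initialize()
target.append({'id': 1, 'name': 'Alice', 'age': 30})
target.append({'id': 2, 'name': 'Bob', 'age': 25})
target.finalize()  # flush the pending bulk operations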
def _get_conn(*args, **kwargs):
    _conn = ES(settings.es_hosts, *args, **kwargs)
    _conn.default_indices = settings.es_index
    return _conn
class FullTextSearch(object):
    def __init__(self, server, settings=None):
        # These timeout and bulk_size parameters were determined through
        # trial and error to be necessary to avoid timeout errors when
        # generating indices on Sandbox. They should not be taken as gospel.
        self.conn = ES(server, timeout=120.0)  # Default timeout: 30.0
        self.conn.bulker.bulk_size = 25        # Default: 400
        if settings:
            self.settings = settings
        else:
            self.settings = {
                'index': {
                    'analysis': {
                        'analyzer': {
                            'ngram_analyzer': {
                                'tokenizer': 'keyword',
                                'filter': ['lowercase', 'filter_ngram'],
                                'type': 'custom'
                            }
                        },
                        'filter': {
                            'filter_ngram': {
                                'type': 'nGram',
                                'max_gram': 30,
                                'min_gram': 1
                            }
                        }
                    }
                }
            }
        self.refresh_index_cache()

    def search_index_text(self, query_string, fields="_all", **args):
        q = query.MatchQuery(fields, query_string)
        return self.search_index(q, **args)

    def search_index(self, query, indices=None, num_results=None, node_type=None):
        results = self.conn.search(query=query, indices=indices, doc_types=node_type)
        meta_list = [r.get_meta() for r in results[0:num_results]]
        node_dict = {}
        # fetch nodes grouped by type to reduce number of db calls
        key = itemgetter('type')
        for t, grouped_list in groupby(sorted(meta_list, key=key), key=key):
            ids = [meta['id'] for meta in grouped_list]
            for node in self.datastore.get_nodes(t, ids):
                node_dict[(node.type, node.key)] = node
        # return nodes in original order
        nodelist = [node_dict[(meta['type'], meta['id'])] for meta in meta_list]
        return nodelist

    def create_index(self, type, indexed_variables, index_name):
        self.conn.indices.create_index_if_missing(index_name, self.settings)
        mapping = {}
        for arg in indexed_variables:
            mapping[arg] = {'boost': 1.0,
                            'analyzer': 'ngram_analyzer',
                            'type': 'string',
                            'term_vector': 'with_positions_offsets'}
        index_settings = {'index_analyzer': 'ngram_analyzer',
                          'search_analyzer': 'standard',
                          'properties': mapping}
        self.conn.indices.put_mapping(str(type), index_settings, [index_name])
        self.refresh_index_cache()
        self.populate_index(type, index_name)

    def refresh_index_cache(self):
        try:
            indices = self.conn.indices.get_mapping(raw=True)
        except exceptions.IndexMissingException:
            indices = {}
        else:
            indices = dict((k, v.get('mappings', {})) for k, v in indices.items())
        self.indices = indices

    def delete_index(self, index_name):
        self.conn.indices.delete_index_if_exists(index_name)
        self.refresh_index_cache()

    def populate_index(self, type, index_name):
        # add all the currently existing nodes into the index
        ref_node = self.datastore.get_reference_node(type)
        node_list = [rel.target_node for rel in ref_node.instance.outgoing]
        for node in node_list:
            key = node.key
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.delete(index_name, type, key)
            except ELASTIC_SEARCH_EXCEPTIONS as err:
                log.exception(err)
            self.conn.index(index_dict, index_name, type, key, bulk=True)
        self.conn.indices.refresh([index_name])

    def on_create(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.index(index_dict, index_name, node.type, node.key, bulk=True)
                self.conn.indices.refresh([index_name])
            except ELASTIC_SEARCH_EXCEPTIONS as err:
                log.exception(err)

    def on_delete(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            try:
                self.conn.delete(index_name, node.type, node.key, bulk=True)
                self.conn.indices.refresh([index_name])
            except ELASTIC_SEARCH_EXCEPTIONS as err:
                log.exception(err)

    def on_modify(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.delete(index_name, node.type, node.key)
                self.conn.index(index_dict, index_name, node.type, node.key, bulk=True)
                self.conn.indices.refresh([index_name])
            except ELASTIC_SEARCH_EXCEPTIONS as err:
                log.exception(err)

    def get_indices_of_type(self, type):
        type_indices = [
            key for key, value in self.indices.items()
            if type in value
        ]
        return type_indices

    def populate_index_document(self, node, index_name):
        indexed_variables = self.indices[index_name][node.type]['properties'].keys()
        index_dict = {field: node[field] for field in indexed_variables}
        return index_dict
'''
Created on May 25, 2013

@author: yapianyu
'''
from bson.objectid import ObjectId
from difflib import SequenceMatcher
from pyes.es import ES
from pyes.query import MultiMatchQuery, Search
from pymongo.connection import Connection
import collections
import datetime

ml_100k_folder = '/home/yapianyu/Desktop/movielens/ml-100k/'
ml_10m_folder = '/home/yapianyu/Desktop/movielens/ml-10M100K/'
mongodb = Connection('127.0.0.1', 27017)['right-channel']
elasticsearch = ES(('http', '127.0.0.1', 9200))


def count_movie_num_each_year():
    movie_num = {}
    f = open(ml_10m_folder + 'movies.dat')
    for line in f:
        year = int(line.split('::')[1][-5:-1])
        if year in movie_num:
            movie_num[year] += 1
        else:
            movie_num[year] = 1
    d = collections.OrderedDict(sorted(movie_num.items(), key=lambda t: -t[0]))
    for year, num in d.items():
        print year, '\t', num
def index(request):
    es = ES(settings.ES_SERVER)
    graphs = Graph.objects.all()
    for graph in graphs:
        graph.count = es.count(parse(graph.query))['count']
    return render_to_response('dashboard/index.html', {'graphs': graphs})
import sys
#sys.path.insert(0, "../")
#from pyes import ES
from pyes.es import ES
from datetime import datetime
import shelve

conn = ES('127.0.0.1:9500')
#conn = ES('192.168.2.50:9200')
try:
    conn.delete_index("test-index")
except:
    pass

dataset = shelve.open("samples.shelve")

mapping = {
    u'description': {
        'boost': 1.0,
        'index': 'analyzed',
        'store': 'true',
        'type': u'string',
        "term_vector": "with_positions_offsets"
    },
    u'name': {
        'boost': 1.0,
        'index': 'analyzed',
        'store': 'true',
        'type': u'string',
        "term_vector": "with_positions_offsets"
    odict = obj.as_dict()
    if isinstance(obj, (mappings.DocumentObjectField, mappings.ObjectField, mappings.NestedObject)):
        properties = odict.pop("properties", [])
        doc_count += 1
        kwargs = ["name=%r" % obj.name, "type=%r" % odict.pop("type")] + \
                 ["%s=%r" % (k, odict[k]) for k in sorted(odict.keys())]
        result.append("doc%d=" % doc_count +
                      str(type(obj)).split(".")[-1].strip("'>") +
                      "(" + ', '.join(kwargs) + ")")
        for k in sorted(obj.properties.keys()):
            result.extend(mappings_to_code(obj.properties[k], doc_count))
    else:
        kwargs = ["name=%r" % obj.name, "type=%r" % odict.pop("type"),
                  "store=%r" % obj.store, "index=%r" % odict.pop("index")] + \
                 ["%s=%r" % (k, odict[k]) for k in sorted(odict.keys())]
        result.append("doc%d.add_property(" % doc_count +
                      str(type(obj)).split(".")[-1].strip("'>") + "(" +
                      ', '.join(kwargs) + "))")
    return result


if __name__ == '__main__':
    es = ES("192.168.1.1:9200")
    res = mappings_to_code(es.mappings.get_doctype("twitter", "twitter"))
    print("\n".join(res))
from topic_tracking.model_management.helper.index import IndexHelper
from topic_tracking.util.codec.mongo_codec import MongoCodec
from topic_tracking.util.mongo import MongoConnectionManager

if __name__ == '__main__':
    # MongoDB
    host = 'localhost'
    port = 27017
    mcm = MongoConnectionManager(host, port, MongoCodec())
    database = 'processed'
    resource_collection = mcm.get_collection(database, 'resources', Resource)

    # ElasticSearch
    es = ES('localhost:9200', timeout=60)
    resource_index = 'topic_tracking_resources'
    story_index = 'topic_tracking_stories'

    # utilities
    index_helper = IndexHelper(es, resource_index, story_index)

    # get a resource from mongo
    resource = resource_collection.find_one_model()

    # analyze the resource
    term_string = index_helper._build_payload_string(resource.terms)
    pprint(term_string)

    params = {}
    params['text'] = term_string
    response = es._send_request('GET', resource_index + '/_analyze', None, params)