def multi_param_search(request):
    log_results = None
    es = ES()  # create the Elasticsearch connection object
    if request.method == 'POST':  # the search form was submitted
        filters_list = []
        # loop over each search parameter and, if it has a value, add it to the filter list
        for param in ["version", "ip_header_length", "ttl", "protocol",
                      "source_address", "destination_address", "source_port",
                      "dest_port", "sequence_number", "acknowledgement",
                      "tcp_header_length", "data", "datetime"]:
            if request.POST.get(param) != '':
                q_param = TermFilter(param, request.POST.get(param))
                filters_list.append(q_param)
        if len(filters_list) != 0:  # if there are filter params, fetch the results
            andq = ANDFilter(filters_list)
            q = FilteredQuery(MatchAllQuery(), andq)
            log_results = es.search(q, indices=index_name, doc_types=type_name)
        else:
            log_results = None
    elif request.method == 'GET':  # return all packets when the search page is first loaded
        log_results = es.search(MatchAllQuery(), indices=index_name, doc_types=type_name)
    return render(request, 'multi_param_search.html', {'log_results': log_results})
class BaseElasticSearchClient(BaseClient):

    def __init__(self, servers, index):
        """
        @param servers: Make sure to include the port with the server address
        @param index: Document index
        @return:
        """
        super(BaseElasticSearchClient, self).__init__()
        self.connection = None
        self.servers = servers
        self.index = index if type(index) is list else [index]

    def connect(self, connection_pool=1):
        update_connection_pool(connection_pool)
        try:
            self.connection = ES(self.servers)
        except NoServerAvailable:
            self._log.error('Failed to connect to elastic search server')
            return False
        return True

    def close(self):
        self.connection = None

    def _create_term_query(self, must_list):
        # TODO: add remaining conditional list functionality.
        query = BoolQuery()
        for term in must_list:
            query.add_must(term)
        return query

    def find_term(self, name, value, size=10):
        if not self.connection:
            return
        query = TermQuery(name, value)
        return self.connection.search(query=Search(query, size=size), indices=self.index)

    def find(self, filter_terms, size=10, doc_types=None):
        if not self.connection:
            return
        query = self._create_term_query(must_list=filter_terms)
        return self.connection.search(query=Search(query, size=size),
                                      indices=self.index, doc_types=doc_types)

    def find_one(self, filter_terms, size=10, doc_types=None):
        if not self.connection:
            return
        results = self.find(filter_terms=filter_terms, size=size, doc_types=doc_types)
        return results[0] if len(results) > 0 else None
def facets(host='localhost:9200', facet_terms=['bibleverse'], _type='habakkuk',
           date_filter=[], size=10):
    ret = {}
    conn = ES(host)
    q = MatchAllQuery()
    if date_filter:
        start, end = date_filter
        q = FilteredQuery(q, RangeFilter(qrange=ESRange('created_at_date',
                                                        start.isoformat(),
                                                        end.isoformat(),
                                                        include_upper=False)))

    q = q.search(size=0)
    for term in facet_terms:
        q.facet.add_term_facet(term, order='count', size=size)

    es_logger.info(q.serialize())
    resultset = conn.search(query=q, indices=_type + '-*', doc_types=[_type])
    for facet in resultset.facets:
        ret[facet] = []
        for row in resultset.facets[facet]['terms']:
            ret[facet].append({"value": row['term'], "count": row['count']})

    logger.debug("facets return|'%s'" % json.dumps(ret))
    return ret
def term_facet(host='localhost:9200', terms=['bibleverse'], _type='habakkuk',
               date_filter=[], size=10):
    ret = []
    conn = ES(host)
    q = MatchAllQuery()
    if date_filter:
        start, end = date_filter
        q = FilteredQuery(q, RangeFilter(qrange=ESRange('created_at_date', start, end,
                                                        include_upper=False)))

    q = q.search(size=0)
    for term in terms:
        q.facet.add_term_facet(term, order='count', size=size)

    print json.dumps(json.loads(q.to_search_json()), indent=2)
    resultset = conn.search(query=q, indices=_type + '-*', doc_types=[_type])
    for facet in resultset.facets:
        print "Total", facet, resultset.facets[facet]['total']
        for row in resultset.facets[facet]['terms']:
            print "\t", row['term'], row['count']
            ret.append((facet, row['term']))

    return ret
def single_param_search(request):
    log_results = None
    es = ES()  # create the Elasticsearch connection object
    if request.method == 'POST':  # the search form was submitted
        # filter by the chosen search field and search term
        q1 = TermFilter(request.POST.get('searchby'), request.POST.get('searchterm'))
        orq = ORFilter([q1])
        q = FilteredQuery(MatchAllQuery(), orq)
        # get the filtered data from Elasticsearch
        log_results = es.search(q, indices=index_name, doc_types=type_name)
    elif request.method == 'GET':  # return all packets when the search page is first loaded
        log_results = es.search(MatchAllQuery(), indices=index_name, doc_types=type_name)
    return render(request, 'single_param_search.html', {'log_results': log_results})
def find_BID_in_SBN(bid, es_server="localhost:9200"):
    sbn_bid = to_iccu_bid(bid)
    q = TermQuery('codiceIdentificativo', sbn_bid)
    es_conn = ES(server=es_server)
    resultset = list(es_conn.search(query=q, indices="iccu"))
    if len(resultset) > 0:
        return resultset
    else:
        return None
def search_people_by_bio(query, limit_results=DEFAULT_LIMIT, index=['onename_people_index']):
    """ Query the Lucene index for the nearest matches; the output is a list of
        profile usernames.
    """
    from pyes import QueryStringQuery, ES

    conn = ES()

    q = QueryStringQuery(query,
                         search_fields=['username', 'profile_bio'],
                         default_operator='and')

    results = conn.search(query=q, size=20, indices=index)
    count = conn.count(query=q)
    count = count.count

    # using 'or' gives more results, but result quality goes down
    if count == 0:
        q = QueryStringQuery(query,
                             search_fields=['username', 'profile_bio'],
                             default_operator='or')
        results = conn.search(query=q, size=20, indices=index)

    results_list = []
    counter = 0

    for profile in results:
        username = profile['username']
        results_list.append(username)
        counter += 1

        if counter == limit_results:
            break

    return results_list
class ESPages():
    '''
    For use with Django's paginator.
    Currently not used after the pyes update implemented ResultSet, which provides
    the count, __getitem__, and __len__ methods required by Django's paginator.
    '''
    def __init__(self, es_query, **kwargs):
        ''' Make the initial ES query. '''
        self.conn = ES(settings.ES_HOST[0], timeout=10.0)
        self.es_query = es_query
        res = self.conn.search(query=self.es_query, size='0', **kwargs)
        self.total_hits = res['hits']['total']

    def count(self):
        return self.total_hits

    def __getitem__(self, q_slice):
        ''' Make an ES query for a range of hits. '''
        q = self.es_query.search(start=str(q_slice.start),
                                 size=str(q_slice.stop - q_slice.start + 1))
        res = self.conn.search(q)
        return res['hits']['hits']

    def __len__(self):
        return self.count()
def search(searchkey=u"电影"): conn = ES('127.0.0.1:9200') # TextQuery会对searchkey进行分词 qtitle = TextQuery("title", searchkey) h = HighLighter(['<b>'], ['</b>'], fragment_size=500) # 多字段搜索(must=>and,should=>or),高亮,结果截取(分页),排序 q = Search(BoolQuery(should=[qtitle]), highlight=h, start=0, size=3, sort={'id': {'order': 'asc'}}) q.add_highlight("title") results = conn.search(q, "zhihu", "answer") list = [] for r in results: if("title" in r._meta.highlight): r['title'] = r._meta.highlight[u"title"][0] list.append(r) return template('results.html', list=list, count=results.total)
def search(searchkey=u"电影"): conn = ES('127.0.0.1:9200') # TextQuery会对searchkey进行分词 qtitle = TextQuery("title", searchkey) h = HighLighter(['<b>'], ['</b>'], fragment_size=500) # 多字段搜索(must=>and,should=>or),高亮,结果截取(分页),排序 q = Search(BoolQuery(should=[qtitle]), highlight=h, start=0, size=3, sort={'id': { 'order': 'asc' }}) q.add_highlight("title") results = conn.search(q, "zhihu", "answer") list = [] for r in results: if ("title" in r._meta.highlight): r['title'] = r._meta.highlight[u"title"][0] list.append(r) return template('results.html', list=list, count=results.total)
class KVStore(KVStoreBase):
    def __init__(self, *args, **kwargs):
        super(KVStore, self).__init__(*args, **kwargs)
        self.connection = ES(settings.THUMBNAIL_ELASTIC_SEARCH_SERVERS)

    def _get_raw(self, key):
        try:
            value = self.connection.get(settings.THUMBNAIL_ELASTIC_SEARCH_INDEX,
                                        settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE,
                                        key)
            return value['_source']['value']
        except:
            return None

    def _set_raw(self, key, value):
        ret = self.connection.index({"value": value},
                                    settings.THUMBNAIL_ELASTIC_SEARCH_INDEX,
                                    settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE,
                                    key)
        return ret['ok']

    def _delete_raw(self, *keys):
        rets = []
        for key in keys:
            try:
                ret = self.connection.delete(settings.THUMBNAIL_ELASTIC_SEARCH_INDEX,
                                             settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE,
                                             key)
                rets.append(ret['ok'])
            except:
                rets.append(False)
        return rets

    def _find_keys_raw(self, prefix):
        search = Search(query=PrefixQuery("_id", prefix), size=1000, start=0, fields=[])
        results = self.connection.search(search,
                                         indexes=[settings.THUMBNAIL_ELASTIC_SEARCH_INDEX, ],
                                         doc_types=[settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE, ])
        return [hit['_id'] for hit in results['hits']['hits']]
from pyes import ES

es = ES()
index_name = "my_index"
type_name = "my_type"

from utils_pyes import create_and_add_mapping, populate

create_and_add_mapping(es, index_name, type_name)
populate(es, index_name, type_name)

from pyes.query import *

q = MatchAllQuery()
q = q.search()
q.facet.add_term_facet('tag')
results = es.search(indices=index_name, doc_types=type_name, query=q)

from pyes.facets import *

q = MatchAllQuery()
q = q.search()
q.facet.facets.append(DateHistogramFacet('date_facet', field='date', interval='month'))
results = es.search(indices=index_name, doc_types=type_name, query=q)

es.indices.delete_index(index_name)
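Several of these tutorial snippets import create_and_add_mapping and populate from a local utils_pyes module that is not shown. A minimal sketch of what those helpers might look like, assuming a simple document shape with the field names the examples query (name, parsedtext, position, date, tag):

# Hypothetical utils_pyes helpers assumed by the tutorial snippets; the field
# names and sample documents are illustrative, not confirmed by the source.
from datetime import datetime
from pyes.exceptions import IndexAlreadyExistsException


def create_and_add_mapping(es, index_name, type_name):
    """Create the index (if needed) and register a simple mapping for it."""
    try:
        es.indices.create_index(index_name)
    except IndexAlreadyExistsException:
        pass
    mapping = {
        'name': {'type': 'string', 'index': 'not_analyzed'},
        'parsedtext': {'type': 'string', 'index': 'analyzed'},
        'position': {'type': 'integer'},
        'date': {'type': 'date'},
        'tag': {'type': 'string', 'index': 'not_analyzed'},
    }
    es.indices.put_mapping(type_name, {'properties': mapping}, [index_name])


def populate(es, index_name, type_name):
    """Index a few sample documents and refresh so they are searchable."""
    docs = [
        {'name': 'joe', 'parsedtext': 'Joe testing data', 'position': 1,
         'date': datetime(2013, 1, 1), 'tag': 'foo'},
        {'name': 'bill', 'parsedtext': 'Bill testing data', 'position': 2,
         'date': datetime(2013, 2, 1), 'tag': 'bar'},
    ]
    for i, doc in enumerate(docs, start=1):
        es.index(doc, index_name, type_name, id=str(i))
    es.indices.refresh(index_name)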
class Elastic(object):
    def init_app(self, app):
        self.conn = ES(app.config['ELASTIC_URL'], timeout=2)
        #self.remote_conns = [ES(url) for url in app.config['REMOTE_ELASTIC_URL']]

    def search(self, start=0, size=20, doc_types='resource', indices='order_index',
               sort=None, **kwargs):
        # set filters
        filters = []
        for k, v in kwargs.items():
            if k and k != 'complete_time':
                filters.append(TermFilter(k, v))
            elif k and v != '' and k == 'complete_time':
                ct = kwargs['complete_time']
                if len(ct) == 2:
                    filters.append(RangeFilter(ESRange('complete_time',
                                                       from_value=ct[0], to_value=ct[1])))
                else:
                    filters.append(RangeFilter(ESRange('complete_time', from_value=ct[0])))
        _filter = None
        if filters:
            _filter = ANDFilter(filters)
        bq = MatchAllQuery()
        # filtered query
        q = FilteredQuery(bq, _filter)
        # sorting
        if sort:
            sf = SortFactory()
            for s in sort:
                sf.add(s)
            s = Search(q, sort=sf)
        else:
            s = Search(q)
        # result
        return self.conn.search(s, indices=indices, doc_types=doc_types,
                                start=start, size=size)

    def delete(self, index='order_index', doc_type='resource', id=''):
        return self.conn.delete(index=index, doc_type=doc_type, id=id)

    def create(self, index='order_index', doc_type='resource', doc=None):
        # try:
        #     self.delete(index, doc_type, doc['id'])
        # except NotFoundException:
        #     pass
        try:
            return self.conn.index(doc, index, doc_type, id=doc['id'])
        except:  # no connection
            pass

    def multi_create(self, index='order_index', doc_type='resource', doc=None):
        """If the cache has to be synced to the remote cluster, use celery."""
        try:
            return self.conn.index(doc, index, doc_type, id=doc['id'])
        except:  # no connection
            pass
        try:
            for rconn in self.remote_conns:
                rconn.index(doc, index, doc_type, id=doc['id'])
        except:
            print '--------sync cache to remote error------'
class BaseElasticSearchClient(BaseClient):

    def __init__(self, servers, index=None):
        """
        @param servers: Make sure to include the port with the server address
        @param index: Document index
        @return:
        """
        super(BaseElasticSearchClient, self).__init__()
        self.connection = None
        self.servers = servers

        if index is not None:
            self.index = index if type(index) is list else [index]

    def connect(self, connection_pool=1, bulk_size=10):
        update_connection_pool(connection_pool)
        try:
            self.connection = ES(self.servers, bulk_size=bulk_size)
        except NoServerAvailable:
            self._log.error('Failed to connect to elastic search server')
            return False
        return True

    def close(self):
        self.connection = None

    def _create_term_query(self, must_list):
        # TODO: add remaining conditional list functionality.
        query = BoolQuery()
        for term in must_list:
            query.add_must(term)
        return query

    def refresh_index(self, index_name, wait=1):
        self._log.info('ES: Refreshing index {0}'.format(index_name))
        self.connection.indices.refresh(index_name, timesleep=wait)

    def has_index(self, index_name):
        self._log.info('ES: Checking for index {0}'.format(index_name))
        try:
            self.connection.status(index_name)
        except IndexMissingException:
            return False
        return True

    def wait_for_index(self, index_name, wait=30):
        """ Checks to see if an index exists.
        Checks every second for int(X) seconds and returns True if successful
        """
        for i in range(0, int(wait)):
            if self.has_index(index_name):
                return True
            sleep(1)
        return False

    def wait_for_messages(self, name, value, num=1, index=None, max_wait=30):
        """ Waits for a specific number of messages to be returned within a
        specified amount of time.
        Checks every second for {max_wait} seconds and returns a list of msgs
        """
        for i in range(0, int(max_wait)):
            msgs = self.find_term(name=name, value=value, size=1, index=index)
            if len(msgs) == num:
                return msgs
            sleep(1)
        return []

    def delete_index(self, index_name):
        self._log.info('ES: Deleting index {0}'.format(index_name))
        self.connection.delete_index(index_name)

    def find_term(self, name, value, size=10, index=None):
        if not self.connection:
            return
        query = TermQuery(name, value)
        return self.connection.search(query=Search(query, size=size),
                                      indices=index or self.index)

    def find(self, filter_terms, size=10, doc_types=None, index=None):
        if not self.connection:
            return
        query = self._create_term_query(must_list=filter_terms)
        return self.connection.search(query=Search(query, size=size),
                                      indices=index or self.index,
                                      doc_types=doc_types)

    def find_one(self, filter_terms, doc_types=None, index=None):
        if not self.connection:
            return
        results = self.find(filter_terms=filter_terms, size=1,
                            doc_types=doc_types, index=index)
        return results[0] if len(results) > 0 else None
class DocManager(): """The DocManager class creates a connection to the backend engine and adds/removes documents, and in the case of rollback, searches for them. The reason for storing id/doc pairs as opposed to doc's is so that multiple updates to the same doc reflect the most up to date version as opposed to multiple, slightly different versions of a doc. We are using elastic native fields for _id and ns, but we also store them as fields in the document, due to compatibility issues. """ def __init__(self, url, auto_commit=True, unique_key='_id'): """Verify Elastic URL and establish a connection. """ if verify_url(url) is False: raise SystemError self.elastic = ES(server=url) self.auto_commit = auto_commit self.doc_type = 'string' # default type is string, change if needed self.unique_key = unique_key if auto_commit: self.run_auto_commit() def stop(self): """ Stops the instance """ self.auto_commit = False def upsert(self, doc): """Update or insert a document into Elastic If you'd like to have different types of document in your database, you can store the doc type as a field in Mongo and set doc_type to that field. (e.g. doc_type = doc['_type']) """ doc_type = self.doc_type index = doc['ns'] doc[self.unique_key] = str(doc[self.unique_key]) doc_id = doc[self.unique_key] id_query = TextQuery('_id', doc_id) elastic_cursor = self.elastic.search(query=id_query, indices=index) try: self.elastic.index(bsjson.dumps(doc), index, doc_type, doc_id) except ValueError: logging.info("Could not update %s" % (doc,)) self.elastic.refresh() def remove(self, doc): """Removes documents from Elastic The input is a python dictionary that represents a mongo document. """ try: self.elastic.delete(doc['ns'], 'string', str(doc[self.unique_key])) except (NotFoundException, TypeMissingException, IndexMissingException): pass def _remove(self): """For test purposes only. Removes all documents in test.test """ try: self.elastic.delete('test.test', 'string', '') except (NotFoundException, TypeMissingException, IndexMissingException): pass def search(self, start_ts, end_ts): """Called to query Elastic for documents in a time range. """ res = ESRange('_ts', from_value=start_ts, to_value=end_ts) results = self.elastic.search(RangeQuery(res)) return results def _search(self): """For test purposes only. Performs search on Elastic with empty query. Does not have to be implemented. """ results = self.elastic.search(MatchAllQuery()) return results def commit(self): """This function is used to force a refresh/commit. """ retry_until_ok(self.elastic.refresh) def run_auto_commit(self): """Periodically commits to the Elastic server. """ self.elastic.refresh() if self.auto_commit: Timer(1, self.run_auto_commit).start() def get_last_doc(self): """Returns the last document stored in the Elastic engine. """ result = self.elastic.search(MatchAllQuery(), size=1, sort='_ts:desc') for item in result: return item
from pyes import ES

es = ES()
index_name = "my_index"
type_name = "my_type"

from utils_pyes import create_and_add_mapping, populate

create_and_add_mapping(es, index_name, type_name)
populate(es, index_name, type_name)

from pyes.query import *
from pyes.aggs import *

q = MatchAllQuery()
q = q.search()
q.get_agg_factory().add(TermsAgg('pterms', field="parsedtext"))
results = es.search(q, indices=index_name, doc_types=type_name)

q = MatchAllQuery()
q = q.search()
q.get_agg_factory().add(DateHistogramAgg('date_add', field='date', interval='month'))
results = es.search(q, indices=index_name, doc_types=type_name)

es.indices.delete_index(index_name)
from mediaresearchapp.tasks import MediaAggregateSQLTask

if __name__ == '__main__':
    es = ES("127.0.0.1:9200", default_indices='mediaaggregate')

    # Filters
    filters = [GeoDistanceFilter('location', [40.0, 9.00], 20, 'arc', 'km')]
    # filters = [TermFilter('message', 'elastic'),
    #            GeoDistanceFilter('locations',
    #                              {"lat": 40.0, "lon": 9.00},
    #                              20, 'arc', 'km')
    #            ]
    filter = ANDFilter(filters)
    q = FilteredQuery(MatchAllQuery(), filter)
    results = es.search(q)
    for r in results:
        print r
        break

    q4 = RegexTermQuery('city', 'bang.*')
    print q4
    resultset = es.search(q4)
    for r in resultset:
        print r

    query_str = {
        "query": {
            "termquery": [{
                "fieldname1": "value"
            }, {
class ESIndexerBase(object): ES_HOST = ES_HOST ES_INDEX_NAME = ES_INDEX_NAME ES_INDEX_TYPE = 'gene' def __init__(self): self.conn = ES(self.ES_HOST, default_indexes=[self.ES_INDEX_NAME], timeout=10.0) self.step = 10000 def create_index(self): try: print self.conn.open_index(self.ES_INDEX_NAME) except IndexMissingException: print self.conn.create_index(self.ES_INDEX_NAME) def delete_index_type(self, index_type): '''Delete all indexes for a given index_type.''' index_name = self.ES_INDEX_NAME # index_type = self.ES_INDEX_TYPE #Check if index_type exists mapping = self.conn.get_mapping(index_type, index_name) if index_name not in mapping or index_type not in mapping[index_name]: print 'Error: index type "%s" does not exist in index "%s".' % (index_type, index_name) return path = '/%s/%s' % (index_name, index_type) if ask('Confirm to delete all data under "%s":' % path) == 'Y': return self.conn.delete_mapping(index_name, index_type) def index(self, doc, index_type, id=None): '''add a doc to the index. If id is not None, the existing doc will be updated. ''' # index_type = self.ES_INDEX_TYPE return self.conn.index(doc, self.ES_INDEX_NAME, index_type, id=id) def delete_index(self, index_type, id): '''delete a doc from the index based on passed id.''' # index_type = self.ES_INDEX_TYPE return self.conn.delete(self.ES_INDEX_NAME, index_type, id) def optimize(self): return self.conn.optimize(self.ES_INDEX_NAME, wait_for_merge=True) def get_field_mapping(self): import dataload reload(dataload) dataload.register_sources() return dataload.get_mapping() def build_index(self, doc_d, update_mapping=False, bulk=True): index_name = self.ES_INDEX_NAME index_type = self.ES_INDEX_TYPE #Test if index exists try: print "Opening index...", self.conn.open_index(index_name) except NotFoundException: print 'Error: index "%s" does not exist. Create it first.' % index_name return -1 try: cur_mapping = self.conn.get_mapping(index_type, index_name) empty_mapping = False except ElasticSearchException: #if no existing mapping available for index_type #force update_mapping to True empty_mapping = True update_mapping = True # empty_mapping = not cur_mapping[index_name].get(index_type, {}) # if empty_mapping: # #if no existing mapping available for index_type # #force update_mapping to True # update_mapping = True if update_mapping: print "Updating mapping...", if not empty_mapping: print "\n\tRemoving existing mapping...", print self.conn.delete_mapping(index_name, index_type) _mapping = self.get_field_mapping() print self.conn.put_mapping(index_type, _mapping, [index_name]) print "Building index..." t0 = time.time() for doc_id, doc in doc_d.items(): self.conn.index(doc, index_name, index_type, doc_id, bulk=bulk) print self.conn.flush() print self.conn.refresh() print "Done[%s]" % timesofar(t0) def query(self, qs, fields='symbol,name', **kwargs): _q = StringQuery(qs) res = self.conn.search(_q, fields=fields, **kwargs) return res
from pyes import ES
from pyes import TermQuery
from pyes import RangeQuery
from pyes import QueryStringQuery
from pyes import BoolQuery
from pyes import ESRange
from pyes import ANDFilter
from pyes import TermFilter
from pyes import FilteredQuery
from pyes import query

conn = ES('localhost:9200')

a_range = RangeQuery(qrange=ESRange('a', 0.179, 0.180))
b_filter = TermFilter("b", "0.2")
period_filter = TermFilter("period", "2")
total_filter = ANDFilter([b_filter, period_filter])
c_range = RangeQuery(qrange=ESRange('c', 8, 12))

que = FilteredQuery(BoolQuery(must=[a_range, c_range]), total_filter)
search = query.Search(query=que)

get = conn.search(search, indices='shrimp')
census = get.total
for i in get:
    print i
def import_instruments(instrs, es_url, index, alias):
    """Create JSON ES docs and import."""

    prefix = {
        "bibo": "http://purl.org/ontology/bibo/",
        "dcterms": "http://purl.org/dc/terms/",
        "eos": "http://nasa.gov/eos.owl#",
        "gcis": "http://data.globalchange.gov/gcis.owl#",
        "hysds": "http://hysds.jpl.nasa.gov/hysds/0.1#",
        "info": "http://info-uri.info/",
        "xlink": "http://www.w3.org/1999/xlink"
    }

    conn = ES(es_url)
    if not conn.indices.exists_index(index):
        conn.indices.create_index(index)

    # track agencies/organizations
    orgs = {}

    for instr in instrs:
        identifier = "eos:%s" % instr['Instrument Name Short']
        id = hashlib.md5(identifier).hexdigest()

        if 'Instrument Technology' in instr and not EMPTY.search(instr['Instrument Technology']):
            sensor = "eos:%s" % instr['Instrument Technology']
        elif 'Instrument Type' in instr and not EMPTY.search(instr['Instrument Type']):
            sensor = "eos:%s" % instr['Instrument Type']
        elif 'Subtype' in instr and not EMPTY.search(instr['Subtype']):
            sensor = "eos:%s" % instr['Subtype']
        elif 'Type' in instr and not EMPTY.search(instr['Type']):
            sensor = "eos:%s" % instr['Type']
        elif 'Class' in instr and not EMPTY.search(instr['Class']):
            sensor = "eos:%s" % instr['Class']
        else:
            sensor = None
        #print(instr['Instrument Technology'], sensor)

        platform = None

        if 'Instrument Agencies' in instr and not EMPTY.search(instr['Instrument Agencies']):
            org = "eos:%s" % instr['Instrument Agencies']
            if org not in orgs:
                orgs[org] = {
                    "prov_es_json": {
                        "prefix": prefix,
                        "agent": {
                            org: {
                                "prov:type": {
                                    "type": "prov:QualifiedName",
                                    "$": "prov:Organization",
                                },
                            },
                        },
                    },
                    "identifier": org,
                    "prov:type": "prov:Organization",
                }
                if len(conn.search(query=TermQuery("_id", org), indices=[alias])) > 0:
                    pass
                else:
                    conn.index(orgs[org], index, 'agent', org)
        else:
            org = None

        doc = {
            "prov_es_json": {
                "prefix": prefix,
                "entity": {
                    identifier: {
                        "gcis:hasSensor": sensor,
                        "gcis:inPlatform": platform,
                        "prov:type": "eos:instrument",
                        "gcis:hasGoverningOrganization": org,
                    },
                },
            },
            "gcis:hasSensor": sensor,
            "gcis:inPlatform": platform,
            "prov:type": "eos:instrument",
            "gcis:hasGoverningOrganization": org,
            "identifier": identifier,
        }
        if len(conn.search(query=TermQuery("_id", identifier), indices=[alias])) > 0:
            pass
        else:
            conn.index(doc, index, 'entity', identifier)
ftrans = FormatTranslator()

# 1. Create Connection
conn = ES()

# 2. Index Data
dataset_json = open("../dataset.json")
dataset = json.load(dataset_json)['data']
for data in dataset:
    conn.index(data, "example_index", "example_type",
               "example_id_" + str(dataset.index(data)))

# 3. Create Simple Query
query = MatchAllQuery()

# 4. Create Simple Aggregation
agg = TermsAgg('agg1', field="name", sub_aggs=[], size=100)

# 5. Get Result
search = Search(query, size=5)
search.agg.add(agg)
print search.serialize()

result = conn.search(search, "example_index", "example_type")
for i in result:
    print json.dumps(i, indent=2)
print json.dumps(result.aggs, indent=2)

result._do_search()
print json.dumps(result._results, indent=2)
class ElasticSearchServer(ESDBRequests):
    """
    An object representing the Elasticsearch server; use it to list, create,
    delete and connect to databases. Modelled on the CouchDB HTTP database API:
    http://wiki.apache.org/couchdb/HTTP_database_API
    """

    def __init__(self, indices, types, dburl='http://localhost:9200',
                 usePYCurl=False, ckey=None, cert=None, capath=None):
        """
        Set up a connection to the Elasticsearch server.
        """
        check_server_url(dburl)
        # PYCurl TODO
        # Same with cert and key
        self.url = dburl
        self.ESconn = ES(dburl)
        self.ckey = ckey
        self.cert = cert
        check_name(indices)
        check_name(types)
        self.indices = indices
        self.types = types

    def listDatabases(self):
        "List all the databases the server hosts"
        # TODO
        return self.get('/_all_dbs')

    def createDatabase(self, schema):
        """
        A database must be named with all lowercase characters (a-z),
        digits (0-9), or any of the _$()+-/ characters and must end with a
        slash in the URL.
        """
        self.ESconn.indices.create_index_if_missing(self.indices)
        self.ESconn.indices.put_mapping(self.types, {'properties': schema}, [self.indices])

    def insertDoc(self, doc, _id):
        """
        TODO
        """
        self.ESconn.index(doc, self.indices, self.types, _id)

    def deleteDoc(self, _id):
        self.ESconn.delete(self.indices, self.types, _id)

    def termBoolQuery(self, query):
        """
        query - dict
            must:     key = key in the database, value = searchable value
            should:   key = key in the database, value = searchable value
            must_not: key = key in the database, value = searchable value
        """
        queryMust = []
        queryShould = []
        queryMustNot = []
        for item in ["must", "should", "must_not"]:
            if item in query:
                for dictVals in query[item]:
                    for dictKey in dictVals:
                        tempq = TermQuery(dictKey, dictVals[dictKey])
                        if item == "must":
                            queryMust.append(tempq)
                        elif item == "should":
                            queryShould.append(tempq)
                        elif item == "must_not":
                            queryMustNot.append(tempq)
        query = BoolQuery(must=None if not queryMust else queryMust,
                          should=None if not queryShould else queryShould,
                          must_not=None if not queryMustNot else queryMustNot)
        search = Search(query)
        results = self.ESconn.search(search, self.indices)
        response = {"status_code": 200, "message": "Successful", "content": []}
        response["content"] = [result for result in results]
        return response
class EsRestConnection(RestConnection): def __init__(self, serverInfo, proto = "http"): #serverInfo can be a json object #only connect pyes to master es node #in the case that other nodes are taken down #because http requests will fail # TODO: dynamic master node detection if isinstance(serverInfo, dict): self.ip = serverInfo["ip"] self.rest_username = serverInfo["username"] self.rest_password = serverInfo["password"] self.username = serverInfo["es_username"] self.password = serverInfo["es_password"] self.port = 9091 #serverInfo["port"] else: self.ip = serverInfo.ip self.rest_username = serverInfo.rest_username self.rest_password = serverInfo.rest_password self.username = serverInfo.es_username self.password = serverInfo.es_password self.port = 9091 # serverInfo.port self.baseUrl = "http://{0}:{1}/".format(self.ip, self.port) self.capiBaseUrl = self.baseUrl self.esHttpUrl = "http://{0}:9200".format(self.ip) self.http_port = str(int(self.port) + 109) self.proto = proto self.conn = ES(server=self.esHttpUrl) self.manager = managers.Cluster(self.conn) self.test_params = TestInputSingleton.input self.docs = None def get_index_stats(self): return ES.index_stats() def get_indices(self): return self.conn.indices.get_indices() def get_indices_as_buckets(self, doc_type='couchbaseDocument'): buckets = [] indices = self.get_indices() for index in indices: bucket = Bucket() q = query.MatchAllQuery() docs = self.conn.search(q,index,doc_type) bucket.name = index bucket.type = "es" bucket.port = self.port bucket.authType = None bucket.saslPassword = self.password bucket.nodes = list() #vBucketServerMap bucketStats = BucketStats() bucketStats.itemCount = docs.count() bucket.stats = bucketStats buckets.append(bucket) bucket.master_id = "es@"+self.ip return buckets def get_bucket(self, bucket_name, doc_type): for bucket in self.get_indices_as_buckets(doc_type): if bucket.name == bucket_name: return bucket return def get_buckets(self): return self.get_indices_as_buckets() def delete_index(self, name): self.conn.indices.delete_index(name) return self.conn.indices.exists_index(name) def create_index(self, name): if self.conn.indices.exists_index(name): self.delete_index(name) self.conn.indices.create_index(name) return self.conn.indices.exists_index(name) def delete_bucket(self, name): return self.delete_index(name) def create_bucket(self, *args, **kwargs): name = 'default' if len(args) > 0: name = args[0] else: name = kwargs['bucket'] return self.create_index(name) def is_ns_server_running(self, timeout_in_seconds=360): return True def node_statuses(self, timeout=120): otp_nodes = [] for node in self.get_nodes(): #get otp,get status otp_node = OtpNode(id=node.id, status=node.status) otp_node.ip = node.ip otp_node.port = node.port otp_node.replication = None otp_nodes.append(node) return otp_nodes def get_nodes_self(self, timeout=120): for node in self.get_nodes(): # force to return master node if node.port == 9091: return node return def get_nodes(self): es_nodes = [] nodes = self.manager.state()['nodes'] status = self.manager.health()['status'] if status == "green": status = "healthy" for node_key in nodes: nodeInfo = nodes[node_key] ex_params = self.get_node_params(nodeInfo) nodeInfo.update({'ssh_password' : ex_params.ssh_password, 'ssh_username' : ex_params.ssh_username}) nodeInfo['key'] = node_key node = ESNode(nodeInfo) node.status = status es_nodes.append(node) return es_nodes def get_node_params(self, info): ip, port = parse_addr(info["transport_address"]) clusters = self.test_params.clusters 
master_node = None for _id in clusters: for node in clusters[_id]: if node.ip == ip and int(node.port) == port: return node if int(node.port) == 9091: master_node = node # use params from master node return master_node def search_term(self, key, indices=["default"]): result = None params = {"term":{"_id":key}} query = ES.Search(params) row = self.conn.search(query, indices = indices) if row.total > 0: result = row[0] return result def term_exists(self, key, indices=["default"]): return self.search_term(key, indices = indices) is not None def all_docs(self, keys_only = False, indices=["default"],size=10000): q = query.MatchAllQuery() docs = self.conn.search(q,indices=indices,doc_types='couchbaseDocument') docs = [] for row in docs: if keys_only: row = row['meta']['id'] docs.append(row) return docs # check if a key exists by checking all known nodes # See - CBES-17 # for use when it seems nodes are out of sync def search_all_nodes(self, key, indices=["default"]): doc = None for index in indices: for _node in self.get_nodes(): ip, port = (_node.ip, _node.ht_port) r = requests.get('http://%s:%s/%s/couchbaseDocument/%s?preference=_only_node:%s' %\ (ip, port, index, key, _node.key)) if r.status_code == 200 : if r.json()['_id'] == key: doc = r.json() break return doc def fetch_bucket_stats(self, bucket='default', zoom='minute'): return { "op" : { "samples" : { "xdc_ops" : [0] } } } def start_replication(self, *args, **kwargs): return "es",self.ip def _rebalance_progress(self, *args, **kwargs): return 100 def _rebalance_progress_status(self, *args, **kwargs): return 'not running' def get_vbuckets(self, *args, **kwargs): return () def replace_template(self, node, file): f = open(file, 'r') template = f.read().replace('\n', ' ') api = "http://{0}:9200/_template/couchbase".format(node.ip) status, content, header = self._http_request(api, 'PUT', template) if status: log.info('uploaded couchbase template: '+file) else: log.error('template upload failed: {0}'.format(content)) def add_node(self, user='', password='', remoteIp='', port='8091',zone_name='', services=None): pass def update_configuration(self, node, commands): rmc = RemoteMachineShellConnection(node) shell = rmc._ssh_client.invoke_shell() for command in commands: log.info('Adding elastic search config {0} on node {1}'.format(command, self.ip)) shell.send('echo "{0}" >> ~/elasticsearch/config/elasticsearch.yml \n'.format(command)) while not shell.recv_ready(): time.sleep(2) rc = shell.recv(1024) log.info(rc) def reset_configuration(self, node, count=1): rmc = RemoteMachineShellConnection(node) shell = rmc._ssh_client.invoke_shell() log.info('Removing last {0} lines from elastic search config on node {1}'.format(count, self.ip)) shell.send('head -n -{0} ~/elasticsearch/config/elasticsearch.yml > temp ; mv temp ~/elasticsearch/config/elasticsearch.yml \n'.format(count)) while not shell.recv_ready(): time.sleep(2) rc = shell.recv(1024) log.info(rc) def start_es_node(self, node): rmc = RemoteMachineShellConnection(node) shell=rmc._ssh_client.invoke_shell() es_kill = "pkill -f elasticsearch;" shell.send(es_kill+' \n') while not shell.recv_ready(): time.sleep(2) rc = shell.recv(1024) log.info(rc) log.info("Sleep for 30 seconds") time.sleep(30) # define es exec path if not in $PATH environment es_bin = "~/elasticsearch/bin/elasticsearch -Dtransport.couchbase=TRACE -Dcom.couchbase=TRACE > /var/log/es.log 2>&1 &" if 'es_bin' in TestInputSingleton.input.test_params: es_bin = TestInputSingleton.input.test_params['es_bin'] # connect to remote node 
log.info('Starting node: %s:%s' % (node.ip, node.port)) # start es service shell.send(es_bin+' \n') while not shell.recv_ready(): time.sleep(2) rc = shell.recv(1024) log.info(rc) log.info("Sleep for 5 seconds before the node can appear") time.sleep(5) # wait for new node tries = 0 while tries < 10: for cluster_node in self.get_nodes(): if cluster_node.ip == node.ip and cluster_node.port == int(node.port): return else: log.info('Waiting for new node to appear') time.sleep(5) tries = tries + 1 raise Exception("failed to add node to cluster: %s:%s" % (node.ip,node.port)) def log_client_error(self, post): # cannot post req errors to 9091 pass def vbucket_map_ready(self, *args, **kwargs): return True def init_cluster(self, *args, **kwargs): pass def init_cluster_memoryQuota(self, *args, **kwargs): pass def set_reb_cons_view(self, *args, **kwargs): pass def set_reb_index_waiting(self, *args, **kwargs): pass def set_rebalance_index_pausing(self, *args, **kwargs): pass def set_max_parallel_indexers(self, *args, **kwargs): pass def set_max_parallel_replica_indexers(self, *args, **kwargs): pass def log_client_error(self, post): # cannot post req errors to 9091 pass def vbucket_map_ready(self, *args, **kwargs): return True def init_cluster(self, *args, **kwargs): pass def init_cluster_memoryQuota(self, *args, **kwargs): pass def set_reb_cons_view(self, *args, **kwargs): pass def set_reb_index_waiting(self, *args, **kwargs): pass def set_rebalance_index_pausing(self, *args, **kwargs): pass def set_max_parallel_indexers(self, *args, **kwargs): pass def set_max_parallel_replica_indexers(self, *args, **kwargs): pass def rebalance(self, otpNodes, ejectedNodes): # shutdown ejected nodes # wait for shards to be rebalanced nodesToShutdown = \ [node for node in self.get_nodes() if node.id in ejectedNodes] for node in nodesToShutdown: self.eject_node(node) def eject_node(self, node): api = "http://%s:9200/_cluster/nodes/local/_shutdown?delay=0s" % (node.ip) status, content, header = self._http_request(api, 'POST', '') if status: log.info('ejected node: '+node.ip) else: log.error('rebalance operation failed: {0}'.format(content)) def monitorRebalance(self, stop_if_loop=False): # since removed nodes are shutdown use master node for monitoring return self.get_nodes_self() def get_pools_info(self): return {'pools' : []} def add_remote_cluster(self, *args, **kwargs): # detect 2:1 mapping and do spectial cluster add # otherwise run super method pass def remove_all_remote_clusters(self): pass def remove_all_replications(self): pass def is_cluster_mixed(self): return False def set_internalSetting(self, param, value): return {'ok' : True}
class EsRestConnection(RestConnection): def __init__(self, serverInfo, proto="http"): #serverInfo can be a json object #only connect pyes to master es node #in the case that other nodes are taken down #because http requests will fail # TODO: dynamic master node detection if isinstance(serverInfo, dict): self.ip = serverInfo["ip"] self.rest_username = serverInfo["username"] self.rest_password = serverInfo["password"] self.username = serverInfo["es_username"] self.password = serverInfo["es_password"] self.port = 9091 #serverInfo["port"] else: self.ip = serverInfo.ip self.rest_username = serverInfo.rest_username self.rest_password = serverInfo.rest_password self.username = serverInfo.es_username self.password = serverInfo.es_password self.port = 9091 # serverInfo.port self.baseUrl = "http://{0}:{1}/".format(self.ip, self.port) self.capiBaseUrl = self.baseUrl self.esHttpUrl = "http://{0}:9200".format(self.ip) self.http_port = str(int(self.port) + 109) self.proto = proto self.conn = ES(server=self.esHttpUrl) self.manager = managers.Cluster(self.conn) self.test_params = TestInputSingleton.input self.docs = None def get_index_stats(self): return ES.index_stats() def get_indices(self): schema = self.conn.indices.get_mapping() indices_full_list = schema.get_all_indices() just_indices = [ index for index in indices_full_list if not index.startswith(".") ] return just_indices def get_indices_as_buckets(self, doc_type='couchbaseDocument'): buckets = [] indices = self.get_indices() for index in indices: bucket = Bucket() q = query.MatchAllQuery() docs = self.conn.search(q, index, doc_type) bucket.name = index bucket.type = "es" bucket.port = self.port bucket.nodes = list() #vBucketServerMap bucketStats = BucketStats() bucketStats.itemCount = docs.count() bucket.stats = bucketStats buckets.append(bucket) bucket.master_id = "es@" + self.ip return buckets def get_bucket(self, bucket_name, doc_type='couchbaseDocument'): for bucket in self.get_indices_as_buckets(doc_type): if bucket.name == bucket_name: return bucket return def get_buckets(self): return self.get_indices_as_buckets() def delete_index(self, name): self.conn.indices.delete_index(name) return self.conn.indices.exists_index(name) def create_index(self, name): if self.conn.indices.exists_index(name): self.delete_index(name) self.conn.indices.create_index(name) return self.conn.indices.exists_index(name) def delete_bucket(self, name): return self.delete_index(name) def create_bucket(self, *args, **kwargs): name = 'default' if len(args) > 0: name = args[0] else: name = kwargs['bucket'] return self.create_index(name) def is_ns_server_running(self, timeout_in_seconds=360): return True def node_statuses(self, timeout=120): otp_nodes = [] for node in self.get_nodes(): #get otp,get status otp_node = OtpNode(id=node.id, status=node.status) otp_node.ip = node.ip otp_node.port = node.port otp_node.replication = None otp_nodes.append(node) return otp_nodes def get_nodes_self(self, timeout=120): for node in self.get_nodes(): # force to return master node if node.port == 9091: return node return def get_nodes(self): es_nodes = [] nodes = self.manager.state()['nodes'] status = self.manager.health()['status'] if status == "green": status = "healthy" for node_key in nodes: nodeInfo = nodes[node_key] ex_params = self.get_node_params(nodeInfo) nodeInfo.update({ 'ssh_password': ex_params.ssh_password, 'ssh_username': ex_params.ssh_username }) nodeInfo['key'] = node_key node = ESNode(nodeInfo) node.status = status es_nodes.append(node) return es_nodes def 
get_node_params(self, info): ip, port = parse_addr(info["transport_address"]) clusters = self.test_params.clusters master_node = None for _id in clusters: for node in clusters[_id]: if node.ip == ip and int(node.port) == port: return node if int(node.port) == 9091: master_node = node # use params from master node return master_node def search_term(self, key, indices=["default"]): result = None params = {"term": {"_id": key}} query = ES.Search(params) row = self.conn.search(query, indices=indices) if row.total > 0: result = row[0] return result def term_exists(self, key, indices=["default"]): return self.search_term(key, indices=indices) is not None def all_docs(self, keys_only=False, indices=["default"], size=10000): q = query.MatchAllQuery() docs = self.conn.search(q, indices=indices, doc_types='couchbaseDocument') res_docs = [] for row in docs: if keys_only: row = row['meta']['id'] res_docs.append(row) return res_docs # check if a key exists by checking all known nodes # See - CBES-17 # for use when it seems nodes are out of sync def search_all_nodes(self, key, indices=["default"]): doc = None for index in indices: for _node in self.get_nodes(): ip, port = (_node.ip, _node.ht_port) r = requests.get('http://%s:%s/%s/couchbaseDocument/%s?preference=_only_node:%s' %\ (ip, port, index, key, _node.key)) if r.status_code == 200: if r.json()['_id'] == key: doc = r.json() break return doc def fetch_bucket_stats(self, bucket_name='default'): bucket = self.get_bucket(bucket_name=bucket_name) return bucket.stats def start_replication(self, *args, **kwargs): return "es", self.ip def _rebalance_progress(self, *args, **kwargs): return 100 def _rebalance_progress_status(self, *args, **kwargs): return 'not running' def get_vbuckets(self, *args, **kwargs): return () def replace_template(self, node, file): f = open(file, 'r') template = f.read().replace('\n', ' ') api = "http://{0}:9200/_template/couchbase".format(node.ip) status, content, header = self._http_request(api, 'PUT', template) if status: log.info('uploaded couchbase template: ' + file) else: log.error('template upload failed: {0}'.format(content)) def add_node(self, user='', password='', remoteIp='', port='8091', zone_name='', services=None): pass def update_configuration(self, node, commands): rmc = RemoteMachineShellConnection(node) shell = rmc._ssh_client.invoke_shell() for command in commands: log.info('Adding elastic search config {0} on node {1}'.format( command, self.ip)) shell.send( 'echo "{0}" >> ~/elasticsearch/config/elasticsearch.yml \n'. format(command)) while not shell.recv_ready(): time.sleep(2) rc = shell.recv(1024) log.info(rc) def reset_configuration(self, node, count=1): rmc = RemoteMachineShellConnection(node) shell = rmc._ssh_client.invoke_shell() log.info( 'Removing last {0} lines from elastic search config on node {1}'. 
format(count, self.ip)) shell.send( 'head -n -{0} ~/elasticsearch/config/elasticsearch.yml > temp ; mv temp ~/elasticsearch/config/elasticsearch.yml \n' .format(count)) while not shell.recv_ready(): time.sleep(2) rc = shell.recv(1024) log.info(rc) def start_es_node(self, node): rmc = RemoteMachineShellConnection(node) shell = rmc._ssh_client.invoke_shell() es_kill = "pkill -f elasticsearch;" shell.send(es_kill + ' \n') while not shell.recv_ready(): time.sleep(2) rc = shell.recv(1024) log.info(rc) log.info("Sleep for 30 seconds") time.sleep(30) # define es exec path if not in $PATH environment es_bin = "~/elasticsearch/bin/elasticsearch -Dtransport.couchbase=TRACE -Dcom.couchbase=TRACE > /var/log/es.log 2>&1 &" if 'es_bin' in TestInputSingleton.input.test_params: es_bin = TestInputSingleton.input.test_params['es_bin'] # connect to remote node log.info('Starting node: %s:%s' % (node.ip, node.port)) # start es service shell.send(es_bin + ' \n') while not shell.recv_ready(): time.sleep(2) rc = shell.recv(1024) log.info(rc) log.info("Sleep for 5 seconds before the node can appear") time.sleep(5) # wait for new node tries = 0 while tries < 10: for cluster_node in self.get_nodes(): if cluster_node.ip == node.ip and cluster_node.port == int( node.port): return else: log.info('Waiting for new node to appear') time.sleep(5) tries = tries + 1 raise Exception("failed to add node to cluster: %s:%s" % (node.ip, node.port)) def log_client_error(self, post): # cannot post req errors to 9091 pass def vbucket_map_ready(self, *args, **kwargs): return True def init_cluster(self, *args, **kwargs): pass def init_cluster_memoryQuota(self, *args, **kwargs): pass def set_reb_cons_view(self, *args, **kwargs): pass def set_reb_index_waiting(self, *args, **kwargs): pass def set_rebalance_index_pausing(self, *args, **kwargs): pass def set_max_parallel_indexers(self, *args, **kwargs): pass def set_max_parallel_replica_indexers(self, *args, **kwargs): pass def log_client_error(self, post): # cannot post req errors to 9091 pass def vbucket_map_ready(self, *args, **kwargs): return True def init_cluster(self, *args, **kwargs): pass def init_cluster_memoryQuota(self, *args, **kwargs): pass def set_reb_cons_view(self, *args, **kwargs): pass def set_reb_index_waiting(self, *args, **kwargs): pass def set_rebalance_index_pausing(self, *args, **kwargs): pass def set_max_parallel_indexers(self, *args, **kwargs): pass def set_max_parallel_replica_indexers(self, *args, **kwargs): pass def rebalance(self, otpNodes, ejectedNodes): # shutdown ejected nodes # wait for shards to be rebalanced nodesToShutdown = \ [node for node in self.get_nodes() if node.id in ejectedNodes] for node in nodesToShutdown: self.eject_node(node) def eject_node(self, node): api = "http://%s:9200/_cluster/nodes/local/_shutdown?delay=0s" % ( node.ip) status, content, header = self._http_request(api, 'POST', '') if status: log.info('ejected node: ' + node.ip) else: log.error('rebalance operation failed: {0}'.format(content)) def monitorRebalance(self, stop_if_loop=False): # since removed nodes are shutdown use master node for monitoring return self.get_nodes_self() def get_pools_info(self): return {'pools': []} def add_remote_cluster(self, *args, **kwargs): # detect 2:1 mapping and do spectial cluster add # otherwise run super method pass def remove_all_remote_clusters(self): pass def remove_all_replications(self): pass def is_cluster_mixed(self): return False def set_internalSetting(self, param, value): return {'ok': True}
class ElasticCatalog(object): default_indexes = { 'zelastic_doc_id': { 'type': 'string', 'index': 'not_analyzed' } } def __init__(self, connection_string, elastic_name, storage, bulk=False, bulk_size=400): self.conn = ES(connection_string, bulk_size=bulk_size) self.bulk_size = bulk_size self.name = elastic_name self.storage = storage self.bulk = bulk def update_mapping(self, name): meta = self.storage.meta(name) indexes = meta['indexes'] properties = self.default_indexes.copy() try: self.conn.create_index(self.name) except IndexAlreadyExistsException: pass for index_name, _type in indexes.items(): index = None if _type == 'str': index = { 'type': 'string', 'index': 'not_analyzed', } elif _type == 'full': index = { 'type': 'string', 'index': 'analyzed', } elif _type == 'bool': index = { 'type': 'boolean' } elif _type == 'int': index = { 'type': 'integer', } elif _type in ('datetime', 'date'): index = { 'type': 'date', } elif _type == 'float': index = { 'type': 'float', } if index is not None: properties[index_name] = index self.conn.indices.put_mapping( doc_type=name, mapping={ 'ignore_conflicts': True, 'properties': properties }, indices=[self.name]) def id(self, container_name, key): return '%s-%s' % (container_name, key) def index(self, container_name, doc, key): # need to add data to the index that isn't actually persisted data = { 'zelastic_doc_id': key } meta = self.storage.meta(container_name) indexes = meta['indexes'] for index in indexes.keys(): if index in doc: data[index] = doc[index] self.conn.index( data, self.name, container_name, self.id(container_name, key), bulk=self.bulk) def delete(self, container_name, key): self.conn.delete( self.name, container_name, self.id(container_name, key), bulk=self.bulk) def delete_all(self, container_name): self.conn.delete_mapping( self.name, container_name) def search(self, container_name, query, **kwargs): return self.conn.search( query, indexes=[self.name], doc_types=[container_name], **kwargs) def getFacets(self, container_name, field, size=100): return self.conn.search_raw({ "facets": { field: { "terms": { "all_terms": True, "field": field, "size": size, "order": "term" } } } }, indexes=[self.name], doc_type=container_name)
class ElasticSearch(object): def __init__(self, query): self.elastic = ES(settings.SEARCH_HOSTS) self.query = QueryParser(query) def search(self, index_type='job'): if not self.query.is_valid(): self.results = EmptyResults() return where_filters = [ MatchAllFilter() ] if self.query.params['where']: where_filters= [ QueryFilter(StringQuery(self.query.get_where(), search_fields=['city'],\ analyze_wildcard=True, default_operator="AND")) ] query = FilteredQuery( StringQuery(self.query.params['what'], search_fields=settings.SEARCH_WHAT_FIELDS, default_operator='AND', analyze_wildcard=True)\ if self.query.params['what'] else MatchAllQuery(), ORFilter(self._get_geo_filters(where_filters)) ) facets_filter = [] if self.query.params.has_key('company.facet') and len(self.query.params['company.facet']): facets_filter.append(TermFilter('company.facet', self.query.params['company.facet'])) sorting = {'_score': {'order': 'desc' }} if self.query.params.has_key('sorting'): if self.query.params['sorting'] in ['score', 'published_date']: sorting = {self.query.params['sorting']: {'order': 'desc' }} query = Search( query, ORFilter(facets_filter) if len(facets_filter) else [], start=self.query.start_page(), size=settings.PAGINATION_PAGE_SIZE, sort=sorting ) query.facet.add_term_facet( 'company.facet', size=settings.SEARCH_FACETS_SIZE ) self.results = self.elastic.search(query, settings.SEARCH_ALIASES, index_type) logger.info('Elastic query: %s\n' % str(query.to_search_json())) def get_results(self): data = [] if self.results.total: for result in self.results.hits: item = {} if result.has_key('_source'): item = result['_source'] del item['details_url'] if item.has_key('title'): item['redirect_url'] = reverse('redirect', kwargs={ 'slug' : slugify(result['_source']['title']), 'source' : result['_source']['source'], 'job_id' : result['_id'] }) if item.has_key('published_date'): item['published_date_ago'] = timesince(result['_source']['published_date']).encode('utf-8') if item.has_key('summary'): item['summary'] = truncatechars(result['_source']['summary'], 350) elif item['content']: item['summary'] = truncatechars(result['_source']['content'], 350) if item.has_key('image'): item['image'] = '%s/job/thumbs/small/%s' % (settings.MEDIA_URL, item['image']) if len(item): data.append(item) return data def get_facets(self): facets = {} if self.results.total: for facet in self.results.facets: if self.results.facets[facet].has_key('terms'): facets[facet] = self.results.facets[facet]['terms'] if facets.has_key('company.facet'): for item in facets['company.facet']: if self.query.params.has_key('company.facet') and len(self.query.params['company.facet'])\ and self.query.params['company.facet'] == item['term']: item['url'] = self._get_url({'company.facet': '', 'page': 1}) item['active'] = True else: item['url'] = self._get_url({'company.facet': item['term'], 'page': 1}) return facets def list_pages(self): if self.results.total <= 0: return [] pages = divmod(self.results.total, settings.PAGINATION_PAGE_SIZE) pages = pages[0] + 1 if pages[1] > 0 else pages[0] paginator = Pagination(self.query.params['page'], settings.PAGINATION_PAGE_SIZE, self.results.total) iterator = paginator.iter_pages( left_current = settings.PAGINATION_CURRENT_LEFT, right_current = settings.PAGINATION_CURRENT_RIGHT, left_edge = settings.PAGINATION_EDGE_LEFT, right_edge = settings.PAGINATION_EDGE_RIGHT ) return [{ 'page': page, 'url': self._get_url({'page': page}), 'selected': self.query.params['page'] == page } for page in iterator] def close(self): 
self.elastic.connection.close() def _get_geo_filters(self, filters=[]): for geo in self.query.get_geoquery(): filters.append(GeoDistanceFilter( 'pin.location', { 'lat' : geo[0], 'lon': geo[1] }, settings.APP_GEO_CITIES_RANGE )) return filters def _get_url(self, data): params = self.query.params.copy() params.update(data) url = '' for key, value in params.items(): url += '%s=%s&' % (key, urllib2.quote(unicode(value).encode('utf8'))) return url[:-1]
from pyes import ES

es = ES()
index_name = "my_index"
type_name = "my_type"

from utils_pyes import create_and_add_mapping, populate

create_and_add_mapping(es, index_name, type_name)
populate(es, index_name, type_name)

from pyes.query import *
from pyes.filters import *

results = es.search(MatchAllQuery(), indices=index_name, doc_types=type_name)
print "total:", results.total
for r in results:
    print r
print "first element: ", results[0]
print "slice elements: ", results[1:4]

results = es.search(TermQuery("name", "joe", 3), indices=index_name, doc_types=type_name)

q1 = TermFilter("position", 1)
q2 = TermFilter("position", 2)
orq = ORFilter([q1, q2])
q = FilteredQuery(MatchAllQuery(), orq)
results = es.search(q, indices=index_name, doc_types=type_name)