Example #1
def multi_param_search(request):
    log_results = None
    es = ES()  # create ElasticSearch connection object
    if request.method == 'POST':  # if the search form is submitted
        filters_list = []
        # loop over each search param; if it has a value, add it to the filter list
        for param in [
                "version", "ip_header_length", "ttl", "protocol",
                "source_address", "destination_address", "source_port",
                "dest_port", "sequence_number", "acknowledgement",
                "tcp_header_length", "data", "datetime"
        ]:
            if request.POST.get(param) != '':
                q_param = TermFilter(param, request.POST.get(param))
                filters_list.append(q_param)
        if len(filters_list) != 0:  # if there are filter params, get the filtered results
            and_filter = ANDFilter(filters_list)
            q = FilteredQuery(MatchAllQuery(), and_filter)
            log_results = es.search(q, indices=index_name, doc_types=type_name)
        else:
            log_results = None
    elif request.method == 'GET':  # return all packets when the search page is first loaded
        log_results = es.search(MatchAllQuery(),
                                indices=index_name,
                                doc_types=type_name)
    return render(request, 'multi_param_search.html',
                  {'log_results': log_results})
Example #2
class BaseElasticSearchClient(BaseClient):

    def __init__(self, servers, index):
        """
        @param servers: Make sure to include the port with the server address
        @param index: Document index
        @return:
        """
        super(BaseElasticSearchClient, self).__init__()
        self.connection = None
        self.servers = servers
        self.index = index if type(index) is list else [index]

    def connect(self, connection_pool=1):
        update_connection_pool(connection_pool)

        try:
            self.connection = ES(self.servers)
        except NoServerAvailable:
            self._log.error('Failed to connect to elastic search server')
            return False
        return True

    def close(self):
        self.connection = None

    def _create_term_query(self, must_list):
        # TODO: add remaining conditional list functionality.
        query = BoolQuery()
        for term in must_list:
            query.add_must(term)
        return query

    def find_term(self, name, value, size=10):
        if not self.connection:
            return

        query = TermQuery(name, value)
        return self.connection.search(query=Search(query, size=size),
                                      indices=self.index)

    def find(self, filter_terms, size=10, doc_types=None):
        if not self.connection:
            return

        query = self._create_term_query(must_list=filter_terms)
        return self.connection.search(query=Search(query, size=size),
                                      indices=self.index,
                                      doc_types=doc_types)

    def find_one(self, filter_terms, size=10, doc_types=None):
        if not self.connection:
            return

        results = self.find(filter_terms=filter_terms, size=size,
                            doc_types=doc_types)
        return results[0] if len(results) > 0 else None
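A minimal usage sketch for the client above (an illustration, not from the original project). It assumes BaseClient and update_connection_pool come from the same module as the class and that an Elasticsearch node answers on localhost:9200.

client = BaseElasticSearchClient(servers=['localhost:9200'], index='my_index')
if client.connect(connection_pool=1):
    # find_term wraps the field/value pair in Search(TermQuery(...), size=...)
    hits = client.find_term('status', 'active', size=5)
    for hit in hits:
        print hit
    client.close()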
Example #3
def facets(host='localhost:9200',
          facet_terms=['bibleverse'],
          _type='habakkuk',
          date_filter=[],
          size=10):
    ret = {}
    conn = ES(host)
    q = MatchAllQuery()
    if date_filter:
        start,end = date_filter
        q = FilteredQuery(q, RangeFilter(qrange=ESRange('created_at_date',
                                                        start.isoformat(),
                                                        end.isoformat(),
                                                        include_upper=False)))

    q = q.search(size=0)
    for term in facet_terms:
        q.facet.add_term_facet(term,order='count',size=size)
        
    es_logger.info(q.serialize())

    resultset = conn.search(query=q, indices=_type+'-*', doc_types=[_type])
    for facet in resultset.facets:
        ret[facet] = []
        for row in resultset.facets[facet]['terms']:
            ret[facet].append({"value":row['term'],"count":row['count']})

    logger.debug("facets return|'%s'"%json.dumps(ret))
    return ret
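A hypothetical call to the facets() helper above; it assumes the module-level imports and es_logger the function relies on, plus an index pattern habakkuk-* on localhost:9200. The date_filter pair must expose isoformat(), e.g. datetime.date objects.

from datetime import date

counts = facets(facet_terms=['bibleverse'],
                date_filter=[date(2014, 1, 1), date(2014, 2, 1)],
                size=5)
# counts looks like {'bibleverse': [{"value": ..., "count": ...}, ...]}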
Example #4
def term_facet(host='localhost:9200',
               terms=['bibleverse'],
               _type='habakkuk',
               date_filter=[],
               size=10):
    ret = []
    conn = ES(host)
    q = MatchAllQuery()
    if date_filter:
        start,end = date_filter
        q = FilteredQuery(q, RangeFilter(qrange=ESRange('created_at_date',start,end,include_upper=False)))

    q = q.search(size=0)
    for term in terms:
        q.facet.add_term_facet(term,order='count',size=size)
        
    print json.dumps(json.loads(q.to_search_json()),indent=2)

    resultset = conn.search(query=q, indices=_type+'-*', doc_types=[_type])
    for facet in resultset.facets:
        print "Total",facet,resultset.facets[facet]['total']
        for row in resultset.facets[facet]['terms']:
            print "\t",row['term'],row['count']
            ret.append((facet,row['term']))
        
    return ret
Example #5
def single_param_search(request):
    log_results = None
    es = ES()  # create ElasticSearch connection object
    if request.method == 'POST':  # if the search form is submitted
        # filter on the selected search param and search term
        q1 = TermFilter(request.POST.get('searchby'),
                        request.POST.get('searchterm'))
        orq = ORFilter([q1])
        q = FilteredQuery(MatchAllQuery(), orq)
        log_results = es.search(
            q, indices=index_name,
            doc_types=type_name)  # get the filtered data from elasticsearch
    elif request.method == 'GET':  # return all packets when the search page is first loaded
        log_results = es.search(MatchAllQuery(),
                                indices=index_name,
                                doc_types=type_name)
    return render(request, 'single_param_search.html',
                  {'log_results': log_results})
Example #6
def find_BID_in_SBN(bid, es_server="localhost:9200"):
    sbn_bid = to_iccu_bid(bid)
    q = TermQuery('codiceIdentificativo', sbn_bid)
    es_conn = ES(server=es_server)
    resultset = list(es_conn.search(query=q, indices="iccu"))
    if (len(resultset) > 0):
        return resultset
    else:
        return None
Example #7
def search_people_by_bio(query,
                         limit_results=DEFAULT_LIMIT,
                         index=['onename_people_index']):
    """ queries lucene index to find a nearest match, output is profile username
    """

    from pyes import QueryStringQuery, ES
    conn = ES()

    q = QueryStringQuery(query,
                         search_fields=['username', 'profile_bio'],
                         default_operator='and')

    results = conn.search(query=q, size=20, indices=index)
    count = conn.count(query=q)
    count = count.count

    # having 'or' gives more results but results quality goes down
    if (count == 0):

        q = QueryStringQuery(query,
                             search_fields=['username', 'profile_bio'],
                             default_operator='or')

        results = conn.search(query=q, size=20, indices=index)

    results_list = []
    counter = 0

    for profile in results:

        username = profile['username']
        results_list.append(username)

        counter += 1

        if (counter == limit_results):
            break

    return results_list
Example #9
class ESPages():
    ''' For use with Django's paginator. Currently not used after pyes
        update implemented ResultSet, which provides the count,
        __getitem__, and __len__ methods required for Django's paginator. '''
    def __init__(self, es_query, **kwargs):
        ''' Make initial ES query'''
        self.conn = ES(settings.ES_HOST[0], timeout=10.0)
        self.es_query = es_query
        res = self.conn.search(query=self.es_query, size='0', **kwargs)
        self.total_hits = res['hits']['total']

    def count(self):
        return self.total_hits

    def __getitem__(self, q_slice):
        ''' Make ES query for range of hits'''
        q = self.es_query.search(start=str(q_slice.start), size=str(q_slice.stop-q_slice.start+1))
        res = self.conn.search(q)
        return res['hits']['hits']

    def __len__(self):
        return self.count()
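A sketch of how ESPages might plug into Django's paginator (hypothetical usage, written against the older pyes behaviour the class targets, where search() returns the raw response dict). It assumes settings.ES_HOST is configured and that my_index exists.

from django.core.paginator import Paginator
from pyes.query import TermQuery

pages = Paginator(ESPages(TermQuery('tag', 'python'), indices=['my_index']), 25)
page = pages.page(1)  # __getitem__ issues a second ES query for that slice of hits
for hit in page.object_list:
    print hit['_source']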
Example #10
File: app.py  Project: iamsk/es-demo
def search(searchkey=u"电影"):
    conn = ES('127.0.0.1:9200')
    # TextQuery analyzes (tokenizes) searchkey
    qtitle = TextQuery("title", searchkey)
    h = HighLighter(['<b>'], ['</b>'], fragment_size=500)
    # multi-field search (must => and, should => or), highlighting, result slicing (pagination), sorting
    q = Search(BoolQuery(should=[qtitle]), highlight=h, start=0, size=3,
               sort={'id': {'order': 'asc'}})
    q.add_highlight("title")
    results = conn.search(q, "zhihu", "answer")
    hits = []
    for r in results:
        if "title" in r._meta.highlight:
            r['title'] = r._meta.highlight[u"title"][0]
        hits.append(r)
    return template('results.html', list=hits, count=results.total)
Example #11
def search(searchkey=u"电影"):
    conn = ES('127.0.0.1:9200')
    # TextQuery analyzes (tokenizes) searchkey
    qtitle = TextQuery("title", searchkey)
    h = HighLighter(['<b>'], ['</b>'], fragment_size=500)
    # multi-field search (must => and, should => or), highlighting, result slicing (pagination), sorting
    q = Search(BoolQuery(should=[qtitle]),
               highlight=h,
               start=0,
               size=3,
               sort={'id': {
                   'order': 'asc'
               }})
    q.add_highlight("title")
    results = conn.search(q, "zhihu", "answer")
    hits = []
    for r in results:
        if "title" in r._meta.highlight:
            r['title'] = r._meta.highlight[u"title"][0]
        hits.append(r)
    return template('results.html', list=hits, count=results.total)
Example #12
class KVStore(KVStoreBase):
    def __init__(self, *args, **kwargs):
        super(KVStore, self).__init__(*args, **kwargs)
        self.connection = ES(settings.THUMBNAIL_ELASTIC_SEARCH_SERVERS)

    def _get_raw(self, key):
        try:
            #import pdb; pdb.set_trace()
            value = self.connection.get(settings.THUMBNAIL_ELASTIC_SEARCH_INDEX, 
                                        settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE,
                                        key)
            return value['_source']['value']
        except:
            return None

    def _set_raw(self, key, value):
        ret = self.connection.index({"value": value}, 
                                    settings.THUMBNAIL_ELASTIC_SEARCH_INDEX,
                                    settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE,
                                    key)
        return ret['ok']
    
    def _delete_raw(self, *keys):
        rets = []
        for key in keys:
            try:
                ret = self.connection.delete(settings.THUMBNAIL_ELASTIC_SEARCH_INDEX,
                                             settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE,
                                             key)
                rets.append(ret['ok'])
            except:
                rets.append(False)
        return rets

    def _find_keys_raw(self, prefix):
        search = Search(query=PrefixQuery("_id", prefix), size=1000, start=0, fields=[])
        results = self.connection.search(search, 
                                         indexes=[settings.THUMBNAIL_ELASTIC_SEARCH_INDEX,], 
                                         doc_types=[settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE,])
        return [hit['_id'] for hit in results['hits']['hits']]
Example #13
from pyes import ES

es = ES()
index_name = "my_index"
type_name = "my_type"

from utils_pyes import create_and_add_mapping, populate

create_and_add_mapping(es, index_name, type_name)
populate(es, index_name, type_name)

from pyes.query import *

q = MatchAllQuery()
q = q.search()
q.facet.add_term_facet('tag')

results = es.search(indices=index_name, doc_types=type_name, query=q)

from pyes.facets import *
q = MatchAllQuery()
q = q.search()
q.facet.facets.append(DateHistogramFacet('date_facet',
    field='date',
    interval='month'))

results = es.search(indices=index_name, doc_types=type_name, query=q)

es.indices.delete_index(index_name)
Example #14
class Elastic(object):

    def init_app(self, app):
        self.conn = ES(app.config['ELASTIC_URL'], timeout=2)
        #self.remote_conns = [ES(url) for url in app.config['REMOTE_ELASTIC_URL']]

    def search(self, start=0, size=20, doc_types='resource', indices='order_index', sort=None, **kwargs):
        # set filter
        filters = []
        for k,v in kwargs.items():
            if k and k!='complete_time':
                filters.append(TermFilter(k, v))
            elif k and v!='' and k=='complete_time':
                ct = kwargs['complete_time']
                if len(ct) == 2:
                    filters.append(RangeFilter(ESRange('complete_time', from_value=ct[0], to_value=ct[1])))
                else:
                    filters.append(RangeFilter(ESRange('complete_time', from_value=ct[0])))
        
        _filter = None
        if filters:
            _filter = ANDFilter(filters)

        bq = MatchAllQuery()
        # filtered
        q = FilteredQuery(bq, _filter)

        # sort
        if sort:
            sf = SortFactory()
            for s in sort:
                sf.add(s)
            s = Search(q, sort=sf)
        else:
            s = Search(q)

        # result
        return self.conn.search(s, indices=indices, doc_types=doc_types, start=start, size=size)

    def delete(self, index='order_index', doc_type='resource', id=''):
        return self.conn.delete(index=index, doc_type=doc_type, id=id)

    def create(self, index='order_index', doc_type='resource', doc=None):
        # try:
        #     self.delete(index, doc_type, doc['id'])
        # except NotFoundException:
        #     pass
        try:
            return self.conn.index(doc, index, doc_type, id=doc['id'])
        except:  # no connection to the ES server
            pass

    def multi_create(self, index='order_index', doc_type='resource', doc=None):
        """如果同步缓存到远程,要使用celery"""
        try:
            return self.conn.index(doc, index, doc_type, id=doc['id'])
        except:  # no connection to the ES server
            pass
            
        try:
            for rconn in self.remote_conns:
                rconn.index(doc, index, doc_type, id=doc['id'])
        except:
            print '--------sync cache to remote error------'
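An illustrative (non-original) way the Elastic wrapper above might be wired up; the stand-in app object only supplies the config key the wrapper reads, and it assumes order_index holds resource documents.

class FakeApp(object):
    config = {'ELASTIC_URL': '127.0.0.1:9200'}

elastic = Elastic()
elastic.init_app(FakeApp())

# exact-value filters plus a complete_time range filter
results = elastic.search(start=0, size=20,
                         status='done',
                         complete_time=['2015-01-01', '2015-02-01'])
for r in results:
    print r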
Example #15
from pyes import ES

es = ES()
index_name = "my_index"
type_name = "my_type"

from utils_pyes import create_and_add_mapping, populate

create_and_add_mapping(es, index_name, type_name)
populate(es, index_name, type_name)

from pyes.query import *
from pyes.filters import *

results = es.search(index_name, type_name, MatchAllQuery())

print "total:", results.total
for r in results:
    print r

print "first element: ", results[0]
print "slice elements: ", results[1:4]

results = es.search(index_name, type_name, TermQuery("name", "joe", 3))

q1 = TermFilter("position", 1)
q2 = TermFilter("position", 2)
orq = ORFilter([q1, q2])
q = FilteredQuery(MatchAllQuery(), orq)

results = es.search(index_name, type_name, q)
Example #16
class BaseElasticSearchClient(BaseClient):

    def __init__(self, servers, index=None):
        """
        @param servers: Make sure to include the port with the server address
        @param index: Document index
        @return:
        """
        super(BaseElasticSearchClient, self).__init__()
        self.connection = None
        self.servers = servers

        if index is not None:
            self.index = index if type(index) is list else [index]

    def connect(self, connection_pool=1, bulk_size=10):
        update_connection_pool(connection_pool)

        try:
            self.connection = ES(self.servers, bulk_size=bulk_size)
        except NoServerAvailable:
            self._log.error('Failed to connect to elastic search server')
            return False
        return True

    def close(self):
        self.connection = None

    def _create_term_query(self, must_list):
        # TODO: add remaining conditional list functionality.
        query = BoolQuery()
        for term in must_list:
            query.add_must(term)
        return query

    def refresh_index(self, index_name, wait=1):
        self._log.info('ES: Refreshing index {0}'.format(index_name))
        self.connection.indices.refresh(index_name, timesleep=wait)

    def has_index(self, index_name):
        self._log.info('ES: Checking for index {0}'.format(index_name))
        try:
            self.connection.status(index_name)
        except IndexMissingException:
            return False
        return True

    def wait_for_index(self, index_name, wait=30):
        """ Checks to see if an index exists.
        Checks every second for int(X) seconds and returns True if successful
        """
        for i in range(0, int(wait)):
            if self.has_index(index_name):
                return True

            sleep(1)
        return False

    def wait_for_messages(self, name, value, num=1, index=None, max_wait=30):
        """ Wait for a specific number of messages to be returned within a
        specified amount of time.
        Checks every second for {max_wait} seconds and returns a list of msgs
        """
        for i in range(0, int(max_wait)):
            msgs = self.find_term(name=name, value=value, size=1, index=index)
            if len(msgs) == num:
                return msgs
            sleep(1)
        return []

    def delete_index(self, index_name):
        self._log.info('ES: Deleting index {0}'.format(index_name))
        self.connection.delete_index(index_name)

    def find_term(self, name, value, size=10, index=None):
        if not self.connection:
            return

        query = TermQuery(name, value)
        return self.connection.search(query=Search(query, size=size),
                                      indices=index or self.index)

    def find(self, filter_terms, size=10, doc_types=None, index=None):
        if not self.connection:
            return

        query = self._create_term_query(must_list=filter_terms)
        return self.connection.search(query=Search(query, size=size),
                                      indices=index or self.index,
                                      doc_types=doc_types)

    def find_one(self, filter_terms, doc_types=None, index=None):
        if not self.connection:
            return

        results = self.find(filter_terms=filter_terms, size=1,
                            doc_types=doc_types, index=index)
        return results[0] if len(results) > 0 else None
Example #17
from pyes import ES

es = ES()
index_name = "my_index"
type_name = "my_type"

from utils_pyes import create_and_add_mapping, populate

create_and_add_mapping(es, index_name, type_name)
populate(es, index_name, type_name)

from pyes.query import *

q = MatchAllQuery()
q = q.search()
q.facet.add_term_facet('tag')

results = es.search(index_name, type_name, q)

from pyes.facets import *
q = MatchAllQuery()
q = q.search()
q.facet.facets.append(DateHistogramFacet('date_facet',
    field='date',
    interval='month'))

results = es.search(index_name, type_name, q)

es.indices.delete(index_name)
Example #18
class DocManager():
    """The DocManager class creates a connection to the backend engine and
        adds/removes documents, and in the case of rollback, searches for them.

        The reason for storing id/doc pairs as opposed to doc's is so that
        multiple updates to the same doc reflect the most up to date version as
        opposed to multiple, slightly different versions of a doc.

        We are using elastic native fields for _id and ns, but we also store
        them as fields in the document, due to compatibility issues.
        """

    def __init__(self, url, auto_commit=True, unique_key='_id'):
        """Verify Elastic URL and establish a connection.
        """

        if verify_url(url) is False:
            raise SystemError
        self.elastic = ES(server=url)
        self.auto_commit = auto_commit
        self.doc_type = 'string'  # default type is string, change if needed
        self.unique_key = unique_key
        if auto_commit:
            self.run_auto_commit()

    def stop(self):
        """ Stops the instance
        """
        self.auto_commit = False

    def upsert(self, doc):
        """Update or insert a document into Elastic

        If you'd like to have different types of document in your database,
        you can store the doc type as a field in Mongo and set doc_type to
        that field. (e.g. doc_type = doc['_type'])

        """

        doc_type = self.doc_type
        index = doc['ns']
        doc[self.unique_key] = str(doc[self.unique_key])
        doc_id = doc[self.unique_key]
        id_query = TextQuery('_id', doc_id)
        elastic_cursor = self.elastic.search(query=id_query, indices=index)

        try:
            self.elastic.index(bsjson.dumps(doc), index, doc_type, doc_id)
        except ValueError:
            logging.info("Could not update %s" % (doc,))
        self.elastic.refresh()

    def remove(self, doc):
        """Removes documents from Elastic

        The input is a python dictionary that represents a mongo document.
        """
        try:
            self.elastic.delete(doc['ns'], 'string', str(doc[self.unique_key]))
        except (NotFoundException, TypeMissingException, IndexMissingException):
            pass

    def _remove(self):
        """For test purposes only. Removes all documents in test.test
        """
        try:
            self.elastic.delete('test.test', 'string', '')
        except (NotFoundException, TypeMissingException, IndexMissingException):
            pass

    def search(self, start_ts, end_ts):
        """Called to query Elastic for documents in a time range.
        """
        res = ESRange('_ts', from_value=start_ts, to_value=end_ts)
        results = self.elastic.search(RangeQuery(res))
        return results

    def _search(self):
        """For test purposes only. Performs search on Elastic with empty query.
        Does not have to be implemented.
        """
        results = self.elastic.search(MatchAllQuery())
        return results

    def commit(self):
        """This function is used to force a refresh/commit.
        """
        retry_until_ok(self.elastic.refresh)

    def run_auto_commit(self):
        """Periodically commits to the Elastic server.
        """
        self.elastic.refresh()

        if self.auto_commit:
            Timer(1, self.run_auto_commit).start()

    def get_last_doc(self):
        """Returns the last document stored in the Elastic engine.
        """

        result = self.elastic.search(MatchAllQuery(), size=1, sort='_ts:desc')
        for item in result:
            return item
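A hypothetical round trip through the DocManager above. It assumes verify_url, bsjson and retry_until_ok are available in the module, a node at http://localhost:9200, and documents carrying the 'ns' and '_ts' fields the class expects.

dm = DocManager('http://localhost:9200', auto_commit=False)
doc = {'_id': '42', 'ns': 'test.test', '_ts': 1, 'title': 'hello'}
dm.upsert(doc)            # indexed into index 'test.test' under doc type 'string'
print dm.search(0, 10)    # documents whose _ts falls in [0, 10]
dm.remove(doc)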
Example #19
from pyes import ES

es = ES()
index_name = "my_index"
type_name = "my_type"

from utils_pyes import create_and_add_mapping, populate

create_and_add_mapping(es, index_name, type_name)
populate(es, index_name, type_name)

from pyes.query import *
from pyes.aggs import *

q = MatchAllQuery()
q = q.search()
q.get_agg_factory().add(TermsAgg('pterms', field="parsedtext"))

results = es.search(q, indices=index_name, doc_types=type_name)

q = MatchAllQuery()
q = q.search()
q.get_agg_factory().add(DateHistogramAgg('date_add',
    field='date',
    interval='month'))

results = es.search(q, indices=index_name, doc_types=type_name)

es.indices.delete_index(index_name)
Example #20
from mediaresearchapp.tasks import MediaAggregateSQLTask

if __name__ == '__main__':
    es = ES("127.0.0.1:9200", default_indices='mediaaggregate')

    # Filters
    filters = [GeoDistanceFilter('location', [40.0, 9.00], 20, 'arc', 'km')]

    #     filters = [TermFilter('message', 'elastic'),
    #                GeoDistanceFilter('locations',
    #                                  {"lat": 40.0, "lon": 9.00},
    #                                  20, 'arc', 'km')
    #                ]
    filter = ANDFilter(filters)
    q = FilteredQuery(MatchAllQuery(), filter)
    results = es.search(q)
    for r in results:
        print r
        break

    q4 = RegexTermQuery('city', 'bang.*')
    print q4
    resultset = es.search(q4)
    for r in resultset:
        print r

    query_str = {
        "query": {
            "termquery": [{
                "fieldname1": "value"
            }, {
Example #21
class ESIndexerBase(object):
    ES_HOST = ES_HOST
    ES_INDEX_NAME = ES_INDEX_NAME
    ES_INDEX_TYPE = 'gene'

    def __init__(self):
        self.conn = ES(self.ES_HOST, default_indexes=[self.ES_INDEX_NAME],
                       timeout=10.0)
        self.step = 10000

    def create_index(self):
        try:
            print self.conn.open_index(self.ES_INDEX_NAME)
        except IndexMissingException:
            print self.conn.create_index(self.ES_INDEX_NAME)

    def delete_index_type(self, index_type):
        '''Delete all indexes for a given index_type.'''
        index_name = self.ES_INDEX_NAME
#        index_type = self.ES_INDEX_TYPE
        #Check if index_type exists
        mapping = self.conn.get_mapping(index_type, index_name)
        if index_name not in mapping or index_type not in mapping[index_name]:
            print 'Error: index type "%s" does not exist in index "%s".' % (index_type, index_name)
            return
        path = '/%s/%s' % (index_name, index_type)
        if ask('Confirm to delete all data under "%s":' % path) == 'Y':
            return self.conn.delete_mapping(index_name, index_type)

    def index(self, doc, index_type, id=None):
        '''add a doc to the index. If id is not None, the existing doc will be
           updated.
        '''
#        index_type = self.ES_INDEX_TYPE
        return self.conn.index(doc, self.ES_INDEX_NAME, index_type, id=id)

    def delete_index(self, index_type, id):
        '''delete a doc from the index based on passed id.'''
#        index_type = self.ES_INDEX_TYPE
        return self.conn.delete(self.ES_INDEX_NAME, index_type, id)

    def optimize(self):
        return self.conn.optimize(self.ES_INDEX_NAME, wait_for_merge=True)

    def get_field_mapping(self):
        import dataload
        reload(dataload)
        dataload.register_sources()
        return dataload.get_mapping()

    def build_index(self, doc_d, update_mapping=False, bulk=True):
        index_name = self.ES_INDEX_NAME
        index_type = self.ES_INDEX_TYPE

        #Test if index exists
        try:
            print "Opening index...", self.conn.open_index(index_name)
        except NotFoundException:
            print 'Error: index "%s" does not exist. Create it first.' % index_name
            return -1

        try:
            cur_mapping = self.conn.get_mapping(index_type, index_name)
            empty_mapping = False
        except ElasticSearchException:
            #if no existing mapping available for index_type
            #force update_mapping to True
            empty_mapping = True
            update_mapping = True

#        empty_mapping = not cur_mapping[index_name].get(index_type, {})
#        if empty_mapping:
#            #if no existing mapping available for index_type
#            #force update_mapping to True
#            update_mapping = True

        if update_mapping:
            print "Updating mapping...",
            if not empty_mapping:
                print "\n\tRemoving existing mapping...",
                print self.conn.delete_mapping(index_name, index_type)
            _mapping = self.get_field_mapping()
            print self.conn.put_mapping(index_type,
                                   _mapping,
                                   [index_name])
        print "Building index..."
        t0 = time.time()
        for doc_id, doc in doc_d.items():
            self.conn.index(doc, index_name, index_type, doc_id, bulk=bulk)
        print self.conn.flush()
        print self.conn.refresh()
        print "Done[%s]" % timesofar(t0)

    def query(self, qs, fields='symbol,name', **kwargs):
        _q = StringQuery(qs)
        res = self.conn.search(_q, fields=fields, **kwargs)
        return res
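A rough driving sequence for ESIndexerBase above (illustrative only; ES_HOST, ES_INDEX_NAME and the other module-level helpers it references are assumed to exist in the surrounding project).

indexer = ESIndexerBase()
indexer.create_index()                       # open or create ES_INDEX_NAME
indexer.index({'symbol': 'CDK2', 'name': 'cyclin dependent kinase 2'}, 'gene', id='1017')
res = indexer.query('cdk2', fields='symbol,name', size=5)
for hit in res:
    print hit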
Example #22
from pyes import ES

es = ES()
index_name = "my_index"
type_name = "my_type"

from utils_pyes import create_and_add_mapping, populate

create_and_add_mapping(es, index_name, type_name)
populate(es, index_name, type_name)

from pyes.query import *
from pyes.aggs import *

q = MatchAllQuery()
q = q.search()
q.get_agg_factory().add(TermsAgg('pterms', field="parsedtext"))

results = es.search(q, indices=index_name, doc_types=type_name)

q = MatchAllQuery()
q = q.search()
q.get_agg_factory().add(
    DateHistogramAgg('date_add', field='date', interval='month'))

results = es.search(q, indices=index_name, doc_types=type_name)

es.indices.delete_index(index_name)
Example #23
from pyes import ES
from pyes import TermQuery
from pyes import RangeQuery
from pyes import QueryStringQuery
from pyes import BoolQuery
from pyes import ESRange
from pyes import ANDFilter
from pyes import TermFilter
from pyes import FilteredQuery
from pyes import query

conn = ES('localhost:9200')

a_range = RangeQuery(qrange=ESRange('a', 0.179, 0.180))
b_filter = TermFilter("b", "0.2")
period_filter = TermFilter("period", "2")
total_filter = ANDFilter([b_filter, period_filter])
c_range = RangeQuery(qrange=ESRange('c', 8, 12))
que = FilteredQuery(BoolQuery(must=[a_range, c_range]), total_filter)

search = query.Search(query=que)
get = conn.search(search, indices='shrimp')
census = get.total

for i in get:
    print i
Example #24
def import_instruments(instrs, es_url, index, alias):
    """Create JSON ES docs and import."""

    prefix = {
        "bibo": "http://purl.org/ontology/bibo/",
        "dcterms": "http://purl.org/dc/terms/",
        "eos": "http://nasa.gov/eos.owl#",
        "gcis": "http://data.globalchange.gov/gcis.owl#",
        "hysds": "http://hysds.jpl.nasa.gov/hysds/0.1#",
        "info": "http://info-uri.info/",
        "xlink": "http://www.w3.org/1999/xlink"
    }

    conn = ES(es_url)
    if not conn.indices.exists_index(index):
        conn.indices.create_index(index)

    # track agencies/organizations
    orgs = {}

    for instr in instrs:
        identifier = "eos:%s" % instr['Instrument Name Short']
        id = hashlib.md5(identifier).hexdigest()
        if 'Instrument Technology' in instr and not EMPTY.search(instr['Instrument Technology']):
            sensor = "eos:%s" % instr['Instrument Technology']
        elif 'Instrument Type' in instr and not EMPTY.search(instr['Instrument Type']):
            sensor = "eos:%s" % instr['Instrument Type']
        elif 'Subtype' in instr and not EMPTY.search(instr['Subtype']):
            sensor = "eos:%s" % instr['Subtype']
        elif 'Type' in instr and not EMPTY.search(instr['Type']):
            sensor = "eos:%s" % instr['Type']
        elif 'Class' in instr and not EMPTY.search(instr['Class']):
            sensor = "eos:%s" % instr['Class']
        else:
            sensor = None
        #print(instr['Instrument Technology'], sensor)
        platform = None
        if 'Instrument Agencies' in instr and not EMPTY.search(instr['Instrument Agencies']):
            org = "eos:%s" % instr['Instrument Agencies']
            if org not in orgs:
               orgs[org] = {
                   "prov_es_json": {
                       "prefix": prefix,
                       "agent": {
                           org: {
                               "prov:type": {
                                   "type": "prov:QualifiedName",
                                   "$": "prov:Organization",
                               },
                           },
                       },
                   },
                   "identifier": org,
                   "prov:type": "prov:Organization",
               }
               if len(conn.search(query=TermQuery("_id", org),
                                  indices=[alias])) == 0:
                   conn.index(orgs[org], index, 'agent', org)
        else:
            org = None
        doc = {
            "prov_es_json": {
                "prefix": prefix,
                "entity": {
                    identifier: {
                        "gcis:hasSensor": sensor,
                        "gcis:inPlatform": platform,
                        "prov:type": "eos:instrument",
                        "gcis:hasGoverningOrganization": org,
                    },
                },
            },
            "gcis:hasSensor": sensor,
            "gcis:inPlatform": platform,
            "prov:type": "eos:instrument",
            "gcis:hasGoverningOrganization": org,
            "identifier": identifier,
        }
        if len(conn.search(query=TermQuery("_id", identifier),
                           indices=[alias])) == 0:
            conn.index(doc, index, 'entity', identifier)
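An illustrative call to import_instruments() above with made-up values; it assumes the module-level EMPTY regex and hashlib import, an Elasticsearch node at the given URL, and that the alias already resolves to an index.

instrs = [{
    'Instrument Name Short': 'MODIS',
    'Instrument Type': 'Imaging Radiometers',
    'Instrument Agencies': 'NASA',
}]
import_instruments(instrs, 'http://localhost:9200', 'prov_es_v0.1', 'prov_es')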
Example #25
def import_instruments(instrs, es_url, index, alias):
    """Create JSON ES docs and import."""

    prefix = {
        "bibo": "http://purl.org/ontology/bibo/",
        "dcterms": "http://purl.org/dc/terms/",
        "eos": "http://nasa.gov/eos.owl#",
        "gcis": "http://data.globalchange.gov/gcis.owl#",
        "hysds": "http://hysds.jpl.nasa.gov/hysds/0.1#",
        "info": "http://info-uri.info/",
        "xlink": "http://www.w3.org/1999/xlink"
    }

    conn = ES(es_url)
    if not conn.indices.exists_index(index):
        conn.indices.create_index(index)

    # track agencies/organizations
    orgs = {}

    for instr in instrs:
        identifier = "eos:%s" % instr['Instrument Name Short']
        id = hashlib.md5(identifier).hexdigest()
        if 'Instrument Technology' in instr and not EMPTY.search(
                instr['Instrument Technology']):
            sensor = "eos:%s" % instr['Instrument Technology']
        else:
            if 'Instrument Type' in instr and not EMPTY.search(
                    instr['Instrument Type']):
                sensor = "eos:%s" % instr['Instrument Type']
            else:
                if 'Subtype' in instr and not EMPTY.search(instr['Subtype']):
                    sensor = "eos:%s" % instr['Subtype']
                else:
                    if 'Type' in instr and not EMPTY.search(instr['Type']):
                        sensor = "eos:%s" % instr['Type']
                    else:
                        if 'Class' in instr and not EMPTY.search(
                                instr['Class']):
                            sensor = "eos:%s" % instr['Class']
                        else:
                            sensor = None
        #print(instr['Instrument Technology'], sensor)
        platform = None
        if 'Instrument Agencies' in instr and not EMPTY.search(
                instr['Instrument Agencies']):
            org = "eos:%s" % instr['Instrument Agencies']
            if org not in orgs:
                orgs[org] = {
                    "prov_es_json": {
                        "prefix": prefix,
                        "agent": {
                            org: {
                                "prov:type": {
                                    "type": "prov:QualifiedName",
                                    "$": "prov:Organization",
                                },
                            },
                        },
                    },
                    "identifier": org,
                    "prov:type": "prov:Organization",
                }
                if len(
                        conn.search(query=TermQuery("_id", org),
                                    indices=[alias])) > 0:
                    pass
                else:
                    conn.index(orgs[org], index, 'agent', org)
        else:
            org = None
        doc = {
            "prov_es_json": {
                "prefix": prefix,
                "entity": {
                    identifier: {
                        "gcis:hasSensor": sensor,
                        "gcis:inPlatform": platform,
                        "prov:type": "eos:instrument",
                        "gcis:hasGoverningOrganization": org,
                    },
                },
            },
            "gcis:hasSensor": sensor,
            "gcis:inPlatform": platform,
            "prov:type": "eos:instrument",
            "gcis:hasGoverningOrganization": org,
            "identifier": identifier,
        }
        if len(conn.search(query=TermQuery("_id", identifier),
                           indices=[alias])) > 0:
            pass
        else:
            conn.index(doc, index, 'entity', identifier)
Example #26
ftrans = FormatTranslator()

# 1. Create Connection
conn = ES()

# 2. Index Data
dataset_json = open("../dataset.json")
dataset = json.load(dataset_json)['data']
for data in dataset:
    conn.index(data, "example_index", "example_type",
               "example_id_" + str(dataset.index(data)))

# 3. Create Simple Query
query = MatchAllQuery()

# 4. Create Simple Aggregation
agg = TermsAgg('agg1', field="name", sub_aggs=[], size=100)

# 5. Get Result
search = Search(query, size=5)
search.agg.add(agg)
print search.serialize()

result = conn.search(search, "example_index", "example_type")

for i in result:
    print json.dumps(i, indent=2)
print json.dumps(result.aggs, indent=2)

result._do_search()
print json.dumps(result._results, indent=2)
Example #27
class ElasticSearchServer(ESDBRequests):
    """
    An object representing the CouchDB server, use it to list, create, delete
    and connect to databases.

    More info http://wiki.apache.org/couchdb/HTTP_database_API
    """

    def __init__(self, indices, types, dburl='http://localhost:9200',
                 usePYCurl=False, ckey=None, cert=None, capath=None):
        """
        Set up a connection to the Elasticsearch server
        """
        check_server_url(dburl)
        # PYCurl TODO
        # Same with cert and key
        self.url = dburl
        self.ESconn = ES(dburl)
        self.ckey = ckey
        self.cert = cert
        check_name(indices)
        check_name(types)
        self.indices = indices
        self.types = types

    def listDatabases(self):
        "List all the databases the server hosts"
        # TODO
        return self.get('/_all_dbs')

    def createDatabase(self, schema):
        """
        A database must be named with all lowercase characters (a-z),
        digits (0-9), or any of the _$()+-/ characters and must end with a slash
        in the URL.
        """
        self.ESconn.indices.create_index_if_missing(self.indices)
        self.ESconn.indices.put_mapping(self.types, {'properties': schema}, [self.indices])

    def insertDoc(self, doc, _id):
        """ TODO """
        self.ESconn.index(doc, self.indices, self.types, _id)

    def deleteDoc(self,  _id):
        self.ESconn.delete(self.indices, self.types, _id)

    def termBoolQuery(self, query):
        """ query - dict
            must:
                key = key in the database
                value = searchable value
            should
                key = key in the database
                value = searchable value
            must_not
                key = key in the database
                value = searchable value
        """
        queryMust = []
        queryShould = []
        queryMustNot = []
        for item in ["must", "should", "must_not"]:
            if item in query:
                for dictVals in query[item]:
                    for dictKey in dictVals:
                        tempq = TermQuery(dictKey, dictVals[dictKey])
                        if item == "must":
                            queryMust.append(tempq)
                        elif item == "should":
                            queryShould.append(tempq)
                        elif item == "must_not":
                            queryMustNot.append(tempq)
        query = BoolQuery(must=None if not queryMust else queryMust,
                          should=None if not queryShould else queryShould,
                          must_not=None if not queryMustNot else queryMustNot)

        search = Search(query)
        results = self.ESconn.search(search, self.indices)
        response = {"status_code": 200, "message": "Successful", "content": []}
        response["content"] = [result for result in results]
        return response
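The termBoolQuery docstring above describes the expected dict shape; a hypothetical call matching it (field names are invented, and ESDBRequests plus the check_* helpers are assumed to come from the surrounding package) might look like this.

server = ElasticSearchServer(indices='jobs', types='job',
                             dburl='http://localhost:9200')
response = server.termBoolQuery({
    'must': [{'state': 'running'}],
    'should': [{'site': 'T1_US_FNAL'}, {'site': 'T2_CH_CERN'}],
    'must_not': [{'owner': 'test'}],
})
print response['status_code'], len(response['content'])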
Example #28
class EsRestConnection(RestConnection):
    def __init__(self, serverInfo, proto = "http"):
        #serverInfo can be a json object
        #only connect pyes to master es node
        #in the case that other nodes are taken down
        #because http requests will fail
        # TODO: dynamic master node detection
        if isinstance(serverInfo, dict):
            self.ip = serverInfo["ip"]
            self.rest_username = serverInfo["username"]
            self.rest_password = serverInfo["password"]
            self.username = serverInfo["es_username"]
            self.password = serverInfo["es_password"]
            self.port = 9091 #serverInfo["port"]
        else:
            self.ip = serverInfo.ip
            self.rest_username = serverInfo.rest_username
            self.rest_password = serverInfo.rest_password
            self.username = serverInfo.es_username
            self.password = serverInfo.es_password
            self.port = 9091 # serverInfo.port

        self.baseUrl = "http://{0}:{1}/".format(self.ip, self.port)
        self.capiBaseUrl = self.baseUrl
        self.esHttpUrl = "http://{0}:9200".format(self.ip)
        self.http_port = str(int(self.port) + 109)
        self.proto = proto
        self.conn = ES(server=self.esHttpUrl)
        self.manager = managers.Cluster(self.conn)
        self.test_params = TestInputSingleton.input
        self.docs = None

    def get_index_stats(self):
        return ES.index_stats()

    def get_indices(self):
        return self.conn.indices.get_indices()

    def get_indices_as_buckets(self, doc_type='couchbaseDocument'):
        buckets = []
        indices = self.get_indices()

        for index in indices:
            bucket = Bucket()
            q = query.MatchAllQuery()
            docs = self.conn.search(q,index,doc_type)
            bucket.name = index
            bucket.type = "es"
            bucket.port = self.port
            bucket.authType = None
            bucket.saslPassword = self.password
            bucket.nodes = list()

            #vBucketServerMap
            bucketStats = BucketStats()
            bucketStats.itemCount = docs.count()
            bucket.stats = bucketStats
            buckets.append(bucket)
            bucket.master_id = "es@"+self.ip

        return buckets

    def get_bucket(self, bucket_name, doc_type):
        for bucket in self.get_indices_as_buckets(doc_type):
            if bucket.name == bucket_name:
                return bucket
        return

    def get_buckets(self):
        return self.get_indices_as_buckets()

    def delete_index(self, name):
        self.conn.indices.delete_index(name)
        return self.conn.indices.exists_index(name)

    def create_index(self, name):

        if self.conn.indices.exists_index(name):
            self.delete_index(name)

        self.conn.indices.create_index(name)
        return self.conn.indices.exists_index(name)

    def delete_bucket(self, name):
        return self.delete_index(name)

    def create_bucket(self, *args, **kwargs):
        name  = 'default'

        if len(args) > 0:
            name = args[0]
        else:
            name = kwargs['bucket']

        return self.create_index(name)

    def is_ns_server_running(self, timeout_in_seconds=360):
        return True


    def node_statuses(self, timeout=120):
        otp_nodes = []

        for node in self.get_nodes():

            #get otp,get status
            otp_node = OtpNode(id=node.id,
                               status=node.status)

            otp_node.ip = node.ip
            otp_node.port = node.port
            otp_node.replication = None
            otp_nodes.append(node)

        return otp_nodes


    def get_nodes_self(self, timeout=120):
        for node in self.get_nodes():
            # force to return master node
            if node.port == 9091:
                return node
        return

    def get_nodes(self):
        es_nodes = []
        nodes = self.manager.state()['nodes']
        status = self.manager.health()['status']
        if status == "green":
            status = "healthy"

        for node_key in nodes:
            nodeInfo = nodes[node_key]
            ex_params = self.get_node_params(nodeInfo)

            nodeInfo.update({'ssh_password' : ex_params.ssh_password,
                             'ssh_username' : ex_params.ssh_username})
            nodeInfo['key'] = node_key
            node = ESNode(nodeInfo)
            node.status = status
            es_nodes.append(node)
        return es_nodes

    def get_node_params(self, info):
        ip, port = parse_addr(info["transport_address"])
        clusters = self.test_params.clusters
        master_node = None
        for _id in clusters:
            for node in clusters[_id]:
                if node.ip == ip and int(node.port) == port:
                    return node
                if int(node.port) == 9091:
                    master_node = node

        # use params from master node
        return master_node

    def search_term(self, key, indices=["default"]):
        result = None
        params = {"term":{"_id":key}}
        query = ES.Search(params)
        row = self.conn.search(query, indices = indices)
        if row.total > 0:
           result = row[0]
        return result

    def term_exists(self, key, indices=["default"]):
        return self.search_term(key, indices = indices) is not None

    def all_docs(self, keys_only = False, indices=["default"],size=10000):
        q = query.MatchAllQuery()

        docs = self.conn.search(q,indices=indices,doc_types='couchbaseDocument')
        res_docs = []

        for row in docs:
            if keys_only:
                row = row['meta']['id']
            res_docs.append(row)

        return res_docs

    # check if a key exists by checking all known nodes
    # See - CBES-17
    # for use when it seems nodes are out of sync
    def search_all_nodes(self, key, indices=["default"]):
        doc = None
        for index in indices:
           for _node in self.get_nodes():
               ip, port = (_node.ip, _node.ht_port)
               r = requests.get('http://%s:%s/%s/couchbaseDocument/%s?preference=_only_node:%s' %\
                   (ip, port, index, key, _node.key))
               if r.status_code == 200 :
                   if r.json()['_id'] == key:
                       doc = r.json()
                       break

        return doc

    def fetch_bucket_stats(self, bucket='default', zoom='minute'):

        return { "op" : { "samples" : { "xdc_ops" : [0] } } }

    def start_replication(self, *args, **kwargs):
        return "es",self.ip

    def _rebalance_progress(self, *args, **kwargs):
        return 100

    def _rebalance_progress_status(self, *args, **kwargs):
        return 'not running'

    def get_vbuckets(self, *args, **kwargs):
        return ()

    def replace_template(self, node, file):
        f = open(file, 'r')
        template = f.read().replace('\n', ' ')
        api =  "http://{0}:9200/_template/couchbase".format(node.ip)
        status, content, header = self._http_request(api, 'PUT', template)
        if status:
            log.info('uploaded couchbase template: '+file)
        else:
            log.error('template upload failed: {0}'.format(content))

    def add_node(self, user='', password='', remoteIp='', port='8091',zone_name='', services=None):
        pass

    def update_configuration(self, node, commands):
        rmc = RemoteMachineShellConnection(node)
        shell = rmc._ssh_client.invoke_shell()
        for command in commands:
            log.info('Adding elastic search config {0} on node {1}'.format(command, self.ip))
            shell.send('echo "{0}" >> ~/elasticsearch/config/elasticsearch.yml \n'.format(command))
            while not shell.recv_ready():
                time.sleep(2)
            rc = shell.recv(1024)
            log.info(rc)

    def reset_configuration(self, node, count=1):
        rmc = RemoteMachineShellConnection(node)
        shell = rmc._ssh_client.invoke_shell()
        log.info('Removing last {0} lines from elastic search config on node {1}'.format(count, self.ip))
        shell.send('head -n -{0}  ~/elasticsearch/config/elasticsearch.yml > temp ; mv temp  ~/elasticsearch/config/elasticsearch.yml \n'.format(count))
        while not shell.recv_ready():
            time.sleep(2)
        rc = shell.recv(1024)
        log.info(rc)


    def start_es_node(self, node):
        rmc = RemoteMachineShellConnection(node)
        shell=rmc._ssh_client.invoke_shell()
        es_kill = "pkill -f elasticsearch;"

        shell.send(es_kill+' \n')
        while not shell.recv_ready():
            time.sleep(2)

        rc = shell.recv(1024)
        log.info(rc)
        log.info("Sleep for 30 seconds")
        time.sleep(30)


        # define es exec path if not in $PATH environment

        es_bin = "~/elasticsearch/bin/elasticsearch -Dtransport.couchbase=TRACE -Dcom.couchbase=TRACE > /var/log/es.log 2>&1 &"
        if 'es_bin' in TestInputSingleton.input.test_params:
            es_bin = TestInputSingleton.input.test_params['es_bin']

        # connect to remote node
        log.info('Starting node: %s:%s' % (node.ip, node.port))

        # start es service
        shell.send(es_bin+' \n')
        while not shell.recv_ready():
            time.sleep(2)

        rc = shell.recv(1024)
        log.info(rc)

        log.info("Sleep for 5 seconds before the node can appear")
        time.sleep(5)
        # wait for new node
        tries = 0
        while tries < 10:
            for cluster_node in self.get_nodes():
                if cluster_node.ip == node.ip and cluster_node.port == int(node.port):
                    return
                else:
                    log.info('Waiting for new node to appear')
                    time.sleep(5)
                    tries = tries + 1

        raise Exception("failed to add node to cluster: %s:%s" % (node.ip,node.port))

    def log_client_error(self, post):
        # cannot post req errors to 9091
        pass

    def vbucket_map_ready(self, *args, **kwargs):
        return True

    def init_cluster(self, *args, **kwargs):
        pass

    def init_cluster_memoryQuota(self, *args, **kwargs):
        pass

    def set_reb_cons_view(self, *args, **kwargs):
        pass

    def set_reb_index_waiting(self, *args, **kwargs):
        pass

    def set_rebalance_index_pausing(self, *args, **kwargs):
        pass

    def set_max_parallel_indexers(self, *args, **kwargs):
        pass

    def set_max_parallel_replica_indexers(self, *args, **kwargs):
        pass


    def rebalance(self, otpNodes, ejectedNodes):
        # shutdown ejected nodes
        # wait for shards to be rebalanced

        nodesToShutdown = \
            [node for node in self.get_nodes() if node.id in ejectedNodes]

        for node in nodesToShutdown:
            self.eject_node(node)

    def eject_node(self, node):
        api = "http://%s:9200/_cluster/nodes/local/_shutdown?delay=0s" % (node.ip)
        status, content, header = self._http_request(api, 'POST', '')
        if status:
            log.info('ejected node: '+node.ip)
        else:
            log.error('rebalance operation failed: {0}'.format(content))



    def monitorRebalance(self, stop_if_loop=False):
        # since removed nodes are shutdown use master node for monitoring
        return self.get_nodes_self()

    def get_pools_info(self):
        return {'pools' : []}

    def add_remote_cluster(self, *args, **kwargs):
        # detect 2:1 mapping and do special cluster add
        # otherwise run super method
        pass

    def remove_all_remote_clusters(self):
        pass

    def remove_all_replications(self):
        pass

    def is_cluster_mixed(self):
        return False

    def set_internalSetting(self, param, value):
        return {'ok' : True}
Example #29
class EsRestConnection(RestConnection):
    def __init__(self, serverInfo, proto="http"):
        #serverInfo can be a json object
        #only connect pyes to master es node
        #in the case that other nodes are taken down
        #because http requests will fail
        # TODO: dynamic master node detection
        if isinstance(serverInfo, dict):
            self.ip = serverInfo["ip"]
            self.rest_username = serverInfo["username"]
            self.rest_password = serverInfo["password"]
            self.username = serverInfo["es_username"]
            self.password = serverInfo["es_password"]
            self.port = 9091  #serverInfo["port"]
        else:
            self.ip = serverInfo.ip
            self.rest_username = serverInfo.rest_username
            self.rest_password = serverInfo.rest_password
            self.username = serverInfo.es_username
            self.password = serverInfo.es_password
            self.port = 9091  # serverInfo.port

        self.baseUrl = "http://{0}:{1}/".format(self.ip, self.port)
        self.capiBaseUrl = self.baseUrl
        self.esHttpUrl = "http://{0}:9200".format(self.ip)
        self.http_port = str(int(self.port) + 109)
        self.proto = proto
        self.conn = ES(server=self.esHttpUrl)
        self.manager = managers.Cluster(self.conn)
        self.test_params = TestInputSingleton.input
        self.docs = None

    def get_index_stats(self):
        return ES.index_stats()

    def get_indices(self):
        schema = self.conn.indices.get_mapping()
        indices_full_list = schema.get_all_indices()
        just_indices = [
            index for index in indices_full_list if not index.startswith(".")
        ]
        return just_indices

    def get_indices_as_buckets(self, doc_type='couchbaseDocument'):
        buckets = []
        indices = self.get_indices()

        for index in indices:
            bucket = Bucket()
            q = query.MatchAllQuery()
            docs = self.conn.search(q, index, doc_type)
            bucket.name = index
            bucket.type = "es"
            bucket.port = self.port
            bucket.nodes = list()

            #vBucketServerMap
            bucketStats = BucketStats()
            bucketStats.itemCount = docs.count()
            bucket.stats = bucketStats
            buckets.append(bucket)
            bucket.master_id = "es@" + self.ip

        return buckets

    def get_bucket(self, bucket_name, doc_type='couchbaseDocument'):
        for bucket in self.get_indices_as_buckets(doc_type):
            if bucket.name == bucket_name:
                return bucket
        return

    def get_buckets(self):
        return self.get_indices_as_buckets()

    def delete_index(self, name):
        self.conn.indices.delete_index(name)
        return self.conn.indices.exists_index(name)

    def create_index(self, name):

        if self.conn.indices.exists_index(name):
            self.delete_index(name)

        self.conn.indices.create_index(name)
        return self.conn.indices.exists_index(name)

    def delete_bucket(self, name):
        return self.delete_index(name)

    def create_bucket(self, *args, **kwargs):
        name = 'default'

        if len(args) > 0:
            name = args[0]
        else:
            name = kwargs['bucket']

        return self.create_index(name)

    def is_ns_server_running(self, timeout_in_seconds=360):
        return True

    def node_statuses(self, timeout=120):
        otp_nodes = []

        for node in self.get_nodes():

            # wrap the ES node in an OtpNode so callers get a ns_server-style view
            otp_node = OtpNode(id=node.id, status=node.status)

            otp_node.ip = node.ip
            otp_node.port = node.port
            otp_node.replication = None
            otp_nodes.append(otp_node)

        return otp_nodes

    def get_nodes_self(self, timeout=120):
        for node in self.get_nodes():
            # force to return master node
            if node.port == 9091:
                return node
        return

    def get_nodes(self):
        es_nodes = []
        nodes = self.manager.state()['nodes']
        status = self.manager.health()['status']
        if status == "green":
            status = "healthy"

        for node_key in nodes:
            nodeInfo = nodes[node_key]
            ex_params = self.get_node_params(nodeInfo)

            nodeInfo.update({
                'ssh_password': ex_params.ssh_password,
                'ssh_username': ex_params.ssh_username
            })
            nodeInfo['key'] = node_key
            node = ESNode(nodeInfo)
            node.status = status
            es_nodes.append(node)
        return es_nodes

    def get_node_params(self, info):
        ip, port = parse_addr(info["transport_address"])
        clusters = self.test_params.clusters
        master_node = None
        for _id in clusters:
            for node in clusters[_id]:
                if node.ip == ip and int(node.port) == port:
                    return node
                if int(node.port) == 9091:
                    master_node = node

        # use params from master node
        return master_node

    def search_term(self, key, indices=["default"]):
        result = None
        # term lookup on the document _id
        q = query.TermQuery("_id", key)
        row = self.conn.search(q, indices=indices)
        if row.total > 0:
            result = row[0]
        return result

    def term_exists(self, key, indices=["default"]):
        return self.search_term(key, indices=indices) is not None

    def all_docs(self, keys_only=False, indices=["default"], size=10000):
        q = query.MatchAllQuery()

        docs = self.conn.search(q,
                                indices=indices,
                                doc_types='couchbaseDocument')
        res_docs = []

        for row in docs:
            if keys_only:
                row = row['meta']['id']
            res_docs.append(row)

        return res_docs

    # check if a key exists by checking all known nodes
    # See - CBES-17
    # for use when it seems nodes are out of sync
    def search_all_nodes(self, key, indices=["default"]):
        doc = None
        for index in indices:
            for _node in self.get_nodes():
                ip, port = (_node.ip, _node.ht_port)
                r = requests.get('http://%s:%s/%s/couchbaseDocument/%s?preference=_only_node:%s' %\
                    (ip, port, index, key, _node.key))
                if r.status_code == 200:
                    if r.json()['_id'] == key:
                        doc = r.json()
                        break

        return doc

    def fetch_bucket_stats(self, bucket_name='default'):
        bucket = self.get_bucket(bucket_name=bucket_name)
        return bucket.stats

    def start_replication(self, *args, **kwargs):
        return "es", self.ip

    def _rebalance_progress(self, *args, **kwargs):
        return 100

    def _rebalance_progress_status(self, *args, **kwargs):
        return 'not running'

    def get_vbuckets(self, *args, **kwargs):
        return ()

    def replace_template(self, node, file):
        with open(file, 'r') as f:
            template = f.read().replace('\n', ' ')
        api = "http://{0}:9200/_template/couchbase".format(node.ip)
        status, content, header = self._http_request(api, 'PUT', template)
        if status:
            log.info('uploaded couchbase template: ' + file)
        else:
            log.error('template upload failed: {0}'.format(content))

    def add_node(self,
                 user='',
                 password='',
                 remoteIp='',
                 port='8091',
                 zone_name='',
                 services=None):
        pass

    def update_configuration(self, node, commands):
        rmc = RemoteMachineShellConnection(node)
        shell = rmc._ssh_client.invoke_shell()
        for command in commands:
            log.info('Adding elastic search config {0} on node {1}'.format(
                command, node.ip))
            shell.send(
                'echo "{0}" >> ~/elasticsearch/config/elasticsearch.yml \n'.
                format(command))
            while not shell.recv_ready():
                time.sleep(2)
            rc = shell.recv(1024)
            log.info(rc)

    def reset_configuration(self, node, count=1):
        rmc = RemoteMachineShellConnection(node)
        shell = rmc._ssh_client.invoke_shell()
        log.info(
            'Removing last {0} lines from elastic search config on node {1}'.
            format(count, node.ip))
        shell.send(
            'head -n -{0}  ~/elasticsearch/config/elasticsearch.yml > temp ; mv temp  ~/elasticsearch/config/elasticsearch.yml \n'
            .format(count))
        while not shell.recv_ready():
            time.sleep(2)
        rc = shell.recv(1024)
        log.info(rc)

    def start_es_node(self, node):
        rmc = RemoteMachineShellConnection(node)
        shell = rmc._ssh_client.invoke_shell()
        es_kill = "pkill -f elasticsearch;"

        shell.send(es_kill + ' \n')
        while not shell.recv_ready():
            time.sleep(2)

        rc = shell.recv(1024)
        log.info(rc)
        log.info("Sleep for 30 seconds")
        time.sleep(30)

        # define es exec path if not in $PATH environment

        es_bin = "~/elasticsearch/bin/elasticsearch -Dtransport.couchbase=TRACE -Dcom.couchbase=TRACE > /var/log/es.log 2>&1 &"
        if 'es_bin' in TestInputSingleton.input.test_params:
            es_bin = TestInputSingleton.input.test_params['es_bin']

        # connect to remote node
        log.info('Starting node: %s:%s' % (node.ip, node.port))

        # start es service
        shell.send(es_bin + ' \n')
        while not shell.recv_ready():
            time.sleep(2)

        rc = shell.recv(1024)
        log.info(rc)

        log.info("Sleep for 5 seconds before the node can appear")
        time.sleep(5)
        # wait for the new node to join the cluster
        tries = 0
        while tries < 10:
            for cluster_node in self.get_nodes():
                if cluster_node.ip == node.ip and \
                        cluster_node.port == int(node.port):
                    return
            log.info('Waiting for new node to appear')
            time.sleep(5)
            tries += 1

        raise Exception("failed to add node to cluster: %s:%s" %
                        (node.ip, node.port))

    def log_client_error(self, post):
        # cannot post req errors to 9091
        pass

    def vbucket_map_ready(self, *args, **kwargs):
        return True

    def init_cluster(self, *args, **kwargs):
        pass

    def init_cluster_memoryQuota(self, *args, **kwargs):
        pass

    def set_reb_cons_view(self, *args, **kwargs):
        pass

    def set_reb_index_waiting(self, *args, **kwargs):
        pass

    def set_rebalance_index_pausing(self, *args, **kwargs):
        pass

    def set_max_parallel_indexers(self, *args, **kwargs):
        pass

    def set_max_parallel_replica_indexers(self, *args, **kwargs):
        pass

    def rebalance(self, otpNodes, ejectedNodes):
        # shutdown ejected nodes
        # wait for shards to be rebalanced

        nodesToShutdown = \
            [node for node in self.get_nodes() if node.id in ejectedNodes]

        for node in nodesToShutdown:
            self.eject_node(node)

    def eject_node(self, node):
        api = "http://%s:9200/_cluster/nodes/local/_shutdown?delay=0s" % (
            node.ip)
        status, content, header = self._http_request(api, 'POST', '')
        if status:
            log.info('ejected node: ' + node.ip)
        else:
            log.error('rebalance operation failed: {0}'.format(content))

    def monitorRebalance(self, stop_if_loop=False):
        # since removed nodes are shut down, use the master node for monitoring
        return self.get_nodes_self()

    def get_pools_info(self):
        return {'pools': []}

    def add_remote_cluster(self, *args, **kwargs):
        # detect 2:1 mapping and do special cluster add
        # otherwise run super method
        pass

    def remove_all_remote_clusters(self):
        pass

    def remove_all_replications(self):
        pass

    def is_cluster_mixed(self):
        return False

    def set_internalSetting(self, param, value):
        return {'ok': True}
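
# Minimal usage sketch for EsRestConnection (assumptions: an ES node reachable
# on port 9200 at the given ip, and TestInputSingleton.input already populated
# by the test harness; the server descriptor below is purely illustrative).
server_info = {
    "ip": "10.1.2.3",
    "username": "Administrator",
    "password": "password",
    "es_username": "es_admin",
    "es_password": "es_password",
}
es_rest = EsRestConnection(server_info)
print es_rest.get_indices()                    # user indices, dot-prefixed system indices filtered out
print [b.name for b in es_rest.get_buckets()]  # each index exposed as a pseudo Couchbase bucket
print len(es_rest.all_docs(keys_only=True))    # ids of all docs in the 'default' index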
Example #30
class ElasticCatalog(object):
    default_indexes = {
        'zelastic_doc_id': {
            'type': 'string',
            'index': 'not_analyzed'
        }
    }

    def __init__(self, connection_string, elastic_name, storage, bulk=False,
                 bulk_size=400):
        self.conn = ES(connection_string, bulk_size=bulk_size)
        self.bulk_size = bulk_size
        self.name = elastic_name
        self.storage = storage
        self.bulk = bulk

    def update_mapping(self, name):
        meta = self.storage.meta(name)
        indexes = meta['indexes']
        properties = self.default_indexes.copy()
        try:
            self.conn.create_index(self.name)
        except IndexAlreadyExistsException:
            pass
        for index_name, _type in indexes.items():
            index = None
            if _type == 'str':
                index = {
                    'type': 'string',
                    'index': 'not_analyzed',
                }
            elif _type == 'full':
                index = {
                    'type': 'string',
                    'index': 'analyzed',
                }
            elif _type == 'bool':
                index = {
                    'type': 'boolean'
                }
            elif _type == 'int':
                index = {
                    'type': 'integer',
                }
            elif _type in ('datetime', 'date'):
                index = {
                    'type': 'date',
                }
            elif _type == 'float':
                index = {
                    'type': 'float',
                }
            if index is not None:
                properties[index_name] = index
        self.conn.indices.put_mapping(
            doc_type=name,
            mapping={
                'ignore_conflicts': True,
                'properties': properties
            },
            indices=[self.name])

    def id(self, container_name, key):
        return '%s-%s' % (container_name, key)

    def index(self, container_name, doc, key):
        # need to add data to the index that isn't actually persisted
        data = {
            'zelastic_doc_id': key
        }
        meta = self.storage.meta(container_name)
        indexes = meta['indexes']
        for index in indexes.keys():
            if index in doc:
                data[index] = doc[index]
        self.conn.index(
            data,
            self.name,
            container_name,
            self.id(container_name, key),
            bulk=self.bulk)

    def delete(self, container_name, key):
        self.conn.delete(
            self.name,
            container_name,
            self.id(container_name, key),
            bulk=self.bulk)

    def delete_all(self, container_name):
        self.conn.delete_mapping(
            self.name,
            container_name)

    def search(self, container_name, query, **kwargs):
        return self.conn.search(
            query,
            indexes=[self.name],
            doc_types=[container_name],
            **kwargs)

    def getFacets(self, container_name, field, size=100):
        return self.conn.search_raw({
            "facets": {
                field: {
                    "terms": {
                        "all_terms": True,
                        "field": field,
                        "size": size,
                        "order": "term"
                    }
                }
            }
        }, indexes=[self.name], doc_type=container_name)
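
# Minimal usage sketch for ElasticCatalog against a local node. DummyStorage is
# a hypothetical stand-in: the catalog only needs meta(name) to return a dict
# whose 'indexes' entry maps field name -> zelastic index type.
from pyes.query import MatchAllQuery

class DummyStorage(object):
    def meta(self, name):
        return {'indexes': {'title': 'full', 'views': 'int'}}

catalog = ElasticCatalog('127.0.0.1:9200', 'zelastic', DummyStorage())
catalog.update_mapping('pages')                # create the index and field mappings
catalog.index('pages', {'title': 'Hello world', 'views': 3}, 'page-1')
hits = catalog.search('pages', MatchAllQuery())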
Example #31
class ElasticSearch(object):
    def __init__(self, query):
        self.elastic = ES(settings.SEARCH_HOSTS)
        self.query = QueryParser(query)

    def search(self, index_type='job'):
        if not self.query.is_valid():
            self.results = EmptyResults()
            return
        
        where_filters = [ MatchAllFilter() ]
            
        if self.query.params['where']:
            where_filters = [ QueryFilter(StringQuery(self.query.get_where(), search_fields=['city'],\
                analyze_wildcard=True, default_operator="AND")) ]

        query = FilteredQuery(
            StringQuery(self.query.params['what'], search_fields=settings.SEARCH_WHAT_FIELDS, default_operator='AND', analyze_wildcard=True)\
                if self.query.params['what'] else MatchAllQuery(),
            ORFilter(self._get_geo_filters(where_filters))
        )

        facets_filter = []
        
        if self.query.params.has_key('company.facet') and len(self.query.params['company.facet']):
            facets_filter.append(TermFilter('company.facet', self.query.params['company.facet']))

        sorting = {'_score': {'order': 'desc' }}
        
        if self.query.params.has_key('sorting'):
            if self.query.params['sorting'] in ['score', 'published_date']:
                sorting = {self.query.params['sorting']: {'order': 'desc' }}

        query = Search(
            query, ORFilter(facets_filter) if len(facets_filter) else [], 
            start=self.query.start_page(), size=settings.PAGINATION_PAGE_SIZE, sort=sorting
        )
        
        query.facet.add_term_facet(
            'company.facet', 
            size=settings.SEARCH_FACETS_SIZE
        )

        self.results = self.elastic.search(query, settings.SEARCH_ALIASES, index_type)
        logger.info('Elastic query: %s\n' % str(query.to_search_json()))

    def get_results(self):
        data = []

        if self.results.total:
            for result in self.results.hits:
                item = {}              
                
                if result.has_key('_source'):
                    item = result['_source']
                    del item['details_url']
                    
                    if item.has_key('title'):
                        item['redirect_url'] = reverse('redirect', kwargs={
                            'slug'   : slugify(result['_source']['title']),
                            'source' : result['_source']['source'],
                            'job_id' : result['_id']
                        })
                    
                    if item.has_key('published_date'):
                        item['published_date_ago'] = timesince(result['_source']['published_date']).encode('utf-8')
    
                    if item.has_key('summary'):
                        item['summary'] = truncatechars(result['_source']['summary'], 350)
                    elif item.has_key('content') and item['content']:
                        item['summary'] = truncatechars(result['_source']['content'], 350)
                        
                    if item.has_key('image'):
                        item['image'] = '%s/job/thumbs/small/%s' % (settings.MEDIA_URL, item['image'])
                        
                if len(item):
                    data.append(item)
                
        return data
    
    def get_facets(self):
        facets = {}
        
        if self.results.total:
            for facet in self.results.facets:
                if self.results.facets[facet].has_key('terms'):
                    facets[facet] = self.results.facets[facet]['terms']

            if facets.has_key('company.facet'):
                for item in facets['company.facet']:
                    if self.query.params.has_key('company.facet') and len(self.query.params['company.facet'])\
                            and self.query.params['company.facet'] == item['term']:
                        item['url'] = self._get_url({'company.facet': '', 'page': 1})
                        item['active'] = True
                    else:
                        item['url'] = self._get_url({'company.facet': item['term'], 'page': 1})
                                
        return facets

    def list_pages(self):
        if self.results.total <= 0:
            return []
            
        pages = divmod(self.results.total, settings.PAGINATION_PAGE_SIZE)
        pages = pages[0] + 1 if pages[1] > 0 else pages[0]

        paginator = Pagination(self.query.params['page'], settings.PAGINATION_PAGE_SIZE, self.results.total)
        iterator = paginator.iter_pages(
            left_current  = settings.PAGINATION_CURRENT_LEFT, 
            right_current = settings.PAGINATION_CURRENT_RIGHT,
            left_edge     = settings.PAGINATION_EDGE_LEFT,
            right_edge    = settings.PAGINATION_EDGE_RIGHT
        )

        return [{
            'page': page, 
            'url': self._get_url({'page': page}),
            'selected': self.query.params['page'] == page 
        } for page in iterator]

    def close(self):
        self.elastic.connection.close()
        
    def _get_geo_filters(self, filters=None):
        # avoid sharing a mutable default list between calls
        filters = filters if filters is not None else []
        for geo in self.query.get_geoquery():
            filters.append(GeoDistanceFilter(
                'pin.location', 
                { 'lat' : geo[0], 'lon': geo[1] }, 
                settings.APP_GEO_CITIES_RANGE
            ))
        
        return filters
    
    def _get_url(self, data):
        params = self.query.params.copy()
        params.update(data)        

        url = ''
        
        for key, value in params.items():
            url += '%s=%s&' % (key, urllib2.quote(unicode(value).encode('utf8')))
            
        return url[:-1]
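
# Minimal usage sketch for the ElasticSearch wrapper above (assumptions: Django
# settings such as SEARCH_HOSTS, SEARCH_WHAT_FIELDS and PAGINATION_* are
# configured, and the query string format matches what QueryParser expects).
searcher = ElasticSearch('python developer')
searcher.search(index_type='job')
jobs = searcher.get_results()     # list of job dicts with redirect/thumbnail urls
facets = searcher.get_facets()    # company facet terms, each with a filter url
pages = searcher.list_pages()     # pagination entries for the current result set
searcher.close()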
from pyes import ES

es = ES()
index_name = "my_index"
type_name = "my_type"

from utils_pyes import create_and_add_mapping, populate

create_and_add_mapping(es, index_name, type_name)
populate(es, index_name, type_name)

from pyes.query import *
from pyes.filters import *

results = es.search(MatchAllQuery(), indices=index_name, doc_types=type_name)

print "total:", results.total
for r in results:
    print r

print "first element: ", results[0]
print "slice elements: ", results[1:4]

results = es.search(TermQuery("name", "joe", 3), indices=index_name, doc_types=type_name)

q1 = TermFilter("position", 1)
q2 = TermFilter("position", 2)
orq = ORFilter([q1, q2])
q = FilteredQuery(MatchAllQuery(), orq)

results = es.search(q, indices=index_name, doc_types=type_name)
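
# Pagination and sorting can be layered onto the same filtered query by wrapping
# it in a Search object, as the job-search example above does; the sort field
# "position" and the page size here are illustrative.
from pyes.query import Search

paged = Search(q, start=0, size=5, sort={"position": {"order": "asc"}})
results = es.search(paged, indices=index_name, doc_types=type_name)
print "total matches:", results.total
print "first page:", results[0:5]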
import json
from pyes import ES
from pyes.query import MatchAllQuery, Search
from pyes.aggs import TermsAgg  # aggregation classes live in pyes.aggs in recent releases

ftool = FileTools()
ftrans = FormatTranslator()

# 1. Create Connection
conn = ES()
  
# 2. Index Data
with open("../dataset.json") as dataset_json:
    dataset = json.load(dataset_json)['data']
for i, data in enumerate(dataset):
    conn.index(data, "example_index", "example_type", "example_id_" + str(i))
      
# 3. Create Simple Query
query = MatchAllQuery()
  
# 4. Create Simple Aggregation
agg = TermsAgg('agg1', field="name", sub_aggs=[], size=100)
  
# 5. Get Result
search = Search(query, size=5)
search.agg.add(agg)
print search.serialize()

result = conn.search(search, "example_index", "example_type")
  
for i in result:
    print json.dumps(i, indent=2)
print json.dumps(result.aggs, indent=2)

# force a fresh fetch and dump the raw response (private pyes attributes)
result._do_search()
print json.dumps(result._results, indent=2)