Example #1
def dump(start,end,backupdir,eshost):
    conn = ES(eshost)
    out = file('/tmp/out.json','w')
    _type = 'habakkuk'
    q = MatchAllQuery()
    q = FilteredQuery(q, RangeFilter(qrange=ESRange('created_at_date',start,end,include_upper=False)))
    q = q.search()
    # print json.dumps(json.loads(q.to_search_json()),indent=2)
    resultset = conn.search(query=q,indices=_type+"-*", doc_types=[_type], scan=True)
    cnt=0
    if not resultset.total:
        sys.stderr.write("no data for %s - %s\n"%(start,end))
        return

    try:
        sys.stderr.write("Will write %d lines to %s\n"%(resultset.total, out.name))
        while True:
            r = resultset.next()
            cnt+=1
            out.write(json.dumps(r)+'\n')
    except StopIteration:
        pass

    out.close()

    # gzip
    ext = datetime.strftime(start,'%Y-%m-%d')
    backup = os.path.join(backupdir,"habakkuk-%s.json.gz"%ext)

    f_in = open(out.name,'rb')
    f_out = gzip.open(backup,'wb')
    f_out.writelines(f_in)
    f_out.close()
    f_in.close()
    sys.stderr.write("Created %s\n"%backup)
Example #2
 def __init__(self, server, settings=None):
     self.conn = ES(server)
     self.indices = {}
     if settings:
         self.settings = settings
     else:
         self.settings = {
             'index': {
                 'analysis': {
                     'analyzer': {
                         'ngram_analyzer': {
                             'tokenizer': 'keyword',
                             'filter': ['lowercase', 'filter_ngram'],
                             'type': 'custom'
                         }
                     },
                     'filter': {
                         'filter_ngram': {
                             'type': 'nGram',
                             'max_gram': 30,
                             'min_gram': 1
                         }
                     }
                 }
             }
         }
Example #3
    def get_entries(self):
        '''Get all entries for a team + their filter from ES/MozDef'''
        teamfilter = self.config['teamsetup'][self.team]['filter']
        es = ES((self.config['mozdef']['proto'], self.config['mozdef']['host'], self.config['mozdef']['port']))

        # Default filter - time period
        try:
            td = self.config['es'][teamfilter]['_time_period']
        except KeyError:
            debug('No _time_period defined, defaulting to 24h')
            td = 24
        begindateUTC = toUTC(datetime.now() - timedelta(hours=td))
        enddateUTC = toUTC(datetime.now())
        print begindateUTC, enddateUTC
        fDate = pyes.RangeQuery(qrange=pyes.ESRange('utctimestamp', from_value=begindateUTC, to_value=enddateUTC))

        # Load team queries from our json config.
        # List values are "should" clauses, unless an item is negated with "!", which makes it "must_not".
        # Single values are "must" clauses (see the sample config sketch after this function).
        query = pyes.query.BoolQuery()
        query.add_must(pyes.QueryStringQuery('asset.autogroup: "{}"'.format(self.team)))
        for item in self.config['es'][teamfilter]:
            # items starting with '_' are internal/reserved, like _time_period
            if (item.startswith('_')):
                continue
            val = self.config['es'][teamfilter][item]
            if (type(val) == list):
                for v in val:
                    if (v.startswith("!")):
                        query.add_must_not(pyes.MatchQuery(item, v[1:]))
                    else:
                        query.add_should(pyes.MatchQuery(item, v))
            else:
                if (val.startswith("!")):
                    query.add_must_not(pyes.MatchQuery(item, val))
                else:
                    query.add_must(pyes.MatchQuery(item, val))


        q = pyes.ConstantScoreQuery(query)
        q = pyes.FilteredQuery(q, pyes.BoolFilter(must=[fDate]))

        results = es.search(query=q, indices=self.config['es']['index'])

        raw = results._search_raw(0, results.count())
        # This doesn't do much, but pyes has no "close()" or similar functionality.
        es.force_bulk()

        if (raw._shards.failed != 0):
            raise Exception("Some shards failed! {0}".format(raw._shards.__str__()))

        # Nobody cares for the metadata past this point (all the goodies are in '_source')
        data = []
        for i in raw.hits.hits:
            data += [i._source]
        return data
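
A hypothetical config fragment matching the conventions described in get_entries() above (all names and values are illustrative, not taken from MozDef):

config = {
    'mozdef': {'proto': 'http', 'host': 'localhost', 'port': 9200},
    'teamsetup': {'opsec': {'filter': 'opsec_filter'}},
    'es': {
        'index': 'events-*',
        'opsec_filter': {
            '_time_period': 48,                           # reserved key: search window in hours
            'category': 'syslog',                         # single value -> add_must
            'hostname': ['bastion1', '!decommissioned'],  # list -> add_should, "!" -> add_must_not
        },
    },
}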
Example #4
    def initialize(self):
        """Initialize ElasticSearch source stream:
        """
        args = self.elasticsearch_args.copy()
        server = ""
        if self.host:
            server = self.host
        if self.port:
            server += ":" + self.port

        self.connection = ES(server, **args)
        self.connection.default_indices = self.index
        self.connection.default_types = self.document_type
Example #5
class ElasticSearchTestCase(unittest.TestCase):

    def search(self, query):
        return self.conn.search_raw(query, 'test-index')

    def setUp(self):
        self.conn = ElasticSearch('http://localhost:9200/')

    def tearDown(self):
        self.conn.delete_index("test-index")

    def assertResultContains(self, result, expected):
        for (key, value) in expected.items():
            self.assertEquals(value, result[key])
Example #6
 def test_servers(self):
     es = ES("127.0.0.1:9200")
     self.assertEquals(es.servers, [("http", "127.0.0.1", 9200)])
     es = ES("127.0.0.1:9500")
     self.assertEquals(es.servers, [("thrift", "127.0.0.1", 9500)])
     es = ES("http://127.0.0.1:9400")
     self.assertEquals(es.servers, [("http", "127.0.0.1", 9400)])
     es = ES("thrift://127.0.0.1:9100")
     self.assertEquals(es.servers, [("thrift", "127.0.0.1", 9100)])
     es = ES([
         "thrift://127.0.0.1:9100", "127.0.0.1:9200",
         ("thrift", "127.0.0.1", 9000)
     ])
     self.assertEquals(sorted(es.servers), [("http", "127.0.0.1", 9200),
                                            ("thrift", "127.0.0.1", 9000),
                                            ("thrift", "127.0.0.1", 9100)])
Example #7
 def __init__(self,server,settings = None ):
     self.conn = ES(server)
     self.indices = {}
     if settings:
         self.settings = settings
     else:
         self.settings = { 
             'index': {
                 'analysis' : {
                     'analyzer' : {                             
                         'ngram_analyzer' : {                   
                             'tokenizer' : 'keyword',
                             'filter' : ['lowercase', 'filter_ngram'],
                             'type' : 'custom'
                         }  
                     },
                     'filter' : {
                         'filter_ngram' : {                                 
                             'type' : 'nGram',
                             'max_gram' : 30,
                             'min_gram' : 1                                 
                         }                           
                     }
                 }
             }
         }
Example #8
def connect_to_db():
    eshosts = settings.ES_SETTINGS['ES_HOSTS']
    index = settings.ES_SETTINGS['INDEX']
    timeout = settings.ES_SETTINGS.get('TIMEOUT', 60.0)

    # build and return the ES connection
    return ES(eshosts, timeout=timeout)
Example #9
 def __init__(self, server, settings=None):
     # These timeout and bulk_size parameters were determined through
     # trial and error to be necessary to avoid timeout errors when
     # generating indices on Sandbox. They should not be taken as gospel.
     self.conn = ES(server, timeout=120.0)  # Default timeout: 30.0
     self.conn.bulker.bulk_size = 25  # Default: 400
     if settings:
         self.settings = settings
     else:
         self.settings = {
             'index': {
                 'analysis': {
                     'analyzer': {
                         'ngram_analyzer': {
                             'tokenizer': 'keyword',
                             'filter': ['lowercase', 'filter_ngram'],
                             'type': 'custom'
                         }
                     },
                     'filter': {
                         'filter_ngram': {
                             'type': 'nGram',
                             'max_gram': 30,
                             'min_gram': 1
                         }
                     }
                 }
             }
         }
     self.refresh_index_cache()
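
Given the smaller bulk_size configured above, a bulk-loading loop flushes to ElasticSearch every 25 queued documents. A rough sketch (index and type names are illustrative):

fts = FullTextSearch("127.0.0.1:9200")
for n in range(1000):
    # queued locally by pyes; a bulk request is sent every bulk_size (25) documents
    fts.conn.index({"n": n}, "my_index", "my_type", bulk=True)
fts.conn.flush_bulk(forced=True)  # push whatever is still queued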
Example #10
    def initialize(self):
        """
        Initialize ElasticSearch source stream:
        """
        from pyes.es import ES
        from pyes.exceptions import IndexAlreadyExistsException

        args = self.elasticsearch_args.copy()
        server = ""
        if self.host:
            server = self.host
        if self.port:
            server += ":" + self.port

        create = args.pop("create", False)
        replace = args.pop("replace", False)

        self.connection = ES(server, **args)
        self.connection.default_indices = self.index
        self.connection.default_types = self.document_type

        created = False
        if create:
            try:
                self.connection.create_index(self.index)
                self.connection.refresh(self.index)
                created = True
            except IndexAlreadyExistsException:
                pass

        if replace and not created:
            self.connection.delete_index_if_exists(self.index)
            self.connection.refresh(self.index)
            self.connection.create_index(self.index)
            self.connection.refresh(self.index)

        if self.truncate:
            self.connection.delete_mapping(self.index, self.document_type)
            self.connection.refresh(self.index)
        #check mapping
        try:
            self.connection.get_mapping(self.document_type, self.index)
        except TypeMissingException:
            self.connection.put_mapping(self.document_type,
                                        self._get_mapping(), self.index)
Example #11
 def test_servers(self):
     geturls = lambda servers: [server.geturl() for server in servers]
     es = ES("127.0.0.1:9200")
     self.assertEqual(geturls(es.servers), ["http://127.0.0.1:9200"])
     es = ES("127.0.0.1:9500")
     self.assertEqual(geturls(es.servers), ["thrift://127.0.0.1:9500"])
     es = ES(("http", "127.0.0.1", 9400))
     self.assertEqual(geturls(es.servers), ["http://127.0.0.1:9400"])
     es = ES(("thrift", "127.0.0.1", 9100))
     self.assertEqual(geturls(es.servers), ["thrift://127.0.0.1:9100"])
     es = ES([
         "http://127.0.0.1:9100",
         "127.0.0.1:9200",
         ("thrift", "127.0.0.1", 9000),
         "127.0.0.1:9500",
     ])
     self.assertEqual(geturls(sorted(es.servers)), [
         "http://127.0.0.1:9100", "http://127.0.0.1:9200",
         "thrift://127.0.0.1:9000", "thrift://127.0.0.1:9500"
     ])
Example #12
    def initialize(self):
        """Initialize ElasticSearch source stream:
        """
        args = self.elasticsearch_args.copy()
        server = ""
        if self.host:
            server = self.host
        if self.port:
            server += ":" + self.port

        self.connection = ES(server, **args)
        self.connection.default_indices = self.database_name
        self.connection.default_types = self.document_type
Example #13
    def setUp(self):
        """reads the "real" elasticsearch settings 
        from SOURCE/elasticsearch/settings.json and uses it to configure
        an index for the unittests"""
        self.es_settings = {'ES_HOSTS':['localhost:9200',],
                            'INDEX':"unittest-binarypig",
                            'FACET_SIZE':999999}
        query.settings.ES_SETTINGS = self.es_settings

        index_template_fn = os.path.join(settings.SOURCE_ROOT, 
                                         'elasticsearch', 
                                         'settings.json')
        self.index_settings = json.loads(file(index_template_fn).read())
        conn = ES(self.es_settings['ES_HOSTS'])
        self.createIndex(conn)
Example #14
def dump_topics(backupdir,
                eshost,
                _type,
                indices="topics-all"):
    conn = ES(eshost)
    out = file('/tmp/out.json','w')
    q = MatchAllQuery()
    q = q.search()

    resultset = conn.search(query=q,indices=indices, doc_types=[_type], scan=True)
    cnt=0
    if not resultset.total:
        sys.stderr.write("no data\n")
        return

    try:
        sys.stderr.write("Will write %d lines to %s\n"%(resultset.total, out.name))
        while True:
            r = resultset.next()
            r['_id'] = r._meta.id
            cnt+=1
            out.write(json.dumps(r)+'\n')
    except StopIteration:
        pass

    out.close()

    # gzip
    backup = os.path.join(backupdir,"topics.{}.json.gz".format(_type))

    f_in = open(out.name,'rb')
    f_out = gzip.open(backup,'wb')
    f_out.writelines(f_in)
    f_out.close()
    f_in.close()
    sys.stderr.write("Created %s\n"%backup)
Example #15
    def initialize(self):
        """
        Initialize ElasticSearch source stream:
        """
        from pyes.es import ES
        from pyes.exceptions import IndexAlreadyExistsException

        args = self.elasticsearch_args.copy()
        server = ""
        if self.host:
            server = self.host
        if self.port:
            server += ":" + self.port

        create = args.pop("create", False)
        replace = args.pop("replace", False)

        self.connection = ES(server, **args)
        self.connection.default_indices = self.index
        self.connection.default_types = self.document_type

        created = False
        if create:
            try:
                self.connection.create_index(self.index)
                self.connection.refresh(self.index)
                created = True
            except IndexAlreadyExistsException:
                pass

        if replace and not created:
            self.connection.delete_index_if_exists(self.index)
            self.connection.refresh(self.index)
            self.connection.create_index(self.index)
            self.connection.refresh(self.index)

        if self.truncate:
            self.connection.delete_mapping(self.index, self.document_type)
            self.connection.refresh(self.index)
        #check mapping
        try:
            self.connection.get_mapping(self.document_type, self.index)
        except TypeMissingException:
            self.connection.put_mapping(self.document_type, self._get_mapping(), self.index)
Example #16
 def setUp(self):
     self.conn = ElasticSearch('http://localhost:9200/')
Example #17
 def tearDown(self):
     """delete the index"""
     conn = ES(self.es_settings['ES_HOSTS'])
     self.deleteIndex(conn)
Example #18
File: run.py Project: mlys/gsapi
from utils import myyaml

app = Flask(__name__)

# ElasticSearch
escfg = Config.ES
# Wanna make sure test db is used if /test/ in url
try:
    if '/test/' in request.url:
        Config.TESTING = True
        escfg = Config.ES_TEST
except:
    pass

# ElasticSearch
es = ES(("http", escfg['host'], escfg['port']))
es.__dict__['index_name'] = escfg['name']
app.es = es

app.config.from_object(Config)

mongo     = PyMongo()
app.mongo = mongo
mongo.init_app(app)

# add regex for routing
app.url_map.converters['regex'] = RegexConverter

##################### GET SEARCH
@app.route( '/es', methods=['GET'])
def es():
Example #19
class ESDataTarget(DataTarget):
    """docstring for ClassName
    """

    def __init__(self, document_type, index="test", host="127.0.0.1", port="9200", truncate=False, expand=False,
                 **elasticsearch_args):
        """Creates a ElasticSearch data target stream.

        :Attributes:
            * document_type: ElasticSearch document_type name
            * index: database name
            * host: ElasticSearch database server host, default is ``localhost``
            * port: ElasticSearch port, default is ``9200``
            * expand: expand dictionary values and treat children as top-level keys with dot '.'
                separated key path to the child.
            * truncate: delete existing data in the document_type. Default: False
        """
        super(ESDataTarget, self).__init__()
        self.document_type = document_type
        self.index = index
        self.host = host
        self.port = port
        self.elasticsearch_args = elasticsearch_args
        self.expand = expand
        self.truncate = truncate
        self._fields = None

    def initialize(self):
        """
        Initialize ElasticSearch source stream:
        """
        from pyes.es import ES
        from pyes.exceptions import IndexAlreadyExistsException

        args = self.elasticsearch_args.copy()
        server = ""
        if self.host:
            server = self.host
        if self.port:
            server += ":" + self.port

        create = args.pop("create", False)
        replace = args.pop("replace", False)

        self.connection = ES(server, **args)
        self.connection.default_indices = self.index
        self.connection.default_types = self.document_type

        created = False
        if create:
            try:
                self.connection.create_index(self.index)
                self.connection.refresh(self.index)
                created = True
            except IndexAlreadyExistsException:
                pass

        if replace and not created:
            self.connection.delete_index_if_exists(self.index)
            self.connection.refresh(self.index)
            self.connection.create_index(self.index)
            self.connection.refresh(self.index)

        if self.truncate:
            self.connection.delete_mapping(self.index, self.document_type)
            self.connection.refresh(self.index)
        #check mapping
        try:
            self.connection.get_mapping(self.document_type, self.index)
        except TypeMissingException:
            self.connection.put_mapping(self.document_type, self._get_mapping(), self.index)

    def _get_mapping(self):
        """Build an ES optimized mapping for the given fields"""
        from pyes.mappings import DocumentObjectField, IntegerField, StringField, BooleanField, FloatField, DateField

        document = DocumentObjectField(name=self.document_type)
        for field in self.fields:
            st = field.storage_type
            if st == "unknown":
                #lets es detect the type
                continue
            elif st in ["string", "text"]:
                document.add_property(StringField(name=field.name))
            elif st == "integer":
                document.add_property(IntegerField(name=field.name))
            elif st == "boolean":
                document.add_property(BooleanField(name=field.name))
            elif st == "date":
                document.add_property(DateField(name=field.name))
            elif st == "float":
                document.add_property(FloatField(name=field.name))

        return document


    def append(self, obj):
        record = obj
        if not isinstance(obj, dict):
            record = dict(zip(self.field_names, obj))

        if self.expand:
            record = expand_record(record)

        id = record.get('id') or record.get('_id')
        self.connection.index(record, self.index, self.document_type, id, bulk=True)

    def finalize(self):
        self.connection.flush_bulk(forced=True)
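
A rough usage sketch for the ESDataTarget above, assuming the target index and mapping already exist so that initialize() only has to connect (names and values are illustrative):

target = ESDataTarget("product", index="catalog", host="127.0.0.1", port="9200")
target.initialize()
target.append({"id": "1", "name": "bolt"})  # queued as a pyes bulk operation
target.finalize()                           # flush_bulk() sends the pending batch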
Example #20
def get_conn(*args, **kwargs):
    return ES(("http", "127.0.0.1", 9200), *args, **kwargs)
Example #21
class FullTextSearch(object):
    def __init__(self, server, settings=None):
        self.conn = ES(server)
        if settings:
            self.settings = settings
        else:
            self.settings = {
                'index': {
                    'analysis': {
                        'analyzer': {
                            'ngram_analyzer': {
                                'tokenizer': 'keyword',
                                'filter': ['lowercase', 'filter_ngram'],
                                'type': 'custom'
                            }
                        },
                        'filter': {
                            'filter_ngram': {
                                'type': 'nGram',
                                'max_gram': 30,
                                'min_gram': 1
                            }
                        }
                    }
                }
            }
        self.refresh_index_cache()

    def search_index_text(self, query_string, fields="_all", **args):
        q = query.TextQuery(fields, query_string)
        return self.search_index(q, **args)

    def search_index(self, query, indices=None, num_results=None, node_type=None):
        results = self.conn.search(
            query=query, indices=indices, doc_types=node_type)
        meta_list = [r.get_meta() for r in results[0:num_results]]
        node_dict = {}

        # fetch nodes grouped by type to reduce number of db calls
        key = itemgetter('type')
        for t, grouped_list in groupby(sorted(meta_list, key=key), key=key):
            ids = [meta['id'] for meta in grouped_list]
            for node in self.datastore.get_nodes(t, ids):
                node_dict[(node.type, node.key)] = node

        # return nodes in original order
        nodelist = [node_dict[(meta['type'], meta['id'])]
                    for meta in meta_list]

        return nodelist

    def create_index(self, type, indexed_variables, index_name):
        self.conn.create_index_if_missing(index_name, self.settings)
        mapping = {}
        for arg in indexed_variables:
            mapping[arg] = {'boost': 1.0,
                            'analyzer': 'ngram_analyzer',
                            'type': 'string',
                            'term_vector': 'with_positions_offsets'}
        index_settings = {'index_analyzer': 'ngram_analyzer',
                          'search_analyzer': 'standard',
                          'properties': mapping}
        self.conn.put_mapping(str(type), index_settings, [index_name])
        self.refresh_index_cache()
        self.populate_index(type, index_name)

    def refresh_index_cache(self):
        try:
            self.indices = self.conn.get_mapping()
        except exceptions.IndexMissingException:
            self.indices = {}

    def delete_index(self, index_name):
        self.conn.delete_index_if_exists(index_name)
        self.refresh_index_cache()

    def populate_index(self, type, index_name):
        #add all the currently existing nodes into the index
        ref_node = self.datastore.get_reference_node(type)
        node_list = [rel.target_node for rel in ref_node.instance.outgoing]

        for node in node_list:
            key = node.key
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.delete(index_name, type, key)
            except exceptions.NotFoundException:
                pass
            self.conn.index(index_dict, index_name, type, key)
        self.conn.refresh([index_name])

    def on_create(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            index_dict = self.populate_index_document(node, index_name)
            self.conn.index(index_dict, index_name, node.type, node.key)
            self.conn.refresh([index_name])

    def on_delete(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            try:
                self.conn.delete(index_name, node.type, node.key)
                self.conn.refresh([index_name])
            except exceptions.NotFoundException:
                pass

    def on_modify(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.delete(index_name, node.type, node.key)
                self.conn.index(index_dict, index_name, node.type, node.key)
                self.conn.refresh([index_name])
            except exceptions.NotFoundException:
                pass

    def get_indices_of_type(self, type):
        type_indices = [
            key for key, value in self.indices.items()
            if type in value
        ]
        return type_indices

    def populate_index_document(self, node, index_name):
        indexed_variables = self.indices[index_name][node.type]['properties'].keys()
        index_dict = {
            field: node[field] for field in indexed_variables
        }
        return index_dict
Example #22
'''
Created on Jan 13, 2013

@author: Fang Jiaguo
'''
from pyes.es import ES
from pymongo.connection import Connection
import json

settings = json.load(open('settings.json', 'r'))
mongodb = Connection(settings['mongodb']['host'],
                     settings['mongodb']['port'])[settings['mongodb']['db']]
elasticsearch = ES(('http', settings['elasticsearch']['host'],
                    settings['elasticsearch']['port']))
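
The keys read above imply a settings.json roughly like the following (placeholder values):

{
    "mongodb": {"host": "127.0.0.1", "port": 27017, "db": "appdata"},
    "elasticsearch": {"host": "127.0.0.1", "port": 9200}
}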
Example #23
from pyes.es import ES
import random
import datetime

now = datetime.datetime.now()
es = ES()
for i in range(1000):
    d = {'created_at': now - datetime.timedelta(seconds=i),
            'level': 'ERROR' if random.random() < 0.1 else 'WARN',
            'message': "Test message"}
    es.index(d, 'my_index', 'my_type')
Example #24
class FullTextSearch(object):
    def __init__(self,server,settings = None ):
        self.conn = ES(server)
        self.indices = {}
        if settings:
            self.settings = settings
        else:
            self.settings = { 
                'index': {
                    'analysis' : {
                        'analyzer' : {                             
                            'ngram_analyzer' : {                   
                                'tokenizer' : 'keyword',
                                'filter' : ['lowercase', 'filter_ngram'],
                                'type' : 'custom'
                            }  
                        },
                        'filter' : {
                            'filter_ngram' : {                                 
                                'type' : 'nGram',
                                'max_gram' : 30,
                                'min_gram' : 1                                 
                            }                           
                        }
                    }
                }
            }

    def search_index(self, type, index_names, query_string, num_results=-1):
        ns_index_names= [str(type) + "-_-" + index_name for index_name in index_names]
        q = WildcardQuery('_all',lower(query_string))
        results = self.conn.search(query=q, indices=ns_index_names, doc_types=type)
        num_found = len(results)
        if(num_results > num_found):
            num_results = num_found
        nodelist = [self.datastore.get_node(type,r['_id']) for r in results['hits']['hits']]
        if(num_results!=-1):
            return nodelist[0:num_results]
        else:
            return nodelist

    def create_index(self, type, indexed_variables, index_name):
        ns_index_name = str(type) + "-_-" + index_name
        self.conn.delete_index_if_exists(ns_index_name)
        self.conn.create_index(ns_index_name,self.settings)
        mapping = {}
        for arg in indexed_variables:
            mapping[arg] = {'boost':1.0,
                            'analyzer' : 'ngram_analyzer',
                            'type': u'string',
                            'term_vector': 'with_positions_offsets'}
        index_settings = {'index_analyzer':'ngram_analyzer',
                          'search_analyzer':'standard',
                          'properties':mapping}
        self.conn.put_mapping(str(type),index_settings,[ns_index_name])
        self.refresh_index_cache()
        self.populate_index(type, index_name)

    def refresh_index_cache(self):
        self.indices = self.conn.get_indices()

    def delete_index(self,type,index_name):
        ns_index_name = str(type) + "-_-" + index_name
        self.conn.delete_index_if_exists(ns_index_name)
        self.refresh_index_cache()

    def populate_index(self, type, index_name):
        #add all the currently existing nodes into the index
        ns_index_name = str(type) + "-_-" + index_name
        ref_node = self.datastore.get_reference_node(type)
        node_list = [rel.target_node for rel in ref_node.instance.outgoing]
        mapping = self.conn.get_mapping(type,ns_index_name)
        for node in node_list:
            key = node.key
            index_dict = self.populate_index_document(type,ns_index_name,node.attributes,mapping)
            try:
                self.conn.delete(ns_index_name,type,key)
            except exceptions.NotFoundException:
                pass
            try: 
                self.conn.index(index_dict,ns_index_name,type,key)
            except exceptions.ElasticSearchParseException:
                pass
        self.conn.refresh([ns_index_name])

    def on_create(self,node):
        type_indices = self.get_indices_of_type(node.type)
        for ns_index_name in type_indices:
            mapping = self.conn.get_mapping(node.type,ns_index_name)
            index_dict = self.populate_index_document(node.type,ns_index_name,node.attributes,mapping)
            self.conn.index(index_dict,ns_index_name,node.type,node.key)
            self.conn.refresh([ns_index_name])

    def on_delete(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for ns_index_name in type_indices:
            try:
                self.conn.delete(ns_index_name,node.type,node.key)
                self.conn.refresh([ns_index_name])
            except exceptions.NotFoundException:
                pass
           
    def on_modify(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for ns_index_name in type_indices:
            mapping = self.conn.get_mapping(node.type,ns_index_name)
            index_dict = self.populate_index_document(node.type,ns_index_name,node.attributes,mapping)
            try:
                self.conn.delete(ns_index_name,node.type,node.key)
                self.conn.index(index_dict,ns_index_name,node.type,node.key)
                self.conn.refresh([ns_index_name])
            except exceptions.NotFoundException:
                pass

    def get_indices_of_type(self,type):
        type_indices = []
        for index in self.indices.keys():
            if index.startswith(type+"-_-"):
                type_indices.append(index)
        return type_indices

    def populate_index_document(self,type,ns_index_name,attributes,mapping):
        indexed_variables = mapping[type]['properties'].keys()
        index_dict = {}
        for arg in indexed_variables:
            try:
                index_dict[arg] = attributes[arg]
            except KeyError:
                #if this attribute doesn't exist for this node, just pass
                pass
        return index_dict
Example #25
class FullTextSearch(object):
    def __init__(self, server, settings=None):
        self.conn = ES(server)
        self.indices = {}
        if settings:
            self.settings = settings
        else:
            self.settings = {
                'index': {
                    'analysis': {
                        'analyzer': {
                            'ngram_analyzer': {
                                'tokenizer': 'keyword',
                                'filter': ['lowercase', 'filter_ngram'],
                                'type': 'custom'
                            }
                        },
                        'filter': {
                            'filter_ngram': {
                                'type': 'nGram',
                                'max_gram': 30,
                                'min_gram': 1
                            }
                        }
                    }
                }
            }

    def search_index(self, type, index_names, query_string, num_results=-1):
        ns_index_names = [
            str(type) + "-_-" + index_name for index_name in index_names
        ]
        q = WildcardQuery('_all', lower(query_string))
        results = self.conn.search(query=q,
                                   indices=ns_index_names,
                                   doc_types=type)
        num_found = len(results)
        if (num_results > num_found):
            num_results = num_found
        nodelist = [
            self.datastore.get_node(type, r['_id'])
            for r in results['hits']['hits']
        ]
        if (num_results != -1):
            return nodelist[0:num_results]
        else:
            return nodelist

    def create_index(self, type, indexed_variables, index_name):
        ns_index_name = str(type) + "-_-" + index_name
        self.conn.delete_index_if_exists(ns_index_name)
        self.conn.create_index(ns_index_name, self.settings)
        mapping = {}
        for arg in indexed_variables:
            mapping[arg] = {
                'boost': 1.0,
                'analyzer': 'ngram_analyzer',
                'type': u'string',
                'term_vector': 'with_positions_offsets'
            }
        index_settings = {
            'index_analyzer': 'ngram_analyzer',
            'search_analyzer': 'standard',
            'properties': mapping
        }
        self.conn.put_mapping(str(type), index_settings, [ns_index_name])
        self.refresh_index_cache()
        self.populate_index(type, index_name)

    def refresh_index_cache(self):
        self.indices = self.conn.get_indices()

    def delete_index(self, type, index_name):
        ns_index_name = str(type) + "-_-" + index_name
        self.conn.delete_index_if_exists(ns_index_name)
        self.refresh_index_cache()

    def populate_index(self, type, index_name):
        #add all the currently existing nodes into the index
        ns_index_name = str(type) + "-_-" + index_name
        ref_node = self.datastore.get_reference_node(type)
        node_list = [rel.target_node for rel in ref_node.instance.outgoing]
        mapping = self.conn.get_mapping(type, ns_index_name)
        for node in node_list:
            key = node.key
            index_dict = self.populate_index_document(type, ns_index_name,
                                                      node.attributes, mapping)
            try:
                self.conn.delete(ns_index_name, type, key)
            except exceptions.NotFoundException:
                pass
            try:
                self.conn.index(index_dict, ns_index_name, type, key)
            except exceptions.ElasticSearchParseException:
                pass
        self.conn.refresh([ns_index_name])

    def on_create(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for ns_index_name in type_indices:
            mapping = self.conn.get_mapping(node.type, ns_index_name)
            index_dict = self.populate_index_document(node.type, ns_index_name,
                                                      node.attributes, mapping)
            self.conn.index(index_dict, ns_index_name, node.type, node.key)
            self.conn.refresh([ns_index_name])

    def on_delete(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for ns_index_name in type_indices:
            try:
                self.conn.delete(ns_index_name, node.type, node.key)
                self.conn.refresh([ns_index_name])
            except exceptions.NotFoundException:
                pass

    def on_modify(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for ns_index_name in type_indices:
            mapping = self.conn.get_mapping(node.type, ns_index_name)
            index_dict = self.populate_index_document(node.type, ns_index_name,
                                                      node.attributes, mapping)
            try:
                self.conn.delete(ns_index_name, node.type, node.key)
                self.conn.index(index_dict, ns_index_name, node.type, node.key)
                self.conn.refresh([ns_index_name])
            except exceptions.NotFoundException:
                pass

    def get_indices_of_type(self, type):
        type_indices = []
        for index in self.indices.keys():
            if index.startswith(type + "-_-"):
                type_indices.append(index)
        return type_indices

    def populate_index_document(self, type, ns_index_name, attributes,
                                mapping):
        indexed_variables = mapping[type]['properties'].keys()
        index_dict = {}
        for arg in indexed_variables:
            try:
                index_dict[arg] = attributes[arg]
            except KeyError:
                #if this attribute doesn't exist for this node, just pass
                pass
        return index_dict
Example #26
    database = 'processed'
    resource_collection = mcm.get_collection(database, 'resources', Resource)

    # message queue
    mq_config = {
        'transport': 'socket',
        'protocol': 'binary',
        'host': 'localhost',
        'port': 9091
        }
    mq_client = message_queue_client_from_config(mq_config)
    mq_codec = JSONCodec()
    processed_resource_queue = 'processed_resources'

    # ElasticSearch
    es = ES('localhost:9200', timeout=60)
    es_index = 'topic_tracking'

    # dequeue one resource
    mq_client.connect()
    message = mq_client.get_message(processed_resource_queue)
    resource = mq_codec.decode(message.body, Resource)
    mq_client.delete_message(processed_resource_queue, message.id)
    mq_client.disconnect()

    # save the resource to mongo
    resource._id = makeIdFromURI(resource.uri)
    resource_collection.insert_model(resource)

    # index the resource
    for boost in [1, 1000]:
Example #27
class ESDataSource(DataSource):
    """
    docstring for ClassName
    """
    def __init__(self,
                 document_type,
                 index=None,
                 host=None,
                 port=None,
                 expand=False,
                 **elasticsearch_args):
        """Creates a ElasticSearch data source stream.

        :Attributes:
            * document_type: elasticsearch document_type name
            * index: index name, default is test
            * host: elasticsearch database server host, default is ``localhost``
            * port: elasticsearch port, default is ``9200``
            * expand: expand dictionary values and treat children as top-level keys with dot '.'
                separated key path to the child.
        """
        super(ESDataSource, self).__init__()
        self.document_type = document_type
        self.index = index or "test"
        self.host = host or "127.0.0.1"
        self.port = port or "9200"
        self.elasticsearch_args = elasticsearch_args
        self.expand = expand
        self.connection = None
        self._fields = None

    def initialize(self):
        """Initialize ElasticSearch source stream:
        """
        args = self.elasticsearch_args.copy()
        server = ""
        if self.host:
            server = self.host
        if self.port:
            server += ":" + self.port

        self.connection = ES(server, **args)
        self.connection.default_indices = self.index
        self.connection.default_types = self.document_type

    def read_fields(self, limit=0, collapse=False):
        keys = []
        probes = {}

        def probe_record(record, parent=None):
            for key, value in record.items():
                if parent:
                    full_key = parent + "." + key
                else:
                    full_key = key

                if self.expand and type(value) == dict:
                    probe_record(value, full_key)
                    continue

                if not full_key in probes:
                    probe = FieldTypeProbe(full_key)
                    probes[full_key] = probe
                    keys.append(full_key)
                else:
                    probe = probes[full_key]
                probe.probe(value)

        for record in self.document_type.find(limit=limit):
            probe_record(record)

        fields = []

        for key in keys:
            probe = probes[key]
            field = Field(probe.field)

            storage_type = probe.unique_storage_type
            if not storage_type:
                field.storage_type = "unknown"
            elif storage_type == "unicode":
                field.storage_type = "string"
            else:
                field.storage_type = "unknown"
                field.concrete_storage_type = storage_type

            # FIXME: Set analytical type

            fields.append(field)

        self._fields = list(fields)
        return self._fields

    def rows(self):
        if not self.connection:
            raise RuntimeError("Stream is not initialized")
        from pyes.query import MatchAllQuery
        fields = self.field_names
        results = self.connection.search(MatchAllQuery(),
                                         search_type="scan",
                                         timeout="5m",
                                         size="200")
        return ESRowIterator(results, fields)

    def records(self):
        if not self.connection:
            raise RuntimeError("Stream is not initialized")
        from pyes.query import MatchAllQuery
        results = self.connection.search(MatchAllQuery(),
                                         search_type="scan",
                                         timeout="5m",
                                         size="200")
        return ESRecordIterator(results, self.expand)
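
A rough usage sketch for the ESDataSource above (index and type names are illustrative; records() streams every document via a scan search):

source = ESDataSource("my_type", index="my_index", host="127.0.0.1", port="9200")
source.initialize()
for record in source.records():
    print record  # Python 2 print, matching the surrounding examples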
Example #28
import sys

#sys.path.insert(0, "../")

#from pyes import ES
from pyes.es import ES
from datetime import datetime
import shelve
conn = ES('127.0.0.1:9500')
#conn = ES('192.168.2.50:9200')
try:
    conn.delete_index("test-index")
except:
    pass

dataset = shelve.open("samples.shelve")

mapping = {u'description': {'boost': 1.0,
                            'index': 'analyzed',
                            'store': 'yes',
                            'type': u'string',
                            "term_vector": "with_positions_offsets"
},
           u'name': {'boost': 1.0,
                     'index': 'analyzed',
                     'store': 'yes',
                     'type': u'string',
                     "term_vector": "with_positions_offsets"
           },
           u'age': {'store': 'yes',
                    'type': u'integer'},
Example #29
class ESDataSource(base.DataSource):
    """docstring for ClassName
    """
    def __init__(self, document_type, database=None, host=None, port=None,
                 expand=False, **elasticsearch_args):
        """Creates a ElasticSearch data source stream.

        :Attributes:
            * document_type: elasticsearch document_type name
            * database: database name
            * host: elasticsearch database server host, default is ``localhost``
            * port: elasticsearch port, default is ``9200``
            * expand: expand dictionary values and treat children as top-level keys with dot '.'
                separated key path to the child.
        """
        self.document_type = document_type
        self.database_name = database
        self.host = host
        self.port = port
        self.elasticsearch_args = elasticsearch_args
        self.expand = expand
        self.connection = None
        self._fields = None

    def initialize(self):
        """Initialize ElasticSearch source stream:
        """
        args = self.elasticsearch_args.copy()
        server = ""
        if self.host:
            server = self.host
        if self.port:
            server += ":" + self.port

        self.connection = ES(server, **args)
        self.connection.default_indices = self.database_name
        self.connection.default_types = self.document_type

    def read_fields(self, limit=0):
        keys = []
        probes = {}

        def probe_record(record, parent=None):
            for key, value in record.items():
                if parent:
                    full_key = parent + "." + key
                else:
                    full_key = key

                if self.expand and type(value) == dict:
                    probe_record(value, full_key)
                    continue

                if not full_key in probes:
                    probe = dq.FieldTypeProbe(full_key)
                    probes[full_key] = probe
                    keys.append(full_key)
                else:
                    probe = probes[full_key]
                probe.probe(value)

        for record in self.document_type.find(limit=limit):
            probe_record(record)

        fields = []

        for key in keys:
            probe = probes[key]
            field = base.Field(probe.field)

            storage_type = probe.unique_storage_type
            if not storage_type:
                field.storage_type = "unknown"
            elif storage_type == "unicode":
                field.storage_type = "string"
            else:
                field.storage_type = "unknown"
                field.concrete_storage_type = storage_type

            # FIXME: Set analytical type

            fields.append(field)

        self.fields = list(fields)
        return self.fields

    def rows(self):
        if not self.connection:
            raise RuntimeError("Stream is not initialized")
        from pyes.query import MatchAllQuery
        fields = self.fields.names()
        results = self.connection.search(MatchAllQuery(), search_type="scan", timeout="5m", size="200")
        return ESRowIterator(results, fields)

    def records(self):
        if not self.connection:
            raise RuntimeError("Stream is not initialized")
        from pyes.query import MatchAllQuery
        results = self.connection.search(MatchAllQuery(), search_type="scan", timeout="5m", size="200")
        return ESRecordIterator(results, self.expand)
Example #30
                'size': 2,
                'explain': explain
            }

        return query


if __name__ == '__main__':

    explain = True

    # MongoDB
    host = 'localhost'
    port = 27017
    mcm = MongoConnectionManager(host, port, MongoCodec())
    database = 'processed'
    resource_collection = mcm.get_collection(database, 'resources', Resource)

    # ElasticSearch
    es = ES('localhost:9200', timeout=60)
    es_index = 'topic_tracking'

    # find the same resource
    resource = resource_collection.find_one_model()
    query = build_query(resource, explain)
    result = es.search(query, es_index, 'resource')

    for r in result['hits']['hits']:
        pprint(r)
    print('Tested resource %s: %s' % (resource._id, resource.uri))
Example #31
class ESDataTarget(base.DataTarget):
    """docstring for ClassName
    """
    def __init__(self, document_type, database="test", host="127.0.0.1", port="9200",
                 truncate=False, expand=False, **elasticsearch_args):
        """Creates a ElasticSearch data target stream.

        :Attributes:
            * document_type: ElasticSearch document_type name
            * database: database name
            * host: ElasticSearch database server host, default is ``localhost``
            * port: ElasticSearch port, default is ``9200``
            * expand: expand dictionary values and treat children as top-level keys with dot '.'
                separated key path to the child.
            * truncate: delete existing data in the document_type. Default: False
        """
        self.document_type = document_type
        self.database_name = database
        self.host = host
        self.port = port
        self.elasticsearch_args = elasticsearch_args
        self.expand = expand
        self.truncate = truncate
        self._fields = None

    def initialize(self):
        """Initialize ElasticSearch source stream:
        """
        from pyes.es import ES
        from pyes.exceptions import IndexAlreadyExistsException

        args = self.elasticsearch_args.copy()
        server = ""
        if self.host:
            server = self.host
        if self.port:
            server += ":" + self.port

        create = args.pop("create", False)
        replace = args.pop("replace", False)

        self.connection = ES(server, **args)
        self.connection.default_indices = self.database_name
        self.connection.default_types = self.document_type

        created = False
        if create:
            try:
                self.connection.create_index(self.database_name)
                self.connection.refresh(self.database_name)
                created = True
            except IndexAlreadyExistsException:
                pass

        if replace and not created:
            self.connection.delete_index_if_exists(self.database_name)
            time.sleep(2)
            self.connection.create_index(self.database_name)
            self.connection.refresh(self.database_name)

        if self.truncate:
            self.connection.delete_mapping(self.database_name, self.document_type)
            self.connection.refresh(self.database_name)

    def append(self, obj):
        record = obj
        if not isinstance(obj, dict):
            record = dict(zip(self.fields.names(), obj))

        if self.expand:
            record = expand_record(record)

        id = record.get('id') or record.get('_id')
        self.connection.index(record, self.database_name, self.document_type, id, bulk=True)

    def finalize(self):
        self.connection.flush_bulk(forced=True)
Example #32
def _get_conn(*args, **kwargs):
    _conn = ES(settings.es_hosts, *args, **kwargs)
    _conn.default_indices = settings.es_index
    return _conn
Example #33
class FullTextSearch(object):
    def __init__(self, server, settings=None):
        # These timeout and bulk_size parameters were determined through
        # trial and error to be necessary to avoid timeout errors when
        # generating indices on Sandbox. They should not be taken as gospel.
        self.conn = ES(server, timeout=120.0)  # Default timeout: 30.0
        self.conn.bulker.bulk_size = 25  # Default: 400
        if settings:
            self.settings = settings
        else:
            self.settings = {
                'index': {
                    'analysis': {
                        'analyzer': {
                            'ngram_analyzer': {
                                'tokenizer': 'keyword',
                                'filter': ['lowercase', 'filter_ngram'],
                                'type': 'custom'
                            }
                        },
                        'filter': {
                            'filter_ngram': {
                                'type': 'nGram',
                                'max_gram': 30,
                                'min_gram': 1
                            }
                        }
                    }
                }
            }
        self.refresh_index_cache()

    def search_index_text(self, query_string, fields="_all", **args):
        q = query.MatchQuery(fields, query_string)
        return self.search_index(q, **args)

    def search_index(self, query, indices=None, num_results=None, node_type=None):
        results = self.conn.search(
            query=query, indices=indices, doc_types=node_type)
        meta_list = [r.get_meta() for r in results[0:num_results]]
        node_dict = {}

        # fetch nodes grouped by type to reduce number of db calls
        key = itemgetter('type')
        for t, grouped_list in groupby(sorted(meta_list, key=key), key=key):
            ids = [meta['id'] for meta in grouped_list]
            for node in self.datastore.get_nodes(t, ids):
                node_dict[(node.type, node.key)] = node

        # return nodes in original order
        nodelist = [node_dict[(meta['type'], meta['id'])]
                    for meta in meta_list]

        return nodelist

    def create_index(self, type, indexed_variables, index_name):
        self.conn.indices.create_index_if_missing(index_name, self.settings)
        mapping = {}
        for arg in indexed_variables:
            mapping[arg] = {'boost': 1.0,
                            'analyzer': 'ngram_analyzer',
                            'type': 'string',
                            'term_vector': 'with_positions_offsets'}
        index_settings = {'index_analyzer': 'ngram_analyzer',
                          'search_analyzer': 'standard',
                          'properties': mapping}
        self.conn.indices.put_mapping(str(type), index_settings, [index_name])
        self.refresh_index_cache()
        self.populate_index(type, index_name)

    def refresh_index_cache(self):
        try:
            indices = self.conn.indices.get_mapping(raw=True)
        except exceptions.IndexMissingException:
            indices = {}
        else:
            indices = dict((k, v.get('mappings', {})) for k, v in indices.items())
        self.indices = indices

    def delete_index(self, index_name):
        self.conn.indices.delete_index_if_exists(index_name)
        self.refresh_index_cache()

    def populate_index(self, type, index_name):
        #add all the currently existing nodes into the index
        ref_node = self.datastore.get_reference_node(type)
        node_list = [rel.target_node for rel in ref_node.instance.outgoing]

        for node in node_list:
            key = node.key
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.delete(index_name, type, key)
            except ELASTIC_SEARCH_EXCEPTIONS as err:
                log.exception(err)
                pass
            self.conn.index(index_dict, index_name, type, key, bulk=True)
        self.conn.indices.refresh([index_name])

    def on_create(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.index(index_dict, index_name, node.type, node.key, bulk=True)
                self.conn.indices.refresh([index_name])
            except ELASTIC_SEARCH_EXCEPTIONS as err:
                log.exception(err)
                pass

    def on_delete(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            try:
                self.conn.delete(index_name, node.type, node.key, bulk=True)
                self.conn.indices.refresh([index_name])
            except ELASTIC_SEARCH_EXCEPTIONS as err:
                log.exception(err)
                pass

    def on_modify(self, node):
        type_indices = self.get_indices_of_type(node.type)
        for index_name in type_indices:
            index_dict = self.populate_index_document(node, index_name)
            try:
                self.conn.delete(index_name, node.type, node.key)
                self.conn.index(index_dict, index_name, node.type, node.key, bulk=True)
                self.conn.indices.refresh([index_name])
            except ELASTIC_SEARCH_EXCEPTIONS as err:
                log.exception(err)
                pass

    def get_indices_of_type(self, type):
        type_indices = [
            key for key, value in self.indices.items()
            if type in value
        ]
        return type_indices

    def populate_index_document(self, node, index_name):
        indexed_variables = self.indices[index_name][node.type]['properties'].keys()
        index_dict = {
            field: node[field] for field in indexed_variables
        }
        return index_dict
Example #34
'''
Created on May 25, 2013

@author: yapianyu
'''
from bson.objectid import ObjectId
from difflib import SequenceMatcher
from pyes.es import ES
from pyes.query import MultiMatchQuery, Search
from pymongo.connection import Connection
import collections
import datetime

ml_100k_folder = '/home/yapianyu/Desktop/movielens/ml-100k/'
ml_10m_folder = '/home/yapianyu/Desktop/movielens/ml-10M100K/'
mongodb = Connection('127.0.0.1', 27017)['right-channel']
elasticsearch = ES(('http', '127.0.0.1', 9200))


def count_movie_num_each_year():
    movie_num = {}
    f = open(ml_10m_folder + 'movies.dat')
    for line in f:
        year = int(line.split('::')[1][-5:-1])
        if year in movie_num:
            movie_num[year] += 1
        else:
            movie_num[year] = 1
    f.close()

    d = collections.OrderedDict(sorted(movie_num.items(), key=lambda t: -t[0]))
    for year, num in d.items():
        print year, '\t', num
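
For reference, each record in the MovieLens 10M `movies.dat` file is `::`-separated with the release year in trailing parentheses on the title field, which is what the `[-5:-1]` slice above picks out; a minimal sketch of that parsing (the sample line is illustrative):

line = '1::Toy Story (1995)::Adventure|Animation|Children|Comedy|Fantasy'
title = line.split('::')[1]    # 'Toy Story (1995)'
year = int(title[-5:-1])       # 1995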
Example #35
0
def index(request):
    es = ES(settings.ES_SERVER)
    graphs = Graph.objects.all()
    for graph in graphs:
        graph.count = es.count(parse(graph.query))['count']
    return render_to_response('dashboard/index.html', {'graphs': graphs})
Example #36
0
import sys

#sys.path.insert(0, "../")

#from pyes import ES
from pyes.es import ES
from datetime import datetime
import shelve
conn = ES('127.0.0.1:9500')
#conn = ES('192.168.2.50:9200')
try:
    conn.delete_index("test-index")
except:
    pass

dataset = shelve.open("samples.shelve")

mapping = {
    u'description': {
        'boost': 1.0,
        'index': 'analyzed',
        'store': 'true',
        'type': u'string',
        "term_vector": "with_positions_offsets"
    },
    u'name': {
        'boost': 1.0,
        'index': 'analyzed',
        'store': 'true',
        'type': u'string',
        "term_vector": "with_positions_offsets"
Example #37
0
from pyes import mappings
from pyes.es import ES


def mappings_to_code(obj, doc_count=0):
    result = []
    odict = obj.as_dict()
    if isinstance(obj, (mappings.DocumentObjectField, mappings.ObjectField,
                        mappings.NestedObject)):
        properties = odict.pop("properties", [])
        doc_count += 1
        kwargs = ["name=%r" % obj.name,
                  "type=%r" % odict.pop("type")] +\
                 ["%s=%r" % (k, odict[k]) for k in sorted(odict.keys())]
        result.append("doc%d=" % doc_count +
                      str(type(obj)).split(".")[-1].strip("'>") + "(" +
                      ', '.join(kwargs) + ")")
        for k in sorted(obj.properties.keys()):
            result.extend(mappings_to_code(obj.properties[k], doc_count))
    else:
        kwargs = ["name=%r" % obj.name,
                  "type=%r" % odict.pop("type"),
                  "store=%r" % obj.store,
                  "index=%r" % odict.pop("index")] +\
                 ["%s=%r" % (k, odict[k]) for k in sorted(odict.keys())]
        result.append("doc%d.add_property(" % doc_count +\
                      str(type(obj)).split(".")[-1].strip("'>") + "(" +\
                      ', '.join(kwargs) + "))")

    return result


if __name__ == '__main__':
    es = ES("192.168.1.1:9200")
    res = mappings_to_code(es.mappings.get_doctype("twitter", "twitter"))
    print("\n".join(res))
Example #38
0
class ESDataTarget(DataTarget):
    """docstring for ClassName
    """
    def __init__(self,
                 document_type,
                 index="test",
                 host="127.0.0.1",
                 port="9200",
                 truncate=False,
                 expand=False,
                 **elasticsearch_args):
        """Creates a ElasticSearch data target stream.

        :Attributes:
            * document_ElasticSearch elasticsearch document_type name
            * index: database name
            * host: ElasticSearch database server host, default is ``localhost``
            * port: ElasticSearch port, default is ``9200``
            * expand: expand dictionary values and treat children as top-level keys with dot '.'
                separated key path to the child..
            * truncate: delete existing data in the document_type. Default: False
        """
        super(ESDataTarget, self).__init__()
        self.document_type = document_type
        self.index = index
        self.host = host
        self.port = port
        self.elasticsearch_args = elasticsearch_args
        self.expand = expand
        self.truncate = truncate
        self._fields = None

    def initialize(self):
        """
        Initialize ElasticSearch source stream:
        """
        from pyes.es import ES
        from pyes.exceptions import IndexAlreadyExistsException, TypeMissingException

        args = self.elasticsearch_args.copy()
        server = ""
        if self.host:
            server = self.host
        if self.port:
            server += ":" + self.port

        create = args.pop("create", False)
        replace = args.pop("replace", False)

        self.connection = ES(server, **args)
        self.connection.default_indices = self.index
        self.connection.default_types = self.document_type

        created = False
        if create:
            try:
                self.connection.create_index(self.index)
                self.connection.refresh(self.index)
                created = True
            except IndexAlreadyExistsException:
                pass

        if replace and not created:
            self.connection.delete_index_if_exists(self.index)
            self.connection.refresh(self.index)
            self.connection.create_index(self.index)
            self.connection.refresh(self.index)

        if self.truncate:
            self.connection.delete_mapping(self.index, self.document_type)
            self.connection.refresh(self.index)
        #check mapping
        try:
            self.connection.get_mapping(self.document_type, self.index)
        except TypeMissingException:
            self.connection.put_mapping(self.document_type,
                                        self._get_mapping(), self.index)

    def _get_mapping(self):
        """Build an ES optimized mapping for the given fields"""
        from pyes.mappings import DocumentObjectField, IntegerField, StringField, BooleanField, FloatField, DateField

        document = DocumentObjectField(name=self.document_type)
        for field in self.fields:
            st = field.storage_type
            if st == "unknown":
                #lets es detect the type
                continue
            elif st in ["string", "text"]:
                document.add_property(StringField(name=field.name))
            elif st == "integer":
                document.add_property(IntegerField(name=field.name))
            elif st == "boolean":
                document.add_property(BooleanField(name=field.name))
            elif st == "date":
                document.add_property(DateField(name=field.name))
            elif st == "float":
                document.add_property(FloatField(name=field.name))

        return document

    def append(self, obj):
        record = obj
        if not isinstance(obj, dict):
            record = dict(zip(self.field_names, obj))

        if self.expand:
            record = expand_record(record)

        id = record.get('id') or record.get('_id')
        self.connection.index(record,
                              self.index,
                              self.document_type,
                              id,
                              bulk=True)

    def finalize(self):
        self.connection.flush_bulk(forced=True)
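
A minimal usage sketch for the ESDataTarget above, assuming the `people` index and its `person` mapping already exist on the cluster, so `initialize()` only connects and the mapping check is a no-op (index name, type name, and record values are illustrative):

target = ESDataTarget('person', index='people', host='127.0.0.1', port='9200')
target.initialize()                                     # connect and set default index/type
target.append({'id': '1', 'name': 'Ada', 'age': 36})    # dicts are indexed as-is; 'id' becomes the document id
target.append({'id': '2', 'name': 'Grace', 'age': 45})
target.finalize()                                       # flush the pending bulk batch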
from pprint import pprint

from pyes.es import ES

from topic_tracking.model_management.helper.index import IndexHelper
from topic_tracking.util.codec.mongo_codec import MongoCodec
from topic_tracking.util.mongo import MongoConnectionManager


if __name__ == '__main__':

    # MongoDB
    host = 'localhost'
    port = 27017
    mcm = MongoConnectionManager(host, port, MongoCodec())
    database = 'processed'
    resource_collection = mcm.get_collection(database, 'resources', Resource)

    # ElasticSearch
    es = ES('localhost:9200', timeout=60)
    resource_index = 'topic_tracking_resources'
    story_index = 'topic_tracking_stories'

    # utilities
    index_helper = IndexHelper(es, resource_index, story_index)

    # get a resource to mongo
    resource = resource_collection.find_one_model()

    # analyze the resource
    term_string = index_helper._build_payload_string(resource.terms)
    pprint(term_string)
    params = {}
    params['text'] = term_string
    response = es._send_request('GET', resource_index + '/_analyze', None, params)
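
For context, the `_analyze` call above returns the tokenized form of the submitted text; the response is a dict shaped roughly like the sketch below (token values are illustrative):

# {'tokens': [{'token': 'church', 'start_offset': 0, 'end_offset': 6,
#              'type': '<ALPHANUM>', 'position': 1},
#             ...]}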
Example #40
0
class ESDataTarget(base.DataTarget):
    """docstring for ClassName
    """
    def __init__(self,
                 document_type,
                 database="test",
                 host="127.0.0.1",
                 port="9200",
                 truncate=False,
                 expand=False,
                 **elasticsearch_args):
        """Creates a ElasticSearch data target stream.

        :Attributes:
            * document_ElasticSearch elasticsearch document_type name
            * database: database name
            * host: ElasticSearch database server host, default is ``localhost``
            * port: ElasticSearch port, default is ``9200``
            * expand: expand dictionary values and treat children as top-level keys with dot '.'
                separated key path to the child..
            * truncate: delete existing data in the document_type. Default: False
        """
        self.document_type = document_type
        self.database_name = database
        self.host = host
        self.port = port
        self.elasticsearch_args = elasticsearch_args
        self.expand = expand
        self.truncate = truncate
        self._fields = None

    def initialize(self):
        """Initialize ElasticSearch source stream:
        """
        import time

        from pyes.es import ES
        from pyes.exceptions import IndexAlreadyExistsException

        args = self.elasticsearch_args.copy()
        server = ""
        if self.host:
            server = self.host
        if self.port:
            server += ":" + self.port

        create = args.pop("create", False)
        replace = args.pop("replace", False)

        self.connection = ES(server, **args)
        self.connection.default_indices = self.database_name
        self.connection.default_types = self.document_type

        created = False
        if create:
            try:
                self.connection.create_index(self.database_name)
                self.connection.refresh(self.database_name)
                created = True
            except IndexAlreadyExistsException:
                pass

        if replace and not created:
            self.connection.delete_index_if_exists(self.database_name)
            time.sleep(2)
            self.connection.create_index(self.database_name)
            self.connection.refresh(self.database_name)

        if self.truncate:
            self.connection.delete_mapping(self.database_name,
                                           self.document_type)
            self.connection.refresh(self.database_name)

    def append(self, obj):
        record = obj
        if not isinstance(obj, dict):
            record = dict(zip(self.fields.names(), obj))

        if self.expand:
            record = expand_record(record)

        id = record.get('id') or record.get('_id')
        self.connection.index(record,
                              self.database_name,
                              self.document_type,
                              id,
                              bulk=True)

    def finalize(self):
        self.connection.flush_bulk(forced=True)
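
As with the previous variant, `create` and `replace` ride in through `**elasticsearch_args` and are popped off before the remaining keyword arguments reach the pyes `ES(...)` constructor; a minimal sketch, assuming a reachable local cluster (index name, type name, and record values are illustrative):

# Rebuild the 'events' index from scratch on each run; the leftover kwarg
# (timeout) is forwarded to the pyes ES constructor.
target = ESDataTarget('event', database='events', host='127.0.0.1', port='9200',
                      replace=True, timeout=30)
target.initialize()                           # connect, then drop and recreate the index
target.append({'id': '1', 'kind': 'click'})   # queued for bulk indexing
target.finalize()                             # flush the bulk buffer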