class ElasticSearchPipeline(object):
    def __init__(self):
        self.settings = get_project_settings()

        basic_auth = {'username': self.settings['ELASTICSEARCH_USERNAME'],
                      'password': self.settings['ELASTICSEARCH_PASSWORD']}

        if self.settings['ELASTICSEARCH_PORT']:
            uri = "%s:%d" % (self.settings['ELASTICSEARCH_SERVER'], self.settings['ELASTICSEARCH_PORT'])
        else:
            uri = "%s" % (self.settings['ELASTICSEARCH_SERVER'])

        self.es = ES([uri], basic_auth=basic_auth)

    def process_item(self, item, spider):
        if self.__get_uniq_key() is None:
            log.msg("ELASTICSEARCH_UNIQ_KEY is NONE")
            self.es.index(dict(item), self.settings['ELASTICSEARCH_INDEX'], self.settings['ELASTICSEARCH_TYPE'],
                          id=item['id'], op_type='create',)
        else:
            log.msg("Generation SHA1")
            self.es.index(dict(item), self.settings['ELASTICSEARCH_INDEX'], self.settings['ELASTICSEARCH_TYPE'],
                          hashlib.sha1(item[self.__get_uniq_key()]).hexdigest())
        log.msg("Item send to Elastic Search %s" %
                    (self.settings['ELASTICSEARCH_INDEX']),
                    level=log.DEBUG, spider=spider)
        return item

    def __get_uniq_key(self):
        if not self.settings['ELASTICSEARCH_UNIQ_KEY'] or self.settings['ELASTICSEARCH_UNIQ_KEY'] == "":
            return None
        return self.settings['ELASTICSEARCH_UNIQ_KEY']
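For context, this pipeline is enabled through Scrapy's settings module. A minimal sketch, assuming a hypothetical myproject.pipelines module path; the setting names are the ones the pipeline reads above:

# settings.py (sketch; module path and values are illustrative)
ITEM_PIPELINES = {'myproject.pipelines.ElasticSearchPipeline': 300}
ELASTICSEARCH_SERVER = 'localhost'
ELASTICSEARCH_PORT = 9200
ELASTICSEARCH_USERNAME = 'elastic'   # assumption: your cluster uses basic auth
ELASTICSEARCH_PASSWORD = 'changeme'
ELASTICSEARCH_INDEX = 'scrapy-items'
ELASTICSEARCH_TYPE = 'item'
ELASTICSEARCH_UNIQ_KEY = 'url'       # hashed by process_item() into the document id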
Example #2
 def try_conn(self):
     """Try a new connection to the Elasticsearch."""
     try:
         self.conn = ES(self.conn_strs, timeout=5)
         self.connected = True
     except NoServerAvailable:
         print("Error connecting to elasticsearch for logging")
Example #3
    def __init__(self, serverInfo, proto="http"):
        #serverInfo can be a json object
        #only connect pyes to master es node
        #in the case that other nodes are taken down
        #because http requests will fail
        # TODO: dynamic master node detection
        if isinstance(serverInfo, dict):
            self.ip = serverInfo["ip"]
            self.rest_username = serverInfo["username"]
            self.rest_password = serverInfo["password"]
            self.username = serverInfo["es_username"]
            self.password = serverInfo["es_password"]
            self.port = 9091  #serverInfo["port"]
        else:
            self.ip = serverInfo.ip
            self.rest_username = serverInfo.rest_username
            self.rest_password = serverInfo.rest_password
            self.username = serverInfo.es_username
            self.password = serverInfo.es_password
            self.port = 9091  # serverInfo.port

        self.baseUrl = "http://{0}:{1}/".format(self.ip, self.port)
        self.capiBaseUrl = self.baseUrl
        self.esHttpUrl = "http://{0}:9200".format(self.ip)
        self.http_port = str(int(self.port) + 109)
        self.proto = proto
        self.conn = ES(server=self.esHttpUrl)
        self.manager = managers.Cluster(self.conn)
        self.test_params = TestInputSingleton.input
        self.docs = None
Example #4
class ElasticSearchPipeline(object):

    def __init__(self):
        from pyes import ES
        self.settings = get_project_settings()
        if self.settings['ELASTICSEARCH_PORT']:
            uri = "%s:%d" % (self.settings['ELASTICSEARCH_SERVER'], self.settings['ELASTICSEARCH_PORT'])
        else:
            uri = "%s" % (self.settings['ELASTICSEARCH_SERVER'])
        self.es = ES([uri])

    def process_item(self, item, spider):
        if self.__get_uniq_key() is None:
            self.es.index(dict(item), self.settings['ELASTICSEARCH_INDEX'], self.settings['ELASTICSEARCH_TYPE'],
                          id=item['id'], op_type='create',)
        else:
            self.es.index(dict(item), self.settings['ELASTICSEARCH_INDEX'], self.settings['ELASTICSEARCH_TYPE'],
                          self._get_item_key(item))
        return item

    def _get_item_key(self, item):
        uniq = self.__get_uniq_key()
        if isinstance(uniq, list):
            values = [item[key] for key in uniq]
            value = ''.join(values)
        else:
            value = item[uniq]  # hash the item's value for this key, not the key name itself

        return hashlib.sha1(value).hexdigest()

    def __get_uniq_key(self):
        if not self.settings['ELASTICSEARCH_UNIQ_KEY'] or self.settings['ELASTICSEARCH_UNIQ_KEY'] == "":
            return None
        return self.settings['ELASTICSEARCH_UNIQ_KEY']
Example #5
File: stash.py  Project: hoffmann/stash
class StashHandler(logging.Handler):
    def __init__(self, constr, whitelist=None, blacklist=None):
        logging.Handler.__init__(self)
        self.conn = ES(constr)
        if blacklist is None:
            blacklist = set()
        self.whitelist = whitelist
        self.blacklist = blacklist
        self.record_type = 'record'

    @property
    def index_name(self):
        return 'logstash-'+datetime.date.today().strftime('%Y.%m.%d')

    def emit(self, record):
        if self.whitelist is None:
            d = { k: record.__dict__[k] for k in record.__dict__ if k not in self.blacklist }
        else:
            d = { k: record.__dict__[k] for k in record.__dict__ if k in self.whitelist and k not in self.blacklist }
        entry = {
            "@fields": d,
            "@message": record.msg, 
            "@source": "gelf://localhost", 
            "@source_host": "gelf://localhost", 
            "@source_path": "/", 
            "@tags": [], 
            "@timestamp": datetime.datetime.utcnow().isoformat(), 
            "@type": self.record_type}
        self.conn.index(entry, self.index_name, self.record_type)
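A usage sketch for the handler above, assuming a local Elasticsearch on the default HTTP port:

import logging

handler = StashHandler('127.0.0.1:9200', blacklist={'args'})
logging.getLogger().addHandler(handler)
logging.getLogger().warning('hello')  # emit() indexes this into today's logstash-YYYY.MM.DD index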
Example #6
def get_related_videos(video):
    related_videos = []
    conn = ES(['127.0.0.1:9200'])
    conn.default_indices = VIDEO_INDEX
    conn.refresh(VIDEO_INDEX)
    q = {
        "query": {
            "bool": {
                "should": [
                    {"term": {"uid": video.uid}},
                    {"terms": {"category": [video.category]}},
                    {"terms": {"topic": [video.topic]}},
                    {"terms": {"language": [video.language]}}
                ],
                "minimum_should_match": 1
            }
        }
    }
    try:
        query = json.dumps(q)
        url = "http://localhost:9200/%s/_search" % VIDEO_INDEX
        response = urllib2.urlopen(url, query)
        result = json.loads(response.read())
        for res in result['hits']['hits']:
            related_videos.append(res['_source'])
    except Exception:
        pass
    return related_videos
Example #7
def multi_param_search(request):
    log_results = None
    es = ES()  # create Elasticsearch connection object
    if request.method == 'POST':  # if the search form is submitted
        filters_list = []
        # loop over each search param and add it to the filter list if it has a value
        for param in [
                "version", "ip_header_length", "ttl", "protocol",
                "source_address", "destination_address", "source_port",
                "dest_port", "sequence_number", "acknowledgement",
                "tcp_header_length", "data", "datetime"
        ]:
            if request.POST.get(param) != '':
                q_param = TermFilter(param, request.POST.get(param))
                filters_list.append(q_param)
        if len(filters_list) != 0:  # if there are filter params, get the results
            orq = ANDFilter(filters_list)
            q = FilteredQuery(MatchAllQuery(), orq)
            log_results = es.search(q, indices=index_name, doc_types=type_name)
        else:
            log_results = None
    elif request.method == 'GET':  # get all packet when get the search page
        log_results = es.search(MatchAllQuery(),
                                indices=index_name,
                                doc_types=type_name)
    return render(request, 'multi_param_search.html',
                  {'log_results': log_results})
Example #8
def index():
    """docstring for fname"""
    import time
    fptr = open(sys.argv[1], 'rb')
    line_count = 0
    conn = ES(["localhost:9200"])
    #conn.create_index('test-index')
    start = time.clock()
    numb_exceptions = 0

    for line in fptr:
        if ((line_count % 10000) == 0):
            end = time.clock()
            minutes = (end - start) / 60.0
            print 'Done with %d took %f min. ' %(line_count, minutes)
            print 'number of exceptions ', numb_exceptions
        line_count += 1
        data = json.loads(line)
        post_id = int(data['post_id'])
        if post_id and data:
            try:
                conn.index(data, "test-index", "test-type", post_id)
            except Exception:
                numb_exceptions += 1
                continue

    print 'number of exceptions ', numb_exceptions
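An invocation sketch: the function expects a file of JSON lines (each carrying a post_id) as the first CLI argument; the script and file names are illustrative:

# python bulk_indexer.py posts.jsonl
if __name__ == '__main__':
    index()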
Example #9
def get_related_collections(collection):
    related_collections = []
    conn = ES(['127.0.0.1:9200'])
    conn.default_indices = FACET_INDEX
    conn.refresh(FACET_INDEX)
    q ={"query": {
                        "bool" : {
                                  "must_not" : {"term" : { "uid" : collection.uid }},
                            "should" : [
                                        {"terms" : { "subject" : [collection.subject] }},
                                        {"terms" : { "topic" : [collection.topic] }},
                                        ],
                            "minimum_should_match" : 1,
                                }
                  }
        }
    try :
        query = json.dumps(q)
        url = "http://localhost:9200/%s/_search" % FACET_INDEX
        response = urllib2.urlopen(url, query)
        result = json.loads(response.read())
        for res in result['hits']['hits']:
            related_collections.append(res['_source'])
    except Exception:
        pass
    return related_collections
Example #10
def facets(host='localhost:9200',
           facet_terms=['bibleverse'],
           _type='habakkuk',
           date_filter=[],
           size=10):
    ret = {}
    conn = ES(host)
    q = MatchAllQuery()
    if date_filter:
        start,end = date_filter
        q = FilteredQuery(q, RangeFilter(qrange=ESRange('created_at_date',
                                                        start.isoformat(),
                                                        end.isoformat(),
                                                        include_upper=False)))

    q = q.search(size=0)
    for term in facet_terms:
        q.facet.add_term_facet(term,order='count',size=size)
        
    es_logger.info(q.serialize())

    resultset = conn.search(query=q, indices=_type+'-*', doc_types=[_type])
    for facet in resultset.facets:
        ret[facet] = []
        for row in resultset.facets[facet]['terms']:
            ret[facet].append({"value":row['term'],"count":row['count']})

    logger.debug("facets return|'%s'"%json.dumps(ret))
    return ret
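A call sketch, assuming daily habakkuk-* indices exist and passing datetime.date bounds for the optional filter:

import datetime

counts = facets(facet_terms=['bibleverse'],
                date_filter=[datetime.date(2013, 1, 1), datetime.date(2013, 2, 1)],
                size=5)
# counts looks like {'bibleverse': [{'value': ..., 'count': ...}, ...]}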
Example #11
    def handle(self, *args, **kwargs):
        elastic = ES(settings.SEARCH_HOSTS)

        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(elastic.get_indices())

        elastic.connection.close()
Example #12
def get_related_videos(video):
    related_videos = []
    conn = ES(['127.0.0.1:9200'])
    conn.default_indices = VIDEO_INDEX
    conn.refresh(VIDEO_INDEX)
    q = {
        "query": {
            "bool": {
                "should": [
                    {"term": {"uid": video.uid}},
                    {"terms": {"category": [video.category]}},
                    {"terms": {"topic": [video.topic]}},
                    {"terms": {"language": [video.language]}}
                ],
                "minimum_should_match": 1
            }
        }
    }
    try:
        query = json.dumps(q)
        url = "http://localhost:9200/%s/_search" % VIDEO_INDEX
        response = urllib2.urlopen(url, query)
        result = json.loads(response.read())
        for res in result['hits']['hits']:
            related_videos.append(res['_source'])
    except Exception:
        pass
    return related_videos
Example #13
def get_related_collections(collection, featured):
    related_collections = []
    conn = ES(["127.0.0.1:9200"])
    conn.default_indices = FACET_INDEX
    conn.refresh(FACET_INDEX)
    q = {
        "query": {
            "bool": {
                "must_not": {"term": {"uid": collection.uid}},
                "should": [{"terms": {"subject": [collection.subject]}}, {"terms": {"topic": [collection.topic]}}],
                "minimum_should_match": 1,
            }
        }
    }
    if featured:
        q = {
            "query": {
                "bool": {
                    "must_not": {"term": {"uid": collection.uid}},
                    "should": [{"term": {"featured": True}}],
                    "minimum_should_match": 1,
                }
            }
        }
    try:
        query = json.dumps(q)
        url = "http://localhost:9200/%s/_search" % FACET_INDEX
        response = urllib2.urlopen(url, query)
        result = json.loads(response.read())
        for res in result["hits"]["hits"]:
            related_collections.append(res["_source"])
    except Exception:
        pass
    return related_collections
Example #14
def term_facet(host='localhost:9200',
               terms=['bibleverse'],
               _type='habakkuk',
               date_filter=[],
               size=10):
    ret = []
    conn = ES(host)
    q = MatchAllQuery()
    if date_filter:
        start,end = date_filter
        q = FilteredQuery(q, RangeFilter(qrange=ESRange('created_at_date',start,end,include_upper=False)))

    q = q.search(size=0)
    for term in terms:
        q.facet.add_term_facet(term,order='count',size=size)
        
    print json.dumps(json.loads(q.to_search_json()),indent=2)

    resultset = conn.search(query=q, indices=_type+'-*', doc_types=[_type])
    for facet in resultset.facets:
        print "Total",facet,resultset.facets[facet]['total']
        for row in resultset.facets[facet]['terms']:
            print "\t",row['term'],row['count']
            ret.append((facet,row['term']))
        
    return ret
Example #15
class ElasticSearchPipeline(object):
    def __init__(self):
        self.settings = get_project_settings()

        basic_auth = {'username': self.settings['ELASTICSEARCH_USERNAME'], 'password': self.settings['ELASTICSEARCH_PASSWORD']}

        if self.settings['ELASTICSEARCH_PORT']:
            uri = "%s:%d" % (self.settings['ELASTICSEARCH_SERVER'], self.settings['ELASTICSEARCH_PORT'])
        else:
            uri = "%s" % (self.settings['ELASTICSEARCH_SERVER'])

        self.es = ES([uri], basic_auth=basic_auth)

    def index_item(self, item):
        uniq_key = self.settings['ELASTICSEARCH_UNIQ_KEY']
        if uniq_key:
            local_id = hashlib.sha1(item[uniq_key]).hexdigest()
            log.msg("Generated unique key %s" % local_id, level=self.settings['ELASTICSEARCH_LOG_LEVEL'])
            op_type = 'none'
        else:
            op_type = 'create'
            local_id = item['id']
        self.es.index(dict(item),
                      self.settings['ELASTICSEARCH_INDEX'],
                      self.settings['ELASTICSEARCH_TYPE'],
                      id=local_id,
                      op_type=op_type)
Example #16
def main(argv):
    start = 1
    if len(sys.argv) > 1:
        if sys.argv[1]:
            start = int(sys.argv[1])  # range() below needs an int, not the raw argv string

    bulksize = 1000

    es = ES(("http", "localhost", 9200), bulk_size=bulksize)

    c0 = 0
    t0 = time.time()
    c1 = 0
    t1 = time.time()
    for n in range(start, start + 1000000):
        result = es.index(
            {
                'a': random_string_generator(),
                'b': random_string_generator(),
                'c': random_string_generator(),
                'd': random_string_generator(),
                'e': random_string_generator(),
                'f': random_string_generator(),
                'g': random_string_generator(),
                'h': random_string_generator(),
                'i': random_string_generator(),
                'j': random_string_generator(),
                'k': random_string_generator(),
                'l': random_string_generator(),
                'm': random_string_generator(),
                'n': random_string_generator(),
                'o': random_string_generator(),
                'p': random_string_generator(),
                'q': random_string_generator(),
                'r': random_string_generator(),
                's': random_string_generator(),
                't': random_string_generator(),
                'u': random_string_generator(),
                'v': random_string_generator(),
                'w': random_string_generator(),
                'x': random_string_generator(),
                'y': random_string_generator(),
                'z': random_string_generator()
            },
            'pyindex',
            'pytype',
            n,
            bulk=True)
        c0 = c0 + bulksize
        c1 = c1 + bulksize
        if result:
            d0 = (time.time() - t0)
            d1 = (time.time() - t1)
            now = datetime.datetime.utcnow()
            print("{0},{1},{2},{3},{4},{5},{6},{7}".format(
                now.strftime("%Y-%m-%dT%H:%M:%S.%fZ"), result.took, c0, d0,
                c0 / (d0 * bulksize), c1, d1, c1 / (d1 * bulksize)))
            c1 = 0
            t1 = time.time()
Example #17
 def es_index(self):
     conn = ES(settings.ES_SERVERS, basic_auth=settings.ES_AUTH)
     conn.index(
         doc=self.get_search_kwargs(),
         index=self.tenant.slug,
         doc_type=self.Meta.document_type,
         id=unicode(self.id)
     )
Example #18
File: search.py  Project: tcpr1/vosae-app
 def es_deindex(self):
     conn = ES(settings.ES_SERVERS, basic_auth=settings.ES_AUTH)
     try:
         conn.delete(index=self.tenant.slug,
                     doc_type=self.Meta.document_type,
                     id=unicode(self.id))  # 'meta' was undefined; mirrors es_index above
     except:
         pass
Example #19
 def __init__(self,
              data_type,
              csv_path="/tmp/",
              es_hosts=("http://localhost:9200", )):
     self.data_type = data_type
     self.doc_type = "ticketnetwork_%s" % self.data_type
     self.csv_path = csv_path
     self.es = ES(es_hosts)
Example #20
 def __init__(self):
     from pyes import ES
     self.settings = get_project_settings()
     if self.settings['ELASTICSEARCH_PORT']:
         uri = "%s:%d" % (self.settings['ELASTICSEARCH_SERVER'], self.settings['ELASTICSEARCH_PORT'])
     else:
         uri = "%s" % (self.settings['ELASTICSEARCH_SERVER'])
     self.es = ES([uri])
Example #21
def get_es(**overrides):
    """Return one pyes.es.ES object

    :arg overrides: Allows you to override defaults to create the ES.

    Things you can override:

    * default_indexes
    * timeout
    * dump_curl

    Values for these correspond with the arguments to pyes.es.ES.

    For example, if you wanted to create an ES for indexing with a timeout
    of 30 seconds, you'd do:

    >>> es = get_es(timeout=30)

    If you wanted to create an ES for debugging that dumps curl
    commands to stdout, you could do:

    >>> class CurlDumper(object):
    ...     def write(self, s):
    ...         print s
    ...
    >>> es = get_es(dump_curl=CurlDumper())
    """
    if overrides or not hasattr(_local, 'es'):
        defaults = {
            'default_indexes': DEFAULT_INDEXES,
            'timeout': DEFAULT_TIMEOUT,
            'dump_curl': DEFAULT_DUMP_CURL,
            }

        defaults.update(overrides)
        if (not thrift_enable and
            not settings.ES_HOSTS[0].split(':')[1].startswith('92')):
            raise ValueError('ES_HOSTS is not set to a valid port starting '
                             'with 9200-9299 range. Other ports are valid '
                             'if using pythrift.')
        es = ES(settings.ES_HOSTS, **defaults)

        # pyes 0.15 does this lame thing where it ignores dump_curl in
        # the ES constructor and always sets it to None. So what we do
        # is set it manually after the ES has been created and
        # defaults['dump_curl'] is truthy. This might not work for all
        # values of dump_curl.
        if VERSION[0:2] == (0, 15):
            es.dump_curl = (defaults['dump_curl']
                            if defaults['dump_curl'] else None)

        # Cache the es if there weren't any overrides.
        if not overrides:
            _local.es = es
    else:
        es = _local.es

    return es
Example #22
def get_es(hosts=None, default_indexes=None, timeout=None, dump_curl=None,
           **settings):
    """Create an ES object and return it.

    :arg hosts: list of uris; ES hosts to connect to, defaults to
        ``['localhost:9200']``
    :arg default_indexes: list of strings; the default indexes to use,
        defaults to 'default'
    :arg timeout: int; the timeout in seconds, defaults to 5
    :arg dump_curl: function or None; function that dumps curl output,
        see docs, defaults to None
    :arg settings: other settings to pass into `pyes.es.ES`

    Examples:

    >>> es = get_es()


    >>> es = get_es(hosts=['localhost:9200'])


    >>> es = get_es(timeout=30)  # good for indexing


    >>> es = get_es(default_indexes=['sumo_prod_20120627'])


    >>> class CurlDumper(object):
    ...     def write(self, text):
    ...         print text
    ...
    >>> es = get_es(dump_curl=CurlDumper())

    """
    # Cheap way of de-None-ifying things
    hosts = hosts or DEFAULT_HOSTS
    default_indexes = default_indexes or DEFAULT_INDEXES
    timeout = timeout if timeout is not None else DEFAULT_TIMEOUT
    dump_curl = dump_curl or DEFAULT_DUMP_CURL

    if not isinstance(default_indexes, list):
        default_indexes = [default_indexes]

    es = ES(hosts,
            default_indexes=default_indexes,
            timeout=timeout,
            dump_curl=dump_curl,
            **settings)

    # pyes 0.15 does this lame thing where it ignores dump_curl in
    # the ES constructor and always sets it to None. So what we do
    # is set it manually after the ES has been created and
    # defaults['dump_curl'] is truthy. This might not work for all
    # values of dump_curl.
    if PYES_VERSION[0:2] == (0, 15) and dump_curl is not None:
        es.dump_curl = dump_curl

    return es
Example #23
def searchCompletions(request):
    searchString = request.GET.get('searchString')
    maxCount = int(request.GET.get('maxCount'))
    conn = ES(['127.0.0.1:9200'])
    conn.default_indices = COMPLETION_INDEX
    conn.refresh(COMPLETION_INDEX)
    q = {
        "query": {
            "query_string": {
                "fields": ["searchTerm.partial"],
                "query": searchString
            }
        },
        "facets": {
            "facet": {
                "terms": {
                    "fields": ["searchTerm"],
                    "size": MAX_RESULT_SIZE
                }
            }
        },
        "size": maxCount
    }
    try:
        query = json.dumps(q)
        url = "http://localhost:9200/%s/_search" % COMPLETION_INDEX
        response = urllib2.urlopen(url, query)
        result = json.loads(response.read())
        result_list = []
        done_list = []
        for res in result['hits']['hits']:
            if res['_source']['type'] != "Collections":
                result_list.append(res['_source'])
                res['_source']['count'] = 0
            elif res['_source']['searchTerm'] not in done_list:
                val = str(res['_source']['searchTerm']).lower()
                for term in result['facets']['facet']['terms']:
                    if val == term['term']:
                        res['_source']['count'] = term['count']
                        done_list.append(res['_source']['searchTerm'])
                result_list.append(res['_source'])
        if len(result_list) == 0:
            result_list.append(
                {"searchTerm": "No Results"}
            )  # for now just displaying no results when nothing is found in completion
        resp = json.dumps({
            "responseCode": "OK",
            "requestParameters": {
                "searchString": searchString,
                "maxCount": unicode(maxCount)
            },
            "completions": result_list,
            "totalCount": unicode(maxCount)
        })
        return HttpResponse(resp)
    except Exception, ex:
        return HttpResponse('0')
Example #24
    def tearDown(self):
        self.log.warning("before tearDown es")
        self._unlink_es_cluster()
        self._stop_es_replication()
        if self.es_host != None:
            conn = ES(self.es_host + ":9200")
            conn.delete_index_if_exists("default")
        super(ElasticSearchSupport, self).tearDown()
        self.log.warning("after tearDown es")
Example #25
def find_BID_in_SBN(bid, es_server="localhost:9200"):
    sbn_bid = to_iccu_bid(bid)
    q = TermQuery('codiceIdentificativo', sbn_bid)
    es_conn = ES(server=es_server)
    resultset = list(es_conn.search(query=q, indices="iccu"))
    if (len(resultset) > 0):
        return resultset
    else:
        return None
Example #26
 def __init__(self, settings):
     basic_auth = {'username': settings.get('ELASTICSEARCH_USERNAME'),
                   'password': settings.get('ELASTICSEARCH_PASSWORD')}
     if settings.get('ELASTICSEARCH_PORT'):
         uri = "%s:%d" % (settings.get('ELASTICSEARCH_SERVER'), settings.get('ELASTICSEARCH_PORT'))
     else:
         uri = "%s" % (settings.get('ELASTICSEARCH_SERVER'))
     self.es = ES([uri], basic_auth=basic_auth)
     self.settings = settings
Example #27
    def connect(self, connection_pool=1, bulk_size=10):
        update_connection_pool(connection_pool)

        try:
            self.connection = ES(self.servers, bulk_size=bulk_size)
        except NoServerAvailable:
            self._log.error('Failed to connect to elastic search server')
            return False
        return True
Example #28
class BaseElasticSearchClient(BaseClient):

    def __init__(self, servers, index):
        """
        @param servers: Make sure to include the port with the server address
        @param index: Document index
        @return:
        """
        super(BaseElasticSearchClient, self).__init__()
        self.connection = None
        self.servers = servers
        self.index = index if type(index) is list else [index]

    def connect(self, connection_pool=1):
        update_connection_pool(connection_pool)

        try:
            self.connection = ES(self.servers)
        except NoServerAvailable:
            self._log.error('Failed to connect to elastic search server')
            return False
        return True

    def close(self):
        self.connection = None

    def _create_term_query(self, must_list):
        # TODO: add remaining conditional list functionality.
        query = BoolQuery()
        for term in must_list:
            query.add_must(term)
        return query

    def find_term(self, name, value, size=10):
        if not self.connection:
            return

        query = TermQuery(name, value)
        return self.connection.search(query=Search(query, size=size),
                                      indices=self.index)

    def find(self, filter_terms, size=10, doc_types=None):
        if not self.connection:
            return

        query = self._create_term_query(must_list=filter_terms)
        return self.connection.search(query=Search(query, size=size),
                                      indices=self.index,
                                      doc_types=doc_types)

    def find_one(self, filter_terms, size=10, doc_types=None):
        if not self.connection:
            return

        results = self.find(filter_terms=filter_terms, size=size,
                            doc_types=doc_types)
        return results[0] if len(results) > 0 else None
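A usage sketch for the client above; the server address, index, and field names are illustrative:

client = BaseElasticSearchClient(servers=['localhost:9200'], index='logs')
if client.connect(connection_pool=2):
    hits = client.find_term('status', 'error', size=5)
    client.close()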
Example #29
 def es_deindex(self):
     conn = ES(settings.ES_SERVERS, basic_auth=settings.ES_AUTH)
     try:
         conn.delete(
             index=self.tenant.slug,
             doc_type=self.Meta.document_type,
             id=unicode(self.id)  # 'meta' was undefined; mirrors es_index
         )
     except:
         pass
Example #30
class ElasticSearchPipeline(object):
    def __init__(self):
        self.conn = ES('localhost:9200')
        # self.file = open('urls.csv', 'wb')
        # self.file.write('spider,url' + '\n')

    def process_item(self, item, spider):
        #self.file.write(spider.name + ',' + spider.start_urls[0] + '\n')
        self.conn.index(dict(item), "qrator", spider.name)
        return item
Example #31
class ElasticSearchPipeline(object):
    def __init__(self):

        self.settings = get_project_settings()

        basic_auth = {
            'username': self.settings['ELASTICSEARCH_USERNAME'],
            'password': self.settings['ELASTICSEARCH_PASSWORD']
        }

        if self.settings['ELASTICSEARCH_PORT']:

            uri = "%s:%d" % (self.settings['ELASTICSEARCH_SERVER'],
                             self.settings['ELASTICSEARCH_PORT'])
        else:
            uri = "%s" % (self.settings['ELASTICSEARCH_SERVER'])

        self.es = ES([uri], basic_auth=basic_auth)

    def process_item(self, item, spider):
        if self.__get_uniq_key() is None:
            log.msg("ELASTICSEARCH_UNIQ_KEY is NONE")
            self.es.index(
                dict(item),
                self.settings['ELASTICSEARCH_INDEX'],
                self.settings['ELASTICSEARCH_TYPE'],
                id=item['id'],
                op_type='create',
            )
        else:
            self.es.index(dict(item), self.settings['ELASTICSEARCH_INDEX'],
                          self.settings['ELASTICSEARCH_TYPE'],
                          self._get_item_key(item))
        log.msg("Item send to Elastic Search %s" %
                (self.settings['ELASTICSEARCH_INDEX']),
                level=log.DEBUG,
                spider=spider)
        return item

    def _get_item_key(self, item):
        uniq = self.__get_uniq_key()

        if isinstance(uniq, list):
            values = [item[key] for key in uniq]
            value = ''.join(values)
        else:
            value = item[uniq]  # hash the item's value for this key, not the key name itself

        return hashlib.sha1(value).hexdigest()

    def __get_uniq_key(self):
        if not self.settings['ELASTICSEARCH_UNIQ_KEY'] or self.settings['ELASTICSEARCH_UNIQ_KEY'] == "":
            return None
        return self.settings['ELASTICSEARCH_UNIQ_KEY']
Example #32
def job_redirect(request, slug, source, job_id):
    if request.method == "GET" and request.GET.has_key("redirect"):
        try:
            elastic = ES(settings.SEARCH_HOSTS)
            data = elastic.get(source, "job", job_id)
            elastic.connection.close()
            return HttpResponseRedirect(data["_source"]["details_url"])
        except NotFoundException:
            raise Http404

    return direct_to_template(request, "pages/redirect.html")
Example #33
def job_redirect(request, slug, source, job_id):
    if request.method == 'GET' and request.GET.has_key('redirect'):
        try:
            elastic = ES(settings.SEARCH_HOSTS)
            data = elastic.get(source, 'job', job_id)
            elastic.connection.close()
            return HttpResponseRedirect(data['_source']['details_url'])
        except NotFoundException:
            raise Http404
        
    return direct_to_template(request, 'pages/redirect.html')
Example #34
 def __init__(self, name):
     log = open(name, "wb")
     self.log = log
     self.conn = ES(("http", "127.0.0.1", 9200),
                    timeout=300.0,
                    log_curl=True,
                    dump_curl=log)
     self.index_name = "test-index"
     self.document_type = "test-type"
     self.conn.delete_index_if_exists(self.index_name)
     self.init_default_index()
Example #35
class ElasticSearchPipeline(object):

    def __init__(self):    
        self.conn = ES('localhost:9200') 
        # self.file = open('urls.csv', 'wb')
        # self.file.write('spider,url' + '\n')

    def process_item(self, item, spider):        
        #self.file.write(spider.name + ',' + spider.start_urls[0] + '\n')
        self.conn.index(dict(item), "qrator", spider.name)
        return item
Example #36
def setup_store():
    connection = ES(settings.THUMBNAIL_ELASTIC_SEARCH_SERVERS)
    try:
        connection.create_index_if_missing(settings.THUMBNAIL_ELASTIC_SEARCH_INDEX)
    except:
        pass
    try:
        connection.put_mapping(settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE,
                               settings.THUMBNAIL_ELASTIC_SEARCH_MAPPING,
                               indexes=[settings.THUMBNAIL_ELASTIC_SEARCH_INDEX,])
    except:
        pass
Example #37
def main(argv):
    start = 1
    if len(sys.argv) > 1:
        if sys.argv[1]:
            start = int(sys.argv[1])  # range() below needs an int, not the raw argv string

    bulksize = 1000

    es = ES(("http", "localhost", 9200), bulk_size=bulksize)

    c0 = 0
    t0 = time.time()
    c1 = 0
    t1 = time.time()
    for n in range(start, start + 1000000):
        result = es.index({ 
                 'a' : random_string_generator(),
                 'b' : random_string_generator(),
                 'c' : random_string_generator(),
                 'd' : random_string_generator(),
                 'e' : random_string_generator(),
                 'f' : random_string_generator(),
                 'g' : random_string_generator(),
                 'h' : random_string_generator(),
                 'i' : random_string_generator(),
                 'j' : random_string_generator(),
                 'k' : random_string_generator(),
                 'l' : random_string_generator(),
                 'm' : random_string_generator(),
                 'n' : random_string_generator(),
                 'o' : random_string_generator(),
                 'p' : random_string_generator(),
                 'q' : random_string_generator(),
                 'r' : random_string_generator(),
                 's' : random_string_generator(),
                 't' : random_string_generator(),
                 'u' : random_string_generator(),
                 'v' : random_string_generator(),
                 'w' : random_string_generator(),
                 'x' : random_string_generator(),
                 'y' : random_string_generator(),
                 'z' : random_string_generator()
             }, 'pyindex', 'pytype', n, bulk=True)
        c0 = c0 + bulksize
        c1 = c1 + bulksize
        if result:
            d0 = (time.time() - t0) 
            d1 = (time.time() - t1) 
            now = datetime.datetime.utcnow()
            print("{0},{1},{2},{3},{4},{5},{6},{7}"
                .format(now.strftime("%Y-%m-%dT%H:%M:%S.%fZ"), result.took, c0, d0, c0/(d0*bulksize), c1, d1, c1/(d1*bulksize)))
            c1 = 0
            t1 = time.time()
Example #38
 def __init__(self):
     self.settings = get_project_settings()
     basic_auth = {
         'username': self.settings['ELASTICSEARCH_USERNAME'],
         'password': self.settings['ELASTICSEARCH_PASSWORD']
     }
     if self.settings['ELASTICSEARCH_PORT']:
         uri = "%s:%d" % (self.settings['ELASTICSEARCH_SERVER'],
                          self.settings['ELASTICSEARCH_PORT'])
     else:
         uri = "%s" % (self.settings['ELASTICSEARCH_SERVER'])
     self.es = ES([uri], basic_auth=basic_auth)
Example #39
    def __init__(self, url, auto_commit=True, unique_key='_id'):
        """Verify Elastic URL and establish a connection.
        """

        if verify_url(url) is False:
            raise SystemError
        self.elastic = ES(server=url)
        self.auto_commit = auto_commit
        self.doc_type = 'string'  # default type is string, change if needed
        self.unique_key = unique_key
        if auto_commit:
            self.run_auto_commit()
Example #40
def main(options):
    es = ES([options.es_server])
    try:
        es.create_index_if_missing('bzcache')
    except ElasticSearchException:
        # create_index_if_missing is supposed not to raise if the index
        # already existing, but with the ancient pyes / ES server versions
        # we're using it still does.
        pass

    # re-cache all intermittent-failure bugs
    bzcache = BugzillaCache(es_server=options.es_server)
    bzcache.index_bugs_by_keyword('intermittent-failure')
Example #41
def callback(body, message):
    """Do actual work."""

    logger.info("body in callback() is %s" % body)

    # pull lat/lon, time
    path = body
    sd = SD(path)
    lat = N.array(sd.select('Latitude').get())
    lon = N.array(sd.select('Longitude').get())
    t = N.array(sd.select('Time').get())
    sd.end()
    #logger.info("lat: %s" % str(lat.shape))
    #logger.info("lon: %s" % str(lon.shape))
    #logger.info("time: %s" % str(t.shape))

    # build metadata json
    id = os.path.basename(path)
    md = {
        "id": id,
        "dataset": "AIRX2RET",
        "starttime": t[0,0],
        "endtime": t[44,29],
        "location": {
            "coordinates": [[
                [ lon[0,0], lat[0,0] ],
                [ lon[0,29], lat[0,29] ],
                [ lon[44,29], lat[44,29] ],
                [ lon[44,0], lat[44,0] ],
                [ lon[0,0], lat[0,0] ],
            ]], 
            "type": "polygon"
        }, 
        "urls": "http://mozart/data/public/products/%s" % id
    }

    # publish
    pub_dir = '/data/public/products'
    ensure_dir(pub_dir)
    shutil.move(path, os.path.join(pub_dir, id))

    # insert into ElasticSearch
    index = doctype = 'airs'
    conn = ES('http://localhost:9200')
    mapping = json.load(open('grq_mapping.json'))
    if not conn.indices.exists_index(index):
        conn.indices.create_index(index, mapping)
    conn.indices.put_mapping(doctype, mapping, index)
    ret = conn.index(md, index, doctype, md['id'])

    message.ack()
Example #42
def processData(esurl, esindex, estype, shpPath, simplify, tolerance,
                startfrom):

    # Open a file for reading
    try:
        with open(shpPath):
            pass
    except IOError:
        print 'Unable to locate file: ' + shpPath

    #open the es connection
    from pyes import ES
    conn = ES(esurl, timeout=60, bulk_size=10)

    #check that a tolerance is passed when simplifying.
    if (simplify == True):
        if (tolerance == None):
            raise ValueError(
                'You must pass a valid tolerance if simplifying geometry')

    #use fiona to open the shapefile and read it
    try:
        with fiona.open(shpPath) as source:

            for f in source:

                featid = int(f['id'])
                if (featid > startfrom):

                    #grab the geom
                    from shapely.geometry import shape
                    geom = shape(f['geometry'])

                    #simplify if required
                    if (validateGeometry(geom)):
                        if (simplify == True):
                            geom = simplifyGeometry(geom, tolerance)

                    #if the geom is valid then push it into es
                    if (validateGeometry(geom)):
                        data = json.dumps(f)
                        key = f['id']
                        conn.index(data, esindex, estype, key, bulk=True)

                    else:
                        logging.error('Invalid Geometry: ' + f['id'])

    except:
        raise
Example #43
def processData(esurl, esindex, estype, shpPath, simplify, tolerance, startfrom):

    # Open a file for reading
    try:
        with open(shpPath):
            pass
    except IOError:
        print "Unable to locate file: " + shpPath

    # open the es connection
    from pyes import ES

    conn = ES(esurl, timeout=60, bulk_size=10)

    # check that a tolerance is passed when simplifying.
    if simplify == True:
        if tolerance == None:
            raise ValueError("You must pass a valid tolerance if simplifying geometry")

    # use fiona to open the shapefile and read it
    try:
        with fiona.open(shpPath) as source:

            for f in source:

                featid = int(f["id"])
                if featid > startfrom:

                    # grab the geom
                    from shapely.geometry import shape

                    geom = shape(f["geometry"])

                    # simplify if required
                    if validateGeometry(geom):
                        if simplify == True:
                            geom = simplifyGeometry(geom, tolerance)

                    # if the geom is valid then push it into es
                    if validateGeometry(geom):
                        data = json.dumps(f)
                        key = f["id"]
                        conn.index(data, esindex, estype, key, bulk=True)

                    else:
                        logging.error("Invalid Geometry: " + f["id"])

    except:
        raise
Example #44
class Importer(object):
    base_filename = "TicketNetworkDataFeed"

    model_map = {
        "performers": {
            "file": "Performers.csv",
            "model": Performer,
        },
        "events": {
            "file": "Events.csv",
            "model": Event,
        },
        "venues": {
            "file": "Venues.csv",
            "model": Venue,
        }
    }

    def __init__(self,
                 data_type,
                 csv_path="/tmp/",
                 es_hosts=("http://localhost:9200", )):
        self.data_type = data_type
        self.doc_type = "ticketnetwork_%s" % self.data_type
        self.csv_path = csv_path
        self.es = ES(es_hosts)

    def model(self):
        return self.model_map[self.data_type]["model"]

    def filepath(self):
        return os.path.join(
            self.csv_path, '-'.join(
                [self.base_filename, self.model_map[self.data_type]["file"]]))

    def __call__(self, *args, **kwargs):
        with open(self.filepath()) as f:
            reader = DictReader(f)
            for entry in reader:
                sanitize(entry)
                model = self.model()(entry)
                d = model.dict()
                self.es.index(d,
                              "oedi_sources",
                              self.doc_type,
                              model.hash(),
                              bulk=True)
            self.es.flush_bulk(True)
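A run sketch, assuming the TicketNetwork CSV feeds have been downloaded to /tmp/:

importer = Importer('events', csv_path='/tmp/')
importer()  # reads TicketNetworkDataFeed-Events.csv and bulk-indexes into oedi_sources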
Example #45
    def _ensure_is_connected(self):
        if not self._is_connected:
            try:
                port = int(self.settings_dict["PORT"])
            except ValueError:
                raise ImproperlyConfigured("PORT must be an integer")

            self.db_name = self.settings_dict["NAME"]

            self._connection = ES(
                "%s:%s" % (self.settings_dict["HOST"], port),
                decoder=Decoder,
                encoder=Encoder,
                autorefresh=True,
                default_indices=[self.db_name],
            )

            self._db_connection = self._connection
            # auto index creation: check if to remove
            try:
                self._connection.create_index(self.db_name)
            except:
                pass
            # We're done!
            self._is_connected = True
Example #46
 def conn(self):
     if self.tdata.conn is None:
         self.tdata.conn = ES(self.registry.connection_string,
                              bulk_size=self.bulk_size,
                              max_retries=self.max_retries,
                              timeout=self.timeout)
     return self.tdata.conn
Example #47
    def setUp(self):
        self.es_host = None
        self.es_cluster_name = None
        self._state = []
        super(ElasticSearchSupport, self).setUp()
        self.es_host = self.input.param("es_host", "127.0.0.1")
        self.es_port = self.input.param("es_port", 9091)
        conn = ES(self.es_host + ":9200")
        if not self.input.param("skip_cleanup", True) or self.case_number == 1:
            conn.delete_index_if_exists("default")
        conn.create_index("default")
        self.log.warning("waiting for ES index to be ready to use")
        time.sleep(30)
        self._link_es_cluster()
        self._start_es_replication()
        self.log.warning("after setUp es")
Example #48
File: app.py  Project: iamsk/es-demo
def search(searchkey=u"电影"):
    conn = ES('127.0.0.1:9200')
    # TextQuery analyzes/tokenizes the search key
    qtitle = TextQuery("title", searchkey)
    h = HighLighter(['<b>'], ['</b>'], fragment_size=500)
    # multi-field search (must => AND, should => OR), highlighting, result slicing (paging), sorting
    q = Search(BoolQuery(should=[qtitle]), highlight=h, start=0, size=3,
               sort={'id': {'order': 'asc'}})
    q.add_highlight("title")
    results = conn.search(q, "zhihu", "answer")
    list = []
    for r in results:
        if("title" in r._meta.highlight):
            r['title'] = r._meta.highlight[u"title"][0]
        list.append(r)
    return template('results.html', list=list, count=results.total)
Example #49
def get_es_conn(es_url, index):
    """Create connection and create index if it doesn't exist."""

    conn = ES(es_url)
    if not conn.indices.exists_index(index):
        conn.indices.create_index(index)
    return conn
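A usage sketch; the URL, index name, and document are illustrative:

conn = get_es_conn('http://localhost:9200', 'airs')
conn.index({'id': 'doc-1'}, 'airs', 'airs', 'doc-1')  # pyes positional order: doc, index, doc_type, id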
Example #50
 def __init__(self, connection_string, elastic_name, storage, bulk=False,
              bulk_size=400):
     self.conn = ES(connection_string, bulk_size=bulk_size)
     self.bulk_size = bulk_size
     self.name = elastic_name
     self.storage = storage
     self.bulk = bulk
Example #51
    def __init__(self, serverInfo, proto = "http"):
        #serverInfo can be a json object
        #only connect pyes to master es node
        #in the case that other nodes are taken down
        #because http requests will fail
        # TODO: dynamic master node detection
        if isinstance(serverInfo, dict):
            self.ip = serverInfo["ip"]
            self.rest_username = serverInfo["username"]
            self.rest_password = serverInfo["password"]
            self.username = serverInfo["es_username"]
            self.password = serverInfo["es_password"]
            self.port = 9091 #serverInfo["port"]
        else:
            self.ip = serverInfo.ip
            self.rest_username = serverInfo.rest_username
            self.rest_password = serverInfo.rest_password
            self.username = serverInfo.es_username
            self.password = serverInfo.es_password
            self.port = 9091 # serverInfo.port

        self.baseUrl = "http://{0}:{1}/".format(self.ip, self.port)
        self.capiBaseUrl = self.baseUrl
        self.esHttpUrl = "http://{0}:9200".format(self.ip)
        self.http_port = str(int(self.port) + 109)
        self.proto = proto
        self.conn = ES(server=self.esHttpUrl)
        self.manager = managers.Cluster(self.conn)
        self.test_params = TestInputSingleton.input
        self.docs = None
Example #52
 def search_term(self, key, indices=["default"]):
     result = None
     params = {"term": {"_id": key}}
     query = ES.Search(params)
     row = self.conn.search(query, indices=indices)
     if row.total > 0:
         result = row[0]
     return result
Example #53
def searchCompletions(request):
    searchString = request.GET.get('searchString')
    maxCount = int(request.GET.get('maxCount'))
    conn = ES(['127.0.0.1:9200'])
    conn.default_indices = COMPLETION_INDEX
    conn.refresh(COMPLETION_INDEX)
    q = {"query" : {
                    "query_string" :{
                                    "fields" : ["searchTerm.partial"],
                                    "query" : searchString
                                    }
                    },
         "facets" : {
                    "facet" :{
                              "terms": {
                                        "fields" : [ "searchTerm"], 
                                        "size" : MAX_RESULT_SIZE
                                        }
                              }
                    },
         "size" : maxCount
        }
    try:
        query = json.dumps(q)
        url = "http://localhost:9200/%s/_search" % COMPLETION_INDEX
        response = urllib2.urlopen(url, query)
        result = json.loads(response.read())
        result_list = []
        done_list = []
        for res in result['hits']['hits']:
            if res['_source']['type'] != "Collections":
                result_list.append(res['_source'])
                res['_source']['count'] = 0
            elif res['_source']['searchTerm'] not in done_list:
                val = str(res['_source']['searchTerm']).lower()
                for term in result['facets']['facet']['terms']:
                    if val == term['term'] :
                        res['_source']['count'] = term['count']
                        done_list.append(res['_source']['searchTerm'])
                result_list.append(res['_source'])
        if len(result_list) == 0:
            result_list.append({"searchTerm" : "No Results"})    # for now just displaying no results when nothing is found in completion
        resp = json.dumps({"responseCode":"OK","requestParameters":{"searchString":searchString,"maxCount":unicode(maxCount)},"completions": result_list, "totalCount": unicode(maxCount)})
        return HttpResponse(resp)
    except Exception, ex:
        return HttpResponse('0')
Example #54
def count_documents():
    num_docs = cache.get('website.documents_count')

    if not num_docs:
        elastic = ES(settings.SEARCH_HOSTS)
        indices = elastic.get_indices()
        elastic.connection.close()

        indices = indices.values()
        num_docs = 0

        for item in indices:
            num_docs += item['num_docs']

        cache.set('website.documents_count', num_docs)

    return num_docs
Example #55
def get_es_conn(es_url, index, alias=None):
    """Create connection and create index if it doesn't exist."""

    conn = ES(es_url)
    if not conn.indices.exists_index(index):
        conn.indices.create_index(index)
        if alias is not None:
            conn.indices.add_alias(alias, [index])
    return conn
Example #56
def single_param_search(request):
    log_results = None
    es = ES()  # create Elasticsearch connection object
    if request.method == 'POST':  # if the search form is submitted
        # filter with search param and search tearm
        q1 = TermFilter(request.POST.get('searchby'),
                        request.POST.get('searchterm'))
        orq = ORFilter([q1])
        q = FilteredQuery(MatchAllQuery(), orq)
        log_results = es.search(
            q, indices=index_name,
            doc_types=type_name)  # get the filtered data from elasticsearch
    elif request.method == 'GET':  # get all packet when get the search page
        log_results = es.search(MatchAllQuery(),
                                indices=index_name,
                                doc_types=type_name)
    return render(request, 'single_param_search.html',
                  {'log_results': log_results})
Example #57
 def __init__(self, *args, **kwargs):
     self._dirty = set()
     # We have to wait for the elastic container to start or things go
     # sideways.
     # TODO: Check status properly somehow (straight HTTP request, perhaps)
     time.sleep(30)
     self._elastic = ES(ELASTIC_URL, max_retries=100)
     self._finalize = Finalize(self, self.sync, exitpriority=5)
     super(ControlPlaneScheduler, self).__init__(*args, **kwargs)
Example #58
class ElasticSearchPipeline(object):
    def __init__(self, settings):
        basic_auth = {'username': settings.get('ELASTICSEARCH_USERNAME'),
                      'password': settings.get('ELASTICSEARCH_PASSWORD')}
        if settings.get('ELASTICSEARCH_PORT'):
            uri = "%s:%d" % (settings.get('ELASTICSEARCH_SERVER'), settings.get('ELASTICSEARCH_PORT'))
        else:
            uri = "%s" % (settings.get('ELASTICSEARCH_SERVER'))
        self.es = ES([uri], basic_auth=basic_auth)
        self.settings = settings

    @classmethod
    def from_crawler(cls, crawler):
        pipe = cls(crawler.settings)
        return pipe

    def process_item(self, item, spider):
        if self.__get_uniq_key() is None:
            log.info("ELASTICSEARCH_UNIQ_KEY is NONE")
            self.es.index(dict(item), self.settings.get('ELASTICSEARCH_INDEX'), self.settings.get('ELASTICSEARCH_TYPE'),
                          id=item['id'], op_type='create', )
        else:
            self.es.index(dict(item), self.settings.get('ELASTICSEARCH_INDEX'), self.settings.get('ELASTICSEARCH_TYPE'),
                          self._get_item_key(item))
        log.debug("Item send to Elastic Search %s" %
                  (self.settings.get('ELASTICSEARCH_INDEX')), spider=spider)
        return item

    def _get_item_key(self, item):
        uniq = self.__get_uniq_key()

        if isinstance(uniq, list):
            values = [item[key] for key in uniq]
            value = ''.join(values)
        else:
            value = uniq

        return hashlib.sha1(value).hexdigest()

    def __get_uniq_key(self):
        if not self.settings.get('ELASTICSEARCH_UNIQ_KEY'):
            return None
        return self.settings.get('ELASTICSEARCH_UNIQ_KEY')
Example #59
 def set_connection(self, project):
     logger.debug('Setting up connection')
     if self.es_conn is None:
         try:
             cs = self.get_option('es_conn_string', project)
             logger.debug('Creating connection to %s', cs)
             self.es_conn = ES(cs)
         except Exception, e:
             logger.warning('Error setting up the connection: %s', e)
             return