class ElasticSearchPipeline(object):
    def __init__(self):
        self.settings = get_project_settings()
        basic_auth = {'username': self.settings['ELASTICSEARCH_USERNAME'],
                      'password': self.settings['ELASTICSEARCH_PASSWORD']}
        if self.settings['ELASTICSEARCH_PORT']:
            uri = "%s:%d" % (self.settings['ELASTICSEARCH_SERVER'],
                             self.settings['ELASTICSEARCH_PORT'])
        else:
            uri = "%s" % (self.settings['ELASTICSEARCH_SERVER'])
        self.es = ES([uri], basic_auth=basic_auth)

    def process_item(self, item, spider):
        if self.__get_uniq_key() is None:
            log.msg("ELASTICSEARCH_UNIQ_KEY is NONE")
            self.es.index(dict(item),
                          self.settings['ELASTICSEARCH_INDEX'],
                          self.settings['ELASTICSEARCH_TYPE'],
                          id=item['id'],
                          op_type='create',)
        else:
            log.msg("Generating SHA1")
            self.es.index(dict(item),
                          self.settings['ELASTICSEARCH_INDEX'],
                          self.settings['ELASTICSEARCH_TYPE'],
                          hashlib.sha1(item[self.__get_uniq_key()]).hexdigest())
        log.msg("Item sent to Elasticsearch %s" % (self.settings['ELASTICSEARCH_INDEX']),
                level=log.DEBUG, spider=spider)
        return item

    def __get_uniq_key(self):
        if not self.settings['ELASTICSEARCH_UNIQ_KEY'] or self.settings['ELASTICSEARCH_UNIQ_KEY'] == "":
            return None
        return self.settings['ELASTICSEARCH_UNIQ_KEY']
def try_conn(self):
    """Try a new connection to Elasticsearch."""
    try:
        self.conn = ES(self.conn_strs, timeout=5)
        self.connected = True
    except NoServerAvailable:
        print("Error connecting to elasticsearch for logging")
def __init__(self, serverInfo, proto="http"): #serverInfo can be a json object #only connect pyes to master es node #in the case that other nodes are taken down #because http requests will fail # TODO: dynamic master node detection if isinstance(serverInfo, dict): self.ip = serverInfo["ip"] self.rest_username = serverInfo["username"] self.rest_password = serverInfo["password"] self.username = serverInfo["es_username"] self.password = serverInfo["es_password"] self.port = 9091 #serverInfo["port"] else: self.ip = serverInfo.ip self.rest_username = serverInfo.rest_username self.rest_password = serverInfo.rest_password self.username = serverInfo.es_username self.password = serverInfo.es_password self.port = 9091 # serverInfo.port self.baseUrl = "http://{0}:{1}/".format(self.ip, self.port) self.capiBaseUrl = self.baseUrl self.esHttpUrl = "http://{0}:9200".format(self.ip) self.http_port = str(int(self.port) + 109) self.proto = proto self.conn = ES(server=self.esHttpUrl) self.manager = managers.Cluster(self.conn) self.test_params = TestInputSingleton.input self.docs = None
class ElasticSearchPipeline(object):
    def __init__(self):
        from pyes import ES
        self.settings = get_project_settings()
        if self.settings['ELASTICSEARCH_PORT']:
            uri = "%s:%d" % (self.settings['ELASTICSEARCH_SERVER'],
                             self.settings['ELASTICSEARCH_PORT'])
        else:
            uri = "%s" % (self.settings['ELASTICSEARCH_SERVER'])
        self.es = ES([uri])

    def process_item(self, item, spider):
        if self.__get_uniq_key() is None:
            self.es.index(dict(item),
                          self.settings['ELASTICSEARCH_INDEX'],
                          self.settings['ELASTICSEARCH_TYPE'],
                          id=item['id'],
                          op_type='create',)
        else:
            self.es.index(dict(item),
                          self.settings['ELASTICSEARCH_INDEX'],
                          self.settings['ELASTICSEARCH_TYPE'],
                          self._get_item_key(item))
        return item

    def _get_item_key(self, item):
        uniq = self.__get_uniq_key()
        if isinstance(uniq, list):
            values = [item[key] for key in uniq]
            value = ''.join(values)
        else:
            value = item[uniq]  # hash the item's value, not the key name itself
        return hashlib.sha1(value).hexdigest()

    def __get_uniq_key(self):
        if not self.settings['ELASTICSEARCH_UNIQ_KEY'] or self.settings['ELASTICSEARCH_UNIQ_KEY'] == "":
            return None
        return self.settings['ELASTICSEARCH_UNIQ_KEY']
class StashHandler(logging.Handler):
    def __init__(self, constr, whitelist=None, blacklist=None):
        logging.Handler.__init__(self)
        self.conn = ES(constr)
        if blacklist is None:
            blacklist = set()
        self.whitelist = whitelist
        self.blacklist = blacklist
        self.record_type = 'record'

    @property
    def index_name(self):
        return 'logstash-' + datetime.date.today().strftime('%Y.%m.%d')

    def emit(self, record):
        if self.whitelist is None:
            d = {k: record.__dict__[k] for k in record.__dict__
                 if k not in self.blacklist}
        else:
            d = {k: record.__dict__[k] for k in record.__dict__
                 if k in self.whitelist and k not in self.blacklist}
        entry = {
            "@fields": d,
            "@message": record.msg,
            "@source": "gelf://localhost",
            "@source_host": "gelf://localhost",
            "@source_path": "/",
            "@tags": [],
            "@timestamp": datetime.datetime.utcnow().isoformat(),
            "@type": self.record_type}
        self.conn.index(entry, self.index_name, self.record_type)
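# Minimal usage sketch for the StashHandler above: attach it to a standard
# logger so records land in today's logstash-* index. The host and whitelist
# values are placeholders, not taken from the original code.
import logging

handler = StashHandler("127.0.0.1:9200",
                       whitelist={"name", "levelname", "pathname"})
handler.setLevel(logging.INFO)
logger = logging.getLogger("myapp")
logger.addHandler(handler)
logger.info("this record is indexed into today's logstash-* index")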
def get_related_videos(video):
    related_videos = []
    conn = ES(['127.0.0.1:9200'])
    conn.default_indices = VIDEO_INDEX
    conn.refresh(VIDEO_INDEX)
    q = {
        "query": {
            "bool": {
                "should": [
                    {"term": {"uid": video.uid}},
                    {"terms": {"category": [video.category]}},
                    {"terms": {"topic": [video.topic]}},
                    {"terms": {"language": [video.language]}}
                ],
                "minimum_should_match": 1
            }
        }
    }
    try:
        query = json.dumps(q)
        url = "http://localhost:9200/%s/_search" % VIDEO_INDEX
        response = urllib2.urlopen(url, query)
        result = json.loads(response.read())
        for res in result['hits']['hits']:
            related_videos.append(res['_source'])
    except Exception:
        pass
    return related_videos
def multi_param_search(request):
    log_results = None
    es = ES()  # create the Elasticsearch client
    if request.method == 'POST':  # the search form was submitted
        filters_list = []
        # Loop over each search param and add it to the filter list if it has a value.
        for param in ["version", "ip_header_length", "ttl", "protocol",
                      "source_address", "destination_address", "source_port",
                      "dest_port", "sequence_number", "acknowledgement",
                      "tcp_header_length", "data", "datetime"]:
            if request.POST.get(param) != '':
                q_param = TermFilter(param, request.POST.get(param))
                filters_list.append(q_param)
        if len(filters_list) != 0:  # if there are filter params, run the filtered search
            orq = ANDFilter(filters_list)
            q = FilteredQuery(MatchAllQuery(), orq)
            log_results = es.search(q, indices=index_name, doc_types=type_name)
        else:
            log_results = None
    elif request.method == 'GET':  # return all packets when loading the search page
        log_results = es.search(MatchAllQuery(), indices=index_name, doc_types=type_name)
    return render(request, 'multi_param_search.html', {'log_results': log_results})
def index():
    """Index JSON documents, one per line, from the file given on the command line."""
    import time
    fptr = open(sys.argv[1], 'rb')
    line_count = 0
    conn = ES(["localhost:9200"])
    #conn.create_index('test-index')
    start = time.clock()
    numb_exceptions = 0
    for line in fptr:
        if (line_count % 10000) == 0:
            end = time.clock()
            minutes = (end - start) / 60.0
            print 'Done with %d took %f min. ' % (line_count, minutes)
            print 'number of exceptions ', numb_exceptions
        line_count += 1
        data = json.loads(line)
        post_id = int(data['post_id'])
        if post_id and data:
            try:
                conn.index(data, "test-index", "test-type", post_id)
            except Exception:
                numb_exceptions += 1
                continue
    print 'number of exceptions ', numb_exceptions
def get_related_collections(collection):
    related_collections = []
    conn = ES(['127.0.0.1:9200'])
    conn.default_indices = FACET_INDEX
    conn.refresh(FACET_INDEX)
    q = {
        "query": {
            "bool": {
                "must_not": {"term": {"uid": collection.uid}},
                "should": [
                    {"terms": {"subject": [collection.subject]}},
                    {"terms": {"topic": [collection.topic]}},
                ],
                "minimum_should_match": 1,
            }
        }
    }
    try:
        query = json.dumps(q)
        url = "http://localhost:9200/%s/_search" % FACET_INDEX
        response = urllib2.urlopen(url, query)
        result = json.loads(response.read())
        for res in result['hits']['hits']:
            related_collections.append(res['_source'])
    except Exception:
        pass
    return related_collections
def facets(host='localhost:9200', facet_terms=['bibleverse'], _type='habakkuk',
           date_filter=[], size=10):
    ret = {}
    conn = ES(host)
    q = MatchAllQuery()
    if date_filter:
        start, end = date_filter
        q = FilteredQuery(q, RangeFilter(qrange=ESRange('created_at_date',
                                                        start.isoformat(),
                                                        end.isoformat(),
                                                        include_upper=False)))
    q = q.search(size=0)
    for term in facet_terms:
        q.facet.add_term_facet(term, order='count', size=size)

    es_logger.info(q.serialize())

    resultset = conn.search(query=q, indices=_type + '-*', doc_types=[_type])
    for facet in resultset.facets:
        ret[facet] = []
        for row in resultset.facets[facet]['terms']:
            ret[facet].append({"value": row['term'], "count": row['count']})

    logger.debug("facets return|'%s'" % json.dumps(ret))
    return ret
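# Usage sketch for facets() above: facet on 'bibleverse' over a one-month
# window. The dates are illustrative; date_filter expects a (start, end)
# pair of objects with an .isoformat() method, e.g. datetime.date.
import datetime

window = (datetime.date(2014, 1, 1), datetime.date(2014, 2, 1))
top_verses = facets(host='localhost:9200', facet_terms=['bibleverse'],
                    date_filter=window, size=5)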
def handle(self, *args, **kwargs):
    elastic = ES(settings.SEARCH_HOSTS)
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(elastic.get_indices())
    elastic.connection.close()
def get_related_collections(collection, featured):
    related_collections = []
    conn = ES(["127.0.0.1:9200"])
    conn.default_indices = FACET_INDEX
    conn.refresh(FACET_INDEX)
    q = {
        "query": {
            "bool": {
                "must_not": {"term": {"uid": collection.uid}},
                "should": [{"terms": {"subject": [collection.subject]}},
                           {"terms": {"topic": [collection.topic]}}],
                "minimum_should_match": 1,
            }
        }
    }
    if featured:
        q = {
            "query": {
                "bool": {
                    "must_not": {"term": {"uid": collection.uid}},
                    "should": [{"term": {"featured": True}}],
                    "minimum_should_match": 1,
                }
            }
        }
    try:
        query = json.dumps(q)
        url = "http://localhost:9200/%s/_search" % FACET_INDEX
        response = urllib2.urlopen(url, query)
        result = json.loads(response.read())
        for res in result["hits"]["hits"]:
            related_collections.append(res["_source"])
    except Exception:
        pass
    return related_collections
def term_facet(host='localhost:9200', terms=['bibleverse'], _type='habakkuk',
               date_filter=[], size=10):
    ret = []
    conn = ES(host)
    q = MatchAllQuery()
    if date_filter:
        start, end = date_filter
        q = FilteredQuery(q, RangeFilter(qrange=ESRange('created_at_date',
                                                        start, end,
                                                        include_upper=False)))
    q = q.search(size=0)
    for term in terms:
        q.facet.add_term_facet(term, order='count', size=size)

    print json.dumps(json.loads(q.to_search_json()), indent=2)

    resultset = conn.search(query=q, indices=_type + '-*', doc_types=[_type])
    for facet in resultset.facets:
        print "Total", facet, resultset.facets[facet]['total']
        for row in resultset.facets[facet]['terms']:
            print "\t", row['term'], row['count']
            ret.append((facet, row['term']))
    return ret
class ElasticSearchPipeline(object):
    def __init__(self):
        self.settings = get_project_settings()
        basic_auth = {'username': self.settings['ELASTICSEARCH_USERNAME'],
                      'password': self.settings['ELASTICSEARCH_PASSWORD']}
        if self.settings['ELASTICSEARCH_PORT']:
            uri = "%s:%d" % (self.settings['ELASTICSEARCH_SERVER'],
                             self.settings['ELASTICSEARCH_PORT'])
        else:
            uri = "%s" % (self.settings['ELASTICSEARCH_SERVER'])
        self.es = ES([uri], basic_auth=basic_auth)

    def index_item(self, item):
        uniq_key = self.settings['ELASTICSEARCH_UNIQ_KEY']
        if uniq_key:
            local_id = hashlib.sha1(item[uniq_key]).hexdigest()
            log.msg("Generated unique key %s" % local_id,
                    level=self.settings['ELASTICSEARCH_LOG_LEVEL'])
            op_type = 'none'
        else:
            op_type = 'create'
            local_id = item['id']

        self.es.index(dict(item),
                      self.settings['ELASTICSEARCH_INDEX'],
                      self.settings['ELASTICSEARCH_TYPE'],
                      id=local_id,
                      op_type=op_type)
def main(argv):
    start = 1
    if len(sys.argv) > 1 and sys.argv[1]:
        start = int(sys.argv[1])

    bulksize = 1000
    es = ES(("http", "localhost", 9200), bulk_size=bulksize)

    c0 = 0
    t0 = time.time()
    c1 = 0
    t1 = time.time()
    for n in range(start, start + 1000000):
        # One random string per field 'a'..'z'.
        doc = {letter: random_string_generator()
               for letter in 'abcdefghijklmnopqrstuvwxyz'}
        result = es.index(doc, 'pyindex', 'pytype', n, bulk=True)
        c0 = c0 + bulksize
        c1 = c1 + bulksize
        if result:
            d0 = (time.time() - t0)
            d1 = (time.time() - t1)
            now = datetime.datetime.utcnow()
            print("{0},{1},{2},{3},{4},{5},{6},{7}".format(
                now.strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
                result.took,
                c0, d0, c0 / (d0 * bulksize),
                c1, d1, c1 / (d1 * bulksize)))
            c1 = 0
            t1 = time.time()
def es_index(self):
    conn = ES(settings.ES_SERVERS, basic_auth=settings.ES_AUTH)
    conn.index(
        doc=self.get_search_kwargs(),
        index=self.tenant.slug,
        doc_type=self.Meta.document_type,
        id=unicode(self.id)
    )
def es_deindex(self):
    conn = ES(settings.ES_SERVERS, basic_auth=settings.ES_AUTH)
    try:
        # Delete by the same id that es_index() used.
        conn.delete(index=self.tenant.slug,
                    doc_type=self.Meta.document_type,
                    id=unicode(self.id))
    except:
        pass
def get_es(**overrides):
    """Return one pyes.es.ES object

    :arg overrides: Allows you to override defaults to create the ES.

        Things you can override:

        * default_indexes
        * timeout
        * dump_curl

        Values for these correspond with the arguments to pyes.es.ES.

        For example, if you wanted to create an ES for indexing with a
        timeout of 30 seconds, you'd do:

        >>> es = get_es(timeout=30)

        If you wanted to create an ES for debugging that dumps curl
        commands to stdout, you could do:

        >>> class CurlDumper(object):
        ...     def write(self, s):
        ...         print s
        ...
        >>> es = get_es(dump_curl=CurlDumper())
    """
    if overrides or not hasattr(_local, 'es'):
        defaults = {
            'default_indexes': DEFAULT_INDEXES,
            'timeout': DEFAULT_TIMEOUT,
            'dump_curl': DEFAULT_DUMP_CURL,
        }
        defaults.update(overrides)
        if (not thrift_enable and
                not settings.ES_HOSTS[0].split(':')[1].startswith('92')):
            raise ValueError('ES_HOSTS is not set to a valid port starting '
                             'with 9200-9299 range. Other ports are valid '
                             'if using pythrift.')
        es = ES(settings.ES_HOSTS, **defaults)

        # pyes 0.15 does this lame thing where it ignores dump_curl in
        # the ES constructor and always sets it to None. So what we do
        # is set it manually after the ES has been created and
        # defaults['dump_curl'] is truthy. This might not work for all
        # values of dump_curl.
        if VERSION[0:2] == (0, 15):
            es.dump_curl = (defaults['dump_curl']
                            if defaults['dump_curl'] else None)

        # Cache the es if there weren't any overrides.
        if not overrides:
            _local.es = es
    else:
        es = _local.es

    return es
def get_es(hosts=None, default_indexes=None, timeout=None, dump_curl=None,
           **settings):
    """Create an ES object and return it.

    :arg hosts: list of uris; ES hosts to connect to, defaults to
        ``['localhost:9200']``
    :arg default_indexes: list of strings; the default indexes to use,
        defaults to 'default'
    :arg timeout: int; the timeout in seconds, defaults to 5
    :arg dump_curl: function or None; function that dumps curl output,
        see docs, defaults to None
    :arg settings: other settings to pass into `pyes.es.ES`

    Examples:

    >>> es = get_es()
    >>> es = get_es(hosts=['localhost:9200'])
    >>> es = get_es(timeout=30)  # good for indexing
    >>> es = get_es(default_indexes=['sumo_prod_20120627'])
    >>> class CurlDumper(object):
    ...     def write(self, text):
    ...         print text
    ...
    >>> es = get_es(dump_curl=CurlDumper())

    """
    # Cheap way of de-None-ifying things
    hosts = hosts or DEFAULT_HOSTS
    default_indexes = default_indexes or DEFAULT_INDEXES
    timeout = timeout if timeout is not None else DEFAULT_TIMEOUT
    dump_curl = dump_curl or DEFAULT_DUMP_CURL

    if not isinstance(default_indexes, list):
        default_indexes = [default_indexes]

    es = ES(hosts,
            default_indexes=default_indexes,
            timeout=timeout,
            dump_curl=dump_curl,
            **settings)

    # pyes 0.15 does this lame thing where it ignores dump_curl in
    # the ES constructor and always sets it to None. So what we do
    # is set it manually after the ES has been created and
    # defaults['dump_curl'] is truthy. This might not work for all
    # values of dump_curl.
    if PYES_VERSION[0:2] == (0, 15) and dump_curl is not None:
        es.dump_curl = dump_curl

    return es
def searchCompletions(request):
    searchString = request.GET.get('searchString')
    maxCount = int(request.GET.get('maxCount'))
    conn = ES(['127.0.0.1:9200'])
    conn.default_indices = COMPLETION_INDEX
    conn.refresh(COMPLETION_INDEX)
    q = {
        "query": {
            "query_string": {
                "fields": ["searchTerm.partial"],
                "query": searchString
            }
        },
        "facets": {
            "facet": {
                "terms": {
                    "fields": ["searchTerm"],
                    "size": MAX_RESULT_SIZE
                }
            }
        },
        "size": maxCount
    }
    try:
        query = json.dumps(q)
        url = "http://localhost:9200/%s/_search" % COMPLETION_INDEX
        response = urllib2.urlopen(url, query)
        result = json.loads(response.read())
        result_list = []
        done_list = []
        for res in result['hits']['hits']:
            if res['_source']['type'] != "Collections":
                result_list.append(res['_source'])
                res['_source']['count'] = 0
            elif res['_source']['searchTerm'] not in done_list:
                val = str(res['_source']['searchTerm']).lower()
                for term in result['facets']['facet']['terms']:
                    if val == term['term']:
                        res['_source']['count'] = term['count']
                done_list.append(res['_source']['searchTerm'])
                result_list.append(res['_source'])
        if len(result_list) == 0:
            # for now just display "No Results" when nothing is found in completion
            result_list.append({"searchTerm": "No Results"})
        resp = json.dumps({
            "responseCode": "OK",
            "requestParameters": {
                "searchString": searchString,
                "maxCount": unicode(maxCount)
            },
            "completions": result_list,
            "totalCount": unicode(maxCount)
        })
        return HttpResponse(resp)
    except Exception, ex:
        return HttpResponse('0')
def tearDown(self):
    self.log.warning("before tearDown es")
    self._unlink_es_cluster()
    self._stop_es_replication()

    if self.es_host is not None:
        conn = ES(self.es_host + ":9200")
        conn.delete_index_if_exists("default")

    super(ElasticSearchSupport, self).tearDown()
    self.log.warning("after tearDown es")
def find_BID_in_SBN(bid, es_server="localhost:9200"):
    sbn_bid = to_iccu_bid(bid)
    q = TermQuery('codiceIdentificativo', sbn_bid)
    es_conn = ES(server=es_server)
    resultset = list(es_conn.search(query=q, indices="iccu"))
    if len(resultset) > 0:
        return resultset
    else:
        return None
def connect(self, connection_pool=1, bulk_size=10):
    update_connection_pool(connection_pool)
    try:
        self.connection = ES(self.servers, bulk_size=bulk_size)
    except NoServerAvailable:
        self._log.error('Failed to connect to elastic search server')
        return False
    return True
class BaseElasticSearchClient(BaseClient):
    def __init__(self, servers, index):
        """
        @param servers: Make sure to include the port with the server address
        @param index: Document index
        @return:
        """
        super(BaseElasticSearchClient, self).__init__()
        self.connection = None
        self.servers = servers
        self.index = index if type(index) is list else [index]

    def connect(self, connection_pool=1):
        update_connection_pool(connection_pool)
        try:
            self.connection = ES(self.servers)
        except NoServerAvailable:
            self._log.error('Failed to connect to elastic search server')
            return False
        return True

    def close(self):
        self.connection = None

    def _create_term_query(self, must_list):
        # TODO: add remaining conditional list functionality.
        query = BoolQuery()
        for term in must_list:
            query.add_must(term)
        return query  # return was missing; find() relies on the built query

    def find_term(self, name, value, size=10):
        if not self.connection:
            return

        query = TermQuery(name, value)
        return self.connection.search(query=Search(query, size=size),
                                      indices=self.index)

    def find(self, filter_terms, size=10, doc_types=None):
        if not self.connection:
            return

        query = self._create_term_query(must_list=filter_terms)
        return self.connection.search(query=Search(query, size=size),
                                      indices=self.index,
                                      doc_types=doc_types)

    def find_one(self, filter_terms, size=10, doc_types=None):
        if not self.connection:
            return

        results = self.find(filter_terms=filter_terms, size=size,
                            doc_types=doc_types)
        return results[0] if len(results) > 0 else None
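# Usage sketch for BaseElasticSearchClient above. The host, index, and field
# names are illustrative; TermQuery is the same pyes class the client uses
# internally.
client = BaseElasticSearchClient(servers=["localhost:9200"], index="events")
if client.connect(connection_pool=2):
    # Single-field lookup.
    active = client.find_term("status", "active", size=5)
    # AND of several term queries via find_one().
    first = client.find_one([TermQuery("status", "active"),
                             TermQuery("region", "us-west")])
    client.close()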
class ElasticSearchPipeline(object):
    def __init__(self):
        self.conn = ES('localhost:9200')
        # self.file = open('urls.csv', 'wb')
        # self.file.write('spider,url' + '\n')

    def process_item(self, item, spider):
        # self.file.write(spider.name + ',' + spider.start_urls[0] + '\n')
        self.conn.index(dict(item), "qrator", spider.name)
        return item
class ElasticSearchPipeline(object):
    def __init__(self):
        self.settings = get_project_settings()
        basic_auth = {
            'username': self.settings['ELASTICSEARCH_USERNAME'],
            'password': self.settings['ELASTICSEARCH_PASSWORD']
        }
        if self.settings['ELASTICSEARCH_PORT']:
            uri = "%s:%d" % (self.settings['ELASTICSEARCH_SERVER'],
                             self.settings['ELASTICSEARCH_PORT'])
        else:
            uri = "%s" % (self.settings['ELASTICSEARCH_SERVER'])
        self.es = ES([uri], basic_auth=basic_auth)

    def process_item(self, item, spider):
        if self.__get_uniq_key() is None:
            log.msg("ELASTICSEARCH_UNIQ_KEY is NONE")
            self.es.index(
                dict(item),
                self.settings['ELASTICSEARCH_INDEX'],
                self.settings['ELASTICSEARCH_TYPE'],
                id=item['id'],
                op_type='create',
            )
        else:
            self.es.index(dict(item),
                          self.settings['ELASTICSEARCH_INDEX'],
                          self.settings['ELASTICSEARCH_TYPE'],
                          self._get_item_key(item))
        log.msg("Item sent to Elasticsearch %s" % (self.settings['ELASTICSEARCH_INDEX']),
                level=log.DEBUG, spider=spider)
        return item

    def _get_item_key(self, item):
        uniq = self.__get_uniq_key()
        if isinstance(uniq, list):
            values = [item[key] for key in uniq]
            value = ''.join(values)
        else:
            value = item[uniq]  # hash the item's value, not the key name itself
        return hashlib.sha1(value).hexdigest()

    def __get_uniq_key(self):
        if not self.settings['ELASTICSEARCH_UNIQ_KEY'] or self.settings['ELASTICSEARCH_UNIQ_KEY'] == "":
            return None
        return self.settings['ELASTICSEARCH_UNIQ_KEY']
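# For context: a pipeline like the one above is normally enabled from the
# Scrapy project settings. The import path below is hypothetical; the setting
# names match the ones read in __init__ and process_item.
ITEM_PIPELINES = {
    'myproject.pipelines.ElasticSearchPipeline': 300,
}
ELASTICSEARCH_SERVER = 'localhost'
ELASTICSEARCH_PORT = 9200
ELASTICSEARCH_USERNAME = ''
ELASTICSEARCH_PASSWORD = ''
ELASTICSEARCH_INDEX = 'scrapy_items'
ELASTICSEARCH_TYPE = 'item'
ELASTICSEARCH_UNIQ_KEY = 'url'  # may also be a list of fields; leave empty to index by item['id']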
def job_redirect(request, slug, source, job_id): if request.method == "GET" and request.GET.has_key("redirect"): try: elastic = ES(settings.SEARCH_HOSTS) data = elastic.get(source, "job", job_id) elastic.connection.close() return HttpResponseRedirect(data["_source"]["details_url"]) except NotFoundException: raise Http404 return direct_to_template(request, "pages/redirect.html")
def __init__(self, name):
    log = open(name, "wb")
    self.log = log
    self.conn = ES(("http", "127.0.0.1", 9200), timeout=300.0,
                   log_curl=True, dump_curl=log)
    self.index_name = "test-index"
    self.document_type = "test-type"
    self.conn.delete_index_if_exists(self.index_name)
    self.init_default_index()
def setup_store():
    connection = ES(settings.THUMBNAIL_ELASTIC_SEARCH_SERVERS)
    try:
        connection.create_index_if_missing(settings.THUMBNAIL_ELASTIC_SEARCH_INDEX)
    except:
        pass
    try:
        connection.put_mapping(settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE,
                               settings.THUMBNAIL_ELASTIC_SEARCH_MAPPING,
                               indexes=[settings.THUMBNAIL_ELASTIC_SEARCH_INDEX, ])
    except:
        pass
def __init__(self, url, auto_commit=True, unique_key='_id'):
    """Verify Elastic URL and establish a connection."""
    if verify_url(url) is False:
        raise SystemError
    self.elastic = ES(server=url)
    self.auto_commit = auto_commit
    self.doc_type = 'string'  # default type is string, change if needed
    self.unique_key = unique_key
    if auto_commit:
        self.run_auto_commit()
def main(options):
    es = ES([options.es_server])
    try:
        es.create_index_if_missing('bzcache')
    except ElasticSearchException:
        # create_index_if_missing is supposed not to raise if the index
        # already exists, but with the ancient pyes / ES server versions
        # we're using it still does.
        pass

    # re-cache all intermittent-failure bugs
    bzcache = BugzillaCache(es_server=options.es_server)
    bzcache.index_bugs_by_keyword('intermittent-failure')
def callback(body, message):
    """Do actual work."""
    logger.info("body in callback() is %s" % body)

    # pull lat/lon, time
    path = body
    sd = SD(path)
    lat = N.array(sd.select('Latitude').get())
    lon = N.array(sd.select('Longitude').get())
    t = N.array(sd.select('Time').get())
    sd.end()
    #logger.info("lat: %s" % str(lat.shape))
    #logger.info("lon: %s" % str(lon.shape))
    #logger.info("time: %s" % str(t.shape))

    # build metadata json
    id = os.path.basename(path)
    md = {
        "id": id,
        "dataset": "AIRX2RET",
        "starttime": t[0, 0],
        "endtime": t[44, 29],
        "location": {
            "coordinates": [[
                [lon[0, 0], lat[0, 0]],
                [lon[0, 29], lat[0, 29]],
                [lon[44, 29], lat[44, 29]],
                [lon[44, 0], lat[44, 0]],
                [lon[0, 0], lat[0, 0]],
            ]],
            "type": "polygon"
        },
        "urls": "http://mozart/data/public/products/%s" % id
    }

    # publish
    pub_dir = '/data/public/products'
    ensure_dir(pub_dir)
    shutil.move(path, os.path.join(pub_dir, id))

    # insert into ElasticSearch
    index = doctype = 'airs'
    conn = ES('http://localhost:9200')
    mapping = json.load(open('grq_mapping.json'))
    if not conn.indices.exists_index(index):
        conn.indices.create_index(index, mapping)
        conn.indices.put_mapping(doctype, mapping, index)
    ret = conn.index(md, index, doctype, md['id'])

    message.ack()
def processData(esurl, esindex, estype, shpPath, simplify, tolerance, startfrom):
    # Open a file for reading
    try:
        with open(shpPath):
            pass
    except IOError:
        print 'Unable to locate file: ' + shpPath

    # open the es connection
    from pyes import ES
    conn = ES(esurl, timeout=60, bulk_size=10)

    # check that a tolerance is passed when simplifying.
    if simplify == True:
        if tolerance == None:
            raise ValueError('You must pass a valid tolerance if simplifying geometry')

    # use fiona to open the shapefile and read it
    try:
        with fiona.open(shpPath) as source:
            for f in source:
                featid = int(f['id'])
                if featid > startfrom:
                    # grab the geom
                    from shapely.geometry import shape
                    geom = shape(f['geometry'])
                    # simplify if required
                    if validateGeometry(geom):
                        if simplify == True:
                            geom = simplifyGeometry(geom, tolerance)
                        # if the geom is valid then push it into es
                        if validateGeometry(geom):
                            data = json.dumps(f)
                            key = f['id']
                            conn.index(data, esindex, estype, key, bulk=True)
                        else:
                            logging.error('Invalid Geometry: ' + f['id'])
    except:
        raise
class Importer(object):
    base_filename = "TicketNetworkDataFeed"
    model_map = {
        "performers": {
            "file": "Performers.csv",
            "model": Performer,
        },
        "events": {
            "file": "Events.csv",
            "model": Event,
        },
        "venues": {
            "file": "Venues.csv",
            "model": Venue,
        }
    }

    def __init__(self, data_type, csv_path="/tmp/",
                 es_hosts=("http://localhost:9200", )):
        self.data_type = data_type
        self.doc_type = "ticketnetwork_%s" % self.data_type
        self.csv_path = csv_path
        self.es = ES(es_hosts)

    def model(self):
        return self.model_map[self.data_type]["model"]

    def filepath(self):
        return os.path.join(
            self.csv_path,
            '-'.join([self.base_filename, self.model_map[self.data_type]["file"]]))

    def __call__(self, *args, **kwargs):
        with open(self.filepath()) as f:
            reader = DictReader(f)
            for entry in reader:
                sanitize(entry)
                model = self.model()(entry)
                d = model.dict()
                self.es.index(d, "oedi_sources", self.doc_type, model.hash(),
                              bulk=True)
        self.es.flush_bulk(True)
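# Usage sketch for the Importer above: index the events CSV feed. The
# data_type must be one of the model_map keys; csv_path and es_hosts shown
# here are the defaults, spelled out for clarity.
importer = Importer("events", csv_path="/tmp/",
                    es_hosts=("http://localhost:9200", ))
importer()  # reads TicketNetworkDataFeed-Events.csv and bulk-indexes it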
def _ensure_is_connected(self):
    if not self._is_connected:
        try:
            port = int(self.settings_dict["PORT"])
        except ValueError:
            raise ImproperlyConfigured("PORT must be an integer")

        self.db_name = self.settings_dict["NAME"]
        self._connection = ES(
            "%s:%s" % (self.settings_dict["HOST"], port),
            decoder=Decoder,
            encoder=Encoder,
            autorefresh=True,
            default_indices=[self.db_name],
        )
        self._db_connection = self._connection

        # auto index creation: check if to remove
        try:
            self._connection.create_index(self.db_name)
        except:
            pass

        # We're done!
        self._is_connected = True
def conn(self):
    if self.tdata.conn is None:
        self.tdata.conn = ES(self.registry.connection_string,
                             bulk_size=self.bulk_size,
                             max_retries=self.max_retries,
                             timeout=self.timeout)
    return self.tdata.conn
def setUp(self):
    self.es_host = None
    self.es_cluster_name = None
    self._state = []
    super(ElasticSearchSupport, self).setUp()
    self.es_host = self.input.param("es_host", "127.0.0.1")
    self.es_port = self.input.param("es_port", 9091)

    conn = ES(self.es_host + ":9200")
    if not self.input.param("skip_cleanup", True) or self.case_number == 1:
        conn.delete_index_if_exists("default")
        conn.create_index("default")
        self.log.warning("waiting for ES index to be ready to use")
        time.sleep(30)

    self._link_es_cluster()
    self._start_es_replication()
    self.log.warning("after setUp es")
def search(searchkey=u"电影"): conn = ES('127.0.0.1:9200') # TextQuery会对searchkey进行分词 qtitle = TextQuery("title", searchkey) h = HighLighter(['<b>'], ['</b>'], fragment_size=500) # 多字段搜索(must=>and,should=>or),高亮,结果截取(分页),排序 q = Search(BoolQuery(should=[qtitle]), highlight=h, start=0, size=3, sort={'id': {'order': 'asc'}}) q.add_highlight("title") results = conn.search(q, "zhihu", "answer") list = [] for r in results: if("title" in r._meta.highlight): r['title'] = r._meta.highlight[u"title"][0] list.append(r) return template('results.html', list=list, count=results.total)
def get_es_conn(es_url, index):
    """Create connection and create index if it doesn't exist."""
    conn = ES(es_url)
    if not conn.indices.exists_index(index):
        conn.indices.create_index(index)
    return conn
def __init__(self, connection_string, elastic_name, storage, bulk=False,
             bulk_size=400):
    self.conn = ES(connection_string, bulk_size=bulk_size)
    self.bulk_size = bulk_size
    self.name = elastic_name
    self.storage = storage
    self.bulk = bulk
def __init__(self, serverInfo, proto = "http"): #serverInfo can be a json object #only connect pyes to master es node #in the case that other nodes are taken down #because http requests will fail # TODO: dynamic master node detection if isinstance(serverInfo, dict): self.ip = serverInfo["ip"] self.rest_username = serverInfo["username"] self.rest_password = serverInfo["password"] self.username = serverInfo["es_username"] self.password = serverInfo["es_password"] self.port = 9091 #serverInfo["port"] else: self.ip = serverInfo.ip self.rest_username = serverInfo.rest_username self.rest_password = serverInfo.rest_password self.username = serverInfo.es_username self.password = serverInfo.es_password self.port = 9091 # serverInfo.port self.baseUrl = "http://{0}:{1}/".format(self.ip, self.port) self.capiBaseUrl = self.baseUrl self.esHttpUrl = "http://{0}:9200".format(self.ip) self.http_port = str(int(self.port) + 109) self.proto = proto self.conn = ES(server=self.esHttpUrl) self.manager = managers.Cluster(self.conn) self.test_params = TestInputSingleton.input self.docs = None
def search_term(self, key, indices=["default"]):
    result = None
    params = {"term": {"_id": key}}
    query = ES.Search(params)
    row = self.conn.search(query, indices=indices)
    if row.total > 0:
        result = row[0]
    return result
def count_documents():
    num_docs = cache.get('website.documents_count')
    if not num_docs:
        elastic = ES(settings.SEARCH_HOSTS)
        indices = elastic.get_indices()
        elastic.connection.close()

        indices = indices.values()
        num_docs = 0
        for item in indices:
            num_docs += item['num_docs']

        cache.set('website.documents_count', num_docs)
    return num_docs
def get_es_conn(es_url, index, alias=None):
    """Create connection and create index if it doesn't exist."""
    conn = ES(es_url)
    if not conn.indices.exists_index(index):
        conn.indices.create_index(index)
        if alias is not None:
            conn.indices.add_alias(alias, [index])
    return conn
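# Usage sketch for get_es_conn() above: connect, ensure a versioned index
# exists, and point a stable alias at it. The URL and names are illustrative.
conn = get_es_conn("http://localhost:9200", "grq_v01", alias="grq")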
def single_param_search(request):
    log_results = None
    es = ES()  # create the Elasticsearch client
    if request.method == 'POST':  # the search form was submitted
        # filter on the chosen search field and search term
        q1 = TermFilter(request.POST.get('searchby'), request.POST.get('searchterm'))
        orq = ORFilter([q1])
        q = FilteredQuery(MatchAllQuery(), orq)
        # get the filtered data from elasticsearch
        log_results = es.search(q, indices=index_name, doc_types=type_name)
    elif request.method == 'GET':  # return all packets when loading the search page
        log_results = es.search(MatchAllQuery(), indices=index_name, doc_types=type_name)
    return render(request, 'single_param_search.html', {'log_results': log_results})
def __init__(self, *args, **kwargs):
    self._dirty = set()
    # We have to wait for the elastic container to start or things go
    # sideways.
    # TODO: Check status properly somehow (straight HTTP request, perhaps)
    time.sleep(30)
    self._elastic = ES(ELASTIC_URL, max_retries=100)
    self._finalize = Finalize(self, self.sync, exitpriority=5)
    super(ControlPlaneScheduler, self).__init__(*args, **kwargs)
class ElasticSearchPipeline(object):
    def __init__(self, settings):
        basic_auth = {'username': settings.get('ELASTICSEARCH_USERNAME'),
                      'password': settings.get('ELASTICSEARCH_PASSWORD')}
        if settings.get('ELASTICSEARCH_PORT'):
            uri = "%s:%d" % (settings.get('ELASTICSEARCH_SERVER'),
                             settings.get('ELASTICSEARCH_PORT'))
        else:
            uri = "%s" % (settings.get('ELASTICSEARCH_SERVER'))
        self.es = ES([uri], basic_auth=basic_auth)
        self.settings = settings

    @classmethod
    def from_crawler(cls, crawler):
        pipe = cls(crawler.settings)
        return pipe

    def process_item(self, item, spider):
        if self.__get_uniq_key() is None:
            log.info("ELASTICSEARCH_UNIQ_KEY is NONE")
            self.es.index(dict(item),
                          self.settings.get('ELASTICSEARCH_INDEX'),
                          self.settings.get('ELASTICSEARCH_TYPE'),
                          id=item['id'],
                          op_type='create',)
        else:
            self.es.index(dict(item),
                          self.settings.get('ELASTICSEARCH_INDEX'),
                          self.settings.get('ELASTICSEARCH_TYPE'),
                          self._get_item_key(item))
        log.debug("Item sent to Elasticsearch %s" %
                  (self.settings.get('ELASTICSEARCH_INDEX')), spider=spider)
        return item

    def _get_item_key(self, item):
        uniq = self.__get_uniq_key()
        if isinstance(uniq, list):
            values = [item[key] for key in uniq]
            value = ''.join(values)
        else:
            value = item[uniq]  # hash the item's value, not the key name itself
        return hashlib.sha1(value).hexdigest()

    def __get_uniq_key(self):
        # Return the configured key, or None when it is unset or empty.
        if not self.settings.get('ELASTICSEARCH_UNIQ_KEY'):
            return None
        return self.settings.get('ELASTICSEARCH_UNIQ_KEY')
def set_connection(self, project):
    logger.debug('Setting up connection')
    if self.es_conn is None:
        try:
            cs = self.get_option('es_conn_string', project)
            logger.debug('Creating connection to %s', cs)
            self.es_conn = ES(cs)
        except Exception, e:
            logger.warning('Error setting up the connection: %s', e)
            return