def test_initial_search_error(self): with patch.object(self, "client") as client_mock: client_mock.search.return_value = { "_scroll_id": "dummy_id", "_shards": {"successful": 4, "total": 5}, "hits": {"hits": [{"search_data": 1}]}, } client_mock.scroll.side_effect = self.mock_scroll_responses data = list( helpers.scan( self.client, index="test_index", size=2, raise_on_error=False ) ) self.assertEqual(data, [{"search_data": 1}, {"scroll_data": 42}]) client_mock.scroll.side_effect = self.mock_scroll_responses with self.assertRaises(ScanError): data = list( helpers.scan( self.client, index="test_index", size=2, raise_on_error=True ) ) self.assertEqual(data, [{"search_data": 1}]) client_mock.scroll.assert_not_called()
def test_clear_scroll(self): bulk = [] for x in range(4): bulk.append({"index": {"_index": "test_index", "_type": "_doc"}}) bulk.append({"value": x}) self.client.bulk(bulk, refresh=True) with patch.object( self.client, "clear_scroll", wraps=self.client.clear_scroll ) as spy: list(helpers.scan(self.client, index="test_index", size=2)) spy.assert_called_once() spy.reset_mock() list( helpers.scan(self.client, index="test_index", size=2, clear_scroll=True) ) spy.assert_called_once() spy.reset_mock() list( helpers.scan( self.client, index="test_index", size=2, clear_scroll=False ) ) spy.assert_not_called()
def test_logger(self, logger_mock): bulk = [] for x in range(4): bulk.append({"index": {"_index": "test_index", "_type": "_doc"}}) bulk.append({"value": x}) self.client.bulk(bulk, refresh=True) with patch.object(self.client, "scroll") as scroll_mock: scroll_mock.side_effect = self.mock_scroll_responses list( helpers.scan( self.client, index="test_index", size=2, raise_on_error=False, clear_scroll=False, ) ) logger_mock.warning.assert_called() scroll_mock.side_effect = self.mock_scroll_responses try: list( helpers.scan( self.client, index="test_index", size=2, raise_on_error=True, clear_scroll=False, ) ) except ScanError: pass logger_mock.warning.assert_called()
def scan_and_queue(self, p_queue, p_index, p_query={}, p_doctype=None, p_scroll_time='5m', p_timeout='1m'):
    """Reads docs from an es index according to a query and pushes them to the queue

    p_queue: Queue where items are pushed to
    p_scroll_time: Time for scroll method
    p_timeout: Timeout - After this period, scan context is closed
    p_index: Index where items are picked from
    p_doctype: DocType of the items
    p_query: ElasticSearch query for scanning the index
    """
    try:
        param = [{'host': self.host, 'port': self.port}]
        es = Elasticsearch(param)
        logger.info('Connected to ES Server for reading: %s', json.dumps(param))
    except Exception as e:
        logger.error('Connection failed to ES Server for reading: %s', json.dumps(param))
        logger.error(e)
        sys.exit(EXIT_IO_ERROR)

    try:
        # Only restrict the scan to a doc_type when one was actually supplied.
        if p_doctype is not None:
            documents = helpers.scan(client=es, query=p_query, size=1000, scroll=p_scroll_time,
                                     index=p_index, doc_type=p_doctype, timeout=p_timeout)
        else:
            documents = helpers.scan(client=es, query=p_query, size=1000, scroll=p_scroll_time,
                                     index=p_index, timeout=p_timeout)
        for doc in documents:
            logger.debug(doc)
            p_queue.put(doc)
    except Exception as e:
        logger.error("Error while scanning ES index %s with query %s", p_index, p_query)
        logger.error(e)
def test_scroll_error(self): bulk = [] for x in range(4): bulk.append({"index": {"_index": "test_index", "_type": "_doc"}}) bulk.append({"value": x}) self.client.bulk(bulk, refresh=True) with patch.object(self.client, "scroll") as scroll_mock: scroll_mock.side_effect = self.mock_scroll_responses data = list( helpers.scan( self.client, index="test_index", size=2, raise_on_error=False, clear_scroll=False, ) ) self.assertEqual(len(data), 3) self.assertEqual(data[-1], {"scroll_data": 42}) scroll_mock.side_effect = self.mock_scroll_responses with self.assertRaises(ScanError): data = list( helpers.scan( self.client, index="test_index", size=2, raise_on_error=True, clear_scroll=False, ) ) self.assertEqual(len(data), 3) self.assertEqual(data[-1], {"scroll_data": 42})
def test_general_kwargs_forwarded_to_search(self): inexistent_index = 'test_index_123' self.assertRaises( NotFoundError, lambda: list(helpers.scan(self.client, index=inexistent_index, doc_type="answers", size=2)) ) global_kwargs = {'ignore': 404} list(helpers.scan(self.client, index=inexistent_index, doc_type="answers", size=2, global_kwargs=global_kwargs))
def generate_all_doc(_es_instance, _my_index, _my_type="_all"): if _my_type == "_all": for _doc in scan(_es_instance, index=_my_index, query={"query": {"match_all": {}}}): yield _doc else: for _doc in scan(_es_instance, index=_my_index, doc_type=_my_type, query={"query": {"match_all": {}}}): yield _doc
def test_migrate(self): real_names = self.client.indices_manager.real_names('slingshot') docs = list(scan(self.client, index='slingshot')) self.assertEqual(3, len(docs)) self.client.indices_manager.migrate('slingshot', CONFIG) self.assertNotEqual(real_names, self.client.indices_manager.real_names('slingshot')) self.client.indices.refresh('slingshot') docs = list(scan(self.client, index='slingshot')) self.assertEqual(3, len(docs)) self.client.indices.refresh('slingshot')
def test_general_kwargs_forwarded_to_scroll(self): with self.assertRaises(NotFoundError): for page in helpers.scan(self.client, index="test_index", doc_type="answers", size=2): # Deleting the index after first request was done makes sure # we test the scroll method. self.client.indices.delete('test_index', ignore=404) self.setUp() # Still raises a scanning error, but gets to that point only because # ignore=404 was forwarded to scroll. with self.assertRaises(helpers.ScanError): for page in helpers.scan(self.client, index="test_index", doc_type="answers", size=2, global_kwargs={'ignore': 404}): self.client.indices.delete('test_index', ignore=404)
def generate_all_doc_list(_es_instance, _my_index, _my_type="_all"): _docs_lst = [] if _my_type == "_all": for _doc in scan(_es_instance, index=_my_index, query={"query": {"match_all": {}}}): _docs_lst.append(_doc) else: for _doc in scan(_es_instance, index=_my_index, doc_type=_my_type, query={"query": {"match_all": {}}}): _docs_lst.append(_doc) return _docs_lst
def scan_and_queue(self, p_queue, p_index, p_query={}, p_doctype=None, p_scroll_time='5m', p_timeout='1m', p_size=100, p_overall_timeout=30, p_nbmax_retry=3):
    """Reads docs from an es index according to a query and pushes them to the queue

    p_queue: Queue where items are pushed to
    p_scroll_time: Time for scroll method
    p_timeout: Timeout - After this period, scan context is closed
    p_index: Index where items are picked from
    p_doctype: DocType of the items
    p_query: ElasticSearch query for scanning the index
    """
    logger_mp = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

    try:
        param = [{'host': self.host, 'port': self.port, 'timeout': p_overall_timeout,
                  'max_retries': p_nbmax_retry, 'retry_on_timeout': True}]
        if self.proxy is None:
            es = Elasticsearch(param)
        else:
            es = Elasticsearch(param, connection_class=MyConnection, proxies={'http': self.proxy})
        es.ping()
        logger_mp.info('Connected to ES Server for reading: {0}'.format(json.dumps(param)))
    except Exception as e:
        logger_mp.error('Connection failed to ES Server for reading: {0}'.format(json.dumps(param)))
        logger_mp.error(e)

    try:
        if not self.scroll_docs:
            # Only restrict the scan to a doc_type when one was actually supplied.
            if p_doctype is not None:
                self.scroll_docs = helpers.scan(client=es, query=p_query, size=p_size, scroll=p_scroll_time,
                                                index=p_index, doc_type=p_doctype, timeout=p_timeout)
            else:
                self.scroll_docs = helpers.scan(client=es, query=p_query, size=p_size, scroll=p_scroll_time,
                                                index=p_index, timeout=p_timeout)

        start = time.time()
        for doc in self.scroll_docs:
            p_queue.put(doc)
            elapsed = time.time() - start
            with self.counters['nb_items_scanned'].get_lock():
                self.counters['nb_items_scanned'].value += 1
                nb_items = self.counters['nb_items_scanned'].value
                self.counters['scan_time'].value += elapsed
                if nb_items % self.counters['log_every'] == 0:
                    logger_mp.info("Scan : {0} items".format(nb_items))
                    logger_mp.debug(" -> Avg scan time : {0}ms".format(1000 * self.counters['scan_time'].value / nb_items))
            # Restart the per-item timer
            start = time.time()
    except Exception as e:
        logger_mp.error("Error while scanning ES index %s with query %s", p_index, p_query)
        with self.counters['nb_items_error'].get_lock():
            self.counters['nb_items_error'].value += 1
def load_old(self):
    query = {'query': {'prefix': {'_id': '%d:' % group_code}}}
    try:
        res1 = es_client.count(index=name_index, doc_type='geometry', body=query)
        res2 = es_client.count(index=name_index, doc_type='route', body=query)
        if res1['count'] > res2['count']:
            for geometry in scan(es_client, query, '10m', index=name_index, doc_type='geometry'):
                self.es_geometry[geometry['_id']] = geometry['_source']['points']
        else:
            for route in scan(es_client, query, '10m', index=name_index, doc_type='route'):
                self.es_geometry[route['_id']] = route['_source']['geometry']
    except:
        logger.error(u'Error reading geometry from ElasticSearch')
def doc_feeder(self, index_type=None, index_name=None, step=10000, verbose=True, query=None, scroll='10m', **kwargs): conn = self.conn index_name = index_name or self.ES_INDEX_NAME doc_type = index_type or self.ES_INDEX_TYPE n = self.count(query=query)['count'] cnt = 0 t0 = time.time() if verbose: print('\ttotal docs: {}'.format(n)) _kwargs = kwargs.copy() _kwargs.update(dict(size=step, index=index_name, doc_type=doc_type)) res = helpers.scan(conn, query=query, scroll=scroll, **_kwargs) t1 = time.time() for doc in res: if verbose and cnt % step == 0: if cnt != 0: print('done.[%.1f%%,%s]' % (cnt*100./n, timesofar(t1))) print('\t{}-{}...'.format(cnt+1, min(cnt+step, n)), end='') t1 = time.time() yield doc cnt += 1 if verbose: print('done.[%.1f%%,%s]' % (cnt*100./n, timesofar(t1))) print("Finished! [{}]".format(timesofar(t0)))
def searchTweets(keyword, latlondist): #Variables that contains the user credentials to access Twitter API if TwitterHelper.AWS_ACCESS_KEY == None: raise KeyError("Please set the AWS_ACCESS_KEY env. variable") if TwitterHelper.AWS_SECRET_KEY == None: raise KeyError("Please set the AWS_SECRET_KEY env. variable") s = Search() if latlondist != None: locJson = json.loads(latlondist) s = s.query({"filtered" : {"query" : {"match_all" : {}}, "filter" : {"geo_distance" : {"distance" : locJson['dist'], "location" : {"lat" : locJson['lat'], "lon" : locJson['lon']}}}}}) if keyword != None: q = Q("match_phrase", text = keyword) s = s.query(q) scanResp = None scanResp = helpers.scan(client = TwitterHelper.ES, query = s.to_dict(), scroll = "1m", index = "tweets", timeout = "1m") arr = [] for resp in scanResp: hit = resp['_source'] d = {} d['name'] = hit['name'] d['text'] = hit['text'] d['sentiment'] = hit['sentiment'] d['lat'] = hit['location']['lat'] d['lon'] = hit['location']['lon'] arr.append(d) allD = {} allD['tweets'] = arr mapInput = json.dumps(allD) return mapInput
def query_and_dump_reults(args): es = Elasticsearch([args.hostname + ':' + str(args.port)]) query = '{"query":{"match_all":{}}}' if args.query is not None: query = args.query doc_type = None if args.doc_type is not None: doc_type = args.doc_type target = "output.csv" if args.target is not None: target = args.target res = es.count(index=args.index, body=query) nhits = res['count'] counter = 0 bar = progressbar.ProgressBar(max_value=nhits) res = helpers.scan(es, index=args.index, query=query, doc_type=doc_type) fields = args.fields.split(',') with open(target, 'w') as csvfile: datawriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_NONNUMERIC) datawriter.writerow(fields) for item in res: item = item['_source'] datawriter.writerow([get_var(item, field) for field in fields]) counter += 1 bar.update(counter) bar.finish()
def get_venues_by_location(location, radius, hour_for_cache): """Return a list of venues oid that are in location ('latitude,longitude') in the given radius. Cache results for one hour. """ lat_lon = location.split(',') body = { "query": { "filtered": { "query": {"match_all": {}}, "filter": { "geo_distance": { "distance": str(radius) + 'km', "location": { "lat": float(lat_lon[0]), "lon": float(lat_lon[1]) } } } } }, "_source": { "include": ["oid"] } } try: result = scan( es, index='lac', doc_type='geo_location', query=body, size=500) return [v['_source']['oid'] for v in result] except Exception as e: log.exception(e) return []
def main(): s_re = scan(es_user_portrait, query={'query':{'match_all':{}}, 'size':1}, index=index_name, doc_type=index_type) count = 0 while True: try: scan_re = s_re.next()['_source'] count += 1 task_name = scan_re['task_name'] history_status = json.loads(scan_re['history_status']) #iter history status new_history_status = [] for history_item in history_status: history_item_last = history_item[-1] if history_item_last == u'': new_history_item = history_item[:-1] new_history_item.append("0") new_history_status.append(new_history_item) else: new_history_status.append(history_item) new_history_item = history_item print 'new_history_status:', new_history_status es_user_portrait.update(index=index_name, doc_type=index_type, \ id=task_name, body={'doc':{'history_status': json.dumps(new_history_status)}}) except StopIteration: print 'all done' break except Exception as e: raise e print 'count:', count
def facebook_trend_corpus(self): return scan(self._es, query={"query": {"match_all": {}}}, index="signals_time_series_20160601", doc_type="facebookTrend", _source=['url', 'parentPageId'] )
def scan_topic2redis(): count = 0 s_re = scan(es_user_portrait, query={'query':{'match_all': {}}, 'size':1000}, index=portrait_index_name, doc_type=portrait_index_type) start_ts = time.time() hmset_dict = {} while True: try: scan_re = s_re.next()['_source'] count += 1 uid = scan_re['uid'] topic_ch_string = scan_re['topic_string'] topic_ch_list = topic_ch_string.split('&') topic_en_string = [topic_ch2en_dict[item] for item in topic_ch_list] hmset_dict[uid] = json.dumps(topic_en_string) if count % 1000 == 0 and count != 0: R_TOPIC.hmset(r_topic_name, hmset_dict) end_ts = time.time() print '%s sec count 1000' % (end_ts - start_ts) except StopIteration: if hmset_dict: R_TOPIC.hmset(r_topic_name, hmset_dict) hmset_dict = {} break except Exception as e: raise e break if hmset_dict: R_TOPIC.hmset(r_topic_name, hmset_dict) print 'all count:', count
def __iter__(self, item_type=None): query = { 'fields': ['uuid'], 'filter': {'term': {'item_type': item_type}} if item_type else {'match_all': {}}, } for hit in scan(self.es, query=query): yield hit['fields']['uuid'][0]
def update_attribute_day(): # scan the user_portrait and bulk action to update status = False results = {} count = 0 index_name = "user_portrait" index_type = "user" s_re = scan(es, query={"query": {"match_all": {}}, "size": 1000}, index=index_name, doc_type=index_type) while True: bulk_action = [] while True: try: scan_re = s_re.next()["_source"] count += 1 except StopIteration: print "all done" if bulk_action: # print 'bulk_action:', bulk_action status = save_user_results(bulk_action) # print 'status:', status sys.exit(0) except Exception, r: print Exception, r sys.exit(0) uid = scan_re["uid"] user_info = {"uid": uid} evaluate_result = get_evaluate_index(user_info, status="update") results = {} results = dict(results, **evaluate_result) action = {"update": {"_id": str(uid)}} bulk_action.extend([action, {"doc": results}])
def freshness_iterator( self, query, outdated_status, prefix, start_time, timeout ): """ Make a scan query then manipulate hits Yield on all modified hits """ self.elasticclient.indices.refresh( index=self.status_index, ignore_unavailable=True) start_time = start_time * 1000 now = int(time.time()) * 1000 for hit in helpers.scan( self.elasticclient, index=self.status_index, size=self.batch_size, query=query, scroll="%ss" % timeout, ): # Startup grace_time handle if 'last_check' not in hit['_source']: hit['_source']['last_check'] = hit['_source']['timestamp'] if ( hit['_source']['last_check'] < start_time and now < ( start_time + hit['_source']['last_check'] - hit['_source']['freshness']) ): continue hit['_op_type'] = 'update' hit['doc'] = {} # Update status only if OK before if hit['_source']['status'] == 0: hit['doc']['status'] = outdated_status else: hit['doc']['status'] = hit['_source']['status'] hit['doc']['timestamp'] = int(time.time()) * 1000 hit['doc']['output'] = prefix + hit['_source']['output'] # Build a log entry log = {} for field in Check.log_fields: if field in ('timestamp', 'status', 'output'): log[field] = hit['doc'][field] else: log[field] = hit['_source'][field] del hit['_source'] yield hit # Update OK # Forward update to _send_to_logs self.logs.append(log)
def main(flag):
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    r = redis.StrictRedis()
    if flag == '1':
        body = {'query': {'match_all': {}}}
        result = scan(es, index='throwtable', doc_type='implementation', query=body)
        for p in result:
            r.sadd('pkgs', p['_source']['instruction']['package'])
    else:
        result = json.load(open('list.json'))
        for p in result:
            r.sadd('pkgs', p)
    samples = r.srandmember('pkgs', 100)
    for pkgName in samples:
        r.sadd('samples', pkgName)
        # NOTE: this continue skips the interactive Wikipedia tagging below for every package.
        continue
        print '================'
        printPkgContent(pkgName)
        result = []
        while True:
            answer = getUserInput("Article? ")
            if answer == '':
                break
            page = queryWikipedia(answer)
            answer = getUserInput("Do you mean: %s page? ")
            if answer is True:
                result.append(page)
                break
        if len(result) != 0:
            r.sadd('%s:map' % pkgName, *result)
def scan_es2redis_month(): count = 0 s_re = scan(es_user_portrait, query={'query':{'match_all': {}}, 'size':1000}, index=portrait_index_name, doc_type=portrait_index_type) start_ts = time.time() user_info = {} while True: try: scan_re = s_re.next()['_source'] count += 1 uid = scan_re['uid'] user_info[uid] = {'fansnum':scan_re['fansnum'], 'topic_string':scan_re['topic_string']} update_month_redis.lpush(UPDATE_MONTH_REDIS_KEY, json.dumps(user_info)) user_info = {} if count % 1000 == 0 and count != 0: end_ts = time.time() print '%s sec count 1000' % (end_ts - start_ts) start_ts = end_ts except StopIteration: print 'all done' if user_info: update_month_redis.lpush(UPDATE_MONTH_REDIS_KEY, json.dumps(user_info)) user_info = {} break except Exception, r: raise r break
def copy(self, source_index, target_index, transform=None, ignore_types=None): if source_index == target_index: raise SameIndex("source_index and target_index must be different") if not self.client.indices.exists(source_index): raise IndexDoesNotExist("source_index '{}' does not exist".format(source_index)) if not self.client.indices.exists(target_index): raise IndexDoesNotExist("target_index '{}' does not exist".format(source_index)) transform = transform or (lambda doc: doc) ignore_types = ignore_types or [] hits = helpers.scan(self.client, index=source_index) def _process_hits(hits, index): for doc in hits: if doc['_type'] in ignore_types: continue doc['_index'] = index doc['_op_type'] = 'create' doc = transform(doc) if not doc: continue yield doc return helpers.bulk(self.client, _process_hits(hits, target_index), chunk_size=1000, stats_only=True)
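# Hypothetical usage sketch of the copy() method above. The index names, the
# transform function, the ignored type and the `reindexer` instance are
# illustrative assumptions, not values taken from the source.
def lowercase_title(doc):
    # Mutate the scanned hit in place and return it so it gets re-indexed.
    doc['_source']['title'] = doc['_source'].get('title', '').lower()
    return doc

success, errors = reindexer.copy(
    'articles_v1', 'articles_v2',
    transform=lowercase_title,
    ignore_types=['log'],
)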
def search(self, query_string, search_type='default'): '''Run a Query String query and return a list of sample_ids associated with the matches. Run the query against all document types. ''' if search_type == 'advanced': query = self.build_query(query_string) else: es_reserved_chars_re = r'([\+\-=\>\<\!\(\)\{\}\[\]\^\"\~\*\?\:\\/ ])' query_string = re.sub(es_reserved_chars_re, r'\\\g<1>', query_string) if search_type == 'default': query = self.build_query("*" + query_string + "*") elif search_type == 'exact': query = self.build_query("\"" + query_string + "\"") else: print('Unknown search type!') return None result = helpers.scan( self.es, query=query, index=self.index ) matches = [] for r in result: if r.get('_source', {}).get('doc_type', {}) == 'sample': field = '_id' else: field = '_routing' matches.append(r[field]) return tuple(set(matches))
def delete(self, **kwargs): """ Deletes a document from the index Pass an index, doc_type, and id to delete a specific document Pass a body with a query dsl to delete by query """ body = kwargs.pop('body', None) if body != None: try: data = [] refresh = kwargs.pop('refresh', False) for hit in helpers.scan(self.es, query=body, **kwargs): hit['_op_type'] = 'delete' data.append(hit) return helpers.bulk(self.es, data, refresh=refresh, **kwargs) except Exception as detail: self.logger.warning('%s: WARNING: failed to delete document by query: %s \nException detail: %s\n' % (datetime.now(), body, detail)) raise detail else: try: return self.es.delete(ignore=[404], **kwargs) except Exception as detail: self.logger.warning('%s: WARNING: failed to delete document: %s \nException detail: %s\n' % (datetime.now(), body, detail)) raise detail
def _stream_search(self, *args, **kwargs): """Helper method for iterating over ES search results.""" for hit in scan( self.elastic, query=kwargs.pop("body", None), scroll="10m", **kwargs ): hit["_source"]["_id"] = hit["_id"] yield hit["_source"]
def load_old(self):
    query = {'query': {'prefix': {'_id': '%s:' % group_code}}}
    try:
        for station in scan(es_client, query, '10m', index=name_index, doc_type='station'):
            self.es_location[station['_id']] = {'location': {'long': station['_source']['location'][0],
                                                             'lat': station['_source']['location'][1]}}
    except:
        logger.error(u'Error reading stops from ElasticSearch')
def scan_documents(old_es, new_es, old_index, new_index): """ Scan for matching documents In order to match the two indices without having to deal with ordering issues, we pull a set of documents from the old ES index, and then try to find matching documents with the same _id in the new ES index. This process is batched to avoid making individual network calls to the new ES index. """ matching = 0 total = 0 old_iter = scan(old_es, index=old_index) for old_elts in grouper(old_iter, SCAN_ITER_STEP): old_elt_ids = [] old_elt_docs = {} for elt in old_elts: if elt is not None: old_elt_ids.append({'_id': elt['_id']}) old_elt_docs[elt['_id']] = elt matching += find_matching_ids(new_es, new_index, old_elt_ids, old_elt_docs) total += len(old_elt_ids) if total % 100 == 0: print 'processed {} items'.format(total) ratio = float(matching)/total print "{}: scanned documents matching ({} out of {}, {:.6}%)".format( 'OK' if ratio > SCAN_MATCH_THRESHOLD else 'FAILURE', matching, total, ratio * 100 )
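# grouper() and find_matching_ids() above are assumed helpers that are not part
# of this snippet. A minimal sketch of grouper(), assuming it is the standard
# itertools batching recipe (it pads the last chunk with None, which is why
# scan_documents() skips None elements):
from itertools import izip_longest  # Python 2, to match the snippet above

def grouper(iterable, n, fillvalue=None):
    # Collect items from the scan iterator into fixed-length chunks of size n.
    args = [iter(iterable)] * n
    return izip_longest(*args, fillvalue=fillvalue)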
search_body = { "query": { "bool": { "filter": [ {"term": {"platform.keyword": platform}}, # {"term": {"releaser.keyword": releaser}}, {"term": {"releaser_id_str": doc_id}}, {"range": {"release_time": {"gte": re_s_t, "lt": re_e_t}}}, {"range": {"fetch_time": {"gte": re_s_t}}} ] } } } # #scan_re = scan(client=es, index='crawler-data-raw', doc_type='doc',query=search_body, scroll='3m') scan_re = scan(client=es, index='short-video-all-time-url', doc_type='all-time-url',query=search_body, scroll='3m') for one_scan in scan_re: doc_id = cal_doc_id(one_scan["_source"]["platform"], url=one_scan["_source"]["url"], doc_id_type='all-time-url', data_dict=one_scan["_source"]) find_exist = { "query": { "bool": { "filter": [ {"term": {"_id": doc_id}} ] } } } search_re = es.search(index='short-video-weekly', doc_type=weekly_doc_type_name, body=find_exist) if search_re['hits']['total'] == 0: re_list.append(one_scan['_source'])
def _es_results(self, query_compiler, collector): query_params, post_processing = self._resolve_tasks(query_compiler) size, sort_params = Operations._query_params_to_size_and_sort( query_params) script_fields = query_params.script_fields query = Query(query_params.query) body = query.to_search_body() if script_fields is not None: body["script_fields"] = script_fields # Only return requested field_names _source = query_compiler.get_field_names(include_scripted_fields=False) if _source: # For query_compiler._client.search we could add _source # as a parameter, or add this value in body. # # If _source is a parameter it is encoded into to the url. # # If _source is a large number of fields (1000+) then this can result in an # extremely long url and a `too_long_frame_exception`. Therefore, add # _source to the body rather than as a _source parameter body["_source"] = _source else: body["_source"] = False es_results = None # If size=None use scan not search - then post sort results when in df # If size>10000 use scan is_scan = False if size is not None and size <= DEFAULT_ES_MAX_RESULT_WINDOW: if size > 0: try: es_results = query_compiler._client.search( index=query_compiler._index_pattern, size=size, sort=sort_params, body=body, ) except Exception: # Catch all ES errors and print debug (currently to stdout) error = { "index": query_compiler._index_pattern, "size": size, "sort": sort_params, "body": body, } print("Elasticsearch error:", error) raise else: is_scan = True es_results = scan( client=query_compiler._client, index=query_compiler._index_pattern, query=body, ) # create post sort if sort_params is not None: post_processing.append(SortFieldAction(sort_params)) if is_scan: while True: partial_result, df = query_compiler._es_results_to_pandas( es_results, collector.batch_size(), collector.show_progress) df = self._apply_df_post_processing(df, post_processing) collector.collect(df) if not partial_result: break else: partial_result, df = query_compiler._es_results_to_pandas( es_results) df = self._apply_df_post_processing(df, post_processing) collector.collect(df)
for i in aliases: if index in aliases[i]['aliases']: index = i ''' Fetch the mapping in order to create the header ''' mapping = es.indices.get_mapping( index=index, doc_type=doc_type)[index]['mappings'][doc_type]['properties'].keys() ''' Set handler to elasticsearch ''' scanResp = helpers.scan(client=es, query=query, scroll="10m", index=index, size=size, doc_type=doc_type, clear_scroll=False, request_timeout=300) with open(output_files, 'w') as f: counter = 0 if fields == "all": w = csv.DictWriter(f, mapping, delimiter=delimiter, quoting=csv.QUOTE_MINIMAL) else: fields = fields.split(",") w = csv.DictWriter(f, [i for i in mapping if i in fields], delimiter=delimiter,
df_movies = pd.read_csv(r".\data\movies.csv", engine="python") df_ratings = pd.read_csv(r".\data\ratings.csv", engine="python").drop("timestamp", axis=1) all_genres = [] for i in range(len(df_movies)): temp = df_movies.loc[i, "genres"].split("|") for j in range(len(temp)): if (temp[j] not in all_genres): all_genres.append(temp[j]) #print (all_genres) # In[ ]: results = helpers.scan(es, index='ratings', query={"query": {"match_all": {}}}) res_set = set() for item in results: popo = item['_source']['userId'] res_set.add(int(popo)) #print(res_set) # In[ ]: df3 = df_ratings.drop("movieId", axis=1) df3.groupby('userId').first() df3.drop("rating", axis=1) lstset = list(res_set) data = {'userId': lstset}
es = Elasticsearch('123.123.123.123:9201') sres = helpers.scan( es, index="webhook-*", preserve_order=True, query={ "query": { "bool": { "must": [{ "query_string": { "query": "_type:notify AND notify_number:1 AND paid_time:{2018-04-23T04:00 TO * }", "analyze_wildcard": True } }, { "range": { "@timestamp": { "gte": 1524457200000, "lte": 1524461100000, "format": "epoch_millis" } } }], "must_not": [] } } }, scroll="300s") dd = {}
def fb_count2flow_text(): index_name = facebook_count_index_name_pre + '2017-10-12' query_body = {'query': {'match_all': {}}} scan_results = scan(es, index=index_name, doc_type=facebook_count_index_type, query=query_body, size=1000) count = 0 t1 = time.time() while 1: try: body_dict = {} data = scan_results.next() item = data['_source'] body_dict['comment'] = item['comment'] body_dict['favorite'] = item['favorite'] body_dict['share'] = item['share'] body_dict['update_time'] = item['update_time'] start_ts = datetime2ts('2017-10-10') end_ts = datetime2ts('2017-10-25') day_num = (end_ts - start_ts) / (24 * 3600) + 1 count += 1 if count % 1000 == 0: print 'fb..', count t2 = time.time() print 'time cost..', t2 - t1 t1 = t2 for i in range(day_num): timestamp = start_ts + i * 24 * 3600 date = ts2datetime(timestamp) flow_text_index_name = facebook_flow_text_index_name_pre + date _id = item['fid'] try: es.update(index=flow_text_index_name,doc_type=facebook_flow_text_index_type,\ id=_id,body={'doc':body_dict}) # count += 1 # if count % 1000 == 0: # print 'fb..',count except: continue except StopIteration: break
def tw_flow_text(): start_ts = datetime2ts('2017-10-10') end_ts = datetime2ts('2017-10-25') day_num = (end_ts - start_ts) / (24 * 3600) + 1 count = 0 for i in range(day_num): timestamp = start_ts + i * 24 * 3600 date = ts2datetime(timestamp) index_name = twitter_flow_text_index_name_pre + date query_body = {'query': {'match_all': {}}} scan_results = scan(es, index=index_name, doc_type=twitter_flow_text_index_type, query=query_body, size=1000) bulk_action = [] while 1: try: body_dict = {} data = scan_results.next() item = data['_source'] body_dict['comment'] = 0 body_dict['favorite'] = 0 body_dict['share'] = 0 body_dict['update_time'] = item['timestamp'] #flow_text_index_name = twitter_flow_text_index_name_pre + ts2datetime(item['timestamp']) _id = item['tid'] action = {'update': {'_id': _id}} bulk_action.extend([action, {'doc': body_dict}]) count += 1 if count % 100 == 0: print 'tw..', count es.bulk(bulk_action, index=index_name, doc_type=twitter_flow_text_index_type, timeout=100) except StopIteration: break if bulk_action: es.bulk(bulk_action, index=index_name, doc_type=twitter_flow_text_index_type, timeout=100)
def read(self, index, query): return [[doc["_id"], doc["_source"]] for doc in scan(self.conn, index=index, query=query)]
def build_orange_table_from_es_logs( mongo_query, valid_keys=None, prune_null_resources=True, all_logs_index='flat-all-log-entries', unique_logs_index='flat-unique-log-entries'): field_values = {} single_value_columns = set( ) #TODO return values that are always true, add them to Rules records = 0 key_value_counter = Counter() paginator = helpers.scan(es, query={"query": { "match_all": {} }}, index=unique_logs_index, doc_type='doc') for hit in paginator: records += 1 # if records % 1000 == 0: # print('Records : ' + str(records)) for key, value in hit['_source'].items(): if key == '_id' or (valid_keys and key not in valid_keys): continue RuleUtils.addMulti(field_values, key, value) key_value_counter.update(['%s=%s' % (key, value)]) for k, v in dict(key_value_counter).items(): if v == records: single_value_columns.add( k) # ignore fields that always have the same value field_name = k.split('=')[0] field_values.pop(field_name) orange_columns = [] for key, value in field_values.items(): # if len(value) == 1 and records > 1: # single_value_columns.add('%s=%s' % (key, value.pop()))#ignore fields that always have the same value # continue for elem in value: if not isinstance(elem, str): value.remove(elem) value.add(str(elem)) try: column = DiscreteVariable(key, values=value) except Exception as ex: traceback.print_exc() print(value) orange_columns.append(column) # if use_resources: # resource_encoder = OrangeTableResourceColumnGenerator(mongo_query) # resource_columns = resource_encoder.get_table_columns() # orange_columns.extend(resource_columns) # else: resource_encoder = None domain = Domain(orange_columns) records = 0 table = Table(domain) paginator = helpers.scan(es, query={"query": { "match_all": {} }}, index=all_logs_index, doc_type='doc') for hit in paginator: instance = createInstance(domain, hit['_source'], resource_encoder, prune_null_resources) table.append(instance) records += 1 # if records % 1000 == 0: # print('Records : ' + str(records)) # print('Built Table: %d recrods' % len(table)) return table, single_value_columns
def tag_positive_terms(self):
    '''
    get documents without a sentiment tag that match phrase with slop:
        - protect|support|keep|need net neutrality
        - let the new neutrality stand
    for a broader result set than regex in analyze
    '''
    query = {
        "_source": "text_data",
        "query": {
            "bool": {
                "filter": {
                    "bool": {
                        "should": [],
                        "must": [
                            {"term": {"analysis.source": "unknown"}}
                        ],
                        "must_not": [
                            {"exists": {"field": "analysis.titleii"}},
                            {"exists": {"field": "analysis.sentiment_manual"}},
                            {"exists": {"field": "analysis.sentiment_sig_terms_ordered"}}
                        ]
                    }
                }
            }
        }
    }
    phrases = [
        'essential net neutrality',
        'keep net neutrality',
        'maintain net neutrality',
        'need net neutrality',
        'preserve net neutrality',
        'protect net neutrality',
        'save net neutrality',
        'support net neutrality',
        'support title 2',
        'support title II',
        'let the new neutrality stand',
        'net neutrality rules are extremely important',
        'net neutrality is important'
    ]
    for phrase in phrases:
        subq = {
            "match_phrase": {
                "text_data": {
                    "query": phrase,
                    "slop": 3
                }
            }
        }
        query['query']['bool']['filter']['bool']['should'].append(subq)
    print(json.dumps(query))
    resp = self.es.search(index='fcc-comments', body=query, size=0)
    total = resp['hits']['total']
    print('tagging %s / %s matches' % (self.limit, total))
    docs = []
    for doc in scan(self.es, index='fcc-comments', query=query, size=1000):
        docs.append(
            lib.bulk_update_doc(doc['_id'], {'source': 'es_terms_positive'}))
        if not len(docs) % 1000:
            print('\tfetched %s\n%s\t%s' %
                  (len(docs), doc['_id'], doc['_source']['text_data'][:400]))
        if len(docs) == self.limit:
            break
    print('indexing %s' % (len(docs)))
    tagged = lib.bulk_update(self.es, docs)
    print('tagged %s / %s matches' % (tagged, total))
    return tagged
def scroll_over_all_docs(_index): for hit in helpers.scan(es, index=_index): populate_dict_of_duplicate_docs(hit)
def interpolate(a):
    doc = a['_source']['__meta__']['financials']
    for k, v in doc.iteritems():
        if v is None and k == 'assets':
            doc['assets'] = {}
            doc['assets']['value'] = _assets(doc)
        elif v is None and k == 'liabilities':
            doc['liabilities'] = {}
            doc['liabilities']['value'] = _liabilities(doc)
        elif v is None and k == 'stockholdersEquity':
            doc['stockholdersEquity'] = {}
            doc['stockholdersEquity']['value'] = _stockholdersEquity(doc)
        elif v is None and k == 'liabilitiesAndStockholdersEquity':
            doc['liabilitiesAndStockholdersEquity'] = {}
            doc['liabilitiesAndStockholdersEquity']['value'] = _liabilitiesAndStockholdersEquity(doc)
        else:
            pass
    doc['interpolated'] = True
    return a

# --
# run

for a in scan(client, index=config['aq_forms_enrich']['index'], query=query):
    s = interpolate(a)
    client.index(index=config['aq_forms_enrich']['index'],
                 doc_type=config['aq_forms_enrich']['_type'],
                 body=s['_source'], id=s['_id'])
def grab(args): """ Find index pattern, iterate through documents, collecting entries. If fields is set we will narrow our results to only include the specified fields. Otherwise, all fields will be returned, using the first document to determine the fields. """ # Setup client username = args.username password = args.password if not args.password: password = getpass() auth = (username, password) es = Elasticsearch(args.host, use_ssl='https' in args.host, verify_certs=True, http_auth=auth) query_string = args.query # Build query query = { 'query': { 'bool': { 'must': [{ 'range': { '@timestamp': { 'gte': args.range_from, 'lte': args.range_to } } }] } } } if query_string: query['query']['bool']['must'].append( {'query_string': { 'query': query_string }}) else: query['query']['bool']['must'].append({'match_all': {}}) args.total = int(args.total) # Search for records kwargs = {} kwargs['index'] = args.index kwargs['size'] = args.total if args.fields: kwargs['_source_includes'] = args.fields try: results = None # Scan if we're looking for more than 500 results. Note that size # for scan denotes number of entries retrieved each call. if kwargs['size'] > 500: kwargs['size'] = 500 kwargs['query'] = query results = scan(es, **kwargs) else: kwargs['body'] = query results = es.search(**kwargs) results = results['hits']['hits'] def flatten(d, path=None): """ Returns list of fields and their path recursively separated by dot notation. """ l = [] path = path or [] for key, value in d.items(): if isinstance(value, dict): for item in flatten(value, [*path, key]): l.append(item) else: if isinstance(value, list): l.append(('.'.join([*path, key]), ','.join(value))) else: l.append(('.'.join([*path, key]), value)) return l # Flatten all records to dot notation records = [] for i, hit in enumerate(results): if i >= args.total: break if 'sort' in hit: del hit['sort'] if args.only_source: records.append(dict(flatten(hit['_source']))) else: records.append(dict(flatten(hit))) except: raise SystemExit( f'Error connecting to ElasticSearch at "{args.host}". Please ensure that ElasticSearch is running, and your credentials are correct.' ) if len(records) == 0: raise SystemExit('Query returned no results') # Setup CSV field names field_names = set() for item in records: for key in item.keys(): field_names.add(key) # Save as CSV with open(args.output, 'w') as f: writer = csv.DictWriter(f, sorted(field_names), extrasaction='ignore') if not args.no_header: writer.writeheader() writer.writerows(records)
def get_hits(self, index, query=None): return scan(self.es, query=query, index=index)
def create_dump(ctx, index): """ Create a dump of an index. If you don't provide an ``--index`` option, you will be prompted with a list of available index names. Dumps are stored as a gzipped txt file in ``settings.DUMPS_DIR/<index_name>/< timestamp>_<index-name>.gz``, and a symlink ``<index-name>_latest.gz`` is created, pointing to the last created dump. :param ctx: Click context, so we can issue other management commands :param index: name of the index you want to create a dump for """ if not index: available_idxs = ctx.invoke(available_indices) if not available_idxs: return index = click.prompt('Name of index to dump') if index not in available_idxs: click.secho('"%s" is not an available index' % index, fg='red') return match_all = {'query': {'match_all': {}}} total_docs = es.count(index=index).get('count') path = _create_path(path=os.path.join(DUMPS_DIR, index)) dump_name = '%(index_name)s_%(timestamp)s.gz' % { 'index_name': index, 'timestamp': datetime.now().strftime('%Y%m%d%H%M%S') } new_dump = os.path.join(path, dump_name) with gzip.open(new_dump, 'wb') as g: with click.progressbar(es_helpers.scan(es, query=match_all, scroll='1m', index=index), length=total_docs) as documents: for doc in documents: g.write('%s\n' % json.dumps(doc)) click.secho('Generating checksum', fg='green') checksum = _checksum_file(new_dump) checksum_path = os.path.join(DUMPS_DIR, index, '%s.sha1' % dump_name) with open(checksum_path, 'w') as f: f.write(checksum) click.secho('Created dump "%s" (checksum %s)' % (dump_name, checksum), fg='green') latest = os.path.join(path, '%s_latest.gz' % index) try: os.unlink(latest) except OSError: click.secho('First time creating dump, skipping unlinking', fg='yellow') os.symlink(new_dump, latest) click.secho('Created symlink "%s_latest.gz" to "%s"' % (index, new_dump), fg='green') latest_checksum = os.path.join(os.path.dirname(checksum_path), '%s_latest.gz.sha1' % index) try: os.unlink(latest_checksum) except OSError: click.secho('First time creating dump, skipping unlinking checksum', fg='yellow') os.symlink(checksum_path, latest_checksum) click.secho('Created symlink "%s_latest.gz.sha1" to "%s"' % (index, checksum_path), fg='green')
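# _checksum_file() is referenced above but not shown here; a plausible sketch,
# assuming it streams the dump file and returns a hex SHA-1 digest (the chunk
# size is an arbitrary choice):
import hashlib

def _checksum_file(path, chunk_size=65536):
    # Read the gzipped dump in chunks so large dumps never have to fit in memory.
    sha1 = hashlib.sha1()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            sha1.update(chunk)
    return sha1.hexdigest()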
import argparse __author__ = 'bejar' if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--index', default=None, required=True, help='Index to search') parser.add_argument('--alpha', action='store_true', default=False, help='Sort words alphabetically') args = parser.parse_args() index = args.index try: client = Elasticsearch() voc = {} sc = scan(client, index=index, doc_type='document', query={"query" : {"match_all": {}}}) for s in sc: tv = client.termvectors(index=index, doc_type='document', id=s['_id'], fields=['text']) if 'text' in tv['term_vectors']: for t in tv['term_vectors']['text']['terms']: if t in voc: voc[t] += tv['term_vectors']['text']['terms'][t]['term_freq'] else: voc[t] = tv['term_vectors']['text']['terms'][t]['term_freq'] lpal = [] for v in voc: lpal.append((v.encode("utf8", "ignore"), voc[v])) for pal, cnt in sorted(lpal, key=lambda x: x[0 if args.alpha else 1]):
def get_source(index, type_): """ Get _source of an index. """ if es.indices.exists(index): res = helpers.scan(es, index=index, doc_type=type_) return res
def do_search(opts, host, protocol, port, LOGGER): query = \ { 'query': { 'bool': { 'must': [] } }, 'aggs': { 'groupby': { 'terms': { 'field': 'module.keyword', 'size': 10000 }, 'aggs': { 'latest-revision': { 'max': { 'field': 'revision' } } } } } } es = Elasticsearch([{'host': '{}'.format(host), 'port': port}]) search_term = opts['search'] if 'case-sensitive' in opts and opts['case-sensitive']: case_sensitive = 'sensitive' else: case_sensitive = 'lowercase' search_term = search_term.lower() # case_sensitivity = 'BINARY ' sts = __search_fields if 'search-fields' in opts: sts = opts['search-fields'] if 'type' in opts and opts['type'] == 'regex': term_regex = 'regexp' else: term_regex = 'term' should = {'bool': {'should': []}} request_number = 1 if 'request-number' in opts: request_number = opts['request-number'] for field in sts: if field in __search_fields: if field == 'module': field_term = field final_term = search_term.lower() else: field_term = '{}.{}'.format(field, case_sensitive) final_term = search_term term = {term_regex: {field_term: final_term}} should['bool']['should'].append(term) queries = [] if 'schema-types' in opts: for st in opts['schema-types']: if st in __schema_types: queries.append(st) must = {'terms': {'statement': queries}} query['query']['bool']['must'].append(must) query['query']['bool']['must'].append(should) LOGGER.info('query: {}'.format(query)) limit_reacher = LimitReacher() search = scan(es, LOGGER, limit_reacher, query, scroll=u'2m', scroll_limit=2 * request_number, index='yindex', doc_type='modules') LOGGER.info(search) filter_list = __node_data.keys() results = [] rows = search if 'filter' in opts and 'node' in opts['filter']: filter_list = opts['filter']['node'] LOGGER.info('filter list: {}'.format(filter_list)) latest_revisions = {} all_revisions = True if 'latest-revisions' in opts and opts['latest-revisions'] is True: all_revisions = False aggregations = es.search(index='yindex', doc_type='modules', body=query, size=0)['aggregations']['groupby']['buckets'] for agg in aggregations: latest_revisions[agg['key']] = agg['latest-revision'][ 'value_as_string'].split('T')[0] for row in rows: r = row['_source'] if all_revisions or r['revision'] == latest_revisions[r['module']]: module = { 'name': r['module'], 'revision': r['revision'], 'organization': r['organization'] } result = {'module': module} result['node'] = {} for nf in filter_list: if nf in __node_data: result['node'][nf] = r[__node_data[nf]] results.append(result) return (results, limit_reacher.limit_reached)
def scan(self): """Return an iterator over the whole result set, unpaginated and without aggregations.""" body = {'query': self.get_query(), '_source': self.RETURN_FIELDS} return scan(es, index=es_index, doc_type=self.DOC_TYPES, query=body)
def generate_graph(app, app_type='logs', log_type='raw', targets=[], events=[], time_range=['now-1h', 'now'], size=20): """ Return all elements from an application, possible matching against a specific event type (e.g. click, mouseover, etc) """ # @TODO ref_url filter must_not_query = [{ "term": { "type": "mousedown" } }, { "term": { "type": "mouseup" } }] filter_query = [ { "term": { "logType": log_type }, }, ] # Filtering should_query = [] must_query = [] # Include these events in the request if events: include_events = {"terms": {"type": events}} filter_query.append(include_events) target_in = targets[0] target_out = targets[1] if target_in: include_targets = {"terms": {"target": target_in}} filter_query.append(include_targets) # Remove these elementids from result set for target in target_out: res = {"term": {"target": target}} must_not_query.append(res) # Finish off should query # must_query.append({"bool": {"should": should_query}}) # Sort By Time sort_query = [{"clientTime": {"order": "asc"}}] # Timestamp range - date math timestamp_query = { "range": { "@timestamp": { "gte": time_range[0], "lte": time_range[1] } } } filter_query.append(timestamp_query) agg_query = dict() # Get all unique sessions session_query = {"terms": {"field": "sessionID", "min_doc_count": 1}} agg_query['sessions'] = session_query # Generating all top targets and breakdowns by type, including path_length target_query = { "terms": { "field": "target", "min_doc_count": 1, "size": size }, "aggs": { "events": { "terms": { "field": "type", "min_doc_count": 1, "size": size } }, "top_target": { "top_hits": { "script_fields": { "path_length": { "script": { "lang": "painless", "inline": "doc['path'].length;" } } }, "size": 1 } } } } agg_query['targets'] = target_query # Main query query = { "sort": sort_query, "query": { "bool": { # "must": must_query, # "should": should_query, "filter": filter_query, "must_not": must_not_query, # "minimum_should_match": len(should_query) - 1 } }, "_source": { "includes": ['*'], }, "script_fields": { "path_length": { "script": { "lang": "painless", "inline": "doc['path'].length;" } } }, "aggregations": agg_query } # return query # Process Aggregate Results response = es.search(app, doc_type=app_type, body=query, size=0) # Only want to look at aggregations sessions = response['aggregations']['sessions']['buckets'] # allSessions = { x['key']: [] for x in sessions } # intervalSessions = { x['key']: [] for x in sessions } # Deal with bar chart allTargets = response['aggregations']['targets']['buckets'] # Re-execute query to get all hits iter = helpers.scan(es, query=query, index=app, doc_type=app_type, preserve_order=True) allSessions = dict() # Store all hits in the user's bucket. 
for elem in iter: data = elem['_source'] data['pathLength'] = elem['fields']['path_length'][0] if 'sessionID' in data: sessionID = data['sessionID'] if sessionID in allSessions: allSessions[sessionID].append(data) else: allSessions[sessionID] = [data] # This fixed sequence/interval logging that what was produced in # UserALE.js v 0.2.0 # Possible to remove self-loops here as well (html->html->html->window) := (html->window) intervalSessions = dict() for sessionID in allSessions: data = allSessions[sessionID] newData = [] intervalLog = [] pairs = zip(data, data[1:]) for curr, next in pairs: target1 = curr['target'] event1 = curr['type'] target2 = next['target'] event2 = next['type'] if target1 != target2: # ignore self-loops targetChange = int(True) eventChange = int(False) if event1 != event2: eventChange = int(True) # Starting over no matter what # Based off of curr, update the log curr['targetChange'] = targetChange curr['typeChange'] = eventChange curr['intervalCount'] = len( intervalLog) # some number maybe 0 if len(intervalLog) >= 2: # Calculate duration curr['duration'] = intervalLog[-1:]['clientTime'] - \ intervalLog[0]['clientTime'] else: curr['duration'] = 0 newData.append(curr) intervalLog = [] # else: # # They are the same # targetChange = int(False) # eventChange = int(False) # if event1 != event2: # eventChange = int(True) # # starting over # curr['targetChange'] = targetChange # curr['typeChange'] = eventChange # curr['intervalCount'] = len(intervalLog) # # if len(intervalLog) >= 2: # # # Calculate duration # # curr['duration'] = intervalLog[-1:]['clientTime'] - \ # # intervalLog[0]['clientTime'] # # else: # # curr['duration'] = 0 # newData.append(curr) # intervalLog = [] # else: # # increase counter # intervalLog.append(curr) intervalSessions[sessionID] = newData # return intervalSessions newSessions = [] # Generate all edges tied to a user # [ edge list, edge list, ... ] for k, v in intervalSessions.items(): pairs = pairwise(v) # list of edges for a user newSessions.append(pairs) # Node Map node_list = [] # Need to keep 0-based index for sankey diagram links = [] # Aggregate sequence list node_map = [] # Final node map {"name": "foo", "id": 0"} # Align the sequences alignment = itertools.izip_longest(*newSessions) src_ids = {} target_ids = {} for i, step in enumerate(alignment): # print(i) c = collections.Counter() visitedLinks = [] # visitedLinksUnique = set([]) nodenames = set([]) for edge in step: # for a single step look at all links if edge: node1 = edge[0] node2 = edge[1] session = node1['sessionID'] nodename1 = node1['target'] nodename2 = node2['target'] seqID = '%s->%s' % (nodename1, nodename2) #print(seqID) if nodename1 != nodename2: #double check again for self-loops #print(node1) link = { 'sequenceID': seqID, 'sourceName': nodename1, 'targetName': nodename2, 'type': node1['type'], 'duration': node1['duration'], 'pathLength': len(node1['path']) if node1['path'] is not None else 0, 'targetChange': node1['targetChange'], 'typeChange': node1['typeChange'] } visitedLinks.append(link) # Done with visits in a step. 
Now calculate counts counts = collections.Counter(k['sequenceID'] for k in visitedLinks if k.get('sequenceID')) # print(counts) visitedLinksUnique = {v['sequenceID']: v for v in visitedLinks}.values() # print(visitedLinksUnique) # Visit unique links and generate src/targetid if len(node_map) == 0: for link in visitedLinksUnique: # Add all sources if link['sourceName'] not in src_ids: node_map.append({"name": link['sourceName']}) src_ids[link['sourceName']] = len(node_map) - 1 # Add all targets if link['targetName'] not in target_ids: node_map.append({"name": link['targetName']}) target_ids[link['targetName']] = len(node_map) - 1 else: src_ids = target_ids # sources were previous targets target_ids = {} for link in visitedLinksUnique: # Add all sources # if link['sourceName'] not in src_ids.values(): # node_map.append(link['sourceName']) # src_ids[len(node_map)-1] = link['sourceName'] # Add all targets if link['targetName'] not in target_ids: node_map.append({"name": link['targetName']}) target_ids[link['targetName']] = len(node_map) - 1 for link in visitedLinksUnique: # Perform lookup for ids # Perform lookup for counts link['source'] = src_ids[link['sourceName']] link['target'] = target_ids[link['targetName']] link['value'] = counts[link['sequenceID']] links.append(link) # for step in alignment: # # step through every users sequence # c = collections.Counter() # visitedLinks = [] # nodenames = set([]) # # # Process all the edges # for edge in step: # if edge: # node1 = edge[0] # node2 = edge[1] # # nodename1 = node1['target'] # nodename2 = node2['target'] # # # Add src and targetids # nodenames.add(nodename1) # nodenames.add(nodename2) # # # Generate sequence ID # seqID = '%s->%s' % (nodename1, nodename2) # # # @todo Ensure src and target are not the same (self-loop) # if nodename1 != nodename2: # link = { # 'sequenceID': seqID, # 'sourceName': nodename1, # 'targetName': nodename2, # 'type': node1['type'], # # 'duration': node1['duration'], # 'pathLength': len(node1['path']), # 'targetChange': node1['targetChange'], # 'typeChange': node1['typeChange'] # } # visitedLinks.append(link) # # # How many users visited a sequence at this step # counts = collections.Counter(k['sequenceID'] for k in visitedLinks if k.get('sequenceID')) # # print(counts) # # Append into growing node_list # map(lambda x: node_list.append(x), nodenames) # # # map(lambda x: node_map.append({ "name": x} # # "id": len(node_list) - 1 - node_list[::-1].index(x)}), nodenames) # # map(lambda x: node_map.append({ "name": x}), nodenames) # # "id": len(node_list) - 1 - node_list[::-1].index(x)}), nodenames) # for v in visitedLinks: # # Pass through and update count, also generate src and target id # v['value'] = counts[v['sequenceID']] # # Last occurence is the src and target id # v['source'] = len(node_list) -1 - node_list[::-1].index(v['sourceName']) # v['target'] = len(node_list) -1 - node_list[::-1].index(v['targetName']) # links.append(v) # Save everything res = dict() res['histogram'] = generate_bargraph(allTargets) # res['sankey'] = { # # 'sessions': sessions, # 'links': links, # 'nodes': node_map # } res['nodes'] = node_map res['links'] = links res['sessions'] = sessions # with open('sankey.json', 'w') as outfile: # json.dump(res, outfile, sort_keys=False, indent=4) # with open('data.txt', 'w') as outfile: # json.dump(intervalSessions, outfile, indent=4, sort_keys=False) # # with open('query.json', 'w') as outfile: # json.dump(query, outfile, indent=4, sort_keys=False) # Iterate first to get nodes # pairs = 
pairwise(iter) # # nodes = [] # links = [] # for p in pairs: # node1 = p[0]['_source'] # node2 = p[1]['_source'] # # Append nodes to list # nodes.append(node1['target']) # nodes.append(node2['target']) # Iterate again to get edges # pairs = pairwise(iter2) # srcID = targetID = None # for p in pairs: # node1 = p[0]['_source'] # node2 = p[1]['_source'] # # # Append nodes to list # nodes.append(node1['target']) # # nodes.append(node2['target']) # # srcID = len(nodes) - 1 # targetID = len(nodes) # # # if (node1['target'] != node2['target']): # # Append links to list (remove self-loops) # link = { # 'sourceID': srcID, # 'targetID': targetID, # 'sourceName': node1['target'], # 'targetName': node2['target'], # 'type': node1['type'], # 'duration': node1['duration'], # 'value': node1['count'], # 'pathLength': len(node1['path']), # 'targetChange': int(node1['targetChange']), # 'typeChange': int(node1['typeChange']) # } # links.append(link) # # # Get all unique nodes # # node_names = np.unique(nodes).tolist() # node_list = [] # # for indx, name in enumerate(nodes): # n = {'id': indx, 'name': name} # node_list.append(n) # # # Remove self-loops # newLinks = [] # for indx, elem in enumerate(links): # srcID = elem['sourceID'] # targetID = elem['targetID'] # # if srcID != targetID: # newLinks.append(elem) # # return res
from elasticsearch import Elasticsearch
from elasticsearch import helpers

es = Elasticsearch("http://127.0.0.1:9200/")
print(es.info())

helpers.bulk()
helpers.scan()
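# The two helper calls above lack their required arguments as written; the
# following is a hypothetical minimal sketch of how they are typically invoked
# (the "demo-index" name and the generated documents are placeholders, not
# values from the source):
actions = ({"_index": "demo-index", "_source": {"value": i}} for i in range(10))
helpers.bulk(es, actions)  # bulk-index an iterable of action dicts

for hit in helpers.scan(es, index="demo-index", query={"query": {"match_all": {}}}):
    print(hit["_source"])  # scan streams every matching document back out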
# Import the installed library
import Bert_clear_title
from elasticsearch import Elasticsearch, helpers

# Model downloaded from https://www.kaggle.com/terrychanorg/bertcleartitlemodel
TClear = Bert_clear_title.Marker(
    model_path="/mnt/data/dev/model/Bert_clear_title/model/")
TClear.load_model()

es = Elasticsearch('127.0.0.1:9200')
index_v = "terry-index"
index_v = "scrapy_search-2020-11"
doc_type_v = "items"
query = {"query": {"match_all": {}}}
scanResp = helpers.scan(client=es, query=query, scroll="10m", index=index_v,
                        doc_type=doc_type_v, timeout="10m")
items = []
for i, resp in enumerate(scanResp):
    print("\n" * 2)
    qid = resp['_id']
    # print(resp)
    # print(resp['_source']['title'])
    one = TClear.pre(resp['_source']['title'])
    # print(TClear.get_mark_data(one[0]))
    if len(TClear.get_mark_data(one[0])) == 0:
        items.append(resp['_source']['title'])
    else:
es = Elasticsearch( hosts=[{ 'host': os.environ["ES_HOST"], 'port': os.environ["ES_PORT"] }], http_auth=( os.environ["ES_USER"], os.environ["ES_PASS"] ) ) data = pd.DataFrame([thing["_source"] for thing in list( helpers.scan( es, index="assessment", query={"query": {"match_all": {}}} )) ]) for candidate in np.unique(data[['candidate_a', 'candidate_b']]): elo.addPlayer(candidate) for candidate_a, candidate_b, winner in data.values: elo.recordMatch(candidate_a, candidate_b, winner=winner) n_classifiers_options = [32, 64, 128, 256, 512] n_clusters_options = [8, 16, 32, 64, 128, 256] results = pd.DataFrame({ str(n_classifiers): { str(n_clusters): None
import json
import redis
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

es = Elasticsearch("219.224.134.213:9200", timeout=600)
user_keys = [
    "domain", "uid", "importance", "influence", "activity_geo", "uname",
    "hashtag", "fansnum", "tendency", "photo_url", "statusnum", "gender",
    "topic_string", "activeness", "location", "friendsnum",
    "character_sentiment", "character_text"
]
es_scan = scan(es,
               query={"query": {"match_all": {}}, "size": 1000},
               index="user_portrait_1222",
               doc_type="user")
f = open("user_portrait.txt", "wb")
while 1:
    try:
        k = es_scan.next()
        scan_re = k["_source"]
        user_dict = dict()
        for key in user_keys:
            user_dict[key] = scan_re[key]
        f.write(json.dumps(user_dict) + "\n")
    except StopIteration:
        print "all done"
        break
def add_zcta_zip_to_index(index, doc_type, loc_field, id_field, prefix=None): #iterates through zcta zip code polygons, updates index records with zip code id they are contained within #if no lat/lng adds 'NA' to field #input: index name, doc_type #Searches ES collisions data, updates records with gepshape mapping es_url = 'http://%s:%s@%s:9200' % (ES_username, ES_password, ES_url) es = Elasticsearch(es_url) proj = Proj(init='epsg:2263') #NY/Long Island UTM projection if prefix: zip_field1 = prefix + '_ZCTA_ZIP' zip_field2 = prefix + '_ZCTA_ZIP_NoSuffix' else: zip_field1 = 'ZCTA_ZIP' zip_field2 = 'ZCTA_ZIP_NoSuffix' try: mapping = {} mapping['properties'] = {} #set the ZCTA zip fieldmapping mapping['properties'][zip_field1] = {'type': 'string'} mapping['properties'][zip_field2] = {'type': 'string'} #use cURL to put the mapping p = subprocess.Popen([ 'curl', '%s/%s/_mapping/%s' % (es_url, index, doc_type), '-d', '%s' % json.dumps(mapping) ], stderr=subprocess.PIPE) out, err = p.communicate() if err: print '\n' + err except Exception as e: #do not try to recreate the index print "Error creating index:" print e idx = 0 updates = [] for result in helpers.scan(es, index=index, doc_type=doc_type): idx += 1 _id = result['_id'] #Add placeholder for ZCTA zip code result['_source'][zip_field1] = 'NA' result['_source'][zip_field2] = 'NA' if loc_field in result['_source']: query = '''{ "query":{ "bool":{ "must":{"match_all": {}}, "filter":{ "geo_shape":{ "coords":{ "indexed_shape": { "index": "%s", "type": "%s", "id": "%s", "path": "%s" }, "relation": "intersects" } } } } } }''' % (index, doc_type, _id, loc_field) max_area = 0 max_zip = False #query the zip codes, finding all zip shapes that contain the current colision for shape in helpers.scan(es, query=query, index='nyc_zip_codes', doc_type='zip_codes'): coords = [ proj(lng, lat) for lng, lat in shape['_source']['coords'] ['coordinates'][0] ] poly = Polygon(coords) if poly.area > max_area: #get the largest zip code by geographic area max_area = poly.area max_zip = shape['_id'] if max_zip: result['_source'][zip_field1] = max_zip result['_source'][zip_field2] = max_zip.split('-')[0] updates.append(result['_source']) if idx >= 10000: upload_to_Elasticsearch.update_ES_records_curl(updates, index=index, doc_type=doc_type, id_field=id_field) idx = 0 updates = [] #upload the remaining records upload_to_Elasticsearch.update_ES_records_curl(updates, index=index, doc_type=doc_type, id_field=id_field)
def stream_all_docs(self): return helpers.scan(query={ "query": { "match_all": {} } }, scroll="30s", client=self.es, index=self.index_name, request_timeout=30)
def delete_collection_entities(): q = {'query': {'exists': {'field': 'collection_id'}}} for ent in scan(es, query=q, index=es_index, doc_type=[TYPE_ENTITY]): es.delete(index=es_index, doc_type=TYPE_ENTITY, id=ent.get('_id'))
def analyze_git(es_read, es_write, es_read_index, es_write_index, git_enrich, size, incremental): query = {"match_all": {}} sort = [{"metadata__timestamp": {"order": "asc"}}] if incremental.lower() == 'true': search = Search(using=es_write, index=es_write_index) # from:to parameters (=> from: 0, size: 0) search = search[0:0] search = search.aggs.metric('max_date', 'max', field='metadata__timestamp') try: response = search.execute() if response.to_dict()['aggregations']['max_date']['value'] is None: msg = "No data for 'metadata__timestamp' field found in " msg += es_write_index + " index" logging.warning(msg) init_write_index(es_write, es_write_index) else: # Incremental case: retrieve items from last item in ES write index max_date = response.to_dict( )['aggregations']['max_date']['value_as_string'] max_date = date_parser.parse(max_date).isoformat() logging.info("Starting retrieval from: " + max_date) query = {"range": {"metadata__timestamp": {"gte": max_date}}} except NotFoundError: logging.warning("Index not found: " + es_write_index) init_write_index(es_write, es_write_index) else: init_write_index(es_write, es_write_index) search_query = {"query": query, "sort": sort} logging.info(search_query) logging.info("Start reading items...") commits = [] cont = 0 for hit in helpers.scan(es_read, search_query, scroll='300m', index=es_read_index, preserve_order=True): cont = cont + 1 item = hit["_source"] commits.append(item) logging.debug("[Hit] metadata__timestamp: " + item['metadata__timestamp']) if cont % size == 0: logging.info("Total Items read: " + str(cont)) events_df = eventize_and_enrich(commits, git_enrich) upload_data(events_df, es_write_index, es_write) commits = [] events_df = None # In case we have some commits pending, process them if len(commits) > 0: logging.info("Total Items read: " + str(cont)) events_df = eventize_and_enrich(commits, git_enrich) upload_data(events_df, es_write_index, es_write)
body: dict = {"query": {"bool": {}}} if filters: filter_clause = [] for key, values in filters.items(): filter_clause.append( { "terms": {key: values} } ) body["query"]["bool"]["filter"] = filter_clause if only_documents_without_embedding: body["query"]["bool"] = {"must_not": {"exists": {"field": self.embedding_field}}} result = scan(self.client, query=body, index=index, size=batch_size, scroll="1d") yield from result def query( self, query: Optional[str], filters: Optional[Dict[str, List[str]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, ) -> List[Document]: """ Scan through documents in DocumentStore and return a small number documents that are most relevant to the query as defined by the BM25 algorithm. :param query: The query
delete_index() if args.create: create_index() if args.insert: insert(args.insert[0], args.update, args.expires, args.depth) if args.kexport: # Export dashboards, searches, and visualizations. kibana_export = list( scan(client=es, index='.kibana', doc_type='dashboard,search,visualization')) # Export the ipwhois index pattern. kibana_idx_export = list( scan(client=es, index='.kibana', doc_type='index-pattern', query={'query': { 'match': { '_id': 'ipwhois' } }})) # Dump exports to json file. with io.open(args.kexport[0], 'w') as data_file: