def consensus(offset=60):
    """
    Check for 'eth.chain.new_head' messages and return the max number of clients
    that had the same head during the last `offset` seconds.
    """
    s = Search(client)
    # s = s.query(Q('match', message='eth.chain.new_head'))
    s = s.filter('exists', field='json_message.eth.chain.new_head.block_number')
    s = s.sort({'json_message.eth.chain.new_head.ts': {'order': 'desc', 'ignore_unmapped': 'true'}})
    response = s.execute()
    # Get latest block number
    x = max(hit['_source']['json_message']['eth.chain.new_head']['block_number']
            for hit in response.hits.hits)
    # By default, the buckets are ordered by their doc_count descending
    # s.aggs.bucket('by_block_hash', 'terms', field='json_message.eth.chain.new_head.block_hash', size=3)

    # Reach consensus around latest block number
    s = Search(client)
    s = s.filter(time_range_filter(field="json_message.eth.chain.new_head.ts", offset=offset))
    s.aggs.bucket('latest', 'range', field='json_message.eth.chain.new_head.block_number',
                  ranges=[{"from": x - 1, "to": x + 1}]).bucket(
        'by_block_hash', 'terms', field='json_message.eth.chain.new_head.block_hash', size=3)
    # s = s[10:10]
    response = s.execute()
    # pprint(response)
    if response:
        return max(tag.doc_count for tag in response.aggregations.latest.buckets[0].by_block_hash.buckets)
    else:
        return 0

def build_query(self, start_date, end_date, **kwargs):
    """Build the elasticsearch query."""
    agg_query = Search(using=self.client,
                       index=self.index,
                       doc_type=self.doc_type)[0:0]
    if start_date is not None or end_date is not None:
        time_range = {}
        if start_date is not None:
            time_range['gte'] = start_date.isoformat()
        if end_date is not None:
            time_range['lte'] = end_date.isoformat()
        agg_query = agg_query.filter(
            'range', **{self.time_field: time_range})

    term_agg = agg_query.aggs
    for term in self.aggregated_fields:
        term_agg = term_agg.bucket(term, 'terms', field=term, size=0)
    term_agg.metric('total', 'sum', field='count')

    if self.copy_fields:
        term_agg.metric(
            'top_hit', 'top_hits', size=1, sort={'timestamp': 'desc'}
        )

    for query_param, filtered_field in self.required_filters.items():
        if query_param in kwargs:
            agg_query = agg_query.filter(
                'term', **{filtered_field: kwargs[query_param]}
            )

    return agg_query

def hotspots(self, family, name):
    query = Search(using=self.client, index=family)
    if name:
        query = query.filter('term', name=name)
    query = query.filter('range', timestamp={'gte': self.lookback})
    query.aggs.bucket('hotspot', 'geohash_grid', field='location', precision=7)
    hashes = query[0].execute().aggregations['hotspot']['buckets'][:3]
    return [Geohash.decode_exactly(hash['key'])[:2] for hash in hashes]

def search_jobs():
    s = Search(using=client)
    s = s.filter('term', _index='can_tenant_chouun')
    s = s.filter('term', _type='job')
    s = s.filter('term', id=12)
    resp = s.execute()
    for hit in resp:
        analysis = hit['analysis']
        print(type(analysis))
        print(dir(analysis))

def get_rev_links(self, model, rel, *item_types):
    search = Search(using=self.es)
    search = search.extra(size=SEARCH_MAX)
    # rel links use '~' instead of '.' due to ES field restraints
    proc_rel = rel.replace('.', '~')
    # had to use ** kw notation because of variable in field name
    search = search.filter('term', **{'links.' + proc_rel: str(model.uuid)})
    if item_types:
        search = search.filter('terms', item_type=item_types)
    hits = search.execute()
    return [hit.to_dict().get('uuid', hit.to_dict().get('_id')) for hit in hits]

def fetch(session):
    s = Search(client)
    s = s.filter('bool', should=[F('term', message='p2p.disconnected'),
                                 F('term', message='p2p.connected')])
    s = s.filter('range', **{'@timestamp': dict(gte=session['start'], lte=session['stop'])})
    s = s.fields(['json_message.p2p.connected.remote_id', 'guid', 'message', '@timestamp'])
    s = s[0:100000]
    # s = s[0:10]
    s = s.sort('@timestamp')
    response = s.execute()
    return response

def test_connections(clients):
    assert_connected(minconnected=len(clients), minpeers=len(clients) - 2)
    guids = [nodeid_tool.topub(ext_id.encode('utf-8')) for ext_id in clients]
    for guid in guids:
        s = Search(client)
        s = s.filter(F('term', at_message='p2p.connected'))
        s = s.filter(F('term', guid=guid))
        s = s.filter(F('term', remote_id=guid))
        response = s.execute()
        # pprint(response)
        assert response.hits.total == 0, 'a client is connected to itself'
    print 'PASS: no client is connected to itself'

def search(self, **params):
    limit_cat = params.get('cat', "").strip()
    limit_forum = params.get('forum', "").strip()
    limit_count = int(params.get('count', 100))
    limit_size_min = human2bytes(params.get('min', "0b"))
    limit_size_max = human2bytes(params.get('max', "0b"))
    limit_wild = int(params.get('wild', 0))
    arg = params.get('query', '').strip()
    if not arg:
        arg = "hobbit"
    s = Search(using=es, index=ela_index)
    if limit_size_min:
        s = s.filter("range", size={'gte': limit_size_min})
    if limit_size_max:
        s = s.filter("range", size={'lte': limit_size_max})
    arg = arg.split(' ')
    if limit_wild:
        q = Q("wildcard", name="*" + arg.pop(0) + "*")
        for a in arg:
            q = q & Q("wildcard", name="*" + a + "*")
    else:
        q = Q("match", name=arg.pop(0))
        for a in arg:
            q = q & Q("match", name=a)
    if len(limit_cat):
        for a in limit_cat.split(' '):
            q = q & Q("match", category=a)
    if len(limit_forum):
        for a in limit_forum.split(' '):
            q = q & Q("match", forum=a)
    s = s.query(q)
    # cherrypy.log("query is " + str(s.to_dict()))
    r = s.execute()
    size = r.hits.total
    # cherrypy.log("query have " + str(size) + " elements")
    if size > limit_count:
        size = limit_count
    s = s.sort('-size')
    s = s.extra(size=size)
    r = s.execute()
    data = []
    for b in r:
        a = [b.id, b.size, b.name, b.category, b.forum, b.date[0] if b.date else '', b.hash]
        data.append(a)
    return {'data': data}

def get_all_cans(index, estype=Types.candidate, fields=['id'], status=1, at_most=10000):
    s = Search(using=client)
    s = s.filter('term', _index=index)
    s = s.filter('term', _type=estype)
    s = s.filter('term', status=status)
    s = s.source(include=fields)
    s = s[:at_most]
    resp = s.execute()
    # print(resp.took)
    # print(resp.hits.total)
    return [hit['id'] for hit in resp]

def test_connections(clients):
    len_clients = len(clients)
    min_peers = len_clients if len_clients <= 3 else 3
    assert_connected(minconnected=len_clients, minpeers=min_peers, offset=offset)
    guids = [nodeid_tool.topub(ext_id.encode('utf-8')) for ext_id in clients]
    for guid in guids:
        s = Search(client)
        s = s.filter('exists', field='json_message.p2p.connected.ts')
        s = s.filter(F('term', guid=guid))
        s = s.filter(F('term', remote_id=guid))
        response = s.execute()
        # pprint(response)
        assert response.hits.total == 0, 'a client is connected to itself'
    print 'PASS: no client is connected to itself'

def es_read(self, log_id, offset):
    """
    Returns the logs matching log_id in Elasticsearch and next offset.
    Returns '' if no log is found or there was an error.

    :param log_id: the log_id of the log to read.
    :type log_id: str
    :param offset: the offset start to read log from.
    :type offset: str
    """
    # Offset is the unique key for sorting logs given log_id.
    s = Search(using=self.client) \
        .query('match', log_id=log_id) \
        .sort('offset')
    s = s.filter('range', offset={'gt': offset})

    logs = []
    if s.count() != 0:
        try:
            logs = s[self.MAX_LINE_PER_PAGE * self.PAGE:self.MAX_LINE_PER_PAGE] \
                .execute()
        except Exception as e:
            msg = 'Could not read log with log_id: {}, ' \
                  'error: {}'.format(log_id, str(e))
            self.log.exception(msg)
    return logs

def get_by_unique_key(self, unique_key, name):
    term = 'unique_keys.' + unique_key
    # had to use ** kw notation because of variable in field name
    search = Search(using=self.es)
    search = search.filter('term', **{term: name})
    search = search.extra(version=True)
    return self._one(search)

def search(self):
    self.reindex(Addon)
    qs = Search(using=amo.search.get_es(),
                index=AddonIndexer.get_index_alias(),
                doc_type=AddonIndexer.get_doctype_name())
    return qs.filter('term', id=self.addon.pk).execute()[0]

def find(self, region, account, start, end):
    s = Search(using=self.es,
               index=app.config['ELASTICSEARCH_INDEX'],
               doc_type=app.config['ELASTICSEARCH_TYPE'])
    s = s.filter('term', region=region)
    if account is not None:
        s = s.filter('term', account=account)
    s = s.filter('range', date={
        'gte': parse(start).date().isoformat(),
        'lte': parse(end).date().isoformat()
    })[0:0]
    s.aggs.bucket('by_project', 'terms', field='projectid.raw', size=0) \
        .bucket('by_type', 'terms', field='usagetype.raw') \
        .bucket('by_offering', 'terms', field='offeringid.raw') \
        .metric('rawusage_sum', 'sum', field='rawusage')
    return s.execute().aggregations.to_dict()

def assert_started(minstarted, offset=90):
    """Assert that at least `minstarted` clients logged a 'starting' event."""
    """
    "starting": {
        "comment": "one of the first log events, before any operation is started",
        "client_impl": "Impl/OS/version, e.g. Go/Linux/0.8.2",
        "eth_version": "int, e.g. 52",
        "ts": "YYYY-MM-DDTHH:MM:SS.SSSSSSZ"
    }
    """
    s = Search(client)
    s = s.filter(time_range_filter(field="json_message.starting.ts", offset=offset))
    s.aggs.bucket('by_host', 'terms', field='syslog_hostname.raw', size=0)
    response = s.execute()
    # pprint(response)
    print "passed for:"
    for tag in response.aggregations.by_host.buckets:
        print ' %s' % tag.key  # ip_from_guid(tag.key)
    num_started = len(response.aggregations.by_host.buckets)
    assert num_started >= minstarted, 'only %d (of at least %d) clients started' % (num_started, minstarted)
    for tag in response.aggregations.by_host.buckets:
        assert tag.doc_count == 1, 'client %s started more than once' % tag.key  # ip_from_guid(tag.key)

def search(self, **params):
    index = params.get('index', self.index)
    search = Search(using=self.client, index=index)

    page = params.get('page', None)
    per_page = params.get('per_page', None)
    if page and per_page:
        page = page - 1
        search._extra = {'from': page, 'size': per_page}

    sort = params.get('sort', None)
    if sort and sort.replace('-', '') in ['created_at', 'level']:
        search = search.sort(sort)

    date_filter = self._filter_by_date_interval(params)
    if date_filter:
        search = search.filter(date_filter)

    level = params.get('group_by', None)
    if level:
        search = search.query('match', level=level)

    hits = search.execute()

    format = params.get('format', 'object')
    if format == 'dict':
        return self._to_dict(hits)
    else:
        return self._to_logs(hits)

def process(self, start_time: datetime, end_time: datetime, input: DataFrame):
    logger.debug('Start: %s End: %s Log: index=%s fields=%s' %
                 (start_time.isoformat(), end_time.isoformat(), str(self.indices), str(self.fields)))
    search = Search(using=self.client, index=self.indices[0])
    search = search.filter(Range(**{'@timestamp': {'gte': start_time.isoformat(),
                                                   'lte': end_time.isoformat()}}))
    for k, v in self.fields.items():
        if isinstance(v, list):
            for sv in v:
                search = search.query("match", **{k: sv})
        else:
            search = search.query("match", **{k: v})
    logger.debug('ES Query: %s' % str(search.to_dict()))
    response = search.execute()
    logger.debug('Results: success:%d failed:%d hits:%d' %
                 (response._shards.successful, response._shards.failed, len(response.hits)))
    for hit in response:
        # filter out the meta key and flatten the values
        row = {k: str(hit[k]) for k in hit if k != 'meta'}
        logger.debug(row)
        input = input.append(row, ignore_index=True)
    return input

def get_by_json(self, key, value, item_type, default=None):
    # find the term with the specific type
    term = 'embedded.' + key + '.raw'
    search = Search(using=self.es)
    search = search.filter('term', **{term: value})
    search = search.filter('type', value=item_type)
    return self._one(search)

def search(q=None, tags=None, events_only=None, accounts=None, location=None,
           latitude=None, longitude=None, distance=None, distance_switch=False, **kwargs):
    es = get_es()
    queryset = Search(using=es).index(ELASTICSEARCH_INDEX_NAME)
    # Exclude events from the past
    queryset = queryset.filter(
        F("bool", should=[F({"range": {"event_end": {"lte": "now"}}}),
                          F({"missing": {"field": "event_end"}})])
    )
    queryset = filter_by_query(queryset, q=q)
    queryset = filter_by_tags(queryset, tags=tags)
    queryset = filter_by_events_only(queryset, events_only=events_only)
    queryset = filter_by_accounts(queryset, accounts=accounts)
    queryset = filter_by_location(queryset, location=location, latitude=latitude,
                                  longitude=longitude, distance=distance,
                                  distance_switch=distance_switch)
    return queryset

def session_times():
    # {"@fields": {}, "@timestamp": "2015-02-23T17:03:41.738412Z", "@source_host": "newair.brainbot.com", "@message": "scenario.p2p_connect.started"}
    start_message = 'scenario.p2p_connect.starting.clients.sequentially'
    stop_message = 'scenario.p2p_connect.stopping.clients'
    s = Search(client)
    s = s.filter('bool', should=[F('term', message=start_message),
                                 F('term', message=stop_message)])
    s = s.fields(['message', '@timestamp'])
    s = s[0:100000]
    s = s.sort('-@timestamp')  # desc, we want the latest events
    response = s.execute()
    events = []  # youngest to oldest, last should be a stop message
    for h in response:
        msg = 'start' if h['message'][0] == start_message else 'stop'
        ts = h['@timestamp'][0]
        events.append((msg, ts))
    assert not events or events[0][0] == 'stop'
    sessions = []
    while len(events) >= 2:
        stop = events.pop()
        start = events.pop()
        sessions.append(dict([start, stop]))
    return list(reversed(sessions))

def get_journals_by_collection_institution(collection_acronym, page_from=0, page_size=1000):
    search = Search(index=INDEX).query(
        "nested", path="collections",
        query=Q("match", collections__acronym=collection_acronym))
    search = search.filter("exists", field="sponsors")
    search = search[page_from:page_size]
    search_response = search.execute()

    meta = {
        'total': search_response.hits.total,
    }

    sponsors = {}
    for journal in search_response:
        j = {'jid': journal.jid,
             'title': journal.title,
             'current_status': journal.current_status,
             'last_issue': journal.last_issue,
             'issue_count': journal.issue_count}
        for sponsor in journal['sponsors']:
            sponsors.setdefault(sponsor, []).append(j)

    result = {
        'meta': meta,
        'objects': sponsors
    }
    return result

def search():
    q = request.args.get('q')
    # resp = es.search(index='hoe', doc_type='record', q=q, body=aggs)
    # logging.info(q)
    s = Search(using=es, index='hoe', doc_type='record')
    s.aggs.bucket('library_place', 'terms', field='library-place')
    s.aggs.bucket('type', 'terms', field='type')
    s.aggs.bucket('genre', 'terms', field='genre')
    s.aggs.bucket('keywords', 'terms', field='keywords.label')
    s.aggs.bucket('author', 'terms', field='author.literal')
    s.query = Q('multi_match', query=q, fields=['_all'])
    filters = []
    if 'filter' in request.args:
        filters = request.args.getlist('filter')
        logging.info(filters)
    for filter in filters:
        cat, val = filter.split(':')
        cat = cat.replace('_', '-')
        filter_dict = {}
        filter_dict.setdefault(cat, val)
        logging.info(cat)
        s.filter = F('term', **filter_dict)
    # if request.args
    resp = s.execute()
    # logging.info(resp)
    # logging.info(resp.aggregations.per_category.buckets)
    return render_template('resultlist.html',
                           records=resp.to_dict().get('hits'),
                           facets=resp.aggregations.to_dict(),
                           header=q, query=q, filters=filters)

def get_all_job_cans(index, estype=Types.job_candidate, fields=['id', 'job', 'candidate'],
                     status=None, at_most=10000):
    s = Search(using=client)
    s = s.filter('term', _index=index)
    s = s.filter('term', _type=estype)
    if status:
        s = s.filter('term', status=status)
    s = s.source(include=fields)
    s = s[:at_most]
    resp = s.execute()
    # print(resp.took)
    # print(resp.hits.total)
    return [{'id': hit['id'], 'job_id': hit['job'], 'can_id': hit['candidate']} for hit in resp]

def assert_mining(minmining, offset=300):
    """
    assert that at least `minmining` clients have started mining and mined a block
    """
    s = Search(client)
    s = s.filter(F('term', message='eth.miner.new_block'))
    s = s.filter(time_range_filter(offset=offset))
    s.aggs.bucket('by_host', 'terms', field='syslog_hostname.raw', size=0)
    response = s.execute()
    # pprint(response)
    print "passed for: "
    for tag in response.aggregations.by_host.buckets:
        print ' %s, blocks mined: %d' % (tag.key, tag.doc_count)  # ip_from_guid(tag.key)
    num_mining = len(response.aggregations.by_host.buckets)
    assert num_mining >= minmining, 'only %d clients mining, expected at least %d' % (num_mining, minmining)

def tx_propagation(client_count, offset=10):
    """
    Check for 'eth.tx.received' messages and return the max number of clients
    that had the same tx during the last `offset` seconds.
    """
    s = Search(client)
    # s = s.query(Q("match", message='eth.tx.received'))
    s = s.filter('exists', field='json_message.eth.tx.received.tx_hash')
    s = s.filter(time_range_filter(field="json_message.eth.tx.received.ts", offset=offset))
    s.aggs.bucket('by_tx', 'terms', field='json_message.eth.tx.received.tx_hash', size=client_count)
    # s = s[0:1000]
    response = s.execute()
    if response:
        return max(tag.doc_count for tag in response.aggregations.by_tx.buckets)
    else:
        return 0

def get_last_day_top(top_len=50):
    OFFSET = '+8h'  # TODO: didn't figure out why
    s = Search().using(client)
    time_range = {'gte': 'now-1d' + OFFSET, 'lte': 'now' + OFFSET}
    s_q = s.filter('range', timestamp=time_range).sort('-likes')[:100]
    r = s_q.execute()
    return r.hits.hits

def search(self, query: str, filters: dict = None, only_this_type: bool = True,
           **kwargs: dict) -> list:
    """performs a search against elasticsearch and then pulls the corresponding data from the db

    :param query: query terms to search by
    :param filters: named (attribute, value) filters to limit the query results
    :param kwargs: additional search keyword arguments
    :return: a list of models with an additional `__score` value added
    """
    # build base search object
    s = Search(using=self.indexer.es).index(self.indexer.index_name)
    if only_this_type:
        s = s.doc_type(self.indexer.doc_type_name)
    # build query
    s = s.query('match', _all=query)
    # add filter
    if filters is not None:
        for attr, value in filters.items():
            s = s.filter(F({'term': {attr: value}}))
    # execute query
    res = s.execute()
    # build up django query
    results = {}
    for hit in res:
        # get the model
        dj_type = hit._meta.doc_type
        model = get_model(dj_type)
        # get the pk
        pk_name = model._meta.pk.name
        pk = getattr(hit, pk_name)
        # get the score
        score = hit._meta.score
        # add to mapping
        results.setdefault(model, {})
        results[model][pk] = score
    # get queryset
    querysets = []
    for model, pk_score in results.items():
        qs = model.objects.filter(pk__in=pk_score.keys())
        querysets += list(qs)
    # attach scores to instances
    for instance in querysets:
        score = results[type(instance)][instance.pk]
        instance._meta.es_score = score
    # order by score
    querysets = sorted(querysets, key=lambda i: i._meta.es_score, reverse=True)
    # return
    return querysets

def create_search_obj(user, search_param_dict=None, filter_on_email_optin=False):
    """
    Creates a search object and prepares it with metadata and query parameters
    that we want to apply for all ES requests

    Args:
        user (User): User object
        search_param_dict (dict): A dict representing the body of an ES query
        filter_on_email_optin (bool): If true, filter out profiles where email_optin != True

    Returns:
        Search: elasticsearch_dsl Search object
    """
    staff_program_ids = get_advance_searchable_program_ids(user)
    is_advance_search_capable = bool(staff_program_ids)
    index_type = PRIVATE_ENROLLMENT_INDEX_TYPE if is_advance_search_capable else PUBLIC_ENROLLMENT_INDEX_TYPE
    index = get_default_alias(index_type)
    search_obj = Search(index=index)
    # Update from search params first so our server-side filtering will overwrite it if necessary
    if search_param_dict is not None:
        search_obj.update_from_dict(search_param_dict)

    if not is_advance_search_capable:
        # Learners can't search for other learners with privacy set to private
        search_obj = search_obj.filter(
            ~Q('term', **{'profile.account_privacy': Profile.PRIVATE})  # pylint: disable=invalid-unary-operand-type
        )

    # Limit results to one of the programs the user is staff on
    search_obj = search_obj.filter(create_program_limit_query(
        user,
        staff_program_ids,
        filter_on_email_optin=filter_on_email_optin
    ))
    # Filter so that only filled_out profiles are seen
    search_obj = search_obj.filter(
        Q('term', **{'profile.filled_out': True})
    )

    # Force size to be the one we set on the server
    update_dict = {'size': settings.ELASTICSEARCH_DEFAULT_PAGE_SIZE}
    if search_param_dict is not None and search_param_dict.get('from') is not None:
        update_dict['from'] = search_param_dict['from']
    search_obj.update_from_dict(update_dict)
    return search_obj

def query(self):
    search_obj = Search()
    for f in self.filters:
        search_obj = search_obj.filter(f)
    for q in self.queries:
        search_obj = search_obj.query(q)
    return search_obj.to_dict()

def aggregated_search(self, search_query, indices, aggregations, size, request_timeout):
    search_obj = Search(using=self.es_connection, index=indices).params(
        size=size, request_timeout=request_timeout)
    query_obj = search_obj.filter(search_query)
    for aggregation in aggregations:
        query_obj.aggs.bucket(name=aggregation.to_dict()['terms']['field'], agg_type=aggregation)
    results = query_obj.execute()
    result_set = AggregatedResults(results)
    return result_set

def page_detail(id):
    try:
        # search the document based on its meta id
        s = Search(using=es)
        s = s.index('job_index')
        s = s.filter('term', _id=id)
        ret = s.execute()
        job = get_job_detail(ret.hits[0].to_dict(), id)
        return render_template('detail.html', job=job)
    except KeyError:
        return "Problem"

def _queryElasticsearch(self, from_date, to_date, query):
    logging.debug("Connecting to ES")
    client = Elasticsearch([self._config['ElasticSearch']['uri']])
    logging.debug("Beginning search")
    s = Search(using=client, index=self._config['ElasticSearch']['raw_index'])
    s = s.filter('range', **{'EndTime': {'from': from_date, 'to': to_date}})
    logging.debug("About to execute query:\n%s" % str(s.to_dict()))
    for hit in s.scan():
        yield hit

def serialize(self):
    self.reindex(Addon)
    qs = Search(using=amo.search.get_es(),
                index=AddonIndexer.get_index_alias(),
                doc_type=AddonIndexer.get_doctype_name())
    obj = qs.filter('term', id=self.addon.pk).execute()[0]
    with self.assertNumQueries(0):
        serializer = ESAddonSerializer(context={'request': self.request})
        result = serializer.to_representation(obj)
    return result

def get_sensors(self, **kwargs):
    sensors = []
    s = Search(index="sensors", using=self.es)
    # Build filters from kwargs
    for k, v in kwargs.items():
        s = s.filter('wildcard', **{k: v})
    response = s.execute()
    for hit in response:
        sensors.append(hit.to_dict())
    return sensors

def get_tm_index(topic_modelling_name):
    from elasticsearch_dsl import Search, Q
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_TOPIC_MODELLING

    print("!!!", "Get topic model")
    ss = Search(using=ES_CLIENT, index=ES_INDEX_TOPIC_MODELLING)
    ss = ss.query(
        Q("term", name=topic_modelling_name) | Q("term", **{"name.keyword": topic_modelling_name}))
    ss = ss.filter("term", is_ready=True)
    tm_index = ss.source(['number_of_topics', 'name']).execute()[0]
    return tm_index

def get_session_id(esindex, estype, name, gender):
    esconn = es_object.connection
    s = Search(using=esconn, index=esindex, doc_type=estype)
    s = s.filter("term", gender=gender.lower()).query(
        "match_phrase", name=stripSpaces(str(name)).title())
    response = s.execute()
    try:
        hit = response.hits[0]
        return hit.meta.id
    except IndexError:
        return None

def test_filter_org(self):
    """Test adding the organization name inclusion filter."""
    s = Search()
    s.filter = MagicMock(return_value='test')
    result = esc.filter_org(s, esc.UNKNOWN_ORG_NAME)
    s.filter.assert_called_with('term', author_org_name=esc.UNKNOWN_ORG_NAME)
    self.assertEqual(result, 'test')

def _filter_licenses(s: Search, licenses):
    """
    Filter out all licenses except for those provided in the `licenses` parameter.
    """
    if not licenses:
        return s
    license_filters = []
    for _license in licenses.split(','):
        license_filters.append(Q('term', license__keyword=_license))
    s = s.filter('bool', should=license_filters, minimum_should_match=1)
    return s

def get_queryset(self):
    s = Search(index=ELASTIC_INDEX)
    title_param = self.request.query_params.get('q', None)
    genre_param = self.request.query_params.get('g', None)
    title_query = _field_query(title_param, 'title')
    genre_query = _field_query(genre_param, 'genre')
    return [
        i.__dict__['_d_'] for i in s.filter(title_query & genre_query).execute()
    ]

def delete(self, start_date=None, end_date=None):
    """Delete aggregation documents."""
    aggs_query = Search(
        using=self.client,
        index=self.aggregation_alias,
        doc_type=self.aggregation_doc_type).extra(_source=False)

    range_args = {}
    if start_date:
        range_args['gte'] = format_range_dt(
            start_date.replace(microsecond=0), self.aggregation_interval)
    if end_date:
        range_args['lte'] = format_range_dt(
            end_date.replace(microsecond=0), self.aggregation_interval)
    if range_args:
        aggs_query = aggs_query.filter('range', timestamp=range_args)

    bookmarks_query = Search(
        using=self.client,
        index=self.bookmark_api.bookmark_index,
    ).sort({'date': {'order': 'desc'}})
    if range_args:
        bookmarks_query = bookmarks_query.filter('range', date=range_args)

    def _delete_actions():
        for query in (aggs_query, bookmarks_query):
            affected_indices = set()
            for doc in query.scan():
                affected_indices.add(doc.meta.index)
                yield dict(_index=doc.meta.index,
                           _op_type='delete',
                           _id=doc.meta.id,
                           _type=doc.meta.doc_type)
            current_search_client.indices.flush(
                index=','.join(affected_indices), wait_if_ongoing=True)

    bulk(self.client, _delete_actions(), refresh=True)

def main():
    parser = argparse.ArgumentParser(description='Download items from ES index')
    arg = parser.add_argument
    arg('output', help='output in .jl.gz format')
    arg('index', help='ES index name')
    arg('--domain', help='url.domain to filter')
    arg('--id', help='record id')
    arg('--host', default='localhost', help='ES host in host[:port] format')
    arg('--user', help='HTTP Basic Auth user')
    arg('--password', help='HTTP Basic Auth password')
    arg('--chunk-size', type=int, default=100, help='download chunk size')
    args = parser.parse_args()

    kwargs = {}
    if args.user or args.password:
        kwargs['http_auth'] = (args.user, args.password)
    client = elasticsearch.Elasticsearch(
        [args.host],
        connection_class=elasticsearch.RequestsHttpConnection,
        timeout=600,
        **kwargs)
    print(client.info())

    search = Search(using=client, index=args.index)
    if args.domain:
        search = search.filter('term', **{'url.domain': args.domain})
    if args.id:
        search = search.filter('term', **{'_id': args.id})

    total = 0
    with tqdm.tqdm(total=search.count()) as pbar:
        with gzip.open(args.output, 'wt') as f:
            for x in search.params(size=args.chunk_size).scan():
                total += 1
                pbar.update(1)
                f.write(json.dumps(x.to_dict()))
                f.write('\n')
    print('{:,} items downloaded to {}'.format(total, args.output))

def _search(self, query):
    s = Search(using=self.Client, index="winlogbeat-*").query(query)
    if self.DTRange is not None:
        s = s.filter('range', **self.DTRange)
    # Search objects are immutable, so keep the returned copies
    s = s.source(includes=['winlog.*'])
    s = s.sort('-winlog.event_data.UtcTime')
    if self.Scan:
        return s.scan()
    else:
        return s.execute().hits

def search(self, cls: AbstractText, query: str = None,
           start_date_str: str = None, end_date_str: str = None):
    s = Search(using=self.client, index=cls.index)
    if query:
        s = s.query('multi_match', query=query, fields=cls.get_all_fields())
    if start_date_str:
        s = s.filter('range', **{cls.Field.DATE: {'gte': start_date_str}})
    if end_date_str:
        s = s.filter('range', **{cls.Field.DATE: {'lt': end_date_str}})
    try:
        res = s.execute()
        return list(
            map(lambda hit: cls(hit['_source']), res['hits']['hits']))
    except Exception as e:
        raise ElasticsearchException(
            f'failed to search {cls.__name__} for {s.to_dict()}'
        ) from e

def search_my_data(self, username, q, offset, limit):
    search = Search(index='des-files')
    search = search.filter("nested", path="permissions",
                           query=Q("term", permissions__username=username))
    search = search.query("simple_query_string", query=q,
                          fields=["name", "name._exact", "keywords"])
    search = search.query(
        Q('bool', must=[Q({'prefix': {'path._exact': username}})]))
    search = search.filter("term", system='designsafe.storage.default')
    search = search.query(
        Q('bool', must_not=[
            Q({'prefix': {'path._exact': '{}/.Trash'.format(username)}})
        ]))
    logger.info(search.to_dict())
    return search

def build_query(self, interval, start_date, end_date, **kwargs):
    """Build the elasticsearch query."""
    agg_query = Search(using=self.client, index=self.index)[0:0]
    if start_date is not None or end_date is not None:
        time_range = {}
        if start_date is not None:
            time_range['gte'] = start_date.isoformat()
        if end_date is not None:
            time_range['lte'] = end_date.isoformat()
        agg_query = agg_query.filter(
            'range', **{self.time_field: time_range})

    for modifier in self.query_modifiers:
        agg_query = modifier(agg_query, **kwargs)

    base_agg = agg_query.aggs.bucket(
        'histogram',
        'date_histogram',
        field=self.time_field,
        interval=interval
    )

    for destination, (metric, field, opts) in self.metric_fields.items():
        base_agg.metric(destination, metric, field=field, **opts)

    if self.copy_fields:
        base_agg.metric(
            'top_hit', 'top_hits', size=1, sort={'timestamp': 'desc'}
        )

    for query_param, filtered_field in self.required_filters.items():
        if query_param in kwargs:
            agg_query = agg_query.filter(
                'term', **{filtered_field: kwargs[query_param]}
            )

    return agg_query

def default_string_query(self, q, options):
    match = self._parse_interval_query(q)
    if match:
        # interval query
        search = Search()
        if match['query'] != '':
            search = search.query("query_string", query=match['query'])
        search = search.filter('match', chrom=match['chr'])
        assembly = 'hg38' if options.assembly == 'hg38' else 'hg19'
        search = search.filter(
            'range', **{assembly + ".start": {"lte": match['gend']}})
        search = search.filter(
            'range', **{assembly + ".end": {"gte": match['gstart']}})
    else:
        # default query
        search = super().default_string_query(q, options)
    return search

def get_access_search(
    client: Elasticsearch,
    index: str,
    ts_range: TimestampRange = None,
    prefixes: typing.Sequence[str] = None,
    timestamp_field: str = 'timestamp',
) -> Search:
    search = Search(using=client, index=index)
    search = filter_url_by_prefixes(search, prefixes)
    range_filter = get_range_filter(ts_range, timestamp_field)
    if range_filter:
        search = search.filter(range_filter)
    return search

def infoTopApp(self):
    s = Search(index='ossim-osdepym*')
    s = s.query('match_all')
    s = s.filter('range', log_date={
        "gte": 1554087600000,
        "lte": 1556679599999
    })
    s.aggs.bucket(
        'users',
        A('terms', field='app.keyword', size=5, order={"_count": "desc"}))
    return s.execute().aggregations.users.buckets

def test_add_date_filter_min_date(self):
    """Test that `add_date_filter` adds a range filter with `start_date`."""
    s = Search()
    s.filter = MagicMock(return_value='test')
    min_date = '2018-01-01'
    result = esc.add_date_filter(s, start_date=min_date)
    s.filter.assert_called_with('range', grimoire_creation_date={'gt': min_date})
    self.assertEqual(result, 'test')

def post(self):
    json = request.get_json()
    s = Search(using=es, index='twitter', doc_type='items')
    username = json.pop('username') if 'username' in json else None
    following = True
    if 'following' in json:
        following = json.pop('following')
    else:
        if username:
            following = True
    timestamp = json.pop('timestamp') if 'timestamp' in json else time()
    search = 'q' in json
    limit = json.pop('limit') if 'limit' in json and json['limit'] <= 100 else 50
    following_list = db.user.find_one({'username': session['username']})['following']  # do we only need this if following=true?
    # query = {'timestamp': {'$lte': timestamp}}
    s = s.filter('range', timestamp={'lte': timestamp})
    if search:
        # query['$text'] = {'$search': json['q']}
        s = s.query('match', content=json['q'])
    if username:
        if following:
            # only match the username if it is in the requester's following list
            username = username if username in following_list else ''
            s = s.filter('term', username=username)
        else:
            # query['username'] = username
            s = s.filter('term', username=username)
    else:
        if following:
            # query['username'] = {'$in': following_list}
            s = s.filter('terms', username=following_list)
    # my code
    if 'parent' in json:
        # query['parent'] = json['parent']
        s = s.filter('term', parent=json['parent'])
    if 'replies' not in json:
        json['replies'] = True
    if not json['replies']:
        # query['parent'] = None
        s = s.filter('term', parent=None)
    # end my code
    if 'rank' not in json:
        json['rank'] = 'interest'
    s = s[0:limit]
    if json['rank'] == 'time':
        sort_key = 'timestamp'
        s = s.sort('-timestamp')
    else:
        sort_key = 'interest_score'
        s = s.sort('-interest_score')
    # sort_dir = -1
    # results = db.items.find(query).sort(sort_key, sort_dir).limit(limit)
    # results = db.items.find(filter=query, limit=limit, sort=sort_by)
    # results = db.items.aggregate([{'$match': query}, {'$limit': limit}, {'$sort': sort_by}])
    results = s.execute()
    l = [x['_source'].to_dict() for x in results['hits']['hits']]
    return Response(response=dumps({'status': 'OK', 'items': l}), mimetype='application/json')

def get_search_by_entities_query(
    entities,
    term=None,
    filter_data=None,
    composite_field_mapping=None,
    permission_filters=None,
    ordering=None,
    fields_to_include=None,
    fields_to_exclude=None,
):
    """Performs filtered search for the given term across given entities."""
    filter_data = filter_data or {}
    query = []
    if term != '':
        for entity in entities:
            query.append(_build_term_query(term, fields=entity.SEARCH_FIELDS))

    filters, ranges = _split_range_fields(filter_data)

    # document must match all filters in the list (and)
    must_filter = _build_must_queries(filters, ranges, composite_field_mapping)

    s = Search(
        index=[entity.get_read_alias() for entity in entities],
    ).query(Bool(must=query))

    permission_query = _build_entity_permission_query(permission_filters)
    if permission_query:
        s = s.filter(permission_query)

    s = s.filter(Bool(must=must_filter))
    s = _apply_sorting_to_query(s, ordering)
    return _apply_source_filtering_to_query(
        s,
        fields_to_include=fields_to_include,
        fields_to_exclude=fields_to_exclude,
    )

def tx_list(offset=10):
    """
    Query for 'eth.tx.received' messages during the last `offset` seconds,
    print the hits and return the response.
    """
    s = Search(client)
    s = s.query(Q("match", message='eth.tx.received'))
    s = s.filter(time_range_filter(offset=offset))
    s = s[0:100]
    response = s.execute()
    for hit in response.hits:
        print hit.to_dict()
    return response

def __build_search(self, date_range, project_name=None, org_name=None):
    s = Search(using=self._es_conn, index=self._es_index)
    s = s.filter('range', **date_range)
    if project_name:
        s = s.filter('term', project=project_name)
    if org_name:
        s = s.filter('term', author_org_name=org_name)

    # from:to parameters (=> from: 0, size: 0)
    s = s[0:0]

    # Get author_name and the most recent metadata__timestamp per quarter (one value per quarter
    # should be enough; computing it by user is probably not needed, as we are going to recalculate
    # the whole quarter). We are not keeping all metadata__* fields because we are grouping commits
    # by author, so we can only store one value per author.
    s.aggs.bucket(self.TIMEFRAME, 'date_histogram', field=self._timeframe_field, interval='quarter') \
        .metric(self.LATEST_TS, 'max', field=self._sort_on_field) \
        .bucket(self.AUTHOR_UUID, 'terms', field=self.AUTHOR_UUID, size=1000) \
        .metric(self.CONTRIBUTIONS, 'cardinality', field=self.contribs_field, precision_threshold=40000) \
        .bucket(self.AUTHOR_NAME, 'terms', field=self.AUTHOR_NAME, size=1)

    return s

def get_top_genes_aggregated_filtered_statistics(filters):
    s = Search(using=es, doc_type='genes')
    if 'chr' in filters and len(filters['chr']) > 0 and len(filters['chr']) < 5:
        s = s.filter(
            Q('bool', should=[
                Q('term', chr=chrom if len(chrom) > 3 else 'chr%s' % chrom)
                for chrom in filters['chr']
            ]))
    agg_chr = A("terms", field="chr")
    s.aggs.bucket('chr_count', agg_chr)
    agg_results = s.execute().aggregations
    return agg_results.chr_count.buckets

def getHostBytes(client, starttime, endtime):
    s = Search(using=client, index="htcondor-xfer-stats2-*")
    s = s.filter('range', **{'@timestamp': {'gte': starttime, 'lt': endtime}})
    # Remove records with more than 1 TB of data transferred, bug:
    # https://htcondor-wiki.cs.wisc.edu/index.cgi/tktview?tn=7575,0
    s = s.filter('range', bytes={'from': 0, 'to': 1024**4})

    bkt = s.aggs
    bkt = bkt.bucket('hosts', 'terms', size=MAXSZ, field='host.name.keyword')
    bkt = bkt.metric('Bytes', 'sum', field='bytes')
    bkt = bkt.metric('loss', 'avg', field='lost')

    print(s.to_dict())
    response = s.execute()
    hosts = {}
    for tag in response.aggregations.hosts:
        hosts[tag.key] = {
            'bytes': tag.Bytes.value,
            'bytes_str': convert_gb(tag.Bytes.value),
            'loss': tag.loss.value
        }
    return hosts

def test_filters():
    s = Search()
    s = s.filter('terms', tags=['search', 'python'])
    print(s.to_dict())
    # {'query': {'bool': {'filter': [{'terms': {'tags': ['search', 'python']}}]}}}
    s = s.query('bool', filter=[Q('terms', tags=['search', 'python'])])
    print(s.to_dict())
    # {'query': {'bool': {'filter': [{'terms': {'tags': ['search', 'python']}}]}}}
    s = s.exclude('terms', tags=['search', 'python'])
    # or, equivalently:
    # s = s.query('bool', filter=[~Q('terms', tags=['search', 'python'])])
    print(s.to_dict())

def filter_url_by_prefixes(
    search: Search,
    prefixes: typing.Sequence[str] = None,
    url_field: str = 'url__original',
) -> Search:
    if prefixes:
        prefix, *tail = prefixes
        lookup = {url_field: prefix}
        query = Q('match_bool_prefix', **lookup)
        for prefix in tail:
            lookup[url_field] = prefix
            query = query | Q('match_bool_prefix', **lookup)
        search = search.filter(query)
    return search

def es_issue_count(
    es_client: Any, container_id: str, year: int, volume: str, issue: str
) -> int:
    search = Search(using=es_client, index="fatcat_release")
    search = (
        search.filter("term", container_id=container_id)
        .filter("term", year=year)
        .filter("term", volume=volume)
        .filter("term", issue=issue)
        .extra(request_cache=True)
    )
    search = search.params()
    return search.count()