def funding_by_state(**kwargs):
    topic = kwargs.get("topic")
    element = kwargs.get("element")
    filters = dict(element=element, topic=topic)

    # run query
    s = query.run_query(Q({"match_all": {}}), index=index, filters=filters)

    # aggregations
    a1 = A("nested", path="funding_agencies")
    a2 = A(
        "terms",
        field="funding_agencies.state.keyword",
        size=50,
        order={"_count": "desc"},
    )
    a3 = A("reverse_nested")
    a4 = A(
        "range",
        field="funding",
        ranges=[
            {"from": 0, "to": 100000},
            {"from": 100000, "to": 250000},
            {"from": 250000, "to": 500000},
            {"from": 500000, "to": 750000},
            {"from": 750000, "to": 1000000},
            {"from": 1000000},
        ],
        keyed=True,
    )

    # chain aggregations and execute
    s.aggs \
        .bucket('agencies', a1) \
        .bucket('states', a2) \
        .bucket('reverse', a3) \
        .bucket('fund_amt', a4)
    response = s.execute()

    # filter response: keep only two-letter state codes, first occurrence wins
    res = {}
    for b in response.aggregations.agencies.states.buckets:
        state = b.key
        if len(state) > 2:
            continue
        if state in res:
            continue
        buckets = b.reverse.fund_amt.buckets.to_dict()
        res[state] = buckets
    return res
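# A minimal, self-contained sketch (not part of the original code) showing how
# the chained .bucket() calls above nest inside the request body; the "grants"
# index name is illustrative.
from elasticsearch_dsl import A, Search

s = Search(index="grants")
s.aggs.bucket("agencies", A("nested", path="funding_agencies")) \
    .bucket("states", A("terms", field="funding_agencies.state.keyword", size=50)) \
    .bucket("reverse", A("reverse_nested")) \
    .bucket("fund_amt", A("range", field="funding", keyed=True,
                          ranges=[{"from": 0, "to": 100000}, {"from": 1000000}]))
# each .bucket() call returns the new bucket, so the next call nests inside it
print(s.to_dict()["aggs"])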
def get_users_in_course(
        cls,
        course_id,
        segments=None,
        ignore_segments=None,
        cohort=None,
        enrollment_mode=None,
        text_search=None,
        sort_policies=None,
):
    """
    Construct a search query for all users in `course_id` and return the
    Search object.

    sort_policies is an array, where the first element is the primary sort.
    Elements in the array are dicts with fields: order_by (field to sort by)
    and sort_order (either 'asc' or 'desc'). Defaults to 'username' and 'asc'.

    Raises `ValueError` if both `segments` and `ignore_segments` are provided.
    """
    if not sort_policies:
        sort_policies = [{'order_by': None, 'sort_order': None}]
    # set default sort policy to 'username' and 'asc'
    for field, default in [('order_by', 'username'), ('sort_order', 'asc')]:
        if sort_policies[0][field] is None:
            sort_policies[0][field] = default

    # Error handling
    if segments and ignore_segments:
        raise ValueError(
            'Cannot combine `segments` and `ignore_segments` parameters.')
    for segment in (segments or list()) + (ignore_segments or list()):
        if segment not in learner.SEGMENTS:
            raise ValueError(
                "segments/ignore_segments value '{segment}' must be one of: ({segments})"
                .format(segment=segment,
                        segments=', '.join(learner.SEGMENTS)))

    order_by_options = ('username', 'email', 'discussion_contributions',
                        'problems_attempted', 'problems_completed',
                        'problem_attempts_per_completed',
                        'attempt_ratio_order', 'videos_viewed')
    sort_order_options = ('asc', 'desc')
    for sort_policy in sort_policies:
        if sort_policy['order_by'] not in order_by_options:
            raise ValueError(
                "order_by value '{order_by}' must be one of: ({order_by_options})"
                .format(order_by=sort_policy['order_by'],
                        order_by_options=', '.join(order_by_options)))
        if sort_policy['sort_order'] not in sort_order_options:
            raise ValueError(
                "sort_order value '{sort_order}' must be one of: ({sort_order_options})"
                .format(sort_order=sort_policy['sort_order'],
                        sort_order_options=', '.join(sort_order_options)))

    search = cls.search()
    search.query = Q('bool', must=[Q('term', course_id=course_id)])

    # Filtering/Search
    if segments:
        search.query.must.append(
            Q('bool',
              should=[Q('term', segments=segment) for segment in segments]))
    elif ignore_segments:
        for segment in ignore_segments:
            search = search.query(~Q('term', segments=segment))  # pylint: disable=invalid-unary-operand-type
    if cohort:
        search = search.query('term', cohort=cohort)
    if enrollment_mode:
        search = search.query('term', enrollment_mode=enrollment_mode)
    if text_search:
        search.query.must.append(
            Q('multi_match',
              query=text_search,
              fields=['name', 'username', 'email']))

    # construct the sort hierarchy
    search_request = search.sort(*[
        {
            sort_policy['order_by']: {
                'order': sort_policy['sort_order'],
                # ordering of missing fields
                'missing': '_last' if sort_policy['sort_order'] == 'asc' else '_first'
            }
        }
        for sort_policy in sort_policies
    ])

    return search_request
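# Standalone sketch (index name assumed) of the sort hierarchy built at the
# end of get_users_in_course: one sort clause per policy, with missing values
# pushed past the end of each direction.
from elasticsearch_dsl import Search

sort_policies = [
    {'order_by': 'problems_attempted', 'sort_order': 'desc'},
    {'order_by': 'username', 'sort_order': 'asc'},
]
s = Search(index='learners').sort(*[
    {
        policy['order_by']: {
            'order': policy['sort_order'],
            'missing': '_last' if policy['sort_order'] == 'asc' else '_first',
        }
    }
    for policy in sort_policies
])
print(s.to_dict()['sort'])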
def _Q(self, name_or_query='match', **params):
    """Wrapper around the elasticsearch_dsl Query module, used to create a
    query object.

    :param str name_or_query: the type of the query
    """
    return Q(name_or_query, **params)
class ElasticDB(DB):

    nested_fields = []

    # filters
    flt_empty = Q()

    def __init__(self, url):
        super().__init__()
        self.username = ''
        self.password = ''
        self.hosts = None
        if '@' in url.netloc:
            username, hostname = url.netloc.split('@', 1)
            if ':' in username:
                self.username, self.password = (
                    unquote(val) for val in username.split(':', 1)
                )
            else:
                self.username = unquote(username)
            if hostname:
                self.hosts = [hostname]
        elif url.netloc:
            self.hosts = [url.netloc]
        index_prefix = url.path.lstrip('/')
        if index_prefix:
            self.index_prefix = index_prefix + '-'
        else:
            self.index_prefix = 'ivre-'
        self.params = dict(
            x.split('=', 1) if '=' in x else (x, None)
            for x in url.query.split('&') if x
        )

    def init(self):
        """Initializes the mappings."""
        for idxnum, mapping in enumerate(self.mappings):
            idxname = self.indexes[idxnum]
            self.db_client.indices.delete(
                index=idxname,
                ignore=[400, 404],
            )
            self.db_client.indices.create(
                index=idxname,
                body={
                    "mappings": {
                        "properties": mapping,
                        # Since we do not need full text searches, use
                        # type "keyword" for strings (unless otherwise
                        # specified in mapping) instead of default
                        # (text + keyword)
                        "dynamic_templates": [
                            {"strings": {
                                "match_mapping_type": "string",
                                # prevent RequestError exceptions when
                                # one term's UTF-8 encoding is bigger
                                # than the max length 32766
                                "mapping": {"type": "keyword",
                                            "ignore_above": 32000},
                            }},
                        ],
                    }
                },
            )

    @property
    def db_client(self):
        """The DB connection."""
        try:
            return self._db_client
        except AttributeError:
            self._db_client = Elasticsearch(
                hosts=self.hosts,
                http_auth=(self.username, self.password)
            )
            return self._db_client

    @property
    def server_info(self):
        """Server information."""
        try:
            return self._server_info
        except AttributeError:
            self._server_info = self.db_client.info()
            return self._server_info

    @staticmethod
    def to_binary(data):
        return utils.encode_b64(data).decode()

    @staticmethod
    def from_binary(data):
        return utils.decode_b64(data.encode())

    @staticmethod
    def ip2internal(addr):
        return addr

    @staticmethod
    def internal2ip(addr):
        return addr

    @staticmethod
    def searchnonexistent():
        return Q('match', _id=0)

    @classmethod
    def searchhost(cls, addr, neg=False):
        """Filters (if `neg` == True, filters out) one particular host
        (IP address).
        """
        res = Q('match', addr=addr)
        if neg:
            return ~res
        return res

    @classmethod
    def searchhosts(cls, hosts, neg=False):
        pass

    @staticmethod
    def _get_pattern(regexp):
        # The equivalent to a MongoDB or PostgreSQL search for regexp
        # /Test/ would be /.*Test.*/ in Elasticsearch, while /Test/ in
        # Elasticsearch is equivalent to /^Test$/ in MongoDB or
        # PostgreSQL.
        pattern, flags = utils.regexp2pattern(regexp)
        if flags & ~re.UNICODE:
            # if a flag other than re.UNICODE is set, issue a
            # warning as it will not be used
            utils.LOGGER.warning(
                'Elasticsearch does not support flags in regular '
                'expressions [%r with flags=%r]',
                pattern, flags
            )
        return pattern

    @staticmethod
    def _flt_and(cond1, cond2):
        return cond1 & cond2

    @staticmethod
    def _flt_or(cond1, cond2):
        return cond1 | cond2

    @staticmethod
    def flt2str(flt):
        return json.dumps(flt.to_dict())
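# Sketch of the operator overloading that _flt_and/_flt_or (and ~ for
# negation, used by the search* methods) rely on: elasticsearch_dsl composes
# Q objects into bool queries. Field values are illustrative.
from elasticsearch_dsl import Q

f1 = Q('match', addr='192.0.2.1')
f2 = Q('exists', field='infos.coordinates')
print((f1 & f2).to_dict())   # bool query combining both conditions
print((f1 | f2).to_dict())   # bool query with a should clause
print((~f1).to_dict())       # bool query with a must_not clause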
def get_max(self):
    logger.debug("SearchManager get_max invoked")
    q = Q("match_all")
    return self.__get_queryset(q)
def searchscript(cls, name=None, output=None, values=None, neg=False):
    """Search a particular content in the scripts results."""
    req = []
    if name is not None:
        if isinstance(name, utils.REGEXP_T):
            req.append(Q("regexp",
                         **{"ports.scripts.id": cls._get_pattern(name)}))
        else:
            req.append(Q("match", **{"ports.scripts.id": name}))
    if output is not None:
        if isinstance(output, utils.REGEXP_T):
            req.append(Q("regexp",
                         **{"ports.scripts.output": cls._get_pattern(output)}))
        else:
            req.append(Q("match", **{"ports.scripts.output": output}))
    if values:
        if name is None:
            raise TypeError(".searchscript() needs a `name` arg "
                            "when using a `values` arg")
        subfield = ALIASES_TABLE_ELEMS.get(name, name)
        if isinstance(values, Query):
            req.append(values)
        elif isinstance(values, str):
            req.append(Q("match",
                         **{"ports.scripts.%s" % subfield: values}))
        elif isinstance(values, utils.REGEXP_T):
            req.append(Q("regexp",
                         **{"ports.scripts.%s" % subfield:
                            cls._get_pattern(values)}))
        else:
            for field, value in values.items():
                if isinstance(value, utils.REGEXP_T):
                    req.append(Q("regexp",
                                 **{"ports.scripts.%s.%s" % (subfield, field):
                                    cls._get_pattern(value)}))
                else:
                    req.append(Q("match",
                                 **{"ports.scripts.%s.%s" % (subfield, field):
                                    value}))
    if not req:
        res = Q('nested', path='ports',
                query=Q('nested', path='ports.scripts',
                        query=Q("exists", field="ports.scripts")))
    else:
        query = cls.flt_and(*req)
        res = Q("nested", path="ports",
                query=Q("nested", path="ports.scripts", query=query))
    if neg:
        return ~res
    return res
def searchhost(cls, addr, neg=False):
    """Filters (if `neg` == True, filters out) one particular host
    (IP address).
    """
    res = Q('match', addr=addr)
    if neg:
        return ~res
    return res
def get_top_n_statistics():
    """
    Obtains TOP N DNS statistics.

    :return: JSON with status "ok" or "error" and requested data.
    """
    # Check login
    if not session.logged:
        json_response = '{"status": "Error", "data": "You must be logged in!"}'
        return json_response

    # Check mandatory inputs
    if not (request.get_vars.beginning and request.get_vars.end
            and request.get_vars.type and request.get_vars.number):
        json_response = '{"status": "Error", "data": "A mandatory argument is missing!"}'
        return json_response

    # Parse inputs and set correct format
    beginning = escape(request.get_vars.beginning)
    end = escape(request.get_vars.end)
    type = escape(request.get_vars.type)
    number = int(escape(request.get_vars.number))

    try:
        # Elastic query
        client = elasticsearch.Elasticsearch([{
            'host': myconf.get('consumer.hostname'),
            'port': myconf.get('consumer.port')
        }])
        elastic_bool = []
        elastic_bool.append({'range': {'@timestamp': {'gte': beginning,
                                                      'lte': end}}})
        elastic_bool.append({'term': {'@stat_type': type}})

        # Prepare query
        qx = Q({'bool': {'must': elastic_bool}})

        # Set query according to the statistic type
        if type == "queried_by_ip":
            search_ip = Search(using=client, index='_all').query(qx)
            search_ip.aggs.bucket('all_nested', 'nested', path='data_array') \
                .bucket('by_key', 'terms', field='data_array.key.raw',
                        size=2147483647) \
                .bucket('by_ip', 'terms', field='data_array.ip', size=1,
                        order={'sum_by_ip': 'desc'}) \
                .bucket('sum_by_ip', 'sum', field='data_array.value')
            search_ip.aggs['all_nested']['by_key'].bucket(
                'sum_total', 'sum', field='data_array.value')
            results = search_ip.execute()
        else:
            search_ip = Search(using=client, index='_all').query(qx)
            search_ip.aggs.bucket('all_nested', 'nested', path='data_array') \
                .bucket('by_key', 'terms', field='data_array.key.raw',
                        size=2147483647) \
                .bucket('stats_sum', 'sum', field='data_array.value')
            results = search_ip.execute()

        # Prepare data variable
        data = ""
        # Prepare ordered collection
        counter = collections.Counter()

        if type == "queried_by_ip":
            for record in results.aggregations.all_nested.by_key.buckets:
                top_ip = record.by_ip.buckets[0]
                counter[(record.key, top_ip.key,
                         int(top_ip.sum_by_ip.value))] = int(record.sum_total.value)
            # Select top N (number) values
            for value, count in counter.most_common(number):
                data += value[0] + "," + value[1] + "," + str(value[2]) \
                        + "," + str(count) + ","
        else:
            for all_buckets in results.aggregations.all_nested.by_key:
                counter[all_buckets.key] += int(all_buckets.stats_sum.value)
            # Select top N (number) values
            for value, count in counter.most_common(number):
                data += value + "," + str(count) + ","

        # Remove trailing comma
        data = data[:-1]

        if data == "":
            json_response = '{"status": "Empty", "data": "No data found"}'
        else:
            json_response = '{"status": "Ok", "data": "' + data + '"}'
        return json_response

    except Exception as e:
        json_response = ('{"status": "Error", "data": "Elasticsearch query '
                         'exception: ' + escape(str(e)) + '"}')
        return json_response
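# Standalone sketch (client omitted, values illustrative) of the nested
# terms -> sum aggregation chain the "else" branch above builds.
from elasticsearch_dsl import Q, Search

qx = Q({'bool': {'must': [{'term': {'@stat_type': 'queried'}}]}})
s = Search(index='_all').query(qx)
s.aggs.bucket('all_nested', 'nested', path='data_array') \
    .bucket('by_key', 'terms', field='data_array.key.raw', size=100) \
    .bucket('stats_sum', 'sum', field='data_array.value')
print(s.to_dict())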
def search(request, spec, operator="and"):
    if not isinstance(spec, collections.abc.Mapping):
        raise XMLRPCWrappedError(
            TypeError("Invalid spec, must be a mapping/dictionary."))
    if operator not in {"and", "or"}:
        raise XMLRPCWrappedError(
            ValueError("Invalid operator, must be one of 'and' or 'or'."))

    # Remove any invalid spec fields
    spec = {
        k: [v] if isinstance(v, str) else v
        for k, v in spec.items()
        if v and k in {
            "name", "version", "author", "author_email", "maintainer",
            "maintainer_email", "home_page", "license", "summary",
            "description", "keywords", "platform", "download_url",
        }
    }

    queries = []
    for field, value in sorted(spec.items()):
        q = None
        for item in value:
            kw = {"query": item}
            if field in SEARCH_BOOSTS:
                kw["boost"] = SEARCH_BOOSTS[field]
            if q is None:
                q = Q("match", **{field: kw})
            else:
                q |= Q("match", **{field: kw})
        queries.append(q)

    if operator == "and":
        query = request.es.query("bool", must=queries)
    else:
        query = request.es.query("bool", should=queries)

    results = query[:100].execute()

    request.registry.datadog.histogram("warehouse.xmlrpc.search.results",
                                       len(results))

    if "version" in spec.keys():
        return [
            {
                "name": r.name,
                "summary": getattr(r, "summary", None),
                "version": v,
                "_pypi_ordering": False,
            }
            for r in results
            for v in r.version
            if v in spec.get("version", [v])
        ]
    return [
        {
            "name": r.name,
            "summary": getattr(r, "summary", None),
            "version": r.latest_version,
            "_pypi_ordering": False,
        }
        for r in results
    ]
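# Sketch of the per-field query construction above: values for one field are
# OR-ed together (|=), then the per-field queries are AND-ed (must) or OR-ed
# (should). Field name and boost are illustrative.
from elasticsearch_dsl import Q

q = None
for item in ['flask', 'django']:
    part = Q('match', name={'query': item, 'boost': 10})
    q = part if q is None else q | part
print(Q('bool', must=[q]).to_dict())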
def __payload_body(self, query_params, aggs_params,
                   size=SEARCH_SPLIT_LIMIT, source=None):
    if size > SEARCH_SPLIT_LIMIT or size < 1:
        raise ElasticsearchKibanaCLIException(
            'Payload size is out-of-bounds in __payload_body()', size)

    for param_name in ['must', 'must_not', 'should', 'should_not', 'filter']:
        if param_name in query_params:
            query_params[param_name] = self.__parse_query_param(
                query_params[param_name])
        else:
            query_params[param_name] = []

    aggs = {}  # for another day

    # NB: 'should_not' is not a standard bool clause; elasticsearch_dsl
    # passes it through to the serialized query as-is
    query = Q('bool',
              must=query_params['must'],
              must_not=query_params['must_not'],
              should=query_params['should'],
              should_not=query_params['should_not'],
              minimum_should_match=(
                  query_params['minimum_should_match']
                  if 'minimum_should_match' in query_params
                  else (1 if len(query_params['should']) > 0 else None)
              ),
              filter=query_params['filter'])

    payload_values = {
        'source': (json.dumps(source).replace('___timestamp', '@timestamp')
                   if source is not None else '[ ]'),
        'size': size,
        'aggs': json.dumps(aggs),
        'query': json.dumps(query.to_dict()),
        'timeout': '"' + (str(query_params['timeout'])
                          if 'timeout' in query_params
                          else '{}s'.format(SEARCH_DEFAULT_TIMEOUT_SECONDS)) + '"',
    }

    payload_json = """
        {
            "version": true,
            "sort": [ ],
            "stored_fields": [ ],
            "script_fields": { },
            "docvalue_fields": [ ],
            "highlight": { },
            "_source": __SOURCE__,
            "size": __SIZE__,
            "aggs": __AGGS__,
            "query": __QUERY__,
            "timeout": __TIMEOUT__
        }
    """

    for payload_k, payload_v in payload_values.items():
        replace_token = '__{}__'.format(payload_k.upper())
        payload_json = payload_json.replace(replace_token, str(payload_v))

    return json.dumps(json.loads(payload_json))
def get_records_list():
    """
    Obtains a list of all records for the given type and time range.

    :return: JSON with status "ok" or "error" and requested data.
    """
    # Check login
    if not session.logged:
        json_response = '{"status": "Error", "data": "You must be logged in!"}'
        return json_response

    # Check mandatory inputs
    if not (request.get_vars.beginning and request.get_vars.end
            and request.get_vars.type):
        json_response = '{"status": "Error", "data": "A mandatory argument is missing!"}'
        return json_response

    # Parse inputs and set correct format
    beginning = escape(request.get_vars.beginning)
    end = escape(request.get_vars.end)
    type = escape(request.get_vars.type)

    try:
        # Elastic query
        client = elasticsearch.Elasticsearch([{
            'host': myconf.get('consumer.hostname'),
            'port': myconf.get('consumer.port')
        }])
        elastic_bool = []
        elastic_bool.append({'range': {'@timestamp': {'gte': beginning,
                                                      'lte': end}}})
        elastic_bool.append({'term': {'@stat_type': type}})

        # Prepare query
        qx = Q({'bool': {'must': elastic_bool}})

        # Set query according to the statistic type
        search_ip = Search(using=client, index='_all').query(qx)
        search_ip.aggs.bucket('all_nested', 'nested', path='data_array') \
            .bucket('by_key', 'terms', field='data_array.key.raw',
                    size=2147483647) \
            .bucket('stats_sum', 'sum', field='data_array.value')
        results = search_ip.execute()

        data = ""
        for all_buckets in results.aggregations.all_nested.by_key:
            data += all_buckets.key + "," \
                    + str(int(all_buckets.stats_sum.value)) + ","

        # Remove trailing comma
        data = data[:-1]

        json_response = '{"status": "Ok", "data": "' + data + '"}'
        return json_response

    except Exception as e:
        json_response = ('{"status": "Error", "data": "Exception: '
                         + escape(str(e)) + '"}')
        return json_response
def filter_query(type):
    return Q('term', **{'project.value.projectType._exact': type})
def search(offset=0, limit=100, query_string='', limit_fields=True, *args):
    query_dict = json.loads(urllib.parse.unquote(query_string))
    type_filters = query_dict['typeFilters']
    has_type_filters = True in list(map(bool, type_filters.values()))

    def filter_query(type):
        return Q('term', **{'project.value.projectType._exact': type})

    selected_filters = list(
        filter(lambda key: bool(type_filters[key]), type_filters.keys()))
    type_query = Q('bool', should=list(map(filter_query, selected_filters)))

    client = new_es_client()
    search = IndexedPublication.search(using=client)
    if has_type_filters:
        search = search.filter(type_query)

    query_filters = []

    # Query string fields
    author = query_dict['queries']['author']
    title = query_dict['queries']['title']
    keywords = query_dict['queries']['keyword']
    description = query_dict['queries']['description']
    if author:
        query_filters.append(search_utils.author_query(author))
    if title:
        query_filters.append(search_utils.title_query(title))
    if keywords:
        query_filters.append(search_utils.keyword_query(keywords))
    if description:
        query_filters.append(search_utils.description_query(description))

    # Experimental advanced filters
    facility = query_dict['advancedFilters']['experimental']['experimentalFacility']
    experiment_type = query_dict['advancedFilters']['experimental']['experimentType']
    if facility['name']:
        query_filters.append(search_utils.experimental_facility_query(facility))
    if experiment_type:
        query_filters.append(search_utils.experiment_type_query(experiment_type))

    # Simulation advanced filters
    simulation_type = query_dict['advancedFilters']['simulation']['simulationType']
    if simulation_type:
        query_filters.append(search_utils.simulation_type_query(simulation_type))

    # Field recon advanced filters
    nh_type = query_dict['advancedFilters']['field_recon']['naturalHazardType']
    nh_event = query_dict['advancedFilters']['field_recon']['naturalHazardEvent']
    if nh_type:
        query_filters.append(search_utils.nh_type_query(nh_type))
    if nh_event:
        query_filters.append(search_utils.nh_event_query(nh_event))

    # Other advanced filters
    data_type = query_dict['advancedFilters']['other']['dataType']
    if data_type:
        query_filters.append(search_utils.other_type_query(data_type))

    # Hybrid sim advanced filters (the original chained assignment
    # `sim_type = data_type = ...` clobbered data_type)
    sim_type = query_dict['advancedFilters']['hybrid_simulation']['hybridSimulationType']
    if sim_type:
        query_filters.append(search_utils.hybrid_sim_type_query(sim_type))

    search = search.filter('bool', must=query_filters)
    search = search.filter(Q('term', status='published'))
    search = search.extra(from_=offset, size=limit)

    if limit_fields:
        search = search.source(includes=[
            'project.value.title', 'project.value.pi',
            'project.value.keywords', 'project.value.projectType',
            'project.value.dataType', 'created', 'projectId', 'users',
            'system', 'revision'
        ])

    search = search.sort({'created': {'order': 'desc'}})
    res = search.execute()
    hits = list(
        map(
            lambda h: {
                **h.to_dict(),
                'pi': _get_user_by_username(h, h.project.value.pi),
            },
            res.hits))
    return {'listing': hits}
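# Sketch: selected type filters become an OR of exact term queries, which is
# what filter_query plus the bool/should wrapper above produce; filter names
# are illustrative.
from elasticsearch_dsl import Q

selected = ['experimental', 'simulation']
type_query = Q('bool', should=[
    Q('term', **{'project.value.projectType._exact': t}) for t in selected
])
print(type_query.to_dict())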
def neesdescription(project_id, *args):
    pub_query = IndexedPublicationLegacy.search() \
        .filter(Q({'term': {'project._exact': project_id}})) \
        .source(includes=['description'])
    # default avoids StopIteration when the project has no hits
    desc = next((hit.description for hit in pub_query.execute().hits), '')
    return {'description': desc}
def last_run(self):
    runs = CeleryTaskRunLog.search().filter(
        Q('term', celery_task_id=self.celery_task_id)
    ).sort('-start').execute()
    return runs[0] if runs else None
def _make_query_string_query(self, query, fields, default_operator=AND):
    return Q(QUERY_STRING, query=query, fields=fields,
             default_operator=default_operator)
def serialize(self, pid, record, links_factory=None):
    """Return a list of publications for a given author recid.

    :param pid: Persistent identifier instance.
    :param record: Record instance.
    :param links_factory: Factory function for link generation; the links
        are added to the response.
    """
    author_pid = pid.pid_value
    publications = []

    query = Q('match', authors__recid=author_pid)
    search = LiteratureSearch().query('nested', path='authors', query=query) \
        .params(_source=[
            'accelerator_experiments',
            'citation_count',
            'control_number',
            'earliest_date',
            'facet_inspire_doc_type',
            'keywords',
            'publication_info',
            'self',
            'titles',
        ])

    for result in search.scan():
        result_source = result.to_dict()

        publication = {}
        publication['id'] = int(result_source['control_number'])
        publication['record'] = result_source['self']
        publication['title'] = get_title(result_source)

        # Get the earliest date.
        try:
            publication['date'] = result_source['earliest_date']
        except KeyError:
            pass

        # Get publication type.
        try:
            publication['type'] = result_source.get(
                'facet_inspire_doc_type', [])[0]
        except IndexError:
            pass

        # Get citation count.
        try:
            publication['citations'] = result_source['citation_count']
        except KeyError:
            pass

        # Get journal.
        try:
            publication['journal'] = {}
            publication['journal']['title'] = result_source.get(
                'publication_info', [])[0]['journal_title']

            # Get journal id and $self.
            try:
                publication['journal']['id'] = result_source.get(
                    'publication_info', [])[0]['journal_recid']
                publication['journal']['record'] = result_source.get(
                    'publication_info', [])[0]['journal_record']
            except KeyError:
                pass
        except (IndexError, KeyError):
            del publication['journal']

        # Get collaborations.
        collaborations = set()
        for experiment in result_source.get('accelerator_experiments', []):
            collaborations.add(experiment.get('experiment'))
        if collaborations:
            publication['collaborations'] = list(collaborations)

        publications.append(publication)

    return json.dumps(publications)
def _make_bool_query(self, **kwargs):
    return Q(BOOL, **kwargs)
def searchnonexistent():
    return Q('match', _id=0)
def _make_must_equal_terms_query(self, field, terms, **kwargs):
    return Q(TERMS, **{field: terms})
def topvalues(self, field, flt=None, topnbr=10, sort=None, least=False):
    """
    This method uses an aggregation to produce top values for a given
    field or pseudo-field. Pseudo-fields are:
      - category / asnum / country / net[:mask]
      - port
      - port:open / :closed / :filtered / :<servicename>
      - portlist:open / :closed / :filtered
      - countports:open / :closed / :filtered
      - service / service:<portnbr>
      - product / product:<portnbr>
      - cpe / cpe.<part> / cpe:<cpe_spec> / cpe.<part>:<cpe_spec>
      - devicetype / devicetype:<portnbr>
      - script:<scriptid> / script:<port>:<scriptid> / script:host:<scriptid>
      - cert.* / smb.* / sshkey.* / ike.*
      - httphdr / httphdr.{name,value} / httphdr:<name>
      - httpapp / httpapp:<name>
      - modbus.* / s7.* / enip.*
      - mongo.dbs.*
      - vulns.*
      - screenwords
      - file.* / file.*:scriptid
      - hop
    """
    baseterms = {"size": topnbr}
    if least:
        baseterms["order"] = {"_count": "asc"}
    outputproc = None
    nested = None
    if flt is None:
        flt = self.flt_empty
    if field == "category":
        field = {"field": "categories"}
    elif field == "asnum":
        flt = self.flt_and(flt, Q("exists", field="infos.as_num"))
        field = {"field": "infos.as_num"}
    elif field == "as":
        def outputproc(value):
            return tuple(val if i else int(val)
                         for i, val in enumerate(value.split(',', 1)))
        flt = self.flt_and(flt, Q("exists", field="infos.as_num"))
        field = {"script": {
            "lang": "painless",
            "source": "doc['infos.as_num'].value + ',' + "
                      "doc['infos.as_name'].value",
        }}
    elif field == "port" or field.startswith("port:"):
        def outputproc(value):
            return tuple(int(val) if i else val
                         for i, val in enumerate(value.rsplit('/', 1)))
        if field == "port":
            flt = self.flt_and(
                flt,
                Q('nested', path='ports',
                  query=Q('exists', field="ports.port")),
            )
            nested = {
                "nested": {"path": "ports"},
                "aggs": {"patterns": {
                    "filter": {'bool': {'must_not': [
                        {'match': {'ports.port': -1}},
                    ]}},
                    "aggs": {"patterns": {
                        "terms": dict(
                            baseterms,
                            script={
                                "lang": "painless",
                                "source":
                                'doc["ports.protocol"].value + "/" + '
                                'doc["ports.port"].value',
                            },
                        ),
                    }},
                }},
            }
        else:
            info = field[5:]
            if info in ['open', 'filtered', 'closed']:
                flt = self.flt_and(
                    flt,
                    Q('nested', path='ports',
                      query=Q('match', ports__state_state=info)),
                )
                matchfield = "state_state"
            else:
                flt = self.flt_and(
                    flt,
                    Q('nested', path='ports',
                      query=Q('match', ports__service_name=info)),
                )
                matchfield = "service_name"
            nested = {
                "nested": {"path": "ports"},
                "aggs": {"patterns": {
                    "filter": {'bool': {
                        'must': [{'match': {'ports.%s' % matchfield: info}}],
                        'must_not': [{'match': {'ports.port': -1}}],
                    }},
                    "aggs": {"patterns": {
                        "terms": dict(
                            baseterms,
                            script={
                                "lang": "painless",
                                "source":
                                'doc["ports.protocol"].value + "/" + '
                                'doc["ports.port"].value',
                            },
                        ),
                    }},
                }},
            }
    elif field == 'service':
        def outputproc(value):
            return value or None
        flt = self.flt_and(flt, self.searchopenport())
        nested = {
            "nested": {"path": "ports"},
            "aggs": {"patterns": {
                "filter": {"match": {"ports.state_state": "open"}},
                "aggs": {"patterns": {
                    "terms": dict(
                        baseterms,
                        field="ports.service_name",
                        missing="",
                    ),
                }},
            }},
        }
    elif field.startswith("service:"):
        port = int(field[8:])
        flt = self.flt_and(flt, self.searchport(port))
        nested = {
            "nested": {"path": "ports"},
            "aggs": {"patterns": {
                "filter": {"bool": {"must": [
                    {"match": {"ports.state_state": "open"}},
                    {"match": {"ports.port": port}},
                ]}},
                "aggs": {"patterns": {
                    "terms": dict(
                        baseterms,
                        field="ports.service_name",
                        missing="",
                    ),
                }},
            }},
        }
    elif field == 'product':
        def outputproc(value):
            return tuple(v or None for v in value.split('###', 1))
        flt = self.flt_and(flt, self.searchopenport())
        nested = {
            "nested": {"path": "ports"},
            "aggs": {"patterns": {
                "filter": {"match": {"ports.state_state": "open"}},
                "aggs": {"patterns": {
                    "terms": dict(
                        baseterms,
                        script="""
String result = "";
if(doc['ports.service_name'].size() > 0) {
    result += doc['ports.service_name'].value;
}
result += "###";
if(doc['ports.service_product'].size() > 0) {
    result += doc['ports.service_product'].value;
}
return result;
""",
                        missing="",
                    ),
                }},
            }},
        }
    elif field.startswith("product:"):
        def outputproc(value):
            return tuple(v or None for v in value.split('###', 1))
        info = field[8:]
        if info.isdigit():
            info = int(info)
            flt = self.flt_and(flt, self.searchport(info))
            matchfield = "port"
        else:
            flt = self.flt_and(flt, self.searchservice(info))
            matchfield = "service_name"
        nested = {
            "nested": {"path": "ports"},
            "aggs": {"patterns": {
                "filter": {"bool": {"must": [
                    {"match": {"ports.state_state": "open"}},
                    {"match": {"ports.%s" % matchfield: info}},
                ]}},
                "aggs": {"patterns": {
                    "terms": dict(
                        baseterms,
                        script="""
String result = "";
if(doc['ports.service_name'].size() > 0) {
    result += doc['ports.service_name'].value;
}
result += "###";
if(doc['ports.service_product'].size() > 0) {
    result += doc['ports.service_product'].value;
}
return result;
""",
                    ),
                }},
            }},
        }
    elif field == 'version':
        def outputproc(value):
            return tuple(v or None for v in value.split('###', 2))
        flt = self.flt_and(flt, self.searchopenport())
        nested = {
            "nested": {"path": "ports"},
            "aggs": {"patterns": {
                "filter": {"match": {"ports.state_state": "open"}},
                "aggs": {"patterns": {
                    "terms": dict(
                        baseterms,
                        script="""
String result = "";
if(doc['ports.service_name'].size() > 0) {
    result += doc['ports.service_name'].value;
}
result += "###";
if(doc['ports.service_product'].size() > 0) {
    result += doc['ports.service_product'].value;
}
result += "###";
if(doc['ports.service_version'].size() > 0) {
    result += doc['ports.service_version'].value;
}
return result;
""",
                        missing="",
                    ),
                }},
            }},
        }
    elif field.startswith('version:'):
        def outputproc(value):
            return tuple(v or None for v in value.split('###', 2))
        info = field[8:]
        if info.isdigit():
            port = int(info)
            flt = self.flt_and(flt, self.searchport(port))
            matchflt = Q("match", ports__port=port)
        elif ":" in info:
            service, product = info.split(':', 1)
            flt = self.flt_and(flt, self.searchproduct(
                product=product,
                service=service,
            ))
            matchflt = (
                Q("match", ports__service_name=service) &
                Q("match", ports__service_product=product)
            )
        else:
            flt = self.flt_and(flt, self.searchservice(info))
            matchflt = Q("match", ports__service_name=info)
        matchflt &= Q("match", ports__state_state="open")
        nested = {
            "nested": {"path": "ports"},
            "aggs": {"patterns": {
                "filter": matchflt.to_dict(),
                "aggs": {"patterns": {
                    "terms": dict(
                        baseterms,
                        script="""
String result = "";
if(doc['ports.service_name'].size() > 0) {
    result += doc['ports.service_name'].value;
}
result += "###";
if(doc['ports.service_product'].size() > 0) {
    result += doc['ports.service_product'].value;
}
result += "###";
if(doc['ports.service_version'].size() > 0) {
    result += doc['ports.service_version'].value;
}
return result;
""",
                    ),
                }},
            }},
        }
    elif field == 'httphdr':
        def outputproc(value):
            return tuple(value.split(':', 1))
        flt = self.flt_and(flt, self.searchhttphdr())
        nested = {
            "nested": {"path": "ports"},
            "aggs": {"patterns": {
                "nested": {"path": "ports.scripts"},
                "aggs": {"patterns": {
                    "nested": {"path": "ports.scripts.http-headers"},
                    "aggs": {"patterns": {
                        "terms": dict(
                            baseterms,
                            script={
                                "lang": "painless",
                                "source":
                                "doc['ports.scripts.http-headers.name'].value"
                                " + ':' + "
                                "doc['ports.scripts.http-headers.value'].value",
                            },
                        ),
                    }},
                }},
            }},
        }
    elif field.startswith('httphdr.'):
        flt = self.flt_and(flt, self.searchhttphdr())
        field = "ports.scripts.http-headers.%s" % field[8:]
        nested = {
            "nested": {"path": "ports"},
            "aggs": {"patterns": {
                "nested": {"path": "ports.scripts"},
                "aggs": {"patterns": {
                    "nested": {"path": "ports.scripts.http-headers"},
                    "aggs": {"patterns": {
                        "terms": dict(baseterms, field=field),
                    }},
                }},
            }},
        }
    elif field.startswith('httphdr:'):
        subfield = field[8:].lower()
        flt = self.flt_and(flt, self.searchhttphdr(name=subfield))
        nested = {
            "nested": {"path": "ports"},
            "aggs": {"patterns": {
                "nested": {"path": "ports.scripts"},
                "aggs": {"patterns": {
                    "nested": {"path": "ports.scripts.http-headers"},
                    "aggs": {"patterns": {
                        "filter": {"match": {
                            "ports.scripts.http-headers.name": subfield,
                        }},
                        "aggs": {"patterns": {
                            "terms": dict(
                                baseterms,
                                field='ports.scripts.http-headers.value',
                            ),
                        }},
                    }},
                }},
            }},
        }
    elif field == 'httpapp':
        def outputproc(value):
            return tuple(value.split(':', 1))
        flt = self.flt_and(flt, self.searchhttpapp())
        nested = {
            "nested": {"path": "ports"},
            "aggs": {"patterns": {
                "nested": {"path": "ports.scripts"},
                "aggs": {"patterns": {
                    "nested": {"path": "ports.scripts.http-app"},
                    "aggs": {"patterns": {
                        "terms": dict(
                            baseterms,
                            script={
                                "lang": "painless",
                                "source":
                                "doc['ports.scripts.http-app.application']"
                                ".value + ':' + "
                                "doc['ports.scripts.http-app.version'].value",
                            },
                        ),
                    }},
                }},
            }},
        }
    elif field.startswith('httpapp:'):
        subfield = field[8:].lower()
        flt = self.flt_and(flt, self.searchhttpapp(name=subfield))
        nested = {
            "nested": {"path": "ports"},
            "aggs": {"patterns": {
                "nested": {"path": "ports.scripts"},
                "aggs": {"patterns": {
                    "nested": {"path": "ports.scripts.http-app"},
                    "aggs": {"patterns": {
                        "filter": {"match": {
                            "ports.scripts.http-app.application": subfield,
                        }},
                        "aggs": {"patterns": {
                            "terms": dict(
                                baseterms,
                                field='ports.scripts.http-app.version',
                            ),
                        }},
                    }},
                }},
            }},
        }
    elif field == 'useragent' or field.startswith('useragent:'):
        if field == 'useragent':
            flt = self.flt_and(flt, self.searchuseragent())
            nested = {
                "nested": {"path": "ports"},
                "aggs": {"patterns": {
                    "nested": {"path": "ports.scripts"},
                    "aggs": {"patterns": {
                        "terms": dict(
                            baseterms,
                            field="ports.scripts.http-user-agent",
                        ),
                    }},
                }},
            }
        else:
            subfield = utils.str2regexp(field[10:])
            flt = self.flt_and(flt, self.searchuseragent(useragent=subfield))
            if isinstance(subfield, utils.REGEXP_T):
                subfield = self._get_pattern(subfield)
            else:
                subfield = re.escape(subfield)
            nested = {
                "nested": {"path": "ports"},
                "aggs": {"patterns": {
                    "nested": {"path": "ports.scripts"},
                    "aggs": {"patterns": {
                        "terms": dict(
                            baseterms,
                            field="ports.scripts.http-user-agent",
                            include=subfield,
                        ),
                    }},
                }},
            }
    elif field == 'ja3-client' or (
            field.startswith('ja3-client') and field[10] in ':.'
    ):
        if ':' in field:
            field, value = field.split(':', 1)
            subkey, value = self._ja3keyvalue(utils.str2regexp(value))
            if isinstance(value, utils.REGEXP_T):
                include_value = self._get_pattern(value)
                filter_value = {'regexp': {
                    "ports.scripts.ssl-ja3-client.%s" % subkey:
                    include_value,
                }}
            else:
                include_value = re.escape(value)
                filter_value = {'match': {
                    "ports.scripts.ssl-ja3-client.%s" % subkey: value,
                }}
        else:
            value = None
            subkey = None
        if '.' in field:
            field, subfield = field.split('.', 1)
        else:
            subfield = 'md5'
        base = {
            "terms": dict(
                baseterms,
                field="ports.scripts.ssl-ja3-client.%s" % subfield,
            ),
        }
        if subkey is not None:
            if subkey != subfield:
                base = {
                    "filter": filter_value,
                    "aggs": {"patterns": base},
                }
            else:
                base["terms"]["include"] = include_value
        flt = self.flt_and(flt, self.searchja3client(value_or_hash=value))
        nested = {
            "nested": {"path": "ports"},
            "aggs": {"patterns": {
                "nested": {"path": "ports.scripts"},
                "aggs": {"patterns": {
                    "nested": {"path": "ports.scripts.ssl-ja3-client"},
                    "aggs": {"patterns": base},
                }},
            }},
        }
    elif field == 'ja3-server' or (
            field.startswith('ja3-server') and field[10] in ':.'
    ):
        def outputproc(value):
            return tuple(value.split('/'))
        if ':' in field:
            field, values = field.split(':', 1)
            if ':' in values:
                value1, value2 = values.split(':', 1)
                if value1:
                    subkey1, value1 = self._ja3keyvalue(
                        utils.str2regexp(value1))
                    if isinstance(value1, utils.REGEXP_T):
                        filter_value1 = {'regexp': {
                            "ports.scripts.ssl-ja3-server.%s" % subkey1:
                            self._get_pattern(value1),
                        }}
                    else:
                        filter_value1 = {'match': {
                            "ports.scripts.ssl-ja3-server.%s" % subkey1:
                            value1,
                        }}
                else:
                    subkey1, value1 = None, None
                if value2:
                    subkey2, value2 = self._ja3keyvalue(
                        utils.str2regexp(value2))
                    if isinstance(value2, utils.REGEXP_T):
                        filter_value2 = {'regexp': {
                            "ports.scripts.ssl-ja3-server.client.%s" % subkey2:
                            self._get_pattern(value2),
                        }}
                    else:
                        filter_value2 = {'match': {
                            "ports.scripts.ssl-ja3-server.client.%s" % subkey2:
                            value2,
                        }}
                else:
                    subkey2, value2 = None, None
            else:
                subkey1, value1 = self._ja3keyvalue(utils.str2regexp(values))
                if isinstance(value1, utils.REGEXP_T):
                    filter_value1 = {'regexp': {
                        "ports.scripts.ssl-ja3-server.%s" % subkey1:
                        self._get_pattern(value1),
                    }}
                else:
                    filter_value1 = {'match': {
                        "ports.scripts.ssl-ja3-server.%s" % subkey1:
                        value1,
                    }}
                subkey2, value2 = None, None
        else:
            subkey1, value1 = None, None
            subkey2, value2 = None, None
        if '.' in field:
            field, subfield = field.split('.', 1)
        else:
            subfield = 'md5'
        flt = self.flt_and(flt, self.searchja3server(
            value_or_hash=value1,
            client_value_or_hash=value2,
        ))
        base = {
            "terms": dict(
                baseterms,
                script={
                    "lang": "painless",
                    "source":
                    "doc['ports.scripts.ssl-ja3-server.%s'].value + '/' + "
                    "doc['ports.scripts.ssl-ja3-server.client.%s'].value" % (
                        subfield, subfield),
                },
            ),
        }
        if value1 is not None:
            base = {
                "filter": filter_value1,
                "aggs": {"patterns": base},
            }
        if value2 is not None:
            base = {
                "filter": filter_value2,
                "aggs": {"patterns": base},
            }
        nested = {
            "nested": {"path": "ports"},
            "aggs": {"patterns": {
                "nested": {"path": "ports.scripts"},
                "aggs": {"patterns": {
                    "nested": {"path": "ports.scripts.ssl-ja3-server"},
                    "aggs": {"patterns": base},
                }},
            }},
        }
    elif field.startswith('s7.'):
        flt = self.flt_and(flt, self.searchscript(name="s7-info"))
        subfield = field[3:]
        field = {'field': 'ports.scripts.s7-info.' + subfield}
    else:
        field = {"field": field}
    body = {"query": flt.to_dict()}
    if nested is None:
        body["aggs"] = {"patterns": {"terms": dict(baseterms, **field)}}
    else:
        body["aggs"] = {"patterns": nested}
    utils.LOGGER.debug("DB: Elasticsearch aggregation: %r", body)
    result = self.db_client.search(
        body=body,
        index=self.indexes[0],
        ignore_unavailable=True,
        size=0
    )
    result = result["aggregations"]
    while 'patterns' in result:
        result = result['patterns']
    result = result['buckets']
    if outputproc is None:
        for res in result:
            yield {'_id': res['key'], 'count': res['doc_count']}
    else:
        for res in result:
            yield {'_id': outputproc(res['key']), 'count': res['doc_count']}
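# Standalone sketch (index name illustrative) of the script-based terms
# aggregation topvalues builds for the "port" pseudo-field: nest into
# "ports", filter out the -1 placeholder, then bucket on "protocol/port".
from elasticsearch_dsl import Q, Search

s = Search(index='nmap')
s.aggs.bucket('patterns', 'nested', path='ports') \
    .bucket('real_ports', 'filter',
            Q('bool', must_not=[Q('match', **{'ports.port': -1})])) \
    .bucket('patterns', 'terms', size=10, script={
        'lang': 'painless',
        'source': 'doc["ports.protocol"].value + "/" + doc["ports.port"].value',
    })
print(s.to_dict())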
def _make_field_must_exist_query(self, field, **kwargs):
    return Q(EXISTS, field=field)
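# Sketch combining the _make_* helpers defined above, assuming the constants
# mirror the elasticsearch_dsl query names they wrap (BOOL='bool',
# TERMS='terms', EXISTS='exists', QUERY_STRING='query_string'); field names
# are illustrative.
from elasticsearch_dsl import Q

must = [
    Q('terms', status=['open', 'pending']),        # _make_must_equal_terms_query
    Q('exists', field='assignee'),                 # _make_field_must_exist_query
    Q('query_string', query='error AND timeout',   # _make_query_string_query
      fields=['title', 'body'], default_operator='AND'),
]
print(Q('bool', must=must).to_dict())              # _make_bool_query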
def searchhaslocation(neg=False):
    res = Q('exists', field='infos.coordinates')
    if neg:
        return ~res
    return res
def get_es_query(self):
    # Just using 'terms' would not work, as it would return any tag match
    # in the list, but we want to exactly match all of them.
    return [Q('term', tags=tag) for tag in self.get_value()]
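# Sketch: AND-ing one term query per tag (bool/must) requires all tags to be
# present, whereas a single terms query matches any one of them; tag values
# are illustrative.
from elasticsearch_dsl import Q

tags = ['security', 'urgent']
all_tags = Q('bool', must=[Q('term', tags=tag) for tag in tags])
any_tag = Q('terms', tags=tags)
print(all_tags.to_dict())
print(any_tag.to_dict())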
def search(self):
    logger.debug("SearchManager search invoked")
    q = Q("multi_match", query=self.query, fields=INDEXED_FIELDS)
    return self.__get_queryset(q)
def get_es_query(self):
    return [Q(self.operator, **{self.es_field: self.get_value()})]
class Meta:
    """Configuration for OAI server search."""

    default_filter = Q('exists', field='_oai.id')
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, connections, query, Q
import logging
import pandas as pd
from IPython import display
import numpy as np
from matplotlib_venn import venn3
from matplotlib import pyplot as plt

# Define a default Elasticsearch client
elasticServer = 'http://172.20.30.70:9200/'  # prod
# elasticServer = 'http://172.20.31.19:9200/'  # dev
client = Elasticsearch(hosts=[elasticServer])

# note: `q` is defined but never passed to .query() below, so the scan
# matches every document in the index
q = Q('match', id='_search')
s = Search(using=client,
           index="propertypriceregister",
           doc_type="propertypriceregister").query()

print("processing")
inPerfectMatch = []
for hit in s.scan():
    if hit["hasPerfectMatch"] == True:
        for field in hit["perfectMatches"]:
            if field not in inPerfectMatch:
                inPerfectMatch.append(field)
print("processing finished")
print()
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
import csv

source = ['lid', 'title', 'time_limited', 'effective_date']

# matches titles ending in 征求意见稿) or 征求意见稿 or 草案) or 草案
# ("draft for comments" / "draft", with an optional full-width closing paren)
q_draft = Q('bool',
            must=[Q("regexp", title_term=".*(征求意见稿|草案))?")],
            must_not=[Q("match", time_limited="征求意见稿或草案")])
q_draft_2 = Q('bool', must=[Q("regexp", title_term=".*(征求意见稿|草案))?")])

es = Elasticsearch(hosts="nes1:9206")
s = Search(using=es, index="law_regu_dev",
           doc_type="law_regu").source(source).query(q_draft)

with open('q_draft.csv', 'a') as f:
    w = csv.DictWriter(f, source)
    w.writeheader()
    for hit in s.scan():
        w.writerow(hit.to_dict())
def aggregate_by_event_data(self, event_id=None, event_data_name="Image",
                            sub_event_data_name=None, bucket_size=1000,
                            sub_bucket_size=100, threshold=None,
                            filter_event_data_name='',
                            filter_event_data_value='',
                            aggregate_by_hostname=False):
    es_query = self.get_default_query()

    if event_id is not None:
        es_query.append({'match': {'winlog.event_id': event_id}})

    if filter_event_data_name:
        filter_field_name = 'winlog.event_data.' + filter_event_data_name
        es_query.append({'match': {filter_field_name: filter_event_data_value}})

    query = Q({'bool': {'must': es_query}})
    s = Search(using=self.Client, index="winlogbeat-*").query(query)
    if self.DTRange is not None:
        s = s.filter('range', **self.DTRange)
    # Search.source() returns a new Search object; the original discarded it
    s = s.source(includes=['winlog.*'])

    if aggregate_by_hostname:
        b = s.aggs.bucket(event_data_name, 'terms',
                          field='agent.hostname', size=bucket_size)
    else:
        b = s.aggs

    b = b.bucket(event_data_name, 'terms',
                 field='winlog.event_data.' + event_data_name,
                 size=bucket_size)

    if threshold:
        # https://github.com/ongr-io/ElasticsearchDSL/blob/master/docs/Aggregation/Pipeline/BucketSelector.md
        # https://elasticsearch-dsl.readthedocs.io/en/latest/search_dsl.html
        threshold_bucket_name = event_data_name + "_counts"
        b.bucket(threshold_bucket_name, 'cardinality', field='@timestamp')
        b.pipeline('threshold_bucket_selector', 'bucket_selector',
                   buckets_path={"counts": threshold_bucket_name},
                   script='params.counts > %d' % threshold)

    if sub_event_data_name:
        b.bucket(sub_event_data_name, 'terms',
                 field='winlog.event_data.' + sub_event_data_name,
                 size=sub_bucket_size)

    if self.DebugQuery:
        pprint.pprint(s.to_dict())

    # the original executed the search, conditionally ran scan() (discarding
    # its result), then executed again; aggregations are only returned by
    # execute(), so a single call suffices
    response = s.execute()
    return response.aggregations[event_data_name]
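# Standalone sketch (index and threshold illustrative) of the bucket_selector
# pipeline used above: keep only terms buckets whose timestamp cardinality
# exceeds the threshold.
from elasticsearch_dsl import Search

s = Search(index='winlogbeat-*')
b = s.aggs.bucket('Image', 'terms',
                  field='winlog.event_data.Image', size=1000)
b.bucket('Image_counts', 'cardinality', field='@timestamp')
b.pipeline('threshold_bucket_selector', 'bucket_selector',
           buckets_path={'counts': 'Image_counts'},
           script='params.counts > 10')
print(s.to_dict())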