Exemplo n.º 1
0
def funding_by_state(**kwargs):
    """Aggregate funding-range document counts per funding-agency state.

    Keyword Args:
        topic: forwarded to ``query.run_query`` as the ``topic`` filter.
        element: forwarded to ``query.run_query`` as the ``element`` filter.

    Returns:
        dict: maps each state key of at most two characters to the keyed
        ``fund_amt`` range buckets (as a plain dict) found for that state.
    """
    filters = dict(element=kwargs.get("element"), topic=kwargs.get("topic"))

    # Base search: every document, narrowed only by the filters above.
    s = query.run_query(Q({"match_all": {}}), index=index, filters=filters)

    # Funding brackets: consecutive [from, to) pairs plus an open-ended top.
    edges = [0, 100000, 250000, 500000, 750000, 1000000]
    funding_ranges = [
        {"from": low, "to": high} for low, high in zip(edges, edges[1:])
    ]
    funding_ranges.append({"from": edges[-1]})

    by_agency = A("nested", path="funding_agencies")
    by_state = A(
        "terms",
        field="funding_agencies.state.keyword",
        size=50,
        order={"_count": "desc"},
    )
    back_to_root = A("reverse_nested")
    by_amount = A("range", field="funding", ranges=funding_ranges, keyed=True)

    # agencies -> states -> reverse -> fund_amt
    states_agg = s.aggs.bucket('agencies', by_agency).bucket('states', by_state)
    states_agg.bucket('reverse', back_to_root).bucket('fund_amt', by_amount)
    response = s.execute()

    # Keep the first bucket seen for each short state key.
    res = {}
    for bucket in response.aggregations.agencies.states.buckets:
        state = bucket.key
        if len(state) <= 2 and state not in res:
            res[state] = bucket.reverse.fund_amt.buckets.to_dict()

    return res
Exemplo n.º 2
0
    def get_users_in_course(
        cls,
        course_id,
        segments=None,
        ignore_segments=None,
        cohort=None,
        enrollment_mode=None,
        text_search=None,
        sort_policies=None,
    ):
        """
        Construct a search query for all users in `course_id` and return
        the Search object.

        `segments` restricts results to learners in any of the given
        segments; `ignore_segments` excludes learners in the given segments.
        `cohort` and `enrollment_mode` add exact-term restrictions and
        `text_search` is matched against name, username and email.

        sort_policies is an array, where the first element is the primary sort.
        Elements in the array are dicts with fields: order_by (field to sort by)
        and sort_order (either 'asc' or 'desc').  Default to 'username' and 'asc'.
        The defaults are only filled in for the primary policy, and the
        caller's list and dicts are never modified.

        Raises `ValueError` if both `segments` and `ignore_segments` are
        provided, or if a segment, `order_by` or `sort_order` value is not
        one of the supported options.
        """

        # Work on shallow copies: filling in the defaults below must not
        # leak into the dicts owned by the caller.
        if sort_policies:
            sort_policies = [dict(policy) for policy in sort_policies]
        else:
            sort_policies = [{'order_by': None, 'sort_order': None}]
        # set default sort policy to 'username' and 'asc'
        primary_policy = sort_policies[0]
        for field, default in (('order_by', 'username'),
                               ('sort_order', 'asc')):
            if primary_policy[field] is None:
                primary_policy[field] = default

        # Error handling
        if segments and ignore_segments:
            raise ValueError(
                'Cannot combine `segments` and `ignore_segments` parameters.')
        # list() lets callers pass tuples or other iterables, not only lists.
        for segment in list(segments or ()) + list(ignore_segments or ()):
            if segment not in learner.SEGMENTS:
                raise ValueError(
                    "segments/ignore_segments value '{segment}' must be one of: ({segments})"
                    .format(segment=segment,
                            segments=', '.join(learner.SEGMENTS)))

        order_by_options = ('username', 'email', 'discussion_contributions',
                            'problems_attempted', 'problems_completed',
                            'problem_attempts_per_completed',
                            'attempt_ratio_order', 'videos_viewed')
        sort_order_options = ('asc', 'desc')
        for sort_policy in sort_policies:
            if sort_policy['order_by'] not in order_by_options:
                raise ValueError(
                    "order_by value '{order_by}' must be one of: ({order_by_options})"
                    .format(order_by=sort_policy['order_by'],
                            order_by_options=', '.join(order_by_options)))
            if sort_policy['sort_order'] not in sort_order_options:
                raise ValueError(
                    "sort_order value '{sort_order}' must be one of: ({sort_order_options})"
                    .format(sort_order=sort_policy['sort_order'],
                            sort_order_options=', '.join(sort_order_options)))

        search = cls.search()
        search.query = Q('bool', must=[Q('term', course_id=course_id)])

        # Filtering/Search
        if segments:
            # Any one of the requested segments is enough to match.
            search.query.must.append(
                Q('bool',
                  should=[Q('term', segments=segment)
                          for segment in segments]))
        elif ignore_segments:
            for segment in ignore_segments:
                search = search.query(~Q('term', segments=segment))  # pylint: disable=invalid-unary-operand-type
        if cohort:
            search = search.query('term', cohort=cohort)
        if enrollment_mode:
            search = search.query('term', enrollment_mode=enrollment_mode)
        if text_search:
            search.query.must.append(
                Q('multi_match',
                  query=text_search,
                  fields=['name', 'username', 'email']))

        # construct the sort hierarchy
        search_request = search.sort(*[
            {
                sort_policy['order_by']: {
                    'order':
                    sort_policy['sort_order'],
                    # ordering of missing fields
                    'missing':
                    '_last' if sort_policy['sort_order'] == 'asc' else '_first'
                }
            } for sort_policy in sort_policies
        ])

        return search_request
Exemplo n.º 3
0
 def _Q(self, name_or_query='match', **params):
     """Create a query object by delegating to ``Q``.

     :param str name_or_query: the type of the query (defaults to 'match')
     :param params: keyword arguments passed through unchanged to ``Q``
     :return: whatever ``Q`` returns for the given arguments
     """
     query_object = Q(name_or_query, **params)
     return query_object
Exemplo n.º 4
0
class ElasticDB(DB):
    """Elasticsearch backend helpers: connection settings parsed from a
    URL, index (re)creation, and small builders for query filters.
    """

    nested_fields = []

    # filters
    flt_empty = Q()

    def __init__(self, url):
        """Read connection settings from an already-parsed URL.

        `url` must expose `.netloc`, `.path` and `.query`. The network
        location may carry `user[:password]@host` credentials (values are
        URL-unquoted), the path gives the index prefix (defaulting to
        'ivre-') and the query string is split into `self.params`.
        """
        super().__init__()
        self.username = ''
        self.password = ''
        self.hosts = None
        if '@' in url.netloc:
            username, hostname = url.netloc.split('@', 1)
            if ':' in username:
                self.username, self.password = (unquote(val) for val in
                                                username.split(':', 1))
            else:
                self.username = unquote(username)
            if hostname:
                self.hosts = [hostname]
        elif url.netloc:
            self.hosts = [url.netloc]
        index_prefix = url.path.lstrip('/')
        if index_prefix:
            self.index_prefix = index_prefix + '-'
        else:
            self.index_prefix = 'ivre-'
        # "key=value" pairs become entries; a bare "key" maps to None.
        self.params = dict(x.split('=', 1) if '=' in x else (x, None)
                           for x in url.query.split('&') if x)

    def init(self):
        """Initializes the mappings.

        Each index listed in `self.indexes` is deleted (errors 400 and 404
        are ignored) and created again with the mapping found at the same
        position in `self.mappings`.
        """
        for idxnum, mapping in enumerate(self.mappings):
            idxname = self.indexes[idxnum]
            self.db_client.indices.delete(
                index=idxname,
                ignore=[400, 404],
            )
            self.db_client.indices.create(
                index=idxname,
                body={
                    "mappings": {
                        "properties": mapping,
                        # Since we do not need full text searches, use
                        # type "keyword" for strings (unless otherwise
                        # specified in mapping) instead of default
                        # (text + keyword)
                        "dynamic_templates": [
                            {"strings": {
                                "match_mapping_type": "string",
                                # prevent RequestError exceptions when
                                # one term's UTF-8 encoding is bigger
                                # than the max length 32766
                                "mapping": {"type": "keyword",
                                            "ignore_above": 32000},
                            }},
                        ],
                    }
                },
            )

    @property
    def db_client(self):
        """The DB connection (created on first access, then reused)."""
        try:
            return self._db_client
        except AttributeError:
            self._db_client = Elasticsearch(
                hosts=self.hosts,
                http_auth=(self.username, self.password)
            )
            return self._db_client

    @property
    def server_info(self):
        """Server information (fetched on first access, then reused)."""
        try:
            return self._server_info
        except AttributeError:
            self._server_info = self.db_client.info()
            return self._server_info

    @staticmethod
    def to_binary(data):
        """Return `data` base64-encoded, as text."""
        return utils.encode_b64(data).decode()

    @staticmethod
    def from_binary(data):
        """Return the decoded form of base64 text `data`."""
        return utils.decode_b64(data.encode())

    @staticmethod
    def ip2internal(addr):
        """Return `addr` unchanged (no conversion is applied)."""
        return addr

    @staticmethod
    def internal2ip(addr):
        """Return `addr` unchanged (no conversion is applied)."""
        return addr

    @staticmethod
    def searchnonexistent():
        """Return a match query on `_id` 0.

        NOTE(review): presumably meant to match no document -- confirm that
        no record can have this identifier.
        """
        return Q('match', _id=0)

    @classmethod
    def searchhost(cls, addr, neg=False):
        """Filters (if `neg` == True, filters out) one particular host
        (IP address).
        """
        res = Q('match', addr=addr)
        if neg:
            # Negate the query so that the host is excluded.
            return ~res
        return res

    @classmethod
    def searchhosts(cls, hosts, neg=False):
        """Placeholder: does nothing and returns None.

        NOTE(review): looks unimplemented here; check whether subclasses
        are expected to override it.
        """
        pass

    @staticmethod
    def _get_pattern(regexp):
        """Return the pattern string of `regexp`, warning about any flag
        other than re.UNICODE since flags are not passed on.
        """
        # The equivalent to a MongoDB or PostgreSQL search for regexp
        # /Test/ would be /.*Test.*/ in Elasticsearch, while /Test/ in
        # Elasticsearch is equivalent to /^Test$/ in MongoDB or
        # PostgreSQL.
        pattern, flags = utils.regexp2pattern(regexp)
        if flags & ~re.UNICODE:
            # if a flag, other than re.UNICODE, is set, issue a
            # warning as it will not be used
            utils.LOGGER.warning(
                'Elasticsearch does not support flags in regular '
                'expressions [%r with flags=%r]',
                pattern, flags
            )
        return pattern

    @staticmethod
    def _flt_and(cond1, cond2):
        """Combine two filters with `&`."""
        return cond1 & cond2

    @staticmethod
    def _flt_or(cond1, cond2):
        """Combine two filters with `|`."""
        return cond1 | cond2

    @staticmethod
    def flt2str(flt):
        """Return the JSON text of the filter's dict representation."""
        return json.dumps(flt.to_dict())
Exemplo n.º 5
0
 def get_max(self):
     """Run the private queryset helper with a ``match_all`` query and
     return its result.
     """
     logger.debug("SearchManager get_max invoked")
     return self.__get_queryset(Q("match_all"))
Exemplo n.º 6
0
    def searchscript(cls, name=None, output=None, values=None, neg=False):
        """Build a query on the content of script results.

        `name` and `output` are matched against the script id and output;
        each may be a plain value or a compiled regular expression.
        `values` (which requires `name`) may be a ready-made query, a
        string, a regular expression, or a mapping of sub-field names to
        values / regular expressions. With no criterion at all, the query
        only requires that script results exist. If `neg` is true the
        resulting query is negated.

        Raises TypeError when `values` is given without `name`.
        """
        def _clause(field, value):
            # Regular expressions become "regexp" queries, anything else
            # becomes a "match" query.
            if isinstance(value, utils.REGEXP_T):
                return Q("regexp", **{field: cls._get_pattern(value)})
            return Q("match", **{field: value})

        req = []
        if name is not None:
            req.append(_clause("ports.scripts.id", name))
        if output is not None:
            req.append(_clause("ports.scripts.output", output))
        if values:
            if name is None:
                raise TypeError(".searchscript() needs a `name` arg "
                                "when using a `values` arg")
            subfield = ALIASES_TABLE_ELEMS.get(name, name)
            if isinstance(values, Query):
                req.append(values)
            elif isinstance(values, str):
                req.append(Q("match",
                             **{"ports.scripts.%s" % subfield: values}))
            elif isinstance(values, utils.REGEXP_T):
                req.append(_clause("ports.scripts.%s" % subfield, values))
            else:
                for field, value in values.items():
                    req.append(_clause(
                        "ports.scripts.%s.%s" % (subfield, field), value))

        if req:
            script_query = cls.flt_and(*req)
        else:
            # No criterion: any host having script results matches.
            script_query = Q("exists", field="ports.scripts")
        res = Q("nested", path="ports",
                query=Q("nested", path="ports.scripts", query=script_query))
        return ~res if neg else res
Exemplo n.º 7
0
 def searchhost(cls, addr, neg=False):
     """Filters (if `neg` == True, filters out) one particular host
     (IP address).

     Returns a match query on the `addr` field, negated when `neg` is
     true.
     """
     res = Q('match', addr=addr)
     if neg:
         # Negate the query so that the host is excluded.
         return ~res
     return res
Exemplo n.º 8
0
def get_top_n_statistics():
    """
    Obtains TOP N DNS statistics.

    Reads `beginning`, `end`, `type` and `number` from the request
    variables, sums the values of every key found in the time range for
    the given statistic type, and keeps the `number` largest sums. For the
    "queried_by_ip" type each entry also carries the IP with the highest
    sum for that key and that IP's sum.

    :return: JSON with status "Ok", "Empty" or "Error" and requested data
             (a comma-separated string).
    """
    # Local import so the block does not rely on a file-level import.
    import json

    # Check login
    if not session.logged:
        return '{"status": "Error", "data": "You must be logged!"}'

    # Check mandatory inputs
    if not (request.get_vars.beginning and request.get_vars.end
            and request.get_vars.type and request.get_vars.number):
        return '{"status": "Error", "data": "Some mandatory argument is missing!"}'

    # Parse inputs and set correct format
    beginning = escape(request.get_vars.beginning)
    end = escape(request.get_vars.end)
    stat_type = escape(request.get_vars.type)
    try:
        number = int(escape(request.get_vars.number))
    except (TypeError, ValueError):
        # A non-numeric value is reported instead of crashing the request.
        return '{"status": "Error", "data": "Argument number must be an integer!"}'

    try:
        # Elastic query
        client = elasticsearch.Elasticsearch([{
            'host':
            myconf.get('consumer.hostname'),
            'port':
            myconf.get('consumer.port')
        }])
        elastic_bool = [
            {'range': {
                '@timestamp': {
                    'gte': beginning,
                    'lte': end
                }
            }},
            {'term': {'@stat_type': stat_type}},
        ]

        # Prepare query
        qx = Q({'bool': {'must': elastic_bool}})
        by_ip = stat_type == "queried_by_ip"

        # Set aggregations according to the statistic type
        search_ip = Search(using=client, index='_all').query(qx)
        if by_ip:
            search_ip.aggs.bucket('all_nested', 'nested', path='data_array') \
                .bucket('by_key', 'terms', field='data_array.key.raw', size=2147483647)\
                .bucket('by_ip', 'terms', field='data_array.ip', size=1, order={'sum_by_ip': 'desc'}) \
                .bucket('sum_by_ip', 'sum', field='data_array.value')
            search_ip.aggs['all_nested']['by_key'].bucket(
                'sum_total', 'sum', field='data_array.value')
        else:
            search_ip.aggs.bucket('all_nested', 'nested', path='data_array') \
                .bucket('by_key', 'terms', field='data_array.key.raw', size=2147483647) \
                .bucket('stats_sum', 'sum', field='data_array.value')
        results = search_ip.execute()

        # Ordered collection used to select the top N entries
        counter = collections.Counter()
        fields = []

        if by_ip:
            for record in results.aggregations.all_nested.by_key.buckets:
                top_ip = record.by_ip.buckets[0]
                counter[(record.key, top_ip.key, int(
                    top_ip.sum_by_ip.value))] = int(record.sum_total.value)

            # Select top N (number) values
            for value, count in counter.most_common(number):
                fields.extend(
                    (str(value[0]), str(value[1]), str(value[2]), str(count)))
        else:
            for all_buckets in results.aggregations.all_nested.by_key:
                counter[all_buckets.key] += int(all_buckets.stats_sum.value)

            # Select top N (number) values
            for value, count in counter.most_common(number):
                fields.extend((str(value), str(count)))

        data = ",".join(fields)

        if data == "":
            return '{"status": "Empty", "data": "No data found"}'
        # json.dumps quotes and escapes the payload so that keys holding
        # quotes or backslashes cannot break the JSON document.
        return '{"status": "Ok", "data": ' + json.dumps(data) + '}'

    except Exception as e:
        return '{"status": "Error", "data": ' + json.dumps(
            "Elasticsearch query exception: " + escape(str(e))) + '}'
Exemplo n.º 9
0
def search(request, spec, operator="and"):
    """Search packages from a field -> value(s) specification.

    `spec` must be a mapping; only a fixed set of known fields with truthy
    values is kept, and a string value is treated as a one-item list. The
    values of one field are OR-ed together; the per-field queries are then
    combined with `operator` ('and' -> must, 'or' -> should). At most 100
    results are fetched.

    Returns a list of dicts with `name`, `summary`, `version` and
    `_pypi_ordering`. When `version` is part of the spec, one entry is
    produced per matching version of each result; otherwise the latest
    version of each result is reported.

    Raises XMLRPCWrappedError for a non-mapping `spec` or an unknown
    `operator`.
    """
    if not isinstance(spec, collections.abc.Mapping):
        raise XMLRPCWrappedError(
            TypeError("Invalid spec, must be a mapping/dictionary."))

    if operator not in {"and", "or"}:
        raise XMLRPCWrappedError(
            ValueError("Invalid operator, must be one of 'and' or 'or'."))

    searchable_fields = {
        "name",
        "version",
        "author",
        "author_email",
        "maintainer",
        "maintainer_email",
        "home_page",
        "license",
        "summary",
        "description",
        "keywords",
        "platform",
        "download_url",
    }

    # Remove any invalid spec fields
    cleaned = {}
    for key, raw in spec.items():
        if raw and key in searchable_fields:
            cleaned[key] = [raw] if isinstance(raw, str) else raw
    spec = cleaned

    queries = []
    for field, value in sorted(spec.items()):
        combined = None
        for item in value:
            match_args = {"query": item}
            if field in SEARCH_BOOSTS:
                match_args["boost"] = SEARCH_BOOSTS[field]
            clause = Q("match", **{field: match_args})
            if combined is None:
                combined = clause
            else:
                combined |= clause
        queries.append(combined)

    bool_clause = "must" if operator == "and" else "should"
    query = request.es.query("bool", **{bool_clause: queries})

    results = query[:100].execute()

    request.registry.datadog.histogram("warehouse.xmlrpc.search.results",
                                       len(results))

    if "version" in spec:
        wanted_versions = spec["version"]
        return [{
            "name": r.name,
            "summary": getattr(r, "summary", None),
            "version": v,
            "_pypi_ordering": False,
        } for r in results for v in r.version if v in wanted_versions]

    latest = []
    for r in results:
        latest.append({
            "name": r.name,
            "summary": getattr(r, "summary", None),
            "version": r.latest_version,
            "_pypi_ordering": False,
        })
    return latest
Exemplo n.º 10
0
    def __payload_body(self,
                       query_params,
                       aggs_params,
                       size=SEARCH_SPLIT_LIMIT,
                       source=None):
        """Render the JSON search payload for the given query parameters.

        Each boolean clause list in `query_params` is parsed in place (a
        missing clause becomes an empty list), a bool query is built from
        them, and the result is substituted into a fixed payload template
        together with `size`, the `_source` selection and the timeout.
        `aggs_params` is currently not used; the payload always carries an
        empty aggregation section.

        Returns the payload as a JSON string. Raises
        ElasticsearchKibanaCLIException when `size` is below 1 or above
        SEARCH_SPLIT_LIMIT.
        """
        if size > SEARCH_SPLIT_LIMIT or size < 1:
            raise ElasticsearchKibanaCLIException(
                'Payload size is out-of-bounds in __payload_body()', size)

        clause_names = ('must', 'must_not', 'should', 'should_not', 'filter')
        for clause in clause_names:
            if clause not in query_params:
                query_params[clause] = []
                continue
            query_params[clause] = self.__parse_query_param(
                query_params[clause])

        aggs = {}  # for another day

        # An explicit setting wins; otherwise require one "should" match
        # as soon as there is any "should" clause.
        if 'minimum_should_match' in query_params:
            min_should_match = query_params['minimum_should_match']
        elif len(query_params['should']) > 0:
            min_should_match = 1
        else:
            min_should_match = None

        query = Q('bool',
                  must=query_params['must'],
                  must_not=query_params['must_not'],
                  should=query_params['should'],
                  should_not=query_params['should_not'],
                  minimum_should_match=min_should_match,
                  filter=query_params['filter'])

        if source is None:
            source_json = '[ ]'
        else:
            source_json = json.dumps(source).replace('___timestamp',
                                                     '@timestamp')

        if 'timeout' in query_params:
            timeout_text = str(query_params['timeout'])
        else:
            timeout_text = '{}s'.format(SEARCH_DEFAULT_TIMEOUT_SECONDS)

        # Placeholder -> replacement text, applied in this order.
        substitutions = [
            ('__SOURCE__', source_json),
            ('__SIZE__', str(size)),
            ('__AGGS__', json.dumps(aggs)),
            ('__QUERY__', json.dumps(query.to_dict())),
            ('__TIMEOUT__', '"' + timeout_text + '"'),
        ]

        payload_json = """
            {
              "version": true,
              "sort": [ ],
              "stored_fields": [ ],
              "script_fields": { },
              "docvalue_fields": [ ],
              "highlight": { },
              "_source": __SOURCE__,
              "size": __SIZE__,
              "aggs": __AGGS__,
              "query": __QUERY__,
              "timeout": __TIMEOUT__
            }
        """

        for token, replacement in substitutions:
            payload_json = payload_json.replace(token, replacement)

        # Round-trip through the parser to validate and normalize the text.
        return json.dumps(json.loads(payload_json))
Exemplo n.º 11
0
def get_records_list():
    """
    Obtains list of all records for given type given time range.

    Reads `beginning`, `end` and `type` from the request variables and
    sums the values of every key found in the time range for the given
    statistic type. The data is a comma-separated string of alternating
    keys and integer sums.

    :return: JSON with status "Ok" or "Error" and requested data.
    """
    # Local import so the block does not rely on a file-level import.
    import json

    # Check login
    if not session.logged:
        return '{"status": "Error", "data": "You must be logged!"}'

    # Check mandatory inputs
    if not (request.get_vars.beginning and request.get_vars.end
            and request.get_vars.type):
        return '{"status": "Error", "data": "Some mandatory argument is missing!"}'

    # Parse inputs and set correct format
    beginning = escape(request.get_vars.beginning)
    end = escape(request.get_vars.end)
    stat_type = escape(request.get_vars.type)

    try:
        # Elastic query
        client = elasticsearch.Elasticsearch([{
            'host':
            myconf.get('consumer.hostname'),
            'port':
            myconf.get('consumer.port')
        }])
        elastic_bool = [
            {'range': {
                '@timestamp': {
                    'gte': beginning,
                    'lte': end
                }
            }},
            {'term': {'@stat_type': stat_type}},
        ]

        # Prepare query
        qx = Q({'bool': {'must': elastic_bool}})

        # Sum the values of each key inside the nested data array
        search_ip = Search(using=client, index='_all').query(qx)
        search_ip.aggs.bucket('all_nested', 'nested', path='data_array')\
            .bucket('by_key', 'terms', field='data_array.key.raw', size=2147483647)\
            .bucket('stats_sum', 'sum', field='data_array.value')
        results = search_ip.execute()

        fields = []
        for all_buckets in results.aggregations.all_nested.by_key:
            fields.append(str(all_buckets.key))
            fields.append(str(int(all_buckets.stats_sum.value)))
        data = ",".join(fields)

        # json.dumps quotes and escapes the payload so that keys holding
        # quotes or backslashes cannot break the JSON document.
        return '{"status": "Ok", "data": ' + json.dumps(data) + '}'

    except Exception as e:
        return '{"status": "Error", "data": ' + json.dumps(
            "Exception: " + escape(str(e))) + '}'
Exemplo n.º 12
0
 def filter_query(type):
     """Return a term query on the exact project type field for `type`."""
     field_name = 'project.value.projectType._exact'
     return Q('term', **{field_name: type})
Exemplo n.º 13
0
def search(offset=0, limit=100, query_string='', limit_fields=True, *args):
    """List published publications matching a URL-quoted JSON query.

    `query_string` is unquoted and parsed as JSON; it must provide the
    `typeFilters`, `queries` and `advancedFilters` sections read below.
    Every non-empty criterion adds one filter, and all filters must match.
    Results are sorted by creation date, newest first, and paged with
    `offset` / `limit`. When `limit_fields` is true only a fixed subset of
    fields is fetched.

    Returns a dict with a single `listing` key: the hits as dicts, each
    extended with a `pi` entry looked up from the hit's PI username.
    """
    query_dict = json.loads(urllib.parse.unquote(query_string))

    type_filters = query_dict['typeFilters']
    has_type_filters = any(bool(flag) for flag in type_filters.values())

    # Project types whose filter flag is switched on.
    selected_filters = [key for key in type_filters if type_filters[key]]
    type_query = Q('bool', should=[
        Q('term', **{'project.value.projectType._exact': project_type})
        for project_type in selected_filters
    ])

    client = new_es_client()
    search = IndexedPublication.search(using=client)
    if has_type_filters:
        search = search.filter(type_query)

    query_filters = []

    # Query string fields
    author = query_dict['queries']['author']
    title = query_dict['queries']['title']
    keywords = query_dict['queries']['keyword']
    description = query_dict['queries']['description']
    if author:
        query_filters.append(search_utils.author_query(author))
    if title:
        query_filters.append(search_utils.title_query(title))
    if keywords:
        query_filters.append(search_utils.keyword_query(keywords))
    if description:
        query_filters.append(search_utils.description_query(description))

    advanced = query_dict['advancedFilters']

    # Experimental advanced filters
    experimental = advanced['experimental']
    facility = experimental['experimentalFacility']
    experiment_type = advanced['experimental']['experimentType']
    if facility['name']:
        query_filters.append(
            search_utils.experimental_facility_query(facility))
    if experiment_type:
        query_filters.append(
            search_utils.experiment_type_query(experiment_type))

    # Simulation advanced filters
    simulation_type = advanced['simulation']['simulationType']
    if simulation_type:
        query_filters.append(
            search_utils.simulation_type_query(simulation_type))

    # Field recon advanced filters
    field_recon = advanced['field_recon']
    nh_type = field_recon['naturalHazardType']
    nh_event = advanced['field_recon']['naturalHazardEvent']
    if nh_type:
        query_filters.append(search_utils.nh_type_query(nh_type))
    if nh_event:
        query_filters.append(search_utils.nh_event_query(nh_event))

    # Other advanced filters
    data_type = advanced['other']['dataType']
    if data_type:
        query_filters.append(search_utils.other_type_query(data_type))

    # Hybrid sim advanced filters
    sim_type = advanced['hybrid_simulation']['hybridSimulationType']
    if sim_type:
        query_filters.append(search_utils.hybrid_sim_type_query(sim_type))

    search = search.filter('bool', must=query_filters)
    search = search.filter(Q('term', status='published'))
    search = search.extra(from_=offset, size=limit)
    if limit_fields:
        wanted_fields = [
            'project.value.title', 'project.value.pi',
            'project.value.keywords', 'project.value.projectType',
            'project.value.dataType', 'created', 'projectId', 'users',
            'system', 'revision'
        ]
        search = search.source(includes=wanted_fields)

    search = search.sort({'created': {'order': 'desc'}})
    res = search.execute()
    hits = [
        {**hit.to_dict(), 'pi': _get_user_by_username(hit, hit.project.value.pi)}
        for hit in res.hits
    ]

    return {'listing': hits}
Exemplo n.º 14
0
def neesdescription(project_id, *args):
    """Look up the description of a legacy publication.

    Searches ``IndexedPublicationLegacy`` for documents whose
    ``project._exact`` term equals *project_id*, requesting only the
    ``description`` source field.

    :param project_id: value matched against ``project._exact``.
    :param args: accepted but not used.
    :return: ``{'description': <description of the first hit>}``.
    :raises StopIteration: when the search returns no hits.
    """
    search = IndexedPublicationLegacy.search()
    project_filter = Q({'term': {'project._exact': project_id}})
    search = search.filter(project_filter).source(includes=['description'])

    # Only the first hit is used; next() raises StopIteration if there is none.
    first_hit = next(iter(search.execute().hits))
    return {'description': first_hit.description}
Exemplo n.º 15
0
 def last_run(self):
     """Return the first run log for this task, ordered by ``-start``.

     Searches ``CeleryTaskRunLog`` for entries whose ``celery_task_id``
     term equals ``self.celery_task_id`` and sorts them by ``start`` in
     descending order.

     :return: the first hit of the response, or ``None`` when the
         response is empty.
     """
     search = CeleryTaskRunLog.search()
     same_task = Q('term', celery_task_id=self.celery_task_id)
     response = search.filter(same_task).sort('-start').execute()
     if not response:
         return None
     return response[0]
Exemplo n.º 16
0
 def _make_query_string_query(self, query, fields, default_operator=AND):
     """Build a ``QUERY_STRING`` query.

     :param query: value passed as the ``query`` option.
     :param fields: value passed as the ``fields`` option.
     :param default_operator: value passed as the ``default_operator``
         option (defaults to ``AND``).
     :return: the ``Q`` object built from these options.
     """
     options = {
         'query': query,
         'fields': fields,
         'default_operator': default_operator,
     }
     return Q(QUERY_STRING, **options)
Exemplo n.º 17
0
    def serialize(self, pid, record, links_factory=None):
        """Serialize the publications of one author as a JSON array.

        :param pid:
            Persistent identifier instance; its ``pid_value`` is the author
            recid matched against ``authors.recid``.

        :param record:
            Record instance (not used by this serializer).

        :param links_factory:
            Factory function for the link generation (not used by this
            serializer).

        :return:
            JSON string holding a list of publication dicts. Each dict has
            ``id``, ``record`` and ``title``, plus ``date``, ``type``,
            ``citations``, ``journal`` and ``collaborations`` when the
            corresponding source data is present.
        """
        recid = pid.pid_value

        source_fields = [
            'accelerator_experiments',
            'citation_count',
            'control_number',
            'earliest_date',
            'facet_inspire_doc_type',
            'keywords',
            'publication_info',
            'self',
            'titles',
        ]
        author_match = Q('match', authors__recid=recid)
        search = LiteratureSearch().query(
            'nested', path='authors', query=author_match
        ).params(_source=source_fields)

        entries = []
        for hit in search.scan():
            source = hit.to_dict()

            entry = {
                'id': int(source['control_number']),
                'record': source['self'],
                'title': get_title(source),
            }

            # Earliest date, when available.
            if 'earliest_date' in source:
                entry['date'] = source['earliest_date']

            # Publication type: first document type, when the list is not empty.
            try:
                entry['type'] = source.get('facet_inspire_doc_type', [])[0]
            except IndexError:
                pass

            # Citation count, when available.
            if 'citation_count' in source:
                entry['citations'] = source['citation_count']

            # Journal: taken from the first publication_info item. The entry
            # is dropped again when there is no item or no journal title.
            journal = {}
            entry['journal'] = journal
            try:
                first_info = source.get('publication_info', [])[0]
                journal['title'] = first_info['journal_title']

                # Journal id and record link are optional extras.
                try:
                    journal['id'] = first_info['journal_recid']
                    journal['record'] = first_info['journal_record']
                except KeyError:
                    pass
            except (IndexError, KeyError):
                del entry['journal']

            # Collaborations: distinct experiment names of the record.
            collaborations = {
                experiment.get('experiment')
                for experiment in source.get('accelerator_experiments', [])
            }
            if collaborations:
                entry['collaborations'] = list(collaborations)

            entries.append(entry)

        return json.dumps(entries)
Exemplo n.º 18
0
 def _make_bool_query(self, **kwargs):
     """Build a ``BOOL`` query, forwarding every keyword argument to ``Q``."""
     bool_query = Q(BOOL, **kwargs)
     return bool_query
Exemplo n.º 19
0
 def searchnonexistent():
     """Return a ``match`` query on ``_id`` equal to 0.

     NOTE(review): judging by the name, this is presumably meant to match
     no document; confirm that no document is stored under this id.
     """
     unused_id = 0
     return Q('match', _id=unused_id)
Exemplo n.º 20
0
 def _make_must_equal_terms_query(self, field, terms, **kwargs):
     """Build a ``TERMS`` query mapping *field* to *terms*.

     :param field: name of the field, used as the keyword passed to ``Q``.
     :param terms: value associated with that field.
     :param kwargs: accepted but not used.
     :return: the resulting ``Q`` object.
     """
     clause = {field: terms}
     return Q(TERMS, **clause)
Exemplo n.º 21
0
    def topvalues(self, field, flt=None, topnbr=10, sort=None, least=False):
        """
        This method uses an aggregation to produce top values for a given
        field or pseudo-field. Pseudo-fields are:
          - category / asnum / country / net[:mask]
          - port
          - port:open / :closed / :filtered / :<servicename>
          - portlist:open / :closed / :filtered
          - countports:open / :closed / :filtered
          - service / service:<portnbr>
          - product / product:<portnbr>
          - cpe / cpe.<part> / cpe:<cpe_spec> / cpe.<part>:<cpe_spec>
          - devicetype / devicetype:<portnbr>
          - script:<scriptid> / script:<port>:<scriptid>
            / script:host:<scriptid>
          - cert.* / smb.* / sshkey.* / ike.*
          - httphdr / httphdr.{name,value} / httphdr:<name>
          - httpapp / httpapp:<name>
          - modbus.* / s7.* / enip.*
          - mongo.dbs.*
          - vulns.*
          - screenwords
          - file.* / file.*:scriptid
          - hop

        Values of `field` that have no dedicated branch in this method
        are passed unchanged as the field name of a plain terms
        aggregation.

        :param field: field or pseudo-field name to aggregate on.
        :param flt: query filter; `self.flt_empty` is used when None.
        :param topnbr: number of buckets requested (terms `size`).
        :param sort: not used by this method.
        :param least: when true, buckets are ordered by ascending count.
        :return: generator of `{'_id': <value>, 'count': <doc_count>}`
            dicts; `<value>` is the bucket key, converted by the
            branch's `outputproc` function when one is defined.

        """
        # Options shared by every terms aggregation built below.
        baseterms = {"size": topnbr}
        if least:
            baseterms["order"] = {"_count": "asc"}
        # outputproc: optional converter applied to each bucket key.
        # nested: optional aggregation body used instead of the default
        # plain terms aggregation on `field`.
        outputproc = None
        nested = None
        if flt is None:
            flt = self.flt_empty
        if field == "category":
            field = {"field": "categories"}
        elif field == "asnum":
            flt = self.flt_and(flt, Q("exists", field="infos.as_num"))
            field = {"field": "infos.as_num"}
        elif field == "as":
            # Keys are "<as_num>,<as_name>" strings built by the script
            # below; only the first part is converted to int.
            def outputproc(value):
                return tuple(val if i else int(val)
                             for i, val in enumerate(value.split(',', 1)))
            flt = self.flt_and(flt, Q("exists", field="infos.as_num"))
            field = {"script": {
                "lang": "painless",
                "source":
                "doc['infos.as_num'].value + ',' + "
                "doc['infos.as_name'].value",
            }}
        elif field == "port" or field.startswith("port:"):
            # Keys are "<protocol>/<port>" strings built by the script
            # below; only the port part is converted to int.
            def outputproc(value):
                return tuple(int(val) if i else val
                             for i, val in enumerate(value.rsplit('/', 1)))
            if field == "port":
                flt = self.flt_and(flt,
                                   Q('nested', path='ports',
                                     query=Q('exists', field="ports.port")))
                nested = {
                    "nested": {"path": "ports"},
                    "aggs": {"patterns": {
                        "filter": {'bool': {'must_not': [
                            {'match': {'ports.port': -1}},
                        ]}},
                        "aggs": {"patterns": {
                            "terms": dict(
                                baseterms,
                                script={
                                    "lang": "painless",
                                    "source":
                                    'doc["ports.protocol"].value + "/" + '
                                    'doc["ports.port"].value',
                                },
                            ),
                        }},
                    }},
                }
            else:
                # Text after "port:" is either a port state or, failing
                # that, treated as a service name.
                info = field[5:]
                if info in ['open', 'filtered', 'closed']:
                    flt = self.flt_and(flt,
                                       Q('nested', path='ports',
                                         query=Q('match',
                                                 ports__state_state=info)))
                    matchfield = "state_state"
                else:
                    flt = self.flt_and(flt,
                                       Q('nested', path='ports',
                                         query=Q('match',
                                                 ports__service_name=info)))
                    matchfield = "service_name"
                nested = {
                    "nested": {"path": "ports"},
                    "aggs": {"patterns": {
                        "filter": {'bool': {
                            'must': [{'match': {'ports.%s' % matchfield:
                                                info}}],
                            'must_not': [{'match': {'ports.port': -1}}],
                        }},
                        "aggs": {"patterns": {
                            "terms": dict(
                                baseterms,
                                script={
                                    "lang": "painless",
                                    "source":
                                    'doc["ports.protocol"].value + "/" + '
                                    'doc["ports.port"].value',
                                },
                            ),
                        }},
                    }},
                }
        elif field == 'service':
            # Empty keys (the `missing` placeholder) are reported as None.
            def outputproc(value):
                return value or None
            flt = self.flt_and(flt, self.searchopenport())
            nested = {
                "nested": {"path": "ports"},
                "aggs": {"patterns": {
                    "filter": {"match": {"ports.state_state": "open"}},
                    "aggs": {"patterns": {
                        "terms": dict(
                            baseterms,
                            field="ports.service_name",
                            missing="",
                        ),
                    }},
                }},
            }
        elif field.startswith("service:"):
            port = int(field[8:])
            flt = self.flt_and(flt, self.searchport(port))
            nested = {
                "nested": {"path": "ports"},
                "aggs": {"patterns": {
                    "filter": {"bool": {"must": [
                        {"match": {"ports.state_state": "open"}},
                        {"match": {"ports.port": port}},
                    ]}},
                    "aggs": {"patterns": {
                        "terms": dict(
                            baseterms,
                            field="ports.service_name",
                            missing="",
                        ),
                    }},
                }},
            }
        elif field == 'product':
            # Keys are "<service_name>###<service_product>" strings built
            # by the script below; empty parts are reported as None.
            def outputproc(value):
                return tuple(v or None for v in value.split('###', 1))
            flt = self.flt_and(flt, self.searchopenport())
            nested = {
                "nested": {"path": "ports"},
                "aggs": {"patterns": {
                    "filter": {"match": {"ports.state_state": "open"}},
                    "aggs": {"patterns": {
                        "terms": dict(
                            baseterms,
                            script="""
String result = "";
if(doc['ports.service_name'].size() > 0) {
    result += doc['ports.service_name'].value;
}
result += "###";
if(doc['ports.service_product'].size() > 0) {
    result += doc['ports.service_product'].value;
}
return result;
""",
                            missing="",
                        ),
                    }},
                }},
            }
        elif field.startswith("product:"):
            def outputproc(value):
                return tuple(v or None for v in value.split('###', 1))
            # Text after "product:" is a port number when all digits,
            # otherwise a service name.
            info = field[8:]
            if info.isdigit():
                info = int(info)
                flt = self.flt_and(flt, self.searchport(info))
                matchfield = "port"
            else:
                flt = self.flt_and(flt, self.searchservice(info))
                matchfield = "service_name"
            nested = {
                "nested": {"path": "ports"},
                "aggs": {"patterns": {
                    "filter": {"bool": {"must": [
                        {"match": {"ports.state_state": "open"}},
                        {"match": {"ports.%s" % matchfield: info}},
                    ]}},
                    "aggs": {"patterns": {
                        "terms": dict(
                            baseterms,
                            script="""
String result = "";
if(doc['ports.service_name'].size() > 0) {
    result += doc['ports.service_name'].value;
}
result += "###";
if(doc['ports.service_product'].size() > 0) {
    result += doc['ports.service_product'].value;
}
return result;
""",
                        ),
                    }},
                }},
            }
        elif field == 'version':
            # Keys are "<service_name>###<service_product>###
            # <service_version>" strings built by the script below; empty
            # parts are reported as None.
            def outputproc(value):
                return tuple(v or None for v in value.split('###', 2))
            flt = self.flt_and(flt, self.searchopenport())
            nested = {
                "nested": {"path": "ports"},
                "aggs": {"patterns": {
                    "filter": {"match": {"ports.state_state": "open"}},
                    "aggs": {"patterns": {
                        "terms": dict(
                            baseterms,
                            script="""
String result = "";
if(doc['ports.service_name'].size() > 0) {
    result += doc['ports.service_name'].value;
}
result += "###";
if(doc['ports.service_product'].size() > 0) {
    result += doc['ports.service_product'].value;
}
result += "###";
if(doc['ports.service_version'].size() > 0) {
    result += doc['ports.service_version'].value;
}
return result;
""",
                            missing="",
                        ),
                    }},
                }},
            }
        elif field.startswith('version:'):
            def outputproc(value):
                return tuple(v or None for v in value.split('###', 2))
            # Text after "version:" is a port number when all digits, a
            # "<service>:<product>" pair when it contains ':', otherwise
            # a service name.
            info = field[8:]
            if info.isdigit():
                port = int(info)
                flt = self.flt_and(flt, self.searchport(port))
                matchflt = Q("match", ports__port=port)
            elif ":" in info:
                service, product = info.split(':', 1)
                flt = self.flt_and(flt, self.searchproduct(
                    product=product,
                    service=service,
                ))
                matchflt = (
                    Q("match", ports__service_name=service) &
                    Q("match", ports__service_product=product)
                )
            else:
                flt = self.flt_and(flt, self.searchservice(info))
                matchflt = Q("match", ports__service_name=info)
            matchflt &= Q("match", ports__state_state="open")
            nested = {
                "nested": {"path": "ports"},
                "aggs": {"patterns": {
                    "filter": matchflt.to_dict(),
                    "aggs": {"patterns": {
                        "terms": dict(
                            baseterms,
                            script="""
String result = "";
if(doc['ports.service_name'].size() > 0) {
    result += doc['ports.service_name'].value;
}
result += "###";
if(doc['ports.service_product'].size() > 0) {
    result += doc['ports.service_product'].value;
}
result += "###";
if(doc['ports.service_version'].size() > 0) {
    result += doc['ports.service_version'].value;
}
return result;
""",
                        ),
                    }},
                }},
            }
        elif field == 'httphdr':
            # Keys are "<name>:<value>" strings built by the script below.
            def outputproc(value):
                return tuple(value.split(':', 1))
            flt = self.flt_and(flt, self.searchhttphdr())
            nested = {
                "nested": {"path": "ports"},
                "aggs": {"patterns": {
                    "nested": {"path": "ports.scripts"},
                    "aggs": {"patterns": {
                        "nested": {"path": "ports.scripts.http-headers"},
                        "aggs": {"patterns": {
                            "terms": dict(
                                baseterms,
                                script={
                                    "lang": "painless",
                                    "source":
                                    "doc['ports.scripts.http-headers.name']."
                                    "value + ':' + doc['ports.scripts.http-"
                                    "headers.value'].value"
                                },
                            )
                        }},
                    }},
                }},
            }
        elif field.startswith('httphdr.'):
            flt = self.flt_and(flt, self.searchhttphdr())
            field = "ports.scripts.http-headers.%s" % field[8:]
            nested = {
                "nested": {"path": "ports"},
                "aggs": {"patterns": {
                    "nested": {"path": "ports.scripts"},
                    "aggs": {"patterns": {
                        "nested": {"path": "ports.scripts.http-headers"},
                        "aggs": {"patterns": {
                            "terms": dict(
                                baseterms,
                                field=field
                            ),
                        }},
                    }},
                }},
            }
        elif field.startswith('httphdr:'):
            # The header name is lowercased before being searched for.
            subfield = field[8:].lower()
            flt = self.flt_and(flt, self.searchhttphdr(name=subfield))
            nested = {
                "nested": {"path": "ports"},
                "aggs": {"patterns": {
                    "nested": {"path": "ports.scripts"},
                    "aggs": {"patterns": {
                        "nested": {"path": "ports.scripts.http-headers"},
                        "aggs": {"patterns": {
                            "filter": {"match": {
                                "ports.scripts.http-headers.name": subfield,
                            }},
                            "aggs": {"patterns": {
                                "terms": dict(
                                    baseterms,
                                    field='ports.scripts.http-headers.value',
                                ),
                            }},
                        }},
                    }},
                }},
            }
        elif field == 'httpapp':
            # Keys are "<application>:<version>" strings built by the
            # script below.
            def outputproc(value):
                return tuple(value.split(':', 1))
            flt = self.flt_and(flt, self.searchhttpapp())
            nested = {
                "nested": {"path": "ports"},
                "aggs": {"patterns": {
                    "nested": {"path": "ports.scripts"},
                    "aggs": {"patterns": {
                        "nested": {"path": "ports.scripts.http-app"},
                        "aggs": {"patterns": {
                            "terms": dict(
                                baseterms,
                                script={
                                    "lang": "painless",
                                    "source":
                                    "doc['ports.scripts.http-app.application']"
                                    ".value + ':' + doc['ports.scripts.http-"
                                    "app.version'].value"
                                },
                            )
                        }},
                    }},
                }},
            }
        elif field.startswith('httpapp:'):
            subfield = field[8:].lower()
            flt = self.flt_and(flt, self.searchhttpapp(name=subfield))
            nested = {
                "nested": {"path": "ports"},
                "aggs": {"patterns": {
                    "nested": {"path": "ports.scripts"},
                    "aggs": {"patterns": {
                        "nested": {"path": "ports.scripts.http-app"},
                        "aggs": {"patterns": {
                            "filter": {"match": {
                                "ports.scripts.http-app.application": subfield,
                            }},
                            "aggs": {"patterns": {
                                "terms": dict(
                                    baseterms,
                                    field='ports.scripts.http-app.version',
                                ),
                            }},
                        }},
                    }},
                }},
            }
        elif field == 'useragent' or field.startswith('useragent:'):
            if field == 'useragent':
                flt = self.flt_and(flt, self.searchuseragent())
                nested = {
                    "nested": {"path": "ports"},
                    "aggs": {"patterns": {
                        "nested": {"path": "ports.scripts"},
                        "aggs": {"patterns": {
                            "terms": dict(
                                baseterms,
                                field="ports.scripts.http-user-agent",
                            ),
                        }},
                    }},
                }
            else:
                subfield = utils.str2regexp(field[10:])
                flt = self.flt_and(flt,
                                   self.searchuseragent(useragent=subfield))
                # The terms `include` option takes a pattern string: use
                # the regexp's pattern, or escape a literal value.
                if isinstance(subfield, utils.REGEXP_T):
                    subfield = self._get_pattern(subfield)
                else:
                    subfield = re.escape(subfield)
                nested = {
                    "nested": {"path": "ports"},
                    "aggs": {"patterns": {
                        "nested": {"path": "ports.scripts"},
                        "aggs": {"patterns": {
                            "terms": dict(
                                baseterms,
                                field="ports.scripts.http-user-agent",
                                include=subfield,
                            ),
                        }},
                    }},
                }
        elif field == 'ja3-client' or (
                field.startswith('ja3-client') and field[10] in ':.'
        ):
            # Accepted forms: ja3-client[.<subfield>][:<value>]; the
            # subfield defaults to 'md5'.
            if ':' in field:
                field, value = field.split(':', 1)
                subkey, value = self._ja3keyvalue(utils.str2regexp(value))
                if isinstance(value, utils.REGEXP_T):
                    include_value = self._get_pattern(value)
                    filter_value = {'regexp': {
                        "ports.scripts.ssl-ja3-client.%s" % subkey:
                        include_value,
                    }}
                else:
                    include_value = re.escape(value)
                    filter_value = {'match': {
                        "ports.scripts.ssl-ja3-client.%s" % subkey: value,
                    }}
            else:
                value = None
                subkey = None
            if '.' in field:
                field, subfield = field.split('.', 1)
            else:
                subfield = 'md5'
            base = {
                "terms": dict(
                    baseterms,
                    field="ports.scripts.ssl-ja3-client.%s" % subfield,
                ),
            }
            # When the value constrains a different key than the one being
            # aggregated, wrap the terms in a filter; otherwise restrict
            # the terms themselves with `include`.
            if subkey is not None:
                if subkey != subfield:
                    base = {
                        "filter": filter_value,
                        "aggs": {"patterns": base},
                    }
                else:
                    base["terms"]["include"] = include_value
            flt = self.flt_and(flt, self.searchja3client(value_or_hash=value))
            nested = {
                "nested": {"path": "ports"},
                "aggs": {"patterns": {
                    "nested": {"path": "ports.scripts"},
                    "aggs": {"patterns": {
                        "nested": {"path": "ports.scripts.ssl-ja3-client"},
                        "aggs": {"patterns": base},
                    }},
                }},
            }
        elif field == 'ja3-server' or (
                field.startswith('ja3-server') and field[10] in ':.'
        ):
            # Keys are "<server value>/<client value>" strings built by
            # the script below.
            def outputproc(value):
                return tuple(value.split('/'))
            # Accepted forms: ja3-server[.<subfield>][:<value1>[:<value2>]]
            # where value1 constrains the server entry and value2 its
            # `client` sub-entry; either may be empty in the two-value form.
            if ':' in field:
                field, values = field.split(':', 1)
                if ':' in values:
                    value1, value2 = values.split(':', 1)
                    if value1:
                        subkey1, value1 = self._ja3keyvalue(
                            utils.str2regexp(value1)
                        )
                        if isinstance(value1, utils.REGEXP_T):
                            filter_value1 = {'regexp': {
                                "ports.scripts.ssl-ja3-server.%s" % subkey1:
                                self._get_pattern(value1),
                            }}
                        else:
                            filter_value1 = {'match': {
                                "ports.scripts.ssl-ja3-server.%s" % subkey1:
                                value1,
                            }}
                    else:
                        subkey1, value1 = None, None
                    if value2:
                        subkey2, value2 = self._ja3keyvalue(
                            utils.str2regexp(value2)
                        )
                        if isinstance(value2, utils.REGEXP_T):
                            filter_value2 = {'regexp': {
                                "ports.scripts.ssl-ja3-server.client.%s" %
                                subkey2:
                                self._get_pattern(value2),
                            }}
                        else:
                            filter_value2 = {'match': {
                                "ports.scripts.ssl-ja3-server.client.%s" %
                                subkey2:
                                value2,
                            }}
                    else:
                        subkey2, value2 = None, None
                else:
                    subkey1, value1 = self._ja3keyvalue(
                        utils.str2regexp(values)
                    )
                    if isinstance(value1, utils.REGEXP_T):
                        filter_value1 = {'regexp': {
                            "ports.scripts.ssl-ja3-server.%s" % subkey1:
                            self._get_pattern(value1),
                        }}
                    else:
                        filter_value1 = {'match': {
                            "ports.scripts.ssl-ja3-server.%s" % subkey1:
                            value1,
                        }}
                    subkey2, value2 = None, None
            else:
                subkey1, value1 = None, None
                subkey2, value2 = None, None
            if '.' in field:
                field, subfield = field.split('.', 1)
            else:
                subfield = 'md5'
            flt = self.flt_and(flt, self.searchja3server(
                value_or_hash=value1,
                client_value_or_hash=value2,
            ))
            base = {
                "terms": dict(
                    baseterms,
                    script={
                        "lang": "painless",
                        "source":
                        "doc['ports.scripts.ssl-ja3-server.%s'].value + '/' + "
                        "doc['ports.scripts.ssl-ja3-server.client.%s'].value" %
                        (subfield, subfield),
                    },
                ),
            }
            # filter_value1 / filter_value2 are only defined when the
            # matching value is not None, hence these guards.
            if value1 is not None:
                base = {
                    "filter": filter_value1,
                    "aggs": {"patterns": base},
                }
            if value2 is not None:
                base = {
                    "filter": filter_value2,
                    "aggs": {"patterns": base},
                }
            # NOTE(review): the same searchja3server() filter, with the
            # same arguments, was already and-ed into flt a few lines
            # above; this second application looks redundant -- confirm.
            flt = self.flt_and(flt, self.searchja3server(
                value_or_hash=value1,
                client_value_or_hash=value2,
            ))
            nested = {
                "nested": {"path": "ports"},
                "aggs": {"patterns": {
                    "nested": {"path": "ports.scripts"},
                    "aggs": {"patterns": {
                        "nested": {"path": "ports.scripts.ssl-ja3-server"},
                        "aggs": {"patterns": base},
                    }},
                }},
            }
        elif field.startswith('s7.'):
            flt = self.flt_and(flt, self.searchscript(name="s7-info"))
            subfield = field[3:]
            field = {'field': 'ports.scripts.s7-info.' + subfield}
        else:
            # No dedicated handling: aggregate on the field name as given.
            field = {"field": field}
        body = {"query": flt.to_dict()}
        if nested is None:
            body["aggs"] = {"patterns": {"terms": dict(baseterms, **field)}}
        else:
            body["aggs"] = {"patterns": nested}
        utils.LOGGER.debug("DB: Elasticsearch aggregation: %r", body)
        # size=0: only the aggregation result is read below, not the hits.
        result = self.db_client.search(
            body=body,
            index=self.indexes[0],
            ignore_unavailable=True,
            size=0
        )
        result = result["aggregations"]
        # Every aggregation level is named 'patterns': descend until the
        # innermost one, which holds the terms buckets.
        while 'patterns' in result:
            result = result['patterns']
        result = result['buckets']
        if outputproc is None:
            for res in result:
                yield {'_id': res['key'], 'count': res['doc_count']}
        else:
            for res in result:
                yield {'_id': outputproc(res['key']),
                       'count': res['doc_count']}
Exemplo n.º 22
0
 def _make_field_must_exist_query(self, field, **kwargs):
     """Build an ``EXISTS`` query for *field*.

     :param field: field name passed as the ``field`` option.
     :param kwargs: accepted but not used.
     :return: the resulting ``Q`` object.
     """
     exists_query = Q(EXISTS, field=field)
     return exists_query
Exemplo n.º 23
0
 def searchhaslocation(neg=False):
     """Build a query on the existence of ``infos.coordinates``.

     :param neg: when true, return the negated query instead.
     :return: the ``exists`` query, or its negation.
     """
     has_coordinates = Q('exists', field='infos.coordinates')
     return ~has_coordinates if neg else has_coordinates
Exemplo n.º 24
0
 def get_es_query(self):
     """Return one ``term`` query on ``tags`` per value of ``get_value()``."""
     # A single 'terms' query is not enough here: it would accept a match
     # on any listed tag, whereas every tag has to match exactly.
     clauses = []
     for tag in self.get_value():
         clauses.append(Q('term', tags=tag))
     return clauses
Exemplo n.º 25
0
 def search(self):
     """Run a ``multi_match`` query for ``self.query`` over ``INDEXED_FIELDS``.

     :return: whatever ``__get_queryset`` returns for that query.
     """
     logger.debug("SearchManager search invoked")
     text_query = Q("multi_match", query=self.query, fields=INDEXED_FIELDS)
     queryset = self.__get_queryset(text_query)
     return queryset
Exemplo n.º 26
0
 def get_es_query(self):
     """Return a one-element list holding this filter's query.

     The query type is ``self.operator`` and it maps ``self.es_field`` to
     the result of ``self.get_value()``.
     """
     clause = Q(self.operator, **{self.es_field: self.get_value()})
     return [clause]
Exemplo n.º 27
0
    class Meta:
        """Search configuration for the OAI server."""

        # Default filter: an ``exists`` query on the ``_oai.id`` field.
        default_filter = Q("exists", field="_oai.id")
Exemplo n.º 28
0
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, connections, query, Q
import logging
import pandas as pd
from IPython import display
import numpy as np
from matplotlib_venn import venn3
from matplotlib import pyplot as plt

# Default Elasticsearch client. The production server is active; the
# development server is http://172.20.31.19:9200/.
elasticServer = 'http://172.20.30.70:9200/'
client = Elasticsearch(hosts=[elasticServer])

# Built here but not attached to the search below, which calls query()
# without arguments.
q = Q('match', id='_search')

s = Search(
    using=client,
    index="propertypriceregister",
    doc_type="propertypriceregister",
)
s = s.query()

print("processing")

# Collect, in first-seen order and without duplicates, every entry of
# "perfectMatches" from the documents flagged with "hasPerfectMatch".
inPerfectMatch = []
for hit in s.scan():
    if not hit["hasPerfectMatch"] == True:
        continue
    for field in hit["perfectMatches"]:
        if field in inPerfectMatch:
            continue
        inPerfectMatch.append(field)

print("processing finished")
print()
Exemplo n.º 29
0
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
import csv

# Fields requested from each hit; also used as the CSV column order.
source = ['lid', 'title', 'time_limited', 'effective_date']

# Intended title endings: one of the two draft markers in the pattern
# ("draft for comment" or "draft"), optionally followed by a closing
# parenthesis.
# NOTE(review): the ')' after the group is meant to be a literal closing
# parenthesis that is part of the title -- confirm the regexp query accepts
# the pattern exactly as written.
q_draft = Q('bool',
            must=[Q("regexp", title_term=".*(征求意见稿|草案))?")],
            must_not=[Q("match", time_limited="征求意见稿或草案")])

# Same title pattern without the ``time_limited`` exclusion.
q_draft_2 = Q('bool', must=[Q("regexp", title_term=".*(征求意见稿|草案))?")])

es = Elasticsearch(hosts="nes1:9206")
s = Search(using=es, index="law_regu_dev",
           doc_type="law_regu").source(source).query(q_draft)

# The csv module requires files opened with newline='' so that it controls
# line endings itself (otherwise rows can be written with '\r\r\n' on
# Windows).  The encoding is pinned to UTF-8 because the rows contain
# non-ASCII text and the locale default may be unable to encode it.
# NOTE: the file is opened in append mode, so a header row is written on
# every run.
with open('q_draft.csv', 'a', newline='', encoding='utf-8') as f:
    w = csv.DictWriter(f, source)
    w.writeheader()
    for hit in s.scan():
        w.writerow(hit.to_dict())
Exemplo n.º 30
0
    def aggregate_by_event_data(self,
                                event_id=None,
                                event_data_name="Image",
                                sub_event_data_name=None,
                                bucket_size=1000,
                                sub_bucket_size=100,
                                threshold=None,
                                filter_event_data_name='',
                                filter_event_data_value='',
                                aggregate_by_hostname=False):
        """Run a terms aggregation over a ``winlog.event_data`` field.

        The search targets the ``winlogbeat-*`` indices and starts from the
        clauses returned by ``self.get_default_query()``.

        Args:
            event_id: When not ``None``, adds a ``match`` clause on
                ``winlog.event_id``.
            event_data_name: Name of the ``winlog.event_data`` sub-field to
                aggregate on; also used as the aggregation name.
            sub_event_data_name: When truthy, adds a nested terms
                aggregation on ``winlog.event_data.<sub_event_data_name>``.
            bucket_size: ``size`` of the main terms aggregation (and of the
                hostname aggregation when it is enabled).
            sub_bucket_size: ``size`` of the nested sub aggregation.
            threshold: When truthy, adds a ``cardinality`` aggregation on
                ``@timestamp`` named ``<event_data_name>_counts`` plus a
                ``bucket_selector`` pipeline that keeps buckets whose count
                is greater than ``threshold`` (formatted with ``%d``).
            filter_event_data_name: When truthy, adds a ``match`` clause on
                ``winlog.event_data.<filter_event_data_name>``.
            filter_event_data_value: Value used by that ``match`` clause.
            aggregate_by_hostname: When true, the event-data aggregation is
                nested inside a terms aggregation on ``agent.hostname``;
                both are registered under ``event_data_name``.

        Returns:
            ``response.aggregations[event_data_name]`` -- the outermost
            aggregation registered under that name.
        """
        es_query = self.get_default_query()

        if event_id is not None:
            es_query.append({'match': {'winlog.event_id': event_id}})

        if filter_event_data_name:
            filter_field_name = 'winlog.event_data.' + filter_event_data_name
            es_query.append(
                {'match': {
                    filter_field_name: filter_event_data_value
                }})

        query = Q({'bool': {'must': es_query}})

        s = Search(using=self.Client, index="winlogbeat-*").query(query)
        if self.DTRange is not None:
            s = s.filter('range', **self.DTRange)

        # Search.source() returns a modified copy instead of mutating the
        # search in place, so the result has to be re-bound for the
        # ``_source`` filter to take effect.
        s = s.source(includes=['winlog.*'])

        if aggregate_by_hostname:
            b = s.aggs.bucket(event_data_name,
                              'terms',
                              field='agent.hostname',
                              size=bucket_size)
        else:
            b = s.aggs

        b = b.bucket(event_data_name,
                     'terms',
                     field='winlog.event_data.' + event_data_name,
                     size=bucket_size)
        if threshold:
            # https://github.com/ongr-io/ElasticsearchDSL/blob/master/docs/Aggregation/Pipeline/BucketSelector.md
            # https://elasticsearch-dsl.readthedocs.io/en/latest/search_dsl.html
            threshold_bucket_name = event_data_name + "_counts"
            b.bucket(threshold_bucket_name, 'cardinality', field='@timestamp')
            b.pipeline('threshold_bucket_selector',
                       'bucket_selector',
                       buckets_path={"counts": threshold_bucket_name},
                       script='params.counts > %d' % threshold)

        if sub_event_data_name:
            b.bucket(sub_event_data_name,
                     'terms',
                     field='winlog.event_data.' + sub_event_data_name,
                     size=sub_bucket_size)

        if self.DebugQuery:
            pprint.pprint(s.to_dict())

        # A single execute() is sufficient.  ``self.Scan`` is deliberately
        # not consulted here: creating a scan() generator without iterating
        # it issues no request, and this method only returns aggregations
        # from the executed response.
        response = s.execute()

        return response.aggregations[event_data_name]