Example No. 1
    def test_user_org_filter_custom_user(self):
        """
        Test that user_organization_filtering returns a filtered search
        when given a user
        """
        org0 = Organization(name='testName', filter='testFilter')
        org0.save()
        org1 = Organization(name='otherTestName', filter='otherTestFilter')
        org1.save()
        user = XDSUser.objects.create_user('*****@*****.**',
                                           'test1234',
                                           first_name='Jane',
                                           last_name='doe')
        user.organizations.add(org0)
        user.organizations.add(org1)

        query = XSEQueries('test', 'test', user=user)

        expected_search = Search(using='default',
                                 index='test').\
            query(Q("match", filter=org0.filter) |
                  Q("match", filter=org1.filter))
        query.user_organization_filtering()
        result = query.search

        self.assertIn(expected_search.to_dict()['query']['bool']['should'][0],
                      result.to_dict()['query']['bool']['should'])
        self.assertIn(expected_search.to_dict()['query']['bool']['should'][1],
                      result.to_dict()['query']['bool']['should'])
Example No. 2
    def query(self):
        """
        Method to query Elasticsearch cluster for EfficiencyReport information

        :return elasticsearch_dsl.Search: Search object containing ES query
        """
        wildcardProbeNameq = 'condor:fifebatch?.fnal.gov'

        starttimeq = self.start_time.isoformat()
        endtimeq = self.end_time.isoformat()

        s = Search(using=self.client, index=self.indexpattern) \
            .filter("wildcard", ProbeName=wildcardProbeNameq) \
            .filter("range", EndTime={"gte": starttimeq, "lt": endtimeq})[0:0]

        # Aggregations

        Buckets = s.aggs.bucket('group_status', 'filters', filters={
            'Success': {'bool': {'must': {'term': {'Resource_ExitCode': 0}}}},
            'Failure': {
                'bool': {'must_not': {'term': {'Resource_ExitCode': 0}}}}}) \
            .bucket('group_VO', 'terms', field='VOName', size=2**31-1) \
            .bucket('group_CommonName', 'terms', field='CommonName',
                    size=2**31-1)

        # Metrics
        Buckets.metric('numJobs', 'sum', field='Count')\
            .metric('WallHours', 'sum', field='CoreHours')

        if self.verbose:
            print(s.to_dict())

        return s
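        # Hypothetical follow-up (not part of the original snippet; assumes the report object is configured):
        # response = s.execute()
        # for vo in response.aggregations.group_status.buckets.Success.group_VO.buckets:
        #     print(vo.key, vo.numJobs.value, vo.WallHours.value)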
Example No. 3
    def query(self, client):
        """Query method to grab wasted hours, return query object"""

        wildcardProbeNameq = 'condor:fifebatch?.fnal.gov'

        starttimeq = self.dateparse(self.start_time)
        endtimeq = self.dateparse(self.end_time)

        s = Search(using=client, index=indexpattern_generate(self.start_time, self.end_time))\
            .query("wildcard", ProbeName=wildcardProbeNameq)\
            .filter("range", EndTime={"gte": starttimeq, "lt": endtimeq})

        # Aggregations
        a1 = A('filters', filters={
            'Success': {'bool': {'must': {'term': {'Resource_ExitCode': 0}}}},
            'Failure': {'bool': {'must_not': {'term': {'Resource_ExitCode': 0}}}}})
        a2 = A('terms', field='VOName')
        a3 = A('terms', field='CommonName')

        Buckets = s.aggs.bucket('group_status', a1)\
                .bucket('group_VO', a2)\
                .bucket('group_CommonName', a3)

        # Metrics
        # FIGURE OUT HOW TO TOTAL JOBS
        Metric = Buckets.metric('numJobs', 'value_count', field='GlobalJobId')\
            .metric('WallHours', 'sum', script="(doc['WallDuration'].value*doc['Processors'].value/3600)")

        if self.verbose:
            print(s.to_dict())

        return s
    def query(self, client):
        """Method that actually queries elasticsearch"""
        # Set up our search parameters
        voq = self.config.get("query", "{}_voname".format(self.vo.lower()))
        productioncheck = '*Role=Production*'

        start_date = self.datesplit_pattern.split(self.start_time)
        starttimeq = datetime(*[int(elt) for elt in start_date]).isoformat()

        end_date = self.datesplit_pattern.split(self.end_time)
        endtimeq = datetime(*[int(elt) for elt in end_date]).isoformat()

        # Generate the index pattern based on the start and end dates
        indexpattern = indexpattern_generate(start_date, end_date)

        if self.verbose:
            print(indexpattern)
            sleep(3)

        # Elasticsearch query
        resultset = Search(using=client, index=indexpattern) \
            .query("wildcard", VOName=productioncheck) \
            .filter(Q({"term": {"VOName": voq}})) \
            .filter("range", EndTime={"gte": starttimeq, "lt": endtimeq}) \
            .filter(Q({"term": {"ResourceType": "Payload"}}))

        if self.verbose:
            print(resultset.to_dict())

        return resultset
    def query(self):
        """
        Method to query Elasticsearch cluster for EfficiencyReport information

        :return elasticsearch_dsl.Search: Search object containing ES query
        """
        # Set up our search parameters
        voq = self.config.get(self.vo.lower(), "voname")
        productioncheck = '*Role=Production*'

        starttimeq = self.start_time.isoformat()
        endtimeq = self.end_time.isoformat()

        self.logger.info(self.indexpattern)
        if self.verbose:
            sleep(3)

        # Elasticsearch query
        s = Search(using=self.client, index=self.indexpattern) \
            .filter("range", EndTime={"gte": starttimeq, "lt": endtimeq}) \
            .filter("term", ResourceType="Payload")

        if self.vo.lower() in re.split(',',
                                       self.config.get('noproduction',
                                                       'list')):
            s = s.filter("wildcard", VOName=voq)
        else:
            s = s.filter("wildcard", VOName=productioncheck)\
                .filter("term", VOName=voq)

        if self.verbose:
            print(s.to_dict())

        return s
Example No. 6
async def get_lists(database: plugins.configuration.DBConfig) -> dict:
    """

    :param database: a Pony Mail database configuration
    :return: A dictionary of all mailing lists found, and whether they are considered
             public or private
    """
    lists = {}
    db = plugins.database.Database(database)
    limit = database.max_lists

    # Fetch aggregations of all private emails
    # Do this first, so mixed lists are not marked private
    s = Search(using=db.client, index=db.dbs.db_mbox).filter("term",
                                                             private=True)
    s.aggs.bucket("per_list", "terms", field="list_raw", size=limit)

    res = await db.search(index=db.dbs.db_mbox, body=s.to_dict(), size=0)

    for ml in res["aggregations"]["per_list"]["buckets"]:
        list_name = ml["key"].strip("<>").replace(".", "@", 1)
        lists[list_name] = {
            "count": 0,  # Sorting later
            "private": True,
        }

    # Fetch aggregations of all public emails
    s = Search(using=db.client, index=db.dbs.db_mbox).filter("term",
                                                             private=False)
    s.aggs.bucket("per_list", "terms", field="list_raw", size=limit)

    res = await db.search(index=db.dbs.db_mbox, body=s.to_dict(), size=0)

    for ml in res["aggregations"]["per_list"]["buckets"]:
        list_name = ml["key"].strip("<>").replace(".", "@", 1)
        lists[list_name] = {
            "count": 0,  # We'll sort this later
            "private": False,
        }

    # Get 90 day activity, if any
    s = Search(using=db.client, index=db.dbs.db_mbox)
    s = s.filter('range', date={'gte': ACTIVITY_TIMESPAN})
    s.aggs.bucket("per_list", "terms", field="list_raw", size=limit)

    res = await db.search(index=db.dbs.db_mbox, body=s.to_dict(), size=0)

    for ml in res["aggregations"]["per_list"]["buckets"]:
        list_name = ml["key"].strip("<>").replace(".", "@", 1)
        if list_name in lists:
            lists[list_name]["count"] = ml["doc_count"]

    await db.client.close()

    return lists
Example No. 7
def test_query_combination():
    q = Q("match", title='python') | Q("match", title='django')
    s = Search().query(q)
    print(s.to_dict())

    q = Q("match", title='python') & Q("match", title='django')
    s = Search().query(q)
    print(s.to_dict())

    q = ~Q("match", title="python")
    s = Search().query(q)
    print(s.to_dict())
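    # Roughly the expected request bodies (elasticsearch_dsl folds combined queries into a bool query):
    #   |  -> {'query': {'bool': {'should': [...]}}}
    #   &  -> {'query': {'bool': {'must': [...]}}}
    #   ~  -> {'query': {'bool': {'must_not': [...]}}}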
Example No. 8
async def get_lists(database: plugins.configuration.DBConfig) -> dict:
    """

    :param database: a Pony Mail database configuration
    :return: A dictionary of all mailing lists found, and whether they are considered
             public or private
    """
    lists = {}
    client = AsyncElasticsearch([
        {
            "host": database.hostname,
            "port": database.port,
            "url_prefix": database.url_prefix or "",
            "use_ssl": database.secure,
        },
    ])

    # Fetch aggregations of all public emails
    s = Search(using=client,
               index=database.db_prefix + "-mbox").query("match",
                                                         private=False)
    s.aggs.bucket("per_list", "terms", field="list_raw")

    res = await client.search(index=database.db_prefix + "-mbox",
                              body=s.to_dict(),
                              size=0)

    for ml in res["aggregations"]["per_list"]["buckets"]:
        list_name = ml["key"].strip("<>").replace(".", "@", 1)
        lists[list_name] = {
            "count": ml["doc_count"],
            "private": False,
        }

    # Ditto, for private emails
    s = Search(using=client,
               index=database.db_prefix + "-mbox").query("match", private=True)
    s.aggs.bucket("per_list", "terms", field="list_raw")

    res = await client.search(index=database.db_prefix + "-mbox",
                              body=s.to_dict(),
                              size=0)

    for ml in res["aggregations"]["per_list"]["buckets"]:
        list_name = ml["key"].strip("<>").replace(".", "@", 1)
        lists[list_name] = {
            "count": ml["doc_count"],
            "private": True,
        }
    await client.close()

    return lists
Example No. 9
def test_filters():
    s = Search()
    s = s.filter('terms', tags=['search', 'python'])
    print(s.to_dict())
    # {'query': {'bool': {'filter': [{'terms': {'tags': ['search', 'python']}}]}}}

    s = s.query('bool', filter=[Q('terms', tags=['search', 'python'])])
    print(s.to_dict())
    # {'query': {'bool': {'filter': [{'terms': {'tags': ['search', 'python']}}]}}}

    s = s.exclude('terms', tags=['search', 'python'])
    # Alternatively
    # s = s.query('bool', filter=[~Q('terms', tags=['search', 'python'])])
    print(s.to_dict())
Example No. 10
def test_sorting():
    s = Search().sort(
        'category',
        '-title',
        {"lines": {"order": "asc", "mode": "avg"}}
    )
    print(s.to_dict())
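    # Roughly the expected body: {'sort': ['category',
    #                                      {'title': {'order': 'desc'}},
    #                                      {'lines': {'order': 'asc', 'mode': 'avg'}}]}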
Example No. 11
def _execute(search: Search) -> dict:
    if log.isEnabledFor(logging.DEBUG):
        log.debug(json.dumps(search.to_dict(), indent=4))
    resp = search.execute()
    if log.isEnabledFor(logging.DEBUG):
        log.debug(json.dumps(resp.to_dict(), indent=4))
    return resp.to_dict()
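# Hypothetical usage (index and field names are illustrative, not from the original):
#   body = _execute(Search(index="articles").query("match", title="python"))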
Example No. 12
    def do_search(self, search_params):
        """ Do the actual search, using the search params we've been passed

        Params: 
            search_params(dict):
            'ingredients': str of space delimited keywords
        """

        # Prepare the required queries to be joined together with boolean operators in search ( &, | )
        q_ingredients = Q("match", ingredients=search_params['ingredients'])
        # Leave out name for now to keep this simple
        # q_name = Q("match", name=search_params['ingredients'])  # 'name' will add to score but is not essential

        # Prepare the search, using the prepared queries
        es_search = Search(index=settings.SEARCH_SERVICE['ES_INDEX']).using(self.client).query(q_ingredients)
        # Max number of results, from settings
        es_search = es_search[:settings.SEARCH_SERVICE['ES_MAX_RESULTS']]

        # Log the query_params and JSON query used
        logger.debug(json.dumps(search_params))
        logger.debug(json.dumps(es_search.to_dict()))

        es_search.execute()
        results = get_recipes_from_search(es_search)
        return (results)
Example No. 13
    def query_datasets(self, index, offset, page_size):
        """Return list of datasets:
        {
          "query": {
            "match_all": {}
          },
          "aggs": {
            "datasets": {
              "terms": {
                "field": "dataset",
                "size": 0
              }
            }
          },
          "size": 0
        }
        """

        s = Search(using=self.client, index=index).extra(size=0)
        a = A('terms', field='dataset.keyword', size=MAX_SIZE)
        s.aggs.bucket('datasets', a)

        if self.logger:
            self.logger.debug(s.to_dict())

        datasets = [
            i['key']
            for i in s.execute().aggregations.to_dict()['datasets']['buckets']
        ]
        return len(datasets), datasets[offset:offset + page_size]
Example No. 14
    def query_types_by_dataset(self, index, dataset, offset, page_size):
        """Return list of types by dataset:
        {
          "query": {
            "term": {
              "dataset.keyword": "area_of_interest"
            }
          },
          "aggs": {
            "types": {
              "terms": {
                "field": "dataset_type.keyword",
                "size": 0
              }
            }
          },
          "size": 0
        }
        """

        s = Search(using=self.client, index=index).extra(size=0)
        q = Q('term', dataset__keyword=dataset)
        a = A('terms', field='dataset_type.keyword', size=MAX_SIZE)
        s = s.query(q)
        s.aggs.bucket('types', a)

        if self.logger:
            self.logger.debug(s.to_dict())

        types = [
            i['key']
            for i in s.execute().aggregations.to_dict()['types']['buckets']
        ]
        return len(types), types[offset:offset + page_size]
Example No. 15
    def get_permission(self, user_id, file_ids):
        query_conditions = query.Bool(must=[
            query.Terms(file_id=file_ids),
            query.Bool(should=[
                query.Term(owner={
                    'value': user_id,
                    'boost': 100
                }),
                query.Bool(must=[
                    query.Term(share_mode={
                        'value': 1,
                        'boost': 5
                    }),
                    query.Term(users_shared={
                        'value': user_id,
                        'boost': 5
                    })
                ]),
                query.Term(share_mode=2)
            ])
        ])
        file_es = Search() \
            .query(query_conditions) \
            .source(['owner', 'share_mode', 'editable'])
        file_es = file_es[0:1]
        print(json.dumps(file_es.to_dict()))
        responses = file_es.using(self.es).index(self._index).execute()
        return responses
Example No. 16
    def filter(cls, user_id, params, order=None, limit=None, offset=0):
        """Filter indexed objects using a query string.

        :param user_id: user identifier
        :type user_id: str
        :param params: parameters to add in query string, will be
                       form of name:value
        :type params: dict
        :param limit: restrict result to this limit
        :type limit: int
        :param offset: start result list from this offset
        :type offset: int

        :return list
        """
        # XXX well I know this is bad, security must be considered strongly
        values = []
        for k, v in params.items():
            values.append('%s:%s' % (k, v))
        q_str = ' AND '.join(values)

        client = cls.client()
        s = Search(using=client, index=user_id, doc_type=cls.doc_type). \
            query("query_string", query=q_str)
        if limit or offset:
            s = s[offset:(offset + limit)]
        log.debug("Filter index %s %s with : %s" %
                  (user_id, cls.doc_type, s.to_dict()))
        res = s.execute()
        return cls._format_list_result(res)
Example No. 17
    def search(self, criteria, key_list=None):
        """
            Builds ElasticSearch query.

            Args:
                criteria(schemas/search-layer-criteria.json): Criteria to use to initiate search.
                key_list(list): List of keys to receive back from a search.

            Returns:
                dict: each element in the outer dict represents a search "hit"
                      with the returned keys specified in key_list.
        """
        query = self._build_query(criteria.get("search"))
        query = Search(using=self.connection).index(
            self.index).sort("_uid").query(query)
        # Using Python slicing on a query is the same as using {from: 0, size: 50} in an elasticsearch query
        # the upper_limit is gathered from the elasticsearch config
        query = query[0:self.upper_limit]
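        # e.g. query[0:50] would add {"from": 0, "size": 50} to the request body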
        self.search_container.logger.debug(
            "Executing the following search query: {0}".format(
                query.to_dict()))
        search_results = query.execute()
        search_formatter = SearchFormatter(criteria, search_results, key_list)
        formatted_results = search_formatter.get_formatted_results()

        return formatted_results
Example No. 18
    async def elastic_filter(
        cls, *, query: str, offset: int, limit: int
    ) -> List[Union[SearchModel, Dict[str, Any]]]:
        """
        Filter existing models in ElasticSearch by string query.
        This function uses __es_search_fields as fields for phrase_prefix query.

        >>> await Model.elastic_filter(query="La")
        """
        elastic_query = Search()
        if query:
            elastic_query = elastic_query.query(
                MultiMatch(
                    type="phrase_prefix", query=query, fields=cls.__es_search_fields
                )
            )
        elastic_query = elastic_query[offset : offset + limit]
        search_res = await elastic_client.search(elastic_query.to_dict())
        hits = search_res.get("hits", {}).get("hits", [])
        results = []
        constructor = cls.__es_search_type or dict
        for hit in hits:
            logger.debug(hit)
            results.append(constructor(id=hit.get("_id"), **hit.get("_source", {})))
        return results
Example No. 19
def getLastReported(client, endtime=None):
    if endtime is None:
        endtime = datetime.datetime.now()
    s = Search(using=client, index="htcondor-xfer-stats2-*")
    starttime = datetime.datetime.now() - datetime.timedelta(days=365)
    s = s.filter('range', **{'@timestamp': {'gte': starttime, 'lt': endtime}})
    bkt = s.aggs
    bkt = bkt.bucket('hosts', 'terms', size=MAXSZ, field='host.name.keyword')
    bkt = bkt.bucket('max_time', 'max', field='CreateDate')

    print(s.to_dict())

    response = s.execute()
    hosts = {}
    for tag in response.aggregations.hosts:
        if tag.max_time.value is None:
            continue
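        # max_time.value is an epoch timestamp in milliseconds; convert to seconds for fromtimestamp()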
        last_seen = datetime.datetime.fromtimestamp(tag.max_time.value / 1000)
        # Discount hosts seen in the last week
        if last_seen > datetime.datetime.now() - datetime.timedelta(days=7):
            continue
        hosts[tag.key] = {
            'max_time': tag.max_time.value,
            'max_time_str': last_seen.strftime('%Y-%m-%d %H:%M:%S')
        }

    return hosts
Example No. 20
    def _filter(self, req=None, data=None):
        req = req or RequestFactory().get('/', data=data or {})
        queryset = Search()
        for filter_class in self.filter_classes:
            queryset = filter_class().filter_queryset(req, queryset,
                                                      self.view_class)
        return queryset.to_dict()
    def query(self):
        """
        Method to query Elasticsearch cluster for EfficiencyReport information

        :return elasticsearch_dsl.Search: Search object containing ES query
        """
        # Gather parameters, format them for the query
        starttimeq = self.start_time.isoformat()
        endtimeq = self.end_time.isoformat()

        probelist = self.config[
            self.report_type.lower()]['OSG_flocking_probe_list']

        if self.verbose:
            self.logger.info(self.indexpattern)
            self.logger.info(probelist)

        # Elasticsearch query and aggregations
        s = Search(using=self.client, index=self.indexpattern) \
                .filter("range", EndTime={"gte": starttimeq, "lt": endtimeq}) \
                .filter("terms", ProbeName=probelist) \
                .filter("term", ResourceType="Payload")[0:0]

        # Size 0 to return only aggregations

        Bucket = s.aggs.bucket('OIM_Facility',
                               'terms',
                               field='OIM_Facility',
                               size=MAXINT,
                               order={'CoreHours': 'desc'})

        Bucket.metric('CoreHours', 'sum', field='CoreHours')

        print(s.to_dict())
        return s
Example No. 22
    def filter(cls, user_id, params, order=None, limit=None, offset=0):
        """Filter indexed objects using a query string.

        :param user_id: user identifier
        :type user_id: str
        :param params: parameters to add in query string, will be
                       form of name:value
        :type params: dict
        :param limit: restrict result to this limit
        :type limit: int
        :param offset: start result list from this offset
        :type offset: int

        :return list
        """
        # XXX well I know this is bad, security must be considered strongly
        values = []
        for k, v in params.items():
            values.append('%s:%s' % (k, v))
        q_str = ' AND '.join(values)

        client = cls.client()
        s = Search(using=client, index=user_id, doc_type=cls.doc_type). \
            query("query_string", query=q_str)
        if limit or offset:
            s = s[offset:(offset + limit)]
        log.debug("Filter index %s %s with : %s" %
                  (user_id, cls.doc_type, s.to_dict()))
        res = s.execute()
        return cls._format_list_result(res)
Example No. 23
def search_query(queries, index):
    # TODO: create a base DocType class with method, get_index_by_name('dummy_movies')
    # TODO: create ability to search across indexes, using Search()

    s = Search(index=index)

    query_obj = {
        'must': [],
        'must_not': [],
        'filter': []
    }

    for query in queries:
        if query is not None:
            q = Q(query['query'])
            query_obj[query['query_type']].append(q)

    total_queries = Q('bool', **query_obj)
    s = s.query(total_queries)
    print('query --> ', s.to_dict())

    response = s

    print('count --> ', response.count())

    response_obj = {
        'hits': response.count(),
        'data': [h.to_dict() for h in response]
    }

    return response_obj
Example No. 24
def recommends(a):
    s = Search().query("more_like_this",
                       stop_words=MINUS_WORDS,
                       like={
                           "_id": a.pk,
                           "_index": "article-index",
                           "_type": "article_index"
                       },
                       fields=["authors^2", "cats", "title^2", "content"])

    search_body = {
        "query": {
            "function_score": {
                "query": s.to_dict()["query"],
                "functions": [AGEISM, RANDOMISE],
                "score_mode": "sum"
            }
        }
    }
    r = connections.get_connection().search(index="article-index",
                                            body=search_body)

    hits = r["hits"]["hits"][:settings.SUGGESTION_COUNT]

    return list(
        filter(lambda a: a is not None, [
            models.Article.nondraft.filter(pk=hit["_id"]).first()
            for hit in hits
        ]))
Example No. 25
    def search_buy(self, query, ptype, cond):
        s = Search(index='buy')

        if ptype != '':
            s = s.filter('match', ptype=self.pt_dict[ptype])
        if cond != '':
            s = s.filter('match', cond=self.c_dict[cond])

        # s = s.source(['hand_kw', 'jieba_kw', 'synonym', 'ptype', 'cond'])

        q = Q("bool",
              should=[
                  Q("terms", hand_kw=query),
                  Q("terms", jieba_kw=query),
                  Q("terms", synonym=query),
                  Q("match", raw=' '.join(query))
              ])

        # q = Q('multi_match', query=' '.join(query), fields=['raw']) | \
        #     Q("terms", hand_kw=query) | \
        #     Q("terms", jieba_kw=query) | \
        #     Q("terms", synonym=query)

        s = s.query(q)
        pprint(s.to_dict())
        r = s.execute()
        return r
Example No. 26
def index_single(es, network, channel, date, lines):
    # Delete existing
    delete_existing = Search(
        using=es,
        index='moffle',
    ).query(
        "term", network=network,
    ).query(
        "term", channel=channel,
    ).query(
        "term", date=date,
    )

    es.delete_by_query(
        index='moffle',
        body=delete_existing.to_dict(),
    )

    actions = [x for x in (line_to_index_action(network, channel, date, i, line) for i, line in lines) if x]
    while actions:
        retries = 0
        try:
            success_count, _ = bulk(es, actions)
            log("{}/{}/{}: indexed {} lines".format(network, channel, date, success_count))
            return success_count
        except Exception as e:
            retries += 1
            log("{}/{}/{}: Attempt {}/3: {}".format(network, channel, date, retries, e))
            if retries > 3:
                raise
    def search(self, doc_type, query=""):
        """
        Execute search query and retrive results

        :param doc_type: Type in ElasticSearch
        :param query: search query
        :return: list with results
        """
        results = []
        if type(query) in [str, unicode] and type(doc_type) == DocTypeMeta:
            q = Q("multi_match",
                  query=query.lower(),
                  fields=["title"])

            s = Search()
            s = s.using(self.client)
            s = s.index(self.index_name)
            s = s.doc_type(doc_type)
            s = s.query(q)
            print "search query: " + str(s.to_dict())

            response = s.execute()

            for resp in response:
                results.append(resp)
        return results
Example No. 28
    def process(self, start_time: datetime, end_time: datetime, input: DataFrame):
        logger.debug('Start: %s  End: %s  Log: index=%s fields=%s' % (start_time.isoformat(), end_time.isoformat(), str(self.indices), str(self.fields)))

        search = Search(using=self.client, index=self.indices[0])
        search = search.filter(Range(** {'@timestamp': {'gte': start_time.isoformat(), 'lte': end_time.isoformat()}}))

        for k,v in self.fields.items():
            if isinstance(v, list):
                for sv in v:
                    search = search.query("match", **{k:sv})

            else:
                search = search.query("match", **{k:v})

        logger.debug('ES Query: %s' % str(search.to_dict()))
        response = search.execute()

        logger.debug('Results: success:%d failed:%d hits:%d' % (response._shards.successful, response._shards.failed, len(response.hits)))

        for hit in response:
            # filter out the meta key and flatten the values
            row = {k: str(hit[k]) for k in hit if k != 'meta'}

            logger.debug(row)
            input = input.append(row, ignore_index=True)

        return input
Example No. 29
def catalog_search():

    params = app.current_request.query_params

    s = Search(using=client, index='imagery', doc_type="metadata")

    filter_count = 0

    max_results = 1000

    if ("st" in params):
        s = s.filter('range', date={'gte': params["st"]})
        filter_count += 1
    if ("et" in params):
        s = s.filter('range', date={'lte': params["et"]})
        filter_count += 1
    if ("wkt" in params):
        shape_filter = {"shape": wkt.loads(params["wkt"])}
        s = s.filter('geo_shape', bounds=shape_filter)
        filter_count += 1
    if ("debug" in params):
        return (s.to_dict())

    s = s[0:max_results]

    if (filter_count > 0):
        result = s.execute().to_dict()
        return ([hit["_source"] for hit in result["hits"]["hits"]])
    else:
        return {'Search Failed': 'No search parameters were recognized'}
Example No. 30
    def test_add_filter_no_param(self):
        q = FieldSearchQuery(args={})
        search = Search()
        prev_dict = search.to_dict()
        search = q.add_filters(search, 'units', 'units').to_dict()
        # Expected: the query is not modified when there are no parameters
        self.assertEqual(prev_dict, search)
Example No. 31
    def _filter(self, req=None, data=None):
        req = req or RequestFactory().get('/', data=data or {})
        queryset = Search()
        for filter_class in self.filter_classes:
            queryset = filter_class().filter_queryset(req, queryset,
                                                      self.view_class)
        return queryset.to_dict()
Example No. 32
        def _data(self, request, cleaned, *args, explain=None, **kwargs):
            search = Search(using=connection,
                            index=indicies,
                            extra={'size': 0})
            search.aggs.bucket('documents_by_type',
                               TermsFacet(field='_type').get_aggregation()) \
                .bucket('by_month',
                        DateHistogramFacet(field='created', interval='month', min_doc_count=0).get_aggregation())
            search.aggs.bucket(
                'datasets_by_institution',
                NestedFacet(
                    'institution',
                    TermsFacet(field='institution.id')).get_aggregation())

            search.aggs.bucket(
                'datasets_by_category',
                NestedFacet(
                    'category',
                    TermsFacet(field='category.id', min_doc_count=1,
                               size=50)).get_aggregation())
            search.aggs.bucket('datasets_by_tags',
                               TermsFacet(field='tags').get_aggregation())
            search.aggs.bucket('datasets_by_formats',
                               TermsFacet(field='formats').get_aggregation())
            search.aggs.bucket(
                'datasets_by_openness_scores',
                TermsFacet(field='openness_scores').get_aggregation())
            if explain == '1':
                return search.to_dict()
            try:
                return search.execute()
            except TransportError as err:
                raise falcon.HTTPBadRequest(
                    description=err.info['error']['reason'])
Example No. 33
  def searchTweets(keyword, latlondist):
    # Variables that contain the user credentials to access the Twitter API
    if TwitterHelper.AWS_ACCESS_KEY == None:
      raise KeyError("Please set the AWS_ACCESS_KEY env. variable")
    
    if TwitterHelper.AWS_SECRET_KEY == None:
      raise KeyError("Please set the AWS_SECRET_KEY env. variable")

    s = Search()
    if latlondist != None:
      locJson = json.loads(latlondist)
      s = s.query({"filtered" : {"query" : {"match_all" : {}}, "filter" : {"geo_distance" : {"distance" : locJson['dist'], "location" : {"lat" : locJson['lat'], "lon" : locJson['lon']}}}}})

    if keyword != None:
      q = Q("match_phrase", text = keyword)
      s = s.query(q)
    
    scanResp = None
    scanResp = helpers.scan(client = TwitterHelper.ES, query = s.to_dict(), scroll = "1m", index = "tweets", timeout = "1m")

    arr = []
    for resp in scanResp:
      hit = resp['_source']
      d = {}
      d['name'] = hit['name']
      d['text'] = hit['text']
      d['sentiment'] = hit['sentiment']
      d['lat'] = hit['location']['lat']
      d['lon'] = hit['location']['lon']
      arr.append(d)
    allD = {}
    allD['tweets'] = arr
    mapInput = json.dumps(allD)
    return mapInput
Example No. 34
def get_registered_datasender_count(dbm, questionnaire_name):
    es = Elasticsearch(hosts=[{"host": ELASTIC_SEARCH_HOST, "port": ELASTIC_SEARCH_PORT}])
    search = Search(using=es, index=dbm.database_name, doc_type='reporter')
    search = search.query("term", projects_value=lowercase_and_strip_accents(questionnaire_name))
    search = search.query("term", void=False)
    body = search.to_dict()
    return es.search(index=dbm.database_name, doc_type='reporter', body=body, search_type='count')['hits']['total']
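    # Note: search_type='count' was removed in later Elasticsearch releases; size=0 (or the count API) is the modern equivalent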
Example No. 35
    def search_my_data(self, username, q, offset, limit):

        split_query = q.split(" ")
        for i, c in enumerate(split_query):
            if c.upper() not in ["AND", "OR", "NOT"]:
                split_query[i] = "*" + c + "*"

        q = " ".join(split_query)

        search = Search(index='des-files')
        search = search.filter("nested",
                               path="permissions",
                               query=Q("term", permissions__username=username))
        search = search.query("query_string",
                              query=q,
                              fields=["name", "name._exact", "keywords"])
        search = search.query(
            Q('bool', must=[Q({'prefix': {
                'path._exact': username
            }})]))
        search = search.filter("term", system='designsafe.storage.default')
        search = search.query(
            Q('bool',
              must_not=[
                  Q({'prefix': {
                      'path._exact': '{}/.Trash'.format(username)
                  }})
              ]))
        search = search.extra(from_=offset, size=limit)
        logger.info(search.to_dict())
        return search
Example No. 36
    def get_update_list_single_process(self):
        """ Find units that needs updating and their sidstopdateret (last updated)
        the sidstopdateret may be inaccurate and thus way to far back in time therefore we cannot use take the largest
        of sidstopdateret from the database. Seems we download like 600 dicts a second with match_all.
        Should take around 2 hours and 30 minuttes then. This takes 30 so i need to save half an hour on downloads.

        :return datetime (min sidstopdateret), list (enhedsnumer, sidstopdateret)
        """
        enh_samtid_map = self.make_samtid_dict()
        oldest_sidstopdateret = datetime.datetime.utcnow().replace(
            tzinfo=pytz.utc) + datetime.timedelta(days=1)
        update_dicts = {
            x: {
                'units': [],
                'sidstopdateret': oldest_sidstopdateret
            }
            for x in self.source_keymap.values()
        }
        if len(enh_samtid_map) == 0:
            return update_dicts
        dummy = CvrConnection.update_info(samtid=-1,
                                          sidstopdateret=self.dummy_date)
        print('Get update time for all data')

        for _type in self.source_keymap.values():
            search = Search(using=self.elastic_client, index=self.index)
            search = search.query('match_all')
            sidst_key = '{0}.sidstOpdateret'.format(_type)
            samt_key = '{0}.samtId'.format(_type)
            field_list = ['_id', sidst_key, samt_key]
            # field_list = ['_id'] + ['{0}.sidstOpdateret'.format(key) for key in self.source_keymap.values()] + \
            #          ['{0}.samtId'.format(key) for key in self.source_keymap.values()]
            search = search.fields(fields=field_list)
            params = {'scroll': self.elastic_search_scroll_time, 'size': 2**12}
            search = search.params(**params)
            print('ElasticSearch Query: ', search.to_dict())
            generator = search.scan()
            for cvr_update in tqdm.tqdm(generator):
                enhedsnummer = int(cvr_update.meta.id)
                raw_dat = cvr_update.to_dict()
                samtid = raw_dat[samt_key][0] if samt_key in raw_dat else None
                sidstopdateret = raw_dat[sidst_key][
                    0] if sidst_key in raw_dat else None
                if sidstopdateret is None or samtid is None:
                    continue
                current_update = enh_samtid_map[
                    enhedsnummer] if enhedsnummer in enh_samtid_map else dummy
                if samtid > current_update.samtid:
                    utc_sidstopdateret = utc_transform(sidstopdateret)
                    update_dicts[_type]['sidstopdateret'] = min(
                        utc_sidstopdateret,
                        update_dicts[_type]['sidstopdateret'])
                    update_dicts[_type]['units'].append(
                        (enhedsnummer, utc_sidstopdateret))
                    # break
        print('Update Info: ')
        print([(k, v['sidstopdateret'], len(v['units']))
               for k, v in update_dicts.items()])
        return update_dicts
Example No. 37
def test_simple_search():
    s = Search().query("match", title="python")
    # {'query': {'match': {'title': 'python'}}}
    print(s.to_dict())
    response = s.execute()
    print(response)
    for hit in s:
        print(hit.title)
Example No. 38
async def get_lists(database: plugins.configuration.DBConfig) -> dict:
    """

    :param database: a Pony Mail database configuration
    :return: A dictionary of all mailing lists found, and whether they are considered
             public or private
    """
    lists = {}
    db = plugins.database.Database(database)
    limit = 8192

    # Fetch aggregations of all public emails
    s = Search(using=db.client,
               index=database.db_prefix + "-mbox").filter("term",
                                                          private=False)
    s.aggs.bucket("per_list", "terms", field="list_raw", size=limit)

    res = await db.search(index=database.db_prefix + "-mbox",
                          body=s.to_dict(),
                          size=0)

    for ml in res["aggregations"]["per_list"]["buckets"]:
        list_name = ml["key"].strip("<>").replace(".", "@", 1)
        lists[list_name] = {
            "count": ml["doc_count"],
            "private": False,
        }

    # Ditto, for private emails
    s = Search(using=db.client,
               index=database.db_prefix + "-mbox").filter("term", private=True)
    s.aggs.bucket("per_list", "terms", field="list_raw", size=limit)

    res = await db.search(index=database.db_prefix + "-mbox",
                          body=s.to_dict(),
                          size=0)

    for ml in res["aggregations"]["per_list"]["buckets"]:
        list_name = ml["key"].strip("<>").replace(".", "@", 1)
        lists[list_name] = {
            "count": ml["doc_count"],
            "private": True,
        }
    await db.client.close()

    return lists
Example No. 39
    def delete(self, region, date):
        index = app.config['ELASTICSEARCH_INDEX']
        doc_type = app.config['ELASTICSEARCH_TYPE']

        s = Search(using=self.es, index=app.config['ELASTICSEARCH_INDEX'], doc_type=doc_type) \
            .filter('term', region=region) \
            .filter('term', date=date)
        self.es.delete_by_query(index=index, doc_type=doc_type, body=s.to_dict())
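        # s.to_dict() yields {"query": {...}}, which matches the body shape delete_by_query expects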
Example No. 40
    def query(self):
        search_obj = Search()
        for f in self.filters:
            search_obj = search_obj.filter(f)

        for q in self.queries:
            search_obj = search_obj.query(q)

        return search_obj.to_dict()
Example No. 41
    def _from_uuid(cls, uuid):
        result = Search(using=es, index=CLUSTER_NAME).query('match', uuid=uuid).execute()[0]
        result = result.to_dict()

        r = cls()
        r.uuid = result.pop('uuid', None)
        r.epoch = result.pop('epoch', None)
        r.data = result

        return r
Example No. 42
def test_post_filter(app):
    """Test post filter."""
    urlargs = MultiDict()
    defs = dict(
        type=terms_filter('type'),
        subtype=terms_filter('subtype'),
    )

    with app.test_request_context('?type=test'):
        search = Search().query(Q(query='value'))
        search, args = _post_filter(search, urlargs, defs)
        assert 'post_filter' in search.to_dict()
        assert search.to_dict()['post_filter'] == dict(
            terms=dict(type=['test'])
        )
        assert args['type'] == 'test'

    with app.test_request_context('?anotertype=test'):
        search = Search().query(Q(query='value'))
        search, args = _post_filter(search, urlargs, defs)
        assert 'post_filter' not in search.to_dict()
Example No. 43
    def _queryElasticsearch(self, from_date, to_date, query):
        logging.debug("Connecting to ES")
        client = Elasticsearch()

        logging.debug("Beginning search")
        s = Search(using=client, index=self._config['ElasticSearch']['raw_index'])
        s = s.filter('range', **{'EndTime': {'from': from_date, 'to': to_date}})

        logging.debug("About to execute query:\n%s" % str(s.to_dict()))

        for hit in s.scan():
            yield hit
Example No. 44
def test_default_facets_factory(app):
    """Test aggregations."""
    defs = dict(
        aggs=dict(
            type=dict(
                terms=dict(field='upload_type'),
            ),
            subtype=dict(
                terms=dict(field='subtype'),
            )
        ),
        filters=dict(
            subtype=terms_filter('subtype'),
        ),
        post_filters=dict(
            type=terms_filter('type'),
        ),
    )
    app.config['RECORDS_REST_FACETS']['testidx'] = defs

    with app.test_request_context('?type=a&subtype=b'):
        search = Search().query(Q(query='value'))
        search, urlkwargs = default_facets_factory(search, 'testidx')
        assert search.to_dict()['aggs'] == defs['aggs']
        assert 'post_filter' in search.to_dict()
        assert search.to_dict(
            )['query']['bool']['filter'][0]['terms']['subtype']

        search = Search().query(Q(query='value'))
        search, urlkwargs = default_facets_factory(search, 'anotheridx')
        assert 'aggs' not in search.to_dict()
        assert 'post_filter' not in search.to_dict()
        assert 'bool' not in search.to_dict()['query']
Example No. 45
def test_query_filter(app):
    """Test post filter."""
    urlargs = MultiDict()
    defs = dict(
        type=terms_filter('type'),
        subtype=terms_filter('subtype'),
    )

    with app.test_request_context('?type=test'):
        search = Search().query(Q('multi_match', query='value'))
        body = search.to_dict()
        search, args = _query_filter(search, urlargs, defs)
        assert 'post_filter' not in search.to_dict()
        assert search.to_dict()['query']['bool']['must'][0] == body['query']
        assert search.to_dict()['query']['bool']['filter'] == [
            dict(terms=dict(type=['test']))
        ]
        assert args['type'] == 'test'

    with app.test_request_context('?anotertype=test'):
        search = Search().query(Q(query='value'))
        body = search.to_dict()
        query, args = _query_filter(search, urlargs, defs)
        assert query.to_dict() == body
Example No. 46
def index_single(es, network, channel, date, lines):
    log("Processing {}/{}/{}".format(network, channel, date))

    # Delete existing
    delete_existing = Search(
        using=es,
        index='moffle',
    ).query(
        "term", network=network,
    ).query(
        "term", channel=channel,
    ).query(
        "term", date=date,
    )

    es.delete_by_query(
        index='moffle',
        body=delete_existing.to_dict(),
    )

    actions = []
    for i, line in lines:
        m = LINE.match(line)
        if not m:
            # What happened here?
            continue

        fields = m.groupdict()
        fields['text'] = fields['text'].strip()
        fields['line_type'] = TYPE_MAP[fields['line_type']]

        fields.update({
            '_index': 'moffle',
            '_type': 'logline',
            'network': network,
            'channel': channel,
            'date': date,
            'line_no': i,
        })
        actions.append(fields)

    if actions:
        log(bulk(es, actions))
Example No. 47
    def build_search_query(params: dict):
        s = Search(using=client, index="logger")
        query_string = ""
        for key, value in params.items():
            if key == "remote_host" and value != "":
                query_string += "'remote_host': '" + value + "' AND "
            elif key == "application_name" and value != "":
                query_string += "'application_name': '" + value + "' AND "

        query_string += params['body']
        print(query_string)

        # Search.query() returns a new Search object, so the result must be reassigned
        s = s.query(Q("query_string", query=query_string))
        print(s.to_dict())
        result_list = list()
        try:
            response = s.execute()
            for h in response.to_dict()['hits']['hits']:
                result_list.append(h)
            return result_list
        except Exception as e:
            print(e)
Example No. 48
    def search(self, criteria, key_list=None):
        """
            Builds ElasticSearch query.

            Args:
                criteria(schemas/search-layer-criteria.json): Criteria to use to initiate search.
                key_list(list): List of keys to receive back from a search.

            Returns:
                dict: each element in the outer dict represents a search "hit"
                      with the returned keys specified in key_list.
        """
        query = self._build_query(criteria.get("search"))
        query = Search(using=self.connection).index(self.index).sort("_uid").query(query)
        # Using Python slicing on a query is the same as using {from: 0, size: 50} in an elasticsearch query
        # the upper_limit is gathered from the elasticsearch config
        query = query[0:self.upper_limit]
        self.search_container.logger.debug("Executing the following search query: {0}".format(query.to_dict()))
        search_results = query.execute()
        search_formatter = SearchFormatter(criteria, search_results, key_list)
        formatted_results = search_formatter.get_formatted_results()

        return formatted_results
Example No. 49
    def get(self, **kwargs):
        """Return a list of results and aggregations based on parameters.

        The list of accepted parameters (with types and default values) is in
        the database and can be accessed with the super_search_fields service.
        """
        # Require that the list of fields be passed.
        if not kwargs.get('_fields'):
            raise MissingArgumentError('_fields')
        self.all_fields = kwargs['_fields']
        self._build_fields()

        # Filter parameters and raise potential errors.
        params = self.get_parameters(**kwargs)

        # Find the indices to use to optimize the elasticsearch query.
        indices = self.get_indices(params['date'])

        # Create and configure the search object.
        search = Search(
            using=self.get_connection(),
            index=indices,
            doc_type=self.config.elasticsearch.elasticsearch_doctype,
        )

        # Create filters.
        filters = []
        histogram_intervals = {}

        for field, sub_params in params.items():
            sub_filters = None
            for param in sub_params:
                if param.name.startswith('_'):
                    # By default, all param values are turned into lists,
                    # even when they have and can have only one value.
                    # For those we know there can only be one value,
                    # so we just extract it from the made-up list.
                    if param.name == '_results_offset':
                        results_from = param.value[0]
                    elif param.name == '_results_number':
                        results_number = param.value[0]
                        if results_number > 1000:
                            raise BadArgumentError('_results_number too large')
                    elif param.name == '_facets_size':
                        facets_size = param.value[0]

                    for f in self.histogram_fields:
                        if param.name == '_histogram_interval.%s' % f:
                            histogram_intervals[f] = param.value[0]

                    # Don't use meta parameters in the query.
                    continue

                field_data = self.all_fields[param.name]

                name = '%s.%s' % (
                    field_data['namespace'],
                    field_data['in_database_name']
                )

                if param.data_type in ('date', 'datetime'):
                    param.value = datetimeutil.date_to_string(param.value)
                elif param.data_type == 'enum':
                    param.value = [x.lower() for x in param.value]
                elif param.data_type == 'str' and not param.operator:
                    param.value = [x.lower() for x in param.value]

                # Operators needing wildcards, and the associated value
                # transformation with said wildcards.
                operator_wildcards = {
                    '~': '*%s*',  # contains
                    '$': '%s*',  # starts with
                    '^': '*%s'  # ends with
                }
                # Operators needing ranges, and the associated Elasticsearch
                # comparison operator.
                operator_range = {
                    '>': 'gt',
                    '<': 'lt',
                    '>=': 'gte',
                    '<=': 'lte',
                }

                args = {}
                filter_type = 'term'
                filter_value = None

                if not param.operator:
                    # contains one of the terms
                    if len(param.value) == 1:
                        val = param.value[0]

                        if not isinstance(val, basestring) or ' ' not in val:
                            # There's only one term and no white space, this
                            # is a simple term filter.
                            filter_value = val
                        else:
                            # If the term contains white spaces, we want to
                            # perform a phrase query.
                            filter_type = 'query'
                            args = Q(
                                'simple_query_string',
                                query=param.value[0],
                                fields=[name],
                                default_operator='and',
                            ).to_dict()
                    else:
                        # There are several terms, this is a terms filter.
                        filter_type = 'terms'
                        filter_value = param.value
                elif param.operator == '=':
                    # is exactly
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator in operator_range:
                    filter_type = 'range'
                    filter_value = {
                        operator_range[param.operator]: param.value
                    }
                elif param.operator == '__null__':
                    filter_type = 'missing'
                    args['field'] = name
                elif param.operator in operator_wildcards:
                    filter_type = 'query'

                    # Wildcard operations are better applied to a non-analyzed
                    # field (called "full") if there is one.
                    if field_data['has_full_version']:
                        name = '%s.full' % name

                    q_args = {}
                    q_args[name] = (
                        operator_wildcards[param.operator] % param.value
                    )
                    query = Q('wildcard', **q_args)
                    args = query.to_dict()

                if filter_value is not None:
                    args[name] = filter_value

                if args:
                    new_filter = F(filter_type, **args)
                    if param.operator_not:
                        new_filter = ~new_filter

                    if sub_filters is None:
                        sub_filters = new_filter
                    elif filter_type == 'range':
                        sub_filters &= new_filter
                    else:
                        sub_filters |= new_filter

                    continue

            if sub_filters is not None:
                filters.append(sub_filters)

        search = search.filter(F('bool', must=filters))

        # Restricting returned fields.
        fields = []
        for param in params['_columns']:
            for value in param.value:
                if not value:
                    continue

                field_name = self.get_field_name(value, full=False)
                fields.append(field_name)

        search = search.fields(fields)

        # Sorting.
        sort_fields = []
        for param in params['_sort']:
            for value in param.value:
                if not value:
                    continue

                # Values starting with a '-' are sorted in descending order.
                # In order to retrieve the database name of the field, we
                # must first remove the '-' part and add it back later.
                # Example: given ['product', '-version'], the results will be
                # sorted by ascending product and descending version.
                desc = False
                if value.startswith('-'):
                    desc = True
                    value = value[1:]

                field_name = self.get_field_name(value, full=False)

                if desc:
                    # The underlying library understands that '-' means
                    # sorting in descending order.
                    field_name = '-' + field_name

                sort_fields.append(field_name)

        search = search.sort(*sort_fields)

        # Pagination.
        results_to = results_from + results_number
        search = search[results_from:results_to]

        # Create facets.
        for param in params['_facets']:
            for value in param.value:
                if not value:
                    continue

                field_name = self.get_field_name(value)
                search.aggs.bucket(
                    value,
                    'terms',
                    field=field_name,
                    size=facets_size,
                )

        # Create signature aggregations.
        if params.get('_aggs.signature'):
            sig_bucket = A(
                'terms',
                field=self.get_field_name('signature'),
                size=facets_size,
            )
            for param in params['_aggs.signature']:
                for value in param.value:
                    if not value:
                        continue

                    if value.startswith('_histogram.'):
                        # This is a histogram aggregation we want to run,
                        # not a terms aggregation.
                        field_name = value[len('_histogram.'):]
                        if field_name not in self.histogram_fields:
                            continue

                        histogram_type = (
                            self.all_fields[field_name]['query_type'] == 'date'
                            and 'date_histogram' or 'histogram'
                        )
                        sig_bucket.bucket(
                            'histogram_%s' % field_name,
                            histogram_type,
                            field=self.get_field_name(field_name),
                            interval=histogram_intervals[field_name],
                        )
                    else:
                        sig_bucket.bucket(
                            value,
                            'terms',
                            field=self.get_field_name(value),
                            size=facets_size,
                        )

            search.aggs.bucket('signature', sig_bucket)

        # Create histograms.
        for f in self.histogram_fields:
            if params.get('_histogram.%s' % f):
                histogram_type = (
                    self.all_fields[f]['query_type'] == 'date'
                    and 'date_histogram' or 'histogram'
                )
                date_bucket = A(
                    histogram_type,
                    field=self.get_field_name(f),
                    interval=histogram_intervals[f],
                )
                for param in params['_histogram.%s' % f]:
                    for value in param.value:
                        if not value:
                            continue

                        field_name = self.get_field_name(value)
                        val_bucket = A(
                            'terms',
                            field=field_name,
                            size=facets_size,
                        )
                        date_bucket.bucket(value, val_bucket)

                search.aggs.bucket('histogram_%s' % f, date_bucket)

        # Query and compute results.
        hits = []

        if params['_return_query'][0].value[0]:
            # Return only the JSON query that would be sent to elasticsearch.
            return {
                'query': search.to_dict(),
                'indices': indices,
            }

        # We call elasticsearch with a computed list of indices, based on
        # the date range. However, if that list contains indices that do not
        # exist in elasticsearch, an error will be raised. We thus want to
        # remove all failing indices until we either have a valid list, or
        # an empty list in which case we return no result.
        while True:
            try:
                results = search.execute()
                for hit in results:
                    hits.append(self.format_fields(hit.to_dict()))

                total = search.count()
                aggregations = self.format_aggregations(results.aggregations)
                break  # Yay! Results!
            except NotFoundError, e:
                missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
                if missing_index in indices:
                    del indices[indices.index(missing_index)]
                else:
                    # Wait what? An error caused by an index that was not
                    # in the request? That should never happen, but in case
                    # it does, better know it.
                    raise

                if indices:
                    # Update the list of indices and try again.
                    # Note: we need to first empty the list of indices before
                    # updating it, otherwise the removed indices never get
                    # actually removed.
                    search = search.index().index(*indices)
                else:
                    # There is no index left in the list, return an empty
                    # result.
                    hits = []
                    total = 0
                    aggregations = {}
                    break
    def generate_where(self, query, where, is_root=False):

        where_clauses = where["clauses"]
        source_fields = set()

        musts = []
        shoulds = []
        filters = []
        must_nots = []

        sub_queries = []
        
        shoulds_by_predicate = {}
        unbound_subquery_variables = set()

        for clause in where_clauses:
            if "operator" in clause:
                continue
            if "fields" in clause:
                fields = clause["fields"]
                source_fields |= set([field["name"] for field in fields])
            if("constraint" in clause):
                es_clause = self.translate_clause_helper(clause, fields, True)
            elif "clauses" in clause:
                sub_query = self.generate_where(query, clause, False)
                sub_query["clause_fields"] = []
                # if sub_query contains variable of parent query
                #  create clause that filters on variable of parent query
                contains_parent_variable = False
                for c in clause["clauses"]:
                    if "variable" in c:
                        if c["variable"] == where["variable"]:
                            contains_parent_variable = True
                        else:
                            unbound_subquery_variables.add(c["variable"])
                            for f in c["fields"]:
                                if not f["name"].startswith("content") and not f["name"] == "raw_content":
                                    sub_query["clause_fields"].append({"name": f["name"], 
                                                                   "variable": c["variable"]})

                if contains_parent_variable:
                    sub_query_clause = {}
                    sub_query_clause["constraint"] = "__placeholder__"
                    sub_query_clause["isOptional"] = False
                    sub_query_clause["fields"] = where["fields"]
                    source_fields |= set([field["name"] for field in where["fields"]])
                    sub_query_clause["_id"] = clause["_id"]
                    es_clause = self.translate_clause_helper(sub_query_clause,
                                                             where["fields"],
                                                             True)
                    sub_query["clause_fields"] = where["fields"]
                    sub_query["clause_id"] = clause["_id"]
                else:
                    es_clause = None

                #sub_query["clause_fields"] = where["fields"]
                
                sub_queries.append(sub_query)
                # else 
                #   create clause that's constrained on variable of clause
                #clause["constraint"] = "__placeholder__"
                #es_clause = self.translate_clause_helper(clause, fields, True)
                #sub_query["clause_name"] = es_clause["_name"]
            else:
                # this is a clause we need an answer for
                if not is_root or "filter_for_fields_of_unbound_variables" \
                   not in self.elasticsearch_compiler_options or \
                    self.elasticsearch_compiler_options["filter_for_fields_of_unbound_variables"]:
                    es_clause = self.translate_clause_helper(clause, fields, False)
                else:
                    es_clause = None
            if es_clause:
                if clause.get("isOptional", False):
                    predicate = clause.get("predicate")
                    if predicate not in shoulds_by_predicate:
                        shoulds_by_predicate[predicate] = list()
                    shoulds_by_predicate.get(predicate).append(es_clause)
                else:
                    musts.append(es_clause)

        if unbound_subquery_variables:
            sub_query = sub_queries[-1]
            sub_query["variable_to_clause_id"] = {}
            for clause in where_clauses:
                if "operator" in clause:
                    if "union" == clause["operator"].lower():
                        union_shoulds = []
                        for uc in clause["clauses"]:
                            if "variable" in uc and uc["variable"] in unbound_subquery_variables:
                                uc["constraint"] = "__placeholder__"
                            uc_es_clause = self.translate_clause_helper(uc,
                                                                        uc["fields"],
                                                                        True)
                            if uc["variable"] not in sub_queries[-1]["variable_to_clause_id"]:
                                sub_query["variable_to_clause_id"][uc["variable"]] = []
                            variable_to_clause_id = sub_query["variable_to_clause_id"][uc["variable"]]
                            variable_to_clause_id.append(uc["_id"])


                            union_shoulds.append(uc_es_clause)
                        union_q = Bool(should=union_shoulds)
                        # must or filter?
                        filters.append(union_q)


                elif "constraint" not in clause and "clauses" not in clause:
                    if "variable" in clause and\
                        clause["variable"] in unbound_subquery_variables:
                        clause["constraint"] = "__placeholder__"
                        es_clause = self.translate_clause_helper(clause,
                                                                clause["fields"],
                                                                True)
                        if clause["variable"] not in sub_queries[-1]["variable_to_clause_id"]:
                            sub_query["variable_to_clause_id"][clause["variable"]] = []
                        variable_to_clause_id = sub_query["variable_to_clause_id"][clause["variable"]]
                        variable_to_clause_id.append(clause["_id"])
                        # must or filter?
                        filters.append(es_clause)




        for key, value in shoulds_by_predicate.iteritems():
            if len(value) > 1:
                shoulds.append(DisMax(queries=value))
            else:
                shoulds.append(value[0])


        if "filters" in where:
            filter_clauses = where["filters"]
            for f in filter_clauses:
                source_fields = self.generate_filter(f, filters, source_fields)

        if self.elasticsearch_compiler_options.get("convert_text_filters_to_shoulds", False):
            valid_filters = list()
            converted_filters = list()
            for f in filters:
                is_matches = False
                if isinstance(f, DisMax):
                    is_matches = True
                    for q in f.queries:
                        if isinstance(q, Range):
                            is_matches = False
                        break
                if is_matches:
                    converted_filters.append(f)
                else:
                    valid_filters.append(f)

            shoulds.extend(converted_filters)
            filters= valid_filters
        q = Bool(must=musts,
                 should=shoulds,
                 filter=filters,
                 must_not=must_nots)
        if ("boost_musts" in self.elasticsearch_compiler_options and
            len(musts) > 0) or\
                "boost_shoulds" in self.elasticsearch_compiler_options:
            if "boost_musts" in self.elasticsearch_compiler_options\
                    and len(musts) == 1:
                shoulds.extend(musts)
                q = Bool(should=shoulds,
                         filter=filters,
                         must_not=must_nots)
            else:
                boost = 10.0
                weighted_by_musts = []
                musts_temp = musts
                if "boost_musts" in self.elasticsearch_compiler_options:
                    shoulds.extend(musts)
                    musts_temp = []

                if len(shoulds) > 0:
                    extra_minimum_should_match = 0
                    if len(shoulds) >= 2 and "boost_shoulds"\
                            in self.elasticsearch_compiler_options:
                        extra_minimum_should_match = 1
                    for x in range(0, len(shoulds) - extra_minimum_should_match):
                        weighted_q = Bool(
                            must=musts_temp,
                            should=shoulds,
                            filter=filters,
                            must_not=must_nots,
                            boost=boost,
                            minimum_should_match=len(shoulds) - x)
                        weighted_by_musts.append(weighted_q)
                        boost = boost / 2
                    weighted_must = Bool(should=weighted_by_musts,
                                         disable_coord=True)
                    q = weighted_must

        s = Search()
        s.query = q
        if is_root:
            s = self.generate_query_boilerplate(query, s, source_fields)
        else:
            s = self.generate_source_fields(s, source_fields)
        es_result = {}
        es_result["search"] = self.clean_dismax(s.to_dict())
        es_result["type"] = where["type"]
        if len(sub_queries) > 0:
            sub_queries.append(es_result)
            return sub_queries
        return es_result
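# A minimal standalone sketch (not part of the original snippet) of how the
# Bool/DisMax combination built by generate_where() maps onto elasticsearch_dsl
# objects; the field names (title, category) and values are placeholders.
from elasticsearch_dsl import Search, Q
from elasticsearch_dsl.query import Bool, DisMax

# Two optional clauses on the same predicate collapse into a single DisMax,
# mirroring the shoulds_by_predicate handling above.
optional = DisMax(queries=[Q('match', title='foo'),
                           Q('match', category='foo')])
q = Bool(must=[Q('match', title='bar')],
         should=[optional],
         filter=[],
         must_not=[])

s = Search()
s.query = q
print(s.to_dict())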
Exemplo n.º 51
0
    def get(self, **kwargs):
        """Return a list of results and aggregations based on parameters.

        The list of accepted parameters (with types and default values) is in
        the database and can be accessed with the super_search_fields service.
        """
        # Require that the list of fields be passed.
        if not kwargs.get('_fields'):
            raise MissingArgumentError('_fields')
        self.all_fields = kwargs['_fields']

        # Filter parameters and raise potential errors.
        params = self.get_parameters(**kwargs)

        # Find the indices to use to optimize the elasticsearch query.
        indices = self.get_indices(params['date'])

        # Create and configure the search object.
        search = Search(
            using=self.get_connection(),
            index=indices,
            doc_type=self.config.elasticsearch.elasticsearch_doctype,
        )

        # Create filters.
        filters = []
        histogram_intervals = {}

        for field, sub_params in params.items():
            sub_filters = None
            for param in sub_params:
                if param.name.startswith('_'):
                    # By default, all param values are turned into lists,
                    # even when they can only have one value.
                    # For those, we know there is a single value, so we just
                    # extract it from the made-up list.
                    if param.name == '_results_offset':
                        results_from = param.value[0]
                    elif param.name == '_results_number':
                        results_number = param.value[0]
                        if results_number > 1000:
                            raise BadArgumentError(
                                '_results_number',
                                msg=(
                                    '_results_number cannot be greater '
                                    'than 1,000'
                                )
                            )
                        if results_number < 0:
                            raise BadArgumentError(
                                '_results_number',
                                msg='_results_number cannot be negative'
                            )
                    elif param.name == '_facets_size':
                        facets_size = param.value[0]
                        # Why cap it?
                        # Because if the query is covering a lot of different
                        # things you can get a really really large query
                        # which can hog resources excessively.
                        # Downloading, as an example, 100k facets (and 0 hits)
                        # when there is plenty of data yields an 11 MB JSON
                        # file.
                        if facets_size > 10000:
                            raise BadArgumentError(
                                '_facets_size greater than 10,000'
                            )

                    for f in self.histogram_fields:
                        if param.name == '_histogram_interval.%s' % f:
                            histogram_intervals[f] = param.value[0]

                    # Don't use meta parameters in the query.
                    continue

                field_data = self.all_fields[param.name]
                name = self.get_full_field_name(field_data)

                if param.data_type in ('date', 'datetime'):
                    param.value = datetimeutil.date_to_string(param.value)
                elif param.data_type == 'enum':
                    param.value = [x.lower() for x in param.value]
                elif param.data_type == 'str' and not param.operator:
                    param.value = [x.lower() for x in param.value]

                # Operators needing wildcards, and the associated value
                # transformation with said wildcards.
                operator_wildcards = {
                    '~': '*%s*',  # contains
                    '^': '%s*',  # starts with
                    '$': '*%s'  # ends with
                }
                # Operators needing ranges, and the associated Elasticsearch
                # comparison operator.
                operator_range = {
                    '>': 'gt',
                    '<': 'lt',
                    '>=': 'gte',
                    '<=': 'lte',
                }

                args = {}
                filter_type = 'term'
                filter_value = None

                if not param.operator:
                    # contains one of the terms
                    if len(param.value) == 1:
                        val = param.value[0]

                        if not isinstance(val, basestring) or ' ' not in val:
                            # There's only one term and no white space, this
                            # is a simple term filter.
                            filter_value = val
                        else:
                            # If the term contains white spaces, we want to
                            # perform a phrase query.
                            filter_type = 'query'
                            args = Q(
                                'simple_query_string',
                                query=param.value[0],
                                fields=[name],
                                default_operator='and',
                            ).to_dict()
                    else:
                        # There are several terms, this is a terms filter.
                        filter_type = 'terms'
                        filter_value = param.value
                elif param.operator == '=':
                    # is exactly
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator in operator_range:
                    filter_type = 'range'
                    filter_value = {
                        operator_range[param.operator]: param.value
                    }
                elif param.operator == '__null__':
                    filter_type = 'missing'
                    args['field'] = name
                elif param.operator == '__true__':
                    filter_type = 'term'
                    filter_value = True
                elif param.operator == '@':
                    filter_type = 'regexp'
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator in operator_wildcards:
                    filter_type = 'query'

                    # Wildcard operations are better applied to a non-analyzed
                    # field (called "full") if there is one.
                    if field_data['has_full_version']:
                        name = '%s.full' % name

                    q_args = {}
                    q_args[name] = (
                        operator_wildcards[param.operator] % param.value
                    )
                    query = Q('wildcard', **q_args)
                    args = query.to_dict()

                if filter_value is not None:
                    args[name] = filter_value

                if args:
                    new_filter = F(filter_type, **args)
                    if param.operator_not:
                        new_filter = ~new_filter

                    if sub_filters is None:
                        sub_filters = new_filter
                    elif filter_type == 'range':
                        sub_filters &= new_filter
                    else:
                        sub_filters |= new_filter

                    continue

            if sub_filters is not None:
                filters.append(sub_filters)

        search = search.filter(F('bool', must=filters))

        # Restricting returned fields.
        fields = []

        # We keep track of the requested columns in order to make sure we
        # return those column names and not aliases for example.
        self.request_columns = []
        for param in params['_columns']:
            for value in param.value:
                if not value:
                    continue

                self.request_columns.append(value)
                field_name = self.get_field_name(value, full=False)
                fields.append(field_name)

        search = search.fields(fields)

        # Sorting.
        sort_fields = []
        for param in params['_sort']:
            for value in param.value:
                if not value:
                    continue

                # Values starting with a '-' are sorted in descending order.
                # In order to retrieve the database name of the field, we
                # must first remove the '-' part and add it back later.
                # Example: given ['product', '-version'], the results will be
                # sorted by ascending product then descending version.
                desc = False
                if value.startswith('-'):
                    desc = True
                    value = value[1:]

                field_name = self.get_field_name(value)

                if desc:
                    # The underlying library understands that '-' means
                    # sorting in descending order.
                    field_name = '-' + field_name

                sort_fields.append(field_name)

        search = search.sort(*sort_fields)

        # Pagination.
        results_to = results_from + results_number
        search = search[results_from:results_to]

        # Create facets.
        if facets_size:
            self._create_aggregations(
                params,
                search,
                facets_size,
                histogram_intervals
            )

        # Query and compute results.
        hits = []

        if params['_return_query'][0].value[0]:
            # Return only the JSON query that would be sent to elasticsearch.
            return {
                'query': search.to_dict(),
                'indices': indices,
            }

        errors = []

        # We call elasticsearch with a computed list of indices, based on
        # the date range. However, if that list contains indices that do not
        # exist in elasticsearch, an error will be raised. We thus want to
        # remove all failing indices until we either have a valid list, or
        # an empty list in which case we return no result.
        while True:
            try:
                results = search.execute()
                for hit in results:
                    hits.append(self.format_fields(hit.to_dict()))

                total = search.count()

                aggregations = getattr(results, 'aggregations', {})
                if aggregations:
                    aggregations = self.format_aggregations(aggregations)

                shards = getattr(results, '_shards', {})

                break  # Yay! Results!
            except NotFoundError, e:
                missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
                if missing_index in indices:
                    del indices[indices.index(missing_index)]
                else:
                    # Wait what? An error caused by an index that was not
                    # in the request? That should never happen, but in case
                    # it does, better know it.
                    raise

                errors.append({
                    'type': 'missing_index',
                    'index': missing_index,
                })

                if indices:
                    # Update the list of indices and try again.
                    # Note: we need to first empty the list of indices before
                    # updating it, otherwise the removed indices never get
                    # actually removed.
                    search = search.index().index(*indices)
                else:
                    # There is no index left in the list, return an empty
                    # result.
                    hits = []
                    total = 0
                    aggregations = {}
                    shards = None
                    break
            except RequestError as exception:
                # Try to handle it gracefully if we can find out what
                # input was bad and caused the exception.
                try:
                    bad_input = ELASTICSEARCH_PARSE_EXCEPTION_REGEX.findall(
                        exception.error
                    )[-1]
                    # Loop over the original parameters to try to figure
                    # out which *key* had the bad input.
                    for key, value in kwargs.items():
                        if value == bad_input:
                            raise BadArgumentError(key)
                except IndexError:
                    # Not an ElasticsearchParseException exception
                    pass
                raise
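# A minimal standalone sketch (not part of the original snippet) of the
# missing-index retry loop used above. The index names are invented and the
# regular expression is an assumption about the shape of the error message;
# the real BAD_INDEX_REGEX is defined elsewhere in the original module.
import re

from elasticsearch import Elasticsearch
from elasticsearch.exceptions import NotFoundError
from elasticsearch_dsl import Search

client = Elasticsearch()
indices = ['crashes_2024w01', 'crashes_2024w02']  # hypothetical weekly indices
missing_index_re = re.compile(r'\[([^\]]+)\] missing')  # assumed error format

search = Search(using=client, index=indices).query('match_all')

results = None
while True:
    try:
        results = search.execute()
        break  # got results
    except NotFoundError as e:
        found = missing_index_re.findall(str(e.error))
        if not found or found[0] not in indices:
            raise  # not a missing-index error we can recover from
        indices.remove(found[0])
        if not indices:
            break  # nothing left to query
        # Reset the index list on the Search object and retry.
        search = search.index().index(*indices)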
Exemplo n.º 52
0
    def get(self, **kwargs):
        """Return a list of results and aggregations based on parameters.

        The list of accepted parameters (with types and default values) is in
        the database and can be accessed with the super_search_fields service.
        """
        # Filter parameters and raise potential errors.
        params = self.get_parameters(**kwargs)

        # Find the indices to use to optimize the elasticsearch query.
        indices = self.get_indices(params['date'])

        # Create and configure the search object.
        search = Search(
            using=self.get_connection(),
            index=indices,
            doc_type=self.config.elasticsearch.elasticsearch_doctype,
        )

        # Create filters.
        filters = None

        for field, sub_params in params.items():
            sub_filters = None
            for param in sub_params:

                if param.name.startswith('_'):
                    if param.name == '_results_offset':
                        results_from = param.value[0]
                    elif param.name == '_results_number':
                        results_number = param.value[0]
                    # Don't use meta parameters in the query.
                    continue

                field_data = self.all_fields[param.name]

                name = '%s.%s' % (
                    field_data['namespace'],
                    field_data['in_database_name']
                )

                if param.data_type in ('date', 'datetime'):
                    param.value = datetimeutil.date_to_string(param.value)
                elif param.data_type == 'enum':
                    param.value = [x.lower() for x in param.value]
                elif param.data_type == 'str' and not param.operator:
                    param.value = [x.lower() for x in param.value]

                args = {}
                filter_type = 'term'
                filter_value = None
                if not param.operator:
                    # contains one of the terms
                    if len(param.value) == 1:
                        val = param.value[0]
                        if not isinstance(val, basestring) or (
                            isinstance(val, basestring) and ' ' not in val
                        ):
                            filter_value = val

                        # If the term contains white spaces, we want to perform
                        # a phrase query. Thus we do nothing here and let this
                        # value be handled later.
                    else:
                        filter_type = 'terms'
                        filter_value = param.value
                elif param.operator == '=':
                    # is exactly
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator == '>':
                    # greater than
                    filter_type = 'range'
                    filter_value = {
                        'gt': param.value
                    }
                elif param.operator == '<':
                    # lower than
                    filter_type = 'range'
                    filter_value = {
                        'lt': param.value
                    }
                elif param.operator == '>=':
                    # greater than or equal to
                    filter_type = 'range'
                    filter_value = {
                        'gte': param.value
                    }
                elif param.operator == '<=':
                    # lower than or equal to
                    filter_type = 'range'
                    filter_value = {
                        'lte': param.value
                    }
                elif param.operator == '__null__':
                    # is null
                    filter_type = 'missing'
                    args['field'] = name

                if filter_value is not None:
                    args[name] = filter_value

                if args:
                    if param.operator_not:
                        new_filter = ~F(filter_type, **args)
                    else:
                        new_filter = F(filter_type, **args)

                    if sub_filters is None:
                        sub_filters = new_filter
                    elif param.data_type == 'enum':
                        sub_filters |= new_filter
                    else:
                        sub_filters &= new_filter

                    continue

                # These use a wildcard and thus need to be in a query
                # instead of a filter.
                operator_wildcards = {
                    '~': '*%s*',  # contains
                    '^': '%s*',  # starts with
                    '$': '*%s'  # ends with
                }
                if param.operator in operator_wildcards:
                    if field_data['has_full_version']:
                        name = '%s.full' % name

                    query_type = 'wildcard'
                    args[name] = (
                        operator_wildcards[param.operator] % param.value
                    )
                elif not param.operator:
                    # This is a phrase that was passed down.
                    query_type = 'simple_query_string'
                    args['query'] = param.value[0]
                    args['fields'] = [name]
                    args['default_operator'] = 'and'

                if args:
                    query = Q(query_type, **args)
                    if param.operator_not:
                        query = ~query
                    search = search.query(query)
                else:
                    # If we reach this point, that means the operator is
                    # not supported, and we should raise an error about that.
                    raise NotImplementedError(
                        'Operator %s is not supported' % param.operator
                    )

            if filters is None:
                filters = sub_filters
            elif sub_filters is not None:
                filters &= sub_filters

        search = search.filter(filters)

        # Restricting returned fields.
        fields = []
        for param in params['_columns']:
            for value in param.value:
                if not value:
                    continue

                try:
                    field_ = self.all_fields[value]
                except KeyError:
                    # That is not a known field, we can't restrict on it.
                    raise BadArgumentError(
                        value,
                        msg='Unknown field "%s", cannot return it' % value
                    )

                if not field_['is_returned']:
                    # Returning this field is not allowed.
                    raise BadArgumentError(
                        value,
                        msg='Field "%s" is not allowed to be returned' % value
                    )

                field_name = '%s.%s' % (
                    field_['namespace'],
                    field_['in_database_name']
                )

                fields.append(field_name)

        search = search.fields(fields)

        # Sorting.
        sort_fields = []
        for param in params['_sort']:
            for value in param.value:
                if not value:
                    continue

                # Values starting with a '-' are sorted in descending order.
                # In order to retrieve the database name of the field, we
                # must first remove the '-' part and add it back later.
                # Example: given ['product', '-version'], the results will be
                # sorted by ascending product and descending version.
                desc = False
                if value.startswith('-'):
                    desc = True
                    value = value[1:]

                try:
                    field_ = self.all_fields[value]
                except KeyError:
                    # That is not a known field, we can't sort on it.
                    raise BadArgumentError(
                        value,
                        msg='Unknown field "%s", cannot sort on it' % value
                    )

                field_name = '%s.%s' % (
                    field_['namespace'],
                    field_['in_database_name']
                )

                if desc:
                    # The underlying library understands that '-' means
                    # sorting in descending order.
                    field_name = '-' + field_name

                sort_fields.append(field_name)

        search = search.sort(*sort_fields)

        # Pagination.
        results_to = results_from + results_number
        search = search[results_from:results_to]

        # Create facets.
        for param in params['_facets']:
            for value in param.value:
                try:
                    field_ = self.all_fields[value]
                except KeyError:
                    # That is not a known field, we can't facet on it.
                    raise BadArgumentError(
                        value,
                        msg='Unknown field "%s", cannot facet on it' % value
                    )

                field_name = '%s.%s' % (
                    field_['namespace'],
                    field_['in_database_name']
                )

                if field_['has_full_version']:
                    # If the param has a full version, that means what matters
                    # is the full string, and not its individual terms.
                    field_name += '.full'

                search.aggs.bucket(
                    value,
                    'terms',
                    field=field_name,
                    size=self.config.facets_max_number
                )

        # Query and compute results.
        hits = []

        if params['_return_query'][0].value[0]:
            # Return only the JSON query that would be sent to elasticsearch.
            return {
                'query': search.to_dict(),
                'indices': indices,
            }

        # We call elasticsearch with a computed list of indices, based on
        # the date range. However, if that list contains indices that do not
        # exist in elasticsearch, an error will be raised. We thus want to
        # remove all failing indices until we either have a valid list, or
        # an empty list in which case we return no result.
        while True:
            try:
                results = search.execute()
                for hit in results:
                    hits.append(self.format_fields(hit.to_dict()))

                total = search.count()
                aggregations = self.format_aggregations(results.aggregations)
                break  # Yay! Results!
            except NotFoundError, e:
                missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
                if missing_index in indices:
                    del indices[indices.index(missing_index)]
                else:
                    # Wait what? An error caused by an index that was not
                    # in the request? That should never happen, but in case
                    # it does, better know it.
                    raise

                if indices:
                    # Update the list of indices and try again.
                    # Note: we need to first empty the list of indices before
                    # updating it, otherwise the removed indices never get
                    # actually removed.
                    search = search.index().index(*indices)
                else:
                    # There is no index left in the list, return an empty
                    # result.
                    hits = []
                    total = 0
                    aggregations = {}
                    break
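# A short standalone note (not part of the original snippet): the filter
# objects above are combined with ~, & and |, which elasticsearch_dsl
# overloads on both F and Q objects. A minimal sketch with Q, using
# invented field names and values.
from elasticsearch_dsl import Search, Q

term_q = Q('term', product='firefox')          # invented field/value
range_q = Q('range', version={'gte': '50.0'})  # invented field/value
negated = ~Q('term', product='seamonkey')      # invented field/value

# & produces a bool with both clauses in must; ~ wraps its clause in must_not.
s = Search().filter(term_q & range_q & negated)
print(s.to_dict())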
Exemplo n.º 53
0
class TopologyData(object):
    """A base class used by models that are really Elasticsearch entries, and
    not db tables."""

    _DOC_TYPE = ""
    _INDEX_PREFIX = ""

    def __init__(self):
        self.conn = es_conn()
        self.search = Search(self.conn)

        # Using the private setters over methods simplifies mocking for
        # unit tests.
        # pylint: disable=W0212
        self.search._doc_type = self._DOC_TYPE
        self.search._index = es_indices(self._INDEX_PREFIX, self.conn)

    @classmethod
    def _sort_arg(cls, key, order):
        """Return key as, key or -key, depending on the sort order."""

        if order in ["+", "asc"]:
            return key              # translates to [{key: {'order': 'asc'}}]
        elif order in ["-", "desc"]:
            return "-" + key        # translates to [{key: {'order': 'desc'}}]
        else:
            raise ValueError("Valid order values are in [+, -, asc, desc]")

    def get(self, count=1, sort_key="@timestamp", sort_order="desc"):
        """Return the latest n instances from ES or None if not found."""
        from elasticsearch import ElasticsearchException

        try:
            self.search = self.search.sort(self._sort_arg(sort_key, sort_order))
            self.search = self.search[0:count]

            logger.debug("[get] search = %s", self.search.to_dict())
            # pylint: disable=W0212
            logger.debug("[get] index = %s", self.search._index)
            logger.debug("[get] doc_type = %s", self._DOC_TYPE)

            return self.search.execute()

        except ElasticsearchException as exc:
            logger.debug("get from ES failed, exception was %s", exc.message)
            raise

        except ValueError as exc:
            logger.exception(exc)
            raise

    def post(self, body, **_):
        """Post a record to the database.

        :arg body: record body as JSON object
        :arg _: Unused.
        :return: id of the inserted record

        """

        logger.debug("post called with body = %s", json.dumps(body))

        response = self.conn.create(
            daily_index(self._INDEX_PREFIX),
            self._DOC_TYPE,
            body,
            refresh=True)

        logger.debug('[post] response = %s', json.dumps(response))
        return response['_id']
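# A minimal subclassing sketch (not part of the original snippet). The doc
# type and index prefix values are placeholders; es_conn(), es_indices() and
# daily_index() come from the same module as TopologyData above.
class HostData(TopologyData):
    _DOC_TYPE = "host"              # placeholder doc type
    _INDEX_PREFIX = "topology-"     # placeholder index prefix

# Fetch the five most recent records, newest first (needs a live cluster):
# latest = HostData().get(count=5, sort_key="@timestamp", sort_order="desc")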
Exemplo n.º 54
0
elif args.cmd == "missingparameter":
    s = query_missingparam(s, args.parameter, args.method, args.responsecode, args.invert)
    querytype = QUERY_SEARCH
elif args.cmd == "headervalues":
    s = query_headervals(s, args.header)
    querytype = QUERY_VALUES
elif args.cmd == "search":
    s = query(s, " ".join(args.query))
    querytype = QUERY_SEARCH
else:
    argparser.print_help()
    sys.exit(1)

if querytype == QUERY_SEARCH:
    if args.fields:
        print_debug(s.to_dict())
        r = s.scan()
    else:
        add_default_aggregation(s)
        print_debug(s.to_dict())
        r = s.execute()
elif querytype == QUERY_VALUES:
    print_debug(s.to_dict())
    r = s.execute()

if querytype == QUERY_SEARCH:
    if not r:
        print("No matches!")
        sys.exit(0)
    if args.fields:
        for d in r:
Exemplo n.º 55
0
class Elastic(LogProvider):
    def __init__(self, config_file='config.cfg'):
        super(Elastic, self).__init__()

        self.percentage=10.0
        self.minimum_occurrences=250

# The ConfigParser documentation points out that there's no way to force default config options
# outside the "DEFAULT" section.
        config = ConfigParser()
        config.read(config_file)
        if not config.has_section('elastic'):
            config.add_section('elastic')
        
        for option, value in {'use_ssl': 'True', 'host': '127.0.0.1', 'version': '2', 'index': 'nxapi', 'doc_type': 'events'}.items():
            if not config.has_option('elastic', option):
                config.set('elastic', option, value)

        self.version = config.getint('elastic', 'version')
        self.index = config.get('elastic', 'index')
        use_ssl = config.getboolean('elastic', 'use_ssl')
        host = config.get('elastic', 'host')
        self.doc_type = config.get('elastic', 'doc_type')
        self.client = connections.create_connection(hosts=[host], use_ssl=use_ssl, index=self.index, version=self.version, doc_type=self.doc_type, timeout=30, retry_on_timeout=True )

        Event.init(index=self.index)
        index = Index(self.index, using=self.client)
        index.doc_type(Event)
        self.initialize_search()

    def initialize_search(self):
        self.search = Search(using=self.client, index=self.index).extra(size=10000)
        
    def export_search(self):
        return self.search

    def import_search(self, search):
        self.search = search

    def get_filters(self):
        return self.search.to_dict()

    def add_filters(self, filters, regexp=False, negative=False):
        """
        Add `filters` to the query.
        `filters` is a dict of the form {'field': value, 'field2': value2}; you can also use a list of values
        instead of a `str`. List values are combined with an _or_ (not an _and_).
        :param dict filters:
        :param bool regexp:
        :param bool negative:
        :return:
        """
        # We need to use multi_match, since we get the fields names dynamically.
        for key, value in filters.items():
            if isinstance(value, set):
                value = list(value)

            # There is no need to process empty values.
            if not value:
                continue

            if isinstance(value, list):
                if negative:
                    self.search = self.search.query(Q('bool', must_not=[
                        reduce(operator.or_, [Q('multi_match', query=v, fields=[key]) for v in value])])
                    )
                else:
                    self.search = self.search.query(Q('bool', must=[
                        reduce(operator.or_, [Q('multi_match', query=v, fields=[key]) for v in value])])
                    )
            else:
                if negative:
                    self.search = self.search.query(~Q("multi_match", query=value, fields=[key]))
                else:
                    self.search = self.search.query(Q("multi_match", query=value, fields=[key]))

    def get_top(self, field, size=250):
        """
        Get the top values for the given `field`
        :param str field: the field to filter on
        :param int size: how many top values to return
        :return dict of int: A structure of the form {value: number_of_hits, value2: number_of_hits2}
        """
        search = self.search
        ret = dict()

        if field in ['uri', 'vers', 'comments', 'server']:
            field = ''.join((field, '.raw'))

        if VERSION < (5, 0, 0):
            self.search = self.search.params(search_type='count', default_operator='AND')
        else:
            self.search = self.search.params(search_type='query_then_fetch')
        # This is documented at https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search
        # search_type='count' has been deprecated in ES 2.0
        self.search.aggs.bucket('TEST', 'terms', field=field)
        for hit in self.search.execute(ignore_cache=True).aggregations['TEST']['buckets']:
            ret[hit['key']] = hit['doc_count']
        self.search = search
        return ret

    def get_relevant_ids(self, fields, percentage=0, minimum_occurrences=0):
        """ This function is supposed to return the id that are reparteed/present on the `fields`.

         :param list of str fields:
         :param float percentage:
         :param float minimum_occurrences:
         :return set of int:
         """
        minimum_occurrences = minimum_occurrences or self.minimum_occurrences
        percentage = percentage or self.percentage

        ret = set()
        search = self.search
        ids = set(i['id'] for i in self.search.execute())  # get all possible IDs
        self.search = search

        for _id in ids:
            search = self.search

            self.add_filters({'id': _id})

            # Get how many different fields there are for a given `id`
            data = collections.defaultdict(set)
            fields_counter = collections.defaultdict(int)
            for res in self.search.execute():
                for field in fields:
                    if res[field] not in data[field]:
                        fields_counter[field] += 1.0
                    data[field].add(res[field])

            # Ignore ids that are present in less than 10% of the different values of each field
            for field, content in data.items():
                if len(content) < minimum_occurrences:
                    logging.debug('Discarding id \033[32m%s\033[0m only present %d times.', _id, len(content))
                    continue
                _percentage = len(content) / fields_counter[field] * 100.0
                if _percentage > percentage:
                    continue
                logging.debug('Discarding id \033[32m%s\033[0m present in %d%% of different values of the \033[32m%s\033[0m field', _id, _percentage, field)
                break
            else:
                ret.add(_id)
            self.search = search

        return ret

    def reset_filters(self):
        self.search = Search(using=self.client, index=self.index).extra(size=10000)

    def get_results(self):
        """
        Return a `Result` object obtained from the execution of the search `self.search`.
        :return Result: The `Result` object obtained from the execution of the search `self.search`.
        """
        search = self.search
        result = self.search.scan()
        self.search = search
        return result

    def commit(self):
        """Process list of dict (yes) and push them to DB """
        self.total_objs += len(self.nlist)
        count = 0

        def gen_events(events):
            dicts = list()
            for d in events:
                dicts.extend([{'index': {'_index': 'nxapi', '_type': 'events'}}, d.to_dict()])
                yield dicts.pop(-2)
                yield dicts.pop(-1)


        events = list()
        for entry in self.nlist:
            event = Event(_index=self.index)
            for key, value in entry.items():
                setattr(event, key, value)

            event.whitelisted = False
            event.comments = "import on"+str(datetime.datetime.now())
            events.append(event)
            count += 1

        try:
            ret = self.client.bulk(gen_events(events))
            ## ToDo: parse ret and selectively call event.save() only for the entries that failed
        except TransportError as e:
            logging.warning("We encountered an error trying to continue.")
            for event in events:
                event.save(using=self.client)
                ## ToDo find a way to change the hardcoded 'events' for ES doctype
                ## elasticsearch_dsl Issue 689
               
        self.total_commits += count
        logging.debug("Written "+str(self.total_commits)+" events")
        del self.nlist[0:len(self.nlist)]
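# A minimal usage sketch for the provider above (not part of the original
# snippet); it assumes a reachable Elasticsearch node and the default 'nxapi'
# index. The field names come from this module, the filter values are invented.
provider = Elastic()
provider.add_filters({'server': 'www.example.com', 'uri': ['/login', '/admin']})
top_uris = provider.get_top('uri')      # {uri_value: hit_count, ...}
for event in provider.get_results():    # scan() over all matching events
    pass                                # process each event here
provider.reset_filters()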
Exemplo n.º 56
0
    def _build_query(self):
        query = Q()

        source = ['id']
        sort = []

        aggregations = {}
        query_string = None
        as_list = as_dict = False

        for action, value in self.steps:
            if action == 'order_by':
                for key in value:
                    if key.startswith('-'):
                        sort.append({key[1:]: 'desc'})
                    else:
                        sort.append(key)
            elif action == 'values':
                source.extend(value)
                as_list, as_dict = True, False
            elif action == 'values_dict':
                if value:
                    source.extend(value)
                as_list, as_dict = False, True
            elif action == 'query':
                query &= self._process_queries(value)
            elif action == 'filter':
                query &= self._process_filters(value)
            elif action == 'source':
                source.extend(value)
            elif action == 'aggregate':
                aggregations.update(value)
            elif action == 'filter_query_string':
                query_string = value
            else:
                raise NotImplementedError(action)

        # If we have a raw query string we are going to apply all sorts
        # of boosts and filters to improve relevance scoring.
        #
        # We are using the same rules that `search.filters:SearchQueryFilter`
        # implements to have a single source of truth for how our
        # scoring works.
        from olympia.search.filters import SearchQueryFilter

        search = Search().query(query)

        if query_string:
            search = SearchQueryFilter().apply_search_query(
                query_string, search)

        if sort:
            search = search.sort(*sort)

        if source:
            search = search.source(source)

        body = search.to_dict()

        # These are manually added for now to simplify a partial port to
        # elasticsearch-dsl
        if self.start:
            body['from'] = self.start
        if self.stop is not None:
            body['size'] = self.stop - self.start
        if aggregations:
            body['aggs'] = aggregations

        self.source, self.as_list, self.as_dict = source, as_list, as_dict
        return body
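# A short standalone note (not part of the original snippet): the manual
# 'from'/'size'/'aggs' keys added to the body above can also be set directly
# on a Search object. A minimal sketch; the query, aggregation name and field
# are placeholders.
from elasticsearch_dsl import Search, A

s = Search().query('match', name='example')          # placeholder query
s = s[10:30]                                          # from=10, size=20
s.aggs.bucket('by_type', A('terms', field='type'))   # placeholder aggregation
body = s.to_dict()
print(body['from'], body['size'], 'aggs' in body)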
Exemplo n.º 57
0
# -*- encoding: utf-8 -*-

import json
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q


client = Elasticsearch()
        
s = Search(using=client, index="test-index").query("match", nick=u"压力")

testresult = client.search(index='test-index', body=s.to_dict(), size=3, from_=3)
print '=============',testresult

response = s.execute()
print s.to_dict()
Exemplo n.º 58
0
__author__ = "Monica Fernandez"


from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q

client = Elasticsearch()

s = Search().query("term", title="Example").query("term", author="fracma")
print(s.to_dict())
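# A short standalone note (not part of the original snippet): chained .query()
# calls are combined with &, so the search above ends up as a single bool
# query with both term clauses in 'must'. A minimal check:
from elasticsearch_dsl import Search, Q

s = Search().query("term", title="Example").query("term", author="fracma")
combined = Q("term", title="Example") & Q("term", author="fracma")
print(s.to_dict()["query"] == combined.to_dict())  # expected: True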