Пример #1
0
def simple_search_public_data(query_text):
    """Search experiments, datasets and datafiles restricted to public access.

    Three searches are sent in a single multi-search request. Each one
    matches ``query_text`` against one text field and additionally requires
    the ``public_access`` term (on the record itself for experiments, on the
    related ``experiments`` for datasets and datafiles) to equal 100.

    :param query_text: text matched against the experiment ``title``, the
        dataset ``description`` and the datafile ``filename``
    :return: dict with the keys "experiments", "datasets" and "datafiles",
        each holding the list of raw hits (as dicts) from that index
    """
    # Maps the index a hit came from to the key it is stored under.
    key_for_index = {
        "experiments": "experiments",
        "dataset": "datasets",
        "datafile": "datafiles",
    }
    result_dict = {"experiments": [], "datasets": [], "datafiles": []}
    ms = MultiSearch(index=['experiments', 'dataset', 'datafile'])

    # Experiments: title match AND public access.
    experiment_query = Q("match", title=query_text) & \
        Q("term", public_access=100)
    experiment_search = Search(index='experiments').extra(
        size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE)
    ms = ms.add(experiment_search.query(experiment_query))

    # Datasets: description match, plus a nested query on the experiments.
    dataset_match = Q("match", description=query_text)
    dataset_public = Q("term", **{'experiments.public_access': 100})
    dataset_search = Search(index='dataset').extra(
        size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE)
    dataset_search = dataset_search.query(dataset_match)
    dataset_search = dataset_search.query(
        'nested', path='experiments', query=dataset_public)
    ms = ms.add(dataset_search)

    # Datafiles: filename match AND public access of the experiments.
    datafile_query = Q("match", filename=query_text) & \
        Q("term", experiments__public_access=100)
    datafile_search = Search(index='datafile').extra(
        size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE)
    ms = ms.add(datafile_search.query(datafile_query))

    for response in ms.execute():
        for hit in response.hits.hits:
            key = key_for_index.get(hit["_index"])
            # Hits from any other index are ignored.
            if key is not None:
                result_dict[key].append(hit.to_dict())
    return result_dict
Пример #2
0
    def get_object_list(self, request):
        """Run the search named by the ``query`` GET parameter for the user.

        Unauthenticated users get the result of
        ``simple_search_public_data``. For authenticated users three searches
        (experiments, datasets, datafiles) are sent in one multi-search
        request; each requires that the ACL ``entityId`` equals the user's id
        or one of the user's group ids, or that ``public_access`` equals 100.

        :param request: the incoming request; ``request.user`` and
            ``request.GET`` are read
        :return: a one-element list holding a ``SearchObject`` whose ``hits``
            is a dict with the keys "experiments", "datasets", "datafiles"
        """
        requester = request.user
        query_text = request.GET.get('query', None)
        if not requester.is_authenticated:
            public_hits = simple_search_public_data(query_text)
            return [SearchObject(id=1, hits=public_hits)]

        groups = requester.groups.all()
        ms = MultiSearch(index=['experiments', 'dataset', 'datafile'])

        # Experiments: title match AND (own ACL OR public OR group ACL).
        exp_acl = Q("term", objectacls__entityId=requester.id) | \
            Q("term", public_access=100)
        for group in groups:
            exp_acl |= Q("term", objectacls__entityId=group.id)
        exp_search = Search(index='experiments').extra(
            size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE)
        ms = ms.add(exp_search.query(Q("match", title=query_text) & exp_acl))

        # Datasets: description match, ACL applied as a nested query.
        dataset_match = Q("match", description=query_text)
        dataset_acl = \
            Q("term", **{'experiments.objectacls.entityId': requester.id}) | \
            Q("term", **{'experiments.public_access': 100})
        for group in groups:
            dataset_acl |= Q(
                "term", **{'experiments.objectacls.entityId': group.id})
        dataset_search = Search(index='dataset').extra(
            size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE)
        dataset_search = dataset_search.query(dataset_match)
        dataset_search = dataset_search.query(
            'nested', path='experiments', query=dataset_acl)
        ms = ms.add(dataset_search)

        # Datafiles: filename match AND ACL on the related experiments.
        datafile_acl = \
            Q("term", experiments__objectacls__entityId=requester.id) | \
            Q("term", experiments__public_access=100)
        for group in groups:
            datafile_acl |= Q(
                "term", experiments__objectacls__entityId=group.id)
        datafile_search = Search(index='datafile').extra(
            size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE)
        ms = ms.add(datafile_search.query(
            Q("match", filename=query_text) & datafile_acl))

        responses = ms.execute()

        # Maps the index a hit came from to the key it is stored under.
        key_for_index = {
            "experiments": "experiments",
            "dataset": "datasets",
            "datafile": "datafiles",
        }
        result_dict = {"experiments": [], "datasets": [], "datafiles": []}
        for response in responses:
            for hit in response.hits.hits:
                key = key_for_index.get(hit["_index"])
                if key is not None:
                    result_dict[key].append(hit.to_dict())

        return [SearchObject(id=1, hits=result_dict)]
Пример #3
0
def get_usernames_for_crawl():
    """Yield the usernames in the ``populars`` index that are due for a crawl.

    Two searches are sent in one multi-search request: documents that have
    no ``last_update`` field, and documents whose ``last_update`` is at most
    ``now-2d``. Each search is sized to its own hit count so every match is
    returned.

    :return: generator yielding the ``username`` of every hit, first for the
        never-updated documents and then for the old ones
    """
    index = 'populars'
    ms = MultiSearch(index=index)

    # The searches carry the index themselves so that count() is evaluated
    # against the same index the multi-search queries; without it the count
    # would span every index on the cluster.
    missing_last_update = Q(
        {"bool": {"must_not": {"exists": {"field": "last_update"}}}})
    never_updated = Search(index=index).query(missing_last_update)
    never_updated = never_updated[0:never_updated.count()]

    old_updated = Search(index=index).query(
        'range', last_update={"lte": "now-2d"})
    old_updated = old_updated[0:old_updated.count()]

    ms = ms.add(never_updated)
    ms = ms.add(old_updated)
    for response in ms.execute():
        for hit in response:
            yield hit.username
Пример #4
0
    def _fetch_word_freqs_per_day(
        self,
        dataset_widget: DatasetWidget,
    ) -> Tuple[Mapping[str, Sequence[int]], Sequence[int], int]:
        """Fetch per-day term counts for the widget's dataset.

        One zero-hit aggregation search is issued per date produced by
        ``date_range(self._min_date, self._max_date)``, each filtered to the
        range ``[date, date + 1 day)``, and all are sent as one multi-search.

        Returns a 3-tuple:
            * mapping of bucket key to a list with one ``doc_count`` per
              response (0 where the key has no bucket),
            * list with the total hit count of every response,
            * wall-clock duration of the multi-search in milliseconds.
        """
        _LOGGER.debug("Fetching word frequencies per day.")

        helper = SearchHelper(dataset_widget.dataset.type)
        # size=0: only totals and aggregations are needed, no documents.
        base_search = dataset_widget.set_search(
            Search().extra(size=0, track_total_hits=True))
        base_search = helper.add_agg_text_tokens_terms(
            base_search, size=self._top_n_words)

        one_day = timedelta(days=1)
        multi = MultiSearch()
        for day in date_range(self._min_date, self._max_date):
            day_filter = helper.query_date_range(gte=day, lt=day + one_day)
            multi = multi.add(base_search.filter(day_filter))

        started = time()
        responses = multi.execute()
        took_msecs = int((time() - started) * 1000)

        num_responses = len(responses)
        word_freqs = defaultdict(lambda: [0] * num_responses)
        num_docs = []
        for position, response in enumerate(responses):
            num_docs.append(response.hits.total.value)
            for bucket in helper.read_agg_text_tokens_terms(response):
                word_freqs[bucket.key][position] = bucket.doc_count

        return word_freqs, num_docs, took_msecs
Пример #5
0
    def mcount_buckets(self, buckets):
        """Count documents per index and per value for each given field.

        For every field name in ``buckets`` one search over all indices
        matching ``TMUtils.MAP_PREFIX`` + ``*`` is built, with a terms
        aggregation on ``_index`` and a nested terms aggregation on the
        field. All searches are executed as one multi-search.

        :param buckets: iterable of field names to aggregate on
        :return: dict of index name (with the leading ``MAP_PREFIX`` removed)
            -> field name -> field value -> document count
        """
        index_pattern = "{}*".format(TMUtils.MAP_PREFIX)
        ms = MultiSearch(using=self.es)
        for field_name in buckets:
            search = Search(using=self.es, index=index_pattern)
            # aggs is modified in place; bucket() returns the new bucket so
            # the second aggregation can be nested inside it.
            per_index = search.aggs.bucket(
                'indexes', 'terms', field='_index', size=999999)
            per_index.bucket('values', 'terms', field=field_name, size=999999)
            ms = ms.add(search)

        mres = ms.execute()

        prefix_pattern = "^{}".format(TMUtils.MAP_PREFIX)
        lang2buckets = dict()
        for field_name, res in zip(buckets, mres):
            has_indexes = hasattr(res, "aggregations") \
                and 'indexes' in res.aggregations
            if not has_indexes:
                continue
            for index_bucket in res.aggregations['indexes'].buckets:
                lang_pair = re.sub(prefix_pattern, "", index_bucket.key)
                for value_bucket in index_bucket['values'].buckets:
                    counts = lang2buckets.setdefault(
                        lang_pair, dict()).setdefault(field_name, dict())
                    counts[value_bucket.key] = value_bucket.doc_count

        return lang2buckets
Пример #6
0
    def mexist(self, src_lang, src_ids):
        """Check in batches whether the given source ids have any hits.

        For every id a search is built with ``_create_search_mindexes``
        against all neighbours of ``src_lang`` in ``self.lang_graph``; the
        searches are executed as multi-searches of at most 10 searches.

        :param src_lang: source language; its graph neighbours are the
            target languages
        :param src_ids: sequence of source ids to check
        :return: list with one entry per executed search: ``True``/``False``
            depending on whether the response has a truthy ``hits.total``,
            or ``None`` when the response could not be inspected. Empty list
            for empty ``src_ids``.
        """
        if not src_ids:
            return []
        tgt_langs = list(self.lang_graph.neighbors(src_lang))

        MEXIST_BATCH_SIZE = 10
        results = []
        for i in range(0, len(src_ids), MEXIST_BATCH_SIZE):
            msearch = MultiSearch(using=self.es)
            for source_id in src_ids[i:i + MEXIST_BATCH_SIZE]:
                search = self._create_search_mindexes(source_id, src_lang,
                                                      tgt_langs)
                # NOTE: ids for which no search could be created are skipped,
                # so the result list can be shorter than src_ids.
                if search:
                    msearch = msearch.add(search)
            responses = msearch.execute()
            for res in responses:
                try:
                    results.append(bool('hits' in res and res.hits.total))
                except Exception:
                    # Raised if the Response is in some invalid state (no
                    # hits, hits are empty). Exception rather than a bare
                    # except, so KeyboardInterrupt/SystemExit still propagate.
                    logging.warning("Invalid Response object: {}".format(
                        res.to_dict()))
                    results.append(None)
        return results
Пример #7
0
    def mget(self, id_langs, return_multiple=False):
        """Fetch the documents for several (id, source, target) triples.

        One search per triple is built with ``_create_search`` and all of
        them are executed as a single multi-search.

        :param id_langs: iterable of ``(source_id, source_lang, target_lang)``
        :param return_multiple: when false (default) only the first hit of
            each response is returned; when true, every hit is appended
        :return: flat list of documents. A response without hits, or one
            that could not be inspected, contributes a single ``None``.
            Empty list for empty ``id_langs``.
        """
        if not id_langs:
            return []
        msearch = MultiSearch(using=self.es)
        search_swap = []
        for source_id, source_lang, target_lang in id_langs:
            search, swap = self._create_search(source_id, source_lang,
                                               target_lang)
            if search:
                # Sort by update date so in case of multiple segments having
                # the same source, the latest one will be returned
                search = search.sort('-update_date')
                msearch = msearch.add(search)
                # Kept in step with the added searches so zip() below pairs
                # every response with its own swap flag.
                search_swap.append(swap)

        responses = msearch.execute()
        results = []
        for res, swap in zip(responses, search_swap):
            try:
                if 'hits' not in res or not res.hits.total:
                    results.append(None)
                    continue
                for ret_doc in res.hits:
                    # Exchange source and target (if needed)
                    if swap:
                        ret_doc = self._swap(ret_doc)
                    results.append(ret_doc)
                    if not return_multiple:
                        break
            except Exception:
                # Raised if the Response is in some invalid state (no hits,
                # hits are empty). Exception rather than a bare except, so
                # KeyboardInterrupt/SystemExit still propagate.
                logging.warning("Invalid Response object: {}".format(
                    res.to_dict()))
                results.append(None)
        return results
Пример #8
0
def _run_multisearch(es, searches):
    """Run a list of Elasticsearch searches using MultiSearch.

    The number of searches sent per request is configurable through
    ES_MULTISEARCH_MAX_LEN.

    Args:
        es (Elasticsearch): Elasticsearch connection.
        searches (list): List of elasticsearch_dsl.Search.

    Raises:
        DataConnectionException: If an error occurred while running the
            searches.

    Returns:
        list: List with the response to each search.

    """
    batch_size = constants.ES_MULTISEARCH_MAX_LEN
    responses = []

    # Split the searches into several batches if needed.
    for start in range(0, len(searches), batch_size):
        ms = MultiSearch(using=es)
        for search in searches[start:start + batch_size]:
            ms = ms.add(search)

        try:
            batch_responses = ms.execute(raise_on_error=True)
        except elasticsearch.ElasticsearchException as e:
            raise DataConnectionException() from e
        responses.extend(batch_responses)

    return responses
Пример #9
0
class MultiSearch(object):
    """Thin wrapper that collects queries in a ``BaseMultiSearch``.

    ``index`` is expected to expose ``_meta.index``; when given, that value
    is passed to ``BaseMultiSearch`` as the index.
    """

    def __init__(self, index=None, queries=None):
        """Create the wrapper and optionally add the initial ``queries``."""
        self.index = index
        if index:
            index_name = self.index._meta.index
        else:
            index_name = None
        self._queries = BaseMultiSearch(index=index_name)

        if queries:
            self.add(*queries)

    # NOTE(review): raw/filter/query start from a fresh ``Search()`` and do
    # not use ``self.index`` or the collected queries -- confirm intended.
    def raw(self, raw_dict):
        """Return ``Search().raw(raw_dict)``."""
        return Search().raw(raw_dict)

    def filter(self, *args, **kw):
        """Return ``Search().filter(...)`` with the given arguments."""
        return Search().filter(*args, **kw)

    def query(self, *args, **kw):
        """Return ``Search().query(...)`` with the given arguments."""
        return Search().query(*args, **kw)

    def add(self, *queries):
        """Add every given query to the underlying multi-search, in place."""
        for one_query in queries:
            self._queries = self._queries.add(one_query)

    def execute(self):
        """Execute the collected queries and return their responses."""
        return self._queries.execute()

    def __iter__(self):
        """Iterate over the responses; this executes the queries."""
        return iter(self.execute())

    def __len__(self):
        """Return ``len()`` of the underlying multi-search."""
        return len(self._queries)
Пример #10
0
def run_searches(es, index, searches):
    """Run a list of Elasticsearch searches. MultiSearch is used internally.

    Args:
        es (Elasticsearch): Elasticsearch connection.
        index (str): Name of the index the queries should run against.
        searches (list): List of searches, of type Search.

    Raises:
        DataConnectionException: if an error occurred while running the
            searches. The underlying Elasticsearch exception is attached as
            the cause.

    Returns:
        list: List of results; each result is the list of 'hits' (documents
            found) of one search, converted to dicts.

    """
    ms = MultiSearch(index=index, using=es)

    for search in searches:
        ms = ms.add(search)

    try:
        responses = ms.execute(raise_on_error=True)
    except elasticsearch.ElasticsearchException as e:
        # Chain the original error so the root cause is not lost.
        raise DataConnectionException() from e

    return [[hit.to_dict() for hit in response.hits]
            for response in responses]
def select_fields(all_fields, search, number_of_groups):
    '''
    Pick the field groups from the given Fields object that are most common in the given search.
    Only groups that are forced, or whose search has at least one hit, are selected.

    :param all_fields: a Fields object
    :param search: an elasticsearch-dsl search object limiting the records that are considered
    :param number_of_groups: the number of groups to select; also used as the chunk size when
                             running the group searches
    :return: a list of groups, each group is a dict containing:
                - "group" - the group name
                - "count" - the number of resources its fields appear in
                - "records" - the number of records the group's fields appear in
                - "fields" - the fields that make up the group along with the resource ids they
                             come from
                - "forced" - whether the group was forced into the selection or was included
                             organically
    '''
    selected_fields = []
    # only the counts matter here, so ask for no hits at all
    search = search.extra(size=0)

    # walk over the (group, search) pairs one chunk at a time
    for chunk in chunk_iterator(all_fields.get_searches(search),
                                chunk_size=number_of_groups):
        groups, group_searches = zip(*chunk)
        # run all the searches of this chunk in a single multisearch
        multisearch = MultiSearch(using=common.ES_CLIENT)
        for group_search in group_searches:
            multisearch = multisearch.add(group_search)

        responses = multisearch.execute()
        for (group, count, fields), response in zip(groups, responses):
            forced = all_fields.is_forced(group)
            if not forced and not response.hits.total > 0:
                # neither forced nor present in the search results, skip it
                continue
            selected_fields.append({
                'group': group,
                'count': count,
                'records': response.hits.total,
                'fields': fields,
                'forced': forced,
            })

        if len(selected_fields) >= number_of_groups:
            break

    def sort_key(entry):
        # forced groups sort first and all share the same key so that their relative order is
        # kept; the rest are ordered by count and then by number of records
        if entry[u'forced']:
            return True, 0, 0
        return False, entry[u'count'], entry[u'records']

    # highest key first: forced groups, then by count, then by records
    selected_fields.sort(key=sort_key, reverse=True)
    return selected_fields
Пример #12
0
def multi_search(searchs):
    """Execute the given searches against "log-index" in one multi-search.

    :param searchs: iterable of search objects to run
    :return: the responses returned by executing the multi-search
    """
    batch = MultiSearch(using=conn, index="log-index")
    for one_search in searchs:
        batch = batch.add(one_search)
    return batch.execute()
Пример #13
0
    def get_multi_search(self):
        """Build the multi-search for this query.

        It contains the search returned by ``get_search``; when the
        aggregations argument is set, ``add_terms_aggregations`` is applied
        to it and its result is returned instead.
        """
        multi_search = MultiSearch().add(self.get_search())

        if self.args.get(constants.PARAM_AGGREGATIONS) is None:
            return multi_search
        return self.add_terms_aggregations(multi_search)
Пример #14
0
def multi_search(request):
	"""Render the search page for the ``q`` GET parameter.

	When ``q`` is given, three match searches (on ``author``, ``title`` and
	``json_object``) are run against "esdocument-index" in one multi-search
	and the ``title`` of every hit is collected. Without ``q`` the template
	receives the string 'empty' as ``responses`` and an empty ``hits`` list.
	"""
	client = Elasticsearch()
	q = request.GET.get('q')
	# Defined up front so the template context is valid without a query;
	# otherwise the render() call below would hit an unbound local.
	hits = []
	if q:
		ms = MultiSearch(using=client, index="esdocument-index")
		ms = ms.add(Search().query("match", author=q))
		ms = ms.add(Search().query("match", title=q))
		ms = ms.add(Search().query("match", json_object=q))
		responses = ms.execute()
		for response in responses:
			for hit in response:
				hits.append(hit.title)
	else:
		responses = 'empty'

	return render(request, 'elasticsearchapp/search.html',
		{'responses': responses, 'hits': hits})
Пример #15
0
def es_create_result_csv_bulk(name, index, result_size=200, batch_size=1000):
    """Write the rankings of one query per document id to a CSV file.

    ``create_mlt_with_id`` is called for every integer id in
    ``range(index_size)``, where ``index_size`` is the document count of
    ``index``. The searches are executed in multi-searches of at most
    ``batch_size`` searches, including the final partial batch.

    The result is written to ``{faiss_path}/{name}/search_rankings.csv``
    (the directory is recreated if it already exists), one row per id:
    the id followed by ``"<hit id> (<score>)"`` for each hit. The elapsed
    time is appended to ``./datasets/elasticsearch_{name}_timing``.

    :param name: name used for the output directory and the timing file
    :param index: index to query
    :param result_size: passed through to ``create_mlt_with_id``
    :param batch_size: maximum number of searches per multi-search
    :return: elapsed time in seconds
    """
    start_time = time.time()
    index_size = Search(index=index).count()
    results = []
    # A single loop covers the full batches and the trailing partial one, so
    # every batch queries and labels its own ids.
    for batch_start in range(0, index_size, batch_size):
        batch_end = min(batch_start + batch_size, index_size)
        print(f'generating results number {batch_start} to {batch_end}')
        multisearch = MultiSearch(index=index)
        for item in range(batch_start, batch_end):
            multisearch = multisearch.add(
                create_mlt_with_id(item, index, result_size))
        responses = multisearch.execute()
        # Responses come back in the order the searches were added.
        for index_id, response in enumerate(responses, start=batch_start):
            results.append(
                [str(index_id)] +
                [f'{hit.meta.id} ({hit.meta.score})' for hit in response])
    try:
        os.mkdir(f'{faiss_path}/{name}/')
    except FileExistsError:
        print('directory already exists and I am just deleting it.')
        shutil.rmtree(f'{faiss_path}/{name}/')
        os.mkdir(f'{faiss_path}/{name}/')
    with open(f'{faiss_path}/{name}/search_rankings.csv', 'w',
              newline='') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        for line in results:
            wr.writerow(line)
    stop_time = time.time() - start_time
    with open(f'./datasets/elasticsearch_{name}_timing', 'a') as f:
        f.write(f'time for generating es results for {name}: {stop_time}\n')
    return stop_time
Пример #16
0
def run_multiple_filters():
    """Poll the "book" index forever with an ask and a bid price search.

    Every iteration builds both searches, runs them as one multi-search,
    prints the length and total hit count of each response, shows the
    results of both filters and then sleeps for five seconds. The function
    never returns.
    """
    while True:
        index = "book"

        ask_price_filter = AskPriceFilter(index)
        ask_search = ask_price_filter.main_query(
            gt_price=50, lt_price=52, from_range=10, to_range=20
        )
        bid_price_filter = BidPriceFilter(index)
        bid_search = bid_price_filter.main_query(
            gt_price=50, lt_price=51, to_range=15
        )

        ms = MultiSearch(index=index).add(ask_search).add(bid_search)

        # execute() gives one response per added search, in the same order
        for resp in ms.execute():
            print(len(resp))
            print(resp.hits.total.value)

        ask_price_filter.show_result(from_range=0, to_range=5)
        bid_price_filter.show_result(from_range=0, to_range=5)

        time.sleep(5)
Пример #17
0
    def build(self, q=None, **options):
        """
        Build a query from q and options.
        This is the public entry point used by the API handlers.

        Scopes:
            scopes: [str] nonempty -> match query.
            scopes: NoneType or [] -> no scope, query string query.

        Supported options:
            explain: include es scoring information
            userquery: customized function to interpret q

        * any additional keyword is passed through as an es keyword,
            for example: 'explain', 'version' ...

        * when q is a list a multi-search is built: every query is built
            on its own and all of them are sent in one request.

        Raises ValueError when building hits an IllegalOperation, and
        RawQueryInterrupt (carrying the query dict) when the 'rawquery'
        option is set.
        """
        options = dotdict(options)

        # a scroll id skips every query building stage
        if options.scroll_id:
            return ESScrollID(options.scroll_id)

        # fetch_all conflicts with sort and size, so drop them
        if options.fetch_all:
            for conflicting in ('sort', 'size'):
                options.pop(conflicting, None)

        try:
            # a single q ('val' or 'key:val', str, int ...) is built directly,
            # a list of q(s) becomes a multi-search.
            if not isinstance(q, list):
                search = self._build_one(q, options)
            else:
                search = MultiSearch()
                for single_q in q:
                    search = search.add(self._build_one(single_q, options))

        except IllegalOperation as exc:
            raise ValueError(str(exc))  # ex. sorting by -_score

        if options.get('rawquery'):
            raise RawQueryInterrupt(search.to_dict())

        return search
Пример #18
0
def multisearch(*models, **params):
    """Search several models in one multi-search request.

    :param models: the models to search; ``search_for`` is called for each
    :param params: keyword arguments passed through to ``search_for``
    :return: list with one ``SearchResult`` per model, in the given order
    """
    queries = [search_for(model, **params) for model in models]

    ms = MultiSearch(using=es.client, index=es.index_name)
    for query in queries:
        ms = ms.add(query._s)

    results = []
    for query, response in zip(queries, ms.execute()):
        # _d_ is the only way to access the raw data, which allows
        # rewrapping the response in a FacetedSearch, because the default
        # multisearch loses the facets
        results.append(SearchResult(query, response._d_))
    return results
def test_multi_search(data_client):
    """A multi-search returns one response per search, bound to its search."""
    repo_search = Repository.search()
    flat_git_search = Search(index='flat-git')

    ms = MultiSearch().add(repo_search).add(flat_git_search)
    repo_response, flat_git_response = ms.execute()

    # the document search yields a single, typed Repository hit
    assert len(repo_response) == 1
    assert isinstance(repo_response[0], Repository)
    assert repo_response._search is repo_search

    # the plain index search reports its total hit count
    assert flat_git_response.hits.total == 52
    assert flat_git_response._search is flat_git_search
Пример #20
0
def test_multi_search(data_client):
    """A multi-search returns one response per search, bound to its search."""
    repo_search = Repository.search()
    flat_git_search = Search(index="flat-git")

    ms = MultiSearch().add(repo_search).add(flat_git_search)
    repo_response, flat_git_response = ms.execute()

    # the document search yields a single, typed Repository hit
    assert len(repo_response) == 1
    assert isinstance(repo_response[0], Repository)
    assert repo_response._search is repo_search

    # the plain index search reports its total through hits.total.value
    assert flat_git_response.hits.total.value == 52
    assert flat_git_response._search is flat_git_search
Пример #21
0
def test_multi_search(data_client):
    """A multi-search on 'git' returns one response per search added."""
    repo_search = Repository.search()
    commit_search = Search(doc_type='commits')

    ms = MultiSearch(index='git').add(repo_search).add(commit_search)
    repo_response, commit_response = ms.execute()

    # the document search yields a single, typed Repository hit
    assert len(repo_response) == 1
    assert isinstance(repo_response[0], Repository)
    assert repo_response.search is repo_search

    # the doc_type search reports its total hit count
    assert commit_response.hits.total == 52
    assert commit_response.search is commit_search
Пример #22
0
    def get(self, request, *args, **kwargs):
        """Return categories and restaurants matching the ``q`` parameter.

        With a query, a category search (on ``label``) and a restaurant
        search (on ``name`` or the nested ``categories.label``, filtered by
        geo distance around the user's coordinates) run as one multi-search
        and their raw hits are concatenated. Without a query, the first ten
        categories are returned with no source fields.
        """
        query = request.GET.get('q')
        coords = [
            request.GET.get(param)
            for param in ('latitude', 'longitude', 'radius')
        ]
        latitude, longitude, radius = get_user_coordinates(coords, request)
        ms = MultiSearch(index=['restaurants', 'categories'])

        if not query:
            # no search text: just list the first ten categories
            cs = CategoryDocument.search().source([])[0:10]
            hits = cs.execute()['hits']['hits']
            return Response([hit.to_dict() for hit in hits])

        cs = CategoryDocument.search().query("query_string",
                                             query=query,
                                             default_field="label")

        name_match = Q('query_string', query=query, default_field='name')
        category_match = Q('nested',
                           path='categories',
                           query=Q('query_string',
                                   query=query,
                                   default_field='categories.label'))
        rs = RestaurantDocument.search().filter(
            'geo_distance',
            distance='%smi' % radius,
            location={"lat": latitude, "lon": longitude},
        ).query(name_match | category_match)

        responses = ms.add(cs).add(rs).execute()

        aggregate = []
        for response in responses:
            aggregate.extend(
                hit.to_dict() for hit in response['hits']['hits'])

        return Response(aggregate)
Пример #23
0
    def query(self, queries, size, record_fnum):
        """Run one search per query entry and return padded result rows.

        :param queries: iterable of entries; element 0 is matched against
            ``userid`` and element 1 against ``record``
        :param size: number of hits requested per search and the minimum
            number of rows returned per search
        :param record_fnum: length of the all-zero rows used as padding
        :return: list with one list of rows per search; each hit row is the
            int ``userid`` followed by the comma-separated ``record`` values
            as ints
        """
        ms = MultiSearch(using=self.es, index=self.index_name)
        for entry in queries:
            search = Search().query("match", userid=entry[0])
            search = search.query("match", record=entry[1])
            ms = ms.add(search[:size])

        res_batch = []
        for response in ms.execute():
            rows = [
                [int(hit.userid)] + [int(v) for v in hit.record.split(',')]
                for hit in response
            ]
            missing = size - len(rows)
            if missing > 0:
                # pad with zero rows up to `size` (the same list object is
                # repeated for every padding row)
                zero_row = np.zeros([record_fnum,]).astype(np.int32).tolist()
                rows += [zero_row] * missing
            res_batch.append(rows)
        return res_batch
        
Пример #24
0
    def get_queryset(self, queryset, data):
        """Build a multi-search with one search per requested model.

        Models come from the comma-separated ``data['models']`` or default
        to ``self._supported_models``. For completion models (when the
        'S39_filter_by_geodata.be' flag is enabled) a suggest query is
        executed first and the model is then searched by the suggested ids;
        every other model gets a bool/should query over 'title' and 'notes'
        limited to ``per_model`` results.

        :param queryset: not used by this method
        :param data: dict-like with 'q' and optionally 'models', 'advanced'
            and 'per_model'
        :return: the MultiSearch, not yet executed
        """
        phrase = data.get('q')

        if 'models' in data:
            models = data['models'].split(',')
        else:
            models = self._supported_models

        op, suffix = get_advanced_options(data.get('advanced'))
        lang = get_language()

        per_model = data.get('per_model', 1)
        common_alias = settings.ELASTICSEARCH_COMMON_ALIAS_NAME
        ms = MultiSearch(index=common_alias)

        for model in models:
            use_completion = is_enabled('S39_filter_by_geodata.be') \
                and model in self._completion_models
            if use_completion:
                # ask the model's own index for title suggestions first ...
                completion = {
                    'field': f'title.{lang}.suggest',
                    'size': per_model,
                }
                sug_query = Search(index=f'{model}s').suggest(
                    'title', phrase, completion=completion)
                suggestions = sug_query.execute().suggest['title'][0]
                ids = [sug['_id'] for sug in suggestions['options']]
                # ... then fetch exactly those documents from the common alias
                query = Search(index=common_alias)
                query = query.filter('term', model=model)
                query = query.query('ids', values=ids)
            else:
                should_clauses = [
                    nested_query_with_advanced_opts(
                        phrase, field, lang, op, suffix)
                    for field in ('title', 'notes')
                ]
                query = Search(index=common_alias)
                query = query.filter('term', model=model)
                query = query.query('bool', should=should_clauses)
                query = query.extra(size=per_model)
            ms = ms.add(query)

        return ms
Пример #25
0
    def build(self, q, **options):
        """Build a search, with special handling for graph queries.

        A ``GraphQuery`` is built with ``build_graph_query``; a
        ``GraphQueries`` collection becomes a MultiSearch with one graph
        search per contained query; anything else is delegated to the
        parent class.
        """
        # NOTE: graph query customization

        # single graph query
        if isinstance(q, GraphQuery):
            return self.build_graph_query(q, **options)

        # several graph queries
        if isinstance(q, GraphQueries):
            search = MultiSearch()
            for graph_query in q:
                graph_search = self.build_graph_query(graph_query, **options)
                search = search.add(graph_search)
            return search

        # not a graph query
        return super().build(q, **options)
Пример #26
0
    def _get_parsed_data(self):
        """Run the stored queries and return their raw and parsed results.

        The queries in ``self.queries`` (a JSON list of query dicts) are
        executed as one multi-search against the index and client of the
        ``ElasticsearchSource`` named by ``self.source.name``. When the
        source sets ``max_concurrent_searches`` it is passed as a request
        parameter.

        :return: dict with
            - "raw": list of the response dicts, in order
            - "data": parsed aggregation data of every response with hits
            - "error": True if any exception was raised while executing or
              parsing; "error_code" and "error_message" are then set as well
        """
        # Error will be set to true if we encounter an error
        parsed_data = dict(raw=[], error=False, data=[])
        source = ElasticsearchSource.objects.get(name=self.source.name)
        multisearch = MultiSearch()

        if source.max_concurrent_searches is not None:
            # params() returns a modified copy rather than changing the
            # object in place, so the result has to be kept.
            multisearch = multisearch.params(
                max_concurrent_searches=source.max_concurrent_searches)

        for query in json.loads(self.queries):
            multisearch = multisearch.add(
                Search.from_dict(query).params(ignore_unavailable=True,
                                               allow_no_indices=True))

        try:
            responses = multisearch.using(source.client).index(
                source.index).execute()

            for response in responses:
                raw_data = response.to_dict()
                parsed_data['raw'].append(raw_data)

                # Responses without any hit are recorded but not parsed.
                if raw_data['hits']['hits'] == []:
                    continue

                self._check_response_size(raw_data)

                data = self._parse_es_response([raw_data['aggregations']])
                if data == []:
                    continue

                parsed_data['data'].extend(data)

        except Exception as e:
            # Deliberately broad: any failure is reported through the result
            # dict instead of being raised to the caller.
            logger.exception(
                'Error executing Elasticsearch queries: {}'.format(
                    self.queries))
            parsed_data['error_code'] = type(e).__name__
            parsed_data['error_message'] = six.text_type(e)
            parsed_data['error'] = True

        return parsed_data
Пример #27
0
    def execute_searches(self):
        """Run the query of every added series and initialise the ``data``
        and ``count`` attributes from the responses.

        Raises QueryError when no series has been added.
        """
        if not self.series:
            raise QueryError(strings.EMPTY_QUERY_ERROR)

        multi_search = MultiSearch(index=self.index,
                                   doc_type=settings.TS_DOC_TYPE,
                                   using=self.elastic)

        for serie in self.series:
            # collapse each series to the requested periodicity before it
            # is added to the request
            serie.add_collapse(self.args[constants.PARAM_PERIODICITY])
            multi_search = multi_search.add(serie.search)

        responses = multi_search.execute()
        self.data = ResponseFormatter(
            self.series, responses, self.args).format_response()

        # count is the largest total hit count among the responses
        self.count = max(response.hits.total for response in responses)
Пример #28
0
    def execute_searches(self):
        """Run the query of every added series and return the formatted
        data together with the count taken from the responses.

        Returns a dict with the keys 'data' (the formatted response) and
        'count' (the largest total hit count among the responses).
        """
        multi_search = MultiSearch(index=self.index,
                                   doc_type=settings.TS_DOC_TYPE)
        for serie in self.series:
            multi_search = multi_search.add(serie.search)

        responses = multi_search.execute()

        sort = self.args[constants.PARAM_SORT]
        periodicity = self.args[constants.PARAM_PERIODICITY]
        formatter = ResponseFormatter(self.series, responses, sort,
                                      periodicity)

        data = formatter.format_response()
        count = max(response.hits.total for response in responses)
        return {'data': data, 'count': count}
def calculate_field_counts(request, es_client):
    '''
    Count, for every resource in a download request, how many records matched by the request's
    search have a value for each field.

    :param request: the DownloadRequest object
    :param es_client: the elasticsearch client to use
    :return: a dict of resource ids -> fields -> counts
    '''
    counts_by_resource = defaultdict(dict)

    for resource_id, version in request.resource_ids_and_versions.items():
        index_name = prefix_resource(resource_id)

        # the mapping lists the fields of every version of the resource, so it only tells us
        # which fields to ask about; the searches below determine what is actually present at
        # the requested version
        mapping = es_client.indices.get_mapping(index_name)[index_name]

        # one search per field goes into this multisearch
        batch = MultiSearch(using=es_client, index=index_name)

        # the request's search restricted to this index and version; size=0 because only the
        # hit totals are read
        versioned_search = Search.from_dict(request.search)
        versioned_search = versioned_search.index(index_name).using(es_client)
        versioned_search = versioned_search.extra(size=0)
        versioned_search = versioned_search.filter(create_version_query(version))

        # field names in dot notation (nested fields included)
        field_names = []
        for parts, _config in iter_data_fields(mapping):
            field_names.append(u'.'.join(parts))

        # queue a search that matches only the documents holding a value for the field
        for name in field_names:
            has_value = versioned_search.filter(u'exists', field=prefix_field(name))
            batch = batch.add(has_value)

        # responses come back in the order the searches were added
        for name, response in zip(field_names, batch.execute()):
            counts_by_resource[resource_id][name] = response.hits.total

    return counts_by_resource
Пример #30
0
def test_multi_missing(data_client):
    """A multi-search that includes a missing index raises by default and
    yields ``None`` for that search when ``raise_on_error`` is False."""
    repo_search = Repository.search()
    commit_search = Search(doc_type='commits')
    missing_search = Search(index='does_not_exist')

    multi = MultiSearch().add(repo_search).add(commit_search).add(missing_search)

    with raises(TransportError):
        multi.execute()

    responses = multi.execute(raise_on_error=False)
    repo_response, commit_response, missing_response = responses

    # the repository search returns its single document, typed as Repository
    assert len(repo_response) == 1
    assert isinstance(repo_response[0], Repository)
    assert repo_response.search is repo_search

    # the commit search is unaffected by the failing one
    assert commit_response.hits.total == 52
    assert commit_response.search is commit_search

    # the search against the missing index has no response
    assert missing_response is None
def test_multi_missing(data_client):
    """A multi-search that includes a missing index raises by default and
    yields ``None`` for that search when ``raise_on_error`` is False."""
    repo_search = Repository.search()
    git_search = Search(index='flat-git')
    missing_search = Search(index='does_not_exist')

    multi = MultiSearch().add(repo_search).add(git_search).add(missing_search)

    with raises(TransportError):
        multi.execute()

    responses = multi.execute(raise_on_error=False)
    repo_response, git_response, missing_response = responses

    # the repository search returns its single document, typed as Repository
    assert len(repo_response) == 1
    assert isinstance(repo_response[0], Repository)
    assert repo_response._search is repo_search

    # the 'flat-git' search is unaffected by the failing one
    assert git_response.hits.total == 52
    assert git_response._search is git_search

    # the search against the missing index has no response
    assert missing_response is None
Пример #32
0
    def execute_queries(self, queries: Dict[Resource, Q],
                        page_index: int,
                        results_per_page: int) -> List[Response]:
        """Run one paginated search per resource in a single multi-search.

        :param queries: the query to run for each resource
        :param page_index: index of the page to fetch; page 0 starts at the
            first result
        :param results_per_page: number of results in a page
        :return: the multi-search responses, or an empty list when executing
            the multi-search raised
        """
        batch = MultiSearch(using=self.elasticsearch)

        for resource, resource_query in queries.items():
            index_name = self.get_index_for_resource(resource_type=resource)
            resource_search = Search(index=index_name).query(resource_query)
            # Logged before slicing, so the logged body has no pagination.
            LOGGER.info(resource_search.to_dict())

            # pagination
            first = page_index * results_per_page
            last = results_per_page * (page_index + 1)
            batch = batch.add(resource_search[first:last])

        try:
            return batch.execute()
        except Exception as e:
            LOGGER.error(f'Failed to execute ES search queries. {e}')
            return []
Пример #33
0
    def run(self, network, channels, query, author=None, date_range=None):
        """Search for lines matching ``query`` and return them with context.

        Lines of type 'normal' or 'action' on ``network`` in any of
        ``channels`` are matched against ``query`` inside ``date_range``,
        sorted by descending date and capped at 10000 matches.  For every
        match, the lines whose number is within ``config.SEARCH_CONTEXT`` of
        it (same network, channel and date) are fetched in one multi-search.

        :param network: network the lines must belong to
        :param channels: channels to search in
        :param query: text matched against the ``text`` field
        :param author: not used by this method
        :param date_range: ``(date_begin, date_end)`` pair; the lower bound
            is exclusive and the upper bound inclusive
        :return: a list of lists of ``Hit`` objects, one inner list per run
            of consecutive matches that share a date; empty when nothing
            matched
        """
        # We don't support non-ajax, so will always have date range
        assert date_range
        date_begin, date_end = date_range

        result = Search(
            using=self.es, index='moffle',
        ).query(
            "match", text=query,
        ).query(
            "range", date={
                'gt': date_begin.strftime('%Y%m%d'),
                'lte': date_end.strftime('%Y%m%d'),
            },
        ).filter(
            "terms", line_type=['normal', 'action'],
        ).filter(
            "term", network=network,
        ).filter(
            "terms", channel=channels,
        ).sort(
            "-date",
        )[:10000].execute()

        matches = list(result)
        if not matches:
            # Elasticsearch rejects a multi-search that contains no
            # searches, so there is no context to fetch and nothing to
            # return.
            return []

        # TODO: interval merging
        ctx_search = MultiSearch(using=self.es, index='moffle')
        for match in matches:
            # Fetch context
            ctx_search = ctx_search.add(Search(
                using=self.es,
                index='moffle',
            ).query(
                "range", line_no={
                    "gte": match.line_no - config.SEARCH_CONTEXT,
                    "lte": match.line_no + config.SEARCH_CONTEXT,
                },
            ).filter(
                "term", network=match.network,
            ).filter(
                "term", channel=match.channel,
            ).filter(
                "term", date=match.date,
            ).sort(
                "line_no",
            ))

        # Context responses come back in the order the searches were added,
        # so they pair up with the matches.
        ctx_results = ctx_search.execute()

        hits = []
        for match, ctx_result in zip(matches, ctx_results):
            lines = [
                self._format_line(
                    ctx_hit,
                    is_hit=(match.line_no == ctx_hit.line_no),
                )
                for ctx_hit in ctx_result
            ]
            hits.append(Hit(
                channel=match.channel,
                date=match.date,
                begin=lines[0].line_no,
                lines=lines,
            ))

        # groupby only merges neighbours; the matches arrive sorted by date.
        return [list(group) for _, group in groupby(hits, key=lambda hit: hit.date)]
Пример #34
0
    def batch_request(cls, names):
        """
        Map all name fragments in the array to name hashes.

        Takes an array of arrays (names are tokenized) and returns
        hashes and labels from ES.

        :param names: a sequence of names, each one a sequence of chunks
        :return: one list per name, holding one list per chunk; a chunk's
            list has a dict with "term", "lemma" and "label" for every
            match, or a single "no-match" dict with a salted SHA-1 of the
            chunk as its lemma when nothing matched
        """
        # TODO: THROW IT AWAY AND REPLACE WITH DAWG
        def search_clause(term):
            # TODO: case for initials
            return cls.search().filter("term", term=term)

        def transform_resp(resp):
            # Every hit is expected to carry exactly one label besides the
            # generic "lemma" one.
            labels = list(set(resp.lemma_labels) - {"lemma"})
            assert len(labels) == 1

            # The "-typo" labels map to the same label as their base form.
            label = {
                "lemma-firstname": "firstname",
                "lemma-patronymic": "patronymic",
                "lemma-lastname": "lastname",
                "lemma-firstname-typo": "firstname",
                "lemma-patronymic-typo": "patronymic",
                "lemma-lastname-typo": "lastname"
            }[labels[0]]

            return {
                "term": resp.term,
                "lemma": resp.lemma,
                "label": label
            }

        def match_req_resp(name, hashes):
            res = []

            for chunk, resp in zip(name, hashes):
                if resp:
                    res.append(list(map(transform_resp, resp)))
                else:
                    res.append([{
                        "lemma": sha1((chunk + "thisissalt").encode('utf-8')).hexdigest(),
                        "label": "no-match",
                        "term": chunk
                    }])
            return res

        # Elasticsearch rejects a multi-search that contains no searches, so
        # when there is not a single chunk to look up, answer without
        # querying: every name maps to an empty list.
        if not any(len(name) for name in names):
            return [[] for _ in names]

        qs = MultiSearch(index=cls._doc_type.index)
        for name in names:
            for chunk in name:
                qs = qs.add(search_clause(chunk))

        response = qs.execute()
        results = []

        # Responses come back flat, in the order the chunks were queued;
        # slice them back into per-name groups.
        pos = 0
        for name in names:
            size = len(name)
            results.append(match_req_resp(name, response[pos:pos + size]))
            pos += size

        return results
Пример #35
0
    def _execute_multi_search(self, page, num_results):
        """Run one multi-search over the family indices and return a page of results.

        Indices whose recorded 'loaded' count has reached their recorded 'total' are
        skipped. New hits are merged with results kept in ``self.previous_search_results``,
        sorted by their '_sort' value and de-duplicated. The per-index counts,
        'total_results' and (on the non-compound-het path) 'all_results' stored in
        ``self.previous_search_results`` are updated along the way.

        :param page: page number to load (the arithmetic below treats it as 1-based --
            TODO confirm against callers)
        :param num_results: number of results per page
        :return: the first ``num_results`` sorted, de-duplicated variants, or the result
            of ``_process_compound_hets`` when compound het or grouped results are present
        """
        indices = self.samples_by_family_index.keys()

        # make sure the per-index bookkeeping dict exists before it is read below
        if not self.previous_search_results.get('loaded_variant_counts'):
            self.previous_search_results['loaded_variant_counts'] = {}

        ms = MultiSearch()
        for index_name in indices:
            start_index = 0
            if self.previous_search_results['loaded_variant_counts'].get(index_name):
                index_total = self.previous_search_results['loaded_variant_counts'][index_name]['total']
                start_index = self.previous_search_results['loaded_variant_counts'][index_name]['loaded']
                # everything from this index has already been loaded, nothing left to fetch
                if start_index >= index_total:
                    continue
            else:
                # first time this index is seen: start its counters at zero
                self.previous_search_results['loaded_variant_counts'][index_name] = {'loaded': 0, 'total': 0}

            searches = self._get_paginated_searches(index_name, page, num_results, start_index=start_index)
            # NOTE(review): the index is set on the shared MultiSearch once per queried
            # index -- confirm how this combines with the index of the individual searches
            ms = ms.index(index_name)
            for search in searches:
                ms = ms.add(search)

        responses = self._execute_search(ms)

        new_results = []
        compound_het_results = self.previous_search_results.get('compound_het_results', [])
        for response in responses:
            response_hits, response_total, is_compound_het = self._parse_response(response)
            if not response_total:
                continue

            # the index a response belongs to is read from its first hit
            # (presumably a non-zero total implies at least one hit -- verify)
            index_name = response.hits[0].meta.index
            if is_compound_het:
                # compound het totals are tracked under a separate '<index>_compound_het' key
                compound_het_results += response_hits
                self.previous_search_results['loaded_variant_counts']['{}_compound_het'.format(index_name)] = {'total': response_total}
            else:
                new_results += response_hits
                self.previous_search_results['loaded_variant_counts'][index_name]['total'] = response_total
                self.previous_search_results['loaded_variant_counts'][index_name]['loaded'] += len(response_hits)

        # overall total is the sum of the totals of every tracked entry, compound het ones included
        self.previous_search_results['total_results'] = sum(counts['total'] for counts in self.previous_search_results['loaded_variant_counts'].values())

        # combine new results with unsorted previously loaded results to correctly sort/paginate
        all_loaded_results = self.previous_search_results.get('all_results', [])
        # number of records that belong to the pages before the requested one
        previous_page_record_count = (page - 1) * num_results
        if len(all_loaded_results) >= previous_page_record_count:
            # keep the records of the earlier pages as they are and re-sort only the remainder
            loaded_results = all_loaded_results[:previous_page_record_count]
            new_results += all_loaded_results[previous_page_record_count:]
        else:
            # not enough loaded records to cover the earlier pages: fall back to the
            # stored 'variant_results'
            loaded_results = []
            new_results += self.previous_search_results.get('variant_results', [])

        new_results = sorted(new_results, key=lambda variant: variant['_sort'])
        variant_results = self._deduplicate_results(new_results)

        if compound_het_results or self.previous_search_results.get('grouped_results'):
            if compound_het_results:
                compound_het_results = self._deduplicate_compound_het_results(compound_het_results)
            return self._process_compound_hets(compound_het_results, variant_results, num_results)
        else:
            # remember everything loaded so far, then hand back a single page
            self.previous_search_results['all_results'] = loaded_results + variant_results
            return variant_results[:num_results]
Пример #36
0
#!/usr/bin/env python

from elasticsearch import Elasticsearch
from elasticsearch_dsl import MultiSearch, Search

# Elasticsearch nodes to connect to.
es_hosts = ['192.168.33.108:9200', '192.168.33.109:9200']
client = Elasticsearch(es_hosts)

# Queue one "match" query on the message field per term and send them all
# to the logstash-* indices in a single multi-search request.
batch = MultiSearch(using=client, index='logstash-*')
for term in ("hello", "hello7"):
    batch = batch.add(Search().query("match", message=term))

# Print the host and message of every hit in every response.
for result in batch.execute():
    for doc in result:
        print(doc['host'], doc['message'])
Пример #37
0
class MultiSearchConductor:
    """Builds per-field aggregation searches for each index and runs them as
    Elasticsearch multi-search requests."""

    def __init__(self):
        # Number of documents in which each field exists, keyed by the field's full path.
        self.field_counts = {}
        # Searches queued for the index that is currently being processed.
        self.multi_search = MultiSearch()

    def query_conductor(self, indices, query_body, elasticsearch, es_url, excluded_fields):
        """
        Run the aggregation searches of every given index.

        :param indices: Comma-separated string of index names.
        :param query_body: Search body the aggregations are attached to; may be empty or None.
        :param elasticsearch: Elasticsearch connection the searches are executed with.
        :param es_url: URL handed to DashboardEsHelper to look up the fields of an index.
        :param excluded_fields: Full paths of the fields that must not be aggregated.
        :return: Dict mapping each index name to the list of its responses as dicts.
        """
        result = {}

        for index in indices.split(','):
            # Start every index with an empty batch. Without this the searches queued for
            # the previous indices would be sent again and their responses would end up
            # in this index's result as well.
            self.multi_search = MultiSearch()

            # Fetch all the fields and their types, then filter the ones we don't want like _texta_id.
            normal_fields, nested_fields = DashboardEsHelper(es_url=es_url, indices=index).get_aggregation_field_data()
            normal_fields, nested_fields = self._filter_excluded_fields(excluded_fields, normal_fields, nested_fields)

            # Attach all the aggregations to Elasticsearch, depending on the fields.
            # Text, keywords get term aggs etc.
            self._normal_fields_handler(normal_fields, index=index, query_body=query_body, elasticsearch=elasticsearch)
            self._texta_facts_agg_handler(index=index, query_body=query_body, elasticsearch=elasticsearch)

            # Send the query towards Elasticsearch and then save it into the result
            # dict under its index's name.
            responses = self.multi_search.using(elasticsearch).execute()
            result[index] = [response.to_dict() for response in responses]

        return result

    def _normal_fields_handler(self, list_of_normal_fields, query_body, index, elasticsearch):
        """
        Record the document count of every field and queue its aggregation search.

        :param list_of_normal_fields: Dicts with the 'type' and 'full_path' of each field.
        :param query_body: Search body the aggregations are attached to; may be empty or None.
        :param index: Name of the index the fields belong to.
        :param elasticsearch: Elasticsearch connection used for the count queries.
        """
        for field_dict in list_of_normal_fields:
            field_type = field_dict['type']
            field_name = field_dict['full_path']
            clean_field_name = self._remove_dot_notation(field_name)

            # The count runs immediately, it is not part of the multi-search.
            search_gateway = elasticsearch_dsl.Search(index=index).using(elasticsearch)
            self.field_counts[field_name] = search_gateway.query("exists", field=clean_field_name).count()

            aggregations = self._aggregations_for_field(field_type, field_name, query_body)
            if not aggregations:
                continue

            search_dsl = self._create_search_object(query_body=query_body, index=index, elasticsearch=elasticsearch)
            for bucket_name, agg_type, agg_params in aggregations:
                search_dsl.aggs.bucket(bucket_name, agg_type, **agg_params)
            self.multi_search = self.multi_search.add(search_dsl)

    @staticmethod
    def _aggregations_for_field(field_type, field_name, query_body):
        """
        List the aggregations to run for a field as (bucket name, aggregation type, parameters).

        :param field_type: Elasticsearch type of the field.
        :param field_name: Full path of the field.
        :param query_body: Search body of the request; text fields are skipped when it is None.
        :return: List of tuples, empty when the field gets no aggregation.
        """
        # Do not play around with the #, they exist to avoid naming conflicts as awkward as they may be.
        # TODO Find a better solution for this.
        if field_type == "text":
            if query_body is None:
                return []
            return [(
                "sigsterms#{0}#text_sigterms".format(field_name),
                'significant_text',
                {'field': field_name, 'filter_duplicate_text': True},
            )]

        if field_type == "keyword":
            return [("sterms#{0}#keyword_terms".format(field_name), 'terms', {'field': field_name})]

        if field_type == "date":
            return [
                (
                    "date_histogram#{0}_month#date_month".format(field_name),
                    'date_histogram',
                    {'field': field_name, 'interval': 'month'},
                ),
                (
                    "date_histogram#{0}_year#date_year".format(field_name),
                    'date_histogram',
                    {'field': field_name, 'interval': 'year'},
                ),
            ]

        # The numeric types share the aggregation and differ only in the bucket name's suffix.
        stats_suffixes = {"integer": "int_stats", "long": "long_stats", "float": "float_stats"}
        if field_type in stats_suffixes:
            bucket_name = "extended_stats#{0}#{1}".format(field_name, stats_suffixes[field_type])
            return [(bucket_name, 'extended_stats', {'field': field_name})]

        return []

    def _texta_facts_agg_handler(self, query_body, index, elasticsearch):
        """
        Queue the search that aggregates the nested 'texta_facts' field: fact names as
        terms and, inside each of them, significant terms of the facts' string values.
        """
        search_dsl = self._create_search_object(query_body=query_body, index=index, elasticsearch=elasticsearch)

        search_dsl.aggs.bucket("nested#texta_facts", 'nested', path='texta_facts') \
            .bucket('sterms#fact_category', 'terms', field='texta_facts.fact', collect_mode="breadth_first") \
            .bucket("sigsterms#significant_facts", 'significant_terms', field='texta_facts.str_val')

        self.multi_search = self.multi_search.add(search_dsl)

    def _filter_excluded_fields(self, excluded_fields, normal_fields, nested_fields):
        """
        Drop the fields whose 'full_path' is listed in excluded_fields.

        :return: Tuple of the filtered normal fields and the filtered nested fields.
        """
        normal_fields = [field for field in normal_fields if field['full_path'] not in excluded_fields]
        nested_fields = [field for field in nested_fields if field['full_path'] not in excluded_fields]
        return normal_fields, nested_fields

    def _remove_dot_notation(self, field_name):
        """
        Keeps only the part of the field name before its first dot
        to avoid potential conflicts in the front end.

        :param field_name: Name of a field inside Elasticsearch, ex article_lead.keyword
        :return: Name of the field up to the first dot, ex article_lead
        """
        # Without a dot the split yields the name itself.
        return field_name.split('.')[0]

    def _create_search_object(self, query_body, index, elasticsearch):
        """
        Create a search on the index that returns neither hits nor document sources.

        :param query_body: Search body to build the search from; an empty search is used when falsy.
        :param index: Name of the index to search.
        :param elasticsearch: Elasticsearch connection, set on the search only when a body is given.
        :return: The search object, ready to have aggregations attached.
        """
        if query_body:
            search = elasticsearch_dsl.Search.from_dict(query_body).index(index).using(elasticsearch)
        else:
            search = elasticsearch_dsl.Search().index(index)
        return search.extra(size=0).source(False)