def mcount_buckets(self, buckets):
    """Count documents per bucket value, grouped by index.

    For every field name in ``buckets`` a nested terms aggregation
    (``_index`` -> field value) is run against all indexes matching
    ``TMUtils.MAP_PREFIX*``; all aggregations are sent in one multi-search.

    Args:
        buckets: Iterable of field names to aggregate on.

    Returns:
        dict: ``{index_key: {bucket_name: {bucket_value: doc_count}}}`` where
        ``index_key`` is the index name with a leading ``TMUtils.MAP_PREFIX``
        removed. Empty when ``buckets`` is empty.
    """
    # Materialise once: the names are iterated twice (build + zip below).
    buckets = list(buckets)
    # Elasticsearch rejects a multi-search request that contains no searches.
    if not buckets:
        return {}

    ms = MultiSearch(using=self.es)
    for bucket_name in buckets:
        search = Search(using=self.es, index="{}*".format(TMUtils.MAP_PREFIX))
        search.aggs.bucket(
            'indexes', 'terms', field='_index', size=999999
        ).bucket(
            'values', 'terms', field=bucket_name, size=999999
        )
        ms = ms.add(search)
    mres = ms.execute()

    prefix = TMUtils.MAP_PREFIX
    lang2buckets = {}
    for bucket_name, res in zip(buckets, mres):
        if not (hasattr(res, "aggregations") and 'indexes' in res.aggregations):
            continue
        for index_bucket in res.aggregations['indexes'].buckets:
            index_key = index_bucket.key
            # Strip the prefix literally; the previous regex-based strip would
            # misbehave if the prefix contained regex metacharacters.
            if prefix and index_key.startswith(prefix):
                index_key = index_key[len(prefix):]
            per_bucket = lang2buckets.setdefault(index_key, {}).setdefault(
                bucket_name, {})
            for value_bucket in index_bucket['values'].buckets:
                per_bucket[value_bucket.key] = value_bucket.doc_count
    return lang2buckets
def _run_multisearch(es, searches):
    """Run a list of Elasticsearch searches using MultiSearch.

    The number of searches sent per request is configurable through
    ``constants.ES_MULTISEARCH_MAX_LEN``.

    Args:
        es (Elasticsearch): Elasticsearch connection.
        searches (list): List of elasticsearch_dsl.Search.

    Raises:
        DataConnectionException: If an error occurred while running the
            searches.

    Returns:
        list: List with the response to each search.

    """
    batch_len = constants.ES_MULTISEARCH_MAX_LEN
    responses = []

    # Split the searches into several batches when needed.
    for start in range(0, len(searches), batch_len):
        batch = MultiSearch(using=es)
        for search in searches[start:start + batch_len]:
            batch = batch.add(search)

        try:
            batch_responses = batch.execute(raise_on_error=True)
        except elasticsearch.ElasticsearchException as e:
            raise DataConnectionException() from e
        responses.extend(batch_responses)

    return responses
class MultiSearch(object):
    """Collects several queries and runs them through ``BaseMultiSearch``.

    Args:
        index: Optional object exposing ``_meta.index``; when given, that
            index name is passed to the underlying ``BaseMultiSearch``.
        queries: Optional iterable of queries added on construction.
    """

    def __init__(self, index=None, queries=None):
        self.index = index
        self._queries = BaseMultiSearch(
            index=self.index._meta.index if index else None)
        for query in queries or []:
            self.add(query)

    def raw(self, raw_dict):
        """Return a new ``Search`` built from ``raw_dict`` (not added here)."""
        return Search().raw(raw_dict)

    def filter(self, *args, **kw):
        """Return a new filtered ``Search`` (not added to this multi-search)."""
        return Search().filter(*args, **kw)

    def query(self, *args, **kw):
        """Return a new ``Search`` with the given query (not added here)."""
        return Search().query(*args, **kw)

    def add(self, *queries):
        """Add one or more queries and return ``self``.

        Returning ``self`` (instead of ``None`` as before) lets callers use
        both ``ms.add(q)`` and the fluent ``ms = ms.add(q)`` form.
        """
        for query in queries:
            self._queries = self._queries.add(query)
        return self

    def execute(self):
        """Run all collected queries and return the underlying result."""
        return self._queries.execute()

    def __iter__(self):
        # Iterating triggers execution of the collected queries.
        return iter(self.execute())

    def __len__(self):
        # NOTE(review): relies on BaseMultiSearch supporting len(); confirm,
        # elasticsearch_dsl.MultiSearch may not define __len__.
        return len(self._queries)
def mget(self, id_langs, return_multiple=False):
    """Fetch documents for many (id, source lang, target lang) triples.

    One search per triple is built with ``self._create_search`` and all of
    them are executed in a single multi-search.

    Args:
        id_langs: Iterable of ``(source_id, source_lang, target_lang)``.
        return_multiple: When false only the first hit of each response is
            kept; when true every hit is appended to the (flat) result list.

    Returns:
        list: Hits (swapped through ``self._swap`` when ``_create_search``
        asked for it), with ``None`` for responses without hits or in an
        invalid state. Triples for which no search could be built contribute
        nothing, so positions may not line up with ``id_langs``.
    """
    if not id_langs:
        return []

    msearch = MultiSearch(using=self.es)
    search_swap = []
    for source_id, source_lang, target_lang in id_langs:
        search, swap = self._create_search(source_id, source_lang, target_lang)
        if search:
            # Sort by update date so in case of multiple segments having the
            # same source, the latest one will be returned
            search = search.sort('-update_date')
            msearch = msearch.add(search)
            search_swap.append(swap)

    # Elasticsearch rejects a multi-search request that contains no searches.
    if not search_swap:
        return []

    responses = msearch.execute()
    results = []
    for res, swap in zip(responses, search_swap):
        try:
            if 'hits' not in res or not res.hits.total:
                results.append(None)
                continue
            for ret_doc in res.hits:
                # Exchange source and target (if needed)
                if swap:
                    ret_doc = self._swap(ret_doc)
                results.append(ret_doc)
                if not return_multiple:
                    break
        except Exception:
            # Raised if Response is in some invalid state (no hits, hits are
            # empty). ``Exception`` rather than a bare except, so that
            # KeyboardInterrupt/SystemExit are not swallowed.
            logging.warning("Invalid Response object: {}".format(
                res.to_dict()))
            results.append(None)
    return results
def mexist(self, src_lang, src_ids):
    """Check, per source id, whether any matching document exists.

    Searches are built with ``self._create_search_mindexes`` against every
    neighbour of ``src_lang`` in ``self.lang_graph`` and executed in
    multi-search batches of ``MEXIST_BATCH_SIZE``.

    Args:
        src_lang: Source language used to look up target languages.
        src_ids: Sequence of source ids to check.

    Returns:
        list: One entry per executed search: ``True``/``False`` depending on
        whether the response has hits, or ``None`` for an invalid response.
        Ids for which no search could be built contribute no entry.
    """
    if not src_ids:
        return []

    tgt_langs = list(self.lang_graph.neighbors(src_lang))
    MEXIST_BATCH_SIZE = 10
    results = []
    for i in range(0, len(src_ids), MEXIST_BATCH_SIZE):
        msearch = MultiSearch(using=self.es)
        has_searches = False
        for source_id in src_ids[i:i + MEXIST_BATCH_SIZE]:
            search = self._create_search_mindexes(source_id, src_lang,
                                                  tgt_langs)
            if search:
                msearch = msearch.add(search)
                has_searches = True

        # Elasticsearch rejects a multi-search request with no searches.
        if not has_searches:
            continue

        responses = msearch.execute()
        for res in responses:
            try:
                results.append(bool('hits' in res and res.hits.total))
            except Exception:
                # Raised if Response is in some invalid state (no hits, hits
                # are empty). ``Exception`` rather than a bare except, so
                # KeyboardInterrupt/SystemExit are not swallowed.
                logging.warning("Invalid Response object: {}".format(
                    res.to_dict()))
                results.append(None)
    return results
def simple_search_public_data(query_text):
    """Search publicly accessible experiments, datasets and datafiles.

    Three searches (experiment title, dataset description, datafile
    filename), each restricted to ``public_access`` 100, are executed in one
    multi-search.

    Args:
        query_text: Text to match.

    Returns:
        dict: ``{"experiments": [...], "datasets": [...], "datafiles": [...]}``
        holding the raw hits as dicts, routed by each hit's ``_index``.
    """
    # Maps an Elasticsearch index name to its key in the result dict.
    result_key_for_index = {
        "dataset": "datasets",
        "experiments": "experiments",
        "datafile": "datafiles",
    }
    result_dict = {k: [] for k in ["experiments", "datasets", "datafiles"]}
    paging = dict(size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE)

    experiment_query = Q("match", title=query_text) & Q(
        "term", public_access=100)
    experiment_search = Search(index='experiments').extra(**paging).query(
        experiment_query)

    dataset_acl = Q("term", **{'experiments.public_access': 100})
    dataset_search = Search(index='dataset').extra(**paging)
    dataset_search = dataset_search.query(
        Q("match", description=query_text))
    dataset_search = dataset_search.query(
        'nested', path='experiments', query=dataset_acl)

    datafile_query = Q("match", filename=query_text) & Q(
        "term", experiments__public_access=100)
    datafile_search = Search(index='datafile').extra(**paging).query(
        datafile_query)

    ms = MultiSearch(index=['experiments', 'dataset', 'datafile'])
    for search in (experiment_search, dataset_search, datafile_search):
        ms = ms.add(search)

    for response in ms.execute():
        for hit in response.hits.hits:
            key = result_key_for_index.get(hit["_index"])
            if key is not None:
                result_dict[key].append(hit.to_dict())
    return result_dict
def _fetch_word_freqs_per_day(
        self,
        dataset_widget: DatasetWidget,
) -> Tuple[Mapping[str, Sequence[int]], Sequence[int], int]:
    """Fetch per-day term counts and document totals for the widget's search.

    One search per day yielded by ``date_range(self._min_date,
    self._max_date)`` is sent in a single multi-search; each asks for the
    ``self._top_n_words`` text-token terms and returns no hits.

    Returns:
        A tuple ``(word_freqs, num_docs, took_msecs)``: ``word_freqs`` maps a
        term to a list with one count per day (0 when absent), ``num_docs``
        is the total hit count per day, and ``took_msecs`` is the wall-clock
        time of the multi-search in milliseconds.
    """
    _LOGGER.debug("Fetching word frequencies per day.")

    helper = SearchHelper(dataset_widget.dataset.type)

    # Template shared by all per-day searches; only the date filter differs.
    template = dataset_widget.set_search(
        Search().extra(size=0, track_total_hits=True))
    template = helper.add_agg_text_tokens_terms(
        template, size=self._top_n_words)

    multi = MultiSearch()
    for day in date_range(self._min_date, self._max_date):
        day_filter = helper.query_date_range(
            gte=day, lt=day + timedelta(days=1))
        multi = multi.add(template.filter(day_filter))

    started = time()
    responses = multi.execute()
    finished = time()
    took_msecs = int((finished - started) * 1000)

    num_days = len(responses)
    word_freqs = defaultdict(lambda: [0] * num_days)
    num_docs = []
    for day_index, response in enumerate(responses):
        num_docs.append(response.hits.total.value)
        for term_bucket in helper.read_agg_text_tokens_terms(response):
            word_freqs[term_bucket.key][day_index] = term_bucket.doc_count

    return word_freqs, num_docs, took_msecs
def run_searches(es, index, searches):
    """Run a list of Elasticsearch searches.

    Internally, MultiSearch is used.

    Args:
        es (Elasticsearch): Elasticsearch connection.
        index (str): Name of the index the queries should run against.
        searches (list): List of searches, of type Search.

    Raises:
        DataConnectionException: If an error occurred while running the
            searches. The original Elasticsearch error is attached as the
            exception's cause.

    Returns:
        list: List of results; each result holds a list of 'hits' (documents
            found) as dicts.

    """
    ms = MultiSearch(index=index, using=es)
    for search in searches:
        ms = ms.add(search)

    # Keep the try body limited to the call that talks to Elasticsearch.
    try:
        responses = ms.execute(raise_on_error=True)
    except elasticsearch.ElasticsearchException as e:
        # Chain explicitly so the root cause survives in tracebacks.
        raise DataConnectionException() from e

    return [[hit.to_dict() for hit in response.hits]
            for response in responses]
def get_object_list(self, request):
    """Search experiments, datasets and datafiles visible to the requester.

    Unauthenticated users get the result of ``simple_search_public_data``.
    For authenticated users each search is restricted to objects that are
    public (``public_access`` 100) or whose ACL entity id equals the user's
    id or the id of one of the user's groups.

    Returns:
        list: A single ``SearchObject`` whose ``hits`` is a dict with the
        keys ``experiments``, ``datasets`` and ``datafiles``.
    """
    user = request.user
    query_text = request.GET.get('query', None)
    if not user.is_authenticated:
        public_hits = simple_search_public_data(query_text)
        return [SearchObject(id=1, hits=public_hits)]

    groups = user.groups.all()
    paging = dict(size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE)

    # Experiments: title match AND (user ACL | public | any group ACL).
    experiment_acl = Q("term", objectacls__entityId=user.id) | \
        Q("term", public_access=100)
    for group in groups:
        experiment_acl = experiment_acl | \
            Q("term", objectacls__entityId=group.id)
    experiment_search = Search(index='experiments').extra(**paging).query(
        Q("match", title=query_text) & experiment_acl)

    # Datasets: description match plus a nested ACL query on experiments.
    dataset_acl = Q("term", **{'experiments.objectacls.entityId': user.id}) | \
        Q("term", **{'experiments.public_access': 100})
    for group in groups:
        dataset_acl = dataset_acl | \
            Q("term", **{'experiments.objectacls.entityId': group.id})
    dataset_search = Search(index='dataset').extra(**paging)
    dataset_search = dataset_search.query(Q("match", description=query_text))
    dataset_search = dataset_search.query(
        'nested', path='experiments', query=dataset_acl)

    # Datafiles: filename match AND (user ACL | public | any group ACL).
    datafile_acl = Q("term", experiments__objectacls__entityId=user.id) | \
        Q("term", experiments__public_access=100)
    for group in groups:
        datafile_acl = datafile_acl | \
            Q("term", experiments__objectacls__entityId=group.id)
    datafile_search = Search(index='datafile').extra(**paging).query(
        Q("match", filename=query_text) & datafile_acl)

    ms = MultiSearch(index=['experiments', 'dataset', 'datafile'])
    for search in (experiment_search, dataset_search, datafile_search):
        ms = ms.add(search)
    results = ms.execute()

    # Maps an Elasticsearch index name to its key in the result dict.
    result_key_for_index = {
        "dataset": "datasets",
        "experiments": "experiments",
        "datafile": "datafiles",
    }
    result_dict = {k: [] for k in ["experiments", "datasets", "datafiles"]}
    for response in results:
        for hit in response.hits.hits:
            key = result_key_for_index.get(hit["_index"])
            if key is not None:
                result_dict[key].append(hit.to_dict())

    return [SearchObject(id=1, hits=result_dict)]
def multi_search(searchs):
    """Run all given searches against "log-index" in one multi-search.

    Uses the module-level ``conn`` connection and returns whatever
    ``MultiSearch.execute()`` returns.
    """
    batch = MultiSearch(using=conn, index="log-index")
    for single_search in searchs:
        batch = batch.add(single_search)
    return batch.execute()
def select_fields(all_fields, search, number_of_groups):
    '''
    Picks the field groups from the given Fields object that are most common across the
    resources. The search parameter limits which records contribute: a group is only selected if
    the search finds at least one record with a value for it, or if the group is forced.

    :param all_fields: a Fields object
    :param search: an elasticsearch-dsl search object
    :param number_of_groups: the chunk size used when querying groups; selection stops once at
                             least this many groups have been selected
    :return: a list of groups, each group is a dict containing:
                - "group" - the group name
                - "count" - the number of resources its fields appear in
                - "records" - the number of records the group's fields appear in
                - "fields" - the fields that make up the group along with the resource ids they
                             come from
                - "forced" - whether the field was forced into being included, or whether it was
                             included organically
    '''
    def sort_key(entry):
        # forced groups sort first and keep their relative order (constant 0, 0 key); the rest
        # are ordered by count and then by number of records
        if entry[u'forced']:
            return True, 0, 0
        return False, entry[u'count'], entry[u'records']

    chosen = []
    # only the hit totals are needed, so ask for no hits at all
    counts_only = search.extra(size=0)

    # walk over the (group, search) pairs a chunk at a time, one multisearch per chunk
    for chunk in chunk_iterator(all_fields.get_searches(counts_only),
                                chunk_size=number_of_groups):
        groups, group_searches = zip(*chunk)

        multisearch = MultiSearch(using=common.ES_CLIENT)
        for group_search in group_searches:
            multisearch = multisearch.add(group_search)

        for (group, count, fields), response in zip(groups, multisearch.execute()):
            forced = all_fields.is_forced(group)
            if forced or response.hits.total > 0:
                # the group is forced or has values in the search result, so select it
                chosen.append(dict(
                    group=group,
                    count=count,
                    records=response.hits.total,
                    fields=fields,
                    forced=forced,
                ))

        if len(chosen) >= number_of_groups:
            break

    return sorted(chosen, key=sort_key, reverse=True)
def es_create_result_csv_bulk(name, index, result_size=200, batch_size=1000):
    """Run a more-like-this search for every document id and dump the rankings.

    Ids ``0 .. count-1`` of ``index`` are queried in multi-search batches of
    ``batch_size`` through ``create_mlt_with_id``. One CSV row per id
    (``id``, then ``"<hit id> (<score>)"`` per hit) is written to
    ``{faiss_path}/{name}/search_rankings.csv``; that directory is recreated
    if it already exists. The elapsed time is appended to
    ``./datasets/elasticsearch_{name}_timing``.

    Args:
        name: Name used for the output directory and the timing file.
        index: Elasticsearch index to query.
        result_size: Passed through to ``create_mlt_with_id``.
        batch_size: Number of searches per multi-search request.

    Returns:
        float: Elapsed time in seconds.
    """
    start_time = time.time()
    index_size = Search(index=index).count()

    results = []
    # A single loop covers full batches and the final partial batch. The
    # former separate remainder branch queried a stale/undefined ``item``
    # instead of the loop index and numbered its rows from the wrong start.
    for batch_start in range(0, index_size, batch_size):
        batch_end = min(batch_start + batch_size, index_size)
        print(f'generating results number {batch_start} to {batch_end}')

        multisearch = MultiSearch(index=index)
        for doc_id in range(batch_start, batch_end):
            multisearch = multisearch.add(
                create_mlt_with_id(doc_id, index, result_size))
        responses = multisearch.execute()

        for index_id, response in enumerate(responses, start=batch_start):
            results.append(
                [str(index_id)] +
                [f'{hit.meta.id} ({hit.meta.score})' for hit in response])

    out_dir = f'{faiss_path}/{name}/'
    try:
        os.mkdir(out_dir)
    except FileExistsError:
        print('directory already exists and I am just deleting it.')
        shutil.rmtree(out_dir)
        os.mkdir(out_dir)

    with open(f'{faiss_path}/{name}/search_rankings.csv', 'w',
              newline='') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerows(results)

    stop_time = time.time() - start_time
    with open(f'./datasets/elasticsearch_{name}_timing', 'a') as f:
        f.write(f'time for generating es results for {name}: {stop_time}\n')
    return stop_time
def test_multi_missing(data_client):
    """A missing index raises by default and yields None with raise_on_error=False."""
    repo_search = Repository.search()
    commit_search = Search(doc_type='commits')
    missing_search = Search(index='does_not_exist')

    ms = MultiSearch()
    for search in (repo_search, commit_search, missing_search):
        ms = ms.add(search)

    with raises(TransportError):
        ms.execute()

    repo_resp, commit_resp, missing_resp = ms.execute(raise_on_error=False)

    assert len(repo_resp) == 1
    assert isinstance(repo_resp[0], Repository)
    assert repo_resp.search is repo_search

    assert commit_resp.hits.total == 52
    assert commit_resp.search is commit_search

    assert missing_resp is None
def test_multi_missing(data_client):
    """A missing index raises by default and yields None with raise_on_error=False."""
    repo_search = Repository.search()
    flat_search = Search(index='flat-git')
    missing_search = Search(index='does_not_exist')

    ms = MultiSearch()
    for search in (repo_search, flat_search, missing_search):
        ms = ms.add(search)

    with raises(TransportError):
        ms.execute()

    repo_resp, flat_resp, missing_resp = ms.execute(raise_on_error=False)

    assert len(repo_resp) == 1
    assert isinstance(repo_resp[0], Repository)
    assert repo_resp._search is repo_search

    assert flat_resp.hits.total == 52
    assert flat_resp._search is flat_search

    assert missing_resp is None
def test_multi_search(data_client):
    """Each response of a multi-search is tied to the search that produced it."""
    repo_search = Repository.search()
    commit_search = Search(doc_type='commits')

    ms = MultiSearch(index='git')
    for search in (repo_search, commit_search):
        ms = ms.add(search)

    repo_resp, commit_resp = ms.execute()

    assert len(repo_resp) == 1
    assert isinstance(repo_resp[0], Repository)
    assert repo_resp.search is repo_search

    assert commit_resp.hits.total == 52
    assert commit_resp.search is commit_search
def get_usernames_for_crawl():
    """Yield usernames that are due for crawling.

    Two searches are run in one multi-search on the 'populars' index:
    documents without a ``last_update`` field, and documents whose
    ``last_update`` is ``lte`` "now-2d". Each search is sliced to its own
    ``count()`` so that every match is requested.
    """
    missing_last_update = Q(
        {"bool": {"must_not": {"exists": {"field": "last_update"}}}})
    never_updated = Search().query(missing_last_update)
    never_updated = never_updated[0:never_updated.count()]

    old_updated = Search().query('range', last_update={"lte": "now-2d"})
    old_updated = old_updated[0:old_updated.count()]

    ms = MultiSearch(index='populars').add(never_updated).add(old_updated)
    for response in ms.execute():
        for hit in response:
            yield hit.username
def multisearch(*models, **params):
    """Search several models at once and return one SearchResult per model.

    A query is built for every model with ``search_for(model, **params)``
    and the underlying searches are executed in a single multi-search.
    """
    queries = [search_for(model, **params) for model in models]

    ms = MultiSearch(using=es.client, index=es.index_name)
    for query in queries:
        ms = ms.add(query._s)
    responses = ms.execute()

    results = []
    for query, response in zip(queries, responses):
        # _d_ is the only way to access the raw data, which lets the response
        # be re-wrapped in a FacetedSearch because the default multisearch
        # loses facets
        results.append(SearchResult(query, response._d_))
    return results
def test_multi_search(data_client):
    """Each response of a multi-search is tied to the search that produced it."""
    repo_search = Repository.search()
    flat_search = Search(index="flat-git")

    ms = MultiSearch()
    for search in (repo_search, flat_search):
        ms = ms.add(search)

    repo_resp, flat_resp = ms.execute()

    assert len(repo_resp) == 1
    assert isinstance(repo_resp[0], Repository)
    assert repo_resp._search is repo_search

    assert flat_resp.hits.total.value == 52
    assert flat_resp._search is flat_search
def test_multi_search(data_client):
    """Each response of a multi-search is tied to the search that produced it."""
    repo_search = Repository.search()
    flat_search = Search(index='flat-git')

    ms = MultiSearch()
    for search in (repo_search, flat_search):
        ms = ms.add(search)

    repo_resp, flat_resp = ms.execute()

    assert len(repo_resp) == 1
    assert isinstance(repo_resp[0], Repository)
    assert repo_resp._search is repo_search

    assert flat_resp.hits.total == 52
    assert flat_resp._search is flat_search
def query(self, queries, size, record_fnum):
    """Look up records for a batch of (userid, record) queries.

    Each query becomes a search matching ``userid`` on ``q[0]`` and
    ``record`` on ``q[1]``, limited to ``size`` hits; all searches run in one
    multi-search on ``self.index_name``.

    Args:
        queries: Iterable of indexable pairs ``(userid, record)``.
        size: Maximum number of hits per query; shorter results are padded
            up to this length.
        record_fnum: Length of each all-zero padding row.

    Returns:
        list: One list per response. Each row is ``[userid, *record ints]``
        (``hit.record`` is split on commas); padding rows are
        ``[0] * record_fnum``.
    """
    ms = MultiSearch(using=self.es, index=self.index_name)
    for q in queries:
        s = Search().query("match", userid=q[0]).query(
            "match", record=q[1])[:size]
        ms = ms.add(s)
    responses = ms.execute()

    res_batch = []
    for response in responses:
        res = [
            [int(hit.userid)] + list(map(int, hit.record.split(',')))
            for hit in response
        ]
        # Build a fresh list per padding row: ``[row] * k`` would make every
        # padding row the same object, so mutating one would change them all.
        res.extend([0] * record_fnum for _ in range(size - len(res)))
        res_batch.append(res)
    return res_batch
def get(self, request, *args, **kwargs):
    """Return matching categories and nearby restaurants for ``?q=``.

    With a query, categories are matched on ``label`` and restaurants on
    ``name`` or nested ``categories.label``, restricted to ``radius`` miles
    around the resolved coordinates; both searches run in one multi-search.
    Without a query, the first 10 categories are returned without source
    fields.
    """
    params = request.GET
    query = params.get('q')
    coords = [params.get(key) for key in ('latitude', 'longitude', 'radius')]
    latitude, longitude, radius = get_user_coordinates(coords, request)

    if not query:
        category_search = CategoryDocument.search().source([])[0:10]
        hits = category_search.execute()['hits']['hits']
        return Response([hit.to_dict() for hit in hits])

    category_search = CategoryDocument.search().query(
        "query_string", query=query, default_field="label")

    name_or_category = Q('query_string', query=query, default_field='name')
    name_or_category |= Q(
        'nested',
        path='categories',
        query=Q('query_string', query=query,
                default_field='categories.label'))
    restaurant_search = RestaurantDocument.search().filter(
        'geo_distance',
        distance='%smi' % radius,
        location={"lat": latitude, "lon": longitude},
    ).query(name_or_category)

    ms = MultiSearch(index=['restaurants', 'categories'])
    ms = ms.add(category_search).add(restaurant_search)

    aggregate = []
    for response in ms.execute():
        aggregate.extend(hit.to_dict() for hit in response['hits']['hits'])
    return Response(aggregate)
def multi_search(request):
    """Render the search page for the ``q`` GET parameter.

    When ``q`` is given, three match searches (``author``, ``title``,
    ``json_object``) run in one multi-search on "esdocument-index" and the
    titles of all hits are collected. Without ``q`` the template receives
    ``responses='empty'`` and an empty ``hits`` list.
    """
    q = request.GET.get('q')
    # Always defined: previously ``hits`` was unbound when ``q`` was empty,
    # so the render call below raised UnboundLocalError.
    hits = []
    if q:
        client = Elasticsearch()
        ms = MultiSearch(using=client, index="esdocument-index")
        ms = ms.add(Search().query("match", author=q))
        ms = ms.add(Search().query("match", title=q))
        ms = ms.add(Search().query("match", json_object=q))
        responses = ms.execute()
        for response in responses:
            hits.extend(hit.title for hit in response)
    else:
        responses = 'empty'
    return render(request, 'elasticsearchapp/search.html',
                  {'responses': responses, 'hits': hits})
def execute_searches(self):
    """Run the query of every added series and initialise the ``data`` and
    ``count`` attributes from the responses.

    Raises:
        QueryError: If no series has been added.
    """
    if not self.series:
        raise QueryError(strings.EMPTY_QUERY_ERROR)

    batch = MultiSearch(index=self.index,
                        doc_type=settings.TS_DOC_TYPE,
                        using=self.elastic)
    for serie in self.series:
        serie.add_collapse(self.args[constants.PARAM_PERIODICITY])
        batch = batch.add(serie.search)
    responses = batch.execute()

    self.data = ResponseFormatter(
        self.series, responses, self.args).format_response()
    self.count = max(response.hits.total for response in responses)
def execute_searches(self):
    """Run the query of every added series and build the result from the
    responses.

    Returns:
        dict: ``{'data': <formatted response>, 'count': <largest hit total
        among the responses>}``.
    """
    batch = MultiSearch(index=self.index, doc_type=settings.TS_DOC_TYPE)
    for serie in self.series:
        batch = batch.add(serie.search)
    responses = batch.execute()

    formatter = ResponseFormatter(
        self.series,
        responses,
        self.args[constants.PARAM_SORT],
        self.args[constants.PARAM_PERIODICITY],
    )
    data = formatter.format_response()
    count = max(response.hits.total for response in responses)
    return {'data': data, 'count': count}
def calculate_field_counts(request, es_client):
    '''
    Given a download request and an elasticsearch client to work with, work out the number of
    values available per field, per resource for the search.

    :param request: the DownloadRequest object
    :param es_client: the elasticsearch client to use
    :return: a dict of resource ids -> fields -> counts; a resource whose mapping yields no
             fields maps to an empty dict
    '''
    field_counts = defaultdict(dict)
    for resource_id, version in request.resource_ids_and_versions.items():
        index_name = prefix_resource(resource_id)
        # get the base field mapping for the index so that we know which fields to look up, this
        # will get all fields from all versions and therefore isn't usable straight off the bat,
        # we have to then go and see which fields are present in the search at this version
        mapping = es_client.indices.get_mapping(index_name)[index_name]

        # get all the fields names and use dot notation for nested fields
        fields = [u'.'.join(parts) for parts, _config in iter_data_fields(mapping)]
        if not fields:
            # elasticsearch rejects a multisearch request that contains no searches, so record
            # the resource as having no fields and move on
            field_counts[resource_id] = {}
            continue

        base_search = Search.from_dict(request.search) \
            .index(index_name) \
            .using(es_client) \
            .extra(size=0) \
            .filter(create_version_query(version))

        # we're going to do a multisearch to find out the number of records with a value for
        # each field from the mapping
        search = MultiSearch(using=es_client, index=index_name)
        for field in fields:
            # add a search which finds the documents that have a value for the given field at
            # the right version
            search = search.add(base_search.filter(u'exists', field=prefix_field(field)))

        responses = search.execute()
        for field, response in zip(fields, responses):
            field_counts[resource_id][field] = response.hits.total

    return field_counts
def find_searched_resources(search, resource_ids):
    '''
    Works out which of the given resources actually contribute results to the given search, by
    aggregating the search's results on the _index field.

    :param search: an elasticsearch-dsl object
    :param resource_ids: a list of resource ids
    :return: a list of resource ids
    '''
    indexes = [prefix_resource(resource_id) for resource_id in resource_ids]

    # aggs are modified in place rather than returning a clone, so work on a copy
    scoped_search = copy(search).index(indexes)
    scoped_search.aggs.bucket(u'indexes', u'terms', field=u'_index')

    responses = MultiSearch(using=common.ES_CLIENT).add(scoped_search).execute()
    result = next(iter(responses))

    found = []
    for bucket in result.aggs.to_dict()[u'indexes'][u'buckets']:
        if bucket[u'doc_count'] > 0:
            found.append(trim_index_name(bucket[u'key']))
    return found
def execute_queries(self, queries: Dict[Resource, Q],
                    page_index: int,
                    results_per_page: int) -> List[Response]:
    """Run one paginated search per resource in a single multi-search.

    :param queries: query to run for each resource type
    :param page_index: zero-based page number
    :param results_per_page: number of hits per page
    :return: the multi-search responses, or an empty list if execution
        failed (the failure is logged)
    """
    # pagination window, identical for every resource
    first = page_index * results_per_page
    last = results_per_page * (page_index + 1)

    multisearch = MultiSearch(using=self.elasticsearch)
    for resource, resource_query in queries.items():
        index_name = self.get_index_for_resource(resource_type=resource)
        search = Search(index=index_name).query(resource_query)
        LOGGER.info(search.to_dict())
        multisearch = multisearch.add(search[first:last])

    try:
        return multisearch.execute()
    except Exception as e:
        LOGGER.error(f'Failed to execute ES search queries. {e}')
        return []
def run_multiple_filters():
    """Repeatedly run the ask- and bid-price filters in one multi-search.

    Every iteration prints, per response, the number of returned hits and
    the total hit count, shows the first results of both filters, then
    sleeps for 5 seconds. Never returns.
    """
    index = "book"
    while True:
        ask_price_filter = AskPriceFilter(index)
        search_ask = ask_price_filter.main_query(
            gt_price=50, lt_price=52, from_range=10, to_range=20
        )

        bid_price_filter = BidPriceFilter(index)
        search_bid = bid_price_filter.main_query(
            gt_price=50, lt_price=51, to_range=15
        )

        ms = MultiSearch(index=index).add(search_ask).add(search_bid)

        # execute() returns a list of Response objects
        for resp in ms.execute():
            print(len(resp))
            print(resp.hits.total.value)

        for price_filter in (ask_price_filter, bid_price_filter):
            price_filter.show_result(from_range=0, to_range=5)

        time.sleep(5)
def update_jobs(entries, history=False):
    """
    Generate updates to claims.* from job classad dictionaries

    For each job, one search looks up the newest document (sorted by
    ``DaemonStartTime`` descending, limited to one hit) whose ``Name`` equals
    the job's parent slot name and whose ``DaemonStartTime`` is not later than
    the job's start time. All searches run in a single multi-search. For every
    job with a match, one scripted ``update`` action targeting the matched
    document is yielded; jobs without a match are skipped.

    :param entries: iterable of job classad dictionaries
    :param history: when true, jobs are categorised as "removed" (JobStatus
        3), "finished" (ExitCode 0) or "failed"; otherwise as "removed"
        (JobStatus 5) or "evicted"
    :returns: generator of update action dictionaries
    """

    def parent_slot_name(dynamic_slot_name):
        # "slot1_2@host" -> "slot1@host"; names that do not match the
        # slotN_M pattern are returned unchanged
        parts = dynamic_slot_name.split("@")
        match = re.match(r"(slot\d+)_\d+", parts[0])
        if match:
            parts[0] = match.group(1)
        return "@".join(parts)

    # MultiSearch will fail if there are no queries to run
    jobs = list(entries)
    if not jobs:
        return

    # glidein names are not necessarily unique on long time scales. look up the
    # last glidein that started with the advertised name _before_ the evicted
    # job was started
    ms = MultiSearch(using=es, index=options.indexname)
    for hit in jobs:
        # pick the start time of the run this update describes
        if history:
            t0 = hit["JobCurrentStartDate"]
        else:
            if hit["JobStatus"] == 5:
                t0 = hit["JobCurrentStartDate"]
            else:
                t0 = hit["JobLastStartDate"]
        ms = ms.add(
            Search()
            .filter("term", Name__keyword=parent_slot_name(hit["LastRemoteHost"]))
            .filter("range", DaemonStartTime={"lte": datetime.utcfromtimestamp(t0)},)
            .sort({"DaemonStartTime": {"order": "desc"}})
            .source(["nuthin"])[:1]
        )

    # responses come back in the order the searches were added, so they can be
    # paired with the jobs positionally
    for hit, match in zip(jobs, ms.execute()):
        if not match.hits:
            continue
        if history:
            if hit["JobStatus"] == 3:
                category = "removed"
            elif hit["ExitCode"] == 0:
                category = "finished"
            else:
                category = "failed"
            walltime = float(hit["EnteredCurrentStatus"] - hit["JobCurrentStartDate"])
        else:
            # NB: if a job is evicted from one slot, held on another, and then
            # removed from the queue, there's no way to recover the time that
            # may have elapsed between hold and removal. To handle this case,
            # we treat held jobs as a subcategory of removed jobs, so that they
            # will not be counted again when encountered in the history.
            if hit["JobStatus"] == 5:
                walltime = float(hit["EnteredCurrentStatus"] - hit["JobCurrentStartDate"])
                category = "removed"
            else:
                walltime = float(hit["LastVacateTime"] - hit["JobLastStartDate"])
                category = "evicted"

        # normalize capitalization of requests
        # every known resource starts at 0; each "Request<X>" attribute is
        # stored under "<X>" multiplied by the walltime
        requests = {resource: 0 for resource in RESOURCES}
        for k in hit:
            if k.startswith("Request"):
                requests[k[7:]] = walltime * hit[k]

        doc = {
            "_op_type": "update",
            "_index": match.hits[0].meta.index,
            "_type": match.hits[0].meta.doc_type,
            "_id": match.hits[0].meta.id,
            "script": {
                "id": options.indexname + "-update-jobs",
                "params": {
                    "job": hit["GlobalJobId"].replace("#", "-").replace(".", "-"),
                    "category": category,
                    "requests": requests,
                },
            },
        }
        yield doc
def batch_request(cls, names):
    """
    Map all name fragments in the array to name hashes.

    Takes an array of arrays (names are tokenized) and returns, for every
    name, one list per fragment: the matching term/lemma/label dicts from ES,
    or a single salted-sha1 "no-match" entry when ES returned nothing.
    """
    # TODO: THROW IT AWAY AND REPLACE WITH DAWG

    # typo variants are reported under the same label as the clean variant
    label_for_lemma_label = {
        "lemma-firstname": "firstname",
        "lemma-patronymic": "patronymic",
        "lemma-lastname": "lastname",
        "lemma-firstname-typo": "firstname",
        "lemma-patronymic-typo": "patronymic",
        "lemma-lastname-typo": "lastname"
    }

    def transform_resp(resp):
        labels = list(set(resp.lemma_labels) - {"lemma"})
        assert len(labels) == 1
        label = label_for_lemma_label[labels[0]]
        return {"term": resp.term, "lemma": resp.lemma, "label": label}

    def no_match(chunk):
        salted = (chunk + "thisissalt").encode('utf-8')
        return [{
            "lemma": sha1(salted).hexdigest(),
            "label": "no-match",
            "term": chunk
        }]

    qs = MultiSearch(index=cls._doc_type.index)
    for name in names:
        for chunk in name:
            # TODO: case for initials
            qs = qs.add(cls.search().filter("term", term=chunk))
    response = qs.execute()

    # responses are flat, in the order the fragments were added; cut them
    # back into per-name slices
    results = []
    offset = 0
    for name in names:
        width = len(name)
        per_chunk = []
        for chunk, resp in zip(name, response[offset:offset + width]):
            if resp:
                per_chunk.append([transform_resp(item) for item in resp])
            else:
                per_chunk.append(no_match(chunk))
        results.append(per_chunk)
        offset += width

    return results
def extract_keyterms_and_contexts(self):
    """
    Execute keyterms and highlight/offset queries

    Task 2: Extract keyterms and offsets (and optionally contexts).
    Add the corresponding ES/mongo updates to an update queue for indexing.
    Execute task over multiple threads to speed things up.

    Batches are taken from ``self.keyterms_query_batches`` for as long as the
    'keyterm_query_fetching' task is running or batches remain. For every
    document in a batch one payload is appended to ``self.updates``. When the
    loop ends, 'updates_extraction' is reported as completed to the task
    manager.
    """
    while self.task_manager.task_running('keyterm_query_fetching') or len(
            self.keyterms_query_batches) > 0:
        try:
            # Get the next batch
            batch = self.keyterms_query_batches.popleft()
        except IndexError:
            # Queue is empty; wait
            time.sleep(1)
            continue

        # Execute keyterms queries, getting the keyterms, offsets and contexts
        # and adding the payloads to the updates queue
        if self.use_termvectors:
            # When we use termvectors, we can't extract highlight/offset information
            # from the analyzed/tokenized field, and so we have to extract from the termvectors.
            termvecs = self.es_utility.es.mtermvectors(
                ids=batch['_ids'],
                index=self.es_index_name,
                doc_type='_doc')

            # Since we don't know the order of the returned termvectors,
            # we create a lookup from keyterms to their locations
            doc_termvector_lookup = {
                x['_id']: x['term_vectors'][self.es_highlight_field]['terms']
                for x in termvecs['docs']
            }

            batch_toks_to_locs = []
            batch_contexts = []
            # NOTE(review): tv_attrs is unused; the term vectors are fetched
            # through the lookup by _id instead.
            for i, (_id, tv_attrs) in enumerate(
                    zip(batch['_ids'], termvecs['docs'])):
                keyterms_set = set(batch['keyterms'][i])
                doc_tok_offsets, context = extract_termvectors_contexts_and_keyterm_offsets(
                    doc_termvector_lookup[_id], keyterms_set,
                    self.termvectors_window_size, self.extract_contexts)
                batch_toks_to_locs.append(doc_tok_offsets)
                batch_contexts.append(context)
        else:
            # Two searches per document (offsets, then highlight) are added
            # to a single multi-search.
            ms = MultiSearch(index=self.es_index_name).using(
                self.es_utility.es)
            for _id, _keyterms in zip(batch['_ids'], batch['keyterms']):
                offsets_search, highlight_search = get_context_and_offset_query(
                    _id, _keyterms, self.es_utility.es,
                    self.es_highlight_field)
                ms = ms.add(offsets_search).add(highlight_search)
            resp = ms.execute(raise_on_error=True)
            batch_toks_to_locs, batch_contexts = extract_contexts_and_keyterm_offsets(
                resp,
                self.es_highlight_field,
                extract_context=self.extract_contexts)

        # Build one update payload per document; contexts are only included
        # when context extraction is enabled.
        if self.extract_contexts:
            for _id, keyterm_locs, contexts in zip(batch['_ids'],
                                                   batch_toks_to_locs,
                                                   batch_contexts):
                if keyterm_locs is None:
                    keyterms = offsets = None
                else:
                    keyterms, offsets = list(keyterm_locs.keys()), list(
                        keyterm_locs.values())
                self.updates.append({
                    '_id': _id,
                    'body': {
                        self.keywords_field_name: keyterms,
                        self.offsets_field_name: offsets,
                        self.contexts_field_name: contexts
                    }
                })
        else:
            for _id, keyterm_locs in zip(batch['_ids'], batch_toks_to_locs):
                if keyterm_locs is None:
                    keyterms = offsets = None
                else:
                    keyterms, offsets = list(keyterm_locs.keys()), list(
                        keyterm_locs.values())
                self.updates.append({
                    '_id': _id,
                    'body': {
                        self.keywords_field_name: keyterms,
                        self.offsets_field_name: offsets
                    }
                })

    # Notify that the thread has finished
    self.task_manager.add_completed('updates_extraction')
# Print where the first hit of ``response`` lives and how it scored.
h = response.hits[0]
print('/%s/%s/%s returned with score %f' % (
    h.meta.index, h.meta.doc_type, h.meta.id, h.meta.score))

# Aggregations
# NOTE(review): ``mex_lines`` must match the metric name registered under the
# ``per_tag`` aggregation, which is not visible here; it may be a typo for
# ``max_lines`` - confirm.
for tag in response.aggregations.per_tag.buckets:
    print(tag.key, tag.mex_lines.value)

# Multisearch
from elasticsearch_dsl import MultiSearch, Search

# Two term filters on ``tags`` sent to the 'blogs' index in one request.
ms = MultiSearch(index='blogs')
ms = ms.add(Search().filter('term', tags='python'))
ms = ms.add(Search().filter('term', tags='elasticsearch'))
responses = ms.execute()
for response in responses:
    print("result for query %r." % response.search.query)
    for hit in response:
        print(hit.title)

##################################
# PERSISTENCE

# Mappings
from elasticsearch_dsl import Keyword, Mapping, Nested, Text
def validate(cls, address):
    """Normalise ``address`` in place against the best-scoring indexed address.

    Up to three ``simple_query_string`` searches are run in one multi-search
    on ``ADDRESSES_INDEX``: the full address against ``all.shingle`` and
    ``all``, and, when present, the ``source`` text against ``all``. Postal
    code and region, when given, are added as ``should`` clauses. The first
    hit of the response with the highest ``max_score`` is used to overwrite
    fields of ``address`` and to rebuild ``fullAddress``.

    ``ValueError``, ``KeyError`` and ``IndexError`` raised along the way are
    printed and swallowed; ``address`` is then returned as modified so far.

    :param address: dict describing the address; mutated in place
    :return: the same ``address`` dict
    """
    try:
        # Make sure both free-text inputs exist so they can be read below.
        if "fullAddress" not in address:
            address["fullAddress"] = ""
        if "source" not in address:
            address["source"] = ""
        ms = MultiSearch(index=ADDRESSES_INDEX)
        should = []
        if "postalCode" in address:
            should.append(Q("match", postalCode=address["postalCode"]))
        if "region" in address:
            should.append(Q("match", region=address["region"]))
        # NOTE(review): always true here, because "fullAddress" is defaulted
        # to "" above - confirm whether a truthiness check was intended.
        if "fullAddress" in address:
            ms = ms.add(
                cls.search().query(
                    "bool",
                    must=Q(
                        "simple_query_string",
                        fields=["all.shingle"],
                        query=address["fullAddress"],
                        default_operator="or",
                    ),
                    should=should,
                )
            ).add(
                cls.search().query(
                    "bool",
                    must=Q(
                        "simple_query_string",
                        fields=["all"],
                        query=address["fullAddress"],
                        default_operator="or",
                    ),
                    should=should,
                )
            )
        if address["source"]:
            ms = ms.add(
                cls.search().query(
                    "bool",
                    must=Q(
                        "simple_query_string",
                        fields=["all"],
                        query=address["source"],
                        default_operator="or",
                    ),
                    should=should,
                )
            )
        responses = ms.execute()
        # Keep the top hit of the response with the highest max_score; on a
        # tie the later response wins because of ">=".
        new_address = {}
        max_score = 0
        for resp in responses:
            if resp.hits.max_score is not None and resp.hits.max_score >= max_score:
                new_address = resp[0].to_dict()
                max_score = resp.hits.max_score
        if new_address:
            # Rebuild fullAddress from the normalised parts.
            address["fullAddress"] = ""
            if new_address.get("postalCode"):
                # left-pad the postal code with zeros to 5 characters
                address["postalCode"] = new_address["postalCode"].rjust(5, "0")
                address["fullAddress"] += address["postalCode"]
            if new_address.get("region"):
                address["region"] = new_address["region"]
                # these two regions are not appended to fullAddress
                if new_address.get("region") not in ["місто Київ", "місто Севастополь"]:
                    address["fullAddress"] += ", " + address["region"]
            if new_address.get("district"):
                address["district"] = new_address["district"]
                address["fullAddress"] += ", " + address["district"]
            if address.get("locality") and new_address.get("locality"):
                # Replace the supplied locality only when it is close
                # (distance < 6) to the indexed current or old name.
                # NOTE(review): new_address["oldLocality"] raises KeyError
                # (caught below) when the key is missing and the first
                # comparison fails - confirm this is intended.
                if (
                    distance(
                        new_address["locality"].lower(), address["locality"].lower()
                    )
                    < 6
                    or distance(
                        new_address["oldLocality"].lower(),
                        address["locality"].lower(),
                    )
                    < 6
                ):
                    address["locality"] = new_address["locality"]
                # NOTE(review): nesting of the next line (inside vs. after the
                # distance check) should be confirmed against the original.
                address["fullAddress"] += ", " + address["locality"]
            elif new_address.get("locality"):
                address["locality"] = new_address["locality"]
                address["fullAddress"] += ", " + address["locality"]
            if address.get("streetAddress") and new_address.get("street"):
                # Same closeness rule as for the locality, applied to streets.
                if (
                    distance(
                        new_address["street"].lower(),
                        address["streetAddress"].lower(),
                    )
                    < 6
                    or distance(
                        new_address["oldStreet"].lower(),
                        address["streetAddress"].lower(),
                    )
                    < 6
                ):
                    address["streetAddress"] = new_address["street"]
                # NOTE(review): nesting of the next line should be confirmed
                # against the original, as above.
                address["fullAddress"] += ", " + address["streetAddress"]
            if new_address.get("oldStreet"):
                address["oldStreet"] = new_address["oldStreet"]
            if new_address.get("oldDistrict"):
                address["oldDistrict"] = new_address["oldDistrict"]
            if new_address.get("oldLocality"):
                address["oldLocality"] = new_address["oldLocality"]
            if address.get("streetNumber"):
                # The street number is folded into streetAddress/fullAddress
                # and then removed as a separate key.
                address["streetAddress"] += ", " + address["streetNumber"]
                address["fullAddress"] += ", " + address["streetNumber"]
                del address["streetNumber"]
    except (ValueError, KeyError, IndexError) as e:
        # Best effort: report the problem and return what we have.
        print(e)
        return address
    return address
#!/usr/bin/env python
"""Run two match queries on the message field of logstash-* in one request."""
from elasticsearch import Elasticsearch
from elasticsearch_dsl import MultiSearch, Search

client = Elasticsearch(['192.168.33.108:9200', '192.168.33.109:9200'])

# One match query on the message field per term, all sent as one multi search.
ms = MultiSearch(using=client, index='logstash-*')
for term in ("hello", "hello7"):
    ms = ms.add(Search().query("match", message=term))

responses = ms.execute()

# Print host and message of every hit, response by response.
for response in responses:
    for r in response:
        print(r['host'], r['message'])
def run(self, network, channels, query, author=None, date_range=None):
    """Search log lines and return each match with its surrounding context.

    The main search matches ``query`` on ``text`` in the 'moffle' index,
    restricted to the date range, the network, the given channels and line
    types 'normal'/'action', newest date first, capped at 10000 hits. For
    every match, the lines within ``config.SEARCH_CONTEXT`` line numbers on
    the same network/channel/date are fetched through one multi-search.

    Args:
        network: Network to search in.
        channels: Channels to search in.
        query: Text to match.
        author: Unused.
        date_range: ``(date_begin, date_end)``; required.

    Returns:
        list: Lists of ``Hit`` objects, consecutive hits grouped by date.
        Empty when nothing matched.
    """
    # We don't support non-ajax, so will always have date range
    assert date_range
    date_begin, date_end = date_range

    result = Search(
        using=self.es,
        index='moffle',
    ).query(
        "match",
        text=query,
    ).query(
        "range",
        date={
            'gt': date_begin.strftime('%Y%m%d'),
            'lte': date_end.strftime('%Y%m%d'),
        },
    ).filter(
        "terms",
        line_type=['normal', 'action'],
    ).filter(
        "term",
        network=network,
    ).filter(
        "terms",
        channel=channels,
    ).sort(
        "-date",
    )[:10000].execute()

    matches = list(result)
    # Without matches there would be no context searches, and Elasticsearch
    # rejects a multi-search request that contains no searches.
    if not matches:
        return []

    # TODO: interval merging
    ctx_search = MultiSearch(using=self.es, index='moffle')
    for match in matches:
        # Fetch context
        ctx_search = ctx_search.add(Search(
            using=self.es,
            index='moffle',
        ).query(
            "range",
            line_no={
                "gte": match.line_no - config.SEARCH_CONTEXT,
                "lte": match.line_no + config.SEARCH_CONTEXT,
            },
        ).filter(
            "term",
            network=match.network,
        ).filter(
            "term",
            channel=match.channel,
        ).filter(
            "term",
            date=match.date,
        ).sort(
            "line_no",
        ))
    ctx_results = ctx_search.execute()

    hits = []
    for match, ctx_result in zip(matches, ctx_results):
        lines = [
            self._format_line(
                ctx_hit,
                is_hit=(match.line_no == ctx_hit.line_no),
            )
            for ctx_hit in ctx_result
        ]
        # An empty context would make ``lines[0]`` below raise IndexError.
        if not lines:
            continue
        hits.append(Hit(
            channel=match.channel,
            date=match.date,
            begin=lines[0].line_no,
            lines=lines,
        ))

    return [list(group) for _, group in groupby(hits, key=lambda hit: hit.date)]