def _search_iterator(self, q, start, max, count, sort, level='Package', include=False):
    if max and start + max > 10_000:
        raise Exception(
            'Pagination beyond 10000 hits not allowed, use empty max parameter to retrieve full set'
        )
    index = self.index_map.get(level, self.index_name)
    # print(index, self.index_map, flush=True)
    s = Search(using=self.elastic, index=index)
    # Search methods return copies, so the results must be reassigned.
    s = s.extra(track_total_hits=True)
    s = s.update_from_dict(q)
    s = s.source(include)
    m = max or count
    for hit in (s[start:start + m] if start + m <= 10_000 else s.scan()):
        yield hit.meta.id if not include else (hit.meta.id, self._hit_to_desc(hit))
def query_event_ids(self):
    es_query = []
    es_query.append({'match': {'winlog.provider_name': MICROSOFT_WINDOWS_DNSCLIENT_PROVIDER_NAME}})
    query = Q({'bool': {'must': es_query}})
    s = Search(using=self.Client, index="winlogbeat-*").query(query)
    # source() returns a copy, so reassign to keep the field filter.
    s = s.source(includes=['winlog.provider_name', 'winlog.event_id'])
    count = s.count()
    print("Count: %d" % count)

    event_ids = {}
    i = 0
    try:
        for hit in s.scan():
            print('%d. %d' % (i, hit.winlog.event_id))
            if hit.winlog.event_id not in event_ids:
                event_ids[hit.winlog.event_id] = 1
                print("%s: %d" % (hit.winlog.provider_name, hit.winlog.event_id))
            else:
                event_ids[hit.winlog.event_id] += 1
            i += 1
    except Exception:
        traceback.print_exc()
def query_query_names(self, size=6000, descending=True):
    winlog_event_data_name = "winlog.event_data.QueryName"
    es_query = []
    es_query.append({'match': {'winlog.provider_name': MICROSOFT_WINDOWS_DNSCLIENT_PROVIDER_NAME}})
    query = Q({'bool': {'must': es_query}})
    s = Search(using=self.Client, index="winlogbeat-*").query(query)
    # source() returns a copy, so reassign to keep the field filter.
    s = s.source(includes=[winlog_event_data_name])

    if descending:
        order = 'desc'
    else:
        order = 'asc'
    s.aggs.bucket('distinct_query_name', 'terms',
                  field=winlog_event_data_name, size=size,
                  order={'_count': order})
    response = s.execute()

    sorted_distinct_query_name = response.aggregations.distinct_query_name
    max_len = 0
    for e in sorted_distinct_query_name:
        if len(e.key) > max_len:
            max_len = len(e.key)

    fmt_str = "{0:%d} Count: {1}" % (max_len)
    for e in sorted_distinct_query_name:
        print(fmt_str.format(e.key, e.doc_count))
def _create_request(self,
                    catalog: CatalogName,
                    filters: FiltersJSON,
                    post_filter: bool = False,
                    source_filter: SourceFilters = None,
                    enable_aggregation: bool = True,
                    entity_type='files') -> Search:
    """
    Create an Elasticsearch request based on the filters and facet_config
    passed into the function.

    :param filters: The 'filters' parameter, assumed to already be translated
                    into es_key terms
    :param post_filter: Flag for doing either post_filter or regular querying
                        (i.e. faceting or not)
    :param List source_filter: A list of "foo.bar" field paths (see
           https://www.elastic.co/guide/en/elasticsearch/reference/5.5/search-request-source-filtering.html)
    :param enable_aggregation: Flag for enabling query aggregation (and
                               effectively ignoring facet configuration)
    :param entity_type: the string referring to the entity type used to get
                        the Elasticsearch index to search
    :return: the Search object that can be used for executing the request
    """
    service_config = self.service_config(catalog)
    field_mapping = service_config.translation
    facet_config = {
        key: field_mapping[key]
        for key in service_config.facets
    }
    es_search = Search(using=self.es_client,
                       index=config.es_index_name(catalog=catalog,
                                                  entity_type=entity_type,
                                                  aggregate=True))
    filters = self._translate_filters(catalog, filters, field_mapping)

    es_query = self._create_query(catalog, filters)

    if post_filter:
        es_search = es_search.post_filter(es_query)
    else:
        es_search = es_search.query(es_query)

    if source_filter:
        es_search = es_search.source(includes=source_filter)
    elif entity_type not in ("files", "bundles"):
        es_search = es_search.source(excludes="bundles")

    if enable_aggregation:
        for agg, translation in facet_config.items():
            # FIXME: Aggregation filters may be redundant when post_filter is false
            #        https://github.com/DataBiosphere/azul/issues/3435
            es_search.aggs.bucket(
                agg, self._create_aggregate(catalog, filters, facet_config, agg))

    return es_search
def load_genes_by_region(chrom, start, end, features):
    """Retrieve genes by region"""
    index = _get_index_from_chr(chrom)
    search_genes = Search().using(es).doc_type('genes').index(index).filter(
        "range", positions={"lte": end, "gte": start})
    if not features:
        # source() returns a copy, so reassign to actually drop the isoforms field.
        search_genes = search_genes.source(exclude=['isoforms'])
    genes = [gene.to_dict() for gene in search_genes.scan()]
    for gene in genes:
        gene['ko_associations'] = load_gene_ko_associations(gene['name'],
                                                            return_only_significant=True)
    return genes
def create_mlt_with_id(document_id, position, index):
    s = Search(using=client, index=index)
    # source() returns a copy, so reassign to keep the field selection.
    s = s.source(includes=['*'], excludes=["body"])
    # Use the document_id argument rather than the id() builtin.
    mlt_match = MoreLikeThis(fields=["body.content"], like=[document_id],
                             min_term_freq=1, min_doc_freq=1)
    nested_query = Nested(path='body', inner_hits={}, query=mlt_match)
    s = s.query(nested_query)
    return s
def search_graphs1(request, owner_email=None, names=None, nodes=None, edges=None,
                   tags=None, member_email=None, is_public=None, query=None,
                   limit=20, offset=0, order='desc', sort='name'):
    sort_attr = getattr(db.Graph, sort if sort is not None else 'name')
    order_by = getattr(db, order if order is not None else 'desc')(sort_attr)

    is_public = int(is_public) if is_public is not None else None

    if member_email is not None:
        member_user = users.controllers.get_user(request, member_email)
        if member_user is not None:
            group_ids = [
                group.id
                for group in users.controllers.get_groups_by_member_id(
                    request, member_user.id)
            ]
        else:
            raise Exception("User with given member_email doesn't exist.")
    else:
        group_ids = None

    if edges is not None:
        edges = [tuple(edge.split(':')) for edge in edges]

    if 'query' in query:
        s = Search(using=settings.ELASTIC_CLIENT, index='graphs')
        s = s.update_from_dict(query)
        # Only the document ids are needed, so disable _source retrieval.
        s = s.source(False)
        graph_ids = [int(hit.meta.id) for hit in s.scan()]
    else:
        graph_ids = None

    total, graphs_list = db.find_graphs(request.db_session,
                                        owner_email=owner_email,
                                        graph_ids=graph_ids,
                                        is_public=is_public,
                                        group_ids=group_ids,
                                        names=names,
                                        nodes=nodes,
                                        edges=edges,
                                        tags=tags,
                                        limit=limit,
                                        offset=offset,
                                        order_by=order_by)

    return total, graphs_list
def load_genes_by_region(chrom, start, end, features):
    """Retrieve genes by region"""
    index = _get_index_from_chr(chrom)
    search_genes = Search().using(es).doc_type('genes').index(index).filter(
        "range", positions={
            "lte": end,
            "gte": start
        })
    if not features:
        # source() returns a copy, so reassign to actually drop the isoforms field.
        search_genes = search_genes.source(exclude=['isoforms'])
    return [gene.to_dict() for gene in search_genes.scan()]
def query_distinct_event_ids(self):
    es_query = []
    es_query.append({'match': {'winlog.provider_name': MICROSOFT_WINDOWS_DNSCLIENT_PROVIDER_NAME}})
    query = Q({'bool': {'must': es_query}})
    s = Search(using=self.Client, index="winlogbeat-*").query(query)
    # source() returns a copy, so reassign to keep the field filter.
    s = s.source(includes=['winlog.event_id', 'winlog.event_data.LogString'])
    s.aggs.bucket('distinct_event_ids', 'terms', field='winlog.event_id', size=1000)
    response = s.execute()

    sorted_distinct_event_ids = sorted(response.aggregations.distinct_event_ids,
                                       key=lambda kv: (kv.doc_count, kv.key),
                                       reverse=True)
    for e in sorted_distinct_event_ids:
        print("{0:50} {1}".format(e.key, e.doc_count))
def _search(self, query):
    s = Search(using=self.Client, index="winlogbeat-*").query(query)

    if self.DTRange is not None:
        s = s.filter('range', **self.DTRange)

    # source() and sort() return copies, so reassign to keep their effect.
    s = s.source(includes=['winlog.*'])
    s = s.sort('-winlog.event_data.UtcTime')

    if self.Scan:
        return s.scan()
    else:
        return s.execute().hits
def construct_multi_field_search(  # pylint: disable=too-many-arguments
    search: Search,
    text: str,
    operator: str,
    fields: List[str],
    size: int = 5,
    includes: Optional[List[str]] = None,
    excludes: Optional[List[str]] = None,
) -> Search:
    """Build a text search across multiple fields.

    Args:
        search: Initial search.
        text: Search text.
        operator: Condition applied to the tokens of the text. If the operator
            is `and`, then all tokens must be found in the inverted index for
            a match. If the operator is `or`, then at least one token must be
            found in the inverted index for a match. See:
            https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html
        size: Size of the search. Defaults to 5.
        includes: Selective control of the _source field. Returns only the
            fields that are specified.
        excludes: Selective control of the _source field. Excludes the fields
            that are specified.
        fields: Names of the fields of :class:~`elastinga.schemas.TwitterPosts`
            to search in.

    Returns:
        search: Search
    """
    if includes:
        search = search.source(includes=includes)
    if excludes:
        search = search.source(excludes=excludes)

    search = search.query(
        Q("multi_match", query=text, operator=operator, fields=fields))
    search = search.params(size=size)
    logger.info("Query", query=search.to_dict())
    return search
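# A minimal usage sketch for construct_multi_field_search above. The client
# host, index name, and field names are assumptions for illustration, and the
# function is assumed to be importable from the snippet's own module.
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

es = Elasticsearch("http://localhost:9200")      # assumed local cluster
base = Search(using=es, index="twitter_posts")   # hypothetical index name
results = construct_multi_field_search(
    base,
    text="elastic search tips",
    operator="and",                              # every token must match
    fields=["text", "user.description"],         # hypothetical fields
    size=10,
    includes=["text", "created_at"],
).execute()
for hit in results:
    print(hit.meta.score, hit.to_dict())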
def searchAnotiranaRec(term):
    s = Search(index=index)
    s = s.source(includes=['pk', 'osnovniOblik'])
    s.query = Bool(must=[Match(oblici=term)])
    response = s.execute()
    return response.hits
def _search_odrednica(request):
    if not request.GET.get('q'):
        return bad_request('no search term')
    term = request.GET.get('q')
    hits = []
    s = Search(index=ODREDNICA_INDEX)
    s = s.source(includes=['pk', 'rec', 'vrsta', 'rbr_homo'])
    s.query = MultiMatch(
        type='bool_prefix',
        query=remove_punctuation(term),
        fields=['varijante'],
        # analyzer=SERBIAN_ANALYZER
    )
    try:
        response = s.execute()
        for hit in response.hits.hits:
            hits.append(hit['_source'])
        serializer = OdrednicaResponseSerializer(hits, many=True)
        data = serializer.data
        return Response(data, status=HTTP_200_OK, content_type=JSON)
    except ElasticsearchException as error:
        return server_error(error.args)
def _search_korpus(request):
    if not request.data or request.data['term'] is None:
        return bad_request('no search term')
    term = request.data['term']
    hits = []
    s = Search(index=KORPUS_INDEX)
    s = s.source(includes=['pk', 'osnovniOblik'])
    s.query = Bool(
        must=[Match(oblici=term)]
    )
    try:
        response = s.execute()
        for hit in response.hits.hits:
            hits.append(hit['_source'])
        serializer = KorpusResponseSerializer(hits, many=True)
        data = serializer.data
        return Response(data, status=HTTP_200_OK, content_type=JSON)
    except ElasticsearchException as error:
        return server_error(error.args)
def get_indices(self, docTypes: List = ["default"]) -> str:
    """
    Returns the indices to search for the given doc types.

    :param docTypes: List of doc types to search; if empty, all doc types are searched
    :return: A string naming the indices to search (may use * to group multiple indices)
    """
    es = get_es_conn()
    indexNamesStr = ""
    if docTypes:
        s = Search(using=es, index=self.typeIndex,
                   doc_type="directory_type").query("ids", values=docTypes)
        s = s.params(scroll=get_scan_scroll_duration(),
                     size=get_nb_documents_per_scan_scroll())
        indexNamesQuery = s.source(["indexName"])
        indexNamesArr = []
        for indexNamePart in indexNamesQuery.scan():
            indexNamesArr.append(indexNamePart["indexName"])
        indexNamesStr = ','.join(indexNamesArr)
    else:
        indexNamesStr = self.dataIndexPrefix + "*"
    return indexNamesStr
def test_connections_to_bert_service(created):
    print(f'starting task at {created}')
    from bert_serving.client import BertClient
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT
    from elasticsearch_dsl import Search

    _TEMP_INDEX = "temp_rubert_index"

    bc = BertClient(ip="bert_as_service", check_length=False)
    ind_doc_search = Search(using=ES_CLIENT, index=ES_INDEX_DOCUMENT)
    ind_doc_search = ind_doc_search.source(['id', 'text'])
    ind_doc_scan = ind_doc_search.scan()

    if ES_CLIENT.indices.exists(_TEMP_INDEX):
        ES_CLIENT.indices.delete(index=_TEMP_INDEX, ignore=[400, 404])
    ES_CLIENT.indices.create(index=_TEMP_INDEX)

    elastic_results = []
    for ind, res in enumerate(ind_doc_scan):
        if ind % 1000 == 0:
            print(f"Current index is {ind}")
        if ind % 25 == 0 and not ind == 0:
            vecs = bc.encode([i['text'] for i in elastic_results]).tolist()
            # Use a separate loop variable so the outer enumerate counter is not shadowed.
            for vec_ind, vector in enumerate(vecs):
                elastic_results[vec_ind].update({'rubert_embedding': vector})
            persist_in_elastic(ES_CLIENT, elastic_results, _TEMP_INDEX)
            elastic_results = []
        cleaned_text = clean_text(res.text)
        if len(cleaned_text) > 20:
            elastic_results.append({'id': res.id, 'text': cleaned_text})
def get_documents_with_q(self, index, query=Q(), source=None, add_index_name=False):
    """
    Get documents from an Elasticsearch index.

    :param index: elasticsearch index
    :param query: es query
    :param source: extra properties for search
    :return: dataframe with es data
    """
    s = Search(using=self.es, index=index)
    if source:
        s = s.source(source)  # Dotted fields: replace . by __
    q = s.query(query)
    # print(str(q.to_dict()).replace("'", '"'))
    results = q.scan()
    if add_index_name:
        all_dicts = []
        for hit in results:
            result_dict = hit.to_dict()
            result_dict['_index'] = hit.meta.index
            all_dicts.append(result_dict)
        fa = pd.DataFrame.from_dict(all_dicts)
    else:
        fa = pd.DataFrame([hit.to_dict() for hit in results])
    return fa
def _get_search_client(self, size=35):
    client = Elasticsearch(settings.ELASTICSEARCH_DSL['default']['hosts'])
    s = Search(using=client)
    s = s.extra(size=size)
    s = s.filter('term', published=True)
    s = s.source(excludes=["html_text"])
    return s
def report_all_customers(customer_file, my_index='epl', my_database='duplicate_user'):
    """Outputs all the user keys to file."""
    # Code based on the solution found at:
    # https://stackoverflow.com/questions/17497075/efficient-way-to-retrieve-all-ids-in-elasticsearch
    es = Elasticsearch()
    # epl/duplicate_user
    s = Search(using=es, index=my_index, doc_type=my_database)
    s = s.source([])  # only get ids, otherwise `fields` takes a list of field names
    ids = [h.meta.id for h in s.scan()]
    try:
        file = open(customer_file, 'w')
    except OSError:
        sys.stderr.write('** error, while attempting to open "{0}"!\n'.format(customer_file))
        sys.exit(1)
    count = 0
    for key in ids:
        count = count + 1
        # UKEY
        file.write(key + '\n')
    sys.stderr.write("total user keys in index {0}, doc_type {1}: {2}\n".format(
        my_index, my_database, count))
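# A self-contained sketch of the ids-only pattern used above: passing an empty
# list (or False) to source() skips _source retrieval, so scan() streams only
# document ids. The host and index name are assumptions for illustration.
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

es = Elasticsearch()                           # assumed default localhost:9200
s = Search(using=es, index="epl")              # index name borrowed from the snippet above
s = s.source([])                               # no _source fields, ids only
all_ids = [hit.meta.id for hit in s.scan()]    # scroll through every matching doc
print(len(all_ids), "ids retrieved")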
def get_warnings_by_package(package_name, package_warnings):
    '''
    Returns all the warnings for a specific package

    Arguments:
        package_name: the package in the database
        package_warnings: a dict keyed by warning_type that this function populates

    Returns:
        None, but populates the package_warnings dict
    '''
    client = Elasticsearch(host=HOST)
    s = Search(using=client)
    s = s.source(['package', 'type', 'severity', 'score'])
    # q = Q("match", type=warning) & Q("match", severity=severity)
    s = s.query("match", package__keyword=package_name)
    s = s.exclude("match", tag="test_code")
    # print(s.to_dict())

    # process the query
    for hit in s.scan():
        # print(hit.type)
        # print(hit.severity)
        # print(hit.package)
        if hit.type not in package_warnings.keys():
            package_warnings[hit.type] = {}
        if hit.severity in package_warnings[hit.type]:
            package_warnings[hit.type][hit.severity] += 1
        else:
            # Start the count at 1 so the first hit is included in the tally.
            package_warnings[hit.type][hit.severity] = 1
def search_more_like_this(talk):
    """ Get more like this documents """
    client = Elasticsearch([{
        'host': settings.ELASTICSEARCH['default']['HOSTNAME'],
        'port': settings.ELASTICSEARCH['default']['PORT'],
    }])

    s = Search(using=client, index="vtalks")

    s = s.query(
        MoreLikeThis(like={
            "_index": "vtalks",
            "_type": "talk",
            "_id": talk.id
        }, fields=['title', 'description', 'tags']))

    # Sorting
    s = s.sort({"_score": {"order": "desc"}})

    # Fields selection
    s = s.source(['id'])

    response = s.execute()

    results_total = response.hits.total
    results_ids = [hit.id for hit in response.hits]

    return results_total, results_ids
def get_article_by_title(title, access_groups):
    s = Search(using=client, index=wiki_index_name)
    s = s.filter(access_filter(access_groups))
    s = s.query("term", title__raw=title)
    s = s.source(excludes=["access"])
    res = s.execute()
    return format_article(res)
def _search(self, index, table, fields=None):
    """
    Search private area for matching docs in Elasticsearch.

    Only returns the _id of each matching document.

    fields = {
        'id': [1, 2],
        'uid': ['a002', 'a009'],
    }
    """
    fields = fields or {}
    search = Search(using=self.__es, index=index)
    # explicitly exclude all fields since we only need the doc _id
    search = search.source(excludes=['*'])
    for key, values in fields.items():
        search = search.query(
            Bool(
                filter=[
                    Q('terms', **{f'{META}.{table}.{key}': values}) |
                    Q('terms', **{f'{META}.{table}.{key}.keyword': values})
                ]
            )
        )
    for hit in search.scan():
        yield hit.meta.id
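# A standalone sketch of the filter pattern in _search above: a Bool filter
# whose clause is an OR of `terms` on a field and on its `.keyword` sub-field,
# so both analyzed and keyword-mapped data match. The client, index, and field
# names are assumptions for illustration.
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
from elasticsearch_dsl.query import Bool

es = Elasticsearch()
search = Search(using=es, index="private-area")   # hypothetical index
search = search.source(excludes=['*'])            # ids only
search = search.query(
    Bool(filter=[
        Q('terms', **{'_meta.users.id': [1, 2]}) |
        Q('terms', **{'_meta.users.id.keyword': [1, 2]})
    ])
)
matching_ids = [hit.meta.id for hit in search.scan()]
print(matching_ids)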
def search(client, index, query, start=None, end=None, source=None, sort=None,
           get_scan_obj=False, get_search_obj=False):
    from elasticsearch_dsl import Search

    s = Search(using=client, index=index)

    for key, value in query.items():
        if any(key.endswith(range_selector)
               for range_selector in ['__gte', '__lte', '__gt', '__lt']):
            range_selector = key.split("__")[-1]
            s = s.filter(
                'range',
                **{key.replace(f"__{range_selector}", ""): {range_selector: value}})
        else:
            s = es_filter_term(s, key, value)

    if source:
        s = s.source(include=source)
    if sort:
        s = s.sort(*sort)

    s = s[start:end]

    if get_scan_obj:
        return s.scan()
    elif get_search_obj:
        return s
    else:
        return s.execute()
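# A minimal usage sketch for the search() helper above, showing how the
# Django-style "__gte"/"__lt" suffixes in the query dict become range filters.
# The client, index, and field names are assumptions for illustration;
# search() and es_filter_term() are assumed to live in the same module.
from elasticsearch import Elasticsearch

es = Elasticsearch()
hits = search(
    es,
    index="events",                      # hypothetical index
    query={
        "timestamp__gte": "2021-01-01",  # becomes filter('range', timestamp={'gte': ...})
        "status": "active",              # non-range keys go through es_filter_term
    },
    source=["timestamp", "status"],
    sort=["-timestamp"],
    start=0,
    end=50,
)
for hit in hits:
    print(hit.to_dict())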
def search_talks(page=None, sort=None):
    """ Get Talks by Topic from Elasticsearch """
    client = Elasticsearch([{
        'host': settings.ELASTICSEARCH['default']['HOSTNAME'],
        'port': settings.ELASTICSEARCH['default']['PORT'],
    }])

    s = Search(using=client, index="vtalks")

    # Pagination
    if page:
        start = 0
        end = 10
        if page > 1:
            start = settings.PAGE_SIZE * (page - 1)
            end = settings.PAGE_SIZE * page
        s = s[start:end]

    # Sorting
    s = s.sort({sort: {"order": "desc"}})

    # Fields selection
    s = s.source(['id'])

    response = s.execute()

    results_total = response.hits.total
    results_ids = [hit.id for hit in response.hits]

    return results_total, results_ids
def documents_by_text(self, grouped_targets: dict, queries: list, from_index: int, size: int) -> tuple:
    """ Paginated documents found by text. """
    # For pagination/score sorting to work, we need to query all the different corpus
    # indices in the same Elasticsearch query.
    # We use the same grouped-target approach as searching documents by annotations,
    # even though buckets are inconsequential for text search.
    indices = self.target_text_document_indices(grouped_targets)
    indices_argument = ','.join(indices)

    language_manager = get_language_manager()
    match_queries = [to_match_query(language_manager, query) for query in queries]
    grouped_queries = self.group_queries_by_operator(match_queries)

    # A query language restriction, if present, will work automatically via the
    # query text.<language> mapping.
    es = get_es_conn()
    search = Search(using=es, index=indices_argument)
    search = search.source(["title", "language", "source"])
    search.query = Q('bool',
                     must=grouped_queries["must"],
                     must_not=grouped_queries["must_not"],
                     should=grouped_queries["should"])
    search = search[from_index:from_index + size]

    count = search.count()
    documents = [self.map_hit_with_score(hit) for hit in search]
    return count, documents
def elasticsearch_pages(context, sort, page):
    result_limit = int(os.environ['RESULT_LIMIT'])
    max_result_limit = int(os.environ['MAX_RESULT_LIMIT'])
    start = (page - 1) * result_limit
    end = start + result_limit

    domain_query = Q("term", is_banned=False)
    if context["is_up"]:
        domain_query = domain_query & Q("term", is_up=True)
    if not context["show_fh_default"]:
        domain_query = domain_query & Q("term", is_crap=False)
    if not context["show_subdomains"]:
        domain_query = domain_query & Q("term", is_subdomain=False)
    if context["rep"] == "genuine":
        domain_query = domain_query & Q("term", is_genuine=True)
    if context["rep"] == "fake":
        domain_query = domain_query & Q("term", is_fake=True)

    limit = max_result_limit if context["more"] else result_limit

    has_parent_query = Q("has_parent", type="domain", query=domain_query)
    query = Search().filter(has_parent_query).query(
        Q("match", body_stripped=context['search']))
    query = query.highlight_options(
        order='score', encoder='html').highlight('body_stripped')[start:end]
    query = query.source(['title', 'domain_id', 'created_at',
                          'visited_at']).params(request_cache=True)

    return query.execute()
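# A stripped-down sketch of the pattern above: filter child documents with a
# has_parent query, match on the stripped body text, request highlighted
# snippets, and slice the search for one page. Index and field names are
# assumptions for illustration.
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q

es = Elasticsearch()
page, per_page = 2, 25
start = (page - 1) * per_page
end = start + per_page

domain_query = Q("term", is_banned=False) & Q("term", is_up=True)
s = Search(using=es, index="crawled-pages")    # hypothetical index
s = s.filter(Q("has_parent", type="domain", query=domain_query))
s = s.query(Q("match", body_stripped="example search text"))
s = s.highlight_options(order="score", encoder="html").highlight("body_stripped")
s = s.source(["title", "domain_id", "created_at", "visited_at"])[start:end]
response = s.execute()
for hit in response:
    print(hit.title, hit.meta.highlight.body_stripped[0])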
class ElasticSearchConnector:
    def __init__(self, host: str, index_name: str, *args, **kwargs):
        self.search = Search(using=Elasticsearch(hosts=host), index=index_name)

    def get_window(self, turbines: tuple, start_date: datetime.datetime,
                   end_date: datetime.datetime, fields=('*', )):
        self.search = self.search.filter("range", timestamp={
            "gte": start_date,
            "lt": end_date
        })
        self.search = self.search.filter(
            "terms",
            wind_turbine=turbines,
        )
        fields = ('timestamp', ) + fields
        self.search = self.search.source(include=fields)
        self.search = self.search.sort('timestamp')
        hits = list(self.search.scan())
        return hits_to_dataframe(hits)

    def get_plain_data(self, turbines: tuple, start_date: datetime.datetime,
                       end_date: datetime.datetime, fields=('*', )):
        self.search = self.search.filter("range", timestamp={
            "gte": start_date,
            "lt": end_date
        })
        self.search = self.search.filter(
            "terms",
            wind_turbine=turbines,
        )
        fields = ('timestamp', ) + fields
        self.search = self.search.source(include=fields)
        self.search = self.search.sort('timestamp')
        hits = list(self.search.scan())
        return hits_to_dataframe(hits)
def queryES(index, host):
    host_addr = 'http://' + host + ':9200/'
    client = Elasticsearch([host_addr])
    s = Search(using=client, index=index)
    # Keep the source-filtered search; scanning the original `s` would ignore the field selection.
    s = s.source(['hash', 'author_date', 'author'])
    response = s.scan()
    return response
def get_accounts(self, account_ids, size=1000):
    s = Search(using='objects', index="objects-account", extra={'size': size})
    s = s.filter('terms', id=account_ids)
    s = s.source(['id', 'name', 'options.voting_account'])
    s = s.params(clear_scroll=False)  # Avoid calling DELETE on ReadOnly apis.
    accounts = [hit.to_dict() for hit in s.scan()]
    return accounts
def _search(self, index, mapping, query, real_fields):
    query_dict = json.loads(query)
    e = Elasticsearch(es_url)
    search = Search(index=index, doc_type=mapping).update_from_dict(query_dict).using(e)
    search = search.source(real_fields)             # Select fields to return.
    search = search[0:query_dict.get("size", 10)]   # Select how many documents to return.
    response = search.execute()
    for hit in response:
        yield hit.to_dict()
def get_all_cans(index, estype=Types.candidate, fields=['id'], status=1, at_most=10000):
    s = Search(using=client)
    s = s.filter('term', _index=index)
    s = s.filter('term', _type=estype)
    s = s.filter('term', status=status)
    s = s.source(include=fields)
    s = s[:at_most]
    resp = s.execute()
    # print(resp.took)
    # print(resp.hits.total)
    return [hit['id'] for hit in resp]
def get_all_job_cans(index, estype=Types.job_candidate, fields=['id', 'job', 'candidate'],
                     status=None, at_most=10000):
    s = Search(using=client)
    s = s.filter('term', _index=index)
    s = s.filter('term', _type=estype)
    if status:
        s = s.filter('term', status=status)
    s = s.source(include=fields)
    s = s[:at_most]
    resp = s.execute()
    # print(resp.took)
    # print(resp.hits.total)
    return [{'id': hit['id'], 'job_id': hit['job'], 'can_id': hit['candidate']} for hit in resp]
def search_cans():
    work_years = 0
    salary_low = 1000
    salary_high = 50000
    status = 0
    job_3l_nums = ["13040342", "02250254", "02250166", "04550449", "00510085"]

    s = Search(using=client, index='can_tenant_chouun')
    # s = s.filter('term', _index='can_tenant_chouun')
    s = s.filter('term', _type='candidate')

    q = Q('nested', path='analysis',
          query=Q('term', **{'analysis.job_3l_num': job_3l_nums[0]}))
    for job_3l_num in job_3l_nums[1:]:
        q |= Q('nested', path='analysis',
               query=Q('term', **{'analysis.job_3l_num': job_3l_num}))

    # s = s.filter('term', status=status)
    s = s.query(q)
    s = s.query(Q('nested', path='analysis',
                  query=Q('range', **{'analysis.salary': {'gte': int(salary_low) - 3000}})))
    s = s.filter('range', years={
        'lte': datetime.date.today().year - int(work_years)
    })
    s = s.source(include=['id', 'analysis'])
    s = s[0:200]
    resp = s.execute()
    print(resp['took'])
    print(resp['hits']['total'])
def _build_query(self):
    query = Q()
    source = ['id']
    sort = []
    aggregations = {}
    query_string = None
    as_list = as_dict = False

    for action, value in self.steps:
        if action == 'order_by':
            for key in value:
                if key.startswith('-'):
                    sort.append({key[1:]: 'desc'})
                else:
                    sort.append(key)
        elif action == 'values':
            source.extend(value)
            as_list, as_dict = True, False
        elif action == 'values_dict':
            if value:
                source.extend(value)
            as_list, as_dict = False, True
        elif action == 'query':
            query &= self._process_queries(value)
        elif action == 'filter':
            query &= self._process_filters(value)
        elif action == 'source':
            source.extend(value)
        elif action == 'aggregate':
            aggregations.update(value)
        elif action == 'filter_query_string':
            query_string = value
        else:
            raise NotImplementedError(action)

    # If we have a raw query string we are going to apply all sorts
    # of boosts and filters to improve relevance scoring.
    #
    # We are using the same rules that `search.filters:SearchQueryFilter`
    # implements to have a single source of truth for how our
    # scoring works.
    from olympia.search.filters import SearchQueryFilter

    search = Search().query(query)

    if query_string:
        search = SearchQueryFilter().apply_search_query(query_string, search)

    if sort:
        search = search.sort(*sort)

    if source:
        search = search.source(source)

    body = search.to_dict()

    # These are manually added for now to simplify a partial port to
    # elasticsearch-dsl
    if self.start:
        body['from'] = self.start
    if self.stop is not None:
        body['size'] = self.stop - self.start

    if aggregations:
        body['aggs'] = aggregations

    self.source, self.as_list, self.as_dict = source, as_list, as_dict
    return body
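# A minimal sketch, assuming a query-builder object that exposes _build_query()
# as above, of how the resulting body dict can be handed to the low-level
# client. The host, builder variable, and index name are assumptions for
# illustration.
from elasticsearch import Elasticsearch

es = Elasticsearch()                            # assumed default localhost:9200
body = queryset._build_query()                  # hypothetical builder instance
result = es.search(index="addons", body=body)   # hypothetical index name
print(result["hits"]["total"])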
class EsRdfBulkLoader(object):
    """ Bulk loads data from the triplestore to elasticsearch """

    log_level = logging.DEBUG

    def __init__(self, rdf_class, tstore_conn, search_conn, **kwargs):
        log.setLevel(self.log_level)
        self.tstore_conn = tstore_conn
        self.search_conn = search_conn
        try:
            self.es_index = rdf_class.es_defs.get('kds_esIndex')[0]
            self.es_doc_type = rdf_class.es_defs.get('kds_esDocType')[0]
        except TypeError:
            log.warn("'%s' is NOT configured for indexing to elasticsearch",
                     rdf_class)
            return
        self.search = Search(using=search_conn.es).index(self.es_index)
        self.rdf_class = rdf_class
        self._set_es_workers(**kwargs)
        self.idx_start_time = XsdDatetime(datetime.datetime.utcnow())
        # add all of the subclasses for a rdf_class
        self.rdf_types = [rdf_class.uri] + [item.uri
                                            for item in rdf_class.subclasses]
        # self.query = self.items_query_template.format(
        #     rdf_types="\n\t\t".join(rdf_types),
        #     idx_start_time=XsdDatetime(datetime.datetime.utcnow()).sparql)
        EsMappings().initialize_indices()
        if kwargs.get("reset_idx"):
            self.delete_idx_status(self.rdf_class)
        self.count = 0
        kwargs['uri_list'] = self.get_uri_list()
        # self._index_group_with_subgroup(**kwargs)
        while len(kwargs['uri_list']) > 0:
            self._index_group_with_subgroup(**kwargs)
            kwargs['uri_list'] = self.get_uri_list()

    def _set_es_workers(self, **kwargs):
        """ Creates index worker instances for each class to index

        kwargs:
        -------
            idx_only_base[bool]: True will only index the base class
        """
        def make_es_worker(search_conn, es_index, es_doc_type, class_name):
            """ Returns a new es_worker instance

            args:
            -----
                search_conn: the connection to elasticsearch
                es_index: the name of the elasticsearch index
                es_doc_type: the name of the elasticsearch doctype
                class_name: name of the rdf class that is being indexed
            """
            new_esbase = copy.copy(search_conn)
            new_esbase.es_index = es_index
            new_esbase.doc_type = es_doc_type
            log.info("Indexing '%s' into ES index '%s' doctype '%s'",
                     class_name.pyuri, es_index, es_doc_type)
            return new_esbase

        def additional_indexers(rdf_class):
            """ returns additional classes to index based off of the es
            definitions """
            rtn_list = rdf_class.es_indexers()
            rtn_list.remove(rdf_class)
            return rtn_list

        self.es_worker = make_es_worker(self.search_conn,
                                        self.es_index,
                                        self.es_doc_type,
                                        self.rdf_class.__name__)
        if not kwargs.get("idx_only_base"):
            self.other_indexers = {item.__name__: make_es_worker(
                    self.search_conn,
                    item.es_defs.get('kds_esIndex')[0],
                    item.es_defs.get('kds_esDocType')[0],
                    item.__name__)
                for item in additional_indexers(self.rdf_class)}
        else:
            self.other_indexers = {}

    def _index_sub(self, uri_list, num, batch_num):
        """ Converts a list of uris to elasticsearch json objects

        args:
            uri_list: list of uris to convert
            num: the ending count within the batch
            batch_num: the batch number
        """
        bname = '%s-%s' % (batch_num, num)
        log.debug("batch_num '%s' starting es_json conversion", bname)
        qry_data = get_all_item_data([item[0] for item in uri_list],
                                     self.tstore_conn,
                                     rdfclass=self.rdf_class)
        log.debug("batch_num '%s-%s' query_complete | count: %s",
                  batch_num, num, len(qry_data))
        # path = os.path.join(CFG.dirs.cache, "index_pre")
        # if not os.path.exists(path):
        #     os.makedirs(path)
        # with open(os.path.join(path, bname + ".json"), "w") as fo:
        #     fo.write(json.dumps(qry_data))
        data = RdfDataset(qry_data)
        del qry_data
        log.debug("batch_num '%s-%s' RdfDataset Loaded", batch_num, num)
        for value in uri_list:
            try:
                self.batch_data[batch_num]['main'].append(
                        data[value[0]].es_json())
                self.count += 1
            except KeyError:
                pass
        for name, indexer in self.other_indexers.items():
            for item in data.json_qry("$.:%s" % name.pyuri):
                val = item.es_json()
                if val:
                    self.batch_data[batch_num][name].append(val)
                    self.batch_uris[batch_num].append(item.subject)
        del data
        del uri_list
        log.debug("batch_num '%s-%s' converted to es_json", batch_num, num)

    def get_uri_list(self, **kwargs):
        """ Returns a list of Uris to index """
        index_status_filter = """
                optional {{ ?s dcterm:modified ?modTime }} .
                optional {{ ?s kds:esIndexTime ?time }} .
                optional {{ ?s kds:esIndexError ?error }}
                filter (
                    !(bound(?time)) ||
                    ?time<?modTime ||
                    (bound(?error) && ?time < {idx_start_time}))
                """.format(idx_start_time=self.idx_start_time.sparql)
        items_query_template = """
                SELECT DISTINCT ?s ?es_id
                {{
                    VALUES ?rdftypes {{\n\t\t{rdf_types} }} .
                    ?s a ?rdftypes .
                    BIND(SHA1(STR(?s)) as ?es_id) .
                    {status_filter}
                }}
                {order_by}
                """
        status_filter = index_status_filter \
                if not kwargs.get("no_status") else ""
        order_by = kwargs.get("order_by", "")
        sparql = items_query_template.format(
                rdf_types="\n\t\t".join(self.rdf_types),
                status_filter=status_filter,
                order_by=order_by)
        results = [(Uri(item['s']['value']),
                    item['es_id']['value'],)
                   for item in self.tstore_conn.query(sparql=sparql)]
        return results  # [:100]

    def _index_group_with_subgroup(self, **kwargs):
        """ indexes all the URIs defined by the query into Elasticsearch """
        log.setLevel(self.log_level)
        # get a list of all the uri to index
        uri_list = kwargs.get('uri_list', self.get_uri_list())
        if not uri_list:
            log.info("0 items to index")
            return
        # results = results[:100]
        # Start processing through uri
        batch_file = os.path.join(CFG.dirs.logs, "batch_list.txt")
        # with open(batch_file, "w") as fo:
        #     fo.write("{")
        log.info("'%s' items to index", len(uri_list))
        self.time_start = datetime.datetime.now()
        batch_size = kwargs.get("batch_size", 12000)
        if len(uri_list) > batch_size:
            batch_end = batch_size
        else:
            batch_end = len(uri_list)
        batch_start = 0
        batch_num = 1
        self.batch_data = {}
        self.batch_data[batch_num] = {}
        self.batch_data[batch_num]['main'] = []
        self.batch_uris = {}
        self.batch_uris[batch_num] = []
        for name, indexer in self.other_indexers.items():
            self.batch_data[batch_num][name] = []
        end = False
        last = False
        final_list = []
        expand_index = kwargs.get("expand_index", True)
        while not end:
            log.debug("batch %s: %s-%s", batch_num, batch_start, batch_end)
            sub_batch = []
            j = 0
            for i in range(batch_start, batch_end):
                # for i, subj in enumerate(uri_list[batch_start:batch_end]):
                qry_size = kwargs.get("qry_size", 1000)
                if j < qry_size:
                    try:
                        sub_batch.append(uri_list.pop())  # subj)
                    except IndexError:
                        pass
                if j == qry_size - 1 or i == batch_end - 1:
                    try:
                        sub_batch.append(uri_list.pop())  # subj)
                    except IndexError:
                        pass
                    # with open(batch_file, "a") as fo:
                    #     fo.write(json.dumps({str('%s-%s' % (batch_num, i+1)):
                    #                          [item[0].sparql
                    #                           for item in sub_batch]})[1:-1]+",\n")
                    if not kwargs.get("no_threading", False):
                        th = threading.Thread(name=batch_start + i + 1,
                                              target=self._index_sub,
                                              args=(sub_batch, i+1, batch_num,))
                        th.start()
                    else:
                        self._index_sub(sub_batch, i+1, batch_num)
                    j = 0
                    final_list += sub_batch
                    sub_batch = []
                else:
                    j += 1
            log.debug(datetime.datetime.now() - self.time_start)
            if not kwargs.get("no_threading", False):
                main_thread = threading.main_thread()
                for t in threading.enumerate():
                    if t is main_thread:
                        continue
                    t.join()
            action_list = []
            for key, items in self.batch_data[batch_num].items():
                if key == 'main':
                    es_worker = self.es_worker
                else:
                    es_worker = self.other_indexers[key]
                action_list += es_worker.make_action_list(items)
            result = self.es_worker.bulk_save(action_list)
            final_list += self.batch_uris[batch_num]
            self._update_triplestore(result, action_list)
            del action_list
            del self.batch_uris[batch_num]
            del self.batch_data[batch_num]
            try:
                del pyrdf.memorized
                pyrdf.memorized = {}
            except AttributeError:
                pass
            while gc.collect() > 0:
                pass
            # pdb.set_trace()
            batch_end += batch_size
            batch_start += batch_size
            if last:
                end = True
            if len(uri_list) <= batch_size:
                batch_end = len(uri_list)
                last = True
            batch_num += 1
            self.batch_uris[batch_num] = []
            self.batch_data[batch_num] = {}
            self.batch_data[batch_num]['main'] = []
            for name, indexer in self.other_indexers.items():
                self.batch_data[batch_num][name] = []
            log.debug(datetime.datetime.now() - self.time_start)
        # with open(batch_file, 'rb+') as fo:
        #     fo.seek(-2, os.SEEK_END)
        #     fo.truncate()
        #     # fo.close()
        #     fo.write("}".encode())

    def _update_triplestore(self, es_result, action_list, **kwargs):
        """ updates the triplestore with successes of saves and failures of
        indexing

        Args:
        -----
            es_result: the elasticsearch result list
            action_list: list of elasticsearch action items that were indexed
        """
        idx_time = XsdDatetime(datetime.datetime.utcnow())
        uri_keys = {}
        bnode_keys = {}
        for item in action_list:
            try:
                uri_keys[item['_id']] = item['_source']["uri"]
            except KeyError:
                bnode_keys[item['_id']] = item['_id']
        error_dict = {}
        error_bnodes = {}
        if es_result[1]:
            for result in es_result[1]:
                err_item = list(result.values())[0]
                try:
                    error_dict[uri_keys.pop(err_item['_id'])] = \
                            XsdString(err_item['error']['reason'])
                except KeyError:
                    error_bnodes[bnode_keys.pop(err_item['_id'])] = \
                            XsdString(err_item['error']['reason'])
        if uri_keys:
            sparql_good = """
                DELETE
                {{
                    ?s kds:esIndexTime ?esTime .
                    ?s kds:esIndexError ?esError .
                }}
                INSERT
                {{
                    GRAPH ?g {{ ?s kds:esIndexTime {idx_time} }}.
                }}
                WHERE
                {{
                    VALUES ?s {{ {subj_list} }} .
                    {{
                        SELECT DISTINCT ?g ?s ?esTime ?esError
                        {{
                            GRAPH ?g {{ ?s ?p ?o }} .
                            OPTIONAL {{ ?s kds:esIndexTime ?esTime }}
                            OPTIONAL {{ ?s kds:esIndexError ?esError }}
                        }}
                    }}
                }}
                """.format(idx_time=idx_time.sparql,
                           subj_list="<%s>" % ">\n<".join(uri_keys.values()))
            self.tstore_conn.update_query(sparql_good)
        # Process any errors that were found.
        if not error_dict:
            return
        # Delete all indexing triples related to the error subjects
        sparql_error = """
            DELETE
            {{
                ?s kds:esIndexTime ?esTime .
                ?s kds:esIndexError ?esError .
            }}
            WHERE
            {{
                VALUES ?s {{ {subj_list} }} .
                OPTIONAL {{ ?s kds:esIndexTime ?esTime }}
                OPTIONAL {{ ?s kds:esIndexError ?esError }}
            }}
            """.format(subj_list="<%s>" % ">\n<".join(error_dict.keys()))
        self.tstore_conn.update_query(sparql_error)
        del sparql_error
        sparql_update = """
            INSERT
            {{
                GRAPH ?g {{
                    ?s kds:esIndexTime {idx_time} .
                    ?s kds:esIndexError ?esError .
                }}
            }}
            WHERE
            {{
                VALUES (?s ?esError) {{ {error_list} }} .
                {{
                    SELECT DISTINCT ?g ?s
                    {{
                        graph ?g {{?s ?p ?o}}
                    }}
                }}
            }}""".format(
                idx_time=idx_time.sparql,
                error_list="\n".join(["(<%s> %s)" % (key, val.sparql)
                                      for key, val in error_dict.items()]))
        # Create a turtle data stream of the new errors to upload into the
        # triplestore
        self.tstore_conn.update_query(sparql_update)
        del sparql_update

    def delete_idx_status(self, rdf_class):
        """ Removes all of the index status triples from the datastore

        Args:
        -----
            rdf_class: The class of items to remove the status from
        """
        sparql_template = """
            DELETE
            {{
                ?s kds:esIndexTime ?esTime .
                ?s kds:esIndexError ?esError .
            }}
            WHERE
            {{
                VALUES ?rdftypes {{\n\t\t{} }} .
                ?s a ?rdftypes .
                OPTIONAL {{ ?s kds:esIndexTime ?esTime }}
                OPTIONAL {{ ?s kds:esIndexError ?esError }}
                FILTER(bound(?esTime)||bound(?esError))
            }}
            """
        rdf_types = [rdf_class.uri] + [item.uri
                                       for item in rdf_class.subclasses]
        sparql = sparql_template.format("\n\t\t".join(rdf_types))
        log.warn("Deleting index status for %s", rdf_class.uri)
        return self.tstore_conn.update_query(sparql)

    def get_es_ids(self):
        """ reads all the elasticsearch ids for an index """
        search = self.search.source(['uri']).sort(['uri'])
        es_ids = [item.meta.id for item in search.scan()]
        return es_ids

    def validate_index(self, rdf_class):
        """ Compares the triplestore and elasticsearch index to ensure that
        the elasticsearch and triplestore items match. Elasticsearch records
        that are not in the triplestore will be deleted.
        """
        es_ids = set(self.get_es_ids())
        tstore_ids = set([item[1]
                          for item in self.get_uri_list(no_status=True)])
        diff = es_ids - tstore_ids
        if diff:
            pdb.set_trace()
            action_list = self.es_worker.make_action_list(diff,
                                                          action_type="delete")
            results = self.es_worker.bulk_save(action_list)