def find_tweetable(self, limit=10, timeout=30):
    """Return domains of GPC-supporting sites that are candidates for a tweet.

    Searches ``self.resource_index`` through ``self.es_client`` and returns
    the ``domain`` of each matching document, sorted by ``update_dt``.

    :param limit: maximum number of documents to fetch.
    :param timeout: value passed to Elasticsearch as ``request_timeout``.
    :return: list of domains.
    """
    # Only tweet about base domains (not subdomains) where the last scan
    # succeeded, a gpc.json was found, and it indicates support for GPC.
    required_terms = (
        ('resource.keyword', 'gpc'),
        ('status.keyword', 'ok'),
        ('scan_data.found', True),
        ('scan_data.gpc.parsed.gpc', True),
        ('is_base_domain', True),
    )
    # Don't tweet about sites we've previously tweeted about (or may have).
    # `tweeting` may have been set and the process failed before `tweeted`
    # could be set; in that case it's unclear whether the tweet went out, so
    # those sites need to be checked manually.
    excluded_terms = (
        ('gpcsup.tweeting', True),
        ('gpcsup.tweeted', True),
    )

    query = Search(using=self.es_client, index=self.resource_index)
    for field, value in required_terms:
        query = query.filter('term', **{field: value})
    for field, value in excluded_terms:
        query = query.exclude('term', **{field: value})

    query = query.sort('update_dt')[:limit].params(request_timeout=timeout)
    return [hit.domain for hit in query.execute()]
def run_query_fatcat(query: str, fulltext_only: bool, json_output: Any) -> None:
    """
    Queries fatcat search index (the full regular fatcat.wiki release index)
    for search string passed (and some filters), iterates over the result set
    (using scroll), and fetches full release entity (via api.fatcat.wiki) for
    each.

    One JSON object per hit is printed to ``json_output``, with the keys
    ``fatcat_hit``, ``release_id`` and ``fatcat_release``. The expected hit
    count is printed to stderr before iteration starts.

    :param query: query string, matched against the ``biblio`` field with
        ``AND`` as the default operator.
    :param fulltext_only: when true, only releases with ``in_ia`` set are
        returned.
    :param json_output: writable text stream the JSON lines are printed to.
    :raises requests.HTTPError: if fetching a release entity fails
        (``raise_for_status``).

    TODO: group by work_id
    """
    api_session = requests_retry_session()

    es_backend = os.environ.get(
        "ELASTICSEARCH_FATCAT_BASE", "https://search.fatcat.wiki"
    )
    es_index = os.environ.get("ELASTICSEARCH_FATCAT_RELEASE_INDEX", "fatcat_release")
    es_client = elasticsearch.Elasticsearch(es_backend)

    search = Search(using=es_client, index=es_index)
    search = search.exclude("terms", release_type=["stub", "component", "abstract"])
    # "Emerald Expert Briefings"
    search = search.exclude("terms", container_id=["fnllqvywjbec5eumrbavqipfym"])
    # ResearchGate
    search = search.exclude("terms", doi_prefix=["10.13140"])

    if fulltext_only:
        # A single boolean is matched here, so this must be a `term` query;
        # a `terms` query expects a list of values.
        search = search.filter("term", in_ia=True)

    search = search.query(
        Q("query_string", query=query, default_operator="AND", fields=["biblio"])
    )

    print(f"Expecting {search.count()} search hits", file=sys.stderr)

    # Only the hit metadata (the release id) is needed from the index; the
    # full entity is fetched from the API below.
    search = search.params(clear_scroll=False)
    search = search.params(_source=False)

    results = search.scan()
    for hit in results:
        release_id = hit.meta.id
        resp = api_session.get(
            f"https://api.fatcat.wiki/v0/release/{release_id}",
            params={
                "expand": "container,files,filesets,webcaptures",
                "hide": "references",
            },
        )
        resp.raise_for_status()
        row = dict(
            fatcat_hit=hit.meta._d_,
            release_id=release_id,
            fatcat_release=resp.json(),
        )
        print(json.dumps(row, sort_keys=True), file=json_output)
def remove_cropped_if_asset_exists(asset):
    """Delete the indexed metadata and cropped documents belonging to ``asset``.

    Looks up documents in the asset-meta index whose ``asset_id`` matches
    ``asset.asset_id``. For every hit, the documents in the cropped index
    whose ``id`` matches ``"<asset_id>-<cropped_id>"`` are deleted, and
    finally the matching asset-meta documents themselves are deleted.

    This is best-effort: any error is reported by printing the exception
    type and is not re-raised.

    :param asset: object exposing an ``asset_id`` attribute.
    """
    try:
        index_prefix = cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_prefix)
        meta_index = index_prefix + cfg.resolve(cfg.ELASTICSEARCH_SERVER,
                                                cfg.index_asset_meta)
        cropped_index = index_prefix + cfg.resolve(cfg.ELASTICSEARCH_SERVER,
                                                   cfg.index_cropped)

        search = Search(index=meta_index)
        search.query = Q('match', asset_id=asset.asset_id)

        for hit in search:
            cropped_doc_id = '{}-{}'.format(asset.asset_id, hit.cropped_id)
            cropped_search = Search(index=cropped_index)
            cropped_search.query = Q('match', id=cropped_doc_id)
            cropped_search.delete()

        search.delete()
    except Exception:
        # Catch Exception rather than everything so that KeyboardInterrupt
        # and SystemExit are not swallowed.
        print(sys.exc_info()[0])
def related_images(uuid, index, request, filter_dead):
    """
    Given a UUID, find related search results.

    :param uuid: identifier of the item to find recommendations for.
    :param index: the Elasticsearch index to search.
    :param request: passed through to result post-processing.
    :param filter_dead: passed through to slicing and post-processing.
    :return: tuple of (results, result count) for the first page of 10.
    """
    # Convert UUID to sequential ID.
    lookup = Search(index=index).query('match', identifier=uuid)
    sequential_id = lookup.execute().hits[0].id

    more_like_this = {
        'fields': ['tags.name', 'title', 'creator'],
        'like': {
            '_index': index,
            '_id': sequential_id
        },
        'min_term_freq': 1,
        'max_query_terms': 50,
    }
    related = Search(index=index).query('more_like_this', **more_like_this)
    # Never show mature content in recommendations.
    related = related.exclude('term', mature=True)

    page_size, page = 10, 1
    start, end = _get_query_slice(related, page_size, page, filter_dead)
    related = related[start:end]
    response = related.execute()

    results = _post_process_results(related, start, end, page_size, response,
                                    request, filter_dead)
    result_count, _ = _get_result_and_page_count(response, results, page_size)
    return results, result_count
def exclude_filtered_providers(s: Search) -> Search:
    """
    Hide data sources from the catalog dynamically.

    This excludes providers with ``filter_content`` enabled from the search
    results. The list of filtered providers is read from the cache under the
    key ``filtered_providers`` and, when absent, loaded from the database and
    cached for ``FILTER_CACHE_TIMEOUT``.

    :param s: the search query to issue to Elasticsearch
    :return: the modified search query
    """
    logger = parent_logger.getChild("exclude_filtered_providers")
    filter_cache_key = "filtered_providers"
    filtered_providers = cache.get(key=filter_cache_key)
    if filtered_providers is None:
        filtered_providers = ContentProvider.objects.filter(
            filter_content=True).values("provider_identifier")
        logger.debug("adding filtered providers to cache")
        cache.set(
            key=filter_cache_key,
            timeout=FILTER_CACHE_TIMEOUT,
            value=filtered_providers,
        )

    # The rows are dicts, so extract the identifiers before joining them for
    # the log line; joining the rows themselves raises TypeError.
    to_exclude = [f["provider_identifier"] for f in filtered_providers]
    logger.info(f'filtered_providers={",".join(to_exclude)}')
    if to_exclude:
        logger.info("auto-excluding filtered providers")
        s = s.exclude("terms", provider=to_exclude)
    return s
def get_warnings_by_package(package_name, package_warnings):
    '''
    Counts all the warnings for a specific package, grouped by type and
    severity.

    Warnings tagged ``test_code`` are excluded.

    Arguments:
        package_name: the package in the database
        package_warnings: a dict keyed by warning type that is populated in
            place; each value is a dict mapping severity to the number of
            warnings seen with that type and severity

    Returns:
        None, but populates the package_warnings dict
    '''
    client = Elasticsearch(host=HOST)
    s = Search(using=client)
    s = s.source(['package', 'type', 'severity', 'score'])
    s = s.query("match", package__keyword=package_name)
    s = s.exclude("match", tag="test_code")

    # process the query
    for hit in s.scan():
        severity_counts = package_warnings.setdefault(hit.type, {})
        # The first occurrence counts as 1 (it was previously recorded as 0,
        # leaving every count one too low).
        severity_counts[hit.severity] = severity_counts.get(hit.severity, 0) + 1
def init_last_datetime(**kwargs):
    """Store the count of documents without an ``is_english`` field.

    Counts the documents in ``ES_INDEX_DOCUMENT`` for which the
    ``is_english`` field does not exist and saves that number in the Airflow
    Variable ``lemmatize_number_of_documents_eng``.

    :param kwargs: accepted but not used by this function.
    """
    from airflow.models import Variable
    from elasticsearch_dsl import Search

    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT

    unflagged_documents = Search(
        using=ES_CLIENT, index=ES_INDEX_DOCUMENT,
    ).exclude('exists', field="is_english")
    Variable.set("lemmatize_number_of_documents_eng",
                 unflagged_documents.count())
def get(self, request, *args, **kwargs):
    """Return up to 10 published contents matching ``q`` as a JSON response.

    Reads ``q`` (the search text) and ``excluded`` (comma-separated content
    ids to leave out) from the query string. Matches are searched on the
    ``title`` and ``description`` fields, and scores are multiplied by the
    configured boost for tutorials, articles and opinions. When
    Elasticsearch is not connected or there is no search text, the result
    list is empty.

    NOTE(review): ``self.search_query`` is only assigned here when ``q`` is
    present; presumably the class provides a default - confirm.
    """
    if "q" in request.GET:
        self.search_query = "".join(request.GET["q"])
    # ``split`` always yields at least one element; [""] means no exclusions.
    excluded_content_ids = request.GET.get("excluded", "").split(",")
    has_exclusions = excluded_content_ids != [""]

    results = []
    if self.index_manager.connected_to_es and self.search_query:
        self.authorized_forums = get_authorized_forums(self.request.user)

        search_queryset = Search()
        if has_exclusions:
            search_queryset = search_queryset.exclude(
                "terms", content_pk=excluded_content_ids)

        text_match = MultiMatch(query=self.search_query,
                                fields=["title", "description"])
        query = Match(_type="publishedcontent") & text_match

        boosts = settings.ZDS_APP["search"]["boosts"]["publishedcontent"]
        boosted_types = (
            ("TUTORIAL", "if_tutorial"),
            ("ARTICLE", "if_article"),
            ("OPINION", "if_opinion"),
        )
        functions_score = [
            {"filter": Match(content_type=content_type),
             "weight": boosts[boost_key]}
            for content_type, boost_key in boosted_types
        ]

        scored_query = FunctionScore(query=query,
                                     boost_mode="multiply",
                                     functions=functions_score)
        search_queryset = search_queryset.query(scored_query)[:10]

        # Build the result
        results = [
            {
                "id": hit.content_pk,
                "pubdate": hit.publication_date,
                "title": str(hit.title),
                "description": str(hit.description),
            }
            for hit in search_queryset.execute()
        ]

    return HttpResponse(json_handler.dumps({"results": results}),
                        content_type="application/json")
def _build_search(self, index, **kwargs):
    """
    Internal method building the query with the elasticsearch-dsl package.

    Recognised keyword arguments (all optional): ``startdate``, ``enddate``
    (defaults to ``'now'``), ``timefield``, ``filters``, ``exclude``,
    ``ranges``, ``field_to_include``, ``wildcard``, ``from_``, ``size`` and
    ``sort``. The resulting query body is logged at info level.

    :param index: index for search
    :param kwargs: see getDocumentsCount and getDocuments
    :return: the configured ``Search`` object
    """
    search = Search(using=self.es, index=index, doc_type=self.doc_type)
    search = search.params(request_timeout=2000)

    # Time window: the upper bound is inclusive only when both ends coincide.
    startdate = kwargs.get('startdate')
    if startdate:
        timefield = kwargs.get('timefield')
        enddate = kwargs.get('enddate', 'now')
        upper_bound = 'lt' if startdate != enddate else 'lte'
        window = {timefield: {'gte': startdate, upper_bound: enddate}}
        search = search.filter('range', **window)

    filters = kwargs.get('filters')
    if filters:
        for field, wanted in filters.items():
            kind = 'terms' if isinstance(wanted, list) else 'term'
            search = search.filter(kind, **{field: wanted})

    exclude = kwargs.get('exclude')
    if exclude:
        for field, unwanted in exclude.items():
            search = search.exclude('terms', **{field: unwanted})

    # ranges are expected in format:
    # [{field:{'gte':value, 'lte':value}}, {field: {'gte': value}}, {field: {'lte': value}}]
    ranges = kwargs.get('ranges')
    if ranges:
        for range_filter in ranges:
            search = search.filter('range', **range_filter)

    fields_to_include = kwargs.get('field_to_include')
    if fields_to_include:
        for option, value in fields_to_include.items():
            search = search.source(**{option: value})

    wildcards = kwargs.get('wildcard')
    if wildcards:
        for field in wildcards:
            search = search.filter('wildcard', **{field: wildcards[field]})

    start_from = kwargs.get('from_')
    if start_from:
        search = search.extra(from_=start_from)

    size = kwargs.get('size')
    if size:
        search = search.extra(size=size)

    sort_ = kwargs.get('sort')
    if sort_:
        search = search.sort(*sort_)

    self._logger.info(json.dumps(search.to_dict()))
    return search
def make_elasticsearch(index, filters, queries=None, exclusion_filters=None,
                       range_filters=None, prefix_filters=None,
                       terms_filters=None,
                       es_url='http://elasticsearch.lco.gtn:9200'):
    """
    Make an ElasticSearch query

    Parameters
    ----------
    index : str
            Name of index to search
    filters : list of dicts
              Each dict has a criterion for an ElasticSearch "term" filter
    queries : list of dicts
              Each dict has a "type" and "query" entry. The 'query' entry is
              a dict that has a criterion for an ElasticSearch "query"
    exclusion_filters : list of dicts
                        Each dict has a criterion for an ElasticSearch
                        "exclude" (applied as a "term" exclusion)
    range_filters : list of dicts
                    Each dict has a criterion for an ElasticSearch
                    "range" filter
    prefix_filters : list of dicts
                     Each dict has a criterion for an ElasticSearch
                     "prefix" filter
    terms_filters : list of dicts
                    Each dict has a criterion for an ElasticSearch
                    "terms" filter
    es_url : str
             URL of the ElasticSearch host

    Returns
    -------
    search : elasticsearch_dsl.Search
             The ElasticSearch object
    """
    # Optional criteria default to "nothing to apply".
    queries = [] if queries is None else queries
    exclusion_filters = [] if exclusion_filters is None else exclusion_filters
    range_filters = [] if range_filters is None else range_filters
    terms_filters = [] if terms_filters is None else terms_filters
    prefix_filters = [] if prefix_filters is None else prefix_filters

    search = Search(using=Elasticsearch(es_url), index=index)

    filter_groups = (
        ('term', filters),
        ('terms', terms_filters),
        ('range', range_filters),
        ('prefix', prefix_filters),
    )
    for filter_type, criteria in filter_groups:
        for criterion in criteria:
            search = search.filter(filter_type, **criterion)

    for criterion in exclusion_filters:
        search = search.exclude('term', **criterion)

    for query in queries:
        search = search.query(query['type'], **query['query'])

    return search
def get(self, request, *args, **kwargs):
    """Return up to 10 published contents matching ``q`` as a JSON response.

    Reads ``q`` (the search text) and ``excluded`` (comma-separated content
    ids to leave out) from the query string. Matches are searched on the
    ``title`` and ``description`` fields, and scores are multiplied by the
    configured boost for tutorials, articles and opinions. When
    Elasticsearch is not connected or there is no search text, the result
    list is empty.

    NOTE(review): ``self.search_query`` is only assigned here when ``q`` is
    present; presumably the class provides a default - confirm.
    """
    if 'q' in request.GET:
        self.search_query = ''.join(request.GET['q'])
    # ``split`` always yields at least one element; [''] means no exclusions.
    excluded_content_ids = request.GET.get('excluded', '').split(',')
    has_exclusions = excluded_content_ids != ['']

    results = []
    if self.index_manager.connected_to_es and self.search_query:
        self.authorized_forums = get_authorized_forums(self.request.user)

        search_queryset = Search()
        if has_exclusions:
            search_queryset = search_queryset.exclude(
                'terms', content_pk=excluded_content_ids)

        text_match = MultiMatch(query=self.search_query,
                                fields=['title', 'description'])
        query = Match(_type='publishedcontent') & text_match

        boosts = settings.ZDS_APP['search']['boosts']['publishedcontent']
        boosted_types = (
            ('TUTORIAL', 'if_tutorial'),
            ('ARTICLE', 'if_article'),
            ('OPINION', 'if_opinion'),
        )
        functions_score = [
            {'filter': Match(content_type=content_type),
             'weight': boosts[boost_key]}
            for content_type, boost_key in boosted_types
        ]

        scored_query = FunctionScore(query=query,
                                     boost_mode='multiply',
                                     functions=functions_score)
        search_queryset = search_queryset.query(scored_query)[:10]

        # Build the result
        results = [
            {
                'id': hit.content_pk,
                'pubdate': hit.publication_date,
                'title': str(hit.title),
                'description': str(hit.description)
            }
            for hit in search_queryset.execute()
        ]

    return HttpResponse(json_handler.dumps({'results': results}),
                        content_type='application/json')
def test_exclude_org(self):
    """Test that exclude_org adds an organization name exclusion filter.
    """
    search = Search()
    # Replace ``exclude`` with a stub so the call can be inspected.
    search.exclude = MagicMock(return_value='test')

    outcome = esc.exclude_org(search, esc.UNKNOWN_ORG_NAME)

    search.exclude.assert_called_with(
        'term', author_org_name=esc.UNKNOWN_ORG_NAME)
    self.assertEqual(outcome, 'test')
def test_filters():
    """Print the query bodies built with ``filter``, ``query('bool')`` and ``exclude``."""
    tags = ['search', 'python']

    # Shortcut form.
    s = Search().filter('terms', tags=tags)
    print(s.to_dict())
    # {'query': {'bool': {'filter': [{'terms': {'tags': ['search', 'python']}}]}}}

    # The same filter written as an explicit bool query. It is applied on top
    # of the search built above, so it is combined with the existing filter.
    s = s.query('bool', filter=[Q('terms', tags=tags)])
    print(s.to_dict())

    # Exclusion form.
    s = s.exclude('terms', tags=tags)
    # Or, equivalently:
    # s = s.query('bool', filter=[~Q('terms', tags=['search', 'python'])])
    print(s.to_dict())
def _exclude_filtered(s: Search):
    """
    Hide data sources from the catalog dynamically.

    Providers with ``filter_content`` enabled are read from the cache (key
    ``filtered_providers``), falling back to the database and refreshing the
    cache when the cached value is missing or empty.

    :param s: the search to restrict.
    :return: the search with a ``terms`` exclusion on ``provider``.
    """
    cache_key = 'filtered_providers'
    providers = cache.get(key=cache_key)
    if not providers:
        providers = models.ContentProvider.objects.filter(
            filter_content=True,
        ).values('provider_identifier')
        cache.set(key=cache_key,
                  timeout=FILTER_CACHE_TIMEOUT,
                  value=providers)

    hidden = [row['provider_identifier'] for row in providers]
    return s.exclude('terms', provider=hidden)
def get_all_warnings_x(warning_type, all_warnings):
    """
    Collect every warning of one type, across all packages.

    Warnings tagged ``test_code`` are excluded. Fields missing from a
    document (severity, score, line, line_no) are reported as ``None``.

    Arguments
    ---------
    warning_type : the name of the warning type
    all_warnings : a dict keyed by package name, empty to start; each value
        becomes a list of dicts describing the warnings of that type

    Returns:
    ---------
    None, but populates all_warnings
    """
    client = Elasticsearch(host=HOST)
    s = (
        Search(using=client)
        .source(['package', 'type', 'severity', 'score', 'line', 'line_no'])
        .query("match", type=warning_type)
        .exclude("match", tag="test_code")
    )

    # process the query
    for hit in s.scan():
        all_warnings.setdefault(hit.package, []).append({
            'warning_type': warning_type,
            'severity': getattr(hit, "severity", None),
            'score': getattr(hit, "score", None),
            'line': getattr(hit, "line", None),
            'line_no': getattr(hit, "line_no", None)
        })
def searchAllByGeonameid(geonameid):
    """Return the source of the first geoname document matching ``geonameid``.

    Only documents that have an ``admin2Code`` field and whose
    ``featureClass`` matches neither ``A`` nor ``P`` are considered.

    :param geonameid: value matched against the ``geonameid`` field.
    :return: the ``_source`` dict of the first hit, or ``None`` when the
        reported total is zero.
    """
    query = (
        Search(using=ELASTICSEARCHAUTHENTIFICATION, index=GEONAMESINDEX)[0:5]
        .query("match", geonameid=geonameid)
        .query("exists", field="admin2Code")
        .exclude("match", featureClass="A")
        .exclude("match", featureClass="P")
    )

    hits = query.execute().to_dict()['hits']
    if hits['total']['value'] > 0:
        return hits['hits'][0]['_source']
    return None
def get_all_warnings_counts_x(warning_type, all_warnings, all_unique_warnings,
                              all_severities, all_raw_scores):
    """
    Tally warnings of one type per package into the supplied dictionaries.

    Warnings tagged ``test_code`` are excluded.

    Arguments
    ---------
    warning_type : the AuraScan warning type to search the database for
    all_warnings : a dict that stores the total number of warnings, keyed by
        package
    all_unique_warnings : a dict keyed by package; each value is a dict in
        which ``warning_type`` is set to 1 when first seen
    all_severities : a dict keyed by package that accumulates the value of
        ``get_score_percentiles`` for each warning's score
    all_raw_scores : a list of all the scores of all packages, passed to
        ``get_score_percentiles``

    Returns
    ---------
    None, but populates all_warnings, all_unique_warnings, and all_severities
    """
    client = Elasticsearch(host=HOST)
    s = (
        Search(using=client)
        .params(request_timeout=30)
        .source(['package', 'type', 'severity', 'score'])
        .query("match", type=warning_type)
        .exclude("match", tag="test_code")
    )

    # process the query
    for hit in s.scan():
        for optional_field in ("severity", "score"):
            if not hasattr(hit, optional_field):
                setattr(hit, optional_field, None)

        package = hit.package

        all_warnings.setdefault(package, 0)
        all_warnings[package] += 1

        all_unique_warnings.setdefault(package, {}).setdefault(warning_type, 1)

        all_severities.setdefault(package, 0)
        # NOTE(review): a hit without a score reaches int(None) here and
        # raises TypeError - confirm whether such documents can occur.
        all_severities[package] += get_score_percentiles(
            all_raw_scores, int(hit.score))
def get_LOC_by_warning(package_name):
    """
    Get all the flagged lines of code for a specific package.

    Warnings tagged ``test_code`` are excluded.

    Arguments
    ---------
    package_name : the package you want warnings for (do not specify a
        version)

    Returns
    ---------
    warnings : a list of ``[line, line_no, location, type, severity]``
        entries; line, line_no and location are ``None`` when the document
        lacks them
    """
    client = Elasticsearch(host=HOST)
    s = (
        Search(using=client)
        .source([
            'package', 'type', 'severity', 'score', 'line', 'line_no',
            'location'
        ])
        .query("match", package__keyword=package_name)
        .exclude("match", tag="test_code")
    )

    # process the query
    optional_fields = ("line", "line_no", "location")
    results = []
    for hit in s.scan():
        row = [getattr(hit, name, None) for name in optional_fields]
        row.extend([hit.type, hit.severity])
        results.append(row)
    return results
def example10():
    """Walk through the elasticsearch_dsl API step by step.

    DSL objects are used for common entities instead of dict/json; all of
    them are importable from elasticsearch_dsl. The steps run against the
    configured connections, so a reachable cluster is required.
    """
    from datetime import date

    from elasticsearch_dsl import (
        Date,
        Document,
        InnerDoc,
        Keyword,
        Nested,
        Q,
        Search,
        Text,
        analyzer,
        connections,
        token_filter,
    )

    # Straightforward mapping to json - kwargs are translated into keys into
    # json. You can use the to_dict() method to see the result json.
    q = Q("terms", tags=["python", "search"])
    q.to_dict()

    # All objects can also be constructed using the raw dict.
    q = Q({"terms": {"tags": ["python", "search"]}})
    q.to_dict()

    # Query objects support logical operators which result in bool queries.
    q = q | Q("match", title="python")
    q.to_dict()

    # DSL objects also allow for attribute access instead of ['key'].
    q.minimum_should_match = 2
    q.minimum_should_match
    q.to_dict()

    q = q & Q("range", **{"@timestamp": {"lt": date(2019, 1, 1)}})
    q.to_dict()

    # Configuration is global so no client needs to be passed around.
    # The default connection is used where no other connection is specified.
    # Any configuration methods just pass all parameters to the underlying
    # elasticsearch-py client.
    connections.create_connection(hosts=["localhost"])

    # Optionally specify an alias for the connection in case of multiple
    # connections.
    connections.create_connection("prod", hosts=["localhost"])
    s = Search(using="prod")
    s.count()

    # You can always just pass in your own client instance.
    s = Search(using=Elasticsearch())
    s.count()

    # Any method on Search returns a clone so you need to always assign it
    # back to the same variable.
    s = Search()
    s = s.params(q="fix")

    # Multiple queries are combined together using the AND operator.
    s = Search()
    s = s.query("match", description="fix")
    s = s.query("match", author="Honza")

    # Filter shortcut to use {bool: {filter: []}}.
    s = s.filter("range", committed_date={"lt": date(2016, 1, 1)})
    s.to_dict()

    # Exclude as a wrapper around must_not, use __ instead of dots for
    # convenience.
    s = s.exclude("term", committer__name__keyword="Honza Král")

    # Search is executed when iterated on or when .execute() is called.
    for hit in s:
        # Hit class offers direct access to fields and via .meta any other
        # properties on the returned hit (_id, _seq_no, ...).
        print(f"{hit.meta.id[:6]} ({hit.author.name}): {hit.description[:50]}")

    # Aggregations are implemented in place to allow for chaining.
    s = Search(index="git")
    s.aggs.bucket("tags", "terms", field="terms").metric(
        "lines", "sum", field="stats.lines"
    ).metric("authors", "cardinality", field="author.name.keyword")
    r = s.execute()

    # Or modify aggregation in place.
    s.aggs["tags"].bucket(
        "months", "date_histogram", field="committed_date", interval="month"
    )

    # Analysis
    a = analyzer(
        "file_analyzer",
        tokenizer="path_hierarchy",
        filter=[
            "lowercase",
            token_filter(
                "split_ext",
                "pattern_capture",
                preserve_original=True,
                patterns=[r"^([^\.]+)"],
            ),
        ],
    )
    a.simulate("test/integration/search.py")

    class FileDiff(InnerDoc):
        filename = Text(analyzer=a)
        patch = Text()

    class Commit(Document):
        description = Text()
        committed_date = Date()
        author = Text(fields={"keyword": Keyword()})
        files = Nested(FileDiff)

        def subject(self):
            return self.description.split("\n", 1)[0][:80]

        class Index:
            name = "git*"
            settings = {"number_of_replicas": 0}

    # Create the index.
    Commit.init(index="git-v2")

    # Search now returns Commit objects.
    for c in Commit.search():
        print(f"{c.meta.id}: {c.subject()}")
def search(search_params, index, page_size, ip, request, filter_dead,
           page=1) -> Tuple[List[Hit], int, int, object]:
    """
    Given a set of keywords and an optional set of filters, perform a ranked
    paginated search.

    :param search_params: Search parameters. See
     :class: `ImageSearchQueryStringSerializer`.
    :param index: The Elasticsearch index to search (e.g. 'image')
    :param page_size: The number of results to return per page.
    :param ip: The user's hashed IP. Hashed IPs are used to anonymously but
    uniquely identify users exclusively for ensuring query consistency across
    Elasticsearch shards.
    :param request: Django's request object.
    :param filter_dead: Whether dead links should be removed.
    :param page: The results page number.
    :return: Tuple with a List of Hits from elasticsearch, the total count of
    pages, the total count of results, and the query suggestion.
    :raises ValueError: if Elasticsearch rejects the request.
    """
    s = Search(index=index)
    # Apply term filters. Each tuple pairs a filter's parameter name in the API
    # with its corresponding field in Elasticsearch. "None" means that the
    # names are identical.
    filters = [('extension', None), ('categories', None),
               ('aspect_ratio', None), ('size', None), ('source', 'provider'),
               ('license', 'license__keyword'),
               ('license_type', 'license__keyword')]
    for api_field, elasticsearch_field in filters:
        s = _apply_filter(s, search_params, api_field, elasticsearch_field)

    # Get suggestions for any route
    s = s.suggest('get_suggestion', '', term={'field': 'creator'})

    # Exclude mature content unless explicitly enabled by the requester
    if not search_params.data['mature']:
        s = s.exclude('term', mature=True)

    # Hide data sources from the catalog dynamically.
    filter_cache_key = 'filtered_providers'
    filtered_providers = cache.get(key=filter_cache_key)
    if not filtered_providers:
        filtered_providers = models.ContentProvider.objects\
            .filter(filter_content=True)\
            .values('provider_identifier')
        cache.set(key=filter_cache_key,
                  timeout=CACHE_TIMEOUT,
                  value=filtered_providers)
    to_exclude = [f['provider_identifier'] for f in filtered_providers]
    s = s.exclude('terms', provider=to_exclude)

    # Search either by generic multimatch or by "advanced search" with
    # individual field-level queries specified.
    search_fields = ['tags.name', 'title', 'description']
    if 'q' in search_params.data:
        query = _quote_escape(search_params.data['q'])
        s = s.query('simple_query_string', query=query, fields=search_fields)
        # Get suggestions for term query
        s = s.suggest('get_suggestion', query, term={'field': 'creator'})
    else:
        if 'creator' in search_params.data:
            creator = _quote_escape(search_params.data['creator'])
            s = s.query('simple_query_string',
                        query=creator,
                        fields=['creator'])
            # Get suggestions for creator
            s = s.suggest('get_suggestion', creator, term={'field': 'creator'})
        if 'title' in search_params.data:
            title = _quote_escape(search_params.data['title'])
            s = s.query('simple_query_string', query=title, fields=['title'])
            # Get suggestions for title
            s = s.suggest('get_suggestion', title, term={'field': 'title'})
        if 'tags' in search_params.data:
            tags = _quote_escape(search_params.data['tags'])
            s = s.query('simple_query_string',
                        fields=['tags.name'],
                        query=tags)
            # Get suggestions for tags
            s = s.suggest('get_suggestion', tags, term={'field': 'tags.name'})

    # Boost by popularity metrics
    if POPULARITY_BOOST:
        factors = ['comments', 'views', 'likes']
        boost_factor = 100 / len(factors)
        queries = [
            Q('rank_feature', field=factor, boost=boost_factor)
            for factor in factors
        ]
        # NOTE(review): this starts from a fresh Search(), so the index and
        # the suggesters configured above are not carried over to the boosted
        # search. Confirm whether that is intended.
        s = Search().query(
            Q('bool', must=s.query, should=queries, minimum_should_match=1))

    # Use highlighting to determine which fields contribute to the selection of
    # top results.
    s = s.highlight(*search_fields)
    s = s.highlight_options(order='score')
    # Search methods return a modified copy, so the result must be assigned
    # back for track_scores to take effect.
    s = s.extra(track_scores=True)
    # Route users to the same Elasticsearch worker node to reduce
    # pagination inconsistencies and increase cache hits.
    s = s.params(preference=str(ip), request_timeout=7)
    # Paginate
    start, end = _get_query_slice(s, page_size, page, filter_dead)
    s = s[start:end]
    try:
        search_response = s.execute()
        log.info(f'query={s.to_dict()}, es_took_ms={search_response.took}')
    except RequestError as e:
        raise ValueError(e) from e
    results = _post_process_results(s, start, end, page_size, search_response,
                                    request, filter_dead)

    suggestion = _query_suggestions(search_response)

    result_count, page_count = _get_result_and_page_count(
        search_response, results, page_size)

    return results, page_count, result_count, suggestion
def query_fatcat(json_output):
    """
    Queries fatcat search index (the full regular fatcat.wiki release index)
    for COVID-19 keywords and phrases, iterates over the result set (using
    scroll), and fetches full release entity (via api.fatcat.wiki) for each.

    One JSON object per hit is printed to ``json_output``, with the keys
    ``fatcat_hit``, ``release_id`` and ``fatcat_release``. The expected hit
    count is printed to stderr first.
    """
    api_session = requests_retry_session()

    es_backend = os.environ.get(
        "ELASTICSEARCH_BACKEND",
        default="https://search.fatcat.wiki",
    )
    es_index = "fatcat_release"
    es_client = elasticsearch.Elasticsearch(es_backend)

    # Title phrases that match the keywords but are unrelated to the topic.
    unrelated_title_phrases = [
        # some industrial thing
        '"Report on SARS backfit evaluation"',
        # physic experiment
        '"TOF-SARS"',
        # species not related to SARS
        # something based on excluding "lake" in title might be easier?
        '"G.O. Sars"',
        '"Gomphocythere Sars"',
        '"Australis Sars"',
        '"scutifer Sars"',
        '"lumholtzi Sars"',
    ]

    search = Search(using=es_client, index=es_index)
    search = search.exclude("terms",
                            release_type=["stub", "component", "abstract"])
    # "Emerald Expert Briefings"
    search = search.exclude("terms",
                            container_id=["fnllqvywjbec5eumrbavqipfym"])
    # ResearchGate
    search = search.exclude("terms", doi_prefix=["10.13140"])
    for phrase in unrelated_title_phrases:
        search = search.exclude("query_string", query=phrase, fields=["title"])

    keyword_query = Q(
        "query_string",
        query=
        '"COVID-19" coronavirus coronaviruses "sars-cov-2" "2019-nCoV" "SARS-CoV" "MERS-CoV" SARS',
        default_operator="OR",
        fields=["title", "original_title"])
    pandemic_flu_query = Q("query_string",
                           query='pandemic influenza',
                           default_operator="AND",
                           fields=["biblio"])
    epidemic_flu_query = Q("query_string",
                           query='epidemic influenza',
                           default_operator="AND",
                           fields=["biblio"])
    ventilator_query = Q("query_string",
                         query='pandemic ventilator',
                         default_operator="AND",
                         fields=["biblio"])
    search = search.query(keyword_query | pandemic_flu_query
                          | epidemic_flu_query | ventilator_query)

    print("Expecting {} search hits".format(search.count()), file=sys.stderr)

    # Only the hit metadata is needed from the index; the full entity is
    # fetched from the API below.
    search = search.params(clear_scroll=False, _source=False)

    for hit in search.scan():
        release_id = hit.meta.id
        resp = api_session.get(
            'https://api.fatcat.wiki/v0/release/{}'.format(release_id),
            params={
                'expand': 'container,files,filesets,webcaptures',
                'hide': 'references',
            })
        resp.raise_for_status()
        row = dict(
            fatcat_hit=hit.meta._d_,
            release_id=release_id,
            fatcat_release=resp.json(),
        )
        print(json.dumps(row, sort_keys=True), file=json_output)
def emit_compute_dict(self, uuid, compute_map, index, identifier, alias):
    """
    Returns the normalized data from the ES query

    Builds a search on ``index`` matching ``<identifier>.keyword`` against
    ``uuid``, applies the wildcard filters and match exclusions found in
    ``compute_map``, adds the bucket and metric aggregations it describes,
    executes the search and hands the response to ``self.gen_result_dict``.

    :param uuid: value matched against the identifier field.
    :param compute_map: dict with a required ``aggregations`` entry and
        optional ``buckets``, ``filter`` and ``exclude`` entries.
    :param index: index to search (converted with ``str``).
    :param identifier: field name; ``.keyword`` is appended to it.
    :param alias: passed through to ``self.gen_result_dict``.
    :return: the result dictionary, nested under the filter keys and values
        when filters are given; an empty dict when ``aggregations`` is
        missing or the search returns no hits.
    """
    if "aggregations" not in compute_map:
        logger.critical(
            "Incorrect JSON data: nested dictionaries aggregations "
            f"fields are required in {compute_map}")
        return {}

    buckets = compute_map.get("buckets", [])
    aggregations = compute_map["aggregations"]
    filters = compute_map.get("filter", {})

    logger.debug("Initializing search object")
    kw_identifier = identifier + ".keyword"  # append .keyword
    s = Search(using=self._conn_object,
               index=str(index)).query("match", **{kw_identifier: uuid})

    # Apply filters
    for key, value in filters.items():
        s = s.filter("wildcard", **{key: value})

    # Apply excludes
    for key, value in compute_map.get("exclude", {}).items():
        s = s.exclude("match", **{key: value})

    if buckets:
        logger.debug("Building buckets")
        a = A("terms", field=buckets[0], size=10000)
        x = s.aggs.bucket(buckets[0].split(".keyword")[0], a)
        for bucket in buckets[1:]:
            a = A("terms", field=bucket, size=10000)
            # Create bucket with and trimming characters after .
            x = x.bucket(bucket.split(".keyword")[0], a)
        logger.debug("Finished adding buckets to query")
    else:
        # NOTE(review): this aggregation is never attached to the search, so
        # the metrics added below do not appear in the query in this branch.
        # Confirm whether that is intended.
        a = A("terms")

    logger.debug("Adding aggregations to query")
    # Metrics are added to ``a``, the innermost bucket aggregation.
    for key, agg_list in aggregations.items():
        for aggs in agg_list:
            if isinstance(aggs, str):
                _temp_agg_str = "{}({})".format(aggs, key)
                # Create aggregation based on the key
                a.metric(_temp_agg_str, aggs, field=key)
                self._aggs_list.append(_temp_agg_str)
            # If there's a dictionary of aggregations. i.e different
            # percentiles we have to iterate through keys and values
            elif isinstance(aggs, dict):
                for dict_key, dict_value in aggs.items():
                    _temp_agg_str = "{}({})".format(dict_key, key)
                    # Add nested dict as aggregation
                    a.metric(_temp_agg_str,
                             dict_key,
                             field=key,
                             **dict_value)
                    self._aggs_list.append(_temp_agg_str)
            else:
                logger.warning("Ignoring aggregation {}".format(aggs))
    logger.debug("Finished adding aggregations to query")

    logger.debug("Built the following query: {}".format(
        json.dumps(s.to_dict(), indent=4)))
    response = s.execute()
    logger.debug("Succesfully executed the search query")

    if len(response.hits.hits) == 0:
        return {}

    output_dict = self.gen_result_dict(response, buckets, self._aggs_list,
                                       uuid, alias)
    if filters:
        filter_list = []
        for key, value in filters.items():
            filter_list.append(key)
            filter_list.append(value)
        # Include all k,v from filters as keys in the output dictionary
        for key in reversed(filter_list):
            output_dict = {key.split(".keyword")[0]: output_dict}

    logger.debug("output compute dictionary with summaries is: {}".format(
        json.dumps(output_dict, indent=4)))
    return output_dict
def browse(request):
    """
    Render ``listing.html`` with a randomly ordered sample of up to 50
    documents plus summary statistics.

    The optional ``source`` GET parameter filters on ``analysis.source``;
    otherwise the optional ``titleii`` parameter (``pro`` / ``anti`` /
    ``unknown``) filters on ``analysis.titleii``.
    """
    search = Search(using=es)
    # Wrap the default query so results come back in a random order that
    # changes with the current second.
    search.query = FunctionScore(
        query=search.query,
        functions=[SF('random_score', seed=int(time.time()))],
    )

    description = None
    params = request.GET
    if 'source' in params:
        source = params['source']
        search = search.filter('terms', **{'analysis.source': [source]})
        description = SOURCE_MAP.get(source, {}).get('name') or source
    elif 'titleii' in params:
        stance = params['titleii']
        if stance == 'pro':
            description = "Pro Title II"
            search = search.filter('terms', **{'analysis.titleii': [True]})
        elif stance == 'anti':
            description = 'Anti Title II'
            search = search.filter('terms', **{'analysis.titleii': [False]})
        elif stance == 'unknown':
            description = 'Uncategorized'
            search = search.exclude('exists', field='analysis.titleii')

    search.aggs.bucket('address', A('terms', field='analysis.fulladdress'))
    search.aggs.bucket('site', A('terms', field='analysis.onsite'))
    confirmation_filters = {
        'true': {'term': {'emailConfirmation': 'true'}},
        'false': {'term': {'emailConfirmation': 'false'}},
    }
    search.aggs.bucket('email_confirmation',
                       A('filters', filters=confirmation_filters))
    search.aggs.bucket('unique_emails',
                       A('cardinality', field='contact_email.raw'))

    stats = OrderedDict({
        'Comment Form': {'On-site': 0, 'Off-site': 0},
        'Emails': {'Unique': 0},
        'Address': {'Full Address': 0, 'Partial Address': 0},
        'Email Confirmation': {'True': 0, 'False': 0, 'Missing': 0},
    })

    response = search[:50].execute()
    total = search.count()

    def record_flag_counts(buckets, section, label_for_one, label_for_zero):
        # Copy the doc counts of the 1 / 0 term buckets into ``section``.
        for bucket in buckets:
            if bucket.key == 1:
                section[label_for_one] = bucket.doc_count
            elif bucket.key == 0:
                section[label_for_zero] = bucket.doc_count

    record_flag_counts(response.aggregations.address.buckets,
                       stats['Address'], 'Full Address', 'Partial Address')
    record_flag_counts(response.aggregations.site.buckets,
                       stats['Comment Form'], 'On-site', 'Off-site')

    stats['Emails']['Unique'] = response.aggregations.unique_emails.value

    confirmation = stats['Email Confirmation']
    confirmation_buckets = (
        response.aggs.email_confirmation.to_dict()['buckets'])
    for bucket_name, label in (('true', 'True'), ('false', 'False')):
        if bucket_name in confirmation_buckets:
            confirmation[label] = confirmation_buckets[bucket_name]['doc_count']
    # Whatever is neither confirmed nor unconfirmed has no confirmation value.
    confirmation['Missing'] = (
        total - confirmation['True'] - confirmation['False'])

    return render(request, 'listing.html', {
        'description': description,
        'stats': stats,
        'results': response,
        'comment_count': total,
    })
def search(search_params, index, page_size, ip, request,
           filter_dead, page=1) -> Tuple[List[Hit], int, int]:
    """
    Given a set of keywords and an optional set of filters, perform a ranked
    paginated search.

    :param search_params: Search parameters. See
     :class: `ImageSearchQueryStringSerializer`.
    :param index: The Elasticsearch index to search (e.g. 'image')
    :param page_size: The number of results to return per page.
    :param ip: The user's hashed IP. Hashed IPs are used to anonymously but
    uniquely identify users exclusively for ensuring query consistency across
    Elasticsearch shards.
    :param request: Django's request object.
    :param filter_dead: Whether dead links should be removed.
    :param page: The results page number.
    :return: Tuple of (list of Hits from Elasticsearch, page count,
    result count).
    :raises ValueError: If Elasticsearch rejects the query with a
    ``RequestError``.
    """
    s = Search(index=index)
    if 'li' in search_params.data:
        s = _filter_licenses(s, search_params.data['li'])
    elif 'lt' in search_params.data:
        s = _filter_licenses(s, search_params.data['lt'])

    # Apply term filters.
    filters = ['provider', 'extension', 'categories', 'aspect_ratio', 'size']
    for _filter in filters:
        s = _apply_filter(_filter, search_params, s)

    # Hide data sources from the catalog dynamically.
    filter_cache_key = 'filtered_providers'
    filtered_providers = cache.get(key=filter_cache_key)
    if not filtered_providers:
        filtered_providers = ContentProvider.objects\
            .filter(filter_content=True)\
            .values('provider_identifier')
        cache.set(key=filter_cache_key,
                  timeout=CACHE_TIMEOUT,
                  value=filtered_providers)
    for filtered in filtered_providers:
        s = s.exclude('match', provider=filtered['provider_identifier'])

    # Search either by generic multimatch or by "advanced search" with
    # individual field-level queries specified.
    search_fields = ['tags.name', 'title', 'description']
    if 'q' in search_params.data:
        query = _quote_escape(search_params.data['q'])
        s = s.query('query_string',
                    query=query,
                    fields=search_fields,
                    type='most_fields')
    else:
        if 'creator' in search_params.data:
            creator = _quote_escape(search_params.data['creator'])
            s = s.query('query_string', query=creator,
                        default_field='creator')
        if 'title' in search_params.data:
            title = _quote_escape(search_params.data['title'])
            s = s.query('query_string', query=title, default_field='title')
        if 'tags' in search_params.data:
            tags = _quote_escape(search_params.data['tags'])
            s = s.query('query_string',
                        default_field='tags.name',
                        query=tags)

    # Boost by popularity metrics
    if POPULARITY_BOOST:
        factors = ['comments', 'views', 'likes']
        boost_factor = 100 / len(factors)
        queries = [
            Q('rank_feature', field=factor, boost=boost_factor)
            for factor in factors
        ]
        # Rebuild the search around the boosted query. The index has to be
        # passed again: a bare Search() is not bound to the requested index.
        s = Search(index=index).query(
            Q('bool', must=s.query, should=queries, minimum_should_match=1))

    # Use highlighting to determine which fields contribute to the selection of
    # top results.
    s = s.highlight(*search_fields)
    s = s.highlight_options(order='score')
    # Search.extra returns a modified copy, so the result must be kept.
    s = s.extra(track_scores=True)
    # Route users to the same Elasticsearch worker node to reduce
    # pagination inconsistencies and increase cache hits.
    s = s.params(preference=str(ip))

    # Paginate
    start, end = _get_query_slice(s, page_size, page, filter_dead)
    s = s[start:end]
    try:
        search_response = s.execute()
    except RequestError as e:
        # Keep the original error attached as the cause for debugging.
        raise ValueError(e) from e

    results = _post_process_results(s, start, end, page_size, search_response,
                                    request, filter_dead)
    result_count, page_count = _get_result_and_page_count(
        search_response, results, page_size)
    return results, page_count, result_count
def search(search_params, index, page_size, ip, page=1) -> Response:
    """
    Given a set of keywords and an optional set of filters, perform a ranked
    paginated search.

    :param search_params: Search parameters. See
     :class: `~cccatalog.api.search_serializers.SearchQueryStringSerializer`.
    :param index: The Elasticsearch index to search (e.g. 'image')
    :param page_size: The number of results to return per page.
    :param page: The results page number.
    :param ip: The user's hashed IP. Hashed IPs are used to anonymously but
    uniquely identify users exclusively for ensuring query consistency across
    Elasticsearch shards.
    :return: An Elasticsearch Response object.
    :raises ValueError: If the requested page lies beyond the allowed
    pagination depth.
    """
    s = Search(index=index)

    # Paginate search query.
    start_slice = page_size * (page - 1)
    end_slice = page_size * page
    # NOTE(review): this compares start + end against the window, which is
    # stricter than comparing the last requested offset (end_slice) alone -
    # confirm the extra strictness is intended.
    if start_slice + end_slice > ELASTICSEARCH_MAX_RESULT_WINDOW:
        raise ValueError("Deep pagination is not allowed.")
    s = s[start_slice:end_slice]

    # If any filters are specified, add them to the query.
    if 'li' in search_params.data or 'lt' in search_params.data:
        license_field = 'li' if 'li' in search_params.data else 'lt'
        license_filters = [
            Q('term', license__keyword=_license)
            for _license in search_params.data[license_field].split(',')
        ]
        s = s.filter('bool', should=license_filters, minimum_should_match=1)
    if 'provider' in search_params.data:
        provider_filters = [
            Q('term', provider=provider)
            for provider in search_params.data['provider'].split(',')
        ]
        s = s.filter('bool', should=provider_filters, minimum_should_match=1)

    # It is sometimes desirable to hide content providers from the catalog
    # without scrubbing them from the database or reindexing.
    filter_cache_key = 'filtered_providers'
    filtered_providers = cache.get(key=filter_cache_key)
    if not filtered_providers:
        filtered_providers = ContentProvider.objects\
            .filter(filter_content=True)\
            .values('provider_identifier')
        cache.set(key=filter_cache_key,
                  timeout=CACHE_TIMEOUT,
                  value=filtered_providers)
    for filtered in filtered_providers:
        s = s.exclude('match', provider=filtered['provider_identifier'])

    # Search either by generic multimatch or by "advanced search" with
    # individual field-level queries specified.
    if 'q' in search_params.data:
        s = s.query('constant_score', filter=Q(
            'query_string',
            query=search_params.data['q'],
            fields=['tags.name', 'title'],
        ))
    else:
        if 'creator' in search_params.data:
            creator = search_params.data['creator']
            s = s.query('constant_score', filter=Q(
                'query_string', query=creator, default_field='creator'))
        if 'title' in search_params.data:
            title = search_params.data['title']
            s = s.query('constant_score', filter=Q(
                'query_string', query=title, default_field='title'))
        if 'tags' in search_params.data:
            tags = search_params.data['tags']
            s = s.query('constant_score', filter=Q(
                'query_string', default_field='tags.name', query=tags))

    # Search.extra returns a modified copy, so the result must be kept.
    s = s.extra(track_scores=True)
    # Send each user to a consistent shard copy via their hashed IP.
    s = s.params(preference=str(ip))
    search_response = s.execute()
    return search_response
should=[Q(...), Q(...)], minimum_should_match=1 ) s = Search().query(q) ################################################## # FILTERS s = Search() s = s.filter('terms', tags=['search', 'python']) # Same as s = s.query('bool', filter=[Q('terms', tags=['search', 'python'])]) # We can use exclude too s = s.exclude('terms', tags=['search', 'python']) ##################################################### # AGGREGATIONS from elasticsearch_dsl.aggs import A a = A('terms', field='category') a.metric('clicks_per_category', 'sum', field='clicks')\ .bucket('tags_per_category', 'terms', field='tags') # This is how you add aggregations to the search object
class AbstractSearch:
    """
    Small async-friendly wrapper that builds an elasticsearch-dsl ``Search``
    from plain filter / exclude / aggregation descriptions and converts the
    response into an API-style dictionary.
    """

    def __init__(self, es_client=None, index=None):
        self.es_client = es_client
        self.index = index
        self.search = None
        self.multi_search = None

    @staticmethod
    def es2api(response, start, limit):
        """
        Convert a raw Elasticsearch response dict into the API payload.

        :param response: Response as a dict; must contain
            ``hits.total.value`` and ``hits.hits``. ``aggregations`` is
            optional and each entry is reduced to its ``buckets``.
        :param start: Offset echoed back in the payload.
        :param limit: Page size echoed back in the payload.
        :return: Dict with ``total``, ``results``, ``aggregations``,
            ``start`` and ``limit``.
        """
        return {
            'total': response['hits']['total']['value'],
            'results': response['hits']['hits'],
            'aggregations': {
                key: value['buckets']
                for key, value in response.get('aggregations', {}).items()
            },
            'start': start,
            'limit': limit,
        }

    async def get(self, query='*', filters=None, aggregations=None,
                  exclude=None, to_date=None, from_date=None,
                  start=0, limit=30):
        """
        Run a search sorted by descending ``publish_date``.

        :return: The ``es2api`` payload, or ``None`` when executing the
            search raised (the error is printed, not propagated).
        """
        self.search = Search(using=self.es_client, index=self.index)
        self.__filters(filters, to_date, from_date)
        self.__exclude(exclude)
        self.__query(query)
        self.__aggregations(aggregations)
        try:
            return AbstractSearch.es2api(
                self.search.sort('-publish_date')[start:start + limit]
                .execute().to_dict(),
                start,
                limit,
            )
        except Exception as err:
            # Best effort: report the failure and hand back None.
            print(err)
            return None

    async def get_hist(self, query='*', filters=None, field=None,
                       interval=None, exclude=None, to_date=None,
                       from_date=None, start=0, limit=30):
        """
        Run a search with a ``publish_date`` date histogram (bucketed by
        ``interval``) that carries a nested top-5 ``tags.keyword`` terms
        aggregation.

        :return: The ``es2api`` payload.
        """
        # Start from a fresh Search, like get(), so this works outside
        # ``async with`` and does not inherit state from an earlier call.
        self.search = Search(using=self.es_client, index=self.index)
        self.__filters(filters, to_date, from_date)
        self.__exclude(exclude)
        self.__query(query)
        self.__histogram(field, interval)
        # The slice end is start + limit (same as get()), not limit itself.
        return AbstractSearch.es2api(
            self.search[start:start + limit].execute().to_dict(),
            start,
            limit,
        )

    def __histogram(self, field, interval):
        # ``field`` is accepted for interface compatibility; the histogram
        # is always built on publish_date.
        try:
            ah = A('terms', field='tags.keyword', size=5)
            aggregation = A('date_histogram', field='publish_date',
                            interval=interval)
            self.search.aggs.bucket('histogram_data', aggregation)\
                .bucket('tags', ah)
        except Exception as err:
            print(err)

    def __aggregations(self, aggregations):
        # ``None`` (the public default) means "no aggregations".
        for key, value in (aggregations or {}).items():
            try:
                aggregation = A(value['type'], field=f'{key}.keyword')
                self.search.aggs.bucket(key, aggregation)
            except Exception as err:
                print(err)

    def __filters(self, filters, to_date, from_date):
        self.search = self.search.filter(
            'range', **{'publish_date': {'gte': from_date, 'lte': to_date}})
        # ``None`` (the public default) means "no extra filters".
        for key, value in (filters or {}).items():
            self.search = self.search.filter(
                value['type'], **{f'{key}.keyword': value[value['type']]})

    def __exclude(self, exclude):
        # ``None`` (the public default) means "nothing to exclude".
        for value in exclude or []:
            self.search = self.search.exclude(
                value['type'],
                **{f'{value["field"]}.keyword': value[value['type']]})

    def __query(self, query):
        self.search = self.search.query('simple_query_string', query=query)

    async def __aenter__(self):
        self.search = Search(using=self.es_client, index=self.index)
        return self

    async def __aexit__(self, *args, **kwargs):
        # A truthy return value would suppress any exception raised inside
        # the ``async with`` body, so explicitly return False.
        return False
def federal_fec_compute_load_graph_candidates(message, context):
    """
    Copy not-yet-graphed candidate documents from Elasticsearch into Neo4j
    in batches, marking each processed document as graphed.

    Neither ``message`` nor ``context`` is read by the body. Always returns
    ``True``.
    """
    # NOTE(review): the block nesting below was reconstructed from a
    # whitespace-collapsed source; confirm it against the upstream file.

    # configure ElasticSearch search: only documents that do not yet have a
    # context.last_graphed value
    s = Search(using=es, index="federal_fec_candidates")
    q = s.exclude("exists", field="context.last_graphed")
    # get start time
    start = time.time()
    # loop for 520s (the limit is only checked before each batch starts)
    while time.time() - start < 520:
        # fetch the next batch of up to 1000 documents
        docs = q[0:1000].execute()
        if len(docs) == 0:
            logger.info(' - '.join(['NO CANDIDATES FOUND FOR LOADING']))
            break
        # batches for neo4j and elasticsearch
        candidates = []
        parties = []
        races = []
        linkages = []
        actions = []
        for doc in docs:
            # prepare docs for loading
            if "row" in doc:
                candidates.append({
                    "cand_id": doc.row["cand_id"],
                    "cand_name": doc.processed["cand_name"].strip() if doc.processed["cand_name"] is not None else "",
                    "cand_pty_affiliation": doc.row["cand_pty_affiliation"],
                    "cand_election_yr": doc.row["cand_election_yr"],
                    "cand_office_st": doc.row["cand_office_st"],
                    "cand_office": doc.row["cand_office"],
                    "cand_office_district": doc.row["cand_office_district"],
                    "cand_ici": doc.row["cand_ici"]
                })
                # a party entry is only added when an affiliation is present
                if doc.row["cand_pty_affiliation"] is not None:
                    parties.append({
                        "cand_id": doc.row["cand_id"],
                        "cand_pty_affiliation": doc.row["cand_pty_affiliation"]
                    })
                # missing race fields are replaced by empty strings
                races.append({
                    "cand_id": doc.row["cand_id"],
                    "cand_election_yr": doc.row["cand_election_yr"] or "",
                    "cand_office_st": doc.row["cand_office_st"] or "",
                    "cand_office": doc.row["cand_office"] or "",
                    "cand_office_district": doc.row["cand_office_district"] or ""
                })
            if "linkages" in doc:
                if "committees" in doc.linkages:
                    # linkage entries use the document id as cand_id
                    for linkage in doc.linkages.committees:
                        linkages.append({
                            "cmte_id": linkage["cmte_id"],
                            "cand_id": doc.meta.id,
                            "cand_election_yr": linkage["cand_election_yr"],
                            "linkage_id": linkage["linkage_id"]
                        })
            # prepare to mark as in graph in elasticsearch
            actions.append({
                "_op_type": "update",
                "_index": "federal_fec_candidates",
                "_id": doc.meta.id,
                "doc": {
                    "context": {
                        "last_graphed": datetime.datetime.now(datetime.timezone.utc)
                    }
                }
            })
        # load into neo4j, one write transaction per batch type
        with driver.session() as neo4j:
            neo4j.write_transaction(cypher.merge_node_candidate, batch=candidates)
            neo4j.write_transaction(cypher.merge_rel_candidate_party, batch=parties)
            neo4j.write_transaction(cypher.merge_rel_candidate_race, batch=races)
            neo4j.write_transaction(cypher.merge_rel_candidate_committee, batch=linkages)
        # mark as graphed in elasticsearch
        helpers.bulk(es, actions)
        logger.info(' - '.join(['CANDIDATES LOADED', str(len(actions))]))
    return True
Multiple queries are combined together using the AND operator """ s = Search() s = s.query("match", description="fix") s = s.query("match", author="Honza") """ Filter shortcut to use {bool: {filter: []}} """ s = s.filter("range", committed_date={"lt": date(2016, 1, 1)}) s.to_dict() """ Exclude as a wrapper around must_not, use __ instead of dots for convenience. """ s = s.exclude("term", committer__name__keyword="Honza Král") """ Search is executed when iterated on or when .execute() is called. """ for hit in s: """ Hit class offers direct access to fields and via .meta any other properties on the returned hit (_id, _seq_no, ...) """ print(f"{hit.meta.id[:6]} ({hit.author.name}): {hit.description[:50]}") """ Aggregations are implemented in place to allow for chaining """ s = Search(index="git")
def search(search_params, index, page_size, ip, request,
           filter_dead, page=1) -> Tuple[List[Hit], int, int]:
    """
    Given a set of keywords and an optional set of filters, perform a ranked
    paginated search.

    :param search_params: Search parameters. See
     :class: `ImageSearchQueryStringSerializer`.
    :param index: The Elasticsearch index to search (e.g. 'image')
    :param page_size: The number of results to return per page.
    :param ip: The user's hashed IP. Hashed IPs are used to anonymously but
    uniquely identify users exclusively for ensuring query consistency across
    Elasticsearch shards.
    :param request: Django's request object.
    :param filter_dead: Whether dead links should be removed.
    :param page: The results page number.
    :return: Tuple of (list of Hits from Elasticsearch, page count,
    result count).
    """
    s = Search(index=index)

    # Add requested filters.
    if 'li' in search_params.data:
        s = _filter_licenses(s, search_params.data['li'])
    elif 'lt' in search_params.data:
        s = _filter_licenses(s, search_params.data['lt'])

    if 'provider' in search_params.data:
        provider_filters = [
            Q('term', provider=provider)
            for provider in search_params.data['provider'].split(',')
        ]
        s = s.filter('bool', should=provider_filters, minimum_should_match=1)
    if 'extension' in search_params.data:
        extension = search_params.data['extension']
        extension_filter = Q('term', extension=extension)
        s = s.filter('bool', should=extension_filter, minimum_should_match=1)

    # It is sometimes desirable to hide content providers from the catalog
    # without scrubbing them from the database or reindexing.
    filter_cache_key = 'filtered_providers'
    filtered_providers = cache.get(key=filter_cache_key)
    if not filtered_providers:
        filtered_providers = ContentProvider.objects\
            .filter(filter_content=True)\
            .values('provider_identifier')
        cache.set(
            key=filter_cache_key,
            timeout=CACHE_TIMEOUT,
            value=filtered_providers
        )
    for filtered in filtered_providers:
        s = s.exclude('match', provider=filtered['provider_identifier'])

    # Search either by generic multimatch or by "advanced search" with
    # individual field-level queries specified.
    search_fields = ['tags.name', 'title', 'description']
    if 'q' in search_params.data:
        query = _quote_escape(search_params.data['q'])
        s = s.query(
            'query_string',
            query=query,
            fields=search_fields,
            type='most_fields'
        )
    else:
        if 'creator' in search_params.data:
            creator = _quote_escape(search_params.data['creator'])
            s = s.query(
                'query_string', query=creator, default_field='creator'
            )
        if 'title' in search_params.data:
            title = _quote_escape(search_params.data['title'])
            s = s.query(
                'query_string', query=title, default_field='title'
            )
        if 'tags' in search_params.data:
            tags = _quote_escape(search_params.data['tags'])
            s = s.query(
                'query_string',
                default_field='tags.name',
                query=tags
            )

    # Use highlighting to determine which fields contribute to the selection of
    # top results.
    s = s.highlight(*search_fields)
    s = s.highlight_options(order='score')
    # Search.extra returns a modified copy, so the result must be kept.
    s = s.extra(track_scores=True)
    # Route users to the same Elasticsearch worker node to reduce
    # pagination inconsistencies and increase cache hits.
    s = s.params(preference=str(ip))

    # Paginate
    start, end = _get_query_slice(s, page_size, page, filter_dead)
    s = s[start:end]
    search_response = s.execute()
    results = _post_process_results(
        s,
        start,
        end,
        page_size,
        search_response,
        request,
        filter_dead
    )

    result_count, page_count = _get_result_and_page_count(
        search_response,
        results,
        page_size
    )
    return results, page_count, result_count