def post(self, request: Request) -> Response:
    """Return the count of unique CFDA buckets matching the request filters."""
    filter_query = QueryWithFilters.generate_awards_elasticsearch_query(self.filters)

    # Restrict results to documents with a non-zero value in at least one of
    # the loan sum columns (strictly positive OR strictly negative).
    nonzero_clauses = []
    for column in ElasticsearchLoansPaginationMixin.sum_column_mapping.values():
        nonzero_clauses.append(ES_Q("range", **{column: {"gt": 0}}))
        nonzero_clauses.append(ES_Q("range", **{column: {"lt": 0}}))
    filter_query.must.append(ES_Q("bool", should=nonzero_clauses, minimum_should_match=1))

    unique_buckets = get_number_of_unique_terms_for_awards(filter_query, "cfda_agg_key.hash")
    return Response({"count": unique_buckets})
def _generate_elasticsearch_query(cls, filter_values: List[str], query_type: _QueryType) -> ES_Q:
    """Build a dis_max query matching any of the keyword filter values.

    Each value becomes a prefix-style query_string over the searchable
    fields; dis_max scores a document by its best-matching clause.
    """
    searchable_fields = [
        "recipient_name",
        "naics_description",
        "product_or_service_description",
        "award_description",
        "piid",
        "fain",
        "uri",
        "recipient_unique_id",
        "parent_recipient_unique_id",
        "description",
    ]
    return ES_Q(
        "dis_max",
        queries=[
            ES_Q("query_string", query=value + "*", default_operator="AND", fields=searchable_fields)
            for value in filter_values
        ],
    )
def generate_elasticsearch_query(cls, filter_value: str, query_type: _QueryType) -> ES_Q:
    """Return the recipient-location scope query.

    "domestic" matches a USA country code; "foreign" negates that match.
    """
    domestic = ES_Q("match", recipient_location_country_code="USA")
    if filter_value == "domestic":
        return domestic
    if filter_value == "foreign":
        return ~domestic
    # NOTE(review): any other value implicitly returns None — presumably the
    # value is validated upstream; confirm with callers.
def generate_elasticsearch_query(cls, filter_values: List[str], query_type: _QueryType, nested_path: str = "") -> ES_Q:
    """Match any of the supplied award IDs.

    A value wrapped in double quotes is treated as an exact (sanitized) term
    match; any other value becomes a sanitized regexp requiring every
    whitespace-separated token.
    """
    id_queries = []
    for raw_value in filter_values:
        is_quoted = bool(raw_value) and raw_value.startswith('"') and raw_value.endswith('"')
        if is_quoted:
            unquoted = raw_value[1:-1]
            id_queries.append(ES_Q("term", display_award_id={"value": es_sanitize(unquoted)}))
        else:
            pattern = " +".join(es_sanitize(raw_value).split())
            id_queries.append(ES_Q("regexp", display_award_id={"value": pattern}))
    return ES_Q("bool", should=id_queries, minimum_should_match=1)
def _generate_elasticsearch_query(cls, filter_values: List[dict], query_type: _QueryType) -> List[ES_Q]:
    """Build one bool query each for awarding and funding agency name filters.

    Returns a two-element list: [awarding query, funding query]. Filters with
    any other "type" value are silently dropped.
    """
    queries_by_type = {"awarding": [], "funding": []}
    for agency in filter_values:
        name_field = f"{agency['type']}_{agency['tier']}_agency_name__keyword"
        name_match = ES_Q("match", **{name_field: agency["name"]})
        if agency["type"] in queries_by_type:
            queries_by_type[agency["type"]].append(name_match)
    return [
        ES_Q("bool", should=queries_by_type["awarding"], minimum_should_match=1),
        ES_Q("bool", should=queries_by_type["funding"], minimum_should_match=1),
    ]
def generate_elasticsearch_query(cls, filter_values: List[str], query_type: _QueryType) -> ES_Q:
    """Match any of the given DEF codes.

    Non-award query types additionally require an action date on or after
    2020-04-01 (start of the COVID-19 reporting period).
    """
    code_matches = [ES_Q("match", disaster_emergency_fund_codes=code) for code in filter_values]
    bool_kwargs = {"should": code_matches, "minimum_should_match": 1}
    if query_type != _QueryType.AWARDS:
        bool_kwargs["must"] = ES_Q("range", action_date={"gte": "2020-04-01"})
    return ES_Q("bool", **bool_kwargs)
def generate_elasticsearch_query(cls, filter_values: List[str], query_type: _QueryType, nested_path: str = "") -> ES_Q:
    """Match any DEF code, optionally under a nested document path.

    Transaction queries additionally require action_date >= 2020-04-01
    (start of the COVID-19 reporting period). Account queries use the
    singular field name; all other types use the plural.
    """
    if nested_path is None:
        nested_path = ""
    prefix = f"{nested_path}." if nested_path else ""
    suffix = "" if query_type == _QueryType.ACCOUNTS else "s"
    field_name = f"{prefix}disaster_emergency_fund_code{suffix}"
    code_matches = [ES_Q("match", **{field_name: code}) for code in filter_values]
    if query_type == _QueryType.TRANSACTIONS:
        return ES_Q(
            "bool",
            should=code_matches,
            minimum_should_match=1,
            must=ES_Q("range", action_date={"gte": "2020-04-01"}),
        )
    return ES_Q("bool", should=code_matches, minimum_should_match=1)
def delete_docs_by_unique_key(client: Elasticsearch, key: str, value_list: list, job_id: str, index) -> int:
    """
    Bulk delete a batch of documents whose field identified by ``key`` matches any value provided in the
    ``values_list``.

    Args:
        client (Elasticsearch): elasticsearch-dsl client for making calls to an ES cluster
        key (str): name of field in targeted elasticsearch index that should have a unique value for
            every doc in the index. Ideally the field or sub-field provided is of ``keyword`` type.
        value_list (list): if key field has these values, the document will be deleted
        job_id (str): name of ES ETL job being run, used in logging
        index (str): name of index (or alias) to target for the ``_delete_by_query`` ES operation.

            NOTE: This delete routine looks at just the index name given. If there are duplicate records
            across multiple indexes, an alias or wildcard should be provided for ``index`` param that
            covers multiple indices, or this will need to be run once per index.

    Returns:
        Number of ES documents deleted
    """
    start = perf_counter()
    printf({"msg": f"Deleting up to {len(value_list):,} document(s)", "f": "ES Delete", "job": job_id})
    assert index, "index name must be provided"
    deleted = 0
    is_error = False
    try:
        # 65,536 is max number of terms that can be added to an ES terms filter query
        # NOTE(review): chunks of 50,000 stay under that limit — confirm the margin is intentional.
        values_generator = chunks(value_list, 50000)
        for chunk_of_values in values_generator:
            # Creates an Elasticsearch query criteria for the _delete_by_query call
            q = ES_Q("terms", **{key: chunk_of_values})
            # Invoking _delete_by_query as per the elasticsearch-dsl docs:
            # https://elasticsearch-dsl.readthedocs.io/en/latest/search_dsl.html#delete-by-query
            response = Search(using=client, index=index).filter(q).delete()
            chunk_deletes = response["deleted"]
            deleted += chunk_deletes
    except Exception as e:
        is_error = True
        printf({"msg": f"[ERROR][ERROR][ERROR]\n{str(e)}", "f": "ES Delete", "job": job_id})
        # Abort the whole ETL job on any delete failure; the finally block still logs totals.
        raise SystemExit(1)
    finally:
        error_text = " before encountering an error" if is_error else ""
        msg = f"ES Deletes took {perf_counter() - start:.2f}s. Deleted {deleted:,} records{error_text}"
        printf({"msg": msg, "f": "ES Delete", "job": job_id})
    return deleted
def generate_elasticsearch_query(cls, filter_values: str, query_type: _QueryType, nested_path: str = "") -> ES_Q:
    """Return the place-of-performance scope query.

    "domestic" matches a USA country code; "foreign" negates that match.

    NOTE(review): despite the plural name, ``filter_values`` is compared
    against scalar strings below, so the annotation has been corrected from
    ``List[str]`` to ``str``. Any other value implicitly returns ``None`` —
    presumably validated upstream; confirm with callers.
    """
    pop_scope_query = ES_Q("match", pop_country_code="USA")
    if filter_values == "domestic":
        return pop_scope_query
    elif filter_values == "foreign":
        return ~pop_scope_query
def filter(self, *args, **kwargs):
    """Apply SQLAlchemy-style binary expressions as Elasticsearch queries.

    Each positional argument is expected to be a SQLAlchemy
    ``BinaryExpression``; its column name and bound value are translated into
    an ES ``match`` query. An ``ilike`` expression becomes ``match_phrase``
    with SQL wildcards stripped and the value lowercased.

    Returns:
        A new queryset instance of the same class carrying the combined search.
    """
    # BUG FIX 1: each iteration previously rebuilt from ``self._search``, so
    # only the LAST expression's query survived; and with no args at all,
    # ``search`` was undefined at the return (NameError). Start from
    # ``self._search`` once and chain.
    # BUG FIX 2: ``operator is 'ilike_op'`` relied on CPython string
    # interning (and is a SyntaxWarning on 3.8+); use equality instead.
    search = self._search
    for expr in args:
        left, right = expr.get_children()  # see sqlalchemy BinaryExpression
        operator = expr.operator.__name__
        try:
            colname = left.name  # see sqlalchemy Column
        except AttributeError:
            colname = left
        try:
            value = right.effective_value  # see sqlalchemy BindParameter
        except AttributeError:
            value = right
        if operator == 'ilike_op':
            value = value.replace('%', '').lower()
            search = search.query(ES_Q('match_phrase', **{colname: value}))
        else:
            search = search.query(ES_Q('match', **{colname: value}))
        self._filter_params[colname] = value
    return self.__class__(self._model, search, self._vals, self._highlight)
def generate_elasticsearch_query(cls, filter_values, query_type: _QueryType, nested_path: str = "") -> ES_Q:
    """Build a PSC-code query_string filter from require/exclude values."""
    cls.validate_filter_values(filter_values)
    required, excluded = cls.split_filter_values(filter_values)
    # Tier-1 names are translated to their underlying code representations.
    required = cls.handle_tier1_names(required)
    excluded = cls.handle_tier1_names(excluded)
    return ES_Q(
        "query_string",
        query=cls._query_string(required, excluded),
        default_field="product_or_service_code.keyword",
    )
def generate_elasticsearch_query(cls, filter_values: list, query_type: _QueryType) -> ES_Q:
    """Match documents whose ``tas_components`` string satisfies any filter.

    Each filter dict supplies TAS components; components not provided default
    to the ``.*`` wildcard so they match anything.
    """
    # Serialization order must match the indexed tas_components format.
    component_order = ("aid", "main", "ata", "sub", "bpoa", "epoa", "a")
    tas_queries = []
    for components in filter_values:
        pattern = "".join(f"{name}={components.get(name, '.*')}" for name in component_order)
        regexp_query = ES_Q("regexp", tas_components={"value": pattern})
        tas_queries.append(ES_Q("bool", must=regexp_query))
    return ES_Q("bool", should=tas_queries, minimum_should_match=1)
def post(self, request: Request) -> Response:
    """Run the category aggregation: count buckets, query ES, and return
    paginated results plus any advisory messages."""
    # Need to update the value of "query" to have the fields to search on
    query = self.filters.pop("query", None)
    if query:
        self.filters["query"] = {"text": query, "fields": self.query_fields}
    self.filter_query = QueryWithFilters.generate_awards_elasticsearch_query(self.filters)
    # Ensure that only non-zero values are taken into consideration
    # TODO: Refactor to use new NonzeroFields filter in QueryWithFilters
    non_zero_queries = []
    for field in self.sum_column_mapping.values():
        non_zero_queries.append(ES_Q("range", **{field: {"gt": 0}}))
        non_zero_queries.append(ES_Q("range", **{field: {"lt": 0}}))
    self.filter_query.must.append(ES_Q("bool", should=non_zero_queries, minimum_should_match=1))
    # Count unique buckets on the hashed aggregation key (".keyword" stripped).
    self.bucket_count = get_number_of_unique_terms_for_awards(self.filter_query, f"{self.agg_key.replace('.keyword', '')}.hash")
    messages = []
    if self.pagination.sort_key in ("id", "code"):
        messages.append((
            f"Notice! API Request to sort on '{self.pagination.sort_key}' field isn't fully implemented."
            " Results were actually sorted using 'description' field."))
    # Cap bucket count when aggregating on the routing field.
    if self.bucket_count > 10000 and self.agg_key == settings.ES_ROUTING_FIELD:
        self.bucket_count = 10000
        messages.append((
            "Notice! API Request is capped at 10,000 results. Either download to view all results or"
            " filter using the 'query' attribute."))
    response = self.query_elasticsearch()
    response["page_metadata"] = get_pagination_metadata(self.bucket_count, self.pagination.limit, self.pagination.page)
    if messages:
        response["messages"] = messages
    return Response(response)
def _expand_query(q, fields):
    """Expand a search term into a boosted multi-strategy bool query.

    For every field the term is tried as (in decreasing boost order) an exact
    term, a phrase match, a plain match, a fuzzy match, and a prefix.
    """
    def _field_clauses(field):
        yield ES_Q('term', **{field: {'value': q, 'boost': 10}})
        yield ES_Q('match', **{field: {'query': q, 'boost': 4, 'type': 'phrase'}})
        yield ES_Q('match', **{field: {'query': q, 'boost': 3}})
        yield ES_Q('fuzzy', **{field: {'value': q, 'boost': 2, 'prefix_length': 4}})
        yield ES_Q('prefix', **{field: {'value': q, 'boost': 1.5}})

    clauses = [clause for field in fields for clause in _field_clauses(field)]
    return query.Bool(should=clauses)
def _generate_elasticsearch_query(cls, filter_values: List[dict], query_type: _QueryType) -> ES_Q:
    """Match any of the supplied place-of-performance locations.

    All components provided within a single location dict must match
    together; omitted components are ignored.
    """
    # ES field suffix (under the "pop_" prefix) -> filter dict key.
    field_to_key = {
        "country_code": "country",
        "state_code": "state",
        "county_code": "county",
        "congressional_code": "district",
        "city_name__keyword": "city",
        "zip5": "zip",
    }
    location_queries = []
    for location in filter_values:
        components = [
            ES_Q("match", **{f"pop_{field}": location.get(key)})
            for field, key in field_to_key.items()
            if location.get(key) is not None
        ]
        location_queries.append(ES_Q("bool", must=components))
    return ES_Q("bool", should=location_queries, minimum_should_match=1)
def fuzzy(cls, query, fields=None, highlight=False):
    """Run a fuzzy full-text search over the model's searchable fields.

    Args:
        query: search text; falsy values skip querying entirely
        fields: optional subset of fields to search (defaults to all)
        highlight: when True, wrap matches in <b> tags in the results

    Returns:
        An ES_QuerySet over the (possibly highlighted) search results.
    """
    # BUG FIX: the ``fields`` argument was previously discarded
    # (``fields=None`` was hard-coded), so callers could never narrow the
    # searched fields. Pass it through instead.
    s, fieldnames = cls._prepare_search(query, fields=fields)
    highlight_ = False
    if query:
        s = s.query(ES_Q('fuzzy_like_this', like_text=query, analyzer='ga_search_analyzer', fields=fieldnames))
    if highlight:
        s = s.highlight(*_remove_boost(fieldnames), pre_tags=['<b>'], post_tags=['</b>'], fragment_size=400, number_of_fragments=3)
        highlight_ = True
    return ES_QuerySet(model=cls, search=s, highlight=highlight_)
def _generate_elasticsearch_query(cls, filter_values: List[dict], query_type: _QueryType) -> ES_Q:
    """Match any of the requested time periods.

    Missing bounds default to the API-wide min/max dates. Award queries test
    ``action_date`` against the lower bound and ``date_signed`` against the
    upper bound (both required); other query types use a single
    ``action_date`` range.
    """
    period_queries = []
    for period in filter_values:
        lower = period.get("start_date") or settings.API_SEARCH_MIN_DATE
        upper = period.get("end_date") or settings.API_MAX_DATE
        if query_type == _QueryType.AWARDS:
            # should + minimum_should_match=2 makes both range clauses mandatory.
            clause = ES_Q(
                "bool",
                should=[
                    ES_Q("range", action_date={"gte": lower}),
                    ES_Q("range", date_signed={"lte": upper}),
                ],
                minimum_should_match=2,
            )
        else:
            clause = ES_Q("range", action_date={"gte": lower, "lte": upper})
        period_queries.append(clause)
    return ES_Q("bool", should=period_queries, minimum_should_match=1)
def generate_elasticsearch_query(cls, filter_values: dict, query_type: _QueryType, nested_path: str = "") -> ES_Q:
    """Build a phrase-prefix multi_match over the requested fields,
    optionally prefixed with a nested document path."""
    prefix = f"{nested_path}." if nested_path else ""
    target_fields = [prefix + field for field in filter_values["fields"]]
    return ES_Q(
        "multi_match",
        query=filter_values["text"],
        type="phrase_prefix",
        fields=target_fields,
    )
def _generate_elasticsearch_query(cls, filter_values: List[dict], query_type: _QueryType) -> ES_Q:
    """Match any supplied TAS component filter against the nested
    ``treasury_accounts`` documents.

    All components present in one filter dict must match the same nested
    document; omitted components are ignored.
    """
    component_keys = ("aid", "ata", "main", "sub", "bpoa", "epoa", "a")
    per_filter_queries = []
    for components in filter_values:
        must_clauses = [
            ES_Q("match", **{f"treasury_accounts__{key}": components.get(key)})
            for key in component_keys
            if components.get(key) is not None
        ]
        per_filter_queries.append(ES_Q("bool", must=must_clauses))
    any_filter = ES_Q("bool", should=per_filter_queries, minimum_should_match=1)
    return ES_Q("nested", path="treasury_accounts", query=any_filter)
def search(cls, query, fields=None, default_operator='and', fuzzy=False, highlight=False):
    """Run a simple_query_string search, optionally delegating to fuzzy search.

    Args:
        query: search text; falsy values skip querying entirely
        fields: optional subset of fields to search (defaults to all)
        default_operator: boolean operator between query terms
        fuzzy: when True (and query is non-empty), delegate to ``cls.fuzzy``
        highlight: when True, wrap matches in <b> tags in the results

    Returns:
        An ES_QuerySet over the (possibly highlighted) search results.
    """
    if query and fuzzy:
        return cls.fuzzy(query, fields, highlight)
    # BUG FIX: the ``fields`` argument was previously discarded
    # (``fields=None`` was hard-coded), so callers could never narrow the
    # searched fields. Pass it through instead.
    s, fieldnames = cls._prepare_search(query, fields=fields)
    highlight_ = False
    if query:
        s = s.query(ES_Q('simple_query_string', query=query, analyzer='ga_search_analyzer', default_operator=default_operator, fields=fieldnames))
    if highlight:
        s = s.highlight(*_remove_boost(fieldnames), pre_tags=['<b>'], post_tags=['</b>'], fragment_size=400, number_of_fragments=3)
        highlight_ = True
    return ES_QuerySet(model=cls, search=s, highlight=highlight_)
def generate_elasticsearch_query(cls, filter_values, query_type: _QueryType) -> ES_Q:
    """Build a TAS-path query_string filter.

    Accepts either a legacy list of TAS codes (delegated to
    ``TreasuryAccounts``) or a dict with optional "require"/"exclude" lists.

    Raises:
        InvalidParameterException: if ``filter_values`` is neither a list nor a dict.
    """
    if isinstance(filter_values, list):
        # This is a legacy usage, and will be dealt with by the other filter
        return TreasuryAccounts.generate_elasticsearch_query(filter_values, query_type)
    elif isinstance(filter_values, dict):
        require = filter_values.get("require") or []
        exclude = filter_values.get("exclude") or []
    else:
        # FIX: removed extraneous f-string prefix (the message has no placeholders).
        raise InvalidParameterException("tas_codes must be an array or object")
    return ES_Q("query_string", query=cls._query_string(require, exclude), default_field="tas_paths")
def _handle_tas_query(cls, must_queries: list, filters: dict, query_type: _QueryType) -> list:
    """If TAS-related filters are present, combine them into a single bool
    "should" clause appended to ``must_queries``, then strip them from
    ``filters`` so they are not processed again."""
    # Order matters: TreasuryAccounts query precedes TasCodes, as before.
    tas_filter_classes = (TreasuryAccounts, TasCodes)
    if any(filters.get(fc.underscore_name) for fc in tas_filter_classes):
        tas_queries = [
            fc.generate_elasticsearch_query(filters[fc.underscore_name], query_type)
            for fc in tas_filter_classes
            if filters.get(fc.underscore_name)
        ]
        must_queries.append(ES_Q("bool", should=tas_queries, minimum_should_match=1))
        for fc in tas_filter_classes:
            filters.pop(fc.underscore_name, None)
    return must_queries
def create_es_search(scope, search_text, country=None, state=None): """ Providing the parameters, create a dictionary representing the bool-query conditional clauses for elasticsearch Args: scope: which city field was chosen for searching `pop` (place of performance) or `recipient_location` search_text: the text the user is typing in and sent to the backend country: optional country selected by user state: optional state selected by user """ # The base query that will do a wildcard term-level query query = {"must": [{"wildcard": {"{}_city_name.keyword".format(scope): search_text + "*"}}]} if country != "USA": # A non-USA selected country if country != ALL_FOREIGN_COUNTRIES: query["must"].append({"match": {"{scope}_country_code".format(scope=scope): country}}) # Create a "Should Not" query with a nested bool, to get everything non-USA query["should"] = [ { "bool": { "must": {"exists": {"field": "{}_country_code".format(scope)}}, "must_not": [ {"match": {"{}_country_code".format(scope): "USA"}}, {"match_phrase": {"{}_country_code".format(scope): "UNITED STATES"}}, ], } } ] query["minimum_should_match"] = 1 else: # USA is selected as country query["should"] = [build_country_match(scope, "USA"), build_country_match(scope, "UNITED STATES")] query["should"].append({"bool": {"must_not": {"exists": {"field": "{}_country_code".format(scope)}}}}) query["minimum_should_match"] = 1 # null country codes are being considered as USA country codes if state: # If a state was provided, include it in the filter to limit hits query["must"].append({"match": {"{}_state_code".format(scope): es_sanitize(state).upper()}}) search = TransactionSearch().filter(ES_Q("bool", **query)) return search
def _generate_elasticsearch_query(cls, filters: dict, query_type: _QueryType) -> ES_Q:
    """Translate each recognized filter into its ES query and AND them all.

    Unsupported filters are logged and skipped; unknown filters raise
    InvalidParameterException. A filter producing several queries has each
    one added to the top-level "must" list.
    """
    must_queries = []
    for filter_type, filter_values in filters.items():
        # Validate the filters
        if filter_type in cls.unsupported_filters:
            msg = "API request included '{}' key. No filtering will occur with provided value '{}'"
            logger.warning(msg.format(filter_type, filter_values))
            continue
        if filter_type not in cls.filter_lookup.keys():
            raise InvalidParameterException(f"Invalid filter: {filter_type} does not exist.")
        # Generate the query for a filter
        generated = cls.filter_lookup[filter_type].generate_query(filter_values, query_type)
        # A single filter may yield one query or a list of queries.
        must_queries.extend(generated if isinstance(generated, list) else [generated])
    return ES_Q("bool", must=must_queries)
def generate_elasticsearch_query(cls, filter_values: List[dict], query_type: _QueryType, nested_path: str = "") -> List[ES_Q]:
    """Build one bool query each for awarding and funding agency filters.

    Each filter dict may supply an agency ``name``, a ``toptier_id``
    (awarding agencies only), and — for subtier filters — a ``toptier_name``
    to pin the parent agency.

    Returns a two-element list: [awarding query, funding query].

    Raises:
        InvalidParameterException: if ``toptier_id`` is supplied for a
            non-awarding agency type.
    """
    awarding_agency_query = []
    funding_agency_query = []
    for v in filter_values:
        agency_name = v.get("name")
        agency_tier = v["tier"]
        agency_type = v["type"]
        toptier_id = v.get("toptier_id")
        toptier_name = v.get("toptier_name")
        agency_query = ES_Q()
        if agency_name:
            agency_query &= ES_Q("match", **{f"{agency_type}_{agency_tier}_agency_name__keyword": agency_name})
        if toptier_id:
            # BUG FIX: this compatibility check previously compared
            # ``toptier_name`` (an agency name) against "awarding"; per the
            # error message it is the agency *type* that must be "awarding"
            # for ``toptier_id`` to apply.
            if agency_type != "awarding":
                raise InvalidParameterException(
                    "Incompatible parameters: `toptier_id` can only be used with `awarding` agency type."
                )
            agency_query &= ES_Q("match", **{"awarding_toptier_agency_id": toptier_id})
        if agency_tier == "subtier" and toptier_name is not None:
            agency_query &= ES_Q("match", **{f"{agency_type}_toptier_agency_name__keyword": toptier_name})
        if agency_type == "awarding":
            awarding_agency_query.append(agency_query)
        elif agency_type == "funding":
            funding_agency_query.append(agency_query)
    return [
        ES_Q("bool", should=awarding_agency_query, minimum_should_match=1),
        ES_Q("bool", should=funding_agency_query, minimum_should_match=1),
    ]
def generate_elasticsearch_query(cls, filter_values, query_type: _QueryType, nested_path: str = "") -> ES_Q:
    """Build a NAICS-code query_string filter.

    Accepts either a legacy list of codes (treated as the required list) or a
    dict with optional "require"/"exclude" lists. Codes are expanded to their
    full NAICS paths before querying.

    Raises:
        InvalidParameterException: for a non-list/dict input, or when any
            code's length is not 2, 4, or 6.
    """
    # legacy functionality permits sending a single list of naics codes, which is treated as the required list
    if isinstance(filter_values, list):
        require = [cls.naics_code_to_naics_code_path(str(code)) for code in filter_values]
        exclude = []
    elif isinstance(filter_values, dict):
        require = [cls.naics_code_to_naics_code_path(str(code)) for code in filter_values.get("require") or []]
        exclude = [cls.naics_code_to_naics_code_path(str(code)) for code in filter_values.get("exclude") or []]
    else:
        # FIX: removed extraneous f-string prefix (the message has no placeholders).
        raise InvalidParameterException("naics_codes must be an array or object")

    # Each path's final segment is the code itself; only 2/4/6-digit codes are supported.
    if any(len(value[-1]) not in (2, 4, 6) for value in require + exclude):
        raise InvalidParameterException(
            "naics code filtering only supported for codes with lengths of 2, 4, and 6"
        )

    # FIX: dropped the redundant ``[code for code in require]`` / ``exclude``
    # self-copies — they had no effect.
    return ES_Q("query_string", query=cls._query_string(require, exclude), default_field="naics_code.keyword")
def build_elasticsearch_search_with_aggregations(self) -> Optional[AccountSearch]:
    """
    Using the provided ES_Q object creates an AccountSearch object with the necessary applied aggregations.

    Returns None when the earlier bucket count found no results; otherwise an
    AccountSearch carrying the nested/filter/terms aggregation chain (plus an
    optional sub-aggregation) with size 0 so only aggregations are returned.
    """
    # No need to continue if there is no result
    if self.bucket_count == 0:
        return None

    # Create the initial search using filters
    search = AccountSearch().filter(self.filter_query)

    # Create the aggregations
    financial_accounts_agg = A("nested", path="financial_accounts_by_award")
    if "query" in self.filters:
        terms = ES_Q(
            "terms",
            **{"financial_accounts_by_award.disaster_emergency_fund_code": self.filters.get("def_codes")},
        )
        # NOTE: the comprehension variable below shadows the outer "query"
        # name; it is local to the comprehension and refers to a field name.
        query = ES_Q(
            "multi_match",
            query=self.filters["query"],
            type="phrase_prefix",
            fields=[f"financial_accounts_by_award.{query}" for query in self.query_fields],
        )
        # minimum_should_match=2 requires BOTH the DEFC terms and the text match.
        filter_agg_query = ES_Q("bool", should=[terms, query], minimum_should_match=2)
    else:
        filter_agg_query = ES_Q(
            "terms",
            **{"financial_accounts_by_award.disaster_emergency_fund_code": self.filters.get("def_codes")},
        )
    filtered_aggs = A("filter", filter_agg_query)
    group_by_dim_agg = A("terms", field=self.agg_key, size=self.bucket_count)
    dim_metadata = A(
        "top_hits",
        size=1,
        sort=[{"financial_accounts_by_award.update_date": {"order": "desc"}}],
        _source={"includes": self.top_hits_fields},
    )
    # Painless script: sum outlays only for rows flagged as the final
    # balances for the fiscal year, treating missing columns as 0.
    sum_covid_outlay = A(
        "sum",
        script="""doc['financial_accounts_by_award.is_final_balances_for_fy'].value ? ( ( doc['financial_accounts_by_award.gross_outlay_amount_by_award_cpe'].size() > 0 ? doc['financial_accounts_by_award.gross_outlay_amount_by_award_cpe'].value : 0) + (doc['financial_accounts_by_award.ussgl487200_down_adj_pri_ppaid_undel_orders_oblig_refund_cpe'].size() > 0 ? doc['financial_accounts_by_award.ussgl487200_down_adj_pri_ppaid_undel_orders_oblig_refund_cpe'].value : 0) + (doc['financial_accounts_by_award.ussgl497200_down_adj_pri_paid_deliv_orders_oblig_refund_cpe'].size() > 0 ?
doc['financial_accounts_by_award.ussgl497200_down_adj_pri_paid_deliv_orders_oblig_refund_cpe'].value : 0) ) : 0""",
    )
    sum_covid_obligation = A("sum", field="financial_accounts_by_award.transaction_obligated_amount")
    count_awards_by_dim = A("reverse_nested", **{})
    award_count = A("value_count", field="financial_account_distinct_award_key")
    loan_value = A("sum", field="total_loan_value")

    # Apply the aggregations
    search.aggs.bucket(self.agg_group_name, financial_accounts_agg).bucket(
        "filtered_aggs", filtered_aggs
    ).bucket("group_by_dim_agg", group_by_dim_agg).metric(
        "dim_metadata", dim_metadata
    ).metric(
        "sum_transaction_obligated_amount", sum_covid_obligation
    ).metric(
        "sum_gross_outlay_amount_by_award_cpe", sum_covid_outlay
    ).bucket(
        "count_awards_by_dim", count_awards_by_dim
    ).metric(
        "award_count", award_count
    ).metric(
        "sum_loan_value", loan_value
    )

    # Apply sub-aggregation for children if applicable
    if self.sub_agg_key:
        self.extend_elasticsearch_search_with_sub_aggregation(search)

    # size 0: only aggregation results are wanted, not individual hits
    search.update_from_dict({"size": 0})
    return search