def test_filtering_subtier_with_bogus_toptier_es(
    client, monkeypatch, elasticsearch_transaction_index, basic_award, subagency_award
):
    """A subtier agency filter paired with a non-matching toptier name should yield zero results."""
    logging_statements = []
    setup_elasticsearch_test(monkeypatch, elasticsearch_transaction_index, logging_statements)

    resp = client.post(
        "/api/v2/search/spending_by_category/awarding_subagency",
        content_type="application/json",
        # json.dumps(...) for consistency with every other test in this module; relying on the
        # test client's implicit dict-to-JSON serialization is fragile across Django versions.
        data=json.dumps(
            {
                "filters": {
                    "time_period": [{"start_date": "2018-10-01", "end_date": "2020-09-30"}],
                    "agencies": [
                        {
                            "type": "awarding",
                            "tier": "subtier",
                            "name": "Awarding Subtier Agency 5",
                            "toptier_name": "bogus toptier name",
                        }
                    ],
                }
            }
        ),
        **{EXPERIMENTAL_API_HEADER: ELASTICSEARCH_HEADER_VALUE},
    )

    assert resp.status_code == status.HTTP_200_OK
    # The bogus toptier name must not match any award, so the result set is empty.
    assert resp.data == {
        "category": "awarding_subagency",
        "limit": 10,
        "page_metadata": {"page": 1, "next": None, "previous": None, "hasNext": False, "hasPrevious": False},
        "results": [],
        "messages": [get_time_period_message()],
    }
def test_subset_of_fields_returned(client, monkeypatch, transaction_data, elasticsearch_transaction_index):
    """Only the requested columns (plus the internal id fields) should appear in each result row."""
    captured_logs = []
    setup_elasticsearch_test(monkeypatch, elasticsearch_transaction_index, captured_logs)

    requested_fields = ["Award ID", "Recipient Name", "Mod"]
    payload = json.dumps(
        {
            "filters": {"keyword": "test", "award_type_codes": ["A", "B", "C", "D"]},
            "fields": requested_fields,
            "page": 1,
            "limit": 5,
            "sort": "Award ID",
            "order": "desc",
        }
    )

    resp = client.post(ENDPOINT, content_type="application/json", data=payload)

    assert resp.status_code == status.HTTP_200_OK
    rows = resp.data["results"]
    assert len(rows) > 0
    for row in rows:
        # Every requested column is present...
        for field in requested_fields:
            assert field in row, f"Response item is missing field {field}"
        # ...the internal identifiers always come along...
        assert "internal_id" in row
        assert "generated_internal_id" in row
        # ...and columns we did not ask for are excluded.
        assert "Last Date to Order" not in row
def test_columns_can_be_sorted(client, monkeypatch, transaction_data, elasticsearch_transaction_index):
    """Every sortable column should be accepted by the endpoint with a 200 response."""
    logging_statements = []
    setup_elasticsearch_test(monkeypatch, elasticsearch_transaction_index, logging_statements)

    # NOTE: the original list contained "Action Date" twice; deduplicated so each
    # column is exercised exactly once.
    fields = [
        "Action Date",
        "Award ID",
        "Awarding Agency",
        "Awarding Sub Agency",
        "Award Type",
        "Mod",
        "Recipient Name",
    ]
    request = {
        "filters": {"keyword": "test", "award_type_codes": ["A", "B", "C", "D"]},
        "fields": fields,
        "page": 1,
        "limit": 5,
        "order": "desc",
    }

    # Re-use the same request, swapping only the sort column each iteration.
    for field in fields:
        request["sort"] = field
        resp = client.post(ENDPOINT, content_type="application/json", data=json.dumps(request))
        assert resp.status_code == status.HTTP_200_OK, f"Failed to sort column: {field}"
def test_correct_response(client, monkeypatch, elasticsearch_transaction_index, awards_and_transactions):
    """Spot-check the district category aggregation against the known fixture data."""
    captured_logs = []
    setup_elasticsearch_test(monkeypatch, elasticsearch_transaction_index, captured_logs)

    payload = {"filters": {"time_period": [{"start_date": "2018-10-01", "end_date": "2020-09-30"}]}}
    resp = client.post(
        "/api/v2/search/spending_by_category/district",
        content_type="application/json",
        data=json.dumps(payload),
        **{EXPERIMENTAL_API_HEADER: ELASTICSEARCH_HEADER_VALUE},
    )

    assert resp.status_code == status.HTTP_200_OK, "Failed to return 200 Response"
    assert len(captured_logs) == 1, "Expected one logging statement"
    # Results are ordered by descending amount.
    assert resp.json() == {
        "category": "district",
        "limit": 10,
        "page_metadata": {"page": 1, "next": None, "previous": None, "hasNext": False, "hasPrevious": False},
        "results": [
            {"amount": 500000.0, "code": "90", "id": None, "name": "SC-MULTIPLE DISTRICTS"},
            {"amount": 50005.0, "code": "10", "id": None, "name": "SC-10"},
            {"amount": 5500.0, "code": "50", "id": None, "name": "WA-50"},
            {"amount": 50.0, "code": "50", "id": None, "name": "SC-50"},
        ],
        "messages": [get_time_period_message()],
    }
def test_correct_response_with_more_awards(
    client, monkeypatch, elasticsearch_transaction_index, basic_award, subagency_award
):
    """The awarding_agency category should aggregate across both award fixtures."""
    captured_logs = []
    setup_elasticsearch_test(monkeypatch, elasticsearch_transaction_index, captured_logs)

    payload = {"filters": {"time_period": [{"start_date": "2018-10-01", "end_date": "2020-09-30"}]}}
    resp = client.post(
        "/api/v2/search/spending_by_category/awarding_agency",
        content_type="application/json",
        data=json.dumps(payload),
        **{EXPERIMENTAL_API_HEADER: ELASTICSEARCH_HEADER_VALUE},
    )

    assert resp.status_code == status.HTTP_200_OK, "Failed to return 200 Response"
    assert len(captured_logs) == 1, "Expected one logging statement"
    # One bucket per toptier agency, ordered by descending amount.
    assert resp.json() == {
        "category": "awarding_agency",
        "limit": 10,
        "page_metadata": {"page": 1, "next": None, "previous": None, "hasNext": False, "hasPrevious": False},
        "results": [
            {"amount": 10.0, "name": "Awarding Toptier Agency 3", "code": "TA3", "id": 1003},
            {"amount": 5.0, "name": "Awarding Toptier Agency 1", "code": "TA1", "id": 1001},
        ],
        "messages": [get_time_period_message()],
    }
def test_spending_by_transaction_count(monkeypatch, transaction_type_data, elasticsearch_transaction_index):
    """Each award-type group should count exactly one matching transaction for the keyword."""
    captured_logs = []
    setup_elasticsearch_test(monkeypatch, elasticsearch_transaction_index, captured_logs)

    results = spending_by_transaction_count({"filters": {"keywords": ["pop tart"]}})

    assert results == {"contracts": 1, "grants": 1, "idvs": 1, "loans": 1, "direct_payments": 1, "other": 1}
def test_get_download_ids(monkeypatch, transaction_type_data, elasticsearch_transaction_index):
    """get_download_ids should yield every matching transaction id for the keyword, in order."""
    captured_logs = []
    setup_elasticsearch_test(monkeypatch, elasticsearch_transaction_index, captured_logs)

    # The helper yields batches of ids; flatten them before comparing.
    batches = get_download_ids(["pop tart"], "transaction_id")
    transaction_ids = [tid for batch in batches for tid in batch]

    assert transaction_ids == [1, 2, 3, 4, 5, 6]
def test_success_with_all_filters(client, monkeypatch, elasticsearch_transaction_index, basic_award):
    """
    General test to make sure that all groups respond with a Status Code of 200 regardless of the filters.
    """
    captured_logs = []
    setup_elasticsearch_test(monkeypatch, elasticsearch_transaction_index, captured_logs)

    es_headers = {EXPERIMENTAL_API_HEADER: ELASTICSEARCH_HEADER_VALUE}
    resp = client.post(
        "/api/v2/search/spending_by_category/awarding_subagency",
        content_type="application/json",
        data=json.dumps({"filters": non_legacy_filters()}),
        **es_headers,
    )

    assert resp.status_code == status.HTTP_200_OK, "Failed to return 200 Response"
    assert len(captured_logs) == 1, "Expected one logging statement"
def test_correct_response(client, monkeypatch, elasticsearch_transaction_index, awards_and_transactions):
    """Spot-check the recipient_duns category aggregation against the known fixture data."""
    captured_logs = []
    setup_elasticsearch_test(monkeypatch, elasticsearch_transaction_index, captured_logs)

    payload = {"filters": {"time_period": [{"start_date": "2007-10-01", "end_date": "2020-09-30"}]}}
    resp = client.post(
        "/api/v2/search/spending_by_category/recipient_duns",
        content_type="application/json",
        data=json.dumps(payload),
        **{EXPERIMENTAL_API_HEADER: ELASTICSEARCH_HEADER_VALUE},
    )

    assert resp.status_code == status.HTTP_200_OK, "Failed to return 200 Response"
    assert len(captured_logs) == 1, "Expected one logging statement"
    # Buckets are ordered by descending amount; recipient_id is only populated for
    # recipients resolvable to a single profile.
    assert resp.json() == {
        "category": "recipient_duns",
        "limit": 10,
        "page_metadata": {"page": 1, "next": None, "previous": None, "hasNext": False, "hasPrevious": False},
        "results": [
            {
                "amount": 5000000.0,
                "code": "DUNS Number not provided",
                "name": "MULTIPLE RECIPIENTS",
                "recipient_id": None,
            },
            {"amount": 550000.0, "code": "123456789", "name": None, "recipient_id": None},
            {"amount": 5000.0, "code": "096354360", "name": "MULTIPLE RECIPIENTS", "recipient_id": None},
            {
                "amount": 500.0,
                "code": "987654321",
                "name": "RECIPIENT 3",
                "recipient_id": "d2894d22-67fc-f9cb-4005-33fa6a29ef86-C",
            },
            {"amount": 50.0, "code": "456789123", "name": "RECIPIENT 2", "recipient_id": None},
            {
                "amount": 5.0,
                "code": "DUNS Number not provided",
                "name": "RECIPIENT 1",
                "recipient_id": "5f572ec9-8b49-e5eb-22c7-f6ef316f7689-R",
            },
        ],
        "messages": [get_time_period_message()],
    }
def test_a_search_endpoint(client, monkeypatch, award_data_fixture, elasticsearch_transaction_index):
    """
    An example of how one might test a keyword search.
    """
    # This is the important part. This ensures data is loaded into your Elasticsearch.
    captured_logs = []
    setup_elasticsearch_test(monkeypatch, elasticsearch_transaction_index, captured_logs)

    requested_fields = [
        "Award ID",
        "Mod",
        "Recipient Name",
        "Action Date",
        "Transaction Amount",
        "Awarding Agency",
        "Awarding Sub Agency",
        "Award Type",
    ]
    query = {
        "filters": {"keyword": "IND12PB00323", "award_type_codes": ["A", "B", "C", "D"]},
        "fields": requested_fields,
        "page": 1,
        "limit": 35,
        "sort": "Transaction Amount",
        "order": "desc",
    }

    response = client.post(
        "/api/v2/search/spending_by_transaction", content_type="application/json", data=json.dumps(query)
    )

    assert response.status_code == status.HTTP_200_OK
    # The PIID keyword should match exactly one transaction.
    assert len(response.data["results"]) == 1
def test_top_1_fails_with_es_transactions_routed_dangerously(client, monkeypatch, elasticsearch_transaction_index, db):
    """
    This confirms vulnerability of high-cardinality aggregations documented in DEV-4685, that leads to inaccurate
    summing and ordering of sums when taking less buckets than the term cardinality.

    This is shown by manually applying a routing key (using a key value stuck in ``awards.piid`` field here as the
    routing key value) on the index so that documents are distributed as below

    NOTE: This requires an ES cluster with at least 3 shards for the transaction index. Ours should be defaulted to 5.

    Recipient   shard0          shard1                          shard2
    Biz 1       $2.00           Biz 1 $7.00, $3.00              Biz 1 $2.00, $3.00, $5.00
    Biz 2       $13.00          Biz 2 $2.00, $3.00, $4.00       Biz 2 $6.00, $3.00

    **IF THIS TEST FAILS**
    - Did our cluster structure change to not be 5 shards per the transaction index?
    - Did the transaction<->award DB linkage change?
    - Did we change ES version or config?
    - Investigate if Elasticsearch has changed the way they do routing or hash routing key values
    """
    # Setup data for this test
    recipient1 = uuid.uuid4()
    recipient2 = uuid.uuid4()

    # Recipient Lookup
    mommy.make("recipient.RecipientLookup", id=1, recipient_hash=recipient1, legal_business_name="Biz 1", duns="111")
    mommy.make("recipient.RecipientLookup", id=2, recipient_hash=recipient2, legal_business_name="Biz 2", duns="222")

    # Transaction FPDS: Biz 1 totals $22.00, Biz 2 totals $31.00
    _make_fpds_transaction(1, 1, 2.00, "2020-01-01", "111", "Biz 1")
    _make_fpds_transaction(2, 3, 7.00, "2020-02-02", "111", "Biz 1")
    _make_fpds_transaction(3, 3, 3.00, "2020-03-03", "111", "Biz 1")
    _make_fpds_transaction(4, 2, 2.00, "2020-01-02", "111", "Biz 1")
    _make_fpds_transaction(5, 2, 3.00, "2020-02-03", "111", "Biz 1")
    _make_fpds_transaction(6, 2, 5.00, "2020-03-04", "111", "Biz 1")
    _make_fpds_transaction(7, 2, 6.00, "2020-01-03", "222", "Biz 2")
    _make_fpds_transaction(8, 2, 3.00, "2020-02-04", "222", "Biz 2")
    _make_fpds_transaction(9, 3, 2.00, "2020-03-05", "222", "Biz 2")
    _make_fpds_transaction(10, 3, 3.00, "2020-01-04", "222", "Biz 2")
    _make_fpds_transaction(11, 3, 4.00, "2020-02-05", "222", "Biz 2")
    _make_fpds_transaction(12, 1, 13.00, "2020-03-06", "222", "Biz 2")

    # Awards
    # Jam a routing key value into the piid field, and use the derived piid value for routing documents to shards later
    mommy.make("awards.Award", id=1, latest_transaction_id=12, piid="shard_zero")
    mommy.make("awards.Award", id=2, latest_transaction_id=6, piid="shard_one")
    mommy.make("awards.Award", id=3, latest_transaction_id=9, piid="shard_two")

    # Push DB data into the test ES cluster
    # NOTE: Force routing of documents by the piid field, which will separate them into 3 groups, leading to an
    # inaccurate sum and ordering of sums
    logging_statements = []
    # Using piid (derived from the transaction's award) to route transaction documents to shards
    setup_elasticsearch_test(monkeypatch, elasticsearch_transaction_index, logging_statements, routing="piid")

    search = TransactionSearch()
    total = search.handle_count()
    assert total == 12, "Should have seen 12 documents indexed for this test"

    # size=1/shard_size=1 deliberately under-samples each shard to provoke the inaccuracy.
    group_by_agg = A("terms", field="recipient_hash", size=1, shard_size=1, order={"sum_agg": "desc"})
    sum_agg = A("sum", field="generated_pragmatic_obligation")
    search.aggs.bucket("results", group_by_agg).metric("sum_agg", sum_agg)
    logging.getLogger("console").debug(f"=>->=>->=>-> WILL RUN THIS ES QUERY: \n {search.extra(size=0).to_dict()}")
    response = search.extra(size=0).handle_execute().to_dict()

    results = []
    for bucket in response["aggregations"]["results"]["buckets"]:
        results.append({"key": bucket["key"], "sum": bucket["sum_agg"]["value"]})

    # (removed a leftover debug print(results) here)
    assert len(results) == 1
    assert results[0]["key"] == str(
        recipient1
    ), "This botched 'Top 1' sum agg should have incorrectly chosen the lesser recipient"
    assert results[0]["sum"] == 20.0, "The botched 'Top 1' sum agg should have incorrectly summed up recipient totals"
def test_top_1_with_es_transactions_routed_by_recipient(client, monkeypatch, elasticsearch_transaction_index, db):
    """
    This tests the approach to compensating for high-cardinality aggregations documented in DEV-4685, to ensure
    accuracy and completeness of aggregations and sorting even when taking less buckets than the term cardinality.

    Without the code to route indexing of transaction documents in elasticsearch to shards by the
    `recipient_agg_key`, which was added to :meth:`usaspending_api.etl.es_etl_helpers.csv_chunk_gen`, the below agg
    queries should lead to inaccurate results, as shown in the DEV-4538. With routing by recipient, documents will
    be allocated to shards as below (all of a recipient's documents land on one shard)

    Recipient   Biz 1: $2.00, $7.00, $3.00, $2.00, $3.00, $5.00
                Biz 2: $6.00, $3.00, $2.00, $3.00, $4.00, $13.00

    **IF THIS TEST FAILS**
    - Are we still using the TestElasticSearchIndex fixture to help with pushing test data to ES?
    - Did TestElasticSearchIndex indexing / routing behavior change?
    - Did our cluster structure change to not be 5 shards per the transaction index?
    - Did the transaction<->recipient DB linkage change?
    - Did we change ES version or config?
    - Investigate if Elasticsearch has changed the way they do routing or hash routing key values
    """
    # Setup data for this test
    recipient1 = uuid.uuid4()
    recipient2 = uuid.uuid4()

    # Recipient Lookup
    mommy.make("recipient.RecipientLookup", id=1, recipient_hash=recipient1, legal_business_name="Biz 1", duns="111")
    mommy.make("recipient.RecipientLookup", id=2, recipient_hash=recipient2, legal_business_name="Biz 2", duns="222")

    # Transaction FPDS: Biz 1 ("111") sums to $22.00, Biz 2 ("222") sums to $31.00
    _make_fpds_transaction(1, 1, 2.00, "2020-01-01", "111", "Biz 1")
    _make_fpds_transaction(2, 3, 7.00, "2020-02-02", "111", "Biz 1")
    _make_fpds_transaction(3, 3, 3.00, "2020-03-03", "111", "Biz 1")
    _make_fpds_transaction(4, 2, 2.00, "2020-01-02", "111", "Biz 1")
    _make_fpds_transaction(5, 2, 3.00, "2020-02-03", "111", "Biz 1")
    _make_fpds_transaction(6, 2, 5.00, "2020-03-04", "111", "Biz 1")
    _make_fpds_transaction(7, 2, 6.00, "2020-01-03", "222", "Biz 2")
    _make_fpds_transaction(8, 2, 3.00, "2020-02-04", "222", "Biz 2")
    _make_fpds_transaction(9, 3, 2.00, "2020-03-05", "222", "Biz 2")
    _make_fpds_transaction(10, 3, 3.00, "2020-01-04", "222", "Biz 2")
    _make_fpds_transaction(11, 3, 4.00, "2020-02-05", "222", "Biz 2")
    _make_fpds_transaction(12, 1, 13.00, "2020-03-06", "222", "Biz 2")

    # Awards (no piid routing override here — default recipient-based routing applies)
    mommy.make("awards.Award", id=1, latest_transaction_id=12)
    mommy.make("awards.Award", id=2, latest_transaction_id=6)
    mommy.make("awards.Award", id=3, latest_transaction_id=9)

    # Push DB data into the test ES cluster
    logging_statements = []
    setup_elasticsearch_test(monkeypatch, elasticsearch_transaction_index, logging_statements)

    search = TransactionSearch()
    total = search.handle_count()
    assert total == 12, "Should have seen 12 documents indexed for this test"

    # size=1/shard_size=1 is the same under-sampling as the "dangerous" routing test above,
    # but with recipient routing each shard holds complete recipient totals, so it stays accurate.
    group_by_agg = A("terms", field="recipient_hash", size=1, shard_size=1, order={"sum_agg": "desc"})
    sum_agg = A("sum", field="generated_pragmatic_obligation")
    search.aggs.bucket("results", group_by_agg).metric("sum_agg", sum_agg)
    logging.getLogger("console").debug(f"=>->=>->=>-> WILL RUN THIS ES QUERY: \n {search.extra(size=0).to_dict()}")
    response = search.extra(size=0).handle_execute().to_dict()

    results = []
    for bucket in response["aggregations"]["results"]["buckets"]:
        results.append({"key": bucket["key"], "sum": bucket["sum_agg"]["value"]})

    assert len(results) == 1
    assert results[0]["key"] == str(
        recipient2
    ), "The 'Top 1' sum agg incorrectly chose the recipient with a lesser total sum"
    assert results[0]["sum"] == 31.0, "The 'Top 1' sum agg incorrectly summed up recipient totals"