def test_fill_fields_and_timeseries_from_column():
    """Merge a timeseries column from two sources, keeping non-overlapping data.

    "cnt" is sourced from both datasets ("new" overriding "existing" where it
    has data for a fips), while "foo" comes only from "existing".
    """
    base = read_csv_and_index_fips_date(
        "fips,state,aggregate_level,county,cnt,date,foo\n"
        "55005,ZZ,county,North County,1,2020-05-01,ab\n"
        "55005,ZZ,county,North County,2,2020-05-02,cd\n"
        "55005,ZZ,county,North County,,2020-05-03,ef\n"
        "55006,ZZ,county,South County,4,2020-05-04,gh\n"
        "55,ZZ,state,Grand State,41,2020-05-01,ij\n"
        "55,ZZ,state,Grand State,43,2020-05-03,kl\n"
    )
    overlay = read_csv_and_index_fips_date(
        "fips,state,aggregate_level,county,cnt,date\n"
        "55006,ZZ,county,South County,44,2020-05-04\n"
        "55007,ZZ,county,West County,28,2020-05-03\n"
        "55,ZZ,state,Grand State,42,2020-05-02\n"
    )
    expected = read_csv_and_index_fips_date(
        "fips,state,aggregate_level,county,cnt,date,foo\n"
        "55005,ZZ,county,North County,1,2020-05-01,ab\n"
        "55005,ZZ,county,North County,2,2020-05-02,cd\n"
        "55005,ZZ,county,North County,,2020-05-03,ef\n"
        "55006,ZZ,county,South County,44,2020-05-04,gh\n"
        "55007,ZZ,county,West County,28,2020-05-03,\n"
        "55,ZZ,state,Grand State,,2020-05-01,ij\n"
        "55,ZZ,state,Grand State,42,2020-05-02,\n"
        "55,ZZ,state,Grand State,,2020-05-03,kl\n"
    )

    merged, _ = _build_data_and_provenance(
        {"cnt": ["existing", "new"], "foo": ["existing"]},
        {"existing": base, "new": overlay},
    )

    assert to_dict(["fips", "date"], merged) == to_dict(["fips", "date"], expected)
def test_fill_fields_with_data_source_add_column():
    """A column absent from one input ("existing" has no current_icu) must not crash the merge."""
    base = read_csv_and_index_fips(
        "fips,state,aggregate_level,county,preserved\n"
        "55005,ZZ,county,North County,ab\n"
        "55,ZZ,state,Grand State,cd\n",
    )
    overlay = read_csv_and_index_fips(
        "fips,state,aggregate_level,county,current_icu\n"
        "55007,ZZ,county,West County,28\n"
        "55,ZZ,state,Grand State,64\n",
    )
    expected = read_csv_and_index_fips(
        "fips,state,aggregate_level,county,current_icu,preserved\n"
        "55005,ZZ,county,North County,,ab\n"
        "55007,ZZ,county,West County,28,\n"
        "55,ZZ,state,Grand State,64,cd\n",
    )

    merged, _ = _build_data_and_provenance(
        {"current_icu": ["new"], "preserved": ["existing"]},
        {"existing": base, "new": overlay},
    )

    assert to_dict(["fips"], merged) == to_dict(["fips"], expected)
def test_build_timeseries():
    """The last source listed for a field wins, and provenance records which one."""
    ts_one = read_csv_and_index_fips_date(
        "county,state,fips,country,aggregate_level,date,cases\n"
        "Jones County,ZZ,97123,USA,county,2020-04-01,1\n")
    ts_two = read_csv_and_index_fips_date(
        "county,state,fips,country,aggregate_level,date,cases\n"
        "Jones County,ZZ,97123,USA,county,2020-04-01,2\n")
    sources = {"source_a": ts_one, "source_b": ts_two}
    key = ("97123", "2020-04-01")

    # source_b listed last -> its value overrides source_a.
    merged, provenance = _build_data_and_provenance(
        {"cases": ["source_a", "source_b"]}, sources)
    assert merged.at[key, "cases"] == 2
    assert provenance.at[key, "cases"] == "source_b"

    # Reversed priority -> source_a overrides source_b.
    merged, provenance = _build_data_and_provenance(
        {"cases": ["source_b", "source_a"]}, sources)
    assert merged.at[key, "cases"] == 1
    assert provenance.at[key, "cases"] == "source_a"
def test_build_timeseries_override():
    """For a fips where the highest-priority source has any data, its whole
    timeseries replaces the lower-priority one (NaNs included), and
    provenance follows the data.
    """

    def nones(series):
        # Normalize NaN -> None so list comparison is exact.
        return series.replace({np.nan: None}).tolist()

    series_a = read_csv_and_index_fips_date("fips,date,m1,m2\n"
                                            "97123,2020-04-01,1,\n"
                                            "97123,2020-04-02,,\n"
                                            "97123,2020-04-03,3,3")
    series_b = read_csv_and_index_fips_date("fips,date,m1,m2\n"
                                            "97123,2020-04-01,,\n"
                                            "97123,2020-04-02,2,\n")
    sources = {"source_a": series_a, "source_b": series_b}

    # source_b is highest priority and has m1 data for 97123, so the combined
    # m1 series is source_b's series; source_a's m1 values are not used.
    merged, provenance = _build_data_and_provenance(
        {"m1": ["source_a", "source_b"]}, sources,
    )
    assert nones(merged.loc["97123", "m1"]) == [None, 2, None]
    assert nones(provenance.loc["97123", "m1"]) == [None, "source_b", None]

    # With priority reversed, source_a's whole m1 series is used instead —
    # including its NaN on 2020-04-02, even though source_b has a value there.
    merged, provenance = _build_data_and_provenance(
        {"m1": ["source_b", "source_a"]}, sources)
    assert nones(merged.loc["97123", "m1"]) == [1, None, 3]
    assert nones(provenance.loc["97123", "m1"]) == ["source_a", None, "source_a"]
def test_build_latest():
    """Latest-value merge: the last-listed source wins per fips; a fips present
    only in a lower-priority source keeps that source's value and provenance.
    """
    latest_a = read_csv_and_index_fips(
        "county,state,fips,country,aggregate_level,date,cases\n"
        "Jones County,ZZ,97123,USA,county,2020-04-01,1\n"
        "Three County,XY,97333,USA,county,2020-04-01,3\n")
    latest_b = read_csv_and_index_fips(
        "county,state,fips,country,aggregate_level,date,cases\n"
        "Jones County,ZZ,97123,USA,county,2020-04-01,2\n")
    sources = {"source_a": latest_a, "source_b": latest_b}

    merged, provenance = _build_data_and_provenance(
        {"cases": ["source_a", "source_b"]}, sources)
    assert merged.at["97123", "cases"] == 2
    assert provenance.at["97123", "cases"] == "source_b"
    # 97333 only appears in source_a, so it survives regardless of priority.
    assert merged.at["97333", "cases"] == 3
    assert provenance.at["97333", "cases"] == "source_a"

    merged, provenance = _build_data_and_provenance(
        {"cases": ["source_b", "source_a"]}, sources)
    assert merged.at["97123", "cases"] == 1
    assert provenance.at["97123", "cases"] == "source_a"
    assert merged.at["97333", "cases"] == 3
    assert provenance.at["97333", "cases"] == "source_a"
def test_fill_fields_with_data_source_no_rows_input():
    """An input dataset with a header but zero rows merges cleanly."""
    empty = read_csv_and_index_fips("fips,state,aggregate_level,county,preserved\n")
    overlay = read_csv_and_index_fips(
        "fips,state,aggregate_level,county,current_icu\n"
        "55007,ZZ,county,West County,28\n"
        "55,ZZ,state,Grand State,64\n",
    )
    expected = read_csv_and_index_fips(
        "fips,state,aggregate_level,county,current_icu,preserved\n"
        "55007,ZZ,county,West County,28,\n"
        "55,ZZ,state,Grand State,64,\n"
    )

    merged, _ = _build_data_and_provenance(
        {"current_icu": ["new"], "preserved": ["existing"]},
        {"existing": empty, "new": overlay},
    )

    assert to_dict(["fips"], merged) == to_dict(["fips"], expected)
def test_build_and_and_provenance_missing_fips():
    """A fips present in only one source (97444, only in source_b) still lands
    in the merged output, with provenance only where that source has data.
    """
    frame_a = read_csv_and_index_fips_date("fips,date,m1,m2\n"
                                           "97111,2020-04-01,1,\n"
                                           "97111,2020-04-02,,\n"
                                           "97111,2020-04-03,3,3\n")
    frame_b = read_csv_and_index_fips_date("fips,date,m1,m2\n"
                                           "97111,2020-04-01,,\n"
                                           "97111,2020-04-02,2,\n"
                                           "97444,2020-04-04,4,\n")

    merged, provenance = _build_data_and_provenance(
        {
            "m1": ["source_a", "source_b"],
            "m2": ["source_a", "source_b"],
        },
        {"source_a": frame_a, "source_b": frame_b},
    )

    # 97444's m1 value comes from source_b, its only source.
    assert merged.loc["97444", "m1"].dropna().tolist() == [4]
    assert provenance.loc["97444", "m1"].dropna().tolist() == ["source_b"]
    # Neither source has m2 for 97444, so both data and provenance are empty.
    assert merged.loc["97444", "m2"].dropna().tolist() == []
    assert provenance.loc["97444", "m2"].dropna().tolist() == []