def test_fill_fields_and_timeseries_from_column():
    """Merge a timeseries column from two sources, keeping non-overlapping data.

    "cnt" is sourced from both datasets ("new" overriding "existing" where it
    has data for a fips), while "foo" comes only from "existing".
    """
    base = read_csv_and_index_fips_date(
        "fips,state,aggregate_level,county,cnt,date,foo\n"
        "55005,ZZ,county,North County,1,2020-05-01,ab\n"
        "55005,ZZ,county,North County,2,2020-05-02,cd\n"
        "55005,ZZ,county,North County,,2020-05-03,ef\n"
        "55006,ZZ,county,South County,4,2020-05-04,gh\n"
        "55,ZZ,state,Grand State,41,2020-05-01,ij\n"
        "55,ZZ,state,Grand State,43,2020-05-03,kl\n"
    )
    overlay = read_csv_and_index_fips_date(
        "fips,state,aggregate_level,county,cnt,date\n"
        "55006,ZZ,county,South County,44,2020-05-04\n"
        "55007,ZZ,county,West County,28,2020-05-03\n"
        "55,ZZ,state,Grand State,42,2020-05-02\n"
    )
    expected = read_csv_and_index_fips_date(
        "fips,state,aggregate_level,county,cnt,date,foo\n"
        "55005,ZZ,county,North County,1,2020-05-01,ab\n"
        "55005,ZZ,county,North County,2,2020-05-02,cd\n"
        "55005,ZZ,county,North County,,2020-05-03,ef\n"
        "55006,ZZ,county,South County,44,2020-05-04,gh\n"
        "55007,ZZ,county,West County,28,2020-05-03,\n"
        "55,ZZ,state,Grand State,,2020-05-01,ij\n"
        "55,ZZ,state,Grand State,42,2020-05-02,\n"
        "55,ZZ,state,Grand State,,2020-05-03,kl\n"
    )

    merged, _ = _build_data_and_provenance(
        {"cnt": ["existing", "new"], "foo": ["existing"]},
        {"existing": base, "new": overlay},
    )

    assert to_dict(["fips", "date"], merged) == to_dict(["fips", "date"], expected)
def test_fill_fields_with_data_source_add_column():
    """A column absent from one input ("existing" has no current_icu) must not crash the merge."""
    base = read_csv_and_index_fips(
        "fips,state,aggregate_level,county,preserved\n"
        "55005,ZZ,county,North County,ab\n"
        "55,ZZ,state,Grand State,cd\n",
    )
    overlay = read_csv_and_index_fips(
        "fips,state,aggregate_level,county,current_icu\n"
        "55007,ZZ,county,West County,28\n"
        "55,ZZ,state,Grand State,64\n",
    )
    expected = read_csv_and_index_fips(
        "fips,state,aggregate_level,county,current_icu,preserved\n"
        "55005,ZZ,county,North County,,ab\n"
        "55007,ZZ,county,West County,28,\n"
        "55,ZZ,state,Grand State,64,cd\n",
    )

    merged, _ = _build_data_and_provenance(
        {"current_icu": ["new"], "preserved": ["existing"]},
        {"existing": base, "new": overlay},
    )

    assert to_dict(["fips"], merged) == to_dict(["fips"], expected)
def test_build_timeseries():
    """The last source listed for a field wins, and provenance records which one."""
    ts_one = read_csv_and_index_fips_date(
        "county,state,fips,country,aggregate_level,date,cases\n"
        "Jones County,ZZ,97123,USA,county,2020-04-01,1\n")
    ts_two = read_csv_and_index_fips_date(
        "county,state,fips,country,aggregate_level,date,cases\n"
        "Jones County,ZZ,97123,USA,county,2020-04-01,2\n")
    sources = {"source_a": ts_one, "source_b": ts_two}
    key = ("97123", "2020-04-01")

    # source_b listed last -> its value overrides source_a.
    merged, provenance = _build_data_and_provenance(
        {"cases": ["source_a", "source_b"]}, sources)
    assert merged.at[key, "cases"] == 2
    assert provenance.at[key, "cases"] == "source_b"

    # Reversed priority -> source_a overrides source_b.
    merged, provenance = _build_data_and_provenance(
        {"cases": ["source_b", "source_a"]}, sources)
    assert merged.at[key, "cases"] == 1
    assert provenance.at[key, "cases"] == "source_a"
def test_build_timeseries_override():
    """For a fips where the highest-priority source has any data, its whole
    timeseries replaces the lower-priority one (NaNs included), and
    provenance follows the data.
    """

    def nones(series):
        # Normalize NaN -> None so list comparison is exact.
        return series.replace({np.nan: None}).tolist()

    series_a = read_csv_and_index_fips_date("fips,date,m1,m2\n"
                                            "97123,2020-04-01,1,\n"
                                            "97123,2020-04-02,,\n"
                                            "97123,2020-04-03,3,3")
    series_b = read_csv_and_index_fips_date("fips,date,m1,m2\n"
                                            "97123,2020-04-01,,\n"
                                            "97123,2020-04-02,2,\n")
    sources = {"source_a": series_a, "source_b": series_b}

    # source_b is highest priority and has m1 data for 97123, so the combined
    # m1 series is source_b's series; source_a's m1 values are not used.
    merged, provenance = _build_data_and_provenance(
        {"m1": ["source_a", "source_b"]}, sources,
    )
    assert nones(merged.loc["97123", "m1"]) == [None, 2, None]
    assert nones(provenance.loc["97123", "m1"]) == [None, "source_b", None]

    # With priority reversed, source_a's whole m1 series is used instead —
    # including its NaN on 2020-04-02, even though source_b has a value there.
    merged, provenance = _build_data_and_provenance(
        {"m1": ["source_b", "source_a"]}, sources)
    assert nones(merged.loc["97123", "m1"]) == [1, None, 3]
    assert nones(provenance.loc["97123", "m1"]) == ["source_a", None, "source_a"]
def test_build_latest():
    """Latest-value merge: the last-listed source wins per fips; a fips present
    only in a lower-priority source keeps that source's value and provenance.
    """
    latest_a = read_csv_and_index_fips(
        "county,state,fips,country,aggregate_level,date,cases\n"
        "Jones County,ZZ,97123,USA,county,2020-04-01,1\n"
        "Three County,XY,97333,USA,county,2020-04-01,3\n")
    latest_b = read_csv_and_index_fips(
        "county,state,fips,country,aggregate_level,date,cases\n"
        "Jones County,ZZ,97123,USA,county,2020-04-01,2\n")
    sources = {"source_a": latest_a, "source_b": latest_b}

    merged, provenance = _build_data_and_provenance(
        {"cases": ["source_a", "source_b"]}, sources)
    assert merged.at["97123", "cases"] == 2
    assert provenance.at["97123", "cases"] == "source_b"
    # 97333 only appears in source_a, so it survives regardless of priority.
    assert merged.at["97333", "cases"] == 3
    assert provenance.at["97333", "cases"] == "source_a"

    merged, provenance = _build_data_and_provenance(
        {"cases": ["source_b", "source_a"]}, sources)
    assert merged.at["97123", "cases"] == 1
    assert provenance.at["97123", "cases"] == "source_a"
    assert merged.at["97333", "cases"] == 3
    assert provenance.at["97333", "cases"] == "source_a"
def test_fill_fields_with_data_source_no_rows_input():
    """An input dataset with a header but zero rows merges cleanly."""
    empty = read_csv_and_index_fips("fips,state,aggregate_level,county,preserved\n")
    overlay = read_csv_and_index_fips(
        "fips,state,aggregate_level,county,current_icu\n"
        "55007,ZZ,county,West County,28\n"
        "55,ZZ,state,Grand State,64\n",
    )
    expected = read_csv_and_index_fips(
        "fips,state,aggregate_level,county,current_icu,preserved\n"
        "55007,ZZ,county,West County,28,\n"
        "55,ZZ,state,Grand State,64,\n"
    )

    merged, _ = _build_data_and_provenance(
        {"current_icu": ["new"], "preserved": ["existing"]},
        {"existing": empty, "new": overlay},
    )

    assert to_dict(["fips"], merged) == to_dict(["fips"], expected)
def test_build_and_and_provenance_missing_fips():
    """A fips present in only one source (97444, only in source_b) still lands
    in the merged output, with provenance only where that source has data.
    """
    frame_a = read_csv_and_index_fips_date("fips,date,m1,m2\n"
                                           "97111,2020-04-01,1,\n"
                                           "97111,2020-04-02,,\n"
                                           "97111,2020-04-03,3,3\n")
    frame_b = read_csv_and_index_fips_date("fips,date,m1,m2\n"
                                           "97111,2020-04-01,,\n"
                                           "97111,2020-04-02,2,\n"
                                           "97444,2020-04-04,4,\n")

    merged, provenance = _build_data_and_provenance(
        {
            "m1": ["source_a", "source_b"],
            "m2": ["source_a", "source_b"],
        },
        {"source_a": frame_a, "source_b": frame_b},
    )

    # 97444's m1 value comes from source_b, its only source.
    assert merged.loc["97444", "m1"].dropna().tolist() == [4]
    assert provenance.loc["97444", "m1"].dropna().tolist() == ["source_b"]
    # Neither source has m2 for 97444, so both data and provenance are empty.
    assert merged.loc["97444", "m2"].dropna().tolist() == []
    assert provenance.loc["97444", "m2"].dropna().tolist() == []