Example #1

from io import StringIO

import pandas as pd
from structlog import get_logger, testing

# `fill_fields_with_data_source` and `to_dict` are the project helpers under
# test; their exact import paths are omitted here.

def test_fill_fields_with_data_source():
    existing_df = pd.read_csv(
        StringIO(
            "fips,state,aggregate_level,county,current_icu,preserved\n"
            "55005,ZZ,county,North County,43,ab\n"
            "55006,ZZ,county,South County,,cd\n"
            "55,ZZ,state,Grand State,46,ef\n"
        )
    )
    new_df = pd.read_csv(
        StringIO(
            "fips,state,aggregate_level,county,current_icu\n"
            "55006,ZZ,county,South County,27\n"
            "55007,ZZ,county,West County,28\n"
            "55,ZZ,state,Grand State,64\n"
        )
    )

    log = get_logger()
    result = fill_fields_with_data_source(
        log, existing_df, new_df, "fips state aggregate_level county".split(), ["current_icu"],
    )
    expected = pd.read_csv(
        StringIO(
            "fips,state,aggregate_level,county,current_icu,preserved\n"
            "55005,ZZ,county,North County,43,ab\n"
            "55006,ZZ,county,South County,27,cd\n"
            "55007,ZZ,county,West County,28,\n"
            "55,ZZ,state,Grand State,64,ef\n"
        )
    )

    assert to_dict(["fips"], result) == to_dict(["fips"], expected)
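The test pins down the merge semantics: rows are matched on the index fields, values from new_df win for the listed columns, rows unique to either frame are kept, and untouched existing columns such as preserved survive. Below is a minimal sketch of a function with that behavior, built on DataFrame.combine_first; it is an illustration under those assumed semantics, not the project's actual implementation (the log parameter is accepted but unused here).

def fill_fields_with_data_source(log, existing_df, new_df, index_fields, columns_to_fill):
    # Align both frames on the identifying fields so rows can be matched.
    existing = existing_df.set_index(index_fields)
    # Restrict the incoming frame to the columns being filled so unrelated
    # columns in new_df cannot clobber existing ones.
    incoming = new_df.set_index(index_fields)[columns_to_fill]
    # combine_first keeps `incoming` values where they are non-null and falls
    # back to `existing` elsewhere; rows unique to either frame are preserved.
    return incoming.combine_first(existing).reset_index()
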
Example #2
def test_fill_fields_with_data_source_add_column():
    # existing_df does not have a current_icu column. Check that it doesn't cause a crash.
    existing_df = pd.read_csv(
        StringIO(
            "fips,state,aggregate_level,county,preserved\n"
            "55005,ZZ,county,North County,ab\n"
            "55,ZZ,state,Grand State,cd\n"
        )
    )
    new_df = pd.read_csv(
        StringIO(
            "fips,state,aggregate_level,county,current_icu\n"
            "55007,ZZ,county,West County,28\n"
            "55,ZZ,state,Grand State,64\n"
        )
    )

    log = get_logger()
    result = fill_fields_with_data_source(
        log, existing_df, new_df, "fips state aggregate_level county".split(), ["current_icu"],
    )

    expected = pd.read_csv(
        StringIO(
            "fips,state,aggregate_level,county,current_icu,preserved\n"
            "55005,ZZ,county,North County,,ab\n"
            "55007,ZZ,county,West County,28,\n"
            "55,ZZ,state,Grand State,64,cd\n"
        )
    )
    assert to_dict(["fips"], result) == to_dict(["fips"], expected)

def test_fill_fields_with_data_source_no_rows_input():
    existing_df = pd.read_csv(StringIO("fips,state,aggregate_level,county,preserved\n"))
    new_df = pd.read_csv(
        StringIO(
            "fips,state,aggregate_level,county,current_icu\n"
            "55007,ZZ,county,West County,28\n"
            "55,ZZ,state,Grand State,64\n"
        )
    )

    with testing.capture_logs() as logs:
        log = get_logger()
        result = fill_fields_with_data_source(
            log, existing_df, new_df, "fips state aggregate_level county".split(), ["current_icu"],
        )

    expected = pd.read_csv(
        StringIO(
            "fips,state,aggregate_level,county,current_icu,preserved\n"
            "55007,ZZ,county,West County,28,\n"
            "55,ZZ,state,Grand State,64,\n"
        )
    )
    assert to_dict(["fips"], result) == to_dict(["fips"], expected)
    assert logs == []
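structlog.testing.capture_logs records each log call as a plain dict, which is what lets the test assert that the merge logged nothing. A small illustration of the captured shape, per structlog's documented behavior:

with testing.capture_logs() as logs:
    get_logger().warning("unexpected row", fips="55005")

# Each entry carries the bound key/value pairs plus "event" and "log_level".
assert logs == [{"fips": "55005", "event": "unexpected row", "log_level": "warning"}]
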

def build_combined_dataset_from_sources(
    target_dataset_cls: Type[dataset_base.DatasetBase],
    feature_definition_config: FeatureDataSourceMap,
    filters: Optional[List[dataset_filter.DatasetFilter]] = None,
):
    """Builds a combined dataset from a feature definition.

    Args:
        target_dataset_cls: Target dataset class.
        feature_definition_config: Dictionary mapping each output field to the
            data sources from which its values are pulled.
        filters: A list of dataset filters applied to the datasets before
            assembling features.

    Returns:
        A `target_dataset_cls` instance built from the combined data.
    """
    loaded_data_sources = load_data_sources(feature_definition_config)

    # Convert data sources to instances of `target_dataset_cls`.
    intermediate_datasets = {
        data_source_cls: target_dataset_cls.build_from_data_source(source)
        for data_source_cls, source in loaded_data_sources.items()
    }

    # Apply filters to datasets.
    for key in intermediate_datasets:
        dataset = intermediate_datasets[key]
        for data_filter in filters or []:
            dataset = data_filter.apply(dataset)
        intermediate_datasets[key] = dataset

    # Build feature columns from feature_definition_config.
    data = pd.DataFrame({})
    # structlog makes it very easy to bind extra attributes to `log` as it is passed down the stack.
    log = structlog.get_logger()
    for field, data_source_classes in feature_definition_config.items():
        for data_source_cls in data_source_classes:
            dataset = intermediate_datasets[data_source_cls]
            data = dataset_utils.fill_fields_with_data_source(
                log.bind(dataset_name=data_source_cls.SOURCE_NAME,
                         field=field),
                data,
                dataset.data,
                target_dataset_cls.INDEX_FIELDS,
                [field],
            )

    return target_dataset_cls(data)
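For context, a feature_definition_config maps each output field to an ordered list of data source classes; because every fill_fields_with_data_source call lets the newer frame win, sources later in a field's list take precedence. A hypothetical configuration, with invented class names:

# Hypothetical data source and dataset classes, named here only for
# illustration; the real ones live in the project's data source modules.
feature_definition: FeatureDataSourceMap = {
    "current_icu": [IcuSourceA, IcuSourceB],  # IcuSourceB overrides IcuSourceA
    "cases": [CaseCountSource],
}

combined = build_combined_dataset_from_sources(
    TimeseriesDataset,  # assumed target dataset class
    feature_definition,
)
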

def test_fill_fields_with_data_source_timeseries():
    # Timeseries in existing_df and new_df are merged together.
    existing_df = pd.read_csv(
        StringIO(
            "fips,state,aggregate_level,county,cnt,date,foo\n"
            "55005,ZZ,county,North County,1,2020-05-01,ab\n"
            "55005,ZZ,county,North County,2,2020-05-02,cd\n"
            "55005,ZZ,county,North County,,2020-05-03,ef\n"
            "55006,ZZ,county,South County,4,2020-05-04,gh\n"
            "55,ZZ,state,Grand State,41,2020-05-01,ij\n"
            "55,ZZ,state,Grand State,43,2020-05-03,kl\n"
        )
    )
    new_df = pd.read_csv(
        StringIO(
            "fips,state,aggregate_level,county,cnt,date\n"
            "55006,ZZ,county,South County,44,2020-05-04\n"
            "55007,ZZ,county,West County,28,2020-05-03\n"
            "55005,ZZ,county,North County,3,2020-05-03\n"
            "55,ZZ,state,Grand State,42,2020-05-02\n"
        )
    )

    with testing.capture_logs() as logs:
        log = get_logger()
        result = fill_fields_with_data_source(
            log, existing_df, new_df, "fips state aggregate_level county date".split(), ["cnt"],
        )
    expected = pd.read_csv(
        StringIO(
            "fips,state,aggregate_level,county,cnt,date,foo\n"
            "55005,ZZ,county,North County,1,2020-05-01,ab\n"
            "55005,ZZ,county,North County,2,2020-05-02,cd\n"
            "55005,ZZ,county,North County,3,2020-05-03,ef\n"
            "55006,ZZ,county,South County,44,2020-05-04,gh\n"
            "55007,ZZ,county,West County,28,2020-05-03,\n"
            "55,ZZ,state,Grand State,41,2020-05-01,ij\n"
            "55,ZZ,state,Grand State,42,2020-05-02,\n"
            "55,ZZ,state,Grand State,43,2020-05-03,kl\n"
        )
    )

    assert to_dict(["fips", "date"], result) == to_dict(["fips", "date"], expected)
    assert logs == []
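
Throughout these tests, to_dict keys each record by the given columns before comparing, which makes the assertions independent of row order. A plausible sketch of such a helper (assumed, not necessarily the project's exact code):

def to_dict(keys, df):
    # Key each row by `keys`; comparing the resulting nested dicts ignores
    # row order and surfaces per-record differences on failure.
    return df.set_index(keys).to_dict(orient="index")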