def test_fill_fields_with_data_source():
    """Values in existing_df are overwritten by new_df; new rows are appended.

    Checks that a matched row's value is replaced (55006), an unmatched
    existing row is preserved (55005), a brand-new row is added (55007), and
    the non-index `preserved` column survives the merge.
    """
    existing_df = pd.read_csv(
        StringIO(
            "fips,state,aggregate_level,county,current_icu,preserved\n"
            "55005,ZZ,county,North County,43,ab\n"
            "55006,ZZ,county,South County,,cd\n"
            "55,ZZ,state,Grand State,46,ef\n"
        )
    )
    new_df = pd.read_csv(
        StringIO(
            "fips,state,aggregate_level,county,current_icu\n"
            "55006,ZZ,county,South County,27\n"
            "55007,ZZ,county,West County,28\n"
            "55,ZZ,state,Grand State,64\n"
        )
    )
    # Pass a bound log as the first argument, matching the other tests and the
    # real call site in build_combined_dataset_from_sources. Capturing logs
    # also lets us assert the merge produced no warnings.
    with testing.capture_logs() as logs:
        log = get_logger()
        result = fill_fields_with_data_source(
            log,
            existing_df,
            new_df,
            "fips state aggregate_level county".split(),
            ["current_icu"],
        )
    expected = pd.read_csv(
        StringIO(
            "fips,state,aggregate_level,county,current_icu,preserved\n"
            "55005,ZZ,county,North County,43,ab\n"
            "55006,ZZ,county,South County,27,cd\n"
            "55007,ZZ,county,West County,28,\n"
            "55,ZZ,state,Grand State,64,ef\n"
        )
    )
    assert to_dict(["fips"], result) == to_dict(["fips"], expected)
    assert logs == []
def test_fill_fields_with_data_source_add_column():
    """existing_df lacks the current_icu column entirely.

    Verifies the merge creates the missing field column instead of crashing:
    existing rows get NaN for it, rows from new_df carry their values.
    """
    existing_df = pd.read_csv(
        StringIO(
            "fips,state,aggregate_level,county,preserved\n"
            "55005,ZZ,county,North County,ab\n"
            "55,ZZ,state,Grand State,cd\n"
        )
    )
    new_df = pd.read_csv(
        StringIO(
            "fips,state,aggregate_level,county,current_icu\n"
            "55007,ZZ,county,West County,28\n"
            "55,ZZ,state,Grand State,64\n"
        )
    )
    # Pass a bound log as the first argument, matching the other tests and the
    # real call site in build_combined_dataset_from_sources. Capturing logs
    # also lets us assert the merge produced no warnings.
    with testing.capture_logs() as logs:
        log = get_logger()
        result = fill_fields_with_data_source(
            log,
            existing_df,
            new_df,
            "fips state aggregate_level county".split(),
            ["current_icu"],
        )
    expected = pd.read_csv(
        StringIO(
            "fips,state,aggregate_level,county,current_icu,preserved\n"
            "55005,ZZ,county,North County,,ab\n"
            "55007,ZZ,county,West County,28,\n"
            "55,ZZ,state,Grand State,64,cd\n"
        )
    )
    assert to_dict(["fips"], result) == to_dict(["fips"], expected)
    assert logs == []
def test_fill_fields_with_data_source_no_rows_input():
    """An empty existing_df (header only) yields exactly the rows of new_df."""
    # Header-only CSV: same columns as a populated frame, zero rows.
    empty_existing = pd.read_csv(
        StringIO("fips,state,aggregate_level,county,preserved\n")
    )
    source_df = pd.read_csv(
        StringIO(
            "fips,state,aggregate_level,county,current_icu\n"
            "55007,ZZ,county,West County,28\n"
            "55,ZZ,state,Grand State,64\n"
        )
    )
    index_fields = ["fips", "state", "aggregate_level", "county"]
    with testing.capture_logs() as logs:
        result = fill_fields_with_data_source(
            get_logger(),
            empty_existing,
            source_df,
            index_fields,
            ["current_icu"],
        )
    # Every output row comes from new_df; `preserved` exists but is empty.
    expected = pd.read_csv(
        StringIO(
            "fips,state,aggregate_level,county,current_icu,preserved\n"
            "55007,ZZ,county,West County,28,\n"
            "55,ZZ,state,Grand State,64,\n"
        )
    )
    assert to_dict(["fips"], result) == to_dict(["fips"], expected)
    assert logs == []
def build_combined_dataset_from_sources(
    target_dataset_cls: Type[dataset_base.DatasetBase],
    feature_definition_config: FeatureDataSourceMap,
    filters: List[dataset_filter.DatasetFilter] = None,
    # NOTE(review): annotation is strictly Optional[List[...]] since the
    # default is None; left as-is pending a check of the file's typing imports.
):
    """Build a combined dataset from a feature definition.

    Args:
        target_dataset_cls: Target dataset class.
        feature_definition_config: Dictionary mapping an output field to the
            data sources that will be used to pull values from.
        filters: A list of dataset filters applied to the datasets before
            assembling features.
    """
    loaded_data_sources = load_data_sources(feature_definition_config)

    # Normalize once so the per-dataset loop below doesn't re-evaluate it.
    active_filters = filters or []

    # Convert each data source to an instance of `target_dataset_cls`, then
    # run every filter over it before it contributes any feature columns.
    intermediate_datasets = {}
    for data_source_cls, source in loaded_data_sources.items():
        dataset = target_dataset_cls.build_from_data_source(source)
        for data_filter in active_filters:
            dataset = data_filter.apply(dataset)
        intermediate_datasets[data_source_cls] = dataset

    # Assemble feature columns one (field, source) pair at a time, layering
    # later sources on top of earlier ones.
    combined = pd.DataFrame({})
    # structlog makes it very easy to bind extra attributes to `log` as it is
    # passed down the stack.
    log = structlog.get_logger()
    for field, data_source_classes in feature_definition_config.items():
        for data_source_cls in data_source_classes:
            source_dataset = intermediate_datasets[data_source_cls]
            combined = dataset_utils.fill_fields_with_data_source(
                log.bind(dataset_name=data_source_cls.SOURCE_NAME, field=field),
                combined,
                source_dataset.data,
                target_dataset_cls.INDEX_FIELDS,
                [field],
            )

    return target_dataset_cls(combined)
def test_fill_fields_with_data_source_timeseries():
    """Timeseries rows in existing_df and new_df merge on (fips, ..., date).

    New values replace existing ones for matching (region, date) keys, unseen
    dates/regions are appended, and the extra `foo` column is preserved.
    """
    existing_df = pd.read_csv(
        StringIO(
            "fips,state,aggregate_level,county,cnt,date,foo\n"
            "55005,ZZ,county,North County,1,2020-05-01,ab\n"
            "55005,ZZ,county,North County,2,2020-05-02,cd\n"
            "55005,ZZ,county,North County,,2020-05-03,ef\n"
            "55006,ZZ,county,South County,4,2020-05-04,gh\n"
            "55,ZZ,state,Grand State,41,2020-05-01,ij\n"
            "55,ZZ,state,Grand State,43,2020-05-03,kl\n"
        )
    )
    new_df = pd.read_csv(
        StringIO(
            "fips,state,aggregate_level,county,cnt,date\n"
            "55006,ZZ,county,South County,44,2020-05-04\n"
            "55007,ZZ,county,West County,28,2020-05-03\n"
            "55005,ZZ,county,North County,3,2020-05-03\n"
            "55,ZZ,state,Grand State,42,2020-05-02\n"
        )
    )
    # Index fields include `date` so merging happens per (region, date) pair.
    index_fields = ["fips", "state", "aggregate_level", "county", "date"]
    with testing.capture_logs() as logs:
        result = fill_fields_with_data_source(
            get_logger(),
            existing_df,
            new_df,
            index_fields,
            ["cnt"],
        )
    expected = pd.read_csv(
        StringIO(
            "fips,state,aggregate_level,county,cnt,date,foo\n"
            "55005,ZZ,county,North County,1,2020-05-01,ab\n"
            "55005,ZZ,county,North County,2,2020-05-02,cd\n"
            "55005,ZZ,county,North County,3,2020-05-03,ef\n"
            "55006,ZZ,county,South County,44,2020-05-04,gh\n"
            "55007,ZZ,county,West County,28,2020-05-03,\n"
            "55,ZZ,state,Grand State,41,2020-05-01,ij\n"
            "55,ZZ,state,Grand State,42,2020-05-02,\n"
            "55,ZZ,state,Grand State,43,2020-05-03,kl\n"
        )
    )
    key = ["fips", "date"]
    assert to_dict(key, result) == to_dict(key, expected)
    assert logs == []