def test_auto_run_pipeline_by_load_source_with_no_location(self):
    """Accessing ``df`` on a generator-pipeline-backed source yields the generated frame."""
    generator_pipeline = self.create_generator_pipeline()
    source = DataSource(
        pipeline=generator_pipeline,
        location=self.csv_path_output,
    )

    # even with last_modified set, should still load from pipeline
    # (presumably touch() is what sets last_modified -- confirm against DataSource)
    source.touch()

    assert_frame_equal(source.df, EXPECT_GENERATED_DF)
    self.assert_all_pipeline_operations_have_pipeline(generator_pipeline)
def create_source(self, **kwargs) -> DataSource:
    """Build a ``DataSource`` from the fixture defaults, overridden by ``kwargs``.

    The defaults are ``df=self.test_df`` and ``location=self.csv_path``; any
    keyword passed by the caller replaces the default of the same name and
    any additional keyword is forwarded to ``DataSource`` unchanged.

    :param kwargs: keyword arguments forwarded to ``DataSource``
    :return: the newly constructed ``DataSource``
    """
    source_options = {
        'df': self.test_df,
        'location': self.csv_path,
        # Caller-supplied values are unpacked last so they win over the defaults.
        **kwargs,
    }
    return DataSource(**source_options)
def test_auto_run_pipeline_by_load_source_with_no_location(self):
    """Accessing ``df`` on a transformation-pipeline-backed source yields ``expect_func_df``."""
    transformation_pipeline = self.create_transformation_pipeline()
    source = DataSource(
        pipeline=transformation_pipeline,
        location=self.csv_path_output,
    )

    # No explicit pipeline execution happens here; the frame is obtained
    # purely through the ``df`` attribute access.
    assert_frame_equal(source.df, self.expect_func_df)
    self.assert_all_pipeline_operations_have_pipeline(transformation_pipeline)
def test_auto_run_pipeline_by_load_source_with_no_location(self):
    """Accessing ``df`` on a combine-pipeline-backed source yields ``expect_combined_rows_1_2``."""
    combine_pipeline = self.create_combine_pipeline()
    source = DataSource(
        pipeline=combine_pipeline,
        location=self.csv_path_output,
    )

    # No explicit pipeline execution happens here; the frame is obtained
    # purely through the ``df`` attribute access.
    assert_frame_equal(source.df, self.expect_combined_rows_1_2)
    self.assert_all_pipeline_operations_have_pipeline(combine_pipeline)
def test_graph(self):
    """Smoke test: ``graph`` can be accessed on both the source and its merge pipeline."""
    merge_pipeline = self.create_merge_pipeline()
    source = DataSource(pipeline=merge_pipeline, location=self.csv_path_output)

    # Access the frame before touching the graphs; the value itself is not
    # inspected (presumably this is what runs the pipeline -- confirm).
    _ = source.df

    # TODO [#80]: better tests for graph
    #
    # Currently just checking to make sure they can be generated with no errors.
    # Should also check the contents of the graphs. Also see TestCreateSource.test_graph
    source.graph
    merge_pipeline.graph
def portfolio_data_func(col: dc.Column, variable: dc.Variable, source: dc.DataSource, **kwargs) -> dc.DataSource:
    """Replace ``variable``'s column in ``source.df`` with the ``'portfolio'`` column.

    ``portfolio`` is called on ``source.df`` with ``variable.name`` and the
    supplied keyword arguments. The original ``variable.name`` column is then
    dropped and the resulting ``'portfolio'`` column is renamed to take its
    place. Column order and (when the source has index variables) the index
    are restored afterwards.

    :param col: column being transformed; its ``indices`` supply default by-variables
    :param variable: variable whose column is passed to ``portfolio``
    :param source: data source whose ``df`` is modified and reassigned
    :param kwargs: forwarded to ``portfolio``; ``byvars`` is filled in from the
        column's indices when not given and the source has index variables
    :return: the same ``source`` object that was passed in
    :raises ValueError: if ``portvar`` is passed in ``kwargs``
    """
    if 'portvar' in kwargs:
        raise ValueError('cannot pass portvar as variable will be transformed into portvar')

    if 'byvars' not in kwargs and source.index_vars:
        # Default the grouping variables to every variable of the column's
        # (truthy) indices.
        populated_indices = [col_index for col_index in col.indices if col_index]
        index_variables: List[dc.Variable] = []
        for col_index in populated_indices:
            index_variables.extend(col_index.variables)
        group_var_names = [index_var.name for index_var in index_variables]
        if group_var_names:
            kwargs['byvars'] = group_var_names

    # TODO [#1]: remove portfolio column reordering once pd_utils.portfolio retains order
    column_order = list(source.df.columns)

    # TODO [#2]: remore portfolio index handling once pd_utils.portfolio supports using index
    if source.index_vars:
        saved_index_names = source.df.index.names
        source.df.reset_index(inplace=True)
        # The former index levels are now regular columns, so capture the order again.
        column_order = list(source.df.columns)

    source.df = portfolio(source.df, variable.name, **kwargs)

    # Swap the original column for the 'portfolio' column under the same name.
    source.df.drop([variable.name], axis=1, inplace=True)
    source.df.rename(columns={'portfolio': variable.name}, inplace=True)
    source.df = source.df[column_order]

    if source.index_vars:
        source.df.set_index(saved_index_names, inplace=True)

    return source
def test_auto_run_pipeline_by_load_source_with_no_location_and_shared_columns(self):
    """Same column objects are used by the final source and the source built inside the pipeline.

    Accessing ``df`` on the final source is expected to yield
    ``expect_loaded_df_rename_only``.
    """
    self.create_csv()
    shared_columns = self.create_columns()

    def transform_func(source: DataSource) -> DataSource:
        # Rebuild the source around the same frame, attaching the shared columns.
        return DataSource(df=source.df, columns=shared_columns)

    transformation_pipeline = self.create_transformation_pipeline(func=transform_func)
    final_source = DataSource(
        pipeline=transformation_pipeline,
        location=self.csv_path_output,
        columns=shared_columns,
    )

    assert_frame_equal(final_source.df, self.expect_loaded_df_rename_only)
    self.assert_all_pipeline_operations_have_pipeline(transformation_pipeline)
def transform_func(source: DataSource) -> DataSource:
    """Return a new ``DataSource`` over ``source.df`` using the ``all_cols`` columns.

    ``all_cols`` is not a parameter; it is taken from the enclosing scope.

    :param source: data source whose ``df`` is reused
    :return: a freshly constructed ``DataSource``
    """
    return DataSource(df=source.df, columns=all_cols)
def winsorize_data_func(col: dc.Column, variable: dc.Variable, source: dc.DataSource, *args, **kwargs) -> dc.DataSource:
    """Reassign ``source.df`` to the result of ``winsorize`` restricted to ``variable``.

    :param col: column being transformed (not used by this function)
    :param variable: variable whose name is passed to ``winsorize`` as ``subset``
    :param source: data source whose ``df`` is replaced
    :param args: extra positional arguments forwarded to ``winsorize``
    :param kwargs: extra keyword arguments forwarded to ``winsorize``
    :return: the same ``source`` object that was passed in
    """
    winsorized_df = winsorize(
        source.df,
        *args,
        subset=variable.name,
        **kwargs
    )
    source.df = winsorized_df
    return source
def ds_generator_func(columns: Sequence[Column]) -> DataSource:
    """Create a ``DataSource`` over ``EXPECT_GENERATED_DF`` with the given columns.

    :param columns: columns to attach to the generated source
    :return: the newly constructed ``DataSource``
    """
    return DataSource(df=EXPECT_GENERATED_DF, columns=columns)