示例#1
0
    def test_auto_run_pipeline_by_load_source_with_no_location(self):
        """Reading ``df`` from a source backed by a generator pipeline yields the generated frame."""
        pipeline = self.create_generator_pipeline()
        source = DataSource(pipeline=pipeline, location=self.csv_path_output)

        # even with last_modified set, should still load from pipeline
        source.touch()

        assert_frame_equal(source.df, EXPECT_GENERATED_DF)
        self.assert_all_pipeline_operations_have_pipeline(pipeline)
示例#2
0
 def create_source(self, **kwargs) -> DataSource:
     """
     Build a ``DataSource`` from this fixture's defaults.

     ``df`` defaults to ``self.test_df`` and ``location`` to ``self.csv_path``;
     any keyword argument passed in overrides or extends those defaults.
     """
     source_options = {
         'df': self.test_df,
         'location': self.csv_path,
         **kwargs,
     }
     return DataSource(**source_options)
    def test_auto_run_pipeline_by_load_source_with_no_location(self):
        """Reading ``df`` from a source backed by a transformation pipeline yields ``expect_func_df``."""
        pipeline = self.create_transformation_pipeline()
        source = DataSource(pipeline=pipeline, location=self.csv_path_output)

        assert_frame_equal(source.df, self.expect_func_df)
        self.assert_all_pipeline_operations_have_pipeline(pipeline)
    def test_auto_run_pipeline_by_load_source_with_no_location(self):
        """Reading ``df`` from a source backed by a combine pipeline yields ``expect_combined_rows_1_2``."""
        pipeline = self.create_combine_pipeline()
        source = DataSource(pipeline=pipeline, location=self.csv_path_output)

        assert_frame_equal(source.df, self.expect_combined_rows_1_2)
        self.assert_all_pipeline_operations_have_pipeline(pipeline)
示例#5
0
    def test_graph(self):
        """Smoke-test that ``graph`` can be accessed on a merge pipeline and on a source backed by it."""
        pipeline = self.create_merge_pipeline()
        source = DataSource(pipeline=pipeline, location=self.csv_path_output)

        # The frame itself is not inspected; it is only read before the graphs are requested.
        _ = source.df

        # TODO [#80]: better tests for graph
        #
        # Currently just checking to make sure they can be generated with no errors.
        # Should also check the contents of the graphs. Also see TestCreateSource.test_graph
        source.graph
        pipeline.graph
示例#6
0
def portfolio_data_func(col: dc.Column, variable: dc.Variable, source: dc.DataSource, **kwargs) -> dc.DataSource:
    """
    Replace the ``variable`` column of ``source.df`` with the output of ``portfolio``.

    ``portfolio`` is called with ``source.df``, the variable's name and ``kwargs``.
    The original variable column is then dropped and the resulting ``'portfolio'``
    column is renamed to take its place. Column order, and the index when the
    source has index variables, are restored afterwards.

    :param col: column whose truthy ``indices`` supply the default ``byvars``
    :param variable: variable whose name identifies the column to transform
    :param source: data source whose ``df`` is modified; the same object is returned
    :param kwargs: passed through to ``portfolio``. When ``byvars`` is absent and
        the source has index variables, it defaults to the names of the variables
        of the column's indices (if there are any)
    :raises ValueError: if ``portvar`` is passed, since the variable itself is
        used as the portfolio variable
    :return: ``source``, with its ``df`` updated
    """
    if 'portvar' in kwargs:
        raise ValueError('cannot pass portvar as variable will be transformed into portvar')

    if 'byvars' not in kwargs and source.index_vars:
        # Default the grouping to every variable found on the column's truthy indices
        index_var_names = [
            index_var.name
            for column_index in col.indices
            if column_index
            for index_var in column_index.variables
        ]
        if index_var_names:
            kwargs['byvars'] = index_var_names

    # TODO [#1]: remove portfolio column reordering once pd_utils.portfolio retains order
    column_order = list(source.df.columns)

    # TODO [#2]: remove portfolio index handling once pd_utils.portfolio supports using index
    if source.index_vars:
        index_names = source.df.index.names
        source.df.reset_index(inplace=True)
        # Index levels are now regular columns, so capture the order again
        column_order = list(source.df.columns)

    source.df = portfolio(source.df, variable.name, **kwargs)

    # Swap the original values for the portfolio column, under the original name
    source.df.drop(columns=[variable.name], inplace=True)
    source.df.rename(columns={'portfolio': variable.name}, inplace=True)
    source.df = source.df[column_order]

    if source.index_vars:
        source.df.set_index(index_names, inplace=True)
    return source
    def test_auto_run_pipeline_by_load_source_with_no_location_and_shared_columns(
            self):
        """
        Reading ``df`` works when the transform output and the final source share columns.

        The same column objects are given both to the source built inside the
        transform function and to the source that wraps the pipeline.
        """
        self.create_csv()
        shared_columns = self.create_columns()

        def transform_func(source: DataSource) -> DataSource:
            # Wrap the incoming frame in a new source using the shared columns
            return DataSource(df=source.df, columns=shared_columns)

        pipeline = self.create_transformation_pipeline(func=transform_func)
        pipeline_source = DataSource(
            pipeline=pipeline,
            location=self.csv_path_output,
            columns=shared_columns,
        )

        assert_frame_equal(pipeline_source.df, self.expect_loaded_df_rename_only)
        self.assert_all_pipeline_operations_have_pipeline(pipeline)
 def transform_func(source: DataSource) -> DataSource:
     """Return a new ``DataSource`` holding ``source.df`` with ``all_cols`` from the surrounding scope."""
     return DataSource(df=source.df, columns=all_cols)
示例#9
0
def winsorize_data_func(col: dc.Column, variable: dc.Variable,
                        source: dc.DataSource, *args,
                        **kwargs) -> dc.DataSource:
    """
    Replace ``source.df`` with the result of ``winsorize`` restricted to ``variable``.

    :param col: not used by this function
    :param variable: variable whose name is passed to ``winsorize`` as ``subset``
    :param source: data source whose ``df`` is replaced; the same object is returned
    :param args: extra positional arguments forwarded to ``winsorize``
    :param kwargs: extra keyword arguments forwarded to ``winsorize``
    :return: ``source``, with its ``df`` updated
    """
    winsorized_df = winsorize(source.df, *args, subset=variable.name, **kwargs)
    source.df = winsorized_df
    return source
示例#10
0
def ds_generator_func(columns: Sequence[Column]) -> DataSource:
    """Return a ``DataSource`` wrapping ``EXPECT_GENERATED_DF`` with the given ``columns``."""
    return DataSource(df=EXPECT_GENERATED_DF, columns=columns)