Пример #1
0
    def create_variables_for_2(self,
                               transform_data: str = '',
                               apply_transforms: bool = True
                               ) -> Tuple[Variable, Variable, Variable]:
        """Build the (e, f, c) variables for the second data source.

        The two int variables share the transform configuration from
        ``get_transform_dict``; the str variable ``c`` gets none.
        """
        shared_kwargs = self.get_transform_dict(
            transform_data=transform_data, apply_transforms=apply_transforms)
        var_e = Variable('e', 'E', dtype='int', **shared_kwargs)
        var_f = Variable('f', 'F', dtype='int', **shared_kwargs)
        var_c = Variable('c', 'C', dtype='str')
        return var_e, var_f, var_c
Пример #2
0
    def create_variables_for_3(self,
                               transform_data: str = '',
                               apply_transforms: bool = True
                               ) -> Tuple[Variable, Variable, Variable]:
        """Build the (g, h, c) variables for the third data source.

        The two int variables share the transform configuration from
        ``get_transform_dict``; the str variable ``c`` gets none.
        """
        shared_kwargs = self.get_transform_dict(
            transform_data=transform_data, apply_transforms=apply_transforms)
        var_g = Variable('g', 'G', dtype='int', **shared_kwargs)
        var_h = Variable('h', 'H', dtype='int', **shared_kwargs)
        var_c = Variable('c', 'C', dtype='str')
        return var_g, var_h, var_c
Пример #3
0
    def create_variables_for_generated(
            self,
            transform_data: str = '',
            apply_transforms: bool = True
    ) -> Tuple[Variable, Variable, Variable]:
        """Build the (a, b, c) variables for the generated data source.

        Note the int variables use lowercase display names ('a', 'b'),
        unlike the uppercase names used by the plain sources.
        """
        shared_kwargs = self.get_transform_dict(
            transform_data=transform_data, apply_transforms=apply_transforms)
        var_a = Variable('a', 'a', dtype='int', **shared_kwargs)
        var_b = Variable('b', 'b', dtype='int', **shared_kwargs)
        var_c = Variable('c', 'C', dtype='str')
        return var_a, var_b, var_c
Пример #4
0
 def test_hash_dict_source_with_calculated_variable(self):
     """Hash dict is stable for a transformation pipeline over a source
     that loads a calculated variable (d = a + b)."""
     self.create_csv()
     cols = self.create_columns()
     a, b, c = self.create_variables()
     calc = Variable('d', 'D', calculation=a + b)
     source = self.create_source(df=None, columns=cols, load_variables=[a, b, c, calc])
     pipeline = self.create_transformation_pipeline(source=source, func=lambda source: source)
     check_or_store_hash_dict(pipeline, 'transform_source_with_calculated')
     pipeline.execute()
     check_or_store_hash_dict(pipeline, 'transform_source_with_calculated', pre_execute=True)
Пример #5
0
 def test_hash_dict_source_with_calculate_on_transformed_before_and_after_transform(self):
     """Hash dict is checked before and after execution when the calculated
     variable depends on a cell-transformed variable."""
     self.create_csv()
     cols = self.create_columns()
     a, b, c = self.create_variables(transform_data='cell', apply_transforms=False)
     calc = Variable('d', 'D', calculation=a + b.add_one_cell())
     source = self.create_source(
         df=None,
         columns=cols,
         load_variables=[a.add_one_cell(), b.add_one_cell(), c, calc])
     pipeline = self.create_transformation_pipeline(source=source, func=lambda source: source)
     check_or_store_hash_dict(pipeline, 'transform_source_with_calculate_on_transformed_before_after')
     pipeline.execute()
     check_or_store_hash_dict(pipeline, 'transform_source_with_calculate_on_transformed_before_after', pre_execute=True)
Пример #6
0
    def test_hash_dict_source_with_calculated_and_same_calculated_variable_transformed(self):
        """Hash dict is stable when a calculated variable is loaded both
        plain and with a cell transform applied to it."""
        self.create_csv()

        cols = self.create_columns()
        a, b, c = self.create_variables()
        cell_transform = self.get_transform('cell')
        calc = Variable('d', 'D', calculation=a + b, available_transforms=[cell_transform])
        # Load the calculated variable twice: untouched and cell-transformed.
        variables = [a, b, c, calc, calc.add_one_cell()]
        source = self.create_source(df=None, columns=cols, load_variables=variables)
        pipeline = self.create_transformation_pipeline(source=source, func=lambda source: source)
        check_or_store_hash_dict(pipeline, 'transform_source_with_calculated_and_calculated_transformed')
        pipeline.execute()
        check_or_store_hash_dict(pipeline, 'transform_source_with_calculated_and_calculated_transformed', pre_execute=True)
Пример #7
0
 def test_cache_with_variable_transforms(self):
     """Caching round-trips a source whose variables carry cell transforms
     and which includes a calculated column."""
     dc.options.set_default_cache_location(AUTO_CACHE_PATH)
     self.create_csv()
     cols = self.create_columns()
     a, b, c = self.create_variables(transform_data='cell', apply_transforms=False)
     calc = Variable('d', 'D', calculation=a + b.add_one_cell())
     cols.append(dc.Column(calc, 'd', dtype='int'))
     source = self.create_source(
         df=None,
         columns=cols,
         load_variables=[a.add_one_cell(), b.add_one_cell(), c, calc])
     expected = self.expect_loaded_df_with_calculate_on_transformed_before_and_after_transform
     self.assert_cached(
         source=source,
         func=lambda source: source,
         expect_cached_loaded_df=expected,
         result_kwargs=dict(columns=cols))
Пример #8
0
 def test_change_with_by_index_and_time_index_with_gaps(self):
     """Change transform over a by-index plus a time index, where the time
     series has gaps, matches the expected DataFrame."""
     vc, a, b, c = self.create_variable_collection()
     date_var = Variable('d', 'Date', dtype='datetime')
     self.create_csv(df=self.test_df_with_ids_and_dates)
     by_col_index = ColumnIndex(self.by_index, [c])
     time_col_index = ColumnIndex(self.time_index, [date_var])
     indices = [by_col_index, time_col_index]
     columns = [
         Column(a, 'a', indices),
         Column(b, 'b', indices),
         Column(c, 'c'),
         Column(date_var, 'date'),
     ]
     variables = [
         vc.a.change(fill_method=None),
         vc.b.change(fill_method=None),
         c,
         date_var,
     ]
     source = self.create_source(df=None,
                                 columns=columns,
                                 load_variables=variables)
     assert_frame_equal(source.df, self.expect_change_df_with_ids_and_dates)
Пример #9
0
 def create_variables(self) -> Tuple[Variable, Variable, Variable]:
     """Return the three basic variables: two ints (a, b) and one str (c)."""
     int_a = Variable('a', 'A', dtype='int')
     int_b = Variable('b', 'B', dtype='int')
     str_c = Variable('c', 'C', dtype='str')
     return int_a, int_b, str_c
Пример #10
0
class PipelineTest(SourceTest):
    """Base fixture class for pipeline tests.

    Holds expected result DataFrames/scalars for the merge, combine,
    transform, generation and analysis pipelines, plus factory helpers that
    create the extra CSV files, variables, columns and pipeline objects the
    individual tests use.
    """

    # Variable shared across sources; its name is used as the merge key.
    merge_var = Variable('c', 'C', dtype='str')
    source_transform = SourceTransform(
        'st',
        name_func=SourceTest.transform_name_func,
        data_func=source_transform_func)
    # Expected scalar outputs of the analysis pipelines.
    ds_one_analysis_result = 21
    ds_one_and_two_analysis_result = 191
    ds_one_transformed_analysis_result = 27
    ds_one_transformed_analysis_result_offset_10 = ds_one_transformed_analysis_result + 10
    ds_one_generated_analysis_result = 10
    # Paths for the additional generated CSV fixtures and pipeline output.
    csv_path2 = os.path.join(GENERATED_PATH, 'data2.csv')
    csv_path3 = os.path.join(GENERATED_PATH, 'data3.csv')
    csv_path_output = os.path.join(GENERATED_PATH, 'output.csv')

    # Raw input frames for the second and third data sources.
    test_df2 = pd.DataFrame([
        (10, 20, 'd'),
        (50, 60, 'e'),
    ],
                            columns=['e', 'f', 'c'])
    test_df3 = pd.DataFrame([
        (100, 200, 'd'),
        (500, 600, 'e'),
    ],
                            columns=['g', 'h', 'c'])
    # Expected results of merging the sources in various configurations.
    expect_merged_1_2 = pd.DataFrame([
        (1, 2, 'd', 10, 20),
        (3, 4, 'd', 10, 20),
        (5, 6, 'e', 50, 60),
    ],
                                     columns=['A', 'B', 'C', 'E',
                                              'F']).convert_dtypes()
    expect_merged_1_2_c_index = expect_merged_1_2.set_index('C')
    expect_merged_1_transformed_2 = pd.DataFrame(
        [
            (2, 3, 'd', 10, 20),
            (4, 5, 'd', 10, 20),
            (6, 7, 'e', 50, 60),
        ],
        columns=['A', 'B', 'C', 'E', 'F']).convert_dtypes()
    expect_merged_1_2_both_transformed = pd.DataFrame(
        [
            (2, 3, 'd', 11, 21),
            (4, 5, 'd', 11, 21),
            (6, 7, 'e', 51, 61),
        ],
        columns=['A', 'B', 'C', 'E', 'F']).convert_dtypes()
    expect_merged_1_generated_2 = pd.DataFrame(
        [
            (1, 2, 'd', 10, 20),
            (3, 4, 'e', 50, 60),
        ],
        columns=['a', 'b', 'C', 'E', 'F']).convert_dtypes()
    expect_merged_1_2_3 = pd.DataFrame(
        [
            (1, 2, 'd', 10, 20, 100, 200),
            (3, 4, 'd', 10, 20, 100, 200),
            (5, 6, 'e', 50, 60, 500, 600),
        ],
        columns=['A', 'B', 'C', 'E', 'F', 'G', 'H']).convert_dtypes()
    # NOTE(review): the intermediate "df" name below looks accidental — it
    # creates an extra class attribute aliasing expect_func_df; confirm
    # nothing depends on it before cleaning up.
    expect_func_df = df = pd.DataFrame([(2, 3, 'd'), (4, 5, 'd'), (6, 7, 'e')],
                                       columns=['A', 'B',
                                                'C']).convert_dtypes()
    expect_func_df_with_a_and_a_transformed = pd.DataFrame(
        [(2, 3, 3, 'd'), (4, 5, 5, 'd'), (6, 7, 7, 'e')],
        columns=['A', 'A_1', 'B', 'C']).convert_dtypes()
    expect_loaded_df_with_transform = pd.DataFrame(
        [(2, 3, 'd'), (4, 5, 'd'),
         (6, 7, 'e')], columns=['A_1', 'B_1', 'C_1']).convert_dtypes()
    expect_df_double_source_transform = pd.DataFrame(
        [(3, 4, 'd'), (5, 6, 'd'),
         (7, 8, 'e')], columns=['A', 'B', 'C']).convert_dtypes()
    expect_generated_transformed = pd.DataFrame([(2, 3, 'd'), (4, 5, 'e')],
                                                columns=['a', 'b', 'C'
                                                         ]).convert_dtypes()
    # Expected results of row/column combination pipelines.
    expect_combined_rows_1_2 = pd.DataFrame(
        [
            (1, 2, 'd', nan, nan),
            (3, 4, 'd', nan, nan),
            (5, 6, 'e', nan, nan),
            (nan, nan, 'd', 10, 20),
            (nan, nan, 'e', 50, 60),
        ],
        columns=['A', 'B', 'C', 'E', 'F'],
    ).convert_dtypes()
    expect_combined_rows_1_2_c_index = expect_combined_rows_1_2.set_index('C')
    expect_combined_rows_1_2_3 = pd.DataFrame(
        [
            (1, 2, 'd', nan, nan, nan, nan),
            (3, 4, 'd', nan, nan, nan, nan),
            (5, 6, 'e', nan, nan, nan, nan),
            (nan, nan, 'd', 10, 20, nan, nan),
            (nan, nan, 'e', 50, 60, nan, nan),
            (nan, nan, 'd', nan, nan, 100, 200),
            (nan, nan, 'e', nan, nan, 500, 600),
        ],
        columns=['A', 'B', 'C', 'E', 'F', 'G', 'H']).convert_dtypes()
    expect_combined_rows_1_2_entity_drop_c = pd.DataFrame(
        [
            (1, 2, 'd', nan, nan),
            (3, 4, 'd', nan, nan),
            (5, 6, 'e', nan, nan),
        ],
        columns=['A', 'B', 'C', 'E', 'F']).convert_dtypes()
    expect_combined_rows_1_2_row_drop_c = pd.DataFrame(
        [
            (1, 2, 'd', nan, nan),
            (5, 6, 'e', nan, nan),
        ],
        columns=['A', 'B', 'C', 'E', 'F']).convert_dtypes()
    expect_combined_cols_2_3 = pd.DataFrame(
        [
            ('d', 11, 21, 100, 200),
            ('e', 51, 61, 500, 600),
        ],
        columns=['C', 'E', 'F', 'G', 'H']).convert_dtypes().set_index('C')

    def teardown_method(self, *args, **kwargs):
        """Reset hooks, the test-hooks counter and the operation counter
        after each test, in addition to the base-class teardown."""
        import tests.test_hooks as th
        super().teardown_method(*args, **kwargs)
        dc_hooks.reset_hooks()
        th.COUNTER = 0
        reset_operation_counter()

    def create_csv_for_2(self,
                         df: Optional[pd.DataFrame] = None,
                         **to_csv_kwargs):
        """Write ``df`` (defaulting to ``test_df2``) to ``csv_path2``."""
        if df is None:
            df = self.test_df2
        df.to_csv(self.csv_path2, index=False, **to_csv_kwargs)

    def create_csv_for_3(self,
                         df: Optional[pd.DataFrame] = None,
                         **to_csv_kwargs):
        """Write ``df`` (defaulting to ``test_df3``) to ``csv_path3``."""
        if df is None:
            df = self.test_df3
        df.to_csv(self.csv_path3, index=False, **to_csv_kwargs)

    def create_variables_for_2(self,
                               transform_data: str = '',
                               apply_transforms: bool = True
                               ) -> Tuple[Variable, Variable, Variable]:
        """Build the (e, f, c) variables for the second source; the int
        variables share the transform config, ``c`` gets none."""
        transform_dict = self.get_transform_dict(
            transform_data=transform_data, apply_transforms=apply_transforms)

        e = Variable('e', 'E', dtype='int', **transform_dict)
        f = Variable('f', 'F', dtype='int', **transform_dict)
        c = Variable('c', 'C', dtype='str')
        return e, f, c

    def create_variables_for_3(self,
                               transform_data: str = '',
                               apply_transforms: bool = True
                               ) -> Tuple[Variable, Variable, Variable]:
        """Build the (g, h, c) variables for the third source; the int
        variables share the transform config, ``c`` gets none."""
        transform_dict = self.get_transform_dict(
            transform_data=transform_data, apply_transforms=apply_transforms)

        g = Variable('g', 'G', dtype='int', **transform_dict)
        h = Variable('h', 'H', dtype='int', **transform_dict)
        c = Variable('c', 'C', dtype='str')
        return g, h, c

    def create_variables_for_generated(
            self,
            transform_data: str = '',
            apply_transforms: bool = True
    ) -> Tuple[Variable, Variable, Variable]:
        """Build the (a, b, c) variables for the generated source; note the
        lowercase display names on the int variables."""
        transform_dict = self.get_transform_dict(
            transform_data=transform_data, apply_transforms=apply_transforms)

        a = Variable('a', 'a', dtype='int', **transform_dict)
        b = Variable('b', 'b', dtype='int', **transform_dict)
        c = Variable('c', 'C', dtype='str')
        return a, b, c

    def create_columns_for_2(self,
                             transform_data: str = '',
                             apply_transforms: bool = True):
        """Wrap the second source's variables into unindexed Columns."""
        e, f, c = self.create_variables_for_2(
            transform_data=transform_data, apply_transforms=apply_transforms)
        ec = Column(e, 'e')
        fc = Column(f, 'f')
        cc = Column(c, 'c')
        return [ec, fc, cc]

    def create_columns_for_3(self,
                             transform_data: str = '',
                             apply_transforms: bool = True):
        """Wrap the third source's variables into unindexed Columns."""
        g, h, c = self.create_variables_for_3(
            transform_data=transform_data, apply_transforms=apply_transforms)
        gc = Column(g, 'g')
        hc = Column(h, 'h')
        cc = Column(c, 'c')
        return [gc, hc, cc]

    def create_columns_for_generated(self,
                                     transform_data: str = '',
                                     apply_transforms: bool = True):
        """Wrap the generated source's variables into unindexed Columns.

        NOTE(review): 'c' maps to CSV key 'C' here, unlike the lowercase
        'c' used by the other column factories — presumably because the
        generated data uses a different header; confirm.
        """
        a, b, c = self.create_variables_for_generated(
            transform_data=transform_data, apply_transforms=apply_transforms)
        ac = Column(a, 'a')
        bc = Column(b, 'b')
        cc = Column(c, 'C')
        return [ac, bc, cc]

    def create_variables_and_c_colindex_for_2(
            self,
            transform_data: str = '',
            apply_transforms: bool = True
    ) -> Tuple[List[Variable], ColumnIndex]:
        """Return the second source's variables plus a ColumnIndex on c."""
        e, f, c = self.create_variables_for_2(
            transform_data=transform_data, apply_transforms=apply_transforms)
        c_index = self.create_c_index()

        c_col_index = ColumnIndex(c_index, [c])

        return [e, f, c], c_col_index

    def create_indexed_columns_for_2(
            self,
            transform_data: str = '',
            apply_transforms: bool = True) -> List[Column]:
        """Columns for the second source where e and f are indexed by c."""
        (e, f, c), c_col_index = self.create_variables_and_c_colindex_for_2(
            transform_data=transform_data, apply_transforms=apply_transforms)
        ec = Column(e, 'e', indices=[c_col_index])
        fc = Column(f, 'f', indices=[c_col_index])
        cc = Column(c, 'c')
        return [ec, fc, cc]

    def create_variables_and_c_colindex_for_3(
            self,
            transform_data: str = '',
            apply_transforms: bool = True
    ) -> Tuple[List[Variable], ColumnIndex]:
        """Return the third source's variables plus a ColumnIndex on c."""
        g, h, c = self.create_variables_for_3(
            transform_data=transform_data, apply_transforms=apply_transforms)
        c_index = self.create_c_index()

        c_col_index = ColumnIndex(c_index, [c])

        return [g, h, c], c_col_index

    def create_indexed_columns_for_3(
            self,
            transform_data: str = '',
            apply_transforms: bool = True) -> List[Column]:
        """Columns for the third source where g and h are indexed by c."""
        (g, h, c), c_col_index = self.create_variables_and_c_colindex_for_3(
            transform_data=transform_data, apply_transforms=apply_transforms)
        gc = Column(g, 'g', indices=[c_col_index])
        hc = Column(h, 'h', indices=[c_col_index])
        cc = Column(c, 'c')
        return [gc, hc, cc]

    def create_merge_pipeline(
        self,
        include_indices: Sequence[int] = (0, 1),
        data_sources: Optional[Sequence[DataSource]] = None,
        merge_options_list: Optional[Sequence[MergeOptions]] = None,
        indexed: bool = False,
        all_option_config: Optional[Dict[str, Any]] = None,
        last_option_config: Optional[Dict[str, Any]] = None,
        pipeline_kwargs: Optional[Dict[str, Any]] = None,
        create_csv: bool = True,
    ) -> DataMergePipeline:
        """Build a DataMergePipeline over the selected default sources.

        When ``data_sources`` is None the three default CSV-backed sources
        are created and filtered by ``include_indices``. ``all_option_config``
        defaults to passing the union of the selected sources' columns;
        ``last_option_config`` (default: just ``out_path``) is set as
        attributes on the final options object.
        """
        if indexed:
            col_func_1 = self.create_indexed_columns
            col_func_2 = self.create_indexed_columns_for_2
            col_func_3 = self.create_indexed_columns_for_3
        else:
            col_func_1 = self.create_columns
            col_func_2 = self.create_columns_for_2
            col_func_3 = self.create_columns_for_3

        if data_sources is None:
            if create_csv:
                self.create_csv()
                self.create_csv_for_2()
                self.create_csv_for_3()
            ds1_cols = col_func_1()
            ds1 = self.create_source(df=None, columns=ds1_cols, name='one')
            ds2_cols = col_func_2()
            ds2 = self.create_source(df=None,
                                     location=self.csv_path2,
                                     columns=ds2_cols,
                                     name='two')
            ds3_cols = col_func_3()
            ds3 = self.create_source(df=None,
                                     location=self.csv_path3,
                                     columns=ds3_cols,
                                     name='three')
            data_sources = [ds1, ds2, ds3]
            selected_data_sources = []
            for i, ds in enumerate(data_sources):
                if i in include_indices:
                    selected_data_sources.append(ds)
        else:
            selected_data_sources = data_sources

        if pipeline_kwargs is None:
            pipeline_kwargs = {}
        if all_option_config is None:
            # Union of all selected sources' columns, preserving order.
            all_cols = []
            for ds in selected_data_sources:
                if isinstance(ds, DataSource) and ds.columns is not None:
                    for col in ds.columns:
                        if col not in all_cols:
                            all_cols.append(col)
            all_option_config = dict(result_kwargs=dict(columns=all_cols))

        if last_option_config is None:
            last_option_config = dict(out_path=self.csv_path_output, )

        if merge_options_list is None:
            # NOTE(review): the list repeats the SAME MergeOptions object, so
            # the setattr loop below mutates every entry, not just the last
            # one — confirm this sharing is intentional.
            mo = MergeOptions([self.merge_var.name], **all_option_config)
            merge_options_list = [
                mo for _ in range(len(selected_data_sources) - 1)
            ]

        for key, value in last_option_config.items():
            setattr(merge_options_list[-1], key, value)

        dp = DataMergePipeline(selected_data_sources, merge_options_list,
                               **pipeline_kwargs)
        return dp

    def create_analysis_pipeline(self,
                                 source: Optional[DataSourceOrPipeline] = None,
                                 options: Optional[AnalysisOptions] = None):
        """Build a DataAnalysisPipeline, defaulting to the first CSV source
        and ``analysis_from_source`` options."""
        if source is None:
            self.create_csv()
            ds1_cols = self.create_columns()
            source = self.create_source(df=None, columns=ds1_cols, name='one')

        if options is None:
            options = AnalysisOptions(analysis_from_source)

        dap = DataAnalysisPipeline(source, options)
        return dap

    def create_generator_pipeline(self,
                                  pipeline_kwargs: Optional[Dict[str,
                                                                 Any]] = None,
                                  create_csv: bool = True,
                                  **kwargs) -> DataGeneratorPipeline:
        """Build a DataGeneratorPipeline using ``ds_generator_func``.

        Extra ``kwargs`` override the default GenerationOptions config.
        NOTE(review): ``create_csv`` is accepted but never used here —
        possibly kept for signature parity with the other factories.
        """
        if pipeline_kwargs is None:
            pipeline_kwargs = {}
        gen_cols = self.create_columns_for_generated()
        config_dict = dict(out_path=self.csv_path_output,
                           columns=gen_cols,
                           result_kwargs=dict(columns=gen_cols, ))
        config_dict.update(**kwargs)
        go = GenerationOptions(ds_generator_func, **config_dict)
        dgp = DataGeneratorPipeline(go, **pipeline_kwargs)
        return dgp

    def create_transformation_pipeline(
            self,
            source: Optional[DataSourceOrPipeline] = None,
            pipeline_kwargs: Optional[Dict[str, Any]] = None,
            create_csv: bool = True,
            **options) -> DataTransformationPipeline:
        """Build a DataTransformationPipeline over ``source``.

        Defaults to ``source_transform_func`` and, when ``source`` is None,
        to the first CSV source (``create_csv`` is only honored in that
        branch). Extra ``options`` override the TransformOptions config.
        """
        config_dict = dict(func=source_transform_func,
                           out_path=self.csv_path_output)
        config_dict.update(options)
        if 'result_kwargs' not in config_dict:
            config_dict['result_kwargs'] = {}
        if pipeline_kwargs is None:
            pipeline_kwargs = {}
        if source is None:
            if create_csv:
                self.create_csv()
            all_cols = self.create_columns()
            source = self.create_source(df=None, columns=all_cols)
            config_dict['result_kwargs'].update(columns=all_cols)

        to = TransformOptions(**config_dict)

        dtp = DataTransformationPipeline(source, to, **pipeline_kwargs)
        return dtp

    def create_combine_pipeline(
            self,
            include_indices: Sequence[int] = (0, 1),
            data_sources: Optional[Sequence[DataSource]] = None,
            combine_options_list: Optional[Sequence[CombineOptions]] = None,
            indexed: bool = False,
            all_option_config: Optional[Dict[str, Any]] = None,
            last_option_config: Optional[Dict[str, Any]] = None,
            pipeline_kwargs: Optional[Dict[str, Any]] = None,
            create_csv: bool = True):
        """Build a DataCombinationPipeline over the selected default sources.

        Mirrors ``create_merge_pipeline`` but with CombineOptions and no
        merge key.
        """
        if indexed:
            col_func_1 = self.create_indexed_columns
            col_func_2 = self.create_indexed_columns_for_2
            col_func_3 = self.create_indexed_columns_for_3
        else:
            col_func_1 = self.create_columns
            col_func_2 = self.create_columns_for_2
            col_func_3 = self.create_columns_for_3

        if data_sources is None:
            if create_csv:
                self.create_csv()
                self.create_csv_for_2()
                self.create_csv_for_3()
            ds1_cols = col_func_1()
            ds1 = self.create_source(df=None, columns=ds1_cols, name='one')
            ds2_cols = col_func_2()
            ds2 = self.create_source(df=None,
                                     location=self.csv_path2,
                                     columns=ds2_cols,
                                     name='two')
            ds3_cols = col_func_3()
            ds3 = self.create_source(df=None,
                                     location=self.csv_path3,
                                     columns=ds3_cols,
                                     name='three')
            data_sources = [ds1, ds2, ds3]
            selected_data_sources = []
            for i, ds in enumerate(data_sources):
                if i in include_indices:
                    selected_data_sources.append(ds)
        else:
            selected_data_sources = data_sources

        if all_option_config is None:
            # Union of all selected sources' columns, preserving order.
            all_cols = []
            for ds in selected_data_sources:
                if isinstance(ds, DataSource) and ds.columns is not None:
                    for col in ds.columns:
                        if col not in all_cols:
                            all_cols.append(col)
            all_option_config = dict(result_kwargs=dict(columns=all_cols))
        if last_option_config is None:
            last_option_config = dict(out_path=self.csv_path_output)
        if pipeline_kwargs is None:
            pipeline_kwargs = {}

        if combine_options_list is None:
            # NOTE(review): the list repeats the SAME CombineOptions object,
            # so the setattr loop below mutates every entry — confirm this
            # sharing is intentional (same pattern as create_merge_pipeline).
            mo = CombineOptions(**all_option_config)
            combine_options_list = [
                mo for _ in range(len(selected_data_sources) - 1)
            ]

        for key, value in last_option_config.items():
            setattr(combine_options_list[-1], key, value)

        dp = DataCombinationPipeline(selected_data_sources,
                                     combine_options_list, **pipeline_kwargs)
        return dp

    def assert_all_pipeline_operations_have_pipeline(self,
                                                     pipeline: DataPipeline):
        """Assert every operation's back-reference points at ``pipeline``."""
        for operation in pipeline.operations:
            assert operation.pipeline is pipeline

    def assert_ordered_pipeline_operations(
            self, pipeline: DataPipeline,
            matched_pipelines: List[DataPipeline]):
        """Assert operations map one-to-one, in order, onto the expected
        pipelines; raises ValueError on a length mismatch."""
        if len(pipeline.operations) != len(matched_pipelines):
            raise ValueError(
                f'different number of operations compared to matched '
                f'pipelines. Got {len(pipeline.operations)} operations '
                f'and {len(matched_pipelines)} matched pipelines')
        for operation, matched_pipeline in zip(pipeline.operations,
                                               matched_pipelines):
            assert operation.pipeline is matched_pipeline