def test_nested_last_modified_of_source_manually_overridden_to_be_greater_than_pipeline(
            self):
        counter_value = th.COUNTER
        dc_hooks.on_begin_apply_source_transform = th.increase_counter_hook_return_only_second_arg

        dmp = self.create_merge_pipeline()

        dtp = self.create_transformation_pipeline(source=dmp)
        self.create_csv_for_2()
        now = datetime.datetime.now()
        later = now + datetime.timedelta(minutes=5)
        ds = self.create_source(df=None,
                                location=self.csv_path2,
                                pipeline=dtp,
                                last_modified=later)

        time.sleep(0.01)
        self.create_csv()
        # now earlier source has csv more recently modified, but this
        # should be ignored due to manually passing last_modified in the future

        assert ds.last_modified == later
        assert dmp.last_modified is None
        assert dtp.last_modified is None
        assert ds.pipeline_last_modified < ds.last_modified
        # Should not run pipeline as source was manually set to be newer
        df = ds.df

        dc_hooks.reset_hooks()
        assert_frame_equal(df, self.test_df2)
        assert th.COUNTER == counter_value  # transform operation not called
        self.assert_ordered_pipeline_operations(dtp, [dmp, dtp])
    def test_nested_last_modified_of_source_less_than_earlier_source(self):
        """In a three-level nested chain where the middle source is the oldest,
        accessing the final source's .df must rerun both transformation
        pipelines (the transform hook fires twice).
        """
        counter_value = th.COUNTER
        # Hook bumps th.COUNTER each time a source transform is applied
        dc_hooks.on_begin_apply_source_transform = th.increase_counter_hook_return_only_second_arg

        dtp = self.create_transformation_pipeline()
        self.create_csv_for_2()
        cols2 = self.create_columns()
        ds2 = self.create_source(df=None,
                                 location=self.csv_path2,
                                 pipeline=dtp,
                                 columns=cols2)
        dtp2 = self.create_transformation_pipeline(source=ds2)
        # Sleep so file mtimes are strictly ordered between writes
        time.sleep(0.01)
        self.create_csv_for_3()
        ds3 = self.create_source(df=None,
                                 location=self.csv_path3,
                                 pipeline=dtp2)

        time.sleep(0.01)
        self.create_csv()
        ds1 = dtp.data_sources[0]
        # now first source is most recently modified, followed by third source. Middle source is oldest
        assert ds1.last_modified > ds2.last_modified
        assert ds3.last_modified > ds2.last_modified

        # Should run both pipelines as original source is newest
        df = ds3.df

        dc_hooks.reset_hooks()
        assert_frame_equal(df, self.expect_df_double_source_transform)
        assert th.COUNTER == counter_value + 2  # transform operation called twice, once per pipeline
        self.assert_all_pipeline_operations_have_pipeline(dtp)
        self.assert_all_pipeline_operations_have_pipeline(dtp2)
    def test_nested_last_modified_of_pipeline_manually_set_to_be_greater_than_source(
            self):
        """A pipeline whose last_modified is manually set in the future must be
        treated as newer than the source file, so accessing .df reruns the
        transform exactly once.
        """
        counter_value = th.COUNTER
        # Hook bumps th.COUNTER each time a source transform is applied
        dc_hooks.on_begin_apply_source_transform = th.increase_counter_hook_return_only_second_arg

        dmp = self.create_merge_pipeline()

        now = datetime.datetime.now()
        later = now + datetime.timedelta(minutes=5)
        # Manually override the pipeline's last_modified to the future
        dtp = self.create_transformation_pipeline(source=dmp,
                                                  last_modified=later)
        self.create_csv_for_2()
        ds = self.create_source(df=None, location=self.csv_path2, pipeline=dtp)

        assert dtp.last_modified == later
        assert dmp.last_modified is None
        assert dtp.pipeline_last_modified == dmp.pipeline_last_modified
        assert ds.pipeline_last_modified > ds.last_modified
        # Should run pipeline as was manually set last modified in the future
        df = ds.df

        dc_hooks.reset_hooks()
        assert_frame_equal(df, self.expect_merged_1_2_both_transformed)
        assert th.COUNTER == counter_value + 1  # transform operation called once
        self.assert_ordered_pipeline_operations(dtp, [dmp, dtp])
    def test_nested_last_modified_of_source_touched_to_be_greater_than_pipeline(
            self):
        """touch()-ing a source bumps its last_modified to "now", making it
        newer than its pipeline so accessing .df must not rerun the transform.
        """
        counter_value = th.COUNTER
        # Hook bumps th.COUNTER each time a source transform is applied
        dc_hooks.on_begin_apply_source_transform = th.increase_counter_hook_return_only_second_arg

        dmp = self.create_merge_pipeline()

        dtp = self.create_transformation_pipeline(source=dmp)
        self.create_csv_for_2()
        ds = self.create_source(df=None, location=self.csv_path2, pipeline=dtp)

        # Sleep so the re-written csv gets a strictly later mtime
        time.sleep(0.01)
        self.create_csv()  # now earlier source is more recently modified
        assert dmp.data_sources[0].last_modified > ds.last_modified

        # touch() should set last_modified between these two timestamps
        before_touch = datetime.datetime.now()
        ds.touch()
        after_touch = datetime.datetime.now()
        assert ds.last_modified < after_touch
        assert ds.last_modified > before_touch
        assert dmp.last_modified is None
        assert dtp.last_modified is None
        assert ds.pipeline_last_modified < ds.last_modified

        # Should not run pipeline as source was touched to be newer
        df = ds.df

        dc_hooks.reset_hooks()
        assert_frame_equal(df, self.test_df2)
        assert th.COUNTER == counter_value  # transform operation not called
        self.assert_ordered_pipeline_operations(dtp, [dmp, dtp])
    def test_nested_last_modified_of_pipeline_touched_to_be_greater_than_source(
            self):
        """touch()-ing a pipeline bumps its last_modified to "now", making it
        newer than the source file so accessing .df reruns the transform once.
        """
        counter_value = th.COUNTER
        # Hook bumps th.COUNTER each time a source transform is applied
        dc_hooks.on_begin_apply_source_transform = th.increase_counter_hook_return_only_second_arg

        dmp = self.create_merge_pipeline()

        dtp = self.create_transformation_pipeline(source=dmp)
        self.create_csv_for_2()
        ds = self.create_source(df=None, location=self.csv_path2, pipeline=dtp)

        # touch() should set last_modified between these two timestamps
        before_touch = datetime.datetime.now()
        dtp.touch()
        after_touch = datetime.datetime.now()
        assert dtp.last_modified < after_touch
        assert dtp.last_modified > before_touch
        assert dmp.last_modified is None
        assert dtp.pipeline_last_modified == dmp.pipeline_last_modified
        assert ds.pipeline_last_modified > ds.last_modified
        # Should run pipeline as was just touched
        df = ds.df

        dc_hooks.reset_hooks()
        assert_frame_equal(df, self.expect_merged_1_2_both_transformed)
        assert th.COUNTER == counter_value + 1  # transform operation called once
        self.assert_ordered_pipeline_operations(dtp, [dmp, dtp])
    def test_nested_last_modified_of_source_greater_than_pipeline(self):
        """When the source file is newer than every upstream pipeline,
        accessing .df must skip the transform entirely."""
        initial_counter = th.COUNTER
        dc_hooks.on_begin_apply_source_transform = th.increase_counter_hook_return_only_second_arg

        merge_pipeline = self.create_merge_pipeline()
        transform_pipeline = self.create_transformation_pipeline(
            source=merge_pipeline)

        # Give the csv written next a strictly later mtime than the pipeline
        time.sleep(0.01)
        self.create_csv_for_3()
        source = self.create_source(df=None, location=self.csv_path3,
                                    pipeline=transform_pipeline)

        # Source is newest, so the pipeline must not execute
        result = source.df

        dc_hooks.reset_hooks()
        assert_frame_equal(result, self.test_df3)
        assert th.COUNTER == initial_counter  # transform hook never fired
        self.assert_ordered_pipeline_operations(
            transform_pipeline, [merge_pipeline, transform_pipeline])
    def test_nested_last_modified_of_source_less_than_pipeline(self):
        """When an upstream source file was modified more recently than this
        source's own file, accessing .df must rerun the pipeline once."""
        initial_counter = th.COUNTER
        dc_hooks.on_begin_apply_source_transform = th.increase_counter_hook_return_only_second_arg

        merge_pipeline = self.create_merge_pipeline()
        transform_pipeline = self.create_transformation_pipeline(
            source=merge_pipeline)
        self.create_csv_for_2()
        source = self.create_source(df=None, location=self.csv_path2,
                                    pipeline=transform_pipeline)

        # Give the next write a strictly later mtime, then refresh the
        # upstream csv so it becomes the most recently modified file
        time.sleep(0.01)
        self.create_csv()
        assert merge_pipeline.data_sources[0].last_modified > source.last_modified

        # Upstream source is newer, so the pipeline must execute
        result = source.df

        dc_hooks.reset_hooks()
        assert_frame_equal(result, self.expect_merged_1_2_both_transformed)
        assert th.COUNTER == initial_counter + 1  # transform hook fired exactly once
        self.assert_ordered_pipeline_operations(
            transform_pipeline, [merge_pipeline, transform_pipeline])
    def test_transformation_pipeline_cache(self):
        """Pipeline results are cached to file keyed on the pipeline options:
        re-running with equal options (even deep copies) loads from cache and
        skips the transform; differing options force a fresh run.
        """
        counter_value = th.COUNTER
        # Hook bumps th.COUNTER each time a source transform is applied
        dc_hooks.on_begin_apply_source_transform = th.increase_counter_hook_return_only_second_arg

        # Run initial pipeline
        cols = self.create_columns()
        dtp = self.create_transformation_pipeline(result_kwargs=dict(
            columns=cols))
        dtp.execute()

        assert_frame_equal(dtp.df, self.expect_func_df)
        self.assert_all_pipeline_operations_have_pipeline(dtp)
        assert th.COUNTER == counter_value + 1  # transform operation called once

        # Now result should be cached to file. Running a new pipeline with
        # the same options should load from cache
        # Options should be checked for equality, so passing deepcopy it should
        # still cache
        dtp = self.create_transformation_pipeline(
            func=deepcopy(source_transform_func),
            result_kwargs=dict(columns=cols))
        dtp.execute()

        assert_frame_equal(dtp.df, self.expect_func_df)
        self.assert_all_pipeline_operations_have_pipeline(dtp)
        assert th.COUNTER == counter_value + 1  # transform operation not called again

        # Running with different options should run operations again
        dtp = self.create_transformation_pipeline(
            result_kwargs=dict(columns=cols),
            subset=lambda source: source.load_variables)
        dtp.execute()

        assert_frame_equal(dtp.df, self.expect_func_df)
        self.assert_all_pipeline_operations_have_pipeline(dtp)
        assert th.COUNTER == counter_value + 2  # transform operation called again

        dc_hooks.reset_hooks()
    def test_create_and_run_multiple_analysis_pipelines_from_same_transformation_pipeline_with_always_rerun(
            self):
        """Two analysis pipelines built on one always_rerun transformation
        pipeline must share the same operation and data source objects, yet the
        transform executes once per .execute() because always_rerun disables
        result caching.
        """
        dtp = self.create_transformation_pipeline(always_rerun=True)
        dap1 = self.create_analysis_pipeline(source=dtp)

        # Second analysis runs the same transform but offsets the sum by 10
        analysis_from_source_2 = partial(analysis_from_source, sum_offset=10)
        ao2 = AnalysisOptions(analysis_from_source_2)
        dap2 = self.create_analysis_pipeline(source=dtp, options=ao2)

        counter_value = th.COUNTER
        # Hook bumps th.COUNTER each time a source transform is applied
        dc_hooks.on_begin_apply_source_transform = th.increase_counter_hook_return_only_second_arg

        dap1.execute()
        dap2.execute()

        dc_hooks.reset_hooks()

        # Both analysis pipelines reference the identical shared operation
        assert dap1.operations[0] is dap2.operations[0]
        assert dap1.operations[0].data_source is dap2.operations[0].data_source
        assert dap1.result.result == self.ds_one_transformed_analysis_result
        assert dap2.result.result == self.ds_one_transformed_analysis_result_offset_10
        assert th.COUNTER == counter_value + 2  # transform operation called twice as always rerun
        self.assert_ordered_pipeline_operations(dap1, [dtp, dap1])
        self.assert_ordered_pipeline_operations(dap2, [dtp, dap2])
    # NOTE(review): removed non-code scraping artifact ("示例#10" / "0")
    # that was not valid Python and broke the module.
 def teardown_method(self, *args, **kwargs):
     import tests.test_hooks as th
     super().teardown_method(*args, **kwargs)
     dc_hooks.reset_hooks()
     th.COUNTER = 0
     reset_operation_counter()