Exemplo n.º 1
0
    def test_get_info_with_two_transforms(self):
        """Check get_info() after chaining two transforms.

        Two tagged transforms are applied back to back (the second one with
        an extra ``factor`` keyword argument).  get_info() must then yield
        two entries; each entry must expose exactly the expected key set,
        and its ``tag`` and ``kwargs`` values must match what was passed to
        the corresponding transform() call.
        """
        raw_sad = SelfAwareData(self.raw_df)
        step_1_sad = raw_sad.transform(self.transform_func,
                                       tag='step_1',
                                       enforce_clean_git=False)
        step_2_sad = step_1_sad.transform(self.transform_func_with_kwarg,
                                          tag='step_2',
                                          enforce_clean_git=False,
                                          factor=2)
        transform_info = step_2_sad.get_info()

        expected_transform_info_keys = {
            'timestamp', 'git_hash', 'tag', 'kwargs', 'code'
        }
        # Only 'tag' and 'kwargs' are compared by value; the remaining keys
        # are checked for presence only.
        expected_info = [
            {
                'tag': 'step_1',
                'kwargs': {},
            },
            {
                'tag': 'step_2',
                'kwargs': {
                    'factor': 2
                },
            },
        ]
        self.assertEqual(len(transform_info), 2)  # 2 transforms
        for step, expected_step_info in enumerate(expected_info):
            step_info = transform_info[step]
            # assertEqual (rather than assertTrue(a == b)) reports the
            # differing keys when the check fails.
            self.assertEqual(set(step_info.keys()),
                             expected_transform_info_keys)
            for key, expected_value in expected_step_info.items():
                self.assertEqual(step_info[key], expected_value)
Exemplo n.º 2
0
 def test_transform_with_kwarg(self):
     """transform() with an extra keyword argument matches a direct call.

     The frame held by the transformed SelfAwareData must equal the frame
     obtained by calling the transform function by hand with the same
     factor.
     """
     factor = 2
     transformed_sad = SelfAwareData(self.raw_df).transform(
         self.transform_func_with_kwarg,
         enforce_clean_git=False,
         factor=factor)

     # Reference result: the same function applied without SelfAwareData.
     expected_df = self.transform_func_with_kwarg(self.raw_df, factor)

     pd.testing.assert_frame_equal(transformed_sad.data, expected_df)
Exemplo n.º 3
0
    def test_save_fail_leaves_no_dir(self):
        """A save() that raises must not leave a 'sad_dir' entry behind.

        save() is called with a target whose extension is
        ``.bad_file_ext`` and is required to raise.  Afterwards no entry
        directly inside ``self.test_dir`` may have 'sad_dir' in its name.
        """
        raw_sad = SelfAwareData(self.raw_df)
        my_sad = raw_sad.transform(self.transform_func,
                                   enforce_clean_git=False)
        sad_path = Path(self.test_dir, 'sad_fail.bad_file_ext')

        # NOTE(review): presumably the unknown extension is what makes
        # save() fail -- the exact exception type is not visible from here,
        # hence the broad Exception.
        with self.assertRaises(Exception):
            my_sad.save(sad_path, index=False)

        subpaths = glob.glob(str(Path(self.test_dir, '*')))
        # Match on the entry name only: matching on the full path would
        # report a false leftover whenever test_dir's own path happens to
        # contain 'sad_dir'.
        sad_dirs = [
            name for name in map(os.path.basename, subpaths)
            if 'sad_dir' in name
        ]
        # Comparing against [] shows the leftover names on failure.
        self.assertEqual(sad_dirs, [])
Exemplo n.º 4
0
    def test_save_and_load(self):
        """Round-trip a transformed SelfAwareData through save() and load().

        Checks, in order: the on-disk layout produced by save(), that the
        reloaded data equals the saved data, the types of the reloaded
        transform steps, that print_steps() runs, and that rerun() on the
        raw frame reproduces the manually transformed frame.
        """
        raw_sad = SelfAwareData(self.raw_df)
        my_sad = raw_sad.transform(self.transform_func,
                                   enforce_clean_git=False)

        # TEST SAVE
        sad_file_path = my_sad.save(Path(self.test_dir, 'new_sad.csv'),
                                    index=False)

        # assert the path returned by save() exists on disk
        self.assertTrue(Path(sad_file_path).exists())

        # assert the sad dir name starts with "sad_dir__"
        self.assertEqual(sad_file_path.stem[0:9], 'sad_dir__')

        # assert the sad directory contains 3 files
        sad_dir_contents = list(Path(sad_file_path).iterdir())
        self.assertEqual(len(sad_dir_contents), 3)

        # assert the sad dir contains a csv, dill, and yaml file
        contents_extensions = [
            f.suffix.replace('.', '') for f in sad_dir_contents
        ]
        for ext in ['csv', 'dill', 'yaml']:
            # assertIn names the missing extension on failure
            self.assertIn(ext, contents_extensions)

        # TEST LOAD
        reloaded_sad = SelfAwareData.load(sad_file_path)

        # assert the reloaded data is same as what was saved
        pd.testing.assert_frame_equal(reloaded_sad.data, my_sad.data)

        # assert there is 1 transform step and 1 intermediate file step
        transform_sequence = reloaded_sad.transform_sequence.sequence
        self.assertEqual(len(transform_sequence), 2)
        self.assertEqual(type(transform_sequence[0]), LiveTransformStep)
        self.assertEqual(type(transform_sequence[1]),
                         IntermediateFileTransformStep)

        # assert print_steps runs without error
        reloaded_sad.print_steps()

        # assert rerun generates the same data if given the same input
        rerun_df = reloaded_sad.rerun(self.raw_df)
        manually_transformed_df = self.transform_func(self.raw_df)
        pd.testing.assert_frame_equal(rerun_df, manually_transformed_df)
Exemplo n.º 5
0
    def test_load_SAD_from_file_with_kwargs(self):
        """load_from_file() forwards reader kwargs and records one source step.

        The raw frame is written as a '|'-separated CSV and read back with
        ``sep='|'``.  The loaded data must equal the raw frame, and the
        transform sequence must consist of a single
        SourceFileTransformStep.
        """
        raw_data_path = Path(self.test_dir, 'raw_df.csv')
        self.raw_df.to_csv(raw_data_path, index=False, sep='|')
        from_file_sad = SelfAwareData.load_from_file(raw_data_path, sep='|')

        pd.testing.assert_frame_equal(from_file_sad.data, self.raw_df)

        sequence = from_file_sad.transform_sequence.sequence
        self.assertEqual(len(sequence), 1)
        # The previous assertTrue(type(x), Cls) treated Cls as the failure
        # message and always passed; actually check the step's type.
        self.assertIsInstance(sequence[0], SourceFileTransformStep)
Exemplo n.º 6
0
    def test_get_info(self):
        """get_info() after one tagged transform yields a single entry.

        That entry must expose exactly the expected key set, and its
        ``tag`` / ``kwargs`` values must match the transform() call.
        """
        transformed_sad = SelfAwareData(self.raw_df).transform(
            self.transform_func,
            tag='new_sad',
            enforce_clean_git=False)
        all_steps_info = transformed_sad.get_info()

        self.assertEqual(len(all_steps_info), 1)  # 1 transform
        first_step_info = all_steps_info[0]

        # Key set must match exactly.
        required_keys = {'timestamp', 'git_hash', 'tag', 'kwargs', 'code'}
        self.assertTrue(set(first_step_info.keys()) == required_keys)

        # Only these two fields are compared by value.
        known_values = {'tag': 'new_sad', 'kwargs': {}}
        for field, value in known_values.items():
            self.assertEqual(first_step_info[field], value)
Exemplo n.º 7
0
    def test_save_untransformed(self):
        """save() works on a SelfAwareData that has had no transform applied.

        Verifies the on-disk layout only: the returned path exists, its
        stem starts with 'sad_dir__', and the directory holds exactly
        three entries covering the csv, dill and yaml extensions.
        """
        raw_sad = SelfAwareData(self.raw_df)
        sad_file_path = raw_sad.save(Path(self.test_dir, 'raw_sad.csv'),
                                     index=False)

        # assert the path returned by save() exists on disk
        self.assertTrue(Path(sad_file_path).exists())

        # assert the sad dir name starts with "sad_dir__"
        self.assertEqual(sad_file_path.stem[0:9], 'sad_dir__')

        # assert the sad directory contains 3 files
        sad_dir_contents = list(Path(sad_file_path).iterdir())
        self.assertEqual(len(sad_dir_contents), 3)

        # assert the sad dir contains a csv, dill, and yaml file
        contents_extensions = [
            f.suffix.replace('.', '') for f in sad_dir_contents
        ]
        for ext in ['csv', 'dill', 'yaml']:
            # assertIn names the missing extension on failure
            self.assertIn(ext, contents_extensions)
Exemplo n.º 8
0
    def test_SelfAwareDataInterface_get_info(self):
        """SelfAwareDataInterface.get_info() describes a saved SAD directory.

        A tagged, transformed SelfAwareData is saved as csv; the info read
        back from the saved path must be a dict with the expected top-level
        keys and values, and its 'transform_steps' entry must describe the
        single transform that was applied.
        """
        transformed_sad = SelfAwareData(self.raw_df).transform(
            self.transform_func,
            tag='new_sad',
            enforce_clean_git=False)
        saved_path = transformed_sad.save(
            Path(self.test_dir, 'new_sad.csv'), index=False)

        info = SelfAwareDataInterface.get_info(saved_path)

        # --- top level -----------------------------------------------------
        self.assertEqual(type(info), dict)
        top_level_keys = {
            'interface_version', 'timestamp', 'tag', 'data_type',
            'transform_steps'
        }
        self.assertTrue(set(info.keys()) == top_level_keys)

        top_level_values = {
            'interface_version': 1,
            'tag': 'new_sad',
            'data_type': 'csv',
        }
        for field, value in top_level_values.items():
            self.assertEqual(info[field], value)

        # --- per-transform details -----------------------------------------
        steps_info = info['transform_steps']
        self.assertEqual(len(steps_info), 1)  # 1 transform
        first_step_info = steps_info[0]

        step_keys = {'timestamp', 'git_hash', 'tag', 'kwargs', 'code'}
        self.assertTrue(set(first_step_info.keys()) == step_keys)

        step_values = {'tag': 'new_sad', 'kwargs': {}}
        for field, value in step_values.items():
            self.assertEqual(first_step_info[field], value)
Exemplo n.º 9
0
 def test_transform(self):
     """transform() yields the same frame as calling the function directly."""
     transformed_sad = SelfAwareData(self.raw_df).transform(
         self.transform_func, enforce_clean_git=False)

     # Reference result: the same function applied without SelfAwareData.
     expected_df = self.transform_func(self.raw_df)

     pd.testing.assert_frame_equal(transformed_sad.data, expected_df)
Exemplo n.º 10
0
    def test_load_interface_version_0(self):
        """A SAD saved with interface_version=0 can still be loaded and used.

        Saves through SelfAwareDataInterface.save() with
        ``interface_version=0``, checks the on-disk layout (csv, dill and
        txt files), then reloads it and checks the data, the transform step
        types, print_steps(), rerun() and get_info().
        """
        raw_sad = SelfAwareData(self.raw_df)
        my_sad = raw_sad.transform(self.transform_func,
                                   'unrecorded_tag',
                                   enforce_clean_git=False)
        sad_file_path = SelfAwareDataInterface.save(my_sad,
                                                    parent_path=self.test_dir,
                                                    file_name='v0_sad.csv',
                                                    interface_version=0,
                                                    index=False)
        # assert the path returned by save() exists on disk
        self.assertTrue(Path(sad_file_path).exists())

        # assert the sad directory contains 3 files
        sad_dir_contents = list(Path(sad_file_path).iterdir())
        self.assertEqual(len(sad_dir_contents), 3)

        # assert the sad dir contains a csv, dill, and txt file
        contents_extensions = [
            f.suffix.replace('.', '') for f in sad_dir_contents
        ]
        for ext in ['csv', 'dill', 'txt']:
            # assertIn names the missing extension on failure
            self.assertIn(ext, contents_extensions)

        # TEST LOAD
        reloaded_v0_sad = SelfAwareData.load(sad_file_path)

        # assert the reloaded data is same as what was saved
        pd.testing.assert_frame_equal(reloaded_v0_sad.data, my_sad.data)

        # assert there is 1 transform step and 1 intermediate file step
        transform_sequence = reloaded_v0_sad.transform_sequence.sequence
        self.assertEqual(len(transform_sequence), 2)
        self.assertEqual(type(transform_sequence[0]), LiveTransformStep)
        self.assertEqual(type(transform_sequence[1]),
                         IntermediateFileTransformStep)

        # assert print_steps runs without error
        reloaded_v0_sad.print_steps()

        # assert rerun generates the same data if given the same input
        rerun_df = reloaded_v0_sad.rerun(self.raw_df)
        manually_transformed_df = self.transform_func(self.raw_df)
        pd.testing.assert_frame_equal(rerun_df, manually_transformed_df)

        # test get_info
        transform_info = reloaded_v0_sad.get_info()

        expected_transform_info_keys = {
            'timestamp', 'git_hash', 'tag', 'kwargs', 'code'
        }
        # NOTE(review): the expected tag is the saved file's stem
        # ('v0_sad'), not the 'unrecorded_tag' passed to transform() --
        # presumably interface version 0 does not persist the tag; confirm
        # against the interface implementation.
        expected_info = {
            'tag': 'v0_sad',
            'kwargs': {},
        }
        self.assertEqual(len(transform_info),
                         2)  # 1 transform step and 1 intermediate file step
        transform_step_0_info = transform_info[0]
        self.assertEqual(set(transform_step_0_info.keys()),
                         expected_transform_info_keys)
        for key, expected_value in expected_info.items():
            self.assertEqual(transform_step_0_info[key], expected_value)