def test_data_storage_in_raw_data_with_data_size_limit(self):
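     # store more frames than max_size allows and check that the frame counter
     # respects the limit and matches the observations actually written to disk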
     config_dict = {
         'output_path': self.output_dir,
         'max_size': 25,
         'separate_raw_data_runs': True
     }
     config = DataSaverConfig().create(config_dict=config_dict)
     self.data_saver = DataSaver(config=config)
     first_info = generate_dummy_dataset(self.data_saver, num_runs=2)
     self.assertEqual(sum(first_info['episode_lengths']),
                      self.data_saver._frame_counter)
     self.data_saver.update_saving_directory()
     second_info = generate_dummy_dataset(self.data_saver, num_runs=2)
     self.assertTrue(
         (sum(first_info['episode_lengths']) +
          sum(second_info['episode_lengths'])) > config_dict['max_size'])
     self.assertTrue(
         self.data_saver._frame_counter <= config_dict['max_size'])
     raw_data_dir = os.path.dirname(self.data_saver.get_saving_directory())
     count_actual_frames = sum([
         len(
             os.listdir(
                 os.path.join(raw_data_dir, episode_dir, 'observation')))
         for episode_dir in os.listdir(raw_data_dir)
     ])
     self.assertEqual(count_actual_frames, self.data_saver._frame_counter)
def generate_random_dataset_in_raw_data(
        output_dir: str,
        num_runs: int = 20,
        input_size: tuple = (100, 100, 3),
        output_size: tuple = (1, ),
        continuous: bool = True,
        fixed_input_value: Union[float, np.ndarray] = None,
        fixed_output_value: Union[float, np.ndarray] = None,
        store_hdf5: bool = False) -> dict:
    """Generate data, stored in raw_data directory of output_dir"""
    data_saver = DataSaver(config=DataSaverConfig().create(
        config_dict={
            'output_path': output_dir,
            'store_hdf5': store_hdf5,
            'separate_raw_data_runs': True
        }))
    info = generate_dummy_dataset(data_saver,
                                  num_runs=num_runs,
                                  input_size=input_size,
                                  output_size=output_size,
                                  continuous=continuous,
                                  fixed_input_value=fixed_input_value,
                                  fixed_output_value=fixed_output_value,
                                  store_hdf5=store_hdf5)
    return info
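
# A minimal usage sketch (hypothetical output directory; assumes the DataSaver,
# DataSaverConfig and generate_dummy_dataset names used above are importable):
#
#     info = generate_random_dataset_in_raw_data(output_dir='/tmp/dummy_dataset',
#                                                num_runs=2,
#                                                store_hdf5=True)
#     print(info['episode_lengths'])
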
    def test_create_hdf5_files_subsampled_in_time(self):
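        # store a dummy dataset with temporal subsampling and check that train.hdf5
        # holds ceil((N - 1) / subsample) + 1 frames for each episode of length N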
        num_runs = 10
        split = 1.0
        subsample = 3
        config_dict = {
            'output_path': self.output_dir,
            'training_validation_split': split,
            'store_hdf5': True,
            'subsample_hdf5': subsample,
            'separate_raw_data_runs': True
        }
        config = DataSaverConfig().create(config_dict=config_dict)
        self.data_saver = DataSaver(config=config)
        info = generate_dummy_dataset(self.data_saver, num_runs=num_runs)
        self.data_saver.create_train_validation_hdf5_files()

        config = DataLoaderConfig().create(
            config_dict={
                'output_path': self.output_dir,
                'hdf5_files': [os.path.join(self.output_dir, 'train.hdf5')]
            })
        training_data_loader = DataLoader(config=config)
        training_data_loader.load_dataset()
        training_data = training_data_loader.get_dataset()

        self.assertEqual(
            len(training_data),
            sum([
                np.ceil((el - 1) / subsample) + 1
                for el in info['episode_lengths']
            ]))
    def test_big_data_hdf5_loop(self):
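        # check that a loader with loop_over_hdf5_files cycles through several hdf5
        # files, identified here by their fixed observation values 0.0, 0.3 and 0.6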
        # create 3 datasets as hdf5 files
        hdf5_files = []
        infos = []
        for index in range(3):
            output_path = os.path.join(self.output_dir, f'ds{index}')
            os.makedirs(output_path, exist_ok=True)
            config_dict = {
                'output_path': output_path,
                'store_hdf5': True,
                'training_validation_split': 1.0
            }
            config = DataSaverConfig().create(config_dict=config_dict)
            self.data_saver = DataSaver(config=config)
            infos.append(
                generate_dummy_dataset(self.data_saver,
                                       num_runs=2,
                                       input_size=(3, 10, 10),
                                       fixed_input_value=(0.3 * index) *
                                       np.ones((3, 10, 10)),
                                       store_hdf5=True))
            self.assertTrue(
                os.path.isfile(os.path.join(output_path, 'train.hdf5')))
            hdf5_files.append(os.path.join(output_path, 'train.hdf5'))
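            # also list a 'wrong.hdf5' path that is never created on disk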
            hdf5_files.append(os.path.join(output_path, 'wrong.hdf5'))

        # create a data loader with loop_over_hdf5_files enabled and the three hdf5 training sets
        conf = {
            'output_path': self.output_dir,
            'hdf5_files': hdf5_files,
            'batch_size': 15,
            'loop_over_hdf5_files': True
        }
        loader = DataLoader(DataLoaderConfig().create(config_dict=conf))

        # sample batches and check that the loader advances to the next hdf5 file
        # after each pass, identified by the fixed observation values 0.0, 0.3 and 0.6
        for batch in loader.get_data_batch():
            self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0)
        for batch in loader.get_data_batch():
            self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0.3,
                                   2)
        for batch in loader.get_data_batch():
            self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0.6,
                                   2)
        for batch in loader.get_data_batch():
            self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0, 2)
        for batch in loader.sample_shuffled_batch():
            self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0.3,
                                   2)
        for batch in loader.sample_shuffled_batch():
            self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0.6,
                                   2)
        for batch in loader.sample_shuffled_batch():
            self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0, 2)
 def setUp(self) -> None:
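     # create a fresh output directory and fill it with a dummy dataset of 20 runs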
     self.output_dir = f'{os.environ["PWD"]}/test_dir/{get_filename_without_extension(__file__)}'
     if not os.path.isdir(self.output_dir):
         os.makedirs(self.output_dir)
     config_dict = {'output_path': self.output_dir, 'store_hdf5': True}
     config = DataSaverConfig().create(config_dict=config_dict)
     self.data_saver = DataSaver(config=config)
     self.info = generate_dummy_dataset(self.data_saver,
                                        num_runs=20,
                                        input_size=(100, 100, 3),
                                        output_size=(3, ),
                                        continuous=False)
    def test_sample_batch(self):
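        # check the size and number of shuffled batches and that the random seed
        # makes the sampled batches reproducible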
        self.info = generate_dummy_dataset(self.data_saver,
                                           num_runs=20,
                                           input_size=(100, 100, 3),
                                           output_size=(3, ),
                                           continuous=False)
        max_num_batches = 2
        config_dict = {
            'data_directories': self.info['episode_directories'],
            'output_path': self.output_dir,
            'random_seed': 1,
            'batch_size': 3
        }
        data_loader = DataLoader(config=DataLoaderConfig().create(
            config_dict=config_dict))
        data_loader.load_dataset()
        first_batch = []
        index = 0
        for index, batch in enumerate(
                data_loader.sample_shuffled_batch(
                    max_number_of_batches=max_num_batches)):
            if index == 0:
                first_batch = deepcopy(batch)
            self.assertEqual(len(batch), config_dict['batch_size'])
        self.assertEqual(index, max_num_batches - 1)

        # test that the sampling seed makes the shuffled batches reproducible
        config_dict['random_seed'] = 2
        data_loader = DataLoader(config=DataLoaderConfig().create(
            config_dict=config_dict))
        data_loader.load_dataset()
        second_batch = []
        for index, batch in enumerate(
                data_loader.sample_shuffled_batch(
                    max_number_of_batches=max_num_batches)):
            second_batch = deepcopy(batch)
            break
        self.assertNotEqual(np.sum(np.asarray(first_batch.observations[0])),
                            np.sum(np.asarray(second_batch.observations[0])))
        config_dict['random_seed'] = 1
        data_loader = DataLoader(config=DataLoaderConfig().create(
            config_dict=config_dict))
        data_loader.load_dataset()
        third_batch = []
        for index, batch in enumerate(
                data_loader.sample_shuffled_batch(
                    max_number_of_batches=max_num_batches)):
            third_batch = deepcopy(batch)
            break
        self.assertEqual(np.sum(np.asarray(first_batch.observations[0])),
                         np.sum(np.asarray(third_batch.observations[0])))
 def test_store_in_ram(self):
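     # with store_on_ram_only the dataset is kept in memory as torch tensors,
     # capped at max_size frames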
     config_dict = {
         'output_path': self.output_dir,
         'store_on_ram_only': True,
         'max_size': 10
     }
     number_of_runs = 10
     config = DataSaverConfig().create(config_dict=config_dict)
     self.data_saver = DataSaver(config=config)
     info = generate_dummy_dataset(self.data_saver, num_runs=number_of_runs)
     data = self.data_saver.get_dataset()
     self.assertEqual(len(data), config_dict['max_size'])
     for lst in [data.observations, data.actions, data.rewards, data.done]:
         self.assertEqual(len(lst), config_dict['max_size'])
         self.assertTrue(isinstance(lst[0], torch.Tensor))
 def test_empty_saving_directory(self):
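     # saving creates one raw_data subdirectory per run; emptying removes them all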
     config_dict = {
         'output_path': self.output_dir,
         'separate_raw_data_runs': True
     }
     number_of_runs = 5
     config = DataSaverConfig().create(config_dict=config_dict)
     self.data_saver = DataSaver(config=config)
     info = generate_dummy_dataset(self.data_saver, num_runs=number_of_runs)
     self.assertEqual(
         len(os.listdir(os.path.join(self.output_dir, 'raw_data'))),
         number_of_runs)
     self.data_saver.empty_raw_data_in_output_directory()
     self.assertEqual(
         len(os.listdir(os.path.join(self.output_dir, 'raw_data'))), 0)
    def test_data_loader_from_raw_path_dirs(self):
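        # create a loader from the raw output path and check that the configured
        # data directories exist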
        self.info = generate_dummy_dataset(self.data_saver,
                                           num_runs=20,
                                           input_size=(100, 100, 3),
                                           output_size=(3, ),
                                           continuous=False)
        config_dict = {
            'data_directories': [self.output_dir],
            'output_path': self.output_dir,
        }
        config = DataLoaderConfig().create(config_dict=config_dict)
        data_loader = DataLoader(config=config)
        data_loader.load_dataset()

        config = DataLoaderConfig().create(config_dict=config_dict)
        for d in config.data_directories:
            self.assertTrue(os.path.isdir(d))
    def test_data_batch(self):
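        # a batch drawn from get_data_batch should have the configured batch size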
        self.info = generate_dummy_dataset(self.data_saver,
                                           num_runs=20,
                                           input_size=(100, 100, 3),
                                           output_size=(3, ),
                                           continuous=False)
        config_dict = {
            'data_directories': self.info['episode_directories'],
            'output_path': self.output_dir,
            'random_seed': 1,
            'batch_size': 3
        }
        data_loader = DataLoader(config=DataLoaderConfig().create(
            config_dict=config_dict))
        data_loader.load_dataset()

        for batch in data_loader.get_data_batch():
            self.assertEqual(len(batch), config_dict['batch_size'])
            break
    def test_data_loading(self):
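        # after loading, observations, actions, rewards and done flags are non-empty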
        self.info = generate_dummy_dataset(self.data_saver,
                                           num_runs=20,
                                           input_size=(100, 100, 3),
                                           output_size=(3, ),
                                           continuous=False)
        config_dict = {
            'data_directories': self.info['episode_directories'],
            'output_path': self.output_dir,
        }
        config = DataLoaderConfig().create(config_dict=config_dict)
        data_loader = DataLoader(config=config)
        data_loader.load_dataset()

        # assert nothing is empty
        for k in ['observations', 'actions', 'rewards', 'done']:
            data = getattr(data_loader.get_dataset(), k)
            self.assertTrue(len(data) > 0)
            self.assertTrue(sum(data[0].shape) > 0)
 def test_data_storage_in_raw_data(self):
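     # each run directory should contain one observation file per frame and an
     # action.data file with a matching number of lines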
     config_dict = {
         'output_path': self.output_dir,
         'separate_raw_data_runs': True
     }
     config = DataSaverConfig().create(config_dict=config_dict)
     self.data_saver = DataSaver(config=config)
     info = generate_dummy_dataset(self.data_saver, num_runs=2)
     for total, episode_dir in zip(info['episode_lengths'],
                                   info['episode_directories']):
         self.assertEqual(
             len(
                 os.listdir(
                     os.path.join(self.output_dir, 'raw_data', episode_dir,
                                  'observation'))), total)
         with open(
                 os.path.join(self.output_dir, 'raw_data', episode_dir,
                              'action.data')) as f:
             expert_controls = f.readlines()
             self.assertEqual(len(expert_controls), total)
 def test_data_subsample(self):
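     # loading with a subsample factor should keep ceil((N - 1) / subsample) + 1
     # frames for each episode of length N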
     self.info = generate_dummy_dataset(self.data_saver,
                                        num_runs=20,
                                        input_size=(100, 100, 3),
                                        output_size=(3, ),
                                        continuous=False)
     subsample = 4
     config_dict = {
         'data_directories': self.info['episode_directories'],
         'output_path': self.output_dir,
         'random_seed': 1,
         'batch_size': 3,
         'subsample': subsample
     }
     data_loader = DataLoader(config=DataLoaderConfig().create(
         config_dict=config_dict))
     data_loader.load_dataset()
     self.assertEqual(
         sum([
             np.ceil((el - 1) / subsample) + 1
             for el in self.info['episode_lengths']
         ]), len(data_loader.get_dataset()))
    def test_create_train_validation_hdf5_files(self):
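        # split the stored runs over train.hdf5 and validation.hdf5 and check that
        # both sets match the configured training_validation_split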
        num_runs = 10
        split = 0.7
        config_dict = {
            'output_path': self.output_dir,
            'training_validation_split': split,
            'store_hdf5': True,
            'separate_raw_data_runs': True
        }
        config = DataSaverConfig().create(config_dict=config_dict)
        self.data_saver = DataSaver(config=config)
        info = generate_dummy_dataset(self.data_saver, num_runs=num_runs)
        self.data_saver.create_train_validation_hdf5_files()

        config = DataLoaderConfig().create(
            config_dict={
                'output_path': self.output_dir,
                'hdf5_files': [os.path.join(self.output_dir, 'train.hdf5')]
            })
        training_data_loader = DataLoader(config=config)
        training_data_loader.load_dataset()
        training_data = training_data_loader.get_dataset()

        config = DataLoaderConfig().create(
            config_dict={
                'output_path': self.output_dir,
                'hdf5_files':
                [os.path.join(self.output_dir, 'validation.hdf5')]
            })
        validation_data_loader = DataLoader(config=config)
        validation_data_loader.load_dataset()
        validation_data = validation_data_loader.get_dataset()

        self.assertEqual(len(training_data),
                         sum(info['episode_lengths'][:int(split * num_runs)]))
        self.assertEqual(len(validation_data),
                         sum(info['episode_lengths'][int(split * num_runs):]))