Example #1
 def set_dataset(self, ds: Dataset = None) -> None:
     if ds is not None:
         self._dataset = ds
     else:
         self._dataset = Dataset()
         self.update_data_directories_with_raw_data()
         self.load_dataset()
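A minimal usage sketch for set_dataset; loader and my_dataset are placeholder names and the import path is an assumption:

# assumed import path for the project's Dataset class
from src.core.data_types import Dataset

loader.set_dataset(my_dataset)   # swap in an already populated Dataset directly
loader.set_dataset()             # or rebuild: scan the raw_data directories and call load_dataset()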
Example #2
 def __init__(self, config: DataLoaderConfig):
     self._config = config
     self._logger = get_logger(
         name=get_filename_without_extension(__file__),
         output_path=config.output_path,
         quiet=False)
     cprint(f'Started.', self._logger)
     self._dataset = Dataset()
     self._num_runs = 0
     self._probabilities: List = []
     self.seed()
     self._hdf5_file_index = -1
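A sketch of constructing the loader with the DataLoaderConfig().create(config_dict=...) pattern used elsewhere in these examples; the import path and directory layout are assumptions:

# assumed import path
from src.data.data_loader import DataLoader, DataLoaderConfig

config = DataLoaderConfig().create(config_dict={
    'output_path': '/tmp/experiment',
    'data_directories': ['/tmp/experiment/raw_data/run_0'],
})
loader = DataLoader(config=config)
loader.load_dataset()
print(len(loader.get_dataset()))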
Example #3
 def setUp(self) -> None:
     self.output_dir = f'{os.environ["PWD"]}/test_dir/{get_filename_without_extension(__file__)}'
     os.makedirs(self.output_dir, exist_ok=True)
     self.batch = Dataset()
     self.durations = [10, 1, 5]
     self.step_reward = torch.as_tensor(1)
     self.end_reward = torch.as_tensor(10)
     for episode in range(3):
         for experience in range(self.durations[episode] - 1):
             self.batch.append(
                 Experience(observation=torch.as_tensor(5),
                            action=torch.as_tensor(5),
                            reward=self.step_reward,
                            done=torch.as_tensor(0)))
         self.batch.append(
             Experience(observation=torch.as_tensor(5),
                        action=torch.as_tensor(5),
                        reward=self.end_reward,
                        done=torch.as_tensor(2)))
Example #4
def generate_dataset_by_length(length: int,
                               input_size: tuple = (3, 100, 100),
                               output_size: tuple = (1, ),
                               continuous: bool = True,
                               fixed_input_value: float = None,
                               fixed_output_value: float = None) -> Dataset:
    dataset = Dataset()
    while len(dataset) < length:
        for count, experience in enumerate(
                experience_generator(input_size=input_size,
                                     output_size=output_size,
                                     continuous=continuous,
                                     fixed_input_value=fixed_input_value,
                                     fixed_output_value=fixed_output_value)):
            if experience.done != TerminationType.Unknown:
                dataset.append(experience)
            if len(dataset) >= length:
                break
    return dataset
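A usage sketch for generate_dataset_by_length, e.g. to build a small synthetic dataset for a unit test; it assumes the function above and its experience_generator dependency are importable, and the sizes and values are arbitrary:

dataset = generate_dataset_by_length(length=64,
                                     input_size=(3, 64, 64),
                                     output_size=(6,),
                                     continuous=True,
                                     fixed_output_value=0.0)
assert len(dataset) == 64   # the loop stops as soon as `length` experiences are collected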
Example #5
    def load_dataset(self):
        if len(self._config.hdf5_files) != 0:
            if self._config.loop_over_hdf5_files:
                del self._dataset
                self._dataset = Dataset()
                self._hdf5_file_index += 1
                self._hdf5_file_index %= len(self._config.hdf5_files)
                while len(self._dataset) == 0:
                    try:
                        self._dataset.extend(
                            load_dataset_from_hdf5(
                                self._config.hdf5_files[self._hdf5_file_index],
                                input_size=self._config.input_size))
                    except OSError:
                        cprint(
                            f'Failed to load {self._config.hdf5_files[self._hdf5_file_index]}',
                            self._logger,
                            msg_type=MessageType.warning)
                        del self._config.hdf5_files[self._hdf5_file_index]
                        self._hdf5_file_index %= len(self._config.hdf5_files)
                cprint(
                    f'Loaded {len(self._dataset)} datapoints from {self._config.hdf5_files[self._hdf5_file_index]}',
                    self._logger,
                    msg_type=MessageType.warning if len(
                        self._dataset.observations) == 0 else MessageType.info)
            else:
                for hdf5_file in self._config.hdf5_files:
                    self._dataset.extend(
                        load_dataset_from_hdf5(
                            hdf5_file, input_size=self._config.input_size))
                cprint(
                    f'Loaded {len(self._dataset)} datapoints from {self._config.hdf5_files}',
                    self._logger,
                    msg_type=MessageType.warning if len(
                        self._dataset.observations) == 0 else MessageType.info)
        else:
            self.load_dataset_from_directories(self._config.data_directories)

        if self._config.subsample != 1:
            self._dataset.subsample(self._config.subsample)

        if self._config.balance_over_actions:
            self._probabilities = balance_weights_over_actions(self._dataset)
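A sketch of how load_dataset behaves when the config points at HDF5 files; loader is assumed to be a DataLoader whose config sets hdf5_files and loop_over_hdf5_files:

# with loop_over_hdf5_files=True every call advances to the next file
loader.load_dataset()   # loads the first hdf5 file
loader.load_dataset()   # discards it and loads the second, wrapping around at the end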
Example #6
def set_binary_maps_as_target(dataset: Dataset, invert: bool = False, binary_images: List[np.ndarray] = None,
                              smoothen_labels: bool = False) -> Dataset:
    if binary_images is None:
        binary_images = parse_binary_maps(copy.deepcopy(dataset.observations), invert=invert)
    binary_images = torch.stack([torch.as_tensor(b).unsqueeze(0) for b in binary_images], dim=0)
    # smoothen binary maps to punish line predictions close to line less severely
    if smoothen_labels:
        binary_images = gaussian_blur2d(binary_images, kernel_size=(11, 11), sigma=(4, 4))
    dataset.actions = [b.squeeze() for b in binary_images]
    return dataset
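A short sketch of turning observations into (optionally blurred) binary-map targets with the helper above; dataset is assumed to hold image observations that parse_binary_maps can handle:

dataset = set_binary_maps_as_target(dataset, invert=True, smoothen_labels=True)
assert len(dataset.actions) == len(dataset.observations)   # one (blurred) map per observation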
Example #7
def select(data: Union[list, torch.Tensor, np.ndarray, Dataset], indices: List[int]) -> Union[list, torch.Tensor,
                                                                                              np.ndarray, Dataset]:
    if isinstance(data, list):
        return [data[i] for i in indices]
    elif isinstance(data, Dataset):
        return Dataset(
            observations=[data.observations[i] for i in indices],
            actions=[data.actions[i] for i in indices],
            rewards=[data.rewards[i] for i in indices],
            done=[data.done[i] for i in indices],
        )
    else:  # assuming Tensor or numpy array
        return data.squeeze()[indices]
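A small sketch of select on the supported input types, using the select defined above; dataset is assumed to be a populated Dataset instance:

import torch

select([10, 20, 30, 40], [0, 2])              # list   -> [10, 30]
select(torch.arange(4).unsqueeze(0), [1, 3])  # tensor -> squeezed, then indexed: tensor([1, 3])
subset = select(dataset, [0, 1, 2])           # Dataset -> new Dataset with the first three experiences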
Example #8
    def __init__(self, config: DataSaverConfig):
        self._config = config
        self._logger = get_logger(
            name=get_filename_without_extension(__file__),
            output_path=self._config.output_path,
            quiet=False)
        cprint(f'initiate', self._logger)

        if not self._config.saving_directory.startswith('/'):
            self._config.saving_directory = os.path.join(
                os.environ['HOME'], self._config.saving_directory)

        if self._config.store_on_ram_only:
            self._dataset = Dataset(max_size=self._config.max_size)

        # used to keep track of replay buffer size on file system
        if not self._config.store_on_ram_only \
                and os.path.isdir(os.path.dirname(self._config.saving_directory)) \
                and self._config.max_size != -1:
            data_loader = DataLoader(config=DataLoaderConfig().create(
                config_dict={
                    'data_directories': [
                        os.path.join(
                            os.path.dirname(self._config.saving_directory),
                            run) for run in sorted(
                                os.listdir(
                                    os.path.dirname(
                                        self._config.saving_directory)))
                    ],
                    'output_path':
                    self._config.output_path,
                    'store':
                    False  # don't store config
                }))
            data_loader.load_dataset()
            self._frame_counter = len(data_loader.get_dataset())
        else:
            self._frame_counter = 0
Example #9
 def get_data_batch(self) -> Generator[Dataset, None, None]:
     if len(self._dataset) == 0 or self._config.loop_over_hdf5_files:
         self.load_dataset()
     index = 0
     while index < len(self._dataset):
         batch = Dataset()
         end_index = min(index + self._config.batch_size, len(self._dataset)) \
             if self._config.batch_size != -1 else len(self._dataset)
         batch.observations = self._dataset.observations[index:end_index]
         batch.actions = self._dataset.actions[index:end_index]
         batch.done = self._dataset.done[index:end_index]
         batch.rewards = self._dataset.rewards[index:end_index]
         index = index + self._config.batch_size if self._config.batch_size != -1 else len(
             self._dataset)
         yield batch
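A sketch of consuming the batch generator during training; loader and train_step are placeholder names, and batch_size comes from the loader's config (-1 yields the whole dataset as a single batch):

for batch in loader.get_data_batch():
    # each batch is a Dataset slice of at most config.batch_size experiences
    loss = train_step(batch.observations, batch.actions)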
Example #10
 def test_dataset_shuffle(self):
     run_length = 10
     dataset = Dataset()
     for run_index in range(3):
         for step_index in range(run_length + run_index):
             dataset.append(
                 Experience(
                     observation=torch.as_tensor((len(dataset), )),
                     action=torch.as_tensor((len(dataset), )),
                     reward=torch.as_tensor((0, )),
                     done=torch.as_tensor(
                         (0, )) if step_index != run_length + run_index - 1
                     else torch.as_tensor((1, ))))
     self.assertEqual(dataset.observations[0].item(), 0)
     dataset.shuffle()
     self.assertEqual(dataset.observations[0], dataset.actions[0])
     self.assertNotEqual(dataset.observations[0].item(), 0)
Example #11
 def test_dataset_subsample(self):
     run_length = 10
     subsample = 3
     dataset = Dataset()
     for run_index in range(3):
         for step_index in range(run_length + run_index):
             dataset.append(
                 Experience(
                     observation=torch.as_tensor((step_index, )),
                     action=torch.as_tensor((0, )),
                     reward=torch.as_tensor((0, )),
                     done=torch.as_tensor(
                         (0, )) if step_index != run_length + run_index - 1
                     else torch.as_tensor((1, ))))
     dataset.subsample(subsample)
     for exp_index in range(len(dataset)):
         self.assertTrue(
             dataset.observations[exp_index].item() % subsample == 0
             or dataset.done[exp_index].item() == 1)
Example #12
 def test_dataset_size(self):
     dataset = Dataset()
     dataset.append(
         Experience(observation=torch.as_tensor([0] * 10),
                    action=torch.as_tensor([1] * 3),
                    reward=torch.as_tensor(0),
                    done=torch.as_tensor(2)))
     first_size = dataset.get_memory_size()
     dataset.append(
         Experience(observation=torch.as_tensor([0] * 10),
                    action=torch.as_tensor([1] * 3),
                    reward=torch.as_tensor(0),
                    done=torch.as_tensor(2)))
     self.assertEqual(2 * first_size, dataset.get_memory_size())
     dataset = Dataset()
     dataset.append(
         Experience(observation=torch.as_tensor([0] * 10,
                                                dtype=torch.float32),
                    action=torch.as_tensor([1] * 3, dtype=torch.float32),
                    reward=torch.as_tensor(0, dtype=torch.float32),
                    done=torch.as_tensor(2, dtype=torch.float32)))
     second_size = dataset.get_memory_size()
     self.assertEqual(first_size, 2 * second_size)
Example #13
def set_binary_maps_as_target(dataset: Dataset, invert: bool = False, binary_images: List[np.ndarray] = None) -> Dataset:
    if binary_images is None:
        binary_images = parse_binary_maps(copy.deepcopy(dataset.observations), invert=invert)
    dataset.actions = [torch.as_tensor(b) for b in binary_images]
    return dataset
Example #14
    def _clean(self, filename_tag: str, runs: List[str]) -> None:
        total_data_points = 0
        filename_index = 0
        hdf5_data = Dataset()
        for run in tqdm(runs):
            if self._config.require_success:
                if not os.path.isfile(os.path.join(run, 'Success')):
                    continue
            # load the run's data into a dataset at the configured input size
            run_dataset = self._data_loader.load_dataset_from_directories(
                [run])
            if len(run_dataset) <= self._config.remove_first_n_timestamps:
                continue
            # remove first N frames
            for _ in range(self._config.remove_first_n_timestamps):
                run_dataset.pop()
            # subsample
            run_dataset.subsample(self._config.data_loader_config.subsample)
            # enforce max run length
            if self._config.max_run_length != -1:
                run_dataset.clip(self._config.max_run_length)
                assert len(run_dataset) <= self._config.max_run_length
            # augment with background noise and change target to binary map

            binary_maps = parse_binary_maps(run_dataset.observations, invert=self._config.invert_binary_maps) \
                if self._config.augment_background_noise != 0 or self._config.augment_background_textured != 0 else None
            if self._config.binary_maps_as_target:
                run_dataset = set_binary_maps_as_target(
                    run_dataset,
                    invert=self._config.invert_binary_maps,
                    binary_images=binary_maps,
                    smoothen_labels=self._config.smoothen_labels)

            if self._config.augment_background_noise != 0:
                run_dataset = augment_background_noise(
                    run_dataset,
                    p=self._config.augment_background_noise,
                    binary_images=binary_maps)
            if self._config.augment_background_textured != 0:
                run_dataset = augment_background_textured(
                    run_dataset,
                    texture_directory=self._config.texture_directory,
                    p=self._config.augment_background_textured,
                    p_empty=self._config.augment_empty_images,
                    binary_images=binary_maps)
            # store hdf5 file once max dataset size is reached
            hdf5_data.extend(run_dataset)
            self._data_loader.empty_dataset()
            if hdf5_data.get_memory_size() > self._config.max_hdf5_size:
                if self._config.shuffle:
                    hdf5_data.shuffle()
                create_hdf5_file_from_dataset(filename=os.path.join(
                    self._config.output_path,
                    f'{filename_tag}_{filename_index}.hdf5'),
                                              dataset=hdf5_data)
                filename_index += 1
                total_data_points += len(hdf5_data)
                hdf5_data = Dataset()
        if len(hdf5_data) != 0:
            if self._config.shuffle:
                hdf5_data.shuffle()
            create_hdf5_file_from_dataset(filename=os.path.join(
                self._config.output_path,
                f'{filename_tag}_{filename_index}.hdf5'),
                                          dataset=hdf5_data)
            total_data_points += len(hdf5_data)
        print(f'Total data points: {total_data_points}')
Example #15
class DataLoader:
    def __init__(self, config: DataLoaderConfig):
        self._config = config
        self._logger = get_logger(
            name=get_filename_without_extension(__file__),
            output_path=config.output_path,
            quiet=False)
        cprint(f'Started.', self._logger)
        self._dataset = Dataset()
        self._num_runs = 0
        self._probabilities: List = []
        self.seed()
        self._hdf5_file_index = -1

    def seed(self, seed: int = None):
        np.random.seed(
            self._config.random_seed) if seed is None else np.random.seed(seed)

    def update_data_directories_with_raw_data(self):
        if self._config.data_directories is None:
            self._config.data_directories = []
        for d in sorted(
                os.listdir(os.path.join(self._config.output_path,
                                        'raw_data'))):
            self._config.data_directories.append(
                os.path.join(self._config.output_path, 'raw_data', d))
        self._config.data_directories = list(set(
            self._config.data_directories))

    def load_dataset(self):
        if len(self._config.hdf5_files) != 0:
            if self._config.loop_over_hdf5_files:
                self._dataset = Dataset()
                self._hdf5_file_index += 1
                self._hdf5_file_index %= len(self._config.hdf5_files)
                while len(self._dataset) == 0:
                    try:
                        self._dataset.extend(
                            load_dataset_from_hdf5(
                                self._config.hdf5_files[self._hdf5_file_index],
                                input_size=self._config.input_size))
                    except OSError:
                        cprint(
                            f'Failed to load {self._config.hdf5_files[self._hdf5_file_index]}',
                            self._logger,
                            msg_type=MessageType.warning)
                        del self._config.hdf5_files[self._hdf5_file_index]
                        self._hdf5_file_index %= len(self._config.hdf5_files)
                cprint(
                    f'Loaded {len(self._dataset)} datapoints from {self._config.hdf5_files[self._hdf5_file_index]}',
                    self._logger,
                    msg_type=MessageType.warning if len(
                        self._dataset.observations) == 0 else MessageType.info)
            else:
                for hdf5_file in self._config.hdf5_files:
                    self._dataset.extend(
                        load_dataset_from_hdf5(
                            hdf5_file, input_size=self._config.input_size))
                cprint(
                    f'Loaded {len(self._dataset)} datapoints from {self._config.hdf5_files}',
                    self._logger,
                    msg_type=MessageType.warning if len(
                        self._dataset.observations) == 0 else MessageType.info)
        else:
            self.load_dataset_from_directories(self._config.data_directories)

        if self._config.subsample != 1:
            self._dataset.subsample(self._config.subsample)

        if self._config.balance_over_actions:
            self._probabilities = balance_weights_over_actions(self._dataset)

    def load_dataset_from_directories(self,
                                      directories: List[str] = None
                                      ) -> Dataset:
        directory_generator = tqdm(directories, ascii=True, desc=__name__) \
            if len(directories) > 10 else directories
        for directory in directory_generator:
            run = load_run(directory,
                           arrange_according_to_timestamp=False,
                           input_size=self._config.input_size,
                           scope=self._config.input_scope)
            if len(run) != 0:
                self._dataset.extend(experiences=run)
        cprint(
            f'Loaded {len(self._dataset)} data points from {len(directories)} directories',
            self._logger,
            msg_type=MessageType.warning
            if len(self._dataset) == 0 else MessageType.info)
        return self._dataset

    def empty_dataset(self) -> None:
        self._dataset = Dataset()

    def set_dataset(self, ds: Dataset = None) -> None:
        if ds is not None:
            self._dataset = ds
        else:
            self._dataset = Dataset()
            self.update_data_directories_with_raw_data()
            self.load_dataset()

    def get_dataset(self) -> Dataset:
        return self._dataset

    def get_data_batch(self) -> Generator[Dataset, None, None]:
        if len(self._dataset) == 0 or self._config.loop_over_hdf5_files:
            self.load_dataset()
        index = 0
        while index < len(self._dataset):
            batch = Dataset()
            end_index = min(index + self._config.batch_size, len(self._dataset)) \
                if self._config.batch_size != -1 else len(self._dataset)
            batch.observations = self._dataset.observations[index:end_index]
            batch.actions = self._dataset.actions[index:end_index]
            batch.done = self._dataset.done[index:end_index]
            batch.rewards = self._dataset.rewards[index:end_index]
            index = index + self._config.batch_size if self._config.batch_size != -1 else len(
                self._dataset)
            yield batch

    def sample_shuffled_batch(self, max_number_of_batches: int = 1000) \
            -> Generator[Dataset, None, None]:
        """
        randomly shuffle data samples in runs in dataset and provide them as ready run objects
        :param batch_size: number of samples or datapoints in one batch
        :param max_number_of_batches: define an upperbound in number of batches to end epoch
        :param dataset: list of runs with inputs, outputs and batches
        :return: yield a batch up until all samples are done
        """
        if len(self._dataset) == 0 or self._config.loop_over_hdf5_files:
            self.load_dataset()
        # Get data indices:
        batch_count = 0
        while batch_count < min(
                len(self._dataset),
                max_number_of_batches * self._config.batch_size):
            sample_indices = np.random.choice(
                list(range(len(self._dataset))),
                size=self._config.batch_size,
                replace=len(self._dataset) < self._config.batch_size,
                p=self._probabilities
                if len(self._probabilities) != 0 else None)
            batch = select(self._dataset, sample_indices)
            batch_count += len(batch)
            yield batch
        return

    def split_data(self, indices: np.ndarray,
                   *args) -> Generator[tuple, None, None]:
        """
        Split the indices in batches of configs batch_size and select the data in args.
        :param indices: possible indices to be selected. If all indices can be selected, provide empty array.
        :param args: lists or tensors from which the corresponding data according to the indices is selected.
        :return: provides a tuple in the same order as the args with the selected data.
        """
        if len(indices) == 0:
            indices = np.arange(len(self._dataset))
        np.random.shuffle(indices)
        splits = np.array_split(
            indices, max(1, int(len(self._dataset) / self._config.batch_size)))
        for selected_indices in splits:
            return_tuple = tuple(select(data, selected_indices) for data in args)
            yield return_tuple

    def remove(self):
        [h.close() for h in self._logger.handlers]
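A sketch that exercises the shuffled-batch and split helpers of the class above; loader is assumed to be a DataLoader with a loaded dataset and a positive batch_size:

import numpy as np

for batch in loader.sample_shuffled_batch(max_number_of_batches=100):
    pass  # one training step on `batch` would go here

dataset = loader.get_dataset()
for obs_batch, act_batch in loader.split_data(np.array([], dtype=int),
                                              dataset.observations,
                                              dataset.actions):
    pass  # one validation step on the selected observations and actions

loader.remove()   # close the logger's file handles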
Example #16
 def empty_dataset(self) -> None:
     del self._dataset
     self._dataset = Dataset()
Example #17
 def empty_dataset(self) -> None:
     self._dataset = Dataset()
Example #18
class PhiWeightTest(unittest.TestCase):
    def setUp(self) -> None:
        self.output_dir = f'{os.environ["PWD"]}/test_dir/{get_filename_without_extension(__file__)}'
        os.makedirs(self.output_dir, exist_ok=True)
        self.batch = Dataset()
        self.durations = [10, 1, 5]
        self.step_reward = torch.as_tensor(1)
        self.end_reward = torch.as_tensor(10)
        for episode in range(3):
            for experience in range(self.durations[episode] - 1):
                self.batch.append(
                    Experience(observation=torch.as_tensor(5),
                               action=torch.as_tensor(5),
                               reward=self.step_reward,
                               done=torch.as_tensor(0)))
            self.batch.append(
                Experience(observation=torch.as_tensor(5),
                           action=torch.as_tensor(5),
                           reward=self.end_reward,
                           done=torch.as_tensor(2)))

    def test_get_returns_on_dataset(self):
        returns = get_returns(self.batch)
        targets = [
            self.end_reward + (duration - 1) * self.step_reward
            for duration in self.durations for _ in range(duration)
        ]
        for r_e, r_t in zip(returns, targets):
            self.assertEqual(r_e, r_t)

    def test_get_reward_to_go(self):
        returns = get_reward_to_go(self.batch)
        targets = reversed([
            self.end_reward + t * self.step_reward
            for duration in reversed(self.durations) for t in range(duration)
        ])

        for r_e, r_t in zip(returns, targets):
            self.assertEqual(r_e, r_t)

    def test_generalized_advantage_estimate(self):
        # with gae_lambda == 1 and no value --> same as reward-to-go
        rtg_returns = get_generalized_advantage_estimate(
            batch_rewards=self.batch.rewards,
            batch_done=self.batch.done,
            batch_values=[torch.as_tensor(0.)] * len(self.batch),
            discount=1,
            gae_lambda=1)
        for r_e, r_t in zip(rtg_returns, get_reward_to_go(self.batch)):
            self.assertEqual(r_e, r_t)

        one_step_returns = get_generalized_advantage_estimate(
            batch_rewards=self.batch.rewards,
            batch_done=self.batch.done,
            batch_values=[torch.as_tensor(0.)] * len(self.batch),
            discount=1,
            gae_lambda=0)
        targets = [
            self.step_reward if d == 0 else self.end_reward
            for d in self.batch.done
        ]
        for r_e, r_t in zip(one_step_returns, targets):
            self.assertEqual(r_e, r_t)

        gae_returns = get_generalized_advantage_estimate(
            batch_rewards=self.batch.rewards,
            batch_done=self.batch.done,
            batch_values=[torch.as_tensor(0.)] * len(self.batch),
            discount=0.99,
            gae_lambda=0.99)
        for t in range(len(self.batch)):
            self.assertGreaterEqual(gae_returns[t], one_step_returns[t])
            self.assertLessEqual(gae_returns[t], rtg_returns[t])

    def tearDown(self) -> None:
        shutil.rmtree(self.output_dir, ignore_errors=True)
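A sketch of the return helpers exercised by the test above, applied to a single three-step episode with unit rewards; get_returns and get_reward_to_go are assumed importable alongside the test, and the Dataset/Experience import path is an assumption:

import torch
# assumed import path
from src.core.data_types import Dataset, Experience

episode = Dataset()
for step in range(3):
    episode.append(Experience(observation=torch.as_tensor(0),
                              action=torch.as_tensor(0),
                              reward=torch.as_tensor(1),
                              done=torch.as_tensor(0 if step < 2 else 2)))

print(get_returns(episode))        # full episode return at every step: 3, 3, 3
print(get_reward_to_go(episode))   # remaining return per step: 3, 2, 1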
Example #19
class DataSaver:
    def __init__(self, config: DataSaverConfig):
        self._config = config
        self._logger = get_logger(
            name=get_filename_without_extension(__file__),
            output_path=self._config.output_path,
            quiet=False)
        cprint(f'initiate', self._logger)

        if not self._config.saving_directory.startswith('/'):
            self._config.saving_directory = os.path.join(
                os.environ['HOME'], self._config.saving_directory)

        if self._config.store_on_ram_only:
            self._dataset = Dataset(max_size=self._config.max_size)

        # used to keep track of replay buffer size on file system
        if not self._config.store_on_ram_only \
                and os.path.isdir(os.path.dirname(self._config.saving_directory)) \
                and self._config.max_size != -1:
            data_loader = DataLoader(config=DataLoaderConfig().create(
                config_dict={
                    'data_directories': [
                        os.path.join(
                            os.path.dirname(self._config.saving_directory),
                            run) for run in sorted(
                                os.listdir(
                                    os.path.dirname(
                                        self._config.saving_directory)))
                    ],
                    'output_path':
                    self._config.output_path,
                    'store':
                    False  # don't store config
                }))
            data_loader.load_dataset()
            self._frame_counter = len(data_loader.get_dataset())
        else:
            self._frame_counter = 0

    def __len__(self):
        if self._config.store_on_ram_only:
            return len(self._dataset)
        else:
            return self._frame_counter

    def update_saving_directory(self):
        if self._config.separate_raw_data_runs:
            self._config.saving_directory = create_saving_directory(
                self._config.output_path, self._config.saving_directory_tag)

    def get_saving_directory(self):
        return self._config.saving_directory if not self._config.store_on_ram_only else 'ram'

    def get_dataset(self):
        return self._dataset

    def save(self, experience: Experience) -> None:
        if experience.done == TerminationType.Unknown:
            return  # don't save experiences in an unknown state
        if self._config.store_on_ram_only:
            return self._dataset.append(experience)
        else:
            os.makedirs(self._config.saving_directory, exist_ok=True)
            return self._store_in_file_system(experience=experience)

    def _store_in_file_system(self, experience: Experience) -> None:
        for dst, data in zip(['observation', 'action', 'reward', 'done'], [
                experience.observation, experience.action, experience.reward,
                experience.done
        ]):
            if data is not None:
                self._store_frame(data=np.asarray(
                    data.value if isinstance(data, Action) else data),
                                  dst=dst,
                                  time_stamp=experience.time_stamp)

        for key, value in experience.info.items():
            self._store_frame(data=np.asarray(value.value) if isinstance(
                value, Action) else value,
                              dst=f'info_{to_file_name(key)}',
                              time_stamp=experience.time_stamp)

        if experience.done in [
                TerminationType.Success, TerminationType.Failure
        ]:
            os.system(
                f'touch {os.path.join(self._config.saving_directory, experience.done.name)}'
            )
        self._check_dataset_size_on_file_system()

    def _store_frame(self, data: Union[np.ndarray, float], dst: str,
                     time_stamp: int) -> None:
        if not isinstance(data, np.ndarray):
            data = np.asarray(data)
        try:
            if len(data.shape) in [2, 3]:
                if not os.path.isdir(
                        os.path.join(self._config.saving_directory, dst)):
                    os.makedirs(os.path.join(self._config.saving_directory,
                                             dst),
                                exist_ok=True)
                store_image(
                    data=data,
                    file_name=os.path.join(self._config.saving_directory, dst,
                                           timestamp_to_filename(time_stamp)) +
                    '.jpg')
            elif len(data.shape) in [0, 1]:
                store_array_to_file(data=data,
                                    file_name=os.path.join(
                                        self._config.saving_directory,
                                        dst + '.data'),
                                    time_stamp=time_stamp)
        except Exception as e:
            cprint(f'Failed to store frame: {e}',
                   self._logger,
                   msg_type=MessageType.error)

    def _check_dataset_size_on_file_system(self):
        self._frame_counter += 1
        # If the number of frames exceeds max_size, remove the oldest run and decrease the frame counter
        if self._frame_counter > self._config.max_size != -1:
            raw_data_dir = os.path.dirname(self._config.saving_directory)
            first_run = sorted(os.listdir(raw_data_dir))[0]
            with open(os.path.join(raw_data_dir, first_run, 'done.data'),
                      'r') as f:
                run_length = len(f.readlines())
            self._frame_counter -= run_length
            shutil.rmtree(os.path.join(raw_data_dir, first_run),
                          ignore_errors=True)
            if not self._config.separate_raw_data_runs:
                cprint(
                    f"Reached max buffer size and removing all data."
                    f"Avoid this by setting data_saver_config.separate_raw_data_runs to True.",
                    msg_type=MessageType.warning,
                    logger=self._logger)

    def _get_runs(self) -> list:
        """
        parse the parent directory of the saving directory for all raw_data runs.
        Return a list of the absolute paths to these runs.
        """
        raw_data_dir = os.path.dirname(self._config.saving_directory)
        return [
            os.path.join(raw_data_dir, run)
            for run in sorted(os.listdir(raw_data_dir))
        ]

    def create_train_validation_hdf5_files(
            self,
            runs: List[str] = None,
            input_size: List[int] = None) -> None:
        all_runs = runs if runs is not None else self._get_runs()

        number_of_training_runs = int(self._config.training_validation_split *
                                      len(all_runs))
        train_runs = all_runs[0:number_of_training_runs]
        validation_runs = all_runs[number_of_training_runs:]

        for file_name, runs in zip(['train', 'validation'],
                                   [train_runs, validation_runs]):
            config = DataLoaderConfig().create(
                config_dict={
                    'data_directories': runs,
                    'output_path': self._config.output_path,
                    'subsample': self._config.subsample_hdf5,
                    'input_size': input_size
                })
            data_loader = DataLoader(config=config)
            data_loader.load_dataset()
            create_hdf5_file_from_dataset(filename=os.path.join(
                self._config.output_path, file_name + '.hdf5'),
                                          dataset=data_loader.get_dataset())
            cprint(f'created {file_name}.hdf5', self._logger)

    def empty_raw_data_in_output_directory(self) -> None:
        raw_data_directory = os.path.dirname(self._config.saving_directory)
        if os.path.isdir(raw_data_directory):
            for d in os.listdir(raw_data_directory):
                shutil.rmtree(os.path.join(raw_data_directory, d))

    def clear_buffer(self) -> None:
        self._frame_counter = 0
        if self._config.store_on_ram_only:
            self._dataset = Dataset()
        else:
            self.empty_raw_data_in_output_directory()

    def remove(self):
        [h.close() for h in self._logger.handlers]
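A sketch of the typical DataSaver lifecycle during data collection, using only methods defined above; saver and experience are placeholder names, and experience.done is assumed not to be TerminationType.Unknown (such experiences are silently dropped):

saver.save(experience)                       # append to RAM or write frames to the saving directory
saver.update_saving_directory()              # start a new run directory (if separate_raw_data_runs)
saver.create_train_validation_hdf5_files()   # pack the stored runs into train.hdf5 / validation.hdf5
saver.remove()                               # close the logger's file handles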
Example #20
 def clear_buffer(self) -> None:
     self._frame_counter = 0
     if self._config.store_on_ram_only:
         self._dataset = Dataset()
     else:
         self.empty_raw_data_in_output_directory()