def create_train_validation_hdf5_files(
        self,
        runs: List[str] = None,
        input_size: List[int] = None) -> None:
    all_runs = runs if runs is not None else self._get_runs()
    number_of_training_runs = int(self._config.training_validation_split *
                                  len(all_runs))
    train_runs = all_runs[0:number_of_training_runs]
    validation_runs = all_runs[number_of_training_runs:]
    for file_name, runs in zip(['train', 'validation'],
                               [train_runs, validation_runs]):
        config = DataLoaderConfig().create(
            config_dict={
                'data_directories': runs,
                'output_path': self._config.output_path,
                'subsample': self._config.subsample_hdf5,
                'input_size': input_size
            })
        data_loader = DataLoader(config=config)
        data_loader.load_dataset()
        create_hdf5_file_from_dataset(
            filename=os.path.join(self._config.output_path,
                                  file_name + '.hdf5'),
            dataset=data_loader.get_dataset())
        cprint(f'created {file_name}.hdf5', self._logger)

def test_clip_first_x_frames(self):
    info = generate_random_dataset_in_raw_data(output_dir=self.output_dir,
                                               num_runs=20,
                                               input_size=(100, 100, 3),
                                               output_size=(1, ),
                                               continuous=True,
                                               store_hdf5=False)
    cleaner_config_dict = {
        'output_path': self.output_dir,
        'data_loader_config': {
            'data_directories': info['episode_directories'],
            'subsample': 2
        },
        'training_validation_split': 1.0,
        'remove_first_n_timestamps': 5,
    }
    data_cleaner = DataCleaner(config=DataCleaningConfig().create(
        config_dict=cleaner_config_dict))
    data_cleaner.clean()
    data_loader = DataLoader(config=DataLoaderConfig().create(
        config_dict={
            'output_path': self.output_dir,
            'hdf5_files': glob(f'{self.output_dir}/train*.hdf5')
        }))
    data_loader.load_dataset()
    self.assertEqual(
        sum(int((e - 5) / 2) + 1 for e in info['episode_lengths']),
        len(data_loader.get_dataset()))

def test_create_hdf5_files_subsampled_in_time(self):
    num_runs = 10
    split = 1.0
    subsample = 3
    config_dict = {
        'output_path': self.output_dir,
        'training_validation_split': split,
        'store_hdf5': True,
        'subsample_hdf5': subsample,
        'separate_raw_data_runs': True
    }
    config = DataSaverConfig().create(config_dict=config_dict)
    self.data_saver = DataSaver(config=config)
    info = generate_dummy_dataset(self.data_saver, num_runs=num_runs)
    self.data_saver.create_train_validation_hdf5_files()
    config = DataLoaderConfig().create(
        config_dict={
            'output_path': self.output_dir,
            'hdf5_files': [os.path.join(self.output_dir, 'train.hdf5')]
        })
    training_data_loader = DataLoader(config=config)
    training_data_loader.load_dataset()
    training_data = training_data_loader.get_dataset()
    self.assertEqual(
        len(training_data),
        sum([
            np.ceil((el - 1) / subsample) + 1
            for el in info['episode_lengths']
        ]))

def test_split_hdf5_chunks(self):
    info = generate_random_dataset_in_raw_data(output_dir=self.output_dir,
                                               num_runs=20,
                                               input_size=(100, 100, 3),
                                               output_size=(1, ),
                                               continuous=True,
                                               store_hdf5=False)
    cleaner_config_dict = {
        'output_path': self.output_dir,
        'data_loader_config': {
            'data_directories': info['episode_directories'],
        },
        'training_validation_split': 1.0,
        'max_hdf5_size': 5 * 10**6
    }
    data_cleaner = DataCleaner(config=DataCleaningConfig().create(
        config_dict=cleaner_config_dict))
    data_cleaner.clean()
    for hdf5_file in glob(f'{self.output_dir}/train*.hdf5'):
        data_loader = DataLoader(config=DataLoaderConfig().create(
            config_dict={
                'output_path': self.output_dir,
                'hdf5_files': [hdf5_file]
            }))
        data_loader.load_dataset()
        self.assertTrue(
            data_loader.get_dataset().get_memory_size() < 6 * 10**6)

def pred_info(self,
              model_path,
              model_name,
              data_path,
              output_path=None,
              hparams=None):
    # get outputs
    self._model = self.get_model(model_name)
    default_hparams = self._model.get_default_params()
    if hparams is not None:
        default_hparams.update_merge(hparams=hparams)
    hparams = default_hparams
    batch_size = hparams.batch_size
    data_loader = DataLoader(data_path=data_path,
                             test_path=data_path,
                             hparams=hparams)
    label_list = data_loader.label_list
    hparams.update(num_labels=len(label_list))
    model, sess, g = self._model_init(model=self._model,
                                      hparams=hparams,
                                      directory=model_path)
    cur_time = datetime.now()
    total_outputs = list()
    total_true_false = list()
    total_pred = list()
    for t_i, (t_data, t_labels) in enumerate(
            data_loader.batch_loader(data_loader.dataset, batch_size)):
        outputs, true_false, pred = sess.run(
            [model.outputs, model.true_false, model.pred],
            feed_dict={
                model.x: t_data,
                model.y: t_labels,
                model.dropout_keep_prob: 1.0
            })
        # take only the last output, whose size matches the hidden dim
        outputs = outputs[:, -1]
        total_outputs.extend(outputs)
        total_true_false.extend(true_false)
        total_pred.extend(pred)
    if output_path is not None:
        # np.save(total_outputs, total_outputs)
        print('cannot save currently')
    return total_outputs

def test_big_data_hdf5_loop(self):
    # create 3 datasets as hdf5 files
    hdf5_files = []
    infos = []
    for index in range(3):
        output_path = os.path.join(self.output_dir, f'ds{index}')
        os.makedirs(output_path, exist_ok=True)
        config_dict = {
            'output_path': output_path,
            'store_hdf5': True,
            'training_validation_split': 1.0
        }
        config = DataSaverConfig().create(config_dict=config_dict)
        self.data_saver = DataSaver(config=config)
        infos.append(
            generate_dummy_dataset(self.data_saver,
                                   num_runs=2,
                                   input_size=(3, 10, 10),
                                   fixed_input_value=(0.3 * index) *
                                   np.ones((3, 10, 10)),
                                   store_hdf5=True))
        self.assertTrue(
            os.path.isfile(os.path.join(output_path, 'train.hdf5')))
        hdf5_files.append(os.path.join(output_path, 'train.hdf5'))
        hdf5_files.append(os.path.join(output_path, 'wrong.hdf5'))
    # create data loader with big data tag and three hdf5 training sets
    conf = {
        'output_path': self.output_dir,
        'hdf5_files': hdf5_files,
        'batch_size': 15,
        'loop_over_hdf5_files': True
    }
    loader = DataLoader(DataLoaderConfig().create(config_dict=conf))
    # sample data batches and see that index increases every two batches sampled
    for batch in loader.get_data_batch():
        self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0)
    for batch in loader.get_data_batch():
        self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0.3, 2)
    for batch in loader.get_data_batch():
        self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0.6, 2)
    for batch in loader.get_data_batch():
        self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0, 2)
    for batch in loader.sample_shuffled_batch():
        self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0.3, 2)
    for batch in loader.sample_shuffled_batch():
        self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0.6, 2)
    for batch in loader.sample_shuffled_batch():
        self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0, 2)

def test_data_loader_from_raw_path_dirs(self):
    self.info = generate_dummy_dataset(self.data_saver,
                                       num_runs=20,
                                       input_size=(100, 100, 3),
                                       output_size=(3, ),
                                       continuous=False)
    config_dict = {
        'data_directories': [self.output_dir],
        'output_path': self.output_dir,
    }
    config = DataLoaderConfig().create(config_dict=config_dict)
    data_loader = DataLoader(config=config)
    data_loader.load_dataset()
    config = DataLoaderConfig().create(config_dict=config_dict)
    for d in config.data_directories:
        self.assertTrue(os.path.isdir(d))

def classify() -> None:
    """Run classification of the 20 newsgroups dataset

    :return: None
    :rtype: None
    """
    logger = logging.getLogger(__name__)
    logger.info(f'Running {classify.__name__}')

    # setup
    logger.info('Loading config')
    app_cfg = ConfigLoader.load_config()
    model_cfg = ConfigLoader.load_config(app_cfg['model']['config'])
    logger.info(f"Using random seed: {model_cfg['random_seed']}")
    RandUtils.set_random_seed(model_cfg['random_seed'])

    # load data
    logger.info('Loading data')
    dl = DataLoader(app_cfg['paths']['data_dir'])
    splits = dl.load_splits(app_cfg['model']['data'])

    # extract features
    logger.info('Extracting features')
    logger.info(f"Vectorizer class: {app_cfg['model']['vectorizer_class']}")
    vectorizer = eval(app_cfg['model']['vectorizer_class'])(
        model_cfg['tokenizer'], model_cfg['vectorizer'])
    X_train, X_test = vectorizer.vectorize(
        splits['train'][0] + splits['dev'][0], splits['test'][0])
    y_train = np.array(splits['train'][1] + splits['dev'][1])
    y_test = np.array(splits['test'][1])

    # run classification
    logger.info('Running model')
    logger.info(f"Model class: {app_cfg['model']['model_class']}")
    model = eval(app_cfg['model']['model_class'])(model_cfg['model'])
    results = model.train_test(X_train, X_test, y_train, y_test)

    # log results
    logger.info('Logging results')
    el = ExperimentsLogger(app_cfg['paths']['output_dir'])
    el.log_experiment(app_cfg, model_cfg, results)

    logger.info(f'Done running {classify.__name__}')

def test_data_batch(self):
    self.info = generate_dummy_dataset(self.data_saver,
                                       num_runs=20,
                                       input_size=(100, 100, 3),
                                       output_size=(3, ),
                                       continuous=False)
    config_dict = {
        'data_directories': self.info['episode_directories'],
        'output_path': self.output_dir,
        'random_seed': 1,
        'batch_size': 3
    }
    data_loader = DataLoader(config=DataLoaderConfig().create(
        config_dict=config_dict))
    data_loader.load_dataset()
    for batch in data_loader.get_data_batch():
        self.assertEqual(len(batch), config_dict['batch_size'])
        break

def test_data_loading(self):
    self.info = generate_dummy_dataset(self.data_saver,
                                       num_runs=20,
                                       input_size=(100, 100, 3),
                                       output_size=(3, ),
                                       continuous=False)
    config_dict = {
        'data_directories': self.info['episode_directories'],
        'output_path': self.output_dir,
    }
    config = DataLoaderConfig().create(config_dict=config_dict)
    data_loader = DataLoader(config=config)
    data_loader.load_dataset()
    # assert nothing is empty
    for k in ['observations', 'actions', 'rewards', 'done']:
        data = eval(f'data_loader.get_dataset().{k}')
        self.assertTrue(len(data) > 0)
        self.assertTrue(sum(data[0].shape) > 0)

def test_data_subsample(self):
    self.info = generate_dummy_dataset(self.data_saver,
                                       num_runs=20,
                                       input_size=(100, 100, 3),
                                       output_size=(3, ),
                                       continuous=False)
    subsample = 4
    config_dict = {
        'data_directories': self.info['episode_directories'],
        'output_path': self.output_dir,
        'random_seed': 1,
        'batch_size': 3,
        'subsample': subsample
    }
    data_loader = DataLoader(config=DataLoaderConfig().create(
        config_dict=config_dict))
    data_loader.load_dataset()
    self.assertEqual(
        sum([
            np.ceil((el - 1) / subsample) + 1
            for el in self.info['episode_lengths']
        ]), len(data_loader.get_dataset()))

def __init__(self, config: DataSaverConfig):
    self._config = config
    self._logger = get_logger(
        name=get_filename_without_extension(__file__),
        output_path=self._config.output_path,
        quiet=False)
    cprint(f'initiate', self._logger)
    if not self._config.saving_directory.startswith('/'):
        self._config.saving_directory = os.path.join(
            os.environ['HOME'], self._config.saving_directory)
    if self._config.store_on_ram_only:
        self._dataset = Dataset(max_size=self._config.max_size)
    # used to keep track of replay buffer size on file system
    if not self._config.store_on_ram_only \
            and os.path.isdir(os.path.dirname(self._config.saving_directory)) \
            and self._config.max_size != -1:
        data_loader = DataLoader(config=DataLoaderConfig().create(
            config_dict={
                'data_directories': [
                    os.path.join(
                        os.path.dirname(self._config.saving_directory), run)
                    for run in sorted(
                        os.listdir(
                            os.path.dirname(self._config.saving_directory)))
                ],
                'output_path': self._config.output_path,
                'store': False  # don't store config
            }))
        data_loader.load_dataset()
        self._frame_counter = len(data_loader.get_dataset())
    else:
        self._frame_counter = 0

def test_create_train_validation_hdf5_files(self):
    num_runs = 10
    split = 0.7
    config_dict = {
        'output_path': self.output_dir,
        'training_validation_split': split,
        'store_hdf5': True,
        'separate_raw_data_runs': True
    }
    config = DataSaverConfig().create(config_dict=config_dict)
    self.data_saver = DataSaver(config=config)
    info = generate_dummy_dataset(self.data_saver, num_runs=num_runs)
    self.data_saver.create_train_validation_hdf5_files()

    config = DataLoaderConfig().create(
        config_dict={
            'output_path': self.output_dir,
            'hdf5_files': [os.path.join(self.output_dir, 'train.hdf5')]
        })
    training_data_loader = DataLoader(config=config)
    training_data_loader.load_dataset()
    training_data = training_data_loader.get_dataset()

    config = DataLoaderConfig().create(
        config_dict={
            'output_path': self.output_dir,
            'hdf5_files': [os.path.join(self.output_dir, 'validation.hdf5')]
        })
    validation_data_loader = DataLoader(config=config)
    validation_data_loader.load_dataset()
    validation_data = validation_data_loader.get_dataset()

    self.assertEqual(len(training_data),
                     sum(info['episode_lengths'][:int(split * num_runs)]))
    self.assertEqual(len(validation_data),
                     sum(info['episode_lengths'][int(split * num_runs):]))

def test_generate_random_dataset_with_train_validation_hdf5(self):
    num_runs = 10
    # generate network
    network = eval(architecture_base_config['architecture']).Net(
        config=ArchitectureConfig().create(
            config_dict=architecture_base_config))
    # generate dummy dataset
    info = generate_random_dataset_in_raw_data(
        output_dir=self.output_dir,
        num_runs=num_runs,
        input_size=network.input_size,
        output_size=network.output_size,
        continuous=not network.discrete,
        store_hdf5=True)
    data_loader_config = {
        'output_path': self.output_dir,
        'hdf5_files': [os.path.join(self.output_dir, 'train.hdf5')]
    }
    data_loader = DataLoader(config=DataLoaderConfig().create(
        config_dict=data_loader_config))
    data_loader.load_dataset()
    self.assertNotEqual(
        sum(d != 0 for d in data_loader.get_dataset().done), 0)

def test_generate_random_dataset_in_raw_data(self):
    num_runs = 10
    # generate network
    network = eval(architecture_base_config['architecture']).Net(
        config=ArchitectureConfig().create(
            config_dict=architecture_base_config))
    # generate dummy dataset
    info = generate_random_dataset_in_raw_data(
        output_dir=self.output_dir,
        num_runs=num_runs,
        input_size=network.input_size,
        output_size=network.output_size,
        continuous=not network.discrete,
    )
    data_loader_config = {
        'output_path': self.output_dir,
        'data_directories': info['episode_directories'],
    }
    data_loader = DataLoader(config=DataLoaderConfig().create(
        config_dict=data_loader_config))
    data_loader.load_dataset()
    self.assertEqual(sum(d != 0 for d in data_loader.get_dataset().done),
                     num_runs)

def test_line_world_augmentation(self):
    line_image = np.ones((100, 100, 3))
    line_image[:, 40:43, 0:2] = 0
    info = generate_random_dataset_in_raw_data(
        output_dir=self.output_dir,
        num_runs=20,
        input_size=(100, 100, 3),
        output_size=(1, ),
        continuous=True,
        fixed_input_value=line_image,
        store_hdf5=False)
    cleaner_config_dict = {
        'output_path': self.output_dir,
        'data_loader_config': {
            'data_directories': info['episode_directories'],
            'input_size': (1, 64, 64)
        },
        'training_validation_split': 0.7,
        'remove_first_n_timestamps': 5,
        'binary_maps_as_target': True,
        'invert_binary_maps': True,
        'augment_background_noise': 0.1,
        'augment_background_textured': 0.9,
        'texture_directory': 'textured_dataset',
        'augment_empty_images': 0.1
    }
    data_cleaner = DataCleaner(config=DataCleaningConfig().create(
        config_dict=cleaner_config_dict))
    data_cleaner.clean()
    data_loader = DataLoader(config=DataLoaderConfig().create(
        config_dict={
            'output_path': self.output_dir,
            'hdf5_files': glob(f'{self.output_dir}/train*.hdf5')
        }))
    data_loader.load_dataset()
    data_loader.get_dataset().plot()

import datetime
import gc

import lightgbm as lgb
import numpy as np
import pandas as pd

from src.data.data_loader import DataLoader
from src.data.preprocessor import Preprocessor

# Load files
data_loader = DataLoader()
order_products = data_loader.load_raw_file('order_products')
orders = data_loader.load_raw_file('orders')
products = data_loader.load_raw_file('products')

data = orders.merge(order_products, on='order_id', how='left')
prior = data[data.eval_set == 'prior']
train = data[data.eval_set == 'train']
test = data[data.eval_set == 'test']

# Remove products that were first ordered in the user's last order
# (we don't have to predict those)
train = train[train.reordered != 0]

# Column not given for test data
train.drop('add_to_cart_order', inplace=True, axis=1)

# All reorders per user
reorders = train.groupby('user_id').product_id.apply(set)

# All products the user has bought

class DataCleaner:
    def __init__(self, config: DataCleaningConfig):
        self._config = config
        self._data_loader = DataLoader(config=config.data_loader_config)
        if len(config.data_loader_config.data_directories) == 0:
            self._data_loader.update_data_directories_with_raw_data()

    def clean(self):
        self._split_and_clean()

    def _split_and_clean(self):
        shuffled_runs = self._config.data_loader_config.data_directories[:]
        random.shuffle(shuffled_runs)
        num_training_runs = int(
            len(shuffled_runs) * self._config.training_validation_split)
        training_runs = shuffled_runs[:num_training_runs]
        validation_runs = shuffled_runs[num_training_runs:]
        for filename_tag, runs in zip(['train', 'validation'],
                                      [training_runs, validation_runs]):
            self._clean(filename_tag, runs)

    def _clean(self, filename_tag: str, runs: List[str]) -> None:
        total_data_points = 0
        filename_index = 0
        hdf5_data = Dataset()
        for run in tqdm(runs):
            if self._config.require_success:
                if not os.path.isfile(os.path.join(run, 'Success')):
                    continue
            # load the run's data into a dataset at the configured input size
            run_dataset = self._data_loader.load_dataset_from_directories(
                [run])
            if len(run_dataset) <= self._config.remove_first_n_timestamps:
                continue
            # remove first N frames
            for _ in range(self._config.remove_first_n_timestamps):
                run_dataset.pop()
            # subsample
            run_dataset.subsample(self._config.data_loader_config.subsample)
            # enforce max run length
            if self._config.max_run_length != -1:
                run_dataset.clip(self._config.max_run_length)
                assert len(run_dataset) <= self._config.max_run_length
            # augment with background noise and change target to binary map
            binary_maps = parse_binary_maps(
                run_dataset.observations,
                invert=self._config.invert_binary_maps) \
                if self._config.augment_background_noise != 0 \
                or self._config.augment_background_textured != 0 else None
            if self._config.binary_maps_as_target:
                run_dataset = set_binary_maps_as_target(
                    run_dataset,
                    invert=self._config.invert_binary_maps,
                    binary_images=binary_maps,
                    smoothen_labels=self._config.smoothen_labels)
            if self._config.augment_background_noise != 0:
                run_dataset = augment_background_noise(
                    run_dataset,
                    p=self._config.augment_background_noise,
                    binary_images=binary_maps)
            if self._config.augment_background_textured != 0:
                run_dataset = augment_background_textured(
                    run_dataset,
                    texture_directory=self._config.texture_directory,
                    p=self._config.augment_background_textured,
                    p_empty=self._config.augment_empty_images,
                    binary_images=binary_maps)
            # store hdf5 file once max dataset size is reached
            hdf5_data.extend(run_dataset)
            self._data_loader.empty_dataset()
            if hdf5_data.get_memory_size() > self._config.max_hdf5_size:
                if self._config.shuffle:
                    hdf5_data.shuffle()
                create_hdf5_file_from_dataset(
                    filename=os.path.join(
                        self._config.output_path,
                        f'{filename_tag}_{filename_index}.hdf5'),
                    dataset=hdf5_data)
                filename_index += 1
                total_data_points += len(hdf5_data)
                hdf5_data = Dataset()
        if len(hdf5_data) != 0:
            if self._config.shuffle:
                hdf5_data.shuffle()
            create_hdf5_file_from_dataset(
                filename=os.path.join(
                    self._config.output_path,
                    f'{filename_tag}_{filename_index}.hdf5'),
                dataset=hdf5_data)
            total_data_points += len(hdf5_data)
        print(f'Total data points: {total_data_points}')

parser = argparse.ArgumentParser()
parser.add_argument("--config_json", type=str, required=True)
parser.add_argument("--data_path", type=str, required=True)
parser.add_argument("--deu", type=int, required=False)
args = parser.parse_args()

data_path = args.data_path
config_json = args.config_json
deu = args.deu

# Config
config = Config.from_json_file(config_json)
logger.setLevel(config.log_level)

# DataLoader
data_loader = DataLoader(data_path, **config.data_loader["args"], deu=deu)
dataset_train = (tf.data.Dataset.from_generator(
    data_loader.train_data_generator,
    output_types=(tf.int32, tf.int32)).shuffle(config.buffer_size).batch(
        config.batch_size, drop_remainder=True).prefetch(PREFETCH))
dataset_test = (tf.data.Dataset.from_generator(
    data_loader.test_data_generator,
    output_types=(tf.int32, tf.int32)).shuffle(config.buffer_size).batch(
        config.inference_size,
        drop_remainder=True).repeat().prefetch(PREFETCH))
dataset_test_iterator = iter(dataset_test)

# Tokenizer
logger.info("Getting Tokenizer")
tokenizers = data_loader.tokenizer
tokenizer_ori: tf.keras.preprocessing.text.Tokenizer = tokenizers.ori

class Evaluator:
    def __init__(self,
                 config: EvaluatorConfig,
                 network: BaseNet,
                 quiet: bool = False):
        self._config = config
        self._net = network
        self.data_loader = DataLoader(config=self._config.data_loader_config)
        if not quiet:
            self._logger = get_logger(
                name=get_filename_without_extension(__file__),
                output_path=config.output_path,
                quiet=False) if type(self) == Evaluator else None
            cprint(f'Started.', self._logger)
        self._device = torch.device(
            "cuda" if self._config.device in ['gpu', 'cuda']
            and torch.cuda.is_available() else "cpu")
        self._criterion = eval(
            f'{self._config.criterion}(reduction=\'none\', '
            f'{self._config.criterion_args_str})')
        self._criterion.to(self._device)
        self._lowest_validation_loss = None
        self.data_loader.load_dataset()
        self._minimum_error = float(10**6)
        self._original_model_device = self._net.get_device() \
            if self._net is not None else None

    def put_model_on_device(self, device: str = None):
        self._original_model_device = self._net.get_device()
        self._net.set_device(
            torch.device(self._config.device)
            if device is None else torch.device(device))

    def put_model_back_to_original_device(self):
        self._net.set_device(self._original_model_device)

    def evaluate(self,
                 epoch: int = -1,
                 writer=None,
                 tag: str = 'validation') -> Tuple[str, bool]:
        self.put_model_on_device()
        total_error = []
        # for batch in tqdm(self.data_loader.get_data_batch(), ascii=True, desc='evaluate'):
        for batch in self.data_loader.get_data_batch():
            with torch.no_grad():
                predictions = self._net.forward(batch.observations,
                                                train=False)
                targets = data_to_tensor(batch.actions).type(
                    self._net.dtype).to(self._device)
                error = self._criterion(predictions, targets).mean()
                total_error.append(error)
        error_distribution = Distribution(total_error)
        self.put_model_back_to_original_device()
        if writer is not None:
            writer.write_distribution(error_distribution, tag)
            if self._config.store_output_on_tensorboard \
                    and (epoch % 30 == 0 or tag == 'test'):
                writer.write_output_image(predictions, f'{tag}/predictions')
                writer.write_output_image(targets, f'{tag}/targets')
                writer.write_output_image(torch.stack(batch.observations),
                                          f'{tag}/inputs')
        msg = f' {tag} {self._config.criterion} ' \
              f'{error_distribution.mean: 0.3e} [{error_distribution.std:0.2e}]'
        best_checkpoint = False
        if self._lowest_validation_loss is None \
                or error_distribution.mean < self._lowest_validation_loss:
            self._lowest_validation_loss = error_distribution.mean
            best_checkpoint = True
        return msg, best_checkpoint

    def evaluate_extensive(self) -> None:
        """
        Extra offline evaluation methods for an extensive evaluation at the end of training
        :return: None
        """
        self.put_model_on_device('cpu')
        self.data_loader.get_dataset().subsample(10)
        dataset = self.data_loader.get_dataset()
        predictions = self._net.forward(dataset.observations,
                                        train=False).detach().cpu()
        # error = predictions - torch.stack(dataset.actions)
        self.put_model_back_to_original_device()
        # save_output_plots(output_dir=self._config.output_path,
        #                   data={'expert': np.stack(dataset.actions),
        #                         'network': predictions.numpy(),
        #                         'difference': error.numpy()})
        # create_output_video(output_dir=self._config.output_path,
        #                     observations=dataset.observations,
        #                     actions={'expert': np.stack(dataset.actions),
        #                              'network': predictions.numpy()})
        create_output_video_segmentation_network(
            output_dir=self._config.output_path,
            observations=torch.stack(dataset.observations).numpy(),
            predictions=predictions.numpy())

    def remove(self):
        self.data_loader.remove()
        [h.close() for h in self._logger.handlers]

def test_sample_batch(self):
    self.info = generate_dummy_dataset(self.data_saver,
                                       num_runs=20,
                                       input_size=(100, 100, 3),
                                       output_size=(3, ),
                                       continuous=False)
    max_num_batches = 2
    config_dict = {
        'data_directories': self.info['episode_directories'],
        'output_path': self.output_dir,
        'random_seed': 1,
        'batch_size': 3
    }
    data_loader = DataLoader(config=DataLoaderConfig().create(
        config_dict=config_dict))
    data_loader.load_dataset()
    first_batch = []
    index = 0
    for index, batch in enumerate(
            data_loader.sample_shuffled_batch(
                max_number_of_batches=max_num_batches)):
        if index == 0:
            first_batch = deepcopy(batch)
        self.assertEqual(len(batch), config_dict['batch_size'])
    self.assertEqual(index, max_num_batches - 1)

    # test sampling seed for reproduction
    config_dict['random_seed'] = 2
    data_loader = DataLoader(config=DataLoaderConfig().create(
        config_dict=config_dict))
    data_loader.load_dataset()
    second_batch = []
    for index, batch in enumerate(
            data_loader.sample_shuffled_batch(
                max_number_of_batches=max_num_batches)):
        second_batch = deepcopy(batch)
        break
    self.assertNotEqual(np.sum(np.asarray(first_batch.observations[0])),
                        np.sum(np.asarray(second_batch.observations[0])))

    config_dict['random_seed'] = 1
    data_loader = DataLoader(config=DataLoaderConfig().create(
        config_dict=config_dict))
    data_loader.load_dataset()
    third_batch = []
    for index, batch in enumerate(
            data_loader.sample_shuffled_batch(
                max_number_of_batches=max_num_batches)):
        third_batch = deepcopy(batch)
        break
    self.assertEqual(np.sum(np.asarray(first_batch.observations[0])),
                     np.sum(np.asarray(third_batch.observations[0])))

class DomainAdaptationTrainer(Trainer):
    def __init__(self,
                 config: TrainerConfig,
                 network: BaseNet,
                 quiet: bool = False):
        super().__init__(config, network, quiet=True)
        self._config.epsilon = 0.2 if self._config.epsilon == "default" \
            else self._config.epsilon
        self.target_data_loader = DataLoader(
            config=self._config.target_data_loader_config)
        self.target_data_loader.load_dataset()
        self._domain_adaptation_criterion = \
            eval(f'{self._config.domain_adaptation_criterion}()') \
            if not self._config.domain_adaptation_criterion == 'default' \
            else MMDLossZhao()
        self._domain_adaptation_criterion.to(self._device)
        if not quiet:
            self._optimizer = eval(f'torch.optim.{self._config.optimizer}')(
                params=self._net.parameters(),
                lr=self._config.learning_rate,
                weight_decay=self._config.weight_decay)
            lambda_function = lambda f: 1 - f / self._config.scheduler_config.number_of_epochs
            self._scheduler = torch.optim.lr_scheduler.LambdaLR(
                self._optimizer, lr_lambda=lambda_function) \
                if self._config.scheduler_config is not None else None
            self._logger = get_logger(
                name=get_filename_without_extension(__file__),
                output_path=config.output_path,
                quiet=False)
            cprint(f'Started.', self._logger)

    def train(self, epoch: int = -1, writer=None) -> str:
        self.put_model_on_device()
        total_error = []
        task_error = []
        domain_error = []
        for source_batch, target_batch in zip(
                self.data_loader.sample_shuffled_batch(),
                self.target_data_loader.sample_shuffled_batch()):
            self._optimizer.zero_grad()
            targets = data_to_tensor(source_batch.actions).type(
                self._net.dtype).to(self._device)
            # task loss
            predictions = self._net.forward(source_batch.observations,
                                            train=True)
            task_loss = (1 - self._config.epsilon) * self._criterion(
                predictions, targets).mean()
            # add domain adaptation loss
            domain_loss = self._config.epsilon * self._domain_adaptation_criterion(
                self._net.get_features(source_batch.observations, train=True),
                self._net.get_features(target_batch.observations, train=True))
            loss = task_loss + domain_loss
            loss.backward()
            if self._config.gradient_clip_norm != -1:
                nn.utils.clip_grad_norm_(self._net.parameters(),
                                         self._config.gradient_clip_norm)
            self._optimizer.step()
            self._net.global_step += 1
            task_error.append(task_loss.cpu().detach())
            domain_error.append(domain_loss.cpu().detach())
            total_error.append(loss.cpu().detach())
        self.put_model_back_to_original_device()
        if self._scheduler is not None:
            self._scheduler.step()
        task_error_distribution = Distribution(task_error)
        domain_error_distribution = Distribution(domain_error)
        total_error_distribution = Distribution(total_error)
        if writer is not None:
            writer.set_step(self._net.global_step)
            writer.write_distribution(task_error_distribution,
                                      'training/task_error')
            writer.write_distribution(domain_error_distribution,
                                      'training/domain_error')
            writer.write_distribution(total_error_distribution,
                                      'training/total_error')
            if self._config.store_output_on_tensorboard and epoch % 30 == 0:
                writer.write_output_image(predictions, 'source/predictions')
                writer.write_output_image(targets, 'source/targets')
                writer.write_output_image(
                    torch.stack(source_batch.observations), 'source/inputs')
                writer.write_output_image(
                    self._net.forward(target_batch.observations, train=True),
                    'target/predictions')
                writer.write_output_image(
                    torch.stack(target_batch.observations), 'target/inputs')
        return f' training task: {self._config.criterion} {task_error_distribution.mean: 0.3e} ' \
               f'[{task_error_distribution.std:0.2e}]' \
               f' domain: {self._config.domain_adaptation_criterion} {domain_error_distribution.mean: 0.3e} ' \
               f'[{domain_error_distribution.std:0.2e}]'

class TestDataLoader(unittest.TestCase):
    def setUp(self) -> None:
        self.n_data = 10
        self.validation_split = 0.5
        self.batch_size = 2
        self.datapath_kor = "../data/input/aihub_kor-eng/1.구어체.xlsx"
        self.data_loader = DataLoader(
            self.datapath_kor,
            n_data=self.n_data,
            validation_split=self.validation_split,
            deu=False,
        )
        self.datapath_deu = "../data/input/deu.txt"
        self.data_loader_deu = DataLoader(
            self.datapath_deu,
            n_data=self.n_data,
            validation_split=self.validation_split,
            deu=True,
        )

    def test_len(self):
        self.assertEqual(len(self.data_loader.data_train),
                         self.n_data * (1 - self.validation_split))
        self.assertEqual(len(self.data_loader.data_test),
                         self.n_data * self.validation_split)
        self.assertEqual(
            len(self.data_loader_deu.data_train),
            self.n_data * (1 - self.validation_split),
        )

    def test_tokenizer(self):
        texts = ["<start> 나 는 매일 저녁 배트 를 만나 러 다락방 으로 가요 . <end>"]
        sequences = self.data_loader.tokenizer.tar.texts_to_sequences(texts)
        self.assertEqual(
            self.data_loader.tokenizer.tar.sequences_to_texts(sequences),
            texts)

    def test_train_generator(self):
        it = iter(self.data_loader.train_data_generator())
        dataset = tf.data.Dataset.from_generator(
            self.data_loader.train_data_generator,
            output_types=(tf.int32, tf.int32))
        self.assertListEqual(list(next(iter(dataset))[0].numpy()),
                             list(next(it)[0]))
        dataset = tf.data.Dataset.from_generator(
            self.data_loader.train_data_generator,
            output_types=(tf.int32, tf.int32)).batch(self.batch_size)
        example = next(iter(dataset))
        dataset = tf.data.Dataset.from_generator(
            self.data_loader_deu.train_data_generator,
            output_types=(tf.int32, tf.int32)).batch(self.batch_size)
        example = next(iter(dataset))
        print(example)

    def test_test_generator(self):
        dataset = tf.data.Dataset.from_generator(
            self.data_loader.test_data_generator,
            output_types=(tf.int32,
                          tf.int32)).batch(batch_size=self.batch_size)
        example = next(iter(dataset))
        print(example)
        dataset = tf.data.Dataset.from_generator(
            self.data_loader_deu.test_data_generator,
            output_types=(tf.int32,
                          tf.int32)).batch(batch_size=self.batch_size)
        example = next(iter(dataset))
        print(example)

    def test_dataloader_german_generator_of_train_and_test(self):
        dataset_train = tf.data.Dataset.from_generator(
            self.data_loader_deu.train_data_generator,
            output_types=(tf.int32,
                          tf.int32)).batch(batch_size=self.batch_size)
        example = next(iter(dataset_train))
        print(example)
        dataset_test = tf.data.Dataset.from_generator(
            self.data_loader_deu.train_data_generator,
            output_types=(tf.int32,
                          tf.int32)).batch(batch_size=self.batch_size)
        example = next(iter(dataset_test))
        print(example)
        print(self.data_loader_deu.tokenizer.ori.index_word)
        print(self.data_loader_deu.tokenizer.ori.num_words)
        print(self.data_loader_deu.tokenizer.ori.index_docs)
        print(self.data_loader_deu.tokenizer.ori.word_docs)

    def test_vocab_size(self):
        num_words = 10000
        data_loader = DataLoader(
            self.datapath_kor,
            n_data=None,
            validation_split=0.1,
            deu=False,
            num_words=num_words,
        )
        print()

def train(self, data_path, test_path, output_path, model_name, hparams=None):
    self._model = self.get_model(model_name)
    default_hparams = self._model.get_default_params()
    if hparams is not None:
        default_hparams.update_merge(hparams=hparams)
        hparams = default_hparams
    else:
        hparams = default_hparams
    model, sess, g = self._model_init(model=self._model, hparams=hparams)
    epochs = hparams.epochs
    batch_size = hparams.batch_size
    learning_rate = hparams.learning_rate
    data_loader = DataLoader(data_path=data_path,
                             test_path=test_path,
                             output_path=output_path,
                             hparams=hparams)
    label_list = data_loader.label_list
    hparams.update(num_labels=len(label_list))
    print('Label Length: %i' % (len(label_list)))
    global_step = 0
    print_step_interval = 500
    step_time = datetime.now()
    highest_accuracy = 0
    early_stop_count = 0
    for epoch in range(epochs):
        data_loader.reshuffle()
        avg_loss = 0.0
        avg_accuracy = 0.0
        for i, (data, labels) in enumerate(
                data_loader.batch_loader(data_loader.dataset, batch_size)):
            # print(labels)
            # print(data, labels)
            _, loss, accuracy, logits, outputs = sess.run(
                [
                    model.train, model.loss, model.accuracy, model.logits,
                    model.outputs
                ],
                feed_dict={
                    model.x: data,
                    model.y: labels,
                    model.dropout_keep_prob: 0.5,
                    model.learning_rate: learning_rate
                })
            avg_loss += float(loss)
            avg_accuracy += float(accuracy)
            global_step += 1
            if global_step % print_step_interval == 0:
                print(
                    '[global_step-%i] duration: %is train_loss: %f accuracy: %f'
                    % (global_step, (datetime.now() - step_time).seconds,
                       float(avg_loss / print_step_interval),
                       float(avg_accuracy / print_step_interval)))
                avg_loss = 0
                avg_accuracy = 0
                step_time = datetime.now()
            if global_step % (print_step_interval * 10) == 0:
                step_t_time = datetime.now()
                t_avg_loss = 0.0
                t_avg_accuracy = 0.0
                t_batch_iter_max = len(
                    data_loader.test_dataset) / batch_size + 1
                for t_i, (t_data, t_labels) in enumerate(
                        data_loader.batch_loader(data_loader.test_dataset,
                                                 batch_size)):
                    accuracy, logits, loss = sess.run(
                        [model.accuracy, model.logits, model.loss],
                        feed_dict={
                            model.x: t_data,
                            model.y: t_labels,
                            model.dropout_keep_prob: 1.0
                        })
                    t_avg_loss += float(loss)
                    t_avg_accuracy += float(accuracy)
                t_avg_loss = float(t_avg_loss / t_batch_iter_max)
                t_avg_accuracy = float(t_avg_accuracy / t_batch_iter_max)
                current_accuracy = t_avg_accuracy
                print(
                    '[global_step-%i] duration: %is test_loss: %f accuracy: %f'
                    % (global_step, (datetime.now() - step_t_time).seconds,
                       t_avg_loss, t_avg_accuracy))
                if highest_accuracy < current_accuracy:
                    print('Saving model...')
                    highest_accuracy = current_accuracy
                    current_accuracy = 0
                    if output_path is not None:
                        if not exists(output_path):
                            makedirs(output_path)
                        output_full_path = join(
                            output_path, 'loss%f_acc%f_epoch%i' %
                            (avg_loss, avg_accuracy, epoch + 1))
                        self.save_session(directory=output_full_path,
                                          global_step=global_step)
                if current_accuracy != 0:
                    early_stop_count += 1
                step_time = datetime.now()
        if early_stop_count > 2:
            learning_rate = learning_rate * 0.90
        if early_stop_count > 5:
            print('Early stopped !')
            break

def test_create_dataset_and_clean(self):
    info = generate_random_dataset_in_raw_data(output_dir=self.output_dir,
                                               num_runs=20,
                                               input_size=(100, 100, 3),
                                               output_size=(1, ),
                                               continuous=True,
                                               store_hdf5=False)
    cleaner_config_dict = {
        'output_path': self.output_dir,
        'data_loader_config': {
            'data_directories': info['episode_directories'],
            'input_size': (150, 150, 1)
        },
        'training_validation_split': 0.7,
    }
    data_cleaner = DataCleaner(config=DataCleaningConfig().create(
        config_dict=cleaner_config_dict))
    data_cleaner.clean()
    data_loader_train = DataLoader(config=DataLoaderConfig().create(
        config_dict={
            'output_path': self.output_dir,
            'hdf5_files': glob(f'{self.output_dir}/train*.hdf5')
        }))
    data_loader_train.load_dataset()
    data_loader_validation = DataLoader(config=DataLoaderConfig().create(
        config_dict={
            'output_path': self.output_dir,
            'hdf5_files': glob(f'{self.output_dir}/validation*.hdf5')
        }))
    data_loader_validation.load_dataset()
    ratio = len(data_loader_train.get_dataset()) / (
        0. + len(data_loader_train.get_dataset()) +
        len(data_loader_validation.get_dataset()))
    self.assertTrue(ratio > 0.6)
    self.assertTrue(ratio < 0.8)