def create_train_validation_hdf5_files(
            self,
            runs: List[str] = None,
            input_size: List[int] = None) -> None:
        all_runs = runs if runs is not None else self._get_runs()

        number_of_training_runs = int(self._config.training_validation_split *
                                      len(all_runs))
        train_runs = all_runs[0:number_of_training_runs]
        validation_runs = all_runs[number_of_training_runs:]

        for file_name, file_runs in zip(['train', 'validation'],
                                        [train_runs, validation_runs]):
            config = DataLoaderConfig().create(
                config_dict={
                    'data_directories': file_runs,
                    'output_path': self._config.output_path,
                    'subsample': self._config.subsample_hdf5,
                    'input_size': input_size
                })
            data_loader = DataLoader(config=config)
            data_loader.load_dataset()
            create_hdf5_file_from_dataset(filename=os.path.join(
                self._config.output_path, file_name + '.hdf5'),
                                          dataset=data_loader.get_dataset())
            cprint(f'created {file_name}.hdf5', self._logger)
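
A quick sanity check of the split arithmetic above, as a minimal sketch with hypothetical run names and the 0.7 split used in the tests further down:

all_runs = [f'run_{i}' for i in range(10)]
number_of_training_runs = int(0.7 * len(all_runs))    # 7
train_runs = all_runs[0:number_of_training_runs]      # run_0 ... run_6
validation_runs = all_runs[number_of_training_runs:]  # run_7 ... run_9
assert len(train_runs) == 7 and len(validation_runs) == 3
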
 def test_clip_first_x_frames(self):
     info = generate_random_dataset_in_raw_data(output_dir=self.output_dir,
                                                num_runs=20,
                                                input_size=(100, 100, 3),
                                                output_size=(1, ),
                                                continuous=True,
                                                store_hdf5=False)
     cleaner_config_dict = {
         'output_path': self.output_dir,
         'data_loader_config': {
             'data_directories': info['episode_directories'],
             'subsample': 2
         },
         'training_validation_split': 1.0,
         'remove_first_n_timestamps': 5,
     }
     data_cleaner = DataCleaner(config=DataCleaningConfig().create(
         config_dict=cleaner_config_dict))
     data_cleaner.clean()
     data_loader = DataLoader(config=DataLoaderConfig().create(
         config_dict={
             'output_path': self.output_dir,
             'hdf5_files': glob(f'{self.output_dir}/train*.hdf5')
         }))
     data_loader.load_dataset()
     self.assertEqual(
         sum(int((e - 5) / 2) + 1 for e in info['episode_lengths']),
         len(data_loader.get_dataset()))
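
A worked check of the expected-count expression above, assuming (inferred from the formula, not from the DataLoader source) that subsampling keeps every 2nd frame of the clipped episode plus its final frame:

episode_length, clip, step = 20, 5, 2
remaining = episode_length - clip                                # 15 frames left after clipping
kept = sorted(set(range(0, remaining, step)) | {remaining - 1})  # frames 0, 2, ..., 14
assert len(kept) == int(remaining / step) + 1 == 8
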
Example #3
    def __init__(self,
                 config: EvaluatorConfig,
                 network: BaseNet,
                 quiet: bool = False):
        self._config = config
        self._net = network
        self.data_loader = DataLoader(config=self._config.data_loader_config)

        if not quiet:
            self._logger = get_logger(
                name=get_filename_without_extension(__file__),
                output_path=config.output_path,
                quiet=False) if type(self) == Evaluator else None
            cprint(f'Started.', self._logger)

        self._device = torch.device(
            "cuda" if self._config.device in ['gpu', 'cuda']
            and torch.cuda.is_available() else "cpu")
        self._criterion = eval(
            f'{self._config.criterion}(reduction=\'none\', {self._config.criterion_args_str})'
        )
        self._criterion.to(self._device)
        self._lowest_validation_loss = None
        self.data_loader.load_dataset()

        self._minimum_error = float(10**6)
        self._original_model_device = self._net.get_device() \
            if self._net is not None else None
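
The criterion above is built by eval'ing the string f"{criterion}(reduction='none', {criterion_args_str})". Assuming, purely for illustration, criterion = 'MSELoss' and an empty criterion_args_str, the evaluated expression is equivalent to constructing the loss directly:

import torch.nn as nn

# hypothetical expansion of the eval'd string for criterion = 'MSELoss'
criterion = nn.MSELoss(reduction='none')
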
    def test_create_hdf5_files_subsampled_in_time(self):
        num_runs = 10
        split = 1.0
        subsample = 3
        config_dict = {
            'output_path': self.output_dir,
            'training_validation_split': split,
            'store_hdf5': True,
            'subsample_hdf5': subsample,
            'separate_raw_data_runs': True
        }
        config = DataSaverConfig().create(config_dict=config_dict)
        self.data_saver = DataSaver(config=config)
        info = generate_dummy_dataset(self.data_saver, num_runs=num_runs)
        self.data_saver.create_train_validation_hdf5_files()

        config = DataLoaderConfig().create(
            config_dict={
                'output_path': self.output_dir,
                'hdf5_files': [os.path.join(self.output_dir, 'train.hdf5')]
            })
        training_data_loader = DataLoader(config=config)
        training_data_loader.load_dataset()
        training_data = training_data_loader.get_dataset()

        self.assertEqual(
            len(training_data),
            sum([
                np.ceil((el - 1) / subsample) + 1
                for el in info['episode_lengths']
            ]))
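
The expected-length formula above can be checked the same way, again assuming the subsampling rule keeps frames 0, subsample, 2 * subsample, ... plus the last frame of each episode (an assumption read off the formula, not verified against the implementation):

import numpy as np

def expected_length(el: int, s: int) -> int:
    # assumed rule: keep every s-th frame and always the final frame
    return len(set(range(0, el, s)) | {el - 1})

assert expected_length(10, 3) == np.ceil((10 - 1) / 3) + 1 == 4  # frames 0, 3, 6, 9
assert expected_length(6, 3) == np.ceil((6 - 1) / 3) + 1 == 3    # frames 0, 3, 5
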
 def test_split_hdf5_chunks(self):
     info = generate_random_dataset_in_raw_data(output_dir=self.output_dir,
                                                num_runs=20,
                                                input_size=(100, 100, 3),
                                                output_size=(1, ),
                                                continuous=True,
                                                store_hdf5=False)
     cleaner_config_dict = {
         'output_path': self.output_dir,
         'data_loader_config': {
             'data_directories': info['episode_directories'],
         },
         'training_validation_split': 1.0,
         'max_hdf5_size': 5 * 10**6
     }
     data_cleaner = DataCleaner(config=DataCleaningConfig().create(
         config_dict=cleaner_config_dict))
     data_cleaner.clean()
     for hdf5_file in glob(f'{self.output_dir}/train*.hdf5'):
         data_loader = DataLoader(config=DataLoaderConfig().create(
             config_dict={
                 'output_path': self.output_dir,
                 'hdf5_files': [hdf5_file]
             }))
         data_loader.load_dataset()
         self.assertTrue(
             data_loader.get_dataset().get_memory_size() < 6 * 10**6)
Example #6
    def pred_info(self,
                  model_path,
                  model_name,
                  data_path,
                  output_path=None,
                  hparams=None):
        # get outputs

        self._model = self.get_model(model_name)

        default_hparams = self._model.get_default_params()
        if hparams is not None:
            default_hparams.update_merge(hparams=hparams)
        hparams = default_hparams

        batch_size = hparams.batch_size

        data_loader = DataLoader(data_path=data_path,
                                 test_path=data_path,
                                 hparams=hparams)

        label_list = data_loader.label_list
        hparams.update(num_labels=len(label_list))

        model, sess, g = self._model_init(model=self._model,
                                          hparams=hparams,
                                          directory=model_path)

        cur_time = datetime.now()

        total_outputs = list()
        total_true_false = list()
        total_pred = list()

        for t_i, (t_data, t_labels) in enumerate(
                data_loader.batch_loader(data_loader.dataset, batch_size)):
            outputs, true_false, pred = sess.run(
                [model.outputs, model.true_false, model.pred],
                feed_dict={
                    model.x: t_data,
                    model.y: t_labels,
                    model.dropout_keep_prob: 1.0
                })
            # take only the last one, with the same size as the hidden dim
            outputs = outputs[:, -1]

            total_outputs.extend(outputs)
            total_true_false.extend(true_false)
            total_pred.extend(pred)

        if output_path is not None:
            # np.save(total_outputs, total_outputs)
            print('cannot save currently')

        return total_outputs
    def test_big_data_hdf5_loop(self):
        # create 3 datasets as hdf5 files
        hdf5_files = []
        infos = []
        for index in range(3):
            output_path = os.path.join(self.output_dir, f'ds{index}')
            os.makedirs(output_path, exist_ok=True)
            config_dict = {
                'output_path': output_path,
                'store_hdf5': True,
                'training_validation_split': 1.0
            }
            config = DataSaverConfig().create(config_dict=config_dict)
            self.data_saver = DataSaver(config=config)
            infos.append(
                generate_dummy_dataset(self.data_saver,
                                       num_runs=2,
                                       input_size=(3, 10, 10),
                                       fixed_input_value=(0.3 * index) *
                                       np.ones((3, 10, 10)),
                                       store_hdf5=True))
            self.assertTrue(
                os.path.isfile(os.path.join(output_path, 'train.hdf5')))
            hdf5_files.append(os.path.join(output_path, 'train.hdf5'))
            hdf5_files.append(os.path.join(output_path, 'wrong.hdf5'))

        # create data loader with big data tag and three hdf5 training sets
        conf = {
            'output_path': self.output_dir,
            'hdf5_files': hdf5_files,
            'batch_size': 15,
            'loop_over_hdf5_files': True
        }
        loader = DataLoader(DataLoaderConfig().create(config_dict=conf))

        # sample data batches and see that index increases every two batches sampled
        for batch in loader.get_data_batch():
            self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0)
        for batch in loader.get_data_batch():
            self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0.3,
                                   2)
        for batch in loader.get_data_batch():
            self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0.6,
                                   2)
        for batch in loader.get_data_batch():
            self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0, 2)
        for batch in loader.sample_shuffled_batch():
            self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0.3,
                                   2)
        for batch in loader.sample_shuffled_batch():
            self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0.6,
                                   2)
        for batch in loader.sample_shuffled_batch():
            self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0, 2)
    def test_data_loader_from_raw_path_dirs(self):
        self.info = generate_dummy_dataset(self.data_saver,
                                           num_runs=20,
                                           input_size=(100, 100, 3),
                                           output_size=(3, ),
                                           continuous=False)
        config_dict = {
            'data_directories': [self.output_dir],
            'output_path': self.output_dir,
        }
        config = DataLoaderConfig().create(config_dict=config_dict)
        data_loader = DataLoader(config=config)
        data_loader.load_dataset()

        config = DataLoaderConfig().create(config_dict=config_dict)
        for d in config.data_directories:
            self.assertTrue(os.path.isdir(d))
 def setUp(self) -> None:
     self.n_data = 10
     self.validation_split = 0.5
     self.batch_size = 2
     self.datapath_kor = "../data/input/aihub_kor-eng/1.구어체.xlsx"
     self.data_loader = DataLoader(
         self.datapath_kor,
         n_data=self.n_data,
         validation_split=self.validation_split,
         deu=False,
     )
     self.datapath_deu = "../data/input/deu.txt"
     self.data_loader_deu = DataLoader(
         self.datapath_deu,
         n_data=self.n_data,
         validation_split=self.validation_split,
         deu=True,
     )
Example #10
def classify() -> None:
    """Run classification of the 20 newsgroups dataset
    
    :return: None
    :rtype: None
    """
    logger = logging.getLogger(__name__)
    logger.info(f'Running {classify.__name__}')

    # setup
    logger.info('Loading config')
    app_cfg = ConfigLoader.load_config()
    model_cfg = ConfigLoader.load_config(app_cfg['model']['config'])
    logger.info(f"Using random seed: {model_cfg['random_seed']}")
    RandUtils.set_random_seed(model_cfg['random_seed'])

    # load data
    logger.info('Loading data')
    dl = DataLoader(app_cfg['paths']['data_dir'])
    splits = dl.load_splits(app_cfg['model']['data'])

    # extract features
    logger.info('Extracting features')
    logger.info(f"Vectorizer class: {app_cfg['model']['vectorizer_class']}")
    vectorizer = eval(app_cfg['model']['vectorizer_class'])(
        model_cfg['tokenizer'], model_cfg['vectorizer']
    )
    X_train, X_test = vectorizer.vectorize(
        splits['train'][0] + splits['dev'][0], splits['test'][0]
    )
    y_train = np.array(splits['train'][1] + splits['dev'][1])
    y_test = np.array(splits['test'][1])

    # run classification
    logger.info('Running model')
    logger.info(f"Model class: {app_cfg['model']['model_class']}")
    model = eval(app_cfg['model']['model_class'])(model_cfg['model'])
    results = model.train_test(X_train, X_test, y_train, y_test)

    # log results
    logger.info('Logging results')
    el = ExperimentsLogger(app_cfg['paths']['output_dir'])
    el.log_experiment(app_cfg, model_cfg, results)
    logger.info(f'Done running {classify.__name__}')
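
For orientation, a hypothetical sketch of the two config dictionaries classify() reads; the keys follow the accesses above, while every concrete value (paths, class names, seed) is a placeholder:

app_cfg = {
    'paths': {'data_dir': 'data/20news', 'output_dir': 'output'},
    'model': {
        'config': 'configs/model.yml',             # passed to ConfigLoader.load_config
        'data': '20newsgroups',                    # passed to dl.load_splits
        'vectorizer_class': 'TfidfVectorizer',     # hypothetical name, resolved via eval(...)
        'model_class': 'LogisticRegressionModel',  # hypothetical name, resolved via eval(...)
    },
}
model_cfg = {
    'random_seed': 42,
    'tokenizer': {},   # presumably kwargs for the vectorizer's tokenizer
    'vectorizer': {},  # presumably kwargs for the vectorizer itself
    'model': {},       # presumably kwargs for the model class
}
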
    def test_data_batch(self):
        self.info = generate_dummy_dataset(self.data_saver,
                                           num_runs=20,
                                           input_size=(100, 100, 3),
                                           output_size=(3, ),
                                           continuous=False)
        config_dict = {
            'data_directories': self.info['episode_directories'],
            'output_path': self.output_dir,
            'random_seed': 1,
            'batch_size': 3
        }
        data_loader = DataLoader(config=DataLoaderConfig().create(
            config_dict=config_dict))
        data_loader.load_dataset()

        for batch in data_loader.get_data_batch():
            self.assertEqual(len(batch), config_dict['batch_size'])
            break
    def test_data_loading(self):
        self.info = generate_dummy_dataset(self.data_saver,
                                           num_runs=20,
                                           input_size=(100, 100, 3),
                                           output_size=(3, ),
                                           continuous=False)
        config_dict = {
            'data_directories': self.info['episode_directories'],
            'output_path': self.output_dir,
        }
        config = DataLoaderConfig().create(config_dict=config_dict)
        data_loader = DataLoader(config=config)
        data_loader.load_dataset()

        # assert nothing is empty
        for k in ['observations', 'actions', 'rewards', 'done']:
            data = getattr(data_loader.get_dataset(), k)
            self.assertTrue(len(data) > 0)
            self.assertTrue(sum(data[0].shape) > 0)
 def test_vocab_size(self):
     num_words = 10000
     data_loader = DataLoader(
         self.datapath_kor,
         n_data=None,
         validation_split=0.1,
         deu=False,
         num_words=num_words,
     )
     print()
 def test_data_subsample(self):
     self.info = generate_dummy_dataset(self.data_saver,
                                        num_runs=20,
                                        input_size=(100, 100, 3),
                                        output_size=(3, ),
                                        continuous=False)
     subsample = 4
     config_dict = {
         'data_directories': self.info['episode_directories'],
         'output_path': self.output_dir,
         'random_seed': 1,
         'batch_size': 3,
         'subsample': subsample
     }
     data_loader = DataLoader(config=DataLoaderConfig().create(
         config_dict=config_dict))
     data_loader.load_dataset()
      self.assertEqual(
         sum([
             np.ceil((el - 1) / subsample) + 1
             for el in self.info['episode_lengths']
         ]), len(data_loader.get_dataset()))
    def __init__(self, config: DataSaverConfig):
        self._config = config
        self._logger = get_logger(
            name=get_filename_without_extension(__file__),
            output_path=self._config.output_path,
            quiet=False)
        cprint(f'initiate', self._logger)

        if not self._config.saving_directory.startswith('/'):
            self._config.saving_directory = os.path.join(
                os.environ['HOME'], self._config.saving_directory)

        if self._config.store_on_ram_only:
            self._dataset = Dataset(max_size=self._config.max_size)

        # used to keep track of replay buffer size on file system
        if not self._config.store_on_ram_only \
                and os.path.isdir(os.path.dirname(self._config.saving_directory)) \
                and self._config.max_size != -1:
            data_loader = DataLoader(config=DataLoaderConfig().create(
                config_dict={
                    'data_directories': [
                        os.path.join(
                            os.path.dirname(self._config.saving_directory),
                            run) for run in sorted(
                                os.listdir(
                                    os.path.dirname(
                                        self._config.saving_directory)))
                    ],
                    'output_path':
                    self._config.output_path,
                    'store':
                    False  # don't store config
                }))
            data_loader.load_dataset()
            self._frame_counter = len(data_loader.get_dataset())
        else:
            self._frame_counter = 0
    def test_create_train_validation_hdf5_files(self):
        num_runs = 10
        split = 0.7
        config_dict = {
            'output_path': self.output_dir,
            'training_validation_split': split,
            'store_hdf5': True,
            'separate_raw_data_runs': True
        }
        config = DataSaverConfig().create(config_dict=config_dict)
        self.data_saver = DataSaver(config=config)
        info = generate_dummy_dataset(self.data_saver, num_runs=num_runs)
        self.data_saver.create_train_validation_hdf5_files()

        config = DataLoaderConfig().create(
            config_dict={
                'output_path': self.output_dir,
                'hdf5_files': [os.path.join(self.output_dir, 'train.hdf5')]
            })
        training_data_loader = DataLoader(config=config)
        training_data_loader.load_dataset()
        training_data = training_data_loader.get_dataset()

        config = DataLoaderConfig().create(
            config_dict={
                'output_path': self.output_dir,
                'hdf5_files':
                [os.path.join(self.output_dir, 'validation.hdf5')]
            })
        validation_data_loader = DataLoader(config=config)
        validation_data_loader.load_dataset()
        validation_data = validation_data_loader.get_dataset()

        self.assertEqual(len(training_data),
                         sum(info['episode_lengths'][:int(split * num_runs)]))
        self.assertEqual(len(validation_data),
                         sum(info['episode_lengths'][int(split * num_runs):]))
    def test_generate_random_dataset_with_train_validation_hdf5(self):
        num_runs = 10
        # generate network
        network = eval(architecture_base_config['architecture']).Net(
            config=ArchitectureConfig().create(
                config_dict=architecture_base_config))

        # generate dummy dataset
        info = generate_random_dataset_in_raw_data(
            output_dir=self.output_dir,
            num_runs=num_runs,
            input_size=network.input_size,
            output_size=network.output_size,
            continuous=not network.discrete,
            store_hdf5=True)
        data_loader_config = {
            'output_path': self.output_dir,
            'hdf5_files': [os.path.join(self.output_dir, 'train.hdf5')]
        }
        data_loader = DataLoader(config=DataLoaderConfig().create(
            config_dict=data_loader_config))
        data_loader.load_dataset()
        self.assertNotEqual(
            sum(d != 0 for d in data_loader.get_dataset().done), 0)
Example #18
    def __init__(self, config: TrainerConfig, network: BaseNet, quiet: bool = False):
        super().__init__(config, network, quiet=True)

        self._config.epsilon = 0.2 if self._config.epsilon == "default" else self._config.epsilon

        self.target_data_loader = DataLoader(config=self._config.target_data_loader_config)
        self.target_data_loader.load_dataset()
        self._domain_adaptation_criterion = eval(f'{self._config.domain_adaptation_criterion}()') \
            if not self._config.domain_adaptation_criterion == 'default' else MMDLossZhao()
        self._domain_adaptation_criterion.to(self._device)

        if not quiet:
            self._optimizer = eval(f'torch.optim.{self._config.optimizer}')(params=self._net.parameters(),
                                                                            lr=self._config.learning_rate,
                                                                            weight_decay=self._config.weight_decay)

            lambda_function = lambda f: 1 - f / self._config.scheduler_config.number_of_epochs
            self._scheduler = torch.optim.lr_scheduler.LambdaLR(self._optimizer, lr_lambda=lambda_function) \
                if self._config.scheduler_config is not None else None

            self._logger = get_logger(name=get_filename_without_extension(__file__),
                                      output_path=config.output_path,
                                      quiet=False)
            cprint(f'Started.', self._logger)
    def test_generate_random_dataset_in_raw_data(self):
        num_runs = 10
        # generate network
        network = eval(architecture_base_config['architecture']).Net(
            config=ArchitectureConfig().create(
                config_dict=architecture_base_config))

        # generate dummy dataset
        info = generate_random_dataset_in_raw_data(
            output_dir=self.output_dir,
            num_runs=num_runs,
            input_size=network.input_size,
            output_size=network.output_size,
            continuous=not network.discrete,
        )
        data_loader_config = {
            'output_path': self.output_dir,
            'data_directories': info['episode_directories'],
        }
        data_loader = DataLoader(config=DataLoaderConfig().create(
            config_dict=data_loader_config))
        data_loader.load_dataset()
        self.assertEqual(sum(d != 0 for d in data_loader.get_dataset().done),
                         num_runs)
 def test_line_world_augmentation(self):
     line_image = np.ones((100, 100, 3))
     line_image[:, 40:43, 0:2] = 0
     info = generate_random_dataset_in_raw_data(
         output_dir=self.output_dir,
         num_runs=20,
         input_size=(100, 100, 3),
         output_size=(1, ),
         continuous=True,
         fixed_input_value=line_image,
         store_hdf5=False)
     cleaner_config_dict = {
         'output_path': self.output_dir,
         'data_loader_config': {
             'data_directories': info['episode_directories'],
             'input_size': (1, 64, 64)
         },
         'training_validation_split': 0.7,
         'remove_first_n_timestamps': 5,
         'binary_maps_as_target': True,
         'invert_binary_maps': True,
         'augment_background_noise': 0.1,
         'augment_background_textured': 0.9,
         'texture_directory': 'textured_dataset',
         'augment_empty_images': 0.1
     }
     data_cleaner = DataCleaner(config=DataCleaningConfig().create(
         config_dict=cleaner_config_dict))
     data_cleaner.clean()
     data_loader = DataLoader(config=DataLoaderConfig().create(
         config_dict={
             'output_path': self.output_dir,
             'hdf5_files': glob(f'{self.output_dir}/train*.hdf5')
         }))
     data_loader.load_dataset()
     data_loader.get_dataset().plot()
Example #21
import datetime
import lightgbm as lgb
import pandas as pd
import numpy as np
import gc

from src.data.data_loader import DataLoader
from src.data.preprocessor import Preprocessor

# Load files
data_loader = DataLoader()
order_products = data_loader.load_raw_file('order_products')
orders = data_loader.load_raw_file('orders')
products = data_loader.load_raw_file('products')

data = orders.merge(order_products, on='order_id', how='left')

prior = data[data.eval_set == 'prior']
train = data[data.eval_set == 'train']
test = data[data.eval_set == 'test']

# Remove products that were first ordered in the user's last order (we don't have to predict those)
train = train[train.reordered != 0]

# Column not given for test data
train.drop('add_to_cart_order', inplace=True, axis=1)

# All reorders per user
reorders = train.groupby('user_id').product_id.apply(set)

class DataCleaner:
    def __init__(self, config: DataCleaningConfig):
        self._config = config
        self._data_loader = DataLoader(config=config.data_loader_config)
        if len(config.data_loader_config.data_directories) == 0:
            self._data_loader.update_data_directories_with_raw_data()

    def clean(self):
        self._split_and_clean()

    def _split_and_clean(self):
        shuffled_runs = self._config.data_loader_config.data_directories[:]
        random.shuffle(shuffled_runs)
        num_training_runs = int(
            len(shuffled_runs) * self._config.training_validation_split)
        training_runs = shuffled_runs[:num_training_runs]
        validation_runs = shuffled_runs[num_training_runs:]
        for filename_tag, runs in zip(['train', 'validation'],
                                      [training_runs, validation_runs]):
            self._clean(filename_tag, runs)

    def _clean(self, filename_tag: str, runs: List[str]) -> None:
        total_data_points = 0
        filename_index = 0
        hdf5_data = Dataset()
        for run in tqdm(runs):
            if self._config.require_success:
                if not os.path.isfile(os.path.join(run, 'Success')):
                    continue
            # load data in dataset in input size
            run_dataset = self._data_loader.load_dataset_from_directories(
                [run])
            if len(run_dataset) <= self._config.remove_first_n_timestamps:
                continue
            # remove first N frames
            for _ in range(self._config.remove_first_n_timestamps):
                run_dataset.pop()
            # subsample
            run_dataset.subsample(self._config.data_loader_config.subsample)
            # enforce max run length
            if self._config.max_run_length != -1:
                run_dataset.clip(self._config.max_run_length)
                assert len(run_dataset) <= self._config.max_run_length
            # augment with background noise and change target to binary map

            binary_maps = parse_binary_maps(run_dataset.observations, invert=self._config.invert_binary_maps) \
                if self._config.augment_background_noise != 0 or self._config.augment_background_textured != 0 else None
            if self._config.binary_maps_as_target:
                run_dataset = set_binary_maps_as_target(
                    run_dataset,
                    invert=self._config.invert_binary_maps,
                    binary_images=binary_maps,
                    smoothen_labels=self._config.smoothen_labels)

            if self._config.augment_background_noise != 0:
                run_dataset = augment_background_noise(
                    run_dataset,
                    p=self._config.augment_background_noise,
                    binary_images=binary_maps)
            if self._config.augment_background_textured != 0:
                run_dataset = augment_background_textured(
                    run_dataset,
                    texture_directory=self._config.texture_directory,
                    p=self._config.augment_background_textured,
                    p_empty=self._config.augment_empty_images,
                    binary_images=binary_maps)
            # store dhf5 file once max dataset size is reached
            hdf5_data.extend(run_dataset)
            self._data_loader.empty_dataset()
            if hdf5_data.get_memory_size() > self._config.max_hdf5_size:
                if self._config.shuffle:
                    hdf5_data.shuffle()
                create_hdf5_file_from_dataset(filename=os.path.join(
                    self._config.output_path,
                    f'{filename_tag}_{filename_index}.hdf5'),
                                              dataset=hdf5_data)
                filename_index += 1
                total_data_points += len(hdf5_data)
                hdf5_data = Dataset()
        if len(hdf5_data) != 0:
            if self._config.shuffle:
                hdf5_data.shuffle()
            create_hdf5_file_from_dataset(filename=os.path.join(
                self._config.output_path,
                f'{filename_tag}_{filename_index}.hdf5'),
                                          dataset=hdf5_data)
            total_data_points += len(hdf5_data)
        print(f'Total data points: {total_data_points}')
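
Note that _clean writes one file per chunk as <tag>_<index>.hdf5 (train_0.hdf5, train_1.hdf5, ...), which is why the surrounding tests collect the output with a glob rather than a fixed file name. A minimal read-back sketch with a hypothetical output directory:

from glob import glob

# DataLoader and DataLoaderConfig as used throughout these examples
output_dir = '/tmp/experiment'  # hypothetical
train_chunks = sorted(glob(f'{output_dir}/train_*.hdf5'))
data_loader = DataLoader(config=DataLoaderConfig().create(
    config_dict={'output_path': output_dir, 'hdf5_files': train_chunks}))
data_loader.load_dataset()
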
Example #23
parser = argparse.ArgumentParser()
parser.add_argument("--config_json", type=str, required=True)
parser.add_argument("--data_path", type=str, required=True)
parser.add_argument("--deu", type=int, required=False)
args = parser.parse_args()

data_path = args.data_path
config_json = args.config_json
deu = args.deu

# Config
config = Config.from_json_file(config_json)
logger.setLevel(config.log_level)

# DataLoader
data_loader = DataLoader(data_path, **config.data_loader["args"], deu=deu)
dataset_train = (tf.data.Dataset.from_generator(
    data_loader.train_data_generator,
    output_types=(tf.int32, tf.int32)).shuffle(config.buffer_size).batch(
        config.batch_size, drop_remainder=True).prefetch(PREFETCH))
dataset_test = (tf.data.Dataset.from_generator(
    data_loader.test_data_generator,
    output_types=(tf.int32, tf.int32)).shuffle(config.buffer_size).batch(
        config.inference_size,
        drop_remainder=True).repeat().prefetch(PREFETCH))
dataset_test_iterator = iter(dataset_test)

# Tokenizer
logger.info("Getting Tokenizer")
tokenizers = data_loader.tokenizer
tokenizer_ori: tf.keras.preprocessing.text.Tokenizer = tokenizers.ori
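
The **config.data_loader["args"] expansion above lines up with the keyword arguments this DataLoader takes elsewhere in these examples (n_data, validation_split, num_words). A hypothetical fragment of the JSON config, where only the key names come from the accesses in the script above and all values are placeholders:

config_fragment = {
    "data_loader": {
        "args": {
            "n_data": None,           # use the full dataset
            "validation_split": 0.1,
            "num_words": 10000,
        }
    },
    "batch_size": 64,
    "buffer_size": 10000,
    "inference_size": 32,
    "log_level": "INFO",
}
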
 def __init__(self, config: DataCleaningConfig):
     self._config = config
     self._data_loader = DataLoader(config=config.data_loader_config)
     if len(config.data_loader_config.data_directories) == 0:
         self._data_loader.update_data_directories_with_raw_data()
Example #25
class Evaluator:
    def __init__(self,
                 config: EvaluatorConfig,
                 network: BaseNet,
                 quiet: bool = False):
        self._config = config
        self._net = network
        self.data_loader = DataLoader(config=self._config.data_loader_config)

        if not quiet:
            self._logger = get_logger(
                name=get_filename_without_extension(__file__),
                output_path=config.output_path,
                quiet=False) if type(self) == Evaluator else None
            cprint(f'Started.', self._logger)

        self._device = torch.device(
            "cuda" if self._config.device in ['gpu', 'cuda']
            and torch.cuda.is_available() else "cpu")
        self._criterion = eval(
            f'{self._config.criterion}(reduction=\'none\', {self._config.criterion_args_str})'
        )
        self._criterion.to(self._device)
        self._lowest_validation_loss = None
        self.data_loader.load_dataset()

        self._minimum_error = float(10**6)
        self._original_model_device = self._net.get_device() \
            if self._net is not None else None

    def put_model_on_device(self, device: str = None):
        self._original_model_device = self._net.get_device()
        self._net.set_device(
            torch.device(self._config.device) if device is None
            else torch.device(device))

    def put_model_back_to_original_device(self):
        self._net.set_device(self._original_model_device)

    def evaluate(self,
                 epoch: int = -1,
                 writer=None,
                 tag: str = 'validation') -> Tuple[str, bool]:
        self.put_model_on_device()
        total_error = []
        #        for batch in tqdm(self.data_loader.get_data_batch(), ascii=True, desc='evaluate'):
        for batch in self.data_loader.get_data_batch():
            with torch.no_grad():
                predictions = self._net.forward(batch.observations,
                                                train=False)
                targets = data_to_tensor(batch.actions).type(
                    self._net.dtype).to(self._device)
                error = self._criterion(predictions, targets).mean()
                total_error.append(error)
        error_distribution = Distribution(total_error)
        self.put_model_back_to_original_device()
        if writer is not None:
            writer.write_distribution(error_distribution, tag)
            if self._config.store_output_on_tensorboard and (epoch % 30 == 0
                                                             or tag == 'test'):
                writer.write_output_image(predictions, f'{tag}/predictions')
                writer.write_output_image(targets, f'{tag}/targets')
                writer.write_output_image(torch.stack(batch.observations),
                                          f'{tag}/inputs')

        msg = f' {tag} {self._config.criterion} {error_distribution.mean: 0.3e} [{error_distribution.std:0.2e}]'

        best_checkpoint = False
        if self._lowest_validation_loss is None or error_distribution.mean < self._lowest_validation_loss:
            self._lowest_validation_loss = error_distribution.mean
            best_checkpoint = True
        return msg, best_checkpoint

    def evaluate_extensive(self) -> None:
        """
        Extra offline evaluation methods for an extensive evaluation at the end of training
        :return: None
        """
        self.put_model_on_device('cpu')
        self.data_loader.get_dataset().subsample(10)
        dataset = self.data_loader.get_dataset()
        predictions = self._net.forward(dataset.observations,
                                        train=False).detach().cpu()
        #error = predictions - torch.stack(dataset.actions)
        self.put_model_back_to_original_device()

        # save_output_plots(output_dir=self._config.output_path,
        #                   data={'expert': np.stack(dataset.actions),
        #                         'network': predictions.numpy(),
        #                         'difference': error.numpy()})
        # create_output_video(output_dir=self._config.output_path,
        #                     observations=dataset.observations,
        #                     actions={'expert': np.stack(dataset.actions),
        #                              'network': predictions.numpy()})
        create_output_video_segmentation_network(
            output_dir=self._config.output_path,
            observations=torch.stack(dataset.observations).numpy(),
            predictions=predictions.numpy())

    def remove(self):
        self.data_loader.remove()
        [h.close() for h in self._logger.handlers]
    def test_sample_batch(self):
        self.info = generate_dummy_dataset(self.data_saver,
                                           num_runs=20,
                                           input_size=(100, 100, 3),
                                           output_size=(3, ),
                                           continuous=False)
        max_num_batches = 2
        config_dict = {
            'data_directories': self.info['episode_directories'],
            'output_path': self.output_dir,
            'random_seed': 1,
            'batch_size': 3
        }
        data_loader = DataLoader(config=DataLoaderConfig().create(
            config_dict=config_dict))
        data_loader.load_dataset()
        first_batch = []
        index = 0
        for index, batch in enumerate(
                data_loader.sample_shuffled_batch(
                    max_number_of_batches=max_num_batches)):
            if index == 0:
                first_batch = deepcopy(batch)
            self.assertEqual(len(batch), config_dict['batch_size'])
        self.assertEqual(index, max_num_batches - 1)

        # test sampling seed for reproduction
        config_dict['random_seed'] = 2
        data_loader = DataLoader(config=DataLoaderConfig().create(
            config_dict=config_dict))
        data_loader.load_dataset()
        second_batch = []
        for index, batch in enumerate(
                data_loader.sample_shuffled_batch(
                    max_number_of_batches=max_num_batches)):
            second_batch = deepcopy(batch)
            break
        self.assertNotEqual(np.sum(np.asarray(first_batch.observations[0])),
                            np.sum(np.asarray(second_batch.observations[0])))
        config_dict['random_seed'] = 1
        data_loader = DataLoader(config=DataLoaderConfig().create(
            config_dict=config_dict))
        data_loader.load_dataset()
        third_batch = []
        for index, batch in enumerate(
                data_loader.sample_shuffled_batch(
                    max_number_of_batches=max_num_batches)):
            third_batch = deepcopy(batch)
            break
        self.assertEqual(np.sum(np.asarray(first_batch.observations[0])),
                         np.sum(np.asarray(third_batch.observations[0])))
Example #27
class DomainAdaptationTrainer(Trainer):

    def __init__(self, config: TrainerConfig, network: BaseNet, quiet: bool = False):
        super().__init__(config, network, quiet=True)

        self._config.epsilon = 0.2 if self._config.epsilon == "default" else self._config.epsilon

        self.target_data_loader = DataLoader(config=self._config.target_data_loader_config)
        self.target_data_loader.load_dataset()
        self._domain_adaptation_criterion = eval(f'{self._config.domain_adaptation_criterion}()') \
            if not self._config.domain_adaptation_criterion == 'default' else MMDLossZhao()
        self._domain_adaptation_criterion.to(self._device)

        if not quiet:
            self._optimizer = eval(f'torch.optim.{self._config.optimizer}')(params=self._net.parameters(),
                                                                            lr=self._config.learning_rate,
                                                                            weight_decay=self._config.weight_decay)

            lambda_function = lambda f: 1 - f / self._config.scheduler_config.number_of_epochs
            self._scheduler = torch.optim.lr_scheduler.LambdaLR(self._optimizer, lr_lambda=lambda_function) \
                if self._config.scheduler_config is not None else None

            self._logger = get_logger(name=get_filename_without_extension(__file__),
                                      output_path=config.output_path,
                                      quiet=False)
            cprint(f'Started.', self._logger)

    def train(self, epoch: int = -1, writer=None) -> str:
        self.put_model_on_device()
        total_error = []
        task_error = []
        domain_error = []
        for source_batch, target_batch in zip(self.data_loader.sample_shuffled_batch(),
                                              self.target_data_loader.sample_shuffled_batch()):
            self._optimizer.zero_grad()
            targets = data_to_tensor(source_batch.actions).type(self._net.dtype).to(self._device)
            # task loss
            predictions = self._net.forward(source_batch.observations, train=True)
            task_loss = (1 - self._config.epsilon) * self._criterion(predictions, targets).mean()

            # add domain adaptation loss
            domain_loss = self._config.epsilon * self._domain_adaptation_criterion(
                self._net.get_features(source_batch.observations, train=True),
                self._net.get_features(target_batch.observations, train=True))

            loss = task_loss + domain_loss
            loss.backward()
            if self._config.gradient_clip_norm != -1:
                nn.utils.clip_grad_norm_(self._net.parameters(),
                                         self._config.gradient_clip_norm)
            self._optimizer.step()
            self._net.global_step += 1
            task_error.append(task_loss.cpu().detach())
            domain_error.append(domain_loss.cpu().detach())
            total_error.append(loss.cpu().detach())
        self.put_model_back_to_original_device()

        if self._scheduler is not None:
            self._scheduler.step()

        task_error_distribution = Distribution(task_error)
        domain_error_distribution = Distribution(domain_error)
        total_error_distribution = Distribution(total_error)
        if writer is not None:
            writer.set_step(self._net.global_step)
            writer.write_distribution(task_error_distribution, 'training/task_error')
            writer.write_distribution(domain_error_distribution, 'training/domain_error')
            writer.write_distribution(total_error_distribution, 'training/total_error')
            if self._config.store_output_on_tensorboard and epoch % 30 == 0:
                writer.write_output_image(predictions, 'source/predictions')
                writer.write_output_image(targets, 'source/targets')
                writer.write_output_image(torch.stack(source_batch.observations), 'source/inputs')
                writer.write_output_image(self._net.forward(target_batch.observations, train=True),
                                          'target/predictions')
                writer.write_output_image(torch.stack(target_batch.observations), 'target/inputs')

        return f' training task: {self._config.criterion} {task_error_distribution.mean: 0.3e} ' \
               f'[{task_error_distribution.std:0.2e}]' \
               f' domain: {self._config.domain_adaptation_criterion} {domain_error_distribution.mean: 0.3e} ' \
               f'[{domain_error_distribution.std:0.2e}]'
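
The objective above weights the task term by (1 - epsilon) and the domain-adaptation term by epsilon (0.2 by default). A toy numeric illustration with made-up per-batch values:

epsilon = 0.2                      # default set in __init__ above
task_term, domain_term = 1.5, 0.4  # hypothetical raw criterion values
loss = (1 - epsilon) * task_term + epsilon * domain_term
assert abs(loss - 1.28) < 1e-9     # 0.8 * 1.5 + 0.2 * 0.4
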
class TestDataLoader(unittest.TestCase):
    def setUp(self) -> None:
        self.n_data = 10
        self.validation_split = 0.5
        self.batch_size = 2
        self.datapath_kor = "../data/input/aihub_kor-eng/1.구어체.xlsx"
        self.data_loader = DataLoader(
            self.datapath_kor,
            n_data=self.n_data,
            validation_split=self.validation_split,
            deu=False,
        )
        self.datapath_deu = "../data/input/deu.txt"
        self.data_loader_deu = DataLoader(
            self.datapath_deu,
            n_data=self.n_data,
            validation_split=self.validation_split,
            deu=True,
        )

    def test_len(self):
        self.assertEqual(len(self.data_loader.data_train),
                         self.n_data * (1 - self.validation_split))
        self.assertEqual(len(self.data_loader.data_test),
                         self.n_data * self.validation_split)
        self.assertEqual(
            len(self.data_loader_deu.data_train),
            self.n_data * (1 - self.validation_split),
        )

    def test_tokenizer(self):
        texts = ["<start> 나 는 매일 저녁 배트 를 만나 러 다락방 으로 가요 . <end>"]
        sequences = self.data_loader.tokenizer.tar.texts_to_sequences(texts)
        self.assertEqual(
            self.data_loader.tokenizer.tar.sequences_to_texts(sequences),
            texts)

    def test_train_generator(self):
        it = iter(self.data_loader.train_data_generator())
        dataset = tf.data.Dataset.from_generator(
            self.data_loader.train_data_generator,
            output_types=(tf.int32, tf.int32))
        self.assertListEqual(list(next(iter(dataset))[0].numpy()),
                             list(next(it)[0]))
        dataset = tf.data.Dataset.from_generator(
            self.data_loader.train_data_generator,
            output_types=(tf.int32, tf.int32)).batch(self.batch_size)
        example = next(iter(dataset))
        dataset = tf.data.Dataset.from_generator(
            self.data_loader_deu.train_data_generator,
            output_types=(tf.int32, tf.int32)).batch(self.batch_size)
        example = next(iter(dataset))
        print(example)

    def test_test_generator(self):
        dataset = tf.data.Dataset.from_generator(
            self.data_loader.test_data_generator,
            output_types=(tf.int32,
                          tf.int32)).batch(batch_size=self.batch_size)
        example = next(iter(dataset))
        print(example)
        dataset = tf.data.Dataset.from_generator(
            self.data_loader_deu.test_data_generator,
            output_types=(tf.int32,
                          tf.int32)).batch(batch_size=self.batch_size)
        example = next(iter(dataset))
        print(example)

    def test_dataloader_german_generator_of_train_and_test(self):
        dataset_train = tf.data.Dataset.from_generator(
            self.data_loader_deu.train_data_generator,
            output_types=(tf.int32,
                          tf.int32)).batch(batch_size=self.batch_size)
        example = next(iter(dataset_train))
        print(example)
        dataset_test = tf.data.Dataset.from_generator(
            self.data_loader_deu.test_data_generator,
            output_types=(tf.int32,
                          tf.int32)).batch(batch_size=self.batch_size)
        example = next(iter(dataset_test))
        print(example)

        print(self.data_loader_deu.tokenizer.ori.index_word)
        print(self.data_loader_deu.tokenizer.ori.num_words)
        print(self.data_loader_deu.tokenizer.ori.index_docs)
        print(self.data_loader_deu.tokenizer.ori.word_docs)

    def test_vocab_size(self):
        num_words = 10000
        data_loader = DataLoader(
            self.datapath_kor,
            n_data=None,
            validation_split=0.1,
            deu=False,
            num_words=num_words,
        )
        print()
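
All of the generator tests above rely on the same tf.data.Dataset.from_generator pattern; a self-contained toy version (the generator here is a stand-in, not the project's DataLoader):

import tensorflow as tf

def toy_generator():
    # stand-in for data_loader.train_data_generator: yields (source_ids, target_ids)
    for i in range(6):
        yield [i, i + 1, i + 2], [i + 2, i + 1, i]

dataset = tf.data.Dataset.from_generator(
    toy_generator, output_types=(tf.int32, tf.int32)).batch(2, drop_remainder=True)
for source_batch, target_batch in dataset:
    print(source_batch.shape, target_batch.shape)  # (2, 3) (2, 3), printed three times
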
Example #29
    def train(self,
              data_path,
              test_path,
              output_path,
              model_name,
              hparams=None):

        self._model = self.get_model(model_name)

        default_hparams = self._model.get_default_params()
        if hparams is not None:
            default_hparams.update_merge(hparams=hparams)
            hparams = default_hparams
        else:
            hparams = default_hparams

        model, sess, g = self._model_init(model=self._model, hparams=hparams)

        epochs = hparams.epochs
        batch_size = hparams.batch_size
        learning_rate = hparams.learning_rate

        data_loader = DataLoader(data_path=data_path,
                                 test_path=test_path,
                                 output_path=output_path,
                                 hparams=hparams)

        label_list = data_loader.label_list
        hparams.update(num_labels=len(label_list))
        print('Label Length: %i' % (len(label_list)))

        global_step = 0
        print_step_interval = 500
        step_time = datetime.now()

        highest_accuracy = 0
        early_stop_count = 0

        for epoch in range(epochs):

            data_loader.reshuffle()
            avg_loss = 0.0
            avg_accuracy = 0.0

            for i, (data, labels) in enumerate(
                    data_loader.batch_loader(data_loader.dataset, batch_size)):
                # print(labels)
                # print(data, labels)
                _, loss, accuracy, logits, outputs = sess.run(
                    [
                        model.train, model.loss, model.accuracy, model.logits,
                        model.outputs
                    ],
                    feed_dict={
                        model.x: data,
                        model.y: labels,
                        model.dropout_keep_prob: 0.5,
                        model.learning_rate: learning_rate
                    })

                avg_loss += float(loss)
                avg_accuracy += float(accuracy)
                global_step += 1

                if global_step % print_step_interval == 0:
                    print(
                        '[global_step-%i] duration: %is train_loss: %f accuracy: %f'
                        % (global_step, (datetime.now() - step_time).seconds,
                           float(avg_loss / print_step_interval),
                           float(avg_accuracy / print_step_interval)))
                    avg_loss = 0
                    avg_accuracy = 0
                    step_time = datetime.now()

                if global_step % (print_step_interval * 10) == 0:

                    step_t_time = datetime.now()
                    t_avg_loss = 0.0
                    t_avg_accuracy = 0.0
                    t_batch_iter_max = len(
                        data_loader.test_dataset) / batch_size + 1

                    for t_i, (t_data, t_labels) in enumerate(
                            data_loader.batch_loader(data_loader.test_dataset,
                                                     batch_size)):
                        accuracy, logits, loss = sess.run(
                            [model.accuracy, model.logits, model.loss],
                            feed_dict={
                                model.x: t_data,
                                model.y: t_labels,
                                model.dropout_keep_prob: 1.0
                            })

                        t_avg_loss += float(loss)
                        t_avg_accuracy += float(accuracy)

                    t_avg_loss = float(t_avg_loss / t_batch_iter_max)
                    t_avg_accuracy = float(t_avg_accuracy / t_batch_iter_max)
                    current_accuracy = t_avg_accuracy

                    print(
                        '[global_step-%i] duration: %is test_loss: %f accuracy: %f'
                        % (global_step, (datetime.now() - step_t_time).seconds,
                           t_avg_loss, t_avg_accuracy))

                    if highest_accuracy < current_accuracy:
                        print('Saving model...')
                        highest_accuracy = current_accuracy
                        current_accuracy = 0
                        if output_path is not None:
                            if not exists(output_path):
                                makedirs(output_path)
                        output_full_path = join(
                            output_path, 'loss%f_acc%f_epoch%i' %
                            (avg_loss, avg_accuracy, epoch + 1))
                        self.save_session(directory=output_full_path,
                                          global_step=global_step)

                    if current_accuracy != 0:
                        early_stop_count += 1

                    step_time = datetime.now()

            if early_stop_count > 2:
                learning_rate = learning_rate * 0.90

            if early_stop_count > 5:
                print('Early stopped !')
                break
 def test_create_dataset_and_clean(self):
     info = generate_random_dataset_in_raw_data(output_dir=self.output_dir,
                                                num_runs=20,
                                                input_size=(100, 100, 3),
                                                output_size=(1, ),
                                                continuous=True,
                                                store_hdf5=False)
     cleaner_config_dict = {
         'output_path': self.output_dir,
         'data_loader_config': {
             'data_directories': info['episode_directories'],
             'input_size': (150, 150, 1)
         },
         'training_validation_split': 0.7,
     }
     data_cleaner = DataCleaner(config=DataCleaningConfig().create(
         config_dict=cleaner_config_dict))
     data_cleaner.clean()
     data_loader_train = DataLoader(config=DataLoaderConfig().create(
         config_dict={
             'output_path': self.output_dir,
             'hdf5_files': glob(f'{self.output_dir}/train*.hdf5')
         }))
     data_loader_train.load_dataset()
     data_loader_validation = DataLoader(config=DataLoaderConfig().create(
         config_dict={
             'output_path': self.output_dir,
             'hdf5_files': glob(f'{self.output_dir}/validation*.hdf5')
         }))
     data_loader_validation.load_dataset()
     ratio = len(data_loader_train.get_dataset()) / (
         0. + len(data_loader_train.get_dataset()) +
         len(data_loader_validation.get_dataset()))
     self.assertTrue(ratio > 0.6)
     self.assertTrue(ratio < 0.8)