Example #1
def run(config_file):
    config = load_config(config_file)

    os.makedirs(config.work_dir, exist_ok=True)
    save_config(config, config.work_dir + '/config.yml')

    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    all_transforms = {}
    all_transforms['train'] = get_transforms(config.transforms.train)
    all_transforms['valid'] = get_transforms(config.transforms.test)

    dataloaders = {
        phase: make_loader(
            data_folder=config.data.train_dir,
            df_path=config.data.train_df_path,
            phase=phase,
            batch_size=config.train.batch_size,
            num_workers=config.num_workers,
            idx_fold=config.data.params.idx_fold,
            transforms=all_transforms[phase],
            num_classes=config.data.num_classes,
            pseudo_label_path=config.train.pseudo_label_path,
            task='cls'
        )
        for phase in ['train', 'valid']
    }

    # create model
    model = CustomNet(config.model.encoder, config.data.num_classes)

    # train setting
    criterion = get_loss(config)
    params = [
        {'params': model.base_params(), 'lr': config.optimizer.params.encoder_lr},
        {'params': model.fresh_params(), 'lr': config.optimizer.params.decoder_lr}
    ]
    optimizer = get_optimizer(params, config)
    scheduler = get_scheduler(optimizer, config)

    # model runner
    runner = SupervisedRunner(model=model)

    callbacks = [MultiClassAccuracyCallback(threshold=0.5), F1ScoreCallback()]
    if os.path.exists(config.work_dir + '/checkpoints/best.pth'):
        callbacks.append(CheckpointCallback(resume=config.work_dir + '/checkpoints/best_full.pth'))

    # model training
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=dataloaders,
        logdir=config.work_dir,
        num_epochs=config.train.num_epochs,
        callbacks=callbacks,
        verbose=True,
        fp16=True,
    )
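
All of these snippets revolve around project-specific load_config / save_config helpers. For reference, a minimal sketch of what such helpers might look like is shown below, assuming a YAML config file and the third-party easydict package for attribute-style access; both are assumptions, and the actual helpers in these repositories may differ.

import yaml
from easydict import EasyDict as edict


def load_config(config_file):
    # Parse the YAML file and wrap it so keys are reachable via
    # attribute access (config.train.batch_size, ...).
    with open(config_file) as f:
        return edict(yaml.safe_load(f))


def _to_plain(obj):
    # Recursively convert EasyDict / dict subclasses back to plain
    # dicts so yaml.safe_dump can serialize them.
    if isinstance(obj, dict):
        return {key: _to_plain(value) for key, value in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [_to_plain(value) for value in obj]
    return obj


def save_config(config, path):
    # Dump the (possibly modified) config next to the run artifacts
    # so the experiment can be reproduced later.
    with open(path, 'w') as f:
        yaml.safe_dump(_to_plain(config), f, default_flow_style=False)
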
Example #2
    def __init__(self, cfg):
        self.cfg = cfg
        train_dataset = getDataSet(cfg['data']['train'], 'train',
                                   cfg['data']['scale'])
        self.train_loader = DataLoader(
            train_dataset,
            cfg['data']['train']['batch_size'],
            shuffle=True,
            num_workers=cfg['data']['train']['n_workers'])
        val_dataset = getDataSet(cfg['data']['val'], 'val',
                                 cfg['data']['scale'])
        self.val_loader = DataLoader(
            val_dataset,
            1,
            shuffle=False,
            num_workers=cfg['data']['val']['n_workers'])
        self.records = {'Epoch': [], 'PSNR': [], 'SSIM': []}
        self.log_dir = os.path.join(
            cfg['output_dir'], cfg['name'],
            time.strftime('%Y-%m-%d-%H:%M:%S', time.localtime(time.time())))
        self.logger = utils.Logger(os.path.join(self.log_dir, 'info.log'))
        self.max_epochs = cfg['schedule']['num_epochs']
        self.checkpoint_dir = os.path.join(self.log_dir, 'checkpoint')
        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)
        self.epoch = 1
        save_config(cfg, os.path.join(self.log_dir, 'config.yml'))

        self.logger.log('Train dataset has {} images and {} batches.'.format(
            len(train_dataset), len(self.train_loader)))
        self.logger.log('Val dataset has {} images and {} batches.'.format(
            len(val_dataset), len(self.val_loader)))
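
The constructor above reads only a handful of keys from cfg. A minimal config dict covering those keys might look like the sketch below; the concrete values are placeholders, and getDataSet will almost certainly expect additional dataset-path keys under data.train / data.val that are not shown here.

cfg = {
    'name': 'sr_baseline',                  # assumed experiment name
    'output_dir': './experiments',          # assumed output root
    'data': {
        'scale': 2,
        'train': {'batch_size': 16, 'n_workers': 4},
        'val': {'n_workers': 2},
    },
    'schedule': {'num_epochs': 100},
}
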
Example #3
    def val(self, model, sess, global_step):
        # load latest checkpoint
        model.load(sess)

        # initialize loss and score
        losses = list()
        scores = list()

        # get validation data
        val_iterator = self.data.get_val_iterator(self.batch_size)

        # define loop
        num_batches_per_epoch = (self.data.val_size - 1) // self.batch_size + 1

        for step in tqdm(range(1, num_batches_per_epoch + 1)):
            (batch_A, batch_B, batch_sentence_diff,
             batch_extra_features, labels) = next(val_iterator)
            cur_batch_length = len(batch_A)
            feed_dict = {
                model.input_A: batch_A,
                model.input_B: batch_B,
                model.dropout_keep_prob: 1,
                model.sentence_vector_diff: batch_sentence_diff,
                model.extra_features: batch_extra_features,
                model.labels: [int(l) for l in labels]
            }
            loss, score, _ = model.val(sess, feed_dict=feed_dict)
            losses.append(loss)
            scores.append(score)
        val_loss = np.mean(losses)
        val_score = np.mean(scores)

        # summarize val loss and score
        self.summary_writer.summarize(global_step,
                                      summarizer="val",
                                      summaries_dict={
                                          "score": np.array(val_score),
                                          "loss": np.array(val_loss)
                                      })

        # save as best model if it is best score
        best_loss = float(getattr(self.config, "best_loss", 1e+5))
        if val_loss < best_loss:
            self.logger.warn(
                "[Step {}] Saving for best loss : {:.5f} -> {:.5f}".format(
                    global_step, best_loss, val_loss))
            model.save(
                sess,
                os.path.join(self.checkpoint_dir, "best_loss",
                             "best_loss.ckpt"))
            setattr(self.config, "best_loss", "{:.5f}".format(val_loss))
            # save best config
            setattr(self.config, "best_step", str(self.global_step))
            setattr(self.config, "best_epoch", str(self.cur_epoch))
            save_config(self.config.checkpoint_dir, self.config)
        return val_loss, val_score
Example #4
    def val(self, model, sess, global_step):
        # Load latest checkpoint
        model.load(sess)
        sess.run(model.data_iterator.initializer)

        # Initialize loss and score
        losses = list()
        scores = list()

        # Define loop
        num_batches_per_epoch = (self.data.val_size - 1) // self.batch_size + 1
        loop = tqdm(range(1, num_batches_per_epoch + 1))

        for step in loop:
            feed_dict = {
                model.lstm_dropout_keep_prob: 1,
                model.num_negative_samples: 4,
                model.embed_dropout_keep_prob: 1,
                model.dense_dropout_keep_prob: 1
            }

            loss, score = sess.run([model.loss, model.accuracy],
                                   feed_dict=feed_dict)
            losses.append(loss)
            scores.append(score)
        val_loss = np.mean(losses)
        val_score = np.mean(scores)

        # Summarize val loss and score
        self.summary_writer.summarize(global_step,
                                      summarizer="val",
                                      summaries_dict={
                                          "score": np.array(val_score),
                                          "loss": np.array(val_loss)
                                      })

        # Save as best model if it is best score
        best_loss = float(getattr(self.config, "best_loss", 1e+5))
        if val_loss < best_loss:
            self.logger.warn(
                "[Step {}] Saving for best loss : {:.5f} -> {:.5f}".format(
                    global_step, best_loss, val_loss))
            model.save(
                sess,
                os.path.join(self.checkpoint_dir, "best_loss",
                             "best_loss.ckpt"))
            setattr(self.config, "best_loss", "{:.5f}".format(val_loss))

            # Save best config
            setattr(self.config, "best_step", str(self.global_step))
            setattr(self.config, "best_epoch", str(self.cur_epoch))
            save_config(self.config.checkpoint_dir, self.config)
        return val_loss, val_score
Example #5
    def val(self, model, sess, global_step):
        # load latest checkpoint
        model.load(sess)
        
        # initialize loss and score
        losses = list()
        scores = list()
        
        # get validation data
        val_iterator = self.data.get_val_iterator(self.batch_size)
        
        # define loop
        num_batches_per_epoch = (self.data.val_size - 1) // self.batch_size + 1
        loop = tqdm(range(1, num_batches_per_epoch+1))

        for step in loop:
            val_queries, val_replies, val_queries_lengths, val_replies_lengths = next(val_iterator)
            feed_dict = {model.input_queries: val_queries,
                         model.input_replies: val_replies,
                         model.queries_lengths: val_queries_lengths,
                         model.replies_lengths: val_replies_lengths, 
                         model.dropout_keep_prob: 1,
                         model.num_negative_samples: self.config.num_negative_samples}
            loss, score, _ = model.val(sess, feed_dict=feed_dict)
            losses.append(loss)
            scores.append(score)
        val_loss = np.mean(losses)
        val_score = np.mean(scores)

        # summarize val loss and score
        self.summary_writer.summarize(global_step,
                                      summarizer="val",
                                      summaries_dict={"score": np.array(val_score),
                                                      "loss": np.array(val_loss)})

        # save as best model if it is best score
        best_loss = float(getattr(self.config, "best_loss", 1e+5))
        if val_loss < best_loss:
            self.logger.warn("[Step {}] Saving for best loss : {:.5f} -> {:.5f}".format(global_step, best_loss, val_loss))
            model.save(sess,
                       os.path.join(self.checkpoint_dir, "best_loss", "best_loss.ckpt"))
            setattr(self.config, "best_loss", "{:.5f}".format(val_loss))
            # save best config
            setattr(self.config, "best_step", str(self.global_step))
            setattr(self.config, "best_epoch", str(self.cur_epoch))
            save_config(self.config.checkpoint_dir, self.config)
        return val_loss, val_score
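
Examples 3 to 5 share the same "save on best validation loss" bookkeeping. Stripped of the model and session specifics, it reduces to roughly the sketch below; update_best_loss and save_fn are stand-in names, not part of the original code.

def update_best_loss(config, val_loss, save_fn, global_step=None, cur_epoch=None):
    # The best loss so far is stored on the config as a string, with a
    # large default so the first validation run always triggers a save.
    best_loss = float(getattr(config, "best_loss", 1e+5))
    if val_loss < best_loss:
        save_fn()  # e.g. model.save(sess, ".../best_loss/best_loss.ckpt")
        setattr(config, "best_loss", "{:.5f}".format(val_loss))
        setattr(config, "best_step", str(global_step))
        setattr(config, "best_epoch", str(cur_epoch))
        return True
    return False
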
Example #6
def run_once(args):
    cfg, run_id, path = args

    # -- Set seed
    cfg.general.seed = utils.set_seed(cfg.general.seed)

    # -- Get data loaders
    data_loader = get_data_loader(cfg.data_loader)

    train_data = data_loader.get_train_loader()
    test_data = data_loader.get_test_loader()

    # -- Resume agent and metrics if checkpoints are available
    # TODO Resume
    if cfg.checkpoint != "":
        resume_path = path + "/" + cfg.checkpoint
        log.info("Resuming training ...")
        cfg.agent.resume = resume_path

    # -- Get agent
    agent = get_agent(cfg.agent)

    # -- Should have some kind of reporting agent
    # TODO Implement reporting agent

    # -- Init finished
    save_config(os.path.join(cfg.general.common.save_path, "ran_cfg"), cfg)

    eval_freq = cfg.train.eval_freq
    no_epochs = cfg.train.no_epochs - agent.get_train_epoch()

    for epoch in range(no_epochs):
        log.info("Train epoch: {}".format(epoch))
        agent.train(train_data)
        if epoch % eval_freq == 0:
            agent.test(test_data)
        print("Finished an epoch :D")

    with open(path + "/loss_values_train", "wb") as f:
        pickle.dump(agent.loss_values_train, f)

    with open(path + "/loss_values_test", "wb") as f:
        pickle.dump(agent.loss_values_test, f)

    agent.eval_agent()
Example #7
def run_once(args):
    cfg, run_id, path = args

    # -- Set seed
    cfg.general.seed = utils.set_seed(cfg.general.seed)

    # -- Resume agent and metrics if checkpoints are available
    # TODO Resume
    if cfg.checkpoint != "":
        # Only resume when a checkpoint name is actually configured.
        resume_path = path + "/" + cfg.checkpoint
        log.info("Resuming training ...")
        cfg.agent.resume = resume_path

    # -- Get agent
    agent = get_agent(cfg.agent)

    # -- Should have some kind of reporting agent
    # TODO Implement reporting agent

    # -- Init finished
    save_config(os.path.join(cfg.general.common.save_path, "ran_cfg"), cfg)

    agent.eval_agent()
Example #8
def main():

    args = parser.parse_args()

    ##################################################
    # DATASET
    ##################################################
    if args.model_save_path is not None:
        # Load a config file (.yml)
        params = load_config(args.config_path)
    # NOTE: Retrain the saved model from the last checkpoint
    elif args.saved_model_path is not None:
        params = load_config(os.path.join(args.saved_model_path, 'config.yml'))
    else:
        raise ValueError("Set model_save_path or saved_model_path.")

    # Load dataset
    train_data = Dataset(data_save_path=args.data_save_path,
                         backend=params['backend'],
                         input_channel=params['input_channel'],
                         use_delta=params['use_delta'],
                         use_double_delta=params['use_double_delta'],
                         data_type='train',
                         data_size=params['data_size'],
                         label_type=params['label_type'],
                         batch_size=params['batch_size'],
                         max_epoch=params['num_epoch'],
                         splice=params['splice'],
                         num_stack=params['num_stack'],
                         num_skip=params['num_skip'],
                         sort_utt=True,
                         sort_stop_epoch=params['sort_stop_epoch'],
                         tool=params['tool'],
                         num_enque=None,
                         dynamic_batching=params['dynamic_batching'])
    dev_clean_data = Dataset(data_save_path=args.data_save_path,
                             backend=params['backend'],
                             input_channel=params['input_channel'],
                             use_delta=params['use_delta'],
                             use_double_delta=params['use_double_delta'],
                             data_type='dev_clean',
                             data_size=params['data_size'],
                             label_type=params['label_type'],
                             batch_size=params['batch_size'],
                             splice=params['splice'],
                             num_stack=params['num_stack'],
                             num_skip=params['num_skip'],
                             shuffle=True,
                             tool=params['tool'])
    dev_other_data = Dataset(data_save_path=args.data_save_path,
                             backend=params['backend'],
                             input_channel=params['input_channel'],
                             use_delta=params['use_delta'],
                             use_double_delta=params['use_double_delta'],
                             data_type='dev_other',
                             data_size=params['data_size'],
                             label_type=params['label_type'],
                             batch_size=params['batch_size'],
                             splice=params['splice'],
                             num_stack=params['num_stack'],
                             num_skip=params['num_skip'],
                             shuffle=True,
                             tool=params['tool'])
    test_clean_data = Dataset(data_save_path=args.data_save_path,
                              backend=params['backend'],
                              input_channel=params['input_channel'],
                              use_delta=params['use_delta'],
                              use_double_delta=params['use_double_delta'],
                              data_type='test_clean',
                              data_size=params['data_size'],
                              label_type=params['label_type'],
                              batch_size=params['batch_size'],
                              splice=params['splice'],
                              num_stack=params['num_stack'],
                              num_skip=params['num_skip'],
                              tool=params['tool'])
    test_other_data = Dataset(data_save_path=args.data_save_path,
                              backend=params['backend'],
                              input_channel=params['input_channel'],
                              use_delta=params['use_delta'],
                              use_double_delta=params['use_double_delta'],
                              data_type='test_other',
                              data_size=params['data_size'],
                              label_type=params['label_type'],
                              batch_size=params['batch_size'],
                              splice=params['splice'],
                              num_stack=params['num_stack'],
                              num_skip=params['num_skip'],
                              tool=params['tool'])

    params['num_classes'] = train_data.num_classes

    ##################################################
    # MODEL
    ##################################################
    # Model setting
    model = load(model_type=params['model_type'],
                 params=params,
                 backend=params['backend'])

    if args.model_save_path is not None:

        # Set save path
        save_path = mkdir_join(args.model_save_path, params['backend'],
                               params['model_type'], params['label_type'],
                               params['data_size'], model.name)
        model.set_save_path(save_path)

        # Save config file
        save_config(config_path=args.config_path, save_path=model.save_path)

        # Setting for logging
        logger = set_logger(model.save_path)

        if os.path.isdir(params['char_init']):
            # NOTE: Start training from the pre-trained character model
            model.load_checkpoint(save_path=params['char_init'],
                                  epoch=-1,
                                  load_pretrained_model=True)

        # Count total parameters
        for name in sorted(list(model.num_params_dict.keys())):
            num_params = model.num_params_dict[name]
            logger.info("%s %d" % (name, num_params))
        logger.info("Total %.3f M parameters" %
                    (model.total_parameters / 1000000))

        # Define optimizer
        model.set_optimizer(optimizer=params['optimizer'],
                            learning_rate_init=float(params['learning_rate']),
                            weight_decay=float(params['weight_decay']),
                            clip_grad_norm=params['clip_grad_norm'],
                            lr_schedule=False,
                            factor=params['decay_rate'],
                            patience_epoch=params['decay_patient_epoch'])

        epoch, step = 1, 0
        learning_rate = float(params['learning_rate'])
        metric_dev_best = 1

    # NOTE: Retrain the saved model from the last checkpoint
    elif args.saved_model_path is not None:

        # Set save path
        model.save_path = args.saved_model_path

        # Setting for logging
        logger = set_logger(model.save_path, restart=True)

        # Define optimizer
        model.set_optimizer(
            optimizer=params['optimizer'],
            learning_rate_init=float(params['learning_rate']),  # on-the-fly
            weight_decay=float(params['weight_decay']),
            clip_grad_norm=params['clip_grad_norm'],
            lr_schedule=False,
            factor=params['decay_rate'],
            patience_epoch=params['decay_patient_epoch'])

        # Restore the last saved model
        epoch, step, learning_rate, metric_dev_best = model.load_checkpoint(
            save_path=args.saved_model_path, epoch=-1, restart=True)

    else:
        raise ValueError("Set model_save_path or saved_model_path.")

    train_data.epoch = epoch - 1

    # GPU setting
    model.set_cuda(deterministic=False, benchmark=True)

    logger.info('PID: %s' % os.getpid())
    logger.info('USERNAME: %s' % os.uname()[1])

    # Set process name
    setproctitle('libri_' + params['backend'] + '_' + params['model_type'] +
                 '_' + params['label_type'] + '_' + params['data_size'])

    ##################################################
    # TRAINING LOOP
    ##################################################
    # Define learning rate controller
    lr_controller = Controller(
        learning_rate_init=learning_rate,
        backend=params['backend'],
        decay_start_epoch=params['decay_start_epoch'],
        decay_rate=params['decay_rate'],
        decay_patient_epoch=params['decay_patient_epoch'],
        lower_better=True)

    # Setting for tensorboard
    if params['backend'] == 'pytorch':
        tf_writer = SummaryWriter(model.save_path)

    # Train model
    csv_steps, csv_loss_train, csv_loss_dev = [], [], []
    start_time_train = time.time()
    start_time_epoch = time.time()
    start_time_step = time.time()
    not_improved_epoch = 0
    best_model = model
    loss_train_mean = 0.
    pbar_epoch = tqdm(total=len(train_data))
    while True:
        # Compute loss in the training set (including parameter update)
        batch_train, is_new_epoch = train_data.next()
        model, loss_train_val = train_step(model,
                                           batch_train,
                                           params['clip_grad_norm'],
                                           backend=params['backend'])
        loss_train_mean += loss_train_val

        pbar_epoch.update(len(batch_train['xs']))

        if (step + 1) % params['print_step'] == 0:

            # Compute loss in the dev set
            batch_dev = dev_clean_data.next()[0]
            loss_dev = model(batch_dev['xs'],
                             batch_dev['ys'],
                             batch_dev['x_lens'],
                             batch_dev['y_lens'],
                             is_eval=True)

            loss_train_mean /= params['print_step']
            csv_steps.append(step)
            csv_loss_train.append(loss_train_mean)
            csv_loss_dev.append(loss_dev)

            # Logging by tensorboard
            if params['backend'] == 'pytorch':
                tf_writer.add_scalar('train/loss', loss_train_mean, step + 1)
                tf_writer.add_scalar('dev/loss', loss_dev, step + 1)
                for name, param in model.named_parameters():
                    name = name.replace('.', '/')
                    tf_writer.add_histogram(name,
                                            param.data.cpu().numpy(), step + 1)
                    tf_writer.add_histogram(name + '/grad',
                                            param.grad.data.cpu().numpy(),
                                            step + 1)

            duration_step = time.time() - start_time_step
            logger.info(
                "...Step:%d(epoch:%.3f) loss:%.3f(%.3f)/lr:%.5f/batch:%d/x_lens:%d (%.3f min)"
                % (step + 1, train_data.epoch_detail, loss_train_mean,
                   loss_dev, learning_rate, train_data.current_batch_size,
                   max(batch_train['x_lens']) * params['num_stack'],
                   duration_step / 60))
            start_time_step = time.time()
            loss_train_mean = 0.
        step += 1

        # Save checkpoint and evaluate model per epoch
        if is_new_epoch:
            duration_epoch = time.time() - start_time_epoch
            logger.info('===== EPOCH:%d (%.3f min) =====' %
                        (epoch, duration_epoch / 60))

            # Save figure of the loss curves
            plot_loss(csv_loss_train,
                      csv_loss_dev,
                      csv_steps,
                      save_path=model.save_path)

            if epoch < params['eval_start_epoch']:
                # Save the model
                model.save_checkpoint(model.save_path, epoch, step,
                                      learning_rate, metric_dev_best)
            else:
                start_time_eval = time.time()
                # dev
                if 'word' in params['label_type']:
                    metric_dev_epoch, _ = do_eval_wer(
                        models=[model],
                        dataset=dev_clean_data,
                        beam_width=1,
                        max_decode_len=MAX_DECODE_LEN_WORD,
                        eval_batch_size=1)
                    logger.info('  WER (dev-clean): %.3f %%' %
                                (metric_dev_epoch * 100))
                else:
                    metric_dev_epoch, wer_dev_clean_epoch, _ = do_eval_cer(
                        models=[model],
                        dataset=dev_clean_data,
                        beam_width=1,
                        max_decode_len=MAX_DECODE_LEN_CHAR,
                        eval_batch_size=1)
                    logger.info('  CER / WER (dev-clean): %.3f %% / %.3f %%' %
                                ((metric_dev_epoch * 100),
                                 (wer_dev_clean_epoch * 100)))

                if metric_dev_epoch < metric_dev_best:
                    metric_dev_best = metric_dev_epoch
                    not_improved_epoch = 0
                    best_model = copy.deepcopy(model)
                    logger.info('||||| Best Score |||||')

                    # Save the model
                    model.save_checkpoint(model.save_path, epoch, step,
                                          learning_rate, metric_dev_best)

                    # dev-other & test
                    if 'word' in params['label_type']:
                        metric_dev_other_epoch, _ = do_eval_wer(
                            models=[model],
                            dataset=dev_other_data,
                            beam_width=1,
                            max_decode_len=MAX_DECODE_LEN_WORD,
                            eval_batch_size=1)
                        logger.info('  WER (dev-other): %.3f %%' %
                                    (metric_dev_other_epoch * 100))

                        wer_test_clean, _ = do_eval_wer(
                            models=[model],
                            dataset=test_clean_data,
                            beam_width=1,
                            max_decode_len=MAX_DECODE_LEN_WORD,
                            eval_batch_size=1)
                        logger.info('  WER (test-clean): %.3f %%' %
                                    (wer_test_clean * 100))

                        wer_test_other, _ = do_eval_wer(
                            models=[model],
                            dataset=test_other_data,
                            beam_width=1,
                            max_decode_len=MAX_DECODE_LEN_WORD,
                            eval_batch_size=1)
                        logger.info('  WER (test-other): %.3f %%' %
                                    (wer_test_other * 100))

                        logger.info(
                            '  WER (test-mean): %.3f %%' %
                            ((wer_test_clean + wer_test_other) * 100 / 2))
                    else:
                        metric_dev_other_epoch, wer_dev_other_epoch, _ = do_eval_cer(
                            models=[model],
                            dataset=dev_other_data,
                            beam_width=1,
                            max_decode_len=MAX_DECODE_LEN_CHAR,
                            eval_batch_size=1)
                        logger.info(
                            '  CER / WER (dev-other): %.3f %% / %.3f %%' %
                            ((metric_dev_other_epoch * 100),
                             (wer_dev_other_epoch * 100)))

                        cer_test_clean, wer_test_clean, _ = do_eval_cer(
                            models=[model],
                            dataset=test_clean_data,
                            beam_width=1,
                            max_decode_len=MAX_DECODE_LEN_CHAR,
                            eval_batch_size=1)
                        logger.info(
                            '  CER / WER (test-clean): %.3f %% / %.3f %%' %
                            ((cer_test_clean * 100), (wer_test_clean * 100)))

                        cer_test_other, wer_test_other, _ = do_eval_cer(
                            models=[model],
                            dataset=test_other_data,
                            beam_width=1,
                            max_decode_len=MAX_DECODE_LEN_CHAR,
                            eval_batch_size=1)
                        logger.info(
                            '  CER / WER (test-other): %.3f %% / %.3f %%' %
                            ((cer_test_other * 100), (wer_test_other * 100)))

                        logger.info(
                            '  CER / WER (test-mean): %.3f %% / %.3f %%' %
                            (((cer_test_clean + cer_test_other) * 100 / 2),
                             ((wer_test_clean + wer_test_other) * 100 / 2)))

                else:
                    not_improved_epoch += 1

                duration_eval = time.time() - start_time_eval
                logger.info('Evaluation time: %.3f min' % (duration_eval / 60))

                # Early stopping
                if not_improved_epoch == params['not_improved_patient_epoch']:
                    break

                # Update learning rate
                model.optimizer, learning_rate = lr_controller.decay_lr(
                    optimizer=model.optimizer,
                    learning_rate=learning_rate,
                    epoch=epoch,
                    value=metric_dev_epoch)

                if epoch == params['convert_to_sgd_epoch']:
                    # Convert to fine-tuning stage
                    model.set_optimizer(
                        'sgd',
                        learning_rate_init=learning_rate,
                        weight_decay=float(params['weight_decay']),
                        clip_grad_norm=params['clip_grad_norm'],
                        lr_schedule=False,
                        factor=params['decay_rate'],
                        patience_epoch=params['decay_patient_epoch'])
                    logger.info('========== Convert to SGD ==========')

                    # Inject Gaussian noise to all parameters
                    if float(params['weight_noise_std']) > 0:
                        model.weight_noise_injection = True

            pbar_epoch = tqdm(total=len(train_data))
            print('========== EPOCH:%d (%.3f min) ==========' %
                  (epoch, duration_epoch / 60))

            if epoch == params['num_epoch']:
                break

            start_time_step = time.time()
            start_time_epoch = time.time()
            epoch += 1

    # TODO: evaluate the best model by beam search here

    duration_train = time.time() - start_time_train
    logger.info('Total time: %.3f hour' % (duration_train / 3600))

    if params['backend'] == 'pytorch':
        tf_writer.close()

    # Training was finished correctly
    with open(os.path.join(model.save_path, 'COMPLETE'), 'w') as f:
        f.write('')
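
The per-step logging in the training loop above relies on the standard SummaryWriter API. In isolation the pattern is just the sketch below, assuming torch.utils.tensorboard (the example may instead use tensorboardX, which exposes the same add_scalar call); the loss values here are dummy numbers for illustration.

from torch.utils.tensorboard import SummaryWriter

# Assumed log directory; Example 8 passes model.save_path instead.
tf_writer = SummaryWriter('runs/libri_demo')

for step, (loss_train, loss_dev) in enumerate([(2.3, 2.5), (1.9, 2.2)], start=1):
    # Scalar curves appear under the "train" and "dev" tags in TensorBoard.
    tf_writer.add_scalar('train/loss', loss_train, step)
    tf_writer.add_scalar('dev/loss', loss_dev, step)

tf_writer.close()
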
Example #9
def main():
    global parser, args
    # arguments
    parser = argparse.ArgumentParser(description='byol-lightning-test')
    # Architecture & hyper-parameter
    parser.add_argument(
        '--arch',
        '-a',
        metavar='ARCH',
        default='resnet',
        help='model architecture: | [resnet, ...] (default: resnet18)')
    parser.add_argument('--depth', type=int, default=18, help='Model depth.')
    parser.add_argument('-c',
                        '--checkpoint',
                        default='../checkpoints',
                        type=str,
                        metavar='PATH',
                        help='path to save checkpoint (default: checkpoint)')
    parser.add_argument('--epoch', type=int, default=100, help='Epoch')
    parser.add_argument('--batch-size', type=int, default=32, help='Batch size')
    parser.add_argument('--lr',
                        '--learning-rate',
                        default=1,
                        type=float,
                        metavar='LR',
                        help='initial learning rate')
    parser.add_argument('--num-classes', type=int, default=100, help='Number of classes')
    parser.add_argument('--from-scratch',
                        action='store_true',
                        default=False,
                        help='train from scratch instead of loading a pre-trained model')
    parser.add_argument('--tune-all',
                        action='store_true',
                        default=False,
                        help='fine-tune all layers instead of only the final layer')

    # Device options
    parser.add_argument('--manualSeed', type=int, help='manual seed')
    parser.add_argument('--gpu-id',
                        default='0',
                        type=str,
                        help='id(s) for CUDA_VISIBLE_DEVICES')
    parser.add_argument('--model-path',
                        '--mp',
                        type=str,
                        help='byol trained model path')
    # Paths
    parser.add_argument('-d', '--dataset', default='neu', type=str)
    parser.add_argument(
        '--image_folder',
        type=str,
        required=True,
        help='path to your folder of images for self-supervised learning')
    parser.add_argument('--board-path',
                        '--bp',
                        default='../board',
                        type=str,
                        help='tensorboardx path')
    parser.add_argument('--board-tag',
                        '--tg',
                        default='fine-tuned',
                        type=str,
                        help='tensorboardx writer tag')
    args = parser.parse_args()

    # Use CUDA
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id
    use_cuda = torch.cuda.is_available()
    # Torch Seed
    # Random seed
    if args.manualSeed is None:
        args.manualSeed = random.randint(1, 10000)
    # Random Lib Seed
    random.seed(args.manualSeed)
    # Numpy Seed
    np.random.seed(args.manualSeed)
    if use_cuda:
        torch.cuda.manual_seed_all(args.manualSeed)

    # constants
    args.image_size = 256
    args.workers = multiprocessing.cpu_count()

    args.task_time = datetime.now().isoformat()
    output_name = "{}{:d}-bs{:d}-lr{:.5f}-{}".format(args.arch, args.depth,
                                                     args.batch_size, args.lr,
                                                     args.board_tag)
    args.checkpoint = os.path.join(args.checkpoint, args.dataset, output_name,
                                   args.task_time)
    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)
    config.save_config(args, os.path.join(args.checkpoint, "config.txt"))

    writer_train = SummaryWriter(log_dir=os.path.join(
        args.board_path, args.dataset, output_name, args.task_time, "train"))
    writer_test = SummaryWriter(log_dir=os.path.join(
        args.board_path, args.dataset, output_name, args.task_time, "test"))

    if args.arch == "resnet":
        if args.depth == 18:
            model = models.resnet18(pretrained=False).cuda()
        elif args.depth == 34:
            model = models.resnet34(pretrained=False).cuda()
        elif args.depth == 50:
            model = models.resnet50(pretrained=False).cuda()
        elif args.depth == 101:
            model = models.resnet101(pretrained=False).cuda()
        else:
            raise ValueError("Unsupported depth: {}".format(args.depth))

    if not args.from_scratch:
        checkpoint = torch.load(args.model_path)
        model.load_state_dict(checkpoint)
    print("\t==> Fine tune full layers? : {}".format(str(args.tune_all)))
    # Simple manual fine-tuning logic:
    # unless --tune-all is set, only the final layer is fine-tuned.
    if not args.tune_all:
        params = model.parameters()
        for param in params:
            param.requires_grad = False
    model.num_classes = args.num_classes
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, args.num_classes)

    model = torch.nn.DataParallel(model).cuda()
    opt = torch.optim.Adam(model.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss().cuda()
    softmax = nn.Softmax(1).cuda()

    # Data loading code
    traindir = os.path.join(args.image_folder, 'train')
    testdir = os.path.join(args.image_folder, 'test')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    trainloader = torch.utils.data.DataLoader(
        datasets.ImageFolder(
            traindir,
            transforms.Compose([
                transforms.Resize(args.image_size),
                transforms.RandomHorizontalFlip(),
                transforms.RandomVerticalFlip(),
                transforms.ColorJitter(0.4, 0.4, 0.4),
                transforms.ToTensor(),
                # normalize,
            ])),
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.workers,
        pin_memory=True)
    testloader = torch.utils.data.DataLoader(
        datasets.ImageFolder(
            testdir,
            transforms.Compose([
                transforms.Resize(args.image_size),
                transforms.ToTensor(),
                # normalize,
            ])),
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)

    losses_train = AverageMeter()
    top1_train = AverageMeter()
    top5_train = AverageMeter()
    losses_test = AverageMeter()
    top1_test = AverageMeter()
    top5_test = AverageMeter()

    for epoch in range(args.epoch):
        bar_train = Bar('Processing', max=len(trainloader))
        bar_test = Bar('Processing', max=len(testloader))
        train(model, criterion, opt, softmax, bar_train, epoch, trainloader,
              losses_train, top1_train, top5_train, writer_train)
        test(model, criterion, softmax, bar_test, epoch, testloader,
             losses_test, top1_test, top5_test, writer_test)
    # save your improved network
    torch.save(model.state_dict(),
               os.path.join(args.checkpoint, 'byol-finetune.pt'))
Example #10
#     log_dir=os.path.join(args.board_path, args.dataset, "{}-{}{:d}-bs{:d}-lr{:.5f}-{}".format(args.arch,
#                                                                                               args.depth,
#                                                                                               args.batch_size,
#                                                                                               args.lr,
#                                                                                               args.board_tag),
#                          task_time, "train"))
args.task_time = datetime.now().isoformat()
output_name = "{}{:d}-bs{:d}-lr{:.5f}-{}".format(args.arch,
                                                 args.depth,
                                                 args.batch_size,
                                                 args.lr,
                                                 args.board_tag)
args.checkpoint = os.path.join(args.checkpoint, args.dataset, output_name, args.task_time)
if not os.path.isdir(args.checkpoint):
    mkdir_p(args.checkpoint)
config.save_config(args, os.path.join(args.checkpoint, "config.txt"))

writer_train = SummaryWriter(
    log_dir=os.path.join(args.board_path, args.dataset, output_name, args.task_time))

normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])


def expand_greyscale(t):
    return t.expand(3, -1, -1)


class ImagesDataset(Dataset):
    def __init__(self, folder, image_size):
        super().__init__()
Example #11
def run(config_file):
    config = load_config(config_file)

    if not os.path.exists(config.work_dir):
        os.makedirs(config.work_dir, exist_ok=True)
    save_config(config, config.work_dir + '/config.yml')

    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    all_transforms = {}
    all_transforms['train'] = get_transforms(config.transforms.train)
    all_transforms['valid'] = get_transforms(config.transforms.test)

    dataloaders = {
        phase: make_loader(data_folder=config.data.train_dir,
                           df_path=config.data.train_df_path,
                           phase=phase,
                           batch_size=config.train.batch_size,
                           num_workers=config.num_workers,
                           idx_fold=config.data.params.idx_fold,
                           transforms=all_transforms[phase],
                           num_classes=config.data.num_classes,
                           pseudo_label_path=config.train.pseudo_label_path,
                           debug=config.debug)
        for phase in ['train', 'valid']
    }

    # create segmentation model with pre trained encoder
    model = getattr(smp, config.model.arch)(
        encoder_name=config.model.encoder,
        encoder_weights=config.model.pretrained,
        classes=config.data.num_classes,
        activation=None,
    )

    # train setting
    criterion = get_loss(config)
    params = [
        {
            'params': model.decoder.parameters(),
            'lr': config.optimizer.params.decoder_lr
        },
        {
            'params': model.encoder.parameters(),
            'lr': config.optimizer.params.encoder_lr
        },
    ]
    optimizer = get_optimizer(params, config)
    scheduler = get_scheduler(optimizer, config)

    # model runner
    runner = SupervisedRunner(model=model)

    callbacks = [DiceCallback(), IouCallback()]

    # to resume from check points if exists
    if os.path.exists(config.work_dir + '/checkpoints/best.pth'):
        callbacks.append(
            CheckpointCallback(resume=config.work_dir +
                               '/checkpoints/best_full.pth'))

    if config.train.mixup:
        callbacks.append(MixupCallback())

    if config.train.cutmix:
        callbacks.append(CutMixCallback())

    # model training
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=dataloaders,
        logdir=config.work_dir,
        num_epochs=config.train.num_epochs,
        callbacks=callbacks,
        verbose=True,
        fp16=True,
    )
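
get_optimizer is project code, but the two-entry params list above maps directly onto PyTorch's per-group learning rates. A minimal sketch with a plain Adam optimizer is shown below; smp.Unet is picked as a concrete architecture and the learning-rate values are placeholders.

import torch
import segmentation_models_pytorch as smp

model = smp.Unet(encoder_name='resnet34', encoder_weights=None,
                 classes=4, activation=None)

# Separate learning rates for the randomly initialised decoder and the
# (usually pre-trained) encoder, mirroring the params list in the example.
optimizer = torch.optim.Adam([
    {'params': model.decoder.parameters(), 'lr': 3e-3},
    {'params': model.encoder.parameters(), 'lr': 3e-4},
])
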
Example #12
    def val(self, model, sess, global_step):
        # load latest checkpoint
        model.load(sess)
        sess.run(model.data_iterator.initializer)

        # initialize loss and score
        losses = list()
        val_queries, val_replies, val_generated_replies = [], [], []

        # define loop
        num_batches_per_epoch = (self.data.val_size - 1) // self.batch_size + 1
        loop = tqdm(range(1, num_batches_per_epoch + 1))

        for step in loop:
            feed_dict = {model.dropout_keep_prob: 1}
            queries, replies, generated_replies, loss = model.val(
                sess, feed_dict=feed_dict)

            queries = [
                " ".join([token.decode("utf-8") for token in query_tokens])
                for query_tokens in queries
            ]
            replies = [
                " ".join([token.decode("utf-8") for token in reply_tokens])
                for reply_tokens in replies
            ]
            generated_replies = [
                " ".join([token.decode("utf-8") for token in generated_reply_tokens])
                for generated_reply_tokens in generated_replies
            ]

            val_queries.extend(queries)
            val_replies.extend(replies)
            val_generated_replies.extend(generated_replies)
            losses.append(loss)

        val_loss = np.mean(losses)

        # summarize val loss and score
        self.summary_writer.summarize(
            global_step,
            summarizer="val",
            summaries_dict={"loss": np.array(val_loss)})

        # display some generated samples
        random_indices = sorted(
            np.random.choice(100, 10, replace=False).tolist())

        for idx in random_indices:
            self.logger.info(
                self.generation_summary.format(val_queries[idx],
                                               val_replies[idx],
                                               val_generated_replies[idx]))

        # save as best model if it is best loss
        best_loss = float(getattr(self.config, "best_loss", 1e+5))
        if val_loss < best_loss:
            self.logger.warn(
                "[Step {}] Saving for best loss : {:.5f} -> {:.5f}".format(
                    global_step, best_loss, val_loss))
            model.save(
                sess,
                os.path.join(self.checkpoint_dir, "best_loss",
                             "best_loss.ckpt"))
            setattr(self.config, "best_loss", "{:.5f}".format(val_loss))
            # save best config
            setattr(self.config, "best_step", str(self.global_step))
            setattr(self.config, "best_epoch", str(self.cur_epoch))
            save_config(self.config.checkpoint_dir, self.config)
            with open(
                    os.path.join(self.checkpoint_dir, "best_loss",
                                 "generated_result.txt"), "w") as f:
                for query, reply, generated_reply in zip(
                        val_queries, val_replies, val_generated_replies):
                    f.write("{}\t{}\t{}\n".format(query, reply,
                                                  generated_reply))
        return val_loss
Example #13
def run(config_file):
    config = load_config(config_file)
    #set up the environment flags for working with the KAGGLE GPU OR COLAB_GPU
    if 'COLAB_GPU' in os.environ:
        config.work_dir = '/content/drive/My Drive/kaggle_cloud/' + config.work_dir
    elif 'KAGGLE_WORKING_DIR' in os.environ:
        config.work_dir = '/kaggle/working/' + config.work_dir
    print('working directory:', config.work_dir)

    #save the configuration to the working dir
    if not os.path.exists(config.work_dir):
        os.makedirs(config.work_dir, exist_ok=True)
    save_config(config, config.work_dir + '/config.yml')

    #Enter the GPUS you have,
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

    all_transforms = {}
    all_transforms['train'] = get_transforms(config.transforms.train)
    #our dataset has an explicit validation folder, use that later.
    all_transforms['valid'] = get_transforms(config.transforms.test)

    print("before rajat config", config.data.height, config.data.width)
    #fetch the dataloaders we need
    dataloaders = {
        phase: make_loader(data_folder=config.data.train_dir,
                           df_path=config.data.train_df_path,
                           phase=phase,
                           img_size=(config.data.height, config.data.width),
                           batch_size=config.train.batch_size,
                           num_workers=config.num_workers,
                           idx_fold=config.data.params.idx_fold,
                           transforms=all_transforms[phase],
                           num_classes=config.data.num_classes,
                           pseudo_label_path=config.train.pseudo_label_path,
                           debug=config.debug)
        for phase in ['train', 'valid']
    }

    #creating the segmentation model with pre-trained encoder
    '''
    dumping the parameters for smp library
    encoder_name: str = "resnet34",
    encoder_depth: int = 5,
    encoder_weights: str = "imagenet",
    decoder_use_batchnorm: bool = True,
    decoder_channels: List[int] = (256, 128, 64, 32, 16),
    decoder_attention_type: Optional[str] = None,
    in_channels: int = 3,
    classes: int = 1,
    activation: Optional[Union[str, callable]] = None,
    aux_params: Optional[dict] = None,
    '''
    model = getattr(smp, config.model.arch)(
        encoder_name=config.model.encoder,
        encoder_weights=config.model.pretrained,
        classes=config.data.num_classes,
        activation=None,
    )

    #fetch the loss
    criterion = get_loss(config)
    params = [
        {
            'params': model.decoder.parameters(),
            'lr': config.optimizer.params.decoder_lr
        },
        {
            'params': model.encoder.parameters(),
            'lr': config.optimizer.params.encoder_lr
        },
    ]
    optimizer = get_optimizer(params, config)
    scheduler = get_scheduler(optimizer, config)
    '''
    dumping the catalyst supervised runner
    https://github.com/catalyst-team/catalyst/blob/master/catalyst/dl/runner/supervised.py

    model (Model): Torch model object
    device (Device): Torch device
    input_key (str): Key in batch dict mapping for model input
    output_key (str): Key in output dict model output
        will be stored under
    input_target_key (str): Key in batch dict mapping for target
    '''

    runner = SupervisedRunner(model=model, device=get_device())

    #@pavel,srk,rajat,vladimir,pudae check the IOU and the Dice Callbacks

    callbacks = [DiceCallback(), IouCallback()]

    #adding patience
    if config.train.early_stop_patience > 0:
        callbacks.append(
            EarlyStoppingCallback(patience=config.train.early_stop_patience))

    # thanks for handling the distributed training
    '''
    zero_grad is taken only after accumulating gradients for accumulation_steps batches
    '''
    if config.train.accumulation_size > 0:
        accumulation_steps = config.train.accumulation_size // config.train.batch_size
        callbacks.extend([
            CriterionCallback(),
            OptimizerCallback(accumulation_steps=accumulation_steps)
        ])

    # to resume from check points if exists
    if os.path.exists(config.work_dir + '/checkpoints/best.pth'):
        callbacks.append(
            CheckpointCallback(resume=config.work_dir +
                               '/checkpoints/last_full.pth'))
    '''
    pudae, please add the callback
    https://arxiv.org/pdf/1710.09412.pdf
    **srk adding the mixup callback
    '''
    if config.train.mixup:
        callbacks.append(MixupCallback())
    if config.train.cutmix:
        callbacks.append(CutMixCallback())
    '''@rajat implemented cutmix, a weighted combination of cutout and mixup'''
    callbacks.append(MixupCallback())
    callbacks.append(CutMixCallback())
    '''
    rajat introducing training loop
    https://github.com/catalyst-team/catalyst/blob/master/catalyst/dl/runner/supervised.py
    take care of the nvidias fp16 precision
    '''
    print(config.work_dir)
    print(config.train.minimize_metric)
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=dataloaders,
        logdir=config.work_dir,
        num_epochs=config.train.num_epochs,
        main_metric=config.train.main_metric,
        minimize_metric=config.train.minimize_metric,
        callbacks=callbacks,
        verbose=True,
        fp16=False,
    )
Example #14
def run(config_file):
    config = load_config(config_file)
    if 'COLAB_GPU' in os.environ:
        config.work_dir = '/content/drive/My Drive/kaggle_cloud/' + config.work_dir
    elif 'KAGGLE_WORKING_DIR' in os.environ:
        config.work_dir = '/kaggle/working/' + config.work_dir
    print('working directory:', config.work_dir)

    if not os.path.exists(config.work_dir):
        os.makedirs(config.work_dir, exist_ok=True)
    save_config(config, config.work_dir + '/config.yml')

    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    all_transforms = {}
    all_transforms['train'] = get_transforms(config.transforms.train)
    all_transforms['valid'] = get_transforms(config.transforms.test)

    dataloaders = {
        phase: make_loader(
            data_folder=config.data.train_dir,
            df_path=config.data.train_df_path,
            phase=phase,
            img_size=(config.data.height, config.data.width),
            batch_size=config.train.batch_size,
            num_workers=config.num_workers,
            idx_fold=config.data.params.idx_fold,
            transforms=all_transforms[phase],
            num_classes=config.data.num_classes,
            pseudo_label_path=config.train.pseudo_label_path,
            debug=config.debug
        )
        for phase in ['train', 'valid']
    }

    # create segmentation model with pre trained encoder
    model = getattr(smp, config.model.arch)(
        encoder_name=config.model.encoder,
        encoder_weights=config.model.pretrained,
        classes=config.data.num_classes,
        activation=None,
    )

    # train setting
    criterion = get_loss(config)
    params = [
        {'params': model.decoder.parameters(), 'lr': config.optimizer.params.decoder_lr},
        {'params': model.encoder.parameters(), 'lr': config.optimizer.params.encoder_lr},
    ]
    optimizer = get_optimizer(params, config)
    scheduler = get_scheduler(optimizer, config)

    # model runner
    runner = SupervisedRunner(model=model, device=get_device())

    callbacks = [DiceCallback(), IouCallback()]

    if config.train.early_stop_patience > 0:
        callbacks.append(EarlyStoppingCallback(
            patience=config.train.early_stop_patience))

    if config.train.accumulation_size > 0:
        accumulation_steps = config.train.accumulation_size // config.train.batch_size
        callbacks.extend(
            [CriterionCallback(),
             OptimizerCallback(accumulation_steps=accumulation_steps)]
        )

    # to resume from check points if exists
    if os.path.exists(config.work_dir + '/checkpoints/best.pth'):
        callbacks.append(CheckpointCallback(
            resume=config.work_dir + '/checkpoints/last_full.pth'))

    if config.train.mixup:
        callbacks.append(MixupCallback())

    if config.train.cutmix:
        callbacks.append(CutMixCallback())

    # model training
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=dataloaders,
        logdir=config.work_dir,
        num_epochs=config.train.num_epochs,
        main_metric=config.train.main_metric,
        minimize_metric=config.train.minimize_metric,
        callbacks=callbacks,
        verbose=True,
        fp16=True,
    )
Example #15
def cli_main():

    parser = options.get_training_parser()
    parser.add_argument(
        '--config',
        type=str,
        nargs='*',
        help=
        'paths to JSON files of experiment configurations, from high to low priority',
    )
    parser.add_argument('--exp-name',
                        type=str,
                        default='',
                        help='name of the experiment')
    parser.add_argument(
        '--debug',
        default=False,
        action='store_true',
        help='run training in the debugging mode',
    )
    parser.add_argument('--path-attributes',
                        type=str,
                        nargs='*',
                        default=['task', 'arch', 'lr'])
    parser.add_argument('--torch-file-system', action='store_true')

    pre_parsed_args, unknown = parser.parse_known_args()

    config_dict = {}
    for config_path in (pre_parsed_args.config or []):
        config_dict = update_config(config_dict, compose_configs(config_path))

    parser_modifier = modify_factory(config_dict)

    args = options.parse_args_and_arch(parser, modify_parser=parser_modifier)

    update_namespace(args, config_dict)

    # set sharing strategy file system in case /dev/shm/ limits are small
    if args.torch_file_system:
        torch.multiprocessing.set_sharing_strategy('file_system')

    training_name = get_training_name(args)
    base_save_dir = generate_save_dir(args, training_name, sys.argv[1:])
    setattr(args, 'training_name', training_name)
    setattr(args, 'save_dir', os.path.join(base_save_dir, 'checkpoints'))
    setattr(args, 'tensorboard_logdir',
            os.path.join(base_save_dir, 'tensorboard'))

    save_config(vars(args), base_save_dir)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(
            port=port)
        args.distributed_rank = None  # set based on device id
        if (args.update_freq is not None and max(args.update_freq) > 1
                and args.ddp_backend != 'no_c10d'):
            logger.info(
                'NOTE: you may get faster training with: --ddp-backend=no_c10d'
            )
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )

    else:
        # single GPU training
        main(args)
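
compose_configs and update_config are helpers from the surrounding project. A plausible sketch of what they might do is given below; treating earlier files as higher priority follows the --config help text, but the real implementation may resolve conflicts differently.

import json


def compose_configs(path):
    # Assumed behaviour: read a single JSON experiment configuration.
    with open(path) as f:
        return json.load(f)


def update_config(base, new):
    # Recursively merge `new` into `base`, keeping keys that are already
    # present so that earlier (higher-priority) config files win.
    for key, value in new.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            base[key] = update_config(base[key], value)
        elif key not in base:
            base[key] = value
    return base
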