def create_supervised_classification_trainer(model,
                                             loss_fn,
                                             optimizer,
                                             val_loader,
                                             learning_rate_scheduler,
                                             callback=None,
                                             use_cuda=None):
    """
    Todo: Add description
    :param model:
    :param loss_fn:
    :param optimizer:
    :param val_loader:
    :param learning_rate_scheduler:
    :param callback:
    :param use_cuda:
    :return:
    """

    if use_cuda and not torch.cuda.is_available():
        raise RuntimeError(
            'use_cuda was requested, but CUDA is not available')

    if use_cuda and torch.cuda.is_available():
        device = torch.device('cuda:0')
        torch.backends.cudnn.benchmark = True
        if torch.cuda.device_count() > 1 and not isinstance(
                model, nn.DataParallel):
            model = nn.DataParallel(model)
            print("Using {} gpus for training".format(
                torch.cuda.device_count()))
    else:
        device = torch.device('cpu')

    trainer = create_trainer(model=model,
                             optimizer=optimizer,
                             loss_fn=loss_fn,
                             metrics={
                                 'top_1_accuracy': CategoricalAccuracy(),
                                 'top_5_accuracy': TopKCategoricalAccuracy(),
                                 'loss': Loss(loss_fn),
                             },
                             device=device)

    evaluator = create_supervised_classification_evaluator(
        model, loss_fn, use_cuda)

    if learning_rate_scheduler:
        trainer.add_event_handler(Events.EPOCH_STARTED,
                                  lambda _: learning_rate_scheduler.step())

    if callback is not None:
        trainer.add_event_handler(Events.ITERATION_COMPLETED, callback, model)

    trainer.add_event_handler(Events.EPOCH_COMPLETED, log_training_results,
                              optimizer)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, run_evaluation,
                              evaluator, val_loader)

    return trainer, evaluator
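
# A minimal wiring sketch for the factory above (not part of the original
# snippet): Net, get_data_loaders and the hyper-parameter values are
# placeholders assumed to exist elsewhere in the project.
def _example_usage():
    model = Net()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    loss_fn = nn.CrossEntropyLoss()
    train_loader, val_loader = get_data_loaders(train_batch_size=64,
                                                val_batch_size=256)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10)

    trainer, evaluator = create_supervised_classification_trainer(
        model=model,
        loss_fn=loss_fn,
        optimizer=optimizer,
        val_loader=val_loader,
        learning_rate_scheduler=scheduler,
        use_cuda=torch.cuda.is_available())
    trainer.run(train_loader, max_epochs=20)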
Example #2
def test_pbar_with_metric():

    n_iters = 20
    batch_size = 10
    n_classes = 2
    data = list(range(n_iters))
    y_true_batch_values = iter(
        np.random.randint(0, n_classes, size=(n_iters, batch_size)))
    y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes))
    loss_values = iter(range(n_iters))

    def step(engine, batch):
        loss_value = next(loss_values)
        y_true_batch = next(y_true_batch_values)
        y_pred_batch = next(y_pred_batch_values)
        return loss_value, torch.from_numpy(y_pred_batch), torch.from_numpy(
            y_true_batch)

    trainer = Engine(step)

    accuracy = CategoricalAccuracy(output_transform=lambda x: (x[1], x[2]))
    accuracy.attach(trainer, "avg_accuracy")

    pbar = ProgressBar()
    pbar.attach(trainer, ['avg_accuracy'])

    with pytest.raises(KeyError):
        trainer.run(data=data, max_epochs=1)
def create_supervised_classification_evaluator(model, loss_fn, use_cuda):
    """
    Create an evaluator
    :param model:
    :param loss_fn:
    :param use_cuda:
    :return:
    """

    if use_cuda and torch.cuda.is_available():
        device = torch.device('cuda:0')
        # let cuDNN pick the fastest convolution algorithms for fixed-size inputs
        torch.backends.cudnn.benchmark = True
        if torch.cuda.device_count() > 1 and not isinstance(
                model, nn.DataParallel):
            model = nn.DataParallel(model)
            logger.info("Using %d gpus for training",
                        torch.cuda.device_count())
    else:
        device = torch.device('cpu')

    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'top_1_accuracy':
                                                CategoricalAccuracy(),
                                                'top_5_accuracy':
                                                TopKCategoricalAccuracy(),
                                                'loss':
                                                Loss(loss_fn)
                                            },
                                            device=device)
    return evaluator
Example #4
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval):
    cuda = torch.cuda.is_available()
    train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size)

    model = Net()
    if cuda:
        model = model.cuda()
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    trainer = create_supervised_trainer(model, optimizer, F.nll_loss, cuda=cuda)
    evaluator = create_supervised_evaluator(model,
                                            metrics={'accuracy': CategoricalAccuracy(),
                                                     'nll': Loss(F.nll_loss)},
                                            cuda=cuda)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            print("Epoch[{}] Iteration[{}/{}] Loss: {:.2f}"
                  "".format(engine.state.epoch, iter, len(train_loader), engine.state.output))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        print("Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
              .format(engine.state.epoch, avg_accuracy, avg_nll))

    trainer.run(train_loader, max_epochs=epochs)
Example #5
    def folds(self, kf):
        model = BGRU(self.input_size, self.hidden_size, self.num_layers,
                     self.num_classes, self.batch_size, self.dropout)
        loss = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=self.learning_rate)
        train_loader, valid_loader = _get_data_loader(kf, self.batch_size)
        trainer = create_supervised_trainer(model,
                                            optimizer,
                                            loss,
                                            device=DEVICE)
        evaluator = create_supervised_evaluator(model,
                                                metrics={
                                                    'acc':
                                                    CategoricalAccuracy(),
                                                    'loss': Loss(loss),
                                                    'prec':
                                                    Precision(average=True),
                                                    'recall':
                                                    Recall(average=True)
                                                },
                                                device=DEVICE)

        @trainer.on(Events.ITERATION_COMPLETED)
        def log_training_loss(trainer):
            iter_num = trainer.state.iteration
            if iter_num % 10 == 0:
                logger.info("Epoch[{}] Iter: {} Loss: {:.2f}".format(
                    trainer.state.epoch, iter_num, trainer.state.output))

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_training_results(trainer):
            evaluator.run(train_loader)
            metrics = evaluator.state.metrics
            f1 = (2 * metrics['prec'] *
                  metrics['recall']) / (metrics['prec'] + metrics['recall'])
            logger.info(
                "Train Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f} Avg Precision: {:.2f} Avg Recall: {:.2f} Avg F1 Score: {:.2f}"
                .format(trainer.state.epoch, metrics['acc'], metrics['loss'],
                        metrics['prec'], metrics['recall'], f1))

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_validation_results(trainer):
            evaluator.run(valid_loader)
            metrics = evaluator.state.metrics
            f1 = (2 * metrics['prec'] *
                  metrics['recall']) / (metrics['prec'] + metrics['recall'])
            for k in self.res.keys():
                if k != 'f1':
                    self.res[k].append(metrics[k])
                else:
                    self.res[k].append(f1)
            logger.info(
                "Valid Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f} Avg Precision: {:.2f} Avg Recall: {:.2f} Avg F1 Score: {:.2f}"
                .format(trainer.state.epoch, metrics['acc'], metrics['loss'],
                        metrics['prec'], metrics['recall'], f1))

        trainer.run(train_loader, max_epochs=self.num_epochs)
        return model
Example #6
def run(mode, noise_fraction, train_batch_size, val_batch_size, epochs, lr, momentum, log_interval, log_dir):

    seed = 12345
    random.seed(seed)
    torch.manual_seed(seed)

    now = datetime.now()
    log_dir = os.path.join(log_dir, "train_{}_{}__{}".format(mode, noise_fraction, now.strftime("%Y%m%d_%H%M")))
    os.makedirs(log_dir)

    cuda = torch.cuda.is_available()
    train_loader, val_loader = get_data_loaders(noise_fraction, train_batch_size, val_batch_size)

    model = Net()

    writer = create_summary_writer(log_dir)
    if cuda:
        model = model.cuda()
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)

    if mode == 'xentropy':
        criterion = nn.CrossEntropyLoss()
    elif mode == 'soft_bootstrap':
        criterion = SoftBootstrappingLoss(beta=0.95)
    elif mode == 'hard_bootstrap':
        criterion = HardBootstrappingLoss(beta=0.8)
    else:
        raise TypeError("Wrong mode {}, expected: xentropy, soft_bootstrap or hard_bootstrap".format(mode))

    trainer = create_supervised_trainer(model, optimizer, criterion, cuda=cuda)
    evaluator = create_supervised_evaluator(model,
                                            metrics={'accuracy': CategoricalAccuracy(),
                                                     'nll': Loss(nn.CrossEntropyLoss())},
                                            cuda=cuda)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            print("Epoch[{}] Iteration[{}/{}] Loss: {:.2f}"
                  "".format(engine.state.epoch, iter, len(train_loader), engine.state.output))
            writer.add_scalar("training/loss", engine.state.output, engine.state.iteration)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        print("Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
              .format(engine.state.epoch, avg_accuracy, avg_nll))
        writer.add_scalar("valdation/loss", avg_nll, engine.state.epoch)
        writer.add_scalar("valdation/accuracy", avg_accuracy, engine.state.epoch)

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)

    writer.close()
Example #7
def test_compute_batch_images():
    acc = CategoricalAccuracy()

    y_pred = torch.softmax(torch.rand(2, 3, 2, 2), dim=1)
    y = torch.LongTensor([[[0, 1], [0, 1]], [[0, 2], [0, 2]]])
    indices = torch.max(y_pred, dim=1)[1]
    acc.update((y_pred, y))
    assert isinstance(acc.compute(), float)
    assert accuracy_score(
        y.view(-1).data.numpy(),
        indices.view(-1).data.numpy()) == pytest.approx(acc.compute())
Example #8
def test_wrong_input_args():
    with pytest.raises(TypeError):
        _ = RunningAverage(src=[12, 34])

    with pytest.raises(ValueError):
        _ = RunningAverage(alpha=-1.0)

    with pytest.raises(ValueError):
        _ = RunningAverage(CategoricalAccuracy(),
                           output_transform=lambda x: x[0])

    with pytest.raises(ValueError):
        _ = RunningAverage()
Example #9
def test_compute():
    acc = CategoricalAccuracy()

    y_pred = torch.eye(4)
    y = torch.ones(4).type(torch.LongTensor)
    acc.update((y_pred, y))
    assert acc.compute() == 0.25

    acc.reset()
    y_pred = torch.eye(2)
    y = torch.ones(2).type(torch.LongTensor)
    acc.update((y_pred, y))
    assert acc.compute() == 0.5
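
# Sanity check (not part of the test above): torch.eye(n) predicts class i for
# sample i while every target is 1, so exactly one of the n samples is correct.
def _manual_accuracy_check():
    y_pred = torch.eye(4)
    y = torch.ones(4, dtype=torch.long)
    manual = (y_pred.argmax(dim=1) == y).float().mean().item()
    assert manual == 0.25  # same value as acc.compute() above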
Example #10
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval, log_dir):
    train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size)
    model = Net()
    writer = create_summary_writer(model, train_loader, log_dir)
    device = 'cpu'

    if torch.cuda.is_available():
        device = 'cuda'
        model = model.to(device)

    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    trainer = create_supervised_trainer(model, optimizer, F.nll_loss, device=device)
    evaluator = create_supervised_evaluator(model,
                                            metrics={'accuracy': CategoricalAccuracy(),
                                                     'nll': Loss(F.nll_loss)},
                                            device=device)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            print("Epoch[{}] Iteration[{}/{}] Loss: {:.2f}"
                  "".format(engine.state.epoch, iter, len(train_loader), engine.state.output))
            writer.add_scalar("training/loss", engine.state.output, engine.state.iteration)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        print("Training Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
              .format(engine.state.epoch, avg_accuracy, avg_nll))
        writer.add_scalar("training/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("training/avg_accuracy", avg_accuracy, engine.state.epoch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        print("Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
              .format(engine.state.epoch, avg_accuracy, avg_nll))
        writer.add_scalar("valdation/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("valdation/avg_accuracy", avg_accuracy, engine.state.epoch)

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)

    writer.close()
Example #11
def test_compute():
    acc = CategoricalAccuracy()

    y_pred = torch.softmax(torch.rand(4, 4), dim=1)
    y = torch.ones(4).type(torch.LongTensor)
    indices = torch.max(y_pred, dim=1)[1]
    acc.update((y_pred, y))
    assert isinstance(acc.compute(), float)
    assert accuracy_score(
        y.view(-1).data.numpy(),
        indices.view(-1).data.numpy()) == pytest.approx(acc.compute())

    acc.reset()
    y_pred = torch.softmax(torch.rand(2, 2), dim=1)
    y = torch.ones(2).type(torch.LongTensor)
    indices = torch.max(y_pred, dim=1)[1]
    acc.update((y_pred, y))
    assert isinstance(acc.compute(), float)
    assert accuracy_score(
        y.view(-1).data.numpy(),
        indices.view(-1).data.numpy()) == pytest.approx(acc.compute())
Example #12
def test_compute_batch_images():
    acc = CategoricalAccuracy()
    y_pred = torch.zeros(2, 3, 2, 2)
    y_pred[0, 1, :] = 1
    y_pred[0, 2, :] = 1

    y = torch.LongTensor([[[0, 1], [0, 1]], [[0, 2], [0, 2]]])

    acc.update((y_pred, y))

    assert isinstance(acc.compute(), float)
    assert acc.compute() == 0.5

    acc.reset()
    y_pred = torch.zeros(2, 3, 2, 2)
    y_pred[0, 1, :] = 1
    y_pred[1, 2, :] = 1

    y = torch.LongTensor([[[2, 1], [1, 1]], [[2, 2], [0, 2]]])

    acc.update((y_pred, y))
    assert isinstance(acc.compute(), float)
    assert acc.compute() == 0.75
Example #13
def main():
    parser = argparse.ArgumentParser(description='Training')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.0005,
                        help='learning_rate')
    parser.add_argument('--temp_min',
                        type=float,
                        default=0.01,
                        help='Temp Min')
    parser.add_argument('--epochs_to_anneal',
                        type=float,
                        default=15.0,
                        help='epochs_to_anneal')
    parser.add_argument('--temp_max', type=float, default=2.0, help='Temp Max')
    parser.add_argument('--reg', type=float, default=0.01, help='regularizer')
    parser.add_argument('--batch_size', type=int, default=8, help='batch_size')
    parser.add_argument('--max_epochs',
                        type=int,
                        default=500,
                        help='Max Epochs')
    parser.add_argument('--log_every_batch',
                        type=int,
                        default=50,
                        help='Log every batch')
    parser.add_argument('--save_ckpt_every',
                        type=int,
                        default=20,
                        help='Save Checkpoint Every')
    parser.add_argument('--dataset',
                        type=str,
                        default="QuestionLabels",
                        help='dataset')
    parser.add_argument('--base_dataset',
                        type=str,
                        default="Names",
                        help='base_dataset')
    parser.add_argument('--checkpoints_directory',
                        type=str,
                        default="CKPTS",
                        help='Check Points Directory')
    parser.add_argument('--continue_training',
                        type=str,
                        default="False",
                        help='Continue Training')
    parser.add_argument('--filter_width',
                        type=int,
                        default=5,
                        help='Filter Width')
    parser.add_argument('--hidden_units',
                        type=int,
                        default=256,
                        help='hidden_units')
    parser.add_argument('--embedding_size',
                        type=int,
                        default=256,
                        help='embedding_size')
    parser.add_argument('--resume_run',
                        type=int,
                        default=-1,
                        help='Which run to resume')
    parser.add_argument('--random_network',
                        type=str,
                        default="False",
                        help='Random Network')
    parser.add_argument('--classifier_type',
                        type=str,
                        default="charRNN",
                        help='rnn type')
    parser.add_argument('--print_prob',
                        type=str,
                        default="False",
                        help='Probs')
    parser.add_argument('--progressive',
                        type=str,
                        default="True",
                        help='Progressively increase length for back prop')

    args = parser.parse_args()

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    base_train_dataset = datasets.get_dataset(args.base_dataset,
                                              dataset_type='train')

    train_dataset = datasets.get_dataset(args.dataset, dataset_type='train')
    val_dataset = datasets.get_dataset(args.dataset, dataset_type='val')

    if args.classifier_type == "charRNN":
        lstm_model = model_classifier.uniRNN({
            'vocab_size':
            len(base_train_dataset.idx_to_char),
            'hidden_size':
            args.hidden_units,
            'target_size':
            len(base_train_dataset.classes),
            'embedding_size':
            args.embedding_size
        })
        print "char RNN"

    if args.classifier_type == "biRNN":
        lstm_model = model_classifier.biRNN({
            'vocab_size':
            len(base_train_dataset.idx_to_char),
            'hidden_size':
            args.hidden_units,
            'target_size':
            len(base_train_dataset.classes),
            'embedding_size':
            args.embedding_size
        })
        print "BI RNN"

    if args.classifier_type == "CNN":
        lstm_model = model_classifier.CnnTextClassifier({
            'vocab_size':
            len(base_train_dataset.idx_to_char),
            'hidden_size':
            args.hidden_units,
            'target_size':
            len(base_train_dataset.classes),
            'embedding_size':
            args.embedding_size
        })
        print "CnnTextClassifier"

    lstm_ckpt_dir = "{}/{}_classifer_{}".format(args.checkpoints_directory,
                                                args.base_dataset,
                                                args.classifier_type)
    lstm_ckpt_name = "{}/best_model.pth".format(lstm_ckpt_dir)
    if args.random_network != "True":
        lstm_model.load_state_dict(torch.load(lstm_ckpt_name))
    else:
        print "Random LSTM network.."
    lstm_model.eval()
    lstm_loss_criterion = nn.CrossEntropyLoss()

    seq_model = seq_rewriter_gumbel.seq_rewriter({
        'vocab_size':
        len(train_dataset.idx_to_char),
        'target_size':
        len(base_train_dataset.idx_to_char),
        'filter_width':
        args.filter_width,
        'target_sequence_length':
        base_train_dataset.seq_length
    })

    new_classifier = nn.Sequential(seq_model, lstm_model)

    lstm_model.to(device)
    seq_model.to(device)
    new_classifier.to(device)

    # use a list so the parameters can be both printed and passed to the optimizer
    parameters = [p for p in seq_model.parameters() if p.requires_grad]
    for parameter in parameters:
        print("PARAMETERS", parameter.size())

    optimizer = optim.Adam(parameters, lr=args.learning_rate)

    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=0)
    val_loader = DataLoader(val_dataset,
                            batch_size=args.batch_size,
                            shuffle=True,
                            num_workers=0)
    evaluator = create_supervised_evaluator(new_classifier,
                                            metrics={
                                                'accuracy':
                                                CategoricalAccuracy(),
                                            })

    # CHECKPOINT DIRECTORY STUFF.......
    checkpoints_dir = "{}/ADVERSARIAL_GUMBEL".format(
        args.checkpoints_directory)
    if not os.path.exists(checkpoints_dir):
        os.makedirs(checkpoints_dir)

    checkpoint_suffix = "lr_{}_tmin_{}_fw_{}_bs_{}_rand_{}_classifer_{}".format(
        args.learning_rate, args.temp_min, args.filter_width, args.batch_size,
        args.random_network, args.classifier_type)

    checkpoints_dir = "{}/{}_adversarial_base_{}_{}".format(
        checkpoints_dir, args.dataset, args.base_dataset, checkpoint_suffix)

    if not os.path.exists(checkpoints_dir):
        os.makedirs(checkpoints_dir)

    start_epoch = 0
    training_log = {
        'log': [],
        'best_epoch': 0,
        'best_accuracy': 0.0,
        'running_reward': []
    }
    running_reward = -args.batch_size

    lstm_loss_criterion = nn.CrossEntropyLoss()

    if args.continue_training == "True":
        if args.resume_run == -1:
            run_index = len(os.listdir(checkpoints_dir)) - 1
        else:
            run_index = args.resume_run
        checkpoints_dir = "{}/{}".format(checkpoints_dir, run_index)
        if not os.path.exists(checkpoints_dir):
            raise Exception("Coud not find checkpoints_dir")

        with open("{}/training_log.json".format(checkpoints_dir)) as tlog_f:
            print "CHECKSSSSSS"
            training_log = json.load(tlog_f)

        seq_model.load_state_dict(
            torch.load("{}/best_model.pth".format(checkpoints_dir)))
        start_epoch = training_log['best_epoch']
        # running_reward = training_log['running_reward'][-1]
    else:
        run_index = len(os.listdir(checkpoints_dir))
        checkpoints_dir = "{}/{}".format(checkpoints_dir, run_index)
        if not os.path.exists(checkpoints_dir):
            os.makedirs(checkpoints_dir)

    temp_min = args.temp_min
    temp_max = args.temp_max

    for epoch in range(start_epoch, args.max_epochs):
        new_classifier.train()
        epoch_loss = 0
        for batch_idx, batch in enumerate(train_loader):
            slope = (temp_max - temp_min) / args.epochs_to_anneal
            temp = max(temp_max - (slope * epoch), temp_min)
            rewritten_x = seq_model(batch[0], temp=temp)
            pred_logits = lstm_model(seq_model.probs)
            # print seq_model.probs
            _, predictions = torch.max(pred_logits, 1)

            pred_correctness = (predictions == batch[1]).float()
            pred_correctness[pred_correctness == 0.0] = -1.0
            rewards = pred_correctness
            batch_reward = torch.sum(rewards)
            # print batch_reward

            loss = lstm_loss_criterion(pred_logits, batch[1])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # print running_reward/(args.log_every_batch * 1.0)
            # print batch_reward/(args.log_every_batch * 1.0)
            running_reward -= running_reward / (args.log_every_batch * 1.0)
            running_reward += batch_reward / (args.log_every_batch * 1.0)

            if batch_idx % args.log_every_batch == 0:
                if args.print_prob == "True":
                    print "Temp", temp, seq_model.probs
                print(
                    "Epoch[{}] Iteration[{}] RunningLoss[{}] Reward[{}] Temp[{}]"
                    .format(epoch, batch_idx, loss, running_reward, temp))

        evaluator.run(train_loader)
        training_metrics = evaluator.state.metrics
        print("Training Results - Epoch: {}  Avg accuracy: {:.2f}".format(
            epoch, training_metrics['accuracy']))

        evaluator.run(val_loader)
        evaluation_metrics = evaluator.state.metrics
        print("Validation Results - Epoch: {}  Avg accuracy: {:.2f}".format(
            epoch, evaluation_metrics['accuracy']))

        training_log['log'].append({
            'training_metrics': training_metrics,
            'evaluation_metrics': evaluation_metrics,
            'temp': temp
        })

        if evaluation_metrics['accuracy'] > training_log['best_accuracy']:
            torch.save(seq_model.state_dict(),
                       "{}/best_model.pth".format(checkpoints_dir))
            training_log['best_accuracy'] = evaluation_metrics['accuracy']
            training_log['best_epoch'] = epoch

        if epoch % args.save_ckpt_every == 0:
            torch.save(seq_model.state_dict(),
                       "{}/model_{}.pth".format(checkpoints_dir, epoch))

        print "BEST", training_log['best_epoch'], training_log['best_accuracy']
        with open("{}/training_log.json".format(checkpoints_dir), 'w') as f:
            f.write(json.dumps(training_log))

    if not os.path.exists(checkpoints_dir):
        os.makedirs(checkpoints_dir)
def main():
    parser = argparse.ArgumentParser(description='Training')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.0005,
                        help='Output filename')
    parser.add_argument('--temp_min',
                        type=float,
                        default=0.01,
                        help='Temp Min')
    parser.add_argument(
        '--epochs_to_anneal',
        type=float,
        default=15.0,
        help='Epoch Number upto which length will be progressed to full length'
    )
    parser.add_argument('--temp_max', type=float, default=2.0, help='Temp Max')
    parser.add_argument('--reg',
                        type=float,
                        default=0.01,
                        help='Output filename')
    parser.add_argument('--batch_size',
                        type=int,
                        default=8,
                        help='Output filename')
    parser.add_argument('--max_epochs',
                        type=int,
                        default=500,
                        help='Max Epochs')
    parser.add_argument('--log_every_batch',
                        type=int,
                        default=50,
                        help='Log every batch')
    parser.add_argument('--save_ckpt_every',
                        type=int,
                        default=20,
                        help='Save Checkpoint Every')
    parser.add_argument('--dataset',
                        type=str,
                        default="QuestionLabels",
                        help='Output filename')
    parser.add_argument('--base_dataset',
                        type=str,
                        default="Names",
                        help='Output filename')
    parser.add_argument('--checkpoints_directory',
                        type=str,
                        default="CKPTS",
                        help='Check Points Directory')
    parser.add_argument('--adv_directory',
                        type=str,
                        default="ADVERSARIAL_GUMBEL",
                        help='Check Points Directory')
    parser.add_argument('--continue_training',
                        type=str,
                        default="False",
                        help='Continue Training')
    parser.add_argument('--filter_width',
                        type=int,
                        default=5,
                        help='Filter Width')
    parser.add_argument('--hidden_units',
                        type=int,
                        default=256,
                        help='hidden_units')
    parser.add_argument('--embedding_size',
                        type=int,
                        default=256,
                        help='embedding_size')
    parser.add_argument('--resume_run',
                        type=int,
                        default=-1,
                        help='Which run to resume')
    parser.add_argument('--random_network',
                        type=str,
                        default="False",
                        help='Random Network')
    parser.add_argument('--classifier_type',
                        type=str,
                        default="charRNN",
                        help='rnn type')
    parser.add_argument('--print_prob',
                        type=str,
                        default="False",
                        help='Probs')
    parser.add_argument('--progressive',
                        type=str,
                        default="True",
                        help='Progressively increase length for back prop')

    args = parser.parse_args()

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    base_train_dataset = datasets.get_dataset(args.base_dataset,
                                              dataset_type='train')

    train_dataset = datasets.get_dataset(args.dataset, dataset_type='train')
    val_dataset = datasets.get_dataset(args.dataset, dataset_type='val')

    if args.classifier_type == "charRNN":
        lstm_model = model_classifier.uniRNN({
            'vocab_size':
            len(base_train_dataset.idx_to_char),
            'hidden_size':
            args.hidden_units,
            'target_size':
            len(base_train_dataset.classes),
            'embedding_size':
            args.embedding_size
        })
        print "char RNN"

    if args.classifier_type == "biRNN":
        lstm_model = model_classifier.biRNN({
            'vocab_size':
            len(base_train_dataset.idx_to_char),
            'hidden_size':
            args.hidden_units,
            'target_size':
            len(base_train_dataset.classes),
            'embedding_size':
            args.embedding_size
        })
        print "BI RNN"

    if args.classifier_type == "CNN":
        lstm_model = model_classifier.CnnTextClassifier({
            'vocab_size':
            len(base_train_dataset.idx_to_char),
            'hidden_size':
            args.hidden_units,
            'target_size':
            len(base_train_dataset.classes),
            'embedding_size':
            args.embedding_size
        })
        print "CnnTextClassifier"

    lstm_ckpt_dir = "{}/{}_classifer_{}".format(args.checkpoints_directory,
                                                args.base_dataset,
                                                args.classifier_type)
    lstm_ckpt_name = "{}/best_model.pth".format(lstm_ckpt_dir)
    if args.random_network != "True":
        lstm_model.load_state_dict(torch.load(lstm_ckpt_name))
    else:
        print "Random LSTM network.."
    lstm_model.eval()
    lstm_loss_criterion = nn.CrossEntropyLoss()

    seq_model = seq_rewriter_gumbel.seq_rewriter({
        'vocab_size':
        len(train_dataset.idx_to_char),
        'target_size':
        len(base_train_dataset.idx_to_char),
        'filter_width':
        args.filter_width,
        'target_sequence_length':
        base_train_dataset.seq_length
    })

    new_classifier = nn.Sequential(seq_model, lstm_model)

    lstm_model.to(device)
    seq_model.to(device)
    new_classifier.to(device)

    # use a list so the parameters can be both printed and passed to the optimizer
    parameters = [p for p in seq_model.parameters() if p.requires_grad]
    for parameter in parameters:
        print("PARAMETERS", parameter.size())

    optimizer = optim.Adam(parameters, lr=args.learning_rate)

    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=0)
    val_loader = DataLoader(val_dataset,
                            batch_size=args.batch_size,
                            shuffle=True,
                            num_workers=0)
    evaluator = create_supervised_evaluator(new_classifier,
                                            metrics={
                                                'accuracy':
                                                CategoricalAccuracy(),
                                            })

    # CHECKPOINT DIRECTORY STUFF.......
    checkpoints_dir = "{}/{}".format(args.checkpoints_directory,
                                     args.adv_directory)
    if not os.path.exists(checkpoints_dir):
        os.makedirs(checkpoints_dir)

    checkpoint_suffix = "lr_{}_tmin_{}_fw_{}_bs_{}_rand_{}_classifer_{}".format(
        args.learning_rate, args.temp_min, args.filter_width, args.batch_size,
        args.random_network, args.classifier_type)

    checkpoints_dir = "{}/{}_adversarial_base_{}_{}".format(
        checkpoints_dir, args.dataset, args.base_dataset, checkpoint_suffix)

    if args.resume_run == -1:
        run_index = len(os.listdir(checkpoints_dir)) - 1
        print "CHeck ", run_index
    else:
        run_index = args.resume_run
    checkpoints_dir = "{}/{}".format(checkpoints_dir, run_index)
    if not os.path.exists(checkpoints_dir):
        print(checkpoints_dir)
        raise Exception("Could not find checkpoints_dir")

    with open("{}/training_log.json".format(checkpoints_dir)) as tlog_f:
        print "CHECKSSSSSS"
        training_log = json.load(tlog_f)

    seq_model.load_state_dict(
        torch.load("{}/best_model.pth".format(checkpoints_dir)))
    # running_reward = training_log['running_reward'][-1]
    seq_model.eval()
    lstm_model.eval()
    new_classifier.eval()

    for batch_idx, batch in enumerate(val_loader):
        original_sentences = batch_to_sentenes(batch[0],
                                               val_dataset.idx_to_char)
        rewritten_x = seq_model(batch[0], temp=1.0)
        new_sentences = batch_to_sentenes(rewritten_x,
                                          base_train_dataset.idx_to_char,
                                          spaces=True)

        pred_logits = lstm_model(seq_model.probs)
        _, predictions = torch.max(pred_logits, 1)

        results = []
        for i in range(batch[0].size()[0]):
            print "ORIG", original_sentences[i]
            print "REWR", new_sentences[i]
            print "CLAS", base_train_dataset.classes[int(predictions[i])]
            print "MAPP", val_dataset.classes[int(predictions[i])]
            print "TARG", val_dataset.classes[int(batch[1][i])]
            print "***************"
def main():
    parser = argparse.ArgumentParser(description='Training')
    parser.add_argument('--learning_rate', type=float, default=0.001,
                        help='Learning Rate')
    parser.add_argument('--reg', type=float, default=0.01,
                        help='Regularizer')
    parser.add_argument('--batch_size', type=int, default=8,
                        help='batch size')
    parser.add_argument('--max_epochs', type=int, default=500,
                        help='Max Epochs')
    parser.add_argument('--log_every_batch', type=int, default=10,
                        help='Log every batch')
    parser.add_argument('--save_ckpt_every', type=int, default=20,
                        help='Save Checkpoint Every')
    parser.add_argument('--dataset', type=str, default="Names",
                        help='dataset')
    parser.add_argument('--base_dataset', type=str, default="Names",
                        help='base_dataset')
    parser.add_argument('--checkpoints_directory', type=str, default="CKPTS",
                        help='Check Points Directory')
    parser.add_argument('--continue_training', type=str, default="False",
                        help='Continue Training')
    parser.add_argument('--filter_width', type=int, default=5,
                        help='Filter Width')
    parser.add_argument('--hidden_units', type=int, default=256,
                        help='hidden_units')
    parser.add_argument('--embedding_size', type=int, default=256,
                        help='embedding_size')
    parser.add_argument('--resume_run', type=int, default=-1,
                        help='Which run to resume')
    parser.add_argument('--random_network', type=str, default="False",
                        help='Random Network')
    parser.add_argument('--classifier_type', type=str, default="charRNN",
                        help='rnn type')
    parser.add_argument('--progressive', type=str, default="True",
                        help='Progressively increase length for back prop')
    parser.add_argument('--progress_up_to', type=float, default=30.0,
                        help='Epoch Number upto which length will be progressed to full length')

    args = parser.parse_args()

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    
    base_train_dataset = datasets.get_dataset(args.base_dataset, dataset_type = 'train')

    
    train_dataset = datasets.get_dataset(args.dataset, dataset_type = 'train')
    val_dataset = datasets.get_dataset(args.dataset, dataset_type = 'val')

    if args.classifier_type == "charRNN":
        lstm_model = model_classifier.uniRNN({
            'vocab_size' : len(base_train_dataset.idx_to_char),
            'hidden_size' : args.hidden_units,
            'target_size' : len(base_train_dataset.classes),
            'embedding_size' : args.embedding_size
        })
        print "char RNN"

    if args.classifier_type == "biRNN":
        lstm_model = model_classifier.biRNN({
            'vocab_size' : len(base_train_dataset.idx_to_char),
            'hidden_size' : args.hidden_units,
            'target_size' : len(base_train_dataset.classes),
            'embedding_size' : args.embedding_size
        })
        print "BI RNN"

    if args.classifier_type == "CNN":
        lstm_model = model_classifier.CnnTextClassifier({
            'vocab_size' : len(base_train_dataset.idx_to_char),
            'hidden_size' : args.hidden_units,
            'target_size' : len(base_train_dataset.classes),
            'embedding_size' : args.embedding_size
        })
        print "CnnTextClassifier"

    lstm_ckpt_dir = "{}/{}_classifer_{}".format(args.checkpoints_directory, args.base_dataset, args.classifier_type)
    lstm_ckpt_name = "{}/best_model.pth".format(lstm_ckpt_dir)
    if args.random_network != "True":
        lstm_model.load_state_dict(torch.load(lstm_ckpt_name))
    else:
        print "Random LSTM network.."
    lstm_model.eval()
    lstm_loss_criterion = nn.CrossEntropyLoss()

    seq_model = seq_rewriter.seq_rewriter({
        'vocab_size' : len(train_dataset.idx_to_char),
        'target_size' : len(base_train_dataset.idx_to_char),
        'filter_width' : args.filter_width,
        'target_sequence_length' : base_train_dataset.seq_length
    })

    new_classifier = nn.Sequential(seq_model, lstm_model)

    lstm_model.to(device)
    seq_model.to(device)
    new_classifier.to(device)

    parameters = filter(lambda p: p.requires_grad, seq_model.parameters())

    optimizer = optim.Adam(parameters, lr=args.learning_rate)
    
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size,
                        shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=args.batch_size,
                        shuffle=True, num_workers=0)
    evaluator = create_supervised_evaluator(new_classifier,
                                        metrics={
                                            'accuracy': CategoricalAccuracy(),
                                        })
    
    # CHECKPOINT DIRECTORY STUFF.......
    checkpoints_dir = "{}/ADVERSARIAL".format(args.checkpoints_directory)
    if not os.path.exists(checkpoints_dir):
        os.makedirs(checkpoints_dir)

    checkpoint_suffix = "lr_{}_rg_{}_fw_{}_bs_{}_rd_{}_classifer_{}".format(args.learning_rate, args.reg, args.filter_width, 
        args.batch_size, args.random_network,args.classifier_type)

    checkpoints_dir = "{}/{}_adversarial_base_{}_{}".format(checkpoints_dir, args.dataset, 
        args.base_dataset, checkpoint_suffix)
    
    if not os.path.exists(checkpoints_dir):
        os.makedirs(checkpoints_dir)

    start_epoch = 0
    training_log = {
        'log' : [],
        'best_epoch' : 0,
        'best_accuracy' : 0.0,
        'running_reward' : []
    }
    running_reward = -args.batch_size
    
    
    if args.continue_training == "True":
        if args.resume_run == -1:
            run_index = len(os.listdir(checkpoints_dir)) - 1
        else:
            run_index = args.resume_run
        checkpoints_dir = "{}/{}".format(checkpoints_dir, run_index)
        if not os.path.exists(checkpoints_dir):
            raise Exception("Coud not find checkpoints_dir")

        with open("{}/training_log.json".format(checkpoints_dir)) as tlog_f:
            print "CHECKSSSSSS"
            training_log = json.load(tlog_f)

        seq_model.load_state_dict(torch.load("{}/best_model.pth".format(checkpoints_dir)))
        start_epoch = training_log['best_epoch']
        running_reward = training_log['running_reward'][-1]
    else:
        run_index = len(os.listdir(checkpoints_dir))
        checkpoints_dir = "{}/{}".format(checkpoints_dir, run_index)
        if not os.path.exists(checkpoints_dir):
            os.makedirs(checkpoints_dir)
    
    for epoch in range(start_epoch, args.max_epochs):
        new_classifier.train()
        for batch_idx, batch in enumerate(train_loader):
            rewritten_x = seq_model(batch[0])
            pred_logits = lstm_model(rewritten_x)
            _, predictions = torch.max(pred_logits, 1)

            pred_correctness = (predictions == batch[1]).float()
            pred_correctness[pred_correctness == 0.0] = -1.0
            rewards = pred_correctness
            # lstm_loss = lstm_loss_criterion(pred_logits, batch[1])
            seq_rewriter_loss = 0
            max_length_to_update = train_dataset.seq_length + args.filter_width + 1
            if args.progressive == "True":
                max_length_to_update = min(int((epoch / args.progress_up_to) * max_length_to_update) + 1, max_length_to_update)
            for idx, log_prob in enumerate(seq_model.saved_log_probs):
                if (idx % (batch[0].size()[1])) < max_length_to_update:
                    seq_rewriter_loss += (-log_prob * rewards[idx // rewritten_x.size()[1]])

            # seq_rewriter_loss /= (args.batch_size * max_length_to_update)
            # seq_rewriter_loss += (- args.reg * seq_model.entropy)

            l2_reg = None
            for W in seq_model.parameters():
                if l2_reg is None:
                    l2_reg = W.norm(2)
                else:
                    l2_reg = l2_reg + W.norm(2)
            
            # reg_loss = args.reg * l2_reg
            reg_loss = 0
            seq_rewriter_loss_combined = seq_rewriter_loss + reg_loss
            optimizer.zero_grad()
            seq_rewriter_loss_combined.backward()
            optimizer.step()
            seq_model.saved_log_probs = None

            batch_reward = torch.sum(rewards)
            running_reward -= running_reward/(args.log_every_batch * 1.0)
            running_reward += batch_reward/(args.log_every_batch * 1.0)

            if batch_idx % args.log_every_batch == 0:
                print ("Epoch[{}] Iteration[{}] Running Reward[{}] LossBasic[{}] RegLoss[{}] max_length_to_update[{}]".format(
                    epoch, batch_idx, running_reward, seq_rewriter_loss, reg_loss, max_length_to_update))
                training_log['running_reward'].append(float(running_reward.cpu().numpy()))

        evaluator.run(train_loader)
        training_metrics = evaluator.state.metrics
        print("Training Results - Epoch: {}  Avg accuracy: {:.2f}"
              .format(epoch, training_metrics['accuracy']))

        evaluator.run(val_loader)
        evaluation_metrics = evaluator.state.metrics
        print("Validation Results - Epoch: {}  Avg accuracy: {:.2f}"
              .format(epoch, evaluation_metrics['accuracy']))

        training_log['log'].append({
            'training_metrics' : training_metrics,
            'evaluation_metrics' : evaluation_metrics,
        })

        if evaluation_metrics['accuracy'] > training_log['best_accuracy']:
            torch.save(seq_model.state_dict(), "{}/best_model.pth".format(checkpoints_dir))
            training_log['best_accuracy'] = evaluation_metrics['accuracy']
            training_log['best_epoch'] = epoch

        if epoch % args.save_ckpt_every == 0:
            torch.save(seq_model.state_dict(), "{}/model_{}.pth".format(checkpoints_dir, epoch))

        print "BEST", training_log['best_epoch'], training_log['best_accuracy']
        with open("{}/training_log.json".format(checkpoints_dir), 'w') as f:
            f.write(json.dumps(training_log))

    if not os.path.exists(checkpoints_dir):
        os.makedirs(checkpoints_dir)
Example #16
        ])),
                                             batch_size=batch_size,
                                             shuffle=True,
                                             **kwargs)

    model = Net()
    device = 'cuda' if use_cuda else 'cpu'
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        F.nll_loss,
                                        device=device)
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'accuracy':
                                                CategoricalAccuracy(),
                                                'nll': Loss(F.nll_loss)
                                            },
                                            device=device)

    @trainer.on(Events.STARTED)
    def load_checkpoint(engine):
        # you can load the best checkpoint to continue training
        filename = checkpoint_best
        # or load the last checkpoint
        filename = checkpoint_last
        try:
            print("Loading checkpoint '{}'".format(filename))
            model.load_state_dict(torch.load(filename))
            evaluator.run(val_loader)
            metrics = evaluator.state.metrics
Example #17
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval):
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    device = 'cpu'

    if torch.cuda.is_available():
        device = 'cuda'

    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        F.nll_loss,
                                        device=device)
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'accuracy':
                                                CategoricalAccuracy(),
                                                'nll': Loss(F.nll_loss)
                                            },
                                            device=device)

    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0,
                leave=False,
                total=len(train_loader),
                desc=desc.format(0))

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        iter = (engine.state.iteration - 1) % len(train_loader) + 1

        if iter % log_interval == 0:
            pbar.desc = desc.format(engine.state.output)
            pbar.update(log_interval)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        tqdm.write(
            "Training Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        tqdm.write(
            "Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))

        pbar.n = pbar.last_print_n = 0

    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
def main():
    parser = argparse.ArgumentParser(description='Training')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.0001,
                        help='Output filename')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='Output filename')
    parser.add_argument('--epochs', type=int, default=200, help='Epochs')
    parser.add_argument('--dataset',
                        type=str,
                        default="Names",
                        help='Output filename')
    parser.add_argument('--checkpoints_directory',
                        type=str,
                        default="CKPTS",
                        help='Check Points Directory')
    parser.add_argument('--hidden_units',
                        type=int,
                        default=256,
                        help='hidden_units')
    parser.add_argument('--embedding_size',
                        type=int,
                        default=256,
                        help='embedding_size')
    parser.add_argument('--patience', type=int, default=10, help='patience')
    parser.add_argument('--classifier_type',
                        type=str,
                        default="charRNN",
                        help='rnn type')
    args = parser.parse_args()

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    train_dataset = datasets.get_dataset(args.dataset, dataset_type='train')
    val_dataset = datasets.get_dataset(args.dataset, dataset_type='train_val')

    if args.classifier_type == "charRNN":
        model_options = {
            'vocab_size': len(train_dataset.idx_to_char),
            'hidden_size': args.hidden_units,
            'target_size': len(train_dataset.classes),
            'embedding_size': args.embedding_size
        }
        model = model_classifier.uniRNN(model_options)
        print "char RNN"

    if args.classifier_type == "biRNN":
        model_options = {
            'vocab_size': len(train_dataset.idx_to_char),
            'hidden_size': args.hidden_units,
            'target_size': len(train_dataset.classes),
            'embedding_size': args.embedding_size
        }
        model = model_classifier.biRNN(model_options)
        print "BI RNN"

    if args.classifier_type == "CNN":
        model_options = {
            'vocab_size': len(train_dataset.idx_to_char),
            'hidden_size': args.hidden_units,
            'target_size': len(train_dataset.classes),
            'embedding_size': args.embedding_size
        }
        model = model_classifier.CnnTextClassifier(model_options)
        print "CnnTextClassifier"

    print(device)
    model.to(device)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adam(parameters, lr=args.learning_rate)
    loss_criterion = nn.CrossEntropyLoss()

    print "check", torch.cuda.is_available()
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=0)
    val_loader = DataLoader(val_dataset,
                            batch_size=args.batch_size,
                            shuffle=True,
                            num_workers=0)

    trainer = create_supervised_trainer(model, optimizer, loss_criterion)
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'accuracy':
                                                CategoricalAccuracy(),
                                                'nll': Loss(loss_criterion)
                                            })

    checkpoints_dir = "{}/{}_classifer_{}".format(args.checkpoints_directory,
                                                  args.dataset,
                                                  args.classifier_type)
    if not os.path.exists(checkpoints_dir):
        os.makedirs(checkpoints_dir)

    training_log = {
        'model_options': model_options,
        'log': [],
        'best_epoch': 0,
        'best_accuracy': 0.0
    }

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(trainer):
        total_batches = int(len(train_dataset) / args.batch_size)
        if trainer.state.iteration % 100 == 0:
            print("Epoch[{}] Iteration[{}] Total Iterations[{}] Loss: {:.2f}".
                  format(trainer.state.epoch, trainer.state.iteration,
                         total_batches, trainer.state.output))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(trainer):

        evaluator.run(train_loader)
        training_metrics = evaluator.state.metrics
        print(
            "Training Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(trainer.state.epoch, training_metrics['accuracy'],
                    training_metrics['nll']))

        evaluator.run(val_loader)
        evaluation_metrics = evaluator.state.metrics
        print(
            "Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(trainer.state.epoch, evaluation_metrics['accuracy'],
                    evaluation_metrics['nll']))

        out_path = "{}/model_epoch_{}.pth".format(checkpoints_dir,
                                                  trainer.state.epoch)
        torch.save(model.state_dict(), out_path)

        training_log['log'].append({
            'training_metrics': training_metrics,
            'evaluation_metrics': evaluation_metrics,
        })

        # if (trainer.state.epoch - training_log['best_epoch']) > args.patience and (evaluation_metrics['accuracy'] < training_log['best_accuracy']):
        #     trainer.terminate()

        if evaluation_metrics['accuracy'] > training_log['best_accuracy']:
            torch.save(model.state_dict(),
                       "{}/best_model.pth".format(checkpoints_dir))
            training_log['best_accuracy'] = evaluation_metrics['accuracy']
            training_log['best_epoch'] = trainer.state.epoch

        print "BEST", training_log['best_epoch'], training_log['best_accuracy']
        with open("{}/training_log.json".format(checkpoints_dir), 'w') as f:
            f.write(json.dumps(training_log))

    trainer.run(train_loader, max_epochs=args.epochs)
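
# Hypothetical command line for the script above; the flag names mirror the
# args.* attributes used in the code, but their exact spellings and default
# values are assumptions rather than taken from the original argument parser:
#   python train_char_classifier.py --dataset names --classifier_type biRNN \
#       --hidden_units 256 --embedding_size 128 --learning_rate 1e-3 \
#       --batch_size 64 --epochs 20 --checkpoints_directory ./checkpoints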
Example #19
def train(base_path: str,
          epochs: int,
          n_folds: int,
          lr: t.Optional[float] = 1e-2,
          momentum: t.Optional[float] = 0.5,
          log_interval: t.Optional[int] = 50,
          random_seed: t.Optional[int] = 42,
          handlers: t.Optional[t.Tuple] = ()
          ) -> t.Tuple[nn.Module, t.Any]:
    """
    Instantiates a ResNet-based feature extractor and trains it on one fold of
    the workflow video dataset; returns the trained model and the final
    trainer state.
    """
    torch.manual_seed(random_seed)
    np.random.seed(random_seed)

    model = ResFeatureExtractor(pretrained_model=models.resnet50)

    image_transform = Compose([Resize((320, 180)),
                               ToTensor()])
    kfoldWorkflowSet = kFoldWorkflowSplit(base_path,
                                          image_transform=image_transform,
                                          video_extn='.avi', shuffle=True,
                                          n_folds=n_folds, num_phases=14,
                                          batch_size=32, num_workers=16)

    train_loader, val_loader = next(kfoldWorkflowSet)
    device = 'cpu'

    if torch.cuda.is_available():
        device = 'cuda:0'
        model = model.to(device=device)

    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion_CE = nn.CrossEntropyLoss()

    trainer = create_supervised_trainer(
        model,
        optimizer,
        criterion_CE,
        device=device)
    evaluator = create_supervised_evaluator(
        model,
        metrics={'accuracy': CategoricalAccuracy(), 'CE': Loss(criterion_CE)},
        device=device)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        i = (engine.state.iteration - 1) % len(train_loader) + 1
        if i % log_interval == 0:
            print(f"[{engine.state.epoch}] {i}/{len(train_loader)} loss: {'%.2f' % engine.state.output}")

    # Attach scheduler(s)
    for handler_args in handlers:
        (scheduler_cls, param_name, start_value, end_value, cycle_mult) = handler_args
        handler = scheduler_cls(
            optimizer, param_name, start_value, end_value, len(train_loader),
            cycle_mult=cycle_mult, save_history=True)
        trainer.add_event_handler(Events.ITERATION_COMPLETED, handler)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_CE = metrics['CE']
        print("Validation Accuracy: {:.2f} Loss: {:.2f}\n".format(avg_accuracy, avg_CE))

    trainer.run(train_loader, max_epochs=epochs)
    
    return (model, trainer.state)
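
# Hypothetical usage of the train() helper above; the data path, epoch count
# and fold count are placeholders, not values taken from the original code.
if __name__ == "__main__":
    trained_model, final_state = train(base_path="/data/workflow_videos",
                                       epochs=10,
                                       n_folds=4)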
Example #20
# Optional config param: if set, evaluation is run on val_dataloader
val_dataloader = get_basic_dataloader("test",
                                      batch_size,
                                      num_workers,
                                      device=device,
                                      data_augs=val_data_augs)

# Required config param
model = resnet50(pretrained=False, num_classes=10)
model.avgpool = nn.AdaptiveAvgPool2d(1)

# Required config param
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Required config param
criterion = nn.CrossEntropyLoss()

# Required config param
num_epochs = 50

# Optional config param
metrics = {
    "precision": Precision(average=True),
    "recall": Recall(average=True),
    "accuracy": CategoricalAccuracy()
}

# Optional config param
lr_scheduler = CosineAnnealingLR(optimizer, T_max=1200, eta_min=1e-5)
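
# A minimal sketch (not part of the original config) of how these values could
# be wired into an ignite training run; `train_dataloader` is assumed to be
# defined alongside val_dataloader earlier in the config, and stepping the
# cosine schedule once per iteration is an assumption based on T_max=1200.
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator

trainer = create_supervised_trainer(model, optimizer, criterion)
evaluator = create_supervised_evaluator(model, metrics=metrics)

@trainer.on(Events.ITERATION_COMPLETED)
def step_lr_scheduler(engine):
    # Advance the cosine annealing schedule after every batch.
    lr_scheduler.step()

@trainer.on(Events.EPOCH_COMPLETED)
def run_validation(engine):
    # Evaluate on the optional validation loader and print the metrics dict.
    evaluator.run(val_dataloader)
    print(engine.state.epoch, evaluator.state.metrics)

trainer.run(train_dataloader, max_epochs=num_epochs)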
Example #21
def test_integration():

    n_iters = 100
    batch_size = 10
    n_classes = 10
    y_true_batch_values = iter(
        np.random.randint(0, n_classes, size=(n_iters, batch_size)))
    y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes))
    loss_values = iter(range(n_iters))

    def update_fn(engine, batch):
        loss_value = next(loss_values)
        y_true_batch = next(y_true_batch_values)
        y_pred_batch = next(y_pred_batch_values)
        return loss_value, torch.from_numpy(y_pred_batch), torch.from_numpy(
            y_true_batch)

    trainer = Engine(update_fn)
    alpha = 0.98

    acc_metric = RunningAverage(
        CategoricalAccuracy(output_transform=lambda x: [x[1], x[2]]),
        alpha=alpha)
    acc_metric.attach(trainer, 'running_avg_accuracy')

    avg_output = RunningAverage(output_transform=lambda x: x[0], alpha=alpha)
    avg_output.attach(trainer, 'running_avg_output')

    running_avg_acc = [None]

    @trainer.on(Events.ITERATION_COMPLETED, running_avg_acc)
    def manual_running_avg_acc(engine, running_avg_acc):
        _, y_pred, y = engine.state.output
        indices = torch.max(y_pred, 1)[1]
        correct = torch.eq(indices, y).view(-1)
        num_correct = torch.sum(correct).item()
        num_examples = correct.shape[0]
        batch_acc = num_correct * 1.0 / num_examples
        if running_avg_acc[0] is None:
            running_avg_acc[0] = batch_acc
        else:
            running_avg_acc[0] = running_avg_acc[0] * alpha + (
                1.0 - alpha) * batch_acc
        engine.state.running_avg_acc = running_avg_acc[0]

    @trainer.on(Events.EPOCH_STARTED)
    def running_avg_output_init(engine):
        engine.state.running_avg_output = None

    @trainer.on(Events.ITERATION_COMPLETED)
    def running_avg_output_update(engine):
        if engine.state.running_avg_output is None:
            engine.state.running_avg_output = engine.state.output[0]
        else:
            engine.state.running_avg_output = engine.state.running_avg_output * alpha + \
                (1.0 - alpha) * engine.state.output[0]

    @trainer.on(Events.ITERATION_COMPLETED)
    def assert_equal_running_avg_acc_values(engine):
        assert engine.state.running_avg_acc == engine.state.metrics['running_avg_accuracy'], \
            "{} vs {}".format(engine.state.running_avg_acc, engine.state.metrics['running_avg_accuracy'])

    @trainer.on(Events.ITERATION_COMPLETED)
    def assert_equal_running_avg_output_values(engine):
        assert engine.state.running_avg_output == engine.state.metrics['running_avg_output'], \
            "{} vs {}".format(engine.state.running_avg_output, engine.state.metrics['running_avg_output'])

    np.random.seed(10)
    running_avg_acc[0] = None
    n_iters = 10
    batch_size = 10
    n_classes = 10
    data = list(range(n_iters))
    loss_values = iter(range(n_iters))
    y_true_batch_values = iter(
        np.random.randint(0, n_classes, size=(n_iters, batch_size)))
    y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes))
    trainer.run(data, max_epochs=1)

    running_avg_acc[0] = None
    n_iters = 10
    batch_size = 10
    n_classes = 10
    data = list(range(n_iters))
    loss_values = iter(range(n_iters))
    y_true_batch_values = iter(
        np.random.randint(0, n_classes, size=(n_iters, batch_size)))
    y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes))
    trainer.run(data, max_epochs=1)
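
# For reference: when RunningAverage wraps a source metric it maintains the
# exponential moving average v_t = alpha * v_{t-1} + (1 - alpha) * m_t, which
# is exactly the manual update reproduced in manual_running_avg_acc above.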
Example #22
def train(params, log, time_keeper):
    # specify dataset
    dataset = DatasetFactory.create(params)

    # specify model
    model = ModelFactory.create(params)
    model = model.to(params['device'])

    # optimizer
    optimizer = SGD(model.parameters(),
                    lr=params['TRAIN']['lr'],
                    momentum=params['TRAIN']['momentum'])

    # scheduler
    scheduler = None

    # best accuracy (precision)
    best_prec = 0

    # optionally resume from a checkpoint
    checkpoint_file = params['TRAIN']['resume']
    start_epoch, best_prec = load_checkpoint(log, model, checkpoint_file,
                                             optimizer, scheduler)

    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        F.cross_entropy,
                                        device=params['device'])

    # evaluator
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'accuracy':
                                                CategoricalAccuracy(),
                                                'cross_entropy':
                                                Loss(F.cross_entropy)
                                            },
                                            device=params['device'])
    # log details
    log_string = "\n" + "==== NET MODEL:\n" + str(model)
    log_string += "\n" + "==== OPTIMIZER:\n" + str(optimizer) + "\n"
    log.log_global(log_string)

    # end-of-iteration events
    @trainer.on(Events.ITERATION_COMPLETED)
    def on_iter(engine):
        iter_current = engine.state.iteration % len(dataset.loader['train'])
        epoch_current = engine.state.epoch
        num_iter = len(dataset.loader['train'])
        loss = engine.state.output

        # logging
        time_string = time_keeper.get_current_str()  # get current time
        log.log_iter(iter_current, epoch_current - 1, num_iter, loss,
                     time_string)

    # end-of-epoch events
    @trainer.on(Events.EPOCH_COMPLETED)
    def on_epoch(engine):
        nonlocal best_prec

        # current epoch
        epoch_current = engine.state.epoch

        # evaluation on train set
        evaluator.run(dataset.loader['train'])
        acc_train = evaluator.state.metrics['accuracy'] * 100
        loss_train = evaluator.state.metrics['cross_entropy']

        # evaluation on val set
        evaluator.run(dataset.loader['val'])
        acc_val = evaluator.state.metrics['accuracy'] * 100
        loss_val = evaluator.state.metrics['cross_entropy']

        is_best = acc_val > best_prec
        best_prec = max(acc_val, best_prec)
        save_checkpoint(
            {
                'epoch': epoch_current + 1,
                'state_dict': model.state_dict(),
                'best_prec': best_prec,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler
            }, model, params, is_best)

        # logging results
        time_string = time_keeper.get_current_str()  # get current time
        log.log_epoch(epoch_current, acc_train, loss_train, acc_val, loss_val,
                      is_best, time_string)

    time_keeper.start()
    trainer.run(dataset.loader['train'], max_epochs=params['TRAIN']['epochs'])
Example #23
def test_zero_div():
    acc = CategoricalAccuracy()
    with pytest.raises(NotComputableError):
        acc.compute()
Example #24
def test_warning():
    with pytest.warns(DeprecationWarning):
        CategoricalAccuracy()
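
# Note: the DeprecationWarning asserted above reflects that CategoricalAccuracy
# is deprecated in ignite in favour of the more general Accuracy metric.
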
def run(train_batch_size,
        val_batch_size,
        epochs,
        lr,
        momentum,
        log_interval,
        restore_from,
        crash_iteration=1000):

    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    device = 'cpu'
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        F.nll_loss,
                                        device=device)
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'accuracy':
                                                CategoricalAccuracy(),
                                                'nll': Loss(F.nll_loss)
                                            },
                                            device=device)
    # Configure logging for the engine's internal logger:
    trainer._logger.setLevel(logging.INFO)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        "%(asctime)s|%(name)s|%(levelname)s| %(message)s")
    ch.setFormatter(formatter)
    trainer._logger.addHandler(ch)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            print("Epoch[{}] Iteration[{}/{}] Loss: {:.2f}"
                  "".format(engine.state.epoch, iter, len(train_loader),
                            engine.state.output))

        if engine.state.iteration == crash_iteration:
            raise Exception("STOP at {}".format(engine.state.iteration))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        print(
            "Training Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        print(
            "Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))

    objects_to_checkpoint = {"model": model, "optimizer": optimizer}
    engine_checkpoint = EngineCheckpoint(dirname="engine_checkpoint",
                                         to_save=objects_to_checkpoint,
                                         save_interval=100)
    trainer.add_event_handler(Events.ITERATION_COMPLETED, engine_checkpoint)

    if restore_from == "":
        trainer.run(train_loader, max_epochs=epochs)
    else:
        trainer.resume(train_loader,
                       restore_from,
                       to_load=objects_to_checkpoint)
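
# Note: EngineCheckpoint and trainer.resume(...) used above are assumed to be
# project-specific extensions; core ignite ships ModelCheckpoint / Checkpoint
# handlers rather than an Engine.resume() method.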
Example #26
def main(cfg):
    model_name = get_model_name(cfg)
    model_name = randomize_name(model_name)
    print(f'Model name: {model_name}')

    dataset_train, dataset_dev = get_dataset(cfg)

    W_emb = create_word_embeddings(cfg, dataset_train.vocab)
    model_params = get_model_params(cfg, W_emb)
    model = create_model(cfg, model_params, W_emb=W_emb)

    data_loader_train = create_data_loader(dataset_train, cfg.batch_size, shuffle=True)
    data_loader_dev = create_data_loader(dataset_dev, cfg.batch_size, shuffle=False)

    model_parameters = get_trainable_parameters(model.parameters())
    optimizer = torch.optim.Adam(model_parameters, cfg.learning_rate, weight_decay=cfg.weight_decay, amsgrad=True)
    criterion = torch.nn.CrossEntropyLoss()

    def update_function(engine, batch):
        model.train()
        optimizer.zero_grad()

        (premise, hypothesis), label = to_device(batch)

        logits = model(premise, hypothesis)
        loss = criterion(logits, label)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model_parameters, cfg.max_grad_norm)
        optimizer.step()

        return loss.item()

    def inference_function(engine, batch):
        model.eval()
        with torch.no_grad():
            (premise, hypothesis), label = to_device(batch)

            logits = model(premise, hypothesis)

            return logits, label

    trainer = Engine(update_function)
    evaluator = Engine(inference_function)

    metrics = [
        ('loss', Loss(criterion)),
        ('accuracy', CategoricalAccuracy())
    ]
    for name, metric in metrics:
        metric.attach(evaluator, name)

    best_dev_acc = -np.inf

    @trainer.on(Events.EPOCH_COMPLETED)
    def eval_model(engine):
        nonlocal best_dev_acc

        def format_metric_str(metrics_values):
            metrics_str = ', '.join([
                f'{metric_name} {metrics_values[metric_name]:.3f}' for metric_name, _ in metrics
            ])
            return metrics_str

        evaluator.run(data_loader_train)
        metrics_train = evaluator.state.metrics.copy()

        evaluator.run(data_loader_dev)
        metrics_dev = evaluator.state.metrics.copy()

        print(f'Epoch {engine.state.epoch}', end=' | ')
        print('Train:', format_metric_str(metrics_train), end=' | ')
        print('Dev:', format_metric_str(metrics_dev), end=' ')
        print()

        if metrics_dev['accuracy'] > best_dev_acc:
            best_dev_acc = metrics_dev['accuracy']
            save_weights(model, cfg.models_dir.joinpath(f'{model_name}.pt'))

    # save models specifications
    create_dirs(cfg)
    model_spec = dict(model_name=model_name, model_params=model_params, vocab=dataset_train.vocab, cfg=cfg)
    save_pickle(model_spec, cfg.models_dir.joinpath(f'{model_name}.pkl'))

    trainer.run(data_loader_train, max_epochs=cfg.nb_epochs)

    print(f'Best dev accuracy: {best_dev_acc:.3f}')
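
# A partial sketch of the cfg object consumed by main() above; only attributes
# read directly in that function are listed, with placeholder values (helpers
# such as get_dataset and create_model read further fields).
from pathlib import Path
from types import SimpleNamespace

example_cfg = SimpleNamespace(batch_size=32,
                              learning_rate=1e-3,
                              weight_decay=0.0,
                              max_grad_norm=5.0,
                              nb_epochs=10,
                              models_dir=Path("./models"))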
Example #27
def run(path, model_name, imgaugs,
        train_batch_size, val_batch_size, num_workers,
        epochs, optim,
        lr, lr_update_every, gamma, restart_every, restart_factor, init_lr_factor,
        lr_reduce_patience, early_stop_patience,
        log_interval, output, debug):

    # Polyaxon
    exp = Experiment()
    exp.log_params(seed=SEED)

    print("--- Cifar10 Playground : Training --- ")

    from datetime import datetime
    now = datetime.now()
    log_dir = os.path.join(output, "training_{}_{}".format(model_name, now.strftime("%Y%m%d_%H%M")))
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    log_level = logging.INFO
    if debug:
        log_level = logging.DEBUG
        print("Activated debug mode")

    logger = logging.getLogger("Cifar10 Playground: Train")
    setup_logger(logger, log_dir, log_level)

    logger.debug("Setup tensorboard writer")
    writer = SummaryWriter(log_dir=os.path.join(log_dir, "tensorboard"))

    save_conf(logger, writer, model_name, imgaugs,
              train_batch_size, val_batch_size, num_workers,
              epochs, optim,
              lr, lr_update_every, gamma, restart_every, restart_factor, init_lr_factor,
              lr_reduce_patience, early_stop_patience,
              log_dir)

    # Polyaxon
    # log config
    exp.log_params(
        model_name=model_name,
        imgaugs=imgaugs,
        train_batch_size=train_batch_size,
        val_batch_size=val_batch_size, num_workers=num_workers,
        num_epochs=epochs, optimizer=optim,
        lr=lr, lr_update_every=lr_update_every,
        gamma=gamma, restart_every=restart_every,
        restart_factor=restart_factor, init_lr_factor=init_lr_factor,
        lr_reduce_patience=lr_reduce_patience, early_stop_patience=early_stop_patience
    )

    device = 'cpu'
    if torch.cuda.is_available():
        logger.debug("CUDA is enabled")
        from torch.backends import cudnn
        cudnn.benchmark = True
        device = 'cuda'

    # Polyaxon
    exp.log_params(device=device)

    logger.debug("Setup model: {}".format(model_name))

    if not os.path.isfile(model_name):
        assert model_name in MODEL_MAP, "Model name not in {}".format(MODEL_MAP.keys())
        model = MODEL_MAP[model_name](num_classes=10)
    else:
        model = torch.load(model_name)

    model_name = model.__class__.__name__
    if 'cuda' in device:
        model = model.to(device)

    logger.debug("Setup train/val dataloaders")
    train_loader, val_loader = get_data_loaders(path, imgaugs, train_batch_size, val_batch_size,
                                                num_workers, device=device)

    write_model_graph(writer, model, train_loader, device=device)

    logger.debug("Setup optimizer")
    assert optim in OPTIMIZER_MAP, "Optimizer name not in {}".format(OPTIMIZER_MAP.keys())
    optimizer = OPTIMIZER_MAP[optim](model.parameters(), lr=lr)

    logger.debug("Setup criterion")
    criterion = nn.CrossEntropyLoss()
    if 'cuda' in device:
        criterion = criterion.cuda()

    lr_scheduler = ExponentialLR(optimizer, gamma=gamma)
    lr_scheduler_restarts = LRSchedulerWithRestart(lr_scheduler,
                                                   restart_every=restart_every,
                                                   restart_factor=restart_factor,
                                                   init_lr_factor=init_lr_factor)
    reduce_on_plateau = ReduceLROnPlateau(optimizer, mode='min', factor=0.1,
                                          patience=lr_reduce_patience,
                                          threshold=0.01, verbose=True)

    logger.debug("Setup ignite trainer and evaluator")
    trainer = create_supervised_trainer(model, optimizer, criterion, device=device)

    metrics = {
        'accuracy': CategoricalAccuracy(),
        'precision': Precision(),
        'recall': Recall(),
        'nll': Loss(criterion)
    }
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    val_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)

    logger.debug("Setup handlers")
    # Setup timer to measure training time
    timer = Timer(average=True)
    timer.attach(trainer,
                 start=Events.EPOCH_STARTED,
                 resume=Events.ITERATION_STARTED,
                 pause=Events.ITERATION_COMPLETED)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            logger.info("Epoch[{}] Iteration[{}/{}] Loss: {:.4f}".format(engine.state.epoch, iter,
                                                                         len(train_loader),
                                                                         engine.state.output))

            writer.add_scalar("training/loss_vs_iterations", engine.state.output, engine.state.iteration)

    @trainer.on(Events.EPOCH_STARTED)
    def update_lr_schedulers(engine):
        if (engine.state.epoch - 1) % lr_update_every == 0:
            lr_scheduler_restarts.step()

    @trainer.on(Events.EPOCH_STARTED)
    def log_lrs(engine):
        if len(optimizer.param_groups) == 1:
            lr = float(optimizer.param_groups[0]['lr'])
            writer.add_scalar("learning_rate", lr, engine.state.epoch)
            logger.debug("Learning rate: {}".format(lr))
        else:
            for i, param_group in enumerate(optimizer.param_groups):
                lr = float(param_group['lr'])
                logger.debug("Learning rate (group {}): {}".format(i, lr))
                writer.add_scalar("learning_rate_group_{}".format(i), lr, engine.state.epoch)

    log_images_dir = os.path.join(log_dir, "figures")
    os.makedirs(log_images_dir)

    def log_precision_recall_results(metrics, epoch, mode):
        for metric_name in ['precision', 'recall']:
            value = metrics[metric_name]
            avg_value = torch.mean(value).item()
            writer.add_scalar("{}/avg_{}".format(mode, metric_name), avg_value, epoch)

            kwargs = {"{}_avg_{}".format(mode, metric_name): avg_value}
            # Polyaxon
            exp.log_metrics(step=epoch, **kwargs)

            # Save metric per class figure
            sorted_values = value.to('cpu').numpy()
            indices = np.argsort(sorted_values)
            sorted_values = sorted_values[indices]
            n_classes = len(sorted_values)
            classes = np.array(["class_{}".format(i) for i in range(n_classes)])
            sorted_classes = classes[indices]
            fig = create_fig_param_per_class(sorted_values, metric_name, classes=sorted_classes, n_classes_per_fig=20)
            fname = os.path.join(log_images_dir, "{}_{}_{}_per_class.png".format(mode, epoch, metric_name))
            fig.savefig(fname)
            tag = "{}_{}".format(mode, metric_name)
            writer.add_figure(tag, fig, epoch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_metrics(engine):
        epoch = engine.state.epoch
        logger.info("One epoch training time (seconds): {}".format(timer.value()))
        metrics = train_evaluator.run(train_loader).metrics
        logger.info("Training Results - Epoch: {}  Avg accuracy: {:.4f} Avg loss: {:.4f}"
                    .format(engine.state.epoch, metrics['accuracy'], metrics['nll']))
        writer.add_scalar("training/avg_accuracy", metrics['accuracy'], epoch)
        writer.add_scalar("training/avg_error", 1.0 - metrics['accuracy'], epoch)
        writer.add_scalar("training/avg_loss", metrics['nll'], epoch)

        kwargs = {
            "training_avg_accuracy": metrics['accuracy'],
            "training_avg_error": 1.0 - metrics['accuracy'],
            "training_avg_loss": metrics['nll'],
        }
        # Polyaxon
        exp.log_metrics(step=epoch, **kwargs)

        log_precision_recall_results(metrics, epoch, "training")

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        epoch = engine.state.epoch
        metrics = val_evaluator.run(val_loader).metrics
        writer.add_scalar("validation/avg_loss", metrics['nll'], epoch)
        writer.add_scalar("validation/avg_accuracy", metrics['accuracy'], epoch)
        writer.add_scalar("validation/avg_error", 1.0 - metrics['accuracy'], epoch)

        kwargs = {
            "validation_avg_accuracy": metrics['accuracy'],
            "validation_avg_error": 1.0 - metrics['accuracy'],
            "validation_avg_loss": metrics['nll'],
        }
        # Polyaxon
        exp.log_metrics(step=epoch, **kwargs)

        logger.info("Validation Results - Epoch: {}  Avg accuracy: {:.4f} Avg loss: {:.4f}"
                    .format(engine.state.epoch, metrics['accuracy'], metrics['nll']))
        log_precision_recall_results(metrics, epoch, "validation")

    @val_evaluator.on(Events.COMPLETED)
    def update_reduce_on_plateau(engine):
        val_loss = engine.state.metrics['nll']
        reduce_on_plateau.step(val_loss)

    def score_function(engine):
        val_loss = engine.state.metrics['nll']
        # Objects with highest scores will be retained.
        return -val_loss

    # Setup early stopping:
    handler = EarlyStopping(patience=early_stop_patience, score_function=score_function, trainer=trainer)
    setup_logger(handler._logger, log_dir, log_level)
    val_evaluator.add_event_handler(Events.COMPLETED, handler)

    # Setup model checkpoint:
    best_model_saver = ModelCheckpoint(log_dir,
                                       filename_prefix="model",
                                       score_name="val_loss",
                                       score_function=score_function,
                                       n_saved=5,
                                       atomic=True,
                                       create_dir=True)
    val_evaluator.add_event_handler(Events.COMPLETED, best_model_saver, {model_name: model})

    last_model_saver = ModelCheckpoint(log_dir,
                                       filename_prefix="checkpoint",
                                       save_interval=1,
                                       n_saved=1,
                                       atomic=True,
                                       create_dir=True)
    trainer.add_event_handler(Events.COMPLETED, last_model_saver, {model_name: model})

    logger.info("Start training: {} epochs".format(epochs))
    try:
        trainer.run(train_loader, max_epochs=epochs)
    except KeyboardInterrupt:
        logger.info("Catched KeyboardInterrupt -> exit")
    except Exception as e:  # noqa
        logger.exception("")
        if debug:
            try:
                # open an ipython shell if possible
                import IPython
                IPython.embed()  # noqa
            except ImportError:
                print("Failed to start IPython console")

    logger.debug("Training is ended")
    writer.close()
Example #28
import torch.nn.functional as F
from ignite.metrics import CategoricalAccuracy, Loss, MeanAbsoluteError

from attributer.attributes import FaceAttributes
from training.metric_utils import ScaledError

_metrics = {
    FaceAttributes.AGE: ScaledError(MeanAbsoluteError(), 50),
    FaceAttributes.GENDER: CategoricalAccuracy(),
    FaceAttributes.EYEGLASSES: CategoricalAccuracy(),
    FaceAttributes.RECEDING_HAIRLINES: CategoricalAccuracy(),
    FaceAttributes.SMILING: CategoricalAccuracy(),
    FaceAttributes.HEAD_YAW_BIN: CategoricalAccuracy(),
    FaceAttributes.HEAD_PITCH_BIN: CategoricalAccuracy(),
    FaceAttributes.HEAD_ROLL_BIN: CategoricalAccuracy(),
    FaceAttributes.HEAD_YAW: MeanAbsoluteError(),
    FaceAttributes.HEAD_PITCH: MeanAbsoluteError(),
    FaceAttributes.HEAD_ROLL: MeanAbsoluteError(),
}
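
# ScaledError (from training.metric_utils) is assumed to rescale the wrapped
# MeanAbsoluteError by the factor 50, e.g. reporting the age error in years
# when the labels were normalised; this is an assumption, not documented here.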

_losses = {
    FaceAttributes.AGE: F.l1_loss,
    FaceAttributes.GENDER: F.cross_entropy,
    FaceAttributes.EYEGLASSES: F.cross_entropy,
    FaceAttributes.RECEDING_HAIRLINES: F.cross_entropy,
    FaceAttributes.SMILING: F.cross_entropy,
    FaceAttributes.HEAD_YAW_BIN: F.cross_entropy,
    FaceAttributes.HEAD_PITCH_BIN: F.cross_entropy,
    FaceAttributes.HEAD_ROLL_BIN: F.cross_entropy,
    FaceAttributes.HEAD_YAW: F.l1_loss,
    FaceAttributes.HEAD_PITCH: F.l1_loss,
    FaceAttributes.HEAD_ROLL: F.l1_loss,
}
Example #29
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval):
    vis = visdom.Visdom()

    # if not vis.check_connection():
    #     raise RuntimeError("Visdom server not running. Please run python -m visdom.server")

    train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size)
    model = Net()
    device = 'cpu'

    if torch.cuda.is_available():
        device = 'cuda'

    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    trainer = create_supervised_trainer(model, optimizer, F.nll_loss, device=device)
    evaluator = create_supervised_evaluator(model,
                                            metrics={'accuracy': CategoricalAccuracy(),
                                                     'nll': Loss(F.nll_loss)},
                                            device=device)

    train_loss_window = create_plot_window(vis, '#Iterations', 'Loss', 'Training Loss')
    train_avg_loss_window = create_plot_window(vis, '#Iterations', 'Loss', 'Training Average Loss')
    train_avg_accuracy_window = create_plot_window(vis, '#Iterations', 'Accuracy', 'Training Average Accuracy')
    val_avg_loss_window = create_plot_window(vis, '#Epochs', 'Loss', 'Validation Average Loss')
    val_avg_accuracy_window = create_plot_window(vis, '#Epochs', 'Accuracy', 'Validation Average Accuracy')

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            print("Epoch[{}] Iteration[{}/{}] Loss: {:.2f}"
                  "".format(engine.state.epoch, iter, len(train_loader), engine.state.output))
            vis.line(X=np.array([engine.state.iteration]),
                     Y=np.array([engine.state.output]),
                     update='append', win=train_loss_window)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        print("Training Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
              .format(engine.state.epoch, avg_accuracy, avg_nll))
        vis.line(X=np.array([engine.state.epoch]), Y=np.array([avg_accuracy]),
                 win=train_avg_accuracy_window, update='append')
        vis.line(X=np.array([engine.state.epoch]), Y=np.array([avg_nll]),
                 win=train_avg_loss_window, update='append')

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        print("Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
              .format(engine.state.epoch, avg_accuracy, avg_nll))
        vis.line(X=np.array([engine.state.epoch]), Y=np.array([avg_accuracy]),
                 win=val_avg_accuracy_window, update='append')
        vis.line(X=np.array([engine.state.epoch]), Y=np.array([avg_nll]),
                 win=val_avg_loss_window, update='append')

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)
Example #30
def run(config_file):
    print("--- iMaterialist 2018 : Training --- ")

    print("Load config file ... ")
    config = load_config(config_file)

    seed = config.get("SEED", 2018)
    random.seed(seed)
    torch.manual_seed(seed)

    output = Path(config["OUTPUT_PATH"])
    debug = config.get("DEBUG", False)

    from datetime import datetime
    now = datetime.now()
    log_dir = output / ("{}".format(Path(config_file).stem)) / "{}".format(
        now.strftime("%Y%m%d_%H%M"))
    assert not log_dir.exists(), \
        "Output logging directory '{}' already existing".format(log_dir)
    log_dir.mkdir(parents=True)

    shutil.copyfile(config_file, (log_dir / Path(config_file).name).as_posix())

    log_level = logging.INFO
    if debug:
        log_level = logging.DEBUG
        print("Activated debug mode")

    logger = logging.getLogger("iMaterialist 2018: Train")
    setup_logger(logger, (log_dir / "train.log").as_posix(), log_level)

    logger.debug("Setup tensorboard writer")
    writer = SummaryWriter(log_dir=(log_dir / "tensorboard").as_posix())

    save_conf(config_file, log_dir.as_posix(), logger, writer)

    model = config["MODEL"]
    model_name = model.__class__.__name__

    device = config.get("DEVICE", 'cuda')
    if 'cuda' in device:
        assert torch.cuda.is_available(), \
            "Device {} is not compatible with torch.cuda.is_available()".format(device)
        from torch.backends import cudnn
        cudnn.benchmark = True
        logger.debug("CUDA is enabled")
        model = model.to(device)

    logger.debug("Setup train/val dataloaders")
    train_loader, val_loader = config["TRAIN_LOADER"], config["VAL_LOADER"]

    # Setup training subset to run evaluation on:
    indices = np.arange(len(train_loader.sampler))
    np.random.shuffle(indices)
    indices = indices[:len(val_loader.sampler)] if len(
        val_loader.sampler) < len(train_loader.sampler) else indices
    train_eval_loader = get_train_eval_data_loader(train_loader, indices)

    logger.debug(
        "- train data loader: {} batches | {} samples".format(
            len(train_loader), len(train_loader.sampler)))
    logger.debug(
        "- train eval data loader: {} batches | {} samples".format(
            len(train_eval_loader), len(train_eval_loader.sampler)))
    logger.debug(
        "- validation data loader: {} batches | {} samples".format(
            len(val_loader), len(val_loader.sampler)))

    # write_model_graph(writer, model=model, data_loader=train_loader, device=device)

    optimizer = config["OPTIM"]

    logger.debug("Setup criterion")
    criterion = config["CRITERION"]
    if "cuda" in device and isinstance(criterion, nn.Module):
        criterion = criterion.to(device)

    lr_schedulers = config.get("LR_SCHEDULERS")

    logger.debug("Setup ignite trainer and evaluator")
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)

    metrics = {
        'accuracy': CategoricalAccuracy(),
        'precision': Precision(),
        'recall': Recall(),
        'nll': Loss(criterion)
    }
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device)

    val_metrics = {
        'accuracy': CategoricalAccuracy(),
        'precision': Precision(),
        'recall': Recall(),
        'nll': Loss(nn.CrossEntropyLoss())
    }
    val_evaluator = create_supervised_evaluator(model,
                                                metrics=val_metrics,
                                                device=device)

    logger.debug("Setup handlers")
    log_interval = config.get("LOG_INTERVAL", 100)
    reduce_on_plateau = config.get("REDUCE_LR_ON_PLATEAU")

    # Setup timer to measure training time
    timer = Timer(average=True)
    timer.attach(trainer,
                 start=Events.EPOCH_STARTED,
                 resume=Events.ITERATION_STARTED,
                 pause=Events.ITERATION_COMPLETED)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            logger.info("Epoch[{}] Iteration[{}/{}] Loss: {:.4f}".format(
                engine.state.epoch, iter, len(train_loader),
                engine.state.output))

            writer.add_scalar("training/loss_vs_iterations",
                              engine.state.output, engine.state.iteration)

    @trainer.on(Events.EPOCH_STARTED)
    def update_lr_schedulers(engine):
        if lr_schedulers is not None:
            for lr_scheduler in lr_schedulers:
                lr_scheduler.step()

    @trainer.on(Events.EPOCH_STARTED)
    def log_lrs(engine):
        if len(optimizer.param_groups) == 1:
            lr = float(optimizer.param_groups[0]['lr'])
            writer.add_scalar("learning_rate", lr, engine.state.epoch)
            logger.debug("Learning rate: {}".format(lr))
        else:
            for i, param_group in enumerate(optimizer.param_groups):
                lr = float(param_group['lr'])
                logger.debug("Learning rate (group {}): {}".format(i, lr))
                writer.add_scalar("learning_rate_group_{}".format(i), lr,
                                  engine.state.epoch)

    log_images_dir = log_dir / "figures"
    log_images_dir.mkdir(parents=True)

    def log_precision_recall_results(metrics, epoch, mode):
        for metric_name in ['precision', 'recall']:
            value = metrics[metric_name]
            avg_value = torch.mean(value).item()
            writer.add_scalar("{}/avg_{}".format(mode, metric_name), avg_value,
                              epoch)
            # Save metric per class figure
            sorted_values = value.to('cpu').numpy()
            indices = np.argsort(sorted_values)
            sorted_values = sorted_values[indices]
            n_classes = len(sorted_values)
            classes = np.array(
                ["class_{}".format(i) for i in range(n_classes)])
            sorted_classes = classes[indices]
            fig = create_fig_param_per_class(sorted_values,
                                             metric_name,
                                             classes=sorted_classes,
                                             n_classes_per_fig=20)
            fname = log_images_dir / ("{}_{}_{}_per_class.png".format(
                mode, epoch, metric_name))
            fig.savefig(fname.as_posix())
            # Add figure in TB
            img = Image.open(fname.as_posix())
            tag = "{}_{}".format(mode, metric_name)
            writer.add_image(tag, np.asarray(img), epoch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_metrics(engine):
        epoch = engine.state.epoch
        logger.info("One epoch training time (seconds): {}".format(
            timer.value()))
        metrics = train_evaluator.run(train_eval_loader).metrics
        logger.info(
            "Training Results - Epoch: {}  Avg accuracy: {:.4f} Avg loss: {:.4f}"
            .format(engine.state.epoch, metrics['accuracy'], metrics['nll']))
        writer.add_scalar("training/avg_accuracy", metrics['accuracy'], epoch)
        writer.add_scalar("training/avg_error", 1.0 - metrics['accuracy'],
                          epoch)
        writer.add_scalar("training/avg_loss", metrics['nll'], epoch)
        log_precision_recall_results(metrics, epoch, "training")

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        epoch = engine.state.epoch
        metrics = val_evaluator.run(val_loader).metrics
        writer.add_scalar("validation/avg_loss", metrics['nll'], epoch)
        writer.add_scalar("validation/avg_accuracy", metrics['accuracy'],
                          epoch)
        writer.add_scalar("validation/avg_error", 1.0 - metrics['accuracy'],
                          epoch)
        logger.info(
            "Validation Results - Epoch: {}  Avg accuracy: {:.4f} Avg loss: {:.4f}"
            .format(engine.state.epoch, metrics['accuracy'], metrics['nll']))
        log_precision_recall_results(metrics, epoch, "validation")

    if reduce_on_plateau is not None:

        @val_evaluator.on(Events.COMPLETED)
        def update_reduce_on_plateau(engine):
            val_loss = engine.state.metrics['nll']
            reduce_on_plateau.step(val_loss)

    def score_function(engine):
        val_loss = engine.state.metrics['nll']
        # Objects with highest scores will be retained.
        return -val_loss

    # Setup early stopping:
    if "EARLY_STOPPING_KWARGS" in config:
        kwargs = config["EARLY_STOPPING_KWARGS"]
        if 'score_function' not in kwargs:
            kwargs['score_function'] = score_function
        handler = EarlyStopping(trainer=trainer, **kwargs)
        setup_logger(handler._logger, (log_dir / "train.log").as_posix(),
                     log_level)
        val_evaluator.add_event_handler(Events.COMPLETED, handler)

    # Setup model checkpoint:
    best_model_saver = ModelCheckpoint(log_dir.as_posix(),
                                       filename_prefix="model",
                                       score_name="val_loss",
                                       score_function=score_function,
                                       n_saved=5,
                                       atomic=True,
                                       create_dir=True)
    val_evaluator.add_event_handler(Events.COMPLETED, best_model_saver,
                                    {model_name: model})

    last_model_saver = ModelCheckpoint(log_dir.as_posix(),
                                       filename_prefix="checkpoint",
                                       save_interval=1,
                                       n_saved=1,
                                       atomic=True,
                                       create_dir=True)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, last_model_saver,
                              {model_name: model})

    # Setup custom event handlers:
    for (event, handler) in config["TRAINER_CUSTOM_EVENT_HANDLERS"]:
        trainer.add_event_handler(event, handler, val_evaluator, logger)

    for (event, handler) in config["EVALUATOR_CUSTOM_EVENT_HANDLERS"]:
        val_evaluator.add_event_handler(event, handler, trainer, logger)

    n_epochs = config["N_EPOCHS"]
    logger.info("Start training: {} epochs".format(n_epochs))
    try:
        trainer.run(train_loader, max_epochs=n_epochs)
    except KeyboardInterrupt:
        logger.info("Catched KeyboardInterrupt -> exit")
    except Exception as e:  # noqa
        logger.exception("")
        if debug:
            try:
                # open an ipython shell if possible
                import IPython
                IPython.embed()  # noqa
            except ImportError:
                print("Failed to start IPython console")

    logger.debug("Training is ended")
    writer.close()