Example #1
def eval_acc(inference, val_loader, ctx, return_meta=False):
    mtc_acc = Accuracy()
    mtc_acc.reset()

    feature_nest, y_nest, y_hat_nest = [], [], []
    for X, y in val_loader:
        X = X.as_in_context(ctx[0])
        y = y.as_in_context(ctx[0])
        with autograd.record(train_mode=False):
            y_hat, features = inference(X)

        # update metric
        mtc_acc.update([y], [y_hat])

        if return_meta:
            y_nest.extend(y.asnumpy())
            feature_nest.extend(features.asnumpy())
            y_hat_nest.extend(y_hat.asnumpy())

    feature_nest = np.array(feature_nest)
    y_nest = np.array(y_nest)
    y_hat_nest = np.array(y_hat_nest)

    if return_meta:
        return mtc_acc.get()[1], y_nest, y_hat_nest, feature_nest

    return mtc_acc.get()[1]
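For reference, a minimal sketch of how the Accuracy metric used above scores a batch of class scores against integer labels (the numbers are made up for illustration):

import mxnet as mx

acc = mx.metric.Accuracy()
labels = mx.nd.array([1, 0, 2])
logits = mx.nd.array([[0.1, 0.8, 0.1],    # argmax = 1, correct
                      [0.7, 0.2, 0.1],    # argmax = 0, correct
                      [0.2, 0.5, 0.3]])   # argmax = 1, wrong
acc.update([labels], [logits])
print(acc.get())  # ('accuracy', 0.666...)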
Example #2
    def eval_epoch(self):
        self.is_train = False
        meter = Accuracy()
        meter.reset()

        for X, y in self.test_loader:
            X = X.as_in_context(self.ctx[0])
            y = y.as_in_context(self.ctx[0])

            y_hat, features = self.net(X)
            meter.update([y], [y_hat])

        acc = meter.get()[1]
        logging.info('Test  - Epoch {}, Iter {}, Acc {:.2f} %'.format(
            self.cur_epoch, self.cur_iter, acc * 100))

        if acc > self.eval_tracker['Acc']:
            self.eval_tracker.update({
                'Epoch': self.cur_epoch,
                'Iter': self.cur_iter,
                'Acc': acc
            })

        self.net.save_parameters('{}_{}_{}_{:.2f}.params'.format(
            self.cfg.META.CKPT_PATH, self.cur_epoch, self.cur_iter, acc))
Example #3
    def eval(self, inference, val_loader, log=True, target=True, epoch=True):
        """
        Evaluate the model
        :param inference: network
        :param val_loader: data loader
        :param log: log flag
        :param target: target flag for updating the record and log
        :param epoch: epoch flag for updating the record and log
        :return:
        """
        mtc_acc = Accuracy()
        mtc_acc.reset()
        # val_loader.reset()

        feature_nest, y_nest, y_hat_nest = [], [], []
        for X, Y in val_loader:
            X_lst = split_and_load(X, self.args.ctx, even_split=False)
            Y_lst = split_and_load(Y, self.args.ctx, even_split=False)

            for x, y in zip(X_lst, Y_lst):
                y_hat, features = inference(x)
                # update metric
                mtc_acc.update([y], [y_hat])

                y_nest.extend(y.asnumpy())
                feature_nest.extend(features.asnumpy())
                y_hat_nest.extend(y_hat.asnumpy())

        feature_nest = np.array(feature_nest)
        y_nest = np.array(y_nest).astype(int)
        y_hat_nest = np.array(y_hat_nest)

        if log:
            target_key = 'Tgt' if target else 'Src'
            epoch_key = 'Epoch' if epoch else 'Iter'
            record = self.cur_epoch if epoch else self.cur_iter

            if mtc_acc.get()[1] > self.records[epoch_key]['%s-Acc' %
                                                          target_key]:
                if target:
                    self.records[epoch_key][epoch_key] = record
                self.records[epoch_key]['%s-Acc' %
                                        target_key] = mtc_acc.get()[1]
                self.records[epoch_key]['%s-label' % target_key] = y_nest
                self.records[epoch_key]['%s-preds' % target_key] = y_hat_nest
                self.records[epoch_key]['%s-features' %
                                        target_key] = feature_nest

                self.save_params(inference, 0, epoch_key)

            self.logger.update_scalar(
                '%s [%d]: Eval-Acc-%s' % (epoch_key, record, target_key),
                mtc_acc.get()[1])
            if self.sw:
                self.sw.add_scalar('Acc/Eval-%s-Acc-%s' % (epoch_key, target_key),
                                   mtc_acc.get()[1],
                                   global_step=record)

        return mtc_acc.get()[1], y_nest, y_hat_nest, feature_nest
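The method assumes a pre-initialised self.records dictionary keyed by 'Epoch' and 'Iter'. A hedged sketch of the minimal structure it relies on (inferred from the keys accessed above, not taken from the original code):

records = {
    'Epoch': {'Epoch': 0, 'Tgt-Acc': 0.0, 'Src-Acc': 0.0},
    'Iter': {'Iter': 0, 'Tgt-Acc': 0.0, 'Src-Acc': 0.0},
}
# When accuracy improves, eval() also stores '<Tgt|Src>-label', '<Tgt|Src>-preds'
# and '<Tgt|Src>-features' arrays under the corresponding key.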
Example #4
def validate(net, val_loader, gpu_id, train_index2words, val_index2words):
    metric = BleuMetric(pred_index2words=train_index2words,
                        label_index2words=val_index2words)
    metric_acc = Accuracy()
    metric_acc.reset()
    metric.reset()
    for batch in tqdm.tqdm(val_loader):
        batch = [x.as_in_context(mx.gpu(gpu_id)) for x in batch]
        image, label, label_len = batch
        predictions, alphas = net(image, None, None)
        for n, l in enumerate(label_len):
            l = int(l.asscalar())
            la = label[n, 1:l]
            pred = predictions[n, :]
            metric.update(la, pred)
            metric_acc.update(la, predictions[n, :(l - 1)])
    return metric.get()[1], metric_acc.get()[1]
Example #5
def validate(net, val_loader, gpu_id, train_index2words, val_index2words):
    metric = BleuMetric(pred_index2words=train_index2words,
                        label_index2words=val_index2words)
    metric_acc = Accuracy()
    metric_acc.reset()
    metric.reset()
    for batch in tqdm.tqdm(val_loader):
        batch = [Variable(torch.from_numpy(x.asnumpy()).cuda()) for x in batch]
        image, label, label_len = batch
        label = label.long()
        label_len = label_len.long()
        predictions, alphas = net(image, None, None)
        for n, l in enumerate(label_len):
            l = int(l.data.cpu().numpy().squeeze().tolist())
            la = label[n, 1:l].data.cpu().numpy()
            pred = predictions[n, :].data.cpu().numpy()
            metric.update(la, pred)
            metric_acc.update(
                mx.nd.array(la),
                mx.nd.array(predictions[n, :(l - 1)].data.cpu().numpy()))
    return metric.get()[1], metric_acc.get()[1]
Example #6
                # computes softmax cross entropy loss
                l = loss_fn(z, y)
                output.append(z)
                losses.append(l)
        # backpropagate the error for one iteration.
        for l in losses:
            l.backward()
        # Update network weights
        trainer.step(BATCH_SIZE)
        # Update metric
        metric.update(label, output)
    str1 = 'Epoch [{}], Accuracy {:.4f}'.format(epoch, metric.get()[1])
    str2 = '~Samples/Sec {:.4f}'.format(BATCH_SIZE * (i + 1) /
                                        (time.time() - tick_0))
    print('%s  %s' % (str1, str2))
    metric.reset()

elapsed = time.perf_counter() - start
print('elapsed: {:0.3f}'.format(elapsed))

# use Accuracy as the evaluation metric
metric = Accuracy()
for data, label in test_data:
    data = split_and_load(data, ctx_list=ctx, batch_axis=0)
    label = split_and_load(label, ctx_list=ctx, batch_axis=0)
    outputs = []
    for x in data:
        outputs.append(model(x))
    metric.update(label, outputs)
print('validation %s=%f' % metric.get())
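The final validation loop can be packaged as a self-contained helper; a sketch assuming `model` is a Gluon block and `test_data` yields (data, label) batches:

from mxnet.gluon.utils import split_and_load
from mxnet.metric import Accuracy

def evaluate(model, test_data, ctx):
    metric = Accuracy()
    for data, label in test_data:
        data = split_and_load(data, ctx_list=ctx, batch_axis=0)
        label = split_and_load(label, ctx_list=ctx, batch_axis=0)
        outputs = [model(x) for x in data]
        metric.update(label, outputs)
    return metric.get()  # ('accuracy', value)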
Example #7
def main():
    epochs = 32
    gpu_id = 7
    ctx_list = [mx.gpu(x) for x in [7, 8]]
    log_interval = 100
    batch_size = 32
    start_epoch = 0
    # trainer_resume = resume + ".states" if resume is not None else None
    trainer_resume = None

    resume = None
    from mxnet.gluon.data.vision import transforms
    transform_fn = transforms.Compose([
        LeftTopPad(dest_shape=(256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406),
                             std=(0.229, 0.224, 0.225))
    ])
    dataset = CaptionDataSet(
        image_root="/data3/zyx/yks/coco2017/train2017",
        annotation_path=
        "/data3/zyx/yks/coco2017/annotations/captions_train2017.json",
        transforms=transform_fn,
        feature_hdf5="output/train2017.h5")
    val_dataset = CaptionDataSet(
        image_root="/data3/zyx/yks/coco2017/val2017",
        annotation_path=
        "/data3/zyx/yks/coco2017/annotations/captions_val2017.json",
        words2index=dataset.words2index,
        index2words=dataset.index2words,
        transforms=transform_fn,
        feature_hdf5="output/val2017.h5")
    dataloader = DataLoader(dataset=dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=1,
                            pin_memory=True,
                            last_batch="discard")
    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=1,
                            pin_memory=True)

    num_words = dataset.words_count

    # set up logger
    save_prefix = "output/res50_"
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)

    net = EncoderDecoder(num_words=num_words, test_max_len=val_dataset.max_len)
    if resume is not None:
        net.collect_params().load(resume,
                                  allow_missing=True,
                                  ignore_extra=True)
        logger.info("Resumed form checkpoint {}.".format(resume))
    params = net.collect_params()
    for key in params.keys():
        if params[key]._data is not None:
            continue
        else:
            if "bias" in key or "mean" in key or "beta" in key:
                params[key].initialize(init=mx.init.Zero())
                logging.info("initialized {} using Zero.".format(key))
            elif "weight" in key:
                params[key].initialize(init=mx.init.Normal())
                logging.info("initialized {} using Normal.".format(key))
            elif "var" in key or "gamma" in key:
                params[key].initialize(init=mx.init.One())
                logging.info("initialized {} using One.".format(key))
            else:
                params[key].initialize(init=mx.init.Normal())
                logging.info("initialized {} using Normal.".format(key))

    net.collect_params().reset_ctx(ctx=ctx_list)
    trainer = mx.gluon.Trainer(
        net.collect_params(),
        'adam',
        {
            'learning_rate': 4e-4,
            'clip_gradient': 5,
            'multi_precision': True
        },
    )
    if trainer_resume is not None:
        trainer.load_states(trainer_resume)
        logger.info(
            "Loaded trainer states form checkpoint {}.".format(trainer_resume))
    criterion = Criterion()
    accu_top3_metric = TopKAccuracy(top_k=3)
    accu_top1_metric = Accuracy(name="batch_accu")
    ctc_loss_metric = Loss(name="ctc_loss")
    alpha_metric = Loss(name="alpha_loss")
    batch_bleu = BleuMetric(name="batch_bleu",
                            pred_index2words=dataset.index2words,
                            label_index2words=dataset.index2words)
    epoch_bleu = BleuMetric(name="epoch_bleu",
                            pred_index2words=dataset.index2words,
                            label_index2words=dataset.index2words)
    btic = time.time()
    logger.info(batch_size)
    logger.info(num_words)
    logger.info(len(dataset.words2index))
    logger.info(len(dataset.index2words))
    logger.info(dataset.words2index["<PAD>"])
    logger.info(val_dataset.words2index["<PAD>"])
    logger.info(len(val_dataset.words2index))
    # net.hybridize(static_alloc=True, static_shape=True)
    net_parallel = DataParallelModel(net, ctx_list=ctx_list, sync=True)
    for nepoch in range(start_epoch, epochs):
        if nepoch > 15:
            trainer.set_learning_rate(4e-5)
        logger.info("Current lr: {}".format(trainer.learning_rate))
        accu_top1_metric.reset()
        accu_top3_metric.reset()
        ctc_loss_metric.reset()
        alpha_metric.reset()
        epoch_bleu.reset()
        batch_bleu.reset()
        for nbatch, batch in enumerate(tqdm.tqdm(dataloader)):
            batch = [mx.gluon.utils.split_and_load(x, ctx_list) for x in batch]
            inputs = [[x[n] for x in batch] for n, _ in enumerate(ctx_list)]
            losses = []
            with ag.record():
                net_parallel.sync = nbatch > 1
                outputs = net_parallel(*inputs)
                for s_batch, s_outputs in zip(inputs, outputs):
                    image, label, label_len = s_batch
                    predictions, alphas = s_outputs
                    ctc_loss = criterion(predictions, label, label_len)
                    loss2 = 1.0 * ((1. - alphas.sum(axis=1))**2).mean()
                    losses.extend([ctc_loss, loss2])
            ag.backward(losses)
            trainer.step(batch_size=batch_size, ignore_stale_grad=True)
            for n, l in enumerate(label_len):
                l = int(l.asscalar())
                la = label[n, 1:l]
                pred = predictions[n, :(l - 1)]
                accu_top3_metric.update(la, pred)
                accu_top1_metric.update(la, pred)
                epoch_bleu.update(la, predictions[n, :])
                batch_bleu.update(la, predictions[n, :])
            ctc_loss_metric.update(None,
                                   preds=nd.sum(ctc_loss) / image.shape[0])
            alpha_metric.update(None, preds=loss2)
            if nbatch % log_interval == 0 and nbatch > 0:
                msg = ','.join([
                    '{}={:.3f}'.format(*metric.get()) for metric in [
                        epoch_bleu, batch_bleu, accu_top1_metric,
                        accu_top3_metric, ctc_loss_metric, alpha_metric
                    ]
                ])
                logger.info(
                    '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.
                    format(nepoch, nbatch,
                           log_interval * batch_size / (time.time() - btic),
                           msg))
                btic = time.time()
                batch_bleu.reset()
                accu_top1_metric.reset()
                accu_top3_metric.reset()
                ctc_loss_metric.reset()
                alpha_metric.reset()

        bleu, acc_top1 = validate(net,
                                  gpu_id=gpu_id,
                                  val_loader=val_loader,
                                  train_index2words=dataset.index2words,
                                  val_index2words=val_dataset.index2words)
        save_path = save_prefix + "_weights-%d-bleu-%.4f-%.4f.params" % (
            nepoch, bleu, acc_top1)
        net.collect_params().save(save_path)
        trainer.save_states(fname=save_path + ".states")
        logger.info("Saved checkpoint to {}.".format(save_path))
Example #8
def main(train_list,
         val_list,
         model,
         exp,
         saved_model,
         batch_size,
         optimizer,
         nb_epochs,
         augment,
         max_lr,
         min_lr,
         loss_function,
         train_all,
         nb_frames,
         eager,
         params=None,
         **kwargs):

    print("Unused arguments:", kwargs)

    setname = train_list.split(os.sep)[0]
    # Timestamp to name experiment folder
    xptime = strftime("%Y-%m-%d_%Hh%Mm%Ss", gmtime())
    xp_folder = "experiments/%s-%s-%s_%s" % (setname, model, exp, xptime)
    # Make folder
    mkdir_p(xp_folder)
    mkdir_p(os.path.join(xp_folder, 'checkpoints'))
    mkdir_p(os.path.join(xp_folder, 'tb'))
    print("\nSaving experiment data to:", xp_folder)

    # Save command (as well as possible)
    with open(os.path.join(xp_folder, 'command.sh'), "w") as f:
        command = " ".join(sys.argv[:]) + "\n"
        f.write(command)

    # Save employed parameters for future reference
    if params is not None:
        write_params(os.path.join(xp_folder, 'params.json'), params)

    #############
    # Callbacks #
    #############

    # Helper: Save the model.
    ckpt_fmt = os.path.join(
        xp_folder, 'checkpoints', model + '-' + exp +
        '.{epoch:03d}-loss{val_loss:.3f}-acc{val_acc:.3f}.hdf5')
    checkpointer = ModelCheckpoint(filepath=ckpt_fmt,
                                   verbose=1,
                                   save_best_only=True,
                                   monitor='val_acc')

    # Helper: TensorBoard
    tb = HistoryKeeper(logdir=os.path.join(xp_folder),
                       keys=['val_acc', 'val_loss', 'train_time', 'val_time'])

    # Helper: Stop when we stop learning.
    # early_stopper = EarlyStopper(patience=15)

    # Helper: Terminate when finding a NaN loss
    nan_term = TerminateOnNaN()

    callbacks = [tb, checkpointer, nan_term]
    #############

    #############
    #  Loading  #
    #############
    if augment:
        augmenter = default_augmenter_vid(strip_size=4)
    else:
        augment = False
        augmenter = None

    # Dataset classes
    train_data = ArrayData(train_list,
                           nb_frames=nb_frames,
                           augmenter=augmenter,
                           eager=eager)
    val_data = ArrayData(val_list,
                         nb_frames=nb_frames,
                         augmenter=None,
                         eager=eager,
                         encoder=train_data.get_encoder())

    # Saving encoder
    with open(os.path.join(xp_folder, 'encoder.pkl'), 'wb') as f:
        pickle.dump(train_data.get_encoder(), f)

    # Train loader
    train_loader = DataLoader(train_data,
                              batch_size=batch_size,
                              shuffle=True,
                              last_batch='keep',
                              num_workers=10)
    nb_samples = len(train_data)  # loader should provide the number of samples

    # Validation loader
    val_loader = DataLoader(val_data,
                            batch_size=batch_size,
                            shuffle=False,
                            last_batch='keep',
                            num_workers=10)
    nb_validation = len(
        val_data)  # loader should provide the number of samples

    # Compute number of steps
    steps_per_epoch = math.ceil(nb_samples / batch_size)
    validation_steps = math.ceil(nb_validation / batch_size)

    # The model
    net = ResearchModels(train_data.nb_classes,
                         model,
                         saved_model,
                         input_shape=train_data.shape,
                         train_all=train_all).model

    # A little more verbosity
    print("************************************")
    if train_all:
        print("Train all layers.")
    print("Max lr:", max_lr, " Min lr:", min_lr)
    print("Batch size:", batch_size)
    print(nb_samples, "training samples,", steps_per_epoch, "steps per epoch")
    print(nb_validation, "validation samples,", validation_steps,
          "validation steps")
    print("Optimizer:", optimizer)
    if augment:
        print("Using data augmentation")
    else:
        print("WARNING: Not using data augmentation")
    print("************************************")

    ############################
    #   Loss and Optimization  #
    ############################

    trainer = gluon.Trainer(net.collect_params(), optimizer,
                            {'learning_rate': max_lr})

    if loss_function == 'categorical_crossentropy':
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
        loss_fn.hybridize()
    else:
        raise ValueError("Unsupported loss function: %s" % loss_function)

    ############
    # Training #
    ############
    progress_desc = "Super epoch %03d - acc %.3f - loss %.3f  "
    acc = Accuracy()
    start_time = time()

    super_epoch_size = 250
    # Learning rate decay
    iteration = 1
    decay_alpha = 0.01**0.25
    lr = max_lr

    for epoch in range(1, nb_epochs + 1):
        train_loss, val_loss = 0., 0.
        nb_batches = 0
        tic = time()
        acc.reset()

        start_training = time()
        t = tqdm(range(super_epoch_size), unit='epochs')
        for _ in t:
            for data, label in train_loader:
                # Learning rate decay
                if iteration % 10000 == 0:
                    lr *= decay_alpha
                    trainer.set_learning_rate(lr)
                    print("Learning rate updated to", lr)
                iteration += 1

                current_batch_size = data.shape[0]
                data = data.copyto(mx.gpu(0))
                label = label.copyto(mx.gpu(0))

                with autograd.record():
                    output = net(data)
                    loss = loss_fn(output, label)
                loss.backward()
                # print(mx.nd.log_softmax(output[0], axis=-1), label[0])

                # update parameters
                trainer.step(current_batch_size)

                # calculate training metrics
                train_loss += loss.mean().asscalar()
                # accuracy(output, label)
                acc.update(preds=output, labels=label)

                nb_batches += 1

            t.set_description(progress_desc %
                              (epoch, acc.get()[1], train_loss / nb_batches))

        train_time = time() - start_training

        train_loss /= steps_per_epoch * super_epoch_size
        train_acc = acc.get()[1]

        acc.reset()
        start_val = time()
        # calculate validation accuracy
        tval = tqdm(val_loader,
                    leave=False,
                    desc='Running validation',
                    unit='batch')
        for data, label in tval:
            data = data.copyto(mx.gpu(0))
            label = label.copyto(mx.gpu(0))

            # Compute outputs
            output = net(data)
            loss = loss_fn(output, label)

            # Compute metrics
            val_loss += loss.mean().asscalar()
            # val_acc += accuracy(output, label)
            acc.update(preds=output, labels=label)

        val_time = time() - start_val

        val_loss /= validation_steps
        val_acc = acc.get()[1]

        print(
            "Epoch %d: loss %.3f, acc %.3f, val_loss %.3f, val_acc %.3f, in %.1f sec"
            % (epoch, train_loss, train_acc, val_loss, val_acc, time() - tic))
        print(
            "--------------------------------------------------------------------------------"
        )

        stop = False
        train_info = {
            'epoch': epoch,
            'loss': train_loss,
            'acc': train_acc,
            'val_loss': val_loss,
            'val_acc': val_acc,
            'train_time': train_time,
            'val_time': val_time
        }
        for cb in callbacks:
            if cb(net, train_info):
                stop = True

        if stop:
            break
        print()

    hours, rem = divmod(time() - start_time, 3600)
    days, hours = divmod(hours, 24)
    minutes, seconds = divmod(rem, 60)

    print("%d training epochs in %dd, %dh%dm%.2fs." %
          (nb_epochs, int(days), int(hours), int(minutes), seconds))
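The decay constant in the training loop, decay_alpha = 0.01**0.25, is chosen so that four decays (one every 10,000 iterations) multiply to exactly 0.01; the learning rate therefore reaches 1% of max_lr after 40,000 iterations. A quick check with an illustrative max_lr:

max_lr = 1e-3                 # illustrative value; the real max_lr is an argument of main()
decay_alpha = 0.01 ** 0.25    # one decay step
for step in (10000, 20000, 30000, 40000):
    print(step, max_lr * decay_alpha ** (step // 10000))
# 40000 -> max_lr * 0.01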
Example #9
def main():
    epochs = 32
    gpu_id = 7
    ctx_list = [mx.gpu(x) for x in [7, 8]]
    log_interval = 100
    batch_size = 32
    start_epoch = 0
    # trainer_resume = resume + ".states" if resume is not None else None
    trainer_resume = None

    resume = None
    from mxnet.gluon.data.vision import transforms
    transform_fn = transforms.Compose([
        LeftTopPad(dest_shape=(256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406),
                             std=(0.229, 0.224, 0.225))
    ])
    dataset = CaptionDataSet(
        image_root="/data3/zyx/yks/coco2017/train2017",
        annotation_path=
        "/data3/zyx/yks/coco2017/annotations/captions_train2017.json",
        transforms=transform_fn,
        feature_hdf5="output/train2017.h5")
    val_dataset = CaptionDataSet(
        image_root="/data3/zyx/yks/coco2017/val2017",
        annotation_path=
        "/data3/zyx/yks/coco2017/annotations/captions_val2017.json",
        words2index=dataset.words2index,
        index2words=dataset.index2words,
        transforms=transform_fn,
        feature_hdf5="output/val2017.h5")
    dataloader = DataLoader(dataset=dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=1,
                            pin_memory=True,
                            last_batch="discard")
    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=1,
                            pin_memory=True)

    num_words = dataset.words_count

    # set up logger
    save_prefix = "output/res50_"
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)

    net = EncoderDecoder(num_words=num_words,
                         test_max_len=val_dataset.max_len).cuda()
    for name, p in net.named_parameters():
        if "bias" in name:
            p.data.zero_()
        else:
            p.data.normal_(0, 0.01)
        print(name)
    net = torch.nn.DataParallel(net)
    if resume is not None:
        # net is a torch module here, so load a PyTorch state dict;
        # strict=False plays the role of allow_missing / ignore_extra
        net.module.load_state_dict(torch.load(resume), strict=False)
        logger.info("Resumed from checkpoint {}.".format(resume))

    trainer = torch.optim.Adam(params=filter(lambda p: p.requires_grad,
                                             net.parameters()),
                               lr=4e-4)
    criterion = Criterion()
    accu_top3_metric = TopKAccuracy(top_k=3)
    accu_top1_metric = Accuracy(name="batch_accu")
    ctc_loss_metric = Loss(name="ctc_loss")
    alpha_metric = Loss(name="alpha_loss")
    batch_bleu = BleuMetric(name="batch_bleu",
                            pred_index2words=dataset.index2words,
                            label_index2words=dataset.index2words)
    epoch_bleu = BleuMetric(name="epoch_bleu",
                            pred_index2words=dataset.index2words,
                            label_index2words=dataset.index2words)
    btic = time.time()
    logger.info(batch_size)
    logger.info(num_words)
    logger.info(len(dataset.words2index))
    logger.info(len(dataset.index2words))
    logger.info(dataset.words2index["<PAD>"])
    logger.info(val_dataset.words2index["<PAD>"])
    logger.info(len(val_dataset.words2index))
    for nepoch in range(start_epoch, epochs):
        if nepoch > 15:
            # torch optimizers have no set_learning_rate(); update the param groups directly
            for group in trainer.param_groups:
                group["lr"] = 4e-5
        logger.info("Current lr: {}".format(trainer.param_groups[0]["lr"]))
        accu_top1_metric.reset()
        accu_top3_metric.reset()
        ctc_loss_metric.reset()
        alpha_metric.reset()
        epoch_bleu.reset()
        batch_bleu.reset()
        for nbatch, batch in enumerate(tqdm.tqdm(dataloader)):
            batch = [
                Variable(torch.from_numpy(x.asnumpy()).cuda()) for x in batch
            ]
            data, label, label_len = batch
            label = label.long()
            label_len = label_len.long()
            max_len = label_len.max().data.cpu().numpy()
            net.train()
            outputs = net(data, label, max_len)
            predictions, alphas = outputs
            ctc_loss = criterion(predictions, label, label_len)
            loss2 = 1.0 * ((1. - alphas.sum(dim=1))**2).mean()
            trainer.zero_grad()
            ((ctc_loss + loss2) / batch_size).backward()
            for group in trainer.param_groups:
                for param in group['params']:
                    if param.grad is not None:
                        param.grad.data.clamp_(-5, 5)

            trainer.step()
            if nbatch % 10 == 0:
                for n, l in enumerate(label_len):
                    l = int(l.data.cpu().numpy())
                    la = label[n, 1:l].data.cpu().numpy()
                    pred = predictions[n, :(l - 1)].data.cpu().numpy()
                    accu_top3_metric.update(mx.nd.array(la), mx.nd.array(pred))
                    accu_top1_metric.update(mx.nd.array(la), mx.nd.array(pred))
                    epoch_bleu.update(la, predictions[n, :].data.cpu().numpy())
                    batch_bleu.update(la, predictions[n, :].data.cpu().numpy())
                ctc_loss_metric.update(
                    None,
                    preds=mx.nd.array([ctc_loss.data.cpu().numpy()]) /
                    batch_size)
                alpha_metric.update(None,
                                    preds=mx.nd.array(
                                        [loss2.data.cpu().numpy()]))
                if nbatch % log_interval == 0 and nbatch > 0:
                    msg = ','.join([
                        '{}={:.3f}'.format(*metric.get()) for metric in [
                            epoch_bleu, batch_bleu, accu_top1_metric,
                            accu_top3_metric, ctc_loss_metric, alpha_metric
                        ]
                    ])
                    logger.info(
                        '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.
                        format(
                            nepoch, nbatch,
                            log_interval * batch_size / (time.time() - btic),
                            msg))
                    btic = time.time()
                    batch_bleu.reset()
                    accu_top1_metric.reset()
                    accu_top3_metric.reset()
                    ctc_loss_metric.reset()
                    alpha_metric.reset()
        net.eval()
        bleu, acc_top1 = validate(net,
                                  gpu_id=gpu_id,
                                  val_loader=val_loader,
                                  train_index2words=dataset.index2words,
                                  val_index2words=val_dataset.index2words)
        save_path = save_prefix + "_weights-%d-bleu-%.4f-%.4f.params" % (
            nepoch, bleu, acc_top1)
        torch.save(net.module.state_dict(), save_path)
        torch.save(trainer.state_dict(), save_path + ".states")
        logger.info("Saved checkpoint to {}.".format(save_path))
Example #10
 def fit(self, itr, ctx, epochs, batch_size, callbacks=None):
     # ADAM optimizer
     #opt_params={'learning_rate':0.001, 'beta1':0.9, 'beta2':0.999, 'epsilon':1e-08}
     opt = mx.optimizer.create('adam')
     # SGD optimizer
     #opt = mx.optimizer.create('sgd')
     # AdaDelta optimizer
     #opt = mx.optimizer.create('adadelta')
     # initialize parameters
     # by default, MXNet initializes weight matrices uniformly from [-0.07, 0.07] and sets bias parameters to 0
     # the 'Xavier' initializer is designed to keep the scale of gradients roughly the same in all layers
     self._net.initialize(mx.init.Xavier(magnitude=2.3),
                          ctx=ctx,
                          force_reinit=True)
     # fetch and broadcast parameters
     params = self._net.collect_params()
     # trainer
     trainer = Trainer(params=params, optimizer=opt, kvstore='device')
     # loss function
     loss_fn = SoftmaxCrossEntropyLoss()
     # use accuracy as the evaluation metric
     metric = Accuracy()
     # train
     for e in range(epochs):
         if callbacks is not None:
             for cb in callbacks:
                 cb.before_epoch(e)
         # reset evaluation result to initial state
         metric.reset()
         # reset the train data iterator.
         itr.reset()
         # loop over the train data iterator
         for i, batch in enumerate(itr):
             # splits train data into multiple slices along batch_axis
             # copy each slice into a context
             data = split_and_load(batch.data[0],
                                   ctx_list=ctx,
                                   batch_axis=0,
                                   even_split=False)
             # splits train label into multiple slices along batch_axis
             # copy each slice into a context
             label = split_and_load(batch.label[0],
                                    ctx_list=ctx,
                                    batch_axis=0,
                                    even_split=False)
             outputs = []
             losses = []
             # inside training scope
             with ag.record():
                 for x, y in zip(data, label):
                     z = self._net(x)
                     # computes softmax cross entropy loss
                     l = loss_fn(z, y)
                     outputs.append(z)
                     losses.append(l)
             # backpropagate the error for one iteration
             for l in losses:
                 l.backward()
             # make one step of parameter update.
             # trainer needs to know the batch size of data
             # to normalize the gradient by 1/batch_size
             trainer.step(batch_size)
             # updates internal evaluation
             metric.update(label, outputs)
             # invoke callbacks after batch
             if callbacks is not None:
                 for cb in callbacks:
                     cb.after_batch(e, i, batch_size, metric)
         # invoke callbacks after epoch
         if callbacks is not None:
             for cb in callbacks:
                 cb.after_epoch(e, i, batch_size, metric)
     return metric
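fit() only requires that each callback implements before_epoch, after_batch and after_epoch with the signatures it calls; a minimal hedged example (the class name is hypothetical, not part of any library):

class AccuracyLogger:
    """Toy callback matching the hooks invoked by fit()."""

    def before_epoch(self, epoch):
        print("starting epoch", epoch)

    def after_batch(self, epoch, batch_idx, batch_size, metric):
        pass  # e.g. log every N batches

    def after_epoch(self, epoch, batch_idx, batch_size, metric):
        print("epoch %d: %s=%f" % ((epoch,) + metric.get()))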