Example #1
    def layerwise_relevance_zclip(self, out, use_bias=False, **kwargs):
        if self._in is None:
            raise RuntimeError('Block has not yet executed forward_logged!')
        R = out
        a = self._in[0]
        z = self._out
        weight = self.weight.data(ctx=a.context)
        wplus = nd.maximum(0., weight)
        wminus = nd.minimum(0., weight)

        bplus = None
        bminus = None
        if use_bias:
            bias = self.bias.data(ctx=a.context)
            bplus = nd.maximum(0., bias)
            bminus = nd.minimum(0., bias)

        alpha = z > 0.
        beta = z < 0.

        a.attach_grad()
        with autograd.record():
            zplus = self._forward(data=a, weight=wplus, bias=bplus)
        cplus, = autograd.grad(zplus, a, head_grads=alpha*R/(zplus + (zplus == 0.)))

        with autograd.record():
            zminus = self._forward(data=a, weight=wminus, bias=bminus)
        cminus, = autograd.grad(zminus, a, head_grads=beta*R/(zminus + (zminus == 0.)))

        return a*(cplus - cminus)
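The rule above pipes the incoming relevance R into autograd.grad via head_grads instead of the default all-ones head gradient. A minimal, self-contained sketch of that mechanism (independent of the layer class above):

from mxnet import nd, autograd

x = nd.array([1., 2., 3.])
x.attach_grad()
with autograd.record():
    z = x * x                          # dz/dx = 2x
head = nd.array([0.1, 1.0, 10.0])      # plays the role of R above
dx, = autograd.grad(z, x, head_grads=head)
print(dx)                              # elementwise 2 * x * head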
Example #2
def train(train_iter, test_iter, net, loss, trainer, ctx, num_epochs, print_batches=None):
    """Train and evaluate a model."""
    print("training on", ctx)
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    for epoch in range(1, num_epochs + 1):
        train_l_sum, train_acc_sum, n, m = 0.0, 0.0, 0.0, 0.0
        if isinstance(train_iter, mx.io.MXDataIter):
            train_iter.reset()
        start = time()
        for i, batch in enumerate(train_iter):
            Xs, ys, batch_size = _get_batch(batch, ctx)
            ls = []
            with autograd.record():
                y_hats = [net(X) for X in Xs]
                ls = [loss(y_hat, y) for y_hat, y in zip(y_hats, ys)]
            for l in ls:
                l.backward()
            train_acc_sum += sum([(y_hat.argmax(axis=1) == y).sum().asscalar()
                                 for y_hat, y in zip(y_hats, ys)])
            train_l_sum += sum([l.sum().asscalar() for l in ls])
            trainer.step(batch_size)
            n += batch_size
            m += sum([y.size for y in ys])
            if print_batches and (i+1) % print_batches == 0:
                print("batch %d, loss %f, train acc %f" % (
                    n, train_l_sum / n, train_acc_sum / m
                ))
        test_acc = evaluate_accuracy(test_iter, net, ctx)
        print("epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec" % (
            epoch, train_l_sum / n, train_acc_sum / m, test_acc, time() - start
        ))
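_get_batch and evaluate_accuracy are assumed helpers; a minimal _get_batch in the spirit of the old gluon tutorial utilities (handling both DataLoader tuples and mx.io DataBatch objects) might look like this sketch:

import mxnet as mx
from mxnet import gluon

def _get_batch(batch, ctx):
    """Return data and label split across the devices in ctx, plus the batch size."""
    if isinstance(batch, mx.io.DataBatch):
        data, label = batch.data[0], batch.label[0]
    else:
        data, label = batch
    return (gluon.utils.split_and_load(data, ctx),
            gluon.utils.split_and_load(label, ctx),
            data.shape[0])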
Example #3
def train(epoch, ctx):
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    net.initialize(mx.init.Orthogonal(), ctx=ctx)
    # re-initialize conv4's weight to be Orthogonal
    net.conv4.collect_params().initialize(mx.init.Orthogonal(scale=1), ctx=ctx)
    trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': opt.lr})
    loss = gluon.loss.L2Loss()

    for i in range(epoch):
        train_data.reset()
        for batch in train_data:
            data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
            label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
            outputs = []
            with ag.record():
                for x, y in zip(data, label):
                    z = net(x)
                    L = loss(z, y)
                    L.backward()
                    outputs.append(z)
            trainer.step(batch.data[0].shape[0])
            metric.update(label, outputs)

        name, acc = metric.get()
        metric.reset()
        print('training mse at epoch %d: %s=%f'%(i, name, acc))
        test(ctx)

    net.save_params('superres.params')
Example #4
def train(weight_decay):
    learning_rate = 0.005
    epochs = 10

    net = gluon.nn.Sequential()
    with net.name_scope():
        net.add(gluon.nn.Dense(1))
    net.initialize()

    # Note the 'wd' key here
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {
        'learning_rate': learning_rate, 'wd': weight_decay})  # the L2 regularization (weight decay) term is set here
    # In plain gradient descent the parameters are updated as w = w - lr * grad.
    # With 'wd' set, Gluon's sgd update becomes w = w - lr * (grad + wd * w).
    train_loss = []
    test_loss = []
    for e in range(epochs):
        for data, label in data_iter_train:
            with autograd.record():
                output = net(data)
                loss = square_loss(output, label)
            loss.backward()
            trainer.step(batch_size)
        train_loss.append(test(net, X_train, y_train))
        test_loss.append(test(net, X_test, y_test))
    plt.plot(train_loss)
    plt.plot(test_loss)
    plt.legend(['train', 'test'])
    plt.show()

    return ('learned w[:10]:', net[0].weight.data()[:, :10],
            'learned b:', net[0].bias.data())
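The weight-decay comment in the example above can be checked numerically. A small sketch (assuming MXNet 1.x Gluon) comparing one 'sgd' Trainer step with wd against the manual update w - lr * (grad / batch_size + wd * w):

from mxnet import nd, autograd, gluon

net = gluon.nn.Dense(1)
net.initialize()
x = nd.random.normal(shape=(4, 3))
y = nd.random.normal(shape=(4, 1))
loss_fn = gluon.loss.L2Loss()
trainer = gluon.Trainer(net.collect_params(), 'sgd',
                        {'learning_rate': 0.1, 'wd': 0.01})

with autograd.record():
    l = loss_fn(net(x), y)
l.backward()

w_before = net.weight.data().copy()
grad = net.weight.grad().copy()
trainer.step(4)                       # step() rescales gradients by 1/batch_size
w_manual = w_before - 0.1 * (grad / 4 + 0.01 * w_before)
print(nd.abs(net.weight.data() - w_manual).max())   # ~0 up to float error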
Example #5
def train_gluon_ch7(trainer_name, trainer_hyperparams, features, labels,
                    batch_size=10, num_epochs=2):
    """Train a linear regression model with a given Gluon trainer."""
    net = nn.Sequential()
    net.add(nn.Dense(1))
    net.initialize(init.Normal(sigma=0.01))
    loss = gloss.L2Loss()

    def eval_loss():
        return loss(net(features), labels).mean().asscalar()

    ls = [eval_loss()]
    data_iter = gdata.DataLoader(
        gdata.ArrayDataset(features, labels), batch_size, shuffle=True)
    trainer = gluon.Trainer(net.collect_params(),
                            trainer_name, trainer_hyperparams)
    for _ in range(num_epochs):
        start = time.time()
        for batch_i, (X, y) in enumerate(data_iter):
            with autograd.record():
                l = loss(net(X), y)
            l.backward()
            trainer.step(batch_size)
            if (batch_i + 1) * batch_size % 100 == 0:
                ls.append(eval_loss())
    print('loss: %f, %f sec per epoch' % (ls[-1], time.time() - start))
    set_figsize()
    plt.plot(np.linspace(0, num_epochs, len(ls)), ls)
    plt.xlabel('epoch')
    plt.ylabel('loss')
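A hypothetical call of the helper above (get_data_ch7 and the hyperparameter values are assumptions, following the d2l optimization chapter where this helper appears):

features, labels = get_data_ch7()    # assumed loader returning NDArrays
train_gluon_ch7('sgd', {'learning_rate': 0.05}, features, labels)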
Example #6
def train_ch7(model, data_iter, lr, num_epochs, ctx):
    """Train an encoder-decoder model"""
    model.initialize(init.Xavier(), force_reinit=True, ctx=ctx)
    trainer = gluon.Trainer(model.collect_params(),
                            'adam', {'learning_rate': lr})
    loss = MaskedSoftmaxCELoss()
    tic = time.time()
    for epoch in range(1, num_epochs+1):
        l_sum, num_tokens_sum = 0.0, 0.0
        for batch in data_iter:
            X, X_vlen, Y, Y_vlen = [x.as_in_context(ctx) for x in batch]
            Y_input, Y_label, Y_vlen = Y[:,:-1], Y[:,1:], Y_vlen-1
            with autograd.record():
                Y_hat, _ = model(X, Y_input, X_vlen, Y_vlen)
                l = loss(Y_hat, Y_label, Y_vlen)
            l.backward()
            grad_clipping_gluon(model, 5, ctx)
            num_tokens = Y_vlen.sum().asscalar()
            trainer.step(num_tokens)
            l_sum += l.sum().asscalar()
            num_tokens_sum += num_tokens
        if epoch % (num_epochs // 4) == 0:
            print("epoch %d, loss %.3f, time %.1f sec" % (
                epoch, l_sum/num_tokens_sum, time.time()-tic))
            tic = time.time()
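grad_clipping_gluon is referenced but not defined in this snippet. A minimal sketch consistent with the d2l-style helpers (rescale so the global L2 norm of all parameter gradients is at most theta):

from mxnet import nd

def grad_clipping(params, theta, ctx):
    """Rescale all gradients so their global L2 norm is at most theta."""
    norm = nd.array([0], ctx=ctx)
    for param in params:
        norm += (param.grad ** 2).sum()
    norm = norm.sqrt().asscalar()
    if norm > theta:
        for param in params:
            param.grad[:] *= theta / norm

def grad_clipping_gluon(model, theta, ctx):
    params = [p.data() for p in model.collect_params().values()]
    grad_clipping(params, theta, ctx)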
Example #7
def train(train_iter, test_iter, net, loss, trainer, ctx, num_epochs):
    """Train and evaluate a model."""
    print('training on', ctx)
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, m, start = 0.0, 0.0, 0, 0, time.time()
        for i, batch in enumerate(train_iter):
            Xs, ys, batch_size = _get_batch(batch, ctx)
            ls = []
            with autograd.record():
                y_hats = [net(X) for X in Xs]
                ls = [loss(y_hat, y) for y_hat, y in zip(y_hats, ys)]
            for l in ls:
                l.backward()
            trainer.step(batch_size)
            train_l_sum += sum([l.sum().asscalar() for l in ls])
            n += sum([l.size for l in ls])
            train_acc_sum += sum([(y_hat.argmax(axis=1) == y).sum().asscalar()
                                 for y_hat, y in zip(y_hats, ys)])
            m += sum([y.size for y in ys])
        test_acc = evaluate_accuracy(test_iter, net, ctx)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, '
              'time %.1f sec'
              % (epoch + 1, train_l_sum / n, train_acc_sum / m, test_acc,
                 time.time() - start))
Example #8
    def layerwise_relevance_zb(self, out, lo=-1, hi=1, use_bias=False, **kwargs):
        if self._in is None:
            raise RuntimeError('Block has not yet executed forward_logged!')
        R = out
        a = self._in[0]
        weight = self.weight.data(ctx=a.context)
        wplus = nd.maximum(0., weight)
        wminus = nd.minimum(0., weight)

        bias = None
        bplus = None
        bminus = None
        if use_bias:
            bias = self.bias.data(ctx=a.context)
            bplus = nd.maximum(0., bias)
            bminus = nd.minimum(0., bias)

        upper = nd.ones_like(a)*hi
        lower = nd.ones_like(a)*lo
        a.attach_grad()
        upper.attach_grad()
        lower.attach_grad()
        with autograd.record():
            zlh = ( self._forward(a, weight, bias)
                  - self._forward(lower, wplus, bplus)
                  - self._forward(upper, wminus, bminus)
                  )
        zlh.backward(out_grad=R/(zlh + (zlh == 0.)))
        return a*a.grad + upper*upper.grad + lower*lower.grad
Example #9
def train_and_predict_rnn_gluon(model, num_hiddens, vocab_size, ctx,
                                corpus_indices, idx_to_char, char_to_idx,
                                num_epochs, num_steps, lr, clipping_theta,
                                batch_size, pred_period, pred_len, prefixes):
    """Train an Gluon RNN model and predict the next item in the sequence."""
    loss = gloss.SoftmaxCrossEntropyLoss()
    model.initialize(ctx=ctx, force_reinit=True, init=init.Normal(0.01))
    trainer = gluon.Trainer(model.collect_params(), 'sgd',
                            {'learning_rate': lr, 'momentum': 0, 'wd': 0})

    for epoch in range(num_epochs):
        loss_sum, start = 0.0, time.time()
        data_iter = data_iter_consecutive(
            corpus_indices, batch_size, num_steps, ctx)
        state = model.begin_state(batch_size=batch_size, ctx=ctx)
        for t, (X, Y) in enumerate(data_iter):
            for s in state:
                s.detach()
            with autograd.record():
                (output, state) = model(X, state)
                y = Y.T.reshape((-1,))
                l = loss(output, y).mean()
            l.backward()
            params = [p.data() for p in model.collect_params().values()]
            grad_clipping(params, clipping_theta, ctx)
            trainer.step(1)
            loss_sum += l.asscalar()

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(loss_sum / (t + 1)), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn_gluon(
                    prefix, pred_len, model, vocab_size,
                    ctx, idx_to_char, char_to_idx))
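data_iter_consecutive is assumed to be in scope; a sketch consistent with the d2l book, yielding adjacent (X, Y) mini-batches so the hidden state can be carried across iterations:

from mxnet import nd

def data_iter_consecutive(corpus_indices, batch_size, num_steps, ctx=None):
    corpus_indices = nd.array(corpus_indices, ctx=ctx)
    batch_len = len(corpus_indices) // batch_size
    indices = corpus_indices[0: batch_size * batch_len].reshape(
        (batch_size, batch_len))
    epoch_size = (batch_len - 1) // num_steps
    for i in range(epoch_size):
        i = i * num_steps
        X = indices[:, i: i + num_steps]
        Y = indices[:, i + 1: i + num_steps + 1]
        yield X, Y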
Example #10
def train_ch7(trainer_fn, states, hyperparams, features, labels, batch_size=10,
              num_epochs=2):
    """Train a linear regression model."""
    net, loss = linreg, squared_loss
    w, b = nd.random.normal(scale=0.01, shape=(features.shape[1], 1)), nd.zeros(1)
    w.attach_grad()
    b.attach_grad()

    def eval_loss():
        return loss(net(features, w, b), labels).mean().asscalar()

    ls = [eval_loss()]
    data_iter = gdata.DataLoader(
        gdata.ArrayDataset(features, labels), batch_size, shuffle=True)
    for _ in range(num_epochs):
        start = time.time()
        for batch_i, (X, y) in enumerate(data_iter):
            with autograd.record():
                l = loss(net(X, w, b), y).mean()
            l.backward()
            trainer_fn([w, b], states, hyperparams)
            if (batch_i + 1) * batch_size % 100 == 0:
                ls.append(eval_loss())
    print('loss: %f, %f sec per epoch' % (ls[-1], time.time() - start))
    set_figsize()
    plt.plot(np.linspace(0, num_epochs, len(ls)), ls)
    plt.xlabel('epoch')
    plt.ylabel('loss')
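linreg, squared_loss and the trainer_fn/states pair are assumed to be defined elsewhere; minimal versions of the first two, consistent with the d2l book:

from mxnet import nd

def linreg(X, w, b):
    """Linear regression model."""
    return nd.dot(X, w) + b

def squared_loss(y_hat, y):
    """Halved squared loss (d2l convention)."""
    return (y_hat - y.reshape(y_hat.shape)) ** 2 / 2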
Example #11
def train(train_data, test_data, net, loss, trainer, ctx, num_epochs, print_batches=None):
    """Train a network"""
    print("Start training on ", ctx)
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    for epoch in range(num_epochs):
        train_loss, train_acc, n, m = 0.0, 0.0, 0.0, 0.0
        if isinstance(train_data, mx.io.MXDataIter):
            train_data.reset()
        start = time()
        for i, batch in enumerate(train_data):
            data, label, batch_size = _get_batch(batch, ctx)
            losses = []
            with autograd.record():
                outputs = [net(X) for X in data]
                losses = [loss(yhat, y) for yhat, y in zip(outputs, label)]
            for l in losses:
                l.backward()
            train_acc += sum([(yhat.argmax(axis=1) == y).sum().asscalar()
                              for yhat, y in zip(outputs, label)])
            train_loss += sum([l.sum().asscalar() for l in losses])
            trainer.step(batch_size)
            n += batch_size
            m += sum([y.size for y in label])
            if print_batches and (i + 1) % print_batches == 0:
                print("Batch %d. Loss: %f, Train acc %f" % (
                    n, train_loss / n, train_acc / m
                ))

        test_acc = evaluate_accuracy(test_data, net, ctx)
        print("Epoch %d. Loss: %.3f, Train acc %.2f, Test acc %.2f, Time %.1f sec" % (
            epoch, train_loss / n, train_acc / m, test_acc, time() - start
        ))
Example #12
def test_infer_multiout_op():
    data = mx.nd.arange(16, dtype=np.float64).reshape((4, 4))
    data.attach_grad()

    with autograd.record():
        y = mx.nd.split(data, axis=0, num_outputs=2)
    y[0].backward()
    assert data.grad.dtype == np.float64
Example #13
def test_infer_multiout_op2():
    def test_func(a):
        q, l = mx.nd.linalg.gelqf(a)
        return mx.nd.sum(l)

    data32 = mx.nd.random.normal(shape=(2, 3), ctx=mx.cpu(), dtype=np.float32)
    data32.attach_grad()
    with autograd.record():
        test32 = test_func(data32)
        test32.backward()

    data64 = mx.nd.Cast(data32, dtype=np.float64)
    data64.attach_grad()
    with autograd.record():
        test64 = test_func(data64)
        test64.backward()
    assert_almost_equal(data64.grad.asnumpy(), data32.grad.asnumpy(), atol=1e-5, rtol=1e-5)
Example #14
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          corpus_indices, vocab, ctx, is_random_iter,
                          num_epochs, num_steps, lr, clipping_theta,
                          batch_size, prefixes):
    """Train an RNN model and predict the next item in the sequence."""
    if is_random_iter:
        data_iter_fn = data_iter_random
    else:
        data_iter_fn = data_iter_consecutive
    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()
    start = time.time()
    for epoch in range(1, num_epochs+1):
        if not is_random_iter:
            # If adjacent sampling is used, the hidden state is initialized
            # at the beginning of the epoch
            state = init_rnn_state(batch_size, num_hiddens, ctx)
        l_sum, n = 0.0, 0
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
        for X, Y in data_iter:
            if is_random_iter:
                # If random sampling is used, the hidden state is initialized
                # before each mini-batch update
                state = init_rnn_state(batch_size, num_hiddens, ctx)
            else:
                # Otherwise, the detach function needs to be used to separate
                # the hidden state from the computational graph to avoid
                # backpropagation beyond the current sample
                for s in state:
                    s.detach()
            with autograd.record():
                inputs = to_onehot(X, len(vocab))
                # outputs is num_steps terms of shape (batch_size, len(vocab))
                (outputs, state) = rnn(inputs, state, params)
                # After stitching it is (num_steps * batch_size, len(vocab))
                outputs = nd.concat(*outputs, dim=0)
                # The shape of Y is (batch_size, num_steps), and then becomes
                # a vector with a length of batch * num_steps after
                # transposition. This gives it a one-to-one correspondence
                # with output rows
                y = Y.T.reshape((-1,))
                # Average classification error via cross entropy loss
                l = loss(outputs, y).mean()
            l.backward()
            grad_clipping(params, clipping_theta, ctx)  # Clip the gradient
            sgd(params, lr, 1)
            # Since the error is the mean, no need to average gradients here
            l_sum += l.asscalar() * y.size
            n += y.size
        if epoch % (num_epochs // 4) == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch, math.exp(l_sum / n), time.time() - start))
            start = time.time()
        if epoch % (num_epochs // 2) == 0:
            for prefix in prefixes:
                print(' -',  predict_rnn(prefix, 50, rnn, params,
                                         init_rnn_state, num_hiddens,
                                         vocab, ctx))
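to_onehot is assumed to be defined elsewhere; a sketch consistent with the d2l book, turning a (batch_size, num_steps) index matrix into num_steps one-hot matrices:

from mxnet import nd

def to_onehot(X, size):
    """Return a list of num_steps matrices, each of shape (batch_size, size)."""
    return [nd.one_hot(x, size) for x in X.T]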
Example #15
def train(ctx):
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    if opt.use_pretrained_base:
        net.deconv_layers.initialize(ctx=ctx)
        net.final_layer.initialize(ctx=ctx)
    else:
        net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)

    L = gluon.loss.L2Loss()
    metric = HeatmapAccuracy()

    best_val_score = 1

    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)

    for epoch in range(opt.num_epochs):
        loss_val = 0
        tic = time.time()
        btic = time.time()
        metric.reset()

        for i, batch in enumerate(train_data):
            data, label, weight, imgid = train_batch_fn(batch, ctx)

            with ag.record():
                outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
                loss = [nd.cast(L(nd.cast(yhat, 'float32'), y, w), opt.dtype)
                        for yhat, y, w in zip(outputs, label, weight)]
            for l in loss:
                l.backward()
            trainer.step(batch_size)

            metric.update(label, outputs)

            loss_val += sum([l.mean().asscalar() for l in loss]) / num_gpus
            if opt.log_interval and not (i+1)%opt.log_interval:
                metric_name, metric_score = metric.get()
                logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\tloss=%f\tlr=%f\t%s=%.3f'%(
                             epoch, i, batch_size*opt.log_interval/(time.time()-btic),
                             loss_val / (i+1), trainer.learning_rate, metric_name, metric_score))
                btic = time.time()

        time_elapsed = time.time() - tic
        logger.info('Epoch[%d]\t\tSpeed: %d samples/sec over %d secs\tloss=%f\n'%(
                     epoch, int(i*batch_size / time_elapsed), int(time_elapsed), loss_val / (i+1)))
        if save_frequency and save_dir and (epoch + 1) % save_frequency == 0:
            net.save_parameters('%s/%s-%d.params'%(save_dir, model_name, epoch))
            trainer.save_states('%s/%s-%d.states'%(save_dir, model_name, epoch))

    if save_frequency and save_dir:
        net.save_parameters('%s/%s-%d.params'%(save_dir, model_name, opt.num_epochs-1))
        trainer.save_states('%s/%s-%d.states'%(save_dir, model_name, opt.num_epochs-1))

    return net
Example #16
def train(input_variable, target_variable, encoder, decoder, teacher_forcing_ratio,
          encoder_optimizer, decoder_optimizer, criterion, max_length, ctx):
    with autograd.record():
        loss = F.zeros((1,), ctx=ctx)

        encoder_hidden = encoder.initHidden(ctx)

        input_length = input_variable.shape[0]
        target_length = target_variable.shape[0]

        encoder_outputs, encoder_hidden = encoder(
                input_variable.expand_dims(0), encoder_hidden)

        if input_length < max_length:
            encoder_outputs = F.concat(encoder_outputs.flatten(),
                F.zeros((max_length - input_length, encoder.hidden_size), ctx=ctx), dim=0)
        else:
            encoder_outputs = encoder_outputs.flatten()



        decoder_input = F.array([SOS_token], ctx=ctx)

        decoder_hidden = encoder_hidden

        use_teacher_forcing = random.random() < teacher_forcing_ratio

        if use_teacher_forcing:
            # Teacher forcing: Feed the target as the next input
            for di in range(target_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)

                loss = F.add(loss, criterion(decoder_output, target_variable[di]))
                print(criterion(decoder_output, target_variable[di]))
                decoder_input = target_variable[di]  # Teacher forcing

        else:
            # Without teacher forcing: use its own predictions as the next input
            for di in range(target_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                topi = decoder_output.argmax(axis=1)

                decoder_input = F.array([topi.asscalar()], ctx=ctx)

                loss = F.add(loss, criterion(decoder_output, target_variable[di]))

                if topi.asscalar() == EOS_token:
                    break

        loss.backward()

    encoder_optimizer.step(1)
    decoder_optimizer.step(1)

    return loss.asscalar()/target_length
Example #17
 def relevance_sensitivity(self, data, out=None, **kwargs):
     data = Mlist(data)
     data.attach_grad()
     with autograd.record():
         y = self.forward(data)
     y.backward(out_grad=out)
     # WARNING: is hacky and sucks
     self._out = y
     return data.grad
Example #18
def forward_backward(network, data, label):

    # Ask autograd to remember the forward pass
    with autograd.record():
        # Compute the loss on all GPUs
        losses = [loss(network(X), Y) for X, Y in zip(data, label)]

    # Run the backward pass (calculate gradients) on all GPUs
    for l in losses:
        l.backward()
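A hypothetical driver for forward_backward above (every name other than forward_backward is an assumption): split one batch across two GPUs, run forward/backward, then step the trainer once per batch.

import mxnet as mx
from mxnet import gluon

ctx = [mx.gpu(0), mx.gpu(1)]
data = gluon.utils.split_and_load(batch_data, ctx)      # batch_data/batch_label assumed
label = gluon.utils.split_and_load(batch_label, ctx)
forward_backward(network, data, label)
trainer.step(batch_data.shape[0])                       # trainer assumed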
Example #19
def main(net, batch_size, epochs, opt, ctx):
    train_data, val_data = get_data_iters(batch_size)
    if opt.hybridize:
        net.hybridize()

    trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': opt.lr, 'wd': opt.wd})
    criterion = gluon.loss.SoftmaxCrossEntropyLoss()

    lr = opt.lr
    if opt.warmup:
        minlr = lr*0.01
        dlr = (lr-minlr)/(epochs[0]-1)

    prev_time = datetime.datetime.now()
    for epoch in range(epochs[-1]):
        _loss = 0.
        if opt.warmup:
            if epoch<epochs[0]:
                lr = minlr + dlr*epoch
        if epoch in epochs[1:]:
            lr = lr * opt.lr_decay
        trainer.set_learning_rate(lr)

        for data, label in train_data:
            data_list = gluon.utils.split_and_load(data, ctx)
            label_list = gluon.utils.split_and_load(label, ctx)
            with autograd.record():
                outputs = [net(X) for X in data_list]
                losses = [criterion(X, y) for X, y in zip(outputs, label_list)]

            for l in losses:
                l.backward()
            trainer.step(batch_size)
            _loss_list = [l.mean().asscalar() for l in losses]
            _loss += sum(_loss_list) / len(_loss_list)

        cur_time = datetime.datetime.now()
        h, remainder = divmod((cur_time - prev_time).seconds, 3600)
        m, s = divmod(remainder, 60)
        time_str = "Time %02d:%02d:%02d" % (h, m, s)
        __loss = _loss/len(train_data)

        if val_data is not None:
            val_loss, val_accuracy = validate(val_data, net, criterion, ctx)
            epoch_str = ("Epoch %d. Train loss: %f, Val loss %f, Val accuracy %f, " % (epoch, __loss, val_loss, val_accuracy))
        else:
            epoch_str = ("Epoch %d. Train loss: %f, " % (epoch, __loss))

        prev_time = cur_time
        print(epoch_str + time_str + ', lr ' + str(trainer.learning_rate))

    if not os.path.exists("params"):
        os.mkdir("params")
    net.save_parameters("params/resnet50.params")
Example #20
 def relevance_layerwise(self, out, *args, **kwargs):
     R = out
     a = self._in[0]
     pkwargs = self._kwargs.copy()
     pkwargs['pool_type'] = 'sum'
     # suppress mxnet warnings about sum-pooling not being supported with cudnn
     pkwargs['cudnn_off'] = True
     a.attach_grad()
     with autograd.record():
         z = nd.Pooling(a, **pkwargs)
     z.backward(out_grad=R/(z + (z == 0.)))
     return a * a.grad
Example #21
def test_inference():
    all_models = ['resnet50_v1', 'vgg19_bn', 'alexnet', #'inceptionv3',
                  'densenet201', 'squeezenet1.0', 'mobilenet0.25']

    batch_size = 10
    download_data()
    for model_name in all_models:
        eprint('testing inference on %s'%model_name)

        data_shape = (3, 224, 224) if 'inception' not in model_name else (3, 299, 299)
        dataIter = mx.io.ImageRecordIter(
            path_imgrec        = VAL_DATA,
            label_width        = 1,
            preprocess_threads = 1,
            batch_size         = batch_size,
            data_shape         = data_shape,
            label_name         = 'softmax_label',
            rand_crop          = False,
            rand_mirror        = False)
        data_batch = dataIter.next()
        data = data_batch.data[0]
        label = data_batch.label[0]
        gpu_data = data.as_in_context(mx.gpu())
        gpu_label = label.as_in_context(mx.gpu())

        # This is to create a model and run the model once to initialize
        # all parameters.
        cpu_model = get_model(model_name)
        cpu_model.collect_params().initialize(ctx=mx.cpu())
        cpu_model(mx.nd.array(data, ctx=mx.cpu()))
        gpu_model = get_model(model_name)
        gpu_model.collect_params().initialize(ctx=mx.gpu())
        gpu_model(mx.nd.array(data, ctx=mx.gpu()))

        # Force the two models have the same parameters.
        cpu_params = cpu_model.collect_params()
        gpu_params = gpu_model.collect_params()
        for k in cpu_params.keys():
            k = k.replace(cpu_params.prefix, '')
            cpu_param = cpu_params.get(k)
            gpu_param = gpu_params.get(k)
            gpu_param.set_data(cpu_param.data().as_in_context(mx.gpu()))

        for i in range(5):
            # Run inference.
            with autograd.record(train_mode=False):
                cpu_out = cpu_model(mx.nd.array(data, ctx=mx.cpu()))
                gpu_out = gpu_model(gpu_data)
            out = cpu_out.asnumpy()
            max_val = np.max(np.abs(out))
            gpu_max_val = np.max(np.abs(gpu_out.asnumpy()))
            eprint(model_name + ": CPU " + str(max_val) + ", GPU " + str(gpu_max_val))
            assert_almost_equal(out / max_val, gpu_out.asnumpy() / max_val, rtol=1e-3, atol=1e-3)
Example #22
def optimize(args):
    """    Gatys et al. CVPR 2017
    ref: Image Style Transfer Using Convolutional Neural Networks
    """
    if args.cuda:
        ctx = mx.gpu(0)
    else:
        ctx = mx.cpu(0)
    # load the content and style target
    content_image = utils.tensor_load_rgbimage(args.content_image,ctx, size=args.content_size, keep_asp=True)
    content_image = utils.subtract_imagenet_mean_preprocess_batch(content_image)
    style_image = utils.tensor_load_rgbimage(args.style_image, ctx, size=args.style_size)
    style_image = utils.subtract_imagenet_mean_preprocess_batch(style_image)
    # load the pre-trained vgg-16 and extract features
    vgg = net.Vgg16()
    utils.init_vgg_params(vgg, 'models', ctx=ctx)
    # content feature
    f_xc_c = vgg(content_image)[1]
    # style feature
    features_style = vgg(style_image)
    gram_style = [net.gram_matrix(y) for y in features_style]
    # output
    output = Parameter('output', shape=content_image.shape)
    output.initialize(ctx=ctx)
    output.set_data(content_image)
    # optimizer
    trainer = gluon.Trainer([output], 'adam',
                            {'learning_rate': args.lr})
    mse_loss = gluon.loss.L2Loss()

    # optimizing the images
    for e in range(args.iters):
        utils.imagenet_clamp_batch(output.data(), 0, 255)
        # fix BN for pre-trained vgg
        with autograd.record():
            features_y = vgg(output.data())
            content_loss = 2 * args.content_weight * mse_loss(features_y[1], f_xc_c)
            style_loss = 0.
            for m in range(len(features_y)):
                gram_y = net.gram_matrix(features_y[m])
                gram_s = gram_style[m]
                style_loss = style_loss + 2 * args.style_weight * mse_loss(gram_y, gram_s)
            total_loss = content_loss + style_loss
            total_loss.backward()

        trainer.step(1)
        if (e + 1) % args.log_interval == 0:
            print('loss:{:.2f}'.format(total_loss.asnumpy()[0]))
        
    # save the image
    output = utils.add_imagenet_mean_batch(output.data())
    utils.tensor_save_bgrimage(output[0], args.output_image, args.cuda)
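net.gram_matrix is part of the accompanying style-transfer module; a sketch of what it typically computes (channel-wise Gram matrix of a (B, C, H, W) feature map, normalized by C*H*W):

from mxnet import nd

def gram_matrix(y):
    b, c, h, w = y.shape
    features = y.reshape((b, c, h * w))
    # batch_dot of (B, C, HW) with its own transpose gives (B, C, C)
    return nd.batch_dot(features, features, transpose_b=True) / (c * h * w)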
Example #23
def train_and_predict_rnn(rnn, is_random_iter, num_epochs, num_steps,
                          num_hiddens, lr, clipping_theta, batch_size,
                          vocab_size, pred_period, pred_len, prefixes,
                          get_params, get_inputs, ctx, corpus_indices,
                          idx_to_char, char_to_idx, is_lstm=False):
    """Train an RNN model and predict the next item in the sequence."""
    if is_random_iter:
        data_iter = data_iter_random
    else:
        data_iter = data_iter_consecutive
    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()

    for epoch in range(1, num_epochs + 1):
        if not is_random_iter:
            state_h = nd.zeros(shape=(batch_size, num_hiddens), ctx=ctx)
            if is_lstm:
                state_c = nd.zeros(shape=(batch_size, num_hiddens), ctx=ctx)
        train_l_sum = nd.array([0], ctx=ctx)
        train_l_cnt = 0
        for X, Y in data_iter(corpus_indices, batch_size, num_steps, ctx):
            if is_random_iter:
                state_h = nd.zeros(shape=(batch_size, num_hiddens), ctx=ctx)
                if is_lstm:
                    state_c = nd.zeros(shape=(batch_size, num_hiddens),
                                       ctx=ctx)
            else:
                state_h = state_h.detach()
                if is_lstm:
                    state_c = state_c.detach()       
            with autograd.record():
                if is_lstm:
                    outputs, state_h, state_c = rnn(
                        get_inputs(X, vocab_size), state_h, state_c, *params) 
                else:
                    outputs, state_h = rnn(
                        get_inputs(X, vocab_size), state_h, *params)
                y = Y.T.reshape((-1,))
                outputs = nd.concat(*outputs, dim=0)
                l = loss(outputs, y)
            l.backward()
            grad_clipping(params, clipping_theta, ctx)
            sgd(params, lr, 1)
            train_l_sum = train_l_sum + l.sum()
            train_l_cnt += l.size
        if epoch % pred_period == 0:
            print("\nepoch %d, perplexity %f"
                  % (epoch, (train_l_sum / train_l_cnt).exp().asscalar()))
            for prefix in prefixes:
                print(' - ', predict_rnn(
                    rnn, prefix, pred_len, params, num_hiddens, vocab_size,
                    ctx, idx_to_char, char_to_idx, get_inputs, is_lstm))
Example #24
def test_lstmp():
    hidden_size, projection_size = 3, 2
    rtol, atol = 1e-2, 1e-2
    batch_size, seq_len = 7, 11
    input_size = 5
    ctx = mx.gpu(0)
    lstm_input = mx.nd.uniform(
        shape=(seq_len, batch_size, input_size), ctx=ctx)
    shapes = {'i2h_weight': (hidden_size * 4, input_size),
              'h2h_weight': (hidden_size * 4, projection_size),
              'i2h_bias': (hidden_size * 4,),
              'h2h_bias': (hidden_size * 4,),
              'h2r_weight': (projection_size, hidden_size)}
    weights = {k: rand_ndarray(v) for k, v in shapes.items()}
    lstm_layer = gluon.rnn.LSTM(hidden_size, projection_size=projection_size,
                                input_size=input_size, prefix='lstm0_')
    lstm_cell = gluon.contrib.rnn.LSTMPCell(hidden_size=hidden_size,
                                            projection_size=projection_size,
                                            input_size=input_size,
                                            prefix='lstm0_l0_')
    lstm_layer.initialize(ctx=ctx)
    lstm_cell.initialize(ctx=ctx)
    layer_params = lstm_layer.collect_params()
    cell_params = lstm_cell.collect_params()
    for k, v in weights.items():
        layer_params['lstm0_l0_' + k].set_data(v.copy())
        cell_params['lstm0_l0_' + k].set_data(v.copy())
    with autograd.record():
        layer_output = lstm_layer(lstm_input.copy())
        cell_output = lstm_cell.unroll(seq_len, lstm_input.copy(), layout='TNC',
                                       merge_outputs=True)[0]
    assert_almost_equal(layer_output.asnumpy(),
                        cell_output.asnumpy(), rtol=rtol, atol=atol)
    layer_output.backward()
    cell_output.backward()
    for k, v in weights.items():
        layer_grad = layer_params['lstm0_l0_' + k].grad()
        cell_grad = cell_params['lstm0_l0_' + k].grad()
        print('checking gradient for {}'.format('lstm0_l0_' + k))
        assert_almost_equal(layer_grad.asnumpy(), cell_grad.asnumpy(),
                            rtol=rtol, atol=atol)
    check_rnn_layer_forward(gluon.rnn.LSTM(
        10, 2, projection_size=5), mx.nd.ones((8, 3, 20)), ctx=ctx)
    check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, projection_size=5, bidirectional=True), mx.nd.ones(
        (8, 3, 20)), [mx.nd.ones((4, 3, 5)), mx.nd.ones((4, 3, 10))], ctx=ctx)

    check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, dropout=0.5, projection_size=5), mx.nd.ones((8, 3, 20)),
                            run_only=True, ctx=ctx)
    check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, bidirectional=True, dropout=0.5, projection_size=5),
                            mx.nd.ones((8, 3, 20)),
                            [mx.nd.ones((4, 3, 5)), mx.nd.ones((4, 3, 10))], run_only=True, ctx=ctx)
Example #25
 def train(self, data, label, batch_size):
     """
     Description : training for LipNet
     """
     # pylint: disable=no-member
     sum_losses = 0
     len_losses = 0
     with autograd.record():
         losses = [self.loss_fn(self.net(X), Y) for X, Y in zip(data, label)]
     for loss in losses:
         sum_losses += mx.nd.array(loss).sum().asscalar()
         len_losses += len(loss)
         loss.backward()
     self.trainer.step(batch_size)
     return sum_losses, len_losses
Example #26
def train(epochs, ctx):
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    net.initialize(mx.init.Xavier(magnitude=2), ctx=ctx)
    kv = mx.kv.create(opt.kvstore)
    train_data, val_data = get_data_iters(dataset, batch_size, kv.num_workers, kv.rank)
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': opt.lr, 'wd': opt.wd, 'momentum': opt.momentum},
                            kvstore = kv)
    metric = mx.metric.Accuracy()
    loss = gluon.loss.SoftmaxCrossEntropyLoss()

    for epoch in range(epochs):
        tic = time.time()
        train_data.reset()
        metric.reset()
        btic = time.time()
        for i, batch in enumerate(train_data):
            data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
            label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
            outputs = []
            Ls = []
            with ag.record():
                for x, y in zip(data, label):
                    z = net(x)
                    L = loss(z, y)
                    # store the loss and do backward after we have done forward
                    # on all GPUs for better speed on multiple GPUs.
                    Ls.append(L)
                    outputs.append(z)
                for L in Ls:
                    L.backward()
            trainer.step(batch.data[0].shape[0])
            metric.update(label, outputs)
            if opt.log_interval and not (i+1)%opt.log_interval:
                name, acc = metric.get()
                logging.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f'%(
                               epoch, i, batch_size/(time.time()-btic), name, acc))
            btic = time.time()

        name, acc = metric.get()
        logging.info('[Epoch %d] training: %s=%f'%(epoch, name, acc))
        logging.info('[Epoch %d] time cost: %f'%(epoch, time.time()-tic))
        name, val_acc = test(ctx, val_data)
        logging.info('[Epoch %d] validation: %s=%f'%(epoch, name, val_acc))

    net.save_params('image-classifier-%s-%d.params'%(opt.model, epochs))
Example #27
def train():
    best_val = float("Inf")
    for epoch in range(args.epochs):
        total_L = 0.0
        start_time = time.time()
        hidden = model.begin_state(func=mx.nd.zeros, batch_size=args.batch_size, ctx=context)
        for i, (data, target) in enumerate(train_data):
            data = data.as_in_context(context).T
            target = target.as_in_context(context).T.reshape((-1, 1))
            hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                L = loss(output, target)
                L.backward()

            grads = [p.grad(context) for p in model.collect_params().values()]
            # Here gradient is for the whole batch.
            # So we multiply max_norm by batch_size and bptt size to balance it.
            gluon.utils.clip_global_norm(grads, args.clip * args.bptt * args.batch_size)

            trainer.step(args.batch_size)
            total_L += mx.nd.sum(L).asscalar()

            if i % args.log_interval == 0 and i > 0:
                cur_L = total_L / args.bptt / args.batch_size / args.log_interval
                print('[Epoch %d Batch %d] loss %.2f, ppl %.2f'%(
                    epoch, i, cur_L, math.exp(cur_L)))
                total_L = 0.0

        val_L = eval(val_data)

        print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f'%(
            epoch, time.time()-start_time, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            test_L = eval(test_data)
            model.collect_params().save(args.save)
            print('test loss %.2f, test ppl %.2f'%(test_L, math.exp(test_L)))
        else:
            args.lr = args.lr*0.25
            trainer._init_optimizer('sgd',
                                    {'learning_rate': args.lr,
                                     'momentum': 0,
                                     'wd': 0})
            model.collect_params().load(args.save, context)
Example #28
    def explain_pattern(self, data, out=None, attribution=False):
        X = Mlist(data)
        X.attach_grad()

        with autograd.record():
            y = self.forward_pattern(X)

        if attribution:
            self.overload_weight_attribution_pattern()
        else:
            self.overload_weight_pattern()

        if out is None:
            out = y
        y.backward(out_grad=out)
        self.overload_weight_reset()
        return X.grad
Example #29
def _get_grad(net, image, class_id=None, conv_layer_name=None, image_grad=False):
    """This is an internal helper function that can be used for either of these
    but not both at the same time:
    1. Record the output and gradient of output of an intermediate convolutional layer.
    2. Record the gradients of the image.

    Parameters
    ----------
    image : NDArray
        Image to visualize. This is an NDArray with the preprocessed image.
    class_id : int
        Category ID this image belongs to. If not provided,
        network's prediction will be used.
    conv_layer_name: str
        Name of the convolutional layer whose output and output's gradients need to be captured.
    image_grad: bool
        Whether to capture gradients of the image."""

    if image_grad:
        image.attach_grad()
        Conv2D.capture_layer_name = None
        Activation.set_guided_backprop(True)
    else:
        # Tell convviz.Conv2D which layer's output and gradient needs to be recorded
        Conv2D.capture_layer_name = conv_layer_name
        Activation.set_guided_backprop(False)
    
    # Run the network
    with autograd.record(train_mode=False):
        out = net(image)
    
    # If user didn't provide a class id, we'll use the class that the network predicted
    if class_id is None:
        model_output = out.asnumpy()
        class_id = np.argmax(model_output)

    # Create a one-hot target with class_id and backprop with the created target
    one_hot_target = mx.nd.one_hot(mx.nd.array([class_id]), 1000)
    out.backward(one_hot_target, train_mode=False)

    if image_grad:
        return image.grad[0].asnumpy()
    else:
        # Return the recorded convolution output and gradient
        conv_out = Conv2D.conv_output
        return conv_out[0].asnumpy(), conv_out.grad[0].asnumpy()
Example #30
def train():
    best_val = float("Inf")
    for epoch in range(args.epochs):
        total_L = 0.0
        start_time = time.time()
        hidden = model.begin_state(func=mx.nd.zeros, batch_size=args.batch_size, ctx=context)
        for i, (data, target) in enumerate(train_data):
            data = data.as_in_context(context).T
            target = target.as_in_context(context).T.reshape((-1, 1))
            hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                # Here L is a vector of size batch_size * bptt size
                L = loss(output, target)
                L = L / (args.bptt * args.batch_size)
                L.backward()

            grads = [p.grad(context) for p in model.collect_params().values()]
            gluon.utils.clip_global_norm(grads, args.clip)

            trainer.step(1)
            total_L += mx.nd.sum(L).asscalar()

            if i % args.log_interval == 0 and i > 0:
                cur_L = total_L / args.log_interval
                print('[Epoch %d Batch %d] loss %.2f, ppl %.2f'%(
                    epoch, i, cur_L, math.exp(cur_L)))
                total_L = 0.0

            if args.export_model:
                model.export('model')
                return

        val_L = eval(val_data)

        print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f'%(
            epoch, time.time()-start_time, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            test_L = eval(test_data)
            model.save_parameters(args.save)
            print('test loss %.2f, test ppl %.2f'%(test_L, math.exp(test_L)))
        else:
            args.lr = args.lr*0.25
            trainer.set_learning_rate(args.lr)
Example #31
    def train(self, batch_size=64,
              num_epoch=10,
              optimizer='adam',
              optimizer_params=(('learning_rate', 0.001),),
              load_checkpoint=False,
              context='cpu',
              reconstruction_loss='mse',
              preprocessing=False,
              checkpoint_period=5,
              load_pretrained=False,
              normalize=False,
              log_period = 50,
              kl_loss_weight=1,
              print_images=False):
        preprocessing = False #TODO to be added - create an additional load_vae_data for preprocessing case
        num_pus = 1
        if context == 'gpu':
            num_pus = mx.context.num_gpus()
            if num_pus >= 1:
                if num_pus == 1:
                    mx_context = [mx.gpu(0)]
                else:
                    mx_context = [mx.gpu(i) for i in range(num_pus)]
            else:
                logging.error("Context argument is '" + context + "'. But no gpu is present in the system.")
                sys.exit(1)
        elif context == 'cpu':
            mx_context = [mx.cpu()]
        else:
            logging.error("Context argument is '" + context + "'. Only 'cpu' and 'gpu are valid arguments'.")
            sys.exit(1)

        single_pu_batch_size = int(batch_size / num_pus)

        if print_images:
            try:
                logging.info("Creating 'images' directory...")
                if not os.path.isdir('images'):
                    os.mkdir('images')
                else:
                    logging.info("'images' directory already exists.")
            except:
                logging.error("Creation of the 'images' directory failed.")

        input_names = [ "data"]
        train_iter, test_iter, data_mean, data_std, _, _ = self._data_loader.load_vae_data(batch_size=batch_size, input_names=input_names)

        if 'weight_decay' in optimizer_params:
            optimizer_params['wd'] = optimizer_params['weight_decay']
            del optimizer_params['weight_decay']
        if 'learning_rate_decay' in optimizer_params:
            min_learning_rate = 1e-08
            if 'learning_rate_minimum' in optimizer_params:
                min_learning_rate = optimizer_params['learning_rate_minimum']
                del optimizer_params['learning_rate_minimum']
            optimizer_params['lr_scheduler'] = mx.lr_scheduler.FactorScheduler(
                                                  optimizer_params['step_size'],
                                                  factor=optimizer_params['learning_rate_decay'],
                                                  stop_factor_lr=min_learning_rate)
            del optimizer_params['step_size']
            del optimizer_params['learning_rate_decay']

        begin_epoch = 0

        if load_checkpoint:
            begin_epoch = self._enc_creator.load(mx_context)
            _ = self._dec_creator.load(mx_context)
        elif load_pretrained:
            self._enc_creator.load_pretrained_weights(mx_context)
            self._dec_creator.load_pretrained_weights(mx_context)
        else:
            if os.path.isdir(self._enc_creator._model_dir_):
                shutil.rmtree(self._enc_creator._model_dir_)
            if os.path.isdir(self._dec_creator._model_dir_):
                shutil.rmtree(self._dec_creator._model_dir_)

        if normalize:
            self._enc_creator.construct(context=mx_context, batch_size=batch_size, data_mean=data_mean, data_std=data_std)
            self._dec_creator.construct(context=mx_context, batch_size=batch_size, data_mean=data_mean, data_std=data_std)
        else:
            self._enc_creator.construct(context=mx_context, batch_size=batch_size)
            self._dec_creator.construct(context=mx_context, batch_size=batch_size)

        encoder_nets = self._enc_creator.networks
        decoder_nets = self._dec_creator.networks

        if len(encoder_nets) > 1:
            logging.error("VAE-components don't support multiple networkmodels yet. Encoder-Networks found: " + str(len(encoder_nets)))
            sys.exit(1)
        elif len(decoder_nets) > 1:
            logging.error("VAE-components don't support multiple networkmodels yet. Decoder-Networks found: " + str(len(decoder_nets)))
            sys.exit(1)

        loss_ctx_list = []

        loss_ctx_list.append(encoder_nets[0].loss_ctx_dict)
        


        enc_trainers = [mx.gluon.Trainer(network.collect_params(), optimizer, optimizer_params) for network in
                    encoder_nets.values() if len(network.collect_params().values()) != 0]
        dec_trainers = [mx.gluon.Trainer(network.collect_params(), optimizer, optimizer_params) for network in
                    decoder_nets.values() if len(network.collect_params().values()) != 0]


        loss_function = VAELoss(recon_loss=reconstruction_loss, kl_loss_weight=kl_loss_weight, loss_ctx_list=loss_ctx_list)

        loss_function.hybridize()

        tic = None

        avg_speed = 0
        n = 0

        train_lost_list = []
        test_lost_list = []

        for epoch in range(begin_epoch, begin_epoch + num_epoch):
            global_loss_train = 0.0
            global_reconloss = 0.0
            train_batches = 0

            loss_total = 0
            recon_total = 0
            train_iter.reset()

            for batch_i, batch in enumerate(train_iter):

                with autograd.record():
                    indexed_labels = 0
                    indexed_data = 0
                    if "data" == "label":
                        data_ = gluon.utils.split_and_load(batch.label[indexed_labels], ctx_list=mx_context, even_split=False)
                        indexed_labels += 1
                    else:
                        data_ = gluon.utils.split_and_load(batch.data[indexed_data], ctx_list=mx_context, even_split=False)
                        indexed_data += 1

                    lossList = []
                    loss_param_list = []
                    reconstruction_losses = []
                    encoding_ = []
                    pred_= []

                    for i in range(num_pus):
                        lossList.append([])
                        loss_param_list.append([])
                        reconstruction_losses.append([])
                        encoding_.append([])
                        pred_.append([])

                    nd.waitall()
                    for i in range(num_pus):
                        feature_vec, loss_params_enc = encoder_nets[0]( data_[i])
                        loss_param_list[i].append(loss_params_enc)
                        encoding_[i] = feature_vec[0]

                    nd.waitall()
                    for i in range(num_pus):
                        res_ = decoder_nets[0]( encoding_[i])
                        
                        pred_[i] = res_[0][0]

                    nd.waitall()
                    for i in range(num_pus):
                        elbo, reconstruction_loss = loss_function(pred_[i], data_[i], loss_param_list[i])
                        lossList[i].append(elbo)
                        reconstruction_losses[i].append(reconstruction_loss)


                    losses = [0] * num_pus
                    reconLosses = [0] * num_pus
                    for i in range(num_pus):
                        for element in lossList[i]:
                            losses[i] = losses[i] + element
                        for r in reconstruction_losses[i]:
                            reconLosses[i] = reconLosses[i] + r

                for loss in losses:
                    loss.backward()
                    loss_total += loss.sum().asscalar()
                    global_loss_train += loss.sum().asscalar()

                for loss in reconLosses:
                    recon_total += loss.sum().asscalar()
                    global_reconloss += loss.sum().asscalar()

                train_batches += 1

                for trainer in dec_trainers:
                    trainer.step(batch_size)
                for trainer in enc_trainers:
                    trainer.step(batch_size)

                if tic is None:
                    tic = time.time()
                else:
                    if batch_i % log_period == 0:
                        try:
                            speed = log_period * batch_size / (time.time() - tic)
                        except ZeroDivisionError:
                            speed = float("inf")

                        loss_avg = loss_total / (batch_size * log_period)
                        recon_avg = recon_total / (batch_size * log_period)
                        loss_total = 0
                        recon_total = 0

                        logging.info("Epoch[%d] Batch[%d] Speed: %.2f samples/sec Average Negative-ELBO Loss: %.5f, Reconstruction Loss: %.5f" % (
                        epoch, batch_i, speed, loss_avg, recon_avg))

                        avg_speed += speed
                        n += 1

                        tic = time.time()


            global_loss_train /= (train_batches * batch_size)
            global_reconloss /= (train_batches * batch_size)

            tic = None

            global_loss_test = 0.0
            test_batches = 0

            test_iter.batch_size = single_pu_batch_size
            test_iter.reset()


            for batch_i, batch in enumerate(test_iter):

                indexed_labels = 0
                indexed_data = 0
                if "data" == "label":
                    data_ = gluon.utils.split_and_load(batch.label[indexed_labels], ctx_list=mx_context, even_split=False)
                    indexed_labels += 1
                else:
                    data_ = gluon.utils.split_and_load(batch.data[indexed_data], ctx_list=mx_context, even_split=False)
                    indexed_data += 1

                lossList = []
                loss_param_list = []
                encoding_ = []
                pred_= []

                for i in range(num_pus):
                    lossList.append([])
                    loss_param_list.append([])
                    encoding_.append([])
                    pred_.append([])

                nd.waitall()
                for i in range(num_pus):
                    feature_vec, loss_params_enc = encoder_nets[0]( data_[i])
                    loss_param_list[i].append(loss_params_enc)
                    encoding_[i] = feature_vec[0]

                nd.waitall()
                for i in range(num_pus):
                    res_ = decoder_nets[0]( encoding_[i])
                    
                    pred_[i] = res_[0][0]

                nd.waitall()
                for i in range(num_pus):
                    elbo, reconstruction_loss = loss_function(pred_[i], data_[i], loss_param_list[i])
                    lossList[i].append(elbo)

                losses = [0] * num_pus
                for i in range(num_pus):
                    for element in lossList[i]:
                        losses[i] = losses[i] + element

                for loss in losses:
                    global_loss_test += loss.sum().asscalar()

                test_batches += 1

            global_loss_test /= (test_batches * single_pu_batch_size)

            logging.info("Epoch[%d], Epoch Train Loss: %f, Epoch Reconstruction Loss: %f, Validation Loss: %f" % (
                epoch, global_loss_train, global_reconloss, global_loss_test))

            if (epoch+1) % checkpoint_period == 0:
                for i, network in encoder_nets.items():
                    if network.save_specific_params_list:
                        for name, param_dic in network.save_specific_params_list:
                            param_dic.save(self.encoder_parameter_path(i) + '-' + name + '.params')
                    network.save_parameters(self.encoder_parameter_path(i) + '-' + str(epoch).zfill(4) + '.params')
                for i, network in decoder_nets.items():
                    if network.save_specific_params_list:
                        for name, param_dic in network.save_specific_params_list:
                            param_dic.save(self.decoder_parameter_path(i) + '-' + name + '.params')
                    network.save_parameters(self.decoder_parameter_path(i) + '-' + str(epoch).zfill(4) + '.params')

            if print_images:
                train_lost_list.append(global_loss_train)
                test_lost_list.append(global_loss_test)

                try:
                    #Reconstructions
                    filename = 'test_reconstruction_%06d%06d.png' % (epoch, batch_i)
                    fig = plt.figure()
                    ax = fig.add_subplot(1, 2, 1)
                    plt.imshow(data_[0][0].squeeze(0).asnumpy())
                    ax.set_title('Original')
                    ax = fig.add_subplot(1, 2, 2)
                    plt.imshow(pred_[0][0].squeeze(0).asnumpy())
                    ax.set_title('Reconstruction')
                    plt.tight_layout()
                    plt.savefig('images/' + filename)
                    plt.close()
                except Exception:
                    logging.info("Could not save reconstruction images.")

        if print_images:
            if num_epoch != 1:
                try:
                    #Loss plot
                    batch_x = np.linspace(1, num_epoch, len(train_lost_list))
                    filename = 'loss_graph.png'
                    plt.plot(batch_x, np.array(train_lost_list))
                    plt.plot(batch_x, np.array(test_lost_list))
                    plt.legend(['Train loss', 'Validation Loss'])
                    plt.savefig('images/' + filename)
                except Exception:
                    logging.info("Could not save loss plot image.")

        for i, network in encoder_nets.items():
            if network.save_specific_params_list:
                for name, param_dic in network.save_specific_params_list:
                    param_dic.save(self.encoder_parameter_path(i) + '-' + name + '.params')
            network.save_parameters(self.encoder_parameter_path(i) + '-' + str((num_epoch-1) + begin_epoch).zfill(4) + '.params')
            network.export(self.encoder_parameter_path(i) + '_newest', epoch=0)

            loss_function.export(self.encoder_parameter_path(i) + '_newest_loss', epoch=0)

        for i, network in decoder_nets.items():
            if network.save_specific_params_list:
                for name, param_dic in network.save_specific_params_list:
                    param_dic.save(self.decoder_parameter_path(i) + '-' + name + '.params')
            network.save_parameters(self.decoder_parameter_path(i) + '-' + str((num_epoch-1) + begin_epoch).zfill(4) + '.params')
            network.export(self.decoder_parameter_path(i) + '_newest', epoch=0)

            loss_function.export(self.decoder_parameter_path(i) + '_newest_loss', epoch=0)
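
The `loss_function` used above is defined elsewhere in this trainer; its call signature
(`loss_function(pred, data, loss_params)` returning `(elbo, reconstruction_loss)`) suggests a
standard VAE objective. A minimal sketch under that assumption, with the hypothetical names
`mu` and `log_var` for the encoder's distribution parameters, might look like:

from mxnet import nd

def vae_elbo_loss(pred, data, loss_params):
    # Hypothetical sketch: assumes loss_params holds one (mu, log_var) pair from the encoder.
    mu, log_var = loss_params[0]
    # Per-sample reconstruction term: sum of squared errors over all non-batch axes.
    reconstruction_loss = nd.sum(nd.square(pred - data), axis=0, exclude=True)
    # KL divergence between N(mu, exp(log_var)) and the standard normal prior.
    kl = -0.5 * nd.sum(1 + log_var - nd.square(mu) - nd.exp(log_var), axis=0, exclude=True)
    elbo = reconstruction_loss + kl  # negative ELBO, i.e. the quantity to minimize
    return elbo, reconstruction_loss
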
Exemplo n.º 32
0
def train_mnist(epochs,
                input_shape,
                n_class,
                num_routing,
                recon_loss_weight,
                ctx=mx.gpu(0),
                log_interval=20,
                **kwargs):
    batch_size, C, H, W = input_shape
    capsnet = CapsNet(n_class, num_routing, input_shape)
    capsnet.initialize(init=mx.init.Xavier(), ctx=ctx)

    capsnet.hybridize()

    #mnist = mx.test_utils.get_mnist()
    #train_iter = mx.io.NDArrayIter(mnist['train_data'], mnist['train_label'], batch_size, shuffle=True)
    #val_iter = mx.io.NDArrayIter(mnist['test_data'], mnist['test_label'], batch_size)
    train_iter = mx.io.MNISTIter(image="data/train-images.idx3-ubyte",
                                 label="data/train-labels.idx1-ubyte",
                                 batch_size=batch_size,
                                 shuffle=True)
    val_iter = mx.io.MNISTIter(image="data/t10k-images.idx3-ubyte",
                               label="data/t10k-labels.idx1-ubyte",
                               batch_size=batch_size,
                               shuffle=False)

    draw_num = 32
    draw_batch = val_iter.next()
    draw_data = draw_batch.data[0].as_in_context(ctx)
    draw_label = draw_batch.label[0].as_in_context(ctx)
    draw_label = mx.nd.one_hot(draw_label, n_class)
    learning_rate = 0.001
    lr_scheduler = SimpleLRScheduler(learning_rate)
    decay = 0.9
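    # The learning rate is decayed exponentially by this factor after every epoch
    # (see the lr_scheduler update at the bottom of the epoch loop).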
    trainer = gluon.Trainer(capsnet.collect_params(),
                            optimizer='adam',
                            optimizer_params={'lr_scheduler': lr_scheduler})

    train_plt = viz.line(Y=np.zeros((1, 3)),
                         X=np.zeros((1, 3)),
                         opts=dict(
                             xlabel='Batch',
                             ylabel='Loss and Acc',
                             title='CapsNet traning plot',
                             legend=['Accuracy', 'Digit Loss', 'Mask Loss']))
    val_plt = viz.line(Y=np.zeros((1, 3)),
                       X=np.zeros((1, 3)),
                       opts=dict(
                           xlabel='Epoch',
                           ylabel='Loss and Acc',
                           title='CapsNet validation plot',
                           legend=['Accuracy', 'Digit Loss', 'Mask Loss']))
    mask_plt = viz.images(np.random.randn(draw_num * 2, 1, 28, 28),
                          opts=dict(title='Mask images', caption='Mask'))
    hist_acc = 0
    loss_metric = LossMetric(batch_size, 1)
    val_metric = LossMetric(batch_size, 1)
    batches_one_epoch = 60000 / batch_size
    for epoch in range(epochs):
        train_iter.reset()
        val_iter.reset()
        loss_metric.reset()
        for i, batch in enumerate(train_iter):
            tic = time.time()
            x = batch.data[0].as_in_context(ctx)
            y = batch.label[0].as_in_context(ctx)
            y_ori = y
            y = mx.nd.one_hot(y, n_class)
            with autograd.record():
                out_caps, out_mask = capsnet(x, y)
                margin_loss_ = margin_loss(mx.nd, y, out_caps)
                mask_loss_ = mask_mse_loss(mx.nd, x, out_mask)
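                # The total loss blends the capsule margin loss with the reconstruction
                # (mask) MSE, weighted by recon_loss_weight.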
                loss = (1 - recon_loss_weight
                        ) * margin_loss_ + recon_loss_weight * mask_loss_
            loss.backward()
            trainer.step(batch_size)
            loss_metric.update([y_ori], [out_caps, loss, mask_loss_])

            if i % log_interval == 0:
                acc, digit_loss, mask_loss = loss_metric.get_name_value()
                viz.line(Y=np.array([acc, digit_loss, mask_loss]).reshape(
                    (1, 3)),
                         X=np.ones((1, 3)) * batches_one_epoch * epoch + i,
                         win=train_plt,
                         update='append')
                take_num = min(draw_num, batch_size)
                pred_label, pred_mask = capsnet(draw_data, draw_label)
                draw = np.concatenate([
                    draw_data[:take_num].asnumpy(),
                    pred_mask[:take_num].asnumpy()
                ])
                viz.images(draw, win=mask_plt)
                elapsed = time.time() - tic
                print('Epoch %2d, train %s %.5f, time %.1f sec, %d samples/s' % (
                    epoch, "acc", acc, elapsed, int(batch_size / elapsed)))

        lr_scheduler.learning_rate = learning_rate * (decay**(epoch + 1))

        val_metric.reset()
        for i, batch in enumerate(val_iter):
            x = batch.data[0].as_in_context(ctx)
            y = batch.label[0].as_in_context(ctx)
            y_ori = y
            y = mx.nd.one_hot(y, n_class)
            out_caps, out_mask = capsnet(x, y)
            margin_loss_ = margin_loss(mx.nd, y, out_caps)
            mask_loss_ = mask_mse_loss(mx.nd, x, out_mask)
            loss = (1 - recon_loss_weight
                    ) * margin_loss_ + recon_loss_weight * mask_loss_
            val_metric.update([y_ori], [out_caps, loss, mask_loss_])
        acc, digit_loss, mask_loss = val_metric.get_name_value()
        viz.line(Y=np.array([acc, digit_loss, mask_loss]).reshape((1, 3)),
                 X=np.ones((1, 3)) * epoch,
                 win=val_plt,
                 update='append')
        if acc > hist_acc:
            hist_acc = acc
            capsnet.save_params("model/capsnet_%f.params" % acc)
        print('Epoch %2d, validation %s %.5f' % (epoch, "acc", acc))
Exemplo n.º 33
0
    def train(ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]

        if opt.no_wd:
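            # Exclude BatchNorm beta/gamma and bias parameters from weight decay.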
            for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
                v.wd_mult = 0.0

        if opt.partial_bn:
            train_patterns = None
            if 'inceptionv3' in opt.model:
                train_patterns = '.*weight|.*bias|inception30_batchnorm0_gamma|inception30_batchnorm0_beta|inception30_batchnorm0_running_mean|inception30_batchnorm0_running_var'
            else:
                logger.info('Current model does not support partial batch normalization.')

            if opt.kvstore is not None:
                trainer = gluon.Trainer(net.collect_params(train_patterns), optimizer, optimizer_params, kvstore=kv, update_on_kvstore=False)
            else:
                trainer = gluon.Trainer(net.collect_params(train_patterns), optimizer, optimizer_params, update_on_kvstore=False)
        else:
            if opt.kvstore is not None:
                trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params, kvstore=kv, update_on_kvstore=False)
            else:
                trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params, update_on_kvstore=False)

        if opt.accumulate > 1:
            params = [p for p in net.collect_params().values() if p.grad_req != 'null']
            for p in params:
                p.grad_req = 'add'

        if opt.resume_states != '':
            trainer.load_states(opt.resume_states)

        if opt.use_amp:
            amp.init_trainer(trainer)

        L = gluon.loss.SoftmaxCrossEntropyLoss()

        best_val_score = 0
        lr_decay_count = 0

        for epoch in range(opt.resume_epoch, opt.num_epochs):
            tic = time.time()
            train_metric.reset()
            btic = time.time()
            num_train_iter = len(train_data)
            train_loss_epoch = 0
            train_loss_iter = 0

            for i, batch in enumerate(train_data):
                data, label = batch_fn(batch, ctx)

                with ag.record():
                    outputs = []
                    for _, X in enumerate(data):
#                        X = X.reshape((-1,) + X.shape[2:])
                        X = X.reshape((-3,-3,-2))
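                        # MXNet reshape convention: -3 merges two consecutive input axes into
                        # one, and -2 copies the remaining axes unchanged.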
                        pred = net(X.astype(opt.dtype, copy=False))
                        outputs.append(pred)
                    loss = [L(yhat, y.astype(opt.dtype, copy=False)) for yhat, y in zip(outputs, label)]
#                    print(loss)
                    if opt.use_amp:
                        with amp.scale_loss(loss, trainer) as scaled_loss:
                            ag.backward(scaled_loss)
                    else:
                        ag.backward(loss)

                if opt.accumulate > 1 and (i + 1) % opt.accumulate == 0:
                    if opt.kvstore is not None:
                        trainer.step(batch_size * kv.num_workers * opt.accumulate)
                    else:
                        trainer.step(batch_size * opt.accumulate)
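                        # grad_req was set to 'add' above for gradient accumulation, so the
                        # accumulated gradients must be cleared manually after each update.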
                        net.collect_params().zero_grad()
                else:
                    if opt.kvstore is not None:
                        trainer.step(batch_size * kv.num_workers)
                    else:
                        trainer.step(batch_size)

#                print(outputs)
#                print(label)
                train_metric.update(label, outputs)
                train_loss_iter = sum([l.mean().asscalar() for l in loss]) / len(loss)
                train_loss_epoch += train_loss_iter

                train_metric_name, train_metric_score = train_metric.get()
                sw.add_scalar(tag='train_acc_top1_iter', value=train_metric_score*100, global_step=epoch * num_train_iter + i)
                sw.add_scalar(tag='train_loss_iter', value=train_loss_iter, global_step=epoch * num_train_iter + i)
                sw.add_scalar(tag='learning_rate_iter', value=trainer.learning_rate, global_step=epoch * num_train_iter + i)

                if opt.log_interval and not (i+1) % opt.log_interval:
                    logger.info('Epoch[%03d] Batch [%04d]/[%04d]\tSpeed: %f samples/sec\t %s=%f\t loss=%f\t lr=%f' % (
                                epoch, i, num_train_iter, batch_size*opt.log_interval/(time.time()-btic),
                                train_metric_name, train_metric_score*100, train_loss_epoch/(i+1), trainer.learning_rate))
                    btic = time.time()

            train_metric_name, train_metric_score = train_metric.get()
            throughput = int(batch_size * i /(time.time() - tic))
            mx.ndarray.waitall()

            if opt.kvstore is not None and epoch == opt.resume_epoch:
                kv.init(111111, nd.zeros(1))
                kv.init(555555, nd.zeros(1))
                kv.init(999999, nd.zeros(1))

            if opt.kvstore is not None:
                acc_top1_val, acc_top5_val, loss_val = test(ctx, val_data, kv)
            else:
                acc_top1_val, acc_top5_val, loss_val = test(ctx, val_data)

            logger.info('[Epoch %03d] training: %s=%f\t loss=%f' % (epoch, train_metric_name, train_metric_score*100, train_loss_epoch/num_train_iter))
            logger.info('[Epoch %03d] speed: %d samples/sec\ttime cost: %f' % (epoch, throughput, time.time()-tic))
            logger.info('[Epoch %03d] validation: acc-top1=%f acc-top5=%f loss=%f' % (epoch, acc_top1_val*100, acc_top5_val*100, loss_val))

            sw.add_scalar(tag='train_loss_epoch', value=train_loss_epoch/num_train_iter, global_step=epoch)
            sw.add_scalar(tag='val_loss_epoch', value=loss_val, global_step=epoch)
            sw.add_scalar(tag='val_acc_top1_epoch', value=acc_top1_val*100, global_step=epoch)

            if acc_top1_val > best_val_score:
                best_val_score = acc_top1_val
                net.save_parameters('%s/%.4f-%s-%s-%03d-best.params'%(opt.save_dir, best_val_score, opt.dataset, model_name, epoch))
                trainer.save_states('%s/%.4f-%s-%s-%03d-best.states'%(opt.save_dir, best_val_score, opt.dataset, model_name, epoch))
            else:
                if opt.save_frequency and opt.save_dir and (epoch + 1) % opt.save_frequency == 0:
                    net.save_parameters('%s/%s-%s-%03d.params'%(opt.save_dir, opt.dataset, model_name, epoch))
                    trainer.save_states('%s/%s-%s-%03d.states'%(opt.save_dir, opt.dataset, model_name, epoch))

        # save the last model
        net.save_parameters('%s/%s-%s-%03d.params'%(opt.save_dir, opt.dataset, model_name, opt.num_epochs-1))
        trainer.save_states('%s/%s-%s-%03d.states'%(opt.save_dir, opt.dataset, model_name, opt.num_epochs-1))
Exemplo n.º 34
0

def relu(X):
    return nd.maximum(X, 0)


def net(X):
    X = X.reshape((-1, num_input))
    h1 = relu(nd.dot(X, w1) + b1)
    output = nd.dot(h1, w2) + b2
    return output


loss = gloss.SoftmaxCrossEntropyLoss()
epochs = 5
learning_rate = 0.01
for i in range(epochs):
    train_loss = 0.
    train_acc = 0.
    for X, Y in train_iter:
        with ag.record():
            Y_hat = net(X)
            l = loss(Y_hat, Y)
        l.backward()
        utils.SGD(params, learning_rate / batch_size)
        # accumulate metrics over every batch, not only the last one
        train_loss += nd.mean(l).asscalar()
        train_acc += utils.accuracy(Y_hat, Y)
    test_acc = utils.evaluate_accuracy(test_iter, net)
    print("Epoch %d train_loss %f train_acc %f test_acc %f" % (
        i, train_loss / len(train_iter), train_acc / len(train_iter), test_acc))
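
The snippet above relies on a `utils.SGD` helper that is not shown here. A minimal sketch of
such a mini-batch SGD update (an assumption, not the original helper) could be:

def SGD(params, lr):
    # In-place update: param <- param - lr * grad. The caller passes
    # learning_rate / batch_size because backward() on a per-sample loss vector
    # accumulates gradients over the whole batch.
    for param in params:
        param[:] = param - lr * param.grad
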
def train():
    logging.info('Start Training for Task: %s\n' % (task))

    finetune_net = build_model()

    # Define DataLoader
    train_data = gluon.data.DataLoader(
        gluon.data.vision.ImageFolderDataset(
            os.path.join('data/train_valid', task, 'train'),
            transform=transform_train),
        batch_size=batch_size, shuffle=True, num_workers=num_workers, last_batch='discard')

    val_data = gluon.data.DataLoader(
        gluon.data.vision.ImageFolderDataset(
            os.path.join('data/train_valid', task, 'val'),
            transform=transform_val),
        batch_size=batch_size, shuffle=False, num_workers=num_workers)

    # Define Trainer
    trainer = gluon.Trainer(finetune_net.collect_params(), 'sgd', {
        'learning_rate': lr, 'momentum': momentum, 'wd': wd})
    metric = mx.metric.Accuracy()
    L = gluon.loss.SoftmaxCrossEntropyLoss()
    lr_counter = 0
    num_batch = len(train_data)

    # Start Training
    for epoch in range(epochs):
        if epoch == lr_steps[lr_counter]:
            trainer.set_learning_rate(trainer.learning_rate*lr_factor)
            lr_counter += 1

        tic = time.time()
        train_loss = 0
        metric.reset()
        AP = 0.
        AP_cnt = 0

        for i, batch in enumerate(train_data):
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0, even_split=False)
            label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0, even_split=False)
            with ag.record():
                outputs = [finetune_net(X) for X in data]
                loss = [L(yhat, y) for yhat, y in zip(outputs, label)]
            for l in loss:
                l.backward()

            trainer.step(batch_size)
            train_loss += sum([l.mean().asscalar() for l in loss]) / len(loss)

            metric.update(label, outputs)
            ap, cnt = calculate_ap(label, outputs)
            AP += ap
            AP_cnt += cnt

            progressbar(i, num_batch-1)

        train_map = AP / AP_cnt
        _, train_acc = metric.get()
        train_loss /= num_batch

        val_acc, val_map, val_loss = validate(finetune_net, val_data, ctx)

        logging.info('[Epoch %d] Train-acc: %.3f, mAP: %.3f, loss: %.3f | Val-acc: %.3f, mAP: %.3f, loss: %.3f | time: %.1fs' %
                 (epoch, train_acc, train_map, train_loss, val_acc, val_map, val_loss, time.time() - tic))

        saved_path = os.path.join(CKPT_PATH, '%s-%s-epoch-%d.params' % (task, time.strftime("%Y-%m-%d-%H-%M", time.localtime(time.time())), epoch))
        finetune_net.save_params(saved_path)
        logging.info('\nsave results at %s' % saved_path)
    return (finetune_net, saved_path)
def train(net, train_data, val_data, eval_metric, ctx, args):
    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)

    fix_pattern = get_fix_params_pattern(args.network)
    param_dict = net.collect_params(fix_pattern)
    for _, param in param_dict.items():
        param.grad_req = 'null'
    logger.info('Fixed such params for net:\n%s' % param_dict)

    trainer = gluon.Trainer(net.collect_params(), 'sgd', {
        'learning_rate': args.lr,
        'wd': args.wd,
        'momentum': args.momentum
    })
    # lr decay policy
    lr_decay = float(args.lr_decay)
    lr_steps = sorted(
        [float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])

    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_metric = [0]
    loss = gluon.loss.SoftmaxCrossEntropyLoss()
    loss_metric = mx.metric.Loss('CELoss')
    num_batch = len(train_data)
    # Start Training
    for epoch in range(args.epochs):
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(
                epoch, new_lr))

        tic = time.time()
        btic = time.time()
        loss_metric.reset()
        eval_metric.reset()

        for i, batch in enumerate(train_data):
            data_list = gluon.utils.split_and_load(batch[0],
                                                   ctx_list=ctx,
                                                   batch_axis=0,
                                                   even_split=False)
            label_list = gluon.utils.split_and_load(batch[1],
                                                    ctx_list=ctx,
                                                    batch_axis=0,
                                                    even_split=False)
            output_list = []
            loss_list = []
            with autograd.record():
                for data, label in zip(data_list, label_list):
                    output = net(data)
                    output_list.append(output)
                    loss_list.append(loss(output, label))
            autograd.backward(loss_list)
            trainer.step(args.batch_size)

            loss_metric.update(None, loss_list)
            eval_metric.update(label_list, output_list)

            if args.log_interval and not (i + 1) % args.log_interval:
                _, train_loss = loss_metric.get()
                metric_name, metric_value = eval_metric.get()
                speed = args.log_interval * args.batch_size / (time.time() -
                                                               btic)
                logger.info(
                    '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, CELoss={:.3f}, {}={}'
                    .format(epoch, i, speed, train_loss, metric_name,
                            metric_value))
                btic = time.time()

        _, train_loss = loss_metric.get()
        metric_name, metric_value = eval_metric.get()
        if not isinstance(metric_value, (list, tuple)):
            metric_name = [metric_name]
            metric_value = [metric_value]
        metric_msg = '\n'.join(
            ['{}={}'.format(k, v) for k, v in zip(metric_name, metric_value)])
        logger.info(
            '[Epoch {}] Training cost: {:.3f}, CELoss={:.3f}\n{}'.format(
                epoch, (time.time() - tic), train_loss, metric_msg))

        if not (epoch + 1) % args.val_interval:
            metric_name, metric_value = validate(net, val_data, ctx,
                                                 eval_metric)
            if not isinstance(metric_value, (list, tuple)):
                metric_name = [metric_name]
                metric_value = [metric_value]
            metric_msg = '\n'.join([
                '{}={}'.format(k, v) for k, v in zip(metric_name, metric_value)
            ])
            logger.info('[Epoch {}] Validation: \n{}'.format(
                epoch, metric_msg))
            current_metric = metric_value[-1]
        else:
            current_metric = 0.

        save_params(net, logger, best_metric, current_metric, epoch,
                    args.save_interval, args.save_prefix)
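
The `validate` helper called above is defined outside this excerpt. A plausible sketch (an
assumption based only on how it is used here: it returns the eval metric's name/value pair) is:

from mxnet import gluon

def validate(net, val_data, ctx, eval_metric):
    # Hypothetical sketch: run the network over the validation loader and report the metric.
    eval_metric.reset()
    for batch in val_data:
        data_list = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0, even_split=False)
        label_list = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0, even_split=False)
        outputs = [net(data) for data in data_list]
        eval_metric.update(label_list, outputs)
    return eval_metric.get()
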
Exemplo n.º 37
0
def train(net, train_data, val_data, eval_metric, ctx, args):
    """Training pipeline"""
    net.collect_params().reset_ctx(ctx)

    if args.horovod:
        hvd.broadcast_parameters(net.collect_params(), root_rank=0)
        trainer = hvd.DistributedTrainer(net.collect_params(), 'sgd', {
            'learning_rate': args.lr,
            'wd': args.wd,
            'momentum': args.momentum
        })
    else:
        trainer = gluon.Trainer(
            net.collect_params(),
            'sgd', {
                'learning_rate': args.lr,
                'wd': args.wd,
                'momentum': args.momentum
            },
            update_on_kvstore=(False if args.amp else None))

    if args.amp:
        amp.init_trainer(trainer)

    # lr decay policy
    lr_decay = float(args.lr_decay)
    lr_steps = sorted(
        [float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])

    mbox_loss = gcv.loss.SSDMultiBoxLoss()
    ce_metric = mx.metric.Loss('CrossEntropy')
    smoothl1_metric = mx.metric.Loss('SmoothL1')

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]

    for epoch in range(args.start_epoch, args.epochs):
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(
                epoch, new_lr))
        ce_metric.reset()
        smoothl1_metric.reset()
        tic = time.time()
        btic = time.time()
        net.hybridize(static_alloc=True, static_shape=True)

        for i, batch in enumerate(train_data):
            if args.dali:
                # dali iterator returns a mxnet.io.DataBatch
                data = [d.data[0] for d in batch]
                box_targets = [d.label[0] for d in batch]
                cls_targets = [
                    nd.cast(d.label[1], dtype='float32') for d in batch
                ]

            else:
                data = gluon.utils.split_and_load(batch[0],
                                                  ctx_list=ctx,
                                                  batch_axis=0)
                cls_targets = gluon.utils.split_and_load(batch[1],
                                                         ctx_list=ctx,
                                                         batch_axis=0)
                box_targets = gluon.utils.split_and_load(batch[2],
                                                         ctx_list=ctx,
                                                         batch_axis=0)

            with autograd.record():
                cls_preds = []
                box_preds = []
                for x in data:
                    cls_pred, box_pred, _ = net(x)
                    cls_preds.append(cls_pred)
                    box_preds.append(box_pred)
                sum_loss, cls_loss, box_loss = mbox_loss(
                    cls_preds, box_preds, cls_targets, box_targets)
                if args.amp:
                    with amp.scale_loss(sum_loss, trainer) as scaled_loss:
                        autograd.backward(scaled_loss)
                else:
                    autograd.backward(sum_loss)
            # since we have already normalized the loss, we don't want to normalize
            # by batch-size anymore
            trainer.step(1)

            if (not args.horovod or hvd.rank() == 0):
                local_batch_size = int(args.batch_size //
                                       (hvd.size() if args.horovod else 1))
                ce_metric.update(0, [l * local_batch_size for l in cls_loss])
                smoothl1_metric.update(
                    0, [l * local_batch_size for l in box_loss])
                if args.log_interval and not (i + 1) % args.log_interval:
                    name1, loss1 = ce_metric.get()
                    name2, loss2 = smoothl1_metric.get()
                    logger.info(
                        '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}'
                        .format(epoch, i,
                                args.batch_size / (time.time() - btic), name1,
                                loss1, name2, loss2))
                btic = time.time()

        if (not args.horovod or hvd.rank() == 0):
            name1, loss1 = ce_metric.get()
            name2, loss2 = smoothl1_metric.get()
            logger.info(
                '[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}'.
                format(epoch, (time.time() - tic), name1, loss1, name2, loss2))
            if (epoch % args.val_interval
                    == 0) or (args.save_interval
                              and epoch % args.save_interval == 0):
                # consider reducing the frequency of validation to save time
                map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
                val_msg = '\n'.join(
                    ['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
                logger.info('[Epoch {}] Validation: \n{}'.format(
                    epoch, val_msg))
                current_map = float(mean_ap[-1])
            else:
                current_map = 0.
            save_params(net, best_map, current_map, epoch, args.save_interval,
                        args.save_prefix)
Exemplo n.º 38
0
def test_lstmp():
    hidden_size, projection_size = 3, 2
    rtol, atol = 1e-2, 1e-2
    batch_size, seq_len = 7, 11
    input_size = 5
    ctx = mx.gpu(0)
    lstm_input = mx.nd.uniform(shape=(seq_len, batch_size, input_size),
                               ctx=ctx)
    shapes = {
        'i2h_weight': (hidden_size * 4, input_size),
        'h2h_weight': (hidden_size * 4, projection_size),
        'i2h_bias': (hidden_size * 4, ),
        'h2h_bias': (hidden_size * 4, ),
        'h2r_weight': (projection_size, hidden_size)
    }
    weights = {k: rand_ndarray(v) for k, v in shapes.items()}
    lstm_layer = gluon.rnn.LSTM(hidden_size,
                                projection_size=projection_size,
                                input_size=input_size)
    lstm_cell = gluon.rnn.LSTMPCell(hidden_size=hidden_size,
                                    projection_size=projection_size,
                                    input_size=input_size)
    lstm_layer.initialize(ctx=ctx)
    lstm_cell.initialize(ctx=ctx)
    layer_params = lstm_layer.collect_params()
    cell_params = lstm_cell.collect_params()
    for k, v in weights.items():
        layer_params['l0_' + k].set_data(v.copy())
        cell_params[k].set_data(v.copy())
    with autograd.record():
        layer_output = lstm_layer(lstm_input.copy())
        cell_output = lstm_cell.unroll(seq_len,
                                       lstm_input.copy(),
                                       layout='TNC',
                                       merge_outputs=True)[0]

    assert_almost_equal(layer_output, cell_output, rtol=rtol, atol=atol)
    layer_output.backward()
    cell_output.backward()
    for k, v in weights.items():
        layer_grad = layer_params['l0_' + k].grad()
        cell_grad = cell_params[k].grad()
        print('checking gradient for {}'.format('lstm0_l0_' + k))
        assert_almost_equal(layer_grad, cell_grad, rtol=rtol, atol=atol)
    check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, projection_size=5),
                            mx.nd.ones((8, 3, 20)),
                            ctx=ctx)
    check_rnn_layer_forward(
        gluon.rnn.LSTM(10, 2, projection_size=5, bidirectional=True),
        mx.nd.ones((8, 3, 20)),
        [mx.nd.ones((4, 3, 5)), mx.nd.ones((4, 3, 10))],
        ctx=ctx)
    check_rnn_layer_forward(gluon.rnn.LSTM(10,
                                           2,
                                           dropout=0.5,
                                           projection_size=5),
                            mx.nd.ones((8, 3, 20)),
                            run_only=True,
                            ctx=ctx)
    check_rnn_layer_forward(
        gluon.rnn.LSTM(10,
                       2,
                       bidirectional=True,
                       dropout=0.5,
                       projection_size=5),
        mx.nd.ones((8, 3, 20)),
        [mx.nd.ones((4, 3, 5)), mx.nd.ones((4, 3, 10))],
        run_only=True,
        ctx=ctx)
    lstm_layer.save_parameters('gpu_tmp.params')
    lstm_layer.load_parameters('gpu_tmp.params')
Exemplo n.º 39
0
    def _train_loop(self,
                    train_data,
                    val_data,
                    train_eval_data,
                    time_limit=math.inf):
        start_tic = time.time()
        # fix seed for mxnet, numpy and python builtin random generator.
        gutils.random.seed(self._cfg.train.seed)
        # loss and metric
        mbox_loss = SSDMultiBoxLoss()
        ce_metric = mx.metric.Loss('CrossEntropy')
        smoothl1_metric = mx.metric.Loss('SmoothL1')

        # lr decay policy
        lr_decay = float(self._cfg.train.lr_decay)
        lr_steps = sorted([float(ls) for ls in self._cfg.train.lr_decay_epoch])

        self._logger.info('Start training from [Epoch %d]',
                          max(self._cfg.train.start_epoch, self.epoch))

        self.net.collect_params().reset_ctx(self.ctx)
        early_stopper = EarlyStopperOnPlateau(
            patience=self._cfg.train.early_stop_patience,
            min_delta=self._cfg.train.early_stop_min_delta,
            baseline_value=self._cfg.train.early_stop_baseline,
            max_value=self._cfg.train.early_stop_max_value)
        mean_ap = [-1]
        cp_name = ''
        self._time_elapsed += time.time() - start_tic
        for self.epoch in range(max(self._cfg.train.start_epoch, self.epoch),
                                self._cfg.train.epochs):
            epoch = self.epoch
            tic = time.time()
            last_tic = time.time()
            if self._best_map >= 1.0:
                self._logger.info(
                    '[Epoch {}] Early stopping as mAP is reaching 1.0'.format(
                        epoch))
                break
            should_stop, stop_message = early_stopper.get_early_stop_advice()
            if should_stop:
                self._logger.info('[Epoch {}] '.format(epoch) + stop_message)
                break
            while lr_steps and epoch >= lr_steps[0]:
                new_lr = self.trainer.learning_rate * lr_decay
                lr_steps.pop(0)
                self.trainer.set_learning_rate(new_lr)
                self._logger.info("[Epoch {}] Set learning rate to {}".format(
                    epoch, new_lr))
            ce_metric.reset()
            smoothl1_metric.reset()
            self.net.hybridize(static_alloc=True, static_shape=True)

            for i, batch in enumerate(train_data):
                btic = time.time()
                if self._time_elapsed > time_limit:
                    self._logger.warning(
                        f'`time_limit={time_limit}` reached, exit early...')
                    return {
                        'train_map': float(mean_ap[-1]),
                        'valid_map': self._best_map,
                        'time': self._time_elapsed,
                        'checkpoint': cp_name
                    }
                if self._cfg.train.dali:
                    # dali iterator returns a mxnet.io.DataBatch
                    data = [d.data[0] for d in batch]
                    box_targets = [d.label[0] for d in batch]
                    cls_targets = [
                        nd.cast(d.label[1], dtype='float32') for d in batch
                    ]
                else:
                    data = gluon.utils.split_and_load(batch[0],
                                                      ctx_list=self.ctx,
                                                      batch_axis=0,
                                                      even_split=False)
                    cls_targets = gluon.utils.split_and_load(batch[1],
                                                             ctx_list=self.ctx,
                                                             batch_axis=0,
                                                             even_split=False)
                    box_targets = gluon.utils.split_and_load(batch[2],
                                                             ctx_list=self.ctx,
                                                             batch_axis=0,
                                                             even_split=False)

                with autograd.record():
                    cls_preds = []
                    box_preds = []
                    for x in data:
                        cls_pred, box_pred, _ = self.net(x)
                        cls_preds.append(cls_pred)
                        box_preds.append(box_pred)
                    sum_loss, cls_loss, box_loss = mbox_loss(
                        cls_preds, box_preds, cls_targets, box_targets)
                    if self._cfg.ssd.amp:
                        with amp.scale_loss(sum_loss,
                                            self.trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(sum_loss)
                # since we have already normalized the loss, we don't want to normalize
                # by batch-size anymore
                self.trainer.step(1)

                if not self._cfg.horovod or hvd.rank() == 0:
                    local_batch_size = int(
                        self._cfg.train.batch_size //
                        (hvd.size() if self._cfg.horovod else 1))
                    ce_metric.update(0,
                                     [l * local_batch_size for l in cls_loss])
                    smoothl1_metric.update(
                        0, [l * local_batch_size for l in box_loss])
                    if self._cfg.train.log_interval and not (
                            i + 1) % self._cfg.train.log_interval:
                        name1, loss1 = ce_metric.get()
                        name2, loss2 = smoothl1_metric.get()
                        self._logger.info(
                            '[Epoch %d][Batch %d], Speed: %f samples/sec, %s=%f, %s=%f',
                            epoch, i, self._cfg.train.batch_size /
                            (time.time() - last_tic), name1, loss1, name2,
                            loss2)
                        last_tic = time.time()
                self._time_elapsed += time.time() - btic

            post_tic = time.time()
            if not self._cfg.horovod or hvd.rank() == 0:
                name1, loss1 = ce_metric.get()
                name2, loss2 = smoothl1_metric.get()
                self._logger.info('[Epoch %d] Training cost: %f, %s=%f, %s=%f',
                                  epoch, (time.time() - tic), name1, loss1,
                                  name2, loss2)
                if (epoch % self._cfg.valid.val_interval == 0) or \
                    (self._cfg.save_interval and epoch % self._cfg.save_interval == 0):
                    # consider reducing the frequency of validation to save time
                    map_name, mean_ap = self._evaluate(val_data)
                    val_msg = '\n'.join([
                        '{}={}'.format(k, v)
                        for k, v in zip(map_name, mean_ap)
                    ])
                    self._logger.info('[Epoch %d] Validation: \n%s', epoch,
                                      str(val_msg))
                    current_map = float(mean_ap[-1])
                    if current_map > self._best_map:
                        cp_name = os.path.join(self._logdir,
                                               _BEST_CHECKPOINT_FILE)
                        self._logger.info(
                            '[Epoch %d] Current best map: %f vs previous %f, saved to %s',
                            self.epoch, current_map, self._best_map, cp_name)
                        self.save(cp_name)
                        self._best_map = current_map
                    if self._reporter:
                        self._reporter(epoch=epoch, map_reward=current_map)
                    early_stopper.update(current_map, epoch=epoch)
            self._time_elapsed += time.time() - post_tic
        # map on train data
        tic = time.time()
        map_name, mean_ap = self._evaluate(train_eval_data)
        self._time_elapsed += time.time() - tic
        return {
            'train_map': float(mean_ap[-1]),
            'valid_map': self._best_map,
            'time': self._time_elapsed,
            'checkpoint': cp_name
        }
Exemplo n.º 40
0
def run(outdir):
    """ Runs a set of training and validation experiments and stores result in a directory. """
    ''' Set up paths and start log '''
    logfile = outdir + 'log.txt'
    f = open(logfile, 'w')
    f.close()
    ''' Hyperparameters '''
    epochs = int(FLAGS.iterations)
    learning_rate = float(FLAGS.learning_rate)
    wd = float(FLAGS.weight_decay)
    train_experiments = int(FLAGS.experiments)
    learning_rate_factor = float(FLAGS.learning_rate_factor)
    learning_rate_steps = int(
        FLAGS.learning_rate_steps
    )  # changes the learning rate for every n updates.
    data_train = FLAGS.data_dir + FLAGS.data_train
    data_train_valid = FLAGS.data_dir + FLAGS.data_test
    ''' Set GPUs/CPUs '''
    num_gpus = mx.context.num_gpus()
    num_workers = int(
        FLAGS.num_workers)  # replace num_workers with the number of cores
    ctx = mx.gpu() if num_gpus > 0 else mx.cpu()
    units = num_gpus if num_gpus > 0 else 1
    batch_size_per_unit = int(FLAGS.batch_size_per_unit)  # mini-batch size
    batch_size = batch_size_per_unit * max(units, 1)
    ''' Set random seeds '''
    random.seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)
    mx.random.seed(FLAGS.seed)
    ''' Save parameters '''
    save_config(outdir + 'config.txt', FLAGS)

    log(
        logfile, 'Training with hyperparameters: alpha=%.2g, lambda=%.2g' %
        (FLAGS.p_alpha, FLAGS.weight_decay))
    ''' Load dataset '''
    train_dataset = load_data(data_train, normalize=FLAGS.normalize_input)

    log(logfile, 'Training data: ' + data_train)
    log(logfile, 'Valid data:     ' + data_train_valid)
    log(
        logfile, 'Loaded data with shape [%d,%d]' %
        (train_dataset['n'], train_dataset['dim']))
    ''' CFR Neural Network Architecture for ITE estimation '''
    net = CFRNet(FLAGS.dim_rep, FLAGS.dim_hyp, FLAGS.weight_init_scale,
                 train_dataset['dim'], FLAGS.batch_norm)
    ''' Instantiate net '''
    net.initialize(ctx=ctx)
    net.hybridize()  # hybridize for better performance
    ''' Metric, Loss and Optimizer '''
    rmse_metric = mx.metric.RMSE()
    l2_loss = gluon.loss.L2Loss()
    wass_loss = WassersteinLoss(
        lam=FLAGS.wass_lambda,
        its=FLAGS.wass_iterations,
        square=True,
        backpropT=FLAGS.wass_bpg)  # Change too at hybrid_test_net_with_cfr
    scheduler = mx.lr_scheduler.FactorScheduler(step=learning_rate_steps,
                                                factor=learning_rate_factor,
                                                base_lr=learning_rate)
    optimizer = mx.optimizer.Adam(learning_rate=learning_rate,
                                  lr_scheduler=scheduler)
    # optimizer = mx.optimizer.Adam(learning_rate=learning_rate, lr_scheduler=scheduler, wd=wd)
    trainer = gluon.Trainer(net.collect_params(), optimizer=optimizer)
    ''' Initialize train score results '''
    train_scores = np.zeros((train_experiments, 3))
    ''' Initialize train experiment durations '''
    train_durations = np.zeros((train_experiments, 1))
    ''' Initialize valid score results '''
    test_scores = np.zeros((train_experiments, 3))
    ''' Train experiments means and stds '''
    means = np.array([])
    stds = np.array([])
    ''' Train '''
    for train_experiment in range(train_experiments):
        ''' Create training dataset '''
        x = train_dataset['x'][:, :, train_experiment]
        t = np.reshape(train_dataset['t'][:, train_experiment], (-1, 1))
        yf = train_dataset['yf'][:, train_experiment]
        ycf = train_dataset['ycf'][:, train_experiment]
        mu0 = train_dataset['mu0'][:, train_experiment]
        mu1 = train_dataset['mu1'][:, train_experiment]

        train, valid, test, _ = split_data_in_train_valid_test(
            x, t, yf, ycf, mu0, mu1)
        ''' With-in sample '''
        train_evaluator = Evaluator(
            np.concatenate([train['t'], valid['t']]),
            np.concatenate([train['yf'], valid['yf']]),
            y_cf=np.concatenate([train['ycf'], valid['ycf']], axis=0),
            mu0=np.concatenate([train['mu0'], valid['mu0']], axis=0),
            mu1=np.concatenate([train['mu1'], valid['mu1']], axis=0))
        test_evaluator = Evaluator(test['t'], test['yf'], test['ycf'],
                                   test['mu0'], test['mu1'])
        ''' Plot first experiment original TSNE visualization '''
        if train_experiment == 0:
            ''' Learned representations of first experiment for TSNE visualization '''
            first_exp_reps = []
        ''' Normalize yf '''
        if FLAGS.normalize_input:
            yf_m, yf_std = np.mean(train['yf'], axis=0), np.std(train['yf'],
                                                                axis=0)
            train['yf'] = (train['yf'] - yf_m) / yf_std
            valid['yf'] = (valid['yf'] - yf_m) / yf_std
            test['yf'] = (test['yf'] - yf_m) / yf_std
            ''' Save mean and std '''
            means = np.append(means, yf_m)
            stds = np.append(stds, yf_std)
        ''' Train dataset '''
        factual_features = np.hstack((train['x'], train['t']))
        train_factual_dataset = gluon.data.ArrayDataset(
            mx.nd.array(factual_features), mx.nd.array(train['yf']))
        ''' With-in sample '''
        train_rmse_ite_dataset = gluon.data.ArrayDataset(
            mx.nd.array(np.concatenate([train['x'], valid['x']])))
        ''' Valid dataset '''
        valid_factual_features = np.hstack((valid['x'], valid['t']))
        valid_factual_dataset = gluon.data.ArrayDataset(
            mx.nd.array(valid_factual_features), mx.nd.array(valid['yf']))
        ''' Test dataset '''
        test_rmse_ite_dataset = gluon.data.ArrayDataset(
            mx.nd.array(test['x']))  # todo rename, rmse_ite has nothing to do
        ''' Train DataLoader '''
        train_factual_loader = gluon.data.DataLoader(train_factual_dataset,
                                                     batch_size=batch_size,
                                                     shuffle=True,
                                                     num_workers=num_workers)
        train_rmse_ite_loader = gluon.data.DataLoader(train_rmse_ite_dataset,
                                                      batch_size=batch_size,
                                                      shuffle=False,
                                                      num_workers=num_workers)
        ''' Valid DataLoader '''
        valid_factual_loader = gluon.data.DataLoader(valid_factual_dataset,
                                                     batch_size=batch_size,
                                                     shuffle=False,
                                                     num_workers=num_workers)
        ''' Test DataLoader '''
        test_rmse_ite_loader = gluon.data.DataLoader(test_rmse_ite_dataset,
                                                     batch_size=batch_size,
                                                     shuffle=False,
                                                     num_workers=num_workers)

        number_of_batches = len(train_factual_loader)
        ''' Compute treatment probability '''
        treatment_probability = np.mean(train['t'])

        train_start = time.time()
        ''' Train model '''
        for epoch in range(
                1, epochs +
                1):  # start with epoch 1 for easier learning rate calculation

            train_loss = 0
            rmse_metric.reset()
            obj_loss = 0
            imb_err = 0

            for i, (batch_f_features,
                    batch_yf) in enumerate(train_factual_loader):
                ''' Get data and labels into slices and copy each slice into a context. '''
                batch_f_features = batch_f_features.as_in_context(ctx)
                batch_yf = batch_yf.as_in_context(ctx)

                x = batch_f_features[:, :-1]
                t = batch_f_features[:, -1]
                ''' Get treatment and control indices. Batch_size must be enough to have at least one t=1 sample '''
                t1_idx = np.where(t == 1)[0]
                t0_idx = np.where(t == 0)[0]

                if t1_idx.shape[0] == 0:
                    log(
                        logfile, 'Encountered no treatment samples at batch ' +
                        str(i) + '.')
                ''' Compute sample reweighing '''
                if FLAGS.reweight_sample:
                    # Reweight samples so the treated and control groups contribute equally:
                    # w_t = t / (2p), w_c = (1 - t) / (2(1 - p)), with p the treatment probability.
                    w_t = t / (2 * treatment_probability)
                    w_c = (1 - t) / (2 * (1 - treatment_probability))
                    sample_weight = w_t + w_c
                else:
                    sample_weight = 1.0
                ''' Initialize outputs '''
                outputs = np.zeros(batch_yf.shape)
                loss = np.zeros(batch_yf.shape)
                ''' Forward (Factual) '''
                with autograd.record():
                    t1_o, t0_o, rep_o = net(x, mx.nd.array(t1_idx),
                                            mx.nd.array(t0_idx))

                    risk = 0

                    t1_o_loss = l2_loss(t1_o, batch_yf[t1_idx],
                                        sample_weight[t1_idx])
                    np.put(loss, t1_idx, t1_o_loss.asnumpy())
                    np.put(outputs, t1_idx, t1_o.asnumpy())
                    risk = risk + t1_o_loss.sum()

                    t0_o_loss = l2_loss(t0_o, batch_yf[t0_idx],
                                        sample_weight[t0_idx])
                    np.put(loss, t0_idx, t0_o_loss.asnumpy())
                    np.put(outputs, t0_idx, t0_o.asnumpy())
                    risk = risk + t0_o_loss.sum()

                    if FLAGS.normalization == 'divide':
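                        # Project the representation onto the unit L2 sphere before computing
                        # the imbalance (Wasserstein) penalty.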
                        h_rep_norm = rep_o / mx_safe_sqrt(
                            mx.nd.sum(
                                mx.nd.square(rep_o), axis=1, keepdims=True))
                    else:
                        h_rep_norm = 1.0 * rep_o

                    imb_dist = wass_loss(h_rep_norm[t1_idx],
                                         h_rep_norm[t0_idx])

                    imb_error = FLAGS.p_alpha * imb_dist

                    tot_error = risk

                    if FLAGS.p_alpha > 0:
                        tot_error = tot_error + imb_error
                    ''' Save last epoch of first experiment reps for TSNE vis. '''
                    if train_experiment == 0 and epoch == epochs:
                        first_exp_reps.extend(rep_o)
                ''' Backward '''
                tot_error.backward()
                ''' Optimize '''
                trainer.step(batch_size)

                train_loss += loss.mean()
                rmse_metric.update(batch_yf, mx.nd.array(outputs))

                obj_loss += tot_error.asscalar()
                imb_err += imb_error.asscalar()

            if epoch % FLAGS.epoch_output_iter == 0 or epoch == 1:
                _, train_rmse_factual = rmse_metric.get()
                train_loss /= number_of_batches
                (_, valid_rmse_factual), _, _ = hybrid_test_net_with_cfr(
                    net, valid_factual_loader, ctx, FLAGS, np.mean(valid['t']))

                log(
                    logfile,
                    '[Epoch %d/%d] Train-rmse-factual: %.3f | Loss: %.3f | learning-rate: '
                    '%.3E | ObjLoss: %.3f | ImbErr: %.3f | Valid-rmse-factual: %.3f'
                    % (epoch, epochs, train_rmse_factual, train_loss,
                       trainer.learning_rate, obj_loss, imb_err,
                       valid_rmse_factual))
        ''' Plot first experiment learned TSNE visualization '''
        if train_experiment == 0:
            tsne_plot_pca(data=train['x'],
                          label=train['t'],
                          learned_representation=np.asarray(
                              [ind.asnumpy() for ind in first_exp_reps]),
                          outdir=outdir + FLAGS.architecture.lower())

        train_durations[train_experiment, :] = time.time() - train_start
        ''' Test model with valid data '''
        y_t0, y_t1, = hybrid_predict_treated_and_controlled_with_cfr(
            net, train_rmse_ite_loader, ctx)
        if FLAGS.normalize_input:
            y_t0, y_t1 = y_t0 * yf_std + yf_m, y_t1 * yf_std + yf_m
        train_score = train_evaluator.get_metrics(y_t1, y_t0)
        train_scores[train_experiment, :] = train_score

        y_t0, y_t1, = hybrid_predict_treated_and_controlled_with_cfr(
            net, test_rmse_ite_loader, ctx)
        if FLAGS.normalize_input:
            y_t0, y_t1 = y_t0 * yf_std + yf_m, y_t1 * yf_std + yf_m
        test_score = test_evaluator.get_metrics(y_t1, y_t0)
        test_scores[train_experiment, :] = test_score

        log(logfile, '[Train Replication {}/{}]: train RMSE ITE: {:0.3f}, train ATE: {:0.3f}, train PEHE: {:0.3f},' \
                     ' test RMSE ITE: {:0.3f}, test ATE: {:0.3f}, test PEHE: {:0.3f}'.format(train_experiment + 1,
                                                                                             train_experiments,
                                                                                             train_score[0],
                                                                                             train_score[1],
                                                                                             train_score[2],
                                                                                             test_score[0],
                                                                                             test_score[1],
                                                                                             test_score[2]))
    ''' Save means and stds NDArray values for inference '''
    if FLAGS.normalize_input:
        mx.nd.save(
            outdir + FLAGS.architecture.lower() + '_means_stds_ihdp_' +
            str(train_experiments) + '_.nd', {
                "means": mx.nd.array(means),
                "stds": mx.nd.array(stds)
            })
    ''' Export trained models '''
    # See mxnet.apache.org/api/python/docs/tutorials/packages/gluon/blocks/save_load_params.html
    net.export(outdir + FLAGS.architecture.lower() + "-ihdp-predictions-" +
               str(train_experiments))  # hybrid

    log(logfile,
        '\n{} architecture total scores:'.format(FLAGS.architecture.upper()))
    ''' Train and test scores '''
    means, stds = np.mean(train_scores, axis=0), sem(train_scores,
                                                     axis=0,
                                                     ddof=0)
    r_pehe_mean, r_pehe_std = np.mean(np.sqrt(train_scores[:, 2]),
                                      axis=0), sem(np.sqrt(train_scores[:, 2]),
                                                   axis=0,
                                                   ddof=0)
    train_total_scores_str = 'train RMSE ITE: {:.2f} ± {:.2f}, train ATE: {:.2f} ± {:.2f}, train PEHE: {:.2f} ± {:.2f}, ' \
                             'train root PEHE: {:.2f} ± {:.2f}' \
                             ''.format(means[0], stds[0], means[1], stds[1], means[2], stds[2], r_pehe_mean, r_pehe_std)

    means, stds = np.mean(test_scores, axis=0), sem(test_scores,
                                                    axis=0,
                                                    ddof=0)
    r_pehe_mean, r_pehe_std = np.mean(np.sqrt(test_scores[:, 2]),
                                      axis=0), sem(np.sqrt(test_scores[:, 2]),
                                                   axis=0,
                                                   ddof=0)
    test_total_scores_str = 'test RMSE ITE: {:.2f} ± {:.2f}, test ATE: {:.2f} ± {:.2f}, test PEHE: {:.2f} ± {:.2f}, ' \
                            'test root PEHE: {:.2f} ± {:.2f}' \
                            ''.format(means[0], stds[0], means[1], stds[1], means[2], stds[2], r_pehe_mean, r_pehe_std)

    log(logfile, train_total_scores_str)
    log(logfile, test_total_scores_str)

    mean_duration = float("{0:.2f}".format(
        np.mean(train_durations, axis=0)[0]))

    with open(outdir + FLAGS.architecture.lower() + "-total-scores-" +
              str(train_experiments),
              "w",
              encoding="utf8") as text_file:
        print(train_total_scores_str,
              "\n",
              test_total_scores_str,
              file=text_file)

    return {
        "ite": "{:.2f} ± {:.2f}".format(means[0], stds[0]),
        "ate": "{:.2f} ± {:.2f}".format(means[1], stds[1]),
        "pehe": "{:.2f} ± {:.2f}".format(means[2], stds[2]),
        "mean_duration": mean_duration
    }
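
The representation normalization above uses an `mx_safe_sqrt` helper that is not shown. A
plausible definition (an assumption) is a square root clamped away from zero, so that the
division and its gradient stay finite:

import mxnet as mx

def mx_safe_sqrt(x, eps=1e-12):
    # Hypothetical helper: numerically safe element-wise square root.
    return mx.nd.sqrt(mx.nd.maximum(x, eps))
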
Exemplo n.º 41
0
    nn.Dense(10))
net.initialize(init=init.Xavier())

softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()

trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.1})


def acc(output, label):
    return (output.argmax(axis=1) == label.astype('float32')).mean().asscalar()

for epoch in range(10):
    train_loss, train_acc, valid_acc = 0., 0., 0.
    tic = time.time()
    for data, label in train_data:
        with autograd.record():
            output = net(data)
            loss = softmax_cross_entropy(output, label)
        loss.backward()

        # update params
        trainer.step(batch_size)
        # calc training metrics
        train_loss += loss.mean().asscalar()
        train_acc += acc(output, label)

    for data, label in valid_data:
        valid_acc += acc(net(data), label)
    
    print("Epoch %d: loss %.3f, train acc %.3f, test acc %.3f, in %.1f sec" %
            (epoch, train_loss/len(train_data), train_acc/len(train_data),
Exemplo n.º 42
0
def train():
    """Train the conditional GAN (generator ``netG`` and discriminator ``netD``)."""
    image_pool = ImagePool(pool_size)
    metric = mx.metric.CustomMetric(facc)

    stamp = datetime.now().strftime('%Y_%m_%d-%H_%M')
    logging.basicConfig(level=logging.DEBUG)

    # define a summary writer that logs data and flushes to the file every 5 seconds
    sw = SummaryWriter(logdir='%s' % dir_out_sw, flush_secs=5, verbose=False)
    global_step = 0

    for epoch in range(epochs):
        if epoch == 0:
            netG.hybridize()
            netD.hybridize()
        #     sw.add_graph(netG)
        #     sw.add_graph(netD)

        tic = time.time()
        btic = time.time()
        train_data.reset()
        val_data.reset()
        iter = 0
        for local_step, batch in enumerate(train_data):
            ############################
            # (1) Update D network: maximize log(D(x, y)) + log(1 - D(x, G(x, z)))
            ###########################
            tmp = mx.nd.concat(batch.data[0],
                               batch.data[1],
                               batch.data[2],
                               dim=1)
            tmp = augmenter(tmp,
                            patch_size=128,
                            offset=offset,
                            aug_type=1,
                            aug_methods=aug_methods,
                            random_crop=False)
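            # The augmented tensor stacks [input image, target image, mask]
            # along the channel axis; slice them back apart here.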
            real_in = tmp[:, :1].as_in_context(ctx)
            real_out = tmp[:, 1:2].as_in_context(ctx)
            m = tmp[:, 2:3].as_in_context(ctx)  # mask

            fake_out = netG(real_in) * m

            # loss weight based on mask, applied on L1 loss
            if no_loss_weights:
                loss_weight = m
            else:
                loss_weight = m.asnumpy()
                loss_weight[loss_weight == 0] = .1
                loss_weight = mx.nd.array(loss_weight, ctx=m.context)

            fake_concat = image_pool.query(nd.concat(real_in, fake_out, dim=1))
            with autograd.record():
                # Train with fake image
                # Use image pooling to utilize history images
                output = netD(fake_concat)
                fake_label = nd.zeros(output.shape, ctx=ctx)
                errD_fake = GAN_loss(output, fake_label)
                metric.update([
                    fake_label,
                ], [
                    output,
                ])

                # Train with real image
                real_concat = nd.concat(real_in, real_out, dim=1)
                output = netD(real_concat)
                real_label = nd.ones(output.shape, ctx=ctx)
                errD_real = GAN_loss(output, real_label)
                errD = (errD_real + errD_fake) * 0.5
                errD.backward()
                metric.update([
                    real_label,
                ], [
                    output,
                ])

            trainerD.step(batch.data[0].shape[0])

            ############################
            # (2) Update G network: maximize log(D(x, G(x, z))) - lambda1 * L1(y, G(x, z))
            ###########################
            with autograd.record():
                fake_out = netG(real_in)
                fake_concat = nd.concat(real_in, fake_out, dim=1)
                output = netD(fake_concat)
                real_label = nd.ones(output.shape, ctx=ctx)
                errG = GAN_loss(output, real_label) + loss_2nd(
                    real_out, fake_out, loss_weight) * lambda1
                errG.backward()

            trainerG.step(batch.data[0].shape[0])

            sw.add_scalar(tag='loss',
                          value=('d_loss', errD.mean().asscalar()),
                          global_step=global_step)
            sw.add_scalar(tag='loss',
                          value=('g_loss', errG.mean().asscalar()),
                          global_step=global_step)
            global_step += 1

            if epoch + local_step == 0:
                sw.add_graph(netG)
                img_in_list, img_out_list, m_val = val_data.next().data
                m_val = m_val.as_in_context(ctx)
                sw.add_image('first_minibatch_train_real', norm3(real_out))
                sw.add_image('first_minibatch_val_real',
                             norm3(img_out_list.as_in_context(ctx)))
                netG.export('%snetG' % dir_out_checkpoints)
            if local_step == 0:
                # Log the first batch of images of each epoch (training)
                sw.add_image('first_minibatch_train_fake',
                             norm3(fake_out * m) * m, epoch)
                sw.add_image(
                    'first_minibatch_val_fake',
                    norm3(netG(img_in_list.as_in_context(ctx)) * m_val) *
                    m_val, epoch)
                # norm3(netG(img_in_list.as_in_context(ctx)) * m_val.as_in_context(ctx)), epoch)

            if (iter + 1) % 10 == 0:
                name, acc = metric.get()

                logging.info('speed: {} samples/s'.format(
                    batch_size / (time.time() - btic)))
                logging.info(
                    'discriminator loss = %f, generator loss = %f, binary training acc = %f at iter %d epoch %d'
                    % (nd.mean(errD).asscalar(), nd.mean(errG).asscalar(), acc,
                       iter, epoch))

            iter += 1
            btic = time.time()

        name, acc = metric.get()
        sw.add_scalar(tag='binary_training_acc',
                      value=('acc', acc),
                      global_step=epoch)
        metric.reset()

        fake_val = netG(val_data.data[0][1].as_in_context(ctx))
        loss_val = loss_2nd(val_data.data[1][1].as_in_context(ctx), fake_val,
                            val_data.data[2][1].as_in_context(ctx)) * lambda1
        sw.add_scalar(tag='loss_val',
                      value=('g_loss', loss_val.mean().asscalar()),
                      global_step=epoch)

        if epoch % check_point_interval == 0 or epoch == epochs - 1:
            netD.save_params('%snetD-%04d' % (dir_out_checkpoints, epoch))
            netG.save_params('%snetG-%04d' % (dir_out_checkpoints, epoch))

        logging.info('\nbinary training acc at epoch %d: %s=%f' %
                     (epoch, name, acc))
        logging.info('time: %f' % (time.time() - tic))

    sw.export_scalars('scalar_dict.json')
    sw.close()
Exemplo n.º 43
0
def run(args, outdir):
    """ Run training for NN4 architecture with Variational Bayes. """
    ''' Hyperparameters '''
    epochs = int(args.iterations)
    learning_rate = float(args.learning_rate)
    wd = float(args.weight_decay)
    hidden_size = int(args.hidden_size)
    train_experiments = int(args.experiments)
    learning_rate_factor = float(args.learning_rate_factor)
    learning_rate_steps = int(
        args.learning_rate_steps)  # change the learning rate every n updates
    epoch_output_iter = int(args.epoch_output_iter)
    ''' Logging '''
    logfile = outdir + 'log.txt'
    f = open(logfile, 'w')
    f.close()

    config = {  # TODO may need adjustments
        # "sigma_p1": 1.5,
        "sigma_p1": 1.75,  # og
        # "sigma_p2": 0.25,
        # "sigma_p2": 0.5, # og
        "sigma_p2": 0.5,
        "pi": 0.5,
        "lambda_p": 24.5
    }
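    # These are the prior hyperparameters consumed by BBBLoss below:
    # sigma_p1/sigma_p2 and pi parameterize the scale-mixture prior,
    # lambda_p the exponential prior.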
    ''' Set GPUs/CPUs '''
    num_gpus = mx.context.num_gpus()
    num_workers = int(
        args.num_workers)  # replace num_workers with the number of cores
    ctx = [mx.gpu(i) for i in range(num_gpus)
           ] if num_gpus > 0 else [mx.cpu()]  # todo change as cfr_net_train
    batch_size_per_unit = int(args.batch_size_per_unit)  # mini-batch size
    batch_size = batch_size_per_unit * max(num_gpus, 1)
    ''' Set seeds '''
    for c in ctx:
        mx.random.seed(int(args.seed), c)
    np.random.seed(int(args.seed))
    ''' Feed Forward Neural Network Model (4 hidden layers) '''
    net = ff4_relu_architecture(hidden_size)
    ''' Load datasets '''
    # train_dataset = load_data('../' + args.data_dir + args.data_train) # PyCharm run
    train_dataset = load_data(args.data_dir + args.data_train)  # Terminal run

    log(logfile, 'Training data: ' + args.data_dir + args.data_train)
    log(logfile, 'Valid data:     ' + args.data_dir + args.data_test)
    log(
        logfile, 'Loaded data with shape [%d,%d]' %
        (train_dataset['n'], train_dataset['dim']))

    # ''' Feature correlation '''
    # import pandas as pd
    # df = pd.DataFrame.from_records(train_dataset['x'][:, :, 20])
    # df.insert(25, "t", train_dataset['t'][:, 20])
    # corr = df.corr()
    # import seaborn as sns
    # sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, annot=True, fmt='.1f')
    ''' Instantiate net '''
    ''' Param. init. '''
    net.collect_params().initialize(mx.init.Xavier(), ctx=ctx)
    net.hybridize()
    ''' Forward-propagate a single data set entry once to set up all network 
    parameters (weights and biases) with the desired initializer specified above. '''
    x = train_dataset['x'][:, :, 0]
    t = np.reshape(train_dataset['t'][:, 0], (-1, 1))
    yf = train_dataset['yf'][:, 0]
    yf_m, yf_std = np.mean(yf, axis=0), np.std(yf, axis=0)
    yf = (yf - yf_m) / yf_std
    factual_features = np.hstack((x, t))
    zero_train_factual_dataset = gluon.data.ArrayDataset(
        mx.nd.array(factual_features), mx.nd.array(yf))
    zero_train_factual_loader = gluon.data.DataLoader(
        zero_train_factual_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers)
    for i, (batch_f_features,
            batch_yf) in enumerate(zero_train_factual_loader):
        batch_f_features = gluon.utils.split_and_load(batch_f_features,
                                                      ctx_list=ctx,
                                                      even_split=False)
        [net(x) for x in batch_f_features]
        break

    weight_scale = .1
    rho_offset = -3
    lambda_init = 25
    ''' Initialize variational parameters; mean and variance for each weight '''
    mus = []
    rhos = []
    lambdas = []

    shapes = list(map(lambda x: x.shape, net.collect_params().values()))

    for shape in shapes:
        # mu = gluon.Parameter('mu', shape=shape, init=mx.init.Normal(weight_scale))
        # rho = gluon.Parameter('rho', shape=shape, init=mx.init.Constant(rho_offset))
        lmb = gluon.Parameter('lmb',
                              shape=shape,
                              init=mx.init.Constant(lambda_init))
        # mu.initialize(ctx=ctx)
        # rho.initialize(ctx=ctx)
        lmb.initialize(ctx=ctx)
        # mus.append(mu)
        # rhos.append(rho)
        lambdas.append(lmb)
    # variational_params = mus + rhos
    variational_params = lambdas

    # raw_mus = list(map(lambda x: x.data(ctx[0]), mus))
    # raw_rhos = list(map(lambda x: x.data(ctx[0]), rhos))
    raw_lambdas = list(map(lambda x: x.data(ctx[0]), lambdas))
    ''' Metric, Loss and Optimizer '''
    rmse_metric = mx.metric.RMSE()
    l2_loss = gluon.loss.L2Loss()
    bbb_loss = BBBLoss(ctx[0],
                       log_prior="exponential",
                       sigma_p1=config['sigma_p1'],
                       sigma_p2=config['sigma_p2'],
                       pi=config['pi'],
                       lambda_p=config['lambda_p'])
    # bbb_loss = BBBLoss(ctx[0], log_prior="scale_mixture", sigma_p1=config['sigma_p1'], sigma_p2=config['sigma_p2'],
    #                    pi=config['pi'])
    scheduler = mx.lr_scheduler.FactorScheduler(step=learning_rate_steps,
                                                factor=learning_rate_factor,
                                                base_lr=learning_rate)
    # optimizer = mx.optimizer.Adam(learning_rate=learning_rate, lr_scheduler=scheduler)
    optimizer = mx.optimizer.RMSProp(learning_rate=learning_rate,
                                     lr_scheduler=scheduler,
                                     wd=wd)
    # optimizer = mx.optimizer.Adam(learning_rate=learning_rate)
    trainer = gluon.Trainer(variational_params, optimizer=optimizer)
    ''' Initialize train score results '''
    train_scores = np.zeros((train_experiments, 3))
    ''' Initialize train experiment durations '''
    train_durations = np.zeros((train_experiments, 1))
    ''' Initialize test score results '''
    test_scores = np.zeros((train_experiments, 3))
    ''' Train experiments means and stds '''
    means = np.array([])
    stds = np.array([])
    ''' Train '''
    for train_experiment in range(train_experiments):
        ''' Create training dataset '''
        x = train_dataset['x'][:, :, train_experiment]
        t = np.reshape(train_dataset['t'][:, train_experiment], (-1, 1))
        yf = train_dataset['yf'][:, train_experiment]
        ycf = train_dataset['ycf'][:, train_experiment]
        mu0 = train_dataset['mu0'][:, train_experiment]
        mu1 = train_dataset['mu1'][:, train_experiment]

        train, valid, test, _ = split_data_in_train_valid_test(
            x, t, yf, ycf, mu0, mu1)
        ''' With-in sample '''
        train_evaluator = Evaluator(
            np.concatenate([train['t'], valid['t']]),
            np.concatenate([train['yf'], valid['yf']]),
            y_cf=np.concatenate([train['ycf'], valid['ycf']], axis=0),
            mu0=np.concatenate([train['mu0'], valid['mu0']], axis=0),
            mu1=np.concatenate([train['mu1'], valid['mu1']], axis=0))
        test_evaluator = Evaluator(test['t'], test['yf'], test['ycf'],
                                   test['mu0'], test['mu1'])
        ''' Normalize yf '''  # TODO check for normalize input?
        yf_m, yf_std = np.mean(train['yf'], axis=0), np.std(train['yf'],
                                                            axis=0)
        train['yf'] = (train['yf'] - yf_m) / yf_std
        valid['yf'] = (valid['yf'] - yf_m) / yf_std
        test['yf'] = (test['yf'] - yf_m) / yf_std
        ''' Save mean and std '''
        means = np.append(means, yf_m)
        stds = np.append(stds, yf_std)
        ''' Train dataset '''
        factual_features = np.hstack((train['x'], train['t']))
        train_factual_dataset = gluon.data.ArrayDataset(
            mx.nd.array(factual_features), mx.nd.array(train['yf']))
        ''' With-in sample '''
        train_rmse_ite_dataset = gluon.data.ArrayDataset(
            mx.nd.array(np.concatenate([train['x'], valid['x']])))
        ''' Valid dataset '''
        valid_factual_features = np.hstack((valid['x'], valid['t']))
        valid_factual_dataset = gluon.data.ArrayDataset(
            mx.nd.array(valid_factual_features), mx.nd.array(valid['yf']))
        ''' Test dataset '''
        test_rmse_ite_dataset = gluon.data.ArrayDataset(mx.nd.array(test['x']))
        ''' Train DataLoader '''
        train_factual_loader = gluon.data.DataLoader(train_factual_dataset,
                                                     batch_size=batch_size,
                                                     shuffle=True,
                                                     num_workers=num_workers)
        train_rmse_ite_loader = gluon.data.DataLoader(train_rmse_ite_dataset,
                                                      batch_size=batch_size,
                                                      shuffle=False,
                                                      num_workers=num_workers)
        ''' Valid DataLoader '''
        valid_factual_loader = gluon.data.DataLoader(valid_factual_dataset,
                                                     batch_size=batch_size,
                                                     shuffle=False,
                                                     num_workers=num_workers)
        ''' Test DataLoader '''
        test_rmse_ite_loader = gluon.data.DataLoader(test_rmse_ite_dataset,
                                                     batch_size=batch_size,
                                                     shuffle=False,
                                                     num_workers=num_workers)

        num_batch = len(train_factual_loader)

        train_start = time.time()

        train_acc = []
        test_acc = []
        ''' Train model '''
        # Start with epoch 1 for easier learning rate calculation.
        for epoch in range(1, epochs + 1):

            train_loss = 0
            rmse_metric.reset()

            for i, (batch_f_features,
                    batch_yf) in enumerate(train_factual_loader):
                ''' Get data and labels into slices and copy each slice into a context.'''
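                # 26 = number of IHDP covariates (25) plus the treatment indicator.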
                batch_f_features = batch_f_features.as_in_context(
                    ctx[0]).reshape((-1, 26))
                batch_yf = batch_yf.as_in_context(ctx[0]).reshape(
                    (len(batch_yf), -1))
                ''' Forward '''
                with autograd.record():
                    ''' Generate sample '''
                    # layer_params, sigmas = generate_weight_sample(shapes, raw_mus, raw_rhos, ctx[0])
                    layer_params = generate_weight_sample_exp(
                        shapes, raw_lambdas, ctx[0])
                    ''' Overwrite network parameters with sampled parameters '''
                    for sample, param in zip(layer_params,
                                             net.collect_params().values()):
                        param._data[0] = sample
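                        # Assigning to the private _data slot (rather than
                        # set_data) keeps the sampled NDArray in the autograd
                        # graph, so gradients flow back to the variational
                        # parameters.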
                    ''' Forward-propagate the batch '''
                    outputs = net(batch_f_features)

                    # if epoch == epochs:
                    #     ''' Factual outcomes and batch_yf histograms '''
                    #     import pandas as pd
                    #     df = pd.DataFrame({'layer_params': layer_params[6][0].asnumpy().flatten()}, columns=['layer_params'])
                    #     df = pd.DataFrame(
                    #         {'outputs': outputs.asnumpy().flatten(), 'batch_yf': batch_yf.asnumpy().flatten()},
                    #         columns=['outputs', 'batch_yf'])
                    #     df.plot(kind='hist', alpha=0.5)
                    #     df.plot.kde()
                    ''' Calculate the loss '''
                    l2_loss_value = l2_loss(outputs, batch_yf)
                    # bbb_loss_value = bbb_loss(outputs, batch_yf, layer_params, raw_mus, sigmas, num_batch)
                    bbb_loss_value = bbb_loss(outputs, batch_yf, layer_params,
                                              raw_lambdas, [], num_batch)
                    loss = bbb_loss_value + l2_loss_value
                    # loss = bbb_loss_value
                    # loss = l2_loss_value
                    ''' Backpropagate for gradient calculation '''
                    loss.backward()
                ''' Optimize '''
                trainer.step(batch_size)

                train_loss += sum([l.mean().asscalar()
                                   for l in loss]) / len(loss)

                rmse_metric.update(batch_yf, outputs)

            if epoch % epoch_output_iter == 0 or epoch == 1:
                _, train_rmse_factual = rmse_metric.get()
                train_loss /= num_batch
                _, valid_rmse_factual = test_net_vb(net, valid_factual_loader,
                                                    layer_params, ctx)

                # _, train_RMSE = evaluate_RMSE(train_factual_loader, net, raw_mus, ctx)
                # _, test_RMSE = evaluate_RMSE(valid_factual_loader, net, raw_mus, ctx)
                # train_acc.append(np.asscalar(train_RMSE))
                # test_acc.append(np.asscalar(test_RMSE))
                # print("Epoch %s. Train-RMSE %s, Test-RMSE %s" %
                #       (epoch, train_RMSE, test_RMSE))

                log(
                    logfile, 'l2-loss: %.3f, bbb-loss: %.3f' %
                    (l2_loss_value[0].asscalar(),
                     bbb_loss_value[0].asscalar()))

                log(
                    logfile,
                    '[Epoch %d/%d] Train-rmse-factual: %.3f, loss: %.3f | Valid-rmse-factual: %.3f | learning-rate: '
                    '%.3E' % (epoch, epochs, train_rmse_factual, train_loss,
                              valid_rmse_factual, trainer.learning_rate))

        train_durations[train_experiment, :] = time.time() - train_start
        ''' Test model '''
        # y_t0, y_t1 = predict_treated_and_controlled_vb(net, train_rmse_ite_loader, raw_mus, ctx)
        y_t0, y_t1 = predict_treated_and_controlled_vb(net,
                                                       train_rmse_ite_loader,
                                                       layer_params, ctx)
        y_t0, y_t1 = y_t0 * yf_std + yf_m, y_t1 * yf_std + yf_m
        train_score = train_evaluator.get_metrics(y_t1, y_t0)
        train_scores[train_experiment, :] = train_score

        # y_t0, y_t1 = predict_treated_and_controlled_vb(net, test_rmse_ite_loader, raw_mus, ctx)
        y_t0, y_t1 = predict_treated_and_controlled_vb(net,
                                                       test_rmse_ite_loader,
                                                       layer_params, ctx)
        y_t0, y_t1 = y_t0 * yf_std + yf_m, y_t1 * yf_std + yf_m
        test_score = test_evaluator.get_metrics(y_t1, y_t0)
        test_scores[train_experiment, :] = test_score

        log(logfile, '[Train Replication {}/{}]: train RMSE ITE: {:0.3f}, train ATE: {:0.3f}, train PEHE: {:0.3f},' \
                     ' test RMSE ITE: {:0.3f}, test ATE: {:0.3f}, test PEHE: {:0.3f}'.format(train_experiment + 1,
                                                                                             train_experiments,
                                                                                             train_score[0],
                                                                                             train_score[1],
                                                                                             train_score[2],
                                                                                             test_score[0],
                                                                                             test_score[1],
                                                                                             test_score[2]))
        # plt.plot(train_acc)
        # plt.plot(test_acc)
    ''' Save means and stds NDArray values for inference '''
    mx.nd.save(
        outdir + args.architecture.lower() + '_means_stds_ihdp_' +
        str(train_experiments) + '_.nd', {
            "means": mx.nd.array(means),
            "stds": mx.nd.array(stds)
        })
    ''' Export trained model '''
    net.export(outdir + args.architecture.lower() + "-ihdp-predictions-" +
               str(train_experiments),
               epoch=epochs)

    log(logfile,
        '\n{} architecture total scores:'.format(args.architecture.upper()))
    ''' Train and test scores '''
    means, stds = np.mean(train_scores, axis=0), sem(train_scores,
                                                     axis=0,
                                                     ddof=0)
    r_pehe_mean, r_pehe_std = np.mean(np.sqrt(train_scores[:, 2]),
                                      axis=0), sem(np.sqrt(train_scores[:, 2]),
                                                   axis=0,
                                                   ddof=0)
    train_total_scores_str = 'train RMSE ITE: {:.2f} ± {:.2f}, train ATE: {:.2f} ± {:.2f}, train PEHE: {:.2f} ± {:.2f}, ' \
                             'train root PEHE: {:.2f} ± {:.2f}' \
                             ''.format(means[0], stds[0], means[1], stds[1], means[2], stds[2], r_pehe_mean, r_pehe_std)

    means, stds = np.mean(test_scores, axis=0), sem(test_scores,
                                                    axis=0,
                                                    ddof=0)
    r_pehe_mean, r_pehe_std = np.mean(np.sqrt(test_scores[:, 2]),
                                      axis=0), sem(np.sqrt(test_scores[:, 2]),
                                                   axis=0,
                                                   ddof=0)
    test_total_scores_str = 'test RMSE ITE: {:.2f} ± {:.2f}, test ATE: {:.2f} ± {:.2f}, test PEHE: {:.2f} ± {:.2f}, ' \
                            'test root PEHE: {:.2f} ± {:.2f}' \
                            ''.format(means[0], stds[0], means[1], stds[1], means[2], stds[2], r_pehe_mean, r_pehe_std)

    log(logfile, train_total_scores_str)
    log(logfile, test_total_scores_str)

    mean_duration = float("{0:.2f}".format(
        np.mean(train_durations, axis=0)[0]))

    with open(outdir + args.architecture.lower() + "-total-scores-" +
              str(train_experiments),
              "w",
              encoding="utf8") as text_file:
        print(train_total_scores_str,
              "\n",
              test_total_scores_str,
              file=text_file)

    return {
        "ite": "{:.2f} ± {:.2f}".format(means[0], stds[0]),
        "ate": "{:.2f} ± {:.2f}".format(means[1], stds[1]),
        "pehe": "{:.2f} ± {:.2f}".format(means[2], stds[2]),
        "mean_duration": mean_duration
    }
Exemplo n.º 44
0
    def train(self,
              train_data,
              epochs=1,
              batch_size=32,
              validation_data=None,
              train_resize_batch_num=None):
        """Train the model and update the model parameters."""
        stats = dict()
        if self.is_worker:
            config = self.config.copy()
            if "batch_size" not in config:
                config["batch_size"] = batch_size

            if train_resize_batch_num is not None:
                config["train_resize_batch_num"] = train_resize_batch_num
            train_data_iter = train_data(config, self.kv)
            val_data_iter = validation_data(
                config, self.kv) if validation_data else None

            start_time = time.time()
            if self.trainer:  # Imperative API

                def cpu_context(target_data):
                    if isinstance(target_data, list):
                        return [cpu_context(d) for d in target_data]
                    else:
                        return target_data.as_in_context(mx.cpu())

                for epoch in range(epochs):
                    # DataLoader doesn't need to be reset.
                    if isinstance(train_data_iter, mx.io.DataIter):
                        train_data_iter.reset()
                    if self.eval_metrics:
                        # Reset so that metrics accumulate over this epoch only.
                        self.eval_metrics.reset()
                    batch_start_time = time.time()
                    epoch_start_time = time.time()
                    for i, batch in enumerate(train_data_iter):
                        data = cpu_context(batch.data)
                        label = cpu_context(batch.label)
                        if not isinstance(data, list):
                            data = [data]
                        if not isinstance(label, list):
                            label = [label]
                        from mxnet import autograd as ag
                        with ag.record():
                            output = self.model(*data)  # forward
                            if not isinstance(output, list):
                                output = [output]
                            Ls = self.loss(*output, *label)
                            ag.backward(Ls)
                        self.trainer.step(batch_size)
                        if self.eval_metrics:
                            self.eval_metrics.update(label, output)
                        if not (i + 1) % self.config["log_interval"]:
                            # This would be logged on driver for each worker process.
                            iteration_log = \
                                "Epoch[%d] Batch[%d]  Speed: %f samples/sec  %s=%f" \
                                % (epoch, i,
                                   batch_size / (time.time() - batch_start_time),
                                   "loss", Ls.asnumpy().mean())
                            if self.eval_metrics:
                                names, accs = self.eval_metrics.get()
                                names, accs = to_list(names), to_list(accs)
                                for name, acc in zip(names, accs):
                                    iteration_log += "  %s=%f" % (name, acc)
                            self.logger.info(iteration_log)
                        batch_start_time = time.time()
                    # Epoch time log.
                    self.logger.info("[Epoch %d] time cost: %f" %
                                     (epoch, time.time() - epoch_start_time))
                    # Epoch metrics log on train data.
                    if self.eval_metrics:
                        epoch_train_log = "[Epoch %d] training: " % epoch
                        names, accs = self.eval_metrics.get()
                        names, accs = to_list(names), to_list(accs)
                        for name, acc in zip(names, accs):
                            epoch_train_log += "%s=%f  " % (name, acc)
                        self.logger.info(epoch_train_log)
                    # Epoch metrics log on validation data if any.
                    if val_data_iter:
                        if isinstance(val_data_iter, mx.io.DataIter):
                            val_data_iter.reset()
                        self.val_metrics.reset()
                        for batch in val_data_iter:
                            data = cpu_context(batch.data)
                            label = cpu_context(batch.label)
                            if not isinstance(data, list):
                                data = [data]
                            if not isinstance(label, list):
                                label = [label]
                            output = self.model(*data)
                            if not isinstance(output, list):
                                output = [output]
                            self.val_metrics.update(label, output)
                        epoch_val_log = "[Epoch %d] validation: " % epoch
                        names, accs = self.val_metrics.get()
                        names, accs = to_list(names), to_list(accs)
                        for name, acc in zip(names, accs):
                            epoch_val_log += "%s=%f  " % (name, acc)
                        self.logger.info(epoch_val_log)
                    # TODO: save checkpoints
                if self.eval_metrics:
                    names, accs = self.eval_metrics.get()
                    names, accs = to_list(names), to_list(accs)
                    for name, acc in zip(names, accs):
                        stats[name] = acc
            else:  # Symbolic API
                # TODO: seems no history (i.e. validation accuracy) returned by fit?
                if "init" not in self.config:
                    from mxnet.initializer import Uniform
                    self.config["init"] = Uniform(
                        0.01)  # This is the default value for MXNet.
                if self.eval_metrics is None:
                    self.eval_metrics = 'acc'  # This is the default value for MXNet.
                self.model.fit(
                    train_data=train_data_iter,
                    num_epoch=epochs,
                    initializer=self.config["init"],
                    kvstore=self.kv,
                    optimizer=self.config["optimizer"],
                    optimizer_params=self.config["optimizer_params"],
                    eval_data=val_data_iter,
                    eval_metric=self.eval_metrics,
                    validation_metric=self.val_metrics,
                    batch_end_callback=mx.callback.Speedometer(
                        batch_size, self.config["log_interval"]),
                    epoch_end_callback=None if "model" not in self.config else
                    mx.callback.do_checkpoint(self.config["model"]))
            epoch_time = time.time() - start_time
            stats["epoch_time"] = epoch_time
        return [stats]
Exemplo n.º 45
0
def create(input_dataset,
           target,
           feature=None,
           validation_set='auto',
           warm_start='auto',
           batch_size=256,
           max_iterations=100,
           verbose=True):
    """
    Create a :class:`DrawingClassifier` model.

    Parameters
    ----------
    input_dataset : SFrame
        Input data. The columns named by the ``feature`` and ``target``
        parameters will be extracted for training the drawing classifier.

    target : string
        Name of the column containing the target variable. The values in this
        column must be of string or integer type.

    feature : string optional
        Name of the column containing the input drawings. 'None' (the default)
        indicates the column in `input_dataset` named "drawing" should be used
        as the feature.
        The feature column can contain both bitmap-based drawings as well as
        stroke-based drawings. Bitmap-based drawing input can be a grayscale
        tc.Image of any size.
        Stroke-based drawing input must be in the following format:
        Every drawing must be represented by a list of strokes, where each
        stroke must be a list of points in the order in which they were drawn
        on the canvas.
        Each point must be a dictionary with two keys, "x" and "y", and their
        respective values must be numerical, i.e. either integer or float.

    validation_set : SFrame optional
        A dataset for monitoring the model's generalization performance.
        The format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. The default value is 'auto'.

    warm_start : string optional
        A string to denote which pretrained model to use. Set to "auto"
        by default which uses a model trained on 245 of the 345 classes in the
        Quick, Draw! dataset. Here is a list of all the pretrained models that
        can be passed in as this argument:
        "auto": Uses quickdraw_245_v0
        "quickdraw_245_v0": Uses a model trained on 245 of the 345 classes in the
                         Quick, Draw! dataset.

    batch_size : int optional
        The number of drawings per training step. If not set, a default
        value of 256 will be used. If you are getting memory errors,
        try decreasing this value. If you have a powerful computer, increasing
        this value may improve performance.

    max_iterations : int optional
        The maximum number of allowed passes through the data. More passes over
        the data can result in a more accurately trained model. 

    verbose : bool optional
        If True, print progress updates and model details.

    Returns
    -------
    out : DrawingClassifier
        A trained :class:`DrawingClassifier` model.

    See Also
    --------
    DrawingClassifier

    Examples
    --------
    .. sourcecode:: python

        # Train a drawing classifier model
        >>> model = turicreate.drawing_classifier.create(data)

        # Make predictions on the training set and add them as a column to the SFrame
        >>> data['predictions'] = model.predict(data)

    """

    import mxnet as _mx
    from mxnet import autograd as _autograd
    from ._model_architecture import Model as _Model
    from ._sframe_loader import SFrameClassifierIter as _SFrameClassifierIter

    start_time = _time.time()

    # @TODO: Should be able to automatically choose number of iterations
    # based on data size: Tracked in Github Issue #1576

    # automatically infer feature column
    if feature is None:
        feature = _tkutl._find_only_drawing_column(input_dataset)

    _raise_error_if_not_drawing_classifier_input_sframe(
        input_dataset, feature, target)

    is_stroke_input = (input_dataset[feature].dtype != _tc.Image)
    dataset = _extensions._drawing_classifier_prepare_data(
        input_dataset, feature) if is_stroke_input else input_dataset

    iteration = 0

    classes = dataset[target].unique()
    classes = sorted(classes)
    class_to_index = {name: index for index, name in enumerate(classes)}

    validation_set_corrective_string = (
        "'validation_set' parameter must be " +
        "an SFrame, or None, or must be set to 'auto' for the toolkit to " +
        "automatically create a validation set.")
    if isinstance(validation_set, _tc.SFrame):
        _raise_error_if_not_drawing_classifier_input_sframe(
            validation_set, feature, target)
        is_validation_stroke_input = (validation_set[feature].dtype !=
                                      _tc.Image)
        validation_dataset = _extensions._drawing_classifier_prepare_data(
            validation_set,
            feature) if is_validation_stroke_input else validation_set
    elif isinstance(validation_set, str):
        if validation_set == 'auto':
            if dataset.num_rows() >= 100:
                if verbose:
                    print(
                        "PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.\n"
                        "          You can set ``validation_set=None`` to disable validation tracking.\n"
                    )
                dataset, validation_dataset = dataset.random_split(
                    TRAIN_VALIDATION_SPLIT)
            else:
                validation_set = None
                validation_dataset = _tc.SFrame()
        else:
            raise _ToolkitError("Unrecognized value for 'validation_set'. " +
                                validation_set_corrective_string)
    elif validation_set is None:
        validation_dataset = _tc.SFrame()
    else:
        raise TypeError("Unrecognized type for 'validation_set'." +
                        validation_set_corrective_string)

    train_loader = _SFrameClassifierIter(dataset,
                                         batch_size,
                                         feature_column=feature,
                                         target_column=target,
                                         class_to_index=class_to_index,
                                         load_labels=True,
                                         shuffle=True,
                                         iterations=max_iterations)
    train_loader_to_compute_accuracy = _SFrameClassifierIter(
        dataset,
        batch_size,
        feature_column=feature,
        target_column=target,
        class_to_index=class_to_index,
        load_labels=True,
        shuffle=True,
        iterations=1)
    validation_loader = _SFrameClassifierIter(validation_dataset,
                                              batch_size,
                                              feature_column=feature,
                                              target_column=target,
                                              class_to_index=class_to_index,
                                              load_labels=True,
                                              shuffle=True,
                                              iterations=1)
    if verbose and iteration == 0:
        column_names = ['iteration', 'train_loss', 'train_accuracy', 'time']
        column_titles = [
            'Iteration', 'Training Loss', 'Training Accuracy',
            'Elapsed Time (seconds)'
        ]
        if validation_set is not None:
            column_names.insert(3, 'validation_accuracy')
            column_titles.insert(3, 'Validation Accuracy')
        table_printer = _tc.util._ProgressTablePrinter(column_names,
                                                       column_titles)

    ctx = _mxnet_utils.get_mxnet_context(max_devices=batch_size)
    model = _Model(num_classes=len(classes), prefix="drawing_")
    model_params = model.collect_params()
    model_params.initialize(_mx.init.Xavier(), ctx=ctx)

    if warm_start is not None:
        pretrained_model = _pre_trained_models.DrawingClassifierPreTrainedModel(
            warm_start)
        pretrained_model_params_path = pretrained_model.get_model_path()
        model.load_params(pretrained_model_params_path,
                          ctx=ctx,
                          allow_missing=True)
    softmax_cross_entropy = _mx.gluon.loss.SoftmaxCrossEntropyLoss()
    model.hybridize()
    trainer = _mx.gluon.Trainer(model.collect_params(), 'adam')

    train_accuracy = _mx.metric.Accuracy()
    validation_accuracy = _mx.metric.Accuracy()

    def get_data_and_label_from_batch(batch):
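        # The last batch of an epoch may be padded to the full batch size;
        # slice off the padding and fill the remaining per-context slots with
        # None so they are skipped during training and accuracy computation.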
        if batch.pad is not None:
            size = batch_size - batch.pad
            batch_data = (
                [_mx.nd.slice_axis(batch.data[0], axis=0, begin=0, end=size)] +
                [None] * (len(ctx) - 1))
            batch_label = (
                [_mx.nd.slice_axis(batch.label[0], axis=0, begin=0, end=size)
                 ] + [None] * (len(ctx) - 1))
        else:
            batch_data = _mx.gluon.utils.split_and_load(batch.data[0],
                                                        ctx_list=ctx,
                                                        batch_axis=0)
            batch_label = _mx.gluon.utils.split_and_load(batch.label[0],
                                                         ctx_list=ctx,
                                                         batch_axis=0)
        return batch_data, batch_label

    def compute_accuracy(accuracy_metric, batch_loader):
        batch_loader.reset()
        accuracy_metric.reset()
        for batch in batch_loader:
            batch_data, batch_label = get_data_and_label_from_batch(batch)
            outputs = []
            for x, y in zip(batch_data, batch_label):
                if x is None or y is None: continue
                z = model(x)
                outputs.append(z)
            accuracy_metric.update(batch_label, outputs)

    for train_batch in train_loader:
        train_batch_data, train_batch_label = get_data_and_label_from_batch(
            train_batch)
        with _autograd.record():
            # Inside training scope
            for x, y in zip(train_batch_data, train_batch_label):
                z = model(x)
                # Computes softmax cross entropy loss.
                loss = softmax_cross_entropy(z, y)
                # Backpropagate the error for one iteration.
                loss.backward()

        # Make one step of parameter update. Trainer needs to know the
        # batch size of data to normalize the gradient by 1/batch_size.
        trainer.step(train_batch.data[0].shape[0])
        # calculate training metrics
        train_loss = loss.mean().asscalar()
        train_time = _time.time() - start_time

        if train_batch.iteration > iteration:
            # Compute training accuracy
            compute_accuracy(train_accuracy, train_loader_to_compute_accuracy)
            # Compute validation accuracy
            if validation_set is not None:
                compute_accuracy(validation_accuracy, validation_loader)
            iteration = train_batch.iteration
            if verbose:
                kwargs = {
                    "iteration": iteration,
                    "train_loss": float(train_loss),
                    "train_accuracy": train_accuracy.get()[1],
                    "time": train_time
                }
                if validation_set is not None:
                    kwargs["validation_accuracy"] = validation_accuracy.get(
                    )[1]
                table_printer.print_row(**kwargs)

    state = {
        '_model': model,
        '_class_to_index': class_to_index,
        'num_classes': len(classes),
        'classes': classes,
        'input_image_shape': (1, BITMAP_WIDTH, BITMAP_HEIGHT),
        'batch_size': batch_size,
        'training_loss': train_loss,
        'training_accuracy': train_accuracy.get()[1],
        'training_time': train_time,
        'validation_accuracy': validation_accuracy.get()[1],
        # nan if validation_set=None
        'max_iterations': max_iterations,
        'target': target,
        'feature': feature,
        'num_examples': len(input_dataset)
    }
    return DrawingClassifier(state)
Exemplo n.º 46
0
def check_layer_bidirectional_varseqlen(size, in_size):
    weights = {}
    for d in ['l', 'r']:
        weights['{}0_i2h_weight'.format(d)] = mx.random.uniform(
            shape=(size * 4, in_size))
        weights['{}0_h2h_weight'.format(d)] = mx.random.uniform(shape=(size *
                                                                       4,
                                                                       size))
        weights['{}0_i2h_bias'.format(d)] = mx.random.uniform(shape=(size *
                                                                     4, ))
        weights['{}0_h2h_bias'.format(d)] = mx.random.uniform(shape=(size *
                                                                     4, ))

    net = gluon.rnn.LSTM(size, bidirectional=True, use_sequence_length=True)
    ref_net = gluon.rnn.LSTM(size,
                             bidirectional=True,
                             use_sequence_length=False)
    net.initialize()
    ref_net.initialize()
    net_params = net.collect_params()
    ref_net_params = ref_net.collect_params()
    for k in weights:
        net_params[k].set_data(weights[k])
        ref_net_params[k].set_data(weights[k])

    batch_size = 10
    num_timesteps = 11
    data = mx.random.uniform(shape=(num_timesteps, batch_size, in_size))
    data_np = data.asnumpy()

    sequence_length = nd.random.randint(1,
                                        num_timesteps + 1,
                                        shape=(batch_size, )).astype("int32")
    sequence_length_np = sequence_length.asnumpy().astype("int32")

    # Reference net is processing batch elements one at a time, so that it is "perfectly sized"
    # Because of that, we need to accumulate gradients in reference net.
    for p in ref_net.collect_params().values():
        p.grad_req = 'add'

    ref_net_output = []
    with autograd.record():
        net_output = net(data.copy(), sequence_length=sequence_length.copy())

        for b in range(batch_size):
            data_slice = mx.nd.array(data_np[:sequence_length_np[b],
                                             b, :]).reshape(
                                                 sequence_length_np[b], 1,
                                                 in_size)
            ref_output_slice = ref_net(data_slice)
            ref_net_output.append(ref_output_slice)

    net_output_np = net_output.asnumpy()

    # TODO: test the state return value as well as the output
    # Only compare the valid sections for each batch entry
    for b in range(batch_size):
        assert_allclose(net_output_np[:sequence_length_np[b], b],
                        ref_net_output[b].asnumpy().squeeze(1),
                        rtol=1e-2,
                        atol=1e-6)

    # Now test backward
    net_output.backward()

    for ref_output_slice in ref_net_output:
        ref_output_slice.backward()

    ref_net_params = ref_net.collect_params()

    for k in weights:
        net_grad = net_params[k].grad()
        ref_net_grad = ref_net_params[k].grad()
        assert_almost_equal(net_grad.asnumpy(),
                            ref_net_grad.asnumpy(),
                            rtol=1e-2,
                            atol=1e-6)
Exemplo n.º 47
0
def train(net, train_data, val_data, eval_metric, ctx, args):
    """Training pipeline"""
    net.collect_params().setattr('grad_req', 'null')
    net.collect_train_params().setattr('grad_req', 'write')
    trainer = gluon.Trainer(
        net.collect_train_params(),  # fix batchnorm, fix first stage, etc...
        'sgd',
        {
            'learning_rate': args.lr,
            'wd': args.wd,
            'momentum': args.momentum,
            'clip_gradient': 5
        })

    # lr decay policy
    lr_decay = float(args.lr_decay)
    lr_steps = sorted(
        [float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])
    lr_warmup = float(args.lr_warmup)  # avoid int division

    # TODO(zhreshold) losses?
    rpn_cls_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(
        from_sigmoid=False)
    rpn_box_loss = mx.gluon.loss.HuberLoss(rho=1 / 9.)  # == smoothl1
    rcnn_cls_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss()
    rcnn_box_loss = mx.gluon.loss.HuberLoss()  # == smoothl1
    metrics = [
        mx.metric.Loss('RPN_Conf'),
        mx.metric.Loss('RPN_SmoothL1'),
        mx.metric.Loss('RCNN_CrossEntropy'),
        mx.metric.Loss('RCNN_SmoothL1'),
    ]

    rpn_acc_metric = RPNAccMetric()
    rpn_bbox_metric = RPNL1LossMetric()
    rcnn_acc_metric = RCNNAccMetric()
    rcnn_bbox_metric = RCNNL1LossMetric()
    metrics2 = [
        rpn_acc_metric, rpn_bbox_metric, rcnn_acc_metric, rcnn_bbox_metric
    ]

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    if args.verbose:
        logger.info('Trainable parameters:')
        logger.info(net.collect_train_params().keys())
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
        mix_ratio = 1.0
        if args.mixup:
            # TODO(zhreshold) only support evenly mixup now, target generator needs to be modified otherwise
            train_data._dataset.set_mixup(np.random.uniform, 0.5, 0.5)
            mix_ratio = 0.5
            if epoch >= args.epochs - args.no_mixup_epochs:
                train_data._dataset.set_mixup(None)
                mix_ratio = 1.0
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(
                epoch, new_lr))
        for metric in metrics:
            metric.reset()
        tic = time.time()
        btic = time.time()
        net.hybridize(static_alloc=True)
        base_lr = trainer.learning_rate
        for i, batch in enumerate(train_data):
            if epoch == 0 and i <= lr_warmup:
                # adjust based on real percentage
                new_lr = base_lr * get_lr_at_iter(i / lr_warmup)
                if new_lr != trainer.learning_rate:
                    if i % args.log_interval == 0:
                        logger.info(
                            '[Epoch 0 Iteration {}] Set learning rate to {}'.
                            format(i, new_lr))
                    trainer.set_learning_rate(new_lr)
            batch = split_and_load(batch, ctx_list=ctx)
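            # After split_and_load, each element of `batch` (data, label, RPN
            # cls targets, RPN box targets, RPN box masks) is a list with one
            # slice per device, hence the zip(*batch) below.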
            batch_size = len(batch[0])
            losses = []
            metric_losses = [[] for _ in metrics]
            add_losses = [[] for _ in metrics2]
            with autograd.record():
                for data, label, rpn_cls_targets, rpn_box_targets, rpn_box_masks in zip(
                        *batch):
                    gt_label = label[:, :, 4:5]
                    gt_box = label[:, :, :4]
                    cls_pred, box_pred, roi, samples, matches, rpn_score, rpn_box, anchors = net(
                        data, gt_box)
                    # losses of rpn
                    rpn_score = rpn_score.squeeze(axis=-1)
                    num_rpn_pos = (rpn_cls_targets >= 0).sum()
                    rpn_loss1 = rpn_cls_loss(
                        rpn_score, rpn_cls_targets, rpn_cls_targets >=
                        0) * rpn_cls_targets.size / num_rpn_pos
                    rpn_loss2 = rpn_box_loss(
                        rpn_box, rpn_box_targets,
                        rpn_box_masks) * rpn_box.size / num_rpn_pos
                    # rpn overall loss, use sum rather than average
                    rpn_loss = rpn_loss1 + rpn_loss2
                    # generate targets for rcnn
                    cls_targets, box_targets, box_masks = net.target_generator(
                        roi, samples, matches, gt_label, gt_box)
                    # losses of rcnn
                    num_rcnn_pos = (cls_targets >= 0).sum()
                    rcnn_loss1 = rcnn_cls_loss(
                        cls_pred, cls_targets, cls_targets >= 0
                    ) * cls_targets.size / cls_targets.shape[0] / num_rcnn_pos
                    rcnn_loss2 = rcnn_box_loss(
                        box_pred, box_targets, box_masks
                    ) * box_pred.size / box_pred.shape[0] / num_rcnn_pos
                    rcnn_loss = rcnn_loss1 + rcnn_loss2
                    # overall losses
                    losses.append(rpn_loss.sum() * mix_ratio +
                                  rcnn_loss.sum() * mix_ratio)
                    metric_losses[0].append(rpn_loss1.sum() * mix_ratio)
                    metric_losses[1].append(rpn_loss2.sum() * mix_ratio)
                    metric_losses[2].append(rcnn_loss1.sum() * mix_ratio)
                    metric_losses[3].append(rcnn_loss2.sum() * mix_ratio)
                    add_losses[0].append(
                        [[rpn_cls_targets, rpn_cls_targets >= 0], [rpn_score]])
                    add_losses[1].append([[rpn_box_targets, rpn_box_masks],
                                          [rpn_box]])
                    add_losses[2].append([[cls_targets], [cls_pred]])
                    add_losses[3].append([[box_targets, box_masks],
                                          [box_pred]])
                autograd.backward(losses)
                for metric, record in zip(metrics, metric_losses):
                    metric.update(0, record)
                for metric, records in zip(metrics2, add_losses):
                    for pred in records:
                        metric.update(pred[0], pred[1])
            trainer.step(batch_size)
            # update metrics
            if args.log_interval and not (i + 1) % args.log_interval:
                # msg = ','.join(['{}={:.3f}'.format(*metric.get()) for metric in metrics])
                msg = ','.join([
                    '{}={:.3f}'.format(*metric.get())
                    for metric in metrics + metrics2
                ])
                logger.info(
                    '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.
                    format(
                        epoch, i,
                        args.log_interval * batch_size / (time.time() - btic),
                        msg))
                btic = time.time()

        msg = ','.join(
            ['{}={:.3f}'.format(*metric.get()) for metric in metrics])
        logger.info('[Epoch {}] Training cost: {:.3f}, {}'.format(
            epoch, (time.time() - tic), msg))
        if not (epoch + 1) % args.val_interval:
            # consider reduce the frequency of validation to save time
            map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
            val_msg = '\n'.join(
                ['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
            logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
            current_map = float(mean_ap[-1])
        else:
            current_map = 0.
        save_params(net, logger, best_map, current_map, epoch,
                    args.save_interval, args.save_prefix)
Exemplo n.º 48
0
def nnTrain(model_mark,
            nnModel,
            train_data,
            valid_data_X,
            valid_data_Y,
            test_data_X,
            test_data_Y,
            batch_size,
            loss_func,
            epochs,
            optimizer,
            optimizer_params,
            lr_decay_rate=1):
    """
    Train the model with one of three optimizers: sgd (momentum), adadelta or adam.
    """
    assert optimizer in set(['sgd', 'adadelta', 'adam'])
    random.seed(1)
    train_iter = gluon.data.DataLoader(train_data, batch_size, shuffle=True)
    nTrain = len(train_data)
    nValid = len(valid_data_Y)
    # The model
    mx.random.seed(123456)
    nnModel.collect_params().initialize(mx.initializer.MSRAPrelu(),
                                        ctx=context)
    trainer = gluon.Trainer(nnModel.collect_params(),
                            optimizer=optimizer,
                            optimizer_params=optimizer_params)

    best_smape = 1
    for e in range(epochs):
        #       if(e>=2): trainer.set_learning_rate(trainer.learning_rate * lr_decay_rate)
        train_loss = 0
        for data, label in train_iter:
            data = data.as_in_context(context)
            label = label.as_in_context(context)
            with autograd.record():
                output = nnModel(data)
                loss = loss_func(output, label)
            loss.backward()
            trainer.step(batch_size)
            train_loss += nd.sum(loss).asscalar()
        # The valid loss
        valid_pred = DLPred(nnModel, valid_data_X)[:, 0].asnumpy()
        valid_true = valid_data_Y.asnumpy()
        # The test loss
        test_pred = DLPred(nnModel, test_data_X)[:, 0].asnumpy()
        test_true = test_data_Y.asnumpy()

        valid_loss = nd.sum(
            abs_loss(nd.array(valid_true), nd.array(valid_pred))).asscalar()
        test_loss = nd.sum(abs_loss(nd.array(test_true),
                                    nd.array(test_pred))).asscalar()

        valid_smape = smape(valid_true, valid_pred)
        test_smape = smape(test_true, test_pred)

        print("Epoch %d, train loss: %f, valid_loss: %f" %
              (e, train_loss / nTrain, valid_loss / nValid))
        print("Valid smape  %f; Test smape %f" % (valid_smape, test_smape))
        # Save the model
        if (e == 0 or valid_smape < best_smape):
            best_smape = valid_smape
        if e > 0 and valid_smape < best_smape + 0.3:
            save_checkpoint(nnModel,
                            model_mark + str(e),
                            round(valid_smape, 2),
                            save_path="checkpoints")
Exemplo n.º 49
0
def train(channel_input_dirs, hyperparameters, **kwargs):
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment, but here we just use simple cpu context.
    """
    Args:
        channel_input_dirs:
        hyperparameters:
        **kwargs:
    """
    ctx = mx.cpu()

    # retrieve the hyperparameters we set in notebook (with some defaults)
    batch_size = hyperparameters.get("batch_size", 100)
    epochs = hyperparameters.get("epochs", 10)
    learning_rate = hyperparameters.get("learning_rate", 0.1)
    momentum = hyperparameters.get("momentum", 0.9)
    log_interval = hyperparameters.get("log_interval", 100)

    training_data = channel_input_dirs["training"]

    # load training and validation data
    # we use the gluon.data.vision.MNIST class because of its built in mnist pre-processing logic,
    # but point it at the location where SageMaker placed the data files, so it doesn't download them again.
    train_data = get_train_data(training_data, batch_size)
    val_data = get_val_data(training_data, batch_size)

    # define the network
    net = define_network()

    # Collect all parameters from net and its children, then initialize them.
    net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)
    # Trainer is for updating parameters with gradient.
    trainer = gluon.Trainer(
        net.collect_params(), "sgd", {"learning_rate": learning_rate, "momentum": momentum}
    )
    metric = mx.metric.Accuracy()
    loss = gluon.loss.SoftmaxCrossEntropyLoss()

    for epoch in range(epochs):
        # reset data iterator and metric at the beginning of each epoch.
        metric.reset()
        btic = time.time()
        for i, (data, label) in enumerate(train_data):
            # Copy data to ctx if necessary
            data = data.as_in_context(ctx)
            label = label.as_in_context(ctx)
            # Start recording computation graph with record() section.
            # Recorded graphs can then be differentiated with backward.
            with autograd.record():
                output = net(data)
                L = loss(output, label)
                L.backward()
            # take a gradient step with batch_size equal to data.shape[0]
            trainer.step(data.shape[0])
            # update metric at last.
            metric.update([label], [output])

            if i % log_interval == 0 and i > 0:
                name, acc = metric.get()
                logger.info(
                    "[Epoch %d Batch %d] Training: %s=%f, %f samples/s"
                    % (epoch, i, name, acc, batch_size / (time.time() - btic))
                )

            btic = time.time()

        name, acc = metric.get()
        logger.info("[Epoch %d] Training: %s=%f" % (epoch, name, acc))

        name, val_acc = test(ctx, net, val_data)
        logger.info("[Epoch %d] Validation: %s=%f" % (epoch, name, val_acc))

    return net
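define_network(), get_train_data() and get_val_data() are defined elsewhere in this SageMaker example; a minimal sketch of the network, assuming the plain MLP used in the Gluon MNIST tutorials:

from mxnet import gluon

def define_network():
    # two hidden layers and a 10-way output for MNIST (assumed architecture)
    net = gluon.nn.Sequential()
    with net.name_scope():
        net.add(gluon.nn.Dense(128, activation='relu'))
        net.add(gluon.nn.Dense(64, activation='relu'))
        net.add(gluon.nn.Dense(10))
    return net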
Exemplo n.º 50
0
import mxnet as mx
    def train(self,
              batch_size=64,
              num_epoch=10,
              eval_metric='acc',
              optimizer='adam',
              optimizer_params=(('learning_rate', 0.001), ),
              load_checkpoint=True,
              context='gpu',
              checkpoint_period=5,
              normalize=True,
              noise_distribution='gaussian',
              noise_distribution_params=(
                  ('mean_value', 0),
                  ('spread_value', 1),
              ),
              discriminator_optimizer='adam',
              discriminator_optimizer_params=(('learning_rate', 0.001), ),
              constraint_distributions={},
              constraint_losses={},
              preprocessing=False,
              k_value=1,
              generator_loss=None,
              generator_target_name="",
              noise_input="",
              gen_loss_weight=1,
              dis_loss_weight=1,
              log_period=50,
              print_images=False):

        if context == 'gpu':
            mx_context = mx.gpu()
        elif context == 'cpu':
            mx_context = mx.cpu()
        else:
            logging.error("Context argument is '" + context +
                          "'. Only 'cpu' and 'gpu are valid arguments'.")

        gen_input_names = list(self._net_creator_gen.getInputs().keys())
        gen_input_names = [name[:-1] for name in gen_input_names]
        dis_input_names = list(self._net_creator_dis.getInputs().keys())
        dis_input_names = [name[:-1] for name in dis_input_names]
        if self.use_qnet:
            qnet_input_names = list(self._net_creator_qnet.getOutputs().keys())
            qnet_input_names = [name[:-1] for name in qnet_input_names]
        dis_real_input = list(
            self._net_creator_gen.getOutputs().keys())[0][:-1]

        gen_output_name = list(
            self._net_creator_gen.getOutputs().keys())[0][:-1]
        if self.use_qnet:
            cGAN_input_names = set(gen_input_names).difference(
                qnet_input_names)
            cGAN_input_names.discard(noise_input)
            cGAN_input_names = list(cGAN_input_names)
        else:
            cGAN_input_names = set(gen_input_names)
            cGAN_input_names.discard(noise_input)
            cGAN_input_names = list(cGAN_input_names)

        if preprocessing:
            preproc_lib = "CNNPreprocessor_defaultGAN_defaultGANConnector_predictor_executor"

        self._data_loader._output_names_ = []

        if not generator_target_name == "":
            self._data_loader._input_names_ = cGAN_input_names + [
                gen_output_name
            ] + [generator_target_name]
        else:
            self._data_loader._input_names_ = cGAN_input_names + [
                gen_output_name
            ]

        if preprocessing:
            train_iter, test_iter, data_mean, data_std, _, _ = self._data_loader.load_preprocessed_data(
                batch_size, preproc_lib)
        else:
            train_iter, test_iter, data_mean, data_std, _, _ = self._data_loader.load_data(
                batch_size)

        traindata_to_index = {}
        curIndex = 0
        for data_tuple in train_iter.data:
            traindata_to_index[data_tuple[0] + "_"] = curIndex
            curIndex += 1

        if 'weight_decay' in optimizer_params:
            optimizer_params['wd'] = optimizer_params['weight_decay']
            del optimizer_params['weight_decay']
        if 'learning_rate_decay' in optimizer_params:
            min_learning_rate = 1e-08
            if 'learning_rate_minimum' in optimizer_params:
                min_learning_rate = optimizer_params['learning_rate_minimum']
                del optimizer_params['learning_rate_minimum']
            optimizer_params['lr_scheduler'] = mx.lr_scheduler.FactorScheduler(
                optimizer_params['step_size'],
                factor=optimizer_params['learning_rate_decay'],
                stop_factor_lr=min_learning_rate)
            del optimizer_params['step_size']
            del optimizer_params['learning_rate_decay']

        if 'weight_decay' in discriminator_optimizer_params:
            discriminator_optimizer_params[
                'wd'] = discriminator_optimizer_params['weight_decay']
            del discriminator_optimizer_params['weight_decay']
        if 'learning_rate_decay' in discriminator_optimizer_params:
            min_learning_rate = 1e-08
            if 'learning_rate_minimum' in discriminator_optimizer_params:
                min_learning_rate = discriminator_optimizer_params[
                    'learning_rate_minimum']
                del discriminator_optimizer_params['learning_rate_minimum']
            discriminator_optimizer_params[
                'lr_scheduler'] = mx.lr_scheduler.FactorScheduler(
                    discriminator_optimizer_params['step_size'],
                    factor=discriminator_optimizer_params[
                        'learning_rate_decay'],
                    stop_factor_lr=min_learning_rate)
            del discriminator_optimizer_params['step_size']
            del discriminator_optimizer_params['learning_rate_decay']

        if normalize:
            self._net_creator_dis.construct([mx_context],
                                            batch_size=batch_size,
                                            data_mean=data_mean,
                                            data_std=data_std)
        else:
            self._net_creator_dis.construct([mx_context],
                                            batch_size=batch_size)

        self._net_creator_gen.construct([mx_context])

        if self.use_qnet:
            self._net_creator_qnet.construct([mx_context])
            if load_checkpoint:
                self._net_creator_qnet.load([mx_context])
            else:
                if os.path.isdir(self._net_creator_qnet._model_dir_):
                    shutil.rmtree(self._net_creator_qnet._model_dir_)
            try:
                os.makedirs(self._net_creator_qnet._model_dir_)
            except OSError:
                if not (os.path.isdir(self._net_creator_qnet._model_dir_)):
                    raise
            q_net = self._net_creator_qnet.networks[0]

        begin_epoch = 0
        if load_checkpoint:
            begin_epoch = self._net_creator_dis.load([mx_context])
            self._net_creator_gen.load([mx_context])
        else:
            if os.path.isdir(self._net_creator_dis._model_dir_):
                shutil.rmtree(self._net_creator_dis._model_dir_)
            if os.path.isdir(self._net_creator_gen._model_dir_):
                shutil.rmtree(self._net_creator_gen._model_dir_)

        dis_net = self._net_creator_dis.networks[0]
        gen_net = self._net_creator_gen.networks[0]

        try:
            os.makedirs(self._net_creator_gen._model_dir_)
            os.makedirs(self._net_creator_dis._model_dir_)
        except OSError:
            if not (os.path.isdir(self._net_creator_gen._model_dir_)
                    and os.path.isdir(self._net_creator_dis._model_dir_)):
                raise

        gen_trainer = mx.gluon.Trainer(gen_net.collect_params(), optimizer,
                                       optimizer_params)
        dis_trainer = mx.gluon.Trainer(dis_net.collect_params(),
                                       discriminator_optimizer,
                                       discriminator_optimizer_params)
        if self.use_qnet:
            qnet_trainer = mx.gluon.Trainer(q_net.collect_params(),
                                            discriminator_optimizer,
                                            discriminator_optimizer_params)

        dis_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(
            from_sigmoid=True)
        dis_loss.hybridize()

        if generator_loss is not None:
            if generator_loss == "l2":
                generator_loss_func = mx.gluon.loss.L2Loss()
                generator_loss_func.hybridize()
            elif generator_loss == "l1":
                generator_loss_func = mx.gluon.loss.L1Loss()
                generator_loss_func.hybridize()
            else:
                logging.error("Invalid generator loss parameter")

        metric_dis = mx.metric.create(eval_metric)
        metric_gen = mx.metric.create(eval_metric)
        gen_inputs = self._net_creator_gen.getInputs()
        dis_inputs = self._net_creator_dis.getInputs()

        qnet_outputs = []
        if self.use_qnet:
            qnet_outputs = self._net_creator_qnet.getOutputs()
            qnet_losses = []
        generators = {}
        if self.use_qnet:
            for name in qnet_outputs:
                domain = gen_inputs[name]
                min = domain[1]
                max = domain[2]
                if name[:-1] in constraint_distributions:
                    dist_dict = constraint_distributions[name[:-1]]
                    dist_name = dist_dict['name']
                    if dist_name is "gaussian":
                        generators[
                            name] = lambda domain=domain, min=min, max=max: mx.nd.cast(
                                mx.ndarray.random.normal(
                                    dist_dict["mean_value"],
                                    dist_dict["spread_value"],
                                    shape=(batch_size, ) + domain[3],
                                    dtype=domain[0],
                                    ctx=mx_context),
                                dtype="float32")
                else:
                    if domain[0] == float:
                        generators[
                            name] = lambda domain=domain, min=min, max=max: mx.nd.cast(
                                mx.ndarray.random.uniform(
                                    min,
                                    max,
                                    shape=(batch_size, ) + domain[3],
                                    dtype=domain[0],
                                    ctx=mx_context,
                                ),
                                dtype="float32")
                    elif domain[0] == int:
                        generators[
                            name] = lambda domain=domain, min=min, max=max: mx.ndarray.one_hot(
                                mx.ndarray.random.randint(low=0,
                                                          high=int(max - min) +
                                                          1,
                                                          shape=(batch_size, ),
                                                          dtype=int,
                                                          ctx=mx_context),
                                depth=int(max - min) + 1,
                                on_value=1).reshape((batch_size, ) + domain[3])

                if name[:-1] in constraint_losses:
                    loss_dict = constraint_losses[name[:-1]]
                    loss = loss_dict['name']
                    margin = loss_dict[
                        'margin'] if 'margin' in loss_dict else 1.0
                    sparseLabel = loss_dict[
                        'sparse_label'] if 'sparse_label' in loss_dict else True
                    ignore_indices = [
                        loss_dict['ignore_indices']
                    ] if 'ignore_indices' in loss_dict else []
                    fromLogits = loss_dict[
                        'from_logits'] if 'from_logits' in loss_dict else False

                    if loss == 'softmax_cross_entropy':
                        qnet_losses += [
                            mx.gluon.loss.SoftmaxCrossEntropyLoss(
                                from_logits=fromLogits,
                                sparse_label=sparseLabel)
                        ]
                    elif loss == 'softmax_cross_entropy_ignore_indices':
                        qnet_losses += [
                            SoftmaxCrossEntropyLossIgnoreIndices(
                                ignore_indices=ignore_indices,
                                from_logits=fromLogits,
                                sparse_label=sparseLabel)
                        ]
                    elif loss == 'sigmoid_binary_cross_entropy':
                        qnet_losses += [
                            mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(
                                from_sigmoid=True)
                        ]
                    elif loss == 'cross_entropy':
                        qnet_losses += [
                            CrossEntropyLoss(sparse_label=sparseLabel)
                        ]
                    elif loss == 'l2':
                        qnet_losses += [mx.gluon.loss.L2Loss()]
                    elif loss == 'l1':
                        qnet_losses += [mx.gluon.loss.L1Loss()]
                    elif loss == 'log_cosh':
                        qnet_losses += [LogCoshLoss()]
                    else:
                        logging.error(
                            "Invalid loss parameter for constraint:" +
                            name[:-1] + ".")
                else:
                    if domain[0] == float:
                        qnet_losses += [mx.gluon.loss.L2Loss()]
                    elif domain[0] == int:
                        qnet_losses += [
                            lambda pred, labels: mx.gluon.loss.
                            SoftmaxCrossEntropyLoss(sparse_label=False)
                            (pred, labels)
                        ]

        for name in gen_inputs:
            if name == noise_input + "_":
                domain = gen_inputs[name]
                min = domain[1]
                max = domain[2]
                if noise_distribution == "gaussian":
                    generators[
                        name] = lambda domain=domain, min=min, max=max: mx.nd.cast(
                            mx.ndarray.random.normal(
                                noise_distribution_params["mean_value"],
                                noise_distribution_params["spread_value"],
                                shape=(batch_size, ) + domain[3],
                                dtype=domain[0],
                                ctx=mx_context),
                            dtype="float32")
                elif noise_distribution == "uniform":
                    generators[
                        name] = lambda domain=domain, min=min, max=max: mx.nd.cast(
                            mx.ndarray.random.uniform(low=min,
                                                      high=max,
                                                      shape=(batch_size, ) +
                                                      domain[3],
                                                      dtype=domain[0],
                                                      ctx=mx_context),
                            dtype="float32")

        def create_generator_input(cur_batch):
            expected_qnet_output = []
            gen_input = []

            for name in gen_inputs:
                if name in traindata_to_index.keys():
                    gen_input += [
                        cur_batch.data[traindata_to_index[name]].as_in_context(
                            mx_context)
                    ]
                elif name in qnet_outputs:
                    value = generators[name]()
                    expected_qnet_output += [value]
                    gen_input += [value]
                else:
                    gen_input += [generators[name]()]
            return gen_input, expected_qnet_output

        def create_discriminator_input(cur_batch):
            conditional_input = []
            for name in gen_inputs:
                if name in traindata_to_index.keys():
                    conditional_input += [
                        cur_batch.data[traindata_to_index[name]].as_in_context(
                            mx_context)
                    ]
            return conditional_input

        tic = None

        for epoch in range(begin_epoch, begin_epoch + num_epoch):
            train_iter.reset()
            for batch_i, batch in enumerate(train_iter):
                real_data = batch.data[traindata_to_index[
                    dis_real_input + "_"]].as_in_context(mx_context)

                dis_conditional_input = create_discriminator_input(batch)
                gen_input, exp_qnet_output = create_generator_input(batch)

                with autograd.record():
                    fake_data = gen_net(*gen_input)[0][0]
                    fake_data = fake_data.detach()
                    discriminated_fake_dis = dis_net(
                        fake_data, *dis_conditional_input)[0][0]
                    if self.use_qnet:
                        discriminated_fake_dis, _ = discriminated_fake_dis

                    fake_labels = mx.nd.zeros(discriminated_fake_dis.shape,
                                              ctx=mx_context)
                    real_labels = mx.nd.ones(discriminated_fake_dis.shape,
                                             ctx=mx_context)

                    loss_resultF = dis_loss(discriminated_fake_dis,
                                            fake_labels)
                    discriminated_real_dis = dis_net(
                        real_data, *dis_conditional_input)[0][0]
                    if self.use_qnet:
                        discriminated_real_dis, _ = discriminated_real_dis
                    loss_resultR = dis_loss(discriminated_real_dis,
                                            real_labels)

                    loss_resultD = dis_loss_weight * (loss_resultR +
                                                      loss_resultF)
                    loss_resultD.backward()
                dis_trainer.step(batch_size)

                if batch_i % k_value == 0:
                    with autograd.record():
                        fake_data = gen_net(*gen_input)[0][0]
                        discriminated_fake_gen = dis_net(
                            fake_data, *dis_conditional_input)[0][0]
                        if self.use_qnet:
                            discriminated_fake_gen, features = discriminated_fake_gen
                        loss_resultG = dis_loss(discriminated_fake_gen,
                                                real_labels)
                        if generator_loss is not None:
                            condition = batch.data[traindata_to_index[
                                generator_target_name + "_"]]
                            loss_resultG = loss_resultG + gen_loss_weight * generator_loss_func(
                                fake_data, condition)
                        if self.use_qnet:
                            qnet_discriminated = [q_net(features)[0][0]]
                            for i, qnet_out in enumerate(qnet_discriminated):
                                loss_resultG = loss_resultG + qnet_losses[i](
                                    qnet_out, exp_qnet_output[i])
                        loss_resultG.backward()
                    gen_trainer.step(batch_size)
                    if self.use_qnet:
                        qnet_trainer.step(batch_size)

                if tic is None:
                    tic = time.time()
                else:
                    if batch_i % log_period == 0:
                        try:
                            speed = log_period * batch_size / (time.time() -
                                                               tic)
                        except ZeroDivisionError:
                            speed = float("inf")

                        logging.info(" Discriminator loss on real data: " +
                                     str(loss_resultR[0].asnumpy().item()))
                        logging.info(" Discriminator loss on fake data: " +
                                     str(loss_resultF[0].asnumpy().item()))
                        logging.info(" Generator loss: " +
                                     str(loss_resultG[0].asnumpy().item()))
                        logging.info(
                            "Epoch[%d] Batch[%d] Speed: %.2f samples/sec \n" %
                            (epoch, batch_i, speed))

                        tic = time.time()

                        if print_images:
                            pyplot.subplot(1, 2, 1)
                            fake_img = fake_data[0]
                            visualize(fake_img)
                            filename = 'plot_%06d%06d.png' % (epoch, batch_i)
                            pyplot.savefig(filename)
                            pyplot.close()

            if (epoch - begin_epoch) % checkpoint_period == 0:
                gen_net.save_parameters(self.parameter_path_gen() + '-' +
                                        str(epoch).zfill(4) + '.params')
                dis_net.save_parameters(self.parameter_path_dis() + '-' +
                                        str(epoch).zfill(4) + '.params')

        gen_net.save_parameters(self.parameter_path_gen() + '-' +
                                str(num_epoch + begin_epoch).zfill(4) +
                                '.params')
        gen_net.export(self.parameter_path_gen() + '_newest', epoch=0)
        dis_net.save_parameters(self.parameter_path_dis() + '-' +
                                str(num_epoch + begin_epoch).zfill(4) +
                                '.params')
        dis_net.export(self.parameter_path_dis() + '_newest', epoch=0)
        if generator_loss is not None:
            generator_loss_func.export(self.parameter_path_gen() +
                                       '_newest_loss',
                                       epoch=0)
        dis_loss.export(self.parameter_path_dis() + '_newest_loss', epoch=0)
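The optimizer-parameter handling above repeats the same translation for the generator and the discriminator; the mapping can be read as a small helper (a sketch, the helper name is not part of the original code):

import mxnet as mx

def translate_optimizer_params(params):
    # map 'weight_decay' onto Gluon's 'wd' and turn the step-decay settings
    # into an mx.lr_scheduler.FactorScheduler, mirroring the inline logic above
    params = dict(params)
    if 'weight_decay' in params:
        params['wd'] = params.pop('weight_decay')
    if 'learning_rate_decay' in params:
        min_lr = params.pop('learning_rate_minimum', 1e-08)
        params['lr_scheduler'] = mx.lr_scheduler.FactorScheduler(
            params.pop('step_size'),
            factor=params.pop('learning_rate_decay'),
            stop_factor_lr=min_lr)
    return params

With such a helper both trainers could be built as, for example, mx.gluon.Trainer(gen_net.collect_params(), optimizer, translate_optimizer_params(optimizer_params)).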
Exemplo n.º 52
0
def CNN(epoch=100,
        batch_size=128,
        save_period=10,
        load_period=100,
        optimizer="sgd",
        learning_rate=0.01,
        dataset="MNIST",
        ctx=mx.gpu(0)):

    #data selection
    if dataset == "MNIST":
        train_data, test_data = MNIST(batch_size)
        path = "weights/MNIST-{}.params".format(load_period)
    elif dataset == "CIFAR10":
        train_data, test_data = CIFAR10(batch_size)
        path = "weights/CIFAR10-{}.params".format(load_period)
    elif dataset == "FashionMNIST":
        train_data, test_data = FashionMNIST(batch_size)
        path = "weights/FashionMNIST-{}.params".format(load_period)
    else:
        return "The dataset does not exist."
    '''Follow these steps:

    •Define network
    •Initialize parameters
    •Loop over inputs
    •Forward input through network to get output
    •Compute loss with output and label
    •Backprop gradient
    •Update parameters with gradient descent.
    '''

    #Convolution Neural Network
    # formula : output_size = ((input - kernel + 2*padding) / stride) + 1
    # data size
    # MNIST,FashionMNIST = (batch size , 1 , 28 ,  28)
    # CIFAR = (batch size , 3 , 32 ,  32)
    #net = gluon.nn.Sequential() # stacks 'Block's sequentially
    net = gluon.nn.HybridSequential()  # for faster learning
    with net.name_scope():
        net.add(
            gluon.nn.Conv2D(channels=60,
                            kernel_size=(3, 3),
                            strides=(1, 1),
                            use_bias=True)
        )  # MNIST : result = ( batch size , 60 , 26 , 26) , CIFAR10 : : result = ( batch size , 60 , 30 , 30)
        net.add(
            gluon.nn.BatchNorm(axis=1,
                               momentum=0.9,
                               epsilon=1e-05,
                               center=True,
                               scale=True,
                               beta_initializer="zeros",
                               gamma_initializer="ones",
                               running_mean_initializer="zeros",
                               running_variance_initializer="ones"))
        net.add(gluon.nn.Activation("relu"))
        net.add(
            gluon.nn.MaxPool2D(pool_size=(2, 2), strides=(2, 2))
        )  # MNIST : result = (batch size , 60 , 13 , 13) , CIFAR10 : result = (batch size , 60 , 15 , 15)
        net.add(
            gluon.nn.Conv2D(channels=30,
                            kernel_size=(6, 6),
                            strides=(1, 1),
                            use_bias=True)
        )  # MNIST :  result = ( batch size , 30 , 8 , 8), CIFAR10 :  result = ( batch size , 30 , 10 , 10)
        net.add(
            gluon.nn.BatchNorm(axis=1,
                               momentum=0.9,
                               epsilon=1e-05,
                               center=True,
                               scale=True,
                               beta_initializer="zeros",
                               gamma_initializer="ones",
                               running_mean_initializer="zeros",
                               running_variance_initializer="ones"))
        net.add(gluon.nn.Activation("relu"))
        net.add(
            gluon.nn.MaxPool2D(pool_size=(2, 2), strides=(2, 2))
        )  # MNIST : result = (batch size , 30 , 4 , 4) , CIFAR10 : result = (batch size , 30 , 5 , 5)
        net.add(gluon.nn.Dense(units=120, use_bias=True, flatten=True))
        net.add(
            gluon.nn.BatchNorm(axis=1,
                               momentum=0.9,
                               epsilon=1e-05,
                               center=True,
                               scale=True,
                               beta_initializer="zeros",
                               gamma_initializer="ones",
                               running_mean_initializer="zeros",
                               running_variance_initializer="ones"))
        net.add(gluon.nn.Activation("relu"))
        net.add(gluon.nn.Dropout(0.0))
        net.add(gluon.nn.Dense(units=64, use_bias=True))
        net.add(
            gluon.nn.BatchNorm(axis=1,
                               momentum=0.9,
                               epsilon=1e-05,
                               center=True,
                               scale=True,
                               beta_initializer="zeros",
                               gamma_initializer="ones",
                               running_mean_initializer="zeros",
                               running_variance_initializer="ones"))
        net.add(gluon.nn.Activation("relu"))
        net.add(gluon.nn.Dropout(0.0))
        net.add(gluon.nn.Dense(10, use_bias=True))
    net.hybridize()  # for faster learning
    #weights initialization
    if os.path.exists(path):
        print("loading weights")
        net.load_params(filename=path, ctx=ctx)  # weights load
    else:
        print("initializing weights")
        net.collect_params().initialize(mx.init.Normal(sigma=0.1),
                                        ctx=ctx)  # weights initialization
        #net.initialize(mx.init.Normal(sigma=0.1),ctx=ctx) # weights initialization

    #optimizer
    trainer = gluon.Trainer(net.collect_params(), optimizer,
                            {"learning_rate": learning_rate})

    #learning
    for i in tqdm(range(1, epoch + 1, 1)):
        for data, label in train_data:

            data = data.as_in_context(ctx)
            label = label.as_in_context(ctx)

            with autograd.record(train_mode=True):
                output = net(data)

                #loss definition
                '''SoftmaxCrossEntropyLoss, like the other gluon.loss blocks, is a class:
                it is instantiated here and then called on (output, label).'''
                loss = gluon.loss.SoftmaxCrossEntropyLoss(axis=-1,
                                                          sparse_label=True)(
                                                              output, label)
                cost = nd.mean(loss).asscalar()
            loss.backward()
            trainer.step(batch_size, ignore_stale_grad=True)

        print(" epoch : {} , last batch cost : {}".format(i, cost))

        #weight_save
        if i % save_period == 0:

            if not os.path.exists("weights"):
                os.makedirs("weights")

            print("saving weights")
            if dataset == "MNIST":
                net.save_params("weights/MNIST-{}.params".format(i))

            if dataset == "FashionMNIST":
                net.save_params("weights/FashionMNIST-{}.params".format(i))

            elif dataset == "CIFAR10":
                net.save_params("weights/CIFAR10-{}.params".format(i))

    test_accuracy = evaluate_accuracy(test_data, net, ctx)
    print("Test_acc : {}".format(test_accuracy))

    return "optimization completed"
Exemplo n.º 53
0
def train(net, train_data, val_data, eval_metric, polygon_metric, ctx, args):
    """Training pipeline"""
    net.collect_params().reset_ctx(ctx)
    if args.no_wd:
        for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
            v.wd_mult = 0.0

    if args.label_smooth:
        net._target_generator._label_smooth = True

    if args.lr_decay_period > 0:
        lr_decay_epoch = list(
            range(args.lr_decay_period, args.epochs, args.lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')]
    lr_scheduler = LRScheduler(mode=args.lr_mode,
                               baselr=args.lr,
                               niters=args.num_samples // args.batch_size,
                               nepochs=args.epochs,
                               step=lr_decay_epoch,
                               step_factor=args.lr_decay,
                               power=2,
                               warmup_epochs=args.warmup_epochs)

    trainer = gluon.Trainer(net.collect_params(),
                            'sgd', {
                                'wd': args.wd,
                                'momentum': args.momentum,
                                'lr_scheduler': lr_scheduler
                            },
                            kvstore='local')
    # targets
    sigmoid_ce = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    l1_loss = gluon.loss.L1Loss()

    # metrics
    obj_metrics = mx.metric.Loss('ObjLoss')
    center_metrics = mx.metric.Loss('BoxCenterLoss')
    scale_metrics = mx.metric.Loss('BoxScaleLoss')
    # coef_center_metrics = mx.metric.Loss('CoefCenterLoss')
    coef_metrics = mx.metric.Loss('CoefLoss')
    # w_metrics = mx.metric.Loss('wLoss')
    cls_metrics = mx.metric.Loss('ClassLoss')
    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
        if args.mixup:
            # TODO(threshold): more elegant way to control mixup during runtime
            try:
                train_data._dataset.set_mixup(np.random.beta, 1.5, 1.5)
            except AttributeError:
                train_data._dataset._data.set_mixup(np.random.beta, 1.5, 1.5)
            if epoch >= args.epochs - args.no_mixup_epochs:
                try:
                    train_data._dataset.set_mixup(None)
                except AttributeError:
                    train_data._dataset._data.set_mixup(None)
        tic = time.time()
        btic = time.time()
        mx.nd.waitall()
        net.hybridize()
        for i, batch in enumerate(train_data):
            batch_size = batch[0].shape[0]
            data = gluon.utils.split_and_load(batch[0],
                                              ctx_list=ctx,
                                              batch_axis=0)
            fixed_targets = [
                gluon.utils.split_and_load(batch[it],
                                           ctx_list=ctx,
                                           batch_axis=0) for it in range(1, 7)
            ]
            gt_boxes = gluon.utils.split_and_load(batch[7],
                                                  ctx_list=ctx,
                                                  batch_axis=0)
            sum_losses = []
            obj_losses = []
            center_losses = []
            scale_losses = []
            # coef_center_losses = []
            coef_losses = []
            cls_losses = []
            with autograd.record():
                for ix, x in enumerate(data):
                    obj_loss, center_loss, scale_loss, coef_loss, cls_loss = net(
                        x, gt_boxes[ix], *[ft[ix] for ft in fixed_targets])
                    if args.only_bbox:
                        sum_losses.append(obj_loss + center_loss + scale_loss +
                                          cls_loss)
                    else:
                        sum_losses.append(obj_loss + center_loss + scale_loss +
                                          coef_loss + cls_loss)
                        # coef_center_losses.append(coef_center_loss)
                        coef_losses.append(coef_loss)
                    obj_losses.append(obj_loss)
                    center_losses.append(center_loss)
                    scale_losses.append(scale_loss)
                    cls_losses.append(cls_loss)
                autograd.backward(sum_losses)
            lr_scheduler.update(i, epoch)
            trainer.step(batch_size)
            if not args.only_bbox:
                # coef_center_metrics.update(0, coef_center_losses)
                coef_metrics.update(0, coef_losses)
            obj_metrics.update(0, obj_losses)
            center_metrics.update(0, center_losses)
            scale_metrics.update(0, scale_losses)
            cls_metrics.update(0, cls_losses)
            if args.log_interval and not (i + 1) % args.log_interval:
                name1, loss1 = obj_metrics.get()
                name2, loss2 = center_metrics.get()
                name3, loss3 = scale_metrics.get()
                if not args.only_bbox:
                    # name4, loss4 = coef_center_metrics.get()
                    name5, loss5 = coef_metrics.get()
                name6, loss6 = cls_metrics.get()
                if args.only_bbox:
                    logger.info(
                        '[Epoch {}][Batch {}], LR: {:.2E}, Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'
                        .format(epoch, i, trainer.learning_rate,
                                batch_size / (time.time() - btic), name1,
                                loss1, name2, loss2, name3, loss3, name6,
                                loss6))
                else:
                    logger.info(
                        '[Epoch {}][Batch {}], LR: {:.2E}, Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'
                        .format(epoch, i, trainer.learning_rate,
                                batch_size / (time.time() - btic), name1,
                                loss1, name2, loss2, name3, loss3, name5,
                                loss5, name6, loss6))
            btic = time.time()

        name1, loss1 = obj_metrics.get()
        name2, loss2 = center_metrics.get()
        name3, loss3 = scale_metrics.get()
        if not args.only_bbox:
            # name4, loss4 = coef_center_metrics.get()
            name5, loss5 = coef_metrics.get()
        name6, loss6 = cls_metrics.get()
        if args.only_bbox:
            logger.info(
                '[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'
                .format(epoch, (time.time() - tic), name1, loss1, name2, loss2,
                        name3, loss3, name6, loss6))
        else:
            logger.info(
                '[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'
                .format(epoch, (time.time() - tic), name1, loss1, name2, loss2,
                        name3, loss3, name5, loss5, name6, loss6))
        # validation is disabled here via 'if False'; drop 'False and' to re-enable it
        if False and not epoch % args.val_interval:
            # consider reducing the frequency of validation to save time
            map_bbox, map_polygon = validate(net, val_data, ctx, eval_metric,
                                             polygon_metric, args)
            map_name, mean_ap = map_bbox
            polygonmap_name, polygonmean_ap = map_polygon
            val_msg = '\n'.join(
                ['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
            logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
            polygonval_msg = '\n'.join([
                '{}={}'.format(k, v)
                for k, v in zip(polygonmap_name, polygonmean_ap)
            ])
            logger.info('[Epoch {}] PolygonValidation: \n{}'.format(
                epoch, polygonval_msg))
            current_map = float(polygonmean_ap[-1])
        else:
            current_map = 0.
        save_params(net, best_map, current_map, epoch, args.save_interval,
                    args.save_prefix)
Exemplo n.º 54
0
    def _train_loop(self, train_data, val_data):
        if self._cfg.train.no_wd:
            for k, v in self.net.collect_params(
                    '.*beta|.*gamma|.*bias').items():
                v.wd_mult = 0.0
        if self._cfg.train.label_smoothing or self._cfg.train.mixup:
            sparse_label_loss = False
        else:
            sparse_label_loss = True
        if self.distillation:
            L = loss.DistillationSoftmaxCrossEntropyLoss(
                temperature=self._cfg.train.temperature,
                hard_weight=self._cfg.train.hard_weight,
                sparse_label=sparse_label_loss)
        else:
            L = gluon.loss.SoftmaxCrossEntropyLoss(
                sparse_label=sparse_label_loss)

        if self._cfg.train.mixup:
            train_metric = mx.metric.RMSE()
        else:
            train_metric = mx.metric.Accuracy()
        if self._cfg.train.mode == 'hybrid':
            self.net.hybridize(static_alloc=True, static_shape=True)
            if self.distillation:
                self.teacher.hybridize(static_alloc=True, static_shape=True)

        self._logger.info('Start training from [Epoch %d]',
                          max(self._cfg.train.start_epoch, self.epoch))
        for self.epoch in range(max(self._cfg.train.start_epoch, self.epoch),
                                self._cfg.train.epochs):
            epoch = self.epoch
            tic = time.time()
            btic = time.time()
            if self._cfg.train.use_rec:
                train_data.reset()
            train_metric.reset()

            # pylint: disable=undefined-loop-variable
            for i, batch in enumerate(train_data):
                data, label = self.batch_fn(batch, self.ctx)

                if self._cfg.train.mixup:
                    lam = np.random.beta(self._cfg.train.mixup_alpha,
                                         self._cfg.train.mixup_alpha)
                    if epoch >= self._cfg.train.epochs - self._cfg.train.mixup_off_epoch:
                        lam = 1
                    data = [lam * X + (1 - lam) * X[::-1] for X in data]

                    if self._cfg.train.label_smoothing:
                        eta = 0.1
                    else:
                        eta = 0.0
                    label = mixup_transform(label, classes, lam, eta)

                elif self._cfg.train.label_smoothing:
                    hard_label = label
                    label = smooth(label, classes)

                if self.distillation:
                    teacher_prob = [nd.softmax(self.teacher(X.astype(self._cfg.train.dtype, copy=False)) \
                                    / self._cfg.train.temperature) for X in data]

                with ag.record():
                    outputs = [
                        self.net(X.astype(self._cfg.train.dtype, copy=False))
                        for X in data
                    ]
                    if self.distillation:
                        losses = [L(yhat.astype('float32', copy=False),
                                    y.astype('float32', copy=False),
                                    p.astype('float32', copy=False)) \
                                        for yhat, y, p in zip(outputs, label, teacher_prob)]
                    else:
                        losses = [
                            L(yhat, y.astype(self._cfg.train.dtype,
                                             copy=False))
                            for yhat, y in zip(outputs, label)
                        ]
                for l in losses:
                    l.backward()
                self.trainer.step(self.batch_size)

                if self._cfg.train.mixup:
                    output_softmax = [nd.SoftmaxActivation(out.astype('float32', copy=False)) \
                                    for out in outputs]
                    train_metric.update(label, output_softmax)
                else:
                    if self._cfg.train.label_smoothing:
                        train_metric.update(hard_label, outputs)
                    else:
                        train_metric.update(label, outputs)

                if self._cfg.train.log_interval and not (
                        i + 1) % self._cfg.train.log_interval:
                    train_metric_name, train_metric_score = train_metric.get()
                    self._logger.info(
                        'Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f\tlr=%f',
                        epoch, i, self._cfg.train.batch_size *
                        self._cfg.train.log_interval / (time.time() - btic),
                        train_metric_name, train_metric_score,
                        self.trainer.learning_rate)
                    btic = time.time()

            train_metric_name, train_metric_score = train_metric.get()
            throughput = int(self.batch_size * i / (time.time() - tic))

            top1_val, top5_val = self._evaluate(val_data)

            self._logger.info('[Epoch %d] training: %s=%f', epoch,
                              train_metric_name, train_metric_score)
            self._logger.info(
                '[Epoch %d] speed: %d samples/sec\ttime cost: %f', epoch,
                throughput,
                time.time() - tic)
            self._logger.info('[Epoch %d] validation: top1=%f top5=%f', epoch,
                              top1_val, top5_val)

            if top1_val > self._best_acc:
                cp_name = os.path.join(self._logdir, 'best_checkpoint.pkl')
                self._logger.info(
                    '[Epoch %d] Current best top-1: %f vs previous %f, saved to %s',
                    self.epoch, top1_val, self._best_acc, cp_name)
                self.save(cp_name)
                self._best_acc = top1_val
            if self._reporter:
                self._reporter(epoch=epoch, acc_reward=top1_val)
            self._time_elapsed += time.time() - btic
        return {
            'train_acc': train_metric_score,
            'valid_acc': self._best_acc,
            'time': self._time_elapsed
        }
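smooth() and mixup_transform() are used above but defined elsewhere; a sketch under the assumption that they follow the usual GluonCV label-smoothing and mixup recipes:

from mxnet import nd

def smooth(label, classes, eta=0.1):
    # soften one-hot targets: 1 - eta + eta/classes on the true class, eta/classes elsewhere
    if isinstance(label, nd.NDArray):
        label = [label]
    return [l.one_hot(classes, on_value=1 - eta + eta / classes,
                      off_value=eta / classes) for l in label]

def mixup_transform(label, classes, lam=1, eta=0.0):
    # mix each (optionally smoothed) one-hot label with the reversed batch, weighted by lam
    if isinstance(label, nd.NDArray):
        label = [label]
    mixed = []
    for l in label:
        y1 = l.one_hot(classes, on_value=1 - eta + eta / classes,
                       off_value=eta / classes)
        y2 = l[::-1].one_hot(classes, on_value=1 - eta + eta / classes,
                             off_value=eta / classes)
        mixed.append(lam * y1 + (1 - lam) * y2)
    return mixed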
Exemplo n.º 55
0
def check_nth_order_unary(x, op, grad_ops, orders, rtol=None, atol=None):
    """Assert n-th order autograd gradient against expected gradient.

    Multiple order of gradients can be checked by passing list of
    function computing the particular order gradient and passing the
    corresponding list of order.

    Note
    ----
    1. Orders should always be monotonically increasing.
    2. Elements of grad_ops should correspond to elements of orders,
       i.e. grad_ops = [grad_op, grad_grad_grad_op] should be passed with
       orders = [1, 3]

    Parameters
    ----------
    x : mxnet.NDArray
        Input Array.
    op : Callable
        Operation to perform on Input Array.
    grad_ops : Callable or List of Callable
        Function to compute and assert gradient of given order.
    orders : int or List of int
        Order(s) at which to assert expected against computed gradients.

    Returns
    -------
    None

    """
    if isinstance(orders, int):
        orders = [orders]
        grad_ops = [grad_ops]

    assert all(i < j for i, j in zip(orders[0:-1], orders[1:])), \
        "orders should be monotonically increasing"
    assert len(set(orders)) == len(orders), \
        "orders should have unique elements"
    highest_order = max(orders)

    x = nd.array(x)
    x.attach_grad()

    expected_grads = [grad_op(x) for grad_op in grad_ops]
    computed_grads = []
    head_grads = []

    # Perform compute.
    with autograd.record():
        y = op(x)
        for current_order in range(1, highest_order + 1):
            head_grad = nd.random.normal(shape=x.shape)
            y = autograd.grad(heads=y,
                              variables=x,
                              head_grads=head_grad,
                              create_graph=True,
                              retain_graph=True)[0]
            if current_order in orders:
                computed_grads.append(y)
            head_grads.append(head_grad)

    # Validate all the gradients.
    for order, grad, computed_grad in \
            zip(orders, expected_grads, computed_grads):
        # Compute expected values.
        expected_grad = grad.asnumpy()
        for head_grad in head_grads[:order]:
            expected_grad *= head_grad.asnumpy()

        assert_almost_equal(expected_grad,
                            computed_grad.asnumpy(),
                            rtol=rtol,
                            atol=atol)
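A hypothetical call of the checker above, asserting the first and third derivatives of sin (cos and -cos respectively) on random input:

import numpy as np
from mxnet import nd

x = np.random.uniform(-1.0, 1.0, size=(2, 3))
check_nth_order_unary(x, nd.sin,
                      [nd.cos, lambda inp: -nd.cos(inp)],
                      [1, 3], rtol=1e-5, atol=1e-5)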
Exemplo n.º 56
0
def train(args):
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment, but here we just use simple cpu context.
    ctx = mx.cpu()

    # retrieve the hyperparameters we set in notebook (with some defaults)
    batch_size = args.batch_size
    epochs = args.epochs
    learning_rate = args.learning_rate
    momentum = args.momentum
    log_interval = args.log_interval

    num_gpus = int(os.environ['SM_NUM_GPUS'])
    current_host = args.current_host
    hosts = args.hosts
    model_dir = args.model_dir
    CHECKPOINTS_DIR = '/opt/ml/checkpoints'
    checkpoints_enabled = os.path.exists(CHECKPOINTS_DIR)

    # load training and validation data
    # we use the gluon.data.vision.MNIST class because of its built in mnist pre-processing logic,
    # but point it at the location where SageMaker placed the data files, so it doesn't download them again.
    training_dir = args.train
    train_data = get_train_data(training_dir + '/train', batch_size)
    val_data = get_val_data(training_dir + '/test', batch_size)

    # define the network
    net = define_network()

    # Collect all parameters from net and its children, then initialize them.
    net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)
    # Trainer is for updating parameters with gradient.

    if len(hosts) == 1:
        kvstore = 'device' if num_gpus > 0 else 'local'
    else:
        kvstore = 'dist_device_sync' if num_gpus > 0 else 'dist_sync'

    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': learning_rate, 'momentum': momentum},
                            kvstore=kvstore)
    metric = mx.metric.Accuracy()
    loss = gluon.loss.SoftmaxCrossEntropyLoss()

    # shard the training data in case we are doing distributed training. Alternatively to splitting in memory,
    # the data could be pre-split in S3 and use ShardedByS3Key to do distributed training.
    if len(hosts) > 1:
        train_data = [x for x in train_data]
        shard_size = len(train_data) // len(hosts)
        for i, host in enumerate(hosts):
            if host == current_host:
                start = shard_size * i
                end = start + shard_size
                break

        train_data = train_data[start:end]

    net.hybridize()

    best_val_score = 0.0
    for epoch in range(epochs):
        # reset data iterator and metric at the beginning of each epoch.
        metric.reset()
        btic = time.time()
        for i, (data, label) in enumerate(train_data):
            # Copy data to ctx if necessary
            data = data.as_in_context(ctx)
            label = label.as_in_context(ctx)
            # Start recording computation graph with record() section.
            # Recorded graphs can then be differentiated with backward.
            with autograd.record():
                output = net(data)
                L = loss(output, label)
                L.backward()
            # take a gradient step with batch_size equal to data.shape[0]
            trainer.step(data.shape[0])
            # update metric at last.
            metric.update([label], [output])

            if i % log_interval == 0 and i > 0:
                name, acc = metric.get()
                print('[Epoch %d Batch %d] Training: %s=%f, %f samples/s' %
                      (epoch, i, name, acc, batch_size / (time.time() - btic)))

            btic = time.time()

        name, acc = metric.get()
        print('[Epoch %d] Training: %s=%f' % (epoch, name, acc))

        name, val_acc = test(ctx, net, val_data)
        print('[Epoch %d] Validation: %s=%f' % (epoch, name, val_acc))
        # checkpoint the model, params and optimizer states in the folder /opt/ml/checkpoints
        if checkpoints_enabled and val_acc > best_val_score:
            best_val_score = val_acc
            logging.info('Saving the model, params and optimizer state.')
            net.export(CHECKPOINTS_DIR + "/%.4f-gluon_mnist"%(best_val_score), epoch)
            trainer.save_states(CHECKPOINTS_DIR + '/%.4f-gluon_mnist-%d.states'%(best_val_score, epoch))

    if current_host == hosts[0]:
        save(net, model_dir)
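The test helper used for validation above is not included in this snippet; a minimal sketch assuming it simply reports classification accuracy over the validation loader:

import mxnet as mx

def test(ctx, net, val_data):
    # evaluate accuracy on the validation set (assumed helper)
    metric = mx.metric.Accuracy()
    for data, label in val_data:
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        output = net(data)
        metric.update([label], [output])
    return metric.get()  # returns (name, accuracy), as unpacked above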
Exemplo n.º 57
0
def train(net, train_data, val_data, eval_metric, ctx, args, reporter,
          final_fit):
    """Training pipeline"""
    net.collect_params().reset_ctx(ctx)
    if args.no_wd:
        for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
            v.wd_mult = 0.0

    if args.label_smooth:
        net._target_generator._label_smooth = True

    if args.lr_decay_period > 0:
        lr_decay_epoch = list(
            range(args.lr_decay_period, args.epochs, args.lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')]
    lr_decay_epoch = [e - args.warmup_epochs for e in lr_decay_epoch]
    num_batches = args.num_samples // args.batch_size
    lr_scheduler = LRSequential([
        LRScheduler('linear',
                    base_lr=0,
                    target_lr=args.lr,
                    nepochs=args.warmup_epochs,
                    iters_per_epoch=num_batches),
        LRScheduler(args.lr_mode,
                    base_lr=args.lr,
                    nepochs=args.epochs - args.warmup_epochs,
                    iters_per_epoch=num_batches,
                    step_epoch=lr_decay_epoch,
                    step_factor=args.lr_decay,
                    power=2),
    ])

    trainer = gluon.Trainer(net.collect_params(),
                            'sgd', {
                                'wd': args.wd,
                                'momentum': args.momentum,
                                'lr_scheduler': lr_scheduler
                            },
                            kvstore='local')

    # targets
    sigmoid_ce = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    l1_loss = gluon.loss.L1Loss()

    # metrics
    obj_metrics = mx.metric.Loss('ObjLoss')
    center_metrics = mx.metric.Loss('BoxCenterLoss')
    scale_metrics = mx.metric.Loss('BoxScaleLoss')
    cls_metrics = mx.metric.Loss('ClassLoss')

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    #logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]

    pre_current_map = 0
    for epoch in range(args.start_epoch, args.epochs):
        #tbar2.next()
        if args.mixup:
            # TODO(zhreshold): more elegant way to control mixup during runtime
            try:
                train_data._dataset.set_mixup(np.random.beta, 1.5, 1.5)
            except AttributeError:
                train_data._dataset._data.set_mixup(np.random.beta, 1.5, 1.5)
            if epoch >= args.epochs - args.no_mixup_epochs:
                try:
                    train_data._dataset.set_mixup(None)
                except AttributeError:
                    train_data._dataset._data.set_mixup(None)

        tic = time.time()
        btic = time.time()
        mx.nd.waitall()
        net.hybridize()
        for i, batch in enumerate(train_data):
            batch_size = batch[0].shape[0]
            data = gluon.utils.split_and_load(batch[0],
                                              ctx_list=ctx,
                                              batch_axis=0)
            # objectness, center_targets, scale_targets, weights, class_targets
            fixed_targets = [
                gluon.utils.split_and_load(batch[it],
                                           ctx_list=ctx,
                                           batch_axis=0) for it in range(1, 6)
            ]
            gt_boxes = gluon.utils.split_and_load(batch[6],
                                                  ctx_list=ctx,
                                                  batch_axis=0)
            sum_losses = []
            obj_losses = []
            center_losses = []
            scale_losses = []
            cls_losses = []
            with autograd.record():
                for ix, x in enumerate(data):
                    obj_loss, center_loss, scale_loss, cls_loss = net(
                        x, gt_boxes[ix], *[ft[ix] for ft in fixed_targets])
                    sum_losses.append(obj_loss + center_loss + scale_loss +
                                      cls_loss)
                    obj_losses.append(obj_loss)
                    center_losses.append(center_loss)
                    scale_losses.append(scale_loss)
                    cls_losses.append(cls_loss)
                autograd.backward(sum_losses)
            trainer.step(batch_size)
            obj_metrics.update(0, obj_losses)
            center_metrics.update(0, center_losses)
            scale_metrics.update(0, scale_losses)
            cls_metrics.update(0, cls_losses)
            if args.log_interval and not (i + 1) % args.log_interval:
                name1, loss1 = obj_metrics.get()
                name2, loss2 = center_metrics.get()
                name3, loss3 = scale_metrics.get()
                name4, loss4 = cls_metrics.get()
                logger.info(
                    '[Epoch {}][Batch {}], LR: {:.2E}, Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'
                    .format(epoch, i, trainer.learning_rate,
                            batch_size / (time.time() - btic), name1, loss1,
                            name2, loss2, name3, loss3, name4, loss4))
            btic = time.time()

        name1, loss1 = obj_metrics.get()
        name2, loss2 = center_metrics.get()
        name3, loss3 = scale_metrics.get()
        name4, loss4 = cls_metrics.get()
        logger.info(
            '[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'
            .format(epoch, (time.time() - tic), name1, loss1, name2, loss2,
                    name3, loss3, name4, loss4))
        if (not (epoch + 1) % args.val_interval) and not final_fit:
            # consider reduce the frequency of validation to save time
            map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
            val_msg = ' '.join(
                ['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
            # tbar.set_description('[Epoch {}] Validation: {}'.format(epoch, val_msg))
            logger.info('[Epoch {}] Validation: {}'.format(epoch, val_msg))
            current_map = float(mean_ap[-1])
            pre_current_map = current_map
        else:
            current_map = pre_current_map
        reporter(epoch=epoch, map_reward=current_map)
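
validate() is referenced above but not defined in this excerpt. In GluonCV's YOLOv3 training scripts the helper typically follows the pattern sketched below; the NMS thresholds and the slicing of the ground-truth label tensor are common defaults, assumed here rather than taken from this example.

def validate(net, val_data, ctx, eval_metric):
    """Sketch of a typical GluonCV-style validation pass for a YOLOv3 detector."""
    eval_metric.reset()
    net.set_nms(nms_thresh=0.45, nms_topk=400)  # assumed defaults
    net.hybridize()
    for batch in val_data:
        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0, even_split=False)
        label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0, even_split=False)
        det_bboxes, det_ids, det_scores = [], [], []
        gt_bboxes, gt_ids, gt_difficults = [], [], []
        for x, y in zip(data, label):
            # Detector forward pass in inference mode: class ids, scores, boxes.
            ids, scores, bboxes = net(x)
            det_ids.append(ids)
            det_scores.append(scores)
            det_bboxes.append(bboxes.clip(0, batch[0].shape[2]))
            # Assumed ground-truth layout: [xmin, ymin, xmax, ymax, cls, difficult?]
            gt_bboxes.append(y.slice_axis(axis=-1, begin=0, end=4))
            gt_ids.append(y.slice_axis(axis=-1, begin=4, end=5))
            gt_difficults.append(
                y.slice_axis(axis=-1, begin=5, end=6) if y.shape[-1] > 5 else None)
        eval_metric.update(det_bboxes, det_ids, det_scores, gt_bboxes, gt_ids, gt_difficults)
    return eval_metric.get()
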
def main(args):
    # Function to get mnist iterator given a rank
    def get_mnist_iterator(rank):
        data_dir = "data-%d" % rank
        if not os.path.isdir(data_dir):
            os.makedirs(data_dir)
        zip_file_path = download('http://data.mxnet.io/mxnet/data/mnist.zip',
                                 dirname=data_dir)
        with zipfile.ZipFile(zip_file_path) as zf:
            zf.extractall(data_dir)

        input_shape = (1, 28, 28)
        batch_size = args.batch_size

        train_iter = mx.io.MNISTIter(
            image="%s/train-images-idx3-ubyte" % data_dir,
            label="%s/train-labels-idx1-ubyte" % data_dir,
            input_shape=input_shape,
            batch_size=batch_size,
            shuffle=True,
            flat=False,
            num_parts=hvd.size(),
            part_index=hvd.rank())

        val_iter = mx.io.MNISTIter(
            image="%s/t10k-images-idx3-ubyte" % data_dir,
            label="%s/t10k-labels-idx1-ubyte" % data_dir,
            input_shape=input_shape,
            batch_size=batch_size,
            flat=False,
        )

        return train_iter, val_iter

    kernel_size = 5
    strides = 2
    pool_size = 2
    hidden_dim = 512
    output_dim = 10
    activation = 'relu'

    # Function to define neural network
    def conv_nets():
        net = gluon.nn.HybridSequential()
        with net.name_scope():
            net.add(
                gluon.nn.Conv2D(channels=20,
                                kernel_size=kernel_size,
                                activation=activation))
            net.add(gluon.nn.MaxPool2D(pool_size=pool_size, strides=strides))
            net.add(
                gluon.nn.Conv2D(channels=50,
                                kernel_size=kernel_size,
                                activation=activation))
            net.add(gluon.nn.MaxPool2D(pool_size=pool_size, strides=strides))
            net.add(gluon.nn.Flatten())
            net.add(gluon.nn.Dense(hidden_dim, activation=activation))
            net.add(gluon.nn.Dense(output_dim))
        return net

    # Function to evaluate accuracy for a model
    def evaluate(model, data_iter, context):
        data_iter.reset()
        metric = mx.metric.Accuracy()
        for _, batch in enumerate(data_iter):
            data = batch.data[0].as_in_context(context)
            label = batch.label[0].as_in_context(context)
            output = model(data.astype(args.dtype, copy=False))
            metric.update([label], [output])

        return metric.get()

    # Initialize Horovod
    hvd.init()

    # Horovod: pin context to local rank
    context = mx.cpu(hvd.local_rank()) if args.no_cuda else mx.gpu(
        hvd.local_rank())
    num_workers = hvd.size()

    # Load training and validation data
    train_data, val_data = get_mnist_iterator(hvd.rank())

    # Build model
    model = conv_nets()
    model.cast(args.dtype)
    model.hybridize()

    # Create optimizer
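    # Linear LR scaling (a common Horovod convention): the base learning rate
    # is multiplied by hvd.size() because the effective global batch size grows
    # with the number of workers.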
    optimizer_params = {
        'momentum': args.momentum,
        'learning_rate': args.lr * hvd.size()
    }
    opt = mx.optimizer.create('sgd', **optimizer_params)

    # Initialize parameters
    initializer = mx.init.Xavier(rnd_type='gaussian',
                                 factor_type="in",
                                 magnitude=2)
    model.initialize(initializer, ctx=context)

    # Horovod: fetch and broadcast parameters
    params = model.collect_params()
    if params is not None:
        hvd.broadcast_parameters(params, root_rank=0)
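        # Broadcasting from root_rank=0 ensures every worker starts from the
        # same initial weights, keeping the synchronized updates consistent.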

    # Horovod: create DistributedTrainer, a subclass of gluon.Trainer
    trainer = hvd.DistributedTrainer(params, opt)

    # Create loss function and train metric
    loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
    metric = mx.metric.Accuracy()

    # Global training timing
    if hvd.rank() == 0:
        global_tic = time.time()

    # Train model
    for epoch in range(args.epochs):
        tic = time.time()
        train_data.reset()
        metric.reset()
        for nbatch, batch in enumerate(train_data, start=1):
            data = batch.data[0].as_in_context(context)
            label = batch.label[0].as_in_context(context)
            with autograd.record():
                output = model(data.astype(args.dtype, copy=False))
                loss = loss_fn(output, label)
            loss.backward()
            trainer.step(args.batch_size)
            metric.update([label], [output])

            if nbatch % 100 == 0:
                name, acc = metric.get()
                logging.info('[Epoch %d Batch %d] Training: %s=%f' %
                             (epoch, nbatch, name, acc))

        if hvd.rank() == 0:
            elapsed = time.time() - tic
            speed = nbatch * args.batch_size * hvd.size() / elapsed
            logging.info('Epoch[%d]\tSpeed=%.2f samples/s\tTime cost=%f',
                         epoch, speed, elapsed)

        # Evaluate model accuracy
        _, train_acc = metric.get()
        name, val_acc = evaluate(model, val_data, context)
        if hvd.rank() == 0:
            logging.info('Epoch[%d]\tTrain: %s=%f\tValidation: %s=%f', epoch,
                         name, train_acc, name, val_acc)

        if hvd.rank() == 0 and epoch == args.epochs - 1:
            assert val_acc > 0.96, (
                "Achieved accuracy (%f) is lower than expected (0.96)" % val_acc)

    if hvd.rank() == 0:
        global_training_time = time.time() - global_tic
        print(
            "Global elapsed time on training: {}".format(global_training_time))
        device = context.device_type + str(num_workers)
        logging.info('Device info: %s', device)
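
main() reads a handful of attributes from args that are parsed elsewhere in the script. A minimal argparse sketch covering just the flags used above (the defaults are placeholders, not values from the original):

import argparse

def parse_args():
    # Hypothetical parser matching the args.* attributes read by main() above.
    parser = argparse.ArgumentParser(description='Horovod MXNet MNIST example')
    parser.add_argument('--batch-size', type=int, default=64)    # args.batch_size
    parser.add_argument('--dtype', type=str, default='float32')  # args.dtype
    parser.add_argument('--epochs', type=int, default=5)         # args.epochs
    parser.add_argument('--lr', type=float, default=0.01)        # args.lr (scaled by hvd.size())
    parser.add_argument('--momentum', type=float, default=0.9)   # args.momentum
    parser.add_argument('--no-cuda', action='store_true')        # args.no_cuda
    return parser.parse_args()
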
Example no. 59
0
def train(epochs, ctx):
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    net.initialize(mx.init.Xavier(), ctx=ctx)

    train_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(train=True).transform_first(transform_train),
        batch_size=batch_size,
        shuffle=True,
        last_batch='discard',
        num_workers=num_workers)

    val_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(train=False).transform_first(transform_test),
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers)

    trainer = gluon.Trainer(net.collect_params(), optimizer, {
        'learning_rate': opt.lr,
        'wd': opt.wd,
        'momentum': opt.momentum
    })
    metric = mx.metric.Accuracy()
    train_metric = mx.metric.Accuracy()
    loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
    train_history = TrainingHistory(['training-error', 'validation-error'])

    iteration = 0
    lr_decay_count = 0

    best_val_score = 0

    for epoch in range(epochs):
        tic = time.time()
        train_metric.reset()
        metric.reset()
        train_loss = 0
        num_batch = len(train_data)
        alpha = 1

        if epoch == lr_decay_epoch[lr_decay_count]:
            trainer.set_learning_rate(trainer.learning_rate * lr_decay)
            lr_decay_count += 1

        for i, batch in enumerate(train_data):
            data = gluon.utils.split_and_load(batch[0],
                                              ctx_list=ctx,
                                              batch_axis=0)
            label = gluon.utils.split_and_load(batch[1],
                                               ctx_list=ctx,
                                               batch_axis=0)

            with ag.record():
                output = [net(X) for X in data]
                loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
            for l in loss:
                l.backward()
            trainer.step(batch_size)
            train_loss += sum([l.sum().asscalar() for l in loss])

            train_metric.update(label, output)
            name, acc = train_metric.get()
            iteration += 1

        train_loss /= batch_size * num_batch
        name, acc = train_metric.get()
        name, val_acc = test(ctx, val_data)
        train_history.update([1 - acc, 1 - val_acc])
        train_history.plot(save_path='%s/%s_history.png' %
                           (plot_path, model_name))

        if val_acc > best_val_score:
            best_val_score = val_acc
            net.save_parameters('%s/%.4f-cifar-%s-%d-best.params' %
                                (save_dir, best_val_score, model_name, epoch))

        logging.info('[Epoch %d] train=%f val=%f loss=%f time: %f' %
                     (epoch, acc, val_acc, train_loss, time.time() - tic))

        if save_period and save_dir and (epoch + 1) % save_period == 0:
            net.save_parameters('%s/cifar10-%s-%d.params' %
                                (save_dir, model_name, epoch))

    if save_period and save_dir:
        net.save_parameters('%s/cifar10-%s-%d.params' %
                            (save_dir, model_name, epochs - 1))
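
The CIFAR-10 DataLoaders above rely on transform_train and transform_test defined elsewhere. A plausible definition using mxnet.gluon's built-in transforms is sketched below; the normalization statistics are the commonly used CIFAR-10 values, assumed rather than copied from this script.

from mxnet.gluon.data.vision import transforms

# Assumed augmentation pipeline; reference scripts often also pad and
# random-crop the 32x32 images before flipping.
transform_train = transforms.Compose([
    transforms.RandomFlipLeftRight(),
    transforms.ToTensor(),
    transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]),
])
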
def train(net, async_net, ctx, args):
    """Training pipeline"""
    net.collect_params().reset_ctx(ctx)
    if args.no_wd:
        for k, v in net.collect_params(".*beta|.*gamma|.*bias").items():
            v.wd_mult = 0.0

    if args.label_smooth:
        net._target_generator._label_smooth = True

    if args.lr_decay_period > 0:
        lr_decay_epoch = list(
            range(args.lr_decay_period, args.epochs, args.lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')]

    lr_scheduler = LRSequential([
        LRScheduler("linear",
                    base_lr=0,
                    target_lr=args.lr,
                    nepochs=args.warmup_epochs,
                    iters_per_epoch=args.batch_size),
        LRScheduler(args.lr_mode,
                    base_lr=args.lr,
                    nepochs=args.epochs - args.warmup_epochs,
                    iters_per_epoch=args.batch_size,
                    step_epoch=lr_decay_epoch,
                    step_factor=args.lr_decay,
                    power=2),
    ])
    if (args.optimizer == "sgd"):
        trainer = gluon.Trainer(net.collect_params(),
                                args.optimizer, {
                                    "wd": args.wd,
                                    "momentum": args.momentum,
                                    "lr_scheduler": lr_scheduler
                                },
                                kvstore="local")
    elif (args.optimizer == "adam"):
        trainer = gluon.Trainer(net.collect_params(),
                                args.optimizer, {"lr_scheduler": lr_scheduler},
                                kvstore="local")
    else:
        trainer = gluon.Trainer(net.collect_params(),
                                args.optimizer,
                                kvstore="local")

    # targets
    #sigmoid_ce = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    #l1_loss = gluon.loss.L1Loss()

    # Intermediate Metrics:
    train_metrics = (
        mx.metric.Loss("ObjLoss"),
        mx.metric.Loss("BoxCenterLoss"),
        mx.metric.Loss("BoxScaleLoss"),
        mx.metric.Loss("ClassLoss"),
        mx.metric.Loss("TotalLoss"),
    )
    train_metric_ixs = range(len(train_metrics))
    target_metric_ix = -1  # Train towards TotalLoss (the last one)

    # Evaluation Metrics:
    val_metric = VOC07MApMetric(iou_thresh=0.5)

    # Data transformations:
    train_batchify_fn = Tuple(*([Stack() for _ in range(6)] +
                                [Pad(axis=0, pad_val=-1) for _ in range(1)]))
    train_transforms = (YOLO3DefaultTrainTransform(
        args.data_shape, args.data_shape, net=async_net,
        mixup=args.mixup) if args.no_random_shape else [
            YOLO3DefaultTrainTransform(
                x * 32, x * 32, net=async_net, mixup=args.mixup)
            for x in range(10, 20)
        ])
    validation_batchify_fn = None
    validation_transforms = None
    if args.validation:
        validation_batchify_fn = Tuple(Stack(), Pad(pad_val=-1))
        validation_transforms = YOLO3DefaultValTransform(
            args.data_shape, args.data_shape)

    logger.info(args)
    logger.info(f"Start training from [Epoch {args.start_epoch}]")
    prev_best_score = float("-inf")
    best_epoch = args.start_epoch
    logger.info("Sleeping for 3s in case training data file not yet ready")
    time.sleep(3)
    for epoch in range(args.start_epoch, args.epochs):
        #         if args.mixup:
        #             # TODO(zhreshold): more elegant way to control mixup during runtime
        #             try:
        #                 train_data._dataset.set_mixup(np.random.beta, 1.5, 1.5)
        #             except AttributeError:
        #                 train_data._dataset._data.set_mixup(np.random.beta, 1.5, 1.5)
        #             if epoch >= args.epochs - args.no_mixup_epochs:
        #                 try:
        #                     train_data._dataset.set_mixup(None)
        #                 except AttributeError:
        #                     train_data._dataset._data.set_mixup(None)

        tic = time.time()
        btic = time.time()
        mx.nd.waitall()
        net.hybridize()

        logger.debug(
            f'Input data dir contents: {os.listdir("/opt/ml/input/data/")}')
        train_data_gen = pipe_detection_minibatch(
            epoch, channel=args.train, batch_size=args.stream_batch_size)
        for ix_streambatch, train_dataset in enumerate(train_data_gen):
            # TODO: Mixup is kinda rubbish if it's only within a (potentially small) batch
            if args.mixup:
                train_dataset = MixupDetection(train_dataset)

            # Create dataloader for the stream-batch:
            if args.no_random_shape:
                logger.debug(
                    "Creating train DataLoader without random transform")
                train_dataloader = gluon.data.DataLoader(
                    train_dataset.transform(train_transforms),
                    batch_size=args.batch_size,
                    batchify_fn=train_batchify_fn,
                    last_batch="discard",
                    num_workers=args.num_workers,
                    shuffle=True,
                )
            else:
                logger.debug("Creating train DataLoader with random transform")
                train_dataloader = RandomTransformDataLoader(
                    train_transforms,
                    train_dataset,
                    interval=10,
                    batch_size=args.batch_size,
                    batchify_fn=train_batchify_fn,
                    last_batch="discard",
                    num_workers=args.num_workers,
                    shuffle=True,
                )

            if args.mixup:
                logger.debug("Shuffling stream-batch")
                # TODO(zhreshold): more elegant way to control mixup during runtime
                try:
                    train_dataloader._dataset.set_mixup(
                        np.random.beta, 1.5, 1.5)
                except AttributeError:
                    train_dataloader._dataset._data.set_mixup(
                        np.random.beta, 1.5, 1.5)
                if epoch >= args.epochs - args.no_mixup_epochs:
                    try:
                        train_dataloader._dataset.set_mixup(None)
                    except AttributeError:
                        train_dataloader._dataset._data.set_mixup(None)

            logger.debug(
                f"Training on stream-batch {ix_streambatch} ({len(train_dataset)} records)"
            )
            # TODO: Improve stream-batching robustness to drop loop guard clauses
            # While it would be nice to simply `for i, batch in enumerate(train_dataloader):`,
            # corrupted image buffers are somehow sneaking through the stream-batch at the moment.
            #
            # For now, we catch and tolerate these errors - trying to resume stream-batch process
            # where possible and otherwise discarding the remainder of the stream-batch :-(
            done = False
            i = -1
            dataiter = iter(train_dataloader)
            while not done:
                i += 1
                batch = None
                while not batch:
                    try:
                        batch = next(dataiter)
                    except StopIteration:
                        done = True
                        break
                    except ValueError:
                        # Some problem with the minibatch prevented loading - try the next
                        logger.warning(
                            f"[Epoch {epoch}][Streambatch {ix_streambatch}] "
                            f"Failed to load minibatch {i}, trying next...")
                        i += 1
                    except Exception:
                        logger.error(
                            f"[Epoch {epoch}][Streambatch {ix_streambatch}] "
                            f"Failed to iterate minibatch {i}: Discarding remainder"
                        )
                        break

                if not batch:
                    logger.debug(
                        f"[Epoch {epoch}][Streambatch {ix_streambatch}] "
                        f"Done after {i} minibatches")
                    break
                logger.debug(
                    f"Epoch {epoch}, stream batch {ix_streambatch}, minibatch {i}"
                )

                batch_size = batch[0].shape[0]
                data = gluon.utils.split_and_load(batch[0],
                                                  ctx_list=ctx,
                                                  batch_axis=0,
                                                  even_split=False)
                # objectness, center_targets, scale_targets, weights, class_targets
                fixed_targets = [
                    gluon.utils.split_and_load(batch[it],
                                               ctx_list=ctx,
                                               batch_axis=0,
                                               even_split=False)
                    for it in range(1, 6)
                ]
                gt_boxes = gluon.utils.split_and_load(batch[6],
                                                      ctx_list=ctx,
                                                      batch_axis=0,
                                                      even_split=False)
                loss_trackers = tuple([] for metric in train_metrics)
                with autograd.record():
                    for ix, x in enumerate(data):
                        losses_raw = net(x, gt_boxes[ix],
                                         *[ft[ix] for ft in fixed_targets])
                        # net outputs: [obj_loss, center_loss, scale_loss, cls_loss]
                        # Each a mx.ndarray 1xbatch_size. This is the same order as our
                        # train_metrics, so we just need to add a total vector:
                        total_loss = sum(losses_raw)
                        losses = losses_raw + [total_loss]

                        # If any sample's total loss is non-finite, sum will be:
                        if not isfinite(sum(total_loss)):
                            logger.error(
                                f"[Epoch {epoch}][Streambatch {ix_streambatch}][Minibatch {i}] "
                                f"got non-finite losses: {losses_raw}")
                            # TODO: Terminate training if losses or gradient go infinite?

                        for ix in train_metric_ixs:
                            loss_trackers[ix].append(losses[ix])

                    autograd.backward(loss_trackers[target_metric_ix])
                trainer.step(batch_size)
                for ix in train_metric_ixs:
                    train_metrics[ix].update(0, loss_trackers[ix])

                if args.log_interval and not (i + 1) % args.log_interval:
                    train_metrics_current = map(lambda metric: metric.get(),
                                                train_metrics)
                    metrics_msg = "; ".join([
                        f"{name}={val:.3f}"
                        for name, val in train_metrics_current
                    ])
                    logger.info(
                        f"[Epoch {epoch}][Streambatch {ix_streambatch}][Minibatch {i}] "
                        f"LR={trainer.learning_rate:.2E}; "
                        f"Speed={batch_size/(time.time()-btic):.3f} samples/sec; {metrics_msg};"
                    )
                btic = time.time()

        train_metrics_current = map(lambda metric: metric.get(), train_metrics)
        metrics_msg = "; ".join(
            [f"{name}={val:.3f}" for name, val in train_metrics_current])
        logger.info(
            f"[Epoch {epoch}] TrainingCost={time.time()-tic:.3f}; {metrics_msg};"
        )

        if not (epoch + 1) % args.val_interval:
            logger.info(f"Validating [Epoch {epoch}]")

            metric_names, metric_values = validate(
                net, args.validation, epoch, ctx,
                VOC07MApMetric(iou_thresh=0.5), validation_transforms,
                validation_batchify_fn, args)
            if isinstance(metric_names, list):
                val_msg = "; ".join(
                    [f"{k}={v}" for k, v in zip(metric_names, metric_values)])
                current_score = float(metric_values[-1])
            else:
                val_msg = f"{metric_names}={metric_values}"
                current_score = metric_values
            logger.info(f"[Epoch {epoch}] Validation: {val_msg};")
        else:
            current_score = float("-inf")

        save_progress(net, current_score, prev_best_score, args.model_dir,
                      epoch, args.checkpoint_interval, args.checkpoint_dir)
        if current_score > prev_best_score:
            prev_best_score = current_score
            best_epoch = epoch

        if (args.early_stopping and epoch >= args.early_stopping_min_epochs
                and (epoch - best_epoch) >= args.early_stopping_patience):
            logger.info(
                f"[Epoch {epoch}] No improvement since epoch {best_epoch}: Stopping early"
            )
            break
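
save_progress() is called above but not defined in this excerpt. A minimal sketch of what such a helper might do, given the arguments it receives (the file names and layout are assumptions):

import os

def save_progress(net, current_score, prev_best_score, model_dir, epoch,
                  checkpoint_interval, checkpoint_dir):
    """Hypothetical helper: keep the best weights in model_dir and write
    periodic checkpoints to checkpoint_dir."""
    if current_score > prev_best_score:
        net.save_parameters(os.path.join(model_dir, 'model-best.params'))
    if checkpoint_interval and checkpoint_dir and (epoch + 1) % checkpoint_interval == 0:
        net.save_parameters(os.path.join(checkpoint_dir, 'model-%04d.params' % epoch))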