Example #1
def train():
    best_val = float("Inf")
    for epoch in range(args.epochs):
        total_L = 0.0
        start_time = time.time()
        hidden = model.begin_state(func=mx.nd.zeros,
                                   batch_size=args.batch_size,
                                   ctx=context)
        for ibatch, i in enumerate(range(0, train_data.shape[0] - 1,
                                         args.bptt)):
            data, target = get_batch(train_data, i)
            hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                L = loss(output, target)
                L.backward()

            grads = [i.grad(context) for i in model.collect_params().values()]
            # Here gradient is for the whole batch.
            # So we multiply max_norm by batch_size and bptt to balance it.
            gluon.utils.clip_global_norm(
                grads, args.clip * args.bptt * args.batch_size)

            trainer.step(args.batch_size)
            total_L += mx.nd.sum(L).asscalar()

            if ibatch % args.log_interval == 0 and ibatch > 0:
                cur_L = total_L / args.bptt / args.batch_size / args.log_interval
                print('[Epoch %d Batch %d] loss %.2f, ppl %.2f' %
                      (epoch, ibatch, cur_L, math.exp(cur_L)))
                total_L = 0.0

        val_L = eval(val_data)

        print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f' %
              (epoch, time.time() - start_time, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            test_L = eval(test_data)
            model.collect_params().save(args.save)
            print('test loss %.2f, test ppl %.2f' % (test_L, math.exp(test_L)))
        else:
            args.lr = args.lr * 0.25
            trainer._init_optimizer('sgd', {
                'learning_rate': args.lr,
                'momentum': 0,
                'wd': 0
            })
            model.collect_params().load(args.save, context)
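
Example #1 (and most of the snippets below) calls get_batch and detach without defining them. A minimal sketch of both, assuming they follow the standard MXNet word_language_model Gluon helpers; args is the snippets' argparse namespace, and variants of these also appear inside Examples #10 and #12:

def get_batch(source, i):
    # source is laid out time-major: (num_steps, batch_size)
    seq_len = min(args.bptt, source.shape[0] - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len]
    return data, target.reshape((-1,))

def detach(hidden):
    # Break the autograd graph so truncated BPTT stops at the current window.
    if isinstance(hidden, (tuple, list)):
        hidden = [h.detach() for h in hidden]
    else:
        hidden = hidden.detach()
    return hidden
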
Example #2
def train():
    best_val = None
    for epoch in range(args.epochs):
        total_L = 0.0
        start_time = time.time()
        hidden = model.begin_state(func=mx.nd.zeros, batch_size=args.batch_size, ctx=context)
        for ibatch, i in enumerate(range(0, train_data.shape[0] - 1, args.bptt)):
            data, target = get_batch(train_data, i)
            hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                L = loss(output, target)
                L.backward()

            grads = [i.grad(context) for i in model.collect_params().values()]
            # Here gradient is not divided by batch_size yet.
            # So we multiply max_norm by batch_size to balance it.
            gluon.utils.clip_global_norm(grads, args.clip * args.batch_size)

            trainer.step(args.batch_size)
            total_L += mx.nd.sum(L).asscalar()

            if ibatch % args.log_interval == 0 and ibatch > 0:
                cur_L = total_L / args.batch_size / args.bptt / args.log_interval
                print('[Epoch %d Batch %d] loss %.2f, ppl %.2f'%(
                    epoch, ibatch, cur_L, math.exp(cur_L)))
                total_L = 0.0

        val_L = eval(val_data)

        print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f'%(
            epoch, time.time()-start_time, val_L, math.exp(val_L)))
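
Examples #1 and #2 report validation loss through an eval helper that is not shown. A minimal single-context sketch, assuming it averages the summed cross-entropy over every evaluated token and reuses the module-level model, loss, args and context from the snippets (as in the snippets, the name shadows Python's built-in eval):

def eval(data_source):
    total_L = 0.0
    ntotal = 0
    hidden = model.begin_state(func=mx.nd.zeros,
                               batch_size=args.batch_size,
                               ctx=context)
    for i in range(0, data_source.shape[0] - 1, args.bptt):
        data, target = get_batch(data_source, i)
        output, hidden = model(data, hidden)
        L = loss(output, target)
        total_L += mx.nd.sum(L).asscalar()
        ntotal += L.size
    return total_L / ntotal
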
Example #3
def train():
    total_loss = 0.0
    start_time = time.time()
    hidden = model.begin_state(func=mx.nd.zeros,
                               batch_size=args.batch_size,
                               ctx=context)
    for batch, i in enumerate(range(0, train_data.shape[0] - 1, args.bptt)):
        data, target = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to the start of the dataset.
        hidden = detach(hidden)
        with autograd.record():
            output, hidden = model(data, hidden)
            loss = criterion(output, target)
        loss.backward()

        grads = [p.grad(context) for p in model.collect_params().values()]
        # grad clipping helps prevent the exploding gradient problem in RNNs / LSTMs.
        gluon.utils.clip_global_norm(grads,
                                     args.clip * args.bptt * args.batch_size)
        trainer.step(args.bptt * args.batch_size)
        total_loss += mx.nd.sum(loss).asscalar() / loss.shape[0]

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} '
                  '| ms/batch {:5.2f} | loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch + 1, batch,
                      len(train_data) // args.bptt, lr,
                      elapsed * 1000 / args.log_interval, cur_loss,
                      math.exp(cur_loss)))
            total_loss = 0.0
            start_time = time.time()
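
Example #3 assumes train_data has already been laid out time-major by a batchify helper (its tail is visible in Example #14), and criterion, epoch and lr come from the enclosing script. A minimal sketch of batchify under that assumption:

def batchify(data, batch_size):
    # Trim the 1-D token stream so it divides evenly by batch_size, then
    # reshape to (num_steps, batch_size) so each column is one sequence.
    nbatch = data.shape[0] // batch_size
    data = data[:nbatch * batch_size]
    data = data.reshape((batch_size, nbatch)).T
    return data
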
Example #4
def train():
    best_val = float("Inf")
    for epoch in range(args.epochs):
        total_L = 0.0
        start_time = time.time()
        hidden = model.begin_state(func=mx.nd.zeros, batch_size=args.batch_size, ctx=context)
        for i, (data, target) in enumerate(train_data):
            data = data.as_in_context(context).T
            target = target.as_in_context(context).T.reshape((-1, 1))
            hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                L = loss(output, target)
                L.backward()

            grads = [p.grad(context) for p in model.collect_params().values()]
            # Here gradient is for the whole batch.
            # So we multiply max_norm by batch_size and bptt to balance it.
            gluon.utils.clip_global_norm(grads, args.clip * args.bptt * args.batch_size)

            trainer.step(args.batch_size)
            total_L += mx.nd.sum(L).asscalar()

            if i % args.log_interval == 0 and i > 0:
                cur_L = total_L / args.bptt / args.batch_size / args.log_interval
                print('[Epoch %d Batch %d] loss %.2f, ppl %.2f'%(
                    epoch, i, cur_L, math.exp(cur_L)))
                total_L = 0.0

        val_L = eval(val_data)

        print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f'%(
            epoch, time.time()-start_time, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            test_L = eval(test_data)
            model.collect_params().save(args.save)
            print('test loss %.2f, test ppl %.2f'%(test_L, math.exp(test_L)))
        else:
            args.lr = args.lr*0.25
            trainer._init_optimizer('sgd',
                                    {'learning_rate': args.lr,
                                     'momentum': 0,
                                     'wd': 0})
            model.collect_params().load(args.save, context)
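
Examples #1 and #4 decay the learning rate by re-running the private trainer._init_optimizer, which also discards any accumulated optimizer state. A sketch of the public alternative (the route Examples #5 and #6 take), keeping the same roll-back-to-best-checkpoint behaviour:

        else:
            args.lr = args.lr * 0.25
            trainer.set_learning_rate(args.lr)
            model.collect_params().load(args.save, context)
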
Example #5
def train():
    best_val = float("Inf")
    for epoch in range(args.epochs):
        total_L = 0.0
        start_time = time.time()
        hidden = model.begin_state(func=mx.nd.zeros,
                                   batch_size=args.batch_size,
                                   ctx=context)

        # add time checkpoint for logging
        before_time = start_time

        for i, (data, target) in enumerate(train_data):
            data = data.as_in_context(context).T
            target = target.as_in_context(context).T.reshape((-1, 1))
            hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                # Here L is a vector of length batch_size * bptt
                L = loss(output, target)
                L = L / (args.bptt * args.batch_size)
                L.backward()

            grads = [p.grad(context) for p in model.collect_params().values()]
            gluon.utils.clip_global_norm(grads, args.clip)

            trainer.step(1)
            total_L += mx.nd.sum(L).asscalar()

            if i % args.log_interval == 0 and i > 0:

                # log interval latency print
                batch_latency = time.time() - before_time
                before_time = time.time()

                cur_L = total_L / args.log_interval
                print(
                    '[Epoch %d Batch %d] loss %.2f, ppl %.2f, batch_latency %.4f'
                    % (epoch, i, cur_L, math.exp(cur_L), batch_latency))
                total_L = 0.0

            if args.export_model:
                model.export('model')
                return

        val_L = eval(val_data)

        print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f' %
              (epoch, time.time() - start_time, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            test_L = eval(test_data)
            model.save_parameters(args.save)
            print('test loss %.2f, test ppl %.2f' % (test_L, math.exp(test_L)))
        else:
            args.lr = args.lr * 0.25
            trainer.set_learning_rate(args.lr)
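
Examples #5 and #6 fold the 1/(bptt * batch_size) normalization into the loss itself, so the plain clip value and trainer.step(1) are enough, whereas Example #3 keeps the raw summed loss and scales the clip threshold and step size instead. A small runnable check (with hypothetical values) that the two conventions hand the optimizer the same effective gradient:

import mxnet as mx

clip, bptt, batch_size = 0.25, 35, 32
g = mx.nd.random.normal(scale=1000.0, shape=(10,))    # stand-in for a raw summed gradient

# Convention of Example #3: clip the raw gradient at a scaled threshold,
# then let trainer.step(bptt * batch_size) rescale it by 1/(bptt * batch_size).
g_a = g.copy()
mx.gluon.utils.clip_global_norm([g_a], clip * bptt * batch_size)
update_a = g_a / (bptt * batch_size)

# Convention of Examples #5/#6: the gradient is already averaged because the
# loss was pre-divided, so clip at clip and let trainer.step(1) use it as-is.
g_b = g / (bptt * batch_size)
mx.gluon.utils.clip_global_norm([g_b], clip)
update_b = g_b

print(mx.nd.max(mx.nd.abs(update_a - update_b)).asscalar())   # ~0.0
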
Example #6
def train():
    best_val = float("Inf")
    for epoch in range(args.epochs):
        total_L = 0.0
        start_time = time.time()
        hidden = model.begin_state(func=mx.nd.zeros, batch_size=args.batch_size, ctx=context)
        for i, (data, target) in enumerate(train_data):
            data = data.as_in_context(context).T
            target = target.as_in_context(context).T.reshape((-1, 1))
            hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                # Here L is a vector of length batch_size * bptt
                L = loss(output, target)
                L = L / (args.bptt * args.batch_size)
                L.backward()

            grads = [p.grad(context) for p in model.collect_params().values()]
            gluon.utils.clip_global_norm(grads, args.clip)

            trainer.step(1)
            total_L += mx.nd.sum(L).asscalar()

            if i % args.log_interval == 0 and i > 0:
                cur_L = total_L / args.log_interval
                print('[Epoch %d Batch %d] loss %.2f, ppl %.2f'%(
                    epoch, i, cur_L, math.exp(cur_L)))
                total_L = 0.0

            if args.export_model:
                model.export('model')
                return

        val_L = eval(val_data)

        print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f'%(
            epoch, time.time()-start_time, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            test_L = eval(test_data)
            model.save_parameters(args.save)
            print('test loss %.2f, test ppl %.2f'%(test_L, math.exp(test_L)))
        else:
            args.lr = args.lr*0.25
            trainer.set_learning_rate(args.lr)
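
Examples #5 and #6 call model.export('model'), which only works on a hybridized block that has already run a forward pass. A minimal sketch of that flow, assuming the model is a HybridBlock as in Example #7:

model.hybridize()                       # cache a symbolic graph on the next forward pass
output, hidden = model(data, hidden)    # one forward pass so there is a graph to export
model.export('model')                   # writes model-symbol.json and model-0000.params
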
Example #7
###############################################################################
# Build the model
###############################################################################

ntokens = len(vocab)
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                       args.nlayers, args.dropout, args.tied)
if args.hybridize:
    model.hybridize(**hybridize_optional)
model.initialize(mx.init.Xavier(), ctx=context)

compression_params = None if args.gctype == 'none' else {
    'type': args.gctype,
    'threshold': args.gcthreshold
}
trainer = gluon.Trainer(model.collect_params(),
                        'sgd', {
                            'learning_rate': args.lr,
                            'momentum': 0,
                            'wd': 0
                        },
                        compression_params=compression_params)
loss = gluon.loss.SoftmaxCrossEntropyLoss()
if args.hybridize:
    loss.hybridize(**hybridize_optional)

###############################################################################
# Training code
###############################################################################
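
Examples #7, #10, #12 and #14 switch on gradient compression through args.gctype and args.gcthreshold. A sketch of the argparse flags such a script would define (the flag names are inferred from the attribute names; the defaults are assumptions):

import argparse

parser = argparse.ArgumentParser(description='MXNet Gluon RNN language model')
parser.add_argument('--gctype', type=str, default='none',
                    help="type of gradient compression to use, '2bit' or 'none'")
parser.add_argument('--gcthreshold', type=float, default=0.5,
                    help='threshold for 2bit gradient compression')
args = parser.parse_args()
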

Example #8
def train(epochs, ctx):
    best_val = float("Inf")

    for epoch in range(epochs):
        total_L = 0.0
        cur_L = 0.0
        tic = time.time()
        hidden_states = [
            model.begin_state(func=mx.nd.zeros,
                              batch_size=args.batch_size // len(ctx),
                              ctx=ctx[i]) for i in range(len(ctx))
        ]
        btic = time.time()
        for ibatch, i in enumerate(range(0, train_data.shape[0] - 1,
                                         args.bptt)):
            # get data batch from the training data
            data_batch, target_batch = get_batch(train_data, i)
            # For RNN we can do within batch multi-device parallelization
            data = gluon.utils.split_and_load(data_batch,
                                              ctx_list=ctx,
                                              batch_axis=1)
            target = gluon.utils.split_and_load(target_batch,
                                                ctx_list=ctx,
                                                batch_axis=1)
            Ls = []
            for (d, t) in zip(data, target):
                # get corresponding hidden state then update hidden
                hidden = detach(hidden_states[d.context.device_id])
                with autograd.record():
                    output, hidden = model(d, hidden)
                    L = loss(output, t.reshape((-1, )))
                    L.backward()
                    Ls.append(L)
                # write back to the record
                hidden_states[d.context.device_id] = hidden

            for c in ctx:
                grads = [i.grad(c) for i in model.collect_params().values()]
                # Here gradient is for the whole batch.
                # So we multiply max_norm by batch_size and bptt to balance it.
                # Also this utility function needs to be applied within the same context
                gluon.utils.clip_global_norm(
                    grads, args.clip * args.bptt * args.batch_size / len(ctx))

            trainer.step(args.batch_size)
            for L in Ls:
                total_L += mx.nd.sum(L).asscalar()

            if ibatch % args.log_interval == 0 and ibatch > 0:
                cur_L = total_L / args.bptt / args.batch_size / args.log_interval
                logging.info(
                    '[Epoch %d Batch %d] Speed: %f samples/sec loss %.2f, ppl %.2f'
                    % (epoch, ibatch, args.batch_size /
                       (time.time() - btic), cur_L, math.exp(cur_L)))
                total_L = 0.0
            btic = time.time()

        logging.info('[Epoch %d] train loss %.2f, train ppl %.2f' %
                     (epoch, cur_L, math.exp(cur_L)))
        logging.info('[Epoch %d] time cost %.2f' % (epoch, time.time() - tic))
        val_L = eval(val_data, ctx)
        logging.info('[Epoch %d] valid loss %.2f, valid ppl %.2f' %
                     (epoch, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            # test_L = eval(test_data, ctx)
            model.collect_params().save('model.params')
            # logging.info('test loss %.2f, test ppl %.2f' % (test_L, math.exp(test_L)))
        else:
            args.lr = args.lr * 0.25
            trainer._init_optimizer('sgd', {
                'learning_rate': args.lr,
                'momentum': 0,
                'wd': 0
            })
            model.collect_params().load('model.params', ctx)
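
Example #8 parallelizes within a batch by splitting the time-major batch along axis 1, one shard per device, and keeping a separate hidden state per device. A small runnable illustration of that split (the device list and shapes are hypothetical):

import mxnet as mx
from mxnet import gluon

ctx = [mx.cpu(0), mx.cpu(1)]                      # stand-ins for the GPU list
batch = mx.nd.zeros((35, 32))                     # (bptt, batch_size), time-major
shards = gluon.utils.split_and_load(batch, ctx_list=ctx, batch_axis=1)
print([(s.shape, s.context) for s in shards])     # two (35, 16) shards, one per device
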
Example #9
    context = [mx.gpu(i) for i in range(args.gpus)]
else:
    context = [mx.cpu(0)]

corpus = data.Corpus(args.data)

args.batch_size *= max(1, args.gpus)
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, args.batch_size)
test_data = batchify(corpus.test, args.batch_size)
n_tokens = len(corpus.dictionary)

model = model.RNNModel(args.model, n_tokens, args.emsize, args.nhid,
                       args.nlayers, args.dropout, args.tied)

model.collect_params().initialize(mx.init.Xavier(), ctx=context)

trainer = gluon.Trainer(model.collect_params(), 'sgd', {
    'learning_rate': args.lr,
    'momentum': 0,
    'wd': 0
})
loss = gluon.loss.SoftmaxCrossEntropyLoss()

###############################################################################
# Train the model
###############################################################################


def train(epochs, ctx):
    best_val = float("Inf")
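
Example #9 opens mid-statement; a plausible reconstruction of the device selection it is cut from (hypothetical, inferred from the args.gpus flag used just below and the else branch that is visible):

if args.gpus > 0:
    context = [mx.gpu(i) for i in range(args.gpus)]
else:
    context = [mx.cpu(0)]
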
Example #10
    return data

train_data = batchify(corpus.train, args.batch_size).as_in_context(context)
val_data = batchify(corpus.valid, args.batch_size).as_in_context(context)
test_data = batchify(corpus.test, args.batch_size).as_in_context(context)


###############################################################################
# Build the model
###############################################################################


ntokens = len(corpus.dictionary)
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                       args.nlayers, args.dropout, args.tied)
model.collect_params().initialize(mx.init.Xavier(), ctx=context)

compression_params = None if args.gctype == 'none' else {'type': args.gctype, 'threshold': args.gcthreshold}
trainer = gluon.Trainer(model.collect_params(), 'sgd',
                        {'learning_rate': args.lr,
                         'momentum': 0,
                         'wd': 0},
                        compression_params=compression_params)
loss = gluon.loss.SoftmaxCrossEntropyLoss()

###############################################################################
# Training code
###############################################################################

def get_batch(source, i):
    seq_len = min(args.bptt, source.shape[0] - 1 - i)
    # target stays 2-D here; train() below splits it across devices and reshapes it per device
    return source[i:i + seq_len], source[i + 1:i + 1 + seq_len]
def train(epochs, ctx):
    best_val = float("Inf")

    for epoch in range(epochs):
        total_L = 0.0
        start_time = time.time()
        hidden_states = [
            model.begin_state(func=mx.nd.zeros, batch_size=args.batch_size // len(ctx), ctx=ctx[i])
            for i in range(len(ctx))
        ]
        for ibatch, i in enumerate(range(0, train_data.shape[0] - 1, args.bptt)):
            # get data batch from the training data
            data_batch, target_batch = get_batch(train_data, i)
            # For RNN we can do within batch multi-device parallelization
            data = gluon.utils.split_and_load(data_batch, ctx_list=ctx, batch_axis=1)
            target = gluon.utils.split_and_load(target_batch, ctx_list=ctx, batch_axis=1)
            Ls = []
            for (d, t) in zip(data, target):
                # get corresponding hidden state then update hidden
                hidden = detach(hidden_states[d.context.device_id])
                with autograd.record():
                    output, hidden = model(d, hidden)
                    L = loss(output, t.reshape((-1,)))
                    L.backward()
                    Ls.append(L)
                # write back to the record
                hidden_states[d.context.device_id] = hidden

            for c in ctx:
                grads = [i.grad(c) for i in model.collect_params().values()]
                # Here gradient is for the whole batch.
                # So we multiply max_norm by batch_size and bptt to balance it.
                # Also this utility function needs to be applied within the same context
                gluon.utils.clip_global_norm(grads, args.clip * args.bptt * args.batch_size / len(ctx))

            trainer.step(args.batch_size)
            for L in Ls:
                total_L += mx.nd.sum(L).asscalar()

            if ibatch % args.log_interval == 0 and ibatch > 0:
                cur_L = total_L / args.bptt / args.batch_size / args.log_interval
                logging.info('[Epoch %d Batch %d] loss %.2f, ppl %.2f' % (
                    epoch, ibatch, cur_L, math.exp(cur_L)))
                total_L = 0.0

        val_L = eval(val_data, ctx)

        logging.info('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f' % (
            epoch, time.time() - start_time, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            test_L = eval(test_data, ctx)
            model.collect_params().save('model.params')
            logging.info('test loss %.2f, test ppl %.2f' % (test_L, math.exp(test_L)))
        else:
            args.lr = args.lr * 0.25
            trainer._init_optimizer('sgd',
                {
                    'learning_rate': args.lr,
                    'momentum': 0,
                    'wd': 0
                }
            )
            model.collect_params().load('model.params', ctx)
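
The per-device clip threshold above scales the base clip value by the number of summed token losses each device actually sees. A small numeric illustration (the values are hypothetical):

clip, bptt, batch_size, num_devices = 0.25, 35, 32, 2
tokens_per_device = bptt * (batch_size // num_devices)    # 35 * 16 = 560 summed losses per device
max_norm = clip * bptt * batch_size / num_devices         # 0.25 * 560 = 140.0
print(tokens_per_device, max_norm)
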
Example #12

###############################################################################
# Build the model
###############################################################################


ntokens = len(vocab)
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                       args.nlayers, args.dropout, args.tied)
if args.hybridize:
    model.hybridize(**hybridize_optional)
model.initialize(mx.init.Xavier(), ctx=context)

compression_params = None if args.gctype == 'none' else {'type': args.gctype, 'threshold': args.gcthreshold}
trainer = gluon.Trainer(model.collect_params(), 'sgd',
                        {'learning_rate': args.lr,
                         'momentum': 0,
                         'wd': 0},
                        compression_params=compression_params)
loss = gluon.loss.SoftmaxCrossEntropyLoss()
if args.hybridize:
    loss.hybridize(**hybridize_optional)

###############################################################################
# Training code
###############################################################################

def detach(hidden):
    if isinstance(hidden, (tuple, list)):
        hidden = [i.detach() for i in hidden]
    else:
        hidden = hidden.detach()
    return hidden
Example #13
    model.hybridize()
    if args.optimizer == 'SGD':
        trainer_params = {
            'learning_rate': args.lr,
            'momentum': 0,
            'wd': args.wdecay
        }
    elif args.optimizer == 'Adam':
        trainer_params = {
            'learning_rate': args.lr,
            'wd': args.wdecay,
            'beta1': 0,
            'beta2': 0.999,
            'epsilon': 1e-9
        }
    trainer = gluon.Trainer(model.collect_params(), args.optimizer,
                            trainer_params)

    load_best_loss = float("Inf")
    if args.continue_exprm:
        load_model()
        load_best_loss, val_time = evaluate(val_data, eval_batch_size)
        load_best_ppl = math.exp(load_best_loss)
        logging.info("Loaded model: val_time {:5.2f}, valid loss {}, ppl {}\
                     ".format(val_time, load_best_loss, load_best_ppl))

    # At any point you can hit Ctrl + C to break out of training early.
    # logging.info(model.summary(nd.zeros((args.bptt, m))))

    try:
        if not args.predict_only:
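
Example #13 is cut just inside its try: block; the comment about Ctrl + C suggests a KeyboardInterrupt guard around training. A hypothetical sketch of how that block might continue (the train entry point is an assumption):

    try:
        if not args.predict_only:
            train()                                  # assumed training entry point
    except KeyboardInterrupt:
        logging.info('Exiting from training early')
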
Example #14
    data = data.reshape((batch_size, nbatch)).T
    return data


train_data = batchify(corpus.train, args.batch_size).as_in_context(context)
val_data = batchify(corpus.valid, args.batch_size).as_in_context(context)
test_data = batchify(corpus.test, args.batch_size).as_in_context(context)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                       args.nlayers, args.dropout, args.tied)
model.collect_params().initialize(mx.init.Xavier(), ctx=context)

compression_params = None if args.gctype == 'none' else {
    'type': args.gctype,
    'threshold': args.gcthreshold
}
trainer = gluon.Trainer(model.collect_params(),
                        'sgd', {
                            'learning_rate': args.lr,
                            'momentum': 0,
                            'wd': 0
                        },
                        compression_params=compression_params)
loss = gluon.loss.SoftmaxCrossEntropyLoss()

###############################################################################
Example #15
else:
    context = [mx.cpu(0)]

corpus = data.Corpus(args.data)

args.batch_size *= max(1, args.gpus)
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, args.batch_size)
test_data = batchify(corpus.test, args.batch_size)
n_tokens = len(corpus.dictionary)


model = model.RNNModel(args.model, n_tokens, args.emsize, args.nhid,
                       args.nlayers, args.dropout, args.tied)

model.collect_params().initialize(mx.init.Xavier(), ctx=context)

trainer = gluon.Trainer(
    model.collect_params(), 'sgd',
    {
        'learning_rate': args.lr,
        'momentum': 0,
        'wd': 0
    }
)
loss = gluon.loss.SoftmaxCrossEntropyLoss()


###############################################################################
# Train the model
###############################################################################
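
Example #15, like Example #9, initializes the parameters on a list of contexts and builds a single Trainer over them; at step time the Trainer aggregates the per-device gradients into one update. A minimal end-to-end sketch of that multi-device pattern (the devices, shapes and toy network are hypothetical):

import mxnet as mx
from mxnet import autograd, gluon

ctx = [mx.cpu(0), mx.cpu(1)]                          # stand-ins for the GPU list
net = gluon.nn.Dense(1)
net.initialize(mx.init.Xavier(), ctx=ctx)
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.1})
loss_fn = gluon.loss.L2Loss()

data = mx.nd.random.normal(shape=(8, 4))
label = mx.nd.random.normal(shape=(8, 1))

with autograd.record():
    losses = [loss_fn(net(d), l)
              for d, l in zip(gluon.utils.split_and_load(data, ctx),
                              gluon.utils.split_and_load(label, ctx))]
for L in losses:
    L.backward()
trainer.step(data.shape[0])                           # one update aggregated over both shards
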