Example #1
def setUpModule():
    mv.init()
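The snippet stops at the module set-up; as the comments in Example #10 note, mv.shutdown() has to be called once at the end, so a unittest module would typically pair this with a teardown. The sketch below is an assumption following that note, not part of the original example:

def tearDownModule():
    # MULTIVERSO: release multiverso resources when the test module finishes (assumed companion to setUpModule)
    mv.shutdown()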
Example #2
def model(x, w_c1, b_c1, w_c2, b_c2, w_h3, b_h3, w_o, b_o):
    c1 = T.maximum(0, conv.conv2d(x, w_c1) + b_c1.dimshuffle('x', 0, 'x', 'x'))
    p1 = downsample.max_pool_2d(c1, (3, 3))

    c2 = T.maximum(0,
                   conv.conv2d(p1, w_c2) + b_c2.dimshuffle('x', 0, 'x', 'x'))
    p2 = downsample.max_pool_2d(c2, (2, 2))

    p2_flat = p2.flatten(2)
    h3 = T.maximum(0, T.dot(p2_flat, w_h3) + b_h3)
    p_y_given_x = T.nnet.softmax(T.dot(h3, w_o) + b_o)
    return p_y_given_x


# MULTIVERSO: you should call mv.init before calling any multiverso APIs
mv.init()
# MULTIVERSO: every process has a distinct worker id
worker_id = mv.worker_id()
workers_num = mv.workers_num()

w_c1 = init_weights((4, 3, 3, 3), name="w_c1")
b_c1 = init_weights((4, ), name="b_c1")
w_c2 = init_weights((8, 4, 3, 3), name="w_c2")
b_c2 = init_weights((8, ), name="b_c2")
w_h3 = init_weights((8 * 4 * 4, 100), name="w_h3")
b_h3 = init_weights((100, ), name="b_h3")
w_o = init_weights((100, 10), name="w_o")
b_o = init_weights((10, ), name="b_o")

params = [w_c1, b_c1, w_c2, b_c2, w_h3, b_h3, w_o, b_o]
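The example ends after declaring the parameters. As a hedged illustration of how worker_id and workers_num are typically used next, following the round-robin split shown in Example #10 (the batch count below is a made-up placeholder, not from the original snippet):

n_train_batches = 100  # hypothetical number of minibatches, for illustration only
my_batches = [i for i in range(n_train_batches) if i % workers_num == worker_id]
print("worker %d of %d owns %d minibatches" % (worker_id, workers_num, len(my_batches)))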
Example #3
def main():
    parser = argparse.ArgumentParser(
        description='Train the deep NMT model.',
        fromfile_prefix_chars='@',
    )

    parser.add_argument('-R', action="store_false", default=True, dest='reload',
                        help='Reload old model, default to True, set to False')
    parser.add_argument('-d', action='store_true', default=False, dest='dump_before_train',
                        help='Dump before train default to False, set to True')
    parser.add_argument('--lr', action="store", metavar="learning_rate", dest="learning_rate", type=float, default=1.0,
                        help='Start learning rate, default is %(default)s')
    parser.add_argument('--optimizer', action='store', default='adadelta')
    parser.add_argument('--plot', action='store', default=None,
                        help='Plot filename, default is None (not plot) (deprecated).')
    parser.add_argument('--save_freq', action='store', default=10000, type=int, dest='save_freq',
                        help='Model save frequency, default is %(default)s')
    parser.add_argument('--dev_bleu_freq', action='store', default=20000, type=int, dest='dev_bleu_freq',
                        help='Get dev set BLEU frequency, default is %(default)s')
    parser.add_argument('--dim', action='store', default=512, type=int, dest='dim',
                        help='Dim of hidden units, default is %(default)s')
    parser.add_argument('--bs', action='store', default=128, type=int, dest='batch_size',
                        help='Train batch size, default is %(default)s')
    parser.add_argument('--valid_bs', action='store', default=128, type=int, dest='valid_batch_size',
                        help='Valid batch size, default is %(default)s')
    parser.add_argument('--dim_word', action='store', default=512, type=int, dest='dim_word',
                        help='Dim of word embedding, default is %(default)s')
    parser.add_argument('--maxlen', action='store', default=80, type=int, dest='maxlen',
                        help='Max sentence length, default is %(default)s')
    parser.add_argument('-S', action='store_false', default=True, dest='shuffle',
                        help='Shuffle data per epoch, default is True, set to False')
    parser.add_argument('--train1', action='store', metavar='filename', dest='train1', type=str,
                        default='filtered_en-fr.en',
                        help='Source train file, default is %(default)s')
    parser.add_argument('--train2', action='store', metavar='filename', dest='train2', type=str,
                        default='filtered_en-fr.fr',
                        help='Target train file, default is %(default)s')
    parser.add_argument('--small1', action='store', metavar='filename', dest='small1', type=str,
                        default='small_en-fr.en',
                        help='Source small train file, default is %(default)s')
    parser.add_argument('--small2', action='store', metavar='filename', dest='small2', type=str,
                        default='small_en-fr.fr',
                        help='Target small train file, default is %(default)s')
    parser.add_argument('--valid1', action='store', metavar='filename', dest='valid1', type=str,
                        default='dev_en.tok',
                        help='Source valid file, default is %(default)s')
    parser.add_argument('--valid2', action='store', metavar='filename', dest='valid2', type=str,
                        default='dev_fr.tok',
                        help='Target valid file, default is %(default)s')
    parser.add_argument('--dic1', action='store', metavar='filename', dest='dic1', type=str,
                        default='filtered_dic_en-fr.en.pkl',
                        help='Source dict file, default is %(default)s')
    parser.add_argument('--dic2', action='store', metavar='filename', dest='dic2', type=str,
                        default='filtered_dic_en-fr.fr.pkl',
                        help='Target dict file, default is %(default)s')
    parser.add_argument('--n_words_src', action='store', default=30000, type=int, dest='n_words_src',
                        help='Vocabulary size on the source side, default is %(default)s')
    parser.add_argument('--n_words_tgt', action='store', default=30000, type=int, dest='n_words_tgt',
                        help='Vocabulary size on the target side, default is %(default)s')

    parser.add_argument('model_file', nargs='?', default='model/baseline/baseline.npz',
                        help='Generated model file, default is "%(default)s"')
    parser.add_argument('pre_load_file', nargs='?', default='model/en2fr.iter160000.npz',
                        help='Pre-load model file, default is "%(default)s"')
    parser.add_argument('--src_vocab_map', action='store', metavar='filename', dest='src_vocab_map_file', type=str,
                        default=None, help='The file containing source vocab mapping information, '
                                           'used to initialize a model on a large dataset from a small one')
    parser.add_argument('--tgt_vocab_map', action='store', metavar='filename', dest='tgt_vocab_map_file', type=str,
                        default=None, help='The file containing target vocab mapping information, '
                                           'used to initialize a model on a large dataset from a small one')

    parser.add_argument('--enc', action='store', default=1, type=int, dest='n_encoder_layers',
                        help='Number of encoder layers, default is 1')
    parser.add_argument('--dec', action='store', default=1, type=int, dest='n_decoder_layers',
                        help='Number of decoder layers, default is 1')
    parser.add_argument('--conn', action='store', default=2, type=int, dest='connection_type',
                        help='Connection type, '
                             'default is 2 (bidirectional only in the first layer, other layers are forward); '
                             '1 is divided bidirectional GRU')
    parser.add_argument('--max_epochs', action='store', default=100, type=int, dest='max_epochs',
                        help='Maximum number of epochs, default is 100')
    parser.add_argument('--unit', action='store', metavar='unit', dest='unit', type=str, default='lstm',
                        help='The unit type, default is "lstm", can be set to "gru".')
    parser.add_argument('--attention', action='store', metavar='index', dest='attention_layer_id', type=int, default=0,
                        help='Attention layer index, default is 0')
    parser.add_argument('--residual_enc', action='store', metavar='type', dest='residual_enc', type=str, default=None,
                        help='Residual connection of encoder, default is None, candidates are "layer_wise", "last"')
    parser.add_argument('--residual_dec', action='store', metavar='type', dest='residual_dec', type=str,
                        default='layer_wise',
                        help='Residual connection of decoder, default is "layer_wise", candidates are None, "last"')
    parser.add_argument('-z', '--zigzag', action='store_false', default=True, dest='use_zigzag',
                        help='Use zigzag in encoder, default is True, set to False')
    parser.add_argument('--dropout', action="store", metavar="dropout", dest="dropout", type=float, default=False,
                        help='Dropout rate, default is False (no dropout)')
    parser.add_argument('--unit_size', action='store', default=2, type=int, dest='unit_size',
                        help='Unit size, default is %(default)s')
    # TODO: rename this option to decoder_unit_size in future
    parser.add_argument('--cond_unit_size', action='store', default=2, type=int, dest='cond_unit_size',
                        help='Decoder unit size (will be renamed in the future), default is %(default)s')
    parser.add_argument('--clip', action='store', metavar='clip', dest='clip', type=float, default=1.0,
                        help='Gradient clip rate, default is 1.0.')
    parser.add_argument('--manual', action='store_false', dest='auto', default=True,
                        help='Set dropout rate and grad clip rate manually.')
    parser.add_argument('--emb', action='store', metavar='filename', dest='given_embedding', type=str, default=None,
                        help='Given embedding model file, default is None')
    parser.add_argument('--lr_discount', action='store', metavar='freq', dest='lr_discount_freq', type=int,
                        default=-1, help='The learning rate discount frequency, default is -1')

    parser.add_argument('--distribute', action='store', metavar='type', dest='dist_type', type=str, default=None,
                        help='The distributed mode, default is None (single GPU mode), candidates are "mv", "mpi_reduce"')
    parser.add_argument('--nccl', action="store_true", default=False, dest='nccl',
                        help='Use NCCL in distributed mode, default to False, set to True')
    parser.add_argument('--clip_grads_local', action="store_true", default=False, dest='clip_grads_local',
                        help='Whether to clip grads in distributed mode, default to False, set to True')
    parser.add_argument('--recover_lr_iter', action='store', dest='dist_recover_lr', type=int, default=10000,
                        help='The mini-batch index at which to recover the learning rate in distributed mode, default is 10000.')

    parser.add_argument('--all_att', action='store_true', dest='all_att', default=False,
                        help='Generate attention from all decoder layers, default is False, set to True')
    parser.add_argument('--avg_ctx', action='store_true', dest='avg_ctx', default=False,
                        help='Average all context vectors to get softmax, default is False, set to True')
    parser.add_argument('--dataset', action='store', dest='dataset', default='en-fr',
                        help='Dataset, default is "%(default)s"')
    parser.add_argument('--gpu_map_file', action='store', metavar='filename', dest='gpu_map_file', type=str,
                        default=None, help='The file containing gpu id mapping information, '
                                           'each line is in the form "physical_gpu_id theano_id"')
    parser.add_argument('--ft_patience', action='store', metavar='N', dest='fine_tune_patience', type=int, default=-1,
                        help='Fine tune patience, default is %(default)s, set 8 to enable it')
    parser.add_argument('--valid_freq', action='store', metavar='N', dest='valid_freq', type=int, default=5000,
                        help='Validation frequency, default is 5000')
    parser.add_argument('--trg_att', action='store', metavar='N', dest='trg_attention_layer_id', type=int, default=None,
                        help='Target attention layer id, default is None (not use target attention)')
    parser.add_argument('--fix_dp_bug', action="store_true", default=False, dest='fix_dp_bug',
                        help='Fix previous dropout bug, default to False, set to True')
    parser.add_argument('--abandon_imm', action="store_true", default=False, dest='abandon_imm',
                        help='Do not load previous immediate params; loading defaults to True, set this flag to disable it')
    parser.add_argument('--tp', action="store", metavar="temperature", dest="temperature", type=float, default=1.0,
                        help='temperature, default is %(default)s')
    parser.add_argument('--scale', action="store", metavar="scale", dest="scale", type=float, default=1.0,
                        help='scale, default is %(default)s')
    parser.add_argument('--gate_dp', action="store", metavar="gate_dropout", dest="gate_dropout", type=float, default=1.0,
                        help='gate_dropout, default is %(default)s')

    args = parser.parse_args()
    print args

    if args.residual_enc == 'None':
        args.residual_enc = None
    if args.residual_dec == 'None':
        args.residual_dec = None
    if args.dist_type != 'mv' and args.dist_type != 'mpi_reduce':
        args.dist_type = None

    # FIXME: Auto mode
    if args.auto:
        if args.n_encoder_layers <= 2:
            args.dropout = False
            args.clip = 1.0
        else:
            args.dropout = 0.1
            args.clip = 5.0

        if args.n_encoder_layers <= 1:
            args.residual_enc = None
        if args.n_decoder_layers <= 1:
            args.residual_dec = None
            args.attention_layer_id = 0

        args.cond_unit_size = args.unit_size

    # If dataset is not 'en-fr', the old values of dataset options like 'args.train1' will be overridden
    if args.dataset != 'en-fr':
        args.train1, args.train2, args.small1, args.small2, args.valid1, args.valid2, args.valid3, args.test1, args.test2, args.dic1, args.dic2 = \
            Datasets[args.dataset]

    print 'Command line arguments:'
    print args
    sys.stdout.flush()

    # Init multiverso or mpi and set theano flags.
    if args.dist_type == 'mv':
        try:
            import multiverso as mv
        except ImportError:
            import libs.multiverso_ as mv

        # FIXME: This must come before the import of theano!
        mv.init(sync=True)
        worker_id = mv.worker_id()
        workers_cnt = mv.workers_num()
    elif args.dist_type == 'mpi_reduce':
        from mpi4py import MPI

        communicator = MPI.COMM_WORLD
        worker_id = communicator.Get_rank()
        workers_cnt = communicator.Get_size()

    if args.dist_type:
        available_gpus = get_gpu_usage(workers_cnt)
        gpu_maps_info = {idx: idx for idx in available_gpus}
        if args.gpu_map_file:
            for line in open(os.path.join('resources', args.gpu_map_file), 'r'):
                phy_id, theano_id = line.split()
                gpu_maps_info[int(phy_id)] = int(theano_id)
        theano_id = gpu_maps_info[available_gpus[worker_id]]
        print 'worker id:%d, using theano id:%d, physical id %d' % (worker_id, theano_id, available_gpus[worker_id])
        os.environ['THEANO_FLAGS'] = 'device=cuda{},floatX=float32'.format(theano_id)
        sys.stdout.flush()

    from libs.nmt import train

    train(
        max_epochs=args.max_epochs,
        saveto=args.model_file,
        preload=args.pre_load_file,
        reload_=args.reload,
        dim_word=args.dim_word,
        dim=args.dim,
        decay_c=0.,
        clip_c=args.clip,
        lrate=args.learning_rate,
        optimizer=args.optimizer,
        maxlen=args.maxlen,
        batch_size=args.batch_size,
        valid_batch_size=args.valid_batch_size,
        dispFreq=1,
        saveFreq=args.save_freq,
        validFreq=args.valid_freq,
        datasets=(r'data/train/{}'.format(args.train1),
                  r'data/train/{}'.format(args.train2)),
        valid_datasets=(r'data/dev/{}'.format(args.valid1),
                        r'data/dev/{}'.format(args.valid2)),
        small_train_datasets=(r'data/test/{}'.format(args.small1),
                              r'data/test/{}'.format(args.small2),
                              r'data/test/{}'.format(args.test2)),
        vocab_filenames=(r'data/dic/{}'.format(args.dic1),
                         r'data/dic/{}'.format(args.dic2)),
        task=args.dataset,
        use_dropout=args.dropout,
        overwrite=False,
        n_words=args.n_words_tgt,
        n_words_src=args.n_words_src,

        # Options from v-yanfa
        dump_before_train=args.dump_before_train,
        plot_graph=args.plot,
        lr_discount_freq=args.lr_discount_freq,

        n_encoder_layers=args.n_encoder_layers,
        n_decoder_layers=args.n_decoder_layers,
        encoder_many_bidirectional=args.connection_type == 1,

        attention_layer_id=args.attention_layer_id,
        unit=args.unit,
        residual_enc=args.residual_enc,
        residual_dec=args.residual_dec,
        use_zigzag=args.use_zigzag,
        given_embedding=args.given_embedding,

        unit_size=args.unit_size,
        cond_unit_size=args.cond_unit_size,

        given_imm=not args.abandon_imm,
        dump_imm=True,
        shuffle_data=args.shuffle,

        decoder_all_attention=args.all_att,
        average_context=args.avg_ctx,

        dist_type=args.dist_type,
        dist_recover_lr_iter=args.dist_recover_lr,

        fine_tune_patience=args.fine_tune_patience,
        nccl=args.nccl,
        src_vocab_map_file=args.src_vocab_map_file,
        tgt_vocab_map_file=args.tgt_vocab_map_file,

        trg_attention_layer_id=args.trg_attention_layer_id,
        dev_bleu_freq=args.dev_bleu_freq,
        fix_dp_bug=args.fix_dp_bug,
        temperature=args.temperature,
        scale=args.scale,
        gate_dropout=args.gate_dropout,
    )
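For reference, the --gpu_map_file option above expects one whitespace-separated "physical_gpu_id theano_id" pair per line, read from a file under resources/ by the parsing loop in main(). A hypothetical two-worker mapping (the ids are illustrative only) that maps physical GPU 2 to theano device 0 and physical GPU 3 to theano device 1 would be:

2 0
3 1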
Example #4
args = parser.parse_args()

if args.gpus is None or args.gpus == '':
    args.gpus = '0'
args.cuda = args.gpus != '-1' and torch.cuda.is_available()
# keep devs defined even in CPU mode so the parallel check below is safe
devs = [int(i) for i in args.gpus.split(',')] if args.cuda else []
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

args.parallel = len(devs) > 1
if args.parallel:
    import multiverso as mv
    from multiverso.torch_ext import torchmodel
    mv.init(sync=True, updater=b"sgd")

kwargs = {'num_workers': 1, 'pin_memory': False} if args.cuda else {}

train_loader = torch.utils.data.DataLoader(datasets.CIFAR10(
    '../data',
    train=True,
    download=True,
    transform=transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, padding=4),
        transforms.ToTensor(),
        transforms.Normalize(mean=[x / 255 for x in [125.3, 123.0, 113.9]],
                             std=[x / 255 for x in [63.0, 62.1, 66.7]])
    ])),
                                           batch_size=args.batch_size,
                                           shuffle=True,
                                           **kwargs)  # assumed completion; the original snippet is truncated here
Example #5
 False)",
                    default=128)
parser.add_argument('-e',
                    '--epoches',
                    type=int,
                    help="Number of epoches(default:\
 82)",
                    default=82)
args = parser.parse_args()
print(args)

# MULTIVERSO: import multiverso
import multiverso as mv

# MULTIVERSO: you should call mv.init before calling any multiverso APIs
mv.init(sync=args.sync)
# MULTIVERSO: every process has a distinct worker id
worker_id = mv.worker_id()
# MULTIVERSO: mv.workers_num will return the number of workers
workers_num = mv.workers_num()
# NOTICE: To use multiple GPUs, we must set the environment variable before importing theano.
if "THEANO_FLAGS" not in os.environ:
    os.environ[
        "THEANO_FLAGS"] = 'floatX=float32,device=gpu%d,lib.cnmem=1' % worker_id

import numpy as np
import theano
import theano.tensor as T
import lasagne
from multiverso.theano_ext.lasagne_ext import param_manager
Example #6
import torchvision
import torchvision.transforms as transforms

import os
import argparse

from models import *

from torch.autograd import Variable

import numpy as np
import multiverso as mv
from multiverso.torch_ext import torchmodel

mv.init(sync=False, updater=b"sgd")

parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Training')
parser.add_argument('--arch',
                    '-a',
                    metavar='ARCH',
                    default='resnet18',
                    type=str,
                    help='model architecture: (default: resnet18)')
parser.add_argument('--lr',
                    type=float,
                    default=0.1,
                    metavar='LR',
                    help='learning rate (default: 0.1)')
parser.add_argument('--resume',
                    '-r',
                    action='store_true')  # assumed completion; the original snippet is cut off here
Example #7
 0.1)", default=0.1)
parser.add_argument('-s', '--sync', type=bool, help="run multiverso in sync \
 mode (default: False)", default=False)
parser.add_argument('-b', '--batch-size', type=int, help="batch size (default:\
 128)", default=128)
parser.add_argument('-e', '--epoches', type=int, help="Number of epochs (default:\
 82)", default=82)
args = parser.parse_args()
print(args)


# MULTIVERSO: import multiverso
import multiverso as mv

# MULTIVERSO: you should call mv.init before calling any multiverso APIs
mv.init(sync=args.sync)
# MULTIVERSO: every process has a distinct worker id
worker_id = mv.worker_id()
# MULTIVERSO: mv.workers_num will return the number of workers
workers_num = mv.workers_num()
# NOTICE: To use multiple GPUs, we must set the environment variable before importing theano.
if "THEANO_FLAGS" not in os.environ:
    os.environ["THEANO_FLAGS"] = 'floatX=float32,device=gpu%d,lib.cnmem=1' % worker_id

import numpy as np
import theano
import theano.tensor as T
import lasagne
from multiverso.theano_ext.lasagne_ext import param_manager

# for the larger networks (n>=9), we need to adjust Python's recursion limit
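The snippet is cut off right after this comment; the usual way to make that adjustment (the exact value is an assumption, since the original does not show it) is a single standard-library call:

import sys
sys.setrecursionlimit(10000)  # illustrative value; raises Python's default limit for very deep networks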
Example #8

def model(x, w_c1, b_c1, w_c2, b_c2, w_h3, b_h3, w_o, b_o):
    c1 = T.maximum(0, conv.conv2d(x, w_c1) + b_c1.dimshuffle('x', 0, 'x', 'x'))
    p1 = downsample.max_pool_2d(c1, (3, 3))

    c2 = T.maximum(0, conv.conv2d(p1, w_c2) + b_c2.dimshuffle('x', 0, 'x', 'x'))
    p2 = downsample.max_pool_2d(c2, (2, 2))

    p2_flat = p2.flatten(2)
    h3 = T.maximum(0, T.dot(p2_flat, w_h3) + b_h3)
    p_y_given_x = T.nnet.softmax(T.dot(h3, w_o) + b_o)
    return p_y_given_x

# MULTIVERSO: you should call mv.init before calling any multiverso APIs
mv.init()
# MULTIVERSO: every process has a distinct worker id
worker_id = mv.worker_id()
workers_num = mv.workers_num()


w_c1 = init_weights((4, 3, 3, 3), name="w_c1")
b_c1 = init_weights((4,), name="b_c1")
w_c2 = init_weights((8, 4, 3, 3), name="w_c2")
b_c2 = init_weights((8,), name="b_c2")
w_h3 = init_weights((8 * 4 * 4, 100), name="w_h3")
b_h3 = init_weights((100,), name="b_h3")
w_o = init_weights((100, 10), name="w_o")
b_o = init_weights((10,), name="b_o")

params = [w_c1, b_c1, w_c2, b_c2, w_h3, b_h3, w_o, b_o]
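Example #8 also stops right after declaring the parameters. A hedged sketch of how model() and params are usually wired into a prediction and a cost expression (X, Y and the cross-entropy cost are assumptions following the standard Theano pattern, not part of the original snippet):

X = T.ftensor4()   # a minibatch of images, e.g. shaped (batch, 3, height, width)
Y = T.fmatrix()    # one-hot labels
p_y_given_x = model(X, *params)                # params follow the argument order expected by model()
y_pred = T.argmax(p_y_given_x, axis=1)         # predicted class per example
cost = T.mean(T.nnet.categorical_crossentropy(p_y_given_x, Y))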
Example #9
def setUpModule():
    mv.init()
Example #10
def sgd_optimization_mnist(learning_rate=0.13,
                           n_epochs=1000,
                           dataset='mnist.pkl.gz',
                           batch_size=600):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # MULTIVERSO: you should call mv.init before calling any multiverso APIs
    mv.init()
    # MULTIVERSO: every process has a distinct worker id
    worker_id = mv.worker_id()

    # MULTIVERSO: mv.workers_num will return the number of workers
    total_worker = mv.workers_num()

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # generate symbolic variables for input (x and y represent a
    # minibatch)
    x = T.matrix('x')  # data, presented as rasterized images
    y = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    # construct the logistic regression class
    # Each MNIST image has size 28*28
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # compute the gradient of cost with respect to theta = (W,b)
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # start-snippet-3
    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # compiling a Theano function `train_model` that returns the cost, but in
    # the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-3

    ###############
    # TRAIN MODEL #
    ###############
    print('... training the model')
    validation_frequency = n_train_batches
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            # MULTIVERSO: we distribute the batches to different workers.
            # A worker will only train the batches that belong to it
            if minibatch_index % total_worker == worker_id:
                minibatch_avg_cost = train_model(minibatch_index)
                # MULTIVERSO: when you want to commit all the parameter deltas
                # produced by mv_shared and fetch the latest parameters from the
                # parameter server, call this function to synchronize the values
                sharedvar.sync_all_mv_shared_vars()

            iter = (epoch - 1) * n_train_batches + minibatch_index

            # MULTIVERSO: only the master worker validates and reports results
            if mv.is_master_worker() and (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                validation_loss = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       validation_loss * 100.))
        # MULTIVERSO: all the workers will synchronize at the place you call barrier
        mv.barrier()

    # MULTIVERSO: You should make sure only one process outputs the result.
    # Otherwise the results will be printed repeatedly
    if mv.is_master_worker():
        end_time = timeit.default_timer()

        test_losses = [test_model(i) for i in range(n_test_batches)]
        test_score = numpy.mean(test_losses)

        print(('Optimization complete with validation score of %f %%, '
               'with test performance %f %%') %
              (validation_loss * 100., test_score * 100.))
        print('The code ran for %d epochs, with %f epochs/sec' %
              (epoch, 1. * epoch / (end_time - start_time)))
        print(('The code for file ' + os.path.split(__file__)[1] +
               ' ran for %.1fs' % ((end_time - start_time))),
              file=sys.stderr)

        # save the model
        with open('model.pkl', 'wb') as f:
            pickle.dump(classifier, f)
    # MULTIVERSO: You must call shutdown at the end of the file
    mv.shutdown()