def paralleltrain(epoch):
    """Run one data-parallel training epoch with Multiverso parameter sync.

    Uses module-level globals: ``model``, ``scheduler``, ``optimizer``,
    ``criterion``, ``train_loader``, ``args``, ``device`` and the ``mv``
    (Multiverso) module.  Each worker trains only on its round-robin share
    of the batches and synchronizes parameters after every step.
    """
    model.train()
    # NOTE(review): scheduler stepped once per epoch, before the batches
    # (old-PyTorch ordering; newer releases expect step() after optimizer.step()).
    scheduler.step()
    for batch_idx, (data, target) in enumerate(train_loader):
        # Round-robin sharding: skip batches that belong to other workers.
        if batch_idx % mv.workers_num() != mv.worker_id():
            continue
        if args.cuda:
            data, target = data.cuda(device), target.cuda(device)
        # Variable(...) / loss.data[0]: pre-0.4 PyTorch API.
        data, target = Variable(data), Variable(target)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        # Multiverso sync is done on CPU tensors, then the model is moved back.
        model.cpu()
        model.mv_sync()
        model.cuda(device)
        # batch_idx / workers_num counts this worker's own steps (py2 int div).
        if (batch_idx / mv.workers_num()) % args.log_interval == 0:
            print(
                'Worker: {}\tTrain Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.
                format(mv.worker_id(), epoch, batch_idx * len(data),
                       len(train_loader.dataset),
                       100. * batch_idx / len(train_loader), loss.data[0]))
    # NOTE(review): placement reconstructed from mangled source — presumably a
    # post-loop sync so workers that got no batch in the final partial round
    # still participate; confirm against the upstream example.
    if batch_idx % mv.workers_num() < mv.worker_id():
        optimizer.zero_grad()
        model.cpu()
        model.mv_sync()
        model.cuda(device)
def train(epoch):
    """Run one data-parallel CIFAR training epoch.

    Uses module-level globals: ``net``, ``optimizer``, ``criterion``,
    ``trainloader``, ``args``, ``use_cuda`` and the ``mv`` (Multiverso)
    module.  Each worker processes its round-robin share of the batches,
    syncing parameters through Multiverso after every optimizer step.
    """
    print('\nEpoch: %d' % epoch)
    net.train()
    # Running totals for this worker's loss/accuracy bookkeeping.
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        # Round-robin sharding: this worker only trains on its own batches.
        if batch_idx % mv.workers_num() == mv.worker_id():
            if use_cuda:
                inputs, targets = inputs.cuda(), targets.cuda()
            optimizer.zero_grad()
            # Variable(...) / loss.data[0]: pre-0.4 PyTorch API.
            inputs, targets = Variable(inputs), Variable(targets)
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            # Multiverso sync runs on CPU tensors, then net returns to GPU.
            net.cpu()
            net.mv_sync()
            net.cuda()
            train_loss += loss.data[0]
            _, predicted = torch.max(outputs.data, 1)
            total += targets.size(0)
            correct += predicted.eq(targets.data).cpu().sum()
            # batch_idx / workers_num counts this worker's steps (py2 int div).
            if (batch_idx / mv.workers_num()) % args.log_interval == 0:
                print(
                    'Worker: {}\tTrain Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'
                    .format(mv.worker_id(), epoch, batch_idx * len(inputs),
                            len(trainloader.dataset),
                            100. * batch_idx / len(trainloader), loss.data[0]))
def test(epoch):
    """Evaluate the model on the full test set and print loss/accuracy.

    Uses module-level globals: ``model``, ``criterion``, ``test_loader``,
    ``args``, ``device`` and (in parallel mode) the ``mv`` module.

    :param epoch: current epoch index (only used by callers for bookkeeping;
        not referenced inside the function body).
    """
    model.eval()
    test_loss = 0
    correct = 0
    for data, target in test_loader:
        if args.cuda:
            data, target = data.cuda(device), target.cuda(device)
        # volatile=True: pre-0.4 PyTorch inference mode (no autograd history).
        data, target = Variable(data, volatile=True), Variable(target)
        output = model(data)
        test_loss += criterion(output, target).data[0]
        # get the index of the max log-probability
        pred = output.data.max(1)[1]
        correct += pred.eq(target.data).cpu().sum()
    # loss function already averages over batch size, so divide by the
    # number of batches.  (Removed a redundant no-op self-assignment that
    # preceded this line in the original.)
    test_loss /= len(test_loader)
    if args.parallel:
        print(
            '\nWorker: {}\tTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'
            .format(mv.worker_id(), test_loss, correct,
                    len(test_loader.dataset),
                    100. * correct / len(test_loader.dataset)))
    else:
        print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.
              format(test_loss, correct, len(test_loader.dataset),
                     100. * correct / len(test_loader.dataset)))
def test(epoch):
    """Evaluate the CIFAR net on the test set and checkpoint the best model.

    Uses module-level globals: ``net``, ``criterion``, ``testloader``,
    ``use_cuda``, ``best_acc`` and the ``mv`` module.  Saves a checkpoint to
    ``./checkpoint/ckpt.t7`` whenever accuracy improves on ``best_acc``.

    :param epoch: current epoch index, stored in the checkpoint.
    """
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(testloader):
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        # volatile=True: pre-0.4 PyTorch inference mode (no autograd history).
        inputs, targets = Variable(inputs, volatile=True), Variable(targets)
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        test_loss += loss.data[0]
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()
    # NOTE(review): test_loss printed here is the summed (not averaged) loss.
    print(
        '\nWorker: {}\tTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'
        .format(mv.worker_id(), test_loss, correct, len(testloader.dataset),
                100. * correct / len(testloader.dataset)))
    # Save checkpoint.
    acc = 100. * correct / total
    if acc > best_acc:
        print('Saving..')
        # Move to CPU before torch.save so the checkpoint is device-agnostic.
        if use_cuda:
            net.cpu()
        state = {
            'net': net,
            'acc': acc,
            'epoch': epoch,
        }
        if not os.path.isdir('checkpoint'):
            os.mkdir('checkpoint')
        torch.save(state, './checkpoint/ckpt.t7')
        best_acc = acc
        # NOTE(review): nesting reconstructed — presumably restores the net to
        # GPU after the CPU-side save; verify against upstream.
        if use_cuda:
            net.cuda()
def main():
    """Parse command-line options for the deep NMT model and launch training.

    Builds the full argparse interface, applies post-processing (auto
    dropout/clip mode, dataset remapping), initializes the distributed
    backend (Multiverso or MPI) *before* Theano is imported so the device
    flag takes effect, then calls :func:`libs.nmt.train`.
    """
    parser = argparse.ArgumentParser(
        description='Train the deep NMT model.',
        fromfile_prefix_chars='@',
    )
    parser.add_argument('-R', action="store_false", default=True, dest='reload',
                        help='Reload old model, default to True, set to False')
    parser.add_argument('-d', action='store_true', default=False, dest='dump_before_train',
                        help='Dump before train default to False, set to True')
    parser.add_argument('--lr', action="store", metavar="learning_rate", dest="learning_rate",
                        type=float, default=1.0,
                        help='Start learning rate, default is %(default)s')
    parser.add_argument('--optimizer', action='store', default='adadelta')
    parser.add_argument('--plot', action='store', default=None,
                        help='Plot filename, default is None (not plot) (deprecated).')
    parser.add_argument('--save_freq', action='store', default=10000, type=int, dest='save_freq',
                        help='Model save frequency, default is %(default)s')
    parser.add_argument('--dev_bleu_freq', action='store', default=20000, type=int, dest='dev_bleu_freq',
                        help='Get dev set BLEU frequency, default is %(default)s')
    parser.add_argument('--dim', action='store', default=512, type=int, dest='dim',
                        help='Dim of hidden units, default is %(default)s')
    parser.add_argument('--bs', action='store', default=128, type=int, dest='batch_size',
                        help='Train batch size, default is %(default)s')
    parser.add_argument('--valid_bs', action='store', default=128, type=int, dest='valid_batch_size',
                        help='Valid batch size, default is %(default)s')
    parser.add_argument('--dim_word', action='store', default=512, type=int, dest='dim_word',
                        help='Dim of word embedding, default is %(default)s')
    parser.add_argument('--maxlen', action='store', default=80, type=int, dest='maxlen',
                        help='Max sentence length, default is %(default)s')
    parser.add_argument('-S', action='store_false', default=True, dest='shuffle',
                        help='Shuffle data per epoch, default is True, set to False')
    parser.add_argument('--train1', action='store', metavar='filename', dest='train1', type=str,
                        default='filtered_en-fr.en',
                        help='Source train file, default is %(default)s')
    parser.add_argument('--train2', action='store', metavar='filename', dest='train2', type=str,
                        default='filtered_en-fr.fr',
                        help='Target train file, default is %(default)s')
    parser.add_argument('--small1', action='store', metavar='filename', dest='small1', type=str,
                        default='small_en-fr.en',
                        help='Source small train file, default is %(default)s')
    parser.add_argument('--small2', action='store', metavar='filename', dest='small2', type=str,
                        default='small_en-fr.fr',
                        help='Target small train file, default is %(default)s')
    parser.add_argument('--valid1', action='store', metavar='filename', dest='valid1', type=str,
                        default='dev_en.tok',
                        help='Source valid file, default is %(default)s')
    parser.add_argument('--valid2', action='store', metavar='filename', dest='valid2', type=str,
                        default='dev_fr.tok',
                        help='Target valid file, default is %(default)s')
    parser.add_argument('--dic1', action='store', metavar='filename', dest='dic1', type=str,
                        default='filtered_dic_en-fr.en.pkl',
                        help='Source dict file, default is %(default)s')
    parser.add_argument('--dic2', action='store', metavar='filename', dest='dic2', type=str,
                        default='filtered_dic_en-fr.fr.pkl',
                        help='Target dict file, default is %(default)s')
    parser.add_argument('--n_words_src', action='store', default=30000, type=int, dest='n_words_src',
                        help='Vocabularies in source side, default is %(default)s')
    parser.add_argument('--n_words_tgt', action='store', default=30000, type=int, dest='n_words_tgt',
                        help='Vocabularies in target side, default is %(default)s')
    parser.add_argument('model_file', nargs='?', default='model/baseline/baseline.npz',
                        help='Generated model file, default is "%(default)s"')
    parser.add_argument('pre_load_file', nargs='?', default='model/en2fr.iter160000.npz',
                        help='Pre-load model file, default is "%(default)s"')
    parser.add_argument('--src_vocab_map', action='store', metavar='filename',
                        dest='src_vocab_map_file', type=str, default=None,
                        help='The file containing source vocab mapping information'
                             'used to initialize a model on large dataset from small one')
    parser.add_argument('--tgt_vocab_map', action='store', metavar='filename',
                        dest='tgt_vocab_map_file', type=str, default=None,
                        help='The file containing target vocab mapping information'
                             'used to initialize a model on large dataset from small one')
    parser.add_argument('--enc', action='store', default=1, type=int, dest='n_encoder_layers',
                        help='Number of encoder layers, default is 1')
    parser.add_argument('--dec', action='store', default=1, type=int, dest='n_decoder_layers',
                        help='Number of decoder layers, default is 1')
    parser.add_argument('--conn', action='store', default=2, type=int, dest='connection_type',
                        help='Connection type, '
                             'default is 2 (bidirectional only in first layer, other layers are forward);'
                             '1 is divided bidirectional GRU')
    parser.add_argument('--max_epochs', action='store', default=100, type=int, dest='max_epochs',
                        help='Maximum epoches, default is 100')
    parser.add_argument('--unit', action='store', metavar='unit', dest='unit', type=str,
                        default='lstm',
                        help='The unit type, default is "lstm", can be set to "gru".')
    parser.add_argument('--attention', action='store', metavar='index', dest='attention_layer_id',
                        type=int, default=0,
                        help='Attention layer index, default is 0')
    parser.add_argument('--residual_enc', action='store', metavar='type', dest='residual_enc',
                        type=str, default=None,
                        help='Residual connection of encoder, default is None, candidates are "layer_wise", "last"')
    parser.add_argument('--residual_dec', action='store', metavar='type', dest='residual_dec',
                        type=str, default='layer_wise',
                        help='Residual connection of decoder, default is "layer_wise", candidates are None, "last"')
    parser.add_argument('-z', '--zigzag', action='store_false', default=True, dest='use_zigzag',
                        help='Use zigzag in encoder, default is True, set to False')
    parser.add_argument('--dropout', action="store", metavar="dropout", dest="dropout",
                        type=float, default=False,
                        help='Dropout rate, default is False (not use dropout)')
    parser.add_argument('--unit_size', action='store', default=2, type=int, dest='unit_size',
                        help='Number of unit size, default is %(default)s')
    # TODO: rename this option to decoder_unit_size in future
    parser.add_argument('--cond_unit_size', action='store', default=2, type=int, dest='cond_unit_size',
                        help='Number of decoder unit size (will rename in future), default is %(default)s')
    parser.add_argument('--clip', action='store', metavar='clip', dest='clip', type=float,
                        default=1.0,
                        help='Gradient clip rate, default is 1.0.')
    parser.add_argument('--manual', action='store_false', dest='auto', default=True,
                        help='Set dropout rate and grad clip rate manually.')
    parser.add_argument('--emb', action='store', metavar='filename', dest='given_embedding',
                        type=str, default=None,
                        help='Given embedding model file, default is None')
    parser.add_argument('--lr_discount', action='store', metavar='freq', dest='lr_discount_freq',
                        type=int, default=-1,
                        help='The learning rate discount frequency, default is -1')
    parser.add_argument('--distribute', action='store', metavar='type', dest='dist_type',
                        type=str, default=None,
                        help='The distribution version, default is None (singe GPU mode), candiates are "mv", "mpi_reduce"')
    parser.add_argument('--nccl', action="store_true", default=False, dest='nccl',
                        help='Use NCCL in distributed mode, default to False, set to True')
    parser.add_argument('--clip_grads_local', action="store_true", default=False, dest='clip_grads_local',
                        help='Whether to clip grads in distributed mode, default to False, set to True')
    parser.add_argument('--recover_lr_iter', action='store', dest='dist_recover_lr', type=int,
                        default=10000,
                        help='The mini-batch index to recover lrate in distributed mode, default is 10000.')
    parser.add_argument('--all_att', action='store_true', dest='all_att', default=False,
                        help='Generate attention from all decoder layers, default is False, set to True')
    parser.add_argument('--avg_ctx', action='store_true', dest='avg_ctx', default=False,
                        help='Average all context vectors to get softmax, default is False, set to True')
    parser.add_argument('--dataset', action='store', dest='dataset', default='en-fr',
                        help='Dataset, default is "%(default)s"')
    parser.add_argument('--gpu_map_file', action='store', metavar='filename', dest='gpu_map_file',
                        type=str, default=None,
                        help='The file containing gpu id mapping information, '
                             'each line is in the form physical_gpu_id\\theano_id')
    parser.add_argument('--ft_patience', action='store', metavar='N', dest='fine_tune_patience',
                        type=int, default=-1,
                        help='Fine tune patience, default is %(default)s, set 8 to enable it')
    parser.add_argument('--valid_freq', action='store', metavar='N', dest='valid_freq',
                        type=int, default=5000,
                        help='Validation frequency, default is 5000')
    parser.add_argument('--trg_att', action='store', metavar='N', dest='trg_attention_layer_id',
                        type=int, default=None,
                        help='Target attention layer id, default is None (not use target attention)')
    parser.add_argument('--fix_dp_bug', action="store_true", default=False, dest='fix_dp_bug',
                        help='Fix previous dropout bug, default to False, set to True')
    parser.add_argument('--abandon_imm', action="store_true", default=False, dest='abandon_imm',
                        help='Whether to load previous immediate params, default to True, set to False')
    parser.add_argument('--tp', action="store", metavar="temperature", dest="temperature",
                        type=float, default=1.0,
                        help='temperature, default is %(default)s')
    parser.add_argument('--scale', action="store", metavar="scale", dest="scale",
                        type=float, default=1.0,
                        help='scale, default is %(default)s')
    parser.add_argument('--gate_dp', action="store", metavar="gate_dropout", dest="gate_dropout",
                        type=float, default=1.0,
                        help='gate_dropout, default is %(default)s')

    args = parser.parse_args()
    print args

    # argparse stores the literal string 'None' when passed on the CLI;
    # normalize it back to a real None.
    if args.residual_enc == 'None':
        args.residual_enc = None
    if args.residual_dec == 'None':
        args.residual_dec = None
    if args.dist_type != 'mv' and args.dist_type != 'mpi_reduce':
        args.dist_type = None

    # FIXME: Auto mode
    # NOTE(review): nesting below reconstructed from the mangled source;
    # verify the auto-mode overrides against upstream.
    if args.auto:
        if args.n_encoder_layers <= 2:
            args.dropout = False
            args.clip = 1.0
        else:
            args.dropout = 0.1
            args.clip = 5.0
        if args.n_encoder_layers <= 1:
            args.residual_enc = None
        if args.n_decoder_layers <= 1:
            args.residual_dec = None
            args.attention_layer_id = 0
        args.cond_unit_size = args.unit_size

    # If dataset is not 'en-fr', old value of dataset options like 'args.train1' will be omitted
    # NOTE(review): for the default 'en-fr' dataset, args.test2 (used in the
    # train() call below) is never set — confirm Datasets / callers handle this.
    if args.dataset != 'en-fr':
        args.train1, args.train2, args.small1, args.small2, args.valid1, args.valid2, \
            args.valid3, args.test1, args.test2, args.dic1, args.dic2 = \
            Datasets[args.dataset]

    print 'Command line arguments:'
    print args
    sys.stdout.flush()

    # Init multiverso or mpi and set theano flags.
    if args.dist_type == 'mv':
        try:
            import multiverso as mv
        except ImportError:
            import libs.multiverso_ as mv
        # FIXME: This must before the import of theano!
        mv.init(sync=True)
        worker_id = mv.worker_id()
        workers_cnt = mv.workers_num()
    elif args.dist_type == 'mpi_reduce':
        from mpi4py import MPI
        communicator = MPI.COMM_WORLD
        worker_id = communicator.Get_rank()
        workers_cnt = communicator.Get_size()

    if args.dist_type:
        # Map this worker onto a physical GPU (optionally remapped via file).
        available_gpus = get_gpu_usage(workers_cnt)
        gpu_maps_info = {idx: idx for idx in available_gpus}
        if args.gpu_map_file:
            for line in open(os.path.join('resources', args.gpu_map_file), 'r'):
                phy_id, theano_id = line.split()
                gpu_maps_info[int(phy_id)] = int(theano_id)
        theano_id = gpu_maps_info[available_gpus[worker_id]]
        print 'worker id:%d, using theano id:%d, physical id %d' % (worker_id, theano_id, available_gpus[worker_id])
        # Must be set before theano is imported (inside libs.nmt).
        os.environ['THEANO_FLAGS'] = 'device=cuda{},floatX=float32'.format(theano_id)
        sys.stdout.flush()

    from libs.nmt import train

    train(
        max_epochs=args.max_epochs,
        saveto=args.model_file,
        preload=args.pre_load_file,
        reload_=args.reload,
        dim_word=args.dim_word,
        dim=args.dim,
        decay_c=0.,
        clip_c=args.clip,
        lrate=args.learning_rate,
        optimizer=args.optimizer,
        maxlen=args.maxlen,
        batch_size=args.batch_size,
        valid_batch_size=args.valid_batch_size,
        dispFreq=1,
        saveFreq=args.save_freq,
        validFreq=args.valid_freq,
        datasets=(r'data/train/{}'.format(args.train1),
                  r'data/train/{}'.format(args.train2)),
        valid_datasets=(r'data/dev/{}'.format(args.valid1),
                        r'data/dev/{}'.format(args.valid2)),
        small_train_datasets=(r'data/test/{}'.format(args.small1),
                              r'data/test/{}'.format(args.small2),
                              r'data/test/{}'.format(args.test2)),
        vocab_filenames=(r'data/dic/{}'.format(args.dic1),
                         r'data/dic/{}'.format(args.dic2)),
        task=args.dataset,
        use_dropout=args.dropout,
        overwrite=False,
        n_words=args.n_words_tgt,
        n_words_src=args.n_words_src,
        # Options from v-yanfa
        dump_before_train=args.dump_before_train,
        plot_graph=args.plot,
        lr_discount_freq=args.lr_discount_freq,
        n_encoder_layers=args.n_encoder_layers,
        n_decoder_layers=args.n_decoder_layers,
        encoder_many_bidirectional=args.connection_type == 1,
        attention_layer_id=args.attention_layer_id,
        unit=args.unit,
        residual_enc=args.residual_enc,
        residual_dec=args.residual_dec,
        use_zigzag=args.use_zigzag,
        given_embedding=args.given_embedding,
        unit_size=args.unit_size,
        cond_unit_size=args.cond_unit_size,
        given_imm=not args.abandon_imm,
        dump_imm=True,
        shuffle_data=args.shuffle,
        decoder_all_attention=args.all_att,
        average_context=args.avg_ctx,
        dist_type=args.dist_type,
        dist_recover_lr_iter=args.dist_recover_lr,
        fine_tune_patience=args.fine_tune_patience,
        nccl=args.nccl,
        src_vocab_map_file=args.src_vocab_map_file,
        tgt_vocab_map_file=args.tgt_vocab_map_file,
        trg_attention_layer_id=args.trg_attention_layer_id,
        dev_bleu_freq=args.dev_bleu_freq,
        fix_dp_bug=args.fix_dp_bug,
        temperature=args.temperature,
        scale=args.scale,
        gate_dropout=args.gate_dropout,
    )
c1 = T.maximum(0, conv.conv2d(x, w_c1) + b_c1.dimshuffle('x', 0, 'x', 'x')) p1 = downsample.max_pool_2d(c1, (3, 3)) c2 = T.maximum(0, conv.conv2d(p1, w_c2) + b_c2.dimshuffle('x', 0, 'x', 'x')) p2 = downsample.max_pool_2d(c2, (2, 2)) p2_flat = p2.flatten(2) h3 = T.maximum(0, T.dot(p2_flat, w_h3) + b_h3) p_y_given_x = T.nnet.softmax(T.dot(h3, w_o) + b_o) return p_y_given_x # MULTIVERSO: you should call mv.init before call multiverso apis mv.init() worker_id = mv.worker_id() # MULTIVERSO: every process has distinct worker id workers_num = mv.workers_num() w_c1 = init_weights((4, 3, 3, 3), name="w_c1") b_c1 = init_weights((4, ), name="b_c1") w_c2 = init_weights((8, 4, 3, 3), name="w_c2") b_c2 = init_weights((8, ), name="b_c2") w_h3 = init_weights((8 * 4 * 4, 100), name="w_h3") b_h3 = init_weights((100, ), name="b_h3") w_o = init_weights((100, 10), name="w_o") b_o = init_weights((10, ), name="b_o") params = [w_c1, b_c1, w_c2, b_c2, w_h3, b_h3, w_o, b_o] p_y_given_x = model(x, *params)
def train(dim_word=100,  # word vector dimensionality
          dim=1000,  # the number of LSTM units
          encoder='gru',
          decoder='gru_cond',
          n_words_src=30000,
          n_words=30000,
          max_epochs=5000,
          finish_after=10000000,  # finish after this many updates
          dispFreq=100,
          decay_c=0.,  # L2 regularization penalty
          alpha_c=0.,  # alignment regularization
          clip_c=-1.,  # gradient clipping threshold
          lrate=1.,  # learning rate
          maxlen=100,  # maximum length of the description
          optimizer='rmsprop',
          batch_size=16,
          valid_batch_size=80,
          saveto='model.npz',
          saveFreq=1000,  # save the parameters after every saveFreq updates
          validFreq=2500,
          dev_bleu_freq=20000,
          datasets=('/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
                    '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'),
          valid_datasets=('./data/dev/dev_en.tok', './data/dev/dev_fr.tok'),
          small_train_datasets=('./data/train/small_en-fr.en',
                                './data/train/small_en-fr.fr',
                                './data/train/small_en-fr.fr'),
          use_dropout=False,
          reload_=False,
          overwrite=False,
          preload='',
          # Options below are from v-yanfa
          dump_before_train=True,
          plot_graph=None,
          vocab_filenames=('./data/dic/filtered_dic_en-fr.en.pkl',
                           './data/dic/filtered_dic_en-fr.fr.pkl'),
          map_filename='./data/dic/mapFullVocab2Top1MVocab.pkl',
          lr_discount_freq=80000,
          # Options of deeper encoder and decoder
          n_encoder_layers=1,
          n_decoder_layers=1,
          encoder_many_bidirectional=True,
          attention_layer_id=0,
          unit='gru',
          residual_enc=None,
          residual_dec=None,
          use_zigzag=False,
          initializer='orthogonal',
          given_embedding=None,
          dist_type=None,
          dist_recover_lr_iter=False,
          unit_size=2,
          cond_unit_size=2,
          given_imm=False,
          dump_imm=False,
          shuffle_data=False,
          decoder_all_attention=False,
          average_context=False,
          task='en-fr',
          fine_tune_patience=8,
          nccl=False,
          src_vocab_map_file=None,
          tgt_vocab_map_file=None,
          trg_attention_layer_id=None,
          fix_dp_bug=False,
          temperature=1.0,
          scale=1.0,
          gate_dropout=0.0,
          ):
    """Train the deep NMT model (Theano), optionally distributed.

    Builds the computation graph, compiles cost/update functions, then runs
    the epoch/minibatch loop with periodic logging, checkpointing,
    validation, learning-rate discounting and dev-cost-based fine-tuning.
    Supports single-GPU mode, Multiverso ('mv') sync, and MPI allreduce
    ('mpi_reduce', optionally via NCCL).

    Returns 0. on normal completion, or the tuple (1., 1., 1.) on NaN cost
    or when the fine-tune schedule decides training is done.
    """
    # Snapshot every keyword argument as the model-options dict.
    model_options = locals().copy()

    # Set distributed computing environment
    worker_id = 0
    if dist_type == 'mv':
        try:
            import multiverso as mv
        except ImportError:
            from . import multiverso_ as mv

        worker_id = mv.worker_id()
    elif dist_type == 'mpi_reduce':
        from mpi4py import MPI
        mpi_communicator = MPI.COMM_WORLD
        worker_id = mpi_communicator.Get_rank()
        workers_cnt = mpi_communicator.Get_size()
        if nccl:
            nccl_comm = init_nccl_env(mpi_communicator)

    # NOTE(review): the 'mpi'/'none' choice here keys off dist_recover_lr_iter,
    # not dist_type — looks like a pre-existing logging quirk; verify.
    print 'Use {}, worker id: {}'.format('multiverso' if dist_type == 'mv' else 'mpi' if dist_recover_lr_iter else 'none', worker_id)
    sys.stdout.flush()

    # Set logging file
    set_logging_file('log/complete/e{}d{}_res{}_att{}_worker{}_task{}_{}.txt'.format(
        n_encoder_layers, n_decoder_layers, residual_enc, attention_layer_id,
        worker_id, task, time.strftime('%m-%d-%H-%M-%S'),
    ))

    log('''\
Start Time = {}
'''.format(
        time.strftime('%c'),
    ))

    # Model options: load and save
    message('Top options:')
    pprint(model_options)
    pprint(model_options, stream=get_logging_file())
    message('Done')
    sys.stdout.flush()

    #load_options(model_options, reload_, preload, src_vocab_map_file and tgt_vocab_map_file)
    check_options(model_options)
    model_options['cost_normalization'] = 1
    ada_alpha = 0.95
    # In MPI mode the summed gradients are normalized by the worker count.
    if dist_type == 'mpi_reduce':
        model_options['cost_normalization'] = workers_cnt

    message('Model options:')
    pprint(model_options)
    pprint(model_options, stream=get_logging_file())
    message()

    print 'Loading data'
    log('\n\n\nStart to prepare data\n@Current Time = {}'.format(time.time()))
    sys.stdout.flush()

    dataset_src, dataset_tgt = datasets[0], datasets[1]

    # When shuffling, per-epoch iterators are created lazily inside the loop.
    if shuffle_data:
        text_iterator_list = [None for _ in range(10)]
        text_iterator = None
    else:
        text_iterator_list = None
        text_iterator = TextIterator(
            dataset_src, dataset_tgt,
            vocab_filenames[0], vocab_filenames[1],
            batch_size, n_words_src, n_words, maxlen
        )

    valid_iterator = TextIterator(
        valid_datasets[0], valid_datasets[1],
        vocab_filenames[0], vocab_filenames[1],
        valid_batch_size, n_words_src, n_words
    )

    small_train_iterator = TextIterator(
        small_train_datasets[0], small_train_datasets[1],
        vocab_filenames[0], vocab_filenames[1],
        valid_batch_size, n_words_src, n_words
    )

    print 'Building model'
    model = NMTModel(model_options)
    params = model.initializer.init_params()

    # Reload parameters
    if reload_ and os.path.exists(preload):
        print 'Reloading model parameters'
        load_params(preload, params,
                    src_map_file=src_vocab_map_file, tgt_map_file=tgt_vocab_map_file)
    sys.stdout.flush()

    # Given embedding
    if given_embedding is not None:
        print 'Loading given embedding...',
        load_embedding(params, given_embedding)
        print 'Done'

    print_params(params)
    model.init_tparams(params)

    # Build model, stochastic_mode = 0(soft), 1(stochastic), 2(hard)
    trng, use_noise, stochastic_mode, hyper_param, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost, test_cost, x_emb, stochastic_updates, _ = model.build_model()
    inps = [x, x_mask, y, y_mask]

    all_stochastic_updates = OrderedDictUpdates()
    for item1 in stochastic_updates:
        for item2 in item1:
            all_stochastic_updates.update(item2)

    print 'Building sampler'
    f_init, f_next = model.build_sampler(trng=trng, use_noise=use_noise,
                                         batch_mode=True,
                                         stochastic_mode=stochastic_mode,
                                         hyper_param=hyper_param)
    stochastic_mode.set_value(1)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile,
                                  updates=all_stochastic_updates)
    print 'Done'
    sys.stdout.flush()

    test_cost = test_cost.mean()  #FIXME: do not regularize test_cost here
    cost = cost.mean()
    cost = l2_regularization(cost, model.P, decay_c)
    cost = regularize_alpha_weights(cost, alpha_c, model_options, x_mask, y_mask, opt_ret)

    print 'Building f_cost...',
    f_cost = theano.function(inps, test_cost, profile=profile,
                             updates=all_stochastic_updates)
    print 'Done'

    if plot_graph is not None:
        print 'Plotting post-compile graph...',
        theano.printing.pydotprint(
            f_cost,
            outfile='pictures/post_compile_{}'.format(plot_graph),
            var_with_name_simple=True,
        )
        print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(model.P))
    clip_shared = theano.shared(np.array(clip_c, dtype=fX), name='clip_shared')

    if dist_type != 'mpi_reduce':  #build grads clip into computational graph
        grads, g2 = clip_grad_remove_nan(grads, clip_shared, model.P)
    else:  #do the grads clip after gradients aggregation
        g2 = None

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    given_imm_data = get_adadelta_imm_data(optimizer, given_imm, preload)
    if optimizer == 'adadelta':
        f_grad_shared, f_update, grads_shared, imm_shared = Optimizers[optimizer](
            lr, model.P, grads, inps, cost, g2=g2,
            given_imm_data=given_imm_data, alpha=ada_alpha,
            all_stochastic_updates=all_stochastic_updates)
    if optimizer == 'adam':
        f_grad_shared, f_update, grads_shared, imm_shared = Optimizers[optimizer](
            lr, model.P, grads, inps, cost, g2=g2,
            given_imm_data=given_imm_data,
            all_stochastic_updates=all_stochastic_updates)
    print 'Done'

    if dist_type == 'mpi_reduce':
        f_grads_clip = make_grads_clip_func(grads_shared=grads_shared,
                                            mt_tparams=model.P,
                                            clip_c_shared=clip_shared)

    print 'Optimization'
    log('Preparation Done\n@Current Time = {}'.format(time.time()))

    if dist_type == 'mv':
        mv.barrier()
    elif dist_type == 'mpi_reduce':
        #create receive buffers for mpi allreduce
        rec_grads = [np.zeros_like(p.get_value()) for p in model.P.itervalues()]

    estop = False
    history_errs = []
    best_bleu = -1.0
    best_valid_cost = 1e6
    best_p = None
    bad_counter = 0

    uidx = search_start_uidx(reload_, preload)

    epoch_n_batches = 0
    start_epoch = 0
    pass_batches = 0

    print 'worker', worker_id, 'uidx', uidx, 'l_rate', lrate, 'ada_alpha', ada_alpha, 'n_batches', epoch_n_batches, 'start_epoch', start_epoch, 'pass_batches', pass_batches

    start_uidx = uidx

    if dump_before_train:
        print 'Dumping before train...',
        saveto_uidx = '{}.iter{}.npz'.format(os.path.splitext(saveto)[0], uidx)
        np.savez(saveto_uidx, history_errs=history_errs, uidx=uidx, **unzip(model.P))
        save_options(model_options, uidx, saveto)
        print 'Done'
        sys.stdout.flush()

    # Baseline validation pass before any updates (soft mode = 0).
    stochastic_mode.set_value(0)
    valid_cost = validation(valid_iterator, f_cost, use_noise)
    small_train_cost = validation(small_train_iterator, f_cost, use_noise)
    message('Soft Valid cost {:.5f}  Small train cost {:.5f}'.format(valid_cost, small_train_cost))
    stochastic_mode.set_value(1)
    #new_bleu = translate_dev_get_bleu(model, f_init, f_next, trng, use_noise, 5, 1.0)
    #best_bleu = new_bleu
    #message('BLEU = {:.2f} at uidx {}'.format(new_bleu, uidx))
    sys.stdout.flush()

    commu_time_sum = 0.0
    cp_time_sum = 0.0
    reduce_time_sum = 0.0

    start_time = time.time()
    finetune_cnt = 0

    for eidx in xrange(start_epoch, max_epochs):
        if shuffle_data:
            text_iterator = load_shuffle_text_iterator(
                eidx, worker_id, text_iterator_list,
                datasets, vocab_filenames, batch_size, maxlen, n_words_src, n_words
            )
        n_samples = 0
        if dist_type == 'mpi_reduce':
            mpi_communicator.Barrier()

        for i, (x, y) in enumerate(text_iterator):
            if eidx == start_epoch and i < pass_batches:  #ignore the first several batches when reload
                continue
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            x, x_mask, y, y_mask = prepare_data(x, y, maxlen=maxlen)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            effective_uidx = uidx - start_uidx
            ud_start = time.time()

            # compute cost, grads
            if dist_type != 'mpi_reduce':
                cost, g2_value = f_grad_shared(x, x_mask, y, y_mask)
            else:
                cost = f_grad_shared(x, x_mask, y, y_mask)

            if dist_type == 'mpi_reduce':
                reduce_start = time.time()
                commu_time = 0
                gpucpu_cp_time = 0
                if not nccl:
                    commu_time, gpucpu_cp_time = all_reduce_params(grads_shared, rec_grads)
                else:
                    commu_time, gpucpu_cp_time = all_reduce_params_nccl(nccl_comm, grads_shared)
                reduce_time = time.time() - reduce_start
                commu_time_sum += commu_time
                reduce_time_sum += reduce_time
                cp_time_sum += gpucpu_cp_time

                # Clip after aggregation (clipping is outside the graph here).
                g2_value = f_grads_clip()
                print '@Worker = {}, Reduce time = {:.5f}, Commu time = {:.5f}, Copy time = {:.5f}'.format(worker_id, reduce_time, commu_time, gpucpu_cp_time)

            # Warm-up: ramp lrate from 5% back to full over dist_recover_lr_iter
            # updates in distributed mode.
            curr_lr = lrate if not dist_type or dist_recover_lr_iter < effective_uidx else lrate * 0.05 + effective_uidx * lrate / dist_recover_lr_iter * 0.95
            if curr_lr < lrate:
                print 'Curr lr {:.3f}'.format(curr_lr)

            # do the update on parameters
            f_update(curr_lr)

            ud = time.time() - ud_start

            if np.isnan(g2_value) or np.isinf(g2_value):
                message('gradient NaN detected')
                sys.stdout.flush()

            if np.isnan(cost) or np.isinf(cost):
                # Dump the model and the offending minibatch, then abort.
                message('cost NaN detected')
                model.save_model(saveto, history_errs, uidx)
                save_minibatch(x, y, saveto, uidx, vocab_filenames)
                sys.stdout.flush()
                return 1., 1., 1.

            # discount learning rate
            # FIXME: Do NOT enable this and fine-tune at the same time
            if lr_discount_freq > 0 and np.mod(effective_uidx, lr_discount_freq) == 0:
                lrate *= 0.5
                message('Discount learning rate to {} at iteration {}'.format(lrate, uidx))

            # sync batch
            if dist_type == 'mv' and np.mod(uidx, dispFreq) == 0:
                comm_start = time.time()
                model.sync_tparams()
                message('@Comm time = {:.5f}'.format(time.time() - comm_start))

            # verbose
            if np.mod(effective_uidx, dispFreq) == 0:
                message('Worker {} Epoch {} Update {} Cost {:.5f} G2 {:.5f} UD {:.5f} Time {:.5f} s'.format(
                    worker_id, eidx, uidx, float(cost), float(g2_value), ud,
                    time.time() - start_time,
                ))
                sys.stdout.flush()

            if np.mod(effective_uidx, saveFreq) == 0 and worker_id == 0:
                # save with uidx
                if not overwrite:
                    print 'Saving the model at iteration {}...'.format(uidx),
                    model.save_model(saveto, history_errs, uidx)
                    print 'Done'
                    sys.stdout.flush()

                # save immediate data in adadelta
                saveto_imm_path = '{}_latest.npz'.format(os.path.splitext(saveto)[0])
                dump_adadelta_imm_data(optimizer, imm_shared, dump_imm, saveto_imm_path)

            if np.mod(effective_uidx, validFreq) == 0:
                stochastic_mode.set_value(0)
                valid_cost = validation(valid_iterator, f_cost, use_noise)
                small_train_cost = validation(small_train_iterator, f_cost, use_noise)
                message('Soft Valid cost {:.5f}  Small train cost {:.5f}'.format(valid_cost, small_train_cost))
                #new_bleu = translate_dev_get_bleu(model, f_init, f_next, trng, use_noise, 5, 1.0)
                #message('BLEU = {:.2f} at uidx {}'.format(new_bleu, uidx))
                sys.stdout.flush()

                #if new_bleu > best_bleu:
                #    print 'Saving the model at iteration {}...'.format(uidx),
                #    model.save_model(saveto, history_errs, uidx)
                #    print 'Done'
                #    best_bleu = new_bleu
                #    sys.stdout.flush()

                stochastic_mode.set_value(1)

                # Fine-tune based on dev cost
                if fine_tune_patience > 0:
                    if valid_cost < best_valid_cost:
                        bad_counter = 0
                        best_valid_cost = valid_cost
                        #dump the best model so far, including the immediate file
                        if worker_id == 0:
                            message('Dump the the best model so far at uidx {}'.format(uidx))
                            model.save_model(saveto, history_errs)
                            #dump_adadelta_imm_data(optimizer, imm_shared, dump_imm, saveto)
                    else:
                        bad_counter += 1
                        if bad_counter >= fine_tune_patience:
                            print 'Fine tune:',
                            # Alternate between halving the lrate and
                            # quartering the clip threshold.
                            if finetune_cnt % 2 == 0:
                                lrate = np.float32(lrate * 0.5)
                                message('Discount learning rate to {} at iteration {}'.format(lrate, uidx))
                                if lrate <= 0.025:
                                    message('Learning rate decayed to {:.5f}, task completed'.format(lrate))
                                    return 1., 1., 1.
                            else:
                                clip_shared.set_value(np.float32(clip_shared.get_value() * 0.25))
                                message('Discount clip value to {} at iteration {}'.format(clip_shared.get_value(), uidx))
                            finetune_cnt += 1
                            bad_counter = 0

            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after {} iterations!'.format(uidx)
                estop = True
                break

        print 'Seen {} samples'.format(n_samples)

        if estop:
            break

    # best_p is never assigned in this version, so this restore is dormant.
    if best_p is not None:
        zipp(best_p, model.P)

    use_noise.set_value(0.)

    return 0.
std=[x / 255 for x in [63.0, 62.1, 66.7]]) ])), batch_size=args.batch_size, shuffle=False, **kwargs) model = resnet.resnet20() criterion = torch.nn.CrossEntropyLoss() # if args.ngpu > 1: # model = torch.nn.DataParallel(model, device_ids=list(range(args.ngpu))) if args.parallel: model = torchmodel.MVTorchModel(model) if args.cuda: device = devs[0] if args.parallel else devs[mv.worker_id()] model.cuda(device) criterion.cuda(device) optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.decay) scheduler = lr_scheduler.MultiStepLR( optimizer, milestones=[int(i) for i in args.lr_decay.split(',')], gamma=0.1) def train(epoch): model.train()
mode (default: False)", default=False) parser.add_argument('-b', '--batch-size', type=int, help="batch size (default:\ False)", default=128) parser.add_argument('-e', '--epoches', type=int, help="Number of epoches(default:\ 82)", default=82) args = parser.parse_args() print(args) # MULTIVERSO: import multiverso import multiverso as mv # MULTIVERSO: you should call mv.init before call multiverso apis mv.init(sync=args.sync) # MULTIVERSO: every process has distinct worker id worker_id = mv.worker_id() # MULTIVERSO: mv.workers_num will return the number of workers workers_num = mv.workers_num() # NOTICE: To use multiple gpus, we must set the environment before import theano. if "THEANO_FLAGS" not in os.environ: os.environ["THEANO_FLAGS"] = 'floatX=float32,device=gpu%d,lib.cnmem=1' % worker_id import numpy as np import theano import theano.tensor as T import lasagne from multiverso.theano_ext.lasagne_ext import param_manager # for the larger networks (n>=9), we need to adjust pythons recursion limit sys.setrecursionlimit(10000)
def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000,
                           dataset='mnist.pkl.gz',
                           batch_size=600):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model

    This is demonstrated on MNIST. Multiverso version: minibatches are
    sharded round-robin across workers, shared variables are synchronized
    after each local update, and only the master worker validates, reports
    and saves the model.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    :type batch_size: int
    :param batch_size: number of examples per minibatch
    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # MULTIVERSO: you should call mv.init before call multiverso apis
    mv.init()
    # MULTIVERSO: every process has distinct worker id
    worker_id = mv.worker_id()
    # MULTIVERSO: mv.workers_num will return the number of workers
    total_worker = mv.workers_num()

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # generate symbolic variables for input (x and y represent a
    # minibatch)
    x = T.matrix('x')  # data, presented as rasterized images
    y = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    # construct the logistic regression class
    # Each MNIST image has size 28*28
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # compute the gradient of cost with respect to theta = (W,b)
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # start-snippet-3
    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # compiling a Theano function `train_model` that returns the cost, but in
    # the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-3

    ###############
    # TRAIN MODEL #
    ###############
    print('... training the model')
    # validate once per full pass over the training set (master only)
    validation_frequency = n_train_batches

    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            # MULTIVERSO: we distribute the batches to different workers.
            # A worker will only train batches belonged to itself
            if minibatch_index % total_worker == worker_id:
                minibatch_avg_cost = train_model(minibatch_index)
                # MULTIVERSO: when you want to commit all the delta of
                # parameters produced by mv_shared and update the latest
                # parameters from parameter server, you can call this function to
                # synchronize the values
                sharedvar.sync_all_mv_shared_vars()

            # NOTE: `iter` shadows the builtin of the same name; kept as-is.
            iter = (epoch - 1) * n_train_batches + minibatch_index
            # MULTIVERSO: only master worker will output the model
            if mv.is_master_worker() and (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                validation_loss = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       validation_loss * 100.))

    # MULTIVERSO: all the workers will synchronize at the place you call barrier
    # NOTE(review): barrier placement reconstructed from a collapsed chunk —
    # confirm it sits after the epoch loop rather than inside it.
    mv.barrier()

    # MULTIVERSO: You should make sure only one process will output the result.
    # Otherwise results will be outputted repeatedly
    if mv.is_master_worker():
        end_time = timeit.default_timer()

        test_losses = [test_model(i) for i in range(n_test_batches)]
        test_score = numpy.mean(test_losses)

        # validation_loss is the value from the master's last validation pass;
        # it requires at least one validation to have run in the loop above.
        print(('Optimization complete with validation score of %f %%,'
               'with test performance %f %%') %
              (validation_loss * 100., test_score * 100.))
        print('The code run for %d epochs, with %f epochs/sec' %
              (epoch, 1. * epoch / (end_time - start_time)))
        print(('The code for file ' + os.path.split(__file__)[1] +
               ' ran for %.1fs' % ((end_time - start_time))), file=sys.stderr)

        # save the model
        with open('model.pkl', 'wb') as f:
            pickle.dump(classifier, f)

    # MULTIVERSO: You must call shutdown at the end of the file
    mv.shutdown()