def train(dim_word=100,  # word vector dimensionality
          ctx_dim=512,  # context vector dimensionality
          dim=1000,  # the number of LSTM units
          attn_type='stochastic',  # [see section 4 from paper]
          n_layers_att=1,  # number of layers used to compute the attention weights
          n_layers_out=1,  # number of layers used to compute logit
          n_layers_lstm=1,  # number of lstm layers
          n_layers_init=1,  # number of layers to initialize LSTM at time 0
          lstm_encoder=False,  # if True, run bidirectional LSTM on input units
          prev2out=False,  # Feed previous word into logit
          ctx2out=False,  # Feed attention weighted ctx into logit
          alpha_entropy_c=0.002,  # hard attn param
          RL_sumCost=True,  # hard attn param
          semi_sampling_p=0.5,  # hard attn param
          temperature=1.,  # hard attn param
          patience=10,
          max_epochs=5000,
          dispFreq=100,
          decay_c=0.,  # weight decay coeff
          alpha_c=0.,  # doubly stochastic coeff
          lrate=0.01,  # used only for SGD
          selector=False,  # selector (see paper)
          n_words=10000,  # vocab size
          maxlen=100,  # maximum length of the description
          optimizer='rmsprop',
          batch_size=16,
          valid_batch_size=16,
          saveto='model.npz',  # relative path of saved model file
          validFreq=1000,
          saveFreq=1000,  # save the parameters after every saveFreq updates
          sampleFreq=100,  # generate some samples after every sampleFreq updates
          data_path='./data',  # path to find data
          dataset='flickr8k',
          dictionary=None,  # word dictionary
          use_dropout=False,  # setting this true turns on dropout at various points
          use_dropout_lstm=False,  # dropout on lstm gates
          reload_=False,
          save_per_epoch=False):  # this saves down the model every epoch

    # hyperparam dict
    model_options = locals().copy()
    model_options = validate_options(model_options)

    # reload options
    if reload_ and os.path.exists(saveto):
        print "Reloading options"
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print "Using the following parameters:"
    print model_options

    print 'Loading data'
    load_data, prepare_data = get_dataset(dataset)
    train, valid, test, worddict = load_data(path=data_path)
    if dataset == 'coco':
        valid, _ = valid  # the second one contains all the validation data

    # index 0 and 1 always code for the end of sentence and unknown token
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # Initialize (or reload) the parameters using 'model_options'
    # then build the Theano graph
    print 'Building model'
    params = init_params(model_options)
    if reload_ and os.path.exists(saveto):
        print "Reloading model"
        params = load_params(saveto, params)

    # numpy arrays -> theano shared variables
    tparams = init_tparams(params)

    # In order, we get:
    #   1) trng - theano random number generator
    #   2) use_noise - flag that turns on dropout
    #   3) inps - inputs for f_grad_shared
    #   4) cost - log likelihood for each sentence
    #   5) opts_out - optional outputs (e.g selector)
    trng, use_noise, \
        inps, alphas, alphas_sample, \
        cost, \
        opt_outs = \
        build_model(tparams, model_options)

    # To sample, we use beam search: 1) f_init is a function that initializes
    # the LSTM at time 0 [see top right of page 4], 2) f_next returns the distribution over
    # words and also the new "initial state/memory" see equation
    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, use_noise, trng)

    # we want the cost without any of the regularizers
    # define the log probability
    f_log_probs = theano.function(inps, -cost, profile=False,
                                  updates=opt_outs['attn_updates']
                                  if model_options['attn_type'] == 'stochastic'
                                  else None,
                                  allow_input_downcast=True)

    # Define the cost function + Regularization
    cost = cost.mean()
    # add L2 regularization costs
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # Doubly stochastic regularization
    if alpha_c > 0.:
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * ((1.-alphas.sum(0))**2).sum(0).mean()
        cost += alpha_reg

    hard_attn_updates = []
    # Backprop!
    if model_options['attn_type'] == 'deterministic':
        grads = tensor.grad(cost, wrt=itemlist(tparams))
    else:
        # shared variables for hard attention
        baseline_time = theano.shared(numpy.float32(0.), name='baseline_time')
        opt_outs['baseline_time'] = baseline_time
        alpha_entropy_c = theano.shared(numpy.float32(alpha_entropy_c), name='alpha_entropy_c')
        alpha_entropy_reg = alpha_entropy_c * (alphas*tensor.log(alphas)).mean()
        # [see Section 4.1: Stochastic "Hard" Attention for derivation of this learning rule]
        if model_options['RL_sumCost']:
            grads = tensor.grad(cost, wrt=itemlist(tparams),
                                disconnected_inputs='raise',
                                known_grads={alphas: (baseline_time-opt_outs['masked_cost'].mean(0))[None, :, None]/10. *
                                             (-alphas_sample/alphas) + alpha_entropy_c*(tensor.log(alphas) + 1)})
        else:
            grads = tensor.grad(cost, wrt=itemlist(tparams),
                                disconnected_inputs='raise',
                                known_grads={alphas: opt_outs['masked_cost'][:, :, None]/10. *
                                             (alphas_sample/alphas) + alpha_entropy_c*(tensor.log(alphas) + 1)})
        # [equation on bottom left of page 5]
        hard_attn_updates += [(baseline_time, baseline_time * 0.9 + 0.1 * opt_outs['masked_cost'].mean())]
        # updates from scan
        hard_attn_updates += opt_outs['attn_updates']

    # to get the cost after regularization or the gradients, use this
    # f_cost = theano.function([x, mask, ctx], cost, profile=False)
    # f_grad = theano.function([x, mask, ctx], grads, profile=False)

    # f_grad_shared computes the cost and updates adaptive learning rate variables
    # f_update updates the weights of the model
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost, hard_attn_updates)

    print 'Optimization'

    # [See note in section 4.3 of paper]
    train_iter = HomogeneousData(train, batch_size=batch_size, maxlen=maxlen)

    if valid:
        kf_valid = KFold(len(valid[0]), n_folds=len(valid[0])/valid_batch_size, shuffle=False)
    if test:
        kf_test = KFold(len(test[0]), n_folds=len(test[0])/valid_batch_size, shuffle=False)

    # history_errs is a bare-bones training log that holds the validation and test error
    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = numpy.load(saveto)['history_errs'].tolist()
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0])/batch_size
    if saveFreq == -1:
        saveFreq = len(train[0])/batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0])/batch_size

    uidx = 0
    estop = False
    for eidx in xrange(max_epochs):
        n_samples = 0

        print 'Epoch ', eidx

        for caps in train_iter:
            n_samples += len(caps)
            uidx += 1
            # turn on dropout
            use_noise.set_value(1.)

            # preprocess the caption, recording the
            # time spent to help detect bottlenecks
            pd_start = time.time()
            x, mask, ctx = prepare_data(caps,
                                        train[1],
                                        worddict,
                                        maxlen=maxlen,
                                        n_words=n_words)
            pd_duration = time.time() - pd_start

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                continue

            # get the cost for the minibatch, and update the weights
            ud_start = time.time()
            cost = f_grad_shared(x, mask, ctx)
            f_update(lrate)
            ud_duration = time.time() - ud_start  # some monitoring for each mini-batch

            # Numerical stability check
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'PD ', pd_duration, 'UD ', ud_duration

            # Checkpoint
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                if best_p is not None:
                    params = copy.copy(best_p)
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

            # Print a generated sample as a sanity check
            if numpy.mod(uidx, sampleFreq) == 0:
                # turn off dropout first
                use_noise.set_value(0.)
                x_s = x
                mask_s = mask
                ctx_s = ctx
                # generate and decode a subset of the current training batch
                for jj in xrange(numpy.minimum(10, len(caps))):
                    sample, score = gen_sample(tparams, f_init, f_next, ctx_s[jj], model_options,
                                               trng=trng, k=5, maxlen=30, stochastic=False)
                    # Decode the sample from encoding back to words
                    print 'Truth ', jj, ': ',
                    for vv in x_s[:, jj]:
                        if vv == 0:
                            break
                        if vv in word_idict:
                            print word_idict[vv],
                        else:
                            print 'UNK',
                    print
                    for kk, ss in enumerate([sample[0]]):
                        print 'Sample (', kk, ') ', jj, ': ',
                        for vv in ss:
                            if vv == 0:
                                break
                            if vv in word_idict:
                                print word_idict[vv],
                            else:
                                print 'UNK',
                        print

            # Log validation loss + checkpoint the model with the best validation log likelihood
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                train_err = 0
                valid_err = 0
                test_err = 0

                if valid:
                    valid_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, valid, kf_valid).mean()
                if test:
                    test_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, test, kf_test).mean()

                history_errs.append([valid_err, test_err])

                # the model with the best validation log likelihood is saved separately with a different name
                if uidx == 0 or valid_err <= numpy.array(history_errs)[:, 0].min():
                    best_p = unzip(tparams)
                    print 'Saving model with best validation ll'
                    params = copy.copy(best_p)
                    params = unzip(tparams)
                    numpy.savez(saveto+'_bestll', history_errs=history_errs, **params)
                    bad_counter = 0

                # abort training if perplexity has been increasing for too long
                if eidx > patience and len(history_errs) > patience and valid_err >= numpy.array(history_errs)[:-patience, 0].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err

        print 'Seen %d samples' % n_samples

        if estop:
            break

        if save_per_epoch:
            numpy.savez(saveto + '_epoch_' + str(eidx + 1), history_errs=history_errs, **unzip(tparams))

    # use the best nll parameters for final checkpoint (if they exist)
    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    train_err = 0
    valid_err = 0
    test_err = 0
    if valid:
        valid_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, valid, kf_valid)
    if test:
        test_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, test, kf_test)

    print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err

    params = copy.copy(best_p)
    numpy.savez(saveto, zipped_params=best_p, train_err=train_err,
                valid_err=valid_err, test_err=test_err, history_errs=history_errs,
                **params)

    return train_err, valid_err, test_err
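# ---------------------------------------------------------------------------
# Aside (added for illustration, not part of the original script): the hard-
# attention branch in train() above injects a hand-derived REINFORCE-style
# gradient for `alphas` through theano.tensor.grad's `known_grads` argument.
# The tiny sketch below shows that mechanism in isolation; the variable names
# are made up and the "injected" gradient is just a placeholder.
def _known_grads_demo():
    import numpy
    import theano
    import theano.tensor as tensor

    w = theano.shared(numpy.float32(2.), name='w')
    x = tensor.vector('x')
    a = tensor.tanh(x * w)            # stand-in for the attention weights
    injected = tensor.ones_like(a)    # pretend this came from a sampling estimator
    # back-propagate the supplied gradient for `a` down to the parameter `w`
    g_w = tensor.grad(cost=None, wrt=w, known_grads={a: injected})
    f = theano.function([x], g_w, allow_input_downcast=True)
    return f(numpy.array([1., 2., 3.], dtype='float32'))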
def main(model, saveto, k=5, normalize=False, zero_pad=False, n_process=5,
         datasets='dev,test', sampling=False, pkl_name=None):
    # load model model_options
    if pkl_name is None:
        pkl_name = model
    with open('%s.pkl' % pkl_name, 'rb') as f:
        options = pkl.load(f)

    # fetch data, skip ones we aren't using to save time
    load_data, prepare_data = get_dataset(options['dataset'])
    _, valid, test, worddict = load_data(
        path='./data',
        load_train=False,
        load_dev=True if 'dev' in datasets else False,
        load_test=True if 'test' in datasets else False)

    # <eos> means end of sequence (aka periods), UNK means unknown
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # create processes
    queue = Queue()
    rqueue = Queue()
    processes = [None] * n_process
    for midx in xrange(n_process):
        processes[midx] = Process(target=gen_model,
                                  args=(queue, rqueue, midx, model, options,
                                        k, normalize, word_idict, sampling))
        processes[midx].start()

    # index -> words
    def _seqs2words(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    break
                ww.append(word_idict[w])
            capsw.append(' '.join(ww))
        return capsw

    # unsparsify, reshape, and queue
    def _send_jobs(contexts):
        for idx, ctx in enumerate(contexts):
            cc = ctx.todense().reshape([14 * 14, 512])
            if zero_pad:
                cc0 = numpy.zeros(
                    (cc.shape[0] + 1, cc.shape[1])).astype('float32')
                cc0[:-1, :] = cc
            else:
                cc0 = cc
            queue.put((idx, cc0))

    # retrieve caption from process
    def _retrieve_jobs(n_samples):
        caps = [None] * n_samples
        for idx in xrange(n_samples):
            resp = rqueue.get()
            caps[resp[0]] = resp[1]
            if numpy.mod(idx, 10) == 0:
                print 'Sample ', (idx + 1), '/', n_samples, ' Done'
        return caps

    ds = datasets.strip().split(',')

    # send all the features for the various datasets
    for dd in ds:
        if dd == 'dev':
            print 'Development Set...',
            _send_jobs(valid[1])
            caps = _seqs2words(_retrieve_jobs(valid[1].shape[0]))
            # leftover debugging breakpoint, disabled so the script runs unattended
            # import pdb
            # pdb.set_trace()
            with open(saveto + '.dev.txt', 'w') as f:
                print >> f, '\n'.join(caps)
            print 'Done'
        if dd == 'test':
            print 'Test Set...',
            _send_jobs(test[1])
            caps = _seqs2words(_retrieve_jobs(test[1].shape[0]))
            with open(saveto + '.test.txt', 'w') as f:
                print >> f, '\n'.join(caps)
            print 'Done'

    # end processes
    for midx in xrange(n_process):
        queue.put(None)
def main(model, saveto, k=5, normalize=False, zero_pad=False,
         datasets='dev,test', sampling=False, pkl_name=None):
    # load model model_options
    if pkl_name is None:
        pkl_name = model
    with open('%s.pkl' % pkl_name, 'rb') as f:
        options = pkl.load(f)

    # fetch data, skip ones we aren't using to save time
    load_data, prepare_data = get_dataset(options['dataset'])
    train, valid, test, worddict = load_data(
        path='./data/coco/',
        load_train=True if 'train' in datasets else False,
        load_dev=True if 'dev' in datasets else False,
        load_test=True if 'test' in datasets else False)
    # import pdb; pdb.set_trace()

    # <eos> means end of sequence (aka periods), UNK means unknown
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # index -> words
    def _seqs2words(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    break
                ww.append(word_idict[w])
            capsw.append(' '.join(ww))
        return capsw

    # process all dev examples
    def _process_examples(contexts):
        caps = [None] * contexts.shape[0]
        for idx, ctx in enumerate(contexts):
            cc = ctx.todense().reshape([14 * 14, 512])
            if zero_pad:
                cc0 = numpy.zeros(
                    (cc.shape[0] + 1, cc.shape[1])).astype('float32')
                cc0[:-1, :] = cc
            else:
                cc0 = cc
            resp = gen_model(idx, cc0, model, options, k, normalize,
                             word_idict, sampling)
            caps[resp[0]] = resp[1]
            print 'Sample ', (idx + 1), '/', contexts.shape[0], ' Done'
            print resp[1]
        return caps

    ds = datasets.strip().split(',')

    # send all the features for the various datasets
    for dd in ds:
        if dd == 'train':
            print 'Training Set...',
            caps = _seqs2words(_process_examples(train[1]))
            # import pdb; pdb.set_trace()
            with open(saveto + '.train.txt', 'w') as f:
                print >> f, '\n'.join(caps)
            print 'Done'
        if dd == 'dev':
            print 'Development Set...',
            caps = _seqs2words(_process_examples(valid[1]))
            # import pdb; pdb.set_trace()
            with open(saveto + '.dev.txt', 'w') as f:
                print >> f, '\n'.join(caps)
            print 'Done'
        if dd == 'test':
            print 'Test Set...',
            caps = _seqs2words(_process_examples(test[1]))
            with open(saveto + '.test.txt', 'w') as f:
                print >> f, '\n'.join(caps)
            print 'Done'
def main(pkl_names, models, split, k=4, normalize=False, debug=False,
         changes=None):
    # load model model_options
    f_init, f_next = [], []
    for pkl_name, model in zip(pkl_names, models):
        options = read_pkl(pkl_name)
        # apply any 'key=value' overrides passed in through `changes`
        if changes is not None:
            for change in changes:
                options[change.split('=')[0]] = change.split('=')[1]
        # initialize the two functions
        f1, f2 = gen_model(model, options)
        f_init.append(f1)
        f_next.append(f2)

    # fetch data, skip ones we aren't using to save time
    load_data, _ = get_dataset(options['dataset'])
    kwargs = {
        'path': osp.join(options['prefix'], options['dataset']),
        'load_%s' % split: True,
        'options': options
    }
    eval_data, worddict = load_data(**kwargs)
    imgid = collapse([elem[-1] for elem in eval_data[0]])
    word_idict = {vv: kk for kk, vv in worddict.iteritems()}

    # write results to json format
    caps = process_examples(f_init, f_next, imgid, eval_data[1], eval_data[2],
                            word_idict, options, k, normalize, debug=debug)

    # create the output folder if it does not exist
    if len(pkl_names) > 1:
        folder = osp.join('../output',
                          '%s_ensemble_%s' % (options['dataset'], split))
    else:
        folder = osp.join('../output',
                          '%s_%s' % (osp.splitext(pkl_names[0])[0], split))
    # If the folder already exists, create mirrors
    if not osp.exists(folder):
        os.mkdir(folder)
    elif osp.exists(folder) and split == 'test':
        for i in range(2, 5):
            if not osp.exists('%s.%d' % (folder, i)):
                folder = '%s.%d' % (folder, i)
                os.mkdir(folder)
                break

    # write json to the file
    with open(osp.join(folder, 'captions_val2014_results.json'), 'w') as f:
        json.dump(caps, f)

    if split in ('val', 'test'):
        # Evaluate using the official api
        coco_caption_folder = osp.join('../', 'coco-caption')
        assert osp.exists(coco_caption_folder)
        sys.path.append(coco_caption_folder)
        from cocoEvaluation import cocoEvaluation
        evaluator = cocoEvaluation(options['dataset'])
        evaluator.evaluate(folder)
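# Hypothetical command-line wrapper for the ensemble entry point above (the
# flag names below are illustrative only; the original driver script is not
# shown in this file).
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--pkl_names', nargs='+', required=True,
                        help='model option pickles, one per ensemble member')
    parser.add_argument('--models', nargs='+', required=True,
                        help='model .npz files, aligned with --pkl_names')
    parser.add_argument('--split', default='val')
    parser.add_argument('--k', type=int, default=4, help='beam width')
    parser.add_argument('--normalize', action='store_true')
    parser.add_argument('--changes', nargs='*', default=None,
                        help="optional 'key=value' overrides for the options")
    args = parser.parse_args()
    main(args.pkl_names, args.models, args.split, k=args.k,
         normalize=args.normalize, changes=args.changes)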
def main(model, saveto, k=1, normalize=False, zero_pad=False,
         datasets='dev,test', data_path='./', sampling=False, pkl_name=None):
    # load model model_options
    if pkl_name is None:
        pkl_name = model
    with open('%s.pkl' % pkl_name, 'rb') as f:
        options = pkl.load(f)

    # fetch data, skip ones we aren't using to save time
    load_data, prepare_data = get_dataset(options['dataset'])
    _, valid, test, worddict = load_data(
        load_train=False,
        load_dev=True if 'dev' in datasets else False,
        load_test=True if 'test' in datasets else False,
        path=data_path)

    # <eos> means end of sequence (aka periods), UNK means unknown
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # build sampler
    trng = RandomStreams(1234)
    # this is zero to indicate we are not using dropout in the graph
    use_noise = theano.shared(numpy.float32(0.), name='use_noise')

    # get the parameters
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_tparams(params)

    # build the sampling computational graph
    # see capgen.py for more detailed explanations
    f_init, f_next = build_sampler(tparams, options, use_noise, trng,
                                   sampling=sampling)

    # index -> words
    def _seqs2words(cc):
        ww = []
        for w in cc:
            if w == 0:
                break
            ww.append(word_idict[w])
        return ' '.join(ww)

    # unsparsify, reshape, and queue
    def _send_job(context):
        cc = context.todense().reshape([14*14, 512])
        if zero_pad:
            cc0 = numpy.zeros((cc.shape[0]+1, cc.shape[1])).astype('float32')
            cc0[:-1, :] = cc
        else:
            cc0 = cc
        return create_sample(tparams, f_init, f_next, cc0, options, trng, k,
                             normalize)

    ds = datasets.strip().split(',')

    # send all the features for the various datasets
    for dd in ds:
        if dd == 'dev':
            bar = Bar('Development Set...', max=len(valid[1]))
            caps = []
            for i in range(len(valid[1])):
                sample = _send_job(valid[1][i])
                cap = _seqs2words(sample)
                caps.append(cap)
                with open(saveto+'_status.json', 'w') as f:
                    json.dump({'current': i, 'total': len(valid[1])}, f)
                bar.next()
            bar.finish()
            with open(saveto, 'w') as f:
                print >>f, '\n'.join(caps)
            print 'Done'
        if dd == 'test':
            # create a progress bar here as well so bar.next()/bar.finish() are defined
            bar = Bar('Test Set...', max=len(test[1]))
            caps = []
            for i in range(len(test[1])):
                sample = _send_job(test[1][i])
                cap = _seqs2words(sample)
                caps.append(cap)
                with open(saveto+'_status.json', 'w') as f:
                    json.dump({'current': i, 'total': len(test[1])}, f)
                bar.next()
            bar.finish()
            with open(saveto, 'w') as f:
                print >>f, '\n'.join(caps)
            print 'Done'
def main(model, saveto, k=5, normalize=False, zero_pad=False, n_process=5,
         datasets='dev,test', sampling=False, pkl_name=None):
    # load model model_options
    if pkl_name is None:
        pkl_name = model
    with open('%s.pkl' % pkl_name, 'rb') as f:
        options = pkl.load(f)

    # fetch data, skip ones we aren't using to save time
    load_data, prepare_data = get_dataset(options['dataset'])
    _, valid, test, worddict = load_data(
        load_train=False,
        load_dev=True if 'dev' in datasets else False,
        load_test=True if 'test' in datasets else False)

    # <eos> means end of sequence (aka periods), UNK means unknown
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # leftover debugging breakpoint, disabled so the script runs unattended
    # ipdb.set_trace()

    # create processes
    queue = Queue()
    rqueue = Queue()
    processes = [None] * n_process
    for midx in xrange(n_process):
        processes[midx] = Process(target=gen_model,
                                  args=(queue, rqueue, midx, model, options,
                                        k, normalize, word_idict, sampling))
        processes[midx].start()

    # index -> words
    def _seqs2words(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    break
                ww.append(word_idict[w])
            capsw.append(' '.join(ww))
        return capsw

    # unsparsify, reshape, and queue
    def _send_jobs(contexts):
        for idx, ctx in enumerate(contexts):
            cc = ctx.todense().reshape([14*14, 512])
            if zero_pad:
                cc0 = numpy.zeros((cc.shape[0]+1, cc.shape[1])).astype('float32')
                cc0[:-1, :] = cc
            else:
                cc0 = cc
            queue.put((idx, cc0))

    # retrieve caption from process
    def _retrieve_jobs(n_samples):
        caps = [None] * n_samples
        for idx in xrange(n_samples):
            resp = rqueue.get()
            caps[resp[0]] = resp[1]
            if numpy.mod(idx, 10) == 0:
                print 'Sample ', (idx+1), '/', n_samples, ' Done'
        return caps

    ds = datasets.strip().split(',')

    # send all the features for the various datasets
    for dd in ds:
        if dd == 'dev':
            print 'Development Set...',
            _send_jobs(valid[1])
            vvv = valid[1].toarray()
            caps = _seqs2words(_retrieve_jobs(len(vvv)))
            print caps
            with open(saveto+'.dev.txt', 'w') as f:
                print >>f, '\n'.join(caps)
            print 'Done'
        if dd == 'test':
            print 'Test Set...',
            _send_jobs(test[1])
            vvv = test[1].toarray()
            caps = _seqs2words(_retrieve_jobs(len(vvv)))
            print caps
            with open(saveto+'.test.txt', 'w') as f:
                print >>f, '\n'.join(caps)
            print 'Done'

    # end processes
    for midx in xrange(n_process):
        queue.put(None)
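# Hedged sketch (not the repo's gen_model) of a worker compatible with the
# queue protocol used above: each process pulls (idx, ctx) items off `queue`
# until it sees the None sentinel, and pushes (idx, caption) onto `rqueue` so
# _retrieve_jobs can reassemble the results in order.
def _example_worker(queue, rqueue, pidx, model, options, k, normalize,
                    word_idict, sampling):
    while True:
        req = queue.get()
        if req is None:          # one sentinel per process is sent at shutdown
            break
        idx, ctx = req
        caption = [0]            # placeholder; a real worker runs beam search on ctx
        rqueue.put((idx, caption))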
def train(dim_word=100,  # word vector dimensionality
          ctx_dim=512,  # context vector dimensionality
          dim=1000,  # the number of LSTM units
          attn_type='deterministic',  # [see section 4 from paper]
          n_layers_att=1,  # number of layers used to compute the attention weights
          n_layers_out=1,  # number of layers used to compute logit
          n_layers_lstm=1,  # number of lstm layers
          n_layers_init=1,  # number of layers to initialize LSTM at time 0
          lstm_encoder=False,  # if True, run bidirectional LSTM on input units
          prev2out=False,  # Feed previous word into logit
          ctx2out=False,  # Feed attention weighted ctx into logit
          alpha_entropy_c=0.002,  # hard attn param
          RL_sumCost=False,  # hard attn param
          semi_sampling_p=0.5,  # hard attn param
          temperature=1.,  # hard attn param
          patience=10,
          max_epochs=5000,
          dispFreq=100,
          decay_c=0.,  # weight decay coeff
          alpha_c=0.,  # doubly stochastic coeff
          lrate=0.01,  # used only for SGD
          selector=False,  # selector (see paper)
          n_words=10000,  # vocab size
          maxlen=100,  # maximum length of the description
          optimizer='rmsprop',
          batch_size=16,
          valid_batch_size=2,  # changed from 16
          saveto='model.npz',  # relative path of saved model file
          validFreq=1000,
          saveFreq=1000,  # save the parameters after every saveFreq updates
          sampleFreq=5,  # generate some samples after every sampleFreq updates
          data_path='./data',  # path to find data
          dataset='flickr30k',
          dictionary=None,  # word dictionary
          use_dropout=False,  # setting this true turns on dropout at various points
          use_dropout_lstm=False,  # dropout on lstm gates
          reload_=False,
          save_per_epoch=False):  # this saves down the model every epoch

    # hyperparam dict
    model_options = locals().copy()
    model_options = validate_options(model_options)

    # reload options
    if reload_ and os.path.exists(saveto):
        print "Reloading options"
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print "Using the following parameters:"
    print model_options

    print 'Loading data'
    load_data, prepare_data = get_dataset(dataset)
    train, valid, test, worddict = load_data(path=data_path)

    # index 0 and 1 always code for the end of sentence and unknown token
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # Initialize (or reload) the parameters using 'model_options'
    # then build the Theano graph
    print 'Building model'
    params = init_params(model_options)
    if reload_ and os.path.exists(saveto):
        print "Reloading model"
        params = load_params(saveto, params)

    # numpy arrays -> theano shared variables
    tparams = init_tparams(params)

    # In order, we get:
    #   1) trng - theano random number generator
    #   2) use_noise - flag that turns on dropout
    #   3) inps - inputs for f_grad_shared
    #   4) cost - log likelihood for each sentence
    #   5) opts_out - optional outputs (e.g selector)
    trng, use_noise, \
        inps, alphas, alphas_sample, \
        cost, \
        opt_outs = \
        build_model(tparams, model_options)

    # To sample, we use beam search: 1) f_init is a function that initializes
    # the LSTM at time 0 [see top right of page 4], 2) f_next returns the distribution over
    # words and also the new "initial state/memory" see equation
    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, use_noise, trng)

    # we want the cost without any of the regularizers
    # define the log probability
    f_log_probs = theano.function(inps, -cost, profile=False,
                                  updates=opt_outs['attn_updates']
                                  if model_options['attn_type'] == 'stochastic'
                                  else None,
                                  allow_input_downcast=True)

    # Define the cost function + Regularization
    cost = cost.mean()
    # add L2 regularization costs
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # Doubly stochastic regularization
    if alpha_c > 0.:
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * ((1.-alphas.sum(0))**2).sum(0).mean()
        cost += alpha_reg

    hard_attn_updates = []
    # Backprop!
    if model_options['attn_type'] == 'deterministic':
        grads = tensor.grad(cost, wrt=itemlist(tparams))
    else:
        # shared variables for hard attention
        baseline_time = theano.shared(numpy.float32(0.), name='baseline_time')
        opt_outs['baseline_time'] = baseline_time
        alpha_entropy_c = theano.shared(numpy.float32(alpha_entropy_c), name='alpha_entropy_c')
        alpha_entropy_reg = alpha_entropy_c * (alphas*tensor.log(alphas)).mean()
        # [see Section 4.1: Stochastic "Hard" Attention for derivation of this learning rule]
        if model_options['RL_sumCost']:
            grads = tensor.grad(cost, wrt=itemlist(tparams),
                                disconnected_inputs='raise',
                                known_grads={alphas: (baseline_time-opt_outs['masked_cost'].mean(0))[None, :, None]/10. *
                                             (-alphas_sample/alphas) + alpha_entropy_c*(tensor.log(alphas) + 1)})
        else:
            grads = tensor.grad(cost, wrt=itemlist(tparams),
                                disconnected_inputs='raise',
                                known_grads={alphas: opt_outs['masked_cost'][:, :, None]/10. *
                                             (alphas_sample/alphas) + alpha_entropy_c*(tensor.log(alphas) + 1)})
        # [equation on bottom left of page 5]
        hard_attn_updates += [(baseline_time, baseline_time * 0.9 + 0.1 * opt_outs['masked_cost'].mean())]
        # updates from scan
        hard_attn_updates += opt_outs['attn_updates']

    # to get the cost after regularization or the gradients, use this
    # f_cost = theano.function([x, mask, ctx], cost, profile=False)
    # f_grad = theano.function([x, mask, ctx], grads, profile=False)

    # f_grad_shared computes the cost and updates adaptive learning rate variables
    # f_update updates the weights of the model
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost, hard_attn_updates)

    print 'Optimization'

    # [See note in section 4.3 of paper]
    train_iter = HomogeneousData(train, batch_size=batch_size, maxlen=maxlen)

    if valid:
        kf_valid = KFold(len(valid[0]), n_folds=len(valid[0])/valid_batch_size, shuffle=False)
    if test:
        kf_test = KFold(len(test[0]), n_folds=len(test[0])/valid_batch_size, shuffle=False)

    # history_errs is a bare-bones training log that holds the validation and test error
    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = numpy.load(saveto)['history_errs'].tolist()
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0])/batch_size
    if saveFreq == -1:
        saveFreq = len(train[0])/batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0])/batch_size

    uidx = 0
    estop = False
    for eidx in xrange(max_epochs):
        n_samples = 0

        print 'Epoch ', eidx

        for caps in train_iter:
            n_samples += len(caps)
            uidx += 1
            # turn on dropout
            use_noise.set_value(1.)

            # preprocess the caption, recording the
            # time spent to help detect bottlenecks
            pd_start = time.time()
            x, mask, ctx = prepare_data(caps,
                                        train[1],
                                        worddict,
                                        maxlen=maxlen,
                                        n_words=n_words)
            pd_duration = time.time() - pd_start

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                continue

            # get the cost for the minibatch, and update the weights
            ud_start = time.time()
            cost = f_grad_shared(x, mask, ctx)
            f_update(lrate)
            ud_duration = time.time() - ud_start  # some monitoring for each mini-batch

            # Numerical stability check
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'PD ', pd_duration, 'UD ', ud_duration

            # Checkpoint
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                if best_p is not None:
                    params = copy.copy(best_p)
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

            # Print a generated sample as a sanity check
            if numpy.mod(uidx, sampleFreq) == 0:
                # turn off dropout first
                use_noise.set_value(0.)
                x_s = x
                mask_s = mask
                ctx_s = ctx
                # generate and decode a subset of the current training batch
                for jj in xrange(numpy.minimum(10, len(caps))):
                    sample, score = gen_sample(tparams, f_init, f_next, ctx_s[jj], model_options,
                                               trng=trng, k=5, maxlen=30, stochastic=False)
                    # Decode the sample from encoding back to words
                    print 'Truth ', jj, ': ',
                    for vv in x_s[:, jj]:
                        if vv == 0:
                            break
                        if vv in word_idict:
                            print word_idict[vv],
                        else:
                            print 'UNK',
                    print
                    for kk, ss in enumerate([sample[0]]):
                        print 'Sample (', kk, ') ', jj, ': ',
                        for vv in ss:
                            if vv == 0:
                                break
                            if vv in word_idict:
                                print word_idict[vv],
                            else:
                                print 'UNK',
                        print

            # Log validation loss + checkpoint the model with the best validation log likelihood
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                train_err = 0
                valid_err = 0
                test_err = 0

                if valid:
                    valid_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, valid, kf_valid).mean()
                if test:
                    test_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, test, kf_test).mean()

                history_errs.append([valid_err, test_err])

                # the model with the best validation log likelihood is saved separately with a different name
                if uidx == 0 or valid_err <= numpy.array(history_errs)[:, 0].min():
                    best_p = unzip(tparams)
                    print 'Saving model with best validation ll'
                    params = copy.copy(best_p)
                    params = unzip(tparams)
                    numpy.savez(saveto+'_bestll', history_errs=history_errs, **params)
                    bad_counter = 0

                # abort training if perplexity has been increasing for too long
                if eidx > patience and len(history_errs) > patience and valid_err >= numpy.array(history_errs)[:-patience, 0].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err

        print 'Seen %d samples' % n_samples

        if estop:
            break

        if save_per_epoch:
            numpy.savez(saveto + '_epoch_' + str(eidx + 1), history_errs=history_errs, **unzip(tparams))

    # use the best nll parameters for final checkpoint (if they exist)
    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    train_err = 0
    valid_err = 0
    test_err = 0
    if valid:
        valid_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, valid, kf_valid)
    if test:
        test_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, test, kf_test)

    print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err

    params = copy.copy(best_p)
    numpy.savez(saveto, zipped_params=best_p, train_err=train_err,
                valid_err=valid_err, test_err=test_err, history_errs=history_errs,
                **params)

    return train_err, valid_err, test_err
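# Hedged usage sketch (not part of the original file): a thin helper that kicks
# off training with a few of the keyword arguments defined above; everything
# not listed keeps its default value.
def _train_flickr30k_sketch():
    return train(attn_type='deterministic',
                 dataset='flickr30k',
                 batch_size=16,
                 max_epochs=5000,
                 use_dropout=True,
                 reload_=False)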
def main(model, saveto, k=5, normalize=False, zero_pad=False,
         datasets='dev,test', sampling=False, pkl_name=None):
    # load model model_options
    if pkl_name is None:
        pkl_name = model
    with open('%s.pkl' % pkl_name, 'rb') as f:
        options = pkl.load(f)

    # fetch data, skip ones we aren't using to save time
    load_data, prepare_data = get_dataset(options['dataset'])
    train, valid, test, worddict = load_data(
        path='./data/flickr8k/',
        load_train=True if 'train' in datasets else False,
        load_dev=True if 'dev' in datasets else False,
        load_test=True if 'test' in datasets else False)
    # leftover debugging breakpoints, disabled so the script runs unattended
    # import pdb; pdb.set_trace()

    # <eos> means end of sequence (aka periods), UNK means unknown
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # index -> words
    def _seqs2words(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    break
                ww.append(word_idict[w])
            capsw.append(' '.join(ww))
        return capsw

    # process all dev examples
    def _process_examples(contexts):
        caps = [None] * contexts.shape[0]
        for idx, ctx in enumerate(contexts):
            cc = ctx.todense().reshape([14*14, 512])
            if zero_pad:
                cc0 = numpy.zeros((cc.shape[0]+1, cc.shape[1])).astype('float32')
                cc0[:-1, :] = cc
            else:
                cc0 = cc
            resp = gen_model(idx, cc0, model, options, k, normalize,
                             word_idict, sampling)
            caps[resp[0]] = resp[1]
            print 'Sample ', (idx+1), '/', contexts.shape[0], ' Done'
            print resp[1]
        return caps

    ds = datasets.strip().split(',')

    # send all the features for the various datasets
    for dd in ds:
        if dd == 'train':
            print 'Training Set...',
            caps = _seqs2words(_process_examples(train[1]))
            # import pdb; pdb.set_trace()
            with open(saveto+'.train.txt', 'w') as f:
                print >>f, '\n'.join(caps)
            print 'Done'
        if dd == 'dev':
            print 'Development Set...',
            caps = _seqs2words(_process_examples(valid[1]))
            # import pdb; pdb.set_trace()
            with open(saveto+'.dev.txt', 'w') as f:
                print >>f, '\n'.join(caps)
            print 'Done'
        if dd == 'test':
            print 'Test Set...',
            caps = _seqs2words(_process_examples(test[1]))
            with open(saveto+'.test.txt', 'w') as f:
                print >>f, '\n'.join(caps)
            print 'Done'
def main(model, saveto, k=5, normalize=False, zero_pad=False, n_process=5,
         datasets='train,dev,test', sampling=False, pkl_name=None,
         cate_name=None, out_name=None):
    lines = open(cate_name, 'r').read().splitlines()
    ref_images = []
    weights = []
    for line in lines:
        s = line.split(',')
        ref_images.append(s[0])
        weights.append(int(s[1]))

    # load model model_options
    if pkl_name is None:
        pkl_name = model[0]
    with open('%s.pkl' % pkl_name, 'rb') as f:
        options = pkl.load(f)

    # fetch data, skip ones we aren't using to save time
    load_data, prepare_data = get_dataset(options['dataset'])
    train, valid, test, worddict = load_data(
        load_train=True if 'train' in datasets else False,
        load_dev=True if 'dev' in datasets else False,
        load_test=True if 'test' in datasets else False)

    # <eos> means end of sequence (aka periods), UNK means unknown
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # create processes
    queue = Queue()
    rqueue = Queue()
    processes = [None] * n_process
    for midx in xrange(n_process):
        processes[midx] = Process(target=gen_model,
                                  args=(queue, rqueue, midx, model, options,
                                        k, normalize, word_idict, sampling))
        processes[midx].start()

    # index -> words
    def _seqs2words(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    break
                ww.append(word_idict[w])
            capsw.append(' '.join(ww))
        return capsw

    # unsparsify, reshape, and queue
    def _send_jobs(contexts):
        for idx, ctx in enumerate(contexts):
            cc = ctx.todense().reshape([14*14, 512])
            if zero_pad:
                cc0 = numpy.zeros((cc.shape[0]+1, cc.shape[1])).astype('float32')
                cc0[:-1, :] = cc
            else:
                cc0 = cc
            queue.put((idx, cc0))
        return

    # retrieve caption from process
    def _retrieve_jobs(n_samples):
        caps = [None] * n_samples
        scores = [None] * n_samples
        for idx in xrange(n_samples):
            resp = rqueue.get()
            caps[resp[0]] = resp[1]
            scores[resp[0]] = resp[2]
            if numpy.mod(idx, 10) == 0:
                print 'Sample ', (idx+1), '/', n_samples, ' Done'
        return caps, scores

    ds = datasets.strip().split(',')

    # send all the features for the various datasets
    for dd in ds:
        if dd == 'dev':
            print 'Development Set...',
            _send_jobs(valid[1])
            print 'Finished sending DEV'
            caps, scores = _retrieve_jobs(valid[1].shape[0])
            caps = _seqs2words(caps)
            print 'Finished Generating DEV'
            with open(saveto+'.dev.txt', 'w') as f:
                print >>f, '\n'.join(caps)
            with open(saveto+'.dev.scores.txt', 'w') as f:
                for score in scores:
                    print >>f, str(score)+'\n'
            with open(saveto+'.dev.info.txt', 'w') as f:
                for idx in range(len(scores)):
                    print >>f, caps[idx] + '\n' + ref_images[idx] + '\n' + str(scores[idx]) + '\n'
            # sents = []
            # for sen in valid[0]:
            #     while len(sents) < sen[1]+1:
            #         sents.append([])
            #     sents[sen[1]].append(sen[0].strip())
            # sents2 = zip(*sents)
            # for idd in range(5):
            #     with open(saveto+'gold'+str(idd)+'.dev.txt', 'w') as f:
            #         print >>f, '\n'.join(sents2[idd])
            print 'Done'
        if dd == 'test':
            print 'Test Set...',
            _send_jobs(test[1])
            print 'Finished sending TEST'
            caps, scores = _retrieve_jobs(test[1].shape[0])
            caps = _seqs2words(caps)
            print 'Finished Generating TEST'
            with open(saveto+'.test.txt', 'w') as f:
                print >>f, '\n'.join(caps)
            with open(saveto+'.test.scores.txt', 'w') as f:
                for score in scores:
                    print >>f, str(score)+'\n'
            with open(saveto+'.test.info.txt', 'w') as f:
                for idx in range(len(scores)):
                    print >>f, caps[idx] + '\n' + ref_images[idx] + '\n' + str(scores[idx]) + '\n'
            # sents = []
            # for sen in test[0]:
            #     while len(sents) < sen[1]+1:
            #         sents.append([])
            #     sents[sen[1]].append(sen[0].strip())
            # sents2 = zip(*sents)
            # for idd in range(5):
            #     with open(saveto+'gold'+str(idd)+'.test.txt', 'w') as f:
            #         print >>f, '\n'.join(sents2[idd])
            print 'Done'
        if dd == 'train':
            print 'Train Set...',
            _send_jobs(train[1])
            print 'Finished sending TRAIN'
            caps, scores = _retrieve_jobs(train[1].shape[0])
            caps = _seqs2words(caps)
            # all_caps, all_scores = _retrieve_jobs(train[1].shape[0])
            # all_caps = _seqs2words(all_caps)
            # caps = []
            # scores = []
            # index = 0
            # for i in xrange(len(weights)):
            #     if weights[i] == 0:
            #         scores.append(0)
            #     else:
            #         caps.append(all_caps[index])
            #         scores.append(all_scores[index])
            #         index += weights[i]
            print 'Finished Generating TRAIN'
            with open(saveto+'.train.txt', 'w') as f:
                print >>f, '\n'.join(caps)
            # with open(saveto+'.train.scores.txt', 'w') as f:
            #     for score in scores:
            #         print >>f, str(score)+'\n'
            threshold = 1.0
            avgScore = sum(scores) / float(len(scores))
            totalWeight = float(sum(weights))
            loss = 0
            with open(out_name, 'w') as f:
                for i in range(len(scores)):
                    if scores[i] > threshold:
                        loss += float(weights[i]) / totalWeight
                        # modelWeight += float(weights[i])/totalWeight / scores[i]
                    if scores[i] > 1.2*avgScore and weights[i] <= 10:
                        weights[i] = weights[i] + 1
                    if scores[i] < 0.5*avgScore and weights[i] > 0:
                        weights[i] = weights[i] - 1
                    print >>f, ref_images[i] + ',' + str(weights[i])
            modelWeight = 0.5 * numpy.log(1/loss - 1)
            with open(cate_name[:-4]+'.info.txt', 'w') as f:
                print >>f, 'ModelWeight:' + str(modelWeight)
                for idx in range(len(scores)):
                    print >>f, caps[idx] + '\n' + ref_images[idx] + '\n' + str(scores[idx]) + '\n'
            # sents = []
            # for sen in test[0]:
            #     while len(sents) < sen[1]+1:
            #         sents.append([])
            #     sents[sen[1]].append(sen[0].strip())
            # sents2 = zip(*sents)
            # for idd in range(5):
            #     with open(saveto+'gold'+str(idd)+'.test.txt', 'w') as f:
            #         print >>f, '\n'.join(sents2[idd])
            print 'Done'

    # end processes
    for midx in xrange(n_process):
        queue.put(None)
    return
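# Stand-alone, hypothetical restatement of the reweighting rule used in the
# 'train' branch above (names are illustrative): samples scoring above
# `threshold` contribute their weight to `loss`, per-sample weights are nudged
# up or down relative to the average score, and the model weight follows the
# AdaBoost-style formula 0.5 * log(1/loss - 1).
def _reweight_sketch(scores, weights, threshold=1.0):
    import math
    avg_score = sum(scores) / float(len(scores))
    total_weight = float(sum(weights))
    loss = sum(float(w) / total_weight
               for s, w in zip(scores, weights) if s > threshold)
    new_weights = []
    for s, w in zip(scores, weights):
        if s > 1.2 * avg_score and w <= 10:
            w += 1
        if s < 0.5 * avg_score and w > 0:
            w -= 1
        new_weights.append(w)
    model_weight = 0.5 * math.log(1. / loss - 1.)
    return new_weights, model_weight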
def train(dim_word=300,  # word vector dimensionality
          ctx_dim=300,  # context vector dimensionality
          semantic_dim=300,
          dim=1000,  # the number of LSTM units
          cnn_dim=4096,  # CNN feature dimension
          n_layers_att=1,  # number of layers used to compute the attention weights
          n_layers_out=1,  # number of layers used to compute logit
          n_layers_lstm=1,  # number of lstm layers
          n_layers_init=1,  # number of layers to initialize LSTM at time 0
          lstm_encoder=True,  # if True, run bidirectional LSTM on input units
          prev2out=False,  # Feed previous word into logit
          ctx2out=False,  # Feed attention weighted ctx into logit
          cutoff=10,
          patience=5,
          max_epochs=30,
          dispFreq=500,
          decay_c=0.,  # weight decay coeff
          alpha_c=0.,  # doubly stochastic coeff
          lrate=1e-4,  # used only for SGD
          selector=False,  # selector (see paper)
          maxlen=30,  # maximum length of the description
          optimizer='rmsprop',
          pretrained='',
          batch_size=256,
          saveto='model',  # relative path of saved model file
          saveFreq=1000,  # save the parameters after every saveFreq updates
          sampleFreq=100,  # generate some samples after every sampleFreq updates
          embedding='../Data/GloVe/vocab_glove.pkl',
          cnn_type='vgg',
          prefix='../Data',  # path to find data
          dataset='coco',
          criterion='Bleu_4',
          switch_test_val=False,
          use_cnninit=True,
          use_dropout=True,  # setting this true turns on dropout at various points
          use_dropout_lstm=False,  # dropout on lstm gates
          save_per_epoch=False):  # this saves down the model every epoch

    # hyperparam dict
    model_options = locals().copy()
    model_options = validate_options(model_options)

    # reload options
    if os.path.exists('%s.pkl' % saveto):
        print "Reloading options"
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print "Using the following parameters:"
    print model_options

    print 'Loading data'
    load_data, prepare_data = get_dataset(model_options['dataset'])

    # Load data from data path
    if 'switch_test_val' in model_options and model_options['switch_test_val']:
        train, valid, worddict = load_data(path=osp.join(model_options['prefix'],
                                                         model_options['dataset']),
                                           options=model_options,
                                           load_train=True,
                                           load_test=True)
    else:
        train, valid, worddict = load_data(path=osp.join(model_options['prefix'],
                                                         model_options['dataset']),
                                           options=model_options,
                                           load_train=True,
                                           load_val=True)

    # Automatically calculate the update frequency
    validFreq = len(train[0]) / model_options['batch_size']
    print "Validation frequency is %d" % validFreq

    word_idict = {vv: kk for kk, vv in worddict.iteritems()}
    model_options['n_words'] = len(worddict)

    # Initialize (or reload) the parameters using 'model_options'
    # then build the Theano graph
    print 'Building model'
    params = init_params(model_options)

    # Initialize it with glove
    if 'VCemb' in params:
        params['VCemb'] = read_pkl(model_options['embedding']).astype('float32')

    # If a checkpoint from the same experiment already exists, resume from it
    # instead of using the pretrained weights
    if os.path.exists('%s.npz' % saveto):
        print "Reloading model"
        params = load_params('%s.npz' % saveto, params)
    elif pretrained != '':
        params = load_params(pretrained, params, False)  # Only pretrain the Language model

    # numpy arrays -> theano shared variables
    tparams = init_tparams(params)

    # In order, we get:
    #   1) trng - theano random number generator
    #   2) use_noise - flag that turns on dropout
    #   3) inps - inputs for f_grad_shared
    #   4) cost - log likelihood for each sentence
    #   5) opts_out - optional outputs (e.g selector)
    trng, use_noise, \
        inps, alphas, \
        cost, \
        opt_outs = \
        build_model(tparams, model_options)

    # Load evaluator to calculate bleu score
    evaluator = cocoEvaluation(model_options['dataset'])

    # To sample, we use beam search: 1) f_init is a function that initializes
    # the LSTM at time 0 [see top right of page 4], 2) f_next returns the distribution over
    # words and also the new "initial state/memory" see equation
    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, use_noise, trng)

    # we want the cost without any of the regularizers
    # define the log probability
    f_log_probs = theano.function(inps, -cost, profile=False,
                                  updates=None, allow_input_downcast=True)

    # Define the cost function + Regularization
    cost = cost.mean()
    # add L2 regularization costs
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # Doubly stochastic regularization
    if alpha_c > 0.:
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = sum([alpha_c * ((1. - alpha.sum(0))**2).sum(0).mean()
                         for alpha in alphas])
        cost += alpha_reg

    # Backprop!
    grads = tensor.grad(cost, wrt=itemlist(tparams))

    # to get the cost after regularization or the gradients, use this
    # f_grad_shared computes the cost and updates adaptive learning rate variables
    # f_update updates the weights of the model
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = eval(model_options['optimizer'])(lr, tparams,
                                                               grads, inps, cost)

    print 'Optimization'

    train_iter = HomogeneousData(train, batch_size=batch_size,
                                 maxlen=model_options['maxlen'])

    # history_bleu is a bare-bones training log, reload history
    history_bleu = []
    if os.path.exists('%s.npz' % saveto):
        history_bleu = numpy.load('%s.npz' % saveto)['history_bleu'].tolist()
    start_epochs = len(history_bleu)
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0]) / batch_size

    uidx = 0
    estop = False
    for eidx in xrange(start_epochs, model_options['max_epochs']):
        n_samples = 0

        print 'Epoch ', eidx

        for caps in train_iter:
            n_samples += len(caps)
            uidx += 1
            # turn on dropout
            use_noise.set_value(1.)

            # preprocess the caption, recording the
            # time spent to help detect bottlenecks
            pd_start = time.time()
            x, mask, ctx, cnn_feats = prepare_data(caps, train[1], train[2],
                                                   worddict, model_options)
            pd_duration = time.time() - pd_start

            if x is None:
                print 'Minibatch with zero sample under length ', model_options['maxlen']
                continue

            # get the cost for the minibatch, and update the weights
            ud_start = time.time()
            cost = f_grad_shared(x, mask, ctx, cnn_feats)
            print "Epoch %d, Updates: %d, Cost is: %f" % (eidx, uidx, cost)
            f_update(model_options['lrate'])
            ud_duration = time.time() - ud_start  # some monitoring for each mini-batch

            # Numerical stability check
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'PD ', pd_duration, 'UD ', ud_duration

            # Print a generated sample as a sanity check
            if numpy.mod(uidx, model_options['sampleFreq']) == 0:
                # turn off dropout first
                use_noise.set_value(0.)
                x_s = x
                mask_s = mask
                ctx_s = ctx
                # generate and decode a subset of the current training batch
                for jj in xrange(numpy.minimum(10, len(caps))):
                    sample, score, alphas = gen_sample(f_init, f_next, ctx_s[jj],
                                                       cnn_feats[jj], model_options,
                                                       trng=trng,
                                                       maxlen=model_options['maxlen'])
                    # Decode the sample from encoding back to words
                    print 'Truth ', jj, ': ',
                    print seqs2words(x_s[:, jj], word_idict)
                    for kk, ss in enumerate([sample[0]]):
                        print 'Sample (', kk, ') ', jj, ': ',
                        print seqs2words(ss, word_idict)

            # Log validation score + checkpoint the model with the best validation score
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)

                # Do evaluation on validation set
                imgid = collapse([elem[-1] for elem in valid[0]])
                caps = process_examples([f_init], [f_next], imgid, valid[1],
                                        valid[2], word_idict, model_options)
                folder = osp.join('../output', '%s_%s' % (saveto, 'val'))
                if not osp.exists(folder):
                    os.mkdir(folder)
                with open(osp.join(folder, 'captions_val2014_results.json'), 'w') as f:
                    json.dump(caps, f)
                eva_result = evaluator.evaluate(folder, False)
                if model_options['criterion'] == 'combine':
                    history_bleu.append(eva_result['Bleu_4'] + eva_result['CIDEr'])
                else:
                    history_bleu.append(eva_result[model_options['criterion']])

                # the model with the best validation score is saved separately with a different name
                if uidx == 0 or history_bleu[-1] == max(history_bleu):
                    best_p = unzip(tparams)
                    print 'Saving model with best validation score'
                    params = copy.copy(best_p)
                    params = unzip(tparams)
                    numpy.savez(saveto + '_bestll', history_bleu=history_bleu, **params)
                    bad_counter = 0

                # abort training if the validation metric has not improved for too long
                if len(history_bleu) > model_options['patience'] and \
                        history_bleu[-1] <= max(history_bleu[:-model_options['patience']]):
                    bad_counter += 1
                    if bad_counter > model_options['patience']:
                        print 'Early Stop!'
                        estop = True
                        break

                print ' BLEU-4 score ', history_bleu[-1]

            # Checkpoint
            if numpy.mod(uidx, model_options['saveFreq']) == 0:
                print 'Saving...',
                if best_p is not None:
                    params = copy.copy(best_p)
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_bleu=history_bleu, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

        print 'Seen %d samples' % n_samples

        if estop:
            break

        if model_options['save_per_epoch']:
            numpy.savez(saveto + '_epoch_' + str(eidx + 1),
                        history_bleu=history_bleu, **unzip(tparams))

    # use the best parameters for the final checkpoint (if they exist)
    if best_p is not None:
        zipp(best_p, tparams)

    params = copy.copy(best_p)
    numpy.savez(saveto, zipped_params=best_p, history_bleu=history_bleu, **params)
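# Hedged sketch (not part of the original file) of how a finished run could be
# reloaded for later sampling or evaluation, reusing only helpers that already
# appear above (pkl, init_params, load_params, init_tparams); the checkpoint
# file names are illustrative.
def _load_checkpoint_sketch(saveto='model'):
    with open('%s.pkl' % saveto, 'rb') as f:
        options = pkl.load(f)
    params = init_params(options)
    params = load_params('%s.npz' % saveto, params)
    tparams = init_tparams(params)
    return options, tparams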