def gen_model(idx, context, model, options, k, normalize, word_idict, sampling):
    import theano
    from theano import tensor
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

    trng = RandomStreams(1234)
    # a value of zero indicates we are not using dropout in the graph
    use_noise = theano.shared(numpy.float32(0.), name='use_noise')

    # get the parameters
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_tparams(params)

    # build the sampling computational graph
    # see capgen.py for more detailed explanations
    f_init, f_next = build_sampler(tparams, options, use_noise, trng, sampling=sampling)

    def _gencap(cc0):
        sample, score = gen_sample(tparams, f_init, f_next, cc0, options,
                                   trng=trng, k=k, maxlen=200, stochastic=False)
        # adjust for length bias
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        sidx = numpy.argmin(score)
        return sample[sidx]

    seq = _gencap(context)
    return (idx, seq)
def gen_model(idx, context, model, options, k, normalize, word_idict, sampling, params, tparams):
    import theano
    from theano import tensor
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

    trng = RandomStreams(1234)
    # a value of zero indicates we are not using dropout in the graph
    use_noise = theano.shared(numpy.float32(0.), name='use_noise')

    # build the sampling computational graph
    # see capgen.py for more detailed explanations
    f_init, f_next = build_sampler(tparams, options, use_noise, trng, sampling=sampling)

    def _gencap(cc0):
        sample, score = gen_sample(tparams, f_init, f_next, cc0, options,
                                   trng=trng, k=k, maxlen=200, stochastic=False)
        # adjust for length bias
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        sidx = numpy.argmin(score)
        return sample[sidx]

    seq = _gencap(context)
    return (idx, seq)
def _build(self):
    print 'Building model...'
    # build the sampling functions and model
    self.trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.), name='use_noise')

    params = capgen.init_params(self.options)
    params = capgen.load_params(self.model, params)
    self.tparams = capgen.init_tparams(params)

    # word index
    self.f_init, self.f_next = capgen.build_sampler(self.tparams, self.options, use_noise, self.trng)

    self.trng, use_noise, inps, \
        alphas, alphas_samples, cost, opt_outs = capgen.build_model(self.tparams, self.options)

    # get the alphas and selector value [called \beta in the paper]
    # create update rules for the stochastic attention
    hard_attn_updates = []
    if self.options['attn_type'] == 'stochastic':
        baseline_time = theano.shared(numpy.float32(0.), name='baseline_time')
        hard_attn_updates += [(baseline_time, baseline_time * 0.9 + 0.1 * opt_outs['masked_cost'].mean())]
        hard_attn_updates += opt_outs['attn_updates']

    self.f_alpha = theano.function(inps, alphas, name='f_alpha', updates=hard_attn_updates)
    if self.options['selector']:
        self.f_sels = theano.function(inps, opt_outs['selector'], name='f_sels', updates=hard_attn_updates)
    print 'Done'
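# Hypothetical usage sketch (names are assumptions, not part of the original code):
# once _build has run on some object `viz`, f_alpha can be evaluated on a prepared
# minibatch to inspect the attention weights; x, mask, ctx are assumed to come from
# the dataset's prepare_data helper.
alpha = viz.f_alpha(x, mask, ctx)      # assumed shape: (n_words, n_samples, n_annotations)
if viz.options['selector']:
    sels = viz.f_sels(x, mask, ctx)    # gating scalar (beta in the paper) per time step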
def gen_model(model, options, k, normalize, word_idict, sampling):
    import theano
    from theano import tensor
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

    trng = RandomStreams(1234)
    # DICTIONARY = "lexicon.txt"

    # a value of zero indicates we are not using dropout in the graph
    use_noise = theano.shared(numpy.float32(0.), name='use_noise')

    # get the parameters
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_tparams(params)

    # build the sampling computational graph
    # see capgen.py for more detailed explanations
    f_init, f_next = build_sampler(tparams, options, use_noise, trng, sampling=sampling)

    return (f_init, f_next, tparams, trng)
def gen_model(queue, rqueue, pid, model, options, k, normalize, word_idict, sampling):
    import theano
    from theano import tensor
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

    trng = RandomStreams(1234)
    # a value of zero indicates we are not using dropout in the graph
    use_noise = theano.shared(numpy.float32(0.), name='use_noise')

    # get the parameters
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_tparams(params)

    # build the sampling computational graph
    # see capgen.py for more detailed explanations
    f_init, f_next = build_sampler(tparams, options, use_noise, trng, sampling=sampling)

    def _gencap(cc0):
        sample, score = gen_sample(tparams, f_init, f_next, cc0, options,
                                   trng=trng, k=k, maxlen=200, stochastic=False)
        # adjust for length bias
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        sidx = numpy.argmin(score)
        return sample[sidx]

    while True:
        req = queue.get()
        # exit signal
        if req is None:
            break

        idx, context = req[0], req[1]
        print "Processing example %d in process # %d" % (idx, pid)
        seq = _gencap(context)
        print seq
        rqueue.put((idx, seq))
        print "Added example %d to the result queue" % idx

    print "gen_model process w/ pid %d has returned..." % pid
    return
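# Hypothetical driver sketch (not part of the original code): launching the worker
# above with multiprocessing. The job queue receives (index, context) pairs and a
# None per worker as the exit signal; results arrive on rqueue. `options`,
# `word_idict`, and `contexts` (per-image annotation features) as well as the beam
# width 5, normalize=True, and the model path are assumptions for illustration.
from multiprocessing import Process, Queue

queue, rqueue = Queue(), Queue()
n_process = 2  # assumed worker count
procs = [Process(target=gen_model,
                 args=(queue, rqueue, pid, 'model.npz', options,
                       5, True, word_idict, False))
         for pid in range(n_process)]
for p in procs:
    p.start()
for idx, context in enumerate(contexts):
    queue.put((idx, context))
for _ in procs:
    queue.put(None)          # exit signal, one per worker
results = [rqueue.get() for _ in range(len(contexts))]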
def gen_model(queue, rqueue, pid, model, options, k, normalize, word_idict, sampling):
    import theano
    from theano import tensor
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

    trng = RandomStreams(1234)
    # a value of zero indicates we are not using dropout in the graph
    use_noise = theano.shared(numpy.float32(0.), name='use_noise')

    # tparams_list = []
    # f_init_list = []
    # f_next_list = []
    # for m in model:
    #     params = init_params(options)
    #     params = load_params(m, params)
    #     tparams_list.append(init_tparams(params))
    #     f_init, f_next = build_sampler(tparams_list[-1], options, use_noise, trng, sampling=sampling)
    #     f_init_list.append(f_init)
    #     f_next_list.append(f_next)

    # get the parameters
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_tparams(params)

    # build the sampling computational graph
    # see capgen.py for more detailed explanations
    f_init, f_next = build_sampler(tparams, options, use_noise, trng, sampling=sampling)

    def _gencap(cc0):
        sample, score = gen_sample(tparams, f_init, f_next, cc0, options,
                                   trng=trng, k=k, maxlen=200, stochastic=False)
        # sample, score = gen_sample(tparams, f_init, f_next, cc0, options,
        #                            trng=trng, k=k, maxlen=200, stochastic=False)
        # adjust for length bias
        # if normalize:
        lengths = numpy.array([len(s) for s in sample])
        score = score / lengths
        sidx = numpy.argmin(score)
        return sample[sidx], score[sidx]

    while True:
        req = queue.get()
        # exit signal
        if req is None:
            break

        idx, context = req[0], req[1]
        print pid, '-', idx
        seq, score = _gencap(context)
        rqueue.put((idx, seq, score))

    return
def load_model(model_path):
    # print 'loading model'
    model = model_path
    options = load_pkl(model + '.pkl')

    # build the sampling functions and model
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.), name='use_noise')

    params = capgen.init_params(options)
    params = capgen.load_params(model, params)
    tparams = capgen.init_tparams(params)

    f_init, f_next = capgen.build_sampler(tparams, options, use_noise, trng)
    trng, use_noise, inps, alphas, alphas_samples, cost, opt_outs = capgen.build_model(tparams, options)
    # print 'done'
    return tparams, f_init, f_next, options, trng
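# Hypothetical usage sketch (the model path, `ctx0`, and `word_idict` are
# assumptions): load a trained model and decode one caption for a single image's
# annotation vectors `ctx0` (e.g. a 14*14 x 512 conv feature map, as in _send_job
# below), using the same gen_sample call pattern as the training code.
tparams, f_init, f_next, options, trng = load_model('my_caption_model')
sample, score = capgen.gen_sample(tparams, f_init, f_next, ctx0, options,
                                  trng=trng, k=5, maxlen=30, stochastic=False)
best = sample[numpy.argmin(score)]
words = []
for w in best:
    if w == 0:          # 0 codes for <eos>
        break
    words.append(word_idict.get(w, 'UNK'))
print ' '.join(words)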
def gen_model(model, options):
    trng = RandomStreams(1234)
    # a value of zero indicates we are not using dropout in the graph
    use_noise = theano.shared(numpy.float32(0.), name='use_noise')

    # get the parameters
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_tparams(params)

    # build the sampling computational graph
    # see capgen.py for more detailed explanations
    f_init, f_next = build_sampler(tparams, options, use_noise, trng)

    return f_init, f_next
def gen_model(model, options, k, normalize, word_idict, sampling):
    import theano
    from theano import tensor
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

    trng = RandomStreams(1234)
    # DICTIONARY = "lexicon.txt"

    # a value of zero indicates we are not using dropout in the graph
    use_noise = theano.shared(numpy.float32(0.), name='use_noise')

    # get the parameters
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_tparams(params)

    # build the sampling computational graph
    # see capgen.py for more detailed explanations
    f_init, f_next = build_sampler(tparams, options, use_noise, trng, sampling=sampling)

    # trie = tr.TrieNode()
    #
    # WordCount = 0
    # for word in open(DICTIONARY, "rt").read().split():
    #     word = string.lower(word)
    #     WordCount += 1
    #     trie.insert(word)
    #
    # print "Read %d words" % WordCount
    #
    # def _gencap(cc0):
    #     sample, score = gen_sample(tparams, f_init, f_next, cc0, options,
    #                                trng=trng, k=k, maxlen=200, stochastic=False, alpha=0.0)
    #     # adjust for length bias
    #     if normalize:
    #         lengths = numpy.array([len(s) for s in sample])
    #         score = score / lengths
    #     sidx = numpy.argsort(score)
    #     return [sample[i] for i in sidx]
    # seq = _gencap(context)

    return (f_init, f_next, tparams, trng)
def gen_model(queue, rqueue, pid, model, options, k, normalize, word_idict, sampling):
    import theano
    from theano import tensor
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from capgen import build_sampler, gen_sample, load_params, init_params, init_tparams

    trng = RandomStreams(1234)

    # a value of zero indicates we are not using dropout in the graph
    print 'For the first time'
    k2 = theano.shared(numpy.random.rand(10000, 100).astype('float32'))
    print 'its done'
    use_noise = theano.shared(numpy.float32(0.), name='use_noise')

    params = init_params(options)
    params = load_params(model, params)
    tparams = init_tparams(params)

    # build the sampling computational graph
    # see capgen.py for more detailed explanations
    f_init, f_next = build_sampler(tparams, options, use_noise, trng, sampling=sampling)
    print 'done finished now ...'

    def _gencap(cc0):
        sample, score = gen_sample(tparams, f_init, f_next, cc0, options,
                                   trng=trng, k=k, maxlen=200, stochastic=False)
        # adjust for length bias
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        sidx = numpy.argmin(score)
        return sample[sidx]

    print 'i \'m here now ....'
    while True:
        req = queue.get()
        # exit signal
        if req is None:
            break

        idx, context = req[0], req[1]
        context = context.astype(numpy.float32, copy=False)
        seq = _gencap(context)
        rqueue.put((idx, seq))

    print 'i am out now!'
    return
def train(dim_word=100,  # word vector dimensionality
          ctx_dim=512,  # context vector dimensionality
          dim=1000,  # the number of LSTM units
          attn_type='stochastic',  # [see section 4 from paper]
          n_layers_att=1,  # number of layers used to compute the attention weights
          n_layers_out=1,  # number of layers used to compute logit
          n_layers_lstm=1,  # number of lstm layers
          n_layers_init=1,  # number of layers to initialize LSTM at time 0
          lstm_encoder=False,  # if True, run bidirectional LSTM on input units
          prev2out=False,  # feed previous word into logit
          ctx2out=False,  # feed attention weighted ctx into logit
          alpha_entropy_c=0.002,  # hard attn param
          RL_sumCost=True,  # hard attn param
          semi_sampling_p=0.5,  # hard attn param
          temperature=1.,  # hard attn param
          patience=10,
          max_epochs=5000,
          dispFreq=100,
          decay_c=0.,  # weight decay coeff
          alpha_c=0.,  # doubly stochastic coeff
          lrate=0.01,  # used only for SGD
          selector=False,  # selector (see paper)
          n_words=10000,  # vocab size
          maxlen=100,  # maximum length of the description
          optimizer='rmsprop',
          batch_size=16,
          valid_batch_size=16,
          saveto='model.npz',  # relative path of saved model file
          validFreq=1000,
          saveFreq=1000,  # save the parameters after every saveFreq updates
          sampleFreq=100,  # generate some samples after every sampleFreq updates
          data_path='./data',  # path to find data
          dataset='flickr8k',
          dictionary=None,  # word dictionary
          use_dropout=False,  # setting this true turns on dropout at various points
          use_dropout_lstm=False,  # dropout on lstm gates
          reload_=False,
          save_per_epoch=False):  # this saves down the model every epoch

    # hyperparam dict
    model_options = locals().copy()
    model_options = validate_options(model_options)

    # reload options
    if reload_ and os.path.exists(saveto):
        print "Reloading options"
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print "Using the following parameters:"
    print model_options

    print 'Loading data'
    load_data, prepare_data = get_dataset(dataset)
    train, valid, test, worddict = load_data(path=data_path)
    if dataset == 'coco':
        valid, _ = valid  # the second one contains all the validation data

    # index 0 and 1 always code for the end of sentence and unknown token
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # Initialize (or reload) the parameters using 'model_options'
    # then build the Theano graph
    print 'Building model'
    params = init_params(model_options)
    if reload_ and os.path.exists(saveto):
        print "Reloading model"
        params = load_params(saveto, params)

    # numpy arrays -> theano shared variables
    tparams = init_tparams(params)

    # In order, we get:
    #   1) trng - theano random number generator
    #   2) use_noise - flag that turns on dropout
    #   3) inps - inputs for f_grad_shared
    #   4) cost - log likelihood for each sentence
    #   5) opt_outs - optional outputs (e.g. selector)
    trng, use_noise, \
        inps, alphas, alphas_sample, \
        cost, \
        opt_outs = \
        build_model(tparams, model_options)

    # To sample, we use beam search: 1) f_init is a function that initializes
    # the LSTM at time 0 [see top right of page 4], 2) f_next returns the distribution over
    # words and also the new "initial state/memory" see equation
    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, use_noise, trng)

    # we want the cost without any of the regularizers
    # define the log probability
    f_log_probs = theano.function(inps, -cost, profile=False,
                                  updates=opt_outs['attn_updates']
                                  if model_options['attn_type'] == 'stochastic'
                                  else None,
                                  allow_input_downcast=True)
    # Define the cost function + Regularization
    cost = cost.mean()

    # add L2 regularization costs
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # Doubly stochastic regularization
    if alpha_c > 0.:
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * ((1.-alphas.sum(0))**2).sum(0).mean()
        cost += alpha_reg

    hard_attn_updates = []
    # Backprop!
    if model_options['attn_type'] == 'deterministic':
        grads = tensor.grad(cost, wrt=itemlist(tparams))
    else:
        # shared variables for hard attention
        baseline_time = theano.shared(numpy.float32(0.), name='baseline_time')
        opt_outs['baseline_time'] = baseline_time
        alpha_entropy_c = theano.shared(numpy.float32(alpha_entropy_c), name='alpha_entropy_c')
        alpha_entropy_reg = alpha_entropy_c * (alphas*tensor.log(alphas)).mean()
        # [see Section 4.1: Stochastic "Hard" Attention for derivation of this learning rule]
        if model_options['RL_sumCost']:
            grads = tensor.grad(cost, wrt=itemlist(tparams),
                                disconnected_inputs='raise',
                                known_grads={alphas: (baseline_time-opt_outs['masked_cost'].mean(0))[None, :, None]/10. *
                                             (-alphas_sample/alphas) + alpha_entropy_c*(tensor.log(alphas) + 1)})
        else:
            grads = tensor.grad(cost, wrt=itemlist(tparams),
                                disconnected_inputs='raise',
                                known_grads={alphas: opt_outs['masked_cost'][:, :, None]/10. *
                                             (alphas_sample/alphas) + alpha_entropy_c*(tensor.log(alphas) + 1)})
        # [equation on bottom left of page 5]
        hard_attn_updates += [(baseline_time, baseline_time * 0.9 + 0.1 * opt_outs['masked_cost'].mean())]
        # updates from scan
        hard_attn_updates += opt_outs['attn_updates']

    # to get the cost after regularization or the gradients, use this
    # f_cost = theano.function([x, mask, ctx], cost, profile=False)
    # f_grad = theano.function([x, mask, ctx], grads, profile=False)

    # f_grad_shared computes the cost and updates adaptive learning rate variables
    # f_update updates the weights of the model
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost, hard_attn_updates)

    print 'Optimization'

    # [See note in section 4.3 of paper]
    train_iter = HomogeneousData(train, batch_size=batch_size, maxlen=maxlen)

    if valid:
        kf_valid = KFold(len(valid[0]), n_folds=len(valid[0])/valid_batch_size, shuffle=False)
    if test:
        kf_test = KFold(len(test[0]), n_folds=len(test[0])/valid_batch_size, shuffle=False)

    # history_errs is a bare-bones training log that holds the validation and test error
    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = numpy.load(saveto)['history_errs'].tolist()
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0])/batch_size
    if saveFreq == -1:
        saveFreq = len(train[0])/batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0])/batch_size

    uidx = 0
    estop = False
    for eidx in xrange(max_epochs):
        n_samples = 0

        print 'Epoch ', eidx

        for caps in train_iter:
            n_samples += len(caps)
            uidx += 1
            # turn on dropout
            use_noise.set_value(1.)
            # preprocess the caption, recording the
            # time spent to help detect bottlenecks
            pd_start = time.time()
            x, mask, ctx = prepare_data(caps,
                                        train[1],
                                        worddict,
                                        maxlen=maxlen,
                                        n_words=n_words)
            pd_duration = time.time() - pd_start

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                continue

            # get the cost for the minibatch, and update the weights
            ud_start = time.time()
            cost = f_grad_shared(x, mask, ctx)
            f_update(lrate)
            ud_duration = time.time() - ud_start  # some monitoring for each mini-batch

            # Numerical stability check
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'PD ', pd_duration, 'UD ', ud_duration

            # Checkpoint
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',

                if best_p is not None:
                    params = copy.copy(best_p)
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

            # Print a generated sample as a sanity check
            if numpy.mod(uidx, sampleFreq) == 0:
                # turn off dropout first
                use_noise.set_value(0.)
                x_s = x
                mask_s = mask
                ctx_s = ctx
                # generate and decode a subset of the current training batch
                for jj in xrange(numpy.minimum(10, len(caps))):
                    sample, score = gen_sample(tparams, f_init, f_next, ctx_s[jj], model_options,
                                               trng=trng, k=5, maxlen=30, stochastic=False)
                    # Decode the sample from encoding back to words
                    print 'Truth ', jj, ': ',
                    for vv in x_s[:, jj]:
                        if vv == 0:
                            break
                        if vv in word_idict:
                            print word_idict[vv],
                        else:
                            print 'UNK',
                    print
                    for kk, ss in enumerate([sample[0]]):
                        print 'Sample (', kk, ') ', jj, ': ',
                        for vv in ss:
                            if vv == 0:
                                break
                            if vv in word_idict:
                                print word_idict[vv],
                            else:
                                print 'UNK',
                        print

            # Log validation loss + checkpoint the model with the best validation log likelihood
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                train_err = 0
                valid_err = 0
                test_err = 0

                if valid:
                    valid_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, valid, kf_valid).mean()
                if test:
                    test_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, test, kf_test).mean()

                history_errs.append([valid_err, test_err])

                # the model with the best validation log likelihood is saved separately with a different name
                if uidx == 0 or valid_err <= numpy.array(history_errs)[:, 0].min():
                    best_p = unzip(tparams)
                    print 'Saving model with best validation ll'
                    params = copy.copy(best_p)
                    params = unzip(tparams)
                    numpy.savez(saveto+'_bestll', history_errs=history_errs, **params)
                    bad_counter = 0

                # abort training if perplexity has been increasing for too long
                if eidx > patience and len(history_errs) > patience and valid_err >= numpy.array(history_errs)[:-patience, 0].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err

        print 'Seen %d samples' % n_samples

        if estop:
            break
        if save_per_epoch:
            numpy.savez(saveto + '_epoch_' + str(eidx + 1), history_errs=history_errs, **unzip(tparams))

    # use the best nll parameters for final checkpoint (if they exist)
    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    train_err = 0
    valid_err = 0
    test_err = 0
    if valid:
        valid_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, valid, kf_valid)
    if test:
        test_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, test, kf_test)

    print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err
    params = copy.copy(best_p)
    numpy.savez(saveto, zipped_params=best_p, train_err=train_err,
                valid_err=valid_err, test_err=test_err, history_errs=history_errs,
                **params)

    return train_err, valid_err, test_err
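# The "Doubly stochastic regularization" block in train() above implements the
# penalty alpha_c * sum_i (1 - sum_t alpha_{t,i})^2, which pushes the attention weights
# at each annotation location to sum to roughly one over the caption. A minimal numpy
# sketch of the same expression follows; the shapes, the Dirichlet-sampled weights, and
# the 0.01 coefficient are assumptions for illustration (in train() `alphas` is symbolic).
import numpy
alphas_np = numpy.random.dirichlet(numpy.ones(196), size=(8, 16))  # (n_timesteps, n_samples, 196 locations)
alpha_reg_np = 0.01 * ((1. - alphas_np.sum(0)) ** 2).sum(0).mean()
print alpha_reg_np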
# ## Creating the Theano Graph

# In[42]:

# build the sampling functions and model
trng = RandomStreams(1234)
use_noise = theano.shared(numpy.float32(0.), name='use_noise')

params = capgen.init_params(options)
params = capgen.load_params(model, params)
tparams = capgen.init_tparams(params)

# word index
f_init, f_next = capgen.build_sampler(tparams, options, use_noise, trng)

# In[43]:

trng, use_noise, inps, alphas, alphas_samples, cost, opt_outs = capgen.build_model(tparams, options)

# In[44]:

# get the alphas and selector value [called \beta in the paper]
# create update rules for the stochastic attention
hard_attn_updates = []
if options['attn_type'] == 'stochastic':
    baseline_time = theano.shared(numpy.float32(0.), name='baseline_time')
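    # The notebook cell above appears to be cut off here; judging from the _build
    # method earlier in this section, the stochastic-attention branch presumably
    # continues roughly as follows before compiling f_alpha (a sketch, not the
    # original cell contents):
    hard_attn_updates += [(baseline_time, baseline_time * 0.9 + 0.1 * opt_outs['masked_cost'].mean())]
    hard_attn_updates += opt_outs['attn_updates']

f_alpha = theano.function(inps, alphas, name='f_alpha', updates=hard_attn_updates)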
def main(model, saveto, k=1, normalize=False, zero_pad=False, datasets='dev,test',
         data_path='./', sampling=False, pkl_name=None):
    # load model options
    if pkl_name is None:
        pkl_name = model
    with open('%s.pkl' % pkl_name, 'rb') as f:
        options = pkl.load(f)

    # fetch data, skip ones we aren't using to save time
    load_data, prepare_data = get_dataset(options['dataset'])
    _, valid, test, worddict = load_data(load_train=False,
                                         load_dev=True if 'dev' in datasets else False,
                                         load_test=True if 'test' in datasets else False,
                                         path=data_path)

    # <eos> means end of sequence (aka periods), UNK means unknown
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # build sampler
    trng = RandomStreams(1234)
    # a value of zero indicates we are not using dropout in the graph
    use_noise = theano.shared(numpy.float32(0.), name='use_noise')

    # get the parameters
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_tparams(params)

    # build the sampling computational graph
    # see capgen.py for more detailed explanations
    f_init, f_next = build_sampler(tparams, options, use_noise, trng, sampling=sampling)

    # index -> words
    def _seqs2words(cc):
        ww = []
        for w in cc:
            if w == 0:
                break
            ww.append(word_idict[w])
        return ' '.join(ww)

    # unsparsify, reshape, and queue
    def _send_job(context):
        cc = context.todense().reshape([14*14, 512])
        if zero_pad:
            cc0 = numpy.zeros((cc.shape[0]+1, cc.shape[1])).astype('float32')
            cc0[:-1, :] = cc
        else:
            cc0 = cc
        return create_sample(tparams, f_init, f_next, cc0, options, trng, k, normalize)

    ds = datasets.strip().split(',')

    # send all the features for the various datasets
    for dd in ds:
        if dd == 'dev':
            bar = Bar('Development Set...', max=len(valid[1]))
            caps = []
            for i in range(len(valid[1])):
                sample = _send_job(valid[1][i])
                cap = _seqs2words(sample)
                caps.append(cap)
                with open(saveto+'_status.json', 'w') as f:
                    json.dump({'current': i, 'total': len(valid[1])}, f)
                bar.next()
            bar.finish()
            with open(saveto, 'w') as f:
                print >>f, '\n'.join(caps)
            print 'Done'
        if dd == 'test':
            bar = Bar('Test Set...', max=len(test[1]))
            caps = []
            for i in range(len(test[1])):
                sample = _send_job(test[1][i])
                cap = _seqs2words(sample)
                caps.append(cap)
                with open(saveto+'_status.json', 'w') as f:
                    json.dump({'current': i, 'total': len(test[1])}, f)
                bar.next()
            bar.finish()
            with open(saveto, 'w') as f:
                print >>f, '\n'.join(caps)
            print 'Done'
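# Hypothetical invocation sketch (the paths and argument values are assumptions):
# generate captions for the dev and test splits with a beam width of 5 and length
# normalization, writing one caption per line to the output file.
main('my_caption_model.npz', 'dev_test_captions.txt',
     k=5, normalize=True, zero_pad=False,
     datasets='dev,test', data_path='./data', sampling=False)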
def train(dim_word=100,  # word vector dimensionality
          ctx_dim=512,  # context vector dimensionality
          dim=1000,  # the number of LSTM units
          attn_type='deterministic',  # [see section 4 from paper]
          n_layers_att=1,  # number of layers used to compute the attention weights
          n_layers_out=1,  # number of layers used to compute logit
          n_layers_lstm=1,  # number of lstm layers
          n_layers_init=1,  # number of layers to initialize LSTM at time 0
          lstm_encoder=False,  # if True, run bidirectional LSTM on input units
          prev2out=False,  # feed previous word into logit
          ctx2out=False,  # feed attention weighted ctx into logit
          alpha_entropy_c=0.002,  # hard attn param
          RL_sumCost=False,  # hard attn param
          semi_sampling_p=0.5,  # hard attn param
          temperature=1.,  # hard attn param
          patience=10,
          max_epochs=5000,
          dispFreq=100,
          decay_c=0.,  # weight decay coeff
          alpha_c=0.,  # doubly stochastic coeff
          lrate=0.01,  # used only for SGD
          selector=False,  # selector (see paper)
          n_words=10000,  # vocab size
          maxlen=100,  # maximum length of the description
          optimizer='rmsprop',
          batch_size=16,
          valid_batch_size=2,  # changed from 16
          saveto='model.npz',  # relative path of saved model file
          validFreq=1000,
          saveFreq=1000,  # save the parameters after every saveFreq updates
          sampleFreq=5,  # generate some samples after every sampleFreq updates
          data_path='./data',  # path to find data
          dataset='flickr30k',
          dictionary=None,  # word dictionary
          use_dropout=False,  # setting this true turns on dropout at various points
          use_dropout_lstm=False,  # dropout on lstm gates
          reload_=False,
          save_per_epoch=False):  # this saves down the model every epoch

    # hyperparam dict
    model_options = locals().copy()
    model_options = validate_options(model_options)

    # reload options
    if reload_ and os.path.exists(saveto):
        print "Reloading options"
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print "Using the following parameters:"
    print model_options

    print 'Loading data'
    load_data, prepare_data = get_dataset(dataset)
    train, valid, test, worddict = load_data(path=data_path)

    # index 0 and 1 always code for the end of sentence and unknown token
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # Initialize (or reload) the parameters using 'model_options'
    # then build the Theano graph
    print 'Building model'
    params = init_params(model_options)
    if reload_ and os.path.exists(saveto):
        print "Reloading model"
        params = load_params(saveto, params)

    # numpy arrays -> theano shared variables
    tparams = init_tparams(params)

    # In order, we get:
    #   1) trng - theano random number generator
    #   2) use_noise - flag that turns on dropout
    #   3) inps - inputs for f_grad_shared
    #   4) cost - log likelihood for each sentence
    #   5) opt_outs - optional outputs (e.g. selector)
    trng, use_noise, \
        inps, alphas, alphas_sample, \
        cost, \
        opt_outs = \
        build_model(tparams, model_options)

    # To sample, we use beam search: 1) f_init is a function that initializes
    # the LSTM at time 0 [see top right of page 4], 2) f_next returns the distribution over
    # words and also the new "initial state/memory" see equation
    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, use_noise, trng)

    # we want the cost without any of the regularizers
    # define the log probability
    f_log_probs = theano.function(inps, -cost, profile=False,
                                  updates=opt_outs['attn_updates']
                                  if model_options['attn_type'] == 'stochastic'
                                  else None,
                                  allow_input_downcast=True)

    # Define the cost function + Regularization
    cost = cost.mean()

    # add L2 regularization costs
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # Doubly stochastic regularization
    if alpha_c > 0.:
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * ((1.-alphas.sum(0))**2).sum(0).mean()
        cost += alpha_reg

    hard_attn_updates = []
    # Backprop!
    if model_options['attn_type'] == 'deterministic':
        grads = tensor.grad(cost, wrt=itemlist(tparams))
    else:
        # shared variables for hard attention
        baseline_time = theano.shared(numpy.float32(0.), name='baseline_time')
        opt_outs['baseline_time'] = baseline_time
        alpha_entropy_c = theano.shared(numpy.float32(alpha_entropy_c), name='alpha_entropy_c')
        alpha_entropy_reg = alpha_entropy_c * (alphas*tensor.log(alphas)).mean()
        # [see Section 4.1: Stochastic "Hard" Attention for derivation of this learning rule]
        if model_options['RL_sumCost']:
            grads = tensor.grad(cost, wrt=itemlist(tparams),
                                disconnected_inputs='raise',
                                known_grads={alphas: (baseline_time-opt_outs['masked_cost'].mean(0))[None, :, None]/10. *
                                             (-alphas_sample/alphas) + alpha_entropy_c*(tensor.log(alphas) + 1)})
        else:
            grads = tensor.grad(cost, wrt=itemlist(tparams),
                                disconnected_inputs='raise',
                                known_grads={alphas: opt_outs['masked_cost'][:, :, None]/10. *
                                             (alphas_sample/alphas) + alpha_entropy_c*(tensor.log(alphas) + 1)})
        # [equation on bottom left of page 5]
        hard_attn_updates += [(baseline_time, baseline_time * 0.9 + 0.1 * opt_outs['masked_cost'].mean())]
        # updates from scan
        hard_attn_updates += opt_outs['attn_updates']

    # to get the cost after regularization or the gradients, use this
    # f_cost = theano.function([x, mask, ctx], cost, profile=False)
    # f_grad = theano.function([x, mask, ctx], grads, profile=False)

    # f_grad_shared computes the cost and updates adaptive learning rate variables
    # f_update updates the weights of the model
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost, hard_attn_updates)

    print 'Optimization'

    # [See note in section 4.3 of paper]
    train_iter = HomogeneousData(train, batch_size=batch_size, maxlen=maxlen)

    if valid:
        kf_valid = KFold(len(valid[0]), n_folds=len(valid[0])/valid_batch_size, shuffle=False)
    if test:
        kf_test = KFold(len(test[0]), n_folds=len(test[0])/valid_batch_size, shuffle=False)

    # history_errs is a bare-bones training log that holds the validation and test error
    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = numpy.load(saveto)['history_errs'].tolist()
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0])/batch_size
    if saveFreq == -1:
        saveFreq = len(train[0])/batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0])/batch_size

    uidx = 0
    estop = False
    for eidx in xrange(max_epochs):
        n_samples = 0

        print 'Epoch ', eidx

        for caps in train_iter:
            n_samples += len(caps)
            uidx += 1
            # turn on dropout
            use_noise.set_value(1.)
            # preprocess the caption, recording the
            # time spent to help detect bottlenecks
            pd_start = time.time()
            x, mask, ctx = prepare_data(caps,
                                        train[1],
                                        worddict,
                                        maxlen=maxlen,
                                        n_words=n_words)
            pd_duration = time.time() - pd_start

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                continue

            # get the cost for the minibatch, and update the weights
            ud_start = time.time()
            cost = f_grad_shared(x, mask, ctx)
            f_update(lrate)
            ud_duration = time.time() - ud_start  # some monitoring for each mini-batch

            # Numerical stability check
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'PD ', pd_duration, 'UD ', ud_duration

            # Checkpoint
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',

                if best_p is not None:
                    params = copy.copy(best_p)
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

            # Print a generated sample as a sanity check
            if numpy.mod(uidx, sampleFreq) == 0:
                # turn off dropout first
                use_noise.set_value(0.)
                x_s = x
                mask_s = mask
                ctx_s = ctx
                # generate and decode a subset of the current training batch
                for jj in xrange(numpy.minimum(10, len(caps))):
                    sample, score = gen_sample(tparams, f_init, f_next, ctx_s[jj], model_options,
                                               trng=trng, k=5, maxlen=30, stochastic=False)
                    # Decode the sample from encoding back to words
                    print 'Truth ', jj, ': ',
                    for vv in x_s[:, jj]:
                        if vv == 0:
                            break
                        if vv in word_idict:
                            print word_idict[vv],
                        else:
                            print 'UNK',
                    print
                    for kk, ss in enumerate([sample[0]]):
                        print 'Sample (', kk, ') ', jj, ': ',
                        for vv in ss:
                            if vv == 0:
                                break
                            if vv in word_idict:
                                print word_idict[vv],
                            else:
                                print 'UNK',
                        print

            # Log validation loss + checkpoint the model with the best validation log likelihood
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                train_err = 0
                valid_err = 0
                test_err = 0

                if valid:
                    valid_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, valid, kf_valid).mean()
                if test:
                    test_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, test, kf_test).mean()

                history_errs.append([valid_err, test_err])

                # the model with the best validation log likelihood is saved separately with a different name
                if uidx == 0 or valid_err <= numpy.array(history_errs)[:, 0].min():
                    best_p = unzip(tparams)
                    print 'Saving model with best validation ll'
                    params = copy.copy(best_p)
                    params = unzip(tparams)
                    numpy.savez(saveto+'_bestll', history_errs=history_errs, **params)
                    bad_counter = 0

                # abort training if perplexity has been increasing for too long
                if eidx > patience and len(history_errs) > patience and valid_err >= numpy.array(history_errs)[:-patience, 0].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err

        print 'Seen %d samples' % n_samples

        if estop:
            break
        if save_per_epoch:
            numpy.savez(saveto + '_epoch_' + str(eidx + 1), history_errs=history_errs, **unzip(tparams))

    # use the best nll parameters for final checkpoint (if they exist)
    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    train_err = 0
    valid_err = 0
    test_err = 0
    if valid:
        valid_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, valid, kf_valid)
    if test:
        test_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, test, kf_test)

    print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err
    params = copy.copy(best_p)
    numpy.savez(saveto, zipped_params=best_p, train_err=train_err,
                valid_err=valid_err, test_err=test_err, history_errs=history_errs,
                **params)

    return train_err, valid_err, test_err
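# In the stochastic-attention branch of train() above, baseline_time is an
# exponential moving average of the minibatch cost that serves as the baseline in
# the hard-attention learning rule: b <- 0.9*b + 0.1*mean(masked_cost). A plain-numpy
# sketch of that update follows; the per-update cost values are made-up numbers.
b = 0.0
for batch_cost in [42.0, 40.5, 39.8]:   # hypothetical mean masked costs per update
    b = 0.9 * b + 0.1 * batch_cost
print b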
def train(dim_word=300,  # word vector dimensionality
          ctx_dim=300,  # context vector dimensionality
          semantic_dim=300,
          dim=1000,  # the number of LSTM units
          cnn_dim=4096,  # CNN feature dimension
          n_layers_att=1,  # number of layers used to compute the attention weights
          n_layers_out=1,  # number of layers used to compute logit
          n_layers_lstm=1,  # number of lstm layers
          n_layers_init=1,  # number of layers to initialize LSTM at time 0
          lstm_encoder=True,  # if True, run bidirectional LSTM on input units
          prev2out=False,  # feed previous word into logit
          ctx2out=False,  # feed attention weighted ctx into logit
          cutoff=10,
          patience=5,
          max_epochs=30,
          dispFreq=500,
          decay_c=0.,  # weight decay coeff
          alpha_c=0.,  # doubly stochastic coeff
          lrate=1e-4,  # used only for SGD
          selector=False,  # selector (see paper)
          maxlen=30,  # maximum length of the description
          optimizer='rmsprop',
          pretrained='',
          batch_size=256,
          saveto='model',  # relative path of saved model file
          saveFreq=1000,  # save the parameters after every saveFreq updates
          sampleFreq=100,  # generate some samples after every sampleFreq updates
          embedding='../Data/GloVe/vocab_glove.pkl',
          cnn_type='vgg',
          prefix='../Data',  # path to find data
          dataset='coco',
          criterion='Bleu_4',
          switch_test_val=False,
          use_cnninit=True,
          use_dropout=True,  # setting this true turns on dropout at various points
          use_dropout_lstm=False,  # dropout on lstm gates
          save_per_epoch=False):  # this saves down the model every epoch

    # hyperparam dict
    model_options = locals().copy()
    model_options = validate_options(model_options)

    # reload options
    if os.path.exists('%s.pkl' % saveto):
        print "Reloading options"
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print "Using the following parameters:"
    print model_options

    print 'Loading data'
    load_data, prepare_data = get_dataset(model_options['dataset'])

    # Load data from data path
    if 'switch_test_val' in model_options and model_options['switch_test_val']:
        train, valid, worddict = load_data(path=osp.join(model_options['prefix'], model_options['dataset']),
                                           options=model_options,
                                           load_train=True,
                                           load_test=True)
    else:
        train, valid, worddict = load_data(path=osp.join(model_options['prefix'], model_options['dataset']),
                                           options=model_options,
                                           load_train=True,
                                           load_val=True)

    # Automatically calculate the update frequency
    validFreq = len(train[0]) / model_options['batch_size']
    print "Validation frequency is %d" % validFreq

    word_idict = {vv: kk for kk, vv in worddict.iteritems()}
    model_options['n_words'] = len(worddict)

    # Initialize (or reload) the parameters using 'model_options'
    # then build the Theano graph
    print 'Building model'
    params = init_params(model_options)

    # Initialize it with GloVe
    if 'VCemb' in params:
        params['VCemb'] = read_pkl(model_options['embedding']).astype('float32')

    # If the same experiment already exists, don't use pretrained weights
    if os.path.exists('%s.npz' % saveto):
        print "Reloading model"
        params = load_params('%s.npz' % saveto, params)
    elif pretrained != '':
        params = load_params(pretrained, params, False)  # Only pretrain the Language model

    # numpy arrays -> theano shared variables
    tparams = init_tparams(params)

    # In order, we get:
    #   1) trng - theano random number generator
    #   2) use_noise - flag that turns on dropout
    #   3) inps - inputs for f_grad_shared
    #   4) cost - log likelihood for each sentence
    #   5) opt_outs - optional outputs (e.g. selector)
    trng, use_noise, \
        inps, alphas, \
        cost, \
        opt_outs = \
        build_model(tparams, model_options)

    # Load evaluator to calculate bleu score
    evaluator = cocoEvaluation(model_options['dataset'])
    # To sample, we use beam search: 1) f_init is a function that initializes
    # the LSTM at time 0 [see top right of page 4], 2) f_next returns the distribution over
    # words and also the new "initial state/memory" see equation
    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, use_noise, trng)

    # we want the cost without any of the regularizers
    # define the log probability
    f_log_probs = theano.function(inps, -cost, profile=False,
                                  updates=None,
                                  allow_input_downcast=True)

    # Define the cost function + Regularization
    cost = cost.mean()

    # add L2 regularization costs
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # Doubly stochastic regularization
    if alpha_c > 0.:
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = sum([alpha_c * ((1. - alpha.sum(0))**2).sum(0).mean() for alpha in alphas])
        cost += alpha_reg

    # Backprop!
    grads = tensor.grad(cost, wrt=itemlist(tparams))

    # to get the cost after regularization or the gradients, use this
    # f_grad_shared computes the cost and updates adaptive learning rate variables
    # f_update updates the weights of the model
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = eval(model_options['optimizer'])(lr, tparams, grads, inps, cost)

    print 'Optimization'

    train_iter = HomogeneousData(train, batch_size=batch_size, maxlen=model_options['maxlen'])

    # history_bleu is a bare-bones training log, reload history
    history_bleu = []
    if os.path.exists('%s.npz' % saveto):
        history_bleu = numpy.load('%s.npz' % saveto)['history_bleu'].tolist()
    start_epochs = len(history_bleu)
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0]) / batch_size

    uidx = 0
    estop = False
    for eidx in xrange(start_epochs, model_options['max_epochs']):
        n_samples = 0

        print 'Epoch ', eidx

        for caps in train_iter:
            n_samples += len(caps)
            uidx += 1
            # turn on dropout
            use_noise.set_value(1.)

            # preprocess the caption, recording the
            # time spent to help detect bottlenecks
            pd_start = time.time()
            x, mask, ctx, cnn_feats = prepare_data(caps, train[1], train[2], worddict, model_options)
            pd_duration = time.time() - pd_start

            if x is None:
                print 'Minibatch with zero sample under length ', model_options['maxlen']
                continue

            # get the cost for the minibatch, and update the weights
            ud_start = time.time()
            cost = f_grad_shared(x, mask, ctx, cnn_feats)
            print "Epoch %d, Updates: %d, Cost is: %f" % (eidx, uidx, cost)
            f_update(model_options['lrate'])
            ud_duration = time.time() - ud_start  # some monitoring for each mini-batch

            # Numerical stability check
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'PD ', pd_duration, 'UD ', ud_duration

            # Print a generated sample as a sanity check
            if numpy.mod(uidx, model_options['sampleFreq']) == 0:
                # turn off dropout first
                use_noise.set_value(0.)
                x_s = x
                mask_s = mask
                ctx_s = ctx
                # generate and decode a subset of the current training batch
                for jj in xrange(numpy.minimum(10, len(caps))):
                    sample, score, alphas = gen_sample(f_init, f_next, ctx_s[jj], cnn_feats[jj],
                                                       model_options, trng=trng,
                                                       maxlen=model_options['maxlen'])
                    # Decode the sample from encoding back to words
                    print 'Truth ', jj, ': ',
                    print seqs2words(x_s[:, jj], word_idict)
                    for kk, ss in enumerate([sample[0]]):
                        print 'Sample (', kk, ') ', jj, ': ',
                        print seqs2words(ss, word_idict)

            # Log validation score + checkpoint the model with the best validation score
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)

                # Do evaluation on validation set
                imgid = collapse([elem[-1] for elem in valid[0]])
                caps = process_examples([f_init], [f_next], imgid, valid[1], valid[2], word_idict, model_options)
                folder = osp.join('../output', '%s_%s' % (saveto, 'val'))
                if not osp.exists(folder):
                    os.mkdir(folder)
                with open(osp.join(folder, 'captions_val2014_results.json'), 'w') as f:
                    json.dump(caps, f)
                eva_result = evaluator.evaluate(folder, False)
                if model_options['criterion'] == 'combine':
                    history_bleu.append(eva_result['Bleu_4'] + eva_result['CIDEr'])
                else:
                    history_bleu.append(eva_result[model_options['criterion']])

                # the model with the best validation score is saved separately with a different name
                if uidx == 0 or history_bleu[-1] == max(history_bleu):
                    best_p = unzip(tparams)
                    print 'Saving model with best validation ll'
                    params = copy.copy(best_p)
                    params = unzip(tparams)
                    numpy.savez(saveto + '_bestll', history_bleu=history_bleu, **params)
                    bad_counter = 0

                # abort training if the validation score has not improved for too long
                if len(history_bleu) > model_options['patience'] and history_bleu[-1] <= max(history_bleu[:-model_options['patience']]):
                    bad_counter += 1
                    if bad_counter > model_options['patience']:
                        print 'Early Stop!'
                        estop = True
                        break

                print ' BLEU-4 score ', history_bleu[-1]

            # Checkpoint
            if numpy.mod(uidx, model_options['saveFreq']) == 0:
                print 'Saving...',

                if best_p is not None:
                    params = copy.copy(best_p)
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_bleu=history_bleu, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

        print 'Seen %d samples' % n_samples

        if estop:
            break
        if model_options['save_per_epoch']:
            numpy.savez(saveto + '_epoch_' + str(eidx + 1), history_bleu=history_bleu, **unzip(tparams))

    # use the best parameters for the final checkpoint (if they exist)
    if best_p is not None:
        zipp(best_p, tparams)

    params = copy.copy(best_p)
    numpy.savez(saveto, zipped_params=best_p, history_bleu=history_bleu, **params)
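# Hypothetical invocation sketch (paths and values are assumptions): train the
# GloVe-initialized COCO variant defined above with mostly default options,
# selecting checkpoints by CIDEr on the validation split.
train(dataset='coco',
      prefix='../Data',
      embedding='../Data/GloVe/vocab_glove.pkl',
      saveto='coco_model',
      batch_size=256,
      max_epochs=30,
      criterion='CIDEr')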