def main(cur_params):
    # fetch the data provider and evaluate each checkpoint in turn
    for i, cpf in enumerate(cur_params['checkpoints']):
        checkpoint = pickle.load(open(cpf, 'rb'))
        if 'model' in checkpoint:
            model_init_gen_from = checkpoint.get('model', {})
        else:
            model_init_gen_from = checkpoint.get('modelGen', {})
        model_init_eval_from = checkpoint.get('modelEval', {})
        params = checkpoint['params']

        # Load data provider once and copy misc
        if i == 0:
            dp = getDataProvider(params)
        evaluator = decodeEvaluator(params)
        modelEval = evaluator.model_th
        (eval_inp_list, f_pred_fns, costs, predTh, modelEval) = evaluator.build_advers_eval(modelEval, params)
        misc = checkpoint['misc']
        zipp(model_init_eval_from, modelEval)
        evaluator.use_noise.set_value(1.)

        print '----------------------- Running model %s -------------------------------' % (cpf.split('_')[-3])
        print 'Evaluating GT 5 vs Negative samples from GT'
        eval_discrm_gen('val', dp, params, f_pred_fns[0], misc, probs=[0.5, 0.5, 0.0])
        print '-------------------------------------------------------------------------'
        print 'Evaluating GT vs repeated GT'
        eval_discrm_gen('val', dp, params, f_pred_fns[0], misc, probs=[0.5, 0.0, 0.5])
        print '-------------------------------------------------------------------------'
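# ---------------------------------------------------------------------------
# A minimal, self-contained sketch (hypothetical helper, not repo code) of how
# a `probs` triple like [0.5, 0.5, 0.0] above could mix the three pair types
# fed to the discriminator: ground-truth captions, negatives drawn from other
# GT captions, and repeated ground-truth. Only numpy is assumed.
import numpy as np

def sample_pair_types(n, probs):
    # probs = [p_gt, p_negative, p_repeated_gt]; entries must sum to 1
    return np.random.choice(['gt', 'negative', 'repeated_gt'], size=n, p=probs)

print sample_pair_types(8, [0.5, 0.5, 0.0])  # mixes only 'gt' and 'negative'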
def prepPredictor(self, model_npy, checkpoint_params, beam_size):
    zipp(model_npy, self.model_th)
    #theano.config.exception_verbosity = 'high'

    # Now we build a predictor model
    (inp_list, predLogProb, predIdx, predCand) = self.build_prediction_model(self.model_th, checkpoint_params,
                                                                             beam_size)
    self.f_pred_th = theano.function(inp_list, [predLogProb, predIdx, predCand], name='f_pred')

    # Now we build a training model which evaluates cost. This is for the evaluation part in the end
    (self.use_dropout, inp_list2, f_pred_prob, cost, predTh, updatesLSTM) = self.build_model(self.model_th,
                                                                                             checkpoint_params)
    self.f_eval = theano.function(inp_list2, cost, name='f_eval')
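# ---------------------------------------------------------------------------
# prepPredictor relies on zipp() to copy saved numpy weights into the theano
# shared variables of the model. A minimal sketch of that round trip, assuming
# the usual zipp semantics from the Theano LSTM tutorial this code follows
# (names below are illustrative, not repo code):
import numpy as np
import theano

model_npy = {'Wd': np.ones((3, 4), dtype='float32')}   # saved checkpoint weights
model_th = {'Wd': theano.shared(np.zeros((3, 4), dtype='float32'), name='Wd')}
for k, v in model_npy.iteritems():                     # what zipp(model_npy, model_th) does
    model_th[k].set_value(v)
print model_th['Wd'].get_value().sum()                 # 12.0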
def prepPredictor(self, model_npy=None, checkpoint_params=None, beam_size=5, xI=None, xAux=None,
                  inp_list_prev=[], per_word_logweight=None):
    if model_npy is not None:
        if type(model_npy[model_npy.keys()[0]]) == np.ndarray:
            zipp(model_npy, self.model_th)
        else:
            self.model_th = model_npy
    #theano.config.exception_verbosity = 'high'
    self.beam_size = beam_size

    # Now we build a predictor model
    if checkpoint_params.get('advers_gen', 0) == 1:
        checkpoint_params['n_gen_samples'] = beam_size
    (inp_list_gen, predLogProb, predIdx, predCand, wOut_emb, updates, seq_lengths) = self.build_prediction_model(
        self.model_th, checkpoint_params, xI, xAux, per_word_logweight=per_word_logweight)
    self.f_pred_th = theano.function(inp_list_prev + inp_list_gen, [predLogProb, predIdx, predCand],
                                     name='f_pred')

    # Now we build a training model which evaluates cost. This is for the evaluation part in the end
    if checkpoint_params.get('advers_gen', 0) == 0:
        (self.use_dropout, inp_list_gen2, f_pred_prob, cost, predTh, updatesLSTM) = self.build_model(
            self.model_th, checkpoint_params, xI, xAux)
        self.f_eval = theano.function(inp_list_prev + inp_list_gen2, cost, name='f_eval')
def main(params):
    batch_size = params['batch_size']
    word_count_threshold = params['word_count_threshold']
    max_epochs = params['max_epochs']
    host = socket.gethostname()  # get computer hostname

    # fetch the data provider
    dp = getDataProvider(params)

    params['aux_inp_size'] = dp.aux_inp_size
    params['image_feat_size'] = dp.img_feat_size
    print 'Image feature size is %d, and aux input size is %d' % (params['image_feat_size'], params['aux_inp_size'])

    misc = {}  # stores various misc items that need to be passed around the framework

    # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur
    # at least word_count_threshold number of times
    misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(
        dp.iterSentences('train'), word_count_threshold)
    params['vocabulary_size'] = len(misc['wordtoix'])
    params['output_size'] = len(misc['ixtoword'])  # these should match though
    params['use_dropout'] = 1

    # This initializes the model parameters and does matrix initializations
    lstmGenerator = LSTMGenerator(params)
    model, misc['update'], misc['regularize'] = (lstmGenerator.model_th, lstmGenerator.update,
                                                 lstmGenerator.regularize)

    # force overwrite here. The bias to the softmax is initialized to reflect word frequencies
    # This is a bit of a hack, not happy about it
    model['bd'].set_value(bias_init_vector.astype(config.floatX))

    # Define the computational graph for relating the input image features and word indices to the
    # log probability cost function.
    (use_dropout, inp_list, f_pred_prob, cost, predTh, updatesLSTM) = lstmGenerator.build_model(model, params)

    # Add the regularization cost. Since this is specific to training and doesn't get included when we
    # evaluate the cost on test or validation data, we leave it here outside the model definition
    if params['regc'] > 0.:
        reg_cost = theano.shared(numpy_floatX(0.), name='reg_c')
        reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']), name='reg_c')
        reg_cost = 0.
        for p in misc['regularize']:
            reg_cost += (model[p] ** 2).sum()
        reg_cost *= 0.5 * reg_c
        cost[0] += (reg_cost / params['batch_size'])

    # Compile an evaluation function.. Doesn't include gradients
    # To be used for validation set evaluation
    f_eval = theano.function(inp_list, cost, name='f_eval')

    # Now let's build a gradient computation graph and rmsprop update mechanism
    grads = tensor.grad(cost[0], wrt=model.values())
    lr = tensor.scalar(name='lr', dtype=config.floatX)
    f_grad_shared, f_update, zg, rg, ud = lstmGenerator.rmsprop(lr, model, grads, inp_list, cost, params)

    print 'model init done.'
    print 'model has keys: ' + ', '.join(model.keys())
    #print 'updating: ' + ', '.join('%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['update'])
    #print 'updating: ' + ', '.join('%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['regularize'])
    #print 'number of learnable parameters total: %d' % (sum(model[k].shape[0] * model[k].shape[1] for k in misc['update']),)

    # calculate how many iterations we need; one epoch is considered once going through all the sentences
    # and not images, hence in case of coco/flickr this will be 5x the number of images
    num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
    num_iters_one_epoch = num_sentences_total / batch_size
    max_iters = max_epochs * num_iters_one_epoch
    eval_period_in_epochs = params['eval_period']
    eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs))
    top_val_ppl2 = -1
    smooth_train_ppl2 = len(misc['ixtoword'])  # initially size of dictionary of confusion
    val_ppl2 = len(misc['ixtoword'])
    last_status_write_time = 0  # for writing worker job status reports
    json_worker_status = {}
    json_worker_status['params'] = params
    json_worker_status['history'] = []
    len_hist = defaultdict(int)

    ## Initialize the model parameters from the checkpoint file if we are resuming training
    if params['checkpoint_file_name'] != 'None':
        zipp(model_init_from, model)
        zipp(rg_init, rg)
        print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n"
              % (checkpoint_init['epoch'], checkpoint_init['perplexity']))

    for it in xrange(max_iters):
        t0 = time.time()
        # fetch a batch of data
        if params['sample_by_len'] == 0:
            batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)]
        else:
            batch, l = dp.getRandBatchByLen(batch_size)
            len_hist[l] += 1

        if params['use_pos_tag'] != 'None':
            real_inp_list, lenS = prepare_data(batch, misc['wordtoix'], None, sentTagMap, misc['ixtoword'])
        else:
            real_inp_list, lenS = prepare_data(batch, misc['wordtoix'])

        # Enable using dropout in training
        use_dropout.set_value(1.)

        # evaluate cost, gradient and perform parameter update
        cost = f_grad_shared(*real_inp_list)
        f_update(params['learning_rate'])
        dt = time.time() - t0

        # print training statistics
        train_ppl2 = (2 ** (cost[1] / lenS))  # step_struct['stats']['ppl2']
        smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2  # smooth exponentially decaying moving average
        if it == 0:
            smooth_train_ppl2 = train_ppl2  # start out where we start out
        epoch = it * 1.0 / num_iters_one_epoch
        total_cost = cost[0]
        #print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \
        #      % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'], train_ppl2, smooth_train_ppl2)

        tnow = time.time()
        if tnow > last_status_write_time + 60 * 1:  # every now and then lets write a report
            print '%d/%d batch done in %.3fs. at epoch %.2f. Cost now is %.3f and pplx is %.3f' % (
                it, max_iters, dt, epoch, total_cost, smooth_train_ppl2)
            last_status_write_time = tnow
            jstatus = {}
            jstatus['time'] = datetime.datetime.now().isoformat()
            jstatus['iter'] = (it, max_iters)
            jstatus['epoch'] = (epoch, max_epochs)
            jstatus['time_per_batch'] = dt
            jstatus['smooth_train_ppl2'] = smooth_train_ppl2
            jstatus['val_ppl2'] = val_ppl2  # just write the last available one
            jstatus['train_ppl2'] = train_ppl2
            json_worker_status['history'].append(jstatus)
            status_file = os.path.join(params['worker_status_output_directory'], host + '_status.json')
            #import pdb; pdb.set_trace()
            try:
                json.dump(json_worker_status, open(status_file, 'w'))
            except Exception, e:  # todo be more clever here
                print 'tried to write worker status into %s but got error:' % (status_file, )
                print e

        ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good
        is_last_iter = (it + 1) == max_iters
        if (((it + 1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
            # Disable using dropout in validation
            use_dropout.set_value(0.)
            val_ppl2 = eval_split_theano('val', dp, model, params, misc, f_eval)  # perform the evaluation on VAL set
            if epoch - params['lr_decay_st_epoch'] >= 0:
                params['learning_rate'] = params['learning_rate'] * params['lr_decay']
                params['lr_decay_st_epoch'] += 1
            print 'validation perplexity = %f, lr = %f' % (val_ppl2, params['learning_rate'])
            if params['sample_by_len'] == 1:
                print len_hist

            write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
            if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
                if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
                    # if we beat a previous record or if this is the first time
                    # AND we also beat the user-defined threshold or it doesn't exist
                    top_val_ppl2 = val_ppl2
                    filename = 'model_checkpoint_%s_%s_%s_%.2f.p' % (params['dataset'], host,
                                                                     params['fappend'], val_ppl2)
                    filepath = os.path.join(params['checkpoint_output_directory'], filename)
                    model_npy = unzip(model)
                    rgrads_npy = unzip(rg)
                    checkpoint = {}
                    checkpoint['it'] = it
                    checkpoint['epoch'] = epoch
                    checkpoint['model'] = model_npy
                    checkpoint['rgrads'] = rgrads_npy
                    checkpoint['params'] = params
                    checkpoint['perplexity'] = val_ppl2
                    checkpoint['wordtoix'] = misc['wordtoix']
                    checkpoint['ixtoword'] = misc['ixtoword']
                    try:
                        pickle.dump(checkpoint, open(filepath, "wb"))
                        print 'saved checkpoint in %s' % (filepath, )
                    except Exception, e:  # todo be more clever here
                        print 'tried to write checkpoint into %s but got error: ' % (filepath, )
                        print e
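# ---------------------------------------------------------------------------
# A self-contained sketch of the perplexity bookkeeping used in the training
# loop above: batch perplexity is 2**(log2-cost / word count), smoothed with an
# exponential moving average. All numbers below are made up for illustration.
cost_log2, lenS = 96.0, 20              # summed log2 loss and word count of a batch
train_ppl2 = 2 ** (cost_log2 / lenS)    # batch perplexity, 2**4.8, about 27.9
smooth_train_ppl2 = train_ppl2          # the first batch seeds the average
for cost_log2 in [90.0, 84.0]:          # two more made-up batches
    train_ppl2 = 2 ** (cost_log2 / lenS)
    smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2
print smooth_train_ppl2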
def main(scriptparams):
    checkpoint = pickle.load(open(scriptparams['checkpoint'], 'rb'))
    npfilename = osp.join('scorelogs', osp.basename(scriptparams['checkpoint']).split('.')[0]
                          + '_logprob%s' % (scriptparams['split']))
    misc = checkpoint['misc']

    # fetch the data provider
    params = checkpoint['params']
    params['use_gumbel_mse'] = 0
    params['maxlen'] = scriptparams['maxlen']
    dp = getDataProvider(params)

    model_init_gen_from = checkpoint.get('model', {}) if 'model' in checkpoint else checkpoint['modelGen']
    lstmGenerator = decodeGenerator(params)
    model, misc['update'], misc['regularize'] = (lstmGenerator.model_th, lstmGenerator.update_list,
                                                 lstmGenerator.regularize)

    if params.get('use_encoder_for', 0) & 1:
        if params.get('encode_gt_sentences', 0):
            xI = tensor.zeros((batch_size, params['image_encoding_size']))
            imgFeatEnc_inp = []
        else:
            imgFeatEncoder = RecurrentFeatEncoder(params['image_feat_size'], params['word_encoding_size'],
                                                  params, mdl_prefix='img_enc_', features=dp.features.T)
            mdlLen = len(model.keys())
            model.update(imgFeatEncoder.model_th)
            assert (len(model.keys()) == (mdlLen + len(imgFeatEncoder.model_th.keys())))
            misc['update'].extend(imgFeatEncoder.update_list)
            misc['regularize'].extend(imgFeatEncoder.regularize)
            (imgenc_use_dropout, imgFeatEnc_inp, xI, updatesLSTMImgFeat) = imgFeatEncoder.build_model(model, params)
    else:
        xI = None
        imgFeatEnc_inp = []

    if params.get('use_encoder_for', 0) & 2:
        aux_enc_inp = model['Wemb'] if params.get('encode_gt_sentences', 0) else dp.aux_inputs.T
        hid_size = params['featenc_hidden_size']
        auxFeatEncoder = RecurrentFeatEncoder(hid_size, params['image_encoding_size'], params,
                                              mdl_prefix='aux_enc_', features=aux_enc_inp)
        mdlLen = len(model.keys())
        model.update(auxFeatEncoder.model_th)
        assert (len(model.keys()) == (mdlLen + len(auxFeatEncoder.model_th.keys())))
        misc['update'].extend(auxFeatEncoder.update_list)
        misc['regularize'].extend(auxFeatEncoder.regularize)
        (auxenc_use_dropout, auxFeatEnc_inp, xAux, updatesLSTMAuxFeat) = auxFeatEncoder.build_model(model, params)
        if params.get('encode_gt_sentences', 0):
            # Reshape it to size (batch_size, n_gt, hidden_size)
            xAux = xAux.reshape((-1, params['n_encgt_sent'], params['featenc_hidden_size']))
            # Convert it to size (batch_size, n_gt*hidden_size)
            xAux = xAux.flatten(2)
    else:
        auxFeatEnc_inp = []
        xAux = None

    attn_nw_func = None
    (use_dropout, inp_list_gen, f_pred_prob, cost, predTh, updatesLSTM) = lstmGenerator.build_model(
        model, params, xI, xAux, attn_nw=attn_nw_func)
    inp_list = imgFeatEnc_inp + auxFeatEnc_inp + inp_list_gen
    f_eval = theano.function(inp_list, cost, name='f_eval')
    #--------------------------------- Cost function and gradient computations setup ---------------------------------#
    zipp(model_init_gen_from, model)

    # perform the evaluation on VAL set
    #val_sc = eval_split_theano(scriptparams['split'], dp, model, params, misc, f_eval)
    logppl = []
    logppln = []
    imgids = []
    nsent = 0
    for batch in dp.iterImageSentencePairBatch(split=scriptparams['split'], max_batch_size=1, max_images=-1):
        enc_inp_list = prepare_seq_features(batch,
                                            use_enc_for=params.get('use_encoder_for', 0),
                                            maxlen=params['maxlen'],
                                            use_shared_mem=params.get('use_shared_mem_enc', 0),
                                            enc_gt_sent=params.get('encode_gt_sentences', 0),
                                            n_enc_sent=params.get('n_encgt_sent', 0),
                                            wordtoix=misc['wordtoix'])
        gen_inp_list, lenS = prepare_data(batch, misc['wordtoix'],
                                          rev_sents=params.get('reverse_sentence', 0),
                                          use_enc_for=params.get('use_encoder_for', 0),
                                          use_unk_token=params.get('use_unk_token', 0))
        inp_list = enc_inp_list + gen_inp_list
        cost = f_eval(*inp_list)
        logppl.append(cost[1])
        logppln.append(lenS)
        imgids.append(str(batch[0]['image']['cocoid']) + '_' + str(batch[0]['sentidx']))
        nsent += 1

    perplex = 2 ** (np.array(logppl) / np.array(logppln))
    np.savez(npfilename, pplx=perplex, keys=np.array(imgids))
    #ppl2 = 2 ** (logppl / logppln)
    #print 'evaluated %d sentences and got perplexity = %f' % (nsent, ppl2)
    #met = [ppl2]
    print 2 ** (np.array(logppl).sum() / np.array(logppln).sum())
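# ---------------------------------------------------------------------------
# Sketch of the corpus-level perplexity printed at the end above: summed
# per-sentence log losses divided by summed word counts, then exponentiated.
# Numbers are made up for illustration; only numpy is assumed.
import numpy as np
logppl = np.array([40.0, 55.0, 38.0])        # per-sentence summed log2 losses
logppln = np.array([10, 12, 9])              # per-sentence word counts
print 2 ** (logppl / logppln)                # per-sentence perplexities, as saved to the npz
print 2 ** (logppl.sum() / logppln.sum())    # corpus perplexity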
def main(params):
    checkpoint_path = params['checkpoint_path']
    print 'loading checkpoint %s' % (checkpoint_path, )
    checkpoint = pickle.load(open(checkpoint_path, 'rb'))
    checkpoint_params = checkpoint['params']
    model_npy = checkpoint['model']

    # Load the candidates db generated from rnn's
    candDb = json.load(open(params['candDb'], 'r'))
    wordtoix = checkpoint['wordtoix']

    # find the number of candidates per image and the max sentence length
    batch_size = 0
    maxlen = 0
    for i, img in enumerate(candDb['imgblobs']):
        for ids, cand in enumerate(img['candidatelist']):
            tks = cand['text'].split(' ')
            # Also tokenize the candidates
            candDb['imgblobs'][i]['candidatelist'][ids]['tokens'] = tks
            if len(tks) > maxlen:
                maxlen = len(tks)
        if batch_size < len(img['candidatelist']):
            batch_size = len(img['candidatelist'])
    # Pad all images to this batch size!

    # HACK!!
    maxlen = 24
    checkpoint_params['maxlen'] = maxlen
    checkpoint_params['batch_size'] = batch_size
    print maxlen

    # This initializes the model parameters and does matrix initializations
    checkpoint_params['mode'] = 'predict'
    evalModel = decodeEvaluator(checkpoint_params)
    model = evalModel.model_th
    # Define the computational graph for relating the input image features and word indices to the
    # log probability cost function.
    (use_dropout, inp_list, f_pred_fns, cost, predTh, model) = evalModel.build_model(model, checkpoint_params)

    ## Initialize the model parameters from the checkpoint file
    zipp(model_npy, model)
    print("\nPredicting using model %s, run for %0.2f epochs with validation perplx at %0.3f\n"
          % (checkpoint_path, checkpoint['epoch'], checkpoint['perplexity']))

    pos_samp = np.arange(1, dtype=np.int32)
    features, _ = loadArbitraryFeatures(params, -1)

    # Disable dropout for prediction
    use_dropout.set_value(0.)

    N = len(candDb['imgblobs'])
    #################### Main Loop ############################################
    for i, img in enumerate(candDb['imgblobs']):
        # fetch a batch of data
        print 'image %d/%d \r' % (i, N),
        batch = []
        cbatch_len = len(img['candidatelist'])
        for s in img['candidatelist']:
            batch.append({'sentence': s})
        if cbatch_len < batch_size:
            for z in xrange(batch_size - cbatch_len):
                batch.append({'sentence': img['candidatelist'][-1]})
        batch[0]['image'] = {'feat': features[:, img['imgid']]}
        real_inp_list, lenS = prepare_data(batch, wordtoix, maxlen=maxlen, pos_samp=pos_samp,
                                           prep_for=checkpoint_params['eval_model'])

        # score all candidates with the evaluator and rerank
        scrs = np.squeeze(f_pred_fns[1](*real_inp_list))
        scrs = scrs[:cbatch_len]  # + scrs[:,cbatch_len:].sum()/cbatch_len
        for si, s in enumerate(img['candidatelist']):
            candDb['imgblobs'][i]['candidatelist'][si]['logprob'] = float(scrs[si])
            candDb['imgblobs'][i]['candidatelist'][si].pop('tokens')
        bestcand = scrs.argmax()
        candDb['imgblobs'][i]['candidate'] = candDb['imgblobs'][i]['candidatelist'][bestcand]
        srtidx = np.argsort(scrs)[::-1]
        candDb['imgblobs'][i]['candsort'] = list(srtidx)
        #import pdb;pdb.set_trace()

    print ""
    jsonFname = '%s_reranked_%s.json' % (checkpoint_params['eval_model'], params['fname_append'])
    save_file = os.path.join(params['root_path'], jsonFname)
    json.dump(candDb, open(save_file, 'w'))
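# ---------------------------------------------------------------------------
# Minimal sketch of the reranking step above: given evaluator scores for each
# candidate caption, the argmax becomes 'candidate' and the full descending
# order is stored in 'candsort'. Scores below are illustrative only.
import numpy as np
scrs = np.array([-2.3, -0.7, -1.5])       # log-prob scores for 3 candidates
best = scrs.argmax()                      # index of best candidate -> 1
order = list(np.argsort(scrs)[::-1])      # descending rank order -> [1, 2, 0]
print best, order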
def build_eval_other_sent(self, tparams, options, model_npy):
    zipp(model_npy, self.model_th)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    xW = tensor.matrix('xW', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]
    embW = tparams['Wemb'][xW.flatten()].reshape([n_timesteps, n_samples, options['word_encoding_size']])
    xI = tensor.matrix('xI', dtype=config.floatX)
    xAux = tensor.matrix('xAux', dtype=config.floatX)

    embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
        [1, n_samples, options['image_encoding_size']])
    emb = tensor.concatenate([embImg, embW], axis=0)

    rval, updatesLSTM = self.lstm_layer(tparams, emb[:n_timesteps, :, :], xAux, use_noise, options,
                                        prefix=options['generator'], mask=mask)
    p = rval[0]
    p = tensor.dot(p, tparams['Wd']) + tparams['bd']
    #pred = tensor.nnet.softmax(p)
    #pred = rval[2]
    #pred = pred[1:,:,:]
    p = p[1:, :, :]

    def accumCost(pred, xW, m, c_sum, ppl_sum):
        pred = tensor.nnet.softmax(pred)
        c_sum += (tensor.log(pred[tensor.arange(n_samples), xW] + 1e-20) * m)
        ppl_sum += -(tensor.log2(pred[tensor.arange(n_samples), xW] + 1e-10) * m)
        return c_sum, ppl_sum

    sums, upd = theano.scan(fn=accumCost,
                            outputs_info=[tensor.alloc(numpy_floatX(0.), 1, n_samples),
                                          tensor.alloc(numpy_floatX(0.), 1, n_samples)],
                            sequences=[p, xW[1:, :], mask[1:, :]])

    # NOTE1: we are leaving out the first prediction, which was made for the image
    # and is meaningless. Here cost[0] contains log probability (log10) and cost[1] contains
    # perplexity (log2)
    cost = sums[0][-1]

    inp_list = [xW, xI, mask]
    if options.get('en_aux_inp', 0):
        inp_list.append(xAux)

    self.f_pred_prob_other = theano.function(inp_list, p, name='f_pred_prob', updates=updatesLSTM)
    #f_pred = theano.function([xW, mask], pred.argmax(axis=1), name='f_pred')
    #cost = -tensor.log(pred[tensor.arange(n_timesteps),tensor.arange(n_samples), xW] + 1e-8).mean()
    self.f_eval_other = theano.function(inp_list, cost, name='f_eval')

    return use_noise, inp_list, self.f_pred_prob_other, cost, p, updatesLSTM
def build_eval_other_sent(self, tparams, options, model_npy):
    zipp(model_npy, self.model_th)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    xW = tensor.matrix('xW', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]
    n_out_samps = (n_timesteps - 1) * n_samples

    embW = tparams['Wemb'][xW.flatten()].reshape([n_timesteps, n_samples, options['word_encoding_size']])
    xI = tensor.matrix('xI', dtype=config.floatX)
    xAux = tensor.matrix('xAux', dtype=config.floatX)

    if options.get('swap_aux', 0):
        xAuxEmb = tensor.dot(xAux, tparams['WIemb_aux']) + tparams['b_Img_aux']
    else:
        xAuxEmb = xAux

    embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
        [1, n_samples, options['image_encoding_size']])
    emb = tensor.concatenate([embImg, embW], axis=0)

    rval, updatesLSTM = basic_lstm_layer(tparams, emb[:n_timesteps, :, :], xAuxEmb, use_noise, options,
                                         prefix=options['generator'])
    p = sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1), options['hidden_size'])

    pW = (tensor.dot(p, tparams['Wd']) + tparams['bd']).reshape([n_out_samps, options['output_size']])
    pWSft = tensor.nnet.softmax(pW)
    totProb = pWSft[tensor.arange(n_out_samps), xW[1:, :].flatten()]

    # #pred = tensor.nnet.softmax(p)
    # #pred = rval[2]
    # #pred = pred[1:,:,:]
    # def accumCost(pred, xW, m, c_sum, ppl_sum):
    #     pred = tensor.nnet.softmax(pred)
    #     c_sum += (tensor.log(pred[tensor.arange(n_samples), xW]+1e-20) * m)
    #     ppl_sum += -(tensor.log2(pred[tensor.arange(n_samples), xW]+1e-10) * m)
    #     return c_sum, ppl_sum
    #
    # sums, upd = theano.scan(fn=accumCost,
    #                         outputs_info=[tensor.alloc(numpy_floatX(0.), 1, n_samples),
    #                                       tensor.alloc(numpy_floatX(0.), 1, n_samples)],
    #                         sequences=[p, xW[1:,:], mask[1:,:]])

    # NOTE1: we are leaving out the first prediction, which was made for the image
    # and is meaningless. Here cost[0] contains log probability (log10) and cost[1] contains
    # perplexity (log2)
    tot_cost = -(tensor.log(totProb + 1e-10) * mask[1:, :].flatten()).sum()
    cost = tot_cost / options['batch_size']

    inp_list = [xW, mask, xI]
    if options.get('en_aux_inp', 0):
        inp_list.append(xAux)

    self.f_pred_prob_other = theano.function(inp_list, p, name='f_pred_prob', updates=updatesLSTM)
    #f_pred = theano.function([xW, mask], pred.argmax(axis=1), name='f_pred')
    #cost = -tensor.log(pred[tensor.arange(n_timesteps),tensor.arange(n_samples), xW] + 1e-8).mean()
    self.f_eval_other = theano.function(inp_list, cost, name='f_eval')

    return use_noise, inp_list, self.f_pred_prob_other, cost, pW, updatesLSTM
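# ---------------------------------------------------------------------------
# Numpy sketch of the totProb gather above: softmax over the flattened
# (timesteps*samples, vocab) logits, then each target word's probability is
# picked out with fancy indexing. Shapes below are tiny and made up.
import numpy as np
n_out_samps, vocab = 4, 5
pW = np.random.randn(n_out_samps, vocab)
pWSft = np.exp(pW) / np.exp(pW).sum(axis=1, keepdims=True)  # row-wise softmax
targets = np.array([3, 0, 4, 1])              # analogue of xW[1:, :].flatten()
totProb = pWSft[np.arange(n_out_samps), targets]
print totProb.shape                           # (4,): one probability per output position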
def main(params):
    checkpoint_path = params['checkpoint_path']
    print 'loading checkpoint %s' % (checkpoint_path, )
    checkpoint = pickle.load(open(checkpoint_path, 'rb'))
    cp_params = checkpoint['params']
    model_npy = checkpoint['model']

    # Load the candidates db generated from rnn's
    if params['candDb'] is not None:
        candDb = json.load(open(params['candDb'], 'r'))
    else:
        candDb = mergeRes(params)
    wordtoix = checkpoint['wordtoix'] if 'wordtoix' in checkpoint else checkpoint['misc']['wordtoix']

    # Read labels and build cocoid to imgid map
    if params['dataset'] == 'coco':
        lbls = open(params['lblF'], 'r').read().splitlines()
        objId2Imgid = {}
        for lb in lbls:
            objId2Imgid[str(int(lb.split()[1][1:-1]))] = int(lb.split()[0][1:])
        features, aux_inp, feat_idx, aux_idx = loadArbitraryFeatures(params, Ellipsis)
    elif params['dataset'] == 'msr-vtt':
        img_names_list = open(params['lblF'], 'r').read().splitlines()
        auxidxes = []
        img_names = [x.rsplit(',')[0] for x in img_names_list]
        objId2Imgid = {imn.split('.')[0]: i for i, imn in enumerate(img_names)}
        if len(img_names_list[0].split(',', 1)) > 1:
            if type(ast.literal_eval(img_names_list[0].split(',', 1)[1].strip())) == tuple:
                idxes = [ast.literal_eval(x.split(',', 1)[1].strip())[0] for x in img_names_list]
                auxidxes = [ast.literal_eval(x.split(',', 1)[1].strip())[1] for x in img_names_list]
            else:
                idxes = [ast.literal_eval(x.split(',', 1)[1].strip()) for x in img_names_list]
        else:
            idxes = xrange(len(img_names_list))
        params['poolmethod'] = cp_params['poolmethod'] if params['poolmethod'] is None else params['poolmethod']
        features, aux_inp, feat_idx, aux_idx = loadArbitraryFeatures(params, idxes, auxidxes=auxidxes)
    elif params['dataset'] == 'lsmdc':
        if params['use_label_file'] == 1:
            params['poolmethod'] = cp_params['poolmethod'] if params['poolmethod'] is None else params['poolmethod']
            params['labels'] = cp_params['labels'] if params['labels'] is None else params['labels']
            params['featfromlbl'] = cp_params['featfromlbl'] if params['featfromlbl'] is None else params['featfromlbl']
            params['uselabel'] = cp_params['uselabel'] if params['uselabel'] is None else params['uselabel']
        else:
            params['uselabel'] = 0
        img_names_list = open(params['lblF'], 'r').read().splitlines()
        img_names = [x.rsplit(',')[0] for x in img_names_list]
        idxes = [int(x.rsplit(',')[1]) for x in img_names_list]
        auxidxes = []
        objId2Imgid = {osp.basename(imn).split('.')[0]: i for i, imn in enumerate(img_names)}
        #import pdb;pdb.set_trace()
        features, aux_inp, feat_idx, aux_idx = loadArbitraryFeatures(params, idxes, auxidxes=auxidxes)

    if cp_params.get('use_encoder_for', 0) & 1:
        imgFeatEncoder = RecurrentFeatEncoder(cp_params['image_feat_size'], cp_params['sent_encoding_size'],
                                              cp_params, mdl_prefix='img_enc_', features=features.T)
        zipp(model_npy, imgFeatEncoder.model_th)
        (imgenc_use_dropout, imgFeatEnc_inp, xI, updatesLSTMImgFeat) = imgFeatEncoder.build_model(
            imgFeatEncoder.model_th, cp_params)
    else:
        xI = None
        imgFeatEnc_inp = []

    if 'eval_model' not in cp_params:
        cp_params['eval_model'] = params['eval_model']
    print 'Using evaluator module: ', cp_params['eval_model']

    # find the number of candidates per image and the max sentence length
    batch_size = 0
    maxlen = 0
    for i, img in enumerate(candDb['imgblobs']):
        for ids, cand in enumerate(img['candidatelist']):
            tks = cand['text'].split(' ')
            # Also tokenize the candidates
            candDb['imgblobs'][i]['candidatelist'][ids]['tokens'] = tks
            if len(tks) > maxlen:
                maxlen = len(tks)
        if batch_size < len(img['candidatelist']):
            batch_size = len(img['candidatelist'])
    # Pad all images to this batch size!

    # HACK!!
    maxlen = 24
    cp_params['maxlen'] = maxlen
    cp_params['batch_size'] = batch_size
    print maxlen

    # This initializes the model parameters and does matrix initializations
    cp_params['mode'] = 'predict'
    evalModel = decodeEvaluator(cp_params)
    model = evalModel.model_th
    # Define the computational graph for relating the input image features and word indices to the
    # log probability cost function.
    (use_dropout, inp_list_eval, f_pred_fns, cost, predTh, modelUpd) = evalModel.build_model(
        model, cp_params, xI=xI, prior_inp_list=imgFeatEnc_inp)
    inp_list = imgFeatEnc_inp + inp_list_eval

    ## Initialize the model parameters from the checkpoint file
    model = modelUpd if cp_params['eval_model'] == 'cnn' else model
    zipp(model_npy, model)
    print("\nPredicting using model %s, run for %0.2f epochs with validation perplx at %0.3f\n"
          % (checkpoint_path, checkpoint['epoch'], checkpoint['perplexity']))

    pos_samp = np.arange(1, dtype=np.int32) if cp_params['eval_model'] == 'cnn' else []

    # Disable dropout for prediction
    use_dropout.set_value(0.)
    if cp_params.get('use_encoder_for', 0) & 1:
        imgenc_use_dropout.set_value(0.)

    N = len(candDb['imgblobs'])
    stats = np.zeros((batch_size))
    #################### Main Loop ############################################
    for i, img in enumerate(candDb['imgblobs']):
        # fetch a batch of data
        print 'image %d/%d \r' % (i, N),
        batch = []
        cbatch_len = len(img['candidatelist'])
        objid = osp.basename(img['img_path']).split('_')[-1].split('.')[0]
        if params['dataset'] == 'coco':
            objid = str(int(objid))
        for s in img['candidatelist']:
            batch.append({'sentence': s,
                          'image': {'feat': features[:, feat_idx[objId2Imgid[objid]]].T,
                                    'img_idx': feat_idx[objId2Imgid[objid]]}})
            if params['aux_inp_file'] is not None:
                batch[-1]['aux_inp'] = aux_inp[:, aux_idx[objId2Imgid[objid]]].T
        if cbatch_len < batch_size and (cp_params['eval_model'] == 'cnn'):
            for z in xrange(batch_size - cbatch_len):
                batch.append({'sentence': img['candidatelist'][-1]})
        enc_inp_list = prepare_seq_features(batch,
                                            use_enc_for=cp_params.get('use_encoder_for', 0),
                                            use_shared_mem=cp_params.get('use_shared_mem_enc', 0),
                                            pos_samp=pos_samp)
        eval_inp_list, lenS = prepare_data(batch, wordtoix, maxlen=maxlen, pos_samp=pos_samp,
                                           prep_for=cp_params['eval_model'],
                                           use_enc_for=cp_params.get('use_encoder_for', 0))
        real_inp_list = enc_inp_list + eval_inp_list
        #import pdb;pdb.set_trace()

        # score all candidates with the evaluator and rerank
        scrs = np.squeeze(f_pred_fns[1](*real_inp_list))
        scrs = scrs[:cbatch_len]  # + scrs[:,cbatch_len:].sum()/cbatch_len
        for si, s in enumerate(img['candidatelist']):
            candDb['imgblobs'][i]['candidatelist'][si]['logprob'] = float(scrs[si])
            candDb['imgblobs'][i]['candidatelist'][si].pop('tokens')
        bestcand = scrs.argmax()
        stats[bestcand] += 1.0
        candDb['imgblobs'][i]['candidate'] = candDb['imgblobs'][i]['candidatelist'][bestcand]
        srtidx = np.argsort(scrs)[::-1]
        candDb['imgblobs'][i]['candsort'] = list(srtidx)

    print ""
    jsonFname = '%s_reranked_%s.json' % (cp_params['eval_model'], params['fname_append'])
    save_file = os.path.join(params['root_path'], jsonFname)
    json.dump(candDb, open(save_file, 'w'))
    print 'Written to file %s' % save_file
    print 'Final stats are:'
    print stats * 100.0 / N
def build_eval_other_sent(self, tparams, options, model_npy):
    zipp(model_npy, self.model_th)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    xW = tensor.matrix('xW', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]
    n_out_samps = (n_timesteps - 1) * n_samples

    embW = tparams['Wemb'][xW.flatten()].reshape([n_timesteps, n_samples, options['word_encoding_size']])
    xI = tensor.matrix('xI', dtype=config.floatX)
    xAux = tensor.matrix('xAux', dtype=config.floatX)

    if options.get('swap_aux', 0):
        xAuxEmb = tensor.dot(xAux, tparams['WIemb_aux']) + tparams['b_Img_aux']
    else:
        xAuxEmb = xAux

    embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
        [1, n_samples, options['image_encoding_size']])
    emb = tensor.concatenate([embImg, embW], axis=0)

    rval, updatesLSTM = basic_lstm_layer(tparams, emb[:n_timesteps, :, :], xAuxEmb, use_noise, options,
                                         prefix=options['generator'])
    p = sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1), options['hidden_size'])

    if options.get('class_out_factoring', 0) == 0:
        pW = (tensor.dot(p, tparams['Wd']) + tparams['bd']).reshape([n_out_samps, options['output_size']])
        pWSft = tensor.nnet.softmax(pW)
        totProb = pWSft[tensor.arange(n_out_samps), xW[1:, :].flatten()]
        out_list = [pWSft, totProb, p]
    else:
        ixtoclsinfo_t = tensor.as_tensor_variable(self.clsinfo)
        xC = ixtoclsinfo_t[xW[1:, :].flatten(), 0]
        pW = ((tparams['Wd'][:, xC, :].T *
               ((p.reshape([1, n_out_samps, options['hidden_size']]) -
                 tparams['WdCls'][:, xC].T))).sum(axis=-1).T + tparams['bd'][:, xC, :])
        pWSft = tensor.nnet.softmax(pW[0, :, :])
        pC = (tensor.dot(p, tparams['WdCls']) + tparams['bdCls']).reshape([n_out_samps, options['nClasses']])
        pCSft = tensor.nnet.softmax(pC)
        totProb = pWSft[tensor.arange(n_out_samps), ixtoclsinfo_t[xW[1:, :].flatten(), 3]] * \
                  pCSft[tensor.arange(n_out_samps), xC]

    tot_cost = -(tensor.log(totProb + 1e-10) * mask[1:, :].flatten()).reshape([n_timesteps - 1, n_samples])
    cost = tot_cost.sum(axis=0)

    inp_list = [xW, mask, xI]
    if options.get('en_aux_inp', 0):
        inp_list.append(xAux)

    self.f_pred_prob_other = theano.function([xW, xI, xAux], pWSft, name='f_pred_prob', updates=updatesLSTM)
    #f_pred = theano.function([xW, mask], pred.argmax(axis=1), name='f_pred')
    #cost = -tensor.log(pred[tensor.arange(n_timesteps),tensor.arange(n_samples), xW] + 1e-8).mean()
    self.f_eval_other = theano.function(inp_list, cost, name='f_eval')

    return use_noise, inp_list, self.f_pred_prob_other, cost, pW, updatesLSTM
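# ---------------------------------------------------------------------------
# Sketch of the class-factored output used above: the word probability
# decomposes as P(word) = P(word | class) * P(class), each term its own
# softmax. Toy sizes below (3 classes, 7 words within a class); not repo code.
import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

pCSft = softmax(np.random.randn(3))   # class distribution (nClasses analogue)
pWSft = softmax(np.random.randn(7))   # within-class word distribution
totProb = pWSft[2] * pCSft[1]         # P(word 2 | class 1) * P(class 1)
print totProb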
def main(params):
    # load the checkpoint
    if params['multi_model'] == 0:
        checkpoint_path = params['checkpoint_path']
        print 'loading checkpoint %s' % (checkpoint_path, )
        checkpoint = pickle.load(open(checkpoint_path, 'rb'))
        checkpoint_params = checkpoint['params']
        model_npy = checkpoint['model']
        checkpoint_params['use_theano'] = 1
        if 'image_feat_size' not in checkpoint_params:
            checkpoint_params['image_feat_size'] = 4096
        BatchGenerator = decodeGenerator(checkpoint_params)
        # Compile and init the theano predictor
        BatchGenerator.prepPredictor(model_npy, checkpoint_params, params['beam_size'])
        model = BatchGenerator.model_th
    else:
        BatchGenerator = []
        model_npy = []
        modelTh = []
        checkpoint_params = []
        for i, checkpoint_path in enumerate(params['checkpoint_path']):
            checkpoint = pickle.load(open(checkpoint_path, 'rb'))
            model_npy.append(checkpoint['model'])
            checkpoint_params.append(checkpoint['params'])
            checkpoint_params[i]['use_theano'] = 1
            BatchGenerator.append(decodeGenerator(checkpoint_params[i]))
            zipp(model_npy[i], BatchGenerator[i].model_th)
            modelTh.append(BatchGenerator[i].model_th)
            modelTh[i]['comb_weight'] = 1.0 / params['nmodels']
        BatchGenerator[0].prepMultiPredictor(modelTh, checkpoint_params, params['beam_size'], params['nmodels'])

    misc = {}
    ixtoword = checkpoint['ixtoword']
    misc['wordtoix'] = checkpoint['wordtoix']

    # output blob which we will dump to JSON for visualizing the results
    blob = {}
    blob['params'] = params
    blob['checkpoint_params'] = checkpoint_params
    blob['imgblobs'] = []

    # load the tasks.txt file and set up feature loading
    root_path = params['root_path']
    img_names_list = open(params['imgList'], 'r').read().splitlines()
    if len(img_names_list[0].rsplit(',')) > 1:
        img_names = [x.rsplit(',')[0] for x in img_names_list]
        idxes = [int(x.rsplit(',')[1]) for x in img_names_list]
    else:
        img_names = img_names_list
        idxes = xrange(len(img_names_list))

    #if checkpoint_params.get('en_aux_inp',0) and (params.get('aux_inp_file','None') == 'None'):
    #    raise ValueError('ERROR: please specify auxillary input feature using --aux_inp_file')
    #    return

    # load the features for all images
    features, aux_inp = loadArbitraryFeatures(params, idxes)

    N = len(img_names)

    # iterate over all images and predict sentences
    print("\nUsing model run for %0.2f epochs with validation perplx at %0.3f\n"
          % (checkpoint['epoch'], checkpoint['perplexity']))

    kwparams = {'beam_size': params['beam_size']}
    jsonFname = 'result_struct_%s.json' % (params['fname_append'])
    save_file = os.path.join(root_path, jsonFname)

    for n in xrange(N):
        print 'image %d/%d:' % (n, N)

        # encode the image
        if params['multi_model'] == 0:
            D, NN = features.shape
            img = {}
            img['feat'] = features[:, n]
            if checkpoint_params.get('en_aux_inp', 0):
                img['aux_inp'] = aux_inp[:, n]
            img['local_file_path'] = img_names[n]
            # perform the work. heavy lifting happens inside
            Ys = BatchGenerator.predict([{'image': img}], model, checkpoint_params, **kwparams)
        else:
            kwparams['nmodels'] = params['nmodels']
            batch = []
            for i in xrange(params['nmodels']):
                img = {}
                img['feat'] = features[i][:, n]
                if checkpoint_params[i].get('en_aux_inp', 0):
                    img['aux_inp'] = aux_inp[i][:, n]
                img['local_file_path'] = img_names[n]
                batch.append({'image': img})
            Ys = BatchGenerator[0].predictMulti(batch, checkpoint_params, **kwparams)

        # build up the output
        img_blob = {}
        img_blob['img_path'] = img['local_file_path']

        # encode the top prediction
        top_predictions = Ys[0]  # take predictions for the first (and only) image we passed in
        top_prediction = top_predictions[0]  # these are sorted with highest on top
        candidate = ' '.join([ixtoword[int(ix)] for ix in top_prediction[1] if ix > 0])  # ix 0 is the END token, skip that
        print 'PRED: (%f) %s' % (float(top_prediction[0]), candidate)
        img_blob['candidate'] = {'text': candidate, 'logprob': float(top_prediction[0])}

        # Code to save all the other candidates
        candlist = []
        for ci in xrange(len(top_predictions) - 1):
            prediction = top_predictions[ci + 1]  # these are sorted with highest on top
            candidate = ' '.join([ixtoword[int(ix)] for ix in prediction[1] if ix > 0])  # ix 0 is the END token, skip that
            candlist.append({'text': candidate, 'logprob': float(prediction[0])})
        img_blob['candidatelist'] = candlist
        blob['imgblobs'].append(img_blob)

        if (n % 5000) == 1:
            print 'writing predictions to %s...' % (save_file, )
            json.dump(blob, open(save_file, 'w'))

    # dump result struct to file
    print 'writing predictions to %s...' % (save_file, )
    json.dump(blob, open(save_file, 'w'))

    # dump output html
    html = ''
    for img in blob['imgblobs']:
        html += '<img src="%s" height="400"><br>' % (img['img_path'], )
        html += '(%f) %s <br><br>' % (img['candidate']['logprob'], img['candidate']['text'])
    html_file = 'result_%s.html' % (params['fname_append'])
    html_file = os.path.join(root_path, html_file)
    print 'writing html result file to %s...' % (html_file, )
    open(html_file, 'w').write(html)
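# ---------------------------------------------------------------------------
# Sketch of how a beam-search result tuple is unpacked into text above: each
# prediction is (logprob, index_sequence), and index 0 is the END token, which
# is skipped. The ixtoword below is a toy vocabulary for illustration.
ixtoword = {1: 'a', 2: 'dog', 3: 'runs'}
top_prediction = (-4.2, [1, 2, 3, 0])
candidate = ' '.join([ixtoword[int(ix)] for ix in top_prediction[1] if ix > 0])
print 'PRED: (%f) %s' % (float(top_prediction[0]), candidate)   # PRED: (-4.200000) a dog runs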
def main(params):
    batch_size = params['batch_size']
    word_count_threshold = params['word_count_threshold']
    max_epochs = params['max_epochs']
    host = socket.gethostname()  # get computer hostname

    # fetch the data provider
    dp = getDataProvider(params)

    # Initialize the optimizer
    solver = Solver(params['solver'])

    params['aux_inp_size'] = dp.aux_inp_size
    params['image_feat_size'] = dp.img_feat_size
    print 'Image feature size is %d, and aux input size is %d' % (params['image_feat_size'], params['aux_inp_size'])

    misc = {}  # stores various misc items that need to be passed around the framework

    # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur
    # at least word_count_threshold number of times
    misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(
        dp.iterSentences('train'), word_count_threshold)
    params['vocabulary_size'] = len(misc['wordtoix'])
    params['output_size'] = len(misc['ixtoword'])  # these should match though
    params['use_dropout'] = 1

    # This initializes the model parameters and does matrix initializations
    generator = decodeGenerator(params)
    (gen_inp_list, predLogProb, predIdx, predCand, wOut_emb, updatesLstm) = generator.build_prediction_model(
        generator.model_th, params, params['beam_size'])
    wOut_emb = wOut_emb.reshape([wOut_emb.shape[0], wOut_emb.shape[2]])
    f_gen_only = theano.function(gen_inp_list, [predLogProb, predIdx, wOut_emb], name='f_pred',
                                 updates=updatesLstm)

    modelGen = generator.model_th
    upListGen = generator.update_list

    if params['share_Wemb']:
        evaluator = decodeEvaluator(params, modelGen['Wemb'])
    else:
        evaluator = decodeEvaluator(params)
    modelEval = evaluator.model_th

    # Define the computational graph for relating the input image features and word indices to the
    # log probability cost function.
    (use_dropout_eval, eval_inp_list, f_pred_fns, costs, predTh, modelEval) = evaluator.build_advers_eval(
        modelEval, params, gen_inp_list, wOut_emb)

    comb_inp_list = eval_inp_list
    for inp in gen_inp_list:
        if inp not in comb_inp_list:
            comb_inp_list.append(inp)

    # Compile an evaluation function.. Doesn't include gradients
    # To be used for validation set evaluation
    f_eval = theano.function(comb_inp_list, costs, name='f_eval', updates=updatesLstm)

    # Now let's build a gradient computation graph and rmsprop update mechanism
    if params['share_Wemb']:
        modelEval.pop('Wemb')
    if params['fix_Wemb']:
        upListGen.remove('Wemb')

    modelGenUpD = OrderedDict()
    for k in upListGen:
        modelGenUpD[k] = modelGen[k]

    gradsEval = tensor.grad(costs[0], wrt=modelEval.values(), add_names=True)
    gradsGen = tensor.grad(costs[1], wrt=modelGenUpD.values(), add_names=True)

    lrEval = tensor.scalar(name='lrEval', dtype=config.floatX)
    f_grad_comp_eval, f_param_update_eval, zg_eval, rg_eval, ud_eval = solver.build_solver_model(
        lrEval, modelEval, gradsEval, comb_inp_list, costs[0], params)

    lrGen = tensor.scalar(name='lrGen', dtype=config.floatX)
    f_grad_comp_gen, f_param_update_gen, zg_gen, rg_gen, ud_gen = solver.build_solver_model(
        lrGen, modelGenUpD, gradsGen, comb_inp_list, costs[1], params)

    print 'model init done.'
    print 'model has keys: ' + ', '.join(modelGen.keys())

    # calculate how many iterations we need; one epoch is considered once going through all the sentences
    # and not images, hence in case of coco/flickr this will be 5x the number of images
    num_sentences_total = dp.getSplitSize('train', ofwhat='images')
    num_iters_one_epoch = num_sentences_total / batch_size
    max_iters = max_epochs * num_iters_one_epoch
    iters_eval = num_iters_one_epoch // 2
    iters_gen = num_iters_one_epoch // 4
    eval_period_in_epochs = params['eval_period']
    eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs))
    top_val_ppl2 = -1
    smooth_train_ppl2 = 0.5  # initially size of dictionary of confusion
    val_ppl2 = len(misc['ixtoword'])
    last_status_write_time = 0  # for writing worker job status reports
    json_worker_status = {}
    json_worker_status['params'] = params
    json_worker_status['history'] = []
    len_hist = defaultdict(int)
    t_print_sec = 60

    ## Initialize the model parameters from the checkpoint file if we are resuming training
    if params['checkpoint_file_name'] != 'None':
        zipp(model_init_from, modelGen)
        #zipp(rg_init,rgGen)
        print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n"
              % (checkpoint_init['epoch'], checkpoint_init['perplexity']))

    pos_samp = np.arange(batch_size, dtype=np.int32)
    print batch_size

    ##############################################################
    # Define signal handler to catch ctrl-c or kills so that we can save the model trained till that point
    def signal_handler(signal, frame):
        print('You pressed Ctrl+C! Saving Checkpoint Now before exiting!')
        filename = 'advmodel_checkpoint_%s_%s_%s_%.2f_INT.p' % (params['dataset'], host, params['fappend'], val_ppl2)
        dumpCheckpoint(filename, params, modelGen, modelEval, misc, it, val_ppl2)
        sys.exit(0)
    signal.signal(signal.SIGINT, signal_handler)
    ##############################################################

    for it in xrange(max_epochs):
        epoch = it * 1.0 / num_iters_one_epoch
        # Enable using dropout in training
        use_dropout_eval.set_value(1.)
        for it2 in xrange(iters_eval):
            t0 = time.time()
            # fetch a batch of data
            batch, _ = dp.sampPosNegSentSamps(params['eval_batch_size'] - params['rand_negs'])
            real_inp_list, lenS = prepare_data(batch, misc['wordtoix'], maxlen=params['maxlen'],
                                               pos_samp=pos_samp, prep_for=params['eval_model'],
                                               rand_negs=params['rand_negs'])
            # evaluate cost, gradient and perform parameter update
            cost = f_grad_comp_eval(*real_inp_list)
            f_param_update_eval(params['learning_rate_eval'])
            dt = time.time() - t0

            # Track training statistics
            train_ppl2 = (np.e ** (-cost))  # step_struct['stats']['ppl2']
            smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2  # smooth exponentially decaying moving average
            if it2 == 0:
                smooth_train_ppl2 = train_ppl2
            if it2 == 0:
                smooth_train_cost = cost
            else:
                smooth_train_cost = 0.99 * smooth_train_cost + 0.01 * cost

            tnow = time.time()
            if tnow > last_status_write_time + t_print_sec * 1:  # every now and then lets write a report
                print 'Eval Cnn in epoch %d: %d/%d sample done in %.3fs. Cost now is %.3f Pplx is %.3f' % (
                    it, it2, iters_eval, dt, smooth_train_cost, smooth_train_ppl2)
                last_status_write_time = tnow

        print 'Done training the discriminative model for now. Switching to generative model'
        print 'Eval N/W in epoch %d: Cost now is %.3f Pplx is %.3f' % (it, smooth_train_cost, smooth_train_ppl2)
        filename = 'advmodel_checkpoint_%s_%s_%s_%d_%.2f_EVOnly.p' % (params['dataset'], host, params['fappend'],
                                                                      it, smooth_train_ppl2)
        dumpCheckpoint(filename, params, modelGen, modelEval, misc, it, val_ppl2)

        # Disable Cnn dropout while training gen network
        use_dropout_eval.set_value(0.)
        for it2 in xrange(iters_gen):
            t0 = time.time()
            # fetch a batch of data
            batch, _ = dp.sampPosNegSentSamps(params['eval_batch_size'] - params['rand_negs'])
            real_inp_list, lenS = prepare_data(batch, misc['wordtoix'], maxlen=params['maxlen'],
                                               pos_samp=pos_samp, prep_for=params['eval_model'],
                                               rand_negs=params['rand_negs'])
            #import pdb; pdb.set_trace()
            # evaluate cost, gradient and perform parameter update
            #if any([np.isnan(modelGen[m].get_value()).any() for m in modelGen]):
            #    print 'Somebodys NAN!!!'
            #    break
            #asd = f_gen_only(real_inp_list[2], real_inp_list[3])
            #print it2, asd[-1].shape, real_inp_list[0].shape
            #if asd[-1].shape[0] > real_inp_list[0].shape[0]:
            #    import pdb; pdb.set_trace()
            cost = f_grad_comp_gen(*real_inp_list)
            #print it2, cost
            #if any([np.isnan(zg_gen[i].get_value()).any() for i in xrange(len(zg_gen))]):
            #    print 'Somebody zg is NAN!!!'
            #    break
            #if any([np.isnan(rg_gen[i].get_value()).any() for i in xrange(len(rg_gen))]) or \
            #   any([(rg_gen[i].get_value() < 0).any() for i in xrange(len(rg_gen))]):
            #    print 'Somebody rg is NAN!!!'
            #    break
            f_param_update_gen(params['learning_rate_gen'])
            dt = time.time() - t0

            # print training statistics
            train_ppl2 = (np.e ** (-cost))  # step_struct['stats']['ppl2']
            smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2  # smooth exponentially decaying moving average
            if it2 == 0:
                smooth_train_ppl2 = train_ppl2
            if it2 == 0:
                smooth_train_cost = cost
            else:
                smooth_train_cost = 0.99 * smooth_train_cost + 0.01 * cost

            tnow = time.time()
            if tnow > last_status_write_time + t_print_sec * 1:  # every now and then lets write a report
                print 'Gen Lstm in epoch %d: %d/%d sample done in %.3fs. Cost now is %.3f Pplx is %.3f' % (
                    it, it2, iters_gen, dt, smooth_train_cost, smooth_train_ppl2)
                last_status_write_time = tnow

        print 'Done training the generative model for now. Switching to discriminative model. Final stats are:'
        print 'Gen Lstm in epoch %d: Cost now is %.3f Pplx is %.3f' % (it, smooth_train_cost, smooth_train_ppl2)

        ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good
        is_last_iter = (it + 1) == max_iters
        is_last_iter = 1
        if (((it + 1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
            # Disable using dropout in validation
            #use_dropout.set_value(0.)
            #val_ppl2 = eval_split_theano('val', dp, model, params, misc, f_eval)  # perform the evaluation on VAL set
            #
            #if it - params['lr_decay_st_epoch'] >= 0:
            #    params['learning_rate'] = params['learning_rate'] * params['lr_decay']
            #    params['lr_decay_st_epoch'] += 1
            #
            #print 'validation perplexity = %f, lr = %f' % (val_ppl2, params['learning_rate'])
            #if params['sample_by_len'] == 1:
            #    print len_hist
            val_ppl2 = smooth_train_ppl2
            write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
            if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
                if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
                    # if we beat a previous record or if this is the first time
                    # AND we also beat the user-defined threshold or it doesn't exist
                    #top_val_ppl2 = val_ppl2
                    filename = 'advmodel_checkpoint_%s_%s_%s_%d_%.2f_GenDone.p' % (
                        params['dataset'], host, params['fappend'], it, smooth_train_ppl2)
                    dumpCheckpoint(filename, params, modelGen, modelEval, misc, it, val_ppl2)
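# ---------------------------------------------------------------------------
# The training loop above alternates phases each epoch: iters_eval updates of
# the evaluator (discriminator) followed by iters_gen updates of the generator.
# A minimal runnable sketch of that schedule with stand-in step functions
# (the lambdas below are placeholders, not the repo's update functions):
def train_adversarial(n_epochs, iters_eval, iters_gen, eval_step, gen_step):
    for epoch in xrange(n_epochs):
        for _ in xrange(iters_eval):   # discriminator phase
            eval_step()
        for _ in xrange(iters_gen):    # generator phase
            gen_step()

train_adversarial(2, 3, 1,
                  lambda: None,        # stand-in for f_grad_comp_eval + f_param_update_eval
                  lambda: None)        # stand-in for f_grad_comp_gen + f_param_update_gen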
def main(params):
    batch_size = params['batch_size']
    word_count_threshold = params['word_count_threshold']
    max_epochs = params['max_epochs']

    # fetch the data provider
    dp = getDataProvider(params)

    # Initialize the optimizer
    solver = Solver(params['solver'])

    params['aux_inp_size'] = dp.aux_inp_size
    params['image_feat_size'] = dp.img_feat_size
    print 'Image feature size is %d, and aux input size is %d' % (params['image_feat_size'], params['aux_inp_size'])

    misc = {}  # stores various misc items that need to be passed around the framework

    if params['checkpoint_file_name'] == 'None':
        # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur
        # at least word_count_threshold number of times
        misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(
            dp.iterSentences('train'), word_count_threshold)
    else:
        # Load vocabulary from the checkpoint
        misc = checkpoint_init['misc']
    params['vocabulary_size'] = len(misc['wordtoix'])
    params['output_size'] = len(misc['ixtoword'])  # these should match though

    # This initializes the generator model parameters and does matrix initializations
    if params['t_eval_only'] == 0:
        generator = decodeGenerator(params)
        # Build the computational graph
        if params['use_encoder_for'] & 2:
            aux_enc_inp = generator.model_th['Wemb'] if params['encode_gt_sentences'] else dp.aux_inputs.T
            hid_size = params['featenc_hidden_size']
            auxFeatEncoder = RecurrentFeatEncoder(hid_size, params['image_encoding_size'], params,
                                                  mdl_prefix='aux_enc_', features=aux_enc_inp)
            mdlLen = len(generator.model_th.keys())
            generator.model_th.update(auxFeatEncoder.model_th)
            assert (len(generator.model_th.keys()) == (mdlLen + len(auxFeatEncoder.model_th.keys())))
            (auxenc_use_dropout, auxFeatEnc_inp, xAux, updatesLSTMAuxFeat) = auxFeatEncoder.build_model(
                generator.model_th, params)
            if params['encode_gt_sentences']:
                # Reshape it to size (batch_size, n_gt, hidden_size)
                xAux = xAux.reshape((-1, params['n_encgt_sent'], params['featenc_hidden_size']))
                # Convert it to size (batch_size, n_gt*hidden_size)
                xAux = xAux.flatten(2)
                xI = tensor.zeros((batch_size, params['image_encoding_size']))
                imgFeatEnc_inp = []
        else:
            auxFeatEnc_inp = []
            imgFeatEnc_inp = []
            xAux = None
            xI = None

        (gen_inp_list, predLogProb, predIdx, predCand, gen_out, updatesLstm,
         seq_lengths) = generator.build_prediction_model(generator.model_th, params, xI=xI, xAux=xAux)
        gen_inp_list = imgFeatEnc_inp + auxFeatEnc_inp + gen_inp_list
        gen_out = gen_out.reshape([gen_out.shape[0], -1, params['n_gen_samples'], params['vocabulary_size']])
        # convert updates lstm to a tuple, this is to help merge it with grad updates
        updatesLstm = [(k, v) for k, v in updatesLstm.iteritems()]
        f_gen_only = theano.function(gen_inp_list, [predLogProb, predIdx, gen_out, seq_lengths],
                                     name='f_pred', updates=updatesLstm)
        modelGen = generator.model_th
        upListGen = generator.update_list

        if params['use_mle_train']:
            (use_dropout_genTF, inp_list_genTF, _, cost_genTF, _, updatesLSTM_genTF) = generator.build_model(
                generator.model_th, params)
            f_eval_genTF = theano.function(inp_list_genTF, cost_genTF, name='f_eval')
            grads_genTF = tensor.grad(cost_genTF[0], wrt=modelGen.values(), add_names=True)
            lr_genTF = tensor.scalar(name='lr', dtype=config.floatX)
            f_grad_genTF, f_update_genTF, zg_genTF, rg_genTF, ud_genTF = solver.build_solver_model(
                lr_genTF, modelGen, grads_genTF, inp_list_genTF, cost_genTF, params)
    else:
        modelGen = []
        updatesLstm = []

    if params['met_to_track'] != []:
        trackMetargs = {'eval_metric': params['met_to_track']}
        refToks, scr_info = eval_prep_refs('val', dp, params['met_to_track'])
        trackMetargs['refToks'] = refToks
        trackMetargs['scr_info'] = scr_info

    # Initialize the evaluator model
    if params['share_Wemb']:
        evaluator = decodeEvaluator(params, modelGen['Wemb'])
    else:
        evaluator = decodeEvaluator(params)
    modelEval = evaluator.model_th

    if params['t_eval_only'] == 0:
        # Build the evaluator graph to evaluate reference and generated captions
        if params.get('upd_eval_ref', 0):
            (refeval_inp_list, ref_f_pred_fns, ref_costs, ref_predTh, ref_modelEval) = evaluator.build_advers_eval(
                modelEval, params)
        (eval_inp_list, f_pred_fns, costs, predTh, modelEval) = evaluator.build_advers_eval(
            modelEval, params, gen_inp_list, gen_out, updatesLstm, seq_lengths)
    else:
        # Build the evaluator graph to evaluate only reference captions
        (eval_inp_list, f_pred_fns, costs, predTh, modelEval) = evaluator.build_advers_eval(modelEval, params)

    # force overwrite here. The bias to the softmax is initialized to reflect word frequencies
    if params['t_eval_only'] == 0:  # and 0:
        if params['checkpoint_file_name'] == 'None':
            modelGen['bd'].set_value(bias_init_vector.astype(config.floatX))
            if params.get('class_out_factoring', 0) == 1:
                modelGen['bdCls'].set_value(bias_init_inter_class.astype(config.floatX))

    comb_inp_list = eval_inp_list
    if params['t_eval_only'] == 0:
        for inp in gen_inp_list:
            if inp not in comb_inp_list:
                comb_inp_list.append(inp)

    # Compile an evaluation function.. Doesn't include gradients
    # To be used for validation set evaluation or debug purposes
    if params['t_eval_only'] == 0:
        f_eval = theano.function(comb_inp_list, costs[:1], name='f_eval', updates=updatesLstm)
    else:
        f_eval = theano.function(comb_inp_list, costs[:1], name='f_eval')

    if params['share_Wemb']:
        modelEval.pop('Wemb')
    if params['fix_Wemb']:
        upListGen.remove('Wemb')

    #--------------------------------------------------------------------------------------------------
    # Now let's build a gradient computation graph and update mechanism
    #--------------------------------------------------------------------------------------------------
    # First compute gradient on the evaluator params w.r.t cost
    if params.get('upd_eval_ref', 0):
        gradsEval_ref = tensor.grad(ref_costs[0], wrt=modelEval.values(), add_names=True)
    gradsEval = tensor.grad(costs[0], wrt=modelEval.values(), add_names=True)

    # Update functions for the evaluator
    lrEval = tensor.scalar(name='lrEval', dtype=config.floatX)
    if params.get('upd_eval_ref', 0):
        f_grad_comp_eval_ref, f_param_update_eval_ref, _, _, _ = solver.build_solver_model(
            lrEval, modelEval, gradsEval_ref, refeval_inp_list, ref_costs[0], params,
            w_clip=params['eval_w_clip'])
    f_grad_comp_eval, f_param_update_eval, zg_eval, rg_eval, ud_eval = solver.build_solver_model(
        lrEval, modelEval, gradsEval, comb_inp_list, costs[:1], params, updatesLstm,
        w_clip=params['eval_w_clip'])

    # Now compute gradient on the generator params w.r.t the cost
    if params['t_eval_only'] == 0:
        gradsGen = tensor.grad(costs[1], wrt=modelGen.values(), add_names=True)
        lrGen = tensor.scalar(name='lrGen', dtype=config.floatX)
        # Update functions for the generator
        f_grad_comp_gen, f_param_update_gen, zg_gen, rg_gen, ud_gen = solver.build_solver_model(
            lrGen, modelGen, gradsGen,
            comb_inp_list[:(len(comb_inp_list) - 1 + params['gen_feature_matching'])],
            costs[1], params, updatesLstm)

    #--------------------------------------------------------------------------------------------------
    # If we want to track some metrics during the training, initialize stuff for that now
    #--------------------------------------------------------------------------------------------------
    print 'model init done.'
    if params['t_eval_only'] == 0:
        print 'Gen model has keys: ' + ', '.join(modelGen.keys())
    print 'Eval model has keys: ' + ', '.join(modelEval.keys())

    # calculate how many iterations we need; one epoch is considered once going through all the sentences
    # and not images, hence in case of coco/flickr this will be 5x the number of images
    num_sentences_total = dp.getSplitSize('train', ofwhat='images')
    num_iters_one_epoch = num_sentences_total / batch_size
    max_iters = max_epochs * num_iters_one_epoch
    skip_first = 20
    iters_eval = 5
    iters_gen = 1

    cost_eval_iter = []
    cost_gen_iter = []
    trackSc_array = []

    eval_period_in_epochs = params['eval_period']
    eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs))
    top_val_ppl2 = -1
    smooth_train_ppl2 = 0.5  # initially size of dictionary of confusion
    smooth_train_cost = 0.0
    smooth_train_cost_gen = 1.0
    val_ppl2 = len(misc['ixtoword'])
    last_status_write_time = 0  # for writing worker job status reports
    json_worker_status = {}
    json_worker_status['params'] = params
    json_worker_status['history'] = []
    write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
    iter_out_file = os.path.join('logs', 'advmodel_checkpoint_%s_%s_%s_log.npz' %
                                 (params['dataset'], host, params['fappend']))
    len_hist = defaultdict(int)
    t_print_sec = 30

    ## Initialize the model parameters from the checkpoint file if we are resuming training
    if params['checkpoint_file_name'] != 'None':
        if params['t_eval_only'] != 1:
            print '\n Now initing gen Model:'
            zipp(model_init_gen_from, modelGen)
            if 'trackers' in checkpoint_init:
                trackSc_array = checkpoint_init['trackers'].get('trackScores', [])
        print '\n Now initing Eval Model:'
        zipp(model_init_eval_from, modelEval)
        #zipp(rg_init,rgGen)
        print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n"
              % (checkpoint_init['epoch'], checkpoint_init['perplexity']))

    ##############################################################
    # Define signal handler to catch ctrl-c or kills so that we can save the model trained till that point
    def signal_handler(signal, frame):
        print('You pressed Ctrl+C! Saving Checkpoint Now before exiting!')
        filename = 'advmodel_checkpoint_%s_%s_%s_%.2f_INT.p' % (params['dataset'], host, params['fappend'], val_ppl2)
        dumpCheckpoint(filename, params, modelGen, modelEval, misc, it, val_ppl2)
        sys.exit(0)
    #signal.signal(signal.SIGINT, signal_handler)
    ##############################################################

    # In testing disable sampling and use the greedy approach!?
    generator.usegumbel.set_value(1)
    if params['met_to_track'] != []:
        tsc_max, tsc_mean, tsc_min = eval_gen_samps(f_gen_only, dp, params, misc, params['rev_eval'],
                                                    **trackMetargs)
        trackSc_array.append((0, {evm + '_max': tsc_max[i] for i, evm in enumerate(params['met_to_track'])}))
        trackSc_array[-1][1].update({evm + '_mean': tsc_mean[i] for i, evm in enumerate(params['met_to_track'])})
        trackSc_array[-1][1].update({evm + '_min': tsc_min[i] for i, evm in enumerate(params['met_to_track'])})

    disp_some_gen_samps(f_gen_only, dp, params, misc, n_samp=5)
    evaluator.use_noise.set_value(1.)
    eval_acc, gen_acc = eval_discrm_gen('val', dp, params, f_pred_fns[0], misc)
    # Re-enable sampling
    generator.usegumbel.set_value(1)

    np.savez(iter_out_file, eval_cost=np.array(cost_eval_iter), gen_cost=np.array(cost_gen_iter),
             tracksc=np.array(trackSc_array))
    smooth_train_cost = 0.0

    print '###################### NOW BEGINNING TRAINING #################################'
    for it in xrange(max_iters):
        t0 = time.time()
        # Enable using dropout in training
        evaluator.use_noise.set_value(1.)
        dt = 0.
        it2 = 0
        while eval_acc <= 60. or gen_acc >= 45. or it2 < iters_eval * skip_first:
            # fetch a batch of data
            t1 = time.time()
            s_probs = [0.6, 0.4, 0.0] if params['eval_loss'] == 'contrastive' else [1.0, 0.0, 0.0]
            batch = dp.sampAdversBatch(batch_size, n_sent=params['n_gen_samples'], probs=s_probs)
            cnn_inps = prepare_adv_data(batch, misc['wordtoix'], maxlen=params['maxlen'],
                                        prep_for=params['eval_model'])
            enc_inp_list = prepare_seq_features(batch,
                                                use_enc_for=params['use_encoder_for'],
                                                maxlen=params['maxlen'],
                                                use_shared_mem=params['use_shared_mem_enc'],
                                                enc_gt_sent=params['encode_gt_sentences'],
                                                n_enc_sent=params['n_encgt_sent'],
                                                wordtoix=misc['wordtoix'])
            eval_cost = f_grad_comp_eval(*(cnn_inps + enc_inp_list))
            if np.isnan(eval_cost[0]):
                import pdb
                pdb.set_trace()
            f_param_update_eval(params['learning_rate_eval'])

            # Track training statistics
            smooth_train_cost = 0.99 * smooth_train_cost + 0.01 * eval_cost[0] if it > 0 else eval_cost[0]
            dt2 = time.time() - t1
            if it2 % 500 == 499:
                gb = 0.  # modelGen['gumb_temp'].get_value() if params['use_gumbel_mse'] == 1 else 0
                print 'Iter %d/%d Eval Only Iter %d/%d, done. in %.3fs. Eval Cost is %.6f' % (
                    it, max_iters, it2, iters_eval * skip_first, dt2, smooth_train_cost)
            if it2 % 100 == 99:
                eval_acc, gen_acc = eval_discrm_gen('val', dp, params, f_pred_fns[0], misc, n_eval=500)
            it2 += 1

        evaluator.use_noise.set_value(1.)
        if it >= 0:
            skip_first = 1
        if it >= 100:
            skip_first = 1
        if it % 1000 == 999:
            skip_first = 1

        s_probs = [1.0, 0.0, 0.0] if params['eval_loss'] == 'contrastive' else [1.0, 0.0, 0.0]
        batch = dp.sampAdversBatch(batch_size, n_sent=params['n_gen_samples'], probs=s_probs)
        cnn_inps = prepare_adv_data(batch, misc['wordtoix'], maxlen=params['maxlen'],
                                    prep_for=params['eval_model'])
        enc_inp_list = prepare_seq_features(batch,
                                            use_enc_for=params['use_encoder_for'],
                                            maxlen=params['maxlen'],
                                            use_shared_mem=params['use_shared_mem_enc'],
                                            enc_gt_sent=params['encode_gt_sentences'],
                                            n_enc_sent=params['n_encgt_sent'],
                                            wordtoix=misc['wordtoix'])
        gen_cost = f_grad_comp_gen(*(cnn_inps[:(len(cnn_inps) - 1 + params['gen_feature_matching'])]
                                     + enc_inp_list))
        f_param_update_gen(params['learning_rate_gen'])

        if params['use_mle_train']:
            generator.usegumbel.set_value(0)
            batch, l = dp.getRandBatchByLen(batch_size)
            gen_inp_list, lenS = prepare_data(batch, misc['wordtoix'], params['maxlen'])
            cost_genMLE = f_grad_genTF(*gen_inp_list)
            f_update_genTF(np.float32(params['learning_rate_gen'] / 50.0))
            generator.usegumbel.set_value(1)

        dt = time.time() - t0
        # print training statistics
        smooth_train_cost_gen = gen_cost if it == 0 else 0.99 * smooth_train_cost_gen + 0.01 * gen_cost
        tnow = time.time()
        if tnow > last_status_write_time + t_print_sec * 1:  # every now and then lets write a report
            gb = 0.  # modelGen['gumb_temp'].get_value() if params['use_gumbel_mse'] == 1 else 0
            print 'Iter %d/%d done. in %.3fs.
Eval Cost is %.6f, Gen Cost is %.6f, temp: %.4f' % (it, max_iters, dt, \ smooth_train_cost, smooth_train_cost_gen, gb) last_status_write_time = tnow cost_eval_iter.append(smooth_train_cost) cost_gen_iter.append(smooth_train_cost_gen) if it % 500 == 499: # Run the generator on the validation set and compute some metrics generator.usegumbel.set_value(1) if params['met_to_track'] != []: #In testing set the temperature to very low, so that it is equivalent to Greed samples tsc_max, tsc_mean, tsc_min = eval_gen_samps( f_gen_only, dp, params, misc, params['rev_eval'], **trackMetargs) trackSc_array.append((it, { evm + '_max': tsc_max[i] for i, evm in enumerate(params['met_to_track']) })) trackSc_array[-1][1].update({ evm + '_mean': tsc_mean[i] for i, evm in enumerate(params['met_to_track']) }) trackSc_array[-1][1].update({ evm + '_min': tsc_min[i] for i, evm in enumerate(params['met_to_track']) }) disp_some_gen_samps(f_gen_only, dp, params, misc, n_samp=5) generator.usegumbel.set_value(1) # if we beat a previous record or if this is the first time # AND we also beat the user-defined threshold or it doesnt exist top_val_ppl2 = gen_acc if it % 500 == 499: eval_acc, gen_acc = eval_discrm_gen('val', dp, params, f_pred_fns[0], misc, n_eval=500) if it % 1000 == 999: filename = 'advmodel_checkpoint_%s_%s_%s_%d_%.2f_genacc.p' % ( params['dataset'], host, params['fappend'], it, gen_acc) dumpCheckpoint(filename, params, modelGen, modelEval, misc, it, gen_acc) if it % 500 == 499: np.savez(iter_out_file, eval_cost=np.array(cost_eval_iter), gen_cost=np.array(cost_gen_iter), tracksc=np.array(trackSc_array)) # AND we also beat the user-defined threshold or it doesnt exist filename = 'advmodel_checkpoint_%s_%s_%s_%d_%.2f_GenDone.p' % ( params['dataset'], host, params['fappend'], it, g_acc) dumpCheckpoint(filename, params, modelGen, modelEval, misc, it, g_acc)
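# ---------------------------------------------------------------------------
# A minimal, self-contained sketch (not part of the original pipeline) of the
# alternating schedule the training loop above implements: the evaluator
# (discriminator) is updated in an inner loop until it is accurate enough --
# here stand-in accuracies play the role of eval_discrm_gen's outputs -- and
# only then does the generator take one step. All names and numbers below are
# illustrative, not the project's API.
import random

def toy_adversarial_schedule(n_iters=5, eval_acc_thresh=60., gen_acc_thresh=45.):
    eval_acc, gen_acc = 0., 100.  # start with an untrained evaluator
    for it in range(n_iters):
        it2 = 0
        # inner loop: mirror of "while eval_acc <= 60. or gen_acc >= 45. ..."
        while eval_acc <= eval_acc_thresh or gen_acc >= gen_acc_thresh or it2 < 3:
            # stand-ins for f_grad_comp_eval / f_param_update_eval
            eval_acc = min(100., eval_acc + random.uniform(0., 20.))
            gen_acc = max(0., gen_acc - random.uniform(0., 20.))
            it2 += 1
        # one generator step per outer iteration (f_param_update_gen);
        # a better generator fools the evaluator more often again
        gen_acc = min(100., gen_acc + random.uniform(0., 10.))
        print('iter %d: %d evaluator steps, eval_acc=%.1f, gen_acc=%.1f'
              % (it, it2, eval_acc, gen_acc))

toy_adversarial_schedule()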
def main(params): # load the checkpoint checkpoint_path = params['checkpoint_path'] print 'loading checkpoint %s' % (checkpoint_path, ) checkpoint = pickle.load(open(checkpoint_path, 'rb')) cp_params = checkpoint['params'] if params['gen_model'] == None: model_npy = checkpoint[ 'model'] if 'model' in checkpoint else checkpoint['modelGen'] else: gen_cp = pickle.load(open(params['gen_model'], 'rb')) model_npy = gen_cp.get('model', {}) cp_params['use_theano'] = 1 if params['dobeamsearch']: cp_params['advers_gen'] = 0 if params['use_label_file'] == 1: params['poolmethod'] = cp_params['poolmethod'] if params[ 'poolmethod'] == None else params['poolmethod'] params['labels'] = cp_params['labels'] if params[ 'labels'] == None else params['labels'] params['featfromlbl'] = cp_params['featfromlbl'] if params[ 'featfromlbl'] == None else params['featfromlbl'] params['uselabel'] = cp_params['uselabel'] if params[ 'uselabel'] == None else params['uselabel'] else: params['uselabel'] = 0 print 'parsed parameters:' print json.dumps(params, indent=2) if 'image_feat_size' not in cp_params: cp_params['image_feat_size'] = 4096 if 'misc' in checkpoint: misc = checkpoint['misc'] ixtoword = misc['ixtoword'] else: misc = {} ixtoword = checkpoint['ixtoword'] misc['wordtoix'] = checkpoint['wordtoix'] cp_params['softmax_smooth_factor'] = params['softmax_smooth_factor'] cp_params['softmax_propogate'] = params['softmax_propogate'] cp_params['computelogprob'] = params['computelogprob'] cp_params['greedy'] = params['greedy'] cp_params['gen_input_noise'] = 0 if cp_params.get('sched_sampling_mode', None) != None: cp_params['sched_sampling_mode'] = None # load the tasks.txt file and setupe feature loading root_path = params['root_path'] img_names_list = open(params['imgList'], 'r').read().splitlines() auxidxes = [] img_names = [x.rsplit(',')[0] for x in img_names_list] if len(img_names_list[0].split(',', 1)) > 1: if type(ast.literal_eval(img_names_list[0].split( ',', 1)[1].strip())) == tuple: idxes = [ ast.literal_eval(x.split(',', 1)[1].strip())[0] for x in img_names_list ] auxidxes = [ ast.literal_eval(x.split(',', 1)[1].strip())[1] for x in img_names_list ] else: idxes = [ ast.literal_eval(x.split(',', 1)[1].strip()) for x in img_names_list ] else: idxes = xrange(len(img_names_list)) if cp_params.get('swap_aux') == 0 or auxidxes == []: features, aux_inp, feat_idx, aux_idx = loadArbitraryFeatures( params, idxes, auxidxes=auxidxes) else: features, aux_inp, feat_idx, aux_idx = loadArbitraryFeatures( params, auxidxes, auxidxes=idxes) ##-------------------------------- Setup the models --------------------------########### if cp_params.get('use_encoder_for', 0) & 1: imgFeatEncoder = RecurrentFeatEncoder(cp_params['image_feat_size'], cp_params['word_encoding_size'], cp_params, mdl_prefix='img_enc_', features=features.T) zipp(model_npy, imgFeatEncoder.model_th) (imgenc_use_dropout, imgFeatEnc_inp, xI, updatesLSTMImgFeat) = imgFeatEncoder.build_model( imgFeatEncoder.model_th, cp_params) else: xI = None imgFeatEnc_inp = [] if cp_params.get('use_encoder_for', 0) & 2: auxFeatEncoder = RecurrentFeatEncoder(cp_params['aux_inp_size'], cp_params['image_encoding_size'], cp_params, mdl_prefix='aux_enc_', features=aux_inp.T) zipp(model_npy, auxFeatEncoder.model_th) (auxenc_use_dropout, auxFeatEnc_inp, xAux, updatesLSTMAuxFeat) = auxFeatEncoder.build_model( auxFeatEncoder.model_th, cp_params) else: auxFeatEnc_inp = [] xAux = None # Testing to see if diversity can be achieved by weighing words if params['word_freq_w'] != None: w_freq = 
json.load(open(params['word_freq_w'], 'r')) w_logw = np.zeros(len(misc['wordtoix']), dtype=np.float32) for w in w_freq: if w in misc['wordtoix']: w_logw[misc['wordtoix'][w]] = w_freq[w] w_logw = w_logw / w_logw[1:].min() w_logw[0] = w_logw.max() w_logw = -params['word_freq_sc'] * np.log(w_logw) else: w_logw = None BatchGenerator = decodeGenerator(cp_params) # Compile and init the theano predictor BatchGenerator.prepPredictor(model_npy, cp_params, params['beam_size'], xI, xAux, imgFeatEnc_inp + auxFeatEnc_inp, per_word_logweight=w_logw) model = BatchGenerator.model_th if params['greedy']: BatchGenerator.usegumbel.set_value(0) # output blob which we will dump to JSON for visualizing the results blob = {} blob['params'] = params blob['checkpoint_params'] = copy(cp_params) if cp_params.get('class_out_factoring', 0) == 1: blob['checkpoint_params'].pop('ixtoclsinfo') blob['imgblobs'] = [] N = len(img_names) # iterate over all images and predict sentences print("\nUsing model run for %0.2f epochs with validation perplx at %0.3f\n" % (checkpoint['epoch'], \ checkpoint['perplexity'])) kwparams = {} jsonFname = 'result_struct_%s.json' % (params['fname_append']) save_file = os.path.join(root_path, jsonFname) for n in xrange(N): print 'image %d/%d:' % (n, N) # encode the image D, NN = features.shape img = {} img['feat'] = features[:, feat_idx[n]].T img['img_idx'] = feat_idx[n] if cp_params.get('en_aux_inp', 0): img['aux_inp'] = aux_inp( aux_idx[n]) if aux_inp != [] else np.zeros( cp_params['aux_inp_size'], dtype=np.float32) img['aux_idx'] = aux_idx[n] if aux_inp != [] else [] img['local_file_path'] = img_names[n] # perform the work. heavy lifting happens inside enc_inp_list = prepare_seq_features( [{ 'image': img }], use_enc_for=cp_params.get('use_encoder_for', 0), use_shared_mem=cp_params.get('use_shared_mem_enc', 0)) #import pdb;pdb.set_trace() Ys, Ax = BatchGenerator.predict([{ 'image': img }], cp_params, ext_inp=enc_inp_list) # build up the output img_blob = {} img_blob['img_path'] = img['local_file_path'] # encode the top prediction top_predictions = Ys[0] if params[ 'rescoreByLen'] == 0 else rescoreProbByLen( Ys[0] ) # take predictions for the first (and only) image we passed in top_predictions = sorted(top_predictions, key=lambda aa: aa[0], reverse=True) top_prediction = top_predictions[ 0] # these are sorted with highest on top if cp_params.get('reverse_sentence', 0) == 0: candidate = ' '.join([ ixtoword[int(ix)] for ix in top_prediction[1] if ix > 0 ]) # ix 0 is the END token, skip that else: candidate = ' '.join([ ixtoword[int(ix)] for ix in reversed(top_prediction[1]) if ix > 0 ]) # ix 0 is the END token, skip that #if candidate == '': # import pdb;pdb.set_trace() if params['rescoreByLen'] == 0: print 'PRED: (%f) %s' % (float(top_prediction[0]), candidate) else: print 'PRED: (%f, %f) %s' % (float( top_prediction[0]), float(top_prediction[2]), candidate) img_blob['candidate'] = { 'text': candidate, 'logprob': float(top_prediction[0]) } # Code to save all the other candidates candlist = [] for ci in xrange(len(top_predictions) - 1): prediction = top_predictions[ ci + 1] # these are sorted with highest on top candidate = ' '.join([ ixtoword[int(ix)] for ix in prediction[1] if ix > 0 ]) # ix 0 is the END token, skip that candlist.append({ 'text': candidate, 'logprob': float(prediction[0]) }) img_blob['candidatelist'] = candlist blob['imgblobs'].append(img_blob) if (n % 5000) == 1: print 'writing predictions to %s...' 
% (save_file, ) json.dump(blob, open(save_file, 'w')) # dump result struct to file print 'writing predictions to %s...' % (save_file, ) json.dump(blob, open(save_file, 'w'))
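# ---------------------------------------------------------------------------
# A small sketch (an assumption, not code from the original repo) of the
# imgList format the loader above parses: each line is "filename",
# "filename,idx" or "filename,(feat_idx, aux_idx)"; ast.literal_eval is what
# distinguishes a plain index from an (index, aux-index) tuple.
import ast

def parse_img_list_line(line):
    parts = line.split(',', 1)
    name = parts[0]
    if len(parts) == 1:
        return name, None, None  # no explicit feature index
    val = ast.literal_eval(parts[1].strip())
    if type(val) == tuple:
        return name, val[0], val[1]  # feature index and auxiliary index
    return name, val, None  # only a feature index

print(parse_img_list_line('img1.jpg'))           # ('img1.jpg', None, None)
print(parse_img_list_line('img2.jpg, 12'))       # ('img2.jpg', 12, None)
print(parse_img_list_line('img3.jpg, (12, 4)'))  # ('img3.jpg', 12, 4)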
def main(params):
    word_count_threshold = params['word_count_threshold']
    max_epochs = params['max_epochs']
    host = socket.gethostname()  # get computer hostname

    # fetch the data provider
    dp = getDataProvider(params)
    # Initialize the optimizer
    solver = Solver(params['solver'])
    params['image_feat_size'] = dp.img_feat_size
    params['aux_inp_size'] = dp.aux_inp_size

    misc = {}  # stores various misc items that need to be passed around the framework

    # go over all training sentences and find the vocabulary we want to use,
    # i.e. the words that occur at least word_count_threshold number of times
    misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(
        dp.iterSentences('train'), word_count_threshold)

    if params['fine_tune'] == 1:
        params['mode'] = 'multi_choice_mode' if params['mc_mode'] == 1 else 'multimodal_lstm'
        if params['checkpoint_file_name'] != None:
            #params['batch_size'] = dp.dataset['batchsize']
            misc['wordtoix'] = checkpoint_init['wordtoix']
            misc['ixtoword'] = checkpoint_init['ixtoword']
        batch_size = 1
        num_sentences_total = dp.getSplitSize('train', ofwhat='images')
    else:
        params['mode'] = 'batchtrain'
        batch_size = params['batch_size']
        num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')

    params['vocabulary_size'] = len(misc['wordtoix'])
    pos_samp = np.arange(batch_size, dtype=np.int32)

    # This initializes the model parameters and does matrix initializations
    evalModel = decodeEvaluator(params)
    model, misc['update'], misc['regularize'] = (evalModel.model_th,
                                                 evalModel.updateP,
                                                 evalModel.regularize)

    #----------------- If we are using feature encoders -----------------------
    if params['use_encoder_for'] & 1:
        imgFeatEncoder = RecurrentFeatEncoder(params['image_feat_size'],
                                              params['sent_encoding_size'],
                                              params, mdl_prefix='img_enc_',
                                              features=dp.features.T)
        mdlLen = len(model.keys())
        model.update(imgFeatEncoder.model_th)
        assert (len(model.keys()) == (mdlLen + len(imgFeatEncoder.model_th.keys())))
        #misc['update'].extend(imgFeatEncoder.update_list)
        misc['regularize'].extend(imgFeatEncoder.regularize)
        (imgenc_use_dropout, imgFeatEnc_inp, xI,
         updatesLSTMImgFeat) = imgFeatEncoder.build_model(model, params)
    else:
        xI = None
        imgFeatEnc_inp = []

    # Define the computational graph relating the input image features and word
    # indices to the log probability cost function.
    (use_dropout, inp_list_eval, miscOuts, cost, predTh,
     model) = evalModel.build_model(model, params, xI=xI, prior_inp_list=imgFeatEnc_inp)
    inp_list = imgFeatEnc_inp + inp_list_eval

    # Compile an evaluation function. Doesn't include gradients;
    # to be used for validation set evaluation
    f_eval = theano.function(inp_list, cost, name='f_eval')

    # Add the regularization cost. Since this is specific to training and doesn't
    # get included when we evaluate the cost on test or validation data, we leave
    # it here outside the model definition
    if params['regc'] > 0.:
        reg_cost = theano.shared(numpy_floatX(0.), name='reg_c')
        reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']), name='reg_c')
        for p in misc['regularize']:
            reg_cost += (model[p]**2).sum()
        reg_cost *= 0.5 * reg_c
        cost[0] += (reg_cost / params['batch_size'])

    # Now let's build a gradient computation graph and rmsprop update mechanism
    grads = tensor.grad(cost[0], wrt=model.values())
    lr = tensor.scalar(name='lr', dtype=config.floatX)
    if params['sim_minibatch'] > 0:
        f_grad_accum, f_clr, ag = solver.accumGrads(model, grads, inp_list, cost,
                                                    params['sim_minibatch'])
        f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(
            lr, model, ag, inp_list, cost, params)
    else:
        f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(
            lr, model, grads, inp_list, cost, params)

    print 'model init done.'
    print 'model has keys: ' + ', '.join(model.keys())

    # Calculate how many iterations we need. One epoch is one pass over all the
    # sentences (not images); for coco/flickr that is 5x the number of images.
    num_iters_one_epoch = num_sentences_total / batch_size
    max_iters = max_epochs * num_iters_one_epoch
    inner_loop = params['sim_minibatch'] if params['sim_minibatch'] > 0 else 1
    max_iters = max_iters / inner_loop
    eval_period_in_epochs = params['eval_period']
    eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs / inner_loop))
    top_val_ppl2 = -1
    smooth_train_cost = len(misc['ixtoword'])  # initially size of dictionary of confusion
    smooth_error_rate = 100.
    error_rate = 0.
    prev_it = -1
    val_ppl2 = len(misc['ixtoword'])
    last_status_write_time = 0  # for writing worker job status reports
    json_worker_status = {}
    json_worker_status['params'] = params
    json_worker_status['history'] = []
    len_hist = defaultdict(int)

    ## Initialize the model parameters from the checkpoint file if we are resuming training
    if params['checkpoint_file_name'] != None:
        zipp(model_init_from, model)
        zipp(rg_init, rg)
        print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplexity at %0.3f\n" % (
            checkpoint_init['epoch'], checkpoint_init['perplexity']))
    elif params['init_from_imagernn'] != None:
        # Initialize word vecs and image emb from generative model file
        rnnCv = pickle.load(open(params['init_from_imagernn'], 'rb'))
        model['Wemb'].set_value(rnnCv['model']['Wemb'])
        model['WIemb'].set_value(rnnCv['model']['WIemb_aux'])
        misc['wordtoix'] = rnnCv['wordtoix']
        misc['ixtoword'] = rnnCv['ixtoword']
        print("\n Initialized word embedding and image embeddings from gen model %s" % (
            params['init_from_imagernn']))

    write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
    use_dropout.set_value(1.)
#################### Main Loop ############################################ for it in xrange(max_iters): t0 = time.time() if params['use_encoder_for'] & 1: imgenc_use_dropout.set_value(float(params['use_dropout'])) # fetch a batch of data cost_inner = np.zeros((inner_loop, ), dtype=np.float32) if params['sim_minibatch'] > 0: for i_l in xrange(inner_loop): batch, pos_samp_sent = dp.sampPosNegSentSamps( params['batch_size'], params['mode'], thresh=0.3) eval_inp_list, lenS = prepare_data( batch, misc['wordtoix'], maxlen=params['maxlen'], pos_samp=pos_samp, prep_for=params['eval_model'], use_enc_for=params['use_encoder_for']) if params['fine_tune'] == 1: eval_inp_list.append(pos_samp_sent) cost_inner[i_l] = f_grad_accum(*eval_inp_list) else: batch, pos_samp_sent = dp.sampPosNegSentSamps(params['batch_size'], params['mode'], thresh=0.3) enc_inp_list = prepare_seq_features( batch, use_enc_for=params['use_encoder_for'], use_shared_mem=params['use_shared_mem_enc']) eval_inp_list, lenS = prepare_data( batch, misc['wordtoix'], maxlen=params['maxlen'], pos_samp=pos_samp, prep_for=params['eval_model'], use_enc_for=params['use_encoder_for']) if params['fine_tune'] == 1: eval_inp_list.append(pos_samp_sent) real_inp_list = enc_inp_list + eval_inp_list # Enable using dropout in training cost = f_grad_shared(*real_inp_list) f_update(params['learning_rate']) dt = time.time() - t0 # Reset accumulated gradients to 0 if params['sim_minibatch'] > 0: f_clr() #print 'model: ' + ' '.join([str(np.isnan(model[m].get_value()).any()) for m in model]) #print 'rg: ' +' '.join([str(np.isnan(rg[i].get_value()).any()) for i in xrange(len(rg))]) #print 'zg: ' + ' '.join([str(np.isnan(zg[i].get_value()).any()) for i in xrange(len(zg))]) #print 'ud: ' + ' '.join([str(np.isnan(ud[i].get_value()).any()) for i in xrange(len(ud))]) #import pdb; pdb.set_trace() #print 'udAft: ' + ' '.join([str(np.isnan(ud[i].get_value()).any()) for i in xrange(len(ud))]) # print training statistics epoch = it * inner_loop * 1.0 / num_iters_one_epoch total_cost = (np.e**(-cost[0]) + (np.e**(-cost_inner)).sum() * (params['sim_minibatch'] > 0)) / ( 1 + params['sim_minibatch']) #print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \ # % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'], \ # train_ppl2, smooth_train_cost) if it == 0: smooth_train_cost = total_cost else: smooth_train_cost = 0.99 * smooth_train_cost + 0.01 * total_cost error_rate += 100.0 * float((cost[2] < 0.).sum()) / batch_size margin_strength = cost[2].sum() smooth_error_rate = 0.99 * smooth_error_rate + 0.01 * 100.0 * ( float(cost[1]) / batch_size) if it > 0 else 100.0 * ( float(cost[1]) / batch_size) tnow = time.time() if tnow > last_status_write_time + 60 * 1: # every now and then lets write a report print '%d/%d batch done in %.3fs. at epoch %.2f. Prob now is %.4f, Error '\ 'rate is %.3f%%, Margin %.2f, negMarg=%.2f' % (it, max_iters, dt, \ epoch, smooth_train_cost, smooth_error_rate, margin_strength, error_rate/(it-prev_it)) error_rate = 0. 
prev_it = it last_status_write_time = tnow jstatus = {} jstatus['time'] = datetime.datetime.now().isoformat() jstatus['iter'] = (it, max_iters) jstatus['epoch'] = (epoch, max_epochs) jstatus['time_per_batch'] = dt jstatus['val_ppl2'] = val_ppl2 # just write the last available one json_worker_status['history'].append(jstatus) status_file = os.path.join( params['worker_status_output_directory'], host + '_status.json') #import pdb; pdb.set_trace() try: json.dump(json_worker_status, open(status_file, 'w')) except Exception, e: # todo be more clever here print 'tried to write worker status into %s but got error:' % ( status_file, ) print e ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good is_last_iter = (it + 1) == max_iters if (((it + 1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter: # Disable using dropout in validation use_dropout.set_value(0.) if params['use_encoder_for'] & 1: imgenc_use_dropout.set_value(0.) val_ppl2 = eval_split_theano( 'val', dp, model, params, misc, f_eval) # perform the evaluation on VAL set if epoch - params['lr_decay_st_epoch'] >= 0: params['learning_rate'] = params['learning_rate'] * params[ 'lr_decay'] params['lr_decay_st_epoch'] += 1 print 'validation perplexity = %f, lr = %f' % ( val_ppl2, params['learning_rate']) #if params['sample_by_len'] == 1: # print len_hist if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0: if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0: # if we beat a previous record or if this is the first time # AND we also beat the user-defined threshold or it doesnt exist top_val_ppl2 = val_ppl2 filename = '%s_checkpoint_%s_%s_%s_%.2f_%.2f.p' % ( params['eval_model'], params['dataset'], host, params['fappend'], smooth_error_rate, val_ppl2) filepath = os.path.join( params['checkpoint_output_directory'], filename) model_npy = unzip(model) rgrads_npy = unzip(rg) checkpoint = {} checkpoint['it'] = it checkpoint['epoch'] = epoch checkpoint['model'] = model_npy checkpoint['rgrads'] = rgrads_npy checkpoint['params'] = params checkpoint['perplexity'] = val_ppl2 checkpoint['wordtoix'] = misc['wordtoix'] checkpoint['ixtoword'] = misc['ixtoword'] try: pickle.dump(checkpoint, open(filepath, "wb")) print 'saved checkpoint in %s' % (filepath, ) except Exception, e: # todo be more clever here print 'tried to write checkpoint into %s but got error: ' % ( filepath, ) print e use_dropout.set_value(1.)
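# ---------------------------------------------------------------------------
# The training statistics above (smooth_train_cost, smooth_error_rate) are all
# maintained the same way: an exponential moving average seeded with the first
# observation. A minimal sketch of that bookkeeping, with illustrative values:
def ema(values, decay=0.99):
    smooth = None
    out = []
    for v in values:
        smooth = v if smooth is None else decay * smooth + (1. - decay) * v
        out.append(smooth)
    return out

# the first value seeds the average; later values pull it slowly toward them
print(ema([100., 10., 10., 10.]))  # [100.0, 99.1, 98.209, 97.32691]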
def main(params):
    word_count_threshold = params['word_count_threshold']
    max_epochs = params['max_epochs']
    host = socket.gethostname()  # get computer hostname

    # fetch the data provider
    dp = getDataProvider(params)
    # Initialize the optimizer
    solver = Solver(params['solver'])
    params['image_feat_size'] = dp.img_feat_size

    misc = {}  # stores various misc items that need to be passed around the framework

    # go over all training sentences and find the vocabulary we want to use,
    # i.e. the words that occur at least word_count_threshold number of times
    misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(
        dp.iterSentences('train'), word_count_threshold)

    params['use_dropout'] = 1
    if params['fine_tune'] == 1:
        params['mode'] = 'multimodal_lstm'
        if params['checkpoint_file_name'] != None:
            params['batch_size'] = dp.dataset['batchsize']
            misc['wordtoix'] = checkpoint_init['wordtoix']
            misc['ixtoword'] = checkpoint_init['ixtoword']
        batch_size = 1
        num_sentences_total = dp.getSplitSize('train', ofwhat='images')
    else:
        params['mode'] = 'batchtrain'
        batch_size = params['batch_size']
        num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')

    params['vocabulary_size'] = len(misc['wordtoix'])
    pos_samp = np.arange(batch_size, dtype=np.int32)

    # This initializes the model parameters and does matrix initializations
    evalModel = decodeEvaluator(params)
    model, misc['update'], misc['regularize'] = (evalModel.model_th,
                                                 evalModel.updateP,
                                                 evalModel.regularize)

    # Define the computational graph relating the input image features and word
    # indices to the log probability cost function.
    (use_dropout, inp_list, miscOuts, cost, predTh, model) = evalModel.build_model(model, params)

    # Add the regularization cost. Since this is specific to training and doesn't
    # get included when we evaluate the cost on test or validation data, we leave
    # it here outside the model definition
    if params['regc'] > 0.:
        reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']), name='reg_c')
        reg_cost = 0.
        for p in misc['regularize']:
            reg_cost += (model[p] ** 2).sum()
        reg_cost *= 0.5 * reg_c
        cost[0] += (reg_cost / params['batch_size'])

    # Compile an evaluation function. Doesn't include gradients;
    # to be used for validation set evaluation
    f_eval = theano.function(inp_list, cost, name='f_eval')

    # Now let's build a gradient computation graph and rmsprop update mechanism
    grads = tensor.grad(cost[0], wrt=model.values())
    lr = tensor.scalar(name='lr', dtype=config.floatX)
    if params['sim_minibatch'] > 0:
        f_grad_accum, f_clr, ag = solver.accumGrads(model, grads, inp_list, cost,
                                                    params['sim_minibatch'])
        f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(lr, model, ag,
                                                                        inp_list, cost, params)
    else:
        f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(lr, model, grads,
                                                                        inp_list, cost, params)

    print 'model init done.'
    print 'model has keys: ' + ', '.join(model.keys())

    # Calculate how many iterations we need. One epoch is one pass over all the
    # sentences (not images); for coco/flickr that is 5x the number of images.
    num_iters_one_epoch = num_sentences_total / batch_size
    max_iters = max_epochs * num_iters_one_epoch
    inner_loop = params['sim_minibatch'] if params['sim_minibatch'] > 0 else 1
    max_iters = max_iters / inner_loop
    eval_period_in_epochs = params['eval_period']
    eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs / inner_loop))
    top_val_ppl2 = -1
    smooth_train_cost = len(misc['ixtoword'])  # initially size of dictionary of confusion
    val_ppl2 = len(misc['ixtoword'])
    last_status_write_time = 0  # for writing worker job status reports
    json_worker_status = {}
    json_worker_status['params'] = params
    json_worker_status['history'] = []
    len_hist = defaultdict(int)

    ## Initialize the model parameters from the checkpoint file if we are resuming training
    if params['checkpoint_file_name'] != None:
        zipp(model_init_from, model)
        zipp(rg_init, rg)
        print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplexity at %0.3f\n" % (
            checkpoint_init['epoch'], checkpoint_init['perplexity']))
    elif params['init_from_imagernn'] != None:
        # Initialize word vecs and image emb from generative model file
        rnnCv = pickle.load(open(params['init_from_imagernn'], 'rb'))
        model['Wemb'].set_value(rnnCv['model']['Wemb'])
        model['WIemb'].set_value(rnnCv['model']['WIemb_aux'])
        misc['wordtoix'] = rnnCv['wordtoix']
        misc['ixtoword'] = rnnCv['ixtoword']
        print("\n Initialized word embedding and image embeddings from gen model %s" % (
            params['init_from_imagernn']))

    use_dropout.set_value(1.)

    #################### Main Loop ############################################
    for it in xrange(max_iters):
        t0 = time.time()
        # fetch a batch of data
        cost_inner = np.zeros((inner_loop,), dtype=np.float32)
        if params['sim_minibatch'] > 0:
            for i_l in xrange(inner_loop):
                batch, pos_samp_sent = dp.sampPosNegSentSamps(params['batch_size'],
                                                              params['mode'], thresh=0.3)
                real_inp_list, lenS = prepare_data(batch, misc['wordtoix'],
                                                   maxlen=params['maxlen'],
                                                   pos_samp=pos_samp,
                                                   prep_for=params['eval_model'])
                if params['fine_tune'] == 1:
                    real_inp_list.append(pos_samp_sent)
                cost_inner[i_l] = f_grad_accum(*real_inp_list)
        else:
            batch, pos_samp_sent = dp.sampPosNegSentSamps(params['batch_size'],
                                                          params['mode'], thresh=0.3)
            real_inp_list, lenS = prepare_data(batch, misc['wordtoix'],
                                               maxlen=params['maxlen'],
                                               pos_samp=pos_samp,
                                               prep_for=params['eval_model'])
            if params['fine_tune'] == 1:
                real_inp_list.append(pos_samp_sent)

        # Enable using dropout in training
        cost = f_grad_shared(*real_inp_list)
        f_update(params['learning_rate'])
        dt = time.time() - t0

        # Reset accumulated gradients to 0
        if params['sim_minibatch'] > 0:
            f_clr()
        #print 'model: ' + ' '.join([str(np.isnan(model[m].get_value()).any()) for m in model])
        #print 'rg: ' + ' '.join([str(np.isnan(rg[i].get_value()).any()) for i in xrange(len(rg))])
        #print 'zg: ' + ' '.join([str(np.isnan(zg[i].get_value()).any()) for i in xrange(len(zg))])
        #print 'ud: ' + ' '.join([str(np.isnan(ud[i].get_value()).any()) for i in xrange(len(ud))])
        #import pdb; pdb.set_trace()

        # print training statistics
        epoch = it * inner_loop * 1.0 / num_iters_one_epoch
        total_cost = (np.e**(-cost[0]) + (np.e**(-cost_inner)).sum() * (params['sim_minibatch'] > 0)) / (
            1 + params['sim_minibatch'])
        if it == 0:
            smooth_train_cost = total_cost
        else:
            smooth_train_cost = 0.99 * smooth_train_cost + 0.01 * total_cost

        tnow = time.time()
        if tnow > last_status_write_time + 60 * 1:  # every now and then lets write a report
            print '%d/%d batch done in %.3fs. at epoch %.2f. Prob now is %.3f' % (
                it, max_iters, dt, epoch, smooth_train_cost)
            last_status_write_time = tnow
            jstatus = {}
            jstatus['time'] = datetime.datetime.now().isoformat()
            jstatus['iter'] = (it, max_iters)
            jstatus['epoch'] = (epoch, max_epochs)
            jstatus['time_per_batch'] = dt
            jstatus['val_ppl2'] = val_ppl2  # just write the last available one
            json_worker_status['history'].append(jstatus)
            status_file = os.path.join(params['worker_status_output_directory'],
                                       host + '_status.json')
            try:
                json.dump(json_worker_status, open(status_file, 'w'))
            except Exception, e:  # todo be more clever here
                print 'tried to write worker status into %s but got error:' % (status_file, )
                print e

        ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good
        is_last_iter = (it + 1) == max_iters
        if (((it + 1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
            # Disable using dropout in validation
            use_dropout.set_value(0.)
            val_ppl2 = eval_split_theano('val', dp, model, params, misc, f_eval)  # evaluate on the VAL set
            if epoch - params['lr_decay_st_epoch'] >= 0:
                params['learning_rate'] = params['learning_rate'] * params['lr_decay']
                params['lr_decay_st_epoch'] += 1
            print 'validation perplexity = %f, lr = %f' % (val_ppl2, params['learning_rate'])
            if params['sample_by_len'] == 1:
                print len_hist

            write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
            # if we beat a previous record or if this is the first time,
            # AND we also beat the user-defined threshold or it doesn't exist
            if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
                if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
                    # top_val_ppl2 is not updated here, so every checkpoint beating
                    # the threshold gets saved
                    #top_val_ppl2 = val_ppl2
                    filename = '%s_checkpoint_%s_%s_%s_%.2f_%.2f.p' % (
                        params['eval_model'], params['dataset'], host,
                        params['fappend'], val_ppl2, smooth_train_cost)
                    filepath = os.path.join(params['checkpoint_output_directory'], filename)
                    model_npy = unzip(model)
                    rgrads_npy = unzip(rg)
                    checkpoint = {}
                    checkpoint['it'] = it
                    checkpoint['epoch'] = epoch
                    checkpoint['model'] = model_npy
                    checkpoint['rgrads'] = rgrads_npy
                    checkpoint['params'] = params
                    checkpoint['perplexity'] = val_ppl2
                    checkpoint['wordtoix'] = misc['wordtoix']
                    checkpoint['ixtoword'] = misc['ixtoword']
                    try:
                        pickle.dump(checkpoint, open(filepath, "wb"))
                        print 'saved checkpoint in %s' % (filepath, )
                    except Exception, e:  # todo be more clever here
                        print 'tried to write checkpoint into %s but got error: ' % (filepath, )
                        print e
            use_dropout.set_value(1.)
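# ---------------------------------------------------------------------------
# Both training loops above persist the same checkpoint layout via pickle. A
# sketch of that round trip with illustrative values; in the original, unzip()
# first converts the theano shared variables to plain numpy arrays:
import os
import pickle
import tempfile

checkpoint = {
    'it': 999,
    'epoch': 1.25,
    'model': {'Wemb': 'numpy-arrays-here'},  # unzip(model) in the original
    'rgrads': {},                            # unzip(rg): rmsprop accumulators
    'params': {'dataset': 'coco'},
    'perplexity': 18.5,
    'wordtoix': {'a': 1},
    'ixtoword': {1: 'a'},
}
path = os.path.join(tempfile.gettempdir(), 'example_checkpoint.p')
pickle.dump(checkpoint, open(path, 'wb'))
restored = pickle.load(open(path, 'rb'))
print(restored['perplexity'])  # 18.5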
def main(params):
    batch_size = params['batch_size']
    word_count_threshold = params['word_count_threshold']
    max_epochs = params['max_epochs']
    host = socket.gethostname()  # get computer hostname

    # fetch the data provider
    dp = getDataProvider(params)
    params['aux_inp_size'] = dp.aux_inp_size
    params['image_feat_size'] = dp.img_feat_size
    print 'Image feature size is %d, and aux input size is %d' % (params['image_feat_size'],
                                                                  params['aux_inp_size'])

    misc = {}  # stores various misc items that need to be passed around the framework

    # go over all training sentences and find the vocabulary we want to use,
    # i.e. the words that occur at least word_count_threshold number of times
    misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(
        dp.iterSentences('train'), word_count_threshold)
    params['vocabulary_size'] = len(misc['wordtoix'])
    params['output_size'] = len(misc['ixtoword'])  # these should match though
    params['use_dropout'] = 1

    # This initializes the model parameters and does matrix initializations
    lstmGenerator = LSTMGenerator(params)
    model, misc['update'], misc['regularize'] = (lstmGenerator.model_th,
                                                 lstmGenerator.update,
                                                 lstmGenerator.regularize)

    # Force overwrite here: the bias to the softmax is initialized to reflect word frequencies.
    # This is a bit of a hack, not happy about it
    model['bd'].set_value(bias_init_vector.astype(config.floatX))

    # Define the computational graph relating the input image features and word
    # indices to the log probability cost function.
    (use_dropout, inp_list, f_pred_prob, cost, predTh,
     updatesLSTM) = lstmGenerator.build_model(model, params)

    # Add the regularization cost. Since this is specific to training and doesn't
    # get included when we evaluate the cost on test or validation data, we leave
    # it here outside the model definition
    if params['regc'] > 0.:
        reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']), name='reg_c')
        reg_cost = 0.
        for p in misc['regularize']:
            reg_cost += (model[p] ** 2).sum()
        reg_cost *= 0.5 * reg_c
        cost[0] += (reg_cost / params['batch_size'])

    # Compile an evaluation function. Doesn't include gradients;
    # to be used for validation set evaluation
    f_eval = theano.function(inp_list, cost, name='f_eval')

    # Now let's build a gradient computation graph and rmsprop update mechanism
    grads = tensor.grad(cost[0], wrt=model.values())
    lr = tensor.scalar(name='lr', dtype=config.floatX)
    f_grad_shared, f_update, zg, rg, ud = lstmGenerator.rmsprop(lr, model, grads,
                                                                inp_list, cost, params)

    print 'model init done.'
print 'model has keys: ' + ', '.join(model.keys()) #print 'updating: ' + ', '.join( '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['update']) #print 'updating: ' + ', '.join( '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['regularize']) #print 'number of learnable parameters total: %d' % (sum(model[k].shape[0] * model[k].shape[1] for k in misc['update']), ) # calculate how many iterations we need, One epoch is considered once going through all the sentences and not images # Hence in case of coco/flickr this will 5* no of images num_sentences_total = dp.getSplitSize('train', ofwhat = 'sentences') num_iters_one_epoch = num_sentences_total / batch_size max_iters = max_epochs * num_iters_one_epoch eval_period_in_epochs = params['eval_period'] eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs)) top_val_ppl2 = -1 smooth_train_ppl2 = len(misc['ixtoword']) # initially size of dictionary of confusion val_ppl2 = len(misc['ixtoword']) last_status_write_time = 0 # for writing worker job status reports json_worker_status = {} json_worker_status['params'] = params json_worker_status['history'] = [] len_hist = defaultdict(int) ## Initialize the model parameters from the checkpoint file if we are resuming training if params['checkpoint_file_name'] != 'None': zipp(model_init_from,model) zipp(rg_init,rg) print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" % (checkpoint_init['epoch'], \ checkpoint_init['perplexity'])) for it in xrange(max_iters): t0 = time.time() # fetch a batch of data if params['sample_by_len'] == 0: batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)] else: batch,l = dp.getRandBatchByLen(batch_size) len_hist[l] += 1 if params['use_pos_tag'] != 'None': real_inp_list, lenS = prepare_data(batch,misc['wordtoix'],None,sentTagMap,misc['ixtoword']) else: real_inp_list, lenS = prepare_data(batch,misc['wordtoix']) # Enable using dropout in training use_dropout.set_value(1.) # evaluate cost, gradient and perform parameter update cost = f_grad_shared(*real_inp_list) f_update(params['learning_rate']) dt = time.time() - t0 # print training statistics train_ppl2 = (2**(cost[1]/lenS)) #step_struct['stats']['ppl2'] smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2 # smooth exponentially decaying moving average if it == 0: smooth_train_ppl2 = train_ppl2 # start out where we start out epoch = it * 1.0 / num_iters_one_epoch total_cost = cost[0] #print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \ # % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'], \ # train_ppl2, smooth_train_ppl2) tnow = time.time() if tnow > last_status_write_time + 60*1: # every now and then lets write a report print '%d/%d batch done in %.3fs. at epoch %.2f. 
Cost now is %.3f and pplx is %.3f' % (it, max_iters, dt, \ epoch, total_cost, smooth_train_ppl2) last_status_write_time = tnow jstatus = {} jstatus['time'] = datetime.datetime.now().isoformat() jstatus['iter'] = (it, max_iters) jstatus['epoch'] = (epoch, max_epochs) jstatus['time_per_batch'] = dt jstatus['smooth_train_ppl2'] = smooth_train_ppl2 jstatus['val_ppl2'] = val_ppl2 # just write the last available one jstatus['train_ppl2'] = train_ppl2 json_worker_status['history'].append(jstatus) status_file = os.path.join(params['worker_status_output_directory'], host + '_status.json') #import pdb; pdb.set_trace() try: json.dump(json_worker_status, open(status_file, 'w')) except Exception, e: # todo be more clever here print 'tried to write worker status into %s but got error:' % (status_file, ) print e ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good is_last_iter = (it+1) == max_iters if (((it+1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter: # Disable using dropout in validation use_dropout.set_value(0.) val_ppl2 = eval_split_theano('val', dp, model, params, misc,f_eval) # perform the evaluation on VAL set if epoch - params['lr_decay_st_epoch'] >= 0: params['learning_rate'] = params['learning_rate'] * params['lr_decay'] params['lr_decay_st_epoch'] += 1 print 'validation perplexity = %f, lr = %f' % (val_ppl2, params['learning_rate']) if params['sample_by_len'] == 1: print len_hist write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold'] if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0: if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0: # if we beat a previous record or if this is the first time # AND we also beat the user-defined threshold or it doesnt exist top_val_ppl2 = val_ppl2 filename = 'model_checkpoint_%s_%s_%s_%.2f.p' % (params['dataset'], host, params['fappend'], val_ppl2) filepath = os.path.join(params['checkpoint_output_directory'], filename) model_npy = unzip(model) rgrads_npy = unzip(rg) checkpoint = {} checkpoint['it'] = it checkpoint['epoch'] = epoch checkpoint['model'] = model_npy checkpoint['rgrads'] = rgrads_npy checkpoint['params'] = params checkpoint['perplexity'] = val_ppl2 checkpoint['wordtoix'] = misc['wordtoix'] checkpoint['ixtoword'] = misc['ixtoword'] try: pickle.dump(checkpoint, open(filepath, "wb")) print 'saved checkpoint in %s' % (filepath, ) except Exception, e: # todo be more clever here print 'tried to write checkpoint into %s but got error: ' % (filepath, ) print e
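# ---------------------------------------------------------------------------
# The running perplexity above is train_ppl2 = 2 ** (cost[1] / lenS), i.e. 2 to
# the average per-word log-loss. A worked sketch, assuming cost[1] is the
# summed negative log2-probability of the batch and lenS the word count:
def ppl2(total_neg_log2_prob, n_words):
    return 2. ** (total_neg_log2_prob / n_words)

# e.g. 350 bits spread over 100 words gives 2**3.5, roughly 11.3
print('%.1f' % ppl2(350., 100))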
def main(params): # load the checkpoint if params['multi_model'] == 0: checkpoint_path = params['checkpoint_path'] print 'loading checkpoint %s' % (checkpoint_path, ) checkpoint = pickle.load(open(checkpoint_path, 'rb')) checkpoint_params = checkpoint['params'] model_npy = checkpoint['model'] checkpoint_params['use_theano'] = 1 if 'image_feat_size' not in checkpoint_params: checkpoint_params['image_feat_size'] = 4096 if 'misc' in checkpoint: misc = checkpoint['misc'] ixtoword = misc['ixtoword'] else: misc = {} ixtoword = checkpoint['ixtoword'] misc['wordtoix'] = checkpoint['wordtoix'] checkpoint_params['softmax_smooth_factor'] = params['softmax_smooth_factor'] checkpoint_params['softmax_propogate'] = params['softmax_propogate'] if checkpoint_params.get('class_out_factoring',0) == 1: checkpoint_params['ixtoclsinfo'] = np.zeros((checkpoint_params['nClasses'],2),dtype=np.int32) ixtoclsinfo = misc['ixtoclsinfo'] checkpoint_params['ixtoclsinfo'][ixtoclsinfo[:,0]] = ixtoclsinfo[:,1:3] if checkpoint_params.get('sched_sampling_mode',None) !=None: checkpoint_params['sched_sampling_mode'] = None BatchGenerator = decodeGenerator(checkpoint_params) # Compile and init the theano predictor BatchGenerator.prepPredictor(model_npy, checkpoint_params, params['beam_size']) model = BatchGenerator.model_th else: BatchGenerator = [] model_npy = [] modelTh = [] checkpoint_params = [] for i,checkpoint_path in enumerate(params['checkpoint_path']): checkpoint = pickle.load(open(checkpoint_path, 'rb')) model_npy.append(checkpoint['model']) checkpoint_params.append(checkpoint['params']) checkpoint_params[i]['use_theano'] = 1 BatchGenerator.append(decodeGenerator(checkpoint_params[i])) zipp(model_npy[i],BatchGenerator[i].model_th) modelTh.append(BatchGenerator[i].model_th) modelTh[i]['comb_weight'] = 1.0/params['nmodels'] BatchGenerator[0].prepMultiPredictor(modelTh,checkpoint_params,params['beam_size'],params['nmodels']) # output blob which we will dump to JSON for visualizing the results blob = {} blob['params'] = params blob['checkpoint_params'] = copy(checkpoint_params) if checkpoint_params.get('class_out_factoring',0) == 1: blob['checkpoint_params'].pop('ixtoclsinfo') blob['imgblobs'] = [] # load the tasks.txt file and setupe feature loading root_path = params['root_path'] img_names_list = open(params['imgList'], 'r').read().splitlines() auxidxes = [] if len(img_names_list[0].rsplit(',')) > 2: img_names = [x.rsplit(',')[0] for x in img_names_list] idxes = [int(x.rsplit(',')[1]) for x in img_names_list] auxidxes = [int(x.rsplit(',')[2]) for x in img_names_list] elif len(img_names_list[0].rsplit(',')) > 1: img_names = [x.rsplit(',')[0] for x in img_names_list] idxes = [int(x.rsplit(',')[1]) for x in img_names_list] else: img_names = img_names_list idxes = xrange(len(img_names_list)) #if checkpoint_params.get('en_aux_inp',0) and (params.get('aux_inp_file','None') == 'None'): # raise ValueError('ERROR: please specify auxillary input feature using --aux_inp_file') # return # load the features for all images if checkpoint_params.get('swap_aux') == 0 or auxidxes == []: features, aux_inp = loadArbitraryFeatures(params, idxes, auxidxes=auxidxes) else: features, aux_inp = loadArbitraryFeatures(params, auxidxes, auxidxes=idxes) N = len(img_names) # iterate over all images and predict sentences print("\nUsing model run for %0.2f epochs with validation perplx at %0.3f\n" % (checkpoint['epoch'], \ checkpoint['perplexity'])) kwparams = { 'beam_size' : params['beam_size'] } jsonFname = 'result_struct_%s.json' % 
(params['fname_append'] ) save_file = os.path.join(root_path, jsonFname) for n in xrange(N): print 'image %d/%d:' % (n, N) # encode the image if params['multi_model'] == 0: D,NN = features.shape img = {} img['feat'] = features[:, n] if checkpoint_params.get('en_aux_inp',0): img['aux_inp'] = aux_inp[:, n] img['local_file_path'] =img_names[n] # perform the work. heavy lifting happens inside Ys = BatchGenerator.predict([{'image':img}], model, checkpoint_params, **kwparams) else: kwparams['nmodels'] = params['nmodels'] batch = [] for i in xrange(params['nmodels']): img = {} img['feat'] = features[i][:, n] if checkpoint_params[i].get('en_aux_inp',0): img['aux_inp'] = aux_inp[i][:, n] img['local_file_path'] =img_names[n] batch.append({'image':img}) Ys = BatchGenerator[0].predictMulti(batch, checkpoint_params, **kwparams) # build up the output img_blob = {} img_blob['img_path'] = img['local_file_path'] # encode the top prediction top_predictions = Ys[0] # take predictions for the first (and only) image we passed in top_prediction = top_predictions[0] # these are sorted with highest on top if checkpoint_params.get('reverse_sentence',0) == 0: candidate = ' '.join([ixtoword[int(ix)] for ix in top_prediction[1] if ix > 0]) # ix 0 is the END token, skip that else: candidate = ' '.join([ixtoword[int(ix)] for ix in reversed(top_prediction[1]) if ix > 0]) # ix 0 is the END token, skip that print 'PRED: (%f) %s' % (float(top_prediction[0]), candidate) img_blob['candidate'] = {'text': candidate, 'logprob': float(top_prediction[0])} # Code to save all the other candidates candlist = [] for ci in xrange(len(top_predictions)-1): prediction = top_predictions[ci+1] # these are sorted with highest on top candidate = ' '.join([ixtoword[int(ix)] for ix in prediction[1] if ix > 0]) # ix 0 is the END token, skip that candlist.append({'text': candidate, 'logprob': float(prediction[0])}) img_blob['candidatelist'] = candlist blob['imgblobs'].append(img_blob) if (n%5000) == 1: print 'writing predictions to %s...' % (save_file, ) json.dump(blob, open(save_file, 'w')) # dump result struct to file print 'writing predictions to %s...' % (save_file, ) json.dump(blob, open(save_file, 'w')) # dump output html html = '' for img in blob['imgblobs']: html += '<img src="%s" height="400"><br>' % (img['img_path'], ) html += '(%f) %s <br><br>' % (img['candidate']['logprob'], img['candidate']['text']) html_file = 'result_%s.html' % (params['fname_append']) html_file = os.path.join(root_path, html_file) print 'writing html result file to %s...' % (html_file, ) open(html_file, 'w').write(html)
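# ---------------------------------------------------------------------------
# In the multi-model branch above every model is assigned
# comb_weight = 1.0 / nmodels. A sketch of one way such weights can combine
# per-model next-word distributions (an assumption about what the fused
# predictor does internally, not code from the repo):
import numpy as np

def combine_word_probs(prob_list, weights=None):
    prob_arr = np.asarray(prob_list)  # shape (nmodels, vocab_size)
    if weights is None:
        weights = np.ones(len(prob_list)) / len(prob_list)
    return np.dot(weights, prob_arr)  # shape (vocab_size,)

p1 = np.array([0.7, 0.2, 0.1])
p2 = np.array([0.3, 0.4, 0.3])
print(combine_word_probs([p1, p2]))  # [ 0.5  0.3  0.2]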
def main(params): batch_size = params['batch_size'] word_count_threshold = params['word_count_threshold'] max_epochs = params['max_epochs'] host = socket.gethostname() # get computer hostname #--------------------------------- Init data provider and load data+features #---------------------------------# # fetch the data provider dp = getDataProvider(params) params['aux_inp_size'] = params['featenc_hidden_size'] * params[ 'n_encgt_sent'] if params['encode_gt_sentences'] else dp.aux_inp_size params['featenc_hidden_size'] = params['featenc_hidden_size'] if params[ 'encode_gt_sentences'] else params['aux_inp_size'] params['image_feat_size'] = dp.img_feat_size print 'Image feature size is %d, and aux input size is %d' % ( params['image_feat_size'], params['aux_inp_size']) #--------------------------------- Preprocess sentences and build Vocabulary #---------------------------------# misc = { } # stores various misc items that need to be passed around the framework # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur # at least word_count_threshold number of times if params['checkpoint_file_name'] == 'None': if params['class_out_factoring'] == 0: misc['wordtoix'], misc[ 'ixtoword'], bias_init_vector = preProBuildWordVocab( dp.iterSentences('train'), word_count_threshold) else: [misc['wordtoix'], misc['classes'] ], [misc['ixtoword'], misc['clstotree'], misc['ixtoclsinfo'] ], [bias_init_vector, bias_init_inter_class ] = preProBuildWordVocab(dp.iterSentences('train'), word_count_threshold, params) params['nClasses'] = bias_init_inter_class.shape[0] params['ixtoclsinfo'] = misc['ixtoclsinfo'] else: misc = checkpoint_init['misc'] params['nClasses'] = checkpoint_init['params']['nClasses'] if 'ixtoclsinfo' in misc: params['ixtoclsinfo'] = misc['ixtoclsinfo'] params['vocabulary_size'] = len(misc['wordtoix']) params['output_size'] = len(misc['ixtoword']) # these should match though print len(misc['wordtoix']), len(misc['ixtoword']) #------------------------------ Initialize the solver/generator and build forward path #-----------------------# # Initialize the optimizer solver = Solver(params['solver']) # This initializes the model parameters and does matrix initializations lstmGenerator = decodeGenerator(params) model, misc['update'], misc['regularize'] = (lstmGenerator.model_th, lstmGenerator.update_list, lstmGenerator.regularize) # force overwrite here. The bias to the softmax is initialized to reflect word frequencies # This is a bit of a hack if params['checkpoint_file_name'] == 'None': model['bd'].set_value(bias_init_vector.astype(config.floatX)) if params['class_out_factoring'] == 1: model['bdCls'].set_value( bias_init_inter_class.astype(config.floatX)) #----------------- If we are using feature encoders ----------------------- # This mode can now also be used for encoding GT sentences. 
if params['use_encoder_for'] & 1: if params['encode_gt_sentences']: xI = tensor.zeros((batch_size, params['image_encoding_size'])) imgFeatEnc_inp = [] else: imgFeatEncoder = RecurrentFeatEncoder(params['image_feat_size'], params['word_encoding_size'], params, mdl_prefix='img_enc_', features=dp.features.T) mdlLen = len(model.keys()) model.update(imgFeatEncoder.model_th) assert (len(model.keys()) == (mdlLen + len(imgFeatEncoder.model_th.keys()))) misc['update'].extend(imgFeatEncoder.update_list) misc['regularize'].extend(imgFeatEncoder.regularize) (imgenc_use_dropout, imgFeatEnc_inp, xI, updatesLSTMImgFeat) = imgFeatEncoder.build_model(model, params) else: xI = None imgFeatEnc_inp = [] if params['use_encoder_for'] & 2: aux_enc_inp = model['Wemb'] if params[ 'encode_gt_sentences'] else dp.aux_inputs.T hid_size = params['featenc_hidden_size'] auxFeatEncoder = RecurrentFeatEncoder(hid_size, params['image_encoding_size'], params, mdl_prefix='aux_enc_', features=aux_enc_inp) mdlLen = len(model.keys()) model.update(auxFeatEncoder.model_th) assert (len(model.keys()) == (mdlLen + len(auxFeatEncoder.model_th.keys()))) misc['update'].extend(auxFeatEncoder.update_list) misc['regularize'].extend(auxFeatEncoder.regularize) (auxenc_use_dropout, auxFeatEnc_inp, xAux, updatesLSTMAuxFeat) = auxFeatEncoder.build_model(model, params) if params['encode_gt_sentences']: # Reshape it size(batch_size, n_gt, hidden_size) xAux = xAux.reshape( (-1, params['n_encgt_sent'], params['featenc_hidden_size'])) # Convert it to size (batch_size, n_gt*hidden_size xAux = xAux.flatten(2) else: auxFeatEnc_inp = [] xAux = None #--------------------------------- Initialize the Attention Network #-------------------------------# if params['use_attn'] != None: attnModel = AttentionNetwork(params['image_feat_size'], params['hidden_size'], params, mdl_prefix='attn_mlp_') mdlLen = len(model.keys()) model.update(attnModel.model_th) assert (len(model.keys()) == (mdlLen + len(attnModel.model_th.keys()))) misc['update'].extend(attnModel.update_list) misc['regularize'].extend(attnModel.regularize) attn_nw_func = attnModel.build_model else: attn_nw_func = None #--------------------------------- Build the language model graph #---------------------------------# # Define the computational graph for relating the input image features and word indices to the # log probability cost funtion. (use_dropout, inp_list_gen, f_pred_prob, cost, predTh, updatesLSTM) = lstmGenerator.build_model(model, params, xI, xAux, attn_nw=attn_nw_func) inp_list = imgFeatEnc_inp + auxFeatEnc_inp + inp_list_gen #--------------------------------- Cost function and gradient computations setup #---------------------------------# costGrad = cost[0] # Add class uncertainity to final cost #if params['class_out_factoring'] == 1: # costGrad += cost[2] # Add the regularization cost. Since this is specific to trainig and doesn't get included when we # evaluate the cost on test or validation data, we leave it here outside the model definition if params['regc'] > 0.: reg_cost = theano.shared(numpy_floatX(0.), name='reg_c') reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']), name='reg_c') reg_cost = 0. for p in misc['regularize']: reg_cost += (model[p]**2).sum() reg_cost *= 0.5 * reg_c costGrad += (reg_cost / params['batch_size']) # Compile an evaluation function.. 
Doesn't include gradients # To be used for validation set evaluation f_eval = theano.function(inp_list, cost, name='f_eval') # Now let's build a gradient computation graph and rmsprop update mechanism grads = tensor.grad(costGrad, wrt=model.values()) lr = tensor.scalar(name='lr', dtype=config.floatX) f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model( lr, model, grads, inp_list, cost, params) print 'model init done.' print 'model has keys: ' + ', '.join(model.keys()) #print 'updating: ' + ', '.join( '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['update']) #print 'updating: ' + ', '.join( '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['regularize']) #print 'number of learnable parameters total: %d' % (sum(model[k].shape[0] * model[k].shape[1] for k in misc['update']), ) #-------------------------------- Intialize the prediction path if needed by evaluator ----------------------------# evalKwargs = { 'eval_metric': params['eval_metric'], 'f_gen': lstmGenerator.predict, 'beamsize': params['eval_beamsize'] } if params['eval_metric'] != 'perplex': lstmGenerator.prepPredictor(None, params, params['eval_beamsize']) refToks, scr_info = eval_prep_refs('val', dp, params['eval_metric']) evalKwargs['refToks'] = refToks evalKwargs['scr_info'] = scr_info valMetOp = operator.gt else: valMetOp = operator.lt if params['met_to_track'] != []: trackMetargs = { 'eval_metric': params['met_to_track'], 'f_gen': lstmGenerator.predict, 'beamsize': params['eval_beamsize'] } lstmGenerator.prepPredictor(None, params, params['eval_beamsize']) refToks, scr_info = eval_prep_refs('val', dp, params['met_to_track']) trackMetargs['refToks'] = refToks trackMetargs['scr_info'] = scr_info #--------------------------------- Iterations and Logging intializations ------------------------------------------# # calculate how many iterations we need, One epoch is considered once going through all the sentences and not images # Hence in case of coco/flickr this will 5* no of images num_sentences_total = dp.getSplitSize('train', ofwhat='sentences') num_iters_one_epoch = num_sentences_total / batch_size max_iters = max_epochs * num_iters_one_epoch eval_period_in_epochs = params['eval_period'] eval_period_in_iters = max( 1, int(num_iters_one_epoch * eval_period_in_epochs)) top_val_sc = -1 smooth_train_ppl2 = len( misc['ixtoword']) # initially size of dictionary of confusion val_sc = len(misc['ixtoword']) last_status_write_time = 0 # for writing worker job status reports json_worker_status = {} #json_worker_status['params'] = params json_worker_status['history'] = [] len_hist = defaultdict(int) #Initialize Tracking the perplexity of train and val, with iters. train_perplex = [] val_perplex = [] trackSc_array = [] #-------------------------------------- Load previously saved model ------------------------------------------------# #- Initialize the model parameters from the checkpoint file if we are resuming training if params['checkpoint_file_name'] != 'None': zipp(model_init_from, model) if params['restore_grads'] == 1: zipp(rg_init, rg) #Copy trackers from previous checkpoint if 'trackers' in checkpoint_init: train_perplex = checkpoint_init['trackers']['train_perplex'] val_perplex = checkpoint_init['trackers']['val_perplex'] trackSc_array = checkpoint_init['trackers'].get('trackScores', []) print( """\nContinuing training from previous model\n. 
    #-------------------------------------- Load previously saved model ----------------------------------------#
    # Initialize the model parameters from the checkpoint file if we are resuming training.
    if params['checkpoint_file_name'] != 'None':
        zipp(model_init_from, model)
        if params['restore_grads'] == 1:
            zipp(rg_init, rg)
        # Copy trackers from the previous checkpoint
        if 'trackers' in checkpoint_init:
            train_perplex = checkpoint_init['trackers']['train_perplex']
            val_perplex = checkpoint_init['trackers']['val_perplex']
            trackSc_array = checkpoint_init['trackers'].get('trackScores', [])
        print("""\nContinuing training from previous model.
Already run for %0.2f epochs with validation perplexity at %0.3f\n""" %
              (checkpoint_init['epoch'], checkpoint_init['perplexity']))

    #-------------------------------------- MAIN LOOP ----------------------------------------------------------#
    for it in xrange(max_iters):
        t0 = time.time()
        # Enable dropout during training
        use_dropout.set_value(float(params['use_dropout']))
        if params['use_encoder_for'] & 1:
            imgenc_use_dropout.set_value(float(params['use_dropout']))
        if params['use_encoder_for'] & 2:
            auxenc_use_dropout.set_value(float(params['use_dropout']))
        epoch = it * 1.0 / num_iters_one_epoch

        #-------------------------------------- Prepare batch --------------------------------------------------#
        # fetch a batch of data
        if params['sample_by_len'] == 0:
            batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)]
        else:
            batch, l = dp.getRandBatchByLen(batch_size)
            len_hist[l] += 1

        enc_inp_list = prepare_seq_features(batch,
                                            use_enc_for=params['use_encoder_for'],
                                            maxlen=params['maxlen'],
                                            use_shared_mem=params['use_shared_mem_enc'],
                                            enc_gt_sent=params['encode_gt_sentences'],
                                            n_enc_sent=params['n_encgt_sent'],
                                            wordtoix=misc['wordtoix'])

        if params['use_pos_tag'] != 'None':
            gen_inp_list, lenS = prepare_data(batch,
                                              misc['wordtoix'],
                                              params['maxlen'],
                                              sentTagMap,
                                              misc['ixtoword'],
                                              rev_sents=params['reverse_sentence'],
                                              use_enc_for=params['use_encoder_for'],
                                              use_unk_token=params['use_unk_token'])
        else:
            gen_inp_list, lenS = prepare_data(batch,
                                              misc['wordtoix'],
                                              params['maxlen'],
                                              rev_sents=params['reverse_sentence'],
                                              use_enc_for=params['use_encoder_for'],
                                              use_unk_token=params['use_unk_token'])

        if params['sched_sampling_mode'] is not None:
            gen_inp_list.append(epoch)

        real_inp_list = enc_inp_list + gen_inp_list

        #import ipdb; ipdb.set_trace()

        #---------------------------------- Compute cost and apply gradients -----------------------------------#
        # evaluate cost, gradient and perform parameter update
        cost = f_grad_shared(*real_inp_list)
        f_update(params['learning_rate'])
        dt = time.time() - t0

        # print training statistics
        train_ppl2 = (2**(cost[1] / lenS))  #step_struct['stats']['ppl2']
        # smooth exponentially decaying moving average
        smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2
        if it == 0:
            smooth_train_ppl2 = train_ppl2  # start out where we start out

        total_cost = cost[0]
        if it == 0:
            smooth_cost = total_cost  # start out where we start out
        smooth_cost = 0.99 * smooth_cost + 0.01 * total_cost
        #print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \
        #    % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'], train_ppl2, smooth_train_ppl2)
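        # Note: the two running statistics above are exponential moving averages,
        #     s_t = 0.99 * s_{t-1} + 0.01 * x_t,
        # i.e. a low-pass filter with an effective window of roughly 1 / 0.01 = 100
        # batches; the it == 0 branches seed the average with the first observed value.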
        #---------------------------------- Write a report into a json file ------------------------------------#
        tnow = time.time()
        if tnow > last_status_write_time + 60 * 1:  # every now and then let's write a report
            print '%d/%d batch done in %.3fs. at epoch %.2f. Cost now is %.3f and pplx is %.3f' \
                % (it, max_iters, dt, epoch, smooth_cost, smooth_train_ppl2)
            last_status_write_time = tnow
            jstatus = {}
            jstatus['time'] = datetime.datetime.now().isoformat()
            jstatus['iter'] = (it, max_iters)
            jstatus['epoch'] = (epoch, max_epochs)
            jstatus['time_per_batch'] = dt
            jstatus['smooth_train_ppl2'] = smooth_train_ppl2
            jstatus['val_sc'] = val_sc  # just write the last available one
            jstatus['val_metric'] = params['eval_metric']  # just write the last available one
            jstatus['train_ppl2'] = train_ppl2
            #if params['class_out_factoring'] == 1:
            #    jstatus['class_cost'] = float(cost[2])
            json_worker_status['history'].append(jstatus)
            status_file = os.path.join(params['worker_status_output_directory'],
                                       host + '_status.json')
            #import pdb; pdb.set_trace()
            try:
                json.dump(json_worker_status, open(status_file, 'w'))
            except Exception, e:  # todo: be more clever here
                print 'tried to write worker status into %s but got error:' % (status_file, )
                print e

        #--------------------------------- VALIDATION -----------------------------------------------------------#
        # Perform perplexity evaluation on the validation set and save a model checkpoint if it's good.
        is_last_iter = (it + 1) == max_iters
        if (((it + 1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
            # Disable dropout during validation
            use_dropout.set_value(0.)
            if params['use_encoder_for'] & 1:
                imgenc_use_dropout.set_value(0.)
            if params['use_encoder_for'] & 2:
                auxenc_use_dropout.set_value(0.)

            # perform the evaluation on the VAL set
            val_sc = eval_split_theano('val', dp, model, params, misc, f_eval, **evalKwargs)
            val_sc = val_sc[0]
            val_perplex.append((it, val_sc))
            train_perplex.append((it, smooth_train_ppl2))

            if params['met_to_track'] != []:
                track_sc = eval_split_theano('val', dp, model, params, misc, f_eval, **trackMetargs)
                trackSc_array.append((it, {evm: track_sc[i]
                                           for i, evm in enumerate(params['met_to_track'])}))

            if epoch - params['lr_decay_st_epoch'] >= 0:
                params['learning_rate'] = params['learning_rate'] * params['lr_decay']
                params['lr_decay_st_epoch'] += 1

            print 'validation %s = %f, lr = %f' % (params['eval_metric'], val_sc,
                                                   params['learning_rate'])
            #if params['sample_by_len'] == 1:
            #    print len_hist

            #----------------------------- SAVE THE MODEL -----------------------------------------------------#
            write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
            if valMetOp(val_sc, top_val_sc) or top_val_sc < 0:
                if valMetOp(val_sc, write_checkpoint_ppl_threshold) or write_checkpoint_ppl_threshold < 0:
                    # if we beat a previous record or if this is the first time,
                    # AND we also beat the user-defined threshold or it doesn't exist
                    top_val_sc = val_sc
                    filename = 'model_checkpoint_%s_%s_%s_%s%.2f.p' % (
                        params['dataset'], host, params['fappend'],
                        params['eval_metric'][:3], val_sc)
                    filepath = os.path.join(params['checkpoint_output_directory'], filename)
                    model_npy = unzip(model)
                    rgrads_npy = unzip(rg)
                    checkpoint = {}
                    checkpoint['it'] = it
                    checkpoint['epoch'] = epoch
                    checkpoint['model'] = model_npy
                    checkpoint['rgrads'] = rgrads_npy
                    checkpoint['params'] = params
                    checkpoint['perplexity'] = val_sc
                    checkpoint['misc'] = misc
                    checkpoint['trackers'] = {'train_perplex': train_perplex,
                                              'val_perplex': val_perplex,
                                              'trackScores': trackSc_array}
                    try:
                        pickle.dump(checkpoint, open(filepath, "wb"))
                        print 'saved checkpoint in %s' % (filepath, )
                    except Exception, e:  # todo: be more clever here
                        print 'tried to write checkpoint into %s but got error: ' % (filepath, )
                        print e
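    # Note on the saved checkpoint: the filename embeds the dataset, host, fappend tag,
    # the first three letters of the eval metric and the score, e.g. (hypothetical values)
    #     model_checkpoint_coco_myhost_run1_cid0.95.p
    # The pickled dict carries everything needed to resume: iteration/epoch counters,
    # numpy copies of the parameters and rmsprop gradient cache ('model' and 'rgrads',
    # via unzip), the full 'params' and 'misc' dicts, and the perplexity/score trackers.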