def main(params):
  batch_size = params['batch_size']
  dataset = params['dataset']
  word_count_threshold = params['word_count_threshold']
  do_grad_check = params['do_grad_check']
  max_epochs = params['max_epochs']
  host = socket.gethostname() # get computer hostname
  params['mode'] = 'CPU'

  # fetch the data provider
  dp = getDataProvider(dataset)

  misc = {} # stores various misc items that need to be passed around the framework

  # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur
  # at least word_count_threshold number of times
  misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(dp.iterSentences('train'), word_count_threshold)

  # delegate the initialization of the model to the Generator class
  BatchGenerator = decodeGenerator(params)
  init_struct = BatchGenerator.init(params, misc)
  model, misc['update'], misc['regularize'] = (init_struct['model'], init_struct['update'], init_struct['regularize'])

  if params['mode'] == 'GPU':
    # force overwrite here. This is a bit of a hack, not happy about it
    model['bd'] = gp.garray(bias_init_vector.reshape(1, bias_init_vector.size))
  else:
    model['bd'] = bias_init_vector.reshape(1, bias_init_vector.size)

  print 'model init done.'
  print 'model has keys: ' + ', '.join(model.keys())
  print 'updating: ' + ', '.join('%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['update'])
  print 'regularizing: ' + ', '.join('%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['regularize'])
  print 'number of learnable parameters total: %d' % (sum(model[k].shape[0] * model[k].shape[1] for k in misc['update']), )

  # initialize the Solver and the cost function
  solver = Solver()
  def costfun(batch, model):
    # wrap the cost function to abstract some things away from the Solver
    return RNNGenCost(batch, model, params, misc)

  # calculate how many iterations we need
  num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
  num_iters_one_epoch = num_sentences_total / batch_size
  max_iters = max_epochs * num_iters_one_epoch
  eval_period_in_epochs = params['eval_period']
  eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs))
  abort = False
  top_val_ppl2 = -1
  smooth_train_ppl2 = len(misc['ixtoword']) # initially size of dictionary of confusion
  val_ppl2 = len(misc['ixtoword'])
  last_status_write_time = 0 # for writing worker job status reports
  json_worker_status = {}
  json_worker_status['params'] = params
  json_worker_status['history'] = []

  max_iters = 1 # NOTE: overrides the value computed above; this debug variant runs a single iteration
  for it in xrange(max_iters):
    if abort: break
    t0 = time.time()
    # fetch a batch of data
    batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)]
    # evaluate cost, gradient and perform parameter update
    step_struct = solver.step(batch, model, costfun, **params)
    cost = step_struct['cost']
    dt = time.time() - t0

    # print training statistics
    train_ppl2 = step_struct['stats']['ppl2']
    smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2 # exponentially decaying moving average
    if it == 0: smooth_train_ppl2 = train_ppl2 # start out where we start out
    epoch = it * 1.0 / num_iters_one_epoch
    print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \
          % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'], train_ppl2, smooth_train_ppl2)

    # perform gradient check if desired, with a bit of a burnin time (10 iterations)
    #if it == 10 and do_grad_check:
    #  solver.gradCheck(batch, model, costfun)
    #  print 'done gradcheck. continue?'
    #  raw_input()
    #
    ## detect if loss is exploding and kill the job if so
    #total_cost = cost['total_cost']
    #if it == 0:
    #  total_cost0 = total_cost # store this initial cost
    #if total_cost > total_cost0 * 2:
    #  print 'Aborting, cost seems to be exploding. Run gradcheck? Lower the learning rate?'
    #  abort = True # set the abort flag, we'll break out
    #
    ## logging: write JSON files for visual inspection of the training
    #tnow = time.time()
    #if tnow > last_status_write_time + 60*1: # every now and then lets write a report
    #  last_status_write_time = tnow
    #  jstatus = {}
    #  jstatus['time'] = datetime.datetime.now().isoformat()
    #  jstatus['iter'] = (it, max_iters)
    #  jstatus['epoch'] = (epoch, max_epochs)
    #  jstatus['time_per_batch'] = dt
    #  jstatus['smooth_train_ppl2'] = smooth_train_ppl2
    #  jstatus['val_ppl2'] = val_ppl2 # just write the last available one
    #  jstatus['train_ppl2'] = train_ppl2
    #  json_worker_status['history'].append(jstatus)
    #  status_file = os.path.join(params['worker_status_output_directory'], host + '_status.json')
    #  try:
    #    json.dump(json_worker_status, open(status_file, 'w'))
    #  except Exception, e: # todo be more clever here
    #    print 'tried to write worker status into %s but got error:' % (status_file, )
    #    print e
    #
    ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good
    #is_last_iter = (it+1) == max_iters
    #if (((it+1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
    #  val_ppl2 = eval_split('val', dp, model, params, misc) # perform the evaluation on VAL set
    #  print 'validation perplexity = %f' % (val_ppl2, )
    #  write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
    #  if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
    #    if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
    #      # if we beat a previous record or if this is the first time
    #      # AND we also beat the user-defined threshold or it doesnt exist
    #      top_val_ppl2 = val_ppl2
    #      filename = 'model_checkpoint_%s_%s_%s_%.2f.p' % (dataset, host, params['fappend'], val_ppl2)
    #      filepath = os.path.join(params['checkpoint_output_directory'], filename)
    #      checkpoint = {}
    #      checkpoint['it'] = it
    #      checkpoint['epoch'] = epoch
    #      checkpoint['model'] = model
    #      checkpoint['params'] = params
    #      checkpoint['perplexity'] = val_ppl2
    #      checkpoint['wordtoix'] = misc['wordtoix']
    #      checkpoint['ixtoword'] = misc['ixtoword']
    #      try:
    #        pickle.dump(checkpoint, open(filepath, "wb"))
    #        print 'saved checkpoint in %s' % (filepath, )
    #      except Exception, e: # todo be more clever here
    #        print 'tried to write checkpoint into %s but got error: ' % (filepath, )
    #        print e

  cuda.close()
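# Usage sketch (hypothetical, not part of the original script): the params dict is normally
# assembled by an argparse front-end elsewhere in the repo. The keys below are the ones this
# main() actually reads; the values and dataset name are illustrative assumptions only.
if __name__ == '__main__':
  example_params = {
    'dataset': 'flickr8k',         # dataset name handed to getDataProvider
    'batch_size': 100,             # image-sentence pairs per solver step
    'word_count_threshold': 5,     # words rarer than this are dropped from the vocab
    'do_grad_check': False,        # enables the (commented-out) finite-difference check
    'max_epochs': 50,              # nominal bound; this debug variant overrides max_iters to 1
    'eval_period': 1.0,            # validation period, in epochs
  }
  main(example_params)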
def main(params):
  batch_size = params['batch_size']
  word_count_threshold = params['word_count_threshold']
  max_epochs = params['max_epochs']
  host = socket.gethostname() # get computer hostname; used in log and checkpoint filenames below

  # fetch the data provider
  dp = getDataProvider(params)

  # Initialize the optimizer
  solver = Solver(params['solver'])

  params['aux_inp_size'] = dp.aux_inp_size
  params['image_feat_size'] = dp.img_feat_size
  print 'Image feature size is %d, and aux input size is %d' % (params['image_feat_size'], params['aux_inp_size'])

  misc = {} # stores various misc items that need to be passed around the framework

  if params['checkpoint_file_name'] == 'None':
    # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur
    # at least word_count_threshold number of times
    misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(
        dp.iterSentences('train'), word_count_threshold)
  else:
    # Load the vocabulary from the checkpoint (checkpoint_init is assumed to be loaded elsewhere)
    misc = checkpoint_init['misc']

  params['vocabulary_size'] = len(misc['wordtoix'])
  params['output_size'] = len(misc['ixtoword']) # these should match though

  # This initializes the generator model parameters and does matrix initializations
  if params['t_eval_only'] == 0:
    generator = decodeGenerator(params)
    # Build the computational graph

    if params['use_encoder_for'] & 2:
      aux_enc_inp = generator.model_th['Wemb'] if params['encode_gt_sentences'] else dp.aux_inputs.T
      hid_size = params['featenc_hidden_size']
      auxFeatEncoder = RecurrentFeatEncoder(hid_size, params['image_encoding_size'], params,
                                            mdl_prefix='aux_enc_', features=aux_enc_inp)
      mdlLen = len(generator.model_th.keys())
      generator.model_th.update(auxFeatEncoder.model_th)
      assert (len(generator.model_th.keys()) == (mdlLen + len(auxFeatEncoder.model_th.keys())))
      (auxenc_use_dropout, auxFeatEnc_inp, xAux, updatesLSTMAuxFeat) = auxFeatEncoder.build_model(generator.model_th, params)

      if params['encode_gt_sentences']:
        # Reshape it to size (batch_size, n_gt, hidden_size)
        xAux = xAux.reshape((-1, params['n_encgt_sent'], params['featenc_hidden_size']))
        # Convert it to size (batch_size, n_gt*hidden_size)
        xAux = xAux.flatten(2)
        xI = tensor.zeros((batch_size, params['image_encoding_size']))
      imgFeatEnc_inp = []
    else:
      auxFeatEnc_inp = []
      imgFeatEnc_inp = []
      xAux = None
      xI = None

    (gen_inp_list, predLogProb, predIdx, predCand, gen_out, updatesLstm,
     seq_lengths) = generator.build_prediction_model(generator.model_th, params, xI=xI, xAux=xAux)
    gen_inp_list = imgFeatEnc_inp + auxFeatEnc_inp + gen_inp_list
    gen_out = gen_out.reshape([gen_out.shape[0], -1, params['n_gen_samples'], params['vocabulary_size']])
    # convert updates lstm to a tuple, this is to help merge it with grad updates
    updatesLstm = [(k, v) for k, v in updatesLstm.iteritems()]
    f_gen_only = theano.function(gen_inp_list, [predLogProb, predIdx, gen_out, seq_lengths],
                                 name='f_pred', updates=updatesLstm)

    modelGen = generator.model_th
    upListGen = generator.update_list

    if params['use_mle_train']:
      (use_dropout_genTF, inp_list_genTF, _, cost_genTF, _, updatesLSTM_genTF) = generator.build_model(generator.model_th, params)
      f_eval_genTF = theano.function(inp_list_genTF, cost_genTF, name='f_eval')
      grads_genTF = tensor.grad(cost_genTF[0], wrt=modelGen.values(), add_names=True)
      lr_genTF = tensor.scalar(name='lr', dtype=config.floatX)
      f_grad_genTF, f_update_genTF, zg_genTF, rg_genTF, ud_genTF = solver.build_solver_model(
          lr_genTF, modelGen, grads_genTF, inp_list_genTF, cost_genTF, params)
  else:
    modelGen = []
    updatesLstm = []

  if params['met_to_track'] != []:
    trackMetargs = {'eval_metric': params['met_to_track']}
    refToks, scr_info = eval_prep_refs('val', dp, params['met_to_track'])
    trackMetargs['refToks'] = refToks
    trackMetargs['scr_info'] = scr_info

  # Initialize the evaluator model
  if params['share_Wemb']:
    evaluator = decodeEvaluator(params, modelGen['Wemb'])
  else:
    evaluator = decodeEvaluator(params)
  modelEval = evaluator.model_th

  if params['t_eval_only'] == 0:
    # Build the evaluator graph to evaluate reference and generated captions
    if params.get('upd_eval_ref', 0):
      (refeval_inp_list, ref_f_pred_fns, ref_costs, ref_predTh, ref_modelEval) = evaluator.build_advers_eval(modelEval, params)
    (eval_inp_list, f_pred_fns, costs, predTh, modelEval) = evaluator.build_advers_eval(
        modelEval, params, gen_inp_list, gen_out, updatesLstm, seq_lengths)
  else:
    # Build the evaluator graph to evaluate only reference captions
    (eval_inp_list, f_pred_fns, costs, predTh, modelEval) = evaluator.build_advers_eval(modelEval, params)

  # force overwrite here. The bias to the softmax is initialized to reflect word frequencies
  if params['t_eval_only'] == 0: # and 0:
    if params['checkpoint_file_name'] == 'None':
      modelGen['bd'].set_value(bias_init_vector.astype(config.floatX))
      if params.get('class_out_factoring', 0) == 1:
        modelGen['bdCls'].set_value(bias_init_inter_class.astype(config.floatX))

  comb_inp_list = eval_inp_list
  if params['t_eval_only'] == 0:
    for inp in gen_inp_list:
      if inp not in comb_inp_list:
        comb_inp_list.append(inp)

  # Compile an evaluation function.. Doesn't include gradients
  # To be used for validation set evaluation or debug purposes
  if params['t_eval_only'] == 0:
    f_eval = theano.function(comb_inp_list, costs[:1], name='f_eval', updates=updatesLstm)
  else:
    f_eval = theano.function(comb_inp_list, costs[:1], name='f_eval')

  if params['share_Wemb']:
    modelEval.pop('Wemb')
  if params['fix_Wemb']:
    upListGen.remove('Wemb')

  #-------------------------------------------------------------------------------------------
  # Now let's build a gradient computation graph and update mechanism
  #-------------------------------------------------------------------------------------------
  # First compute gradient on the evaluator params w.r.t cost
  if params.get('upd_eval_ref', 0):
    gradsEval_ref = tensor.grad(ref_costs[0], wrt=modelEval.values(), add_names=True)
  gradsEval = tensor.grad(costs[0], wrt=modelEval.values(), add_names=True)

  # Update functions for the evaluator
  lrEval = tensor.scalar(name='lrEval', dtype=config.floatX)
  if params.get('upd_eval_ref', 0):
    f_grad_comp_eval_ref, f_param_update_eval_ref, _, _, _ = solver.build_solver_model(
        lrEval, modelEval, gradsEval_ref, refeval_inp_list, ref_costs[0], params, w_clip=params['eval_w_clip'])
  f_grad_comp_eval, f_param_update_eval, zg_eval, rg_eval, ud_eval = solver.build_solver_model(
      lrEval, modelEval, gradsEval, comb_inp_list, costs[:1], params, updatesLstm, w_clip=params['eval_w_clip'])

  # Now compute gradient on the generator params w.r.t the cost
  if params['t_eval_only'] == 0:
    gradsGen = tensor.grad(costs[1], wrt=modelGen.values(), add_names=True)
    lrGen = tensor.scalar(name='lrGen', dtype=config.floatX)
    # Update functions for the generator
    f_grad_comp_gen, f_param_update_gen, zg_gen, rg_gen, ud_gen = solver.build_solver_model(
        lrGen, modelGen, gradsGen,
        comb_inp_list[:(len(comb_inp_list) - 1 + params['gen_feature_matching'])],
        costs[1], params, updatesLstm)

  #-------------------------------------------------------------------------------------------
  # If we want to track some metrics during the training, initialize stuff for that now
  #-------------------------------------------------------------------------------------------
  print 'model init done.'
  if params['t_eval_only'] == 0:
    print 'Gen model has keys: ' + ', '.join(modelGen.keys())
  print 'Eval model has keys: ' + ', '.join(modelEval.keys())

  # calculate how many iterations we need; one epoch is considered once going through all the
  # sentences and not images, hence in case of coco/flickr this will be 5x the number of images
  num_sentences_total = dp.getSplitSize('train', ofwhat='images')
  num_iters_one_epoch = num_sentences_total / batch_size
  max_iters = max_epochs * num_iters_one_epoch
  skip_first = 20
  iters_eval = 5
  iters_gen = 1

  cost_eval_iter = []
  cost_gen_iter = []
  trackSc_array = []

  eval_period_in_epochs = params['eval_period']
  eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs))
  top_val_ppl2 = -1
  smooth_train_ppl2 = 0.5
  smooth_train_cost = 0.0
  smooth_train_cost_gen = 1.0
  val_ppl2 = len(misc['ixtoword'])
  last_status_write_time = 0 # for writing worker job status reports
  json_worker_status = {}
  json_worker_status['params'] = params
  json_worker_status['history'] = []
  write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']

  iter_out_file = os.path.join('logs', 'advmodel_checkpoint_%s_%s_%s_log.npz' %
                               (params['dataset'], host, params['fappend']))

  len_hist = defaultdict(int)
  t_print_sec = 30

  ## Initialize the model parameters from the checkpoint file if we are resuming training
  if params['checkpoint_file_name'] != 'None':
    if params['t_eval_only'] != 1:
      print '\n Now initing gen Model:'
      zipp(model_init_gen_from, modelGen)
    if 'trackers' in checkpoint_init:
      trackSc_array = checkpoint_init['trackers'].get('trackScores', [])
    print '\n Now initing Eval Model:'
    zipp(model_init_eval_from, modelEval)
    #zipp(rg_init, rgGen)
    print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" %
          (checkpoint_init['epoch'], checkpoint_init['perplexity']))

  ##############################################################
  # Define signal handler to catch ctrl-c or kills so that we can save the model trained till that point
  def signal_handler(signal, frame):
    print('You pressed Ctrl+C! Saving Checkpoint Now before exiting!')
    filename = 'advmodel_checkpoint_%s_%s_%s_%.2f_INT.p' % (params['dataset'], host, params['fappend'], val_ppl2)
    dumpCheckpoint(filename, params, modelGen, modelEval, misc, it, val_ppl2)
    sys.exit(0)
  #signal.signal(signal.SIGINT, signal_handler)
  ##############################################################

  # In testing disable sampling and use the greedy approach!?
  generator.usegumbel.set_value(1)
  if params['met_to_track'] != []:
    tsc_max, tsc_mean, tsc_min = eval_gen_samps(f_gen_only, dp, params, misc, params['rev_eval'], **trackMetargs)
    trackSc_array.append((0, {evm + '_max': tsc_max[i] for i, evm in enumerate(params['met_to_track'])}))
    trackSc_array[-1][1].update({evm + '_mean': tsc_mean[i] for i, evm in enumerate(params['met_to_track'])})
    trackSc_array[-1][1].update({evm + '_min': tsc_min[i] for i, evm in enumerate(params['met_to_track'])})

  disp_some_gen_samps(f_gen_only, dp, params, misc, n_samp=5)
  evaluator.use_noise.set_value(1.)
  eval_acc, gen_acc = eval_discrm_gen('val', dp, params, f_pred_fns[0], misc)
  # Re-enable sampling
  generator.usegumbel.set_value(1)

  np.savez(iter_out_file, eval_cost=np.array(cost_eval_iter), gen_cost=np.array(cost_gen_iter),
           tracksc=np.array(trackSc_array))
  smooth_train_cost = 0.0

  print '###################### NOW BEGINNING TRAINING #################################'

  for it in xrange(max_iters):
    t0 = time.time()
    # Enable using dropout in training
    evaluator.use_noise.set_value(1.)
    dt = 0.
    it2 = 0
    while eval_acc <= 60. or gen_acc >= 45. or it2 < iters_eval * skip_first:
      # fetch a batch of data
      t1 = time.time()
      s_probs = [0.6, 0.4, 0.0] if params['eval_loss'] == 'contrastive' else [1.0, 0.0, 0.0]
      batch = dp.sampAdversBatch(batch_size, n_sent=params['n_gen_samples'], probs=s_probs)
      cnn_inps = prepare_adv_data(batch, misc['wordtoix'], maxlen=params['maxlen'], prep_for=params['eval_model'])
      enc_inp_list = prepare_seq_features(batch, use_enc_for=params['use_encoder_for'], maxlen=params['maxlen'],
                                          use_shared_mem=params['use_shared_mem_enc'],
                                          enc_gt_sent=params['encode_gt_sentences'],
                                          n_enc_sent=params['n_encgt_sent'], wordtoix=misc['wordtoix'])
      eval_cost = f_grad_comp_eval(*(cnn_inps + enc_inp_list))
      if np.isnan(eval_cost[0]):
        import pdb
        pdb.set_trace()
      f_param_update_eval(params['learning_rate_eval'])

      # Track training statistics
      smooth_train_cost = 0.99 * smooth_train_cost + 0.01 * eval_cost[0] if it > 0 else eval_cost[0]
      dt2 = time.time() - t1
      if it2 % 500 == 499:
        gb = 0. #modelGen['gumb_temp'].get_value() if params['use_gumbel_mse'] == 1 else 0
        print 'Iter %d/%d Eval Only Iter %d/%d, done. in %.3fs. Eval Cost is %.6f' % (
            it, max_iters, it2, iters_eval * skip_first, dt2, smooth_train_cost)
      if it2 % 100 == 99:
        eval_acc, gen_acc = eval_discrm_gen('val', dp, params, f_pred_fns[0], misc, n_eval=500)
      it2 += 1

    evaluator.use_noise.set_value(1.)
    if it >= 0:
      skip_first = 1
    if it >= 100:
      skip_first = 1
    if it % 1000 == 999:
      skip_first = 1

    s_probs = [1.0, 0.0, 0.0] if params['eval_loss'] == 'contrastive' else [1.0, 0.0, 0.0]
    batch = dp.sampAdversBatch(batch_size, n_sent=params['n_gen_samples'], probs=s_probs)
    cnn_inps = prepare_adv_data(batch, misc['wordtoix'], maxlen=params['maxlen'], prep_for=params['eval_model'])
    enc_inp_list = prepare_seq_features(batch, use_enc_for=params['use_encoder_for'], maxlen=params['maxlen'],
                                        use_shared_mem=params['use_shared_mem_enc'],
                                        enc_gt_sent=params['encode_gt_sentences'],
                                        n_enc_sent=params['n_encgt_sent'], wordtoix=misc['wordtoix'])
    gen_cost = f_grad_comp_gen(*(cnn_inps[:(len(cnn_inps) - 1 + params['gen_feature_matching'])] + enc_inp_list))
    f_param_update_gen(params['learning_rate_gen'])

    if params['use_mle_train']:
      generator.usegumbel.set_value(0)
      batch, l = dp.getRandBatchByLen(batch_size)
      gen_inp_list, lenS = prepare_data(batch, misc['wordtoix'], params['maxlen'])
      cost_genMLE = f_grad_genTF(*gen_inp_list)
      f_update_genTF(np.float32(params['learning_rate_gen'] / 50.0))
      generator.usegumbel.set_value(1)

    dt = time.time() - t0
    # print training statistics
    smooth_train_cost_gen = gen_cost if it == 0 else 0.99 * smooth_train_cost_gen + 0.01 * gen_cost

    tnow = time.time()
    if tnow > last_status_write_time + t_print_sec * 1: # every now and then lets write a report
      gb = 0. #modelGen['gumb_temp'].get_value() if params['use_gumbel_mse'] == 1 else 0
      print 'Iter %d/%d done. in %.3fs. Eval Cost is %.6f, Gen Cost is %.6f, temp: %.4f' % (it, max_iters, dt, \
            smooth_train_cost, smooth_train_cost_gen, gb)
      last_status_write_time = tnow
    cost_eval_iter.append(smooth_train_cost)
    cost_gen_iter.append(smooth_train_cost_gen)

    if it % 500 == 499:
      # Run the generator on the validation set and compute some metrics
      generator.usegumbel.set_value(1)
      if params['met_to_track'] != []:
        # In testing set the temperature to very low, so that it is equivalent to greedy samples
        tsc_max, tsc_mean, tsc_min = eval_gen_samps(f_gen_only, dp, params, misc, params['rev_eval'], **trackMetargs)
        trackSc_array.append((it, {evm + '_max': tsc_max[i] for i, evm in enumerate(params['met_to_track'])}))
        trackSc_array[-1][1].update({evm + '_mean': tsc_mean[i] for i, evm in enumerate(params['met_to_track'])})
        trackSc_array[-1][1].update({evm + '_min': tsc_min[i] for i, evm in enumerate(params['met_to_track'])})
      disp_some_gen_samps(f_gen_only, dp, params, misc, n_samp=5)
      generator.usegumbel.set_value(1)
      top_val_ppl2 = gen_acc

    if it % 500 == 499:
      eval_acc, gen_acc = eval_discrm_gen('val', dp, params, f_pred_fns[0], misc, n_eval=500)

    if it % 1000 == 999:
      filename = 'advmodel_checkpoint_%s_%s_%s_%d_%.2f_genacc.p' % (params['dataset'], host, params['fappend'], it, gen_acc)
      dumpCheckpoint(filename, params, modelGen, modelEval, misc, it, gen_acc)

    if it % 500 == 499:
      np.savez(iter_out_file, eval_cost=np.array(cost_eval_iter), gen_cost=np.array(cost_gen_iter),
               tracksc=np.array(trackSc_array))

  # training done; save a final checkpoint
  filename = 'advmodel_checkpoint_%s_%s_%s_%d_%.2f_GenDone.p' % (params['dataset'], host, params['fappend'], it, gen_acc)
  dumpCheckpoint(filename, params, modelGen, modelEval, misc, it, gen_acc)
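# dumpCheckpoint is defined elsewhere in the repo. As a hedged sketch of what it plausibly
# does, assuming the same pickle-based checkpoint convention the other training scripts here
# use (unzip() snapshotting Theano shared variables into numpy arrays); the helper name and
# dict layout below are illustrative assumptions, not the repo's actual definition:
def dump_checkpoint_sketch(filename, params, modelGen, modelEval, misc, it, score):
  checkpoint = {'it': it, 'params': params, 'perplexity': score, 'misc': misc,
                'model_gen': unzip(modelGen) if modelGen else None,
                'model_eval': unzip(modelEval)}
  filepath = os.path.join(params['checkpoint_output_directory'], filename)
  pickle.dump(checkpoint, open(filepath, 'wb'))
  print 'saved checkpoint in %s' % (filepath, )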
def main(params):
  word_count_threshold = params['word_count_threshold']
  max_epochs = params['max_epochs']
  host = socket.gethostname() # get computer hostname

  # fetch the data provider
  dp = getDataProvider(params)

  # Initialize the optimizer
  solver = Solver(params['solver'])

  params['image_feat_size'] = dp.img_feat_size

  misc = {} # stores various misc items that need to be passed around the framework

  # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur
  # at least word_count_threshold number of times
  misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(dp.iterSentences('train'), word_count_threshold)

  params['use_dropout'] = 1

  if params['fine_tune'] == 1:
    params['mode'] = 'multimodal_lstm'
    if params['checkpoint_file_name'] != None:
      params['batch_size'] = dp.dataset['batchsize']
      misc['wordtoix'] = checkpoint_init['wordtoix']
      misc['ixtoword'] = checkpoint_init['ixtoword']
    batch_size = 1
    num_sentences_total = dp.getSplitSize('train', ofwhat='images')
  else:
    params['mode'] = 'batchtrain'
    batch_size = params['batch_size']
    num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')

  params['vocabulary_size'] = len(misc['wordtoix'])
  pos_samp = np.arange(batch_size, dtype=np.int32)

  # This initializes the model parameters and does matrix initializations
  evalModel = decodeEvaluator(params)
  model, misc['update'], misc['regularize'] = (evalModel.model_th, evalModel.updateP, evalModel.regularize)

  # Define the computational graph for relating the input image features and word indices to the
  # log probability cost function.
  (use_dropout, inp_list, miscOuts, cost, predTh, model) = evalModel.build_model(model, params)

  # Add the regularization cost. Since this is specific to training and doesn't get included when we
  # evaluate the cost on test or validation data, we leave it here outside the model definition
  if params['regc'] > 0.:
    reg_cost = theano.shared(numpy_floatX(0.), name='reg_c')
    reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']), name='reg_c')
    reg_cost = 0.
    for p in misc['regularize']:
      reg_cost += (model[p] ** 2).sum()
    reg_cost *= 0.5 * reg_c
    cost[0] += (reg_cost / params['batch_size'])

  # Compile an evaluation function.. Doesn't include gradients
  # To be used for validation set evaluation
  f_eval = theano.function(inp_list, cost, name='f_eval')

  # Now let's build a gradient computation graph and rmsprop update mechanism
  grads = tensor.grad(cost, wrt=model.values())
  lr = tensor.scalar(name='lr', dtype=config.floatX)
  if params['sim_minibatch'] > 0:
    f_grad_accum, f_clr, ag = solver.accumGrads(model, grads, inp_list, cost, params['sim_minibatch'])
    f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(lr, model, ag, inp_list, cost, params)
  else:
    f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(lr, model, grads, inp_list, cost, params)

  print 'model init done.'
  print 'model has keys: ' + ', '.join(model.keys())

  # calculate how many iterations we need; one epoch is considered once going through all the sentences
  # and not images, hence in case of coco/flickr this will be 5x the number of images
  num_iters_one_epoch = num_sentences_total / batch_size
  max_iters = max_epochs * num_iters_one_epoch
  inner_loop = params['sim_minibatch'] if params['sim_minibatch'] > 0 else 1
  max_iters = max_iters / inner_loop
  eval_period_in_epochs = params['eval_period']
  eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs / inner_loop))
  top_val_ppl2 = -1
  smooth_train_cost = len(misc['ixtoword']) # initially size of dictionary of confusion
  val_ppl2 = len(misc['ixtoword'])
  last_status_write_time = 0 # for writing worker job status reports
  json_worker_status = {}
  json_worker_status['params'] = params
  json_worker_status['history'] = []
  len_hist = defaultdict(int)

  ## Initialize the model parameters from the checkpoint file if we are resuming training
  if params['checkpoint_file_name'] != None:
    zipp(model_init_from, model)
    zipp(rg_init, rg)
    print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" %
          (checkpoint_init['epoch'], checkpoint_init['perplexity']))
  elif params['init_from_imagernn'] != None:
    # Initialize word vecs and image emb from generative model file
    rnnCv = pickle.load(open(params['init_from_imagernn'], 'rb'))
    model['Wemb'].set_value(rnnCv['model']['Wemb'])
    model['WIemb'].set_value(rnnCv['model']['WIemb_aux'])
    misc['wordtoix'] = rnnCv['wordtoix']
    misc['ixtoword'] = rnnCv['ixtoword']
    print("\n Initialized Word embedding and Image embeddings from gen model %s" % (params['init_from_imagernn']))

  use_dropout.set_value(1.)

  #################### Main Loop ############################################
  for it in xrange(max_iters):
    t0 = time.time()
    # fetch a batch of data
    cost_inner = np.zeros((inner_loop,), dtype=np.float32)
    if params['sim_minibatch'] > 0:
      for i_l in xrange(inner_loop):
        batch, pos_samp_sent = dp.sampPosNegSentSamps(params['batch_size'], params['mode'], thresh=0.3)
        real_inp_list, lenS = prepare_data(batch, misc['wordtoix'], maxlen=params['maxlen'],
                                           pos_samp=pos_samp, prep_for=params['eval_model'])
        if params['fine_tune'] == 1:
          real_inp_list.append(pos_samp_sent)
        cost_inner[i_l] = f_grad_accum(*real_inp_list)
    else:
      batch, pos_samp_sent = dp.sampPosNegSentSamps(params['batch_size'], params['mode'], thresh=0.3)
      real_inp_list, lenS = prepare_data(batch, misc['wordtoix'], maxlen=params['maxlen'],
                                         pos_samp=pos_samp, prep_for=params['eval_model'])
      if params['fine_tune'] == 1:
        real_inp_list.append(pos_samp_sent)

    # Enable using dropout in training
    cost = f_grad_shared(*real_inp_list)
    f_update(params['learning_rate'])
    dt = time.time() - t0

    # Reset accumulated gradients to 0
    if params['sim_minibatch'] > 0:
      f_clr()
    #print 'model: ' + ' '.join([str(np.isnan(model[m].get_value()).any()) for m in model])
    #print 'rg: ' + ' '.join([str(np.isnan(rg[i].get_value()).any()) for i in xrange(len(rg))])
    #print 'zg: ' + ' '.join([str(np.isnan(zg[i].get_value()).any()) for i in xrange(len(zg))])
    #print 'ud: ' + ' '.join([str(np.isnan(ud[i].get_value()).any()) for i in xrange(len(ud))])
    #import pdb; pdb.set_trace()

    # print training statistics
    epoch = it * inner_loop * 1.0 / num_iters_one_epoch
    total_cost = (np.e**-cost + (np.e**(-cost_inner)).sum() * (params['sim_minibatch'] > 0)) / (1 + params['sim_minibatch'])
    if it == 0:
      smooth_train_cost = total_cost
    else:
      smooth_train_cost = 0.99 * smooth_train_cost + 0.01 * total_cost

    tnow = time.time()
    if tnow > last_status_write_time + 60*1: # every now and then lets write a report
      print '%d/%d batch done in %.3fs. at epoch %.2f. Prob now is %.3f' % (it, max_iters, dt, epoch, smooth_train_cost)
      last_status_write_time = tnow
      jstatus = {}
      jstatus['time'] = datetime.datetime.now().isoformat()
      jstatus['iter'] = (it, max_iters)
      jstatus['epoch'] = (epoch, max_epochs)
      jstatus['time_per_batch'] = dt
      jstatus['val_ppl2'] = val_ppl2 # just write the last available one
      json_worker_status['history'].append(jstatus)
      status_file = os.path.join(params['worker_status_output_directory'], host + '_status.json')
      #import pdb; pdb.set_trace()
      try:
        json.dump(json_worker_status, open(status_file, 'w'))
      except Exception, e: # todo be more clever here
        print 'tried to write worker status into %s but got error:' % (status_file, )
        print e

    ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good
    is_last_iter = (it+1) == max_iters
    if (((it+1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
      # Disable using dropout in validation
      use_dropout.set_value(0.)
      val_ppl2 = eval_split_theano('val', dp, model, params, misc, f_eval) # perform the evaluation on VAL set
      if epoch - params['lr_decay_st_epoch'] >= 0:
        params['learning_rate'] = params['learning_rate'] * params['lr_decay']
        params['lr_decay_st_epoch'] += 1
      print 'validation perplexity = %f, lr = %f' % (val_ppl2, params['learning_rate'])
      if params['sample_by_len'] == 1:
        print len_hist

      write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
      if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
        if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
          # if we beat a previous record or if this is the first time
          # AND we also beat the user-defined threshold or it doesnt exist
          #top_val_ppl2 = val_ppl2
          filename = '%s_checkpoint_%s_%s_%s_%.2f_%.2f.p' % (params['eval_model'], params['dataset'], host,
                                                             params['fappend'], val_ppl2, smooth_train_cost)
          filepath = os.path.join(params['checkpoint_output_directory'], filename)
          model_npy = unzip(model)
          rgrads_npy = unzip(rg)
          checkpoint = {}
          checkpoint['it'] = it
          checkpoint['epoch'] = epoch
          checkpoint['model'] = model_npy
          checkpoint['rgrads'] = rgrads_npy
          checkpoint['params'] = params
          checkpoint['perplexity'] = val_ppl2
          checkpoint['wordtoix'] = misc['wordtoix']
          checkpoint['ixtoword'] = misc['ixtoword']
          try:
            pickle.dump(checkpoint, open(filepath, "wb"))
            print 'saved checkpoint in %s' % (filepath, )
          except Exception, e: # todo be more clever here
            print 'tried to write checkpoint into %s but got error: ' % (filepath, )
            print e
      use_dropout.set_value(1.)
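# zipp/unzip above are assumed to follow the standard Theano LSTM tutorial convention:
# zipp() pushes a dict of saved numpy arrays into the corresponding Theano shared variables
# (restoring a checkpoint), and unzip() snapshots the shared variables back into plain numpy
# arrays (making them picklable). A minimal sketch of that convention; the repo's own
# definitions live elsewhere:
from collections import OrderedDict

def zipp(np_params, theano_params):
  # copy saved numpy values into the live shared variables
  for kk, vv in np_params.iteritems():
    theano_params[kk].set_value(vv)

def unzip(theano_params):
  # snapshot shared variables into picklable numpy arrays
  new_params = OrderedDict()
  for kk, vv in theano_params.iteritems():
    new_params[kk] = vv.get_value()
  return new_params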
def main(params):
  word_count_threshold = params['word_count_threshold']
  max_epochs = params['max_epochs']
  host = socket.gethostname() # get computer hostname

  # fetch the data provider
  dp = getDataProvider(params)

  # Initialize the optimizer
  solver = Solver(params['solver'])

  params['image_feat_size'] = dp.img_feat_size
  params['aux_inp_size'] = dp.aux_inp_size

  misc = {} # stores various misc items that need to be passed around the framework

  # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur
  # at least word_count_threshold number of times
  misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(
      dp.iterSentences('train'), word_count_threshold)

  if params['fine_tune'] == 1:
    params['mode'] = 'multi_choice_mode' if params['mc_mode'] == 1 else 'multimodal_lstm'
    if params['checkpoint_file_name'] != None:
      #params['batch_size'] = dp.dataset['batchsize']
      misc['wordtoix'] = checkpoint_init['wordtoix']
      misc['ixtoword'] = checkpoint_init['ixtoword']
    batch_size = 1
    num_sentences_total = dp.getSplitSize('train', ofwhat='images')
  else:
    params['mode'] = 'batchtrain'
    batch_size = params['batch_size']
    num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')

  params['vocabulary_size'] = len(misc['wordtoix'])
  pos_samp = np.arange(batch_size, dtype=np.int32)

  # This initializes the model parameters and does matrix initializations
  evalModel = decodeEvaluator(params)
  model, misc['update'], misc['regularize'] = (evalModel.model_th, evalModel.updateP, evalModel.regularize)

  #----------------- If we are using feature encoders -----------------------
  if params['use_encoder_for'] & 1:
    imgFeatEncoder = RecurrentFeatEncoder(params['image_feat_size'], params['sent_encoding_size'],
                                          params, mdl_prefix='img_enc_', features=dp.features.T)
    mdlLen = len(model.keys())
    model.update(imgFeatEncoder.model_th)
    assert (len(model.keys()) == (mdlLen + len(imgFeatEncoder.model_th.keys())))
    #misc['update'].extend(imgFeatEncoder.update_list)
    misc['regularize'].extend(imgFeatEncoder.regularize)
    (imgenc_use_dropout, imgFeatEnc_inp, xI, updatesLSTMImgFeat) = imgFeatEncoder.build_model(model, params)
  else:
    xI = None
    imgFeatEnc_inp = []

  # Define the computational graph for relating the input image features and word indices to the
  # log probability cost function.
  (use_dropout, inp_list_eval, miscOuts, cost, predTh, model) = evalModel.build_model(
      model, params, xI=xI, prior_inp_list=imgFeatEnc_inp)
  inp_list = imgFeatEnc_inp + inp_list_eval

  # Compile an evaluation function.. Doesn't include gradients
  # To be used for validation set evaluation
  f_eval = theano.function(inp_list, cost, name='f_eval')

  # Add the regularization cost. Since this is specific to training and doesn't get included when we
  # evaluate the cost on test or validation data, we leave it here outside the model definition
  if params['regc'] > 0.:
    reg_cost = theano.shared(numpy_floatX(0.), name='reg_c')
    reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']), name='reg_c')
    for p in misc['regularize']:
      reg_cost += (model[p]**2).sum()
    reg_cost *= 0.5 * reg_c
    cost[0] += (reg_cost / params['batch_size'])

  # Now let's build a gradient computation graph and rmsprop update mechanism
  grads = tensor.grad(cost[0], wrt=model.values())
  lr = tensor.scalar(name='lr', dtype=config.floatX)
  if params['sim_minibatch'] > 0:
    f_grad_accum, f_clr, ag = solver.accumGrads(model, grads, inp_list, cost, params['sim_minibatch'])
    f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(lr, model, ag, inp_list, cost, params)
  else:
    f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(lr, model, grads, inp_list, cost, params)

  print 'model init done.'
  print 'model has keys: ' + ', '.join(model.keys())

  # calculate how many iterations we need; one epoch is considered once going through all the sentences
  # and not images, hence in case of coco/flickr this will be 5x the number of images
  num_iters_one_epoch = num_sentences_total / batch_size
  max_iters = max_epochs * num_iters_one_epoch
  inner_loop = params['sim_minibatch'] if params['sim_minibatch'] > 0 else 1
  max_iters = max_iters / inner_loop
  eval_period_in_epochs = params['eval_period']
  eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs / inner_loop))
  top_val_ppl2 = -1
  smooth_train_cost = len(misc['ixtoword']) # initially size of dictionary of confusion
  smooth_error_rate = 100.
  error_rate = 0.
  prev_it = -1
  val_ppl2 = len(misc['ixtoword'])
  last_status_write_time = 0 # for writing worker job status reports
  json_worker_status = {}
  json_worker_status['params'] = params
  json_worker_status['history'] = []
  len_hist = defaultdict(int)

  ## Initialize the model parameters from the checkpoint file if we are resuming training
  if params['checkpoint_file_name'] != None:
    zipp(model_init_from, model)
    zipp(rg_init, rg)
    print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" %
          (checkpoint_init['epoch'], checkpoint_init['perplexity']))
  elif params['init_from_imagernn'] != None:
    # Initialize word vecs and image emb from generative model file
    rnnCv = pickle.load(open(params['init_from_imagernn'], 'rb'))
    model['Wemb'].set_value(rnnCv['model']['Wemb'])
    model['WIemb'].set_value(rnnCv['model']['WIemb_aux'])
    misc['wordtoix'] = rnnCv['wordtoix']
    misc['ixtoword'] = rnnCv['ixtoword']
    print("\n Initialized Word embedding and Image embeddings from gen model %s" % (params['init_from_imagernn']))

  write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
  use_dropout.set_value(1.)

  #################### Main Loop ############################################
  for it in xrange(max_iters):
    t0 = time.time()
    if params['use_encoder_for'] & 1:
      imgenc_use_dropout.set_value(float(params['use_dropout']))

    # fetch a batch of data
    cost_inner = np.zeros((inner_loop,), dtype=np.float32)
    if params['sim_minibatch'] > 0:
      for i_l in xrange(inner_loop):
        batch, pos_samp_sent = dp.sampPosNegSentSamps(params['batch_size'], params['mode'], thresh=0.3)
        eval_inp_list, lenS = prepare_data(batch, misc['wordtoix'], maxlen=params['maxlen'],
                                           pos_samp=pos_samp, prep_for=params['eval_model'],
                                           use_enc_for=params['use_encoder_for'])
        if params['fine_tune'] == 1:
          eval_inp_list.append(pos_samp_sent)
        cost_inner[i_l] = f_grad_accum(*eval_inp_list)
    else:
      batch, pos_samp_sent = dp.sampPosNegSentSamps(params['batch_size'], params['mode'], thresh=0.3)
      enc_inp_list = prepare_seq_features(batch, use_enc_for=params['use_encoder_for'],
                                          use_shared_mem=params['use_shared_mem_enc'])
      eval_inp_list, lenS = prepare_data(batch, misc['wordtoix'], maxlen=params['maxlen'],
                                         pos_samp=pos_samp, prep_for=params['eval_model'],
                                         use_enc_for=params['use_encoder_for'])
      if params['fine_tune'] == 1:
        eval_inp_list.append(pos_samp_sent)
      real_inp_list = enc_inp_list + eval_inp_list

    # Enable using dropout in training
    cost = f_grad_shared(*real_inp_list)
    f_update(params['learning_rate'])
    dt = time.time() - t0

    # Reset accumulated gradients to 0
    if params['sim_minibatch'] > 0:
      f_clr()
    #print 'model: ' + ' '.join([str(np.isnan(model[m].get_value()).any()) for m in model])
    #print 'rg: ' + ' '.join([str(np.isnan(rg[i].get_value()).any()) for i in xrange(len(rg))])
    #print 'zg: ' + ' '.join([str(np.isnan(zg[i].get_value()).any()) for i in xrange(len(zg))])
    #print 'ud: ' + ' '.join([str(np.isnan(ud[i].get_value()).any()) for i in xrange(len(ud))])
    #import pdb; pdb.set_trace()

    # print training statistics
    epoch = it * inner_loop * 1.0 / num_iters_one_epoch
    total_cost = (np.e**(-cost[0]) + (np.e**(-cost_inner)).sum() * (params['sim_minibatch'] > 0)) / (1 + params['sim_minibatch'])
    if it == 0:
      smooth_train_cost = total_cost
    else:
      smooth_train_cost = 0.99 * smooth_train_cost + 0.01 * total_cost

    error_rate += 100.0 * float((cost[2] < 0.).sum()) / batch_size
    margin_strength = cost[2].sum()
    smooth_error_rate = 0.99 * smooth_error_rate + 0.01 * 100.0 * (float(cost[1]) / batch_size) if it > 0 \
                        else 100.0 * (float(cost[1]) / batch_size)

    tnow = time.time()
    if tnow > last_status_write_time + 60 * 1: # every now and then lets write a report
      print '%d/%d batch done in %.3fs. at epoch %.2f. Prob now is %.4f, Error '\
            'rate is %.3f%%, Margin %.2f, negMarg=%.2f' % (it, max_iters, dt, \
            epoch, smooth_train_cost, smooth_error_rate, margin_strength, error_rate / (it - prev_it))
      error_rate = 0.
      prev_it = it
      last_status_write_time = tnow
      jstatus = {}
      jstatus['time'] = datetime.datetime.now().isoformat()
      jstatus['iter'] = (it, max_iters)
      jstatus['epoch'] = (epoch, max_epochs)
      jstatus['time_per_batch'] = dt
      jstatus['val_ppl2'] = val_ppl2 # just write the last available one
      json_worker_status['history'].append(jstatus)
      status_file = os.path.join(params['worker_status_output_directory'], host + '_status.json')
      #import pdb; pdb.set_trace()
      try:
        json.dump(json_worker_status, open(status_file, 'w'))
      except Exception, e: # todo be more clever here
        print 'tried to write worker status into %s but got error:' % (status_file, )
        print e

    ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good
    is_last_iter = (it + 1) == max_iters
    if (((it + 1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
      # Disable using dropout in validation
      use_dropout.set_value(0.)
      if params['use_encoder_for'] & 1:
        imgenc_use_dropout.set_value(0.)

      val_ppl2 = eval_split_theano('val', dp, model, params, misc, f_eval) # perform the evaluation on VAL set

      if epoch - params['lr_decay_st_epoch'] >= 0:
        params['learning_rate'] = params['learning_rate'] * params['lr_decay']
        params['lr_decay_st_epoch'] += 1
      print 'validation perplexity = %f, lr = %f' % (val_ppl2, params['learning_rate'])
      #if params['sample_by_len'] == 1:
      #  print len_hist

      if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
        if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
          # if we beat a previous record or if this is the first time
          # AND we also beat the user-defined threshold or it doesnt exist
          top_val_ppl2 = val_ppl2
          filename = '%s_checkpoint_%s_%s_%s_%.2f_%.2f.p' % (params['eval_model'], params['dataset'], host,
                                                             params['fappend'], smooth_error_rate, val_ppl2)
          filepath = os.path.join(params['checkpoint_output_directory'], filename)
          model_npy = unzip(model)
          rgrads_npy = unzip(rg)
          checkpoint = {}
          checkpoint['it'] = it
          checkpoint['epoch'] = epoch
          checkpoint['model'] = model_npy
          checkpoint['rgrads'] = rgrads_npy
          checkpoint['params'] = params
          checkpoint['perplexity'] = val_ppl2
          checkpoint['wordtoix'] = misc['wordtoix']
          checkpoint['ixtoword'] = misc['ixtoword']
          try:
            pickle.dump(checkpoint, open(filepath, "wb"))
            print 'saved checkpoint in %s' % (filepath, )
          except Exception, e: # todo be more clever here
            print 'tried to write checkpoint into %s but got error: ' % (filepath, )
            print e
      use_dropout.set_value(1.)
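# All of these training loops report exponentially smoothed statistics rather than raw
# per-batch values; the recurring pattern `s = 0.99*s + 0.01*x`, seeded with the first
# observation, is an exponential moving average with decay 0.99. A standalone sketch of
# that update rule (the helper name is illustrative):
def ema_update(smooth, x, it, decay=0.99):
  # first iteration: start at the observed value; afterwards blend in (1-decay) of each new value
  return x if it == 0 else decay * smooth + (1. - decay) * x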
def main(params):
  batch_size = params['batch_size']
  dataset = params['dataset']
  word_count_threshold = params['word_count_threshold']
  do_grad_check = params['do_grad_check']
  max_epochs = params['max_epochs']
  host = socket.gethostname() # get computer hostname; used in status and checkpoint filenames below

  # fetch the data provider
  dp = getDataProvider(dataset)

  misc = {} # stores various misc items that need to be passed around the framework

  # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur
  # at least word_count_threshold number of times
  misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(dp.iterSentences('train'), word_count_threshold)

  # delegate the initialization of the model to the Generator class
  BatchGenerator = decodeGenerator(params)
  init_struct = BatchGenerator.init(params, misc)
  model, misc['update'], misc['regularize'] = (init_struct['model'], init_struct['update'], init_struct['regularize'])

  # force overwrite here. This is a bit of a hack, not happy about it
  model['bd'] = bias_init_vector.reshape(1, bias_init_vector.size)

  print 'model init done.'
  print 'model has keys: ' + ', '.join(model.keys())
  print 'updating: ' + ', '.join('%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['update'])
  print 'regularizing: ' + ', '.join('%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['regularize'])
  print 'number of learnable parameters total: %d' % (sum(model[k].shape[0] * model[k].shape[1] for k in misc['update']), )

  if params.get('init_model_from', ''):
    # load checkpoint
    checkpoint = pickle.load(open(params['init_model_from'], 'rb'))
    model = checkpoint['model'] # overwrite the model

  # initialize the Solver and the cost function
  solver = Solver()
  def costfun(batch, model):
    # wrap the cost function to abstract some things away from the Solver
    return RNNGenCost(batch, model, params, misc)

  # calculate how many iterations we need
  num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
  num_iters_one_epoch = num_sentences_total / batch_size
  max_iters = max_epochs * num_iters_one_epoch
  eval_period_in_epochs = params['eval_period']
  eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs))
  abort = False
  top_val_ppl2 = -1
  smooth_train_ppl2 = len(misc['ixtoword']) # initially size of dictionary of confusion
  val_ppl2 = len(misc['ixtoword'])
  last_status_write_time = 0 # for writing worker job status reports
  json_worker_status = {}
  json_worker_status['params'] = params
  json_worker_status['history'] = []

  import csv
  csvfile = open(os.path.join(params['outdir'], params['generator'] + '.csv'), 'wb')
  csvout = csv.writer(csvfile, delimiter=',', quotechar='"')
  csv_val_file = open(os.path.join(params['outdir'], params['generator'] + '_val.csv'), 'wb')
  csv_val_out = csv.writer(csv_val_file, delimiter=',', quotechar='"')

  for it in xrange(max_iters):
    if abort: break
    t0 = time.time()
    # fetch a batch of data
    batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)]
    # evaluate cost, gradient and perform parameter update
    step_struct = solver.step(batch, model, costfun, **params)
    cost = step_struct['cost']
    dt = time.time() - t0

    # print training statistics
    train_ppl2 = step_struct['stats']['ppl2']
    smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2 # exponentially decaying moving average
    if it == 0: smooth_train_ppl2 = train_ppl2 # start out where we start out
    epoch = it * 1.0 / num_iters_one_epoch
    print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \
          % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'], train_ppl2, smooth_train_ppl2)
    csvout.writerow([it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'], train_ppl2, smooth_train_ppl2])
    csvfile.flush()
    if not host == 'oliver-Aurora-R4':
      sys.stdout.flush()
    # os.system('./update_plots.sh')

    # perform gradient check if desired, with a bit of a burnin time (10 iterations)
    if it == 10 and do_grad_check:
      print 'disabling dropout for gradient check...'
      params['drop_prob_encoder'] = 0
      params['drop_prob_decoder'] = 0
      solver.gradCheck(batch, model, costfun)
      print 'done gradcheck, exiting.'
      sys.exit() # hmmm. probably should exit here

    # detect if loss is exploding and kill the job if so
    total_cost = cost['total_cost']
    if it == 0:
      total_cost0 = total_cost # store this initial cost
    if total_cost > total_cost0 * 2:
      print 'Aborting, cost seems to be exploding. Run gradcheck? Lower the learning rate?'
      abort = True # set the abort flag, we'll break out

    # logging: write JSON files for visual inspection of the training
    tnow = time.time()
    if tnow > last_status_write_time + 60*1: # every now and then lets write a report
      last_status_write_time = tnow
      jstatus = {}
      jstatus['time'] = datetime.datetime.now().isoformat()
      jstatus['iter'] = (it, max_iters)
      jstatus['epoch'] = (epoch, max_epochs)
      jstatus['time_per_batch'] = dt
      jstatus['smooth_train_ppl2'] = smooth_train_ppl2
      jstatus['val_ppl2'] = val_ppl2 # just write the last available one
      jstatus['train_ppl2'] = train_ppl2
      json_worker_status['history'].append(jstatus)
      status_file = os.path.join(params['worker_status_output_directory'], host + '_status.json')
      try:
        json.dump(json_worker_status, open(status_file, 'w'))
      except Exception, e: # todo be more clever here
        print 'tried to write worker status into %s but got error:' % (status_file, )
        print e

    # perform perplexity evaluation on the validation set and save a model checkpoint if it's good
    is_last_iter = (it+1) == max_iters
    if (((it+1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
      val_ppl2 = eval_split('val', dp, model, params, misc) # perform the evaluation on VAL set
      print 'validation perplexity = %f' % (val_ppl2, )

      cp_pred = {}
      cp_pred['it'] = it
      cp_pred['epoch'] = epoch
      cp_pred['model'] = model
      cp_pred['params'] = params
      cp_pred['perplexity'] = val_ppl2
      cp_pred['wordtoix'] = misc['wordtoix']
      cp_pred['ixtoword'] = misc['ixtoword']
      cp_pred['algorithm'] = params['generator']
      cp_pred['outdir'] = params['outdir']

      if is_last_iter:
        scores = eval_sentence_predictions.run(cp_pred)
        csv_val_out.writerow([it, max_iters, dt, epoch, val_ppl2, scores[0], scores[1], scores[2],
                              scores[3], scores[4], scores[5], scores[6]])
        csv_val_file.flush()
        omail.send('job finished' + params['generator'], 'done')

      # abort training if the perplexity is no good
      min_ppl_or_abort = params['min_ppl_or_abort']
      if val_ppl2 > min_ppl_or_abort and min_ppl_or_abort > 0:
        print 'aborting job because validation perplexity %f > %f' % (val_ppl2, min_ppl_or_abort)
        abort = True # abort the job

      write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
      if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
        if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
          # if we beat a previous record or if this is the first time
          # AND we also beat the user-defined threshold or it doesnt exist
          top_val_ppl2 = val_ppl2
          filename = 'model_%s_checkpoint_%s_%s_%s_%.2f.p' % (params['generator'], dataset, host, params['fappend'], val_ppl2)
          filepath = os.path.join(params['outdir'], filename)
          checkpoint = {}
          checkpoint['it'] = it
          checkpoint['epoch'] = epoch
          checkpoint['model'] = model
          checkpoint['params'] = params
          checkpoint['perplexity'] = val_ppl2
          checkpoint['wordtoix'] = misc['wordtoix']
          checkpoint['ixtoword'] = misc['ixtoword']
          checkpoint['algorithm'] = params['generator']
          checkpoint['outdir'] = params['outdir']
          try:
            pickle.dump(checkpoint, open(filepath, "wb"))
            print 'saved checkpoint in %s' % (filepath, )
          except Exception, e: # todo be more clever here
            print 'tried to write checkpoint into %s but got error: ' % (filepath, )
            print e
          scores = eval_sentence_predictions.run(checkpoint)
          csv_val_out.writerow([it, max_iters, dt, epoch, val_ppl2, scores[0], scores[1], scores[2],
                                scores[3], scores[4], scores[5], scores[6]])
          csv_val_file.flush()
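# The checkpoints written above are plain pickles, so reloading one for inference is
# straightforward. A hedged sketch (the filename is illustrative; the keys match what
# the training loop stores):
import pickle

cv = pickle.load(open('model_lstm_checkpoint_flickr8k_somehost_run1_15.20.p', 'rb'))
model = cv['model']        # parameter matrices
wordtoix = cv['wordtoix']  # vocab mappings saved alongside the weights
ixtoword = cv['ixtoword']
print 'checkpoint from iter %d, epoch %.2f, val ppl2 %.2f' % (cv['it'], cv['epoch'], cv['perplexity'])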
def main(params):
  batch_size = params['batch_size']
  dataset = params['dataset']  # name of the dataset: flickr8k, flickr30k, ...
  word_count_threshold = params['word_count_threshold']
  do_grad_check = params['do_grad_check']
  max_epochs = params['max_epochs']
  host = socket.gethostname()  # get computer hostname

  # fetch the data provider
  dp = getDataProvider(dataset)
  completeData = dp.getData('train')

  misc = {}  # stores various misc items that need to be passed around the framework

  # go over all training sentences and find the vocabulary we want to use,
  # i.e. the words that occur at least word_count_threshold number of times
  #print 'dp.iterSentences', dp.iterSentences('train')
  misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(
      dp.iterSentences('train'), word_count_threshold)
  #printWordEmbedding(dp.iterSentences('train'), misc['wordtoix'])

  # calculate weights of all unique words in the vocabulary
  weightComputedData = calculateWeights(misc['wordtoix'], misc['ixtoword'], completeData)
  weightCalculationMethodSec()
  weightComputedData = getWeightsMethod2()
  print 'Done.'

  # delegate the initialization of the model to the Generator class
  BatchGenerator = GenericBatchGenerator()  # decodeGenerator(params)
  # initialize encoder and decoder weight matrices
  init_struct = BatchGenerator.init(params, misc)
  model, misc['update'], misc['regularize'] = (init_struct['model'],
                                               init_struct['update'],
                                               init_struct['regularize'])

  # force overwrite here. This is a bit of a hack, not happy about it
  model['bd'] = bias_init_vector.reshape(1, bias_init_vector.size)

  print 'model init done.'
  print 'model has keys: ' + ', '.join(model.keys())
  print 'updating: ' + ', '.join('%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1])
                                 for k in misc['update'])
  print 'regularizing: ' + ', '.join('%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1])
                                     for k in misc['regularize'])
  print 'number of learnable parameters total: %d' % (
      sum(model[k].shape[0] * model[k].shape[1] for k in misc['update']), )

  if params.get('init_model_from', ''):
    # load checkpoint
    checkpoint = pickle.load(open(params['init_model_from'], 'rb'))
    model = checkpoint['model']  # overwrite the model

  # initialize the Solver and the cost function
  solver = Solver()

  def costfun(batch, model):
    # wrap the cost function to abstract some things away from the Solver
    return RNNGenCost(batch, model, params, misc, weightComputedData)

  # calculate how many iterations we need
  num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
  num_iters_one_epoch = num_sentences_total / batch_size
  max_iters = max_epochs * num_iters_one_epoch
  eval_period_in_epochs = params['eval_period']
  eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs))
  abort = False
  top_val_ppl2 = -1
  smooth_train_ppl2 = len(misc['ixtoword'])  # initially the size of the dictionary of confusion
  val_ppl2 = len(misc['ixtoword'])
  last_status_write_time = 0  # for writing worker job status reports
  json_worker_status = {}
  json_worker_status['params'] = params
  json_worker_status['history'] = []

  for it in xrange(max_iters):
    if abort:
      break
    t0 = time.time()
    # fetch a batch of data
    batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)]
    # evaluate cost, gradient and perform parameter update
    step_struct = solver.step(batch, model, costfun, **params)
    cost = step_struct['cost']
    dt = time.time() - t0

    # print training statistics
    #train_ppl2 = step_struct['stats']['ppl2']
    #if it == 0: smooth_train_ppl2 = train_ppl2  # start out where we start out
    epoch = it * 1.0 / num_iters_one_epoch
    print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f' \
          % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'])

    # detect if loss is exploding and kill the job if so
    total_cost = cost['total_cost']
    if it == 0:
      total_cost0 = total_cost
    if total_cost > total_cost0 * 2:
      print 'Aborting, cost seems to be exploding.'
      abort = True

    # save a checkpoint on the very last iteration
    if (it + 1) == max_iters:
      top_val_ppl2 = val_ppl2
      filename = 'model_checkpoint_%s_%s_%s_%.2f.p' % (dataset, host, params['fappend'], val_ppl2)
      filepath = os.path.join(params['checkpoint_output_directory'], filename)
      checkpoint = {}
      checkpoint['it'] = it
      checkpoint['epoch'] = epoch
      checkpoint['model'] = model
      checkpoint['params'] = params
      checkpoint['perplexity'] = val_ppl2
      checkpoint['wordtoix'] = misc['wordtoix']
      checkpoint['ixtoword'] = misc['ixtoword']
      try:
        pickle.dump(checkpoint, open(filepath, "wb"))
        print 'saved checkpoint in %s' % (filepath, )
      except Exception, e:
        print 'tried to write checkpoint into %s but got error: ' % (filepath, )
        print e
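# The smoothed perplexity tracked throughout these training loops is a plain
# exponential moving average with alpha = 0.01 (the 0.99 / 0.01 update used in
# the other variants of this loop). A self-contained sketch of the recurrence;
# the sample values below are made up for illustration:
def ema(values, alpha=0.01):
  smooth = None
  out = []
  for v in values:
    # the first sample initializes the average ("start out where we start out")
    smooth = v if smooth is None else (1.0 - alpha) * smooth + alpha * v
    out.append(smooth)
  return out

print ema([100.0, 90.0, 80.0])  # -> [100.0, 99.9, 99.701]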
def main(params, split):
  batch_size = params['batch_size']
  dataset = params['dataset']
  feature_file = params['feature_file']
  class_count_threshold = params['class_count_threshold']
  do_grad_check = params['do_grad_check']
  max_epochs = params['max_epochs']
  host = socket.gethostname()  # get computer hostname
  json_file = 'dataset_mmdb_book_fps_30_samplesize_25_split_%d.json' % (split)

  # fetch the data provider
  dp = getDataProvider(dataset, feature_file, json_file)

  misc = {}  # stores various misc items that need to be passed around the framework

  # go over all training classes and find the vocabulary we want to use,
  # i.e. the classes that occur at least class_count_threshold number of times
  misc['classtoix'], misc['ixtoclass'], bias_init_vector = preProBuildWordVocab(
      dp.iterSentences('train'), class_count_threshold)

  # delegate the initialization of the model to the Generator class
  BatchGenerator = decodeGenerator(params)
  init_struct = BatchGenerator.init(params, misc)
  model, misc['update'], misc['regularize'] = (init_struct['model'],
                                               init_struct['update'],
                                               init_struct['regularize'])

  # force overwrite here. This is a bit of a hack, not happy about it
  model['bd'] = bias_init_vector.reshape(1, bias_init_vector.size)

  print 'model init done.'
  print 'model has keys: ' + ', '.join(model.keys())
  print 'updating: ' + ', '.join('%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1])
                                 for k in misc['update'])
  print 'regularizing: ' + ', '.join('%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1])
                                     for k in misc['regularize'])
  print 'number of learnable parameters total: %d' % (
      sum(model[k].shape[0] * model[k].shape[1] for k in misc['update']), )

  if params.get('init_model_from', ''):
    # load checkpoint
    checkpoint = pickle.load(open(params['init_model_from'], 'rb'))
    model = checkpoint['model']  # overwrite the model

  # initialize the Solver and the cost function
  solver = Solver()

  def costfun(batch, model):
    # wrap the cost function to abstract some things away from the Solver
    return RNNGenCost(batch, model, params, misc)

  # calculate how many iterations we need
  num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
  num_iters_one_epoch = num_sentences_total / batch_size
  max_iters = max_epochs * num_iters_one_epoch
  eval_period_in_epochs = params['eval_period']
  eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs))
  abort = False
  top_val_ppl2 = -1
  smooth_train_ppl2 = len(misc['ixtoclass'])  # initially the size of the dictionary of confusion
  val_ppl2 = len(misc['ixtoclass'])
  last_status_write_time = 0  # for writing worker job status reports
  json_worker_status = {}
  json_worker_status['params'] = params
  json_worker_status['history'] = []
  lastsavedcheckpoint = ''

  for it in xrange(max_iters):
    if abort:
      break
    t0 = time.time()
    # fetch a batch of data
    batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)]
    # evaluate cost, gradient and perform parameter update
    step_struct = solver.step(batch, model, costfun, **params)
    cost = step_struct['cost']
    dt = time.time() - t0

    # print training statistics
    train_ppl2 = step_struct['stats']['ppl2']
    smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2  # exponentially decaying moving average
    if it == 0:
      smooth_train_ppl2 = train_ppl2  # start out where we start out
    epoch = it * 1.0 / num_iters_one_epoch
    print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \
          % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'], train_ppl2, smooth_train_ppl2)
    print 'last saved checkpoint in %s' % (lastsavedcheckpoint, )

    # perform gradient check if desired, with a bit of a burn-in time (10 iterations)
    if it == 10 and do_grad_check:
      print 'disabling dropout for gradient check...'
      params['drop_prob_encoder'] = 0
      params['drop_prob_decoder'] = 0
      solver.gradCheck(batch, model, costfun)
      print 'done gradcheck, exiting.'
      sys.exit()

    # detect if loss is exploding and kill the job if so
    total_cost = cost['total_cost']
    if it == 0:
      total_cost0 = total_cost  # store this initial cost
    if total_cost > total_cost0 * 2:
      print 'Aborting, cost seems to be exploding. Run gradcheck? Lower the learning rate?'
      abort = True  # set the abort flag, we'll break out

    # logging: write JSON files for visual inspection of the training
    tnow = time.time()
    if tnow > last_status_write_time + 60 * 1:  # every now and then lets write a report
      last_status_write_time = tnow
      jstatus = {}
      jstatus['time'] = datetime.datetime.now().isoformat()
      jstatus['iter'] = (it, max_iters)
      jstatus['epoch'] = (epoch, max_epochs)
      jstatus['time_per_batch'] = dt
      jstatus['smooth_train_ppl2'] = smooth_train_ppl2
      jstatus['val_ppl2'] = val_ppl2  # just write the last available one
      jstatus['train_ppl2'] = train_ppl2
      json_worker_status['history'].append(jstatus)
      status_file = os.path.join(params['worker_status_output_directory'], host + '_status.json')
      try:
        json.dump(json_worker_status, open(status_file, 'w'))
      except Exception, e:  # todo: be more clever here
        print 'tried to write worker status into %s but got error:' % (status_file, )
        print e

    # perform perplexity evaluation on the validation set and save a model checkpoint if it's good
    is_last_iter = (it + 1) == max_iters
    if (((it + 1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
      val_ppl2 = eval_split('val', dp, model, params, misc)  # perform the evaluation on VAL set
      print 'validation perplexity = %f' % (val_ppl2, )

      # abort training if the perplexity is no good
      min_ppl_or_abort = params['min_ppl_or_abort']
      if val_ppl2 > min_ppl_or_abort and min_ppl_or_abort > 0:
        print 'aborting job because validation perplexity %f > %f' % (val_ppl2, min_ppl_or_abort)
        abort = True  # abort the job

      write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
      if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
        if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
          # if we beat a previous record or if this is the first time,
          # AND we also beat the user-defined threshold or it doesn't exist
          top_val_ppl2 = val_ppl2
          filename = 'model_checkpoint_%s_%s_%s_alpha_%2.2f_beta_%2.2f_split_%d.p' % (
              dataset, host, params['fappend'], params['alpha'], params['beta'], split)
          filepath = os.path.join(params['checkpoint_output_directory'], filename)
          checkpoint = {}
          checkpoint['it'] = it
          checkpoint['epoch'] = epoch
          checkpoint['model'] = model
          checkpoint['params'] = params
          checkpoint['perplexity'] = val_ppl2
          checkpoint['classtoix'] = misc['classtoix']
          checkpoint['ixtoclass'] = misc['ixtoclass']
          checkpoint['json_file'] = json_file
          try:
            if not (params['fappend'] == 'test'):
              pickle.dump(checkpoint, open(filepath, "wb"))
              print 'saved checkpoint in %s' % (filepath, )
              lastsavedcheckpoint = filepath
          except Exception, e:  # todo: be more clever here
            print 'tried to write checkpoint into %s but got error: ' % (filepath, )
            print e
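# solver.gradCheck is implemented elsewhere in the framework. As a generic
# illustration of the technique (an assumption about what such a check does,
# not the actual Solver code), a numerical gradient check compares analytic
# gradients against centered finite differences and reports the relative error:
import numpy as np

def grad_check_param(f, W, analytic_grad, h=1e-5, n_samples=5):
  # f() evaluates the scalar cost with the current value of W in place
  for _ in xrange(n_samples):
    ix = tuple(np.random.randint(d) for d in W.shape)
    old = W[ix]
    W[ix] = old + h
    fxph = f()
    W[ix] = old - h
    fxmh = f()
    W[ix] = old  # restore the original value
    num = (fxph - fxmh) / (2.0 * h)
    ana = analytic_grad[ix]
    rel_err = abs(num - ana) / max(abs(num) + abs(ana), 1e-20)
    print 'index %s: numerical %f, analytic %f, relative error %e' % (ix, num, ana, rel_err)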
def main(params): batch_size = params["batch_size"] word_count_threshold = params["word_count_threshold"] max_epochs = params["max_epochs"] host = socket.gethostname() # get computer hostname # fetch the data provider dp = getDataProvider(params) # Initialize the optimizer solver = Solver(params["solver"]) params["aux_inp_size"] = dp.aux_inp_size params["image_feat_size"] = dp.img_feat_size print "Image feature size is %d, and aux input size is %d" % (params["image_feat_size"], params["aux_inp_size"]) misc = {} # stores various misc items that need to be passed around the framework # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur # at least word_count_threshold number of times if params["class_out_factoring"] == 0: misc["wordtoix"], misc["ixtoword"], bias_init_vector = preProBuildWordVocab( dp.iterSentences("train"), word_count_threshold ) else: [misc["wordtoix"], misc["classes"]], [misc["ixtoword"], misc["clstotree"], misc["ixtoclsinfo"]], [ bias_init_vector, bias_init_inter_class, ] = preProBuildWordVocab(dp.iterSentences("train"), word_count_threshold, params) params["nClasses"] = bias_init_inter_class.shape[0] params["vocabulary_size"] = len(misc["wordtoix"]) params["output_size"] = len(misc["ixtoword"]) # these should match though print len(misc["wordtoix"]), len(misc["ixtoword"]) # This initializes the model parameters and does matrix initializations lstmGenerator = LSTMGenerator(params) model, misc["update"], misc["regularize"] = ( lstmGenerator.model_th, lstmGenerator.update_list, lstmGenerator.regularize, ) # force overwrite here. The bias to the softmax is initialized to reflect word frequencies # This is a bit of a hack, not happy about it model["bd"].set_value(bias_init_vector.astype(config.floatX)) if params["class_out_factoring"] == 1: model["bdCls"].set_value(bias_init_inter_class.astype(config.floatX)) # Define the computational graph for relating the input image features and word indices to the # log probability cost funtion. (use_dropout, inp_list, f_pred_prob, cost, predTh, updatesLSTM) = lstmGenerator.build_model(model, params) costGrad = cost[0] # Add class uncertainity to final cost # if params['class_out_factoring'] == 1: # costGrad += cost[2] # Add the regularization cost. Since this is specific to trainig and doesn't get included when we # evaluate the cost on test or validation data, we leave it here outside the model definition if params["regc"] > 0.0: reg_cost = theano.shared(numpy_floatX(0.0), name="reg_c") reg_c = tensor.as_tensor_variable(numpy_floatX(params["regc"]), name="reg_c") reg_cost = 0.0 for p in misc["regularize"]: reg_cost += (model[p] ** 2).sum() reg_cost *= 0.5 * reg_c costGrad += reg_cost / params["batch_size"] # Compile an evaluation function.. Doesn't include gradients # To be used for validation set evaluation f_eval = theano.function(inp_list, cost, name="f_eval") # Now let's build a gradient computation graph and rmsprop update mechanism grads = tensor.grad(costGrad, wrt=model.values()) lr = tensor.scalar(name="lr", dtype=config.floatX) f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(lr, model, grads, inp_list, cost, params) print "model init done." 
print "model has keys: " + ", ".join(model.keys()) # print 'updating: ' + ', '.join( '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['update']) # print 'updating: ' + ', '.join( '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['regularize']) # print 'number of learnable parameters total: %d' % (sum(model[k].shape[0] * model[k].shape[1] for k in misc['update']), ) # calculate how many iterations we need, One epoch is considered once going through all the sentences and not images # Hence in case of coco/flickr this will 5* no of images num_sentences_total = dp.getSplitSize("train", ofwhat="sentences") num_iters_one_epoch = num_sentences_total / batch_size max_iters = max_epochs * num_iters_one_epoch eval_period_in_epochs = params["eval_period"] eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs)) top_val_ppl2 = -1 smooth_train_ppl2 = len(misc["ixtoword"]) # initially size of dictionary of confusion val_ppl2 = len(misc["ixtoword"]) last_status_write_time = 0 # for writing worker job status reports json_worker_status = {} json_worker_status["params"] = params json_worker_status["history"] = [] len_hist = defaultdict(int) ## Initialize the model parameters from the checkpoint file if we are resuming training if params["checkpoint_file_name"] != "None": zipp(model_init_from, model) zipp(rg_init, rg) print ( "\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" % (checkpoint_init["epoch"], checkpoint_init["perplexity"]) ) for it in xrange(max_iters): t0 = time.time() # fetch a batch of data if params["sample_by_len"] == 0: batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)] else: batch, l = dp.getRandBatchByLen(batch_size) len_hist[l] += 1 if params["use_pos_tag"] != "None": real_inp_list, lenS = prepare_data( batch, misc["wordtoix"], params["maxlen"], sentTagMap, misc["ixtoword"], rev_sents=params["reverse_sentence"], ) else: real_inp_list, lenS = prepare_data( batch, misc["wordtoix"], params["maxlen"], rev_sents=params["reverse_sentence"] ) # Enable using dropout in training use_dropout.set_value(float(params["use_dropout"])) epoch = it * 1.0 / num_iters_one_epoch if params["sched_sampling_mode"] != None: real_inp_list.append(epoch) # evaluate cost, gradient and perform parameter update cost = f_grad_shared(*real_inp_list) f_update(params["learning_rate"]) dt = time.time() - t0 # print training statistics train_ppl2 = 2 ** (cost[1] / lenS) # step_struct['stats']['ppl2'] smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2 # smooth exponentially decaying moving average if it == 0: smooth_train_ppl2 = train_ppl2 # start out where we start out total_cost = cost[0] # print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \ # % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'], \ # train_ppl2, smooth_train_ppl2) tnow = time.time() if tnow > last_status_write_time + 60 * 1: # every now and then lets write a report print "%d/%d batch done in %.3fs. at epoch %.2f. 
Cost now is %.3f and pplx is %.3f" % ( it, max_iters, dt, epoch, total_cost, train_ppl2, ) last_status_write_time = tnow jstatus = {} jstatus["time"] = datetime.datetime.now().isoformat() jstatus["iter"] = (it, max_iters) jstatus["epoch"] = (epoch, max_epochs) jstatus["time_per_batch"] = dt jstatus["smooth_train_ppl2"] = smooth_train_ppl2 jstatus["val_ppl2"] = val_ppl2 # just write the last available one jstatus["train_ppl2"] = train_ppl2 # if params['class_out_factoring'] == 1: # jstatus['class_cost'] = float(cost[2]) json_worker_status["history"].append(jstatus) status_file = os.path.join(params["worker_status_output_directory"], host + "_status.json") # import pdb; pdb.set_trace() try: json.dump(json_worker_status, open(status_file, "w")) except Exception, e: # todo be more clever here print "tried to write worker status into %s but got error:" % (status_file,) print e ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good is_last_iter = (it + 1) == max_iters if (((it + 1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter: # Disable using dropout in validation use_dropout.set_value(0.0) val_ppl2 = eval_split_theano("val", dp, model, params, misc, f_eval) # perform the evaluation on VAL set if epoch - params["lr_decay_st_epoch"] >= 0: params["learning_rate"] = params["learning_rate"] * params["lr_decay"] params["lr_decay_st_epoch"] += 1 print "validation perplexity = %f, lr = %f" % (val_ppl2, params["learning_rate"]) if params["sample_by_len"] == 1: print len_hist write_checkpoint_ppl_threshold = params["write_checkpoint_ppl_threshold"] if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0: if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0: # if we beat a previous record or if this is the first time # AND we also beat the user-defined threshold or it doesnt exist top_val_ppl2 = val_ppl2 filename = "model_checkpoint_%s_%s_%s_%.2f.p" % ( params["dataset"], host, params["fappend"], val_ppl2, ) filepath = os.path.join(params["checkpoint_output_directory"], filename) model_npy = unzip(model) rgrads_npy = unzip(rg) checkpoint = {} checkpoint["it"] = it checkpoint["epoch"] = epoch checkpoint["model"] = model_npy checkpoint["rgrads"] = rgrads_npy checkpoint["params"] = params checkpoint["perplexity"] = val_ppl2 checkpoint["misc"] = misc try: pickle.dump(checkpoint, open(filepath, "wb")) print "saved checkpoint in %s" % (filepath,) except Exception, e: # todo be more clever here print "tried to write checkpoint into %s but got error: " % (filepath,) print e
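# solver.build_solver_model compiles the rmsprop update in Theano (the
# f_grad_shared / f_update pair above). For reference, the numpy equivalent of
# one rmsprop step; the decay and epsilon values are illustrative assumptions,
# not read from this codebase:
import numpy as np

def rmsprop_step(w, grad, cache, lr=1e-3, decay=0.99, eps=1e-8):
  cache[:] = decay * cache + (1.0 - decay) * grad ** 2  # running average of squared gradients
  w -= lr * grad / (np.sqrt(cache) + eps)               # per-parameter scaled step
  return w, cache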
def main(params):
  batch_size = params["batch_size"]
  dataset = params["dataset"]
  word_count_threshold = params["word_count_threshold"]
  do_grad_check = params["do_grad_check"]
  max_epochs = params["max_epochs"]
  host = socket.gethostname()  # get computer hostname

  # fetch the data provider
  dp = getDataProvider(dataset)

  misc = {}  # stores various misc items that need to be passed around the framework

  # go over all training sentences and find the vocabulary we want to use,
  # i.e. the words that occur at least word_count_threshold number of times
  misc["wordtoix"], misc["ixtoword"], bias_init_vector = preProBuildWordVocab(
      dp.iterSentences("train"), word_count_threshold)

  # delegate the initialization of the model to the Generator class
  BatchGenerator = decodeGenerator(params)
  init_struct = BatchGenerator.init(params, misc)
  model, misc["update"], misc["regularize"] = (init_struct["model"],
                                               init_struct["update"],
                                               init_struct["regularize"])

  # force overwrite here. This is a bit of a hack, not happy about it
  model["bd"] = bias_init_vector.reshape(1, bias_init_vector.size)

  print "model init done."
  print "model has keys: " + ", ".join(model.keys())
  print "updating: " + ", ".join("%s [%dx%d]" % (k, model[k].shape[0], model[k].shape[1]) for k in misc["update"])
  print "regularizing: " + ", ".join("%s [%dx%d]" % (k, model[k].shape[0], model[k].shape[1]) for k in misc["regularize"])
  print "number of learnable parameters total: %d" % (
      sum(model[k].shape[0] * model[k].shape[1] for k in misc["update"]), )

  if params.get("init_model_from", ""):
    # load checkpoint
    checkpoint = pickle.load(open(params["init_model_from"], "rb"))
    model = checkpoint["model"]  # overwrite the model
    print checkpoint["model"]

  # initialize the Solver and the cost function
  solver = Solver()

  def costfun(batch, model):
    # wrap the cost function to abstract some things away from the Solver
    return RNNGenCost(batch, model, params, misc)

  # calculate how many iterations we need
  num_sentences_total = dp.getSplitSize("train", ofwhat="sentences")
  num_iters_one_epoch = num_sentences_total / batch_size
  max_iters = max_epochs * num_iters_one_epoch
  eval_period_in_epochs = params["eval_period"]
  eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs))
  abort = False
  top_val_ppl2 = -1
  smooth_train_ppl2 = len(misc["ixtoword"])  # initially the size of the dictionary of confusion
  val_ppl2 = len(misc["ixtoword"])
  last_status_write_time = 0  # for writing worker job status reports
  json_worker_status = {}
  json_worker_status["params"] = params
  json_worker_status["history"] = []

  for it in xrange(max_iters):
    if abort:
      break
    t0 = time.time()
    # fetch a batch of data
    batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)]
    # evaluate cost, gradient and perform parameter update
    step_struct = solver.step(batch, model, costfun, **params)
    cost = step_struct["cost"]
    dt = time.time() - t0

    # print training statistics
    train_ppl2 = step_struct["stats"]["ppl2"]
    smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2  # exponentially decaying moving average
    if it == 0:
      smooth_train_ppl2 = train_ppl2  # start out where we start out
    epoch = it * 1.0 / num_iters_one_epoch
    print "%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)" % (
        it, max_iters, dt, epoch, cost["loss_cost"], cost["reg_cost"], train_ppl2, smooth_train_ppl2)

    # perform gradient check if desired, with a bit of a burn-in time (10 iterations)
    if it == 10 and do_grad_check:
      print "disabling dropout for gradient check..."
      params["drop_prob_encoder"] = 0
      params["drop_prob_decoder"] = 0
      solver.gradCheck(batch, model, costfun)
      print "done gradcheck, exiting."
      sys.exit()

    # detect if loss is exploding and kill the job if so
    total_cost = cost["total_cost"]
    if it == 0:
      total_cost0 = total_cost  # store this initial cost
    if total_cost > total_cost0 * 2:
      print "Aborting, cost seems to be exploding. Run gradcheck? Lower the learning rate?"
      abort = True  # set the abort flag, we'll break out

    # logging: write JSON files for visual inspection of the training
    tnow = time.time()
    if tnow > last_status_write_time + 60 * 1:  # every now and then lets write a report
      last_status_write_time = tnow
      jstatus = {}
      jstatus["time"] = datetime.datetime.now().isoformat()
      jstatus["iter"] = (it, max_iters)
      jstatus["epoch"] = (epoch, max_epochs)
      jstatus["time_per_batch"] = dt
      jstatus["smooth_train_ppl2"] = smooth_train_ppl2
      jstatus["val_ppl2"] = val_ppl2  # just write the last available one
      jstatus["train_ppl2"] = train_ppl2
      json_worker_status["history"].append(jstatus)
      status_file = os.path.join(params["worker_status_output_directory"], host + "_status.json")
      try:
        json.dump(json_worker_status, open(status_file, "w"))
      except Exception, e:  # todo: be more clever here
        print "tried to write worker status into %s but got error:" % (status_file, )
        print e

    # perform perplexity evaluation on the validation set and save a model checkpoint if it's good
    is_last_iter = (it + 1) == max_iters
    if (((it + 1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
      val_ppl2 = eval_split("val", dp, model, params, misc)  # perform the evaluation on VAL set
      print "validation perplexity = %f" % (val_ppl2, )

      # abort training if the perplexity is no good
      min_ppl_or_abort = params["min_ppl_or_abort"]
      if val_ppl2 > min_ppl_or_abort and min_ppl_or_abort > 0:
        print "aborting job because validation perplexity %f > %f" % (val_ppl2, min_ppl_or_abort)
        abort = True  # abort the job

      write_checkpoint_ppl_threshold = params["write_checkpoint_ppl_threshold"]
      if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
        if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
          # if we beat a previous record or if this is the first time,
          # AND we also beat the user-defined threshold or it doesn't exist
          top_val_ppl2 = val_ppl2
          filename = "model_checkpoint_%s_%s_%s_%.2f.p" % (dataset, host, params["fappend"], val_ppl2)
          filepath = os.path.join(params["checkpoint_output_directory"], filename)
          checkpoint = {}
          checkpoint["it"] = it
          checkpoint["epoch"] = epoch
          checkpoint["model"] = model
          checkpoint["params"] = params
          checkpoint["perplexity"] = val_ppl2
          checkpoint["wordtoix"] = misc["wordtoix"]
          checkpoint["ixtoword"] = misc["ixtoword"]
          try:
            pickle.dump(checkpoint, open(filepath, "wb"))
            print "saved checkpoint in %s" % (filepath, )
          except Exception, e:  # todo: be more clever here
            print "tried to write checkpoint into %s but got error: " % (filepath, )
            print e
def main(params):
  batch_size = params['batch_size']
  word_count_threshold = params['word_count_threshold']
  max_epochs = params['max_epochs']
  host = socket.gethostname()  # get computer hostname

  # fetch the data provider
  dp = getDataProvider(params)

  # initialize the optimizer
  solver = Solver(params['solver'])

  params['aux_inp_size'] = dp.aux_inp_size
  params['image_feat_size'] = dp.img_feat_size
  print 'Image feature size is %d, and aux input size is %d' % (params['image_feat_size'], params['aux_inp_size'])

  misc = {}  # stores various misc items that need to be passed around the framework

  # go over all training sentences and find the vocabulary we want to use,
  # i.e. the words that occur at least word_count_threshold number of times
  misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(
      dp.iterSentences('train'), word_count_threshold)
  params['vocabulary_size'] = len(misc['wordtoix'])
  params['output_size'] = len(misc['ixtoword'])  # these should match though
  params['use_dropout'] = 1

  # This initializes the generator parameters and does matrix initializations
  generator = decodeGenerator(params)
  (gen_inp_list, predLogProb, predIdx, predCand, wOut_emb, updatesLstm) = \
      generator.build_prediction_model(generator.model_th, params, params['beam_size'])
  wOut_emb = wOut_emb.reshape([wOut_emb.shape[0], wOut_emb.shape[2]])
  f_gen_only = theano.function(gen_inp_list, [predLogProb, predIdx, wOut_emb],
                               name='f_pred', updates=updatesLstm)
  modelGen = generator.model_th
  upListGen = generator.update_list

  if params['share_Wemb']:
    evaluator = decodeEvaluator(params, modelGen['Wemb'])
  else:
    evaluator = decodeEvaluator(params)
  modelEval = evaluator.model_th

  # Define the computational graph relating the input image features and word
  # indices to the log probability cost function.
  (use_dropout_eval, eval_inp_list, f_pred_fns, costs, predTh, modelEval) = \
      evaluator.build_advers_eval(modelEval, params, gen_inp_list, wOut_emb)

  # combine the evaluator and generator input lists
  comb_inp_list = eval_inp_list
  for inp in gen_inp_list:
    if inp not in comb_inp_list:
      comb_inp_list.append(inp)

  # Compile an evaluation function. Doesn't include gradients;
  # to be used for validation set evaluation.
  f_eval = theano.function(comb_inp_list, costs, name='f_eval', updates=updatesLstm)

  # Now let's build a gradient computation graph and rmsprop update mechanism
  if params['share_Wemb']:
    modelEval.pop('Wemb')
  if params['fix_Wemb']:
    upListGen.remove('Wemb')
  modelGenUpD = OrderedDict()
  for k in upListGen:
    modelGenUpD[k] = modelGen[k]
  gradsEval = tensor.grad(costs[0], wrt=modelEval.values(), add_names=True)
  gradsGen = tensor.grad(costs[1], wrt=modelGenUpD.values(), add_names=True)
  lrEval = tensor.scalar(name='lrEval', dtype=config.floatX)
  f_grad_comp_eval, f_param_update_eval, zg_eval, rg_eval, ud_eval = \
      solver.build_solver_model(lrEval, modelEval, gradsEval, comb_inp_list, costs[0], params)
  lrGen = tensor.scalar(name='lrGen', dtype=config.floatX)
  f_grad_comp_gen, f_param_update_gen, zg_gen, rg_gen, ud_gen = \
      solver.build_solver_model(lrGen, modelGenUpD, gradsGen, comb_inp_list, costs[1], params)
  print 'model init done.'
  print 'model has keys: ' + ', '.join(modelGen.keys())

  # calculate how many iterations we need; one epoch is considered one pass over all the images
  num_sentences_total = dp.getSplitSize('train', ofwhat='images')
  num_iters_one_epoch = num_sentences_total / batch_size
  max_iters = max_epochs * num_iters_one_epoch
  iters_eval = num_iters_one_epoch // 2
  iters_gen = num_iters_one_epoch // 4
  eval_period_in_epochs = params['eval_period']
  eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs))
  top_val_ppl2 = -1
  smooth_train_ppl2 = 0.5  # initial value for the smoothed training perplexity
  val_ppl2 = len(misc['ixtoword'])
  last_status_write_time = 0  # for writing worker job status reports
  json_worker_status = {}
  json_worker_status['params'] = params
  json_worker_status['history'] = []
  len_hist = defaultdict(int)
  t_print_sec = 60

  ## Initialize the model parameters from the checkpoint file if we are resuming training
  # (model_init_from and checkpoint_init are populated by the checkpoint loading code elsewhere)
  if params['checkpoint_file_name'] != 'None':
    zipp(model_init_from, modelGen)
    #zipp(rg_init, rgGen)
    print("\nContinuing training from previous model.\nAlready run for %0.2f epochs with validation perplx at %0.3f\n"
          % (checkpoint_init['epoch'], checkpoint_init['perplexity']))

  pos_samp = np.arange(batch_size, dtype=np.int32)
  print batch_size

  ##############################################################
  # Define a signal handler to catch Ctrl+C or kills, so that we can save the
  # model trained so far before exiting
  def signal_handler(signum, frame):
    print('You pressed Ctrl+C! Saving checkpoint now before exiting!')
    filename = 'advmodel_checkpoint_%s_%s_%s_%.2f_INT.p' % (params['dataset'], host, params['fappend'], val_ppl2)
    dumpCheckpoint(filename, params, modelGen, modelEval, misc, it, val_ppl2)
    sys.exit(0)
  signal.signal(signal.SIGINT, signal_handler)
  ##############################################################

  for it in xrange(max_epochs):
    epoch = it * 1.0 / num_iters_one_epoch

    # enable dropout while training the evaluator
    use_dropout_eval.set_value(1.)
    for it2 in xrange(iters_eval):
      t0 = time.time()
      # fetch a batch of data
      batch, _ = dp.sampPosNegSentSamps(params['eval_batch_size'] - params['rand_negs'])
      real_inp_list, lenS = prepare_data(batch, misc['wordtoix'], maxlen=params['maxlen'],
                                         pos_samp=pos_samp, prep_for=params['eval_model'],
                                         rand_negs=params['rand_negs'])
      # evaluate cost, gradient and perform parameter update
      cost = f_grad_comp_eval(*real_inp_list)
      f_param_update_eval(params['learning_rate_eval'])
      dt = time.time() - t0

      # track training statistics
      train_ppl2 = (np.e ** (-cost))
      smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2  # exponentially decaying moving average
      if it2 == 0:
        smooth_train_ppl2 = train_ppl2
      if it2 == 0:
        smooth_train_cost = cost
      else:
        smooth_train_cost = 0.99 * smooth_train_cost + 0.01 * cost
      tnow = time.time()
      if tnow > last_status_write_time + t_print_sec * 1:  # every now and then lets write a report
        print 'Eval Cnn in epoch %d: %d/%d sample done in %.3fs. Cost now is %.3f Pplx is %.3f' \
              % (it, it2, iters_eval, dt, smooth_train_cost, smooth_train_ppl2)
        last_status_write_time = tnow

    print 'Done training the discriminative model for now. Switching to the generative model.'
    print 'Eval N/W in epoch %d: Cost now is %.3f Pplx is %.3f' % (it, smooth_train_cost, smooth_train_ppl2)
    filename = 'advmodel_checkpoint_%s_%s_%s_%d_%.2f_EVOnly.p' % (params['dataset'], host, params['fappend'], it, smooth_train_ppl2)
    dumpCheckpoint(filename, params, modelGen, modelEval, misc, it, val_ppl2)

    # disable CNN dropout while training the generator network
    use_dropout_eval.set_value(0.)
    for it2 in xrange(iters_gen):
      t0 = time.time()
      # fetch a batch of data
      batch, _ = dp.sampPosNegSentSamps(params['eval_batch_size'] - params['rand_negs'])
      real_inp_list, lenS = prepare_data(batch, misc['wordtoix'], maxlen=params['maxlen'],
                                         pos_samp=pos_samp, prep_for=params['eval_model'],
                                         rand_negs=params['rand_negs'])
      # evaluate cost, gradient and perform parameter update
      #if any([np.isnan(modelGen[m].get_value()).any() for m in modelGen]):
      #  print 'NaN detected in generator parameters!'
      #  break
      cost = f_grad_comp_gen(*real_inp_list)
      f_param_update_gen(params['learning_rate_gen'])
      dt = time.time() - t0

      # print training statistics
      train_ppl2 = (np.e ** (-cost))
      smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2  # exponentially decaying moving average
      if it2 == 0:
        smooth_train_ppl2 = train_ppl2
      if it2 == 0:
        smooth_train_cost = cost
      else:
        smooth_train_cost = 0.99 * smooth_train_cost + 0.01 * cost
      tnow = time.time()
      if tnow > last_status_write_time + t_print_sec * 1:  # every now and then lets write a report
        print 'Gen Lstm in epoch %d: %d/%d sample done in %.3fs. Cost now is %.3f Pplx is %.3f' \
              % (it, it2, iters_gen, dt, smooth_train_cost, smooth_train_ppl2)
        last_status_write_time = tnow

    print 'Done training the generative model for now. Switching to the discriminative model. Final stats are:'
    print 'Gen Lstm in epoch %d: Cost now is %.3f Pplx is %.3f' % (it, smooth_train_cost, smooth_train_ppl2)

    ## save a model checkpoint if the score is good
    is_last_iter = (it + 1) == max_iters
    is_last_iter = 1  # force checkpointing at the end of every epoch
    if (((it + 1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
      # Validation on the VAL set is currently disabled here; the smoothed
      # training perplexity is used as a stand-in score.
      #use_dropout.set_value(0.)
      #val_ppl2 = eval_split_theano('val', dp, model, params, misc, f_eval)  # perform the evaluation on VAL set
      #if it - params['lr_decay_st_epoch'] >= 0:
      #  params['learning_rate'] = params['learning_rate'] * params['lr_decay']
      #  params['lr_decay_st_epoch'] += 1
      #print 'validation perplexity = %f, lr = %f' % (val_ppl2, params['learning_rate'])
      #if params['sample_by_len'] == 1:
      #  print len_hist
      val_ppl2 = smooth_train_ppl2
      write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
      if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
        if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
          # if we beat a previous record or if this is the first time,
          # AND we also beat the user-defined threshold or it doesn't exist
          #top_val_ppl2 = val_ppl2
          filename = 'advmodel_checkpoint_%s_%s_%s_%d_%.2f_GenDone.p' % (
              params['dataset'], host, params['fappend'], it, smooth_train_ppl2)
          dumpCheckpoint(filename, params, modelGen, modelEval, misc, it, val_ppl2)
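# The SIGINT handler above closes over the training loop's `it` and `val_ppl2`.
# The same pattern in a standalone form, as a sketch (dump_state is a
# hypothetical callback standing in for dumpCheckpoint):
import signal
import sys

def install_sigint_checkpoint(dump_state):
  def handler(signum, frame):
    print 'caught Ctrl+C, saving a checkpoint before exiting'
    dump_state()
    sys.exit(0)
  signal.signal(signal.SIGINT, handler)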
def main(params):
  batch_size = params['batch_size']
  word_count_threshold = params['word_count_threshold']
  max_epochs = params['max_epochs']
  host = socket.gethostname()  # get computer hostname

  #--------------------------------- Init data provider and load data + features ---------------------------------#
  # fetch the data provider
  dp = getDataProvider(params)
  params['aux_inp_size'] = params['featenc_hidden_size'] * params['n_encgt_sent'] \
      if params['encode_gt_sentences'] else dp.aux_inp_size
  params['featenc_hidden_size'] = params['featenc_hidden_size'] \
      if params['encode_gt_sentences'] else params['aux_inp_size']
  params['image_feat_size'] = dp.img_feat_size
  print 'Image feature size is %d, and aux input size is %d' % (params['image_feat_size'], params['aux_inp_size'])

  #--------------------------------- Preprocess sentences and build vocabulary ---------------------------------#
  misc = {}  # stores various misc items that need to be passed around the framework

  # go over all training sentences and find the vocabulary we want to use,
  # i.e. the words that occur at least word_count_threshold number of times
  # (checkpoint_init is loaded by the resume logic elsewhere)
  if params['checkpoint_file_name'] == 'None':
    if params['class_out_factoring'] == 0:
      misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(
          dp.iterSentences('train'), word_count_threshold)
    else:
      [misc['wordtoix'], misc['classes']], \
      [misc['ixtoword'], misc['clstotree'], misc['ixtoclsinfo']], \
      [bias_init_vector, bias_init_inter_class] = preProBuildWordVocab(
          dp.iterSentences('train'), word_count_threshold, params)
      params['nClasses'] = bias_init_inter_class.shape[0]
      params['ixtoclsinfo'] = misc['ixtoclsinfo']
  else:
    misc = checkpoint_init['misc']
    params['nClasses'] = checkpoint_init['params']['nClasses']
    if 'ixtoclsinfo' in misc:
      params['ixtoclsinfo'] = misc['ixtoclsinfo']

  params['vocabulary_size'] = len(misc['wordtoix'])
  params['output_size'] = len(misc['ixtoword'])  # these should match though
  print len(misc['wordtoix']), len(misc['ixtoword'])

  #------------------------------ Initialize the solver/generator and build forward path ------------------------#
  # initialize the optimizer
  solver = Solver(params['solver'])
  # This initializes the model parameters and does matrix initializations
  lstmGenerator = decodeGenerator(params)
  model, misc['update'], misc['regularize'] = (lstmGenerator.model_th,
                                               lstmGenerator.update_list,
                                               lstmGenerator.regularize)

  # force overwrite here. The bias to the softmax is initialized to reflect word frequencies.
  # This is a bit of a hack
  if params['checkpoint_file_name'] == 'None':
    model['bd'].set_value(bias_init_vector.astype(config.floatX))
    if params['class_out_factoring'] == 1:
      model['bdCls'].set_value(bias_init_inter_class.astype(config.floatX))

  #----------------- Feature encoders (this mode can also be used for encoding GT sentences) -----------------#
  if params['use_encoder_for'] & 1:
    if params['encode_gt_sentences']:
      xI = tensor.zeros((batch_size, params['image_encoding_size']))
      imgFeatEnc_inp = []
    else:
      imgFeatEncoder = RecurrentFeatEncoder(params['image_feat_size'], params['word_encoding_size'],
                                            params, mdl_prefix='img_enc_', features=dp.features.T)
      mdlLen = len(model.keys())
      model.update(imgFeatEncoder.model_th)
      assert len(model.keys()) == (mdlLen + len(imgFeatEncoder.model_th.keys()))
      misc['update'].extend(imgFeatEncoder.update_list)
      misc['regularize'].extend(imgFeatEncoder.regularize)
      (imgenc_use_dropout, imgFeatEnc_inp, xI, updatesLSTMImgFeat) = imgFeatEncoder.build_model(model, params)
  else:
    xI = None
    imgFeatEnc_inp = []

  if params['use_encoder_for'] & 2:
    aux_enc_inp = model['Wemb'] if params['encode_gt_sentences'] else dp.aux_inputs.T
    hid_size = params['featenc_hidden_size']
    auxFeatEncoder = RecurrentFeatEncoder(hid_size, params['image_encoding_size'],
                                          params, mdl_prefix='aux_enc_', features=aux_enc_inp)
    mdlLen = len(model.keys())
    model.update(auxFeatEncoder.model_th)
    assert len(model.keys()) == (mdlLen + len(auxFeatEncoder.model_th.keys()))
    misc['update'].extend(auxFeatEncoder.update_list)
    misc['regularize'].extend(auxFeatEncoder.regularize)
    (auxenc_use_dropout, auxFeatEnc_inp, xAux, updatesLSTMAuxFeat) = auxFeatEncoder.build_model(model, params)
    if params['encode_gt_sentences']:
      # reshape to size (batch_size, n_gt, hidden_size)
      xAux = xAux.reshape((-1, params['n_encgt_sent'], params['featenc_hidden_size']))
      # then flatten to size (batch_size, n_gt * hidden_size)
      xAux = xAux.flatten(2)
  else:
    auxFeatEnc_inp = []
    xAux = None

  #--------------------------------- Initialize the attention network ---------------------------------#
  if params['use_attn'] is not None:
    attnModel = AttentionNetwork(params['image_feat_size'], params['hidden_size'],
                                 params, mdl_prefix='attn_mlp_')
    mdlLen = len(model.keys())
    model.update(attnModel.model_th)
    assert len(model.keys()) == (mdlLen + len(attnModel.model_th.keys()))
    misc['update'].extend(attnModel.update_list)
    misc['regularize'].extend(attnModel.regularize)
    attn_nw_func = attnModel.build_model
  else:
    attn_nw_func = None

  #--------------------------------- Build the language model graph ---------------------------------#
  # Define the computational graph relating the input image features and word
  # indices to the log probability cost function.
  (use_dropout, inp_list_gen, f_pred_prob, cost, predTh, updatesLSTM) = \
      lstmGenerator.build_model(model, params, xI, xAux, attn_nw=attn_nw_func)
  inp_list = imgFeatEnc_inp + auxFeatEnc_inp + inp_list_gen

  #--------------------------------- Cost function and gradient computation setup ---------------------------------#
  costGrad = cost[0]
  # Add class uncertainty to the final cost
  #if params['class_out_factoring'] == 1:
  #  costGrad += cost[2]

  # Add the regularization cost. Since this is specific to training and doesn't get
  # included when we evaluate the cost on test or validation data, we leave it here
  # outside the model definition
  if params['regc'] > 0.:
    reg_cost = theano.shared(numpy_floatX(0.), name='reg_c')
    reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']), name='reg_c')
    reg_cost = 0.
    for p in misc['regularize']:
      reg_cost += (model[p] ** 2).sum()
    reg_cost *= 0.5 * reg_c
    costGrad += (reg_cost / params['batch_size'])

  # Compile an evaluation function. Doesn't include gradients;
  # to be used for validation set evaluation.
  f_eval = theano.function(inp_list, cost, name='f_eval')

  # Now let's build a gradient computation graph and rmsprop update mechanism
  grads = tensor.grad(costGrad, wrt=model.values())
  lr = tensor.scalar(name='lr', dtype=config.floatX)
  f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(lr, model, grads, inp_list, cost, params)

  print 'model init done.'
  print 'model has keys: ' + ', '.join(model.keys())
  #print 'updating: ' + ', '.join('%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['update'])
  #print 'updating: ' + ', '.join('%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['regularize'])
  #print 'number of learnable parameters total: %d' % (sum(model[k].shape[0] * model[k].shape[1] for k in misc['update']), )

  #-------------------------------- Initialize the prediction path if needed by the evaluator --------------------#
  evalKwargs = {
      'eval_metric': params['eval_metric'],
      'f_gen': lstmGenerator.predict,
      'beamsize': params['eval_beamsize']
  }
  if params['eval_metric'] != 'perplex':
    lstmGenerator.prepPredictor(None, params, params['eval_beamsize'])
    refToks, scr_info = eval_prep_refs('val', dp, params['eval_metric'])
    evalKwargs['refToks'] = refToks
    evalKwargs['scr_info'] = scr_info
    valMetOp = operator.gt
  else:
    valMetOp = operator.lt

  if params['met_to_track'] != []:
    trackMetargs = {
        'eval_metric': params['met_to_track'],
        'f_gen': lstmGenerator.predict,
        'beamsize': params['eval_beamsize']
    }
    lstmGenerator.prepPredictor(None, params, params['eval_beamsize'])
    refToks, scr_info = eval_prep_refs('val', dp, params['met_to_track'])
    trackMetargs['refToks'] = refToks
    trackMetargs['scr_info'] = scr_info

  #--------------------------------- Iteration counts and logging initializations ---------------------------------#
  # calculate how many iterations we need. One epoch is considered one pass through
  # all the sentences, not images; for coco/flickr this is 5x the number of images.
  num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
  num_iters_one_epoch = num_sentences_total / batch_size
  max_iters = max_epochs * num_iters_one_epoch
  eval_period_in_epochs = params['eval_period']
  eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs))
  top_val_sc = -1
  smooth_train_ppl2 = len(misc['ixtoword'])  # initially the size of the dictionary of confusion
  val_sc = len(misc['ixtoword'])
  last_status_write_time = 0  # for writing worker job status reports
  json_worker_status = {}
  #json_worker_status['params'] = params
  json_worker_status['history'] = []
  len_hist = defaultdict(int)
  # track the perplexity of train and val, with iters
  train_perplex = []
  val_perplex = []
  trackSc_array = []

  #-------------------------------------- Load previously saved model --------------------------------------------#
  # Initialize the model parameters from the checkpoint file if we are resuming training
  if params['checkpoint_file_name'] != 'None':
    zipp(model_init_from, model)
    if params['restore_grads'] == 1:
      zipp(rg_init, rg)
    # copy trackers from the previous checkpoint
    if 'trackers' in checkpoint_init:
      train_perplex = checkpoint_init['trackers']['train_perplex']
      val_perplex = checkpoint_init['trackers']['val_perplex']
      trackSc_array = checkpoint_init['trackers'].get('trackScores', [])
    print("\nContinuing training from previous model.\nAlready run for %0.2f epochs with validation perplx at %0.3f\n"
          % (checkpoint_init['epoch'], checkpoint_init['perplexity']))

  #-------------------------------------- MAIN LOOP ----------------------------------------------------------------#
  for it in xrange(max_iters):
    t0 = time.time()
    # enable dropout in training
    use_dropout.set_value(float(params['use_dropout']))
    if params['use_encoder_for'] & 1:
      imgenc_use_dropout.set_value(float(params['use_dropout']))
    if params['use_encoder_for'] & 2:
      auxenc_use_dropout.set_value(float(params['use_dropout']))
    epoch = it * 1.0 / num_iters_one_epoch

    #-------------------------------------- Prepare batch --------------------------------------------#
    # fetch a batch of data
    if params['sample_by_len'] == 0:
      batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)]
    else:
      batch, l = dp.getRandBatchByLen(batch_size)
      len_hist[l] += 1

    enc_inp_list = prepare_seq_features(batch,
                                        use_enc_for=params['use_encoder_for'],
                                        maxlen=params['maxlen'],
                                        use_shared_mem=params['use_shared_mem_enc'],
                                        enc_gt_sent=params['encode_gt_sentences'],
                                        n_enc_sent=params['n_encgt_sent'],
                                        wordtoix=misc['wordtoix'])
    if params['use_pos_tag'] != 'None':
      gen_inp_list, lenS = prepare_data(batch, misc['wordtoix'], params['maxlen'],
                                        sentTagMap, misc['ixtoword'],
                                        rev_sents=params['reverse_sentence'],
                                        use_enc_for=params['use_encoder_for'],
                                        use_unk_token=params['use_unk_token'])
    else:
      gen_inp_list, lenS = prepare_data(batch, misc['wordtoix'], params['maxlen'],
                                        rev_sents=params['reverse_sentence'],
                                        use_enc_for=params['use_encoder_for'],
                                        use_unk_token=params['use_unk_token'])
    if params['sched_sampling_mode'] is not None:
      gen_inp_list.append(epoch)
    real_inp_list = enc_inp_list + gen_inp_list

    #---------------------------------- Compute cost and apply gradients ---------------------------#
    # evaluate cost, gradient and perform parameter update
    cost = f_grad_shared(*real_inp_list)
    f_update(params['learning_rate'])
    dt = time.time() - t0

    # print training statistics
    train_ppl2 = (2 ** (cost[1] / lenS))  # step_struct['stats']['ppl2']
    smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2  # exponentially decaying moving average
    if it == 0:
      smooth_train_ppl2 = train_ppl2  # start out where we start out
    total_cost = cost[0]
    if it == 0:
      smooth_cost = total_cost  # start out where we start out
    smooth_cost = 0.99 * smooth_cost + 0.01 * total_cost

    #---------------------------------- Write a report into a json file ---------------------------#
    tnow = time.time()
    if tnow > last_status_write_time + 60 * 1:  # every now and then lets write a report
      print '%d/%d batch done in %.3fs. at epoch %.2f. Cost now is %.3f and pplx is %.3f' \
            % (it, max_iters, dt, epoch, smooth_cost, smooth_train_ppl2)
      last_status_write_time = tnow
      jstatus = {}
      jstatus['time'] = datetime.datetime.now().isoformat()
      jstatus['iter'] = (it, max_iters)
      jstatus['epoch'] = (epoch, max_epochs)
      jstatus['time_per_batch'] = dt
      jstatus['smooth_train_ppl2'] = smooth_train_ppl2
      jstatus['val_sc'] = val_sc  # just write the last available one
      jstatus['val_metric'] = params['eval_metric']  # just write the last available one
      jstatus['train_ppl2'] = train_ppl2
      #if params['class_out_factoring'] == 1:
      #  jstatus['class_cost'] = float(cost[2])
      json_worker_status['history'].append(jstatus)
      status_file = os.path.join(params['worker_status_output_directory'], host + '_status.json')
      try:
        json.dump(json_worker_status, open(status_file, 'w'))
      except Exception, e:  # todo: be more clever here
        print 'tried to write worker status into %s but got error:' % (status_file, )
        print e

    #--------------------------------- VALIDATION ---------------------------#
    # perform evaluation on the validation set and save a model checkpoint if it's good
    is_last_iter = (it + 1) == max_iters
    if (((it + 1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
      # disable dropout in validation
      use_dropout.set_value(0.)
      if params['use_encoder_for'] & 1:
        imgenc_use_dropout.set_value(0.)
      if params['use_encoder_for'] & 2:
        auxenc_use_dropout.set_value(0.)
      # perform the evaluation on the VAL set
      val_sc = eval_split_theano('val', dp, model, params, misc, f_eval, **evalKwargs)
      val_sc = val_sc[0]
      val_perplex.append((it, val_sc))
      train_perplex.append((it, smooth_train_ppl2))

      if params['met_to_track'] != []:
        track_sc = eval_split_theano('val', dp, model, params, misc, f_eval, **trackMetargs)
        trackSc_array.append((it, {evm: track_sc[i] for i, evm in enumerate(params['met_to_track'])}))

      if epoch - params['lr_decay_st_epoch'] >= 0:
        params['learning_rate'] = params['learning_rate'] * params['lr_decay']
        params['lr_decay_st_epoch'] += 1
      print 'validation %s = %f, lr = %f' % (params['eval_metric'], val_sc, params['learning_rate'])
      #if params['sample_by_len'] == 1:
      #  print len_hist

      #----------------------------- SAVE THE MODEL -------------------#
      write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
      if valMetOp(val_sc, top_val_sc) or top_val_sc < 0:
        if valMetOp(val_sc, write_checkpoint_ppl_threshold) or write_checkpoint_ppl_threshold < 0:
          # if we beat a previous record or if this is the first time,
          # AND we also beat the user-defined threshold or it doesn't exist
          top_val_sc = val_sc
          filename = 'model_checkpoint_%s_%s_%s_%s%.2f.p' % (
              params['dataset'], host, params['fappend'], params['eval_metric'][:3], val_sc)
          filepath = os.path.join(params['checkpoint_output_directory'], filename)
          model_npy = unzip(model)
          rgrads_npy = unzip(rg)
          checkpoint = {}
          checkpoint['it'] = it
          checkpoint['epoch'] = epoch
          checkpoint['model'] = model_npy
          checkpoint['rgrads'] = rgrads_npy
          checkpoint['params'] = params
          checkpoint['perplexity'] = val_sc
          checkpoint['misc'] = misc
          checkpoint['trackers'] = {
              'train_perplex': train_perplex,
              'val_perplex': val_perplex,
              'trackScores': trackSc_array
          }
          try:
            pickle.dump(checkpoint, open(filepath, "wb"))
            print 'saved checkpoint in %s' % (filepath, )
          except Exception, e:  # todo: be more clever here
            print 'tried to write checkpoint into %s but got error: ' % (filepath, )
            print e
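# The validation block above multiplies the learning rate by lr_decay once per
# epoch after lr_decay_st_epoch. In isolation the schedule looks like this
# (base_lr, decay and start_epoch are illustrative values, not defaults taken
# from this code):
def lr_at_epoch(base_lr, epoch, decay=0.95, start_epoch=5):
  steps = max(0, int(epoch) - start_epoch + 1)
  return base_lr * decay ** steps

for e in [0, 5, 6, 10]:
  print 'epoch %2d: lr = %.6f' % (e, lr_at_epoch(1e-3, e))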