def main(cur_params):
  # fetch the data provider and evaluate each checkpoint in turn
  for i, cpf in enumerate(cur_params['checkpoints']):
    checkpoint = pickle.load(open(cpf, 'rb'))
    if 'model' in checkpoint:
      model_init_gen_from = checkpoint.get('model', {})
    else:
      model_init_gen_from = checkpoint.get('modelGen', {})
    model_init_eval_from = checkpoint.get('modelEval', {})
    params = checkpoint['params']
    # Load data provider and copy misc (only once, for the first checkpoint)
    if i == 0:
      dp = getDataProvider(params)
      evaluator = decodeEvaluator(params)
      modelEval = evaluator.model_th
      (eval_inp_list, f_pred_fns, costs, predTh, modelEval) = evaluator.build_advers_eval(modelEval, params)
      misc = checkpoint['misc']
    zipp(model_init_eval_from, modelEval)
    evaluator.use_noise.set_value(1.)
    print '----------------------- Running model %s -------------------------------' % (cpf.split('_')[-3])
    print 'Evaluating GT 5 vs Negative samples from GT'
    eval_discrm_gen('val', dp, params, f_pred_fns[0], misc, probs=[0.5, 0.5, 0.0])
    print '-------------------------------------------------------------------------'
    print 'Evaluating GT vs repeated GT'
    eval_discrm_gen('val', dp, params, f_pred_fns[0], misc, probs=[0.5, 0.0, 0.5])
    print '-------------------------------------------------------------------------'
def main(params):
  rootpath = '/home/lgp105b/xirong/VisualSearch'
  checkpoint_path = params['checkpoint_path']
  # hardcoded override of the command-line checkpoint path
  checkpoint_path = '/home/lgp105b/weiyu/demo/neuraltalk/cv/model_checkpoint_flickr8kchn_lgp105b-OptiPlex-9020_baseline_7.86.p'
  dataset = params['dataset']
  dataset = 'flickr8k'  # hardcoded override
  version = 'chn'
  output_path = '%s.%s.id.score.predict.txt' % (dataset, version)
  vob = load_chinese_vob(rootpath, dataset, version)
  sentgen = ChnSentGenerator(checkpoint_path)
  dp = getDataProvider(dataset)
  fout = codecs.open(output_path, 'w', 'utf-8')
  for img in dp.iterImages(split='test', max_images=-1):
    (score, sent) = sentgen.predict(img['feat'])
    try:
      sent = back_to_words(sent.split(), vob)
    except:
      print sent
    # score is a float log-probability; %d truncated it (the companion script uses %.4f)
    fout.write('%s %.4f %s\n' % (img['filename'], score, sent.decode('utf-8')))
  fout.close()
def main(params):
  checkpoint_path = params['checkpoint_path']
  print 'loading checkpoint %s' % (checkpoint_path, )
  checkpoint = pickle.load(open(checkpoint_path, 'rb'))
  checkpoint_params = checkpoint['params']
  dp = getDataProvider(checkpoint_params)
  bar = progressbar.ProgressBar(maxval=dp.getSplitSize('train'),
                                widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
def __init__(self, dataset, nbOfTopics, rate, hidden, layers, pert):
  '''
  :param dataset: dataset to use
  :param nbOfTopics: number of topics to use
  :param rate: learning rate
  :param hidden: number of hidden neurons
  :param layers: number of hidden layers. Currently not used since one hidden layer is hardcoded
  :param pert: whether or not to use the perturbed dataset
  '''
  self.nbOfTopics = nbOfTopics
  self.dataset = dataset
  self.dataprovider = getDataProvider(dataset, pert)
  self.pert = pert
  self.hidden = hidden
  self.rate = rate
def main(params):
  # load the result struct
  result_struct = params['checkpoint_path']
  max_images = params['max_images']
  print 'loading result data %s' % (result_struct, )
  resultDb = json.load(open(result_struct, 'r'))
  checkpoint_params = resultDb['checkpoint_params']
  dataset = checkpoint_params['dataset']
  dump_folder = params['dump_folder']
  if 'image_feat_size' not in checkpoint_params:
    checkpoint_params['image_feat_size'] = 4096
  if dump_folder:
    print 'creating dump folder ' + dump_folder
    os.system('mkdir -p ' + dump_folder)
  # fetch the data provider
  dp = getDataProvider(checkpoint_params)
  fTrn = open('dBSentFile', 'w')
  fTst = open('queryFile', 'w')
  n = 0
  for img in dp.iterImages(split='train', max_images=max_images):
    n += 1
    print 'image %d/%d:' % (n, max_images)
    fTrn.writelines("%s\n" % ' '.join(x['tokens']) for x in img['sentences'])
  fTst.writelines("%s\n" % x['candidate']['text'] for x in resultDb['imgblobs'])
  fNNRes = open('Brute_SearchResult_FULL.txt')
  tups = re.findall(pattern, fNNRes.read())  # was f.read(); f is undefined
  fNNRes.close()
  for t in tups:
    trnIdx = int(t[1])
    tstIdx = int(t[0])
    imgIdx = floor(trnIdx / 5)
    senIdx = trnIdx - imgIdx * 5
    nnSent =
def test_linear(args):
  if args.random_seed is not None:
    numpy.random.seed(args.random_seed)
  D = Cdist()
  model = cPickle.load(gzip.open('model.dat.gz'))
  vectorizer = cPickle.load(gzip.open('vec.pkl.gz'))
  scaler = cPickle.load(gzip.open('scaler.pkl.gz'))
  # silence the data provider's console output while loading
  real_stdout = sys.stdout
  with open('/dev/null', 'w') as f:
    sys.stdout = f
    d = dp.getDataProvider(args.dataset)
  sys.stdout = real_stdout
  pairs = list(d.iterImageSentencePair(split='val'))
  texts = [pair['sentence']['raw'] for pair in pairs]
  images = list(d.iterImages(split='val'))  # with pairs we'd get duplicate images!
  X = vectorizer.transform(texts)
  Y_pred = numpy.asarray(model.predict(X), dtype='float32')
  # candidates are identical to Y_pred
  if args.paraphrase:
    distances = cdist(Y_pred, Y_pred, metric='cosine')
    N = 0
    score = 0.0
    for j, row in enumerate(distances):
      imgid = pairs[j]['sentence']['imgid']
      sentid = pairs[j]['sentence']['sentid']
      best = numpy.argsort(row)
      # among the 4 nearest other sentences (excluding self), count how many describe the same image
      top4 = sum([imgid == pairs[b]['sentence']['imgid']
                  for b in best[0:5] if sentid != pairs[b]['sentence']['sentid']][0:4])
      score = score + top4 / 4.0
      N = N + 1
      print args.iter_predict, N, score / N
  else:
    Y = numpy.array([image['feat'] for image in images], dtype='float32')
    distances = D.cosine_distance(Y_pred, Y)
    errors = 0
    N = 0
    for j, row in enumerate(distances):
      imgid = pairs[j]['sentence']['imgid']
      best = numpy.argsort(row)
      top5 = [images[b]['imgid'] for b in best[:5]]
      N = N + 1
      if imgid not in top5:
        errors = errors + 1
      print errors, N, errors / float(N)  # errors/N was Python 2 integer division, always 0
def main(params):
  rootpath = '/home/lgp105b/xirong/VisualSearch'
  collection = 'flickr8k'
  checkpoint_path = params['checkpoint_path']
  dataset = params['dataset']
  output_path = os.path.join(rootpath, collection, 'prediction', '%s.id.score.predict.txt' % (dataset))
  sentgen = SentGenerator(checkpoint_path)
  dp = getDataProvider(dataset)
  fout = codecs.open(output_path, 'w', 'utf-8')
  for img in dp.iterImages(split='test', max_images=-1):
    (score, sent) = sentgen.predict(img['feat'])
    fout.write('%s %.4f %s\n' % (img['filename'], score, sent))
  fout.close()
def train_linear(args):
  p = dp.getDataProvider(args.dataset)
  data = list(p.iterImageSentencePair(split='train'))
  texts = [pair['sentence']['raw'] for pair in data]
  images = [pair['image']['feat'] for pair in data]
  analyzer = 'char' if args.character else 'word'
  vectorizer = CountVectorizer(min_df=args.word_freq_threshold, analyzer=analyzer,
                               lowercase=True, ngram_range=(1, 1))
  X = vectorizer.fit_transform(texts)
  scaler = StandardScaler() if args.scaler == 'standard' else NoScaler()
  sys.stderr.write("BOW computed\n")
  Y = scaler.fit_transform(numpy.array(images))
  model = Ridge(solver='lsqr', alpha=args.ridge_alpha)
  sys.stderr.write("Starting training\n")
  model.fit(X, Y)
  sys.stderr.write("Saving model\n")
  cPickle.dump(model, gzip.open('model.dat.gz', 'w'))
  cPickle.dump(vectorizer, gzip.open('vec.pkl.gz', 'w'))
  cPickle.dump(scaler, gzip.open('scaler.pkl.gz', 'w'))  # was 'vec.pkl.gz', which overwrote the vectorizer
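# A minimal, hypothetical driver for train_linear/test_linear above. The flag
# names are inferred from the attribute accesses (args.dataset, args.character,
# args.word_freq_threshold, ...) and may differ from the original script's CLI;
# this is a sketch, not the original entry point.
import argparse

def make_args():
  ap = argparse.ArgumentParser()
  ap.add_argument('--dataset', default='flickr8k')
  ap.add_argument('--character', action='store_true')
  ap.add_argument('--word_freq_threshold', type=int, default=5)
  ap.add_argument('--scaler', default='standard')
  ap.add_argument('--ridge_alpha', type=float, default=1.0)
  ap.add_argument('--random_seed', type=int, default=None)
  ap.add_argument('--paraphrase', action='store_true')
  ap.add_argument('--iter_predict', type=int, default=0)
  return ap.parse_args()

if __name__ == '__main__':
  args = make_args()
  train_linear(args)  # fits the BOW -> image-feature ridge regression and pickles the artifacts
  test_linear(args)   # reloads the pickles and reports val-split retrieval error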
def preprocess():
  '''
  generate the image and sentence matrices of the given dataset and write them to disk
  :return:
  '''
  dataset = "flickr30k"  # hardcoded
  os.chdir("..")
  dataprovider = getDataProvider(dataset, pert=1)
  os.chdir("cca")
  img_sentence_pair_generator = dataprovider.iterImageSentencePair()
  print "Reading Vocabulary..."
  vocabulary = readVocabulary("training_dictionary_pert.txt")
  print "Done"
  print "Creating sentence vectors..."
  occurrences, idf, images = getOccurenceVectorsAndImages(vocabulary, img_sentence_pair_generator)
  print "Done"
  print "Weighing vectors"
  weightedVectors = weight_tfidf(occurrences, idf)
  pair = image_sentence_matrix_pair(images, weightedVectors)
  pair_file = open("imagesentencematrix_pert.p", 'wb')
  pickle.dump(pair, pair_file)
  pair_file.close()
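# Sketch: reload the matrix pair that preprocess() pickles above. This assumes
# the image_sentence_matrix_pair class is importable in the unpickling process;
# the attribute inspection is generic because the class's field names are not
# shown in this excerpt.
import pickle

with open("cca/imagesentencematrix_pert.p", "rb") as f:
  pair = pickle.load(f)
print type(pair), getattr(pair, '__dict__', {}).keys()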
def main(params):
  # load the checkpoint
  checkpoint_path = params['checkpoint_path']
  max_images = params['max_images']
  print 'loading checkpoint %s' % (checkpoint_path, )
  checkpoint = pickle.load(open(checkpoint_path, 'rb'))
  checkpoint_params = checkpoint['params']
  dataset = checkpoint_params['dataset']
  model = checkpoint['model']
  dump_folder = params['dump_folder']
  if dump_folder:
    print 'creating dump folder ' + dump_folder
    os.system('mkdir -p ' + dump_folder)
  # fetch the data provider
  dp = getDataProvider(dataset, params['pert'])
  dp.load_topic_models(dataset, params['lda'])
  misc = {}
  misc['wordtoix'] = checkpoint['wordtoix']
  ixtoword = checkpoint['ixtoword']
  blob = {}  # output blob which we will dump to JSON for visualizing the results
  blob['params'] = params
  blob['checkpoint_params'] = checkpoint_params
  blob['imgblobs'] = []
  # iterate over all images in test set and predict sentences
  BatchGenerator = decodeGenerator(checkpoint_params)
  n = 0
  all_references = []
  all_candidates = []
  # Added for CCA and perturbed dataset
  if params['cca']:
    pert_str = ''
    if params['pert']:
      pert_str = '_pert'
    ccaweights = np.loadtxt('cca/imageprojection_' + str(params['cca']) + pert_str + '.txt', delimiter=',')
    misc['ccaweights'] = ccaweights
  else:
    ccaweights = None
  for img in dp.iterImages(split='test', max_images=max_images):
    n += 1
    print 'image %d/%d:' % (n, max_images)
    references = [' '.join(x['tokens']) for x in img['sentences']]
    kwparams = {'beam_size': params['beam_size'],
                'normalization': params['normalization'],
                'ccaweights': ccaweights}
    # Added for idf normalization
    if params['normalization'] == 'idf' or params['normalization'] == 'combined':
      idf = load_idf()
      kwparams['idf'] = idf
      kwparams['words'] = ixtoword
    else:
      kwparams['idf'] = None
      kwparams['words'] = None
    # Added for LDA; note both branches currently issue the same call
    if not params['lda'] == 0:
      Ys = BatchGenerator.predict_test([{'image': img}], model, checkpoint_params, **kwparams)
    else:
      Ys = BatchGenerator.predict_test([{'image': img}], model, checkpoint_params, **kwparams)
    img_blob = {}  # we will build this up
    img_blob['img_path'] = img['local_file_path']
    img_blob['imgid'] = img['imgid']
    if dump_folder:
      # copy source file to some folder. This makes it easier to distribute results
      # into a webpage, because all images that were predicted on are in a single folder
      source_file = img['local_file_path']
      target_file = os.path.join(dump_folder, os.path.basename(img['local_file_path']))
      os.system('cp %s %s' % (source_file, target_file))
    # encode the human-provided references
    img_blob['references'] = []
    for gtsent in references:
      print 'GT: ' + gtsent
      img_blob['references'].append({'text': gtsent})
    # now evaluate and encode the top prediction
    top_predictions = Ys[0]  # take predictions for the first (and only) image we passed in
    top_prediction = top_predictions[0]  # these are sorted with highest on top
    candidate = ' '.join([ixtoword[ix] for ix in top_prediction[1] if ix > 0])  # ix 0 is the END token, skip that
    print 'PRED: (%f) %s' % (top_prediction[0], candidate)
    # save for later eval
    all_references.append(references)
    all_candidates.append(candidate)
    img_blob['candidate'] = {'text': candidate, 'logprob': top_prediction[0]}
    blob['imgblobs'].append(img_blob)
  # use perl script to eval BLEU score for fair comparison to other research work
  # first write intermediate files
  print 'writing intermediate files into eval/'
  open('eval/output', 'w').write('\n'.join(all_candidates))
  for q in xrange(5):
    open('eval/reference' + `q`, 'w').write('\n'.join([x[q] for x in all_references]))
  # invoke the perl script to get BLEU scores
  print 'invoking eval/multi-bleu.perl script...'
  owd = os.getcwd()
  os.chdir('eval')
  os.system('./multi-bleu.perl reference < output')
  os.chdir(owd)
  # now also evaluate test split perplexity
  gtppl = eval_split('test', dp, model, checkpoint_params, misc, eval_max_images=max_images)
  print 'perplexity of ground truth words based on dictionary of %d words: %f' % (len(ixtoword), gtppl)
  blob['gtppl'] = gtppl
  # dump result struct to file
  print 'saving result struct to %s' % (params['result_struct_filename'], )
  json.dump(blob, open(params['result_struct_filename'], 'w'))
def main(params):
  # load the checkpoint
  checkpoint_path = params['checkpoint_path']
  max_images = params['max_images']
  print 'loading checkpoint %s' % (checkpoint_path, )
  checkpoint = pickle.load(open(checkpoint_path, 'rb'))
  checkpoint_params = checkpoint['params']
  dataset = checkpoint_params['dataset']
  model_npy = checkpoint['model']
  dump_folder = params['dump_folder']
  if 'use_theano' not in checkpoint_params:
    checkpoint_params['use_theano'] = 1
  checkpoint_params['use_theano'] = 1  # force the theano code path
  if 'image_feat_size' not in checkpoint_params:
    checkpoint_params['image_feat_size'] = 4096
  if dump_folder:
    print 'creating dump folder ' + dump_folder
    os.system('mkdir -p ' + dump_folder)
  # fetch the data provider
  dp = getDataProvider(checkpoint_params)
  misc = {}
  misc['wordtoix'] = checkpoint['wordtoix']
  ixtoword = checkpoint['ixtoword']
  blob = {}  # output blob which we will dump to JSON for visualizing the results
  blob['params'] = params
  blob['checkpoint_params'] = checkpoint_params
  blob['imgblobs'] = []
  # iterate over all images in test set and predict sentences
  BatchGenerator = decodeGenerator(checkpoint_params)
  if checkpoint_params['use_theano'] == 1:
    # Compile and init the theano predictor
    BatchGenerator.prepPredictor(model_npy, checkpoint_params, params['beam_size'])
    model = BatchGenerator.model_th
  print("\nUsing model run for %0.2f epochs with validation perplx at %0.3f\n" % (checkpoint['epoch'],
                                                                                  checkpoint['perplexity']))
  n = 0
  all_references = []
  all_candidates = []
  for img in dp.iterImages(split='test', max_images=max_images):
    n += 1
    print 'image %d/%d:' % (n, max_images)
    references = [' '.join(x['tokens']) for x in img['sentences']]
    kwparams = {'beam_size': params['beam_size']}
    img['feat'] = np.random.rand(*img['feat'].shape)  # ablation: replace image features with random noise
    Ys = BatchGenerator.predict([{'image': img}], model, checkpoint_params, **kwparams)
    img_blob = {}  # we will build this up
    img_blob['img_path'] = img['local_file_path']
    img_blob['imgid'] = img['imgid']
    if dump_folder:
      # copy source file to some folder. This makes it easier to distribute results
      # into a webpage, because all images that were predicted on are in a single folder
      source_file = img['local_file_path']
      target_file = os.path.join(dump_folder, os.path.basename(img['local_file_path']))
      os.system('cp %s %s' % (source_file, target_file))
    # encode the human-provided references
    img_blob['references'] = []
    for gtsent in references:
      print 'GT: ' + gtsent
      img_blob['references'].append({'text': gtsent})
    # now evaluate and encode the top prediction
    top_predictions = Ys[0]  # take predictions for the first (and only) image we passed in
    top_prediction = top_predictions[0]  # these are sorted with highest on top
    candidate = ' '.join([ixtoword[ix] for ix in top_prediction[1] if ix > 0])  # ix 0 is the END token, skip that
    print 'PRED: (%f) %s' % (top_prediction[0], candidate)
    # save for later eval
    all_references.append(references)
    all_candidates.append(candidate)
    img_blob['candidate'] = {'text': candidate, 'logprob': float(top_prediction[0])}
    # save all the other beam candidates as well
    candlist = []
    for ci in xrange(len(top_predictions) - 1):
      prediction = top_predictions[ci + 1]  # these are sorted with highest on top
      candidate = ' '.join([ixtoword[int(ix)] for ix in prediction[1] if ix > 0])
      candlist.append({'text': candidate, 'logprob': float(prediction[0])})
    img_blob['candidatelist'] = candlist
    blob['imgblobs'].append(img_blob)
  # use perl script to eval BLEU score for fair comparison to other research work
  # first write intermediate files
  print 'writing intermediate files into eval/'
  open('eval/output', 'w').write('\n'.join(all_candidates))
  for q in xrange(5):
    open('eval/reference' + `q`, 'w').write('\n'.join([x[q] for x in all_references]))
  # invoke the perl script to get BLEU scores
  print 'invoking eval/multi-bleu.perl script...'
  owd = os.getcwd()
  os.chdir('eval')
  os.system('./multi-bleu.perl reference < output')
  os.chdir(owd)
  # test split perplexity evaluation is currently disabled:
  # if checkpoint_params['use_theano'] == 0:
  #   gtppl = eval_split('test', dp, model, checkpoint_params, misc, eval_max_images=max_images)
  # else:
  #   gtppl = eval_split_theano('test', dp, model, checkpoint_params, misc, BatchGenerator.f_eval, eval_max_images=max_images)
  # print 'perplexity of ground truth words based on dictionary of %d words: %f' % (len(ixtoword), gtppl)
  # blob['gtppl'] = gtppl
  # dump result struct to file
  print 'saving result struct to %s' % (params['result_struct_filename'], )
  json.dump(blob, open(params['result_struct_filename'], 'w'))
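# Sketch: a small consumer of the result struct written above. The blob layout
# (imgblobs, candidate, candidatelist) follows the code; 'result_struct.json'
# stands in for whatever params['result_struct_filename'] was set to.
import json

blob = json.load(open('result_struct.json', 'r'))
for ib in blob['imgblobs']:
  print '%s (%.3f): %s' % (ib['img_path'], ib['candidate']['logprob'], ib['candidate']['text'])
  for alt in ib.get('candidatelist', [])[:3]:
    print '    alt (%.3f): %s' % (alt['logprob'], alt['text'])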
def main(params):
  # load the checkpoint
  checkpoint_path = params['checkpoint_path']
  max_images = params['max_images']
  print 'loading checkpoint %s' % (checkpoint_path, )
  checkpoint = pickle.load(open(checkpoint_path, 'rb'))
  checkpoint_params = checkpoint['params']
  dataset = checkpoint_params['dataset']
  model_npy = checkpoint['model']
  dump_folder = params['dump_folder']
  if 'use_theano' not in checkpoint_params:
    checkpoint_params['use_theano'] = 1
  checkpoint_params['use_theano'] = 1  # force the theano code path
  if 'image_feat_size' not in checkpoint_params:
    checkpoint_params['image_feat_size'] = 4096
  if dump_folder:
    print 'creating dump folder ' + dump_folder
    os.system('mkdir -p ' + dump_folder)
  # fetch the data provider
  dp = getDataProvider(checkpoint_params)
  misc = {}
  misc['wordtoix'] = checkpoint['wordtoix']
  ixtoword = checkpoint['ixtoword']
  blob = {}  # output blob which we will dump to JSON for visualizing the results
  blob['params'] = params
  blob['checkpoint_params'] = checkpoint_params
  blob['imgblobs'] = []
  # iterate over all images in test set and predict sentences
  BatchGenerator = decodeGenerator(checkpoint_params)
  if checkpoint_params['use_theano'] == 1:
    # Compile and init the theano predictor
    BatchGenerator.prepPredictor(model_npy, checkpoint_params, params['beam_size'])
    model = BatchGenerator.model_th
  print("\nUsing model run for %0.2f epochs with validation perplx at %0.3f\n" % (checkpoint['epoch'],
                                                                                  checkpoint['perplexity']))
  n = 0
  all_references = []
  all_candidates = []
  for img in dp.iterImages(split='test', max_images=max_images):
    n += 1
    print 'image %d/%d:' % (n, max_images)
    references = [' '.join(x['tokens']) for x in img['sentences']]
    kwparams = {'beam_size': params['beam_size']}
    #img['feat'] = np.random.rand(*img['feat'].shape)
    Ys = BatchGenerator.predict([{'image': img}], model, checkpoint_params, **kwparams)
    img_blob = {}  # we will build this up
    img_blob['img_path'] = img['local_file_path']
    img_blob['imgid'] = img['imgid']
    if dump_folder:
      # copy source file to some folder. This makes it easier to distribute results
      # into a webpage, because all images that were predicted on are in a single folder
      source_file = img['local_file_path']
      target_file = os.path.join(dump_folder, os.path.basename(img['local_file_path']))
      os.system('cp %s %s' % (source_file, target_file))
    # encode the human-provided references
    img_blob['references'] = []
    for gtsent in references:
      print 'GT: ' + gtsent
      img_blob['references'].append({'text': gtsent})
    # now evaluate and encode the top prediction
    top_predictions = Ys[0]  # take predictions for the first (and only) image we passed in
    top_prediction = top_predictions[0]  # these are sorted with highest on top
    candidate = ' '.join([ixtoword[ix] for ix in top_prediction[1] if ix > 0])  # ix 0 is the END token, skip that
    print 'PRED: (%f) %s' % (top_prediction[0], candidate)
    # save for later eval
    all_references.append(references)
    all_candidates.append(candidate)
    img_blob['candidate'] = {'text': candidate, 'logprob': float(top_prediction[0])}
    # save all the other beam candidates as well
    candlist = []
    for ci in xrange(len(top_predictions) - 1):
      prediction = top_predictions[ci + 1]  # these are sorted with highest on top
      candidate = ' '.join([ixtoword[int(ix)] for ix in prediction[1] if ix > 0])
      candlist.append({'text': candidate, 'logprob': float(prediction[0])})
    img_blob['candidatelist'] = candlist
    blob['imgblobs'].append(img_blob)
  # use perl script to eval BLEU score for fair comparison to other research work
  # first write intermediate files
  print 'writing intermediate files into eval/'
  open('eval/output', 'w').write('\n'.join(all_candidates))
  for q in xrange(5):
    open('eval/reference' + `q`, 'w').write('\n'.join([x[q] for x in all_references]))
  # invoke the perl script to get BLEU scores
  print 'invoking eval/multi-bleu.perl script...'
  owd = os.getcwd()
  os.chdir('eval')
  os.system('./multi-bleu.perl reference < output')
  os.chdir(owd)
  # test split perplexity evaluation is currently disabled:
  # if checkpoint_params['use_theano'] == 0:
  #   gtppl = eval_split('test', dp, model, checkpoint_params, misc, eval_max_images=max_images)
  # else:
  #   gtppl = eval_split_theano('test', dp, model, checkpoint_params, misc, BatchGenerator.f_eval, eval_max_images=max_images)
  # print 'perplexity of ground truth words based on dictionary of %d words: %f' % (len(ixtoword), gtppl)
  # blob['gtppl'] = gtppl
  # dump result struct to file
  print 'saving result struct to %s' % (params['result_struct_filename'], )
  json.dump(blob, open(params['result_struct_filename'], 'w'))
def main(params):
  # load the checkpoint
  checkpoint_path = params["checkpoint_path"]
  max_images = params["max_images"]
  print "loading checkpoint %s" % (checkpoint_path,)
  checkpoint = pickle.load(open(checkpoint_path, "rb"))
  checkpoint_params = checkpoint["params"]
  dataset = checkpoint_params["dataset"]
  model_npy = checkpoint["model"]
  dump_folder = params["dump_folder"]
  if "use_theano" not in checkpoint_params:
    checkpoint_params["use_theano"] = 1
  checkpoint_params["use_theano"] = 1  # force the theano code path
  if "image_feat_size" not in checkpoint_params:
    checkpoint_params["image_feat_size"] = 4096
  if dump_folder:
    print "creating dump folder " + dump_folder
    os.system("mkdir -p " + dump_folder)
  # fetch the data provider
  dp = getDataProvider(checkpoint_params)
  misc = {}
  misc["wordtoix"] = checkpoint["wordtoix"]
  ixtoword = checkpoint["ixtoword"]
  blob = {}  # output blob which we will dump to JSON for visualizing the results
  blob["params"] = params
  blob["checkpoint_params"] = checkpoint_params
  blob["imgblobs"] = []
  # iterate over all images in test set and predict sentences
  BatchGenerator = decodeGenerator(checkpoint_params)
  if checkpoint_params["use_theano"] == 1:
    # Compile and init the theano predictor
    BatchGenerator.prepPredictor(model_npy, checkpoint_params, params["beam_size"])
    model = BatchGenerator.model_th
  print ("\nUsing model run for %0.2f epochs with validation perplx at %0.3f\n"
         % (checkpoint["epoch"], checkpoint["perplexity"]))
  n = 0
  all_references = []
  all_candidates = []
  for img in dp.iterImages(split="test", max_images=max_images):
    n += 1
    print "image %d/%d:" % (n, max_images)
    references = [" ".join(x["tokens"]) for x in img["sentences"]]
    kwparams = {"beam_size": params["beam_size"]}
    img["feat"] = np.random.rand(*img["feat"].shape)  # ablation: replace image features with random noise
    Ys = BatchGenerator.predict([{"image": img}], model, checkpoint_params, **kwparams)
    img_blob = {}  # we will build this up
    img_blob["img_path"] = img["local_file_path"]
    img_blob["imgid"] = img["imgid"]
    if dump_folder:
      # copy source file to some folder. This makes it easier to distribute results
      # into a webpage, because all images that were predicted on are in a single folder
      source_file = img["local_file_path"]
      target_file = os.path.join(dump_folder, os.path.basename(img["local_file_path"]))
      os.system("cp %s %s" % (source_file, target_file))
    # encode the human-provided references
    img_blob["references"] = []
    for gtsent in references:
      print "GT: " + gtsent
      img_blob["references"].append({"text": gtsent})
    # now evaluate and encode the top prediction
    top_predictions = Ys[0]  # take predictions for the first (and only) image we passed in
    top_prediction = top_predictions[0]  # these are sorted with highest on top
    candidate = " ".join([ixtoword[ix] for ix in top_prediction[1] if ix > 0])  # ix 0 is the END token, skip that
    print "PRED: (%f) %s" % (top_prediction[0], candidate)
    # save for later eval
    all_references.append(references)
    all_candidates.append(candidate)
    img_blob["candidate"] = {"text": candidate, "logprob": float(top_prediction[0])}
    # save all the other beam candidates as well
    candlist = []
    for ci in xrange(len(top_predictions) - 1):
      prediction = top_predictions[ci + 1]  # these are sorted with highest on top
      candidate = " ".join([ixtoword[int(ix)] for ix in prediction[1] if ix > 0])
      candlist.append({"text": candidate, "logprob": float(prediction[0])})
    img_blob["candidatelist"] = candlist
    blob["imgblobs"].append(img_blob)
  # use perl script to eval BLEU score for fair comparison to other research work
  # first write intermediate files
  print "writing intermediate files into eval/"
  open("eval/output", "w").write("\n".join(all_candidates))
  for q in xrange(5):
    open("eval/reference" + `q`, "w").write("\n".join([x[q] for x in all_references]))
  # invoke the perl script to get BLEU scores
  print "invoking eval/multi-bleu.perl script..."
  owd = os.getcwd()
  os.chdir("eval")
  os.system("./multi-bleu.perl reference < output")
  os.chdir(owd)
  # test split perplexity evaluation is currently disabled:
  # if checkpoint_params['use_theano'] == 0:
  #   gtppl = eval_split('test', dp, model, checkpoint_params, misc, eval_max_images=max_images)
  # else:
  #   gtppl = eval_split_theano('test', dp, model, checkpoint_params, misc, BatchGenerator.f_eval, eval_max_images=max_images)
  # print 'perplexity of ground truth words based on dictionary of %d words: %f' % (len(ixtoword), gtppl)
  # blob['gtppl'] = gtppl
  # dump result struct to file
  print "saving result struct to %s" % (params["result_struct_filename"],)
  json.dump(blob, open(params["result_struct_filename"], "w"))
def gen_from_test(params):
  # load the checkpoint
  checkpoint_path = params['checkpoint_path']
  max_images = params['max_images']
  fout = params['output_file']
  tempo = params['tempo']
  print 'loading checkpoint %s' % (checkpoint_path, )
  checkpoint = pickle.load(open(checkpoint_path, 'rb'))
  checkpoint_params = checkpoint['params']
  dataset = checkpoint_params['dataset']
  model = checkpoint['model']
  dump_folder = params['dump_folder']
  if dump_folder:
    print 'creating dump folder ' + dump_folder
    os.system('mkdir -p ' + dump_folder)
  # fetch the data provider
  dp = getDataProvider(dataset)
  misc = {}
  misc['wordtoix'] = checkpoint['wordtoix']
  ixtoword = checkpoint['ixtoword']
  blob = {}  # output blob which we will dump to JSON for visualizing the results
  blob['params'] = params
  blob['checkpoint_params'] = checkpoint_params
  blob['imgblobs'] = []
  # iterate over all images in test set and predict sentences
  BatchGenerator = decodeGenerator(checkpoint_params)
  n = 0
  all_references = []
  all_candidates = []
  candidates = []
  for img in dp.iterImages(split='test', max_images=max_images):
    n += 1
    print 'image %d/%d:' % (n, max_images)
    references = [' '.join(x['tokens']) for x in img['sentences']]
    kwparams = {'beam_size': params['beam_size']}
    Ys = BatchGenerator.predict([{'image': img}], model, checkpoint_params, **kwparams)
    img_blob = {}  # we will build this up
    img_blob['img_path'] = img['local_file_path']
    img_blob['imgid'] = img['imgid']
    if dump_folder:
      # copy source file to some folder. This makes it easier to distribute results
      # into a webpage, because all images that were predicted on are in a single folder
      source_file = img['local_file_path']
      target_file = os.path.join(dump_folder, os.path.basename(img['local_file_path']))
      os.system('cp %s %s' % (source_file, target_file))
    # encode the human-provided references
    img_blob['references'] = []
    for gtsent in references:
      print 'GT: ' + gtsent
      img_blob['references'].append({'text': gtsent})
    # now evaluate and encode the top prediction
    top_predictions = Ys[0]  # take predictions for the first (and only) image we passed in
    top_prediction = top_predictions[0]  # these are sorted with highest on top
    candidate = ' '.join([ixtoword[ix] for ix in top_prediction[1] if ix > 0])  # ix 0 is the END token, skip that
    candidates.append(candidate)
    print 'PRED: (%f) %s' % (top_prediction[0], candidate)
    # save for later eval
    all_references.append(references)
    all_candidates.append(candidate)
    img_blob['candidate'] = {'text': candidate, 'logprob': top_prediction[0]}
    blob['imgblobs'].append(img_blob)
  # use perl script to eval BLEU score for fair comparison to other research work
  # first write intermediate files
  print 'writing intermediate files into eval/'
  open('eval/output', 'w').write('\n'.join(all_candidates))
  for q in xrange(1):
    open('eval/reference' + `q`, 'w').write('\n'.join([x[q] for x in all_references]))
  # invoke the perl script to get BLEU scores
  print 'invoking eval/multi-bleu.perl script...'
  owd = os.getcwd()
  os.chdir('eval')
  os.system('./multi-bleu.perl reference < output')
  os.chdir(owd)
  # now also evaluate test split perplexity
  gtppl = eval_split('test', dp, model, checkpoint_params, misc, eval_max_images=max_images)
  print 'perplexity of ground truth words based on dictionary of %d words: %f' % (len(ixtoword), gtppl)
  blob['gtppl'] = gtppl
  # dump result struct to file (disabled)
  # print 'saving result struct to %s' % (params['result_struct_filename'], )
  # json.dump(blob, open(params['result_struct_filename'], 'w'))
  # convert the generated token sequences into MIDI notes.
  # new_track and bass_track are not defined in this excerpt; they are assumed
  # to be pretty_midi.Instrument objects (the program numbers below are guesses)
  new_track = pretty_midi.Instrument(program=0)
  bass_track = pretty_midi.Instrument(program=32)
  for idx, c in enumerate(candidates):
    cs = c.split()
    for e in cs:
      es = e.split(';')  # each token encodes pitch;position;duration
      pitch = int(es[0])
      pos = convert_pos(es[1], idx)
      dur = convert_dur(es[2])
      note = pretty_midi.Note(90, pitch, pos, pos + dur)
      new_track.notes.append(note)
  new_midi_data = pretty_midi.PrettyMIDI(initial_tempo=tempo)
  new_midi_data.instruments.append(new_track)
  # pre-set chord progression: one bass note per second, equivalent to the
  # original unrolled list of appends
  bass_pitches = [36, 47, 45, 43, 41, 40, 38, 43,
                  36, 47, 45, 43, 41, 40, 38, 43,
                  45, 41, 36, 43, 45, 41, 43, 43,
                  36, 47, 45, 43, 41, 40, 38, 43,
                  36, 47, 45, 43, 41, 40, 38, 43]
  for i, pitch in enumerate(bass_pitches):
    bass_track.notes.append(pretty_midi.Note(90, pitch, i, i + 1))
  new_midi_data.instruments.append(bass_track)
  adjust_tempo(new_midi_data)
  if params['quantize']:
    quantize(new_midi_data)
  new_midi_data.write(fout)
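# For readers unfamiliar with the pretty_midi calls used above, a minimal
# standalone example of the same API surface. These are real pretty_midi
# signatures; the pitch and timing values are arbitrary.
import pretty_midi

pm = pretty_midi.PrettyMIDI(initial_tempo=120)
track = pretty_midi.Instrument(program=0)  # program 0 = acoustic grand piano
track.notes.append(pretty_midi.Note(90, 60, 0.0, 1.0))  # Note(velocity, pitch, start, end): middle C for one second
track.notes.append(pretty_midi.Note(90, 64, 1.0, 2.0))
pm.instruments.append(track)
pm.write('example.mid')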
def main(params, split):
  batch_size = params['batch_size']
  dataset = params['dataset']
  feature_file = params['feature_file']
  class_count_threshold = params['class_count_threshold']
  do_grad_check = params['do_grad_check']
  max_epochs = params['max_epochs']
  host = socket.gethostname()  # get computer hostname
  json_file = 'dataset_mmdb_book_fps_30_samplesize_25_split_%d.json' % (split)
  # fetch the data provider
  dp = getDataProvider(dataset, feature_file, json_file)
  misc = {}  # stores various misc items that need to be passed around the framework
  # go over all training classes and find the vocabulary we want to use, i.e. the classes
  # that occur at least class_count_threshold number of times
  misc['classtoix'], misc['ixtoclass'], bias_init_vector = preProBuildWordVocab(
      dp.iterSentences('train'), class_count_threshold)
  # delegate the initialization of the model to the Generator class
  BatchGenerator = decodeGenerator(params)
  init_struct = BatchGenerator.init(params, misc)
  model, misc['update'], misc['regularize'] = (init_struct['model'],
                                               init_struct['update'],
                                               init_struct['regularize'])
  # force overwrite here. This is a bit of a hack, not happy about it
  model['bd'] = bias_init_vector.reshape(1, bias_init_vector.size)
  print 'model init done.'
  print 'model has keys: ' + ', '.join(model.keys())
  print 'updating: ' + ', '.join('%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1])
                                 for k in misc['update'])
  print 'updating: ' + ', '.join('%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1])
                                 for k in misc['regularize'])
  print 'number of learnable parameters total: %d' % (sum(
      model[k].shape[0] * model[k].shape[1] for k in misc['update']), )
  if params.get('init_model_from', ''):
    # load checkpoint
    checkpoint = pickle.load(open(params['init_model_from'], 'rb'))
    model = checkpoint['model']  # overwrite the model
  # initialize the Solver and the cost function
  solver = Solver()

  def costfun(batch, model):
    # wrap the cost function to abstract some things away from the Solver
    return RNNGenCost(batch, model, params, misc)

  # calculate how many iterations we need
  num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
  num_iters_one_epoch = num_sentences_total / batch_size
  max_iters = max_epochs * num_iters_one_epoch
  eval_period_in_epochs = params['eval_period']
  eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs))
  abort = False
  top_val_ppl2 = -1
  smooth_train_ppl2 = len(misc['ixtoclass'])  # initially size of dictionary of confusion
  val_ppl2 = len(misc['ixtoclass'])
  last_status_write_time = 0  # for writing worker job status reports
  json_worker_status = {}
  json_worker_status['params'] = params
  json_worker_status['history'] = []
  lastsavedcheckpoint = ''
  for it in xrange(max_iters):
    if abort:
      break
    t0 = time.time()
    # fetch a batch of data
    batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)]
    # evaluate cost, gradient and perform parameter update
    step_struct = solver.step(batch, model, costfun, **params)
    cost = step_struct['cost']
    dt = time.time() - t0
    # print training statistics
    train_ppl2 = step_struct['stats']['ppl2']
    smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2  # exponentially decaying moving average
    if it == 0:
      smooth_train_ppl2 = train_ppl2  # start out where we start out
    epoch = it * 1.0 / num_iters_one_epoch
    print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \
        % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'],
           train_ppl2, smooth_train_ppl2)
    print 'last saved checkpoint in %s' % (lastsavedcheckpoint, )
    # perform gradient check if desired, with a bit of a burnin time (10 iterations)
    if it == 10 and do_grad_check:
      print 'disabling dropout for gradient check...'
      params['drop_prob_encoder'] = 0
      params['drop_prob_decoder'] = 0
      solver.gradCheck(batch, model, costfun)
      print 'done gradcheck, exiting.'
      sys.exit()
    # detect if loss is exploding and kill the job if so
    total_cost = cost['total_cost']
    if it == 0:
      total_cost0 = total_cost  # store this initial cost
    if total_cost > total_cost0 * 2:
      print 'Aborting, cost seems to be exploding. Run gradcheck? Lower the learning rate?'
      abort = True  # set the abort flag, we'll break out
    # logging: write JSON files for visual inspection of the training
    tnow = time.time()
    if tnow > last_status_write_time + 60 * 1:  # write a report roughly every minute
      last_status_write_time = tnow
      jstatus = {}
      jstatus['time'] = datetime.datetime.now().isoformat()
      jstatus['iter'] = (it, max_iters)
      jstatus['epoch'] = (epoch, max_epochs)
      jstatus['time_per_batch'] = dt
      jstatus['smooth_train_ppl2'] = smooth_train_ppl2
      jstatus['val_ppl2'] = val_ppl2  # just write the last available one
      jstatus['train_ppl2'] = train_ppl2
      json_worker_status['history'].append(jstatus)
      status_file = os.path.join(params['worker_status_output_directory'], host + '_status.json')
      try:
        json.dump(json_worker_status, open(status_file, 'w'))
      except Exception, e:  # todo be more clever here
        print 'tried to write worker status into %s but got error:' % (status_file, )
        print e
    # perform perplexity evaluation on the validation set and save a model checkpoint if it's good
    is_last_iter = (it + 1) == max_iters
    if (((it + 1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
      val_ppl2 = eval_split('val', dp, model, params, misc)  # perform the evaluation on VAL set
      print 'validation perplexity = %f' % (val_ppl2, )
      # abort training if the perplexity is no good
      min_ppl_or_abort = params['min_ppl_or_abort']
      if val_ppl2 > min_ppl_or_abort and min_ppl_or_abort > 0:
        print 'aborting job because validation perplexity %f > %f' % (val_ppl2, min_ppl_or_abort)
        abort = True  # abort the job
      write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
      if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
        if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
          # if we beat a previous record or if this is the first time
          # AND we also beat the user-defined threshold or it doesnt exist
          top_val_ppl2 = val_ppl2
          filename = 'model_checkpoint_%s_%s_%s_alpha_%2.2f_beta_%2.2f_split_%d.p' % (
              dataset, host, params['fappend'], params['alpha'], params['beta'], split)
          filepath = os.path.join(params['checkpoint_output_directory'], filename)
          checkpoint = {}
          checkpoint['it'] = it
          checkpoint['epoch'] = epoch
          checkpoint['model'] = model
          checkpoint['params'] = params
          checkpoint['perplexity'] = val_ppl2
          checkpoint['classtoix'] = misc['classtoix']
          checkpoint['ixtoclass'] = misc['ixtoclass']
          checkpoint['json_file'] = json_file
          try:
            if not (params['fappend'] == 'test'):
              pickle.dump(checkpoint, open(filepath, "wb"))
              print 'saved checkpoint in %s' % (filepath, )
              lastsavedcheckpoint = filepath
          except Exception, e:  # todo be more clever here
            print 'tried to write checkpoint into %s but got error: ' % (filepath, )
            print e
def main(params):
  # load the checkpoint
  checkpoint_path = params['checkpoint_path']
  max_images = params['max_images']
  print 'loading checkpoint %s' % (checkpoint_path, )
  checkpoint = pickle.load(open(checkpoint_path, 'rb'))
  checkpoint_params = checkpoint['params']
  dataset = checkpoint_params['dataset']
  model = checkpoint['model']
  dump_folder = params['dump_folder']
  if dump_folder:
    print 'creating dump folder ' + dump_folder
    os.system('mkdir -p ' + dump_folder)
  # fetch the data provider
  dp = getDataProvider(dataset)
  misc = {}
  misc['wordtoix'] = checkpoint['wordtoix']
  ixtoword = checkpoint['ixtoword']
  blob = {}  # output blob which we will dump to JSON for visualizing the results
  blob['params'] = params
  blob['checkpoint_params'] = checkpoint_params
  blob['imgblobs'] = []
  # iterate over all images in test set and predict sentences
  BatchGenerator = decodeGenerator(checkpoint_params)
  n = 0
  all_references = []
  all_candidates = []
  captions_res = []
  for img in dp.iterImages(split='test', max_images=max_images):
    n += 1
    print 'image %d/%d:' % (n, max_images)
    references = [' '.join(x['tokens']) for x in img['sentences']]
    kwparams = {'beam_size': params['beam_size']}
    Ys = BatchGenerator.predict([{'image': img}], model, checkpoint_params, **kwparams)
    img_blob = {}  # we will build this up
    img_blob['img_path'] = img['local_file_path']
    img_blob['imgid'] = img['imgid']
    img_blob['id'] = img['id']
    if dump_folder:
      # copy source file to some folder. This makes it easier to distribute results
      # into a webpage, because all images that were predicted on are in a single folder
      source_file = img['local_file_path']
      target_file = os.path.join(dump_folder, os.path.basename(img['local_file_path']))
      os.system('cp %s %s' % (source_file, target_file))
    # encode the human-provided references; only print the first one
    img_blob['references'] = []
    flag = True
    for gtsent in references:
      if flag:
        print 'GT: ' + gtsent
        flag = False
      img_blob['references'].append({'text': gtsent})
    # now evaluate and encode the top prediction
    top_predictions = Ys[0]  # take predictions for the first (and only) image we passed in
    top_prediction = top_predictions[0]  # these are sorted with highest on top
    candidate = ' '.join([ixtoword[ix] for ix in top_prediction[1] if ix > 0])  # ix 0 is the END token, skip that
    print 'PRED: (%f) %s' % (top_prediction[0], candidate)
    # save for later eval
    all_references.append(references)
    all_candidates.append(candidate)
    captions_res.append({'image_id': img_blob['id'], 'caption': candidate})
    img_blob['candidate'] = {'text': candidate, 'logprob': top_prediction[0]}
    blob['imgblobs'].append(img_blob)
  # use perl script to eval BLEU score for fair comparison to other research work
  # first write intermediate files
  print 'writing intermediate files into eval/'
  open('eval/output', 'w').write('\n'.join(all_candidates))
  for q in xrange(5):
    open('eval/reference' + `q`, 'w').write('\n'.join([x[q] for x in all_references]))
  # invoke the perl script to get BLEU scores
  print 'invoking eval/multi-bleu.perl script...'
  owd = os.getcwd()
  os.chdir('eval')
  os.system('./multi-bleu.perl reference < output')
  os.chdir(owd)
  # test split perplexity evaluation (disabled):
  # gtppl = eval_split('test', dp, model, checkpoint_params, misc, eval_max_images=max_images)
  # print 'perplexity of ground truth words based on dictionary of %d words: %f' % (len(ixtoword), gtppl)
  # blob['gtppl'] = gtppl
  # dump result struct to file
  print 'saving result struct to %s' % (params['result_struct_filename'], )
  json.dump(blob, open(params['result_struct_filename'], 'w'))
  # dump COCO-style captions and run the external metrics
  alg_name = params['checkpoint_path'].split('_')[1]
  res_file_name = params['out_dir'] + '/captions_val_' + alg_name + '_results.json'
  json.dump(captions_res, open(res_file_name, 'w'))
  from eval_tools import metrics
  metrics.run(dataset, alg_name, params['out_dir'])
def main(params):
  batch_size = params['batch_size']
  dataset = params['dataset']  # name of the dataset: flickr8k, flickr30k, ...
  word_count_threshold = params['word_count_threshold']
  do_grad_check = params['do_grad_check']
  max_epochs = params['max_epochs']
  host = socket.gethostname()  # get computer hostname
  # fetch the data provider
  dp = getDataProvider(dataset)
  completeData = dp.getData('train')
  misc = {}  # stores various misc items that need to be passed around the framework
  # go over all training sentences and find the vocabulary we want to use, i.e. the words
  # that occur at least word_count_threshold number of times
  misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(
      dp.iterSentences('train'), word_count_threshold)
  # calculate weights of all unique words in vocab
  weightComputedData = calculateWeights(misc['wordtoix'], misc['ixtoword'], completeData)
  weightCalculationMethodSec()
  weightComputedData = getWeightsMethod2()
  print 'Done:'
  # delegate the initialization of the model to the Generator class
  BatchGenerator = GenericBatchGenerator()  # decodeGenerator(params)
  # initialize encoder and decoder weight matrices
  init_struct = BatchGenerator.init(params, misc)
  model, misc['update'], misc['regularize'] = (init_struct['model'],
                                               init_struct['update'],
                                               init_struct['regularize'])
  # force overwrite here. This is a bit of a hack, not happy about it
  model['bd'] = bias_init_vector.reshape(1, bias_init_vector.size)  # remove and check
  print 'model init done.'
  print 'model has keys: ' + ', '.join(model.keys())
  print 'updating: ' + ', '.join('%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1])
                                 for k in misc['update'])
  print 'updating: ' + ', '.join('%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1])
                                 for k in misc['regularize'])
  print 'number of learnable parameters total: %d' % (sum(
      model[k].shape[0] * model[k].shape[1] for k in misc['update']), )
  if params.get('init_model_from', ''):
    # load checkpoint
    checkpoint = pickle.load(open(params['init_model_from'], 'rb'))
    model = checkpoint['model']  # overwrite the model
  # initialize the Solver and the cost function
  solver = Solver()

  def costfun(batch, model):
    # wrap the cost function to abstract some things away from the Solver
    return RNNGenCost(batch, model, params, misc, weightComputedData)

  # calculate how many iterations we need
  num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
  num_iters_one_epoch = num_sentences_total / batch_size
  max_iters = max_epochs * num_iters_one_epoch
  eval_period_in_epochs = params['eval_period']
  eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs))
  abort = False
  top_val_ppl2 = -1
  smooth_train_ppl2 = len(misc['ixtoword'])  # initially size of dictionary of confusion
  val_ppl2 = len(misc['ixtoword'])
  last_status_write_time = 0  # for writing worker job status reports
  json_worker_status = {}
  json_worker_status['params'] = params
  json_worker_status['history'] = []
  for it in xrange(max_iters):
    if abort:
      break
    t0 = time.time()
    # fetch a batch of data
    batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)]
    # evaluate cost, gradient and perform parameter update
    step_struct = solver.step(batch, model, costfun, **params)
    cost = step_struct['cost']
    dt = time.time() - t0
    # print training statistics (perplexity tracking is disabled here)
    # train_ppl2 = step_struct['stats']['ppl2']
    # if it == 0: smooth_train_ppl2 = train_ppl2  # start out where we start out
    epoch = it * 1.0 / num_iters_one_epoch
    print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f' \
        % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'])
    total_cost = cost['total_cost']
    if it == 0:
      total_cost0 = total_cost
    if total_cost > total_cost0 * 2:
      print 'Aborting, cost seems to be exploding.'
      abort = True
    if (it + 1) == max_iters:
      top_val_ppl2 = val_ppl2
      filename = 'model_checkpoint_%s_%s_%s_%.2f.p' % (
          dataset, host, params['fappend'], val_ppl2)
      filepath = os.path.join(params['checkpoint_output_directory'], filename)
      checkpoint = {}
      checkpoint['it'] = it
      checkpoint['epoch'] = epoch
      checkpoint['model'] = model
      checkpoint['params'] = params
      checkpoint['perplexity'] = val_ppl2
      checkpoint['wordtoix'] = misc['wordtoix']
      checkpoint['ixtoword'] = misc['ixtoword']
      try:
        pickle.dump(checkpoint, open(filepath, "wb"))
        print 'saved checkpoint in %s' % (filepath, )
      except Exception, e:
        print 'tried to write checkpoint into %s but got error: ' % (filepath, )
        print e
import numpy as np
import cPickle as pickle
import json
from collections import defaultdict
from operator import itemgetter

from imagernn.data_provider import getDataProvider, prepare_data

checkpoint = pickle.load(open('trainedModels/model_checkpoint_coco_gpu001_c_in14_o9_fc7_d_a_Auxo9_fc8_11.96.p', 'r'))
wix = checkpoint['wordtoix']
ixw = checkpoint['ixtoword']
dataset = json.load(open('/triton/ics/project/imagedb/picsom/databases/COCO/download/annotations/instances_train2014.json', 'r'))

params = {}
params['dataset'] = 'coco'
params['data_file'] = 'dataset.json'
dp = getDataProvider(params)

# map COCO category ids to the set of training images they appear in
catIdImgs = defaultdict(set)
for ann in dataset['annotations']:
    catIdImgs[ann['category_id']].add(ann['image_id'])
catIdtoIx = {}
for i, cat in enumerate(catIdImgs.keys()):
    catIdtoIx[cat] = i
nTrnSamp = len(dataset['images'])
wordsIdList = defaultdict(set)
for img in dp.split['train']:
    for sent in img['sentences']:
def main(params):
  # load the checkpoint
  checkpoint_path = params['checkpoint_path']
  max_images = params['max_images']
  print 'loading checkpoint %s' % (checkpoint_path, )
  checkpoint = pickle.load(open(checkpoint_path, 'rb'))
  checkpoint_params = checkpoint['params']
  dataset = checkpoint_params['dataset']
  model = checkpoint['model']
  # fetch the data provider
  dp = getDataProvider(dataset)
  misc = {}
  misc['wordtoix'] = checkpoint['wordtoix']
  ixtoword = checkpoint['ixtoword']
  blob = {}  # output blob which we will dump to JSON for visualizing the results
  blob['params'] = params
  blob['checkpoint_params'] = checkpoint_params
  blob['imgblobs'] = []
  # iterate over all images in test set and predict sentences
  BatchGenerator = decodeGenerator(checkpoint_params)
  all_bleu_scores = []
  n = 0
  for img in dp.iterImages(split='test', max_images=max_images):
    n += 1
    print 'image %d/%d:' % (n, max_images)
    references = [x['tokens'] for x in img['sentences']]  # as list of lists of tokens
    kwparams = {'tanhC_version': checkpoint_params.get('tanhC_version', 0),
                'beam_size': params['beam_size'],
                'generator': checkpoint_params['generator']}
    Ys = BatchGenerator.predict([{'image': img}], model, **kwparams)
    img_blob = {}  # we will build this up
    img_blob['img_path'] = img['local_file_path']
    img_blob['imgid'] = img['imgid']
    # encode the human-provided references
    img_blob['references'] = []
    for gtwords in references:
      print 'GT: ' + ' '.join(gtwords)
      img_blob['references'].append({'text': ' '.join(gtwords)})
    # now evaluate and encode the top prediction
    top_predictions = Ys[0]  # take predictions for the first (and only) image we passed in
    top_prediction = top_predictions[0]  # these are sorted with highest on top
    candidate = [ixtoword[ix] for ix in top_prediction[1]]
    print 'PRED: (%f) %s' % (top_prediction[0], ' '.join(candidate))
    bleu_scores = evalCandidate(candidate, references)
    print 'BLEU: B-1: %f B-2: %f B-3: %f' % tuple(bleu_scores)
    img_blob['candidate'] = {'text': ' '.join(candidate),
                             'logprob': top_prediction[0],
                             'bleu': bleu_scores}
    all_bleu_scores.append(bleu_scores)
    blob['imgblobs'].append(img_blob)
  print 'final average bleu scores:'
  bleu_averages = [sum(x[i] for x in all_bleu_scores) * 1.0 / len(all_bleu_scores) for i in xrange(3)]
  blob['final_result'] = {'bleu': bleu_averages}
  print 'FINAL BLEU: B-1: %f B-2: %f B-3: %f' % tuple(bleu_averages)
  # now also evaluate test split perplexity
  gtppl = eval_split('test', dp, model, checkpoint_params, misc, eval_max_images=max_images)
  print 'perplexity of ground truth words: %f' % (gtppl, )
  blob['gtppl'] = gtppl
  # dump result struct to file
  print 'saving result struct to %s' % (params['result_struct_filename'], )
  json.dump(blob, open(params['result_struct_filename'], 'w'))
def main(params): batch_size = params['batch_size'] dataset = params['dataset'] word_count_threshold = params['word_count_threshold'] do_grad_check = params['do_grad_check'] max_epochs = params['max_epochs'] host = socket.gethostname() # get computer hostname # fetch the data provider dp = getDataProvider(dataset) misc = {} # stores various misc items that need to be passed around the framework # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur # at least word_count_threshold number of times misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(dp.iterSentences('train'), word_count_threshold) # delegate the initialization of the model to the Generator class BatchGenerator = decodeGenerator(params) init_struct = BatchGenerator.init(params, misc) model, misc['update'], misc['regularize'] = (init_struct['model'], init_struct['update'], init_struct['regularize']) # force overwrite here. This is a bit of a hack, not happy about it model['bd'] = bias_init_vector.reshape(1, bias_init_vector.size) print 'model init done.' print 'model has keys: ' + ', '.join(model.keys()) print 'updating: ' + ', '.join( '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['update']) print 'regularizing: ' + ', '.join( '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['regularize']) print 'number of learnable parameters total: %d' % (sum(model[k].shape[0] * model[k].shape[1] for k in misc['update']), ) if params.get('init_model_from', ''): # load checkpoint checkpoint = pickle.load(open(params['init_model_from'], 'rb')) model = checkpoint['model'] # overwrite the model # initialize the Solver and the cost function solver = Solver() def costfun(batch, model): # wrap the cost function to abstract some things away from the Solver return RNNGenCost(batch, model, params, misc) # calculate how many iterations we need num_sentences_total = dp.getSplitSize('train', ofwhat = 'sentences') num_iters_one_epoch = num_sentences_total / batch_size max_iters = max_epochs * num_iters_one_epoch eval_period_in_epochs = params['eval_period'] eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs)) abort = False top_val_ppl2 = -1 smooth_train_ppl2 = len(misc['ixtoword']) # initially size of dictionary of confusion val_ppl2 = len(misc['ixtoword']) last_status_write_time = 0 # for writing worker job status reports json_worker_status = {} json_worker_status['params'] = params json_worker_status['history'] = [] import csv csvfile = open(os.path.join(params['outdir'],params['generator']+'.csv'),'wb') csvout = csv.writer(csvfile,delimiter=',',quotechar='"') csv_val_file = open(os.path.join(params['outdir'],params['generator']+'_val.csv'),'wb') csv_val_out = csv.writer(csv_val_file,delimiter=',',quotechar='"')
for it in xrange(max_iters): if abort: break t0 = time.time() # fetch a batch of data batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)] # evaluate cost, gradient and perform parameter update step_struct = solver.step(batch, model, costfun, **params) cost = step_struct['cost'] dt = time.time() - t0 # print training statistics train_ppl2 = step_struct['stats']['ppl2'] smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2 # smooth exponentially decaying moving average if it == 0: smooth_train_ppl2 = train_ppl2 # start out where we start out epoch = it * 1.0 / num_iters_one_epoch print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \ % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'], \ train_ppl2, smooth_train_ppl2) csvout.writerow([it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'],train_ppl2, smooth_train_ppl2]) csvfile.flush() if host != 'oliver-Aurora-R4': sys.stdout.flush() # os.system('./update_plots.sh') # perform gradient check if desired, with a bit of a burnin time (10 iterations) if it == 10 and do_grad_check: print 'disabling dropout for gradient check...' params['drop_prob_encoder'] = 0 params['drop_prob_decoder'] = 0 solver.gradCheck(batch, model, costfun) print 'done gradcheck, exiting.' sys.exit() # hmmm. probably should exit here # detect if loss is exploding and kill the job if so total_cost = cost['total_cost'] if it == 0: total_cost0 = total_cost # store this initial cost if total_cost > total_cost0 * 2: print 'Aborting, cost seems to be exploding. Run gradcheck? Lower the learning rate?' abort = True # set the abort flag, we'll break out # logging: write JSON files for visual inspection of the training tnow = time.time() if tnow > last_status_write_time + 60*1: # every now and then let's write a report last_status_write_time = tnow jstatus = {} jstatus['time'] = datetime.datetime.now().isoformat() jstatus['iter'] = (it, max_iters) jstatus['epoch'] = (epoch, max_epochs) jstatus['time_per_batch'] = dt jstatus['smooth_train_ppl2'] = smooth_train_ppl2 jstatus['val_ppl2'] = val_ppl2 # just write the last available one jstatus['train_ppl2'] = train_ppl2 json_worker_status['history'].append(jstatus) status_file = os.path.join(params['worker_status_output_directory'], host + '_status.json') try: json.dump(json_worker_status, open(status_file, 'w')) except Exception, e: # todo be more clever here print 'tried to write worker status into %s but got error:' % (status_file, ) print e
# perform perplexity evaluation on the validation set and save a model checkpoint if it's good is_last_iter = (it+1) == max_iters if (((it+1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter: val_ppl2 = eval_split('val', dp, model, params, misc) # perform the evaluation on VAL set print 'validation perplexity = %f' % (val_ppl2, ) cp_pred = {} cp_pred['it'] = it cp_pred['epoch'] = epoch cp_pred['model'] = model cp_pred['params'] = params cp_pred['perplexity'] = val_ppl2 cp_pred['wordtoix'] = misc['wordtoix'] cp_pred['ixtoword'] = misc['ixtoword'] cp_pred['algorithm'] = params['generator'] cp_pred['outdir'] = params['outdir'] if is_last_iter: scores = eval_sentence_predictions.run(cp_pred) csv_val_out.writerow([it, max_iters, dt, epoch, val_ppl2, scores[0],scores[1],scores[2],scores[3],scores[4],scores[5],scores[6]]) csv_val_file.flush() omail.send('job finished'+params['generator'],'done') # abort training if the perplexity is no good min_ppl_or_abort = params['min_ppl_or_abort'] if val_ppl2 > min_ppl_or_abort and min_ppl_or_abort > 0: print 'aborting job because validation perplexity %f > %f' % (val_ppl2, min_ppl_or_abort) abort = True # abort the job write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold'] if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0: if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0: # if we beat a previous record or if this is the first time # AND we also beat the user-defined threshold or it doesn't exist top_val_ppl2 = val_ppl2 filename = 'model_%s_checkpoint_%s_%s_%s_%.2f.p' % (params['generator'],dataset, host, params['fappend'], val_ppl2) filepath = os.path.join(params['outdir'], filename) checkpoint = {} checkpoint['it'] = it checkpoint['epoch'] = epoch checkpoint['model'] = model checkpoint['params'] = params checkpoint['perplexity'] = val_ppl2 checkpoint['wordtoix'] = misc['wordtoix'] checkpoint['ixtoword'] = misc['ixtoword'] checkpoint['algorithm'] = params['generator'] checkpoint['outdir'] = params['outdir'] try: pickle.dump(checkpoint, open(filepath, "wb")) print 'saved checkpoint in %s' % (filepath, ) except Exception, e: # todo be more clever here print 'tried to write checkpoint into %s but got error: ' % (filepath, ) print e scores = eval_sentence_predictions.run(checkpoint) csv_val_out.writerow([it, max_iters, dt, epoch, val_ppl2, scores[0],scores[1],scores[2],scores[3],scores[4],scores[5],scores[6]]) csv_val_file.flush()
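# Aside: the smoothed perplexity above is a plain exponential moving average.
# A minimal, self-contained sketch of the same update rule (names here are
# illustrative, not part of this codebase):
def ema_update(smooth, x, decay=0.99):
    # the first observation seeds the average, mirroring the `it == 0` branch above
    return x if smooth is None else decay * smooth + (1.0 - decay) * x

smooth = None
for x in [120.0, 80.0, 60.0]:
    smooth = ema_update(smooth, x)
    print(smooth)  # 120.0, then 119.6, then 119.004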
def main(params, splitno, model_file): checkpoint_path = model_file max_blocks = params['max_blocks'] print 'loading checkpoint %s' % (checkpoint_path, ) checkpoint = pickle.load(open(checkpoint_path, 'rb')) checkpoint_params = checkpoint['params'] dataset = checkpoint_params['dataset'] feature_file = checkpoint_params['feature_file'] json_file = checkpoint['json_file'] model = checkpoint['model'] # fetch the data provider dp = getDataProvider(dataset, feature_file, json_file) misc = {} misc['classtoix'] = checkpoint['classtoix'] ixtoword = checkpoint['ixtoclass'] blob = { } # output blob which we will dump to JSON for visualizing the results blob['params'] = params blob['checkpoint_params'] = checkpoint_params blob['imgblobs'] = [] # iterate over all videos in test set and predict class labels BatchGenerator = decodeGenerator(checkpoint_params) n = 0 correct = 0 prev_video_name = '' video_block_count = 0 pred_video_label = [] pred_video_lbl = 0 prev_gt_video_label = 0 label_check = False video_count = 0 stat = [] v_data = {} result = {} for img in dp.iterImagesContext(split='test', max_images=max_blocks): n += 1 print 'clip %d/%d:' % (n, max_blocks) gt_video_label = img['sentences'][0]['tokens'][0] current_video_name = img['filename'] Ys = BatchGenerator.predict([{'image': img}], model, checkpoint_params) pred_frame_labels = np.argmax(Ys[0], axis=1) current_pred_video_label = max_occurrences(pred_frame_labels)[0] # impl based on action recog using visual attn paper - http://arxiv.org/abs/1511.04119 if current_video_name == prev_video_name or n == 1: pred_video_label.append(current_pred_video_label) video_block_count += 1 prev_gt_video_label = gt_video_label prev_video_name = current_video_name label_check = False else: pred_video_lbl = max_occurrences(pred_video_label)[0] if pred_video_lbl == prev_gt_video_label: correct = correct + 1 v_data['video_name'] = prev_video_name v_data['gt_label'] = prev_gt_video_label v_data['pred_label'] = int(pred_video_lbl) stat.append(v_data) v_data = {} pred_video_label = [] video_block_count = 0 label_check = True video_count += 1 # process current video block pred_video_label.append(current_pred_video_label) prev_video_name = current_video_name video_block_count += 1 prev_gt_video_label = gt_video_label if label_check == False: # last block of videos video_count += 1 pred_video_lbl = max_occurrences(pred_video_label)[0] if pred_video_lbl == prev_gt_video_label: correct = correct + 1 v_data['video_name'] = prev_video_name v_data['gt_label'] = prev_gt_video_label v_data['pred_label'] = int(pred_video_lbl) stat.append(v_data) json.dump(stat, open("./status/mmdb_stat_split_%d.json" % (splitno), 'a')) accuracy = correct / float(video_count) result['split'] = splitno result['accuracy'] = accuracy json.dump( result, open("./status/mmdb_split_result_split_%d.json" % (splitno), 'a')) return accuracy
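# Aside: max_occurrences() is called above but not defined in this file; judging
# from the call sites it returns a (label, count) tuple for the most frequent
# element. A minimal majority-vote sketch consistent with that usage:
from collections import Counter

def max_occurrences(labels):
    # most_common(1) yields [(label, count)] for the most frequent element
    return Counter(labels).most_common(1)[0]

# e.g. max_occurrences([3, 1, 3, 2, 3]) == (3, 3), so [0] gives the label 3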
def hold_comittee_discussion(params, com_dataset): n_memb = com_dataset['n_memb'] n_sent = com_dataset['n_sent'] n_imgs = len(com_dataset['images']) eval_array = np.zeros((n_memb, n_imgs * n_sent)) model_id = 0 for mod in com_dataset['members_model']: checkpoint = pickle.load(open(mod, 'rb')) checkpoint_params = checkpoint['params'] dataset = checkpoint_params['dataset'] model_npy = checkpoint['model'] checkpoint_params['use_theano'] = 1 if 'image_feat_size' not in checkpoint_params: checkpoint_params['image_feat_size'] = 4096 checkpoint_params['data_file'] = params['jsonFname'].rsplit('/')[-1] dp = getDataProvider(checkpoint_params) ixtoword = checkpoint['ixtoword'] blob = { } # output blob which we will dump to JSON for visualizing the results blob['params'] = params blob['checkpoint_params'] = checkpoint_params blob['imgblobs'] = [] # iterate over all images in test set and predict sentences BatchGenerator = decodeGenerator(checkpoint_params) BatchGenerator.build_eval_other_sent(BatchGenerator.model_th, checkpoint_params, model_npy) eval_batch_size = params.get('eval_batch_size', 100) eval_max_images = params.get('eval_max_images', -1) wordtoix = checkpoint['wordtoix'] split = 'test' print 'evaluating %s performance in batches of %d' % (split, eval_batch_size) logppl = 0 logppln = 0 nsent = 0 gen_fprop = BatchGenerator.f_eval_other blob['params'] = params c_id = 0 for batch in dp.iterImageSentencePairBatch( split=split, max_batch_size=eval_batch_size, max_images=eval_max_images): xWd, xId, maskd, lenS = dp.prepare_data(batch, wordtoix) eval_array[model_id, c_id:c_id + xWd.shape[1]] = gen_fprop(xWd, xId, maskd) c_id += xWd.shape[1] model_id += 1 # Calculate oracle scores bleu_array = eval_bleu_all_cand(params, com_dataset) eval_results = {} eval_results['logProb_feat'] = eval_array eval_results['OracleBleu'] = bleu_array #Save the mutual evaluations params['comResFname'] = 'committee_evalSc_%s.json' % (params['fappend']) com_dataset['com_evaluation'] = params['comResFname'] pickle.dump(eval_results, open(params['comResFname'], "wb")) json.dump(com_dataset, open(params['jsonFname'], 'w')) return eval_array
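# Aside: a sketch of reading back the two files written above. The committee
# JSON filename is whatever params['jsonFname'] was at run time, so the name
# below is a placeholder; note the evaluation file is a pickle despite its
# .json-style name.
import json, pickle
com = json.load(open('committee_dataset.json'))            # placeholder path
scores = pickle.load(open(com['com_evaluation'], 'rb'))
# logProb_feat has shape (n_memb, n_imgs * n_sent): one score per member per candidate
print(scores['logProb_feat'].shape)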
def main(params): # load the checkpoint checkpoint_path = params['checkpoint_path'] max_images = params['max_images'] print 'loading checkpoint %s' % (checkpoint_path, ) checkpoint = pickle.load(open(checkpoint_path, 'rb')) checkpoint_params = checkpoint['params'] dataset = checkpoint_params['dataset'] model = checkpoint['model'] # fetch the data provider dp = getDataProvider(dataset) misc = {} misc['wordtoix'] = checkpoint['wordtoix'] ixtoword = checkpoint['ixtoword'] blob = { } # output blob which we will dump to JSON for visualizing the results blob['params'] = params blob['checkpoint_params'] = checkpoint_params blob['imgblobs'] = [] # iterate over all images in test set and predict sentences BatchGenerator = decodeGenerator(checkpoint_params) n = 0 all_references = [] all_candidates = [] for img in dp.iterImages(split='test', max_images=max_images): n += 1 print 'image %d/%d:' % (n, max_images) references = [' '.join(x['tokens']) for x in img['sentences']] # as list of lists of tokens kwparams = {'beam_size': params['beam_size']} Ys = BatchGenerator.predict([{ 'image': img }], model, checkpoint_params, **kwparams) img_blob = {} # we will build this up img_blob['img_path'] = img['local_file_path'] img_blob['imgid'] = img['imgid'] # encode the human-provided references img_blob['references'] = [] for gtsent in references: print 'GT: ' + gtsent img_blob['references'].append({'text': gtsent}) # now evaluate and encode the top prediction top_predictions = Ys[ 0] # take predictions for the first (and only) image we passed in top_prediction = top_predictions[ 0] # these are sorted with highest on top candidate = ' '.join([ ixtoword[ix] for ix in top_prediction[1] if ix > 0 ]) # ix 0 is the END token, skip that print 'PRED: (%f) %s' % (top_prediction[0], candidate) # save for later eval all_references.append(references) all_candidates.append(candidate) img_blob['candidate'] = { 'text': candidate, 'logprob': top_prediction[0] } blob['imgblobs'].append(img_blob) # use perl script to eval BLEU score for fair comparison to other research work # first write intermediate files print 'writing intermediate files into eval/' open('eval/output', 'w').write('\n'.join(all_candidates)) for q in xrange(5): open('eval/reference' + ` q `, 'w').write('\n'.join([x[q] for x in all_references])) # invoke the perl script to get BLEU scores print 'invoking eval/multi-bleu.perl script...' owd = os.getcwd() os.chdir('eval') os.system('./multi-bleu.perl reference < output') os.chdir(owd) # now also evaluate test split perplexity gtppl = eval_split('test', dp, model, checkpoint_params, misc, eval_max_images=max_images) print 'perplexity of ground truth words based on dictionary of %d words: %f' % ( len(ixtoword), gtppl) blob['gtppl'] = gtppl # dump result struct to file print 'saving result struct to %s' % (params['result_struct_filename'], ) json.dump(blob, open(params['result_struct_filename'], 'w'))
def main(params): # load the checkpoint checkpoint_path = params['checkpoint_path'] max_images = params['max_images'] gt_dataset = params['gt_dataset'] print 'loading checkpoint %s' % (checkpoint_path, ) checkpoint = pickle.load(open(checkpoint_path, 'rb')) checkpoint_params = checkpoint['params'] dataset = checkpoint_params['dataset'] model = checkpoint['model'] dump_folder = params['dump_folder'] rootpath = '/home/lgp105b/xirong/VisualSearch' collection = 'flickr8k' version = 'baidu' fout = open(os.path.join(rootpath,collection,'SimilarityIndex','test_sent','%s.top20.sentid.txt'%dataset),'w') fout_s = open(os.path.join(rootpath,collection,'SimilarityIndex','test_sent','%s.top20.sentid.score.txt'%dataset),'w') if dump_folder: print 'creating dump folder ' + dump_folder os.system('mkdir -p ' + dump_folder) misc = {} misc['wordtoix'] = checkpoint['wordtoix'] print "len(misc['wordtoix']):",len(misc['wordtoix']) ixtoword = checkpoint['ixtoword'] # get the groundtruth sentences encoded in the model-dataset's chvob vob2idx = chinese_vob_idx(rootpath,collection,version) testset_filename = os.path.join(rootpath,collection,'Annotation','test_dataset.txt') test_ids = [x.strip() for x in open(testset_filename).readlines()] gt_filename = os.path.join(rootpath,collection,'seg.Flickr8k.token.Chinese.txt') testid2sentences = {} input_data = map(str.strip, open(gt_filename).readlines()) input_data = [x.decode('utf-8', 'ignore') for x in input_data] input_data = [x for x in input_data if x.split()[0][:-2] in test_ids] print len(input_data) # ignore a word if it is not in chvob or not in wordtoix (i.e. among the words occurring more often than the threshold) testid2sentences = encode_to_chvob(vob2idx, input_data) count_del = 0 for sid in testid2sentences.keys(): testid2sentences[sid] = [misc['wordtoix'][x] for x in testid2sentences[sid] if x in misc['wordtoix'].keys()] if len(testid2sentences[sid]) < 2: del testid2sentences[sid] count_del+=1 print '%d sentences cannot be encoded with misc[wordtoix]'%count_del ''' sentences = {} for img in dp.iterImages(split = 'test', max_images = max_images): filename = img['filename'] for sent in img['sentences']: sentid = sent['sentid'] sentences['%s#%s'%(filename,sentid)] = [misc['wordtoix'][x] if x in misc['wordtoix'].keys() else 0 for x in sent['tokens']] #references = [' '.join(x['tokens']) for x in img['sentences']] # as list of lists of tokens #sentences[filename] = [[int(x) if int(x) <= len(misc['wordtoix']) else 0 for x in sentence.split()] for sentence in references] '''
# fetch the data provider dp = getDataProvider(dataset) blob = {} # output blob which we will dump to JSON for visualizing the results blob['params'] = params blob['checkpoint_params'] = checkpoint_params blob['imgblobs'] = [] # iterate over all images in test set and predict sentences BatchGenerator = decodeGenerator(checkpoint_params) n = 0 all_references = [] all_candidates = [] for img in dp.iterImages(split = 'test', max_images = max_images): n+=1 filename = img['filename'] print 'image %d/%d:%s' % (n, max_images,filename) #references = [' '.join(x['tokens']) for x in img['sentences']] # as list of lists of tokens #sentences = [[int(x) if x <= len(misc['wordtoix']) else 0 for x in sentence.split()] for sentence in references] #print sentences kwparams = { 'beam_size' : params['beam_size'] } top_sentences = BatchGenerator.sentence_relevance([{'image':img}], model, checkpoint_params, testid2sentences, **kwparams) fout.write('%s '%filename) output_line = '%s '%filename for x in top_sentences: for elem in x: fout.write('%s '%elem[0]) output_line += '%s %s '%(elem[0],elem[1]) output_line += '\n' print output_line fout.write('\n') fout_s.write(output_line) fout_s.flush() fout.close() fout_s.close()
def main(scriptparams): checkpoint = pickle.load(open(scriptparams['checkpoint'], 'rb')) npfilename = osp.join('scorelogs', osp.basename(scriptparams['checkpoint']).split('.')[0] + '_logprob%s' % (scriptparams['split'])) misc = checkpoint['misc'] # fetch the data provider params = checkpoint['params'] params['use_gumbel_mse'] = 0 params['maxlen'] = scriptparams['maxlen'] dp = getDataProvider(params) model_init_gen_from = checkpoint.get('model', {}) if 'model' in checkpoint else checkpoint['modelGen'] lstmGenerator = decodeGenerator(params) model, misc['update'], misc['regularize'] = (lstmGenerator.model_th, lstmGenerator.update_list, lstmGenerator.regularize) if params.get('use_encoder_for', 0) & 1: if params.get('encode_gt_sentences', 0): xI = tensor.zeros((batch_size, params['image_encoding_size'])) imgFeatEnc_inp = [] else: imgFeatEncoder = RecurrentFeatEncoder(params['image_feat_size'], params['word_encoding_size'], params, mdl_prefix='img_enc_', features=dp.features.T) mdlLen = len(model.keys()) model.update(imgFeatEncoder.model_th) assert (len(model.keys()) == (mdlLen + len(imgFeatEncoder.model_th.keys()))) misc['update'].extend(imgFeatEncoder.update_list) misc['regularize'].extend(imgFeatEncoder.regularize) (imgenc_use_dropout, imgFeatEnc_inp, xI, updatesLSTMImgFeat) = imgFeatEncoder.build_model(model, params) else: xI = None imgFeatEnc_inp = [] if params.get('use_encoder_for', 0) & 2: aux_enc_inp = model['Wemb'] if params.get('encode_gt_sentences', 0) else dp.aux_inputs.T hid_size = params['featenc_hidden_size'] auxFeatEncoder = RecurrentFeatEncoder(hid_size, params['image_encoding_size'], params, mdl_prefix='aux_enc_', features=aux_enc_inp) mdlLen = len(model.keys()) model.update(auxFeatEncoder.model_th) assert (len(model.keys()) == (mdlLen + len(auxFeatEncoder.model_th.keys()))) misc['update'].extend(auxFeatEncoder.update_list) misc['regularize'].extend(auxFeatEncoder.regularize) (auxenc_use_dropout, auxFeatEnc_inp, xAux, updatesLSTMAuxFeat) = auxFeatEncoder.build_model(model, params) if params.get('encode_gt_sentences', 0): # Reshape it to size (batch_size, n_gt, hidden_size) xAux = xAux.reshape((-1, params['n_encgt_sent'], params['featenc_hidden_size'])) # Convert it to size (batch_size, n_gt*hidden_size) xAux = xAux.flatten(2) else: auxFeatEnc_inp = [] xAux = None attn_nw_func = None (use_dropout, inp_list_gen, f_pred_prob, cost, predTh, updatesLSTM) = lstmGenerator.build_model(model, params, xI, xAux, attn_nw=attn_nw_func) inp_list = imgFeatEnc_inp + auxFeatEnc_inp + inp_list_gen f_eval = theano.function(inp_list, cost, name='f_eval') #--------------------------------- Cost function and gradient computations setup #---------------------------------# zipp(model_init_gen_from, model)
# perform the evaluation on VAL set #val_sc = eval_split_theano(scriptparams['split'], dp, model, params, misc, f_eval) logppl = [] logppln = [] imgids = [] nsent = 0 for batch in dp.iterImageSentencePairBatch(split=scriptparams['split'], max_batch_size=1, max_images=-1): enc_inp_list = prepare_seq_features(batch, use_enc_for=params.get('use_encoder_for', 0), maxlen=params['maxlen'], use_shared_mem=params.get('use_shared_mem_enc', 0), enc_gt_sent=params.get('encode_gt_sentences', 0), n_enc_sent=params.get('n_encgt_sent', 0), wordtoix=misc['wordtoix']) gen_inp_list, lenS = prepare_data(batch, misc['wordtoix'], rev_sents=params.get('reverse_sentence', 0), use_enc_for=params.get('use_encoder_for', 0), use_unk_token=params.get('use_unk_token', 0)) inp_list = enc_inp_list + gen_inp_list cost = f_eval(*inp_list) logppl.append(cost[1]) logppln.append(lenS) imgids.append(str(batch[0]['image']['cocoid']) + '_' + str(batch[0]['sentidx'])) nsent += 1 perplex = 2**(np.array(logppl) / np.array(logppln)) np.savez(npfilename, pplx=perplex, keys=np.array(imgids)) #ppl2 = 2 ** (logppl / logppln) #print 'evaluated %d sentences and got perplexity = %f' % (nsent, ppl2) #met = [ppl2] print 2**(np.array(logppl).sum() / np.array(logppln).sum())
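# Aside: the final print above is the corpus-level perplexity,
# 2 ** (total log loss / total token count). A tiny worked example with
# made-up numbers:
import numpy as np
logppl = np.array([6.0, 10.0])   # per-sentence summed log2 losses (illustrative)
logppln = np.array([3, 5])       # per-sentence token counts
print(2 ** (logppl / logppln))                     # per-sentence: [4. 4.]
print(2 ** (logppl.sum() / float(logppln.sum())))  # corpus: 2**(16/8) = 4.0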
def main(params): word_count_threshold = params['word_count_threshold'] max_epochs = params['max_epochs'] host = socket.gethostname() # get computer hostname # fetch the data provider dp = getDataProvider(params) # Initialize the optimizer solver = Solver(params['solver']) params['image_feat_size'] = dp.img_feat_size params['aux_inp_size'] = dp.aux_inp_size misc = {} # stores various misc items that need to be passed around the framework # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur # at least word_count_threshold number of times misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(dp.iterSentences('train'), word_count_threshold) if params['fine_tune'] == 1: params['mode'] = 'multi_choice_mode' if params['mc_mode'] == 1 else 'multimodal_lstm' if params['checkpoint_file_name'] != None: # checkpoint_init is assumed to be loaded at module scope when resuming #params['batch_size'] = dp.dataset['batchsize'] misc['wordtoix'] = checkpoint_init['wordtoix'] misc['ixtoword'] = checkpoint_init['ixtoword'] batch_size = 1 num_sentences_total = dp.getSplitSize('train', ofwhat='images') else: params['mode'] = 'batchtrain' batch_size = params['batch_size'] num_sentences_total = dp.getSplitSize('train', ofwhat='sentences') params['vocabulary_size'] = len(misc['wordtoix']) pos_samp = np.arange(batch_size, dtype=np.int32) # This initializes the model parameters and does matrix initializations evalModel = decodeEvaluator(params) model, misc['update'], misc['regularize'] = (evalModel.model_th, evalModel.updateP, evalModel.regularize) #----------------- If we are using feature encoders ----------------------- if params['use_encoder_for'] & 1: imgFeatEncoder = RecurrentFeatEncoder(params['image_feat_size'], params['sent_encoding_size'], params, mdl_prefix='img_enc_', features=dp.features.T) mdlLen = len(model.keys()) model.update(imgFeatEncoder.model_th) assert (len(model.keys()) == (mdlLen + len(imgFeatEncoder.model_th.keys()))) #misc['update'].extend(imgFeatEncoder.update_list) misc['regularize'].extend(imgFeatEncoder.regularize) (imgenc_use_dropout, imgFeatEnc_inp, xI, updatesLSTMImgFeat) = imgFeatEncoder.build_model(model, params) else: xI = None imgFeatEnc_inp = [] # Define the computational graph for relating the input image features and word indices to the # log probability cost function. (use_dropout, inp_list_eval, miscOuts, cost, predTh, model) = evalModel.build_model(model, params, xI=xI, prior_inp_list=imgFeatEnc_inp) inp_list = imgFeatEnc_inp + inp_list_eval # Compile an evaluation function.. Doesn't include gradients # To be used for validation set evaluation f_eval = theano.function(inp_list, cost, name='f_eval')
# Add the regularization cost. Since this is specific to training and doesn't get included when we # evaluate the cost on test or validation data, we leave it here outside the model definition if params['regc'] > 0.: reg_cost = theano.shared(numpy_floatX(0.), name='reg_c') reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']), name='reg_c') for p in misc['regularize']: reg_cost += (model[p]**2).sum() reg_cost *= 0.5 * reg_c cost[0] += (reg_cost / params['batch_size']) # Now let's build a gradient computation graph and rmsprop update mechanism grads = tensor.grad(cost[0], wrt=model.values()) lr = tensor.scalar(name='lr', dtype=config.floatX) if params['sim_minibatch'] > 0: f_grad_accum, f_clr, ag = solver.accumGrads(model, grads, inp_list, cost, params['sim_minibatch']) f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(lr, model, ag, inp_list, cost, params) else: f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(lr, model, grads, inp_list, cost, params) print 'model init done.' print 'model has keys: ' + ', '.join(model.keys()) # calculate how many iterations we need, One epoch is considered once going through all the sentences and not images # Hence in case of coco/flickr this will be 5x the no of images num_iters_one_epoch = num_sentences_total / batch_size max_iters = max_epochs * num_iters_one_epoch inner_loop = params['sim_minibatch'] if params['sim_minibatch'] > 0 else 1 max_iters = max_iters / inner_loop eval_period_in_epochs = params['eval_period'] eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs / inner_loop)) top_val_ppl2 = -1 smooth_train_cost = len(misc['ixtoword']) # initially size of dictionary of confusion smooth_error_rate = 100. error_rate = 0. prev_it = -1 val_ppl2 = len(misc['ixtoword']) last_status_write_time = 0 # for writing worker job status reports json_worker_status = {} json_worker_status['params'] = params json_worker_status['history'] = [] len_hist = defaultdict(int) ## Initialize the model parameters from the checkpoint file if we are resuming training ## (model_init_from, rg_init and checkpoint_init are assumed to be loaded at module scope) if params['checkpoint_file_name'] != None: zipp(model_init_from, model) zipp(rg_init, rg) print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" % (checkpoint_init['epoch'], \ checkpoint_init['perplexity'])) elif params['init_from_imagernn'] != None: # Initialize word vecs and image emb from generative model file rnnCv = pickle.load(open(params['init_from_imagernn'], 'rb')) model['Wemb'].set_value(rnnCv['model']['Wemb']) model['WIemb'].set_value(rnnCv['model']['WIemb_aux']) misc['wordtoix'] = rnnCv['wordtoix'] misc['ixtoword'] = rnnCv['ixtoword'] print("\n Initialized Word embedding and Image embeddings from gen model %s" % (params['init_from_imagernn'])) write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold'] use_dropout.set_value(1.)
#################### Main Loop ############################################ for it in xrange(max_iters): t0 = time.time() if params['use_encoder_for'] & 1: imgenc_use_dropout.set_value(float(params['use_dropout'])) # fetch a batch of data cost_inner = np.zeros((inner_loop, ), dtype=np.float32) if params['sim_minibatch'] > 0: for i_l in xrange(inner_loop): batch, pos_samp_sent = dp.sampPosNegSentSamps(params['batch_size'], params['mode'], thresh=0.3) eval_inp_list, lenS = prepare_data(batch, misc['wordtoix'], maxlen=params['maxlen'], pos_samp=pos_samp, prep_for=params['eval_model'], use_enc_for=params['use_encoder_for']) if params['fine_tune'] == 1: eval_inp_list.append(pos_samp_sent) cost_inner[i_l] = f_grad_accum(*eval_inp_list) else: batch, pos_samp_sent = dp.sampPosNegSentSamps(params['batch_size'], params['mode'], thresh=0.3) enc_inp_list = prepare_seq_features(batch, use_enc_for=params['use_encoder_for'], use_shared_mem=params['use_shared_mem_enc']) eval_inp_list, lenS = prepare_data(batch, misc['wordtoix'], maxlen=params['maxlen'], pos_samp=pos_samp, prep_for=params['eval_model'], use_enc_for=params['use_encoder_for']) if params['fine_tune'] == 1: eval_inp_list.append(pos_samp_sent) real_inp_list = enc_inp_list + eval_inp_list # Enable using dropout in training cost = f_grad_shared(*real_inp_list) f_update(params['learning_rate']) dt = time.time() - t0 # Reset accumulated gradients to 0 if params['sim_minibatch'] > 0: f_clr() #print 'model: ' + ' '.join([str(np.isnan(model[m].get_value()).any()) for m in model]) #print 'rg: ' +' '.join([str(np.isnan(rg[i].get_value()).any()) for i in xrange(len(rg))]) #print 'zg: ' + ' '.join([str(np.isnan(zg[i].get_value()).any()) for i in xrange(len(zg))]) #print 'ud: ' + ' '.join([str(np.isnan(ud[i].get_value()).any()) for i in xrange(len(ud))]) #import pdb; pdb.set_trace() #print 'udAft: ' + ' '.join([str(np.isnan(ud[i].get_value()).any()) for i in xrange(len(ud))]) # print training statistics epoch = it * inner_loop * 1.0 / num_iters_one_epoch total_cost = (np.e**(-cost[0]) + (np.e**(-cost_inner)).sum() * (params['sim_minibatch'] > 0)) / (1 + params['sim_minibatch']) #print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \ # % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'], \ # train_ppl2, smooth_train_cost) if it == 0: smooth_train_cost = total_cost else: smooth_train_cost = 0.99 * smooth_train_cost + 0.01 * total_cost error_rate += 100.0 * float((cost[2] < 0.).sum()) / batch_size margin_strength = cost[2].sum() smooth_error_rate = 0.99 * smooth_error_rate + 0.01 * 100.0 * (float(cost[1]) / batch_size) if it > 0 else 100.0 * (float(cost[1]) / batch_size) tnow = time.time() if tnow > last_status_write_time + 60 * 1: # every now and then let's write a report print '%d/%d batch done in %.3fs. at epoch %.2f. Prob now is %.4f, Error '\ 'rate is %.3f%%, Margin %.2f, negMarg=%.2f' % (it, max_iters, dt, \ epoch, smooth_train_cost, smooth_error_rate, margin_strength, error_rate/(it-prev_it)) error_rate = 0.
prev_it = it last_status_write_time = tnow jstatus = {} jstatus['time'] = datetime.datetime.now().isoformat() jstatus['iter'] = (it, max_iters) jstatus['epoch'] = (epoch, max_epochs) jstatus['time_per_batch'] = dt jstatus['val_ppl2'] = val_ppl2 # just write the last available one json_worker_status['history'].append(jstatus) status_file = os.path.join(params['worker_status_output_directory'], host + '_status.json') #import pdb; pdb.set_trace() try: json.dump(json_worker_status, open(status_file, 'w')) except Exception, e: # todo be more clever here print 'tried to write worker status into %s but got error:' % (status_file, ) print e ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good is_last_iter = (it + 1) == max_iters if (((it + 1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter: # Disable using dropout in validation use_dropout.set_value(0.) if params['use_encoder_for'] & 1: imgenc_use_dropout.set_value(0.) val_ppl2 = eval_split_theano('val', dp, model, params, misc, f_eval) # perform the evaluation on VAL set if epoch - params['lr_decay_st_epoch'] >= 0: params['learning_rate'] = params['learning_rate'] * params['lr_decay'] params['lr_decay_st_epoch'] += 1 print 'validation perplexity = %f, lr = %f' % (val_ppl2, params['learning_rate']) #if params['sample_by_len'] == 1: # print len_hist if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0: if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0: # if we beat a previous record or if this is the first time # AND we also beat the user-defined threshold or it doesn't exist top_val_ppl2 = val_ppl2 filename = '%s_checkpoint_%s_%s_%s_%.2f_%.2f.p' % (params['eval_model'], params['dataset'], host, params['fappend'], smooth_error_rate, val_ppl2) filepath = os.path.join(params['checkpoint_output_directory'], filename) model_npy = unzip(model) rgrads_npy = unzip(rg) checkpoint = {} checkpoint['it'] = it checkpoint['epoch'] = epoch checkpoint['model'] = model_npy checkpoint['rgrads'] = rgrads_npy checkpoint['params'] = params checkpoint['perplexity'] = val_ppl2 checkpoint['wordtoix'] = misc['wordtoix'] checkpoint['ixtoword'] = misc['ixtoword'] try: pickle.dump(checkpoint, open(filepath, "wb")) print 'saved checkpoint in %s' % (filepath, ) except Exception, e: # todo be more clever here print 'tried to write checkpoint into %s but got error: ' % (filepath, ) print e use_dropout.set_value(1.)
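# Aside: the error rate tracked in the loop above counts negative margins,
# i.e. batch elements where cost[2] < 0 because the positive sample failed to
# beat the negatives. A minimal numpy sketch of that bookkeeping with
# illustrative margins:
import numpy as np
margins = np.array([0.7, -0.2, 1.3, -0.1])   # one margin per batch element
batch_size = margins.size
error_rate = 100.0 * float((margins < 0.).sum()) / batch_size
print(error_rate)  # 50.0: two of the four margins are negative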
def main(params): batch_size = params["batch_size"] dataset = params["dataset"] word_count_threshold = params["word_count_threshold"] do_grad_check = params["do_grad_check"] max_epochs = params["max_epochs"] host = socket.gethostname() # get computer hostname # fetch the data provider dp = getDataProvider(dataset) misc = {} # stores various misc items that need to be passed around the framework # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur # at least word_count_threshold number of times misc["wordtoix"], misc["ixtoword"], bias_init_vector = preProBuildWordVocab( dp.iterSentences("train"), word_count_threshold ) # delegate the initialization of the model to the Generator class BatchGenerator = decodeGenerator(params) init_struct = BatchGenerator.init(params, misc) model, misc["update"], misc["regularize"] = (init_struct["model"], init_struct["update"], init_struct["regularize"]) # force overwrite here. This is a bit of a hack, not happy about it model["bd"] = bias_init_vector.reshape(1, bias_init_vector.size) print "model init done." print "model has keys: " + ", ".join(model.keys()) print "updating: " + ", ".join("%s [%dx%d]" % (k, model[k].shape[0], model[k].shape[1]) for k in misc["update"]) print "regularizing: " + ", ".join("%s [%dx%d]" % (k, model[k].shape[0], model[k].shape[1]) for k in misc["regularize"]) print "number of learnable parameters total: %d" % (sum(model[k].shape[0] * model[k].shape[1] for k in misc["update"]), ) if params.get("init_model_from", ""): # load checkpoint checkpoint = pickle.load(open(params["init_model_from"], "rb")) model = checkpoint["model"] # overwrite the model print checkpoint["model"] # initialize the Solver and the cost function solver = Solver() def costfun(batch, model): # wrap the cost function to abstract some things away from the Solver return RNNGenCost(batch, model, params, misc) # calculate how many iterations we need num_sentences_total = dp.getSplitSize("train", ofwhat="sentences") num_iters_one_epoch = num_sentences_total / batch_size max_iters = max_epochs * num_iters_one_epoch eval_period_in_epochs = params["eval_period"] eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs)) abort = False top_val_ppl2 = -1 smooth_train_ppl2 = len(misc["ixtoword"]) # initially size of dictionary of confusion val_ppl2 = len(misc["ixtoword"]) last_status_write_time = 0 # for writing worker job status reports json_worker_status = {} json_worker_status["params"] = params json_worker_status["history"] = [] for it in xrange(max_iters): if abort: break t0 = time.time() # fetch a batch of data batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)] # evaluate cost, gradient and perform parameter update step_struct = solver.step(batch, model, costfun, **params) cost = step_struct["cost"] dt = time.time() - t0 # print training statistics train_ppl2 = step_struct["stats"]["ppl2"] smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2 # smooth exponentially decaying moving average if it == 0: smooth_train_ppl2 = train_ppl2 # start out where we start out epoch = it * 1.0 / num_iters_one_epoch print "%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)" % ( it, max_iters, dt, epoch, cost["loss_cost"], cost["reg_cost"], train_ppl2, smooth_train_ppl2, ) # perform gradient check if desired, with a bit of a burnin time (10 iterations) if it == 10 and do_grad_check: print "disabling dropout for gradient check..."
params["drop_prob_encoder"] = 0 params["drop_prob_decoder"] = 0 solver.gradCheck(batch, model, costfun) print "done gradcheck, exiting." sys.exit() # hmmm. probably should exit here # detect if loss is exploding and kill the job if so total_cost = cost["total_cost"] if it == 0: total_cost0 = total_cost # store this initial cost if total_cost > total_cost0 * 2: print "Aborting, cost seems to be exploding. Run gradcheck? Lower the learning rate?" abort = True # set the abort flag, we'll break out # logging: write JSON files for visual inspection of the training tnow = time.time() if tnow > last_status_write_time + 60 * 1: # every now and then let's write a report last_status_write_time = tnow jstatus = {} jstatus["time"] = datetime.datetime.now().isoformat() jstatus["iter"] = (it, max_iters) jstatus["epoch"] = (epoch, max_epochs) jstatus["time_per_batch"] = dt jstatus["smooth_train_ppl2"] = smooth_train_ppl2 jstatus["val_ppl2"] = val_ppl2 # just write the last available one jstatus["train_ppl2"] = train_ppl2 json_worker_status["history"].append(jstatus) status_file = os.path.join(params["worker_status_output_directory"], host + "_status.json") try: json.dump(json_worker_status, open(status_file, "w")) except Exception, e: # todo be more clever here print "tried to write worker status into %s but got error:" % (status_file,) print e # perform perplexity evaluation on the validation set and save a model checkpoint if it's good is_last_iter = (it + 1) == max_iters if (((it + 1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter: val_ppl2 = eval_split("val", dp, model, params, misc) # perform the evaluation on VAL set print "validation perplexity = %f" % (val_ppl2,) # abort training if the perplexity is no good min_ppl_or_abort = params["min_ppl_or_abort"] if val_ppl2 > min_ppl_or_abort and min_ppl_or_abort > 0: print "aborting job because validation perplexity %f > %f" % (val_ppl2, min_ppl_or_abort) abort = True # abort the job write_checkpoint_ppl_threshold = params["write_checkpoint_ppl_threshold"] if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0: if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0: # if we beat a previous record or if this is the first time # AND we also beat the user-defined threshold or it doesn't exist top_val_ppl2 = val_ppl2 filename = "model_checkpoint_%s_%s_%s_%.2f.p" % (dataset, host, params["fappend"], val_ppl2) filepath = os.path.join(params["checkpoint_output_directory"], filename) checkpoint = {} checkpoint["it"] = it checkpoint["epoch"] = epoch checkpoint["model"] = model checkpoint["params"] = params checkpoint["perplexity"] = val_ppl2 checkpoint["wordtoix"] = misc["wordtoix"] checkpoint["ixtoword"] = misc["ixtoword"] try: pickle.dump(checkpoint, open(filepath, "wb")) print "saved checkpoint in %s" % (filepath,) except Exception, e: # todo be more clever here print "tried to write checkpoint into %s but got error: " % (filepath,) print e
def main(params): for resF in params['resFileList']: caps = json.load(open(resF, 'r')) dp = getDataProvider(caps['checkpoint_params']) trackMetargs = {'eval_metric': params['met_to_track']} refToks, scr_info = eval_prep_refs(params['split'], dp, params['met_to_track']) trackMetargs['refToks'] = refToks trackMetargs['scr_info'] = scr_info capsById = {} n_cands = params['keepN'] - 1 if params['keepN'] != None else None npfilename = osp.join( 'scorelogs', osp.basename(resF).split('.')[0] + '_all%s_pairwise_%d' % (params['met_to_track'][0], n_cands + 1)) n = 0 for img in caps['imgblobs']: imgid = int(img['img_path'].split('_')[-1].split('.')[0]) capsById[imgid] = [{ 'image_id': imgid, 'caption': img['candidate']['text'], 'id': n }] n += 1 capsById[imgid].extend([{ 'image_id': imgid, 'caption': cd['text'], 'id': n + j } for j, cd in enumerate(img['candidatelist'][:n_cands])]) if len(capsById[imgid]) < (n_cands + 1): capsById[imgid].extend([ capsById[imgid][-1] for _ in xrange(n_cands + 1 - len(capsById[imgid])) ]) n += len(capsById[imgid]) - 1 n_caps_perimg = len(capsById[capsById.keys()[0]]) n_refs_perimg = len(refToks[refToks.keys()[0]]) capsById = trackMetargs['scr_info']['tokenizer'].tokenize(capsById) all_scrs = [] eval_metric = trackMetargs.get('eval_metric', 'perplex') #met = [[] for i in xrange(len(eval_metric)) if eval_metric[i][:6] != 'lcldiv'] if params['rev_eval'] == 1: tempCont = capsById capsById = refToks refToks = tempCont temp_cnt = n_caps_perimg n_caps_perimg = n_refs_perimg n_refs_perimg = temp_cnt npfilename += '_reverse' met = np.zeros( (len(eval_metric), n_caps_perimg, n_refs_perimg, len(capsById))) for j in xrange(n_caps_perimg): candToks = {imgid: [capsById[imgid][j]] for imgid in capsById} for r in xrange(n_refs_perimg): refTokInp = { imgid: refToks[imgid][r:r + 1] for imgid in capsById } # Now invoke all the scorers and get the scores for i, evm in enumerate(eval_metric): score, scores = trackMetargs['scr_info']['scr_fn'][ i].compute_score(refTokInp, candToks) met[i, j, r, :] = scores[-1] if type(score) == list else scores #print 'evaluated %d sentences and got %s = %f' % (n, evm, met[-1]) np.savez(npfilename + '.npz', met=met, keys=refTokInp.keys()) # Compute some specific scores mean_max_scr = met[0, :, :, :].max(axis=1).mean() if met.shape[1] <= met.shape[2] and met.shape[1] > 1 and params[ 'keepN'] <= 10: perms = np.array( [c for c in permutations(xrange(met.shape[2]), met.shape[1])]) #Compute non-overlapping max-mean new_idx = np.concatenate([ perms[:, None, :], np.tile(np.arange(met.shape[1])[None, :], [perms.shape[0], 1])[:, None, :] ], axis=1) non_overlapping_scrs = met[0, new_idx[:, 0, :], new_idx[:, 1, :], :].sum(axis=1).max( axis=0).mean() / float(met.shape[1]) else: non_overlapping_scrs = 0. print 'mean %s is %.3f, mean-max is %.3f, non-overlapping mean-max is %.3f' % ( eval_metric[0], met.mean(), mean_max_scr, non_overlapping_scrs)
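# Aside: met above has shape (n_metrics, n_caps, n_refs, n_imgs), so the
# "mean-max" oracle is a max over references followed by a mean over
# candidates and images. A small sketch with random illustrative scores:
import numpy as np
met = np.random.rand(1, 3, 5, 10)                  # (1 metric, 3 caps, 5 refs, 10 imgs)
mean_max_scr = met[0, :, :, :].max(axis=1).mean()  # best reference per candidate, averaged
print(mean_max_scr)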
def main(params): batch_size = params['batch_size'] word_count_threshold = params['word_count_threshold'] max_epochs = params['max_epochs'] host = socket.gethostname() # get computer hostname # fetch the data provider dp = getDataProvider(params) params['aux_inp_size'] = dp.aux_inp_size params['image_feat_size'] = dp.img_feat_size print 'Image feature size is %d, and aux input size is %d' % (params['image_feat_size'], params['aux_inp_size']) misc = {} # stores various misc items that need to be passed around the framework # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur # at least word_count_threshold number of times misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(dp.iterSentences('train'), word_count_threshold) params['vocabulary_size'] = len(misc['wordtoix']) params['output_size'] = len(misc['ixtoword']) # these should match though params['use_dropout'] = 1 # This initializes the model parameters and does matrix initializations lstmGenerator = LSTMGenerator(params) model, misc['update'], misc['regularize'] = (lstmGenerator.model_th, lstmGenerator.update, lstmGenerator.regularize) # force overwrite here. The bias to the softmax is initialized to reflect word frequencies # This is a bit of a hack, not happy about it model['bd'].set_value(bias_init_vector.astype(config.floatX)) # Define the computational graph for relating the input image features and word indices to the # log probability cost function. (use_dropout, inp_list, f_pred_prob, cost, predTh, updatesLSTM) = lstmGenerator.build_model(model, params) # Add the regularization cost. Since this is specific to training and doesn't get included when we # evaluate the cost on test or validation data, we leave it here outside the model definition if params['regc'] > 0.: reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']), name='reg_c') reg_cost = 0. for p in misc['regularize']: reg_cost += (model[p]**2).sum() reg_cost *= 0.5 * reg_c cost[0] += (reg_cost / params['batch_size']) # Compile an evaluation function.. Doesn't include gradients # To be used for validation set evaluation f_eval = theano.function(inp_list, cost, name='f_eval') # Now let's build a gradient computation graph and rmsprop update mechanism grads = tensor.grad(cost[0], wrt=model.values()) lr = tensor.scalar(name='lr', dtype=config.floatX) f_grad_shared, f_update, zg, rg, ud = lstmGenerator.rmsprop(lr, model, grads, inp_list, cost, params) print 'model init done.'
print 'model has keys: ' + ', '.join(model.keys()) #print 'updating: ' + ', '.join( '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['update']) #print 'regularizing: ' + ', '.join( '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['regularize']) #print 'number of learnable parameters total: %d' % (sum(model[k].shape[0] * model[k].shape[1] for k in misc['update']), ) # calculate how many iterations we need, One epoch is considered once going through all the sentences and not images # Hence in case of coco/flickr this will be 5x the no of images num_sentences_total = dp.getSplitSize('train', ofwhat='sentences') num_iters_one_epoch = num_sentences_total / batch_size max_iters = max_epochs * num_iters_one_epoch eval_period_in_epochs = params['eval_period'] eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs)) top_val_ppl2 = -1 smooth_train_ppl2 = len(misc['ixtoword']) # initially size of dictionary of confusion val_ppl2 = len(misc['ixtoword']) last_status_write_time = 0 # for writing worker job status reports json_worker_status = {} json_worker_status['params'] = params json_worker_status['history'] = [] len_hist = defaultdict(int) ## Initialize the model parameters from the checkpoint file if we are resuming training ## (model_init_from, rg_init, checkpoint_init and sentTagMap are assumed to be loaded at module scope) if params['checkpoint_file_name'] != 'None': zipp(model_init_from, model) zipp(rg_init, rg) print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" % (checkpoint_init['epoch'], \ checkpoint_init['perplexity'])) for it in xrange(max_iters): t0 = time.time() # fetch a batch of data if params['sample_by_len'] == 0: batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)] else: batch, l = dp.getRandBatchByLen(batch_size) len_hist[l] += 1 if params['use_pos_tag'] != 'None': real_inp_list, lenS = prepare_data(batch, misc['wordtoix'], None, sentTagMap, misc['ixtoword']) else: real_inp_list, lenS = prepare_data(batch, misc['wordtoix']) # Enable using dropout in training use_dropout.set_value(1.) # evaluate cost, gradient and perform parameter update cost = f_grad_shared(*real_inp_list) f_update(params['learning_rate']) dt = time.time() - t0 # print training statistics train_ppl2 = (2**(cost[1] / lenS)) #step_struct['stats']['ppl2'] smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2 # smooth exponentially decaying moving average if it == 0: smooth_train_ppl2 = train_ppl2 # start out where we start out epoch = it * 1.0 / num_iters_one_epoch total_cost = cost[0] #print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \ # % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'], \ # train_ppl2, smooth_train_ppl2) tnow = time.time()
if tnow > last_status_write_time + 60 * 1: # every now and then let's write a report print '%d/%d batch done in %.3fs. at epoch %.2f. Cost now is %.3f and pplx is %.3f' % (it, max_iters, dt, \ epoch, total_cost, smooth_train_ppl2) last_status_write_time = tnow jstatus = {} jstatus['time'] = datetime.datetime.now().isoformat() jstatus['iter'] = (it, max_iters) jstatus['epoch'] = (epoch, max_epochs) jstatus['time_per_batch'] = dt jstatus['smooth_train_ppl2'] = smooth_train_ppl2 jstatus['val_ppl2'] = val_ppl2 # just write the last available one jstatus['train_ppl2'] = train_ppl2 json_worker_status['history'].append(jstatus) status_file = os.path.join(params['worker_status_output_directory'], host + '_status.json') #import pdb; pdb.set_trace() try: json.dump(json_worker_status, open(status_file, 'w')) except Exception, e: # todo be more clever here print 'tried to write worker status into %s but got error:' % (status_file, ) print e ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good is_last_iter = (it + 1) == max_iters if (((it + 1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter: # Disable using dropout in validation use_dropout.set_value(0.) val_ppl2 = eval_split_theano('val', dp, model, params, misc, f_eval) # perform the evaluation on VAL set if epoch - params['lr_decay_st_epoch'] >= 0: params['learning_rate'] = params['learning_rate'] * params['lr_decay'] params['lr_decay_st_epoch'] += 1 print 'validation perplexity = %f, lr = %f' % (val_ppl2, params['learning_rate']) if params['sample_by_len'] == 1: print len_hist write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold'] if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0: if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0: # if we beat a previous record or if this is the first time # AND we also beat the user-defined threshold or it doesn't exist top_val_ppl2 = val_ppl2 filename = 'model_checkpoint_%s_%s_%s_%.2f.p' % (params['dataset'], host, params['fappend'], val_ppl2) filepath = os.path.join(params['checkpoint_output_directory'], filename) model_npy = unzip(model) rgrads_npy = unzip(rg) checkpoint = {} checkpoint['it'] = it checkpoint['epoch'] = epoch checkpoint['model'] = model_npy checkpoint['rgrads'] = rgrads_npy checkpoint['params'] = params checkpoint['perplexity'] = val_ppl2 checkpoint['wordtoix'] = misc['wordtoix'] checkpoint['ixtoword'] = misc['ixtoword'] try: pickle.dump(checkpoint, open(filepath, "wb")) print 'saved checkpoint in %s' % (filepath, ) except Exception, e: # todo be more clever here print 'tried to write checkpoint into %s but got error: ' % (filepath, ) print e
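# Aside: lstmGenerator.rmsprop above compiles the theano update functions and
# is defined elsewhere; for reference, a standard numpy RMSProp step of the
# kind such solvers implement (a sketch, not this codebase's exact code):
import numpy as np

def rmsprop_step(w, grad, cache, lr=1e-3, decay=0.95, eps=1e-8):
    cache = decay * cache + (1 - decay) * grad ** 2   # running average of squared gradients
    w = w - lr * grad / (np.sqrt(cache) + eps)        # gradient step scaled by RMS magnitude
    return w, cache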
def main(params): batch_size = params['batch_size'] dataset = params['dataset'] word_count_threshold = params['word_count_threshold'] do_grad_check = params['do_grad_check'] max_epochs = params['max_epochs'] host = socket.gethostname() # get computer hostname params['mode'] = 'CPU' # fetch the data provider dp = getDataProvider(dataset) misc = {} # stores various misc items that need to be passed around the framework # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur # at least word_count_threshold number of times misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(dp.iterSentences('train'), word_count_threshold) # delegate the initialization of the model to the Generator class BatchGenerator = decodeGenerator(params) init_struct = BatchGenerator.init(params, misc) model, misc['update'], misc['regularize'] = (init_struct['model'], init_struct['update'], init_struct['regularize']) if params['mode'] == 'GPU': # force overwrite here. This is a bit of a hack, not happy about it model['bd'] = gp.garray(bias_init_vector.reshape(1, bias_init_vector.size)) else: model['bd'] = bias_init_vector.reshape(1, bias_init_vector.size) print 'model init done.' print 'model has keys: ' + ', '.join(model.keys()) print 'updating: ' + ', '.join('%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['update']) print 'regularizing: ' + ', '.join('%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['regularize']) print 'number of learnable parameters total: %d' % (sum(model[k].shape[0] * model[k].shape[1] for k in misc['update']), ) # initialize the Solver and the cost function solver = Solver() def costfun(batch, model): # wrap the cost function to abstract some things away from the Solver return RNNGenCost(batch, model, params, misc) # calculate how many iterations we need num_sentences_total = dp.getSplitSize('train', ofwhat='sentences') num_iters_one_epoch = num_sentences_total / batch_size max_iters = max_epochs * num_iters_one_epoch eval_period_in_epochs = params['eval_period'] eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs)) abort = False top_val_ppl2 = -1 smooth_train_ppl2 = len(misc['ixtoword']) # initially size of dictionary of confusion val_ppl2 = len(misc['ixtoword']) last_status_write_time = 0 # for writing worker job status reports json_worker_status = {} json_worker_status['params'] = params json_worker_status['history'] = [] max_iters = 1 # debug override: run only a single iteration for it in xrange(max_iters): if abort: break t0 = time.time() # fetch a batch of data batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)] # evaluate cost, gradient and perform parameter update step_struct = solver.step(batch, model, costfun, **params) cost = step_struct['cost'] dt = time.time() - t0 # print training statistics train_ppl2 = step_struct['stats']['ppl2'] smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2 # smooth exponentially decaying moving average if it == 0: smooth_train_ppl2 = train_ppl2 # start out where we start out epoch = it * 1.0 / num_iters_one_epoch print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \ % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'], \ train_ppl2, smooth_train_ppl2) # perform gradient check if desired, with a bit of a burnin time (10 iterations) #if it == 10 and do_grad_check: # solver.gradCheck(batch, model, costfun) # print 'done gradcheck. continue?'
# raw_input() # ## detect if loss is exploding and kill the job if so #total_cost = cost['total_cost'] #if it == 0: # total_cost0 = total_cost # store this initial cost #if total_cost > total_cost0 * 2: # print 'Aboring, cost seems to be exploding. Run gradcheck? Lower the learning rate?' # abort = True # set the abort flag, we'll break out # ## logging: write JSON files for visual inspection of the training #tnow = time.time() #if tnow > last_status_write_time + 60*1: # every now and then lets write a report # last_status_write_time = tnow # jstatus = {} # jstatus['time'] = datetime.datetime.now().isoformat() # jstatus['iter'] = (it, max_iters) # jstatus['epoch'] = (epoch, max_epochs) # jstatus['time_per_batch'] = dt # jstatus['smooth_train_ppl2'] = smooth_train_ppl2 # jstatus['val_ppl2'] = val_ppl2 # just write the last available one # jstatus['train_ppl2'] = train_ppl2 # json_worker_status['history'].append(jstatus) # status_file = os.path.join(params['worker_status_output_directory'], host + '_status.json') # try: # json.dump(json_worker_status, open(status_file, 'w')) # except Exception, e: # todo be more clever here # print 'tried to write worker status into %s but got error:' % (status_file, ) # print e # ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good #is_last_iter = (it+1) == max_iters #if (((it+1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter: # val_ppl2 = eval_split('val', dp, model, params, misc) # perform the evaluation on VAL set # print 'validation perplexity = %f' % (val_ppl2, ) # write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold'] # if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0: # if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0: # # if we beat a previous record or if this is the first time # # AND we also beat the user-defined threshold or it doesnt exist # top_val_ppl2 = val_ppl2 # filename = 'model_checkpoint_%s_%s_%s_%.2f.p' % (dataset, host, params['fappend'], val_ppl2) # filepath = os.path.join(params['checkpoint_output_directory'], filename) # checkpoint = {} # checkpoint['it'] = it # checkpoint['epoch'] = epoch # checkpoint['model'] = model # checkpoint['params'] = params # checkpoint['perplexity'] = val_ppl2 # checkpoint['wordtoix'] = misc['wordtoix'] # checkpoint['ixtoword'] = misc['ixtoword'] # try: # pickle.dump(checkpoint, open(filepath, "wb")) # print 'saved checkpoint in %s' % (filepath, ) # except Exception, e: # todo be more clever here # print 'tried to write checkpoint into %s but got error: ' % (filepat, ) # print e cuda.close()
def main(params): batch_size = params['batch_size'] word_count_threshold = params['word_count_threshold'] max_epochs = params['max_epochs'] # fetch the data provider dp = getDataProvider(params) # Initialize the optimizer solver = Solver(params['solver']) params['aux_inp_size'] = dp.aux_inp_size params['image_feat_size'] = dp.img_feat_size print 'Image feature size is %d, and aux input size is %d' % (params['image_feat_size'], params['aux_inp_size']) misc = {} # stores various misc items that need to be passed around the framework if params['checkpoint_file_name'] == 'None': # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur # at least word_count_threshold number of times misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(dp.iterSentences('train'), word_count_threshold) else: # Load Vocabulary from the checkpoint (checkpoint_init is assumed to be loaded at module scope) misc = checkpoint_init['misc'] params['vocabulary_size'] = len(misc['wordtoix']) params['output_size'] = len(misc['ixtoword']) # these should match though # This initializes the generator model parameters and does matrix initializations if params['t_eval_only'] == 0: generator = decodeGenerator(params) # Build the computational graph if params['use_encoder_for'] & 2: aux_enc_inp = generator.model_th['Wemb'] if params['encode_gt_sentences'] else dp.aux_inputs.T hid_size = params['featenc_hidden_size'] auxFeatEncoder = RecurrentFeatEncoder(hid_size, params['image_encoding_size'], params, mdl_prefix='aux_enc_', features=aux_enc_inp) mdlLen = len(generator.model_th.keys()) generator.model_th.update(auxFeatEncoder.model_th) assert (len(generator.model_th.keys()) == (mdlLen + len(auxFeatEncoder.model_th.keys()))) (auxenc_use_dropout, auxFeatEnc_inp, xAux, updatesLSTMAuxFeat) = auxFeatEncoder.build_model(generator.model_th, params) if params['encode_gt_sentences']: # Reshape it to size (batch_size, n_gt, hidden_size) xAux = xAux.reshape((-1, params['n_encgt_sent'], params['featenc_hidden_size'])) # Convert it to size (batch_size, n_gt*hidden_size) xAux = xAux.flatten(2) xI = tensor.zeros((batch_size, params['image_encoding_size'])) imgFeatEnc_inp = [] else: auxFeatEnc_inp = [] imgFeatEnc_inp = [] xAux = None xI = None (gen_inp_list, predLogProb, predIdx, predCand, gen_out, updatesLstm, seq_lengths) = generator.build_prediction_model(generator.model_th, params, xI=xI, xAux=xAux) gen_inp_list = imgFeatEnc_inp + auxFeatEnc_inp + gen_inp_list gen_out = gen_out.reshape([gen_out.shape[0], -1, params['n_gen_samples'], params['vocabulary_size']]) # convert updates lstm to a tuple, this is to help merge it with grad updates updatesLstm = [(k, v) for k, v in updatesLstm.iteritems()] f_gen_only = theano.function(gen_inp_list, [predLogProb, predIdx, gen_out, seq_lengths], name='f_pred', updates=updatesLstm) modelGen = generator.model_th upListGen = generator.update_list if params['use_mle_train']: (use_dropout_genTF, inp_list_genTF, _, cost_genTF, _, updatesLSTM_genTF) = generator.build_model(generator.model_th, params) f_eval_genTF = theano.function(inp_list_genTF, cost_genTF, name='f_eval') grads_genTF = tensor.grad(cost_genTF[0], wrt=modelGen.values(), add_names=True) lr_genTF = tensor.scalar(name='lr', dtype=config.floatX) f_grad_genTF, f_update_genTF, zg_genTF, rg_genTF, ud_genTF = solver.build_solver_model(lr_genTF, modelGen, grads_genTF, inp_list_genTF, cost_genTF, params) else: modelGen = [] updatesLstm = [] if params['met_to_track'] != []: trackMetargs = {'eval_metric': params['met_to_track']} refToks, scr_info = eval_prep_refs('val', dp, params['met_to_track']) trackMetargs['refToks'] = refToks trackMetargs['scr_info'] = scr_info
# Initialize the evaluator model if params['share_Wemb']: evaluator = decodeEvaluator(params, modelGen['Wemb']) else: evaluator = decodeEvaluator(params) modelEval = evaluator.model_th if params['t_eval_only'] == 0: # Build the evaluator graph to evaluate reference and generated captions if params.get('upd_eval_ref', 0): (refeval_inp_list, ref_f_pred_fns, ref_costs, ref_predTh, ref_modelEval) = evaluator.build_advers_eval(modelEval, params) (eval_inp_list, f_pred_fns, costs, predTh, modelEval) = evaluator.build_advers_eval(modelEval, params, gen_inp_list, gen_out, updatesLstm, seq_lengths) else: # Build the evaluator graph to evaluate only reference captions (eval_inp_list, f_pred_fns, costs, predTh, modelEval) = evaluator.build_advers_eval(modelEval, params) # force overwrite here. The bias to the softmax is initialized to reflect word frequencies if params['t_eval_only'] == 0: # and 0: if params['checkpoint_file_name'] == 'None': modelGen['bd'].set_value(bias_init_vector.astype(config.floatX)) if params.get('class_out_factoring', 0) == 1: modelGen['bdCls'].set_value(bias_init_inter_class.astype(config.floatX)) comb_inp_list = eval_inp_list if params['t_eval_only'] == 0: for inp in gen_inp_list: if inp not in comb_inp_list: comb_inp_list.append(inp) # Compile an evaluation function.. Doesn't include gradients # To be used for validation set evaluation or debug purposes if params['t_eval_only'] == 0: f_eval = theano.function(comb_inp_list, costs[:1], name='f_eval', updates=updatesLstm) else: f_eval = theano.function(comb_inp_list, costs[:1], name='f_eval') if params['share_Wemb']: modelEval.pop('Wemb') if params['fix_Wemb']: upListGen.remove('Wemb') #------------------------------------------------------------------------------------------------------------------------- # Now let's build a gradient computation graph and update mechanism #------------------------------------------------------------------------------------------------------------------------- # First compute gradient on the evaluator params w.r.t cost if params.get('upd_eval_ref', 0): gradsEval_ref = tensor.grad(ref_costs[0], wrt=modelEval.values(), add_names=True) gradsEval = tensor.grad(costs[0], wrt=modelEval.values(), add_names=True) # Update functions for the evaluator lrEval = tensor.scalar(name='lrEval', dtype=config.floatX) if params.get('upd_eval_ref', 0): f_grad_comp_eval_ref, f_param_update_eval_ref, _, _, _ = solver.build_solver_model(lrEval, modelEval, gradsEval_ref, refeval_inp_list, ref_costs[0], params, w_clip=params['eval_w_clip']) f_grad_comp_eval, f_param_update_eval, zg_eval, rg_eval, ud_eval = solver.build_solver_model(lrEval, modelEval, gradsEval, comb_inp_list, costs[:1], params, updatesLstm, w_clip=params['eval_w_clip']) # Now compute gradient on the generator params w.r.t the cost if params['t_eval_only'] == 0: gradsGen = tensor.grad(costs[1], wrt=modelGen.values(), add_names=True) lrGen = tensor.scalar(name='lrGen', dtype=config.floatX) # Update functions for the generator f_grad_comp_gen, f_param_update_gen, zg_gen, rg_gen, ud_gen = solver.build_solver_model(lrGen, modelGen, gradsGen, comb_inp_list[:(len(comb_inp_list) - 1 + params['gen_feature_matching'])], costs[1], params, updatesLstm) #------------------------------------------------------------------------------------------------------------------------- # If we want to track some metrics during the training, initialize 
stuff for that now #------------------------------------------------------------------------------------------------------------------------- print 'model init done.' if params['t_eval_only'] == 0: print 'Gen model has keys: ' + ', '.join(modelGen.keys()) print 'Eval model has keys: ' + ', '.join(modelEval.keys()) # calculate how many iterations we need, One epoch is considered once going through all the sentences and not images # Hence in case of coco/flickr this will 5* no of images num_sentences_total = dp.getSplitSize('train', ofwhat='images') num_iters_one_epoch = num_sentences_total / batch_size max_iters = max_epochs * num_iters_one_epoch skip_first = 20 iters_eval = 5 iters_gen = 1 cost_eval_iter = [] cost_gen_iter = [] trackSc_array = [] eval_period_in_epochs = params['eval_period'] eval_period_in_iters = max( 1, int(num_iters_one_epoch * eval_period_in_epochs)) top_val_ppl2 = -1 smooth_train_ppl2 = 0.5 # initially size of dictionary of confusion smooth_train_cost = 0.0 # initially size of dictionary of confusion smooth_train_cost_gen = 1.0 # initially size of dictionary of confusion val_ppl2 = len(misc['ixtoword']) last_status_write_time = 0 # for writing worker job status reports json_worker_status = {} json_worker_status['params'] = params json_worker_status['history'] = [] write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold'] iter_out_file = os.path.join( 'logs', 'advmodel_checkpoint_%s_%s_%s_log.npz' % (params['dataset'], host, params['fappend'])) len_hist = defaultdict(int) t_print_sec = 30 ## Initialize the model parameters from the checkpoint file if we are resuming training if params['checkpoint_file_name'] != 'None': if params['t_eval_only'] != 1: print '\n Now initing gen Model:' zipp(model_init_gen_from, modelGen) if 'trackers' in checkpoint_init: trackSc_array = checkpoint_init['trackers'].get('trackScores', []) print '\n Now initing Eval Model:' zipp(model_init_eval_from, modelEval) #zipp(rg_init,rgGen) print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" % (checkpoint_init['epoch'], \ checkpoint_init['perplexity'])) ############################################################## # Define signal handler to catch ctl-c or kills so that we can save the model trained till that point def signal_handler(signal, frame): print('You pressed Ctrl+C! Saving Checkpoint Now before exiting!') filename = 'advmodel_checkpoint_%s_%s_%s_%.2f_INT.p' % ( params['dataset'], host, params['fappend'], val_ppl2) dumpCheckpoint(filename, params, modelGen, modelEval, misc, it, val_ppl2) sys.exit(0) #signal.signal(signal.SIGINT, signal_handler) ############################################################## #In testing disable sampling and use the greedy approach!? generator.usegumbel.set_value(1) if params['met_to_track'] != []: tsc_max, tsc_mean, tsc_min = eval_gen_samps(f_gen_only, dp, params, misc, params['rev_eval'], **trackMetargs) trackSc_array.append((0, { evm + '_max': tsc_max[i] for i, evm in enumerate(params['met_to_track']) })) trackSc_array[-1][1].update({ evm + '_mean': tsc_mean[i] for i, evm in enumerate(params['met_to_track']) }) trackSc_array[-1][1].update({ evm + '_min': tsc_min[i] for i, evm in enumerate(params['met_to_track']) }) disp_some_gen_samps(f_gen_only, dp, params, misc, n_samp=5) evaluator.use_noise.set_value(1.) 
    eval_acc, gen_acc = eval_discrm_gen('val', dp, params, f_pred_fns[0], misc)
    # Re-enable sampling
    generator.usegumbel.set_value(1)

    np.savez(iter_out_file,
             eval_cost=np.array(cost_eval_iter),
             gen_cost=np.array(cost_gen_iter),
             tracksc=np.array(trackSc_array))

    smooth_train_cost = 0.0
    print '###################### NOW BEGINNING TRAINING #################################'
    for it in xrange(max_iters):
        t0 = time.time()
        # Enable using dropout in training
        evaluator.use_noise.set_value(1.)
        dt = 0.
        it2 = 0
        # Train the evaluator (discriminator) until it is accurate enough on
        # real vs generated captions, then take a single generator step
        while eval_acc <= 60. or gen_acc >= 45. or it2 < iters_eval * skip_first:
            # fetch a batch of data
            t1 = time.time()
            s_probs = [0.6, 0.4, 0.0] if params['eval_loss'] == 'contrastive' else [1.0, 0.0, 0.0]
            batch = dp.sampAdversBatch(batch_size,
                                       n_sent=params['n_gen_samples'],
                                       probs=s_probs)
            cnn_inps = prepare_adv_data(batch,
                                        misc['wordtoix'],
                                        maxlen=params['maxlen'],
                                        prep_for=params['eval_model'])
            enc_inp_list = prepare_seq_features(
                batch,
                use_enc_for=params['use_encoder_for'],
                maxlen=params['maxlen'],
                use_shared_mem=params['use_shared_mem_enc'],
                enc_gt_sent=params['encode_gt_sentences'],
                n_enc_sent=params['n_encgt_sent'],
                wordtoix=misc['wordtoix'])
            eval_cost = f_grad_comp_eval(*(cnn_inps + enc_inp_list))
            if np.isnan(eval_cost[0]):
                import pdb
                pdb.set_trace()
            f_param_update_eval(params['learning_rate_eval'])
            # Track training statistics
            smooth_train_cost = 0.99 * smooth_train_cost + 0.01 * eval_cost[0] if it > 0 else eval_cost[0]
            dt2 = time.time() - t1
            if it2 % 500 == 499:
                gb = 0.  # modelGen['gumb_temp'].get_value() if params['use_gumbel_mse'] == 1 else 0
                print 'Iter %d/%d Eval Only Iter %d/%d, done. in %.3fs. Eval Cost is %.6f' % (
                    it, max_iters, it2, iters_eval * skip_first, dt2, smooth_train_cost)
            if it2 % 100 == 99:
                eval_acc, gen_acc = eval_discrm_gen('val', dp, params, f_pred_fns[0], misc, n_eval=500)
            it2 += 1

        evaluator.use_noise.set_value(1.)
        # after the first outer iteration, drop the evaluator-only burn-in
        skip_first = 1

        # generator step: sample a pure ground-truth batch
        s_probs = [1.0, 0.0, 0.0]
        batch = dp.sampAdversBatch(batch_size,
                                   n_sent=params['n_gen_samples'],
                                   probs=s_probs)
        cnn_inps = prepare_adv_data(batch,
                                    misc['wordtoix'],
                                    maxlen=params['maxlen'],
                                    prep_for=params['eval_model'])
        enc_inp_list = prepare_seq_features(
            batch,
            use_enc_for=params['use_encoder_for'],
            maxlen=params['maxlen'],
            use_shared_mem=params['use_shared_mem_enc'],
            enc_gt_sent=params['encode_gt_sentences'],
            n_enc_sent=params['n_encgt_sent'],
            wordtoix=misc['wordtoix'])
        gen_cost = f_grad_comp_gen(
            *(cnn_inps[:(len(cnn_inps) - 1 + params['gen_feature_matching'])] + enc_inp_list))
        f_param_update_gen(params['learning_rate_gen'])

        if params['use_mle_train']:
            generator.usegumbel.set_value(0)
            batch, l = dp.getRandBatchByLen(batch_size)
            gen_inp_list, lenS = prepare_data(batch, misc['wordtoix'], params['maxlen'])
            cost_genMLE = f_grad_genTF(*gen_inp_list)
            f_update_genTF(np.float32(params['learning_rate_gen'] / 50.0))
            generator.usegumbel.set_value(1)

        dt = time.time() - t0
        # print training statistics
        smooth_train_cost_gen = gen_cost if it == 0 else 0.99 * smooth_train_cost_gen + 0.01 * gen_cost
        tnow = time.time()
        if tnow > last_status_write_time + t_print_sec * 1:  # every now and then lets write a report
            gb = 0.  # modelGen['gumb_temp'].get_value() if params['use_gumbel_mse'] == 1 else 0
            print 'Iter %d/%d done. in %.3fs. Eval Cost is %.6f, Gen Cost is %.6f, temp: %.4f' % (
                it, max_iters, dt, smooth_train_cost, smooth_train_cost_gen, gb)
            last_status_write_time = tnow

        cost_eval_iter.append(smooth_train_cost)
        cost_gen_iter.append(smooth_train_cost_gen)

        if it % 500 == 499:
            # Run the generator on the validation set and compute some metrics
            generator.usegumbel.set_value(1)
            if params['met_to_track'] != []:
                # In testing set the temperature very low, so that sampling is
                # equivalent to greedy samples
                tsc_max, tsc_mean, tsc_min = eval_gen_samps(f_gen_only, dp, params, misc,
                                                            params['rev_eval'], **trackMetargs)
                trackSc_array.append((it, {
                    evm + '_max': tsc_max[i]
                    for i, evm in enumerate(params['met_to_track'])
                }))
                trackSc_array[-1][1].update({
                    evm + '_mean': tsc_mean[i]
                    for i, evm in enumerate(params['met_to_track'])
                })
                trackSc_array[-1][1].update({
                    evm + '_min': tsc_min[i]
                    for i, evm in enumerate(params['met_to_track'])
                })
            disp_some_gen_samps(f_gen_only, dp, params, misc, n_samp=5)
            generator.usegumbel.set_value(1)

        top_val_ppl2 = gen_acc
        if it % 500 == 499:
            eval_acc, gen_acc = eval_discrm_gen('val', dp, params, f_pred_fns[0], misc, n_eval=500)
        if it % 1000 == 999:
            filename = 'advmodel_checkpoint_%s_%s_%s_%d_%.2f_genacc.p' % (
                params['dataset'], host, params['fappend'], it, gen_acc)
            dumpCheckpoint(filename, params, modelGen, modelEval, misc, it, gen_acc)
        if it % 500 == 499:
            np.savez(iter_out_file,
                     eval_cost=np.array(cost_eval_iter),
                     gen_cost=np.array(cost_gen_iter),
                     tracksc=np.array(trackSc_array))

    # Dump a final checkpoint once training completes
    filename = 'advmodel_checkpoint_%s_%s_%s_%d_%.2f_GenDone.p' % (
        params['dataset'], host, params['fappend'], it, gen_acc)
    dumpCheckpoint(filename, params, modelGen, modelEval, misc, it, gen_acc)
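# A minimal, self-contained sketch of the alternation schedule implemented by
# the loop above: keep updating the discriminator (the "evaluator") until its
# accuracy on real-vs-generated captions clears a threshold, then take a single
# generator step. f_disc_step, f_gen_step and eval_accuracy are hypothetical
# stand-ins for the compiled f_grad_comp_eval/f_param_update_eval,
# f_grad_comp_gen/f_param_update_gen and eval_discrm_gen calls; the 60/45
# thresholds mirror the gates used above.
def adversarial_schedule(f_disc_step, f_gen_step, eval_accuracy,
                         max_iters=100, disc_acc_target=60., gen_acc_cap=45.,
                         max_inner=50):
    eval_acc, gen_acc = eval_accuracy()
    for it in xrange(max_iters):
        it2 = 0
        # discriminator phase: run until it separates real from generated data
        while (eval_acc <= disc_acc_target or gen_acc >= gen_acc_cap) and it2 < max_inner:
            f_disc_step()
            eval_acc, gen_acc = eval_accuracy()
            it2 += 1
        # generator phase: one step against the refreshed discriminator
        f_gen_step()
        eval_acc, gen_acc = eval_accuracy()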
def train(args): zero_words = cPickle.load(gzip.open("zero_shot.pkl.gz")) if args.zero_shot else set() def maybe_zero(s, i): overlap = set(tokenize(s)).intersection(zero_words) if args.zero_shot and len(overlap) > 0: return numpy.zeros(i.shape) else: return i dataset = args.dataset tok_path = args.tokenizer model_path = args.model d = dp.getDataProvider(dataset) pairs = list(d.iterImageSentencePair(split='train')) if args.shuffle: numpy.random.shuffle(pairs) output_size = len(pairs[0]['image']['feat']) embedding_size = args.embedding_size if args.embedding_size is not None else args.hidden_size tokenizer = cPickle.load(gzip.open(args.init_tokenizer)) \ if args.init_tokenizer else Tokenizer(min_df=args.word_freq_threshold, character=args.character) sentences, images = zip(*[ (pair['sentence']['raw'], maybe_zero(pair['sentence']['raw'],pair['image']['feat'])) for pair in pairs ]) scaler = StandardScaler() if args.scaler == 'standard' else NoScaler() images = scaler.fit_transform(images) tokens = [ [tokenizer.encoder['PAD']] + sent + [tokenizer.encoder['END'] ] for sent in tokenizer.fit_transform(sentences) ] tokens_inp = [ token[:-1] for token in tokens ] tokens_out = [ token[1:] for token in tokens ] cPickle.dump(tokenizer, gzip.open(tok_path, 'w')) cPickle.dump(scaler, gzip.open('scaler.pkl.gz','w')) # Validation data valid_pairs = list(d.iterImageSentencePair(split='val')) valid_sents, valid_images = zip(*[ (pair['sentence']['raw'], pair['image']['feat']) for pair in valid_pairs ]) valid_images = scaler.transform(valid_images) valid_tokens = [ [ tokenizer.encoder['PAD'] ] + sent + [tokenizer.encoder['END'] ] for sent in tokenizer.transform(valid_sents) ] valid_tokens_inp = [ token[:-1] for token in valid_tokens ] valid_tokens_out = [ token[1:] for token in valid_tokens ] valid = (valid_tokens_inp, valid_tokens_out, valid_images) updater = passage.updates.Adam(lr=args.rate, clipnorm=args.clipnorm) if args.cost == 'MeanSquaredError': z_cost = MeanSquaredError elif args.cost == 'CosineDistance': z_cost = CosineDistance else: raise ValueError("Unknown cost") if args.hidden_type == 'gru': Recurrent = GatedRecurrent elif args.hidden_type == 'lstm': Recurrent = LstmRecurrent else: Recurrent = GatedRecurrent # if args.init_model is not None: # model_init = cPickle.load(open(args.init_model)) # def values(ps): # return [ p.get_value() for p in ps ] # # FIXME enable this for shared only embeddings # layers = [ Embedding(size=args.hidden_size, n_features=tokenizer.n_features, # weights=values(model_init.layers[0].params)), # Recurrent(seq_output=True, size=args.hidden_size, activation=args.activation, # weights=values(model_init.layers[1].params)), # Combined(left=Dense(size=tokenizer.n_features, activation='softmax', reshape=True, # weights=values(model_init.layers[2].left.params)), # right=Dense(size=output_size, activation=args.out_activation, # weights=values(model_init.layers[2].right.params)) # ) ] # else: # FIXME implement proper pretraining FIXME interpolated = True if not args.non_interpolated else False if args.model_type in ['add', 'mult', 'matrix']: if args.model_type == 'add': layer0 = Direct(size=embedding_size, n_features=tokenizer.n_features, op=Add) elif args.model_type == 'mult': layer0 = Direct(size=embedding_size, n_features=tokenizer.n_features, op=Mult) elif args.model_type == 'matrix': sqrt_size = embedding_size ** 0.5 if not sqrt_size.is_integer(): raise ValueError("Sqrt of embedding_size not integral for matrix model") layer0 = Direct(size=embedding_size, 
n_features=tokenizer.n_features, op=MatrixMult) layers = [ layer0, Dense(size=output_size, activation=args.out_activation, reshape=False) ] valid = (valid_tokens_inp, valid_images) model = RNN(layers=layers, updater=updater, cost=z_cost, iterator=SortedPadded(shuffle=False), verbose=1) model.fit(tokens_inp, images, n_epochs=args.iterations, batch_size=args.batch_size, len_filter=None, snapshot_freq=args.snapshot_freq, path=model_path, valid=valid) elif args.model_type == 'simple': layers = [ Embedding(size=embedding_size, n_features=tokenizer.n_features), Recurrent(seq_output=False, size=args.hidden_size, activation=args.activation), Dense(size=output_size, activation=args.out_activation, reshape=False) ] valid = (valid_tokens_inp, valid_images) model = RNN(layers=layers, updater=updater, cost=z_cost, iterator=SortedPadded(shuffle=False), verbose=1) model.fit(tokens_inp, images, n_epochs=args.iterations, batch_size=args.batch_size, len_filter=None, snapshot_freq=args.snapshot_freq, path=model_path, valid=valid) # FIXME need validation elif args.model_type == 'deep-simple': layers = [ Embedding(size=embedding_size, n_features=tokenizer.n_features), Recurrent(seq_output=True, size=args.hidden_size, activation=args.activation), Recurrent(seq_output=False, size=args.hidden_size, activation=args.activation), Dense(size=output_size, activation=args.out_activation, reshape=False) ] valid = (valid_tokens_inp, valid_images) model = RNN(layers=layers, updater=updater, cost=z_cost, iterator=SortedPadded(shuffle=False), verbose=1) model.fit(tokens_inp, images, n_epochs=args.iterations, batch_size=args.batch_size, len_filter=None, snapshot_freq=args.snapshot_freq, path=model_path, valid=valid) # FIXME need validation elif args.model_type == 'shared_all': if args.zero_shot: raise NotImplementedError # FIXME zero_shot not implemented layers = [ Embedding(size=embedding_size, n_features=tokenizer.n_features), Recurrent(seq_output=True, size=args.hidden_size, activation=args.activation), Combined(left=Dense(size=tokenizer.n_features, activation='softmax', reshape=True), right=Dense(size=output_size, activation=args.out_activation, reshape=False)) ] model = ForkedRNN(layers=layers, updater=updater, cost_y=CategoricalCrossEntropySwapped, cost_z=z_cost, alpha=args.alpha, size_y=tokenizer.n_features, verbose=1, interpolated=interpolated) model.fit(tokens_inp, tokens_out, images, n_epochs=args.iterations, batch_size=args.batch_size, snapshot_freq=args.snapshot_freq, path=model_path, valid=valid) elif args.model_type == 'shared_embeddings': layers = [ Embedding(size=embedding_size, n_features=tokenizer.n_features), Combined(left=Stacked([Recurrent(seq_output=True, size=args.hidden_size, activation=args.activation), Dense(size=tokenizer.n_features, activation='softmax', reshape=True)]), left_type='id', right=Stacked([Recurrent(seq_output=False, size=args.hidden_size, activation=args.activation), Dense(size=output_size, activation=args.out_activation, reshape=False)]), right_type='id') ] model = ForkedRNN(layers=layers, updater=updater, cost_y=CategoricalCrossEntropySwapped, cost_z=z_cost, alpha=args.alpha, size_y=tokenizer.n_features, verbose=1, interpolated=interpolated, zero_shot=args.zero_shot) model.fit(tokens_inp, tokens_out, images, n_epochs=args.iterations, batch_size=args.batch_size, snapshot_freq=args.snapshot_freq, path=model_path, valid=valid) cPickle.dump(model, gzip.open(model_path,"w"))
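# The tokens_inp/tokens_out construction above is standard next-word training
# (teacher forcing): the network reads the sentence prefixed with PAD and must
# predict the same sentence shifted left by one, terminated with END. A toy
# sketch of that shift; PAD=0 and END=1 are illustrative assumptions here, not
# the Tokenizer's actual codes.
PAD, END = 0, 1

def shift_for_next_word(sentences):
    # sentences: list of lists of int token ids (without PAD/END)
    tokens = [[PAD] + s + [END] for s in sentences]
    tokens_inp = [t[:-1] for t in tokens]  # PAD w1 w2 ... wn
    tokens_out = [t[1:] for t in tokens]   # w1  w2 ... wn END
    return tokens_inp, tokens_out

# shift_for_next_word([[5, 7, 9]]) -> ([[0, 5, 7, 9]], [[5, 7, 9, 1]])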
def main(params):
    word_count_threshold = params['word_count_threshold']
    max_epochs = params['max_epochs']
    host = socket.gethostname()  # get computer hostname

    # fetch the data provider
    dp = getDataProvider(params)
    # Initialize the optimizer
    solver = Solver(params['solver'])
    params['image_feat_size'] = dp.img_feat_size

    misc = {}  # stores various misc items that need to be passed around the framework
    # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur
    # at least word_count_threshold number of times
    misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(
        dp.iterSentences('train'), word_count_threshold)
    params['use_dropout'] = 1

    if params['fine_tune'] == 1:
        params['mode'] = 'multimodal_lstm'
        if params['checkpoint_file_name'] != None:
            params['batch_size'] = dp.dataset['batchsize']
            misc['wordtoix'] = checkpoint_init['wordtoix']
            misc['ixtoword'] = checkpoint_init['ixtoword']
        batch_size = 1
        num_sentences_total = dp.getSplitSize('train', ofwhat='images')
    else:
        params['mode'] = 'batchtrain'
        batch_size = params['batch_size']
        num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')

    params['vocabulary_size'] = len(misc['wordtoix'])
    pos_samp = np.arange(batch_size, dtype=np.int32)

    # This initializes the model parameters and does matrix initializations
    evalModel = decodeEvaluator(params)
    model, misc['update'], misc['regularize'] = (evalModel.model_th,
                                                 evalModel.updateP,
                                                 evalModel.regularize)

    # Define the computational graph for relating the input image features and word indices to the
    # log probability cost function.
    (use_dropout, inp_list, miscOuts, cost, predTh, model) = evalModel.build_model(model, params)

    # Add the regularization cost. Since this is specific to training and doesn't get included when we
    # evaluate the cost on test or validation data, we leave it here outside the model definition
    if params['regc'] > 0.:
        reg_cost = theano.shared(numpy_floatX(0.), name='reg_c')
        reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']), name='reg_c')
        reg_cost = 0.
        for p in misc['regularize']:
            reg_cost += (model[p] ** 2).sum()
        reg_cost *= 0.5 * reg_c
        cost[0] += (reg_cost / params['batch_size'])

    # Compile an evaluation function.. Doesn't include gradients
    # To be used for validation set evaluation
    f_eval = theano.function(inp_list, cost, name='f_eval')

    # Now let's build a gradient computation graph and rmsprop update mechanism
    grads = tensor.grad(cost[0], wrt=model.values())
    lr = tensor.scalar(name='lr', dtype=config.floatX)
    if params['sim_minibatch'] > 0:
        f_grad_accum, f_clr, ag = solver.accumGrads(model, grads, inp_list, cost,
                                                    params['sim_minibatch'])
        f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(
            lr, model, ag, inp_list, cost, params)
    else:
        f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(
            lr, model, grads, inp_list, cost, params)

    print 'model init done.'
print 'model has keys: ' + ', '.join(model.keys()) # calculate how many iterations we need, One epoch is considered once going through all the sentences and not images # Hence in case of coco/flickr this will 5* no of images num_iters_one_epoch = num_sentences_total / batch_size max_iters = max_epochs * num_iters_one_epoch inner_loop = params['sim_minibatch'] if params['sim_minibatch'] > 0 else 1 max_iters = max_iters / inner_loop eval_period_in_epochs = params['eval_period'] eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs/ inner_loop)) top_val_ppl2 = -1 smooth_train_cost = len(misc['ixtoword']) # initially size of dictionary of confusion val_ppl2 = len(misc['ixtoword']) last_status_write_time = 0 # for writing worker job status reports json_worker_status = {} json_worker_status['params'] = params json_worker_status['history'] = [] len_hist = defaultdict(int) ## Initialize the model parameters from the checkpoint file if we are resuming training if params['checkpoint_file_name'] != None: zipp(model_init_from,model) zipp(rg_init,rg) print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" % (checkpoint_init['epoch'], \ checkpoint_init['perplexity'])) elif params['init_from_imagernn'] != None: # Initialize word vecs and image emb from generative model file rnnCv = pickle.load(open(params['init_from_imagernn'], 'rb')) model['Wemb'].set_value(rnnCv['model']['Wemb']) model['WIemb'].set_value(rnnCv['model']['WIemb_aux']) misc['wordtoix'] = rnnCv['wordtoix'] misc['ixtoword'] = rnnCv['ixtoword'] print("\n Initialized Word embedding and Image embeddings from gen mode %s" % (params['init_from_imagernn'])) use_dropout.set_value(1.) #################### Main Loop ############################################ for it in xrange(max_iters): t0 = time.time() # fetch a batch of data cost_inner = np.zeros((inner_loop,),dtype=np.float32) if params['sim_minibatch'] > 0: for i_l in xrange(inner_loop): batch,pos_samp_sent = dp.sampPosNegSentSamps(params['batch_size'],params['mode'],thresh=0.3) real_inp_list, lenS = prepare_data(batch,misc['wordtoix'],maxlen=params['maxlen'],pos_samp=pos_samp,prep_for=params['eval_model']) if params['fine_tune'] == 1: real_inp_list.append(pos_samp_sent) cost_inner[i_l] = f_grad_accum(*real_inp_list) else: batch,pos_samp_sent = dp.sampPosNegSentSamps(params['batch_size'],params['mode'],thresh=0.3) real_inp_list, lenS = prepare_data(batch,misc['wordtoix'],maxlen=params['maxlen'],pos_samp=pos_samp,prep_for=params['eval_model']) if params['fine_tune'] == 1: real_inp_list.append(pos_samp_sent) # Enable using dropout in training cost = f_grad_shared(*real_inp_list) f_update(params['learning_rate']) dt = time.time() - t0 # Reset accumulated gradients to 0 if params['sim_minibatch'] > 0: f_clr() #print 'model: ' + ' '.join([str(np.isnan(model[m].get_value()).any()) for m in model]) #print 'rg: ' +' '.join([str(np.isnan(rg[i].get_value()).any()) for i in xrange(len(rg))]) #print 'zg: ' + ' '.join([str(np.isnan(zg[i].get_value()).any()) for i in xrange(len(zg))]) #print 'ud: ' + ' '.join([str(np.isnan(ud[i].get_value()).any()) for i in xrange(len(ud))]) #import pdb; pdb.set_trace() #print 'udAft: ' + ' '.join([str(np.isnan(ud[i].get_value()).any()) for i in xrange(len(ud))]) # print training statistics epoch = it*inner_loop * 1.0 / num_iters_one_epoch total_cost = (np.e**-cost + (np.e**(-cost_inner)).sum()*(params['sim_minibatch'] > 0))/ (1 + params['sim_minibatch']) #print '%d/%d batch done in %.3fs. 
at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \ # % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'], \ # train_ppl2, smooth_train_cost) if it == 0: smooth_train_cost = total_cost else: smooth_train_cost = 0.99 * smooth_train_cost + 0.01 * total_cost tnow = time.time() if tnow > last_status_write_time + 60*1: # every now and then lets write a report print '%d/%d batch done in %.3fs. at epoch %.2f. Prob now is %.3f' % (it, max_iters, dt, \ epoch, smooth_train_cost) last_status_write_time = tnow jstatus = {} jstatus['time'] = datetime.datetime.now().isoformat() jstatus['iter'] = (it, max_iters) jstatus['epoch'] = (epoch, max_epochs) jstatus['time_per_batch'] = dt jstatus['val_ppl2'] = val_ppl2 # just write the last available one json_worker_status['history'].append(jstatus) status_file = os.path.join(params['worker_status_output_directory'], host + '_status.json') #import pdb; pdb.set_trace() try: json.dump(json_worker_status, open(status_file, 'w')) except Exception, e: # todo be more clever here print 'tried to write worker status into %s but got error:' % (status_file, ) print e ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good is_last_iter = (it+1) == max_iters if (((it+1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter: # Disable using dropout in validation use_dropout.set_value(0.) val_ppl2 = eval_split_theano('val', dp, model, params, misc,f_eval) # perform the evaluation on VAL set if epoch - params['lr_decay_st_epoch'] >= 0: params['learning_rate'] = params['learning_rate'] * params['lr_decay'] params['lr_decay_st_epoch'] += 1 print 'validation perplexity = %f, lr = %f' % (val_ppl2, params['learning_rate']) if params['sample_by_len'] == 1: print len_hist write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold'] if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0: if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0: # if we beat a previous record or if this is the first time # AND we also beat the user-defined threshold or it doesnt exist #top_val_ppl2 = val_ppl2 filename = '%s_checkpoint_%s_%s_%s_%.2f_%.2f.p' % (params['eval_model'], params['dataset'], host, params['fappend'],val_ppl2,smooth_train_cost) filepath = os.path.join(params['checkpoint_output_directory'], filename) model_npy = unzip(model) rgrads_npy = unzip(rg) checkpoint = {} checkpoint['it'] = it checkpoint['epoch'] = epoch checkpoint['model'] = model_npy checkpoint['rgrads'] = rgrads_npy checkpoint['params'] = params checkpoint['perplexity'] = val_ppl2 checkpoint['wordtoix'] = misc['wordtoix'] checkpoint['ixtoword'] = misc['ixtoword'] try: pickle.dump(checkpoint, open(filepath, "wb")) print 'saved checkpoint in %s' % (filepath, ) except Exception, e: # todo be more clever here print 'tried to write checkpoint into %s but got error: ' % (filepath, ) print e use_dropout.set_value(1.)
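# The same learning-rate schedule recurs in several of these training loops:
# once the (fractional) epoch counter reaches lr_decay_st_epoch, the rate is
# multiplied by lr_decay and the start epoch is advanced, so the decay fires
# once per epoch no matter how often validation runs. A standalone sketch of
# that logic, operating on the same params keys as above:
def decay_learning_rate(params, epoch):
    # mutates params in place, exactly like the in-loop code above
    if epoch - params['lr_decay_st_epoch'] >= 0:
        params['learning_rate'] = params['learning_rate'] * params['lr_decay']
        params['lr_decay_st_epoch'] += 1
    return params['learning_rate']

# e.g. with lr_decay=0.9 and lr_decay_st_epoch=1 the rate is untouched during
# epoch 0 and then shrinks by 10% at each later epoch boundary.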
def main(params): batch_size = params['batch_size'] word_count_threshold = params['word_count_threshold'] max_epochs = params['max_epochs'] host = socket.gethostname() # get computer hostname #--------------------------------- Init data provider and load data+features #---------------------------------# # fetch the data provider dp = getDataProvider(params) params['aux_inp_size'] = params['featenc_hidden_size'] * params[ 'n_encgt_sent'] if params['encode_gt_sentences'] else dp.aux_inp_size params['featenc_hidden_size'] = params['featenc_hidden_size'] if params[ 'encode_gt_sentences'] else params['aux_inp_size'] params['image_feat_size'] = dp.img_feat_size print 'Image feature size is %d, and aux input size is %d' % ( params['image_feat_size'], params['aux_inp_size']) #--------------------------------- Preprocess sentences and build Vocabulary #---------------------------------# misc = { } # stores various misc items that need to be passed around the framework # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur # at least word_count_threshold number of times if params['checkpoint_file_name'] == 'None': if params['class_out_factoring'] == 0: misc['wordtoix'], misc[ 'ixtoword'], bias_init_vector = preProBuildWordVocab( dp.iterSentences('train'), word_count_threshold) else: [misc['wordtoix'], misc['classes'] ], [misc['ixtoword'], misc['clstotree'], misc['ixtoclsinfo'] ], [bias_init_vector, bias_init_inter_class ] = preProBuildWordVocab(dp.iterSentences('train'), word_count_threshold, params) params['nClasses'] = bias_init_inter_class.shape[0] params['ixtoclsinfo'] = misc['ixtoclsinfo'] else: misc = checkpoint_init['misc'] params['nClasses'] = checkpoint_init['params']['nClasses'] if 'ixtoclsinfo' in misc: params['ixtoclsinfo'] = misc['ixtoclsinfo'] params['vocabulary_size'] = len(misc['wordtoix']) params['output_size'] = len(misc['ixtoword']) # these should match though print len(misc['wordtoix']), len(misc['ixtoword']) #------------------------------ Initialize the solver/generator and build forward path #-----------------------# # Initialize the optimizer solver = Solver(params['solver']) # This initializes the model parameters and does matrix initializations lstmGenerator = decodeGenerator(params) model, misc['update'], misc['regularize'] = (lstmGenerator.model_th, lstmGenerator.update_list, lstmGenerator.regularize) # force overwrite here. The bias to the softmax is initialized to reflect word frequencies # This is a bit of a hack if params['checkpoint_file_name'] == 'None': model['bd'].set_value(bias_init_vector.astype(config.floatX)) if params['class_out_factoring'] == 1: model['bdCls'].set_value( bias_init_inter_class.astype(config.floatX)) #----------------- If we are using feature encoders ----------------------- # This mode can now also be used for encoding GT sentences. 
if params['use_encoder_for'] & 1: if params['encode_gt_sentences']: xI = tensor.zeros((batch_size, params['image_encoding_size'])) imgFeatEnc_inp = [] else: imgFeatEncoder = RecurrentFeatEncoder(params['image_feat_size'], params['word_encoding_size'], params, mdl_prefix='img_enc_', features=dp.features.T) mdlLen = len(model.keys()) model.update(imgFeatEncoder.model_th) assert (len(model.keys()) == (mdlLen + len(imgFeatEncoder.model_th.keys()))) misc['update'].extend(imgFeatEncoder.update_list) misc['regularize'].extend(imgFeatEncoder.regularize) (imgenc_use_dropout, imgFeatEnc_inp, xI, updatesLSTMImgFeat) = imgFeatEncoder.build_model(model, params) else: xI = None imgFeatEnc_inp = [] if params['use_encoder_for'] & 2: aux_enc_inp = model['Wemb'] if params[ 'encode_gt_sentences'] else dp.aux_inputs.T hid_size = params['featenc_hidden_size'] auxFeatEncoder = RecurrentFeatEncoder(hid_size, params['image_encoding_size'], params, mdl_prefix='aux_enc_', features=aux_enc_inp) mdlLen = len(model.keys()) model.update(auxFeatEncoder.model_th) assert (len(model.keys()) == (mdlLen + len(auxFeatEncoder.model_th.keys()))) misc['update'].extend(auxFeatEncoder.update_list) misc['regularize'].extend(auxFeatEncoder.regularize) (auxenc_use_dropout, auxFeatEnc_inp, xAux, updatesLSTMAuxFeat) = auxFeatEncoder.build_model(model, params) if params['encode_gt_sentences']: # Reshape it size(batch_size, n_gt, hidden_size) xAux = xAux.reshape( (-1, params['n_encgt_sent'], params['featenc_hidden_size'])) # Convert it to size (batch_size, n_gt*hidden_size xAux = xAux.flatten(2) else: auxFeatEnc_inp = [] xAux = None #--------------------------------- Initialize the Attention Network #-------------------------------# if params['use_attn'] != None: attnModel = AttentionNetwork(params['image_feat_size'], params['hidden_size'], params, mdl_prefix='attn_mlp_') mdlLen = len(model.keys()) model.update(attnModel.model_th) assert (len(model.keys()) == (mdlLen + len(attnModel.model_th.keys()))) misc['update'].extend(attnModel.update_list) misc['regularize'].extend(attnModel.regularize) attn_nw_func = attnModel.build_model else: attn_nw_func = None #--------------------------------- Build the language model graph #---------------------------------# # Define the computational graph for relating the input image features and word indices to the # log probability cost funtion. (use_dropout, inp_list_gen, f_pred_prob, cost, predTh, updatesLSTM) = lstmGenerator.build_model(model, params, xI, xAux, attn_nw=attn_nw_func) inp_list = imgFeatEnc_inp + auxFeatEnc_inp + inp_list_gen #--------------------------------- Cost function and gradient computations setup #---------------------------------# costGrad = cost[0] # Add class uncertainity to final cost #if params['class_out_factoring'] == 1: # costGrad += cost[2] # Add the regularization cost. Since this is specific to trainig and doesn't get included when we # evaluate the cost on test or validation data, we leave it here outside the model definition if params['regc'] > 0.: reg_cost = theano.shared(numpy_floatX(0.), name='reg_c') reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']), name='reg_c') reg_cost = 0. for p in misc['regularize']: reg_cost += (model[p]**2).sum() reg_cost *= 0.5 * reg_c costGrad += (reg_cost / params['batch_size']) # Compile an evaluation function.. 
Doesn't include gradients # To be used for validation set evaluation f_eval = theano.function(inp_list, cost, name='f_eval') # Now let's build a gradient computation graph and rmsprop update mechanism grads = tensor.grad(costGrad, wrt=model.values()) lr = tensor.scalar(name='lr', dtype=config.floatX) f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model( lr, model, grads, inp_list, cost, params) print 'model init done.' print 'model has keys: ' + ', '.join(model.keys()) #print 'updating: ' + ', '.join( '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['update']) #print 'updating: ' + ', '.join( '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['regularize']) #print 'number of learnable parameters total: %d' % (sum(model[k].shape[0] * model[k].shape[1] for k in misc['update']), ) #-------------------------------- Intialize the prediction path if needed by evaluator ----------------------------# evalKwargs = { 'eval_metric': params['eval_metric'], 'f_gen': lstmGenerator.predict, 'beamsize': params['eval_beamsize'] } if params['eval_metric'] != 'perplex': lstmGenerator.prepPredictor(None, params, params['eval_beamsize']) refToks, scr_info = eval_prep_refs('val', dp, params['eval_metric']) evalKwargs['refToks'] = refToks evalKwargs['scr_info'] = scr_info valMetOp = operator.gt else: valMetOp = operator.lt if params['met_to_track'] != []: trackMetargs = { 'eval_metric': params['met_to_track'], 'f_gen': lstmGenerator.predict, 'beamsize': params['eval_beamsize'] } lstmGenerator.prepPredictor(None, params, params['eval_beamsize']) refToks, scr_info = eval_prep_refs('val', dp, params['met_to_track']) trackMetargs['refToks'] = refToks trackMetargs['scr_info'] = scr_info #--------------------------------- Iterations and Logging intializations ------------------------------------------# # calculate how many iterations we need, One epoch is considered once going through all the sentences and not images # Hence in case of coco/flickr this will 5* no of images num_sentences_total = dp.getSplitSize('train', ofwhat='sentences') num_iters_one_epoch = num_sentences_total / batch_size max_iters = max_epochs * num_iters_one_epoch eval_period_in_epochs = params['eval_period'] eval_period_in_iters = max( 1, int(num_iters_one_epoch * eval_period_in_epochs)) top_val_sc = -1 smooth_train_ppl2 = len( misc['ixtoword']) # initially size of dictionary of confusion val_sc = len(misc['ixtoword']) last_status_write_time = 0 # for writing worker job status reports json_worker_status = {} #json_worker_status['params'] = params json_worker_status['history'] = [] len_hist = defaultdict(int) #Initialize Tracking the perplexity of train and val, with iters. train_perplex = [] val_perplex = [] trackSc_array = [] #-------------------------------------- Load previously saved model ------------------------------------------------# #- Initialize the model parameters from the checkpoint file if we are resuming training if params['checkpoint_file_name'] != 'None': zipp(model_init_from, model) if params['restore_grads'] == 1: zipp(rg_init, rg) #Copy trackers from previous checkpoint if 'trackers' in checkpoint_init: train_perplex = checkpoint_init['trackers']['train_perplex'] val_perplex = checkpoint_init['trackers']['val_perplex'] trackSc_array = checkpoint_init['trackers'].get('trackScores', []) print( """\nContinuing training from previous model\n. 
Already run for %0.2f epochs with validation perplx at %0.3f\n""" % (checkpoint_init['epoch'], checkpoint_init['perplexity'])) #-------------------------------------- MAIN LOOP ----------------------------------------------------------------# for it in xrange(max_iters): t0 = time.time() # Enable using dropout in training use_dropout.set_value(float(params['use_dropout'])) if params['use_encoder_for'] & 1: imgenc_use_dropout.set_value(float(params['use_dropout'])) if params['use_encoder_for'] & 2: auxenc_use_dropout.set_value(float(params['use_dropout'])) epoch = it * 1.0 / num_iters_one_epoch #-------------------------------------- Prepare batch-------------------------------------------# # fetch a batch of data if params['sample_by_len'] == 0: batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)] else: batch, l = dp.getRandBatchByLen(batch_size) len_hist[l] += 1 enc_inp_list = prepare_seq_features( batch, use_enc_for=params['use_encoder_for'], maxlen=params['maxlen'], use_shared_mem=params['use_shared_mem_enc'], enc_gt_sent=params['encode_gt_sentences'], n_enc_sent=params['n_encgt_sent'], wordtoix=misc['wordtoix']) if params['use_pos_tag'] != 'None': gen_inp_list, lenS = prepare_data( batch, misc['wordtoix'], params['maxlen'], sentTagMap, misc['ixtoword'], rev_sents=params['reverse_sentence'], use_enc_for=params['use_encoder_for'], use_unk_token=params['use_unk_token']) else: gen_inp_list, lenS = prepare_data( batch, misc['wordtoix'], params['maxlen'], rev_sents=params['reverse_sentence'], use_enc_for=params['use_encoder_for'], use_unk_token=params['use_unk_token']) if params['sched_sampling_mode'] != None: gen_inp_list.append(epoch) real_inp_list = enc_inp_list + gen_inp_list #import ipdb; ipdb.set_trace() #---------------------------------- Compute cost and apply gradients ---------------------------# # evaluate cost, gradient and perform parameter update cost = f_grad_shared(*real_inp_list) f_update(params['learning_rate']) dt = time.time() - t0 # print training statistics train_ppl2 = (2**(cost[1] / lenS)) #step_struct['stats']['ppl2'] # smooth exponentially decaying moving average smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2 if it == 0: smooth_train_ppl2 = train_ppl2 # start out where we start out total_cost = cost[0] if it == 0: smooth_cost = total_cost # start out where we start out smooth_cost = 0.99 * smooth_cost + 0.01 * total_cost #print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \ # % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'], \ # train_ppl2, smooth_train_ppl2) #---------------------------------- Write a report into a json file ---------------------------# tnow = time.time() if tnow > last_status_write_time + 60 * 1: # every now and then lets write a report print '%d/%d batch done in %.3fs. at epoch %.2f. 
Cost now is %.3f and pplx is %.3f' \ % (it, max_iters, dt, epoch, smooth_cost, smooth_train_ppl2) last_status_write_time = tnow jstatus = {} jstatus['time'] = datetime.datetime.now().isoformat() jstatus['iter'] = (it, max_iters) jstatus['epoch'] = (epoch, max_epochs) jstatus['time_per_batch'] = dt jstatus['smooth_train_ppl2'] = smooth_train_ppl2 jstatus['val_sc'] = val_sc # just write the last available one jstatus['val_metric'] = params[ 'eval_metric'] # just write the last available one jstatus['train_ppl2'] = train_ppl2 #if params['class_out_factoring'] == 1: # jstatus['class_cost'] = float(cost[2]) json_worker_status['history'].append(jstatus) status_file = os.path.join( params['worker_status_output_directory'], host + '_status.json') #import pdb; pdb.set_trace() try: json.dump(json_worker_status, open(status_file, 'w')) except Exception, e: # todo be more clever here print 'tried to write worker status into %s but got error:' % ( status_file, ) print e #--------------------------------- VALIDATION ---------------------------# #- perform perplexity evaluation on the validation set and save a model checkpoint if it's good is_last_iter = (it + 1) == max_iters if (((it + 1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter: # Disable using dropout in validation use_dropout.set_value(0.) if params['use_encoder_for'] & 1: imgenc_use_dropout.set_value(0.) if params['use_encoder_for'] & 2: auxenc_use_dropout.set_value(0.) # perform the evaluation on VAL set val_sc = eval_split_theano('val', dp, model, params, misc, f_eval, **evalKwargs) val_sc = val_sc[0] val_perplex.append((it, val_sc)) train_perplex.append((it, smooth_train_ppl2)) if params['met_to_track'] != []: track_sc = eval_split_theano('val', dp, model, params, misc, f_eval, **trackMetargs) trackSc_array.append((it, { evm: track_sc[i] for i, evm in enumerate(params['met_to_track']) })) if epoch - params['lr_decay_st_epoch'] >= 0: params['learning_rate'] = params['learning_rate'] * params[ 'lr_decay'] params['lr_decay_st_epoch'] += 1 print 'validation %s = %f, lr = %f' % ( params['eval_metric'], val_sc, params['learning_rate']) #if params['sample_by_len'] == 1: # print len_hist #----------------------------- SAVE THE MODEL -------------------# write_checkpoint_ppl_threshold = params[ 'write_checkpoint_ppl_threshold'] if valMetOp(val_sc, top_val_sc) or top_val_sc < 0: if valMetOp(val_sc, write_checkpoint_ppl_threshold ) or write_checkpoint_ppl_threshold < 0: # if we beat a previous record or if this is the first time # AND we also beat the user-defined threshold or it doesnt exist top_val_sc = val_sc filename = 'model_checkpoint_%s_%s_%s_%s%.2f.p' % ( params['dataset'], host, params['fappend'], params['eval_metric'][:3], val_sc) filepath = os.path.join( params['checkpoint_output_directory'], filename) model_npy = unzip(model) rgrads_npy = unzip(rg) checkpoint = {} checkpoint['it'] = it checkpoint['epoch'] = epoch checkpoint['model'] = model_npy checkpoint['rgrads'] = rgrads_npy checkpoint['params'] = params checkpoint['perplexity'] = val_sc checkpoint['misc'] = misc checkpoint['trackers'] = { 'train_perplex': train_perplex, 'val_perplex': val_perplex, 'trackScores': trackSc_array } try: pickle.dump(checkpoint, open(filepath, "wb")) print 'saved checkpoint in %s' % (filepath, ) except Exception, e: # todo be more clever here print 'tried to write checkpoint into %s but got error: ' % ( filepath, ) print e
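# Checkpoints in these scripts are plain pickled dicts: numpy arrays for the
# model weights (converted from theano shared variables by unzip/zipp) plus
# bookkeeping such as iteration, epoch, vocab and trackers. A minimal sketch of
# the same save/resume round trip, with an ordinary dict of numpy arrays
# standing in for the theano parameters:
import pickle
import numpy as np

def save_checkpoint(filepath, model_npy, it, epoch, perplexity):
    checkpoint = {'it': it, 'epoch': epoch, 'model': model_npy,
                  'perplexity': perplexity}
    try:
        pickle.dump(checkpoint, open(filepath, 'wb'))
        print 'saved checkpoint in %s' % (filepath, )
    except Exception, e:  # todo be more clever here
        print 'tried to write checkpoint into %s but got error: ' % (filepath, )
        print e

def load_checkpoint(filepath):
    checkpoint = pickle.load(open(filepath, 'rb'))
    return checkpoint['model'], checkpoint

# save_checkpoint('ck.p', {'Wemb': np.zeros((10, 4))}, 0, 0.0, 50.0)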
def main(params): # load the checkpoint checkpoint_path = params['checkpoint_path'] max_images = params['max_images'] print 'loading checkpoint %s' % (checkpoint_path, ) checkpoint = pickle.load(open(checkpoint_path, 'rb')) checkpoint_params = checkpoint['params'] dataset = checkpoint_params['dataset'] model = checkpoint['model'] # fetch the data provider dp = getDataProvider(dataset) misc = {} misc['wordtoix'] = checkpoint['wordtoix'] ixtoword = checkpoint['ixtoword'] blob = { } # output blob which we will dump to JSON for visualizing the results blob['params'] = params blob['checkpoint_params'] = checkpoint_params blob['imgblobs'] = [] # iterate over all images in test set and predict sentences BatchGenerator = decodeGenerator(checkpoint_params) all_bleu_scores = [] n = 0 #for img in dp.iterImages(split = 'test', shuffle = True, max_images = max_images): for img in dp.iterImages(split='test', max_images=max_images): n += 1 print 'image %d/%d:' % (n, max_images) references = [x['tokens'] for x in img['sentences']] # as list of lists of tokens kwparams = {'beam_size': params['beam_size']} Ys = BatchGenerator.predict([{ 'image': img }], model, checkpoint_params, **kwparams) img_blob = {} # we will build this up img_blob['img_path'] = img['local_file_path'] img_blob['imgid'] = img['imgid'] # encode the human-provided references img_blob['references'] = [] for gtwords in references: print 'GT: ' + ' '.join(gtwords) img_blob['references'].append({'text': ' '.join(gtwords)}) # now evaluate and encode the top prediction top_predictions = Ys[ 0] # take predictions for the first (and only) image we passed in top_prediction = top_predictions[ 0] # these are sorted with highest on top candidate = [ixtoword[ix] for ix in top_prediction[1]] print 'PRED: (%f) %s' % (top_prediction[0], ' '.join(candidate)) bleu_scores = evalCandidate(candidate, references) print 'BLEU: B-1: %f B-2: %f B-3: %f' % tuple(bleu_scores) img_blob['candidate'] = { 'text': ' '.join(candidate), 'logprob': top_prediction[0], 'bleu': bleu_scores } all_bleu_scores.append(bleu_scores) blob['imgblobs'].append(img_blob) print 'final average bleu scores:' bleu_averages = [ sum(x[i] for x in all_bleu_scores) * 1.0 / len(all_bleu_scores) for i in xrange(3) ] blob['final_result'] = {'bleu': bleu_averages} print 'FINAL BLEU: B-1: %f B-2: %f B-3: %f' % tuple(bleu_averages) # now also evaluate test split perplexity gtppl = eval_split('test', dp, model, checkpoint_params, misc, eval_max_images=max_images) print 'perplexity of ground truth words: %f' % (gtppl, ) blob['gtppl'] = gtppl # dump result struct to file print 'saving result struct to %s' % (params['result_struct_filename'], ) json.dump(blob, open(params['result_struct_filename'], 'w'))
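# BatchGenerator.predict above returns, per input image, a list of candidate
# captions sorted best-first, each a (logprob, word-index-sequence) pair; the
# code keeps the top one and maps indices back to words via ixtoword. A toy
# sketch of that unpacking (the Ys structure is inferred from the usage above):
def top_candidate_words(Ys, ixtoword):
    top_predictions = Ys[0]              # predictions for the single image
    top_prediction = top_predictions[0]  # sorted with highest logprob on top
    logprob, word_ixs = top_prediction[0], top_prediction[1]
    return logprob, [ixtoword[ix] for ix in word_ixs]

# top_candidate_words([[(-1.5, [3, 4])]], {3: 'a', 4: 'dog'})
#   -> (-1.5, ['a', 'dog'])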
def main(params): batch_size = params['batch_size'] word_count_threshold = params['word_count_threshold'] max_epochs = params['max_epochs'] host = socket.gethostname() # get computer hostname # fetch the data provider dp = getDataProvider(params) params['aux_inp_size'] = dp.aux_inp_size params['image_feat_size'] = dp.img_feat_size print 'Image feature size is %d, and aux input size is %d'%(params['image_feat_size'],params['aux_inp_size']) misc = {} # stores various misc items that need to be passed around the framework # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur # at least word_count_threshold number of times misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(dp.iterSentences('train'), word_count_threshold) params['vocabulary_size'] = len(misc['wordtoix']) params['output_size'] = len(misc['ixtoword']) # these should match though params['use_dropout'] = 1 # This initializes the model parameters and does matrix initializations lstmGenerator = LSTMGenerator(params) model, misc['update'], misc['regularize'] = (lstmGenerator.model_th, lstmGenerator.update, lstmGenerator.regularize) # force overwrite here. The bias to the softmax is initialized to reflect word frequencies # This is a bit of a hack, not happy about it model['bd'].set_value(bias_init_vector.astype(config.floatX)) # Define the computational graph for relating the input image features and word indices to the # log probability cost funtion. (use_dropout, inp_list, f_pred_prob, cost, predTh, updatesLSTM) = lstmGenerator.build_model(model, params) # Add the regularization cost. Since this is specific to trainig and doesn't get included when we # evaluate the cost on test or validation data, we leave it here outside the model definition if params['regc'] > 0.: reg_cost = theano.shared(numpy_floatX(0.), name='reg_c') reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']), name='reg_c') reg_cost = 0. for p in misc['regularize']: reg_cost += (model[p] ** 2).sum() reg_cost *= 0.5 * reg_c cost[0] += (reg_cost /params['batch_size']) # Compile an evaluation function.. Doesn't include gradients # To be used for validation set evaluation f_eval= theano.function(inp_list, cost, name='f_eval') # Now let's build a gradient computation graph and rmsprop update mechanism grads = tensor.grad(cost[0], wrt=model.values()) lr = tensor.scalar(name='lr',dtype=config.floatX) f_grad_shared, f_update, zg, rg, ud = lstmGenerator.rmsprop(lr, model, grads, inp_list, cost, params) print 'model init done.' 
print 'model has keys: ' + ', '.join(model.keys()) #print 'updating: ' + ', '.join( '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['update']) #print 'updating: ' + ', '.join( '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['regularize']) #print 'number of learnable parameters total: %d' % (sum(model[k].shape[0] * model[k].shape[1] for k in misc['update']), ) # calculate how many iterations we need, One epoch is considered once going through all the sentences and not images # Hence in case of coco/flickr this will 5* no of images num_sentences_total = dp.getSplitSize('train', ofwhat = 'sentences') num_iters_one_epoch = num_sentences_total / batch_size max_iters = max_epochs * num_iters_one_epoch eval_period_in_epochs = params['eval_period'] eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs)) top_val_ppl2 = -1 smooth_train_ppl2 = len(misc['ixtoword']) # initially size of dictionary of confusion val_ppl2 = len(misc['ixtoword']) last_status_write_time = 0 # for writing worker job status reports json_worker_status = {} json_worker_status['params'] = params json_worker_status['history'] = [] len_hist = defaultdict(int) ## Initialize the model parameters from the checkpoint file if we are resuming training if params['checkpoint_file_name'] != 'None': zipp(model_init_from,model) zipp(rg_init,rg) print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" % (checkpoint_init['epoch'], \ checkpoint_init['perplexity'])) for it in xrange(max_iters): t0 = time.time() # fetch a batch of data if params['sample_by_len'] == 0: batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)] else: batch,l = dp.getRandBatchByLen(batch_size) len_hist[l] += 1 if params['use_pos_tag'] != 'None': real_inp_list, lenS = prepare_data(batch,misc['wordtoix'],None,sentTagMap,misc['ixtoword']) else: real_inp_list, lenS = prepare_data(batch,misc['wordtoix']) # Enable using dropout in training use_dropout.set_value(1.) # evaluate cost, gradient and perform parameter update cost = f_grad_shared(*real_inp_list) f_update(params['learning_rate']) dt = time.time() - t0 # print training statistics train_ppl2 = (2**(cost[1]/lenS)) #step_struct['stats']['ppl2'] smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2 # smooth exponentially decaying moving average if it == 0: smooth_train_ppl2 = train_ppl2 # start out where we start out epoch = it * 1.0 / num_iters_one_epoch total_cost = cost[0] #print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \ # % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'], \ # train_ppl2, smooth_train_ppl2) tnow = time.time() if tnow > last_status_write_time + 60*1: # every now and then lets write a report print '%d/%d batch done in %.3fs. at epoch %.2f. 
Cost now is %.3f and pplx is %.3f' % (it, max_iters, dt, \ epoch, total_cost, smooth_train_ppl2) last_status_write_time = tnow jstatus = {} jstatus['time'] = datetime.datetime.now().isoformat() jstatus['iter'] = (it, max_iters) jstatus['epoch'] = (epoch, max_epochs) jstatus['time_per_batch'] = dt jstatus['smooth_train_ppl2'] = smooth_train_ppl2 jstatus['val_ppl2'] = val_ppl2 # just write the last available one jstatus['train_ppl2'] = train_ppl2 json_worker_status['history'].append(jstatus) status_file = os.path.join(params['worker_status_output_directory'], host + '_status.json') #import pdb; pdb.set_trace() try: json.dump(json_worker_status, open(status_file, 'w')) except Exception, e: # todo be more clever here print 'tried to write worker status into %s but got error:' % (status_file, ) print e ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good is_last_iter = (it+1) == max_iters if (((it+1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter: # Disable using dropout in validation use_dropout.set_value(0.) val_ppl2 = eval_split_theano('val', dp, model, params, misc,f_eval) # perform the evaluation on VAL set if epoch - params['lr_decay_st_epoch'] >= 0: params['learning_rate'] = params['learning_rate'] * params['lr_decay'] params['lr_decay_st_epoch'] += 1 print 'validation perplexity = %f, lr = %f' % (val_ppl2, params['learning_rate']) if params['sample_by_len'] == 1: print len_hist write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold'] if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0: if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0: # if we beat a previous record or if this is the first time # AND we also beat the user-defined threshold or it doesnt exist top_val_ppl2 = val_ppl2 filename = 'model_checkpoint_%s_%s_%s_%.2f.p' % (params['dataset'], host, params['fappend'], val_ppl2) filepath = os.path.join(params['checkpoint_output_directory'], filename) model_npy = unzip(model) rgrads_npy = unzip(rg) checkpoint = {} checkpoint['it'] = it checkpoint['epoch'] = epoch checkpoint['model'] = model_npy checkpoint['rgrads'] = rgrads_npy checkpoint['params'] = params checkpoint['perplexity'] = val_ppl2 checkpoint['wordtoix'] = misc['wordtoix'] checkpoint['ixtoword'] = misc['ixtoword'] try: pickle.dump(checkpoint, open(filepath, "wb")) print 'saved checkpoint in %s' % (filepath, ) except Exception, e: # todo be more clever here print 'tried to write checkpoint into %s but got error: ' % (filepath, ) print e
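# Training perplexity above is 2**(summed log2 cost / sequence length), tracked
# with an exponential moving average (decay 0.99) that is seeded with the first
# raw value. A standalone sketch of that tracking:
def smoothed_ppl2(costs_and_lengths):
    # costs_and_lengths: iterable of (cost, lenS) pairs as produced in the loop
    smooth = None
    for cost, lenS in costs_and_lengths:
        ppl2 = 2 ** (cost / float(lenS))
        smooth = ppl2 if smooth is None else 0.99 * smooth + 0.01 * ppl2
    return smooth

# smoothed_ppl2([(20.0, 10), (18.0, 10)]) starts at perplexity 4.0 and drifts
# only 1% of the way toward the second batch's perplexity of about 3.48.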
def main(params): batch_size = params['batch_size'] dataset = params['dataset'] word_count_threshold = params['word_count_threshold'] do_grad_check = params['do_grad_check'] max_epochs = params['max_epochs'] host = socket.gethostname() # get computer hostname params['mode'] = 'CPU' # fetch the data provider dp = getDataProvider(dataset) misc = {} # stores various misc items that need to be passed around the framework # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur # at least word_count_threshold number of times misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(dp.iterSentences('train'), word_count_threshold) # delegate the initialization of the model to the Generator class BatchGenerator = decodeGenerator(params) init_struct = BatchGenerator.init(params, misc) model, misc['update'], misc['regularize'] = (init_struct['model'], init_struct['update'], init_struct['regularize']) if params['mode'] == 'GPU': # force overwrite here. This is a bit of a hack, not happy about it model['bd'] = gp.garray(bias_init_vector.reshape(1, bias_init_vector.size)) else: model['bd'] = bias_init_vector.reshape(1, bias_init_vector.size) print 'model init done.' print 'model has keys: ' + ', '.join(model.keys()) print 'updating: ' + ', '.join( '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['update']) print 'updating: ' + ', '.join( '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['regularize']) print 'number of learnable parameters total: %d' % (sum(model[k].shape[0] * model[k].shape[1] for k in misc['update']), ) # initialize the Solver and the cost function solver = Solver() def costfun(batch, model): # wrap the cost function to abstract some things away from the Solver return RNNGenCost(batch, model, params, misc) # calculate how many iterations we need num_sentences_total = dp.getSplitSize('train', ofwhat = 'sentences') num_iters_one_epoch = num_sentences_total / batch_size max_iters = max_epochs * num_iters_one_epoch eval_period_in_epochs = params['eval_period'] eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs)) abort = False top_val_ppl2 = -1 smooth_train_ppl2 = len(misc['ixtoword']) # initially size of dictionary of confusion val_ppl2 = len(misc['ixtoword']) last_status_write_time = 0 # for writing worker job status reports json_worker_status = {} json_worker_status['params'] = params json_worker_status['history'] = [] max_iters = 1 for it in xrange(max_iters): if abort: break t0 = time.time() # fetch a batch of data batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)] # evaluate cost, gradient and perform parameter update step_struct = solver.step(batch, model, costfun, **params) cost = step_struct['cost'] dt = time.time() - t0 # print training statistics train_ppl2 = step_struct['stats']['ppl2'] smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2 # smooth exponentially decaying moving average if it == 0: smooth_train_ppl2 = train_ppl2 # start out where we start out epoch = it * 1.0 / num_iters_one_epoch print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \ % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'], \ train_ppl2, smooth_train_ppl2) # perform gradient check if desired, with a bit of a burnin time (10 iterations) #if it == 10 and do_grad_check: # solver.gradCheck(batch, model, costfun) # print 'done gradcheck. continue?' 
    #
    ## detect if loss is exploding and kill the job if so
    #total_cost = cost['total_cost']
    #if it == 0:
    #  total_cost0 = total_cost # store this initial cost
    #if total_cost > total_cost0 * 2:
    #  print 'Aborting, cost seems to be exploding. Run gradcheck? Lower the learning rate?'
    #  abort = True # set the abort flag, we'll break out
    #
    ## logging: write JSON files for visual inspection of the training
    #tnow = time.time()
    #if tnow > last_status_write_time + 60*1: # every now and then lets write a report
    #  last_status_write_time = tnow
    #  jstatus = {}
    #  jstatus['time'] = datetime.datetime.now().isoformat()
    #  jstatus['iter'] = (it, max_iters)
    #  jstatus['epoch'] = (epoch, max_epochs)
    #  jstatus['time_per_batch'] = dt
    #  jstatus['smooth_train_ppl2'] = smooth_train_ppl2
    #  jstatus['val_ppl2'] = val_ppl2 # just write the last available one
    #  jstatus['train_ppl2'] = train_ppl2
    #  json_worker_status['history'].append(jstatus)
    #  status_file = os.path.join(params['worker_status_output_directory'], host + '_status.json')
    #  try:
    #    json.dump(json_worker_status, open(status_file, 'w'))
    #  except Exception, e: # todo be more clever here
    #    print 'tried to write worker status into %s but got error:' % (status_file, )
    #    print e
    #
    ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good
    #is_last_iter = (it+1) == max_iters
    #if (((it+1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
    #  val_ppl2 = eval_split('val', dp, model, params, misc) # perform the evaluation on VAL set
    #  print 'validation perplexity = %f' % (val_ppl2, )
    #  write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
    #  if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
    #    if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
    #      # if we beat a previous record or if this is the first time
    #      # AND we also beat the user-defined threshold or it doesn't exist
    #      top_val_ppl2 = val_ppl2
    #      filename = 'model_checkpoint_%s_%s_%s_%.2f.p' % (dataset, host, params['fappend'], val_ppl2)
    #      filepath = os.path.join(params['checkpoint_output_directory'], filename)
    #      checkpoint = {}
    #      checkpoint['it'] = it
    #      checkpoint['epoch'] = epoch
    #      checkpoint['model'] = model
    #      checkpoint['params'] = params
    #      checkpoint['perplexity'] = val_ppl2
    #      checkpoint['wordtoix'] = misc['wordtoix']
    #      checkpoint['ixtoword'] = misc['ixtoword']
    #      try:
    #        pickle.dump(checkpoint, open(filepath, "wb"))
    #        print 'saved checkpoint in %s' % (filepath, )
    #      except Exception, e: # todo be more clever here
    #        print 'tried to write checkpoint into %s but got error: ' % (filepath, )
    #        print e

  cuda.close()
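# Aside: the commented-out block above guards against a diverging run by
# comparing the current total cost to the initial one. A minimal standalone
# sketch of the same rule (names are illustrative):
def is_exploding(total_cost, total_cost0, factor=2.0):
  # abort once the cost grows past `factor` times its initial value
  return total_cost > total_cost0 * factor

costs = [3.1, 3.0, 2.9, 7.0]
total_cost0 = costs[0]
for c in costs:
  if is_exploding(c, total_cost0):
    print 'Aborting, cost seems to be exploding. Run gradcheck? Lower the learning rate?'
    break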
def main(params):
  batch_size = params['batch_size']
  word_count_threshold = params['word_count_threshold']
  max_epochs = params['max_epochs']
  host = socket.gethostname() # get computer hostname

  # fetch the data provider
  dp = getDataProvider(params)

  # Initialize the optimizer
  solver = Solver(params['solver'])

  params['aux_inp_size'] = dp.aux_inp_size
  params['image_feat_size'] = dp.img_feat_size
  print 'Image feature size is %d, and aux input size is %d' % (params['image_feat_size'], params['aux_inp_size'])

  misc = {} # stores various misc items that need to be passed around the framework

  # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur
  # at least word_count_threshold number of times
  misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(dp.iterSentences('train'), word_count_threshold)
  params['vocabulary_size'] = len(misc['wordtoix'])
  params['output_size'] = len(misc['ixtoword']) # these should match though
  params['use_dropout'] = 1

  # This initializes the model parameters and does matrix initializations
  generator = decodeGenerator(params)
  (gen_inp_list, predLogProb, predIdx, predCand, wOut_emb, updatesLstm) = generator.build_prediction_model(
                                            generator.model_th, params, params['beam_size'])
  wOut_emb = wOut_emb.reshape([wOut_emb.shape[0], wOut_emb.shape[2]])
  f_gen_only = theano.function(gen_inp_list, [predLogProb, predIdx, wOut_emb], name='f_pred', updates=updatesLstm)

  modelGen = generator.model_th
  upListGen = generator.update_list

  if params['share_Wemb']:
    evaluator = decodeEvaluator(params, modelGen['Wemb'])
  else:
    evaluator = decodeEvaluator(params)
  modelEval = evaluator.model_th

  # Define the computational graph for relating the input image features and word indices to the
  # log probability cost function.
  (use_dropout_eval, eval_inp_list, f_pred_fns, costs, predTh, modelEval) = evaluator.build_advers_eval(modelEval, params, gen_inp_list, wOut_emb)

  # force overwrite here. The bias to the softmax is initialized to reflect word frequencies
  # This is a bit of a hack, not happy about it
  comb_inp_list = eval_inp_list
  for inp in gen_inp_list:
    if inp not in comb_inp_list:
      comb_inp_list.append(inp)

  # Compile an evaluation function. Doesn't include gradients
  # To be used for validation set evaluation
  f_eval = theano.function(comb_inp_list, costs, name='f_eval', updates=updatesLstm)

  # Now let's build a gradient computation graph and rmsprop update mechanism
  if params['share_Wemb']:
    modelEval.pop('Wemb')
  if params['fix_Wemb']:
    upListGen.remove('Wemb')

  modelGenUpD = OrderedDict()
  for k in upListGen:
    modelGenUpD[k] = modelGen[k]

  gradsEval = tensor.grad(costs[0], wrt=modelEval.values(), add_names=True)
  gradsGen = tensor.grad(costs[1], wrt=modelGenUpD.values(), add_names=True)

  lrEval = tensor.scalar(name='lrEval', dtype=config.floatX)
  f_grad_comp_eval, f_param_update_eval, zg_eval, rg_eval, ud_eval = solver.build_solver_model(lrEval, modelEval, gradsEval, comb_inp_list, costs[0], params)

  lrGen = tensor.scalar(name='lrGen', dtype=config.floatX)
  f_grad_comp_gen, f_param_update_gen, zg_gen, rg_gen, ud_gen = solver.build_solver_model(lrGen, modelGenUpD, gradsGen, comb_inp_list, costs[1], params)

  print 'model init done.'
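# Aside: `use_dropout_eval.set_value(...)` works because dropout is gated by a
# Theano shared variable, so train/eval behaviour can be flipped without
# recompiling the graph. A minimal sketch of the pattern (assumes Theano is
# installed; names and the fixed 0.5 scale are illustrative only):
import numpy as np
import theano
import theano.tensor as tensor

x = tensor.vector('x')
use_noise = theano.shared(np.float32(1.0))  # 1. = training mode, 0. = eval mode
y = tensor.switch(use_noise > 0, x * np.float32(0.5), x)
f = theano.function([x], y)

v = np.ones(3, dtype=np.float32)
print f(v)               # [ 0.5  0.5  0.5 ] -- noise branch taken
use_noise.set_value(0.)  # flip the flag; same compiled function, new behaviour
print f(v)               # [ 1.  1.  1. ]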
  print 'model has keys: ' + ', '.join(modelGen.keys())

  # calculate how many iterations we need; note the split size below is queried with
  # ofwhat = 'images', so one epoch here counts images, not sentences (coco/flickr
  # have roughly 5 sentences per image)
  num_sentences_total = dp.getSplitSize('train', ofwhat = 'images')
  num_iters_one_epoch = num_sentences_total / batch_size
  max_iters = max_epochs * num_iters_one_epoch
  iters_eval = num_iters_one_epoch // 2
  iters_gen = num_iters_one_epoch // 4

  eval_period_in_epochs = params['eval_period']
  eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs))
  top_val_ppl2 = -1
  smooth_train_ppl2 = 0.5 # initial value for the smoothed training perplexity
  val_ppl2 = len(misc['ixtoword'])
  last_status_write_time = 0 # for writing worker job status reports
  json_worker_status = {}
  json_worker_status['params'] = params
  json_worker_status['history'] = []
  len_hist = defaultdict(int)
  t_print_sec = 60

  ## Initialize the model parameters from the checkpoint file if we are resuming training
  if params['checkpoint_file_name'] != 'None':
    zipp(model_init_from, modelGen)
    #zipp(rg_init, rgGen)
    print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" % (checkpoint_init['epoch'], \
          checkpoint_init['perplexity']))

  pos_samp = np.arange(batch_size, dtype=np.int32)
  print batch_size

  ##############################################################
  # Define signal handler to catch ctrl-c or kills so that we can save the model trained till that point
  def signal_handler(signal, frame):
    print('You pressed Ctrl+C! Saving Checkpoint Now before exiting!')
    filename = 'advmodel_checkpoint_%s_%s_%s_%.2f_INT.p' % (params['dataset'], host, params['fappend'], val_ppl2)
    dumpCheckpoint(filename, params, modelGen, modelEval, misc, it, val_ppl2)
    sys.exit(0)
  signal.signal(signal.SIGINT, signal_handler)
  ##############################################################

  for it in xrange(max_epochs):
    epoch = it * 1.0 / num_iters_one_epoch
    # Enable using dropout in training
    use_dropout_eval.set_value(1.)

    for it2 in xrange(iters_eval):
      t0 = time.time()
      # fetch a batch of data
      batch, _ = dp.sampPosNegSentSamps(params['eval_batch_size'] - params['rand_negs'])
      real_inp_list, lenS = prepare_data(batch, misc['wordtoix'], maxlen=params['maxlen'], pos_samp=pos_samp, prep_for=params['eval_model'], rand_negs=params['rand_negs'])

      # evaluate cost, gradient and perform parameter update
      cost = f_grad_comp_eval(*real_inp_list)
      f_param_update_eval(params['learning_rate_eval'])
      dt = time.time() - t0

      # Track training statistics
      train_ppl2 = (np.e ** (-cost))
      smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2 # smooth exponentially decaying moving average
      if it2 == 0: smooth_train_ppl2 = train_ppl2
      if it2 == 0:
        smooth_train_cost = cost
      else:
        smooth_train_cost = 0.99 * smooth_train_cost + 0.01 * cost

      tnow = time.time()
      if tnow > last_status_write_time + t_print_sec * 1: # every now and then lets write a report
        print 'Eval Cnn in epoch %d: %d/%d sample done in %.3fs. Cost now is %.3f Pplx is %.3f' % (it, it2, iters_eval, dt, \
              smooth_train_cost, smooth_train_ppl2)
        last_status_write_time = tnow

    print 'Done training the discriminative model for now. Switching to the generative model.'
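# Aside: the training loop above installs a SIGINT handler so an interrupted
# run still produces a checkpoint. A minimal standalone version of the pattern;
# `install_sigint_checkpoint` and `get_state` are illustrative names, not the
# codebase's dumpCheckpoint:
import signal
import sys
import pickle

def install_sigint_checkpoint(get_state, path):
  def handler(signum, frame):
    print 'Caught SIGINT, saving checkpoint to %s before exiting' % (path, )
    pickle.dump(get_state(), open(path, 'wb'))
    sys.exit(0)
  signal.signal(signal.SIGINT, handler)

# usage sketch: install_sigint_checkpoint(lambda: {'it': it, 'model': model}, 'interrupt.p')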
    print 'Eval N/W in epoch %d: Cost now is %.3f Pplx is %.3f' % (it, smooth_train_cost, smooth_train_ppl2)
    filename = 'advmodel_checkpoint_%s_%s_%s_%d_%.2f_EVOnly.p' % (params['dataset'], host, params['fappend'], it, smooth_train_ppl2)
    dumpCheckpoint(filename, params, modelGen, modelEval, misc, it, val_ppl2)

    # Disable Cnn dropout while training gen network
    use_dropout_eval.set_value(0.)

    for it2 in xrange(iters_gen):
      t0 = time.time()
      # fetch a batch of data
      batch, _ = dp.sampPosNegSentSamps(params['eval_batch_size'] - params['rand_negs'])
      real_inp_list, lenS = prepare_data(batch, misc['wordtoix'], maxlen=params['maxlen'], pos_samp=pos_samp, prep_for=params['eval_model'], rand_negs=params['rand_negs'])
      #import pdb; pdb.set_trace()

      # evaluate cost, gradient and perform parameter update
      #if any([np.isnan(modelGen[m].get_value()).any() for m in modelGen]):
      #  print 'Somebodys NAN!!!'
      #  break
      #asd = f_gen_only(real_inp_list[2], real_inp_list[3])
      #print it2, asd[-1].shape, real_inp_list[0].shape
      #if asd[-1].shape[0] > real_inp_list[0].shape[0]:
      #  import pdb; pdb.set_trace()

      cost = f_grad_comp_gen(*real_inp_list)
      #print it2, cost
      #if any([np.isnan(zg_gen[i].get_value()).any() for i in xrange(len(zg_gen))]):
      #  print 'Somebody zg is NAN!!!'
      #  break
      #if any([np.isnan(rg_gen[i].get_value()).any() for i in xrange(len(rg_gen))]) or any([(rg_gen[i].get_value() < 0).any() for i in xrange(len(rg_gen))]):
      #  print 'Somebody rg is NAN!!!'
      #  break
      f_param_update_gen(params['learning_rate_gen'])
      dt = time.time() - t0

      # print training statistics
      train_ppl2 = (np.e ** (-cost))
      smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2 # smooth exponentially decaying moving average
      if it2 == 0: smooth_train_ppl2 = train_ppl2
      if it2 == 0:
        smooth_train_cost = cost
      else:
        smooth_train_cost = 0.99 * smooth_train_cost + 0.01 * cost

      tnow = time.time()
      if tnow > last_status_write_time + t_print_sec * 1: # every now and then lets write a report
        print 'Gen Lstm in epoch %d: %d/%d sample done in %.3fs. Cost now is %.3f Pplx is %.3f' % (it, it2, iters_gen, dt, \
              smooth_train_cost, smooth_train_ppl2)
        last_status_write_time = tnow

    print 'Done training the generative model for now. Switching back to the discriminative model. Final Stats are:'
    print 'Gen Lstm in epoch %d: Cost now is %.3f Pplx is %.3f' % (it, smooth_train_cost, smooth_train_ppl2)

    ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good
    is_last_iter = (it + 1) == max_iters
    is_last_iter = 1 # NOTE: debug override, forces the checkpoint branch every epoch
    if (((it + 1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
      # Disable using dropout in validation
      # use_dropout.set_value(0.)
      # val_ppl2 = eval_split_theano('val', dp, model, params, misc, f_eval) # perform the evaluation on VAL set
      #
      # if it - params['lr_decay_st_epoch'] >= 0:
      #   params['learning_rate'] = params['learning_rate'] * params['lr_decay']
      #   params['lr_decay_st_epoch'] += 1
      #
      # print 'validation perplexity = %f, lr = %f' % (val_ppl2, params['learning_rate'])
      # if params['sample_by_len'] == 1:
      #   print len_hist

      val_ppl2 = smooth_train_ppl2
      write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
      if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
        if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
          # if we beat a previous record or if this is the first time
          # AND we also beat the user-defined threshold or it doesn't exist
          #top_val_ppl2 = val_ppl2
          filename = 'advmodel_checkpoint_%s_%s_%s_%d_%.2f_GenDone.p' % (params['dataset'], host, params['fappend'], it, smooth_train_ppl2)
          dumpCheckpoint(filename, params, modelGen, modelEval, misc, it, val_ppl2)
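# Aside: the `train_ppl2 = np.e ** (-cost)` lines above appear to treat `cost`
# as a mean log-likelihood per word, so e**(-cost) is a per-word perplexity in
# base e (despite the historical "ppl2" name). A tiny worked example of that
# identity, with made-up probabilities:
import numpy as np

logprobs = np.log([0.5, 0.25, 0.25])  # per-word model probabilities
cost = logprobs.mean()                # mean log-likelihood (negative number)
print np.e ** (-cost)                 # ~3.17, exp of the mean negative log-likelihood
print np.exp(-logprobs).prod() ** (1.0 / len(logprobs))  # same value, by definition of perplexity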
def run(checkpoint):
  max_images = -1
  dump_folder = ""
  checkpoint_params = checkpoint["params"]
  dataset = checkpoint_params["dataset"]
  model = checkpoint["model"]
  beam_size = 1
  # dump_folder = params['dump_folder']

  # fetch the data provider
  dp = getDataProvider(dataset)

  misc = {}
  misc["wordtoix"] = checkpoint["wordtoix"]
  ixtoword = checkpoint["ixtoword"]

  blob = {} # output blob which we will dump to JSON for visualizing the results
  # blob['params'] = params
  blob["checkpoint_params"] = checkpoint_params
  blob["imgblobs"] = []

  # iterate over all images in test set and predict sentences
  BatchGenerator = decodeGenerator(checkpoint_params)
  n = 0
  all_references = []
  all_candidates = []
  captions_res = []
  for img in dp.iterImages(split="test", max_images=max_images):
    n += 1
    print "image %d/%d:" % (n, max_images)
    references = [" ".join(x["tokens"]) for x in img["sentences"]] # one reference string per human caption
    kwparams = {"beam_size": beam_size}
    Ys = BatchGenerator.predict([{"image": img}], model, checkpoint_params, **kwparams)

    img_blob = {} # we will build this up
    img_blob["img_path"] = img["local_file_path"]
    img_blob["imgid"] = img["imgid"]
    img_blob["id"] = img["id"]

    if dump_folder:
      # copy source file to some folder. This makes it easier to distribute results
      # into a webpage, because all images that were predicted on are in a single folder
      source_file = img["local_file_path"]
      target_file = os.path.join(dump_folder, os.path.basename(img["local_file_path"]))
      os.system("cp %s %s" % (source_file, target_file))

    # encode the human-provided references
    img_blob["references"] = []
    flag = True
    for gtsent in references:
      if flag:
        print "GT: " + gtsent
        flag = False
      img_blob["references"].append({"text": gtsent})

    # now evaluate and encode the top prediction
    top_predictions = Ys[0] # take predictions for the first (and only) image we passed in
    top_prediction = top_predictions[0] # these are sorted with highest on top
    candidate = " ".join([ixtoword[ix] for ix in top_prediction[1] if ix > 0]) # ix 0 is the END token, skip that
    print "PRED: (%f) %s" % (top_prediction[0], candidate)

    # save for later eval
    all_references.append(references)
    all_candidates.append(candidate)
    captions_res.append({"image_id": img_blob["id"], "caption": candidate})

    img_blob["candidate"] = {"text": candidate, "logprob": top_prediction[0]}
    blob["imgblobs"].append(img_blob)

  alg_name = checkpoint["algorithm"]
  res_file_name = checkpoint["outdir"] + "/captions_val_" + alg_name + "_results.json"
  json.dump(captions_res, open(res_file_name, "w"))

  from eval_tools import metrics
  scores = metrics.run(dataset, alg_name, checkpoint["outdir"])
  return scores
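# Aside: the candidate sentence above is decoded by mapping predicted indices
# back through ixtoword and dropping index 0, the END token. A minimal sketch
# of that decode step with a toy vocabulary (all values illustrative):
ixtoword = {0: '<END>', 1: 'a', 2: 'dog', 3: 'runs'}
top_prediction = (-2.34, [1, 2, 3, 0])  # (logprob, word indices), END-terminated
candidate = ' '.join(ixtoword[ix] for ix in top_prediction[1] if ix > 0)
print 'PRED: (%f) %s' % (top_prediction[0], candidate)  # PRED: (-2.340000) a dog runs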
def main(params):
  # load the checkpoint
  checkpoint_path = params["checkpoint_path"]
  max_images = params["max_images"]

  print "loading checkpoint %s" % (checkpoint_path, )
  checkpoint = pickle.load(open(checkpoint_path, "rb"))
  checkpoint_params = checkpoint["params"]
  dataset = checkpoint_params["dataset"]
  model = checkpoint["model"]
  dump_folder = params["dump_folder"]

  if dump_folder:
    print "creating dump folder " + dump_folder
    os.system("mkdir -p " + dump_folder)

  ## ANAND - CHANGE TEST PATH
  # fetch the data provider
  # dp = getDataProvider(dataset)
  # pdb.set_trace()
  dp = getDataProvider("example_images")

  misc = {}
  misc["wordtoix"] = checkpoint["wordtoix"]
  ixtoword = checkpoint["ixtoword"]

  blob = {} # output blob which we will dump to JSON for visualizing the results
  blob["params"] = params
  blob["checkpoint_params"] = checkpoint_params
  blob["imgblobs"] = []

  # iterate over all images in test set and predict sentences
  BatchGenerator = decodeGenerator(checkpoint_params)
  n = 0
  all_references = []
  all_candidates = []
  for img in dp.iterImages(split="test", max_images=max_images):
    n += 1
    print "image %d/%d:" % (n, max_images)
    # pdb.set_trace()
    references = [" ".join(x["tokens"]) for x in img["sentences"]] # one reference string per human caption
    kwparams = {"beam_size": params["beam_size"]}
    Ys = BatchGenerator.predict([{"image": img}], model, checkpoint_params, **kwparams)

    img_blob = {} # we will build this up
    img_blob["img_path"] = img["local_file_path"]
    img_blob["imgid"] = img["imgid"]

    if dump_folder:
      # copy source file to some folder. This makes it easier to distribute results
      # into a webpage, because all images that were predicted on are in a single folder
      source_file = img["local_file_path"]
      target_file = os.path.join(dump_folder, os.path.basename(img["local_file_path"]))
      os.system("cp %s %s" % (source_file, target_file))

    # encode the human-provided references
    img_blob["references"] = []
    for gtsent in references:
      print "GT: " + gtsent
      img_blob["references"].append({"text": gtsent})

    # now evaluate and encode the top prediction
    top_predictions = Ys[0] # take predictions for the first (and only) image we passed in
    top_prediction = top_predictions[0] # these are sorted with highest on top
    candidate = " ".join([ixtoword[ix] for ix in top_prediction[1] if ix > 0]) # ix 0 is the END token, skip that
    print "PRED: (%f) %s" % (top_prediction[0], candidate)

    # save for later eval
    all_references.append(references)
    all_candidates.append(candidate)

    img_blob["candidate"] = {"text": candidate, "logprob": top_prediction[0]}
    blob["imgblobs"].append(img_blob)

  # use perl script to eval BLEU score for fair comparison to other research work
  # first write intermediate files
  print "writing intermediate files into eval/"
  open("eval/output", "w").write("\n".join(all_candidates))
  for q in xrange(5):
    open("eval/reference" + str(q), "w").write("\n".join([x[q] for x in all_references]))
  # invoke the perl script to get BLEU scores
  print "invoking eval/multi-bleu.perl script..."
  owd = os.getcwd()
  os.chdir("eval")
  os.system("./multi-bleu.perl reference < output")
  os.chdir(owd)

  # now also evaluate test split perplexity
  gtppl = eval_split("test", dp, model, checkpoint_params, misc, eval_max_images=max_images)
  print "perplexity of ground truth words based on dictionary of %d words: %f" % (len(ixtoword), gtppl)
  blob["gtppl"] = gtppl

  # dump result struct to file
  print "saving result struct to %s" % (params["result_struct_filename"], )
  json.dump(blob, open(params["result_struct_filename"], "w"))
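# Aside: the os.system/os.chdir dance above can also be expressed with
# subprocess, which avoids changing the working directory of the whole
# process. A sketch under the same assumptions (5 references per image, an
# executable multi-bleu.perl living in eval/); the helper name is illustrative:
import subprocess

def run_multi_bleu(all_candidates, all_references, eval_dir="eval"):
  open(eval_dir + "/output", "w").write("\n".join(all_candidates))
  for q in xrange(5):
    open(eval_dir + "/reference" + str(q), "w").write(
        "\n".join(refs[q] for refs in all_references))
  # multi-bleu.perl expands the 'reference' prefix to reference0..reference4 itself
  with open(eval_dir + "/output") as fin:
    subprocess.call(["./multi-bleu.perl", "reference"], stdin=fin, cwd=eval_dir)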
def test(args):
  if args.random_seed is not None:
    numpy.random.seed(args.random_seed)

  def scramble(words):
    ixs = range(len(words))
    random.shuffle(ixs)
    return [words[ix] for ix in ixs]

  testInfo = {'argv': sys.argv,
              'dataset': args.dataset,
              'scramble': args.scramble,
              'model_type': args.model_type,
              'alpha': args.alpha,
              'iter_predict': args.iter_predict,
              'task': 'paraphrase' if args.paraphrase else 'image',
              'items': []}
  D = Cdist()
  dataset = args.dataset
  suffix = '' if args.iter_predict is None else ".{0}".format(args.iter_predict)
  model = cPickle.load(gzip.open('model.dat.gz' + suffix))
  tokenizer = cPickle.load(gzip.open('tok.pkl.gz'))
  scaler = cPickle.load(gzip.open('scaler.pkl.gz'))

  # temporarily redirect stdout to silence the data provider's loading chatter
  real_stdout = sys.stdout
  with open('/dev/null', 'w') as f:
    sys.stdout = f
    d = dp.getDataProvider(args.dataset)
    sys.stdout = real_stdout

  pairs = list(d.iterImageSentencePair(split='val'))
  inputs = [scramble(s) if args.scramble else s
            for s in tokenizer.transform([pair['sentence']['raw'] for pair in pairs])]

  if args.paraphrase:
    candidates = tokenizer.transform([pair['sentence']['raw'] for pair in pairs]) # No scrambling of candidates
    if args.paraphrase_state == 'output_vis':
      preds = model.predict(inputs)
      candidates_pred = model.predict(candidates)
    elif args.paraphrase_state == 'hidden_text':
      preds, _ = predict_h(model, inputs)
      candidates_pred, _ = predict_h(model, candidates)
    elif args.paraphrase_state == 'hidden_vis' and hasattr(model.layers[1], 'left'):
      _, preds = predict_h(model, inputs)
      _, candidates_pred = predict_h(model, candidates)
    elif args.paraphrase_state == 'hidden_vis' and not hasattr(model.layers[1], 'left'):
      preds = predict_h_simple(model, inputs)
      candidates_pred = predict_h_simple(model, candidates)
    elif args.paraphrase_state == 'hidden_multi':
      preds = numpy.hstack(predict_h(model, inputs))
      candidates_pred = numpy.hstack(predict_h(model, candidates))
    else:
      raise ValueError("Unknown state")
    distances = D.cosine_distance(preds, candidates_pred)
    #distances = cdist(preds, candidates_pred, metric='cosine')
    N = 0
    score = 0.0
    imgids = numpy.array([pair['sentence']['imgid'] for pair in pairs])
    sentids = numpy.array([pair['sentence']['sentid'] for pair in pairs])
    for j, row in enumerate(distances):
      imgid = pairs[j]['sentence']['imgid']
      sentid = pairs[j]['sentence']['sentid']
      best = numpy.argsort(row)
      rank = numpy.where((imgids[best] == imgid) * (sentids[best] != sentid))[0][0] + 1
      top4 = [pairs[b]['sentence']['imgid'] for b in best[0:5] if sentid != pairs[b]['sentence']['sentid']][0:4] # exclude self
      top4sent = [pairs[b]['sentence']['sentid'] for b in best[0:5] if sentid != pairs[b]['sentence']['sentid']][0:4]
      score = score + sum([i == imgid for i in top4]) / 4.0
      N = N + 1
      itemInfo = {'sentid': sentid, 'imgid': imgid,
                  'score': sum([i == imgid for i in top4]) / 4.0,
                  'rank': rank, 'topn': top4, 'topnsentid': top4sent,
                  'input': tokenizer.inverse_transform([inputs[j]])[0]}
      testInfo['items'].append(itemInfo)
    print args.iter_predict, N, score / N
  else:
    preds = model.predict(inputs)
    images = list(d.iterImages(split='val'))
    distances = D.cosine_distance(preds, scaler.transform([image['feat'] for image in images]))
    errors = 0
    N = 0
    imgids = numpy.array([img['imgid'] for img in images])
    for j, row in enumerate(distances):
      imgid = pairs[j]['sentence']['imgid']
      sentid = pairs[j]['sentence']['sentid']
      best = numpy.argsort(row)
      rank = numpy.where(imgids[best] == imgid)[0][0] + 1
      top5 = [images[b]['imgid'] for b in best[:5]]
      N = N + 1
      if imgid not in top5:
        errors = errors + 1
      itemInfo = {'sentid': sentid, 'imgid': imgid,
                  'score': float(imgid in top5),
                  'rank': rank, 'topn': top5,
                  'input': tokenizer.inverse_transform([inputs[j]])[0]}
      testInfo['items'].append(itemInfo)
    print args.iter_predict, errors, N, errors / float(N)

  testInfoPath = 'testInfo-task={0}-scramble={1}-iter_predict={2}.json.gz'.format(testInfo['task'], testInfo['scramble'], testInfo['iter_predict'])
  json.dump(testInfo, gzip.open(testInfoPath, 'w'))
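# Aside: the image-search branch above ranks validation images by cosine
# distance and scores a sentence by whether its image lands in the top 5. A
# minimal numpy sketch of the rank / top-5 bookkeeping, on toy distances:
import numpy as np

distances = np.array([[0.2, 0.5, 0.1],   # one row per sentence,
                      [0.9, 0.3, 0.4]])  # one column per image
imgids = np.array([10, 11, 12])          # image ids, in column order
gold = [12, 10]                          # correct image id for each sentence

for j, row in enumerate(distances):
  best = np.argsort(row)                               # nearest image first
  rank = np.where(imgids[best] == gold[j])[0][0] + 1   # 1-based rank of the correct image
  top5 = list(imgids[best[:5]])
  print j, rank, gold[j] in top5                       # 0 1 True / 1 3 True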
def __init__(self, dataset, nbOfTopics, iterations=1500, pert=None):
  self.nbOfTopics = nbOfTopics
  self.iterations = iterations
  self.dataprovider = getDataProvider(dataset, pert)
  self.nbOfWordOccurences = 5