def main(): try: # Setup argument parser parser = ArgumentParser(description="Wide residual network") parser.add_argument("--port", default=9999, help="listen port") parser.add_argument("--use_cpu", action="store_true", help="If set, load models onto CPU devices") parser.add_argument("--parameter_file", default="wrn-50-2.pickle") parser.add_argument("--model", choices=['resnet', 'wrn', 'preact', 'addbn'], default='wrn') parser.add_argument("--depth", type=int, choices=[18, 34, 50, 101, 152, 200], default='50') # Process arguments args = parser.parse_args() port = args.port # start to train agent = Agent(port) net = model.create_net(args.model, args.depth, args.use_cpu) if args.use_cpu: print('Using CPU') dev = device.get_default_device() else: print('Using GPU') dev = device.create_cuda_gpu() net.to_device(dev) model.init_params(net, args.parameter_file) print('Finish loading models') labels = np.loadtxt('synset_words.txt', str, delimiter='\t ') serve(net, labels, dev, agent) # acc = evaluate(net, '../val_list.txt', 'image/val', dev) # print acc # wait the agent finish handling http request agent.stop() except SystemExit: return except: traceback.print_exc() sys.stderr.write(" for help use --help \n\n") return 2
def main(): try: # Setup argument parser parser = ArgumentParser(description="VGG inference") parser.add_argument("--port", default=9999, help="listen port") parser.add_argument("--use_cpu", action="store_true", help="If set, load models onto CPU devices") parser.add_argument("--parameter_file", default="") parser.add_argument("--depth", type=int, choices=[11, 13, 16, 19], default='11') parser.add_argument("--batchnorm", action='store_true', help='use batchnorm or not') # Process arguments args = parser.parse_args() port = args.port # start to train agent = Agent(port) net = model.create_net(args.depth, 1000, args.batchnorm, args.use_cpu) if args.use_cpu: print('Using CPU') dev = device.get_default_device() else: print('Using GPU') dev = device.create_cuda_gpu() net.to_device(dev) model.init_params(net, args.parameter_file) print('Finish loading models') labels = np.loadtxt('synset_words.txt', str, delimiter='\t ') serve(net, labels, dev, agent) # acc = evaluate(net, '../val_list.txt', 'image/val', dev) # print acc # wait the agent finish handling http request agent.stop() except SystemExit: return except: traceback.print_exc() sys.stderr.write(" for help use --help \n\n") return 2
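# A note on flags like --depth above: argparse does parse a string default
# through `type`, but an integer default is checked against `choices`
# directly and reads unambiguously. A minimal, standard-library-only sketch:
from argparse import ArgumentParser

_parser = ArgumentParser()
_parser.add_argument("--depth", type=int, choices=[11, 13, 16, 19], default=11)
assert _parser.parse_args([]).depth == 11  # the default passes the choices check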
def load_model(embed_map=None, pickle_table=None):
    """
    Load all model components + apply vocab expansion
    """
    # Load the worddict
    print 'Loading dictionary...'
    with open(path_to_dictionary, 'rb') as f:
        worddict = pkl.load(f)

    # Create inverted dictionary
    print 'Creating inverted dictionary...'
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # Load model options
    print 'Loading model options...'
    with open('%s.pkl' % path_to_model, 'rb') as f:
        options = pkl.load(f)

    # Load parameters
    print 'Loading model parameters...'
    params = init_params(options)
    params = load_params(path_to_model, params)
    tparams = init_tparams(params)

    # Extractor functions
    print 'Compiling encoder...'
    trng = RandomStreams(1234)
    trng, x, x_mask, ctx, emb = build_encoder(tparams, options)
    f_enc = theano.function([x, x_mask], ctx, name='f_enc')
    f_emb = theano.function([x], emb, name='f_emb')
    trng, embedding, x_mask, ctxw2v = build_encoder_w2v(tparams, options)
    f_w2v = theano.function([embedding, x_mask], ctxw2v, name='f_w2v')

    # Load word2vec, if applicable
    if embed_map is None:
        print 'Loading word2vec embeddings...'
        embed_map = load_googlenews_vectors(path_to_word2vec)

    # Lookup table using vocab expansion trick
    if pickle_table:
        t = numpy.load(pickle_table)
        table = OrderedDict()
        for k, v in t:
            table[k] = v
    else:
        print 'Creating word lookup tables...'
        table = lookup_table(options, embed_map, worddict, word_idict, f_emb)

    # Store everything we need in a dictionary
    print 'Packing up...'
    model = {}
    model['options'] = options
    model['table'] = table
    model['f_w2v'] = f_w2v

    return model
def heart_disease(data_file, iterations=3000, learning_rate=0.1,
                  reg_param=0.1, plot_learning_curves=False):
    dataset = dproc.read_dataset(data_file)
    X, y = dproc.preprocess(dataset)
    X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=0.3)

    # Standardize the data
    X_train_std, X_cv_std = dproc.standardize(X_train, X_cv)

    activation_layers = (X.shape[1], 25, 1)
    parameters = model.init_params(activation_layers)
    model.train_model(X_train_std.T, y_train.T, parameters, iterations,
                      learning_rate, reg_param)

    if plot_learning_curves:
        costs_train, costs_cv, m_examples = model.train_various_sizes(
            X_train_std.T, X_cv_std.T, y_train.T, y_cv.T, parameters,
            activation_layers, 3000, 0.01, reg_param)
        dataplot.plot_learning_curves(costs_train, costs_cv, m_examples)

    train_accuracy = model.compute_accuracy(X_train_std.T, y_train.T, parameters)
    cv_accuracy = model.compute_accuracy(X_cv_std.T, y_cv.T, parameters)
    print(f"Train accuracy: {train_accuracy}")
    print(f"CV accuracy: {cv_accuracy}")
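# Hedged usage sketch for the pipeline above; 'heart.csv' is an assumed
# dataset path, and dproc/model/dataplot are the project's own modules
# imported elsewhere in this file:
heart_disease('heart.csv', iterations=3000, learning_rate=0.1,
              reg_param=0.1, plot_learning_curves=True)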
def load_model(embed_map=None,
               path_to_model=PATH_TO_MODEL,            # model opts (.pkl)
               path_to_params=PATH_TO_PARAMS,          # model params (.npz)
               path_to_dictionary=PATH_TO_DICTIONARY,
               path_to_word2vec=PATH_TO_WORD2VEC):
    """
    Load all model components + apply vocab expansion
    """
    # Load the worddict
    print 'Loading dictionary...'
    with open(path_to_dictionary, 'rb') as f:
        worddict = pkl.load(f)

    # Create inverted dictionary
    print 'Creating inverted dictionary...'
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # Load model options
    print 'Loading model options...'
    with open(path_to_model, 'rb') as f:
        options = pkl.load(f)

    # Load parameters
    print 'Loading model parameters...'
    params = init_params(options)
    params = load_params(path_to_params, params)
    tparams = init_tparams(params)

    # Extractor functions
    print 'Compiling encoder...'
    trng = RandomStreams(1234)
    trng, x, x_mask, ctx, emb = build_encoder(tparams, options)
    f_enc = theano.function([x, x_mask], ctx, name='f_enc')
    f_emb = theano.function([x], emb, name='f_emb')
    trng, embedding, x_mask, ctxw2v = build_encoder_w2v(tparams, options)
    f_w2v = theano.function([embedding, x_mask], ctxw2v, name='f_w2v')

    # Load word2vec, if applicable
    if embed_map is None:
        print 'Loading word2vec embeddings...'
        embed_map = load_googlenews_vectors(path_to_word2vec)

    # Lookup table using vocab expansion trick
    print 'Creating word lookup tables...'
    table = lookup_table(options, embed_map, worddict, word_idict, f_emb)

    # Store everything we need in a dictionary
    print 'Packing up...'
    model = {}
    model['options'] = options
    model['table'] = table
    model['f_w2v'] = f_w2v

    return model
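# Hedged usage sketch: the dictionary returned above is what a skip-thoughts
# style `encode` helper would consume to produce sentence vectors. `encode`
# is assumed to live in the same tools module; the input sentence is
# illustrative only.
model = load_model()
vectors = encode(model, ['an illustrative example sentence'])  # (n, dim) array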
def main():
    try:
        # Set up the argument parser
        parser = ArgumentParser(description='DenseNet inference')
        parser.add_argument("--port", default=9999, help="listen port")
        parser.add_argument("--use_cpu", action="store_true",
                            help="If set, load models onto CPU devices")
        parser.add_argument("--parameter_file", default="densenet-121.pickle")
        parser.add_argument("--depth", type=int, choices=[121, 169, 201, 161],
                            default=121)
        parser.add_argument('--nb_classes', default=1000, type=int)

        # Process arguments
        args = parser.parse_args()
        port = args.port

        # start the agent that serves HTTP requests
        agent = Agent(port)

        net = model.create_net(args.depth, args.nb_classes, 0, args.use_cpu)
        if args.use_cpu:
            print('Using CPU')
            dev = device.get_default_device()
        else:
            print('Using GPU')
            dev = device.create_cuda_gpu()
            net.to_device(dev)
        print('Start loading the parameter file')
        model.init_params(net, args.parameter_file)
        print('Finished loading models')
        labels = np.loadtxt('synset_words.txt', str, delimiter='\t ')
        serve(net, labels, dev, agent)

        # wait for the agent to finish handling HTTP requests
        agent.stop()
    except SystemExit:
        return
    except Exception:
        traceback.print_exc()
        sys.stderr.write("  for help use --help \n\n")
        return 2
def load_model(
        path_to_model='/home/shunan/Code/skip-thoughts/experiments/amazon/amazon_model_bi.npz',
        path_to_dictionary='/home/shunan/Code/skip-thoughts/experiments/amazon/word_dicts.pkl',
        embed_map=None):
    """
    Load all model components + apply vocab expansion
    """
    # Load the worddict
    print 'Loading dictionary...'
    with open(path_to_dictionary, 'rb') as f:
        worddict = pkl.load(f)

    # Create inverted dictionary
    print 'Creating inverted dictionary...'
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # Load model options
    print 'Loading model options...'
    with open('%s.pkl' % path_to_model, 'rb') as f:
        options = pkl.load(f)

    # Load parameters
    print 'Loading model parameters...'
    params = init_params(options)
    params = load_params(path_to_model, params)
    tparams = init_tparams(params)

    # Extractor functions
    print 'Compiling encoder...'
    trng = RandomStreams(1234)
    trng, x, x_mask, ctx, emb = build_encoder(tparams, options)
    f_enc = theano.function([x, x_mask], ctx, name='f_enc')
    f_emb = theano.function([x], emb, name='f_emb')
    trng, embedding, x_mask, ctxw2v = build_encoder_w2v(tparams, options)
    f_w2v = theano.function([embedding, x_mask], ctxw2v, name='f_w2v')

    # Load word2vec, if applicable
    # if embed_map is None:
    #     print 'Loading word2vec embeddings...'
    #     embed_map = load_googlenews_vectors(path_to_word2vec)

    # Lookup table using vocab expansion trick
    print 'Creating word lookup tables...'
    table = lookup_table(options, embed_map, worddict, word_idict, f_emb)

    # Store everything we need in a dictionary
    print 'Packing up...'
    model = {}
    model['options'] = options
    model['table'] = table
    model['f_w2v'] = f_w2v

    # model is just a dict.
    return model
def gen_model(queue, rqueue, pid, model, options, k, normalize, word_idict, sampling):
    import theano
    from theano import tensor
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

    trng = RandomStreams(1234)
    # zero indicates we are not using dropout in the graph
    use_noise = theano.shared(numpy.float32(0.), name='use_noise')

    # get the parameters
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_tparams(params)

    # build the sampling computational graph
    # see model.py for more detailed explanations
    print "Starting to build sampler ..."
    f_init, f_next = build_sampler(tparams, options, use_noise, trng,
                                   sampling=sampling)

    def _gencap(cc0):
        sample, score = gen_sample(tparams, f_init, f_next, cc0, options,
                                   trng=trng, k=k, maxlen=200,
                                   stochastic=False)
        # adjust for length bias
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        sidx = numpy.argmin(score)
        return sample[sidx]

    while True:
        req = queue.get()
        # exit signal
        if req is None:
            break

        idx, context = req[0], req[1]
        print pid, '-', idx
        seq = _gencap(context)
        rqueue.put((idx, seq))

    return
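# Hedged sketch of how a pool of gen_model workers might be driven; only the
# gen_model(queue, rqueue, pid, ...) signature and its request protocol
# (None = exit, (idx, context) = work item) come from the code above. The
# `contexts` list, worker count, and caption_all name are assumptions.
from multiprocessing import Process, Queue

def caption_all(contexts, model, options, k=5, normalize=True,
                word_idict=None, sampling=False, n_workers=2):
    queue, rqueue = Queue(), Queue()
    procs = [Process(target=gen_model,
                     args=(queue, rqueue, pid, model, options, k,
                           normalize, word_idict, sampling))
             for pid in range(n_workers)]
    for p in procs:
        p.start()
    for idx, ctx in enumerate(contexts):
        queue.put((idx, ctx))          # one request per context vector
    results = [rqueue.get() for _ in contexts]
    for _ in procs:
        queue.put(None)                # exit signal understood by gen_model
    for p in procs:
        p.join()
    # restore input order before returning the sampled sequences
    return [seq for _, seq in sorted(results)]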
def main(model, dictionary, dictionary_tag, source_file, target_file, saveto):
    # load model options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    # load the source dictionary and invert it
    with open(dictionary, 'rb') as f:
        word_dict = pkl.load(f)
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # load the tag dictionary and invert it
    with open(dictionary_tag, 'rb') as f:
        tag_dict = pkl.load(f)
    tag_idict = dict()
    for kk, vv in tag_dict.iteritems():
        tag_idict[vv] = kk

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost, predicts = \
        build_model(tparams, options)

    print 'Building f_predicts...',
    f_predicts = theano.function([x, x_mask], predicts)
    print 'Done'

    use_noise.set_value(0.)

    valid_err = evaluation(f_predicts, options, tag_idict, word_dict,
                           source_file, saveto, target_file, 0,
                           options['n_words_src'],
                           back_file=target_file + ".back")

    print 'Test ', valid_err
def load_model(path_to_model=default_model):
    """
    Load all model components
    """
    print path_to_model

    # Load the worddict
    print 'Loading dictionary...'
    with open(path_to_model + '.dictionary.pkl', 'rb') as f:
        worddict = pkl.load(f)

    # Create inverted dictionary
    print 'Creating inverted dictionary...'
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # Load model options
    print 'Loading model options...'
    with open(path_to_model + '.pkl', 'rb') as f:
        options = pkl.load(f)

    # Load parameters
    print 'Loading model parameters...'
    params = init_params(options)
    params = load_params(path_to_model, params)
    tparams = init_tparams(params)

    # Extractor functions
    print 'Compiling image encoder...'
    trng, [im], images = build_image_encoder(tparams, options)
    f_ienc = theano.function([im], images, name='f_ienc')

    print 'Compiling sentence encoder...'
    trng = RandomStreams(1234)
    trng, [x, x_mask], sentences = build_sentence_encoder(tparams, options)
    f_senc = theano.function([x, x_mask], sentences, name='f_senc')

    # Store everything we need in a dictionary
    print 'Packing up...'
    model = {}
    model['options'] = options
    model['worddict'] = worddict
    model['word_idict'] = word_idict
    model['f_senc'] = f_senc
    model['f_ienc'] = f_ienc

    return model
def load_model(): """ Load all model components """ print path_to_model # Load the worddict print "Loading dictionary..." with open("%s.dictionary.pkl" % path_to_model, "rb") as f: worddict = pkl.load(f) # Create inverted dictionary print "Creating inverted dictionary..." word_idict = dict() for kk, vv in worddict.iteritems(): word_idict[vv] = kk word_idict[0] = "<eos>" word_idict[1] = "UNK" # Load model options print "Loading model options..." with open("%s.pkl" % path_to_model, "rb") as f: options = pkl.load(f) # Load parameters print "Loading model parameters..." params = init_params(options) params = load_params(path_to_model, params) tparams = init_tparams(params) # Extractor functions print "Compiling sentence encoder..." trng = RandomStreams(1234) trng, [x, x_mask], sentences = build_sentence_encoder(tparams, options) f_senc = theano.function([x, x_mask], sentences, name="f_senc") print "Compiling image encoder..." trng, [im], images = build_image_encoder(tparams, options) f_ienc = theano.function([im], images, name="f_ienc") # Store everything we need in a dictionary print "Packing up..." model = {} model["options"] = options model["worddict"] = worddict model["word_idict"] = word_idict model["f_senc"] = f_senc model["f_ienc"] = f_ienc return model
def load_model(path_to_model=default_model):
    """
    Load all model components
    """
    print path_to_model

    # Load the worddict
    print 'Loading dictionary...'
    with open('%s.dictionary.pkl' % path_to_model, 'rb') as f:
        worddict = pkl.load(f)

    # Create inverted dictionary
    print 'Creating inverted dictionary...'
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # Load model options
    print 'Loading model options...'
    with open('%s.pkl' % path_to_model, 'rb') as f:
        options = pkl.load(f)

    # Load parameters
    print 'Loading model parameters...'
    params = init_params(options)
    params = load_params(path_to_model, params)
    tparams = init_tparams(params)

    # Extractor functions
    print 'Compiling sentence encoder...'
    trng = RandomStreams(1234)
    trng, [x, x_mask], sentences = build_sentence_encoder(tparams, options)
    f_senc = theano.function([x, x_mask], sentences, name='f_senc')

    print 'Compiling image encoder...'
    trng, [im], images = build_image_encoder(tparams, options)
    f_ienc = theano.function([im], images, name='f_ienc')

    # Store everything we need in a dictionary
    print 'Packing up...'
    model = {}
    model['options'] = options
    model['worddict'] = worddict
    model['word_idict'] = word_idict
    model['f_senc'] = f_senc
    model['f_ienc'] = f_ienc

    return model
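# Hedged usage sketch: the dictionary packed above plugs straight into the
# encode_sentences / encode_images helpers that the validation code further
# below also uses; `captions` (list of strings) and `feats` (image feature
# matrix) are assumed inputs.
model = load_model()
sent_vecs = encode_sentences(model, captions)   # (n_captions, dim)
img_vecs = encode_images(model, feats)          # (n_images, dim)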
def load_model(
        path_to_model=PATH_TO_MODEL,            # model opts (.pkl)
        path_to_params=PATH_TO_PARAMS,          # model params (.npz)
        path_to_dictionary=PATH_TO_DICTIONARY):
    """
    Load a trained model for decoding
    """
    # Load the worddict
    print 'Loading dictionary...'
    with open(path_to_dictionary, 'rb') as f:
        worddict = pkl.load(f)

    # Create inverted dictionary
    print 'Creating inverted dictionary...'
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # Load model options
    print 'Loading model options...'
    with open('%s.pkl' % path_to_model, 'rb') as f:
        options = pkl.load(f)

    # Load parameters
    print 'Loading model parameters...'
    params = init_params(options)
    params = load_params(path_to_params, params)
    tparams = init_tparams(params)

    # Sampler
    trng = RandomStreams(1234)
    f_init, f_next = build_sampler(tparams, options, trng)

    # Pack everything up
    dec = dict()
    dec['options'] = options
    dec['trng'] = trng
    dec['worddict'] = worddict
    dec['word_idict'] = word_idict
    dec['tparams'] = tparams
    dec['f_init'] = f_init
    dec['f_next'] = f_next

    return dec
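# Hedged sketch: sampling from the decoder dictionary packed above, reusing
# the gen_sample call pattern that appears in the training code below.
# `ctx` (a 1 x dimctx context vector) is an assumed input.
import numpy

dec = load_model()
sample, score = gen_sample(dec['tparams'], dec['f_init'], dec['f_next'],
                           ctx, dec['options'], trng=dec['trng'],
                           k=5, maxlen=100, stochastic=False)
# pick the length-normalized best beam and map ids back to words
best = sample[numpy.argmin(score / numpy.array([len(s) for s in sample]))]
print ' '.join(dec['word_idict'].get(w, 'UNK') for w in best if w != 0)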
def load_model(save_dir, model_name, best=True):
    """
    Load all model components.

    Inputs are only save_dir and model_name, since it is assumed that
    filenames follow the naming convention.
    """
    model_options = {}
    model_options['save_dir'] = save_dir
    model_options['model_name'] = model_name

    # Load model
    print 'Loading model'
    opt_filename_reload = get_opt_filename(model_options, previous=True)
    with open(opt_filename_reload, 'rb') as f:
        model = pkl.load(f)
    options = model['options']

    # Load parameters
    print 'Loading model parameters...'
    params = init_params(options)
    params_filename = get_npz_filename(model_options, best=best, previous=True)
    params = load_params(params_filename, params)
    tparams = init_tparams(params)

    # Extractor functions
    print 'Compiling sentence encoder...'
    [x, x_mask], sentences = build_sentence_encoder(tparams, options)
    f_senc = theano.function([x, x_mask], sentences, name='f_senc')

    print 'Compiling image encoder...'
    [im], images = build_image_encoder(tparams, options)
    f_ienc = theano.function([im], images, name='f_ienc')

    print 'Compiling error computation...'
    [s, im], errs = build_errors(options)
    f_err = theano.function([s, im], errs, name='f_err')

    # Store everything we need in a dictionary
    print 'Packing up...'
    model['f_senc'] = f_senc
    model['f_ienc'] = f_ienc
    model['f_err'] = f_err

    return model
def load_model_path(path_to_model):
    """
    Load all model components
    """
    print path_to_model

    # Load model
    print 'Loading model'
    with open(path_to_model + '.pkl', 'rb') as f:
        model = pkl.load(f)
    options = model['options']

    # Load parameters
    print 'Loading model parameters...'
    params = init_params(options)
    params = load_params(path_to_model + '.npz', params)
    tparams = init_tparams(params)

    # Extractor functions
    print 'Compiling sentence encoder...'
    [x, x_mask], sentences = build_sentence_encoder(tparams, options)
    f_senc = theano.function([x, x_mask], sentences, name='f_senc')

    print 'Compiling image encoder...'
    [im], images = build_image_encoder(tparams, options)
    f_ienc = theano.function([im], images, name='f_ienc')

    print 'Compiling error computation...'
    [s, im], errs = build_errors(options)
    f_err = theano.function([s, im], errs, name='f_err')

    # Store everything we need in a dictionary
    print 'Packing up...'
    model['f_senc'] = f_senc
    model['f_ienc'] = f_ienc
    model['f_err'] = f_err

    return model
def load_model(path_to_model):
    """
    Load all model components
    """
    print path_to_model

    # Load model
    print 'Loading model'
    with open(path_to_model + '.pkl', 'rb') as f:
        model = pkl.load(f)
    options = model['options']

    # Load parameters
    print 'Loading model parameters...'
    params = init_params(options)
    params = load_params(path_to_model + '.npz', params)
    tparams = init_tparams(params)

    # Extractor functions
    print 'Compiling sentence encoder...'
    [x, x_mask], sentences = build_sentence_encoder(tparams, options)
    f_senc = theano.function([x, x_mask], sentences, name='f_senc')

    print 'Compiling image encoder...'
    [im], images = build_image_encoder(tparams, options)
    f_ienc = theano.function([im], images, name='f_ienc')

    print 'Compiling error computation...'
    [s, im], errs = build_errors(options)
    f_err = theano.function([s, im], errs, name='f_err')

    # Store everything we need in a dictionary
    print 'Packing up...'
    model['f_senc'] = f_senc
    model['f_ienc'] = f_ienc
    model['f_err'] = f_err

    return model
def trainer(X, C, stmodel,
            dimctx=4800,  # context vector dimensionality
            dim_word=620,  # word vector dimensionality
            dim=1600,  # the number of GRU units
            encoder='gru',
            decoder='gru',
            doutput=False,
            max_epochs=5,
            dispFreq=1,
            decay_c=0.,
            grad_clip=5.,
            n_words=40000,
            maxlen_w=100,
            optimizer='adam',
            batch_size=16,
            saveto='/u/rkiros/research/semhash/models/toy.npz',
            dictionary='/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl',
            embeddings=None,
            saveFreq=1000,
            sampleFreq=100,
            reload_=False):

    # Model options
    model_options = {}
    model_options['dimctx'] = dimctx
    model_options['dim_word'] = dim_word
    model_options['dim'] = dim
    model_options['encoder'] = encoder
    model_options['decoder'] = decoder
    model_options['doutput'] = doutput
    model_options['max_epochs'] = max_epochs
    model_options['dispFreq'] = dispFreq
    model_options['decay_c'] = decay_c
    model_options['grad_clip'] = grad_clip
    model_options['n_words'] = n_words
    model_options['maxlen_w'] = maxlen_w
    model_options['optimizer'] = optimizer
    model_options['batch_size'] = batch_size
    model_options['saveto'] = saveto
    model_options['dictionary'] = dictionary
    model_options['embeddings'] = embeddings
    model_options['saveFreq'] = saveFreq
    model_options['sampleFreq'] = sampleFreq
    model_options['reload_'] = reload_
    print model_options

    # reload options
    if reload_ and os.path.exists(saveto):
        print 'reloading...' + saveto
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    # load dictionary
    print 'Loading dictionary...'
    worddict = load_dictionary(dictionary)

    # Load pre-trained embeddings, if applicable
    if embeddings is not None:
        print 'Loading embeddings...'
        with open(embeddings, 'rb') as f:
            embed_map = pkl.load(f)
        dim_word = len(embed_map.values()[0])
        model_options['dim_word'] = dim_word
        preemb = norm_weight(n_words, dim_word)
        pz = defaultdict(lambda: 0)
        for w in embed_map.keys():
            pz[w] = 1
        for w in worddict.keys()[:n_words - 2]:
            if pz[w] > 0:
                preemb[worddict[w]] = embed_map[w]
    else:
        preemb = None

    # Inverse dictionary
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    print 'Building model'
    params = init_params(model_options, preemb=preemb)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    trng, inps, cost = build_model(tparams, model_options)

    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, trng)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=False)
    print 'Done'

    # weight decay, if applicable
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=False)
    print 'Done'

    print 'Building f_grad...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    f_grad_norm = theano.function(inps, [(g ** 2).sum() for g in grads],
                                  profile=False)
    f_weight_norm = theano.function([], [(t ** 2).sum() for k, t in tparams.iteritems()],
                                    profile=False)
    print 'Done'

    # global-norm gradient clipping
    if grad_clip > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g ** 2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (grad_clip ** 2),
                                           g / tensor.sqrt(g2) * grad_clip,
                                           g))
        grads = new_grads

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    # (compute gradients), (updates parameters)
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)

    print 'Optimization'

    # Each sentence in the minibatch has the same length (for the encoder)
    train_iter = homogeneous_data.HomogeneousData([X, C],
                                                  batch_size=batch_size,
                                                  maxlen=maxlen_w)

    uidx = 0
    lrate = 0.01
    for eidx in xrange(max_epochs):
        n_samples = 0

        print 'Epoch ', eidx

        for x, c in train_iter:
            n_samples += len(x)
            uidx += 1

            x, mask, ctx = homogeneous_data.prepare_data(x, c, worddict,
                                                         stmodel,
                                                         maxlen=maxlen_w,
                                                         n_words=n_words)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen_w
                uidx -= 1
                continue

            ud_start = time.time()
            cost = f_grad_shared(x, mask, ctx)
            f_update(lrate)
            ud = time.time() - ud_start

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                params = unzip(tparams)
                numpy.savez(saveto, history_errs=[], **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

            if numpy.mod(uidx, sampleFreq) == 0:
                x_s = x
                mask_s = mask
                ctx_s = ctx
                for jj in xrange(numpy.minimum(10, len(ctx_s))):
                    sample, score = gen_sample(tparams, f_init, f_next,
                                               ctx_s[jj].reshape(1, model_options['dimctx']),
                                               model_options,
                                               trng=trng, k=1, maxlen=100,
                                               stochastic=False, use_unk=False)
                    print 'Truth ', jj, ': ',
                    for vv in x_s[:, jj]:
                        if vv == 0:
                            break
                        if vv in word_idict:
                            print word_idict[vv],
                        else:
                            print 'UNK',
                    print
                    for kk, ss in enumerate([sample[0]]):
                        print 'Sample (', kk, ') ', jj, ': ',
                        for vv in ss:
                            if vv == 0:
                                break
                            if vv in word_idict:
                                print word_idict[vv],
                            else:
                                print 'UNK',
                        print

        print 'Seen %d samples' % n_samples
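# Hedged numpy sketch of the global-norm clipping rule built above with
# tensor.switch: when the total gradient norm exceeds grad_clip, every
# gradient is rescaled by grad_clip / ||g||, which shrinks the step but
# preserves its direction.
import numpy

def clip_global_norm(grads, grad_clip):
    g2 = sum((g ** 2).sum() for g in grads)   # squared global norm
    if g2 > grad_clip ** 2:
        grads = [g * (grad_clip / numpy.sqrt(g2)) for g in grads]
    return grads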
def trainer(X,
            dim_word=620,  # word vector dimensionality
            dim=2400,  # the number of GRU units
            encoder='gru',
            num_neg=4,
            gamma=1.0,
            max_epochs=5,
            dispFreq=1,
            decay_c=0.,
            grad_clip=5.,
            n_words=20000,
            maxlen_w=30,
            optimizer='adam',
            batch_size=64,
            saveto='/u/rkiros/research/semhash/models/toy.npz',
            dictionary='/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl',
            saveFreq=1000,
            reload_=False):

    # Model options
    model_options = {}
    model_options['dim_word'] = dim_word
    model_options['dim'] = dim
    model_options['encoder'] = encoder
    model_options['num_neg'] = num_neg
    model_options['gamma'] = gamma
    model_options['max_epochs'] = max_epochs
    model_options['dispFreq'] = dispFreq
    model_options['decay_c'] = decay_c
    model_options['grad_clip'] = grad_clip
    model_options['n_words'] = n_words
    model_options['maxlen_w'] = maxlen_w
    model_options['optimizer'] = optimizer
    model_options['batch_size'] = batch_size
    model_options['saveto'] = saveto
    model_options['dictionary'] = dictionary
    model_options['saveFreq'] = saveFreq
    model_options['reload_'] = reload_
    print model_options

    # reload options
    if reload_ and os.path.exists(saveto):
        print 'reloading...' + saveto
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    # load dictionary
    print 'Loading dictionary...'
    worddict = load_dictionary(dictionary)

    # Inverse dictionary
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    trng, x, x_mask, p_f, p_f_mask, p_b, p_b_mask, \
        ns_list, ns_masks, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, p_f, p_f_mask, p_b, p_b_mask] + ns_list + ns_masks

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=False)
    print 'Done'

    # weight decay, if applicable
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=False)
    print 'Done'

    print 'Building f_grad...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    f_grad_norm = theano.function(inps, [(g ** 2).sum() for g in grads],
                                  profile=False)
    f_weight_norm = theano.function([], [(t ** 2).sum() for k, t in tparams.iteritems()],
                                    profile=False)
    print 'Done'

    # global-norm gradient clipping
    if grad_clip > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g ** 2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (grad_clip ** 2),
                                           g / tensor.sqrt(g2) * grad_clip,
                                           g))
        grads = new_grads

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    # (compute gradients), (updates parameters)
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)

    print 'Optimization'

    # Each sentence in the minibatch has the same length (for the encoder)
    trainX = homogeneous_data.grouper(X)
    train_iter = homogeneous_data.HomogeneousData(trainX,
                                                  batch_size=batch_size,
                                                  num_neg=num_neg,
                                                  maxlen=maxlen_w)

    uidx = 0
    lrate = 0.01
    for eidx in xrange(max_epochs):
        n_samples = 0

        print 'Epoch ', eidx

        for x, p_f, p_b, ns in train_iter:
            n_samples += len(x)
            uidx += 1

            # The ns input is a list of num_neg negative sentences; the
            # output ns is a list of num_neg (batchsize, neg_len) arrays
            x, x_mask, p_f, p_f_mask, p_b, p_b_mask, ns_list, ns_masks = \
                homogeneous_data.prepare_data(x, p_f, p_b, ns, worddict,
                                              maxlen=maxlen_w,
                                              n_words=n_words)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen_w
                uidx -= 1
                continue

            ud_start = time.time()
            args = [x, x_mask, p_f, p_f_mask, p_b, p_b_mask] + ns_list + ns_masks
            cost = f_grad_shared(*args)
            f_update(lrate)
            ud = time.time() - ud_start

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                saveto_idx = saveto.format(uidx)
                params = unzip(tparams)
                numpy.savez(saveto_idx, history_errs=[], **params)
                pkl.dump(model_options, open('%s.pkl' % saveto_idx, 'wb'))
                print 'Done'

        print 'Seen %d samples' % n_samples
def trainer(X,
            dim_word=620,  # word vector dimensionality
            dim=2400,  # the number of GRU units
            encoder='gru',
            decoder='gru',
            max_epochs=5,
            dispFreq=1,
            decay_c=0.,
            grad_clip=5.,
            n_words=20000,
            maxlen_w=30,
            optimizer='adam',
            batch_size=512,
            saveto='/u/rkiros/research/semhash/models/toy.npz',
            dictionary='/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl',
            saveFreq=5000,
            reload_=False,
            reload_path='output_books_full/model_ae_full_bsz_64_iter_313000.npz',
            SICK_eval=False):

    # Model options
    model_options = {}
    model_options['dim_word'] = dim_word
    model_options['dim'] = dim
    model_options['encoder'] = encoder
    model_options['decoder'] = decoder
    model_options['max_epochs'] = max_epochs
    model_options['dispFreq'] = dispFreq
    model_options['decay_c'] = decay_c
    model_options['grad_clip'] = grad_clip
    model_options['n_words'] = n_words
    model_options['maxlen_w'] = maxlen_w
    model_options['optimizer'] = optimizer
    model_options['batch_size'] = batch_size
    model_options['saveto'] = saveto
    model_options['dictionary'] = dictionary
    model_options['saveFreq'] = saveFreq
    model_options['reload_'] = reload_
    model_options['reload_path'] = reload_path
    print model_options

    # reload options
    if reload_ and os.path.exists(reload_path):
        print 'reloading...' + reload_path
        with open('%s.pkl' % reload_path, 'rb') as f:
            model_options = pkl.load(f)
        reload_idx = int(reload_path.split('_')[-1].split('.')[0])

    # load dictionary
    print 'Loading dictionary...'
    worddict = load_dictionary(dictionary)

    # Inverse dictionary
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(reload_path):
        params = load_params(reload_path, params)

    tparams = init_tparams(params)

    trng, x, x_mask, y, y_mask, z, z_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask, z, z_mask]

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=False)
    print 'Done'

    # weight decay, if applicable
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=False)
    print 'Done'

    print 'Building f_grad...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    f_grad_norm = theano.function(inps, [(g ** 2).sum() for g in grads],
                                  profile=False)
    f_weight_norm = theano.function([], [(t ** 2).sum() for k, t in tparams.iteritems()],
                                    profile=False)
    print 'Done'

    # global-norm gradient clipping
    if grad_clip > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g ** 2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (grad_clip ** 2),
                                           g / tensor.sqrt(g2) * grad_clip,
                                           g))
        grads = new_grads

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    # (compute gradients), (updates parameters)
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)

    print 'Optimization'

    # Each sentence in the minibatch has the same length (for the encoder)
    trainX = homogeneous_data.grouper(X)
    train_iter = homogeneous_data.HomogeneousData(trainX,
                                                  batch_size=batch_size,
                                                  maxlen=maxlen_w)

    if not reload_:
        uidx = 0
    else:
        uidx = reload_idx
    lrate = 0.01
    for eidx in xrange(max_epochs):
        n_samples = 0

        print 'Epoch ', eidx

        for x, y, z in train_iter:
            n_samples += len(x)
            uidx += 1

            x, x_mask, y, y_mask, z, z_mask = homogeneous_data.prepare_data(
                x, y, z, worddict, maxlen=maxlen_w, n_words=n_words)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen_w
                uidx -= 1
                continue

            ud_start = time.time()
            cost = f_grad_shared(x, x_mask, y, y_mask, z, z_mask)
            f_update(lrate)
            ud = time.time() - ud_start

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                saveto_iternum = saveto.format(uidx)
                params = unzip(tparams)
                numpy.savez(saveto_iternum, history_errs=[], **params)
                pkl.dump(model_options, open('%s.pkl' % saveto_iternum, 'wb'))
                print 'Done'

                if SICK_eval:
                    print "Evaluating SICK Test performance"
                    embed_map = tools.load_googlenews_vectors()
                    model = tools.load_model(path_to_model=saveto_iternum,
                                             embed_map=embed_map)
                    yhat, pr, sr, mse = eval_sick.evaluate(model, evaltest=True)
                    del model
                    del embed_map
                    print pr, sr, mse

                    res_save_file = saveto.format('ALL').split('.')[0] + '_SICK_EVAL.txt'
                    with open(res_save_file, 'a') as rsf:
                        cur_time = strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())
                        rsf.write('\n \n {}'.format(cur_time))
                        rsf.write('\n{}, {}, {}, {}'.format(uidx, pr, sr, mse))
                    print "Done"

        print 'Seen %d samples' % n_samples
def trainer(X, C, stmodel,
            dimctx=4800,  # context vector dimensionality
            dim_word=620,  # word vector dimensionality
            dim=1600,  # the number of GRU units
            encoder='gru',
            decoder='gru',
            doutput=False,
            max_epochs=5,
            dispFreq=1,
            decay_c=0.,
            grad_clip=5.,
            n_words=40000,
            maxlen_w=100,
            optimizer='adam',
            batch_size=16,
            saveto='adventuremodel.npz',
            dictionary='/home/jm7432/tell-tall-tales/decoding/adventure_dict_final.pkl',
            embeddings=None,
            saveFreq=1000,
            sampleFreq=100,
            reload_=False):

    # Model options
    model_options = {}
    model_options['dimctx'] = dimctx
    model_options['dim_word'] = dim_word
    model_options['dim'] = dim
    model_options['encoder'] = encoder
    model_options['decoder'] = decoder
    model_options['doutput'] = doutput
    model_options['max_epochs'] = max_epochs
    model_options['dispFreq'] = dispFreq
    model_options['decay_c'] = decay_c
    model_options['grad_clip'] = grad_clip
    model_options['n_words'] = n_words
    model_options['maxlen_w'] = maxlen_w
    model_options['optimizer'] = optimizer
    model_options['batch_size'] = batch_size
    model_options['saveto'] = saveto
    model_options['dictionary'] = dictionary
    model_options['embeddings'] = embeddings
    model_options['saveFreq'] = saveFreq
    model_options['sampleFreq'] = sampleFreq
    model_options['reload_'] = reload_
    print model_options

    # reload options
    if reload_ and os.path.exists(saveto):
        print 'reloading...' + saveto
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    # load dictionary
    print 'Loading dictionary...'
    worddict = load_dictionary(dictionary)

    # Load pre-trained embeddings, if applicable
    if embeddings is not None:
        print 'Loading embeddings...'
        with open(embeddings, 'rb') as f:
            embed_map = pkl.load(f)
        dim_word = len(embed_map.values()[0])
        model_options['dim_word'] = dim_word
        preemb = norm_weight(n_words, dim_word)
        pz = defaultdict(lambda: 0)
        for w in embed_map.keys():
            pz[w] = 1
        for w in worddict.keys()[:n_words - 2]:
            if pz[w] > 0:
                preemb[worddict[w]] = embed_map[w]
    else:
        preemb = None

    # Inverse dictionary
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    print 'Building model'
    params = init_params(model_options, preemb=preemb)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    trng, inps, cost = build_model(tparams, model_options)

    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, trng)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=False)
    print 'Done'

    # weight decay, if applicable
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=False)
    print 'Done'

    print 'Building f_grad...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    f_grad_norm = theano.function(inps, [(g ** 2).sum() for g in grads],
                                  profile=False)
    f_weight_norm = theano.function([], [(t ** 2).sum() for k, t in tparams.iteritems()],
                                    profile=False)
    print 'Done'

    # global-norm gradient clipping
    if grad_clip > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g ** 2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (grad_clip ** 2),
                                           g / tensor.sqrt(g2) * grad_clip,
                                           g))
        grads = new_grads

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    # (compute gradients), (updates parameters)
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)

    print 'Optimization'

    # Each sentence in the minibatch has the same length (for the encoder)
    train_iter = homogeneous_data.HomogeneousData([X, C],
                                                  batch_size=batch_size,
                                                  maxlen=maxlen_w)

    uidx = 0
    lrate = 0.01
    for eidx in xrange(max_epochs):
        n_samples = 0

        print 'Epoch ', eidx

        for x, c in train_iter:
            n_samples += len(x)
            uidx += 1

            x, mask, ctx = homogeneous_data.prepare_data(x, c, worddict,
                                                         stmodel,
                                                         maxlen=maxlen_w,
                                                         n_words=n_words)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen_w
                uidx -= 1
                continue

            ud_start = time.time()
            cost = f_grad_shared(x, mask, ctx)
            f_update(lrate)
            ud = time.time() - ud_start

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                params = unzip(tparams)
                numpy.savez(saveto, history_errs=[], **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

            if numpy.mod(uidx, sampleFreq) == 0:
                x_s = x
                mask_s = mask
                ctx_s = ctx
                for jj in xrange(numpy.minimum(10, len(ctx_s))):
                    sample, score = gen_sample(tparams, f_init, f_next,
                                               ctx_s[jj].reshape(1, model_options['dimctx']),
                                               model_options,
                                               trng=trng, k=1, maxlen=100,
                                               stochastic=False, use_unk=False)
                    print 'Truth ', jj, ': ',
                    for vv in x_s[:, jj]:
                        if vv == 0:
                            break
                        if vv in word_idict:
                            print word_idict[vv],
                        else:
                            print 'UNK',
                    print
                    for kk, ss in enumerate([sample[0]]):
                        print 'Sample (', kk, ') ', jj, ': ',
                        for vv in ss:
                            if vv == 0:
                                break
                            if vv in word_idict:
                                print word_idict[vv],
                            else:
                                print 'UNK',
                        print

        print 'Seen %d samples' % n_samples
def trainer(data='coco',  # f8k, f30k, coco
            margin=0.2,
            dim=1024,
            dim_image=4096,
            dim_word=300,
            encoder='gru',  # gru OR bow
            max_epochs=15,
            dispFreq=10,
            decay_c=0.,
            grad_clip=2.,
            maxlen_w=100,
            optimizer='adam',
            batch_size=128,
            saveto='/ais/gobi3/u/rkiros/uvsmodels/coco.npz',
            validFreq=100,
            lrate=0.0002,
            reload_=False):

    # Model options
    model_options = {}
    model_options['data'] = data
    model_options['margin'] = margin
    model_options['dim'] = dim
    model_options['dim_image'] = dim_image
    model_options['dim_word'] = dim_word
    model_options['encoder'] = encoder
    model_options['max_epochs'] = max_epochs
    model_options['dispFreq'] = dispFreq
    model_options['decay_c'] = decay_c
    model_options['grad_clip'] = grad_clip
    model_options['maxlen_w'] = maxlen_w
    model_options['optimizer'] = optimizer
    model_options['batch_size'] = batch_size
    model_options['saveto'] = saveto
    model_options['validFreq'] = validFreq
    model_options['lrate'] = lrate
    model_options['reload_'] = reload_
    print(model_options)

    # reload options
    if reload_ and os.path.exists(saveto):
        print('reloading...' + saveto)
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    # Load training and development sets
    print('Loading dataset')
    train, dev = load_dataset(data)[:2]

    # Create and save dictionary
    print('Creating dictionary')
    worddict = build_dictionary(train[0] + dev[0])[0]
    n_words = len(worddict)
    model_options['n_words'] = n_words
    print('Dictionary size: ' + str(n_words))
    with open('%s.dictionary.pkl' % saveto, 'wb') as f:
        pkl.dump(worddict, f)

    # Inverse dictionary
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    print('Building model')
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    trng, inps, cost = build_model(tparams, model_options)

    # before any regularizer
    print('Building f_log_probs...')
    f_log_probs = theano.function(inps, cost, profile=False)
    print('Done')

    # weight decay, if applicable
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer
    print('Building f_cost...')
    f_cost = theano.function(inps, cost, profile=False)
    print('Done')

    print('Building sentence encoder')
    trng, inps_se, sentences = build_sentence_encoder(tparams, model_options)
    f_senc = theano.function(inps_se, sentences, profile=False)

    print('Building image encoder')
    trng, inps_ie, images = build_image_encoder(tparams, model_options)
    f_ienc = theano.function(inps_ie, images, profile=False)

    print('Building f_grad...')
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    f_grad_norm = theano.function(inps, [(g ** 2).sum() for g in grads],
                                  profile=False)
    f_weight_norm = theano.function([], [(t ** 2).sum() for k, t in tparams.iteritems()],
                                    profile=False)

    # global-norm gradient clipping
    if grad_clip > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g ** 2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (grad_clip ** 2),
                                           g / tensor.sqrt(g2) * grad_clip,
                                           g))
        grads = new_grads

    lr = tensor.scalar(name='lr')
    print('Building optimizers...')
    # (compute gradients), (updates parameters)
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)

    print('Optimization')

    # Each sentence in the minibatch has the same length (for the encoder)
    train_iter = homogeneous_data.HomogeneousData([train[0], train[1]],
                                                  batch_size=batch_size,
                                                  maxlen=maxlen_w)

    uidx = 0
    curr = 0.
    n_samples = 0

    for eidx in xrange(max_epochs):
        print('Epoch ', eidx)

        for x, im in train_iter:
            n_samples += len(x)
            uidx += 1

            x, mask, im = homogeneous_data.prepare_data(x, im, worddict,
                                                        maxlen=maxlen_w,
                                                        n_words=n_words)

            if x is None:
                print('Minibatch with zero sample under length ', maxlen_w)
                uidx -= 1
                continue

            # Update
            ud_start = time.time()
            cost = f_grad_shared(x, mask, im)
            f_update(lrate)
            ud = time.time() - ud_start

            if numpy.isnan(cost) or numpy.isinf(cost):
                print('NaN detected')
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print('Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud)

            if numpy.mod(uidx, validFreq) == 0:
                print('Computing results...')
                curr_model = {}
                curr_model['options'] = model_options
                curr_model['worddict'] = worddict
                curr_model['word_idict'] = word_idict
                curr_model['f_senc'] = f_senc
                curr_model['f_ienc'] = f_ienc

                ls = encode_sentences(curr_model, dev[0])
                lim = encode_images(curr_model, dev[1])

                (r1, r5, r10, medr) = i2t(lim, ls)
                print("Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr))
                (r1i, r5i, r10i, medri) = t2i(lim, ls)
                print("Text to image: %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri))

                currscore = r1 + r5 + r10 + r1i + r5i + r10i
                if currscore > curr:
                    curr = currscore

                    # Save model
                    print('Saving...')
                    params = unzip(tparams)
                    numpy.savez(saveto, **params)
                    pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                    print('Done')

        print('Seen %d samples' % n_samples)
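# The validation block above reports R@1/R@5/R@10 and the median rank. A
# hedged numpy sketch of those retrieval metrics, assuming a 1-based `ranks`
# array holding the rank of each query's correct match:
import numpy

def retrieval_metrics(ranks):
    # ranks[i] is the 1-based rank of the correct item for query i
    ranks = numpy.asarray(ranks, dtype=numpy.float64)
    r1 = 100.0 * (ranks <= 1).mean()     # recall at 1
    r5 = 100.0 * (ranks <= 5).mean()     # recall at 5
    r10 = 100.0 * (ranks <= 10).mean()   # recall at 10
    medr = numpy.median(ranks)           # median rank
    return r1, r5, r10, medr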
def trainer(load_from=None, save_dir="snapshots", name="anon", **kwargs): """ :param load_from: location to load parameters + options from :param name: name of model, used as location to save parameters + options """ curr_model = dict() # load old model, including parameters, but overwrite with new options if load_from: print "reloading..." + load_from with open("%s.pkl" % load_from, "rb") as f: curr_model = pkl.load(f) else: curr_model["options"] = {} for k, v in kwargs.iteritems(): curr_model["options"][k] = v model_options = curr_model["options"] # initialize logger import datetime timestampedName = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + "_" + name from logger import Log log = Log( name=timestampedName, hyperparams=model_options, saveDir="vis/training", xLabel="Examples Seen", saveFrequency=1 ) print curr_model["options"] # Load training and development sets print "Loading dataset" dataset = load_dataset(model_options["data"], cnn=model_options["cnn"], load_train=True) train = dataset["train"] dev = dataset["dev"] # Create dictionary print "Creating dictionary" worddict = build_dictionary(train["caps"] + dev["caps"]) print "Dictionary size: " + str(len(worddict)) curr_model["worddict"] = worddict curr_model["options"]["n_words"] = len(worddict) + 2 # save model pkl.dump(curr_model, open("%s/%s.pkl" % (save_dir, name), "wb")) print "Loading data" train_iter = datasource.Datasource(train, batch_size=model_options["batch_size"], worddict=worddict) dev = datasource.Datasource(dev, worddict=worddict) dev_caps, dev_ims = dev.all() print "Building model" params = init_params(model_options) # reload parameters if load_from is not None and os.path.exists(load_from): params = load_params(load_from, params) tparams = init_tparams(params) inps, cost = build_model(tparams, model_options) print "Building sentence encoder" inps_se, sentences = build_sentence_encoder(tparams, model_options) f_senc = theano.function(inps_se, sentences, profile=False) print "Building image encoder" inps_ie, images = build_image_encoder(tparams, model_options) f_ienc = theano.function(inps_ie, images, profile=False) print "Building f_grad...", grads = tensor.grad(cost, wrt=itemlist(tparams)) print "Building errors.." inps_err, errs = build_errors(model_options) f_err = theano.function(inps_err, errs, profile=False) curr_model["f_senc"] = f_senc curr_model["f_ienc"] = f_ienc curr_model["f_err"] = f_err if model_options["grad_clip"] > 0.0: grads = [maxnorm(g, model_options["grad_clip"]) for g in grads] lr = tensor.scalar(name="lr") print "Building optimizers...", # (compute gradients), (updates parameters) f_grad_shared, f_update = eval(model_options["optimizer"])(lr, tparams, grads, inps, cost) print "Optimization" uidx = 0 curr = 0 n_samples = 0 for eidx in xrange(model_options["max_epochs"]): print "Epoch ", eidx for x, mask, im in train_iter: n_samples += x.shape[1] uidx += 1 # Update ud_start = time.time() cost = f_grad_shared(x, mask, im) f_update(model_options["lrate"]) ud = time.time() - ud_start if numpy.isnan(cost) or numpy.isinf(cost): print "NaN detected" return 1.0, 1.0, 1.0 if numpy.mod(uidx, model_options["dispFreq"]) == 0: print "Epoch ", eidx, "Update ", uidx, "Cost ", cost, "UD ", ud log.update({"Error": float(cost)}, n_samples) if numpy.mod(uidx, model_options["validFreq"]) == 0: print "Computing results..." 
# encode sentences efficiently dev_s = encode_sentences(curr_model, dev_caps, batch_size=model_options["batch_size"]) dev_i = encode_images(curr_model, dev_ims) # compute errors dev_errs = compute_errors(curr_model, dev_s, dev_i) # compute ranking error (r1, r5, r10, medr, meanr), vis_details = t2i(dev_errs, vis_details=True) (r1i, r5i, r10i, medri, meanri) = i2t(dev_errs) print "Text to image (dev set): %.1f, %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr, meanr) log.update({"R@1": r1, "R@5": r5, "R@10": r10, "median_rank": medr, "mean_rank": meanr}, n_samples) print "Image to text (dev set): %.1f, %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri, meanri) log.update( { "Image2Caption_R@1": r1i, "Image2Caption_R@5": r5i, "Image2CaptionR@10": r10i, "Image2Caption_median_rank": medri, "Image2Caption_mean_rank": meanri, }, n_samples, ) tot = r1 + r5 + r10 if tot > curr: curr = tot # Save parameters print "Saving...", numpy.savez("%s/%s" % (save_dir, name), **unzip(tparams)) print "Done" vis_details["hyperparams"] = model_options # Save visualization details with open("vis/roc/%s/%s.json" % (model_options["data"], timestampedName), "w") as f: json.dump(vis_details, f) # Add the new model to the index try: index = json.load(open("vis/roc/index.json", "r")) except IOError: index = {model_options["data"]: []} models = index[model_options["data"]] if timestampedName not in models: models.append(timestampedName) with open("vis/roc/index.json", "w") as f: json.dump(index, f) print "Seen %d samples" % n_samples
def train(dim_word=100,  # word vector dimensionality
          dim=1000,  # the number of LSTM units
          encoder='gru',
          decoder='gru_cond',
          patience=10,
          max_epochs=5000,
          dispFreq=100,
          decay_c=0.,
          alpha_c=0.,
          diag_c=0.,
          clip_c=-1.,
          lrate=0.01,
          n_words_src=100000,
          n_words=100000,
          maxlen=100,  # maximum length of the description
          optimizer='rmsprop',
          batch_size=16,
          valid_batch_size=16,
          saveto='model.npz',
          validFreq=1000,
          saveFreq=1000,  # save the parameters after every saveFreq updates
          sampleFreq=100,  # generate some samples after every sampleFreq updates
          datasets=[
              '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
              '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'
          ],
          valid_datasets=[
              '../data/dev/newstest2011.en.tok',
              '../data/dev/newstest2011.fr.tok'
          ],
          dictionaries=[
              '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
              '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'
          ],
          use_dropout=False,
          reload_=False):

    # Model options
    model_options = locals().copy()

    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    for ii, dd in enumerate(dictionaries):
        with open(dd, 'rb') as f:
            worddicts[ii] = pkl.load(f)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    # reload options
    if reload_ and os.path.exists(saveto):
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print 'Loading data'
    train = TextIterator(datasets[0], datasets[1],
                         dictionaries[0], dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words,
                         batch_size=batch_size,
                         maxlen=maxlen)
    valid = TextIterator(valid_datasets[0], valid_datasets[1],
                         dictionaries[0], dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words,
                         batch_size=valid_batch_size,
                         maxlen=maxlen)

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, trng)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    print 'Done'

    cost = cost.mean()

    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0)) ** 2).sum(1).mean()
        cost += alpha_reg

    # after any regularizer
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'
    print 'Building f_grad...',
    f_grad = theano.function(inps, grads, profile=profile)
    print 'Done'

    # global-norm gradient clipping
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g ** 2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (clip_c ** 2),
                                           g / tensor.sqrt(g2) * clip_c,
                                           g))
        grads = new_grads

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)
    print 'Done'

    print 'Optimization'

    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = list(numpy.load(saveto)['history_errs'])
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0]) / batch_size

    uidx = 0
    estop = False
    for eidx in xrange(max_epochs):
        n_samples = 0

        for x, y in train:
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            x, x_mask, y, y_mask = prepare_data(x, y, maxlen=maxlen,
                                                n_words_src=n_words_src,
                                                n_words=n_words)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            ud_start = time.time()
            cost = f_grad_shared(x, x_mask, y, y_mask)
            f_update(lrate)
            ud = time.time() - ud_start

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

            if numpy.mod(uidx, sampleFreq) == 0:
                # FIXME: random selection?
                for jj in xrange(numpy.minimum(5, x.shape[1])):
                    stochastic = True
                    sample, score = gen_sample(tparams, f_init, f_next,
                                               x[:, jj][:, None],
                                               model_options, trng=trng, k=1,
                                               maxlen=30,
                                               stochastic=stochastic,
                                               argmax=False)
                    print 'Source ', jj, ': ',
                    for vv in x[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[0]:
                            print worddicts_r[0][vv],
                        else:
                            print 'UNK',
                    print
                    print 'Truth ', jj, ' : ',
                    for vv in y[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[1]:
                            print worddicts_r[1][vv],
                        else:
                            print 'UNK',
                    print
                    print 'Sample ', jj, ': ',
                    if stochastic:
                        ss = sample
                    else:
                        score = score / numpy.array([len(s) for s in sample])
                        ss = sample[score.argmin()]
                    for vv in ss:
                        if vv == 0:
                            break
                        if vv in worddicts_r[1]:
                            print worddicts_r[1][vv],
                        else:
                            print 'UNK',
                    print

            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs = pred_probs(f_log_probs, prepare_data,
                                        model_options, valid)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= numpy.array(
                        history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                if numpy.isnan(valid_err):
                    import ipdb
                    ipdb.set_trace()

                print 'Valid ', valid_err

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid_err = pred_probs(f_log_probs, prepare_data,
                           model_options, valid).mean()

    print 'Valid ', valid_err

    params = copy.copy(best_p)
    numpy.savez(saveto, zipped_params=best_p,
                history_errs=history_errs,
                **params)

    return valid_err
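# Hedged sketch of the patience rule in the validation block above: a new
# best resets the strike counter, a failure to beat the best pre-patience
# value adds a strike, and training stops after more than `patience` strikes.
# The function name and return shape are illustrative only.
def check_early_stop(history_errs, valid_err, patience, bad_counter):
    # history_errs already includes valid_err, as in the loop above
    if valid_err <= min(history_errs):
        bad_counter = 0
    elif len(history_errs) > patience and \
            valid_err >= min(history_errs[:-patience]):
        bad_counter += 1
    return bad_counter > patience, bad_counter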
def trainer(**kwargs):
    """
    Train the model according to input params
    Info about input params is available in parameters.py
    """
    # Timing
    print('Starting time:', datetime.now())
    sys.stdout.flush()
    t_start_train = time.time()

    # Model options
    # load old model, including parameters, but overwrite with new options

    # Extract model options from arguments
    model_options = {}
    for k, v in kwargs.iteritems():
        model_options[k] = v

    # Print input options
    print('PARAMETERS BEFORE LOADING:')
    for k, v in model_options.items():
        print('{:>26}: {}'.format(k, v))
    sys.stdout.flush()

    # Reload options if required
    curr_model = dict()
    if model_options['reload_']:
        # Reload model parameters
        opt_filename_reload = get_opt_filename(model_options, previous=True)
        print('reloading...', opt_filename_reload)
        sys.stdout.flush()
        try:
            with open(opt_filename_reload, 'rb') as f:
                curr_model = pkl.load(f)
        except:
            print('Failed to reload parameters; falling back to the '
                  'parameters that were passed in')
            curr_model['options'] = {}

        # Check if we reload from best model or last model
        if model_options['load_from'] in ['Best', 'best', 'B', 'b']:
            load_from_best = True
            print('Loading from Best saved model in validation results')
        elif model_options['load_from'] in ['Last', 'last', 'L', 'l']:
            load_from_best = False
            print('Loading from Last saved model')
        else:
            print('Unknown choice for "load_from" parameter',
                  model_options['load_from'])
            print('Please choose one of:', ['Best', 'best', 'B', 'b'],
                  ['Last', 'last', 'L', 'l'])
            print('Using Last as default')
            load_from_best = False

        # Reload end-point parameters
        state_filename = get_sol_filename(model_options,
                                          best=load_from_best,
                                          previous=True)
        print('reloading...', state_filename)
        sys.stdout.flush()
        try:
            with open(state_filename, 'rb') as f:
                state_params = pkl.load(f)
            if load_from_best:
                init_epoch = state_params['epoch']
                solution = state_params
            else:
                init_epoch = state_params['epoch_done'] + 1
                solution = state_params['solution']
            best_val_score = solution['best_val_score']
            n_samples = solution['samples_seen']
        except:
            print('Failed to reload state parameters, starting from 0')
            init_epoch = 0
            best_val_score = 0
            n_samples = 0
    else:
        curr_model['options'] = {}
        init_epoch = 0
        best_val_score = 0
        n_samples = 0

    # Overwrite loaded options with input options
    for k, v in kwargs.iteritems():
        curr_model['options'][k] = v
    model_options = curr_model['options']

    # Print final options loaded
    if model_options['reload_']:
        print('PARAMETERS AFTER LOADING:')
        for k, v in model_options.items():
            print('{:>26}: {}'.format(k, v))
        sys.stdout.flush()

    # Load training and development sets
    print('Loading dataset')
    sys.stdout.flush()

    dataset = load_dataset(dataset_name=model_options['data'],
                           embedding=model_options['embedding'],
                           path_to_data=model_options['data_path'],
                           test_subset=model_options['test_subset'],
                           load_train=True,
                           fold=0)
    train = dataset['train']
    dev = dataset['val']

    # Create word dictionary
    print('Creating dictionary')
    sys.stdout.flush()
    worddict = build_dictionary(train['caps'] + dev['caps'])
    print('Dictionary size: ' + str(len(worddict)))
    sys.stdout.flush()
    curr_model['worddict'] = worddict
    curr_model['options']['n_words'] = len(worddict) + 2

    # save model
    opt_filename_save = get_opt_filename(model_options, previous=False)
    print('Saving model parameters in', opt_filename_save)
    sys.stdout.flush()
    try:
        os.makedirs(os.path.dirname(opt_filename_save))
    except:
        pass
    pkl.dump(curr_model, open(opt_filename_save, 'wb'))

    # Load data from dataset
    print('Loading data')
    sys.stdout.flush()
    train_iter = datasource.Datasource(train,
batch_size=model_options['batch_size'], worddict=worddict) dev = datasource.Datasource(dev, worddict=worddict) dev_caps, dev_ims = dev.all() print('Building model') sys.stdout.flush() params = init_params(model_options) # reload network parameters, ie. weights if model_options['reload_']: params_filename = get_npz_filename(model_options, best=load_from_best, previous=True) params = load_params(params_filename, params) tparams = init_tparams(params) inps, cost = build_model(tparams, model_options) print('Building sentence encoder') sys.stdout.flush() inps_se, sentences = build_sentence_encoder(tparams, model_options) f_senc = theano.function(inps_se, sentences, profile=False) print('Building image encoder') sys.stdout.flush() inps_ie, images = build_image_encoder(tparams, model_options) f_ienc = theano.function(inps_ie, images, profile=False) print('Building f_grad...') sys.stdout.flush() grads = tensor.grad(cost, wrt=itemlist(tparams)) print('Building errors...') sys.stdout.flush() inps_err, errs = build_errors(model_options) f_err = theano.function(inps_err, errs, profile=False) curr_model['f_senc'] = f_senc curr_model['f_ienc'] = f_ienc curr_model['f_err'] = f_err if model_options['grad_clip'] > 0.: grads = [maxnorm(g, model_options['grad_clip']) for g in grads] lr = tensor.scalar(name='lr') print('Building optimizers...') sys.stdout.flush() # (compute gradients), (updates parameters) f_grad_shared, f_update = eval(model_options['optimizer'])(lr, tparams, grads, inps, cost) # Get names for the files to save model and solution sol_filename_best = get_sol_filename(model_options, best=True, previous=False) sol_filename_last = get_sol_filename(model_options, best=False, previous=False) params_filename_best = get_npz_filename(model_options, best=True, previous=False) params_filename_last = get_npz_filename(model_options, best=False, previous=False) print('PATHS TO MODELS:') for filename in [ sol_filename_best, sol_filename_last, params_filename_best, params_filename_last ]: print(filename) sys.stdout.flush() try: os.makedirs(os.path.dirname(filename)) except: pass # Start optimization print('Optimization') sys.stdout.flush() uidx = 0 # Timing t_start = time.time() print('Starting time:', datetime.now()) for eidx in range(init_epoch, model_options['max_epochs']): t_start_epoch = time.time() print('Epoch ', eidx) sys.stdout.flush() for x, mask, im in train_iter: n_samples += x.shape[1] uidx += 1 # Update ud_start = time.time() cost = f_grad_shared(x, mask, im) f_update(model_options['lrate']) ud = time.time() - ud_start if numpy.isnan(cost) or numpy.isinf(cost): print('NaN detected') sys.stdout.flush() return 1., 1., 1. 
if numpy.mod(uidx, model_options['dispFreq']) == 0: print('Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud) sys.stdout.flush() if numpy.mod(uidx, model_options['validFreq']) == 0: print('Computing results...') sys.stdout.flush() # encode sentences efficiently dev_s = encode_sentences( curr_model, dev_caps, batch_size=model_options['batch_size']) dev_i = encode_images(curr_model, dev_ims) # compute errors dev_errs = compute_errors(curr_model, dev_s, dev_i) # compute ranking error (r1, r5, r10, medr, meanr) = i2t(dev_errs) (r1i, r5i, r10i, medri, meanri) = t2i(dev_errs) print("Text to image (dev set): %.1f, %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri, meanri)) sys.stdout.flush() print("Image to text (dev set): %.1f, %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr, meanr)) sys.stdout.flush() # Score val_score = r1 + r5 + r10 + r1i + r5i + r10i if val_score > best_val_score: print('BEST MODEL FOUND') print('Score:', val_score) print('Previous best score:', best_val_score) best_val_score = val_score # Join in a results dict results_dict = build_results_dict(r1, r5, r10, medr, r1i, r5i, r10i, medri) # Save parameters print('Saving...', end=' ') sys.stdout.flush() numpy.savez(params_filename_best, **unzip(tparams)) print('Done') sys.stdout.flush() # Update solution solution = OrderedDict([ ('epoch', eidx), ('update', uidx), ('samples_seen', n_samples), ('best_val_score', best_val_score), ('best_val_res', results_dict), ('time_until_results', str(timedelta(seconds=(time.time() - t_start_train)))) ]) pkl.dump(solution, open(sol_filename_best, 'wb')) print('Seen %d samples' % n_samples) sys.stdout.flush() # Timing t_epoch = time.time() - t_start_epoch t_epoch_avg = (time.time() - t_start) / (eidx + 1 - (init_epoch)) print('Time for this epoch:', str(timedelta(seconds=t_epoch)), 'Average:', str(timedelta(seconds=t_epoch_avg))) t_2_complete = t_epoch_avg * (model_options['max_epochs'] - (eidx + 1)) print('Time since start session:', str(timedelta(seconds=time.time() - t_start)), 'Estimated time to complete training:', str(timedelta(seconds=t_2_complete))) print('Current time:', datetime.now()) sys.stdout.flush() # Save current model try: state_params = OrderedDict([('epoch_done', eidx), ('solution', solution)]) except: solution = OrderedDict([ ('epoch', eidx), ('update', uidx), ('samples_seen', n_samples), ('best_val_score', best_val_score), ('time_until_results', str(timedelta(seconds=(time.time() - t_start_train)))) ]) state_params = OrderedDict([('epoch_done', eidx), ('solution', solution)]) pkl.dump(state_params, open(sol_filename_last, 'wb')) # Save parameters print('Saving LAST npz...', end=' ') sys.stdout.flush() numpy.savez(params_filename_last, **unzip(tparams)) print('Done') sys.stdout.flush() return solution
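# i2t() and t2i() above summarize a pairwise error matrix into Recall@K plus
# median and mean rank. A minimal numpy sketch, assuming errs[i, j] is the
# distance between caption i and image j with ground-truth pairs on the
# diagonal (the repo's versions also handle several captions per image):
def ranking_metrics_sketch(errs):
    import numpy
    n = errs.shape[0]
    # position of the true match in each row once candidates are sorted
    ranks = numpy.array([int(numpy.where(errs[i].argsort() == i)[0][0])
                         for i in range(n)])
    r1 = 100.0 * numpy.mean(ranks < 1)
    r5 = 100.0 * numpy.mean(ranks < 5)
    r10 = 100.0 * numpy.mean(ranks < 10)
    medr = numpy.median(ranks) + 1
    meanr = ranks.mean() + 1
    return r1, r5, r10, medr, meanr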
def trainer(load_from=None, save_dir='snapshots', name='anon', **kwargs): """ :param load_from: location to load parameters + options from :param name: name of model, used as location to save parameters + options """ curr_model = dict() # load old model, including parameters, but overwrite with new options if load_from: print 'reloading...' + load_from with open('%s.pkl'%load_from, 'rb') as f: curr_model = pkl.load(f) else: curr_model['options'] = {} for k, v in kwargs.iteritems(): curr_model['options'][k] = v model_options = curr_model['options'] # initialize logger import datetime timestampedName = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '_' + name from logger import Log log = Log(name=timestampedName, hyperparams=model_options, saveDir='vis/training', xLabel='Examples Seen', saveFrequency=1) print curr_model['options'] # Load training and development sets print 'Loading dataset' dataset = load_dataset(model_options['data'], cnn=model_options['cnn'], load_train=True) train = dataset['train'] dev = dataset['dev'] # Create dictionary print 'Creating dictionary' worddict = build_dictionary(train['caps']+dev['caps']) print 'Dictionary size: ' + str(len(worddict)) curr_model['worddict'] = worddict curr_model['options']['n_words'] = len(worddict) + 2 # save model pkl.dump(curr_model, open('%s/%s.pkl' % (save_dir, name), 'wb')) print 'Loading data' train_iter = datasource.Datasource(train, batch_size=model_options['batch_size'], worddict=worddict) dev = datasource.Datasource(dev, worddict=worddict) dev_caps, dev_ims = dev.all() print 'Building model' params = init_params(model_options) # reload parameters if load_from is not None and os.path.exists(load_from): params = load_params(load_from, params) tparams = init_tparams(params) inps, cost = build_model(tparams, model_options) print 'Building sentence encoder' inps_se, sentences = build_sentence_encoder(tparams, model_options) f_senc = theano.function(inps_se, sentences, profile=False) print 'Building image encoder' inps_ie, images = build_image_encoder(tparams, model_options) f_ienc = theano.function(inps_ie, images, profile=False) print 'Building f_grad...', grads = tensor.grad(cost, wrt=itemlist(tparams)) print 'Building errors..' inps_err, errs = build_errors(model_options) f_err = theano.function(inps_err, errs, profile=False) curr_model['f_senc'] = f_senc curr_model['f_ienc'] = f_ienc curr_model['f_err'] = f_err if model_options['grad_clip'] > 0.: grads = [maxnorm(g, model_options['grad_clip']) for g in grads] lr = tensor.scalar(name='lr') print 'Building optimizers...', # (compute gradients), (updates parameters) f_grad_shared, f_update = eval(model_options['optimizer'])(lr, tparams, grads, inps, cost) print 'Optimization' uidx = 0 curr = 0 n_samples = 0 for eidx in xrange(model_options['max_epochs']): print 'Epoch ', eidx for x, mask, im in train_iter: n_samples += x.shape[1] uidx += 1 # Update ud_start = time.time() cost = f_grad_shared(x, mask, im) f_update(model_options['lrate']) ud = time.time() - ud_start if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return 1., 1., 1. if numpy.mod(uidx, model_options['dispFreq']) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud log.update({'Error': float(cost)}, n_samples) if numpy.mod(uidx, model_options['validFreq']) == 0: print 'Computing results...' 
                # encode sentences efficiently
                dev_s = encode_sentences(curr_model, dev_caps,
                                         batch_size=model_options['batch_size'])
                dev_i = encode_images(curr_model, dev_ims)

                # compute errors
                dev_errs = compute_errors(curr_model, dev_s, dev_i)

                # compute ranking error
                (r1, r5, r10, medr, meanr), vis_details = t2i(
                    dev_errs, vis_details=True)
                (r1i, r5i, r10i, medri, meanri) = i2t(dev_errs)
                print "Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" % (
                    r1, r5, r10, medr, meanr)
                log.update({'R@1': r1, 'R@5': r5, 'R@10': r10,
                            'median_rank': medr, 'mean_rank': meanr},
                           n_samples)
                print "Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % (
                    r1i, r5i, r10i, medri, meanri)
                log.update({'Image2Caption_R@1': r1i,
                            'Image2Caption_R@5': r5i,
                            'Image2Caption_R@10': r10i,
                            'Image2Caption_median_rank': medri,
                            'Image2Caption_mean_rank': meanri},
                           n_samples)

                tot = r1 + r5 + r10
                if tot > curr:
                    curr = tot
                    # Save parameters
                    print 'Saving...',
                    numpy.savez('%s/%s' % (save_dir, name), **unzip(tparams))
                    print 'Done'
                    vis_details['hyperparams'] = model_options
                    # Save visualization details
                    with open('vis/roc/%s/%s.json' %
                              (model_options['data'], timestampedName),
                              'w') as f:
                        json.dump(vis_details, f)
                    # Add the new model to the index
                    index = json.load(open('vis/roc/index.json', 'r'))
                    models = index[model_options['data']]
                    if timestampedName not in models:
                        models.append(timestampedName)
                    with open('vis/roc/index.json', 'w') as f:
                        json.dump(index, f)

        print 'Seen %d samples' % n_samples
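# Unlike the NMT trainer's global-norm clipping, this trainer clips each
# gradient tensor separately through maxnorm(). maxnorm() lives elsewhere in
# the repo; a plausible theano sketch of such a per-tensor clip (an assumed
# definition, not the repo's actual one):
def maxnorm_sketch(g, clip):
    from theano import tensor
    norm = tensor.sqrt((g ** 2).sum())
    return tensor.switch(norm > clip, g * clip / norm, g)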
def main(data_path,save_path,num_epochs=NUM_EPOCHS): print("Preparing Data...") # Training data with open(data_path,'r') as f: X = f.read().splitlines() # Build dictionary chardict, charcount = batched_tweets.build_dictionary(X) batched_tweets.save_dictionary(chardict,charcount,'%s/dict.pkl' % save_path) trainX = batched_tweets.grouper(X) train_iter = batched_tweets.BatchedTweets(trainX, validation_size=N_VAL, batch_size=N_BATCH, maxlen=MAX_LENGTH) # Validation set t_val, tp_val, tn_val = train_iter.validation_set() t_val, t_val_m, tp_val, tp_val_m, tn_val, tn_val_m = batched_tweets.prepare_data(t_val, tp_val, tn_val, chardict, maxlen=MAX_LENGTH) print("Building network...") # params n_char = len(chardict.keys()) + 1 params = init_params(n_chars=n_char) # Tweet variables tweet = T.itensor3() ptweet = T.itensor3() ntweet = T.itensor3() # masks t_mask = T.fmatrix() tp_mask = T.fmatrix() tn_mask = T.fmatrix() # Embeddings emb_t = tweet2vec(tweet, t_mask, params) emb_tp = tweet2vec(ptweet, tp_mask, params) emb_tn = tweet2vec(ntweet, tn_mask, params) # batch loss D1 = 1 - T.batched_dot(emb_t, emb_tp)/(tnorm(emb_t)*tnorm(emb_tp)) D2 = 1 - T.batched_dot(emb_t, emb_tn)/(tnorm(emb_t)*tnorm(emb_tn)) gap = D1-D2+M loss = gap*(gap>0) cost = T.sum(loss) # params and updates print("Computing updates...") updates = lasagne.updates.adagrad(cost, params.values(), LEARNING_RATE) # Theano function print("Compiling theano functions...") inps = [tweet,t_mask,ptweet,tp_mask,ntweet,tn_mask] dist = theano.function(inps,[D1,D2]) l = theano.function(inps,loss) t2v = theano.function(inps,[emb_t,emb_tp,emb_tn]) cost_val = theano.function(inps,cost) train = theano.function(inps,cost,updates=updates) # Training print("Training...") uidx = 0 try: for epoch in range(num_epochs): n_samples = 0 print("Epoch {}".format(epoch)) for x,y,z in train_iter: n_samples +=len(x) uidx += 1 x, x_m, y, y_m, z, z_m = batched_tweets.prepare_data(x, y, z, chardict, maxlen=MAX_LENGTH) if x==None: print("Minibatch with zero samples under maxlength.") uidx -= 1 continue ud_start = time.time() curr_cost = train(x,x_m,y,y_m,z,z_m) ud = time.time() - ud_start if np.isnan(curr_cost) or np.isinf(curr_cost): print("Nan detected.") return if np.mod(uidx, DISPF) == 0: print("Epoch {} Update {} Cost {} Time {}".format(epoch,uidx,curr_cost,ud)) if np.mod(uidx,SAVEF) == 0: print("Saving...") saveparams = OrderedDict() for kk,vv in params.iteritems(): saveparams[kk] = vv.get_value() np.savez('%s/model.npz' % save_path,**saveparams) print("Done.") validation_cost = cost_val(t_val,t_val_m,tp_val,tp_val_m,tn_val,tn_val_m) print("Epoch {} Validation Cost {}".format(epoch, validation_cost)) print("Seen {} samples.".format(n_samples)) except KeyboardInterrupt: pass
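# The batch loss above is a margin ranking ("triplet") loss on cosine
# distances: the tweet is pulled towards its positive pair and pushed away
# from the negative one until the gap exceeds M. A numpy sketch for a single
# triple, assuming tnorm() is the row-wise L2 norm:
def triplet_cosine_loss_sketch(t, tp, tn, margin):
    import numpy as np
    cos = lambda a, b: np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    d_pos = 1.0 - cos(t, tp)  # distance to the positive example
    d_neg = 1.0 - cos(t, tn)  # distance to the negative example
    return max(d_pos - d_neg + margin, 0.0)  # hinge at zero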
def main(train_path,val_path,save_path,num_epochs=NUM_EPOCHS): shutil.copyfile('settings.py','%s/settings.txt'%save_path) print("Preparing Data...") # Training data if not RELOAD_DATA: print("Creating Pairs...") trainX = batched_tweets.create_pairs(train_path) valX = batched_tweets.create_pairs(val_path) print("Number of training pairs = {}".format(len(trainX[0]))) print("Number of validation pairs = {}".format(len(valX[0]))) with open('%s/train_pairs.pkl'%(save_path),'w') as f: pkl.dump(trainX, f) with open('%s/val_pairs.pkl'%(save_path),'w') as f: pkl.dump(valX, f) else: print("Loading Pairs...") with open(train_path,'r') as f: trainX = pkl.load(f) with open(val_path,'r') as f: valX = pkl.load(f) if not RELOAD_MODEL: # Build dictionary chardict, charcount = batched_tweets.build_dictionary(trainX[0] + trainX[1]) n_char = len(chardict.keys()) + 1 batched_tweets.save_dictionary(chardict,charcount,'%s/dict.pkl' % save_path) # params n_char = len(chardict.keys()) + 1 params = init_params(n_chars=n_char) else: print("Loading model params...") params = load_params_shared('%s/model.npz' % save_path) print("Loading dictionary...") with open('%s/dict.pkl' % save_path, 'rb') as f: chardict = pkl.load(f) n_char = len(chardict.keys()) + 1 train_iter = batched_tweets.BatchedTweets(trainX, batch_size=N_BATCH, maxlen=MAX_LENGTH) val_iter = batched_tweets.BatchedTweets(valX, batch_size=512, maxlen=MAX_LENGTH) print("Building network...") # Tweet variables tweet = T.itensor3() ptweet = T.itensor3() ntweet = T.itensor3() # masks t_mask = T.fmatrix() tp_mask = T.fmatrix() tn_mask = T.fmatrix() # Embeddings emb_t, net = tweet2vec(tweet, t_mask, params, n_char) emb_tp, net = tweet2vec(ptweet, tp_mask, params, n_char) emb_tn, net = tweet2vec(ntweet, tn_mask, params, n_char) # batch loss D1 = 1 - T.batched_dot(emb_t, emb_tp)/(tnorm(emb_t)*tnorm(emb_tp)) D2 = 1 - T.batched_dot(emb_t, emb_tn)/(tnorm(emb_t)*tnorm(emb_tn)) gap = D1-D2+M loss = gap*(gap>0) cost = T.mean(loss) cost_only = T.mean(loss) # params and updates print("Computing updates...") lr = LEARNING_RATE mu = MOMENTUM updates = lasagne.updates.nesterov_momentum(cost, lasagne.layers.get_all_params(net, trainable=True), lr, momentum=mu) # Theano function print("Compiling theano functions...") inps = [tweet,t_mask,ptweet,tp_mask,ntweet,tn_mask] #dist = theano.function(inps,[D1,D2]) #l = theano.function(inps,loss) #t2v = theano.function(inps,[emb_t,emb_tp,emb_tn]) cost_val = theano.function(inps,[cost_only, emb_t, emb_tp, emb_tn]) train = theano.function(inps,cost,updates=updates) # Training print("Training...") uidx = 0 try: for epoch in range(num_epochs): n_samples = 0 train_cost = 0. 
print("Epoch {}".format(epoch)) if USE_SCHEDULE: # schedule if epoch > 0 and (epoch+1) % 10 == 0: print("Updating Schedule...") lr = max(1e-5,lr/10) mu = mu - 0.1 updates = lasagne.updates.nesterov_momentum(cost, lasagne.layers.get_all_params(net, trainable=True), lr, momentum=mu) train = theano.function(inps,cost,updates=updates) if epoch >= 10: cost = T.mean(loss) + REGULARIZATION*lasagne.regularization.regularize_network_params(net, lasagne.regularization.l2) reg_only = REGULARIZATION*lasagne.regularization.regularize_network_params(net, lasagne.regularization.l2) reg_val = theano.function([],reg_only) train = theano.function(inps,cost,updates=updates) ud_start = time.time() for x,y,z in train_iter: if not x: print("Minibatch with no valid triples") continue n_samples +=len(x) uidx += 1 if DEBUG and uidx > 3: sys.exit() if DEBUG: print("Tweets = {}".format(x[:5])) x, x_m, y, y_m, z, z_m = batched_tweets.prepare_data(x, y, z, chardict, maxlen=MAX_LENGTH, n_chars=n_char) if x==None: print("Minibatch with zero samples under maxlength.") uidx -= 1 continue if DEBUG: print("Params before update...") print_params(params) display_actv(x,x_m,y,y_m,z,z_m,inps,net,'before') cb, embb, embb_p, embb_n = cost_val(x,x_m,y,y_m,z,z_m) curr_cost = train(x,x_m,y,y_m,z,z_m) ud = time.time() - ud_start train_cost += curr_cost*len(x) if DEBUG: print("Params after update...") print_params(params) display_actv(x,x_m,y,y_m,z,z_m,inps,net,'after') ca, emba, emba_p, emba_n = cost_val(x,x_m,y,y_m,z,z_m) print("Embeddings before = {}".format(embb[:5])) print("Embeddings after = {}".format(emba[:5])) print("Cost before update = {} \nCost after update = {}".format(cb, ca)) if np.isnan(curr_cost) or np.isinf(curr_cost): print("Nan detected.") return if np.mod(uidx, DISPF) == 0: print("Epoch {} Update {} Cost {} Time {} Samples {}".format(epoch,uidx,curr_cost,ud,len(x))) if np.mod(uidx,SAVEF) == 0: print("Saving...") saveparams = OrderedDict() for kk,vv in params.iteritems(): saveparams[kk] = vv.get_value() np.savez('%s/model.npz' % save_path,**saveparams) print("Done.") print("Computing Validation Cost...") validation_cost = 0. 
n_val_samples = 0 for x,y,z in val_iter: if not x: print("Validation: Minibatch with no valid triples") continue n_val_samples += len(x) x, x_m, y, y_m, z, z_m = batched_tweets.prepare_data(x, y, z, chardict, maxlen=MAX_LENGTH, n_chars=n_char) if x==None: print("Validation: Minibatch with zero samples under maxlength") continue curr_cost, _, _, _ = cost_val(x,x_m,y,y_m,z,z_m) validation_cost += curr_cost*len(x) if epoch >= 10: regularization_cost = reg_val() else: regularization_cost = 0 print("Epoch {} Training Cost {} Validation Cost {} Regularization Cost {}".format(epoch, train_cost/n_samples, validation_cost/n_val_samples, regularization_cost)) print("Seen {} samples.".format(n_samples)) for kk,vv in params.iteritems(): print("Param {} Epoch {} Max {} Min {}".format(kk, epoch, np.max(vv.get_value()), np.min(vv.get_value()))) print("Saving...") saveparams = OrderedDict() for kk,vv in params.iteritems(): saveparams[kk] = vv.get_value() np.savez('%s/model_%d.npz' % (save_path,epoch),**saveparams) print("Done.") if False: # store embeddings and data features = np.zeros((len(train_iter.data[0]),3*WDIM)) distances = np.zeros((len(train_iter.data[0]),2)) for idx, triple in enumerate(zip(train_iter.data[0],train_iter.data[1],train_iter.data[2])): x, x_m, y, y_m, z, z_m = batched_tweets.prepare_data([triple[0]], [triple[1]], [triple[2]], chardict, maxlen=MAX_LENGTH, n_chars=n_char) if x==None: continue emb1, emb2, emb3 = t2v(x,x_m,y,y_m,z,z_m) emb1 = np.reshape(emb1, (WDIM)) emb2 = np.reshape(emb2, (WDIM)) emb3 = np.reshape(emb3, (WDIM)) features[idx,:] = np.concatenate((emb1,emb2,emb3),axis=0) distances[idx,0] = 1-np.dot(emb1,emb2)/(np.linalg.norm(emb1)*np.linalg.norm(emb2)) distances[idx,1] = 1-np.dot(emb1,emb3)/(np.linalg.norm(emb1)*np.linalg.norm(emb3)) with open('debug/feat_%d.npy'%epoch,'w') as df: np.save(df,features) with open('debug/dist_%d.npy'%epoch,'w') as ds: np.save(ds,distances) if False: with open('debug/data.txt','w') as dd: for triple in zip(train_iter.data[0],train_iter.data[1],train_iter.data[2]): dd.write('%s\t%s\t%s\n' % (triple[0],triple[1],triple[2])) except KeyboardInterrupt: pass
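# Checkpointing above dumps each shared variable's value into an .npz archive
# via an OrderedDict of numpy arrays. A minimal sketch of the save/restore
# round trip (the restore side is an assumption about what the repo's
# load_params_shared() does):
def save_params_sketch(params, path):
    from collections import OrderedDict
    import numpy as np
    np.savez(path, **OrderedDict((kk, vv.get_value())
                                 for kk, vv in params.iteritems()))

def restore_params_sketch(params, path):
    import numpy as np
    archive = np.load(path)
    for kk, vv in params.iteritems():
        vv.set_value(archive[kk])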
def train_lstm(
        imshape=(3, 224, 224),
        nCategories=51,
        batch=10,
        max_epochs=3,  # The maximum number of epochs to run.
        lrate=[0.001, 0.001, 0.001],
        decay_c=0.,  # Weight decay for the classifier applied to the U weights.
        maxlen=100,  # Maximum length of captions
        optimizer='rmsprop',  # sgd, adadelta and rmsprop available
        saveto='recognition_model',  # The best model will be saved there.
        dispFreq=10000,  # Display to stdout the training progress every N updates.
        saveFreq=20000,  # 9210, # Save the parameters after every saveFreq updates.
        validFreq=140000,  # 2*9210,
        testFreq=5 * 140000,
        margin=0.55,  # Parameter for extra option.
        reload_model=None,  # Path to a saved model we want to start from.
        test_size=-1,  # If >0, we keep only this number of test examples.
        load_from_old=False,
):

    # Model options
    model_options = {}
    model_options['image_shape'] = imshape
    model_options['nCategories'] = nCategories
    model_options['batch'] = batch
    model_options['lrate'] = lrate
    model_options['max_epochs'] = max_epochs
    model_options['optimizer'] = optimizer
    model_options['dispFreq'] = dispFreq
    model_options['validFreq'] = validFreq
    model_options['saveto'] = saveto
    model_options['saveFreq'] = saveFreq

    ########################################################
    # Load training and development sets
    print('\nBuilding Forward Pass')

    # Read from previous training result
    if load_from_old:
        print('Loading Previously Trained Weight')
        start = 14
        with open('../save/weights_14.pkl', 'rb') as W:
            tparams = pickle.load(W)
    else:
        start = 0
        params = init_params(model_options)
        tparams = init_tparams(params)

    # build the graph from the shared tparams; the target tensor y is assumed
    # to be returned by build_model, since f_cost/f_grad below take it as input
    trng, use_noise, x, mask, y, cost, pred = build_model(tparams,
                                                          model_options)
    print('Done')

    print('\nBuilding Backward Pass')
    if decay_c > 0.:
        decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    f_cost = theano.function(inputs=[mask, x, y],
                             outputs=[cost],
                             name='f_cost',
                             allow_input_downcast=True)

    grads = tensor.grad(cost, wrt=itemlist(tparams))
    f_grad = theano.function(inputs=[mask, x, y],
                             outputs=grads,
                             name='f_grad',
                             allow_input_downcast=True)
    print('Done')

    ########################################################
    print('Optimization')
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, x, mask,
                                              cost)

    ########################################################
    '''
def main(train_path,val_path,save_path,num_epochs=NUM_EPOCHS): shutil.copyfile('settings.py','%s/settings.txt'%save_path) print("Preparing Data...") # Training data if not RELOAD_DATA: print("Creating Pairs...") trainX = batched_tweets.create_pairs(train_path) valX = batched_tweets.create_pairs(val_path) print("Number of training pairs = {}".format(len(trainX[0]))) print("Number of validation pairs = {}".format(len(valX[0]))) with open('%s/train_pairs.pkl'%(save_path),'w') as f: pkl.dump(trainX, f) with open('%s/val_pairs.pkl'%(save_path),'w') as f: pkl.dump(valX, f) else: print("Loading Pairs...") with open(train_path,'r') as f: trainX = pkl.load(f) with open(val_path,'r') as f: valX = pkl.load(f) if not RELOAD_MODEL: # Build dictionary chardict, charcount = batched_tweets.build_dictionary(trainX[0] + trainX[1]) n_char = len(chardict.keys()) + 1 batched_tweets.save_dictionary(chardict,charcount,'%s/dict.pkl' % save_path) # params n_char = len(chardict.keys()) + 1 params = init_params(n_chars=n_char) else: print("Loading model params...") params = load_params_shared('%s/model.npz' % save_path) print("Loading dictionary...") with open('%s/dict.pkl' % save_path, 'rb') as f: chardict = pkl.load(f) n_char = len(chardict.keys()) + 1 train_iter = batched_tweets.BatchedTweets(trainX, batch_size=N_BATCH, maxlen=MAX_LENGTH) val_iter = batched_tweets.BatchedTweets(valX, batch_size=512, maxlen=MAX_LENGTH) print("Building network...") # Tweet variables tweet = T.itensor3() ptweet = T.itensor3() ntweet = T.itensor3() # masks t_mask = T.fmatrix() tp_mask = T.fmatrix() tn_mask = T.fmatrix() # Embeddings emb_t, net = tweet2vec(tweet, t_mask, params, n_char) emb_tp, net = tweet2vec(ptweet, tp_mask, params, n_char) emb_tn, net = tweet2vec(ntweet, tn_mask, params, n_char) # batch loss D1 = 1 - T.batched_dot(emb_t, emb_tp)/(tnorm(emb_t)*tnorm(emb_tp)) D2 = 1 - T.batched_dot(emb_t, emb_tn)/(tnorm(emb_t)*tnorm(emb_tn)) gap = D1-D2+M loss = gap*(gap>0) cost = T.mean(loss) + REGULARIZATION*lasagne.regularization.regularize_network_params(net, lasagne.regularization.l2) cost_only = T.mean(loss) reg_only = REGULARIZATION*lasagne.regularization.regularize_network_params(net, lasagne.regularization.l2) # params and updates print("Computing updates...") lr = LEARNING_RATE mu = MOMENTUM updates = lasagne.updates.nesterov_momentum(cost, lasagne.layers.get_all_params(net, trainable=True), lr, momentum=mu) # Theano function print("Compiling theano functions...") inps = [tweet,t_mask,ptweet,tp_mask,ntweet,tn_mask] #dist = theano.function(inps,[D1,D2]) #l = theano.function(inps,loss) #t2v = theano.function(inps,[emb_t,emb_tp,emb_tn]) cost_val = theano.function(inps,[cost_only, emb_t, emb_tp, emb_tn]) reg_val = theano.function([],reg_only) train = theano.function(inps,cost,updates=updates) # Training print("Training...") uidx = 0 try: for epoch in range(num_epochs): n_samples = 0 train_cost = 0. 
print("Epoch {}".format(epoch)) if USE_SCHEDULE: # schedule if epoch > 0 and epoch % 5 == 0: print("Updating Schedule...") lr = max(1e-5,lr/2) mu = mu - 0.05 updates = lasagne.updates.nesterov_momentum(cost, lasagne.layers.get_all_params(net), lr, momentum=mu) train = theano.function(inps,cost,updates=updates) ud_start = time.time() for x,y,z in train_iter: if not x: print("Minibatch with no valid triples") continue n_samples +=len(x) uidx += 1 if DEBUG and uidx > 3: sys.exit() if DEBUG: print("Tweets = {}".format(x[:5])) x, x_m, y, y_m, z, z_m = batched_tweets.prepare_data(x, y, z, chardict, maxlen=MAX_LENGTH, n_chars=n_char) if x==None: print("Minibatch with zero samples under maxlength.") uidx -= 1 continue if DEBUG: print("Params before update...") print_params(params) display_actv(x,x_m,y,y_m,z,z_m,inps,net,'before') cb, embb, embb_p, embb_n = cost_val(x,x_m,y,y_m,z,z_m) curr_cost = train(x,x_m,y,y_m,z,z_m) ud = time.time() - ud_start train_cost += curr_cost*len(x) if DEBUG: print("Params after update...") print_params(params) display_actv(x,x_m,y,y_m,z,z_m,inps,net,'after') ca, emba, emba_p, emba_n = cost_val(x,x_m,y,y_m,z,z_m) print("Embeddings before = {}".format(embb[:5])) print("Embeddings after = {}".format(emba[:5])) print("Cost before update = {} \nCost after update = {}".format(cb, ca)) if np.isnan(curr_cost) or np.isinf(curr_cost): print("Nan detected.") return if np.mod(uidx, DISPF) == 0: print("Epoch {} Update {} Cost {} Time {} Samples {}".format(epoch,uidx,curr_cost,ud,len(x))) if np.mod(uidx,SAVEF) == 0: print("Saving...") saveparams = OrderedDict() for kk,vv in params.iteritems(): saveparams[kk] = vv.get_value() np.savez('%s/model.npz' % save_path,**saveparams) print("Done.") print("Computing Validation Cost...") validation_cost = 0. 
n_val_samples = 0 for x,y,z in val_iter: if not x: print("Validation: Minibatch with no valid triples") continue n_val_samples += len(x) x, x_m, y, y_m, z, z_m = batched_tweets.prepare_data(x, y, z, chardict, maxlen=MAX_LENGTH, n_chars=n_char) if x==None: print("Validation: Minibatch with zero samples under maxlength") continue curr_cost, _, _, _ = cost_val(x,x_m,y,y_m,z,z_m) validation_cost += curr_cost*len(x) regularization_cost = reg_val() print("Epoch {} Training Cost {} Validation Cost {} Regularization Cost {}".format(epoch, train_cost/n_samples, validation_cost/n_val_samples, regularization_cost)) print("Seen {} samples.".format(n_samples)) for kk,vv in params.iteritems(): print("Param {} Epoch {} Max {} Min {}".format(kk, epoch, np.max(vv.get_value()), np.min(vv.get_value()))) print("Saving...") saveparams = OrderedDict() for kk,vv in params.iteritems(): saveparams[kk] = vv.get_value() np.savez('%s/model_%d.npz' % (save_path,epoch),**saveparams) print("Done.") if False: # store embeddings and data features = np.zeros((len(train_iter.data[0]),3*WDIM)) distances = np.zeros((len(train_iter.data[0]),2)) for idx, triple in enumerate(zip(train_iter.data[0],train_iter.data[1],train_iter.data[2])): x, x_m, y, y_m, z, z_m = batched_tweets.prepare_data([triple[0]], [triple[1]], [triple[2]], chardict, maxlen=MAX_LENGTH, n_chars=n_char) if x==None: continue emb1, emb2, emb3 = t2v(x,x_m,y,y_m,z,z_m) emb1 = np.reshape(emb1, (WDIM)) emb2 = np.reshape(emb2, (WDIM)) emb3 = np.reshape(emb3, (WDIM)) features[idx,:] = np.concatenate((emb1,emb2,emb3),axis=0) distances[idx,0] = 1-np.dot(emb1,emb2)/(np.linalg.norm(emb1)*np.linalg.norm(emb2)) distances[idx,1] = 1-np.dot(emb1,emb3)/(np.linalg.norm(emb1)*np.linalg.norm(emb3)) with open('debug/feat_%d.npy'%epoch,'w') as df: np.save(df,features) with open('debug/dist_%d.npy'%epoch,'w') as ds: np.save(ds,distances) if False: with open('debug/data.txt','w') as dd: for triple in zip(train_iter.data[0],train_iter.data[1],train_iter.data[2]): dd.write('%s\t%s\t%s\n' % (triple[0],triple[1],triple[2])) except KeyboardInterrupt: pass
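# The schedule above rebuilds the lasagne updates and recompiles `train` every
# time lr or momentum change. A common alternative (a sketch, not what this
# repo does) keeps them in theano shared variables so the compiled function is
# reused and only set_value() is needed per epoch:
#
#     lr_shared = theano.shared(np.float32(LEARNING_RATE), name='lr')
#     mu_shared = theano.shared(np.float32(MOMENTUM), name='mu')
#     updates = lasagne.updates.nesterov_momentum(
#         cost, lasagne.layers.get_all_params(net, trainable=True),
#         lr_shared, momentum=mu_shared)
#     ...
#     lr_shared.set_value(np.float32(max(1e-5, lr_shared.get_value() / 2)))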
def load_model(path_to_model): """ Load all model components """ print path_to_model # Load model print 'Loading model' with open(path_to_model + '.pkl', 'rb') as f: model = pkl.load(f) options = model['options'] options['use_topic'] = True # Load parameters print 'Loading model parameters...' params = init_params(options) params = load_params(path_to_model + '.npz', params) tparams = init_tparams(params) # Extractor functions print 'Compiling sentence encoder...' [x, x_mask], sentences = build_sentence_encoder(tparams, options) f_senc = theano.function([x, x_mask], sentences, name='f_senc') #print 'Compiling sentence encoder with topics...' #[x, x_mask, topics], sentences = build_sentence_encoder_with_topicvector(tparams, options) #f_senc_t = theano.function([x, x_mask, topics], sentences, name='f_senc_t') print 'Compiling image encoder...' [im], images = build_image_encoder(tparams, options) f_ienc = theano.function([im], images, name='f_ienc') print 'Compiling topic encoder...' [t], topics = build_topic_encoder(tparams, options) f_tenc = theano.function([t], topics, name='f_tenc') ''' print 'Compiling topic_vector1 encoder...' [t], topics = build_topic_vector1_encoder(tparams, options) f_tv1enc = theano.function([t], topics, name='f_tv1enc') print 'Compiling topic_vector2 encoder...' [t], topics = build_topic_encoder(tparams, options) f_tv2enc = theano.function([t], topics, name='f_tv2enc') ''' print 'Compiling error computation...' [s, im], errs = build_errors(options) f_err = theano.function([s, im], errs, name='f_err') ''' [s, im, t], errs_t1 = build_errors_3level(options) f_err_t1 = theano.function([s,im,t], errs_t1, name='f_err_t1') [s_t, im_t], errs_t2 = build_errors_t2(options) f_err_t2 = theano.function([s_t, im_t], errs_t2, name='f_err_t2') ''' # Store everything we need in a dictionary print 'Packing up...' model['f_senc'] = f_senc #model['f_senc_t'] = f_senc_t model['f_ienc'] = f_ienc model['f_tenc'] = f_tenc #model['f_tv1enc'] = f_tv1enc #model['f_tv2enc'] = f_tv2enc model['f_err'] = f_err #model['f_err_t1'] = f_err_t1 #model['f_err_t2'] = f_err_t2 return model
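# A hypothetical usage sketch for the dictionary returned by load_model(): the
# packed encoders are ordinary theano functions, so they take numpy arrays.
# The shapes and the 'dim_image' option name below are assumptions for
# illustration only:
#
#     model = load_model('snapshots/mymodel')  # hypothetical path
#     ims = numpy.random.randn(5, model['options']['dim_image']).astype('float32')
#     im_emb = model['f_ienc'](ims)            # embed five images
#     errs = model['f_err'](sent_emb, im_emb)  # pairwise sentence-image errors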
def trainer(load_from=None, save_dir='snapshots', name='anon', **kwargs): """ :param load_from: location to load parameters + options from :param name: name of model, used as location to save parameters + options """ curr_model = dict() # load old model, including parameters, but overwrite with new options if load_from: print 'reloading...' + load_from with open('%s.pkl' % load_from, 'rb') as f: curr_model = pkl.load(f) else: curr_model['options'] = {} for k, v in kwargs.iteritems(): curr_model['options'][k] = v model_options = curr_model['options'] # initialize logger import datetime timestampedName = datetime.datetime.now().strftime( '%Y_%m_%d_%H_%M_%S') + '_' + name from logger import Log log = Log(name=timestampedName, hyperparams=model_options, saveDir='vis/training', xLabel='Examples Seen', saveFrequency=1) print curr_model['options'] # Load training and development sets print 'Loading dataset' dataset = load_dataset(model_options['data'], cnn=model_options['cnn'], load_train=True) train = dataset['train'] dev = dataset['dev'] # Create dictionary print 'Creating dictionary' worddict = build_dictionary(train['caps'] + dev['caps']) print 'Dictionary size: ' + str(len(worddict)) curr_model['worddict'] = worddict curr_model['options']['n_words'] = len(worddict) + 2 # save model pkl.dump(curr_model, open('%s/%s.pkl' % (save_dir, name), 'wb')) print 'Loading data' train_iter = datasource.Datasource(train, batch_size=model_options['batch_size'], worddict=worddict) dev = datasource.Datasource(dev, worddict=worddict) dev_caps, dev_ims = dev.all() print 'Building model' params = init_params(model_options) # reload parameters if load_from is not None and os.path.exists(load_from): params = load_params(load_from, params) tparams = init_tparams(params) inps, cost = build_model(tparams, model_options) print 'Building sentence encoder' inps_se, sentences = build_sentence_encoder(tparams, model_options) f_senc = theano.function(inps_se, sentences, profile=False) print 'Building image encoder' inps_ie, images = build_image_encoder(tparams, model_options) f_ienc = theano.function(inps_ie, images, profile=False) print 'Building f_grad...', grads = tensor.grad(cost, wrt=itemlist(tparams)) print 'Building errors..' inps_err, errs = build_errors(model_options) f_err = theano.function(inps_err, errs, profile=False) curr_model['f_senc'] = f_senc curr_model['f_ienc'] = f_ienc curr_model['f_err'] = f_err if model_options['grad_clip'] > 0.: grads = [maxnorm(g, model_options['grad_clip']) for g in grads] lr = tensor.scalar(name='lr') print 'Building optimizers...', # (compute gradients), (updates parameters) f_grad_shared, f_update = eval(model_options['optimizer'])(lr, tparams, grads, inps, cost) print 'Optimization' uidx = 0 curr = 0 n_samples = 0 for eidx in xrange(model_options['max_epochs']): print 'Epoch ', eidx for x, mask, im in train_iter: n_samples += x.shape[1] uidx += 1 # Update ud_start = time.time() cost = f_grad_shared(x, mask, im) f_update(model_options['lrate']) ud = time.time() - ud_start if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return 1., 1., 1. if numpy.mod(uidx, model_options['dispFreq']) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud log.update({'Error': float(cost)}, n_samples) if numpy.mod(uidx, model_options['validFreq']) == 0: print 'Computing results...' 
                # encode sentences efficiently
                dev_s = encode_sentences(
                    curr_model, dev_caps, batch_size=model_options['batch_size'])
                dev_i = encode_images(curr_model, dev_ims)

                # compute errors
                dev_errs = compute_errors(curr_model, dev_s, dev_i)

                # compute ranking error
                (r1, r5, r10, medr, meanr), vis_details = t2i(
                    dev_errs, vis_details=True)
                (r1i, r5i, r10i, medri, meanri) = i2t(dev_errs)
                print "Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" % (
                    r1, r5, r10, medr, meanr)
                log.update(
                    {
                        'R@1': r1,
                        'R@5': r5,
                        'R@10': r10,
                        'median_rank': medr,
                        'mean_rank': meanr
                    }, n_samples)
                print "Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % (
                    r1i, r5i, r10i, medri, meanri)
                log.update(
                    {
                        'Image2Caption_R@1': r1i,
                        'Image2Caption_R@5': r5i,
                        'Image2Caption_R@10': r10i,
                        'Image2Caption_median_rank': medri,
                        'Image2Caption_mean_rank': meanri
                    }, n_samples)

                tot = r1 + r5 + r10
                if tot > curr:
                    curr = tot
                    # Save parameters
                    print 'Saving...',
                    numpy.savez('%s/%s' % (save_dir, name), **unzip(tparams))
                    print 'Done'
                    vis_details['hyperparams'] = model_options
                    # Save visualization details
                    with open(
                            'vis/roc/%s/%s.json' %
                            (model_options['data'], timestampedName),
                            'w') as f:
                        json.dump(vis_details, f)
                    # Add the new model to the index
                    try:
                        index = json.load(open('vis/roc/index.json', 'r'))
                    except IOError:
                        index = {model_options['data']: []}
                    models = index[model_options['data']]
                    if timestampedName not in models:
                        models.append(timestampedName)
                    with open('vis/roc/index.json', 'w') as f:
                        json.dump(index, f)

        print 'Seen %d samples' % n_samples
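# Several trainers here resolve the optimizer with eval(optimizer_name), which
# silently assumes a function of that exact name is in scope. An equivalent,
# slightly safer pattern (a sketch; adam/rmsprop/sgd stand for the repo's own
# optimizer functions) makes the supported choices explicit:
#
#     OPTIMIZERS = {'adam': adam, 'rmsprop': rmsprop, 'sgd': sgd}
#     try:
#         opt = OPTIMIZERS[model_options['optimizer']]
#     except KeyError:
#         raise ValueError('Unknown optimizer: %s' % model_options['optimizer'])
#     f_grad_shared, f_update = opt(lr, tparams, grads, inps, cost)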
def train(
        dim_word_desc=400,  # word vector dimensionality
        dim_word_q=400,
        dim_word_ans=600,
        dim_proj=300,
        dim=400,  # the number of LSTM units
        encoder_desc='lstm',
        encoder_desc_word='lstm',
        encoder_desc_sent='lstm',
        use_dq_sims=False,
        eyem=None,
        learn_h0=False,
        use_desc_skip_c_g=False,
        debug=False,
        encoder_q='lstm',
        patience=10,
        max_epochs=5000,
        dispFreq=100,
        decay_c=0.,
        alpha_c=0.,
        clip_c=-1.,
        lrate=0.01,
        n_words_q=49145,
        n_words_desc=115425,
        n_words_ans=409,
        pkl_train_files=None,
        pkl_valid_files=None,
        maxlen=2000,  # maximum length of the description
        optimizer='rmsprop',
        batch_size=2,
        vocab=None,
        valid_batch_size=16,
        use_elu_g=False,
        saveto='model.npz',
        model_dir=None,
        ms_nlayers=3,
        validFreq=1000,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        datasets=[None],
        truncate=400,
        momentum=0.9,
        use_bidir=False,
        cost_mask=None,
        valid_datasets=[
            '/u/yyu/stor/caglar/rc-data/cnn/cnn_test_data.h5',
            '/u/yyu/stor/caglar/rc-data/cnn/cnn_valid_data.h5'
        ],
        dropout_rate=0.5,
        use_dropout=True,
        reload_=True,
        **opt_ds):

    ensure_dir_exists(model_dir)
    mpath = os.path.join(model_dir, saveto)
    mpath_best = os.path.join(model_dir, prfx("best", saveto))
    mpath_last = os.path.join(model_dir, prfx("last", saveto))
    mpath_stats = os.path.join(model_dir, prfx("stats", saveto))

    # Model options
    model_options = locals().copy()
    model_options['use_sent_reps'] = opt_ds['use_sent_reps']
    stats = defaultdict(list)

    del model_options['eyem']
    del model_options['cost_mask']

    if cost_mask is not None:
        cost_mask = sharedX(cost_mask)

    # reload options and parameters
    if reload_:
        print "Reloading the model."
        if os.path.exists(mpath_best):
            print "Reloading the best model from %s." % mpath_best
            with open('%s.pkl' % mpath_best, 'rb') as f:
                model_options = pkl.load(f)
            params = init_params(model_options)
            params = load_params(mpath_best, params)
        elif os.path.exists(mpath):
            print "Reloading the model from %s." % mpath
            with open('%s.pkl' % mpath, 'rb') as f:
                model_options = pkl.load(f)
            params = init_params(model_options)
            params = load_params(mpath, params)
        else:
            raise IOError("Couldn't open the file.")
    else:
        print "Not reloading; initializing the model from scratch."
params = init_params(model_options) if datasets[0]: print "Short dataset", datasets[0] print 'Loading data' print 'Building model' if pkl_train_files is None or pkl_valid_files is None: train, valid, test = load_data(path=datasets[0], valid_path=valid_datasets[0], test_path=valid_datasets[1], batch_size=batch_size, **opt_ds) else: train, valid, test = load_pkl_data(train_file_paths=pkl_train_files, valid_file_paths=pkl_valid_files, batch_size=batch_size, vocab=vocab, eyem=eyem, **opt_ds) tparams = init_tparams(params) trng, use_noise, inps_d, \ opt_ret, \ cost, errors, ent_errors, ent_derrors, probs = \ build_model(tparams, model_options, prepare_data if not opt_ds['use_sent_reps'] \ else prepare_data_sents, valid, cost_mask=cost_mask) alphas = opt_ret['dec_alphas'] if opt_ds['use_sent_reps']: inps = [inps_d["desc"], \ inps_d["word_mask"], \ inps_d["q"], \ inps_d['q_mask'], \ inps_d['ans'], \ inps_d['wlen'], inps_d['slen'], inps_d['qlen'],\ inps_d['ent_mask'] ] else: inps = [inps_d["desc"], \ inps_d["word_mask"], \ inps_d["q"], \ inps_d['q_mask'], \ inps_d['ans'], \ inps_d['wlen'], \ inps_d['qlen'], \ inps_d['ent_mask']] outs = [cost, errors, probs, alphas] if ent_errors: outs += [ent_errors] if ent_derrors: outs += [ent_derrors] # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, outs, profile=profile) print 'Done' # Apply weight decay on the feed-forward connections if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): if "logit" in kk or "ff" in kk: weight_decay += (vv**2).sum() weight_decay *= decay_c cost += weight_decay # after any regularizer print 'Computing gradient...', grads = safe_grad(cost, itemlist(tparams)) print 'Done' # Gradient clipping: if clip_c > 0.: g2 = get_norms(grads) for p, g in grads.iteritems(): grads[p] = tensor.switch(g2 > (clip_c**2), (g / tensor.sqrt(g2 + 1e-8)) * clip_c, g) inps.pop() if optimizer.lower() == "adasecant": learning_rule = Adasecant(delta_clip=25.0, use_adagrad=True, grad_clip=0.25, gamma_clip=0.) elif optimizer.lower() == "rmsprop": learning_rule = RMSPropMomentum(init_momentum=momentum) elif optimizer.lower() == "adam": learning_rule = Adam() elif optimizer.lower() == "adadelta": learning_rule = AdaDelta() lr = tensor.scalar(name='lr') print 'Building optimizers...', learning_rule = None if learning_rule: f_grad_shared, f_update = learning_rule.get_funcs(learning_rate=lr, grads=grads, inp=inps, cost=cost, errors=errors) else: f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost, errors) print 'Done' print 'Optimization' history_errs = [] # reload history if reload_ and os.path.exists(mpath): history_errs = list(numpy.load(mpath)['history_errs']) best_p = None bad_count = 0 if validFreq == -1: validFreq = len(train[0]) / batch_size if saveFreq == -1: saveFreq = len(train[0]) / batch_size best_found = False uidx = 0 estop = False train_cost_ave, train_err_ave, \ train_gnorm_ave = reset_train_vals() for eidx in xrange(max_epochs): n_samples = 0 if train.done: train.reset() for d_, q_, a, em in train: n_samples += len(a) uidx += 1 use_noise.set_value(1.) if opt_ds['use_sent_reps']: # To mask the description and the question. 
                d, d_mask, q, q_mask, dlen, slen, qlen = prepare_data_sents(
                    d_, q_)
                if d is None:
                    print 'Minibatch with zero sample under length ', maxlen
                    uidx -= 1
                    continue

                ud_start = time.time()
                cost, errors, gnorm, pnorm = f_grad_shared(
                    d, d_mask, q, q_mask, a, dlen, slen, qlen)
            else:
                d, d_mask, q, q_mask, dlen, qlen = prepare_data(d_, q_)
                if d is None:
                    print 'Minibatch with zero sample under length ', maxlen
                    uidx -= 1
                    continue

                ud_start = time.time()
                cost, errors, gnorm, pnorm = f_grad_shared(
                    d, d_mask, q, q_mask, a, dlen, qlen)

            upnorm = f_update(lrate)
            ud = time.time() - ud_start

            # Collect the running ave train stats.
            train_cost_ave = running_ave(train_cost_ave, cost)
            train_err_ave = running_ave(train_err_ave, errors)
            train_gnorm_ave = running_ave(train_gnorm_ave, gnorm)

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                import ipdb
                ipdb.set_trace()

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, ' Update ', uidx, \
                    ' Cost ', cost, ' UD ', ud, \
                    ' UpNorm ', upnorm[0].tolist(), \
                    ' GNorm ', gnorm, \
                    ' Pnorm ', pnorm, 'Terrors ', errors

            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                if best_p is not None and best_found:
                    numpy.savez(mpath_best, history_errs=history_errs,
                                **best_p)
                    pkl.dump(model_options, open('%s.pkl' % mpath_best, 'wb'))
                else:
                    params = unzip(tparams)
                    numpy.savez(mpath, history_errs=history_errs, **params)
                    pkl.dump(model_options, open('%s.pkl' % mpath, 'wb'))

                pkl.dump(stats, open("%s.pkl" % mpath_stats, 'wb'))
                print 'Done'
                print_param_norms(tparams)

            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                if valid.done:
                    valid.reset()

                valid_costs, valid_errs, valid_probs, \
                    valid_alphas, error_ent, error_dent = eval_model(
                        f_log_probs,
                        prepare_data if not opt_ds['use_sent_reps']
                        else prepare_data_sents,
                        model_options,
                        valid,
                        use_sent_rep=opt_ds['use_sent_reps'])

                valid_alphas_ = numpy.concatenate(
                    [va.argmax(0) for va in valid_alphas.tolist()], axis=0)
                valid_err = valid_errs.mean()
                valid_cost = valid_costs.mean()
                valid_alpha_ent = -negentropy(valid_alphas)

                mean_valid_alphas = valid_alphas_.mean()
                std_valid_alphas = valid_alphas_.std()

                mean_valid_probs = valid_probs.argmax(1).mean()
                std_valid_probs = valid_probs.argmax(1).std()

                history_errs.append([valid_cost, valid_err])

                stats['train_err_ave'].append(train_err_ave)
                stats['train_cost_ave'].append(train_cost_ave)
                stats['train_gnorm_ave'].append(train_gnorm_ave)

                stats['valid_errs'].append(valid_err)
                stats['valid_costs'].append(valid_cost)
                stats['valid_err_ent'].append(error_ent)
                stats['valid_err_desc_ent'].append(error_dent)

                stats['valid_alphas_mean'].append(mean_valid_alphas)
                stats['valid_alphas_std'].append(std_valid_alphas)
                stats['valid_alphas_ent'].append(valid_alpha_ent)

                stats['valid_probs_mean'].append(mean_valid_probs)
                stats['valid_probs_std'].append(std_valid_probs)

                if uidx == 0 or valid_err <= numpy.array(
                        history_errs)[:, 1].min():
                    best_p = unzip(tparams)
                    bad_counter = 0
                    best_found = True
                else:
                    best_found = False

                if numpy.isnan(valid_err):
                    import ipdb
                    ipdb.set_trace()

                print "============================"
                print '\t>>>Valid error: ', valid_err, \
                    ' Valid cost: ', valid_cost
                print '\t>>>Valid pred mean: ', mean_valid_probs, \
                    ' Valid pred std: ', std_valid_probs
                print '\t>>>Valid alphas mean: ', mean_valid_alphas, \
                    ' Valid alphas std: ', std_valid_alphas, \
                    ' Valid alpha negent: ', valid_alpha_ent, \
                    ' Valid error ent: ', error_ent, \
                    ' Valid error desc ent: ', error_dent
                print "============================"
                print "Running average train stats "
                print '\t>>>Train error: ', train_err_ave, \
                    ' Train cost: ', train_cost_ave, \
                    ' Train grad norm: ', train_gnorm_ave
                print "============================"

                train_cost_ave, train_err_ave, \
                    train_gnorm_ave = reset_train_vals()

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid.reset()
    valid_cost, valid_error, valid_probs, \
        valid_alphas, error_ent = eval_model(
            f_log_probs,
            prepare_data if not opt_ds['use_sent_reps']
            else prepare_data_sents,
            model_options,
            valid,
            use_sent_rep=opt_ds['use_sent_reps'])

    print " Final eval results: "
    print 'Valid error: ', valid_error.mean()
    print 'Valid cost: ', valid_cost.mean()
    print '\t>>>Valid pred mean: ', valid_probs.mean(), \
        ' Valid pred std: ', valid_probs.std(), \
        ' Valid error ent: ', error_ent

    params = copy.copy(best_p)
    numpy.savez(mpath_last, zipped_params=best_p,
                history_errs=history_errs, **params)

    return valid_error.mean(), valid_cost.mean()
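# running_ave() and reset_train_vals() are repo helpers. A plausible sketch of
# the pattern (an assumption about their behaviour, not the actual code): an
# exponential moving average plus a reset to zeros at each report.
def running_ave_sketch(ave, new_val, decay=0.99):
    if ave == 0:
        return new_val
    return decay * ave + (1.0 - decay) * new_val

def reset_train_vals_sketch():
    return 0., 0., 0.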
def trainer(data='coco',  # f8k, f30k, coco
            margin=0.2,
            dim=1024,
            dim_image=4096,
            dim_word=300,
            encoder='gru',  # gru OR bow
            max_epochs=15,
            dispFreq=10,
            decay_c=0.,
            grad_clip=2.,
            maxlen_w=100,
            optimizer='adam',
            batch_size=128,
            saveto='/ais/gobi3/u/rkiros/uvsmodels/coco.npz',
            validFreq=100,
            lrate=0.0002,
            reload_=False):

    # Model options
    model_options = {}
    model_options['data'] = data
    model_options['margin'] = margin
    model_options['dim'] = dim
    model_options['dim_image'] = dim_image
    model_options['dim_word'] = dim_word
    model_options['encoder'] = encoder
    model_options['max_epochs'] = max_epochs
    model_options['dispFreq'] = dispFreq
    model_options['decay_c'] = decay_c
    model_options['grad_clip'] = grad_clip
    model_options['maxlen_w'] = maxlen_w
    model_options['optimizer'] = optimizer
    model_options['batch_size'] = batch_size
    model_options['saveto'] = saveto
    model_options['validFreq'] = validFreq
    model_options['lrate'] = lrate
    model_options['reload_'] = reload_

    print model_options

    # reload options
    if reload_ and os.path.exists(saveto):
        print 'reloading...' + saveto
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    # Load training and development sets
    print 'Loading dataset'
    train, dev = load_dataset(data)[:2]

    # Create and save dictionary
    print 'Creating dictionary'
    worddict = build_dictionary(train[0] + dev[0])[0]
    n_words = len(worddict)
    model_options['n_words'] = n_words
    print 'Dictionary size: ' + str(n_words)
    with open('%s.dictionary.pkl' % saveto, 'wb') as f:
        pkl.dump(worddict, f)

    # Inverse dictionary
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    trng, inps, cost = build_model(tparams, model_options)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=False)
    print 'Done'

    # weight decay, if applicable
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=False)
    print 'Done'

    print 'Building sentence encoder'
    trng, inps_se, sentences = build_sentence_encoder(tparams, model_options)
    f_senc = theano.function(inps_se, sentences, profile=False)

    print 'Building image encoder'
    trng, inps_ie, images = build_image_encoder(tparams, model_options)
    f_ienc = theano.function(inps_ie, images, profile=False)

    print 'Building f_grad...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    f_grad_norm = theano.function(inps, [(g**2).sum() for g in grads],
                                  profile=False)
    f_weight_norm = theano.function([], [(t**2).sum()
                                         for k, t in tparams.iteritems()],
                                    profile=False)

    # gradient clipping by global norm
    if grad_clip > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (grad_clip**2),
                                           g / tensor.sqrt(g2) * grad_clip,
                                           g))
        grads = new_grads

    lr = tensor.scalar(name='lr')

    print 'Building optimizers...',
    # (compute gradients), (updates parameters)
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)

    print 'Optimization'

    # Each sentence in the minibatch has the same length (for the encoder)
    train_iter = homogeneous_data.HomogeneousData([train[0], train[1]],
                                                  batch_size=batch_size,
                                                  maxlen=maxlen_w)

    uidx = 0
    curr = 0.
n_samples = 0 for eidx in xrange(max_epochs): print 'Epoch ', eidx for x, im in train_iter: n_samples += len(x) uidx += 1 x, mask, im = homogeneous_data.prepare_data(x, im, worddict, maxlen=maxlen_w, n_words=n_words) if x == None: print 'Minibatch with zero sample under length ', maxlen_w uidx -= 1 continue # Update ud_start = time.time() cost = f_grad_shared(x, mask, im) f_update(lrate) ud = time.time() - ud_start if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return 1., 1., 1. if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud if numpy.mod(uidx, validFreq) == 0: print 'Computing results...' curr_model = {} curr_model['options'] = model_options curr_model['worddict'] = worddict curr_model['word_idict'] = word_idict curr_model['f_senc'] = f_senc curr_model['f_ienc'] = f_ienc ls = encode_sentences(curr_model, dev[0]) lim = encode_images(curr_model, dev[1]) (r1, r5, r10, medr) = i2t(lim, ls) print "Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr) (r1i, r5i, r10i, medri) = t2i(lim, ls) print "Text to image: %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri) currscore = r1 + r5 + r10 + r1i + r5i + r10i if currscore > curr: curr = currscore # Save model print 'Saving...', params = unzip(tparams) numpy.savez(saveto, **params) pkl.dump(model_options, open('%s.pkl'%saveto, 'wb')) print 'Done' print 'Seen %d samples'%n_samples
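# homogeneous_data.prepare_data() turns a list of word-index sequences into a
# padded (n_timesteps, n_samples) integer matrix plus a float mask that zeroes
# out the padding. A minimal numpy sketch of that contract (the real version
# also maps words through worddict, appends <eos> and filters by maxlen):
def pad_batch_sketch(seqs):
    import numpy
    lengths = [len(s) for s in seqs]
    x = numpy.zeros((max(lengths), len(seqs))).astype('int64')
    mask = numpy.zeros_like(x).astype('float32')
    for j, s in enumerate(seqs):
        x[:lengths[j], j] = s
        mask[:lengths[j], j] = 1.
    return x, mask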
def trainer(X,
            dim_word=620,  # word vector dimensionality
            dim=2400,  # the number of GRU units
            encoder='gru',
            decoder='gru',
            max_epochs=5,
            dispFreq=1,
            decay_c=0.,
            grad_clip=5.,
            n_words=20000,
            maxlen_w=30,
            optimizer='adam',
            batch_size=64,
            saveto='/u/rkiros/research/semhash/models/toy.npz',
            dictionary='/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl',
            saveFreq=1000,
            reload_=False):

    # Model options
    model_options = {}
    model_options['dim_word'] = dim_word
    model_options['dim'] = dim
    model_options['encoder'] = encoder
    model_options['decoder'] = decoder
    model_options['max_epochs'] = max_epochs
    model_options['dispFreq'] = dispFreq
    model_options['decay_c'] = decay_c
    model_options['grad_clip'] = grad_clip
    model_options['n_words'] = n_words
    model_options['maxlen_w'] = maxlen_w
    model_options['optimizer'] = optimizer
    model_options['batch_size'] = batch_size
    model_options['saveto'] = saveto
    model_options['dictionary'] = dictionary
    model_options['saveFreq'] = saveFreq
    model_options['reload_'] = reload_

    print model_options

    # reload options
    # TODO: if loading old parameters you need to make sure you are using them
    # in the rest of the code
    # if reload_ and os.path.exists(saveto):
    #     print 'reloading...' + saveto
    #     with open('%s.pkl'%saveto, 'rb') as f:
    #         model_options = pkl.load(f)

    # load dictionary
    print 'Loading dictionary...'
    worddict = load_dictionary(dictionary)

    # Inverse dictionary
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    trng, x, x_mask, y, y_mask, z, z_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask, z, z_mask]

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=False)
    print 'Done'

    # weight decay, if applicable
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=False)
    print 'Done'

    print 'Building f_grad...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    f_grad_norm = theano.function(inps, [(g**2).sum() for g in grads],
                                  profile=False)
    f_weight_norm = theano.function([], [(t**2).sum()
                                         for k, t in tparams.iteritems()],
                                    profile=False)

    # gradient clipping by global norm
    if grad_clip > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g ** 2).sum()
        new_grads = []
        for g in grads:
            # rescale the whole gradient when its global norm exceeds the threshold
            new_grads.append(tensor.switch(g2 > (grad_clip ** 2),
                                           g / tensor.sqrt(g2) * grad_clip,
                                           g))
        grads = new_grads

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    # (compute gradients), (updates parameters)
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)

    print 'Optimization'

    # Each sentence in the minibatch has the same length (for the encoder)
    trainX = homogeneous_data.grouper(X)
    train_iter = homogeneous_data.HomogeneousData(trainX, batch_size=batch_size, maxlen=maxlen_w)

    uidx = 0
    lrate = 0.01
    for eidx in xrange(max_epochs):
        n_samples = 0

        print 'Epoch ', eidx

        for x, y, z in train_iter:
            n_samples += len(x)
            uidx += 1

            x, x_mask, y, y_mask, z, z_mask = homogeneous_data.prepare_data(
                x, y, z, worddict, maxlen=maxlen_w, n_words=n_words)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen_w
                uidx -= 1
                continue

            ud_start = time.time()
            cost = f_grad_shared(x, x_mask, y, y_mask, z, z_mask)
            f_update(lrate)
            ud = time.time() - ud_start

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                params = unzip(tparams)
                numpy.savez(saveto, history_errs=[], **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

        print 'Seen %d samples' % n_samples
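# The grad_clip branch above is clipping by global norm: the full gradient
# is rescaled only when its overall L2 norm exceeds the threshold, which
# preserves the update direction (unlike element-wise clipping). The same
# rule in plain numpy, outside the Theano graph, as a sketch:
import numpy

def clip_by_global_norm(grads, threshold):
    g2 = sum((g ** 2).sum() for g in grads)  # squared global norm
    if g2 > threshold ** 2:
        scale = threshold / numpy.sqrt(g2)
        grads = [g * scale for g in grads]
    return grads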
def trainer(X,
            dim_word=620,  # word vector dimensionality
            dim=2400,  # the number of GRU units
            encoder='gru',
            decoder='gru',
            max_epochs=5,
            dispFreq=1,
            decay_c=0.,
            grad_clip=5.,
            n_words=474000,
            maxlen_w=30,
            optimizer='adam',
            batch_size=64,
            saveto='/data/embeddingModel.npz',
            dictionary='dictionary.pkl',
            saveFreq=1000,
            reload_=False):

    # Model options
    model_options = {}
    model_options['dim_word'] = dim_word
    model_options['dim'] = dim
    model_options['encoder'] = encoder
    model_options['decoder'] = decoder
    model_options['max_epochs'] = max_epochs
    model_options['dispFreq'] = dispFreq
    model_options['decay_c'] = decay_c
    model_options['grad_clip'] = grad_clip
    model_options['n_words'] = n_words
    model_options['maxlen_w'] = maxlen_w
    model_options['optimizer'] = optimizer
    model_options['batch_size'] = batch_size
    model_options['saveto'] = saveto
    model_options['dictionary'] = dictionary
    model_options['saveFreq'] = saveFreq
    model_options['reload_'] = reload_

    print(model_options)

    # reload options
    if reload_ and os.path.exists(saveto):
        print('reloading...' + saveto)
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    # load dictionary
    print('Loading dictionary...')
    worddict = load_dictionary(dictionary)

    # Inverse dictionary
    word_idict = dict()
    for kk, vv in worddict.items():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    print('Building model')
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    trng, x, x_mask, y, y_mask, z, z_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask, z, z_mask]

    # before any regularizer
    print('Building f_log_probs...')
    f_log_probs = theano.function(inps, cost, profile=False)
    print('Done')

    # weight decay, if applicable
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.items():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer
    print('Building f_cost...')
    f_cost = theano.function(inps, cost, profile=False)
    print('Done')

    print('Building f_grad...')
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    f_grad_norm = theano.function(inps, [(g ** 2).sum() for g in grads], profile=False)
    f_weight_norm = theano.function([], [(t ** 2).sum() for k, t in tparams.items()], profile=False)

    if grad_clip > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g ** 2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (grad_clip ** 2),
                              g / tensor.sqrt(g2) * grad_clip,
                              g))
        grads = new_grads

    lr = tensor.scalar(name='lr')
    print('Building optimizers...', end='')
    # (compute gradients), (updates parameters)
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)

    print('Optimization')

    # Each sentence in the minibatch has the same length (for the encoder)
    trainX = homogeneous_data.grouper(X)
    train_iter = homogeneous_data.HomogeneousData(trainX,
                                                  batch_size=batch_size,
                                                  maxlen=maxlen_w)

    uidx = 0
    lrate = 0.01
    for eidx in range(max_epochs):
        n_samples = 0

        print('Epoch ', eidx)

        for x, y, z in train_iter:
            n_samples += len(x)
            uidx += 1

            x, x_mask, y, y_mask, z, z_mask = homogeneous_data.prepare_data(
                x, y, z, worddict, maxlen=maxlen_w, n_words=n_words)

            if x is None:
                print('Minibatch with zero sample under length ', maxlen_w)
                uidx -= 1
                continue

            ud_start = time.time()
            cost = f_grad_shared(x, x_mask, y, y_mask, z, z_mask)
            f_update(lrate)
            ud = time.time() - ud_start

            if numpy.isnan(cost) or numpy.isinf(cost):
                print('NaN detected')
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print('Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud)

            if numpy.mod(uidx, saveFreq) == 0:
                print('Saving...', end='')
                params = unzip(tparams)
                numpy.savez(saveto, history_errs=[], **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print('Done')

        print('Seen %d samples' % n_samples)
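# A hedged usage sketch for the trainer above. It assumes X is a list of
# tokenized training sentences and that the dictionary pickle referenced
# by `dictionary` was built beforehand (e.g. by a companion vocab script);
# the corpus and both paths below are placeholders, not real data.
if __name__ == '__main__':
    X = ['the quick brown fox', 'jumped over the lazy dog', 'a third sentence']
    trainer(X,
            max_epochs=1,
            batch_size=2,
            saveto='/tmp/toy_model.npz',
            dictionary='/tmp/toy_dictionary.pkl')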
def train(dim_word=100,  # word vector dimensionality
          dim=1000,  # the number of LSTM units
          encoder='gru',
          decoder='gru_cond',
          patience=10,
          max_epochs=5000,
          dispFreq=100,
          decay_c=0.,
          alpha_c=0.,
          diag_c=0.,
          clip_c=-1.,
          lrate=0.01,
          n_words_src=100000,
          n_words=100000,
          maxlen=100,  # maximum length of the description
          optimizer='rmsprop',
          batch_size=16,
          valid_batch_size=16,
          saveto='model.npz',
          validFreq=1000,
          saveFreq=1000,  # save the parameters after every saveFreq updates
          sampleFreq=100,  # generate some samples after every sampleFreq updates
          datasets=['/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
                    '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'],
          valid_datasets=['../data/dev/newstest2011.en.tok',
                          '../data/dev/newstest2011.fr.tok'],
          dictionaries=['/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
                        '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'],
          use_dropout=False,
          reload_=False):

    # Model options
    model_options = locals().copy()

    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    for ii, dd in enumerate(dictionaries):
        with open(dd, 'rb') as f:
            worddicts[ii] = pkl.load(f)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    # reload options
    if reload_ and os.path.exists(saveto):
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print 'Loading data'
    train = TextIterator(datasets[0], datasets[1],
                         dictionaries[0], dictionaries[1],
                         n_words_source=n_words_src, n_words_target=n_words,
                         batch_size=batch_size, maxlen=maxlen)
    valid = TextIterator(valid_datasets[0], valid_datasets[1],
                         dictionaries[0], dictionaries[1],
                         n_words_source=n_words_src, n_words_target=n_words,
                         batch_size=valid_batch_size, maxlen=maxlen)

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, trng)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    print 'Done'

    cost = cost.mean()

    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0)) ** 2).sum(1).mean()
        cost += alpha_reg

    # after any regularizer
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    print 'Building f_grad...',
    f_grad = theano.function(inps, grads, profile=profile)
    print 'Done'

    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g ** 2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (clip_c ** 2),
                                           g / tensor.sqrt(g2) * clip_c,
                                           g))
        grads = new_grads

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)
    print 'Done'

    print 'Optimization'

    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = list(numpy.load(saveto)['history_errs'])
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0]) / batch_size

    uidx = 0
    estop = False
    for eidx in xrange(max_epochs):
        n_samples = 0

        for x, y in train:
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            x, x_mask, y, y_mask = prepare_data(x, y, maxlen=maxlen,
                                                n_words_src=n_words_src,
                                                n_words=n_words)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            ud_start = time.time()
            cost = f_grad_shared(x, x_mask, y, y_mask)
            f_update(lrate)
            ud = time.time() - ud_start

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

            if numpy.mod(uidx, sampleFreq) == 0:
                # FIXME: random selection?
                for jj in xrange(numpy.minimum(5, x.shape[1])):
                    stochastic = True
                    sample, score = gen_sample(tparams, f_init, f_next,
                                               x[:, jj][:, None],
                                               model_options, trng=trng, k=1,
                                               maxlen=30,
                                               stochastic=stochastic,
                                               argmax=False)
                    print 'Source ', jj, ': ',
                    for vv in x[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[0]:
                            print worddicts_r[0][vv],
                        else:
                            print 'UNK',
                    print
                    print 'Truth ', jj, ' : ',
                    for vv in y[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[1]:
                            print worddicts_r[1][vv],
                        else:
                            print 'UNK',
                    print
                    print 'Sample ', jj, ': ',
                    if stochastic:
                        ss = sample
                    else:
                        score = score / numpy.array([len(s) for s in sample])
                        ss = sample[score.argmin()]
                    for vv in ss:
                        if vv == 0:
                            break
                        if vv in worddicts_r[1]:
                            print worddicts_r[1][vv],
                        else:
                            print 'UNK',
                    print

            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs = pred_probs(f_log_probs, prepare_data,
                                        model_options, valid)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= \
                        numpy.array(history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                if numpy.isnan(valid_err):
                    import ipdb
                    ipdb.set_trace()

                print 'Valid ', valid_err

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid_err = pred_probs(f_log_probs, prepare_data,
                           model_options, valid).mean()

    print 'Valid ', valid_err

    params = copy.copy(best_p)
    numpy.savez(saveto, zipped_params=best_p,
                history_errs=history_errs,
                **params)

    return valid_err
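# The validation block above implements patience-based early stopping: the
# best parameters are snapshotted whenever validation error hits a new
# minimum, and a bad_counter accrues whenever the current error fails to
# beat the minimum from at least `patience` checks ago. A stripped-down
# restatement of that rule over a plain list of errors, as a sketch:
def early_stop(history_errs, patience, bad_counter):
    """Returns (stop, bad_counter) after one validation measurement."""
    if len(history_errs) <= patience:
        return False, bad_counter
    if history_errs[-1] >= min(history_errs[:-patience]):
        bad_counter += 1
    return bad_counter > patience, bad_counter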
def train(dim_word_desc=400,  # word vector dimensionality
          dim_word_q=400,
          dim_word_ans=600,
          dim_proj=300,
          dim=400,  # the number of LSTM units
          encoder_desc='lstm',
          encoder_desc_word='lstm',
          encoder_desc_sent='lstm',
          use_dq_sims=False,
          eyem=None,
          learn_h0=False,
          use_desc_skip_c_g=False,
          debug=False,
          encoder_q='lstm',
          patience=10,
          max_epochs=5000,
          dispFreq=100,
          decay_c=0.,
          alpha_c=0.,
          clip_c=-1.,
          lrate=0.01,
          n_words_q=49145,
          n_words_desc=115425,
          n_words_ans=409,
          pkl_train_files=None,
          pkl_valid_files=None,
          maxlen=2000,  # maximum length of the description
          optimizer='rmsprop',
          batch_size=2,
          vocab=None,
          valid_batch_size=16,
          use_elu_g=False,
          saveto='model.npz',
          model_dir=None,
          ms_nlayers=3,
          validFreq=1000,
          saveFreq=1000,  # save the parameters after every saveFreq updates
          datasets=[None],
          truncate=400,
          momentum=0.9,
          use_bidir=False,
          cost_mask=None,
          valid_datasets=['/u/yyu/stor/caglar/rc-data/cnn/cnn_test_data.h5',
                          '/u/yyu/stor/caglar/rc-data/cnn/cnn_valid_data.h5'],
          dropout_rate=0.5,
          use_dropout=True,
          reload_=True,
          **opt_ds):

    ensure_dir_exists(model_dir)
    mpath = os.path.join(model_dir, saveto)
    mpath_best = os.path.join(model_dir, prfx("best", saveto))
    mpath_last = os.path.join(model_dir, prfx("last", saveto))
    mpath_stats = os.path.join(model_dir, prfx("stats", saveto))

    # Model options
    model_options = locals().copy()
    model_options['use_sent_reps'] = opt_ds['use_sent_reps']
    stats = defaultdict(list)

    del model_options['eyem']
    del model_options['cost_mask']

    if cost_mask is not None:
        cost_mask = sharedX(cost_mask)

    # reload options and parameters
    if reload_:
        print "Reloading the model."
        if os.path.exists(mpath_best):
            print "Reloading the best model from %s." % mpath_best
            with open(os.path.join(mpath_best, '%s.pkl' % mpath_best), 'rb') as f:
                model_options = pkl.load(f)
            params = init_params(model_options)
            params = load_params(mpath_best, params)
        elif os.path.exists(mpath):
            print "Reloading the model from %s." % mpath
            with open(os.path.join(mpath, '%s.pkl' % mpath), 'rb') as f:
                model_options = pkl.load(f)
            params = init_params(model_options)
            params = load_params(mpath, params)
        else:
            raise IOError("Couldn't open the file.")
    else:
        print "No reload requested, initializing the model from scratch."
        params = init_params(model_options)

    if datasets[0]:
        print "Short dataset", datasets[0]

    print 'Loading data'
    print 'Building model'
    if pkl_train_files is None or pkl_valid_files is None:
        train, valid, test = load_data(path=datasets[0],
                                       valid_path=valid_datasets[0],
                                       test_path=valid_datasets[1],
                                       batch_size=batch_size,
                                       **opt_ds)
    else:
        train, valid, test = load_pkl_data(train_file_paths=pkl_train_files,
                                           valid_file_paths=pkl_valid_files,
                                           batch_size=batch_size,
                                           vocab=vocab,
                                           eyem=eyem,
                                           **opt_ds)

    tparams = init_tparams(params)
    trng, use_noise, inps_d, \
        opt_ret, \
        cost, errors, ent_errors, ent_derrors, probs = \
        build_model(tparams,
                    model_options,
                    prepare_data if not opt_ds['use_sent_reps']
                    else prepare_data_sents,
                    valid,
                    cost_mask=cost_mask)

    alphas = opt_ret['dec_alphas']

    if opt_ds['use_sent_reps']:
        inps = [inps_d["desc"],
                inps_d["word_mask"],
                inps_d["q"],
                inps_d['q_mask'],
                inps_d['ans'],
                inps_d['wlen'],
                inps_d['slen'],
                inps_d['qlen'],
                inps_d['ent_mask']]
    else:
        inps = [inps_d["desc"],
                inps_d["word_mask"],
                inps_d["q"],
                inps_d['q_mask'],
                inps_d['ans'],
                inps_d['wlen'],
                inps_d['qlen'],
                inps_d['ent_mask']]

    outs = [cost, errors, probs, alphas]
    if ent_errors:
        outs += [ent_errors]

    if ent_derrors:
        outs += [ent_derrors]

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, outs, profile=profile)
    print 'Done'

    # Apply weight decay on the feed-forward connections
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.

        for kk, vv in tparams.iteritems():
            if "logit" in kk or "ff" in kk:
                weight_decay += (vv ** 2).sum()

        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer
    print 'Computing gradient...',
    grads = safe_grad(cost, itemlist(tparams))
    print 'Done'

    # Gradient clipping:
    if clip_c > 0.:
        g2 = get_norms(grads)
        for p, g in grads.iteritems():
            grads[p] = tensor.switch(g2 > (clip_c ** 2),
                                     (g / tensor.sqrt(g2 + 1e-8)) * clip_c,
                                     g)

    # drop the last input (ent_mask) before building the optimizer functions
    inps.pop()
    if optimizer.lower() == "adasecant":
        learning_rule = Adasecant(delta_clip=25.0,
                                  use_adagrad=True,
                                  grad_clip=0.25,
                                  gamma_clip=0.)
    elif optimizer.lower() == "rmsprop":
        learning_rule = RMSPropMomentum(init_momentum=momentum)
    elif optimizer.lower() == "adam":
        learning_rule = Adam()
    elif optimizer.lower() == "adadelta":
        learning_rule = AdaDelta()

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    # NOTE: this discards the learning rules constructed above, so the
    # eval(optimizer) branch below is what actually runs.
    learning_rule = None

    if learning_rule:
        f_grad_shared, f_update = learning_rule.get_funcs(learning_rate=lr,
                                                          grads=grads,
                                                          inp=inps,
                                                          cost=cost,
                                                          errors=errors)
    else:
        f_grad_shared, f_update = eval(optimizer)(lr,
                                                  tparams,
                                                  grads,
                                                  inps,
                                                  cost,
                                                  errors)

    print 'Done'

    print 'Optimization'
    history_errs = []
    # reload history
    if reload_ and os.path.exists(mpath):
        history_errs = list(numpy.load(mpath)['history_errs'])

    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size

    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    best_found = False
    uidx = 0
    estop = False

    train_cost_ave, train_err_ave, \
        train_gnorm_ave = reset_train_vals()

    for eidx in xrange(max_epochs):
        n_samples = 0

        if train.done:
            train.reset()

        for d_, q_, a, em in train:
            n_samples += len(a)
            uidx += 1
            use_noise.set_value(1.)

            if opt_ds['use_sent_reps']:
                # To mask the description and the question.
                d, d_mask, q, q_mask, dlen, slen, qlen = prepare_data_sents(d_, q_)

                if d is None:
                    print 'Minibatch with zero sample under length ', maxlen
                    uidx -= 1
                    continue

                ud_start = time.time()
                cost, errors, gnorm, pnorm = f_grad_shared(d, d_mask, q, q_mask,
                                                           a, dlen, slen, qlen)
            else:
                d, d_mask, q, q_mask, dlen, qlen = prepare_data(d_, q_)

                if d is None:
                    print 'Minibatch with zero sample under length ', maxlen
                    uidx -= 1
                    continue

                ud_start = time.time()
                cost, errors, gnorm, pnorm = f_grad_shared(d, d_mask, q, q_mask,
                                                           a, dlen, qlen)

            upnorm = f_update(lrate)
            ud = time.time() - ud_start

            # Collect the running ave train stats.
            train_cost_ave = running_ave(train_cost_ave, cost)
            train_err_ave = running_ave(train_err_ave, errors)
            train_gnorm_ave = running_ave(train_gnorm_ave, gnorm)

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                import ipdb
                ipdb.set_trace()

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, ' Update ', uidx, \
                    ' Cost ', cost, ' UD ', ud, \
                    ' UpNorm ', upnorm[0].tolist(), \
                    ' GNorm ', gnorm, \
                    ' Pnorm ', pnorm, 'Terrors ', errors

            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                if best_p is not None and best_found:
                    numpy.savez(mpath_best, history_errs=history_errs, **best_p)
                    pkl.dump(model_options, open('%s.pkl' % mpath_best, 'wb'))
                else:
                    params = unzip(tparams)

                numpy.savez(mpath, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % mpath, 'wb'))
                pkl.dump(stats, open("%s.pkl" % mpath_stats, 'wb'))

                print 'Done'
                print_param_norms(tparams)

            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                if valid.done:
                    valid.reset()

                valid_costs, valid_errs, valid_probs, \
                    valid_alphas, error_ent, error_dent = \
                    eval_model(f_log_probs,
                               prepare_data if not opt_ds['use_sent_reps']
                               else prepare_data_sents,
                               model_options,
                               valid,
                               use_sent_rep=opt_ds['use_sent_reps'])

                valid_alphas_ = numpy.concatenate(
                    [va.argmax(0) for va in valid_alphas.tolist()], axis=0)
                valid_err = valid_errs.mean()
                valid_cost = valid_costs.mean()
                valid_alpha_ent = -negentropy(valid_alphas)

                mean_valid_alphas = valid_alphas_.mean()
                std_valid_alphas = valid_alphas_.std()

                mean_valid_probs = valid_probs.argmax(1).mean()
                std_valid_probs = valid_probs.argmax(1).std()

                history_errs.append([valid_cost, valid_err])

                stats['train_err_ave'].append(train_err_ave)
                stats['train_cost_ave'].append(train_cost_ave)
                stats['train_gnorm_ave'].append(train_gnorm_ave)
                stats['valid_errs'].append(valid_err)
                stats['valid_costs'].append(valid_cost)
                stats['valid_err_ent'].append(error_ent)
                stats['valid_err_desc_ent'].append(error_dent)
                stats['valid_alphas_mean'].append(mean_valid_alphas)
                stats['valid_alphas_std'].append(std_valid_alphas)
                stats['valid_alphas_ent'].append(valid_alpha_ent)
                stats['valid_probs_mean'].append(mean_valid_probs)
                stats['valid_probs_std'].append(std_valid_probs)

                if uidx == 0 or valid_err <= numpy.array(history_errs)[:, 1].min():
                    best_p = unzip(tparams)
                    bad_counter = 0
                    best_found = True
                else:
                    best_found = False

                if numpy.isnan(valid_err):
                    import ipdb
                    ipdb.set_trace()

                print "============================"
                print '\t>>>Valid error: ', valid_err, \
                    ' Valid cost: ', valid_cost
                print '\t>>>Valid pred mean: ', mean_valid_probs, \
                    ' Valid pred std: ', std_valid_probs
                print '\t>>>Valid alphas mean: ', mean_valid_alphas, \
                    ' Valid alphas std: ', std_valid_alphas, \
                    ' Valid alpha negent: ', valid_alpha_ent, \
                    ' Valid error ent: ', error_ent, \
                    ' Valid error desc ent: ', error_dent
                print "============================"
                print "Running average train stats "
                print '\t>>>Train error: ', train_err_ave, \
                    ' Train cost: ', train_cost_ave, \
                    ' Train grad norm: ', train_gnorm_ave
                print "============================"

                train_cost_ave, train_err_ave, \
                    train_gnorm_ave = reset_train_vals()

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid.reset()
    valid_cost, valid_error, valid_probs, \
        valid_alphas, error_ent, error_dent = \
        eval_model(f_log_probs,
                   prepare_data if not opt_ds['use_sent_reps']
                   else prepare_data_sents,
                   model_options,
                   valid,
                   use_sent_rep=opt_ds['use_sent_reps'])

    print "Final eval results: "
    print 'Valid error: ', valid_error.mean()
    print 'Valid cost: ', valid_cost.mean()
    print '\t>>>Valid pred mean: ', valid_probs.mean(), \
        ' Valid pred std: ', valid_probs.std(), \
        ' Valid error ent: ', error_ent

    params = copy.copy(best_p)
    numpy.savez(mpath_last, zipped_params=best_p,
                history_errs=history_errs,
                **params)

    return valid_err, valid_cost
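# running_ave and reset_train_vals are defined elsewhere in this codebase;
# the loop above relies on them for smoothed training statistics. One
# plausible contract, shown purely as an assumption (an exponential moving
# average that passes the first value through after a reset):
def running_ave_sketch(prev, new, alpha=0.9):
    if prev == 0.:  # freshly reset, as returned by reset_train_vals()
        return new
    return alpha * prev + (1. - alpha) * new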
def train(options, data, load_params=False, start_epoc=0):
    print "OPTIONS: ", options

    print 'Setting up model with options:'
    options = set_defaults(options)
    for kk, vv in options.iteritems():
        print kk, vv

    print "model seed: ", options['model_seed']
    print "fold: ", options['fold']
    print 'seed: ', options['seed']

    rng = numpy.random.RandomState(options['model_seed'] +
                                   100 * options.get('fold', 99) +
                                   options.get('seed', 99))

    params, operators = init_params(options, rng)
    print 'done...'

    if load_params:
        loaded = load_par(options)
        start_epoc = resume_epoc(options)
        # Check that we've loaded the correct parameters...
        for kk, vv in loaded.iteritems():
            assert params[kk].shape == vv.shape
            assert type(params[kk]) == type(vv)
        params = loaded

    tparams = init_tparams(params)

    trng, use_noise, inps, out = build_model(tparams, options, rng)

    y = tensor.imatrix('y')
    cost = nll(out, y)

    f_eval = theano.function([inps, y],
                             cost,
                             givens={use_noise: numpy.float32(0.)},
                             on_unused_input='ignore')

    # L1/L2 penalty on hidden-layer weights only
    reg = 0.
    for k, v in tparams.iteritems():
        if k[:6] == 'hidden' or k[-3:] == 'W_h':
            reg += options['l1'] * tensor.sum(abs(v))
            reg += options['l2'] * tensor.sum((v) ** 2)

    cost += reg

    grads = tensor.grad(cost, wrt=itemlist(tparams))
    lr = tensor.scalar(name='lr', dtype=theano.config.floatX)
    opt = get_optim(options['opt'])

    print 'Compiling functions'
    f_grad_shared, f_update, gshared = opt(lr, tparams, grads,
                                           [inps, y], cost, use_noise)
    f_out = theano.function([inps],
                            out,
                            givens={use_noise: numpy.float32(0.)},
                            on_unused_input='ignore',
                            allow_input_downcast=True)

    best = numpy.inf

    print 'Starting training'
    train = list_update(data[0], f_eval, options['batch_size'], rng=rng)
    test = list_update(data[-1], f_eval, options['batch_size'], rng=rng)
    starting = (train, test)
    print 'Pre-training. test: %f, train: %f' % (test, train)

    print 'Training'
    lr = options['lr']
    max_itr = options['max_itr']
    grad_norm = 0.
    train_scores = 50 * [0.]

    try:
        for epoch in xrange(max_itr):
            start_time = time.time()
            for g in gshared:
                # manually set gradients to 0 because we accumulate in list update
                g.set_value(0.0 * g.get_value())
            use_noise.set_value(1.)
            train_cost, n_obs = list_update(data[0], f_grad_shared,
                                            batchsize=options['batch_size'],
                                            rng=rng, return_n_obs=True)
            use_noise.set_value(0.)
            for g in gshared:
                g.set_value(floatx(g.get_value() / float(n_obs)))
            f_update(lr)
            apply_proximity(tparams, operators)
            train = list_update(data[0], f_eval, options['batch_size'], rng=rng)
            elapsed_time = time.time() - start_time

            if train < best:
                # early stopping on the training set
                test = list_update(data[-1], f_eval)
                best_par = unzip(tparams)
                best_perf = (train, test)
                best = train

            test = list_update(data[-1], f_eval)
            if (epoch % 50) == 0:
                # Save progress....
                save_progress(options, tparams, epoch, best_perf)

            print 'Epoch: %d, cost: %f, train: %f, test: %f, lr: %f, time: %f' % (
                epoch, train_cost, train, test, lr, elapsed_time)

            # Check if we're diverging...
            train_ave = running_ave(train_scores, train, epoch)
            if epoch > 1000:
                # Only exit if we're diverging after 1000 iterations
                if train_ave > 1.03 * best_perf[0]:
                    print "Diverged..."
                    break
    except KeyboardInterrupt:
        print "Interrupted"

    # check that we're outputting prob distributions
    X = data[0][(3, 3)][0]
    assert abs(f_out(X.reshape(X.shape[0], 2, 3, 3)).sum() -
               float(X.shape[0])) < 1e-4

    print "Best performance:"
    print "train, test"
    print "%f,%f" % best_perf

    return best_perf, best_par
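# train() above penalizes only hidden-layer weights with an elastic-net
# style term, l1 * sum(|W|) + l2 * sum(W^2), built symbolically on the
# cost. The same penalty in plain numpy over a dict of hypothetical
# parameter arrays, as a sketch of what the symbolic loop computes:
import numpy

def l1_l2_penalty(param_dict, l1, l2):
    reg = 0.
    for name, w in param_dict.items():
        if name.startswith('hidden') or name.endswith('W_h'):
            reg += l1 * numpy.abs(w).sum() + l2 * (w ** 2).sum()
    return reg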