#cost = cost.sum(axis=1) / target_mask.sum(axis=1)
#cost = cost.mean(axis=0)

# Use this one instead.
cost = cost.sum()
cost = cost / target_mask.sum()

# By default we report cross-entropy cost in bits.
# Switch to nats by commenting out this line:
# log_2(e) = 1.44269504089
cost = cost * lib.floatX(numpy.log2(numpy.e))

### Getting the params, grads, updates, and Theano functions ###
params = lib.get_params(cost, lambda x: hasattr(x, 'param') and x.param == True)
lib.print_params_info(params, path=FOLDER_PREFIX)

grads = T.grad(cost, wrt=params, disconnected_inputs='warn')
grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads]

updates = lasagne.updates.adam(grads, params, learning_rate=LEARNING_RATE)

# Training function
train_fn = theano.function(
    [sequences, h0, reset, mask],
    [cost, new_h0],
    updates=updates,
    on_unused_input='warn'
)

# Validation and Test function, hence no updates
test_fn = theano.function(
    [sequences, h0, reset, mask],
    [cost, new_h0],
    on_unused_input='warn'
)
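# Sanity-check sketch (not part of the original script): the bits/nats scaling
# above is just multiplication by log2(e), so a cost of 1 nat should read as
# roughly 1.4427 bits. Pure numpy, runnable standalone.
import numpy

cost_nats = 1.0
cost_bits = cost_nats * numpy.log2(numpy.e)
assert abs(cost_bits - 1.44269504089) < 1e-9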
#prev_samples = prev_samples.reshape((1, BATCH_SIZE, 1, -1))
#prev_samples = T.nnet.neighbours.images2neibs(prev_samples, (1, FRAME_SIZE), neib_step=(1, 1), mode='valid')
#prev_samples = prev_samples.reshape((BATCH_SIZE, SEQ_LEN, FRAME_SIZE))

encoder_outputs, new_h0 = encoder(input_sequences, h0, reset)
#decoder_outputs = decoder(encoder_outputs, prev_samples)

cost = T.nnet.categorical_crossentropy(
    T.nnet.softmax(encoder_outputs),
    target_sequences.flatten()
).mean()

# Report cross-entropy in bits: log_2(e) = 1.44269504089
cost = cost * lib.floatX(1.44269504089)

params = lib.search(cost, lambda x: hasattr(x, 'param'))
lib.print_params_info(cost, params)

grads = T.grad(cost, wrt=params, disconnected_inputs='warn')
grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads]

print "Gradients Computed"

updates = lasagne.updates.adam(grads, params, learning_rate=lr)

train_fn = theano.function(
    [sequences, h0, reset, lr],
    [cost, new_h0],
    updates=updates,
    on_unused_input='warn'
)
)

readout = lib.ops.Linear(
    'Generator.GRU.Output.MLP.1',
    T.concatenate([state[:, :, -1], tiled_speaker], -1),
    DEC_DIM + SPEAKER_DIM,
    OUTPUT_DIM
)

mask_mult = T.shape_padright(mask)

cost = T.sum(T.sqr(X - readout) * mask_mult) / (T.sum(mask) * 63.)
test_cost = T.sum(T.sqr(X - predict_readout) * T.shape_padright(mask)) / (T.sum(mask) * 63.)

params = lib.search(cost, lambda x: hasattr(x, "param") and x.param == True)
lib.print_params_info(params)

grads = T.grad(cost, wrt=params, disconnected_inputs='warn')
grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads]

print "Gradients Computed"

updates = lasagne.updates.adam(grads, params, learning_rate=learn_rate)

train_fn = theano.function(
    [noise_vocoder, X, spkr_ids, ctx, mask, learn_rate],
    cost,
    updates=updates,
    on_unused_input='warn'
)
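# Illustrative numpy sketch of the masked MSE cost above (assumptions: X and
# readout have shape (batch, time, 63), i.e. 63 vocoder features per frame,
# and mask is (batch, time) with 1s marking valid frames). Squared error is
# zeroed on padded frames, then averaged over valid frame-feature entries.
import numpy as np

batch, time_steps, feat = 2, 5, 63
X_np = np.random.randn(batch, time_steps, feat).astype('float32')
readout_np = np.random.randn(batch, time_steps, feat).astype('float32')
mask_np = np.ones((batch, time_steps), dtype='float32')
mask_np[1, 3:] = 0.  # second sequence is shorter

mse = np.sum(np.square(X_np - readout_np) * mask_np[:, :, None]) / (np.sum(mask_np) * 63.)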
#params = ip_params + other_params
#lib.print_params_info(params, path=FOLDER_PREFIX)
#
#grads = T.grad(cost, wrt=params, disconnected_inputs='warn')
#grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads]
#
#updates = lasagne.updates.adam(grads, params, learning_rate=LEARNING_RATE)
###########

all_params = lib.get_params(cost, lambda x: hasattr(x, 'param') and x.param == True)
ip_params = lib.get_params(ip_cost, lambda x: hasattr(x, 'param') and x.param == True
                           and 'BigFrameLevel' in x.name)
other_params = [p for p in all_params if p not in ip_params]
all_params = ip_params + other_params

lib.print_params_info(ip_params, path=FOLDER_PREFIX)
lib.print_params_info(other_params, path=FOLDER_PREFIX)
lib.print_params_info(all_params, path=FOLDER_PREFIX)

ip_grads = T.grad(ip_cost, wrt=ip_params, disconnected_inputs='warn')
ip_grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in ip_grads]

other_grads = T.grad(cost, wrt=other_params, disconnected_inputs='warn')
other_grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in other_grads]

grads = T.grad(cost, wrt=all_params, disconnected_inputs='warn')
# all zero for some of the shorter files in mini-batch.
#cost = cost.sum(axis=1) / target_mask.sum(axis=1)
#cost = cost.mean(axis=0)

# Use this one instead.
cost = cost.sum()
cost = cost / target_mask.sum()

# By default we report cross-entropy cost in bits.
# Switch to nats by commenting out this line:
# log_2(e) = 1.44269504089
cost = cost * lib.floatX(numpy.log2(numpy.e))

### Getting the params, grads, updates, and Theano functions ###
params = lib.get_params(cost, lambda x: hasattr(x, 'param') and x.param == True)
lib.print_params_info(params, path=FOLDER_PREFIX)

grads = T.grad(cost, wrt=params, disconnected_inputs='warn')
grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads]

updates = lasagne.updates.adam(grads, params, learning_rate=LEARNING_RATE)

# Training function
train_fn = theano.function(
    [sequences, mask],
    cost,
    updates=updates,
    on_unused_input='warn'
)

# Validation and Test function
#prev_samples = prev_samples.reshape((BATCH_SIZE, SEQ_LEN, FRAME_SIZE))

encoder_outputs, new_h0 = encoder(input_sequences, h0, reset)
#decoder_outputs = decoder(encoder_outputs, prev_samples)

cost = T.nnet.categorical_crossentropy(
    T.nnet.softmax(encoder_outputs),
    target_sequences.flatten()
).mean()

# Report cross-entropy in bits: log_2(e) = 1.44269504089
cost = cost * lib.floatX(1.44269504089)

params = lib.search(cost, lambda x: hasattr(x, 'param'))
lib.print_params_info(cost, params)

grads = T.grad(cost, wrt=params, disconnected_inputs='warn')
grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads]

print "Gradients Computed"

updates = lasagne.updates.adam(grads, params, learning_rate=lr)

train_fn = theano.function(
    [sequences, h0, reset, lr],
    [cost, new_h0],
    updates=updates,
    on_unused_input='warn'
)
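# Minimal standalone illustration of the cost above (assumption: the model's
# encoder_outputs have already been reshaped to a 2D (n_samples, n_classes)
# matrix, since T.nnet.softmax and categorical_crossentropy expect a matrix
# of logits and an integer vector of flattened targets).
import theano
import theano.tensor as T

logits = T.matrix('logits')     # (n_samples, n_classes)
targets = T.ivector('targets')  # flattened integer targets
xent_nats = T.nnet.categorical_crossentropy(T.nnet.softmax(logits), targets).mean()
xent_bits = xent_nats * 1.44269504089  # same log_2(e) scaling as above
xent_fn = theano.function([logits, targets], [xent_nats, xent_bits])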
def train_loop(inputs, cost, train_data, times, prints=None,
               inject_total_iters=False, test_data=None, callback=None,
               optimizer=lasagne.updates.adam, save_params=False,
               nan_guard=False):

    params = lib.search(cost, lambda x: hasattr(x, 'param'))
    lib.print_params_info(params)

    grads = T.grad(cost, wrt=params, disconnected_inputs='warn')
    grads = [T.clip(g, lib.floatX(-1), lib.floatX(1)) for g in grads]

    updates = optimizer(grads, params)

    if prints is None:
        prints = [('cost', cost)]
    else:
        prints = [('cost', cost)] + prints

    print "Compiling train function..."
    if nan_guard:
        from theano.compile.nanguardmode import NanGuardMode
        mode = NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
    else:
        mode = None
    train_fn = theano.function(
        inputs,
        [p[1] for p in prints],
        updates=updates,
        on_unused_input='warn',
        mode=mode
    )

    print "Compiling eval function..."
    eval_fn = theano.function(
        inputs,
        [p[1] for p in prints],
        on_unused_input='warn'
    )

    print "Training!"

    total_iters = 0
    total_seconds = 0.
    last_print = 0
    last_gen = 0

    if len(times) >= 4:
        gen_every = times[3]
    else:
        gen_every = times[1]

    if len(times) >= 5:
        early_stop = times[4]
        if len(times) >= 6:
            early_stop_min = times[5]
        else:
            early_stop_min = 0
    else:
        early_stop = None
        early_stop_min = None

    best_test_cost = np.inf
    best_test_cost_iter = 0.

    all_outputs = []
    all_stats = []
    for epoch in itertools.count():

        generator = train_data()
        while True:
            try:
                inputs = next(generator)
            except StopIteration:
                break

            if inject_total_iters:
                inputs = [np.int32(total_iters)] + list(inputs)

            start_time = time.time()
            outputs = train_fn(*inputs)
            total_seconds += time.time() - start_time
            total_iters += 1

            all_outputs.append(outputs)

            if total_iters == 1:
                try: # This only matters on Ishaan's computer
                    import experiment_tools
                    experiment_tools.register_crash_notifier()
                except ImportError:
                    pass

            if (times[0] == 'iters' and total_iters - last_print == times[1]) or \
                    (times[0] == 'seconds' and total_seconds - last_print >= times[1]):

                mean_outputs = np.array(all_outputs).mean(axis=0)

                if test_data is not None:
                    if inject_total_iters:
                        test_outputs = [
                            eval_fn(np.int32(total_iters), *inputs)
                            for inputs in test_data()
                        ]
                    else:
                        test_outputs = [
                            eval_fn(*inputs)
                            for inputs in test_data()
                        ]
                    test_mean_outputs = np.array(test_outputs).mean(axis=0)

                stats = collections.OrderedDict()
                stats['epoch'] = epoch
                stats['iters'] = total_iters
                for i, p in enumerate(prints):
                    stats['train ' + p[0]] = mean_outputs[i]
                if test_data is not None:
                    for i, p in enumerate(prints):
                        stats['test ' + p[0]] = test_mean_outputs[i]
                stats['secs'] = total_seconds
                stats['secs/iter'] = total_seconds / total_iters

                if test_data is not None and (stats['test cost'] < best_test_cost or
                        (early_stop_min is not None and total_iters <= early_stop_min)):
                    best_test_cost = stats['test cost']
                    best_test_cost_iter = total_iters

                print_str = ""
                for k, v in stats.items():
                    if isinstance(v, int):
                        print_str += "{}:{}\t".format(k, v)
                    else:
                        print_str += "{}:{:.4f}\t".format(k, v)
                print print_str[:-1] # omit the last \t

                all_stats.append(stats)

                all_outputs = []
                last_print += times[1]

            if (times[0] == 'iters' and total_iters - last_gen == gen_every) or \
                    (times[0] == 'seconds' and total_seconds - last_gen >= gen_every):

                tag = "iters{}_time{}".format(total_iters, total_seconds)
                if callback is not None:
                    callback(tag)
                if save_params:
                    lib.save_params('params_{}.pkl'.format(tag))

                last_gen += gen_every

            if (times[0] == 'iters' and total_iters == times[2]) or \
                    (times[0] == 'seconds' and total_seconds >= times[2]) or \
                    (test_data is not None and early_stop is not None and
                        total_iters > (3 * early_stop) and
                        (total_iters - best_test_cost_iter) > early_stop):

                if (test_data is not None and early_stop is not None and
                        total_iters > (3 * early_stop) and
                        (total_iters - best_test_cost_iter) > early_stop):
                    print "Early stop! Best test cost was {} at iter {}".format(
                        best_test_cost, best_test_cost_iter)

                print "Done!"

                try: # This only matters on Ishaan's computer
                    import experiment_tools
                    experiment_tools.send_sms("done!")
                except ImportError:
                    pass

                return all_stats
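# Hypothetical end-to-end usage of train_loop above (a toy sketch, not from
# the original repo): fit a single scalar parameter w to minimise (w - 3)^2
# on dummy data, printing every 10 iters and stopping after 100. The .param
# attribute is set so lib.search() picks the parameter up, matching how
# parameters are tagged throughout this codebase.
import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
w = theano.shared(lib.floatX(0.), name='w')
w.param = True                      # tag for lib.search()
toy_cost = T.mean(T.sqr(w - x))

def toy_data():
    for _ in xrange(10):            # 10 minibatches per "epoch"
        yield (np.full(4, 3., dtype=theano.config.floatX),)

train_loop([x], toy_cost, toy_data, times=('iters', 10, 100))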
def create_encoder_decoder():
    input_var = T.tensor3('input')
    input_var_normalised = (input_var - lib.floatX(0.5))

    mu, log_square_sigma = Encoder(input_var_normalised)
    mu = lib.floatX(2.) * T.tanh(mu / lib.floatX(2.))

    sampled_z = gaussian_sampler(mu, log_square_sigma)

    reconstructed = Decoder(sampled_z)

    reconstruction_cost = T.nnet.binary_crossentropy(
        reconstructed.reshape((reconstructed.shape[0], -1)),
        input_var.reshape((input_var.shape[0], -1))
    ).sum(axis=1)

    kl_cost = KL_with_standard_gaussian(mu, log_square_sigma)

    loss = T.mean(kl_cost + reconstruction_cost)

    params = lib.search(loss, lambda x: hasattr(x, 'param') and x.param == True)
    lib.print_params_info(params)

    grads = T.grad(loss, wrt=params, disconnected_inputs='warn')
    grads = [T.clip(g, lib.floatX(-1.), lib.floatX(1.)) for g in grads]

    lr = T.scalar('lr')
    updates = lasagne.updates.adam(grads, params, learning_rate=lr, epsilon=EPS)

    generated_z = T.matrix('generated_z')
    generated_samples = Decoder(generated_z)

    print "Compiling functions ..."

    train_fn = theano.function(
        [input_var, lr],
        [loss, kl_cost.mean(), mu.min(), mu.max(), mu, sampled_z.min(), sampled_z.max()],
        updates=updates,
        # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
    )

    reconstruct_fn = theano.function([input_var], reconstructed)

    val_fn = theano.function(
        [input_var],
        [loss, kl_cost.mean(), mu.min(), mu.max(), mu, sampled_z.min(), sampled_z.max()],
        # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
    )

    generate_fn = theano.function([generated_z], generated_samples)
    encode_fn = theano.function([input_var], mu)

    return train_fn, val_fn, generate_fn, reconstruct_fn, encode_fn
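# Hypothetical usage of the compiled VAE functions (the shapes are
# assumptions, not from this excerpt: a (batch, H, W) tensor3 of binary
# images, and a latent_dim that must match what Decoder expects).
import numpy as np

train_fn, val_fn, generate_fn, reconstruct_fn, encode_fn = create_encoder_decoder()

batch = np.random.binomial(1, 0.5, size=(16, 28, 28)).astype('float32')
lr_val = np.float32(1e-3)
loss, kl, mu_min, mu_max, mu_vals, z_min, z_max = train_fn(batch, lr_val)

latent_dim = 64  # assumption; must match the Decoder's input width
z = np.random.randn(16, latent_dim).astype('float32')  # sample the prior
samples = generate_fn(z)                                # decode to images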
tiled_speaker = T.tile(emb_spkr[:, None, :], [1, seq_len, 1])

if ENCODING:
    emb_ctx = T.concatenate(
        [T.nnet.relu(ConvolutionalMapper(aligned_ctx, mode='train')), tiled_speaker],
        -1
    )
    predict_readout = lib.ops.RNN(
        'GRU',
        'Generator.GRU',
        emb_ctx,
        EMB_DIM + SPEAKER_DIM,
        DEC_DIM,
        OUTPUT_DIM,
        n_layers=N_RNN,
        mode='open-loop-rnn'
    )

lib.print_params_info(lib._params.values())

predict_fn = theano.function(
    [spkr_ids, ctx],
    predict_readout,
    on_unused_input='warn'
)

direct_fn = theano.function(
    [spkr_ids, aligned_ctx],
    predict_readout,
    on_unused_input='warn'
)

nmt_fn = theano.function(
    [chars, chars_mask],
def train_loop(
    inputs,
    cost,
    train_data,
    times,
    prints=None,
    inject_total_iters=False,
    test_data=None,
    callback=None,
    optimizer=lasagne.updates.adam,
    save_params=False,
    nan_guard=False
):
    params = lib.search(cost, lambda x: hasattr(x, 'param'))
    lib.print_params_info(params)

    grads = T.grad(cost, wrt=params, disconnected_inputs='warn')
    grads = [T.clip(g, lib.floatX(-1), lib.floatX(1)) for g in grads]

    updates = optimizer(grads, params)

    if prints is None:
        prints = [('cost', cost)]
    else:
        prints = [('cost', cost)] + prints

    print "Compiling train function..."
    if nan_guard:
        from theano.compile.nanguardmode import NanGuardMode
        mode = NanGuardMode(
            nan_is_error=True,
            inf_is_error=True,
            big_is_error=True
        )
    else:
        mode = None
    train_fn = theano.function(
        inputs,
        [p[1] for p in prints],
        updates=updates,
        on_unused_input='warn',
        mode=mode
    )

    print "Compiling eval function..."
    eval_fn = theano.function(
        inputs,
        [p[1] for p in prints],
        on_unused_input='warn'
    )

    print "Training!"

    total_iters = 0
    total_seconds = 0.
    last_print = 0

    all_outputs = []
    all_stats = []
    for epoch in itertools.count():
        for inputs in train_data():
            if inject_total_iters:
                inputs = [np.int32(total_iters)] + list(inputs)

            start_time = time.time()
            outputs = train_fn(*inputs)
            total_seconds += time.time() - start_time
            total_iters += 1

            all_outputs.append(outputs)

            if total_iters == 1:
                try: # This only matters on Ishaan's computer
                    import experiment_tools
                    experiment_tools.register_crash_notifier()
                except ImportError:
                    pass

            if (times[0] == 'iters' and total_iters - last_print == times[1]) or \
                    (times[0] == 'seconds' and total_seconds - last_print >= times[1]):

                mean_outputs = np.array(all_outputs).mean(axis=0)

                if test_data is not None:
                    if inject_total_iters:
                        test_outputs = [
                            eval_fn(np.int32(total_iters), *inputs)
                            for inputs in test_data()
                        ]
                    else:
                        test_outputs = [
                            eval_fn(*inputs)
                            for inputs in test_data()
                        ]
                    test_mean_outputs = np.array(test_outputs).mean(axis=0)

                stats = collections.OrderedDict()
                stats['epoch'] = epoch
                stats['iters'] = total_iters
                for i, p in enumerate(prints):
                    stats['train ' + p[0]] = mean_outputs[i]
                if test_data is not None:
                    for i, p in enumerate(prints):
                        stats['test ' + p[0]] = test_mean_outputs[i]
                stats['secs'] = total_seconds
                stats['secs/iter'] = total_seconds / total_iters

                print_str = ""
                for k, v in stats.items():
                    if isinstance(v, int):
                        print_str += "{}:{}\t".format(k, v)
                    else:
                        print_str += "{}:{:.4f}\t".format(k, v)
                print print_str[:-1] # omit the last \t

                all_stats.append(stats)

                tag = "iters{}_time{}".format(total_iters, total_seconds)
                if callback is not None:
                    callback(tag)
                if save_params:
                    lib.save_params('params_{}.pkl'.format(tag))

                all_outputs = []
                last_print += times[1]

            if (times[0] == 'iters' and total_iters == times[2]) or \
                    (times[0] == 'seconds' and total_seconds >= times[2]):

                print "Done!"

                try: # This only matters on Ishaan's computer
                    import experiment_tools
                    experiment_tools.send_sms("done!")
                except ImportError:
                    pass

                return all_stats
#other_params = [p for p in params if p not in ip_params]
#params = ip_params + other_params
#lib.print_params_info(params, path=FOLDER_PREFIX)
#
#grads = T.grad(cost, wrt=params, disconnected_inputs='warn')
#grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads]
#
#updates = lasagne.updates.adam(grads, params, learning_rate=LEARNING_RATE)
###########

all_params = lib.get_params(cost, lambda x: hasattr(x, 'param') and x.param == True)
ip_params = lib.get_params(ip_cost, lambda x: hasattr(x, 'param') and x.param == True
                           and 'BigFrameLevel' in x.name)
other_params = [p for p in all_params if p not in ip_params]
all_params = ip_params + other_params

lib.print_params_info(ip_params, path=FOLDER_PREFIX)
lib.print_params_info(other_params, path=FOLDER_PREFIX)
lib.print_params_info(all_params, path=FOLDER_PREFIX)

ip_grads = T.grad(ip_cost, wrt=ip_params, disconnected_inputs='warn')
ip_grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in ip_grads]

other_grads = T.grad(cost, wrt=other_params, disconnected_inputs='warn')
other_grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in other_grads]

grads = T.grad(cost, wrt=all_params, disconnected_inputs='warn')
grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads]

ip_updates = lasagne.updates.adam(ip_grads, ip_params)
other_updates = lasagne.updates.adam(other_grads, other_params)
updates = lasagne.updates.adam(grads, all_params)
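# Reading of the block above (an inference, not stated in this excerpt): the
# three separate Adam update dicts allow a staged schedule -- pre-train the
# top-tier 'BigFrameLevel' params on ip_cost via ip_updates, train the
# remaining tiers on the full cost via other_updates, and fine-tune all
# parameters jointly via updates. Which schedule actually runs is decided
# where the corresponding theano.function is compiled, outside this excerpt.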