def make_functions(input_size, output_size, mem_size, mem_width,
                   hidden_sizes=[100]):
    start_time = time.time()
    input_seqs = T.btensor3('input_sequences')
    output_seqs = T.btensor3('output_sequences')

    P = Parameters()
    process = model.build(P, input_size, output_size,
                          mem_size, mem_width, hidden_sizes[0])
    outputs = process(T.cast(input_seqs, 'float32'))

    output_length = (input_seqs.shape[1] - 2) // 2
    Y = output_seqs[:, -output_length:, :-2]
    Y_hat = T.nnet.sigmoid(outputs[:, -output_length:, :-2])
    cross_entropy = T.mean(T.nnet.binary_crossentropy(Y_hat, Y))
    bits_loss = cross_entropy * (Y.shape[1] * Y.shape[2]) / T.log(2)

    params = P.values()
    cost = cross_entropy  # + 1e-5 * sum(T.sum(T.sqr(w)) for w in params)

    print "Computing gradients",
    grads = T.grad(cost, wrt=params)
    grads = updates.clip_deltas(grads, np.float32(clip_length))
    print "Done. (%0.3f s)" % (time.time() - start_time)

    start_time = time.time()
    print "Compiling function",
    P_learn = Parameters()
    update_pairs = updates.rmsprop(params, grads,
                                   learning_rate=1e-4, P=P_learn)
    train = theano.function(
        inputs=[input_seqs, output_seqs],
        outputs=cross_entropy,
        updates=update_pairs,
    )
    test = theano.function(inputs=[input_seqs, output_seqs],
                           outputs=bits_loss)
    print "Done. (%0.3f s)" % (time.time() - start_time)
    print P.parameter_count()
    return P, P_learn, train, test

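# Quick numpy check (illustrative, not from the code above) of the
# bits_loss rescaling in make_functions: binary_crossentropy is
# measured in nats, so dividing by log(2) converts to bits, and
# multiplying by the number of predicted entries gives total bits lost.
import numpy as np
p, y = 0.5, 1.0
nats = -(y * np.log(p) + (1 - y) * np.log(1 - p))   # ~0.693 nats
print nats / np.log(2)                              # exactly 1 bit
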
def make_train(input_size, output_size, mem_size, mem_width,
               hidden_sizes=[100]):
    P = Parameters()
    ctrl = controller.build(P, input_size, output_size,
                            mem_size, mem_width, hidden_sizes)
    predict = model.build(P, mem_size, mem_width, hidden_sizes[-1], ctrl)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    seqs = predict(input_seq)
    output_seq_pred = seqs[-1]

    cross_entropy = T.sum(T.nnet.binary_crossentropy(
        5e-6 + (1 - 2 * 5e-6) * output_seq_pred, output_seq), axis=1)
    cost = T.sum(cross_entropy)  # + 1e-3 * l2

    params = P.values()
    grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]

    train = theano.function(inputs=[input_seq, output_seq],
                            outputs=T.sum(cross_entropy),
                            updates=updates.adadelta(params, grads))
    return P, train

def make_train(input_size, output_size, mem_size, mem_width, hidden_size=100):
    P = Parameters()
    # Build controller. ctrl is a network that takes an external and read
    # input and returns the output of the network and its hidden layer
    ctrl = controller.build(P, input_size, output_size,
                            mem_size, mem_width, hidden_size)
    # Build model that predicts output sequence given input sequence
    predict = model.build(P, mem_size, mem_width, hidden_size, ctrl)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    [M, weights, output_seq_pred] = predict(input_seq)

    # Setup for adadelta updates
    cross_entropy = T.sum(T.nnet.binary_crossentropy(
        5e-6 + (1 - 2 * 5e-6) * output_seq_pred, output_seq), axis=1)

    params = P.values()
    l2 = T.sum(0)
    for p in params:
        l2 = l2 + (p ** 2).sum()
    cost = T.sum(cross_entropy) + 1e-3 * l2

    # clip gradients
    grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]

    train = theano.function(inputs=[input_seq, output_seq],
                            outputs=cost,
                            updates=updates.adadelta(params, grads))
    return P, train

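# A minimal numpy sketch (not from the original code) of why both
# make_train variants squash predictions with 5e-6 + (1 - 2*5e-6) * p:
# binary cross-entropy takes log(p) and log(1 - p), so a saturated
# sigmoid output of exactly 0.0 or 1.0 would produce inf losses and
# nan gradients. The affine squash maps [0, 1] into [eps, 1 - eps].
import numpy as np

def clamped_bce(p, y, eps=5e-6):
    p = eps + (1 - 2 * eps) * p
    return -(y * np.log(p) + (1 - y) * np.log(1 - p))

p = np.array([0.0, 0.5, 1.0])
y = np.array([0.0, 1.0, 1.0])
print clamped_bce(p, y)  # finite everywhere: ~[5e-6, 0.693, 5e-6]
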
def prepare_functions(input_size, hidden_size, latent_size, step_count,
                      batch_size, train_X, valid_X):
    P = Parameters()
    encode_decode = model.build(P,
                                input_size=input_size,
                                hidden_size=hidden_size,
                                latent_size=latent_size)
    P.W_decoder_input_0.set_value(P.W_decoder_input_0.get_value() * 10)

    X = T.matrix('X')
    step_count = 10  # NOTE: overrides the step_count argument
    parameters = P.values()

    # Build the variational lower bound for every prefix of steps and
    # average the costs over all step counts.
    cost_symbs = []
    for s in xrange(step_count):
        Z_means, Z_stds, alphas, \
            X_mean, log_pi_samples = encode_decode(X, step_count=s + 1)
        batch_recon_loss, log_p = model.recon_loss(X, X_mean, log_pi_samples)
        recon_loss = T.mean(batch_recon_loss, axis=0)
        reg_loss = T.mean(model.reg_loss(Z_means, Z_stds, alphas), axis=0)
        vlb = recon_loss + reg_loss
        corr = T.mean(T.eq(T.argmax(log_p, axis=0),
                           T.argmax(log_pi_samples, axis=0)), axis=0)
        cost_symbs.append(vlb)

    avg_cost = sum(cost_symbs) / step_count
    cost = avg_cost + 1e-3 * sum(T.sum(T.sqr(w)) for w in parameters)
    gradients = updates.clip_deltas(T.grad(cost, wrt=parameters), 5)

    print "Updated parameters:"
    pprint(parameters)

    idx = T.iscalar('idx')
    train = theano.function(
        inputs=[idx],
        outputs=[vlb, recon_loss, reg_loss,
                 T.max(T.argmax(log_pi_samples, axis=0)), corr],
        updates=updates.adam(parameters, gradients, learning_rate=1e-4),
        givens={X: train_X[idx * batch_size:(idx + 1) * batch_size]})
    validate = theano.function(inputs=[], outputs=vlb, givens={X: valid_X})
    sample = theano.function(
        inputs=[],
        outputs=[X, X_mean,
                 T.argmax(log_pi_samples, axis=0),
                 T.exp(log_pi_samples)],
        givens={X: valid_X[:10]})
    return train, validate, sample

def make_model(input_size=8, output_size=8, mem_size=128, mem_width=20,
               hidden_sizes=[100]):
    P = Parameters()
    ctrl = controller.build(P, input_size, output_size,
                            mem_size, mem_width, hidden_sizes)
    predict = model.build(P, mem_size, mem_width, hidden_sizes[-1], ctrl)

    input_seq = T.matrix('input_sequence')
    [M_curr, weights, output] = predict(input_seq)

    test_fun = theano.function(inputs=[input_seq], outputs=[weights, output])
    return P, test_fun

def build_network(input_size, hidden_size, constraint_adj=False):
    P = Parameters()
    X = T.bmatrix('X')

    P.W_input_hidden = U.initial_weights(input_size, hidden_size)
    P.b_hidden = U.initial_weights(hidden_size)
    P.b_output = U.initial_weights(input_size)

    hidden_lin = T.dot(X, P.W_input_hidden) + P.b_hidden
    hidden = T.nnet.sigmoid(hidden_lin)
    # Decoder reuses the encoder weights transposed (tied weights).
    output = T.nnet.softmax(T.dot(hidden, P.W_input_hidden.T) + P.b_output)

    parameters = P.values()
    cost = build_error(X, output, P)
    if constraint_adj:
        pass  # cost = cost + adjacency_constraint(hidden_lin)

    return X, output, cost, P

def make_train_functions():
    P = Parameters()
    X = T.bvector('X')
    Y = T.ivector('Y')
    aux = {}
    predict = model.build(
        P,
        input_size=128,
        embedding_size=64,
        controller_size=256,
        stack_size=256,
        output_size=128,
    )
    output = predict(X, aux=aux)
    error = -T.log(output[T.arange(Y.shape[0]), ((128 + 1 + Y) % (128 + 1))])
    # Only score the second half of the sequence (the part to be produced).
    error = error[-(Y.shape[0] / 2):]
    parameters = P.values()
    gradients = T.grad(T.sum(error), wrt=parameters)
    shapes = [p.get_value().shape for p in parameters]

    # Accumulate gradients over several calls, then apply their average.
    count = theano.shared(np.float32(0))
    acc_grads = [theano.shared(np.zeros(s, dtype=np.float32))
                 for s in shapes]
    acc_update = [(a, a + g) for a, g in zip(acc_grads, gradients)] + \
                 [(count, count + np.float32(1))]
    acc_clear = [(a, np.float32(0) * a) for a in acc_grads] + \
                [(count, np.float32(0))]
    avg_grads = [(g / count) for g in acc_grads]
    avg_grads = [clip(g, 1) for g in avg_grads]

    acc = theano.function(
        inputs=[X, Y],
        outputs=T.mean(error),
        updates=acc_update,
    )
    update = theano.function(
        inputs=[],
        updates=updates.adadelta(parameters, avg_grads,
                                 learning_rate=1e-8) + acc_clear)
    test = theano.function(
        inputs=[X],
        outputs=T.argmax(output, axis=1)[-(X.shape[0] / 2):],
    )
    return acc, update, test

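# A self-contained sketch (hypothetical toy problem, not part of the
# model above) of the accumulate-then-apply pattern that
# make_train_functions uses: gradients from several acc() calls are
# summed into shared buffers, then a single update applies their
# average and clears the buffers. This is useful when examples are
# variable-length sequences that cannot be stacked into one batch.
import numpy as np
import theano
import theano.tensor as T

w = theano.shared(np.float32(0.0), name='w')
x = T.fscalar('x')
loss = T.sqr(w - x)            # toy per-example loss
grad = T.grad(loss, wrt=w)

acc_grad = theano.shared(np.float32(0.0))
count = theano.shared(np.float32(0.0))
accumulate = theano.function(
    [x], loss,
    updates=[(acc_grad, acc_grad + grad),
             (count, count + np.float32(1))])
apply_avg = theano.function(
    [],
    updates=[(w, w - np.float32(0.5) * acc_grad / count),  # SGD on the average
             (acc_grad, np.float32(0) * acc_grad),
             (count, np.float32(0) * count)])

for x_i in [1.0, 2.0, 3.0]:
    accumulate(np.float32(x_i))
apply_avg()
print w.get_value()  # moves to the mean of the examples (2.0)
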
def make_model(input_size=8, output_size=8, mem_size=128, mem_width=20,
               hidden_size=100):
    """
    Given the model parameters, return a Theano function for the NTM's model
    """
    P = Parameters()
    # Build the controller
    ctrl = controller.build(P, input_size, output_size,
                            mem_size, mem_width, hidden_size)
    predict = model.build(P, mem_size, mem_width, hidden_size, ctrl)

    input_seq = T.matrix('input_sequence')
    [M_curr, weights, output] = predict(input_seq)

    # Return a Theano function for the NTM
    test_fun = theano.function(inputs=[input_seq], outputs=[weights, output])
    return P, test_fun

def create_model(ids, vocab2id, size):
    word_vector_size = size
    hidden_state_size = size

    P = Parameters()
    P.V = create_vocab_vectors(P, vocab2id, word_vector_size)
    P.W_predict = np.zeros(P.V.get_value().shape).T
    P.b_predict = np.zeros((P.V.get_value().shape[0],))
    X = P.V[ids]

    step = build_lstm_step(P, word_vector_size, hidden_state_size)
    [states, _], _ = theano.scan(step,
                                 sequences=[X],
                                 outputs_info=[P.init_h, P.init_c])

    scores = T.dot(states, P.W_predict) + P.b_predict
    scores = T.nnet.softmax(scores)
    # Score each next word given the words seen so far.
    log_likelihood, cross_ent = word_cost(scores[:-1], ids[1:])

    cost = log_likelihood  # + 1e-4 * sum(T.sum(abs(w)) for w in P.values())
    obv_cost = cross_ent
    return scores, cost, obv_cost, P

# (Fragment: the lines below continue a layer-builder function whose
# signature is not included in this snippet.)
    acc_size = 0
    for i, size in enumerate(input_sizes):
        P["W_%s_%d" % (name, i)] = weights[acc_size:acc_size + size]
        Ws.append(P["W_%s_%d" % (name, i)])
        acc_size += size
    P["b_%s" % name] = np.zeros((output_size,), dtype=np.float32)
    b = P["b_%s" % name]

    def transform(Xs):
        acc = 0.
        for X, W in zip(Xs, Ws):
            if X.dtype.startswith('int'):
                acc += W[X]
            else:
                acc += T.dot(X, W)
        output = activation(acc + b)
        output.name = name
        return output

    return transform


if __name__ == "__main__":
    import vae
    P = Parameters()
    inferer = build_classifier(P, "z1_latent", [10, 5], [5, 5, 5, 5], 5)
    print inferer([T.constant(np.arange(5)),
                   T.constant(np.eye(5, dtype=np.float32))]).eval()

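# A small numpy check (illustrative, not from the repo) of the integer
# branch in transform() above: for index inputs, W[X] selects rows of
# W, which equals the dot product of the one-hot encoding of X with W,
# without ever materialising the one-hot matrix.
import numpy as np

W = np.arange(15, dtype=np.float32).reshape(5, 3)
idx = np.array([0, 3, 3, 1])
one_hot = np.eye(5, dtype=np.float32)[idx]
print np.allclose(np.dot(one_hot, W), W[idx])  # True
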
# (Fragment: the lines below close the target energy function U(z);
# the definitions of z1, z2, w1, w3, w4 and of lse appear earlier in
# the original file and are not included here.)
    B1 = -0.5 * (((z2 - w1) / 0.4) ** 2) - 0.1 * w4
    B2 = -0.5 * (((z2 - w1 + w3) / 0.35) ** 2) - 0.1 * w4
    B3 = -0.5 * (z1 ** 2 + z2 ** 2 / 5.)
    return lse(lse(B1, B2), B3)


from theano_toolkit.parameters import Parameters
from theano_toolkit import updates
from pprint import pprint

floatX = theano.config.floatX

print 'building model'
z0 = T.matrix('z0')
P = Parameters()
iaf, masks = iaf_made_wn(P, L=8, num_units=64, num_hids=1,
                         nonl=T.nnet.elu, cond_bias=False)
zT, ss = iaf(z0, cond_bias=None)
parameters = P.values()
pprint(parameters)

# Variational objective for fitting the flow to the energy U:
# minimise the mean of log q(zT) - log p(zT).
logp = U(zT)
logq = -ss
losses = logq - logp
loss = losses.mean()

gradients = updates.clip_deltas(T.grad(loss, wrt=parameters), 5)
P_train = Parameters()
fupdates = updates.adam(parameters, gradients,

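# Hedged numpy sketch of the pairwise lse (log-sum-exp) helper that
# U(z) above relies on but whose definition is not in this snippet:
# computing log(exp(a) + exp(b)) around the running maximum keeps the
# energy finite even when B1, B2, B3 are large and negative.
import numpy as np

def lse(a, b):
    m = np.maximum(a, b)
    return m + np.log(np.exp(a - m) + np.exp(b - m))

print lse(-1000.0, -1001.0)  # ~-999.69; naive log(exp + exp) gives -inf
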
def crossentropy(output, Y):
    # If the output node is a softmax, compute -log softmax(x)[Y]
    # directly from the logits for numerical stability.
    if output.owner.op == T.nnet.softmax_op:
        x = output.owner.inputs[0]
        k = T.max(x, axis=1, keepdims=True)
        sum_x = T.log(T.sum(T.exp(x - k), axis=1)) + k[:, 0]
        return -x[T.arange(x.shape[0]), Y] + sum_x
    else:
        return T.nnet.categorical_crossentropy(output, Y)


if __name__ == "__main__":
    config.parse_args()
    total_frames = sum(x.shape[0]
                       for x, _ in frame_label_data.training_stream())
    logging.info("Total frames: %d" % total_frames)

    P = Parameters()
    predict = model.build(P)

    X = T.matrix('X')
    Y = T.ivector('Y')
    _, outputs = predict(X)
    cross_entropy = T.mean(crossentropy(outputs, Y))
    parameters = P.values()
    loss = cross_entropy + \
        (0.5 / total_frames) * sum(T.sum(T.sqr(w)) for w in parameters)
    gradients = T.grad(loss, wrt=parameters)
    logging.info("Parameters to tune: " +
                 ', '.join(sorted(w.name for w in parameters)))

    update_vars = Parameters()

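# A numpy verification sketch (assumed toy values, not production code)
# of the softmax special case in crossentropy() above: with k = max(x),
# -log softmax(x)[y] = -x[y] + log(sum(exp(x - k))) + k, which avoids
# exponentiating large logits.
import numpy as np

x = np.array([[1000.0, 1001.0, 999.0]])  # logits that overflow naive softmax
y = np.array([1])
k = np.max(x, axis=1, keepdims=True)
sum_x = np.log(np.sum(np.exp(x - k), axis=1)) + k[:, 0]
stable = -x[np.arange(x.shape[0]), y] + sum_x
print stable  # ~0.4076, while np.exp(x) on its own would overflow
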
def __init__(self, input_size, output_size, mem_size, mem_width,
             hidden_sizes, num_heads, max_epochs, momentum, learning_rate,
             grad_clip, l2_norm):
    self.input_size = input_size
    self.output_size = output_size
    self.mem_size = mem_size
    self.mem_width = mem_width
    self.hidden_sizes = hidden_sizes
    self.num_heads = num_heads
    self.max_epochs = max_epochs
    self.momentum = momentum
    self.learning_rate = learning_rate
    self.grad_clip = grad_clip
    self.l2_norm = l2_norm

    self.best_train_cost = np.inf
    self.best_valid_cost = np.inf
    # self.train = None
    # self.cost = None
    self.train_his = []

    P = Parameters()
    ctrl = controller.build(P, self.input_size, self.output_size,
                            self.mem_size, self.mem_width,
                            self.hidden_sizes)
    predict = model.build(P, self.mem_size, self.mem_width,
                          self.hidden_sizes[-1], ctrl, self.num_heads)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    [M_curr, weights, output] = predict(input_seq)
    # output_seq_pred = seqs[-1]

    cross_entropy = T.sum(T.nnet.binary_crossentropy(
        5e-6 + (1 - 2 * 5e-6) * output, output_seq), axis=1)

    self.params = P.values()
    l2 = T.sum(0)
    for p in self.params:
        l2 = l2 + (p ** 2).sum()
    cost = T.sum(cross_entropy) + self.l2_norm * l2
    # cost = T.sum(cross_entropy) + 1e-3 * l2

    grads = [T.clip(g, grad_clip[0], grad_clip[1])
             for g in T.grad(cost, wrt=self.params)]
    # grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]
    # grads = [T.clip(g, 1e-9, 0.2) for g in T.grad(cost, wrt=params)]

    self.train = theano.function(
        inputs=[input_seq, output_seq],
        outputs=cost,
        # updates=updates.adadelta(params, grads)
        updates=updates.rmsprop(self.params, grads,
                                momentum=self.momentum,
                                learning_rate=self.learning_rate))
    self.predict_cost = theano.function(inputs=[input_seq, output_seq],
                                        outputs=cost)
    self.predict = theano.function(inputs=[input_seq],
                                   outputs=[weights, output])

def weight_norm(u, norm=1.9356):
    # Rescale columns of u so that no column's L2 norm exceeds `norm`.
    in_norm = T.sqrt(T.sum(T.sqr(u), axis=0))
    ratio = T.minimum(norm, in_norm) / (in_norm + 1e-8)
    return ratio * u


def normalise_weights(updates):
    # Apply the norm constraint only to weight matrices (named 'W...').
    return [(p, weight_norm(u) if p.name.startswith('W') else u)
            for p, u in updates]


if __name__ == "__main__":
    P = Parameters()
    extract, _ = model.build(P, "vrnn")
    X = T.tensor3('X')
    l = T.ivector('l')
    [Z_prior_mean, Z_prior_std, Z_mean, Z_std,
     X_mean, X_std] = extract(X, l)
    parameters = P.values()
    batch_cost = model.cost(X, Z_prior_mean, Z_prior_std,
                            Z_mean, Z_std, X_mean, X_std, l)
    print "Calculating gradient..."
    print parameters
    batch_size = T.cast(X.shape[1], 'float32')
    gradients = T.grad(batch_cost, wrt=parameters)
    # Average over the batch, then clip.
    gradients = [g / batch_size for g in gradients]
    gradients = clip(5, parameters, gradients)

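# Illustrative numpy analogue (not the training code itself) of
# weight_norm/normalise_weights above: after an update, each column of
# a weight matrix is rescaled so its L2 norm never exceeds the bound,
# while columns already under the bound pass through unchanged.
import numpy as np

def weight_norm_np(u, norm=1.9356):
    in_norm = np.sqrt(np.sum(np.square(u), axis=0))
    ratio = np.minimum(norm, in_norm) / (in_norm + 1e-8)
    return ratio * u

W = np.array([[3.0, 0.1],
              [4.0, 0.2]])  # column norms: 5.0 and ~0.224
print np.sqrt(np.sum(np.square(weight_norm_np(W)), axis=0))
# ~[1.9356, 0.2236]: the large column is clipped, the small one kept.
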