def prepare_functions(input_size, hidden_size, latent_size, step_count,
                      batch_size, train_X, valid_X):
    P = Parameters()
    encode_decode = model.build(P,
                                input_size=input_size,
                                hidden_size=hidden_size,
                                latent_size=latent_size)
    P.W_decoder_input_0.set_value(P.W_decoder_input_0.get_value() * 10)

    X = T.matrix('X')
    step_count = 10  # fixed override of the step_count argument
    parameters = P.values()

    # Build the variational lower bound for every step count from 1 to
    # step_count; the symbolic losses from the final iteration are reused
    # below for monitoring.
    cost_symbs = []
    for s in xrange(step_count):
        Z_means, Z_stds, alphas, \
            X_mean, log_pi_samples = encode_decode(X, step_count=s + 1)
        batch_recon_loss, log_p = model.recon_loss(X, X_mean, log_pi_samples)
        recon_loss = T.mean(batch_recon_loss, axis=0)
        reg_loss = T.mean(model.reg_loss(Z_means, Z_stds, alphas), axis=0)
        vlb = recon_loss + reg_loss
        corr = T.mean(T.eq(T.argmax(log_p, axis=0),
                           T.argmax(log_pi_samples, axis=0)), axis=0)
        cost_symbs.append(vlb)

    # Average the lower bounds and add L2 regularisation on the parameters.
    avg_cost = sum(cost_symbs) / step_count
    cost = avg_cost + 1e-3 * sum(T.sum(T.sqr(w)) for w in parameters)
    gradients = updates.clip_deltas(T.grad(cost, wrt=parameters), 5)

    print "Updated parameters:"
    pprint(parameters)

    idx = T.iscalar('idx')
    train = theano.function(
        inputs=[idx],
        outputs=[vlb, recon_loss, reg_loss,
                 T.max(T.argmax(log_pi_samples, axis=0)), corr],
        updates=updates.adam(parameters, gradients, learning_rate=1e-4),
        givens={X: train_X[idx * batch_size:(idx + 1) * batch_size]})

    validate = theano.function(inputs=[], outputs=vlb, givens={X: valid_X})

    sample = theano.function(
        inputs=[],
        outputs=[X, X_mean,
                 T.argmax(log_pi_samples, axis=0),
                 T.exp(log_pi_samples)],
        givens={X: valid_X[:10]})

    return train, validate, sample
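# The driver loop for the returned functions lives elsewhere; a minimal
# sketch of how train/validate/sample might be used is given below. The
# epoch count, batch arithmetic and logging format are assumptions, not
# part of the original code.
import numpy as np

def run_training(train, validate, sample, train_count, batch_size, epochs=10):
    batches = train_count // batch_size
    best_vlb = np.inf
    for epoch in xrange(epochs):
        for i in xrange(batches):
            vlb, recon, reg, max_component, corr = train(i)
        val_vlb = validate()
        print "epoch %d: train vlb %.4f  valid vlb %.4f" % (epoch, vlb, val_vlb)
        if val_vlb < best_vlb:
            best_vlb = val_vlb
    X_batch, X_recon, components, pi = sample()
    return best_vlb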
P = Parameters()
# MADE-based IAF (L=8 flow layers) with weight-normalised layers.
iaf, masks = iaf_made_wn(P, L=8, num_units=64,
                         num_hids=1, nonl=T.nnet.elu,
                         cond_bias=False)
zT, ss = iaf(z0, cond_bias=None)
parameters = P.values()
pprint(parameters)

# Stochastic estimate of E_q[log q(z_T) - log p(z_T)]; U gives the
# (unnormalised) target log-density.
logp = U(zT)
logq = -ss
losses = logq - logp
loss = losses.mean()

gradients = updates.clip_deltas(T.grad(loss, wrt=parameters), 5)
P_train = Parameters()
fupdates = updates.adam(parameters, gradients, learning_rate=1e-3, P=P_train)

train = theano.function([z0], [loss, logq.mean(), logp.mean()],
                        updates=fupdates)
samples = theano.function([z0], zT)

gradoldold = 0  # for debugging
gradold = 0

print 'starting training'
for i in range(50000):
    spl = np.random.randn(64, 2).astype(floatX)
    outs = train(spl)
    l = outs[0]
    lq = outs[1]
    lp = outs[2]
    gs = outs[2:]
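# The target log-density U and the base samples z0 are defined elsewhere in
# this script. As an illustration only, a stand-in 2-D target and the
# post-training sampling call might look like the following; the Gaussian
# target and sample count are assumptions, not the energy actually used above.
import numpy as np
import theano.tensor as T

def toy_U(z):
    # Unnormalised log-density of an isotropic Gaussian with std 2
    # (hypothetical stand-in for the real target).
    return -0.5 * T.sum(T.sqr(z / 2.0), axis=1)

# Drawing transformed samples from the trained flow:
# spl = np.random.randn(1000, 2).astype(floatX)
# zs = samples(spl)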
    W * W_s / W_s_batch, W * W_b / W_b_batch)  # tail of a call that begins above
output = f(X)
test_output = f(X, test=True)

# Optimise a smooth surrogate of the AMS directly; the discrete AMS at a
# 0.5 threshold is reported for monitoring.
soft_ams = ams(output, Y, W_hat)
discrete_ams = ams(output > 0.5, Y, W_hat)

parameters = P.values()
gradients = T.grad(-soft_ams, wrt=parameters)

idx = T.iscalar('idx')
batch_size = T.iscalar('batch_size')
train = theano.function(
    inputs=[idx, batch_size],
    outputs=[soft_ams, discrete_ams],
    updates=updates.adam(parameters, gradients, learning_rate=5e-4),
    givens={
        X: data_X[idx * batch_size: (idx + 1) * batch_size],
        W: data_W[idx * batch_size: (idx + 1) * batch_size],
        Y: data_Y[idx * batch_size: (idx + 1) * batch_size],
    }
)
test = theano.function(
    inputs=[X, W, Y],
    outputs=[ams(test_output > 0.5, Y, W_hat),
             ams(test_output > 0.7, Y, W_hat),
             ams(test_output > 0.9, Y, W_hat),
             ams(test_output > 0.99, Y, W_hat)]
)
print "Compilation done."
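# The ams() helper is defined elsewhere. For reference, the approximate
# median significance from the HiggsML challenge is
#     AMS = sqrt(2 * ((s + b + b_reg) * ln(1 + s / (b + b_reg)) - s)),
# with s and b the selected signal and background weights. A sketch of a
# Theano version, assuming Y in {0, 1} marks signal and W_hat holds the
# renormalised event weights (the helper in this code base may differ):
import theano.tensor as T

def ams_sketch(prediction, Y, W_hat, b_reg=10.0):
    s = T.sum(prediction * Y * W_hat)        # selected true-signal weight
    b = T.sum(prediction * (1 - Y) * W_hat)  # selected background weight
    return T.sqrt(2 * ((s + b + b_reg) * T.log(1 + s / (b + b_reg)) - s))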
print "Compiling functions...", # pretrain = theano.function( # inputs=[idx], # outputs=[recon_loss / (32 * 32)] + latent_kls, # updates=updates.adam( # parameters, # T.grad(pretrain_loss, wrt=parameters), # learning_rate=1e-3 # ), # givens={X: chunk_X[idx * batch_size:(idx + 1) * batch_size]} # ) train = theano.function( inputs=[idx], outputs=[recon_loss / (32 * 32)] + latent_kls, updates=updates.adam(parameters, gradients, learning_rate=1e-4) + [(beta_lin, beta_lin + 1)], givens={X: chunk_X[idx * batch_size:(idx + 1) * batch_size]}) test = theano.function(inputs=[X], outputs=[val_loss, recon_loss / (32 * 32)] + latent_kls) show_betas = theano.function(inputs=[], outputs=betas) print "Done compilation." def data_stream(): stream = data_io.stream_file("data/train2014.pkl.gz") stream = data_io.buffered_random(stream) stream = data_io.chunks((x[0] for x in stream), buffer_items=chunk_size) stream = data_io. async (stream, queue_size=2)
[Z_prior_mean, Z_prior_std, Z_mean, Z_std, X_mean, X_std] = extract(X, l)
parameters = P.values()
batch_cost = model.cost(X, Z_prior_mean, Z_prior_std,
                        Z_mean, Z_std, X_mean, X_std, l)
print "Calculating gradient..."
print parameters
batch_size = T.cast(X.shape[1], 'float32')
gradients = T.grad(batch_cost, wrt=parameters)
gradients = [g / batch_size for g in gradients]
gradients = clip(5, parameters, gradients)
P_learn = Parameters()
updates = updates.adam(parameters, gradients,
                       learning_rate=0.00025, P=P_learn)
updates = normalise_weights(updates)
print "Compiling..."
train = theano.function(
    inputs=[X, l],
    outputs=batch_cost,
    updates=updates,
)
test = theano.function(inputs=[X, l], outputs=batch_cost)
print "Calculating mean variance..."
rand_stream = data_io.random_select_stream(*[
    data_io.stream_file('data/train.%02d.pklgz' % i)
    for i in xrange(1, 20)
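# The clip() call above rescales the averaged gradients before the Adam
# update. A minimal sketch of global-norm clipping with the same call
# signature, assuming the threshold caps the joint L2 norm of the gradients
# (the actual helper may clip per parameter instead):
import theano.tensor as T

def clip_sketch(threshold, parameters, gradients):
    # `parameters` is kept only to mirror the call signature above; it is
    # not needed for norm-based clipping.
    grad_norm = T.sqrt(sum(T.sum(T.sqr(g)) for g in gradients))
    scale = T.minimum(1.0, threshold / (grad_norm + 1e-8))
    return [g * scale for g in gradients]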