def auto4check2(input, dataset):
    a = theano.shared(value=dataset[0], name="a")
    b = theano.shared(value=dataset[1], name="b")
    c = theano.shared(value=dataset[2], name="c")
    x = T.vector('x')

    u = x[0] - 0.8
    v = x[1] - (a[0] + a[1] * u ** 2 * (1 - u) ** 0.5 - a[2] * u)
    alpha = -b[0] + b[1] * u ** 2 * (1 + u) ** 0.5 + b[2] * u
    beta = c[0] * v ** 2 * (1 - c[1] * v) / (1 + c[2] * u ** 2)
    fx = alpha * np.e ** (-beta)

    g_f_x = T.jacobian(fx, x)
    grad = theano.function([x], g_f_x)
    Hessian = theano.function([x], T.hessian(fx, x))
    H_alpha_x = theano.function([x], T.hessian(alpha, x))
    H_beta_x = theano.function([x], T.hessian(beta, x))
    J_f_alpha = theano.function([x], T.grad(fx, alpha))
    J_f_beta = theano.function([x], T.grad(fx, beta))
    J_alpha_x = theano.function([x], T.grad(alpha, x))
    J_beta_x = theano.function([x], T.grad(beta, x))

    J_f_y = [J_f_alpha(input), J_f_beta(input)]
    J_y_x = [J_alpha_x(input), J_beta_x(input)]
    # print "H_alpha_x"
    # print H_alpha_x(input)
    # print "H_beta_x"
    # print H_beta_x(input)
    # print "J_f_y"
    # print J_f_y
    # print "J_y_x"
    # print J_y_x
    # print grad(input)
    return Hessian(input)
def test004_hessian():
    x = tensor.vector()
    y = tensor.sum(x ** 2)
    Hx = tensor.hessian(y, x)
    f = theano.function([x], Hx)
    vx = numpy.arange(10).astype(theano.config.floatX)
    assert numpy.allclose(f(vx), numpy.eye(10) * 2)
def test_DownsampleFactorMax_hessian(self):
    # Example provided by Frans Cronje, see
    # https://groups.google.com/d/msg/theano-users/qpqUy_3glhw/JMwIvlN5wX4J
    x_vec = tensor.vector('x')
    z = tensor.dot(x_vec.dimshuffle(0, 'x'), x_vec.dimshuffle('x', 0))
    y = max_pool_2d(input=z, ds=(2, 2), ignore_border=True)
    C = tensor.exp(tensor.sum(y))
    grad_hess = tensor.hessian(cost=C, wrt=x_vec)
    fn_hess = function(inputs=[x_vec], outputs=grad_hess)
    # The value has been manually computed from the theoretical gradient,
    # and confirmed by the implementation.
    assert numpy.allclose(fn_hess([1, 2]), [[0., 0.], [0., 982.7667]])
def test_DownsampleFactorMax_hessian(self):
    # Example provided by Frans Cronje, see
    # https://groups.google.com/d/msg/theano-users/qpqUy_3glhw/JMwIvlN5wX4J
    x_vec = tensor.vector("x")
    z = tensor.dot(x_vec.dimshuffle(0, "x"), x_vec.dimshuffle("x", 0))
    y = max_pool_2d(input=z, ds=(2, 2))
    C = tensor.exp(tensor.sum(y))
    grad_hess = tensor.hessian(cost=C, wrt=x_vec)
    fn_hess = function(inputs=[x_vec], outputs=grad_hess)
    # The value has been manually computed from the theoretical gradient,
    # and confirmed by the implementation.
    assert numpy.allclose(fn_hess([1, 2]), [[0.0, 0.0], [0.0, 982.7667]])
def hessian_vector(expr, wrt):
    """Computes the Hessian of a vector expression with respect to variables.

    Args:
        expr: Vector Theano tensor expression.
        wrt: List of Theano variables.

    Returns:
        Theano tensor.
    """
    try:
        return _tensor_map(lambda f: hessian_scalar(f, wrt), expr)
    except ValueError:
        # Fallback for wider support.
        return T.stack([T.hessian(expr, wrt, disconnected_inputs="ignore")])
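# A minimal sketch of what the fallback branch above computes, assuming a scalar
# cost and a single wrt variable (an assumption, not part of the original code):
# T.stack adds a leading axis, so the result has shape (1, n, n).
import numpy as np
import theano
import theano.tensor as T

x = T.dvector("x")
cost = T.sum(x ** 2)
H = T.stack([T.hessian(cost, x, disconnected_inputs="ignore")])
f = theano.function([x], H)
assert f(np.arange(3.0)).shape == (1, 3, 3)  # each slice is 2 * identity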
def get_hessians(self, y):
    """Return a list of Hessians with respect to the model parameters.

    Args:
        y (theano.tensor.TensorVariable): corresponds to a vector that gives
            for each example the correct label.

    Returns:
        list(TensorSharedVariable): a list of Hessian matrices.
    """
    hessians = []
    for param in [self.beta_flat, self.asc]:
        cost = self.negative_log_likelihood(y)
        h = T.hessian(cost, param, disconnected_inputs='ignore')
        hessians.append(h)
    return hessians
def auto4check(dataset, x, tol=1e-9, maxiter=1000):
    # Least-squares fit of two rate constants k = (k0, k1) by Newton's method,
    # using Theano to build the symbolic gradient and Hessian of the loss.
    t0 = theano.shared(value=dataset[0], name="t0")
    a0 = theano.shared(value=dataset[1], name="a0")
    b0 = theano.shared(value=dataset[2], name="b0")
    c0 = theano.shared(value=dataset[3], name="c0")
    k = T.vector('k')

    a_t = np.e ** (-(k[0] + k[1]) * t0)
    b_t = k[0] / (k[0] + k[1]) * (1 - a_t)
    c_t = k[1] / (k[0] + k[1]) * (1 - a_t)
    f = T.sum((a0 - a_t) ** 2 + (b0 - b_t) ** 2 + (c0 - c_t) ** 2)

    F = theano.function([k], f)
    g_f_k = T.jacobian(f, k)
    j_f_k = theano.function([k], g_f_k)
    H_f_k = T.hessian(f, k)
    Hessian = theano.function([k], H_f_k)

    track, f_val = [], []
    track.append(array(x))
    f_val.append(F(x))
    g = j_f_k(x)
    i = 0
    print "Step =", i, "g=", g, "x=", x, "loss=", F(x)
    while norm(g) > tol:
        i += 1
        if i > maxiter:
            break
        # Newton step: solve G * s = -g, then update the parameter vector.
        G = Hessian(x)
        s = -np.linalg.solve(G, g)
        x += s
        track.append(array(x))
        f_val.append(F(x))
        g = j_f_k(x)
        print "step =", i, "g=", g, "x=", x, "loss=", F(x), "G=", G
    return x, F(x), track, f_val
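# A minimal call sketch with synthetic data (the array values are assumptions,
# not from the source): dataset packs the time grid and observed a(t), b(t),
# c(t) curves for first-order kinetics with k0 = 0.2, k1 = 0.1; x is the
# initial guess for the two rate constants.
import numpy as np

t = np.linspace(0.0, 10.0, 20)
a_obs = np.exp(-0.3 * t)
b_obs = 0.2 / 0.3 * (1.0 - a_obs)
c_obs = 0.1 / 0.3 * (1.0 - a_obs)
x_opt, loss, track, f_val = auto4check([t, a_obs, b_obs, c_obs],
                                       x=np.array([0.05, 0.05]))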
def get_expr_rff_feature_map_component_third_order_tensor(x, omega, u):
    grad = get_expr_rff_feature_map_component_grad(x, omega, u)
    G3, updates = theano.scan(lambda i, grad, x: T.hessian(grad[i], x),
                              sequences=T.arange(grad.shape[0]),
                              non_sequences=[grad, x])
    return G3, updates
def get_expr_gaussian_kernel_hessian(x, y, sigma):
    return T.hessian(get_expr_gaussian_kernel(x, y, sigma), x)
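# A minimal compilation sketch (assumes get_expr_gaussian_kernel from this
# module builds the symbolic Gaussian kernel value for two vector inputs and a
# scalar bandwidth sigma; the concrete inputs below are illustrative only):
import numpy as np
import theano
import theano.tensor as T

x = T.dvector("x")
y = T.dvector("y")
hess_fn = theano.function([x, y],
                          get_expr_gaussian_kernel_hessian(x, y, sigma=1.0))
hess_fn(np.zeros(2), np.ones(2))  # d x d Hessian of k(x, y) with respect to x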
def run_crbm():
    """ Discrete choice model estimation with Theano

    Setup
    -----
    step 1: Load variables from csv file
    step 2: Define hyperparameters used in the computation
    step 3: define symbolic Theano tensors
    step 4: build model and define cost function
    step 5: define gradient calculation algorithm
    step 6: define Theano symbolic functions
    step 7: run main estimation loop for n iterations
    step 8: perform analytics and model statistics
    """
    # compile and import dataset from csv
    d_x_ng, d_x_g, d_y, avail, d_ind = extractdata(csvString)
    data_x_ng = shared(np.asarray(d_x_ng, dtype=floatX), borrow=True)
    data_x_g = shared(np.asarray(d_x_g, dtype=floatX), borrow=True)
    data_y = T.cast(shared(np.asarray(d_y - 1, dtype=floatX), borrow=True),
                    'int32')
    data_av = shared(np.asarray(avail, dtype=floatX), borrow=True)
    data_ind = shared(np.asarray(d_ind, dtype=floatX), borrow=True)

    sz_n = d_x_g.shape[0]   # number of samples
    sz_k = d_x_g.shape[1]   # number of generic variables
    sz_m = d_x_ng.shape[2]  # number of non-generic variables
    sz_i = d_x_ng.shape[1]  # number of alternatives
    sz_z = d_ind.shape[1]   # number of indicators
    sz_minibatch = sz_n

    # model hyperparameters
    learning_rate = 0.1
    gen_rate = 1.0
    momentum = 0.9
    n_hidden = 3  # latent variable model parameters

    x_ng = T.tensor3('data_x_ng')  # symbolic theano tensors
    x_g = T.matrix('data_x_g')
    y = T.ivector('data_y')
    av = T.matrix('data_av')
    index = T.lscalar('index')
    z = T.matrix('data_ind')

    # construct model
    model = CRBM(sz_i, av,
                 n_in=[(sz_m, ), (sz_k, n_hidden)],
                 n_hid=[(n_hidden, ), (n_hidden, sz_i), (n_hidden, sz_z)],
                 n_ind=(sz_z, ),
                 input=[x_ng, x_g], output=y, inds=z)
    cost, error, chain_end, updates = model.gibbs_sampling(y, x_ng, x_g, av,
                                                           alts=6, steps=25)
    grads = T.grad(cost=cost - model.loglikelihood(y), wrt=model.params,
                   consider_constant=[chain_end])
    cost2 = -(model.loglikelihood(y) + 0.1 * model.cross_entropy(z))
    grads2 = T.grad(cost=cost2, wrt=model.params2)

    opt = optimizers.adadelta(model.params, model.masks, momentum)
    opt2 = optimizers.adadelta(model.params2, model.masks2, momentum)
    # opt = optimizers.sgd(model.params, model.masks)
    updates.update(opt.updates(model.params, grads, learning_rate))
    updates2 = opt2.updates(model.params2, grads2, learning_rate)

    # null loglikelihood function
    fn_null = function(inputs=[],
                       outputs=model.loglikelihood(y),
                       givens={
                           x_ng: data_x_ng,
                           x_g: data_x_g,
                           y: data_y,
                           av: data_av
                       },
                       on_unused_input='ignore')

    # compile the theano functions
    fn_estimate = function(
        name='estimate',
        inputs=[index],
        outputs=[model.loglikelihood(y), cost],
        updates=updates,
        givens={
            x_ng: data_x_ng[index * sz_minibatch:
                            T.min(((index + 1) * sz_minibatch, sz_n))],
            x_g: data_x_g[index * sz_minibatch:
                          T.min(((index + 1) * sz_minibatch, sz_n))],
            y: data_y[index * sz_minibatch:
                      T.min(((index + 1) * sz_minibatch, sz_n))],
            av: data_av[index * sz_minibatch:
                        T.min(((index + 1) * sz_minibatch, sz_n))]
        },
        allow_input_downcast=True,
        on_unused_input='ignore',
    )
    fn_optimize = function(
        name='optimize',
        inputs=[index],
        outputs=[model.loglikelihood(y)],
        updates=updates2,
        givens={
            x_ng: data_x_ng[index * sz_minibatch:
                            T.min(((index + 1) * sz_minibatch, sz_n))],
            x_g: data_x_g[index * sz_minibatch:
                          T.min(((index + 1) * sz_minibatch, sz_n))],
            y: data_y[index * sz_minibatch:
                      T.min(((index + 1) * sz_minibatch, sz_n))],
            av: data_av[index * sz_minibatch:
                        T.min(((index + 1) * sz_minibatch, sz_n))],
            z: data_ind[index * sz_minibatch:
                        T.min(((index + 1) * sz_minibatch, sz_n))]
        },
        allow_input_downcast=True,
        on_unused_input='ignore',
    )
    fn_pred = function(inputs=[],
                       outputs=model.y_pred,
                       givens={
                           x_ng: data_x_ng,
                           x_g: data_x_g,
                           y: data_y,
                           av: data_av
                       },
                       on_unused_input='ignore')

    """ Main estimation process loop """
    print('Begin estimation...')
    epoch = 0

    # process loop parameters
    sz_epoches = 2000
    sz_batches = np.ceil(sz_n / sz_minibatch).astype(np.int32)
    done_looping = False
    patience = 300
    patience_inc = 10
    best_loglikelihood = -np.inf
    null_Loglikelihood = fn_null()
    start_time = timeit.default_timer()

    while epoch < sz_epoches and done_looping is False:
        epoch_cost = []
        epoch_loglikelihood = []
        for i in range(sz_batches):
            (batch_loglikelihood, batch_cost) = fn_estimate(i)
            epoch_cost.append(batch_cost)
            epoch_loglikelihood.append(batch_loglikelihood)
        this_loglikelihood = np.sum(epoch_loglikelihood)
        this_cost = np.sum(epoch_cost)
        print('@ iteration %d/%d loglikelihood: %.3f' %
              (epoch, patience, this_loglikelihood))
        print(' cost %.3f' % this_cost)
        print(fn_pred())
        print(data_y.eval())
        if this_loglikelihood > best_loglikelihood:
            if this_loglikelihood > 0.998 * best_loglikelihood:
                patience += patience_inc
            best_loglikelihood = this_loglikelihood
            best_model = model
        if (epoch > patience
                or this_loglikelihood < 1.01 * best_loglikelihood):
            done_looping = True
        epoch += 1

    epoch = 0
    patience = 900
    done_looping = False
    best_loglikelihood = -np.inf
    # done_looping = True
    while epoch < sz_epoches and done_looping is False:
        epoch_cost = []
        epoch_loglikelihood = []
        for i in range(sz_batches):
            (batch_loglikelihood) = fn_optimize(i)
            epoch_loglikelihood.append(batch_loglikelihood)
        this_loglikelihood = np.sum(epoch_loglikelihood)
        this_cost = np.sum(epoch_cost)
        print('@ iteration %d/%d loglikelihood: %.3f' %
              (epoch, patience, this_loglikelihood))
        print(fn_pred())
        print(data_y.eval())
        if this_loglikelihood > best_loglikelihood:
            if this_loglikelihood > 0.999 * best_loglikelihood:
                patience += patience_inc
            best_loglikelihood = this_loglikelihood
            best_model = model
        if (epoch > patience
                or this_loglikelihood < 1.01 * best_loglikelihood):
            done_looping = True
        epoch += 1

    final_Loglikelihood = best_loglikelihood
    rho_square = 1. - (final_Loglikelihood / null_Loglikelihood)
    with open('best_model.pkl', 'wb') as f:
        pickle.dump(best_model, f)
    end_time = timeit.default_timer()

    """ Analytics and model statistics """
    with open('best_model.pkl', 'rb') as f:
        best_model = pickle.load(f)

    print('... solving Hessians')
    # hessian function
    fn_hessian = function(
        inputs=[best_model.x_ng, best_model.x_g, best_model.av],
        outputs=T.hessian(
            cost=-(best_model.loglikelihood(y) + best_model.cross_entropy(z)),
            wrt=best_model.params2),
        givens={
            y: data_y,
            z: data_ind
        },
        on_unused_input='ignore')
    h = np.hstack([
        np.diagonal(mat)
        for mat in fn_hessian(data_x_ng.eval(), data_x_g.eval(),
                              data_av.eval())
    ])
    n_est_params = np.count_nonzero(h)
    aic = 2 * n_est_params - 2 * final_Loglikelihood
    bic = np.log(sz_n) * n_est_params - 2 * final_Loglikelihood

    print('@iteration %d, run time %.3f ' % (epoch, end_time - start_time))
    print('Null Loglikelihood: %.3f' % null_Loglikelihood)
    print('Final Loglikelihood: %.3f' % final_Loglikelihood)
    print('rho square %.3f' % rho_square)
    print('AIC %.3f' % aic)
    print('BIC %.3f' % bic)
    run_analytics(best_model, h, n_hidden)
def sym_hes(*args, **kwargs):
    return T.hessian(*args, disconnected_inputs='warn', **kwargs)
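# A minimal usage sketch for the wrapper above (the toy loss is illustrative):
# it forwards its arguments to T.hessian with disconnected_inputs preset to
# 'warn'.
import theano
import theano.tensor as T

w = T.dvector("w")
loss = T.sum(w ** 4)
hess_fn = theano.function([w], sym_hes(loss, w))  # evaluates to 12 * diag(w**2)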
def get_generative_cost_updates(self, k=1, lr=1e-3):
    """
    get_generative_cost_updates func updates weights for W^(1), W^(2), a, c and d
    """
    # prepare visible samples from x input and y outputs
    v0_samples = self.input + self.output
    labels = self.label

    # perform positive Gibbs sampling phase
    # one step Gibbs sampling p(h|v1,v2,...) = p(h|v1)+p(h|v2)+...
    h1_pre, h1_means, h1_samples = self.sample_h_given_v(v0_samples)

    # start of Gibbs sampling chain
    # we only want the samples generated from the Gibbs sampling phase
    chain_start = h1_samples
    scan_out = 3 * len(v0_samples) * [None] + [None, None, chain_start]

    # theano scan function to loop over all Gibbs steps k
    # [v1_pre[], v1_means[], v1_samples[], h1_pre, h1_means, h1_samples]
    # outputs are given by outputs_info
    # [[t,t+1,t+2,...], [t,t+1,t+2,...], ], gibbs_updates
    # NOTE: scan returns a dictionary of updates
    gibbs_output, gibbs_updates = theano.scan(fn=self.gibbs_hvh,
                                              outputs_info=scan_out,
                                              n_steps=k,
                                              name='gibbs_hvh')

    # note that we only need the visible samples at the end of the chain
    chain_end = []
    a = self.hyperparameters['alpha']
    for output in gibbs_output:
        chain_end.append(output[-1])
    gibbs_pre = chain_end[:len(v0_samples)]
    gibbs_means = chain_end[len(v0_samples):2 * len(v0_samples)]
    gibbs_samples = chain_end[2 * len(v0_samples):3 * len(v0_samples)]

    # calculate the model cost
    ginitial_cost = self.free_energy(self.input)
    gfinal_cost = self.free_energy(gibbs_samples[:len(self.input)])
    gcost = a * (T.mean(ginitial_cost) - T.mean(gfinal_cost))

    dinitial_cost = self.discriminative_free_energy()
    dfinal_cost = self.discriminative_free_energy(gibbs_samples)
    dgcost = T.mean(dinitial_cost) - T.mean(dfinal_cost)

    g_params = self.vbias_f + self.V_params_f + self.hbias + self.vsigmas_f
    dg_params = self.B_params_f + self.U_params_f + self.cbias_f
    dg_masks = self.B_params_m + self.U_params_m + self.cbias_m

    # conditional probability
    dcost = 0.
    sigmas = []
    for i, (logit, label) in enumerate(zip(dinitial_cost, labels)):
        p_y_given_x = T.nnet.softmax(logit)
        dcost += Metric.loglikelihood(p_y_given_x, label)
        pred = T.argmax(p_y_given_x, axis=-1)
        errors = T.neq(pred, label)

        # calculate the Hessians
        hessians = T.hessian(cost=Metric.loglikelihood(p_y_given_x, label),
                             wrt=dg_params,
                             disconnected_inputs='ignore')
        sigma = [T.sqrt(s) for s in [T.diag(2. / h) for h in hessians]]
        sigmas.extend(sigma)

    # calculate the gradients
    g_grads = T.grad(cost=gcost,
                     wrt=g_params,
                     consider_constant=gibbs_samples,
                     disconnected_inputs='ignore')
    dg_grads = T.grad(cost=dgcost + dcost,
                      wrt=dg_params,
                      consider_constant=gibbs_samples,
                      disconnected_inputs='ignore')
    for i, m in enumerate(dg_masks):
        dg_grads[i] = dg_grads[i] * m

    # update Gibbs chain with update expressions from updates list[]
    g_updates = self.update_opt(g_params, g_grads, lr)
    dg_updates = self.update_opt(dg_params, dg_grads, lr)
    for variable, expression in g_updates:
        gibbs_updates[variable] = expression
    for variable, expression in dg_updates:
        gibbs_updates[variable] = expression

    # pseudo loglikelihood to track the quality of the hidden units
    # on input variables ONLY
    monitoring_cost = self.pseudo_loglikelihood(
        inputs=self.input, preactivation=gibbs_pre[:len(self.input)])

    return monitoring_cost, dcost, errors, gibbs_updates, [
        ginitial_cost, gfinal_cost
    ], [dinitial_cost, dfinal_cost], sigmas
def get_expr_rff_feature_map_component_hessian(x, omega, u):
    expr = get_expr_rff_feature_map_component(x, omega, u)
    return T.hessian(expr, x)
def get_expr_gaussian_kernel_third_order_tensor(x, y, sigma):
    grad = get_expr_gaussian_kernel_grad(x, y, sigma)
    G3, updates = theano.scan(lambda i, grad, x: T.hessian(grad[i], x),
                              sequences=T.arange(grad.shape[0]),
                              non_sequences=[grad, x])
    return G3, updates
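# A minimal compilation sketch for the scan-based third-order tensor above
# (assumes get_expr_gaussian_kernel_grad from this module returns the kernel
# gradient with respect to x; concrete inputs are illustrative). The scan
# updates are passed through to theano.function:
import numpy as np
import theano
import theano.tensor as T

x = T.dvector("x")
y = T.dvector("y")
G3, updates = get_expr_gaussian_kernel_third_order_tensor(x, y, sigma=1.0)
g3_fn = theano.function([x, y], G3, updates=updates)
g3_fn(np.zeros(2), np.ones(2))  # shape (d, d, d): Hessian of each gradient entry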
def main(data):
    # optimizer
    opt = Optimizers()

    # sampler
    theano_rng = RandomStreams(999)

    # import dataset
    n_samples = data.attrs['n_rows']
    lr = 1e-3
    batch_size = 128
    x_data = [
        data['purpose'], data['avg_speed'], data['duration'], data['trip_km'],
        data['n_coord'], data['interval'], data['dow'], data['startdistrict'],
        data['enddistrict']
    ]
    y_data = [data['mode']]

    params = OrderedDict()
    params_shp = OrderedDict()
    output = []
    input = []
    asc_params = []
    asc_params_m = []
    beta_params_f = []
    beta_params_s = []
    beta_params_sf = []
    beta_params = []
    beta_params_m = []

    for var in y_data:
        name = 'asc_' + var.name.strip('/')
        asc_shp = var['data'][:].squeeze().shape[1:]
        print('y', name, asc_shp)
        output.append(init_tensor((), name))
        mask = np.ones(asc_shp, DTYPE_FLOATX)
        mask[-1] = 0.
        asc_value = np.zeros(asc_shp, DTYPE_FLOATX) * mask
        asc_params.append(shared(asc_value, name))
        asc_params_m.append(shared(mask, name + '_mask'))
        params[name] = asc_params[-1]
        params_shp[name] = asc_shp

    for var in x_data:
        name = 'beta_' + var.name.strip('/')
        shp = var['data'].shape[1:] + asc_shp
        print('x', name, shp)
        input.append(init_tensor(var['data'].shape[1:], name))
        mask = np.ones(shp, DTYPE_FLOATX)
        mask[..., -1] = 0.
        mask = mask.flatten()
        beta_value = np.zeros(np.prod(shp), DTYPE_FLOATX) * mask
        sigma_value = np.ones(np.prod(shp), DTYPE_FLOATX) * mask
        beta_params_f.append(shared(beta_value, name))
        beta_params_sf.append(shared(sigma_value, name + '_sigma'))
        beta_params.append(T.reshape(beta_params_f[-1], shp))
        beta_params_s.append(T.reshape(beta_params_sf[-1], shp))
        beta_params_m.append(shared(mask, name + '_mask'))
        params[name] = beta_params_f[-1]
        params[name + '_sigma'] = beta_params_sf[-1]
        params_shp[name] = shp
        params_shp[name + '_sigma'] = shp

    # compute the utility function
    utility = 0.
    h_utility = 0.
    for x, b, s in zip(input, beta_params, beta_params_s):
        normal_sample = b[..., None] + T.sqr(s)[..., None] * theano_rng.normal(
            size=b.eval().shape + (1, ), avg=0., std=1., dtype=DTYPE_FLOATX)
        ax = [np.arange(x.ndim)[1:], np.arange(b.ndim)[:-1]]
        utility += T.tensordot(x, normal_sample, axes=ax)
        if x.ndim > 2:
            h_utility += T.tensordot(x, b + T.sqr(s), axes=[[1, 2], [0, 1]])
        else:
            h_utility += T.tensordot(x, b + T.sqr(s), axes=[[1], [0]])

    for y, asc in zip(output, asc_params):
        utility += asc[None, ..., None]
        h_utility += asc

    (d1, d2, d3) = utility.shape
    utility = utility.reshape((d1 * d3, d2))
    p_y_given_x = T.nnet.softmax(utility)
    hessian_prob = T.nnet.softmax(h_utility)  #!
    hessian_nll = T.log(hessian_prob)
    hessian_cr = hessian_nll[T.arange(y.shape[0]), y]
    hessian_cost = -T.sum(hessian_cr)

    nll = T.log(p_y_given_x).reshape((d3, d1, d2))
    nll = nll[:, T.arange(y.shape[0]), y]
    cost = -T.sum(T.mean(nll, axis=0))

    gparams = asc_params + beta_params_f + beta_params_sf
    grads = T.grad(cost, gparams)

    # mask gradient updates
    mask = asc_params_m + beta_params_m + beta_params_m
    for j, g in enumerate(grads):
        grads[j] = g * mask[j]

    # create list of updates to iterate over
    updates = opt.sgd_updates(gparams, grads, lr)

    # symbolic equation for the Hessian function
    stderrs = []
    hessian = T.hessian(cost=hessian_cost, wrt=gparams)
    stderr = [T.sqrt(f) for f in [T.diag(2. / h) for h in hessian]]
    stderrs.extend(stderr)

    tensors = input + output
    shared_x = [shared(var['data'][:], borrow=True) for var in x_data]
    shared_y = [T.cast(shared(var['label'][:]), 'int32') for var in y_data]
    shared_variables = shared_x + shared_y

    i = T.lscalar('index')
    start_idx = i * batch_size
    end_idx = (i + 1) * batch_size

    print('constructing Theano computational graph...')
    train = theano.function(
        inputs=[i],
        outputs=cost,
        updates=updates,
        givens={
            key: val[start_idx:end_idx]
            for key, val in zip(tensors, shared_variables)
        },
        name='train',
        allow_input_downcast=True,
    )
    std_err = theano.function(
        inputs=[],
        outputs=stderrs,
        givens={key: val[:] for key, val in zip(tensors, shared_variables)},
        name='std errors',
        allow_input_downcast=True,
    )

    # train model
    print('training the model...')
    curves = []
    n_batches = n_samples // batch_size
    epochs = 100
    epoch = 0
    t0 = time.time()
    while epoch < epochs:
        epoch += 1
        cost = []
        for i in range(n_batches):
            cost_items = train(i)
            cost.append(cost_items)
        epoch_cost = np.sum(cost)
        curves.append((epoch, epoch_cost))
        minutes, seconds = divmod(time.time() - t0, 60.)
        hours, minutes = divmod(minutes, 60.)
        print(("epoch {0:d} loglikelihood "
               "{1:.3f} time {hh:02d}:{mm:02d}:{ss:05.2f}").format(
                   epoch, epoch_cost,
                   hh=int(hours), mm=int(minutes), ss=seconds))
        if (epoch % 5) == 0:
            print('checkpoint')
            param_values = {}
            for name, param in params.items():
                param_shp = params_shp[name]
                param_values[name] = param.eval().reshape(param_shp)
                np.savetxt('params/{}.csv'.format(name),
                           param_values[name].squeeze(),
                           fmt='%.3f',
                           delimiter=',')
            to_file = param_values, curves
            path = 'params/epoch_{0:d}.params'.format(epoch)
            with open(path, 'wb') as f:
                pickle.dump(to_file, f, protocol=pickle.HIGHEST_PROTOCOL)

    # save parameters and stderrs to .csv
    stderrs = std_err()
    params_list = [p for p in asc_params + beta_params_f + beta_params_sf]
    param_names = [p.name for p in asc_params + beta_params_f + beta_params_sf]
    for se, param, name in zip(stderrs, params_list, param_names):
        v = param.eval().squeeze()
        shp = v.shape
        path = 'params/stderrs_{}.csv'.format(name)
        np.savetxt(path, se.reshape(shp), fmt='%.3f', delimiter=',')
        path = 'params/tstat_{}.csv'.format(name)
        np.savetxt(path, v / se.reshape(shp), fmt='%.3f', delimiter=',')
def test_mlp(learning_rate=0.1, L1_reg=0.01, L2_reg=0.0001, n_epochs=1000,
             batch_size=200, n_hidden=10, n_in=40, n_out=6):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                    http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
    """
    # datasets = load_data(dataset)
    train_set, test_set, valid_set = load_from_file("processed_dataset.pkl")
    # train_set, test_set, valid_set = prepareData.get_data()
    # temp1, temp2 = test_set
    # print temp1.shape

    test_set_x, test_set_y = shared_dataset(test_set)
    valid_set_x, valid_set_y = shared_dataset(valid_set)
    train_set_x, train_set_y = shared_dataset(train_set)

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.ivector('y')   # the labels are presented as 1D vector of [int] labels

    rng = np.random.RandomState(1234)

    # construct the MLP class
    classifier = MLP(
        rng=rng,
        input=x,
        n_in=n_in,
        n_hidden=n_hidden,
        n_out=n_out
    )

    # start-snippet-4
    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = (
        classifier.negative_log_likelihood(y)
        + L1_reg * classifier.L1
        + L2_reg * classifier.L2_sqr
    )
    # end-snippet-4

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        }
    )

    # start-snippet-5
    # compute the gradient of cost with respect to theta (stored in params)
    # the resulting gradients will be stored in a list gparams
    gparams = [T.grad(cost, param) for param in classifier.params]

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs
    # given two lists A = [a1, a2, a3, a4] and B = [b1, b2, b3, b4] of the
    # same length, zip generates a list C of the same size, where each
    # element is a pair formed from the two lists:
    # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    updates = [
        (param, param - learning_rate * gparam)
        for param, gparam in zip(classifier.params, gparams)
    ]

    # compiling a Theano function `train_model` that returns the cost, but
    # at the same time updates the parameters of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # end-snippet-5

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many minibatches before
                                  # checking the network on the validation
                                  # set; in this case we check every epoch

    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = theano.shared(train_model(minibatch_index))
            test_W_flat = theano.shared(
                classifier.hiddenLayer.W.get_value().flatten())
            w1 = test_W_flat.reshape((40, 10))
            test = theano.shared(
                classifier.hiddenLayer.W.get_value().flatten())
            hessianMatrix = T.hessian(cost=minibatch_avg_cost, wrt=test)
            f = theano.function(inputs=[], outputs=hessianMatrix)
            print f()
            pause()

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i)
                                     for i in xrange(n_valid_batches)]
                this_validation_loss = np.mean(validation_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if (
                        this_validation_loss <
                        best_validation_loss * improvement_threshold
                    ):
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    best_weights = classifier.hiddenLayer.W.get_value()

                    # test it on the test set
                    test_losses = [test_model(i)
                                   for i in xrange(n_test_batches)]
                    test_score = np.mean(test_losses)

                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print("Final weights of the hidden layer:")
    print(best_weights)
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
    return best_validation_loss, best_iter, best_weights
def run_mxl():
    """ Discrete choice model estimation by Mixed Logit (MxL) formulation
    with Theano

    Setup
    -----
    step 1: Load variables from csv file
    step 2: Define hyperparameters used in the computation
    step 3: define symbolic Theano tensors
    step 4: build model and define cost function
    step 5: define gradient calculation algorithm
    step 6: define Theano symbolic functions
    step 7: run main estimation loop for n iterations
    step 8: perform analytics and model statistics
    """
    # compile and import dataset from csv
    d_x_ng, d_x_g, d_y, avail, d_ind = extractdata(csvString)
    data_x_ng = shared(np.asarray(d_x_ng, dtype=floatX), borrow=True)
    data_x_g = shared(np.asarray(d_x_g, dtype=floatX), borrow=True)
    data_y = T.cast(shared(np.asarray(d_y - 1, dtype=floatX), borrow=True),
                    'int32')
    data_av = shared(np.asarray(avail, dtype=floatX), borrow=True)

    sz_n = d_x_g.shape[0]   # number of samples
    sz_k = d_x_g.shape[1]   # number of generic variables
    sz_m = d_x_ng.shape[2]  # number of non-generic variables
    sz_i = d_x_ng.shape[1]  # number of alternatives
    sz_minibatch = sz_n

    # model hyperparameters
    sz_draw = 50
    learning_rate = 0.3
    momentum = 0.9
    srng = RandomStreams(1234)

    # random draws
    rng = srng.normal((sz_n, sz_draw, sz_m))

    x_ng = T.tensor3('data_x_ng')  # symbolic theano tensors
    x_g = T.matrix('data_x_g')
    y = T.ivector('data_y')
    av = T.matrix('data_av')
    index = T.lscalar('index')
    draws = T.tensor3('normal_draws')

    # construct model
    model = MixedLogit(sz_i, av, input=[x_ng, x_g],
                       n_in=[(sz_m), (sz_k, sz_i)], draws=draws)
    cost = -model.loglikelihood(y)

    # calculate the gradients with respect to the loss function
    grads = T.grad(cost=cost, wrt=model.params)
    opt = optimizers.adadelta(model.params, model.masks, momentum)
    updates = opt.updates(model.params, grads, learning_rate)

    # returns the distribution of the draws at iteration
    fn_checkdraw = function(inputs=[], outputs=model.draws,
                            givens={draws: rng})

    # hessian function
    fn_hessian = function(inputs=[],
                          outputs=T.hessian(cost=cost, wrt=model.params),
                          givens={
                              x_ng: data_x_ng,
                              x_g: data_x_g,
                              y: data_y,
                              av: data_av,
                              draws: rng
                          },
                          on_unused_input='ignore')

    # null loglikelihood function
    fn_null = function(inputs=[],
                       outputs=model.loglikelihood(y),
                       givens={
                           x_ng: data_x_ng,
                           x_g: data_x_g,
                           y: data_y,
                           av: data_av,
                           draws: rng
                       },
                       on_unused_input='ignore')

    # compile the theano functions
    fn_estimate = function(
        name='estimate',
        inputs=[index],
        outputs=[model.loglikelihood(y), model.errors(y)],
        updates=updates,
        givens={
            x_ng: data_x_ng[index * sz_minibatch:
                            T.min(((index + 1) * sz_minibatch, sz_n))],
            x_g: data_x_g[index * sz_minibatch:
                          T.min(((index + 1) * sz_minibatch, sz_n))],
            y: data_y[index * sz_minibatch:
                      T.min(((index + 1) * sz_minibatch, sz_n))],
            av: data_av[index * sz_minibatch:
                        T.min(((index + 1) * sz_minibatch, sz_n))],
            draws: rng[index * sz_minibatch:
                       T.min(((index + 1) * sz_minibatch, sz_n))]
        },
        allow_input_downcast=True,
        on_unused_input='ignore',
    )

    """ Main estimation process loop """
    print('Begin estimation...')
    epoch = 0

    # process loop parameters
    sz_epoches = 9999
    sz_batches = np.ceil(sz_n / sz_minibatch).astype(np.int32)
    done_looping = False
    patience = 300
    patience_inc = 10
    best_loglikelihood = -np.inf
    null_Loglikelihood = fn_null()
    start_time = timeit.default_timer()

    while epoch < sz_epoches and done_looping is False:
        epoch_error = []
        epoch_loglikelihood = []
        for i in range(sz_batches):
            (batch_loglikelihood, batch_error) = fn_estimate(i)
            epoch_error.append(batch_error)
            epoch_loglikelihood.append(batch_loglikelihood)
        this_loglikelihood = np.sum(epoch_loglikelihood)
        print('@ iteration %d loglikelihood: %.3f' %
              (epoch, this_loglikelihood))
        if this_loglikelihood > best_loglikelihood:
            if this_loglikelihood > 0.997 * best_loglikelihood:
                patience += patience_inc
            best_loglikelihood = this_loglikelihood
            with open('best_model.pkl', 'wb') as f:
                pickle.dump(model, f)
        if epoch > patience:
            done_looping = True
        epoch += 1

    # final loglikelihood and fit statistics (as in run_crbm above)
    final_Loglikelihood = best_loglikelihood
    rho_square = 1. - (final_Loglikelihood / null_Loglikelihood)
    end_time = timeit.default_timer()

    """ Analytics and model statistics """
    print('... solving Hessians')
    h = np.hstack([np.diagonal(mat) for mat in fn_hessian()])
    n_est_params = np.count_nonzero(h)
    aic = 2 * n_est_params - 2 * final_Loglikelihood
    bic = np.log(sz_n) * n_est_params - 2 * final_Loglikelihood

    print('@iteration %d, run time %.3f ' % (epoch, end_time - start_time))
    print('Null Loglikelihood: %.3f' % null_Loglikelihood)
    print('Final Loglikelihood: %.3f' % final_Loglikelihood)
    print('rho square %.3f' % rho_square)
    print('AIC %.3f' % aic)
    print('BIC %.3f' % bic)

    with open('best_model.pkl', 'rb') as f:
        best_model = pickle.load(f)
    run_analytics(best_model, h)