def __init__(self, layers, cost, x, shared_input, x_mask=None, shared_mask=None,
             batch_size=64, learning_rate=0.05, momentum=0.9, weight_decay=0.0002):
    self.i, self.bs = T.iscalars('i', 'bs')
    if type(layers) == list:
        self.layers = layers
    else:
        self.layers = [layers]
    self.cost = cost
    self.x = x
    self.x_mask = x_mask
    self.shared_input = shared_input
    self.shared_mask = shared_mask
    self.batch_size = batch_size
    self.momentum = momentum
    self.weight_decay = weight_decay
    self.learning_rate = learning_rate
    self.params = [param for layer in self.layers for param in layer.parameters]
    self.gradients = T.grad(self.cost, self.params)
    self.lr = T.dscalar('lr')
    if momentum:
        self.momentums = {param: theano.shared(np.zeros(param.get_value().shape))
                          for param in self.params}
    self.steps = 0
    self.costs = [(0, 999999)]
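# --- Illustration (not from the original class): one way the fields stored above
# --- (gradients, momentum shareds, lr, weight_decay) could be turned into Theano
# --- update pairs for SGD with momentum and L2 weight decay. This is a hedged
# --- sketch; the original training step may be built differently.
def sgd_momentum_updates(params, gradients, momentums, lr, momentum, weight_decay):
    updates = []
    for param, grad in zip(params, gradients):
        velocity = momentums[param]  # shared variable holding the previous step
        step = momentum * velocity - lr * (grad + weight_decay * param)
        updates.append((velocity, step))
        updates.append((param, param + step))
    return updates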
def get_testing_function(self, test_data, test_mask, pct_blackout=0.5):
    raise NotImplementedError("fix me!")
    i, batch_size = T.iscalars('i', 'batch_size')
    self.test_noise = T.shared_randomstreams.RandomStreams(1234).binomial(
        (self.inputs.shape), n=1, p=1 - pct_blackout,
        dtype=theano.config.floatX)
    self.test_noisy = self.test_noise * self.inputs
    self.test_active_hidden = T.nnet.sigmoid(T.dot(self.test_noisy, self.W) + self.b_in)
    self.test_output = T.nnet.sigmoid(T.dot(self.test_active_hidden, self.W.T) + self.b_out)
    # Root mean squared error of the unknowns only:
    # take the original input vector's mask of which beers had no rating, and
    # mask out any predicted ratings where the original beer had no rating,
    # so dimensions with no meaningful input information do not affect the error.
    # flattened output = (mask of items withheld from the network, so we only test
    # accuracy on non-inputted answers) . (input mask . full output vector)
    self.only_originally_unknown = T.dot(1 - self.test_noise,
                                         T.dot(self.inputs_mask, self.test_output))
    self.test_error = T.pow(
        T.mean(T.pow(T.dot(self.inputs_mask, self.test_output) - self.inputs, 2)), 0.5)
    self.testing_function = theano.function(
        [i, batch_size], self.test_error,
        givens={self.inputs: test_data[i:i + batch_size],
                self.inputs_mask: test_mask[i:i + batch_size]})
    return self.testing_function
def test_equal_computations():
    a, b = tensor.iscalars(2)

    with pytest.raises(ValueError):
        equal_computations([a], [a, b])

    assert equal_computations([a], [a])
    assert equal_computations([tensor.as_tensor(1)], [tensor.as_tensor(1)])
    assert not equal_computations([b], [a])
    assert not equal_computations([tensor.as_tensor(1)], [tensor.as_tensor(2)])

    assert equal_computations([2], [2])
    assert equal_computations([np.r_[2, 1]], [np.r_[2, 1]])
    assert equal_computations([np.r_[2, 1]], [tensor.as_tensor(np.r_[2, 1])])
    assert equal_computations([tensor.as_tensor(np.r_[2, 1])], [np.r_[2, 1]])

    assert not equal_computations([2], [a])
    assert not equal_computations([np.r_[2, 1]], [a])
    assert not equal_computations([a], [2])
    assert not equal_computations([a], [np.r_[2, 1]])

    c = tensor.type_other.NoneConst
    assert equal_computations([c], [c])

    m = tensor.matrix()
    max_argmax1 = tensor.max_and_argmax(m)
    max_argmax2 = tensor.max_and_argmax(m)
    assert equal_computations(max_argmax1, max_argmax2)
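# --- Illustration (not part of the original test): equal_computations compares graphs
# --- structurally, so two independently built but identical expressions compare equal.
x = tensor.iscalar('x')
assert equal_computations([x + 1], [x + 1])
assert not equal_computations([x + 1], [x + 2])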
def extract_feature_from_data_provider(self, data_provider, feature_layer_name,
                                       train_mean=None, batch_mean_subtraction=False,
                                       niter=1, noiseless=False):
    assert isinstance(data_provider, LabeledDataProvider), (
        'data_provider needs to be a subclass of LabeledDataProvider'
        ' so that it provides labeled data for supervised models')
    assert feature_layer_name in self.name_index_dic, (
        'need to provide a feature_layer_name that is in the current network structure')

    layer_outputs = self.network_fprop(isTest=True, noiseless=noiseless)
    # assumes the output is always the last layer of the network for now
    final_output = layer_outputs[feature_layer_name]

    self.shared_train, _, _, _ = data_provider.get_train_labeled_data_and_idx(0)
    start_index, end_index = T.iscalars('s_i', 'e_i')
    xgiven = {}
    for i in xrange(self.ninputs):
        xgiven[self.xs[i]] = self.shared_train[i][start_index:end_index]
    if self.shared_train[0].dtype == 'uint8':
        for i in xrange(self.ninputs):
            xgiven[self.xs[i]] = T.cast(xgiven[self.xs[i]], dtype='float32')
    if train_mean is not None and batch_mean_subtraction:
        tm = [None] * self.ninputs
        for i in xrange(self.ninputs):
            tm[i] = theano.shared(numpy.asarray(train_mean[i], dtype='float32'))
            xgiven[self.xs[i]] -= tm[i]

    self.__predict = theano.function([start_index, end_index], final_output,
                                     givens=xgiven)

    ndata = data_provider.get_number_of_train_data()
    prediction = numpy.zeros(
        (ndata,) + self.layers[self.name_index_dic[feature_layer_name]].getOutputShape()[1:],
        dtype='float32')
    for minibatch_idx in xrange(data_provider.get_number_of_train_batches()):
        self.shared_train, _, s_i, e_i = data_provider.get_train_labeled_data_and_idx(minibatch_idx)
        pred_start = minibatch_idx * self.batch_size
        pred_end = (minibatch_idx + 1) * self.batch_size
        if pred_end > ndata:
            pred_start = ndata - self.batch_size
            pred_end = ndata
        for j in xrange(niter):
            if j == 0:
                p = self.__predict(s_i, e_i)
            else:
                p += self.__predict(s_i, e_i)
        prediction[pred_start:pred_end] = p / float(niter)
    return prediction
def reconstruct_from_data_provider(self, data_provider, train_mean=None,
                                   batch_mean_subtraction=False, steps=1, noiseless=True):
    assert isinstance(data_provider, UnlabeledDataProvider), (
        'data_provider needs to be a subclass of UnlabeledDataProvider'
        ' so that it provides appropriate data for unsupervised models')
    assert (steps == 1) == noiseless, (
        'need noise to simulate a generalized denoising autoencoder; '
        'in the noiseless case there is no need to take multiple steps '
        'in reconstruction')

    enc_outputs = self.network_fprop(self.enc_layers, self.x, isTest=True, noiseless=noiseless)
    dec_outputs = self.network_fprop(self.dec_layers, enc_outputs[self.encoder_ns[-1]['name']],
                                     isTest=True, noiseless=noiseless)
    # the reconstruction is always the last layer of the network
    x_hat_given_x = dec_outputs[self.decoder_ns[-1]['name']]

    self.shared_train, _, _ = data_provider.get_train_data_and_idx(0)
    start_index, end_index = T.iscalars('s_i', 'e_i')
    xgiven = self.shared_train[start_index:end_index]
    if self.shared_train.dtype == 'uint8':
        xgiven = T.cast(xgiven, dtype='float32')
    if train_mean is not None and batch_mean_subtraction:
        tm = theano.shared(numpy.asarray(train_mean, dtype='float32'))
        xgiven -= tm

    reconstruct_dp = theano.function([start_index, end_index], x_hat_given_x,
                                     givens={self.x: xgiven})
    reconstruct_mem = theano.function([self.x], x_hat_given_x)

    ndata = data_provider.get_number_of_train_data()
    recs = numpy.zeros((ndata, self.shared_train.get_value().shape[1]), dtype='float32')
    for minibatch_idx in xrange(data_provider.get_number_of_train_batches()):
        self.shared_train, s_i, e_i = data_provider.get_train_data_and_idx(minibatch_idx)
        pred_start = minibatch_idx * self.batch_size
        pred_end = (minibatch_idx + 1) * self.batch_size
        if pred_end > ndata:
            pred_start = ndata - self.batch_size
            pred_end = ndata
        for j in xrange(steps):
            if j == 0:
                p = reconstruct_dp(s_i, e_i)
            else:
                p = reconstruct_mem(p)
        recs[pred_start:pred_end] = p
    return recs
def __init__(self, input_tensor, n_in, n_hidden, learning_rate, pct_blackout=0.2,
             W=None, b_in=None, b_out=None):
    if W is None:
        # initialization of weights as suggested in the Theano tutorials
        W = np.asarray(np.random.uniform(
            low=-4 * np.sqrt(6. / (n_hidden + n_in)),
            high=4 * np.sqrt(6. / (n_hidden + n_in)),
            size=(n_in, n_hidden)), dtype=theano.config.floatX)
    self.W = theano.shared(W, 'W')

    if b_in is None:
        self.b_in = theano.shared(np.zeros(n_hidden, dtype=theano.config.floatX), 'b_in')
    else:
        self.b_in = theano.shared(b_in, 'b_in')

    if b_out is None:
        self.b_out = theano.shared(np.zeros(n_in, dtype=theano.config.floatX), 'b_out')
    else:
        self.b_out = theano.shared(b_out, 'b_out')

    matrixType = T.TensorType(theano.config.floatX, (False,) * 2)

    self.n_in = n_in
    self.n_hidden = n_hidden
    self.inputs = input_tensor
    self.x = matrixType('x')

    # randomly black out a fraction of the inputs (denoising corruption)
    self.pct_blackout = pct_blackout
    self.noise = T.shared_randomstreams.RandomStreams(1234).binomial(
        (self.x.shape), n=1, p=1 - (self.pct_blackout),
        dtype=theano.config.floatX)
    self.noisy = self.noise * self.x

    # encode with W, decode with tied weights W.T
    self.active_hidden = T.nnet.sigmoid(T.dot(self.noisy, self.W) + self.b_in)
    self.output = T.nnet.sigmoid(T.dot(self.active_hidden, self.W.T) + self.b_out)

    # cross-entropy reconstruction cost against the uncorrupted input
    self.entropy = -T.sum(self.x * T.log(self.output)
                          + (1 - self.x) * T.log(1 - self.output), axis=1)
    self.cost = T.mean(self.entropy)

    self.params = [self.W, self.b_in, self.b_out]
    self.gradients = T.grad(self.cost, self.params)
    self.learning_rate = learning_rate

    self.updates = []
    for param, grad in zip(self.params, self.gradients):
        self.updates.append((param, param - self.learning_rate * grad))

    i, batch_size = T.iscalars('i', 'batch_size')
    self.train_step = theano.function([i, batch_size], self.cost,
                                      updates=self.updates,
                                      givens={self.x: self.inputs[i:i + batch_size]})
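# --- Illustration (not from the original source): minimal usage of the constructor
# --- above. The class name DenoisingAutoencoder and the data shapes are assumptions;
# --- the only facts used are the constructor signature and train_step(i, batch_size).
import numpy as np
import theano

data = np.random.rand(1000, 200).astype(theano.config.floatX)
shared_data = theano.shared(data, 'inputs')

da = DenoisingAutoencoder(shared_data, n_in=200, n_hidden=50,
                          learning_rate=0.1, pct_blackout=0.2)

batch_size = 64
for epoch in range(10):
    costs = []
    for start in range(0, data.shape[0] - batch_size + 1, batch_size):
        # train_step takes (start index, batch size) and returns the minibatch cost
        costs.append(da.train_step(start, batch_size))
    print('epoch %d, mean cost %.4f' % (epoch, float(np.mean(costs))))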
def __init__(self, model):
    self.model = model

    a = T.scalars('a')
    b = T.scalars('b')
    c = T.iscalars('c')
    d = T.iscalars('d')
    state = T.vector('state')
    action = T.vector('action')
    o = T.vector('o')

    z_switch = T.switch(T.lt(a, b), c, d)
    _predict_obs = self.model.tf_predict_guide(state.reshape((1, -1)),
                                               action.reshape((1, -1)))
    self._update_state = self.model.tf_update_state(state, o, action)

    self.f_switch = theano.function([a, b, c, d], z_switch,
                                    mode=theano.Mode(linker='vm'))
    self.predict_obs = theano.function(inputs=[state, action],
                                       outputs=_predict_obs,
                                       on_unused_input='ignore')
    self.update_state = theano.function(inputs=[state, o, action],
                                        outputs=self._update_state)
def extract_feature_from_data_provider(self, data_provider, feature_layer_name,
                                       train_mean=None, batch_mean_subtraction=False,
                                       niter=1, noiseless=False):
    assert isinstance(data_provider, UnlabeledDataProvider), (
        'data_provider needs to be a subclass of UnlabeledDataProvider'
        ' so that it provides appropriate data for unsupervised models')
    assert feature_layer_name in self.ae_name_index_dic, (
        'need to provide a feature_layer_name that is in the current network structure')

    layer_outputs = self.network_fprop(self.enc_layers, self.x, isTest=True, noiseless=noiseless)
    h_given_x = layer_outputs[feature_layer_name]

    self.shared_train, _, _ = data_provider.get_train_data_and_idx(0)
    start_index, end_index = T.iscalars('s_i', 'e_i')
    xgiven = self.shared_train[start_index:end_index]
    if self.shared_train.dtype == 'uint8':
        xgiven = T.cast(xgiven, dtype='float32')
    if train_mean is not None and batch_mean_subtraction:
        tm = theano.shared(numpy.asarray(train_mean, dtype='float32'))
        xgiven -= tm

    extract_feature = theano.function([start_index, end_index], h_given_x,
                                      givens={self.x: xgiven})

    ndata = data_provider.get_number_of_train_data()
    features = numpy.zeros(
        (ndata,) + self.layers[self.ae_name_index_dic[feature_layer_name]].getOutputShape()[1:],
        dtype='float32')
    for minibatch_idx in xrange(data_provider.get_number_of_train_batches()):
        self.shared_train, s_i, e_i = data_provider.get_train_data_and_idx(minibatch_idx)
        pred_start = minibatch_idx * self.batch_size
        pred_end = (minibatch_idx + 1) * self.batch_size
        if pred_end > ndata:
            pred_start = ndata - self.batch_size
            pred_end = ndata
        for j in xrange(niter):
            if j == 0:
                p = extract_feature(s_i, e_i)
            else:
                p += extract_feature(s_i, e_i)
        features[pred_start:pred_end] = p / float(niter)
    return features
def compile_functions(self, opt, noiseless_validation=True, **args):
    print '... compiling training functions'
    # propagate for training with batch normalization, updating std and mean for each batch
    layer_outputs = self.network_fprop(self.layers, self.x, isTest=False, noiseless=False)
    cost, show_cost = self.get_cost(layer_outputs, self.layers)
    self.opt = opt
    updates = self.opt.get_updates(cost, self.params)

    start_index, end_index = T.iscalars('s_i', 'e_i')
    if self.uint8_data:
        given_train_x = T.cast(self.shared_train[start_index:end_index], dtype='float32')
    else:
        given_train_x = self.shared_train[start_index:end_index]
    if self.batch_mean_subtraction:
        assert self.train_mean is not None, 'train_mean cannot be None for batch mean subtraction'
        given_train_x -= self.train_mean

    self.train_model = theano.function(
        inputs=[start_index, end_index],
        outputs=show_cost,
        updates=updates,
        givens={
            self.x: given_train_x,
        }
    )

    if self.nvalid_batches > 0:
        layer_outputs = self.network_fprop(self.layers, self.x, isTest=True,
                                           noiseless=noiseless_validation)
        final_output = layer_outputs[self.ae_structure[-1]['name']]
        _, show_cost = self.get_cost(layer_outputs, self.layers)

        if self.uint8_data:
            given_valid_x = T.cast(self.shared_valid[start_index:end_index], dtype='float32')
        else:
            given_valid_x = self.shared_valid[start_index:end_index]
        if self.batch_mean_subtraction:
            assert self.train_mean is not None, 'train_mean cannot be None for batch mean subtraction'
            given_valid_x -= self.train_mean

        self.validate_model = theano.function(
            inputs=[start_index, end_index],
            outputs=show_cost,
            givens={
                self.x: given_valid_x,
            }
        )
def test_infer_shape(self):
    x = T.dvector('x')
    m = T.iscalars('m')
    a = np.random.random(50)
    self._compile_and_check([x, m], [repeat(x, m)], [a, 2], self.op_class)

    x = T.dmatrix('x')
    a = np.random.random((40, 50))
    for axis in range(len(a.shape)):
        self._compile_and_check([x, m], [repeat(x, m, axis=axis)],
                                [a, 2], self.op_class)

    m = T.lvector('m')
    repeats = np.random.random_integers(5, size=(40, ))
    self._compile_and_check([x, m], [repeat(x, m, axis=0)],
                            [a, repeats], self.op_class)
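# --- Illustration (not part of the original test): the repeat op being shape-checked
# --- above mirrors numpy.repeat; a quick runtime sanity check might look like this.
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.extra_ops import repeat

xv = T.dvector('x')
mv = T.iscalar('m')
f = theano.function([xv, mv], repeat(xv, mv))
print(f(np.array([1.0, 2.0, 3.0]), 2))  # expected: [ 1.  1.  2.  2.  3.  3.]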
def build_model(prepared_data, clamp_L0=0.4, eeg_column_i=None, **kwargs):
    # ##########
    # STEP 1: order the data properly so that we can read from it sequentially
    # when training the model
    (subject_x, skill_x, correct_y, start_x, eeg_x, eeg_table,
     stim_pairs, train_idx, valid_idx) = prepared_data
    N = len(correct_y)
    train_mask = idx_to_mask(train_idx, N)
    valid_mask = idx_to_mask(valid_idx, N)

    # sort data by subject and skill
    sorted_i = sorted(xrange(N), key=lambda i: (subject_x[i], skill_x[i], start_x[i]))
    skill_x = skill_x[sorted_i]
    subject_x = subject_x[sorted_i]
    correct_y = correct_y[sorted_i]
    start_x = start_x[sorted_i]
    train_mask = train_mask[sorted_i]
    valid_mask = valid_mask[sorted_i]
    train_idx = np.nonzero(train_mask)[0]
    valid_idx = np.nonzero(valid_mask)[0]

    n_skills = np.max(skill_x) + 1
    n_subjects = np.max(subject_x) + 1

    # binarize eeg
    eeg_single_x = np.zeros(N)
    if eeg_column_i is not None:
        eeg_column = eeg_table[eeg_x, eeg_column_i]
        above_median = np.greater(eeg_column, np.median(eeg_column))
        eeg_single_x[above_median] = 1

    # prepare parameters
    p_T = 0.5
    p_G = 0.1
    p_S = 0.2
    p_L0 = 0.7
    if clamp_L0 is None:
        p_L0 = 0.7
    else:
        p_L0 = clamp_L0
    # eeg_single_x = np.zeros(N)
    parameter_base = np.ones(n_skills)
    tp_L0, t_L0 = make_probability(parameter_base * p_L0, name='L0')
    tp_T, t_T = make_probability(np.ones((n_skills, 2)) * p_T, name='p(T)')
    tp_G, t_G = make_probability(p_G, name='p(G)')
    tp_S, t_S = make_probability(p_S, name='p(S)')

    # declare and prepare variables for theano
    i = T.ivector('i')
    dummy_float = make_shared(0, name='dummy')
    skill_i, subject_i = T.iscalars('skill_i', 'subject_i')
    correct_y = make_shared(correct_y, to_int=True)
    eeg_single_x = make_shared(eeg_single_x, to_int=True)

    def step(correct_i, eeg, prev_L, prev_p_C, P_T, P_S, P_G):
        Ln = prev_L + (1 - prev_L) * P_T[eeg]
        p_C = prev_L * (1 - P_S) + (1 - prev_L) * P_G
        return Ln, p_C

    # set up theano functions
    ((results, p_C), updates) = theano.scan(fn=step,
                                            sequences=[correct_y[i], eeg_single_x[i]],
                                            outputs_info=[tp_L0[skill_i], dummy_float],
                                            non_sequences=[tp_T[skill_i], tp_G, tp_S])
    p_y = T.stack(1 - p_C, p_C)
    loss = neg_log_loss(p_y, correct_y[i])

    learning_rate = T.fscalar('learning_rate')
    if clamp_L0 is None:
        params = [t_T, t_L0]
    else:
        params = [t_T]
    update_parameters = [(param, param - learning_rate * T.grad(loss, param))
                         for param in params]

    tf_train = theano.function(inputs=[i, skill_i, learning_rate],
                               updates=update_parameters,
                               outputs=[loss, results, i],
                               allow_input_downcast=True)
    tf_valid = theano.function(inputs=[i, skill_i],
                               outputs=[loss, results, i],
                               allow_input_downcast=True)

    def f_train((i, (subject_i, skill_i)), learning_rate):
        return tf_train(i, skill_i, learning_rate)
def train_batch():
    current, prediction = T.iscalars('current', 'prediction')
    learn_step = self.learn_batch(current, prediction)
    train_one_epoch = theano.function([current, prediction], learn_step,
                                      updates=[train_cost, ])
import numpy as np
import theano
import theano.tensor as T

'''
Learn one more theano function => scan()
sample code from the tutorial is in the hmm_class file
Fibonacci
'''

N = T.iscalars('N')


def recurrence(n, fn_1, fn_2):
    return fn_1 + fn_2, fn_1


outputs, updates = theano.scan(
    # the two outputs are scalars at each step, but after iterating, scan
    # collects them into two lists (recurrence returns two values, so scan
    # produces a list containing two arrays)
    fn=recurrence,
    sequences=T.arange(N),
    n_steps=N,
    outputs_info=[1., 1.])

fabonacci = theano.function(
    inputs=[N],
    outputs=outputs,  # refers to scan's outputs; the brackets around it are optional
)

o_val = fabonacci(8)
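# --- For reference (values traced by hand from the recurrence above, not a verified
# --- run): the first returned sequence holds f(n), the second holds f(n-1).
#   o_val[0] -> [ 2.  3.  5.  8. 13. 21. 34. 55.]
#   o_val[1] -> [ 1.  2.  3.  5.  8. 13. 21. 34.]
for seq in o_val:
    print(seq)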
def compile_functions(self, opt, noiseless_validation=True, **args):
    print '... compiling training functions'
    (prior_gen_cost, prior_gen_show_cost, prior_dis_cost, prior_dis_show_cost,
     data_gen_cost, data_gen_show_cost, data_dis_cost, data_dis_show_cost,
     rec_cost, rec_show_cost) = self.get_cost(isTest=False)

    self.opt = opt
    prior_gen_updates = self.opt.get_updates(prior_gen_cost, self.enc_params)
    prior_dis_updates = self.opt.get_updates(prior_dis_cost, self.prior_dis_params)
    data_gen_updates = self.opt.get_updates(data_gen_cost, self.dec_params)
    data_dis_updates = self.opt.get_updates(data_dis_cost, self.data_dis_params)
    ae_updates = self.opt.get_updates(rec_cost, self.enc_params)  # +rec_cost +self.dec_params

    start_index, end_index = T.iscalars('s_i', 'e_i')
    if self.uint8_data:
        print 'converting uint8 data to float32 for each batch'
        given_train_x = T.cast(self.shared_train[start_index:end_index], dtype='float32')
    else:
        given_train_x = self.shared_train[start_index:end_index]
    if self.batch_mean_subtraction:
        assert self.train_mean is not None, 'train_mean cannot be None for batch mean subtraction'
        given_train_x -= self.train_mean
    if self.batch_data_process_func is not None:
        given_train_x = self.batch_data_process_func(given_train_x)

    # self.get_data_dis_cost = theano.function(
    #     [start_index, end_index, self.z],
    #     data_dis_show_cost,
    #     givens={self.x: given_train_x}
    # )
    self.train_ae_model = theano.function(
        [start_index, end_index],
        rec_show_cost,
        updates=ae_updates,
        givens={self.x: given_train_x}
    )
    self.train_data_gen_model = theano.function(
        [start_index, end_index],  # [self.z],
        data_gen_show_cost,
        updates=data_gen_updates,
        givens={self.x: given_train_x}
    )
    self.train_data_dis_model = theano.function(
        [start_index, end_index],  # , self.z
        data_dis_show_cost,
        updates=data_dis_updates,
        givens={self.x: given_train_x}
    )
    self.train_prior_gen_model = theano.function(
        [start_index, end_index],
        prior_gen_show_cost,
        updates=prior_gen_updates,
        givens={self.x: given_train_x}
    )
    self.train_prior_dis_model = theano.function(
        [start_index, end_index, self.z],
        prior_dis_show_cost,
        updates=prior_dis_updates,
        givens={self.x: given_train_x}
    )
def __init__(self, numargs, embed_size, pred_vocab_size, arg_vocab_size,
             initial_pred_rep=None, initial_arg_rep=None, margin=5, lr=0.01,
             activation=T.nnet.sigmoid):
    numpy_rng = numpy.random.RandomState(12345)
    theano_rng = RandomStreams(54321)
    self.lr = lr
    #margin = 5

    # Initializing predicate representations
    if initial_pred_rep is not None:
        num_preds, pred_dim = initial_pred_rep.shape
        assert pred_vocab_size == num_preds, "Initial predicate representation is not the same size as pred_vocab_size"
        assert embed_size == pred_dim, "Initial predicate representation does not have the same dimensionality as embed_size"
    else:
        initial_pred_rep_range = 4 * numpy.sqrt(6. / (pred_vocab_size + embed_size))
        initial_pred_rep = numpy.asarray(
            numpy_rng.uniform(low=-initial_pred_rep_range,
                              high=initial_pred_rep_range,
                              size=(pred_vocab_size, embed_size)))
    self.pred_rep = theano.shared(value=initial_pred_rep, name='P')

    # Initializing argument representations
    if initial_arg_rep is not None:
        arg_rep_len, arg_dim = initial_arg_rep.shape
        assert arg_vocab_size == arg_rep_len, "Initial argument representation is not the same size as arg_vocab_size"
        assert embed_size == arg_dim, "Initial argument representation does not have the same dimensionality as embed_size"
    else:
        initial_arg_rep_range = 4 * numpy.sqrt(6. / (arg_vocab_size + embed_size))
        initial_arg_rep = numpy.asarray(
            numpy_rng.uniform(low=-initial_arg_rep_range,
                              high=initial_arg_rep_range,
                              size=(arg_vocab_size, embed_size)))
    self.arg_rep = theano.shared(value=initial_arg_rep, name='A')

    # Initialize scorer
    scorer_dim = embed_size * (numargs + 1)  # Predicate is +1
    initial_scorer_range = 4 * numpy.sqrt(6. / scorer_dim)
    initial_scorer = numpy.asarray(
        numpy_rng.uniform(low=-initial_scorer_range,
                          high=initial_scorer_range,
                          size=scorer_dim))
    self.scorer = theano.shared(value=initial_scorer, name='s')

    # Initialize indicator
    indicator_dim = embed_size * (numargs + 1)  # Predicate is +1
    initial_indicator_range = 4 * numpy.sqrt(6. / (indicator_dim + numargs))
    initial_indicator = numpy.asarray(
        numpy_rng.uniform(low=-initial_indicator_range,
                          high=initial_indicator_range,
                          size=(indicator_dim, numargs)))
    self.indicator = theano.shared(value=initial_indicator, name='I')

    # Define symbolic pred-arg
    self.pred_ind = T.iscalar('p')
    self.arg_inds = T.iscalars(numargs)
    pred = self.pred_rep[self.pred_ind].reshape((1, embed_size))
    args = self.arg_rep[self.arg_inds].reshape((1, embed_size * numargs))
    pred_arg = activation(T.concatenate([pred, args], axis=1))

    # Define symbolic rand pred-arg for training scorer
    rand_pred_ind = theano_rng.random_integers(low=0, high=pred_vocab_size - 1)
    rand_arg_inds = theano_rng.random_integers([1, numargs], low=0, high=arg_vocab_size - 1)
    rand_pred = self.pred_rep[rand_pred_ind].reshape((1, embed_size))
    rand_args = self.arg_rep[rand_arg_inds].reshape((1, embed_size * numargs))
    rand_pred_arg = activation(T.concatenate([rand_pred, rand_args], axis=1))

    # Define symbolic pred_rand-arg for training indicator
    pred_rand_arg = activation(T.concatenate([pred, rand_args], axis=1))

    # Define scores and loss
    self.corr_score = T.sum(T.dot(pred_arg, self.scorer))
    rand_score = T.sum(T.dot(rand_pred_arg, self.scorer))
    self.margin_loss = T.maximum(0, margin - self.corr_score + rand_score)

    # Define indicator values and loss
    orig_ind_labels = T.constant(numpy.zeros(numargs))
    self.indicator_pred = T.nnet.sigmoid(T.dot(pred_arg, self.indicator))
    rand_ind_labels = T.constant(numpy.ones(numargs))
    rand_indicator_pred = T.nnet.sigmoid(T.dot(pred_rand_arg, self.indicator))
    self.indicator_loss = T.mean((self.indicator_pred - orig_ind_labels) ** 2) + \
        T.mean((rand_indicator_pred - rand_ind_labels) ** 2)

    # Define params and inputs
    self.score_params = [self.pred_rep, self.arg_rep, self.scorer]
    self.indicator_params = [self.pred_rep, self.arg_rep, self.indicator]
    self.score_ind_inputs = [self.pred_ind] + list(self.arg_inds)
def build_model(prepared_data, clamp_L0=None, **kwargs):
    # ##########
    # STEP 1: order the data properly so that we can read from it sequentially
    # when training the model
    (subject_x, skill_x, correct_y, start_x, eeg_x, eeg_table,
     stim_pairs, train_idx, valid_idx) = prepared_data
    N = len(correct_y)
    train_mask = idx_to_mask(train_idx, N)
    valid_mask = idx_to_mask(valid_idx, N)

    # sort data by subject and skill
    sorted_i = sorted(xrange(N), key=lambda i: (subject_x[i], skill_x[i], start_x[i]))
    skill_x = skill_x[sorted_i]
    subject_x = subject_x[sorted_i]
    correct_y = correct_y[sorted_i]
    start_x = start_x[sorted_i]
    train_mask = train_mask[sorted_i]
    valid_mask = valid_mask[sorted_i]
    train_idx = np.nonzero(train_mask)[0]
    valid_idx = np.nonzero(valid_mask)[0]

    n_skills = np.max(skill_x) + 1

    # ####
    # STEP 2: initialize parameters
    p_G = 0.1
    p_S = 0.2
    feat_x = eeg_x
    feat_table = eeg_table
    feat_columns = range(feat_table.shape[1])  # [0, 1, 2, 3, 4, 5, 6]
    feat_width = len(feat_columns)
    if clamp_L0 is None:
        Beta0 = make_shared(np.random.rand(n_skills))
    Beta = make_shared(np.random.rand(n_skills, feat_width))
    b = make_shared(np.random.rand(n_skills))
    Gamma = make_shared(np.random.rand(n_skills, feat_width))
    g = make_shared(np.random.rand(n_skills))
    tp_G, t_G = make_probability(p_G, name='p(G)')
    tp_S, t_S = make_probability(p_S, name='p(S)')

    # declare and prepare variables for theano
    i = T.ivector('i')
    dummy_float = make_shared(0, name='dummy')
    skill_i, subject_i = T.iscalars('skill_i', 'subject_i')
    correct_y = make_shared(correct_y, to_int=True)
    feat_x = make_shared(feat_x, to_int=True)
    feat_table = make_shared(feat_table)

    # set up theano functions
    def step(correct_i, feat, prev_L, prev_p_C, skill_i, P_S, P_G):
        L_true_given_true = sigmoid(T.dot(Beta[skill_i].T, feat[feat_columns]) + b[skill_i])
        L_true_given_false = sigmoid(T.dot(Gamma[skill_i].T, feat[feat_columns]) + g[skill_i])
        Ln = prev_L * L_true_given_true + (1 - prev_L) * L_true_given_false
        p_C = prev_L * (1 - P_S) + (1 - prev_L) * P_G
        return Ln, p_C

    if clamp_L0 is None:
        L0 = sigmoid(Beta0[skill_i])
    else:
        L0 = make_shared(clamp_L0)
    ((results, p_C), updates) = theano.scan(fn=step,
                                            sequences=[correct_y[i], feat_table[feat_x[i]]],
                                            outputs_info=[L0, dummy_float],
                                            non_sequences=[skill_i, tp_G, tp_S])
    p_y = T.stack(1 - p_C, p_C)
    loss = neg_log_loss(p_y, correct_y[i])

    learning_rate = T.fscalar('learning_rate')
    if clamp_L0 is None:
        params = [Beta0, Beta, Gamma, g, b]
    else:
        params = [Beta, Gamma, g, b]
    update_parameters = [(param, param - learning_rate * T.grad(loss, param))
                         for param in params]

    tf_train = theano.function(inputs=[i, skill_i, learning_rate],
                               updates=update_parameters,
                               outputs=[loss, results, i],
                               allow_input_downcast=True)
    tf_valid = theano.function(inputs=[i, skill_i],
                               outputs=[loss, results, i],
                               allow_input_downcast=True)

    def f_train((i, (subject_i, skill_i)), learning_rate):
        everything = tf_train(i, skill_i, learning_rate)
        return everything[:3]
def model_choice(models, obs):
    k = [i for i in xrange(2, 9)]
    Statistics = []
    for ki in k:
        print 'K = ', ki
        num_M = models[ki - 2].shape[0]
        print 'Num Models: ', num_M
        numNbins = len(obs[ki - 2])
        numHbins = len(obs[ki - 2][0])
        M = theano.shared(np.asarray(models[ki - 2], dtype=theano.config.floatX))
        # Symbolic tensor for observation batches - indexed elements of the Obs
        # shared variable are passed through this
        ObSym = T.matrix()
        # This should be dealt with better too...
        Pred = theano.function([], predictiveness_profiles(M, ki, len(models[ki - 2])))()
        Pred_n = Pred
        Pred = theano.shared(np.asarray(Pred, dtype=theano.config.floatX))

        # setup inference schemas and theano symbolic tensors
        if INFERENCE == 'underfit':
            profiles = make_agression_profiles(num_profiles, num_alpha)
            #alpha = theano.shared(np.asmatrix(np.linspace(0.0, 1.0, num=num_alpha, endpoint=False), dtype=theano.config.floatX))
            Alpha = T.arange(0., 1.0, 1. / num_alpha)
            Agression_profiles = T.matrix('Agr')
            nAlpha, nM, nO = T.iscalars('', '', '')
            Choice_Maker = Underfit_Choice(M, ObSym, nM, nO, ki, nAlpha, Alpha,
                                           Agression_profiles, Pred, pValue_alg)  # only works for 0...
        elif INFERENCE == 'bayes':
            profiles = make_priors_profiles(num_priors, num_M)
            Priors_profiles = T.matrix('Priors')
            # Loss functions are choices in bayesian_choice numbered [1,4]
            Loss_funcs = T.arange(1, 5)
            nM, nO = T.iscalars('', '')
            Choice_Maker = Bayesian_Choice(M, ObSym, nM, nO, ki, Priors_profiles, Loss_funcs)
        else:
            print 'unknown inference algorithm...'
            quit()

        # all data for this K
        k_Data = kData(numNbins, numHbins, num_profiles)
        for i in xrange(numNbins):
            for j in xrange(numHbins):
                print 'bin ', i, j
                t0 = time.time()
                if obs[ki - 2][i][j] == [] or obs[ki - 2][i][j][0].shape[1] == 0:
                    # there are no observations in this N*H bin...
                    continue
                else:
                    num_obs = obs[ki - 2][i][j][0].shape[0]
                    # allocate for predictiveness of model choice vs universe
                    # for each obs for each profile
                    k_pred = kPred(num_obs, num_profiles)
                    num_batches = int(np.ceil(num_obs / np.float(BATCH_SIZE)))
                    for batch_index in xrange(num_batches):
                        top = BATCH_SIZE * (batch_index + 1) if batch_index < (num_batches - 1) else num_obs
                        n_obs = top - BATCH_SIZE * (batch_index)
                        print 'batch index ', batch_index, '\t num obs: ', top - BATCH_SIZE * batch_index
                        if INFERENCE == 'underfit':
                            batch_choice = Choice_Maker.Choice_Profile_F(
                                profiles, num_alpha, num_M, n_obs,
                                obs[ki - 2][i][j][0][BATCH_SIZE * batch_index:top])
                            print batch_choice
                            for prof in xrange(num_profiles):
                                k_pred[prof][BATCH_SIZE * (batch_index):top] = get_predictiveness_array(
                                    batch_choice[prof], obs[ki - 2][i][j][1], Pred_n, n_obs)
                        elif INFERENCE == 'bayes':
                            batch_choice = Choice_Maker.Choice_Profile_F(
                                profiles, num_M, n_obs,
                                obs[ki - 2][i][j][0][BATCH_SIZE * batch_index:top])
                            print batch_choice
                            for pr in xrange(num_priors):
                                for lf in xrange(num_loss_funcs):
                                    k_pred[pr * num_loss_funcs + lf][BATCH_SIZE * (batch_index):top] = get_predictiveness_array(
                                        batch_choice[pr][lf], obs[ki - 2][i][j][1], Pred_n, n_obs)
                    for prof in xrange(num_profiles):
                        pred_moments = get_moments(k_pred[prof], num_obs)
                        for m in xrange(len(pred_moments)):
                            k_Data[prof][m][i, j] = pred_moments[m]
                t1 = time.time()
                print 'single bin takes: ', (t1 - t0) / 60., ' minutes'
        Statistics.append(k_Data)
        f = open('%s_k%d.pkl' % (name, ki), 'wb')
        pickle.dump(k_Data, f)
        f.close()
    return Statistics
def build_model(prepared_data, clamp_L0=0.4, **kwargs):
    # ##########
    # STEP 1: order the data properly so that we can read from it sequentially
    # when training the model
    (subject_x, skill_x, correct_y, start_x, eeg_x, eeg_table,
     stim_pairs, train_idx, valid_idx) = prepared_data
    N = len(correct_y)
    train_mask = idx_to_mask(train_idx, N)
    valid_mask = idx_to_mask(valid_idx, N)

    # sort data by subject and skill
    sorted_i = sorted(xrange(N), key=lambda i: (subject_x[i], skill_x[i], start_x[i]))
    skill_x = skill_x[sorted_i]
    subject_x = subject_x[sorted_i]
    correct_y = correct_y[sorted_i]
    start_x = start_x[sorted_i]
    train_mask = train_mask[sorted_i]
    valid_mask = valid_mask[sorted_i]
    train_idx = np.nonzero(train_mask)[0]
    valid_idx = np.nonzero(valid_mask)[0]

    n_skills = np.max(skill_x) + 1
    n_subjects = np.max(subject_x) + 1

    # prepare parameters
    p_T = 0.5
    p_G = 0.1
    p_S = 0.2
    if clamp_L0 is None:
        p_L0 = 0.7
    else:
        p_L0 = clamp_L0
    parameter_base = np.ones(n_skills)
    tp_L0, t_L0 = make_probability(parameter_base * p_L0, name='L0')
    tp_T, t_T = make_probability(parameter_base * p_T, name='p(T)')
    tp_G, t_G = make_probability(parameter_base * p_G, name='p(G)')
    tp_S, t_S = make_probability(parameter_base * p_S, name='p(S)')

    # declare and prepare variables for theano
    i = T.ivector('i')
    dummy_float = make_shared(0, name='dummy')
    skill_i, subject_i = T.iscalars('skill_i', 'subject_i')
    correct_y = make_shared(correct_y, to_int=True)

    def step(correct_i, prev_L, prev_p_C, P_T, P_S, P_G):
        Ln = prev_L + (1 - prev_L) * P_T
        p_C = prev_L * (1 - P_S) + (1 - prev_L) * P_G
        return Ln, p_C

    # set up theano functions
    ((results, p_C), updates) = theano.scan(fn=step,
                                            sequences=correct_y[i],
                                            outputs_info=[tp_L0[skill_i], dummy_float],
                                            non_sequences=[tp_T[skill_i], tp_G[skill_i], tp_S[skill_i]])
    p_y = T.stack(1 - p_C, p_C)
    loss = neg_log_loss(p_y, correct_y[i])

    learning_rate = T.fscalar('learning_rate')
    if clamp_L0 is None:
        params = [t_T, t_L0]
    else:
        params = [t_T]
    update_parameters = [(param, param - learning_rate * T.grad(loss, param))
                         for param in params]

    tf_train = theano.function(inputs=[i, skill_i, learning_rate],
                               updates=update_parameters,
                               outputs=[loss, results, i],
                               allow_input_downcast=True)
    tf_valid = theano.function(inputs=[i, skill_i],
                               outputs=[loss, results, i],
                               allow_input_downcast=True)

    def f_train((i, (subject_i, skill_i)), learning_rate):
        return tf_train(i, skill_i, learning_rate)
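# --- Illustration (not from the original source): step() above is the standard
# --- Bayesian Knowledge Tracing recurrence. A plain-Python sketch with made-up
# --- scalar parameters, useful for reading the scan version. Note that, as in
# --- step(), the observed correctness is not used in the knowledge update.
p_L, p_T, p_G, p_S = 0.4, 0.5, 0.1, 0.2   # illustrative values, not fitted
for correct in [0, 1, 1]:
    p_C = p_L * (1 - p_S) + (1 - p_L) * p_G   # probability of answering correctly
    p_L = p_L + (1 - p_L) * p_T               # probability the skill is learned afterwards
    print('p(correct) = %.3f, p(L) = %.3f' % (p_C, p_L))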
def fit(self, train_X, optimizer, param_init=None, sample_every=None):
    self.opt = optimizer
    n_train, n_vis = train_X.shape
    batch_size = self.batch_size
    if sample_every is None:
        sample_every = 10000000
    #theano.config.profile = True
    #theano.config.exception_verbosity='high'
    assert (n_vis == self.nv)
    train_X = self.shared_dataset(train_X)
    n_batches = np.ceil(n_train / float(batch_size)).astype('int')

    # theano variables for managing data (index minibatches, n examples in batch)
    index, n_ex = T.iscalars('batch_index', 'n_ex')
    batch_start = index * batch_size
    batch_stop = T.minimum(n_ex, (index + 1) * batch_size)
    effective_batch_size = batch_stop - batch_start

    # theano variables for learning
    lr = T.scalar('lr', dtype=theano.config.floatX)
    mom = T.scalar('mom', dtype=theano.config.floatX)

    if self.k == 1:
        # this one is for scanning over a batch and getting connectivity for each example
        # return grads too because T.grad through scan is awful
        # takes ~3x longer, but can experiment with connectivity
        #K, grads = self.mpf.rbm_K2G(self.X, effective_batch_size)
        # this tiles out the minibatch matrix into a 3D tensor to compute connectivity
        #K, offs, y, y1, z = self.mpf.rbm_K(self.X, effective_batch_size)
        K = self.mpf.rbm_K(self.X, effective_batch_size)
    elif self.k == 2:
        if DEBUG:
            return_values = self.mpf.debug_rbm_K_2wise(self.X, effective_batch_size)
            K = return_values[-1]
        else:
            K = self.mpf.rbm_K_2wise(self.X, effective_batch_size)
    else:
        raise NotImplementedError

    reg = self.L1_reg * self.mpf.L1 + self.L2_reg * self.mpf.L2
    reg_grad = T.grad(reg, self.mpf.theta)
    # if not scan (tile out matrix into tensor)
    cost = K + reg
    grads = T.grad(cost, self.mpf.theta)
    # otherwise
    #grads = grads + reg_grad

    if param_init is None:
        self.mpf.theta.set_value(random_theta(D, DH, k=self.k))
    else:
        self.mpf.theta.set_value(np.asarray(np.concatenate(param_init),
                                            dtype=theano.config.floatX))

    if optimizer == 'sgd':
        updates = []
        theta = self.mpf.theta
        theta_update = self.mpf.theta_update
        upd = mom * theta_update - lr * grads
        updates.append((theta_update, upd))
        updates.append((theta, theta + upd))

        print 'compiling theano function'
        if DEBUG:
            return_values = list(return_values)
            return_values.append(cost)
            return_values.append(grads)
            train_model = theano.function(inputs=[index, n_ex, lr, mom],
                                          outputs=return_values, updates=updates,
                                          givens={self.X: train_X[batch_start:batch_stop]})
        else:
            train_model = theano.function(inputs=[index, n_ex, lr, mom],
                                          outputs=cost, updates=updates,
                                          givens={self.X: train_X[batch_start:batch_stop]})

        self.current_epoch = 0
        start = time.time()
        learning_rate_init = self.learning_rate
        while self.current_epoch < self.n_epochs:
            print 'epoch:', self.current_epoch
            self.current_epoch += 1
            effective_mom = self.final_momentum if self.current_epoch > self.momentum_switchover else self.initial_momentum
            avg_epoch_cost = 0
            last_debug = None
            for minibatch_idx in xrange(n_batches):
                avg_cost = train_model(minibatch_idx, n_train, self.learning_rate, effective_mom)
                #print '\t\t', np.isnan(gr).sum(), np.isnan(yy).sum(), np.isnan(yy1).sum(), np.isnan(zz).sum()
                if DEBUG:
                    return_values, avg_cost, gradients = avg_cost[:-2], avg_cost[-2], avg_cost[-1]
                    print_debug(return_values, last_debug)
                    last_debug = return_values
                avg_epoch_cost += avg_cost
                #print '\t', minibatch_idx, avg_cost
            print '\t avg epoch cost:', avg_epoch_cost / n_batches
            self.learning_rate *= self.learning_rate_decay
            theta_fit = split_theta(self.mpf.theta.get_value(), self.mpf.n_visible,
                                    self.mpf.n_hidden, k=self.mpf.k)
            if (self.current_epoch % sample_every == 0):
                sample_and_save(theta_fit, self.mpf.n_hidden, self.current_epoch,
                                learning_rate_init, self.mpf.k, self.opt)
        theta_opt = self.mpf.theta.get_value()
        end = time.time()

    elif optimizer == 'cg' or optimizer == 'bfgs':
        print "compiling theano functions"
        get_batch_size = theano.function([index, n_ex], effective_batch_size,
                                         name='get_batch_size')
        batch_cost_grads = theano.function([index, n_ex], [cost, grads],
                                           givens={self.X: train_X[batch_start:batch_stop, :]},
                                           name='batch_cost')
        batch_cost = theano.function([index, n_ex], cost,
                                     givens={self.X: train_X[batch_start:batch_stop, :]},
                                     name='batch_cost')
        batch_grads = theano.function([index, n_ex], grads,
                                      givens={self.X: train_X[batch_start:batch_stop, :]},
                                      name='batch_cost')

        def train_fn_cost_grads(theta_value):
            print 'nbatches', n_batches
            self.mpf.theta.set_value(np.asarray(theta_value, dtype=theano.config.floatX), borrow=True)
            train_losses_grads = [batch_cost_grads(i, n_train) for i in xrange(n_batches)]
            train_losses = [i[0] for i in train_losses_grads]
            train_grads = [i[1] for i in train_losses_grads]
            train_batch_sizes = [get_batch_size(i, n_train) for i in xrange(n_batches)]
            print len(train_losses), len(train_grads)
            print train_losses[0].shape, train_grads[0].shape
            returns = (np.average(train_losses, weights=train_batch_sizes),
                       np.average(train_grads, weights=train_batch_sizes, axis=0))
            return returns

        def train_fn_cost(theta_value):
            print 'nbatches', n_batches
            self.mpf.theta.set_value(np.asarray(theta_value, dtype=theano.config.floatX), borrow=True)
            train_costs = [batch_cost(i, n_train) for i in xrange(n_batches)]
            train_batch_sizes = [get_batch_size(i, n_train) for i in xrange(n_batches)]
            return np.average(train_costs, weights=train_batch_sizes)

        def train_fn_grads(theta_value):
            print 'nbatches', n_batches
            self.mpf.theta.set_value(np.asarray(theta_value, dtype=theano.config.floatX), borrow=True)
            train_grads = [batch_grads(i, n_train) for i in xrange(n_batches)]
            train_batch_sizes = [get_batch_size(i, n_train) for i in xrange(n_batches)]
            return np.average(train_grads, weights=train_batch_sizes, axis=0)

        ###############
        # TRAIN MODEL #
        ###############
        def my_callback():
            print 'wtf'

        from scipy.optimize import minimize
        from scipy.optimize import fmin_bfgs, fmin_l_bfgs_b
        if optimizer == 'cg':
            pass
        elif optimizer == 'bfgs':
            print 'using bfgs'
            #theta_opt, f_theta_opt, info = fmin_l_bfgs_b(train_fn, self.mpf.theta.get_value(), iprint=1, maxfun=self.n_epochs)
            start = time.time()
            disp = True
            print 'ready to minimize'
            #result_obj = minimize(train_fn, self.mpf.theta.get_value(), jac=True, method='BFGS', options={'maxiter': self.n_epochs, 'disp': disp}, callback=my_callback())
            #theta_opt = fmin_bfgs(f=train_fn_cost, x0=self.mpf.theta.get_value(), fprime=train_fn_grads, disp=1, maxiter=self.n_epochs)
            theta_opt, fff, ddd = fmin_l_bfgs_b(func=train_fn_cost,
                                                x0=self.mpf.theta.get_value(),
                                                fprime=train_fn_grads,
                                                disp=1, maxiter=self.n_epochs)
            print 'done minimize ya right'
            end = time.time()

    elif optimizer == 'sof':
        print "compiling theano functions"
        batch_cost_grads = theano.function([index, n_ex], [cost, grads],
                                           givens={self.X: train_X[batch_start:batch_stop, :]},
                                           name='batch_cost')
        batch_cost = theano.function([index, n_ex], cost,
                                     givens={self.X: train_X[batch_start:batch_stop, :]},
                                     name='batch_cost')
        batch_grads = theano.function([index, n_ex], grads,
                                      givens={self.X: train_X[batch_start:batch_stop, :]},
                                      name='batch_cost')

        def train_fn(theta_value, i):
            self.mpf.theta.set_value(np.asarray(theta_value, dtype=theano.config.floatX), borrow=True)
            train_losses, train_grads = batch_cost_grads(i, n_train)
            return train_losses, train_grads

        ###############
        # TRAIN MODEL #
        ###############
        if param_init is None:
            theta.set_value(random_theta(D, DH))
        else:
            w0, bh0, bv0 = param_init
            self.mpf.theta.set_value(np.asarray(np.concatenate((w0, bh0, bv0)),
                                                dtype=theano.config.floatX))

        print 'using sof'
        sys.path.append('/export/mlrg/ebuchman/Programming/Sum-of-Functions-Optimizer')
        from sfo import SFO
        print 'n batches', n_batches
        print 'n epochs', self.n_epochs
        optimizer = SFO(train_fn, self.mpf.theta.get_value(), np.arange(n_batches))
        start = time.time()
        theta_opt = optimizer.optimize(num_passes=self.n_epochs)
        end = time.time()

    self.mpf.theta.set_value(theta_opt.astype(theano.config.floatX), borrow=True)
    return end - start
def __init__(self, inf=1e37):
    pos, vel = T.fmatrices(['pos', 'vel'])
    nc, N, n_steps = T.iscalars(['nc', 'N', 'n_steps'])
    ra, rb, re, r0 = T.fscalars(['ra', 'rb', 're', 'r0'])
    v0, j, b = T.fscalars(['v0', 'J', 'b'])

    nu = trng.uniform(size=(N, 2), low=0.0, high=3.14159, dtype='floatX')

    def distance_tensor(X):
        E = X.reshape((X.shape[0], 1, -1)) - X.reshape((1, X.shape[0], -1))
        D = T.sqrt(T.sum(T.square(E), axis=2))
        return D

    def direction_tensor(X):
        E = X.reshape((X.shape[0], 1, -1)) - X.reshape((1, X.shape[0], -1))
        L = T.sqrt(T.sum(T.square(E), axis=2))
        L = T.pow(L + T.identity_like(L), -1)
        L = T.stack([L, L, L], axis=2)
        return L * E

    def neighbourhood(X):
        D = distance_tensor(X)
        N = T.argsort(D, axis=0)
        mask = T.cast(T.lt(N, nc), 'float32')
        return N[1:nc + 1], mask

    def alignment(X, Y):
        n, d = neighbourhood(X)
        return T.sum(Y[n], axis=0)

    def cohesion(X, inf=100.0):
        D = distance_tensor(X)
        E = direction_tensor(X)
        n, d = neighbourhood(X)
        F = T.zeros_like(E)
        D = T.stack([D, D, D], axis=2)
        d = T.stack([d, d, d], axis=2)
        c1 = T.lt(D, rb)
        c2 = T.and_(T.gt(D, rb), T.lt(D, ra))
        c3 = T.and_(T.gt(D, ra), T.lt(D, r0))
        F = T.set_subtensor(F[c1], -E[c1])
        F = T.set_subtensor(F[c2], 0.25 * (D[c2] - re) / (ra - re) * E[c2])
        F = T.set_subtensor(F[c3], E[c3])
        return T.sum(d * F, axis=0)

    def perturbation(nu=nu):
        phi = nu[:, 0]
        theta = 2.0 * nu[:, 1]
        return T.stack([
            T.sin(theta) * T.sin(phi),
            T.cos(theta) * T.sin(phi),
            T.cos(phi)
        ], axis=1)

    def step(X, dX):
        X_ = X + dX
        V_ = j * nc / v0 * (alignment(X, dX)) + b * (cohesion(X)) + nc * (perturbation())
        dV = T.sqrt(T.sum(T.square(V_), axis=1)).reshape(V_.shape[0], 1)
        dV = T.stack([dV, dV, dV], axis=1)
        V = v0 * V_ / dV
        return T.cast(X_, 'float32'), T.cast(V, 'float32')

    def probability(X, Y):
        n, d = neighbourhood(X)
        vDv = T.batched_dot(Y[n].swapaxes(0, 1), Y)
        p = T.exp((j / 2.0) * T.sum(vDv, axis=1))
        return p / T.sum(p)

    sim, update = theano.scan(step, outputs_info=[pos, vel], n_steps=n_steps)
    pos_, vel_ = sim
    mean_final_velocity = 1 / (N * v0) * T.sqrt(T.sum(T.square(T.sum(vel_[-1], axis=0))))
    particle_probability = probability(pos_[-1], vel_[-1])

    self.f = theano.function(
        [pos, vel, nc, ra, rb, r0, re, j, v0, b, N, n_steps],
        [pos_, vel_],
        allow_input_downcast=True)
    self.g = theano.function(
        [pos, vel, nc, ra, rb, r0, re, j, v0, b, N, n_steps],
        mean_final_velocity,
        allow_input_downcast=True)
    self.h = theano.function(
        [pos, vel, nc, ra, rb, r0, re, j, v0, b, N, n_steps],
        particle_probability,
        allow_input_downcast=True)
def __init__(self, num_words, num_rels, vocab_embed_size, lr=0.01,
             tensor_activation=T.tanh, num_noise_samples=1, init_dense_vocab=None):
    numpy_rng = numpy.random.RandomState(89677)
    theano_rng = RandomStreams(12783)
    rng_box_limit = 4 * numpy.sqrt(6. / (vocab_embed_size + vocab_embed_size + num_rels))
    rng_box_low = 0
    rng_box_high = rng_box_limit
    init_box = numpy.asarray(numpy_rng.uniform(
        low=rng_box_low, high=rng_box_high,
        size=(vocab_embed_size, vocab_embed_size, num_rels)))
    rng_proj_low = -4 * numpy.sqrt(6. / (num_words + vocab_embed_size))
    rng_proj_high = 4 * numpy.sqrt(6. / (num_words + vocab_embed_size))
    if init_dense_vocab is None:
        init_dense_vocab = numpy.asarray(numpy_rng.uniform(
            low=rng_proj_low, high=rng_proj_high,
            size=(num_words, vocab_embed_size)))
    init_rev_dense_vocab = numpy.asarray(numpy_rng.uniform(
        low=rng_proj_low, high=rng_proj_high,
        size=(vocab_embed_size, num_words)))
    self.B = theano.shared(value=init_box, name='B')
    self.P = theano.shared(value=init_dense_vocab, name='P')
    self.P_hat = theano.shared(value=init_rev_dense_vocab, name='P_hat')
    self.vocab = T.eye(num_words)
    word_activation = T.nnet.softmax
    self.rel = T.eye(num_rels)
    rel_activation = T.nnet.softmax
    self.lr = lr

    self.x_ind, self.y_ind, self.r_ind = T.iscalars('x_ind', 'y_ind', 'r_ind')
    x = self.vocab[self.x_ind]
    self.x_rep = T.dot(x, self.P)
    y = self.vocab[self.y_ind]
    self.y_rep = T.dot(y, self.P)
    r = self.rel[self.r_ind]

    # Assumption: Corresponding dimensions: 0 -> x, 1 -> y, 2 -> r
    # TODO: Where do we apply activations? Do we have to, at all?
    pred_xy = tensor_activation(T.tensordot(r, self.B, axes=(0, 2)))
    pred_y = T.dot(T.tensordot(self.x_rep, pred_xy, axes=(0, 0)), self.P_hat)
    self.prob_y = word_activation(pred_y)
    pred_x = T.dot(T.tensordot(self.y_rep, pred_xy, axes=(0, 1)), self.P_hat)
    self.prob_x = word_activation(pred_x)
    pred_yr = tensor_activation(T.tensordot(self.x_rep, self.B, axes=(0, 0)))
    self.prob_r = rel_activation(T.tensordot(self.y_rep, pred_yr, axes=(0, 0)))

    # y \times (((x \times P) \times (r \otimes B)) \times P_hat)
    self.score = T.dot(y, T.dot(T.tensordot(self.x_rep,
                                            T.tensordot(r, self.B, axes=(0, 2)),
                                            axes=(0, 0)), self.P_hat).T)

    rand_margin_score = T.constant(0)
    noise_log_likelihood = T.constant(0)
    # The noise distribution is one where words and the relation are independent of
    # each other. The probability of the right tuple and the corrupted tuple are both
    # equal in this distribution.
    noise_prob = num_noise_samples / float(num_words * num_words * num_rels)
    rand_x_ind = theano_rng.random_integers(low=0, high=num_words - 1)
    rand_y_ind = theano_rng.random_integers(low=0, high=num_words - 1)
    rand_r_ind = theano_rng.random_integers(low=0, high=num_rels - 1)
    rand_x = self.vocab[rand_x_ind]
    rand_x_rep = T.dot(rand_x, self.P)
    rand_y = self.vocab[rand_y_ind]
    rand_y_rep = T.dot(rand_y, self.P)
    rand_r = self.rel[rand_r_ind]
    rand_score = T.dot(rand_y, T.dot(T.tensordot(rand_x_rep,
                                                 T.tensordot(rand_r, self.B, axes=(0, 2)),
                                                 axes=(0, 0)), self.P_hat).T)
    for _ in range(num_noise_samples):
        rand_margin_score += rand_score
        noise_log_likelihood += T.log(noise_prob / (T.abs_(rand_score) + noise_prob))

    self.nce_margin_loss = T.maximum(0, 1 - self.score + rand_margin_score)
    # NCE negative log likelihood:
    # -1 * {log(score / (score + num_noise_samples * noise_prob))
    #       + \sum_{i=1}^k log(noise_prob / (rand_score + noise_prob))}
    self.nce_prob_loss = -(T.log(T.abs_(self.score) / (T.abs_(self.score) + noise_prob))
                           + noise_log_likelihood)
    self.cost_inputs = [self.x_ind, self.y_ind, self.r_ind]
    self.params = [self.B, self.P, self.P_hat]
    self.x_loss = self.ce(x, self.prob_x)
    self.y_loss = self.ce(y, self.prob_y)
    self.r_loss = self.ce(r, self.prob_r)