def __init__(self):
    self.name = self.__class__.__name__

    # Symbolic expressions for the prediction function (and compiled one too), the loss,
    # the regularization, and the loss to optimize (loss + lmbda * regul).
    # To be defined by the child classes:
    self.pred_func = None
    self.pred_func_compiled = None
    self.loss_func = None
    self.regul_func = None
    self.loss_to_opt = None

    # Symbolic variables for training values
    self.ys = TT.vector('ys')
    self.rows = TT.lvector('rows')
    self.cols = TT.lvector('cols')
    self.tubes = TT.lvector('tubes')

    # Current values for which the loss is currently compiled
    # 3 dimensions:
    self.n = 0  # Number of subject entities
    self.m = 0  # Number of relations
    self.l = 0  # Number of object entities
    # and rank:
    self.k = 0
    # and corresponding number of parameters (i.e. n*k + m*k + l*k for CP_Model)
    self.nb_params = 0
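# Illustrative sketch only (not from the original code base): one way a child class
# could fill in the attributes declared above.  The base class name Model, the
# embedding matrices U, V, W and the helper methods set_dims/define_loss are all
# assumptions made for this example; only the name CP_Model comes from the comment above.
class CP_Model(Model):

    def set_dims(self, n, m, l, k):
        # Shapes for which the loss will be compiled.
        self.n, self.m, self.l, self.k = n, m, l, k
        # One k-dimensional embedding per subject entity, relation and object entity.
        self.nb_params = n * k + m * k + l * k

    def define_loss(self, U, V, W):
        # CP/trilinear score for the (rows, cols, tubes) index vectors.
        self.pred_func = TT.sum(U[self.rows, :] * V[self.cols, :] * W[self.tubes, :], axis=1)
        # Squared error against the observed values ys, plus an L2 regularizer.
        self.loss_func = TT.mean(TT.sqr(self.pred_func - self.ys))
        self.regul_func = TT.sum(TT.sqr(U)) + TT.sum(TT.sqr(V)) + TT.sum(TT.sqr(W))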
def recognize_dataset(self, dataset_test=None, seqlen=None, batch_size=100):
    # Here we don't ignore the first 6 frames, because we want to test recognition
    # performance on every frame of the test dataset.
    n_test_batches = (dataset_test.get_value(borrow=True).shape[0]
                      - (self.delay * self.freq) * len(seqlen)) / batch_size
    n_dim = dataset_test.get_value(borrow=True).shape[1]

    # allocate symbolic variables for the data
    index = T.lvector()       # index to a [mini]batch
    index_hist = T.lvector()  # index to history

    input_log = T.nnet.sigmoid(T.dot(self.x, self.crbm_layer.W)
                               + T.dot(self.x_history, self.crbm_layer.B)
                               + self.crbm_layer.hbias)
    prob = self.logLayer.p_y_given_x
    prediction = self.logLayer.y_pred

    print_prediction = theano.function(
        [index, index_hist],
        [prediction, prob],
        givens={
            self.x: dataset_test[index],
            self.x_history: dataset_test[index_hist].reshape((batch_size, self.delay * n_dim))
        },
        name='print_prediction'
    )

    # valid starting indices
    datasetindex = range(self.delay * self.freq, dataset_test.get_value(borrow=True).shape[0])
    permindex = np.array(datasetindex)

    for batch_index in xrange(n_test_batches):
        data_idx = permindex[batch_index * batch_size: (batch_index + 1) * batch_size]
        hist_idx = np.array([data_idx - n * self.freq for n in xrange(1, self.delay + 1)]).T
        for index_in_batch in range(batch_size):
            print "(frame %d):" % (batch_index * batch_size + index_in_batch + 1)
            print "%% of recognition for each pattern : "
            print print_prediction(data_idx, hist_idx.ravel())[1][index_in_batch]
            print "So, recognized pattern is :"
            print print_prediction(data_idx, hist_idx.ravel())[0][index_in_batch]
            print "-----------"
def __theano_init__(self):
    # Theano tensors for I/O
    X = T.lmatrix('X')
    Y = T.lvector('Y')
    N = T.lvector('N')

    # network structure
    l_in = L.layers.InputLayer(shape=(self.batch_size, self.n_gram), input_var=X)
    l_we = L.layers.EmbeddingLayer(l_in, self.vocab_size, self.word_dim, W=self.D)
    l_f1 = L.layers.DenseLayer(l_we, self.hidden_dim1, W=self.C, b=self.Cb)
    l_f2 = L.layers.DenseLayer(l_f1, self.hidden_dim2, W=self.M, b=self.Mb)
    l_out = L.layers.DenseLayer(l_f2, self.vocab_size, W=self.E, b=self.Eb, nonlinearity=None)

    # lasagne.layers.get_output produces a variable for the output of the net
    O = L.layers.get_output(l_out)  # (batch_size, vocab_size)

    lossfunc = NCE(self.batch_size, self.vocab_size, self.noise_dist, self.noise_sample_size)
    loss = lossfunc.evaluate(O, Y, N)
    # loss = T.nnet.categorical_crossentropy(O, Y).mean()

    # Retrieve all parameters from the network
    all_params = L.layers.get_all_params(l_out, trainable=True)

    # Compute Adadelta updates for training
    updates = L.updates.adadelta(loss, all_params)

    # Theano functions for training and computing cost
    self.train = theano.function([l_in.input_var, Y, N], loss, updates=updates,
                                 allow_input_downcast=True)
    self.compute_loss = theano.function([l_in.input_var, Y, N], loss,
                                        allow_input_downcast=True)
    self.weights = theano.function(inputs=[],
                                   outputs=[self.D, self.C, self.M, self.E,
                                            self.Cb, self.Mb, self.Eb])
def train_minibatch_fn(self, evaluate=False):
    """
    Initialize this Theano function once
    """
    X = T.lmatrix('X_train')
    L_x = T.lvector('L_X_train')
    Y = T.lmatrix('Y_train')
    L_y = T.lvector('L_y_train')
    learning_rate = T.dscalar('learning_rate')
    momentum = T.dscalar('momentum')
    weight_decay = T.dscalar('weight_decay')

    loss, accuracy = self.loss(X, L_x, Y, L_y, weight_decay)
    updates = self.get_sgd_updates(loss, learning_rate, momentum)

    outputs = [loss, accuracy]
    if evaluate:
        precision, recall = self.evaluate(X, L_x, Y, L_y)
        outputs = outputs + [precision, recall]

    return theano.function(
        inputs=[X, L_x, Y, L_y, learning_rate, momentum, weight_decay],
        outputs=outputs,
        updates=updates
    )
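# Hedged usage sketch (not from the original project): the function above is meant to
# be compiled once and then called for every minibatch.  The `model` object, the
# `minibatches` iterable and the hyper-parameter values below are assumptions made
# purely for illustration.
train_fn = model.train_minibatch_fn(evaluate=False)
for X_batch, L_x_batch, Y_batch, L_y_batch in minibatches:
    loss, accuracy = train_fn(X_batch, L_x_batch, Y_batch, L_y_batch,
                              0.01,   # learning_rate
                              0.9,    # momentum
                              1e-4)   # weight_decay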
def test_random_integers_vector(self):
    rng_R = random_state_type()
    low = tensor.lvector()
    high = tensor.lvector()
    post_r, out = random_integers(rng_R, low=low, high=high)
    assert out.ndim == 1
    f = compile.function([rng_R, low, high], [post_r, out],
                         accept_inplace=True)

    low_val = [100, 200, 300]
    high_val = [110, 220, 330]
    rng = numpy.random.RandomState(utt.fetch_seed())
    numpy_rng = numpy.random.RandomState(utt.fetch_seed())

    # Arguments of size (3,)
    rng0, val0 = f(rng, low_val, high_val)
    numpy_val0 = numpy.asarray([numpy_rng.random_integers(low=lv, high=hv)
                                for lv, hv in zip(low_val, high_val)])
    assert numpy.all(val0 == numpy_val0)

    # arguments of size (2,)
    rng1, val1 = f(rng0, low_val[:-1], high_val[:-1])
    numpy_val1 = numpy.asarray([numpy_rng.random_integers(low=lv, high=hv)
                                for lv, hv in zip(low_val[:-1], high_val[:-1])])
    assert numpy.all(val1 == numpy_val1)

    # Specifying the size explicitly
    g = compile.function([rng_R, low, high],
                         random_integers(rng_R, low=low, high=high, size=(3,)),
                         accept_inplace=True)
    rng2, val2 = g(rng1, low_val, high_val)
    numpy_val2 = numpy.asarray([numpy_rng.random_integers(low=lv, high=hv)
                                for lv, hv in zip(low_val, high_val)])
    assert numpy.all(val2 == numpy_val2)
    self.assertRaises(ValueError, g, rng2, low_val[:-1], high_val[:-1])
def test_random_integers_vector(self):
    random = RandomStreams(utt.fetch_seed())
    low = tensor.lvector()
    high = tensor.lvector()
    out = random.random_integers(low=low, high=high)
    assert out.ndim == 1
    f = function([low, high], out)

    low_val = [100, 200, 300]
    high_val = [110, 220, 330]
    seed_gen = numpy.random.RandomState(utt.fetch_seed())
    numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))

    # Arguments of size (3,)
    val0 = f(low_val, high_val)
    numpy_val0 = numpy.asarray([numpy_rng.randint(low=lv, high=hv + 1)
                                for lv, hv in zip(low_val, high_val)])
    assert numpy.all(val0 == numpy_val0)

    # arguments of size (2,)
    val1 = f(low_val[:-1], high_val[:-1])
    numpy_val1 = numpy.asarray([numpy_rng.randint(low=lv, high=hv + 1)
                                for lv, hv in zip(low_val[:-1], high_val[:-1])])
    assert numpy.all(val1 == numpy_val1)

    # Specifying the size explicitly
    g = function([low, high],
                 random.random_integers(low=low, high=high, size=(3,)))
    val2 = g(low_val, high_val)
    numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))
    numpy_val2 = numpy.asarray([numpy_rng.randint(low=lv, high=hv + 1)
                                for lv, hv in zip(low_val, high_val)])
    assert numpy.all(val2 == numpy_val2)
    self.assertRaises(ValueError, g, low_val[:-1], high_val[:-1])
def predict_hidden(self, dataset=None, batch_size=100):
    # compute number of minibatches for training, validation and testing
    n_test_batches = dataset.get_value(borrow=True).shape[0] / batch_size
    n_dim = dataset.get_value(borrow=True).shape[1]

    # allocate symbolic variables for the data
    index = T.lvector()       # index to a [mini]batch
    index_hist = T.lvector()  # index to history

    # print hidden layer
    [pre_sigmoid_h1, h1_mean, h1_sample] = self.sample_h_given_v(self.input, self.input_history)

    print_hidden = theano.function(
        [index, index_hist],
        [h1_sample],
        givens={
            self.input: dataset[index],
            self.input_history: dataset[index_hist].reshape((batch_size, self.delay * self.n_visible))
        },
        name='print_hidden'
    )

    # valid starting indices
    datasetindex = range(self.delay, dataset.get_value(borrow=True).shape[0])
    permindex = np.array(datasetindex)

    # For each frame in minibatch
    for batch_index in xrange(n_test_batches):
        data_idx = permindex[batch_index * batch_size: (batch_index + 1) * batch_size]
        hist_idx = np.array([data_idx - n for n in xrange(1, self.delay + 1)]).T
        for index_in_batch in range(batch_size):
            print "Hidden CRBM (frame %d):" % (batch_index * batch_size + index_in_batch + 1)
            print print_hidden(data_idx, hist_idx.ravel())[0][index_in_batch]
            print "-----------"
def pretraining_functions(self, train_set_x, batch_size, k, layer=0, static=False, with_W=False, binary=False): """Creates functions for doing CD Generates a function for performing one step of gradient descent at a given layer. The function will require as input the minibatch index, and to train an RBM you just need to iterate, calling the corresponding function on all minibatch indexes. Args: train_set_x: Shared var. that contains all datapoints used for training the RBM batch_size: int, the size of each minibatch k: number of Gibbs steps to do in CD-k / PCD-k layer: which layer of the dbn to generate functions for static: if True, ignore all temporal components with_W: Whether or not to include the W in update binary: if true, make visible layer binary Returns: CD function """ # allocate symbolic variables for the data index = T.lvector() # index to a [mini]batch index_hist = T.lvector() # index to history lr = T.dscalar() rbm = self.rbm_layers[layer] rbm.binary = binary # get the cost and the gradient corresponding to one step of CD-15 cost, updates = rbm.get_cost_updates(k=k, static=static, with_W=with_W) ################################# # Training the RBM # ################################# if static: # updates only on non-temporal components fn = theano.function( [index, lr], outputs=cost, updates=updates, givens={self.x: train_set_x[index], self.lr: lr}, name="train_tarbm_static", ) else: # updates including temporal components fn = theano.function( [index, index_hist, lr], outputs=cost, updates=updates, givens={ self.x: train_set_x[index], self.x_hist: train_set_x[index_hist].reshape((batch_size, self.delay * np.prod(self.n_ins))), self.lr: lr, }, name="train_tarbm", ) return fn
def test(model): dim = 128 v_size = 7810 margin = 1.0 #load model f = open(model, 'rb') input_params = cPickle.load(f) emb, wx, wh, bh, wa = input_params f.close() embLayer = emb_layer(pre_train=emb, v = v_size, dim = dim) rnnLayer = rnn_layer(input=None, wx=wx, wh=wh, bh=bh, emb_layer = embLayer, nh = dim) att = attention_layer(input=None, rnn_layer=rnnLayer, margin = margin) q = T.lvector('q') a = T.lscalar('a') p = T.lvector('p') t = T.lscalar('t') inputs = [q,a,p,t] score = att.predict(inputs) pred = theano.function(inputs=inputs,outputs=score) pool = ThreadPool() f = open('./data/test-small.id','r') count = 1 print 'time_b:%s' %time.clock() to_pred = [] for line in f: if count % 10000 == 0: print count / 10000 count += 1 #print 'time_b:%s' %time.clock() line = line[:-1] tmp = line.split('\t') in_q = numpy.array(tmp[0].split(' ')).astype(numpy.int) - 1 in_a = int(tmp[1].split(' ')[2]) - 1 in_p = numpy.array(tmp[1].split(' ')).astype(numpy.int) - 1 in_t = int(tmp[2]) - 1 lis = (in_q, in_a, in_p, in_t) to_pred.append(lis) #print 'time_load:%s' %time.clock() #print 'time_score:%s' %time.clock() f.close() ay = numpy.asarray(to_pred) #results = map(pred, list(ay[:,0]), list(ay[:,1]),list(ay[:,2]),list(ay[:,3])) results = pool.map(pred, to_pred) #results = [] #for p in to_pred: # results.append(att.predict(p,params)) print 'time_e:%s' %time.clock() #print results pool.close() pool.join()
def test(model): dim = 128 v_size = 7810 margin = 1.0 #load model f = open(model, 'rb') input_params = cPickle.load(f) emb, wx, wh, bh, wa = input_params f.close() embLayer = emb_layer(pre_train=emb, v = v_size, dim = dim) rnnLayer = rnn_layer(input=None, wx=wx, wh=wh, bh=bh, emb_layer = embLayer, nh = dim) att = attention_layer(input=None, rnn_layer=rnnLayer, margin = margin) q = T.lvector('q') a = T.lscalar('a') p = T.lvector('p') t = T.lscalar('t') inputs = [q,a,p,t] #emb_num = T.lscalar('emb_num') #nh = T.scalar('nh') #dim = T.scalar('dim') score = att.predict(inputs) pred = theano.function(inputs=inputs,outputs=score) wf = open('./data/res','w') f = open('./data/test.id','r') count = 1 print 'time_b:%s' %time.clock() for line in f: if count % 10000 == 0: print count / 10000 print 'time_1w:%s' %time.clock() count += 1 #print 'time_b:%s' %time.clock() line = line[:-1] tmp = line.split('\t') in_q = numpy.array(tmp[0].split(' ')).astype(numpy.int) - 1 #x = emb[q].reshape((q.shape[0], emb.shape[1])) in_a = int(tmp[1].split(' ')[2]) - 1 in_p = numpy.array(tmp[1].split(' ')).astype(numpy.int) - 1 in_t = int(tmp[2]) - 1 #in_lis = [in_q, in_a, in_p, in_t] #print 'time_load:%s' %time.clock() s = pred(in_q, in_a, in_p, in_t) #print s wf.write(str(s) + '\n') #print 'time_score:%s' %time.clock() f.close() wf.close()
def test_no_reuse():
    x = T.lvector()
    y = T.lvector()
    f = theano.function([x, y], x + y)

    # provide both inputs in the first call
    f(numpy.ones(10, dtype='int64'), numpy.ones(10, dtype='int64'))

    try:
        f(numpy.ones(10))
    except TypeError:
        return
    assert not 'should not get here'
def run_mlp(train_data, valid_data, valid_score, test_data, test_score, We_init, options):
    tmp = np.diag(np.ones(options.dim, dtype='float32'))
    W_init = np.asarray(np.concatenate((tmp, tmp), axis=0))

    g1batchindices = T.lvector()
    g2batchindices = T.lvector()
    p1batchindices = T.lvector()
    p2batchindices = T.lvector()

    # Create an instance of the MLP class
    mlp = Layer(We_init, W_init, T.tanh, options.lamda_w, options.lamda_ww)

    # compute phrase vectors
    bigram_output = theano.function([g1batchindices, g2batchindices],
                                    mlp.output(g1batchindices, g2batchindices))

    cost = squared_error(mlp, g1batchindices, g2batchindices, p1batchindices, p2batchindices)
    cost = cost + mlp.word_reg

    updates = adagrad(cost, mlp.params, learning_rate=0.005, epsilon=1e-6)

    train_model = theano.function(
        [g1batchindices, g2batchindices, p1batchindices, p2batchindices],
        cost, updates=updates)

    # compute number of minibatches for training
    batch_size = int(options.batchsize)
    n_train_batches = int(len(train_data) * 1.0 // batch_size)

    iteration = 0
    max_iteration = options.epochs
    while iteration < max_iteration:
        iteration += 1

        seed = range(len(train_data))
        random.shuffle(seed)
        train_data = [train_data[i] for i in seed]

        score = valid_model(bigram_output, valid_data, valid_score)
        accuracy = test_model(bigram_output, test_data, test_score)
        print "iteration: {0} valid_score: {1} test_score: {2}".format(iteration, score[0], accuracy[0])

        for minibatch_index in range(n_train_batches):
            train_data_batch = train_data[minibatch_index * batch_size: (minibatch_index + 1) * batch_size]

            train_data_batch_x1 = [i[0][0] for i in train_data_batch]
            train_data_batch_x2 = [i[0][1] for i in train_data_batch]
            train_data_batch_y1 = [i[1][0] for i in train_data_batch]
            train_data_batch_y2 = [i[1][1] for i in train_data_batch]

            train_model(train_data_batch_x1, train_data_batch_x2,
                        train_data_batch_y1, train_data_batch_y2)
def propup(self, data, layer=0, static=False): """ propogate the activity through layer 0 to the hidden layer and return an array of [2, samples, dimensions] where the first 2 dimensions are [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)] so far only works for the first rbm layer """ if not isinstance(data, theano.tensor.sharedvar.TensorSharedVariable): data = theano.shared(data) # allocate symbolic variables for the data index = T.lvector() # index to a [mini]batch index_hist = T.lvector() # index to history rbm = self.rbm_layers[layer] ################################# # Training the CRBM # ################################# # get the cost and the gradient corresponding to one step of CD-15 [pre_sig, post_sig] = rbm.propup(static) if static: # the purpose of train_crbm is solely to update the CRBM parameters fn = theano.function([], outputs=[pre_sig, post_sig], givens={self.x: data}, name="propup_tarbm_static") return np.array(fn()) else: # indexing is slightly complicated # build a linear index to the starting frames for this batch # (i.e. time t) gives a batch_size length array for data data_idx = np.arange(self.delay, data.get_value(borrow=True).shape[0]) # now build a linear index to the frames at each delay tap # (i.e. time t-1 to t-delay) # gives a batch_size x delay array of indices for history hist_idx = np.array([data_idx - n for n in xrange(1, self.delay + 1)]).T # the purpose of train_crbm is solely to update the CRBM parameters fn = theano.function( [index, index_hist], outputs=[pre_sig, post_sig], givens={ self.x: data[index], self.x_hist: data[index_hist].reshape((len(data_idx), self.delay * np.prod(self.n_ins))), }, name="train_tarbm", ) return np.array(fn(data_idx, hist_idx.ravel()))
def _compile_bp(self):
    '''
    compile backpropagation for each of the dqns.
    '''
    self.bprop_by_goal = {}
    for (goal, dqn) in self.dqn_by_goal.items():
        states = dqn.states
        action_values = dqn.action_values
        params = dqn.params
        targets = T.vector('target')
        last_actions = T.lvector('action')

        # loss function.
        mse = layers.MSE(action_values[T.arange(action_values.shape[0]), last_actions],
                         targets)
        # l2 penalty.
        l2_penalty = 0.
        for param in params:
            l2_penalty += (param ** 2).sum()

        cost = mse + self.l2_reg * l2_penalty

        # back propagation.
        updates = optimizers.Adam(cost, params, alpha=self.lr)

        td_errors = T.sqrt(mse)
        self.bprop_by_goal[goal] = theano.function(inputs=[states, last_actions, targets],
                                                   outputs=td_errors,
                                                   updates=updates)
def test_softmax_optimizations_w_bias2(self):
    x = tensor.matrix('x')
    b = tensor.vector('b')
    c = tensor.vector('c')
    one_of_n = tensor.lvector('one_of_n')
    op = crossentropy_categorical_1hot

    env = gof.Env(
        [x, b, c, one_of_n],
        [op(softmax(T.add(x, b, c)), one_of_n)])
    assert env.outputs[0].owner.op == op

    print 'BEFORE'
    for node in env.toposort():
        print node.op
    print '----'

    theano.compile.mode.optdb.query(
        theano.compile.mode.OPT_FAST_RUN).optimize(env)

    print 'AFTER'
    for node in env.toposort():
        print node.op
    print '===='

    assert len(env.toposort()) == 3
    assert str(env.outputs[0].owner.op) == 'OutputGuard'
    assert env.outputs[0].owner.inputs[0].owner.op == crossentropy_softmax_argmax_1hot_with_bias
def test_binomial_vector(self):
    rng_R = random_state_type()
    n = tensor.lvector()
    prob = tensor.vector()
    post_r, out = binomial(rng_R, n=n, p=prob)
    assert out.ndim == 1
    f = compile.function([rng_R, n, prob], [post_r, out],
                         accept_inplace=True)

    n_val = [1, 2, 3]
    prob_val = numpy.asarray([.1, .2, .3], dtype=config.floatX)
    rng = numpy.random.RandomState(utt.fetch_seed())
    numpy_rng = numpy.random.RandomState(utt.fetch_seed())

    # Arguments of size (3,)
    rng0, val0 = f(rng, n_val, prob_val)
    numpy_val0 = numpy_rng.binomial(n=n_val, p=prob_val)
    assert numpy.all(val0 == numpy_val0)

    # arguments of size (2,)
    rng1, val1 = f(rng0, n_val[:-1], prob_val[:-1])
    numpy_val1 = numpy_rng.binomial(n=n_val[:-1], p=prob_val[:-1])
    assert numpy.all(val1 == numpy_val1)

    # Specifying the size explicitly
    g = compile.function([rng_R, n, prob],
                         binomial(rng_R, n=n, p=prob, size=(3,)),
                         accept_inplace=True)
    rng2, val2 = g(rng1, n_val, prob_val)
    numpy_val2 = numpy_rng.binomial(n=n_val, p=prob_val, size=(3,))
    assert numpy.all(val2 == numpy_val2)
    self.assertRaises(ValueError, g, rng2, n_val[:-1], prob_val[:-1])
def GRU_question(self, dimension_fact_embedding, num_hidden_units_questions, num_hidden_units_episodes, max_question_len, dimension_word_embeddings): self.question_idxs = T.lmatrix("question_indices") # as many columns as words in the context window and as many lines as words in the sentence self.question_mask = T.lvector("question_mask") q = self.emb[self.question_idxs].reshape((self.question_idxs.shape[0], dimension_word_embeddings)) # x basically represents the embeddings of the words IN the current sentence. So it is shape def slice_w(x, n): return x[n*num_hidden_units_questions:(n+1)*num_hidden_units_questions] def question_gru_recursion(x_cur, h_prev, q_mask): W_in_stacked = T.concatenate([self.W_question_reset_gate_x, self.W_question_update_gate_x, self.W_question_hidden_gate_x], axis=1) W_hid_stacked = T.concatenate([self.W_question_reset_gate_h, self.W_question_update_gate_h, self.W_question_hidden_gate_h], axis=1) input_n = T.dot(x_cur, W_in_stacked) hid_input = T.dot(h_prev, W_hid_stacked) resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0) updategate = slice_w(hid_input, 1) + slice_w(input_n, 1) resetgate = T.tanh(resetgate) updategate = T.tanh(updategate) hidden_update = slice_w(input_n, 2) + resetgate * slice_w(hid_input, 2) hidden_update = T.tanh(hidden_update) h_cur = (1 - updategate) * hidden_update + updategate * hidden_update h_cur = q_mask * h_cur + (1 - q_mask) * h_prev # h_cur = T.tanh(T.dot(self.W_fact_to_hidden, x_cur) + T.dot(self.W_hidden_to_hidden, h_prev)) return h_cur state = self.h0_questions for jdx in range(max_question_len): state = question_gru_recursion(q[jdx], state, self.question_mask[jdx]) return T.tanh(T.dot(state, self.W_question_to_vector) + self.b_question_to_vector)
def test_multinomial_vector(self):
    rng_R = random_state_type()
    n = tensor.lvector()
    pvals = tensor.matrix()
    post_r, out = multinomial(rng_R, n=n, pvals=pvals)
    assert out.ndim == 2
    f = compile.function([rng_R, n, pvals], [post_r, out],
                         accept_inplace=True)

    n_val = [1, 2, 3]
    pvals_val = [[0.1, 0.9], [0.2, 0.8], [0.3, 0.7]]
    pvals_val = numpy.asarray(pvals_val, dtype=config.floatX)
    rng = numpy.random.RandomState(utt.fetch_seed())
    numpy_rng = numpy.random.RandomState(utt.fetch_seed())

    # Arguments of size (3,)
    rng0, val0 = f(rng, n_val, pvals_val)
    numpy_val0 = numpy.asarray([numpy_rng.multinomial(n=nv, pvals=pv)
                                for nv, pv in zip(n_val, pvals_val)])
    assert numpy.all(val0 == numpy_val0)

    # arguments of size (2,)
    rng1, val1 = f(rng0, n_val[:-1], pvals_val[:-1])
    numpy_val1 = numpy.asarray(
        [numpy_rng.multinomial(n=nv, pvals=pv)
         for nv, pv in zip(n_val[:-1], pvals_val[:-1])]
    )
    assert numpy.all(val1 == numpy_val1)

    # Specifying the size explicitly
    g = compile.function([rng_R, n, pvals],
                         multinomial(rng_R, n=n, pvals=pvals, size=(3,)),
                         accept_inplace=True)
    rng2, val2 = g(rng1, n_val, pvals_val)
    numpy_val2 = numpy.asarray([numpy_rng.multinomial(n=nv, pvals=pv)
                                for nv, pv in zip(n_val, pvals_val)])
    assert numpy.all(val2 == numpy_val2)
    self.assertRaises(ValueError, g, rng2, n_val[:-1], pvals_val[:-1])
def test_optimize_xent_vector2(self): verbose = 0 mode = theano.compile.mode.get_default_mode() if mode == theano.compile.mode.get_mode('FAST_COMPILE'): mode = 'FAST_RUN' rng = numpy.random.RandomState(utt.fetch_seed()) x_val = rng.randn(5) b_val = rng.randn(5) y_val = numpy.asarray([2]) x = T.dvector('x') b = T.dvector('b') y = T.lvector('y') def print_graph(func): for i, node in enumerate(func.maker.fgraph.toposort()): print i, node # Last node should be the output print i, printing.pprint(node.outputs[0]) print ## Test that a biased softmax is optimized correctly bias_expressions = [ T.sum(-T.log(softmax(x + b)[T.arange(y.shape[0]), y])), -T.sum(T.log(softmax(b + x)[T.arange(y.shape[0]), y])), -T.sum(T.log(softmax(x + b))[T.arange(y.shape[0]), y]), T.sum(-T.log(softmax(b + x))[T.arange(y.shape[0]), y])] for expr in bias_expressions: f = theano.function([x, b, y], expr, mode=mode) if verbose: print_graph(f) try: prev, last = f.maker.fgraph.toposort()[-2:] assert len(f.maker.fgraph.toposort()) == 3 # [big_op, sum, dim_shuffle] f(x_val, b_val, y_val) except Exception: theano.printing.debugprint(f) raise backup = config.warn.sum_div_dimshuffle_bug config.warn.sum_div_dimshuffle_bug = False try: g = theano.function([x, b, y], T.grad(expr, x), mode=mode) finally: config.warn.sum_div_dimshuffle_bug = backup if verbose: print_graph(g) try: ops = [node.op for node in g.maker.fgraph.toposort()] assert len(ops) <= 6 assert crossentropy_softmax_1hot_with_bias_dx in ops assert softmax_with_bias in ops assert softmax_grad not in ops g(x_val, b_val, y_val) except Exception: theano.printing.debugprint(g) raise
def __init__(self, model):
    """
    Initialize the stochastic block model for the adjacency matrix
    """
    self.model = model
    self.prms = model['network']['graph']
    self.N = model['N']

    # SBM has R latent clusters
    self.R = self.prms['R']

    # A RxR matrix of connection probabilities per pair of clusters
    self.B = T.dmatrix('B')

    # SBM has a latent block or cluster assignment for each node
    self.Y = T.lvector('Y')

    # For indexing, we also need Y as a column vector and tiled matrix
    self.Yv = T.reshape(self.Y, [self.N, 1])
    self.Ym = T.tile(self.Yv, [1, self.N])
    self.pA = self.B[self.Ym, T.transpose(self.Ym)]

    # A probability of each cluster
    self.alpha = T.dvector('alpha')

    # Hyperparameters governing B and alpha
    self.b0 = self.prms['b0']
    self.b1 = self.prms['b1']
    self.alpha0 = self.prms['alpha0']

    # Define complete adjacency matrix
    self.A = T.bmatrix('A')

    # Define log probability
    log_p_B = T.sum((self.b0 - 1) * T.log(self.B) + (self.b1 - 1) * T.log(1 - self.B))
    log_p_alpha = T.sum((self.alpha0 - 1) * T.log(self.alpha))
    log_p_A = T.sum(self.A * T.log(self.pA) + (1 - self.A) * T.log(1 - self.pA))

    self.log_p = log_p_B + log_p_alpha + log_p_A
def test_multinomial_vector(self):
    random = RandomStreams(utt.fetch_seed())
    n = tensor.lvector()
    pvals = tensor.matrix()
    out = random.multinomial(n=n, pvals=pvals)
    assert out.ndim == 2
    f = function([n, pvals], out)

    n_val = [1, 2, 3]
    pvals_val = [[.1, .9], [.2, .8], [.3, .7]]
    pvals_val = numpy.asarray(pvals_val, dtype=config.floatX)
    seed_gen = numpy.random.RandomState(utt.fetch_seed())
    numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))

    # Arguments of size (3,)
    val0 = f(n_val, pvals_val)
    numpy_val0 = numpy.asarray([numpy_rng.multinomial(n=nv, pvals=pv)
                                for nv, pv in zip(n_val, pvals_val)])
    assert numpy.all(val0 == numpy_val0)

    # arguments of size (2,)
    val1 = f(n_val[:-1], pvals_val[:-1])
    numpy_val1 = numpy.asarray([numpy_rng.multinomial(n=nv, pvals=pv)
                                for nv, pv in zip(n_val[:-1], pvals_val[:-1])])
    assert numpy.all(val1 == numpy_val1)

    # Specifying the size explicitly
    g = function([n, pvals], random.multinomial(n=n, pvals=pvals, size=(3,)))
    val2 = g(n_val, pvals_val)
    numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))
    numpy_val2 = numpy.asarray([numpy_rng.multinomial(n=nv, pvals=pv)
                                for nv, pv in zip(n_val, pvals_val)])
    assert numpy.all(val2 == numpy_val2)
    self.assertRaises(ValueError, g, n_val[:-1], pvals_val[:-1])
def test_illegal_things(self):
    i0 = TT.iscalar()
    i1 = TT.lvector()
    i2 = TT.bmatrix()

    self.failUnlessRaises(TypeError, FAS, [i1, slice(None, i2, -1), i0])
    self.failUnlessRaises(TypeError, FAS, [i1, slice(None, None, i2), i0])
    self.failUnlessRaises(TypeError, FAS, [i1, slice(i2, None, -1), i0])
def test_binomial_vector(self):
    random = RandomStreams(utt.fetch_seed())
    n = tensor.lvector()
    prob = tensor.vector()
    out = random.binomial(n=n, p=prob)
    assert out.ndim == 1
    f = function([n, prob], out)

    n_val = [1, 2, 3]
    prob_val = numpy.asarray([.1, .2, .3], dtype=config.floatX)
    seed_gen = numpy.random.RandomState(utt.fetch_seed())
    numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))

    # Arguments of size (3,)
    val0 = f(n_val, prob_val)
    numpy_val0 = numpy_rng.binomial(n=n_val, p=prob_val)
    assert numpy.all(val0 == numpy_val0)

    # arguments of size (2,)
    val1 = f(n_val[:-1], prob_val[:-1])
    numpy_val1 = numpy_rng.binomial(n=n_val[:-1], p=prob_val[:-1])
    assert numpy.all(val1 == numpy_val1)

    # Specifying the size explicitly
    g = function([n, prob], random.binomial(n=n, p=prob, size=(3,)))
    val2 = g(n_val, prob_val)
    numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))
    numpy_val2 = numpy_rng.binomial(n=n_val, p=prob_val, size=(3,))
    assert numpy.all(val2 == numpy_val2)
    self.assertRaises(ValueError, g, n_val[:-1], prob_val[:-1])
def test_bug_2009_06_02_trac_387():
    y = tensor.lvector('y')
    f = theano.function([y],
                        tensor.int_div(
                            tensor.DimShuffle(y[0].broadcastable, ['x'])(y[0]), 2))
    sys.stdout.flush()
    print(f(numpy.ones(1, dtype='int64') * 3))
def train_rnn():
    rng = numpy.random.RandomState(1234)

    q = T.lvector("q")
    pos = T.lscalar("pos")
    neg = T.lscalar("neg")
    inputs = [q, pos, neg]

    embLayer = emb_layer(None, 100, 5)
    rnn = rnn_layer(input=inputs, emb_layer=embLayer, nh=5)
    cost = rnn.loss()
    gradient = T.grad(cost, rnn.params)

    lr = 0.001
    updates = OrderedDict((p, p - lr * g) for p, g in zip(rnn.params, gradient))
    train = theano.function(inputs=[q, pos, neg], outputs=cost, updates=updates)

    print rnn.emb.eval()[0]
    e0 = rnn.emb.eval()
    for i in range(0, 3):
        idq = rng.randint(size=10, low=0, high=100)
        idpos = rng.random_integers(100)
        idneg = rng.random_integers(100)
        train(idq, idpos, idneg)
        rnn.normalize()
    print rnn.emb.eval() - e0
def test_grad_types(self): # This function simply tests the behaviour of the AbstractConv # Ops, not their optimizations cpu_input = tensor.ftensor4() cpu_filters = tensor.ftensor4() cpu_topgrad = tensor.ftensor4() gpu_input = gpu_ftensor4() gpu_filters = gpu_ftensor4() gpu_topgrad = gpu_ftensor4() out_shape = tensor.lvector() # Check the gradient of the forward conv2d for input, filters in itertools.product((cpu_input, gpu_input), (cpu_filters, gpu_filters)): output = conv.conv2d(input, filters) grad_input, grad_filters = theano.grad(output.sum(), wrt=(input, filters)) assert grad_input.type == input.type, (grad_input, grad_input.type, input, input.type) assert grad_filters.type == filters.type, (grad_filters, grad_filters.type, filters, filters.type) # Check the gradient of gradweight for input, topgrad in itertools.product((cpu_input, gpu_input), (cpu_topgrad, gpu_topgrad)): grad_filters = conv.AbstractConv2d_gradWeights()(input, topgrad, out_shape) grad_input, grad_topgrad = theano.grad(grad_filters.sum(), wrt=(input, topgrad)) assert grad_input.type == input.type, (grad_input, grad_input.type, input, input.type) assert grad_topgrad.type == topgrad.type, (grad_topgrad, grad_topgrad.type, topgrad, topgrad.type) # Check the gradient of gradinputs for filters, topgrad in itertools.product((cpu_filters, gpu_filters), (cpu_topgrad, gpu_topgrad)): grad_input = conv.AbstractConv2d_gradInputs()(filters, topgrad, out_shape) grad_filters, grad_topgrad = theano.grad(grad_input.sum(), wrt=(filters, topgrad)) assert grad_filters.type == filters.type, (grad_filters, grad_filters.type, filters, filters.type) assert grad_topgrad.type == topgrad.type, (grad_topgrad, grad_topgrad.type, topgrad, topgrad.type)
def __init__(self, config=None, defaults=defaults, inputs_hook=None, hiddens_hook=None,
             params_hook=None, use_data_layer=None, rand_crop=None, batch_size=None):
    # combine everything by passing to Model's init
    super(AlexNet, self).__init__(**{arg: val for (arg, val) in locals().iteritems()
                                     if arg is not 'self'})
    # configs can now be accessed through self dictionary

    if self.inputs_hook or self.hiddens_hook or self.params_hook:
        log.error("Inputs_hook, hiddens_hook, and params_hook not implemented yet for AlexNet!")

    self.flag_datalayer = self.use_data_layer

    ####################
    # Theano variables #
    ####################
    # allocate symbolic variables for the data
    # 'rand' is a random array used for random cropping/mirroring of data
    self.x = T.ftensor4('x')
    self.y = T.lvector('y')
    self.rand = T.fvector('rand')

    ##########
    # params #
    ##########
    self.params = []

    # make the network!
    self.build_computation_graph()
def train_dA(lr=0.1, training_epochs=15, params_dict=False, print_every=100, data=None):
    x = T.lvector('x')
    input_size = T.scalar(dtype='int64')

    dA = make_dA(params=params_dict, input_size=input_size, data=x)
    cost, updates, output = dA.get_cost_updates(lr=lr)
    model = theano.function(
        [x],
        [cost, output],
        updates=updates,
        givens={input_size: x.shape[0]}
    )

    start_time = time.clock()
    for epoch in xrange(training_epochs):
        cost_history = []
        for index in range(len(data)):
            cost, predict = model(data[index])
            cost_history.append(cost)
            if index % print_every == 0:
                print 'Iteration %d, cost %f' % (index, cost)
                print predict
        print 'Training epoch %d, cost ' % epoch, numpy.mean(cost_history)

    training_time = (time.clock() - start_time)
    print 'Finished training %d epochs, took %d seconds' % (training_epochs, training_time)

    return cost_history, dA.get_params(), model
def test_softmax_optimizations_w_bias_vector(self):
    x = tensor.vector('x')
    b = tensor.vector('b')
    one_of_n = tensor.lvector('one_of_n')
    op = crossentropy_categorical_1hot

    fgraph = gof.FunctionGraph(
        [x, b, one_of_n],
        [op(softmax(x + b), one_of_n)])
    assert fgraph.outputs[0].owner.op == op

    # print 'BEFORE'
    # for node in fgraph.toposort():
    #     print node.op
    #     print printing.pprint(node.outputs[0])
    # print '----'

    theano.compile.mode.optdb.query(
        theano.compile.mode.OPT_FAST_RUN).optimize(fgraph)

    # print 'AFTER'
    # for node in fgraph.toposort():
    #     print node.op
    # print '===='

    assert len(fgraph.toposort()) == 3
    assert str(fgraph.outputs[0].owner.op) == 'OutputGuard'
    assert (fgraph.outputs[0].owner.inputs[0].owner.op ==
            crossentropy_softmax_argmax_1hot_with_bias)
def __init__(self, height, width, channels, timesteps, hidden_size, output_size,
             batch_size, num_convpools=2, num_filters=5):
    NeuralNet.__init__(self)
    self.batch_size = batch_size

    tensor5 = T.TensorType('float64', [False] * 5)
    self.x1 = tensor5('inputs')
    self.y = T.lvector('targets')
    self.layers = []

    n_batch, n_steps, n_channels, width, height = (batch_size, timesteps, channels, width, height)
    n_out_filters = 7
    filter_shape = (3, 3)

    l_in = lasagne.layers.InputLayer(
        (None, n_steps, n_channels, width, height), input_var=self.x1)
    l_in_to_hid = lasagne.layers.Conv2DLayer(
        lasagne.layers.InputLayer((None, n_channels, width, height)),
        n_out_filters, filter_shape, pad='same')
    l_hid_to_hid = lasagne.layers.Conv2DLayer(
        lasagne.layers.InputLayer(l_in_to_hid.output_shape),
        n_out_filters, filter_shape, pad='same')
    l_rec = lasagne.layers.CustomRecurrentLayer(l_in, l_in_to_hid, l_hid_to_hid)
    l_reshape = lasagne.layers.ReshapeLayer(l_rec, (-1, np.prod(l_rec.output_shape[2:])))
    l_out = lasagne.layers.DenseLayer(
        l_reshape, num_units=output_size,
        nonlinearity=lasagne.nonlinearities.linear)

    self.layers = [l_in, l_rec, l_out]
    self.network = l_out
    self.initiliaze(mode='classify')
def optimization_sgd(trainvec, testvec, n_epochs, batch_size, alpha=0.01, beta=0.05): i = T.lvector('i') j = T.lvector('j') x = T.dvector('x') num_user = 6040 num_item = 3952 factors = 20 init_mean = 0 init_stdev = 0.02 mfobj = MF_Batch(i, j, num_user, num_item, factors, init_mean, init_stdev) regcost, error = mfobj.errors(x, beta) gp, gq = T.grad(cost=regcost, wrt=[mfobj.P, mfobj.Q]) updates = [(mfobj.P, T.inc_subtensor(mfobj.P[i, :], -gp[i, :] * alpha)), (mfobj.Q, T.inc_subtensor(mfobj.Q[j, :], -gq[j, :] * alpha))] train_model = theano.function( inputs=[i, j, x], #givens=[(mfobj.P[i, :]), mfobj.Q[:, j]], outputs=regcost, updates=updates) test_model = theano.function( inputs=[i, j, x], #givens=[(mfobj.P[i, :]), mfobj.Q[:, j]], outputs=error) mean_rating = np.mean(trainvec[:, 2]) done_looping = False epoch = 0 N = len(trainvec) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 totalErrors = 0 testErrors = 0 for k in range(int(math.floor(N / batch_size))): batch = np.arange(k * batch_size, min(N - 1, (k + 1) * batch_size)) idi = trainvec[batch, 0] - 1 idj = trainvec[batch, 1] - 1 ratings = trainvec[batch, 2] - mean_rating minibatch_cost = train_model(idi, idj, ratings) totalErrors += minibatch_cost NN = len(testvec) batch_size = 1000 for k in range(int(math.floor(NN / batch_size))): batch = np.arange(k * batch_size, min(NN - 1, (k + 1) * batch_size)) p_idx = testvec[batch, 0] - 1 q_idx = testvec[batch, 1] - 1 ratings = testvec[batch, 2] - mean_rating testErrors += test_model(p_idx, q_idx, ratings) print( "the training cost at epoch {} is {}, and the testing error is {}". format(epoch, np.sqrt(totalErrors / N), np.sqrt(testErrors / NN))) # test it on the test dataset NN = len(testvec) batch_size = 1000 diff = 0 for k in range(int(math.floor(NN / batch_size))): batch = np.arange(k * batch_size, min(NN - 1, (k + 1) * batch_size)) p_idx = testvec[batch, 0] - 1 q_idx = testvec[batch, 1] - 1 ratings = testvec[batch, 2] - mean_rating diff += test_model(p_idx, q_idx, ratings) print("Total average test error for {} instances is {}".format( NN, np.sqrt(diff / NN)))
def train_model(new_training_job, config, save_path, params, fast_start, fuel_server, seed): c = config if seed: fuel.config.default_seed = seed blocks.config.config.default_seed = seed data, model = initialize_data_and_model(config, train_phase=True) # full main loop can be saved... main_loop_path = os.path.join(save_path, 'main_loop.tar') # or only state (log + params) which can be useful not to pickle embeddings state_path = os.path.join(save_path, 'training_state.tar') stream_path = os.path.join(save_path, 'stream.pkl') best_tar_path = os.path.join(save_path, "best_model.tar") keys = tensor.lmatrix('keys') n_identical_keys = tensor.lvector('n_identical_keys') words = tensor.ltensor3('words') words_mask = tensor.matrix('words_mask') if theano.config.compute_test_value != 'off': #TODO test_value_data = next( data.get_stream('train', batch_size=4, max_length=5).get_epoch_iterator()) words.tag.test_value = test_value_data[0] words_mask.tag.test_value = test_value_data[1] if use_keys(c) and use_n_identical_keys(c): costs = model.apply(words, words_mask, keys, n_identical_keys, train_phase=True) elif use_keys(c): costs = model.apply(words, words_mask, keys, train_phase=True) else: costs = model.apply(words, words_mask, train_phase=True) cost = rename(costs.mean(), 'mean_cost') cg = Model(cost) if params: logger.debug("Load parameters from {}".format(params)) with open(params) as src: cg.set_parameter_values(load_parameters(src)) length = rename(words.shape[1], 'length') perplexity, = VariableFilter(name='perplexity')(cg) monitored_vars = [length, cost, perplexity] if c['proximity_coef']: proximity_term, = VariableFilter(name='proximity_term')(cg) monitored_vars.append(proximity_term) print "inputs of the model:", cg.inputs parameters = cg.get_parameter_dict() trained_parameters = parameters.values() saved_parameters = parameters.values() if c['embedding_path']: if c['freeze_pretrained']: logger.debug( "Exclude pretrained encoder embeddings from the trained parameters" ) to_freeze = 'main' elif c['provide_targets']: logger.debug( "Exclude pretrained targets from the trained parameters") to_freeze = 'target' trained_parameters = [ p for p in trained_parameters if not p == model.get_def_embeddings_params(to_freeze) ] saved_parameters = [ p for p in saved_parameters if not p == model.get_def_embeddings_params(to_freeze) ] logger.info("Cost parameters" + "\n" + pprint.pformat([ " ".join( (key, str(parameters[key].get_value().shape), 'trained' if parameters[key] in trained_parameters else 'frozen')) for key in sorted(parameters.keys()) ], width=120)) rules = [] if c['grad_clip_threshold']: rules.append(StepClipping(c['grad_clip_threshold'])) rules.append(Adam(learning_rate=c['learning_rate'], beta1=c['momentum'])) algorithm = GradientDescent(cost=cost, parameters=trained_parameters, step_rule=CompositeRule(rules)) train_monitored_vars = list(monitored_vars) if c['grad_clip_threshold']: train_monitored_vars.append(algorithm.total_gradient_norm) if c['monitor_parameters']: train_monitored_vars.extend(parameter_stats(parameters, algorithm)) # We use a completely random seed on purpose. With Fuel server # it's currently not possible to restore the state of the training # stream. That's why it's probably better to just have it stateless. 
stream_seed = numpy.random.randint(0, 10000000) if fuel_server else None training_stream = data.get_stream( 'train', batch_size=c['batch_size'], max_length=c['max_length'], seed=stream_seed, remove_keys=not use_keys(c), remove_n_identical_keys=not use_n_identical_keys(c)) print "trainin_stream will contains sources:", training_stream.sources original_training_stream = training_stream if fuel_server: # the port will be configured by the StartFuelServer extension training_stream = ServerDataStream( sources=training_stream.sources, produces_examples=training_stream.produces_examples) validate = c['mon_freq_valid'] > 0 if validate: valid_stream = data.get_stream( 'valid', batch_size=c['batch_size_valid'], max_length=c['max_length'], seed=stream_seed, remove_keys=not use_keys(c), remove_n_identical_keys=not use_n_identical_keys(c)) validation = DataStreamMonitoring( monitored_vars, valid_stream, prefix="valid").set_conditions(before_first_epoch=not fast_start, on_resumption=True, every_n_batches=c['mon_freq_valid']) track_the_best = TrackTheBest(validation.record_name(cost), choose_best=min).set_conditions( on_resumption=True, after_epoch=True, every_n_batches=c['mon_freq_valid']) # don't save them the entire main loop to avoid pickling everything if c['fast_checkpoint']: cp_path = state_path load = (LoadNoUnpickling(cp_path, load_iteration_state=True, load_log=True).set_conditions( before_training=not new_training_job)) cp_args = { 'save_main_loop': False, 'save_separately': ['log', 'iteration_state'], 'parameters': saved_parameters } else: cp_path = main_loop_path load = (Load(cp_path, load_iteration_state=True, load_log=True).set_conditions( before_training=not new_training_job)) cp_args = { 'save_separately': ['iteration_state'], 'parameters': saved_parameters } checkpoint = Checkpoint(cp_path, before_training=not fast_start, every_n_batches=c['save_freq_batches'], after_training=not fast_start, **cp_args) if c['checkpoint_every_n_batches'] > 0 or c[ 'checkpoint_every_n_epochs'] > 0: intermediate_cp = IntermediateCheckpoint( cp_path, every_n_epochs=c['checkpoint_every_n_epochs'], every_n_batches=c['checkpoint_every_n_batches'], after_training=False, **cp_args) if validate: checkpoint = checkpoint.add_condition( ['after_batch', 'after_epoch'], OnLogRecord(track_the_best.notification_name), (best_tar_path, )) extensions = [ load, StartFuelServer(original_training_stream, stream_path, before_training=fuel_server), Timing(every_n_batches=c['mon_freq_train']) ] extensions.extend([ TrainingDataMonitoring(train_monitored_vars, prefix="train", every_n_batches=c['mon_freq_train']), ]) if validate: extensions.extend([validation, track_the_best]) extensions.append(checkpoint) if c['checkpoint_every_n_batches'] > 0 or c[ 'checkpoint_every_n_epochs'] > 0: extensions.append(intermediate_cp) extensions.extend( [Printing(on_resumption=True, every_n_batches=c['mon_freq_train'])]) if validate and c['n_valid_early'] > 0: extensions.append( FinishIfNoImprovementAfter(track_the_best.notification_name, iterations=c['n_valid_early'] * c['mon_freq_valid'], every_n_batches=c['mon_freq_valid'])) extensions.append(FinishAfter(after_n_epochs=c['n_epochs'])) logger.info("monitored variables during training:" + "\n" + pprint.pformat(train_monitored_vars, width=120)) logger.info("monitored variables during valid:" + "\n" + pprint.pformat(monitored_vars, width=120)) main_loop = MainLoop(algorithm, training_stream, model=Model(cost), extensions=extensions) main_loop.run()
def _construct_mlp(datasets, learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, batch_size=20, n_hidden=200): """ Demonstrate stochastic gradient descent optimization for a multilayer perceptron Note: Parameters need tuning. :type datasets: tuple :param datasets: (inputs, targets) :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient :type L1_reg: float :param L1_reg: L1-norm's weight when added to the cost (see regularization) :type L2_reg: float :param L2_reg: L2-norm's weight when added to the cost (see regularization) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type batch_size: int :param batch_size: number of examples in one batch :type n_hidden: int :param n_hidden: number of hidden units to be used in class HiddenLayer """ inputs, targets = datasets temp_train_set_x = [] temp_train_set_y = [] train_set_x = [] train_set_y = [] valid_set_x = [] valid_set_y = [] test_set_x = [] test_set_y = [] # stratified k-fold to split test and temporary train, which contains # validation and train skf = StratifiedShuffleSplit(targets, 1, 0.2) for temp_train_index, test_index in skf: # print("TEMP_TRAIN:", temp_train_index, "TEST:", test_index) temp_train_set_x.append(inputs[temp_train_index]) temp_train_set_y.append(targets[temp_train_index]) test_set_x.append(inputs[test_index]) test_set_y.append(targets[test_index]) # convert from list-wrapping array to array test_set_x = test_set_x[0] test_set_y = test_set_y[0] temp_train_set_x = temp_train_set_x[0] temp_train_set_y = temp_train_set_y[0] # stratified k-fold to split valid and train skf = StratifiedShuffleSplit(temp_train_set_y, 1, 0.25) for train_index, valid_index in skf: # print("TRAIN: ", train_index, ", VALID: ", valid_index) train_set_x.append(temp_train_set_x[train_index]) train_set_y.append(temp_train_set_y[train_index]) valid_set_x.append(temp_train_set_x[valid_index]) valid_set_y.append(temp_train_set_y[valid_index]) # convert from list-wrapping array to array train_set_x = train_set_x[0] train_set_y = train_set_y[0] valid_set_x = valid_set_x[0] valid_set_y = valid_set_y[0] # check shape # print("train_set_x shape: " + str(train_set_x.shape)) # print("train_set_y shape: " + str(train_set_y.shape)) # print("valid_set_x shape: " + str(valid_set_x.shape)) # print("valid_set_y shape: " + str(valid_set_y.shape)) # print("test_set_x shape: " + str(test_set_x.shape)) # print("test_set_y shape: " + str(test_set_y.shape)) # convert to theano.shared variable train_set_x = theano.shared(value=train_set_x, name='train_set_x') train_set_y = theano.shared(value=train_set_y, name='train_set_y') valid_set_x = theano.shared(value=valid_set_x, name='valid_set_x') valid_set_y = theano.shared(value=valid_set_y, name='valid_set_y') test_set_x = theano.shared(value=test_set_x, name='test_set_x') test_set_y = theano.shared(value=test_set_y, name='test_set_y') # compute number of minibatches for training, validation and testing n_train_batches = int(train_set_x.get_value().shape[0] / batch_size) n_valid_batches = int(valid_set_x.get_value().shape[0] / batch_size) n_test_batches = int(test_set_x.get_value().shape[0] / batch_size) # check batch # print("n_train_batches:" + str(n_train_batches)) # print("n_valid_batches:" + str(n_valid_batches)) # print("n_test_batches:" + str(n_test_batches)) print('... 
building the model') # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images y = T.lvector('y') # the labels are presented as 1D vector of [int] labels # set a random state that is related to the time # noinspection PyUnresolvedReferences rng = numpy.random.RandomState(int((time.time()))) # construct the MLP class classifier = MLP(rng=rng, input_=x, n_in=_std_height * _std_width, n_hidden=n_hidden, n_out=len(_captcha_provider.chars)) # the cost we minimize during training is the negative log likelihood of # the model plus the regularization terms (L1 and L2); cost is expressed # here symbolically cost = (classifier.negative_log_likelihood(y) + L1_reg * classifier.L1 + L2_reg * classifier.L2_sqr) # compiling a Theano function that computes the mistakes that are made # by the model on a minibatch test_model = theano.function( inputs=[index], outputs=classifier.errors(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }, mode='FAST_RUN') validate_model = theano.function( inputs=[index], outputs=classifier.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }, mode='FAST_RUN') # compute the gradient of cost with respect to theta (sorted in params) # the resulting gradients will be stored in a list gparams gparams = [T.grad(cost, param) for param in classifier.params] # specify how to update the parameters of the model as a list of # (variable, update expression) pairs # given two lists of the same length, A = [a1, a2, a3, a4] and # B = [b1, b2, b3, b4], zip generates a list C of same size, where each # element is a pair formed from the two lists : # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)] updates = [(param, param - learning_rate * gparam) for param, gparam in zip(classifier.params, gparams)] # compiling a Theano function `train_model` that returns the cost, but # in the same time updates the parameter of the model based on the rules # defined in `updates` train_model = theano.function( inputs=[index], outputs=cost, updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }, mode='FAST_RUN') print('... training') # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant if T.lt(n_train_batches, patience / 2): validation_frequency = n_train_batches else: validation_frequency = patience / 2 # go through this many minibatches before checking the network # on the validation set; in this case we check every epoch best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.time() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch += 1 for minibatch_index in range(n_train_batches): # noinspection PyUnusedLocal minibatch_avg_cost = train_model(minibatch_index) iteration = (epoch - 1) * n_train_batches + minibatch_index if (iteration + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [ validate_model(i) for i in range(n_valid_batches) ] this_validation_loss = numpy.mean(validation_losses) print('epoch {0}, minibatch {1}/{2}, validation error {3}'. 
format(epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: # improve patience if loss improvement is good enough if (this_validation_loss < best_validation_loss * improvement_threshold): patience = max(patience, iteration * patience_increase) best_validation_loss = this_validation_loss best_iter = iteration # test it on the test set test_losses = [ test_model(i) for i in range(n_test_batches) ] test_score = numpy.mean(test_losses) print( ' epoch {0}, minibatch {1}/{2}, test error of best ' 'model {3}'.format(epoch, minibatch_index + 1, n_train_batches, test_score * 100)) if patience <= iteration: done_looping = True break end_time = time.time() print('Optimization complete. Best validation score of {0} obtained at ' 'iteration {1}, with test performance {2}'.format( best_validation_loss * 100, best_iter + 1, test_score * 100)) print('Time used for testing the mlp is', end_time - start_time) return classifier
import numpy

import theano.tensor as T
from theano import shared, function

x = T.matrix()
y = T.lvector()
w = shared(numpy.random.randn(100))
b = shared(numpy.zeros(()))

print "Initial model:"
print w.get_value(), b.get_value()
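# Hedged continuation (not part of the original snippet): one common way the shared
# weights above are used is the classic Theano logistic-regression example.  The
# learning rate, L2 coefficient, number of steps and the toy data D below are
# illustrative assumptions; they also assume the default float64 floatX.
p_1 = 1 / (1 + T.exp(-T.dot(x, w) - b))              # probability of class 1
prediction = p_1 > 0.5                               # hard 0/1 prediction
xent = -y * T.log(p_1) - (1 - y) * T.log(1 - p_1)    # per-example cross-entropy
cost = xent.mean() + 0.01 * (w ** 2).sum()           # mean loss with L2 penalty
gw, gb = T.grad(cost, [w, b])

train = function([x, y], [prediction, xent],
                 updates=[(w, w - 0.1 * gw), (b, b - 0.1 * gb)])
predict = function([x], prediction)

# Toy data: 400 examples with 100 features each, binary labels.
D = (numpy.random.randn(400, 100), numpy.random.randint(size=400, low=0, high=2))
for i in range(10):
    pred, err = train(D[0], D[1])

print "Final model:"
print w.get_value(), b.get_value()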
def __init__(self, pa): self.input_height = pa.network_input_height self.input_width = pa.network_input_width self.output_height = pa.network_output_dim self.num_frames = pa.num_frames self.update_counter = 0 states = T.tensor4( 'states' ) # states is [batch_size,channel_size,input_height,input_width ] actions = T.lvector( 'actions' ) # actions shape is batch_size.It is action that network choose values = T.vector( 'values' ) # values shape is batch_size. It is value that action stand for. print('network_input_height=', pa.network_input_height) print('network_input_width=', pa.network_input_width) print('network_output_dim=', pa.network_output_dim) # image represent self.l_out = build_pg_network(self.input_height, self.input_width, self.output_height) self.lr_rate = pa.lr_rate self.rms_rho = pa.rms_rho self.rms_eps = pa.rms_eps params = lasagne.layers.helper.get_all_params(self.l_out) print('params=', params, 'counts', lasagne.layers.count_params(self.l_out)) self._get_param = theano.function([], params) # ===================================== # Training #====================================== prob_act = lasagne.layers.get_output( self.l_out, states) # shape is [batch_size ,output_height].It is action prob. self._get_act_prob = theano.function([states], prob_act, allow_input_downcast=True) #======================================= # policy gradients #======================================= N = states.shape[0] loss = T.log(prob_act[T.arange(N), actions]).dot(values) / N grads = T.grad(loss, params) updates = rmsprop_updates(grads, params, self.lr_rate, self.rms_rho, self.rms_eps) self._train_fn = theano.function([states, actions, values], loss, updates=updates, allow_input_downcast=True) self._get_loss = theano.function([states, actions, values], loss, allow_input_downcast=True) self._get_grad = theano.function([states, actions, values], grads) # -------supervised learning -------------------- su_target = T.ivector('su_target') su_loss = lasagne.objectives.categorical_crossentropy( prob_act, su_target) su_loss = su_loss.mean() l2_penalty = lasagne.regularization.regularize_network_params( self.l_out, lasagne.regularization.l2) # l1_penalty = lasagne.regularization.regularize_network_params(self.l_out, lasagne.regularization.l1) su_loss += 1e-3 * l2_penalty print('lr_rate=', self.lr_rate) su_updates = lasagne.updates.rmsprop(su_loss, params, self.lr_rate, self.rms_rho, self.rms_eps) self._su_train_fn = theano.function([states, su_target], [su_loss, prob_act], updates=su_updates) self._su_loss = theano.function([states, su_target], [su_loss, prob_act]) self._debug = theano.function([states], [states.flatten(2)])
def __theano_build__(self): E, V, U, W, b, c = self.E, self.V, self.U, self.W, self.b, self.c x_a = T.ivector('x_a') x_b = T.ivector('x_b') y = T.lvector('y') def forward_step(x_t, s_t_prev): # Word embedding layer x_e = E[:, x_t] # GRU layer 1 z_t = T.nnet.hard_sigmoid(U[0].dot(x_e) + W[0].dot(s_t_prev)) r_t = T.nnet.hard_sigmoid(U[1].dot(x_e) + W[1].dot(s_t_prev)) c_t = T.tanh(U[2].dot(x_e) + W[2].dot(s_t_prev * r_t)) s_t = (T.ones_like(z_t) - z_t) * c_t + z_t * s_t_prev # directly return the hidden state as intermidate output return [s_t] # sentence a vector (states) a_s, updates = theano.scan(forward_step, sequences=x_a, truncate_gradient=self.bptt_truncate, outputs_info=T.zeros(self.hidden_dim)) # sentence b vector (states) b_s, updates = theano.scan(forward_step, sequences=x_b, truncate_gradient=self.bptt_truncate, outputs_info=T.zeros(self.hidden_dim)) # semantic similarity # s_sim = manhattan_distance(a_s[-1],b_s[-1]) # for classification using simple strategy sena = a_s[-1] senb = b_s[-1] combined_s = T.concatenate([sena, senb], axis=0) # softmax class o = T.nnet.softmax(V.dot(combined_s) + c)[0] # in case the o contains 0 which cause inf eps = np.asarray([1.0e-10] * self.label_dim, dtype=theano.config.floatX) o = o + eps om = o.reshape((1, o.shape[0])) prediction = T.argmax(om, axis=1) o_error = T.nnet.categorical_crossentropy(om, y) # cost cost = T.sum(o_error) # updates updates = sgd_updates_adadelta(norm=0, params=self.params, cost=cost) # monitor parameter mV = V * T.ones_like(V) mc = c * T.ones_like(c) mU = U * T.ones_like(U) mW = W * T.ones_like(W) gV = T.grad(cost, V) gc = T.grad(cost, c) gU = T.grad(cost, U) gW = T.grad(cost, W) mgV = gV * T.ones_like(gV) mgc = gc * T.ones_like(gc) mgU = gU * T.ones_like(gU) mgW = gW * T.ones_like(gW) # Assign functions self.monitor = theano.function([x_a, x_b], [sena, senb, mV, mc, mU, mW]) self.monitor_grad = theano.function([x_a, x_b, y], [mgV, mgc, mgU, mgW]) self.predict = theano.function([x_a, x_b], om) self.predict_class = theano.function([x_a, x_b], prediction) self.ce_error = theano.function([x_a, x_b, y], cost) # self.bptt = theano.function([x,y],[dE,dU,dW,db,dV,dc]) # SGD parameters learning_rate = T.scalar('learning_rate') decay = T.scalar('decay') # rmsprop cache updates # find the nan self.sgd_step = theano.function( [x_a, x_b, y], [], updates=updates # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True) )
def sgd_optimization_mnist(learning_rate=2e-2, loss_weight = 1.8e+8, curriculum_rate=0.1, n_curriculum_epochs=300, epoch_iters = 20, converge = 1e-4, minibatch_size = 50, batch_size=4, k = 4, func = 'concavefeature', func_parameter = 0.5, deep = True): """ Demonstrate stochastic gradient descent optimization of a log-linear model This is demonstrated on MNIST. :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: the path of the MNIST dataset file from http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz """ print('loading data...') datasets = load_data() train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] labels_, cluster_centers_, center_nn = datasets[3] num_cluster = cluster_centers_.shape[0] isize = int(numpy.sqrt(train_set_x.get_value(borrow=True).shape[1])) # compute number of minibatches for training, validation and testing n_train = train_set_x.get_value(borrow=True).shape[0] n_train_batches = n_train // batch_size n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size ###################### # BUILD ACTUAL MODEL # ###################### print('building the model...') # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch cindex = T.lvector() # index to a [mini]batch # generate symbolic variables for input (x and y represent a # minibatch) x = T.matrix('x') # data, presented as rasterized images y = T.ivector('y') # labels, presented as 1D vector of [int] labels if deep is False: # construct the logistic regression class # Each MNIST image has size 28*28 classifier = LogisticRegression(input=x, n_in=isize**2, n_out=10) # the cost we minimize during training is the negative log likelihood of # the model in symbolic format cost = classifier.negative_log_likelihood(y) cost_vec = classifier.negative_log_likelihood_vec(y) # compute the gradient of cost with respect to theta = (W,b) g_W = T.grad(cost=cost, wrt=classifier.W) g_b = T.grad(cost=cost, wrt=classifier.b) # start-snippet-3 # specify how to update the parameters of the model as a list of # (variable, update expression) pairs. 
updates = [(classifier.W, classifier.W - learning_rate * g_W), (classifier.b, classifier.b - learning_rate * g_b)] else: nfea = 500 nkerns=[20, 50] n_channels = 1 rng = numpy.random.RandomState(23455) layer0_input = x.reshape((-1, 1, isize, isize)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24) # maxpooling reduces this further to (24/2, 24/2) = (12, 12) # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12) layer0 = LeNetConvPoolLayer( rng, input=layer0_input, image_shape=(None, 1, isize, isize), filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2) ) isize1 = int((isize - 5 + 1)/2) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8) # maxpooling reduces this further to (8/2, 8/2) = (4, 4) # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4) layer1 = LeNetConvPoolLayer( rng, input=layer0.output, image_shape=(None, nkerns[0], isize1, isize1), filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2) ) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size, num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4), # or (500, 50 * 4 * 4) = (500, 800) with the default values. layer2_input = layer1.output.flatten(2) isize2 = int((isize1 - 5 + 1)/2) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer( rng, input=layer2_input, n_in=nkerns[1] * isize2 * isize2, n_out=nfea, activation=T.tanh ) # classify the values of the fully-connected sigmoidal layer classifier = LogisticRegression(input=layer2.output, n_in=nfea, n_out=10) # the cost we minimize during training is the NLL of the model cost = classifier.negative_log_likelihood(y) cost_vec = classifier.negative_log_likelihood_vec(y) # create a list of all model parameters to be fit by gradient descent params = classifier.params + layer2.params + layer1.params + layer0.params # create a list of gradients for all model parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i], grads[i]) pairs. 
updates = [ (param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads) ] # compiling a Theano function that computes the mistakes that are made by # the model on a minibatch test_model = theano.function( inputs=[index], outputs=classifier.errors(y), givens={ x: test_set_x[index * batch_size: (index + 1) * batch_size], y: test_set_y[index * batch_size: (index + 1) * batch_size] } ) validate_model = theano.function( inputs=[index], outputs=classifier.errors(y), givens={ x: valid_set_x[index * batch_size: (index + 1) * batch_size], y: valid_set_y[index * batch_size: (index + 1) * batch_size] } ) # compiling a Theano function `train_model` that returns the cost, but in # the same time updates the parameter of the model based on the rules # defined in `updates` train_model = theano.function( inputs=[cindex], outputs=classifier.errors(y), updates=updates, givens={ x: train_set_x[cindex], y: train_set_y[cindex] } ) loss_model = theano.function( inputs=[cindex], outputs=cost_vec, givens={ x: train_set_x[cindex], y: train_set_y[cindex] } ) error_model = theano.function( inputs=[index], outputs=classifier.errors(y), givens={ x: train_set_x[index * batch_size: (index + 1) * batch_size], y: train_set_y[index * batch_size: (index + 1) * batch_size] } ) # end-snippet-3 ############### # TRAIN MODEL # ############### print('training the model...') # early-stopping parameters patience = 5000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant #validation_frequency = min(n_train_batches, patience // 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf test_score = 0. 
start_time = timeit.default_timer() #initialize minGain, sinGain, optSubmodular = initSubmodularFunc(cluster_centers_, k) real_iter = 0 validation_frequency = 100 old_epoch_all_loss = float('inf') loss_weight0 = loss_weight passed_index = numpy.array([]) passed_index_epoch = numpy.array([]) passes = 0 output_seq = () for curriculum_epoch in range(n_curriculum_epochs): print('Epoch', curriculum_epoch) old_all_loss = 0 for iters in range(epoch_iters): if len(passed_index) <= n_train*0.45: # compute loss loss_vec = loss_model(center_nn) * loss_weight / len(center_nn) all_loss = sum(loss_vec) #loss_vec_center = numpy.asarray([sum(loss_vec[labels_ == i]) for i in range(num_cluster)]) loss_vec_center = loss_vec topkLoss = sum(numpy.partition(loss_vec_center, -k)[-k:]) optObj = optSubmodular + topkLoss print(optSubmodular, topkLoss) # update A (topkIndex) left_index = pruneGroundSet(minGain, sinGain, loss_vec_center, k) topkIndex = modularLowerBound(cluster_centers_[left_index,:], k, func, func_parameter, loss_vec_center[left_index], optObj) topkIndex = left_index[topkIndex] # update classifier (train_model) train_index = numpy.array([]) for i in range(len(topkIndex)): train_index = numpy.append(train_index, numpy.where(labels_ == topkIndex[i])[0]) train_index = numpy.random.permutation(train_index.astype(int)) print('number of training samples =', len(train_index)) passes += len(train_index) passed_index = numpy.unique(numpy.append(passed_index, train_index)) passed_index_epoch = numpy.unique(numpy.append(passed_index_epoch, train_index)) else: train_index = numpy.random.permutation(numpy.setxor1d(numpy.arange(n_train), passed_index_epoch).astype(int)) #train_index = numpy.random.permutation(numpy.arange(n_train).astype(int)) passes += len(train_index) passed_index_epoch = numpy.array([]) #passed_index = numpy.arange(n_train) # training by mini-batch sgd start_index = 0 train_loss = numpy.array([]) while start_index < len(train_index): end_index = min([start_index + minibatch_size, len(train_index)]) batch_index = train_index[start_index : end_index] start_index = end_index train_loss = numpy.append(train_loss, train_model(batch_index)) this_train_loss = numpy.mean(train_loss) # stop the current epoch if converge diff_loss = old_all_loss - all_loss if diff_loss >= 0 and diff_loss <= all_loss * converge: break # show validation and test error peoriodically else: old_all_loss = all_loss if (iters + real_iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [validate_model(i) for i in range(n_valid_batches)] this_validation_loss = numpy.mean(validation_losses) test_losses = [test_model(i) for i in range(n_test_batches)] test_score = numpy.mean(test_losses) train_score = [error_model(i) for i in range(n_train_batches)] this_train_score = numpy.mean(train_score) print( 'minibatch %i, %i trainings, %i passes, trainErr %f %%, validErr %f %%, testErr %f %%' % ( iters + real_iter + 1, len(passed_index), passes, this_train_score * 100., this_validation_loss * 100., test_score * 100. 
) ) output_seq = output_seq + (numpy.array([len(passed_index),passes,this_train_score * 100.,this_validation_loss * 100.,test_score * 100.]),) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, (iters + real_iter + 1) * patience_increase) best_validation_loss = this_validation_loss # save the best model with open('best_model.pkl', 'wb') as f: pickle.dump(classifier, f) #print('Up to now %i training samples are used'%(len(passed_index))) # record total number of iterations real_iter += iters # adjust learning rate if all_loss > 1.001 * old_epoch_all_loss: print('no improvement: reduce learning rate!') learning_rate *= 0.96 old_epoch_all_loss = all_loss # increase curriculum rate loss_weight *= curriculum_rate + 1 if patience <= iters + real_iter + 1: break end_time = timeit.default_timer() print( ( 'Optimization complete with best validation score of %f %%,' 'with test performance %f %%' ) % (best_validation_loss * 100., test_score * 100.) ) #print('The code run for %d epochs, with %f epochs/sec' % ( #epoch, 1. * epoch / (end_time - start_time))) #print(('The code for file ' + #os.path.split(__file__)[1] + #' ran for %.1fs' % ((end_time - start_time))), file=sys.stderr) output_seq = numpy.vstack(output_seq) return output_seq
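# The early stopping above follows the usual "patience" scheme: keep training as
# long as the validation error improves by a meaningful margin, and extend the
# allowed number of iterations whenever it does. A stripped-down sketch of just
# that bookkeeping (hypothetical validate() callable, not the original code):
import numpy as np

def train_with_patience(validate, max_iters, patience=5000,
                        patience_increase=2, improvement_threshold=0.995):
    """validate(it) -> validation error; stop once `patience` iterations pass
    without a sufficiently large improvement."""
    best = np.inf
    for it in range(1, max_iters + 1):
        err = validate(it)
        if err < best:
            if err < best * improvement_threshold:  # significant improvement
                patience = max(patience, it * patience_increase)
            best = err
        if patience <= it:
            break
    return best

# toy usage: validation error decays, then improvements become insignificant
print(train_with_patience(lambda it: 1.0 / it + 0.01, max_iters=100000))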
def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
    """
    Basic test for GpuCrossentropySoftmaxArgmax1HotWithBias.

    We check that the op falls back to looping when there are too many
    threads for a single launch.
    """
    n_in = 1000
    batch_size = 4097
    n_out = 1250

    if not isinstance(mode_with_gpu, theano.compile.DebugMode):
        n_in = 4098
        n_out = 4099

    x = T.fmatrix('x')
    y = T.lvector('y')
    b = T.fvector('b')
    # W = T.fmatrix('W')

    # We precompute the dot product with a big shape beforehand so that the
    # test of GpuCrossentropySoftmax1HotWithBiasDx does not fail with the
    # error "the launch timed out and was terminated" on GPU cards that are
    # not powerful enough. We need the big shape to check the corner case.
    dot_result = T.fmatrix('dot_result')

    # Seed numpy.random with config.unittests.rseed
    utt.seed_rng()

    xx = numpy.asarray(numpy.random.rand(batch_size, n_in),
                       dtype=numpy.float32)
    # yy = numpy.ones((batch_size,), dtype='float32')
    yy = numpy.ones((batch_size, ), dtype='int32')
    b_values = numpy.zeros((n_out, ), dtype='float32')
    W_values = numpy.asarray(numpy.random.rand(n_in, n_out), dtype='float32')

    dot_value = numpy.asarray(numpy.dot(xx, W_values), dtype='float32')
    del W_values
    p_y_given_x = T.nnet.softmax(dot_result + b)
    y_pred = T.argmax(p_y_given_x, axis=-1)
    loss = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
    dW = T.grad(loss, dot_result)
    classify = theano.function(inputs=[y, b, dot_result],
                               outputs=[loss, y_pred, dW],
                               mode=mode_without_gpu)
    classify_gpu = theano.function(inputs=[y, b, dot_result],
                                   outputs=[loss, y_pred, dW],
                                   mode=mode_with_gpu)
    # theano.printing.debugprint(classify)
    # theano.printing.debugprint(classify_gpu)

    assert any([isinstance(node.op, T.nnet.CrossentropySoftmaxArgmax1HotWithBias)
                for node in classify.maker.fgraph.toposort()])
    assert any([isinstance(node.op, GpuCrossentropySoftmaxArgmax1HotWithBias)
                for node in classify_gpu.maker.fgraph.toposort()])

    out = classify(yy, b_values, dot_value)
    gout = classify_gpu(yy, b_values, dot_value)

    assert len(out) == len(gout) == 3
    assert numpy.allclose(out[0], gout[0])
    assert numpy.allclose(out[2], gout[2], atol=3e-6), \
        numpy.absolute(gout[2] - out[2]).max()
    assert numpy.allclose(out[1], gout[1]), \
        [(id, out[1][id], gout[1][id], val)
         for id, val in enumerate(out[1] - gout[1]) if val != 0]
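# As a sanity reference (not part of the test), the quantity both the CPU and
# GPU graphs compute is the mean negative log-likelihood of a softmax over
# dot_result + b, plus its argmax; a NumPy version of the same math with
# hypothetical small sizes:
import numpy as np

def softmax_nll(dot_result, b, y):
    """Mean NLL of softmax(dot_result + b) at target indices y, and the argmax."""
    z = dot_result + b                                  # (batch, n_out)
    z = z - z.max(axis=1, keepdims=True)                # numerical stability
    p = np.exp(z) / np.exp(z).sum(axis=1, keepdims=True)
    nll = -np.mean(np.log(p[np.arange(len(y)), y]))
    return nll, p.argmax(axis=1)

_rng = np.random.RandomState(0)
_loss, _pred = softmax_nll(_rng.randn(8, 5).astype('float32'),
                           np.zeros(5, dtype='float32'),
                           np.ones(8, dtype='int32'))
print(_loss, _pred)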
def test_RNN(nh=100, nl=3, n_in=32): WS_file_filter_regex = r'WS_P[0-9]*_S[0-9].mat' WS_file_filter_regex_P1 = r'WS_P1_S[0-9].mat' AllLifts_P1 = r'P1_AllLifts.mat' # nearest element in list min(myList, key=lambda x:abs(x-myNumber)) # Wait until implemented in brain.data.util # needed are eeg and eeg_t data as train set data # for now train and test set contain empty lists train_set, valid_set, test_set = load_data(participant=1) train_set_x, train_set_y = train_set valid_set_x, valid_set_y = valid_set test_set_x, test_set_y = test_set data = getTables(WS_file_filter_regex_P1) datasetSize = len(data) print(data) print(datasetSize) # HandStart(33), LiftOff(18) # returns list targets event_data = getRaw(AllLifts_P1)[0]['P']['AllLifts'] handStart = event_data[:, 33] liftOff = event_data[:, 18] # get nearest index in eeg data for given event and get length of longest/shortest eeg window handStartIndex = [] liftOffIndex = [] maxLengthEEG = 0 minLengthEEG = numpy.inf for i in range(len(data)): handStartIndex.append( numpy.where(data[i]['eeg_t'] == min( data[i]['eeg_t'], key=lambda x: abs(handStart[i] - x)))[0][0]) liftOffIndex.append( numpy.where(data[i]['eeg_t'] == min( data[i]['eeg_t'], key=lambda x: abs(liftOff[i] - x)))[0][0]) if len(data[i]['eeg']) > maxLengthEEG: maxLengthEEG = len(data[i]['eeg']) if len(data[i]['eeg']) < minLengthEEG: minLengthEEG = len(data[i]['eeg']) sequenceLength = maxLengthEEG # Construct target vectors (0 = 'none' event) targets = numpy.zeros((datasetSize, sequenceLength), dtype='int64') for i in range(datasetSize): targets[i][handStartIndex[i] - 100:handStartIndex[i] + 100] = 1 targets[i][liftOffIndex[i] - 100:liftOffIndex[i] + 100] = 2 #print(str(handStartIndex[i]) + " " + str(liftOffIndex[i])) # Construct data array with 0 padding at the end for shorter sequences eeg_data = numpy.zeros((datasetSize, sequenceLength, 32)) for i in range(datasetSize): eeg_data[i, 0:data[i]['eeg'].shape[0]] = data[i]['eeg'] #eeg_data[i, :] = data[i]['eeg'][0: sequenceLength] tmpl = [(n_in, nh), (nh, nh), (nh, nl), nh, nl, nh] wrt, (Wx, Wh, W, bh, b, h0) = climin.util.empty_with_views(tmpl) params = [Wx, Wh, W, bh, b, h0] x = T.dmatrix('x') y = T.lvector('y') classifier = RNN(x, y, nh, nl, n_in) # copy preinitialized weight matrices Wx[...] = classifier.Wx.get_value(borrow=True)[...] Wh[...] = classifier.Wh.get_value(borrow=True)[...] W[...] = classifier.W.get_value(borrow=True)[...] 
def set_pars(): for p, p_class in zip(params, classifier.params): p_class.set_value(p, borrow=True) def loss(parameters, inpt, targets): set_pars() return classifier.cost.eval({x: inpt[0], y: targets[0]}) def d_loss_wrt_pars(parameters, inpt, targets): set_pars() grads = [] print(loss(parameters, inpt, targets)) for d in classifier.gradients: grads.append(d.eval({x: inpt[0], y: targets[0]})) return numpy.concatenate([ grads[0].flatten(), grads[1].flatten(), grads[2].flatten(), grads[3], grads[4], grads[5] ]) args = ((i, {}) for i in climin.util.iter_minibatches( [eeg_data[0:1], targets[0:1]], 1, [0, 0])) opt = climin.adadelta.Adadelta(wrt, d_loss_wrt_pars, step_rate=1, decay=0.9, momentum=0, offset=0.0001, args=args) def plot(): figure, (axes) = plt.subplots(4, 1) x_axis = numpy.arange(sequenceLength) result = classifier.result_sequence.eval({x: eeg_data[0]}) axes[0].set_title("labels") axes[0].plot(x_axis, targets[0], label="targets") axes[1].set_title("none_prob") axes[1].plot(x_axis, result[:, 0], label="none") axes[2].set_title("handStart_prob") axes[2].plot(x_axis, result[:, 1], label="handStart") axes[3].set_title("liftOff_prob") axes[3].plot(x_axis, result[:, 2], label="liftOff") figure.subplots_adjust(hspace=0.5) figure.savefig('test.png') plt.close(figure) for info in opt: iteration = info['n_iter'] if iteration % 10 == 0: plot() if iteration > 500: break plot()
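# climin optimizes one flat parameter vector, which is why d_loss_wrt_pars above
# concatenates the per-parameter gradients. A small sketch of that pack/unpack
# pattern in plain NumPy (shapes match the tmpl above for nh=100, nl=3, n_in=32):
import numpy as np

_shapes = [(32, 100), (100, 100), (100, 3), (100,), (3,), (100,)]  # Wx, Wh, W, bh, b, h0

def pack(arrays):
    return np.concatenate([a.ravel() for a in arrays])

def unpack(flat, shapes):
    views, start = [], 0
    for shp in shapes:
        size = int(np.prod(shp))
        views.append(flat[start:start + size].reshape(shp))
        start += size
    return views

_params = [np.random.randn(*s) for s in _shapes]
_flat = pack(_params)
print(_flat.shape, all(np.allclose(a, b) for a, b in zip(_params, unpack(_flat, _shapes))))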
def __init__(self, config): self.config = config batch_size = config['batch_size'] flag_datalayer = config['use_data_layer'] # ##################### BUILD NETWORK ########################## # allocate symbolic variables for the data # 'rand' is a random array used for random cropping/mirroring of data # [2015-11-11] Jinlong note: for CPU version, a bug exit for group=2, # since memory is not continuous; Will fix it depends on requirement x = T.ftensor4('x') y = T.lvector('y') rand = T.fvector('rand') print '... building the model' self.layers = [] params = [] weight_types = [] if flag_datalayer: data_layer = DataLayer(input=x, image_shape=(batch_size, 3, 256, 256), cropsize=227, rand=rand, mirror=True, flag_rand=config['rand_crop']) layer1_input = data_layer.output else: layer1_input = x convpool_layer1 = ConvPoolLayer( input=layer1_input, image_shape=(batch_size, 3, 227, 227), filter_shape=(96, 3, 11, 11), convstride=4, padsize=0, group=1, poolsize=3, poolstride=2, bias_init=0.0, lrn=True, ) self.layers.append(convpool_layer1) params += convpool_layer1.params weight_types += convpool_layer1.weight_type convpool_layer2 = ConvPoolLayer( input=convpool_layer1.output, image_shape=(batch_size, 96, 27, 27), filter_shape=(256, 96, 5, 5), convstride=1, padsize=2, group=2, poolsize=3, poolstride=2, bias_init=0.1, lrn=True, ) self.layers.append(convpool_layer2) params += convpool_layer2.params weight_types += convpool_layer2.weight_type convpool_layer3 = ConvPoolLayer( input=convpool_layer2.output, image_shape=(batch_size, 256, 13, 13), filter_shape=(384, 256, 3, 3), convstride=1, padsize=1, group=1, poolsize=1, poolstride=0, bias_init=0.0, lrn=False, ) self.layers.append(convpool_layer3) params += convpool_layer3.params weight_types += convpool_layer3.weight_type convpool_layer4 = ConvPoolLayer( input=convpool_layer3.output, image_shape=(batch_size, 384, 13, 13), filter_shape=(384, 384, 3, 3), convstride=1, padsize=1, group=2, poolsize=1, poolstride=0, bias_init=0.1, lrn=False, ) self.layers.append(convpool_layer4) params += convpool_layer4.params weight_types += convpool_layer4.weight_type convpool_layer5 = ConvPoolLayer( input=convpool_layer4.output, image_shape=(batch_size, 384, 13, 13), filter_shape=(256, 384, 3, 3), convstride=1, padsize=1, group=2, poolsize=3, poolstride=2, bias_init=0.1, lrn=False, ) self.layers.append(convpool_layer5) params += convpool_layer5.params weight_types += convpool_layer5.weight_type fc_layer6_input = T.flatten(convpool_layer5.output, 2) fc_layer6 = FCLayer(input=fc_layer6_input, n_in=9216, n_out=4096) self.layers.append(fc_layer6) params += fc_layer6.params weight_types += fc_layer6.weight_type dropout_layer6 = DropoutLayer(fc_layer6.output, n_in=4096, n_out=4096) fc_layer7 = FCLayer(input=dropout_layer6.output, n_in=4096, n_out=4096) self.layers.append(fc_layer7) params += fc_layer7.params weight_types += fc_layer7.weight_type dropout_layer7 = DropoutLayer(fc_layer7.output, n_in=4096, n_out=4096) softmax_layer8 = SoftmaxLayer(input=dropout_layer7.output, n_in=4096, n_out=1000) self.layers.append(softmax_layer8) params += softmax_layer8.params weight_types += softmax_layer8.weight_type # #################### NETWORK BUILT ####################### self.cost = softmax_layer8.negative_log_likelihood(y) self.errors = softmax_layer8.errors(y) self.errors_top_5 = softmax_layer8.errors_top_x(y, 5) self.params = params self.x = x self.y = y self.rand = rand self.weight_types = weight_types self.batch_size = batch_size
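# The image_shape values passed from layer to layer above follow the usual
# convolution/pooling size arithmetic; a quick check, assuming the standard
# floor-division formulas (which reproduce the hard-coded shapes):
def conv_out(size, filt, pad, stride):
    return (size + 2 * pad - filt) // stride + 1

def pool_out(size, pool, stride):
    return (size - pool) // stride + 1

s = 227
s = pool_out(conv_out(s, 11, 0, 4), 3, 2)   # layer1: 227 -> 55 -> 27
s = pool_out(conv_out(s, 5, 2, 1), 3, 2)    # layer2: 27 -> 27 -> 13
s = conv_out(s, 3, 1, 1)                    # layer3: 13 -> 13
s = conv_out(s, 3, 1, 1)                    # layer4: 13 -> 13
s = pool_out(conv_out(s, 3, 1, 1), 3, 2)    # layer5: 13 -> 13 -> 6
print(s, 256 * s * s)                       # 6 9216, matching n_in of fc_layer6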
def build_finetune_functions(self, datasets, batch_size, learning_rate, L1_param, L2_param, mom): (train_set_x, train_set_y) = datasets[0] (valid_set_x, valid_set_y) = datasets[1] (test_set_x, test_set_y) = datasets[2] index = T.lvector('index') gparams = T.grad( self.dropout_negative_log_likelihood + L1_param * self.L1 + L2_param * self.L2, self.params) self.gparams_mom = [] for param in self.params: gparam_mom = theano.shared( numpy.zeros(param.get_value(borrow=True).shape, dtype=theano.config.floatX)) self.gparams_mom.append(gparam_mom) updates1 = OrderedDict() for param, gparam, gparam_mom in zip(self.params, gparams, self.gparams_mom): updates1[gparam_mom] = mom * gparam_mom - learning_rate * gparam updates1[param] = param + updates1[gparam_mom] train_model = theano.function( inputs=[index], outputs=self.dropout_negative_log_likelihood, updates=updates1, givens={ self.x: train_set_x[index], self.y: train_set_y[index] }) # error check train_error_fn = theano.function(inputs=[index], outputs=self.error, givens={ self.x: train_set_x[index], self.y: train_set_y[index] }) valid_error_fn = theano.function(inputs=[index], outputs=self.error, givens={ self.x: valid_set_x[index], self.y: valid_set_y[index] }) # performance check : error rate, sensitivity, specificity, auc test_error_fn = theano.function(inputs=[index], outputs=self.error, givens={ self.x: test_set_x[index], self.y: test_set_y[index] }) test_sensitivity_fn = theano.function(inputs=[index], outputs=self.sensitivity, givens={ self.x: test_set_x[index], self.y: test_set_y[index] }) test_specificity_fn = theano.function(inputs=[index], outputs=self.specificity, givens={ self.x: test_set_x[index], self.y: test_set_y[index] }) test_class1_pred_fn = theano.function(inputs=[index], outputs=self.class1_pred, givens={ self.x: test_set_x[index], self.y: test_set_y[index] }) test_y_fn = theano.function(inputs=[index], outputs=self.y, givens={self.y: test_set_y[index]}) n_train_exp = train_set_x.get_value(borrow=True).shape[0] n_valid_exp = valid_set_x.get_value(borrow=True).shape[0] n_test_exp = test_set_x.get_value(borrow=True).shape[0] def getSums(fn, n_exp, batch_size): val_sum = 0. tot_len = 0. 
n_batches = n_exp / batch_size resid = n_exp - (n_batches * batch_size) IDX = range(n_exp) for i in range(n_batches): sum_val, len_val = fn(IDX[i * batch_size:(i + 1) * batch_size]) val_sum += sum_val tot_len += len_val if resid != 0: sum_val, len_val = fn( IDX[n_batches * batch_size:(n_batches * batch_size) + resid]) val_sum += sum_val tot_len += len_val return val_sum / tot_len def getVals(fn, n_exp, batch_size): vals = list() n_batches = n_exp / batch_size resid = n_exp - (n_batches * batch_size) IDX = range(n_exp) for i in range(n_batches): vals += fn(IDX[i * batch_size:(i + 1) * batch_size]).tolist() if resid != 0: vals += fn( IDX[n_batches * batch_size:(n_batches * batch_size) + resid]).tolist() return vals def errorcheck(): train_error = getSums(train_error_fn, n_train_exp, batch_size) valid_error = getSums(valid_error_fn, n_valid_exp, batch_size) return train_error, valid_error def performance(): test_error = getSums(test_error_fn, n_test_exp, batch_size) test_sensitivity = getSums(test_sensitivity_fn, n_test_exp, batch_size) test_specificity = getSums(test_specificity_fn, n_test_exp, batch_size) test_y = getVals(test_y_fn, n_test_exp, batch_size) test_class1_pred = getVals(test_class1_pred_fn, n_test_exp, batch_size) test_roc = ROCData(zip(test_y, test_class1_pred)) return test_error, test_sensitivity, test_specificity, test_roc return train_model, errorcheck, performance
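# getSums/getVals above walk the data in fixed-size batches and then handle the
# leftover examples in one smaller batch; the same index pattern in isolation
# (hypothetical sizes):
def batched_indices(n_exp, batch_size):
    """Yield index lists covering range(n_exp): full batches, then the remainder."""
    idx = list(range(n_exp))
    n_batches = n_exp // batch_size
    for i in range(n_batches):
        yield idx[i * batch_size:(i + 1) * batch_size]
    if n_exp % batch_size:
        yield idx[n_batches * batch_size:]

# e.g. 10 examples in batches of 4 -> [0, 1, 2, 3], [4, 5, 6, 7], [8, 9]
print(list(batched_indices(10, 4)))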
def build_mlp(args, netid, input_var=None, mask_inputs=False):
    """Build MLP model"""
    # pylint: disable=bad-continuation
    # This creates an MLP of two hidden layers of 200 units each, followed by
    # a softmax output layer of 10 units. It applies dropout with rate
    # `args.input_dropout_rate` to the input data and `args.dropout_rate` to
    # the hidden layers.

    # Input layer, specifying the expected input shape of the network
    # (unspecified batchsize, 1 channel, 28 rows and 28 columns) and
    # linking it to the given Theano variable `input_var`, if any:
    l_in = lasagne.layers.InputLayer(shape=(None, 1, 28, 28),
                                     input_var=input_var,
                                     name="%d_%s" % (netid, "l_in"))

    mask_in = None
    if mask_inputs:
        mask_in = T.ltensor3()

    # Apply dropout to the input data:
    l_in_drop = dropout.DropoutLayer(l_in,
                                     mask=mask_in,
                                     p=args.input_dropout_rate,
                                     name="%d_%s" % (netid, "l_in_drop"))

    # Add a fully-connected layer of 200 units, using the linear rectifier, and
    # initializing weights with Glorot's scheme (which is the default anyway):
    l_hid1 = lasagne.layers.DenseLayer(
        l_in_drop,
        num_units=200,
        nonlinearity=lasagne.nonlinearities.rectify,
        W=lasagne.init.GlorotUniform(),
        name="%d_%s" % (netid, "l_hid1"))

    # We'll now add dropout to the first hidden layer:
    mask_hid1 = None
    if mask_inputs:
        mask_hid1 = T.lvector()
    l_hid1_drop = dropout.DropoutLayer(l_hid1,
                                       mask=mask_hid1,
                                       p=args.dropout_rate,
                                       name="%d_%s" % (netid, "l_hid1_drop"))

    # Another 200-unit layer:
    l_hid2 = lasagne.layers.DenseLayer(
        l_hid1_drop,
        num_units=200,
        nonlinearity=lasagne.nonlinearities.rectify,
        name="%d_%s" % (netid, "l_hid2"))

    # Dropout again:
    mask_hid2 = None
    if mask_inputs:
        mask_hid2 = T.lvector()
    l_hid2_drop = dropout.DropoutLayer(l_hid2,
                                       mask=mask_hid2,
                                       p=args.dropout_rate,
                                       name="%d_%s" % (netid, "l_hid2_drop"))

    # Finally, we'll add the fully-connected output layer of 10 softmax units:
    l_out = lasagne.layers.DenseLayer(
        l_hid2_drop,
        num_units=10,
        nonlinearity=lasagne.nonlinearities.softmax,
        name="%d_%s" % (netid, "l_out"))

    masks = [mask_in, mask_hid1, mask_hid2]
    # Each layer is linked to its incoming layer(s), so we only need to pass
    # the output layer to give access to a network in Lasagne:
    return l_out, masks
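# The mask_inputs path feeds externally generated binary masks into the dropout
# layers; a minimal NumPy sketch of what such a mask does, assuming standard
# inverted-dropout scaling (the custom dropout.DropoutLayer may differ):
import numpy as np

def dropout_with_mask(x, mask, p):
    """Zero the units where mask == 0 and rescale survivors by 1/(1-p)."""
    return x * mask / (1.0 - p)

_rng = np.random.RandomState(0)
_x = _rng.randn(5, 200)
_p = 0.5
_mask = _rng.binomial(n=1, p=1.0 - _p, size=_x.shape)   # keep probability 1-p
print(dropout_with_mask(_x, _mask, _p).shape)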
def __init__(self, config): self.config = config batch_size = config.batch_size lib_conv = config.lib_conv group = (2 if config.grouping else 1) LRN = (True if config.LRN else False) print 'LRN, group', LRN, group # ##################### BUILD NETWORK ########################## # allocate symbolic variables for the data x = T.ftensor4('x') y = T.lvector('y') print '... building the model with ConvLib %s, LRN %s, grouping %i ' \ % (lib_conv, LRN, group) self.layers = [] params = [] weight_types = [] layer1_input = x convpool_layer1 = ConvPoolLayer( input=layer1_input, image_shape=((3, 224, 224, batch_size) if lib_conv == 'cudaconvnet' else (batch_size, 3, 227, 227)), filter_shape=((3, 11, 11, 96) if lib_conv == 'cudaconvnet' else (96, 3, 11, 11)), convstride=4, padsize=(0 if lib_conv == 'cudaconvnet' else 3), group=1, poolsize=3, poolstride=2, bias_init=0.0, lrn=LRN, lib_conv=lib_conv) self.layers.append(convpool_layer1) params += convpool_layer1.params weight_types += convpool_layer1.weight_type convpool_layer2 = ConvPoolLayer( input=convpool_layer1.output, image_shape=((96, 27, 27, batch_size) if lib_conv == 'cudaconvnet' else (batch_size, 96, 27, 27)), filter_shape=((96, 5, 5, 256) if lib_conv == 'cudaconvnet' else (256, 96, 5, 5)), convstride=1, padsize=2, group=group, poolsize=3, poolstride=2, bias_init=0.1, lrn=LRN, lib_conv=lib_conv, ) self.layers.append(convpool_layer2) params += convpool_layer2.params weight_types += convpool_layer2.weight_type convpool_layer3 = ConvPoolLayer( input=convpool_layer2.output, image_shape=((256, 13, 13, batch_size) if lib_conv == 'cudaconvnet' else (batch_size, 256, 13, 13)), filter_shape=((256, 3, 3, 384) if lib_conv == 'cudaconvnet' else (384, 256, 3, 3)), convstride=1, padsize=1, group=1, poolsize=1, poolstride=0, bias_init=0.0, lrn=False, lib_conv=lib_conv, ) self.layers.append(convpool_layer3) params += convpool_layer3.params weight_types += convpool_layer3.weight_type convpool_layer4 = ConvPoolLayer( input=convpool_layer3.output, image_shape=((384, 13, 13, batch_size) if lib_conv == 'cudaconvnet' else (batch_size, 384, 13, 13)), filter_shape=((384, 3, 3, 384) if lib_conv == 'cudaconvnet' else (384, 384, 3, 3)), convstride=1, padsize=1, group=group, poolsize=1, poolstride=0, bias_init=0.1, lrn=False, lib_conv=lib_conv, ) self.layers.append(convpool_layer4) params += convpool_layer4.params weight_types += convpool_layer4.weight_type convpool_layer5 = ConvPoolLayer( input=convpool_layer4.output, image_shape=((384, 13, 13, batch_size) if lib_conv == 'cudaconvnet' else (batch_size, 384, 13, 13)), filter_shape=((384, 3, 3, 256) if lib_conv == 'cudaconvnet' else (256, 384, 3, 3)), convstride=1, padsize=1, group=group, poolsize=3, poolstride=2, bias_init=0.0, lrn=False, lib_conv=lib_conv, ) self.layers.append(convpool_layer5) params += convpool_layer5.params weight_types += convpool_layer5.weight_type if lib_conv == 'cudaconvnet': fc_layer6_input = T.flatten( convpool_layer5.output.dimshuffle(3, 0, 1, 2), 2) else: fc_layer6_input = convpool_layer5.output.flatten(2) fc_layer6 = FCLayer(input=fc_layer6_input, n_in=9216, n_out=4096) self.layers.append(fc_layer6) params += fc_layer6.params weight_types += fc_layer6.weight_type dropout_layer6 = DropoutLayer(fc_layer6.output) fc_layer7 = FCLayer(input=dropout_layer6.output, n_in=4096, n_out=4096) self.layers.append(fc_layer7) params += fc_layer7.params weight_types += fc_layer7.weight_type dropout_layer7 = DropoutLayer(fc_layer7.output) softmax_layer8 = SoftmaxLayer(input=dropout_layer7.output, n_in=4096, 
n_out=1000) self.layers.append(softmax_layer8) params += softmax_layer8.params weight_types += softmax_layer8.weight_type # #################### NETWORK BUILT ####################### self.cost = softmax_layer8.negative_log_likelihood(y) self.errors = softmax_layer8.errors(y) self.errors_top_5 = softmax_layer8.errors_top_x(y, 5) self.params = params self.x = x self.y = y # self.rand = rand self.weight_types = weight_types self.batch_size = batch_size
def __init__(self, config): ModelBase.__init__(self) self.config = config self.verbose = self.config['verbose'] self.name = 'alexnet' batch_size = config['batch_size'] flag_datalayer = config['use_data_layer'] lib_conv = config['lib_conv'] n_softmax_out = config['n_softmax_out'] # ##################### BUILD NETWORK ########################## # allocate symbolic variables for the data # 'rand' is a random array used for random cropping/mirroring of data x = T.ftensor4('x') y = T.lvector('y') rand = T.fvector('rand') lr = T.scalar('lr') if self.verbose: print 'AlexNet 2/16' self.layers = [] params = [] weight_types = [] if flag_datalayer: data_layer = DataLayer(input=x, image_shape=(3, 256, 256, batch_size), cropsize=227, rand=rand, mirror=True, flag_rand=config['rand_crop']) layer1_input = data_layer.output else: layer1_input = x convpool_layer1 = ConvPoolLayer(input=layer1_input, image_shape=(3, 227, 227, batch_size), filter_shape=(3, 11, 11, 96), convstride=4, padsize=0, group=1, poolsize=3, poolstride=2, bias_init=0.0, lrn=True, lib_conv=lib_conv, verbose=self.verbose) self.layers.append(convpool_layer1) params += convpool_layer1.params weight_types += convpool_layer1.weight_type convpool_layer2 = ConvPoolLayer(input=convpool_layer1.output, image_shape=(96, 27, 27, batch_size), filter_shape=(96, 5, 5, 256), convstride=1, padsize=2, group=2, poolsize=3, poolstride=2, bias_init=0.1, lrn=True, lib_conv=lib_conv, verbose=self.verbose) self.layers.append(convpool_layer2) params += convpool_layer2.params weight_types += convpool_layer2.weight_type convpool_layer3 = ConvPoolLayer(input=convpool_layer2.output, image_shape=(256, 13, 13, batch_size), filter_shape=(256, 3, 3, 384), convstride=1, padsize=1, group=1, poolsize=1, poolstride=0, bias_init=0.0, lrn=False, lib_conv=lib_conv, verbose=self.verbose) self.layers.append(convpool_layer3) params += convpool_layer3.params weight_types += convpool_layer3.weight_type convpool_layer4 = ConvPoolLayer(input=convpool_layer3.output, image_shape=(384, 13, 13, batch_size), filter_shape=(384, 3, 3, 384), convstride=1, padsize=1, group=2, poolsize=1, poolstride=0, bias_init=0.1, lrn=False, lib_conv=lib_conv, verbose=self.verbose) self.layers.append(convpool_layer4) params += convpool_layer4.params weight_types += convpool_layer4.weight_type convpool_layer5 = ConvPoolLayer(input=convpool_layer4.output, image_shape=(384, 13, 13, batch_size), filter_shape=(384, 3, 3, 256), convstride=1, padsize=1, group=2, poolsize=3, poolstride=2, bias_init=0.0, lrn=False, lib_conv=lib_conv, verbose=self.verbose) self.layers.append(convpool_layer5) params += convpool_layer5.params weight_types += convpool_layer5.weight_type fc_layer6_input = T.flatten( convpool_layer5.output.dimshuffle(3, 0, 1, 2), 2) fc_layer6 = FCLayer(input=fc_layer6_input, n_in=9216, n_out=4096, verbose=self.verbose) self.layers.append(fc_layer6) params += fc_layer6.params weight_types += fc_layer6.weight_type dropout_layer6 = DropoutLayer(fc_layer6.output, n_in=4096, n_out=4096, verbose=self.verbose) fc_layer7 = FCLayer(input=dropout_layer6.output, n_in=4096, n_out=4096, verbose=self.verbose) self.layers.append(fc_layer7) params += fc_layer7.params weight_types += fc_layer7.weight_type dropout_layer7 = DropoutLayer(fc_layer7.output, n_in=4096, n_out=4096, verbose=self.verbose) softmax_layer8 = SoftmaxLayer(input=dropout_layer7.output, n_in=4096, n_out=n_softmax_out, verbose=self.verbose) self.layers.append(softmax_layer8) params += softmax_layer8.params weight_types += softmax_layer8.weight_type # 
#################### NETWORK BUILT ####################### self.p_y_given_x = softmax_layer8.p_y_given_x self.y_pred = softmax_layer8.y_pred self.cost = softmax_layer8.negative_log_likelihood(y) self.errors = softmax_layer8.errors(y) if n_softmax_out < 5: self.errors_top_5 = softmax_layer8.errors_top_x(y, n_softmax_out) else: self.errors_top_5 = softmax_layer8.errors_top_x(y, 5) self.params = params # inputs self.x = x self.y = y self.rand = rand self.lr = lr self.shared_x = theano.shared( np.zeros( (3, config['input_width'], config['input_height'], config['file_batch_size']), # for loading large batch dtype=theano.config.floatX), borrow=True) self.shared_y = theano.shared(np.zeros((config['file_batch_size'], ), dtype=int), borrow=True) self.shared_lr = theano.shared(np.float32(config['learning_rate'])) # training related self.base_lr = np.float32(config['learning_rate']) self.step_idx = 0 self.mu = config['momentum'] # def: 0.9 # momentum self.eta = config['weight_decay'] #0.0002 # weight decay self.weight_types = weight_types self.batch_size = batch_size self.grads = T.grad(self.cost, self.params) # shared variable for storing momentum before exchanging momentum(delta w) self.vels = [ theano.shared(param_i.get_value() * 0.) for param_i in self.params ] # shared variable for accepting momentum during exchanging momentum(delta w) self.vels2 = [ theano.shared(param_i.get_value() * 0.) for param_i in self.params ] self.train = None self.get_vel = None self.descent_vel = None self.val = None self.inference = None
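# The vels/vels2 buffers above hold one velocity per parameter; the training
# function that consumes them is assembled elsewhere, but a standard momentum +
# weight-decay step over such buffers looks like this (a sketch, not the
# project's exact update; mu and eta defaults follow the comments above):
import numpy as np

def momentum_step(param, grad, vel, lr, mu=0.9, eta=0.0002):
    """Classical momentum with L2 weight decay: v <- mu*v - lr*(g + eta*w); w <- w + v."""
    vel = mu * vel - lr * (grad + eta * param)
    return param + vel, vel

_w = np.zeros(4)
_v = np.zeros(4)
_g = np.array([1.0, -2.0, 0.5, 0.0])
for _ in range(3):
    _w, _v = momentum_step(_w, _g, _v, lr=0.01)
print(_w)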
def build_model(self): if self.verbose: print(self.name) # start graph construction from scratch import theano.tensor as T if seed_weight_on_pid: import theanompi.models.layers2 as layers import os layers.rng = np.random.RandomState(os.getpid()) from theanompi.models.layers2 import (ConvPoolLRN,Dropout,FC, Dimshuffle, Crop, Subtract, Softmax,Flatten,LRN, Constant, Normal) self.x = T.ftensor4('x') self.y = T.lvector('y') self.lr = T.scalar('lr') # subtract_layer = Subtract(input=self.x, # input_shape=(self.channels, # self.data.width, # self.data.height, # self.batch_size), # subtract_arr = self.data.rawdata[4], # printinfo = self.verbose # ) # # crop_layer = Crop(input=subtract_layer, # output_shape=(self.channels, # self.input_width, # self.input_height, # self.batch_size), # flag_batch=batch_crop_mirror, # printinfo = self.verbose # ) convpool_layer1 = ConvPoolLRN(input=self.x, #crop_layer, input_shape=(self.channels, self.input_width, self.input_height, self.batch_size), filter_shape=(3, 11, 11, 96), convstride=4, padsize=0, group=1, poolsize=3, poolstride=2, b=0.0, lrn=True, lib_conv=lib_conv, printinfo = self.verbose #output_shape = (96, 27, 27, batch_size) ) convpool_layer2 = ConvPoolLRN(input=convpool_layer1, #input_shape=(96, 27, 27, batch_size), filter_shape=(96, 5, 5, 256), convstride=1, padsize=2, group=2, poolsize=3, poolstride=2, b=0.1, lrn=True, lib_conv=lib_conv, printinfo = self.verbose #output_shape=(256, 13, 13, batch_size), ) convpool_layer3 = ConvPoolLRN(input=convpool_layer2, #input_shape=(256, 13, 13, batch_size), filter_shape=(256, 3, 3, 384), convstride=1, padsize=1, group=1, poolsize=1, poolstride=0, b=0.0, lrn=False, lib_conv=lib_conv, printinfo = self.verbose #output_shape=(384, 13, 13, batch_size), ) convpool_layer4 = ConvPoolLRN(input=convpool_layer3, #input_shape=(384, 13, 13, batch_size), filter_shape=(384, 3, 3, 384), convstride=1, padsize=1, group=2, poolsize=1, poolstride=0, b=0.1, lrn=False, lib_conv=lib_conv, printinfo = self.verbose #output_shape=(384, 13, 13, batch_size), ) convpool_layer5 = ConvPoolLRN(input=convpool_layer4, #input_shape=(384, 13, 13, batch_size), filter_shape=(384, 3, 3, 256), convstride=1, padsize=1, group=2, poolsize=3, poolstride=2, b=0.0, lrn=False, lib_conv=lib_conv, printinfo = self.verbose #output_shape=(256, 6, 6, batch_size), ) shuffle = Dimshuffle(input=convpool_layer5, new_axis_order=(3,0,1,2), printinfo=self.verbose ) fc_layer6_input = Flatten(input=shuffle, #input_shape=(batch_size, 256, 6, 6), axis = 2, printinfo=self.verbose ) fc_layer6 = FC(input=fc_layer6_input, # n_in=9216, n_out=4096, W=Normal((fc_layer6_input.output_shape[1], 4096), std=0.005), b=Constant((4096,), val=0.1), printinfo = self.verbose ) dropout_layer6 = Dropout(input=fc_layer6, # n_in=4096, n_out=fc_layer6.output_shape[1], prob_drop=0.5, printinfo = self.verbose) fc_layer7 = FC(input=dropout_layer6, # n_in=4096, n_out=4096, W = Normal((dropout_layer6.output_shape[1], 4096), std=0.005), b = Constant((4096,), val=0.1), printinfo = self.verbose ) dropout_layer7 = Dropout(input=fc_layer7, #n_in=4096, n_out=fc_layer7.output_shape[1], prob_drop=0.5, printinfo = self.verbose) softmax_layer8 = Softmax(input=dropout_layer7, #n_in=4096, n_out=self.n_softmax_out, W = Normal((dropout_layer7.output_shape[1], self.n_softmax_out), mean=0, std=0.01), b = Constant((self.n_softmax_out,),val=0), printinfo = self.verbose) self.output_layer = softmax_layer8 self.cost = softmax_layer8.negative_log_likelihood(self.y) self.error = softmax_layer8.errors(self.y) 
self.error_top_5 = softmax_layer8.errors_top_x(self.y)
def __theano_build__(self): E, V, U, W, b, c, W_att, b_att = self.E, self.V, self.U, self.W, self.b, self.c, self.W_att, self.b_att x_a = T.ivector('x_a') x_b = T.ivector('x_b') y = T.lvector('y') def forward_direction_step(x_t, s_t_prev): # Word embedding layer x_e = E[:, x_t] # GRU layer 1 z_t = T.nnet.hard_sigmoid(U[0].dot(x_e) + W[0].dot(s_t_prev)) + b[0] r_t = T.nnet.hard_sigmoid(U[1].dot(x_e) + W[1].dot(s_t_prev)) + b[1] c_t = T.tanh(U[2].dot(x_e) + W[2].dot(s_t_prev * r_t) + b[2]) s_t = (T.ones_like(z_t) - z_t) * c_t + z_t * s_t_prev # directly return the hidden state as intermidate output return [s_t] def backward_direction_step(x_t, s_t_prev): # Word embedding layer x_e = E[:, x_t] # GRU layer 2 z_t = T.nnet.hard_sigmoid(U[3].dot(x_e) + W[3].dot(s_t_prev)) + b[3] r_t = T.nnet.hard_sigmoid(U[4].dot(x_e) + W[4].dot(s_t_prev)) + b[4] c_t = T.tanh(U[5].dot(x_e) + W[5].dot(s_t_prev * r_t) + b[5]) s_t = (T.ones_like(z_t) - z_t) * c_t + z_t * s_t_prev # directly return the hidden state as intermidate output return [s_t] # sentence a vector (states) forward direction a_s_f, updates = theano.scan(forward_direction_step, sequences=x_a, truncate_gradient=self.bptt_truncate, outputs_info=T.zeros(self.hidden_dim)) # sentence b vector (states) backward direction a_s_b, updates = theano.scan(backward_direction_step, sequences=x_a[::-1], truncate_gradient=self.bptt_truncate, outputs_info=T.zeros(self.hidden_dim)) # sentence b vector (states) forward direction b_s_f, updates = theano.scan(forward_direction_step, sequences=x_b, truncate_gradient=self.bptt_truncate, outputs_info=T.zeros(self.hidden_dim)) # sentence b vector (states) backward direction b_s_b, updates = theano.scan(backward_direction_step, sequences=x_b[::-1], truncate_gradient=self.bptt_truncate, outputs_info=T.zeros(self.hidden_dim)) # combine the sena a_s = T.concatenate([a_s_f, a_s_b[::-1]], axis=1) b_s = T.concatenate([b_s_f, b_s_b[::-1]], axis=1) def soft_attention(h_i): return T.tanh(W_att.dot(h_i) + b_att) def weight_attention(h_i, a_j): return h_i * a_j a_att, updates = theano.scan(soft_attention, sequences=a_s) b_att, updates = theano.scan(soft_attention, sequences=b_s) # softmax # a_att = (59,1) # b_att = (58,1) a_att = T.exp(a_att) a_att = a_att.flatten() a_att = a_att / a_att.sum() b_att = T.exp(b_att) b_att = b_att.flatten() b_att = b_att / b_att.sum() a_s_att, updates = theano.scan(weight_attention, sequences=[a_s, a_att]) b_s_att, updates = theano.scan(weight_attention, sequences=[b_s, b_att]) # eps = np.asarray([1.0e-10]*self.label_dim,dtype=theano.config.floatX) # semantic similarity # s_sim = manhattan_distance(a_s[-1],b_s[-1]) # for classification using simple strategy # for now we still use the last word vector as sentence vector # apply a simple single hidden layer on each word in sentence # # a (wi) = attention(wi) = tanh(w_att.dot(wi)+b) # theano scan # exp(a) # sena = a_s_att.sum(axis=0) senb = b_s_att.sum(axis=0) combined_s = T.concatenate([sena, senb], axis=0) # softmax class o = T.nnet.softmax(V.dot(combined_s) + c)[0] # in case the o contains 0 which cause inf and nan eps = np.asarray([1.0e-10] * self.label_dim, dtype=theano.config.floatX) o = o + eps om = o.reshape((1, o.shape[0])) prediction = T.argmax(om, axis=1) o_error = T.nnet.categorical_crossentropy(om, y) # cost cost = T.sum(o_error) # updates updates = sgd_updates_adadelta(norm=0, params=self.params, cost=cost) # monitor parameter mV = V * T.ones_like(V) mc = c * T.ones_like(c) mU = U * T.ones_like(U) mW = W * T.ones_like(W) gV = T.grad(cost, V) 
gc = T.grad(cost, c) gU = T.grad(cost, U) gW = T.grad(cost, W) mgV = gV * T.ones_like(gV) mgc = gc * T.ones_like(gc) mgU = gU * T.ones_like(gU) mgW = gW * T.ones_like(gW) # Assign functions self.comsen = theano.function([x_a, x_b], [a_att, b_att]) self.monitor = theano.function([x_a, x_b], [sena, senb, mV, mc, mU, mW]) self.monitor_grad = theano.function([x_a, x_b, y], [mgV, mgc, mgU, mgW]) self.predict = theano.function([x_a, x_b], om) self.predict_class = theano.function([x_a, x_b], prediction) self.ce_error = theano.function([x_a, x_b, y], cost) # self.bptt = theano.function([x,y],[dE,dU,dW,db,dV,dc]) # SGD parameters learning_rate = T.scalar('learning_rate') decay = T.scalar('decay') # rmsprop cache updates # find the nan self.sgd_step = theano.function( [x_a, x_b, y], [], updates=updates # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True) )
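# The attention block above scores each (bidirectional) hidden state with
# tanh(W_att.dot(h) + b_att), turns the scores into weights with a softmax over
# time steps, and sums the re-weighted states into a sentence vector. A NumPy
# sketch of that pooling, assuming W_att yields one scalar score per step (which
# is what the flatten()/normalization above implies); sizes are hypothetical:
import numpy as np

def attention_pool(H, w_att, b_att):
    """H: (T, d) hidden states; w_att: (d,); b_att: scalar -> (d,) sentence vector."""
    scores = np.tanh(H.dot(w_att) + b_att)        # one score per time step
    weights = np.exp(scores)
    weights = weights / weights.sum()             # softmax over time steps
    return (H * weights[:, None]).sum(axis=0)     # weighted sum of states

_rng = np.random.RandomState(0)
_H = _rng.randn(7, 16)                            # 16 = 2 * hidden_dim after concat
print(attention_pool(_H, _rng.randn(16), 0.1).shape)   # (16,)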
def optimization_adadelta(trainvec, testvec, n_epochs, batch_size, alpha=0.001, beta=0.1): i = T.lvector('i') j = T.lvector('j') x = T.dvector('x') num_user = 6040 num_item = 3952 factors = 20 init_mean = 0 init_stdev = 0.02 mfobj = MF_Batch(i, j, num_user, num_item, factors, init_mean, init_stdev) regcost, error = mfobj.errors(x, beta) grads = T.grad(cost=regcost, wrt=[mfobj.P, mfobj.Q]) #f_grad = theano.function([i, j, x], grads, name='f_grad') lr = T.scalar(name='lr') f_grad_shared, f_update = adadelta(lr, mfobj.params2, grads, i, j, x, regcost) test_model = theano.function( inputs=[i, j, x], #givens=[(mfobj.P[i, :]), mfobj.Q[:, j]], outputs=error) mean_rating = np.mean(trainvec[:, 2]) done_looping = False epoch = 0 N = len(trainvec) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 totalErrors = 0 testErrors = 0 for k in range(int(math.floor(N / batch_size))): batch = np.arange(k * batch_size, min(N - 1, (k + 1) * batch_size)) idi = trainvec[batch, 0] - 1 idj = trainvec[batch, 1] - 1 ratings = trainvec[batch, 2] - mean_rating batch_cost = f_grad_shared(idi, idj, ratings) f_update(alpha) totalErrors += batch_cost NN = len(testvec) batch_size = 1000 for k in range(int(math.floor(NN / batch_size))): batch = np.arange(k * batch_size, min(NN - 1, (k + 1) * batch_size)) p_idx = testvec[batch, 0] - 1 q_idx = testvec[batch, 1] - 1 ratings = testvec[batch, 2] - mean_rating testErrors += test_model(p_idx, q_idx, ratings) print( "the training cost at epoch {} is {}, and the testing error is {}". format(epoch, np.sqrt(totalErrors / N), np.sqrt(testErrors / NN))) # test it on the test dataset NN = len(testvec) batch_size = 1000 diff = 0 for k in range(int(math.floor(NN / batch_size))): batch = np.arange(k * batch_size, min(NN - 1, (k + 1) * batch_size)) p_idx = testvec[batch, 0] - 1 q_idx = testvec[batch, 1] - 1 ratings = testvec[batch, 2] - mean_rating diff += test_model(p_idx, q_idx, ratings) print("Total average test error for {} instances is {}".format( NN, np.sqrt(diff / NN)))
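# For reference, the Adadelta rule behind f_grad_shared/f_update keeps running
# averages of squared gradients and squared updates and scales each step by the
# ratio of their RMS values. A NumPy sketch of one parameter update under the
# usual formulation (rho/eps values are illustrative, not the project's exact
# adadelta helper):
import numpy as np

def adadelta_step(param, grad, Eg2, Edx2, rho=0.95, eps=1e-6):
    """One Adadelta update; returns the new parameter and running averages."""
    Eg2 = rho * Eg2 + (1.0 - rho) * grad ** 2
    dx = -np.sqrt(Edx2 + eps) / np.sqrt(Eg2 + eps) * grad
    Edx2 = rho * Edx2 + (1.0 - rho) * dx ** 2
    return param + dx, Eg2, Edx2

_p = np.zeros(3)
_Eg2 = np.zeros(3)
_Edx2 = np.zeros(3)
for _ in range(2):
    _p, _Eg2, _Edx2 = adadelta_step(_p, np.array([0.5, -1.0, 2.0]), _Eg2, _Edx2)
print(_p)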
def __theano_build__(self): E = self.E W = self.W U = self.U V = self.V b = self.b c = self.c x = T.lvector('x') # y = T.lvector('y') # def forward_prop_step(x_t, h_t_prev, c_t_prev): # Word embedding layer x_e = E[:, x_t] i_t = T.nnet.sigmoid(W[0].dot(x_e) + U[0].dot(h_t_prev) + b[0]) f_t = T.nnet.sigmoid(W[1].dot(x_e) + U[1].dot(h_t_prev) + b[1]) o_t = T.nnet.sigmoid(W[2].dot(x_e) + U[2].dot(h_t_prev) + b[2]) u_t = T.tanh(W[3].dot(x_e) + U[3].dot(h_t_prev) + b[3]) c_t = i_t*u_t + f_t * c_t_prev h_t = o_t * T.tanh(c_t) # Final output calculation # Theano's softmax returns a matrix with one row, we only need the row # o = T.nnet.softmax(V.dot(h_t) + c)[0] # o = T.nnet.softmax(V[0].dot(h_t) + c) return [h_t, c_t] [h_t, c_t], updates = theano.scan(fn=forward_prop_step, sequences=x, truncate_gradient=self.bptt_truncate, outputs_info=[ dict(initial=T.zeros(self.hidden_dim)), dict(initial=T.zeros(self.hidden_dim)) ]) # o is an array for o[t] is output of time step t # we only care the output of final time step def forward_prop_step_b(x_t, h_t_prev_b, c_t_prev_b): # the backward # Word embedding layer x_e_b = E[:, x_t] i_t_b = T.nnet.sigmoid(W[4].dot(x_e_b) + U[4].dot(h_t_prev_b) + b[4]) f_t_b = T.nnet.sigmoid(W[5].dot(x_e_b) + U[5].dot(h_t_prev_b) + b[5]) o_t_b = T.nnet.sigmoid(W[6].dot(x_e_b) + U[6].dot(h_t_prev_b) + b[6]) u_t_b = T.tanh(W[7].dot(x_e_b) + U[7].dot(h_t_prev_b) + b[7]) c_t_b = i_t_b * u_t_b + f_t_b * c_t_prev_b h_t_b = o_t_b * T.tanh(c_t_b) # Final output calculation # Theano's softmax returns a matrix with one row, we only need the row # o = T.nnet.softmax(V.dot(h_t) + c)[0] # o_b = T.nnet.softmax(V[1].dot(h_t) + c) return [h_t_b, c_t_b] [h_t_b, c_t_b], updates = theano.scan(fn=forward_prop_step_b, sequences=x[::-1], truncate_gradient=self.bptt_truncate, outputs_info=[dict(initial=T.zeros(self.hidden_dim)), dict(initial=T.zeros(self.hidden_dim))]) final_h = h_t[-1] final_h_b = h_t_b[-1] final_h_concat = T.concatenate([final_h,final_h_b], axis=0) final_o = T.nnet.softmax(V[0].dot(final_h_concat) + c) # a array with one row prediction = T.argmax(final_o[0], axis=0) print('final_o', final_o.ndim) print('y ', y.ndim) final_o_error = T.sum(T.nnet.categorical_crossentropy(final_o, y)) cost = final_o_error # gradient dE = T.grad(cost, E) dU = T.grad(cost, U) dW = T.grad(cost, W) db = T.grad(cost, b) dV = T.grad(cost, V) dc = T.grad(cost, c) # function self.predict = theano.function([x], final_o) self.predict_class = theano.function([x], prediction) self.ce_error = theano.function([x,y], cost) # SGD parameters learning_rate = T.scalar('learning_rate') self.sgd_step = theano.function([x,y,learning_rate],[], updates=[(self.U, self.U - learning_rate * dU), (self.V, self.V - learning_rate * dV), (self.W, self.W - learning_rate * dW), (self.E, self.E - learning_rate * dE), (self.b, self.b - learning_rate * db), (self.c, self.c - learning_rate * dc)])
broadcastable=param.broadcastable) accu_new = accu + grad**2 updates[accu] = accu_new updates[param] = param - (learning_rate * grad / T.sqrt(accu_new + epsilon)) return updates ### momentum ### ######## main ######################### if __name__ == '__main__': # parameters setup X = T.matrix('X') y = T.lvector('y') nn_input_dim = 39 # 39 features as input dim nn_output_dim = 48 # 48 categorical phones hidden_shape = [128, 128] # shape of hidden layer nn_shape = [nn_input_dim] + hidden_shape + [nn_output_dim ] # [39,1000,1000,48] nn shape ### initialize weight/bias ### W = {} b = {} # weight and bias layers = range(len(nn_shape) - 1) # [0,1,2] for layer in layers: shape = nn_shape[layer:layer + 2] # shape of this layer dim = nn_shape[layer + 1] # dim of bias W[layer] = init_weight(shape, index=layer)
def __init__(self, batch_size, train_set,initial_weights=None,weigths_service = None): # lowAndHigh_c1Values= [-0.3,0.3],lowAndHigh_c3Values = [-0.1,0.1],lowAndHigh_fc5Values = [-0.01,0.01],lowAndHigh_fc6Values = [-0.001,0.001]): x = T.tensor4('x') # the data is presented as rasterized images y = T.lvector('y') # batch_size = 50000 # img_input = x #T.reshape(x,(batch_size, 1, 28, 28)) self.cnn = arqui.OCRLenetArquitecture( img_input=x, batch_size=batch_size, initWeights=initial_weights, weigths_service = weigths_service ) # the cost we minimize during training is the NLL of the model cost = self.cnn.LR6.negative_log_likelihood(y) errors = self.cnn.LR6.errors(y) self.Weights = [self.cnn.LR6.Filter, self.cnn.LR6.Bias, self.cnn.FC5.Filter, self.cnn.FC5.Bias, self.cnn.C3.Filter, self.cnn.C1.Filter] grads = T.grad(cost, self.Weights, disconnected_inputs="raise") # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i], grads[i]) pairs. learningRate = T.dscalar() updates = [ (param_i, param_i + (learningRate * grad_i)) for param_i, grad_i in zip(self.Weights, grads) ] trainset_x = theano.shared(train_set[0]) trainset_y = theano.shared(train_set[1]) index = T.lscalar() #bs = T.lscalar() self.train_model = theano.function( [index, learningRate], cost, # self.classifier.FC.p_y_given_x,#dropout.output updates=updates, givens={ x: trainset_x[index * batch_size: (index + 1) * batch_size], y: trainset_y[index * batch_size: (index + 1) * batch_size] } ) self.evaluation_model_with_cost = theano.function( [index], cost, # self.classifier.FC.p_y_given_x,#dropout.output givens={ x: trainset_x[index * batch_size: (index + 1) * batch_size], y: trainset_y[index * batch_size: (index + 1) * batch_size] } ) self.evaluation_model_with_errors = theano.function( [index], errors, givens={ x: trainset_x[index * batch_size: (index + 1) * batch_size], y: trainset_y[index * batch_size: (index + 1) * batch_size] } )
def __init__(self): super(M1, self).__init__() self.a = T.dscalar() self.b = T.lscalar() self.c = T.lvector()
def create_train_rbm(learning_rate=1e-3, training_epochs=200, dataset=None,
                     seqlen=None, batch_size=10, n_hidden=30):
    """
    Demonstrate how to train an RBM.

    :param learning_rate: learning rate used for training the RBM
    :param training_epochs: number of epochs used for training
    :param dataset: path to the pickled dataset
    :param batch_size: size of a batch used to train the RBM
    :param n_chains: number of parallel Gibbs chains to be used for sampling
    :param n_samples: number of samples to plot for each chain
    """
    # compute number of minibatches for training, validation and testing
    n_train_batches = int(
        dataset.get_value(borrow=True).shape[0] / batch_size)  # number of minibatches
    n_dim = dataset.get_value(borrow=True).shape[1]  # number of values in each frame

    # allocate symbolic variables for the data
    index = T.lvector()  # list of indices into the shuffled data
    x = T.matrix('x')    # the data, as a matrix because of the minibatches

    # initialize storage for the persistent chain (state = hidden layer of chain)
    persistent_chain = theano.shared(numpy.zeros((batch_size, n_hidden),
                                                 dtype=theano.config.floatX),
                                     borrow=True)

    # construct the RBM class
    rbm = RBM(input=x, n_visible=n_dim, n_hidden=n_hidden)

    # get the cost and the gradient corresponding to one step of CD-k (here k=1)
    cost, updates = rbm.get_cost_updates(lr=learning_rate,
                                         persistent=persistent_chain, k=1)

    #################################
    #     Training the RBM          #
    #################################
    # it is ok for a theano function to have no output;
    # the purpose of train_rbm is solely to update the RBM parameters
    train_rbm = theano.function(
        [index],  # minibatch indices
        cost,
        updates=updates,
        givens={x: dataset[index]},  # data for the [index] minibatch
        name='train_rbm')

    plotting_time = 0.
    start_time = timeit.default_timer()

    # Shuffle the data: learning gesture after gesture could introduce a bias,
    # so we shuffle the data in order to learn all gestures at the same time.
    datasetindex = []
    last = 0
    for s in seqlen:
        datasetindex += range(last, last + s)
        last += s
    permindex = numpy.array(datasetindex)
    rbm.numpy_rng.shuffle(permindex)

    # to visualize the cost evolution during the training phase
    cost_y = []

    # go through training epochs
    for epoch in range(training_epochs):
        # go through the training set
        mean_cost = []
        for batch_index in range(int(n_train_batches)):  # for each minibatch
            # get the indices of this minibatch from the shuffled index list
            data_idx = permindex[batch_index * batch_size:
                                 (batch_index + 1) * batch_size]
            mean_cost += [train_rbm(data_idx)]

        print('Training epoch %d, cost is ' % epoch, numpy.mean(mean_cost))
        cost_y.append(numpy.mean(mean_cost))

    end_time = timeit.default_timer()
    pretraining_time = (end_time - start_time) - plotting_time

    print('RBM : Training took %f minutes' % (pretraining_time / 60.))

    return rbm, cost_y
def init_function(self): self.seq_loc = T.lvector() self.seq_idx = T.lvector() self.target = T.lvector() self.target_content_index = T.lscalar() self.seq_len = T.lscalar() self.solution = T.matrix() self.seq_matrix = T.take(self.Vw, self.seq_idx, axis=0) self.all_tar_vector = T.take(self.Vw, self.target, axis=0) self.tar_vector = T.mean(self.all_tar_vector, axis=0) self.target_vector_dim = self.tar_vector.dimshuffle('x', 0) self.seq_matrix = T.concatenate([self.seq_matrix[0:self.target_content_index], self.target_vector_dim, self.seq_matrix[self.target_content_index + 1:]], axis=0) h, c = T.zeros_like(self.bf, dtype=theano.config.floatX), T.zeros_like(self.bc, dtype=theano.config.floatX) def rnn(X, aspect): def encode_forward(x_t, h_fore, c_fore): v = T.concatenate([h_fore, x_t]) f_t = T.nnet.sigmoid(T.dot(self.Wf, v) + self.bf) i_t = T.nnet.sigmoid(T.dot(self.Wi, v) + self.bi) o_t = T.nnet.sigmoid(T.dot(self.Wo, v) + self.bo) c_next = f_t * c_fore + i_t * T.tanh(T.dot(self.Wc, v) + self.bc) h_next = o_t * T.tanh(c_next) return h_next, c_next def encode_backward(x_t, h_fore, c_fore): v = T.concatenate([h_fore, x_t]) f_t = T.nnet.sigmoid(T.dot(self.Wf, v) + self.bf) i_t = T.nnet.sigmoid(T.dot(self.Wi, v) + self.bi) o_t = T.nnet.sigmoid(T.dot(self.Wo, v) + self.bo) c_next = f_t * c_fore + i_t * T.tanh(T.dot(self.Wc, v) + self.bc) h_next = o_t * T.tanh(c_next) return h_next, c_next loc_for = T.zeros_like(self.seq_loc) + self.target_content_index al_for = self.a_for_left * T.exp( -self.b_for_left * T.abs_( self.seq_loc[0:self.target_content_index] - loc_for[0:self.target_content_index])) am_for = self.a_for_middle * [1] a_for = T.concatenate([al_for, am_for]) locate_for = T.zeros_like(self.seq_matrix[0:self.target_content_index + 1], dtype=T.config.floatX) + T.reshape(a_for, [-1, 1]) loc_back = T.zeros_like(self.seq_loc) + self.target_content_index ar_back = self.a_back_right * T.exp( -self.b_back_right * T.abs_( self.seq_loc[self.target_content_index + 1:] - loc_back[self.target_content_index + 1:])) ar_back = ar_back[::-1] a_back = T.concatenate([am_for, ar_back]) locate_back = T.zeros_like(self.seq_matrix[self.target_content_index:], dtype=T.config.floatX) + T.reshape( a_back, [-1, 1]) scan_result_forward, _forward = theano.scan(fn=encode_forward, sequences=locate_for * X[0:self.target_content_index + 1], outputs_info=[h, c]) scan_result_backward, _backward = theano.scan(fn=encode_backward, sequences=locate_back * X[self.target_content_index:][::-1], outputs_info=[h, c]) embedding_l = scan_result_forward[0] embedding_r = scan_result_backward[0] h_target_for = embedding_l[-1] h_target_back = embedding_r[-1] attention_h_target_l = embedding_l cont_l = T.concatenate([h_target_for, h_target_back]) yuyi_l = T.transpose(cont_l) alpha_h_l = T.dot(T.dot(attention_h_target_l, self.alpha_h_W_L), yuyi_l) alpha_tmp_l = T.nnet.softmax(alpha_h_l) r_l = T.dot(alpha_tmp_l, embedding_l) h_star_L = T.tanh(T.dot(r_l, self.Wp_L)) attention_h_target_r = embedding_r cont_r = T.concatenate([h_target_for, h_target_back]) yuyi_r = T.transpose(cont_r) alpha_h_r = T.dot(T.dot(attention_h_target_r, self.alpha_h_W_R), yuyi_r) alpha_tmp_r = T.nnet.softmax(alpha_h_r) r_r = T.dot(alpha_tmp_r, embedding_r) h_star_R = T.tanh(T.dot(r_r, self.Wp_R)) embedding = T.concatenate([h_star_L, h_star_R], axis=1) return embedding embedding = rnn(self.seq_matrix, self.tar_vector) embedding_for_train = embedding * self.srng.binomial(embedding.shape, p=0.5, n=1, dtype=embedding.dtype) embedding_for_test = embedding * 0.5 
self.pred_for_train = T.nnet.softmax(T.dot(embedding_for_train, self.Ws) + self.bs) self.pred_for_test = T.nnet.softmax(T.dot(embedding_for_test, self.Ws) + self.bs) self.l2 = sum([T.sum(param ** 2) for param in self.params]) - T.sum(self.Vw ** 2) self.loss_sen = -T.tensordot(self.solution, T.log(self.pred_for_train), axes=2) self.loss_l2 = 0.5 * self.l2 * self.regular self.loss = self.loss_sen + self.loss_l2 grads = T.grad(self.loss, self.params) self.updates = collections.OrderedDict() self.grad = {} for param, grad in zip(self.params, grads): g = theano.shared(np.asarray(np.zeros_like(param.get_value()), \ dtype=theano.config.floatX)) self.grad[param] = g self.updates[g] = g + grad self.func_train = theano.function( inputs=[self.seq_idx, self.target, self.solution, self.target_content_index, self.seq_loc, self.seq_len, theano.In(h, value=self.h0), theano.In(c, value=self.c0)], outputs=[self.loss, self.loss_sen, self.loss_l2], updates=self.updates, on_unused_input='warn') self.func_test = theano.function( inputs=[self.seq_idx, self.target, self.target_content_index, self.seq_loc, self.seq_len, theano.In(h, value=self.h0), theano.In(c, value=self.c0)], outputs=self.pred_for_test, on_unused_input='warn')
def main(): ########## # LAYERS # ######### HOME_DIR = "semeval_parsed" timestamp = str(long(time.time() * 1000)) input_fname = '200M' embedding = 'custom' data_dir = HOME_DIR + '_' + input_fname numpy_rng = numpy.random.RandomState(123) print "Load Parameters" parameter_map = cPickle.load( open(data_dir + '/parameters_distant_winner.p', 'rb')) input_shape = parameter_map['inputShape'] filter_width = parameter_map['filterWidth'] n_in = parameter_map['n_in'] st = parameter_map['st'] fname_wordembeddings = os.path.join( data_dir, 'emb_smiley_tweets_embedding_topic.npy') print "Loading word embeddings from", fname_wordembeddings vocab_emb_overlap = numpy.load(fname_wordembeddings) ndim = vocab_emb_overlap.shape[1] ndim = 5 fname_vocab = os.path.join(data_dir, 'vocab_{}.pickle'.format('topic')) alphabet = cPickle.load(open(fname_vocab)) dummy_word_id = alphabet.fid vocab_emb_overlap = (numpy_rng.randn(dummy_word_id + 1, ndim) * 0.25).astype(numpy.float32) def relu(x): return x * (x > 0) activation = relu tweets = T.imatrix('tweets_train') topics = T.imatrix('topics') y = T.lvector('y') batch_tweets = T.imatrix('batch_x_q') batch_topics = T.imatrix('batch_top') batch_y = T.lvector('batch_y') lookup_table_words = nn_layers.LookupTableFastStatic( W=parameter_map['LookupTableFastStaticW'].get_value(), pad=filter_width - 1) lookup_table_topic = nn_layers.LookupTableFast(W=vocab_emb_overlap, pad=filter_width - 1) lookup_table = nn_layers.ParallelLookupTable( layers=[lookup_table_words, lookup_table_topic]) filter_shape = parameter_map['FilterShape' + str(filter_width)] filter_shape = (filter_shape[0], filter_shape[1], filter_shape[2], filter_shape[3] + ndim) input_shape = (input_shape[0], input_shape[1], input_shape[2], input_shape[3] + ndim) conv_layers = [] fan_in = numpy.prod(filter_shape[1:]) fan_out = filter_shape[0] * numpy.prod(filter_shape[2:]) W_bound = numpy.sqrt(1. 
/ fan_in) W_data = numpy.asarray(numpy_rng.uniform(low=-W_bound, high=W_bound, size=(filter_shape[0], filter_shape[1], filter_shape[2], ndim)), dtype=theano.config.floatX) W_map = parameter_map['Conv2dLayerW' + str(filter_width)].get_value() print W_map.shape print W_data.shape W_data = numpy.concatenate((W_map, W_data), axis=3) conv = nn_layers.Conv2dLayer(W=theano.shared(W_data, name="W_conv1d", borrow=True), rng=numpy_rng, filter_shape=filter_shape, input_shape=input_shape) non_linearity = nn_layers.NonLinearityLayer( b=parameter_map['NonLinearityLayerB' + str(filter_width)], b_size=filter_shape[0], activation=activation) shape1 = parameter_map['PoolingShape1'] pooling = nn_layers.KMaxPoolLayerNative(shape=shape1, ignore_border=True, st=st) input_shape2 = parameter_map['input_shape2' + str(filter_width)] filter_shape2 = parameter_map['FilterShape2' + str(filter_width)] con2 = nn_layers.Conv2dLayer(W=parameter_map['Conv2dLayerW2' + str(filter_width)], rng=numpy_rng, input_shape=input_shape2, filter_shape=filter_shape2) non_linearity2 = nn_layers.NonLinearityLayer( b=parameter_map['NonLinearityLayerB2' + str(filter_width)], b_size=filter_shape2[0], activation=activation) shape2 = parameter_map['PoolingShape2'] pooling2 = nn_layers.KMaxPoolLayerNative(shape=shape2, ignore_border=True) conv2dNonLinearMaxPool = nn_layers.FeedForwardNet( layers=[conv, non_linearity, pooling, con2, non_linearity2, pooling2]) conv_layers.append(conv2dNonLinearMaxPool) join_layer = nn_layers.ParallelLayer(layers=conv_layers) flatten_layer = nn_layers.FlattenLayer() hidden_layer = nn_layers.LinearLayer(W=parameter_map['LinearLayerW'], b=parameter_map['LinearLayerB'], rng=numpy_rng, n_in=n_in, n_out=n_in, activation=activation) n_outs = 2 classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs) nnet_tweets = nn_layers.FeedForwardNet(layers=[ lookup_table, join_layer, flatten_layer, hidden_layer, classifier ]) inputs_train = [batch_tweets, batch_topics, batch_y] givens_train = {tweets: batch_tweets, topics: batch_topics, y: batch_y} inputs_pred = [batch_tweets, batch_topics] givens_pred = {tweets: batch_tweets, topics: batch_topics} nnet_tweets.set_input((tweets, topics)) print nnet_tweets params = nnet_tweets.params cost = nnet_tweets.layers[-1].training_cost(y) predictions = nnet_tweets.layers[-1].y_pred updates = sgd_trainer.get_adadelta_updates(cost, params, rho=0.95, eps=1e-6, max_norm=0, word_vec_name='None') train_fn = theano.function( inputs=inputs_train, outputs=cost, updates=updates, givens=givens_train, ) pred_fn = theano.function(inputs=inputs_pred, outputs=predictions, givens=givens_pred) def predict_batch(batch_iterator): preds = numpy.hstack([ pred_fn(batch_x_q, batch_topics) for (batch_x_q, batch_topics) in batch_iterator ]) return preds[:batch_iterator.n_samples] ####################### # Supervised Learining# ###################### batch_size = 1000 training_2016_tids = numpy.load( os.path.join(data_dir, 'task-BD-train-2016.tids.npy')) training_2016_tweets = numpy.load( os.path.join(data_dir, 'task-BD-train-2016.tweets.npy')) training_2016_sentiments = numpy.load( os.path.join(data_dir, 'task-BD-train-2016.sentiments.npy')) training_2016_topics = numpy.load( os.path.join(data_dir, 'task-BD-train-2016.topics.npy')) dev_2016_tids = numpy.load( os.path.join(data_dir, 'task-BD-dev-2016.tids.npy')) dev_2016_tweets = numpy.load( os.path.join(data_dir, 'task-BD-dev-2016.tweets.npy')) dev_2016_sentiments = numpy.load( os.path.join(data_dir, 'task-BD-dev-2016.sentiments.npy')) dev_2016_topics = 
numpy.load( os.path.join(data_dir, 'task-BD-dev-2016.topics.npy')) devtest_2016_tids = numpy.load( os.path.join(data_dir, 'task-BD-devtest-2016.tids.npy')) devtest_2016_tweets = numpy.load( os.path.join(data_dir, 'task-BD-devtest-2016.tweets.npy')) devtest_2016_sentiments = numpy.load( os.path.join(data_dir, 'task-BD-devtest-2016.sentiments.npy')) devtest_2016_topics = numpy.load( os.path.join(data_dir, 'task-BD-devtest-2016.topics.npy')) test_2016_tids = numpy.load( os.path.join(data_dir, 'SemEval2016-task4-test.subtask-BD.tids.npy')) test_2016_tweets = numpy.load( os.path.join(data_dir, 'SemEval2016-task4-test.subtask-BD.tweets.npy')) test_2016_topics = numpy.load( os.path.join(data_dir, 'SemEval2016-task4-test.subtask-BD.topics.npy')) training_full_tweets = numpy.concatenate( (training_2016_tweets, dev_2016_tweets), axis=0) training_full_sentiments = numpy.concatenate( (training_2016_sentiments, dev_2016_sentiments), axis=0) training_full_topics = numpy.concatenate( (training_2016_topics, dev_2016_topics), axis=0) train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [training_full_tweets, training_full_topics, training_full_sentiments], batch_size=batch_size, randomize=True) devtest2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [devtest_2016_tweets, devtest_2016_topics], batch_size=batch_size, randomize=False) test2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [test_2016_tweets, test_2016_topics], batch_size=batch_size, randomize=False) W_emb_list = [w for w in params if w.name == 'W_emb'] zerout_dummy_word = theano.function([], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list]) epoch = 0 n_epochs = 100 early_stop = 20 check_freq = 4 timer_train = time.time() no_best_dev_update = 0 best_dev_acc = -numpy.inf num_train_batches = len(train_set_iterator) while epoch < n_epochs: timer = time.time() for i, (tweet, topic, y_label) in enumerate(tqdm(train_set_iterator, ascii=True), 1): train_fn(tweet, topic, y_label) if i % check_freq == 0 or i == num_train_batches: y_pred_devtest_2016 = predict_batch(devtest2016_iterator) dev_acc_2016_devtest = semeval_f1_taskB( devtest_2016_sentiments, y_pred_devtest_2016) if dev_acc_2016_devtest > best_dev_acc: print( 'devtest 2016 epoch: {} chunk: {} best_chunk_auc: {:.4f}; best_dev_acc: {:.4f}' .format(epoch, i, dev_acc_2016_devtest, best_dev_acc)) best_dev_acc = dev_acc_2016_devtest best_params = [ numpy.copy(p.get_value(borrow=True)) for p in params ] no_best_dev_update = 0 #cPickle.dump(parameter_map, open(data_dir+'/parameters_{}.p'.format('supervised_posneg'), 'wb')) y_pred_test_2016 = predict_batch(test2016_iterator) numpy.save(data_dir + '/predictions_test_2016', y_pred_test_2016) numpy.save(data_dir + '/predictions_devtest2016', y_pred_devtest_2016) zerout_dummy_word() print('epoch {} took {:.4f} seconds'.format(epoch, time.time() - timer)) epoch += 1 no_best_dev_update += 1 if no_best_dev_update >= early_stop: print "Quitting after of no update of the best score on dev set", no_best_dev_update break print('Training took: {:.4f} seconds'.format(time.time() - timer_train)) for i, param in enumerate(best_params): params[i].set_value(param, borrow=True) ####################### # Get Sentence Vectors# ###################### batch_size = input_shape[0] inputs_senvec = [batch_tweets, batch_topics] givents_senvec = {tweets: batch_tweets, topics: batch_topics} output = nnet_tweets.layers[-2].output output_fn = function(inputs=inputs_senvec, outputs=output, 
givens=givents_senvec) sets = [(dev_2016_tids, dev_2016_topics, dev_2016_tweets, 'task-BD-dev-2016'), (training_2016_tids, training_2016_topics, training_2016_tweets, 'task-BD-train-2016'), (devtest_2016_tids, devtest_2016_topics, devtest_2016_tweets, 'task-BD-devtest-2016'), (test_2016_tids, test_2016_topics, test_2016_tweets, 'SemEval2016-task4-test.subtask-BD')] for (fids, ftop, fset, name) in sets: test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [fset, ftop], batch_size=batch_size, randomize=False) counter = 0 fname = open( os.path.join(data_dir, 'sentence_vecs_topic/{}.txt'.format(name)), 'w+') for i, (tweet, topic) in enumerate(tqdm(test_set_iterator), 1): o = output_fn(tweet, topic) for vec in o: fname.write(fids[counter]) for el in numpy.nditer(vec): fname.write(" %f" % el) fname.write("\n") counter += 1 if counter == test_set_iterator.n_samples: break ############################### # Get Prediction Probabilities # ############################### batch_size = input_shape[0] output = nnet_tweets.layers[-1].p_y_given_x output_fn = function(inputs=inputs_senvec, outputs=output, givens=givents_senvec) for (fids, ftop, fset, name) in sets: test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [fset, ftop], batch_size=batch_size, randomize=False) counter = 0 fname = open( os.path.join(data_dir, 'prob_predictions_topic/{}.txt'.format(name)), 'w+') for i, (tweet, topic) in enumerate(tqdm(test_set_iterator), 1): o = output_fn(tweet, topic) for vec in o: for el in numpy.nditer(vec): fname.write(" %f" % el) fname.write("\n") counter += 1 if counter == test_set_iterator.n_samples: break
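# Hedged sketch of the zerout_dummy_word update compiled above: every lookup table
# named 'W_emb' reserves its last row for the padding/dummy word, and that row is reset
# to zero after each epoch so padding never contributes to the representation. The
# shared matrix below is a toy stand-in, not the real lookup table from nn_layers.
import numpy
import theano
import theano.tensor as T

W_emb = theano.shared(numpy.random.randn(10, 4).astype(theano.config.floatX),
                      name='W_emb')
zerout_dummy = theano.function([], updates=[(W_emb, T.set_subtensor(W_emb[-1:], 0.))])
zerout_dummy()
print W_emb.get_value()[-1]   # the dummy-word row is now all zeros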
weights_init = IsotropicGaussian(0.01) biases_init = Constant(0.001) # ========================================================================================== # THE MODEL # ========================================================================================== print('Building model ...') bricks = [] dropout_locs = [] # THEANO INPUT VARIABLES eeg = tensor.tensor3('eeg') # batch x time x feature acc = tensor.tensor3('acc') # batch x time x feature label = tensor.lvector('label') # batch eeg_len = 150 * 25 acc_len = 150 acc_chan = 3 def normalize(var, axis): var = var - var.mean(axis=axis, keepdims=True) var = var / tensor.sqrt((var**2).mean(axis=axis, keepdims=True)) return var eeg1 = normalize(eeg, axis=0) eeg2 = normalize(eeg, axis=1) eeg3 = normalize(eeg1, axis=1)
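# NumPy restatement of the normalize() helper above, to make the axis choice concrete:
# subtract the mean and divide by the root-mean-square along the given axis, so axis=0
# standardises each (time, feature) position across the batch while axis=1 standardises
# each channel over time. Toy shapes only.
import numpy as np

def normalize_np(var, axis):
    var = var - var.mean(axis=axis, keepdims=True)
    return var / np.sqrt((var ** 2).mean(axis=axis, keepdims=True))

x = np.random.randn(8, 150, 3)                      # batch x time x feature, toy sizes
y = normalize_np(x, axis=1)
print np.abs(y.mean(axis=1)).max()                  # ~0: zero mean over time
print np.abs((y ** 2).mean(axis=1) - 1).max()       # ~0: unit mean square over time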
def jobman(state, channel): # load dataset rng = numpy.random.RandomState(state['seed']) # declare the dimensionalies of the input and output if state['chunks'] == 'words': state['n_in'] = 10000 state['n_out'] = 10000 else: state['n_in'] = 50 state['n_out'] = 50 train_data, valid_data, test_data = get_text_data(state) ## BEGIN Tutorial ### Define Theano Input Variables x = TT.lvector('x') y = TT.lvector('y') h0 = theano.shared( numpy.zeros((eval(state['nhids'])[-1], ), dtype='float32')) ### Neural Implementation of the Operators: \oplus #### Word Embedding emb_words = MultiLayer(rng, n_in=state['n_in'], n_hids=eval(state['inp_nhids']), activation=eval(state['inp_activ']), init_fn='sample_weights_classic', weight_noise=state['weight_noise'], rank_n_approx=state['rank_n_approx'], scale=state['inp_scale'], sparsity=state['inp_sparse'], learn_bias=True, bias_scale=eval(state['inp_bias']), name='emb_words') #### Deep Transition Recurrent Layer rec = eval(state['rec_layer'])( rng, eval(state['nhids']), activation=eval(state['rec_activ']), #activation = 'TT.nnet.sigmoid', bias_scale=eval(state['rec_bias']), scale=eval(state['rec_scale']), sparsity=eval(state['rec_sparse']), init_fn=eval(state['rec_init']), weight_noise=state['weight_noise'], name='rec') #### Stiching them together ##### (1) Get the embedding of a word x_emb = emb_words(x, no_noise_bias=state['no_noise_bias']) ##### (2) Embedding + Hidden State via DT Recurrent Layer reset = TT.scalar('reset') rec_layer = rec(x_emb, n_steps=x.shape[0], init_state=h0 * reset, no_noise_bias=state['no_noise_bias'], truncate_gradient=state['truncate_gradient'], batch_size=1) ## BEGIN Exercise: DOT-RNN ### Neural Implementation of the Operators: \lhd #### Exercise (1) #### Hidden state -> Intermediate Layer emb_state = MultiLayer(rng, n_in=eval(state['nhids'])[-1], n_hids=eval(state['dout_nhid']), activation=linear, init_fn=eval(state['dout_init']), weight_noise=state['weight_noise'], scale=state['dout_scale'], sparsity=state['dout_sparse'], learn_bias=True, bias_scale=eval(state['dout_bias']), name='emb_state') #### Exercise (1) #### Input -> Intermediate Layer emb_words_out = MultiLayer(rng, n_in=state['n_in'], n_hids=eval(state['dout_nhid']), activation=linear, init_fn='sample_weights_classic', weight_noise=state['weight_noise'], scale=state['dout_scale'], sparsity=state['dout_sparse'], rank_n_approx=state['dout_rank_n_approx'], learn_bias=False, bias_scale=eval(state['dout_bias']), name='emb_words_out') #### Hidden State: Combine emb_state and emb_words_out #### Exercise (1) outhid_activ = UnaryOp(activation=eval(state['dout_activ'])) #### Exercise (2) outhid_dropout = DropOp(dropout=state['dropout'], rng=rng) #### Softmax Layer output_layer = SoftmaxLayer(rng, eval(state['dout_nhid']), state['n_out'], scale=state['out_scale'], bias_scale=state['out_bias_scale'], init_fn="sample_weights_classic", weight_noise=state['weight_noise'], sparsity=state['out_sparse'], sum_over_time=True, name='out') ### Few Optional Things #### Direct shortcut from x to y if state['shortcut_inpout']: shortcut = MultiLayer(rng, n_in=state['n_in'], n_hids=eval(state['inpout_nhids']), activations=eval(state['inpout_activ']), init_fn='sample_weights_classic', weight_noise=state['weight_noise'], scale=eval(state['inpout_scale']), sparsity=eval(state['inpout_sparse']), learn_bias=eval(state['inpout_learn_bias']), bias_scale=eval(state['inpout_bias']), name='shortcut') #### Learning rate scheduling (1/(1+n/beta)) state['clr'] = state['lr'] def update_lr(obj, cost): stp = 
obj.step if isinstance(obj.state['lr_start'], int) and stp > obj.state['lr_start']: time = float(stp - obj.state['lr_start']) new_lr = obj.state['clr'] / (1 + time / obj.state['lr_beta']) obj.lr = new_lr if state['lr_adapt']: rec.add_schedule(update_lr) ### Neural Implementations of the Language Model #### Training if state['shortcut_inpout']: additional_inputs = [rec_layer, shortcut(x)] else: additional_inputs = [rec_layer] ##### Exercise (1): Compute the output intermediate layer outhid = outhid_activ(emb_state(rec_layer) + emb_words_out(x)) ##### Exercise (2): Apply Dropout outhid = outhid_dropout(outhid) train_model = output_layer(outhid, no_noise_bias=state['no_noise_bias'], additional_inputs=additional_inputs).train( target=y, scale=numpy.float32(1. / state['seqlen'])) nw_h0 = rec_layer.out[rec_layer.out.shape[0] - 1] if state['carry_h0']: train_model.updates += [(h0, nw_h0)] #### Validation h0val = theano.shared( numpy.zeros((eval(state['nhids'])[-1], ), dtype='float32')) rec_layer = rec(emb_words(x, use_noise=False), n_steps=x.shape[0], batch_size=1, init_state=h0val * reset, use_noise=False) nw_h0 = rec_layer.out[rec_layer.out.shape[0] - 1] ##### Exercise (1): Compute the output intermediate layer outhid = outhid_activ(emb_state(rec_layer) + emb_words_out(x)) ##### Exercise (2): Apply Dropout outhid = outhid_dropout(outhid, use_noise=False) if state['shortcut_inpout']: additional_inputs = [rec_layer, shortcut(x, use_noise=False)] else: additional_inputs = [rec_layer] valid_model = output_layer(outhid, additional_inputs=additional_inputs, use_noise=False).validate(target=y, sum_over_time=True) valid_updates = [] if state['carry_h0']: valid_updates = [(h0val, nw_h0)] valid_fn = theano.function([x, y, reset], valid_model.out, name='valid_fn', updates=valid_updates) #### Sampling ##### single-step sampling def sample_fn(word_tm1, h_tm1): x_emb = emb_words(word_tm1, use_noise=False, one_step=True) h0 = rec(x_emb, state_before=h_tm1, one_step=True, use_noise=False)[-1] outhid = outhid_dropout(outhid_activ( emb_state(h0, use_noise=False, one_step=True) + emb_words_out(word_tm1, use_noise=False, one_step=True), one_step=True), use_noise=False, one_step=True) word = output_layer.get_sample(state_below=outhid, additional_inputs=[h0], temp=1.) 
return word, h0 ##### scan for iterating the single-step sampling multiple times [samples, summaries], updates = scan(sample_fn, states=[ TT.alloc(numpy.int64(0), state['sample_steps']), TT.alloc(numpy.float32(0), 1, eval(state['nhids'])[-1]) ], n_steps=state['sample_steps'], name='sampler_scan') ##### build a Theano function for sampling sample_fn = theano.function([], [samples], updates=updates, profile=False, name='sample_fn') ##### Load a dictionary dictionary = numpy.load(state['dictionary']) if state['chunks'] == 'chars': dictionary = dictionary['unique_chars'] else: dictionary = dictionary['unique_words'] def hook_fn(): sample = sample_fn()[0] print 'Sample:', if state['chunks'] == 'chars': print "".join(dictionary[sample]) else: for si in sample: print dictionary[si], print ### Build and Train a Model #### Define a model model = LM_Model(cost_layer=train_model, weight_noise_amount=state['weight_noise_amount'], valid_fn=valid_fn, clean_before_noise_fn=False, noise_fn=None, rng=rng) if state['reload']: model.load(state['prefix'] + 'model.npz') #### Define a trainer ##### Training algorithm (SGD) if state['moment'] < 0: algo = SGD(model, state, train_data) else: algo = SGD_m(model, state, train_data) ##### Main loop of the trainer main = MainLoop(train_data, valid_data, test_data, model, algo, state, channel, train_cost=False, hooks=hook_fn, validate_postprocess=eval(state['validate_postprocess'])) ## Run! main.main()
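# Plain-Python sketch of the learning-rate schedule that update_lr above implements
# (the "1/(1+n/beta)" decay mentioned in the comment): once the step counter passes
# lr_start, the rate decays hyperbolically. The hyper-parameter values below are made
# up; the real ones live in state['lr'], state['lr_start'] and state['lr_beta'].
def scheduled_lr(step, lr=0.1, lr_start=1000, lr_beta=500.):
    if step <= lr_start:
        return lr
    return lr / (1. + (step - lr_start) / lr_beta)

for step in (0, 1000, 1500, 2000, 6000):
    print step, scheduled_lr(step)   # 0.1, 0.1, 0.05, 0.0333..., 0.00909...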
def init_function(self): self.seq_idx = T.lvector() self.tar_scalar = T.lscalar() self.solution = T.matrix() self.seq_matrix = T.take(self.Vw, self.seq_idx, axis=0) self.tar_vector = T.take(self.Va, self.tar_scalar, axis=0) h, c = T.zeros_like(self.bf, dtype=theano.config.floatX), T.zeros_like( self.bc, dtype=theano.config.floatX) def encode(x_t, h_fore, c_fore, tar_vec): v = T.concatenate([h_fore, x_t, tar_vec]) f_t = T.nnet.sigmoid(T.dot(self.Wf, v) + self.bf) i_t = T.nnet.sigmoid(T.dot(self.Wi, v) + self.bi) o_t = T.nnet.sigmoid(T.dot(self.Wo, v) + self.bo) c_next = f_t * c_fore + i_t * T.tanh(T.dot(self.Wc, v) + self.bc) h_next = o_t * T.tanh(c_next) return h_next, c_next scan_result, _ = theano.scan(fn=encode, sequences=[self.seq_matrix], outputs_info=[h, c], non_sequences=[self.tar_vector]) embedding = scan_result[ 0] # embedding here is a matrix of the hidden states [h_1, ..., h_n] # attention matrix_aspect = T.zeros_like( embedding, dtype=theano.config.floatX)[:, :self.dim_aspect] + self.tar_vector hhhh = T.concatenate( [T.dot(embedding, self.Wh), T.dot(matrix_aspect, self.Wv)], axis=1) M_tmp = T.tanh(hhhh) alpha_tmp = T.nnet.softmax(T.dot(M_tmp, self.w)) r = T.dot(alpha_tmp, embedding) h_star = T.tanh(T.dot(r, self.Wp) + T.dot(embedding[-1], self.Wx)) embedding = h_star # embedding here is a vector representing h_n_star # dropout embedding_for_train = embedding * self.srng.binomial( embedding.shape, p=0.5, n=1, dtype=embedding.dtype) embedding_for_test = embedding * 0.5 self.pred_for_train = T.nnet.softmax( T.dot(embedding_for_train, self.Ws) + self.bs) self.pred_for_test = T.nnet.softmax( T.dot(embedding_for_test, self.Ws) + self.bs) self.l2 = sum([T.sum(param**2) for param in self.params]) - T.sum(self.Vw**2) self.loss_sen = -T.tensordot( self.solution, T.log(self.pred_for_train), axes=2) self.loss_l2 = 0.7 * self.l2 * self.regular self.loss = self.loss_sen + self.loss_l2 grads = T.grad(self.loss, self.params) self.updates = collections.OrderedDict() self.grad = {} for param, grad in zip(self.params, grads): g = theano.shared(np.asarray(np.zeros_like(param.get_value()), \ dtype=theano.config.floatX)) self.grad[param] = g self.updates[g] = g + grad self.func_train = theano.function( inputs=[ self.seq_idx, self.tar_scalar, self.solution, theano.In(h, value=self.h0), theano.In(c, value=self.c0) ], outputs=[self.loss, self.loss_sen, self.loss_l2], updates=self.updates, on_unused_input='warn') self.func_test = theano.function(inputs=[ self.seq_idx, self.tar_scalar, theano.In(h, value=self.h0), theano.In(c, value=self.c0) ], outputs=self.pred_for_test, on_unused_input='warn')
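# Hedged NumPy sketch of the attention block above, with toy dimensions: the hidden
# states [h_1, ..., h_n] and the tiled aspect vector are projected and concatenated,
# squashed with tanh, scored against w and softmax-ed, and the weighted sum r is fused
# with the last hidden state into h_star. The weight shapes here are illustrative
# guesses, not the shapes stored on the class.
import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

n, dim_h, dim_a = 7, 32, 16
H = np.random.randn(n, dim_h)                         # [h_1, ..., h_n]
aspect = np.random.randn(dim_a)
Wh, Wv = np.random.randn(dim_h, dim_h), np.random.randn(dim_a, dim_a)
w = np.random.randn(dim_h + dim_a)
Wp, Wx = np.random.randn(dim_h, dim_h), np.random.randn(dim_h, dim_h)

M = np.tanh(np.concatenate([H.dot(Wh), np.tile(aspect, (n, 1)).dot(Wv)], axis=1))
alpha = softmax(M.dot(w))                             # one weight per position
r = alpha.dot(H)                                      # attention-weighted sum of states
h_star = np.tanh(r.dot(Wp) + H[-1].dot(Wx))           # final sentence representation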
feedback_dim=alphabet_size, name="feedback"), name="readout") seq_gen = SequenceGenerator(readout=readout, transition=rnn, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), name="generator") seq_gen.push_initialization_config() rnn.weights_init = Orthogonal() seq_gen.initialize() # z markov_tutorial x = tensor.lvector('features') x = x.reshape((x.shape[0], 1)) cost = aggregation.mean(seq_gen.cost_matrix(x[:, :]).sum(), x.shape[1]) cost.name = "sequence_log_likelihood" cost_cg = ComputationGraph(cost) # theano.printing.pydotprint(cost, outfile="./pics/symbolic_graph_unopt.png", var_with_name_simple=True) algorithm = GradientDescent(cost=cost, parameters=list( Selector(seq_gen).get_parameters().values()), step_rule=Scale(0.001)) # AUDIOSCOPE OBSERVABLES (some) observables = [] observables += cost_cg.outputs
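# Rough sketch of the quantity monitored as "sequence_log_likelihood" above, under the
# assumption that seq_gen.cost_matrix returns one per-timestep cost for every sequence
# in the batch: the matrix is summed and divided by the batch size (x.shape[1]), giving
# the mean total cost per sequence. Toy numbers only.
import numpy as np

cost_matrix = np.array([[0.7, 1.2, 0.3],              # time x batch, per-step costs
                        [0.9, 0.4, 1.1]])
batch_size = cost_matrix.shape[1]
print cost_matrix.sum() / batch_size                  # mean summed cost per sequence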
def _build_functions(self): """ Create Theano functions that underlie higher level functionality. None of the created functions should be used directly by the user. """ if self.cost == log_likelihood: target = T.lvector() else: target = tensor(1 + len(self.layers[-1].layer_shape)) if self.cost == log_likelihood: cost = -T.sum( T.log(self.symb_output)[T.arange(target.shape[0]), target]) elif self.cost == mse: cost = T.sum((self.symb_output - target)**2) else: raise ValueError("unsupported cost function") # Feedforward an input. self._feedforward_fs = [] for y in self.layer_ys: self._feedforward_fs += [function([self.symb_input], y)] self._feedforward = function([self.symb_input], self.symb_output) #Introspection! self.layer_infos = [] to_compute = [] def add_compute(p): to_compute.append(p) n = add_compute.n add_compute.n += 1 return n add_compute.n = 0 for layer, y, n in zip(self.layers, self.layer_ys, range(100)): param_name_count = 0 def get_param_name(param): if param == layer.W: return "W" if param == layer.b: return "b" name = "param_" + str(param_name_count) return name info = { "name": str(n) + "_" + layer.__class__.__name__, "compute_names": ["y", "y_grad"], "compute_ns": [add_compute(y), add_compute(T.grad(cost, y))] } if layer.activation and layer.activation == sigmoid: info["activation"] = "sigmoid" elif layer.activation and layer.activation == ReLU: info["activation"] = "ReLU" elif layer.activation and layer.activation == linear: info["activation"] = "linear" else: info["activation"] = None for p in layer.params or []: p_name = get_param_name(p) info["compute_names"].append(p_name) info["compute_ns"].append(add_compute(p)) info["compute_names"].append(p_name + "_grad") info["compute_ns"].append(add_compute(T.grad(cost, p))) self.layer_infos.append(info) self._complete_introspect = function([self.symb_input, target], to_compute) #Test performance on some input vs target answer self._test_cost = function([self.symb_input, target], cost) aug_params = [x for l in self.layers for x in l.aug_params] # Scale parameter momentum scale_constant = T.scalar() self._scale_param_momentum = function([scale_constant], [], updates=[ (x.momentum, scale_constant * x.momentum) for x in aug_params ]) # Weight decay decay_constant = T.scalar() self._scale_weights = function([decay_constant], [], updates=[(x.var, decay_constant * x.var) for x in aug_params]) # Update momentum based on cost gradient for given learning rate, input and target. learning_rate = T.scalar() self._momentum_deriv_add = function( [self.symb_input, target, learning_rate], [], updates=[(x.momentum, x.momentum - learning_rate * T.grad(cost, x.var)) for x in aug_params]) learning_rates = [ T.scalar() if l.params else None for l in self.layers ] used_learning_rates = filter(lambda x: x is not None, learning_rates) self._momentum_deriv_add_perlayer = function( [self.symb_input, target] + used_learning_rates, [], updates=[(x.momentum, x.momentum - learning_rate_ * T.grad(cost, x.var)) for learning_rate_, l in zip(learning_rates, self.layers) for x in l.aug_params]) # NORMALIZED SGD: Update momentum based on cost gradient for given learning rate, input and target. 
self._momentum_deriv_add_normalized = function( [self.symb_input, target, learning_rate], [], updates=[(x.momentum, x.momentum - learning_rate * T.sqrt(float(x.size)) * norm_L2(T.grad(cost, x.var))) for x in aug_params]) #print [product(x.shape) for x in aug_params] self._momentum_deriv_add_perlayer_normalized = function( [self.symb_input, target] + used_learning_rates, [], updates=[(x.momentum, x.momentum - learning_rate * T.sqrt(float(x.size)) * norm_L2(T.grad(cost, x.var))) for learning_rate, l in zip(learning_rates, self.layers) for x in l.aug_params]) # Update parameters based on parameter momentum self._learn = function([], [], updates=[(x.var, x.var + x.momentum) for x in aug_params]) # Nesterov method self._nesterov_reset_base = function([], [], updates=[(x.base, x.var) for x in aug_params]) self._nesterov_set_params = function([], [], updates=[(x.var, x.base + x.momentum) for x in aug_params]) self._nesterov_learn = function([], [], updates=[(x.base, x.base + x.momentum) for x in aug_params]) # accuracy! guesses = T.argmax(self.symb_output, axis=1) if self.cost == log_likelihood: ans = target self._av_correct_confidence = function( [self.symb_input, target], T.mean(self.symb_output[T.arange(target.shape[0]), target])) else: ans = T.argmax(target, axis=1) self._corrects = theano.function([self.symb_input, target], T.sum(T.eq(guesses, ans))) self._inspect_grad = function( [self.symb_input, target], [T.grad(cost, x.var) for x in aug_params])
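# Standalone NumPy sketch of the Nesterov bookkeeping the functions above compile
# (base / var / momentum per parameter): the gradient is taken at the look-ahead point
# var = base + momentum, the momentum absorbs the scaled gradient, and base then moves
# by the momentum. The loop order, decay value and toy quadratic objective are
# assumptions for illustration; the class only compiles the individual pieces.
import numpy as np

grad = lambda w: 2.0 * w                 # gradient of f(w) = ||w||^2
base = np.array([5.0, -3.0])             # plays the role of x.base
momentum = np.zeros_like(base)           # plays the role of x.momentum
lr, decay = 0.1, 0.9

for _ in range(100):
    momentum *= decay                    # _scale_param_momentum(decay)
    var = base + momentum                # _nesterov_set_params: look-ahead point
    momentum -= lr * grad(var)           # _momentum_deriv_add at the look-ahead point
    base += momentum                     # _nesterov_learn

print base                               # close to the minimum at [0, 0]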
def __init__( self, input_dims, input_num_chars, eos_label, num_phonemes, dim_dec, dims_bidir, enc_transition, dec_transition, use_states_for_readout, attention_type, criterion, bottom, lm=None, character_map=None, bidir=True, subsample=None, dims_top=None, prior=None, conv_n=None, post_merge_activation=None, post_merge_dims=None, dim_matcher=None, embed_outputs=True, dim_output_embedding=None, dec_stack=1, conv_num_filters=1, data_prepend_eos=True, # softmax is the default set in SequenceContentAndConvAttention energy_normalizer=None, # for speech this is the approximate phoneme duration in frames max_decoded_length_scale=1, **kwargs): if post_merge_activation is None: post_merge_activation = Tanh() super(SpeechRecognizer, self).__init__(**kwargs) self.eos_label = eos_label self.data_prepend_eos = data_prepend_eos self.rec_weights_init = None self.initial_states_init = None self.enc_transition = enc_transition self.dec_transition = dec_transition self.dec_stack = dec_stack self.criterion = criterion self.max_decoded_length_scale = max_decoded_length_scale post_merge_activation = post_merge_activation if dim_matcher is None: dim_matcher = dim_dec # The bottom part, before BiRNN bottom_class = bottom.pop('bottom_class') bottom = bottom_class(input_dims=input_dims, input_num_chars=input_num_chars, name='bottom', **bottom) # BiRNN if not subsample: subsample = [1] * len(dims_bidir) encoder = Encoder(self.enc_transition, dims_bidir, bottom.get_dim(bottom.apply.outputs[0]), subsample, bidir=bidir) dim_encoded = encoder.get_dim(encoder.apply.outputs[0]) # The top part, on top of BiRNN but before the attention if dims_top: top = MLP([Tanh()], [dim_encoded] + dims_top + [dim_encoded], name="top") else: top = Identity(name='top') if dec_stack == 1: transition = self.dec_transition(dim=dim_dec, activation=Tanh(), name="transition") else: transitions = [ self.dec_transition(dim=dim_dec, activation=Tanh(), name="transition_{}".format(trans_level)) for trans_level in xrange(dec_stack) ] transition = RecurrentStack(transitions=transitions, skip_connections=True) # Choose attention mechanism according to the configuration if attention_type == "content": attention = SequenceContentAttention( state_names=transition.apply.states, attended_dim=dim_encoded, match_dim=dim_matcher, name="cont_att") elif attention_type == "content_and_conv": attention = SequenceContentAndConvAttention( state_names=transition.apply.states, conv_n=conv_n, conv_num_filters=conv_num_filters, attended_dim=dim_encoded, match_dim=dim_matcher, prior=prior, energy_normalizer=energy_normalizer, name="conv_att") else: raise ValueError( "Unknown attention type {}".format(attention_type)) if embed_outputs: feedback = LookupFeedback( num_phonemes + 1, dim_dec if dim_output_embedding is None else dim_output_embedding) else: feedback = OneOfNFeedback(num_phonemes + 1) if criterion['name'] == 'log_likelihood': emitter = SoftmaxEmitter(initial_output=num_phonemes, name="emitter") if lm: # In case we use LM it is Readout that is responsible # for normalization. 
emitter = LMEmitter() elif criterion['name'].startswith('mse'): emitter = RewardRegressionEmitter(criterion['name'], eos_label, num_phonemes, criterion.get( 'min_reward', -1.0), name="emitter") else: raise ValueError("Unknown criterion {}".format(criterion['name'])) readout_config = dict(readout_dim=num_phonemes, source_names=(transition.apply.states if use_states_for_readout else []) + [attention.take_glimpses.outputs[0]], emitter=emitter, feedback_brick=feedback, name="readout") if post_merge_dims: readout_config['merged_dim'] = post_merge_dims[0] readout_config['post_merge'] = InitializableSequence( [ Bias(post_merge_dims[0]).apply, post_merge_activation.apply, MLP( [post_merge_activation] * (len(post_merge_dims) - 1) + [Identity()], # MLP was designed to support Maxout as an activation # (because Maxout, strictly speaking, is not one). However, # a single-layer Maxout network works with the trick below. # For a deeper Maxout network one has to use the # Sequence brick. [ d // getattr(post_merge_activation, 'num_pieces', 1) for d in post_merge_dims ] + [num_phonemes]).apply, ], name='post_merge') readout = Readout(**readout_config) language_model = None if lm and lm.get('path'): lm_weight = lm.pop('weight', 0.0) normalize_am_weights = lm.pop('normalize_am_weights', True) normalize_lm_weights = lm.pop('normalize_lm_weights', False) normalize_tot_weights = lm.pop('normalize_tot_weights', False) am_beta = lm.pop('am_beta', 1.0) if normalize_am_weights + normalize_lm_weights + normalize_tot_weights < 1: logger.warn( "Beam search is prone to fail with no log-prob normalization" ) language_model = LanguageModel(nn_char_map=character_map, **lm) readout = ShallowFusionReadout( lm_costs_name='lm_add', lm_weight=lm_weight, normalize_am_weights=normalize_am_weights, normalize_lm_weights=normalize_lm_weights, normalize_tot_weights=normalize_tot_weights, am_beta=am_beta, **readout_config) generator = SequenceGenerator(readout=readout, transition=transition, attention=attention, language_model=language_model, name="generator") # Remember child bricks self.encoder = encoder self.bottom = bottom self.top = top self.generator = generator self.children = [encoder, top, bottom, generator] # Create input variables self.inputs = self.bottom.batch_inputs self.inputs_mask = self.bottom.mask self.labels = tensor.lmatrix('labels') self.labels_mask = tensor.matrix("labels_mask") self.single_inputs = self.bottom.single_inputs self.single_labels = tensor.lvector('labels') self.n_steps = tensor.lscalar('n_steps')
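# Hedged NumPy sketch of the dimension trick in the comment above: a Maxout activation
# with num_pieces pieces maps a vector of size num_pieces * d to size d by taking the
# max within each group, which is why the MLP dimensions are divided by num_pieces.
# Sizes and the grouping order are illustrative only.
import numpy as np

num_pieces, d = 3, 4
linear_out = np.random.randn(2, d * num_pieces)       # toy batch of linear outputs
maxout = linear_out.reshape(2, d, num_pieces).max(axis=-1)
print maxout.shape                                    # (2, 4)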