def get_output_for(self, input, **kwargs):
    '''
    Computes 2D FFT. Input layer must have dimension [n, 2, nx, ny]
    '''
    if self.is_3d:
        n, nc, nx, ny, nt = self.data_shape
        lin = T.transpose(input, axes=(0, 4, 1, 2, 3))
        lin = lin.reshape((-1, nc, nx, ny))
        lout, updates = theano.scan(self.transform, sequences=lin)
        lout = lout.reshape((-1, nt, nc, nx, ny))
        out = T.transpose(lout, axes=(0, 2, 3, 4, 1))
        return out

        # def loop_over_n(i, arr):
        #     out, updates = theano.scan(self.transform,
        #                                sequences=arr[:, :, i])[0]
        #     return out

        # nt = self.data_shape[-1]
        # out, updates = theano.scan(loop_over_n,
        #                            non_sequences=input,
        #                            sequences=xrange(nt))
        # return out

    out, updates = theano.scan(self.transform, sequences=input)
    return out
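# --- Illustrative aside (sketch, not from the original code) --------------
# The non-3D branch above is the plain scan-as-map pattern: theano.scan
# applies `self.transform` to every slice along the leading axis. A minimal,
# self-contained sketch of that pattern (all names below are made up):
import numpy as np
import theano
import theano.tensor as T

X = T.tensor3('X')  # (n, nx, ny); scan iterates over the leading axis
out, _ = theano.scan(lambda xi: xi.T, sequences=X)  # per-slice transform
f = theano.function([X], out)
x = np.arange(24, dtype=theano.config.floatX).reshape(2, 3, 4)
print(f(x).shape)  # (2, 4, 3)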
def cost_seq(self, start, end, A, tagger_out, targets):
    # compute the gold sequence's score using A and tagger_out
    gold_seq = T.argmax(targets, axis=1)
    seq_score = start[gold_seq[0]]
    seq_score += end[gold_seq[-1]]

    # tagger_out scores
    tout_chooser = lambda gold_i, i, tagger_out: tagger_out[i][gold_i]
    tout_seq_scores, updates = theano.scan(
        fn=tout_chooser,
        sequences=[gold_seq, T.arange(gold_seq.shape[0])],
        non_sequences=[tagger_out],
        outputs_info=None)
    seq_score += tout_seq_scores.sum()

    # A matrix (transition) scores
    A_chooser = lambda i, next_i, A: A[i][next_i]
    A_seq_scores, updates = theano.scan(
        fn=A_chooser,
        sequences=[gold_seq[:-1], gold_seq[1:]],
        non_sequences=[A],
        outputs_info=None)
    seq_score += A_seq_scores.sum()

    return seq_score
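# --- Illustrative aside (sketch, not from the original code) --------------
# The A_chooser scan above walks consecutive tag pairs by feeding the same
# sequence twice with a one-step offset. A standalone sketch of that trick
# (tag values and matrix are made up):
import numpy as np
import theano
import theano.tensor as T

tags = T.ivector('tags')  # gold tag sequence
A = T.matrix('A')         # transition score matrix
# pairs (tags[t], tags[t+1]) come from offsetting the same sequence
scores, _ = theano.scan(lambda i, j, A: A[i, j],
                        sequences=[tags[:-1], tags[1:]],
                        non_sequences=[A])
f = theano.function([tags, A], scores.sum())
A_val = np.arange(9, dtype=theano.config.floatX).reshape(3, 3)
print(f(np.array([0, 1, 2], dtype=np.int32), A_val))  # A[0,1] + A[1,2] = 6.0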
def test_scan_err1(self):
    # This test should fail when building fx for the first time
    orig_compute_test_value = theano.config.compute_test_value
    try:
        theano.config.compute_test_value = 'raise'

        k = T.iscalar("k")
        A = T.matrix("A")
        k.tag.test_value = 3
        A.tag.test_value = numpy.random.rand(5, 3).astype(config.floatX)

        def fx(prior_result, A):
            return T.dot(prior_result, A)

        # Since we have to inspect the traceback,
        # we cannot simply use self.assertRaises()
        try:
            theano.scan(fn=fx,
                        outputs_info=T.ones_like(A),
                        non_sequences=A,
                        n_steps=k)
            assert False
        except ValueError, e:
            # Get traceback
            tb = sys.exc_info()[2]
            # Get frame info 4 layers up
            frame_info = traceback.extract_tb(tb)[-5]
            # We should be in the "fx" function defined above
            assert os.path.split(frame_info[0])[1] == 'test_compute_test_value.py'
            assert frame_info[2] == 'fx'
    finally:
        theano.config.compute_test_value = orig_compute_test_value
def output_h_vals(self, train=False):
    if self.inputs_dict.has_key('input_single'):
        input = self.get_input('input_single', train)  # (nb_sample, input_dim)
        X = TU.repeat(input, self.input_length)  # (input_length, nb_sample, input_dim)
        mask = None
    else:
        input = self.get_input('input_sequence', train)  # (nb_sample, input_length, input_dim)
        X = input.dimshuffle((1, 0, 2))  # (input_length, nb_sample, input_dim)
        mask = self.get_input_mask('input_sequence', train)  # (nb_sample, input_length)

    # Note: a symbolic tensor has no truth value, so test against None
    # rather than writing `if mask:`.
    if mask is not None:
        mask = T.cast(mask, dtype='int8').dimshuffle((1, 0, 'x'))  # (input_length, nb_sample, 1)

    # h_0 = T.zeros((X.shape[1], self.output_dim), X.dtype)  # (nb_samples, output_dim)
    h_0 = self._get_initial_state(X)

    if mask is not None:
        h_vals, _ = theano.scan(self.step,
                                sequences=[mask, X],
                                outputs_info=h_0,
                                non_sequences=[self.W, self.U, self.b],
                                truncate_gradient=self.truncate_gradient,
                                go_backwards=self.go_backwards,
                                strict=True)
    else:
        h_vals, _ = theano.scan(self.step_no_mask,
                                sequences=[X],
                                outputs_info=h_0,
                                non_sequences=[self.W, self.U, self.b],
                                truncate_gradient=self.truncate_gradient,
                                go_backwards=self.go_backwards,
                                strict=True)
    return h_vals  # (input_length, nb_samples, output_dim)
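# --- Illustrative aside (sketch, not from the original code) --------------
# A common masked-step body (the shape comments above suggest `self.step`
# does something like this) keeps the previous hidden state wherever the
# mask is 0. A minimal sketch, assuming a plain additive-tanh cell:
import theano
import theano.tensor as T

X = T.tensor3('X')  # (time, batch, dim)
M = T.tensor3('M')  # (time, batch, 1), 0/1 mask
h0 = T.zeros((X.shape[1], X.shape[2]))

def step(m_t, x_t, h_tm1):
    h_t = T.tanh(x_t + h_tm1)              # stand-in for the real cell
    return m_t * h_t + (1 - m_t) * h_tm1   # hold state where mask == 0

h, _ = theano.scan(step, sequences=[M, X], outputs_info=h0)
f = theano.function([X, M], h)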
def apply(self, src, mask_length, tgt):
    """
    viterbi algorithm
    """
    result, updates = theano.scan(fn=self.train_step,
                                  sequences=src,
                                  outputs_info=[self.A_start, None],
                                  non_sequences=self.A,
                                  n_steps=mask_length)
    # the score of the best path
    best_path_score = result[0][-1].max()
    idx = T.argmax(result[0][-1])
    # backtracking
    res2, _ = theano.scan(fn=lambda dps, idx, idx2: [dps[idx], idx],
                          sequences=result[1][::-1],
                          outputs_info=[idx, idx],
                          n_steps=mask_length)
    # the path of the best score
    best_path = res2[1]
    # if len(best_path) < seq_len:
    #     best_path.extend((seq_len - len(best_path)) * [2])

    # the score of the tgt path
    tgt_score = self.decode(src, mask_length, tgt)
    # max margin
    max_margin = T.sum(T.neq(tgt[:mask_length], best_path))
    cost = best_path_score + max_margin - tgt_score
    return T.switch(T.lt(cost, T.alloc(numpy.float32(0.))),
                    T.alloc(numpy.float32(0.)),
                    cost), best_path
def __init__(self, layers, num_possible_characters):
    print("Building the model...")
    self.rng = theano.tensor.shared_randomstreams.RandomStreams()
    self.model = StackedCells(num_possible_characters, layers=layers,
                              activation=T.tanh, celltype=LSTM)
    self.model.layers[0].in_gate2.activation = lambda x: x
    self.model.layers.append(Layer(layers[-1], num_possible_characters,
                                   lambda x: T.nnet.softmax(x)[0]))

    num_steps = T.scalar(dtype='int32')

    # function to put into scan to fire the network recurrently
    def step(prev_char, *prev_hiddens):
        new_hiddens = self.model.forward(
            int_to_onehot(T.cast(prev_char, 'int32'), num_possible_characters),
            prev_hiddens)
        dist = new_hiddens[-1]
        next_char = self.rng.choice(size=[1], a=num_possible_characters, p=dist)
        return [T.cast(next_char, 'int32')] + new_hiddens[:-1]

    results, updates = theano.scan(
        step,
        n_steps=num_steps,
        outputs_info=[dict(initial=np.int32([-1]), taps=[-1])] +
                     [dict(initial=layer.initial_hidden_state, taps=[-1])
                      for layer in self.model.layers
                      if hasattr(layer, 'initial_hidden_state')])
    self.forward_pass = theano.function([num_steps],
                                        [results[0].dimshuffle((1, 0))[0]],
                                        updates=updates,
                                        allow_input_downcast=True)

    training_data = T.matrix('training data')  # list of character values less than num_possible_characters

    def step_inner(prev_char, desired_output, *prev_hiddens):
        new_hiddens = self.model.forward(
            int_to_onehot(prev_char, num_possible_characters), prev_hiddens)
        prob_correct = new_hiddens[-1][desired_output]
        return [prob_correct] + new_hiddens[:-1]

    # I have no idea whether nesting scan will work at all
    def step_outer(training_sample, *initial_states):
        print(list(initial_states))
        # different call to scan that uses the training data as prior timesteps
        results_inner, updates_inner = theano.scan(
            step_inner,
            n_steps=training_sample.shape[0],
            sequences=[dict(input=T.cast(T.concatenate(([0], training_sample)), 'int32'),
                            taps=[0, 1])],
            outputs_info=[None] + list(initial_states))
        return results_inner, updates_inner

    results_outer, updates_outer = theano.scan(
        step_outer,
        n_steps=training_data.shape[0],
        sequences=[training_data],
        non_sequences=[layer.initial_hidden_state
                       for layer in self.model.layers
                       if hasattr(layer, 'initial_hidden_state')],
        outputs_info=[None, None])
    results_inner = results_outer[0]  # this should be a list of each "results" from step_inner
    updates_inner = results_outer[1]  # this should be a list of updates

    # I want to find the zero position of each results vector in results_inner
    prob_correct_v = results_inner[:][0]  # should be a matrix of probabilities between 0 and 1
    cost = -T.mean(T.log(prob_correct_v))  # mean should take the average across all dimensions

    u, gsums, xsums, lr, max_norm = create_optimization_updates(
        cost, self.model.params, method='adadelta')

    # combine all the updates into one dictionary
    all_updates = {}
    for d in updates_inner:
        all_updates.update(d)
    all_updates.update(updates_outer)
    # merge in the optimizer updates for the training function only
    # (dict + dict is not valid Python, so merge explicitly)
    train_updates = dict(all_updates)
    train_updates.update(u)

    self.training_pass = theano.function([training_data], [cost],
                                         updates=train_updates,
                                         allow_input_downcast=True)
    self.validation_pass = theano.function([training_data], [cost],
                                           updates=all_updates,
                                           allow_input_downcast=True)
def get_output(self, train=False):
    self._train_state = train
    X, eps = self.get_input(train).values()
    eps = eps.dimshuffle(1, 0, 2)
    canvas, init_enc, init_dec = self._get_initial_states(X)

    if self.inner_rnn == 'gru':
        outputs, updates = scan(self._step,
                                sequences=eps,
                                outputs_info=[canvas, init_enc, init_dec, None],
                                non_sequences=[X, ] + self.params,
                                # n_steps=self.n_steps,
                                truncate_gradient=self.truncate_gradient)
    elif self.inner_rnn == 'lstm':
        outputs, updates = scan(self._step_lstm,
                                sequences=eps,
                                outputs_info=[0*canvas, 0*init_enc, 0*init_enc,
                                              0*init_dec, 0*init_dec, None],
                                non_sequences=[X, ] + self.params,
                                truncate_gradient=self.truncate_gradient)

    kl = outputs[-1].sum(axis=0).mean()
    if train:
        # self.updates = updates
        self.regularizers = [SimpleCost(kl), ]
    if self.return_sequences:
        return [outputs[0].dimshuffle(1, 0, 2, 3, 4), kl]
    else:
        return [outputs[0][-1], kl]
def build_rnnrbm(self, n_visible, n_hidden, n_hidden_recurrent):
    u0 = T.zeros((self.n_hidden_recurrent,))  # initial value for the RNN hidden state

    def recurrence(v_t, u_tm1):
        bv_t = self.bv + T.dot(u_tm1, self.Wuv)
        bh_t = self.bh + T.dot(u_tm1, self.Wuh)
        generate = v_t is None
        if generate:
            v_t, _, _, updates = self.build_rbm(T.zeros((n_visible,)), self.W,
                                                bv_t, bh_t, k=25)
        u_t = T.tanh(self.bu + T.dot(v_t, self.Wvu) + T.dot(u_tm1, self.Wuu))
        return ([v_t, u_t], updates) if generate else [u_t, bv_t, bh_t]

    (u_t, bv_t, bh_t), updates_train = theano.scan(
        lambda v_t, u_tm1, *_: recurrence(v_t, u_tm1),
        sequences=self.v,
        outputs_info=[u0, None, None],
        non_sequences=self.params)
    v_sample, cost, monitor, updates_rbm = self.build_rbm(self.v, self.W,
                                                          bv_t[:], bh_t[:],
                                                          k=15)
    updates_bh_t = updates_train.copy()
    updates_train.update(updates_rbm)

    # symbolic loop for sequence generation
    (v_t, u_t), updates_generate = theano.scan(
        lambda u_tm1, *_: recurrence(None, u_tm1),
        outputs_info=[None, u0],
        non_sequences=self.params,
        n_steps=1)

    return (self.v, v_sample, cost, monitor, self.params, updates_train,
            v_t, updates_generate, bh_t, updates_bh_t)
def nin(X, param):
    w1, w2, w3, b1, b2, b3 = param
    X = X.dimshuffle(0, 1, 'x', 2, 3)  # (n,32,1,r,c)
    w1 = w1.dimshuffle(0, 1, 2, 'x', 3, 4)  # (64,32,16,1,3,3)
    w2 = w2.dimshuffle(0, 1, 'x', 2, 'x', 'x')  # (64,32,1,16,1,1)
    w3 = w3.dimshuffle(0, 1, 2, 'x', 'x')  # (64,2,32,1,1)
    b1 = b1.dimshuffle(0, 1, 'x', 2, 'x', 'x')  # (64,32,1,16,1,1)
    b2 = b2.dimshuffle(0, 1, 'x', 2, 'x', 'x')  # (64,32,1,1,1,1)
    b3 = b3.dimshuffle(0, 'x', 1, 'x', 'x')  # (64,1,2,1,1)
    indexi = T.arange(w1.shape[0], dtype='int32')  # (0:64)
    indexi = T.repeat(indexi, w1.shape[1], axis=0)
    indexj = T.arange(w1.shape[1], dtype='int32')  # (0:32)
    indexj = T.tile(indexj, w1.shape[0])
    results, updates = scan(fn=metaOp1,
                            sequences=[indexi, indexj],
                            outputs_info=None,
                            non_sequences=[X, w1, w2, b1, b2],
                            strict=True)  # (64*32,n,1,r,c)
    metaShape1 = results.shape[-4], results.shape[-2], results.shape[-1]
    reshaped1 = results.reshape((w1.shape[0], w1.shape[1]) + metaShape1)  # (64,32,n,r,c)
    permuted1 = T.transpose(reshaped1, axes=(0, 2, 1, 3, 4))  # (64,n,32,r,c)
    indexi = T.arange(w1.shape[0], dtype='int32')  # (0:64)
    results, updates = scan(fn=metaOp2,
                            sequences=[indexi],
                            outputs_info=None,
                            non_sequences=[permuted1, w3, b3],
                            strict=True)  # (64,n,2,r,c)
    permuted2 = T.transpose(results, axes=(1, 0, 2, 3, 4))  # (n,64,2,r,c)
    metaShape2 = permuted2.shape[-2], permuted2.shape[-1]
    reshaped2 = permuted2.reshape((permuted2.shape[0], -1) + metaShape2)  # (n,128,r,c)
    return reshaped2
def __theano_build__(self):
    params = self.params
    param_names = self.param_names
    hidden_dim = self.hidden_dim

    x1 = T.imatrix('x1')            # first sentence
    x2 = T.imatrix('x2')            # second sentence
    x1_mask = T.fmatrix('x1_mask')  # mask
    x2_mask = T.fmatrix('x2_mask')
    y = T.ivector('y')              # label
    y_c = T.ivector('y_c')          # class weights

    # Embedding words
    _E1 = params["E"].dot(params["W"][0]) + params["B"][0]
    _E2 = params["E"].dot(params["W"][1]) + params["B"][1]
    statex1 = _E1[x1.flatten(), :].reshape([x1.shape[0], x1.shape[1], hidden_dim])
    statex2 = _E2[x2.flatten(), :].reshape([x2.shape[0], x2.shape[1], hidden_dim])

    def rnn_cell(x, mx, ph, Wh):
        h = T.tanh(ph.dot(Wh) + x)
        h = mx[:, None] * h + (1 - mx[:, None]) * ph
        return [h]

    # scan returns the single output directly (not a list), so do not
    # unpack it as `[h1], updates = ...`
    h1, updates = theano.scan(
        fn=rnn_cell,
        sequences=[statex1, x1_mask],
        truncate_gradient=self.truncate,
        outputs_info=[dict(initial=T.zeros([self.batch_size, self.hidden_dim]))],
        non_sequences=params["W"][2])

    h2, updates = theano.scan(
        fn=rnn_cell,
        sequences=[statex2, x2_mask],
        truncate_gradient=self.truncate,
        outputs_info=[dict(initial=h1[-1])],
        non_sequences=params["W"][3])

    # predict
    _s = T.nnet.softmax(h1[-1].dot(params["lrW"][0]) +
                        h2[-1].dot(params["lrW"][1]) + params["lrb"])
    _p = T.argmax(_s, axis=1)
    _c = T.nnet.categorical_crossentropy(_s, y)
    _c = T.sum(_c * y_c)
    _l = T.sum(params["lrW"] ** 2)
    _cost = _c + 0.01 * _l

    # SGD parameters
    learning_rate = T.scalar('learning_rate')
    decay = T.scalar('decay')

    # Gradients and updates
    _grads, _updates = rms_prop(_cost, param_names, params, learning_rate, decay)

    # Assign functions
    self.bptt = theano.function([x1, x2, x1_mask, x2_mask, y, y_c], _grads)
    self.loss = theano.function([x1, x2, x1_mask, x2_mask, y, y_c], _c)
    self.weights = theano.function([x1, x2, x1_mask, x2_mask], _s)
    self.predictions = theano.function([x1, x2, x1_mask, x2_mask], _p)
    self.sgd_step = theano.function(
        [x1, x2, x1_mask, x2_mask, y, y_c, learning_rate, decay],
        updates=_updates)
def loss_fn_per_context(word_position, context):
    # sum up the global vectors of the context
    context_vector = T.sum(W_g[context], axis=0)
    # start with -1 with none of the words disambiguated
    start = -1 * T.ones_like(context)
    output_alg, updates = theano.scan(l2C,
                                      sequences=[context, T.arange(4)],
                                      outputs_info=[start, context_vector])
    disambiguated_senses = output_alg[0][-1]
    augmented_context_vector = output_alg[1][-1]
    sense_of_actual_word = disambiguated_senses[word_position]
    # return T.argsort(T.dot(context_vector, W_s[actual_word].T)), T.dot(context_vector, W_s[actual_word].T)
    actual_word = context[word_position]

    # Compute loss to update the global word vectors, ignoring the word itself
    def score(i):
        return T.switch(T.eq(i, actual_word), 0,
                        T.log(T.nnet.sigmoid(T.dot(W_g[actual_word], W_g[i]))))

    scores, ignore_updates = theano.scan(score, sequences=[context])

    def calc_score(context_word, sense_of_context_word):
        return T.switch(T.eq(context_word, actual_word), 0,
                        T.log(T.nnet.sigmoid(
                            T.dot(W_s[actual_word][sense_of_actual_word],
                                  W_s[context_word][sense_of_context_word]))))

    sense_scores, ignore_updates_ = theano.scan(
        calc_score, sequences=[context, disambiguated_senses])

    loss_this_example = T.sum(scores, axis=0) + T.sum(sense_scores, axis=0)
    return loss_this_example
def __init__(self, ne, de, cs, nh, nc, L2_reg=0.0, rng=np.random.RandomState()):
    self.nc = nc
    self.hiddenLayer = Layer(de * cs, nh, rng=rng)
    self.outputLayer = Layer(nh, nc)
    self.emb = theano.shared(rng.normal(loc=0.0, scale=0.01,
                                        size=(ne, de)).astype(theano.config.floatX))
    A = rng.normal(loc=0.0, scale=0.01, size=(nc, nc)).astype(theano.config.floatX)
    self.A = theano.shared(value=A, name='A', borrow=True)
    self.params = self.hiddenLayer.params + self.outputLayer.params + [self.emb, self.A]
    self.names = ['Wh', 'bh', 'w', 'b', 'emb', 'A']

    idxs = T.imatrix('idxs')
    x = self.emb[idxs].reshape((idxs.shape[0], de * cs))
    y = T.bvector('y')
    ans = T.bvector('ans')

    INF = 1e9
    result, updates1 = theano.scan(fn=self.one_step,
                                   sequences=x,
                                   outputs_info=[theano.shared(0.0),
                                                 theano.shared(-INF),
                                                 theano.shared(-INF),
                                                 theano.shared(-INF),
                                                 None, None, None, None])
    self.decode = theano.function(inputs=[idxs], outputs=result, updates=updates1)

    score, updates2 = theano.scan(fn=self.two_step,
                                  sequences=[x,
                                             dict(input=y, taps=[-1, 0]),
                                             dict(input=ans, taps=[-1, 0])],
                                  outputs_info=theano.shared(0.0))
    cost = score[-1]
    gradients = T.grad(cost, self.params)
    lr = T.scalar('lr')
    for p, g in zip(self.params, gradients):
        updates2[p] = p + lr * g

    self.fit = theano.function(inputs=[idxs, y, ans, lr],
                               outputs=cost,
                               updates=updates2)
    self.normalize = theano.function(
        inputs=[],
        updates={self.emb:
                 self.emb / T.sqrt((self.emb ** 2).sum(axis=1)).dimshuffle(0, 'x')})
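# --- Illustrative aside (sketch, not from the original code) --------------
# The dict(input=y, taps=[-1, 0]) entries above hand the step function both
# the previous and the current element of a sequence. A standalone sketch of
# sequence taps (names are made up):
import numpy as np
import theano
import theano.tensor as T

y = T.vector('y')
# With taps=[-1, 0] the step sees y[t-1] and y[t]; scan starts at t=1,
# so the result has length len(y) - 1.
diffs, _ = theano.scan(lambda y_prev, y_cur: y_cur - y_prev,
                       sequences=[dict(input=y, taps=[-1, 0])])
f = theano.function([y], diffs)
print(f(np.array([1., 4., 9.], dtype=theano.config.floatX)))  # [3. 5.]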
def function(self, input_tensor):
    init_hs = T.zeros((input_tensor.shape[1], self.output_neurons))
    init_cs = T.zeros((input_tensor.shape[1], self.output_neurons))

    lstm_out_1, _ = theano.scan(
        fn=lambda a, b, c: self.__lstm_wrapper(a, b, c, self.d_forward, go_forwards=True),
        outputs_info=[init_hs, init_cs],
        sequences=input_tensor,
        non_sequences=None)
    lstm_out_2, _ = theano.scan(
        fn=lambda a, b, c: self.__lstm_wrapper(a, b, c, self.d_backward, go_forwards=False),
        outputs_info=[init_hs, init_cs],
        sequences=input_tensor,
        non_sequences=None)
    lstm_out_3, _ = theano.scan(
        fn=lambda a, b, c: self.__lstm_wrapper(a, b, c, self.u_forward, go_forwards=True),
        outputs_info=[init_hs, init_cs],
        sequences=input_tensor,
        non_sequences=None,
        go_backwards=True)
    lstm_out_4, _ = theano.scan(
        fn=lambda a, b, c: self.__lstm_wrapper(a, b, c, self.u_backward, go_forwards=False),
        outputs_info=[init_hs, init_cs],
        sequences=input_tensor,
        non_sequences=None,
        go_backwards=True)

    return T.concatenate((lstm_out_1[0], lstm_out_2[0],
                          lstm_out_3[0][::-1], lstm_out_4[0][::-1]), axis=2)
def get_square_norm_gradients_scan(D_by_layer, cost, accum=0):
    # This returns a theano variable that will be of shape (minibatch_size, ).
    # It will contain, for each training example, the associated square-norm
    # of the total gradient.
    # If you take the element-wise square-root afterwards, you will get
    # the associated 2-norms, which is what you want for importance sampling.

    for (layer_name, D) in D_by_layer.items():

        backprop_output = tensor.grad(cost, D['output'])

        if 'weight' in D:
            A = D['input']
            B = backprop_output
            S, _ = theano.scan(fn=lambda A, B: tensor.sqr(tensor.outer(A, B)).sum(),
                               sequences=[A, B])
            accum = accum + S

        if 'bias' in D:
            B = backprop_output
            S, _ = theano.scan(fn=lambda B: tensor.sqr(B).sum(),
                               sequences=[B])
            accum = accum + S

    return accum
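# --- Illustrative aside (sketch, not from the original code) --------------
# The per-example trick above uses the fact that, for a dense layer, the
# example-wise weight gradient is the outer product of the layer input with
# the backpropagated output gradient. A tiny numeric sketch of that scan:
import numpy as np
import theano
import theano.tensor as T

A = T.matrix('A')  # layer inputs, one row per example
B = T.matrix('B')  # backpropagated output gradients, one row per example
# squared Frobenius norm of outer(a_i, b_i) for each example i
S, _ = theano.scan(lambda a, b: T.sqr(T.outer(a, b)).sum(), sequences=[A, B])
f = theano.function([A, B], S)
a = np.ones((2, 3), dtype=theano.config.floatX)
b = 2 * np.ones((2, 4), dtype=theano.config.floatX)
print(f(a, b))  # each entry: (1*2)^2 summed over 3*4 cells = 48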
def mvNormal_logp(mu, tau, value):
    """
    This logp function is for the multivariate normal distribution

    Inputs:
    -------
    mu = mu values assumed for each observation (num_obs x dims)
    tau = tau values assumed for each observation (num_obs x dim x dim)
    value = observed values (num_obs x dims)

    Output:
    -------
    output = log likelihood
    """
    dim = mu.shape[-1]
    k = tau.shape[1]
    n_count = value.shape[0]
    delta = value - mu

    # first term: -log det(tau) for each observation
    long_sum1, updates = theano.scan(lambda n: tt.log(1.0 / tt.nlinalg.det(n)),
                                     sequences=[tau],
                                     strict=True)
    # second term: the quadratic form delta' tau delta per observation
    long_sum2, updates = theano.scan(lambda t, d: d.reshape((1, -1)).dot(t).dot(d),
                                     sequences=[tau, delta],
                                     strict=True)

    output = k * tt.log(2 * np.pi)
    output += long_sum1
    output += long_sum2
    output *= -1 / 2.0
    return output
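# --- Illustrative aside (sketch, not from the original code) --------------
# The first scan above maps a determinant over the stack of precision
# matrices; log(1/det) is just -log det. A standalone sketch of that
# per-slice pattern (values are made up):
import numpy as np
import theano
import theano.tensor as tt
import theano.tensor.nlinalg

tau = tt.tensor3('tau')  # (num_obs, dim, dim)
neg_logdet, _ = theano.scan(lambda t: tt.log(1.0 / tt.nlinalg.det(t)),
                            sequences=[tau], strict=True)
f = theano.function([tau], neg_logdet)
t = np.repeat((2.0 * np.eye(3, dtype=theano.config.floatX))[None], 4, axis=0)
print(f(t))  # 4 copies of -log(det(2*I_3)) = -log 8 ~ -2.079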
def inference(self):
    # A bit hacky
    # Re-initialize the visible unit (avoid copying the useless dimshuffle
    # part of the graph computation of v)
    self.v = self.v_init

    # We have to dimshuffle so that time is the first dimension
    self.v = self.v.dimshuffle((1, 0, 2))

    # Write the recurrence to get the bias for the RBM
    (_, bv_t, bh_t), updates_inference = theano.scan(
        fn=self.recurrence,
        sequences=self.v,
        outputs_info=[self.u0, None, None])

    # Reshuffle the variables
    self.bv_dynamic = bv_t.dimshuffle((1, 0, 2))
    self.bh_dynamic = bh_t.dimshuffle((1, 0, 2))
    self.v = self.v.dimshuffle((1, 0, 2))

    # Train the RBMs by blocks
    # Perform k-step Gibbs sampling
    v_chain, updates_rbm = theano.scan(
        fn=lambda v, bv, bh: self.gibbs_step(v, bv, bh)[1],
        outputs_info=[self.v],
        non_sequences=[self.bv_dynamic, self.bh_dynamic],
        n_steps=self.k)

    # Add the updates of the RBM
    updates_inference.update(updates_rbm)

    # Get the last sample of the Gibbs chain
    v_sample = v_chain[-1]
    mean_v = self.gibbs_step(v_sample, self.bv_dynamic, self.bh_dynamic)[0]

    return v_sample, mean_v, updates_inference
def gibbs_all(self, sample, W, vBias, hBias, countSteps, function_mode):
    if function_mode < 3:
        gibbsOne_format = lambda sample: \
            self.list_function_for_gibbs[function_mode](sample, W, vBias, hBias)
        format, updates = theano.scan(fn=gibbsOne_format,
                                      outputs_info=sample,
                                      n_steps=countSteps)
        return format, updates
    else:
        if function_mode == MODE_WITH_COIN_EXCEPT_LAST:
            gibbsOne_format = lambda sample: \
                self.list_function_for_gibbs[MODE_WITH_COIN](sample, W, vBias, hBias)
            format, updates = theano.scan(fn=gibbsOne_format,
                                          outputs_info=sample,
                                          n_steps=countSteps - 1)
            gibbsOne_format = lambda sample: \
                self.list_function_for_gibbs[MODE_WITHOUT_COIN](sample, W, vBias, hBias)
            res = gibbsOne_format(format[-1])
            res = T.concatenate([format, [res]])
            return res, updates
        else:
            gibbsOne_format = lambda sample: \
                self.list_function_for_gibbs[MODE_WITHOUT_COIN](sample, W, vBias, hBias)
            format, updates = theano.scan(fn=gibbsOne_format,
                                          outputs_info=sample,
                                          n_steps=countSteps - 1)
            gibbsOne_format = lambda sample: \
                self.list_function_for_gibbs[MODE_WITH_COIN](sample, W, vBias, hBias)
            res = gibbsOne_format(format[-1])
            res = T.concatenate([format, [res]])
            return res, updates
def call(self, x, mask=None):
    def _step(v1, v2):
        cosine_score = T.tensordot(
            v1 / T.sqrt(T.sum(T.sqr(v1), axis=2, keepdims=True) + 1e-6),
            v2 / T.sqrt(T.sum(T.sqr(v2), axis=2, keepdims=True) + 1e-6),
            [[2], [2]])
        return cosine_score

    l_s = x[0]  # n_b x n_s x n_w_s x D
    l_a = x[1]  # n_b x 4 x n_w_qa x D

    # get cosine similarity for ALL word pairs
    output, _ = theano.scan(_step, sequences=[l_s, l_a],
                            outputs_info=None)  # n_b x n_s x n_w_s x 4 x n_w_qa
    # return T.max(T.max(output, axis=4), axis=2)

    output = output.dimshuffle(2, 1, 0, 3, 4)  # n_w_s x n_s x n_b x 4 x n_w_qa

    def slide_max(i, X):
        size = self.window_size
        M = X[i:i + size]
        W = self.w_gaussian
        return (T.max((W * M.T).T, axis=0),
                theano.scan_module.until(i >= X.shape[0] - size + 1))

    output, _ = theano.scan(
        slide_max,
        sequences=[T.arange(0,
                            stop=(output.shape[0] - self.window_size + 1),
                            step=3,
                            dtype='int32')],
        non_sequences=output)

    if self.use_qa_idf:
        average = weighted_average(output.dimshuffle(2, 1, 0, 3, 4),
                                   x[2], axis=4)
    else:
        average = masked_mean(output.dimshuffle(2, 1, 0, 3, 4), axis=4)

    return T.max(average, axis=2) * self.alpha
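# --- Illustrative aside (sketch, not from the original code) --------------
# theano.scan_module.until, as used in slide_max above, gives scan an early
# exit condition. A self-contained sketch (doubling until a threshold,
# purely illustrative):
import numpy as np
import theano
import theano.tensor as T

limit = T.dscalar('limit')

def step(prev, limit):
    new = prev * 2
    # stop as soon as the condition becomes true
    return new, theano.scan_module.until(new > limit)

vals, _ = theano.scan(step,
                      outputs_info=T.as_tensor_variable(np.float64(1.0)),
                      non_sequences=limit,
                      n_steps=100)
f = theano.function([limit], vals)
print(f(10.0))  # [ 2.  4.  8. 16.]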
def renet_layer_ud(X, Wx, Wh, Wo, Bh, Bo, H0, w, h, wp, hp):
    def recurrence(x_t, h_tm1):
        dot = T.dot(Wx, x_t)
        h_t = T.tanh(dot + T.dot(h_tm1, Wh) + Bh)
        s_t = T.tanh(T.dot(h_t, Wo) + Bo)
        return [h_t, s_t]

    list_of_images = []
    for j in xrange(w/wp):
        # x = X[:,:,j*wp:(j*wp + wp)].dimshuffle((2, 0, 1)).flatten(ndim=2)
        # reshape the row into a 2-D matrix to be fed into scan
        x = X[:, :, j*wp:(j*wp + wp)].dimshuffle((2, 0, 1)).flatten().reshape(
            (h/hp, X.shape[0]*wp*hp))

        [h1, s1], _ = theano.scan(
            fn=recurrence,
            sequences=x,
            outputs_info=[H0, None],
            n_steps=x.shape[0])
        [h2, s2], _ = theano.scan(
            fn=recurrence,
            sequences=x,
            outputs_info=[H0, None],
            n_steps=x.shape[0],
            go_backwards=True)

        # combine the last values of s1 and s2 into an image
        img = T.concatenate([s1.T, s2.T])
        list_of_images.append(img)

    return T.stacklists(list_of_images).dimshuffle((1, 0, 2))
def get_output(self, train=False):
    input = self.get_input(train)
    proj_input = self.activation(T.tensordot(input, self.att_proj, axes=(3, 0)))
    # else:
    #     proj_fun = lambda proj_i, inp: T.tensordot(inp, proj_i, axes=((1, 3), (0, 1)))
    #     lin_proj_input, _ = theano.scan(fn=proj_fun, sequences=self.att_proj, non_sequences=input)
    #     proj_input = self.activation(lin_proj_input.dimshuffle((1, 0, 2, 3)))

    if self.context == 'word':
        att_scores = T.tensordot(proj_input, self.att_scorer, axes=(3, 0))
    elif self.context == 'clause':
        # att_scores = T.tensordot(proj_input, self.att_scorer, axes=(3, 1)).sum(axis=2)
        def step(a_t, h_tm1, W_in, W, sc):
            h_t = T.tanh(T.tensordot(a_t, W_in, axes=(2, 0)) +
                         T.tensordot(h_tm1, W, axes=(2, 0)))
            s_t = T.tensordot(h_t, sc, axes=(2, 0))
            return h_t, s_t

        [_, scores], _ = theano.scan(
            step,
            sequences=[proj_input.dimshuffle(2, 0, 1, 3)],
            outputs_info=[T.zeros((proj_input.shape[0], self.td1, self.rec_hid_dim)), None],
            non_sequences=[self.rec_in_weights, self.rec_hid_weights, self.att_scorer])
        att_scores = scores.dimshuffle(1, 2, 0)
    elif self.context == 'para':
        att_scores = T.tensordot(proj_input, self.att_scorer, axes=(3, 2)).sum(axis=(1, 2))

    # Nested scans. For shame!
    def get_sample_att(sample_input, sample_att):
        sample_att_inp, _ = theano.scan(
            fn=lambda s_att_i, s_input_i: T.dot(s_att_i, s_input_i),
            sequences=[T.nnet.softmax(sample_att), sample_input])
        return sample_att_inp

    att_input, _ = theano.scan(fn=get_sample_att,
                               sequences=[input, att_scores])
    return att_input
def __init__(self, cell, rng, layer_id, shape, X, mask, is_train=1,
             batch_size=1, p=0.5):
    prefix = "SentDecoderLayer_"
    layer_id = "_" + layer_id
    self.in_size, self.out_size = shape
    self.X = X
    self.summs = batch_size

    self.W_hy = init_weights((self.in_size, self.out_size), prefix + "W_hy" + layer_id)
    self.b_y = init_bias(self.out_size, prefix + "b_y" + layer_id)

    if cell == "gru":
        self.decoder = GRULayer(rng, prefix + layer_id, shape, self.X, mask,
                                is_train, 1, p)

        def _active(pre_h, x):
            h = self.decoder._active(x, pre_h)
            y = T.tanh(T.dot(h, self.W_hy) + self.b_y)
            return h, y

        [h, y], updates = theano.scan(
            _active,
            n_steps=self.summs,
            sequences=[],
            outputs_info=[{'initial': self.X, 'taps': [-1]},
                          T.alloc(floatX(0.), 1, self.out_size)])
    elif cell == "lstm":
        self.decoder = LSTMLayer(rng, prefix + layer_id, shape, self.X, mask,
                                 is_train, 1, p)

        def _active(pre_h, pre_c, x):
            h, c = self.decoder._active(x, pre_h, pre_c)
            y = T.tanh(T.dot(h, self.W_hy) + self.b_y)
            return h, c, y

        [h, c, y], updates = theano.scan(
            _active,
            n_steps=self.summs,
            sequences=[],
            outputs_info=[{'initial': self.X, 'taps': [-1]},
                          {'initial': self.X, 'taps': [-1]},
                          T.alloc(floatX(0.), 1, self.out_size)])

    y = T.reshape(y, (self.summs, self.out_size))
    self.activation = y
    self.params = self.decoder.params + [self.W_hy, self.b_y]
def get_output_for(self, net_input, **kwargs):
    if 'unary' in kwargs and kwargs['unary'] == True:
        return net_input

    logger.info('Initializing the messages')
    Wp = self.W
    # Reshuffling the batched unary potential shape so that it can be used
    # for word-level iterations in theano.scan
    unary_sequence = net_input.dimshuffle(1, 0, 2)

    def forward_scan1(unary_sequence, forward_sm, Wp):
        forward_sm = forward_sm + unary_sequence
        forward_sm = theano_logsumexp(forward_sm.dimshuffle(0, 1, 'x') + Wp, 1)
        return forward_sm

    def backward_scan1(unary_sequence, forward_sm, Wp):
        forward_sm = forward_sm + unary_sequence
        forward_sm = theano_logsumexp(forward_sm.dimshuffle(0, 1, 'x') + Wp.T, 1)
        return forward_sm

    forward_results, _ = theano.scan(fn=forward_scan1,
                                     sequences=[unary_sequence],
                                     outputs_info=T.zeros_like(unary_sequence[0]),
                                     non_sequences=[Wp],
                                     n_steps=unary_sequence.shape[0] - 1)
    backward_results, _ = theano.scan(fn=backward_scan1,
                                      sequences=[unary_sequence[::-1]],
                                      outputs_info=T.zeros_like(unary_sequence[0]),
                                      non_sequences=[Wp],
                                      n_steps=unary_sequence.shape[0] - 1)

    backward_results = T.concatenate([backward_results[::-1],
                                      T.zeros_like(backward_results[:1])], axis=0)
    forward_results = T.concatenate([T.zeros_like(forward_results[:1]),
                                     forward_results], axis=0)

    unnormalized_prob = forward_results + unary_sequence + backward_results
    marginal_results = theano_logsumexp(unnormalized_prob, axis=2)
    normalized_prob = unnormalized_prob - marginal_results.dimshuffle(0, 1, 'x')

    # provided for debugging purposes.
    # marginal_all = theano.function([l_in.input_var, l_mask.input_var], marginal_results)
    # probs = theano.function([l_in.input_var, l_mask.input_var], normalized_prob.dimshuffle(1, 0, 2))

    if 'normalized' in kwargs and kwargs['normalized'] == True:
        return normalized_prob.dimshuffle(1, 0, 2)
    else:
        return unnormalized_prob.dimshuffle(1, 0, 2)
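# --- Illustrative aside (sketch, not from the original code) --------------
# theano_logsumexp is assumed to be defined elsewhere in this module. A
# minimal, numerically stable definition consistent with how it is called
# above (reducing away one axis) would be:
import theano.tensor as T

def theano_logsumexp(x, axis=None):
    # log(sum(exp(x))) along `axis`, subtracting the max for stability
    x_max = T.max(x, axis=axis, keepdims=True)
    return T.log(T.sum(T.exp(x - x_max), axis=axis)) + T.max(x, axis=axis)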
def fprop(self, data):
    if self.use_ground_truth:
        self.input_space.validate(data)
        features, phones = data

        init_h = T.alloc(numpy.cast[theano.config.floatX](0), self.nhid)
        init_out = T.alloc(numpy.cast[theano.config.floatX](0), 1)
        init_out = T.unbroadcast(init_out, 0)

        fn = lambda f, p, h, o: self.fprop_step(f, p, h, o)

        ((h, out), updates) = theano.scan(fn=fn,
                                          sequences=[features, phones],
                                          outputs_info=[dict(initial=init_h,
                                                             taps=[-1]),
                                                        init_out])
        return out
    else:
        self.input_space.validate(data)
        features, phones = data

        init_in = features[0]
        init_h = T.alloc(numpy.cast[theano.config.floatX](0), self.nhid)
        init_out = T.alloc(numpy.cast[theano.config.floatX](0), 1)
        init_out = T.unbroadcast(init_out, 0)

        fn = lambda t, p, f, h, o: self.fprop_step_prime(t, p, f, h, o)

        ((f, h, out), updates) = theano.scan(fn=fn,
                                             sequences=[features, phones],
                                             outputs_info=[init_in,
                                                           dict(initial=init_h,
                                                                taps=[-1]),
                                                           init_out])
        return out
def For_MMD_Sub_class(self, target, data, omega, num_FF, Xlabel):
    Num = T.sum(Xlabel, 0)
    D_num = Xlabel.shape[1]
    N = data.shape[0]

    F_times_Omega = T.dot(data, omega)  # minibatch_size * n_rff
    Phi = (self.sf2 ** 0.5 / num_FF ** 0.5) * T.concatenate(
        [T.cos(F_times_Omega), T.sin(F_times_Omega)], 1)
    # each RFF is a column vector of length 2*N_rff

    Phi_total = T.sum(Phi.T, -1) / N

    # a Domain_number x 2*N_rff matrix
    Phi_each_domain, updates = theano.scan(
        fn=lambda a, b: T.switch(T.neq(b, 0), Phi.T * a / b, 0),
        sequences=[Xlabel.T, Num])
    each_Phi = T.sum(Phi_each_domain, -1)

    # first, each domain's inner product with itself; the result is a
    # D-dimensional vector
    each_domain_sum = T.sum(each_Phi * each_Phi, -1)

    # inner product over the whole batch
    tot_sum = T.dot(Phi_total, Phi_total)

    # cross inner products between the whole batch and each domain
    tot_domain_sum, updates = theano.scan(fn=lambda a: a * Phi_total,
                                          sequences=[each_Phi])

    # compute the MMD
    MMD_central = T.sum(each_domain_sum) + D_num * tot_sum - 2 * T.sum(tot_domain_sum)

    return MMD_central
def apply(self):
    result, updates = theano.scan(fn=self.train_step,
                                  sequences=self.f,
                                  outputs_info=[self.A_start, None],
                                  non_sequences=self.A,
                                  n_steps=self.tgt.shape[0])
    best_path_score = result[0][-1].max()
    idx = T.argmax(result[0][-1])
    res2, _ = theano.scan(fn=lambda dps, idx: [dps[idx], idx],
                          sequences=result[1][::-1],
                          outputs_info=[idx, None])
    best_path = res2[1]
    tgt_score = self.decode()
    max_margin = T.sum(T.neq(self.tgt, best_path))
    self.cost = best_path_score + max_margin - tgt_score
    # if T.lt(self.cost, T.alloc(numpy.int64(0))):
    #     self.cost = T.alloc(numpy.int64(0))
    # return T.argmax(result[-1])
    # self.cost = T.mean(T.nnet.categorical_crossentropy(self.p_y_given_x, tgt))
    # return best_path_score
    # return best_path
    return self.cost
def __dealWithOneDoc(self, DocSentenceCount0, oneDocSentenceCount1,
                     docs, corpusPos, oneDocSentenceWordCount, docW, docB,
                     sentenceW, sentenceB, posW, posB):
    # t = T.and_((shareRandge < oneDocSentenceCount1 + 1), (shareRandge >= DocSentenceCount0)).nonzero()
    oneDocSentenceWordCount = oneDocSentenceWordCount[DocSentenceCount0:oneDocSentenceCount1 + 1]

    sentenceResults0, _ = theano.scan(fn=self.__dealWithSentence,
                                      non_sequences=[docs, sentenceW, sentenceB],
                                      sequences=[dict(input=oneDocSentenceWordCount,
                                                      taps=[-1, 0])],
                                      strict=True)
    sentenceResults1, _ = theano.scan(fn=self.__dealWithSentence,
                                      non_sequences=[corpusPos, posW, posB],
                                      sequences=[dict(input=oneDocSentenceWordCount,
                                                      taps=[-1, 0])],
                                      strict=True)
    sentenceResults = T.concatenate([sentenceResults0, sentenceResults1], axis=1)

    # p = printing.Print('docPool')
    # docPool = p(docPool)
    # p = printing.Print('sentenceResults')
    # sentenceResults = p(sentenceResults)
    # p = printing.Print('doc_out')
    # doc_out = p(doc_out)

    doc_out = conv.conv2d(input=sentenceResults, filters=docW)
    docPool = downsample.max_pool_2d(doc_out, (self.__MAXDIM, 1),
                                     mode=self.__pooling_mode,
                                     ignore_border=False)
    docOutput = T.tanh(docPool + docB.dimshuffle([0, 'x', 'x']))
    doc_embedding = docOutput.flatten(1)

    return doc_embedding
def call(self, x, mask=None):
    maxlen = x.shape[1]

    hidden0 = x  # shape: (batch_size, maxlen, hidden_dim)
    pyramid, _ = theano.scan(fn=self.build_pyramid,
                             sequences=T.arange(maxlen - 1),
                             outputs_info=[hidden0],
                             non_sequences=maxlen)
    # shape: (maxlen-1, batch_size, maxlen, hidden_dim)

    hidden0 = K.expand_dims(hidden0, dim=0)
    # shape: (1, batch_size, maxlen, hidden_dim)

    pyramid = K.concatenate([hidden0, pyramid], axis=0)
    # shape: (maxlen, batch_size, maxlen, hidden_dim)

    hierarchy, _ = theano.scan(fn=self.compress_pyramid,
                               sequences=[T.arange(maxlen, 0, -1), pyramid])
    # shape: (maxlen, batch_size, hidden_dim)

    hierarchy = K.permute_dimensions(hierarchy, (1, 0, 2))
    # shape: (batch_size, maxlen, hidden_dim)

    return hierarchy
def _build_model(self, input, options, layers, params, go_backwards=False):
    def _step1(x_, t_, layer_):
        layer_ = str(layer_.data)
        v = layers['conv_' + layer_ + '_v'].conv(x_)
        t = layers['conv_' + layer_ + '_t'].conv(t_)
        h = v + t
        return x_, h

    def _step2(h, r_, layer_):
        layer_ = str(layer_.data)
        o = h + params['b_' + layer_].dimshuffle('x', 0, 'x', 'x')
        if layer_ != str(len(options['filter_shape']) - 1):
            r = layers['conv_' + layer_ + '_r'].conv(r_)
            o = tensor.nnet.relu(o + r)
        return o

    rval = input
    if go_backwards:
        rval = rval[::-1]

    for i in range(len(options['filter_shape'])):
        rval, _ = theano.scan(_step1,
                              sequences=[rval],
                              outputs_info=[rval[0], None],
                              non_sequences=[i],
                              name='rnn_layers_k_' + str(i))
        rval = rval[1]
        rval, _ = theano.scan(_step2,
                              sequences=[rval],
                              outputs_info=[rval[-1]],
                              non_sequences=[i],
                              name='rnn_layers_q_' + str(i))

    proj = rval
    return proj
def layers(self, n_layers=1):
    layers = []
    params = []
    layer_output = []

    for i in xrange(n_layers):
        if i == 0:
            layer_input = self.x.reshape(
                (self.batch_size, self.n_words, self.n_in)).dimshuffle(1, 0, 2)  # 100 * 10 * 32
            layer = FirstLayer(n_i=self.n_in)
        else:
            layer_input = layer_output[-1][::-1]
            layer = Layer(n_i=self.n_in)

        [h, c], _ = theano.scan(fn=layer.forward,
                                sequences=layer_input,
                                outputs_info=[self.h0, self.c0])
        layers.append(layer)
        params.extend(layer.params)
        layer_output.append(h)

    layer_input = layer_output[-1]
    layer = LastLayer(n_i=self.n_in, n_h=self.n_y)
    y, _ = theano.scan(fn=layer.forward,
                       sequences=layer_input,
                       outputs_info=[None])
    layers.append(layer)
    params.extend(layer.params)
    layer_output.append(y)

    return layers, params, layer_output
def predict(self, input):
    # input is an array of vectors (2D np.array)
    self.input = input

    padw = int(self.window / 2)
    if padw > 0:
        padding = np.asarray([np.zeros((self.dim_in,), dtype=theano.config.floatX)] * padw)
        inp = T.concatenate((padding, input, padding), axis=0)
    else:
        inp = self.input

    # slide a window over the (padded) input and flatten each window
    seq = T.arange(T.shape(inp)[0] - self.window + 1)
    self.input, _ = theano.scan(lambda v: inp[v:v + self.window].flatten(),
                                sequences=seq)

    # initialize the gates
    out = theano.shared(np.zeros((self.dim_out,), dtype=theano.config.floatX))

    # gate computations
    def rnn_step(x, h_prev):
        if self.use_bias:
            out = T.nnet.sigmoid(T.dot(x, self.Wx) + T.dot(h_prev, self.Wh) + self.b)
        else:
            out = T.nnet.sigmoid(T.dot(x, self.Wx) + T.dot(h_prev, self.Wh))
        return out

    self.output, _ = theano.scan(fn=rnn_step,
                                 sequences=dict(input=self.input, taps=[0]),
                                 outputs_info=[out])

    if self.use_last_output:
        self.output = self.output[-1]
    if self.pooling is not None:
        self.output = self.pooling(self.output)

    return self.output
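# --- Illustrative aside (sketch, not from the original code) --------------
# The windowing scan above indexes the padded input with a symbolic offset,
# yielding one flattened context window per position. A standalone sketch of
# that indexing pattern (window size is made up):
import numpy as np
import theano
import theano.tensor as T

inp = T.matrix('inp')  # (n_positions, dim)
window = 3
seq = T.arange(inp.shape[0] - window + 1)
# each step slices a block of rows and flattens it into one vector
windows, _ = theano.scan(lambda v: inp[v:v + window].flatten(), sequences=seq)
f = theano.function([inp], windows)
x = np.arange(10, dtype=theano.config.floatX).reshape(5, 2)
print(f(x).shape)  # (3, 6): three windows of 3 rows x 2 dims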
def restrictedBoltzmannMachines(learning_rate, training_epochs, dataset,
                                batch_size, n_chains, n_samples, output_folder,
                                n_hidden, destination_file):
    """
    Demonstrate how to train an RBM and afterwards sample from it using
    Theano. This is demonstrated on MNIST.

    :param learning_rate: learning rate used for training the RBM
    :param training_epochs: number of epochs used for training
    :param dataset: path to the pickled dataset
    :param batch_size: size of a batch used to train the RBM
    :param n_chains: number of parallel Gibbs chains to be used for sampling
    :param n_samples: number of samples to plot for each chain
    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch
    x = T.matrix('x')      # the data is presented as rasterized images

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2 ** 30))

    print " "
    print "###################"
    print "# BUILD THE MODEL #"
    print "###################"
    print " "
    print "Building the model ..."

    # initialize storage for the persistent chain (state = hidden
    # layer of chain)
    persistent_chain = theano.shared(numpy.zeros((batch_size, n_hidden),
                                                 dtype=theano.config.floatX),
                                     borrow=True)

    # construct the RBM class
    rbm = RBM(input=x, n_visible=28 * 28, n_hidden=n_hidden,
              numpy_rng=rng, theano_rng=theano_rng)

    # get the cost and the gradient corresponding to one step of CD-15
    cost, updates = rbm.get_cost_updates(lr=learning_rate,
                                         persistent=persistent_chain, k=15)

    #################################
    #      Training the RBM         #
    #################################
    print " "
    print "####################"
    print "# TRAINING THE RBM #"
    print "####################"
    print " "
    print "Training the RBM ..."

    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    os.chdir(output_folder)

    # start-snippet-5
    # it is ok for a theano function to have no output
    # the purpose of train_rbm is solely to update the RBM parameters
    train_rbm = theano.function(
        [index],
        cost,
        updates=updates,
        givens={x: train_set_x[index * batch_size:(index + 1) * batch_size]},
        name='train_rbm')

    plotting_time = 0.
    start_time = time.clock()

    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through the training set
        mean_cost = []
        for batch_index in xrange(n_train_batches):
            mean_cost += [train_rbm(batch_index)]

        print 'Training epoch %d, cost is ' % epoch, numpy.mean(mean_cost)

        # Plot filters after each training epoch
        plotting_start = time.clock()
        # Construct image from the weight matrix
        image = Image.fromarray(
            tile_raster_images(X=rbm.W.get_value(borrow=True).T,
                               img_shape=(28, 28),
                               tile_shape=(10, 10),
                               tile_spacing=(1, 1)))
        image.save('filters_at_epoch_%i.png' % epoch)
        plotting_stop = time.clock()
        plotting_time += (plotting_stop - plotting_start)

    end_time = time.clock()
    pretraining_time = (end_time - start_time) - plotting_time
    print('Training took %f minutes' % (pretraining_time / 60.))
    # end-snippet-5 start-snippet-6

    #################################
    #     Sampling from the RBM     #
    #################################
    print " "
    print "####################################"
    print "# EXTRACT THE SAMPLES FROM THE RBM #"
    print "####################################"
    print " "
    print "Extracting the samples from the RBM ..."
    # find out the number of test samples
    number_of_test_samples = test_set_x.get_value(borrow=True).shape[0]

    # pick random test examples, with which to initialize the persistent chain
    test_idx = rng.randint(number_of_test_samples - n_chains)
    persistent_vis_chain = theano.shared(
        numpy.asarray(test_set_x.get_value(borrow=True)[test_idx:test_idx + n_chains],
                      dtype=theano.config.floatX))
    # end-snippet-6 start-snippet-7
    plot_every = 1000
    # define one step of Gibbs sampling (mf = mean-field); define a
    # function that does `plot_every` steps before returning the
    # sample for plotting
    ([presig_hids, hid_mfs, hid_samples, presig_vis, vis_mfs, vis_samples],
     updates) = theano.scan(rbm.gibbs_vhv,
                            outputs_info=[None, None, None, None, None,
                                          persistent_vis_chain],
                            n_steps=plot_every)

    # add to updates the shared variable that takes care of our persistent
    # chain
    updates.update({persistent_vis_chain: vis_samples[-1]})
    # construct the function that implements our persistent chain.
    # we generate the "mean field" activations for plotting and the actual
    # samples for reinitializing the state of our persistent chain
    sample_fn = theano.function([], [vis_mfs[-1], vis_samples[-1]],
                                updates=updates,
                                name='sample_fn')

    # create a space to store the image for plotting (we need to leave
    # room for the tile_spacing as well)
    image_data = numpy.zeros((29 * n_samples + 1, 29 * n_chains - 1),
                             dtype='uint8')
    for idx in xrange(n_samples):
        # generate `plot_every` intermediate samples that we discard,
        # because successive samples in the chain are too correlated
        vis_mf, vis_sample = sample_fn()
        print 'Plotting sample ...', idx
        image_data[29 * idx:29 * idx + 28, :] = tile_raster_images(
            X=vis_mf,
            img_shape=(28, 28),
            tile_shape=(1, n_chains),
            tile_spacing=(1, 1))

    # construct image
    image = Image.fromarray(image_data)
    image.save(destination_file)
    # end-snippet-7
    os.chdir('../')
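# --- Illustrative aside (sketch, not from the original code) --------------
# When the step function draws from a RandomStreams (as rbm.gibbs_vhv does),
# scan collects the RNG state updates and they must be passed to
# theano.function. A minimal self-contained sketch of that pattern:
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

srng = RandomStreams(seed=42)
state = theano.shared(np.zeros(4, dtype=theano.config.floatX))

def stochastic_step(prev):
    # a stand-in stochastic step: resample each unit
    return srng.binomial(size=prev.shape, p=0.5, dtype=theano.config.floatX)

chain, updates = theano.scan(stochastic_step, outputs_info=state, n_steps=10)
# Forgetting `updates=updates` here would reuse the same random draws
# on every call.
sample_fn = theano.function([], chain[-1], updates=updates)
print(sample_fn())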
def build_decoder(self, inputs, source, target,
                  smask=None, tmask=None, context=None):
    """
    Build the Pointer Network Decoder Computational Graph
    """
    # inputs : (nb_samples, source_num, ptr_embedd_dim)
    # source : (nb_samples, source_num, source_dim)
    # smask  : (nb_samples, source_num)
    # target : (nb_samples, target_num)
    # tmask  : (nb_samples, target_num)
    # context: (nb_sample, context_dim)

    # initialized hidden state.
    assert context is not None
    Init_h = self.Initializer(context)

    # target is the source inputs.
    X = self.grab_source(inputs, target)  # (nb_samples, target_num, source_dim)
    nb_dim = X.shape[0]
    tg_num = X.shape[1]
    sc_dim = X.shape[2]

    # since it changes to two pointers once a time:
    # concatenate + reshape
    def _get_ht(A, mask=False):
        if A.ndim == 2:
            B = A[:, -1:]
            if mask:
                B *= 0.
            A = T.concatenate([A, B], axis=1)
            return A[:, ::2], A[:, 1::2]
        else:
            B = A[:, -1:, :]
            print B.ndim
            if mask:
                B *= 0.
            A = T.concatenate([A, B], axis=1)
            return A[:, ::2, :], A[:, 1::2, :]

    Xh, Xt = _get_ht(X)
    Th, Tt = _get_ht(target)
    Mh, Mt = _get_ht(tmask, mask=True)

    Xa = Xh + Xt
    # Xa is 3-D here, so slice with three indices
    Xa = T.concatenate([alloc_zeros_matrix(nb_dim, 1, sc_dim),
                        Xa[:, :-1, :]], axis=1)
    Xa = Xa.dimshuffle((1, 0, 2))

    # eat by recurrent net
    def _recurrence(x, prev_h, c, s, s_mask):
        # RNN read-out
        x_out = self.RNN(x, mask=None, C=c, init_h=prev_h, one_step=True)
        h_out = self.att_head(x_out, s, s_mask, return_log=True)
        t_out = self.att_tail(x_out, s, s_mask, return_log=True)
        return x_out, h_out, t_out

    outputs, _ = theano.scan(_recurrence,
                             sequences=[Xa],
                             outputs_info=[Init_h, None, None],
                             non_sequences=[context, source, smask])

    log_prob_head = outputs[1].dimshuffle((1, 0, 2))
    log_prob_tail = outputs[2].dimshuffle((1, 0, 2))
    log_prob = T.sum(self.grab_prob(log_prob_head, Th) * Mh, axis=1) \
             + T.sum(self.grab_prob(log_prob_tail, Tt) * Mt, axis=1)
    return log_prob
def sample_step(x_tm1, h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1, ctx):
    xinp_h1_t, xgate_h1_t = inp_to_h1.proj(x_tm1)
    xinp_h2_t, xgate_h2_t = inp_to_h2.proj(x_tm1)
    xinp_h3_t, xgate_h3_t = inp_to_h3.proj(x_tm1)

    attinp_h1, attgate_h1 = att_to_h1.proj(w_tm1)

    h1_t = cell1.step(xinp_h1_t + attinp_h1, xgate_h1_t + attgate_h1, h1_tm1)
    h1inp_h2, h1gate_h2 = h1_to_h2.proj(h1_t)
    h1inp_h3, h1gate_h3 = h1_to_h3.proj(h1_t)

    a_t = h1_t.dot(h1_to_att_a)
    b_t = h1_t.dot(h1_to_att_b)
    k_t = h1_t.dot(h1_to_att_k)

    a_t = tensor.exp(a_t)
    b_t = tensor.exp(b_t)
    k_t = k_tm1 + tensor.exp(k_t)

    ss_t = calc_phi(k_t, a_t, b_t, u)
    # calculate and return stopping criteria
    sh_t = calc_phi(k_t, a_t, b_t, u_max)
    ss5 = ss_t.dimshuffle(0, 1, 'x')
    ss6 = ss5 * ctx.dimshuffle(1, 0, 2)
    w_t = ss6.sum(axis=1)

    attinp_h2, attgate_h2 = att_to_h2.proj(w_t)
    attinp_h3, attgate_h3 = att_to_h3.proj(w_t)

    h2_t = cell2.step(xinp_h2_t + h1inp_h2 + attinp_h2,
                      xgate_h2_t + h1gate_h2 + attgate_h2,
                      h2_tm1)
    h2inp_h3, h2gate_h3 = h2_to_h3.proj(h2_t)
    h3_t = cell3.step(xinp_h3_t + h1inp_h3 + h2inp_h3 + attinp_h3,
                      xgate_h3_t + h1gate_h3 + h2gate_h3 + attgate_h3,
                      h3_tm1)
    out_t = h1_t.dot(h1_to_outs) + h2_t.dot(h2_to_outs) + h3_t.dot(h3_to_outs)
    out_t = out_t.dimshuffle(1, 0, 'x')

    counter = tensor.arange(out_t.shape[0])
    switch = out_t.shape[0] // 2

    def sample_out_step(c_t, o_tm1, x_tm1, v_h1_tm1):
        j_tm1 = tensor.concatenate((x_tm1, o_tm1), axis=1)
        vinp_h1_t, vgate_h1_t = inp_to_v_h1.proj(j_tm1)
        v_h1_t = v_cell1.step(vinp_h1_t, vgate_h1_t, v_h1_tm1)
        o = v_h1_t.dimshuffle('x', 0, 'x', 1)
        mu_mag, sigma_mag, coeff_mag = _slice_outs(o)
        mu_phase, sigma_phase, coeff_phase = _slice_outs(o)

        # Filthiest of the filthy hacks
        s = tensor.ge(switch, c_t)
        mu = s * (mu_mag) + (1 - s) * (mu_phase)
        sigma = s * (sigma_mag) + (1 - s) * (sigma_phase)
        coeff = s * (coeff_mag) + (1 - s) * (coeff_phase)

        mu = mu[0].dimshuffle(0, 'x', 1)
        sigma = sigma[0].dimshuffle(0, 'x', 1)
        coeff = coeff[0]
        samp_mag = sample_single_dimensional_gmms(mu, sigma, coeff, srng)
        samp_phase = sample_single_dimensional_gmms(mu, sigma, coeff, srng)
        samp_phase = tensor.mod(samp_phase + np.pi, 2 * np.pi) - np.pi
        samp = s * samp_mag + (1 - s) * samp_phase
        return samp, v_h1_t

    init_corr_out = tensor.zeros((out_t.shape[1], n_density))
    init_samp_out = tensor.zeros((out_t.shape[1], 1))
    r, isupdates = theano.scan(fn=sample_out_step,
                               sequences=[counter, out_t],
                               outputs_info=[init_samp_out, init_corr_out])
    corr_out_t = r[0]
    x_t = corr_out_t.dimshuffle(2, 1, 0)[0]
    return x_t, h1_t, h2_t, h3_t, k_t, w_t, ss_t, sh_t, isupdates
""" # Old multistep code which doesn't work with updates in the internal scan n_steps_sym = tensor.iscalar() n_steps_sym.tag.test_value = 10 (sampled, h1_s, h2_s, h3_s, k_s, w_s, stop_s, stop_h), supdates = theano.scan( fn=sample_step, n_steps=n_steps_sym, sequences=[], outputs_info=[init_x, init_h1, init_h2, init_h3, init_kappa, init_w, None, None], non_sequences=[context]) """ (h1, h2, h3, kappa, w), updates = theano.scan( fn=step, sequences=[inp_h1, inpgate_h1, inp_h2, inpgate_h2, inp_h3, inpgate_h3], outputs_info=[init_h1, init_h2, init_h3, init_kappa, init_w], non_sequences=[context]) outs = h1.dot(h1_to_outs) + h2.dot(h2_to_outs) + h3.dot(h3_to_outs) orig_shapes = outs.shape outs = outs.dimshuffle(2, 1, 0) outs = outs.reshape((orig_shapes[2], orig_shapes[1] * orig_shapes[0], 1)) shuff_inpt_shapes = inpt.shape shuff_inpt = inpt.dimshuffle(2, 1, 0) shuff_inpt = shuff_inpt.reshape( (shuff_inpt_shapes[2], shuff_inpt_shapes[1] * shuff_inpt_shapes[0], 1)) def out_step(x_tm1, o_tm1, v_h1_tm1):
input2 = T.dtensor4()
left2 = T.ivector()
right2 = T.ivector()
Slen2 = T.ivector()

input1 = T.dtensor3()
left1 = T.iscalar()
right1 = T.iscalar()
Slen1 = T.iscalar()

# ok = atData(input1, left1, right1, Slen1)
ok, _ = theano.scan(atData, sequences=[input2, left2, right2, Slen2])

myfunc = theano.function([input2, left2, right2, Slen2], ok,
                         on_unused_input='ignore')

input2_init = np.reshape(np.arange(2 * 5280, dtype=theano.config.floatX),
                         (2, 1, 88, 60))
left2_init = np.asarray([1, 2], dtype='int32')
right2_init = np.asarray([60, 59], dtype='int32')
Slen2_init = np.asarray([70, 69], dtype='int32')

input1_init = np.reshape(np.arange(5280, dtype=theano.config.floatX),
                         (1, 88, 60))
left1_init = 1
right1_init = 60
Slen1_init = 70

# ok1, ok2, ok3 = myfunc(input_init, left_init, right_init)
def _zoom(a_lo, a_hi, phi_lo, phi_hi, derphi_lo, phi, derphi, phi0, derphi0,
          c1, c2, n_iters=10, profile=False):
    """
    WRITEME

    Part of the optimization algorithm in `scalar_search_wolfe2`.

    Parameters
    ----------
    a_lo : float
        Step size
    a_hi : float
        Step size
    phi_lo : float
        Value of f at a_lo
    phi_hi : float
        Value of f at a_hi
    derphi_lo : float
        Value of derivative at a_lo
    phi : callable
        Generates computational graph
    derphi : callable
        Generates computational graph
    phi0 : float
        Value of f at 0
    derphi0 : float
        Value of the derivative at 0
    c1 : float
        Wolfe parameter
    c2 : float
        Wolfe parameter
    profile : bool
        True if you want printouts of profiling information
    """
    # Function representing the computations of one step of the while loop
    def while_zoom(phi_rec, a_rec, a_lo, a_hi, phi_hi, phi_lo, derphi_lo,
                   a_star, val_star, valprime):
        # interpolate to find a trial step length between a_lo and
        # a_hi. Need to choose interpolation here. Use cubic
        # interpolation and then if the result is within delta *
        # dalpha or outside of the interval bounded by a_lo or a_hi
        # then use quadratic interpolation; if the result is still too
        # close, then use bisection
        dalpha = a_hi - a_lo
        a = TT.switch(dalpha < zero, a_hi, a_lo)
        b = TT.switch(dalpha < zero, a_lo, a_hi)

        # minimizer of cubic interpolant
        # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
        #
        # if the result is too close to the end points (or out of the
        # interval) then use quadratic interpolation with phi_lo,
        # derphi_lo and phi_hi; if the result is still too close to the
        # end points (or out of the interval) then use bisection

        # cubic interpolation
        cchk = delta1 * dalpha
        a_j_cubic = _cubicmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi,
                              a_rec, phi_rec)
        # quadratic interpolation
        qchk = delta2 * dalpha
        a_j_quad = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
        cond_q = lazy_or('condq',
                         TT.isnan(a_j_quad),
                         a_j_quad > b - qchk,
                         a_j_quad < a + qchk)
        a_j_quad = TT.switch(cond_q,
                             a_lo + numpy.asarray(0.5, dtype=theano.config.floatX) * dalpha,
                             a_j_quad)

        # pick between the two ..
        cond_c = lazy_or('condc',
                         TT.isnan(a_j_cubic),
                         TT.bitwise_or(a_j_cubic > b - cchk,
                                       a_j_cubic < a + cchk))
        # this lazy if actually decides if we need to run the quadratic
        # interpolation
        a_j = TT.switch(cond_c, a_j_quad, a_j_cubic)
        # a_j = ifelse(cond_c, a_j_quad, a_j_cubic)

        # Check new value of a_j
        phi_aj = phi(a_j)
        derphi_aj = derphi(a_j)

        stop = lazy_and('stop',
                        TT.bitwise_and(phi_aj <= phi0 + c1 * a_j * derphi0,
                                       phi_aj < phi_lo),
                        abs(derphi_aj) <= -c2 * derphi0)

        cond1 = TT.bitwise_or(phi_aj > phi0 + c1 * a_j * derphi0,
                              phi_aj >= phi_lo)
        cond2 = derphi_aj * (a_hi - a_lo) >= zero

        # Switches just make more sense here because they have a C
        # implementation and they get composed
        phi_rec = ifelse(cond1,
                         phi_hi,
                         TT.switch(cond2, phi_hi, phi_lo),
                         name='phi_rec')
        a_rec = ifelse(cond1,
                       a_hi,
                       TT.switch(cond2, a_hi, a_lo),
                       name='a_rec')
        a_hi = ifelse(cond1, a_j,
                      TT.switch(cond2, a_lo, a_hi),
                      name='a_hi')
        phi_hi = ifelse(cond1, phi_aj,
                        TT.switch(cond2, phi_lo, phi_hi),
                        name='phi_hi')

        a_lo = TT.switch(cond1, a_lo, a_j)
        phi_lo = TT.switch(cond1, phi_lo, phi_aj)
        derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name='derphi_lo')

        a_star = a_j
        val_star = phi_aj
        valprime = ifelse(cond1, nan,
                          TT.switch(cond2, derphi_aj, nan),
                          name='valprime')

        return ([phi_rec, a_rec, a_lo, a_hi, phi_hi, phi_lo, derphi_lo,
                 a_star, val_star, valprime],
                theano.scan_module.scan_utils.until(stop))

    maxiter = n_iters
    # cubic interpolant check
    delta1 = TT.constant(numpy.asarray(0.2, dtype=theano.config.floatX))
    # quadratic interpolant check
    delta2 = TT.constant(numpy.asarray(0.1, dtype=theano.config.floatX))
    phi_rec = phi0
    a_rec = zero

    # Initial iteration
    dalpha = a_hi - a_lo
    a = TT.switch(dalpha < zero, a_hi, a_lo)
    b = TT.switch(dalpha < zero, a_lo, a_hi)
    # a = ifelse(dalpha < 0, a_hi, a_lo)
    # b = ifelse(dalpha < 0, a_lo, a_hi)

    # minimizer of cubic interpolant
    # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
    #
    # if the result is too close to the end points (or out of the
    # interval) then use quadratic interpolation with phi_lo,
    # derphi_lo and phi_hi; if the result is still too close to the
    # end points (or out of the interval) then use bisection

    # quadratic interpolation
    qchk = delta2 * dalpha
    a_j = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
    cond_q = lazy_or('mcond_q',
                     TT.isnan(a_j),
                     TT.bitwise_or(a_j > b - qchk, a_j < a + qchk))
    a_j = TT.switch(cond_q,
                    a_lo + numpy.asarray(0.5, dtype=theano.config.floatX) * dalpha,
                    a_j)

    # Check new value of a_j
    phi_aj = phi(a_j)
    derphi_aj = derphi(a_j)

    cond1 = TT.bitwise_or(phi_aj > phi0 + c1 * a_j * derphi0,
                          phi_aj >= phi_lo)
    cond2 = derphi_aj * (a_hi - a_lo) >= zero

    # Switches just make more sense here because they have a C
    # implementation and they get composed
    phi_rec = ifelse(cond1,
                     phi_hi,
                     TT.switch(cond2, phi_hi, phi_lo),
                     name='mphirec')
    a_rec = ifelse(cond1,
                   a_hi,
                   TT.switch(cond2, a_hi, a_lo),
                   name='marec')
    a_hi = ifelse(cond1,
                  a_j,
                  TT.switch(cond2, a_lo, a_hi),
                  name='mahi')
    phi_hi = ifelse(cond1,
                    phi_aj,
                    TT.switch(cond2, phi_lo, phi_hi),
                    name='mphihi')

    onlyif = lazy_and('only_if',
                      TT.bitwise_and(phi_aj <= phi0 + c1 * a_j * derphi0,
                                     phi_aj < phi_lo),
                      abs(derphi_aj) <= -c2 * derphi0)

    a_lo = TT.switch(cond1, a_lo, a_j)
    phi_lo = TT.switch(cond1, phi_lo, phi_aj)
    derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name='derphi_lo_main')
    phi_rec.name = 'phi_rec'
    a_rec.name = 'a_rec'
    a_lo.name = 'a_lo'
    a_hi.name = 'a_hi'
    phi_hi.name = 'phi_hi'
    phi_lo.name = 'phi_lo'
    derphi_lo.name = 'derphi_lo'
    vderphi_aj = ifelse(cond1, nan,
                        TT.switch(cond2, derphi_aj, nan),
                        name='vderphi_aj')
    states = [phi_rec, a_rec, a_lo, a_hi, phi_hi, phi_lo, derphi_lo,
              zero, zero, zero]
    # print 'while_zoom'
    outs, updates = scan(while_zoom,
                         outputs_info=states,
                         n_steps=maxiter,
                         name='while_zoom',
                         mode=theano.Mode(linker='cvm_nogc'),
                         profile=profile)
    # print 'done_while'
    a_star = ifelse(onlyif, a_j, outs[7][-1], name='astar')
    val_star = ifelse(onlyif, phi_aj, outs[8][-1], name='valstar')
    valprime = ifelse(onlyif, vderphi_aj, outs[9][-1], name='valprime')

    ## WARNING !! I ignore updates given by scan which I should not do !!!
    return a_star, val_star, valprime
def scalar_search_wolfe2(phi, derphi, phi0=None, old_phi0=None, derphi0=None,
                         n_iters=20, c1=1e-4, c2=0.9, profile=False):
    """
    Find alpha that satisfies strong Wolfe conditions.

    alpha > 0 is assumed to be a descent direction.

    Parameters
    ----------
    phi : callable f(x)
        Objective scalar function.
    derphi : callable f'(x)
        Objective function derivative (can be None)
    phi0 : float, optional
        Value of phi at s=0
    old_phi0 : float, optional
        Value of phi at previous point
    derphi0 : float, optional
        Value of derphi at s=0
    c1 : float
        Parameter for Armijo condition rule.
    c2 : float
        Parameter for curvature condition rule.
    profile : flag (boolean)
        True if you want printouts of profiling information

    Returns
    -------
    alpha_star : float
        Best alpha
    phi_star : WRITEME
        phi at alpha_star
    phi0 : WRITEME
        phi at 0
    derphi_star : WRITEME
        derphi at alpha_star

    Notes
    -----
    Uses the line search algorithm to enforce strong Wolfe conditions. See
    Wright and Nocedal, 'Numerical Optimization', 1999, pg. 59-60. For the
    zoom phase it uses an algorithm by [...].
    """
    if phi0 is None:
        phi0 = phi(zero)
    else:
        phi0 = phi0

    if derphi0 is None and derphi is not None:
        derphi0 = derphi(zero)
    else:
        derphi0 = derphi0

    alpha0 = zero
    alpha0.name = 'alpha0'
    if old_phi0 is not None:
        alpha1 = TT.minimum(one,
                            numpy.asarray(1.01, dtype=theano.config.floatX) *
                            numpy.asarray(2, dtype=theano.config.floatX) *
                            (phi0 - old_phi0) / derphi0)
    else:
        old_phi0 = nan
        alpha1 = one

    alpha1 = TT.switch(alpha1 < zero, one, alpha1)
    alpha1.name = 'alpha1'

    # This shouldn't happen. Perhaps the increment has slipped below
    # machine precision? For now, set the return variables, skip the
    # useless while loop, and raise warnflag=2 due to possible imprecision.
    phi0 = TT.switch(TT.eq(alpha1, zero), old_phi0, phi0)
    # I need a lazyif for alpha1 == 0 !!!
    phi_a1 = ifelse(TT.eq(alpha1, zero), phi0, phi(alpha1), name='phi_a1')
    phi_a1.name = 'phi_a1'

    phi_a0 = phi0
    phi_a0.name = 'phi_a0'
    derphi_a0 = derphi0
    derphi_a0.name = 'derphi_a0'

    # Make sure variables are tensors otherwise strange things happen
    c1 = TT.as_tensor_variable(c1)
    c2 = TT.as_tensor_variable(c2)
    maxiter = n_iters

    def while_search(alpha0, alpha1, phi_a0, phi_a1, derphi_a0, i_t,
                     alpha_star, phi_star, derphi_star):
        derphi_a1 = derphi(alpha1)
        cond1 = TT.bitwise_or(phi_a1 > phi0 + c1 * alpha1 * derphi0,
                              TT.bitwise_and(phi_a1 >= phi_a0, i_t > zero))
        cond2 = abs(derphi_a1) <= -c2 * derphi0
        cond3 = derphi_a1 >= zero
        alpha_star_c1, phi_star_c1, derphi_star_c1 = \
            _zoom(alpha0, alpha1, phi_a0, phi_a1, derphi_a0, phi, derphi,
                  phi0, derphi0, c1, c2, profile=profile)
        alpha_star_c3, phi_star_c3, derphi_star_c3 = \
            _zoom(alpha1, alpha0, phi_a1, phi_a0, derphi_a1, phi, derphi,
                  phi0, derphi0, c1, c2, profile=profile)
        nw_alpha1 = alpha1 * numpy.asarray(2, dtype=theano.config.floatX)
        nw_phi = phi(nw_alpha1)
        alpha_star, phi_star, derphi_star = \
            ifelse(cond1,
                   (alpha_star_c1, phi_star_c1, derphi_star_c1),
                   ifelse(cond2,
                          (alpha1, phi_a1, derphi_a1),
                          ifelse(cond3,
                                 (alpha_star_c3, phi_star_c3, derphi_star_c3),
                                 (nw_alpha1, nw_phi, nan),
                                 name='alphastar_c3'),
                          name='alphastar_c2'),
                   name='alphastar_c1')

        return ([alpha1,
                 nw_alpha1,
                 phi_a1,
                 ifelse(lazy_or('allconds', cond1, cond2, cond3),
                        phi_a1,
                        nw_phi,
                        name='nwphi1'),
                 ifelse(cond1, derphi_a0, derphi_a1, name='derphi'),
                 i_t + one,
                 alpha_star,
                 phi_star,
                 derphi_star],
                theano.scan_module.scan_utils.until(
                    lazy_or('until_cond_',
                            TT.eq(nw_alpha1, zero),
                            cond1,
                            cond2,
                            cond3)))

    states = [alpha0, alpha1, phi_a0, phi_a1, derphi_a0]
    # i_t
    states.append(zero)
    # alpha_star
    states.append(zero)
    # phi_star
    states.append(zero)
    # derphi_star
    states.append(zero)
    # print 'while_search'
    outs, updates = scan(while_search,
                         outputs_info=states,
                         n_steps=maxiter,
                         name='while_search',
                         mode=theano.Mode(linker='cvm_nogc'),
                         profile=profile)
    # print 'done_while_search'
    out3 = outs[-3][-1]
    out2 = outs[-2][-1]
    out1 = outs[-1][-1]
    alpha_star, phi_star, derphi_star = \
        ifelse(TT.eq(alpha1, zero),
               (nan, phi0, nan),
               (out3, out2, out1),
               name='main_alphastar')
    return alpha_star, phi_star, phi0, derphi_star
def __theano_build__(self):
    parameters = [E, V, U, W, b, c] = self.E, self.V, self.U, self.W, self.b, self.c

    x = T.imatrix('x')
    y = T.imatrix('y')
    conversion_ones = T.ones((self.mini_batch_size, 1))

    def forward_prop_step(x_t, s_prev1, s_prev2, s_prev3):
        # Embedding layer
        x_e = E[:, x_t]

        def GRU(i, U, W, b, x_0, s_previous):
            b1 = T.specify_shape((conversion_ones * b[i * 3, :]).T, T.shape(x_0))
            b2 = T.specify_shape((conversion_ones * b[i * 3 + 1, :]).T, T.shape(x_0))
            b3 = T.specify_shape((conversion_ones * b[i * 3 + 2, :]).T, T.shape(x_0))
            z = T.nnet.hard_sigmoid(U[i * 3 + 0].dot(x_0) +
                                    W[i * 3 + 0].dot(s_previous) + b1)
            r = T.nnet.hard_sigmoid(U[i * 3 + 1].dot(x_0) +
                                    W[i * 3 + 1].dot(s_previous) + b2)
            s_candidate = T.tanh(U[i * 3 + 2].dot(x_0) +
                                 W[i * 3 + 2].dot(s_previous * r) + b3)
            return (T.ones_like(z) - z) * s_candidate + z * s_previous

        # GRU Layer 1
        s1 = GRU(0, U, W, b, x_e, s_prev1)
        # GRU Layer 2
        s2 = GRU(1, U, W, b, s1, s_prev2)
        # GRU Layer 3
        s3 = GRU(2, U, W, b, s2, s_prev3)

        # Final output calculation
        c_matrix = (conversion_ones * c).T
        juju = V.dot(s3) + c_matrix
        o_t = T.nnet.softmax(juju.T).T

        return [o_t, s1, s2, s3]

    # p_o = printing.Print('prediction')
    [o, s1, s2, s3], updates = theano.scan(
        forward_prop_step,
        sequences=x.T,
        truncate_gradient=self.bptt_truncate,
        outputs_info=[
            None,
            dict(initial=T.zeros((self.hidden_dim, self.mini_batch_size))),
            dict(initial=T.zeros((self.hidden_dim, self.mini_batch_size))),
            dict(initial=T.zeros((self.hidden_dim, self.mini_batch_size)))
        ])

    def p(j, name):
        return printing.Print(name)(j)

    prediction = T.argmax(o, axis=1)
    e = ((prediction - y.T) ** 2) / (T.shape(prediction)[0] * T.shape(prediction)[1])
    cost_batch = self.calculate_ce_vector(o, y)
    mse_cost_batch = self.calculate_mean_squared_error_vector(prediction, y)

    # Total cost (use a float literal so Python 2 integer division
    # does not zero it out)
    cost = (1.0 / self.mini_batch_size) * self.calculate_error(o, y)

    # Gradients
    derivatives = self.calculate_gradients(cost, parameters)

    # Assign functions
    self.predict = theano.function([x], [o])
    self.predict_class = theano.function([x, y], [prediction, e],
                                         allow_input_downcast=True)
    self.error = theano.function([x, y], e)
    self.calculate_loss_vector = theano.function([x, y], cost_batch,
                                                 allow_input_downcast=True)
    self.calculate_mse_vector = theano.function([x, y], mse_cost_batch,
                                                allow_input_downcast=True)
    self.ce_error = theano.function([x, y], cost, allow_input_downcast=True)
    self.bptt = theano.function([x, y], derivatives, allow_input_downcast=True)

    # SGD parameters
    # rmsprop cache updates
    self.update_RMSPROP(cost, parameters, derivatives, x, y)
def __init__(self, rng, input, n_in, n_out, n_attendout, initial_hidden=None, W_rec=None, activation=T.tanh): self.input = input self.n_in = n_in self.n_out = n_out self.n_attendout = n_attendout self.n_attendin = 100 self.type = 'attendrnn' if initial_hidden is None: initial_hidden_values_s = numpy.zeros((n_out, ), dtype=theano.config.floatX) initial_hidden_s = theano.shared(value=initial_hidden_values_s, name='s0', borrow=True) self.s0 = initial_hidden_s if W_rec is None: W_type1 = numpy.asarray(rng.uniform( low=-numpy.sqrt(6. / (n_in + n_out)), high=numpy.sqrt(6. / (n_in + n_out)), size=(n_in, n_out)), dtype=theano.config.floatX) W_type2 = numpy.asarray(rng.uniform( low=-numpy.sqrt(6. / (n_in + n_out)), high=numpy.sqrt(6. / (n_in + n_out)), size=(n_out, n_out)), dtype=theano.config.floatX) W_type3 = numpy.asarray(rng.uniform( low=-numpy.sqrt(6. / (n_in + n_out)), high=numpy.sqrt(6. / (n_in + n_out)), size=(self.n_attendin, self.n_attendout)), dtype=theano.config.floatX) W_type4 = numpy.asarray(rng.uniform( low=-numpy.sqrt(6. / (n_in + n_out)), high=numpy.sqrt(6. / (n_in + n_out)), size=(self.n_in, self.n_attendin)), dtype=theano.config.floatX) W_type5 = numpy.asarray(rng.uniform( low=-numpy.sqrt(6. / (n_in + n_out)), high=numpy.sqrt(6. / (n_in + n_out)), size=(self.n_out, self.n_attendin)), dtype=theano.config.floatX) b_values = numpy.zeros((n_out, ), dtype=theano.config.floatX) W_ic = theano.shared(value=W_type1, name='W_ic', borrow=True) W_rec = theano.shared(value=W_type2, name='W_rec', borrow=True) W_outattend = theano.shared(value=W_type3, name='W_outattend', borrow=True) W_inattend_feat = theano.shared(value=W_type4, name='W_inattend_feat', borrow=True) W_inattend_prevstate = theano.shared(value=W_type5, name='W_inattend_prevstate', borrow=True) b = theano.shared(value=b_values, name='b', borrow=True) self.W_ic = W_ic self.W_rec = W_rec self.W_outattend = W_outattend self.W_inattend_feat = W_inattend_feat self.W_inattend_prevstate = W_inattend_prevstate self.b = b self.delta_W_ic = theano.shared(value=numpy.zeros( (n_in, n_out), dtype=theano.config.floatX), name='delta_W_ic') self.delta_W_rec = theano.shared(value=numpy.zeros( (n_out, n_out), dtype=theano.config.floatX), name='delta_W_rec') self.delta_W_outattend = theano.shared(value=numpy.zeros( (self.n_attendin, self.n_attendout), dtype=theano.config.floatX), name='delta_W_outattend') self.delta_W_inattend_feat = theano.shared( value=numpy.zeros((n_in, self.n_attendin), dtype=theano.config.floatX), name='delta_W_inattend_feat') self.delta_W_inattend_prevstate = theano.shared( value=numpy.zeros((n_out, self.n_attendin), dtype=theano.config.floatX), name='delta_W_inattend_prevstate') self.delta_b = theano.shared(value=numpy.zeros_like( self.b.get_value(borrow=True), dtype=theano.config.floatX), name='delta_b') self.test8 = numpy.zeros((8, ), dtype=theano.config.floatX) # sequences: h_l # prior results: s_tm1 # non sequences: W_outattend, W_inattend_prevstate, W_ic, W_rec, b, W_inattend_feat def one_step(h_l, s_tm1, W_outattend, W_inattend_prevstate, W_ic, W_rec, b, W_inattend_feat): e_tl = T.dot( T.tanh( T.dot(s_tm1, W_inattend_prevstate) + T.dot(h_l, W_inattend_feat)), W_outattend) a_tl = T.exp(e_tl) / (T.exp(e_tl)).sum(0, keepdims=True) c_t = T.dot(a_tl, self.input) s_t = T.tanh(T.dot(c_t, W_ic) + T.dot(s_tm1, W_rec) + b) return s_t self.y_vals, _ = theano.scan(fn=one_step, sequences=self.input, outputs_info=self.s0, non_sequences=[ self.W_outattend, self.W_inattend_prevstate, self.W_ic, self.W_rec, self.b, self.W_inattend_feat 
]) # parameters of the model self.params = [ self.W_outattend, self.W_inattend_prevstate, self.W_ic, self.W_rec, self.b, self.W_inattend_feat ] self.delta_params = [ self.delta_W_outattend, self.delta_W_inattend_prevstate, self.delta_W_ic, self.delta_W_rec, self.delta_b, self.delta_W_inattend_feat ] sigma = lambda x: 1 / (1 + T.exp(-x)) self.output = sigma(self.y_vals)
def get_cost_updates(self, lr=0.1, persistent=None, k=1): """This functions implements one step of CD-k or PCD-k :param lr: learning rate used to train the RBM :param persistent: None for CD. For PCD, shared variable containing old state of Gibbs chain. This must be a shared variable of size (batch size, number of hidden units). :param k: number of Gibbs steps to do in CD-k/PCD-k Returns a proxy for the cost and the updates dictionary. The dictionary contains the update rules for weights and biases but also an update of the shared variable used to store the persistent chain, if one is used. """ # compute positive phase pre_sigmoid_ph, ph_mean, ph_sample = self.sample_h_given_v(self.input) # decide how to initialize persistent chain: # for CD, we use the newly generate hidden sample # for PCD, we initialize from the old state of the chain if persistent is None: chain_start = ph_sample else: chain_start = persistent # end-snippet-2 # perform actual negative phase # in order to implement CD-k/PCD-k we need to scan over the # function that implements one gibbs step k times. # Read Theano tutorial on scan for more information : # http://deeplearning.net/software/theano/library/scan.html # the scan will return the entire Gibbs chain ([ pre_sigmoid_nvs, nv_means, nv_samples, pre_sigmoid_nhs, nh_means, nh_samples ], updates) = theano.scan( self.gibbs_hvh, # the None are place holders, saying that # chain_start is the initial state corresponding to the # 6th output outputs_info=[None, None, None, None, None, chain_start], n_steps=k) # start-snippet-3 # determine gradients on RBM parameters # note that we only need the sample at the end of the chain chain_end = nv_samples[-1] cost = T.mean(self.free_energy(self.input)) - T.mean( self.free_energy(chain_end)) # We must not compute the gradient through the gibbs sampling gparams = T.grad(cost, self.params, consider_constant=[chain_end]) # end-snippet-3 start-snippet-4 # constructs the update dictionary for gparam, param in zip(gparams, self.params): # make sure that the learning rate is of the right dtype updates[param] = param - gparam * T.cast( lr, dtype=theano.config.floatX) if persistent: # Note that this works only if persistent is a shared variable updates[persistent] = nh_samples[-1] # pseudo-likelihood is a better proxy for PCD monitoring_cost = self.get_pseudo_likelihood_cost(updates) else: # reconstruction cross-entropy is a better proxy for CD monitoring_cost = self.get_reconstruction_cost( updates, pre_sigmoid_nvs[-1]) return monitoring_cost, updates
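# A minimal usage sketch for the PCD path described in the docstring; the RBM
# constructor and the sizes below are assumptions modeled on the surrounding
# tutorial code, not taken from this snippet.
import numpy
import theano
import theano.tensor as T

x = T.matrix('x')
rbm = RBM(input=x, n_visible=784, n_hidden=500)  # assumed constructor

# PCD needs a shared variable of shape (batch_size, n_hidden) holding the old
# state of the Gibbs chain.
persistent_chain = theano.shared(
    numpy.zeros((20, 500), dtype=theano.config.floatX), borrow=True)

cost, updates = rbm.get_cost_updates(lr=0.1, persistent=persistent_chain, k=15)
train_rbm = theano.function([x], cost, updates=updates)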
def build_minibatch(self, batch_size): ''' dimension: n_steps * batch_size * embed_dim :return: ''' V, U, W, b, c = self.V, self.U, self.W, self.b, self.c x = T.tensor3('x') y = T.matrix('y') m = T.matrix('mask') self.batch_size = batch_size def forward_prop_step(x_t, m_t, s_t_prev): # This is how we calculated the hidden state in a simple RNN. No longer! # s_t = T.tanh(U[:,x_t] + W.dot(s_t1_prev)) # GRU Layer z_t = T.nnet.hard_sigmoid(T.dot(x_t, U[0]) + T.dot(s_t_prev, W[0]) + b[0]) r_t = T.nnet.hard_sigmoid(T.dot(x_t, U[1]) + T.dot(s_t_prev, W[1]) + b[1]) c_t = T.tanh(T.dot(x_t, U[2]) + T.dot((s_t_prev*r_t), W[2]) + b[2]) s_t = (T.ones_like(z_t) - z_t) * c_t + z_t * s_t_prev s_t = m_t[:, None] * s_t + (1.0 - m_t)[:, None] * s_t_prev return s_t s, _ = theano.scan( forward_prop_step, sequences=[x, m], truncate_gradient=self.bptt_truncate, outputs_info=[dict(initial=T.zeros((batch_size, self.hidden_dim)))]) # Final output calculation # Theano's softmax returns a matrix with one row, we only need the row p_y = T.nnet.softmax(T.dot(s[-1], V) + c) # [0] prediction = T.argmax(p_y, axis=1) o_error = T.sum(T.nnet.categorical_crossentropy(p_y, y))/self.batch_size # Total cost (could add regularization here) self.cost = o_error # Assign functions self.predict = theano.function([x, m], p_y) self.predict_class = theano.function([x, m], prediction) self.ce_error = theano.function([x, y, m], self.cost) # Gradients dU = T.grad(self.cost, U) dW = T.grad(self.cost, W) db = T.grad(self.cost, b) dV = T.grad(self.cost, V) dc = T.grad(self.cost, c) self.bptt = theano.function([x, y, m], [dU, dW, db, dV, dc]) # SGD parameters learning_rate = T.scalar('learning_rate') decay = T.scalar('decay') # rmsprop cache updates mU = decay * self.mU + (1 - decay) * dU ** 2 mW = decay * self.mW + (1 - decay) * dW ** 2 mV = decay * self.mV + (1 - decay) * dV ** 2 mb = decay * self.mb + (1 - decay) * db ** 2 mc = decay * self.mc + (1 - decay) * dc ** 2 self.f_update = theano.function( [x, y, m, learning_rate, theano.In(decay, value=0.9)], [], updates=[ (U, U - learning_rate * dU / T.sqrt(mU + 1e-6)), (W, W - learning_rate * dW / T.sqrt(mW + 1e-6)), (V, V - learning_rate * dV / T.sqrt(mV + 1e-6)), (b, b - learning_rate * db / T.sqrt(mb + 1e-6)), (c, c - learning_rate * dc / T.sqrt(mc + 1e-6)), (self.mU, mU), (self.mW, mW), (self.mV, mV), (self.mb, mb), (self.mc, mc) ])
#This is just a template: it does not learn anything, and always returns the class "0":
W0 = theano.shared(numpy.ones((n_words, word_embedding_size)), 'W0')
W1 = theano.shared(numpy.ones((n_words, word_embedding_size)), 'W1')

def rnn_step(x, h_prev, W0, W1):
    b = theano.tensor.dot(W0, x)
    a = theano.tensor.dot(W1, h_prev)
    c = a + b
    return theano.tensor.tanh(c)

initial_context_vector = theano.tensor.alloc(
    numpy.array(0, dtype=theano.config.floatX), n_words)
activations, other_info = theano.scan(rnn_step,
                                      sequences=input_vectors,
                                      outputs_info=initial_context_vector,
                                      non_sequences=[W0, W1])
activations = activations[-1]
predicted_class = theano.tensor.argmax(activations)
output = theano.tensor.nnet.softmax(activations)[0]
cost = -theano.tensor.log(output[target_class])
updates = [(word_embeddings,
            word_embeddings - .1 * theano.tensor.grad(cost, word_embeddings)),
           (W0, W0 - .1 * theano.tensor.grad(cost, W0)),
           (W1, W1 - .1 * theano.tensor.grad(cost, W1))]
theano.config.on_unused_input = 'ignore'

Accuracy = -cost #Change this to something meaningful and it will work!
train = theano.function([input_indices, target_class],
                        [Accuracy, predicted_class],
                        updates=updates)
def build_model(tparams, options): # MIKE: why is this not a shared variable as in # trng = theano.tensor.shared_randomstreams.RandomStreams(1234) trng = RandomStreams(SEED) # Used for dropout. use_noise = theano.shared(numpy_floatX(0.)) x = tensor.matrix('x', dtype='int64') mask = tensor.matrix('mask', dtype=config.floatX) xt = tensor.matrix('xt', dtype=config.floatX) y = tensor.matrix('y', dtype='int64') yt = tensor.matrix('yt', dtype=config.floatX) n_timesteps = x.shape[0] n_examples = x.shape[1] if (options['arch_remap_input']): emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps, n_examples, options['n_hid']]) else: Wemb = theano.shared( numpy.concatenate( (numpy.zeros((1,options['n_hid']),dtype=config.floatX), numpy.identity(options['n_hid'],dtype=config.floatX)), axis=0), name='Wemb') emb = Wemb[x.flatten()].reshape([n_timesteps, n_examples, options['n_hid']]) # this is the call to either lstm_layer or hpm_layer if (options['encoder'] == 'lstm'): proj = get_layer(options['encoder'])[1]( tparams, emb, xt, yt, options, prefix=options['encoder'], mask=mask) h = c = None else: proj, h, c = get_layer(options['encoder'])[1]( tparams, emb, xt, yt, options, prefix=options['encoder'], mask=mask) # proj has dim n_timesteps X n_examples X n_hid if options['use_dropout']: proj = dropout_layer(proj, use_noise, trng) def _step(proj_step): if (options['arch_output_fn'] == 'softmax'): pred_prob_step = tensor.nnet.softmax( tensor.dot(proj_step, tparams['U']) + tparams['b']) elif (options['arch_output_fn'] == 'logistic'): pred_prob_step = tensor.nnet.sigmoid( tensor.dot(proj_step, tparams['U']) + tparams['b']) else: # '1-1' pred_prob_step = (proj_step+1.0e-6) / tensor.sum(proj_step+1.0e-6,axis=1,keepdims=True) # No longer needed if there's no '0' output #pred_prob_step = tensor.concatenate([tensor.alloc(0,n_examples,1), # pred_prob_step], axis=1) return pred_prob_step # pred_prob_step should have dim n_examples X n_outputs # pred_prob has dim n_timesteps x n_examples x n_outputs # pred_step has have dim n_examples pred_prob, updates = theano.scan(_step, sequences=proj, outputs_info=None, non_sequences=None, n_steps=n_timesteps) def _cost_step_norm(pred_prob_step, y_step): # tgt_prob_step should have dim n_examples tgt_prob_step = tensor.switch(tensor.eq(y_step, 0), 1.0, pred_prob_step[tensor.arange(n_examples),y_step-1]) pred_ix_step = tensor.argmax(pred_prob_step,axis=1) + 1 if (options['type_token_sim']): corr_step = tensor.switch(tensor.eq(y_step, 0), 0, tensor.switch(tensor.eq((y_step-1)//5, (pred_ix_step-1)//5), 1, -1)) else: corr_step = tensor.switch(tensor.eq(y_step, 0), 0, tensor.switch(tensor.eq(y_step,pred_ix_step), 1, -1)) return tgt_prob_step, corr_step # cost function for predicting target value of a specific event # tgt_prob_step should have dim n_examples def _cost_step_tgt(pred_prob_step, y_step): tgt_prob_step = tensor.switch(tensor.eq(y_step, 0), 1.0, tensor.switch(tensor.gt(y_step, 0), pred_prob_step[tensor.arange(n_examples),y_step-1], 1.0-pred_prob_step[tensor.arange(n_examples),-y_step-1])) corr_step = tensor.switch(tensor.eq(y_step, 0), 0, tensor.switch(tensor.gt(tgt_prob_step, 0.5), 1, -1)) return tgt_prob_step, corr_step if (options['signed_out']): cost_fn = _cost_step_tgt else: cost_fn = _cost_step_norm (tgt_prob, corr), updates = theano.scan(cost_fn, sequences=[pred_prob, y], outputs_info=None, non_sequences=None, n_steps=n_timesteps) off = 1e-8 if tgt_prob.dtype == 'float16': off = 1e-6 # tgt_prob: probability correct (dimensions n_timesteps X n_examples) cost = 
-tensor.sum(tensor.log(tgt_prob.clip(off, 1.0))) # Note: not dividing by count because it will reweight minibatch by size # / tensor.sum(tensor.gt(y,0)) return use_noise, x, xt, y, yt, mask, pred_prob, corr, cost, proj, h, c, tgt_prob
def create_gradientfunctions(self,data): """This function takes as input the whole dataset and creates the entire model""" def encodingstep(x_t, h_t): return T.tanh(self.params["W_xhe"].dot(x_t) + self.params["W_hhe"].dot(h_t) + self.params["b_he"]) x = T.tensor3("x") h0_enc = T.matrix("h0_enc") result, _ = theano.scan(encodingstep, sequences = x, outputs_info = h0_enc) h_encoder = result[-1] #log sigma encoder is squared mu_encoder = T.dot(self.params["W_hmu"],h_encoder) + self.params["b_hmu"] log_sigma_encoder = T.dot(self.params["W_hsigma"],h_encoder) + self.params["b_hsigma"] #Use a very wide prior to make it possible to learn something with Z logpz = 0.005 * T.sum(1 + log_sigma_encoder - mu_encoder**2 - T.exp(log_sigma_encoder), axis = 0) seed = 42 if "gpu" in theano.config.device: srng = theano.sandbox.cuda.rng_curand.CURAND_RandomStreams(seed=seed) else: srng = T.shared_randomstreams.RandomStreams(seed=seed) #Reparametrize Z eps = srng.normal((self.latent_variables,self.batch_size), avg = 0.0, std = 1.0, dtype=theano.config.floatX) z = mu_encoder + T.exp(0.5 * log_sigma_encoder) * eps h0_dec = T.tanh(self.params["W_zh"].dot(z) + self.params["b_zh"]) def decodingstep(x_t, h_t): h = T.tanh(self.params["W_hhd"].dot(h_t) + self.params["W_xhd"].dot(x_t) + self.params["b_hd"]) x = T.nnet.sigmoid(self.params["W_hx"].dot(h) + self.params["b_hx"]) return x, h x0 = T.matrix("x0") [y, _], _ = theano.scan(decodingstep, n_steps = x.shape[0], outputs_info = [x0, h0_dec]) # Clip y to avoid NaNs, necessary when lowerbound goes to 0 y = T.clip(y, 1e-6, 1 - 1e-6) logpxz = T.sum(-T.nnet.binary_crossentropy(y,x), axis = 1) logpxz = T.mean(logpxz, axis = 0) #Average over time dimension logpx = T.mean(logpxz + logpz) #Compute all the gradients gradients = T.grad(logpx, self.params.values()) #Let Theano handle the updates on parameters for speed updates = OrderedDict() epoch = T.iscalar("epoch") gamma = T.sqrt(1 - (1 - self.b2)**epoch)/(1 - (1 - self.b1)**epoch) #Adam for parameter, gradient, m, v in zip(self.params.values(), gradients, self.m.values(), self.v.values()): new_m = self.b1 * gradient + (1 - self.b1) * m new_v = self.b2 * (gradient**2) + (1 - self.b2) * v updates[parameter] = parameter + self.learning_rate * gamma * new_m / (T.sqrt(new_v)+ 1e-8) updates[m] = new_m updates[v] = new_v batch = T.iscalar('batch') givens = { h0_enc: np.zeros((self.hidden_units_encoder,self.batch_size)).astype(theano.config.floatX), x0: np.zeros((self.features,self.batch_size)).astype(theano.config.floatX), x: data[:,:,batch*self.batch_size:(batch+1)*self.batch_size] } self.updatefunction = theano.function([batch,epoch], logpx, updates=updates, givens=givens, allow_input_downcast=True) return True
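# The `gamma` factor above folds Adam's bias correction into the step size.
# Assuming `self.b1` and `self.b2` stand for 1 - beta_1 and 1 - beta_2 (which
# is consistent with how they weight the moment updates), this is
#
#   \gamma_t = \frac{\sqrt{1 - \beta_2^t}}{1 - \beta_1^t}, \qquad
#   \theta_{t+1} = \theta_t + \eta \, \gamma_t \, \frac{m_t}{\sqrt{v_t} + \epsilon},
#
# with a plus sign because the function ascends the lower bound `logpx`.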
def lstm_layer(tparams, state_below, xt, yt, options, prefix='lstm', mask=None): # xt and yt are used as additional inputs nsteps = state_below.shape[0] if state_below.ndim == 3: n_samples = state_below.shape[1] else: n_samples = 1 assert mask is not None def _slice(_x, n, dim): if _x.ndim == 3: return _x[:, :, n * dim:(n + 1) * dim] return _x[:, n * dim:(n + 1) * dim] def _step(m_, x_, delta_t_input, delta_t_output, h_, c_): # h_ has dim n_training_examples X n_lstm # delta_t_input has dim n_training_examples # include input and output delta_t values as LSTM input features if (options['arch_lstm_include_delta_t']): h_aug = tensor.concatenate([h_, delta_t_input[:,None], delta_t_output[:,None]], axis=1) else: h_aug = h_ preact = tensor.dot(h_aug, tparams[_p(prefix, 'U')]) preact += x_ c = tensor.tanh(_slice(preact, 3, options['n_hid'])) # original code: c = f * c_ + i * c if (options['arch_lstm_include_input_gate']): i = tensor.nnet.sigmoid(_slice(preact, 0, options['n_hid'])) c = i * c if (options['arch_lstm_include_forget_gate']): f = tensor.nnet.sigmoid(_slice(preact, 1, options['n_hid'])) c = c + f * c_ else: c = c + c_ c = m_[:, None] * c + (1. - m_)[:, None] * c_ if (options['arch_lstm_include_output_gate']): o = tensor.nnet.sigmoid(_slice(preact, 2, options['n_hid'])) h = o * tensor.tanh(c) else: h = tensor.tanh(c) h = m_[:, None] * h + (1. - m_)[:, None] * h_ return h, c state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')]) n_hid = options['n_hid'] rval, updates = theano.scan(_step, sequences=[mask, state_below, xt, yt], outputs_info=[tensor.alloc(numpy_floatX(0.), n_samples, n_hid), tensor.alloc(numpy_floatX(0.), n_samples, n_hid)], name=_p(prefix, '_layers'), n_steps=nsteps) return rval[0]
def call(self, x, mask=None):
    # TODO: validate input shape
    assert (len(x) == 3)
    L_flat = x[0]
    mu = x[1]
    a = x[2]
    if self.mode == 'full':
        # Create L and L^T matrix, which we use to construct the positive-definite matrix P.
        L = None
        LT = None
        if K.backend() == 'theano':
            import theano.tensor as T
            import theano

            def fn(x, L_acc, LT_acc):
                x_ = K.zeros((self.nb_actions, self.nb_actions))
                x_ = T.set_subtensor(x_[np.tril_indices(self.nb_actions)], x)
                diag = K.exp(T.diag(x_)) + K.epsilon()
                x_ = T.set_subtensor(x_[np.diag_indices(self.nb_actions)], diag)
                return x_, x_.T

            outputs_info = [
                K.zeros((self.nb_actions, self.nb_actions)),
                K.zeros((self.nb_actions, self.nb_actions)),
            ]
            results, _ = theano.scan(fn=fn, sequences=L_flat, outputs_info=outputs_info)
            L, LT = results
        elif K.backend() == 'tensorflow':
            import tensorflow as tf

            # Number of elements in a triangular matrix.
            nb_elems = (self.nb_actions * self.nb_actions + self.nb_actions) // 2

            # Create mask for the diagonal elements in L_flat. This is used to exponentiate
            # only the diagonal elements, which is done before gathering.
            diag_indices = [0]
            for row in range(1, self.nb_actions):
                diag_indices.append(diag_indices[-1] + (row + 1))
            diag_mask = np.zeros(1 + nb_elems)  # +1 for the leading zero
            diag_mask[np.array(diag_indices) + 1] = 1
            diag_mask = K.variable(diag_mask)

            # Add leading zero element to each element in the L_flat. We use this zero
            # element when gathering L_flat into a lower triangular matrix L.
            nb_rows = tf.shape(L_flat)[0]
            zeros = tf.expand_dims(tf.tile(K.zeros((1,)), [nb_rows]), 1)
            try:
                # Old TF behavior.
                L_flat = tf.concat(1, [zeros, L_flat])
            except TypeError:
                # New TF behavior
                L_flat = tf.concat([zeros, L_flat], 1)

            # Create mask that can be used to gather elements from L_flat and put them
            # into a lower triangular matrix.
            tril_mask = np.zeros((self.nb_actions, self.nb_actions), dtype='int32')
            tril_mask[np.tril_indices(self.nb_actions)] = range(1, nb_elems + 1)

            # Finally, process each element of the batch.
            init = [
                K.zeros((self.nb_actions, self.nb_actions)),
                K.zeros((self.nb_actions, self.nb_actions)),
            ]

            def fn(a, x):
                # Exponentiate everything. This is much easier than only exponentiating
                # the diagonal elements, and, usually, the action space is relatively low.
                x_ = K.exp(x) + K.epsilon()
                # Only keep the diagonal elements.
                x_ *= diag_mask
                # Add the original, non-diagonal elements.
                x_ += x * (1. - diag_mask)
                # Finally, gather everything into a lower triangular matrix.
                L_ = tf.gather(x_, tril_mask)
                return [L_, tf.transpose(L_)]

            tmp = tf.scan(fn, L_flat, initializer=init)
            if isinstance(tmp, (list, tuple)):
                # TensorFlow 0.10 now returns a tuple of tensors.
                L, LT = tmp
            else:
                # Old TensorFlow < 0.10 returns a shared tensor.
                L = tmp[:, 0, :, :]
                LT = tmp[:, 1, :, :]
        else:
            raise RuntimeError('Unknown Keras backend "{}".'.format(K.backend()))
        assert L is not None
        assert LT is not None
        P = K.batch_dot(L, LT)
    elif self.mode == 'diag':
        if K.backend() == 'theano':
            import theano.tensor as T
            import theano

            def fn(x, P_acc):
                x_ = K.zeros((self.nb_actions, self.nb_actions))
                x_ = T.set_subtensor(x_[np.diag_indices(self.nb_actions)], x)
                return x_

            outputs_info = [
                K.zeros((self.nb_actions, self.nb_actions)),
            ]
            P, _ = theano.scan(fn=fn, sequences=L_flat, outputs_info=outputs_info)
        elif K.backend() == 'tensorflow':
            import tensorflow as tf

            # Create mask that can be used to gather elements from L_flat and put them
            # into a diagonal matrix.
diag_mask = np.zeros((self.nb_actions, self.nb_actions), dtype='int32') diag_mask[np.diag_indices(self.nb_actions)] = range(1, self.nb_actions + 1) # Add leading zero element to each element in the L_flat. We use this zero # element when gathering L_flat into a lower triangular matrix L. nb_rows = tf.shape(L_flat)[0] zeros = tf.expand_dims(tf.tile(K.zeros((1,)), [nb_rows]), 1) try: # Old TF behavior. L_flat = tf.concat(1, [zeros, L_flat]) except TypeError: # New TF behavior L_flat = tf.concat([zeros, L_flat], 1) # Finally, process each element of the batch. def fn(a, x): x_ = tf.gather(x, diag_mask) return x_ P = tf.scan(fn, L_flat, initializer=K.zeros((self.nb_actions, self.nb_actions))) else: raise RuntimeError('Unknown Keras backend "{}".'.format(K.backend())) assert P is not None assert K.ndim(P) == 3 # Combine a, mu and P into a scalar (over the batches). What we compute here is # -.5 * (a - mu)^T * P * (a - mu), where * denotes the dot-product. Unfortunately # TensorFlow handles vector * P slightly suboptimal, hence we convert the vectors to # 1xd/dx1 matrices and finally flatten the resulting 1x1 matrix into a scalar. All # operations happen over the batch size, which is dimension 0. prod = K.batch_dot(K.expand_dims(a - mu, 1), P) prod = K.batch_dot(prod, K.expand_dims(a - mu, -1)) A = -.5 * K.batch_flatten(prod) assert K.ndim(A) == 2 return A
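# In math, the advantage assembled at the end of `call` is the NAF quadratic form
#
#   A(x, a) = -\tfrac{1}{2} (a - \mu(x))^\top P(x) (a - \mu(x)), \qquad
#   P(x) = L(x) L(x)^\top,
#
# where L is lower triangular with an exponentiated (hence strictly positive)
# diagonal, so P is positive-definite in 'full' mode; 'diag' mode instead fills
# P's diagonal directly with the predicted entries.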
def lstm_decoder_layer(tparams_all, input_state, options, maxlen, dp, prefix="lstm_decoder_layer"): tparams_d = tparams_all[0] tparams_g = tparams_all[1] #rng = numpy.random.RandomState(4567) trng = RandomStreams(SEED) def _slice(_x, n, dim): if _x.ndim == 3: return _x[:, :, n * dim:(n + 1) * dim] return _x[:, n * dim:(n + 1) * dim] def _step(x_, m_, h_, c_): preact = tensor.dot(x_, tparams_g[_p(prefix, 'W')]) + tparams_g[_p(prefix, 'b')] + \ tensor.dot(h_, tparams_g[_p(prefix, 'U')]) i = tensor.nnet.sigmoid(_slice(preact, 0, options[_p(prefix, 'n')])) f = tensor.nnet.sigmoid(_slice(preact, 1, options[_p(prefix, 'n')])) o = tensor.nnet.sigmoid(_slice(preact, 2, options[_p(prefix, 'n')])) c = tensor.tanh(_slice(preact, 3, options[_p(prefix, 'n')])) c = f * c_ + i * c h = o * tensor.tanh(c) s = tensor.nnet.softmax(tensor.dot(h, tparams_g['to_idx_emb'])) #x_t = tensor.dot((s / s.max(axis=1)[:,None]).astype('int32').astype(theano.config.floatX), tparams_d['Wemb']) x_t = tensor.dot(tensor.switch(s < s.max(axis=1)[:,None], 0.0, 1.0).astype(theano.config.floatX), tparams_d['Wemb']) x_out = s.argmax(axis=1) m = tensor.switch(tensor.eq(x_out, 10), 0.0, 1.0).astype(theano.config.floatX) * m_ #x_t = tensor.dot(h_, tparams[_p(prefix, 'W_x')]) + tparams[_p(prefix, 'b_x')] return x_out, x_t, m, h, c ############################################################################################## rval, updates = theano.scan(_step, outputs_info=[None, input_state, tensor.alloc(numpy_floatX(1.), input_state.shape[0]), tensor.alloc(numpy_floatX(0.), input_state.shape[0], options['lstm_decoder_layer_n']), tensor.alloc(numpy_floatX(0.), input_state.shape[0], options['lstm_decoder_layer_n'])], name=_p(prefix, '_layers'), n_steps=maxlen) #proj_0 = rval[1]#tensor.tanh(rval[0]) m22 = trng.binomial(size=(input_state.shape[0],), p=dp, n=1, dtype=theano.config.floatX) #return rval[0]*m2, rval[1]*m2[:,None], rval[2]*m2 if(tensor.gt(maxlen, 4) == 1): x2 = tensor.alloc(numpy.asarray(0, dtype='int32'), maxlen - 4, input_state.shape[0]) x2 = tensor.concatenate((tensor.alloc(numpy.asarray(options['end_idx'], dtype='int32'), input_state.shape[0])[None, :], tensor.alloc(numpy.asarray(options['end_idx'], dtype='int32'), input_state.shape[0])[None, :], tensor.alloc(numpy.asarray(7, dtype='int32'), input_state.shape[0])[None, :], tensor.alloc(numpy.asarray(10, dtype='int32'), input_state.shape[0])[None, :], x2), axis=0) m2 = tensor.alloc(numpy_floatX(0.), maxlen - 3, input_state.shape[0]) m2 = tensor.concatenate((tensor.alloc(numpy_floatX(1.), input_state.shape[0])[None, :], tensor.alloc(numpy_floatX(1.), input_state.shape[0])[None, :], tensor.alloc(numpy_floatX(1.), input_state.shape[0])[None, :], m2), axis=0) xt2 = tparams_d['Wemb'][x2] return rval[0]*m22+x2*(1-m22), rval[1]*m22[:,None]+xt2*(1-m22[:,None]), rval[2]*m22+m2*(1-m22) else: return rval[0]*m22, rval[1]*m22[:,None], rval[2]*m22
def hpm_layer(tparams, state_below, xt, yt, options, prefix='hpm', mask=None):
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_examples = state_below.shape[1]
    else:
        n_examples = 1

    assert mask is not None

    n_hid = options['n_hid']
    timescales = options['timescales']
    gamma = (1.0 / numpy_floatX(timescales)).reshape((1, -1, 1))
    n_timescales = len(timescales)

    alpha0 = tensor.nnet.sigmoid(
        tparams[_p(prefix,'alpha0')]).dimshuffle(('x','x',0))
    if (options['arch_hpm_gamma_scaled_alpha']):
        gamma_exp_for_alpha = (tensor.nnet.sigmoid(
            tparams[_p(prefix, 'gamma_exp_for_alpha')])).dimshuffle(('x','x',0))
        alpha = alpha0 * gamma ** gamma_exp_for_alpha #* numpy.min(gamma) ** (1.0-gamma_exp_for_alpha)
    else:
        alpha = alpha0 * gamma

    # determine asymptotic (stationary) rate from mu and alpha0
    stationary_rate = tensor.nnet.softplus(
        tparams[_p(prefix, 'mu')]).dimshuffle(('x','x',0)) / (1.0 - alpha0)
    if (options['arch_hpm_gamma_scaled_mu']):
        gamma_exp_for_mu = (tensor.nnet.softplus(
            tparams[_p(prefix, 'gamma_exp_for_mu')])).dimshuffle(('x','x',0))
        stationary_rate *= gamma ** gamma_exp_for_mu

    #agratio = 1.0 / (1.0 - tensor.nnet.sigmoid(
    #    tparams[_p(prefix, 'alpha_gamma_ratio')]).dimshuffle(('x','x',0)))

    eta = tensor.nnet.softplus(tparams[_p(prefix,'eta')]).dimshuffle(('x',0))

    def _timescale_posterior(likelihood, prior):
        # likelihood, prior, posterior have dimensions:
        #    n_training_examples X n_timescales X n_hid
        # Make sure we don't crap out with 0 likelihoods
        off = 1e-30
        if prior.dtype == 'float16':
            off = 1e-5
        posterior = prior * likelihood + off
        # This doesn't work and I don't know why
        #posterior = tensor.switch(
        #    tensor.gt(tensor.max(posterior,axis=1,keepdims=True), 0.0),
        #    posterior, off)
        posterior = posterior / tensor.sum(posterior,axis=1,keepdims=True)
        return posterior

    def _marginalize_timescale(quantity, timescale_prob):
        q = quantity.dimshuffle([1,0,2]).flatten(ndim=2).dimshuffle([1,0])
        t = timescale_prob.dimshuffle([1,0,2]).flatten(ndim=2).dimshuffle([1,0])
        return tensor.batched_dot(q, t).reshape([n_examples, n_hid])

    def _event_prob(intensity, delta_t):
        # remember that stationary_rate is subtracted from all intensities
        new_intensity = (intensity * tensor.exp(-gamma * delta_t * (1.0 - alpha0)))
        return new_intensity

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(m, state_below, delta_t_input, delta_t_output, h_, c_, yhat_):
        h = _event_prob(h_, delta_t_input[:,None,None])
        if (not options['arch_remap_input']):
            event = state_below
        else:
            net = tparams[_p(prefix,'b')]
            if (options['arch_hpm_recurrent']):
                h_with_sr = h + stationary_rate
                # event versus ghost event?
scaled_intensity_input = h_with_sr / (eta * alpha + h_with_sr) marginal_intensity_input = _marginalize_timescale(scaled_intensity_input, c_) net += tensor.dot( marginal_intensity_input, tparams[_p(prefix,'U')]) # marginal_intensity_input: n_examples X n_hid # U: n_hid X 2n_hid (with gated) or n_hid X n_hid (without) if (options['arch_hpm_gated']): # state_below: n_examples X n_hid # W: n_hid X 2n_hid # b: 2n_hid [broadcasting is R to L] net += tensor.dot(state_below, tparams[_p(prefix, 'W')]) gate = tensor.nnet.sigmoid(_slice(net, 1, n_hid)) ungated_event = tensor.nnet.sigmoid(_slice(net, 0, n_hid)) event = gate * ungated_event else: net += state_below event = tensor.nnet.sigmoid(net) # dimensions: n_training_examples X n_timescales X n_hid event = event.dimshuffle((0,'x',1)) # credit assignment across timescales c = (event * _timescale_posterior(h + stationary_rate,c_) + (1.0-event) * c_) # update intensity h += alpha * event # clear out updates after end of sequence c = m[:, None, None] * c + (1. - m)[:, None, None] * c_ h = m[:, None, None] * h + (1. - m)[:, None, None] * h_ # predict next event conditioned on timescale hhat_with_sr = _event_prob(h, delta_t_output[:,None,None]) + stationary_rate # event versus ghost event? scaled_intensity_output = hhat_with_sr / (eta * alpha + hhat_with_sr) # expectation of intensity marginal_intensity_output = _marginalize_timescale(scaled_intensity_output, c) # has dimensions n_training_examples X n_hid # event versus ghost event? marginal_intensity_output = (marginal_intensity_output / (eta + marginal_intensity_output)) return h, c, marginal_intensity_output h = tensor.tensor3('h', dtype=config.floatX) # dimensions: n_training_examples X n_timescales X n_hid c = tensor.tensor3('c', dtype=config.floatX) # dimensions: n_training_examples X n_timescales X n_hid if (options['arch_hpm_prior_exp']): c0 = gamma ** tparams[_p(prefix,'priorexp')].dimshuffle(('x','x',0)) c0 = c0 / tensor.sum(c0, axis=1) else: c0 = 1.0 / numpy_floatX(n_timescales) rval, updates = theano.scan(_step, sequences=[mask, state_below, xt, yt], outputs_info=[tensor.alloc(numpy_floatX(0.), #h n_examples, n_timescales, n_hid), tensor.alloc(c0, #c n_examples, n_timescales, n_hid), tensor.alloc(numpy_floatX(0.),#yhat n_examples, n_hid)], name=_p(prefix, '_layers'), n_steps=nsteps) return rval[2], rval[0], rval[1] # return yhat, h, c
def rnn(self, step_function, inputs, initial_states, go_backwards=False, mask=None, unroll=False, input_length=None): '''Iterates over the time dimension of a tensor. # Arguments inputs: tensor of temporal data of shape (samples, time, ...) (at least 3D). step_function: Parameters: input: tensor with shape (samples, ...) (no time dimension), representing input for the batch of samples at a certain time step. states: list of tensors. Returns: output: tensor with shape (samples, ...) (no time dimension), new_states: list of tensors, same length and shapes as 'states'. initial_states: tensor with shape (samples, ...) (no time dimension), containing the initial values for the states used in the step function. go_backwards: boolean. If True, do the iteration over the time dimension in reverse order. mask: binary tensor with shape (samples, time), with a zero for every element that is masked. unroll: whether to unroll the RNN or to use a symbolic loop (`scan`). input_length: must be specified if using `unroll`. # Returns A tuple (last_output, outputs, new_states). last_output: the latest output of the rnn, of shape (samples, ...) outputs: tensor with shape (samples, time, ...) where each entry outputs[s, t] is the output of the step function at time t for sample s. new_states: list of tensors, latest states returned by the step function, of shape (samples, ...). ''' ndim = inputs.ndim assert ndim >= 3, 'Input should be at least 3D.' if unroll: if input_length is None: raise Exception( 'When specifying `unroll=True`, an `input_length` ' 'must be provided to `rnn`.') axes = [1, 0] + list(range(2, ndim)) inputs = inputs.dimshuffle(axes) if mask is not None: if mask.ndim == ndim - 1: mask = self.expand_dims(mask) assert mask.ndim == ndim mask = mask.dimshuffle(axes) if unroll: indices = list(range(input_length)) if go_backwards: indices = indices[::-1] successive_outputs = [] successive_states = [] states = initial_states for i in indices: output, new_states = step_function(inputs[i], states) if len(successive_outputs) == 0: prev_output = self.zeros_like(output) else: prev_output = successive_outputs[-1] output = T.switch(mask[i], output, prev_output) kept_states = [] for state, new_state in zip(states, new_states): kept_states.append(T.switch(mask[i], new_state, state)) states = kept_states successive_outputs.append(output) successive_states.append(states) outputs = T.stack(*successive_outputs) states = [] for i in range(len(successive_states[-1])): states.append( T.stack(*[ states_at_step[i] for states_at_step in successive_states ])) else: # build an all-zero tensor of shape (samples, output_dim) initial_output = step_function(inputs[0], initial_states)[0] * 0 # Theano gets confused by broadcasting patterns in the scan op initial_output = T.unbroadcast(initial_output, 0, 1) def _step(input, mask, output_tm1, *states): output, new_states = step_function(input, states) # output previous output if masked. 
output = T.switch(mask, output, output_tm1) return_states = [] for state, new_state in zip(states, new_states): return_states.append(T.switch(mask, new_state, state)) return [output] + return_states results, _ = theano.scan(_step, sequences=[inputs, mask], outputs_info=[initial_output] + initial_states, go_backwards=go_backwards) # deal with Theano API inconsistency if type(results) is list: outputs = results[0] states = results[1:] else: outputs = results states = [] else: if unroll: indices = list(range(input_length)) if go_backwards: indices = indices[::-1] successive_outputs = [] successive_states = [] states = initial_states for i in indices: output, states = step_function(inputs[i], states) successive_outputs.append(output) successive_states.append(states) outputs = T.stack(*successive_outputs) states = [] for i in range(len(successive_states[-1])): states.append( T.stack(*[ states_at_step[i] for states_at_step in successive_states ])) else: def _step(input, *states): output, new_states = step_function(input, states) return [output] + new_states results, _ = theano.scan(_step, sequences=inputs, outputs_info=[None] + initial_states, go_backwards=go_backwards) # deal with Theano API inconsistency if type(results) is list: outputs = results[0] states = results[1:] else: outputs = results states = [] outputs = T.squeeze(outputs) last_output = outputs[-1] axes = [1, 0] + list(range(2, outputs.ndim)) outputs = outputs.dimshuffle(axes) states = [T.squeeze(state[-1]) for state in states] return last_output, outputs, states
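# A hedged sketch of the calling contract spelled out in the docstring; the
# step function, sizes, and the `backend` instance (whatever object defines
# the `rnn` method above) are illustrative assumptions.
import numpy as np
import theano
import theano.tensor as T

n_features, n_hidden = 8, 16
W = theano.shared(np.zeros((n_features, n_hidden), dtype=theano.config.floatX))
U = theano.shared(np.zeros((n_hidden, n_hidden), dtype=theano.config.floatX))

def step_function(x_t, states):
    # x_t: (samples, n_features); states: list/tuple with one (samples, n_hidden)
    h = T.tanh(T.dot(x_t, W) + T.dot(states[0], U))
    return h, [h]

inputs = T.tensor3('inputs')  # (samples, time, n_features)
init = [T.zeros((inputs.shape[0], n_hidden))]
last_output, outputs, new_states = backend.rnn(step_function, inputs, init,
                                               mask=None)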
def rnn_step(X, H, U, b, W): """ One RNN step for all examples in a batch in parallel X: shape (batch_size, n_features) -> features at same time step for all examples in batch H: shape (batch_size, n_state) -> state at previous time step for all examples in batch U, b, W: RNN parameters returns: (batch_size, n_state) -> new state value at time step for all examples in batch """ return T.tanh(b + T.dot(X, U) + T.dot(H, W)) results, updates = theano.scan(fn=rnn_step, outputs_info=T.zeros_like(initial_state), sequences=X.dimshuffle(1, 0, 2), non_sequences=[U, b, W]) # results: (n_step, batch_size, n_state) def pred_step(H, V, c): return T.nnet.sigmoid(c + T.dot(H, V)) preds, pupds = theano.scan(fn=pred_step, outputs_info=None, sequences=results, non_sequences=[V, c]) # preds: (n_step, batch_size, n_out) # ## SGD machinery
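# A self-contained toy instantiation of the wiring above, reusing `rnn_step`;
# sizes and initialization are assumptions, and the final dot stands in for
# the `pred_step` scan (T.dot over the stacked states is equivalent).
import numpy as np
import theano
import theano.tensor as T

fX = theano.config.floatX
n_features, n_state, n_out = 3, 5, 2
rng = np.random.RandomState(0)
U = theano.shared(rng.randn(n_features, n_state).astype(fX))
W = theano.shared(rng.randn(n_state, n_state).astype(fX))
b = theano.shared(np.zeros(n_state, dtype=fX))
V = theano.shared(rng.randn(n_state, n_out).astype(fX))
c = theano.shared(np.zeros(n_out, dtype=fX))

X = T.tensor3('X')  # (batch_size, n_steps, n_features)
initial_state = T.zeros((X.shape[0], n_state))

results, updates = theano.scan(fn=rnn_step,
                               outputs_info=initial_state,
                               sequences=X.dimshuffle(1, 0, 2),
                               non_sequences=[U, b, W])
preds = T.nnet.sigmoid(c + T.dot(results, V))  # (n_steps, batch_size, n_out)

f = theano.function([X], preds)
print(f(rng.randn(4, 10, n_features).astype(fX)).shape)  # (10, 4, 2)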
def _run(self, num_features, num_timesteps, batch_size, mode):
    # determine shapes of inputs and targets depending on the batch size
    if batch_size == 1:
        inputs_size = (num_timesteps, num_features)
        targets_size = (num_timesteps, 1)
    else:
        inputs_size = (num_timesteps, batch_size, num_features)
        targets_size = (num_timesteps, batch_size, 1)

    # make inputs and targets shared variables
    inputs = theano.shared(self.rng.uniform(size=inputs_size).astype(
        config.floatX), borrow=True)
    targets = theano.shared(self.rng.uniform(size=targets_size).astype(
        config.floatX), borrow=True)

    # create symbolic inputs and targets variables
    if batch_size == 1:
        x = T.matrix('inputs')
        t = T.matrix('targets')
    else:
        x = T.tensor3('inputs')
        t = T.tensor3('targets')
    x.tag.test_value = inputs.get_value(borrow=True)
    t.tag.test_value = targets.get_value(borrow=True)

    # create a set of parameters for a simple RNN
    W_xh = theano.shared(
        (0.01 * self.rng.uniform(size=(num_features, 10))).astype(
            config.floatX), borrow=True)
    W_hh = theano.shared(
        (0.01 * self.rng.uniform(size=(10, 10))).astype(config.floatX),
        borrow=True)
    W_hy = theano.shared(
        (0.01 * self.rng.uniform(size=(10, 1))).astype(config.floatX),
        borrow=True)
    b_h = theano.shared(np.zeros(10).astype(config.floatX), borrow=True)
    b_y = theano.shared(np.zeros(1).astype(config.floatX), borrow=True)

    params = [W_xh, W_hh, W_hy, b_h, b_y]

    # recurrent function
    def step(x_t, h_tm1):
        h = T.tanh(T.dot(h_tm1, W_hh) + T.dot(x_t, W_xh) + b_h)
        return h

    # build recurrent graph
    if batch_size == 1:
        h_0 = T.alloc(0.0, 10).astype(config.floatX)
    else:
        h_0 = T.alloc(0.0, batch_size, 10).astype(config.floatX)
    h, updates = theano.scan(step, sequences=[x], outputs_info=[h_0])
    # network output
    y = T.dot(h, W_hy) + b_y

    # Create Gauss-Newton-Matrix object. Not really of any use here, but I
    # need it for Hessian-Free optimization.
    gn = GaussNewtonMatrix(y)

    # compute MSE
    cost = ((t - y)**2).sum(axis=1).mean()

    # Compute the cost at some other point in the parameter
    # space. Not really of any use here, but this is how I do it
    # during certain iterations of CG in the HF algorithm. There,
    # it's in fact `pi + current update proposal`. For simplicity,
    # I just multiply by 2 here.
    cost_ = theano.clone(cost, replace=dict([(pi, 2 * pi) for pi in params]))

    # Compute Gauss-Newton-Matrix times some vector `v` which is `p` in CG,
    # but for simplicity, I just take the parameters vector because it's
    # already there.
    Gv = gn(v=params, cost=cost, parameters=params, damp=T.constant(1.0))

    # compile Theano function
    f = theano.function([], [cost_] + Gv,
                        givens={x: inputs, t: targets},
                        mode=mode)

    # execute
    f()
def lstm_layer(tparams, input_state, mask, options, prefix='lstm_layer'): def _slice(_x, n, dim): if _x.ndim == 3: return _x[:, :, n * dim:(n + 1) * dim] return _x[:, n * dim:(n + 1) * dim] def _step_f(m_, x_, h_, c_): preact = tensor.dot(x_, tparams[_p(prefix, 'Wf')]) + tparams[_p(prefix, 'bf')] + \ tensor.dot(h_, tparams[_p(prefix, 'Uf')]) i = tensor.nnet.sigmoid(_slice(preact, 0, options[_p(prefix, 'n')])) f = tensor.nnet.sigmoid(_slice(preact, 1, options[_p(prefix, 'n')])) o = tensor.nnet.sigmoid(_slice(preact, 2, options[_p(prefix, 'n')])) c = tensor.tanh(_slice(preact, 3, options[_p(prefix, 'n')])) c = f * c_ + i * c c = m_[:, None] * c + (1. - m_)[:, None] * c_ h = o * tensor.tanh(c) h = m_[:, None] * h + (1. - m_)[:, None] * h_ return h, c def _step_b(m_, x_, h_, c_): preact = tensor.dot(x_, tparams[_p(prefix, 'Wb')]) + tparams[_p(prefix, 'bb')] + \ tensor.dot(h_, tparams[_p(prefix, 'Ub')]) i = tensor.nnet.sigmoid(_slice(preact, 0, options[_p(prefix, 'n')])) f = tensor.nnet.sigmoid(_slice(preact, 1, options[_p(prefix, 'n')])) o = tensor.nnet.sigmoid(_slice(preact, 2, options[_p(prefix, 'n')])) c = tensor.tanh(_slice(preact, 3, options[_p(prefix, 'n')])) c = f * c_ + i * c c = m_[:, None] * c + (1. - m_)[:, None] * c_ h = o * tensor.tanh(c) h = m_[:, None] * h + (1. - m_)[:, None] * h_ return h, c dim_proj = options[_p(prefix, 'n')] ############################################################################################## rval_f, updates_f = theano.scan(_step_f, sequences=[mask, input_state], outputs_info=[tensor.alloc(numpy_floatX(0.), input_state.shape[1], dim_proj), tensor.alloc(numpy_floatX(0.), input_state.shape[1], dim_proj)], name=_p(prefix, '_layers'), n_steps=input_state.shape[0]) rval_b, updates_b = theano.scan(_step_b, sequences=[mask, input_state], outputs_info=[tensor.alloc(numpy_floatX(0.), input_state.shape[1], dim_proj), tensor.alloc(numpy_floatX(0.), input_state.shape[1], dim_proj)], name=_p(prefix, '_layers'), n_steps=input_state.shape[0], go_backwards=True) proj_0 = rval_f[0] + rval_b[0][::-1] # Attention y_0 = (tensor.tanh(proj_0) * mask[:, :, None]) * tparams[_p(prefix, 'V')] y_0 = y_0.sum(axis=2).transpose() alpha = tensor.nnet.softmax(y_0).transpose() proj_0 = proj_0 * alpha[:, :, None]#(proj_0 * mask[:, :, None]) proj_0 = proj_0.sum(axis=0)#(proj_0 * mask[:, :, None]) ############################################################################################## proj_0 = tensor.tanh(proj_0) return proj_0
def _elbo_t(logp, uw_g, uw_l, inarray_g, inarray_l, c_g, c_l, n_mcsamples, random_seed): """Return expression of approximate ELBO based on Monte Carlo sampling. """ if random_seed is None: r = MRG_RandomStreams(gen_random_state()) else: r = MRG_RandomStreams(seed=random_seed) normal_const = floatX(1 + np.log(2.0 * np.pi)) elbo = 0 # Sampling local variational parameters if uw_l is not None: l_l = (uw_l.size / 2).astype('int32') u_l = uw_l[:l_l] w_l = uw_l[l_l:] ns_l = r.normal(size=(n_mcsamples, inarray_l.tag.test_value.shape[0])) zs_l = ns_l * tt.exp(w_l) + u_l elbo += tt.sum(c_l * (w_l + 0.5 * normal_const)) else: zs_l = None # Sampling global variational parameters if uw_g is not None: l_g = (uw_g.size / 2).astype('int32') u_g = uw_g[:l_g] w_g = uw_g[l_g:] ns_g = r.normal(size=(n_mcsamples, inarray_g.tag.test_value.shape[0])) zs_g = ns_g * tt.exp(w_g) + u_g elbo += tt.sum(c_g * (w_g + 0.5 * normal_const)) else: zs_g = None if (zs_l is not None) and (zs_g is not None): def logp_(z_g, z_l): return theano.clone(logp, OrderedDict({ inarray_g: z_g, inarray_l: z_l }), strict=False) sequences = [zs_g, zs_l] elif zs_l is not None: def logp_(z_l): return theano.clone(logp, OrderedDict({inarray_l: z_l}), strict=False) sequences = [zs_l] else: def logp_(z_g): return theano.clone(logp, OrderedDict({inarray_g: z_g}), strict=False) sequences = [zs_g] logps, _ = theano.scan(fn=logp_, outputs_info=None, sequences=sequences) elbo += tt.mean(logps) return elbo
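# The `w + 0.5 * normal_const` term above is the per-coordinate entropy of the
# Gaussian approximation: sampling uses z = u + e^w * eps, so sigma = e^w and
#
#   H[\mathcal{N}(u, e^{2w})] = \tfrac{1}{2}\log(2\pi e \, e^{2w})
#                             = w + \tfrac{1}{2}(1 + \log 2\pi),
#
# which matches normal_const = 1 + log(2*pi); the c_g/c_l factors reweight
# these per-variable contributions.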
def test_machine_translation(self):
    """
    This test case comes from https://github.com/rizar/scan-grad-speed and
    is an example of actual computation done with scan in the context of
    machine translation

    'dim' has been reduced from 1000 to 5 to make the test run faster
    """
    # Parameters from an actual machine translation run
    batch_size = 80
    seq_len = 50
    n_words = 80 * 50
    dim = 5

    # Weight matrices
    U = theano.shared(
        np.random.normal(size=(dim, dim), scale=0.0001).astype(config.floatX))
    U.name = 'U'
    V = theano.shared(U.get_value())
    V.name = 'V'
    W = theano.shared(U.get_value())
    W.name = 'W'

    # Variables and their values
    x = T.tensor3('x')
    x_value = np.random.normal(size=(seq_len, batch_size, dim),
                               scale=0.0001).astype(config.floatX)

    ri = T.tensor3('ri')
    ri_value = x_value

    zi = T.tensor3('zi')
    zi_value = x_value

    init = T.alloc(np.cast[config.floatX](0), batch_size, dim)

    def rnn_step1(
            # sequences
            x, ri, zi,
            # outputs_info
            h):
        pre_r = ri + h.dot(U)
        pre_z = zi + h.dot(V)
        r = T.nnet.sigmoid(pre_r)
        z = T.nnet.sigmoid(pre_z)

        after_r = r * h
        pre_h = x + after_r.dot(W)
        new_h = T.tanh(pre_h)

        res_h = z * new_h + (1 - z) * h
        return res_h

    # Compile the function twice, once with the optimization and once
    # without
    opt_mode = mode.including("scan")
    h, _ = theano.scan(rnn_step1,
                       sequences=[x, ri, zi],
                       n_steps=seq_len,
                       outputs_info=init,
                       name='fpass1',
                       mode=opt_mode)
    cost = h[-1].sum()
    grad1 = T.grad(cost, [U, V, W])
    f_opt = theano.function(inputs=[x, ri, zi], outputs=grad1, mode=opt_mode)

    no_opt_mode = mode.excluding("scanOp_pushout_output")
    h, _ = theano.scan(rnn_step1,
                       sequences=[x, ri, zi],
                       n_steps=seq_len,
                       outputs_info=init,
                       name='fpass1',
                       mode=no_opt_mode)
    cost = h[-1].sum()
    grad1 = T.grad(cost, [U, V, W])
    f_no_opt = theano.function(inputs=[x, ri, zi], outputs=grad1,
                               mode=no_opt_mode)

    # Validate that the optimization has been applied
    scan_node_grad = [
        node for node in f_opt.maker.fgraph.toposort()
        if isinstance(node.op, Scan)
    ][1]

    for output in scan_node_grad.op.outputs:
        assert not (
            isinstance(output.owner.op, T.elemwise.Elemwise) and
            any([isinstance(i, T.Dot) for i in output.owner.inputs]))

    # Compare the outputs of the two functions on the same input data.
    f_opt_output = f_opt(x_value, ri_value, zi_value)
    f_no_opt_output = f_no_opt(x_value, ri_value, zi_value)
    utt.assert_allclose(f_opt_output, f_no_opt_output)
def plot_zero_crossing(K = 600):
    x = T.ftensor3()
    # Assumed declarations: the RK4 step size, pendulum parameters, and step
    # count are used below but were defined elsewhere in the original module.
    h = T.fscalar('h')
    mass = T.fscalar('mass')
    length = T.fscalar('length')
    gravity = T.fscalar('gravity')
    N = T.iscalar('N')

    def f(X):
        X_ = T.zeros_like(X)
        X_ = T.set_subtensor(X_[:,:,0], (6.0 / (mass * length * length)) * \
            ((2.0 * X[:,:,2] - 3.0 * T.cos(X[:,:,0] - X[:,:,1]) * X[:,:,3]) / \
            (16.0 - 9.0 * T.square(T.cos(X[:,:,0] - X[:,:,1])))))
        X_ = T.set_subtensor(X_[:,:,1], (6.0 / (mass * length * length)) * \
            (8.0 * X[:,:,3] - 3.0 * T.cos(X[:,:,0] - X[:,:,1]) * X[:,:,2]) / \
            (16.0 - 9.0 * T.square(T.cos(X[:,:,0] - X[:,:,1]))))
        X_ = T.set_subtensor(X_[:,:,2], -0.5 * mass * length * length * (X_[:,:,0] * X_[:,:,1] * T.sin(X[:,:,0] - X[:,:,1]) + \
            3.0 * gravity / length * T.sin(X[:,:,0])))
        X_ = T.set_subtensor(X_[:,:,3], -0.5 * mass * length * length * (-X_[:,:,0] * X_[:,:,1] * T.sin(X[:,:,0] - X[:,:,1]) + \
            gravity / length * T.sin(X[:,:,1])))
        return X_

    def step(X):
        k1 = h * f(X)
        k2 = h * f(X + 0.5 * k1)
        k3 = h * f(X + 0.5 * k2)
        k4 = h * f(X + k3)
        X_ = X + (1.0 / 6.0) * k1 + (1.0 / 3.0) * k2 + (1.0 / 3.0) * k3 + (1.0 / 6.0) * k4
        return X_

    result, _ = theano.scan(fn=step, outputs_info=x, n_steps=N)
    RK4 = theano.function([x,h,mass,length,gravity,N], result, allow_input_downcast=True)

    l = 1.0
    m = 1.0
    g = 9.81

    theta1_, theta2_ = np.meshgrid(np.linspace(-np.pi, np.pi, K),
                                   np.linspace(-np.pi, np.pi, K))
    initial_states = np.stack((theta1_, theta2_,
                               np.zeros_like(theta1_), np.zeros_like(theta2_)), axis=2)
    state_array = RK4(initial_states, 0.025, m, l, g, 1000)

    min_crossing_time = []
    for i in range(state_array.shape[1]):
        for j in range(state_array.shape[2]):
            theta_diff = np.mod(state_array[:,i,j,0] - state_array[:,i,j,1] - np.pi, 2.0 * np.pi)
            crossings = np.abs(np.diff(theta_diff)) > np.pi
            if np.sum(crossings) == 0:
                min_crossing_time.append(np.nan)
            else:
                min_crossing_time.append(np.min(np.where(crossings)))
    min_crossing_time = np.array(min_crossing_time)

    ax = plt.subplot(111)
    ax.imshow(np.log(min_crossing_time.reshape(K,K)), cmap='Blues_r', origin='lower',
              extent=[np.min(theta1_), np.max(theta1_), np.min(theta2_), np.max(theta2_)])
    ax.set_aspect('equal')
    plt.savefig(os.path.join(__file__.split('.')[0], 'TimeToFlip.png'), dpi=400)
    plt.show()
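# The `step` function above is the classical fourth-order Runge-Kutta update,
# applied to the whole grid of initial conditions at once:
#
#   k_1 = h f(X_t), \quad k_2 = h f(X_t + \tfrac{1}{2} k_1), \quad
#   k_3 = h f(X_t + \tfrac{1}{2} k_2), \quad k_4 = h f(X_t + k_3),
#
#   X_{t+1} = X_t + \tfrac{1}{6} k_1 + \tfrac{1}{3} k_2 + \tfrac{1}{3} k_3 + \tfrac{1}{6} k_4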
def convolve1d_4D_conv2d(input, W, mode='full'): conv_out, _ = theano.scan(fn=lambda i: conv2d(input[:,:,:,i:i+1], W[:,:,:,i:i+1], border_mode=mode), outputs_info=None, sequences=[T.arange(0, W.shape[3])]) conv_out = conv_out.flatten(ndim=4).dimshuffle(1,2,3,0) return conv_out
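# A hedged usage sketch: the helper runs one 2D convolution per slice of the
# last axis, so with width-1 filter slices it acts as a batched 1D convolution
# along axis 2. Shapes below are assumptions, and `conv2d` is assumed to be
# theano.tensor.nnet.conv2d.
import numpy as np
import theano
import theano.tensor as T

fX = theano.config.floatX
x = T.tensor4('x')  # (batch, in_channels, length, n_slices)
w = T.tensor4('w')  # (n_filters, in_channels, filter_length, n_slices)

f = theano.function([x, w], convolve1d_4D_conv2d(x, w, mode='full'))
xv = np.random.randn(2, 3, 7, 4).astype(fX)
wv = np.random.randn(5, 3, 3, 4).astype(fX)
print(f(xv, wv).shape)  # (2, 5, 9, 4): 'full' mode gives length 7 + 3 - 1 = 9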
def __init__(self, nh, nc, ne, de, cs):
    '''
    nh :: dimension of the hidden layer
    nc :: number of classes
    ne :: number of word embeddings in the vocabulary
    de :: dimension of the word embeddings
    cs :: word window context size
    '''
    #assert st in ['proba', 'argmax']
    self.emb = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,\
        (ne+1, de)).astype(theano.config.floatX)) # add one for PADDING at the end
    self.Wx = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,\
        (de * cs, nh)).astype(theano.config.floatX))
    self.Ws = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,\
        (nc, nh)).astype(theano.config.floatX))
    self.W = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,\
        (nh, nc)).astype(theano.config.floatX))
    self.bh = theano.shared(numpy.zeros(nh, dtype=theano.config.floatX))
    self.b = theano.shared(numpy.zeros(nc, dtype=theano.config.floatX))
    self.s0 = theano.shared(numpy.zeros(nc, dtype=theano.config.floatX))

    # bundle
    self.params = [
        self.emb, self.Wx, self.Ws, self.W, self.bh, self.b, self.s0
    ]
    self.names = ['embeddings', 'Wx', 'Ws', 'W', 'bh', 'b', 's0']
    idxs = T.imatrix()  # as many columns as context window size/lines as words in the sentence
    x = self.emb[idxs].reshape((idxs.shape[0], de * cs))
    y = T.iscalar('y')  # label

    def recurrence(x_t, s_tm1):
        h_t = T.nnet.sigmoid(T.dot(x_t, self.Wx) + \
            T.dot(s_tm1, self.Ws) + self.bh)
        s_t = T.nnet.softmax(T.dot(h_t, self.W) + self.b)[0]
        return [h_t, s_t]

    [h, s], _ = theano.scan(fn=recurrence, \
        sequences=x, outputs_info=[None, self.s0], \
        n_steps=x.shape[0])

    p_y_given_x_lastword = s[-1, :]
    p_y_given_x_sentence = s
    y_pred = T.argmax(p_y_given_x_sentence, axis=1)

    # cost and gradients and learning rate
    lr = T.scalar('lr')
    nll = -T.mean(T.log(p_y_given_x_lastword)[y])
    gradients = T.grad(nll, self.params)
    updates = OrderedDict(
        (p, p - lr * g) for p, g in zip(self.params, gradients))

    # theano functions
    self.classify = theano.function(inputs=[idxs], outputs=y_pred)
    self.train = theano.function(inputs=[idxs, y, lr],
                                 outputs=nll,
                                 updates=updates)
    self.normalize = theano.function(
        inputs = [],
        updates = {self.emb:\
            self.emb/T.sqrt((self.emb**2).sum(axis=1)).dimshuffle(0,'x')})
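# A hedged usage sketch of the compiled functions; the class name, sizes, and
# data are illustrative assumptions. `idxs` carries one cs-wide context window
# per row (one row per word of the sentence).
import numpy

rnn = ElmanRNN(nh=100, nc=5, ne=1000, de=50, cs=3)  # assumed class name

sentence = numpy.array([[0, 1, 2],
                        [1, 2, 3],
                        [2, 3, 4]], dtype='int32')
label = 4  # class of the last word
cost = rnn.train(sentence, label, 0.0627)
rnn.normalize()  # keep embedding rows at unit L2 norm
pred = rnn.classify(sentence)  # per-word argmax over classes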
def forward_pass(x, dropout): if dropout != 0.0: x *= theano_rng.binomial( n=1, p=1 - dropout, size=x.shape, dtype=theano.config.floatX) / (1 - dropout) for i in range(Nlayers): h = (x.dimshuffle((1, 0, 2)).dot(self.Win) if i == 0 else h.dot(self.Wup[i - 1])) + self.Bhid[i] rep = lambda x: T.extra_ops.repeat( x.reshape((1, -1)), h.shape[1], axis=0) if Ah != "lstm": h = T.concatenate([ theano.scan( fn=step_rnn, sequences=[ h[:, :, Nh * d:Nh * (d + 1)], mask_float[d] ], outputs_info=[rep(self.h0[i, d])], non_sequences=[ self.Wrec[i, d], rep(self.h0[i, d]) ], go_backwards=(d == 1), )[0][::(1 if d == 0 else -1)] for d in range(Ndirs) ], axis=2) else: h = T.concatenate([ theano.scan( fn=step_lstm, sequences=[ h[:, :, Nh * 4 * d:Nh * 4 * (d + 1)], mask_float[d] ], outputs_info=[ rep(self.c0[i, d]), rep(self.h0[i, d]) ], non_sequences=[ self.Wrec[i, d], rep(self.c0[i, d]), rep(self.h0[i, d]) ], go_backwards=(d == 1), )[0][1][::(1 if d == 0 else -1)] for d in range(Ndirs) ], axis=2) if dropout != 0.0: h *= theano_rng.binomial( n=1, p=1 - dropout, size=h.shape, dtype=theano.config.floatX) / (1 - dropout) h = h.dimshuffle((1, 0, 2)) if predictPer == "sequence": h = T.concatenate([ h[mask_int[1 - d]][:, Nh * d:Nh * (d + 1)] for d in range(Ndirs) ], axis=1) return ACTIVATION[Ay](h.dot(self.Wout) + self.Bout)
def get_output_for(self, inputs, **kwargs): # Retrieve the layer input input = inputs[0] # Retrieve the mask when it is supplied mask = None hid_init = None if self.mask_incoming_index > 0: mask = inputs[self.mask_incoming_index] if self.hid_init_incoming_index > 0: hid_init = inputs[self.hid_init_incoming_index] # Input should be provided as (n_batch, n_time_steps, n_features) # but scan requires the iterable dimension to be first # So, we need to dimshuffle to (n_time_steps, n_batch, n_features) #input = input.dimshuffle(1, 0, *range(2, input.ndim)) seq_len, num_batch = input.shape[0], input.shape[1] # We will always pass the hidden-to-hidden layer params to step non_seqs = helper.get_all_params(self.hidden_to_hidden) # Create single recurrent computation step function def step(input_n, hid_previous, *args): # Compute the hidden-to-hidden activation hid_pre = helper.get_output( self.hidden_to_hidden, hid_previous, **kwargs) hid_pre += input_n # Clip gradients if self.grad_clipping: hid_pre = theano.gradient.grad_clip( hid_pre, -self.grad_clipping, self.grad_clipping) return self.nonlinearity(hid_pre) def step_masked(input_n, mask_n, hid_previous, *args): # Skip over any input with mask 0 by copying the previous # hidden state; proceed normally for any input with mask 1. hid = step(input_n, hid_previous, *args) hid_out = T.switch(mask_n, hid, hid_previous) return [hid_out] if mask is not None: mask = mask.dimshuffle(1, 0, 'x') sequences = [input, mask] step_fun = step_masked else: sequences = input step_fun = step if not isinstance(self.hid_init, Layer): # The code below simply repeats self.hid_init num_batch times in # its first dimension. Turns out using a dot product and a # dimshuffle is faster than T.repeat. dot_dims = (list(range(1, self.hid_init.ndim - 1)) + [0, self.hid_init.ndim - 1]) hid_init = T.dot(T.ones((num_batch, 1)), self.hid_init.dimshuffle(dot_dims)) if self.unroll_scan: # Retrieve the dimensionality of the incoming layer input_shape = self.input_shapes[0] # Explicitly unroll the recurrence instead of using scan hid_out = unroll_scan( fn=step_fun, sequences=sequences, outputs_info=[hid_init], go_backwards=self.backwards, non_sequences=non_seqs, n_steps=input_shape[1])[0] else: # Scan op iterates over first dimension of input and repeatedly # applies the step function hid_out = theano.scan( fn=step_fun, sequences=sequences, go_backwards=self.backwards, outputs_info=[hid_init], non_sequences=non_seqs, truncate_gradient=self.gradient_steps, strict=True)[0] # When it is requested that we only return the final sequence step, # we need to slice it out immediately after scan is applied if self.only_return_final: hid_out = hid_out[-1] else: # dimshuffle back to (n_batch, n_time_steps, n_features)) #hid_out = hid_out.dimshuffle(1, 0, *range(2, hid_out.ndim)) # if scan is backward reverse the output if self.backwards: hid_out = hid_out[::-1,:] return hid_out
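# The dot/dimshuffle trick described in the comment above can be checked in
# isolation (sizes are assumptions; for a 2D hid_init the dot_dims permutation
# is the identity, so the tiling reduces to an outer product with ones):
import numpy as np
import theano
import theano.tensor as T

fX = theano.config.floatX
hid_init = theano.shared(np.arange(4, dtype=fX).reshape(1, 4))  # (1, n_units)
num_batch = T.iscalar('num_batch')

tiled = T.dot(T.ones((num_batch, 1)), hid_init)
f = theano.function([num_batch], tiled)
print(f(3).shape)  # (3, 4): hid_init repeated along the batch axis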
gravity / length * T.sin(X[1]))) return X_ def step(X): k1 = h * f(X) k2 = h * f(X + 0.5 * k1) k3 = h * f(X + 0.5 * k2) k4 = h * f(X + k3) X_ = X + (1.0 / 6.0) * k1 + (1.0 / 3.0) * k2 + (1.0 / 3.0) * k3 + (1.0 / 6.0) * k4 return X_ result, _ = theano.scan(fn=step, outputs_info=x, n_steps=N) RK4 = theano.function([x,h,mass,length,gravity,N], result, allow_input_downcast=True) def plot_path(): theta1 = np.random.uniform(0.0, 2.0 * np.pi) theta2 = np.random.uniform(0.0, 2.0 * np.pi) l = 1.0 m = 1.0 g = 9.81