def softmax_with_bias_unittest_template(dtypeInput, dtypeBias): """ This is basic test for GpuSoftmaxWithBias with float64 variables We check that we loop when their is too much block TODO: check that we loop when their is too much thread.(THIS IS NOT IMPLEMENTED) """ assert dtypeInput in ['float32', 'float64'] assert dtypeBias in ['float32', 'float64'] if dtypeInput == 'float32': x = T.fmatrix('x') elif dtypeInput == 'float64': x = T.dmatrix('x') # We can't use zeros_like(x[0,::]) as this don't allow to test with # 0 shape if dtypeBias == 'float32': z = T.nnet.softmax_with_bias(x, T.arange(x.shape[1] * 2, dtype='float32')[::2]) elif dtypeBias == 'float64': z = T.nnet.softmax_with_bias(x, T.arange(x.shape[1] * 2, dtype='float64')[::2]) f = theano.function([x], z, mode=mode_without_gpu) f_gpu = theano.function([x], z, mode=mode_with_gpu) assert f.maker.fgraph.toposort()[-1].op == T.nnet.softmax_with_bias assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op, theano.sandbox.gpuarray.nnet.GpuSoftmaxWithBias) def cmp(n, m): #print "test_softmax",n,m if dtypeInput == 'float32': data = numpy.arange(n * m, dtype='float32').reshape(n, m) elif dtypeInput == 'float64': data = numpy.arange(n * m, dtype='float64').reshape(n, m) out = f(data) gout = f_gpu(data) assert numpy.allclose(out, gout), numpy.absolute(out - gout) cmp(2, 5) #we need to test n>32*1024 to check that we make the block loop. cmp(2 << 15, 5) cmp(4074, 400) cmp(0, 10) cmp(784, 784) cmp(4, 1000) cmp(4, 1024) cmp(4, 2000) cmp(4, 2024) #GTX285 don't have enough shared mem for this case. cmp(4, 4074) # The GTX580, 680 and kepler don't have enough shared memory. cmp(2, 10000) cmp(128, 16 * 1024) cmp(128, 64 * 1024)
def dtw(array1, array2): """ Accepts: two one dimensional arrays Returns: (float) DTW distance between them. """ s = np.zeros((array1.size+1, array2.size+1)) s[:,0] = 1e6 s[0,:] = 1e6 s[0,0] = 0.0 # Set up symbolic variables square = T.dmatrix('square') vec1 = T.dvector('vec1') vec2 = T.dvector('vec2') vec1_length = T.dscalar('vec1_length') vec2_length = T.dscalar('vec2_length') outer_loop = T.arange(vec1_length, dtype='int64') inner_loop = T.arange(vec2_length, dtype='int64') # Run the outer loop path, _ = scan(fn=outer, outputs_info=[dict(initial=square, taps=[-1])], non_sequences=[inner_loop, vec1, vec2], sequences=outer_loop) # Compile the function theano_square = function([vec1, vec2, square, vec1_length, vec2_length], path, on_unused_input='warn') # Call the compiled function and return the actual distance return theano_square(array1, array2, s, array1.size, array2.size)[-1][array1.size, array2.size]
def _compile_bp(self): ''' compile backpropagation foreach of the dqns. ''' self.bprop_by_goal = {} for (goal, dqn) in self.dqn_by_goal.items(): states = dqn.states action_values = dqn.action_values params = dqn.params targets = T.vector('target') shared_values = T.vector('shared_values') last_actions = T.lvector('action') # loss function. mse = layers.MSE(action_values[T.arange(action_values.shape[0]), last_actions], targets) \ + T.mean(abs(action_values[T.arange(action_values.shape[0]), last_actions] - shared_values)) # l2 penalty. l2_penalty = 0. for param in params: l2_penalty += (param ** 2).sum() cost = mse + self.l2_reg * l2_penalty # back propagation. updates = optimizers.Adam(cost, params, alpha=self.lr) td_errors = T.sqrt(mse) self.bprop_by_goal[goal] = theano.function(inputs=[states, last_actions, targets, shared_values], outputs=td_errors, updates=updates)
def uniq_with_lengths(seq, time_mask): """ :param seq: (time,batch) -> label :param time_mask: (time,batch) -> 0 or 1 :return: out_seqs, seq_lens. out_seqs is (max_seq_len,batch) -> label, where max_seq_len <= time. seq_lens is (batch,) -> len. """ num_batches = seq.shape[1] diffs = T.ones_like(seq) diffs = T.set_subtensor(diffs[1:], seq[1:] - seq[:-1]) time_range = T.arange(seq.shape[0]).dimshuffle([0] + ['x'] * (seq.ndim - 1)) idx = T.switch(T.neq(diffs, 0) * time_mask, time_range, -1) # (time,batch) -> idx or -1 seq_lens = T.sum(T.ge(idx, 0), axis=0) # (batch,) -> len max_seq_len = T.max(seq_lens) # I don't know any better way without scan. # http://stackoverflow.com/questions/31379971/uniq-for-2d-theano-tensor def step(batch_idx, out_seq_b1): #out_seq = seq[T.ge(idx[:, batch_idx], 0).nonzero(), batch_idx][0] out_seq = seq[:, batch_idx][T.ge(idx[:, batch_idx], 0).nonzero()] return T.concatenate((out_seq, T.zeros((max_seq_len - out_seq.shape[0],), dtype=seq.dtype))) out_seqs, _ = theano.scan( step, sequences=[T.arange(num_batches)], outputs_info=[T.zeros((max_seq_len,), dtype=seq.dtype)] ) # out_seqs is (batch,max_seq_len) return out_seqs.T, seq_lens
def filterbank_matrices(center_y, center_x, delta, sigma, N, imgshp): """Create a Fy and a Fx Parameters ---------- center_y : T.vector (shape: batch_size) center_x : T.vector (shape: batch_size) Y and X center coordinates for the attention window delta : T.vector (shape: batch_size) sigma : T.vector (shape: batch_size) Returns ------- FY, FX """ tol = 1e-4 img_height, img_width = imgshp muX = center_x.dimshuffle([0, 'x']) + delta.dimshuffle([0, 'x'])*(T.arange(N)-N/2-0.5) muY = center_y.dimshuffle([0, 'x']) + delta.dimshuffle([0, 'x'])*(T.arange(N)-N/2-0.5) a = T.arange(img_width) b = T.arange(img_height) FX = T.exp( -(a-muX.dimshuffle([0,1,'x']))**2 / 2. / sigma.dimshuffle([0,'x','x'])**2 ) FY = T.exp( -(b-muY.dimshuffle([0,1,'x']))**2 / 2. / sigma.dimshuffle([0,'x','x'])**2 ) FX = FX / (FX.sum(axis=-1).dimshuffle(0, 1, 'x') + tol) FY = FY / (FY.sum(axis=-1).dimshuffle(0, 1, 'x') + tol) return FY, FX
def link(self, input): self.input = input.dimshuffle(0, 1, 3, 2) # get the indexes that give the max on every line and sort them ind = T.argsort(self.input, axis=3) sorted_ind = T.sort(ind[:, :, :, -self.k_max:], axis=3) dim0, dim1, dim2, dim3 = sorted_ind.shape # prepare indices for selection indices_dim0 = T.arange(dim0)\ .repeat(dim1 * dim2 * dim3) indices_dim1 = T.arange(dim1)\ .repeat(dim2 * dim3)\ .reshape((dim1 * dim2 * dim3, 1))\ .repeat(dim0, axis=1)\ .T\ .flatten() indices_dim2 = T.arange(dim2)\ .repeat(dim3)\ .reshape((dim2 * dim3, 1))\ .repeat(dim0 * dim1, axis=1)\ .T\ .flatten() # output self.output = self.input[ indices_dim0, indices_dim1, indices_dim2, sorted_ind.flatten() ].reshape(sorted_ind.shape).dimshuffle(0, 1, 3, 2) return self.output
def k_max_pool(self, x, k): """ perform k-max pool on the input along the rows input: theano.tensor.tensor4 k: theano.tensor.iscalar the k parameter Returns: 4D tensor """ x = T.reshape(x, (x.shape[0], x.shape[1], 1, x.shape[2] * x.shape[3])) ind = T.argsort(x, axis=3) sorted_ind = T.sort(ind[:, :, :, -k:], axis=3) dim0, dim1, dim2, dim3 = sorted_ind.shape indices_dim0 = T.arange(dim0).repeat(dim1 * dim2 * dim3) indices_dim1 = ( T.arange(dim1).repeat(dim2 * dim3).reshape((dim1 * dim2 * dim3, 1)).repeat(dim0, axis=1).T.flatten() ) indices_dim2 = T.arange(dim2).repeat(dim3).reshape((dim2 * dim3, 1)).repeat(dim0 * dim1, axis=1).T.flatten() result = x[indices_dim0, indices_dim1, indices_dim2, sorted_ind.flatten()].reshape(sorted_ind.shape) shape = (result.shape[0], result.shape[1], result.shape[2] * result.shape[3], 1) result = T.reshape(result, shape) return result
def nin(X, param): w1, w2, w3, b1, b2, b3 = param X = X.dimshuffle(0, 1, 'x', 2, 3) # (n,32,1,r,c) w1 = w1.dimshuffle(0, 1, 2, 'x', 3, 4) # (64,32,16,1,3,3) w2 = w2.dimshuffle(0, 1, 'x', 2, 'x', 'x') # (64,32,1,16,1,1) w3 = w3.dimshuffle(0, 1, 2, 'x', 'x') # (64,2,32,1,1) b1 = b1.dimshuffle(0, 1, 'x', 2, 'x', 'x') # (64,32,1,16,1,1) b2 = b2.dimshuffle(0, 1, 'x', 2, 'x', 'x') # (64,32,1,1,1,1) b3 = b3.dimshuffle(0, 'x', 1, 'x', 'x') # (64,1,2,1,1) indexi = T.arange(w1.shape[0], dtype='int32') # (0:64) indexi = T.repeat(indexi, w1.shape[1], axis=0) indexj = T.arange(w1.shape[1], dtype='int32') # (0:64) indexj = T.tile(indexj, w1.shape[0]) results, updates = scan(fn=metaOp1, sequences=[indexi, indexj], outputs_info=None, non_sequences=[X, w1, w2, b1, b2], strict=True) # (64*32,n,1,r,c) metaShape1 = results.shape[-4], results.shape[-2], results.shape[-1] reshaped1 = results.reshape((w1.shape[0], w1.shape[1]) + metaShape1) # (64,32,n,r,c) permuted1 = T.transpose(reshaped1, axes=(0, 2, 1, 3, 4)) # (64,n,32,r,c) indexi = T.arange(w1.shape[0], dtype='int32') # (0:64) results, updates = scan(fn=metaOp2, sequences=[indexi], outputs_info=None, non_sequences=[permuted1, w3, b3], strict=True) # (64,n,2,r,c) permuted2 = T.transpose(results, axes=(1, 0, 2, 3, 4)) # (n,64,2,r,c) metaShape2 = permuted2.shape[-2], permuted2.shape[-1] reshaped2 = permuted2.reshape((permuted2.shape[0], -1) + metaShape2) # (n,128,r,c) return reshaped2
def sequence_log_likelihood(y, y_hat, y_mask, y_hat_mask, blank_symbol, log_scale=True): """ Based on code from Shawn Tan. Credits to Kyle Kastner as well. This function computes the CTC log likelihood for a sequence that has been augmented with blank labels. """ y_hat_mask_len = tensor.sum(y_hat_mask, axis=0, dtype="int32") y_mask_len = tensor.sum(y_mask, axis=0, dtype="int32") if log_scale: log_probabs = _log_path_probabs(y, T.log(y_hat), y_mask, y_hat_mask, blank_symbol) batch_size = log_probabs.shape[1] # Add the probabilities of the final time steps to get the total # sequence likelihood. log_labels_probab = _log_add( log_probabs[y_hat_mask_len - 1, tensor.arange(batch_size), y_mask_len - 1], log_probabs[y_hat_mask_len - 1, tensor.arange(batch_size), y_mask_len - 2], ) else: probabilities = _path_probabs(y, y_hat, y_mask, y_hat_mask, blank_symbol) batch_size = probabilities.shape[1] labels_probab = ( probabilities[y_hat_mask_len - 1, tensor.arange(batch_size), y_mask_len - 1] + probabilities[y_hat_mask_len - 1, tensor.arange(batch_size), y_mask_len - 2] ) log_labels_probab = tensor.log(labels_probab) return log_labels_probab
def test_optimize_xent_vector2(self): verbose = 0 mode = theano.compile.mode.get_default_mode() if mode == theano.compile.mode.get_mode('FAST_COMPILE'): mode = 'FAST_RUN' rng = numpy.random.RandomState(utt.fetch_seed()) x_val = rng.randn(5) b_val = rng.randn(5) y_val = numpy.asarray([2]) x = T.dvector('x') b = T.dvector('b') y = T.lvector('y') def print_graph(func): for i, node in enumerate(func.maker.fgraph.toposort()): print i, node # Last node should be the output print i, printing.pprint(node.outputs[0]) print ## Test that a biased softmax is optimized correctly bias_expressions = [ T.sum(-T.log(softmax(x + b)[T.arange(y.shape[0]), y])), -T.sum(T.log(softmax(b + x)[T.arange(y.shape[0]), y])), -T.sum(T.log(softmax(x + b))[T.arange(y.shape[0]), y]), T.sum(-T.log(softmax(b + x))[T.arange(y.shape[0]), y])] for expr in bias_expressions: f = theano.function([x, b, y], expr, mode=mode) if verbose: print_graph(f) try: prev, last = f.maker.fgraph.toposort()[-2:] assert len(f.maker.fgraph.toposort()) == 3 # [big_op, sum, dim_shuffle] f(x_val, b_val, y_val) except Exception: theano.printing.debugprint(f) raise backup = config.warn.sum_div_dimshuffle_bug config.warn.sum_div_dimshuffle_bug = False try: g = theano.function([x, b, y], T.grad(expr, x), mode=mode) finally: config.warn.sum_div_dimshuffle_bug = backup if verbose: print_graph(g) try: ops = [node.op for node in g.maker.fgraph.toposort()] assert len(ops) <= 6 assert crossentropy_softmax_1hot_with_bias_dx in ops assert softmax_with_bias in ops assert softmax_grad not in ops g(x_val, b_val, y_val) except Exception: theano.printing.debugprint(g) raise
def __init__(self, x, y, l, window, opt, lr, init_emb, dim_emb, dim_hidden, n_vocab, L2_reg, unit, sim='cos', n_layers=1, activation=tanh): self.tr_inputs = [x, y, l] self.pr_inputs = [x, y, l] self.x = x # 1D: batch_size * l * 2, 2D: window; elem=word_id self.y = y # 1D: batch_size; elem=label self.l = l # scalar: elem=sentence length batch_size = y.shape[0] n_cands = x.shape[0] / batch_size / l self.pad = build_shared_zeros((1, dim_emb)) if init_emb is None: self.emb = theano.shared(sample_weights(n_vocab - 1, dim_emb)) else: self.emb = theano.shared(init_emb) self.E = T.concatenate([self.pad, self.emb], 0) self.W_out = theano.shared(sample_weights(dim_hidden, dim_hidden)) self.params = [self.emb, self.W_out] """ Input Layer """ e = self.E[x] # e: 1D: batch_size * l * 2, 2D: window, 3D: dim_emb x_in = e.reshape((batch_size * n_cands, l, -1)) """ Intermediate Layer """ # h: 1D: n_batch * n_cands, 2D: dim_emb h, params = cnn.layers(x_in, window, dim_emb, dim_hidden, n_layers, activation) self.params.extend(params) """ Output Layer """ h = h.reshape((batch_size, n_cands, -1)) h_1 = h[T.arange(batch_size), 0] h_2 = h[T.arange(batch_size), 1:] if sim == 'cos': y_score = cosign_similarity(h_1, h_2) else: y_score = T.batched_dot(T.dot(h_1, self.W_out), h_2.dimshuffle(0, 2, 1)) y_score_hat = T.max(y_score, 1) """ Objective Function """ self.nll = max_margin_loss(y_score_hat, y_score[T.arange(batch_size), y]) self.L2_sqr = regularization(self.params) self.cost = self.nll + L2_reg * self.L2_sqr / 2. """ Optimization """ if opt == 'adagrad': self.update = ada_grad(cost=self.cost, params=self.params, lr=lr) elif opt == 'ada_delta': self.update = ada_delta(cost=self.cost, params=self.params) elif opt == 'adam': self.update = adam(cost=self.cost, params=self.params, lr=lr) else: self.update = sgd(cost=self.cost, params=self.params, lr=lr) """ Predicts """ y_hat = T.argmax(y_score, 1) """ Check Accuracies """ self.correct = T.eq(y_hat, y)
def sample_mean(self, X): mu = X[0] sig = X[1] coeff = X[2] mu = mu.reshape((mu.shape[0], mu.shape[1]/coeff.shape[-1], coeff.shape[-1])) sig = sig.reshape((sig.shape[0], sig.shape[1]/coeff.shape[-1], coeff.shape[-1])) idx = predict( self.theano_rng.multinomial( pvals=coeff, dtype=coeff.dtype ), axis=1 ) mu = mu[T.arange(mu.shape[0]), :, idx] sig = sig[T.arange(sig.shape[0]), :, idx] epsilon = self.theano_rng.normal(size=mu.shape, avg=0., std=1., dtype=mu.dtype) z = mu + sig * epsilon return z, mu
def step(i,inputs): length = inputs.shape[0] next_level = T.dot(inputs[T.arange(0,length-i-1)],W1) + T.dot(inputs[T.arange(1,length-i)],W2) + b next_level = next_level*(next_level > 0) #next_level = inputs[T.arange(0,length-i-1)] + inputs[T.arange(1,length-i)] #next_level = theano.printing.Print('inputs')(next_level) return T.concatenate([next_level,T.zeros_like(inputs[:length-next_level.shape[0]])])
def compute_cov_field(x, t, params): t0=T.cast(T.arange(x.shape[0])*0.0+t, 'float32') t1=T.reshape(t0,(x.shape[0],1,1)) t2=T.extra_ops.repeat(t1,x.shape[1],axis=1) [centers, spreads, biases, M, b]=params diffs=x.dimshuffle(0,1,2,'x')-centers.dimshuffle('x','x',0,1) scaled_diffs=(diffs**2)*T.exp(spreads).dimshuffle('x','x',0,1) exp_terms=T.sum(scaled_diffs,axis=2)+biases.dimshuffle('x','x',0)*0.0 h=T.exp(-exp_terms) sumact=T.sum(h,axis=2) #Normalization hnorm=h/sumact.dimshuffle(0,1,'x') z=T.dot(hnorm,M) z=T.reshape(z,(x.shape[0],x.shape[1],ntgates))+b.dimshuffle('x','x',0) #nt by nb by ntgates by 1 #z=z+T.reshape(x,(t.shape[0],t.shape[1],1,nx)) z=T.exp(z) tpoints=T.cast(T.arange(ntgates),'float32')/T.cast(ntgates-1,'float32') tpoints=T.reshape(tpoints, (1,1,ntgates)) #tgating=T.exp(T.dot(t,muWT)+mubT) #nt by nb by ntgates tgating=T.exp(-kT*(tpoints-t2)**2) tgating=tgating/T.reshape(T.sum(tgating, axis=2),(t2.shape[0], t2.shape[1], 1)) tgating=T.reshape(tgating,(t2.shape[0],t2.shape[1],ntgates)) mult=z*tgating out=T.sum(mult,axis=2) return T.cast(out,'float32')
def argmax_mean(self, X): mu = X[0] sig = X[1] coeff = X[2] mu = mu.reshape((mu.shape[0], mu.shape[1]/coeff.shape[-1], coeff.shape[-1])) sig = sig.reshape((sig.shape[0], sig.shape[1]/coeff.shape[-1], coeff.shape[-1])) idx = predict(coeff) mu = mu[T.arange(mu.shape[0]), :, idx] sig = sig[T.arange(sig.shape[0]), :, idx] epsilon = self.theano_rng.normal(size=mu.shape, avg=0., std=1., dtype=mu.dtype) z = mu + sig * epsilon return z, mu
def emit(self, readouts): mu, sigma, coeff = self.gmmmlp.apply(readouts) frame_size = mu.shape[-1]/coeff.shape[-1] k = coeff.shape[-1] shape_result = coeff.shape shape_result = tensor.set_subtensor(shape_result[-1],frame_size) ndim_result = coeff.ndim mu = mu.reshape((-1, frame_size, k)) sigma = sigma.reshape((-1, frame_size,k)) coeff = coeff.reshape((-1, k)) sample_coeff = self.theano_rng.multinomial(pvals = coeff, dtype=coeff.dtype) idx = predict(sample_coeff, axis = -1) #idx = predict(coeff, axis = -1) use this line for using most likely coeff. mu = mu[tensor.arange(mu.shape[0]), :, idx] sigma = sigma[tensor.arange(sigma.shape[0]), :, idx] epsilon = self.theano_rng.normal( size=mu.shape,avg=0., std=1., dtype=mu.dtype) result = mu + sigma*epsilon return result.reshape(shape_result, ndim = ndim_result)
def negative_log_likelihood(self, y,penalty=[]): """Return the mean of the negative log-likelihood of the prediction of this model under a given target distribution. .. math:: \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ \ell (\theta=\{W,b\}, \mathcal{D}) :type y: theano.tensor.TensorType :param y: corresponds to a vector that gives for each example the correct label Note: we use the mean instead of the sum so that the learning rate is less dependent on the batch size """ # y.shape[0] is (symbolically) the number of rows in y, i.e., # number of examples (call it n) in the minibatch # T.arange(y.shape[0]) is a symbolic vector which will contain # [0,1,2,... n-1] T.log(self.p_y_given_x) is a matrix of # Log-Probabilities (call it LP) with one row per example and # one column per class LP[T.arange(y.shape[0]),y] is a vector # v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., # LP[n-1,y[n-1]]] and T.mean(LP[T.arange(y.shape[0]),y]) is # the mean (across minibatch examples) of the elements in v, # i.e., the mean log-likelihood across the minibatch. if penalty==[]: return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y]) else: return -T.mean(T.log ( (self.p_y_given_x)[T.arange(y.shape[0]), y])*penalty)
def negative_log_likelihood(self, label_sym): """ Return the mean of the negative log-likelihood of the prediction of this model under a given target distribution. :type label_sym: theano.tensor.TensorType :param label_sym: corresponds to a vector that gives for each example the correct label Note: we use the mean instead of the sum so that the learning rate is less dependent on the batch size """ # label_sym.shape[0] is (symbolically) the number of rows in label_sym, i.e., # number of examples (call it n) in the minibatch # T.arange(label_sym.shape[0]) is a symbolic vector which will contain # [0,1,2,... n-1] T.log(self.p_y_given_x) is a matrix of # Log-Probabilities (call it LP) with one row per example and # one column per class LP[T.arange(label_sym.shape[0]),label_sym] is a vector # v containing [LP[0,label_sym[0]], LP[1,label_sym[1]], LP[2,label_sym[2]], ..., # LP[n-1,label_sym[n-1]]] and T.mean(LP[T.arange(label_sym.shape[0]),label_sym]) is # the mean (across minibatch examples) of the elements in v, # i.e., the mean log-likelihood across the minibatch. # loss, matrix \in R[#data,#classes] loss = theano.shared(value=numpy.ones((self.n_data,self.n_classes), dtype=theano.config.floatX), name='cost', borrow=True) T.set_subtensor(loss[T.arange(label_sym.shape[0]),label_sym], 0) #loss = 0 # score, matrix \in R[#data,1] self.score = T.max(loss + self.compatibility, axis=1) margin = T.mean(self.score - self.compatibility[T.arange(label_sym.shape[0]),label_sym]) return self.l2norm + self.C * margin
def _step(x, k, max_seq_len): tmp = x[ T.arange(x.shape[0])[:, np.newaxis, np.newaxis], T.sort(T.argsort(x, axis=1)[:, -k:, :], axis=1), T.arange(x.shape[2])[np.newaxis, np.newaxis,:], ] return T.concatenate([tmp, T.zeros([x.shape[0], max_seq_len-k, x.shape[2]])], axis=1)
def logp(self, x): n = self.n eta = self.eta diag_idxs = self.diag_idxs cumsum = tt.cumsum(x ** 2) variance = tt.zeros(n) variance = tt.inc_subtensor(variance[0], x[0] ** 2) variance = tt.inc_subtensor( variance[1:], cumsum[diag_idxs[1:]] - cumsum[diag_idxs[:-1]]) sd_vals = tt.sqrt(variance) logp_sd = self.sd_dist.logp(sd_vals).sum() corr_diag = x[diag_idxs] / sd_vals logp_lkj = (2 * eta - 3 + n - tt.arange(n)) * tt.log(corr_diag) logp_lkj = tt.sum(logp_lkj) # Compute the log det jacobian of the second transformation # described in the docstring. idx = tt.arange(n) det_invjac = tt.log(corr_diag) - idx * tt.log(sd_vals) det_invjac = det_invjac.sum() norm = _lkj_normalizing_constant(eta, n) return norm + logp_lkj + logp_sd + det_invjac
def sample(self, X): mu = X[0] sig = X[1] coeff = X[2] n_noise = T.cast(T.floor(coeff.shape[-1] * self.p_noise), 'int32') mu = T.concatenate( [mu, T.zeros((mu.shape[0], n_noise*sig.shape[1]/coeff.shape[-1]))], axis=1 ) mu = mu.reshape((mu.shape[0], mu.shape[1]/coeff.shape[-1], coeff.shape[-1])) sig = sig.reshape((sig.shape[0], sig.shape[1]/coeff.shape[-1], coeff.shape[-1])) idx = predict( self.theano_rng.multinomial( pvals=coeff, dtype=coeff.dtype ), axis=1 ) mu = mu[T.arange(mu.shape[0]), :, idx] sig = sig[T.arange(sig.shape[0]), :, idx] sample = self.theano_rng.normal(size=mu.shape, avg=mu, std=sig, dtype=mu.dtype) return sample
def run(self, images, h):#, error_images, h): channels = self.channels#images.shape[1] if not self.test: gx,gy,dx,dy,s2,g = self.get_params(h) else: gx,gy,dx,dy,s2,g = self.get_params_test(h) # how to handle variable sized input images? (mask??) I = images.reshape((self.batch_size*self.channels, self.height, self.width)) muX = gx.dimshuffle([0,'x']) + dx.dimshuffle([0,'x']) * (T.arange(self.N).astype(theano.config.floatX) - self.N/2 - 0.5) muY = gy.dimshuffle([0,'x']) + dy.dimshuffle([0,'x']) * (T.arange(self.N).astype(theano.config.floatX) - self.N/2 - 0.5) a = T.arange(self.width).astype(theano.config.floatX) b = T.arange(self.height).astype(theano.config.floatX) Fx = T.exp(-(a-muX.dimshuffle([0,1,'x']))**2 / 2. / s2.dimshuffle([0,'x','x'])**2) Fy = T.exp(-(b-muY.dimshuffle([0,1,'x']))**2 / 2. / s2.dimshuffle([0,'x','x'])**2) Fx = Fx / (Fx.sum(axis=-1).dimshuffle([0,1,'x']) + 1e-4) Fy = Fy / (Fy.sum(axis=-1).dimshuffle([0,1,'x']) + 1e-4) self.Fx = T.repeat(Fx, channels, axis=0) self.Fy = T.repeat(Fy, channels, axis=0) self.fint = self.batched_dot(self.Fy, I) # self.efint = T.dot(self.Fx, error_images) self.fim = self.batched_dot(self.fint, self.Fx.transpose([0,2,1])).reshape( (self.batch_size, self.channels*self.N*self.N)) # self.feim = T.dot(self.efint, self.Fy.transpose([0,2,1])).reshape( # (self.batch_size, channels,self.N,self.N)) return g * self.fim, (gx, gy, dx, dy, self.fint)#$T.concatenate([self.fim, self.feim], axis=1)
def geterr( self, probs, golds, occlusion ): # cross-entropy; probs: floats of (batsize, seqlen, vocabsize), gold: indexes of (batsize, seqlen) r = occlusion[:, 1:] * T.log( probs[T.arange(probs.shape[0])[:, None], T.arange(probs.shape[1])[None, :], golds] ) # --> result: floats of (batsize, seqlen) return -T.sum(r) / occlusion[:, 1:].norm(1)
def log_ctc(self, ): _1000 = tt.eye(self.n)[0] prev_mask = 1 - _1000 prevprev_mask = tt.neq(self.labels[:-2], self.labels[2:]) * \ tt.eq(self.labels[1:-1], self.blank) prevprev_mask = tt.concatenate(([0, 0], prevprev_mask)) prev_mask = safe_log(prev_mask) prevprev_mask = safe_log(prevprev_mask) prev = tt.arange(-1, self.n-1) prevprev = tt.arange(-2, self.n-2) log_pred_y = tt.log(self.inpt[:, self.labels]) def step(curr, accum): return logmul(curr, logadd(accum, logmul(prev_mask, accum[prev]), logmul(prevprev_mask, accum[prevprev]))) log_probs, _ = theano.scan( step, sequences=[log_pred_y], outputs_info=[safe_log(_1000)] ) # TODO: Add -2 if n > 1 and blank at end log_labels_probab = log_probs[-1, -2] self.cost = -log_labels_probab self.debug = tt.exp(log_probs.T)
def sample_from_joint(self, n_samples, output_2D=False): '''Samples from the joint posterior P(s_t-n_history:s_t | observations) n_samples: the number of samples to draw Returns an array with shape (n_history+1, n_samples, state_dims), where array[-1] corresponds to the current time. ''' samps=self.theano_rng.multinomial(pvals=T.extra_ops.repeat(self.current_weights.dimshuffle('x',0),n_samples,axis=0)) idxs=T.cast(T.dot(samps, T.arange(self.n_particles)),'int64') samps_t0=self.current_state[idxs] t0=T.as_tensor_variable(1) [samples, ts], updates = theano.scan(fn=self.sample_step, outputs_info=[samps_t0, t0], non_sequences=[n_samples], n_steps=self.n_history) #the variable "samples" that results from the scan is time-flipped #in the sense that samples[0] corresponds to the most recent point #in time, and higher indices correspond to points in the past. #I will stick to the convention that for any collection of points in #time, [-1] will index the most recent time, and [0] will index #the point farthest in the past. So, the first axis of "samples" #needs to be flipped. flip_idxs=T.cast(-T.arange(self.n_history)+self.n_history-1,'int64') samples=T.concatenate([samples[flip_idxs], samps_t0.dimshuffle('x',0,1)], axis=0) if output_2D: samples=T.reshape(samples, ((self.n_history+1)*n_samples, self.state_dims)) return samples, updates
def keep_max(input, theta, k, sent_mask): sig_input = T.nnet.sigmoid(T.dot(input, theta)) sent_mask = sent_mask.dimshuffle(0, 'x', 1, 'x') sig_input = sig_input * sent_mask #sig_input = T.dot(input, theta) if k == 0: result = input * T.addbroadcast(sig_input, 3) return result, sig_input # get the sorted idx sort_idx = T.argsort(sig_input, axis=2) k_max_ids = sort_idx[:,:,-k:,:] dim0, dim1, dim2, dim3 = k_max_ids.shape batchids = T.repeat(T.arange(dim0), dim1*dim2*dim3) mapids = T.repeat(T.arange(dim1), dim2*dim3).reshape((1, dim2*dim3)) mapids = T.repeat(mapids, dim0, axis=0).flatten() rowids = k_max_ids.flatten() colids = T.arange(dim3).reshape((1, dim3)) colids = T.repeat(colids, dim0*dim1*dim2, axis=0).flatten() sig_mask = T.zeros_like(sig_input) choosed = sig_input[batchids, mapids, rowids, colids] sig_mask = T.set_subtensor(sig_mask[batchids, mapids, rowids, colids], 1) input_mask = sig_mask * sig_input result = input * T.addbroadcast(input_mask, 3) return result, sig_input
def ans_score(ans, outputs): arr = T.arange(ans.shape[0]) sum1 = T.sum(outputs[arr, ans]) arr = T.arange(ans.shape[0] - 1) st = self.seg.params["A"][self.seg.viterbi_startnode, ans[0]] sum2 = T.sum(self.seg.params["A"][ans[arr], ans[arr + 1]]) + st return sum1 + sum2
def neglog_2d(output, target): i = T.arange(target.shape[0]).reshape((target.shape[0], 1)) i = T.repeat(i, target.shape[1], axis=1).flatten() j = T.arange(target.shape[1]).reshape((1, target.shape[1])) j = T.repeat(j, target.shape[0], axis=0).flatten() k = target.flatten() return -T.mean(T.log(output)[i, j, k])
def _sequence_log_likelihood(y, y_hat, y_mask, y_hat_mask, blank_symbol, log_scale=True): ''' Based on code from Shawn Tan. Credits to Kyle Kastner as well. ''' y_hat_mask_len = tensor.sum(y_hat_mask, axis=0, dtype='int32') y_mask_len = tensor.sum(y_mask, axis=0, dtype='int32') if log_scale: log_probabs = _log_path_probabs(y, T.log(y_hat), y_mask, y_hat_mask, blank_symbol) batch_size = log_probabs.shape[1] log_labels_probab = _log_add( log_probabs[y_hat_mask_len - 1, tensor.arange(batch_size), y_mask_len - 1], log_probabs[y_hat_mask_len - 1, tensor.arange(batch_size), y_mask_len - 2]) else: probabilities = _path_probabs(y, y_hat, y_mask, y_hat_mask, blank_symbol) batch_size = probabilities.shape[1] labels_probab = (probabilities[y_hat_mask_len - 1, tensor.arange(batch_size), y_mask_len - 1] + probabilities[y_hat_mask_len - 1, tensor.arange(batch_size), y_mask_len - 2]) log_labels_probab = tensor.log(labels_probab) return log_labels_probab
def filterbank_matrices(self, center_y, center_x, delta, sigma): """Create a Fy and a Fx Parameters ---------- center_y : T.vector (shape: batch_size) center_x : T.vector (shape: batch_size) Y and X center coordinates for the attention window delta : T.vector (shape: batch_size) sigma : T.vector (shape: batch_size) Returns ------- FY : T.fvector (shape: ) FX : T.fvector (shape: ) """ tol = 1e-4 N = self.N rng = T.arange(N, dtype=floatX)-N/2.+0.5 # e.g. [1.5, -0.5, 0.5, 1.5] muX = center_x.dimshuffle([0, 'x']) + delta.dimshuffle([0, 'x'])*rng muY = center_y.dimshuffle([0, 'x']) + delta.dimshuffle([0, 'x'])*rng a = tensor.arange(self.img_width, dtype=floatX) b = tensor.arange(self.img_height, dtype=floatX) FX = tensor.exp( -(a-muX.dimshuffle([0,1,'x']))**2 / 2. / sigma.dimshuffle([0,'x','x'])**2 ) FY = tensor.exp( -(b-muY.dimshuffle([0,1,'x']))**2 / 2. / sigma.dimshuffle([0,'x','x'])**2 ) FX = FX / (FX.sum(axis=-1).dimshuffle(0, 1, 'x') + tol) FY = FY / (FY.sum(axis=-1).dimshuffle(0, 1, 'x') + tol) return FY, FX
def interpolate_bilinear(im, x, y, out_shape=None, border_mode='nearest'): if im.ndim != 4: raise TypeError('im should be a 4D Tensor image, got %dD.' % im.ndim) out_shape = out_shape if out_shape else T.shape(im)[2:] x, y = x.flatten(), y.flatten() n, c, h, w = im.shape h_out, w_out = out_shape height_f = T.cast(h, theano.config.floatX) width_f = T.cast(w, theano.config.floatX) # scale coordinates from [-1, 1] to [0, width/height - 1] x = (x + 1) / 2 * (width_f - 1) y = (y + 1) / 2 * (height_f - 1) x0_f = T.floor(x) y0_f = T.floor(y) x1_f = x0_f + 1 y1_f = y0_f + 1 if border_mode == 'nearest': x0 = T.clip(x0_f, 0, width_f - 1) x1 = T.clip(x1_f, 0, width_f - 1) y0 = T.clip(y0_f, 0, height_f - 1) y1 = T.clip(y1_f, 0, height_f - 1) elif border_mode == 'mirror': w = 2 * (width_f - 1) x0 = T.minimum(x0_f % w, -x0_f % w) x1 = T.minimum(x1_f % w, -x1_f % w) h = 2 * (height_f - 1) y0 = T.minimum(y0_f % h, -y0_f % h) y1 = T.minimum(y1_f % h, -y1_f % h) elif border_mode == 'wrap': x0 = T.mod(x0_f, width_f) x1 = T.mod(x1_f, width_f) y0 = T.mod(y0_f, height_f) y1 = T.mod(y1_f, height_f) else: raise ValueError("border_mode must be one of " "'nearest', 'mirror', 'wrap'") x0, x1, y0, y1 = (T.cast(v, 'int64') for v in (x0, x1, y0, y1)) base = T.arange(n) * w * h base = T.reshape(base, (-1, 1)) base = T.tile(base, (1, h_out * w_out)) base = base.flatten() base_y0 = base + y0 * w base_y1 = base + y1 * w idx_a = base_y0 + x0 idx_b = base_y1 + x0 idx_c = base_y0 + x1 idx_d = base_y1 + x1 im_flat = T.reshape(im.dimshuffle((0, 2, 3, 1)), (-1, c)) pixel_a = im_flat[idx_a] pixel_b = im_flat[idx_b] pixel_c = im_flat[idx_c] pixel_d = im_flat[idx_d] wa = ((x1_f - x) * (y1_f - y)).dimshuffle((0, 'x')) wb = ((x1_f - x) * (1. - (y1_f - y))).dimshuffle((0, 'x')) wc = ((1. - (x1_f - x)) * (y1_f - y)).dimshuffle((0, 'x')) wd = ((1. - (x1_f - x)) * (1. - (y1_f - y))).dimshuffle((0, 'x')) output = T.sum((wa * pixel_a, wb * pixel_b, wc * pixel_c, wd * pixel_d), axis=0) output = T.reshape(output, (n, h_out, w_out, c)) return output.dimshuffle((0, 3, 1, 2))
def cost(self): return -TT.mean(TT.log(self.y)[TT.arange(self.k.shape[0]), self.k])
def __init__(self, config, vocab_size): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') context = tensor.imatrix('context') context_mask = tensor.imatrix('context_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') ans_indices = tensor.imatrix('ans_indices') # n_steps * n_samples ans_indices_mask = tensor.imatrix('ans_indices_mask') bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) context = context.dimshuffle(1, 0) context_mask = context_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) ans_indices = ans_indices.dimshuffle(1, 0) ans_indices_mask = ans_indices_mask.dimshuffle(1, 0) # Embed questions and context embed = LookupTable(vocab_size, config.embed_size, name='embed') #embed.weights_init = IsotropicGaussian(0.01) embed.weights_init = Constant( init_embedding_table(filename='embeddings/vocab_embeddings.txt')) # one directional LSTM encoding q_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4 * config.pre_lstm_size, name='q_lstm_in') q_lstm = LSTM(dim=config.pre_lstm_size, activation=Tanh(), name='q_lstm') c_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4 * config.pre_lstm_size, name='c_lstm_in') c_lstm = LSTM(dim=config.pre_lstm_size, activation=Tanh(), name='c_lstm') bricks += [q_lstm, c_lstm, q_lstm_ins, c_lstm_ins] q_tmp = q_lstm_ins.apply(embed.apply(question)) c_tmp = c_lstm_ins.apply(embed.apply(context)) q_hidden, _ = q_lstm.apply(q_tmp, mask=question_mask.astype( theano.config.floatX)) # lq, bs, dim c_hidden, _ = c_lstm.apply(c_tmp, mask=context_mask.astype( theano.config.floatX)) # lc, bs, dim # Attention mechanism Bilinear question attention_question = Linear(input_dim=config.pre_lstm_size, output_dim=config.pre_lstm_size, name='att_question') bricks += [attention_question] att_weights_question = q_hidden[ None, :, :, :] * attention_question.apply( c_hidden.reshape( (c_hidden.shape[0] * c_hidden.shape[1], c_hidden.shape[2]))).reshape( (c_hidden.shape[0], c_hidden.shape[1], c_hidden.shape[2]))[:, None, :, :] # --> lc,lq,bs,dim att_weights_question = att_weights_question.sum( axis=3) # sum over axis 3 -> dimensions --> lc,lq,bs att_weights_question = att_weights_question.dimshuffle( 0, 2, 1) # --> lc,bs,lq att_weights_question = att_weights_question.reshape( (att_weights_question.shape[0] * att_weights_question.shape[1], att_weights_question.shape[2])) # --> lc*bs,lq att_weights_question = tensor.nnet.softmax( att_weights_question ) # softmax over axis 1 -> length of question # --> lc*bs,lq att_weights_question = att_weights_question.reshape( (c_hidden.shape[0], q_hidden.shape[1], q_hidden.shape[0])) # --> lc,bs,lq att_weights_question = att_weights_question.dimshuffle( 0, 2, 1) # --> lc,lq,bs attended_question = tensor.sum( q_hidden[None, :, :, :] * att_weights_question[:, :, :, None], axis=1) # sum over axis 1 -> length of question --> lc,bs,dim attended_question.name = 'attended_question' # Match LSTM cqembed = tensor.concatenate([c_hidden, attended_question], axis=2) mlstms, mhidden_list = make_bidir_lstm_stack( cqembed, 2 * config.pre_lstm_size, context_mask.astype(theano.config.floatX), config.match_lstm_size, config.match_skip_connections, 'match') bricks = bricks + mlstms if config.match_skip_connections: menc_dim = 2 * sum(config.match_lstm_size) menc = tensor.concatenate(mhidden_list, axis=2) else: menc_dim = 2 * config.match_lstm_size[-1] menc = 
tensor.concatenate(mhidden_list[-2:], axis=2) menc.name = 'menc' # Attention mechanism MLP start attention_mlp_start = MLP( dims=config.attention_mlp_hidden + [1], activations=config.attention_mlp_activations[1:] + [Identity()], name='attention_mlp_start') attention_clinear_start = Linear( input_dim=menc_dim, output_dim=config.attention_mlp_hidden[0], name='attm_start') # Wym bricks += [attention_mlp_start, attention_clinear_start] layer1_start = Tanh(name='layer1_start') layer1_start = layer1_start.apply( attention_clinear_start.apply( menc.reshape( (menc.shape[0] * menc.shape[1], menc.shape[2]))).reshape( (menc.shape[0], menc.shape[1], config.attention_mlp_hidden[0]))) att_weights_start = attention_mlp_start.apply( layer1_start.reshape( (layer1_start.shape[0] * layer1_start.shape[1], layer1_start.shape[2]))) att_weights_start = att_weights_start.reshape( (layer1_start.shape[0], layer1_start.shape[1])) att_weights_start = tensor.nnet.softmax(att_weights_start.T).T attended = tensor.sum(menc * att_weights_start[:, :, None], axis=0) attended.name = 'attended' # Attention mechanism MLP end attention_mlp_end = MLP( dims=config.attention_mlp_hidden + [1], activations=config.attention_mlp_activations[1:] + [Identity()], name='attention_mlp_end') attention_qlinear_end = Linear( input_dim=menc_dim, output_dim=config.attention_mlp_hidden[0], name='atts_end') #Wum attention_clinear_end = Linear( input_dim=menc_dim, output_dim=config.attention_mlp_hidden[0], use_bias=False, name='attm_end') # Wym bricks += [ attention_mlp_end, attention_qlinear_end, attention_clinear_end ] layer1_end = Tanh(name='layer1_end') layer1_end = layer1_end.apply( attention_clinear_end.apply( menc.reshape((menc.shape[0] * menc.shape[1], menc.shape[2] ))).reshape((menc.shape[0], menc.shape[1], config.attention_mlp_hidden[0])) + attention_qlinear_end.apply(attended)[None, :, :]) att_weights_end = attention_mlp_end.apply( layer1_end.reshape((layer1_end.shape[0] * layer1_end.shape[1], layer1_end.shape[2]))) att_weights_end = att_weights_end.reshape( (layer1_end.shape[0], layer1_end.shape[1])) att_weights_end = tensor.nnet.softmax(att_weights_end.T).T att_weights_start = tensor.dot( tensor.le( tensor.tile( theano.tensor.arange(context.shape[0])[None, :], (context.shape[0], 1)), tensor.tile( theano.tensor.arange(context.shape[0])[:, None], (1, context.shape[0]))), att_weights_start) att_weights_end = tensor.dot( tensor.ge( tensor.tile( theano.tensor.arange(context.shape[0])[None, :], (context.shape[0], 1)), tensor.tile( theano.tensor.arange(context.shape[0])[:, None], (1, context.shape[0]))), att_weights_end) # add attention from left and right att_weights = att_weights_start * att_weights_end #att_weights = tensor.minimum(att_weights_start, att_weights_end) att_target = tensor.zeros((ans_indices.shape[1], context.shape[0]), dtype=theano.config.floatX) att_target = tensor.set_subtensor( att_target[tensor.arange(ans_indices.shape[1]), ans_indices], 1) att_target = att_target.dimshuffle(1, 0) #att_target = tensor.eq(tensor.tile(answer[None,:,:], (context.shape[0], 1, 1)), # tensor.tile(context[:,None,:], (1, answer.shape[0], 1))).sum(axis=1).clip(0,1) self.predictions = tensor.gt(att_weights, 0.25) * context att_target = att_target / (att_target.sum(axis=0) + 0.00001) #att_weights = att_weights / (att_weights.sum(axis=0) + 0.00001) cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) * context_mask).sum() / context_mask.sum() # Apply dropout cg = ComputationGraph([cost]) if config.w_noise > 0: noise_vars = 
VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.w_noise) if config.dropout > 0: cg = apply_dropout(cg, mhidden_list, config.dropout) [cost_reg] = cg.outputs # Other stuff cost.name = 'cost' cost_reg.name = 'cost_reg' att_weights_start.name = 'att_weights_start' att_weights_end.name = 'att_weights_end' att_weights.name = 'att_weights' att_target.name = 'att_target' self.predictions.name = 'pred' self.sgd_cost = cost_reg self.monitor_vars = [[cost_reg]] self.monitor_vars_valid = [[cost_reg]] self.analyse_vars = [ cost, self.predictions, att_weights_start, att_weights_end, att_weights, att_target ] # Initialize bricks embed.initialize() for brick in bricks: brick.weights_init = config.weights_init brick.biases_init = config.biases_init brick.initialize()
def fit(self, X, Y, Xvalid, Yvalid, learning_rate=1e-2, mu=0.9, decay=0.9, epochs=10, batch_sz=100, show_fig=False): X = X.astype(np.float32) Y = Y.astype(np.int32) Xvalid = Xvalid.astype(np.float32) Yvalid = Yvalid.astype(np.int32) self.rng = RandomStreams() # initialize hidden layers N, D = X.shape K = len(set(Y)) self.hidden_layers = [] M1 = D count = 0 for M2 in self.hidden_layer_sizes: h = HiddenLayer(M1, M2, count) self.hidden_layers.append(h) M1 = M2 count += 1 W = np.random.randn(M1, K) * np.sqrt(2.0 / M1) b = np.zeros(K) self.W = theano.shared(W, 'W_logreg') self.b = theano.shared(b, 'b_logreg') # collect params for later use self.params = [self.W, self.b] for h in self.hidden_layers: self.params += h.params # set up theano functions and variables thX = T.matrix('X') thY = T.ivector('Y') pY_train = self.forward_train(thX) # this cost is for training cost = -T.mean(T.log(pY_train[T.arange(thY.shape[0]), thY])) updates = momentum_updates(cost, self.params, learning_rate, mu) train_op = theano.function(inputs=[thX, thY], updates=updates) # for evaluation and prediction pY_predict = self.forward_predict(thX) cost_predict = -T.mean(T.log(pY_predict[T.arange(thY.shape[0]), thY])) prediction = self.predict(thX) cost_predict_op = theano.function(inputs=[thX, thY], outputs=[cost_predict, prediction]) n_batches = N // batch_sz costs = [] for i in range(epochs): X, Y = shuffle(X, Y) for j in range(n_batches): Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)] Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)] train_op(Xbatch, Ybatch) if j % 50 == 0: c, p = cost_predict_op(Xvalid, Yvalid) costs.append(c) e = error_rate(Yvalid, p) print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e) if show_fig: plt.plot(costs) plt.show()
X = T.dmatrix() y = T.ivector() prepare_data = lambda x: (theano.shared(x[0].astype('float64')), theano.shared(x[1].astype('int32'))) (training_x, training_y), (test_x, test_y), (validation_x, validation_y) = map( prepare_data, [train_set, test_set, valid_set]) W = theano.shared(numpy.zeros([dims, n_classes])) b = theano.shared(numpy.zeros(n_classes)) y_hat = T.nnet.softmax(T.dot(X, W) + b) y_pred = T.argmax(y_hat, axis=1) test_error = T.mean(T.neq(y_pred, y)) training_error = -T.mean(T.log(y_hat)[T.arange(y.shape[0]), y]) learning_rate = 0.2 params = [W, b] beta = .9 updates = [] for p in params: ms = theano.shared(1. + 0. * p.get_value()) updates += [ (p, p - learning_rate * T.grad(training_error, p) / T.sqrt(ms)), (ms, beta * ms + (1 - beta) * T.sqr(T.grad(training_error, p))) ] idx = T.ivector() training_function = theano.function(inputs=[idx], outputs=training_error,
def negative_log_likelihood(self,y): return -T.mean((self.p_y_given_x)[T.arange(y.shape[0]),y])
def get_sessions(self, environment, session_length=10, batch_size=None, initial_env_states='zeros', initial_observations='zeros', initial_hidden='zeros', experience_replay=False, unroll_scan=True, return_automatic_updates=False, optimize_experience_replay=None, **kwargs): """ Returns history of agent interaction with environment for given number of turns: :param environment: an environment to interact with :type environment: BaseEnvironment :param session_length: how many turns of interaction shall there be for each batch :type session_length: int :param batch_size: amount of independent sessions [number or symbolic]. irrelevant if there's at least one input or if you manually set any initial_*. :type batch_size: int or theano.tensor.TensorVariable :param experience_replay: whether or not to use experience replay if True, assumes environment to have a pre-defined sequence of observations and actions (as env.observations etc.) The agent will then observe sequence of observations and will be forced to take recorded actions via get_output(...,{action_layer=recorded_action} Saves some time by directly using environment.observations (list of sequences) instead of calling environment.get_action_results via environment.as_layers(...). Note that if this parameter is false, agent will be allowed to pick any actions during experience replay :type experience_replay: bool :param unroll_scan: whether use theano.scan or lasagne.utils.unroll_scan :param return_automatic_updates: whether to append automatic updates to returned tuple (as last element) :param kwargs: optional flags to be sent to NN when calling get_output (e.g. deterministic = True) :type kwargs: several kw flags (flag=value,flag2=value,...) :param initial_something: layers providing initial values for all variables at 0-th time step 'zeros' default means filling variables with zeros Initial values are NOT included in history sequences :param optimize_experience_replay: deprecated, use experience_replay :returns: state_seq,observation_seq,hidden_seq,action_seq,policy_seq, for environment state, observation, hidden state, chosen actions and agent policy respectively each of them having dimensions of [batch_i,seq_i,...] time synchronization policy: env_states[:,i] was observed as observation[:,i] BASED ON WHICH agent generated his policy[:,i], resulting in action[:,i], and also updated his memory from hidden[:,i-1] to hiden[:,i] :rtype: tuple of Theano tensors """ if optimize_experience_replay is not None: experience_replay = optimize_experience_replay warn( "optimize_experience_replay is deprecated and will be removed in 1.0.2. Use experience_replay parameter." ) env = environment if experience_replay: if not hasattr(env, "observations") or not hasattr(env, "actions"): raise ValueError( 'if optimize_experience_replay is turned on, one must provide an environment with .observations' 'and .actions properties containing observations and actions to replay.' 
) if initial_env_states != 'zeros' or initial_observations != 'zeros': warn( "In experience replay mode, initial env states and initial observations parameters are unused", verbosity_level=2) if initial_hidden == 'zeros' and hasattr( env, "preceding_agent_memories"): initial_hidden = getattr(env, "preceding_agent_memories") # create recurrence self.recurrence = self.as_replay_recurrence( environment=environment, session_length=session_length, initial_hidden=initial_hidden, unroll_scan=unroll_scan, **kwargs) else: if isinstance(env, SessionPoolEnvironment) or isinstance( env, SessionBatchEnvironment): warn( "You are using experience replay environment as normal environment. " "This will work, but you can get a free performance boost " "by using passing optimize_experience_replay = True to .get_sessions", verbosity_level=2) # create recurrence in active mode (using environment.get_action_results) self.recurrence = self.as_recurrence( environment=environment, session_length=session_length, batch_size=batch_size, initial_env_states=initial_env_states, initial_observations=initial_observations, initial_hidden=initial_hidden, unroll_scan=unroll_scan, **kwargs) state_layers_dict, output_layers = self.recurrence.get_sequence_layers( ) # convert sequence layers into actual theano variables theano_expressions = lasagne.layers.get_output( list(state_layers_dict.values()) + list(output_layers)) n_states = len(state_layers_dict) states_list, outputs = theano_expressions[: n_states], theano_expressions[ n_states:] if experience_replay: assert len(states_list) == len(self.agent_states) agent_states = states_list env_states = [T.arange(session_length)] observations = env.observations else: # sort sequences into categories agent_states, env_states, observations = \ unpack_list(states_list, [len(self.agent_states), len(env.state_shapes), len(env.observation_shapes)]) policy, actions = unpack_list( outputs, [len(self.policy), len(self.action_layers)]) agent_states = OrderedDict( list(zip(list(self.agent_states.keys()), agent_states))) # if user asked for single value and not one-element list, unpack the list if type(environment.state_shapes) not in supported_sequences: env_states = env_states[0] if self.single_observation: observations = observations[0] if self.single_action: actions = actions[0] if self.single_policy: policy = policy[0] ret_tuple = env_states, observations, agent_states, actions, policy if return_automatic_updates: ret_tuple += (self.get_automatic_updates(), ) if unroll_scan and return_automatic_updates: warn( "return_automatic_updates useful when and only when unroll_scan == False", verbosity_level=2) return ret_tuple
def negative_log_likelihood(self): self.prob_of_y_given_x = T.nnet.softmax(self.x) return -T.mean( T.log(self.prob_of_y_given_x)[T.arange(self.y.shape[0]), self.y])
def fit(self, X, Y, learning_rate=1e-2, mu=0.99, reg=1e-12, epochs=400, batch_sz=20, print_period=1, show_fig=False): # X = X.astype(np.float32) Y = Y.astype(np.int32) # initialize hidden layers N, D = X.shape K = len(set(Y)) self.hidden_layers = [] M1 = D count = 0 for M2 in self.hidden_layer_sizes: h = HiddenLayer(M1, M2, count) self.hidden_layers.append(h) M1 = M2 count += 1 W = init_weight(M1, K) b = np.zeros(K) self.W = theano.shared(W, 'W_logreg') self.b = theano.shared(b, 'b_logreg') # collect params for later use self.params = [self.W, self.b] for h in self.hidden_layers: self.params += h.params # for momentum dparams = [ theano.shared(np.zeros(p.get_value().shape)) for p in self.params ] # for rmsprop cache = [ theano.shared(np.zeros(p.get_value().shape)) for p in self.params ] # set up theano functions and variables thX = T.matrix('X') thY = T.ivector('Y') pY = self.forward(thX) rcost = reg * T.sum([(p * p).sum() for p in self.params]) cost = -T.mean(T.log(pY[T.arange(thY.shape[0]), thY])) + rcost prediction = self.predict(thX) grads = T.grad(cost, self.params) # momentum only updates = [(p, p + mu * dp - learning_rate * g) for p, dp, g in zip(self.params, dparams, grads) ] + [(dp, mu * dp - learning_rate * g) for dp, g in zip(dparams, grads)] train_op = theano.function( inputs=[thX, thY], outputs=[cost, prediction], updates=updates, ) n_batches = N // batch_sz costs = [] for i in range(epochs): X, Y = shuffle(X, Y) for j in range(n_batches): Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)] Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)] c, p = train_op(Xbatch, Ybatch) if j % print_period == 0: costs.append(c) e = np.mean(Ybatch != p) print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e) if show_fig: plt.plot(costs) plt.show()
def make_one_hot(label, dim): num = label.shape[0] one_hot = T.zeros((num, dim), theano.config.floatX) one_hot = T.set_subtensor(one_hot[T.arange(num), label], 1.) return one_hot
def fit(self, X, Y, Xvalid, Yvalid, learning_rate=1e-3, mu=0.99, decay=0.999, reg=1e-3, eps=1e-8, epochs=10, batch_sz=100, show_fig=False): # # 特地轉換所有參數的型別到float32,要不然就會有error,但其他支theano 的 ANN class卻不用這樣做,不知道為什麼?? learning_rate = np.float32(learning_rate) mu = np.float32(mu) decay = np.float32(decay) reg = np.float32(reg) # step1 get the data X, Y = shuffle(X, Y) X = X.astype(np.float32) # for being avalibel for GPU Y = Y.astype(np.int32) Xvalid = Xvalid.astype(np.float32) Yvalid = Yvalid.astype(np.int32) # initialize each layer and parameters of NN N, D = X.shape K = len(set(Y)) self.hidden_layers = [] # 這個list用來放HiddenLyer物件 M1 = D count = 0 for M2 in self.hidden_layer_sizes: # 建立ANN物件時輸入的參數,轉成在ANN物件內部建立實體HiddenLyer物件 h = HiddenLayer(M1, M2, count) self.hidden_layers.append(h) M1 = M2 count += 1 # initialize logistic regression layer W, b = init_weight_and_bias( M1, K) # 最後一層output物件 ( the last logist regression layer ) self.W = theano.shared(W, 'W_logreg') self.b = theano.shared(b, 'b_logreg') # collect all the parameters that we are going to use grediant descent self.params = [self.W, self.b] # 先把最後一層output layer放進去 for h in self.hidden_layers: self.params += h.params # 應該是照著 hidden lyer1 , hidden layer2, 的順序放進去 # for momentum, we need to create zero matrix for each layer dparams = [ theano.shared(np.zeros_like(p.get_value(), dtype=np.float32)) for p in self.params ] # for rmsprop, we need create cache cache = [ theano.shared(np.ones_like(p.get_value(), dtype=np.float32)) for p in self.params ] # step2. model # theano variabels thX = T.fmatrix('X') # data input (matrix) thY = T.ivector('Y') # target ( vector) pY = self.forward(thX) # forward的 output,出來的型別是matrix # step3. cost function # define theano's computation grath rcost = reg * T.sum([(p * p).sum() for p in self.params]) cost = -T.mean( T.log(pY[T.arange(thY.shape[0]), thY]) ) + rcost # thY there is a vector, and we do not need y2indicator in this case(算是特殊寫法) # step4. solver # define theano's operations grads = T.grad(cost, self.params) updates = [ (c, decay * c + (np.float32(1.0) - decay) * g * g) for c, g in zip(cache, grads) ] + [(p, p + mu * dp - learning_rate * g / T.sqrt(c + eps)) for p, c, dp, g in zip(self.params, cache, dparams, grads) ] + [(dp, mu * dp - learning_rate * g / T.sqrt(c + eps)) for c, dp, g in zip(cache, dparams, grads)] # updates = [ # (c, decay*c + (np.float32(1.0)-decay)*T.grad(cost, Dp)*T.grad(cost, p)) for p, c in zip(self.params, cache) # ] + [ # (p, p + mu*dp - learning_rate*T.grad(cost, p) / T.sqrt(c + eps)) for p, c, dp in zip(self.params, cache, dparams) # ] + [ # (dp, mu*dp - learning_rate*T.grad(cost, p) / T.sqrt(c + eps)) for p, c, dp in zip(self.params, cache, dparams) # ] train_op = theano.function(inputs=[thX, thY], updates=updates) # for evaluation and prediction prediction = self.th_prediction(thX) cost_prediction_op = theano.function(inputs=[thX, thY], outputs=[cost, prediction]) n_batches = N // batch_sz costs = [] for i in range(epochs): for j in range(n_batches): Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz), ] train_op(Xbatch, Ybatch) if j % 20 == 0: cost_valid, preds_valid = cost_prediction_op( Xvalid, Yvalid) costs.append(cost_valid) e = error_rate(Yvalid, preds_valid) print("i:", i, " j,", j, " nb:", n_batches, " cost:", cost_valid, " error_rate:", e) if show_fig: plt.plot(costs) plt.show()
def cost(self, net): "Return the log-likelihood cost." return -T.mean( T.log(self.output_dropout)[T.arange(net.y.shape[0]), net.y])
def tree_lstm_layer(tparams, inputs, options, prefix='tree_lstm', **kwargs): state_below, mask, left_mask, right_mask = inputs # state_below: #step x #sample x dim_emb # mask: #step x #sample # left_mask: #step x #sample x #step # right_mask: #step x #sample x #step nsteps = state_below.shape[0] dim = tparams[_p(prefix, 'U_l')].shape[0] n_samples = state_below.shape[1] init_state = tensor.alloc(0., n_samples, nsteps, dim) init_memory = tensor.alloc(0., n_samples, nsteps, dim) # use the slice to calculate all the different gates def _slice(_x, n, dim): if _x.ndim == 3: return _x[:, :, n * dim:(n + 1) * dim] elif _x.ndim == 2: return _x[:, n * dim:(n + 1) * dim] return _x[n * dim:(n + 1) * dim] # one time step of the lstm def _step(m_, x_, left_mask_, right_mask_, counter_, h_, c_): # zero out the input unless this is a leaf node # flag = tensor.switch(tensor.eq(tensor.sum(left_mask_, axis=1) + tensor.sum(right_mask_, axis=1), 0), 1., 0.) # x_ = x_ * flag[:, None] preact_l = tensor.dot(tensor.sum(left_mask_[:, :, None] * h_, axis=1), tparams[_p(prefix, 'U_l')]) preact_r = tensor.dot(tensor.sum(right_mask_[:, :, None] * h_, axis=1), tparams[_p(prefix, 'U_r')]) x_ = concatenate([ _slice(x_, 0, dim), _slice(x_, 1, dim), _slice(x_, 1, dim), _slice(x_, 2, dim), _slice(x_, 3, dim) ], axis=1) preact = preact_l + preact_r + x_ i = tensor.nnet.sigmoid(_slice(preact, 0, dim)) fl = tensor.nnet.sigmoid(_slice(preact, 1, dim)) fr = tensor.nnet.sigmoid(_slice(preact, 2, dim)) o = tensor.nnet.sigmoid(_slice(preact, 3, dim)) u = tensor.tanh(_slice(preact, 4, dim)) c_temp = fl * tensor.sum(left_mask_[:, :, None] * c_, axis=1) \ + fr * tensor.sum(right_mask_[:, :, None] * c_, axis=1) \ + i * u h_temp = o * tensor.tanh(c_temp) h = tensor.set_subtensor(h_[:, counter_, :], h_temp) c = tensor.set_subtensor(c_[:, counter_, :], c_temp) c = m_[:, None, None] * c + (1. - m_)[:, None, None] * c_ h = m_[:, None, None] * h + (1. - m_)[:, None, None] * h_ return h, c, i, fl, fr, o state_below = tensor.dot(state_below, tparams[_p( prefix, 'W')]) + tparams[_p(prefix, 'b')] rval, updates = theano.scan( fn=_step, sequences=[ mask, state_below, left_mask, right_mask, tensor.arange(0, nsteps) ], outputs_info=[init_state, init_memory, None, None, None, None], name=_p(prefix, '_layers'), profile=False) return rval
def fit(self, X, Y, Xvalid, Yvalid, learning_rate=1e-4, mu=0.9, decay=0.9, epochs=8, batch_sz=100, show_fig=False): ''' Takes training data and test data (valid) at once, then trains and validates along the way. Modifying hyperparams of learning_rate, mu, decay, epochs (iterations = N//batch_sz * epochs), batch_sz and whether to display a figure are passed as optional variables. ''' X = X.astype(np.float32) Y = Y.astype(np.int32) Xvalid = Xvalid.astype(np.float32) Yvalid = Yvalid.astype(np.int32) self.rng = RandomStreams() # initialize hidden layers N, D = X.shape K = len(set(Y)) self.hidden_layers = [] M1 = D # first input layer is the number of features in X count = 0 for M2 in self.hidden_layer_sizes: h = HiddenLayer(M1, M2, count) # layer ID is just the number self.hidden_layers.append(h) M1 = M2 # input layer to next layer is this layer. count += 1 # output layer weights (last hidden layer to K output classes) W = np.random.randn(M1, K) * np.sqrt(2.0 / M1) b = np.zeros(K) self.W = theano.shared(W, 'W_logreg') self.b = theano.shared(b, 'b_logreg') # collect params for later use self.params = [self.W, self.b] for h in self.hidden_layers: self.params += h.params # set up theano functions and variables thX = T.matrix('X') thY = T.ivector('Y') pY_train = self.forward_train(thX) # function to calc prob Y given X # this cost is for training cost = -T.mean(T.log(pY_train[T.arange(thY.shape[0]), thY])) # gradients wrt each param grads = T.grad(cost, self.params) # for momentum ''' np.zeros_like(array) returns an array(/matrix) of the same shape and type of the given array. Very cool, never seen this before. ''' dparams = [ theano.shared(np.zeros_like(p.get_value())) for p in self.params ] # for rmsprop, initialize cache as 1 cache = [ theano.shared(np.ones_like(p.get_value())) for p in self.params ] ''' Noting for myself that I've never seen this way of using zip to loop through multiple lists/arays with the same indices simultaneously. Makes a lot of sense now, I should see where I can use this to turn loops over indices in my code in to list comprehension that is by ele. ''' # these are the functions for updating the variables of # dparams (momentum) and cache. new_cache = [ decay * c + (1 - decay) * g * g for p, c, g in zip(self.params, cache, grads) ] new_dparams = [ mu * dp - learning_rate * g / T.sqrt(new_c + 1e-10) for p, new_c, dp, g in zip(self.params, new_cache, dparams, grads) ] ''' Using zip to create lists of tuples of the variables themselves, and the fuctions for updating them (cache, momentum params and params), where params are weights (W) and biases (b) for each layer. ''' updates = [(c, new_c) for c, new_c in zip(cache, new_cache)] + [ (dp, new_dp) for dp, new_dp in zip(dparams, new_dparams) ] + [(p, p + new_dp) for p, new_dp in zip(self.params, new_dparams)] train_op = theano.function(inputs=[thX, thY], updates=updates) # for evaluation and prediction, more theano graph set-up with tensors # still no values yet in any of these. Training loop next! pY_predict = self.forward_predict(thX) cost_predict = -T.mean(T.log(pY_predict[T.arange(thY.shape[0]), thY])) prediction = self.predict(thX) cost_predict_op = theano.function(inputs=[thX, thY], outputs=[cost_predict, prediction]) n_batches = N // batch_sz costs = [] for i in range(epochs): X, Y = shuffle(X, Y) for j in range(n_batches): Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)] Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)] # theano function defined above that does all the work. 
# takes the data (like feed_dict in tf). The update calcs were # given to it above as a list for all layers. train_op(Xbatch, Ybatch) if j % 50 == 0: c, p = cost_predict_op(Xvalid, Yvalid) costs.append(c) e = error_rate(Yvalid, p) print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e) if show_fig: plt.plot(costs) plt.show()
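The new_cache and new_dparams expressions above implement RMSprop combined with momentum; a minimal numpy sketch of a single update for one parameter, with made-up numbers, may make the order of operations easier to follow.

import numpy as np

# One RMSprop-with-momentum step for a single parameter, mirroring the
# symbolic new_cache / new_dparams / updates expressions above.
decay, mu, learning_rate, eps = 0.9, 0.9, 1e-4, 1e-10
p = np.array([0.5, -0.3])   # parameter values (made up)
c = np.ones_like(p)         # rmsprop cache, initialized to 1 as above
dp = np.zeros_like(p)       # momentum term, initialized to 0
g = np.array([0.2, -0.1])   # gradient of the cost w.r.t. p (made up)

new_c = decay * c + (1 - decay) * g * g
new_dp = mu * dp - learning_rate * g / np.sqrt(new_c + eps)
p = p + new_dp              # same order as the theano updates list
print(p, new_c, new_dp)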
def sum_ll(self, y): """ Sum log-likelihood """ return T.sum(self.log_p_y_given_x[T.arange(y.shape[0]), y])
def build_model(shared_params, options): trng = RandomStreams(1234) drop_ratio = options['drop_ratio'] batch_size = options['batch_size'] n_dim = options['n_dim'] w_emb = shared_params['w_emb'] dropout = theano.shared(numpy.float32(0.)) image_feat = T.ftensor3('image_feat') # T x batch_size input_idx = T.imatrix('input_idx') input_mask = T.matrix('input_mask') # label is the TRUE label label = T.ivector('label') empty_word = theano.shared(value=np.zeros((1, options['n_emb']), dtype='float32'), name='empty_word') w_emb_extend = T.concatenate([empty_word, shared_params['w_emb']], axis=0) input_emb = w_emb_extend[input_idx] # get the transformed image feature h_0 = theano.shared(numpy.zeros((batch_size, n_dim), dtype='float32')) c_0 = theano.shared(numpy.zeros((batch_size, n_dim), dtype='float32')) if options['sent_drop']: input_emb = dropout_layer(input_emb, dropout, trng, drop_ratio) h_from_lstm, c_encode = lstm_layer(shared_params, input_emb, input_mask, h_0, c_0, options, prefix='sent_lstm') # pick the last one as encoder Y = fflayer(shared_params, image_feat, options, prefix='image_mlp', act_func=options.get('image_mlp_act', 'tanh')) r_0 = theano.shared(numpy.zeros((batch_size, n_dim), dtype='float32')) r = wbw_attention_layer(shared_params, Y, h_from_lstm, input_mask, r_0, options, return_final=True) h_star = T.tanh( T.dot(r, shared_params['W_p_w']) + T.dot(h_from_lstm[-1], shared_params['W_x_w'])) combined_hidden = fflayer(shared_params, h_star, options, prefix='scale_to_softmax', act_func='linear') # drop the image output prob = T.nnet.softmax(combined_hidden) prob_y = prob[T.arange(prob.shape[0]), label] pred_label = T.argmax(prob, axis=1) # sum or mean? cost = -T.mean(T.log(prob_y)) accu = T.mean(T.eq(pred_label, label)) return image_feat, input_idx, input_mask, \ label, dropout, cost, accu
def fit(self, X, Y, learning_rate=0.1, mu=0.99, reg=1.0, activation=T.tanh, epochs=100, show_fig=False): D = X[0].shape[1] # X is of size N x T(n) x D K = len(set(Y.flatten())) # K is the total number of classes N = len(Y) # N is the total number of individuals M = self.M self.f = activation # initial weights (3 Weights, 2 biases, 1 hidden state) Wx = init_weight(D, M) Wh = init_weight(M, M) bh = np.zeros(M) h0 = np.zeros(M) # initial hidden state Wo = init_weight(M, K) bo = np.zeros(K) # make them theano shared (3 Weights, 2 biases, 1 hidden state) self.Wx = theano.shared(Wx) self.Wh = theano.shared(Wh) self.bh = theano.shared(bh) self.h0 = theano.shared(h0) self.Wo = theano.shared(Wo) self.bo = theano.shared(bo) self.params = [self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo] thX = T.fmatrix( 'X' ) # 2-dimensional (T*D). In parity problem, D=1, e.g. thX = [[0],[1],[1],[0]] thY = T.ivector( 'Y') # thY in parity problem is also a sequence thY = [0,1,0,0] def recurrence(x_t, h_t1): # do calculation at each recurrent unit # returns h(t), y(t) h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh) y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo) return h_t, y_t [h, y], _ = theano.scan( # do calculation at each recurrent unit fn=recurrence, outputs_info=[self.h0, None], sequences=thX, # e.g. [[0],[1],[1],[0]] n_steps=thX.shape[0], # for each time ) py_x = y[:, 0, :] prediction = T.argmax(py_x, axis=1) # cost for one sequence cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY])) # cross-entropy loss grads = T.grad(cost, self.params) # gradient dparams = [theano.shared(p.get_value() * 0) for p in self.params] # momentum updates = [(p, p + mu * dp - learning_rate * g) for p, dp, g in zip(self.params, dparams, grads) ] + [(dp, mu * dp - learning_rate * g) for dp, g in zip(dparams, grads)] self.predict_op = theano.function( inputs=[thX], outputs=prediction) # forward feeding one time self.train_op = theano.function( inputs=[thX, thY], outputs=[cost, prediction, y], # forwardprop to get loss updates=updates # backprop to update params ) # total cost for all sequences costs = [] # main training loop for i in range(epochs): X, Y = shuffle(X, Y) n_correct = 0 cost = 0 for j in range( N ): # train one sequence at a time! So we do not specify batch_size, since batch_size = 1. c, p, rout = self.train_op( X[j], Y[j] ) # each time we train a sequence, we update all params. # print "p:", p cost += c if p[-1] == Y[ j, -1]: # only need to check the last element in p and Y for parity problem n_correct += 1 print("shape y:", rout.shape) print("i:", i, "cost:", cost, "accuracy:", (float(n_correct) / N)) costs.append(cost) if n_correct == N: break if show_fig: plt.plot(costs) plt.show()
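The indexing py_x = y[:, 0, :] works because each softmax call inside recurrence yields a 1 x K row, so scan stacks the per-step outputs into a T x 1 x K tensor. A small numpy sketch of that bookkeeping (arbitrary values):

import numpy as np

# scan stacks one (1, K) softmax row per time step, so y has shape
# (T, 1, K); [:, 0, :] recovers the (T, K) matrix of class probabilities.
T_steps, K = 4, 2
y = np.random.rand(T_steps, 1, K)
y = y / y.sum(axis=2, keepdims=True)  # rows sum to one, like softmax output
py_x = y[:, 0, :]                     # (T, K)
prediction = py_x.argmax(axis=1)      # one predicted class per time step
print(py_x.shape, prediction)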
def ind_ll(self, y): """ Individual log-likelihood """ return self.log_p_y_given_x[T.arange(y.shape[0]), y]
def fit(self, X, learning_rate=10e-1, mu=0.99, reg=1.0, activation=T.tanh, epochs=500, show_fig=False): N = len(X) D = self.D M = self.M V = self.V self.f = activation # initial weights We = init_weight(V, D) Wx = init_weight(D, M) Wh = init_weight(M, M) bh = np.zeros(M) h0 = np.zeros(M) Wo = init_weight(M, V) bo = np.zeros(V) # make them theano shared self.We = theano.shared(We) self.Wx = theano.shared(Wx) self.Wh = theano.shared(Wh) self.bh = theano.shared(bh) self.h0 = theano.shared(h0) self.Wo = theano.shared(Wo) self.bo = theano.shared(bo) self.params = [ self.We, self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo ] thX = T.ivector('X') Ei = self.We[thX] # will be a TxD matrix thY = T.ivector('Y') # sentence input: # [START, w1, w2, ..., wn] # sentence target: # [w1, w2, w3, ..., END] def recurrence(x_t, h_t1): # returns h(t), y(t) h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh) y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo) return h_t, y_t [h, y], _ = theano.scan( fn=recurrence, outputs_info=[self.h0, None], sequences=Ei, n_steps=Ei.shape[0], ) py_x = y[:, 0, :] prediction = T.argmax(py_x, axis=1) cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY])) grads = T.grad(cost, self.params) dparams = [theano.shared(p.get_value() * 0) for p in self.params] updates = [(p, p + mu * dp - learning_rate * g) for p, dp, g in zip(self.params, dparams, grads) ] + [(dp, mu * dp - learning_rate * g) for dp, g in zip(dparams, grads)] self.predict_op = theano.function(inputs=[thX], outputs=prediction) self.train_op = theano.function(inputs=[thX, thY], outputs=[cost, prediction], updates=updates) costs = [] n_total = sum((len(sentence) + 1) for sentence in X) for i in xrange(epochs): X = shuffle(X) n_correct = 0 cost = 0 for j in xrange(N): # problem! many words --> END token are overrepresented # result: generated lines will be very short # we will try to fix in a later iteration # BAD! magic numbers 0 and 1... input_sequence = [0] + X[j] output_sequence = X[j] + [1] # we set 0 to start and 1 to end c, p = self.train_op(input_sequence, output_sequence) # print "p:", p cost += c # print "j:", j, "c:", c/len(X[j]+1) for pj, xj in zip(p, output_sequence): if pj == xj: n_correct += 1 print "i:", i, "cost:", cost, "correct rate:", (float(n_correct) / n_total) costs.append(cost) if show_fig: plt.plot(costs) plt.show()
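The "magic numbers" 0 and 1 above assume a vocabulary mapping that reserves index 0 for START and index 1 for END; the helper below is a hypothetical sketch of how such a word2idx and the corresponding X could be built, not code from the original project.

def build_word2idx(sentences):
    # Reserve 0 for START and 1 for END, matching the magic numbers used
    # when building input_sequence / output_sequence above.
    word2idx = {'START': 0, 'END': 1}
    for sentence in sentences:
        for token in sentence:
            if token not in word2idx:
                word2idx[token] = len(word2idx)
    return word2idx

# Toy usage: sentences as token lists, mapped to the index lists that
# fit() expects (without the START/END markers themselves).
sentences = [['the', 'cat', 'sat'], ['the', 'dog', 'sat']]
word2idx = build_word2idx(sentences)
X = [[word2idx[w] for w in s] for s in sentences]
print(word2idx, X)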
def nll(self, y): """ Mean negative log-likelihood """ return -T.mean(self.log_p_y_given_x[T.arange(y.shape[0]), y])
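All three likelihood helpers above rely on the same [T.arange(y.shape[0]), y] gather: row i contributes the entry in column y[i]. A numpy sketch with made-up probabilities shows exactly what gets picked and how the mean negative log-likelihood follows:

import numpy as np

# log_p_y_given_x has one row per example and one column per class;
# pairing arange(N) with the label vector selects the log-probability of
# the true class of each example.
log_p_y_given_x = np.log(np.array([[0.7, 0.2, 0.1],
                                   [0.1, 0.8, 0.1],
                                   [0.3, 0.3, 0.4]]))
y = np.array([0, 1, 2])
picked = log_p_y_given_x[np.arange(y.shape[0]), y]
print(picked)          # log(0.7), log(0.8), log(0.4)
print(-picked.mean())  # mean negative log-likelihood, as in nll()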
def run_cnn(exp_name, dataset, embedding, log_fn, perf_fn, emb_dm=100, batch_size=100, filter_hs=[1, 2, 3], hidden_units=[200, 100, 11], type_hidden_units=[200, 100, 6], dropout_rate=0.5, shuffle_batch=True, n_epochs=300, lr_decay=0.95, activation=ReLU, sqr_norm_lim=9, non_static=True, print_freq=5, sen_reg=False, L2=False): """ Train and Evaluate CNN event encoder model :dataset: list containing three elements[(train_x, train_y), (valid_x, valid_y), (test_x, test_y)] :embedding: word embedding with shape (|V| * emb_dm) :filter_hs: filter height for each parallel cnn layer :dropout_rate: dropout rate for fully connected layers :n_epochs: the max number of iterations """ start_time = timeit.default_timer() rng = np.random.RandomState(1234) input_height = len(dataset[0][0][0][0]) num_sens = len(dataset[0][0][0]) print "--input height ", input_height input_width = emb_dm num_maps = hidden_units[0] ################### # start snippet 1 # ################### print "start to construct the model ...." x = T.tensor3("x") type_y = T.ivector("y_type") pop_y = T.ivector("y_pop") words = shared(value=np.asarray(embedding, dtype=theano.config.floatX), name="embedding", borrow=True) # define function to keep padding vector as zero zero_vector_tensor = T.vector() zero_vec = np.zeros(input_width, dtype=theano.config.floatX) set_zero = function([zero_vector_tensor], updates=[(words, T.set_subtensor(words[0, :], zero_vector_tensor))]) layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape( (x.shape[0] * x.shape[1], 1, x.shape[2], emb_dm)) ######################### # Construct Sen Vec ##### ######################### conv_layers = [] filter_shape = (num_maps, 1, filter_hs[0], emb_dm) pool_size = (input_height - filter_hs[0] + 1, 1) conv_layer = nn.ConvPoolLayer(rng, input=layer0_input, input_shape=None, filter_shape=filter_shape, pool_size=pool_size, activation=activation) # make the sentence vector matrix sen_vecs = conv_layer.output.reshape((x.shape[0] * x.shape[1], num_maps)) conv_layers.append(conv_layer) ######################## ## Task 1: population### ######################## pop_layer_sizes = zip(hidden_units, hidden_units[1:]) pop_layer_input = sen_vecs pop_drop_input = sen_vecs pop_hidden_outs = [] pop_drop_outs = [] pop_hidden_layers = [] pop_drop_layers = [] droprate = 0.5 for layer_size in pop_layer_sizes[:-1]: U_value = np.random.random(layer_size).astype(theano.config.floatX) b_value = np.zeros((layer_size[-1], ), dtype=theano.config.floatX) U = theano.shared(U_value, borrow=True, name="U") b = theano.shared(b_value, borrow=True, name="b") pop_hidden_layer = nn.HiddenLayer(rng, pop_layer_input, layer_size[0], layer_size[1], ReLU, U * (1 - droprate), b) pop_drop_hidden_layer = nn.DropoutHiddenLayer(rng, pop_drop_input, layer_size[0], layer_size[1], ReLU, droprate, U, b) pop_hidden_layers.append(pop_hidden_layer) pop_drop_layers.append(pop_drop_hidden_layer) pop_hidden_out = pop_hidden_layer.output pop_drop_out = pop_drop_hidden_layer.output pop_layer_input = pop_hidden_out pop_drop_input = pop_drop_out pop_hidden_outs.append(pop_hidden_out) pop_drop_outs.append(pop_drop_out) # construct pop classifier n_in, n_out = pop_layer_sizes[-1] W_value = np.random.random((n_in, n_out)).astype(theano.config.floatX) b_value = np.zeros((n_out, ), dtype=theano.config.floatX) pop_W = theano.shared(W_value, borrow=True, name="pop_W") pop_b = theano.shared(b_value, borrow=True, name="pop_b") pop_act = T.dot(pop_hidden_outs[-1], pop_W * (1 - droprate)) + pop_b pop_drop_act = T.dot(pop_drop_outs[-1], 
pop_W) + pop_b sen_pop_probs = T.nnet.softmax(pop_act) sen_drop_pop_probs = T.nnet.softmax(pop_drop_act) pop_probs = T.mean(sen_pop_probs.reshape((x.shape[0], x.shape[1], n_out)), axis=1) pop_drop_probs = T.mean(sen_drop_pop_probs.reshape( (x.shape[0], x.shape[1], n_out)), axis=1) pop_y_pred = T.argmax(pop_probs, axis=1) pop_drop_y_pred = T.argmax(pop_drop_probs, axis=1) pop_neg_loglikelihood = -T.mean( T.log(pop_probs)[T.arange(pop_y.shape[0]), pop_y]) pop_drop_neg_loglikelihood = -T.mean( T.log(pop_drop_probs)[T.arange(pop_y.shape[0]), pop_y]) pop_errors = T.mean(T.neq(pop_y_pred, pop_y)) pop_errors_detail = T.neq(pop_y_pred, pop_y) pop_cost = pop_neg_loglikelihood pop_drop_cost = pop_drop_neg_loglikelihood ######################## ## Task 2: event type### ######################## type_layer_sizes = zip(type_hidden_units, type_hidden_units[1:]) type_layer_input = sen_vecs type_drop_input = sen_vecs type_hidden_outs = [] type_drop_outs = [] type_hidden_layers = [] type_drop_layers = [] droprate = 0.5 for layer_size in type_layer_sizes[:-1]: U_value = np.random.random(layer_size).astype(theano.config.floatX) b_value = np.zeros((layer_size[-1], ), dtype=theano.config.floatX) U = theano.shared(U_value, borrow=True, name="U") b = theano.shared(b_value, borrow=True, name="b") type_hidden_layer = nn.HiddenLayer(rng, type_layer_input, layer_size[0], layer_size[1], ReLU, U * (1 - droprate), b) type_drop_hidden_layer = nn.DropoutHiddenLayer(rng, type_drop_input, layer_size[0], layer_size[1], ReLU, droprate, U, b) type_hidden_layers.append(type_hidden_layer) type_drop_layers.append(type_drop_hidden_layer) type_hidden_out = type_hidden_layer.output type_drop_out = type_drop_hidden_layer.output type_layer_input = type_hidden_out type_drop_input = type_drop_out type_hidden_outs.append(type_hidden_out) type_drop_outs.append(type_drop_out) # construct type classifier n_in, n_out = type_layer_sizes[-1] W_value = np.random.random((n_in, n_out)).astype(theano.config.floatX) b_value = np.zeros((n_out, ), dtype=theano.config.floatX) type_W = theano.shared(W_value, borrow=True, name="type_W") type_b = theano.shared(b_value, borrow=True, name="type_b") type_act = T.dot(type_hidden_outs[-1], type_W * (1 - droprate)) + type_b type_drop_act = T.dot(type_drop_outs[-1], type_W) + type_b #type_probs = T.nnet.softmax(type_max_act) #type_drop_probs = T.nnet.softmax(type_drop_max_act) sen_type_probs = T.nnet.softmax(type_act) sen_drop_type_probs = T.nnet.softmax(type_drop_act) type_probs = T.mean(sen_type_probs.reshape( (x.shape[0], x.shape[1], n_out)), axis=1) type_drop_probs = T.mean(sen_drop_type_probs.reshape( (x.shape[0], x.shape[1], n_out)), axis=1) type_y_pred = T.argmax(type_probs, axis=1) type_drop_y_pred = T.argmax(type_drop_probs, axis=1) type_neg_loglikelihood = -T.mean( T.log(type_probs)[T.arange(type_y.shape[0]), type_y]) type_drop_neg_loglikelihood = -T.mean( T.log(type_drop_probs)[T.arange(type_y.shape[0]), type_y]) type_errors = T.mean(T.neq(type_y_pred, type_y)) type_errors_detail = T.neq(type_y_pred, type_y) type_cost = type_neg_loglikelihood type_drop_cost = type_drop_neg_loglikelihood ################################## # Collect all the parameters ##### ################################## params = [] # convolution layer params for conv_layer in conv_layers: params += conv_layer.params # params for population task for layer in pop_drop_layers: params += layer.params params.append(pop_W) params.append(pop_b) # params for event type task for layer in type_drop_layers: params += layer.params 
params.append(type_W) params.append(type_b) if non_static: params.append(words) total_cost = pop_cost + type_cost total_drop_cost = pop_drop_cost + type_drop_cost if L2: l2_norm = 0.1 * T.sum(pop_W**2) + 0.1 * T.sum(type_W**2) for drop_layer in type_drop_layers: l2_norm += 0.1 * T.sum(drop_layer.W**2) for drop_layer in pop_drop_layers: l2_norm += 0.1 * T.sum(drop_layer.W**2) total_cost += l2_norm total_drop_cost += l2_norm total_grad_updates = sgd_updates_adadelta(params, total_drop_cost, lr_decay, 1e-6, sqr_norm_lim) total_preds = [pop_y_pred, type_y_pred] total_errors_details = [pop_errors_detail, type_errors_detail] total_out = total_preds + total_errors_details ##################### # Construct Dataset # ##################### print "Copy data to GPU and construct train/valid/test func" np.random.seed(1234) train_x, train_pop_y, train_type_y = shared_dataset(dataset[0]) valid_x, valid_pop_y, valid_type_y = shared_dataset(dataset[1]) test_x, test_pop_y, test_type_y = shared_dataset(dataset[2]) n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size)) n_valid_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size)) n_test_batches = int(np.ceil(1.0 * len(dataset[2][0]) / batch_size)) ##################### # Train model func # ##################### index = T.iscalar() train_func = function( [index], total_drop_cost, updates=total_grad_updates, givens={ x: train_x[index * batch_size:(index + 1) * batch_size], pop_y: train_pop_y[index * batch_size:(index + 1) * batch_size], type_y: train_type_y[index * batch_size:(index + 1) * batch_size] }) valid_train_func = function( [index], total_drop_cost, updates=total_grad_updates, givens={ x: valid_x[index * batch_size:(index + 1) * batch_size], pop_y: valid_pop_y[index * batch_size:(index + 1) * batch_size], type_y: valid_type_y[index * batch_size:(index + 1) * batch_size] }) test_pred_detail = function( [index], total_out, givens={ x: test_x[index * batch_size:(index + 1) * batch_size], pop_y: test_pop_y[index * batch_size:(index + 1) * batch_size], type_y: test_type_y[index * batch_size:(index + 1) * batch_size] }) # apply early stop strategy patience = 100 patience_increase = 2 improvement_threshold = 1.005 n_valid = len(dataset[1][0]) n_test = len(dataset[2][0]) epoch = 0 best_params = None best_validation_score = 0. test_perf = 0 done_loop = False log_file = open(log_fn, 'w') print "Start to train the model....." 
total_score = 0.0 while (epoch < n_epochs) and not done_loop: start_time = timeit.default_timer() epoch += 1 costs = [] for minibatch_index in np.random.permutation(range(n_train_batches)): cost_epoch = train_func(minibatch_index) costs.append(cost_epoch) set_zero(zero_vec) # do validation valid_cost = [ valid_train_func(i) for i in np.random.permutation(xrange(n_valid_batches)) ] if epoch % print_freq == 0: # do test pop_preds = [] type_preds = [] pop_errors = [] type_errors = [] pop_sens = [] type_sens = [] for i in xrange(n_test_batches): test_pop_pred, test_type_pred, test_pop_error, test_type_error = test_pred_detail( i) pop_preds.append(test_pop_pred) type_preds.append(test_type_pred) pop_errors.append(test_pop_error) type_errors.append(test_type_error) pop_preds = np.concatenate(pop_preds) type_preds = np.concatenate(type_preds) pop_errors = np.concatenate(pop_errors) type_errors = np.concatenate(type_errors) pop_perf = 1 - np.mean(pop_errors) type_perf = 1 - np.mean(type_errors) # dump the predictions and the chosen sentences with open( os.path.join(perf_fn, "%s_%d.pop_pred" % (exp_name, epoch)), 'w') as epf: for p in pop_preds: epf.write("%d\n" % int(p)) with open( os.path.join(perf_fn, "%s_%d.type_pred" % (exp_name, epoch)), 'w') as epf: for p in type_preds: epf.write("%d\n" % int(p)) message = "Epoch %d test pop perf %f, type perf %f, training_cost %f" % ( epoch, pop_perf, type_perf, np.mean(costs)) print message log_file.write(message + "\n") log_file.flush() if (pop_perf + type_perf) > total_score: total_score = pop_perf + type_perf # save the model model_name = os.path.join( perf_fn, "%s_%d.best_model" % (exp_name, epoch)) with open(model_name, 'wb') as mn: for param in params: cPickle.dump(param.get_value(), mn) end_time = timeit.default_timer() print "Finish one iteration using %f m" % ( (end_time - start_time) / 60.) # output the final model params print "Output the final model" model_name = os.path.join(perf_fn, "%s_%d.final_model" % (exp_name, epoch)) with open(model_name, 'wb') as mn: for param in params: cPickle.dump(param.get_value(), mn) log_file.flush() log_file.close()
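shared_dataset is not shown in this snippet; a common Theano idiom, and presumably what it does here, is to copy each array into a theano.shared variable (so minibatches can be sliced on the GPU via the givens dictionaries) and cast the label vectors to int32. The reimplementation below is a hedged sketch under that assumption, not the project's actual helper.

import numpy as np
import theano
import theano.tensor as T

def shared_dataset(data, borrow=True):
    # Hypothetical stand-in for the helper used above; data is assumed to
    # be an (x, pop_y, type_y) triple. Keeping everything in shared
    # variables lets the train/valid/test functions slice minibatches via
    # the `givens` dictionaries.
    x, pop_y, type_y = data
    shared_x = theano.shared(np.asarray(x, dtype=theano.config.floatX),
                             borrow=borrow)
    shared_pop_y = theano.shared(np.asarray(pop_y, dtype=theano.config.floatX),
                                 borrow=borrow)
    shared_type_y = theano.shared(np.asarray(type_y, dtype=theano.config.floatX),
                                  borrow=borrow)
    # labels live on the device as floats but are used as int32 symbolically
    return (shared_x,
            T.cast(shared_pop_y, 'int32'),
            T.cast(shared_type_y, 'int32'))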
def build(self): # description string: #words x #samples x = tensor.matrix('x', dtype=INT) x_mask = tensor.matrix('x_mask', dtype=FLOAT) y1 = tensor.matrix('y1', dtype=INT) y1_mask = tensor.matrix('y1_mask', dtype=FLOAT) y2 = tensor.matrix('y2', dtype=INT) y2_mask = tensor.matrix('y2_mask', dtype=FLOAT) self.inputs = OrderedDict() self.inputs['x'] = x self.inputs['x_mask'] = x_mask self.inputs['y1'] = y1 self.inputs['y2'] = y2 self.inputs['y1_mask'] = y1_mask self.inputs['y2_mask'] = y2_mask # for the backward rnn, we just need to invert x and x_mask xr = x[::-1] xr_mask = x_mask[::-1] n_timesteps = x.shape[0] n_timesteps_trg = y1.shape[0] n_timesteps_trgmult = y2.shape[0] n_samples = x.shape[1] # word embedding for forward rnn (source) emb = dropout(self.tparams['Wemb_enc'][x.flatten()], self.trng, self.emb_dropout, self.use_dropout) emb = emb.reshape([n_timesteps, n_samples, self.embedding_dim]) proj = get_new_layer(self.enc_type)[1](self.tparams, emb, prefix='encoder', mask=x_mask, layernorm=self.lnorm) # word embedding for backward rnn (source) embr = dropout(self.tparams['Wemb_enc'][xr.flatten()], self.trng, self.emb_dropout, self.use_dropout) embr = embr.reshape([n_timesteps, n_samples, self.embedding_dim]) projr = get_new_layer(self.enc_type)[1](self.tparams, embr, prefix='encoder_r', mask=xr_mask, layernorm=self.lnorm) # context will be the concatenation of forward and backward rnns ctx = [ tensor.concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim - 1) ] for i in range(1, self.n_enc_layers): ctx = get_new_layer(self.enc_type)[1](self.tparams, ctx[0], prefix='deepencoder_%d' % i, mask=x_mask, layernorm=self.lnorm) # Apply dropout ctx = dropout(ctx[0], self.trng, self.ctx_dropout, self.use_dropout) if self.init_cgru == 'text': # mean of the context (across time) will be used to initialize decoder rnn ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None] init_state = get_new_layer('ff')[1](self.tparams, ctx_mean, prefix='ff_state', activ='tanh') else: # Assume zero-initialized decoder init_state = tensor.alloc(0., n_samples, self.rnn_dim) # word embedding (target), we will shift the target sequence one time step # to the right. This is done because of the bi-gram connections in the # readout and decoder rnn. 
The first target will be all zeros and we will not condition on the last output. emb_lem = self.tparams['Wemb_dec_lem'][y1.flatten()] emb_lem = emb_lem.reshape( [n_timesteps_trg, n_samples, self.embedding_dim]) emb_lem_shifted = tensor.zeros_like(emb_lem) emb_lem_shifted = tensor.set_subtensor(emb_lem_shifted[1:], emb_lem[:-1]) emb_lem = emb_lem_shifted emb_fact = self.tparams['Wemb_dec_fact'][y2.flatten()] emb_fact = emb_fact.reshape( [n_timesteps_trgmult, n_samples, self.embedding_dim]) emb_fact_shifted = tensor.zeros_like(emb_fact) emb_fact_shifted = tensor.set_subtensor(emb_fact_shifted[1:], emb_fact[:-1]) emb_fact = emb_fact_shifted # Concat the 2 embeddings emb_prev = tensor.concatenate([emb_lem, emb_fact], axis=2) # decoder - pass through the decoder conditional gru with attention proj = get_new_layer('gru_cond')[1](self.tparams, emb_prev, prefix='decoder', mask=y1_mask, context=ctx, context_mask=x_mask, one_step=False, init_state=init_state, layernorm=False) # hidden states of the decoder gru proj_h = proj[0] # weighted averages of context, generated by attention module ctxs = proj[1] # weights (alignment matrix) self.alphas = proj[2] # compute word probabilities logit_gru = get_new_layer('ff')[1](self.tparams, proj_h, prefix='ff_logit_gru', activ='linear') logit_ctx = get_new_layer('ff')[1](self.tparams, ctxs, prefix='ff_logit_ctx', activ='linear') logit_lem = get_new_layer('ff')[1](self.tparams, emb_lem, prefix='ff_logit_lem', activ='linear') logit_fact = get_new_layer('ff')[1](self.tparams, emb_fact, prefix='ff_logit_fact', activ='linear') logit1 = dropout(tanh(logit_gru + logit_lem + logit_ctx), self.trng, self.out_dropout, self.use_dropout) logit2 = dropout(tanh(logit_gru + logit_fact + logit_ctx), self.trng, self.out_dropout, self.use_dropout) if self.tied_trg_emb is False: logit_trg = get_new_layer('ff')[1](self.tparams, logit1, prefix='ff_logit_trg', activ='linear') logit_trgmult = get_new_layer('ff')[1](self.tparams, logit2, prefix='ff_logit_trgmult', activ='linear') else: logit_trg = tensor.dot(logit1, self.tparams['Wemb_dec_lem'].T) logit_trgmult = tensor.dot(logit2, self.tparams['Wemb_dec_fact'].T) logit_trg_shp = logit_trg.shape logit_trgmult_shp = logit_trgmult.shape # Apply logsoftmax (stable version); note the results are negative log-probabilities log_trg_probs = -tensor.nnet.logsoftmax( logit_trg.reshape( [logit_trg_shp[0] * logit_trg_shp[1], logit_trg_shp[2]])) log_trgmult_probs = -tensor.nnet.logsoftmax( logit_trgmult.reshape([ logit_trgmult_shp[0] * logit_trgmult_shp[1], logit_trgmult_shp[2] ])) # cost y1_flat = y1.flatten() y2_flat = y2.flatten() y1_flat_idx = tensor.arange( y1_flat.shape[0]) * self.n_words_trg1 + y1_flat y2_flat_idx = tensor.arange( y2_flat.shape[0]) * self.n_words_trg2 + y2_flat cost_trg = log_trg_probs.flatten()[y1_flat_idx] cost_trg = cost_trg.reshape([n_timesteps_trg, n_samples]) cost_trg = (cost_trg * y1_mask).sum(0) cost_trgmult = log_trgmult_probs.flatten()[y2_flat_idx] cost_trgmult = cost_trgmult.reshape([n_timesteps_trgmult, n_samples]) cost_trgmult = (cost_trgmult * y2_mask).sum(0) cost = cost_trg + cost_trgmult self.f_log_probs = theano.function(list(self.inputs.values()), cost) # For alpha regularization return cost
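The y1_flat_idx / y2_flat_idx arithmetic is a flattened version of the row/column gather used elsewhere in this file: for a matrix with V columns, element (i, y_i) lives at position i * V + y_i of the flattened array. A quick numpy check of the equivalence:

import numpy as np

# Equivalence of the flat-index gather with direct 2-D fancy indexing.
N, V = 4, 5                      # 4 flattened (timestep, sample) pairs, 5 words
log_probs = np.random.rand(N, V)
y = np.array([3, 0, 4, 1])       # target word index for each pair

flat_idx = np.arange(N) * V + y
gathered_flat = log_probs.flatten()[flat_idx]
gathered_2d = log_probs[np.arange(N), y]
assert np.allclose(gathered_flat, gathered_2d)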
def step_backward(T2_lp1, char_lp1): char_l = T2_lp1[T.arange(N), T.cast(char_lp1, 'int32')] # N return T.cast(char_l, 'float32')
def fit(self, trees, learning_rate=3*1e-3, mu=0.99, reg=1e-4, epochs=15, activation=T.nnet.relu, train_inner_nodes=False): D = self.D V = self.V K = self.K self.f = activation N = len(trees) We = init_weight(V, D) Wh = np.random.randn(2, D, D) / np.sqrt(2 + D + D) bh = np.zeros(D) Wo = init_weight(D, K) bo = np.zeros(K) self.We = theano.shared(We) self.Wh = theano.shared(Wh) self.bh = theano.shared(bh) self.Wo = theano.shared(Wo) self.bo = theano.shared(bo) self.params = [self.We, self.Wh, self.bh, self.Wo, self.bo] words = T.ivector('words') parents = T.ivector('parents') relations = T.ivector('relations') labels = T.ivector('labels') def recurrence(n, hiddens, words, parents, relations): w = words[n] # any non-word will have index -1 # if T.ge(w, 0): # hiddens = T.set_subtensor(hiddens[n], self.We[w]) # else: # hiddens = T.set_subtensor(hiddens[n], self.f(hiddens[n] + self.bh)) hiddens = T.switch( T.ge(w, 0), T.set_subtensor(hiddens[n], self.We[w]), T.set_subtensor(hiddens[n], self.f(hiddens[n] + self.bh)) ) r = relations[n] # 0 = is_left, 1 = is_right p = parents[n] # parent idx # if T.ge(p, 0): # # root will have parent -1 # hiddens = T.set_subtensor(hiddens[p], hiddens[p] + hiddens[n].dot(self.Wh[r])) hiddens = T.switch( T.ge(p, 0), T.set_subtensor(hiddens[p], hiddens[p] + hiddens[n].dot(self.Wh[r])), hiddens ) return hiddens hiddens = T.zeros((words.shape[0], D)) h, _ = theano.scan( fn=recurrence, outputs_info=[hiddens], n_steps=words.shape[0], sequences=T.arange(words.shape[0]), non_sequences=[words, parents, relations], ) # shape of h that is returned by scan is TxTxD # because hiddens is TxD, and it does the recurrence T times # technically this stores T times too much data py_x = T.nnet.softmax(h[-1].dot(self.Wo) + self.bo) prediction = T.argmax(py_x, axis=1) rcost = reg*T.mean([(p*p).sum() for p in self.params]) if train_inner_nodes: # won't work for binary classification cost = -T.mean(T.log(py_x[T.arange(labels.shape[0]), labels])) + rcost else: # print "K is:", K # premean = T.log(py_x[-1]) # target = T.zeros(K) # target = T.set_subtensor(target[labels[-1]], 1) # cost = -T.mean(target * premean) cost = -T.mean(T.log(py_x[-1, labels[-1]])) + rcost
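The commented-out if T.ge(w, 0): blocks hint at why T.switch is used instead: the condition is a symbolic tensor with no concrete truth value at graph-construction time, so both branches must be built and selected symbolically. The numpy analogue below uses np.where to illustrate the same leaf-versus-internal-node selection (values are arbitrary):

import numpy as np

# np.where plays the role of T.switch here: both candidate values are
# computed and the condition selects between them element-wise, instead of
# branching with a Python `if` on a symbolic condition.
w = np.array([5, -1, 2])                    # word indices, -1 marks a non-word node
leaf_value = np.array([1.0, 1.0, 1.0])      # stand-in for self.We[w]
internal_value = np.array([0.3, 0.3, 0.3])  # stand-in for f(hiddens[n] + bh)
hidden = np.where(w >= 0, leaf_value, internal_value)
print(hidden)   # [1.0, 0.3, 1.0]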
def negative_log_likelihood(self, tensor_x, tensor_y): tensor_ypredproba = self.decision_function_tensor(tensor_x) return -T.mean( T.log(tensor_ypredproba)[T.arange(tensor_y.shape[0]), tensor_y])
def step(char_lm1, char_l, trans_probs_l): """Probability of going from char_lm1 to char_l using trans_probs_l tensor""" char_lm1 = T.cast(char_lm1, 'int32') char_l = T.cast(char_l, 'int32') return trans_probs_l[T.arange(N), char_lm1, char_l] # N
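Indexing trans_probs_l with three aligned vectors returns one transition probability per batch element, i.e. trans_probs_l[n, char_lm1[n], char_l[n]] for each n. A numpy sketch with a tiny batch and arbitrary probabilities:

import numpy as np

# Gather P(char_l | char_lm1) for each of N independent sequences from a
# per-position transition tensor of shape (N, n_chars, n_chars).
N, n_chars = 2, 3
trans_probs_l = np.random.rand(N, n_chars, n_chars)
trans_probs_l /= trans_probs_l.sum(axis=2, keepdims=True)
char_lm1 = np.array([0, 2])
char_l = np.array([1, 1])
p = trans_probs_l[np.arange(N), char_lm1, char_l]
assert np.allclose(p, [trans_probs_l[0, 0, 1], trans_probs_l[1, 2, 1]])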
def smoothedSeries(obs_series, init_probs, final_probs, tmat, emission): f_result, updates = theano.scan(fn = forwardIteration, outputs_info = init_probs, non_sequences = [tmat, emission], sequences = obs_series, n_steps = obs_series.shape[0]) f_r = T.concatenate((init_probs.dimshuffle('x',0), f_result)) b_result, b_updates = theano.scan(fn = backwardIteration, outputs_info = final_probs, non_sequences = [tmat, emission], sequences = obs_series, n_steps = obs_series.shape[0], go_backwards = True) b_r = T.concatenate((final_probs.dimshuffle('x',0), b_result)) s_result, s_updates = theano.scan(fn = smoothingIteration, outputs_info = None, non_sequences = [b_r, b_r.shape[0]], sequences = [f_r, T.arange(10000)]) return s_result
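The dimshuffle('x', 0) followed by T.concatenate simply prepends the initial (or final) probability vector as row 0 of the scan output, so f_r and b_r each have one more row than there are observations. The numpy equivalent:

import numpy as np

# init_probs.dimshuffle('x', 0) turns a length-K vector into a 1 x K row,
# which concatenate then stacks on top of the T x K scan result.
K, T_steps = 3, 4
init_probs = np.array([0.5, 0.3, 0.2])
f_result = np.random.rand(T_steps, K)
f_r = np.concatenate((init_probs[None, :], f_result))  # shape (T_steps + 1, K)
print(f_r.shape)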
def negative_log_likelihood_sum(self, y): return -T.sum(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
b_r = T.concatenate((final_probs_tensor.dimshuffle('x',0), b_result)) b_fn = theano.function(inputs = [final_probs_tensor, tmat_tensor, emission_tensor, obs_vec], outputs = b_r) #Now define a scan that computes the smoothed probabilities #To reverse the backward probabilities, we provide an index and the length #of dimension 1 of the backward prob matrix. #Is there a better way of reversing a sequence while iterating over #another forwards w/o an additional call to scan to reverse the original? def smoothingIteration(f_val, f_index, b_vals, max_index): x = b_vals[max_index - f_index-1] s = f_val*x return s / T.sum(s) s_result, s_updates = theano.scan(fn = smoothingIteration, outputs_info = None, non_sequences = [b_r, b_r.shape[0]], sequences = [f_r, T.arange(10000)]) s_fn = theano.function(inputs = [init_probs_tensor, final_probs_tensor, tmat_tensor, emission_tensor, obs_vec], outputs = s_result) #This function is supposed to obtain the smoothed state probabilities #from a vector passed as obs_series, given the initial probs, final #probs and emission matrix def smoothedSeries(obs_series, init_probs, final_probs, tmat, emission): f_result, updates = theano.scan(fn = forwardIteration, outputs_info = init_probs, non_sequences = [tmat, emission], sequences = obs_series, n_steps = obs_series.shape[0]) f_r = T.concatenate((init_probs.dimshuffle('x',0), f_result)) b_result, b_updates = theano.scan(fn = backwardIteration, outputs_info = final_probs, non_sequences = [tmat, emission], sequences = obs_series, n_steps = obs_series.shape[0], go_backwards = True) b_r = T.concatenate((final_probs.dimshuffle('x',0), b_result))
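smoothingIteration reverses the backward probabilities on the fly by indexing b_vals[max_index - f_index - 1] while scanning forward over f_r; in plain numpy the same pairing is just b_r[::-1], which is what the comment's question about reversing is getting at. A small check of the equivalence (arbitrary values):

import numpy as np

# Pair forward row i with backward row (len - 1 - i), i.e. the backward
# sequence read in reverse, then renormalize, as smoothingIteration does.
f_r = np.random.rand(5, 3)
b_r = np.random.rand(5, 3)

smoothed_loop = np.array([
    f * b_r[b_r.shape[0] - i - 1] / np.sum(f * b_r[b_r.shape[0] - i - 1])
    for i, f in enumerate(f_r)
])
s = f_r * b_r[::-1]
smoothed_vec = s / s.sum(axis=1, keepdims=True)
assert np.allclose(smoothed_loop, smoothed_vec)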
def fit(self, X, Y, learning_rate=0.1, mu=0.99, reg=1.0, activation=T.tanh, epochs=100, show_fig=False): D = X[0].shape[1] # X is of size N x T(n) x D K = len(set(Y.flatten())) N = len(Y) M = self.M self.f = activation # initial weights Wx = init_weight(D, M) Wh = init_weight(M, M) bh = np.zeros(M) h0 = np.zeros(M) Wo = init_weight(M, K) bo = np.zeros(K) # make them theano shared self.Wx = theano.shared(Wx) self.Wh = theano.shared(Wh) self.bh = theano.shared(bh) self.h0 = theano.shared(h0) self.Wo = theano.shared(Wo) self.bo = theano.shared(bo) self.params = [self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo] thX = T.fmatrix('X') thY = T.ivector('Y') def recurrence(x_t, h_t1): # returns h(t), y(t) h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh) y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo) return h_t, y_t [h, y], _ = theano.scan( fn=recurrence, outputs_info=[self.h0, None], sequences=thX, n_steps=thX.shape[0], ) py_x = y[:, 0, :] prediction = T.argmax(py_x, axis=1) cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY])) grads = T.grad(cost, self.params) dparams = [theano.shared(p.get_value() * 0) for p in self.params] updates = [(p, p + mu * dp - learning_rate * g) for p, dp, g in zip(self.params, dparams, grads) ] + [(dp, mu * dp - learning_rate * g) for dp, g in zip(dparams, grads)] self.predict_op = theano.function(inputs=[thX], outputs=prediction) self.train_op = theano.function(inputs=[thX, thY], outputs=[cost, prediction, y], updates=updates) costs = [] for i in range(epochs): X, Y = shuffle(X, Y) n_correct = 0 cost = 0 for j in range(N): c, p, rout = self.train_op(X[j], Y[j]) # print "p:", p cost += c if p[-1] == Y[j, -1]: n_correct += 1 print("shape y:", rout.shape) print("i:", i, "cost:", cost, "classification rate:", (float(n_correct) / N)) costs.append(cost) if n_correct == N: break if show_fig: plt.plot(costs) plt.show()