def spatial_2d_padding(x, padding=(1, 1), dim_ordering='th'):
    """Zero-pad the two spatial axes of a 4D tensor symmetrically.

    ``padding[0]`` zeros are added on each side of the first spatial axis
    and ``padding[1]`` zeros on each side of the second one.  The spatial
    axes are 2 and 3 for ``dim_ordering='th'`` and 1 and 2 for
    ``dim_ordering='tf'``.

    Raises
    ------
    Exception
        If ``dim_ordering`` is neither ``'th'`` nor ``'tf'``.
    """
    shape = x.shape
    if dim_ordering == 'th':
        first_axis, second_axis = 2, 3
    elif dim_ordering == 'tf':
        first_axis, second_axis = 1, 2
    else:
        raise Exception('Invalid dim_ordering: ' + dim_ordering)

    # Grow only the two spatial axes; the other two keep their size.
    padded_shape = [shape[0], shape[1], shape[2], shape[3]]
    padded_shape[first_axis] = shape[first_axis] + 2 * padding[0]
    padded_shape[second_axis] = shape[second_axis] + 2 * padding[1]

    # Window of the zero canvas that receives the original values.
    window = [slice(None)] * 4
    window[first_axis] = slice(padding[0], shape[first_axis] + padding[0])
    window[second_axis] = slice(padding[1], shape[second_axis] + padding[1])

    canvas = T.zeros(tuple(padded_shape))
    return T.set_subtensor(canvas[tuple(window)], x)
def initial_states(self, batch_size, *args, **kwargs):
    r"""Build zero-valued initial states for an application call.

    The default implementation assumes the recurrent application method
    is named `apply`.  The state names are read from `apply.states` and
    one zero tensor is produced per state.
    :class:`SimpleRecurrent`, :class:`LSTM` and :class:`GatedRecurrent`
    override this method with trainable initial states initialized with
    zeros.

    Parameters
    ----------
    batch_size : int
        The batch size.
    \*args
        The positional arguments of the application call.
    \*\*kwargs
        The keyword arguments of the application call.

    Returns
    -------
    list
        One tensor per state: shape ``(batch_size,)`` when the state's
        dimension is 0, otherwise ``(batch_size, dim)``.

    """
    def zeros_for(state_name):
        width = self.get_dim(state_name)
        shape = (batch_size,) if width == 0 else (batch_size, width)
        return tensor.zeros(shape)

    return [zeros_for(state_name) for state_name in self.apply.states]
def initial_glimpses(self, batch_size, attended):
    """Return the list of four initial glimpse tensors.

    In order: a zero matrix of shape ``(batch_size, self.attended_dim)``;
    the same indicator matrix twice, of shape
    ``(batch_size, attended.shape[0])`` with ones in column 0 and zeros
    elsewhere; and an ``int64`` zero vector of length ``batch_size``.
    """
    zero_glimpse = tensor.zeros((batch_size, self.attended_dim))
    first_column_indicator = tensor.concatenate(
        [tensor.ones((batch_size, 1)),
         tensor.zeros((batch_size, attended.shape[0] - 1))],
        axis=1)
    zero_counter = tensor.zeros((batch_size,), dtype='int64')
    # The indicator is deliberately the same variable in both slots.
    return [zero_glimpse,
            first_column_indicator,
            first_column_indicator,
            zero_counter]
def lllistool(i, inp, func):
    """Build layer ``i``: a Linear projection optionally followed by ``func``.

    A ``Linear`` brick maps ``DIMS[i]`` inputs to
    ``DIMS[i+1] * NUMS[i+1]`` outputs and is initialized immediately.
    What happens next depends on ``func``:

    * ``SimpleRecurrent`` -- a rectifier recurrent brick is applied to the
      projection.
    * ``LSTM`` -- a tanh LSTM is applied with zero initial states and
      cells; only the first returned value is kept.
    * ``SequenceGenerator`` -- a generator is constructed but not applied;
      ``None`` is returned.
    * ``None`` -- the bare projection is returned.
    * anything else -- ``func()`` is instantiated and applied to the
      projection.

    Parameters
    ----------
    i : int
        Index into the module-level ``DIMS`` / ``NUMS`` tables.
    inp
        Symbolic input fed to the projection.
    func
        Brick class (or ``None``) selecting the layer type.

    Returns
    -------
    The symbolic output of the layer, or ``None`` for
    ``SequenceGenerator``.
    """
    if func is LSTM:
        # NOTE(review): this mutates the module-level NUMS table, so calling
        # the function twice for the same ``i`` compounds the factor.
        NUMS[i + 1] *= 4

    fan = DIMS[i]
    if func is SimpleRecurrent or func is LSTM:
        fan = DIMS[i] + DIMS[i + 1]
    std = fan ** (-0.5)

    projection = Linear(input_dim=DIMS[i],
                        output_dim=DIMS[i + 1] * NUMS[i + 1],
                        weights_init=IsotropicGaussian(std=std),
                        biases_init=IsotropicGaussian(std=std),
                        name='Lin{}'.format(i))
    projection.initialize()

    if func is SimpleRecurrent:
        gong = func(dim=DIMS[i + 1], activation=Rectifier(),
                    weights_init=IsotropicGaussian(std=std))
        gong.initialize()
        ret = gong.apply(projection.apply(inp))
    elif func is LSTM:
        gong = func(dim=DIMS[i + 1], activation=Tanh(),
                    weights_init=IsotropicGaussian(std=std))
        gong.initialize()
        # NOTE(review): looks like a leftover debug print; kept so the
        # observable output of the function does not change.
        print(inp)
        ret, _ = gong.apply(
            projection.apply(inp),
            T.zeros((inp.shape[1], DIMS[i + 1])),
            T.zeros((inp.shape[1], DIMS[i + 1])),
        )
    elif func is SequenceGenerator:
        # The generator is built but never applied; construction is kept in
        # case it has side effects the caller relies on.
        gong = func(
            readout=None,
            transition=SimpleRecurrent(
                dim=100, activation=Rectifier(),
                weights_init=IsotropicGaussian(std=0.1)))
        ret = None
    elif func is None:
        ret = projection.apply(inp)
    else:
        gong = func()
        ret = gong.apply(projection.apply(inp))
    return ret
def pad(inp, padding):
    """Zero-pad a 4D or 5D tensor.

    ``padding`` is a sequence of ``[before, after]`` pairs.  For a 4D
    input it holds two pairs applied to axes 2 and 3.  For a 5D input it
    holds three pairs: the first two are applied to axes 3 and 4, the
    third to axis 1.  When every padding value is zero the input is
    returned untouched.

    Raises
    ------
    NotImplementedError
        If ``inp`` is neither 4- nor 5-dimensional.
    """
    if all(padval == 0 for padval in pyk.flatten(padding)):
        return inp

    rank = inp.ndim
    if rank not in (4, 5):
        raise NotImplementedError("Padding is only implemented for 4 and 5 dimensional tensors.")

    def interior(padval):
        # Region of the padded axis that receives the original data; an
        # 'after' padding of 0 must map to an open-ended slice.
        return slice(padval[0], -padval[1] if padval[1] != 0 else None)

    dims = inp.shape
    if rank == 4:
        canvas = T.zeros(shape=(dims[0],
                                dims[1],
                                dims[2] + sum(padding[0]),
                                dims[3] + sum(padding[1])))
        y_win, x_win = [interior(padval) for padval in padding]
        return T.set_subtensor(canvas[:, :, y_win, x_win], inp)

    canvas = T.zeros(shape=(dims[0],
                            dims[1] + sum(padding[2]),
                            dims[2],
                            dims[3] + sum(padding[0]),
                            dims[4] + sum(padding[1])))
    y_win, x_win, z_win = [interior(padval) for padval in padding]
    return T.set_subtensor(canvas[:, z_win, :, y_win, x_win], inp)
def plotUpdate(self, updates):
    '''
    Summarize the size of the pending update of each parameter.

    For every layer ``i`` in ``self.layers[:self.deep]`` the difference
    ``updates[param] - param`` is reduced with max, min and mean and
    stored at slot ``2*i`` (weights ``w``) and ``2*i+1`` (biases ``b``).
    The last slot, ``2*self.deep``, holds the statistics for
    ``self.classifier.w``.

    >>>type updates: dict
    >>>para updates: update dictionary mapping each parameter to its new value

    >>>return: list ``[maxdict, mindict, meandict]`` of symbolic vectors of
               length ``2*self.deep + 1``
    '''
    n_slots = self.deep * 2 + 1

    # (slot index, parameter) pairs in the order the slots are laid out.
    tracked = []
    for i in range(self.deep):  # range works on both Python 2 and 3
        tracked.append((2 * i, self.layers[i].w))
        tracked.append((2 * i + 1, self.layers[i].b))
    tracked.append((self.deep * 2, self.classifier.w))

    reducers = (T.max, T.min, T.mean)
    stats = [T.zeros(shape=(n_slots,)) for _ in reducers]
    for slot, param in tracked:
        delta = updates[param] - param
        stats = [T.set_subtensor(vec[slot], reduce_fn(delta))
                 for vec, reduce_fn in zip(stats, reducers)]
    return stats
def __init__(self, rng, input, mask, n_in, n_h):
    """Create the LSTM parameters and wire up the recurrence.

    Twelve shared variables are created: input weights ``W_*`` from
    ``gauss_weight(rng, n_in, n_h)``, recurrent weights ``U_*`` from
    ``gauss_weight(rng, n_h)`` and zero biases ``b_*`` of length ``n_h``,
    one of each for the gates ``i``, ``f``, ``c`` and ``o``.  The
    recurrence ``self._step`` is scanned over ``mask`` and ``input`` with
    two zero initial states of shape ``(input.shape[1], n_h)``; the first
    scan output is exposed as ``self.output``.
    """
    def input_weight(name):
        return theano.shared(gauss_weight(rng, n_in, n_h), name, borrow=True)

    def recurrent_weight(name):
        return theano.shared(gauss_weight(rng, n_h), name, borrow=True)

    def zero_bias(name):
        return theano.shared(numpy.zeros((n_h,), dtype=config.floatX),
                             name, borrow=True)

    # Creation order is kept (all W, then all U) so the draws taken from
    # ``rng`` land on the same parameters.
    self.W_i = input_weight('W_i')
    self.W_f = input_weight('W_f')
    self.W_c = input_weight('W_c')
    self.W_o = input_weight('W_o')

    self.U_i = recurrent_weight('U_i')
    self.U_f = recurrent_weight('U_f')
    self.U_c = recurrent_weight('U_c')
    self.U_o = recurrent_weight('U_o')

    self.b_i = zero_bias('b_i')
    self.b_f = zero_bias('b_f')
    self.b_c = zero_bias('b_c')
    self.b_o = zero_bias('b_o')

    self.params = [
        self.W_i, self.W_f, self.W_c, self.W_o,
        self.U_i, self.U_f, self.U_c, self.U_o,
        self.b_i, self.b_f, self.b_c, self.b_o,
    ]

    n_batch = input.shape[1]
    initial_states = [T.zeros((n_batch, n_h)), T.zeros((n_batch, n_h))]
    scan_outputs, _ = theano.scan(self._step,
                                  sequences=[mask, input],
                                  outputs_info=initial_states)
    # self.output is in the format (length, batchsize, n_h)
    self.output = scan_outputs[0]
def get_output(self, train=False):
    """Run the recurrence over the layer input and select what to return.

    The layer input is shuffled with ``dimshuffle(1, 0, 2)`` so that
    ``scan`` iterates over its leading axis, then ``self._step`` is scanned
    with three zero initial values of widths ``input_dim``, ``output_dim``
    and ``causes_dim``.

    ``self.return_mode`` selects the result: ``'states'`` uses scan output
    1, ``'causes'`` uses scan output 2 and ``'both'`` concatenates the two
    along the last axis.  For ``'states'`` and ``'causes'`` the result is
    shuffled back and returned whole when ``self.return_sequences`` is
    true, otherwise only the last step is returned.

    Raises
    ------
    ValueError
        If ``self.return_mode`` is not one of the three values above.
    """
    seq = self.get_input().dimshuffle(1, 0, 2)
    projected = T.dot(seq, self.V)

    n_batch = seq.shape[1]
    initial = [T.zeros((n_batch, width))
               for width in (self.input_dim, self.output_dim, self.causes_dim)]

    outputs, _ = scan(
        self._step,
        sequences=[seq, projected],
        outputs_info=initial,
        non_sequences=self.params,
        truncate_gradient=self.truncate_gradient)

    mode = self.return_mode
    if mode == 'both':
        # NOTE(review): this path returns the scan-ordered tensor directly
        # and ignores ``return_sequences`` -- confirm that is intended.
        return T.concatenate([outputs[1], outputs[2]], axis=-1)
    if mode not in ('states', 'causes'):
        raise ValueError("return_model {0} not valid. Choose "
                         "'both', 'states' or 'causes'".format(
                             self.return_mode))

    out = outputs[1] if mode == 'states' else outputs[2]
    return out.dimshuffle(1, 0, 2) if self.return_sequences else out[-1]
def initial_glimpses(self, name, batch_size, sequence):
    """Return the zero initial value for the glimpse called ``name``.

    ``"glimpses"`` yields a ``(batch_size, self.sequence_dim)`` matrix and
    ``"weights"`` a ``(batch_size, sequence.shape[0])`` matrix.

    Raises
    ------
    ValueError
        For any other ``name``.
    """
    if name not in ("glimpses", "weights"):
        raise ValueError("Unknown glimpse name {}".format(name))
    width = self.sequence_dim if name == "glimpses" else sequence.shape[0]
    return tensor.zeros((batch_size, width))
def function(self, input_tensor):
    """Run four LSTM sweeps over ``input_tensor`` and concatenate them.

    Each sweep scans the LSTM wrapper with zero initial hidden and cell
    states of shape ``(input_tensor.shape[1], self.output_neurons)``.  The
    sweeps use, in order, ``d_forward``, ``d_backward``, ``u_forward`` and
    ``u_backward``; the last two iterate the input in reverse
    (``go_backwards=True``) and their outputs are flipped back along the
    first axis before concatenation on axis 2.
    """
    n_batch = input_tensor.shape[1]
    init_hs = T.zeros((n_batch, self.output_neurons))
    init_cs = T.zeros((n_batch, self.output_neurons))

    def sweep(weights, go_forwards, reverse_input):
        # One scan over the input; only the first scan output is kept.
        outputs, _ = theano.scan(
            fn=lambda a, b, c: self.__lstm_wrapper(
                a, b, c, weights, go_forwards=go_forwards),
            outputs_info=[init_hs, init_cs],
            sequences=input_tensor,
            non_sequences=None,
            go_backwards=reverse_input)
        return outputs[0]

    down_fwd = sweep(self.d_forward, True, False)
    down_bwd = sweep(self.d_backward, False, False)
    up_fwd = sweep(self.u_forward, True, True)
    up_bwd = sweep(self.u_backward, False, True)

    return T.concatenate(
        (down_fwd, down_bwd, up_fwd[::-1], up_bwd[::-1]), axis=2)
def __init__(self, n_in, n_out, layers, decoder=linear.Linear, itype='int32',
             solver=solvers.RMSprop(0.01)):
    """Build a stacked-LSTM next-step predictor and compile its functions.

    ``self.data`` is a symbolic matrix; its rows ``[:-1]`` are the inputs
    and its rows ``[1:]`` the targets.  One ``lstm.LSTM`` is stacked per
    entry of ``layers`` (the entry is the layer width), followed by
    ``decoder(width, n_out)`` and a softmax.

    Two functions are compiled: ``self._loss`` (masked summed
    cross-entropy, masked number of correct predictions, mask count) and
    ``self._activations`` (every LSTM layer output plus the softmax).

    NOTE(review): the default ``solver`` instance is created once at
    definition time and therefore shared by every model that relies on the
    default -- confirm the solver holds no per-model state.
    """
    self.data = T.matrix(dtype=itype)
    self.x = self.data[:-1]  # inputs: all rows but the last
    self.y = self.data[1:]   # targets: all rows but the first
    self.mask = T.matrix(dtype='int32')
    self.weights = []
    self.y_layers = []

    _, batch = self.x.shape
    hidden = self.x
    width = n_in
    for next_width in layers:
        cell = lstm.LSTM(width, next_width)
        self.weights.append(cell.weights)
        initial_y = T.zeros((batch, next_width))
        initial_c = T.zeros((batch, next_width))
        hidden, _ = cell.scanl(initial_y, initial_c, hidden)
        self.y_layers.append(hidden)
        width = next_width

    decode = decoder(width, n_out)
    self.weights.append(decode.weights)
    self.yh = softmax.softmax(decode(hidden))

    # The mask is aligned with the targets, hence the [1:] slice.
    target_mask = self.mask[1:]
    self.loss_t = T.sum(crossent.crossent(self.yh, self.y) * target_mask)
    self.correct = T.sum(
        T.eq(T.argmax(self.yh, axis=2), self.y) * target_mask)
    self.count = T.sum(target_mask)
    self.solver = solver

    # compile theano functions
    self._loss = theano.function(
        [self.data, self.mask],
        [self.loss_t, self.correct, self.count])
    self._activations = theano.function(
        [self.data],
        self.y_layers + [self.yh],
        givens={self.x: self.data})
def best_path_decode(self, scorematrix, scorematrix_mask=None, blank_symbol=None):
    """
    Best-path decoding: take the arg-max label at every timestep, then
    collapse adjacent duplicates and drop the blank symbol.

    :param scorematrix: (T, C+1, B)
    :param scorematrix_mask: (T, B); all ones when omitted
    :param blank_symbol: = C by default (last index of the class axis)
    :return: resultseq (T, B), resultseq_mask(T, B); unused positions of
             resultseq hold -1 and the matching mask entries hold 0

    Speed much slower than pure python version (normally ~40 times on HTR tasks)
    """
    best = tensor.argmax(scorematrix, axis=1)  # (T, B)
    n_steps, n_classes, n_batch = scorematrix.shape
    score_dtype = scorematrix.dtype

    decoded = tensor.zeros([n_steps, n_batch], dtype=score_dtype) - 1
    decoded_mask = tensor.zeros([n_steps, n_batch], dtype=score_dtype)
    if blank_symbol is None:
        blank_symbol = n_classes - 1
    if scorematrix_mask is None:
        scorematrix_mask = tensor.ones([n_steps, n_batch], dtype=score_dtype)

    def decode_column(labels, labels_mask, column, acc, acc_mask, blank):
        # Handles one batch item per scan step and writes its collapsed
        # label sequence into column ``column`` of the accumulators.
        valid_len = tensor.cast(labels_mask.sum(), 'int32')
        collapsed = self._remove_adjdup(labels[0:valid_len])
        collapsed = self._remove_value(collapsed, blank)
        out_len = collapsed.size
        acc = tensor.set_subtensor(acc[0:out_len, column], collapsed)
        acc_mask = tensor.set_subtensor(acc_mask[0:out_len, column],
                                        tensor.ones_like(collapsed))
        return column + 1, acc, acc_mask

    scan_out, _ = theano.scan(fn=decode_column,
                              sequences=[best.T, scorematrix_mask.T],
                              outputs_info=[0, decoded, decoded_mask],
                              non_sequences=[blank_symbol],
                              name='decode_scan')
    # The accumulators are complete after the last scan step.
    return scan_out[1][-1], scan_out[2][-1]
def calc_CER(self, resultseq, targetseq, resultseq_mask=None, targetseq_mask=None):
    """
    Character error rate (CER) of the CTC decoding output 'resultseq'
    against the ground truth 'targetseq'.

    :param resultseq (T1, B)
    :param resultseq_mask (T1, B); all ones when omitted
    :param targetseq (T2, B)
    :param targetseq_mask (T2, B); all ones when omitted
    :return: (CER, TE, TG) -- TE is the edit distance summed over the
             batch, TG the summed target mask, and CER = TE / TG
    """
    if resultseq_mask is None:
        resultseq_mask = tensor.ones_like(resultseq)
    if targetseq_mask is None:
        targetseq_mask = tensor.ones_like(targetseq)

    def accumulate(result_col, target_col, result_col_mask, target_col_mask,
                   total_errors, total_length):
        # One batch item per step: trim both sequences to their masked
        # lengths before measuring the edit distance.
        result_len = tensor.cast(result_col_mask.sum(), 'int32')
        target_len = tensor.cast(target_col_mask.sum(), 'int32')
        distance = self._editdist(result_col[0:result_len],
                                  target_col[0:target_len])
        return total_errors + distance, total_length + target_col_mask.sum()

    scan_out, _ = theano.scan(
        fn=accumulate,
        sequences=[resultseq.T, targetseq.T,
                   resultseq_mask.T, targetseq_mask.T],
        outputs_info=[tensor.zeros(1), tensor.zeros(1)],
        name='calc_CER')

    TE = scan_out[0][-1]
    TG = scan_out[1][-1]
    return TE / TG, TE, TG
def get_coefficients(self):
    """Return the coefficients of the product of ``term1`` and ``term2``.

    Each term exposes six coefficient vectors, read here as
    ``(a_real, c_real, a_comp, b_comp, c_comp, d_comp)``.  Every pairing
    of a component of ``term1`` with a component of ``term2`` contributes
    one (real x real, real x complex) or two (complex x complex) entries
    to the result.

    Amplitudes multiply and decay rates add.  The real x real rate is
    therefore ``c1 + c2``, in line with the real x complex and
    complex x complex branches; it used to be computed as a product.

    Returns
    -------
    list
        Six flattened, concatenated vectors in the order
        ``(a_real, c_real, a_comp, b_comp, c_comp, d_comp)``.
    """
    c1 = self.term1.coefficients
    c2 = self.term2.coefficients

    def pair_prod(u, v):
        # All pairwise products u[i] * v[j], flattened.
        return tt.flatten(u[:, None] * v[None, :])

    def pair_sum(u, v):
        # All pairwise sums u[i] + v[j], flattened.
        return tt.flatten(u[:, None] + v[None, :])

    def pair_diff(u, v):
        # All pairwise differences u[i] - v[j], flattened.
        return tt.flatten(u[:, None] - v[None, :])

    # real * real
    ar = [pair_prod(c1[0], c2[0])]
    cr = [pair_sum(c1[1], c2[1])]  # rates add under a product of exponentials

    # real * complex (both orderings); the real factor carries no frequency,
    # hence the zeros added to the d coefficient.
    ac = [pair_prod(c1[0], c2[2]), pair_prod(c2[0], c1[2])]
    bc = [pair_prod(c1[0], c2[3]), pair_prod(c2[0], c1[3])]
    cc = [pair_sum(c1[1], c2[4]), pair_sum(c2[1], c1[4])]
    dc = [pair_sum(tt.zeros_like(c1[1]), c2[5]),
          pair_sum(tt.zeros_like(c2[1]), c1[5])]

    # complex * complex: each pair yields a difference-frequency and a
    # sum-frequency component.
    aj, bj, cj, dj = c1[2:]
    ak, bk, ck, dk = c2[2:]
    aj_ak = aj[:, None] * ak[None, :]
    bj_bk = bj[:, None] * bk[None, :]
    bj_ak = bj[:, None] * ak[None, :]
    aj_bk = aj[:, None] * bk[None, :]

    ac.append(tt.flatten(0.5 * (aj_ak + bj_bk)))
    bc.append(tt.flatten(0.5 * (bj_ak - aj_bk)))
    cc.append(pair_sum(cj, ck))
    dc.append(pair_diff(dj, dk))

    ac.append(tt.flatten(0.5 * (aj_ak - bj_bk)))
    bc.append(tt.flatten(0.5 * (bj_ak + aj_bk)))
    cc.append(pair_sum(cj, ck))
    dc.append(pair_sum(dj, dk))

    return [
        tt.concatenate(vals, axis=0)
        if len(vals)
        else tt.zeros(0, dtype=self.dtype)
        for vals in (ar, cr, ac, bc, cc, dc)
    ]
# NOTE(review): the next statement is the tail of a definition that begins
# outside this view; it is reproduced unchanged.
    return y1, y2, y3, y4, y5


# Symbolic inputs: X is a 4D tensor fed to the discriminator, Z0 a matrix
# fed to the generator.
X = T.tensor4()
Z0 = T.matrix()

# draw samples from the generator
gX = gen(Z0, gwx)

# feed real data and generated data through discriminator
# (both results are iterated below, so `discrim` yields several outputs)
p_real = discrim(X)
p_gen = discrim(gX)

# compute costs based on discriminator output for real/generated data
# Each cost sums `bce` over every discriminator output: real samples are
# scored against ones, generated samples against zeros, and the generator
# is scored against ones on its own samples.
# (presumably `bce` is binary cross-entropy -- confirm against its definition)
d_cost_real = sum([bce(p, T.ones(p.shape)).mean() for p in p_real])
d_cost_gen = sum([bce(p, T.zeros(p.shape)).mean() for p in p_gen])
g_cost_d = sum([bce(p, T.ones(p.shape)).mean() for p in p_gen])

# Earlier variant that used only the last discriminator output:
# d_cost_real = bce(p_real[-1], T.ones(p_real[-1].shape)).mean()
# d_cost_gen = bce(p_gen[-1], T.zeros(p_gen[-1].shape)).mean()
# g_cost_d = bce(p_gen[-1], T.ones(p_gen[-1].shape)).mean()

# Add a sum-of-squares penalty on the parameters, weighted by 1e-5.
d_cost = d_cost_real + d_cost_gen + (
    1e-5 * sum([T.sum(p**2.0) for p in discrim_params]))
g_cost = g_cost_d + (1e-5 * sum([T.sum(p**2.0) for p in gen_params]))

cost = [g_cost, d_cost, g_cost_d, d_cost_real, d_cost_gen]

# Both Adam updaters are built from the same `lrt` variable.
lrt = sharedX(lr)
d_updater = updates.Adam(lr=lrt, b1=b1, regularizer=updates.Regularizer(l2=l2))
g_updater = updates.Adam(lr=lrt, b1=b1, regularizer=updates.Regularizer(l2=l2))
def fit(self, trees, learning_rate=3 * 1e-3, mu=0.99, reg=1e-4, epochs=15,
        activation=T.nnet.relu, train_inner_nodes=False):
    """Train the recursive network on ``trees`` and plot the epoch costs.

    Parameters
    ----------
    trees : sequence
        Each item is a ``(words, parents, relations, labels)`` tuple of
        integer sequences describing one tree.  A node with
        ``words[n] >= 0`` takes its hidden vector from the embedding
        ``We``; other nodes apply the activation to what their children
        accumulated.  A node with ``parents[n] >= 0`` adds its hidden
        vector, transformed by ``Wh[relations[n]]``, to its parent.
    learning_rate, mu : float
        Accepted for interface compatibility.
        NOTE(review): neither is used; ``adagrad`` is called with a fixed
        ``lr=1e-4``.
    reg : float
        Weight of the squared-parameter penalty added to the cost.
    epochs : int
        Number of passes over ``trees``.
    activation : callable
        Nonlinearity applied at inner nodes; stored as ``self.f``.
    train_inner_nodes : bool
        When true the cost and the accuracy cover every node of a tree,
        otherwise only the last node.

    Side effects: creates the shared parameters, compiles
    ``self.cost_predict_op`` and ``self.train_op``, writes progress to
    stdout and shows a matplotlib plot of the per-epoch costs.
    """
    D = self.D
    V = self.V
    K = self.K
    self.f = activation
    N = len(trees)

    We = init_weight(V, D)
    Wh = np.random.randn(2, D, D) / np.sqrt(2 + D + D)
    bh = np.zeros(D)
    Wo = init_weight(D, K)
    bo = np.zeros(K)

    self.We = theano.shared(We)
    self.Wh = theano.shared(Wh)
    self.bh = theano.shared(bh)
    self.Wo = theano.shared(Wo)
    self.bo = theano.shared(bo)
    self.params = [self.We, self.Wh, self.bh, self.Wo, self.bo]

    words = T.ivector('words')
    parents = T.ivector('parents')
    relations = T.ivector('relations')
    labels = T.ivector('labels')

    def recurrence(n, hiddens, words, parents, relations):
        w = words[n]
        # Leaf (w >= 0): look up the embedding.  Inner node: squash what
        # the children have accumulated so far.
        hiddens = T.switch(
            T.ge(w, 0),
            T.set_subtensor(hiddens[n], self.We[w]),
            T.set_subtensor(hiddens[n], self.f(hiddens[n] + self.bh)))

        r = relations[n]
        p = parents[n]
        # Push this node's contribution to its parent, if it has one.
        hiddens = T.switch(
            T.ge(p, 0),
            T.set_subtensor(hiddens[p],
                            hiddens[p] + hiddens[n].dot(self.Wh[r])),
            hiddens)
        return hiddens

    hiddens = T.zeros((words.shape[0], D))

    h, _ = theano.scan(
        fn=recurrence,
        outputs_info=[hiddens],
        n_steps=words.shape[0],
        sequences=T.arange(words.shape[0]),
        non_sequences=[words, parents, relations],
    )

    py_x = T.nnet.softmax(h[-1].dot(self.Wo) + self.bo)
    prediction = T.argmax(py_x, axis=1)

    rcost = reg * T.mean([(p * p).sum() for p in self.params])
    if train_inner_nodes:
        cost = -T.mean(T.log(py_x[T.arange(labels.shape[0]), labels])) + rcost
    else:
        cost = -T.mean(T.log(py_x[-1, labels[-1]])) + rcost

    updates = adagrad(cost, self.params, lr=1e-4)

    self.cost_predict_op = theano.function(
        inputs=[words, parents, relations, labels],
        outputs=[cost, prediction],
        allow_input_downcast=True,
    )

    self.train_op = theano.function(
        inputs=[words, parents, relations, labels],
        outputs=[h, cost, prediction],
        updates=updates)

    costs = []
    sequence_indexes = range(N)
    if train_inner_nodes:
        n_total = sum(len(tree_words) for tree_words, _, _, _ in trees)
    else:
        n_total = N

    for i in range(epochs):
        t0 = datetime.now()
        sequence_indexes = shuffle(sequence_indexes)
        n_correct = 0
        epoch_cost = 0
        it = 0
        for j in sequence_indexes:
            # Distinct names keep the symbolic variables above unshadowed.
            tree_words, par, rel, lab = trees[j]
            _, c, p = self.train_op(tree_words, par, rel, lab)
            epoch_cost += c
            if train_inner_nodes:
                n_correct += np.sum(p == lab)
            else:
                n_correct += (p[-1] == lab[-1])
            it += 1
            # '\r' rewinds to the start of the line so the progress report
            # overwrites itself; true division keeps the rate fractional.
            sys.stdout.write(
                "j/N: %d/%d correct rate so far: %f, cost so far: %f\r" %
                (it, N, float(n_correct) / n_total, epoch_cost))
            sys.stdout.flush()
        print("i:", i, "cost:", epoch_cost, "correct rate:",
              (float(n_correct) / n_total), "time for epoch:",
              (datetime.now() - t0))
        costs.append(epoch_cost)

    print('costs:', costs)
    plt.plot(costs)
    plt.show()
def neibs2images(neibs, neib_shape, original_shape, mode='valid'):
    """
    Inverse of :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`:
    takes the output of ``images2neibs`` and reconstructs its input.

    Parameters
    ----------
    neibs : 2d tensor
        Like the one obtained by
        :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`.
    neib_shape
        `neib_shape` that was used in
        :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`.
    original_shape
        Original shape of the 4d tensor given to
        :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`
    mode : {'valid', 'ignore_borders'}
        With ``'ignore_borders'`` the part of the last two axes that does
        not fit a whole neighbourhood is filled with zeros.

    Returns
    -------
    object
        Reconstructs the input of
        :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`,
        a 4d tensor of shape `original_shape`.

    Raises
    ------
    NotImplementedError
        If ``mode`` is neither ``'valid'`` nor ``'ignore_borders'``.

    Notes
    -----
    Currently, the function doesn't support tensors created with
    `neib_step` different from default value. This means that it may be
    impossible to compute the gradient of a variable gained by
    :func:`images2neibs <theano.sandbox.neighbours.images2neibs>` w.r.t.
    its inputs in this case, because it uses
    :func:`images2neibs <theano.sandbox.neighbours.images2neibs>` for
    gradient computation.

    Examples
    --------
    Example, which uses a tensor gained in example for
    :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`:

    .. code-block:: python

        im_new = neibs2images(neibs, (5, 5), im_val.shape)
        # Theano function definition
        inv_window = theano.function([neibs], im_new)
        # Function application
        im_new_val = inv_window(neibs_val)

    .. note:: The code will output the initial image array.

    """
    neibs = T.as_tensor_variable(neibs)
    neib_shape = T.as_tensor_variable(neib_shape)
    original_shape = T.as_tensor_variable(original_shape)

    patch_width = neib_shape[1]
    regroup_shape = T.stack([original_shape[-1] // patch_width, patch_width])
    output_2d = images2neibs(neibs.dimshuffle('x', 'x', 0, 1),
                             regroup_shape, mode=mode)

    if mode == 'valid':
        # TODO: we do not implement all mode with this code.
        # Add a check for the good cases.
        return output_2d.reshape(original_shape, ndim=4)

    if mode != 'ignore_borders':
        raise NotImplementedError("neibs2images do not support mode=%s" %
                                  mode)

    # set_subtensor lets us accept an original_shape whose value cannot be
    # inferred while still raising when it does not have the right shape.
    valid_shape = original_shape
    for axis, neib_axis in ((2, 0), (3, 1)):
        step = neib_shape[neib_axis]
        valid_shape = T.set_subtensor(
            valid_shape[axis], (valid_shape[axis] // step) * step)
    output_4d = output_2d.reshape(valid_shape, ndim=4)

    # padding the borders with zeros
    for axis in (2, 3):
        border_shape = list(output_4d.shape)
        border_shape[axis] = original_shape[axis] - valid_shape[axis]
        output_4d = T.concatenate([output_4d, T.zeros(border_shape)],
                                  axis=axis)
    return output_4d
def theano_expr(self, targets, mode='stack', sparse=False):
    """
    Return the one-hot transformation as a symbolic expression.

    Parameters
    ----------
    targets : tensor_like, 1- or 2-dimensional, integer dtype
        A symbolic tensor representing labels as integers
        between 0 and `max_labels` - 1, `max_labels` supplied
        at formatter construction.
    mode : string
        The way in which to convert the labels to arrays. Takes
        three different options:

          - "concatenate" : concatenates the one-hot vectors from
            multiple labels
          - "stack" : returns a matrix where each row is the
            one-hot vector of a label
          - "merge" : merges the one-hot vectors together to
            form a vector where the elements are
            the result of an indicator function
            NB: As the result of an indicator function the result
            is the same in case a label is duplicated in the input.
    sparse : bool
        If true then the return value is sparse matrix. Note that
        if sparse is True, then mode cannot be 'stack' because
        sparse matrices need to be 2D

    Returns
    -------
    one_hot : TensorVariable, 1, 2 or 3-dimensional, sparse or dense
        A symbolic tensor representing a one-hot encoding of the
        supplied labels.  A 1-dimensional `targets` is treated as a
        single row and the added leading axis is removed again from a
        dense result.

    Raises
    ------
    ValueError
        If `mode` is not one of the three options, if `mode` is 'stack'
        while `sparse` is true, or if `targets` is not 1- or
        2-dimensional.
    TypeError
        If `targets` does not have an integer dtype.
    """
    if mode not in ('concatenate', 'stack', 'merge'):
        # Report the offending argument itself, not the label count.
        raise ValueError("%s got bad mode argument '%s'" %
                         (self.__class__.__name__, str(mode)))
    elif mode == 'stack' and sparse:
        raise ValueError("Sparse matrices need to be 2D, hence they "
                         "cannot be stacked")

    squeeze_required = False
    if targets.ndim != 2:
        if targets.ndim == 1:
            squeeze_required = True
            targets = targets.dimshuffle('x', 0)
        else:
            raise ValueError("targets tensor must be 1 or 2-dimensional")
    if 'int' not in str(targets.dtype):
        raise TypeError("need an integer tensor for targets")

    if sparse:
        if mode == 'concatenate':
            # Label j of a row lands in column j * max_labels + label.
            one_hot = theano.sparse.CSR(
                tensor.ones_like(targets, dtype=self._dtype).flatten(),
                (targets.flatten() + tensor.arange(targets.size) *
                 self._max_labels) % (self._max_labels * targets.shape[1]),
                tensor.arange(targets.shape[0] + 1) * targets.shape[1],
                tensor.stack(targets.shape[0],
                             self._max_labels * targets.shape[1])
            )
        else:
            one_hot = theano.sparse.CSR(
                tensor.ones_like(targets, dtype=self._dtype).flatten(),
                targets.flatten(),
                tensor.arange(targets.shape[0] + 1) * targets.shape[1],
                tensor.stack(targets.shape[0], self._max_labels)
            )
    else:
        if mode == 'concatenate':
            # Build one row per label, then fold the labels of each input
            # row back together side by side.
            one_hot = tensor.zeros((targets.shape[0] * targets.shape[1],
                                    self._max_labels))
            one_hot = tensor.set_subtensor(
                one_hot[tensor.arange(targets.size),
                        targets.flatten()], 1)
            one_hot = one_hot.reshape(
                (targets.shape[0], targets.shape[1] * self._max_labels))
        elif mode == 'merge':
            one_hot = tensor.zeros((targets.shape[0], self._max_labels))
            one_hot = tensor.set_subtensor(
                one_hot[tensor.arange(targets.size) % targets.shape[0],
                        targets.T.flatten()], 1)
        else:
            one_hot = tensor.zeros((targets.shape[0], targets.shape[1],
                                    self._max_labels))
            one_hot = tensor.set_subtensor(one_hot[
                tensor.arange(targets.shape[0]).reshape((targets.shape[0],
                                                         1)),
                tensor.arange(targets.shape[1]),
                targets
            ], 1)

    if squeeze_required:
        # Drop the leading axis that was added for 1-dimensional targets.
        if one_hot.ndim == 2:
            one_hot = one_hot.reshape((one_hot.shape[1],))
        if one_hot.ndim == 3:
            one_hot = one_hot.reshape((one_hot.shape[1],
                                       one_hot.shape[2]))
    return one_hot
def step(batch_idx, out_seq_b1):
    """Compact column ``batch_idx`` of ``seq`` and zero-pad it.

    Keeps the entries of ``seq[:, batch_idx]`` whose counterpart in
    ``idx[:, batch_idx]`` is non-negative, then appends zeros so the
    result has length ``max_seq_len``.  ``out_seq_b1`` is not used.
    """
    #out_seq = seq[T.ge(idx[:, batch_idx], 0).nonzero(), batch_idx][0]
    keep = T.ge(idx[:, batch_idx], 0).nonzero()
    column = seq[:, batch_idx]
    out_seq = column[keep]
    filler = T.zeros((max_seq_len - out_seq.shape[0], ), dtype=seq.dtype)
    return T.concatenate((out_seq, filler))
v_gen_embed = lasagne.layers.get_output(l_embed_char, v_gen_input) # Freeze the hidden inputs of the decoder layers, which do not tap into the encoder. for layer in dec_rnn_layers: GRULayer_freeze(layer, v_gen_input) # Readout the last state from the encoder. inputs = {l_encoder_embed: v_gen_embed, l_encoder_mask: tt.ge(v_gen_input, 0)} outputs = [l.hid_init for l in dec_rnn_layers] dec_hid_inits = lasagne.layers.get_output(outputs, inputs, deterministic=True) # Prepare the initial values fed into the scan loop of the Generator h_0 = tt.concatenate(dec_hid_inits, axis=-1) x_0 = tt.fill(tt.zeros((v_gen_input.shape[0], ), dtype="int32"), vocab.index("\x02")) x_0 = lasagne.layers.get_output(l_embed_char, x_0) m_0 = tt.ones((v_gen_input.shape[0], ), 'bool') # Compile the Generator's scan op result, updates = theano.scan(generator_step_sm, sequences=None, n_steps=n_steps, outputs_info=[x_0, h_0, m_0, None, None], strict=False, return_list=True, non_sequences=[tau, eps], go_backwards=False, name="generator/scan")
def get_output_for(self, inputs, **kwargs):
    """
    Compute this layer's output function given symbolic input variables.

    Runs ``self.n_decodesteps + self.decode_pre_steps`` steps of an LSTM
    that attends over the encoder sequence at every step, then drops the
    first ``self.decode_pre_steps`` steps from every output.

    Parameters
    ----------
    inputs : list of theano.TensorType
        ``inputs[0]`` is the symbolic input (the sequence attended over),
        laid out as ``(n_batch, n_time_steps, ...)``; trailing dimensions
        beyond the third are flattened.  ``inputs[1]``, when present, is
        the mask: a theano variable denoting whether each time step in
        each sequence in the batch is part of the sequence or not.  If it
        is absent, all sequences are assumed to be of the same length.  If
        not all sequences are of the same length, it must be supplied as a
        matrix of shape ``(n_batch, n_time_steps)`` where
        ``mask[i, j] = 1`` when ``j <= (length of sequence i)`` and
        ``mask[i, j] = 0`` when ``j > (length of sequence i)``.

    Returns
    -------
    layer_output : theano.TensorType
        Symbolic output variable: the decoder hidden states when
        ``self.return_decodehid`` is true, otherwise the
        attention-weighted encoder states.  Both are laid out as
        ``(n_batch, n_decodesteps, n_features)``.

    Notes
    -----
    The trimmed outputs are also stored on the layer as ``self.hid_out``,
    ``self.cell_out``, ``self.weighted_hidden_out`` and ``self.alpha``.
    """
    input = inputs[0]
    # Retrieve the mask when it is supplied
    mask = inputs[1] if len(inputs) > 1 else None

    # Treat all dimensions after the second as flattened feature dimensions
    if input.ndim > 3:
        input = input.reshape(
            (input.shape[0], input.shape[1], T.prod(input.shape[2:])))
    num_batch = input.shape[0]
    encode_seqlen = input.shape[1]

    if mask is None:
        mask = T.ones((num_batch, encode_seqlen), dtype='float32')

    # The stacked gate pre-activations are (n_batch, 4*num_units).
    # We define a slicing function that extracts the part for each LSTM gate
    def slice_w(x, n):
        return x[:, n * self.num_units:(n + 1) * self.num_units]

    # Create single recurrent computation step function
    def step(cell_previous, hid_previous, alpha_prev, weighted_hidden_prev,
             input, mask, hUa, W_align, v_align, W_hid_stacked,
             W_weightedhid_stacked, W_cell_to_ingate,
             W_cell_to_forgetgate, W_cell_to_outgate, b_stacked, *args):
        # compute (unnormalized) attention vector
        sWa = T.dot(hid_previous, W_align)  # (BS, aln_num_units)
        sWa = sWa.dimshuffle(0, 'x', 1)     # (BS, 1, aln_num_units)
        align_act = sWa + hUa
        # (BS, seqlen, num_units_aln)
        tanh_sWahUa = self.nonlinearity_align(align_act)

        # CALCULATE WEIGHT FOR EACH HIDDEN STATE VECTOR
        a = T.dot(tanh_sWahUa, v_align)  # (BS, Seqlen, 1)
        a = T.reshape(a, (a.shape[0], a.shape[1]))  # (BS, Seqlen)
        # Push masked-out positions far down so the softmax ignores them.
        a = a * mask - (1 - mask) * 10000

        alpha = self.attention_softmax_function(a)

        # input: (BS, Seqlen, num_units)
        weighted_hidden = input * alpha.dimshuffle(0, 1, 'x')
        weighted_hidden = T.sum(weighted_hidden, axis=1)  # sum seqlen out

        # Calculate gates pre-activations and slice
        # (BS, dec_hid) x (dec_hid, dec_hid)
        gates = T.dot(hid_previous, W_hid_stacked) + b_stacked
        # (BS, enc_hid) x (enc_hid, dec_hid)
        gates += T.dot(weighted_hidden, W_weightedhid_stacked)

        # Clip gradients
        if self.grad_clipping is not False:
            gates = theano.gradient.grad_clip(gates, -self.grad_clipping,
                                              self.grad_clipping)

        # Extract the pre-activation gate values
        ingate = slice_w(gates, 0)
        forgetgate = slice_w(gates, 1)
        cell_input = slice_w(gates, 2)
        outgate = slice_w(gates, 3)

        if self.peepholes:
            # Compute peephole connections
            ingate += cell_previous * W_cell_to_ingate
            forgetgate += cell_previous * W_cell_to_forgetgate

        # Apply nonlinearities
        ingate = self.nonlinearity_ingate(ingate)
        forgetgate = self.nonlinearity_forgetgate(forgetgate)
        cell_input = self.nonlinearity_cell(cell_input)
        outgate = self.nonlinearity_outgate(outgate)

        # Compute new cell value
        cell = forgetgate * cell_previous + ingate * cell_input

        if self.peepholes:
            # NOTE(review): the peephole term is added after the output-gate
            # nonlinearity; left as is because changing it would alter the
            # numerics of existing models -- confirm it is intended.
            outgate += cell * W_cell_to_outgate

        # W_align:  (num_units, aln_num_units)
        # U_align:  (num_feats, aln_num_units)
        # v_align:  (aln_num_units, 1)
        # hUa:      (BS, Seqlen, aln_num_units)
        # hid:      (BS, num_units_dec)
        # input:    (BS, Seqlen, num_inputs)

        # Compute new hidden unit activation
        hid = outgate * self.nonlinearity_out(cell)
        return [cell, hid, alpha, weighted_hidden]

    # The recurrence consumes no sequences; everything is a non-sequence.
    sequences = []
    step_fun = step

    ones = T.ones((num_batch, 1))
    if isinstance(self.cell_init, T.TensorVariable):
        cell_init = self.cell_init
    else:
        # Dot against a 1s vector to repeat to shape (num_batch, num_units)
        cell_init = T.dot(ones, self.cell_init)

    if isinstance(self.hid_init, T.TensorVariable):
        hid_init = self.hid_init
    else:
        # Dot against a 1s vector to repeat to shape (num_batch, num_units)
        hid_init = T.dot(ones, self.hid_init)

    alpha_init = T.zeros((num_batch, encode_seqlen))
    weighted_hidden_init = T.zeros((num_batch, self.num_inputs))

    # Precompute the encoder side of the alignment once, outside the scan.
    hUa = T.dot(input, self.U_align)  # (num_batch, seq_len, num_units_aln)

    # Order must match the positional parameters of ``step``.
    non_seqs = [
        input, mask, hUa, self.W_align, self.v_align, self.W_hid_stacked,
        self.W_weightedhid_stacked
    ]
    # The "peephole" weight matrices are only used when self.peepholes=True
    if self.peepholes:
        non_seqs += [
            self.W_cell_to_ingate, self.W_cell_to_forgetgate,
            self.W_cell_to_outgate
        ]
    # theano.scan only allows for positional arguments, so when
    # self.peepholes is False, we need to supply fake placeholder arguments
    # for the three peephole matrices.
    else:
        non_seqs += [(), (), ()]

    non_seqs += [self.b_stacked]

    total_steps = self.n_decodesteps + self.decode_pre_steps
    initial_values = [cell_init, hid_init, alpha_init, weighted_hidden_init]

    if self.unroll_scan:
        # Explicitly unroll the recurrence instead of using scan
        cell_out, hid_out, alpha_out, weighted_hidden_out = unroll_scan(
            fn=step_fun,
            sequences=sequences,
            outputs_info=initial_values,
            go_backwards=self.backwards,
            non_sequences=non_seqs,
            n_steps=total_steps)
    else:
        # Scan op repeatedly applies the step function for n_steps
        cell_out, hid_out, alpha_out, weighted_hidden_out = theano.scan(
            fn=step_fun,
            sequences=sequences,
            outputs_info=initial_values,
            go_backwards=self.backwards,
            truncate_gradient=self.gradient_steps,
            non_sequences=non_seqs,
            n_steps=total_steps,
            strict=True)[0]

    # dimshuffle back to (n_batch, n_time_steps, n_features)
    # hid_out / cell_out:   (n_decodesteps, BS, num_units)
    # alpha_out:            (n_decodesteps, BS, encode_seqlen)
    cell_out = cell_out.dimshuffle(1, 0, 2)
    hid_out = hid_out.dimshuffle(1, 0, 2)
    alpha_out = alpha_out.dimshuffle(1, 0, 2)
    weighted_hidden_out = weighted_hidden_out.dimshuffle(1, 0, 2)

    # if scan is backward reverse the output
    if self.backwards:
        hid_out = hid_out[:, ::-1]
        cell_out = cell_out[:, ::-1]
        weighted_hidden_out = weighted_hidden_out[:, ::-1]
        alpha_out = alpha_out[:, ::-1]

    if self.decode_pre_steps > 0:
        # Drop the warm-up steps.  Each tensor is trimmed from itself;
        # slicing cell_out and alpha_out out of the already-trimmed hid_out
        # (as was done before) returned the wrong values.
        pre = self.decode_pre_steps
        hid_out = hid_out[:, pre:]
        cell_out = cell_out[:, pre:]
        weighted_hidden_out = weighted_hidden_out[:, pre:]
        alpha_out = alpha_out[:, pre:]

    self.hid_out = hid_out
    self.cell_out = cell_out
    self.weighted_hidden_out = weighted_hidden_out
    self.alpha = alpha_out

    if self.return_decodehid:
        return hid_out
    else:
        return weighted_hidden_out
def log_ctc(self, labels_len_const):
    """Build the symbolic log-domain forward recursion and store the loss.

    Nothing is returned.  The method reads ``self.x``, ``self.x_mask``,
    ``self.y``, ``self.y_clip`` and ``self.blank`` and sets ``self.pin0``,
    ``self.pin1``, ``self.pin2`` (the scan inputs, kept for inspection),
    ``self.debug`` (all scan outputs), ``self.prob`` and ``self.loss``.

    Parameters
    ----------
    labels_len_const
        Repeat count used by the ``T.tile`` calls.  NOTE(review): presumably
        the concrete (non-symbolic) value of ``self.y.shape[1]`` -- confirm
        against the caller.
    """
    def _build_diag(_d):
        # Transition matrix for one sample: identity, plus the first
        # off-diagonal, plus the second off-diagonal gated by ``_d``.
        # ``labels_len`` is bound below, before scan invokes this function.
        extend_I = T.eye(labels_len + 2)
        return T.eye(labels_len) + extend_I[
            1:-1, :-2] + extend_I[2:, :-2] * _d[:, None]

    # prepare y
    n_samples, labels_len = self.y.shape
    # Pad two blank columns so that y1[:, 2:] lines up with y1[:, :-2].
    y1 = T.concatenate(
        [self.y, T.ones((self.y.shape[0], 2)) * self.blank], axis=1)
    # 1 where the label two positions ahead differs and is not the blank.
    diag = T.neq(y1[:, :-2], y1[:, 2:]) * T.neq(y1[:, 2:], self.blank)
    # stretch out, (labels_len, n_samples*labels_len)
    diags0, _ = theano.scan(fn=_build_diag,
                            sequences=[diag],
                            n_steps=n_samples)
    shape = diags0.shape
    diags = T.transpose(diags0, (1, 0, 2)).reshape(
        (shape[1], shape[0] * shape[2]))

    # prepare x
    assert self.x.ndim == 3
    # (n_steps, n_samples, softmax_output) to (n_steps, n_samples, labels_len)
    x1 = self.x[:, T.arange(n_samples)[:, None], self.y]
    dims = x1.shape
    # stretch out, (n_steps, n_samples * labels_len)
    x2 = x1.reshape((dims[0], dims[1] * dims[2]))

    def log_matrix_dot(x, y, z):
        # Log-domain equivalent of (x . exp(y)) * exp(z): a max-shifted
        # log-sum-exp over axis 0, followed by adding ``z``.
        v1 = x[:, :, None]
        v2 = T.tile(v1, (1, 1, labels_len_const))
        v2_shape = v2.shape
        v3 = T.transpose(v2, (1, 0, 2)).reshape(
            (v2_shape[1], v2_shape[0] * v2_shape[2]))
        v4 = v3 + y
        m = T.max(v4, axis=0)
        v5 = v4 - m[None, :]
        # Earlier variants, kept for reference:
        # mask = T.nonzero(T.isnan(v5))
        # v6 = T.set_subtensor(v5[mask], -np.inf)
        # v7 = T.exp(v5)
        v7 = safe_exp(v5)
        v8 = T.sum(v7, axis=0)
        # v9 = T.log(v8)
        v9 = safe_log(v8)
        v10 = v9 + m
        v11 = v10 + z
        v12 = v11.reshape((n_samples, labels_len))
        return v12

    # each step
    def _step(m_, s_, h_, diags):
        # m_: mask entry for this step, s_: log-probabilities for this step,
        # h_: previous forward variables, diags: log transition matrices.
        v = log_matrix_dot(h_, diags, s_)
        m_extend = T.tile(m_[:, None], (1, labels_len_const))
        # Where the mask is zero, carry the previous state through unchanged.
        p = T.switch(m_extend, v, h_)
        return p

    # scan loop
    log_x2 = safe_log(x2)
    # Initial forward variables: a one in column 0, zeros elsewhere, in logs.
    log_outputs_info = safe_log(
        T.set_subtensor(
            T.zeros((n_samples, labels_len),
                    dtype=theano.config.floatX)[:, 0], 1))
    log_diags = safe_log(diags)
    self.pin0 = log_x2
    self.pin1 = log_outputs_info
    self.pin2 = log_diags
    self.debug, _ = theano.scan(fn=_step,
                                sequences=[self.x_mask.T, log_x2],
                                outputs_info=[log_outputs_info],
                                non_sequences=[log_diags])

    # prepare y_clip
    # Gather, from the last step, columns (y_clip - 2) and (y_clip - 1).
    y_clip1 = T.concatenate([(self.y_clip - 2)[:, None],
                             (self.y_clip - 1)[:, None]], axis=1)
    self.prob = self.debug[-1][T.arange(n_samples)[:, None], y_clip1]

    # compute loss
    # Max-shifted log-sum-exp over the two gathered columns.
    mx = T.max(self.prob, axis=1)
    l1 = self.prob - mx[:, None]
    # l2 = T.sum(T.exp(l1), axis=1)
    # l3 = T.log(l2) + mx
    l2 = T.sum(safe_exp(l1), axis=1)
    l3 = safe_log(l2) + mx
    self.loss = T.mean(-l3)
def t_initial_state(self):
    """Return ``self._t_state0`` followed by ``self._win_dim`` zeros.

    The two pieces are joined along axis 0 into one symbolic vector.
    """
    window_zeros = T.zeros(self._win_dim)
    pieces = [self._t_state0, window_zeros]
    return T.concatenate(pieces, axis=0)
def get_output_for(self, input, deterministic=False, **kwargs):
    """Scatter channel groups of ``input`` onto a spatial grid upscaled by ``self.upscale``.

    For every offset pair ``(y, x)`` in ``[0, r) x [0, r)`` the channels
    ``r*y + x, r*y + x + r*r, ...`` of ``input`` are added into the output
    positions ``[y::r, x::r]``.  The output starts as zeros with the shape
    given by ``self.get_output_shape_for(input.shape)``.

    ``deterministic`` and ``**kwargs`` are accepted but not used here.
    """
    factor = self.upscale
    upscaled = T.zeros(self.get_output_shape_for(input.shape))
    channel_step = factor * factor
    for row_offset in range(factor):
        for col_offset in range(factor):
            first_channel = factor * row_offset + col_offset
            target = upscaled[:, :, row_offset::factor, col_offset::factor]
            source = input[:, first_channel::channel_step, :, :]
            upscaled = T.inc_subtensor(target, source)
    return upscaled
def conv1d_sd(input, filters, image_shape, filter_shape, border_mode='valid',
              subsample=(1, )):
    """1D convolution computed with a single tensordot.

    Parameters
    ----------
    input
        Symbolic tensor indexed as (batch, channel, length).
    filters
        Symbolic tensor indexed as (filter, channel, length).
    image_shape
        Concrete ``(batch_size, num_input_channels, input_length)``.
    filter_shape
        Concrete ``(num_filters, num_input_channels, filter_length)``.
    border_mode
        Only ``'valid'`` is supported.
    subsample
        1-tuple holding the stride; the filter length must be a multiple
        of it.

    Returns
    -------
    Symbolic tensor of shape (batch, num_filters, output_length).

    Raises
    ------
    RuntimeError
        If ``border_mode`` is not ``'valid'`` or the filter length is not a
        multiple of the stride.
    """
    if border_mode != 'valid':
        message = "Unsupported border_mode for conv1d_sd: %s" % border_mode
        log.error(message)
        raise RuntimeError(message)

    batch_size, num_input_channels, input_length = image_shape
    _, _, filter_length = filter_shape
    stride = subsample[0]

    if filter_length % stride > 0:
        raise RuntimeError("Filter length (%d) is not a multiple of the "
                           "stride (%d)" % (filter_length, stride))

    num_steps = filter_length // stride
    output_length = (input_length - filter_length + stride) // stride

    # pad the input so all the shifted dot products fit inside.
    # shape is (b, c, l)
    padded_length = ((input_length // filter_length) * filter_length +
                     (num_steps - 1) * stride)

    # at this point, it is possible that the padded_length is SMALLER than
    # the input size, so the input has to be truncated first.
    truncated_length = min(input_length, padded_length)
    input_truncated = input[:, :, :truncated_length]

    input_padded_shape = (batch_size, num_input_channels, padded_length)
    input_padded = T.zeros(input_padded_shape)
    input_padded = T.set_subtensor(input_padded[:, :, :truncated_length],
                                   input_truncated)

    # One reshaped view of the padded input per stride offset; each view
    # cuts the signal into consecutive windows of filter_length samples.
    inputs = []
    for num in range(num_steps):
        shift = num * stride
        length = (padded_length - shift) // filter_length
        r_input_shape = (batch_size, num_input_channels, length,
                         filter_length)
        r_input = input_padded[
            :, :, shift:length * filter_length + shift].reshape(r_input_shape)
        inputs.append(r_input)

    inputs_stacked = T.stack(*inputs)  # shape is (n, b, c, w, f)
    filters_flipped = filters[:, :, ::-1]

    # Contract (channel, window position) of the input with (channel,
    # position) of the filters.  Axes are axis *indices*, so they must be
    # plain integers rather than a floatX array.
    r_conved = T.tensordot(inputs_stacked, filters_flipped,
                           [[2, 4], [1, 2]])
    # resulting shape is (n, b, w, n_filters)
    # output needs to be (b, n_filters, w * n)
    r_conved = r_conved.dimshuffle(1, 3, 2, 0)  # (b, n_filters, w, n)
    conved = r_conved.reshape((r_conved.shape[0], r_conved.shape[1],
                               r_conved.shape[2] * r_conved.shape[3]))
    # result is (b, n_f, l)

    # remove padding
    return conved[:, :, :output_length]
def do_preprocess_scan(self, deterministic_dropout=False, **kwargs):
    """
    Run a scan using this LSTM, preprocessing all inputs before the scan.

    Parameters:
        kwargs[k]: should be a theano tensor of shape (n_batch, n_time, ... )
            Note that "relative_position" should be a keyword argument given
            here if there are relative shifts.
        deterministic_dropout: If True, apply dropout deterministically,
            scaling everything. If False, sample dropout.

    Returns:
        A theano tensor of shape (n_batch, n_time, output_size) of activations
    """
    assert len(kwargs) > 0, "Need at least one input argument!"
    n_batch, n_time = list(kwargs.values())[0].shape[:2]

    # Merge the batch and time axes so every input part is generated in one
    # call instead of once per scan step.
    squashed_kwargs = {
        k: v.reshape([n_batch * n_time] + [x for x in v.shape[2:]])
        for k, v in kwargs.items()
    }

    full_input = T.concatenate(
        [part.generate(**squashed_kwargs) for part in self.input_parts], 1)
    # Restore (n_batch, n_time, input_size), then put time first for scan.
    adjusted_input = full_input.reshape(
        [n_batch, n_time, self.input_size]).dimshuffle((1, 0, 2))

    if "relative_position" in kwargs:
        relative_position = kwargs["relative_position"]
        diff_shifts = T.extra_ops.diff(relative_position, axis=1)
        # The first time step has no predecessor, so its shift is zero.
        cat_shifts = T.concatenate(
            [T.zeros((n_batch, 1), 'int32'), diff_shifts], 1)
        shifts = cat_shifts.dimshuffle((1, 0))
    else:
        # T.zeros takes the shape as ONE argument; the result must be
        # (n_time, n_batch) to match the branch above.
        shifts = T.zeros((n_time, n_batch), 'int32')

    def _scan_fn(in_data, shifts, *other):
        other = list(other)
        if self.dropout and not deterministic_dropout:
            # The trailing non-sequences are the dropout masks, one per
            # entry of tot_layer_sizes; everything before is hidden state.
            split = -len(self.tot_layer_sizes)
            hiddens = other[:split]
            masks = [None] + other[split:]
        else:
            masks = []
            hiddens = other

        return self.perform_step(in_data, shifts, hiddens,
                                 dropout_masks=masks)

    if self.dropout and not deterministic_dropout:
        dropout_masks = UpscaleMultiDropout(
            [(n_batch, shape) for shape in self.tot_layer_sizes],
            self.dropout)
    else:
        dropout_masks = []

    outputs_info = [
        initial_state_with_taps(layer, n_batch)
        for layer in self.cells.layers
    ]
    result, _ = theano.scan(fn=_scan_fn,
                            sequences=[adjusted_input, shifts],
                            non_sequences=dropout_masks,
                            outputs_info=outputs_info)

    # Back to batch-major layout.
    final_out = get_last_layer(result).transpose((1, 0, 2))
    return final_out
def get_real_coefficients(self):
    """Return a pair of empty (length-0) vectors of dtype ``self.dtype``."""
    return tuple(tt.zeros(0, dtype=self.dtype) for _ in range(2))
def build_generator(self, version=1, encode=False):
    """Build the generator as a list of Lasagne layers and return it.

    Parameters
    ----------
    version
        Architecture selector; only ``1`` builds any layers.
    encode
        If True the network starts from an encoder fed with the masked
        image concatenated with the mask; otherwise it starts from a
        uniform noise input of width 100.

    Returns
    -------
    list
        The layers in construction order; the last one is the sigmoid
        deconvolution producing ``(batch_size, 3, 64, 64)``.

    Side effects: initialises the module-level ``mask`` on first use, sets
    ``self.mask`` and ``GAN.mixing_coefs``, and prints every layer's output
    shape.

    NOTE(review): the block nesting was reconstructed from a collapsed
    source; confirm that the dense/deconv stack is shared by both the
    ``encode`` and the noise branch.
    """
    global mask

    if mask is None:
        # Ones in the central 32x32 square, zeros elsewhere.
        mask = T.zeros(shape=(self.batch_size, 1, 64, 64),
                       dtype=theano.config.floatX)
        mask = T.set_subtensor(mask[:, :, 16:48, 16:48], 1.)
    self.mask = mask

    noise_dim = (self.batch_size, 100)
    theano_rng = MRG_RandomStreams(rng.randint(2**15))
    noise = theano_rng.uniform(size=noise_dim)
    input = ll.InputLayer(shape=noise_dim, input_var=noise)

    # Image with the masked region zeroed out.
    cropped_image = T.cast(
        T.zeros_like(self.input_) * mask + (1. - mask) * self.input_,
        dtype=theano.config.floatX)
    # Concatenate along the channel axis.
    encoder_input = T.concatenate([cropped_image, mask], axis=1)

    if version == 1:
        if encode:
            gen_layers = [
                ll.InputLayer(shape=(self.batch_size, 4, 64, 64),
                              input_var=encoder_input)
            ]
            # Four stride-2 convolutions (filter 4, pad 1):
            # 4 x 64 x 64 -> 64 x 32 x 32 -> 128 x 16 x 16
            #             -> 256 x 8 x 8  -> 512 x 4 x 4
            for num_filters in (64, 128, 256, 512):
                gen_layers.append(
                    nn.batch_norm(
                        ll.Conv2DLayer(gen_layers[-1], num_filters, 4, 2,
                                       pad=1, nonlinearity=nn.lrelu)))
            # 512 x 4 x 4 -> 4000 x 1 x 1 (filter 4, stride 4, pad 1)
            gen_layers.append(
                nn.batch_norm(
                    ll.Conv2DLayer(gen_layers[-1], 4000, 4, 4, pad=1,
                                   nonlinearity=nn.lrelu)))
            # Back up to 512 x 4 x 4.
            gen_layers.append(
                nn.batch_norm(
                    nn.Deconv2DLayer(gen_layers[-1],
                                     (self.batch_size, 128 * 4, 4, 4),
                                     (5, 5), stride=(4, 4))))
        else:
            gen_layers = [input]

        # TODO : put batchnorm back on all layers, + g=None
        gen_layers.append(
            ll.DenseLayer(gen_layers[-1], 128 * 8 * 4 * 4, W=Normal(0.02)))
        gen_layers.append(
            ll.ReshapeLayer(gen_layers[-1],
                            (self.batch_size, 128 * 8, 4, 4)))

        # Shared Theano floats intended for mixing generated output and
        # image at each layer; only exported through GAN.mixing_coefs here.
        mixing_coefs = [
            theano.shared(lasagne.utils.floatX(0.05)) for i in range(2)
        ]
        mixing_coefs.append(theano.shared(lasagne.utils.floatX(1)))

        # 4 -> 8 -> 16 -> 32, batch-normalised ReLU deconvolutions.
        for target_shape in ((self.batch_size, 128 * 2, 8, 8),
                             (self.batch_size, 128, 16, 16),
                             (self.batch_size, 64, 32, 32)):
            gen_layers.append(
                nn.batch_norm(nn.Deconv2DLayer(gen_layers[-1],
                                               target_shape, (5, 5),
                                               W=Normal(0.02),
                                               nonlinearity=nn.relu),
                              g=None))

        # 32 -> 64, sigmoid output image.
        gen_layers.append(
            nn.Deconv2DLayer(
                gen_layers[-1], (self.batch_size, 3, 64, 64), (5, 5),
                W=Normal(0.02),
                nonlinearity=lasagne.nonlinearities.sigmoid))

    # Parenthesised single-argument prints behave the same on Python 2 and 3.
    for layer in gen_layers:
        print(layer.output_shape)
    print('')

    GAN.mixing_coefs = mixing_coefs

    return gen_layers
def __theano_build__(self):
    """Compile the Theano functions of the single-layer GRU language model.

    Sets ``self.predict``, ``self.predict_class``, ``self.ce_error``,
    ``self.bptt`` and ``self.sgd_step`` (an RMSprop update whose ``decay``
    input defaults to 0.9).
    """
    E, V, U, W, b, c = self.E, self.V, self.U, self.W, self.b, self.c

    x = T.ivector('x')
    y = T.ivector('y')

    def forward_prop_step(x_t, s_prev):
        # Word embedding layer
        x_e = E[:, x_t]

        # GRU Layer 1
        z = T.nnet.hard_sigmoid(U[0].dot(x_e) + W[0].dot(s_prev) + b[0])
        r = T.nnet.hard_sigmoid(U[1].dot(x_e) + W[1].dot(s_prev) + b[1])
        # The candidate state must NOT be called ``c``: that would shadow
        # the output bias ``c`` from the enclosing scope, so the softmax
        # below would add the candidate state instead of the bias and the
        # bias would be disconnected from the cost.
        h = T.tanh(U[2].dot(x_e) + W[2].dot(s_prev * r) + b[2])
        s = (T.ones_like(z) - z) * h + z * s_prev

        # Final output calculation
        # Theano's softmax returns a matrix with one row, we only need the row
        o_t = T.nnet.softmax(V.dot(s) + c)[0]

        return [o_t, s]

    [o, s], updates = theano.scan(
        forward_prop_step,
        sequences=x,
        truncate_gradient=self.bptt_truncate,
        outputs_info=[None, dict(initial=T.zeros(self.hidden_dim))])

    prediction = T.argmax(o, axis=1)
    o_error = T.sum(T.nnet.categorical_crossentropy(o, y))

    # Total cost (could add regularization here)
    cost = o_error

    # Gradients
    dE = T.grad(cost, E)
    dU = T.grad(cost, U)
    dW = T.grad(cost, W)
    db = T.grad(cost, b)
    dV = T.grad(cost, V)
    dc = T.grad(cost, c)

    # Assign functions
    self.predict = theano.function([x], [o], allow_input_downcast=True)
    self.predict_class = theano.function([x], prediction,
                                         allow_input_downcast=True)
    self.ce_error = theano.function([x, y], cost,
                                    allow_input_downcast=True)
    self.bptt = theano.function([x, y], [dE, dU, dW, db, dV, dc],
                                allow_input_downcast=True)

    # SGD parameters
    learning_rate = T.scalar('learning_rate')
    decay = T.scalar('decay')

    # rmsprop cache updates
    mE = decay * self.mE + (1 - decay) * dE**2
    mU = decay * self.mU + (1 - decay) * dU**2
    mW = decay * self.mW + (1 - decay) * dW**2
    mV = decay * self.mV + (1 - decay) * dV**2
    mb = decay * self.mb + (1 - decay) * db**2
    mc = decay * self.mc + (1 - decay) * dc**2

    self.sgd_step = theano.function(
        [x, y, learning_rate, theano.In(decay, value=0.9)],
        [],
        updates=[(E, E - learning_rate * dE / T.sqrt(mE + 1e-6)),
                 (U, U - learning_rate * dU / T.sqrt(mU + 1e-6)),
                 (W, W - learning_rate * dW / T.sqrt(mW + 1e-6)),
                 (V, V - learning_rate * dV / T.sqrt(mV + 1e-6)),
                 (b, b - learning_rate * db / T.sqrt(mb + 1e-6)),
                 (c, c - learning_rate * dc / T.sqrt(mc + 1e-6)),
                 (self.mE, mE),
                 (self.mU, mU),
                 (self.mW, mW),
                 (self.mV, mV),
                 (self.mb, mb),
                 (self.mc, mc)],
        allow_input_downcast=True)
def __theano_build__(self):
    """Compile the Theano functions of the bidirectional GRU model.

    Two GRUs read the input, one left-to-right (parameters 0-2 of ``U``,
    ``W``, ``b``) and one right-to-left (parameters 3-5).  Their hidden
    states are concatenated per position and passed through a softmax
    output layer.

    Sets ``self.f_s``, ``self.b_s``, ``self.predict``,
    ``self.predict_class``, ``self.ce_error`` and ``self.sgd_step`` (an
    RMSprop update whose ``decay`` input defaults to 0.9).
    """
    E, V, U, W, b, c = self.E, self.V, self.U, self.W, self.b, self.c

    x = T.ivector('x')
    y = T.ivector('y')

    def gru_step(x_t, s_t_prev, offset):
        # One GRU update using parameter slices offset .. offset + 2.
        x_e = E[:, x_t]
        # The gate biases belong INSIDE the sigmoid so that the gates stay
        # in [0, 1], as the candidate state already does with tanh.
        z_t = T.nnet.hard_sigmoid(
            U[offset].dot(x_e) + W[offset].dot(s_t_prev) + b[offset])
        r_t = T.nnet.hard_sigmoid(
            U[offset + 1].dot(x_e) + W[offset + 1].dot(s_t_prev) +
            b[offset + 1])
        c_t = T.tanh(
            U[offset + 2].dot(x_e) + W[offset + 2].dot(s_t_prev * r_t) +
            b[offset + 2])
        return (T.ones_like(z_t) - z_t) * c_t + z_t * s_t_prev

    def forward_direction_prop_step(x_t, s_t_prev):
        # GRU layer 1; the hidden state is the intermediate output.
        return [gru_step(x_t, s_t_prev, 0)]

    def backward_direction_prop_step(x_t, s_t_prev):
        # GRU layer 2
        return [gru_step(x_t, s_t_prev, 3)]

    def o_step(combined_s_t):
        o_t = T.nnet.softmax(V.dot(combined_s_t) + c)[0]
        return o_t

    # forward direction states
    f_s, updates = theano.scan(forward_direction_prop_step,
                               sequences=x,
                               truncate_gradient=self.bptt_truncate,
                               outputs_info=T.zeros(self.hidden_dim))

    # backward direction states
    b_s, updates = theano.scan(
        backward_direction_prop_step,
        sequences=x[::-1],  # the reverse direction input
        truncate_gradient=self.bptt_truncate,
        outputs_info=T.zeros(self.hidden_dim))

    self.f_s = f_s
    self.b_s = b_s

    # Re-reverse the backward states so both directions line up per
    # position, then join them along the feature axis.
    combined_s = T.concatenate([f_s, b_s[::-1]], axis=1)

    # concatenate the hidden state from 2 GRU layer to do the output
    o, updates = theano.scan(o_step,
                             sequences=combined_s,
                             truncate_gradient=self.bptt_truncate,
                             outputs_info=None)

    prediction = T.argmax(o, axis=1)
    o_error = T.sum(T.nnet.categorical_crossentropy(o, y))

    cost = o_error

    # Gradients
    dE = T.grad(cost, E)
    dU = T.grad(cost, U)
    dW = T.grad(cost, W)
    db = T.grad(cost, b)
    dV = T.grad(cost, V)
    dc = T.grad(cost, c)

    # Assign functions
    self.predict = theano.function([x], o)
    self.predict_class = theano.function([x], prediction)
    self.ce_error = theano.function([x, y], cost)

    # SGD parameters
    learning_rate = T.scalar('learning_rate')
    decay = T.scalar('decay')

    # rmsprop cache updates
    mE = decay * self.mE + (1 - decay) * dE**2
    mU = decay * self.mU + (1 - decay) * dU**2
    mW = decay * self.mW + (1 - decay) * dW**2
    mV = decay * self.mV + (1 - decay) * dV**2
    mb = decay * self.mb + (1 - decay) * db**2
    mc = decay * self.mc + (1 - decay) * dc**2

    updates = [(E, E - learning_rate * dE / T.sqrt(mE + 1e-6)),
               (U, U - learning_rate * dU / T.sqrt(mU + 1e-6)),
               (W, W - learning_rate * dW / T.sqrt(mW + 1e-6)),
               (V, V - learning_rate * dV / T.sqrt(mV + 1e-6)),
               (b, b - learning_rate * db / T.sqrt(mb + 1e-6)),
               (c, c - learning_rate * dc / T.sqrt(mc + 1e-6)),
               (self.mE, mE),
               (self.mU, mU),
               (self.mW, mW),
               (self.mV, mV),
               (self.mb, mb),
               (self.mc, mc)]

    # theano.In replaces the deprecated theano.Param for default inputs.
    self.sgd_step = theano.function(
        [x, y, learning_rate, theano.In(decay, value=0.9)],
        [],
        updates=updates)
def conv3d(signals, filters, signals_shape=None, filters_shape=None,
           border_mode='valid'):
    """
    Convolve spatio-temporal filters with a movie.

    It flips the filters.  The convolution is carried out as one 2D
    convolution over all (signal frame, filter frame) pairs, followed by a
    sum along the diagonals of the (Ts, Tf) sub-matrix.

    Parameters
    ----------
    signals
        Timeseries of images whose pixels have color channels.
        Shape: [Ns, Ts, C, Hs, Ws], i.e.
        (batch, time, in channel, row, column).
    filters
        Spatio-temporal filters.
        Shape: [Nf, Tf, C, Hf, Wf], i.e.
        (out channel, time, in channel, row, column).
    signals_shape
        None or a tuple/list with the shape of signals.
    filters_shape
        None or a tuple/list with the shape of filters.
    border_mode
        One of 'valid', 'full' or 'half', either as a single string applied
        to all three axes or as a (time, height, width) sequence.  The
        height and width modes must be equal.

    Raises
    ------
    NotImplementedError
        If the height and width modes differ, or a mode is 'same'.
    ValueError
        If a mode is not recognised.

    Notes
    -----
    For the GPU, use nnet.conv3d.

    See Also
    --------
    Someone made a script that shows how to swap the axes between both 3d
    convolution implementations in Theano. See the last
    `attachment <https://groups.google.com/d/msg/theano-users/1S9_bZgHxVw/0cQR9a4riFUJ>`_
    """
    if isinstance(border_mode, str):
        border_mode = (border_mode,) * 3

    shape_5d_signals = signals.shape if signals_shape is None else signals_shape
    shape_5d_filters = filters.shape if filters_shape is None else filters_shape

    Ns, Ts, C, Hs, Ws = shape_5d_signals
    Nf, Tf, C, Hf, Wf = shape_5d_filters

    # Fold the time axis into the batch / output-channel axis.
    shape_4d_signals = (Ns * Ts, C, Hs, Ws)
    shape_4d_filters = (Nf * Tf, C, Hf, Wf)

    if border_mode[1] != border_mode[2]:
        raise NotImplementedError('height and width bordermodes must match')
    spatial_mode = border_mode[1]

    # Only hand static shapes to conv2d when the caller supplied them.
    out_4d = tensor.nnet.conv2d(
        signals.reshape(shape_4d_signals),
        filters.reshape(shape_4d_filters),
        input_shape=None if signals_shape is None else shape_4d_signals,
        filter_shape=None if filters_shape is None else shape_4d_filters,
        border_mode=spatial_mode)

    # compute the intended output size
    if spatial_mode == 'valid':
        Hout, Wout = Hs - Hf + 1, Ws - Wf + 1
    elif spatial_mode == 'full':
        Hout, Wout = Hs + Hf - 1, Ws + Wf - 1
    elif spatial_mode == 'half':
        Hout, Wout = Hs - (Hf % 2) + 1, Ws - (Wf % 2) + 1
    elif spatial_mode == 'same':
        raise NotImplementedError()
    else:
        raise ValueError('invalid border mode', spatial_mode)

    # reshape the temporary output to restore its original size
    out_tmp = out_4d.reshape((Ns, Ts, Nf, Tf, Hout, Wout))

    if Tf == 1:
        # No sum along Tf is needed; the Ts axis of the output is unchanged.
        return out_tmp.reshape((Ns, Ts, Nf, Hout, Wout))

    # Amount of zero padding along time required by the temporal mode.
    temporal_mode = border_mode[0]
    if temporal_mode == 'valid':
        Tpad = 0
    elif temporal_mode == 'full':
        Tpad = Tf - 1
    elif temporal_mode == 'half':
        Tpad = Tf // 2
    elif temporal_mode == 'same':
        raise NotImplementedError()
    else:
        raise ValueError('invalid border mode', temporal_mode)

    if Tpad == 0:
        summand = out_tmp
    else:
        # pad out_tmp with zeros before summing over the diagonal
        summand = tensor.zeros(dtype=out_tmp.dtype,
                               shape=(Ns, Ts + 2 * Tpad, Nf, Tf, Hout, Wout))
        summand = tensor.set_subtensor(
            summand[:, Tpad:(Ts + Tpad), :, :, :, :], out_tmp)

    # Sum along the diagonal through the (Ts, Tf) sub-matrix.
    return diagonal_subtensor(summand, 1, 3).sum(axis=3)
def compute_landmarks_helper(self, moms, init_landmarks):
    """Deform the landmarks with two kernel-weighted momentum steps.

    The first 136 entries of ``moms`` and ``init_landmarks`` are reshaped
    to (68, 2) and every row except 65, 66 and 67 is zeroed by a mask.
    Each step computes, for rows 65-67, a Gaussian-kernel-weighted sum of
    the masked momenta (bandwidth ``self.sigmaV2``) and moves the shape by
    that displacement times ``self.tau``.  The second step uses the shape
    produced by the first.

    Returns the flattened (136,) deformed shape.
    """
    active_rows = (65, 66, 67)

    moms = T.reshape(moms[:136], (68, 2))  # 68 * 2
    init_landmarks = T.reshape(init_landmarks[:136], (68, 2))

    mask = T.zeros((68, 2))
    mask = T.set_subtensor(mask[65:68, :], np.ones((3, 2)))
    masked_landmarks = init_landmarks * mask
    masked_moms = moms * mask

    def _displacement(centres, points):
        # centres[k] is the k-th active row tiled to (68, 2); the result
        # holds the kernel-weighted momentum sum in each active row.
        shift = T.zeros((68, 2))
        for row, centre in zip(active_rows, centres):
            kernel = T.exp(
                - T.sum((centre - points) ** 2, axis=1) / self.sigmaV2)
            weight = T.zeros((68, 2))
            weight = T.set_subtensor(weight[:, 0], kernel)
            weight = T.set_subtensor(weight[:, 1], kernel)
            shift = T.set_subtensor(
                shift[row, :], T.sum(weight * masked_moms, axis=0))
        return shift

    # First step: centres are taken from the masked initial landmarks and
    # masked again after tiling.
    first_centres = [
        T.alloc(masked_landmarks[row, :], 68, 2) * mask
        for row in active_rows
    ]
    dp = _displacement(first_centres, masked_landmarks)
    deformedShape = masked_landmarks + (dp * self.tau)

    # Second step: centres come from the deformed shape, without masking.
    second_centres = [
        T.alloc(deformedShape[row, :], 68, 2) for row in active_rows
    ]
    dp1 = _displacement(second_centres, deformedShape)

    output = (deformedShape + dp1 * self.tau).flatten()
    return output
def __init__(self, x, n_in, n_hidden, n_out, activation='tanh', order=1):
    """Build a recurrent network, optionally with multiplicative terms.

    Parameters
    ----------
    x
        Symbolic input; it is dimshuffled with ``[1, 0, 2]`` before being
        scanned, and ``x.shape[0]`` sizes the initial hidden state.
    n_in, n_hidden, n_out
        Input, hidden and output widths.
    activation
        One of 'tanh', 'relu', 'sigmoid' or 'linear' (case-insensitive).
    order
        0 builds a plain additive recurrence; any other value builds
        ``order`` parallel pre-activations whose activations are
        multiplied together.

    Raises
    ------
    ValueError
        If ``activation`` is not one of the supported names.

    Sets the shared parameters, ``self.params``, ``self.W``, ``self.L1``,
    ``self.L2``, ``self.output``, ``self.preact`` and ``self.pred``.
    """
    self.x = x
    self.n_in = n_in
    self.n_hidden = n_hidden
    self.n_out = n_out
    self.order = order

    activation_name = activation.lower()
    if activation_name == 'tanh':
        act = tanh
    elif activation_name == 'relu':
        act = relu
    elif activation_name == 'sigmoid':
        act = sigmoid
    elif activation_name == 'linear':
        act = lambda x: x
    else:
        # Fail here instead of with a NameError deep inside theano.scan.
        raise ValueError("Unknown activation: %r" % (activation,))

    def _slice(x, n):
        # Columns belonging to the n-th pre-activation.
        return x[:, n * self.n_hidden:(n + 1) * self.n_hidden]

    # initialize weights
    def ortho_weight(ndim, rng=rng):
        W = rng.randn(ndim, ndim)
        u, s, v = numpy.linalg.svd(W)
        return u.astype(theano.config.floatX)

    def uniform_weight(n1, n2, rng=rng):
        limit = numpy.sqrt(6. / (n1 + n2))
        return rng.uniform(low=-limit, high=limit,
                           size=(n1, n2)).astype(theano.config.floatX)

    def const_bias(n, value=0):
        return value * numpy.ones((n, ), dtype=theano.config.floatX)

    if self.order == 0:
        # no multiplicative terms
        self.Wx = theano.shared(uniform_weight(n_in, n_hidden), borrow=True)
        self.Wh = theano.shared(ortho_weight(n_hidden), borrow=True)
        self.bh = theano.shared(const_bias(n_hidden, 0), borrow=True)
        self.Wy = theano.shared(uniform_weight(n_hidden, n_out),
                                borrow=True)
        self.by = theano.shared(const_bias(n_out, 0), borrow=True)
        self.am = []
        self.ax = []
        self.ah = []
        self.params = [self.Wx, self.Wh, self.bh, self.Wy, self.by]
        self.W = [self.Wx, self.Wh, self.Wy]
        self.L1 = numpy.sum([abs(w).sum() for w in self.W])
        self.L2 = numpy.sum([(w**2).sum() for w in self.W])

        # forward function
        def forward(x_t, h_tm1, Wx, Wh, bh, am, ax, ah, Wy, by):
            preact = T.dot(x_t, Wx) + T.dot(h_tm1, Wh) + bh
            h_t = act(preact)
            y_t = softmax(T.dot(h_t, Wy) + by)
            return h_t, y_t, preact
    else:
        # One block of n_hidden columns/entries per order term.
        self.Wx = theano.shared(numpy.concatenate(
            [uniform_weight(n_in, n_hidden) for i in range(order)], axis=1),
            borrow=True)
        self.Wh = theano.shared(numpy.concatenate(
            [ortho_weight(n_hidden) for i in range(order)], axis=1),
            borrow=True)
        self.am = theano.shared(numpy.concatenate(
            [const_bias(n_hidden, 2) for i in range(order)], axis=0),
            borrow=True)
        self.ax = theano.shared(numpy.concatenate(
            [const_bias(n_hidden, 0.5) for i in range(order)], axis=0),
            borrow=True)
        self.ah = theano.shared(numpy.concatenate(
            [const_bias(n_hidden, 0.5) for i in range(order)], axis=0),
            borrow=True)
        self.bh = theano.shared(numpy.concatenate(
            [const_bias(n_hidden, 0) for i in range(order)], axis=0),
            borrow=True)
        self.Wy = theano.shared(uniform_weight(n_hidden, n_out),
                                borrow=True)
        self.by = theano.shared(const_bias(n_out, 0), borrow=True)
        self.params = [
            self.Wx, self.Wh, self.am, self.ax, self.ah, self.bh, self.Wy,
            self.by
        ]
        self.W = [self.Wx, self.Wh, self.Wy]
        self.L1 = numpy.sum([abs(w).sum() for w in self.W])
        self.L2 = numpy.sum([(w**2).sum() for w in self.W])

        # forward function
        def forward(x_t, h_tm1, Wx, Wh, bh, am, ax, ah, Wy, by):
            h_t = 1
            # Multiplicative term plus the two additive terms and the bias.
            preact = am * T.dot(x_t, Wx) * T.dot(h_tm1, Wh) \
                + ax * T.dot(x_t, Wx) \
                + ah * T.dot(h_tm1, Wh) \
                + bh
            for i in range(self.order):
                h_t = h_t * act(_slice(preact, i))
            y_t = softmax(T.dot(h_t, Wy) + by)
            return h_t, y_t, preact

    # Zero initial hidden state with x.shape[0] rows.
    h0 = T.alloc(T.zeros((self.n_hidden, ), dtype=theano.config.floatX),
                 x.shape[0], self.n_hidden)

    ([h, y, p], updates) = theano.scan(
        fn=forward,
        sequences=x.dimshuffle([1, 0, 2]),
        outputs_info=[dict(initial=h0, taps=[-1]), None, None],
        non_sequences=[
            self.Wx, self.Wh, self.bh, self.am, self.ax, self.ah, self.Wy,
            self.by
        ])

    self.output = y
    self.preact = p
    # NOTE(review): the scan output has a leading step axis, so axis=1 may
    # not be the class axis -- confirm the intended axis.
    self.pred = T.argmax(self.output, axis=1)
def MultiOutput_Bayesian_Calibration(n_y, DataComp, DataField, DataPred,
                                     output_folder):
    """Run a multi-output Bayesian calibration and write its results.

    Parameters
    ----------
    n_y
        Number of output columns; they are the first ``n_y`` columns of
        each data array.
    DataComp
        Simulation data: outputs, then inputs x, then calibration
        parameters t.
    DataField
        Measured data: outputs, then inputs x.
    DataPred
        Prediction points: true outputs, then inputs x.
    output_folder
        Directory that receives ``trace_summary.csv``, ``y_pred<k>.csv``,
        ``index<k>.csv``, ``cvrmse_dist.pdf`` and ``Prediction_Plot.pdf``.

    The input arrays are copied as floats before normalisation, so the
    caller's data is left untouched and integer input is not truncated.
    Nothing is returned.
    """
    # This is data preprocessing part
    n = np.shape(DataField)[0]  # number of measured data
    m = np.shape(DataComp)[0]  # number of simulation data
    p = np.shape(DataField)[1] - n_y  # number of input x
    q = np.shape(DataComp)[1] - p - n_y  # number of calibration parameters t

    xc = np.array(DataComp[:, n_y:], dtype=float)  # simulation x + t
    xf = np.array(DataField[:, n_y:], dtype=float)  # observed input
    yc = np.array(DataComp[:, :n_y], dtype=float)  # simulation output
    yf = np.array(DataField[:, :n_y], dtype=float)  # observed output
    x_pred = np.array(DataPred[:, n_y:], dtype=float)  # prediction points
    y_true = DataPred[:, :n_y]  # true values at the prediction points
    n_pred = np.shape(x_pred)[0]  # number of predictions

    # Put points xc, xf, and x_pred on [0,1]; the range comes from xc and
    # xf only.
    for i in range(p):
        x_min = min(min(xc[:, i]), min(xf[:, i]))
        x_max = max(max(xc[:, i]), max(xf[:, i]))
        xc[:, i] = (xc[:, i] - x_min) / (x_max - x_min)
        xf[:, i] = (xf[:, i] - x_min) / (x_max - x_min)
        x_pred[:, i] = (x_pred[:, i] - x_min) / (x_max - x_min)

    # Put calibration parameters t on domain [0,1]
    for i in range(p, (p + q)):
        t_min = min(xc[:, i])
        t_max = max(xc[:, i])
        xc[:, i] = (xc[:, i] - t_min) / (t_max - t_min)

    # store mean and std of yc for future scale back use
    yc_mean = np.zeros(n_y)
    yc_sd = np.zeros(n_y)

    # standardization of output yf and yc, both with the statistics of yc
    for i in range(n_y):
        yc_mean[i] = np.mean(yc[:, i])
        yc_sd[i] = np.std(yc[:, i])
        yc[:, i] = (yc[:, i] - yc_mean[i]) / yc_sd[i]
        yf[:, i] = (yf[:, i] - yc_mean[i]) / yc_sd[i]

    # This is modeling part
    with pm.Model() as model:
        # Claim prior part
        eta1 = pm.HalfCauchy("eta1", beta=5)  # GP amplitude
        lengthscale = pm.Gamma("lengthscale", alpha=2, beta=1,
                               shape=(p + q))  # GP lengthscales
        tf = pm.Beta("tf", alpha=2, beta=2, shape=q)  # calibration params
        sigma1 = pm.HalfCauchy('sigma1', beta=5)  # noise
        y_pred = pm.Normal('y_pred', 0, 1.5,
                           shape=(n_pred, n_y))  # y prediction

        # Setup prior of right cholesky matrix
        sd_dist = pm.HalfCauchy.dist(beta=2.5, shape=n_y)
        colchol_packed = pm.LKJCholeskyCov('colcholpacked', n=n_y, eta=2,
                                           sd_dist=sd_dist)
        colchol = pm.expand_packed_triangular(n_y, colchol_packed)

        # Concatenate data into a big matrix [[xf tf], [xc tc], [x_pred tf]]
        xf1 = tt.concatenate([xf, tt.fill(tt.zeros([n, q]), tf)], axis=1)
        x_pred1 = tt.concatenate(
            [x_pred, tt.fill(tt.zeros([n_pred, q]), tf)], axis=1)
        X = tt.concatenate([xf1, xc, x_pred1], axis=0)
        # Concatenate data into a big matrix [[yf], [yc], [y_pred]]
        y = tt.concatenate([yf, yc, y_pred], axis=0)

        # Covariance function of gaussian process
        cov_z = eta1**2 * pm.gp.cov.ExpQuad((p + q), ls=lengthscale)
        # Gaussian process with covariance function of cov_z
        gp = MultiMarginal(cov_func=cov_z)

        # Bayesian inference
        matrix_shape = [n + m + n_pred, n_y]
        outcome = gp.marginal_likelihood("outcome", X=X, y=y,
                                         colchol=colchol, noise=sigma1,
                                         matrix_shape=matrix_shape)
        trace = pm.sample(250, cores=1)

    # This part is for data collection and visualization
    # The summary is computed once and reused below.
    summary = pm.summary(trace)
    summary.to_csv(output_folder + '/trace_summary.csv')
    print(summary)

    name_columns = [
        'y' + str(j + 1) + '_pred' + str(i + 1)
        for i in range(n_pred) for j in range(n_y)
    ]
    # One row per posterior sample; the sample count is taken from the
    # trace instead of assuming chains * draws == 500.
    samples = np.array(trace['y_pred'])
    y_prediction = pd.DataFrame(
        samples.reshape(samples.shape[0], n_pred * n_y),
        columns=name_columns)

    # Draw Picture of cvrmse_dist and calculate index
    for i in range(n_y):
        # Columns of output i: every n_y-th column starting at i.
        index = list(range(0 + i, n_pred * n_y + i, n_y))
        y_prediction1 = pd.DataFrame(y_prediction.iloc[:, index])
        # Scale y_prediction back
        y_prediction1 = y_prediction1 * yc_sd[i] + yc_mean[i]
        y_prediction1.to_csv(output_folder + '/y_pred' + str(i + 1) + '.csv')

        # Calculate the distribution of cvrmse
        cvrmse = 100 * np.sqrt(
            np.sum(np.square(y_prediction1 - y_true[:, i]), axis=1) /
            n_pred) / np.mean(y_true[:, i])
        # Calculate the index and store it into csv
        index_cal(y_prediction1, y_true[:, i]).to_csv(
            output_folder + '/index' + str(i + 1) + '.csv')
        # Draw picture of cvrmse distribution of each y
        plt.subplot(n_y, 1, i + 1)
        plt.hist(cvrmse)

    plt.savefig(output_folder + '/cvrmse_dist.pdf')
    plt.close()

    # Draw Picture of Prediction_Plot
    for i in range(n_y):
        # NOTE(review): positional lookup assumes the y_pred rows come first
        # in the summary table -- confirm for the PyMC3 version in use.
        index = list(range(0 + i, n_pred * n_y + i, n_y))
        y_prediction_mean = (np.array(summary['mean'][index]) * yc_sd[i] +
                             yc_mean[i])
        y_prediction_975 = (np.array(summary['hpd_97.5'][index]) * yc_sd[i] +
                            yc_mean[i])
        y_prediction_025 = (np.array(summary['hpd_2.5'][index]) * yc_sd[i] +
                            yc_mean[i])
        plt.subplot(n_y, 1, i + 1)
        # estimated probability
        plt.scatter(x=range(n_pred), y=y_prediction_mean)
        # error bars on the estimate
        plt.vlines(range(n_pred), ymin=y_prediction_025,
                   ymax=y_prediction_975)
        # actual outcomes
        plt.scatter(x=range(n_pred), y=y_true[:, i], marker='x')

        plt.xlabel('predictor')
        plt.ylabel('outcome')

        # This is just to print original cvrmse to test whether outcome good
        if i == 0:
            cvrmse = 100 * np.sqrt(
                np.sum(np.square(y_prediction_mean - y_true[:, 0])) /
                len(y_prediction_mean - y_true[:, 0])) / np.mean(y_true[:, 0])
            print(cvrmse)

    plt.savefig(output_folder + '/Prediction_Plot.pdf')
    plt.close()
def test_convolutional_layer():
    """Build a small conv + MLP graph and print statistics of its gradients.

    The graph is a ``Convolutional`` brick (5x5 filters, 200 of them)
    followed by ``Rectifier`` and 2x2 ``MaxPooling``, wrapped in a
    ``ConvolutionalSequence`` over 15-channel 10x10 inputs, then a one-layer
    ``MLP`` with 10 outputs and a mean categorical cross-entropy cost.

    The function prints the shapes of the selected weight/bias variables and
    the mean of the reshaped gradient w.r.t. the convolution output, then
    returns ``None``.  It makes no assertions.
    """
    batch_size = 2
    x = T.tensor4()
    y = T.ivector()
    V = 200
    layer_conv = Convolutional(filter_size=(5, 5), num_filters=V,
                               name="toto",
                               weights_init=IsotropicGaussian(0.01),
                               biases_init=Constant(0.0))
    # try with no bias
    activation = Rectifier()
    pool = MaxPooling(pooling_size=(2, 2))

    convnet = ConvolutionalSequence([layer_conv, activation, pool],
                                    num_channels=15,
                                    image_size=(10, 10),
                                    name="conv_section")
    convnet.push_allocation_config()
    convnet.initialize()
    output = convnet.apply(x)
    # From here on the batch size is symbolic (taken from the conv output).
    batch_size = output.shape[0]
    output_dim = np.prod(convnet.get_dim('output'))
    result_conv = output.reshape((batch_size, output_dim))
    mlp = MLP(activations=[Rectifier().apply], dims=[output_dim, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0.0))
    mlp.initialize()
    output = mlp.apply(result_conv)
    cost = T.mean(Softmax().categorical_cross_entropy(y.flatten(), output))

    cg = ComputationGraph(cost)
    W = VariableFilter(roles=[WEIGHT])(cg.variables)
    B = VariableFilter(roles=[BIAS])(cg.variables)
    # NOTE(review): the last WEIGHT/BIAS variables are treated below as the
    # 4-D convolution parameters -- confirm the ordering of cg.variables.
    W = W[-1]
    b = B[-1]

    print(W.shape.eval())
    print(b.shape.eval())
    # The interactive pdb breakpoints that used to sit here were removed so
    # the test can run unattended.
    inputs_conv = VariableFilter(roles=[INPUT], bricks=[Convolutional])(cg)
    outputs_conv = VariableFilter(roles=[OUTPUT], bricks=[Convolutional])(cg)
    var_input = inputs_conv[0]
    var_output = outputs_conv[0]

    [d_W, d_S, d_b] = T.grad(cost, [W, var_output, b])

    w_shape = W.shape.eval()
    # Flatten each filter's gradient into one row.
    d_W = d_W.reshape((w_shape[0], w_shape[1] * w_shape[2] * w_shape[3]))

    # 6*6 is hard-coded; presumably the conv output map size for a 5x5
    # filter on a 10x10 image (10 - 5 + 1 = 6) -- confirm.
    d_b = T.zeros((w_shape[0], 6 * 6))
    #d_b = d_b.reshape((w_shape[0], 8*8))
    d_p = T.concatenate([d_W, d_b], axis=1)
    d_S = d_S.dimshuffle((1, 0, 2, 3)).reshape(
        (w_shape[0], batch_size, 6 * 6)).reshape(
            (w_shape[0], batch_size * 6 * 6))
    #d_S = d_S.reshape((2,200, 64))

    #x_value=1e3*np.random.ranf((1,15,10,10))
    x_value = 1e3 * np.random.ranf((2, 15, 10, 10))
    f = theano.function([x, y], [var_input, d_S, d_W],
                        allow_input_downcast=True,
                        on_unused_input='ignore')
    A, B, C = f(x_value, [5, 5])
    print(np.mean(B))
    return

    # NOTE(review): everything below is unreachable because of the bare
    # `return` above.  It is kept as the author left it; it depends on
    # `expansion_op` and `lin`, which are not defined in this block.
    E_A = expansion_op(A, (2, 15, 10, 10), (5, 5))
    print(E_A.shape)
    E_A = E_A.reshape((2 * 36, C.shape[1]))
    print(E_A.shape)
    tmp = C - np.dot(B, E_A)
    print(lin.norm(tmp, 'fro'))
def embedder(x, all_embeddings):
    """Index into ``all_embeddings`` after appending one all-zero row.

    A single row of zeros, as wide as ``all_embeddings.shape[1]``, is
    concatenated along axis 0, and the result is indexed with ``x``.  The
    extra row is therefore reachable at index ``all_embeddings.shape[0]``
    (and at ``-1``).
    """
    width = all_embeddings.shape[1]
    zero_row = T.zeros((1, width))
    padded = T.concatenate([all_embeddings, zero_row], axis=0)
    return padded[x]
def __init__(self, u, model=None):
    """Store the coefficients ``u`` and the quantities derived from them.

    Registers ``self.__citations__`` through ``add_citations_to_model``,
    converts ``u`` to a tensor variable (``self.u``), prepends a single
    ``-1`` entry of the same dtype, and passes the extended vector to
    ``get_cl`` to obtain ``self.c``.  ``self.c_norm`` is ``self.c`` divided
    by ``pi * (c[0] + 2 * c[1] / 3)``.
    """
    add_citations_to_model(self.__citations__, model=model)
    self.u = tt.as_tensor_variable(u)

    # One-element vector holding -1, in the dtype of `u`.
    leading = -1 + tt.zeros(1, dtype=self.u.dtype)
    self.c = get_cl(tt.concatenate([leading, self.u]))

    normalization = np.pi * (self.c[0] + 2 * self.c[1] / 3)
    self.c_norm = self.c / normalization
def jacobian_det(self, x):
    """Return a tensor of zeros with the same shape as ``x``."""
    shape = x.shape
    return tt.zeros(shape)
def cal_decoder_step(self, decoder_val):
    '''
    Calculate the weight ratios in decoder

    Every ratio below has the same form: a contribution term divided by
    the total it feeds into, with ``self.ep * sgn(total)`` added to the
    denominator (presumably to keep the division away from zero --
    confirm against where ``self.ep`` is defined).  All quantities are
    read at position ``self.idx`` of the sequences stored in
    ``decoder_val``.

    Shape arithmetic uses floor division (``//``) so that the sizes stay
    integers on Python 3 as well as Python 2.

    :type decoder_val: class
    :param decoder_val: the class which stores the intermediate variables
        in decoder

    :returns: R_h_h, R_h_x, R_h_y, R_outenergy_2_h, R_outenergy_2_x,
        R_outenergy_2_y_before are theano variables, weight ratios in
        decoder.
    '''
    # --- input embedding y -> state / reset / gate pre-activations -------
    y = decoder_val.y[self.idx].dimshuffle(0, 'x')
    R_state_in_y = (
        y * self.dec_input_emb + self.dec_input_emb_offset[self.idx]) / (
            decoder_val.state_in[self.idx] +
            self.ep * TT.sgn(decoder_val.state_in[self.idx])
        ).dimshuffle('x', 0)
    R_state_in_y = R_state_in_y.dimshuffle(1, 0)
    R_reset_in_y = y * self.dec_reset_emb / (
        decoder_val.reset_in[self.idx] +
        self.ep * TT.sgn(decoder_val.reset_in[self.idx])
    ).dimshuffle('x', 0)
    R_reset_in_y = R_reset_in_y.dimshuffle(1, 0)
    R_gate_in_y = y * self.dec_gate_emb / (
        decoder_val.gate_in[self.idx] +
        self.ep * TT.sgn(decoder_val.gate_in[self.idx])
    ).dimshuffle('x', 0)
    R_gate_in_y = R_gate_in_y.dimshuffle(1, 0)

    # --- context c -> gate / reset / state, then on to x via R_c_x --------
    c = decoder_val.c[self.idx].dimshuffle(0, 'x')
    R_gate_cin = c * self.dec_gate_context / (
        decoder_val.gate_cin[self.idx] +
        self.ep * TT.sgn(decoder_val.gate_cin[self.idx])
    ).dimshuffle('x', 0)
    R_gate_cin = R_gate_cin.dimshuffle(1, 0)
    R_reset_cin = c * self.dec_reset_context / (
        decoder_val.reset_cin[self.idx] +
        self.ep * TT.sgn(decoder_val.reset_cin[self.idx])
    ).dimshuffle('x', 0)
    R_reset_cin = R_reset_cin.dimshuffle(1, 0)
    R_state_cin = c * self.dec_input_context / (
        decoder_val.state_cin[self.idx] +
        self.ep * TT.sgn(decoder_val.state_cin[self.idx])
    ).dimshuffle('x', 0)
    R_state_cin = R_state_cin.dimshuffle(1, 0)
    R_gate_cin_x = TT.dot(R_gate_cin, self.R_c_x).dimshuffle(1, 0, 2)
    R_reset_cin_x = TT.dot(R_reset_cin, self.R_c_x)
    R_reset_cin_x = R_reset_cin_x.dimshuffle(1, 0, 2)
    R_state_cin_x = TT.dot(R_state_cin, self.R_c_x)
    R_state_cin_x = R_state_cin_x.dimshuffle(1, 0, 2)

    # --- previous hidden state -> gate / reset ----------------------------
    h_before = decoder_val.h_before[self.idx].dimshuffle(0, 'x')
    R_gate_h = h_before * self.dec_gate_hidden / (
        decoder_val.gate[self.idx] +
        self.ep * TT.sgn(decoder_val.gate[self.idx])
    ).dimshuffle('x', 0)
    R_gate_h = R_gate_h.dimshuffle(1, 0)
    R_reset_h = h_before * self.dec_reset_hidden / (
        decoder_val.reset[self.idx] +
        self.ep * TT.sgn(decoder_val.reset[self.idx])
    ).dimshuffle('x', 0)
    R_reset_h = R_reset_h.dimshuffle(1, 0)

    # --- rescale the y / x ratios by the share of each partial sum --------
    R_gate_y = R_gate_in_y * (
        decoder_val.gate_in[self.idx] /
        (decoder_val.gate[self.idx] +
         self.ep * TT.sgn(decoder_val.gate[self.idx]))
    ).dimshuffle(0, 'x')
    R_reset_y = R_reset_in_y * (decoder_val.reset_in[self.idx] / (
        decoder_val.reset[self.idx] +
        self.ep * TT.sgn(decoder_val.reset[self.idx]))
    ).dimshuffle(0, 'x')
    R_gate = (decoder_val.gate_cin[self.idx] /
              (decoder_val.gate[self.idx] +
               self.ep * TT.sgn(decoder_val.gate[self.idx]))
              ).dimshuffle('x', 0, 'x')
    R_gate_x = R_gate * R_gate_cin_x
    R_reset = (decoder_val.reset_cin[self.idx] /
               (decoder_val.reset[self.idx] +
                self.ep * TT.sgn(decoder_val.reset[self.idx]))
               ).dimshuffle('x', 0, 'x')
    R_reset_x = R_reset * R_reset_cin_x

    # --- reset-gated hidden state -----------------------------------------
    R_reseted_h = (R_reset_h * self.weight +
                   TT.eye(self.dim, self.dim) * self.weight)
    R_reseted_y = R_reset_y * self.weight
    R_reseted_x = R_reset_x * self.weight

    # --- candidate state ----------------------------------------------------
    R_state_x = R_state_cin_x * (
        decoder_val.state_cin[self.idx] /
        (decoder_val.state[self.idx] +
         self.ep * TT.sgn(decoder_val.state[self.idx]))
    ).dimshuffle('x', 0, 'x')
    R_state_y = R_state_in_y * (decoder_val.state_in[self.idx] / (
        decoder_val.state[self.idx] +
        self.ep * TT.sgn(decoder_val.state[self.idx]))
    ).dimshuffle(0, 'x')
    reseted = decoder_val.reseted[self.idx].dimshuffle(0, 'x')
    # NOTE(review): unlike the sibling ratios, this one indexes the weight
    # with [self.idx] and shuffles the denominator with (0, 'x') instead of
    # ('x', 0) -- kept as written; confirm it is intentional.
    R_state_reseted = reseted * self.dec_input_hidden[self.idx] / (
        decoder_val.state[self.idx] +
        self.ep * TT.sgn(decoder_val.state[self.idx])
    ).dimshuffle(0, 'x')
    R_state_reseted = R_state_reseted.dimshuffle(1, 0)
    R_state_h = TT.dot(R_state_reseted, R_reseted_h)
    R_state_x += TT.dot(R_state_reseted, R_reseted_x).dimshuffle(1, 0, 2)
    # NOTE(review): this assignment discards the R_state_y computed above,
    # whereas R_state_x accumulates with `+=` -- kept as written; confirm.
    R_state_y = TT.dot(R_state_reseted, R_reseted_y)

    # --- new hidden state h -------------------------------------------------
    R_h = (decoder_val.gate[self.idx] * decoder_val.state[self.idx] /
           (decoder_val.h[self.idx] +
            self.ep * TT.sgn(decoder_val.h[self.idx]))
           ).dimshuffle(0, 'x') * self.weight
    R_h_h = R_gate_h * R_h + R_state_h * R_h
    R_h2 = ((1 - decoder_val.gate[self.idx]) *
            decoder_val.h_before[self.idx] /
            (decoder_val.h[self.idx] +
             self.ep * TT.sgn(decoder_val.h[self.idx]))
            ).dimshuffle(0, 'x')
    R_h_h += TT.identity_like(R_h_h) * R_h2
    R_h_y = R_gate_y * R_h + R_state_y * R_h
    # Same ratio as R_h above, broadcast for the 3-D x-ratios.
    R_h = (decoder_val.gate[self.idx] * decoder_val.state[self.idx] /
           (decoder_val.h[self.idx] +
            self.ep * TT.sgn(decoder_val.h[self.idx]))
           ).dimshuffle('x', 0, 'x') * self.weight
    R_h_x = R_gate_x * R_h + R_state_x * R_h

    # --- readout ------------------------------------------------------------
    R_readout_c = c * self.dec_readout_context / (
        decoder_val.readout[self.idx] +
        self.ep * TT.sgn(decoder_val.readout[self.idx])
    ).dimshuffle('x', 0)
    R_readout_c = R_readout_c.dimshuffle(1, 0)
    R_readout_x = TT.dot(R_readout_c, self.R_c_x).dimshuffle(1, 0, 2)
    R_readout_h = h_before * self.dec_readout_hidden / (
        decoder_val.readout[self.idx] +
        self.ep * TT.sgn(decoder_val.readout[self.idx])
    ).dimshuffle('x', 0)
    R_readout_h = R_readout_h.dimshuffle(1, 0)
    y_before = decoder_val.y_before[self.idx].dimshuffle(0, 'x')
    R_readout_y_before = y_before * self.dec_readout_emb / (
        decoder_val.readout[self.idx] +
        self.ep * TT.sgn(decoder_val.readout[self.idx])
    ).dimshuffle('x', 0)
    R_readout_y_before = R_readout_y_before.dimshuffle(1, 0)

    # --- maxout: build a 0/1 selection matrix -----------------------------
    # Units are grouped in consecutive pairs; argmax picks one per pair and
    # a 1.0 is written at flat position argmax + 2*L + L*dim1, i.e. row L,
    # column 2*L + argmax when the row length equals dim1.
    # NOTE(review): the flat buffer is sized from self.dim while the index
    # uses dim1 -- this assumes dim1 == self.dim; confirm.
    dim1 = decoder_val.maxout[self.idx].shape[0]
    maxout = decoder_val.maxout[self.idx].reshape([dim1 // 2, 2])
    maxout = TT.argmax(maxout, axis=1)
    maxout = maxout.reshape([dim1 // 2])
    L = TT.arange(dim1 // 2)
    maxout = maxout + L * 2 + L * dim1
    R_maxout = TT.zeros((self.dim * self.dim // 2))
    R_maxout = TT.set_subtensor(R_maxout[maxout.flatten()], 1.0)
    R_maxout = R_maxout.reshape([self.dim // 2, self.dim])
    R_maxout_y_before = TT.dot(R_maxout, R_readout_y_before)
    R_maxout_h = TT.dot(R_maxout, R_readout_h)
    R_maxout_x = TT.dot(R_maxout, R_readout_x).dimshuffle(1, 0, 2)

    # --- output energies ----------------------------------------------------
    maxout = decoder_val.maxout[self.idx].dimshuffle(0, 'x')
    R_outenergy1_maxout = maxout * self.dec_probs_emb / (
        decoder_val.outenergy_1[self.idx] +
        self.ep * TT.sgn(decoder_val.outenergy_1[self.idx])
    ).dimshuffle('x', 0)
    R_outenergy1_maxout = R_outenergy1_maxout.dimshuffle(1, 0)
    R_outenergy1_y_before = TT.dot(R_outenergy1_maxout, R_maxout_y_before)
    R_outenergy1_h = TT.dot(R_outenergy1_maxout, R_maxout_h)
    R_outenergy1_x = TT.dot(R_outenergy1_maxout,
                            R_maxout_x).dimshuffle(1, 0, 2)

    probs = self.dec_probs.dimshuffle(
        1, 0)[decoder_val.y_idx[self.idx]].dimshuffle(0, 'x')
    outenergy_1 = decoder_val.outenergy_1[self.idx].dimshuffle(0, 'x')
    idx = decoder_val.y_idx[self.idx]
    outenergy_2 = (decoder_val.outenergy_2[self.idx][idx])
    # NOTE(review): every other denominator adds self.ep * TT.sgn(value);
    # this one adds self.ep * value -- kept as written; confirm.
    R_outenergy_2 = outenergy_1 * probs / (outenergy_2 +
                                           self.ep * outenergy_2)
    R_outenergy_2 = R_outenergy_2.dimshuffle(1, 0)
    R_outenergy_2_y_before = TT.dot(R_outenergy_2, R_outenergy1_y_before)
    R_outenergy_2_h = TT.dot(R_outenergy_2, R_outenergy1_h)
    R_outenergy_2_x = TT.dot(R_outenergy_2,
                             R_outenergy1_x).dimshuffle(1, 0, 2)

    return (R_h_h, R_h_x, R_h_y, R_outenergy_2_h, R_outenergy_2_x,
            R_outenergy_2_y_before)
def conv3d(x, kernel, strides=(1, 1, 1), border_mode='valid',
           dim_ordering='th', volume_shape=None, filter_shape=None):
    '''
    Run on cuDNN if available.

    # Arguments
        x: 5D input tensor.
        kernel: 5D kernel tensor.
        strides: sequence of three ints. Applied by slicing the
            stride-1 convolution output. Any sequence (tuple or list)
            is accepted.
        border_mode: string, "same" or "valid". "same" is implemented
            by zero-padding `x` and then running a "valid" convolution;
            it requires unit strides.
        dim_ordering: "th" (channels on axis 1) or "tf" (channels on
            the last axis). With "tf" the inputs are shuffled to "th"
            layout and the result is shuffled back.
        volume_shape: optional shape of `x`, in `dim_ordering` layout.
        filter_shape: optional shape of `kernel`, in `dim_ordering`
            layout.

    # Returns
        The convolution output, in `dim_ordering` layout.

    # Raises
        Exception: if `dim_ordering` or `border_mode` is not one of the
            accepted values.
        AssertionError: if `border_mode` is "same" and `strides` is not
            (1, 1, 1).
    '''
    if dim_ordering not in {'th', 'tf'}:
        raise Exception('Unknown dim_ordering ' + str(dim_ordering))

    if border_mode not in {'same', 'valid'}:
        raise Exception('Invalid border mode: ' + str(border_mode))

    # The comparisons below are against tuple literals; normalise so that
    # an equivalent list such as [1, 1, 1] is treated the same way.
    strides = tuple(strides)

    if dim_ordering == 'tf':
        # TF uses the last dimension as channel dimension,
        # instead of the 2nd one.
        # TH input shape: (samples, input_depth, conv_dim1, conv_dim2, conv_dim3)
        # TF input shape: (samples, conv_dim1, conv_dim2, conv_dim3, input_depth)
        # TH kernel shape: (out_depth, input_depth, kernel_dim1, kernel_dim2, kernel_dim3)
        # TF kernel shape: (kernel_dim1, kernel_dim2, kernel_dim3, input_depth, out_depth)
        x = x.dimshuffle((0, 4, 1, 2, 3))
        kernel = kernel.dimshuffle((4, 3, 0, 1, 2))
        # NOTE(review): the permuted shapes are computed but never passed
        # on to conv3d2d.conv3d below -- kept as in the original.
        if volume_shape:
            volume_shape = (volume_shape[0], volume_shape[4],
                            volume_shape[1], volume_shape[2],
                            volume_shape[3])
        if filter_shape:
            filter_shape = (filter_shape[4], filter_shape[3],
                            filter_shape[0], filter_shape[1],
                            filter_shape[2])

    if border_mode == 'same':
        assert strides == (1, 1, 1), \
            'border_mode "same" requires strides == (1, 1, 1)'
        # Total padding per axis is kernel_size - 1; the input is placed
        # at offset pad // 2 inside a zero tensor of the padded shape.
        pad_dim1 = (kernel.shape[2] - 1)
        pad_dim2 = (kernel.shape[3] - 1)
        pad_dim3 = (kernel.shape[4] - 1)
        output_shape = (x.shape[0], x.shape[1],
                        x.shape[2] + pad_dim1,
                        x.shape[3] + pad_dim2,
                        x.shape[4] + pad_dim3)
        output = T.zeros(output_shape)
        indices = (slice(None), slice(None),
                   slice(pad_dim1 // 2, x.shape[2] + pad_dim1 // 2),
                   slice(pad_dim2 // 2, x.shape[3] + pad_dim2 // 2),
                   slice(pad_dim3 // 2, x.shape[4] + pad_dim3 // 2))
        x = T.set_subtensor(output[indices], x)
        border_mode = 'valid'

    border_mode_3d = (border_mode, border_mode, border_mode)
    # conv3d2d is called with axes 1 and 2 swapped on both operands, and
    # the swap is undone on the result.
    conv_out = conv3d2d.conv3d(signals=x.dimshuffle(0, 2, 1, 3, 4),
                               filters=kernel.dimshuffle(0, 2, 1, 3, 4),
                               border_mode=border_mode_3d)
    conv_out = conv_out.dimshuffle(0, 2, 1, 3, 4)

    # support strides by manually slicing the output
    if strides != (1, 1, 1):
        conv_out = conv_out[:, :, ::strides[0], ::strides[1], ::strides[2]]

    if dim_ordering == 'tf':
        conv_out = conv_out.dimshuffle((0, 2, 3, 4, 1))

    return conv_out
def theano_one_hot(idx, n):
    """Return an ``(idx.shape[0], n)`` matrix with a 1 at ``[i, idx[i]]``.

    A zero matrix is created and, for every row ``i``, the entry in
    column ``idx[i]`` is set to 1.
    """
    blank = T.zeros((idx.shape[0], n))
    rows = T.arange(idx.shape[0])
    return T.set_subtensor(blank[rows, idx], 1)
def construct_graph(self, args, x, length, popstats=None):
    """Build the scan graph of the recurrent layer over the sequence ``x``.

    Parameters
    ----------
    args
        Options object; this method reads ``num_hidden``, ``baseline`` and
        ``use_population_statistics`` from it.
    x
        Input sequence. ``x.shape[0]`` is used as the (symbolic) number of
        steps and ``x.shape[1]`` as the batch size.
    length
        Training sequence length. It is compared against the step index
        and also used as a concrete dimension for the population-statistic
        shared variables, so it has to be usable as a numpy shape entry.
    popstats : dict, optional
        Population statistics keyed ``"<a|b|c>_<mean|var>"``. Read only
        when ``args.use_population_statistics`` is set and
        ``args.baseline`` is not.

    Returns
    -------
    outputs : dict
        Scan outputs keyed by ``h, c, atilde, btilde`` and the six
        ``<a|b|c>_<mean|var>`` names.
    updates
        The updates returned by ``theano.scan``, extended with moving
        average updates for newly created population statistics.
    dummy_states : dict
        The all-zero ``h`` and ``c`` tensors that are added into the
        states at every step.
    popstats
        The ``popstats`` argument, or the newly created shared variables
        when statistics are being estimated.
    """
    p = self.parameters

    # use `symlength` where we need to be able to adapt to longer sequences
    # than the ones we trained on
    symlength = x.shape[0]
    t = T.cast(T.arange(symlength), "int16")
    long_sequence_is_long = T.ge(
        T.cast(T.arange(symlength), theano.config.floatX), length)
    batch_size = x.shape[1]
    dummy_states = dict(
        h=T.zeros((symlength, batch_size, args.num_hidden)),
        c=T.zeros((symlength, batch_size, args.num_hidden)))

    output_names = "h c atilde btilde".split()
    for key in "abc":
        for stat in "mean var".split():
            output_names.append("%s_%s" % (key, stat))

    def stepfn(t, long_sequence_is_long, x, dummy_h, dummy_c, h, c):
        # population statistics are sequences, but we use them
        # like a non-sequence and index it ourselves. this allows
        # us to generalize to longer sequences, in which case we
        # repeat the last element.
        popstats_by_key = dict()
        for key in "abc":
            popstats_by_key[key] = dict()
            for stat in "mean var".split():
                if not args.baseline and args.use_population_statistics:
                    popstat = popstats["%s_%s" % (key, stat)]
                    # pluck the appropriate population statistic for this
                    # time step out of the sequence, or take the last
                    # element if we've gone beyond the training length.
                    # if `long_sequence_is_long` then `t` may be unreliable
                    # as it will overflow for looong sequences.
                    popstat = theano.ifelse.ifelse(
                        long_sequence_is_long, popstat[-1], popstat[t])
                else:
                    popstat = None
                popstats_by_key[key][stat] = popstat

        atilde, btilde = T.dot(h, p.Wa), T.dot(x, p.Wx)
        a_normal, a_mean, a_var = self.bn_a.construct_graph(
            atilde, baseline=args.baseline, **popstats_by_key["a"])
        b_normal, b_mean, b_var = self.bn_b.construct_graph(
            btilde, baseline=args.baseline, **popstats_by_key["b"])
        ab = a_normal + b_normal
        # Split the summed pre-activation into four num_hidden-wide
        # slices: the first goes through self.activation, the other three
        # through a sigmoid.
        g, f, i, o = [
            fn(ab[:, j * args.num_hidden:(j + 1) * args.num_hidden])
            for j, fn in enumerate([self.activation] + 3 * [T.nnet.sigmoid])
        ]
        c = dummy_c + f * c + i * g
        c_normal, c_mean, c_var = self.bn_c.construct_graph(
            c, baseline=args.baseline, **popstats_by_key["c"])
        h = dummy_h + o * self.activation(c_normal)

        # Explicit name -> value mapping.  Looking names up through
        # locals() inside a list comprehension fails on Python 3 before
        # 3.12, where the comprehension has its own scope.
        step_values = dict(
            h=h, c=c, atilde=atilde, btilde=btilde,
            a_mean=a_mean, a_var=a_var,
            b_mean=b_mean, b_var=b_var,
            c_mean=c_mean, c_var=c_var)
        return [step_values[name] for name in output_names]

    sequences = [
        t, long_sequence_is_long, x, dummy_states["h"], dummy_states["c"]
    ]
    # Only h and c are recurrent; the remaining outputs get `None`.
    outputs_info = [
        T.repeat(p.h0[None, :], batch_size, axis=0),
        T.repeat(p.c0[None, :], batch_size, axis=0),
    ]
    outputs_info.extend([None] * (len(output_names) - len(outputs_info)))
    outputs, updates = theano.scan(stepfn,
                                   sequences=sequences,
                                   outputs_info=outputs_info)
    outputs = dict(zip(output_names, outputs))

    if not args.baseline and not args.use_population_statistics:
        # prepare population statistic estimation
        popstats = dict()
        alpha = 0.05
        for key, size in zip(
                "abc",
                [4 * args.num_hidden, 4 * args.num_hidden, args.num_hidden]):
            for stat, init in zip("mean var".split(), [0, 1]):
                name = "%s_%s" % (key, stat)
                popstats[name] = theano.shared(
                    init + np.zeros((length, size),
                                    dtype=theano.config.floatX),
                    name=name)
                popstats[name].tag.estimand = outputs[name]
                # Exponential moving average with weight `alpha` on the
                # newly computed statistic.
                updates[popstats[name]] = (alpha * outputs[name] +
                                           (1 - alpha) * popstats[name])
    return outputs, updates, dummy_states, popstats
def jacobian_det(self, x):
    """Return zeros shaped like ``x``, summed over the last axis.

    The result has the shape of ``x`` with its final axis removed and
    every entry is zero.
    """
    return tt.sum(tt.zeros(x.shape), axis=-1)