def marginalize_over_v_z(self, h):
    """Build the symbolic energy of ``h`` returned by this marginalisation.

    The hidden-bias term follows
    ``sum_i h_i * b_i - beta * penalty_i`` where ``penalty_i`` is
    ``ln(1 + e^{b_i})`` for ``self.penalty == "softplus_bi"`` and
    ``ln(1 + e^0)`` for ``"softplus0"``.  A second term sums
    ``ln(1 + e^{.})`` of the ``h``/``W`` products plus ``self.c``.
    NaN entries are replaced by 0 before each summation.

    :param h: symbolic hidden configuration(s); presumably one row per
        example -- TODO confirm against the caller.
    :return: the negated sum of the two terms.
    :raises NameError: if ``self.penalty`` is not a supported value.
    """
    # In theory the bias term is simply ``(h * self.b).T``.  However, when
    # there is broadcasting, the Theano element-wise multiplication between
    # np.NaN and 0 is 0 instead of np.NaN, so T.tensordot and T.diagonal are
    # used instead as a workaround.
    # See Theano issue #3848 (https://github.com/Theano/Theano/issues/3848)
    bias_term = T.tensordot(h, self.b, axes=0)
    bias_term = T.diagonal(bias_term, axis1=1, axis2=2).T

    if self.penalty == "softplus_bi":
        bias_term = bias_term - self.beta * T.log(1 + T.exp(self.b))[:, None]
    elif self.penalty == "softplus0":
        # ln(1 + e^0) is a 0-d constant: it cannot be indexed with
        # ``[:, None]`` and broadcasts against ``bias_term`` as it is.
        bias_term = bias_term - self.beta * T.log(1 + T.exp(0))
    else:
        raise NameError("Invalid penalty term")

    # Remove NaN
    bias_term = T.set_subtensor(bias_term[(T.isnan(bias_term)).nonzero()], 0)
    bias_term = T.sum(bias_term, axis=0, keepdims=True).T

    # Same NaN-preserving outer-product/diagonal trick for the weights.
    weight_term = T.tensordot(h, self.W, axes=0)
    weight_term = T.diagonal(weight_term, axis1=1, axis2=2)
    weight_term = T.set_subtensor(
        weight_term[(T.isnan(weight_term)).nonzero()], 0)
    weight_term = T.sum(weight_term, axis=2) + self.c[None, :]
    weight_term = T.sum(T.log(1 + T.exp(weight_term)), axis=1, keepdims=True)

    return -(bias_term + weight_term)
def test_transfer(self):
    """Check that compiled GPU tensordot graphs end with ``host_from_gpu``.

    Three ``axes`` specifications are compiled with ``mode_with_gpu``: a
    plain count and two explicit axis-pair lists.  Each compiled function
    is also executed once on random float32 data.
    """
    # Draw the fixtures in this fixed order so that the random stream is
    # consumed exactly as before.
    lhs = self.rng.rand(20, 10, 5, 8).astype("float32")
    rhs_plain = self.rng.rand(5, 8, 20).astype("float32")
    rhs_permuted = self.rng.rand(8, 20, 5).astype("float32")

    x = tensor.ftensor4("x")
    y = tensor.ftensor3("y")

    cases = (
        (2, rhs_plain),
        ([(0, 3), (1, 0)], rhs_permuted),
        ([(0, 3, 2), (1, 0, 2)], rhs_permuted),
    )
    for axes, rhs in cases:
        compiled = theano.function([x, y], tensor.tensordot(x, y, axes),
                                   mode=mode_with_gpu)
        final_node = compiled.maker.fgraph.toposort()[-1]
        assert final_node.op == cuda.host_from_gpu
        # Let DebugMode debug
        compiled(lhs, rhs)
def get_output(self, train=False):
    """Return the attention-weighted input of this layer.

    The input is projected through ``self.att_proj`` and the layer
    activation, scored according to ``self.context`` (``'word'``,
    ``'clause'`` or ``'para'``), and the softmax of the scores is used to
    take dot products with the input, sample by sample.

    :param train: forwarded to ``self.get_input``.
    :return: symbolic tensor produced by the outer scan over samples.
    :raises ValueError: if ``self.context`` is not a supported value.
    """
    # Fail early with a clear message; previously an unknown context left
    # ``att_scores`` unbound and surfaced as an UnboundLocalError below.
    if self.context not in ('word', 'clause', 'para'):
        raise ValueError("Unknown attention context: %r" % (self.context,))

    layer_input = self.get_input(train)
    proj_input = self.activation(
        T.tensordot(layer_input, self.att_proj, axes=(3, 0)))

    if self.context == 'word':
        att_scores = T.tensordot(proj_input, self.att_scorer, axes=(3, 0))
    elif self.context == 'clause':
        def step(a_t, h_tm1, W_in, W, sc):
            # One recurrent step: new hidden state and its score.
            h_t = T.tanh(T.tensordot(a_t, W_in, axes=(2, 0))
                         + T.tensordot(h_tm1, W, axes=(2, 0)))
            s_t = T.tensordot(h_t, sc, axes=(2, 0))
            return h_t, s_t

        initial_hidden = T.zeros(
            (proj_input.shape[0], self.td1, self.rec_hid_dim))
        [_, scores], _ = theano.scan(
            step,
            sequences=[proj_input.dimshuffle(2, 0, 1, 3)],
            outputs_info=[initial_hidden, None],
            non_sequences=[self.rec_in_weights, self.rec_hid_weights,
                           self.att_scorer])
        att_scores = scores.dimshuffle(1, 2, 0)
    else:  # 'para'
        att_scores = T.tensordot(
            proj_input, self.att_scorer, axes=(3, 2)).sum(axis=(1, 2))

    # Nested scans. For shame!
    def get_sample_att(sample_input, sample_att):
        sample_att_inp, _ = theano.scan(
            fn=lambda s_att_i, s_input_i: T.dot(s_att_i, s_input_i),
            sequences=[T.nnet.softmax(sample_att), sample_input])
        return sample_att_inp

    att_input, _ = theano.scan(fn=get_sample_att,
                               sequences=[layer_input, att_scores])
    return att_input
def shade(self, shape, lights, camera):
    """Shade ``shape`` with the Phong reflection model.

    See: http://en.wikipedia.org/wiki/Phong_reflection_model#Description

    The material parameters are 1d, so a single-channel shading is computed
    first and converted to colour afterwards.  Only ``lights[0]`` is used.

    :return: the clipped colour image, with ``[0., 0., 0.]`` wherever
        ``shape.distance(camera.rays)`` is infinite.
    """
    light = lights[0]
    material = shape.material
    normals = shape.normals(camera.rays)
    light_dir = light.normed_dir()

    # Dot product of the normals with the reversed light direction; shared
    # by the diffuse and the specular terms.
    n_dot_l = T.tensordot(normals, -light_dir, 1)

    ambient = material.ka
    # diffuse (lambertian)
    diffuse = material.kd * n_dot_l
    # specular
    reflected = 2.0 * n_dot_l.dimshuffle(0, 1, 'x') * normals + light_dir
    specular = material.ks * (
        T.tensordot(reflected, camera.look_at, 1) ** material.shininess)

    # phong
    shading = ambient + diffuse + specular
    colorized = (shading.dimshuffle(0, 1, 'x')
                 * material.color.dimshuffle('x', 'x', 0)
                 * light.intensity.dimshuffle('x', 'x', 0))
    clipped = T.clip(colorized, 0, 1)

    distances = shape.distance(camera.rays)
    return broadcasted_switch(T.isinf(distances), [0., 0., 0.], clipped)
def sym_mask_logdensity_estimator_intermediate(self, x, mask):
    """Build the masked log-density estimate and its intermediate tensors.

    ``x`` and ``mask`` are transposed on entry (to BxD per the shape
    comments below).  The masked input is passed through the hidden layers,
    then through three output heads (alpha, mu, sigma) that parameterise a
    mixture whose log-density is evaluated on the dimensions where
    ``mask`` is 0 and rescaled by ``D / (D - d)``.

    :return: ``(logdensity, z_alpha, z_mu, z_sigma, Alpha, Mu, Sigma, h)``.
    :raises AssertionError: if the configured nonlinearity is neither
        ``"sigmoid"`` nor ``"RLU"``.
    """
    non_linearity_name = self.parameters["nonlinearity"].get_name()
    assert non_linearity_name == "sigmoid" or non_linearity_name == "RLU"

    x = x.T  # BxD
    mask = mask.T  # BxD
    output_mask = constantX(1) - mask  # BxD
    D = constantX(self.n_visible)
    # d is the 1-based index of the dimension whose value to infer
    # (not the size of the context)
    d = mask.sum(1)
    masked_input = x * mask  # BxD

    h = self.nonlinearity(
        T.dot(masked_input, self.W1) + T.dot(mask, self.Wflags) + self.b1
    )  # BxH
    # ``range`` rather than ``xrange`` so the loop also runs on Python 3.
    for l in range(self.n_layers - 1):
        h = self.nonlinearity(T.dot(h, self.Ws[l]) + self.bs[l])  # BxH

    z_alpha = T.tensordot(h, self.V_alpha, [[1], [1]]) + T.shape_padleft(self.b_alpha)
    z_mu = T.tensordot(h, self.V_mu, [[1], [1]]) + T.shape_padleft(self.b_mu)
    z_sigma = T.tensordot(h, self.V_sigma, [[1], [1]]) + T.shape_padleft(self.b_sigma)

    # Normalise exp(z_alpha) over the last axis to get mixing weights.
    temp = T.exp(z_alpha)
    Alpha = temp / T.shape_padright(temp.sum(2))  # BxDxC
    Mu = z_mu  # BxDxC
    Sigma = T.exp(z_sigma)  # BxDxC

    # Per-component Gaussian log-density of x.
    Phi = (
        -constantX(0.5) * T.sqr((Mu - T.shape_padright(x)) / Sigma)
        - T.log(Sigma)
        - constantX(0.5 * np.log(2 * np.pi))
    )  # BxDxC
    logdensity = (log_sum_exp(Phi + T.log(Alpha), axis=2) * output_mask).sum(1) * D / (D - d)
    return (logdensity, z_alpha, z_mu, z_sigma, Alpha, Mu, Sigma, h)
def test_tensordot_reshape():
    '''Test that the tensordot implementation using dimshuffle, reshape and
    dot gives the same results as the default (numpy) version'''
    # Build the fixtures as nested outer products of four 1-d ramps.
    a = numpy.arange(20, dtype=theano.config.floatX) / 20.0
    b = numpy.arange(10, dtype=theano.config.floatX) / 10.0
    c = numpy.arange(5, dtype=theano.config.floatX) / 5.0
    d = numpy.arange(8, dtype=theano.config.floatX) / 8.0

    tensor1 = numpy.tensordot(
        a, numpy.tensordot(b, numpy.tensordot(c, d, 0), 0), 0)
    tensor2 = numpy.tensordot(c, numpy.tensordot(d, a, 0), 0)
    tensor3 = tensor2.swapaxes(1, 2).swapaxes(0, 2)  # d, a, c

    x = T.tensor4('x')
    y = T.tensor3('y')

    cases = (
        (2, tensor2),                           # number of axes to sum over
        ([(0, 3), (1, 0)], tensor3),            # axis pairs
        ([(0, 3, 2), (1, 0, 2)], tensor3),
    )
    for axes, rhs in cases:
        default = theano.function([x, y], T.tensordot(x, y, axes))(tensor1, rhs)
        reshape = theano.function([x, y], B.tensordot(x, y, axes))(tensor1, rhs)
        assert numpy.allclose(default, reshape)
def sym_masked_neg_loglikelihood_gradient(self, x, mask):
    """Return the negative mean log-likelihood and its parameter gradients.

    x is a matrix of column datapoints (DxB), D = n_visible, B = batch size.

    :return: ``(loss, gradients)`` where ``gradients`` maps parameter names
        to symbolic gradients.  ``"Ws"`` and ``"bs"`` are only present when
        ``self.n_layers > 1``.
    """
    (logdensity, z_alpha, z_mu, z_sigma,
     Alpha, Mu, Sigma, h) = self.sym_mask_logdensity_estimator_intermediate(x, mask)

    loss = -logdensity.mean(dtype=floatX)

    def head_gradients(dp_dz):
        # dp_dz is BxDxC; returns the bias gradient (DxC) and the weight
        # gradient (DxHxC) of one output head.
        g_bias = dp_dz.sum(0)
        g_weights = T.tensordot(h.T, dp_dz, [[1], [0]]).dimshuffle((1, 0, 2))
        return g_bias, g_weights

    gb_alpha, gV_alpha = head_gradients(T.grad(loss, z_alpha))
    # Heuristic: the mu gradient is rescaled by Sigma.
    gb_mu, gV_mu = head_gradients(T.grad(loss, z_mu) * Sigma)
    gb_sigma, gV_sigma = head_gradients(T.grad(loss, z_sigma))

    gradients = {
        "V_alpha": gV_alpha,
        "b_alpha": gb_alpha,
        "V_mu": gV_mu,
        "b_mu": gb_mu,
        "V_sigma": gV_sigma,
        "b_sigma": gb_sigma,
    }
    if self.n_layers > 1:
        gWs, gbs, gW1, gWflags, gb1 = T.grad(
            loss, [self.Ws, self.bs, self.W1, self.Wflags, self.b1])
        gradients["Ws"] = gWs
        gradients["bs"] = gbs
    else:
        gW1, gWflags, gb1 = T.grad(loss, [self.W1, self.Wflags, self.b1])
    gradients["W1"] = gW1
    gradients["b1"] = gb1
    gradients["Wflags"] = gWflags

    return (loss, gradients)
def output(self, input_vectors):
    """
    Calculate the n_output dot product scalars of this layer

    @param input_vectors: n_input vectors (actual shape should be
                          (n_batch, n_input, n_dimension))
    @return: the two projections of the input (through W1 and W2),
             multiplied element-wise and summed over axis 1
    """
    first_projection = T.tensordot(input_vectors, self.W1, [[1], [0]])
    second_projection = T.tensordot(input_vectors, self.W2, [[1], [0]])
    return T.sum(first_projection * second_projection, axis=1)
def output(self, train):
    """Run the forward and backward recurrences over the layer input.

    The input is dimshuffled to put axis 1 first (presumably time-major --
    TODO confirm).  When ``self.is_entity`` is set, the last step is split
    off and re-attached to the sequence output.

    :return: with ``self.return_sequences``, the sum of both directions'
        projected outputs plus the tiled output bias (with the entity step
        concatenated on axis 1 if applicable); otherwise the concatenation
        of the last forward output and the first backward output.
    """
    X = self.get_input(train).dimshuffle((1, 0, 2))
    if self.is_entity:
        Entity = X[-1:].dimshuffle(1, 0, 2)
        X = X[:-1]

    # Tile the output bias to X.shape[1] x X.shape[0] x output_dim.
    bias_row = self.b_y.reshape((1, self.output_dim))
    bias_plane = T.repeat(bias_row, X.shape[0], axis=0).reshape(
        (1, X.shape[0], self.output_dim))
    b_yn = T.repeat(bias_plane, X.shape[1], axis=0)

    def project(weights, bias):
        # Input-to-gate projection for every step at once.
        return T.dot(X, weights) + bias

    xif = project(self.W_if, self.b_if)
    xib = project(self.W_ib, self.b_ib)
    xff = project(self.W_ff, self.b_ff)
    xfb = project(self.W_fb, self.b_fb)
    xcf = project(self.W_cf, self.b_cf)
    xcb = project(self.W_cb, self.b_cb)
    xof = project(self.W_of, self.b_of)
    xob = project(self.W_ob, self.b_ob)

    def recur(gate_inputs, recurrent_weights):
        # One direction of the recurrence; only the outputs are needed.
        [outputs, _memories], _updates = theano.scan(
            self._step,
            sequences=gate_inputs,
            outputs_info=[
                alloc_zeros_matrix(X.shape[1], self.output_dim),
                alloc_zeros_matrix(X.shape[1], self.output_dim)
            ],
            non_sequences=recurrent_weights,
            truncate_gradient=self.truncate_gradient
        )
        return outputs

    outputs_f = recur([xif, xff, xof, xcf],
                      [self.U_if, self.U_ff, self.U_of, self.U_cf])
    outputs_b = recur([xib, xfb, xob, xcb],
                      [self.U_ib, self.U_fb, self.U_ob, self.U_cb])

    if not self.return_sequences:
        return T.concatenate((outputs_f[-1], outputs_b[0]))

    forward_part = T.tensordot(
        outputs_f.dimshuffle((1, 0, 2)), self.W_yf, [[2], [0]])
    backward_part = T.tensordot(
        outputs_b[::-1].dimshuffle((1, 0, 2)), self.W_yb, [[2], [0]])
    y = T.add(T.add(forward_part, backward_part), b_yn)
    if self.is_entity:
        return T.concatenate([y, Entity], axis=1)
    return y
def complex_tensordot(a, b, axes=2):
    """Tensordot of two complex tensors stored as stacked real/imaginary parts.

    Index 0 along the leading axis of ``a`` and ``b`` is read as the real
    part and index 1 as the imaginary part; the result is stacked the same
    way along axis 0.

    :param axes: forwarded to every underlying ``tensor.tensordot`` call.
    """
    a_real, a_imag = a[0, ...], a[1, ...]
    b_real, b_imag = b[0, ...], b[1, ...]

    def contract(lhs, rhs):
        return tensor.tensordot(lhs, rhs, axes=axes)

    # (ar + i*ai)(br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br)
    real_part = contract(a_real, b_real) - contract(a_imag, b_imag)
    imag_part = contract(a_real, b_imag) + contract(a_imag, b_real)
    return tensor.stack([real_part, imag_part], axis=0)
def apply_mat_to_kron(x, a, b, arg_type="numpy"):
    """Multiply the rows of ``x`` by ``kron(a, b)`` without forming it.

    Each row of ``x`` is reshaped to ``(a.shape[0], b.shape[0])``, contracted
    with ``a`` and then with ``b``, and flattened again.

    :param x: 2-d array with ``a.shape[0] * b.shape[0]`` columns.
    :param arg_type: ``"numpy"`` or ``"theano"``; selects the tensordot
        implementation used.
    :raises ValueError: if ``arg_type`` is anything else.
    """
    cube = x.reshape((x.shape[0], a.shape[0], b.shape[0]))

    if arg_type == "numpy":
        contract = np.tensordot
    elif arg_type == "theano":
        contract = T.tensordot
    else:
        raise ValueError("arg_type must be 'numpy' or 'theano'")

    # Contracting axis 1 twice works because the first contraction moves
    # the remaining axis of ``cube`` into position 1.
    after_a = contract(cube, a, axes=([1], [0]))
    after_b = contract(after_a, b, axes=([1], [0]))
    return after_b.reshape((x.shape[0], -1))
def output(self, input_value):
    """Apply the (optional) affine map and the activation function.

    When ``self.size`` is ``None`` only the activation is applied.
    Otherwise ``input_value`` is contracted with ``self.weight`` over its
    last axis (``self.dotdim is None``) or over axis ``self.dotdim + 1``,
    and ``self.bias`` is added.  In the latter case, if the contracted
    position is not the last axis of the result, the two are swapped.
    """
    if self.size is None:
        return self.activation_function(input_value)

    if self.dotdim is None:
        last_axis = input_value.ndim - 1
        input_value = T.tensordot(input_value, self.weight,
                                  axes = [last_axis, 0]) + self.bias
    else:
        input_value = T.tensordot(input_value, self.weight,
                                  axes = [self.dotdim + 1, 0]) + self.bias
        # ``ndim`` is read from the contracted result, not from the input.
        result_last = input_value.ndim - 1
        if self.dotdim + 1 < result_last:
            input_value = input_value.swapaxes(result_last, self.dotdim + 1)

    return self.activation_function(input_value)
def contrastive_divergence_1(self, v1):
    '''Determine the weight updates according to CD-1

    Returns a 3-tuple: the difference of the ``v1``/``h1`` and ``v2``/``h2p``
    products, and the summed differences ``v1 - v2`` and ``h1 - h2p``
    (over axis 0), each scaled by ``1 / self.minibatch_size``.
    '''
    # One step of the chain: sample h from the data, reconstruct v, then
    # propagate up again.
    h1 = self.sample_h_given_v(v1)
    v2 = self.sample_v_given_h(h1)
    h2p = self.propup(v2)

    data_term = T.tensordot(v1, h1, [[0], [0]])
    model_term = T.tensordot(v2, h2p, [[0], [0]])
    scale = 1.0 / self.minibatch_size

    weight_update = (data_term - model_term) * scale
    visible_update = T.sum(v1 - v2, axis=0) * scale
    hidden_update = T.sum(h1 - h2p, axis=0) * scale
    return (weight_update, visible_update, hidden_update)
def get_output_for(self, inputs, **kwargs):
    """
    :param inputs: list of theano.TensorType
        `inputs[0]` should always be the symbolic input variable. When this
        layer has a mask input (i.e. was instantiated with
        `mask_input != None`, indicating that the lengths of sequences in
        each batch vary), `inputs` should have length 2, where `inputs[1]`
        is the `mask`. The `mask` should be supplied as a Theano variable
        denoting whether each time step in each sequence in the batch is
        part of the sequence or not. `mask` should be a matrix of shape
        ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when
        ``j <= (length of sequence i)`` and ``mask[i, j] = 0`` when
        ``j > (length of sequence i)``.
    :return: theano.TensorType
        Symbolic output variable.
    """
    features = inputs[0]
    if self.mask_incoming_index > 0:
        mask = inputs[self.mask_incoming_index]
    else:
        mask = None

    # Bi-affine part, step 1:
    # [batch, length, dim] * [dim, dim, num_label]
    # -> [batch, length, dim, num_label]
    scores = T.tensordot(features, self.U, axes=[[2], [0]])
    # Step 2:
    # [batch, length, dim, num_label] * [batch, dim, length]
    # -> [batch, length, num_label, length], then reordered to
    # [batch, length, length, num_label].
    scores = T.batched_tensordot(scores, features.dimshuffle(0, 2, 1),
                                 axes=([2], [1]))
    scores = scores.dimshuffle(0, 1, 3, 2)

    # Head bias part: [batch, length, dim] * [dim, num_label]
    # -> [batch, length, num_label], broadcast over axis 2.
    if self.W_h is not None:
        head_scores = T.tensordot(features, self.W_h, axes=[[2], [0]])
        scores = scores + head_scores.dimshuffle(0, 1, 'x', 2)

    # Child part: same contraction with W_c, broadcast over axis 1.
    if self.W_c is not None:
        child_scores = T.tensordot(features, self.W_c, axes=[[2], [0]])
        scores = scores + child_scores.dimshuffle(0, 'x', 1, 2)

    # Bias part.
    if self.b is not None:
        scores = scores + self.b.dimshuffle('x', 'x', 'x', 0)

    if mask is None:
        return scores

    # Apply the mask along both length axes.
    scores = scores * mask.dimshuffle(0, 1, 'x', 'x')
    scores = scores * mask.dimshuffle(0, 'x', 1, 'x')
    return scores
def function(self, xs, h_prevs, c_prevs):
    """Compute one LSTM-style step for a batch.

    ``xs``, ``h_prevs`` and a column of ones (acting as the bias input) are
    concatenated along axis 1 and contracted with each gate's weights.

    :return: ``(h, cell_state)`` -- the new hidden output and cell state.
    """
    ones_column = T.shape_padright(T.ones_like(xs[:, 0]))
    stacked = T.concatenate((xs, h_prevs, ones_column), axis=1)

    def affine(weights):
        # Contract the feature axis of the stacked input with axis 1 of
        # the gate weights.
        return T.tensordot(stacked, weights, axes=[[1], [1]])

    forget_gate = T.nnet.sigmoid(affine(self.W_forget_theano))
    input_gate = T.nnet.sigmoid(affine(self.W_input_theano))
    candidate_vector = T.tanh(affine(self.W_candidate_theano))
    cell_state = forget_gate * c_prevs + input_gate * candidate_vector

    output_gate = T.nnet.sigmoid(affine(self.W_output_theano))
    h = output_gate * T.tanh(cell_state)
    return h, cell_state
def __init__(self, word_context, char_context, V, K, word_context_sz,
             char_context_sz, rng):
    """ Initialize the parameters of the language model """

    def as_shared(values, name):
        # Wrap freshly drawn values in a floatX shared variable.
        return theano.shared(
            value=np.asarray(values, dtype=theano.config.floatX),
            name=name, borrow=True)

    def uniform_embedding(name):
        # Embedding matrix of shape (V, K), uniform in [-0.01, 0.01).
        return as_shared(rng.uniform(-0.01, 0.01, size=(V, K)), name)

    def normal_parameter(shape, name):
        # Normal draw with mean 0 and standard deviation sqrt(0.1).
        return as_shared(rng.normal(0, math.sqrt(0.1), size=shape), name)

    # word and character training contexts
    self.word_context = word_context
    self.char_context = char_context

    # NOTE: parameters are drawn in this exact order so that a seeded
    # ``rng`` yields the same initial values as before.
    # context word embedding matrix Rw of shape (V, K)
    self.Rw = uniform_embedding('Rw')
    # context character embedding matrix Rc of shape (V, K)
    self.Rc = uniform_embedding('Rc')
    # target word embedding matrix Q of shape (V, K)
    self.Q = uniform_embedding('Q')
    # word weight tensor Cw of shape (word_context_sz, K, K)
    self.Cw = normal_parameter((word_context_sz, K, K), 'Cw')
    # character weight tensor Cc of shape (char_context_sz, K, K)
    self.Cc = normal_parameter((char_context_sz, K, K), 'Cc')
    # bias vector of shape (V,)
    self.b = normal_parameter((V,), 'b')

    # context word / character representations
    self.r_w = self.Rw[word_context]
    self.r_c = self.Rc[char_context]

    # predicted target-word representation from the word context and from
    # the character context
    self.qw_hat = T.tensordot(self.Cw, self.r_w, axes=[[0, 1], [1, 2]])
    self.qc_hat = T.tensordot(self.Cc, self.r_c, axes=[[0, 1], [1, 2]])
    # combine word and character predictions
    self.q_hat = self.qw_hat + self.qc_hat

    # similarity score between predicted word and all target words
    self.s = T.transpose(T.dot(self.Q, self.q_hat) + T.reshape(self.b, (V, 1)))
    # softmax activation function
    self.p_w_given_h = T.nnet.softmax(self.s)

    # parameters of the model
    self.params = [self.Rw, self.Rc, self.Q, self.Cw, self.Cc, self.b]
def get_output(self, train=False):
    """Return the activation of a gated tag/word similarity, one per column.

    The layer input is a pair ``[X_w, X_t]``; only ``X_w`` is used here.
    Channel 0 of its last axis indexes ``self.W_t`` and channel 1 indexes
    ``self.W_w``.  The dot product of the two embeddings is gated by the
    sigmoid of a second similarity computed through ``S``, ``P``, ``B`` and
    ``U``, then averaged over axis 0.
    """
    [X_w, X_t] = self.get_input(train)
    t_w = self.W_t[X_w[:,:, 0]] # doc_l, n_tags*n_samples, n_dim
    w_w = self.W_w[X_w[:,:, 1]]
    # Plain dot product of the two embeddings along the embedding axis.
    dot_tw = T.sum(w_w * t_w, axis=2)
    inter_1 = T.tensordot(w_w, self.S, axes = [[2],[2]])
    inter_2 = T.tensordot(t_w, self.P, axes = [[2],[2]]) # doc_l, n_tags*n_samples, 2,5
    inter = T.sum(inter_1 * inter_2, axis = 3)
    sim_tw = T.tensordot(inter + T.shape_padleft(self.B, 2), self.U, axes=[[2],[0]])
    # Collapse the result back to one score per (axis 0, axis 1) position.
    sim_tw = T.reshape(sim_tw, (X_w.shape[0], X_w.shape[1]))
    # Gate the dot products and average over axis 0.
    dot_sum_w = T.sum(dot_tw * T.nnet.sigmoid(sim_tw), axis = 0)/(X_w.shape[0])
    dot_w = theano.tensor.reshape(dot_sum_w, (X_w.shape[1], 1))
    return self.activation(dot_w)
# NOTE(review): the triple quote below opens a string literal whose closing
# quote is outside this view; it is kept exactly as found.
'''
def __init__(self, model, glm, latent):
    """ Initialize the filtered stim model """
    self.model = model
    self.bkgd_model = model["bkgd"]
    self.n = glm.n

    # Tuning-curve latent variable and the attributes re-exposed from it.
    tuning = latent[self.bkgd_model["tuningcurves"]]
    self.tuningcurves = tuning
    self.spatial_basis = tuning.spatial_basis
    self.tc_spatial_shape = tuning.spatial_shape
    self.tc_spatial_ndim = tuning.spatial_ndim
    self.temporal_basis = tuning.temporal_basis
    self.Bx = tuning.Bx
    self.Bt = tuning.Bt

    # Columns of the spatial / temporal weights selected by ``Y[n]``.
    column = tuning.Y[self.n]
    self.w_x = tuning.w_x[:, column]
    self.w_t = tuning.w_t[:, column]

    # Create a shared variable for the filtered stimulus. This is a 4D
    # tensor with dimensions:
    #   - time
    #   - location (pixel)
    #   - spatial basis
    #   - temporal basis
    # To get a stimulus current we need to select a location and take a
    # weighted sum along both the spatial and temporal axes.
    self.filtered_stim = theano.shared(name="stim",
                                       value=np.ones((1, 1, 1, 1)))

    locations = latent[self.bkgd_model["locations"]]
    self.locations = locations
    self.L = locations.Lmatrix[self.n, :]
    self.loc_index = locations.location_prior.ravel_index(self.L)

    # Expose outputs to the Glm class.
    # It matters that we do the dot products in order of outermost
    # to innermost dimension. This improves memory efficiency.

    # First contract the last axis (temporal basis, per the layout above)
    # with the temporal weights w_t.
    self.I_stim_t = T.tensordot(self.filtered_stim, self.w_t,
                                axes=[[3], [0]])
    self.I_stim_t.name = "I_stim_t"

    # Then contract axis 2 (spatial basis) with the spatial weights w_x.
    self.I_stim_xt = T.tensordot(self.I_stim_t, self.w_x,
                                 axes=[[2], [0]])
    self.I_stim_xt.name = "I_stim_xt"

    # Finally pick this neuron's location along axis 1.
    self.I_stim = self.I_stim_xt[:, self.loc_index]
    self.I_stim.name = "I_stim"

    # There are no latent variables in this class. They all belong
    # to global latent variables.
    self.log_p = T.constant(0.0)
def learningstep(self, Y, L, W, epsilon, threshold):
    """Compute the activation for ``Y`` and the updated weights.

    The new weights are
    ``W + epsilon * (s^T Y - sum(s, axis=0)[:, None] * W)``.  Both results
    are named after this layer's multilayer/layer indices.

    :return: ``(s, W_new)``.
    """
    s = self._activation(Y, L, W, threshold)
    s.name = 's_%d.%d[t]' % (self._nmultilayer, self._nlayer)

    correlation = T.tensordot(s, Y, axes=[0, 0])
    scaled_weights = T.sum(s, axis=0)[:, np.newaxis] * W
    W_new = W + epsilon * (correlation - scaled_weights)
    W_new.name = 'W_%d.%d[t]' % (self._nmultilayer, self._nlayer)

    return s, W_new
def gram_mat(vecs):
    """Theano gram matrix.

    ``vecs`` is flattened to three dimensions and contracted with itself
    over the last axis.
    """
    flattened = vecs.flatten(ndim = 3)
    return T.tensordot(flattened, flattened, axes=([2], [2]))
def forwardrankrel(self, x, y):
    """Forward function in the special case of relation ranking to avoid a
    broadcast problem.

    ``x`` is contracted with ``self.W``; the result is reshaped to a leading
    axis of length 1 and multiplied by ``y`` (reshaped to a trailing axis of
    length 1) before summing over axis 1.

    @TODO: think about a workaround.
    """
    projected = T.tensordot(x, self.W, axes=([1], [0]))
    # Explicit reshapes (rather than broadcasting helpers) are kept on
    # purpose -- see the docstring.
    projected = projected.reshape((1, projected.shape[1], projected.shape[2]))
    y_column = y.reshape((y.shape[0], y.shape[1], 1))
    xWy = (y_column * projected).sum(1)
    return self.act(xWy + self.b)
def make_consensus(self, networks, axis=2):
    """Combine ``networks`` along ``axis`` using ``self.attrs['consensus']``.

    Supported methods: ``'max'``, ``'min'``, ``'mean'``, ``'sum'``,
    ``'prod'``, ``'var'`` (reductions over ``axis``), ``'flat'`` (flatten
    to 3 dimensions when ``axis == 2``, else to 2; identity when
    ``self.depth == 1``), ``'project'`` (contract with a new random
    parameter) and ``'random'`` (pick a random index along ``axis``).

    :raises AssertionError: for an unknown method, or for ``'random'`` with
        ``axis`` outside 0..3.  Raised explicitly so that the checks are
        not stripped when Python runs with ``-O``.
    """
    cns = self.attrs['consensus']

    if cns in ('max', 'min', 'mean', 'prod', 'var'):
        # The method names coincide with the Theano reduction names.
        return getattr(T, cns)(networks, axis=axis)
    if cns == 'sum':
        return T.sum(networks, axis=axis, acc_dtype=theano.config.floatX)
    if cns == 'flat':
        if self.depth == 1:
            return networks
        return networks.flatten(ndim=3 if axis == 2 else 2)
    if cns == 'project':
        p = self.add_param(self.create_random_uniform_weights(
            self.attrs['n_out'], 1, self.attrs['n_out'] + self.depth + 1))
        return T.tensordot(p, networks, [[1], [axis]])
    if cns == 'random':
        # NOTE(review): if ``random_integers`` includes ``high`` (as
        # numpy's does), ``idx`` can equal ``self.depth`` -- confirm that
        # this cannot index past the end of ``axis``.
        idx = self.rng.random_integers(size=(1,), low=0, high=self.depth)
        if axis == 0:
            return networks[idx]
        if axis == 1:
            return networks[:, idx]
        if axis == 2:
            return networks[:, :, idx]
        if axis == 3:
            return networks[:, :, :, idx]
        raise AssertionError("axis too large")

    raise AssertionError("consensus method unknown: " + cns)
def get_output_for(self, input, init=False, **kwargs):
    """Append features built from pairwise distances within the batch.

    The (flattened) input is projected through ``self.W``; absolute
    differences between every pair of samples are summed over axis 2, and
    ``exp(-distance)`` is summed over the other samples.  A large constant
    on the diagonal removes each sample's distance to itself.

    :param init: when true, normalise the distances and the features from
        the current batch and record matching updates for
        ``self.log_weight_scale`` and ``self.b`` in ``self.init_updates``;
        otherwise ``self.b`` is added to the features.
    :return: ``input`` concatenated with the new features along axis 1.
    """
    if input.ndim > 2:
        # if the input has more than two dimensions, flatten it into a
        # batch of feature vectors.
        input = input.flatten(2)

    activation = T.tensordot(input, self.W, [[1], [0]])
    pairwise = abs(activation.dimshuffle(0, 1, 2, 'x')
                   - activation.dimshuffle('x', 1, 2, 0))
    diagonal_offset = 1e6 * T.eye(input.shape[0]).dimshuffle(0, 'x', 1)
    abs_dif = T.sum(pairwise, axis=2) + diagonal_offset

    if init:
        mean_min_abs_dif = 0.5 * T.mean(T.min(abs_dif, axis=2), axis=0)
        abs_dif = abs_dif / mean_min_abs_dif.dimshuffle('x', 0, 'x')
        self.init_updates = [
            (self.log_weight_scale,
             self.log_weight_scale
             - T.log(mean_min_abs_dif).dimshuffle(0, 'x'))]

    f = T.sum(T.exp(-abs_dif), axis=2)

    if init:
        mf = T.mean(f, axis=0)
        f = f - mf.dimshuffle('x', 0)
        self.init_updates.append((self.b, -mf))
    else:
        f = f + self.b.dimshuffle('x', 0)

    return T.concatenate([input, f], axis=1)
def __init__(self, rng, input, n_in, n_in2, n_out, activation, W=None,
             b=None, use_bias=False):
    """Build a layer that contracts a 4-d input with a 4-d weight tensor.

    :param rng: random generator providing ``standard_normal``; used only
        when ``W`` is not supplied.
    :param input: symbolic input; its axes 1, 2 and 3 are contracted with
        axes 1, 2 and 3 of ``W``.
    :param n_in, n_in2, n_out: sizes used for the default ``W`` of shape
        ``(n_out, n_in, 1, n_in2)`` and the default ``b`` of shape
        ``(n_out,)``.
    :param activation: callable applied to the linear output, or ``None``.
    :param W, b: optional pre-built shared variables.
    :param use_bias: add ``b`` to the output and expose it in ``params``.
    """
    self.input = input
    self.activation = activation

    if W is None:
        W_values = np.asarray(
            0.01 * rng.standard_normal(size=(n_out, n_in, 1, n_in2)),
            dtype=theano.config.floatX)
        W = theano.shared(value=W_values, name='W')
    if b is None:
        b_values = np.zeros((n_out,), dtype=theano.config.floatX)
        b = theano.shared(value=b_values, name='b')
    self.W = W
    self.b = b

    # Both variants use the same contraction.  The biased variant used to
    # call ``T.dot`` on the 4-d weight, which contracts different axes than
    # the bias-free variant.
    lin_output = T.tensordot(input, self.W, axes=[[1, 2, 3], [1, 2, 3]])
    if use_bias:
        lin_output = lin_output + self.b

    self.output = (lin_output if activation is None
                   else activation(lin_output))

    # parameters of the model
    self.params = [self.W, self.b] if use_bias else [self.W]
def _step(self, xg_t, xo_t, xc_t, mask_tm1, h_tm1, c_tm1, u_g, u_o, u_c):
    """One recurrent step that mixes six candidate cell values with a gate.

    The candidates are the masked previous cell, the proposed cell
    ``c_tilda`` and four kernel values (``k_se``, ``k_per``, ``k_lin``,
    ``k_rq``) computed between those two from ``self.k_parameters``.  A
    softmax gate over the last axis weights the candidates.

    :return: ``(h_t, c_t)``.
    """
    h_prev = mask_tm1 * h_tm1
    c_prev = mask_tm1 * c_tm1

    act = T.tensordot(xg_t + h_prev, u_g, [[1], [2]])
    gate = T.nnet.softmax(act.reshape((-1, act.shape[-1]))).reshape(act.shape)
    c_tilda = self.activation(xc_t + T.dot(h_prev, u_c))

    # Kernel hyper-parameters, in their storage order.
    (sigma_se, sigma_per, sigma_b_lin, sigma_v_lin, sigma_rq,
     l_se, l_per, l_lin, l_rq, alpha_rq, p_per) = [
        self.k_parameters[index] for index in range(11)]

    delta = c_prev - c_tilda
    k_se = T.pow(sigma_se, 2) * T.exp(
        -T.pow(delta, 2) / (2 * T.pow(l_se, 2) + self.EPS))
    k_per = T.pow(sigma_per, 2) * T.exp(
        -2 * T.pow(T.sin(math.pi * delta / (p_per + self.EPS)), 2)
        / (T.pow(l_per, 2) + self.EPS))
    k_lin = (T.pow(sigma_b_lin, 2)
             + T.pow(sigma_v_lin, 2) * (c_prev - l_lin) * (c_tilda - l_lin))
    k_rq = T.pow(sigma_rq, 2) * T.pow(
        1 + T.pow(delta, 2) / (2 * alpha_rq * T.pow(l_rq, 2) + self.EPS),
        -alpha_rq)

    # Stack the candidates and move the candidate axis last so that it
    # lines up with the gate's last axis.
    candidates = [c_prev, c_tilda, k_se, k_per, k_lin, k_rq]
    yshuff = T.as_tensor_variable(candidates, name='yshuff').dimshuffle(1, 2, 0)

    gate_rows = gate.reshape((-1, gate.shape[-1]))
    candidate_rows = yshuff.reshape((-1, yshuff.shape[-1]))
    c_t = (gate_rows * candidate_rows).sum(axis = 1).reshape(gate.shape[:2])

    o_t = self.inner_activation(xo_t + T.dot(h_prev, u_o))
    h_t = o_t * self.activation(c_t)
    return h_t, c_t
def next_state_fn(self, a, last_state, U, u):
    """Return the next state: ``U[a]`` contracted with ``last_state``
    over their leading axes, plus ``u[a]``.
    """
    transition, offset = U[a], u[a]
    return T.tensordot(transition, last_state, [[0], [0]]) + offset
def fprop(self, state_below):
    """Forward-propagate ``state_below`` through the batched softmax layer.

    The input is validated (and reformatted to ``self.desired_space`` when
    ``self.needs_reformat`` is set), contracted with the 3-d weight tensor
    ``self.W``, offset by ``self.b`` and passed to ``batched_softmax``.

    :raises ValueError: if a debug value of ``state_below`` does not have
        the batch size declared on ``self.mlp``.
    """
    self.input_space.validate(state_below)

    if self.needs_reformat:
        state_below = self.input_space.format_as(state_below,
                                                 self.desired_space)

    expected_batch = self.mlp.batch_size
    for value in get_debug_values(state_below):
        if expected_batch is not None and value.shape[0] != expected_batch:
            # The message used to read ``self.dbm.batch_size``; every other
            # reference in this method goes through ``self.mlp``.
            raise ValueError("state_below should have batch size "
                             + str(self.mlp.batch_size)
                             + " but has " + str(value.shape[0]))

    self.desired_space.validate(state_below)
    assert state_below.ndim == 2
    assert self.W.ndim == 3

    Z = T.tensordot(state_below, self.W, axes=[[1], [0]]) + self.b
    rval = batched_softmax(Z)

    for value in get_debug_values(rval):
        if self.mlp.batch_size is not None:
            assert value.shape[0] == self.mlp.batch_size

    return rval
def get_output(self, train=False):
    """Scan ``self._step`` over the projected input and return its outputs.

    :param train: forwarded to ``self.get_input`` so that upstream layers
        see the train/test flag (it used to be dropped).
    :return: depending on ``self.return_mode``: the two scan outputs
        concatenated on the last axis (``'both'``), or the first
        (``'states'``) or second (``'causes'``) output.  For the latter two,
        ``self.return_sequences`` selects between the whole sequence
        (dimshuffled to ``(1, 0, 2)``) and the last step.
    :raises ValueError: for any other ``self.return_mode``.
    """
    X = self.get_input(train)
    Wx = T.tensordot(X, self.W, axes=(2, 0)).dimshuffle(1, 0, 2, 3)

    s_init = T.zeros((X.shape[0], self.output_dim))
    # Start from a uniform distribution over the causes.
    u_init = T.ones((X.shape[0], self.causes_dim)) / self.causes_dim

    outputs, _ = scan(
        self._step,
        sequences=[Wx],
        outputs_info=[s_init, u_init],
        non_sequences=[self.b] + self.hid2output.params,
        truncate_gradient=self.truncate_gradient)

    if self.return_mode == 'both':
        return T.concatenate([outputs[0], outputs[1]], axis=-1)
    elif self.return_mode == 'states':
        out = outputs[0]
    elif self.return_mode == 'causes':
        out = outputs[1]
    else:
        raise ValueError("return_model {0} not valid. Choose "
                         "'both', 'states' or 'causes'".format(
                             self.return_mode))

    if self.return_sequences:
        return out.dimshuffle(1, 0, 2)
    return out[-1]
def getScores(self, args1, args2, l, n, relationProbs, neg1, neg2, entropy):
    """Return all log-scores of a batch as one concatenated tensor.

    The result concatenates, in order: the log-sigmoid scores of the
    observed argument pairs (once with each argument's bias), ``entropy``
    twice, and the flattened log-sigmoid scores of the negated
    negative-sample scores.
    """
    argembed1 = self.A[args1]
    argembed2 = self.A[args2]
    weightedC = T.tensordot(relationProbs, self.C, axes=[[1], [2]])

    # Observed pairs.
    one = self.factorization(batchSize=l,
                             argsEmbA=argembed1,
                             argsEmbB=argembed2,
                             wC=weightedC)  # [l,n]
    u = T.concatenate([one + self.Ab[args1], one + self.Ab[args2]])
    positive_scores = T.concatenate(
        [T.log(T.nnet.sigmoid(u)), entropy, entropy])

    # Negative samples: replace the first, then the second argument.
    negembed1 = self.A[neg1.flatten()].reshape((n, l, self.k))
    negembed2 = self.A[neg2.flatten()].reshape((n, l, self.k))
    negOne = self.negFactorization1(batchSize=l,
                                    negEmbA=negembed1,
                                    argsEmbB=argembed2,
                                    wC=weightedC)
    negTwo = self.negFactorization2(batchSize=l,
                                    argsEmbA=argembed1,
                                    negEmbB=negembed2,
                                    wC=weightedC)
    g = T.concatenate([negOne + self.Ab[neg1].dimshuffle(1, 0),
                       negTwo + self.Ab[neg2].dimshuffle(1, 0)])
    negative_scores = T.log(T.nnet.sigmoid(-g))

    return T.concatenate([positive_scores, negative_scores.flatten()])
def hessian(objective, argument):
    """
    Compute the directional derivative of the gradient (which is equal to
    the hessian multiplied by direction).

    Returns a compiled function taking ``(argument, direction)``.
    """
    gradient = T.grad(objective, argument)

    # The direction is a new tensor with the same type (i.e. same
    # dimensionality) as argument.
    direction = argument.type()

    try:
        # First attempt efficient 'R-op', this directly calculates the
        # directional derivative of the gradient, rather than explicitly
        # calculating the hessian and then multiplying.
        product = T.Rop(gradient, argument, direction)
    except NotImplementedError:
        # Fall back to the explicit hessian, reshaped to shape + shape.
        shape = T.shape(argument)
        full_hessian = T.jacobian(gradient.flatten(), argument).reshape(
            T.concatenate([shape, shape]), 2*direction.ndim)
        product = T.tensordot(full_hessian, direction, direction.ndim)

    def compile_with(unused_policy):
        return theano.function([argument, direction], product,
                               on_unused_input=unused_policy)

    try:
        return compile_with('raise')
    except theano.compile.UnusedInputError:
        warn('Theano detected unused input - suggests hessian may be zero or '
             'constant.')
        return compile_with('ignore')
def apply(self, input0_, input1_):
    """Return the sum of a tensor contraction, a linear map and a bias.

    ``input0_`` is contracted with ``W`` (its axes 1 and 2 against axes 0
    and 1 of ``W``), ``input1_`` goes through ``W_Linear``, and ``b`` is
    added.  The three parameters are read from ``self.parameters``.
    """
    W, b, W_Linear = self.parameters
    contracted = T.tensordot(input0_, W, axes=[[1, 2], [0, 1]])
    linear = T.dot(input1_, W_Linear)
    return contracted + linear + b
def H(q, p):
    """Return ``0.5 * <p, Ker(q, q) p> + met(q, p)``.

    The inner product contracts both axes of ``p`` with the kernel-weighted
    ``p``.
    """
    kernel_times_p = T.tensordot(Ker(q, q), p, [[1], [0]])
    quadratic = 0.5 * T.tensordot(p, kernel_times_p, [[0, 1], [0, 1]])
    return quadratic + met(q, p)
def sym_masked_neg_loglikelihood_gradient(self, x, mask):
    """Build the loss (negative mean log-density) and its gradients.

    x is a matrix of column datapoints (DxB), D = n_visible, B = batch size.

    :return: ``(loss, gradients)``; ``gradients`` is a dict keyed by
        parameter name.  The hidden-stack entries ``"Ws"`` and ``"bs"`` are
        included only when ``self.n_layers > 1``.
    """
    intermediates = self.sym_mask_logdensity_estimator_intermediate(x, mask)
    logdensity, z_alpha, z_mu, z_sigma, Alpha, Mu, Sigma, h = intermediates

    loglikelihood = logdensity.mean(dtype=floatX)
    loss = -loglikelihood

    gradients = {}
    # Output heads: (name suffix, pre-activation, optional rescaling).
    heads = (
        ("alpha", z_alpha, None),
        ("mu", z_mu, Sigma),  # Heuristic: mu gradient is scaled by Sigma
        ("sigma", z_sigma, None),
    )
    for suffix, pre_activation, rescale in heads:
        dp_dz = T.grad(loss, pre_activation)  # BxDxC
        if rescale is not None:
            dp_dz = dp_dz * rescale
        gradients["V_" + suffix] = T.tensordot(
            h.T, dp_dz, [[1], [0]]).dimshuffle((1, 0, 2))  # DxHxC
        gradients["b_" + suffix] = dp_dz.sum(0)  # DxC

    # Remaining parameters, differentiated in one call.
    if self.n_layers > 1:
        names = ["Ws", "bs", "W1", "Wflags", "b1"]
        wrt = [self.Ws, self.bs, self.W1, self.Wflags, self.b1]
    else:
        names = ["W1", "Wflags", "b1"]
        wrt = [self.W1, self.Wflags, self.b1]
    by_name = dict(zip(names, T.grad(loss, wrt)))
    for key in ("Ws", "bs", "W1", "b1", "Wflags"):
        if key in by_name:
            gradients[key] = by_name[key]

    return (loss, gradients)
def conv1d_sd(input, filters, image_shape, filter_shape,
              border_mode='valid', subsample=(1,)):
    """1D convolution computed with a single dot product.

    Parameters
    ----------
    input : symbolic tensor indexed as (batch, channel, length)
    filters : symbolic tensor indexed as (filter, channel, length)
    image_shape : tuple ``(batch_size, num_input_channels, input_length)``
    filter_shape : tuple ``(num_filters, num_input_channels, filter_length)``
    border_mode : str
        Only ``'valid'`` is supported at the moment.
    subsample : 1-tuple
        The stride; ``filter_length`` must be a multiple of it.

    Returns
    -------
    Symbolic tensor of shape ``(batch, num_filters, output_length)``.

    Raises
    ------
    RuntimeError
        If ``border_mode`` is not ``'valid'`` or the filter length is not a
        multiple of the stride.
    """
    if border_mode != 'valid':
        log.error("Unsupported border_mode for conv1d_sd: "
                  "%s" % border_mode)
        raise RuntimeError("Unsupported border_mode for conv1d_sd: "
                           "%s" % border_mode)

    batch_size, num_input_channels, input_length = image_shape
    num_filters, num_input_channels_, filter_length = filter_shape
    stride = subsample[0]

    if filter_length % stride > 0:
        raise RuntimeError("Filter length (%d) is not a multiple of the "
                           "stride (%d)" % (filter_length, stride))

    num_steps = filter_length // stride
    output_length = (input_length - filter_length + stride) // stride

    # Pad the input so all the shifted dot products fit inside.
    # Shape is (b, c, l).
    padded_length = ((input_length // filter_length) * filter_length +
                     (num_steps - 1) * stride)

    # The padded length may be SMALLER than the input length, in which case
    # the input has to be truncated first.
    truncated_length = min(input_length, padded_length)
    input_truncated = input[:, :, :truncated_length]

    input_padded_shape = (batch_size, num_input_channels, padded_length)
    input_padded = T.zeros(input_padded_shape)
    input_padded = T.set_subtensor(input_padded[:, :, :truncated_length],
                                   input_truncated)

    inputs = []
    for num in range(num_steps):
        shift = num * stride
        length = (padded_length - shift) // filter_length

        r_input_shape = (batch_size, num_input_channels, length,
                         filter_length)
        r_input = input_padded[
            :, :, shift:length * filter_length + shift].reshape(r_input_shape)

        inputs.append(r_input)

    inputs_stacked = T.stack(*inputs)  # shape is (n, b, c, w, f)
    filters_flipped = filters[:, :, ::-1]

    # Contract the channel and filter-position axes.  The axes must be plain
    # integers: a floating-point axes array ends up as float indices in the
    # internal dimshuffle, which only accepts integer positions.
    r_conved = T.tensordot(inputs_stacked, filters_flipped,
                           [[2, 4], [1, 2]])
    # resulting shape is (n, b, w, n_filters)
    # output needs to be (b, n_filters, w * n)
    r_conved = r_conved.dimshuffle(1, 3, 2, 0)  # (b, n_filters, w, n)
    conved = r_conved.reshape((r_conved.shape[0], r_conved.shape[1],
                               r_conved.shape[2] * r_conved.shape[3]))
    # result is (b, n_f, l)

    # remove padding
    return conved[:, :, :output_length]
# Shared copies of the filtered data with axes reordered to (2, 0, 1), and
# the first label column cast to int32.
x_train_filt_T = theano.shared(x_train_filt.transpose(2, 0, 1))
x_test_filt_T = theano.shared(x_test_filt.transpose(2, 0, 1))
y_train_T = T.cast(theano.shared(y_train[:, 0]), 'int32')
y_test_T = T.cast(theano.shared(y_test[:, 0]), 'int32')

# The learning rate is a symbolic scalar so it can be supplied per call.
lr = T.scalar('lr')

# Floor division keeps the batch size an integer on Python 3 as well
# (true division would yield a float there).
batch_size = y_train.size // 4
epochs = 2500

index = T.lscalar('index')
y = T.ivector('y')
X = T.tensor3('X')

# Trainable projection (W) and averaging (V) weights; both come from
# earlier in the script.
csp_w = theano.shared(W)
avg_v = theano.shared(V)

# Project axis 2 of X through csp_w, square, contract axis 1 with avg_v,
# and take the log of the result.
proj_csp = T.tensordot(X, csp_w, axes=[2, 0])
layer0_out = T.pow(proj_csp, 2)
variance = T.tensordot(layer0_out, avg_v, axes=[1, 0])
layer1_out = T.log((variance))[:, :, 0]

layer2 = LogisticRegression(input=layer1_out, n_in=5, n_out=2)

# Negative log-likelihood, plus an L2 penalty on avg_v, plus a term that is
# non-zero only when the smallest avg_v entry is negative (sgn - 1 is 0 for
# positive values).
cost = layer2.negative_log_likelihood(y) + .01 * T.sum(T.pow(
    avg_v, 2)) - 1000 * (T.sgn(T.min(avg_v)) - 1) * T.pow(T.min(avg_v), 2)

params = [csp_w, avg_v] + layer2.params
grads = T.grad(cost, params)

# Plain gradient-descent updates.
updates = []
for param_i, grad_i in zip(params, grads):
    updates.append((param_i, param_i - lr * grad_i))
from itertools import product
from warnings import warn
from time import time

# =================== Theano expressions and functions ===================

# ----- model space -----
# rf_stack_tnsr:    G x stim_size x stim_size
# feature_map_tnsr: T x D x stim_size x stim_size
rf_stack_tnsr = tnsr.tensor3('rf_stack_tnsr')
feature_map_tnsr = tnsr.tensor4('feature_map_tnsr')

# Contract the two spatial axes of the rf stack with the two spatial axes of
# the feature maps.
apply_rf_to_feature_maps = function(
    [rf_stack_tnsr, feature_map_tnsr],
    tnsr.tensordot(rf_stack_tnsr, feature_map_tnsr,
                   axes=[[1, 2], [2, 3]]))

# Example use:
#   model_space = apply_rf_to_feature_maps(rf_stack, feature_maps)

# ----- prediction menu -----
# batched_tensordot is used here; the original author reports a memory
# error when the plain tensordot is used instead.
# model_space_tnsr:     G x T x D
# feature_weight_tnsr:  G x D x V
# prediction_menu_tnsr: G x T x V
model_space_tnsr = tnsr.tensor3('X')
feature_weight_tnsr = tnsr.tensor3('NU')
prediction_menu_tnsr = tnsr.batched_tensordot(model_space_tnsr,
                                              feature_weight_tnsr,
                                              axes=[[2], [1]])

bigmult = function(inputs=[model_space_tnsr, feature_weight_tnsr],
                   outputs=prediction_menu_tnsr)
def discriminative_free_energy(self, input=None):
    """Build the per-output free-energy terms of the discriminative model.

    Parameters
    ----------
    input : list of tensors, optional
        Visible inputs to use instead of ``self.input`` (for example the end
        of a Gibbs chain).

    Returns
    -------
    list
        One tensor per entry of ``self.cbias``: that output's utility
        (summed class bias plus the ``B``-weighted inputs) plus the
        softplus of the hidden pre-activation summed over the hidden axis.

    Notes
    -----
    The hidden pre-activation accumulates ``hbias``, the input-to-hidden
    products and the hidden-to-output weights on a (rows, hiddens, outs)
    layout.  Parameters used: ``V_params``, ``U_params``, ``B_params``,
    ``cbias`` and ``hbias``.
    """
    visibles = self.input if input is None else input

    hbias = self.hbias[0]
    cbiases = self.cbias
    vbias = self.vbias  # read for parity with the other accessors; unused
    xWh_params = self.V_params
    hWy_params = self.U_params  # (items, outs, hiddens)
    B_params = self.B_params

    # Hidden biases broadcast as ('x', hiddens, 'x').
    wx_b = hbias.dimshuffle('x', 0, 'x')

    # One utility term per output: class biases summed over axis 0 and
    # broadcast as ('x', outs).
    utility = [T.sum(cbias, axis=0).dimshuffle('x', 0) for cbias in cbiases]

    for x, xWh, B in zip(visibles, xWh_params, B_params):
        # Input-to-hidden product: a matrix weight uses a plain dot, a
        # higher-rank weight contracts the two trailing input axes.
        if xWh.ndim == 2:
            xw = T.dot(x, xWh)
        else:
            xw = T.tensordot(x, xWh, axes=[[1, 2], [0, 1]])
        wx_b += xw.dimshuffle(0, 1, 'x')  # (rows, hiddens, 'x')

        for i, hWy in enumerate(hWy_params):
            # Hidden-to-output weights summed over items and laid out as
            # ('x', hiddens, outs).
            wx_b += T.sum(hWy, axis=0).dimshuffle('x', 1, 0)
            # utility[i] = cbias + Bx : (rows, outs)
            utility[i] += T.tensordot(x, T.sum(B, axis=-2),
                                      axes=[[1, 2], [0, 1]])

    # sum_k ln(1 + exp(wx_b)) : (rows, hiddens, outs) --> (rows, outs)
    entropy = T.sum(T.log(1 + T.exp(wx_b)), axis=1)

    # Add the softplus term to every utility term.
    return [u + entropy for u in utility]
def _inference(self, Y, W):
    """Return the contraction of ``Y`` with the column-normalised ``W``.

    Entries of ``W`` equal to zero stay zero; all others are divided by the
    sum of ``W`` over axis 0.  Axis 1 of ``Y`` is then contracted with
    axis 1 of the normalised weights.
    NOTE(review): the result is a score tensor, presumably consumed by the
    caller to pick a class label -- confirm.
    """
    normalised = W / T.sum(W, axis=0)
    weights = T.switch(T.eq(W, 0), 0, normalised)
    return T.tensordot(Y, weights, axes=[1, 1])
def tensordot(self, a, b):
    """Contract axis 0 of ``a`` with axis 0 of ``b`` via ``tt.tensordot``."""
    contracted = tt.tensordot(a, b, axes=(0, 0))
    return contracted
def gramMatrix(x):
    """Flatten ``x`` to three dimensions and contract its last axis with itself."""
    flat = x.flatten(ndim=3)
    return T.tensordot(flat, flat, axes=([2], [2]))
def __init__(self, model, algo='fisher', c_lambd_inv=1e-3, rate=1.05, over_sampling=1, rescale='momentum'):
    r"""Build the symbolic graph and compile the training functions.

    Args:
        model: model exposing ``x``, ``a``, ``params`` and ``layers``.
        algo: which curvature matrix to build (see below).
        c_lambd_inv: Start value of the \lambda regularizer (used in matrix
            inversion and in the F*v computation).
        rate: Change per iteration for \lambda.
        over_sampling: For Fisher-like methods, use multiple random vectors
            per one sample from the dataset.
        rescale: Can be either False, True or 'momentum'.

    Implemented algos:
        'gn' - Gauss-Newton matrix,
        'fisher' - Fisher matrix,
        'kr' - Khatri-Rao matrix,
        'kr_diag' - block-diagonal KR matrix.
    """
    self.model = model
    self.algo = algo

    self.x = self.model.x
    self.y = T.ivector('y')
    self.outc = T.matrix('outc')

    # Shared buffers substituted for the symbolic inputs through `givens`
    # (the original author notes this is due to theano bugs).
    self.x_d = shared_empty(2)
    self.y_d = shared_empty(1, dtype='int32')
    self.outc_d = shared_empty(2)
    self.rand_outc_d = shared_empty(3)

    self.rand_outc = T.tensor3('rand_outc')

    self.lambd_inv = T.scalar('lambd_inv')
    self.c_lambd_inv = c_lambd_inv
    self.rate = rate
    self.over_sampling = over_sampling
    self.rescale = rescale

    # -- target def --
    # Losses against randomly perturbed targets, accumulated over the
    # over-sampling draws.
    self.f_loss = 0
    self.f_loss_samples = 0
    for i in range(self.over_sampling):
        self.f_loss += get_loss(self.model.a, self.rand_outc[i] + my_consider_constant(self.model.a)) * scalar_floatX(self.model.a.shape[0])
        self.f_loss_samples += get_loss_samples(self.model.a, self.rand_outc[i] + my_consider_constant(self.model.a))

    self.loss = get_loss(self.model.a, self.outc)
    self.err = get_error(get_pred(self.model.a), self.y)

    self.updates = OrderedDict()

    self.grad = sum(([T.grad(self.loss, p)] for p in self.model.params), [])
    self.grad_vec = T.concatenate([g.flatten() for g in self.grad])

    def get_fisher_mat():
        """Return the sample-averaged outer product of per-sample gradients."""
        grad2d = []
        for p in self.model.params:
            grad2d += [T.jacobian(self.f_loss_samples, p)]
            if grad2d[-1].ndim == 2:
                grad2d[-1] = grad2d[-1].dimshuffle(0, 1, 'x')
        grad2d_vec = T.concatenate([g.flatten(2).T for g in grad2d]).T
        # Single matrix product in place of a batched outer product + mean.
        F = T.dot(grad2d_vec.T, grad2d_vec)/T.cast(grad2d_vec.shape[0], theano.config.floatX)/self.over_sampling
        return F

    if self.algo == 'fisher':
        self.grad2d = []
        for p in self.model.params:
            self.grad2d += [T.jacobian(self.f_loss_samples, p)]
            if self.grad2d[-1].ndim == 2:
                self.grad2d[-1] = self.grad2d[-1].dimshuffle(0, 1, 'x')
        self.grad2d_vec = T.concatenate([g.flatten(2).T for g in self.grad2d]).T
        self.F = T.dot(self.grad2d_vec.T, self.grad2d_vec)/T.cast(self.grad2d_vec.shape[0], theano.config.floatX)/self.over_sampling
    elif self.algo == 'gn':
        self.grad2d = []
        for p in self.model.params:
            self.grad2d += [T.jacobian(self.model.a.flatten(), p)]
            new_shape = (self.model.a.shape[0], self.model.a.shape[1], -1)
            self.grad2d[-1] = self.grad2d[-1].reshape(new_shape)
        self.grad2d_vec = T.concatenate([g.flatten(3) for g in self.grad2d], 2)
        # Contract the sample and output axes in one tensordot.
        self.F = T.tensordot(self.grad2d_vec.dimshuffle(0, 2, 1),
                             self.grad2d_vec.dimshuffle(0, 1, 2),
                             [(0, 2), (0, 1)])/T.cast(self.grad2d_vec.shape[0], theano.config.floatX)
    elif self.algo.startswith('kr'):
        self.grads = []
        self.acts = [self.model.x]
        for l in self.model.layers:
            cg = T.grad(self.f_loss, l.s)
            self.grads.append(cg)
            self.acts.append(l.a)

        self.G = []
        self.A = []
        self.F_block = []
        self.F = []

        cnt = T.cast(self.grads[0].shape[0], theano.config.floatX)
        for i in range(len(self.grads)):
            self.G += [[]]
            self.A += [[]]
            for j in range(len(self.grads)):
                self.G[-1] += [self.grads[i].T.dot(self.grads[j]).dimshuffle('x', 0, 1)/cnt]
                self.A[-1] += [self.acts[i].T.dot(self.acts[j]).dimshuffle('x', 0, 1)/cnt]

                # The '*diag' variants zero every off-diagonal block.
                if self.algo.endswith('diag'):
                    self.G[-1][-1] *= float(i==j)
                    self.A[-1][-1] *= float(i==j)

        for i in range(len(self.grads)):
            self.F_block += [[]]
            for j in range(len(self.grads)):
                # native_kron gives the Kronecker-factored block; the author
                # notes a numpy-based fast_kron is an alternative.
                cblock = native_kron(self.A[i][j], self.G[i][j])
                cblock = cblock.reshape(cblock.shape[1:], ndim=2)
                self.F_block[i] += [cblock]
            self.F.append(T.concatenate(self.F_block[-1], axis=1))
        self.F = T.concatenate(self.F, axis=0)
        # NOTE(review): symmetrisation assumed to belong to this branch only;
        # verify against the upstream layout.
        self.F = (self.F+self.F.T)/2

    self.Fdamp = self.F+T.identity_like(self.F)*self.lambd_inv

    # There are several ways of computing F^-1*v in theano; solve_sym_pos
    # was chosen by the author as neutral in performance, and it throws an
    # exception if the provided matrix is singular.
    self.new_grad_vec = solve_sym_pos(self.Fdamp, self.grad_vec)

    pcount = sum(p.get_value().size for p in self.model.params)
    self.ch_history = theano.shared(np.zeros((pcount,), dtype=theano.config.floatX))

    if self.rescale == 'momentum':
        self.real_fish = get_fisher_mat() + T.identity_like(self.F)*self.lambd_inv

        FT = self.real_fish.dot(self.new_grad_vec)
        FM = self.real_fish.dot(self.ch_history)

        TFT = self.new_grad_vec.T.dot(FT)
        MFT = self.ch_history.T.dot(FT)
        MFM = self.ch_history.T.dot(FM)

        GT = self.grad_vec.T.dot(self.new_grad_vec)
        GM = self.grad_vec.T.dot(self.ch_history)

        tmp1 = T.stack([TFT.reshape(()), MFT.reshape(())], 0).dimshuffle('x', 0)
        tmp2 = T.stack([MFT.reshape(()), MFM.reshape(())], 0).dimshuffle('x', 0)

        # Solve the 2x2 system for the step (alpha) and history (beta)
        # coefficients with a pseudo-inverse.
        A = T.concatenate([tmp1, tmp2], 0)
        A_pinv = T.nlinalg.MatrixPinv()(A)
        b = T.stack([GT.reshape(()), GM.reshape(())], 0).dimshuffle(0, 'x')

        res = A_pinv.dot(b).flatten()

        alpha = res[0]
        beta = res[1]

        self.new_grad_vec = self.new_grad_vec * alpha.reshape(()) + self.ch_history * beta.reshape(())
        self.F = self.real_fish
        self.updates[self.ch_history] = self.new_grad_vec
    elif self.rescale:
        self.real_fish = get_fisher_mat() + T.identity_like(self.F)*self.lambd_inv

        lin_fac = self.grad_vec.T.dot(self.new_grad_vec)
        quad_fac = self.new_grad_vec.T.dot(self.real_fish.dot(self.new_grad_vec))

        alpha = lin_fac/quad_fac
        beta = 0 * alpha

        self.new_grad_vec *= alpha.reshape(())
        self.F = self.real_fish
    else:
        # No rescaling: the step is used as is.  alpha/beta must still exist
        # because they are reported through print_pls below.
        alpha = T.as_tensor_variable(1.0)
        beta = T.as_tensor_variable(0.0)

    def _apply_gradient_vec(params, new_grad_vec, updates):
        """Slice the flat step per parameter and register `p - step` updates."""
        new_grad = []
        offset = 0
        for p in params:
            pval = p.get_value()
            new_grad += [new_grad_vec[offset:offset+pval.size].reshape(pval.shape)]
            offset += pval.size
            updates[p] = p - new_grad[-1]
        return new_grad

    self.new_grad = _apply_gradient_vec(self.model.params, self.new_grad_vec, self.updates)

    self.get_params = theano.function(
        inputs=[],
        outputs=self.model.params,
        on_unused_input='warn'
    )

    self.quad_est_loss = self.new_grad_vec.T.dot(self.F.dot(self.new_grad_vec))/2
    self.est_loss = self.quad_est_loss + self.grad_vec.dot(self.new_grad_vec)

    # Extra diagnostics returned by `train` after est_loss, loss and err.
    self.print_pls = {}
    self.print_pls.update({'shape': self.F.shape[0], 'rank': rank(self.F*10000)})
    self.print_pls.update({'grad_mean': T.mean(self.grad_vec**2)**0.5})
    self.print_pls.update({'alpha': alpha, 'beta': beta})

    self.train = theano.function(
        inputs=[self.lambd_inv],
        outputs=[self.est_loss, self.loss, self.err] + list(self.print_pls.values()),
        updates=self.updates,
        givens={
            self.x: self.x_d,
            self.y: self.y_d,
            self.outc: self.outc_d,
            self.rand_outc: self.rand_outc_d
        },
        on_unused_input='warn',
        allow_input_downcast=True,
    )

    self.eva = theano.function(
        inputs=[],
        outputs=[self.loss, self.err],
        givens={
            self.x: self.x_d,
            self.y: self.y_d,
            self.outc: self.outc_d
        },
        on_unused_input='warn',
        allow_input_downcast=True
    )

def step(self, X, y, outc):
    """Perform single train iteration.

    Args:
        X: input vectors
        y: target labels.
        outc: target vectors.

    Returns:
        Dict consisting of 'loss', 'err', 'est_loss', 'rho', 'delta_ll' and
        parameters from self.print_pls.
    """
    self.x_d.set_value(X)
    self.y_d.set_value(y)
    self.outc_d.set_value(outc)
    self.rand_outc_d.set_value(floatX(nprng.randn(self.over_sampling, *outc.shape)))

    old_params = self.get_params()
    while True:
        # Reset the parameters to the saved values before every attempt.
        for op, p in zip(old_params, self.model.params):
            p.set_value(op)

        try:
            t_r = self.train(self.c_lambd_inv)

            print_pls_vals = t_r[-len(self.print_pls):]
            self.print_pls_res = dict(zip(self.print_pls.keys(), print_pls_vals))
        except numpy.linalg.LinAlgError:
            # Failed solve: substitute placeholder results so the damping
            # logic below still runs.
            t_r = [1e20, 1e10, 10] + [None] * len(self.print_pls)
            self.print_pls_res = {k: None for k in self.print_pls.keys()}

        e_v = self.eva()
        delta_ll = t_r[1] - e_v[0]
        rho = delta_ll/float(t_r[0])

        print()
        print('lambda:', round(self.c_lambd_inv, 7), 'rho:', round(rho, 2), 'old loss:', t_r[1], 'new loss:', e_v[0])

        if rho < 0:
            # Retry from the saved parameters with stronger damping.
            self.c_lambd_inv *= self.rate * 2
            continue
        elif rho < 0.5:
            self.c_lambd_inv *= self.rate
        elif rho > 0.5:
            self.c_lambd_inv /= self.rate
        break

    res = {'rho': rho, 'est_loss': t_r[0], 'loss': t_r[1], 'err': t_r[2], 'delta_ll': delta_ll}
    res.update(self.print_pls_res)
    return res

def evaluate(self, X_test, y_test, outc_test):
    """Return loss and error for provided dataset.

    Args:
        X_test: input vectors,
        y_test: target labels,
        outc_test: target vectors.

    Returns:
        Dict consisting of 'test_loss', 'test_err'.
    """
    self.x_d.set_value(X_test)
    self.y_d.set_value(y_test)
    self.outc_d.set_value(outc_test)
    te_v = self.eva()
    test_loss = te_v[0]
    test_err = te_v[1]
    return {'test_loss': test_loss, 'test_err': test_err}

def _check_gv_matrix_correctness(self):
    """Compare F*v with a finite-difference gradient change, then exit.

    Perturbs every parameter in place by ``EPS`` times a random vector,
    prints the RMS of both estimates and their maximum absolute difference,
    and terminates the process with ``exit(0)``.
    """
    v = T.vector('v')
    get_Fv = theano.function(
        inputs=[v],
        outputs=[self.F.dot(v)],
        givens={
            self.x: self.x_d,
            self.outc: self.outc_d
        },
        allow_input_downcast=True
    )

    grad_at = theano.function(
        inputs=[],
        outputs=sum(([T.grad(self.loss, p)] for p in self.model.params), []),
        givens={
            self.x: self.x_d,
            self.outc: self.outc_d
        },
        allow_input_downcast=True
    )

    grads0 = grad_at()

    vec = []
    EPS = 1e-5
    for p in self.model.params:
        vec += [nprng.randn(*p.get_value().shape).astype(theano.config.floatX)]
        p.set_value(p.get_value()+vec[-1]*EPS)

    grads1 = grad_at()

    vec_vec = np.concatenate([p.flatten() for p in vec])
    F_vec = get_Fv(vec_vec)
    F_vec_vec = np.concatenate([f.flatten() for f in F_vec])

    grads0_vec = np.concatenate([p.flatten() for p in grads0])
    grads1_vec = np.concatenate([p.flatten() for p in grads1])

    F_vec_emp = (grads1_vec-grads0_vec)/EPS

    print(np.mean(F_vec_emp**2)**0.5, np.mean(F_vec_vec**2)**0.5)
    print(np.max(np.abs(F_vec_emp-F_vec_vec)))

    exit(0)
def pymc3_simple(indep, dep, img_dir_orig, degree=2, mindep=-1.0, maxdep=0.4,
                 sampling=1000, tune=1000, uniform=True, extratext='',
                 plot=True):
    """Fit a polynomial model for ``dep`` as a function of ``indep``.

    The unknowns are the values ``ngrid`` of the dependent quantity on a
    grid with ``degree + 1`` points per independent variable, plus a noise
    scale ``sigma``.

    Parameters
    ----------
    indep : sequence of arrays, one per independent variable
    dep : observed dependent values
    img_dir_orig : base directory; output goes to
        ``<img_dir_orig>/deg_<degree>/<extratext>``, which is created
    degree : polynomial degree
    mindep, maxdep : bounds on the dependent variable, widened to cover the
        observed range
    sampling, tune : number of draws and tuning steps for ``pm.sample``
    uniform : use a Uniform prior on ``ngrid`` (else a TruncatedNormal)
    extratext : tag used in the output directory and file name
    plot : save a trace plot when true

    Returns
    -------
    (trace, xx, map_estimate)
    """
    img_dir = op.join(img_dir_orig, 'deg_%d' % (degree), extratext)
    mkpath(img_dir)

    # 1st and 99th percentiles bound the grid for each independent variable.
    limlist = [np.percentile(indepi, [1.0, 99.0]) for indepi in indep]

    # Limits for the dependent variable.
    lower, upper = min(mindep, np.amin(dep)), max(maxdep, np.amax(dep))

    # One row of degree + 1 evenly spaced points per independent variable;
    # the reshape keeps the (0, degree + 1) shape when `indep` is empty.
    x = np.array([np.linspace(lim[0], lim[-1], degree + 1)
                  for lim in limlist]).reshape(-1, degree + 1)
    xx = np.meshgrid(*x)  # N-D grid for polynomial computations

    # Matrix that maps grid values to polynomial coefficients.
    a_poly_T = get_a_polynd(xx).T
    aTinv = np.linalg.inv(a_poly_T)

    # 2-D array multiplied by the coefficients to evaluate the polynomial at
    # the observed independent-variable values.
    term = calc_poly_tt(indep, degree)

    with pm.Model() as model:
        # Priors on ngrid (values over the grid) and sigma (width of the
        # relation).
        if uniform:
            ngrid = pm.Uniform("ngrid",
                               lower=lower - 1.0e-5,
                               upper=upper + 1.0e-5,
                               shape=xx[0].size,
                               testval=np.random.uniform(
                                   lower, upper, xx[0].size))
        else:
            ngrid = pm.TruncatedNormal("ngrid",
                                       mu=0.3,
                                       sigma=1.0,
                                       lower=lower - 1.0e-5,
                                       upper=upper + 1.0e-5,
                                       shape=xx[0].size,
                                       testval=np.random.uniform(
                                           lower, upper / 2.0, xx[0].size))
        sigma = pm.HalfNormal("sigma", sigma=1)

        # Expected value at each sample.
        coefs = tt.dot(aTinv, ngrid)
        mu = tt.tensordot(coefs, term, axes=1)

        # Likelihood (sampling distribution) of the observations.
        dep_obs = pm.Normal("dep_obs", mu=mu, sigma=sigma, observed=dep)

        map_estimate = pm.find_MAP()
        print(map_estimate)

        trace = pm.sample(draws=sampling,
                          tune=tune,
                          init='adapt_full',
                          target_accept=0.9,
                          return_inferencedata=True)

    if plot:
        az.plot_trace(trace)
        plt.savefig(op.join(img_dir,
                            "polyND%s_trace_pm_simp.png" % (extratext)),
                    bbox_inches='tight',
                    dpi=300)
    print(az.summary(trace, round_to=2))

    return trace, xx, map_estimate
# # Define Parameter Updates # ######################################################### FA_mean_perturbations = FA.mean(axis=1) # Create List of Updates param_updates = [] for i in range(len(params)): print 'Creating updates for parameter %d...' % i print 'Calculating derivative' normalization = T.nnet.softplus(sigmas[i]) + sig_min_perturbations delta = T.tensordot(FA_mean_perturbations, r_epsilons[i], axes=[[0], [0]]) / normalization / n_perturbations # USE ADAM OPTIMIZER p_adam = Adam(delta, params[i], 0.9, 0.999, learning_rate, epsilon=10e-6) param_updates = param_updates + p_adam.updates for i in range(len(sigmas)): print 'Creating updates for std dev of parameter %d...' % i print 'Calculating derivative' normalization = T.nnet.softplus(sigmas[i]) + sig_min_perturbations outer_der = (r_epsilons[i] * r_epsilons[i] - 1.0) / normalization inner_der = T.exp(sigmas[i]) / (1.0 + T.exp(sigmas[i])) delta_sigma = T.tensordot(FA_mean_perturbations, outer_der * inner_der,
def __init__(self, num_actions):
    """Build the shared, policy and value networks and compile the functions.

    Hyper-parameters are read from module-level constants.  The compiled
    callables are stored as ``_policy_grad``, ``_value_grad``,
    ``_train_policy``, ``_train_value``, ``_policy`` and ``_value``.
    """
    # Hyper-parameters.
    self.num_actions = num_actions
    self.batch_size = 1  # a single sample per call
    self.discount_rate = DISCOUNT_RATE
    self.history_length = HISTORY_LENGTH
    self.screen_dim = DIMS
    self.img_height = SCREEN_HEIGHT
    self.img_width = SCREEN_WIDTH
    self.beta = BETA
    self.learning_rate = LEARNING_RATE
    self.rms_decay = RMS_DECAY
    self.rms_epsilon = RMS_EPSILON

    # Symbolic inputs, created once and reused.
    state = T.tensor3('state')
    reward = T.fscalar('reward')
    advantage = T.fscalar('advantage')
    action = T.iscalar('action')

    # Shared variables that hold the actual values.
    float_type = theano.config.floatX
    state_shape = (self.history_length, self.img_height, self.img_width)
    self.state_shared = theano.shared(np.zeros(state_shape, dtype=float_type))
    self.reward_shared = theano.shared(np.zeros((1), dtype=float_type))
    self.advantage_shared = theano.shared(np.zeros((1), dtype=float_type))
    self.action_shared = theano.shared(np.zeros((1), dtype='int32'))

    # Shared trunk, then the policy and value heads built on top of it.
    self.shared_net = self.build_shared_network()
    _shared_out = lasagne.layers.get_output(self.shared_net, state)

    self.policy_network = self.build_policy_network(self.shared_net)
    policy_out = lasagne.layers.get_output(self.policy_network, state)

    self.value_network = self.build_value_network(self.shared_net)
    value_out = lasagne.layers.get_output(self.value_network, state)

    # ----------------------- loss definition -----------------------
    # Negative log-probability of the stored action, weighted by the
    # stored advantage.
    chosen_log_prob = T.log(policy_out[0][self.action_shared])
    weighted_log_prob = -(chosen_log_prob.dot(self.advantage_shared))

    # Entropy term, added with the regularisation coefficient beta.
    entropy = -(T.tensordot(policy_out, T.log(policy_out)).dot(-1))
    policy_loss = T.sum(weighted_log_prob + self.beta * entropy)

    # Halved squared error between the stored reward and the value output.
    squared_error = ((self.reward_shared - value_out)**2) / 2
    value_loss = T.sum(squared_error)

    total_loss = T.sum(policy_loss + (0.5 * value_loss))
    # ---------------------------------------------------------------

    shared_params = lasagne.layers.helper.get_all_params(self.shared_net)
    only_policy_params = self.policy_network.get_params()
    only_value_params = self.value_network.get_params()

    policy_params = shared_params + only_policy_params
    value_params = shared_params + only_value_params

    g_time = time.time()
    logger.info("graph compiling")

    # Both gradient sets are taken with respect to the total loss.
    policy_grad = T.grad(total_loss, policy_params)
    value_grad = T.grad(total_loss, value_params)

    # Two kinds of updates: one per head.
    policy_updates = rmsprop_updates(policy_grad, policy_params,
                                     self.learning_rate, self.rms_decay,
                                     self.rms_epsilon)
    value_updates = rmsprop_updates(value_grad, value_params,
                                    self.learning_rate, self.rms_decay,
                                    self.rms_epsilon)

    givens = {
        state: self.state_shared,
        reward: self.reward_shared,
        action: self.action_shared,
        advantage: self.advantage_shared,
    }

    # Functions that return the gradients so they can be accumulated.
    self._policy_grad = theano.function([], policy_grad, givens=givens)
    self._value_grad = theano.function([], value_grad, givens=givens)

    # Functions that take the gradients as input and apply the updates.
    # The original author marks this part as needing more work.
    self._train_policy = theano.function(policy_grad, [],
                                         updates=policy_updates)
    self._train_value = theano.function(value_grad, [],
                                        updates=value_updates)

    # Network outputs for the stored state.
    state_givens_policy = {state: self.state_shared}
    state_givens_value = {state: self.state_shared}
    self._policy = theano.function([], policy_out,
                                   givens=state_givens_policy)
    self._value = theano.function([], value_out,
                                  givens=state_givens_value)

    logger.info("Theano Graph Compiled !! %f", time.time() - g_time)
# Shift the transform by DISTANCE along axis 2, then drop DISTANCE samples
# at each end of both signals.
rolled = T.roll(transform, -DISTANCE, axis=2)
rtransform = rolled[:, :, DISTANCE:-DISTANCE]
routput = output[:, :, DISTANCE:-DISTANCE]

if args.continuous:
    # The continuous loss uses the determinants of the covariance matrices
    # of the signal and of the residual.
    residual = rtransform - routput

    # Subtract the means taken over axes 0 and 2 before forming covariances.
    sigmean = rtransform.mean(axis=(0, 2), keepdims=True)
    epsmean = residual.mean(axis=(0, 2), keepdims=True)
    rtdelta = rtransform - sigmean
    epsdelta = residual - epsmean

    # Covariance matrices: contract axes 0 and 2, divide by the sample count.
    sig_count = rtransform.shape[0] * rtransform.shape[2]
    eps_count = residual.shape[0] * residual.shape[2]
    sig_cov = T.tensordot(rtdelta, rtdelta, axes=((0, 2), (0, 2))) / sig_count
    eps_cov = T.tensordot(epsdelta, epsdelta,
                          axes=((0, 2), (0, 2))) / eps_count

    # A small constant is added to each determinant before the log.
    det_sig = TNL.Det()(sig_cov) + 1e-48
    det_eps = TNL.Det()(eps_cov) + 1e-48

    entropy = T.log(det_sig)
    info = T.log(det_eps)

    # The first two terms give the entropy contrast.  The third encourages
    # a zero mean residual so predictions are not merely right up to a
    # constant offset.
    loss = info - entropy + 1e-2 * (epsmean**2).mean()
else:
    # Entropy of the average transformed signal; the aim is to make it large.
    mean_transform = rtransform.mean(axis=(0, 2))
    entropy = -1 * (mean_transform * T.log(mean_transform + 1e-6)).sum()
def _setup_functions(self):
    """Create the weights if needed, build the graph and compile functions.

    Compiles ``_fit_function``, ``_predict``, ``_next_state``,
    ``_predict_attn`` and ``_cost``.  Existing ``self.params`` are reused
    when present.
    """
    def announce(label, dims):
        # Print a parameter shape under its label and hand it back.
        print(label, dims)
        return dims

    # Parameter shapes.
    sh_w_n = announce(
        "sh_w_n", (self.n_actions + 1, self.n_state + 1, self.n_state))
    sh_w_t = announce(
        "sh_w_t", (self.n_tex + 1, self.n_state + 1, self.n_ray))
    sh_l1 = announce("sh_l1", (self.n_ray + self.n_key, self.n_interaction))
    sh_l2 = announce("sh_l2", (self.n_interaction, 1))

    # Memory cells.
    sh_mk = (self.n_scene, self.n_key)
    sh_mc = (self.n_scene, 4)
    print("sh_mk", sh_mk)
    print("sh_mc", sh_mc)

    if hasattr(self, "params"):
        wn, wt, wl1, wb1, wl2, wmk, wmc = self.params
    else:
        print('generating weights')
        wn = uniform(sh_w_n, scale=0.2)  # (A+1)x(S+1)xS
        wt = uniform(sh_w_t, scale=0.2)  # (P+1)x(S+1)xR
        wl1 = uniform(sh_l1, scale=0.2)  # (R+K)xH
        wb1 = shared0s((self.n_interaction, ))  # H
        wl2 = uniform(sh_l2, scale=0.2)  # Hx1
        wmk = uniform(sh_mk, scale=0.2)  # MxK
        wmc = uniform(sh_mc, scale=0.2)  # MxC
        self.params = [wn, wt, wl1, wb1, wl2, wmk, wmc]

    A = T.tensor3()  # TxNxA
    P = T.tensor3()  # TxNxP
    y = T.tensor3()  # TxNxC

    def state_transform(a_, s_):
        # Inputs: NxA, NxS.  Both get a column of ones appended.
        actions_1 = T.concatenate([a_, T.ones((s_.shape[0], 1))], axis=1)
        # Nx(S+1)xS
        temp_ = T.tensordot(actions_1, wn, axes=[1, 0])
        states_1 = T.concatenate([s_, T.ones((s_.shape[0], 1))], axis=1)
        # NxS
        return T.sum(temp_ * states_1.dimshuffle([0, 1, 'x']), axis=1)

    # TxNxS: states produced by scanning over the actions from a zero state.
    S, _ = theano.scan(fn=state_transform,
                       outputs_info=[T.zeros([A.shape[1], self.n_state])],
                       sequences=[A])

    # TxNx(S+1)xR
    textures_1 = T.concatenate([P, T.ones([S.shape[0], S.shape[1], 1])],
                               axis=2)
    temp_ = T.tensordot(textures_1, wt, axes=[2, 0])

    # TxNxR ray elements.
    states_1 = T.concatenate([S, T.ones((S.shape[0], S.shape[1], 1))],
                             axis=2)
    R = T.sum(temp_ * states_1.dimshuffle([0, 1, 2, 'x']), axis=2)

    # TxNxMx(R+K) transformation input: rays tiled across scene cells next
    # to the memory keys tiled across time and batch.
    tiled_rays = T.tile(R.dimshuffle([0, 1, 'x', 2]),
                        [1, 1, self.n_scene, 1])
    tiled_keys = T.tile(wmk.dimshuffle(['x', 'x', 0, 1]),
                        [R.shape[0], R.shape[1], 1, 1])
    R_2 = T.concatenate([tiled_rays, tiled_keys], axis=3)

    # TxNxMxH
    L1 = sigmoid(
        T.tensordot(R_2, wl1, axes=[3, 0]) +
        wb1.dimshuffle(['x', 'x', 'x', 0]))

    # TxNxM soft attention weights; 0.01 is added to the normaliser.
    Att_temp = T.exp(T.tensordot(L1, wl2, axes=[3, 0]).sum(axis=3))
    Att = Att_temp / (T.sum(Att_temp, axis=2, keepdims=True) + 0.01)

    # TxNxC final colors.
    Col = T.tensordot(Att, wmc, axes=[2, 0])

    # Summed squared reconstruction error.
    rec_cost = T.sum(T.sqr(Col - y))
    cost = rec_cost

    print('getting updates')
    updates = Adam(self.params, cost)

    print('compiling')
    self._fit_function = theano.function([A, P, y], cost, updates=updates)
    self._predict = theano.function([A, P], Col)
    self._next_state = theano.function([A], S)
    self._predict_attn = theano.function([A, P], Att)
    # Cost without updates, for checking against a test set.
    self._cost = theano.function([A, P, y], cost)
def __init__(self, vec_dim, output_dim, num_words, mini_batch_size=30, rho=1e-6):
    """Create the parameters and compile the Theano helper functions.

    :param vec_dim: Dimension of a single word vector.
    :param output_dim: Output dimension.
    :param num_words: Number of different words.
    :param mini_batch_size: Size of mini-batch.
    :param rho: L2 penalization coefficient in the cross-entropy error.
    """
    self.vec_dim = vec_dim
    self.output_dim = output_dim
    self.num_words = num_words
    self.mini_batch_size = mini_batch_size
    self.default_vec = lambda: np.zeros(self.vec_dim).astype(floatX)
    self.rho = rho

    # Embedding matrix L.
    # --------------------------
    # Size : (single-word dimension, number of words).
    # L is trained jointly with the comp. models.
    self.L = 0.01 * ran(self.vec_dim, self.num_words).astype(floatX)

    # Neural Tensor Layer weights.
    # --------------------------
    # V is the tensor that defines multiple bilinear forms.
    # W, b are classical-RNN weight and bias matrices.
    self.V = shared(0.01 * ran(self.vec_dim, 2 * self.vec_dim,
                               2 * self.vec_dim).astype(floatX),
                    name='V', borrow=True)
    self.W = shared(0.01 * ran(self.vec_dim,
                               2 * self.vec_dim).astype(floatX),
                    name='W', borrow=True)
    self.b = shared(np.zeros(self.vec_dim).astype(floatX),
                    name='b', borrow=True)

    # Softmax weights.
    # --------------------------
    # W_s, b_s are the sentiment classification weight and bias matrices.
    self.W_s = shared(0.01 * ran(self.output_dim,
                                 self.vec_dim).astype(floatX),
                      name='W_s', borrow=True)
    self.b_s = shared(np.zeros(self.output_dim).astype(floatX),
                      name='b_s', borrow=True)

    self.params = [self.V, self.W, self.b, self.W_s,
                   self.b_s]  # Only shared variables

    # Gradients.
    # --------------------------
    # NOTE(review): the accumulators start from np.empty, i.e. arbitrary
    # memory; presumably they are reset before use elsewhere -- confirm.
    self.np_dV = np.empty(
        (self.vec_dim, 2 * self.vec_dim, 2 * self.vec_dim))
    self.np_dW = np.empty((self.vec_dim, 2 * self.vec_dim))
    self.np_db = np.empty(self.vec_dim)
    self.np_dW_s = np.empty((self.output_dim, self.vec_dim))
    self.np_db_s = np.empty(self.output_dim)

    self.dV = shared(self.np_dV.astype(floatX), name='dV', borrow=True)
    self.dW = shared(self.np_dW.astype(floatX), name='dW', borrow=True)
    self.db = shared(self.np_db.astype(floatX), name='db', borrow=True)
    self.dW_s = shared(self.np_dW_s.astype(floatX), name='dW_s',
                       borrow=True)
    self.db_s = shared(self.np_db_s.astype(floatX), name='db_s',
                       borrow=True)

    # As L is jointly trained with the above parameters, we need a
    # "gradient" for L.  This comes in the form of a dictionary.
    self.dL = collections.defaultdict(self.default_vec)

    # Theano variables for the computational graph.
    # --------------------------
    self.p_a = T.vector('Parent activation')
    self.lr = T.vector('Stacked activation')
    self.prob = T.vector('Probabilities')
    self.diff = T.vector('Distribution differences')
    self.node_error = T.vector('Soft-max node error')
    self.label = T.iscalar('Label')
    self.cost = T.scalar('Cost')
    self.rate = T.scalar('Learning rate')
    self.scale = T.scalar('Batch scale')

    # Softmax with the maximum subtracted before exponentiation.
    prob = T.dot(self.W_s, self.p_a) + self.b_s
    prob -= T.max(prob)
    prob = T.exp(prob)
    prob /= T.sum(prob)

    outer = T.outer(self.node_error, self.lr)

    # Recombination
    # --------------------------
    # Returns parent activation via children activation.
    self.recombination = theano.function(
        [self.lr],
        T.tanh(
            T.dot(self.W, self.lr) + self.b + T.tensordot(
                self.V, T.outer(self.lr, self.lr), axes=([1, 2], [0, 1]))),
        allow_input_downcast=True)

    # Probabilities
    # --------------------------
    # Returns posterior probabilities given parent activation.
    self.probabilities = theano.function([self.p_a],
                                         prob,
                                         allow_input_downcast=True)

    # Soft-max node error
    # --------------------------
    # Pre-computes softmax node error given distribution difference
    # (target - real).  The Hadamard product is added afterwards.
    updates_1 = collections.OrderedDict()
    updates_1[self.dW_s] = self.dW_s + T.outer(self.diff, self.p_a)
    updates_1[self.db_s] = self.db_s + self.diff
    self.softmax_node_error = theano.function([self.diff, self.p_a],
                                              T.dot(self.W_s.T, self.diff),
                                              updates=updates_1,
                                              allow_input_downcast=True)

    # Penalization term
    # --------------------------
    # Add penalization term to the cost.
    self.add_penalization_term = theano.function(
        [self.cost],
        self.cost + (self.rho / 2) *
        (T.sum(self.V**2) + T.sum(self.W**2) + T.sum(self.W_s**2)),
        allow_input_downcast=True)

    # Prop error
    # --------------------------
    # Back-propagates error and updates gradients.
    updates_2 = collections.OrderedDict()
    updates_2[self.dV] = self.dV + (T.outer(self.lr, self.lr)[:, :, None] *
                                    self.node_error).T
    updates_2[self.dW] = self.dW + outer
    updates_2[self.db] = self.db + self.node_error
    self.prop_error = theano.function(
        [self.node_error, self.lr],
        T.dot(self.W.T, self.node_error) +
        T.tensordot(self.V.transpose((0, 2, 1)) + self.V,
                    outer.T,
                    axes=([1, 0], [0, 1])),
        updates=updates_2,
        allow_input_downcast=True)

    # Update params
    # --------------------------
    # Updates all weights & biases during gradient descent.
    updates_3 = collections.OrderedDict()
    updates_3[self.V] = self.V - self.rate * self.scale * (
        self.dV + self.rho * self.V)
    updates_3[self.W] = self.W - self.rate * self.scale * (
        self.dW + self.rho * self.W)
    updates_3[self.b] = self.b - self.rate * self.scale * self.db
    updates_3[self.W_s] = self.W_s - self.rate * self.scale * (
        self.dW_s + self.rho * self.W_s)
    # The step starts from the bias itself (b_s), matching the update for b.
    updates_3[self.b_s] = self.b_s - self.rate * self.scale * self.db_s
    self.update_params = theano.function([self.scale, self.rate],
                                         self.scale,
                                         updates=updates_3,
                                         allow_input_downcast=True)
def gram_matrix(mat):
    """Contract a tensor with itself over its last (flattened) axis.

    ``mat`` is first collapsed to three dimensions with ``flatten(ndim=3)``
    (trailing axes are merged into axis 2).  The flattened tensor is then
    combined with itself through ``T.tensordot``, summing over axis 2 of both
    operands.  Because only that one axis is contracted, the result keeps the
    two leading axes of each operand.
    """
    flat = mat.flatten(ndim=3)
    return T.tensordot(flat, flat, axes=([2], [2]))
def gram_matrix(x):
    """Return the self-tensordot of ``x`` after flattening it to 3-D.

    The input is reshaped with ``flatten(ndim=3)`` so that everything beyond
    the second axis lives in axis 2; that axis is then summed out between the
    flattened tensor and itself.
    """
    features = x.flatten(ndim=3)
    # Sum over the merged trailing axis of both copies.
    gram = T.tensordot(features, features, axes=([2], [2]))
    return gram
def h_softmax(x, batch_size, n_outputs, n_classes, n_outputs_per_class,
              W1, b1, W2, b2, target=None):
    """Two-level hierarchical softmax.

    A first softmax assigns each input row a distribution over classes; a
    second softmax assigns, inside each class, a distribution over that
    class's outputs.  The probability of an output is the product of the two.
    See [1]_ for the original description.

    When ``target`` is given, only the probability of each row's target is
    computed.  When ``target`` is None, the probabilities of all outputs are
    computed, in the order in which the outputs are defined.

    .. versionadded:: 0.7.1

    Parameters
    ----------
    x: tensor of shape (batch_size, number of features)
        The minibatch input.
    batch_size: int
        Number of rows in ``x``.
    n_outputs: int
        Total number of outputs.
    n_classes: int
        Number of classes, i.e. the width of the first softmax.
    n_outputs_per_class: int
        Number of outputs grouped under each class.
    W1: tensor of shape (number of features, n_classes)
        Weights of the first (class) softmax.
    b1: tensor of shape (n_classes,)
        Bias of the first softmax.
    W2: tensor of shape (n_classes, number of features, n_outputs_per_class)
        Weights of the second (within-class) softmax.
    b2: tensor of shape (n_classes, n_outputs_per_class)
        Bias of the second softmax.
    target: tensor of shape (batch_size,) or (batch_size, 1), optional
        Target output index for every row of ``x``.  It is flattened before
        use.  If None (the default), every output is evaluated.

    Returns
    -------
    output_probs: tensor
        With ``target`` None: shape (batch_size, n_outputs).
        With ``target`` given: one probability per row of ``x``.
        NOTE(review): the target branch indexes with two integer vectors,
        which yields a 1-D result, while earlier documentation advertised
        (batch_size, 1) -- confirm against callers.

    Notes
    -----
    ``n_classes * n_outputs_per_class`` must be at least ``n_outputs``; any
    surplus columns are sliced away.  Both values must agree with the
    corresponding dimensions of W1, b1, W2 and b2.  The cheapest
    configuration is when both are about the square root of ``n_outputs``.

    References
    ----------
    .. [1] J. Goodman, "Classes for Fast Maximum Entropy Training,"
        ICASSP, 2001, <http://arxiv.org/abs/cs/0108006>.
    """
    # Level 1: probability of each class for every row of x.
    class_probs = theano.tensor.nnet.softmax(T.dot(x, W1) + b1)

    if target is not None:
        # Only the outputs named by the targets are evaluated.
        flat_target = target.flatten()
        # Class containing each target, and the target's position inside it.
        class_of_target = flat_target // n_outputs_per_class
        slot_in_class = flat_target % n_outputs_per_class

        # Level 2, restricted to the target's class through a sparse block dot.
        block_scores = sparse_block_dot(W2.dimshuffle('x', 0, 1, 2),
                                        x.dimshuffle(0, 'x', 1),
                                        T.zeros((batch_size, 1), dtype='int32'),
                                        b2,
                                        class_of_target.dimshuffle(0, 'x'))
        within_probs = theano.tensor.nnet.softmax(
            block_scores.dimshuffle(0, 2))

        picked_class_probs = class_probs[T.arange(batch_size),
                                         class_of_target]
        picked_within_probs = within_probs[T.arange(batch_size),
                                           slot_in_class]
        return picked_class_probs * picked_within_probs

    # No targets: evaluate every output of every class.
    all_scores = T.tensordot(x, W2, (1, 1)) + b2
    within_probs = theano.tensor.nnet.softmax(
        all_scores.reshape((-1, n_outputs_per_class)))
    within_probs = within_probs.reshape((batch_size, n_classes, -1))
    joint_probs = class_probs.dimshuffle(0, 1, 'x') * within_probs
    joint_probs = joint_probs.reshape((batch_size, -1))
    # The flattened width is n_classes * n_outputs_per_class, which may exceed
    # n_outputs; drop the surplus columns.
    return joint_probs[:, :n_outputs]
def met(q, p):
    """Contract ``nu(q)`` with the diagonal of ``p`` dotted with itself.

    ``p`` is combined with itself over axis 1 of both operands, the diagonal
    of that product is taken, and the result is contracted with ``nu(q)``
    along axis 0 of each.
    """
    weights = nu(q)
    # NOTE(review): the full product is built only to read its diagonal.
    self_product = T.tensordot(p, p, [[1], [1]])
    diag = self_product.diagonal()
    return T.tensordot(weights, diag, [[0], [0]])
def _step_2(m_, x_, r_, h_, c_, w_, y_, c2_):
    """Run one masked LSTM-style step and draw the next one-hot symbol.

    Reads ``tparams``, ``memory``, ``options`` and ``config`` from the
    enclosing scope.

    Parameters (shapes follow the inline notes of the original code, which
    label gates as N x H -- TODO confirm against the caller):
        m_:  per-row mask; where it is 0 the previous state is carried over.
        x_:  current input block.
        r_:  per-row thresholds compared against the cumulative distribution
             (presumably uniform random draws -- confirm).
        h_:  previous hidden state.
        c_:  previous cell state.
        w_:  previous one-hot output, fed back as input.
        y_, c2_: accepted but not used in this step.

    Returns:
        (h, c, w, y, context): new hidden state, new cell state, one-hot
        sample cast to ``config.floatX``, the softmax distribution it was
        drawn from, and the raw context scores.
    """
    # Joint input: x_, w_ and h_ side by side along axis 1.
    joint_in = tensor.concatenate([x_, w_, h_], axis=1)

    def _affine(k):
        # Pre-activation k: joint input against weight slice k plus bias row k.
        return (tensor.tensordot(joint_in, tparams['weight'][k], axes=[1, 1])
                + tparams['bias'][k, :][None, :])

    forget_gate = tensor.nnet.sigmoid(_affine(0))
    input_gate = tensor.nnet.sigmoid(_affine(1))
    candidate = tensor.tanh(_affine(2))

    # New memory, then keep the old memory wherever the mask is off.
    cell = input_gate * candidate + forget_gate * c_
    cell = m_[:, None] * cell + (1. - m_)[:, None] * c_

    # New hidden state (slice 3 acts as the output gate), masked the same way.
    hidden = tensor.nnet.sigmoid(_affine(3)) * tensor.tanh(cell)
    hidden = m_[:, None] * hidden + (1. - m_)[:, None] * h_

    # Scores over the memory slots plus one extra trailing column.
    context = (tensor.tensordot(hidden, tparams['U'], axes=[1, 1])
               + tparams['b'][None, :])
    y_old = (tensor.tensordot(hidden, tparams['U_context'], axes=[1, 1])
             + tparams['b_context'][None, :])

    # All but the last context column weight `memory`; the last column
    # weights y_old.  The mix is temperature-scaled before the softmax.
    memory_part = tensor.sum(context[:, :-1, None] * memory, axis=1)
    direct_part = context[:, -1][:, None] * y_old
    y = tensor.nnet.softmax(
        (memory_part + direct_part) / options['sample_temperature'])

    # Sampled index = number of cumulative-probability entries r_ exceeds.
    above = tensor.switch(
        tensor.gt(r_, tensor.extra_ops.cumsum(y, axis=1)), 1, 0)
    w_nums = above.sum(axis=1)

    w = tensor.extra_ops.to_one_hot(w_nums, options['ydim'],
                                    dtype=config.floatX)
    return hidden, cell, w.astype(config.floatX), y, context
def get_output_for(self, inputs, **kwargs):
    """Multiply the ``self.Wf`` projections of the first two inputs.

    Each of ``inputs[0]`` and ``inputs[1]`` is contracted along its axis 2
    with axis 1 of ``self.Wf``; axes 1 and 2 of the 4-D result are then
    swapped.  The two projections are multiplied element-wise.  ``kwargs``
    is accepted but not used.
    """
    def _factor(tensor_in):
        # Project onto Wf and reorder the result to (0, 2, 1, 3).
        projected = T.tensordot(tensor_in, self.Wf, axes=(2, 1))
        return projected.dimshuffle(0, 2, 1, 3)

    first, second = inputs[0], inputs[1]
    return _factor(first) * _factor(second)
def apply(self, input_):
    """Apply the affine map held in ``self.parameters`` to ``input_``.

    ``self.parameters`` unpacks to a weight and a bias.  Axis 1 of ``input_``
    is contracted with axis 0 of the weight, and the bias is added to the
    result.
    """
    weight, bias = self.parameters
    return T.tensordot(input_, weight, axes=[[1], [0]]) + bias
def __init__(self,
             E,
             n_users,
             lrate=0.0001,
             margin_loss=1,
             rng=None,
             init_w2v=False):
    """Create the user-embedding model and compile its Theano functions.

    Parameters:
        E: array indexed column-wise by the sentence and negative-sample ids
            (presumably pre-trained word embeddings, one per column --
            confirm).  ``E.shape[0]`` is the embedding size used for the
            default initialisation.
        n_users: number of user columns to allocate.
        lrate: stored on the instance; the compiled ``train`` function takes
            its learning rate as an explicit argument instead.
        margin_loss: margin of the hinge ranking loss.
        rng: ``numpy.random.RandomState``; defaults to one seeded with 1234.
        init_w2v: "gauss" or "mean" select the matching ``init_w2v_*``
            initialiser; any other value uses ``init_weight``.

    Attributes set:
        params, margin_loss, lrate, and the compiled functions ``dbg``
        (embedding lookups), ``train`` (loss plus update of one user column)
        and ``predict`` (user score and probability matrix).
    """
    # Default to a fixed seed when the caller supplies no generator.
    if rng is None:
        rng = np.random.RandomState(1234)

    # Starting values of the user-embedding matrix.
    if init_w2v == "gauss":
        user_init = init_w2v_gauss(rng, n_users, E)
    elif init_w2v == "mean":
        user_init = init_w2v_mean(rng, E, n_users)
    else:
        user_init = init_weight(rng, (E.shape[0], n_users))

    user_emb = theano.shared(user_init.astype(theano.config.floatX),
                             borrow=True)
    word_emb = theano.shared(E.astype(theano.config.floatX), borrow=True)

    # Only the user embeddings are exposed as parameters.
    self.params = [user_emb]
    self.margin_loss = margin_loss
    self.lrate = lrate

    # Symbolic inputs.
    usr_idx = T.iscalar('usr')
    sent_idx = T.ivector('sent')
    neg_samp_idx = T.imatrix('neg_sample')
    curr_lrate = T.fscalar('lrate')

    # Embedding lookups (columns).
    usr_vec = user_emb[:, usr_idx]
    sent_vecs = word_emb[:, sent_idx]
    neg_vecs = word_emb[:, neg_samp_idx]

    # Hinge ranking loss: each sentence entry's score against the scores of
    # its negative samples.
    pos_score = T.dot(usr_vec, sent_vecs)
    neg_score = T.tensordot(usr_vec, neg_vecs, axes=(0, 0))
    hinge = T.maximum(0, self.margin_loss - pos_score[:, None] + neg_score)
    final_loss = hinge.sum(axis=None)

    # Gradient with respect to the selected user column only, written back
    # with set_subtensor so that just that column changes.
    usr_grad = T.grad(final_loss, usr_vec)
    new_usr_column = T.set_subtensor(usr_vec,
                                     usr_vec - curr_lrate * usr_grad)
    updates = ((user_emb, new_usr_column), )

    self.dbg = theano.function(inputs=[usr_idx, sent_idx, neg_samp_idx],
                               outputs=[usr_vec, sent_vecs, neg_vecs],
                               allow_input_downcast=True)

    self.train = theano.function(
        inputs=[usr_idx, sent_idx, neg_samp_idx, curr_lrate],
        outputs=final_loss,
        updates=updates,
        mode="FAST_RUN",
        allow_input_downcast=True)

    # Prediction: scores of every user against every sentence entry,
    # soft-maxed across users, then log-probabilities summed over entries.
    scores_m = T.dot(user_emb.T, word_emb[:, sent_idx])
    prob = T.nnet.softmax(scores_m.T).T
    log_prob = T.log(prob).sum(axis=1)
    user_score = log_prob[usr_idx]

    self.predict = theano.function(inputs=[usr_idx, sent_idx],
                                   outputs=[user_score, prob],
                                   allow_input_downcast=True)
def dot(self, vec, mat):
    """Multiply ``vec`` by ``mat`` with the product chosen by ``self.depth``.

    A depth of exactly 1 uses ``T.dot``; any other depth uses
    ``T.tensordot`` with a single contracted axis.
    """
    if self.depth != 1:
        return T.tensordot(vec, mat, 1)
    return T.dot(vec, mat)
# NOTE(review): the next four lines are the tail of a function whose header
# lies outside this chunk; they return either the mean of `out` or `out`.
if mean:
    return out.mean()
else:
    return out

# Symbolic float32 matrix named "cov".
t_C = T.matrix("cov","float32")
# det(C) * inv(C); for an invertible C this product is the adjugate of C.
DDS_var = T.nlinalg.det(t_C)*T.nlinalg.MatrixInverse()(t_C)
DDS = theano.function([t_C],DDS_var,allow_input_downcast = True)

def np_DDS(C):
    """NumPy counterpart of ``DDS``: return ``det(C) * inv(C)``."""
    return np.linalg.det(C)*np.linalg.inv(C)

# Symbolic float32 matrix named "vec_in".
t_x = T.matrix("vec_in","float32")
# For each row of t_x: the row dotted with (row times inv(C)), i.e. the
# quadratic form of that row with inv(C).
XDX_var = (t_x * T.tensordot(t_x,T.nlinalg.MatrixInverse()(t_C),axes = [1,0])).sum(axis = 1)
XDX = theano.function([t_C,t_x],XDX_var,allow_input_downcast = True)

def np_XDX(C,x):
    """NumPy counterpart of ``XDX``: row-wise quadratic form with ``inv(C)``."""
    return (x*np.tensordot(x,np.linalg.inv(C),axes = [1,0])).sum(axis = 1)

def att_LAM(C,Q,F,x):
    # NOTE(review): no return statement is visible in this chunk, and
    # `first_term` is not used by the visible lines; the function may
    # continue beyond what is shown.
    # Assumes x is indexed as (sequence, step, feature) -- TODO confirm.
    # Quadratic form of the first step of every sequence with inv(C).
    first_term = (x[:,0]*np.tensordot(x[:,0],np.linalg.inv(C),axes = [1,0])).sum(axis = 1)
    # Steps 1.. versus steps 0..-1 mapped through F (last axis of x
    # contracted with axis 1 of F).
    Tn = x[:,1:]
    Tnp1 = np.tensordot(x[:,:-1,:],F,axes = [2,1])
    dif = Tn - Tnp1
    # Quadratic form of each difference with inv(Q), summed over axes 1 and 2.
    other_terms = dif*np.tensordot(dif,np.linalg.inv(Q),axes = [2,0])
    other_terms = (other_terms).sum(axis = (1,2))
sampler = sampling.AudioFileSampler.load(path+"/sampler.p") #sampler = sampling.AudioFileSampler(["Zece/audio"+str(i+1).zfill(2)+".wav" for i in range(23)], sample_size) #sampler = sampling.AudioFileFreqSampler("but_one_day.wav", sample_size, 128, 20) #sampler = sampling.SinusSampler(sample_size) '''import pickle with open(path+"/gaussian_process.p", "rb") as f: pick = pickle.Unpickler(f) sampler = sampling.GaussianProcess(sample_size, pick.load(), 1.0) ''' ###################### x = T.dtensor3('x') batch_size = x.shape[0] z = T.tensordot(x, encode, ([1, 2], [0, 1])) + encode_bias #encoder(x) xx = generator(T.reshape(z, [-1, 1, generator.gen_dim])) mean_enc = z.mean() var_enc = T.sqr(z - mean_enc).mean() cost_enc = -(xx - x).norm(2, axis=[1, 2]).mean() / (sample_size * data_dim) #cost_enc = -T.sqr(xx - x).mean(axis=[0, 1, 2]) #cost_enc += -(mean_enc).norm(2)*0.01 - (T.log(var_enc)).norm(2)*0.001 #cost_enc += -0.01 * generator.normL1() cost_enc *= 100
def step(target_token_id, hidden_state, conv_out, gru_prediction_to_reset,
         gru_prediction_to_hidden, gru_prediction_to_update,
         gru_prev_hidden_to_reset, gru_prev_hidden_to_next,
         gru_prev_hidden_to_update, gru_hidden_update_bias, gru_update_bias,
         gru_reset_bias, conv_weights_code_l3, conv_layer3_bias,
         code_embeddings, all_name_reps, use_prev_stat):
    """Run one attention + GRU step.

    Reads ``self`` and ``do_dropout`` from the enclosing scope.

    The step gates ``conv_out`` with a leaky-rectified copy of the hidden
    state (slope 0.01 for non-positive entries), L2-normalises it, applies a
    third convolution, and soft-maxes the result into per-token weights.
    Those weights form a weighted sum of the un-padded code embeddings, the
    predicted embedding.  A GRU update then yields the next hidden state.

    Parameters:
        target_token_id: index into ``all_name_reps`` of the current target.
        hidden_state: previous GRU hidden state.
        conv_out: output of the earlier convolution layers.
        gru_*: GRU weight matrices and biases for the reset gate, the update
            gate and the candidate hidden state.
        conv_weights_code_l3, conv_layer3_bias: third convolution's
            parameters.
        code_embeddings: token embeddings, including padding rows at both
            ends.
        all_name_reps: embeddings of target tokens.
        use_prev_stat: condition; consulted only when ``do_dropout`` is
            truthy.  It then selects the true target embedding (when true)
            or the predicted one.

    Returns:
        (next_hidden, predicted_embedding, code_toks_weights)
    """
    # Leaky-ReLU of the hidden state, broadcast over the two trailing axes.
    gated_l2 = conv_out * T.switch(hidden_state > 0, hidden_state,
                                   0.01 * hidden_state).dimshuffle(
                                       0, 1, 'x', 'x')
    gated_l2 = gated_l2 / gated_l2.norm(2)

    code_convolved_l3 = T.nnet.conv2d(
        gated_l2,
        conv_weights_code_l3,
        image_shape=(1, self.hyperparameters["conv_layer2_nfilters"], None,
                     1),
        filter_shape=self.conv_layer3_code.get_value().shape)[:, 0, :, 0]

    l3_out = code_convolved_l3 + conv_layer3_bias
    # This should be one dimension (the size of the sentence).
    code_toks_weights = T.nnet.softmax(l3_out)

    # The first/last tokens are padding introduced by the three windows.
    padding_size = T.constant(
        self.hyperparameters["layer1_window_size"] +
        self.hyperparameters["layer2_window_size"] +
        self.hyperparameters["layer3_window_size"] - 3)

    # Slice bounds must stay integral, so use floor division.  With `/`,
    # Theano only warns under its default int_division='int' setting, but
    # under Python 3 true division the bounds become floats and cannot be
    # used as indices.
    predicted_embedding = T.tensordot(
        code_toks_weights,
        code_embeddings[padding_size // 2 + 1:-padding_size // 2 + 1],
        [[1], [0]])[0]

    # Get the next hidden!
    if do_dropout:
        # For regularization, we can use the context embeddings *some* of
        # the time.
        embedding_used = T.switch(use_prev_stat,
                                  all_name_reps[target_token_id],
                                  predicted_embedding)
    else:
        embedding_used = all_name_reps[target_token_id]

    reset_gate = T.nnet.sigmoid(
        T.dot(embedding_used, gru_prediction_to_reset) +
        T.dot(hidden_state, gru_prev_hidden_to_reset) + gru_reset_bias)
    update_gate = T.nnet.sigmoid(
        T.dot(embedding_used, gru_prediction_to_update) +
        T.dot(hidden_state, gru_prev_hidden_to_update) + gru_update_bias)
    hidden_update = T.tanh(
        T.dot(embedding_used, gru_prediction_to_hidden) +
        reset_gate * T.dot(hidden_state, gru_prev_hidden_to_next) +
        gru_hidden_update_bias)

    # Blend the old state and the candidate according to the update gate.
    next_hidden = (
        1. - update_gate) * hidden_state + update_gate * hidden_update
    return next_hidden, predicted_embedding, code_toks_weights
def main(data):
    """Build, train and report a choice model with random coefficients.

    ``data`` is accessed like an HDF5-style container: ``data.attrs['n_rows']``
    gives the sample count, and each variable exposes ``.name``, ``['data']``
    and (for the dependent variable) ``['label']``.  NOTE(review): the concrete
    container type is not visible here -- confirm.

    The function:
      1. creates one alternative-specific constant (``asc_*``) per dependent
         variable, and a mean (``beta_*``) plus spread (``beta_*_sigma``)
         parameter per explanatory variable, each with a 0/1 mask that zeroes
         gradients for the last alternative;
      2. builds a sampled utility (used for the training cost) and a
         deterministic utility (used for the Hessian);
      3. trains for 100 epochs with ``opt.sgd_updates``, writing a checkpoint
         under ``params/`` every 5 epochs;
      4. writes standard-error and t-statistic CSV files under ``params/``.

    Nothing is returned.  The ``params/`` directory is assumed to exist.
    """
    # optimizer
    opt = Optimizers()
    # sampler
    theano_rng = RandomStreams(999)
    # import dataset
    n_samples = data.attrs['n_rows']
    lr = 1e-3
    batch_size = 128
    # Explanatory variables, in the order their parameters are created.
    x_data = [
        data['purpose'], data['avg_speed'], data['duration'], data['trip_km'],
        data['n_coord'], data['interval'], data['dow'], data['startdistrict'],
        data['enddistrict']
    ]
    # Dependent variable(s).
    y_data = [data['mode']]
    # Flat shared parameters by name, and the shape each is reshaped to when
    # saved.
    params = OrderedDict()
    params_shp = OrderedDict()
    output = []
    # NOTE(review): `input` shadows the builtin for the rest of the function.
    input = []
    asc_params = []
    asc_params_m = []
    beta_params_f = []   # flat beta shared variables
    beta_params_s = []   # sigma reshaped to `shp`
    beta_params_sf = []  # flat sigma shared variables
    beta_params = []     # beta reshaped to `shp`
    beta_params_m = []   # flat masks
    for var in y_data:
        name = 'asc_' + var.name.strip('/')
        # Shape of one sample of the dependent variable.  It is reused below,
        # after this loop, as the trailing shape of every beta.
        asc_shp = var['data'][:].squeeze().shape[1:]
        print('y', name, asc_shp)
        output.append(init_tensor((), name))
        # The mask zeroes the last entry; the masked gradient keeps it fixed
        # at its initial 0.
        mask = np.ones(asc_shp, DTYPE_FLOATX)
        mask[-1] = 0.
        asc_value = np.zeros(asc_shp, DTYPE_FLOATX) * mask
        asc_params.append(shared(asc_value, name))
        asc_params_m.append(shared(mask, name + '_mask'))
        params[name] = asc_params[-1]
        params_shp[name] = asc_shp
    for var in x_data:
        name = 'beta_' + var.name.strip('/')
        # Per-sample feature shape followed by the alternatives shape.
        shp = var['data'].shape[1:] + asc_shp
        print('x', name, shp)
        input.append(init_tensor(var['data'].shape[1:], name))
        # Zero the slice belonging to the last alternative, then flatten to
        # match the flat parameter storage.
        mask = np.ones(shp, DTYPE_FLOATX)
        mask[..., -1] = 0.
        mask = mask.flatten()
        beta_value = np.zeros(np.prod(shp), DTYPE_FLOATX) * mask
        sigma_value = np.ones(np.prod(shp), DTYPE_FLOATX) * mask
        beta_params_f.append(shared(beta_value, name))
        beta_params_sf.append(shared(sigma_value, name + '_sigma'))
        beta_params.append(T.reshape(beta_params_f[-1], shp))
        beta_params_s.append(T.reshape(beta_params_sf[-1], shp))
        beta_params_m.append(shared(mask, name + '_mask'))
        params[name] = beta_params_f[-1]
        params[name + '_sigma'] = beta_params_sf[-1]
        params_shp[name] = shp
        params_shp[name + '_sigma'] = shp
    # compute the utility function
    # `utility` uses sampled coefficients; `h_utility` uses b + s**2 directly.
    utility = 0.
    h_utility = 0.
    for x, b, s in zip(input, beta_params, beta_params_s):
        # b + s**2 * eps, with eps standard normal and one trailing sample
        # axis of size 1.
        normal_sample = b[..., None] + T.sqr(s)[..., None] * theano_rng.normal(
            size=b.eval().shape + (1, ), avg=0., std=1., dtype=DTYPE_FLOATX)
        # Contract every non-leading axis of x with the matching leading axes
        # of the coefficients.
        ax = [np.arange(x.ndim)[1:], np.arange(b.ndim)[:-1]]
        utility += T.tensordot(x, normal_sample, axes=ax)
        if x.ndim > 2:
            h_utility += T.tensordot(x, b + T.sqr(s), axes=[[1, 2], [0, 1]])
        else:
            h_utility += T.tensordot(x, b + T.sqr(s), axes=[[1], [0]])
    # Add the alternative-specific constants.  `y` keeps the last element of
    # `output` after the loop and is used as the label tensor below.
    for y, asc in zip(output, asc_params):
        utility += asc[None, ..., None]
        h_utility += asc
    # Fold the trailing (sample) axis into the rows for the softmax.
    (d1, d2, d3) = utility.shape
    utility = utility.reshape((d1 * d3, d2))
    p_y_given_x = T.nnet.softmax(utility)
    hessian_prob = T.nnet.softmax(h_utility)  #!
    hessian_nll = T.log(hessian_prob)
    # Log-probability of the observed label for every row.
    hessian_cr = hessian_nll[T.arange(y.shape[0]), y]
    hessian_cost = -T.sum(hessian_cr)
    nll = T.log(p_y_given_x).reshape((d3, d1, d2))
    nll = nll[:, T.arange(y.shape[0]), y]
    # Mean over axis 0, summed over rows, negated.
    cost = -T.sum(T.mean(nll, axis=0))
    gparams = asc_params + beta_params_f + beta_params_sf
    grads = T.grad(cost, gparams)
    # mask gradient updates
    # The beta masks appear twice so the list lines up with gparams
    # (asc, beta, sigma).
    mask = asc_params_m + beta_params_m + beta_params_m
    for j, g in enumerate(grads):
        grads[j] = g * mask[j]
    # create list of updates to iterate over
    updates = opt.sgd_updates(gparams, grads, lr)
    # symbolic equation for the Hessian function
    stderrs = []
    hessian = T.hessian(cost=hessian_cost, wrt=gparams)
    # Per parameter: sqrt of the diagonal of the element-wise 2 / Hessian.
    stderr = [T.sqrt(f) for f in [T.diag(2. / h) for h in hessian]]
    stderrs.extend(stderr)
    # Full dataset as shared variables; labels are cast to int32.
    tensors = input + output
    shared_x = [shared(var['data'][:], borrow=True) for var in x_data]
    shared_y = [T.cast(shared(var['label'][:]), 'int32') for var in y_data]
    shared_variables = shared_x + shared_y
    # Minibatch index and the slice it selects.
    i = T.lscalar('index')
    start_idx = i * batch_size
    end_idx = (i + 1) * batch_size
    print('constructing Theano computational graph...')
    train = theano.function(
        inputs=[i],
        outputs=cost,
        updates=updates,
        givens={
            key: val[start_idx:end_idx]
            for key, val in zip(tensors, shared_variables)
        },
        name='train',
        allow_input_downcast=True,
    )
    # Standard errors are evaluated on the whole dataset.
    std_err = theano.function(
        inputs=[],
        outputs=stderrs,
        givens={key: val[:] for key, val in zip(tensors, shared_variables)},
        name='std errors',
        allow_input_downcast=True,
    )
    # train model
    print('training the model...')
    curves = []
    # Floor division: rows past the last full batch are not visited.
    n_batches = n_samples // batch_size
    epochs = 100
    epoch = 0
    t0 = time.time()
    while epoch < epochs:
        epoch += 1
        # From here on `cost` and `i` are rebound to plain Python values; the
        # symbolic versions are already captured by the compiled functions.
        cost = []
        for i in range(n_batches):
            cost_items = train(i)
            cost.append(cost_items)
        # The value printed as "loglikelihood" is the sum of the per-batch
        # `cost` values, which are negated log-probabilities.
        epoch_cost = np.sum(cost)
        curves.append((epoch, epoch_cost))
        minutes, seconds = divmod(time.time() - t0, 60.)
        hours, minutes = divmod(minutes, 60.)
        print(("epoch {0:d} loglikelihood "
               "{1:.3f} time {hh:02d}:{mm:02d}:{ss:05.2f}").format(
                   epoch, epoch_cost, hh=int(hours), mm=int(minutes), ss=seconds))
        # Every fifth epoch: dump each parameter to CSV and pickle the
        # parameters together with the training curve.
        if (epoch % 5) == 0:
            print('checkpoint')
            param_values = {}
            for name, param in params.items():
                param_shp = params_shp[name]
                param_values[name] = param.eval().reshape(param_shp)
                np.savetxt('params/{}.csv'.format(name), param_values[name].squeeze(), fmt='%.3f', delimiter=',')
            to_file = param_values, curves
            path = 'params/epoch_{0:d}.params'.format(epoch)
            with open(path, 'wb') as f:
                pickle.dump(to_file, f, protocol=pickle.HIGHEST_PROTOCOL)
    # save parameters and stderrs to .csv
    # `stderrs` is rebound from the symbolic list to the evaluated arrays.
    stderrs = std_err()
    params_list = [p for p in asc_params + beta_params_f + beta_params_sf]
    param_names = [p.name for p in asc_params + beta_params_f + beta_params_sf]
    for se, param, name in zip(stderrs, params_list, param_names):
        v = param.eval().squeeze()
        shp = v.shape
        path = 'params/stderrs_{}.csv'.format(name)
        np.savetxt(path, se.reshape(shp), fmt='%.3f', delimiter=',')
        # t-statistic: estimate divided by its standard error.
        path = 'params/tstat_{}.csv'.format(name)
        np.savetxt(path, v / se.reshape(shp), fmt='%.3f', delimiter=',')