def model(y1, s0, s1):
    if is_theano(y1, s0, s1):
        math = tt
    else:
        math = np

    # Compute the background component
    # TODO: This step can be sped up...
    A = math.dot(
        math.reshape(s0, (-1, 1)),
        math.reshape(
            math.concatenate(([1.0], math.zeros_like(y1)), axis=0), (1, -1)
        ),
    )
    a = math.reshape(math.transpose(A), (-1,))
    if math == tt:
        M0 = math.reshape(ts.dot(D, a), (M, -1))
    else:
        M0 = math.reshape(D.dot(a), (M, -1))

    # Compute the spot component
    A = math.dot(
        math.reshape(s1, (-1, 1)),
        math.reshape(math.concatenate(([1.0], y1), axis=0), (1, -1)),
    )
    a = math.reshape(math.transpose(A), (-1,))
    if math == tt:
        M1 = math.reshape(ts.dot(D, a), (M, -1))
    else:
        M1 = math.reshape(D.dot(a), (M, -1))

    # Remove the baseline
    b = math.reshape(2.0 + math.dot(B1, y1), (M, -1))

    return (M0 + M1) / b
def __init__(self, rng, P_input, L2_input, **kwargs):
    # symbol declaration, initialization and definition
    x_1_tm1, x_t = (
        sparse.csr_matrix("x_1_tm1", dtype=theano.config.floatX),
        sparse.csr_matrix("x_t", dtype=theano.config.floatX)
    ) if P_input is None else P_input[:2]

    # elements of history
    shape = kwargs.get("shape")
    if shape is not None:
        dict_size = shape[0]
        if len(shape) <= 1:
            del kwargs["shape"]
        else:
            kwargs["shape"] = shape[1:]
    else:
        dict_size = (16, 1, 32, 32)
    D_1_tm1 = theano.shared(rng.normal(size=dict_size).astype(theano.config.floatX))
    Dx_1_tm1 = sparse.dot(x_1_tm1, D_1_tm1)  # array access = dot operation
    super(SequenceCNN, self).__init__(rng=rng, inputsymbol=Dx_1_tm1, **kwargs)  # attaches new elements into the fgraph
    self.L2_output_1_tm1 = self.L2_output

    # elements of current time
    D_t = theano.shared(rng.normal(size=dict_size).astype(theano.config.floatX))
    Dx_t = sparse.dot(x_t, D_t)  # array access = dot operation
    self.L2_output_t = theano.clone(self.L2_output_1_tm1, replace={Dx_1_tm1: Dx_t})

    # element preparation for model building
    self.P_input = (x_1_tm1, x_t)
    self.params += [D_1_tm1, D_t]
    self.L2_output = self.L2_output_1_tm1 * self.L2_output_t
def u(self, value):
    value = np.atleast_1d(value)
    assert (
        len(value.shape) == 1
    ), "Wavelength-dependent limb darkening not yet supported."
    self._u = value
    # Did the degree of limb darkening change?
    if len(self._u) != self._udeg:
        self._udeg = len(self._u)
        # Force the re-instantiation of the internal map
        self.ydeg = self._ydeg
    if self._udeg > 0:
        # Set the coeffs
        self._map[1:] = self._u
        # Compute the limb darkening operator
        F = self._map.ops.F(
            tt.as_tensor_variable(np.append([-1.0], self._u)),
            tt.as_tensor_variable([np.pi]),
        )
        self._L = ts.dot(ts.dot(self._map.ops.A1Inv, F), self._map.ops.A1).eval()
        self._L = csr_matrix(self._L)
def create_TrainFunc_tranPES(simfn, embeddings, marge=0.5, alpha=1., beta=1.):

    # parse the embedding data
    embedding = embeddings[0]  # D x N matrix
    lembedding = embeddings[1]

    # declare the symbolic variables for training triples
    hp = S.csr_matrix('head positive')  # N x batchsize matrix
    rp = S.csr_matrix('relation')
    tp = S.csr_matrix('tail positive')
    hn = S.csr_matrix('head negative')
    tn = S.csr_matrix('tail negative')

    lemb = T.scalar('embedding learning rate')
    lremb = T.scalar('relation learning rate')

    subtensorE = T.ivector('batch entities set')
    subtensorR = T.ivector('batch link set')

    # Generate the training positive and negative triples
    hpmat = S.dot(embedding.E, hp).T  # batchsize x D dense matrix
    rpmat = S.dot(lembedding.E, rp).T
    tpmat = S.dot(embedding.E, tp).T

    hnmat = S.dot(embedding.E, hn).T
    tnmat = S.dot(embedding.E, tn).T

    # calculate the score
    pos = tranPES3(simfn,
                   T.concatenate([hpmat, tpmat], axis=1).reshape((hpmat.shape[0], 2, hpmat.shape[1])).dimshuffle(0, 2, 1),
                   hpmat, rpmat, tpmat)
    negh = tranPES3(simfn,
                    T.concatenate([hnmat, tpmat], axis=1).reshape((hnmat.shape[0], 2, hnmat.shape[1])).dimshuffle(0, 2, 1),
                    hnmat, rpmat, tpmat)
    negt = tranPES3(simfn,
                    T.concatenate([hpmat, tnmat], axis=1).reshape((hpmat.shape[0], 2, hpmat.shape[1])).dimshuffle(0, 2, 1),
                    hpmat, rpmat, tnmat)

    costh, outh = margeCost(pos, negh, marge)
    costt, outt = margeCost(pos, negt, marge)

    embreg = regEmb(embedding, subtensorE, alpha)
    lembreg = regLink(lembedding, subtensorR, beta)

    cost = costh + costt + embreg[0] + lembreg
    out = T.concatenate([outh, outt])
    outc = embreg[1]

    # list of inputs to the function
    list_in = [lemb, lremb, hp, rp, tp, hn, tn, subtensorE, subtensorR]

    # updating the embeddings using gradient descent
    emb_grad = T.grad(cost, embedding.E)
    New_embedding = embedding.E - lemb * emb_grad

    remb_grad = T.grad(cost, lembedding.E)
    New_rembedding = lembedding.E - lremb * remb_grad

    updates = OrderedDict({embedding.E: New_embedding, lembedding.E: New_rembedding})

    return theano.function(list_in, [cost, T.mean(out), T.mean(outc), embreg[0], lembreg],
                           updates=updates, on_unused_input='ignore')
def fprop(self, state_below, add_noise=True):
    self.input_space.validate(state_below)

    if self.requires_reformat:
        if not isinstance(state_below, tuple):
            for sb in get_debug_values(state_below):
                if sb.shape[0] != self.dbm.batch_size:
                    raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0]))
                assert reduce(lambda x, y: x * y, sb.shape[1:]) == self.input_dim

        state_below = self.input_space.format_as(state_below, self.desired_space)

    self.x = state_below

    # linear part
    if isinstance(self.x, S.SparseVariable):
        z = S.dot(self.x, self.W[0]) + self.b[0]
    else:
        z = T.dot(self.x, self.W[0]) + self.b[0]
    self.z = self.activate(z, self.expert_activation)

    # first layer non-linear part
    if isinstance(self.x, S.SparseVariable):
        h = S.dot(self.x, self.W[1]) + self.b[1]
    else:
        h = T.dot(self.x, self.W[1]) + self.b[1]

    # activate hidden units of non-linear part
    self.h = self.activate(h, self.hidden_activation)

    noise = 0.
    if add_noise:
        rng = MRG_RandomStreams(self.mlp.rng.randint(2 ** 15))
        noise = rng.normal(size=self.z.shape, std=self.noise_stdev, dtype=self.z.type.dtype)

    # second layer non-linear part
    self.a = T.dot(self.h, self.W[2]) + self.b[2] + noise

    # activate non-linear part
    self.m_mean = self.activate(self.a, self.gater_activation)

    # how many are over 0:
    self.effective_sparsity = T.cast(T.gt(self.m_mean, 0), theano.config.floatX).mean()

    # mix output of linear part with output of non-linear part
    self.p = self.m_mean * self.z

    if self.layer_name is not None:
        self.z.name = self.layer_name + '_z'
        self.h.name = self.layer_name + '_h'
        self.a.name = self.layer_name + '_a'
        self.m_mean.name = self.layer_name + '_m_mean'
        self.p.name = self.layer_name + '_p'

    return self.p
def get_output_for(self, input, **kwargs):
    if not isinstance(input, (S.SparseVariable, S.SparseConstant, S.sharedvar.SparseTensorSharedVariable)):
        raise ValueError("Input for this layer must be sparse")

    activation = S.dot(input, self.W)
    # do the convolution
    activation = S.dot(self.H, activation)
    if self.b is not None:
        activation = activation + self.b.dimshuffle('x', 0)
    return self.nonlinearity(activation)
def ForwardFn(fnsim, embeddings, leftop, rightop, marge=1.0):
    """
    This function returns a theano function to perform a forward step,
    contrasting couples of positive and negative triplets. members are given
    as sparse matrices. For one positive triplet there is one negative
    triplet.

    :param fnsim: similarity function (on theano variables).
    :param embeddings: an embeddings instance.
    :param leftop: class for the 'left' operator.
    :param rightop: class for the 'right' operator.
    :param marge: marge for the cost function.

    :note: this is useful for W_SABIE [Weston et al., IJCAI 2011]
    """
    embedding, relationl, relationr = parse_embeddings(embeddings)

    # inputs
    inpr = S.csr_matrix()
    inpl = S.csr_matrix()
    inpo = S.csr_matrix()
    inpln = S.csr_matrix()
    inprn = S.csr_matrix()
    inpon = S.csr_matrix()

    # graph
    lhs = S.dot(embedding.E, inpl).T
    rhs = S.dot(embedding.E, inpr).T
    rell = S.dot(relationl.E, inpo).T
    relr = S.dot(relationr.E, inpo).T
    lhsn = S.dot(embedding.E, inpln).T
    rhsn = S.dot(embedding.E, inprn).T
    relln = S.dot(relationl.E, inpon).T
    relrn = S.dot(relationr.E, inpon).T
    simi = fnsim(leftop(lhs, rell), rightop(rhs, relr))
    simin = fnsim(leftop(lhsn, relln), rightop(rhsn, relrn))
    cost, out = margincost(simi, simin, marge)
    """
    Theano function inputs.
    :input inpl: sparse csr matrix representing the indexes of the positive
                 triplet 'left' member, shape=(#examples, N [Embeddings]).
    :input inpr: sparse csr matrix representing the indexes of the positive
                 triplet 'right' member, shape=(#examples, N [Embeddings]).
    :input inpo: sparse csr matrix representing the indexes of the positive
                 triplet relation member, shape=(#examples, N [Embeddings]).
    :input inpln: sparse csr matrix representing the indexes of the negative
                  triplet 'left' member, shape=(#examples, N [Embeddings]).
    :input inprn: sparse csr matrix representing the indexes of the negative
                  triplet 'right' member, shape=(#examples, N [Embeddings]).
    :input inpon: sparse csr matrix representing the indexes of the negative
                  triplet relation member, shape=(#examples, N [Embeddings]).

    Theano function output.
    :output out: binary vector representing when the margin is violated, i.e.
                 when an update occurs.
    """
    return theano.function([inpl, inpr, inpo, inpln, inprn, inpon], [out],
                           on_unused_input='ignore')
def SimFn(fnsim, embeddings, leftop, rightop):
    """
    This function returns a Theano function to measure the similarity score
    for sparse matrices inputs.

    :param fnsim: similarity function (on Theano variables).
    :param embeddings: an Embeddings instance.
    :param leftop: class for the 'left' operator.
    :param rightop: class for the 'right' operator.
    """
    embedding, relationl, relationr = parse_embeddings(embeddings)

    # Inputs
    inpr = S.csr_matrix('inpr')
    inpl = S.csr_matrix('inpl')
    inpo = S.csr_matrix('inpo')

    # Graph
    # What is T? Are lhs, rhs, rell, relr tensors?
    # We just created inpl, inpr and inpo; what does it mean to take their dot product?
    lhs = S.dot(embedding.E, inpl).T
    rhs = S.dot(embedding.E, inpr).T
    rell = S.dot(relationl.E, inpo).T
    relr = S.dot(relationr.E, inpo).T

    # what is this?
    # ref:
    # leftop = LayerMat('lin', state.ndim, state.nhid)
    # rightop = LayerMat('lin', state.ndim, state.nhid)
    # on call:
    # ry = y.reshape((y.shape[0], self.n_inp, self.n_out))
    # rx = x.reshape((x.shape[0], x.shape[1], 1))
    # return self.act((rx * ry).sum(1))
    simi = fnsim(leftop(lhs, rell), rightop(rhs, relr))
    """
    Theano function inputs.
    :input inpl: sparse csr matrix (representing the indexes of the 'left'
                 entities), shape=(#examples, N [Embeddings]).
    :input inpr: sparse csr matrix (representing the indexes of the 'right'
                 entities), shape=(#examples, N [Embeddings]).
    :input inpo: sparse csr matrix (representing the indexes of the relation
                 member), shape=(#examples, N [Embeddings]).

    Theano function output
    :output simi: matrix of score values.
    """
    return theano.function([inpl, inpr, inpo], [simi], on_unused_input='ignore')
def log_first_stage_indep_normal_priors(theta_tilde,
                                        num_variances,
                                        num_betas,
                                        filtered_rows_to_alts):
    """
    Calculates the log of the first stage joint density of error terms,
    conditional on the alternative specific variances. Note that the error
    terms are assumed to be INDEPENDENTLY normally distributed with mean
    zero, conditional on the alternative specific variances. The returned
    value is correct up to an additive constant (which is comprised of
    arbitrary constants as well as the log-marginal evidence).

    Parameters
    ----------
    theta_tilde : 1D ndarray of floats.
        Contains, in order, the natural log of the alternative specific
        variances, the error terms, and the index coefficients (betas).
    num_variances : int.
        Number of alternative specific variances at the start of theta_tilde.
    num_betas : int.
        Number of index coefficients at the end of theta_tilde.
    filtered_rows_to_alts : 2D sparse array of zeros and ones.
        Each element (i, j) should denote whether row i corresponds to
        alternative j or not, using ones and zeros respectively.

    Returns
    -------
    log_first_stage : scalar.
        The log of the joint density of the error terms, up to an additive
        constant that contains the log of the normalization constant and the
        log of the other arbitrary constants from the multivariate normal
        distribution.

    References
    ----------
    Gelman, Andrew, et al. (2014). Bayesian Data Analysis, 3rd Ed. Taylor &
    Francis Group. pp. 576-578.
    """
    # Get the position in theta_tilde at which the betas start
    beta_neg_idx = -1 * num_betas

    # Split theta_tilde into its various components
    alt_variances = tt.exp(theta_tilde[:num_variances])
    error_terms = theta_tilde[num_variances:beta_neg_idx]

    # If the error terms are conditionally independent given the
    # alternative specific variances, then the covariance matrix
    # for the joint distribution of errors is diagonal and the inverse
    # of a diagonal matrix is a diagonal matrix with the inverses on
    # the diagonal. The inverses are calculated below.
    inverse_variances = 1.0 / alt_variances

    # Map the inverse variances to their corresponding rows of error terms
    long_inverse_variances =\
        sparse.dot(filtered_rows_to_alts, inverse_variances)

    squared_errors = error_terms**2

    # Below, we implement -0.5 * (theta - mu)^T Sigma^{-1} (theta - mu) for
    # the specific case of a diagonal Sigma, mu = 0, and theta = error_terms
    log_first_stage =\
        -0.5 * tt.sum(tt.mul(long_inverse_variances, squared_errors))

    return log_first_stage
def get_train_function(self):
    # specify the computational graph
    target = T.matrix('target')
    weight = theano.shared(np.random.randn(len(self.feature_map), len(self.label_map)), name='weight')
    feat_mat = sparse.csr_matrix(name='feat_mat')
    mask_mat = sparse.csr_matrix(name='mask_mat')
    sum_pred = sparse.dot(mask_mat, T.nnet.softmax(sparse.dot(feat_mat, weight)))
    pred = sum_pred / sum_pred.sum(axis=1).reshape((sum_pred.shape[0], 1))
    objective = T.nnet.categorical_crossentropy(pred, target).sum() + self.param.l2_regularization * (weight ** 2).sum()
    grad_weight = T.grad(objective, weight)

    # print 'Compiling function ...'
    # compile the function
    train = theano.function(inputs=[feat_mat, mask_mat, target],
                            outputs=[objective, weight],
                            updates=[(weight, weight - 0.1 * grad_weight)])
    return train
def optimize_func(transform_matrix):
    t0 = time.time()
    M = S.csr_matrix(dtype=theano.config.floatX)
    N = S.csr_matrix(dtype=theano.config.floatX)
    ON = S.csr_matrix(dtype=theano.config.floatX)
    lr = T.scalar('learning rate', dtype=theano.config.floatX)
    # print M, N, ON, lr
    TN = S.dot(transform_matrix, N)
    D = T.sqr(M - TN)
    # PD = S.sqr(N - ON)
    # PD = T.sqrt(S.sp_sum(PD, 1))
    # TPD = T.sqr(TN - ON)
    # TPD = T.sqrt(TPD.sum(1))
    # D2 = T.sqr(PD - TPD)
    cost = T.sum(D)  # + T.sum(D2)
    list_in = [lr, M, N, ON]
    gradient = T.grad(cost, transform_matrix)
    new_transform_matrix = transform_matrix - lr * gradient
    t1 = time.time()
    print 'opt func cost is ' + str(t1 - t0)
    return theano.function(list_in, cost,
                           updates=[(transform_matrix, new_transform_matrix)],
                           on_unused_input='ignore')
def matmul(self, a, b, transpose_a=False, transpose_b=False,
           a_is_sparse=False, b_is_sparse=False, name=None):
    if transpose_a:
        a = a.T
    if transpose_b:
        b = b.T
    if a_is_sparse or b_is_sparse:
        return sparse.dot(a, b)
    return T.dot(a, b)
def get_output_for(self, input, **kwargs):
    target_indices = kwargs.get('target_indices')
    activation = T.dot(input, self.W)
    # do the convolution
    activation = S.dot(self.H, activation)
    if self.b is not None:
        activation = activation + self.b.dimshuffle('x', 0)
    activation = activation[target_indices, :]
    return self.nonlinearity(activation)
def _get_diagonal_term(self, X_left, X_right, diag_init):
    diag = tn.shared(value=diag_init, name='diag')

    if _tn_is_sparse(X_left) or _tn_is_sparse(X_right):
        XlXr = tsp.mul(X_left, X_right)
        y_pred = tsp.dot(XlXr, diag)
    else:
        XlXr = T.mul(X_left, X_right)
        y_pred = T.dot(XlXr, diag)

    return y_pred, [diag]
def labelFunct(self, batchSize, xFeats):
    # xFeats [l, h]
    # l = batchSize
    # self.W = theano.printing.Print("W ") (self.W)
    # self.Wb = theano.printing.Print("Wb ") (self.Wb)
    scores = sparse.dot(xFeats, self.W) + self.Wb  # [l, h] x [h, r] => [l, r]
    relationProbs = T.nnet.softmax(scores)
    # scores = theano.printing.Print("scores ") (scores)
    labels = T.argmax(scores, axis=1)  # [l, r] => [l]
    # labels = theano.printing.Print("labels ") (labels)
    return (labels, relationProbs)
def SimFn(fnsim, embeddings, leftop, rightop, op=''):
    """
    This function returns a Theano function to measure the similarity score
    for sparse matrices inputs.

    :param fnsim: similarity function (on Theano variables).
    :param embeddings: an Embeddings instance.
    :param leftop: class for the 'left' operator.
    :param rightop: class for the 'right' operator.
    """
    embedding, relationl, relationr = parse_embeddings(embeddings)

    # Inputs
    inpr, inpl, inpo = S.csr_matrix('inpr'), S.csr_matrix('inpl'), S.csr_matrix('inpo')

    # Graph
    lhs = S.dot(embedding.E, inpl).T
    rhs = S.dot(embedding.E, inpr).T
    rell = S.dot(relationl.E, inpo).T
    relr = S.dot(relationr.E, inpo).T

    lop, rop = leftop(lhs, rell), rightop(rhs, relr)
    simi = fnsim(lop, rop)
    """
    Theano function inputs.
    :input inpl: sparse csr matrix (representing the indexes of the 'left'
                 entities), shape=(#examples, N [Embeddings]).
    :input inpr: sparse csr matrix (representing the indexes of the 'right'
                 entities), shape=(#examples, N [Embeddings]).
    :input inpo: sparse csr matrix (representing the indexes of the relation
                 member), shape=(#examples, N [Embeddings]).

    Theano function output
    :output simi: matrix of score values.
    """
    return theano.function([inpl, inpr, inpo], [simi], on_unused_input='ignore')
def get_train_function(self):
    # specify the computational graph
    weight = theano.shared(np.random.randn(len(self.feature_map), len(self.label_map)), name='weight')
    # weight = theano.shared(np.zeros((len(self.feature_map), len(self.label_map))), name='weight')
    feat_mat = sparse.csr_matrix(name='feat_mat')

    f_target = T.matrix('f_target')
    f_mask_mat = sparse.csr_matrix(name='f_mask_mat')
    f_sum_pred = sparse.dot(f_mask_mat, T.nnet.softmax(sparse.dot(feat_mat, weight)))
    f_pred = f_sum_pred / f_sum_pred.sum(axis=1).reshape((f_sum_pred.shape[0], 1))

    i_target = T.matrix('i_target')
    i_mask_mat = sparse.csr_matrix(name='l_mask_mat')
    i_pred = sparse.dot(i_mask_mat, T.nnet.softmax(sparse.dot(feat_mat, weight)))

    objective = self.param.feature_lambda * T.nnet.categorical_crossentropy(f_pred, f_target).sum() + T.nnet.categorical_crossentropy(i_pred, i_target).sum() + self.param.l2_lambda * (weight ** 2).sum() / 2
    grad_weight = T.grad(objective, weight)

    # print 'Compiling function ...'
    # compile the function
    train = theano.function(inputs=[feat_mat, f_mask_mat, f_target, i_mask_mat, i_target],
                            outputs=[objective, weight],
                            updates=[(weight, weight - 0.1 * grad_weight)])
    return train
def __init__(self, rng, P_input, L2_input=None, **kwargs):
    # 1. symbol declaration, initialization and definition
    I = sparse.csr_matrix("I") if P_input is None else P_input
    shape = kwargs.get("shape") or [(16, 1, 32, 32), (4, 16, 16, 2, 2), (4, 4, 4, 2, 2)]
    dict_size, kwargs["shape"] = shape[0], shape[1:]
    D = theano.shared(
        rng.uniform(low=-1, high=1, size=dict_size).astype(theano.config.floatX)
    )
    DI = sparse.dot(I, D)  # array access = dot operation

    # 2. attaches I and D into the fgraph
    super(SparseCNN, self).__init__(rng=rng, P_input=DI, **kwargs)
    self.params += [D, ]
    self.P_input = I  # take I as input for the sparseCNN
def SimFn(fnsim, embeddings, leftop, rightop, op=''):
    """
    This function returns a Theano function to measure the similarity score
    for sparse matrices inputs.

    :param fnsim: similarity function (on Theano variables).
    :param embeddings: an Embeddings instance.
    :param leftop: class for the 'left' operator.
    :param rightop: class for the 'right' operator.
    """
    embedding, relationl, relationr = parse_embeddings(embeddings)

    # Inputs
    inpr, inpl, inpo = S.csr_matrix('inpr'), S.csr_matrix('inpl'), S.csr_matrix('inpo')

    # Graph
    lhs = S.dot(embedding.E, inpl).T
    rhs = S.dot(embedding.E, inpr).T
    rell = S.dot(relationl.E, inpo).T
    relr = S.dot(relationr.E, inpo).T

    lop, rop = leftop(lhs, rell), rightop(rhs, relr)
    simi = fnsim(lop, rop)
    """
    Theano function inputs.
    :input inpl: sparse csr matrix (representing the indexes of the 'left'
                 entities), shape=(#examples, N [Embeddings]).
    :input inpr: sparse csr matrix (representing the indexes of the 'right'
                 entities), shape=(#examples, N [Embeddings]).
    :input inpo: sparse csr matrix (representing the indexes of the relation
                 member), shape=(#examples, N [Embeddings]).

    Theano function output
    :output simi: matrix of score values.
    """
    return theano.function([inpl, inpr, inpo], [simi], on_unused_input='ignore')
def _generate_train_model_batch_function(self):

    # s = T.matrix('s', dtype=self.floatX)
    s = S.csr_matrix('s', dtype=self.floatX)
    # u = T.vector('u', dtype=self.intX)
    i = T.vector('i', dtype=self.intX)
    y = T.vector('y', dtype=self.intX)
    # items = T.vector('items', dtype=self.intX)

    Sit = self.S
    sit = s.T
    # Uu = self.U[u]
    Iy = self.I[y]

    BSy = self.BS[y]
    # BUy = self.BU[y]
    BIy = self.BI[y]

    I1i = self.I1[i]
    I2y = self.I2[y]

    # predU = T.dot(Iy, Uu.T).T + BUy.flatten()

    se = S.dot(Sit.T, sit)
    # se = T.dot(Sit.T, sit)
    predS = T.dot(Iy, se).T + BSy.flatten()

    predI = T.dot(I1i, I2y.T) + BIy.flatten()

    pred = predS + predI  # + predU
    pred = getattr(self, self.activation)(pred)
    cost = getattr(self, self.objective)(pred, y)

    param_list = [self.S]
    fullparam_list = [self.I, self.I1, self.I2, self.BI, self.BS]  # + [self.U]
    subparam_list = [Iy, I1i, I2y, BIy, BSy]  # + [Uu]
    subparam_idx = [y, i, y, y, y]  # + [u]

    updates = self.descent(cost, param_list, fullparam_list, subparam_list,
                           subparam_idx, self.learning_rate, momentum=self.momentum)
    # updates = getattr(self, self.learn)(cost, [self.U,self.S,self.I,self.IC,self.BI,self.BS], self.learning_rate)
    # updates = getattr(self, self.learn)(cost, , ,, self.learning_rate, momentum=self.momentum)

    # self.train_model_batch = theano.function(inputs=[s, i, u, y, items], outputs=cost, updates=updates)
    inp = [s, i, y]  # + [u]
    self.train_model_batch = theano.function(inputs=inp, outputs=cost, updates=updates)
def compRelationProbsFunc(self, xFeats):
    # xFeats [l, h] matrix
    # xFeats = theano.printing.Print("xFeats")(xFeats)
    # self.Wb = theano.printing.Print("Wb ") (self.Wb)
    # self.W = theano.printing.Print("W ") (self.W)

    # scores of each role by a classifier
    relationScores = sparse.dot(xFeats, self.W) + self.Wb  # [l, h] x [h, r] => [l, r]
    # relationScores = theano.printing.Print("relationScores=")(relationScores)

    # convert it to probabilities
    relationProbs = T.nnet.softmax(relationScores)
    # relationProbs = theano.printing.Print("relationProbs = ")(relationProbs)

    return relationProbs  # [l, r]
def compRelationProbsFunc(self, xFeats):
    # xFeats [l, h] matrix
    # xFeats = theano.printing.Print("xFeats")(xFeats)
    # self.Wb = theano.printing.Print("Wb ") (self.Wb)
    # self.W = theano.printing.Print("W ") (self.W)

    # scores of each role by a classifier
    relationScores = sparse.dot(xFeats, self.W) + self.Wb  # [l, h] x [h, r] => [l, r]
    # relationScores = theano.printing.Print("relationScores=")(relationScores)

    # convert it to probabilities
    relationProbs = T.nnet.softmax(relationScores)
    # relationProbs = theano.printing.Print("relationProbs = ")(relationProbs)

    return relationProbs  # [l, r]
def get_output_for(self, input, **kwargs):
    if input.ndim > 2:
        # if the input has more than two dimensions, flatten it into a
        # batch of feature vectors.
        input = input.flatten(2)

    # According to pull-request 595 from eduardo4jesus: even if the input is
    # sparse, the activation will remain dense, since the weights are a dense
    # matrix (kinda makes sense).
    if (type(input) == S.SparseVariable) or (type(input) == S.SparseConstant):
        activation = S.dot(input, self.W)
    else:
        activation = T.dot(input, self.W)

    if self.b is not None:
        activation = activation + self.b.dimshuffle('x', 0)
    return self.nonlinearity(activation)
def __init__(self, rng, x, topic_num=100):
    # input
    L2_input = sparse.csr_matrix("x", dtype=theano.config.floatX)

    # params
    vocab_size = x.shape[1]
    mu, sigma = x.data.mean(), x.data.var()**0.5
    rng = numpy.random.RandomState(numpy.random.randint(2**32-1)) if rng is None else rng
    self.L2_w = theano.shared(
        numpy.asarray(
            rng.normal(loc=mu, scale=sigma, size=(vocab_size, topic_num)),
            dtype=theano.config.floatX
        ),
        borrow=True
    )
    self.L2_b = theano.shared(numpy.zeros(topic_num, dtype=theano.config.floatX), borrow=True)
    self.params = [self.L2_w, self.L2_b]

    # stick-breaking: sticks -> orthogonal sticks
    L2_stick = sparse.dot(L2_input, self.L2_w) + self.L2_b - \
        0.5*(L2_input.size/vocab_size*tensor.sum(self.L2_w**2, 0) + self.L2_b**2)
    zero_space = tensor.zeros((L2_input.shape[0], 1), dtype=theano.config.floatX)
    L2_orth_stick = tensor.join(1, L2_stick, zero_space) \
        - tensor.join(1, zero_space, tensor.cumsum(L2_stick, 1))
    Pasterik_orth_stick = tensor.log(1 + tensor.exp(L2_orth_stick))

    # training model definition
    Likelihood = tensor.mean(Pasterik_orth_stick)
    grads = theano.grad(Likelihood, self.params)  # gradient w.r.t. params
    eta = tensor.scalar("eta")
    updates = [(param, param + eta*grad) for param, grad in zip(self.params, grads)]
    self._fit = theano.function(
        inputs=[L2_input, eta],
        outputs=Likelihood,
        updates=updates
    )

    # predict model definition
    self._predict = theano.function(
        inputs=[L2_input],
        outputs=tensor.argmax(L2_stick, axis=-1)
    )
    self._codec = theano.function(
        inputs=[L2_input],
        outputs=L2_stick > 0
    )
def __init__(self, x_in, n_in, n_out, activation=None, rng=None, seed=0):
    """
    Initialize the layer.

    Inputs:
        - x_in: a symbolic theano variable describing the input
        - n_in: dimensions the input will have
        - n_out: dimensions the output should have
        - activation: non-linear activation function applied to the output (if any)
        - seed: used to initialize the random number generator
    """
    if rng is None:
        rng = np.random.RandomState(seed)

    # initialize the weights - optimal values depend on the activation function
    if activation is None:
        W_values = rng.randn(n_in, n_out) * 0.01
    else:
        W_values = np.asarray(rng.uniform(
            low=-np.sqrt(6. / (n_in + n_out)),
            high=np.sqrt(6. / (n_in + n_out)),
            size=(n_in, n_out)), dtype=theano.config.floatX)
        if activation == T.nnet.sigmoid:
            W_values *= 4
    self.W = theano.shared(value=W_values, name='W', borrow=True)

    # initialize the biases b as a vector of n_out 0s
    self.b = theano.shared(value=np.zeros((n_out,), dtype=theano.config.floatX),
                           name='b', borrow=True)

    # compute the output
    if isinstance(x_in.type, sparse.type.SparseType):
        lin_output = sparse.dot(x_in, self.W) + self.b
    else:
        lin_output = T.dot(x_in, self.W) + self.b

    # apply the activation function (if any)
    self.output = (lin_output if not activation else activation(lin_output))

    # parameters of the model
    self.params = [self.W, self.b]
def __init__(self, rng, x, topic_num=100):
    # input
    L2_input = sparse.csr_matrix("x", dtype=theano.config.floatX)

    # params
    vocab_size = x.shape[1]
    mu, sigma = x.data.mean(), 2.56*x.data.var()**0.5
    rng = numpy.random.RandomState(numpy.random.randint(2**32-1)) if rng is None else rng
    self.L2_w = theano.shared(
        numpy.asarray(
            mu + (mu if mu < sigma else sigma)*rng.uniform(low=-1, high=1, size=(vocab_size, topic_num)),
            dtype=theano.config.floatX
        ),
        borrow=True
    )
    self.L2_b = theano.shared(numpy.zeros(topic_num, dtype=theano.config.floatX), borrow=True)
    self.params = [self.L2_w, self.L2_b]

    # output
    L2_topic = sparse.dot(L2_input, self.L2_w) + self.L2_b

    # difference-based objective function
    Pasterik_topic = tensor.log(tensor.sum(tensor.exp(L2_topic - L2_topic.max(-1, keepdims=True)), -1))  # avoiding overflow
    d_xw_w2 = tensor.mean(Pasterik_topic) - \
        0.5*(L2_input.size*tensor.mean(self.L2_w*self.L2_w) + tensor.dot(self.L2_b, self.L2_b))
    grads = theano.grad(d_xw_w2, self.params)  # gradient w.r.t. params
    eta = tensor.scalar("eta")
    updates = [(param, param + eta*grad) for param, grad in zip(self.params, grads)]

    # training model definition
    self._fit = theano.function(
        inputs=[L2_input, eta],
        outputs=d_xw_w2,
        updates=updates
    )

    # predict model definition
    self._predict = theano.function(
        inputs=[L2_input],
        outputs=tensor.argmax(L2_topic, axis=-1)
    )
def y(self, y):
    self._y = np.array(y)
    self.ydeg = int(np.sqrt(len(y)) - 1)
    self.map_ref = starry.Map(ydeg=self.ydeg, reflected=True)
    self.map_emi = starry.Map(ydeg=self.ydeg)
    if self.ydeg > 0:
        self.map_ref[1:, :] = y[1:]
        self.map_emi[1:, :] = y[1:]
    self._A1y = ts.dot(self.map_ref.ops.A1, tt.as_tensor_variable(y))

    # Reset
    self.intensity(0.0, 0.0, [0.0], [0.0], [0.0], [0.0], [0.0], 0.0, reset=True)
    self.flux([0.0], [0.0], [0.0], [0.0], [0.0], 0.0, reset=True)
    self.dfluxdro([0.0], [0.0], [0.0], [0.0], [0.0], 0.0, reset=True)
    if self.ydeg > 0:
        self.flux_emitted([0.0], [0.0], [0.0], [0.0], [0.0], 0.0, reset=True)
        self.dfluxdro_emitted([0.0], [0.0], [0.0], [0.0], [0.0], 0.0, reset=True)
def theano_safe_sparse_dot(X, Y):
    if _tn_is_sparse(X) or _tn_is_sparse(Y):
        return tsp.dot(X, Y)
    else:
        return T.dot(X, Y)
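# Hedged usage sketch for theano_safe_sparse_dot above. Everything here is
# illustrative: the tiny _tn_is_sparse helper is an assumption standing in for
# the project's own (not shown in the snippet), and the shapes are arbitrary.
import numpy as np
import scipy.sparse as sp
import theano
import theano.tensor as T
import theano.sparse as tsp

def _tn_is_sparse(X):
    # minimal stand-in: treat anything typed as a Theano SparseType as sparse
    return isinstance(X.type, tsp.SparseType)

X_sym = tsp.csr_matrix('X', dtype=theano.config.floatX)   # sparse design matrix
W_sym = T.matrix('W', dtype=theano.config.floatX)          # dense weights
f = theano.function([X_sym, W_sym], theano_safe_sparse_dot(X_sym, W_sym))

X_val = sp.random(4, 3, density=0.5, format='csr', dtype=theano.config.floatX)
W_val = np.random.randn(3, 2).astype(theano.config.floatX)
print(f(X_val, W_val).shape)   # (4, 2), returned as a dense ndarray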
def __init__(self, nodenet):
    if nodenet.sparse:
        self.propagate = theano.function([], [nodenet.w, nodenet.a],
                                         updates={nodenet.a: ST.dot(nodenet.w, nodenet.a)})
    else:
        self.propagate = theano.function([], [nodenet.w, nodenet.a],
                                         updates={nodenet.a: T.dot(nodenet.w, nodenet.a)})
def TrainFn1Member(fnsim, embeddings, leftop, rightop, rel=True, loss=loss.hinge, loss_margin=1.0, op=None,
                   method='SGD', decay=0.999, epsilon=1e-6, max_learning_rate=None,
                   weight_L1_param_regularizer=None, weight_L2_param_regularizer=None,
                   weight_contractive_regularizer_left=None, weight_contractive_regularizer_right=None):
    embedding, relationl, relationr = parse_embeddings(embeddings)

    # Inputs
    inpr, inpl, inpo = S.csr_matrix('inpr'), S.csr_matrix('inpl'), S.csr_matrix('inpo')
    inpln, inprn = S.csr_matrix('inpln'), S.csr_matrix('inprn')

    # Learning rates for parameters and embeddings
    rate_params = T.scalar('rate_params')
    rate_embeddings = T.scalar('rate_embeddings')

    # Graph
    lhs = S.dot(embedding.E, inpl).T
    rhs = S.dot(embedding.E, inpr).T
    rell = S.dot(relationl.E, inpo).T
    relr = S.dot(relationr.E, inpo).T
    lhsn = S.dot(embedding.E, inpln).T
    rhsn = S.dot(embedding.E, inprn).T

    # Keep references to the left/right operator outputs; they are reused by
    # the contractive regularizers below.
    lop, rop = leftop(lhs, rell), rightop(rhs, relr)
    simi = fnsim(lop, rop)

    # Negative 'left' member
    similn = fnsim(leftop(lhsn, rell), rightop(rhs, relr))
    # Negative 'right' member
    simirn = fnsim(leftop(lhs, rell), rightop(rhsn, relr))

    costl, outl = loss(simi, similn, margin=loss_margin)
    costr, outr = loss(simi, simirn, margin=loss_margin)

    cost, out = costl + costr, T.concatenate([outl, outr])

    # List of inputs of the function
    list_in = [rate_embeddings, rate_params, inpl, inpr, inpo, inpln, inprn]

    if rel:
        # If rel is True, we also consider a negative relation member
        inpon = S.csr_matrix()
        relln = S.dot(relationl.E, inpon).T
        relrn = S.dot(relationr.E, inpon).T
        simion = fnsim(leftop(lhs, relln), rightop(rhs, relrn))
        costo, outo = loss(simi, simion, margin=loss_margin)
        cost += costo
        out = T.concatenate([out, outo])
        list_in += [inpon]

    # <EXPERIMENTAL_CODE>
    # Should I also plug examples from corrupted triples ?
    if weight_contractive_regularizer_left is not None:
        cost = cost + (weight_contractive_regularizer_left * R.contractive_regularizer(lop, lhs))

    if weight_contractive_regularizer_right is not None:
        cost = cost + (weight_contractive_regularizer_right * R.contractive_regularizer(rop, rhs))

    for rel_param in set([relationl.E, relationr.E]):
        if weight_L1_param_regularizer is not None:
            cost = cost + (weight_L1_param_regularizer * R.L1_regularizer(rel_param))
        if weight_L2_param_regularizer is not None:
            cost = cost + (weight_L2_param_regularizer * R.L2_regularizer(rel_param))
    # </EXPERIMENTAL_CODE>

    params = leftop.params + rightop.params + (fnsim.params if hasattr(fnsim, 'params') else [])
    embeds = [embedding.E] + ([relationr.E, relationl.E] if (type(embeddings) == list) else [])

    # The function updates the implicit function arguments according to the updates.
    updates = collections.OrderedDict()

    if (method == 'SGD'):
        pass  # do nothing

    elif (method == 'MOMENTUM'):
        param_previous_update_map = collections.OrderedDict()
        for param in params + embeds:
            # Allocate the previous updates
            previous_update_value = numpy.zeros(param.get_value().shape, dtype=theano.config.floatX)
            param_previous_update = theano.shared(value=previous_update_value, name='su_' + param.name)
            param_previous_update_map[param] = param_previous_update

    elif (method == 'ADAGRAD'):
        param_squared_gradients_map = collections.OrderedDict()
        for param in params + embeds:
            # Allocate the sums of squared gradients
            squared_gradients_value = numpy.zeros(param.get_value().shape, dtype=theano.config.floatX)
            param_squared_gradients = theano.shared(value=squared_gradients_value, name='sg_' + param.name)
            param_squared_gradients_map[param] = param_squared_gradients

    elif (method == 'ADADELTA'):
        param_squared_gradients_map = collections.OrderedDict()
        param_squared_updates_map = collections.OrderedDict()
        for param in params + embeds:
            # Allocate the sums of squared gradients
            squared_gradients_value = numpy.zeros(param.get_value().shape, dtype=theano.config.floatX)
            param_squared_gradients = theano.shared(value=squared_gradients_value, name='sg_' + param.name)
            param_squared_gradients_map[param] = param_squared_gradients
            # Allocate the sums of squared updates
            squared_updates_value = numpy.zeros(param.get_value().shape, dtype=theano.config.floatX)
            param_squared_updates = theano.shared(value=squared_updates_value, name='su_' + param.name)
            param_squared_updates_map[param] = param_squared_updates

    elif (method == 'RMSPROP'):
        param_squared_gradients_map = collections.OrderedDict()
        for param in params + embeds:
            # Allocate the sums of squared gradients
            squared_gradients_value = numpy.zeros(param.get_value().shape, dtype=theano.config.floatX)
            param_squared_gradients = theano.shared(value=squared_gradients_value, name='sg_' + param.name)
            param_squared_gradients_map[param] = param_squared_gradients

    else:
        raise ValueError('Unknown method: %s' % (method))

    # Parameter gradients
    gradientsparams = T.grad(cost, params)

    # Embeddings gradients
    gradientsembeds = T.grad(cost, embeds)

    # Learning rates
    rates_params = [rate_params for i in range(len(params))]

    # In TransE etc. the rate for predicates' embeddings (that do not get
    # normalized) is rate_params, not rate_embeddings
    rates_embeddings = [rate_embeddings, rate_params, rate_params] if len(embeds) > 1 else [rate_embeddings]
    # rates_embeddings = [rate_embeddings for i in range(len(embeds))]

    for param, gradient, rate in zip(params + embeds, gradientsparams + gradientsembeds, rates_params + rates_embeddings):
        if (method == 'SGD'):
            # SGD
            U.sgd(param, rate, gradient, updates)
        elif (method == 'MOMENTUM'):
            # SGD + MOMENTUM
            param_previous_update = param_previous_update_map[param]
            U.momentum(param, rate, decay, gradient, updates, param_previous_update)
        elif (method == 'ADAGRAD'):
            # ADAGRAD
            param_squared_gradients = param_squared_gradients_map[param]
            U.adagrad(param, rate, epsilon, gradient, updates, param_squared_gradients)
        elif (method == 'ADADELTA'):
            # ADADELTA
            param_squared_gradients = param_squared_gradients_map[param]
            param_squared_updates = param_squared_updates_map[param]
            U.adadelta(param, rate, decay, epsilon, gradient, updates, param_squared_gradients, param_squared_updates)
        elif (method == 'RMSPROP'):
            # RMSPROP
            param_squared_gradients = param_squared_gradients_map[param]
            U.rmsprop(param, rate, decay, max_learning_rate, epsilon, gradient, updates, param_squared_gradients)
        else:
            raise ValueError('Unknown method: %s' % (method))

    return theano.function(list_in, [T.mean(cost), T.mean(out)], updates=updates, on_unused_input='ignore')
def TrainFn1Member(fnsim, embeddings, leftop, rightop, marge=1.0, rel=True):
    """
    This function returns a theano function to perform a training iteration,
    contrasting positive and negative triplets. members are given as sparse
    matrices. For one positive triplet there are two or three (if rel == True)
    negative triplets. To create a negative triplet we replace only one member
    at a time.

    :param fnsim: similarity function (on theano variables).
    :param embeddings: an embeddings instance.
    :param leftop: class for the 'left' operator.
    :param rightop: class for the 'right' operator.
    :param marge: marge for the cost function.
    :param rel: boolean, if true we also contrast w.r.t. a negative relation
                member.
    """
    embedding, relationl, relationr = parse_embeddings(embeddings)

    # Inputs
    inpr = S.csr_matrix()
    inpl = S.csr_matrix()
    inpo = S.csr_matrix()
    inpln = S.csr_matrix()
    inprn = S.csr_matrix()
    lrparams = T.scalar('lrparams')
    lrembeddings = T.scalar('lrembeddings')

    # Graph
    lhs = S.dot(embedding.E, inpl).T
    rhs = S.dot(embedding.E, inpr).T
    rell = S.dot(relationl.E, inpo).T
    relr = S.dot(relationr.E, inpo).T
    lhsn = S.dot(embedding.E, inpln).T
    rhsn = S.dot(embedding.E, inprn).T

    simi = fnsim(leftop(lhs, rell), rightop(rhs, relr))
    # Negative 'left' member
    similn = fnsim(leftop(lhsn, rell), rightop(rhs, relr))
    # Negative 'right' member
    simirn = fnsim(leftop(lhs, rell), rightop(rhsn, relr))
    costl, outl = margincost(simi, similn, marge)
    costr, outr = margincost(simi, simirn, marge)
    cost = costl + costr
    out = T.concatenate([outl, outr])

    # List of inputs of the function
    list_in = [lrembeddings, lrparams, inpl, inpr, inpo, inpln, inprn]
    if rel:
        # If rel is True, we also consider a negative relation member
        inpon = S.csr_matrix()
        relln = S.dot(relationl.E, inpon).T
        relrn = S.dot(relationr.E, inpon).T
        simion = fnsim(leftop(lhs, relln), rightop(rhs, relrn))
        costo, outo = margincost(simi, simion, marge)
        cost += costo
        out = T.concatenate([out, outo])
        list_in += [inpon]

    if hasattr(fnsim, 'params'):
        # If the similarity function has some parameters, we update them too.
        gradientsparams = T.grad(cost, leftop.params + rightop.params + fnsim.params)
        updates = OrderedDict((i, i - lrparams * j) for i, j in zip(
            leftop.params + rightop.params + fnsim.params, gradientsparams))
    else:
        gradientsparams = T.grad(cost, leftop.params + rightop.params)
        updates = OrderedDict((i, i - lrparams * j) for i, j in zip(
            leftop.params + rightop.params, gradientsparams))
    gradients_embedding = T.grad(cost, embedding.E)
    newE = embedding.E - lrembeddings * gradients_embedding
    updates.update({embedding.E: newE})
    if type(embeddings) == list:
        # If there are different embeddings for the relation member.
        gradients_embedding = T.grad(cost, relationl.E)
        newE = relationl.E - lrparams * gradients_embedding
        updates.update({relationl.E: newE})
        gradients_embedding = T.grad(cost, relationr.E)
        newE = relationr.E - lrparams * gradients_embedding
        updates.update({relationr.E: newE})
    """
    Theano function inputs.
    :input lrembeddings: learning rate for the embeddings.
    :input lrparams: learning rate for the parameters.
    :input inpl: sparse csr matrix representing the indexes of the positive
                 triplet 'left' member, shape=(#examples, N [Embeddings]).
    :input inpr: sparse csr matrix representing the indexes of the positive
                 triplet 'right' member, shape=(#examples, N [Embeddings]).
    :input inpo: sparse csr matrix representing the indexes of the positive
                 triplet relation member, shape=(#examples, N [Embeddings]).
    :input inpln: sparse csr matrix representing the indexes of the negative
                  triplet 'left' member, shape=(#examples, N [Embeddings]).
    :input inprn: sparse csr matrix representing the indexes of the negative
                  triplet 'right' member, shape=(#examples, N [Embeddings]).
    :opt input inpon: sparse csr matrix representing the indexes of the
                      negative triplet relation member,
                      shape=(#examples, N [Embeddings]).

    Theano function output.
    :output mean(cost): average cost.
    :output mean(out): ratio of examples for which the margin is violated,
                       i.e. for which an update occurs.
    """
    return theano.function(list_in, [T.mean(cost), T.mean(out)],
                           updates=updates, on_unused_input='ignore')
def TrainFn(fnsim, embeddings, leftop, rightop, marge=1.0):
    """
    This function returns a theano function to perform a training iteration,
    contrasting couples of positive and negative triplets. members are given
    as sparse matrices. For one positive triplet there is one negative
    triplet.

    :param fnsim: similarity function (on theano variables).
    :param embeddings: an embeddings instance.
    :param leftop: class for the 'left' operator.
    :param rightop: class for the 'right' operator.
    :param marge: marge for the cost function.
    """
    embedding, relationl, relationr = parse_embeddings(embeddings)

    # Inputs
    inpr = S.csr_matrix()
    inpl = S.csr_matrix()
    inpo = S.csr_matrix()
    inpln = S.csr_matrix()
    inprn = S.csr_matrix()
    inpon = S.csr_matrix()
    lrparams = T.scalar('lrparams')
    lrembeddings = T.scalar('lrembeddings')

    # Graph
    ## Positive triplet
    lhs = S.dot(embedding.E, inpl).T
    rhs = S.dot(embedding.E, inpr).T
    rell = S.dot(relationl.E, inpo).T
    relr = S.dot(relationr.E, inpo).T
    simi = fnsim(leftop(lhs, rell), rightop(rhs, relr))
    ## Negative triplet
    lhsn = S.dot(embedding.E, inpln).T
    rhsn = S.dot(embedding.E, inprn).T
    relln = S.dot(relationl.E, inpon).T
    relrn = S.dot(relationr.E, inpon).T
    simin = fnsim(leftop(lhsn, relln), rightop(rhsn, relrn))
    cost, out = margincost(simi, simin, marge)

    # Parameters gradients
    if hasattr(fnsim, 'params'):
        # If the similarity function has some parameters, we update them too.
        gradientsparams = T.grad(cost, leftop.params + rightop.params + fnsim.params)
        updates = OrderedDict((i, i - lrparams * j) for i, j in zip(
            leftop.params + rightop.params + fnsim.params, gradientsparams))
    else:
        gradientsparams = T.grad(cost, leftop.params + rightop.params)
        updates = OrderedDict((i, i - lrparams * j) for i, j in zip(
            leftop.params + rightop.params, gradientsparams))

    # Embeddings gradients
    gradients_embedding = T.grad(cost, embedding.E)
    newE = embedding.E - lrembeddings * gradients_embedding
    updates.update({embedding.E: newE})
    if type(embeddings) == list:
        # If there are different embeddings for the relation member.
        gradients_embedding = T.grad(cost, relationl.E)
        newE = relationl.E - lrparams * gradients_embedding
        updates.update({relationl.E: newE})
        gradients_embedding = T.grad(cost, relationr.E)
        newE = relationr.E - lrparams * gradients_embedding
        updates.update({relationr.E: newE})
    """
    Theano function inputs.
    :input lrembeddings: learning rate for the embeddings.
    :input lrparams: learning rate for the parameters.
    :input inpl: sparse csr matrix representing the indexes of the positive
                 triplet 'left' member, shape=(#examples, N [Embeddings]).
    :input inpr: sparse csr matrix representing the indexes of the positive
                 triplet 'right' member, shape=(#examples, N [Embeddings]).
    :input inpo: sparse csr matrix representing the indexes of the positive
                 triplet relation member, shape=(#examples, N [Embeddings]).
    :input inpln: sparse csr matrix representing the indexes of the negative
                  triplet 'left' member, shape=(#examples, N [Embeddings]).
    :input inprn: sparse csr matrix representing the indexes of the negative
                  triplet 'right' member, shape=(#examples, N [Embeddings]).
    :input inpon: sparse csr matrix representing the indexes of the negative
                  triplet relation member, shape=(#examples, N [Embeddings]).

    Theano function output.
    :output mean(cost): average cost.
    :output mean(out): ratio of examples for which the margin is violated,
                       i.e. for which an update occurs.
    """
    return theano.function([lrembeddings, lrparams, inpl, inpr, inpo, inpln, inprn, inpon],
                           [T.mean(cost), T.mean(out)],
                           updates=updates, on_unused_input='ignore')
def run(jobman, debug=False):
    expstart = time.time()
    hp = jobman.state

    if not os.path.exists('files/'):
        os.mkdir('files/')

    # Symbolic variables
    s_posit = T.matrix()
    s_negat = T.matrix()
    idx_start = T.lscalar()
    idx_stop = T.lscalar()

    s_valid = theano.sparse.csr_matrix()

    w2i = cPickle.load(open('/mnt/scratch/bengio/bengio_group/data/gutenberg/merged_word2idx.pkl'))
    i2w = dict((v, k) for k, v in w2i.iteritems())
    i2w[0] = 'UNK'
    senna = [i2w[i] for i in range(len(i2w.keys()))]
    nsenna = len(senna)

    embedding = cae(i_size=nsenna, h_size=hp['embedsize'], e_act=identity)
    H = ae(i_size=hp['embedsize']*hp['wsize'], h_size=hp['hsize'], e_act=T.tanh)
    L = logistic(i_size=hp['hsize'], h_size=1, act=identity)

    del H.params['d_bias']
    del embedding.params['d_bias']
    del embedding.params['e_bias']

    minsize = hp['minsize']
    maxsize = hp['maxsize']
    dsize = maxsize - minsize + 1

    H.params['e_bias'] = theano.shared(numpy.array(numpy.zeros((dsize, hp['hsize'])), dtype=theano.config.floatX), name='e_bias')

    path = hp['loadpath']
    if path:
        load(embedding, path+'/embedding.pkl')
        #load(H, path+'/hidden.pkl')
        #load(L, path+'/logistic.pkl')
        hp['embedsize'] = embedding.params['e_weights'].get_value(borrow=True).shape[1]
        #hp['hsize'] = H.params['e_weights'].get_value(borrow=True).shape[1]
        jobman.save()
        H.params['e_bias'] = theano.shared(numpy.array(numpy.zeros((dsize, hp['hsize'])), dtype=theano.config.floatX), name='e_bias')

    valid_embedding = sparse.supervised.logistic(i_size=nsenna, h_size=hp['embedsize'], act=identity)
    valid_embedding.params['weights'] = sp.shared(value=scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(borrow=True)))

    lr = hp['lr']
    h_size = hp['hsize']
    bs = hp['bs']

    posit_embed = T.dot(s_posit, embedding.params['e_weights']).reshape((1, hp['embedsize']*hp['wsize']))
    negat_embed = T.dot(s_negat, embedding.params['e_weights']).reshape((hp['nneg'], hp['embedsize']*hp['wsize']))
    valid_embed = sp.dot(s_valid, valid_embedding.params['weights']).reshape((nsenna, hp['embedsize']*hp['wsize']))

    posit_embed_left = T.concatenate([posit_embed[:, idx_start*hp['embedsize']:idx_stop*hp['embedsize']],
                                      T.zeros_like(posit_embed[:, idx_stop*hp['embedsize']:])], axis=1)
    negat_embed_left = T.concatenate([negat_embed[:, idx_start*hp['embedsize']:idx_stop*hp['embedsize']],
                                      T.zeros_like(negat_embed[:, idx_stop*hp['embedsize']:])], axis=1)

    posit_embed_right = T.concatenate([T.zeros_like(posit_embed[:, :idx_start*hp['embedsize']]),
                                       posit_embed[:, idx_start*hp['embedsize']:idx_stop*hp['embedsize']]], axis=1)
    negat_embed_right = T.concatenate([T.zeros_like(negat_embed[:, :idx_start*hp['embedsize']]),
                                       negat_embed[:, idx_start*hp['embedsize']:idx_stop*hp['embedsize']]], axis=1)

    posit_embed = T.concatenate([T.zeros_like(posit_embed[:, :idx_start*hp['embedsize']]),
                                 posit_embed[:, idx_start*hp['embedsize']:idx_stop*hp['embedsize']],
                                 T.zeros_like(posit_embed[:, idx_stop*hp['embedsize']:])], axis=1)
    negat_embed = T.concatenate([T.zeros_like(negat_embed[:, :idx_start*hp['embedsize']]),
                                 negat_embed[:, idx_start*hp['embedsize']:idx_stop*hp['embedsize']],
                                 T.zeros_like(negat_embed[:, idx_stop*hp['embedsize']:])], axis=1)

    #posit_embed = ifelse(T.eq(idx_start, 0), posit_embed_left, posit_embed)
    #posit_embed = ifelse(T.eq(idx_stop, hp['maxsize']), posit_embed_right, posit_embed)
    #negat_embed = ifelse(T.eq(idx_start, 0), negat_embed_left, negat_embed)
    #negat_embed = ifelse(T.eq(idx_stop, hp['maxsize']), negat_embed_right, negat_embed)

    Hposit = T.tanh(T.dot(posit_embed, H.params['e_weights']) + H.params['e_bias'][idx_stop-idx_start-minsize, :])
    Hnegat = T.tanh(T.dot(negat_embed, H.params['e_weights']) + H.params['e_bias'][idx_stop-idx_start-minsize, :])

    posit_score = L.encode(Hposit)
    negat_score = L.encode(Hnegat)
    valid_score = L.encode(H.encode(valid_embed))

    C = (negat_score - posit_score.flatten() + hp['margin'])
    CC = (rect(C)).mean()

    opt = theano.function([s_posit, s_negat, idx_start, idx_stop],
                          (rect(C)).mean(),
                          updates=dict(L.update(CC, lr) + H.update(CC, lr) + embedding.update_norm(CC, lr)))

    validfct = theano.function([s_valid], valid_score)

    def saveexp():
        save(embedding, fname+'embedding.pkl')
        save(H, fname+'hidden.pkl')
        save(L, fname+'logistic.pkl')

    delta = hp['wsize']/2
    rest = hp['wsize']%2

    freq_idx = cPickle.load(open('/mnt/scratch/bengio/bengio_group/data/gutenberg/sorted_vocab.pkl'))[:2000]

    fname = ''
    validsentence = []  # cPickle.load(open('/scratch/rifaisal/data/wiki_april_2010/valid_debug.pkl'))

    tseenwords = not debug
    for e in range(hp['epoch']):
        hp['split'] = numpy.random.randint(45)
        sentences = cPickle.load(open('/mnt/scratch/bengio/bengio_group/data/gutenberg/ints_50000/split'+str(hp['split'])+'.pkl'))
        nsent = len(sentences)
        bigc = []
        bigr = []

        seen_words = 0
        for i, s in enumerate(sentences):
            nword = len(s)
            seen_words += nword
            tseenwords += nword
            if nword < hp['maxsize'] + 2:
                continue

            rndsize = numpy.random.randint(low=hp['minsize']+1, high=hp['maxsize']-1)
            idxsta = numpy.random.randint(low=1, high=hp['maxsize']-rndsize)
            idxsto = idxsta+rndsize
            print 'r', rndsize, 'b', idxsta, 'e', idxsto, 'shape', H.params['e_bias'].get_value().shape
            c = []
            r = []

            if debug:
                print ' *** Processing document', i, 'with', nword,
                sys.stdout.flush()

            for j in range(delta, nword-delta):
                nd = rndsize/2
                rd = rndsize%2
                pchunk = s[j-delta:j+delta+rest]
                nchunk = []
                rndidx = numpy.random.randint(nsenna, size=(hp['nneg'],))
                nchunk = []
                for kk in range(hp['nneg']):
                    tmpchunk = copy.copy(pchunk)
                    tmpchunk[idxsta+nd] = rndidx[kk]
                    nchunk += tmpchunk
                assert len(nchunk) == len(pchunk)*hp['nneg']

                p, n = (idx2mat(pchunk, nsenna), idx2mat(nchunk, nsenna))
                l = opt(p, n, idxsta, idxsto)
                c.append(l)

                if debug:
                    print '.',
                    break

            if debug:
                print ''

            bigc += [numpy.array(c).sum()]

            if 0:  # (time.time() - expstart) > (3600 * 24 * 6 + 3600*20) or (tseenwords) > (10*hp['freq']):
                tseenwords = 0
                valid_embedding.params['weights'] = sp.shared(value=scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(borrow=True)))
                mrk = evaluation.error(validsentence, validfct, nsenna, hp['wsize'])
                hp['mrk'] = mrk
                jobman.save()
                saveexp()
                print 'Random Valid Mean rank', mrk

            if seen_words > hp['freq'] or debug:
                seen_words = 0
                hp['score'] = numpy.array(bigc).mean()
                hp['e'] = e
                hp['i'] = i
                print ''
                print e, i, 'NN Score:', hp['score']

                if not debug:
                    ne = knn(freq_idx, embedding.params['e_weights'].get_value(borrow=True))
                    open('files/'+fname+'nearest.txt', 'w').write(display(ne, senna))
                    saveexp()
                sys.stdout.flush()
                jobman.save()

    saveexp()
def fprop(self, state_below, add_noise=True):
    self.input_space.validate(state_below)

    if self.requires_reformat:
        if not isinstance(state_below, tuple):
            for sb in get_debug_values(state_below):
                if sb.shape[0] != self.dbm.batch_size:
                    raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0]))
                assert reduce(lambda x, y: x * y, sb.shape[1:]) == self.input_dim

        state_below = self.input_space.format_as(state_below, self.desired_space)

    self.x = state_below

    # linear part
    if isinstance(self.x, S.SparseVariable):
        self.z = S.dot(self.x, self.W[0]) + self.b[0]
    else:
        self.z = T.dot(self.x, self.W[0]) + self.b[0]

    # first layer non-linear part
    if isinstance(self.x, S.SparseVariable):
        h = S.dot(self.x, self.W[1]) + self.b[1]
    else:
        h = T.dot(self.x, self.W[1]) + self.b[1]

    # activate hidden units of non-linear part
    if self.hidden_activation is None:
        pass
    elif self.hidden_activation == 'tanh':
        self.h = T.tanh(h)
    elif self.hidden_activation == 'sigmoid':
        self.h = T.nnet.sigmoid(h)
    elif self.hidden_activation == 'softmax':
        self.h = T.nnet.softmax(h)
    elif self.hidden_activation == 'rectifiedlinear':
        self.h = T.maximum(0, h)
    else:
        raise NotImplementedError()

    noise = 0
    if self.noise_beta is not None:
        noise = (1. - self.noise_normality) * self.beta_mean
        # print self.noise_normality
        print (1. - self.noise_normality) * self.noise_scale * (self.sparsity_target - 0.5)
        print noise

    if add_noise:
        rng = MRG_RandomStreams(self.mlp.rng.randint(2 ** 15))
        if self.noise_beta is not None:
            noise = (1. - self.noise_normality) * self.beta_dist[
                self.beta_idx:self.beta_idx + self.x.shape[0], :] \
                + (self.noise_normality * self.noise_scale
                   * rng.normal(size=self.z.shape, std=self.noise_stdev, dtype=self.z.type.dtype))
        else:
            noise = self.noise_scale \
                * rng.normal(size=self.z.shape, std=self.noise_stdev, dtype=self.z.type.dtype)
        # print self.beta_dist.get_value().shape

    # second layer non-linear part
    self.a = T.dot(self.h, self.W[2]) + self.b[2] + noise

    # activate non-linear part to get bernouilli probabilities
    self.m_mean = T.nnet.sigmoid(self.a)

    # mix output of linear part with output of non-linear part
    self.p = self.m_mean * self.z

    if self.layer_name is not None:
        self.z.name = self.layer_name + '_z'
        self.h.name = self.layer_name + '_h'
        self.a.name = self.layer_name + '_a'
        self.m_mean.name = self.layer_name + '_m_mean'
        self.p.name = self.layer_name + '_p'

    return self.p
def __init__(self, feature_count, classifier=False, k=8, stdev=0.1, sparse=False):
    self.classifier = classifier
    d = feature_count

    # *** Symbolic variables ***
    if sparse:
        X = S.csr_matrix(name='inputs', dtype='float32')
    else:
        X = T.matrix()
    y = T.vector()
    beta_w1 = T.scalar()
    beta_v = T.scalar()

    # *** Model parameters ***
    # bias term (intercept)
    w0_init = np.zeros(1)
    self.w0 = theano.shared(w0_init, allow_downcast=True)
    # first order coefficients
    w1_init = np.zeros(d)
    self.w1 = theano.shared(w1_init, allow_downcast=True)
    # interaction factors
    v_init = stdev * np.random.randn(k, d)
    self.v = theano.shared(v_init, allow_downcast=True)

    # *** The Model ***
    # The formula for pairwise interactions is from the bottom left
    # of page 997 of Rendle 2010, "Factorization Machines."
    # This version scales linearly in k and d, as opposed to O(d^2).
    if sparse:
        interactions = 0.5 * T.sum((S.dot(X, T.transpose(self.v)) ** 2) -
                                   S.dot(S.mul(X, X), T.transpose(self.v ** 2)), axis=1)
        y_hat = T.addbroadcast(self.w0, 0) + S.dot(X, self.w1) + interactions
    else:
        interactions = 0.5 * T.sum((T.dot(X, T.transpose(self.v)) ** 2) -
                                   T.dot(X ** 2, T.transpose(self.v ** 2)), axis=1)
        y_hat = T.addbroadcast(self.w0, 0) + T.dot(X, self.w1) + interactions
    if self.classifier:
        y_hat = T.nnet.sigmoid(y_hat)

    # *** Loss Function ***
    if self.classifier:
        error = T.mean(T.nnet.binary_crossentropy(y_hat, y))
    else:
        error = T.mean((y - y_hat) ** 2)
    # regularization
    L2 = beta_w1 * T.mean(self.w1 ** 2) + beta_v * T.mean(self.v ** 2)
    loss = error + L2

    # *** Learning ***
    updates = []
    params = [self.w0, self.w1, self.v]
    grads = T.grad(cost=loss, wrt=params)
    # RMSProp
    lr, rho, epsilon = 0.001, 0.9, 1e-6
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))

    self.theano_train = theano.function(inputs=[X, y, beta_w1, beta_v],
                                        outputs=loss, updates=updates,
                                        allow_input_downcast=True)

    self.theano_cost = theano.function(inputs=[X, y, beta_w1, beta_v],
                                       outputs=loss, allow_input_downcast=True)

    # *** Prediction ***
    self.theano_predict = theano.function(inputs=[X], outputs=y_hat,
                                          allow_input_downcast=True)
def get_hidden_values(self, inp):
    """ Computes the values of the hidden layer """
    return T.nnet.sigmoid(sparse.dot(inp, self.W) + self.b)
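# Hedged usage sketch for get_hidden_values above: wiring it into a minimal
# encoder object and compiling it on a sparse symbolic input. The wrapper
# class, layer sizes and names are assumptions for illustration only.
import numpy as np
import scipy.sparse as sp
import theano
import theano.tensor as T
from theano import sparse

class _TinySparseEncoder(object):
    def __init__(self, n_visible, n_hidden, rng=np.random):
        self.W = theano.shared(
            (0.01 * rng.randn(n_visible, n_hidden)).astype(theano.config.floatX),
            name='W')
        self.b = theano.shared(np.zeros(n_hidden, dtype=theano.config.floatX), name='b')

    def get_hidden_values(self, inp):
        """ Computes the values of the hidden layer """
        return T.nnet.sigmoid(sparse.dot(inp, self.W) + self.b)

x = sparse.csr_matrix('x', dtype=theano.config.floatX)
enc = _TinySparseEncoder(n_visible=100, n_hidden=16)
encode = theano.function([x], enc.get_hidden_values(x))

x_val = sp.random(5, 100, density=0.05, format='csr', dtype=theano.config.floatX)
print(encode(x_val).shape)   # (5, 16)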
def fprop(self, state_below, add_noise=True, threshold=None, stochastic=True):
    self.input_space.validate(state_below)

    if self.requires_reformat:
        if not isinstance(state_below, tuple):
            for sb in get_debug_values(state_below):
                if sb.shape[0] != self.dbm.batch_size:
                    raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0]))
                assert reduce(lambda x, y: x * y, sb.shape[1:]) == self.input_dim

        state_below = self.input_space.format_as(state_below, self.desired_space)

    self.x = state_below

    # linear part
    if isinstance(self.x, S.SparseVariable):
        self.z = S.dot(self.x, self.W[0]) + self.b[0]
    else:
        self.z = T.dot(self.x, self.W[0]) + self.b[0]

    self.stopper = self.x * T.ones_like(self.x)

    # first layer non-linear part
    if isinstance(self.stopper, S.SparseVariable):
        h = S.dot(self.stopper, self.W[1]) + self.b[1]
    else:
        h = T.dot(self.stopper, self.W[1]) + self.b[1]

    # activate hidden units of non-linear part
    if self.hidden_activation is None:
        pass
    elif self.hidden_activation == 'tanh':
        self.h = T.tanh(h)
    elif self.hidden_activation == 'sigmoid':
        self.h = T.nnet.sigmoid(h)
    elif self.hidden_activation == 'softmax':
        self.h = T.nnet.softmax(h)
    elif self.hidden_activation == 'rectifiedlinear':
        self.h = T.maximum(0, h)
    else:
        raise NotImplementedError()

    rng = MRG_RandomStreams(self.mlp.rng.randint(2 ** 15))

    noise = 0
    if self.noise_beta is not None:
        noise = (1. - self.noise_normality) * self.beta_mean
        print noise

    if add_noise:
        if self.noise_beta is not None:
            noise = (1. - self.noise_normality) * self.beta_dist[
                self.beta_idx:self.beta_idx + self.x.shape[0], :] \
                + (self.noise_normality * self.noise_scale
                   * rng.normal(size=self.z.shape, std=self.noise_stdev, dtype=self.z.type.dtype))
        else:
            noise = self.noise_scale \
                * rng.normal(size=self.z.shape, std=self.noise_stdev, dtype=self.z.type.dtype)

    # second layer non-linear part
    self.a = T.dot(self.h, self.W[2]) + self.b[2] + noise

    # activate non-linear part to get bernouilli probabilities
    self.m_mean = T.nnet.sigmoid(self.a)

    # Separate stochastic from deterministic part:
    self.stoch_m_mean = self.m_mean ** self.stochastic_ratio
    self.deter_m_mean = self.m_mean ** (1. - self.stochastic_ratio)

    if threshold is None:
        if stochastic:
            # sample from bernouili probs to generate a mask
            self.m = rng.binomial(size=self.stoch_m_mean.shape, n=1,
                                  p=self.m_mean, dtype=self.stoch_m_mean.type.dtype)
        else:
            self.m = self.m_mean
    else:
        # deterministic mask:
        self.m = T.cast(T.gt(self.stoch_m_mean, threshold), theano.config.floatX)

    self.consider_constant = [self.m, self.stopper]

    # mix output of linear part with output of non-linear part
    self.p = self.m * self.deter_m_mean * self.z

    if self.layer_name is not None:
        self.z.name = self.layer_name + '_z'
        self.h.name = self.layer_name + '_h'
        self.a.name = self.layer_name + '_a'
        self.m_mean.name = self.layer_name + '_m_mean'
        self.stoch_m_mean.name = self.layer_name + '_stoch_m_mean'
        self.deter_m_mean.name = self.layer_name + '_deter_m_mean'
        self.p.name = self.layer_name + '_p'

    return self.p
def fprop(self, state_below, threshold=None, stochastic=True):
    self.input_space.validate(state_below)

    if self.requires_reformat:
        if not isinstance(state_below, tuple):
            for sb in get_debug_values(state_below):
                if sb.shape[0] != self.dbm.batch_size:
                    raise ValueError("self.dbm.batch_size is %d but got shape of %d" %
                                     (self.dbm.batch_size, sb.shape[0]))
                assert reduce(lambda x, y: x * y, sb.shape[1:]) == self.input_dim
        state_below = self.input_space.format_as(state_below, self.desired_space)

    self.x = state_below

    # expert part
    if isinstance(self.x, S.SparseVariable):
        z = S.dot(self.x, self.W[0]) + self.b[0]
    else:
        z = T.dot(self.x, self.W[0]) + self.b[0]

    # activate hidden units of the expert part
    if self.expert_activation is None:
        self.z = z
    elif self.expert_activation == 'tanh':
        self.z = T.tanh(z)
    elif self.expert_activation == 'sigmoid':
        self.z = T.nnet.sigmoid(z)
    elif self.expert_activation == 'softmax':
        self.z = T.nnet.softmax(z)
    elif self.expert_activation == 'rectifiedlinear':
        self.z = T.maximum(0, z)
    else:
        raise NotImplementedError()

    # first layer of gater
    if isinstance(self.x, S.SparseVariable):
        h = S.dot(self.x, self.W[1]) + self.b[1]
    else:
        h = T.dot(self.x, self.W[1]) + self.b[1]

    # activate hidden units of gater
    if self.hidden_activation is None:
        self.h = h
    elif self.hidden_activation == 'tanh':
        self.h = T.tanh(h)
    elif self.hidden_activation == 'sigmoid':
        self.h = T.nnet.sigmoid(h)
    elif self.hidden_activation == 'softmax':
        self.h = T.nnet.softmax(h)
    elif self.hidden_activation == 'rectifiedlinear':
        self.h = T.maximum(0, h)
    else:
        raise NotImplementedError()

    # second layer of gater
    self.a = T.dot(self.h, self.W[2]) + self.b[2]

    # activate gater output to get Bernoulli probabilities
    self.m_mean = T.nnet.sigmoid(self.a)

    if threshold is None:
        if stochastic:
            # sample from Bernoulli probabilities to generate a mask
            rng = MRG_RandomStreams(self.mlp.rng.randint(2 ** 15))
            self.m = rng.binomial(size=self.m_mean.shape, n=1,
                                  p=self.m_mean, dtype=self.m_mean.type.dtype)
        else:
            self.m = self.m_mean
    else:
        # deterministic mask:
        self.m = T.cast(T.gt(self.m_mean, threshold), theano.config.floatX)

    self.m2 = T.dot(self.m, self.groups)

    # mask expert output with samples from gater
    self.p = self.m2 * self.z

    if self.layer_name is not None:
        self.z.name = self.layer_name + '_z'
        self.h.name = self.layer_name + '_h'
        self.a.name = self.layer_name + '_a'
        self.m_mean.name = self.layer_name + '_m_mean'
        self.m.name = self.layer_name + '_m'
        self.p.name = self.layer_name + '_p'

    return self.p
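The gater emits one Bernoulli sample per group; `T.dot(self.m, self.groups)` then broadcasts each group's sample to every expert unit belonging to that group. A small NumPy illustration with a hypothetical 2-group, 4-expert membership matrix (the real `self.groups` is built elsewhere in the layer):

import numpy as np

# Hypothetical membership matrix: 2 groups x 4 expert units
groups = np.array([[1., 1., 0., 0.],   # group 0 owns experts 0 and 1
                   [0., 0., 1., 1.]])  # group 1 owns experts 2 and 3
m = np.array([[1., 0.],                # example 0: only group 0 active
              [1., 1.]])               # example 1: both groups active
m2 = m.dot(groups)                     # per-expert gates, shape (batch, n_experts)
print(m2)   # [[1. 1. 0. 0.]
            #  [1. 1. 1. 1.]]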
mu_u = np.zeros(N)
mu_u[0] = 1.0
cov_u = 1e-2 * np.eye(N)
cov_u[0, 0] = 1e-10
u = pm.MvNormal("u", mu_u, cov_u, shape=(N,))
u = tt.reshape(u, (N, 1))

# The spectral basis
mu_vT = np.ones(K)
cov_vT = 1e-2 * np.eye(K)
vT = pm.MvNormal("vT", mu_vT, cov_vT, shape=(K,))
vT = tt.reshape(vT, (1, K))

# Compute the model
uvT = tt.reshape(tt.dot(u, vT), (N * K, 1))
f_model = tt.reshape(ts.dot(D, uvT), (M * Kobs,))

# Track some values for plotting later
pm.Deterministic("f_model", f_model)

# Save our initial guess
f_model_guess = xo.eval_in_model(f_model)

# The likelihood function assuming known Gaussian uncertainty
pm.Normal("obs", mu=f_model, sd=ferr, observed=f)

# Maximum likelihood solution
with model:
    map_soln = xo.optimize()

# Plot some stuff
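In matrix notation, the graph above encodes a bilinear model (a summary of what the code builds, with `vec` the row-major flattening performed by `tt.reshape` and `D` the fixed sparse design matrix):

f_{\mathrm{model}} = D \,\mathrm{vec}\!\left(u\, v^{\top}\right),
\qquad
u \sim \mathcal{N}(\mu_u, \Sigma_u), \quad
v \sim \mathcal{N}(\mu_v, \Sigma_v), \quad
f_{\mathrm{obs}} \sim \mathcal{N}\!\left(f_{\mathrm{model}}, \sigma_f^2 I\right)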
def sparse_slice_rows(H, idx):
    '''Returns a dense slice H[idx, :]'''
    vecs = to_one_hot(idx, H.shape[0], dtype=H.dtype)
    return ts.dot(vecs, H)
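The trick here is that multiplying a stack of one-hot row vectors against `H` selects the corresponding rows, which keeps the operation differentiable inside the Theano graph. A dense NumPy/SciPy analogue of the same idea (scipy is used here only for the toy sparse matrix):

import numpy as np
from scipy import sparse as sp

# Dense analogue of `sparse_slice_rows`: a stack of one-hot row selectors
# times H picks out the rows H[idx, :].
H = sp.random(5, 4, density=0.5, format="csr", random_state=0)
idx = np.array([3, 0, 3])
one_hot = np.eye(H.shape[0])[idx]              # shape (len(idx), 5)
assert np.allclose(one_hot.dot(H.toarray()), H.toarray()[idx])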
def init_functions(self): '''Construct functions for the model''' # Construct the objective function # Input variables u_i, y_s, y_t = T.ivectors(['u_i', 'y_s', 'y_t']) dropout = T.fscalar(name='p') # Intermediate variables: n_examples * n_songs item_scores = T.dot(self._U[u_i], self._V.T) + self._b # subtract off the row-wise max for numerical stability item_scores = item_scores - item_scores.max(axis=1, keepdims=True) e_scores = T.exp(item_scores) if T.gt(dropout, 0.0): # Construct a random dropout mask retain_prob = 1.0 - dropout M = self._rng.binomial(e_scores.shape, p=retain_prob, dtype=theano.config.floatX) # Importance weight so that E[M[i,j]] = 1 M /= retain_prob # The positive examples should always be sampled M = theano.tensor.set_subtensor(M[T.arange(y_t.shape[0]), y_t], 1.0) e_scores = e_scores * M # Edge feasibilities: n_examples * n_edges prev_feas = sparse_slice_rows(self.H, y_s) # Detect and reset initial-state transitions prev_feas = theano.tensor.set_subtensor(prev_feas[y_s < 0, :], 1) # Raw edge probabilities: n_examples * n_edges edge_given_prev = T.nnet.softmax(prev_feas * self._w) # Compute edge normalization factors: n_examples * n_edges # sum of score mass in each edge for each user edge_norms = ts.dot(e_scores, self.H) # Slice the edge weights according to incoming feasibilities: n_examples next_weight = e_scores[T.arange(y_t.shape[0]), y_t] # Marginalize: n_examples * n_edges next_feas = sparse_slice_rows(self.H, y_t) probs = next_weight * T.sum(next_feas * (edge_given_prev / (_EPS + edge_norms)), axis=1, keepdims=True) # Data likelihood term ll = T.log(probs) avg_ll = ll.mean() # Priors w_prior = -0.5 * self.edge_reg * (self._w**2).sum() b_prior = -0.5 * self.bias_reg * (self._b**2).sum() u_prior = -0.5 * self.user_reg * (self._U**2).sum() v_prior = -0.5 * self.song_reg * (self._V**2).sum() # negative log-MAP objective cost = -1.0 * (avg_ll + u_prior + v_prior + b_prior + w_prior) # Construct the updates variables = [] if 'e' in self.params: variables.append(self._w) if 'b' in self.params: variables.append(self._b) if 'u' in self.params: variables.append(self._U) if 's' in self.params: variables.append(self._V) updates = lasagne.updates.adagrad(cost, variables) self._train = theano.function(inputs=[u_i, y_s, y_t, dropout], outputs=[avg_ll, cost], updates=updates) self._loglikelihood = theano.function(inputs=[u_i, y_s, y_t, theano.Param(dropout, default=0.0, name='p')], outputs=[ll])
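The dropout mask above is importance-weighted (`M /= retain_prob`) so that the expected masked score equals the unmasked one before the positive examples are forced back in. A quick NumPy check of that property (toy numbers only):

import numpy as np

rng = np.random.RandomState(0)
retain_prob = 0.8
# Inverted-dropout scaling: dividing the 0/1 mask by the retain probability
# keeps the expected value of every masked score unchanged (E[M] = 1).
M = rng.binomial(1, retain_prob, size=100000) / retain_prob
print(M.mean())   # ~1.0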
def solve(self, u=None, vT=None, b=None, u_guess=None, vT_guess=None, b_guess=None, u_mu=0.0, u_sig=0.01, vT_mu=1.0, vT_sig=0.3, vT_rho=3.e-5, b_mu=1.0, b_sig=0.1, niter_adam=100, niter_linear=100, temp=1.e3, **kwargs): """ """ if not self._loaded: raise RuntimeError("Please load or generate a dataset first.") # Data covariance self.F_CInv = np.ones_like(self.F) / self.ferr ** 2 self.F_lndet = np.sum(np.log(2 * np.pi * self.F_CInv.reshape(-1))) # Prior on `u` self.u_cinv = np.ones(self.N - 1) / u_sig ** 2 self.u_mu = np.ones(self.N - 1) * u_mu self.u_lndet = np.sum(np.log(2 * np.pi * self.u_cinv)) # Gaussian process prior on `vT` self.vT_mu = (np.ones(self.Kp) * vT_mu).reshape(1, -1) if vT_rho > 0.0: kernel = celerite.terms.Matern32Term(np.log(vT_sig), np.log(vT_rho)) gp = celerite.GP(kernel) vT_C = gp.get_matrix(self.lam_padded) cho_C = cho_factor(vT_C) self.vT_CInv = cho_solve(cho_C, np.eye(self.Kp)) self.vT_CInvmu = cho_solve(cho_C, self.vT_mu.reshape(-1)) self.vT_lndet = -2 * np.sum(np.log(2 * np.pi * np.diag(cho_C[0]))) else: self.vT_CInv = np.ones(self.Kp) / vT_sig ** 2 self.vT_CInvmu = (self.vT_CInv * self.vT_mu) self.vT_lndet = np.sum(np.log(2 * np.pi * self.vT_CInv)) self.vT_CInv = np.diag(self.vT_CInv) # Prior on `b` self.b_cinv = np.ones(self.M) / b_sig ** 2 self.b_mu = np.ones(self.M) * b_mu self.b_lndet = self.M * np.log(2 * np.pi / b_sig ** 2) # Simple linear solves if (u is not None) and (vT is not None): self.u = u self.vT = vT self._compute_b() elif (u is not None) and (b is not None): self.u = u self.b = b self._compute_vT() elif (vT is not None) and (b is not None): self.b = b self.vT = vT self._compute_u() # Non-linear else: # Get our guesses going if u is not None: self.u = u var_names = ["vT", "b"] if vT_guess is None and b_guess is None: self.b = np.ones(self.M) self._compute_vT(T=temp) elif vT_guess is not None: self.vT = vT_guess self._compute_b(T=temp) elif b_guess is not None: self.b = b_guess self._compute_vT(T=temp) else: raise ValueError("Unexpected branch!") elif vT is not None: self.vT = vT var_names = ["u", "b"] if u_guess is None and b_guess is None: self.b = np.ones(self.M) self._compute_u(T=temp) elif u_guess is not None: self.u = u_guess self._compute_b(T=temp) elif b_guess is not None: self.b = b_guess self._compute_u(T=temp) else: raise ValueError("Unexpected branch!") elif b is not None: self.b = b var_names = ["u", "vT"] if u_guess is None and vT_guess is None: self.u = self.u_mu + u_sig * np.random.randn(self.N - 1) self._compute_vT(T=temp) elif u_guess is not None: self.u = u_guess self._compute_vT(T=temp) elif vT_guess is not None: self.vT = vT_guess self._compute_u(T=temp) else: raise ValueError("") else: var_names = ["u", "vT", "b"] if vT_guess is None and b_guess is None and u_guess is None: self.b = np.ones(self.M) self.u = self.u_mu + u_sig * np.random.randn(self.N - 1) self._compute_vT(T=temp) elif u_guess is not None: self.u = u_guess if vT_guess is None and b_guess is None: self.b = np.ones(self.M) self._compute_vT(T=temp) elif vT_guess is not None: self.vT = vT_guess self._compute_b(T=temp) elif b_guess is not None: self.b = b_guess self._compute_vT(T=temp) else: raise ValueError("Unexpected branch!") elif vT_guess is not None: self.vT = vT_guess if b_guess is None: self.b = np.ones(self.M) self._compute_u(T=temp) else: self.b = b_guess self._compute_u(T=temp) elif b_guess is not None: self.b = b_guess self.u = self.u_mu + u_sig * np.random.randn(self.N - 1) self._compute_vT(T=temp) else: raise ValueError("Unexpected branch!") # 
# Initialize the variables to the guesses
vars = []
if "u" in var_names:
    u = theano.shared(self.u)
    vars += [u]
else:
    u = tt.as_tensor_variable(self.u)
if "vT" in var_names:
    vT = theano.shared(self.vT)
    vars += [vT]
else:
    vT = tt.as_tensor_variable(self.vT)

# Compute the model
D = ts.as_sparse_variable(self.D)
a = tt.reshape(
    tt.dot(tt.reshape(tt.concatenate([[1.0], u]), (-1, 1)),
           tt.reshape(vT, (1, -1))),
    (-1,))
self.map[1:, :] = u
b = self.map.flux(theta=self.theta)
B = tt.reshape(b, (-1, 1))
M = tt.reshape(ts.dot(D, a), (self.M, -1)) / B

# Compute the likelihood
r = tt.reshape(self.F - M, (-1,))
cov = tt.reshape(self.F_CInv, (-1,))
lnlike = -0.5 * (tt.sum(r ** 2 * cov) + self.F_lndet)

# Compute the prior
lnprior = -0.5 * (tt.sum((u - self.u_mu) ** 2 * self.u_cinv) + self.u_lndet)
lnprior += -0.5 * (tt.dot(
    tt.dot(tt.reshape((vT - self.vT_mu), (1, -1)), self.vT_CInv),
    tt.reshape((vT - self.vT_mu), (-1, 1)))[0, 0] + self.vT_lndet)

# The full loss
loss = -(lnlike + lnprior)
best_loss = loss.eval()
best_u = u.eval()
best_vT = vT.eval()
best_b = b.eval()
lnlike_val = np.zeros(niter_adam + 1)
lnprior_val = np.zeros(niter_adam + 1)
lnlike_val[0] = lnlike.eval()
lnprior_val[0] = lnprior.eval()

# Optimize
upd = Adam(loss, vars, **kwargs)
train = theano.function([], [u, vT, b, loss, lnlike, lnprior], updates=upd)
for n in tqdm(1 + np.arange(niter_adam)):
    u_val, vT_val, b_val, loss_val, lnlike_val[n], lnprior_val[n] = train()
    if loss_val < best_loss:
        best_loss = loss_val
        best_u = u_val
        best_vT = vT_val
        best_b = b_val

# We're done!
self.u = best_u
self.vT = best_vT
self.b = best_b
self.lnlike = lnlike_val
self.lnprior = lnprior_val
self._solved = True
def RankRelFn(fnsim, embeddings, leftop, rightop,
              subtensorspec=None, adding=False):
    """
    This function returns a Theano function to measure the similarity score
    of all relation entities given couples of 'right' and 'left' entities
    (as sparse matrices).

    :param fnsim: similarity function (on Theano variables).
    :param embeddings: an Embeddings instance.
    :param leftop: class for the 'left' operator.
    :param rightop: class for the 'right' operator.
    :param subtensorspec: only measure the similarity score for the entities
                          corresponding to the first subtensorspec (int)
                          entities of the embedding matrix
                          (default None: all entities).
    :param adding: if the right member is composed of several entities, the
                   function needs two more inputs: we have to add the
                   embedding values of the other entities (with the
                   appropriate scaling factor to perform the mean pooling).
    """
    embedding, relationl, relationr = parse_embeddings(embeddings)

    # Inputs
    inpr = S.csr_matrix('inpr')
    inpl = S.csr_matrix('inpl')
    if adding:
        inpoadd = S.csr_matrix('inpoadd')
        scal = T.scalar('scal')

    # Graph
    if subtensorspec is None:
        rell = relationl.E
        relr = relationr.E
    else:
        # We compute the score only for a subset of entities
        rell = relationl.E[:, :subtensorspec].T
        relr = relationr.E[:, :subtensorspec].T
    if adding:
        # Add the embeddings of the other entities (mean pooling)
        rell = rell * scal + (S.dot(relationl.E, inpoadd).T).reshape(
            (1, embedding.D))
        relr = relr * scal + (S.dot(relationr.E, inpoadd).T).reshape(
            (1, embedding.D))
    lhs = (S.dot(embedding.E, inpl).T).reshape((1, embedding.D))
    rhs = (S.dot(embedding.E, inpr).T).reshape((1, embedding.D))
    # hack to prevent a broadcast problem with the Bilinear layer
    if hasattr(leftop, 'forwardrankrel'):
        tmpleft = leftop.forwardrankrel(lhs, rell)
    else:
        tmpleft = leftop(lhs, rell)
    if hasattr(rightop, 'forwardrankrel'):
        tmpright = rightop.forwardrankrel(rhs, relr)
    else:
        tmpright = rightop(rhs, relr)
    simi = fnsim(tmpleft, tmpright)
    """
    Theano function inputs.
    :input inpl: sparse csr matrix representing the indexes of the 'left'
                 entities, shape=(#examples, N [Embeddings]).
    :input inpr: sparse csr matrix representing the indexes of the 'right'
                 entities, shape=(#examples, N [Embeddings]).
    :opt input inpoadd: sparse csr matrix representing the indexes of the
                        other entities of the relation member with the
                        appropriate scaling factor,
                        shape=(#examples, N [Embeddings]).
    :opt input scal: scaling factor to perform the mean:
                     1 / [#entities in the member].

    Theano function output.
    :output simi: matrix of score values.
    """
    if not adding:
        return theano.function([inpl, inpr], [simi],
                               on_unused_input='ignore')
    else:
        return theano.function([inpl, inpr, inpoadd, scal], [simi],
                               on_unused_input='ignore')
def run(jobman, debug=False): expstart = time.time() hp = jobman.state if not os.path.exists('files/'): os.mkdir('files/') # Symbolic variables s_bow = T.matrix() s_idx = T.iscalar() s_tf = T.scalar() s_posit = T.matrix() #theano.sparse.csr_matrix() s_negat = T.matrix() #theano.sparse.csr_matrix() sentences = cPickle.load( open('/scratch/rifaisal/data/guten/guten_subset_idx.pkl')) senna = cPickle.load(open('/scratch/rifaisal/data/guten/senna.pkl')) gsubset = cPickle.load( open('/scratch/rifaisal/data/guten/guten_vocab_subset.pkl')).flatten( ).tolist() hashtab = dict(zip(gsubset, range(len(gsubset)))) tfidf_data = numpy.load('/scratch/rifaisal/data/guten/guten_tfidf.npy' ).item().tocsr().astype('float32') #tfidf = cPickle.load(open('/scratch/rifaisal/repos/senna/gutentokenizer.pkl')) senna = numpy.array(senna)[gsubset].tolist() s_valid = theano.sparse.csr_matrix() validsentence = sentences[10000:10010] nsent = len(sentences) nsenna = len(senna) # Layers embedding = cae(i_size=nsenna, h_size=hp['embedsize'], e_act=identity) H = ae(i_size=hp['embedsize'] * hp['wsize'], h_size=hp['hsize'], e_act=T.tanh) L = logistic(i_size=hp['hsize'], h_size=1, act=identity) S = logistic(i_size=hp['embedsize'], h_size=nsenna, act=T.nnet.softmax) valid_embedding = sparse.supervised.logistic(i_size=nsenna, h_size=hp['embedsize'], act=identity) valid_embedding.params['weights'] = sp.shared( value=scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value( borrow=True))) valid_embedding.params['bias'] = embedding.params['e_bias'] lr = hp['lr'] h_size = hp['hsize'] bs = hp['bs'] posit_embed = T.dot(s_posit, embedding.params['e_weights']).reshape( (1, hp['embedsize'] * hp['wsize'])) negat_embed = T.dot(s_negat, embedding.params['e_weights']).reshape( (hp['nneg'], hp['embedsize'] * hp['wsize'])) valid_embed = sp.dot(s_valid, valid_embedding.params['weights']).reshape( (nsenna, hp['embedsize'] * hp['wsize'])) posit_score = L.encode(H.encode(posit_embed)) negat_score = L.encode(H.encode(negat_embed)) valid_score = L.encode(H.encode(valid_embed)) C = (negat_score - posit_score.flatten() + hp['margin']) s_bow_pred = S.encode(embedding.encode(s_bow)) pred = s_tf * nllsoft(s_bow_pred, s_idx) CC = (rect(C)).mean() + hp['lambda'] * pred opt = theano.function( [s_posit, s_negat, s_bow, s_idx, s_tf], [(rect(C)).mean(), pred], updates=dict( S.update(CC, lr) + L.update(CC, lr) + H.update(CC, lr) + embedding.update_norm(CC, lr))) #validfct = theano.function([s_valid],valid_score) def saveexp(): save(embedding, fname + 'embedding.pkl') save(H, fname + 'hidden.pkl') save(L, fname + 'logistic.pkl') delta = hp['wsize'] / 2 rest = hp['wsize'] % 2 freq_idx = cPickle.load( open('/scratch/rifaisal/data/guten/gutten_sorted_vocab.pkl'))[:1000] freq_idx = [hashtab[idx] for idx in freq_idx] fname = '' for e in range(hp['epoch']): c = [] r = [] count = 1 for i in range(nsent): rsent = numpy.random.randint(nsent - 1) nword = len(sentences[rsent]) if nword < hp['wsize'] + 2: continue pidx = numpy.random.randint(low=delta, high=nword - delta) pchunk = sentences[rsent][pidx - delta:pidx + delta + rest] nchunk = [] st = sentences[rsent][pidx - delta:pidx] en = sentences[rsent][pidx + 1:pidx + delta + rest] rndidx = numpy.random.randint(nsenna, size=(hp['nneg'], )) nchunk = [] for j in range(hp['nneg']): nchunk += en + [rndidx[j]] + st assert len(nchunk) == len(pchunk) * hp['nneg'] tfidf_chunk = tfidf_data[rsent:rsent + 1].toarray() #pdb.set_trace() tfidf_value = tfidf_chunk[0, sentences[rsent][pidx]] tfidf_chunk[0, sentences[rsent][pidx]] = 
0. tfidx = sentences[rsent][ pidx] # numpy.zeros(tfidf_chunk.shape).astype('float32') #tfidx[0,sentences[rsent][pidx]] = 1. p, n, b, iidx, tfval = (idx2mat(pchunk, nsenna), idx2mat(nchunk, nsenna), tfidf_chunk, tfidx, tfidf_value) count += tfval != 0 l, g = opt(p, n, b, iidx, tfval) c = c c.append(l) r.append(g) """ if (time.time() - expstart) > ( 3600 * 24 * 6 + 3600*20) or (i+1)%(20*hp['freq']) == 0 and debug==False: valid_embedding.params['weights'] = sp.shared(value = scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(borrow=True))) mrk = evaluation.error(validsentence, validfct, nsenna, hp['wsize']) hp['mrk'] = mrk jobman.save() saveexp() print 'Random Valid Mean rank',mrk """ if (i + 1) % hp['freq'] == 0 or debug: hp['score'] = numpy.array(c).sum() / (numpy.array(c) > 0).sum() hp['pred'] = numpy.array(r).sum() / float(count) hp['e'] = e hp['i'] = i print '' print e, i, 'NN Score:', hp['score'], 'Reconstruction:', hp[ 'pred'] if debug != True: ne = knn( freq_idx, embedding.params['e_weights'].get_value(borrow=True)) open('files/' + fname + 'nearest.txt', 'w').write(display(ne, senna)) saveexp() sys.stdout.flush() jobman.save() saveexp()
def plot_results( doppler, loss=[], cho_y1=None, cho_s=None, name="vogtstar", nframes=None, render_movies=False, open_plots=False, overlap=2.0, res=300, ): """ Plot the results of the Doppler imaging problem for the SPOT star. """ # Get the values we'll need for plotting ydeg = doppler.ydeg udeg = doppler._udeg u = doppler.u theta = doppler.theta y1_true = doppler.y1_true s_true = doppler.s_true s_deconv = doppler.s_deconv baseline_true = doppler.baseline_true y1 = np.array(doppler.y1) s = np.array(doppler.s) baseline = doppler.baseline().reshape(-1) model = doppler.model() F = doppler.F lnlam = doppler.lnlam lnlam_padded = doppler.lnlam_padded M = doppler.M inc = doppler.inc # List of figure files we're generating files = [] # Plot the baseline # HACK: Append the first measurement to the last to get # a plot going from -180 to 180 (endpoints included) theta_ = np.append(theta, [180.0]) baseline_true_ = np.append(baseline_true, [baseline_true[0]]) baseline_ = np.append(baseline, [baseline[0]]) fig, ax = plt.subplots(1, figsize=(8, 5)) ax.plot(theta_, baseline_true_, label="true") ax.plot(theta_, baseline_, label="inferred") if cho_y1 is not None: U = np.triu(cho_y1[0]) B = doppler._map.design_matrix(theta=doppler.theta).eval()[:, 1:] A = np.linalg.solve(U.T, B.T) baseline_sig = np.sqrt(np.sum(A**2, axis=0)) baseline_sig_ = np.append(baseline_sig, [baseline_sig[0]]) ax.fill_between( theta_, baseline_ - baseline_sig_, baseline_ + baseline_sig_, color="C1", alpha=0.25, lw=0, ) ax.legend(loc="lower left", fontsize=14) ax.set_xlabel(r"$\theta$ (degrees)") ax.margins(0, None) ax.set_xticks([-180, -135, -90, -45, 0, 45, 90, 135, 180]) ax.set_ylabel("baseline") fig.savefig("%s_baseline.pdf" % name, bbox_inches="tight") files.append("baseline.pdf") plt.close() # Plot the loss if len(np.atleast_1d(loss).flatten()) > 1: # Compute the loss @ true value doppler.y1 = y1_true doppler.s = s_true loss_true = doppler.loss() # Print for the record print("True loss: %.2f" % loss_true) print("Best loss: %.2f" % np.min(loss)) # Plot fig, ax = plt.subplots(1, figsize=(12, 5)) ax.plot(loss, label="loss", color="C0") ax.axhline(loss_true, color="C1", ls="--", label="loss @ true values") ax.set_yscale("log") ax.set_ylabel("negative log probability") ax.set_xlabel("iteration") ax.legend(loc="upper right") fig.savefig("%s_prob.pdf" % name, bbox_inches="tight") files.append("prob.pdf") plt.close() # Plot the Ylm coeffs fig, ax = plt.subplots(1, figsize=(12, 5)) n = np.arange(1, doppler.N) ax.plot(n, y1_true, "C0-", label="true") lo = (doppler.y1_mu - doppler.y1_sig) * np.ones_like(y1) hi = (doppler.y1_mu + doppler.y1_sig) * np.ones_like(y1) ax.fill_between(n, lo, hi, color="C1", lw=0, alpha=0.25, label="prior") ax.plot(n, y1, "C1-", label="inferred") if cho_y1 is not None: cov_y1 = cho_solve(cho_y1, np.eye(doppler.N - 1)) sig_y1 = np.sqrt(np.diag(cov_y1)) ax.fill_between(n, y1 - sig_y1, y1 + sig_y1, color="C1", alpha=0.5) ax.set_ylabel("spherical harmonic coefficient") ax.set_xlabel("coefficient number") ax.legend(loc="lower right", fontsize=14) ax.margins(0.01, None) fig.savefig("%s_coeffs.pdf" % name, bbox_inches="tight") files.append("coeffs.pdf") plt.close() # Render the true map map = starry.Map(ydeg=ydeg, udeg=udeg) map.inc = inc map[1:, :] = y1_true if udeg > 0: map[1:] = u if nframes is None: nframes = len(theta) theta_img = np.array(theta) else: theta_img = np.linspace(-180, 180, nframes + 1)[:-1] if render_movies: map.show(theta=np.linspace(-180, 180, 50), mp4="%s_true.mp4" % name) files.append("true.mp4") 
img_true_rect = (map.render(projection="rect", res=res).eval().reshape(res, res)) # Render the inferred map map[1:, :] = y1 img = map.render(theta=theta_img).eval() if render_movies: map.show(theta=np.linspace(-180, 180, 50), mp4="%s_inferred.mp4" % name) files.append("inferred.mp4") img_rect = map.render(projection="rect", res=res).eval().reshape(res, res) # Render the pixelwise uncertainties if cho_y1 is not None: # Compute the polynomial transform matrix xyz = map.ops.compute_rect_grid(tt.as_tensor_variable(res)) P = map.ops.pT(xyz[0], xyz[1], xyz[2])[:, :doppler.N] # Transform it to Ylm & evaluate it P = ts.dot(P, map.ops.A1).eval() # Rotate it so north points up """ R = map.ops.R([1, 0, 0], -(90.0 - inc) * np.pi / 180.0) for l in range(map.ydeg + 1): idx = slice(l ** 2, (l + 1) ** 2) P[:, idx] = P[:, idx].dot(R[l]) """ # Discard Y_{0, 0}, whose variance is zero P = P[:, 1:] # NOTE: This is the slow way of computing sigma # CPT = cho_solve(cho_y1, P.T) # cov = np.dot(P, CPT) # sig = np.sqrt(np.diag(cov)) # This is the streamlined version U = np.triu(cho_y1[0]) A = np.linalg.solve(U.T, P.T) img_sig_rect = np.sqrt(np.sum(A**2, axis=0)).reshape(res, res) # This is how I'd compute the *prior* uncertainty on the pixels nsamp = 1000 prior_std = np.std([ np.dot( P[0], doppler.y1_sig * np.random.randn(doppler.N - 1) + doppler.y1_mu, ) for i in range(nsamp) ]) # Normalize to the maximum for plotting vmax = np.nanmax(img_true_rect) img_rect /= vmax img_true_rect /= vmax if cho_y1 is not None: img_sig_rect /= vmax prior_std /= vmax # Plot the maps side by side if cho_y1 is not None: fig, ax = plt.subplots(3, figsize=(10, 13)) fig.subplots_adjust(hspace=0.3) else: fig, ax = plt.subplots(2, figsize=(10, 8)) im = ax[0].imshow( img_true_rect, origin="lower", extent=(-180, 180, -90, 90), cmap="plasma", vmin=0, vmax=1, ) divider = make_axes_locatable(ax[0]) cax = divider.append_axes("right", size="4%", pad=0.25) plt.colorbar(im, cax=cax, format="%.2f") im = ax[1].imshow( img_rect, origin="lower", extent=(-180, 180, -90, 90), cmap="plasma", vmin=0, vmax=1, ) divider = make_axes_locatable(ax[1]) cax = divider.append_axes("right", size="4%", pad=0.25) plt.colorbar(im, cax=cax, format="%.2f") ax[0].annotate( "true", xy=(0, 0), xytext=(7, 7), xycoords="axes fraction", textcoords="offset points", ha="left", va="bottom", fontsize=22, color="k", zorder=101, ) ax[1].annotate( "inferred", xy=(0, 0), xytext=(7, 7), xycoords="axes fraction", textcoords="offset points", ha="left", va="bottom", fontsize=22, color="k", zorder=101, ) if cho_y1 is not None: im = ax[2].imshow( img_sig_rect, origin="lower", extent=(-180, 180, -90, 90), cmap="plasma", vmin=0, vmax=prior_std, ) ticks = np.linspace(0, prior_std, 5) ticklabels = ["%.2f" % t for t in ticks] ticklabels[-1] = r"$\sigma_\mathrm{prior}$" divider = make_axes_locatable(ax[2]) cax = divider.append_axes("right", size="4%", pad=0.25) cb = plt.colorbar(im, cax=cax, format="%.2f", ticks=ticks) cb.ax.set_yticklabels(ticklabels) ax[2].annotate( "uncertainty", xy=(0, 0), xytext=(7, 7), xycoords="axes fraction", textcoords="offset points", ha="left", va="bottom", fontsize=22, color="k", zorder=101, ) for axis in ax: latlines = np.linspace(-90, 90, 7)[1:-1] lonlines = np.linspace(-180, 180, 13) for lat in latlines: axis.axhline(lat, color="k", lw=0.5, alpha=0.5, zorder=100) for lon in lonlines: axis.axvline(lon, color="k", lw=0.5, alpha=0.5, zorder=100) axis.set_xticks(lonlines) axis.set_yticks(latlines) # axis.set_xlabel("Longitude [deg]", fontsize=16) # 
axis.set_ylabel("Latitude [deg]", fontsize=16) for tick in (axis.xaxis.get_major_ticks() + axis.yaxis.get_major_ticks()): tick.label.set_fontsize(10) fig.savefig("%s_rect.pdf" % name, bbox_inches="tight") files.append("rect.pdf") plt.close() # Plot the "Joy Division" graph fig = plt.figure(figsize=(8, 11.5)) ax_img = [ plt.subplot2grid((nframes, 8), (n, 0), rowspan=1, colspan=1) for n in range(nframes) ] ax_f = [plt.subplot2grid((nframes, 8), (0, 1), rowspan=1, colspan=7)] ax_f += [ plt.subplot2grid( (nframes, 8), (n, 1), rowspan=1, colspan=7, sharex=ax_f[0], sharey=ax_f[0], ) for n in range(1, nframes) ] for n in range(nframes): ax_img[n].imshow(img[n], extent=(-1, 1, -1, 1), origin="lower", cmap="plasma") ax_img[n].axis("off") m = int(np.round(np.linspace(0, M - 1, nframes)[n])) ax_f[n].plot(lnlam, F[m], "k.", ms=2, alpha=0.75, clip_on=False) ax_f[n].plot(lnlam, model[m], "C1-", lw=1, clip_on=False) ax_f[n].axis("off") ymed = np.median(F) ydel = 0.5 * (np.max(F) - np.min(F)) / overlap ax_f[0].set_ylim(ymed - ydel, ymed + ydel) fig.savefig("%s_timeseries.pdf" % name, bbox_inches="tight", dpi=400) files.append("timeseries.pdf") plt.close() # Plot the rest frame spectrum fig, ax = plt.subplots(1) ax.plot(lnlam_padded, s_true.reshape(-1), "C0-", label="true") if s_deconv is not None: ax.plot( lnlam_padded, s_deconv.reshape(-1), "C1--", lw=1, alpha=0.5, label="guess", ) ax.plot(lnlam_padded, s.reshape(-1), "C1-", label="inferred") if cho_s is not None: cov_s = cho_solve(cho_s, np.eye(doppler.Kp)) sig_s = np.sqrt(np.diag(cov_s)) ax.fill_between(lnlam_padded, s - sig_s, s + sig_s, color="C1", alpha=0.5) ax.axvspan(lnlam_padded[0], lnlam[0], color="k", alpha=0.3) ax.axvspan(lnlam[-1], lnlam_padded[-1], color="k", alpha=0.3) ax.set_xlim(lnlam_padded[0], lnlam_padded[-1]) ax.set_xlabel(r"$\ln\left(\lambda/\lambda_\mathrm{r}\right)$") ax.set_ylabel(r"Normalized intensity") ax.legend(loc="lower left", fontsize=12) fig.savefig("%s_spectrum.pdf" % name, bbox_inches="tight") files.append("spectrum.pdf") plt.close() # Open if open_plots: for file in files: subprocess.run(["open", "%s_%s" % (name, file)])
def TrainFn(fnsim, embeddings, leftop, rightop, loss=loss.hinge, loss_margin=1.0, op='', method='SGD', decay=0.999, epsilon=1e-6, max_learning_rate=None, weight_L1_param_regularizer=None, weight_L2_param_regularizer=None, weight_contractive_regularizer_left=None, weight_contractive_regularizer_right=None): """ This function returns a theano function to perform a training iteration, contrasting couples of positive and negative triplets. members are given as sparse matrices. for one positive triplet there is one negative triplet. :param fnsim: similarity function (on theano variables). :param embeddings: an embeddings instance. :param leftop: class for the 'left' operator. :param rightop: class for the 'right' operator. """ embedding, relationl, relationr = parse_embeddings(embeddings) # Inputs inpr, inpl, inpo = S.csr_matrix('inpr'), S.csr_matrix('inpl'), S.csr_matrix('inpo') inpln, inprn, inpon = S.csr_matrix('inpln'), S.csr_matrix('inprn'), S.csr_matrix('inpon') # Learning rates for parameters and embeddings rate_params = T.scalar('rate_params') rate_embeddings = T.scalar('rate_embeddings') # E: D x N, inp: N x B -> <E, inp>: D x B -> <E, inp>.T: B x D # Positive triplet functions lhs = S.dot(embedding.E, inpl).T rhs = S.dot(embedding.E, inpr).T rell = S.dot(relationl.E, inpo).T relr = S.dot(relationr.E, inpo).T # Negative triplet functions lhsn = S.dot(embedding.E, inpln).T rhsn = S.dot(embedding.E, inprn).T relln = S.dot(relationl.E, inpon).T relrn = S.dot(relationr.E, inpon).T # Similarity Function, applied to g_lhs and g_rhs lop, rop = leftop(lhs, rell), rightop(rhs, relr) lopn, ropn = leftop(lhsn, relln), rightop(rhsn, relrn) simi = fnsim(lop, rop) simin = fnsim(lopn, ropn) supported_loss_args = inspect.getargspec(loss)[0] loss_args = {} if 'margin' not in supported_loss_args else { 'margin':loss_margin } cost, out = loss(simi, simin, **loss_args) # <EXPERIMENTAL_CODE> # Should I also plug examples from corrupted triples ? if weight_contractive_regularizer_left is not None: cost = cost + (weight_contractive_regularizer_left * R.contractive_regularizer(lop, lhs)) if weight_contractive_regularizer_right is not None: cost = cost + (weight_contractive_regularizer_right * R.contractive_regularizer(rop, rhs)) for rel_param in set([relationl.E, relationr.E]): if weight_L1_param_regularizer is not None: cost = cost + (weight_L1_param_regularizer * R.L1_regularizer(rel_param)) if weight_L2_param_regularizer is not None: cost = cost + (weight_L2_param_regularizer * R.L2_regularizer(rel_param)) # </EXPERIMENTAL_CODE> params = leftop.params + rightop.params + (fnsim.params if hasattr(fnsim, 'params') else []) params = list(set(params)) embeds = [embedding.E] + ([relationr.E, relationl.E] if (type(embeddings) == list) else []) embeds = list(set(embeds)) # The function updates the implicit function arguments according to the updates. 
updates = collections.OrderedDict() if (method == 'SGD'): pass # do nothing elif (method == 'MOMENTUM'): param_previous_update_map = collections.OrderedDict() for param in params + embeds: # Allocate the previous updates previous_update_value = numpy.zeros(param.get_value().shape, dtype=theano.config.floatX) param_previous_update = theano.shared(value=previous_update_value, name='su_' + param.name) param_previous_update_map[param] = param_previous_update elif (method == 'ADAGRAD'): param_squared_gradients_map = collections.OrderedDict() for param in params + embeds: # Allocate the sums of squared gradients squared_gradients_value = numpy.zeros(param.get_value().shape, dtype=theano.config.floatX) param_squared_gradients = theano.shared(value=squared_gradients_value, name='sg_' + param.name) param_squared_gradients_map[param] = param_squared_gradients elif (method == 'ADADELTA'): param_squared_gradients_map = collections.OrderedDict() param_squared_updates_map = collections.OrderedDict() for param in params + embeds: # Allocate the sums of squared gradients squared_gradients_value = numpy.zeros(param.get_value().shape, dtype=theano.config.floatX) param_squared_gradients = theano.shared(value=squared_gradients_value, name='sg_' + param.name) param_squared_gradients_map[param] = param_squared_gradients # Allocate the sums of squared updates squared_updates_value = numpy.zeros(param.get_value().shape, dtype=theano.config.floatX) param_squared_updates = theano.shared(value=squared_updates_value, name='su_' + param.name) param_squared_updates_map[param] = param_squared_updates elif (method == 'RMSPROP'): param_squared_gradients_map = collections.OrderedDict() for param in params + embeds: # Allocate the sums of squared gradients squared_gradients_value = numpy.zeros(param.get_value().shape, dtype=theano.config.floatX) param_squared_gradients = theano.shared(value=squared_gradients_value, name='sg_' + param.name) param_squared_gradients_map[param] = param_squared_gradients else: raise ValueError('Unknown method: %s' % (method)) # Parameter Gradients gradientsparams = T.grad(cost, params) # Embeddings gradients gradientsembeds = T.grad(cost, embeds) # Learning Rates rates_params = [rate_params for i in range(len(params))] # In TransE etc. 
the rate for predicates' embeddings (that do not get normalized) is rate_params, not rate_embeddings rates_embeddings = [rate_embeddings, rate_params, rate_params] if len(embeds) > 1 else [rate_embeddings] # [rate_embeddings for i in range(len(embeds))] for param, gradient, rate in zip(params + embeds, gradientsparams + gradientsembeds, rates_params + rates_embeddings): if (method == 'SGD'): # SGD U.sgd(param, rate, gradient, updates) elif (method == 'MOMENTUM'): # SGD+MOMENTUM param_previous_update = param_previous_update_map[param] U.momentum(param, rate, decay, gradient, updates, param_previous_update) elif (method == 'ADAGRAD'): # ADAGRAD param_squared_gradients = param_squared_gradients_map[param] U.adagrad(param, rate, epsilon, gradient, updates, param_squared_gradients) elif (method == 'ADADELTA'): # ADADELTA param_squared_gradients = param_squared_gradients_map[param] param_squared_updates = param_squared_updates_map[param] U.adadelta(param, rate, decay, epsilon, gradient, updates, param_squared_gradients, param_squared_updates) elif (method == 'RMSPROP'): # RMSPROP param_squared_gradients = param_squared_gradients_map[param] U.rmsprop(param, rate, decay, max_learning_rate, epsilon, gradient, updates, param_squared_gradients) else: raise ValueError('Unknown method: %s' % (method)) """ Theano function inputs. :input rate_embeddings: learning/decay rate for the embeddings. :input rate_params: learning/decay rate for the parameters. :input inpl: sparse csr matrix representing the indexes of the positive triplet 'left' member, shape=(#examples,N [Embeddings]). :input inpr: sparse csr matrix representing the indexes of the positive triplet 'right' member, shape=(#examples,N [Embeddings]). :input inpo: sparse csr matrix representing the indexes of the positive triplet relation member, shape=(#examples,N [Embeddings]). :input inpln: sparse csr matrix representing the indexes of the negative triplet 'left' member, shape=(#examples,N [Embeddings]). :input inprn: sparse csr matrix representing the indexes of the negative triplet 'right' member, shape=(#examples,N [Embeddings]). :input inpon: sparse csr matrix representing the indexes of the negative triplet relation member, shape=(#examples,N [Embeddings]). Theano function output. :output mean(cost): average cost. :output mean(out): ratio of examples for which the margin is violated, i.e. for which an update occurs. """ return theano.function([rate_embeddings, rate_params, inpl, inpr, inpo, inpln, inprn, inpon], [T.mean(cost), T.mean(out)], updates=updates, on_unused_input='ignore')
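Each branch of the dispatch above delegates to an update helper in `U`. As one example, the Adagrad case accumulates squared gradients per parameter and shrinks the step size accordingly; a minimal NumPy sketch of that standard rule (the actual `U.adagrad` signature and details live elsewhere in the codebase):

import numpy as np

def adagrad_step(param, grad, sq_grad_acc, rate=0.1, epsilon=1e-6):
    """One Adagrad update: accumulate squared gradients, shrink the step."""
    sq_grad_acc += grad ** 2
    param -= rate * grad / np.sqrt(sq_grad_acc + epsilon)
    return param, sq_grad_acc

w, acc = np.zeros(3), np.zeros(3)
for g in ([1.0, -2.0, 0.5], [0.5, -1.0, 0.5]):
    w, acc = adagrad_step(w, np.asarray(g), acc)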
def ForwardFn1Member(fnsim, embeddings, leftop, rightop, marge=1.0, rel=True): """ This function returns a theano function to perform a forward step, contrasting positive and negative triplets. members are given as sparse matrices. For one positive triplet there are two or three (if rel == True) negative triplets. To create a negative triplet we replace only one member at a time. :param fnsim: similarity function (on theano variables). :param embeddings: an embeddings instance. :param leftop: class for the 'left' operator. :param rightop: class for the 'right' operator. :param marge: marge for the cost function. :param rel: boolean, if true we also contrast w.r.t. a negative relation member. :note: this is useful for W_SABIE [Weston et al., IJCAI 2011] """ embedding, relationl, relationr = parse_embeddings(embeddings) # inputs inpr = S.csr_matrix() inpl = S.csr_matrix() inpo = S.csr_matrix() inpln = S.csr_matrix() inprn = S.csr_matrix() # graph lhs = S.dot(embedding.E, inpl).T rhs = S.dot(embedding.E, inpr).T rell = S.dot(relationl.E, inpo).T relr = S.dot(relationr.E, inpo).T lhsn = S.dot(embedding.E, inpln).T rhsn = S.dot(embedding.E, inprn).T simi = fnsim(leftop(lhs, rell), rightop(rhs, relr)) similn = fnsim(leftop(lhsn, rell), rightop(rhs, relr)) simirn = fnsim(leftop(lhs, rell), rightop(rhsn, relr)) costl, outl = margincost(simi, similn, marge) costr, outr = margincost(simi, simirn, marge) list_in = [inpl, inpr, inpo, inpln] list_out = [outl, outr] if rel: inpon = S.csr_matrix() relln = S.dot(relationl.E, inpon).T relrn = S.dot(relationr.E, inpon).T simion = fnsim(leftop(lhs, relln), rightop(rhs, relrn)) costo, outo = margincost(simi, simion, marge) out = T.concatenate([outl, outr, outo]) list_in += [inpon] list_out += [outo] """ Theano function inputs. :input inpl: sparse csr matrix representing the indexes of the positive triplet 'left' member, shape=(#examples,N [Embeddings]). :input inpr: sparse csr matrix representing the indexes of the positive triplet 'right' member, shape=(#examples,N [Embeddings]). :input inpo: sparse csr matrix representing the indexes of the positive triplet relation member, shape=(#examples,N [Embeddings]). :input inpln: sparse csr matrix representing the indexes of the negative triplet 'left' member, shape=(#examples,N [Embeddings]). :input inprn: sparse csr matrix representing the indexes of the negative triplet 'right' member, shape=(#examples,N [Embeddings]). :opt input inpon: sparse csr matrix representing the indexes of the negative triplet relation member, shape=(#examples,N [Embeddings]). Theano function output. :output outl: binary vector representing when the margin is violated, i.e. when an update occurs, for the 'left' member. :output outr: binary vector representing when the margin is violated, i.e. when an update occurs, for the 'right' member. :opt output outo: binary vector representing when the margin is violated, i.e. when an update occurs, for the relation member. """ return theano.function(list_in, list_out, on_unused_input='ignore')
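`margincost` is defined elsewhere in the codebase; the contrastive hinge it computes is presumably the standard one below, returning both the summed cost and a per-example violation indicator that feeds the `outl`/`outr`/`outo` outputs above (a sketch, not the verbatim implementation):

import theano.tensor as T

def margincost_sketch(pos, neg, marge=1.0):
    # standard contrastive hinge: penalize negative triplets that score
    # within `marge` of the positive one; `out` flags where an update occurs
    out = neg - pos + marge
    return T.sum(out * (out > 0)), out > 0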
def solve(self, y1=None, s=None, baseline=None, y1_guess=None, s_guess=None, baseline_guess=None, niter=100, T=1.0, dlogT=-0.25, optimizer="NAdam", dcf=10.0, quiet=False, **kwargs): """Solve the Doppler imaging problem. Returns: ``(loss, cho_y1, cho_s)``, a tuple containing the array of loss values during the optimization and the Cholesky factorization of the covariance matrices of ``y1`` and ``s``, if available (otherwise the latter two are set to ``None``.) """ # Check the optimizer is valid if optimizer.lower() == "nadam": optimizer = NAdam elif optimizer.lower() == "adam": optimizer = Adam else: raise ValueError("Invalid optimizer.") # Figure out what to solve for known = [] if s is not None: known += ["s"] if y1 is not None: known += ["y1"] if ("y1" in known) and ("s" in known): # Nothing to do here but ingest the values! self.y1 = y1 self.s = s return self.loss(), None, None elif "y1" in known: # Easy: it's a linear problem self.y1 = y1 cho_s = self.compute_s() return self.loss(), None, cho_s else: if ("s" in known) and (baseline is not None): # Still a linear problem! self.s = s cho_y1 = self.compute_y1(baseline=baseline) return self.loss(), cho_y1, None else: # Non-linear. Let's use (N)Adam. if "s" in known: # We know `s` and need to solve for # `y1` w/o any baseline knowledge. s_guess = s else: # We know *nothing*! # Estimate `v^T` from the deconvolved mean spectrum if s_guess is None: fmean = np.mean(self.F, axis=0) fmean -= np.mean(fmean) diagonals = np.tile(self.kT()[0].reshape(-1, 1), self.K) offsets = np.arange(self.W) A = diags(diagonals, offsets, (self.K, self.Kp), format="csr") LInv = (dcf**2 * self.ferr**2 / self.s_sig**2 * np.eye(A.shape[1])) s_guess = 1.0 + np.linalg.solve( A.T.dot(A).toarray() + LInv, A.T.dot(fmean)) # Save this for later self.s_deconv = s_guess # Estimate `y1` w/o baseline knowledge # If `baseline_guess` is `None`, this is done via # a Taylor expansion; see ``compute_y1()``. if y1_guess is None: self.s = s_guess self.compute_y1(T=T, baseline=baseline_guess) y1_guess = self.y1 # Initialize the variables self.y1 = y1_guess self.s = s_guess # Tempering params if T > 1.0: T_arr = 10**np.arange(np.log10(T), 0, dlogT) T_arr = np.append(T_arr, [1.0]) niter_bilin = len(T_arr) else: T_arr = [1.0] niter_bilin = 1 # Loss array loss_val = np.zeros(niter_bilin + niter + 1) loss_val[0] = self.loss() # Iterative bi-linear solve if niter_bilin > 0: if not quiet: print("Running bi-linear solver...") best_loss = loss_val[0] best_y1 = self.y1 best_s = self.s for n in tqdm(range(niter_bilin), disable=quiet): # Compute `y1` using the previous baseline self.compute_y1(T=T_arr[n], baseline=self.baseline()) # Compute `s` using the current `y1` if "s" not in known: self.compute_s(T=T_arr[n]) loss_val[n + 1] = self.loss() if loss_val[n + 1] < best_loss: best_loss = loss_val[n + 1] best_y1 = self.y1 best_s = self.s self.y1 = best_y1 self.s = best_s # Non-linear solve if niter > 0: # Theano nonlienar solve. 
Variables: y1 = theano.shared(self.y1) s = theano.shared(self.s) if "s" in known: theano_vars = [y1] else: theano_vars = [y1, s] # Compute the model D = ts.as_sparse_variable(self.D()) a = tt.reshape( tt.dot( tt.reshape(tt.concatenate([[1.0], y1]), (-1, 1)), tt.reshape(s, (1, -1)), ), (-1, ), ) b = tt.dot( self._map.design_matrix(theta=self.theta), tt.reshape(tt.concatenate([[1.0], y1]), (-1, 1)), ) B = tt.reshape(b, (-1, 1)) M = tt.reshape(ts.dot(D, a), (self.M, -1)) / B # Compute the loss r = tt.reshape(self.F - M, (-1, )) cov = tt.reshape(self._F_CInv, (-1, )) lnlike = -0.5 * tt.sum(r**2 * cov) lnprior = (-0.5 * tt.sum( (y1 - self.y1_mu)**2 / self.y1_sig**2) + -0.5 * tt.sum( (b - self.baseline_mu)**2 / self.baseline_sig**2) + -0.5 * tt.dot( tt.dot( tt.reshape((s - self.s_mu), (1, -1)), self._s_CInv, ), tt.reshape((s - self.s_mu), (-1, 1)), )[0, 0]) loss = -(lnlike + lnprior) best_loss = loss.eval() best_y1 = y1.eval() best_s = s.eval() if not quiet: print("Running non-linear solver...") upd = optimizer(loss, theano_vars, **kwargs) train = theano.function([], [y1, s, loss], updates=upd) for n in tqdm(1 + niter_bilin + np.arange(niter), disable=quiet): y1_val, s_val, loss_val[n] = train() if loss_val[n] < best_loss: best_loss = loss_val[n] best_y1 = y1_val best_s = s_val # We are done! self.y1 = best_y1 self.s = best_s # Estimate the covariance of `y1` conditioned on `s` # and the covariance of `s` conditioned on `y1`. # Note that the covariance of `y1` is computed from # the linearization that allows us to simultaneously # solve for the baseline. y1_curr = np.array(self.y1) cho_y1 = self.compute_y1() self.y1 = y1_curr if "s" not in known: s_curr = np.array(self.s) cho_s = self.compute_s() self.s = s_curr else: cho_s = None return loss_val, cho_y1, cho_s
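The `s_deconv` initial guess constructed in the "we know nothing" branch above is a Tikhonov-regularized least-squares deconvolution: the convolution kernel is written as a banded design matrix `A` and solved against the mean spectrum with an L2 penalty. A small self-contained NumPy/SciPy sketch of the same pattern (toy kernel, sizes, and regularization, not the class's actual `kT`, `ferr`, or `s_sig` values):

import numpy as np
from scipy.sparse import diags

# Toy banded "convolution" design matrix and a toy mean spectrum
K, Kp = 8, 10
kernel = np.array([0.25, 0.5, 0.25])
A = diags([np.full(K, w) for w in kernel], offsets=[0, 1, 2],
          shape=(K, Kp), format="csr")
fmean = np.random.RandomState(0).normal(size=K)

# Ridge (Tikhonov) solve: (A^T A + lam * I) s = A^T fmean
lam = 1e-2
s_guess = 1.0 + np.linalg.solve(A.T.dot(A).toarray() + lam * np.eye(Kp),
                                A.T.dot(fmean))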