Example No. 1
    def train_tissue_specific_genome_only(self):
        for i in range(self.num_tissues):
            # train tissue-specific model
            beta = lr.sgd(self.train_list[i][self.genomic_features].values,
                          self.train_list[i]["expr_label"].values,
                          np.zeros(len(self.genomic_features)),
                          np.zeros(len(self.genomic_features)), 1.0)
            # store p(z = 1 | g) for the training and test sets
            self.train_list[i]["tissue specific genome only"] = np.exp(
                lr.log_prob(self.train_list[i][self.genomic_features].values,
                            beta))
            self.test_list[i]["tissue specific genome only"] = np.exp(
                lr.log_prob(self.test_list[i][self.genomic_features].values,
                            beta))
Example No. 2
    def train_shared_tissue_genome_only(self):

        # train a single model shared across all tissues using median expression labels
        beta = lr.sgd(self.train_list[0][self.genomic_features].values,
                      self.train_list[0]["median_expr_label"].values,
                      np.zeros(len(self.genomic_features)),
                      np.zeros(len(self.genomic_features)), 1.0)

        for i in range(self.num_tissues):
            self.train_list[i]["shared tissue genome only"] = np.exp(
                lr.log_prob(self.train_list[i][self.genomic_features].values,
                            beta))
            self.test_list[i]["shared tissue genome only"] = np.exp(
                lr.log_prob(self.test_list[i][self.genomic_features].values,
                            beta))
Example No. 3
def computeLikelihood(self):
    ll = self.log_p_beta()
    # P(beta^c | beta)
    for i in range(self.num_tissues):
        ll += self.log_p_beta_child_given_beta(i)
    for i in range(self.num_tissues):
        try:
            # log P(Z = 1 | G) and log P(Z = 0 | G)
            log_prob_z_1_g = lr.log_prob(
                self.train_list[i][self.genomic_features], self.getBetaLeaf(i))
            log_prob_z_0_g = np.log(1.0 - np.exp(log_prob_z_1_g))

            # log P(E | Z = 1) + log P(Z = 1 | G)
            log_prob_e_z_1 = nb.log_prob(self.train_list[i]['expr_label'], 1,
                                         self.phi)
            b = log_prob_e_z_1 + log_prob_z_1_g
        except Exception:
            continue
        # log P(E | Z = 0) + log P(Z = 0 | G)
        a = nb.log_prob(self.train_list[i]['expr_label'], 0,
                        self.phi) + log_prob_z_0_g
        # log sum exp trick
        s = np.maximum(a, b)
        unnormalized_prob = s + np.log(np.exp(a - s) + np.exp(b - s))
        ll_tissue = np.nansum(unnormalized_prob)
        ll += ll_tissue
    return ll
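The manual log-sum-exp step above is equivalent to numpy's built-in np.logaddexp, which computes log(exp(a) + exp(b)) without overflow or underflow. A minimal sketch of the same per-tissue term, assuming a and b are defined as in the snippet:

import numpy as np

# equivalent to: s = np.maximum(a, b); s + np.log(np.exp(a - s) + np.exp(b - s))
unnormalized_prob = np.logaddexp(a, b)
ll_tissue = np.nansum(unnormalized_prob)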
Example No. 4
def _RIVER_likelihood(e, g, beta, phi):
    # log p(z = 1 | g)
    log_p_z_1_given_g = lr.log_prob(g, beta)
    # log p(z = 0 | g)
    log_p_z_0_given_g = np.log(1.0 - np.exp(log_p_z_1_given_g))
    # log p(e | z = 1)
    log_p_e_given_z_1 = nb.log_prob(e, 1, phi)
    # log p(e | z = 0)
    log_p_e_given_z_0 = nb.log_prob(e, 0, phi)
    # log p(e | g) = log sum_z p(e | z) p(z | g), via the log-sum-exp trick
    # (same pattern as computeLikelihood above)
    x_0 = log_p_e_given_z_0 + log_p_z_0_given_g
    x_1 = log_p_e_given_z_1 + log_p_z_1_given_g
    m = np.maximum(x_0, x_1)
    log_p_e_given_g = m + np.log(np.exp(x_0 - m) + np.exp(x_1 - m))
    return np.nansum(log_p_e_given_g)
Example No. 5
    def _cross_validate(self, G, E):
        '''
            K-fold cross-validate beta MAP estimation to find the optimal lambda
            :param G: genomic features
            :param E: expression labels
            :return: the lambda from the candidate set with the highest mean validation AUC
        '''
        # lambda set
        lambda_set = np.array([
            1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5,
            1e6
        ])
        # initialize beta to zero
        beta_init = np.zeros(len(self.genomic_features))
        # AUC scores for each lambda and each fold
        scores_list = np.zeros((len(lambda_set), self.num_folds))
        # for each fold
        for k in range(self.num_folds):
            # access training data (everything but Kth fold)
            training = np.array(
                [x for i, x in enumerate(G) if i % self.num_folds != k])
            training_labels = np.array(
                [x for i, x in enumerate(E) if i % self.num_folds != k])
            # access validation data (Kth fold)
            validation = np.array(
                [x for i, x in enumerate(G) if i % self.num_folds == k])
            validation_labels = np.array(
                [x for i, x in enumerate(E) if i % self.num_folds == k])
            # for each possible lambda
            for i in range(len(lambda_set)):
                # train a logistic regression model
                beta = lr.sgd(training, training_labels, beta_init, beta_init,
                              float(lambda_set[i]))
                # compute predictions on validation set
                scores = lr.log_prob(validation, beta).reshape(-1)
                # compute auc using predictions and validation set labels
                auc = sklearn.metrics.roc_auc_score(validation_labels, scores)
                scores_list[i][k] = auc
        # average across all folds for each lambda
        lambda_averages = np.mean(scores_list, axis=1)
        # sanity check
        assert len(lambda_averages) == len(lambda_set)
        optimal_lambda = lambda_set[np.argmax(lambda_averages)]
        return optimal_lambda
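A sketch of how the selected lambda might be used downstream, assuming the same lr.sgd call pattern as in the snippet; this refit step is hypothetical, not part of the original code:

# hypothetical follow-up inside the same class: refit on all of G and E
# with the cross-validated regularization strength
optimal_lambda = self._cross_validate(G, E)
beta_init = np.zeros(len(self.genomic_features))
beta = lr.sgd(G, E, beta_init, beta_init, float(optimal_lambda))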
Example No. 6
    def eStepLocal(self, i, data, beta, phi):
        '''
            Compute p(z | ...) for tissue i

            i : int
                tissue index

            data : pandas DataFrame
                core data structure containing genomic features, expression labels, and updated posteriors

            beta : numpy array : 1 x M
                coefficients for genomic features

            phi : numpy array
                either a 2 x 2 array for the categorical distribution or a 1 x 2 array for the noisy-OR model
        '''
        # log p(z | g)
        log_prob_z_1_given_g = lr.log_prob(data[self.genomic_features].values,
                                           beta)
        log_prob_z_0_given_g = np.log(1.0 - np.exp(log_prob_z_1_given_g))

        # log p(e | z, q)
        if self.e_distribution == 'noisyor':
            # noisy OR
            log_prob_e_given_z_1 = nb.log_prob_noisyor_2_params(
                data['expr_label'], 1, data["eqtl"], phi)
            log_prob_e_given_z_0 = nb.log_prob_noisyor_2_params(
                data['expr_label'], 0, data["eqtl"], phi)
        # log p(e | z)
        else:
            # naive bayes
            log_prob_e_given_z_1 = nb.log_prob(data['expr_label'].values, 1,
                                               self.phi)
            log_prob_e_given_z_0 = nb.log_prob(data['expr_label'].values, 0,
                                               self.phi)

        # q = p(e | z = 1) * p(z = 1 | g) / (\sum_{s} p(e | z = s) * p(z = s | g))
        log_q = log_prob_e_given_z_1 + log_prob_z_1_given_g - np.log(
            np.exp(log_prob_e_given_z_0) * np.exp(log_prob_z_0_given_g) +
            np.exp(log_prob_e_given_z_1) * np.exp(log_prob_z_1_given_g))

        return np.exp(log_q)
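The denominator of log_q exponentiates both joint terms and then takes a log, which can underflow when the log-probabilities are very negative. A numerically equivalent and more stable variant of the same computation (a sketch reusing the variable names from this snippet):

# log(p(e|z=0) p(z=0|g) + p(e|z=1) p(z=1|g)) via numpy's stable log-add-exp
log_denominator = np.logaddexp(log_prob_e_given_z_0 + log_prob_z_0_given_g,
                               log_prob_e_given_z_1 + log_prob_z_1_given_g)
log_q = log_prob_e_given_z_1 + log_prob_z_1_given_g - log_denominator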
Example No. 7
    def eStepLocalTest(self, i, beta, phi):
        '''
            Compute the posterior p(z = 1 | e, g) for tissue i on the test set
        '''
        # log P(Z = 1 | G)
        log_prob_z_1_given_g = lr.log_prob(
            self.test_list[i][self.genomic_features].values, beta)

        # log P(Z = 0 | G)
        log_prob_z_0_given_g = np.log(1.0 - np.exp(log_prob_z_1_given_g))

        # log P(E | Z = 1)
        log_prob_e_given_z_1 = nb.log_prob(
            self.test_list[i][self.label].values, 1, phi)
        # log P(E | Z = 0)
        log_prob_e_given_z_0 = nb.log_prob(
            self.test_list[i][self.label].values, 0, phi)
        log_q = log_prob_e_given_z_1 + log_prob_z_1_given_g - np.log(
            np.exp(log_prob_e_given_z_0) * np.exp(log_prob_z_0_given_g) +
            np.exp(log_prob_e_given_z_1) * np.exp(log_prob_z_1_given_g))

        return np.exp(log_q)
Example No. 8
	def _compute_p_z_given_g(self, beta, g):
		'''
			P(z | g; beta)
		'''
		return np.exp(lr.log_prob(g, beta))