Example #1
import numpy as np
from pymc import MCMC, Beta, Bernoulli, deterministic

def estimate_failures(samples,                # S x N array of labels from N noisy labelers
                      n_samples=10000,        # number of MCMC iterations
                      burn=None,              # burn-in; defaults to n_samples // 2
                      thin=10,                # thinning rate: keep every thin-th draw
                      alpha_p=1, beta_p=1,    # Beta prior parameters for the true positive rate
                      alpha_e=1, beta_e=10    # Beta prior parameters for the labelers' error rates
                      ):

  if burn is None:
    burn = n_samples // 2

  S, N = samples.shape
  p = Beta('p', alpha=alpha_p, beta=beta_p)       # prior probability that a true label is 1
  l = Bernoulli('l', p=p, size=S)                 # latent true labels
  e_pos = Beta('e_pos', alpha_e, beta_e, size=N)  # per-labeler error rate when the true label is 1
  e_neg = Beta('e_neg', alpha_e, beta_e, size=N)  # per-labeler error rate when the true label is 0

  @deterministic(plot=False)
  def noise_rate(l=l, e_pos=e_pos, e_neg=e_neg):
    #probability that each noisy labeler reports label 1 for each sample
    return np.outer(l, 1-e_pos) + np.outer(1-l, e_neg)

  noisy_label = Bernoulli('noisy_label', p=noise_rate, size=samples.shape, value=samples, observed=True)
  variables = [l, e_pos, e_neg, p, noisy_label, noise_rate]
  model = MCMC(variables, verbose=3)
  model.sample(iter=n_samples, burn=burn, thin=thin)
  model.write_csv('out.csv', ['p', 'e_pos', 'e_neg'])
  p = np.median(model.trace('p')[:])
  e_pos = np.median(model.trace('e_pos')[:], axis=0)
  e_neg = np.median(model.trace('e_neg')[:], axis=0)
  return p, e_pos, e_neg
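
A minimal usage sketch, assuming the PyMC 2 imports above; the simulated data and rates below are illustrative, not from the original source:

np.random.seed(0)
S, N = 500, 3                                    # 500 items, 3 noisy labelers
true_labels = (np.random.rand(S) < 0.3).astype(int)
flips = np.random.rand(S, N) < 0.1               # each labeler flips ~10% of labels
noisy = np.where(flips, 1 - true_labels[:, None], true_labels[:, None])
p_hat, e_pos_hat, e_neg_hat = estimate_failures(noisy)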
Example #2
import numpy as np
import pymc
from pymc import Uniform, Bernoulli

def Model_twostage_fit_v2(n_TF, n_gene, p_gene_array, p_TF_gene_array, num_iter, num_burn, num_thin, prior_T, prior_T_method, r_TF_gene, a_TF_gene_h1, a_TF_gene_h0, a_gene):
    """
    Assumptions: hyperparameters passed as the string 'None' are learned; otherwise they are fixed.
    """
    a_gp = float(a_gene)
    if a_TF_gene_h0 == 'None':
        a_tg0 = Uniform('a_tg0', lower=0.5, upper=1)
    else:
        a_tg0 = float(a_TF_gene_h0)
    if a_TF_gene_h1 == 'None':
        a_tg1 = Uniform('a_tg1', lower=0, upper=0.5)
    else:
        a_tg1 = float(a_TF_gene_h1)
    p_T = float(prior_T)

    if r_TF_gene == 'None':
        r_tg = Uniform('r_tg', lower=0, upper=1)
    else:
        r_tg = float(r_TF_gene)
    p_gene = np.zeros(n_gene, dtype=object)            #the observed gene p-values
    T = np.zeros((n_TF, n_gene), dtype=object)         #indicators of TF-gene-phenotype relationships
    T_sum = np.zeros(n_gene, dtype=object)
    p_TF_gene = np.zeros((n_TF, n_gene), dtype=object) #p-values of gene-TF correlations
    for j in range(n_gene):
        for i in range(n_TF):
            T[i, j] = Bernoulli('T_%i_%i' %(i, j), p=p_T)
            
            #If T[i, j] = 1: p_TF_gene comes from Beta(a1, 1); if 0, from a mixture of the two betas (r is the mixture weight)
            @pymc.stochastic(name='p_TF_gene_%i_%i' %(i, j), dtype=float, observed=True)
            def temp_p_TF_gene(value=p_TF_gene_array[i, j], TF_gene_ind=T[i, j], a0=a_tg0, a1=a_tg1, r=r_tg):
                if TF_gene_ind:
                    out = pymc.distributions.beta_like(value, alpha=a1, beta=1)
                else:
                    out = np.log(r * np.exp(pymc.distributions.beta_like(value, alpha=a1, beta=1))
                                + (1 - r) * np.exp(pymc.distributions.beta_like(value, alpha=a0, beta=1)))
                return out
            p_TF_gene[i, j] = temp_p_TF_gene
            
        #deterministic indicator: 1 if any TF is linked to gene j
        @pymc.deterministic(name='T_sum_%i' %j, plot=False)
        def temp_T_sum(ind_vec=T[:,j]):
            return (np.sum(ind_vec) > 0)
        T_sum[j] = temp_T_sum

        #If T_sum[j] == 0: p_gene comes from a uniform; else, from Beta(a, 1)
        @pymc.stochastic(name='p_gene_%i' %j, dtype=float, observed=True)
        def temp_p_gene(value=p_gene_array[j], ind=T_sum[j], a=a_gp):
            if ind:
                out = pymc.distributions.beta_like(value, alpha=a, beta=1)
            else:
                out = pymc.distributions.uniform_like(value, 0, 1)
            return out
        p_gene[j] = temp_p_gene
    if a_gene == 'None' and a_TF_gene_h0 == 'None' and a_TF_gene_h1 == 'None':
        M5 = pymc.MCMC([T, T_sum, a_gp, a_tg0, a_tg1])
    else:
        M5 = pymc.MCMC([T, T_sum])
    M5.sample(iter=int(num_iter), burn=int(num_burn), thin=int(num_thin))
    return M5
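
A hedged usage sketch with synthetic inputs; the array shapes and hyperparameter values below are illustrative assumptions, not from the original source:

n_TF, n_gene = 2, 5
M5 = Model_twostage_fit_v2(n_TF, n_gene,
                           p_gene_array=np.random.rand(n_gene),
                           p_TF_gene_array=np.random.rand(n_TF, n_gene),
                           num_iter=2000, num_burn=1000, num_thin=2,
                           prior_T=0.1, prior_T_method='fixed',
                           r_TF_gene='None', a_TF_gene_h1='None',
                           a_TF_gene_h0='None', a_gene=0.3)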
Example #3
    def set_models(self):
        """Define models for each group.

        :return: None
        """
        for group in ['control', 'variant']:
            self.stochastics[group] = Bernoulli(group,
                                                self.stochastics[group + '_p'],
                                                value=getattr(self, group),
                                                observed=True)
print "num_pathways:", len(pathways)
print "num_features:", len(features)
print "num_evidence:", len(evidence)
print "num_metfrag: ", len(metfrag_evidence)
import math
import pymc
from pymc import Gamma, Bernoulli

rate_prior = 0.5

#eps = Beta('eps', 0.005, 1)
eps = 0.0001
ap =  {p : Gamma('p_' + p, rate_prior, 1) for p in pathways}
bmp = {p : {feat : Gamma('b_{' + p + ',' + feat + '}', ap[p],1) for feat in path_dict[p]} for p in pathways}
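# hierarchy: each pathway gets an activity level ap[p] ~ Gamma(rate_prior, 1),
# and each feature in the pathway a contribution bmp[p][feat] ~ Gamma(ap[p], 1);
# a feature's total rate is the sum of its contributions across pathways (y_bmp below)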
y_bmp = {}
g = {}

def logp_f(f, b, eps):
    if f in evidence:
        return math.log(1 - math.e ** (-1 * b) + eps)
    if f in metfrag_evidence:
        a_p = (1.0 / (1 - metfrag_evidence[f])) - 1
        return a_p * math.log(1 - math.e ** (-1 * b) + eps) - b
    return math.log(eps) - b
psi = {}
for feat, pathways in reverse_path_dict.iteritems():
    y_bmp[feat] = sum([bmp[pname][feat] for pname in pathways])
    g[feat] = Bernoulli('g_' + feat, 1 - math.e ** (-y_bmp[feat]))
    psi[feat] = pymc.Potential(logp = logp_f,
                               name = 'psi_' + feat,
                               parents = {'f' : feat, 'b' : y_bmp[feat], 'eps' : eps},
                               doc = 'hello world potential'
                              )
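
A minimal sketch of assembling these nodes into a sampler (the iteration counts are illustrative assumptions, not from the original source):

model = pymc.MCMC([v for d in (ap, g, psi) for v in d.values()]
                  + [b for d in bmp.values() for b in d.values()])
model.sample(iter=10000, burn=5000, thin=5)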
metfrag_evidence = read.dict_of_set(
    read.metfrag_with_scores(observation_file, keep_zero_scores=False),
    metfrag & features - cofactors - evidence)
evidence = {e: 1 for e in evidence}

rate_prior = 0.5

ap = {p: Gamma('p_' + p, rate_prior, 1) for p in pathways}
bmp = {
    p: {
        feat: Gamma('b_{' + p + ',' + feat + '}', ap[p], 1)
        for feat in path_dict[p]
    }
    for p in pathways
}
y_bmp = {}
virtual = {}

se_count = 0
for feat, pathways in reverse_path_dict.iteritems():
    #g_bmp[feat] = Poisson('g_' + feat, sum([bmp[pname][feat] for pname in pathways]))
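    # noisy-OR link: the feature is present with probability 1 - exp(-summed rate)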
    y_bmp[feat] = Bernoulli(
        'y_' + feat,
        1 - math.e**-sum([bmp[pname][feat] for pname in pathways]))
#    if feat in evidence:
#        virtual[feat] = Bernoulli('ve_' + feat, ONE if (g_bmp[feat] != 0) else ZERO, value = 1, observed = True)
#    elif feat in metfrag_evidence:
#        se_count += 1
#        e = metfrag_evidence[feat]
#        virtual[feat] = Bernoulli('vs_' + feat, e if (g_bmp[feat] != 0) else 1 - e, value = 1, observed = True)
evidence &= features
reverse_path_dict = read.reverse_dict(path_dict)
metfrag = read.metfrag(observation_file)
metfrag_evidence = read.dict_of_set(
    read.metfrag_with_scores(observation_file, keep_zero_scores=False),
    metfrag & features - cofactors - evidence)
evidence = {e: one for e in evidence}
evidence.update(metfrag_evidence)
features = list(features)
print 'C05381' in evidence
print evidence['C05381']
pi = 0.1

#l = [Beta('lambda_'+p, alpha = 1, beta = 1, value = 0.5) for p in pathways]
l = 0.5
a_ps = [Bernoulli(path, p=l) for i, path in enumerate(pathways)]
#a_ps = [Bernoulli(path, p = l[i]) for i, path in enumerate(pathways)]

O = {}
for i, p in enumerate(pathways):
    O[p] = {}
    # bind a_ps[i] via a default argument so each lambda captures its own indicator
    active_path = (lambda x=a_ps[i]: u[1] if x else u[0])
    u_ap = Lambda('u_ap' + str(i), active_path)
    for f in path_dict[p]:
        O[p][f] = (Bernoulli('o_{p=' + p + ',f=' + f + '}', p=u_ap), u_ap)


def is_present(f_id, parents):
    """ Calculates y_f, the probability that a features appears in our sample.
    Args:
        f_id (int): feature id
        O (dict): O is a dict of dicts representing probability of each feature
Example #7
def __init__(self, G=cycle_graph(9), beta=0.0):
    self.G, self.beta = G, beta
    self.x = [Bernoulli(str(v), 0.5, value=0) for v in G.nodes_iter()]
    self.psi = [self.IndepSetPotential(v, G[v]) for v in G.nodes_iter()]
    MCMC.__init__(self, [self.x, self.psi])
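
The IndepSetPotential helper is not shown in this snippet. A minimal sketch of how it might look inside the enclosing MCMC subclass, assuming the potential softly penalizes a vertex being selected together with any neighbor (the penalty form and the use of beta as its weight are assumptions):

from networkx import cycle_graph
from pymc import MCMC, Bernoulli, Potential

def IndepSetPotential(self, v, neighbors):
    def potential_logp(x_v, x_n):
        # pay a log-penalty of beta whenever v and any neighbor are both 1
        return -self.beta * (x_v and any(x_n))
    return Potential(logp=potential_logp,
                     name='psi_' + str(v),
                     parents={'x_v': self.x[v],
                              'x_n': [self.x[u] for u in neighbors]},
                     doc='independent-set penalty at vertex ' + str(v))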
Example #8
from numpy import array, random
from pymc import MCMC, Matplot, Beta, Bernoulli, Lambda, Poisson, Uniform, deterministic, logp_of_set, logp_gradient_of_set

n = 100000
theta = 2
pi = 0.4
y = [(random.random() < pi) * random.poisson(theta) for i in range(n)]


def remcache(s):
    # disable value caching and rebuild the node's lazy logp function
    s._cache_depth = 0
    s.gen_lazy_function()


p = Beta('p', 1, 1)

z = Bernoulli('z', p, value=array(y) > 0, plot=False)

theta_hat = Uniform('theta_hat', 0, 100, value=3)

t = z * theta
counts = Poisson('counts', t, value=y, observed=True)
model = [p, z, theta_hat, counts]

#disable caching for all the nodes
v = model + [t]
for s in v:
    remcache(s)


def pymc_logp():
    return logp_of_set(model)
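
A usage sketch for the benchmark; timing with the standard-library timeit is an assumption, since the original snippet only defines pymc_logp:

import timeit
print timeit.timeit(pymc_logp, number=1000)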
Example #9
from pymc import Bernoulli, Lambda
import pymc
import numpy as np
import parser  # project-local data module (provides pathways/features), not the stdlib parser

u = [0.01, 0.8]
l = 0.5
O = {}
pathways = parser.pathways()
features = parser.features()
detected = parser.detected_features()
evidence = parser.evidence()

a_ps = [Bernoulli('a_' + str(i), p=l) for i in xrange(len(pathways))]

for i, p in enumerate(pathways):
    O[i] = {}
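    # bind a_ps[i] via a default argument so each lambda captures its own indicator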
    active_path = (lambda x=a_ps[i]: u[1] if x else u[0])
    u_ap = Lambda('u_ap' + str(i), active_path)
    for f in p.mets:
        O[i][f] = (Bernoulli('o_{p=' + str(i) + ',f=' + str(f) + '}',
                             p=u_ap), u_ap)


def is_present(f_id, O):
    """ Calculates y_f, the probability that a features appears in our sample.
    Args:
        f_id (int): feature id
        O (dict): O is a dict of dicts representing probability of each feature
            in each pathway
    Returns: