Example No. 1
 def get_log_likelihood(self, observation):
     if len(observation) != 4:
         raise ValueError('expected the observation to be a vector of four integers')
     n = sum(observation)
     accum = 0
     accum += StatsUtil.poisson_log_pmf(n, self.expected_coverage)
     accum += StatsUtil.multinomial_log_pmf(self.distribution, observation)
     return accum
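Several of these examples lean on StatsUtil log-pmf helpers that are not shown. A minimal sketch of what the two calls above presumably compute, under the conventional definitions (an assumption about StatsUtil, not its actual source):

import math

def poisson_log_pmf(n, expectation):
    # log of: exp(-lam) * lam**n / n!
    return -expectation + n * math.log(expectation) - math.lgamma(n + 1)

def multinomial_log_pmf(distribution, counts):
    # log of: (n! / prod(k_i!)) * prod(p_i**k_i)
    n = sum(counts)
    accum = math.lgamma(n + 1)
    for p, k in zip(distribution, counts):
        accum -= math.lgamma(k + 1)
        if k:
            accum += k * math.log(p)
    return accum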
Example No. 2
 def __call__(self, X):
     """
     @return: negative log likelihood
     """
     # unpack the params into a finite distribution
     a, b = X.tolist()
     mutation_rate = StatsUtil.expit(a)
     fitness_ratio = math.exp(b)
     h = 0.5
     v = get_sample_distn(self.N, self.n, mutation_rate, fitness_ratio, h)
     # return the negative log likelihood
     return -StatsUtil.multinomial_log_pmf(v, self.observed_counts)
Example No. 3
 def __call__(self, X):
     """
     @return: negative log likelihood
     """
     # unpack the params into a finite distribution
     a, b = X.tolist()
     mutation_rate = StatsUtil.expit(a)
     fitness_ratio = math.exp(b)
     h = 0.5
     v = get_sample_distn(
             self.N, self.n,
             mutation_rate, fitness_ratio, h)
     # return the negative log likelihood
     return -StatsUtil.multinomial_log_pmf(v, self.observed_counts)
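The two free parameters are encoded so a generic optimizer can search over all of R^2: expit maps an unconstrained real onto a mutation rate in (0, 1), and exp maps one onto a positive fitness ratio. A minimal sketch of expit, assuming StatsUtil implements the standard numerically stable logistic function:

import math

def expit(x):
    # logistic function, arranged so that exp never overflows
    if x >= 0:
        return 1.0 / (1.0 + math.exp(-x))
    e = math.exp(x)
    return e / (1.0 + e)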
Example No. 4
def get_transition_matrix_slow(N_diploid, k, mutation, fit):
    """
    Mutation probabilities are away from a fixed state.
    @param N_diploid: diploid population size
    @param k: number of alleles e.g. 4 for A,C,G,T
    @param mutation: k by k matrix of per-generation mutation probabilities
    @param fit: sequence of k fitness values
    @return: a transition matrix
    """
    N = N_diploid * 2
    states = [tuple(s) for s in gen_states(N,k)]
    nstates = len(states)
    s_to_i = dict((s, i) for i, s in enumerate(states))
    P = np.zeros((nstates, nstates))
    # Add rows corresponding to transitions from population states
    # for which an allele is currently fixed in the population.
    for i in range(k):
        P[i, i] = mutation[i, i]
        for j in range(k):
            if i == j:
                continue
            state = [0]*k
            state[i] = N-1
            state[j] = 1
            P[i, s_to_i[tuple(state)]] = mutation[i, j]
    # Add rows corresponding to transitions from polymorphic population states.
    for i, j in combinations(range(k), 2):
        for h in range(1, N):
            state = [0]*k
            state[i] = h
            state[j] = N-h
            index = s_to_i[tuple(state)]
            # Compute each child's probability of having allele i or j.
            #pi, pj = wrightfisher.genic_diallelic(fit[i], fit[j], h, N-h)
            #s = fit[i] - fit[j]
            s = 1 - fit[j] / fit[i]
            pi, pj = wrightfisher.genic_diallelic(1.0, 1.0 - s, h, N-h)
            # Add entries corresponding to fixation of an allele.
            P[index, i] = math.exp(StatsUtil.binomial_log_pmf(N, N, pi))
            P[index, j] = math.exp(StatsUtil.binomial_log_pmf(0, N, pi))
            # Add entries corresponding to transitions to polymorphic states.
            for hsink in range(1, N):
                sink_state = [0]*k
                sink_state[i] = hsink
                sink_state[j] = N-hsink
                sink_index = s_to_i[tuple(sink_state)]
                logp = StatsUtil.binomial_log_pmf(hsink, N, pi)
                P[index, sink_index] = math.exp(logp)
    return P
Example No. 5
def get_transition_matrix_slow(N_diploid, k, mutation, fit):
    """
    Mutation probabilities are away from a fixed state.
    @param N_diploid: diploid population size
    @param k: number of alleles e.g. 4 for A,C,G,T
    @param mutation: k by k matrix of per-generation mutation probabilities
    @param fit: sequence of k fitness values
    @return: a transition matrix
    """
    N = N_diploid * 2
    states = [tuple(s) for s in gen_states(N, k)]
    nstates = len(states)
    s_to_i = dict((s, i) for i, s in enumerate(states))
    P = np.zeros((nstates, nstates))
    # Add rows corresponding to transitions from population states
    # for which an allele is currently fixed in the population.
    for i in range(k):
        P[i, i] = mutation[i, i]
        for j in range(k):
            if i == j:
                continue
            state = [0] * k
            state[i] = N - 1
            state[j] = 1
            P[i, s_to_i[tuple(state)]] = mutation[i, j]
    # Add rows corresponding to transitions from polymorphic population states.
    for i, j in combinations(range(k), 2):
        for h in range(1, N):
            state = [0] * k
            state[i] = h
            state[j] = N - h
            index = s_to_i[tuple(state)]
            # Compute each child's probability of having allele i or j.
            #pi, pj = wrightfisher.genic_diallelic(fit[i], fit[j], h, N-h)
            #s = fit[i] - fit[j]
            s = 1 - fit[j] / fit[i]
            pi, pj = wrightfisher.genic_diallelic(1.0, 1.0 - s, h, N - h)
            # Add entries corresponding to fixation of an allele.
            P[index, i] = math.exp(StatsUtil.binomial_log_pmf(N, N, pi))
            P[index, j] = math.exp(StatsUtil.binomial_log_pmf(0, N, pi))
            # Add entries corresponding to transitions to polymorphic states.
            for hsink in range(1, N):
                sink_state = [0] * k
                sink_state[i] = hsink
                sink_state[j] = N - hsink
                sink_index = s_to_i[tuple(sink_state)]
                logp = StatsUtil.binomial_log_pmf(hsink, N, pi)
                P[index, sink_index] = math.exp(logp)
    return P
Example No. 6
def get_expected_transitions_binomial(prandom, nstates, nsteps):
    """
    This function is for transition matrices defined by their size and a single parameter.
    Use binomial coefficients to compute transition expectations.
    @param prandom: the probability of randomization at each step
    @param nstates: the number of states in the chain
    @param nsteps: one fewer than the length of the sequence
    @return: (expected_t_same, expected_t_different)
    """
    # handle corner cases
    if not nsteps:
        return 0.0, float('nan')
    if nsteps == 1:
        return 0.0, 1.0
    if not prandom:
        return 0.0, float('nan')
    # precalculate stuff
    p_notrans = prandom / nstates + (1 - prandom)
    p_any_trans = 1.0 - p_notrans
    # precalculate expected probability of each endpoint pair state
    prandom_total = 1 - (1 - prandom)**nsteps
    p_notrans_total = prandom_total / nstates + (1 - prandom_total)
    # initialize expectations
    e_same = 0
    e_different = 0
    # define expectations
    for ntrans in range(nsteps+1):
        log_p_ntrans = StatsUtil.binomial_log_pmf(ntrans, nsteps, p_any_trans)
        p_ntrans = math.exp(log_p_ntrans)
        p_same = (1 - (1 - nstates)**(1 - ntrans))/nstates
        e_same += p_same * p_ntrans * ntrans
        e_different += (1 - p_same) * p_ntrans * ntrans
    e_same /= p_notrans_total
    e_different /= (1 - p_notrans_total)
    return e_same, e_different
Example No. 7
def get_expected_transitions_binomial(prandom, nstates, nsteps):
    """
    This function is for transition matrices defined by their size and a single parameter.
    Use binomial coefficients to compute transition expectations.
    @param prandom: the probability of randomization at each step
    @param nstates: the number of states in the chain
    @param nsteps: one fewer than the length of the sequence
    @return: (expected_t_same, expected_t_different)
    """
    # handle corner cases
    if not nsteps:
        return 0.0, float('nan')
    if nsteps == 1:
        return 0.0, 1.0
    if not prandom:
        return 0.0, float('nan')
    # precalculate stuff
    p_notrans = prandom / nstates + (1 - prandom)
    p_any_trans = 1.0 - p_notrans
    # precalculate expected probability of each endpoint pair state
    prandom_total = 1 - (1 - prandom)**nsteps
    p_notrans_total = prandom_total / nstates + (1 - prandom_total)
    # initialize expectations
    e_same = 0
    e_different = 0
    # define expectations
    for ntrans in range(nsteps + 1):
        log_p_ntrans = StatsUtil.binomial_log_pmf(ntrans, nsteps, p_any_trans)
        p_ntrans = math.exp(log_p_ntrans)
        p_same = (1 - (1 - nstates)**(1 - ntrans)) / nstates
        e_same += p_same * p_ntrans * ntrans
        e_different += (1 - p_same) * p_ntrans * ntrans
    e_same /= p_notrans_total
    e_different /= (1 - p_notrans_total)
    return e_same, e_different
Example No. 8
def get_response_content(fs):
    npop = fs.nB + fs.nb
    nstates = npop + 1
    # Check the complexity;
    # solving a system of linear equations takes about n^3 effort.
    if nstates ** 3 > 1e6:
        raise ValueError('sorry this population size is too large')
    # Compute the transition matrix.
    # This assumes no mutation or selection or recombination.
    # It is pure binomial.
    P = np.zeros((nstates, nstates))
    for i in range(nstates):
        nB_initial = i
        for j in range(nstates):
            nB_final = j
            log_p = StatsUtil.binomial_log_pmf(
                    nB_final, npop, nB_initial / float(npop))
            P[i, j] = math.exp(log_p)
    # Put the puzzle into the form Ax=b
    # so that it can be solved by a generic linear solver.
    A = P - np.eye(nstates)
    b = np.zeros(nstates)
    # Adjust the matrix to disambiguate absorbing states.
    A[0, 0] = 1.0
    A[npop, npop] = 1.0
    b[0] = 0.0
    b[npop] = 1.0
    # Solve Ax=b for x.
    x = linalg.solve(A, b)
    # Print the solution.
    out = StringIO()
    print >> out, 'probability of eventual fixation (as opposed to extinction)'
    print >> out, 'of allele B in the population:'
    print >> out, x[fs.nB]
    return out.getvalue()
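Under pure drift the allele frequency is a martingale, so the eventual fixation probability of B equals its initial frequency. A hedged sanity check of the construction above, rebuilt with scipy.stats.binom so it is self-contained (the population size here is an arbitrary choice):

import numpy as np
from scipy import linalg
from scipy.stats import binom

npop = 10
nstates = npop + 1
P = np.array([binom.pmf(np.arange(nstates), npop, i / float(npop))
              for i in range(nstates)])
A = P - np.eye(nstates)
b = np.zeros(nstates)
A[0, 0] = 1.0        # extinction is absorbing
A[npop, npop] = 1.0  # fixation is absorbing
b[npop] = 1.0
x = linalg.solve(A, b)
assert np.allclose(x, np.arange(nstates) / float(npop))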
Example No. 9
def get_two_allele_distribution(N_big, N_small, f0, f1, f_subsample):
    """
    Assumes small genic selection.
    Assumes small mutation.
    The mutational bias does not affect the distribution.
    @param N_big: total number of alleles in the population
    @param N_small: number of alleles sampled from the population
    @param f0: fitness of allele 0
    @param f1: fitness of allele 1
    @param f_subsample: subsampling function
    @return: distribution over all non-fixed population states
    """
    # construct a transition matrix
    nstates = N_big + 1
    P = np.zeros((nstates, nstates))
    for i in range(nstates):
        p0, p1 = wrightfisher.genic_diallelic(f0, f1, i, N_big - i)
        if i == 0:
            P[i, 1] = 1.0
        elif i == N_big:
            P[i, N_big - 1] = 1.0
        else:
            for j in range(nstates):
                logp = StatsUtil.binomial_log_pmf(j, N_big, p0)
                P[i, j] = math.exp(logp)
    # find the stationary distribution
    v = MatrixUtil.get_stationary_distribution(P)
    MatrixUtil.assert_distribution(v)
    if not np.allclose(v, np.dot(v, P)):
        raise ValueError('expected a left eigenvector with eigenvalue 1')
    # return the stationary distribution conditional on dimorphism
    print v
    distn = f_subsample(v, N_small)
    return distn[1:-1] / np.sum(distn[1:-1])
Example No. 10
 def __call__(self, X):
     """
     @param X: three encoded parameters
     @return: negative log likelihood
     """
     # unpack the params into a finite distribution
     v = params_to_distn(self.M_large, self.M_small, X)
     # return the negative log likelihood
     return -StatsUtil.multinomial_log_pmf(v, self.observed_counts)
Example No. 11
def params_to_distn(M_large, M_small, X):
    # unpack the parameters
    params = X.tolist()
    a, b, c = params
    # decode the parameters
    mut_ratio = math.exp(a)
    fit_ratio = math.exp(b)
    h = StatsUtil.expit(c)
    # get the distribution implied by the parameters
    return get_sample_distn(M_large, M_small, mut_ratio, fit_ratio, h)
Example No. 12
def params_to_distn(M_large, M_small, X):
    # unpack the parameters
    params = X.tolist()
    a, b, c = params
    # decode the parameters
    mut_ratio = math.exp(a)
    fit_ratio = math.exp(b)
    h = StatsUtil.expit(c)
    # get the distribution implied by the parameters
    return get_sample_distn(M_large, M_small, mut_ratio, fit_ratio, h)
Example No. 13
def create_mutation_transition_matrix(npop, mutation_ab, mutation_ba):
    """
    The states are indexed by the number of mutants.
    @param npop: total population size
    @param mutation_ab: wild-type to mutant transition probability
    @param mutation_ba: mutant to wild-type transition probability
    @return: a transition matrix
    """
    StatsUtil.assert_probability(mutation_ab)
    StatsUtil.assert_probability(mutation_ba)
    nstates = npop + 1
    P = np.zeros((nstates, nstates))
    for a in range(nstates):
        for n_mut_to_wild in range(a+1):
            ba_observed_n = n_mut_to_wild
            ba_max_n = a
            ba_p_success = mutation_ba
            ba_log_p = StatsUtil.binomial_log_pmf(
                    ba_observed_n, ba_max_n, ba_p_success)
            for n_wild_to_mut in range(npop - a + 1):
                ab_observed_n = n_wild_to_mut
                ab_max_n = npop - a
                ab_p_success = mutation_ab
                ab_log_p = StatsUtil.binomial_log_pmf(
                        ab_observed_n, ab_max_n, ab_p_success)
                #
                p = math.exp(ba_log_p + ab_log_p)
                b = a + n_wild_to_mut - n_mut_to_wild
                P[a, b] += p
    return P
Example No. 14
def create_mutation_transition_matrix(npop, mutation_ab, mutation_ba):
    """
    The states are indexed by the number of mutants.
    @param npop: total population size
    @param mutation_ab: wild-type to mutant transition probability
    @param mutation_ba: mutant to wild-type transition probability
    @return: a transition matrix
    """
    StatsUtil.assert_probability(mutation_ab)
    StatsUtil.assert_probability(mutation_ba)
    nstates = npop + 1
    P = np.zeros((nstates, nstates))
    for a in range(nstates):
        for n_mut_to_wild in range(a + 1):
            ba_observed_n = n_mut_to_wild
            ba_max_n = a
            ba_p_success = mutation_ba
            ba_log_p = StatsUtil.binomial_log_pmf(ba_observed_n, ba_max_n,
                                                  ba_p_success)
            for n_wild_to_mut in range(npop - a + 1):
                ab_observed_n = n_wild_to_mut
                ab_max_n = npop - a
                ab_p_success = mutation_ab
                ab_log_p = StatsUtil.binomial_log_pmf(ab_observed_n, ab_max_n,
                                                      ab_p_success)
                #
                p = math.exp(ba_log_p + ab_log_p)
                b = a + n_wild_to_mut - n_mut_to_wild
                P[a, b] += p
    return P
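A quick usage sketch (assuming the module-level numpy and StatsUtil imports these examples rely on): each row of the returned matrix is a probability distribution over destination states, which makes a cheap invariant to assert.

P = create_mutation_transition_matrix(10, 0.01, 0.02)
assert P.shape == (11, 11)
assert np.allclose(P.sum(axis=1), 1.0)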
Example No. 15
 def __call__(self, X):
     """
     @return: negative log likelihood
     """
     # unpack the params into a finite distribution
     a, = X.tolist()
     fit_ratio = math.exp(a)
     mut_ratio = 1.0
     h = 0.5
     v = get_sample_distn(self.M_large, self.M_small, mut_ratio, fit_ratio,
                          h)
     # return the negative log likelihood
     return -StatsUtil.multinomial_log_pmf(v, self.observed_counts)
Example No. 16
 def __call__(self, X):
     """
     @return: negative log likelihood
     """
     # unpack the params into a finite distribution
     a, = X.tolist()
     fit_ratio = math.exp(a)
     mut_ratio = 1.0
     h = 0.5
     v = get_sample_distn(
             self.M_large, self.M_small,
             mut_ratio, fit_ratio, h)
     # return the negative log likelihood
     return -StatsUtil.multinomial_log_pmf(v, self.observed_counts)
Example No. 17
def UpdateStats(stats, t0, curr_lp, K, z, c, steps, gt_z, map_z, verbose):
    stats['lp'].append(curr_lp)
    stats['K'].append(K)
    stats['z'].append(z)
    stats['c'].append(c)
    curr_time = time.clock() - t0
    stats['times'].append(curr_time)
    if verbose:
        print('Step: ' + str(steps) + ' Time: ' + str(curr_time) + ' LP: ' +
              str(curr_lp) + ' K: ' + str(K))

    if gt_z.size > 0:
        stats['NMI'].append(StatsUtil.NMI(gt_z, map_z))

    return stats
Example No. 18
 def __call__(self, X):
     """
     @param X: six params defining mutation and selection
     @return: negative log likelihood
     """
     # define the hardcoded number of alleles
     k = 4
     # unpack the params
     params = X.tolist()
     theta, ka, kb, g0, g1, g2 = params
     if any(x < 0 for x in (theta, ka, kb)):
         return float('inf')
     mutation, fitnesses = kaizeng.params_to_mutation_fitness(
             self.N, params)
     # get the transition matrix
     P = kaizeng.get_transition_matrix(self.N, k, mutation, fitnesses)
     v = MatrixUtil.get_stationary_distribution(P)
     return -StatsUtil.multinomial_log_pmf(v, self.observed_counts)
Example No. 19
 def __call__(self, X):
     """
     @param X: six params defining mutation and selection
     @return: negative log likelihood
     """
     # define the hardcoded number of alleles
     k = 4
     # unpack the params
     params = X.tolist()
     theta, ka, kb, g0, g1, g2 = params
     if any(x < 0 for x in (theta, ka, kb)):
         return float('inf')
     mutation, fitnesses = kaizeng.params_to_mutation_fitness(
         self.N, params)
     # get the transition matrix
     P = kaizeng.get_transition_matrix(self.N, k, mutation, fitnesses)
     v = MatrixUtil.get_stationary_distribution(P)
     return -StatsUtil.multinomial_log_pmf(v, self.observed_counts)
Example No. 20
def get_transition_matrix(npop, sAA, sAa):
    """
    Note that saa is 0 by convention.
    @param npop: constant Wright-Fisher population
    @param sAA: a selection value
    @param sAa: a selection value
    @return: a transition matrix
    """
    fitnesses = 1.0 + np.array([sAA, sAa, 0])
    # precompute the index_to_composition and composition_to_index maps.
    compositions = list(gen_population_compositions(npop))
    c_to_i = dict((c, i) for i, c in enumerate(compositions))
    nstates = get_state_space_size(npop)
    if nstates != len(compositions):
        raise ValueError('internal error regarding state space size')
    #
    P = np.zeros((nstates, nstates))
    for parent_index, parent_composition_tuple in enumerate(compositions):
        parent_compo = np.array(parent_composition_tuple)
        random_mating = True
        if random_mating:
            single_parent_distn = parent_compo / float(np.sum(parent_compo))
            parent_distn = np.outer(single_parent_distn, single_parent_distn)
            child_distn = np.zeros(3)
            for i in range(3):
                for j in range(3):
                    child_distn += parent_distn[i, j] * get_child_distn(i, j)
            child_distn *= fitnesses
            child_distn /= np.sum(child_distn)
        else:
            total = np.dot(fitnesses, parent_compo)
            single_parent_distn = (fitnesses * parent_compo) / total
            parent_distn = np.outer(single_parent_distn, single_parent_distn)
            child_distn = np.zeros(3)
            for i in range(3):
                for j in range(3):
                    child_distn += parent_distn[i, j] * get_child_distn(i, j)
        for child_index, child_composition_tuple in enumerate(compositions):
            P[parent_index, child_index] = math.exp(
                    StatsUtil.multinomial_log_pmf(
                        child_distn, child_composition_tuple))
    return P
Example No. 21
def LearnSynthForDataset(synth):
    # Hyperparameters
    alpha = 10
    kappa = 0.0001
    nu = 1
    sigsq = 0.01
    pass_limit = 30

    D = NormalizeConn(synth.D)  # Normalize connectivity to zero mean, unit var

    # Compute our ddCRP-based parcellation
    Z = WardClustering.ClusterTree(D, synth.adj_list)
    _, dd_stats = initdd.InitializeAndRun(Z, D, synth.adj_list, range(1, 21),
                    alpha, kappa, nu, sigsq, pass_limit, synth.z, 0)
    DC = dd_stats['NMI'][-1]
    DC_K = dd_stats['K'][-1]

    # Ward Clustering, using number of clusters discovered from our method
    WC = StatsUtil.NMI(synth.z, WardClustering.Cluster(Z, DC_K))

    return (WC, DC, DC_K)
Example No. 22
def get_transition_matrix(npop, sAA, sAa):
    """
    Note that saa is 0 by convention.
    @param npop: constant Wright-Fisher population
    @param sAA: a selection value
    @param sAa: a selection value
    @return: a transition matrix
    """
    fitnesses = 1.0 + np.array([sAA, sAa, 0])
    # precompute the index_to_composition and composition_to_index maps.
    compositions = list(gen_population_compositions(npop))
    c_to_i = dict((c, i) for i, c in enumerate(compositions))
    nstates = get_state_space_size(npop)
    if nstates != len(compositions):
        raise ValueError("internal error regarding state space size")
    #
    P = np.zeros((nstates, nstates))
    for parent_index, parent_composition_tuple in enumerate(compositions):
        parent_compo = np.array(parent_composition_tuple)
        random_mating = True
        if random_mating:
            single_parent_distn = parent_compo / float(np.sum(parent_compo))
            parent_distn = np.outer(single_parent_distn, single_parent_distn)
            child_distn = np.zeros(3)
            for i in range(3):
                for j in range(3):
                    child_distn += parent_distn[i, j] * get_child_distn(i, j)
            child_distn *= fitnesses
            child_distn /= np.sum(child_distn)
        else:
            total = np.dot(fitnesses, parent_compo)
            single_parent_distn = (fitnesses * parent_compo) / total
            parent_distn = np.outer(single_parent_distn, single_parent_distn)
            child_distn = np.zeros(3)
            for i in range(3):
                for j in range(3):
                    child_distn += parent_distn[i, j] * get_child_distn(i, j)
        for child_index, child_composition_tuple in enumerate(compositions):
            P[parent_index, child_index] = math.exp(StatsUtil.multinomial_log_pmf(child_distn, child_composition_tuple))
    return P
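The helper get_child_distn is not shown. A hedged sketch of what it plausibly returns under standard Mendelian inheritance, with genotypes indexed 0 = AA, 1 = Aa, 2 = aa (an assumption about the helper, not its source):

import numpy as np

# probability that each parent genotype transmits allele A or allele a
_gamete_distn = np.array([
    [1.0, 0.0],  # AA always transmits A
    [0.5, 0.5],  # Aa transmits either allele with equal probability
    [0.0, 1.0],  # aa always transmits a
])

def get_child_distn(i, j):
    p_AA = _gamete_distn[i, 0] * _gamete_distn[j, 0]
    p_aa = _gamete_distn[i, 1] * _gamete_distn[j, 1]
    return np.array([p_AA, 1.0 - p_AA - p_aa, p_aa])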
Example No. 23
def get_hobolth_eceo(R, v, a, b, T, nmax):
    """
    The eceo means endpoint-conditioned expected occupancy.
    Most of the function arguments are the same as those of the more 
    verbosely named function.
    @param nmax: truncation of an infinite summation
    """
    accum = np.zeros(len(v))
    mu = np.max(-np.diag(R))
    X = np.eye(len(v)) + R / mu
    for n in range(nmax+1):
        coeff = (T / (n+1)) * math.exp(StatsUtil.poisson_log_pmf(n, mu*T))
        #print 'coeff:', coeff
        for alpha in range(len(v)):
            conditional_sum = 0
            for i in range(n+1):
                prefix = np.linalg.matrix_power(X, i)[a, alpha]
                suffix = np.linalg.matrix_power(X, n-i)[alpha, b]
                conditional_sum += prefix * suffix
                #print 'conditional sum:', conditional_sum
            accum[alpha] += coeff * conditional_sum
    return accum / scipy.linalg.expm(R*T)[a,b]
Example No. 24
def create_drift_selection_transition_matrix(npop, selection_ratio):
    """
    The states are indexed by the number of mutants.
    @param npop: total population size
    @param selection_ratio: a value larger than unity means mutants are fitter
    @return: a transition matrix
    """
    nstates = npop + 1
    P = np.zeros((nstates, nstates))
    for a in range(nstates):
        # compute the i.i.d probability of picking a mutant
        p = (selection_ratio * a) / (selection_ratio * a + (npop-a))
        for b in range(nstates):
            # These are from a binomial distribution
            # with npop trials and p probability of success per trial.
            # (n choose k) p^k (1-p)^(n-k)
            observed_n = b
            max_n = npop
            p_success = p
            P[a, b] = math.exp(StatsUtil.binomial_log_pmf(
                observed_n, max_n, p_success))
    return P
Example No. 25
def get_hobolth_eceo(R, v, a, b, T, nmax):
    """
    The eceo means endpoint-conditioned expected occupancy.
    Most of the function arguments are the same as those of the more 
    verbosely named function.
    @param nmax: truncation of an infinite summation
    """
    accum = np.zeros(len(v))
    mu = np.max(-np.diag(R))
    X = np.eye(len(v)) + R / mu
    for n in range(nmax + 1):
        coeff = (T / (n + 1)) * math.exp(StatsUtil.poisson_log_pmf(n, mu * T))
        #print 'coeff:', coeff
        for alpha in range(len(v)):
            conditional_sum = 0
            for i in range(n + 1):
                prefix = np.linalg.matrix_power(X, i)[a, alpha]
                suffix = np.linalg.matrix_power(X, n - i)[alpha, b]
                conditional_sum += prefix * suffix
                #print 'conditional sum:', conditional_sum
            accum[alpha] += coeff * conditional_sum
    return accum / scipy.linalg.expm(R * T)[a, b]
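Summed over all states, the endpoint-conditioned expected occupancy times must account for the whole interval, so the returned vector should sum to T up to truncation error. A hedged check on a small arbitrary rate matrix (assumes the module's numpy, scipy, and StatsUtil imports are in scope):

import numpy as np

np.random.seed(0)
R = np.random.rand(3, 3)
np.fill_diagonal(R, 0.0)
np.fill_diagonal(R, -R.sum(axis=1))  # rows of a rate matrix sum to zero
occ = get_hobolth_eceo(R, np.ones(3), a=0, b=1, T=2.0, nmax=60)
assert abs(occ.sum() - 2.0) < 1e-6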
Example No. 26
def create_drift_selection_transition_matrix(npop, selection_ratio):
    """
    The states are indexed by the number of mutants.
    @param npop: total population size
    @param selection_ratio: a value larger than unity means mutants are fitter
    @return: a transition matrix
    """
    nstates = npop + 1
    P = np.zeros((nstates, nstates))
    for a in range(nstates):
        # compute the i.i.d probability of picking a mutant
        p = (selection_ratio * a) / (selection_ratio * a + (npop - a))
        for b in range(nstates):
            # These are from a binomial distribution
            # with npop trials and p probability of success per trial.
            # (n choose k) p^k (1-p)^(n-k)
            observed_n = b
            max_n = npop
            p_success = p
            P[a, b] = math.exp(
                StatsUtil.binomial_log_pmf(observed_n, max_n, p_success))
    return P
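The same construction can be written without the exp of binomial_log_pmf round trip by evaluating the binomial pmf directly. A vectorized sketch using scipy.stats (a restatement for comparison, not part of the original module):

import numpy as np
from scipy.stats import binom

def create_drift_selection_transition_matrix_vec(npop, selection_ratio):
    a = np.arange(npop + 1, dtype=float)
    # i.i.d. probability of picking a mutant from each source state
    p = (selection_ratio * a) / (selection_ratio * a + (npop - a))
    # row a is Binomial(npop, p[a]) over 0..npop mutants
    return binom.pmf(np.arange(npop + 1)[np.newaxis, :], npop, p[:, np.newaxis])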
Example No. 27
def get_sample_distn(N, n, mutation_rate, fitness_ratio, h):
    """
    @param N: haploid pop size
    @param n: allele sample size
    @param mutation_rate: mutation rate
    @param fitness_ratio: fitness ratio
    @param h: dominance parameter 0.5 when additive
    @return: a distribution over n+1 mono-/di-morphic sample states
    """
    s = 1.0 - fitness_ratio
    P = np.exp(wfengine.create_diallelic_recessive(N // 2, s, h))
    MatrixUtil.assert_transition_matrix(P)
    # allow mutation out of the fixed states
    P[0, 0] = 1.0 - mutation_rate
    P[0, 1] = mutation_rate
    P[-1, -1] = 1.0 - mutation_rate
    P[-1, -2] = mutation_rate
    MatrixUtil.assert_transition_matrix(P)
    # get the population stationary distribution
    v_large = MatrixUtil.get_stationary_distribution(P)
    # get the allele distribution
    v_small = StatsUtil.subsample_pmf_without_replacement(v_large, n)
    return v_small
Example No. 28
def get_sample_distn(N, n, mutation_rate, fitness_ratio, h):
    """
    @param N: haploid pop size
    @param n: allele sample size
    @param mutation_rate: mutation rate
    @param fitness_ratio: fitness ratio
    @param h: dominance parameter 0.5 when additive
    @return: a distribution over n+1 mono-/di-morphic sample states
    """
    s = 1.0 - fitness_ratio
    P = np.exp(wfengine.create_diallelic_recessive(N // 2, s, h))
    MatrixUtil.assert_transition_matrix(P)
    # allow mutation out of the fixed states
    P[0, 0] = 1.0 - mutation_rate
    P[0, 1] = mutation_rate
    P[-1, -1] = 1.0 - mutation_rate
    P[-1, -2] = mutation_rate
    MatrixUtil.assert_transition_matrix(P)
    # get the population stationary distribution
    v_large = MatrixUtil.get_stationary_distribution(P)
    # get the allele distribution
    v_small = StatsUtil.subsample_pmf_without_replacement(v_large, n)
    return v_small
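The final step projects the population-level stationary distribution onto a smaller sample. A hedged sketch of subsample_pmf_without_replacement, assuming it marginalizes through the hypergeometric distribution (an assumption about the StatsUtil helper, not its source):

import numpy as np
from scipy.stats import hypergeom

def subsample_pmf_without_replacement(v_large, n):
    N = len(v_large) - 1  # haploid population size
    v_small = np.zeros(n + 1)
    for i, p in enumerate(v_large):
        # i copies of the allele among N; draw n without replacement
        v_small += p * hypergeom.pmf(np.arange(n + 1), N, i, n)
    return v_small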
Example No. 29
def get_response_content(fs):
    np.set_printoptions(linewidth=200)
    out = StringIO()
    nsamples = 1
    arr = []
    #
    nsites = 50000
    N = 15*2
    k = 4
    params = (0.002, 1, 1, 0, 0, 0)
    #params = (0.008, 1, 1, 0.5, 1, 1.5)
    mutation, fitnesses = kaizeng.params_to_mutation_fitness(N, params)
    #
    tm = time.time()
    P = kaizeng.get_transition_matrix(N, k, mutation, fitnesses)
    print 'time to construct transition matrix:', time.time() - tm
    #
    tm = time.time()
    v = MatrixUtil.get_stationary_distribution(P)
    print 'time to get stationary distribution:', time.time() - tm
    #
    tm = time.time()
    counts = np.random.multinomial(nsites, v)
    print 'time to sample multinomial counts:', time.time() - tm
    #
    tm = time.time()
    logp = StatsUtil.multinomial_log_pmf(v, counts)
    print 'time to get multinomial log pmf:', time.time() - tm
    #
    for i in range(nsamples):
        counts = np.random.multinomial(nsites, v)
        X0 = np.array(params)
        g = G(N, counts)
        Xopt = optimize.fmin(g, X0)
        arr.append(Xopt)
    print >> out, np.array(arr)
    return out.getvalue()
Example No. 30
def get_response_content(fs):
    np.set_printoptions(linewidth=200)
    out = StringIO()
    nsamples = 1
    arr = []
    #
    nsites = 50000
    N = 15 * 2
    k = 4
    params = (0.002, 1, 1, 0, 0, 0)
    #params = (0.008, 1, 1, 0.5, 1, 1.5)
    mutation, fitnesses = kaizeng.params_to_mutation_fitness(N, params)
    #
    tm = time.time()
    P = kaizeng.get_transition_matrix(N, k, mutation, fitnesses)
    print 'time to construct transition matrix:', time.time() - tm
    #
    tm = time.time()
    v = MatrixUtil.get_stationary_distribution(P)
    print 'time to get stationary distribution:', time.time() - tm
    #
    tm = time.time()
    counts = np.random.multinomial(nsites, v)
    print 'time to sample multinomial counts:', time.time() - tm
    #
    tm = time.time()
    logp = StatsUtil.multinomial_log_pmf(v, counts)
    print 'time to get multinomial log pmf:', time.time() - tm
    #
    for i in range(nsamples):
        counts = np.random.multinomial(nsites, v)
        X0 = np.array(params)
        g = G(N, counts)
        Xopt = optimize.fmin(g, X0)
        arr.append(Xopt)
    print >> out, np.array(arr)
    return out.getvalue()
Example No. 31
def LogProbWC(D, Z, sizes, alpha, kappa, nu, sigsq):
    hyp = ddCRP.ComputeCachedLikelihoodTerms(kappa, nu, sigsq)
    logp = np.zeros(len(sizes))

    for i in range(len(sizes)):
        z = cluster.hierarchy.fcluster(Z, t=sizes[i], criterion='maxclust')

        sorted_i = np.argsort(z)
        sorted_z = np.sort(z)
        parcels = np.split(sorted_i, np.flatnonzero(np.diff(sorted_z)) + 1)

        # Formally we should construct a spanning tree within each cluster so
        #   that we can evaluate the probability. However, the only property of
        #   the "c" links that impacts the probability directly is the number of
        #   self-connections. So we simply add the correct number of self-
        #   connections (equal to the number of parcels) and leave the rest
        #   set to zero
        c = np.zeros(len(z))
        c[0:sizes[i]] = np.arange(sizes[i])

        logp[i] = ddCRP.FullProbabilityddCRP(D, c, parcels, alpha, hyp,
                                             StatsUtil.CheckSymApprox(D))

    return logp
Example No. 32
def get_response_content(fs):
    np.set_printoptions(linewidth=200)
    out = StringIO()
    # extract user-supplied parameters
    N_diploid = fs.N_diploid
    nsites = fs.nsites
    nalleles = fs.nalleles
    mutation_rate = fs.mutation_rate
    Ns = fs.Ns
    h = fs.h
    #
    N_hap = 2 * N_diploid
    N = N_hap
    n = nalleles
    s = fs.Ns / float(N)
    fitness_ratio = 1 - s
    v_small = get_sample_distn(N, n, mutation_rate, fitness_ratio, h)
    # sample from this distribution
    counts = np.random.multinomial(nsites, v_small)
    #
    negloglik = -StatsUtil.multinomial_log_pmf(v_small, counts)
    #
    print >> out, 'actual mutation rate:', mutation_rate
    print >> out, 'actual fitness ratio:', fitness_ratio
    print >> out, 'actual h:', h
    print >> out, 'implied finite polymorphic diallelic distribution:'
    print >> out, v_small
    print >> out, 'negative log likelihood:', negloglik
    print >> out
    #
    # try to estimate the parameters
    X0 = np.array([
        StatsUtil.logit(mutation_rate),
        math.log(fitness_ratio),
        StatsUtil.logit(h),
    ],
                  dtype=float)
    g = G(N, n, counts)
    Xopt = optimize.fmin(g, X0)
    #
    a, b, c = Xopt.tolist()
    mutation_rate_hat = StatsUtil.expit(a)
    fitness_ratio_hat = math.exp(b)
    h_hat = StatsUtil.expit(c)
    v_small_hat = get_sample_distn(N, n, mutation_rate_hat, fitness_ratio_hat,
                                   h_hat)
    #
    negloglik_alt = g(Xopt)
    #
    print >> out, 'estim. mutation rate:', mutation_rate_hat
    print >> out, 'estim. fitness ratio:', fitness_ratio_hat
    print >> out, 'estim. h:', h_hat
    print >> out, 'implied finite polymorphic diallelic distribution:'
    print >> out, v_small_hat
    print >> out, 'negative log likelihood:', negloglik_alt
    print >> out
    #
    # constrain to additive selection
    X0 = np.array([
        StatsUtil.logit(mutation_rate),
        math.log(fitness_ratio),
    ],
                  dtype=float)
    g = G_additive(N, n, counts)
    Xopt = optimize.fmin(g, X0)
    a, b = Xopt.tolist()
    mutation_rate_hat = StatsUtil.expit(a)
    fitness_ratio_hat = math.exp(b)
    h_hat = 0.5
    v_small_hat = get_sample_distn(N, n, mutation_rate_hat, fitness_ratio_hat,
                                   h_hat)
    #
    negloglik_null = g(Xopt)
    #
    print >> out, '-- inference assuming additive selection (h = 0.5) --'
    print >> out, 'estim. mutation rate:', mutation_rate_hat
    print >> out, 'estim. fitness ratio:', fitness_ratio_hat
    print >> out, 'estim. h:', h_hat
    print >> out, 'implied finite polymorphic diallelic distribution:'
    print >> out, v_small_hat
    print >> out, 'negative log likelihood:', negloglik_null
    print >> out
    #
    D = 2 * (negloglik_null - negloglik_alt)
    print >> out, 'likelihood ratio test statistic:', D
    print >> out, 'chi squared 1-df 0.05 significance threshold:', 3.84
    print >> out
    #
    """
    # constrain to additive selection and equal expected mutation
    X0 = np.zeros(1)
    g = G_fit_only(M_large, M_small, counts)
    Xopt = optimize.fmin(g, X0)
    a, = Xopt.tolist()
    mut_ratio_hat = 1.0
    fit_ratio_hat = math.exp(a)
    h_hat = 0.5
    v_small_hat = get_sample_distn(
            M_large, M_small,
            mut_ratio_hat, fit_ratio_hat, h_hat)
    print >> out, '-- inference assuming additive selection and equal mut --'
    print >> out, 'estim. mut_ratio:', mut_ratio_hat
    print >> out, 'estim. fit_ratio:', fit_ratio_hat
    print >> out, 'estim. h:', h_hat
    print >> out, 'implied finite polymorphic diallelic distribution:'
    print >> out, v_small_hat
    print >> out
    """
    #
    return out.getvalue()
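The comparison against 3.84 above is the 0.05 critical value of a chi-squared distribution with one degree of freedom. A small companion sketch (assuming scipy is available) that reports a p-value for the same likelihood ratio test instead of a fixed threshold:

from scipy import stats

def lrt_pvalue(negloglik_null, negloglik_alt, df=1):
    D = 2.0 * (negloglik_null - negloglik_alt)
    return stats.chi2.sf(D, df)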
Example No. 33
 def get_log_likelihood(self, obs):
     n = sum(obs)
     accum = 0
     accum += self.coverage_distribution.get_log_likelihood(n)
     accum += StatsUtil.multinomial_log_pmf(self.distribution, obs)
     return accum
Example No. 34
def ddCRP(D, adj_list, init_c, gt_z, num_passes, alpha, kappa, nu, sigsq,
          stats_interval, verbose):
    map_z = np.zeros(np.shape(D)[0])
    stats = {'times': [], 'lp': [], 'NMI': [], 'K': [], 'z': [], 'c': []}

    hyp = ComputeCachedLikelihoodTerms(kappa, nu, sigsq)
    num_el = len(adj_list)

    # Generate random initialization if not specified
    if init_c.size == 0:
        c = np.zeros(num_el)
        for i in range(num_el):
            neighbors = np.concatenate((adj_list[i], i), axis=1)
            c[i] = neighbors[rd.randint(1, len(neighbors))]
    else:
        c = init_c

    # Initialize spatial connection matrix
    G = sparse.coo_matrix((np.ones(num_el), (np.arange(num_el), c)),
                          shape=(num_el, num_el))
    K, z, parcels = ConnectedComp(G)

    sym = StatsUtil.CheckSymApprox(D)
    curr_lp = FullProbabilityddCRP(D, c, parcels, alpha, hyp, sym)

    max_lp = -float('inf')
    steps = 0
    t0 = time.clock()

    for curr_pass in range(num_passes):
        order = np.random.permutation(num_el)  # Visit elements randomly

        for i in order:
            if curr_lp > max_lp:
                max_lp = curr_lp
                map_z = z

            if steps % stats_interval == 0:
                stats = UpdateStats(stats, t0, curr_lp, K, z, c, steps, gt_z,
                                    map_z, verbose)

            # Compute change in log-prob when removing the edge c_i
            CooModifyRow(G, i, -1)
            if c[i] == i:
                # Removing self-loop, parcellation won't change
                rem_delta_lp, z_rem, parcels_rem = -mt.log(alpha), z, parcels
            else:
                K_rem, z_rem, parcels_rem = ConnectedComp(G)
                if K_rem != K:
                    # We split a cluster, compute change in likelihood
                    rem_delta_lp = -LikelihoodDiff(D, parcels_rem, z_rem[i],
                                                   z_rem[c[i]], hyp, sym)
                else:
                    rem_delta_lp = 0

            # Compute change in log-prob for each possible edge c_i
            adj_list_i = adj_list[i]
            lp = np.zeros((len(adj_list_i) + 1))
            lp[len(adj_list_i)] = mt.log(alpha)
            cached_merge = -1 * np.ones(len(adj_list_i), dtype=np.int32)
            for n_ind in range(len(adj_list_i)):
                n = adj_list_i[n_ind]
                if z_rem[n] == z_rem[c[i]]:
                    # Just undoing edge removal
                    lp[n_ind] = -rem_delta_lp - (c[i] == i) * mt.log(alpha)
                elif z_rem[n] != z_rem[i]:
                    # Proposing merge
                    # First check cache to see if this is already computed
                    prev_lp = np.flatnonzero(cached_merge == z_rem[n])
                    if prev_lp.size > 0:
                        lp[n_ind] = lp[prev_lp[0]]
                    else:
                        # This is a novel merge, compute change in likelihood
                        lp[n_ind] = LikelihoodDiff(D, parcels_rem, z_rem[i],
                                                   z_rem[n], hyp, sym)
                        cached_merge[n_ind] = z_rem[n]

            # Pick new edge proportional to probability
            new_neighbor = ChooseFromLP(lp)
            if new_neighbor < len(adj_list_i):
                c[i] = adj_list_i[new_neighbor]
            else:
                c[i] = i

            # Update likelihood and parcellation
            curr_lp = curr_lp + rem_delta_lp + lp[new_neighbor]
            CooModifyRow(G, i, c[i])
            K, z, parcels = ConnectedComp(G)
            steps = steps + 1

    stats = UpdateStats(stats, t0, curr_lp, K, z, c, steps, gt_z, map_z,
                        verbose)
    return (map_z, stats)
Example No. 35
def get_response_content(fs):
    np.set_printoptions(linewidth=200)
    out = StringIO()
    # extract user-supplied parameters
    N_diploid = fs.N_diploid
    nsites = fs.nsites
    nalleles = fs.nalleles
    mutation_rate = fs.mutation_rate
    Ns = fs.Ns
    h = fs.h
    #
    N_hap = 2 * N_diploid
    N = N_hap
    n = nalleles
    s = fs.Ns / float(N)
    fitness_ratio = 1 - s
    v_small = get_sample_distn(N, n, mutation_rate, fitness_ratio, h)
    # sample from this distribution
    counts = np.random.multinomial(nsites, v_small)
    #
    negloglik = -StatsUtil.multinomial_log_pmf(
            v_small, counts)
    #
    print >> out, 'actual mutation rate:', mutation_rate
    print >> out, 'actual fitness ratio:', fitness_ratio
    print >> out, 'actual h:', h
    print >> out, 'implied finite polymorphic diallelic distribution:'
    print >> out, v_small
    print >> out, 'negative log likelihood:', negloglik
    print >> out
    #
    # try to estimate the parameters
    X0 = np.array([
        StatsUtil.logit(mutation_rate),
        math.log(fitness_ratio),
        StatsUtil.logit(h),
        ], dtype=float)
    g = G(N, n, counts)
    Xopt = optimize.fmin(g, X0)
    #
    a, b, c = Xopt.tolist()
    mutation_rate_hat = StatsUtil.expit(a)
    fitness_ratio_hat = math.exp(b)
    h_hat = StatsUtil.expit(c)
    v_small_hat = get_sample_distn(
            N, n,
            mutation_rate_hat, fitness_ratio_hat, h_hat)
    #
    negloglik_alt = g(Xopt)
    #
    print >> out, 'estim. mutation rate:', mutation_rate_hat
    print >> out, 'estim. fitness ratio:', fitness_ratio_hat
    print >> out, 'estim. h:', h_hat
    print >> out, 'implied finite polymorphic diallelic distribution:'
    print >> out, v_small_hat
    print >> out, 'negative log likelihood:', negloglik_alt
    print >> out
    #
    # constrain to additive selection
    X0 = np.array([
        StatsUtil.logit(mutation_rate),
        math.log(fitness_ratio),
        ], dtype=float)
    g = G_additive(N, n, counts)
    Xopt = optimize.fmin(g, X0)
    a, b = Xopt.tolist()
    mutation_rate_hat = StatsUtil.expit(a)
    fitness_ratio_hat = math.exp(b)
    h_hat = 0.5
    v_small_hat = get_sample_distn(
            N, n,
            mutation_rate_hat, fitness_ratio_hat, h_hat)
    #
    negloglik_null = g(Xopt)
    #
    print >> out, '-- inference assuming additive selection (h = 0.5) --'
    print >> out, 'estim. mutation rate:', mutation_rate_hat
    print >> out, 'estim. fitness ratio:', fitness_ratio_hat
    print >> out, 'estim. h:', h_hat
    print >> out, 'implied finite polymorphic diallelic distribution:'
    print >> out, v_small_hat
    print >> out, 'negative log likelihood:', negloglik_null
    print >> out
    #
    D = 2*(negloglik_null - negloglik_alt)
    print >> out, 'likelihood ratio test statistic:', D
    print >> out, 'chi squared 1-df 0.05 significance threshold:', 3.84
    print >> out
    #
    """
    # constrain to additive selection and equal expected mutation
    X0 = np.zeros(1)
    g = G_fit_only(M_large, M_small, counts)
    Xopt = optimize.fmin(g, X0)
    a, = Xopt.tolist()
    mut_ratio_hat = 1.0
    fit_ratio_hat = math.exp(a)
    h_hat = 0.5
    v_small_hat = get_sample_distn(
            M_large, M_small,
            mut_ratio_hat, fit_ratio_hat, h_hat)
    print >> out, '-- inference assuming additive selection and equal mut --'
    print >> out, 'estim. mut_ratio:', mut_ratio_hat
    print >> out, 'estim. fit_ratio:', fit_ratio_hat
    print >> out, 'estim. h:', h_hat
    print >> out, 'implied finite polymorphic diallelic distribution:'
    print >> out, v_small_hat
    print >> out
    """
    #
    return out.getvalue()
Example No. 36
def ClusterTree(D, adj_list):
    if StatsUtil.CheckSymApprox(D):
        X = D
    else:
        X = np.concatenate((D, D.transpose()), axis=1)

    # Compute squared euclidean distance Y between rows
    Qx = np.tile(np.linalg.norm(X, axis=1)**2, (X.shape[0], 1))
    Y = Qx + Qx.transpose() - 2*np.dot(X, X.transpose())
    Y = spatial.distance.squareform(Y, checks=False)
    Y[Y < 0] = 0  # Correct for numerical errors in very similar rows
    
    # Construct adjacency matrix
    N = len(adj_list)
    A = np.zeros([N, N], dtype=bool)
    for i in range(N):
        A[i, adj_list[i]] = True
    connected = spatial.distance.squareform(A).astype(bool)
    

    # Initialize all data structures
    valid_clusts = np.ones(N, dtype=bool)   # which clusters still remain
    col_limits = np.cumsum(np.concatenate((np.array([N-2]), np.arange(N-2, 0, -1))))
    
    # While clusters are being merged their indices change constantly; R is
    # an index vector mapping each original index to its current (row, column)
    # index in Y.  C denotes how many points are contained in each cluster.
    m = mt.ceil(mt.sqrt(2*Y.shape[0]))
    C = np.zeros(2*m-1)
    C[0:m] = 1
    R = np.arange(m)
    all_inds = np.arange(Y.shape[0])
    conn_inds = all_inds[connected]  # pairs of adjacent clusters that can be merged
    Z = np.zeros([m-1, 4])

    for s in range(m-1):
        if conn_inds.size == 0:
            # The graph was disconnected (e.g. two hemispheres)
            # Just add all connections to finish up cluster tree
            connected = np.zeros(len(connected), dtype=bool)
            conn_inds = np.empty(0, dtype=int)
            valid_clust_inds = np.flatnonzero(valid_clusts)
            
            for i in valid_clust_inds:
                U = valid_clusts
                U[i] = 0
                new_conns = PdistInds(i, N, U)
                connected[new_conns] = True
                conn_inds = np.concatenate((conn_inds, new_conns))
            
            conn_inds = np.unique(conn_inds)

        # Find closest pair of clusters
        v = np.amin(Y[conn_inds])
        k = conn_inds[np.argmin(Y[conn_inds])]
    
        j = np.where(k <= col_limits)[0][0]
        i = N - (col_limits[j] - k) - 1
        
        Z[s, 0:3] = np.array([R[i], R[j], v])  # Add row to output linkage
    
        # Update Y with this new cluster i containing old clusters i and j
        U = valid_clusts
        U[np.array([i, j])] = 0
        I = PdistInds(i, N, U)
        J = PdistInds(j, N, U)
        Y[I] = ((C[R[U]] + C[R[i]])*Y[I] + (C[R[U]] + C[R[j]])*Y[J] -
                C[R[U]]*v) / (C[R[i]] + C[R[j]] + C[R[U]])
        
        # Add j's connections to new cluster i
        new_conns = connected[J] & ~connected[I]
        connected[I] = connected[I] | new_conns
        conn_inds = np.sort(np.concatenate((conn_inds, I[new_conns])))
        
        # Remove all of j's connections from conn_inds and connected
        U[i] = 1
        J = PdistInds(j, N, U)
        conn_inds = conn_inds[np.in1d(conn_inds, J, assume_unique=True,
                                      invert=True)]
        connected[J] = np.zeros(len(J))
       
        valid_clusts[j] = 0
        # update m, N, R
        C[m+s] = C[R[i]] + C[R[j]]
        Z[s, 3] = C[m+s]
        R[i] = m+s

    Z[:, 2] = np.sqrt(Z[:, 2])
    return Z
Example No. 37
 def get_log_likelihood(self, observation):
     if len(observation) != 4:
         raise ValueError('expected the observation to be a vector of four integers')
     mu = self.expected_coverage / 4.0
     pr = 1/(mu+1)
     return sum(StatsUtil.geometric_log_pmf(obs, pr) for obs in observation)
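With pr = 1/(mu+1), each count is modeled as a geometric variable on {0, 1, 2, ...} with mean (1 - pr)/pr = mu. A minimal sketch of geometric_log_pmf under that convention (an assumption, not the actual StatsUtil source):

import math

def geometric_log_pmf(obs, pr):
    # log of: (1 - pr)**obs * pr, the pmf of a geometric starting at zero
    if obs:
        return obs * math.log(1.0 - pr) + math.log(pr)
    return math.log(pr)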
Example No. 38
 def get_likelihood(self, obs):
     return math.exp(StatsUtil.poisson_log_pmf(obs, self.expectation))