Example #1
def get_heuristics(M, R):
    """
    Return a multiline string with some heuristics.
    The heuristics are independent of time and of the information variant.
    Greater stationary distribution Shannon entropy suggests less saturation.
    Greater stationary distribution logical entropy suggests less saturation.
    Greater expected rate suggests more saturation.
    Greater spectral rate suggests more saturation.
    @param M: pure mutation rate matrix
    @param R: mutation-selection balance rate matrix
    @return: multiline string
    """
    # get the stationary distributions
    M_v = mrate.R_to_distn(M)
    R_v = mrate.R_to_distn(R)
    # check a different way to get the stationary distribution just for fun
    M_v_nonspectral = mrate.R_to_distn_nonspectral(M)
    R_v_nonspectral = mrate.R_to_distn_nonspectral(R)
    if not np.allclose(M_v, M_v_nonspectral):
        raise ValueError('internal stationary distribution calculation error')
    if not np.allclose(R_v, R_v_nonspectral):
        raise ValueError('internal stationary distribution calculation error')
    # compute the Shannon entropy of each stationary distribution
    M_shannon_entropy = -sum(p * math.log(p) for p in M_v)
    R_shannon_entropy = -sum(p * math.log(p) for p in R_v)
    shannon_entropy_sign = np.sign(M_shannon_entropy - R_shannon_entropy)
    # compute the logical entropy of each stationary distribution
    M_logical_entropy = 1 - sum(p * p for p in M_v)
    R_logical_entropy = 1 - sum(p * p for p in R_v)
    logical_entropy_sign = np.sign(M_logical_entropy - R_logical_entropy)
    # compute the expected rate
    M_expected_rate = mrate.Q_to_expected_rate(M)
    R_expected_rate = mrate.Q_to_expected_rate(R)
    expected_rate_sign = np.sign(R_expected_rate - M_expected_rate)
    # compute the spectral rate
    M_spectral_rate = 1 / mrate.R_to_relaxation_time(M)
    R_spectral_rate = 1 / mrate.R_to_relaxation_time(R)
    spectral_rate_sign = np.sign(R_spectral_rate - M_spectral_rate)
    # report the heuristics
    out = StringIO()
    print >> out, 'Greater Shannon entropy of the stationary distribution',
    print >> out, 'suggests more information about divergence time.'
    print >> out, _heuristic_helper(shannon_entropy_sign)
    print >> out
    print >> out, 'Greater logical entropy of the stationary distribution',
    print >> out, 'suggests more information about divergence time.'
    print >> out, _heuristic_helper(logical_entropy_sign)
    print >> out
    print >> out, 'Smaller expected rate',
    print >> out, 'suggests more information about divergence time.'
    print >> out, _heuristic_helper(expected_rate_sign)
    print >> out
    print >> out, 'Smaller spectral rate',
    print >> out, 'suggests more information about divergence time.'
    print >> out, _heuristic_helper(spectral_rate_sign)
    print >> out
    return out.getvalue().strip()
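
The two stationary-distribution routines called above are not part of this excerpt. A minimal sketch of what the spectral and nonspectral calculations plausibly do (the names R_to_distn and R_to_distn_nonspectral are from the source; the bodies below are assumptions):

import numpy as np

def distn_spectral_sketch(R):
    # spectral route: the stationary distribution is the left null
    # eigenvector of the rate matrix, normalized to sum to one
    w, V = np.linalg.eig(R.T)
    v = np.real(V[:, np.argmin(np.abs(w))])
    return v / v.sum()

def distn_nonspectral_sketch(R):
    # nonspectral route: solve v R = 0 as a linear system, replacing
    # one redundant balance equation with the normalization constraint
    n = len(R)
    A = np.vstack([R.T[:-1], np.ones(n)])
    b = np.zeros(n)
    b[-1] = 1.0
    return np.linalg.solve(A, b)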
Example #2
def get_rate_matrix_summary(Q):
    out = StringIO()
    Q_v = mrate.R_to_distn(Q)
    Q_r = mrate.Q_to_expected_rate(Q)
    Q_t = mrate.R_to_relaxation_time(Q)
    print >> out, 'rate matrix:'
    print >> out, Q
    print >> out
    print >> out, 'this should be near zero for detailed balance:'
    print >> out, get_detailed_balance_error(Q)
    print >> out
    print >> out, 'computed stationary distribution:'
    print >> out, Q_v
    print >> out
    print >> out, 'expected rate:'
    print >> out, Q_r
    print >> out
    print >> out, 'relaxation time:'
    print >> out, Q_t
    print >> out
    print >> out, '(expected rate) * (relaxation time):'
    print >> out, Q_r * Q_t
    print >> out
    print >> out
    return out.getvalue().rstrip()
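
get_detailed_balance_error is defined elsewhere; a plausible reconstruction, assuming it reports the largest violation of the detailed balance condition v_i * Q_ij = v_j * Q_ji:

import numpy as np

def detailed_balance_error_sketch(Q):
    # stationary distribution via the left null eigenvector of Q
    w, V = np.linalg.eig(Q.T)
    v = np.real(V[:, np.argmin(np.abs(w))])
    v /= v.sum()
    # F[i, j] = v_i * Q_ij is the stationary probability flow;
    # detailed balance means F is symmetric
    F = v[:, np.newaxis] * Q
    return np.max(np.abs(F - F.T))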
Example #3
 def __init__(self, Q):
     self.Q = Q
     self.relaxation_time = mrate.R_to_relaxation_time(Q)
     self.p = min(mrate.R_to_distn(Q))
     self.N = len(Q)
     self.lam = - 1 / self.relaxation_time
     key_time_points = ctmcmitaylor.get_key_time_points(
         self.lam, self.p, self.N)
     self.time_to_uniformity, self.time_to_usefulness = key_time_points
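
This example makes the sign convention explicit: lam = -1 / relaxation_time, so the relaxation time is the negative reciprocal of the slowest nonzero eigenvalue. A sketch of the computation presumably behind mrate.R_to_relaxation_time:

import numpy as np

def relaxation_time_sketch(R):
    # a reversible rate matrix has real, nonpositive eigenvalues;
    # one is zero, and the largest of the rest is the slowest
    # decay rate of the process
    w = np.sort(np.real(np.linalg.eigvals(R)))
    lam = w[-2]
    return -1.0 / lam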
Example #4
 def __init__(self, M, t):
     """
     @param M: mutation matrix
     @param t: the distance to go in the requested direction
     """
     self.M = M
     self.t = t
     # get the stationary distribution of the mutation process
     self.v = mrate.R_to_distn(M)
     # get the mutation process relaxation time
     self.r_mut = mrate.R_to_relaxation_time(M)
Example #5
 def __call__(self, X):
     """
     @param X: a vector to be converted into a finite distribution
     """
     v_target = X_to_distn(X)
     v_new = (1 - self.t) * self.v + self.t * v_target
     R = mrate.to_gtr_halpern_bruno(self.M, v_new)
     if not np.allclose(v_new, mrate.R_to_distn(R)):
         raise ValueError('stationary distribution error')
     r_sel = mrate.R_to_relaxation_time(R)
     # we want to minimize this
     return self.r_mut - r_sel
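
mrate.to_gtr_halpern_bruno appears throughout these examples. A self-contained sketch of the classic Halpern-Bruno reweighting it presumably implements: each off-diagonal mutation rate is scaled by g(x) = x / (1 - exp(-x)), with x the log ratio of fixation tendencies, so that the target distribution becomes stationary (g(x) -> 1 as x -> 0, leaving neutral rates unchanged):

import math
import numpy as np

def to_gtr_halpern_bruno_sketch(M, v_new):
    # stationary distribution of the mutation process
    w, V = np.linalg.eig(M.T)
    pi = np.real(V[:, np.argmin(np.abs(w))])
    pi /= pi.sum()
    n = len(M)
    R = np.zeros_like(M)
    for i in range(n):
        for j in range(n):
            if i == j:
                continue
            x = math.log((v_new[j] * pi[i]) / (v_new[i] * pi[j]))
            g = 1.0 if abs(x) < 1e-12 else x / (1.0 - math.exp(-x))
            R[i, j] = M[i, j] * g
    np.fill_diagonal(R, -R.sum(axis=1))
    return R

Under this construction v_new satisfies detailed balance with respect to R, which is why the np.allclose check above passes up to numerical error.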
Example #6
def get_response_content(fs):
    np.set_printoptions(linewidth=200)
    out = StringIO()
    n = fs.nstates
    t = 0.001
    # sample the initial mutation rate matrix
    S = sample_symmetric_rate_matrix(n)
    v = sample_distribution(n)
    M = mrate.to_gtr_halpern_bruno(S, v)
    if not np.allclose(v, mrate.R_to_distn(M)):
        raise ValueError('stationary distribution error')
    print >> out, 't:', t
    print >> out
    print >> out, 'initial GTR matrix:'
    print >> out, M
    print >> out
    # Try to iteratively increase the relaxation time
    # by repeatedly applying Halpern-Bruno selection.
    R = M
    v_old = v
    for i in range(20):
        # print some properties of the matrix
        print >> out, v_old
        print >> out, mrate.R_to_relaxation_time(R)
        print >> out
        f = MyOpt(R, t)
        x0 = [1.0] * (n - 1)
        result = scipy.optimize.fmin(f,
                                     x0,
                                     disp=0,
                                     full_output=1,
                                     ftol=0.000001)
        xopt, fopt, niters, funcalls, warnflag = result
        if fopt > 0:
            print >> out, 'failed to increase relaxation time'
            print >> out
            break
        # compute the next stationary distribution
        v_target = X_to_distn(xopt)
        v_new = (1 - t) * v_old + t * v_target
        print >> out, v_new - v_old
        print >> out
        # compute the next rate matrix and update its stationary distribution
        R = mrate.to_gtr_halpern_bruno(R, v_new)
        if not np.allclose(v_new, mrate.R_to_distn(R)):
            raise ValueError('stationary distribution error')
        v_old = v_new
    print >> out, 'final rate matrix:'
    print >> out, R
    print >> out
    return out.getvalue()
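
The sampling helpers and X_to_distn are not shown in this excerpt. The distribution sampler below matches the inline code in Example #12; the other two are assumptions, and in particular the softmax-style X_to_distn is only a guess consistent with the optimizer passing an unconstrained (n - 1)-vector:

import random
import numpy as np

def sample_distribution(n):
    # normalized exponential draws, as in sample_row in Example #12
    v = np.array([random.expovariate(1) for _ in range(n)])
    return v / v.sum()

def sample_symmetric_rate_matrix(n):
    # symmetric off-diagonal exchangeabilities, with the diagonal
    # set so that each row sums to zero
    S = np.zeros((n, n))
    for i in range(n):
        for j in range(i):
            S[i, j] = S[j, i] = random.expovariate(1)
    np.fill_diagonal(S, -S.sum(axis=1))
    return S

def X_to_distn(X):
    # assumed parameterization: pin one coordinate at zero and
    # exponentiate, mapping R^(n-1) onto the open n-simplex
    e = np.exp(np.concatenate([[0.0], np.asarray(X, dtype=float)]))
    return e / e.sum()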
Example #7
def get_statistic_ratios(Q_mut, Q_sels):
    """
    @param Q_mut: mutation rate matrix
    @param Q_sels: mutation-selection balance rate matrices
    @return: ER_ratios, NSR_ratios, ER_NSR_ratios
    """
    ER_mut = mrate.Q_to_expected_rate(Q_mut)
    ER_sels = [mrate.Q_to_expected_rate(Q) for Q in Q_sels]
    ER_ratios = [ER_sel / ER_mut for ER_sel in ER_sels]
    ER_NSR_mut = 1 / mrate.R_to_relaxation_time(Q_mut)
    ER_NSR_sels = [1 / mrate.R_to_relaxation_time(Q) for Q in Q_sels]
    ER_NSR_ratios = [ER_NSR_sel / ER_NSR_mut for ER_NSR_sel in ER_NSR_sels]
    NSR_ratios = [a / b for a, b in zip(ER_NSR_ratios, ER_ratios)]
    # do some extra investigation
    """
    nsels = len(Q_sels)
    for i in range(nsels):
        if ER_NSR_ratios[i] < 1:
            print 'found a slower-decaying mutation-selection matrix:'
            print Q_sels[i]
            print
    print
    print 'ER_mut:'
    print ER_mut
    print
    print 'ER_NSR_mut:'
    print ER_NSR_mut
    print
    print 'ER_sels:'
    for x in ER_sels:
        print x
    print
    print 'ER_NSR_sels:'
    for x in ER_NSR_sels:
        print x
    print
    """
    return ER_ratios, NSR_ratios, ER_NSR_ratios
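
For reference, the expected rate used in these ratios is the average rate of leaving the current state at stationarity; a sketch of what mrate.Q_to_expected_rate presumably computes:

import numpy as np

def expected_rate_sketch(Q):
    # ER = -sum_i v_i * Q_ii, the stationary-weighted leaving rate
    w, V = np.linalg.eig(Q.T)
    v = np.real(V[:, np.argmin(np.abs(w))])
    v /= v.sum()
    return -np.dot(v, np.diag(Q))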
Example #8
 def test_large_variance(self):
     n = 4
     v = sample_distribution(n)
     S = sample_symmetric_rate_matrix(n)
     R = mrate.to_gtr_halpern_bruno(S, v)
     """
     a = .1
     b = .2
     c = .7
     R = np.array([
         [-(b+c), b, c],
         [a, -(a+c), c],
         [a, b, -(a+b)]])
     """
     t = 2.0
     dt = 0.0000001
     rtime = mrate.R_to_relaxation_time(R)
     var_a = get_ml_variance(R, t)
     var_b = get_ml_variance(R, t + dt)
     var_slope = (var_b - var_a) / dt
     deriv_ratio = get_p_id_deriv_ratio(R, t)
     clever_ratio = get_ml_variance_ratio(R, t)
     print 'time:', t
     print 'variance:', var_a
     print 'variance slope:', var_slope
     print 'var_slope / var_a:', var_slope / var_a
     print 'var_slope / var_a [clever]:', clever_ratio
     print 'log variance:', math.log(var_a)
     print 'relaxation time:', rtime
     print '2 / relaxation_time:', 2 / rtime
     print "p_id(t)'' / p_id(t)':", deriv_ratio
     print
     print '--- new attempt ---'
     print 'mutual information:', ctmcmi.get_mutual_information(R, t)
     print 'reciprocal of MI:', 1.0 / ctmcmi.get_mutual_information(R, t)
     print 'asymptotic variance:', get_asymptotic_variance(R, t)
     print 'asymptotic variance (ver. 2):', get_asymptotic_variance_b(R, t)
     print 'asymptotic variance (ver. 3):', get_asymptotic_variance_c(R, t)
     print 'AV approx (ver. 4):', get_asymptotic_variance_d(R, t)
     print 'AV approx (ver. 5):', get_asymptotic_variance_e(R, t)
     print
     print '--- another thing ---'
     fi_slow = get_fisher_info_known_distn(R, v, t)
     fi_fast = get_fisher_info_known_distn_fast(R, v, t)
     print 'slow asymptotic variance:', 1 / fi_slow
     print 'fast asymptotic variance:', 1 / fi_fast
     print
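
get_fisher_info_known_distn is defined elsewhere. A sketch of one standard formula it may correspond to: the Fisher information about t carried by the joint distribution of the two endpoint states of a branch, whose reciprocal is the asymptotic variance printed above:

import numpy as np
import scipy.linalg

def fisher_info_sketch(R, v, t):
    # joint distribution of the endpoints: J_ij(t) = v_i * P_ij(t)
    v = np.asarray(v, dtype=float)
    P = scipy.linalg.expm(R * t)
    J = v[:, np.newaxis] * P
    # dP/dt = P R, since R commutes with its own matrix exponential
    dJ = v[:, np.newaxis] * P.dot(R)
    # FI(t) = sum_ij (dJ_ij / dt)^2 / J_ij
    return np.sum(dJ * dJ / J)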
Example #9
def get_rate_matrix_summary(Q):
    out = StringIO()
    Q_t = mrate.R_to_relaxation_time(Q)
    Q_cheeger_bound = get_local_cheeger_ratio_bound(Q)
    if len(Q) < 16:
        real_cheeger = get_real_cheeger(Q)
        cheeger_string = str(real_cheeger)
    else:
        cheeger_string = 'takes too long to compute'
    print >> out, 'rate matrix:'
    print >> out, Q
    print >> out
    print >> out, 'algebraic connectivity (all hypothesized to be <= 2):'
    print >> out, 1 / Q_t
    print >> out
    print >> out, 'local Cheeger ratio bound (all hypothesized to be <= 1):'
    print >> out, Q_cheeger_bound
    print >> out
    print >> out, 'actual Cheeger constant (all hypothesized to be <= 1):'
    print >> out, cheeger_string
    print >> out
    return out.getvalue().rstrip()
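
get_real_cheeger is not shown. A brute-force sketch consistent with the 'takes too long to compute' branch, since it enumerates every bipartition of the states; the hypotheses printed above mirror the Cheeger inequality, which bounds the algebraic connectivity by twice the Cheeger constant:

import itertools
import numpy as np

def cheeger_constant_sketch(Q):
    n = len(Q)
    w, V = np.linalg.eig(Q.T)
    v = np.real(V[:, np.argmin(np.abs(w))])
    v /= v.sum()
    best = np.inf
    for k in range(1, n):
        for subset in itertools.combinations(range(n), k):
            A = list(subset)
            B = [j for j in range(n) if j not in A]
            # stationary probability flow out of A
            flow = sum(v[i] * Q[i, j] for i in A for j in B)
            best = min(best, flow / min(v[A].sum(), v[B].sum()))
    return best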
Example #10
 def __call__(self):
     """
     @return: True if a counterexample is found
     """
     n = self.nstates
     # sample a fairly generic GTR mutation rate matrix
     S = sample_symmetric_rate_matrix(n)
     v = sample_distribution(n)
     M = mrate.to_gtr_halpern_bruno(S, v)
     # look at the Fiedler-like eigenvector of the mutation rate matrix
     r_recip, fiedler = mrate._R_to_eigenpair(M)
     r_mut = 1 / r_recip
     value_min, state_min = min((fiedler[i], i) for i in range(n))
     value_max, state_max = max((fiedler[i], i) for i in range(n))
     # move the stationary distribution towards a 50/50 distribution
     v_target = np.zeros(n)
     v_target[state_min] = 0.5
     v_target[state_max] = 0.5
     v_new = (1 - self.t) * v + self.t * v_target
     R = mrate.to_gtr_halpern_bruno(M, v_new)
     r_sel = mrate.R_to_relaxation_time(R)
     # the mutation-selection balance should have longer relaxation time
     #if r_sel < r_mut:
     #if True:
     if maxind(np.abs(fiedler / v)) != maxind(np.abs(fiedler / np.sqrt(v))):
         self.M = M
         self.fiedler = fiedler
         self.r_mut = r_mut
         self.r_sel = r_sel
         self.v = v
         self.v_new = v_new
         self.v_target = v_target
         self.opt_target = self._get_opt_target()
         return True
     else:
         return False
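
maxind is an undefined helper in this excerpt; presumably just the argmax:

def maxind(arr):
    # hypothetical reconstruction: index of the largest entry
    return max(range(len(arr)), key=lambda i: arr[i])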
Example #11
def get_time_point_summary(Q_mut, Q_sels, t):
    """
    @param Q_mut: the mutation rate matrix
    @param Q_sels: sequence of mutation-selection rate matrices
    @param t: the time point under consideration
    @return: a sequence of statistics
    """
    # Compute the following statistics at this time point:
    # t
    # mutation MI
    # selection MI max
    # selection MI high
    # selection MI mean
    # selection MI low
    # selection MI min
    # correlation fn 1
    # correlation fn 2
    # correlation fn 3
    # correlation fn 4
    # correlation fn 5
    # proportion sign agreement fn 1
    # proportion sign agreement fn 2
    # proportion sign agreement fn 3
    # proportion sign agreement fn 4
    # proportion sign agreement fn 5
    # informativeness fn 1
    # informativeness fn 2
    # informativeness fn 3
    # informativeness fn 4
    # informativeness fn 5
    #
    # First compute the mutual information for mut and mut-sel.
    nsels = len(Q_sels)
    mi_mut = ctmcmi.get_mutual_information(Q_mut, t)
    mi_sels = [ctmcmi.get_mutual_information(Q, t) for Q in Q_sels]
    mi_signs = [1 if mi_sel > mi_mut else -1 for mi_sel in mi_sels]
    # Now compute some other functions
    v0 = [ctmcmi.get_mutual_information_small_approx_c(Q, t) for Q in Q_sels]
    v1 = [ctmcmi.get_mutual_information_small_approx(Q, t) for Q in Q_sels]
    v2 = [ctmcmi.get_mutual_information_approx_c(Q, t) for Q in Q_sels]
    v3 = [math.exp(-2*t/mrate.R_to_relaxation_time(Q)) for Q in Q_sels]
    v4 = [math.exp(-t*mrate.Q_to_expected_rate(Q)) for Q in Q_sels]
    # Now that we have computed all of the vectors at this time point,
    # we can compute the statistics that we want to report.
    statistics = []
    statistics.append(t)
    statistics.append(mi_mut)
    # add the mutual information statistics
    sorted_mi = sorted(mi_sels)
    n_extreme = nsels // 20  # integer index offset for the 5% tails
    statistics.append(sorted_mi[-1])
    statistics.append(sorted_mi[-n_extreme])
    statistics.append(sum(sorted_mi) / nsels)
    statistics.append(sorted_mi[n_extreme-1])
    statistics.append(sorted_mi[0])
    # add the correlations
    for v in (v0, v1, v2, v3, v4):
        r, p = scipy.stats.stats.pearsonr(v, mi_sels)
        statistics.append(r)
    # add the sign proportions
    for v in (v0, v1, v2, v3, v4):
        v_signs = [1 if value > mi_mut else -1 for value in v]
        total = sum(1 for a, b in zip(mi_signs, v_signs) if a == b)
        p = float(total) / nsels
        statistics.append(p)
    # add the informativenesses
    for v in (v0, v1, v2, v3, v4):
        v_signs = [1 if value > mi_mut else -1 for value in v]
        informativeness = 0
        for pair in ((1, 1), (1, -1), (-1, 1), (-1, -1)):
            v_value, m_value = pair
            v_marginal_count = sum(1 for x in v_signs if x == v_value)
            m_marginal_count = sum(1 for x in mi_signs if x == m_value)
            joint_count = sum(1 for x in zip(v_signs, mi_signs) if x == pair)
            if joint_count:
                joint_prob = joint_count / float(nsels)
                a = math.log(joint_prob)
                b = math.log(v_marginal_count / float(nsels))
                c = math.log(m_marginal_count / float(nsels))
                informativeness += joint_prob * (a - b - c)
        statistics.append(informativeness)
    # return the statistics
    return statistics
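
The informativeness loop above computes the empirical mutual information between the sign of a proxy statistic and the sign of the exact mutual information comparison. An equivalent compact form of the same computation:

import math

def sign_mutual_information(u_signs, v_signs):
    # empirical mutual information (in nats) between two +1/-1 lists
    n = float(len(u_signs))
    mi = 0.0
    for a in (1, -1):
        for b in (1, -1):
            joint = sum(1 for pair in zip(u_signs, v_signs)
                        if pair == (a, b)) / n
            if joint:
                pa = sum(1 for x in u_signs if x == a) / n
                pb = sum(1 for y in v_signs if y == b) / n
                mi += joint * math.log(joint / (pa * pb))
    return mi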
Example #12
def sample_row():
    n = 4
    # sample the exchangeability
    S = np.zeros((n, n))
    S[1, 0] = random.expovariate(1)
    S[2, 0] = random.expovariate(1)
    S[2, 1] = random.expovariate(1)
    S[3, 0] = random.expovariate(1)
    S[3, 1] = random.expovariate(1)
    S[3, 2] = random.expovariate(1)
    # sample the mutation stationary distribution
    mdistn = np.array([random.expovariate(1) for i in range(n)])
    mdistn /= np.sum(mdistn)
    # sample the mutation selection balance stationary distribution
    bdistn = np.array([random.expovariate(1) for i in range(n)])
    bdistn /= np.sum(bdistn)
    # sample the time
    t = random.expovariate(1)
    # sample the info type
    infotype = random.choice(('infotype.mi', 'infotype.fi'))
    # Compute some intermediate variables
    # from which the summary statistics and the label are computed.
    S = S + S.T
    M = S * mdistn
    M -= np.diag(np.sum(M, axis=1))
    R = mrate.to_gtr_halpern_bruno(M, bdistn)
    shannon_ent_mut = -sum(p * log(p) for p in mdistn)
    shannon_ent_bal = -sum(p * log(p) for p in bdistn)
    logical_ent_mut = 1.0 - sum(p * p for p in mdistn)
    logical_ent_bal = 1.0 - sum(p * p for p in bdistn)
    expected_rate_mut = mrate.Q_to_expected_rate(M)
    expected_rate_bal = mrate.Q_to_expected_rate(R)
    spectral_rate_mut = 1 / mrate.R_to_relaxation_time(M)
    spectral_rate_bal = 1 / mrate.R_to_relaxation_time(R)
    mi_mut = ctmcmi.get_mutual_information(M, t)
    mi_bal = ctmcmi.get_mutual_information(R, t)
    fi_mut = divtime.get_fisher_information(M, t)
    fi_bal = divtime.get_fisher_information(R, t)
    # compute the summary statistics
    summary_entries = [
        shannon_ent_bal - shannon_ent_mut,
        logical_ent_bal - logical_ent_mut,
        log(shannon_ent_bal) - log(shannon_ent_mut),
        log(logical_ent_bal) - log(logical_ent_mut),
        expected_rate_bal - expected_rate_mut,
        spectral_rate_bal - spectral_rate_mut,
        log(expected_rate_bal) - log(expected_rate_mut),
        log(spectral_rate_bal) - log(spectral_rate_mut),
        mi_bal - mi_mut,
        fi_bal - fi_mut,
        math.log(mi_bal) - math.log(mi_mut),
        math.log(fi_bal) - math.log(fi_mut),
    ]
    # get the definition entries
    definition_entries = [
        S[1, 0],
        S[2, 0],
        S[2, 1],
        S[3, 0],
        S[3, 1],
        S[3, 2],
        mdistn[0],
        mdistn[1],
        mdistn[2],
        mdistn[3],
        bdistn[0],
        bdistn[1],
        bdistn[2],
        bdistn[3],
        infotype,
        t,
    ]
    # define the label
    if infotype == 'infotype.mi' and mi_mut > mi_bal:
        label = 'mut.is.better'
    elif infotype == 'infotype.mi' and mi_mut < mi_bal:
        label = 'bal.is.better'
    elif infotype == 'infotype.fi' and fi_mut > fi_bal:
        label = 'mut.is.better'
    elif infotype == 'infotype.fi' and fi_mut < fi_bal:
        label = 'bal.is.better'
    else:
        label = 'indistinguishable'
    # return the row
    return definition_entries + summary_entries + [label]
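
The mutation matrix built above is the usual GTR construction: M_ij = S_ij * mdistn_j off the diagonal, with the diagonal set so each row sums to zero, which makes mdistn stationary by detailed balance. A quick sanity check under that assumption:

import numpy as np

def check_gtr_stationarity(S, mdistn):
    # with S symmetric, mdistn_i * M_ij = S_ij * mdistn_i * mdistn_j
    # is symmetric in i and j, so detailed balance holds for mdistn
    M = S * mdistn
    M -= np.diag(np.sum(M, axis=1))
    return np.allclose(mdistn.dot(M), np.zeros(len(M)))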