Example #1
def switching_BER(data, **kwargs):
    """ Process data for BER experiment. """
    count_mat, start_stt = count_matrices_ber(data, **kwargs)
    switched_stt = int(1 - start_stt)
    mean = beta.mean(1 + count_mat[start_stt, switched_stt],
                     1 + count_mat[start_stt, start_stt])
    limit = beta.mean(
        1 + count_mat[start_stt, switched_stt] +
        count_mat[start_stt, start_stt], 1)
    ci68 = beta.interval(0.68, 1 + count_mat[start_stt, switched_stt],
                         1 + count_mat[start_stt, start_stt])
    ci95 = beta.interval(0.95, 1 + count_mat[start_stt, switched_stt],
                         1 + count_mat[start_stt, start_stt])
    return mean, limit, ci68, ci95
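# A quick sanity check with hypothetical counts (not from count_matrices_ber): with
# the flat Beta(1, 1) prior used above, the returned mean is
# (n_switched + 1) / (n_switched + n_stayed + 2).
from scipy.stats import beta
n_switched, n_stayed = 7, 3
assert abs(beta.mean(1 + n_switched, 1 + n_stayed) - (7 + 1) / (7 + 3 + 2)) < 1e-12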
Example #2
    def get_answer(self):

        # In the beginning of every turn, we calculate the first, second, and third top guesses
        # related to the clue.
        if self.start_turn:
            embedding_distances = self.compute_distance(self.clue, self.words)
            sorted_words = [k for k, v in sorted(embedding_distances.items(), key=lambda item: item[1])]
            print("Words sorted by embedding distances: ", sorted_words)

            # First closest word.
            self.first = sorted_words[0]
            self.state[self.first] = (self.state[self.first][0] + 5, self.state[self.first][1])

            # Second closest word. Guaranteed to exist.
            self.second = sorted_words[1]
            self.state[self.second] = (self.state[self.second][0] + 3, self.state[self.second][1])

            # Third closest word. Usually exists.
            self.third = sorted_words[2]
            self.state[self.third] = (self.state[self.third][0] + 2, self.state[self.third][1])

            self.start_turn = False

        print("state is: ", self.state)
        print("\n")

        sorted_by_beta = [k for k, v in sorted(self.state.items(), key=lambda item: -beta.mean(item[1][0], item[1][1]))]
        print("sorted_by_beta: ", sorted_by_beta)
        print("\n")
        # Note this guess may or may not be related to the current clue.
        self.guess = sorted_by_beta[0]
        self.guess_index = self.words.index(self.guess)
        print("Guess is: ", self.guess)
        self.num -= 1
        return self.guess
Example #3
def beta_estimation(_, rewards):
    global GLOBAL_CACHE
    key = (tuple(rewards), 'beta')
    if key in GLOBAL_CACHE:
        return GLOBAL_CACHE[key]
    trials = len(rewards)
    GLOBAL_CACHE[key] = (beta.mean(1 + sum(rewards), trials+1 - sum(rewards)),
        beta.std(1 + sum(rewards), trials+1 - sum(rewards)))
    return GLOBAL_CACHE[key]
Example #4
def run_beta_fit(cadd_trset, mnp_cadd_trset, gerp_trset):
  '''
from scipy import stats  
import numpy as np  
import matplotlib.pylab as plt

# create some normal random noisy data
ser = 50*np.random.rand() * np.random.normal(10, 10, 100) + 20

# plot normed histogram
plt.hist(ser, normed=True)

# find minimum and maximum of xticks, so we know
# where we should compute theoretical distribution
xt = plt.xticks()[0]  
xmin, xmax = min(xt), max(xt)  
lnspc = np.linspace(xmin, xmax, len(ser))

ab,bb,cb,db = stats.beta.fit(ser)  
pdf_beta = stats.beta.pdf(lnspc, ab, bb,cb, db)  
plt.plot(lnspc, pdf_beta, label="Beta")

plt.show()
  '''
  
  cadd_trset_param = {}
  for aaconv in cadd_trset.keys():
     a,b,loc2,scale2 = beta.fit(cadd_trset[aaconv])
     mean2 = beta.mean(a,b,loc2,scale2)
     cadd_trset_param[aaconv] = [a,b,loc2,scale2,mean2]

  mnp_cadd_trset_param = {}
  for aaconv in mnp_cadd_trset.keys(): 
    a,b,loc2,scale2 = beta.fit(mnp_cadd_trset[aaconv])
    mean2 = beta.mean(a,b,loc2,scale2)
    mnp_cadd_trset_param[aaconv] = [a,b,loc2,scale2,mean2]

  gerp_trset_param = {}
  for aaconv in gerp_trset.keys():
    a,b,loc2,scale2 = beta.fit(gerp_trset[aaconv])
    mean2 = beta.mean(a,b,loc2,scale2)
    gerp_trset_param[aaconv] = [a,b,loc2,scale2,mean2]

  return cadd_trset_param, mnp_cadd_trset_param, gerp_trset_param
Example #5
    def test_transformation_composition_II(self):
        num_vars = 2
        alpha_stat = 5
        beta_stat = 2
        def beta_cdf(x): return beta_rv.cdf(x, a=alpha_stat, b=beta_stat)
        def beta_icdf(x): return beta_rv.ppf(x, a=alpha_stat, b=beta_stat)
        x_marginal_cdfs = [beta_cdf]*num_vars
        x_marginal_inv_cdfs = [beta_icdf]*num_vars
        x_marginal_means = np.asarray(
            [beta_rv.mean(a=alpha_stat, b=beta_stat)]*num_vars)
        x_marginal_stdevs = np.asarray(
            [beta_rv.std(a=alpha_stat, b=beta_stat)]*num_vars)

        def beta_pdf(x): return beta_rv.pdf(x, a=alpha_stat, b=beta_stat)
        x_marginal_pdfs = [beta_pdf]*num_vars

        z_correlation = -0.9*np.ones((num_vars, num_vars))
        for ii in range(num_vars):
            z_correlation[ii, ii] = 1.

        x_correlation = gaussian_copula_compute_x_correlation_from_z_correlation(
            x_marginal_inv_cdfs, x_marginal_means, x_marginal_stdevs,
            z_correlation)
        x_covariance = correlation_to_covariance(
            x_correlation, x_marginal_stdevs)

        var_trans_1 = NatafTransformation(
            x_marginal_cdfs, x_marginal_inv_cdfs, x_marginal_pdfs, x_covariance,
            x_marginal_means)

        # rosenblatt maps to [0,1] but polynomials of bounded variables
        # are in [-1,1] so add second transformation for this second mapping
        def normal_cdf(x): return normal_rv.cdf(x)
        def normal_icdf(x): return normal_rv.ppf(x)
        std_normal_marginal_cdfs = [normal_cdf]*num_vars
        std_normal_marginal_inv_cdfs = [normal_icdf]*num_vars
        var_trans_2 = UniformMarginalTransformation(
            std_normal_marginal_cdfs, std_normal_marginal_inv_cdfs)
        var_trans = TransformationComposition([var_trans_1, var_trans_2])

        num_samples = 1000
        true_samples, true_canonical_samples = \
            generate_x_samples_using_gaussian_copula(
                num_vars, z_correlation, x_marginal_inv_cdfs, num_samples)
        true_canonical_samples = normal_rv.cdf(true_canonical_samples)

        samples = var_trans.map_from_canonical_space(
            true_canonical_samples)
        assert np.allclose(true_samples, samples)

        canonical_samples = var_trans.map_to_canonical_space(samples)
        assert np.allclose(true_canonical_samples, canonical_samples)
Example #6
    def setUp(self):
        uniform_var1 = {'var_type': 'uniform', 'range': [-1, 1]}
        uniform_var2 = {'var_type': 'uniform', 'range': [0, 1]}
        beta_var1 = {
            'var_type': 'beta',
            'range': [-1, 1],
            'alpha_stat': 1,
            'beta_stat': 1
        }
        beta_var2 = {
            'var_type': 'beta',
            'range': [-2, 1],
            'alpha_stat': 2,
            'beta_stat': 1
        }
        gaussian_var = {'var_type': 'gaussian', 'mean': -1., 'variance': 4.}

        #self.continuous_variables = [
        #    uniform_var1,beta_var1,gaussian_var,uniform_var2,uniform_var1,
        #    beta_var2]
        self.continuous_variables = [
            uniform(-1, 2),
            beta(1, 1, -1, 2),
            norm(-1, 2),
            uniform(),
            uniform(-1, 2),
            beta(2, 1, -2, 3)
        ]

        self.continuous_mean = np.array(
            [0., 0., -1, 0.5, 0.,
             beta.mean(a=2, b=1, loc=-2, scale=3)])

        nmasses1 = 10
        mass_locations1 = np.geomspace(1.0, 32.0, num=nmasses1)
        masses1 = np.ones(nmasses1, dtype=float) / nmasses1

        nmasses2 = 10
        mass_locations2 = np.arange(0, nmasses2)
        masses2 = np.geomspace(1.0, 32.0, num=nmasses2)
        masses2 /= masses2.sum()
        # second () is to freeze variable which creates var.dist member
        # variable
        var1 = float_rv_discrete(name='var1',
                                 values=(mass_locations1, masses1))()
        var2 = float_rv_discrete(name='var2',
                                 values=(mass_locations2, masses2))()
        self.discrete_variables = [var1, var2]
        self.discrete_mean = np.empty(len(self.discrete_variables))
        for ii, var in enumerate(self.discrete_variables):
            self.discrete_mean[ii] = var.moment(1)
Example #7
def shots_to_obs_moments(bitarray: np.ndarray, qubits: List[int], observable: PauliTerm,
                         use_beta_dist_unbiased_prior: bool = False) -> Tuple[float, float]:
    """
    Calculate the mean and variance of the given observable based on the bitarray of results.

    :param bitarray: results from running `qc.run`, a 2D num_shots by num_qubits array.
    :param qubits: list of qubits in order corresponding to the bitarray results.
    :param observable: the observable whose moments are calculated from the shot data
    :param use_beta_dist_unbiased_prior: if true then the mean and variance are estimated from a
        beta distribution that incorporates an unbiased Bayes prior. This precludes var = 0.
    :return: tuple specifying (mean, variance)
    """
    coeff = complex(observable.coefficient)
    if not np.isclose(coeff.imag, 0):
        raise ValueError(f"The coefficient of an observable should not be complex.")
    coeff = coeff.real

    obs_qubits = [q for q, _ in observable]
    # Identify classical register indices to select
    idxs = [idx for idx, q in enumerate(qubits) if q in obs_qubits]

    if len(idxs) == 0:  # identity term
        return coeff, 0

    assert bitarray.shape[1] == len(qubits), 'qubits should label each column of the bitarray'

    # Pick columns corresponding to qubits with a non-identity out_operation
    obs_strings = bitarray[:, idxs]
    # Transform bits to eigenvalues; ie (+1, -1)
    my_obs_strings = 1 - 2 * obs_strings
    # Multiply row-wise to get operator values.
    obs_vals = np.prod(my_obs_strings, axis=1)

    if use_beta_dist_unbiased_prior:
        # For binary classified data with N counts of + and M counts of -, these can be estimated
        # using the mean and variance of the beta distribution beta(N+1, M+1) where the +1 is used
        # to incorporate an unbiased Bayes prior.
        plus_array = obs_vals == 1
        n_minus, n_plus = np.bincount(plus_array, minlength=2)
        bernoulli_mean = beta.mean(n_plus + 1, n_minus + 1)
        bernoulli_var = beta.var(n_plus + 1, n_minus + 1)
        obs_mean, obs_var = transform_bit_moments_to_pauli(bernoulli_mean, bernoulli_var)
        obs_mean *= coeff
        obs_var *= coeff**2
    else:
        obs_vals = coeff * obs_vals
        obs_mean = np.mean(obs_vals).item()
        obs_var = np.var(obs_vals).item() / len(bitarray)

    return obs_mean, obs_var
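# A standalone sketch of the beta(N+1, M+1) prior described in the docstring, using
# hypothetical +1/-1 shot outcomes rather than the pipeline above, and mapping the
# Bernoulli moments to the eigenvalue scale by hand instead of via
# transform_bit_moments_to_pauli.
import numpy as np
from scipy.stats import beta

obs_vals = np.array([1, 1, -1, 1, -1, 1, 1, 1])  # hypothetical eigenvalue outcomes
n_plus = int(np.sum(obs_vals == 1))
n_minus = int(np.sum(obs_vals == -1))

p_mean = beta.mean(n_plus + 1, n_minus + 1)  # posterior mean of P(outcome = +1)
p_var = beta.var(n_plus + 1, n_minus + 1)    # posterior variance, never exactly zero

obs_mean = 2 * p_mean - 1  # E[o] = 2p - 1 for outcomes in {+1, -1}
obs_var = 4 * p_var        # the linear map scales the variance by 4
print(obs_mean, obs_var)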
Example #8
def get_mean_accuracy(all_means, nbins=10):
    """
    Bins ancestors according to mean bootstrapped posterior probability,
    and then returns the mean accuracy for each bin
    """
    ## Add a column of bin assignments
    # bins = np.linspace(0, all_means['posterior'].max(), nbins)
    bins = np.linspace(0, 1, nbins)
    all_means['bin'] = np.digitize(all_means['posterior'], bins)

    ## Add upper bound to right-most bin
    all_means.replace(to_replace={'bin':{nbins: nbins-1}}, inplace=True)

    ## Bin ancestors by mean bootstrapped probability, adding columns for
    ## whether they were the true generating ancestor, and the number of
    ## ancestors in each bin
    bin_count = lambda x: len(x)
    binned = all_means[['generator', 'bin']].pivot_table(index='bin',
                            aggfunc=[np.mean, bin_count], fill_value=0)
    binned.columns = [['observed_prob', 'bin_count']]
    binned['n_successes'] = binned['observed_prob'].values * \
            binned['bin_count'].values

    ## Estimate means and confidence intervals as sampling from a binomial
    ## distribution, with a uniform prior on success rates - Done using
    ## a beta distribution
    binned['alpha'] = binned['n_successes'] + 1
    binned['beta'] = binned['bin_count'].values - binned['n_successes'].values + 1
    beta_mean = lambda row: beta.mean(float(row['alpha']), float(row['beta']))
    binned['posterior_mean'] = binned.apply(beta_mean, axis=1)

    ## Add confidence intervals
    beta_025CI = lambda row: beta.ppf(0.025, float(row['alpha']), float(row['beta']))
    beta_975CI = lambda row: beta.ppf(0.975, float(row['alpha']), float(row['beta']))
    binned['CI2.5'] = binned.apply(beta_025CI, axis=1)
    binned['CI97.5'] = binned.apply(beta_975CI, axis=1)

    ## Convert to values relative to mean, to fit plotting convention
    binned['CI2.5'] = binned['posterior_mean'].values - binned['CI2.5'].values
    binned['CI97.5'] = binned['CI97.5'].values - binned['posterior_mean'].values

    ## Add column with bin centre for plotting
    binned['bin_centre'] = all_means[['posterior', 'bin']].groupby('bin').mean()

    return binned
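# Aside: the two ppf calls above produce the same equal-tailed interval as
# beta.interval; a quick check with hypothetical alpha/beta values:
from scipy.stats import beta
lo, hi = beta.interval(0.95, 5, 7)
assert abs(lo - beta.ppf(0.025, 5, 7)) < 1e-12
assert abs(hi - beta.ppf(0.975, 5, 7)) < 1e-12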
Example #9
    def test_nataf_transformation(self):
        num_vars = 2
        alpha_stat = 2
        beta_stat = 5
        bisection_opts = {'tol': 1e-10, 'max_iterations': 100}

        def beta_cdf(x): return beta_rv.cdf(x, a=alpha_stat, b=beta_stat)
        def beta_icdf(x): return beta_rv.ppf(x, a=alpha_stat, b=beta_stat)
        x_marginal_cdfs = [beta_cdf]*num_vars
        x_marginal_inv_cdfs = [beta_icdf]*num_vars
        x_marginal_means = np.asarray(
            [beta_rv.mean(a=alpha_stat, b=beta_stat)]*num_vars)
        x_marginal_stdevs = np.asarray(
            [beta_rv.std(a=alpha_stat, b=beta_stat)]*num_vars)

        def beta_pdf(x): return beta_rv.pdf(x, a=alpha_stat, b=beta_stat)
        x_marginal_pdfs = [beta_pdf]*num_vars

        z_correlation = np.array([[1, 0.7], [0.7, 1]])

        x_correlation = \
            gaussian_copula_compute_x_correlation_from_z_correlation(
                x_marginal_inv_cdfs, x_marginal_means, x_marginal_stdevs,
                z_correlation)

        x_covariance = correlation_to_covariance(
            x_correlation, x_marginal_stdevs)

        var_trans = NatafTransformation(
            x_marginal_cdfs, x_marginal_inv_cdfs, x_marginal_pdfs, x_covariance,
            x_marginal_means, bisection_opts)

        assert np.allclose(var_trans.z_correlation, z_correlation)

        num_samples = 1000
        true_samples, true_canonical_samples = \
            generate_x_samples_using_gaussian_copula(
                num_vars, z_correlation, x_marginal_inv_cdfs, num_samples)

        canonical_samples = var_trans.map_to_canonical_space(true_samples)
        assert np.allclose(true_canonical_samples, canonical_samples)

        samples = var_trans.map_from_canonical_space(
            true_canonical_samples)
        assert np.allclose(true_samples, samples)
Example #10
    def beta_posterior(self, a, b):
        """
            A beta(a,b) prior on the proportion of disease cases: theta.
        """

        theta_lower = 0
        theta_upper = 1

        theta_range = np.linspace(theta_lower, theta_upper, 100)

        beta_posterior_distribution = beta.pdf(
            theta_range, self.no_disease_occurances + a,
            self.sample_size + b - self.no_disease_occurances)

        beta_posterior_mean = beta.mean(
            self.no_disease_occurances + a,
            self.sample_size + b - self.no_disease_occurances)

        return theta_range, beta_posterior_distribution, beta_posterior_mean
Example #11
def survival_statistics(bitstrings):
    """
    Calculate the mean and variance of the estimated probability of the ground state given shot
    data on one or more bits.

    For binary classified data with N counts of 1 and M counts of 0, these
    can be estimated using the mean and variance of the beta distribution beta(N+1, M+1) where the
    +1 is used to incorporate an unbiased Bayes prior.

    :param ndarray bitstrings: A 2D numpy array of repetitions x bit-arrays.
    :return: (survival mean, sqrt(survival variance))
    """
    survived = np.sum(bitstrings, axis=1) == 0

    # count occurrences of 000...0 and anything besides 000...0
    n_died, n_survived = np.bincount(survived, minlength=2)

    # mean and variance given by beta distribution with a uniform prior
    survival_mean = beta.mean(n_survived + 1, n_died + 1)
    survival_var = beta.var(n_survived + 1, n_died + 1)
    return survival_mean, np.sqrt(survival_var)
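# Usage sketch with made-up shot data: rows are repetitions, columns are bits. Three
# of the five shots are all-zero, so the result is beta.mean(4, 3) ~ 0.571 with
# error sqrt(beta.var(4, 3)).
import numpy as np
bitstrings = np.array([[0, 0],
                       [0, 1],
                       [0, 0],
                       [1, 1],
                       [0, 0]])
mean, err = survival_statistics(bitstrings)
print(mean, err)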
Example #12
def player_abilities(decay, day_span):

    query = {'mw': decay, 'day_span': day_span}

    projection = {
        '_id': 0,
        'mw': 0,
        'day_span': 0,
    }

    mongo_wrapper = mongo.Mongo()
    cursor = mongo_wrapper.find(mongo_wrapper.PLAYERS_BETA, query, projection)

    abilities_df = pd.DataFrame(list(cursor))

    df = pd.concat([
        abilities_df.drop(['player'], axis=1), abilities_df['player'].apply(
            pd.Series)
    ],
                   axis=1)

    df['mean'] = beta.mean(df.a, df.b)

    return df
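# Note: beta.mean broadcasts over array-valued shape parameters, which is what lets
# the column-wise call above work; a quick standalone check with hypothetical values:
import numpy as np
from scipy.stats import beta
print(beta.mean(np.array([1.0, 2.0, 6.0]), np.array([1.0, 2.0, 2.0])))  # -> [0.5  0.5  0.75]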
Example #13
def switching_phase_diagram(buffers, durations, volts, num_clusters=2):
    #Get an idea of SNR
    #Cluster all the data into three based with starting point based on edges
    all_vals = buffers.flatten()
    all_vals.resize((all_vals.size, 1))
    init_guess = np.linspace(np.min(all_vals), np.max(all_vals), num_clusters)
    init_guess[[1, -1]] = init_guess[[-1, 1]]
    init_guess.resize((num_clusters, 1))
    clusterer = KMeans(init=init_guess, n_clusters=num_clusters)
    state = clusterer.fit_predict(all_vals)

    #Report initial state distributions
    print("Total initial state distribution:")
    init_state = state[::2]
    for ct in range(num_clusters):
        print("\tState {}: {:.2f}%".format(
            ct, 100 * np.sum(init_state == ct) / len(init_state)))

    #Approximate SNR from centre distance and variance
    std0 = np.std(all_vals[state == 0])
    std1 = np.std(all_vals[state == 1])
    mean_std = 0.5 * (std0 + std1)
    centre0 = clusterer.cluster_centers_[0, 0]
    centre1 = clusterer.cluster_centers_[1, 0]
    centre_dist = centre1 - centre0
    print(
        "Centre distance = {:.3f} with widths = {:.4f} / {:.4f} gives SNR ratio {:.3}"
        .format(centre_dist, std0, std1, centre_dist / mean_std))

    #Have a look at the distributions
    plt.figure()
    for ct in range(num_clusters):
        sns.distplot(all_vals[state == ct], kde=False, norm_hist=False)

    #calculate some switching matrices for each amplitude
    # 0->0 0->1
    # 1->0 1->1
    counts = []
    for buf in buffers:
        state = clusterer.predict(buf.reshape((len(buf), 1)))
        init_state = state[::2]
        final_state = state[1::2]
        switched = np.logical_xor(init_state, final_state)

        count_mat = np.zeros((2, 2), dtype=int)

        count_mat[0, 0] = np.sum(
            np.logical_and(init_state == 0, np.logical_not(switched)))
        count_mat[0, 1] = np.sum(np.logical_and(init_state == 0, switched))
        count_mat[1, 0] = np.sum(np.logical_and(init_state == 1, switched))
        count_mat[1, 1] = np.sum(
            np.logical_and(init_state == 1, np.logical_not(switched)))

        counts.append(count_mat)

    mean_PtoAP = np.array(
        [beta.mean(1 + c[0, 1], 1 + c[0, 0]) for c in counts])
    limit_PtoAP = np.array(
        [beta.mean(1 + c[0, 1] + c[0, 0], 1) for c in counts])
    mean_APtoP = np.array(
        [beta.mean(1 + c[1, 0], 1 + c[1, 1]) for c in counts])
    limit_APtoP = np.array(
        [beta.mean(1 + c[1, 0] + c[1, 1], 1) for c in counts])
    ci68_PtoAP = np.array(
        [beta.interval(0.68, 1 + c[0, 1], 1 + c[0, 0]) for c in counts])
    ci68_APtoP = np.array(
        [beta.interval(0.68, 1 + c[1, 0], 1 + c[1, 1]) for c in counts])
    ci95_PtoAP = np.array(
        [beta.interval(0.95, 1 + c[0, 1], 1 + c[0, 0]) for c in counts])
    ci95_APtoP = np.array(
        [beta.interval(0.95, 1 + c[1, 0], 1 + c[1, 1]) for c in counts])

    # import h5py
    # FID = h5py.File("data/CSHE-Switching-PhaseDiagramPtoAP.h5", "w")
    # FID.create_dataset("/buffer", data=buffers, compression="lzf")
    # FID.create_dataset("/durations", data=durations, compression="lzf")
    # FID.create_dataset("/volts", data=volts, compression="lzf")
    # FID.close()

    plt.figure()
    plt.title("Phase Diagram - P to AP", size=16)
    plt.xlabel("Pulse Duration (ns)", size=14)
    plt.ylabel("Pulse Amplitude (V)", size=14)
    means_diagram_PtoAP = mean_PtoAP.reshape(len(volts),
                                             len(durations),
                                             order='F')

    plt.pcolormesh(durations * 1e9, volts, means_diagram_PtoAP, cmap="RdGy")
    plt.colorbar()
    plt.figure()
    plt.title("Phase Diagram - AP to P", size=16)
    plt.xlabel("Pulse Duration (ns)", size=14)
    plt.ylabel("Pulse Amplitude (V)", size=14)
    means_diagram_APtoP = mean_APtoP.reshape(len(volts),
                                             len(durations),
                                             order='F')
    plt.pcolormesh(durations * 1e9, volts, means_diagram_APtoP, cmap="RdGy")
    plt.colorbar()
    print("Reached end")
    plt.show()
Example #14
        count_mat[0, 1] = np.sum(np.logical_and(init_state == 0, switched))
        count_mat[1, 0] = np.sum(np.logical_and(init_state == 1, switched))
        count_mat[1, 1] = np.sum(
            np.logical_and(init_state == 1, np.logical_not(switched)))

        counts.append(count_mat)

    import h5py
    FID = h5py.File("data/nTron-AmpFall-PhaseDiagram-10V-52us-HighRes.h5", "w")
    FID.create_dataset("/buffer", data=buffers, compression="lzf")
    FID.create_dataset("/fall_times", data=fall_times, compression="lzf")
    FID.create_dataset("/amplitudes", data=amplitudes, compression="lzf")
    FID.close()

    mean_PtoAP = np.array(
        [beta.mean(1 + c[0, 1], 1 + c[0, 0]) for c in counts])
    mean_APtoP = np.array(
        [beta.mean(1 + c[1, 0], 1 + c[1, 1]) for c in counts])
    plt.figure()
    plt.title("P to AP")
    plt.xlabel("Pulse Falltime (ns)")
    plt.ylabel("Pulse Amplitude (Arb. Units)")
    means_diagram_PtoAP = mean_PtoAP.reshape(len(amplitudes),
                                             len(fall_times),
                                             order='F')
    plt.pcolormesh(fall_times * 1e9,
                   amplitudes,
                   means_diagram_PtoAP,
                   cmap="RdGy")
    plt.colorbar()
    plt.figure()
Example #15
    def test_correlated_beta(self):

        num_vars = 2
        alpha_stat = 2
        beta_stat = 5
        bisection_opts = {'tol': 1e-10, 'max_iterations': 100}

        beta_cdf = lambda x: beta_rv.cdf(x, a=alpha_stat, b=beta_stat)
        beta_icdf = lambda x: beta_rv.ppf(x, a=alpha_stat, b=beta_stat)
        x_marginal_cdfs = [beta_cdf] * num_vars
        x_marginal_inv_cdfs = [beta_icdf] * num_vars
        x_marginal_means = np.asarray(
            [beta_rv.mean(a=alpha_stat, b=beta_stat)] * num_vars)
        x_marginal_stdevs = np.asarray(
            [beta_rv.std(a=alpha_stat, b=beta_stat)] * num_vars)
        beta_pdf = lambda x: beta_rv.pdf(x, a=alpha_stat, b=beta_stat)
        x_marginal_pdfs = [beta_pdf] * num_vars

        x_correlation = np.array([[1, 0.7], [0.7, 1]])

        quad_rule = gauss_hermite_pts_wts_1D(11)
        z_correlation = transform_correlations(x_correlation,
                                               x_marginal_inv_cdfs,
                                               x_marginal_means,
                                               x_marginal_stdevs, quad_rule,
                                               bisection_opts)
        assert np.allclose(z_correlation[0, 1], z_correlation[1, 0])

        x_correlation_recovered = \
            gaussian_copula_compute_x_correlation_from_z_correlation(
                x_marginal_inv_cdfs,x_marginal_means,x_marginal_stdevs,
                z_correlation)
        assert np.allclose(x_correlation, x_correlation_recovered)

        z_variable = multivariate_normal(mean=np.zeros((num_vars)),
                                         cov=z_correlation)
        z_joint_density = lambda x: z_variable.pdf(x.T)
        target_density = partial(nataf_joint_density,
                                 x_marginal_cdfs=x_marginal_cdfs,
                                 x_marginal_pdfs=x_marginal_pdfs,
                                 z_joint_density=z_joint_density)

        # all variances are the same so
        #true_x_covariance  = x_correlation.copy()*x_marginal_stdevs[0]**2
        true_x_covariance = correlation_to_covariance(x_correlation,
                                                      x_marginal_stdevs)

        def univariate_quad_rule(n):
            x, w = np.polynomial.legendre.leggauss(n)
            x = (x + 1.) / 2.
            w /= 2.
            return x, w

        x, w = get_tensor_product_quadrature_rule(100, num_vars,
                                                  univariate_quad_rule)
        assert np.allclose(np.dot(target_density(x), w), 1.0)

        # test covariance computed by applying quadrature to the joint density
        mean = np.dot(x * target_density(x), w)
        x_covariance = np.empty((num_vars, num_vars))
        x_covariance[0, 0] = np.dot(x[0, :]**2 * target_density(x),
                                    w) - mean[0]**2
        x_covariance[1, 1] = np.dot(x[1, :]**2 * target_density(x),
                                    w) - mean[1]**2
        x_covariance[0, 1] = np.dot(x[0, :] * x[1, :] * target_density(x),
                                    w) - mean[0] * mean[1]
        x_covariance[1, 0] = x_covariance[0, 1]
        # error is influenced by bisection_opts['tol']
        assert np.allclose(x_covariance,
                           true_x_covariance,
                           atol=bisection_opts['tol'])

        # test samples generated using Gaussian copula are correct
        num_samples = 10000
        x_samples, true_u_samples = generate_x_samples_using_gaussian_copula(
            num_vars, z_correlation, x_marginal_inv_cdfs, num_samples)

        x_sample_covariance = np.cov(x_samples)
        assert np.allclose(true_x_covariance, x_sample_covariance, atol=1e-2)

        u_samples = nataf_transformation(x_samples, true_x_covariance,
                                         x_marginal_cdfs, x_marginal_inv_cdfs,
                                         x_marginal_means, x_marginal_stdevs,
                                         bisection_opts)

        assert np.allclose(u_samples, true_u_samples)

        trans_samples = inverse_nataf_transformation(
            u_samples, x_covariance, x_marginal_cdfs, x_marginal_inv_cdfs,
            x_marginal_means, x_marginal_stdevs, bisection_opts)

        assert np.allclose(x_samples, trans_samples)
Example #16
 def mean(self, n, p):
     mu = beta.mean(self, n, p)
     return mu
Example #17
 def get_overall_acc(self, weight):
     return np.dot(beta.mean(self._params[:, 0], self._params[:, 1]),
                   weight)
Example #18
def generate_beta_distribution_mean(alpha_val, beta_val):
    """ Generates the mean of the beta distribution for the given alpha and beta values.
    """
    return beta.mean(alpha_val, beta_val)
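# For reference, the mean of Beta(alpha, beta) is alpha / (alpha + beta); a quick
# check with hypothetical values (the snippet assumes `from scipy.stats import beta`):
assert abs(generate_beta_distribution_mean(2, 3) - 2 / (2 + 3)) < 1e-12  # 0.4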
Example #19
ax.fill_between(mu_test, prior_mu, color='green', alpha=0.3)

ax.set_xlabel(r'$\mu$')
ax.set_ylabel(r'$p(\mu|\mathbf{x})$')

# pick random uniform point
# and update assumption
points = []

index = np.random.permutation(X.shape[0])
for i in range(0, X.shape[0]):
    y, a_n, b_n = posterior(a, b, X[:index[i]])
    plt.plot(mu_test, y, 'r', alpha=0.3)

    print(a_n, b_n)
    post_mean = beta.mean(a_n, b_n)
    prior_mean = beta.mean(a, b)

    points.append(post_mean - prior_mean)

y, _, _ = posterior(a, b, X)
plt.plot(mu_test, y, 'b', linewidth=4.0)

# q3
ax = fig.add_subplot(212)
xx = [i for i in range(0, len(points))]
plt.plot(xx, points)

plt.tight_layout()
plt.show()
#plt.savefig(path, transparent=True)
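# A minimal sketch (an assumption, not the original) of the `posterior` helper used
# above: a conjugate Beta-Bernoulli update evaluated on a module-level `mu_test`
# grid, with X holding 0/1 observations and (a, b) the prior parameters.
import numpy as np
from scipy.stats import beta

mu_test = np.linspace(0, 1, 200)

def posterior(a, b, X):
    a_n = a + np.sum(X)             # successes added to the prior alpha
    b_n = b + len(X) - np.sum(X)    # failures added to the prior beta
    return beta.pdf(mu_test, a_n, b_n), a_n, b_n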
Example #20
    def naiveProbability(self, questionNumber, idx):
        expectedPerformance = list()
        individualResponse = list()
        probabilities = list()
        human_accuracy = list()

        machine_accuracy = [None for _ in range(self.numAgents)]
        group_accuracy = 0
        
        #Save human expected performance based on the current alpha/beta parameters
        for i in range(0,self.teamSize):
            expectedPerformance.append(beta.mean(self.alphas[i],self.betas[i]))
            individualResponse.append(self.lastIndividualResponsesbyQNo[(self.lastIndividualResponsesbyQNo["questionNumber"] == questionNumber) & (self.lastIndividualResponsesbyQNo["sender"] == self.teamMember.iloc[i])]["stringValue"].any())
            self.updateAlphaBeta(i,self.lastIndividualResponsesbyQNo[(self.lastIndividualResponsesbyQNo["questionNumber"] == questionNumber) & (self.lastIndividualResponsesbyQNo["sender"] == self.teamMember.iloc[i])]["stringValue"].any(),self.correctAnswers[idx])

            ans = self.lastIndividualResponsesbyQNo[(self.lastIndividualResponsesbyQNo["questionNumber"] == questionNumber) & (self.lastIndividualResponsesbyQNo["sender"] == self.teamMember.iloc[i])]["stringValue"].any()
            if ans == self.correctAnswers[idx]:
                human_accuracy.append(1)
            else:
                human_accuracy.append(0)
                
        if (self.groupSubmission["groupAnswer"].iloc[idx] == self.correctAnswers[idx]):
            group_accuracy = 1            
            
        indxQ = -1
        anyMachineAsked = False
        if(int(float(questionNumber)) in self.machineAskedQuestions):
            indxQ = self.machineAskedQuestions.index(int(float(questionNumber)))        
            sender = self.machineAsked['sender'].iloc[indxQ]
            k = self.teamArray.index(sender)
            anyMachineAsked = True
        
        # Add expected Performance for Agents
        for i in range(self.teamSize, self.teamSize+self.numAgents):
            expectedPerformance.append(beta.mean(self.alphas[i],self.betas[i]))
            # update alpha beta for that machine

        #Update machine accuracy 
        if(anyMachineAsked):
            self.updateAlphaBeta(self.getAgentForHuman(k), self.machineAsked['event_content'].iloc[indxQ].split("||")[0].split(":")[2].replace('"', ''), self.correctAnswers[idx])
            self.machineUseCount[k]+=1
            machineAnswer = self.machineAsked['event_content'].iloc[indxQ].split("||")[0].split(":")[2].replace('"', '').split("_")[0]

            if self.firstMachineUsage[k] == -1:
                self.firstMachineUsage[k] = idx
            machine_accuracy[k] = 1

        
        # Conditional Probability
        # Do a bayes update
        denominator = 0
        numerator = [1. for _ in range(len(self.options[idx]))]
        prob_class = 0.25
        prob_resp = 0
        prob_class_responses = [None for _ in range(len(self.options[idx]))]
        prob_resp_given_class = [None for _ in range(len(self.options[idx]))]


        for opt_num in range(0,len(self.options[idx])):
            prob_resp = 0
            numerator = prob_class
            for person_num in range(0,self.teamSize):
                if individualResponse[person_num] == self.options[idx][opt_num]:
                    numerator *= expectedPerformance[person_num]
                else:
                    numerator *= (1 - expectedPerformance[person_num])/3
                prob_resp += numerator

            prob_resp_given_class[opt_num] = numerator
        prob_class_responses = [(prob_resp_given_class[i]/sum(prob_resp_given_class)) for i in range(0,len(prob_resp_given_class))]
        
        #ANS: Is this updating agent probabilities?
        for i in range(self.teamSize):
            probabilities.append(expectedPerformance[self.teamSize+i])    

        #8 probability values returned
        # first set is for options (sums to 1)
	
        assert(sum(prob_class_responses) > 0.999 and sum(prob_class_responses) < 1.001)
        #second set is for machines
        prob_all_class_responses = prob_class_responses + [expectedPerformance[self.getAgentForHuman(k)] for k in range(self.teamSize)]

        return prob_all_class_responses,human_accuracy,group_accuracy,machine_accuracy 
Example #21
def run():

    if len(sys.argv) < 4:
        print 'usage: python %s methylation_reads_all.tsv loglike_threshold outfile [-stranded +|-] [-minAbsLogLike float] [-minAbsPValue float] [-BayesianIntegration window(bp) step alpha beta pseudosamplesize] [-N6mAweight pseudosamplesize genome.fa] [-saveNewSingleMoleculeFile filename]' % sys.argv[
            0]
        print '\tNote: the [-BayesianIntegration] option requires the [-minAbsPValue] option'
        print '\tNote: the [-saveNewSingleMoleculeFile] option requires the [-BayesianIntegration] option'
        print '\tNote: the [-N6mAweight] option only works together with the -BayesianIntegration option'
        sys.exit(1)

    reads = sys.argv[1]
    LLthreshold = float(sys.argv[2])
    outfilename = sys.argv[3]

    doStranded = False
    if '-stranded' in sys.argv:
        doStranded = True
        WantedStrand = sys.argv[sys.argv.index('-stranded') + 1]
        print 'will only output coverage from reads on the', WantedStrand, 'strand'

    minAbsLogLike = 0
    if '-minAbsLogLike' in sys.argv:
        minAbsLogLike = float(sys.argv[sys.argv.index('-minAbsLogLike') + 1])
        print 'will ignore bases with absolute loglikelihood values less than', minAbsLogLike

    doP = False
    minAbsPValue = 0
    if '-minAbsPValue' in sys.argv:
        doP = True
        minAbsPValue = float(sys.argv[sys.argv.index('-minAbsPValue') + 1])
        print 'will ignore bases with p-values higher than', minAbsPValue, 'and lower than', 1 - minAbsPValue

    doSaveNewFile = False
    doBI = False
    if '-BayesianIntegration' in sys.argv:
        if not doP:
            print 'data not specified to be in probability space, exiting'
            sys.exit(1)
        doBI = True
        window = int(sys.argv[sys.argv.index('-BayesianIntegration') + 1])
        step = int(sys.argv[sys.argv.index('-BayesianIntegration') + 2])
        alph = float(sys.argv[sys.argv.index('-BayesianIntegration') + 3])
        bet = float(sys.argv[sys.argv.index('-BayesianIntegration') + 4])
        PSS = int(sys.argv[sys.argv.index('-BayesianIntegration') + 5])
        print 'will integrate accessibility probabilities over windows of', window, 'bp in size, step size', step, 'bp, using (', alph, bet, ') as beta priors, and a pseudosample size of', PSS
        if '-saveNewSingleMoleculeFile' in sys.argv:
            doSaveNewFile = True
            NewFile = open(
                sys.argv[sys.argv.index('-saveNewSingleMoleculeFile') + 1],
                'w')
            print 'will save integrated basepair accessibilities probabilities into a new file:', sys.argv[
                sys.argv.index('-saveNewSingleMoleculeFile') + 1]
        doAweight = False
        if '-N6mAweight' in sys.argv:
            doAweight = True
            N6mAweight = int(sys.argv[sys.argv.index('-N6mAweight') + 1])
            genome_fasta = sys.argv[sys.argv.index('-N6mAweight') + 2]
            print 'will use a different weight for N6mA', sys.argv[
                sys.argv.index('-saveNewSingleMoleculeFile') + 1]
            GenomeDict = {}
            sequence = ''
            inputdatafile = open(genome_fasta)
            for line in inputdatafile:
                if line[0] == '>':
                    if sequence != '':
                        GenomeDict[chr] = ''.join(sequence).upper()
                    chr = line.strip().split('>')[1]
                    sequence = []
                    continue
                else:
                    sequence.append(line.strip())
            GenomeDict[chr] = ''.join(sequence).upper()

    CoverageDict = {}

    if reads.endswith('.bz2'):
        cmd = 'bzip2 -cd ' + reads
    elif reads.endswith('.gz') or reads.endswith('.bgz'):
        cmd = 'zcat ' + reads
    elif reads.endswith('.zip'):
        cmd = 'unzip -p ' + reads
    else:
        cmd = 'cat ' + reads
    RN = 0
    P = os.popen(cmd, "r")
    line = 'line'
    while line != '':
        line = P.readline().strip()
        if line == '':
            break
        if line.startswith('chromosome\tstart\tend\tread_name'):
            continue
        fields = line.strip().split('\t')
        if len(fields) < 8:
            print 'skipping:', fields
            continue
        RN += 1
        if RN % 10000 == 0:
            print RN, 'lines processed'
        chr = fields[0]
        strand = fields[3]
        if doStranded:
            if strand != WantedStrand:
                continue
        Ps = fields[6].split(',')
        LLs = fields[7].split(',')
        if CoverageDict.has_key(chr):
            pass
        else:
            CoverageDict[chr] = {}
        if doBI:
            PDict = {}
            for i in range(len(Ps)):
                pos = int(Ps[i])
                p = float(LLs[i])
                PDict[pos] = p
            positions = PDict.keys()
            minPos = min(positions)
            maxPos = max(positions)
            if doSaveNewFile:
                NewPos = []
                NewLLs = []
            for pos in range(minPos + window / 2, maxPos - window / 2, step):
                (A, B) = (alph, bet)
                for j in range(pos - window / 2, pos + window / 2):
                    if PDict.has_key(j):
                        p = PDict[j]
                        if doAweight:
                            if strand == '+' and GenomeDict[chr][j] == 'A':
                                Z = int(N6mAweight * p)
                                A = A + Z
                                B = B + N6mAweight - Z
                            elif strand == '-' and GenomeDict[chr][j] == 'T':
                                Z = int(N6mAweight * p)
                                A = A + Z
                                B = B + N6mAweight - Z
                            else:
                                Z = int(PSS * p)
                                A = A + Z
                                B = B + PSS - Z
                        else:
                            Z = int(PSS * p)
                            A = A + Z
                            B = B + PSS - Z
                final_p = beta.mean(A, B)
                newpos = pos - (pos % step)
                if doSaveNewFile:
                    NewPos.append(str(newpos))
                    NewLLs.append("{0:.2f}".format(final_p))
                if final_p > minAbsPValue and final_p < 1 - minAbsPValue:
                    continue
                else:
                    pass
                if CoverageDict[chr].has_key(newpos):
                    pass
                else:
                    CoverageDict[chr][newpos] = [0, 0]
                if final_p < LLthreshold:
                    CoverageDict[chr][newpos][1] += 1
                else:
                    CoverageDict[chr][newpos][0] += 1
            if doSaveNewFile:
                outline = fields[0] + '\t' + fields[1] + '\t' + fields[
                    2] + '\t' + fields[3] + '\t' + fields[4] + '\t' + fields[5]
                outline = outline + '\t' + ','.join(NewPos) + '\t' + ','.join(
                    NewLLs)
                NewFile.write(outline + '\n')
        else:
            try:
                for i in range(len(Ps)):
                    pos = int(Ps[i])
                    ll = float(LLs[i])
                    if math.fabs(ll) >= minAbsLogLike:
                        pass
                    else:
                        continue
                    if doP:
                        if ll > minAbsPValue and ll < 1 - minAbsPValue:
                            continue
                        else:
                            pass
                    if CoverageDict[chr].has_key(pos):
                        pass
                    else:
                        CoverageDict[chr][pos] = [0, 0]
                    if ll < LLthreshold:
                        CoverageDict[chr][pos][1] += 1
                    else:
                        CoverageDict[chr][pos][0] += 1
            except:
                print 'skipping read'
                print fields
                continue

    print 'finished inputting reads'

    if doSaveNewFile:
        NewFile.close()

    chromosomes = CoverageDict.keys()
    chromosomes.sort()

    outfile = open(outfilename, 'w')

    outline = '#chr\tstart\tend\tmeth\tunmeth\tcov'
    outfile.write(outline + '\n')

    K = 0
    for chr in chromosomes:
        print chr
        positions = CoverageDict[chr].keys()
        positions.sort()
        for pos in positions:
            outline = chr + '\t' + str(pos) + '\t' + str(pos + 1)
            outline = outline + '\t' + str(CoverageDict[chr][pos][1])
            outline = outline + '\t' + str(CoverageDict[chr][pos][0])
            outline = outline + '\t' + str(CoverageDict[chr][pos][0] +
                                           CoverageDict[chr][pos][1])
            outfile.write(outline + '\n')

    outfile.close()
Example #22
def run():

    if len(sys.argv) < 10:
        print 'usage: python %s methylation_reads_all.tsv peaks chrFieldID leftFieldID rightFieldID minCoverage maxDist tabix_location outfile [-subsample N]' % sys.argv[
            0]
        print '\tNote: the script assumes Tombo 1.3 probabilities, a tabix indexed reads file, and uses a beta distribution prior of (10,10) by default'
        print '\tNote: the subsample option will sample the reads in all comparisons down to the minCoverage level; the N parameter indicates how many such subsamplings should be averaged for the final value'
        sys.exit(1)

    reads = sys.argv[1]
    peaks = sys.argv[2]
    chrFieldID = int(sys.argv[3])
    leftFieldID = int(sys.argv[4])
    rightFieldID = int(sys.argv[5])
    minCov = int(sys.argv[6])
    maxDist = int(sys.argv[7])
    tabix = sys.argv[8]
    outfilename = sys.argv[9]

    doSS = False
    if '-subsample' in sys.argv:
        SS = int(sys.argv[sys.argv.index('-subsample') + 1])
        doSS = True
        print 'will subsample all comparisons down to', minCov, 'reads'
        print 'will take the average outcome of', SS, 'subsamplings'

    alph = 10
    bet = 10
    PSS = 100

    PeakDict = {}
    if peaks.endswith('.bz2'):
        cmd = 'bzip2 -cd ' + peaks
    elif peaks.endswith('.gz') or peaks.endswith('.bgz'):
        cmd = 'zcat ' + peaks
    elif peaks.endswith('.zip'):
        cmd = 'unzip -p ' + peaks
    else:
        cmd = 'cat ' + peaks
    RN = 0
    P = os.popen(cmd, "r")
    line = 'line'
    while line != '':
        line = P.readline().strip()
        if line == '':
            break
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        chr = fields[chrFieldID]
        RL = int(fields[leftFieldID])
        RR = int(fields[rightFieldID])
        if PeakDict.has_key(chr):
            pass
        else:
            PeakDict[chr] = []
        PeakDict[chr].append((RL, RR))

    print 'finished inputting peaks'

    outfile = open(outfilename, 'w')
    outline = '#chr\tpeak1_left\tpeak1_right\tpeak1_open\tpeak1_closed\tpeak1_fraction\tpeak2_left\tpeak2_right\tpeak2_open\tpeak2_closed\tpeak2_fraction\tp_val'
    outfile.write(outline + '\n')

    for chr in PeakDict.keys():
        PeakDict[chr].sort()
        for i in range(len(PeakDict[chr]) - 1):
            (RL1, RR1) = PeakDict[chr][i]
            for j in range(i + 1, len(PeakDict[chr])):
                (RL2, RR2) = PeakDict[chr][j]
                print chr, RL1, RR1, RL2, RR2,
                if RR2 - RL1 > maxDist:
                    break
                cmd = tabix + ' ' + reads + ' ' + chr + ':' + str(
                    RL1) + '-' + str(RR2)
                p = os.popen(cmd, "r")
                RegionReads = []
                line = 'line'
                while line != '':
                    line = p.readline().strip()
                    if line == '':
                        break
                    fields = line.strip().split('\t')
                    chr = fields[0]
                    read_left = int(fields[1])
                    read_right = int(fields[2])
                    if read_left <= RL1 and read_right >= RR2:
                        pass
                    else:
                        continue
                    strand = fields[3]
                    read = fields[4]
                    cgs = fields[6].split(',')
                    loglike = fields[7].split(',')
                    RegionReads.append((cgs, loglike))
                print len(RegionReads)
                if len(RegionReads) < minCov:
                    continue
                if doSS:
                    p_av = 0.0
                    CR1_av = 0.0
                    OR1_av = 0.0
                    CR2_av = 0.0
                    OR2_av = 0.0
                    for S in range(SS):
                        RegionReadsSubSampled = random.sample(
                            RegionReads, minCov)
                        OpenOrClosed = []
                        for (cgs, loglike) in RegionReadsSubSampled:
                            t = zip(cgs, loglike)
                            RD = dict((int(x), float(y)) for x, y in t)
                            (A, B) = (alph, bet)
                            for pos in range(RL1, RR1):
                                if RD.has_key(pos):
                                    p = RD[pos]
                                    Z = int(PSS * p)
                                    A = A + Z
                                    B = B + PSS - Z
                            if beta.mean(A, B) > 0.5:
                                final_p1 = 1
                            else:
                                final_p1 = 0
                            (A, B) = (alph, bet)
                            for pos in range(RL2, RR2):
                                if RD.has_key(pos):
                                    p = RD[pos]
                                    Z = int(PSS * p)
                                    A = A + Z
                                    B = B + PSS - Z
                            if beta.mean(A, B) > 0.5:
                                final_p2 = 1
                            else:
                                final_p2 = 0
                            OpenOrClosed.append((final_p1, final_p2))
                        C00 = OpenOrClosed.count((0, 0))
                        C01 = OpenOrClosed.count((0, 1))
                        C10 = OpenOrClosed.count((1, 0))
                        C11 = OpenOrClosed.count((1, 1))
                        CR1 = C01 + C00
                        OR1 = C10 + C11
                        CR2 = C10 + C00
                        OR2 = C01 + C11
                        oddsratio, pvalue = fisher_exact([[C00, C01],
                                                          [C10, C11]])
                        logp = -math.log10(pvalue)
                        p_av += logp
                        CR1_av += CR1
                        OR1_av += OR1
                        CR2_av += CR2
                        OR2_av += OR2
                    (pvalue, CR1, OR1, CR2,
                     OR2) = (p_av / SS, CR1_av / SS, OR1_av / SS, CR2_av / SS,
                             OR2_av / SS)
                    outline = chr + '\t' + str(RL1) + '\t' + str(RR1)
                    outline = outline + '\t' + str(OR1) + '\t' + str(
                        CR1) + '\t' + str(OR1 / (OR1 + CR1 + 0.0))
                    outline = outline + '\t' + str(RL2) + '\t' + str(RR2)
                    outline = outline + '\t' + str(OR2) + '\t' + str(
                        CR2) + '\t' + str(
                            OR2 / (OR2 + CR2 + 0.0)) + '\t' + str(pvalue)
                    outfile.write(outline + '\n')
                else:
                    OpenOrClosed = []
                    for (cgs, loglike) in RegionReads:
                        t = zip(cgs, loglike)
                        RD = dict((int(x), float(y)) for x, y in t)
                        (A, B) = (alph, bet)
                        for pos in range(RL1, RR1):
                            if RD.has_key(pos):
                                p = RD[pos]
                                Z = int(PSS * p)
                                A = A + Z
                                B = B + PSS - Z
                        if beta.mean(A, B) > 0.5:
                            final_p1 = 1
                        else:
                            final_p1 = 0
                        (A, B) = (alph, bet)
                        for pos in range(RL2, RR2):
                            if RD.has_key(pos):
                                p = RD[pos]
                                Z = int(PSS * p)
                                A = A + Z
                                B = B + PSS - Z
                        if beta.mean(A, B) > 0.5:
                            final_p2 = 1
                        else:
                            final_p2 = 0
                        OpenOrClosed.append((final_p1, final_p2))
                    C00 = OpenOrClosed.count((0, 0))
                    C01 = OpenOrClosed.count((0, 1))
                    C10 = OpenOrClosed.count((1, 0))
                    C11 = OpenOrClosed.count((1, 1))
                    CR1 = C01 + C00
                    OR1 = C10 + C11
                    CR2 = C10 + C00
                    OR2 = C01 + C11
                    oddsratio, pvalue = fisher_exact([[C00, C01], [C10, C11]])
                    outline = chr + '\t' + str(RL1) + '\t' + str(RR1)
                    outline = outline + '\t' + str(OR1) + '\t' + str(
                        CR1) + '\t' + str(OR1 / (OR1 + CR1 + 0.0))
                    outline = outline + '\t' + str(RL2) + '\t' + str(RR2)
                    if pvalue == 0:
                        pvalue = 1e-300
                    outline = outline + '\t' + str(OR2) + '\t' + str(
                        CR2) + '\t' + str(OR2 /
                                          (OR2 + CR2 + 0.0)) + '\t' + str(
                                              -math.log10(pvalue))
                    outfile.write(outline + '\n')

    outfile.close()
Example #23
from scipy.stats import beta
from scipy import special
from matplotlib import pyplot
import numpy as np
import math

a_prior = 2
b_prior = 10
a = 46
b = 240

print('Prior mean: ', beta.mean(a_prior, b_prior))
print('Posterior mean : ', beta.mean(a, b))

x = np.arange(0.0, 1.01, 0.01)
y_prior = beta.pdf(x, a_prior, b_prior)
y = beta.pdf(x, a, b)

ax = pyplot.subplot(111)
ax.plot(x, y_prior)
ax.plot(x, y)
# plot.show()

sample_size = 1000
r = beta.rvs(a, b, size=sample_size)
new_mean = np.mean(r)
new_std = np.std(r)
"""
z_critical is the number of standard deviations you'd have to go from the mean
to capture the proportion of data associated with the confidence level
"""
Example #24
        state = clusterer.predict(buf.reshape((2*reps,1)))
        init_state = state[::2]
        final_state = state[1::2]
        switched = np.logical_xor(init_state, final_state)

        count_mat = np.zeros((2,2), dtype=int)

        count_mat[0,0] = np.sum(np.logical_and(init_state == 0, np.logical_not(switched) ))
        count_mat[0,1] = np.sum(np.logical_and(init_state == 0, switched ))
        count_mat[1,0] = np.sum(np.logical_and(init_state == 1, switched ))
        count_mat[1,1] = np.sum(np.logical_and(init_state == 1, np.logical_not(switched) ))

        counts.append(count_mat)

    plt.figure()
    mean_PtoAP = [beta.mean(1+c[0,1], 1+c[0,0]) for c in counts]
    mean_APtoP = [beta.mean(1+c[1,0], 1+c[1,1]) for c in counts]
    ci68_PtoAP = [beta.interval(0.68, 1+c[0,1], 1+c[0,0]) for c in counts]
    ci68_APtoP = [beta.interval(0.68, 1+c[1,0], 1+c[1,1]) for c in counts]
    ci95_PtoAP = [beta.interval(0.95, 1+c[0,1], 1+c[0,0]) for c in counts]
    ci95_APtoP = [beta.interval(0.95, 1+c[1,0], 1+c[1,1]) for c in counts]
    current_palette = sns.color_palette()
    plt.plot(fall_times, mean_PtoAP)
    plt.fill_between(fall_times, [ci[0] for ci in ci68_PtoAP], [ci[1] for ci in ci68_PtoAP], color=current_palette[0], alpha=0.2, edgecolor="none")
    plt.fill_between(fall_times, [ci[0] for ci in ci95_PtoAP], [ci[1] for ci in ci95_PtoAP], color=current_palette[0], alpha=0.2, edgecolor="none")
    plt.plot(fall_times, mean_APtoP)
    plt.fill_between(fall_times, [ci[0] for ci in ci68_APtoP], [ci[1] for ci in ci68_APtoP], color=current_palette[1], alpha=0.2, edgecolor="none")
    plt.fill_between(fall_times, [ci[0] for ci in ci95_APtoP], [ci[1] for ci in ci95_APtoP], color=current_palette[1], alpha=0.2, edgecolor="none")
    plt.xlabel("nTron Pulse Fall Time (s)")
    plt.ylabel("Switching Probability")
    plt.legend(("P->AP", "AP->P"))
Example #25
def run():

    if len(sys.argv) < 11:
        print 'usage: python %s methylation_reads_all.tsv peaks chrFieldID leftFieldID rightFieldID minCoverage maxDist N_samplings tabix_location outfile [-subsample N] [-quantiles N]' % sys.argv[0]
        print '\tNote: the script assumes Tombo 1.3 probabilities, a tabix indexed reads file, and uses a beta distribution prior of (10,10) by default'
        print '\tNote: the subsample option will sample the reads in all comparisons down to the minCoverage level; the N parameter indicates how many such subsamplings should be averaged for the final value'
        print '\tNote: the subsample option IS REQUIRED AT THE MOMENT'
        sys.exit(1)

    reads = sys.argv[1]
    peaks = sys.argv[2]
    chrFieldID = int(sys.argv[3])
    leftFieldID = int(sys.argv[4])
    rightFieldID = int(sys.argv[5])
    minCov = int(sys.argv[6])
    maxDist = int(sys.argv[7])
    Nsamp = int(sys.argv[8])
    tabix = sys.argv[9]
    outfilename = sys.argv[10]

    QU = 5
    if '-quantiles' in sys.argv:
        QU = int(sys.argv[sys.argv.index('-quantiles') + 1])
        print 'will split reads into', QU, 'quantiles/bins instead of the default 5'

    doSS = False
    if '-subsample' in sys.argv:
        SS = int(sys.argv[sys.argv.index('-subsample') + 1])
        doSS = True
        print 'will subsample all comparisons down to', minCov, 'reads'
        print 'will take the average outcome of', SS, 'subsamplings'
        if minCov % QU != 0:
            print 'minCov value must be divisible by the number of quantiles, which is', QU, ', exiting'
            sys.exit(1)

    alph = 10
    bet = 10
    PSS = 100

    PeakDict = {}
    if peaks.endswith('.bz2'):
        cmd = 'bzip2 -cd ' + peaks
    elif peaks.endswith('.gz') or peaks.endswith('.bgz'):
        cmd = 'zcat ' + peaks
    elif peaks.endswith('.zip'):
        cmd = 'unzip -p ' + peaks
    else:
        cmd = 'cat ' + peaks
    RN = 0
    P = os.popen(cmd, "r")
    line = 'line'
    while line != '':
        line = P.readline().strip()
        if line == '':
            break
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        chr = fields[chrFieldID]
        RL = int(fields[leftFieldID])
        RR = int(fields[rightFieldID])
        if PeakDict.has_key(chr):
            pass
        else:
            PeakDict[chr] = []
        PeakDict[chr].append((RL,RR))

    print 'finished inputting peaks'

    outfile = open(outfilename,'w')
    outline = '#chr\tpeak1_left\tpeak1_right\tpeak1_open\tpeak1_closed\tpeak1_fraction\tpeak2_left\tpeak2_right\tpeak2_open\tpeak2_closed\tpeak2_fraction\tFisher_test_p_val\tEmpirical_p-val\tMax_upper_empirical_p-val\tMax_lower_empirical_p-val\tNMI'
    outfile.write(outline + '\n')

    for chr in PeakDict.keys():
        PeakDict[chr].sort()
        for i in range(len(PeakDict[chr])-1):
            (RL1,RR1) = PeakDict[chr][i]
            for j in range(i+1,len(PeakDict[chr])):
                (RL2,RR2) = PeakDict[chr][j]
                print 'testing:', chr, RL1, RR1, RL2, RR2
                if RR2 - RL1 > maxDist:
                    break
                cmd = tabix + ' ' + reads + ' ' + chr + ':' + str(RL1) + '-' + str(RR2)
                p = os.popen(cmd, "r")
                RegionReads = []
                line = 'line'
                while line != '':
                    line = p.readline().strip()
                    if line == '':
                        break
                    fields = line.strip().split('\t')
                    chr = fields[0]
                    read_left = int(fields[1])
                    read_right = int(fields[2])
                    if read_left <= RL1 and read_right >= RR2:
                        pass
                    else:
                        continue
                    strand = fields[3]
                    read = fields[4]
                    cgs = fields[6].split(',')
                    loglike = fields[7].split(',')
                    LLs = []
                    NLLs = sum(float(L) > 0.5 for L in loglike)/(len(loglike) + 0.0)
                    RegionReads.append((NLLs,cgs,loglike))
                print 'found:', len(RegionReads), 'reads', 'needed:', minCov, 'reads'
                if len(RegionReads) < minCov:
                    continue
                if doSS:
                    emp_p_av = 0.0
                    max_possible_upper_emp_p_av = 0.0
                    max_possible_lower_emp_p_av = 0.0
                    NMI_av = 0.0
                    p_av = 0.0
                    CR1_av = 0.0
                    OR1_av = 0.0
                    CR2_av = 0.0
                    OR2_av = 0.0
                    for S in range(SS):
                        RegionReadsSubSampled = random.sample(RegionReads,minCov)
                        OpenOrClosed = []
                        for (NLLs,cgs,loglike) in RegionReadsSubSampled:
                            t = zip(cgs,loglike)
                            RD = dict((int(x), float(y)) for x, y in t)
                            (A,B) = (alph,bet)
                            for pos in range(RL1,RR1):
                                if RD.has_key(pos):
                                    p = RD[pos]
                                    Z = int(PSS*p)
                                    A = A + Z
                                    B = B + PSS - Z
                            if beta.mean(A,B) > 0.5:
                                final_p1 = 1
                            else:
                                final_p1 = 0
                            (A,B) = (alph,bet)
                            for pos in range(RL2,RR2):
                                if RD.has_key(pos):
                                    p = RD[pos]
                                    Z = int(PSS*p)
                                    A = A + Z
                                    B = B + PSS - Z
                            if beta.mean(A,B) > 0.5:
                                final_p2 = 1
                            else:
                                final_p2 = 0
                            OpenOrClosed.append((NLLs,final_p1,final_p2))

#                        (Z1,Z2,Z3) = zip(*OpenOrClosed)
#                        print 'before sorting:', Z1
#                        OpenOrClosed.sort()
#                        (Z1,Z2,Z3) = zip(*OpenOrClosed)
#                        print 'after sorting:', Z1

                        # get empirical sampling distribution of coaccessibility values:
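                        # Reads were sorted by their overall methylation level (NLLs) and are
                        # split into QU equal-size quantile chunks; shuffling the open/closed
                        # calls within each chunk gives a null that preserves the per-quantile
                        # composition of the two regions' calls.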
                        OpenOrClosed.sort()
                        ObservedMatches = []
                        STEP = len(OpenOrClosed)/QU
                        OMlist = []
                        for i in range(QU):
                            tempOMlist = []
                            for j in range(i*STEP,(i+1)*STEP):
                                tempOMlist.append(OpenOrClosed[j])
                            [Z,P,Q] = zip(*tempOMlist)
                            P = list(P)
                            Q = list(Q)
                            OMlist.append((P,Q))
                        for NS in range(Nsamp):
                            ObsMatchesInChunks = 0
                            for (P,Q) in OMlist:
                                Ps = np.array(P)
                                Qs = np.array(Q)
                                random.shuffle(Ps)
                                random.shuffle(Qs)
                                ObsMatchesInChunks += np.sum(Ps==Qs)
                            ObservedMatches.append(ObsMatchesInChunks)

                        # calculate normalized mutual information on coaccessibility:

                        [Z,P,Q] = zip(*OpenOrClosed)
                        OpenOrClosed = zip(P,Q)
                        P = list(P)
                        Q = list(Q)
                        P = np.array(P)
                        Q = np.array(Q)
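                        # NMIS is defined outside this excerpt; it presumably wraps a normalized
                        # mutual information score (e.g. sklearn.metrics.normalized_mutual_info_score).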
                        NMI = NMIS(P,Q)

                        # calculate empirical p-values (assuming normal distribution of sampling coaccessibilities):
                        matches = np.sum(P==Q)
                        if matches < 0.5*len(P):
                            NMI = -NMI
                        ObservedMatches = np.array(ObservedMatches)
                        OMm = np.mean(ObservedMatches)
                        OMstd = np.std(ObservedMatches)
                        if OMstd == 0:
                            OMstd = 0.1
                        if matches < OMm:
                            emp_p_val = norm.cdf(matches,OMm,OMstd)
                            if emp_p_val == 0:
                                emp_p_val = -300
                            else:
                                emp_p_val = math.log10(emp_p_val)
                        else:
                            emp_p_val = 1 - norm.cdf(matches,OMm,OMstd)
                            if emp_p_val == 0:
                                emp_p_val = 300
                            else:
                                emp_p_val = -math.log10(emp_p_val)

                        # calculate maximum possible p-values:
                        max_possible_upper_emp_p = 1 - norm.cdf(len(P),OMm,OMstd)
                        if max_possible_upper_emp_p == 0:
                            max_possible_upper_emp_p = 300
                        else:
                            max_possible_upper_emp_p = -math.log10(max_possible_upper_emp_p)
                        max_possible_lower_emp_p = norm.cdf(0,OMm,OMstd)
                        if max_possible_lower_emp_p == 0:
                            max_possible_lower_emp_p = -300
                        else:
                            max_possible_lower_emp_p = math.log10(max_possible_lower_emp_p)

#                        print matches, len(P), OMm, OMstd, np.mean(P), np.mean(Q)
#                        print norm.cdf(len(P),OMm,OMstd), norm.cdf(0,OMm,OMstd)
#                        print norm.cdf(matches,OMm,OMstd), 1 - norm.cdf(matches,OMm,OMstd)

                        # calculate Fisher test p-value:
                        C00 = OpenOrClosed.count((0,0))
                        C01 = OpenOrClosed.count((0,1))
                        C10 = OpenOrClosed.count((1,0))
                        C11 = OpenOrClosed.count((1,1))
                        CR1 = C01 + C00
                        OR1 = C10 + C11
                        CR2 = C10 + C00
                        OR2 = C01 + C11
                        oddsratio, pvalue = fisher_exact([[C00, C01], [C10, C11]])
                        if pvalue == 0:
                            pvalue = 1e-300
                        logp = -math.log10(pvalue)
                        emp_p_av += emp_p_val
                        max_possible_upper_emp_p_av += max_possible_upper_emp_p 
                        max_possible_lower_emp_p_av += max_possible_lower_emp_p 
                        NMI_av += NMI
                        p_av += logp
                        CR1_av += CR1
                        OR1_av += OR1
                        CR2_av += CR2
                        OR2_av += OR2
                    (emp_p, max_possible_upper_emp_p, max_possible_lower_emp_p, NMI, pvalue, CR1, OR1, CR2, OR2) = (emp_p_av/SS, max_possible_upper_emp_p_av/SS, max_possible_lower_emp_p_av/SS,
                                                                                                                          NMI_av/SS, p_av/SS, CR1_av/SS, OR1_av/SS, CR2_av/SS, OR2_av/SS)

                    if str(emp_p) == 'nan':
                        print emp_p, max_possible_upper_emp_p, max_possible_lower_emp_p, NMI, pvalue, CR1, OR1, CR2, OR2

                    outline = chr + '\t' + str(RL1) + '\t' + str(RR1)
                    outline = outline + '\t' + str(OR1) + '\t' + str(CR1) + '\t' + str(OR1/(OR1 + CR1 + 0.0))
                    outline = outline + '\t' + str(RL2) + '\t' + str(RR2)
                    outline = outline + '\t' + str(OR2) + '\t' + str(CR2) + '\t' + str(OR2/(OR2 + CR2 + 0.0)) + '\t' + str(pvalue)
                    outline = outline + '\t' + str(emp_p)
                    outline = outline + '\t' + str(max_possible_upper_emp_p)
                    outline = outline + '\t' + str(max_possible_lower_emp_p)
                    outline = outline + '\t' + str(NMI)
                    outfile.write(outline + '\n')
                    print outline
#                else:
#                    OpenOrClosed = []
#                    for (cgs,loglike) in RegionReads:
#                        t = zip(cgs,loglike)
#                        RD = dict((int(x), float(y)) for x, y in t)
#                        (A,B) = (alph,bet)
#                        for pos in range(RL1,RR1):
#                            if RD.has_key(pos):
#                                p = RD[pos]
#                                Z = int(PSS*p)
#                                A = A + Z
#                                B = B + PSS - Z
#                        if beta.mean(A,B) > 0.5:
#                            final_p1 = 1
#                        else:
#                            final_p1 = 0
#                        (A,B) = (alph,bet)
#                        for pos in range(RL2,RR2):
#                            if RD.has_key(pos):
#                                p = RD[pos]
#                                Z = int(PSS*p)
#                                A = A + Z
#                                B = B + PSS - Z
#                        if beta.mean(A,B) > 0.5:
#                            final_p2 = 1
#                        else:
#                            final_p2 = 0
#                        OpenOrClosed.append((final_p1,final_p2))
#                    C00 = OpenOrClosed.count((0,0))
#                    C01 = OpenOrClosed.count((0,1))
#                    C10 = OpenOrClosed.count((1,0))
#                    C11 = OpenOrClosed.count((1,1))
#                    OR1 = C01 + C00
#                    CR1 = C10 + C11
#                    OR2 = C10 + C00
#                    CR2 = C01 + C11
#                    oddsratio, pvalue = fisher_exact([[C00, C01], [C10, C11]])
#                    outline = chr + '\t' + str(RL1) + '\t' + str(RR1)
#                    outline = outline + '\t' + str(OR1) + '\t' + str(CR1) + '\t' + str(OR1/(OR1 + CR1 + 0.0))
#                    outline = outline + '\t' + str(RL2) + '\t' + str(RR2)
#                    if pvalue == 0:
#                        pvalue = 1e-300
#                    outline = outline + '\t' + str(OR2) + '\t' + str(CR2) + '\t' + str(OR2/(OR2 + CR2 + 0.0)) + '\t' + str(-math.log10(pvalue))
#                    outfile.write(outline + '\n')

    outfile.close()
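
The per-read open/closed call above boils down to a Beta-posterior update: every covered position contributes PSS pseudo-observations split according to its methylation probability, and the read is labelled open when the posterior mean exceeds 0.5. A minimal, self-contained sketch of that step, assuming the same alph/bet/PSS convention as the script (the helper name is illustrative, not part of the original):

from scipy.stats import beta

def call_read_state(read_probs, region_left, region_right, alph=10, bet=10, PSS=100):
    # read_probs maps genomic position -> per-position probability in [0, 1].
    # Each covered position adds PSS pseudo-observations, split by its probability.
    A, B = alph, bet
    for pos in range(region_left, region_right):
        if pos in read_probs:
            Z = int(PSS * read_probs[pos])
            A += Z
            B += PSS - Z
    # Posterior mean above 0.5 -> call the read open (1), otherwise closed (0).
    return 1 if beta.mean(A, B) > 0.5 else 0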
Example #26
0
def switching_plots(buffers, axis1, num_clusters=2):
    #Get an idea of SNR
    #Cluster the data into num_clusters groups, seeding the centres at the edges of the value range
    all_vals = buffers.flatten()
    all_vals.resize((all_vals.size, 1))
    init_guess = np.linspace(np.min(all_vals), np.max(all_vals), num_clusters)
    init_guess[[1, -1]] = init_guess[[-1, 1]]
    init_guess.resize((num_clusters, 1))
    clusterer = KMeans(init=init_guess, n_clusters=num_clusters)
    state = clusterer.fit_predict(all_vals)

    #Report initial state distributions
    print("Total initial state distribution:")
    init_state = state[::2]
    for ct in range(num_clusters):
        print("\tState {}: {:.2f}%".format(
            ct, 100 * np.sum(init_state == ct) / len(init_state)))

    #Approximate SNR from centre distance and variance
    std0 = np.std(all_vals[state == 0])
    std1 = np.std(all_vals[state == 1])
    mean_std = 0.5 * (std0 + std1)
    centre0 = clusterer.cluster_centers_[0, 0]
    centre1 = clusterer.cluster_centers_[1, 0]
    centre_dist = centre1 - centre0
    print(
        "Centre distance = {:.3f} with widths = {:.4f} / {:.4f} gives SNR ratio {:.3}"
        .format(centre_dist, std0, std1, centre_dist / mean_std))

    #Have a look at the distributions
    plt.figure()
    for ct in range(num_clusters):
        sns.distplot(all_vals[state == ct], kde=False, norm_hist=False)

    #calculate some switching matrices for each amplitude
    # 0->0 0->1
    # 1->0 1->1
    counts = []
    for buf in buffers:
        state = clusterer.predict(buf.reshape((len(buf), 1)))
        init_state = state[::2]
        final_state = state[1::2]
        switched = np.logical_xor(init_state, final_state)

        count_mat = np.zeros((2, 2), dtype=int)

        count_mat[0, 0] = np.sum(
            np.logical_and(init_state == 0, np.logical_not(switched)))
        count_mat[0, 1] = np.sum(np.logical_and(init_state == 0, switched))
        count_mat[1, 0] = np.sum(np.logical_and(init_state == 1, switched))
        count_mat[1, 1] = np.sum(
            np.logical_and(init_state == 1, np.logical_not(switched)))

        counts.append(count_mat)
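    # Posterior switching-probability estimates with a uniform Beta(1, 1) prior;
    # the "limit" arrays are the posterior mean if every shot had switched,
    # i.e. the resolution floor set by the number of shots per point.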

    mean_PtoAP = np.array(
        [beta.mean(1 + c[0, 1], 1 + c[0, 0]) for c in counts])
    limit_PtoAP = np.array(
        [beta.mean(1 + c[0, 1] + c[0, 0], 1) for c in counts])
    mean_APtoP = np.array(
        [beta.mean(1 + c[1, 0], 1 + c[1, 1]) for c in counts])
    limit_APtoP = np.array(
        [beta.mean(1 + c[1, 0] + c[1, 1], 1) for c in counts])
    ci68_PtoAP = np.array(
        [beta.interval(0.68, 1 + c[0, 1], 1 + c[0, 0]) for c in counts])
    ci68_APtoP = np.array(
        [beta.interval(0.68, 1 + c[1, 0], 1 + c[1, 1]) for c in counts])
    ci95_PtoAP = np.array(
        [beta.interval(0.95, 1 + c[0, 1], 1 + c[0, 0]) for c in counts])
    ci95_APtoP = np.array(
        [beta.interval(0.95, 1 + c[1, 0], 1 + c[1, 1]) for c in counts])

    plt.figure()
    # volts = 7.5*np.power(10, (-5+attens)/20)
    current_palette = sns.color_palette()
    plt.plot(axis1, mean_PtoAP)
    plt.fill_between(axis1, [ci[0] for ci in ci68_PtoAP],
                     [ci[1] for ci in ci68_PtoAP],
                     color=current_palette[0],
                     alpha=0.2,
                     edgecolor="none")
    plt.fill_between(axis1, [ci[0] for ci in ci95_PtoAP],
                     [ci[1] for ci in ci95_PtoAP],
                     color=current_palette[0],
                     alpha=0.2,
                     edgecolor="none")
    plt.plot(axis1, mean_APtoP)
    plt.fill_between(axis1, [ci[0] for ci in ci68_APtoP],
                     [ci[1] for ci in ci68_APtoP],
                     color=current_palette[1],
                     alpha=0.2,
                     edgecolor="none")
    plt.fill_between(axis1, [ci[0] for ci in ci95_APtoP],
                     [ci[1] for ci in ci95_APtoP],
                     color=current_palette[1],
                     alpha=0.2,
                     edgecolor="none")
    # plt.xlabel("Pulse Amp (V)")
    plt.xlabel("Pulse Duration (ns)")
    plt.ylabel("Estimated Switching Probability")
    # plt.title("P to AP")
    # means_diagram_PtoAP = mean_PtoAP.reshape(len(attens), len(durations), order='F')
    # plt.pcolormesh(axis1, volts, means_diagram_PtoAP, cmap="RdGy")
    # plt.colorbar()
    # plt.figure()
    # plt.title("AP to P")
    # means_diagram_APtoP = mean_APtoP.reshape(len(attens), len(durations), order='F')
    # plt.pcolormesh(axis1, volts, means_diagram_APtoP, cmap="RdGy")
    # plt.colorbar()

    plt.figure()
    plt.semilogy(axis1, 1 - mean_PtoAP)
    plt.semilogy(axis1,
                 1 - limit_PtoAP,
                 color=current_palette[0],
                 linestyle="--")
    plt.semilogy(axis1, 1 - mean_APtoP)
    plt.semilogy(axis1,
                 1 - limit_APtoP,
                 color=current_palette[1],
                 linestyle="--")
    plt.ylabel("Switching Error Rate")
    plt.xlabel("Pulse Duration (ns)")

    plt.show()
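
A hypothetical invocation of switching_plots, assuming the imports used above (numpy, sklearn's KMeans, seaborn, matplotlib, scipy.stats.beta) are in scope; each row of buffers holds interleaved initial/final single-shot readings for one value of axis1 (here a pulse duration), so simple synthetic two-level data is enough to exercise the plots:

import numpy as np

durations = np.linspace(1, 50, 50)          # ns, passed as axis1
buffers = 0.05 * np.random.randn(50, 400)   # 200 shots per duration, interleaved init/final
buffers[:, 1::2] += 1.0                     # put the "final" readings around the second level

switching_plots(buffers, durations, num_clusters=2)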
Example #27
0
# The variance = E[X**2] - (E[X])**2
exp_x_squared = np.sum(np.square(p_grid) * posterior)
std = np.sqrt(exp_x_squared - mu**2)

print(f'posterior mean = {mu}, posterior standard deviation = {std}')
norm_approx_posterior = norm.pdf(p_grid, loc=mu, scale=std)

# The Beta dist. is a conjugate pair of the binomial dist
# More specifically, if X_1, ..., X_n are iid random variables from a Binomial dist.
# with parameter p, and p ~ Beta(a, b), then the posterior distribution of p
# given X_1 = x_1, ..., X_n = x_n is Beta(a + sum(x_1, ..., x_n), b + n - sum(x_1, ..., x_n))
# Since Uniform(0, 1) = Beta(1, 1), the hyper-parameter update rule after observing water W times
# and land L times is a = W + 1 and b = L + 1
W = 6
L = 3
beta_data = beta.pdf(p_grid, W + 1, L + 1)
beta_mu = beta.mean(W + 1, L + 1)
beta_std = beta.std(W + 1, L + 1)

norm_approx = norm.pdf(p_grid, beta_mu, beta_std)
# Plot both the analytically obtained posterior and the normal approximation
plt.plot(p_grid, beta_data, 'bo-', label='beta')
plt.plot(p_grid, norm_approx, 'ro-', label='normal')

plt.xlabel('Fraction of water')
plt.ylabel('Beta(W=6, L=3)')
plt.title(f'Sample= WLWWWLWLW; number of grid points = {NUM_PTS}')
plt.legend()

plt.show()
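
Since the grid approximation above targets the same Beta(W + 1, L + 1) posterior, its mean and standard deviation should agree with the analytic values; a quick check, reusing the variables already defined in this example:

print(f'grid mean = {mu:.4f}  vs  analytic beta mean = {beta_mu:.4f}')
print(f'grid std  = {std:.4f}  vs  analytic beta std  = {beta_std:.4f}')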
Example #28
0
 def true_mean_beta():
     return beta.mean(a=0.5, b=1, loc=self.left, scale=self.right)
Example #29
0
def run():

    if len(sys.argv) < 11:
        print 'usage: python %s methylation_reads_all.tsv region.bed chrFieldID leftFieldID rightFieldID minCoverage windowsize stepsize tabix_location outfileprefix [-subsample N] [-expectedMaxDist bp]' % sys.argv[0]
        print '\nNote: the script assumes Tombo 1.3 probabilities and a tabix-indexed reads file'
        print '\nNote: the [-subsample] option will sample the reads in all comparisons down to the minCoverage level; the N parameter indicates how many such subsamplings should be averaged for the final value'
        print '\nNote: the [-expectedMaxDist] option will change the initial window over which the required minimum number of reads is to be searched for; default: 2kb'
        sys.exit(1)

    reads = sys.argv[1]
    peaks = sys.argv[2]
    chrFieldID = int(sys.argv[3])
    leftFieldID = int(sys.argv[4])
    rightFieldID = int(sys.argv[5])
    minCov = int(sys.argv[6])
    window = int(sys.argv[7])
    step = int(sys.argv[8])
    tabix = sys.argv[9]
    outprefix = sys.argv[10]

    alph = 10
    bet = 10
    PSS = 100

    SS = 1
    doSS = False
    if '-subsample' in sys.argv:
        SS = int(sys.argv[sys.argv.index('-subsample') + 1])
        doSS = True
        print 'will subsample all comparisons down to', minCov, 'reads'
        print 'will take the average outcome of', SS, 'subsamplings'

    EMD = 2000
    if '-expectedMaxDist' in sys.argv:
        EMD = int(sys.argv[sys.argv.index('-expectedMaxDist') + 1])
        print 'will use an expected maximum distance of', EMD


    PeakDict = {}
    if peaks.endswith('.bz2'):
        cmd = 'bzip2 -cd ' + peaks
    elif peaks.endswith('.gz') or peaks.endswith('.bgz'):
        cmd = 'zcat ' + peaks
    elif peaks.endswith('.zip'):
        cmd = 'unzip -p ' + peaks
    else:
        cmd = 'cat ' + peaks
    RN = 0
    P = os.popen(cmd, "r")
    line = 'line'
    while line != '':
        line = P.readline().strip()
        if line == '':
            break
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        chr = fields[chrFieldID]
        RL = int(fields[leftFieldID])
        RR = int(fields[rightFieldID])
        if PeakDict.has_key(chr):
            pass
        else:
            PeakDict[chr] = []
        PeakDict[chr].append((RL,RR))

    print 'finished inputting peaks'


    for chr in PeakDict.keys():
        PeakDict[chr].sort()
        for (left,right) in PeakDict[chr]:
            print chr,left,right
            Matrix = {}
            outfile = open(outprefix + '.' + chr + '_' + str(left) + '_' + str(right),'w')
            for i in range(left,right,step):
                RC = 0
                cmd = tabix + ' ' + reads + ' ' + chr + ':' + str(i) + '-' + str(min(right,i + EMD))
                p = os.popen(cmd, "r")
                line = 'line'
                while line != '':
                    line = p.readline().strip()
                    if line == '':
                        break
                    fields = line.strip().split('\t')
                    read_left = int(fields[1])
                    read_right = int(fields[2])
                    if read_left <= i and read_right >= min(right,i + EMD):
                        RC += 1
                    else:
                        continue
                if RC >= minCov:
                    RCC = minCov
                    LJ = min(right,i + EMD)
                    while RCC >= minCov and LJ < right:
                        LJ += window
                        RRR = 0
                        cmd = tabix + ' ' + reads + ' ' + chr + ':' + str(i) + '-' + str(LJ)
                        p = os.popen(cmd, "r")
                        line = 'line'
                        while line != '':
                            line = p.readline().strip()
                            if line == '':
                                break
                            fields = line.strip().split('\t')
                            read_left = int(fields[1])
                            read_right = int(fields[2])
                            if read_left <= i and read_right >= LJ:
                                RRR += 1
                            else:
                                continue
                        RCC = RRR
                    rightLimit = LJ - window
                else: 
                    RCC = RC
                    LJ = min(right,i + EMD)
                    while RCC <= minCov and LJ > i:
                        LJ = LJ - window
                        RRR = 0
                        cmd = tabix + ' ' + reads + ' ' + chr + ':' + str(i) + '-' + str(LJ)
                        p = os.popen(cmd, "r")
                        line = 'line'
                        while line != '':
                            line = p.readline().strip()
                            if line == '':
                                break
                            fields = line.strip().split('\t')
                            read_left = int(fields[1])
                            read_right = int(fields[2])
                            if read_left <= i and read_right >= LJ:
                                RRR += 1
                            else:
                                continue
                        RCC = RRR
                    rightLimit = LJ
                rightLimit = min(right,rightLimit)
                print chr, i, rightLimit
                RegionReads = []
                cmd = tabix + ' ' + reads + ' ' + chr + ':' + str(i) + '-' + str(rightLimit)
                p = os.popen(cmd, "r")
                line = 'line'
                while line != '':
                    line = p.readline().strip()
                    if line == '':
                        break
                    fields = line.strip().split('\t')
                    chr = fields[0]
                    read_left = int(fields[1])
                    read_right = int(fields[2])
                    if read_left <= i and read_right >= rightLimit:
                        pass
                    else:
                        continue
                    strand = fields[3]
                    read = fields[4]
                    cgs = fields[6].split(',')
                    loglike = fields[7].split(',')
                    RegionReads.append((cgs,loglike))
                AccDict = {}
                for j in range(i,rightLimit,window):
                    AccDict[j] = []
                    Matrix[j] = {}
                    for k in range(j,rightLimit,window):
                        Matrix[j][k] = 0
                print len(RegionReads)
                if len(RegionReads) < minCov:
                    continue
                for S in range(SS):
                    if doSS:
                        RegionReadsSubSampled = random.sample(RegionReads,minCov)
                    else:
                        RegionReadsSubSampled = RegionReads
                    for (cgs,loglike) in RegionReadsSubSampled:
                        t = zip(cgs,loglike)
                        RD = dict((int(x), float(y)) for x, y in t)
                        for j in range(i,rightLimit,window):
                            (A,B) = (alph,bet)
                            for pos in range(j, j + window):
                                if RD.has_key(pos):
                                    p = RD[pos]
                                    Z = int(PSS*p)
                                    A = A + Z
                                    B = B + PSS - Z
                            if beta.mean(A,B) > 0.5:
                                final_p = 1
                            else:
                                final_p = 0
                            AccDict[j].append(final_p)
                    for j in range(i,rightLimit,window):
                        for k in range(j,rightLimit,window):
                            JSDvalue = JSD(AccDict[j],AccDict[k])
                            Matrix[j][k] += JSDvalue/SS
            outline = '#'
            for i in range(left,right,window):
                outline = outline + '\t' + str(i)
            outfile.write(outline + '\n')
            for i in range(left,right,window):
                outline = str(i)
                for j in range(left,right,window):
                    if Matrix.has_key(i):
                        if Matrix[i].has_key(j):
                            outline = outline + '\t' + "{0:.2f}".format(Matrix[i][j])
                        else:
                            outline = outline + '\t' + 'nan'
                    else:
                        outline = outline + '\t' + 'nan'
                outfile.write(outline + '\n')
            outfile.close()
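
The JSD(...) helper called above is not part of this excerpt; a minimal sketch, assuming it returns the Jensen-Shannon divergence between the empirical open/closed (Bernoulli) distributions of two lists of 0/1 calls, could look like:

import numpy as np

def JSD(calls1, calls2):
    # Hypothetical stand-in for the helper used above: Jensen-Shannon divergence
    # (in bits) between the Bernoulli distributions implied by two 0/1 call lists.
    def entropy(dist):
        nz = dist[dist > 0]
        return -np.sum(nz * np.log2(nz))
    p1, p2 = np.mean(calls1), np.mean(calls2)
    P = np.array([1.0 - p1, p1])
    Q = np.array([1.0 - p2, p2])
    M = 0.5 * (P + Q)
    return entropy(M) - 0.5 * entropy(P) - 0.5 * entropy(Q)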
Example #30
0
from scipy.stats import beta

a = 7.2
b = 2.3
m, v = beta.stats(a, b, moments="mv")
mean = beta.mean(a=a, b=b)
var = beta.var(a=a, b=b)
print("The mean is " + str(m))
print("The variance is " + str(v))

prob = 1 - beta.cdf(a=a, b=b, x=.90)
print("The probability of having a variance over 90% is " + str(prob))
Example #31
0
 def mean(self):
     return beta.mean(self.a, self.b)
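
The mean() method in the last example clearly belongs to a small wrapper around a Beta distribution holding shape parameters a and b; a minimal sketch of such a class (hypothetical names, not from the original source):

from scipy.stats import beta

class BetaPosterior:
    def __init__(self, a=1.0, b=1.0):
        # Start from a Beta(a, b) prior (uniform when a = b = 1).
        self.a = a
        self.b = b

    def update(self, success):
        # Conjugate update for a single Bernoulli observation.
        if success:
            self.a += 1
        else:
            self.b += 1

    def mean(self):
        return beta.mean(self.a, self.b)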