Python qqplot_2samples示例，statsmodels.api.qqplot_2samples Python示例

示例#1

0

显示文件

def test_1():
    """Save a QQ plot of HMC softmax samples against NumPy Dirichlet draws.

    Samples are drawn with ``hmc`` from the log-density returned by
    ``ldirichlet_softmax``, normalised with a softmax over axis 1, and
    their first component is plotted against the first component of
    ``np.random.dirichlet`` draws. The test is skipped when matplotlib
    cannot be imported.
    """
    try:
        from matplotlib import pyplot as pp
    except ImportError:
        raise SkipTest('Not making QQ plot')

    n_draws = 1000
    concentration = np.array([1, 2, 3], dtype=float)

    def logprob(x, alpha):
        # Return the (log-probability, gradient) pair; the gradient buffer
        # is handed to ldirichlet_softmax, which presumably fills it in place.
        gradient = np.zeros_like(x)
        return ldirichlet_softmax(x, alpha, grad=gradient), gradient

    start = np.random.normal(size=(3, ))
    samples, diag = hmc(logprob,
                        x0=start,
                        n_samples=n_draws,
                        args=(concentration, ),
                        n_steps=10,
                        return_diagnostics=True)

    # Softmax: exponentiate, then normalise each row to sum to one.
    weights = np.exp(samples)
    pi1 = weights / np.sum(weights, 1, keepdims=True)
    pi2 = np.random.dirichlet(alpha=concentration, size=n_draws)

    sm.qqplot_2samples(pi1[:, 0], pi2[:, 0], line='45')
    pp.savefig('bayes_ratematrix-test-1.png')

示例#2

0

显示文件

文件： central_limit_theorem.py 项目： krypticmouse/LiveStats

 def run(self, data='normal'):
     """Print summary statistics and plot the distribution of sample means.

     ``data`` is forwarded to ``self.generate_mean_distribution``. The
     mean and standard deviation of the returned means are printed next
     to those of ``self.data``. Three histograms are drawn (a normal
     sample with matching mean and standard deviation, the means, and
     the raw data), followed by a two-sample QQ plot of the means
     against the normal sample.
     """
     mean_dist = self.generate_mean_distribution(data)
     centre = np.mean(mean_dist)
     spread = np.std(mean_dist)

     print(f'Mean of Mean Distribution: {centre}')
     print(f'Mean of Data Distribution: {np.mean(self.data)}')
     print(f'S.D. of Mean Distribution: {spread}')
     print(f'S.D. of Data Distribution: {np.std(self.data)}')
     cust_norm = np.random.normal(centre, spread, self.sample_size)

     plt.figure(figsize=(14, 10))
     # (grid position, extra subplot2grid kwargs, values, title) per panel.
     panels = (
         ((0, 0), {}, cust_norm, 'Normal Distribution'),
         ((0, 1), {}, mean_dist, 'Mean Distribution'),
         ((1, 0), {'colspan': 2}, self.data, 'Data Distribution'),
     )
     for position, grid_kwargs, values, title in panels:
         plt.subplot2grid((2, 2), position, **grid_kwargs)
         sns.distplot(values)
         plt.title(title)

     qqplot_2samples(np.array(mean_dist), cust_norm, line='45')

     plt.show()

示例#3

0

显示文件

文件： verification.py 项目： mcveanlab/msprime

    def run_dtwf_coalescent_comparison(self, test_name, **kwargs):
        """
        Simulates replicates under the "hudson" and "dtwf" models and saves
        QQ plots comparing their summary statistics.

        For every replicate tree sequence the mean root time over its trees
        and the number of trees are recorded. One PNG per statistic is
        written to ``tmp__NOBACKUP__/<test_name>``.

        :param test_name: Name of the output sub-directory.
        :param kwargs: Arguments forwarded to ``msprime.simulate``; the
            ``model`` entry is overwritten for each model.
        """
        frames = []
        for model in ["hudson", "dtwf"]:
            kwargs["model"] = model
            print("Running: ", kwargs)
            replicates = msprime.simulate(**kwargs)
            data = collections.defaultdict(list)
            for ts in replicates:
                t_mrca = np.zeros(ts.num_trees)
                for tree in ts.trees():
                    t_mrca[tree.index] = tree.time(tree.root)
                data["tmrca_mean"].append(np.mean(t_mrca))
                data["num_trees"].append(ts.num_trees)
                data["model"].append(model)
            frames.append(pd.DataFrame(data))
        # DataFrame.append was removed in pandas 2.0; concatenate once.
        df = pd.concat(frames)

        basedir = os.path.join("tmp__NOBACKUP__", test_name)
        # makedirs also creates a missing parent directory and tolerates an
        # already existing target, unlike exists() followed by mkdir().
        os.makedirs(basedir, exist_ok=True)

        df_hudson = df[df.model == "hudson"]
        df_dtwf = df[df.model == "dtwf"]
        for stat in ["tmrca_mean", "num_trees"]:
            v1 = df_hudson[stat]
            v2 = df_dtwf[stat]
            # NOTE(review): this figure does not appear to be saved, since
            # savefig acts on the current figure — confirm whether the call
            # is still wanted.
            sm.graphics.qqplot(v1)
            sm.qqplot_2samples(v1, v2, line="45")
            f = os.path.join(basedir, "{}.png".format(stat))
            pyplot.savefig(f, dpi=72)
            pyplot.close('all')

示例#4

0

显示文件

文件： verification.py 项目： koelling/msprime

 def _plot_stats(self, key, stats_type, df_msp, df_ms):
     """Save one QQ plot per column comparing ``df_ms`` with ``df_msp``.

     Both frames must have the same set of columns. The output path for
     each column comes from ``self._build_filename``.
     """
     ms_columns = df_ms.columns.values
     assert set(ms_columns) == set(df_msp.columns.values)
     for column in ms_columns:
         ms_values, msp_values = df_ms[column], df_msp[column]
         sm.graphics.qqplot(ms_values)
         sm.qqplot_2samples(ms_values, msp_values, line="45")
         target = self._build_filename(key, stats_type, column)
         pyplot.savefig(target, dpi=72)
         # Drop every open figure before the next column is plotted.
         pyplot.close('all')

示例#5

0

显示文件

 def _plot_stats(self, key, stats_type, df_msp, df_ms):
     """Write a two-sample QQ plot for every statistic shared by both frames.

     Asserts that ``df_ms`` and ``df_msp`` expose the same column set, then
     plots each ``df_ms`` column against the matching ``df_msp`` column and
     saves it under the name returned by ``self._build_filename``.
     """
     stat_names = df_ms.columns.values
     assert set(stat_names) == set(df_msp.columns.values)
     for name in stat_names:
         from_ms = df_ms[name]
         from_msp = df_msp[name]
         sm.graphics.qqplot(from_ms)
         sm.qqplot_2samples(from_ms, from_msp, line="45")
         pyplot.savefig(self._build_filename(key, stats_type, name), dpi=72)
         pyplot.close('all')

示例#6

0

显示文件

def test_qqplot_unequal():
    """Check that ``qqplot_2samples`` gives the same plot for either argument
    order when the two samples have different sizes (100 and 200)."""
    rng = np.random.RandomState(0)
    short_sample = rng.standard_normal(100)
    long_sample = rng.standard_normal(200)

    forward = sm.qqplot_2samples(short_sample, long_sample)
    backward = sm.qqplot_2samples(long_sample, short_sample)

    forward_artists = forward.get_axes()[0].get_children()
    backward_artists = backward.get_axes()[0].get_children()

    # The first child of each axes carries the plotted data.
    x1, y1 = forward_artists[0].get_data()
    x2, y2 = backward_artists[0].get_data()
    np.testing.assert_allclose(x1, x2)
    np.testing.assert_allclose(y1, y2)

    # Both plots must also contain the same number of artists.
    assert len(forward_artists) == len(backward_artists)
示例#7

0

显示文件

文件： test_gofplots.py 项目： Inoryy/statsmodels

 def test_qqplot_2samples_ProbPlotObjects(self):
     """Smoke-test ``qqplot_2samples`` with the two stored probability-plot
     objects for every ``line`` option."""
     line_options = ('r', 'q', '45', 's')
     for option in line_options:
         sm.qqplot_2samples(self.prbplt, self.other_prbplot, line=option)
         # Close the figure before the next option is drawn.
         plt.close('all')

示例#8

0

显示文件

 def test_qqplot_2samples_ProbPlotObjects(self):
     """Run ``qqplot_2samples`` on the two stored probability-plot objects
     once per supported ``line`` value; only checks that nothing raises."""
     first, second = self.prbplt, self.other_prbplot
     for line_kind in ('r', 'q', '45', 's'):
         sm.qqplot_2samples(first, second, line=line_kind)

示例#9

0

显示文件

文件： qq_plot.py 项目： yemilawal/pyseer

def main():
    """Draw a QQ plot of observed against uniform-random -log10 p-values.

    Reads the ``lrt-pvalue`` column of the tab-separated file given by
    ``options.table`` and saves the figure to ``options.output``.
    """
    options = get_options()

    import sys
    import numpy as np
    import pandas as pd
    import statsmodels.api as sm
    import matplotlib.pyplot as plt

    column = 'lrt-pvalue'
    m = pd.read_csv(options.table, usecols=[column], sep='\t')[column]

    plt.figure(figsize=(4, 3.75))
    ax = plt.subplot(111)

    # Observed values, and an equally sized uniform sample as reference.
    y = -np.log10(m)
    x = -np.log10(np.random.uniform(0, 1, m.shape[0]))

    labels = {
        'xlabel': 'Expected $-log_{10}(pvalue)$',
        'ylabel': 'Observed $-log_{10}(pvalue)$',
    }
    fig = sm.qqplot_2samples(y, x, line='45', ax=ax, **labels)

    ax = fig.axes[0]
    # Restyle the first line artist: black and mostly transparent.
    first_line = ax.lines[0]
    first_line.set_color('k')
    first_line.set_alpha(0.3)

    margin = 0.5
    ax.set_xlim(-margin, x.max() + margin)
    ax.set_ylim(-margin, y.max() + margin)

    plt.tight_layout()

    plt.savefig(options.output, dpi=150)

示例#10

0

显示文件

文件： generate_dtwf_perf_data.py 项目： jeromekelleher/msprime-1.0-paper

def validate(replicates):
    """
    Compare the number of output trees from ARGON, msprime and the msprime
    hybrid simulation over ``replicates`` runs.

    Prints the mean tree count of each simulator and saves a QQ plot of
    the ARGON counts against the msprime counts to
    ``figures/verify_argon_v_msprime.png``.
    """
    # NOTE: ARGON consistently yields more trees. The qqplots show
    # distributions of about the same shape, but shifted towards ARGON.
    # The parameters have been checked as closely as possible, so there
    # may not be much more to do here. However, see the discussion in
    # https://github.com/tskit-dev/msprime-1.0-paper/pull/109
    # Once exported to a tree sequence with the trees squashed properly,
    # the distributions agree, so this is fine.
    megabases = 1
    num_samples = 10

    argon_counts = np.zeros(replicates)
    hybrid_counts = np.zeros(replicates)
    msprime_counts = np.zeros(replicates)

    with click.progressbar(range(replicates)) as progress:
        for rep in progress:
            argon_counts[rep] = sim_argon(
                num_samples, megabases, count_trees=True)
            hybrid_counts[rep] = sim_msprime_hybrid(num_samples, megabases)
            msprime_counts[rep] = sim_msprime(num_samples, megabases)

    summary = (
        "mean number of trees:",
        "argon=", np.mean(argon_counts),
        "msprime breakpoints=", np.mean(msprime_counts),
        "hybrid breakpoints=", np.mean(hybrid_counts),
    )
    print(*summary)

    sm.graphics.qqplot(argon_counts)
    sm.qqplot_2samples(argon_counts, msprime_counts, line="45")
    plt.xlabel("argon")
    plt.ylabel("msprime")
    plt.savefig("figures/verify_argon_v_msprime.png")

    plt.close("all")

示例#11

0

显示文件

文件： verification.py 项目： koelling/msprime

    def run_tbl_analytical_check(self):
        """
        Runs the check for the total branch length.

        For each sample size n in 2..14, the distributions obtained with
        ``self._ms_executable`` and ``self._mspms_executable`` are compared:
        a QQ plot is saved to ``qqplot_<n>.png`` and a density histogram of
        both, overlaid with the analytical values, to ``hist_<n>.png``
        under ``tmp__NOBACKUP__/analytical_tbl``.
        """
        R = 10000
        basedir = "tmp__NOBACKUP__/analytical_tbl"
        # makedirs also creates the missing parent directory and tolerates
        # an existing target; mkdir would fail on the nested path.
        os.makedirs(basedir, exist_ok=True)
        for n in range(2, 15):
            tbl_ms = self.get_tbl_distribution(n, R, self._ms_executable)
            tbl_msp = self.get_tbl_distribution(n, R, self._mspms_executable)

            sm.graphics.qqplot(tbl_ms)
            sm.qqplot_2samples(tbl_ms, tbl_msp, line="45")
            filename = os.path.join(basedir, "qqplot_{}.png".format(n))
            pyplot.savefig(filename, dpi=72)
            pyplot.close('all')

            # Reuse the ms bin edges so both histograms are comparable.
            hist_ms, bin_edges = np.histogram(tbl_ms, 20, density=True)
            hist_msp, _ = np.histogram(tbl_msp, bin_edges, density=True)

            index = bin_edges[:-1]
            # We don't seem to have the analytical value quite right here,
            # but since the value is so very close to ms's, there doesn't
            # seem to be much point in trying to fix it.
            analytical = [self.get_analytical_tbl(n, x * 2) for x in index]
            fig, _ = pyplot.subplots()
            bar_width = 0.15
            pyplot.bar(index,
                       hist_ms,
                       bar_width,
                       color='b',
                       label="ms")
            pyplot.bar(index + bar_width,
                       hist_msp,
                       bar_width,
                       color='r',
                       label="msp")
            pyplot.plot(index + bar_width, analytical, "o", color='k')
            pyplot.legend()
            # pyplot.xticks(index + bar_width, [str(j) for j in index])
            pyplot.tight_layout()
            filename = os.path.join(basedir, "hist_{}.png".format(n))
            pyplot.savefig(filename)
            # Release the histogram figure; the original left it open.
            pyplot.close(fig)

示例#12

0

显示文件

文件： verification.py 项目： koelling/msprime

    def run_pairwise_island_model(self):
        """
        Runs the check for the pairwise coalescence times for within
        and between populations.

        For each number of populations d in 2..5, the mean times from both
        executables are printed next to ``d / 2`` and
        ``(d + (d - 1) / M) / 2`` (presumably the analytical expectations),
        and QQ plots are saved as ``within_<d>.png`` and ``between_<d>.png``
        under ``tmp__NOBACKUP__/analytical_pairwise_island``.
        """
        R = 10000
        M = 0.2
        basedir = "tmp__NOBACKUP__/analytical_pairwise_island"
        # makedirs also creates the missing parent directory and tolerates
        # an existing target; mkdir would fail on the nested path.
        os.makedirs(basedir, exist_ok=True)

        for d in range(2, 6):
            # Both samples taken from the first population.
            cmd = "2 {} -T -I {} 2 {} {}".format(R, d, "0 " * (d - 1), M)
            T_w_ms = self.get_pairwise_coalescence_time(
                self._ms_executable + cmd.split() + self.get_ms_seeds(), R)
            T_w_msp = self.get_pairwise_coalescence_time(
                self._mspms_executable + cmd.split() + self.get_ms_seeds(), R)

            # One sample from each of the first two populations.
            cmd = "2 {} -T -I {} 1 1 {} {}".format(R, d, "0 " * (d - 2), M)
            T_b_ms = self.get_pairwise_coalescence_time(
                self._ms_executable + cmd.split() + self.get_ms_seeds(), R)
            T_b_msp = self.get_pairwise_coalescence_time(
                self._mspms_executable + cmd.split() + self.get_ms_seeds(), R)
            print(d,
                  np.mean(T_w_ms),
                  np.mean(T_w_msp),
                  d / 2,
                  np.mean(T_b_ms),
                  np.mean(T_b_msp), (d + (d - 1) / M) / 2,
                  sep="\t")

            sm.graphics.qqplot(T_w_ms)
            sm.qqplot_2samples(T_w_ms, T_w_msp, line="45")
            f = os.path.join(basedir, "within_{}.png".format(d))
            pyplot.savefig(f, dpi=72)
            pyplot.close('all')

            sm.graphics.qqplot(T_b_ms)
            sm.qqplot_2samples(T_b_ms, T_b_msp, line="45")
            f = os.path.join(basedir, "between_{}.png".format(d))
            pyplot.savefig(f, dpi=72)
            pyplot.close('all')

示例#13

0

显示文件

文件： verification.py 项目： terhorst/msprime

def run_verify_coalescent(n, m, Ne, r, models, num_replicates, output_prefix):
    """
    Runs ms and msprime on the specified parameters and outputs qqplots
    of the coalescent simulation summary statistics with the specified
    prefix.

    One file named ``<output_prefix>_<stat>.png`` is written for each of
    the statistics "t", "num_trees", "re_events" and "ca_events".
    """
    ms = MsCoalescentStatisticsSimulator(n, m, r, Ne, models)
    df_ms = ms.run(num_replicates)
    msp = MsprimeCoalescentStatisticsSimulator(n, m, r, Ne, models)
    df_msp = msp.run(num_replicates)
    for stat in ["t", "num_trees", "re_events", "ca_events"]:
        v1 = df_ms[stat]
        v2 = df_msp[stat]
        # pyplot.hist(v1, 20, alpha=0.5, label="ms")
        # pyplot.hist(v2, 20, alpha=0.5, label="msp")
        # pyplot.legend(loc="upper left")
        sm.graphics.qqplot(v1)
        sm.qqplot_2samples(v1, v2, line="45")
        f = "{0}_{1}.png".format(output_prefix, stat)
        pyplot.savefig(f, dpi=72)
        # Each plotting call above opens a figure; clf() only clears the
        # current one and leaves both open, so close them all instead.
        pyplot.close('all')

示例#14

0

显示文件

文件： verification.py 项目： terhorst/msprime

def run_verify_mutations(n, m, Ne, r, models, num_replicates, mutation_rate,
        output_prefix):
    """
    Runs ms and msprime for the specified parameters, and filters the results
    through Hudson's sample_stats program to get distributions of the
    haplotype statistics.

    One file named ``<output_prefix>_<stat>.png`` is written for each of
    the statistics "pi", "ss", "D", "thetaH" and "H".
    """
    ms = MsMutationStatisticsSimulator(n, m, r, Ne, models, mutation_rate)
    df_ms = ms.run(num_replicates)
    msp = MsprimeMutationStatisticsSimulator(n, m, r, Ne, models, mutation_rate)
    df_msp = msp.run(num_replicates)
    for stat in ["pi", "ss", "D", "thetaH", "H"]:
        v1 = df_ms[stat]
        v2 = df_msp[stat]
        # pyplot.hist(v1, 20, alpha=0.5, label="ms")
        # pyplot.hist(v2, 20, alpha=0.5, label="msp")
        # pyplot.legend(loc="upper left")
        sm.graphics.qqplot(v1)
        sm.qqplot_2samples(v1, v2, line="45")
        f = "{0}_{1}.png".format(output_prefix, stat)
        pyplot.savefig(f, dpi=72)
        # Each plotting call above opens a figure; clf() only clears the
        # current one and leaves both open, so close them all instead.
        pyplot.close('all')

示例#15

0

显示文件

    def run_tbl_analytical_check(self):
        """
        Runs the check for the total branch length.

        For each sample size n in 2..14, the distributions obtained with
        ``self._ms_executable`` and ``self._mspms_executable`` are compared:
        a QQ plot is saved to ``qqplot_<n>.png`` and a density histogram of
        both, overlaid with the analytical values, to ``hist_<n>.png``
        under ``tmp__NOBACKUP__/analytical_tbl``.
        """
        R = 10000
        basedir = "tmp__NOBACKUP__/analytical_tbl"
        # makedirs also creates the missing parent directory and tolerates
        # an existing target; mkdir would fail on the nested path.
        os.makedirs(basedir, exist_ok=True)
        for n in range(2, 15):
            tbl_ms = self.get_tbl_distribution(n, R, self._ms_executable)
            tbl_msp = self.get_tbl_distribution(n, R, self._mspms_executable)

            sm.graphics.qqplot(tbl_ms)
            sm.qqplot_2samples(tbl_ms, tbl_msp, line="45")
            filename = os.path.join(basedir, "qqplot_{}.png".format(n))
            pyplot.savefig(filename, dpi=72)
            pyplot.close('all')

            # Reuse the ms bin edges so both histograms are comparable.
            hist_ms, bin_edges = np.histogram(tbl_ms, 20, density=True)
            hist_msp, _ = np.histogram(tbl_msp, bin_edges, density=True)

            index = bin_edges[:-1]
            # We don't seem to have the analytical value quite right here,
            # but since the value is so very close to ms's, there doesn't
            # seem to be much point in trying to fix it.
            analytical = [self.get_analytical_tbl(n, x * 2) for x in index]
            fig, _ = pyplot.subplots()
            bar_width = 0.15
            pyplot.bar(
                index, hist_ms, bar_width, color='b', label="ms")
            pyplot.bar(
                index + bar_width, hist_msp, bar_width, color='r', label="msp")
            pyplot.plot(index + bar_width, analytical, "o", color='k')
            pyplot.legend()
            # pyplot.xticks(index + bar_width, [str(j) for j in index])
            pyplot.tight_layout()
            filename = os.path.join(basedir, "hist_{}.png".format(n))
            pyplot.savefig(filename)
            # Release the histogram figure; the original left it open.
            pyplot.close(fig)

示例#16

0

显示文件

    def run_pairwise_island_model(self):
        """
        Runs the check for the pairwise coalescence times for within
        and between populations.

        For each number of populations d in 2..5, the mean times from both
        executables are printed next to ``d / 2`` and
        ``(d + (d - 1) / M) / 2`` (presumably the analytical expectations),
        and QQ plots are saved as ``within_<d>.png`` and ``between_<d>.png``
        under ``tmp__NOBACKUP__/analytical_pairwise_island``.
        """
        R = 10000
        M = 0.2
        basedir = "tmp__NOBACKUP__/analytical_pairwise_island"
        # makedirs also creates the missing parent directory and tolerates
        # an existing target; mkdir would fail on the nested path.
        os.makedirs(basedir, exist_ok=True)

        for d in range(2, 6):
            # Both samples taken from the first population.
            cmd = "2 {} -T -I {} 2 {} {}".format(R, d, "0 " * (d - 1), M)
            T_w_ms = self.get_pairwise_coalescence_time(
                self._ms_executable + cmd.split() + self.get_ms_seeds(), R)
            T_w_msp = self.get_pairwise_coalescence_time(
                self._mspms_executable + cmd.split() + self.get_ms_seeds(), R)

            # One sample from each of the first two populations.
            cmd = "2 {} -T -I {} 1 1 {} {}".format(R, d, "0 " * (d - 2), M)
            T_b_ms = self.get_pairwise_coalescence_time(
                self._ms_executable + cmd.split() + self.get_ms_seeds(), R)
            T_b_msp = self.get_pairwise_coalescence_time(
                self._mspms_executable + cmd.split() + self.get_ms_seeds(), R)
            print(d, np.mean(T_w_ms), np.mean(T_w_msp), d / 2,
                    np.mean(T_b_ms), np.mean(T_b_msp), (d + (d - 1) / M) / 2,
                    sep="\t")

            # Same plotting steps for the within and between comparisons.
            comparisons = (
                ("within", T_w_ms, T_w_msp),
                ("between", T_b_ms, T_b_msp),
            )
            for label, times_ms, times_msp in comparisons:
                sm.graphics.qqplot(times_ms)
                sm.qqplot_2samples(times_ms, times_msp, line="45")
                f = os.path.join(basedir, "{}_{}.png".format(label, d))
                pyplot.savefig(f, dpi=72)
                pyplot.close('all')

示例#17

0

显示文件

文件： test_gofplots.py 项目： zhisheng/statsmodels

def test_qqplot_2samples_arrays():
    """Smoke test: ``qqplot_2samples`` accepts two plain arrays for every
    supported ``line`` value. Nothing about the figures is asserted."""
    normal_kwargs = dict(loc=8.25, scale=3.25, size=37)
    x = np.random.normal(**normal_kwargs)
    y = np.random.normal(**normal_kwargs)

    # Constructing the probability plots is part of the smoke test.
    sm.ProbPlot(x)
    sm.ProbPlot(y)

    for line_kind in ('r', 'q', '45', 's'):
        sm.qqplot_2samples(x, y, line=line_kind)

    plt.close('all')

示例#18

0

显示文件

文件： test_bayes_ratematrix.py 项目： back2mars/msmbuilder

def test_1():
    """Write a QQ plot comparing softmax-transformed HMC samples with draws
    from ``np.random.dirichlet``.

    The log-density passed to ``hmc`` comes from ``ldirichlet_softmax``.
    Only the first component of each sample is compared. The test is
    skipped when matplotlib is not importable.
    """
    try:
        from matplotlib import pyplot as pp
    except ImportError:
        raise SkipTest('Not making QQ plot')

    sample_count = 1000
    dirichlet_alpha = np.array([1, 2, 3], dtype=float)

    def logprob(x, alpha):
        # Hand a zeroed buffer to ldirichlet_softmax for the gradient and
        # return it together with the log-probability.
        grad_buffer = np.zeros_like(x)
        log_density = ldirichlet_softmax(x, alpha, grad=grad_buffer)
        return log_density, grad_buffer

    initial_point = np.random.normal(size=(3,))
    samples, diag = hmc(
        logprob,
        x0=initial_point,
        n_samples=sample_count,
        args=(dirichlet_alpha,),
        n_steps=10,
        return_diagnostics=True,
    )

    # Softmax along axis 1.
    exponentiated = np.exp(samples)
    row_totals = np.sum(exponentiated, 1, keepdims=True)
    pi1 = exponentiated / row_totals
    pi2 = np.random.dirichlet(alpha=dirichlet_alpha, size=sample_count)

    sm.qqplot_2samples(pi1[:, 0], pi2[:, 0], line='45')
    pp.savefig('bayes_ratematrix-test-1.png')

示例#19

0

显示文件

文件： test_gofplots.py 项目： dengemann/statsmodels

def test_qqplot_2samples_arrays():
    """Check only that ``qqplot_2samples`` runs on two arrays of 37 normal
    draws for each of the ``line`` values 'r', 'q', '45' and 's'."""
    first_sample = np.random.normal(loc=8.25, scale=3.25, size=37)
    second_sample = np.random.normal(loc=8.25, scale=3.25, size=37)

    # The probability plots are built but not used any further.
    for sample in (first_sample, second_sample):
        sm.ProbPlot(sample)

    line_values = ['r', 'q', '45', 's']
    for value in line_values:
        sm.qqplot_2samples(first_sample, second_sample, line=value)

    plt.close('all')

示例#20

0

显示文件

文件： algorithms.py 项目： jeromekelleher/msprime-paper

def run_verify(args):
    """
    Compare the recombination event counts of the local ``Simulator``
    with those of ``msprime.TreeSimulator`` and save a QQ plot of the two
    to ``args.outfile``. Replicate ``j`` uses ``j`` as the seed for both.
    """
    sample_size = args.sample_size
    num_loci = args.num_loci
    recomb_rate = args.recombination_rate
    num_replicates = args.num_replicates

    msp_events = np.zeros(num_replicates)
    local_events = np.zeros(num_replicates)
    for rep in range(num_replicates):
        # Local simulator, seeded through the random module.
        random.seed(rep)
        local_sim = Simulator(sample_size, num_loci, recomb_rate, 10000)
        local_sim.simulate()
        local_events[rep] = local_sim.num_re_events

        # msprime simulator, seeded explicitly with the same value.
        msp_sim = msprime.TreeSimulator(sample_size)
        msp_sim.set_num_loci(num_loci)
        msp_sim.set_scaled_recombination_rate(recomb_rate)
        msp_sim.set_random_seed(rep)
        msp_sim.run()
        msp_events[rep] = msp_sim.get_num_recombination_events()

    sm.graphics.qqplot(local_events)
    sm.qqplot_2samples(local_events, msp_events, line="45")
    pyplot.savefig(args.outfile, dpi=72)

示例#21

0

显示文件

def run_verify(args):
    """
    Checks that the local simulator and msprime give the same distribution
    of recombination event counts, by saving a QQ plot of the per-replicate
    counts to ``args.outfile``.
    """
    n, m, rho = args.sample_size, args.num_loci, args.recombination_rate
    replicate_count = args.num_replicates
    msp_events = np.zeros(replicate_count)
    local_events = np.zeros(replicate_count)

    for seed in range(replicate_count):
        random.seed(seed)
        ours = Simulator(n, m, rho, 10000)
        ours.simulate()
        local_events[seed] = ours.num_re_events

        theirs = msprime.TreeSimulator(n)
        theirs.set_num_loci(m)
        theirs.set_scaled_recombination_rate(rho)
        theirs.set_random_seed(seed)
        theirs.run()
        msp_events[seed] = theirs.get_num_recombination_events()

    sm.graphics.qqplot(local_events)
    sm.qqplot_2samples(local_events, msp_events, line="45")
    pyplot.savefig(args.outfile, dpi=72)

示例#22

0

显示文件

                verbose_eval=2000)

# Predict the validation set with the best iteration of the booster.
preds = gbm.predict(X_val, num_iteration=gbm.best_iteration)

print('validation smape: ', smape(y_val, preds))
print('validation mae: ', mean_absolute_error(y_val, preds))

# investigating the distribution of the error
error = y_val.values - preds

plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
# Property names must be lower case: current matplotlib no longer
# normalises upper-case keyword arguments such as EDGECOLOR.
plt.hist(error, edgecolor='black', color='y')

# comparing the distribution of the prediction and the actual values
sm.qqplot_2samples(y_val.values, preds, line='45', ax=plt.subplot(1, 2, 2))
plt.show()

# exploring the feature importance
lgb.plot_importance(gbm, height=0.6)
plt.show()

# predicting sale values for year 2018
# Every column except 'sales' is a feature.
X_train = training_df.loc[:, [
    col for col in training_df.columns if col not in ['sales']
]].values
y_train = training_df['sales'].values
X_test = testing_df.loc[:, [
    col for col in testing_df.columns if col not in ['sales']
]]
lgb_train = lgb.Dataset(X_train, y_train)

示例#23

0

显示文件

文件： exo_16.py 项目： abenc/simulation-lois-proba

k_gam
#from scipy.special import gamma as Gamma
#
#def f1(x):
#    return Gamma(x)
#eqn3 = Eq( med/(np.log(2)**(1/k))*f1((1+1/k))-esp )
k_wei = float(0.4758754)
# Weibull scale derived from the median `med` and the shape k_wei.
lambda_wei = med/(np.log(2))**(1/k_wei)
lambda_wei

vec=np.sort(vec)

# Log(vec) -> Normal
norm = np.array([np.random.normal(loc=esp,scale=sigma) for x in range(len(vec))])
# `normed` was removed from matplotlib's hist; `density` is its replacement.
count, bins, _ = plt.hist(norm, 30, density=True)
sm.qqplot_2samples(norm,np.log(vec),line='r').suptitle('Log(echantillon) -> Normal (KS_dist = 0.7)', fontsize=20)
np.corrcoef(np.sort(norm),np.log(vec))
stats.ks_2samp(np.sort(norm),np.log(vec))
#Ks_2sampResult(statistic=0.69090909090909092, pvalue=1.8946637774700268e-12)

# vec -> LogNormal
lognorm = np.random.lognormal(mean=param_logn_u,sigma=param_logn_sigma**(1/2),size=len(vec))
# `normed` was removed from matplotlib's hist; `density` is its replacement.
count,bins,_ = plt.hist(lognorm,30,density=True)
sm.qqplot_2samples(lognorm,vec,line='r').suptitle('echantillon -> LogNorm (KS_dist = 0.14)', fontsize=20)
np.corrcoef(np.sort(lognorm),vec)
stats.ks_2samp(np.sort(lognorm),vec)
#Ks_2sampResult(statistic=0.145, pvalue=0.00784630338162055)


# vec -> exp
exp = np.random.exponential(scale=esp,size=len(vec))

示例#24

0

显示文件

文件： probabilistic_extension.py 项目： NunoEdgarGFlowHub/climada_papers

plt.xscale('log')
#plt.title('power {} and scale {}'.format(power_final,scale_final))
plt.xlabel('Returnperiod [years]')
plt.ylabel('Storm Severity Index []')

#plot cdfs to get ks test statistic
cdf_gev_i = scipy.stats.genextreme.cdf(ssi_quantiles_prob,
                                       params_final_gev[0],
                                       loc=params_final_gev[1],
                                       scale=params_final_gev[2])
plt.figure()
plt.plot(ssi_quantiles_prob, cdf_gev_i, '.k')
#        plt.plot(ssi_quantiles_distribution,(1-1/return_periods)**5,'.b')
cdf_wisc_prob_i = (1 - 1 / return_period_prob)**block_size_years
plt.plot(ssi_quantiles_prob, cdf_wisc_prob_i, '.b')
# `^` is bitwise XOR in Python (10 ^ 10 == 0); the power operator is `**`.
# NOTE(review): assumes the intended lower limit is 1e10 — confirm.
plt.xlim(xmin=10 ** 10)
#plt.title('power {} and scale {}'.format(power_final,scale_final))
plt.ylabel('cumulative distribution function')
plt.xlabel('Storm Severity Index []')

# show qq plot of historic and probabilistic ssis
sm.qqplot_2samples(
    wisc_hist.ssi,
    wisc_prob_CH.ssi_full_area,
    line='45',
)
#                   xlabel='"WISC historic" pan-European SSI',
#                   ylabel='"WISC probabilistic extension" pan-European SSI'
# NOTE(review): which sample lands on which axis depends on the statsmodels
# version — confirm these labels against the installed release.
plt.xlabel('"WISC probabilistic extension" pan-European SSI')
plt.ylabel('"WISC historic" pan-European SSI')

示例#25

0

显示文件

 def test_qqplot_2samples_arrays(self):
     """Smoke-test ``qqplot_2samples`` on ``self.res`` and
     ``self.other_array`` for every ``line`` option."""
     for line_option in ('r', 'q', '45', 's'):
         sm.qqplot_2samples(self.res, self.other_array, line=line_option)
         # Release the figure before the next option is drawn.
         plt.close('all')

示例#26

0

显示文件

文件： test_gofplots.py 项目： Inoryy/statsmodels

 def test_qqplot_2samples_arrays(self):
     """Run ``qqplot_2samples`` with array inputs once per supported
     ``line`` value, closing all figures after each call."""
     sample, other = self.res, self.other_array
     supported_lines = ['r', 'q', '45', 's']
     for kind in supported_lines:
         sm.qqplot_2samples(sample, other, line=kind)
         plt.close('all')

示例#27

0

显示文件

    # Per-column summary statistics for the column at position i.
    print('Number of non-NA/null observations for:', labels[i],
          data.count()[i])
    print('Maximum value for:', labels[i], data.max()[i])
    # The minimum line previously printed data.max() by mistake.
    print('Minimum value for:', labels[i], '', data.min()[i])
    print('Mean for:', labels[i], data.mean()[i])
    print('Standard deviation for', labels[i], data.std()[i])
    print('Kurtosis for:', labels[i], data.kurt()[i])
    # Interquartile range: 75th percentile minus 25th percentile.
    print('IQR for:', labels[i],
          data.quantile(0.75)[i] - data.quantile(0.25)[i])
    print('')
# Covariance matrix of the columns of `data`.
print('Covariance matrix')
print(data.cov())

# 2.3 Plot of two variables (SL & SW), coloured by CLASS.
# NOTE(review): drawn with seaborn's catplot (a categorical plot) rather
# than a scatter plot — confirm this is the intended plot type.
sns.catplot(x="SL", y="SW", data=data, hue='CLASS')

# 2.4 Q-Q plot of two variables, using statsmodels.
# Both columns come from the same frame, so the sample sizes are equal.
# NOTE(review): the heading originally named SL & SW, but the call below
# compares 'SL' with 'PW' — confirm which pair is intended.
sm.qqplot_2samples(data['SL'], data['PW'])

# 2.5, 2.6
# Scatter plot matrix of all variables, coloured by CLASS.
sns.pairplot(data, hue='CLASS')

# 2.7 Apply multidimensional scaling (MDS) to project the d-dimensional
# data into 2-d. Only the first four columns and the first 100 rows are
# used.
X = data.iloc[:, :4]
embedding = MDS(n_components=2)
x_transformes = embedding.fit_transform(X[:100])
# Bare expression: only displays the shape in an interactive session.
x_transformes.shape
plt.scatter(x_transformes[:, 0], x_transformes[:, 1])

示例#28

0

显示文件

# For each input file: score the attributes with the relief algorithm, pick
# the two highest-scoring attributes, min-max normalise them, and save a
# scatter plot and a two-sample QQ plot.
for i, fname in enumerate(fnames):
    finalData = pd.read_csv(fname + ".csv")
    scores = reliefAlgorithm(finalData, shifts[i], 1000)
    scores = scores.sort_values()
    # Most important attribute.
    first = scores.idxmax()
    # Mask the winner so the next idxmax yields the runner-up.
    # sys.maxint no longer exists on Python 3; sys.maxsize is available on
    # both Python 2.6+ and Python 3.
    scores[first] = -sys.maxsize - 1
    # Second most important attribute.
    second = scores.idxmax()
    X = finalData[first]
    Y = finalData[second]
    # Min-max normalise both attributes to [0, 1].
    X = (X - X.min()) / (X.max() - X.min())
    Y = (Y - Y.min()) / (Y.max() - Y.min())
    plt.xlabel(first)
    plt.ylabel(second)
    plt.scatter(X, Y)
    plt.savefig(fname + '.png', bbox_inches='tight')
    plt.clf()

    # Quantile comparison of the two normalised attributes.
    pp_x = sm.ProbPlot(X)
    pp_y = sm.ProbPlot(Y)
    ppp = sm.qqplot_2samples(pp_x, pp_y)
    plt.savefig(fname + '_quantiles.png', bbox_inches='tight')
    plt.clf()
    #plt.show()

示例#29

0

显示文件

    Used in QQ plot, normalize all the data first.
    """
    max_num = max(x)
    min_num = min(x)
    inter = max_num - min_num
    return [(data - min_num) / inter for data in x]


import statsmodels.api as sm

# Normalise both variables with normalize() before comparing them.
age_norm = normalize(age)
fat_norm = normalize(fat)

# Two-sample QQ plot of the normalised values with a 45-degree reference
# line; the inputs are converted to arrays first.
sm.qqplot_2samples(np.asarray(age_norm),
                   np.asarray(fat_norm),
                   xlabel='age',
                   ylabel='fat',
                   line='45')
plt.show()


# -------------------------------
# For Q6 :)
def cosine_similarity(x, y):
    x = np.asarray(x)
    y = np.asarray(y)

    numerator = np.dot(x, y)
    sqrt_x = np.sqrt(sum(x**2))
    sqrt_y = np.sqrt(sum(y**2))

示例#30

0

显示文件

文件： biogrid_read.py 项目： brandon-jernigan/Research-Evolutionary-Rate-Covariation-UA-2015-2017

        pass
    
# Keep only the entries of intact_dict whose count in ID_dict reaches
# MIN_PUBLICATION_NUM.
intact_more_dict ={}

for key, item in ID_dict.items():
    if item >= MIN_PUBLICATION_NUM:
        intact_more_dict[key] = intact_dict[key]

# Plots to see how the ERC value changes with interacting proteins.
list_values = [ v for v in intact_more_dict.values() ]
# Draw 1000 values with replacement from the retained entries.
int_choices = np.random.choice(list_values, 1000)
# df_choices is defined earlier in the file, outside this excerpt.
plt.hist(df_choices)
plt.title("All ERC\n Mean = %.4f" % np.mean(df_choices))
plt.show()
print("ERC mean for all: %.4f" % np.mean(df_choices))
plt.hist(int_choices)
plt.title("Interacting ERC\n Mean = %.4f" % np.mean(int_choices))
plt.show()
print("ERC mean for interacting: %.4f" % np.mean(int_choices))

# QQ plot of all ERC values against the interacting ones, with a manually
# drawn red diagonal on equal-aspect axes limited to [-1, 1].
sm.qqplot_2samples(np.asarray(df_choices), np.asarray(int_choices),xlabel="All ERC", ylabel="Interacting ERC")
# NOTE(review): plt.plot() with no arguments draws nothing — confirm it
# can be dropped.
plt.plot()
plt.xlim(-1, 1)
plt.ylim(-1, 1)
plt.plot( [-1,1],[-1,1] , 'r')
plt.gca().set_aspect('equal', adjustable='box')
plt.draw()


# Write the frame out as a tab-separated file.
df_biogrid_acms.to_csv("ACMS_BIOGRID-ORGANISM-Saccharomyces_cerevisiae_S288c-3.4.146.tab2.txt", sep = '\t')

示例#31

0

显示文件

文件： test_gofplots.py 项目： ChadFulton/statsmodels

 def test_qqplot_2samples_arrays(self, close_figures):
     """Smoke-test ``qqplot_2samples`` with array inputs for each ``line``
     value. ``close_figures`` is not used in the body (presumably a
     fixture that cleans up figures — confirm)."""
     line_values = ('r', 'q', '45', 's')
     for value in line_values:
         sm.qqplot_2samples(self.res, self.other_array, line=value)

示例#32

0

显示文件

文件： gptest.py 项目： HEPonHPC/apprentice

def predict(GP,multitestfiles,RAFOLD,OUTDIR):
    """Compare GP and RA predictions against MC test data and write reports.

    The function reads one test CSV, obtains mean/sd predictions from ``GP``
    and from a rational-approximation (RA) source, computes error metrics and
    distribution-comparison statistics for both, and writes a CSV of the GP
    predictions, a JSON file of metrics, and one Q-Q plot PDF per test point
    under ``OUTDIR``.

    Parameters
    ----------
    GP : object
        Must provide ``printRAmetrics``, ``bestparamfile``, ``obsname``,
        ``predictHeteroscedastic``/``predictHomoscedastic`` and
        ``approxmeancountval``/``errapproxmeancountval`` (all used below).
    multitestfiles : sequence of str
        Paths of header-less CSV test files; more than one raises.
    RAFOLD
        Passed through unchanged to ``GP.printRAmetrics``.
    OUTDIR : str
        Output directory; created if missing.

    Raises
    ------
    Exception
        If more than one test file is supplied.

    The process is terminated (``sys.exit``/``exit``) when the parameter file
    has no ``buildtype`` key or when an RA ``AppSet`` cannot be built for
    ``GP.obsname``. No value is returned in the visible code.
    """
    # Sample size for the distribution-comparison draws and the default seed
    # handed to the comparison functions.
    Nsample = 30
    seed = 326323
    allMC = []
    allDeltaMC = []
    X = None

    # dc_keys and dc_fns are paired by position (dc_fns[kno] is used for
    # dc_keys[kno] further down).
    dc_keys = ['ks2','ks','ad2','ad','kl']
    dc_fns = [computeKS2Sample,computeKS,computeAD2Sample,computeAD,computeKLdivergence]
    distrCompare = {}
    for key in dc_keys:
        distrCompare[key] = {}

    if len(multitestfiles) >1 : raise Exception("Multiple test files not compatible")

    # Column layout used here: everything but the last two columns is the
    # parameter matrix X, the second-to-last column is the MC value and the
    # last column is its uncertainty.
    for fno, file in enumerate(multitestfiles):
        dataperfile = pd.read_csv(file, header=None)
        Dperfile = dataperfile.values
        if fno == 0: X = Dperfile[:, :-2]
        allMC.append(Dperfile[:, -2].tolist())
        allDeltaMC.append(Dperfile[:, -1].tolist())

    # The return value is later tested against None to choose the RA source.
    bestparamfileForRA = GP.printRAmetrics(RAFOLD)
    print("\n\n\n\n")

    # RESULTS
    with open(GP.bestparamfile, 'r') as f:
        ds = json.load(f)

    if 'buildtype' in ds:
        buildtype = ds['buildtype']
    else:
        print("Buildtype not in ds not implemented")
        sys.exit(1)

    # Any buildtype other than "gp" uses the heteroscedastic predictor.
    if buildtype != "gp":
        Ymean, Ysd = GP.predictHeteroscedastic(X)
    else:
        Ymean, Ysd = GP.predictHomoscedastic(X)

    allchi2metric = []
    allmeanmsemetric = []
    allsdmsemetric = []
    # for j, (mu, sd) in enumerate(zip(Ymean, Ysd)):
    #     MCatp = [allMC[i][j] for i in range(len(allMC))]
    #     allchi2metric.append(((mu - np.mean(MCatp)) / sd) ** 2)
    #     allmeanmsemetric.append((mu - np.mean(MCatp)) ** 2)
    #     allsdmsemetric.append((sd - np.std(MCatp)) ** 2)


    # Per-point metrics of the GP prediction against the single test file:
    # squared standardized residual, squared mean error, squared sd error.
    for j, (mu, sd) in enumerate(zip(Ymean, Ysd)):
        MCatp = allMC[0][j]
        DeltaMCatp = allDeltaMC[0][j]
        allchi2metric.append(((mu - MCatp) / sd) ** 2)
        allmeanmsemetric.append((mu - MCatp) ** 2)
        allsdmsemetric.append((sd - DeltaMCatp) ** 2)


    # Average each metric over all test points.
    chi2metric = np.mean(allchi2metric)
    meanmsemetric = np.mean(allmeanmsemetric)
    sdmsemetric = np.mean(allsdmsemetric)

    print("#########################")
    # For every comparison statistic, draw Nsample normal variates around each
    # MC point and compare them with the predicted (mu, sd).
    for kno,key in enumerate(dc_keys):
        # NOTE(review): the next two assignments are identical; the second is
        # redundant.
        distrCompare[key]['MCvs{}'.format(buildtype)] = []
        distrCompare[key]['MCvs{}'.format(buildtype)] = []
        for j,(mu,sd) in enumerate(zip(Ymean,Ysd)):
            MCatp = allMC[0][j]
            DeltaMCatp = allDeltaMC[0][j]
            data = np.random.normal(MCatp, DeltaMCatp, Nsample)
            distrCompare[key]['MCvs{}'.format(buildtype)].append(
                dc_fns[kno](data,mu,sd,seed)
            )
    print("#########################\n\n")

    print("################ RESULTS START HERE")
    # The same parameter file is loaded a second time here.
    with open(GP.bestparamfile, 'r') as f:
        ds = json.load(f)
    bestkernel = ds['kernel']
    print("Best Kernel is {}".format(bestkernel))
    print("with meanmsemetric %.2E" % (meanmsemetric))
    print("with sdmsemetric %.2E" % (sdmsemetric))
    print("with chi2metric %.2E" % (chi2metric))

    ############################################
    # print(X)
    # print(Ymean)
    # print(Ysd)
    # Dump the parameters with the predicted mean and sd as extra columns.
    os.makedirs(OUTDIR, exist_ok=True)
    datatdump = np.column_stack((X, Ymean, Ysd))
    np.savetxt(os.path.join(OUTDIR, "{}.csv".format(ds["obsname"])), datatdump, delimiter=',')
    ############################################

    if bestparamfileForRA is not None:
        import apprentice
        OUTDIRRA = os.path.dirname(bestparamfileForRA)
        # NOTE(review): this rebinds both `ds` and `seed`; every later use of
        # ds["obsname"] and of `seed` sees the RA file's values on this branch.
        with open(bestparamfileForRA, 'r') as f:
            ds = json.load(f)
        seed = ds['seed']
        Moutfile = os.path.join(OUTDIRRA, 'RA', "{}_MCRA_S{}.json".format(GP.obsname.replace('/', '_'),
                                                                        seed))
        DeltaMoutfile = os.path.join(OUTDIRRA, 'RA', "{}_DeltaMCRA_S{}.json".format(GP.obsname.replace('/', '_'),
                                                                        seed))

        # Each AppSet must contain exactly the one bin named GP.obsname.
        meanappset = apprentice.appset.AppSet(Moutfile, binids=[GP.obsname])
        if len(meanappset._binids) != 1 or \
                meanappset._binids[0] != GP.obsname:
            print("Something went wrong.\n"
                  "RA Fold Mean function could not be created.")
            # NOTE(review): bare exit() here, while sys.exit() is used above.
            exit(1)
        meanerrappset = apprentice.appset.AppSet(DeltaMoutfile, binids=[GP.obsname])
        if len(meanerrappset._binids) != 1 or \
                meanerrappset._binids[0] != GP.obsname:
            print("Something went wrong.\n"
                  "RA Fold Error mean function could not be created.")
            exit(1)

        # Evaluate the RA mean and RA error at every test point.
        Mte = np.array([meanappset.vals(x)[0] for x in X])
        DeltaMte = np.array([meanerrappset.vals(x)[0] for x in X])

    else:
        # No RA parameter file: use the approximations exposed by GP itself.
        Mte = np.array([GP.approxmeancountval(x) for x in X])
        DeltaMte = np.array([GP.errapproxmeancountval(x) for x in X])

    allchi2metricRA = []
    allmeanmsemetricRA = []
    allsdmsemetricRA = []
    # for j, (mu, sd) in enumerate(zip(Mte, DeltaMte)):
    #     MCatp = [allMC[i][j] for i in range(len(allMC))]
    #     allchi2metricRA.append(((mu - np.mean(MCatp)) / sd) ** 2)
    #     allmeanmsemetricRA.append((mu - np.mean(MCatp)) ** 2)
    #     allsdmsemetricRA.append((sd - np.std(MCatp)) ** 2)

    # Same three per-point metrics as above, now for the RA prediction.
    for j, (mu, sd) in enumerate(zip(Mte, DeltaMte)):
        MCatp = allMC[0][j]
        DeltaMCatp = allDeltaMC[0][j]
        allchi2metricRA.append(((mu - MCatp) / sd) ** 2)
        allmeanmsemetricRA.append((mu - MCatp) ** 2)
        allsdmsemetricRA.append((sd - DeltaMCatp) ** 2)
    chi2metricRA = np.mean(allchi2metricRA)
    meanmsemetricRA = np.mean(allmeanmsemetricRA)
    sdmsemetricRA = np.mean(allsdmsemetricRA)

    print("RAMEAN (meanmsemetric_RA) is %.2E" % (meanmsemetricRA))
    print("RAMEAN (sdmsemetric_RA) is %.2E" % (sdmsemetricRA))
    print("RAMEAN (chi2metric_RA) is %.2E" % (chi2metricRA))

    print("\n\n#########################")
    # Distribution comparisons of MC draws against the RA prediction.
    for kno,key in enumerate(dc_keys):
        # NOTE(review): duplicated assignment, as in the GP loop above.
        distrCompare[key]['MCvsRA'] = []
        distrCompare[key]['MCvsRA'] = []
        for j, (mu, sd) in enumerate(zip(Mte, DeltaMte)):
            MCatp = allMC[0][j]
            DeltaMCatp = allDeltaMC[0][j]
            data = np.random.normal(MCatp, DeltaMCatp, Nsample)
            distrCompare[key]['MCvsRA'].append(
                dc_fns[kno](data,mu,sd,seed)
            )
    print("#########################")

    # np.random.seed(seed)
    # distrCompare['ks']['RAvs{}'.format(buildtype)] = \
    #     [computeKSstatistic(np.random.normal(mu1, sd1, Nsample),
    #                         np.random.normal(mu2, sd2, Nsample))
    #                         for (mu1, mu2, sd1, sd2) in
    #                         zip(Mte, Ymean,DeltaMte,Ysd)]
    # np.random.seed(seed)
    # distrCompare['kl']['RAvs{}'.format(buildtype)] = \
    #     [computeKLdivergence(np.random.normal(mu1, sd1, Nsample),
    #                         np.random.normal(mu2, sd2, Nsample))
    #                          for (mu1, mu2, sd1, sd2) in
    #                          zip(Mte, Ymean, DeltaMte, Ysd)]

    ############################################
    # Print best metrics into a json file
    ############################################
    bestmetricdata = {
        'RA':{
            'meanmsemetric' : meanmsemetricRA,
            'chi2metric': chi2metricRA,
            'sdmsemetric': sdmsemetricRA
        },
        buildtype:{
            'meanmsemetric': meanmsemetric,
            'chi2metric': chi2metric,
            'sdmsemetric': sdmsemetric,
            'bestkernel':bestkernel
        },
        'distrCompare': distrCompare,
        "Nsample":Nsample

    }
    # NOTE(review): `ds` may be the RA parameter file at this point (see the
    # branch above), so "obsname" is read from whichever file was loaded last.
    bestmetricfile = os.path.join(OUTDIR,"{}_bestmetrics.json".format(ds["obsname"]))
    with open(bestmetricfile, 'w') as f:
        json.dump(bestmetricdata, f, indent=4)
    ############################################

    import scipy.stats as stats
    import statsmodels.api as sm
    import matplotlib.pyplot as plt
    plotoutdir = os.path.join(OUTDIR,'plots','QQplot')
    os.makedirs(plotoutdir,exist_ok=True)
    # One Q-Q figure per test point: 1000 normal draws for MC, RA and GP,
    # with RA and GP each plotted against the MC draws on the same axes.
    for j, (gpmu, gpsd, ramu, rasd) in enumerate(zip(Ymean, Ysd, Mte, DeltaMte)):
        MCatp = allMC[0][j]
        DeltaMCatp = allDeltaMC[0][j]
        MCdata = np.random.normal(MCatp, DeltaMCatp, 1000)
        RAdata = np.random.normal(ramu, rasd, 1000)
        GPdata = np.random.normal(gpmu, gpsd, 1000)
        fig = plt.figure()
        # NOTE(review): recent Matplotlib releases renamed the 'seaborn'
        # styles (e.g. 'seaborn-v0_8') -- confirm against the pinned version.
        plt.style.use('seaborn')
        ax = fig.add_subplot(1, 1, 1)
        sm.qqplot_2samples(MCdata, RAdata, line='45',ax=ax)
        ax.get_lines()[0].set_markerfacecolor('blue')
        ax.get_lines()[0].set_label('RA')
        sm.qqplot_2samples(MCdata, GPdata, line='45',ax=ax)
        # Index 2 presumably is the marker series of the second call, i.e.
        # each call is assumed to add two lines (points + reference line) --
        # TODO confirm against the statsmodels version in use.
        ax.get_lines()[2].set_markerfacecolor('green')
        ax.get_lines()[2].set_label('GP')

        ax.set_xlabel('MC')
        ax.set_ylabel('')

        plt.legend(loc='best')
        fig.tight_layout()
        plotfilename = os.path.join(plotoutdir, "qqplot_{}.pdf".format(j))
        plt.savefig(plotfilename)
        # plt.show()
        plt.close('all')

示例#33

0

显示文件

文件： generate_gc_perf_data.py 项目： fbaumdicker/msprime-1.0-paper

def validate(replicates, sample_size):
    """
    Cross-check the simulators by running ``replicates`` runs of each with
    the same settings and comparing the resulting tree-count distributions
    via printed means and Q-Q plots saved under ``figures/``.
    """
    # Settings shared by every simulator call.
    sim_kwargs = {
        "sample_size": sample_size,
        "L": 1000,
        "gc_rate": 0.015,
        "gc_tract_length": 10,
    }

    # One slot per replicate for each recorded quantity.
    simbac_trees = np.zeros(replicates)
    fastsimbac_trees = np.zeros(replicates)
    msprime_trees = np.zeros(replicates)
    msprime_breaks = np.zeros(replicates)

    with click.progressbar(range(replicates)) as progress:
        for rep in progress:
            simbac_trees[rep] = run_simbac(**sim_kwargs, count_trees=True)
            # fastsimbac is seeded with the replicate index.
            fastsimbac_trees[rep] = run_fastsimbac(
                **sim_kwargs, set_seed=rep, count_trees=True
            )
            trees, breakpoints = run_msprime(**sim_kwargs, ret_breakpoints=True)
            msprime_trees[rep] = trees
            msprime_breaks[rep] = breakpoints

    print(
        "mean number of trees:",
        "simbac=",
        np.mean(simbac_trees),
        "fastsimbac=",
        np.mean(fastsimbac_trees),
        "msprime trees=",
        np.mean(msprime_trees),
        "msprime breakpoints=",
        np.mean(msprime_breaks),
    )

    def _qq_against_msprime(other, msprime_values, other_label, out_path):
        # Single-sample Q-Q plot first, then the two-sample plot that is
        # labelled and saved.
        sm.graphics.qqplot(other)
        sm.qqplot_2samples(other, msprime_values, line="45")
        plt.xlabel(other_label)
        plt.ylabel("msprime")
        plt.savefig(out_path)

    # simbac is compared with the msprime breakpoint counts.
    _qq_against_msprime(
        simbac_trees,
        msprime_breaks,
        "simbac",
        "figures/verify_simbac_v_msprime.png",
    )

    plt.close("all")

    # fastsimbac is compared with the msprime tree counts.
    _qq_against_msprime(
        fastsimbac_trees,
        msprime_trees,
        "fastsimbac",
        "figures/verify_fastsimbac_v_msprime.png",
    )

示例#34

0

显示文件

文件： exo_9.py 项目： abenc/simulation-lois-proba

##pareto solving

from sympy import Eq, Symbol, solve

# Solve esp_c**2 + 2*var_c*k - var_c*k**2 = 0 for the Pareto shape k and keep
# the second root returned by solve(). The right-hand side is written out
# explicitly: single-argument Eq(expr) is the deprecated SymPy form.
k       = Symbol('k')
eqn     = Eq((esp_c**2) + (2*var_c*k) - (var_c*k**2), 0)
k_root  = solve(eqn)[1]
# With k fixed, solve esp_c = k*xm/(k-1) for the scale xm.
xm      = Symbol('xm')
eqn2    = Eq(esp_c - k_root*xm/(k_root-1), 0)
xm_root = solve(eqn2)[0]
# xm_root = 499.55021
# mode = xm_root*((k_root-1)/k_root)**1/k_root

# --- Exponential candidate: simulate a sample of the same size as vec_c with
# scale esp_c, then compare it with vec_c (histogram, Q-Q plot, 2-sample KS).
# `density=True` replaces the `normed=True` keyword, which Matplotlib removed.
exp_c          = np.random.exponential(scale=esp_c,size=len(vec_c))
count, bins, _ = plt.hist(exp_c, 200, density=True)
sm.qqplot_2samples(exp_c,vec_c,line='r').suptitle('echantillon C ~> expo (KS_dist = 0.49)', fontsize=20)
stats.ks_2samp(np.sort(exp_c),vec_c)
#Ks_2sampResult(statistic=0.49, pvalue=0.0

# --- Normal candidate.
# NOTE(review): the Q-Q plot compares against np.log(vec_c) while the KS test
# below uses the untransformed vec_c -- confirm which one is intended.
norm_c = np.random.normal(loc=esp_c,scale=sigma_c,size=len(vec_c))
count, bins, _ = plt.hist(norm_c, 200, density=True)
sm.qqplot_2samples(norm_c,np.log(vec_c),line='r').suptitle('echantillon C ~> normal (KS_dist = 0.07)', fontsize=20)
stats.ks_2samp(np.sort(norm_c),vec_c)
#Ks_2sampResult(statistic=0.068199999999999927, pvalue=6.3527863052483843e-41)

# --- Pareto candidate: np.random.pareto draws are shifted by 1 and scaled by
# the fitted xm_root.
par_c = (np.random.pareto(k_root,len(vec_c))+1) * float(xm_root)
count, bins, _ = plt.hist(par_c, 200, density=True)
sm.qqplot_2samples(par_c,vec_c,line='r').suptitle('echantillon C ~> pareto (KS_dist = 0.27)', fontsize=20)
stats.ks_2samp(np.sort(par_c),vec_c)
#Ks_2sampResult(statistic=0.26915, pvalue=0.0)

示例#35

0

显示文件

 def test_qqplot_2samples_arrays(self, close_figures):
     """Smoke-test ``qqplot_2samples`` on array inputs for each ``line`` value."""
     first_sample, second_sample = self.res, self.other_array
     # Exercise every reference-line option in turn.
     for line_option in ('r', 'q', '45', 's'):
         sm.qqplot_2samples(first_sample, second_sample, line=line_option)

示例#36

0

显示文件

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.distributions.empirical_distribution import ECDF
import statsmodels.api as sm
from paretochart.pareto import pareto

# Load the hospital dataset; the file is semicolon-separated.
df_ospedali = pd.read_csv(
    '/home/gabriele/Documenti/Università/Statistica/Dataset/csv_lab/dati-ospedali.csv',
    sep=';')
#print(df_ospedali)
# The two triple-quoted strings below are bare string statements used to
# disable earlier experiments (ECDF/histogram of 'Medici SSN', and a
# two-sample Q-Q plot of 'Medici SSN' vs 'Farmacisti SSN'); they do not run.
"""
dist = ECDF(df_ospedali['Medici SSN'])
#print(dist)
plt.plot(dist.x, dist.y)
plt.show()
df_ospedali['Medici SSN'].plot.hist()
plt.show()
"""
"""
sm.qqplot_2samples(df_ospedali['Medici SSN'], df_ospedali['Farmacisti SSN'], line = '45')
plt.show()
"""

grouped = df_ospedali.groupby(
    'Regione')  # group the rows by the values of the 'Regione' column
# Total of 'Medici SSN' per region, then draw it as a Pareto chart labelled
# with the region names.
temp = grouped['Medici SSN'].sum()
pareto(temp, labels=temp.index)
plt.show()