def main(nsim=500, n=200, p=50, target='full', sigma=3):
    """Repeatedly run ``test_group_lasso`` and monitor the pooled p-values.

    Each iteration pools the null (``p0``) and alternative (``pA``)
    p-values of one simulation with those of earlier iterations, prints
    running summaries, and on every third iteration (except the first)
    redraws the empirical CDFs of the pooled p-values into ``plot.pdf``.

    Parameters
    ----------
    nsim : int
        Number of simulation draws.
    n, p, target, sigma
        Forwarded unchanged to ``test_group_lasso``.
    """
    import matplotlib.pyplot as plt
    from statsmodels.distributions import ECDF

    P0, PA = [], []

    for i in range(nsim):
        try:
            p0, pA = test_group_lasso(n=n, p=p, target=target, sigma=sigma)
        except Exception:
            # A failed draw must contribute nothing.  Previously the handler
            # was ``pass``, which left p0/pA unbound on a first-iteration
            # failure and silently re-added the previous draw's p-values on
            # later failures.
            p0, pA = [], []
        print(len(p0), len(pA))
        P0.extend(p0)
        PA.extend(pA)

        # Null p-values at or below 1e-5 are excluded from the "clean" set;
        # their share is the last number printed below.
        P0_clean = np.array(P0)
        P0_clean = P0_clean[P0_clean > 1.e-5]
        print(np.mean(P0_clean), np.std(P0_clean),
              np.mean(np.array(PA) < 0.05),
              np.sum(np.array(PA) < 0.05) / (i + 1),
              np.mean(np.array(P0) < 0.05), np.mean(P0_clean < 0.05),
              np.mean(np.array(P0) < 1e-5), 'null pvalue + power + failure')

        if i % 3 == 0 and i > 0:
            U = np.linspace(0, 1, 101)
            plt.clf()
            if len(P0_clean) > 0:
                plt.plot(U, ECDF(P0_clean)(U))
            if len(PA) > 0:
                plt.plot(U, ECDF(PA)(U), 'r')
            # Diagonal reference line: the CDF of a uniform distribution.
            plt.plot([0, 1], [0, 1], 'k--')
            plt.savefig("plot.pdf")
    plt.show()
Exemplo n.º 2
0
def main():
    """Overlay ECDFs of stored p-values on the two ``marginal`` figures.

    Loads ``pval_20.npz``, draws the ECDF of its ``'known'`` array on the
    first figure and of ``'known'`` indexed by ``'hypotheses'`` on the
    second, adds a legend to each, and saves both figures as PDFs.
    """
    fig1, fig2, dbn = marginal(20, 3., 3, nsim=1000)
    full = np.load('pval_20.npz')
    Ugrid = np.linspace(0, 1, 101)

    # Shared appearance of the overlaid curve on both figures.
    overlay_style = dict(label=r'Selected using $i^*(Z)$',
                         c='green',
                         linewidth=5,
                         alpha=0.5)

    overlays = (
        (fig1, lambda arrays: arrays['known']),
        (fig2, lambda arrays: arrays['known'][arrays['hypotheses']]),
    )
    for figure, pick in overlays:
        axes = figure.gca()
        axes.plot(Ugrid, ECDF(pick(full))(Ugrid), **overlay_style)
        axes.legend(loc='lower right')

    for figure, filename in ((fig1, 'splitting_marginal_1sparse.pdf'),
                             (fig2, 'splitting_conditional_1sparse.pdf')):
        figure.savefig(filename)
def plot_ecdf_pair(data_0, data_1, x, label_0, label_1, unit):
    """Plot the ECDFs of two data sets on the current matplotlib axes.

    ``data_0`` and ``data_1`` must expose a ``.values`` attribute.  Each
    curve is evaluated on ``x`` and its legend entry reports the median of
    the data.  A dashed horizontal line is drawn at 0.5.
    """
    curves = ((data_0, label_0, 'm'), (data_1, label_1, 'Orange'))
    for series, name, colour in curves:
        sample = series.values
        curve = ECDF(sample)
        centre = np.median(sample)
        plt.plot(x,
                 curve(x),
                 lw=2.0,
                 c=colour,
                 label=name + ': median {:.1f} {}'.format(centre, unit))

    # Horizontal guide at the 50 % level.
    half_level = 0.5 * np.ones(len(x))
    plt.plot(x, half_level, lw=2.0, ls='--', c='b', alpha=.3)

    plt.grid()
    plt.tick_params(axis='both', which='major')
    plt.xlabel(label_0 + '/' + label_1 + ' [{}]'.format(unit))
    plt.ylabel('ECDF')
    plt.ylim([0, 1.05])
    plt.legend(loc='upper left')
    plt.tight_layout()
def main(nsim=500, n=500, p=100, sigma=3):
    """Repeatedly run ``test_multiple_queries`` and monitor pooled p-values.

    After every draw the running summaries of the pooled null and
    alternative p-values are printed; on every third iteration (except the
    first) their ECDFs are redrawn and written to ``plot.pdf``.
    """
    from statsmodels.distributions import ECDF
    import matplotlib.pyplot as plt

    P0, PA = [], []

    for i in range(nsim):
        p0, pA = test_multiple_queries(n=n, p=p, sigma=sigma)
        P0.extend(p0)
        PA.extend(pA)

        null_all = np.array(P0)
        alt_all = np.array(PA)
        # Keep only null p-values strictly above 1e-5 for the "clean" stats.
        P0_clean = null_all[null_all > 1.e-5]
        print(np.mean(P0_clean), np.std(P0_clean),
              np.mean(alt_all < 0.05), np.mean(null_all < 0.05),
              np.mean(P0_clean < 0.05), np.mean(null_all < 1e-5))

        if i > 0 and i % 3 == 0:
            U = np.linspace(0, 1, 101)
            plt.clf()
            if len(P0_clean) > 0:
                plt.plot(U, ECDF(P0_clean)(U))
            if len(PA) > 0:
                plt.plot(U, ECDF(PA)(U), 'r')
            plt.plot([0, 1], [0, 1], 'k--')
            plt.savefig("plot.pdf")
    plt.show()
Exemplo n.º 5
0
def median_summ_conn_pass_thresh(inSum,outSum,dmsoSum,matrixType,rnkpt_thresh=90,graph=True):
    """Count connections above a threshold and summarise them per pert_id.

    For each column of ``inSum``, ``outSum`` and ``dmsoSum`` the number of
    entries strictly greater than ``rnkpt_thresh`` is counted.  The counts
    for ``inSum`` and ``outSum`` are then grouped by the ``pert_id`` index
    level and reduced to their median.

    Parameters
    ----------
    inSum, outSum, dmsoSum : objects exposing ``.values`` and ``.columns``
        presumably pandas DataFrames -- confirm against the caller.
    matrixType : str
        Used only in the x-axis label of the plots.
    rnkpt_thresh : number
        Threshold a value must exceed to count as a connection.
    graph : bool
        When true, a histogram and a CDF plot are saved as PNG files under
        the module-level ``wkdir`` directory.

    Returns
    -------
    (dosMedConnect, dmsoSer)
        Median counts per pert_id for ``inSum`` and per-column counts for
        ``dmsoSum``.
    """
    passMask = np.zeros_like(inSum.values)
    passMask[np.where(inSum.values > rnkpt_thresh)] = 1
    # count connections passed threshold
    passSum = np.sum(passMask,axis=0)
    passSer = pd.Series(data=passSum,index=inSum.columns)
    passSer.name = 'number_of_connections_pass_' + str(rnkpt_thresh) + '_rnkpt'
    passGrped = passSer.groupby(level='pert_id')
    dosMedConnect = passGrped.median()
    dosMedConnect.name = 'median_number_of_connections_above_' + str(rnkpt_thresh) + '_rnkpt'
    # repeat calculation for DMSOs
    passMaskDMSO = np.zeros_like(dmsoSum.values)
    passMaskDMSO[np.where(dmsoSum.values > rnkpt_thresh)] = 1
    passSumDMSO = np.sum(passMaskDMSO,axis=0)
    dmsoSer = pd.Series(data=passSumDMSO,index=dmsoSum.columns)
    dmsoSer.name = 'number_of_connections_above_' + str(rnkpt_thresh) + '_rnkpt'
    # repeat calculation for non-dos compounds
    passMaskNon = np.zeros_like(outSum.values)
    passMaskNon[np.where(outSum.values > rnkpt_thresh)] = 1
    passSumNon = np.sum(passMaskNon,axis=0)
    nonSer = pd.Series(data=passSumNon,index=outSum.columns)
    nonSer.name = 'number_of_connections_pass_' + str(rnkpt_thresh) + '_rnkpt'
    nonSer.index.name = 'sig_id'
    nonGrped = nonSer.groupby(level='pert_id')
    nonMedConnect = nonGrped.median()
    if graph:
        # Common plotting range across the three count distributions.
        min1 = np.min([np.min(passSer.values),np.min(passSumNon),np.min(dmsoSer.values)])
        # Fixed: the DMSO term used np.min, which could truncate the range.
        max1 = np.max([np.max(passSer.values),np.max(passSumNon),np.max(dmsoSer.values)])
        # NOTE(review): `normed` was removed from newer matplotlib releases
        # in favour of `density`; kept as-is so older installs keep working.
        plt.hist(dmsoSer,30,color='b',range=[min1,max1],label=['DMSO n=' + str(len(dmsoSer))],alpha=.4,normed=True)
        # h2 = plt.hist(nonMedConnect,30,color='g',range=[min1,max1],label=['non_DOS n=' + str(len(nonMedConnect))],alpha=.4,normed=True)
        plt.hist(dosMedConnect,30,color='r',range=[min1,max1],label=['DOS n=' + str(len(dosMedConnect))],alpha=.3,normed=True)
        plt.legend()
        plt.ylabel('normed freq',fontweight='bold')
        plt.xlabel('median counts ('+ matrixType + ' > ' + str(rnkpt_thresh) + ')',fontweight='bold')
        plt.title('median connections (compounds collapsed by pert_id) - pass rnkpt ' + str(rnkpt_thresh))
        outF = os.path.join(wkdir, 'median_summly_counts_pass_threshold.png')
        plt.savefig(outF, bbox_inches='tight',dpi=200)
        plt.close()
        ### make cdf graph ####
        vals = np.linspace(min1,max1,100)
        dosEcdf = ECDF(dosMedConnect)
        dmsoEcdf = ECDF(dmsoSer)
        # Fixed: the non-DOS curve was built from dosMedConnect, so the
        # "non_DOS" line was an exact copy of the DOS line.
        nonEcdf = ECDF(nonMedConnect)
        obsDos = dosEcdf(vals)
        obsDmso = dmsoEcdf(vals)
        obsNon = nonEcdf(vals)
        plt.plot(vals,obsDos,color='b',label=['DOS n=' + str(len(dosMedConnect))])
        plt.plot(vals,obsNon,color='g',label=['non_DOS n=' + str(len(nonMedConnect))])
        plt.plot(vals,obsDmso,color='r',label=['DMSO n=' + str(len(dmsoSer))])
        # plt.legend()
        plt.ylabel('F(x)',fontweight='bold')
        plt.xlabel('median counts ('+ matrixType + ' > ' + str(rnkpt_thresh) + ')',fontweight='bold')
        # plt.title('median connections pass rnkpt ' + str(rnkpt_thresh))
        outF = os.path.join(wkdir, 'median_summly_counts_cdf.png')
        plt.savefig(outF, bbox_inches='tight',dpi=200)
        plt.close()
    return dosMedConnect, dmsoSer
def plot_ecdf_triplet(data_0,
                      data_1,
                      data_2,
                      x,
                      label_0=None,
                      label_1=None,
                      label_2=None,
                      unit=None,
                      plot_info=True):
    """Plot the ECDFs of three data sets on the current matplotlib axes.

    Each of ``data_0``..``data_2`` must expose a ``.values`` attribute.  The
    curves are evaluated on ``x``; every legend entry reports the median of
    its data set.  A dashed horizontal line is drawn at 0.5.

    Parameters
    ----------
    label_0, label_1, label_2 : str or None
        Curve names.  ``None`` (the default) is now accepted: the legend
        entry then consists of the median text only, and the label is left
        out of the x-axis caption.  Previously a ``None`` label raised
        ``TypeError`` on string concatenation.
    unit : object
        Formatted verbatim into the legend entries and the x-axis caption.
    plot_info : bool
        When true, axis captions, y-limits and the legend are added.
    """
    labels = (label_0, label_1, label_2)
    datasets = (data_0, data_1, data_2)
    colours = ('m', 'Blue', 'Orange')

    for series, name, colour in zip(datasets, labels, colours):
        sample = series.values
        median = np.median(sample)
        stats = 'median {:.1f} {}'.format(median, unit)
        # With a label the text is identical to the historical output.
        legend_text = stats if name is None else name + ': ' + stats
        plt.plot(x, ECDF(sample)(x), lw=2.0, c=colour, label=legend_text)

    # Horizontal guide at the 50 % level.
    plt.plot(x, 0.5 * np.ones(len(x)), lw=2.0, ls='--', c='b', alpha=.3)

    plt.grid()
    plt.tick_params(axis='both', which='major')
    if plot_info:
        named = [name for name in labels if name is not None]
        if named:
            plt.xlabel('/'.join(named) + ' [{}]'.format(unit))
        else:
            plt.xlabel('[{}]'.format(unit))
        plt.ylabel('ECDF')
        plt.ylim([0, 1.05])
        plt.legend(loc='upper left')
    plt.tight_layout()
Exemplo n.º 7
0
def main(nsim=500, n=500, p=200, sqrt=False, target='full', sigma=3, AR=True):
    """Repeatedly run one of three lasso tests and monitor pooled p-values.

    The test is chosen per call: ``test_sqrt_highdim_lasso`` when ``sqrt``
    is truthy, otherwise ``test_AR_randomization`` when ``AR`` is truthy,
    otherwise ``test_highdim_lasso``.  Note that ``sigma`` is not passed to
    the square-root variant.  Running summaries are printed after each
    draw and the ECDFs are rewritten to ``plot.pdf`` on every third
    iteration (except the first).
    """
    import matplotlib.pyplot as plt
    from statsmodels.distributions import ECDF

    P0, PA = [], []

    for i in range(nsim):
        if sqrt:
            p0, pA = test_sqrt_highdim_lasso(n=n,
                                             p=p,
                                             target=target,
                                             compare_to_lasso=False)
        elif AR:
            p0, pA = test_AR_randomization(n=n,
                                           p=p,
                                           target=target,
                                           sigma=sigma)
        else:
            p0, pA = test_highdim_lasso(n=n,
                                        p=p,
                                        target=target,
                                        sigma=sigma)
        print(len(p0), len(pA))
        P0.extend(p0)
        PA.extend(pA)

        null_all = np.array(P0)
        alt_all = np.array(PA)
        # Keep only null p-values strictly above 1e-5 for the "clean" stats.
        P0_clean = null_all[null_all > 1.e-5]
        print(np.mean(P0_clean), np.std(P0_clean),
              np.mean(alt_all < 0.05),
              np.sum(alt_all < 0.05) / (i + 1),
              np.mean(null_all < 0.05), np.mean(P0_clean < 0.05),
              np.mean(null_all < 1e-5), 'null pvalue + power + failure')

        if i > 0 and i % 3 == 0:
            U = np.linspace(0, 1, 101)
            plt.clf()
            if len(P0_clean) > 0:
                plt.plot(U, ECDF(P0_clean)(U))
            if len(PA) > 0:
                plt.plot(U, ECDF(PA)(U), 'r')
            plt.plot([0, 1], [0, 1], 'k--')
            plt.savefig("plot.pdf")
    plt.show()
Exemplo n.º 8
0
def cdf(path):
    """Plot empirical vs. binomial CDFs of the coverage counts in ``path``.

    The file is read with ``load_coverage_file``.  For every target
    confidence level in ``params["cl"]`` the ECDF of the matching coverage
    counts (last column of ``vals``, selected where the second-to-last
    column equals the level) is drawn together with the binomial CDF for
    ``params["ntest"]`` trials at that level.
    """
    params, colnames, vals = load_coverage_file(path)
    from matplotlib import pyplot as plt
    from scipy.stats import binom
    from statsmodels.distributions import ECDF

    # Prepare binning
    n_MC = params["ntest"]
    levels = params["cl"]
    covered = vals[:, -1]
    lowest = min(n_MC * min(levels), covered.min())
    bin_centers = np.arange(lowest, n_MC + 1)

    plt.figure()
    plt.title("coverage histogram {}".format(path))
    # One (linestyle, colour) pair per confidence level, recycled if needed.
    styles = cycle(product(("solid", "dashed", "dashdot", "dotted"), "rgbmk"))
    for targ_cl, (ls, col) in zip(levels, styles):
        selected = covered[vals[:, -2] == targ_cl]
        observed = ECDF(selected)
        plt.plot(bin_centers,
                 observed(bin_centers),
                 color=col,
                 linestyle=ls,
                 linewidth=3,
                 label="{0:.5f}".format(targ_cl))
        expected = binom.cdf(bin_centers, n_MC, targ_cl)
        plt.plot(bin_centers,
                 expected,
                 color=col,
                 linestyle=ls,
                 marker="o",
                 mew=0)

    plt.legend(loc="best", title="target CL")
    plt.xlabel("#(covered)")
    plt.ylabel("cumulative frequency (CDF)")
    plt.show()
Exemplo n.º 9
0
def CDFm(data,
         nPoint,
         dist='normal',
         mu=0,
         sigma=1,
         analitica=False,
         lim=None):
    """Return ``nPoint`` abscissae equally spaced in cumulative probability.

    The probabilities run from ``5e-5`` to ``1 - 5e-5``.

    Parameters
    ----------
    data : sequence
        Sample used to build the empirical CDF (ignored when
        ``analitica`` is true).
    nPoint : int
        Number of points to return.
    dist : {'normal', 'lognormal'}
        Analytic distribution used when ``analitica`` is true.
    mu, sigma : float
        Parameters of the analytic distribution.
    analitica : bool
        If true use the analytic quantile function, otherwise invert the
        empirical CDF of ``data`` by nearest-neighbour interpolation.
    lim : (low, high) or None
        Range over which the empirical CDF is tabulated.  ``None`` now
        falls back to ``(min(data), max(data))``; it used to raise
        ``TypeError``.

    Raises
    ------
    ValueError
        If ``analitica`` is true and ``dist`` is not a supported name
        (previously an ``UnboundLocalError``).
    """
    import numpy as np
    from scipy.stats import norm, lognorm

    eps = 5e-5
    y = np.linspace(eps, 1 - eps, nPoint)

    if analitica:
        if dist == 'normal':
            return norm.ppf(y, loc=mu, scale=sigma)
        if dist == 'lognormal':
            return lognorm.ppf(y, sigma, loc=0, scale=np.exp(mu))
        raise ValueError(
            "dist must be 'normal' or 'lognormal', got {!r}".format(dist))

    # Only the empirical path needs these, so import them here; the
    # analytic path then works without statsmodels installed.
    from scipy.interpolate import interp1d
    from statsmodels.distributions import ECDF

    if lim is None:
        lim = (min(data), max(data))

    ecdf = ECDF(data)
    xest = np.linspace(lim[0], lim[1], int(100e3))
    yest = ecdf(xest)
    # Invert the tabulated ECDF: probability -> abscissa.
    interp = interp1d(yest, xest, fill_value='extrapolate', kind='nearest')
    return interp(y)
Exemplo n.º 10
0
    def pval_adjust_WY(self, cov, pvals, N=10000):
        """
        Purpose:
        multiple testing correction with a Westfall young-like procedure as
        in ridge projection method, http://arxiv.org/abs/1202.1377 P.Buehlmann
        ======================================================================
        :param cov: covariance matrix of your estimator
        :param pvals: single testing pvalues
        :param N: the number of samples to take for the empirical distribution
        :return pcorr: corrected p-values
        ======================================================================
        Author: Ziyan Zhu, Date: April 10th, 2019
        Following R version by Ruben Dezeure, Date: 6 Feb 2014, 14:27
        """
        ncol = cov.shape[1]
        # N draws from N(0, cov); zz has shape (N, ncol).
        zz = np.random.multivariate_normal(mean=np.zeros(ncol),
                                           cov=cov,
                                           size=N)
        # Standardise every coordinate by its standard deviation.
        zz2 = zz / np.sqrt(np.diagonal(cov))
        # Two-sided p-value of each simulated coordinate.
        gz = 2 * norm.sf(abs(zz2))
        # Minimum p-value within each simulated draw (one value per row),
        # giving N samples of the min-p statistic.  The previous axis=0
        # reduced over the draws instead and left only ncol values.
        GZ = np.min(gz, axis=1)

        ecdf = ECDF(GZ)
        pcorr = ecdf(pvals)
        return pcorr
Exemplo n.º 11
0
def main(nsim=500):
    """Pool p-values from repeated ``test_lasso_iv_instance`` runs and plot.

    Runs ``nsim`` simulations with fixed settings, prints the mean, the
    standard deviation and the share of pooled p-values below 0.05, and
    shows their ECDF against the diagonal.
    """
    P0 = []
    from statsmodels.distributions import ECDF

    n, p, s = 1000, 10, 3
    Sigma_12 = 0.8
    gsnr = 1.
    beta_star = 1.

    for i in range(nsim):
        try:
            p0 = test_lasso_iv_instance(n=n, p=p, s=s, Sigma_12=Sigma_12, gsnr=gsnr, beta_star=beta_star)
        except Exception:
            # Best effort: a failed draw contributes no p-values.  Catching
            # Exception rather than everything keeps Ctrl-C / SystemExit
            # able to stop the simulation loop.
            p0 = []
        P0.extend(p0)

    print(np.mean(P0), np.std(P0), np.mean(np.array(P0) < 0.05))

    U = np.linspace(0, 1, 101)
    #plt.clf()
    plt.plot(U, ECDF(P0)(U))
    # Diagonal reference line: the CDF of a uniform distribution.
    plt.plot(U, U, 'r--')
    #plt.savefig("plot.pdf")
    plt.show()
Exemplo n.º 12
0
def L(muestra, alpha):
    """Return the ECDF heights of ``muestra`` lowered by epsilon, floored at 0.

    ``epsilon = sqrt(log(2 / alpha) / (2 * n))`` with ``n = len(muestra)``,
    which has the form of the half-width of a DKW-type confidence band.
    The result has one entry per element of the ECDF's ``y`` array.
    """
    n = len(muestra)
    epsilon = sqrt(log(2. / alpha) / (2 * n))
    alturas = ECDF(muestra).y
    out = zeros(len(alturas))
    for pos, altura in enumerate(alturas):
        # Clip at zero so the band never drops below a valid probability.
        out[pos] = max(altura - epsilon, 0)
    return out
Exemplo n.º 13
0
	def passive_aggressive_train(self):
		'''Trains passive aggressive classifier

		Fits a PassiveAggressiveClassifier on the term-document matrix and
		stores in ``self._proba`` a function mapping a signed distance from
		the separating hyperplane to a score: 0.5 for a distance of zero,
		``0.5 + pos_ecdf(d) / 2`` for positive distances and
		``neg_ecdf(d) / 2`` for negative ones, where the two ECDFs are built
		from the non-negative and non-positive training distances.

		Returns
		-------
		self
		'''
		self._clf = PassiveAggressiveClassifier(n_iter=50, C=0.2, n_jobs=-1, random_state=0)
		self._clf.fit(self._term_doc_matrix._X, self._term_doc_matrix._y)
		y_dist = self._clf.decision_function(self._term_doc_matrix._X)
		pos_ecdf = ECDF(y_dist[y_dist >= 0])
		neg_ecdf = ECDF(y_dist[y_dist <= 0])

		def proba_function(distance_from_hyperplane):
			if distance_from_hyperplane > 0:
				return pos_ecdf(distance_from_hyperplane) / 2. + 0.5
			elif distance_from_hyperplane < 0:
				# Fixed: this branch used pos_ecdf, which is 0 for every
				# negative distance, so all negative inputs scored 0 and
				# neg_ecdf was never used.
				return neg_ecdf(distance_from_hyperplane) / 2.
			return 0.5

		self._proba = proba_function
		return self
Exemplo n.º 14
0
    def fit(self, x):
        """Build one ECDF per column of ``x`` and store them in ``self.ecdfs``.

        One-dimensional input is reshaped to a single column.  Columns are
        taken by numpy slicing when ``is_numpy(x)`` is true, otherwise via
        ``x.iloc[:, i].values``.  Returns ``self``.
        """
        self.ecdfs = {}
        if len(x.shape) == 1:
            x = x.reshape(-1, 1)
        ncols = x.shape[1]
        is_np = is_numpy(x)

        for col in range(ncols):
            if is_np:
                column = x[:, col]
            else:
                column = x.iloc[:, col].values
            self.ecdfs[col] = ECDF(column)
        return self
Exemplo n.º 15
0
def plotCDF(data, logScale=False):
    """Show the empirical CDF of ``data`` in a new matplotlib figure.

    When ``logScale`` is true the x-axis is switched to a log scale.
    """
    from statsmodels.distributions.empirical_distribution import ECDF
    import matplotlib.pylab as plt

    curve = ECDF(data)
    plt.figure()
    plt.plot(curve.x, curve.y, linewidth=2)
    plt.ylabel('CDF')
    if logScale:
        plt.xscale('log')
    plt.show()
Exemplo n.º 16
0
def cdf_dphase(delay, freq, title='Histogram of phase lag', xscale='f'):
    """
    Plot cdf of delay for given frequencies.

    Params:
    --------
    delay (ndarray)
        (n_freq,n_samples) Phase distance between two trajectories.
    freq (ndarray)
        Frequencies that are given.
    title (str)
    xscale (str)
        'f' means frequency scale and 't' means time scale

    Returns:
    --------
    fig, ax
        The matplotlib figure and axes that were drawn on.
    """
    from misc.plot import set_ticks_radian, colorcycle
    from statsmodels.distributions import ECDF

    fig, ax = plt.subplots(figsize=(7, 4))
    c = colorcycle(len(freq))
    for freqix in range(len(freq)):
        ecdf = ECDF(delay[freqix])
        if xscale == 't':
            # Rescale the ECDF abscissa by 1 / (2*pi*f) for this frequency.
            ax.plot(ecdf.x / (2 * np.pi) / freq[freqix],
                    ecdf.y,
                    '-',
                    alpha=1,
                    c=next(c),
                    lw=2)
        else:
            ax.plot(ecdf.x, ecdf.y, '-', alpha=1, c=next(c), lw=2)

    if xscale == 't':
        xlim = [-1 / freq[0], 1 / freq[0]]
        xticks = np.arange(*xlim)
    else:
        xlim = [-np.pi, np.pi]
        # Fixed: the second tick was +pi/2, which duplicated the fourth
        # tick and left -pi/2 unlabelled.
        xticks = [-np.pi, -np.pi / 2, 0, np.pi / 2, np.pi]
    ax.set(xlim=xlim,
           xticks=xticks,
           xlabel='Phase lag',
           ylabel='CDF',
           title=title)
    set_ticks_radian(ax, axis='x')
    ax.legend(['%1.1f Hz' % f for f in freq],
              numpoints=1,
              title='Frequency',
              fontsize='small',
              bbox_to_anchor=[1.4, 1.03],
              labelspacing=.1)
    ax.grid()
    return fig, ax
Exemplo n.º 17
0
def plotDistributions(data, title, lineC, f=None, ax=None):
    """
    Draw the ECDFs of rows 0 and 1 of ``data`` as step curves on two
    stacked axes (one per paired-end read set).

    A new figure with two shared-axis subplots is created when ``f`` is
    None; otherwise the supplied ``f`` and ``ax`` are drawn on.  ``title``
    is accepted but not used by this function.  Returns ``(f, ax)``.
    """

    from statsmodels.distributions import ECDF
    import matplotlib
    matplotlib.use('agg')
    import matplotlib.pylab as plt

    if f is None:
        f, ax = plt.subplots(2, sharey=True, sharex=True)

    rows = (0, 1)
    curves = [ECDF(data[row, :]) for row in rows]
    for row in rows:
        ax[row].step(curves[row].x, curves[row].y, color=lineC, alpha=.5)
    for row, caption in zip(rows, ('Read 0', 'Read 1')):
        ax[row].set_title(caption)

    f.suptitle('Cumulative Distributions of Read Length')
    return f, ax
Exemplo n.º 18
0
def main():
    """Plot the two MLE sequences and the pivot ECDF from ``test_agreement``.

    Figure 1 shows ``MLE_cur`` and ``MLE_prev`` against ``beta_seq``;
    figure 2 shows the ECDF of ``pivot`` with the diagonal as reference.
    """
    beta_seq, MLE_cur, MLE_prev, pivot = test_agreement()

    # Figure 1: current vs. previous MLE along the beta sequence.
    plt.figure(num=1)
    current = np.array(MLE_cur)
    previous = np.array(MLE_prev)
    plt.plot(beta_seq, current, label='MLE now')
    plt.plot(beta_seq, previous, 'r--', label='MLE prev')
    plt.legend()

    # Figure 2: ECDF of the pivots on a grid over [0, 1].
    plt.figure(num=2)
    U = np.linspace(0, 1, 101)
    pivot_cdf = ECDF(pivot)
    plt.plot(U, pivot_cdf(U))
    plt.plot([0, 1], [0, 1], 'k--')
def main(nsim=500):
    """Repeatedly run ``test_condition_subgrad`` and monitor pooled p-values.

    After every draw the mean and standard deviation of the pooled null
    p-values and the share of alternative p-values below 0.05 are printed;
    on every third iteration (except the first) the ECDFs are redrawn and
    written to ``plot.pdf``.
    """
    P0, PA = [], []
    from statsmodels.distributions import ECDF

    for i in range(nsim):
        try:
            p0, pA = test_condition_subgrad(n=200, p=10)
        except Exception:
            # Best effort: a failed draw contributes no p-values.  Catching
            # Exception rather than everything keeps Ctrl-C / SystemExit
            # able to stop the simulation loop.
            p0, pA = [], []
        P0.extend(p0)
        PA.extend(pA)
        print(np.mean(P0), np.std(P0), np.mean(np.array(PA) < 0.05))

        if i % 3 == 0 and i > 0:
            U = np.linspace(0, 1, 101)
            plt.clf()
            if len(P0) > 0:
                plt.plot(U, ECDF(P0)(U))
            if len(PA) > 0:
                plt.plot(U, ECDF(PA)(U), 'r')
            # Diagonal reference line: the CDF of a uniform distribution.
            plt.plot([0, 1], [0, 1], 'k--')
            plt.savefig("plot.pdf")
    plt.show()
Exemplo n.º 20
0
def CDFm(data,nPoint):
    """Return ``nPoint`` abscissae equally spaced in empirical probability.

    The ECDF of ``data`` is tabulated on 100000 points between
    ``min(data)`` and ``max(data)`` and inverted by nearest-neighbour
    interpolation at probabilities from ``5e-5`` to ``1 - 5e-5``.
    """
    import numpy as np
    from scipy.interpolate import interp1d
    from statsmodels.distributions import ECDF
    eps = 5e-5

    ecdf = ECDF(data)
    inf,sup = min(data),max(data)
    support = np.linspace(inf,sup,int(100e3))
    heights = ecdf(support)
    # Swap the roles of x and y to obtain probability -> abscissa.
    inverse = interp1d(heights,support,fill_value = 'extrapolate', kind = 'nearest')
    targets = np.linspace(eps,1-eps,nPoint)

    return inverse(targets)
def main():
    """Plot the two MLE sequences and the pivot ECDF from ``test_agreement``.

    Figure 1 shows ``MLE_cur`` and ``MLE_prev`` against ``beta_seq``;
    figure 2 shows the ECDF of ``pivot`` with the diagonal as reference.
    """
    beta_seq, MLE_cur, MLE_prev, pivot = test_agreement()

    import matplotlib.pyplot as plt
    from statsmodels.distributions import ECDF

    # Figure 1: current vs. previous MLE along the beta sequence.
    plt.figure(num=1)
    current = np.array(MLE_cur)
    previous = np.array(MLE_prev)
    plt.plot(beta_seq, current, label='MLE now')
    plt.plot(beta_seq, previous, 'r--', label='MLE prev')
    plt.legend()

    # Figure 2: ECDF of the pivots on a grid over [0, 1].
    plt.figure(num=2)
    U = np.linspace(0, 1, 101)
    pivot_cdf = ECDF(pivot)
    plt.plot(U, pivot_cdf(U))
    plt.plot([0, 1], [0, 1], 'k--')
Exemplo n.º 22
0
	def _get_scaler_function(scaler_algo):
		scaler = None
		if scaler_algo == 'normcdf':
			scaler = lambda x: norm.cdf(x, x.mean(), x.std())
		elif scaler_algo == 'lognormcdf':
			scaler = lambda x: norm.cdf(np.log(x), np.log(x).mean(), np.log(x).std())
		elif scaler_algo == 'percentile':
			scaler = lambda x: rankdata(x).astype(np.float64) / len(x)
		elif scaler_algo == 'percentiledense':
			scaler = lambda x: rankdata(x, method='dense').astype(np.float64) / len(x)
		elif scaler_algo == 'ecdf':
			from statsmodels.distributions import ECDF
			scaler = lambda x: ECDF(x)
		elif scaler_algo == 'none':
			scaler = lambda x: x
		else:
			raise InvalidScalerException("Invalid scaler alogrithm.  Must be either percentile or normcdf.")
		return scaler
def main(nsim=500):
    """Run ``test_selected_targets`` ``nsim`` times and plot the pivot ECDF.

    The running average of the first returned value is printed after each
    iteration; afterwards the ECDF of the collected second values is
    plotted against the diagonal.
    """
    cover = 0.
    pivot = []

    for i in range(nsim):
        cover_, pivot_ = test_selected_targets()
        cover += cover_
        pivot.append(pivot_)

        completed = i + 1.
        print("iteration completed ", i)
        print("coverage so far ", cover/completed)

    plt.clf()
    grid = np.linspace(0, 1, 101)
    ecdf_MLE = ECDF(np.asarray(pivot))
    plt.plot(grid, ecdf_MLE(grid), c='blue', marker='^')
    plt.plot(grid, grid, 'k--')
    plt.show()
Exemplo n.º 24
0
def lmda_estimator(data):
    """Estimate lambda by fitting a two-parameter CDF to the ECDF of ``data``.

    The model ``F(x) = 1 - lmda * (x - D + 1/lmda) * exp(-lmda * (x - D))``
    is fitted with ``curve_fit`` to the empirical CDF evaluated at the data
    points.  Only the fitted ``lmda`` is returned; the fitted ``D`` and the
    covariance matrix are discarded.
    """
    empirical = ECDF(data)
    heights = empirical(data)

    def F(x, lmda, D):
        shifted = x - D
        return 1 - lmda * (shifted + 1 / lmda) * np.exp(-lmda * shifted)

    fitted, _covariance = curve_fit(F, data, heights)
    return fitted[0]
Exemplo n.º 25
0
    def pval_adjust_WY(self, cov, pval, N=10000):
        """Westfall-Young-like multiple testing correction of p-values.

        Follows the procedure of the ridge projection method,
        http://arxiv.org/abs/1202.1377 (P. Buehlmann).

        :param cov: covariance matrix of your estimator
        :param pval: the single testing p-values
        :param N: the number of samples to take for the empirical
            distribution which is used to correct the p-values
        :return: corrected p-values

        R-version Author: Ruben Dezeure, Date: 6 Feb 2014, 14:27
        """
        ncol = cov.shape[1]
        # N draws from N(0, cov); zz has shape (N, ncol).
        zz = np.random.multivariate_normal(mean=np.zeros(ncol),
                                           cov=cov,
                                           size=N)
        # Standardise every coordinate by its standard deviation.
        zz2 = zz / np.sqrt(np.diagonal(cov))
        # Two-sided p-value of each simulated coordinate.
        gz = 2 * norm.sf(abs(zz2))
        # Minimum p-value within each simulated draw (one value per row),
        # giving N samples of the min-p statistic.  The previous axis=0
        # reduced over the draws instead and left only ncol values.
        GZ = np.min(gz, axis=1)

        ecdf = ECDF(GZ)
        pcorr = ecdf(pval)
        return pcorr
Exemplo n.º 26
0
from statsmodels.distributions import ECDF

def empirical_cdf_plot(xs):
    """Plot the ECDF of ``xs`` on fresh axes and return those axes.

    The curve is evaluated on 100 evenly spaced points between the
    NaN-ignoring minimum and maximum of ``xs``.
    """
    curve = ECDF(xs)
    lower, upper = np.nanmin(xs), np.nanmax(xs)
    grid = np.linspace(lower, upper, 100)
    ax = plt.axes()
    ax.plot(grid, curve(grid))
    ax.set_ylabel('F(x)')
    return ax
# Plot the empirical CDF of passSer and keep the returned axes.
ax1 = empirical_cdf_plot(passSer)

# Though something simple works too
    # Overlay the ECDFs of ridge_r and linreg_r on a grid over [-1, 1].
    ecdf_ridge = ECDF(ridge_r)
    ecdf_linreg = ECDF(linreg_r)
    vals = np.linspace(-1,1,100)
    ax = plt.axes()
    # Fixed: the legend labels were swapped (the ridge curve was labelled
    # 'LR' and the linear-regression curve 'Ridge').
    ax.plot(vals,ecdf_ridge(vals),label='Ridge',linewidth=2)
    ax.plot(vals,ecdf_linreg(vals),label='LR',linewidth=2)



# Reindex sigSer by the 'sig_id' values of dosGold.
dosSer = sigSer.reindex(dosGold['sig_id'].values)

### make summary table:
# 1) pert_id
# 2) times_profiled_in_a2
# 3) times_gold_in_a2
# 4) is_gold_cell lines
Exemplo n.º 27
0
 # Fragment of a larger routine (enclosing definition not visible here):
 # builds a permutation null for a median statistic from random dmsoFrm
 # columns and turns medObs into a two-tailed empirical p-value.
 rMed = rowMedian[pIds]
 fig = plt.figure(1, figsize=(10, 10))
 # make matrix of equal size using null
 nperm = 10000
 permDict = {}
 for iperm in range(nperm):
     # Draw len(pIds) column positions of dmsoFrm (np.random.choice samples
     # with replacement by default).
     iRand = np.random.choice(range(0, dmsoFrm.shape[1]), size=(len(pIds)))
     iRandCol = dmsoFrm.columns[iRand]  #random column names
     smDmso = dmsoFrm.reindex(index=pIds, columns=iRandCol)
     # remove identity cells and unstack
     uDmso = no_diagonal_unstack(smDmso)
     medDmso = uDmso.median()
     permDict[iperm] = medDmso
 # One null median per permutation, keyed by permutation number.
 nullSer = pd.Series(permDict)
 #two tailed p-value
 ecdf = ECDF(nullSer)
 arg1 = ecdf(medObs)
 arg2 = 1 - ecdf(medObs)
 # Twice the smaller tail probability.
 pval = 2 * np.minimum(arg1, arg2)
 #set p-val min
 # A p-value of exactly zero is replaced by 1/nperm.
 if pval == 0:
     pval = 1 / float(nperm)
 pvalDict[cName] = pval
 if graph:
     # graph heatmap of each
     plt.imshow(smFrm.values,
                interpolation='nearest',
                aspect='auto',
                vmin=-100,
                vmax=100,
                cmap=cm.RdBu_r)
Exemplo n.º 28
0
def diffArea(nest,
             outlier=0,
             data=0,
             kinds='all',
             axis='probability',
             ROI=20,
             mu=0,
             sigma=1,
             weight=False,
             interpolator='linear',
             distribuition='normal',
             seed=None,
             plot=True):
    """
    Return an error area between a analitic function and a estimated discretization from a distribuition.

    Parameters
    ----------
    nest: int
        The number of estimation points.
    outlier: int, optional
        Is the point of an outlier event, e.g outlier = 50 will put an event in -50 and +50 if mu = 0.
        Defaut is 0
    data: int, optional
        If data > 0, a randon data will be inserted insted analitcs data.
        Defaut is 0.
    kinds: str or array, optional
        specifies the kind of distribuition to analize.
        ('Linspace', 'CDFm', 'PDFm', 'iPDF1', 'iPDF2', 'all').
        Defaut is 'all'.
    axis: str, optional
        specifies the x axis to analize
        ('probability', 'derivative', '2nd_derivative', 'X').
        Defaut is 'probability'.
    ROI: int, optional
        Specifies the number of regions of interest.
        Defaut is 20.
    mu: int, optional
        Specifies the mean of distribuition.
        Defaut is 0.
    sigma: int, optional
        Specifies the standard desviation of a distribuition.
        Defaut is 1.
    weight: bool, optional
        if True, each ROI will have a diferent weight to analyze.
        Defaut is False
    interpolator: str, optional
        Specifies the kind of interpolation as a string
        ('linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic'
        where 'zero', 'slinear', 'quadratic' and 'cubic' refer to a spline
        interpolation of zeroth, first, second or third order) or as an
        integer specifying the order of the spline interpolator to use.
        Default is 'linear'.
    distribuition: str, optional
        Select the distribuition to analyze.
        ('normal', 'lognormal')
        Defaut is 'normal'
    plot: bool, optional
        If True, a plot will be ploted with the analyzes
        Defaut is True
        
    Returns
    -------
    a, [b,c]: float and float of ndarray. area,[probROIord,areaROIord]
       returns the sum of total error area and the 'x' and 'y' values.   
    

    """
    import numpy as np
    from scipy.stats import norm, lognorm
    from scipy.interpolate import interp1d
    from numpy import exp
    import matplotlib.pyplot as plt
    from statsmodels.distributions import ECDF
    from distAnalyze import pdf, dpdf, ddpdf, PDF, dPDF, ddPDF

    area = []
    n = []
    data = int(data)
    if distribuition == 'normal':
        outlier_inf = outlier_sup = outlier
    elif distribuition == 'lognormal':
        outlier_inf = 0
        outlier_sup = outlier

    ngrid = int(1e6)
    truth = pdf

    if axis == 'probability':
        truth1 = pdf
    elif axis == 'derivative':
        truth1 = dpdf
    elif axis == '2nd_derivative':
        truth1 = ddpdf
    elif axis == 'X':
        truth1 = lambda x, mu, sigma, distribuition: x
    #else: return 'No valid axis'

    probROIord = {}
    areaROIord = {}
    div = {}
    if seed is not None:
        np.random.set_state(seed)
    if data:
        if distribuition == 'normal':
            d = np.random.normal(mu, sigma, data)
        elif distribuition == 'lognormal':
            d = np.random.lognormal(mu, sigma, data)

    if kinds == 'all':
        kinds = ['Linspace', 'CDFm', 'PDFm', 'iPDF1', 'iPDF2']
    elif type(kinds) == str:
        kinds = [kinds]

    for kind in kinds:
        if distribuition == 'normal':
            inf, sup = norm.interval(0.9999, loc=mu, scale=sigma)

        elif distribuition == 'lognormal':
            inf, sup = lognorm.interval(0.9999, sigma, loc=0, scale=exp(mu))
            inf = lognorm.pdf(sup, sigma, loc=0, scale=np.exp(mu))
            inf = lognorm.ppf(inf, sigma, loc=0, scale=np.exp(mu))

        xgrid = np.linspace(inf, sup, ngrid)
        xgridROI = xgrid.reshape([ROI, ngrid // ROI])

        dx = np.diff(xgrid)[0]

        if kind == 'Linspace':
            if not data:
                xest = np.linspace(inf - outlier_inf, sup + outlier_sup, nest)
            else:
                if distribuition == 'normal':
                    #d = np.random.normal(loc = mu, scale = sigma, size = data)
                    inf, sup = min(d), max(d)
                    xest = np.linspace(inf - outlier_inf, sup + outlier_sup,
                                       nest)
                elif distribuition == 'lognormal':
                    #d = np.random.lognormal(mean = mu, sigma = sigma, size = data)
                    inf, sup = min(d), max(d)
                    xest = np.linspace(inf - outlier_inf, sup + outlier_sup,
                                       nest)

            yest = pdf(xest, mu, sigma, distribuition)

        elif kind == 'CDFm':
            eps = 5e-5
            yest = np.linspace(0 + eps, 1 - eps, nest)
            if distribuition == 'normal':
                if not data:
                    xest = norm.ppf(yest, loc=mu, scale=sigma)
                    yest = pdf(xest, mu, sigma, distribuition)
                else:
                    #d = np.random.normal(loc = mu, scale = sigma, size = data)
                    ecdf = ECDF(d)
                    inf, sup = min(d), max(d)
                    xest = np.linspace(inf, sup, data)
                    yest = ecdf(xest)
                    interp = interp1d(yest,
                                      xest,
                                      fill_value='extrapolate',
                                      kind='nearest')
                    yest = np.linspace(eps, 1 - eps, nest)
                    xest = interp(yest)

            elif distribuition == 'lognormal':
                if not data:
                    xest = lognorm.ppf(yest, sigma, loc=0, scale=exp(mu))
                    yest = pdf(xest, mu, sigma, distribuition)
                else:
                    #d = np.random.lognormal(mean = mu, sigma = sigma, size = data)
                    ecdf = ECDF(d)
                    inf, sup = min(d), max(d)
                    xest = np.linspace(inf, sup, nest)
                    yest = ecdf(xest)
                    interp = interp1d(yest,
                                      xest,
                                      fill_value='extrapolate',
                                      kind='nearest')
                    yest = np.linspace(eps, 1 - eps, nest)
                    xest = interp(yest)

        elif kind == 'PDFm':
            xest, yest = PDF(nest, mu, sigma, distribuition, outlier, data,
                             seed)
        elif kind == 'iPDF1':
            xest, yest = dPDF(nest, mu, sigma, distribuition, outlier, data,
                              10, seed)
        elif kind == 'iPDF2':
            xest, yest = ddPDF(nest, mu, sigma, distribuition, outlier, data,
                               10, seed)

        YY = pdf(xest, mu, sigma, distribuition)
        fest = interp1d(xest,
                        YY,
                        kind=interpolator,
                        bounds_error=False,
                        fill_value=(YY[0], YY[-1]))

        #fest = lambda x: np.concatenate([fest1(x)[fest1(x) != -1],np.ones(len(fest1(x)[fest1(x) == -1]))*fest1(x)[fest1(x) != -1][-1]])

        yestGrid = []
        ytruthGrid = []
        ytruthGrid2 = []
        divi = []

        for i in range(ROI):
            yestGrid.append([fest(xgridROI[i])])
            ytruthGrid.append([truth(xgridROI[i], mu, sigma, distribuition)])
            ytruthGrid2.append([truth1(xgridROI[i], mu, sigma, distribuition)])
            divi.append(
                len(
                    np.intersect1d(
                        np.where(xest >= min(xgridROI[i]))[0],
                        np.where(xest < max(xgridROI[i]))[0])))

        diff2 = np.concatenate(
            abs((np.array(yestGrid) - np.array(ytruthGrid)) * dx))
        #diff2[np.isnan(diff2)] = 0
        areaROI = np.sum(diff2, 1)

        divi = np.array(divi)
        divi[divi == 0] = 1

        try:
            probROI = np.mean(np.sum(ytruthGrid2, 1), 1)
        except:
            probROI = np.mean(ytruthGrid2, 1)

        probROIord[kind] = np.sort(probROI)
        index = np.argsort(probROI)

        areaROIord[kind] = areaROI[index]
        #deletes = ~np.isnan(areaROIord[kind])
        #areaROIord[kind] = areaROIord[kind][deletes]
        #probROIord[kind] = probROIord[kind][deletes]

        area = np.append(area, np.sum(areaROIord[kind]))
        n = np.append(n, len(probROIord[kind]))
        div[kind] = divi[index]
        if plot:
            if weight:
                plt.logy(probROIord[kind],
                         areaROIord[kind] * div[kind],
                         '-o',
                         label=kind,
                         ms=3)
            else:
                plt.plot(probROIord[kind],
                         areaROIord[kind],
                         '-o',
                         label=kind,
                         ms=3)

            plt.yscale('log')
            plt.xlabel(axis)
            plt.ylabel('Error')
            plt.legend()

        #plt.title('%s - Pontos = %d, div = %s - %s' %(j,nest, divs,interpolator))

    return area, [probROIord, areaROIord]
Exemplo n.º 29
0
    def ppplot(self,
               xlabel=None,
               ylabel=None,
               line=None,
               other=None,
               ax=None,
               **plotkwargs):
        """
        Draw a P-P (probability-probability) plot.

        Without `other`, the sample percentiles of `self` are plotted
        against its theoretical percentiles. With `other`, the empirical
        CDF of `other` evaluated at the sample quantiles of `self` is
        plotted against the theoretical percentiles of `self`.

        Parameters
        ----------
        xlabel : str or None, optional
            Label for the x-axis. If None (default), a label is chosen
            according to whether `other` was supplied.
        ylabel : str or None, optional
            Label for the y-axis. If None (default), a label is chosen
            according to whether `other` was supplied.
        line : str {'45', 's', 'r', 'q'} or None, optional
            Reference line against which the data is compared:

                - '45': 45-degree line
                - 's': standardized line, the expected order statistics are
                  scaled by the standard deviation of the given sample and
                  have the mean added to them
                - 'r': a regression line is fit
                - 'q': a line is fit through the quartiles
                - None: no reference line is added (default)

        other : ProbPlot, array-like, or None, optional
            If provided, ECDF(x) is plotted against p(x), where x are the
            sorted samples from `self`, ECDF is the empirical cumulative
            distribution function estimated from `other`, and
            p(x) = 0.5/n, 1.5/n, ..., (n-0.5)/n with n the number of
            samples in `self`. Anything that is not already a `ProbPlot`
            is wrapped in one using default parameters.

        ax : Matplotlib AxesSubplot instance, optional
            If given, this subplot is drawn into instead of creating a new
            figure.
        **plotkwargs : additional matplotlib arguments to be passed to the
            `plot` command.

        Returns
        -------
        fig : Matplotlib figure instance
            If `ax` is None, the created figure.  Otherwise the figure to
            which `ax` is connected.
        """
        if other is None:
            # One-sample case: sample vs. theoretical probabilities.
            horizontal = self.theoretical_percentiles
            vertical = self.sample_percentiles
            fallback_labels = ("Theoretical Probabilities",
                               "Sample Probabilities")
        else:
            # Two-sample case: make sure `other` exposes sample_quantiles.
            if not isinstance(other, ProbPlot):
                other = ProbPlot(other)

            horizontal = self.theoretical_percentiles
            reference_cdf = ECDF(other.sample_quantiles)
            vertical = reference_cdf(self.sample_quantiles)
            fallback_labels = ('Probabilities of 2nd Sample',
                               'Probabilities of 1st Sample')

        fig, ax = _do_plot(horizontal,
                           vertical,
                           self.dist,
                           ax=ax,
                           line=line,
                           **plotkwargs)

        # User-supplied labels win; otherwise use the branch defaults.
        ax.set_xlabel(fallback_labels[0] if xlabel is None else xlabel)
        ax.set_ylabel(fallback_labels[1] if ylabel is None else ylabel)

        # Both axes carry probabilities, so clamp them to the unit interval.
        unit_interval = [0.0, 1.0]
        ax.set_xlim(list(unit_interval))
        ax.set_ylim(list(unit_interval))

        return fig
Exemplo n.º 30
0
def empirical_cdf_plot(xs):
    ecdf = ECDF(xs)