Exemplo n.º 1
0
def _identify_doublets_fisher(cluster_labels: Union[pd.Categorical, List[int]],
                              pred_dbl: List[bool],
                              alpha: float = 0.05) -> pd.DataFrame:
    """Report clusters that are significantly enriched for predicted doublets.

    A cluster-by-prediction contingency table is built, one Fisher's exact
    test is run per cluster (cluster vs. all other cells), and the p-values
    are FDR-corrected. A cluster is reported when it passes FDR control at
    ``alpha`` and its doublet frequency is above the overall doublet rate.

    Parameters
    ----------
    cluster_labels
        Cluster assignment of every cell.
    pred_dbl
        Per-cell doublet prediction; any sized sequence of booleans.
    alpha
        FDR level passed to the correction routine.

    Returns
    -------
    pd.DataFrame
        Columns ``cluster``, ``percentage``, ``pval`` and ``qval``, sorted by
        ``percentage`` in descending order. Empty when no cluster qualifies.
    """
    from pegasus.cylib.cfisher import fisher_exact
    from statsmodels.stats.multitest import fdrcorrection as fdr

    # len() works for lists as well as arrays/Series; ``.size`` did not.
    ncell = len(pred_dbl)

    df = pd.crosstab(cluster_labels, pred_dbl)
    # crosstab omits an outcome that never occurs; make sure both the True
    # and the False column exist so the lookups below cannot raise KeyError.
    df = df.reindex(columns=[False, True], fill_value=0)

    ndbl = df[True].sum()
    a = df[True].values.astype(np.int32)   # doublets inside the cluster
    b = df[False].values.astype(np.int32)  # singlets inside the cluster
    c = ndbl - a                           # doublets outside the cluster
    d = (ncell - ndbl) - b                 # singlets outside the cluster

    avg_dblr = ndbl / ncell
    freqs = a / (a + b)

    _, pvals = fisher_exact(a, b, c, d)
    passed, qvals = fdr(pvals, alpha=alpha)

    # Keep clusters that pass FDR and are enriched (not depleted) for doublets.
    posvec = np.where(passed)[0][freqs[passed] > avg_dblr]

    result = pd.DataFrame({
        'cluster': df.index[posvec],
        'percentage': freqs[posvec] * 100.0,
        'pval': pvals[posvec],
        'qval': qvals[posvec]
    })
    result.sort_values('percentage', ascending=False, inplace=True)
    result.reset_index(drop=True, inplace=True)

    return result
Exemplo n.º 2
0
def singleTissue_eGene_stat(tissue, SNP_PCs, SNP_sampleList):
    """Run the eGene analysis for one tissue and return the annotated table.

    The residualized expression matrix is loaded from
    ``{args.resDir}/{tissue}.ResMat.pickle`` when that file exists; otherwise
    it is computed and written to ``{args.outDir}/{tissue}.ResMat.pickle``.
    The per-gene regression p-values are then FDR-corrected and annotated.

    Parameters
    ----------
    tissue
        Tissue identifier forwarded to the helper functions and used in the
        cache file name.
    SNP_PCs, SNP_sampleList
        Forwarded unchanged to ``getTisSNPResTpmMat`` when the cache is missing.

    Returns
    -------
    The object returned by ``annotateGeneTR1`` (its ``shape[0]`` is printed as
    the total number of eGenes).
    """
    # establish mapping between TRs and Genes
    tisGeneList, tisGene2ind = indexGeneList(tissue)
    locusi2tisGenei = getLocusi2tisGenei(tisGene2ind)

    # genMat with samples missing in tpmMat removed
    tisGenMat = getTissueGenMat(tissue)
    print(f'\ttisGenMat {tisGenMat.shape}')

    cachedResMat = f'{args.resDir}/{tissue}.ResMat.pickle'
    if glob.glob(cachedResMat):
        # NOTE(review): pickle.load can execute arbitrary code; this assumes
        # the cache file was written by this pipeline - confirm.
        with open(cachedResMat, 'rb') as fin:
            tisResTpmMat = pickle.load(fin)
    else:
        tisResTpmMat = getTisSNPResTpmMat(tissue, SNP_PCs, SNP_sampleList)
        # Context manager guarantees the file is flushed and closed.
        with open(f'{args.outDir}/{tissue}.ResMat.pickle', 'wb') as fout:
            pickle.dump(tisResTpmMat, fout)
    print(f'\ttisResTpmMat {tisResTpmMat.shape}')

    # count # of TRs mapped to each gene; used for Bonferroni correction
    genei2nloci = getGenei2nloci(locusi2tisGenei)

    # tiseGeneTR: [genei, locusi]; stats: [p, b, bse]
    tiseGeneTR, stats = runRegressionZ3(
        tisResTpmMat, tisGenMat, locusi2tisGenei, genei2nloci)
    print(f'\t{tiseGeneTR.shape[0]} genes tested')

    rejected, adjP = fdr(stats[:, 0])
    print(f'\t{np.sum(rejected)} tissue eGenes')

    eGeneStat = annotateGeneTR1(tissue, tisGeneList, genei2nloci, tiseGeneTR,
                                stats, rejected, adjP)
    print(f'\t{eGeneStat.shape[0]} total eGenes')

    return eGeneStat
Exemplo n.º 3
0
def _calc_qvals(
    nclust: int,
    pvals: np.ndarray,
    first_j: int,
    second_j: int,
) -> np.ndarray:
    """Return FDR-adjusted values for the columns of ``pvals``.

    When ``second_j`` is positive only column ``first_j`` is corrected and the
    result is stored in both column ``first_j`` and column ``second_j``.
    Otherwise each of the first ``nclust`` columns is corrected on its own.
    The output is a float32 array with the same shape as ``pvals``.
    """
    adjusted = np.zeros(pvals.shape, dtype=np.float32)

    if second_j <= 0:
        for col in range(nclust):
            adjusted[:, col] = fdr(pvals[:, col])[1]
        return adjusted

    shared = fdr(pvals[:, first_j])[1]
    adjusted[:, second_j] = shared
    adjusted[:, first_j] = shared
    return adjusted
Exemplo n.º 4
0
    def fit(self, inds=None, y=None, vcfFile=None, trait=0):
        """Carry out a GWAS with one simple linear regression per marker.

        Each row of ``self.X`` is split into ``nind`` equal chunks and every
        chunk is summed, giving one genotype value per individual; that vector
        is regressed on the phenotype with ``stats.linregress``.

        Nothing is returned. Results are stored on the instance:
        ``self.b`` (slopes), ``self.se`` (standard errors), ``self.pvalue``
        and ``self.fdr`` (second element of ``fdr(self.pvalue)``).

        Parameters
        ----------
        inds : sequence, optional
            Individuals exposing ``.y[trait]``; only used when ``y`` is None.
        y : array-like, optional
            Phenotypes, one per individual. Overrides ``inds`` / ``trait``.
        vcfFile : optional
            Ignored. Reading genotypes from a VCF file is deprecated; the
            parameter is kept so existing callers keep working.
        trait : int
            Trait index used when the phenotype is taken from ``inds`` [0].
        """
        # get phenotype if y not provided
        if y is None:
            nind = len(inds)
            y = np.array([inds[i].y[trait] for i in range(nind)])
        else:
            nind = len(y)

        # One row per marker: [b, se, pvalue]. Preallocating avoids the
        # quadratic cost of growing the array with np.append in the loop.
        nmkr = self.X.shape[0]
        out = np.empty((nmkr, 3))
        for i in range(nmkr):
            # trick to sum genotypes over haplotypes
            x = np.split(self.X[i, :], nind)
            x = np.asarray(list(map(sum, x)))
            b, intercept, r_value, p_value, std_err = stats.linregress(x, y)
            out[i] = b, std_err, p_value

        self.b, self.se, self.pvalue = out[:, 0], out[:, 1], out[:, 2]
        # FDR obtained from statsmodels package
        self.fdr = fdr(self.pvalue)[1]
Exemplo n.º 5
0
def calc_fisher(
    clust_id: str,
    data: List[float],
    indices: List[int],
    indptr: List[int],
    shape: Tuple[int, int],
    cluster_labels: List[str],
    cond_labels: List[str],
    gene_names: List[str],
    cnt_vec: List[int],
    verbose: bool,
) -> pd.DataFrame:
    """Run Fisher's exact test on every matrix column for one cluster.

    Without ``cond_labels`` the rows of cluster ``clust_id`` are compared with
    all remaining rows, using ``cnt_vec`` as the per-column non-zero count over
    the whole matrix. With ``cond_labels`` the comparison is made inside the
    cluster, between the first condition category and everything else.

    Returns a frame indexed by ``gene_names`` holding the float32 p-values and
    FDR-adjusted q-values in columns ``fisher_pval:<clust_id>`` and
    ``fisher_qval:<clust_id>``.
    """
    import fisher

    # rebuild the CSR matrix from its three component arrays
    full_mat = csr_matrix((data, indices, indptr), shape=shape)
    in_cluster = cluster_labels == clust_id
    cluster_mat = full_mat[in_cluster]

    if cond_labels is not None:
        first_cond = cond_labels.categories[0]
        is_first = cond_labels[in_cluster] == first_cond

        group_a = cluster_mat[is_first]
        group_b = cluster_mat[~is_first]
        n1 = group_a.shape[0]
        n2 = group_b.shape[0]

        a_true = group_a.getnnz(axis=0).astype(np.uint)
        b_true = group_b.getnnz(axis=0).astype(np.uint)
    else:
        n1 = cluster_mat.shape[0]
        n2 = shape[0] - n1

        a_true = cluster_mat.getnnz(axis=0).astype(np.uint)
        b_true = cnt_vec.astype(np.uint) - a_true

    # rows of each group in which the column is zero
    a_false = n1 - a_true
    b_false = n2 - b_true

    pvals = fisher.pvalue_npy(a_true, a_false, b_true, b_false)[2]
    qvals = fdr(pvals)[1]

    columns = {
        "fisher_pval:{0}".format(clust_id): pvals.astype(np.float32),
        "fisher_qval:{0}".format(clust_id): qvals.astype(np.float32),
    }
    df = pd.DataFrame(columns, index=gene_names)

    if verbose:
        logger.info("calc_fisher finished for cluster {0}.".format(clust_id))

    return df
def differential_analysis(test_norm, control_norm):
    '''
    Calculate fold change and significance on log transformed data.

    For every column of ``control_norm`` the difference of means
    (test - control, rounded to 2 decimals), the second element of the
    ``ttest`` result and the Kolmogorov-Smirnov p-value are computed; both
    p-value columns are then FDR-adjusted.
    ========
    Parameters:
    test_norm: pandas.dataframe, a dataframe of normalized test data, rows as cells and columns as features.
    control_norm: pandas.dataframe, a dataframe of normalized control data, rows as cells and columns as features.
    ========
    Returns:
    pandas.dataframe indexed by feature with columns
    logFC, T_pValue, KS_pValue, adj_T_pVal, adj_KS_pVal.
    '''
    columns = ['logFC', 'T_pValue', 'KS_pValue', 'adj_T_pVal', 'adj_KS_pVal']
    features = list(control_norm.columns)
    if not features:
        # Nothing to test: skip the FDR step, which has no input to correct.
        return pd.DataFrame(columns=columns)

    log_fc, t_pvals, ks_pvals = [], [], []
    for feature in features:
        test_vals = test_norm[feature]
        control_vals = control_norm[feature]
        log_fc.append(np.round(test_vals.mean() - control_vals.mean(), 2))
        t_pvals.append(ttest(control_vals, test_vals)[1])
        ks_pvals.append(ks_2samp(control_vals, test_vals)[1])

    # Build the frame in one shot instead of enlarging it cell by cell with
    # .loc, which is slow and leaves the column dtypes version-dependent.
    report = pd.DataFrame(
        {'logFC': log_fc, 'T_pValue': t_pvals, 'KS_pValue': ks_pvals},
        index=features,
        columns=columns[:3])
    report['adj_T_pVal'] = fdr(report.T_pValue)[1]
    report['adj_KS_pVal'] = fdr(report.KS_pValue)[1]
    return report
Exemplo n.º 7
0
def multi_comp_correction(r, p):
    """Correct p-values with two-stage FDR and zero out non-significant r.

    Only strictly positive p-values take part in the correction
    (``fdrcorrection_twostage`` with alpha 0.05 and method ``'bh'``); all other
    entries are left untouched. Entries of ``r`` whose corrected p-value
    exceeds 0.05 are set to 0. The inputs are not modified.

    Parameters
    ----------
    r : array-like
        Statistics (e.g. correlations), same length as ``p``.
    p : array-like
        Uncorrected p-values.

    Returns
    -------
    (list, list)
        Thresholded ``r`` and corrected ``p``.
    """
    from statsmodels.stats.multitest import fdrcorrection_twostage as fdr

    # Work on copies so the caller's arrays are never mutated.
    p_corr = np.array(p, dtype=float)
    r_th = np.array(r)

    # Plain boolean mask: wrapping it in a list is rejected by current NumPy.
    for_comp = p_corr > 0
    if for_comp.any():
        p_corr[for_comp] = fdr(p_corr[for_comp], 0.05, 'bh')[1]

    r_th[p_corr > 0.05] = 0

    return list(r_th), list(p_corr)
Exemplo n.º 8
0
def correct_fdr(data):
    """
    Replace each gene's p-value with its FDR-corrected value (level 0.05).

    The list is sorted in place by ascending p-value before the correction,
    so the returned list is the same object in its new order.
    :param data: The list of genes for correction
    :return: list with the updated p-values
    """
    data.sort(key=lambda gene: gene.getPValue())
    raw_pvals = [gene.getPValue() for gene in data]
    corrected = fdr(raw_pvals, 0.05)[1]
    for gene, new_pval in zip(data, corrected):
        gene.setPValue(new_pval)
    return data
Exemplo n.º 9
0
def multi_comp_correction(r, p):
    """Benjamini-Hochberg correct p-values and threshold r at 0.05.

    Only strictly positive p-values take part in the correction
    (``multipletests`` with alpha 0.05 and method ``'fdr_bh'``); all other
    entries are left untouched. The inputs are not modified.

    Parameters
    ----------
    r : array-like
        Statistics (e.g. correlations), same length as ``p``.
    p : array-like
        Uncorrected p-values.

    Returns
    -------
    (list, list, list)
        The original ``r``, the corrected p-values, and a copy of ``r`` in
        which entries with corrected p-value above 0.05 are set to 0.
    """
    from statsmodels.stats.multitest import multipletests as fdr

    print('Correction for multiple comparisons')
    r = np.asarray(r)
    # Copy, so the correction is not written back into the caller's array.
    p_corr = np.array(p, dtype=float)

    # Plain boolean mask: wrapping it in a list is rejected by current NumPy.
    for_comp = p_corr > 0
    if for_comp.any():
        p_corr[for_comp] = fdr(p_corr[for_comp], 0.05, 'fdr_bh')[1]

    r_th = r.copy()
    r_th[p_corr > 0.05] = 0

    return list(r), list(p_corr), list(r_th)
Exemplo n.º 10
0
def calc_fisher(i, clust_label, gene_names, ct, total):
    """Fisher's exact test of cluster ``i`` against everything else.

    ``ct[:, i, 0]`` and ``ct[:, i, 1]`` are the two cells of the contingency
    table for the cluster; the complementary cells are ``total - ct[:, i, :]``.
    Returns a frame indexed by ``gene_names`` with the p-values and their
    FDR-adjusted q-values, and prints a progress line.
    """
    rest = total - ct[:, i, :]
    fisher_out = fisher.pvalue_npy(ct[:, i, 0], ct[:, i, 1],
                                   rest[:, 0], rest[:, 1])
    pvals = fisher_out[2]
    qvals = fdr(pvals)[1]

    columns = {}
    columns["fisher_pval_{0}".format(clust_label)] = pvals
    columns["fisher_qval_{0}".format(clust_label)] = qvals
    df = pd.DataFrame(columns, index=gene_names)

    print("Cluster {0} is processed.".format(clust_label))

    return df
Exemplo n.º 11
0
    def fit(self, y):
        """Carry out a GWAS with one simple linear regression per marker.

        Each row of ``self.X`` is split into ``len(y)`` equal chunks and every
        chunk is summed, giving one genotype value per individual; that vector
        is regressed on ``y`` with ``stats.linregress``.

        Nothing is returned. Results are stored on the instance:
        ``self.b`` (slopes), ``self.se`` (standard errors), ``self.pvalue``
        and ``self.fdr`` (second element of ``fdr(self.pvalue)``).
        """
        nind = len(y)
        # One row per marker: [b, se, pvalue]. Preallocating avoids the
        # quadratic cost of growing the array with np.append in the loop.
        nmkr = self.X.shape[0]
        out = np.empty((nmkr, 3))
        for i in range(nmkr):
            # trick to sum genotypes over haplotypes
            x = np.split(self.X[i, :], nind)
            x = np.asarray(list(map(sum, x)))
            b, intercept, r_value, p_value, std_err = stats.linregress(x, y)
            out[i] = b, std_err, p_value

        self.b, self.se, self.pvalue = out[:, 0], out[:, 1], out[:, 2]
        # FDR obtained from statsmodels package
        self.fdr = fdr(self.pvalue)[1]
Exemplo n.º 12
0
def corr_spec_net(r, p, net_name):
    """Restrict r/p to one network, BH-correct the p-values and threshold r.

    Entries whose (1-based) id is not returned by
    ``network_id_list(network_type=net_name)`` are set to 0 in both ``r`` and
    ``p``. The remaining strictly positive p-values are corrected with
    ``multipletests`` (alpha 0.05, ``'fdr_bh'``). The inputs are not modified.

    Returns
    -------
    (np.ndarray, np.ndarray, list)
        Masked ``r``, masked and corrected ``p``, and a list copy of the
        masked ``r`` with entries of corrected p-value above 0.05 set to 0.
    """
    from statsmodels.stats.multitest import multipletests as fdr

    id_net = network_id_list(network_type=net_name)
    # True for every entry outside the requested network (ids are 1-based).
    mask = np.ones(len(r), bool)
    mask[np.asarray(id_net) - 1] = False

    # Copies, so masking/correction never leak into the caller's arrays.
    r = np.array(r)
    p = np.array(p, dtype=float)
    r[mask] = 0
    p[mask] = 0

    # Plain boolean mask: wrapping it in a list is rejected by current NumPy.
    for_comp = p > 0
    if for_comp.any():
        p[for_comp] = fdr(p[for_comp], 0.05, 'fdr_bh')[1]

    r_th = r.copy()
    r_th[p > 0.05] = 0
    r_th = list(r_th)

    return r, p, r_th
Exemplo n.º 13
0
def perform_oneway_anova(data, glist, restriction_vec, group_str, fdr_alpha = 0.05):
	"""Run a one-way ANOVA per gene on a subset of cells and control FDR.

	Parameters
	----------
	data
		Annotated expression object; ``obs``, ``var_names`` and a sparse ``X``
		are used.
	glist
		Gene names to test; names missing from ``data.var_names`` are dropped.
	restriction_vec
		Cell filters, each formatted as ``attr:value,value,...``.
	group_str
		Grouping, formatted as ``attr:name~value,value;name~value,...``.
	fdr_alpha
		q-value cut-off for the filtered result.

	Returns
	-------
	(results, raw_results)
		``raw_results`` has one row per tested gene with ``fstat``, ``pval``,
		``qval`` and a ``<name>_mean`` / ``<name>_percent`` pair per group;
		``results`` keeps the rows with ``qval <= fdr_alpha`` sorted by qval.
	"""
	# np.warnings was removed from NumPy; it was the stdlib module all along.
	import warnings

	selected = np.ones(data.shape[0], dtype = bool)
	for rest_str in restriction_vec:
		attr, value_str = rest_str.split(':')
		values = value_str.split(',')
		selected = selected & np.isin(data.obs[attr], values)
	gene_list = np.array(glist)
	gene_list = gene_list[np.isin(gene_list, data.var_names)]
	newdat = data[selected, :][:, gene_list].copy()
	newdat.X = newdat.X.toarray()
	group_attr, tmp_str = group_str.split(':')
	groups_str = tmp_str.split(';')
	ngr = len(groups_str)
	group_names = []
	group_idx = np.zeros((ngr, newdat.shape[0]), dtype = bool)
	for i, gstr in enumerate(groups_str):
		name, values = gstr.split('~')
		group_names.extend([name + '_mean', name + '_percent'])
		group_idx[i] = np.isin(newdat.obs[group_attr], values.split(','))
	# Same process-wide suppression the original requested via np.warnings.
	warnings.filterwarnings('ignore')
	# Columns: fstat, pval, qval, then (mean, percent expressed) per group.
	stats = np.zeros((len(gene_list), 3 + ngr * 2))
	for i in range(len(gene_list)):
		arr_list = []
		for j in range(group_idx.shape[0]):
			arr = newdat.X[group_idx[j], i]
			stats[i, 3 + j * 2] = arr.mean()
			stats[i, 3 + j * 2 + 1] = (arr > 0).sum() * 100.0 / arr.size
			arr_list.append(arr)
		stats[i, 0], stats[i, 1] = f_oneway(*arr_list)
		# An undefined F statistic is reported as "no effect".
		if np.isnan(stats[i, 0]):
			stats[i, 0] = 0.0
			stats[i, 1] = 1.0
	passed, stats[:, 2] = fdr(stats[:, 1])
	cols = ['fstat', 'pval', 'qval']
	cols.extend(group_names)
	raw_results = pd.DataFrame(stats, columns = cols, index = gene_list)
	results = raw_results[raw_results['qval'] <= fdr_alpha]
	results = results.sort_values('qval')
	return results, raw_results
Exemplo n.º 14
0
def calc_mwu(clust_label, labels, conds, cond_order, gene_names, data, indices, indptr, shape):
	"""Mann-Whitney U test between two conditions inside one cluster.

	For every matrix column, the rows of cluster ``clust_label`` are split
	into condition ``cond_order[0]`` and condition ``cond_order[1]``. The
	difference of means, the two-sided U statistic and p-value, and the
	FDR-adjusted q-value are returned in a frame indexed by ``gene_names``.
	Columns with no stored entries in the cluster get 0, 0 and p-value 1.
	"""
	ngene = shape[1]

	in_cluster = labels == clust_label
	cluster_conds = conds[in_cluster]
	first_group = cluster_conds == cond_order[0]
	second_group = cluster_conds == cond_order[1]

	cluster_mat = csc_matrix((data, indices, indptr), shape = shape)[in_cluster, :]

	# Defaults already describe a column without any stored entry.
	log_fc = np.zeros(ngene)
	U_stats = np.zeros(ngene)
	pvals = np.ones(ngene)

	# Reusable dense buffer holding one column at a time.
	dense = np.zeros(in_cluster.sum())

	for j in range(ngene):
		column = cluster_mat[:, j]
		if column.size == 0:
			continue
		dense[column.indices] = column.data
		log_fc[j] = np.mean(dense[first_group]) - np.mean(dense[second_group])
		U_stats[j], pvals[j] = ss.mannwhitneyu(dense[first_group], dense[second_group], alternative = 'two-sided')
		dense[:] = 0.0

	qvals = fdr(pvals)[1]

	columns = {}
	columns["log_fc"] = log_fc
	columns["mwu_U"] = U_stats
	columns["mwu_pval"] = pvals
	columns["mwu_qval"] = qvals
	df = pd.DataFrame(columns, index = gene_names)

	print("Cluster {0} is processed.".format(clust_label))

	return df
Exemplo n.º 15
0
def calc_mwu(clust_label, labels, gene_names, data, indices, indptr, shape):
    """Mann-Whitney U test of one cluster against all other rows.

    For every matrix column the rows labelled ``clust_label`` are compared
    with the remaining rows (two-sided). The U statistic, p-value and
    FDR-adjusted q-value are returned in a frame indexed by ``gene_names``.
    Columns with no stored entries get U = 0 and p-value 1.
    """
    nsample, ngene = shape[0], shape[1]
    by_gene = csc_matrix((data, indices, indptr), shape=shape)

    in_cluster = labels == clust_label
    out_cluster = ~in_cluster

    # Defaults already describe a column without any stored entry.
    U_stats = np.zeros(ngene)
    pvals = np.ones(ngene)

    # Reusable dense buffer holding one column at a time.
    dense = np.zeros(nsample)

    for j in range(ngene):
        column = by_gene[:, j]
        if column.size == 0:
            continue
        dense[:] = 0.0
        dense[column.indices] = column.data
        U_stats[j], pvals[j] = ss.mannwhitneyu(dense[in_cluster],
                                               dense[out_cluster],
                                               alternative='two-sided')

    qvals = fdr(pvals)[1]

    columns = {}
    columns["mwu_U_{0}".format(clust_label)] = U_stats
    columns["mwu_pval_{0}".format(clust_label)] = pvals
    columns["mwu_qval_{0}".format(clust_label)] = qvals
    df = pd.DataFrame(columns, index=gene_names)

    print("Cluster {0} is processed.".format(clust_label))

    return df
# Summarise the expected connections: one row per connection pair.
expectedFrm = pd.DataFrame(resDict)
expectedFrm = expectedFrm.T
expectedFrm.index.name = 'connection_pair'

# Direction-aware percent rank: kept as-is for positively connected pairs,
# flipped (1 - rank) for every other row.
expectedFrm['percent_rank_by_direction'] = np.nan
isPos = expectedFrm['expected_direc'] == 'Positively connected'
isNeg = expectedFrm['expected_direc'] == 'Negatively connected'
# DataFrame.ix was removed from pandas; .loc is the label/boolean equivalent.
expectedFrm.loc[
    isPos,
    'percent_rank_by_direction'] = expectedFrm['perc_rank_within_pert_type']
# NOTE(review): the flip uses ~isPos rather than isNeg, so rows that are
# neither positive nor negative are flipped too - confirm this is intended.
expectedFrm.loc[~isPos, 'percent_rank_by_direction'] = 1 - expectedFrm[
    'perc_rank_within_pert_type']

# plt.hist(expectedFrm['perc_rank'],30)
# check FDR using percent ranks (treated as p-values)
boolFDR, valFDR = fdr(
    pvals=expectedFrm['perc_rank_within_pert_type'].values)  #,alpha=.05
expectedFrm['pass_FDR_with_percent_rank'] = boolFDR

### write expected connection summary
outF = wkdir + '/expected_connection_summary.txt'
expectedFrm.to_csv(outF, sep='\t', header=True, index=True)

## plot percent ranks overall
plt.hist(expectedFrm['perc_rank_within_pert_type'], 30)
plt.ylabel('freq', fontweight='bold')
plt.xlabel('percent rank of expected connections', fontweight='bold')
plt.title('All expected GEO connections')
outF = path.join(wkdir, 'percent_rank_expected_connections.png')
plt.savefig(outF, bbox_inches='tight', dpi=200)
plt.close()
Exemplo n.º 17
0
def perform_oneway_anova(
    data: AnnData,
    glist: List[str],
    restriction_vec: List[str],
    group_str: str,
    fdr_alpha: float = 0.05,
    res_key: str = None,
) -> pd.DataFrame:
    """Perform one way ANOVA on a subset of cells (restricted by restriction_vec) grouped by group_str and control FDR at fdr_alpha.

    Parameters
    ----------

    data : `anndata` object
        An `anndata` object containing the expression matrix.
    glist : `list[str]`
        A list of gene symbols. Symbols missing from ``data.var_names`` are ignored.
    restriction_vec : `list[str]`
        A vector of restrictions for selecting cells. Each restriction takes the format of attr:value,value,value
    group_str : `str`
        How to group selected cells for ANOVA analysis. The general format is 'attr:name~value,value;name~value'. If group_str is for pseudotime, it has two formats. 1) 'pseudotime:time:n', which divides cells by equal pseudotime interval; 2) 'pseudotime:size:n' divides cells by equal number of cells.
    fdr_alpha : `float`, optional (default: 0.05)
        False discovery rate.
    res_key : `str`, optional (default: None)
        Store results into data using res_key, the grouping information is stored in obs and the results is stored in uns.

    Returns
    -------
    `pandas.DataFrame`
        Results for genes that pass FDR control, sorted by q-value.

    Examples
    --------
    >>> results = misc.perform_oneway_anova(data, ['CD3E', 'CD4', 'CD8'], [], 'pseudotime:size:10')
    """

    # np.warnings was removed from NumPy; it was the stdlib module all along.
    import warnings

    from scipy.stats import f_oneway
    from statsmodels.stats.multitest import fdrcorrection as fdr

    # Select the cells that satisfy every restriction.
    selected = np.ones(data.shape[0], dtype=bool)
    for rest_str in restriction_vec:
        attr, value_str = rest_str.split(":")
        values = value_str.split(",")
        selected = selected & np.isin(data.obs[attr], values)

    gene_list = np.array(glist)
    gene_list = gene_list[np.isin(gene_list, data.var_names)]
    ngene = gene_list.size

    newdat = data[selected, :][:, gene_list].copy()
    newdat.X = newdat.X.toarray()

    group_values = group_str.split(":")
    group_names = []
    col_names = []

    ngr = 0
    group_idx = None

    if group_values[0] == "pseudotime":
        assert len(group_values) == 3
        div_by = group_values[1]
        ngr = int(group_values[2])

        group_idx = np.zeros((ngr, newdat.shape[0]), dtype=bool)
        pseudotimes = newdat.obs["pseudotime"].values

        min_t = pseudotimes.min()
        max_t = pseudotimes.max()

        if div_by == "time":
            # Equal-width pseudotime bins; the first left edge is nudged down
            # so the minimum falls inside the half-open interval.
            interval = (max_t - min_t) / ngr
            left = min_t - 1e-5
            for i in range(ngr):
                right = min_t + interval * (i + 1)
                name = "({:.2f}, {:.2f}]".format(left if left >= 0 else 0.0,
                                                 right)
                group_names.append(name)
                group_idx[i] = (pseudotimes > left) & (pseudotimes <= right)
                left = right
        else:
            assert div_by == "size"
            # Equal-size bins over cells ordered by pseudotime; the first
            # `residule` bins receive one extra cell.
            ords = np.argsort(pseudotimes)
            quotient = ords.size // ngr
            residule = ords.size % ngr

            fr = 0
            for i in range(ngr):
                to = fr + quotient + (i < residule)
                name = "[{:.2f}, {:.2f}]".format(pseudotimes[ords[fr]],
                                                 pseudotimes[ords[to - 1]])
                group_names.append(name)
                group_idx[i][ords[fr:to]] = True
                fr = to

    else:
        assert len(group_values) == 2
        group_attr = group_values[0]
        tmp_str = group_values[1]
        groups_str = tmp_str.split(";")

        ngr = len(groups_str)
        group_idx = np.zeros((ngr, newdat.shape[0]), dtype=bool)

        for i, gstr in enumerate(groups_str):
            name, values = gstr.split("~")
            group_names.append(name)
            group_idx[i] = np.isin(newdat.obs[group_attr], values.split(","))

    for i in range(ngr):
        print("Group {} has {} cells.".format(group_names[i],
                                              group_idx[i].sum()))

    # Same process-wide suppression the original requested via np.warnings.
    warnings.filterwarnings("ignore")
    # Columns: fstat, pval, qval, then (mean, percent expressed) per group.
    stats = np.zeros((ngene, 3 + ngr * 2))
    for i in range(ngene):
        arr_list = []
        for j in range(ngr):
            arr = newdat.X[group_idx[j], i]
            stats[i, 3 + j * 2] = arr.mean()
            stats[i, 3 + j * 2 + 1] = (arr > 0).sum() * 100.0 / arr.size
            arr_list.append(arr)
        stats[i, 0], stats[i, 1] = f_oneway(*arr_list)
        # An undefined F statistic is reported as "no effect".
        if np.isnan(stats[i, 0]):
            stats[i, 0] = 0.0
            stats[i, 1] = 1.0
    passed, stats[:, 2] = fdr(stats[:, 1])

    cols = ["fstat", "pval", "qval"]
    for i in range(ngr):
        cols.extend([group_names[i] + "_mean", group_names[i] + "_percent"])
    raw_results = pd.DataFrame(stats, columns=cols, index=gene_list)

    results = raw_results[raw_results["qval"] <= fdr_alpha]
    results = results.sort_values("qval")

    if res_key is not None:
        data.uns[res_key] = raw_results
        # Cells outside the selection (or outside every group) stay "background".
        data.obs[res_key] = "background"
        for i in range(ngr):
            idx = np.zeros(data.shape[0], dtype=bool)
            idx[selected] = group_idx[i]
            data.obs.loc[idx, res_key] = group_names[i]

    return results
Exemplo n.º 18
0
def calc_stat_and_t(i, clust_label, labels, gene_names, data, indices, indptr,
                    shape, sm1, sm2, ct, total):
    """Compute per-gene summary statistics and a Welch-style t-test for one
    cluster against all remaining rows.

    The matrix is rebuilt from its CSR components (``data``, ``indices``,
    ``indptr``, ``shape``). Rows where ``labels == clust_label`` form the
    cluster; both groups must contain more than one row (asserted).

    ``sm1`` and ``sm2`` are used as per-column totals over all rows of the
    values and of the squared values respectively (the cluster's own sums are
    subtracted from them to get the other group's moments).
    ``ct[:, i, 0]`` / ``ct[:, i, 1]`` and ``total`` are used as two-cell
    count tables from which the percentages are derived.
    NOTE(review): the exact meaning of the two count cells (presumably
    expressed / not expressed) is not visible here - confirm with the caller.

    Returns a DataFrame indexed by ``gene_names`` with percentage, mean,
    fold-change, WAD score, t-test p-value and q-value columns, each suffixed
    with ``clust_label``. Prints a progress line.
    """
    mat = csr_matrix((data, indices, indptr), shape=shape)

    n = shape[0]
    pvals = np.zeros(shape[1])
    qvals = np.zeros(shape[1])
    percent_fold_change = np.zeros(shape[1])

    mask = labels == clust_label
    clust_mat = mat[mask]
    n1 = clust_mat.shape[0]
    n2 = n - n1

    assert n1 > 1 and n2 > 1

    # First and second moments of the cluster, per column.
    sm1_1 = clust_mat.sum(axis=0).A1
    sm2_1 = clust_mat.power(2).sum(axis=0).A1

    mean1 = sm1_1 / n1
    mean2 = (sm1 - sm1_1) / n2

    # Unbiased sample variances of the two groups.
    s1sqr = (sm2_1 - n1 * (mean1**2)) / (n1 - 1)
    s2sqr = ((sm2 - sm2_1) - n2 * (mean2**2)) / (n2 - 1)

    var_est = s1sqr / n1 + s2sqr / n2

    # 1.01 marks columns that are never tested (variance estimate not positive).
    pvals[:] = 1.01
    qvals[:] = 1.01

    idx = var_est > 0.0
    if idx.sum() > 0:
        tscore = (mean1[idx] - mean2[idx]) / np.sqrt(var_est[idx])
        # Welch-Satterthwaite degrees of freedom.
        v = (var_est[idx]**2) / ((s1sqr[idx] / n1)**2 / (n1 - 1) +
                                 (s2sqr[idx] / n2)**2 / (n2 - 1))
        pvals[idx] = ss.t.sf(np.fabs(tscore), v) * 2.0  # two-sided
        # FDR is applied to the tested columns only.
        passed, qvals[idx] = fdr(pvals[idx])

    # calculate WAD, Weighted Average Difference, https://almob.biomedcentral.com/articles/10.1186/1748-7188-3-8
    log_fold_change = mean1 - mean2
    x_avg = (mean1 + mean2) / 2
    x_max = x_avg.max()
    x_min = x_avg.min() - 0.001  # to avoid divide by zero
    weights = (x_avg - x_min) / (x_max - x_min)
    wads = log_fold_change * weights

    # calculate percentage expressed and percent fold change
    percents = ct[:, i, 0] / (ct[:, i, 0] + ct[:, i, 1]) * 100.0
    cpt = total - ct[:, i, :]
    percents_other = cpt[:, 0] / (cpt[:, 0] + cpt[:, 1]) * 100.0

    # Fold change of percentages: 0 when both are zero, inf when only the
    # other group is zero, plain ratio otherwise.
    idx = percents > 0.0
    idx_other = percents_other > 0.0
    percent_fold_change[(~idx) & (~idx_other)] = 0.0
    percent_fold_change[idx & (~idx_other)] = np.inf
    percent_fold_change[
        idx_other] = percents[idx_other] / percents_other[idx_other]

    df = pd.DataFrame(
        {
            "percentage_{0}".format(clust_label): percents,
            "percentage_other_{0}".format(clust_label): percents_other,
            "mean_log_expression_{0}".format(clust_label): mean1,
            "percentage_fold_change_{0}".format(clust_label):
            percent_fold_change,
            "log_fold_change_{0}".format(clust_label): log_fold_change,
            "WAD_score_{0}".format(clust_label): wads,
            "t_pval_{0}".format(clust_label): pvals,
            "t_qval_{0}".format(clust_label): qvals
        },
        index=gene_names)

    print("Cluster {0} is processed.".format(clust_label))

    return df
Exemplo n.º 19
0
							tarRanks.append(rnk)
							tarRnkPercs.append(RnkPerc)
							#write summary table
							sig = resCids[iq]
							f.write('\t'.join([sig,pDescDict[pert],target,str(cs),str(rnk),str(RnkPerc)[:7]]) + '\n')
					targetCS[cell1][pert][target] = tarCS
					targetRnkPercs[cell1][pert][target] = tarRnkPercs
					targetRanks[cell1][pert][target] = tarRanks
	### put RnkPercs into a vector, test for FDR
	pVec = []
	for pert in targetRnkPercs[cell1]:
		RnkPercs = targetRnkPercs[cell1][pert].values()
		for RnkPerc in RnkPercs:
			pVec.extend(RnkPerc)
	#perform FDR on pVec
	[pBoolean, pCorrected] = fdr(pVec, alpha=0.1, method='indep')
	nPassFDR = sum(pBoolean)
	if nPassFDR:
		iPassFDR = [i for i,x in enumerate(pBoolean) if x == True]
		pPass = [pVec[i] for i in iPassFDR]
		pMaxThresh = max(pPass) #what is the largest RnkPerc that passed FDR
		#flag connections which pass FDR
		outF = os.path.join(celldir,cell1 + '_drug-target_FDR_summary.txt')
		headers = ['query_sig','cp_pert_desc','target_KD_cgs','cs', 'query_rank', 'RnkPerc']
		with open(outF,'w') as f:
			f.write('\t'.join(headers) + '\n')
			for pert in targetRnkPercs[cell1]:
				qInds = queryInd[pert]
				for target in targetRnkPercs[cell1][pert]:
					RnkPercs = targetRnkPercs[cell1][pert][target]
					for i,RnkPerc in enumerate(RnkPercs):
Exemplo n.º 20
0
def calc_mwu(
    clust_id: str,
    data: List[float],
    indices: List[int],
    indptr: List[int],
    shape: Tuple[int, int],
    cluster_labels: List[str],
    cond_labels: List[str],
    gene_names: List[str],
    verbose: bool,
) -> pd.DataFrame:
    """Run the two-sided Mann-Whitney U test on every column for one cluster.

    Without ``cond_labels`` the rows of cluster ``clust_id`` are compared with
    all remaining rows. With ``cond_labels`` the comparison is made inside the
    cluster, between the first condition category and everything else.
    Columns without stored entries, or any column when one of the two groups
    is empty, keep U = 0 and p-value 1.

    Returns a frame indexed by ``gene_names`` with float32 columns
    ``mwu_U:<clust_id>``, ``mwu_pval:<clust_id>`` and ``mwu_qval:<clust_id>``.
    """
    ngene = shape[1]
    by_gene = csc_matrix((data, indices, indptr), shape=shape)
    u_values = np.zeros(ngene, dtype=np.float32)
    p_values = np.full(ngene, 1.0)
    in_cluster = cluster_labels == clust_id

    within_cluster = cond_labels is not None
    if within_cluster:
        dense = None
        first_cond = cond_labels.categories[0]
        group_x = cond_labels[in_cluster] == first_cond
        group_y = ~group_x
    else:
        # Reusable dense buffer holding one full column at a time.
        dense = np.zeros(shape[0])
        group_x = in_cluster
        group_y = ~group_x

    if group_x.sum() > 0 and group_y.sum() > 0:
        import scipy.stats as ss

        for g in range(ngene):
            if within_cluster:
                column = by_gene[in_cluster, g]
                if column.data.size == 0:
                    continue
                dense = column.toarray()[:, 0]
            else:
                lo = by_gene.indptr[g]
                hi = by_gene.indptr[g + 1]
                if hi - lo <= 0:
                    continue
                dense[:] = 0.0
                dense[by_gene.indices[lo:hi]] = by_gene.data[lo:hi]

            u_values[g], p_values[g] = ss.mannwhitneyu(
                dense[group_x], dense[group_y], alternative="two-sided")

    q_values = fdr(p_values)[1]

    columns = {
        "mwu_U:{0}".format(clust_id): u_values.astype(np.float32),
        "mwu_pval:{0}".format(clust_id): p_values.astype(np.float32),
        "mwu_qval:{0}".format(clust_id): q_values.astype(np.float32),
    }
    df = pd.DataFrame(columns, index=gene_names)

    if verbose:
        logger.info("calc_mwu finished for cluster {0}.".format(clust_id))

    return df
Exemplo n.º 21
0
						continue
					else:
						count = count + 1
						sKeysStr.append(cpRes.index[i].split('_')[1])
						yVals = count
						plt.scatter(rnk,yVals)
				plt.xlim((0, 100))
				plt.ylim((0,count+1))
				plt.yticks(range(1, count + 2), sKeysStr, rotation = 0)
				plt.xlabel('percent rank')
				plt.ylabel('cell line')
				plt.title(pDescDict[brd] + ' - ' + ind + ' connection - ' + gp_type)
				plt.savefig(os.path.join(work_dir,'drug_target_graphs',brd +'_' + ind + '_percent_rank.png'))
				plt.close()
### perform FDR correction
FDRtest = fdr(pVec)
pArr= np.array(pVec)
passed = pArr[FDRtest[0]]
pThreshFDR = max(passed)
cpPass = []
kdPass = []
print 'connections passing FDR correction:'
for test in pDict:
	if pDict[test] < pThreshFDR:
		print test
		drug = test.split('-')[0] + '-' + test.split('-')[1]
		cpPass.append(drug)
		kdPass.append(test.split('-')[-1])
#make graphs of connections that pass FRD 
for ibrd,brd in enumerate(cpPass):
	#skip if not in query
Exemplo n.º 22
0
def calc_t(
    clust_id: str,
    data: List[float],
    indices: List[int],
    indptr: List[int],
    shape: Tuple[int, int],
    cluster_labels: List[str],
    cond_labels: List[str],
    gene_names: List[str],
    sum_vec: List[float],
    sum2_vec: List[float],
    verbose: bool,
) -> pd.DataFrame:
    """Calculate Welch's t-test for one cluster.

    Without ``cond_labels`` the rows of cluster ``clust_id`` are compared with
    all remaining rows; ``sum_vec`` and ``sum2_vec`` are used as the
    per-column sums of values and of squared values over the whole matrix.
    With ``cond_labels`` the comparison is made inside the cluster, between
    the first condition category and everything else.

    Columns that cannot be tested (a group with fewer than two rows, or a
    non-positive variance estimate) keep p-value 1 and t-score 0.

    Returns a frame indexed by ``gene_names`` with float32 columns
    ``t_pval:<clust_id>``, ``t_qval:<clust_id>`` and ``t_score:<clust_id>``.
    """

    # recover sparse matrix
    mat = csr_matrix((data, indices, indptr), shape=shape)
    pvals = np.full(shape[1], 1.0)
    # Must be a float array: an integer array (np.full(n, 0)) would silently
    # truncate every t-score to its integer part on assignment below.
    tscores = np.zeros(shape[1])
    mask = cluster_labels == clust_id
    mat_clust = mat[mask]

    if cond_labels is None:
        n1 = mat_clust.shape[0]
        n2 = shape[0] - n1

        if n1 > 1 and n2 > 1:
            sum_clu = mat_clust.sum(axis=0).A1
            mean1 = sum_clu / n1
            mean2 = (sum_vec - sum_clu) / n2
            # Subtraction can leave tiny negative values; clamp them to zero.
            mean2[mean2 < 0.0] = 0.0

            sum2_clu = mat_clust.power(2).sum(axis=0).A1
            s1sqr = (sum2_clu - n1 * (mean1**2)) / (n1 - 1)
            s2sqr = ((sum2_vec - sum2_clu) - n2 * (mean2**2)) / (n2 - 1)
            s2sqr[s2sqr < 0.0] = 0.0
    else:
        cond1 = cond_labels.categories[0]
        cond_labs = cond_labels[mask]
        mask2 = cond_labs == cond1

        mat_cond1 = mat_clust[mask2]
        mat_cond2 = mat_clust[~mask2]
        n1 = mat_cond1.shape[0]
        n2 = mat_cond2.shape[0]

        if n1 > 1 and n2 > 1:
            mean1 = mat_cond1.mean(axis=0).A1
            psum1 = mat_cond1.power(2).sum(axis=0).A1
            s1sqr = (psum1 - n1 * (mean1**2)) / (n1 - 1)

            mean2 = mat_cond2.mean(axis=0).A1
            psum2 = mat_cond2.power(2).sum(axis=0).A1
            s2sqr = (psum2 - n2 * (mean2**2)) / (n2 - 1)

    if n1 > 1 and n2 > 1:
        import scipy.stats as ss

        var_est = s1sqr / n1 + s2sqr / n2
        idx = var_est > 0.0
        if idx.sum() > 0:
            tscore = (mean1[idx] - mean2[idx]) / np.sqrt(var_est[idx])
            # Welch-Satterthwaite degrees of freedom.
            v = (var_est[idx]**2) / ((s1sqr[idx] / n1)**2 / (n1 - 1) +
                                     (s2sqr[idx] / n2)**2 / (n2 - 1))
            pvals[idx] = ss.t.sf(np.fabs(tscore), v) * 2.0  # two-sided
            tscores[idx] = tscore
    passed, qvals = fdr(pvals)

    df = pd.DataFrame(
        {
            "t_pval:{0}".format(clust_id): pvals.astype(np.float32),
            "t_qval:{0}".format(clust_id): qvals.astype(np.float32),
            "t_score:{0}".format(clust_id): tscores.astype(np.float32),
        },
        index=gene_names,
    )

    if verbose:
        logger.info("calc_t finished for cluster {0}.".format(clust_id))

    return df
                                'perc_rank_within_pert_type':ePerc,
                                'expected_direc':eDir,
                                'a_name':aName}
# Summarise the expected connections: one row per connection pair.
expectedFrm = pd.DataFrame(resDict)
expectedFrm = expectedFrm.T
expectedFrm.index.name = 'connection_pair'

# Direction-aware percent rank: kept as-is for positively connected pairs,
# flipped (1 - rank) for every other row.
expectedFrm['percent_rank_by_direction'] = np.nan
isPos = expectedFrm['expected_direc'] == 'Positively connected'
isNeg = expectedFrm['expected_direc'] == 'Negatively connected'
# DataFrame.ix was removed from pandas; .loc is the label/boolean equivalent.
expectedFrm.loc[isPos,'percent_rank_by_direction'] = expectedFrm['perc_rank_within_pert_type']
# NOTE(review): the flip uses ~isPos rather than isNeg, so rows that are
# neither positive nor negative are flipped too - confirm this is intended.
expectedFrm.loc[~isPos,'percent_rank_by_direction'] = 1-expectedFrm['perc_rank_within_pert_type']

# plt.hist(expectedFrm['perc_rank'],30)
# check FDR using percent ranks (treated as p-values)
boolFDR, valFDR = fdr(pvals=expectedFrm['perc_rank_within_pert_type'].values) #,alpha=.05
expectedFrm['pass_FDR_with_percent_rank'] = boolFDR

### write expected connection summary
outF = wkdir+'/expected_connection_summary.txt'
expectedFrm.to_csv(outF,sep='\t',header=True,index=True)

## plot percent ranks overall
plt.hist(expectedFrm['perc_rank_within_pert_type'],30)
plt.ylabel('freq',fontweight='bold')
plt.xlabel('percent rank of expected connections',fontweight='bold')
plt.title('All expected GEO connections')
outF = path.join(wkdir, 'percent_rank_expected_connections.png')
plt.savefig(outF, bbox_inches='tight',dpi=200)
plt.close()
Exemplo n.º 24
0
# Index tuple of the voxels where mask_data is positive; every masked read and
# write below goes through it.
in_mask = np.where(mask_data > 0)

# Distribution and minimum of the uncorrected p-values inside the mask.
sns.histplot(pvalIMG2[in_mask])

np.min(pvalIMG2[in_mask])

# log10(1 / p) map: inversion is applied inside the mask only, then the whole
# volume is log-transformed, displayed and saved as NIfTI.
inv = copy.deepcopy(pvalIMG2)
inv[in_mask] = 1. / inv[in_mask]
inv10 = np.log10(inv)
utils.show_slices(np.log10(inv), isPath=False)
inv10IMG = nib.Nifti1Image(inv10, Tpt_img.affine)
nib.save(inv10IMG, join(T1wDirTemplatePatients01, 'pval_diff_log10inv.nii.gz'))

# Zero out every voxel whose p-value exceeds 0.5 / len(x).
# NOTE(review): confirm 0.5 (rather than 0.05) is the intended numerator.
pvalIMG2_threshold = copy.deepcopy(pvalIMG2)
pvalIMG2_threshold[pvalIMG2_threshold > 0.5 / len(x)] = 0
utils.show_slices(pvalIMG2_threshold, isPath=False)

# FDR over the masked voxels: element [0] of the result is taken as the
# per-voxel pass flags, element [1] as the corrected values.
trues = fdr(pvalIMG2[in_mask], alpha=0.05)[0]
any(trues)
pvalIMG2_fdr = copy.deepcopy(pvalIMG2)
pvalIMG2_fdr[in_mask] = fdr(pvalIMG2[in_mask])[1]

# Keep only corrected values at or below 0.05; everything above becomes 0.
pvalIMG2_fdr_threshold = copy.deepcopy(pvalIMG2_fdr)
pvalIMG2_fdr_threshold[pvalIMG2_fdr_threshold > 0.05] = 0
utils.show_slices(pvalIMG2_fdr_threshold, isPath=False)
sns.histplot(pvalIMG2_fdr_threshold[in_mask])

# Inverse of the corrected values inside the mask.
# NOTE(review): invFDR is computed but the call below displays the thresholded
# map again — confirm whether invFDR was meant to be shown.
invFDR = copy.deepcopy(pvalIMG2_fdr)
invFDR[in_mask] = 1. / invFDR[in_mask]
utils.show_slices(pvalIMG2_fdr_threshold, isPath=False)

# True when at least one voxel survives the corrected threshold.
(pvalIMG2_fdr_threshold > 0).any()
Exemplo n.º 25
0
                        sKeysStr.append(cpRes.index[i].split('_')[1])
                        yVals = count
                        plt.scatter(rnk, yVals)
                plt.xlim((0, 100))
                plt.ylim((0, count + 1))
                plt.yticks(range(1, count + 2), sKeysStr, rotation=0)
                plt.xlabel('percent rank')
                plt.ylabel('cell line')
                plt.title(pDescDict[brd] + ' - ' + ind + ' connection - ' +
                          gp_type)
                plt.savefig(
                    os.path.join(work_dir, 'drug_target_graphs',
                                 brd + '_' + ind + '_percent_rank.png'))
                plt.close()
### perform FDR correction
# FDRtest[0] is used as a boolean mask over pVec selecting the p-values that
# pass the correction.
FDRtest = fdr(pVec)
pArr = np.array(pVec)
passed = pArr[FDRtest[0]]
# Largest raw p-value that still passes. When nothing passes, max() on the
# empty array would raise, so fall back to -inf and select no connection.
pThreshFDR = max(passed) if len(passed) else -np.inf
cpPass = []
kdPass = []
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print('connections passing FDR correction:')
for test in pDict:
    # <= (not <): the connection sitting exactly at the threshold is itself
    # one of the passing tests and must be kept.
    if pDict[test] <= pThreshFDR:
        print(test)
        parts = test.split('-')
        # The first two dash-separated fields form the compound id, the last
        # field is the knockdown name.
        drug = parts[0] + '-' + parts[1]
        cpPass.append(drug)
        kdPass.append(parts[-1])
#make graphs of connections that pass FDR
for ibrd, brd in enumerate(cpPass):
    #skip if not in query
#######################################
# ------ (5) COMPARE y/X DISTs ------ #

# (i) Individual Y's
# For every outcome column shared by val_Y and df_Y, compare the label
# distribution between the two sources ('SK' vs 'NSQIP') with a chi-square
# test and record each source's share of y == 1.
holder_prop = []
for cn in val_Y.columns.drop(['caseid','operyr']):
    # Stack both sources and keep only rows with y >= 0.
    # drop= is passed by keyword: pandas 2.x no longer accepts it positionally.
    tmp_df = pd.concat([pd.DataFrame({'y':val_Y[cn].copy(),'tt':'SK'}),
                    pd.DataFrame({'y':df_Y[cn].copy(), 'tt':'NSQIP'})]).query('y>=0').reset_index(drop=True)
    # Source-by-label count table; size() yields a column named 0, and pivot
    # requires keyword arguments in pandas 2.x.
    tmp_tbl = (tmp_df.groupby(['y','tt']).size().reset_index()
               .pivot(index='tt', columns='y', values=0).fillna(0).astype(int))
    pval = stats.chi2_contingency(tmp_tbl.values)[1]
    # Row-normalise the counts and keep the y == 1 proportion for each source.
    tmp_prop = (tmp_tbl.divide(tmp_tbl.sum(axis=1), axis=0).reset_index()
                .melt('tt').query('y==1').drop(columns='y'))
    tmp_prop = tmp_prop.assign(outcome=cn,pval=pval)
    holder_prop.append(tmp_prop)
dist_Y = pd.concat(holder_prop).reset_index(drop=True)
# Get the FDR values: one p-value per outcome, corrected across outcomes at
# alpha = 0.10, then merged back on the shared columns.
dist_Y = dist_Y.merge(
    dist_Y.groupby('outcome').pval.max().reset_index()
    .assign(fdr=lambda x: fdr(x.pval, alpha=0.10)[1]))
dist_Y = dist_Y.assign(outcome=lambda x: x.outcome.map(di_mapper))

# (ii) Aggregate Y's
# Same comparison as for the individual outcomes, but on aggregated labels:
# for 'SK' the aggregate is 1 when any member column of the group is non-zero,
# for 'NSQIP' the 'agg_<name>' column of df_Y is used as-is.
holder_Yagg = []
for ay in di_agg:
    # 'othseshock' is left out of the member columns of every group.
    tmp_cn = list(np.setdiff1d(di_agg[ay],'othseshock'))
    # drop= is passed by keyword: pandas 2.x no longer accepts it positionally.
    tmp_df = pd.concat([pd.DataFrame({'y':np.where(val_Y[tmp_cn].sum(axis=1)==0, 0, 1),'tt':'SK'}),
                    pd.DataFrame({'y':df_Y['agg_'+ay], 'tt':'NSQIP'})]).reset_index(drop=True)
    # Source-by-label count table; size() yields a column named 0, and pivot
    # requires keyword arguments in pandas 2.x.
    tmp_tbl = (tmp_df.groupby(['y','tt']).size().reset_index()
               .pivot(index='tt', columns='y', values=0).fillna(0).astype(int))
    pval = stats.chi2_contingency(tmp_tbl.values)[1]
    # Row-normalise the counts and keep the y == 1 proportion for each source.
    tmp_prop = (tmp_tbl.divide(tmp_tbl.sum(axis=1), axis=0).reset_index()
                .melt('tt').query('y==1').drop(columns='y'))
    tmp_prop = tmp_prop.assign(outcome=ay,pval=pval)
    holder_Yagg.append(tmp_prop)
dist_Yagg = pd.concat(holder_Yagg).reset_index(drop=True)
dist_Yagg = dist_Yagg.assign(version=lambda x: x.outcome.str.replace('[^0-9]','',regex=True).replace('', '1').astype(int),