Пример #1
0
def gaussian_clustering_analysis(
        alpha_gauss, doGeneratePlots, g1, sph, g2,
        wp_ensg, well_plate, u_well_plates, ab_cell, ab_nuc, ab_cyto, mt_cell,
        wp_iscell, wp_isnuc, wp_iscyto):
    '''
    Compare intensities between the G1, S and G2 cell groups of every well.

    For each entry of `u_well_plates`, a Kruskal-Wallis test is run across the
    cells selected by `g1`, `sph` and `g2` for the whole-cell, nucleus and
    cytosol antibody intensities and for the microtubule intensity. The
    p-values are then passed to `utils.benji_hoch` with `alpha_gauss`, and the
    adjusted values, pass flags and phase labels are written below
    output/pickles/.

    Parameters:
        alpha_gauss: threshold handed to `utils.benji_hoch`.
        doGeneratePlots: when truthy, `gaussian_boxplot_result` is called for
            the antibody and microtubule intensities of every well.
        g1, sph, g2: per-cell masks combined with `&` against the well mask
            (presumably boolean arrays -- confirm with the caller).
        wp_ensg: per-well identifiers used to build plot file prefixes; slices
            of it are compared elementwise with `==`, so a numpy array is
            expected.
        well_plate: per-cell well identifier; u_well_plates: wells to analyze.
        ab_cell, ab_nuc, ab_cyto, mt_cell: per-cell intensity arrays.
        wp_iscell, wp_isnuc, wp_iscyto: per-well flags; the compartment used
            for normalization and plotting is cell if `wp_iscell`, else
            nucleus if `wp_isnuc`, else cytosol.

    Returns:
        (adjusted compartment p-values, compartment pass flags,
         adjusted microtubule p-values, microtubule pass flags)
    '''
    def phase_pvalue(values, masks):
        # Element 1 of the kruskal result is the p-value.
        return scipy.stats.kruskal(*[values[mask] for mask in masks])[1]

    cell_pvals, nuc_pvals, cyto_pvals, mt_pvals = [], [], [], []
    phases_by_well = []
    # NOTE(review): this array holds 2-character strings, so any longer label
    # returned by get_phase_strings would be truncated on assignment -- confirm.
    mockbulk_phases = np.array(["  "] * len(ab_cell))
    # Prefix is "<ensg>_<number of earlier wells with the same ensg>".
    fileprefixes = np.array([f"{ensg}_{sum(wp_ensg[:ei] == ensg)}" for ei, ensg in enumerate(wp_ensg)])

    for well_idx, wp in enumerate(u_well_plates):
        in_well = well_plate == wp
        phase_masks = (in_well & g1, in_well & sph, in_well & g2)

        phase_labels = get_phase_strings(g1[in_well], sph[in_well], g2[in_well])
        mockbulk_phases[in_well] = np.asarray(phase_labels)
        phases_by_well.append(phase_labels)

        cell_pvals.append(phase_pvalue(ab_cell, phase_masks))
        nuc_pvals.append(phase_pvalue(ab_nuc, phase_masks))
        cyto_pvals.append(phase_pvalue(ab_cyto, phase_masks))
        mt_pvals.append(phase_pvalue(mt_cell, phase_masks))

        # Pick the compartment annotated for this well: cell, then nucleus, else cytosol.
        if wp_iscell[well_idx]:
            ab_comp = ab_cell
        elif wp_isnuc[well_idx]:
            ab_comp = ab_nuc
        else:
            ab_comp = ab_cyto
        max_val_for_norm = np.max(ab_comp[in_well])
        max_mt_for_norm = np.max(mt_cell[in_well])

        if not doGeneratePlots:
            continue
        g1_mask, sph_mask, g2_mask = phase_masks
        gaussian_boxplot_result(
            ab_comp[g1_mask] / max_val_for_norm,
            ab_comp[sph_mask] / max_val_for_norm,
            ab_comp[g2_mask] / max_val_for_norm,
            "figures/GaussianBoxplots", fileprefixes[well_idx])
        gaussian_boxplot_result(
            mt_cell[g1_mask] / max_mt_for_norm,
            mt_cell[sph_mask] / max_mt_for_norm,
            mt_cell[g2_mask] / max_mt_for_norm,
            "figures/GaussianBoxplots_mt", f"{fileprefixes[well_idx]}_mt")

    # Multiple testing correction for the protein of interest
    comp_pvals = utils.values_comp(cell_pvals, nuc_pvals, cyto_pvals, wp_iscell, wp_isnuc, wp_iscyto)
    comp_padj, comp_pass = utils.benji_hoch(alpha_gauss, comp_pvals)
    utils.np_save_overwriting("output/pickles/wp_comp_kruskal_gaussccd_adj.npy", comp_padj)
    utils.np_save_overwriting("output/pickles/wp_pass_kruskal_gaussccd_bh_comp.npy", comp_pass)

    # Multiple testing correction for microtubules
    mt_padj, mt_pass = utils.benji_hoch(alpha_gauss, mt_pvals)
    utils.np_save_overwriting("output/pickles/wp_mt_kruskal_gaussccd_adj.npy", mt_padj)
    utils.np_save_overwriting("output/pickles/wp_pass_gaussccd_bh_mt.npy", mt_pass)

    # Save the phase information
    utils.np_save_overwriting("output/pickles/curr_wp_phases.npy", np.array(phases_by_well, dtype=object))
    utils.np_save_overwriting("output/pickles/mockbulk_phases.npy", np.array(mockbulk_phases))

    print(f"{len(comp_pass)}: number of genes tested")
    print(f"{sum(comp_pass)}: number of passing genes at {alpha_gauss*100}% FDR in compartment")

    return comp_padj, comp_pass, mt_padj, mt_pass
def analyze_ccd_variation_by_phase_rna(adata, normalized_exp_data,
                                       biotype_to_use):
    '''
    Test every gene for expression differences between cell-cycle phases.

    Rows of `normalized_exp_data` are grouped by the labels "G1", "S-ph" and
    "G2M" found in `adata.obs["phase"]`, and a Kruskal-Wallis test is run per
    column across the three groups. The p-values are passed to
    `utils.benji_hoch` and `utils.bonf` with a threshold of 0.01.

    Parameters:
        adata: object exposing `obs["phase"]` and `var_names`.
        normalized_exp_data: 2-D array indexed as [cell, gene].
        biotype_to_use: label inserted into the output CSV file name.

    Returns:
        DataFrame with columns gene, pvalue, pvaladj_BH, reject_BH, pvaladj_B
        and reject_B; the same table is written to
        output/phase_clustered_transcript_CCD_analysis_<biotype_to_use>.csv.
    '''
    cell_phase = np.array(adata.obs["phase"])

    def rows_in_phase(label):
        # Expression rows of the cells carrying this phase label.
        row_positions = np.nonzero(cell_phase == label)[0]
        return np.take(normalized_exp_data, row_positions, axis=0)

    g1_rows = rows_in_phase("G1")
    s_rows = rows_in_phase("S-ph")
    g2_rows = rows_in_phase("G2M")

    # One Kruskal-Wallis test per gene column; keep only the p-value.
    gene_count = len(g1_rows[0, :])
    phase_tests = []
    for col in range(gene_count):
        phase_tests.append(
            scipy.stats.kruskal(g1_rows[:, col], s_rows[:, col], g2_rows[:, col]))
    pvals = [test[1] for test in phase_tests]

    padj_bh, reject_bh = utils.benji_hoch(0.01, pvals)
    padj_bonf, reject_bonf = utils.bonf(0.01, pvals)

    columns = {
        "gene": adata.var_names,
        "pvalue": pvals,
        "pvaladj_BH": padj_bh,
        "reject_BH": reject_bh,
        "pvaladj_B": padj_bonf,
        "reject_B": reject_bonf,
    }
    bulk_phase_tests = pd.DataFrame(columns)
    out_path = f"output/phase_clustered_transcript_CCD_analysis_{biotype_to_use}.csv"
    bulk_phase_tests.to_csv(out_path)
    return bulk_phase_tests
Пример #3
0
def _cns_gene_membership(gene_names, cns_gene_lists):
    '''Return a (gene x CNS row) boolean array marking which comma-separated gene lists contain each name exactly.'''
    # Split once per row; a plain `name in "A,B,C"` would be a substring test
    # and would wrongly match e.g. "ABC1" against "ABC10".
    gene_sets = [set(genelist.split(',')) for genelist in cns_gene_lists]
    membership = [[name in gene_set for gene_set in gene_sets] for name in gene_names]
    return np.array(membership, dtype=bool).reshape(len(gene_names), len(gene_sets))


def analyze_cnv_calls(adata, ccdtranscript):
    '''
    Take results from cnvkit calls to analyze effects of copy number variation.

    Reads input/RNAData/CnsCallSummary.tsv, whose "gene" column holds
    comma-separated gene names and whose remaining columns are matched against
    `adata.obs["Well_Plate"]`. The function
      1. clusters the per-cell CNV calls and saves figures/CnvConsistency.pdf
         and figures/CnvConsistencyPhases.pdf,
      2. regresses the mean CNV call of each cell on `obs["fucci_time"]` and
         saves figures/CnvCorrelation.pdf,
      3. for G1 cells only, compares expression between amplified (>1),
         neutral (==1) and deleted (<1) calls per gene with Kruskal-Wallis
         tests and saves figures/CNVStateBoxplot.pdf.
    A call value of -5 is treated as missing data. Summary statistics are
    printed; nothing is returned.

    Parameters:
        adata: AnnData-like object with obs columns "Well_Plate", "phase" and
            "fucci_time", plus `var_names` and `X`.
        ccdtranscript: boolean mask over `adata.var_names` selecting the
            transcripts used in steps 1 and 2.
    '''
    cnsresults = pd.read_csv("input/RNAData/CnsCallSummary.tsv", sep="\t")
    cnsresults_gene = cnsresults["gene"]
    cnsresults_allgenes = np.concatenate([g.split(',') for g in cnsresults_gene])
    genenamedict = utils.getGeneNameDict()
    adata_names = np.array(utils.ccd_gene_names_gapped(adata.var_names[ccdtranscript], genenamedict))
    adata_ccd_isInCns = adata[np.isin(adata.obs["Well_Plate"], cnsresults.columns), 
                              np.arange(len(ccdtranscript))[ccdtranscript][np.isin(adata_names, cnsresults_allgenes)]]
    adata_ccd_isInCns_names = utils.ccd_gene_names_gapped(adata_ccd_isInCns.var_names, genenamedict)
    cnsresultIdx = _cns_gene_membership(adata_ccd_isInCns_names, cnsresults_gene)
    geneInJustOneList = cnsresultIdx.sum(axis=1) == 1
    adata_ccd_isInCns_inJustOneList = adata_ccd_isInCns[:, geneInJustOneList]
    # Keep only the CNS columns that correspond to cells present in adata.
    cnsResultsCellData = np.array(cnsresults)[:, np.isin(cnsresults.columns, adata_ccd_isInCns_inJustOneList.obs["Well_Plate"])]
    
    # evaluate consistency of CNVs
    cnvByCell = cnsResultsCellData.T
    heatmap = np.zeros(cnvByCell.shape)
    heatmap[cnvByCell == -5] = -1  # missing data
    heatmap[(cnvByCell > -5) & (cnvByCell < 1)] = 0
    heatmap[cnvByCell == 1] = 1
    heatmap[cnvByCell == 2] = 2
    heatmap[cnvByCell > 2] = 3
    # NOTE(review): the last 8 columns are dropped before clustering; the
    # reason is not visible in this function -- confirm against the input file.
    clustergrid = sbn.clustermap(heatmap[:,:-8], col_cluster=False)
    plt.savefig("figures/CnvConsistency.pdf")
    plt.close()
    
    # heatmaps for phases
    # NOTE(review): the clustered rows follow the column order of cnsresults,
    # while obs follows the row order of adata; confirm the two orders agree.
    rowOrder = np.asarray(clustergrid.dendrogram_row.reordered_ind)
    # .iloc makes the positional lookup explicit; plain [] with integer keys
    # on a non-integer index is deprecated in pandas.
    phaseInRowOrder = adata_ccd_isInCns.obs["phase"].iloc[rowOrder]
    sbn.heatmap([phaseInRowOrder == "G1",
                 phaseInRowOrder == "S-ph",
                 phaseInRowOrder == "G2M"],
                yticklabels=["G1", "S", "G2"])
    plt.savefig("figures/CnvConsistencyPhases.pdf")
    plt.close()
    
    # is there enrichment for phase in the highly amplified genes?
    # yes, so is there correlation?
    x = adata_ccd_isInCns.obs["fucci_time"]
    y = np.mean(cnsResultsCellData, axis=0)
    linearModel = scipy.stats.linregress(np.asarray(x).astype(float), np.asarray(y).astype(float))
    fitted = linearModel.intercept + x * linearModel.slope
    plt.scatter(x * fucci.TOT_LEN, y)
    plt.scatter(x * fucci.TOT_LEN, fitted)
    plt.xlabel("Cell Division Time, hrs")
    plt.ylabel("Mean CNV of All Chromosome Arms")
    plt.savefig("figures/CnvCorrelation.pdf")
    plt.close()
    
    # Element 3 of the linregress result is the p-value for a nonzero slope.
    print(f"{linearModel[3]}: p-value for nonzero slope by two-sided t test")
    residuals = np.asarray(y - fitted)
    residualLinearModel = scipy.stats.linregress(np.asarray(x).astype(float), residuals.astype(float))
    residualNormality = scipy.stats.normaltest(residuals)
    print(f"{residualLinearModel[3]}: p-value for nonzero slope of residuals by two-sided t-test")
    print(f"{residualNormality[1]}: p-value for normality of residuals")
    
    # what if we only look at one phase? G1 before doubling? for all genes?
    adata_names = np.array(utils.ccd_gene_names_gapped(adata.var_names, genenamedict))
    adata_ccd_isInCns = adata[np.isin(adata.obs["Well_Plate"], cnsresults.columns) & (adata.obs["phase"] == "G1"), np.arange(len(adata_names))[np.isin(adata_names, cnsresults_allgenes)]]
    adata_ccd_isInCns_names = utils.ccd_gene_names_gapped(adata_ccd_isInCns.var_names, genenamedict)
    cnsresultIdx = _cns_gene_membership(adata_ccd_isInCns_names, cnsresults_gene)
    geneInJustOneList = cnsresultIdx.sum(axis=1) == 1
    adata_ccd_isInCns_inJustOneList = adata_ccd_isInCns[:, geneInJustOneList]
    cnsresultIdx_inJustOneList = cnsresultIdx[geneInJustOneList]
    cnsResultsCellData = np.array(cnsresults)[:, np.isin(cnsresults.columns, adata_ccd_isInCns_inJustOneList.obs["Well_Plate"])]
    cnvAmplified, cnvPvalOneSided = [],[]
    cnvDeleted, cnvPvalOneSidedDeleted = [],[]
    amplifiedTpmsAll, neutralTpmsAll, deletionTpmsAll = [],[],[]
    for ii, tpm in enumerate(adata_ccd_isInCns.X.T[geneInJustOneList]):
        # Exactly one CNS row matches this gene, so this flattens a (1, n_cells) selection.
        cnv = np.concatenate(cnsResultsCellData[cnsresultIdx_inJustOneList[ii],:])
        hasData = ~(cnv == -5)
        amplifiedTpms = tpm[hasData & (cnv > 1)]
        neutralTpms = tpm[hasData & (cnv == 1)]
        deletionTpms = tpm[hasData & (cnv < 1)]
        cnvAmplified.append(np.median(amplifiedTpms) > np.median(tpm[hasData]))
        # NOTE(review): the variable names call these one-sided p-values, yet the
        # two-sided kruskal p-value is multiplied by 2 -- confirm the intent.
        cnvPvalOneSided.append(scipy.stats.kruskal(amplifiedTpms, neutralTpms)[1] * 2)
        cnvDeleted.append(np.median(deletionTpms) < np.median(tpm[hasData]))
        cnvPvalOneSidedDeleted.append(scipy.stats.kruskal(deletionTpms, neutralTpms)[1] * 2)
        amplifiedTpmsAll.extend(amplifiedTpms)
        neutralTpmsAll.extend(neutralTpms)
        deletionTpmsAll.extend(deletionTpms)
    cnvAmplified = np.asarray(cnvAmplified)
    cnvDeleted = np.asarray(cnvDeleted)
    cnvTestPvals_BH, cnvTestPvals_rejectBH = utils.benji_hoch(0.01, cnvPvalOneSided)
    cnvTestPvalsDel_BH, cnvTestPvalsDel_rejectBH = utils.benji_hoch(0.01, cnvPvalOneSidedDeleted)
    print(f"{sum(cnvAmplified & cnvTestPvals_rejectBH)}: number of novel CCD with significantly higher expression with amplified CNVs than neutral")
    print(f"{sum(cnvDeleted & cnvTestPvalsDel_rejectBH)}: number of novel CCD with significantly lower expression with deleted CNVs than neutral")
    utils.general_boxplot([amplifiedTpmsAll, neutralTpmsAll, deletionTpmsAll], 
                          ["amplified", "neutral", "deletion"], "", "logTPMs", "", False, "figures/CNVStateBoxplot.pdf")
    print(f"Of {len(cnvAmplified)} genes:")
    print(f"{scipy.stats.kruskal(amplifiedTpmsAll, neutralTpmsAll, deletionTpmsAll)[1]}: kruskal two sided pval that there's a difference between the three")
    print(f"{scipy.stats.kruskal(amplifiedTpmsAll, neutralTpmsAll)[1]}: kruskal two sided pval that there's a difference between amplified/neutral")
Пример #4
0
def identify_bimodal_intensity_distributions(
        u_well_plates, wp_ensg, pol_sort_well_plate, pol_sort_norm_rev,
        pol_sort_ab_cell, pol_sort_ab_nuc, pol_sort_ab_cyto, pol_sort_mt_cell,
        wp_iscell, wp_isnuc, wp_iscyto, do_plotting):
    '''
    Some proteins display bimodal intensity distributions. 
    This method seeks to identify distributions with high- and low-expressing cells, 
        so that they may be assessed for CCD independently in `ProteinCellCycleDependence.py`.

    For every well, the intensities of the annotated compartment (cell if
    `wp_iscell`, else nucleus if `wp_isnuc`, else cytosol) are divided by
    their maximum and split into two clusters with a two-component Gaussian
    mixture. A well is called bimodal "generally" when the absolute log2 fold
    change between cluster means exceeds 1 and the Kruskal-Wallis test between
    the clusters passes `utils.benji_hoch` at 0.01. It additionally passes the
    stricter filter when pseudotime does not differ between the clusters and
    both clusters hold more than 50 cells.

    Parameters:
        u_well_plates: wells to analyze.
        wp_ensg, pol_sort_mt_cell, wp_iscyto: accepted for interface
            compatibility; not used by this function.
        pol_sort_well_plate: per-cell well identifier.
        pol_sort_norm_rev: per-cell pseudotime values.
        pol_sort_ab_cell, pol_sort_ab_nuc, pol_sort_ab_cyto: per-cell antibody
            intensities for each compartment.
        wp_iscell, wp_isnuc: per-well compartment flags.
        do_plotting: when truthy, save the diagnostic figures under figures/.

    Returns:
        (wp_isbimodal_fcpadj_pass, wp_bimodal_cluster_idxs,
         wp_isbimodal_generally, wp_bimodal_fcmaxmin): the strict pass flags,
        the [cluster 0 mask, cluster 1 mask] pair of every well, the general
        bimodality flags, and the max/min ratio of normalized intensities.
    '''
    min_cells_per_cluster = 50
    wp_bimodal_cluster_idxs = []
    wp_bimodal_fcmeans = []
    wp_bimodal_fcmaxmin = []
    wp_isbimodal_p = []
    wp_timebimodal_p = []
    wp_intensities = []

    # Use Gaussian clustering to investigate if there is bimodality
    gaussian = sklearn.mixture.GaussianMixture(n_components=2,
                                               random_state=1,
                                               max_iter=500)
    for i, well in enumerate(u_well_plates):
        curr_well_inds = pol_sort_well_plate == well
        curr_pol = pol_sort_norm_rev[curr_well_inds]

        # Only the annotated compartment is needed; normalize it by its maximum.
        comp_intensity = (pol_sort_ab_cell if wp_iscell[i] else
                          pol_sort_ab_nuc if wp_isnuc[i] else
                          pol_sort_ab_cyto)[curr_well_inds]
        curr_comp_norm = np.asarray(comp_intensity / np.max(comp_intensity))
        wp_intensities.append(curr_comp_norm)

        # The mixture model expects one row per cell and one column per feature.
        cluster_labels = gaussian.fit_predict(curr_comp_norm.reshape(-1, 1))
        c1 = cluster_labels == 0
        c2 = cluster_labels == 1
        wp_bimodal_cluster_idxs.append([c1, c2])
        wp_bimodal_fcmeans.append(
            np.mean(curr_comp_norm[c2]) / np.mean(curr_comp_norm[c1]))
        wp_bimodal_fcmaxmin.append(
            np.max(curr_comp_norm) / np.min(curr_comp_norm))

        # Use a kruskal-wallis test to assess whether there's a significant difference of intensities between clusters
        wp_isbimodal_p.append(
            scipy.stats.kruskal(curr_comp_norm[c1], curr_comp_norm[c2])[1])

        # Use a kruskal-wallis test to assess whether there's (not) a significant difference in pseudotime between clusters,
        # since strongly CCD proteins will produce bimodal intensity distributions that should still be assessed as one population
        wp_timebimodal_p.append(
            scipy.stats.kruskal(curr_pol[c1], curr_pol[c2])[1])

    # Multiple testing corrections
    wp_isbimodal_padj, wp_isbimodal_pass = utils.benji_hoch(
        0.01, wp_isbimodal_p)
    _, wp_timebimodal_pass = utils.benji_hoch(0.01, wp_timebimodal_p)

    # Cells per cluster for every well; reshape keeps two columns even with no wells.
    cluster_sizes = np.array(
        [[np.count_nonzero(c1), np.count_nonzero(c2)]
         for c1, c2 in wp_bimodal_cluster_idxs]).reshape(-1, 2)
    wp_enoughcellsinbothclusters = (
        (cluster_sizes[:, 0] > min_cells_per_cluster)
        & (cluster_sizes[:, 1] > min_cells_per_cluster))

    # Cluster labels are arbitrary, so the fold change is judged by magnitude.
    log2_fcmeans = np.log(wp_bimodal_fcmeans) / np.log(2)
    wp_isbimodal_generally = (np.abs(log2_fcmeans) > 1) & wp_isbimodal_pass
    wp_isbimodal_fcpadj_pass = (
        wp_isbimodal_generally & ~wp_timebimodal_pass
        & wp_enoughcellsinbothclusters)

    removeThese = pd.read_csv("input/ProteinData/ReplicatesToRemove.txt",
                              header=None)[0]
    wp_removeReplicate = np.isin(u_well_plates, removeThese)
    # NOTE(review): the counts exclude removed replicates, whereas the
    # percentages are taken over all wells -- confirm which is intended.
    n_wells = len(wp_isbimodal_generally)
    print(
        f"{sum(~wp_isbimodal_generally[~wp_removeReplicate])}: number of proteins displaying unimodal distributions ({100 * sum(~wp_isbimodal_generally) / n_wells}%)"
    )
    print(
        f"{sum(wp_isbimodal_generally[~wp_removeReplicate])}: number of proteins displaying bimodal distributions ({100 * sum(wp_isbimodal_generally) / n_wells}%)"
    )

    if do_plotting:
        # Show that the intensity measurements are reasonable for these bimodal samples
        # (this histogram is closed without being saved or shown)
        plt.hist(
            np.concatenate(
                np.array(wp_intensities,
                         dtype=object)[wp_isbimodal_generally]))
        plt.xlabel("Mean intensity")
        plt.ylabel("Count")
        plt.title(
            "Intensities of Cells within Bimodal Distributions\nAre Similar to those Overall"
        )
        plt.close()

        print(
            "Illustrate the significantly distinct high- and low-expressing cell populations"
        )
        plt.scatter(log2_fcmeans,
                    -np.log10(wp_isbimodal_padj),
                    c=wp_isbimodal_generally,
                    alpha=0.5,
                    cmap="bwr_r")
        plt.xlabel("Log2 Fold Change Between Gaussian Clusters")
        plt.ylabel("-Log10 Adj. p-Value for Difference Between Clusters")
        plt.savefig("figures/BimodalSignificance_GeneralBimodality.png")
        plt.close()

        print(
            "Illustrate the significantly distinct high- and low-expressing cell populations"
        )
        print(
            "with no difference in pseudotime. These are evaluated separately for CCD."
        )
        plt.scatter(log2_fcmeans,
                    -np.log10(wp_isbimodal_padj),
                    c=wp_isbimodal_fcpadj_pass,
                    alpha=0.5,
                    cmap="bwr_r")
        plt.xlabel("Log2 Fold Change Between Gaussian Clusters")
        plt.ylabel("-Log10 Adj. p-Value for Difference Between Clusters")
        plt.savefig("figures/BimodalSignificance.png")
        plt.savefig("figures/BimodalSignificance.pdf")
        plt.close()

        print(
            "Illustrate the samples with sufficient cell count for CCD evaluation of high- and low-expressing cell populations."
        )
        plt.scatter(cluster_sizes[:, 0],
                    cluster_sizes[:, 1],
                    c=wp_enoughcellsinbothclusters,
                    alpha=0.5,
                    cmap="bwr_r")
        plt.xlabel("Cell Count, Cluster 1")
        plt.ylabel("Cell Count, Cluster 2")
        plt.savefig("figures/BimodalCellCount.png")
        plt.close()

    return wp_isbimodal_fcpadj_pass, wp_bimodal_cluster_idxs, wp_isbimodal_generally, wp_bimodal_fcmaxmin