Exemplo n.º 1
0
    def run_gsea(
            cls_file: str,
            gct_file: str,
            gmt_file: str,
            save_dir: str
    ):
        """
        Run a gseapy GSEA analysis on inputs that must already exist on disk.

        :param cls_file: path handed to gseapy as ``cls``
        :param gct_file: path handed to gseapy as ``data``
        :param gmt_file: path handed to gseapy as ``gene_sets``
        :param save_dir: existing path handed to gseapy as ``outdir``
        :return: None
        :raises AssertionError: when any of the four paths does not exist
        """
        # Checked in declaration order, so the first missing path is the one
        # that trips the assertion.
        for required_path in (cls_file, gct_file, gmt_file, save_dir):
            assert os.path.exists(required_path)

        gsea_options = dict(
            data=gct_file,
            gene_sets=gmt_file,
            cls=cls_file,
            outdir=save_dir,
            processes=4,
            verbose=True,
        )
        gseapy.gsea(**gsea_options)
Exemplo n.º 2
0
 def gsea_wrapper(
     counts: Counts,
     cell_labels: Union[List[int], np.ndarray],
     gene_set_gmt: str,
     output_dir: Union[Path, str],
     **kwargs,
 ):
     """
     Run `gp.gsea` on a counts object without the `CellORM` infrastructure.

     Args:
         counts: object whose `to_df()` frame is transposed to build the
             expression table handed to GSEA
         cell_labels: class labels, passed to `gp.gsea` as `cls`
         gene_set_gmt: gene set definition, passed as `gene_sets`
         output_dir: output location, stringified and passed as `outdir`
         **kwargs: extra `gp.gsea` options; they take precedence over both
             `GSEA.DEFAULT_KWARGS` and the arguments assembled here

     Raises:
         SinglePopulationError: when `gp.gsea` raises `IndexError`; it is
             constructed with the first cell label
     """
     expression = counts.to_df().T
     # GCT-like layout: a "Description" column first, index named "NAME".
     expression.insert(0, "Description", "None")
     expression.index = expression.index.rename("NAME")

     # Later updates win: defaults < values built here < caller overrides.
     gsea_options = dict(GSEA.DEFAULT_KWARGS)
     gsea_options.update(
         data=expression,
         gene_sets=gene_set_gmt,
         cls=cell_labels,
         outdir=str(output_dir),
     )
     gsea_options.update(kwargs)

     print(gsea_options["permutation_num"])
     try:
         gp.gsea(**gsea_options)
     except IndexError:
         raise SinglePopulationError(cell_labels[0])
Exemplo n.º 3
0
def main(cls_file: str,
         gct_file: str,
         gene_set: str = 'GO_Biological_Process_2017b',
         permutation_type: str = 'phenotype',
         method: str = "ratio_of_classes",
         output_dir: str = "gsea_report",
         format: str = "pdf",
         permutation_num: int = 1000,
         threads: int = 1) -> None:
    """
    Run GSEA for a CLS / GCT file pair.

    The class vector is parsed from ``cls_file`` and the expression table is
    read from ``gct_file`` (first two lines skipped, next line used as the
    header, first column used as the index). All remaining arguments are
    forwarded to ``gp.gsea``; ``threads`` is passed as ``processes`` and the
    number of graphs is fixed at 50. Nothing is returned.
    """
    # The parser returns three values; only the class vector is used here.
    _, _, class_vector = gp.parser.gsea_cls_parser(cls_file)

    expression = pd.read_table(gct_file, header=0, index_col=0, skiprows=2)

    gsea_options = {
        'data': expression,
        'cls': class_vector,
        'permutation_type': permutation_type,
        'permutation_num': permutation_num,
        'outdir': output_dir,
        'gene_sets': gene_set,
        'method': method,
        'processes': threads,
        'format': format,
        'graph_num': 50,
    }
    gp.gsea(**gsea_options)
Exemplo n.º 4
0
 def run(data,
         gmt,
         cls,
         permutation_type='phenotype',
         method='signal_to_noise',
         permution_num=1000):
     """Swap in the project's ranking metric, run GSEA and wrap the outcome.

     The ``gseapy.gsea`` submodule is located through ``pkgutil`` and its
     ``ranking_metric`` is replaced by ``GSEA._ranking_metric`` when the
     gseapy version is 0.x with x < 9, and by ``GSEA._ranking_metric2``
     otherwise. ``gp.algorithm.ranking_metric`` always receives
     ``GSEA._ranking_metric``. Output goes to an ``images`` directory next
     to this file.

     Returns a ``GSEA`` object built from the run's ``res2d`` and the inputs.
     """
     for _finder, candidate, _is_pkg in pkgutil.iter_modules(
             gp.__path__, gp.__name__ + "."):
         if candidate != "gseapy.gsea":
             continue
         module = __import__(candidate, fromlist="dummy")

     version_parts = gp.__version__.split(".")
     is_pre_0_9 = int(version_parts[0]) == 0 and int(version_parts[1]) < 9
     module.ranking_metric = (GSEA._ranking_metric
                              if is_pre_0_9 else GSEA._ranking_metric2)
     gp.algorithm.ranking_metric = GSEA._ranking_metric

     images_dir = os.path.join(
         os.path.dirname(os.path.realpath(__file__)), 'images')
     gsea_run = gp.gsea(data,
                        gmt,
                        cls,
                        permutation_type=permutation_type,
                        permutation_num=permution_num,
                        outdir=images_dir,
                        method=method)
     return GSEA(gsea_run.res2d, data, gmt, cls)
Exemplo n.º 5
0
def run_gsea(gene_exp: str,
             gene_set: str,
             phenotype_class: str,
             permutations: int = 500,
             output_dir: str = GSEA):
    """Run phenotype-permutation GSEA on one dataset / gene set pair.

    :param gene_exp: gene expression input, passed to gseapy as ``data``
    :param gene_set: gene set input, passed as ``gene_sets``
    :param phenotype_class: class label input, passed as ``cls``
    :param permutations: number of permutations
    :param output_dir: output directory, passed as ``outdir``
    :return: whatever ``gseapy.gsea`` returns
    """
    gsea_options = {
        'data': gene_exp,
        'gene_sets': gene_set,
        'cls': phenotype_class,
        'max_size': 3000,
        'permutation_type': 'phenotype',
        'permutation_num': permutations,
        'outdir': output_dir,
        'no_plot': True,  # plotting is switched off
        'processes': 4,
        'format': 'png',
    }
    return gseapy.gsea(**gsea_options)
Exemplo n.º 6
0
    def calculate_genesets():
        """Run ``gp.gsea`` with settings taken from the surrounding scope.

        ``gene_exp``, ``gene_set``, ``grouping``, ``permtype`` and
        ``statmethod`` are not defined here; they are resolved from outside
        this function. The run uses 100 permutations, 4 processes, seed 7,
        no plots and ``outdir=None``.

        Returns whatever ``gp.gsea`` returns.
        """
        fixed_options = dict(
            permutation_num=100,
            outdir=None,
            no_plot=True,
            processes=4,
            seed=7,
            format='png',
        )
        return gp.gsea(data=gene_exp,
                       gene_sets=gene_set,
                       cls=grouping,
                       permutation_type=permtype,
                       method=statmethod,
                       **fixed_options)
Exemplo n.º 7
0
 def run(data, gmt, cls, permutation_type='phenotype', method='signal_to_noise', permution_num=1000):
     """Patch gseapy's ranking metric, run GSEA and return a ``GSEA`` wrapper.

     ``gseapy.gsea.ranking_metric`` is set to ``GSEA._ranking_metric`` for
     gseapy 0.x with x < 9 and to ``GSEA._ranking_metric2`` otherwise;
     ``gp.algorithm.ranking_metric`` is always set to
     ``GSEA._ranking_metric``. Results are written to the ``images``
     directory that sits beside this file.
     """
     dotted_prefix = gp.__name__ + "."
     for module_info in pkgutil.iter_modules(gp.__path__, dotted_prefix):
         # Each entry is a (finder, name, ispkg) triple; only the name matters.
         if module_info[1] == "gseapy.gsea":
             module = __import__(module_info[1], fromlist="dummy")

     version_fields = gp.__version__.split(".")
     if int(version_fields[0]) != 0 or int(version_fields[1]) >= 9:
         chosen_metric = GSEA._ranking_metric2
     else:
         chosen_metric = GSEA._ranking_metric
     module.ranking_metric = chosen_metric
     gp.algorithm.ranking_metric = GSEA._ranking_metric

     here = os.path.dirname(os.path.realpath(__file__))
     outcome = gp.gsea(data, gmt, cls,
                       permutation_type=permutation_type,
                       permutation_num=permution_num,
                       outdir=os.path.join(here, 'images'),
                       method=method)
     return GSEA(outcome.res2d, data, gmt, cls)
Exemplo n.º 8
0
 def run_gsea(self, ordered_df, classes, db=None, db_name=None,
              processes=4, no_plot=True, method='signal_to_noise',
              permutation_type='gene_set'):
     """Run gseapy's GSEA and return its result object.

     ``db`` falls back to ``hallmarks_db`` when it is None. ``db_name`` falls
     back to the last ``/``-separated component of the gene set path. Output
     is written below ``self.gsea_save`` in a directory named ``db_name``.
     """
     gene_sets = hallmarks_db if db is None else db
     label = gene_sets.split("/")[-1] if db_name is None else db_name
     target_dir = os.path.join(self.gsea_save, label)
     return gseapy.gsea(ordered_df,
                        gene_sets=gene_sets,
                        cls=classes,
                        outdir=target_dir,
                        method=method,
                        no_plot=no_plot,
                        processes=processes,
                        permutation_type=permutation_type)
Exemplo n.º 9
0
def perform_gsea(data: Union[str, pd.DataFrame], gmt: str, class_vector: List,
                 output_dir: str, min_size: int, max_size: int,
                 permutation_type: str, permutation_num: int, method: str):
    """Run single-process, plot-free GSEA and return gseapy's result."""
    gsea_options = {
        'data': data,
        'gene_sets': gmt,
        'cls': class_vector,
        'min_size': min_size,
        'max_size': max_size,
        'permutation_type': permutation_type,
        'permutation_num': permutation_num,
        'method': method,
        'outdir': output_dir,
        'no_plot': True,
        'processes': 1,
    }
    return gseapy.gsea(**gsea_options)
Exemplo n.º 10
0
def do_gsea(matrix, cls, gmt, out_dir):
    """Run GSEA on a tab-separated matrix and save the result table.

    The matrix is read with its first row as header and first column as
    index. GSEA runs with phenotype permutation (100 permutations,
    signal-to-noise, 4 processes, no plots), and ``res2d`` is written to
    ``gsea_result.tsv`` inside ``out_dir``.
    """
    expression = pd.read_csv(matrix, sep='\t', header=0, index_col=0)
    click.echo('Running GSEA')
    gsea_options = dict(
        data=expression,
        gene_sets=gmt,
        cls=cls,
        permutation_type='phenotype',
        permutation_num=100,
        outdir=out_dir,
        no_plot=True,
        method='signal_to_noise',
        processes=4,
        format='png',
    )
    result = gp.gsea(**gsea_options)
    result.res2d.to_csv(os.path.join(out_dir, 'gsea_result.tsv'), sep='\t')
Exemplo n.º 11
0
def test():
    """Run GSEA on the bundled P53 example data and save a bar chart.

    Reads ``GSEApy/data/P53.cls`` and ``GSEApy/data/P53_resampling_data.txt``,
    runs phenotype-permutation GSEA against ``KEGG_2016`` into ``output``,
    then plots ``fdr`` by ``Term`` for the first five result rows and saves
    it as ``figure-gsea.pdf``.
    """
    class_path = 'GSEApy/data/P53.cls'
    expression_path = "GSEApy/data/P53_resampling_data.txt"

    # The CLS file is still parsed (a broken file fails here), but the labels
    # used for the run are the hard-coded 25/25 split below.
    _, _, _ = gp.parser.gsea_cls_parser(class_path)

    expression = pd.read_table(expression_path)

    labels = ['Control'] * 25 + ['Drug Treatment'] * 25
    result = gp.gsea(
        data=expression,
        gene_sets='KEGG_2016',
        cls=labels,
        permutation_type='phenotype',
        outdir='output',
        method='signal_to_noise',
        format='png')

    table = result.res2d
    with plt.style.context('ggplot'):
        table = table.reset_index()
        table.head(5).plot.barh(y='fdr', x='Term', fontsize=10)

        plt.savefig('figure-gsea.pdf')
0
def _touch_file(path):
    """Create an empty file at *path*, creating its parent directory first."""
    import os

    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "a"):
        os.utime(path, None)


def _strip_tpm_prefix(column):
    """Return *column* without a leading ``TPM.`` (or bare ``TPM``) tag."""
    # str.lstrip("TPM.") would strip *characters*, eating group names that
    # start with T, P, M or "." (e.g. "TPM.Mock1" -> "ock1").
    for prefix in ("TPM.", "TPM"):
        if column.startswith(prefix):
            return column[len(prefix):]
    return column


def gsea_enrichr(diff, treat, ctrl, log2fc, padj, go):
    """Run Enrichr and GSEA for one treatment-vs-control comparison.

    Args:
        diff: path of the Excel workbook with the differential expression
            sheets (``sig-all.log2fc<log2fc>-padj<padj>``, ``sig-up``,
            ``sig-down``).
        treat: treatment group name; also the prefix that selects the
            treatment ``TPM`` columns.
        ctrl: control group name; also the prefix that selects the control
            ``TPM`` columns.
        log2fc: value used only to build the name of the "sig-all" sheet.
        padj: value used only to build the name of the "sig-all" sheet.
        go: iterable of gene set names; passed as a whole to Enrichr and one
            at a time to GSEA.

    Returns:
        None. Results are written below ``differential_expression/GO`` and
        failures are appended to files in ``temp/blacklist.GO``.

    If the base name of ``diff`` is listed in ``temp/blacklist.txt`` the
    analysis is skipped and an empty GSEA report is created per gene set.
    """
    import os

    outGSEAname = "%s_vs_%s" % (treat, ctrl)

    # Comparisons listed in the blacklist get placeholder reports only.
    if os.path.isfile("temp/blacklist.txt"):
        with open("temp/blacklist.txt") as black:
            blacklist = [bla.strip().split("/")[-1] for bla in black]
        if diff.split("/")[-1] in blacklist:
            print("Skip GSEA and Enrichr Procedure for %s vs %s." %
                  (treat, ctrl))
            for domain in go:
                _touch_file(
                    "differential_expression/GO/GSEA_%s/%s/gseapy.gsea.gene_sets.report.csv"
                    % (outGSEAname, domain))
            return

    # Third-party imports are only needed when an analysis actually runs.
    from pandas import read_excel
    import gseapy as gp

    al_res = read_excel(diff, sheet_name=None)
    sig_deg = al_res["sig-all.log2fc%s-padj%s" % (log2fc, padj)]
    degs_sig = [
        deg.gene_name.squeeze()
        for deg in (sig_deg, al_res['sig-up'], al_res['sig-down'])
    ]

    os.makedirs("temp/blacklist.GO", exist_ok=True)

    # Enrichr on the all / up / down gene lists.
    for glist, gl_type in zip(degs_sig, ['all', 'up', 'down']):
        outdir = 'differential_expression/GO/Enrichr_%s/%s' % (outGSEAname,
                                                               gl_type)
        outfile = "{o}/{t}.enrichr.reports.txt".format(o=outdir, t=gl_type)
        # An existing report means this list was already processed.
        if os.path.isfile(outfile):
            continue
        try:
            gp.enrichr(gene_list=glist,
                       gene_sets=go,
                       description=gl_type,
                       cutoff=0.1,
                       outdir=outdir)
        except Exception:
            log1 = "Enrichr Server No response: %s vs %s, %s \n" % (
                treat,
                ctrl,
                gl_type,
            )
            log2 = "the lenght of input gene list = %s \n" % (len(glist))
            print(log1, log2)
            with open(
                    "temp/blacklist.GO/blacklist.enrichr.degs.%s_vs_%s.txt" %
                (treat, ctrl), 'a') as black:
                black.write(log1)
                black.write(log2)

    # A preranked (gp.prerank) pass existed here only as a disabled string
    # block; it has not been restored.

    # Expression columns for GSEA: treatment samples first, then control.
    tpm_cols = [col for col in sig_deg.columns if col.startswith("TPM")]
    groups = [_strip_tpm_prefix(col) for col in tpm_cols]
    treat_cols = [c for c, g in zip(tpm_cols, groups) if g.startswith(treat)]
    ctrl_cols = [c for c, g in zip(tpm_cols, groups) if g.startswith(ctrl)]

    col2 = ['gene_name'] + treat_cols + ctrl_cols
    cls_vec = [treat] * len(treat_cols) + [ctrl] * len(ctrl_cols)

    for domain in go:
        outdir = "differential_expression/GO/GSEA_%s/%s" % (outGSEAname,
                                                            domain)
        outfile = "%s/gseapy.gsea.gene_sets.report.csv" % outdir
        # An existing report means this gene set was already processed.
        if os.path.isfile(outfile):
            continue
        try:
            gp.gsea(data=sig_deg[col2],
                    gene_sets=domain,
                    cls=cls_vec,
                    min_size=15,
                    max_size=500,
                    outdir=outdir)
        except Exception:
            log1 = "Oops...%s_vs_%s: skip GSEA plotting for %s, please adjust paramters for GSEA input.\n" % (
                treat, ctrl, domain)
            # Row count of the input table; avoids a second column lookup
            # that could itself fail inside the error handler.
            log2 = "the lenght of input degs = %s \n" % sig_deg.shape[0]
            print(log1, log2)
            # Leave an empty report so the next run skips this gene set.
            _touch_file(outfile)
            with open(
                    "temp/blacklist.GO/blacklist.gsea.degs.%s_vs_%s.txt" %
                (treat, ctrl), 'a') as black:
                black.write(log1)
                black.write(log2)

    return
Exemplo n.º 13
0
def set_c(input_data, group_info, group_samples, session_key):
    """Run GSEA and a Fisher-test screen for a user gene set, writing plots.

    Args:
        input_data: iterable of gene identifiers forming the user gene set.
        group_info: group labels, one per entry of ``group_samples``.
        group_samples: list of sample-name lists, one list per group; each
            list is used to select columns of ``all_expr_df``.
        session_key: string used to build the per-session output directory.

    Returns:
        The value of ``random.random()``. All results are written to the
        session directory and progress is reported through
        ``current_task.update_state``.
    """
    def fet_f(a1, b1, total_gene_assum):
        """Fisher exact test of the overlap between *a1* and *b1*.

        Returns ``(len(a1), size of the overlap, p-value)``.
        """
        set_a, set_b = set(a1), set(b1)
        shared = set_a & set_b
        only_a = set_a - shared
        only_b = set_b - shared

        _, pvalue = stats.fisher_exact(
            [[len(shared), len(only_b)],
             [
                 len(only_a),
                 total_gene_assum - (len(shared) + len(only_b) + len(only_a))
             ]])
        return len(a1), len(shared), pvalue

    plot_path = "/home/ubuntu/django_proj/pcta_updated/main/static/images/" + session_key + "/"
    template_plot_path = "images/" + session_key + "/"
    test_name = "USER_SET"

    # Start from an empty session directory.
    if not os.path.exists(plot_path):
        os.mkdir(plot_path)
    else:
        for stale in glob.glob(plot_path + "*"):
            os.remove(stale)

    filecount = 0
    file_list = []

    pt = MY_PLOT()
    df_all = all_expr_df

    #############GSEA#############
    # Single-line GMT file holding the user gene set.
    fixed_path_gmt = plot_path + 'user.gmt'
    with open(fixed_path_gmt, 'w') as gmt_file:
        gmt_file.write('USER_SET\tNA\t' + '\t'.join(input_data))

    # One label per sample, as a real list: map() is a one-shot iterator on
    # Python 3.
    class_vector = [
        str(group_info[i])
        for i, samples in enumerate(group_samples)
        for _ in range(len(samples))
    ]

    current_task.update_state(state='PROGRESS', meta={'process_percent3': 20})

    df_user_s = pd.concat([df_all[s] for s in group_samples], axis=1)

    # Columns are renumbered 0..n-1 and the index becomes the first column.
    df_user_s.columns = range(len(df_user_s.columns.tolist()))
    df_user_s = df_user_s.reset_index()

    gseapy.gsea(data=df_user_s,
                gene_sets=fixed_path_gmt,
                cls=class_vector,
                outdir=plot_path,
                min_size=2,
                max_size=1000,
                weighted_score_type=1,
                permutation_type='gene_set',
                method='signal_to_noise',
                ascending=False,
                figsize=(6.5, 6),
                format='png')

    file_list.append(template_plot_path + "USER_SET.gsea.png")
    filecount += 1
    #############GSEA#############
    current_task.update_state(state='PROGRESS', meta={'process_percent3': 40})
    #############MRA#############
    mra_set_t = mra_set.T
    mra_list = list(set(mra_set.index.tolist()))
    mra_targets = [mra_set_t[x].values.tolist() for x in mra_list]
    # Normalise every target entry to a list of strings (fet_f needs len()).
    mra_targets = [
        [str(v) for v in x[0]] if isinstance(x[0], list) else [str(x[0])]
        for x in mra_targets
    ]

    total_genes = len(set(mra_set.values.flatten()))

    pvals = [
        fet_f(targets, input_data, total_genes) for targets in mra_targets
    ]
    # Keep regulators with p < 0.01 and more than 10 targets.
    pvals_list = [[tf, item[0], item[1], item[2]]
                  for tf, item in zip(mra_list, pvals)
                  if item[2] < 0.01 and item[0] > 10]
    table_arr = pvals_list  #####Table data

    current_task.update_state(state='PROGRESS', meta={'process_percent3': 60})

    with open(plot_path + 'mra_candidates.tsv', "w") as table_file:
        table_file.write("Gene(EntrezID)\tTF_targets\tMapped_genes\tP-value\n")
        for row in table_arr:
            table_file.write('\t'.join(str(cell) for cell in row) + '\n')

    network_data = pd.DataFrame(data=pvals_list,
                                columns=['TF', 'targets', 'mapped', 'pval'])
    network_data = network_data.set_index('TF')
    network_data[
        'prob_mapped'] = network_data['mapped'] / network_data['targets']
    network_data = network_data.sort_values('prob_mapped', ascending=False)
    network_data = network_data.loc[network_data.index.tolist()[:10]]

    # NOTE(review): every other update uses the key 'process_percent3';
    # confirm whether 'process_percent' here is intentional.
    current_task.update_state(state='PROGRESS', meta={'process_percent': 80})

    selected_network_expr = df_all[group_samples[0]].loc[
        network_data.index.tolist()[:10]]

    pt.network_plot(network_data,
                    selected_network_expr,
                    tit=group_info[0],
                    filename=plot_path + test_name + str(filecount))
    file_list.append(template_plot_path + test_name + str(filecount) + ".png")

    current_task.update_state(state='PROGRESS', meta={'process_percent3': 100})

    #############MRA#############
    return random.random()
Exemplo n.º 14
0
async def GSEAonExperiments(data,
                            experiments,
                            res=None,
                            savename='',
                            scaling=None,
                            geneset='GO_Biological_Process_2015',
                            cores=8,
                            cleanfunc=lambda i: i.split('(GO')[0]):
    """

    Will run GSEA on a set of experiments

    The function is declared ``async`` but awaits nothing; callers still
    have to await it.

    Args:
    -----
      data: a pandas.df rows: gene counts; columns: [experimentA_X,..., experimentD_X..., control_X] where X is the replicate number.
        The last column is never selected; control columns are those containing 'AAVS1'.
      experiments: a list of experiment names (here experimentA,.. experimentD)
      res: an optional dict(experiment: GSEA result) of already computed results; experiments found in it are not re-run.
        A new dict is created when it is None.
      savename: if you want to save the results, provides a location/name prefix
      scaling: an optional dict(key:(mean,std)) of scaling factors and their standard error, keyed by the second
        '_'-separated part of the experiment name
      geneset: the geneset to run it on. (can be a filepath to your own geneset)
      cores: number of processes to run GSEA on
      cleanfunc: a func applied to the names of the gene sets of each newly computed result (often to make it more readable)
    Returns
    -------
      plots the results
      1: a dataframe with the enrichment score of each term (columns) for each experiment (rows)
      2: a dict(experiment:result) with the gseapy output for each experiment
    """
    if res is None:
        # A fresh dict per call: a mutable default would be shared by calls.
        res = {}
    for fig_num, val in enumerate(experiments):
        print(val)
        selected = [
            v for v in data.columns[:-1] if val + '-' in v or 'AAVS1' in v
        ]
        # Copy so that rescaling below never writes into a view of `data`.
        totest = data[selected].copy()
        cls = [
            'Condition' if val + '-' in v else 'DMSO' for v in totest.columns
        ]
        if scaling:
            factor = scaling[val.split('_')[1]]
            # Rescale only when the factor is larger than its standard error.
            if abs(factor[0]) > factor[1]:
                print("rescaling this one")
                cols = [c for c in totest.columns if val + '-' in c]
                totest[cols] = totest[cols] * (2**factor[0])
        if val in res:
            print(val + " is already in set")
            continue
        res[val] = gseapy.gsea(data=totest,
                               gene_sets=geneset,
                               cls=cls,
                               no_plot=False,
                               processes=cores)
        # Term names come from the result index, cleaned exactly once.
        res[val].res2d['Term'] = [
            cleanfunc(term) for term in res[val].res2d.index
        ]
        plt.figure(fig_num)
        sns.barplot(data=res[val].res2d.iloc[:25],
                    x="es",
                    y="Term",
                    hue_order="geneset_size").set_title(val)

    # Build the experiment x term enrichment matrix (0 where a term is absent).
    terms = set()
    for result in res.values():
        terms.update(result.res2d.Term)
    enrichment = {term: [0] * len(res) for term in terms}
    for n, result in enumerate(res.values()):
        for _, row in result.res2d.iterrows():
            enrichment[row.Term][n] = row.es
    pres = pd.DataFrame(enrichment, index=list(res.keys()))

    # The heatmaps and the CSV are built from the matrix, not the result dict.
    a = sns.clustermap(figsize=(25, 20),
                       data=pres,
                       vmin=-1,
                       vmax=1,
                       yticklabels=pres.index,
                       cmap=plt.cm.RdYlBu)
    b = sns.clustermap(-pres.T.corr(), cmap=plt.cm.RdYlBu, vmin=-1, vmax=1)
    if savename:
        pres.to_csv(savename + ".csv")
        a.savefig(savename + "_genesets.pdf")
        b.savefig(savename + "_correlation.pdf")
    return pres, res
Exemplo n.º 15
0
    'Pierre_sets': 'tracks/GSEA_gene_sets/Pierre_gene_sets.gmt',
    'Pierre_sets_TLX_enh_TSS':
    'tracks/GSEA_gene_sets/Pierre_gene_sets_plus.gmt',
    'Pierre_sets_v2': 'tracks/GSEA_gene_sets/Pierre_gene_sets_v2.gmt'
}

# Run GSEA for the selected gene set collections and plot the result table.
# Only 'Pierre_sets_v2' is processed; the disabled line below would iterate
# over every key of gs_dic instead.
#~ for g_set in gs_dic.keys():
for g_set in ['Pierre_sets_v2']:
    # Output directory is derived from the gene set key.
    out_dir = 'GSEA/TLX3vsRAG_' + g_set + '_classic_std'

    # gs_dic maps the key to the .gmt file passed as gene_sets; tbn and
    # classes are defined elsewhere in the script.
    gs_res = gp.gsea(
        data=tbn,
        gene_sets=gs_dic[g_set],
        weighted_score_type=0,
        #~ method = 'ratio_of_classes',
        min_size=10,
        max_size=10000,
        graph_num=150,
        permutation_type='gene_set',
        outdir=out_dir,
        cls=classes)

    # plotting: horizontal bars of 'fdr' per 'Term' for the first 40 rows
    gsea_results = gs_res.res2d
    with plt.style.context('ggplot'):
        gsea_results = gsea_results.reset_index()
        gsea_results.head(40).plot.barh(y='fdr',
                                        x='Term',
                                        figsize=(18, 6),
                                        fontsize=10)
        # Flip the y axis so the first row is drawn at the top.
        plt.gca().invert_yaxis()
Exemplo n.º 16
0
def set_calculator(input_data,
                   group_info,
                   group_samples,
                   session_key,
                   set_name='USER_GENE_SET'):
    """Run GSEA for a user gene set and, if significant, a regulator screen.

    Args:
        input_data: list of gene identifiers forming the user gene set.
        group_info: group labels, one per entry of ``group_samples``.
        group_samples: list of sample-name lists, one list per group; each
            list is used to select columns of ``all_expr_df``.
        session_key: string used to build the per-session output directories.
        set_name: name written into the GMT file and used for the GSEA plot
            file name.

    Returns:
        ``(file_list, table_arr, gsea_mapping_rate)``: the template-relative
        plot paths (" " where no plot applies), the table rows of the
        regulator screen (empty when the GSEA p-value is above 0.05), and the
        percentage of input genes found in the expression index as a string
        ('Not Applicable' when the GSEA p-value is above 0.05).
    """
    def fet_f(a1, b1, total_gene_assum):
        """Fisher exact test of the overlap between *a1* and *b1*.

        Returns ``(len(a1), size of the overlap, p-value)``.
        """
        set_a, set_b = set(a1), set(b1)
        shared = set_a & set_b
        only_a = set_a - shared
        only_b = set_b - shared

        _, pvalue = stats.fisher_exact(
            [[len(shared), len(only_b)],
             [
                 len(only_a),
                 total_gene_assum - (len(shared) + len(only_b) + len(only_a))
             ]])
        return len(a1), len(shared), pvalue

    plot_path = "/home/ubuntu/django_proj/pcta_updated/main/static/images/" + session_key + "/"
    nginx_plot_path = "/home/ubuntu/django_proj/pcta_updated/main/staticimages/" + session_key + "/"

    template_plot_path = "images/" + session_key + "/"
    test_name = "USER_SET"

    # Each directory is created or emptied on its own, so one existing
    # without the other no longer breaks the setup.
    for directory in (plot_path, nginx_plot_path):
        if not os.path.exists(directory):
            os.mkdir(directory)
        else:
            for stale in glob.glob(directory + "*"):
                os.remove(stale)

    filecount = 0
    file_list = []

    pt = MY_PLOT()
    df_all = all_expr_df

    # Percentage of input genes present in the expression index.
    mapped_count = len(set(input_data).intersection(df_all.index.tolist()))
    gsea_mapping_rate = "%.2f" % (
        float(mapped_count) / float(len(input_data)) * 100)

    #############GSEA#############
    # Single-line GMT file holding the user gene set.
    fixed_path_gmt = plot_path + 'user.gmt'
    with open(fixed_path_gmt, 'w') as gmt_file:
        gmt_file.write(set_name + '\tNA\t' + '\t'.join(input_data))

    # One label per sample, as a real list: map() is a one-shot iterator on
    # Python 3.
    class_vector = [
        str(group_info[i])
        for i, samples in enumerate(group_samples)
        for _ in range(len(samples))
    ]

    df_user_s = pd.concat([df_all[s] for s in group_samples], axis=1)

    # Columns are renumbered 0..n-1 and the index becomes the first column.
    df_user_s.columns = range(len(df_user_s.columns.tolist()))
    df_user_s = df_user_s.reset_index()

    gsea_result = gseapy.gsea(data=df_user_s,
                              gene_sets=fixed_path_gmt,
                              cls=class_vector,
                              outdir=plot_path,
                              min_size=2,
                              max_size=1000,
                              weighted_score_type=1,
                              permutation_type='phenotype',
                              method='signal_to_noise',
                              ascending=False,
                              figsize=(6.5, 6),
                              format='png')
    # p-value of the first result entry; dict.keys() cannot be indexed on
    # Python 3, so take the first value through an iterator.
    pval_es = next(iter(gsea_result.results.values()))['pval']

    if pval_es <= 0.05:
        file_list.append(template_plot_path + set_name + ".gsea.png")
        filecount += 1
    else:
        file_list.append(" ")
        gsea_mapping_rate = 'Not Applicable'
        filecount += 1
    #############GSEA#############

    # Difference of per-gene medians between the first and second group.
    fold_change = all_expr_df[group_samples[0]].median(
        axis=1) - all_expr_df[group_samples[1]].median(axis=1)

    if pval_es <= 0.05:
        #############MRA#############
        mra_set_t = mra_set.T
        mra_list = list(set(mra_set.index.tolist()))
        mra_targets = [mra_set_t[x].values.tolist() for x in mra_list]
        # Normalise every target entry to a list of strings (fet_f needs
        # len()).
        mra_targets = [
            [str(v) for v in x[0]] if isinstance(x[0], list) else [str(x[0])]
            for x in mra_targets
        ]

        total_genes = len(set(mra_set.values.flatten()))

        pvals = [
            fet_f(targets, input_data, total_genes) for targets in mra_targets
        ]

        # Keep regulators with p < 0.01, fold change >= 0.1 and more than 10
        # targets; the fold change is looked up only after the p-value test.
        pvals_list = []
        for tf, (n_targets, n_mapped, pvalue) in zip(mra_list, pvals):
            if not pvalue < 0.01:
                continue
            tf_fold_change = fold_change.loc[tf]
            if tf_fold_change >= 0.1 and n_targets > 10:
                pvals_list.append([
                    tf,
                    int(n_targets),
                    int(n_mapped),
                    float("{0:.4f}".format(pvalue)),
                    float("{0:.4f}".format(tf_fold_change)),
                ])

        network_data = pd.DataFrame(
            data=pvals_list, columns=['TF', 'targets', 'mapped', 'pval', 'fc'])
        network_data = network_data.set_index('TF')
        network_data['Symbol'] = pcta_id.loc[[
            str(tf) for tf in network_data.index.tolist()
        ]]['Symbol']

        network_data[
            'prob_mapped'] = network_data['mapped'] / network_data['targets']
        network_data = network_data.sort_values('prob_mapped', ascending=False)
        network_data = network_data.loc[network_data.index.tolist()[:10]]
        network_data = network_data.round(4)
        # astype returns a new Series; the result has to be assigned back.
        network_data['targets'] = network_data['targets'].astype(int)
        network_data['mapped'] = network_data['mapped'].astype(int)

        selected_network_expr = df_all[group_samples[0]].loc[
            network_data.index.tolist()[:10]]
        selected_network_expr['Symbol'] = pcta_id.loc[[
            str(tf) for tf in selected_network_expr.index.tolist()
        ]]['Symbol']

        network_data = network_data.set_index('Symbol')
        selected_network_expr = selected_network_expr.set_index('Symbol')

        pt.network_plot(network_data,
                        selected_network_expr,
                        tit=group_info[0],
                        filename=plot_path + test_name + str(filecount))
        file_list.append(template_plot_path + test_name + str(filecount) +
                         ".png")

        network_data = network_data[[
            'targets', 'mapped', 'prob_mapped', 'fc', 'pval'
        ]]

        # Table rows: symbol, then the values with the two counts as ints.
        table_arr = []
        for symbol in network_data.index.tolist():
            values = network_data.loc[symbol].tolist()
            table_arr.append([symbol, int(values[0]), int(values[1])] +
                             values[2:])

        network_data.to_csv(plot_path + 'mra_candidates.csv')

        # Mirror the session directory into the second static directory.
        os.system("cp -rf %s/* %s" % (plot_path, nginx_plot_path))

        #############MRA#############

    else:
        table_arr = []
        file_list.append(" ")

    return file_list, table_arr, gsea_mapping_rate
Exemplo n.º 17
0
import gseapy as gp
import numpy as np
from os.path import join
import pandas as pd

# Load the TLX3-vs-RAG gene results table, using its first column as index.
tbl = pd.read_table(join('tracks', 'TLX3vsRAG-results_genes.txt'), index_col=0)

# Keep rows with padj < 0.05 and drop rows that contain missing values.
tbl = tbl[(tbl.padj < 0.05)].dropna()
# Annotation table; the columns are renamed on load and the first one
# ('GeneID') becomes the index.
names = pd.read_table(
    "tracks/annot_tracks/references/mm9/mm9_EnsemblTransc_GeneNames.txt",
    index_col=0,
    header=0,
    names=['GeneID', 'TransID', 'Gene_name'])

# Drop the transcript column, remove duplicate rows, and select the rows
# matching the filtered results table.
names = names.drop('TransID', axis=1).drop_duplicates()
names = names.loc[tbl.index]

# Put the gene names next to the result columns.
tbn = pd.concat([names, tbl], axis=1)

# Remove the listed statistics columns from the combined table.
tbn = tbn.drop(
    ['baseMean', 'log2FoldChange', 'lfcSE', 'stat', 'pvalue', 'padj'], axis=1)

# Upper-case the gene names and replace the index with a fresh 0..n-1 one.
tbn['Gene_name'] = tbn['Gene_name'].str.upper()
tbn = tbn.reset_index(drop=True)

## ==== GSEAPY
# One class label per sample column.
# NOTE(review): assumes the remaining columns are three RAG samples followed
# by three TLX3 samples — confirm against the input file.
classes = ['RAG', 'RAG', 'RAG', 'TLX3', 'TLX3', 'TLX3']
g_set = 'Reactome_2013'
gs_res = gp.gsea(data=tbn, gene_sets=g_set, outdir='_test', cls=classes)
Exemplo n.º 18
0
Arquivo: a_cov.py Projeto: numpde/cbb
# ... as a dictionary: gene set name -> gene set
genesets = df_genesets.T.symbols.to_dict()
# print("Gene sets:", genesets)

# Gene set enrichment analysis
# The gsea module produces GSEA results. (https://pypi.org/project/gseapy/)
# http://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Main_Page
import gseapy

print(F"Running GSEA on {len(df_cov2)} genes")
# Rearrange columns to have SARS samples first, Mock samples second
df_cov2 = df_cov2[list(calu3.sort_values().index)]
gsea_res = gseapy.gsea(data=df_cov2,
                       gene_sets=genesets,
                       cls=list(~calu3),
                       outdir=str(ROOT / "gsea"),
                       min_size=2)

# One row per gene set. The rows are collected in a list (not a set) so the
# row order and index follow the order of gsea_res.results on every run.
df_gsea_results = pd.DataFrame(
    data=[(gs, info['fdr'], info['nes'], info['es'], info['pval'])
          for (gs, info) in gsea_res.results.items()],
    columns=["Geneset", "FDR", "Norm. score", "Score", "p-value"],
)
# Highest normalized enrichment score first.
df_gsea_results = df_gsea_results.sort_values("Norm. score", ascending=False)
print(df_gsea_results.to_markdown())

# |    | Geneset                                    |      FDR |   Norm. score |     Score |   p-value |
# |---:|:-------------------------------------------|---------:|--------------:|----------:|----------:|
# |  4 | HALLMARK_IL6_JAK_STAT3_SIGNALING           | 0.341983 |      1.43446  |  0.875    | 0.0584046 |
# |  6 | HALLMARK_TNFA_SIGNALING_VIA_NFKB           | 0.204814 |      1.39715  |  0.785688 | 0.103362  |