示例#1
0
def vec_enrich(vec, gene_ids, quantile, gene_sets):
    """Run an Enrichr analysis on the features scoring below a quantile.

    Features whose value in ``vec`` is strictly smaller than the
    ``quantile``-quantile of ``vec`` are selected, their Ensembl ids are
    stripped of the version suffix, mapped to gene names through Biomart
    and submitted to Enrichr.

    Parameters
    ----------
    vec : array-like of numbers
        One score per feature.
    gene_ids : array-like
        Ensembl gene ids aligned with ``vec``; must support boolean-mask
        indexing (e.g. a NumPy array).
    quantile : float
        Quantile in ``[0, 1]`` passed to ``np.quantile``.
    gene_sets
        Passed through unchanged to ``gp.enrichr``.

    Returns
    -------
    The object returned by ``gp.enrichr``.
    """
    ind = np.quantile(vec, quantile) > vec
    print("... {} features selected...".format(int(np.sum(ind))))
    genes = gene_ids[ind]

    # remove ens id version (everything from the first "." onwards)
    version_suffix = re.compile(r"\..*$")
    genes = [version_suffix.sub("", g) for g in genes]

    print("Mapping to gene names...")
    # map ens ids to gene symbols
    bm = Biomart()
    bm_result = bm.query(
        dataset="hsapiens_gene_ensembl",
        attributes=[
            "ensembl_gene_id",
            "external_gene_name",
            "entrezgene_id",
            "go_id",
        ],
        filters={"ensembl_gene_id": genes},
    )
    # Ids without a gene name come back as missing values; drop them so a
    # NaN entry is never submitted to Enrichr as if it were a gene symbol.
    gene_symbols = list(bm_result["external_gene_name"].dropna().unique())

    print("Calculating enrichment...")
    enr = gp.enrichr(
        gene_list=gene_symbols,
        gene_sets=gene_sets,
        organism="Human",
        cutoff=0.05,
    )
    return enr
示例#2
0
    def get_background(self):
        """Return the background genes as a set.

        ``self.background`` is interpreted in this order:

        1. a path to an existing file with one gene per line (blank lines
           are ignored);
        2. the name of a background table shipped in the ``gseapy``
           package data;
        3. otherwise a Biomart dataset name, which is queried and
           restricted to rows that have a ``go_id``.

        For cases 2 and 3, rows without an ``entrezgene_id`` are dropped
        and the set holds integer Entrez ids when ``self._isezid`` is
        true, gene names otherwise.
        """

        # input is a file
        if os.path.isfile(self.background):
            with open(self.background) as handle:
                stripped = (line.strip() for line in handle)
                # skip blank lines so "" never ends up in the background
                return {gene for gene in stripped if gene}

        # package included data
        DB_FILE = resource_filename("gseapy", "data/{}.background.genes.txt".format(self.background))
        if os.path.exists(DB_FILE):
            df = pd.read_csv(DB_FILE, sep="\t")
        else:
            # background is a biomart database name
            self._logger.warning("Downloading %s for the first time. It might take a couple of miniutes.", self.background)
            bm = Biomart()
            df = bm.query(dataset=self.background)
            df.dropna(subset=['go_id'], inplace=True)
        self._logger.info("Using all annotated genes with GO_ID as background: %s", self.background)
        df.dropna(subset=['entrezgene_id'], inplace=True)
        # input id type: entrez or gene_name
        if self._isezid:
            bg = df['entrezgene_id'].astype(int)
        else:
            bg = df['external_gene_name']

        return set(bg)
示例#3
0
文件: enrichr.py 项目: yxngl/GSEApy
    def get_background(self):
        """Load the background gene table for ``self.background``.

        A copy under ``DEFAULT_CACHE_PATH`` is preferred, then the file
        shipped in the ``gseapy`` package data. When neither exists the
        table is queried from Biomart and rows without a ``go_id`` are
        dropped. In every case rows without an ``entrezgene`` value are
        removed before the table is returned.
        """
        packaged = resource_filename("gseapy", "data/{}.background.genes.txt".format(self.background))
        cached = os.path.join(DEFAULT_CACHE_PATH, "{}.background.genes.txt".format(self.background))
        # first existing candidate wins: the cache, then the packaged file
        local_copy = next((path for path in (cached, packaged) if os.path.exists(path)), None)

        if local_copy is None:
            self._logger.warning("Downloading %s for the first time. It might take a couple of miniutes."%self.background)
            table = Biomart().query(dataset=self.background)
            table.dropna(subset=['go_id'], inplace=True)
        else:
            table = pd.read_table(local_copy)

        self._logger.info("using all annotated genes with GO_ID as background genes")
        table.dropna(subset=['entrezgene'], inplace=True)
        return table
示例#4
0
# Script: rank marker genes per group, then for every group query Biomart
# for the GO ids of its marker genes and run a KEGG enrichment with Enrichr.
adata = sc.AnnData(adata)
# np.str was only an alias of the builtin str and was removed in NumPy 1.24,
# so the builtin is used directly.
adata.obs["group"] = group.astype(str)

# rank gene by importance for clusters
sc.tl.rank_genes_groups(adata, "group", n_genes=500)

r = adata.uns['rank_genes_groups']['names']
# one column of ranked gene names per group; built once instead of twice
# per loop iteration
marker_genes = pd.DataFrame.from_records(r)

# all per-group outputs are written below "test"
os.makedirs("test", exist_ok=True)

for x in marker_genes.columns:
    print("group :"+x, end="\r")
    # marker genes of the current group
    glist = marker_genes[x].tolist()
    bm = Biomart()
    results = bm.query(dataset='hsapiens_gene_ensembl',
                       attributes=['external_gene_name', 'go_id'],
                       filters={'hgnc_symbol': glist},
                       # save output file
                       filename="test/query_"+x+".results.txt")

    enr = gp.enrichr(gene_list=glist,
                     description='test_name',
                     gene_sets=['KEGG_2016'],
                     outdir="test/enrichr_kegg_group"+x,
                     cutoff=0.5  # test dataset, use lower value from range(0,1)
                     )
示例#5
0
def _read_scores(path):
    """Return ``(gene_ids, scores)`` read from a tab-separated score table.

    The first line is treated as a header and skipped. For every other
    non-blank line the first column is taken as the gene id and the last
    column is parsed as a float score.
    """
    gene_ids = []
    scores = []
    with open(path, "r") as handle:
        next(handle, None)  # skip the header line
        for line in handle:
            if not line.strip():
                continue
            fields = line.rstrip("\r\n").split("\t")
            gene_ids.append(fields[0])
            scores.append(float(fields[-1]))
    return gene_ids, scores


def _top_genes(gene_ids, scores, count):
    """Return the ``count`` gene ids with the highest numeric score, best first."""
    # Scores are compared as numbers; comparing them as text would rank
    # "9.5" above "10.2".
    ascending = sorted(range(len(scores)), key=scores.__getitem__)
    return [gene_ids[i] for i in ascending[::-1][:count]]


def _merge_reports(libs, go_name, merged_path, p_threshold):
    """Concatenate the per-library Enrichr reports and collect significant terms.

    Every report ``<go_name>/<lib>.<go_name>.enrichr.reports.txt`` is
    appended to ``merged_path`` (the header line is written once). Rows
    whose value in column 5 is below ``p_threshold`` are collected.

    Returns ``(terms, p_values, gene_counts)`` as three parallel lists.
    """
    terms = []
    p_values = []
    gene_counts = []
    header_written = False
    with open(merged_path, "w") as merged:
        for lib in libs:
            report = go_name + "/" + lib + "." + go_name + ".enrichr.reports.txt"
            with open(report, "r") as handle:
                header = handle.readline()
                if not header_written:
                    merged.write(header)
                    header_written = True
                for line in handle:
                    if not line.strip():
                        continue
                    merged.write(line)
                    fields = line.rstrip("\r\n").split("\t")
                    p_value = float(fields[4])
                    if p_value >= p_threshold:
                        continue
                    # keep only the trailing token of the term with its
                    # surrounding parentheses removed
                    terms.append(fields[1].split(" ")[-1].strip('(').strip(')'))
                    p_values.append(p_value)
                    # NOTE(review): assumes the ";"-separated gene list is
                    # the last column of the report - confirm against the
                    # installed GSEApy version. Column 5 is the p-value, so
                    # splitting it always yielded a count of 1.
                    gene_counts.append(len(fields[-1].split(";")))
    return terms, p_values, gene_counts


def main():
    """Run a GO enrichment on the top-scoring genes and plot the result.

    Command-line arguments (from the module-level ``parser``):
    ``input`` is a tab-separated score table, ``threshold`` the p-value
    cut-off for plotted terms and ``number`` how many top genes to use.

    Writes ``query.results.txt``, the Enrichr output directory ``TEST``,
    ``GO_reports.txt`` and the two figures ``GO enrichment barh.png`` and
    ``GO enrichment scatter.png`` in the working directory.
    """
    GO_name = "TEST"
    args = parser.parse_args()
    score_path = args.input  # e.g. "score_D.txt"
    p_threshold = float(args.threshold)  # e.g. 0.05
    num = int(args.number)

    # get the gene ids with their scores and keep the best `num` of them
    gene_ids, scores = _read_scores(score_path)
    top_genes = _top_genes(gene_ids, scores, num)

    # use BioMart and convert ensembl_id to gene_symbol
    bm = Biomart(verbose=False, host="asia.ensembl.org")
    results = bm.query(dataset='hsapiens_gene_ensembl',
                       attributes=["ensembl_gene_id", "hgnc_symbol"],
                       filters={'ensembl_gene_id': top_genes},
                       filename="query.results.txt")

    # ids without a symbol are not strings in the result column; drop them
    gene_list = [g for g in results.hgnc_symbol.tolist() if isinstance(g, str)]

    libs = [
        "GO_Biological_Process_2018", "GO_Cellular_Component_2018",
        "GO_Molecular_Function_2018"
    ]
    gseapy.enrichr(gene_list=gene_list,
                   description=GO_name,
                   gene_sets=libs,
                   outdir=GO_name)

    # merge the reports and keep the terms below the p-value threshold
    terms, p_values, gene_counts = _merge_reports(
        libs, GO_name, "GO_reports.txt", p_threshold)

    # -log10(p); a p-value of exactly 0 is clamped so the log is defined
    smallest_p = 1e-300
    go_values = [-math.log10(max(p, smallest_p)) for p in p_values]

    # horizontal bar chart of -log10(p) per term
    fig = plt.figure(figsize=(9, 6))
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False

    positions = range(len(go_values))
    plt.barh(positions,
             go_values,
             height=0.7,
             color='steelblue',
             alpha=0.8)
    plt.yticks(positions, terms)
    plt.xlim(0, 10)
    plt.xlabel("-LgP")
    plt.title("GO enrichment")
    for row, value in enumerate(go_values):
        plt.text(value + 0.2, row - 0.1, '%s' % value)

    # save before show so the file never depends on the window's state
    fig.savefig("GO enrichment barh.png")
    plt.show()

    # scatter plot: colour encodes -log10(p), marker size the gene count
    fig = plt.figure(figsize=(9, 6))
    sizes = [100 * count for count in gene_counts]
    points = plt.scatter(go_values, terms, c=go_values, vmin=0, s=sizes,
                         cmap='RdYlGn')
    plt.colorbar(points)
    fig.savefig("GO enrichment scatter.png")
    plt.show()