예제 #1
0
def pathway_enrichment(gene_names, pipe_section=1, dbs=None, total_genes=20531, p_cutoff=0.05, cache_path='../data/cache/'):
    """Test which pathways are over-represented among the given genes.

    Each gene identifier is looked up through MyGene.info to collect the
    pathways it is annotated with. For every pathway found, the total number
    of genes annotated with it is queried and a hypergeometric survival
    function yields the probability of observing at least as many selected
    genes in that pathway by chance.

    Args:
        gene_names: Iterable of strings of '|'-separated fields; the field at
            position ``pipe_section`` is the identifier sent to MyGene.info.
        pipe_section: Index of the '|'-separated field holding the gene id.
        dbs: Optional collection of pathway database keys to keep (the keys
            of the MyGene.info ``pathway`` field). ``None`` keeps every
            database.
        total_genes: Population size used by the hypergeometric test.
        p_cutoff: Pathways with a p-value above this threshold are dropped.
        cache_path: Directory holding the MyGene.info request cache; it is
            created when missing.

    Returns:
        ``None`` when the lookup result carries no ``pathway`` column.
        Otherwise a DataFrame sorted by ascending ``p_value`` with columns
        ``id``, ``name``, ``db``, ``genes`` (list of matching query ids),
        ``sup`` (number of matching genes), ``size`` (total genes annotated
        with the pathway), ``p_value`` and ``ratio`` (``sup / size``). The
        frame is empty when no pathway annotation survives the filters.
    """
    result_columns = ['id', 'name', 'db', 'genes', 'sup', 'size', 'p_value', 'ratio']

    # The directory has to exist before the client is told to keep its cache
    # database inside it.
    os.makedirs(cache_path, exist_ok=True)
    mg = MyGeneInfo()
    mg.set_caching(cache_db=os.path.join(cache_path, 'mygene_cache'), verbose=False)

    gene_ids = [g.split('|')[pipe_section] for g in gene_names]
    gene_info = mg.getgenes(geneids=gene_ids, fields='pathway', as_dataframe=True, df_index=False)
    try:
        pathways = gene_info['pathway']
    except (KeyError, TypeError) as e:
        print(e)
        print('No pathways found with the selected genes:')
        print(gene_names)
        return None

    # One row per (pathway, gene) pair. Genes without annotation show up as
    # NaN in the 'pathway' column and are skipped.
    rows = []
    for query, annotation in zip(gene_info['query'], pathways):
        if not isinstance(annotation, dict):
            continue
        for db, entries in annotation.items():
            if dbs is not None and db not in dbs:
                continue
            # A database holds either a single pathway dict or a list of them.
            if not isinstance(entries, list):
                entries = [entries]
            for entry in entries:
                rows.append([entry['id'], entry['name'], db, str(query)])

    p_df = pd.DataFrame(rows, columns=['id', 'name', 'db', 'genes'])
    if p_df.empty:
        return pd.DataFrame(columns=result_columns)

    # Collapse to one row per pathway, gathering its genes into a list.
    # Grouping with the keys as index and resetting afterwards yields exactly
    # the columns id, name, db, genes regardless of the pandas version.
    p_df = p_df.groupby(['id', 'name', 'db'])['genes'].apply(list).reset_index()

    # size=0 asks only for the hit count, i.e. how many genes carry the pathway.
    n_pathways = p_df.shape[0]
    pathway_size = []
    for idx, (db, pathway_id) in enumerate(zip(p_df['db'], p_df['id'])):
        if idx % 50 == 0:
            print('querying {}/{}'.format(idx, n_pathways))
        response = mg.query('pathway.{}.id:{}'.format(db, pathway_id), size=0, verbose=False)
        pathway_size.append(response['total'])

    p_df['sup'] = p_df['genes'].map(len)
    p_df['size'] = pathway_size

    # P(X >= sup): sf(k) is P(X > k), hence the "- 1".
    p_df['p_value'] = hypergeom.sf(p_df['sup'] - 1, total_genes, p_df['size'], len(gene_names))

    # Copy so that adding 'ratio' does not write into a view of the unfiltered frame.
    p_df = p_df[p_df['p_value'] <= p_cutoff].copy()
    p_df['ratio'] = p_df['sup'] / p_df['size']

    return p_df.sort_values(by=['p_value']).reset_index(drop=True)
예제 #2
0
def ensemble_to_symbol(ens):
    """Look up gene symbols for the given gene ids through MyGene.info.

    Args:
        ens: Gene identifiers passed to ``MyGeneInfo.getgenes``.

    Returns:
        A ``(gene_symbol, gene_id)`` tuple of arrays with one entry per
        distinct query: ``gene_symbol`` holds the ``symbol`` column as
        returned, and ``gene_id`` holds ``'<symbol>|<query>'`` strings in
        which a missing value is written as ``'?'``.
    """
    client = MyGeneInfo()
    hits = client.getgenes(geneids=ens, fields='symbol', as_dataframe=True, df_index=False)

    # Keep only the first hit for each queried id.
    unique_hits = hits.drop_duplicates('query').reset_index()

    symbols = unique_hits['symbol']
    labels = symbols.str.cat([unique_hits['query']], sep='|', na_rep='?')

    return symbols.values, labels.values