forked from fortuno/mutanome-project
/
GroupGenes.py
104 lines (89 loc) · 3.86 KB
/
GroupGenes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import argparse
import os
import pickle
import urllib.parse
import urllib.request

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm
def createGeneIndex(mafDir, cancerTypes, pipelines, geneIndexFile):
    """Build the index of unique genes found in the MAF files and pickle it.

    For every cancer type and pipeline, the file
    ``<mafDir>/<cancerType>/<pipeline>.maf`` is read (its first five lines
    are skipped) and the distinct values of its ``Entrez_Gene_Id`` column
    are collected.

    The pickled frame has one row per gene. It is indexed by
    ``Entrez_Gene_Id`` and also keeps that value as a regular column. The
    ``uniProt`` and ``pdb_files`` columns are left empty so they can be
    populated later, and each pipeline column holds 1 where that pipeline
    reported the gene (NaN otherwise).

    Args:
        mafDir: Directory containing one sub-directory per cancer type.
        cancerTypes: Cancer type names (a single name is also accepted).
        pipelines: Pipeline names; each one maps to ``<pipeline>.maf``.
        geneIndexFile: Path the resulting frame is pickled to.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(cancerTypes, str):
        cancerTypes = [cancerTypes]
    pipelines = list(pipelines)

    # Collect gene ids in sets first: growing a DataFrame row by row is
    # quadratic, and DataFrame.append no longer exists in pandas >= 2.0.
    genesByPipeline = {pipeline: set() for pipeline in pipelines}
    for cancerType in tqdm(cancerTypes):
        cancerDir = os.path.join(mafDir, cancerType)
        for pipeline in pipelines:
            filepath = os.path.join(cancerDir, pipeline + '.maf')
            # Only the gene id column is needed, so skip parsing the rest.
            mafDF = pd.read_table(filepath, skiprows=5,
                                  usecols=['Entrez_Gene_Id'])
            # Rows without a gene id cannot be keyed, so they are ignored.
            geneIds = mafDF['Entrez_Gene_Id'].dropna().unique()
            genesByPipeline[pipeline].update(geneIds)

    allGenes = sorted(set().union(*genesByPipeline.values()))
    # Columns absent from the data dict are created empty (NaN).
    geneFrame = pd.DataFrame(
        {'Entrez_Gene_Id': allGenes},
        columns=['Entrez_Gene_Id', 'uniProt', 'pdb_files'] + pipelines,
    )
    for pipeline in pipelines:
        seen = geneFrame['Entrez_Gene_Id'].isin(genesByPipeline[pipeline])
        geneFrame.loc[seen, pipeline] = 1

    # drop=False keeps the id available as a column as well as the index,
    # because mapUniProt reads indexFrame['Entrez_Gene_Id'].
    geneFrame = geneFrame.set_index('Entrez_Gene_Id', drop=False)

    # now pickle the geneFrame
    geneFrame.to_pickle(geneIndexFile)
def getUniprot(Entrez, split=True):
    """Return the first UniProt mapping reported for an Entrez gene id.

    Posts the id to the UniProt ``uploadlists`` mapping service, requesting
    a tab-separated mapping from ``P_ENTREZGENEID`` to ``ID``.

    Args:
        Entrez: Entrez gene id, sent as the ``query`` field of the request.
        split: When true, return only the second tab-separated field of
            the second line of the response (presumably the first mapping
            row after a header line). When false, return the whole decoded
            response.

    Returns:
        str: The extracted field, or the full response text when ``split``
        is false or the response does not have the expected shape.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
    """
    # NOTE(review): UniProt has since introduced a REST ID-mapping API
    # (rest.uniprot.org/idmapping); confirm this legacy endpoint still
    # responds before relying on it.
    url = 'https://www.uniprot.org/uploadlists/'
    params = {
        'from': 'P_ENTREZGENEID',
        'to': 'ID',
        'format': 'tab',
        'query': Entrez
    }
    data = urllib.parse.urlencode(params).encode("utf-8")
    request = urllib.request.Request(url, data)
    contact = "aa@uchicago.edu"
    # Please set a contact email address here to help us debug in case of
    # problems (see https://www.uniprot.org/help/privacy).
    request.add_header('User-Agent', 'Python %s' % contact)

    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(request) as response:
        page = response.read(200000).decode("utf-8")

    if not split:
        return page
    try:
        return page.split('\n')[1].split('\t')[1]
    except IndexError:
        # No second line or no second column: hand back the raw response,
        # as before, so the caller can inspect it.
        return page
def mapUniProt(geneIndexFile):
    """Fill the ``uniProt`` column of the pickled gene index.

    Loads the frame stored at ``geneIndexFile``, calls ``getUniprot`` once
    per Entrez gene id (with a progress bar), stores the results in the
    ``uniProt`` column, and writes the updated frame back to the same file.

    Args:
        geneIndexFile: Path of the pickled gene index; it is overwritten
            with the updated frame.
    """
    tqdm.pandas(desc="mapping entrez to uniProt")
    # NOTE: unpickling can execute arbitrary code; only load index files
    # produced by this script.
    indexFrame = pd.read_pickle(geneIndexFile)

    # The ids may live in a column or, after set_index, only in the index.
    if 'Entrez_Gene_Id' in indexFrame.columns:
        entrezIds = indexFrame['Entrez_Gene_Id']
    else:
        entrezIds = indexFrame.index.to_series()

    indexFrame['uniProt'] = entrezIds.progress_apply(getUniprot)

    # Persist the result; otherwise every lookup above is thrown away.
    indexFrame.to_pickle(geneIndexFile)
def associatePDB(geneIndexFile):
    """Associate each Entrez gene id with a sequence of pdb files.

    Not implemented yet: the function currently does nothing. The planned
    approach is a greedy algorithm that determines which sequences to use.
    """
    return None
def main():
    """Command-line entry point: (re)build the gene index in stages.

    The three stages run in order: createGeneIndex, mapUniProt,
    associatePDB. ``--resetIndex`` (or a missing index file) runs all of
    them, ``--resetUniProt`` runs the last two, and ``--resetPdb`` runs
    only the last. With no flag and an existing index file nothing is done.
    """
    # Parse Args
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--resetIndex', action='store_true',
                            help='regenerate the index?')
    arg_parser.add_argument('--resetUniProt', action='store_true',
                            help='call mapping servers to re-associate entrez with Uniprot?')
    arg_parser.add_argument('--resetPdb', action='store_true',
                            help='ping servers to re-associate geneIds with pdb files')
    # The flags only exist on the parsed namespace, never as bare names.
    args = arg_parser.parse_args()

    cancerTypes = ['BRCA', 'GBM', 'OV', 'LUAD', 'UCEC', 'KIRC',
                   'HNSC', 'LGG', 'THCA', 'LUSC', 'PRAD', 'SKCM',
                   'COAD', 'STAD', 'BLCA', 'LIHC', 'CESC', 'KIRP',
                   'SARC', 'LAML', 'ESCA', 'PAAD', 'PCPG', 'READ',
                   'TGCT', 'THYM', 'KICH', 'ACC', 'MESO',
                   'UVM', 'DLBC', 'UCS', 'CHOL']
    #######DEBUG#####
    # Restrict the run to 'GBM'. A slice keeps this a list; plain indexing
    # would hand createGeneIndex a bare string.
    cancerTypes = cancerTypes[1:2]
    #######DEBUG#####
    pipelines = ['Muse', 'Mutect', 'Somatic_Sniper', 'Varscan']
    mafDir = "MafArchive/"
    geneIndexFile = "geneIndex.pkl"

    if args.resetIndex or not os.path.exists(geneIndexFile):
        createGeneIndex(mafDir, cancerTypes, pipelines, geneIndexFile)
        mapUniProt(geneIndexFile)
        associatePDB(geneIndexFile)
    elif args.resetUniProt:
        mapUniProt(geneIndexFile)
        associatePDB(geneIndexFile)
    elif args.resetPdb:
        associatePDB(geneIndexFile)
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()