/
oxyphen_multinome.py
240 lines (173 loc) · 7.99 KB
/
oxyphen_multinome.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_selection import RFE
from Bio.ExPASy import Enzyme
import numpy as np
import pandas as pd
import os, glob
# Read the SETTINGS configuration file once at import time, split into lines.
# Fix: the original leaked the file handle (open() with no close); a context
# manager guarantees it is closed.
with open("SETTINGS", "r") as _settings_handle:
    CONFIG_FILE = _settings_handle.read().splitlines()

# Refuse to run when a previous results table exists so results from
# different runs are never silently appended together; otherwise open the
# global results table in append mode for the whole session.
if os.path.isfile("OUTPUT/results_table.tsv"):
    print("OUTPUT FILE EXISTS, PLEASE MOVE/REMOVE YOUR PREVIOUS RESULTS!!!")
    exit()
else:
    GLOBAL_RESULTS = open("OUTPUT/results_table.tsv", "a")
def read_config(config_lines=None):
    """
    Parse the SETTINGS configuration lines.

    Fixes over the original: missing BLAST_PATH/NUM_THREADS keys no longer
    raise NameError (sane defaults are returned instead), values are
    whitespace-stripped, and the split uses maxsplit=1 so values containing
    '=' are kept intact.

    Parameters:
      config_lines -- optional list of "KEY=VALUE" strings; defaults to the
                      module-level CONFIG_FILE read from SETTINGS at import.

    Returns:
      (blast_path, num_threads, multinome_folder) -- blast binary directory,
      thread count as a float (kept float for backward compatibility; it is
      only ever used via a "%d" format), and the proteomes folder ("" when
      PROTEOMES_FOLDER is absent or empty, i.e. single-proteome mode).
    """
    if config_lines is None:
        config_lines = CONFIG_FILE
    blast_path = ""
    num_threads = 1.0
    multinome_folder = ""
    for line in config_lines:
        if line.startswith("BLAST_PATH"):
            blast_path = line.split("=", 1)[1].strip()
        elif line.startswith("NUM_THREADS"):
            num_threads = float(line.split("=", 1)[1])
        elif line.startswith("PROTEOMES_FOLDER") and line.split("=", 1)[1]:
            ### enter multinome mode
            multinome_folder = line.split("=", 1)[1].strip()
    return blast_path, num_threads, multinome_folder
def do_oxyphen(proteome, output_filename, ec_classes_file):
    '''
    Run the full Oxyphen annotation pipeline for a single proteome FASTA file.

    Steps (intermediate files are written under DATA/):
      1. Parse ExPASy enzyme.dat into a per-EC table (DATA/ec_uniprot.tsv).
      2. Keep only the EC classes listed in DATA/oxygen_ecclasses.
      3. Collect the UniProt accessions linked to those ECs.
      4. Build a BLAST database from DATA/sprot_subset.fasta and blastp the
         proteome against it (via os.system).
      5. Filter hits by e-value / identity / coverage, map filtered hits back
         to EC numbers, and write the annotation table to output_filename.
      6. Write the detected EC classes to ec_classes_file and append one
         summary row to the module-level GLOBAL_RESULTS table.

    Parameters:
      proteome        -- path to the query proteome FASTA file
      output_filename -- path of the per-proteome annotation TSV to write
      ec_classes_file -- path of the file receiving tab-separated EC classes
    '''
    # --- Step 1: parse enzyme.dat into a per-EC dictionary ---
    input_name = "DATA/enzyme.dat"
    output_name = "DATA/ec_uniprot.tsv"
    ### program ###
    handle = open(input_name)
    records = Enzyme.parse(handle)
    out = dict() #dict of dicts, first key: EC number, second key: field
    transferred = dict() #dict of lists
    for record in records:
        if 'Transferred entry:' in record['DE']:
            # Normalize the free-text "Transferred entry: x.x.x.x and y..." line
            # into a plain whitespace-separated list of target EC numbers.
            record['DE'] = record['DE'].rstrip('.') #remove period
            record['DE'] = record['DE'].replace('Transferred entry:',' ') #remove title
            record['DE'] = record['DE'].replace(',',' ') #remove commas
            record['DE'] = record['DE'].replace('and',' ') #remove and
            point_to = record['DE'].split()
            transferred[record['ID']] = point_to
        else:
            out[record['ID']] = dict()
            # record['DR'] holds (accession, entry_name) pairs; keep accessions only.
            out[record['ID']]['uniprot'] = ' '.join([x[0] for x in record['DR']])
            out[record['ID']]['description'] = record['DE']
            out[record['ID']]['transferred'] = False
    # Transferred entries are deliberately excluded from the table (the block
    # below that would merge them back in is disabled).
    # for id in transferred:
    # out[id] = dict()
    # out[id]['uniprot'] = ' '.join([out[x]['uniprot'] for x in transferred[id]])
    # out[id]['description'] = 'Transferred entry: ' + ' '.join(transferred[id])
    # out[id]['transferred'] = True
    df = pd.DataFrame.from_dict(out, orient='index')
    df.index.name = 'EC'
    df.to_csv(output_name, sep='\t')
    '''
    Take a subset of ecs of interest
    '''
    # --- Step 2: keep only oxygen-utilizing EC classes ---
    # str.startswith accepts a tuple, so `oxidases` doubles as a multi-prefix filter.
    oxidases = tuple(open("DATA/oxygen_ecclasses", "r").read().splitlines())
    infile = open("DATA/ec_uniprot.tsv", "r").readlines()
    outfile = open("DATA/ec_uniprot_oxidases.tsv", "w")
    for line in infile:
        if line.startswith("EC"):
            # header line of the TSV
            outfile.write(line)
        elif line.startswith(oxidases):
            outfile.write(line)
    outfile.close()
    '''
    write a file with one uniprot ID per line, containing all of the
    uniprot IDs mentioned in uniprot column of the input file
    Ignore EC numbers that have been transferred
    '''
    # --- Step 3: unique UniProt accession list ---
    input = "DATA/ec_uniprot_oxidases.tsv"  # NOTE(review): shadows builtin input()
    output = "DATA/uniprot_ids.txt"
    df = pd.read_table(input)
    df.dropna(subset=['uniprot'], inplace=True) #ignore EC numbers with no uniprot ids associated
    #df = df[df.transferred == False] #ignore EC numbers that are obsolete due to transfer
    unique_uniprot = set(" ".join(df.uniprot.values).split(" "))
    with open(output, "w") as outfile:
        for id in unique_uniprot:  # NOTE(review): `id` shadows the builtin
            outfile.write(id + "\n")
    outfile.close()  # redundant: the with-block already closed the file
    '''
    Make blastdb out of the swissprot subset
    '''
    # --- Step 4: build BLAST db and run blastp ---
    blast_path, num_threads, multinome_folder = read_config()
    os.system("%s -in DATA/sprot_subset.fasta -dbtype prot -out DATA/sprot_subset -hash_index" % (os.path.join(blast_path, "makeblastdb")))
    '''
    Blast our pre-selected proteomes against the uniprot subset
    '''
    print "Performing Blast searches against oxygen-utilizing database..."
    # %d truncates num_threads (a float from read_config) to an integer.
    os.system("%s -max_target_seqs 1 -outfmt '6 qseqid sseqid pident evalue qcovs' -query %s -db DATA/sprot_subset -out DATA/new_sequences_sprot_enzyme.tab -num_threads %d" % (os.path.join(blast_path, "blastp"), proteome, num_threads) )
    '''
    Filter Blast output.
    '''
    # --- Step 5: filter hits and map back to EC numbers ---
    evalue = 10e-3  # NOTE(review): 10e-3 == 0.01; possibly 1e-3 was intended — confirm
    identity = 40.0  # minimum percent identity
    coverage = 40.0  # minimum query coverage (qcovs)
    print "Filtering Blast output: evalue",evalue, " identity", identity, " coverage", coverage
    hits_table_file_name = "DATA/new_sequences_sprot_enzyme.tab"
    hits_table_file_name_filtered_out = open("DATA/new_sequences_sprot_enzyme_filtered.tab", "w")
    hits_table_file_name_filtered_out.write("\t".join(["hit","subject","id","len","eval","cov"])+"\n")
    for line in open(hits_table_file_name, "r").read().splitlines():
        if line.startswith("#"): continue
        # NOTE(review): `eval` shadows the builtin; columns follow the
        # "6 qseqid sseqid pident evalue qcovs" outfmt requested above.
        query, target, ident, eval, cover = line.split("\t")
        eval = float(eval)
        ident = float(ident)
        cover = float(cover)
        if eval <= evalue and ident >= identity and cover >= coverage:
            hits_table_file_name_filtered_out.write(line+"\n")
    hits_table_file_name_filtered_out.close()
    hits_table_file_name_filtered = "DATA/new_sequences_sprot_enzyme_filtered.tab"
    enzyme_table_file_name = 'DATA/ec_uniprot_oxidases.tsv'
    hits = pd.read_csv(hits_table_file_name_filtered, sep="\t", header=0)
    enzyme = pd.read_csv(enzyme_table_file_name, sep="\t", header=0)
    hits.fillna('', inplace=True) #replace empty values with blank spaces
    enzyme.fillna('', inplace=True)
    enzyme = enzyme[enzyme.transferred == False] #drop transferred EC numbers
    # Assumes subject ids look like 'sp|P12345|NAME' so chars 3:9 are the
    # 6-character accession — TODO confirm against the sprot_subset headers.
    hits.subject = hits.subject.str[3:9] #take just the uniprot ID from the name
    def get_ecs(uniprot):
        # Map one UniProt accession to a space-separated string of matching ECs.
        if uniprot == '': #ignore invalid uniprot ids
            return ''
        else:
            return ' '.join(enzyme.EC[enzyme.uniprot.str.contains(uniprot)].values)
    hits['EC'] = hits.subject.apply(get_ecs)
    output_file_name = output_filename
    hits.to_csv(output_file_name, sep="\t", index=False)
    ### read final mapping output
    # --- Step 6: summarize EC classes and append to the global results table ---
    mapping_out = open(output_file_name, "r").read().splitlines()
    ecs_dict = {}  # EC number -> list of query ids hitting that EC
    for line in mapping_out[1:]:
        splitted = line.split("\t")
        ecs = splitted[-1]
        for ec in ecs.split():
            if ec not in ecs_dict:
                ecs_dict[ec] = []
            ecs_dict[ec].append(splitted[0])
    print "\n\n"
    print len(ecs_dict), "oxygen-utilizing enzymes were found from classes", ecs_dict.keys()
    ec_out = open(ec_classes_file, "w")
    ec_out.write("\t".join(ecs_dict.keys()))
    ec_out.close()
    GLOBAL_RESULTS.write(os.path.basename(proteome)+"\t"+str(len(ecs_dict))+"\t"+",".join(ecs_dict.keys())+"\n")
    #print "Detailed mapping can be found in OUTPUT/oxygen_utilizing_annot.tsv file"
    #print "Executing SVM classifier..."
    # NOTE(review): classifier_input/classes/ec_classes are built below but never
    # used in this function — the SVM classification step appears disabled.
    infile = open("DATA/model_svm", "r").read().splitlines()
    classifier_input = []
    classes = []
    ec_classes = []
    for line in infile:
        if line.startswith("@attribute") and "class" not in line:
            ec_classes.append(line.split()[1].replace("'",""))
def do_for_all():
"""
Execute oxyphen for all proteomes in the input folder
"""
blast_path, num_threads, multinome_folder = read_config()
proteomes = glob.glob(os.path.join(multinome_folder,"*"))
print "\n\nPROTEOMES IN YOUR PROTEOMES_FOLDER DIRECTORY:\n", "\n".join(proteomes)
for proteome in proteomes:
fname = os.path.splitext(os.path.basename(proteome))[0]
output_filename = os.path.join("OUTPUT",fname+"_oxygen_utilizing_annot.tsv")
ec_classes_file = os.path.join("OUTPUT",fname+"_EC_CLASSES.txt")
print "\n\nRUNNING OXYPHEN FOR PROTEOME %s" % proteome
do_oxyphen(proteome, output_filename, ec_classes_file)
print "\n\nOUTPUT MAPPING FILE FOR THIS PROTEOME CAN BE FOUND IN %s" % output_filename
print "LIST OF EC CLASSES FOR THIS PROTEOME CAN BE FOUND IN %s" % ec_classes_file
GLOBAL_RESULTS.close()
if __name__ == '__main__':
    # Script entry point: process every configured proteome, then point the
    # user at the output directory.
    do_for_all()
    print("YOUR OUTPUT FILES CAN BE FOUND IN OUTPUT/ FOLDER")