-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_mc.py
299 lines (270 loc) · 11.4 KB
/
run_mc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
## Functions for doing the monte-carlo runs
import numpy as np
from all_data import *
import matplotlib.pyplot as plt
import itertools
## Build the shared data object once, at import time.  Many of the
## functions below read from this module-level PDZ_Data instance.
## NOTE(review): Data is presumably provided by `from all_data import *` -- confirm.
print("Creating Data Object PDZ_Data")
PDZ_Data = Data()
PDZ_Data.load_data()
print("PDZ_Data ready!")
def eval_score(domain, sequence, pos=0):
    """
    Score a sequence against a domain using the Stiffler model.

    Adds up ``domain.thetas[i, sequence[i]]`` for the five positions
    i = 0..4 and subtracts ``domain.thresholds[pos]`` from that total.
    """
    contributions = (
        domain.thetas[position, sequence[position]] for position in range(5)
    )
    total = sum(contributions, 0.0)
    return total - domain.thresholds[pos]
### Some utility functions for doing math
def sigmoid(x, a=1):
    """
    Logistic function 1 / (1 + exp(-a * x)).

    ``a`` scales the argument and defaults to 1; passing ``a=-1`` gives
    the logistic of ``-x``.
    """
    exponent = -1.0 * a * x
    denominator = 1 + np.exp(exponent)
    return 1.0 / denominator
def log_modified(x):
    """
    Return log(1 + exp(-x)), evaluated in a numerically stable way.

    ``np.logaddexp(0, -x)`` is log(exp(0) + exp(-x)), i.e. the same
    quantity, but it neither overflows for very negative ``x`` nor
    rounds to exactly 0 for large positive ``x`` (where 1 + exp(-x)
    is indistinguishable from 1 in floating point).  It also accepts
    NumPy arrays, which a scalar ``if x > 0`` test cannot.
    """
    return np.logaddexp(0.0, -x)
def calc_log_proba_mod(peptide, domain, sequence):
    """
    Negative log of the updated probability for one peptide/domain pair.

    With s = eval_score(domain, sequence, 0) and P = peptide.posterior_matrix,
    the value returned is

        -log( P[r, 1] * sigmoid(s) + P[r, 0] * sigmoid(-s) )

    where r is 1 when the peptide's entry for this domain in
    PDZ_Data.fp_interaction_matrix is positive and 0 otherwise.  The sum
    is formed in log space with np.logaddexp for numerical stability.
    """
    column = PDZ_Data.domain_names.index(domain.name)
    observed = PDZ_Data.fp_interaction_matrix[peptide.name][column]
    score = eval_score(domain, sequence, 0)
    # log_modified(t) is log(1 + exp(-t)), i.e. -log(sigmoid(t)).
    penalty_pos = log_modified(score)
    penalty_neg = log_modified(-1.0 * score)
    # Both branches of the model differ only in which posterior row is read.
    row = 1 if observed > 0 else 0
    log_term_pos = np.log(peptide.posterior_matrix[row, 1]) - penalty_pos
    log_term_neg = np.log(peptide.posterior_matrix[row, 0]) - penalty_neg
    return np.logaddexp(log_term_pos, log_term_neg) * -1.0
def eval_log_energy(peptide, sequence):
    """
    Total energy of `sequence` for `peptide`: the sum of
    calc_log_proba_mod(peptide, domain, sequence) over every domain in
    PDZ_Data.domains.
    """
    per_domain = (
        calc_log_proba_mod(peptide, domain, sequence)
        for domain in PDZ_Data.domains
    )
    return sum(per_domain, 0.0)
def calc_energy_ground():
    """
    Store on every peptide in PDZ_Data.peptides, as `energy_ground`, the
    energy of its own (unmutated) sequence `sequence_bis`.
    """
    for peptide in PDZ_Data.peptides:
        native_sequence = PDZ_Data.convert2int(peptide.sequence_bis)
        peptide.energy_ground = eval_log_energy(peptide, native_sequence)
def pair_quantities():
    """
    Function which computes relevant quantities for each peptide-domain pair

    For every peptide, `peptide.domain_data` is rebuilt as a dict keyed by
    domain name.  Each value is the list
    ``[label, score, manip_1, manip_0]`` where

    * label   -- 1.0 when the peptide's fp_interaction_matrix entry for
                 the domain is positive, otherwise that entry itself;
    * score   -- eval_score(domain, sequence, 2) (Stiffler model score);
    * manip_1 -- P[1,0]*sigmoid(-score) + P[1,1]*sigmoid(score);
    * manip_0 -- P[0,0]*sigmoid(-score) + P[0,1]*sigmoid(score);

    with P = peptide.posterior_matrix.
    """
    for peptide in PDZ_Data.peptides:
        peptide.domain_data = {}
        # The integer-encoded sequence does not depend on the domain.
        int_sequence = PDZ_Data.convert2int(peptide.sequence_bis)
        interactions = PDZ_Data.fp_interaction_matrix[peptide.name]
        # Iterate over the domain objects themselves: the score and the
        # dictionary key both need the object, not just a column index.
        # The column is looked up the same way calc_log_proba_mod does.
        for domain in PDZ_Data.domains:
            column = PDZ_Data.domain_names.index(domain.name)
            alpha = interactions[column]
            label = 1.0 if alpha > 0 else alpha
            energy = eval_score(domain, int_sequence, 2)  ## Score Calculated from Stiffler Model
            p_model_pos = sigmoid(energy)
            p_model_neg = sigmoid(energy, -1)
            ## P(y_manip=1|seq) = P(y_manip=1|y_model = -1)*P(mod=-1|seq) + P(y_manip=1|y_model=1)*P(mod=1|seq)
            manip_1 = peptide.posterior_matrix[1, 0] * p_model_neg + peptide.posterior_matrix[1, 1] * p_model_pos
            ## P(y_manip=-1|seq) = P(y_manip=-1|y_model = -1)*P(mod=-1|seq) + P(y_manip=-1|y_model=1)*P(mod=1|seq)
            manip_0 = peptide.posterior_matrix[0, 0] * p_model_neg + peptide.posterior_matrix[0, 1] * p_model_pos
            peptide.domain_data[domain.name] = [label, energy, manip_1, manip_0]
def run_mc(nb_runs, peptide, beta=1.0, nb_cycles=10, plot=False, verbose=True, print_reject=False):
    """
    Run Metropolis Monte-Carlo sampling over single-residue mutations.

    Each of the `nb_cycles` cycles restarts from the peptide's own
    sequence and energy (`peptide.energy_ground`, which must already be
    set) and makes `nb_runs` proposals.  A proposal changes one of the
    five positions to a different residue index in 0..19 and is accepted
    with probability min(1, exp(-beta * (E_new - E_current))).

    Parameters:
        nb_runs      -- number of proposals per cycle
        peptide      -- peptide object to sample; results are stored on it
        beta         -- factor multiplying the energy difference
        nb_cycles    -- number of independent cycles
        plot         -- when true, plot the accepted energies of each cycle
        verbose      -- when true, print every accepted proposal
        print_reject -- with verbose, also print rejected proposals

    Results stored on the peptide:
        peptide.mutations -- one list per cycle of the accepted sequences
        peptide.sims      -- one dict per cycle with keys 'Results',
                             'Mutated sequences' and 'Mutated Energies'
    """
    sims = []
    print("Name of Peptide {}".format(peptide.name))
    print("Base Energy {}".format(peptide.energy_ground))
    print("Base Sequence {}".format(peptide.sequence_bis))
    base_seq = PDZ_Data.convert2int(peptide.sequence_bis)
    peptide.mutations = []
    for j in range(nb_cycles):
        print("\n Cycle number : {}\n".format(j+1))
        sim_results = []
        mutated_sequences = []
        mutated_energies = []
        for_plot = []
        sequences_accepted = []
        mut_seq = base_seq
        mut_energy = peptide.energy_ground
        for i in range(nb_runs):
            y = np.random.randint(5)
            z = np.random.randint(19)
            ## Skip over the current residue so the proposal always differs from it
            if z >= mut_seq[y]:
                z = z+1
            # Copy before mutating so base_seq and stored sequences stay intact.
            # NOTE(review): assumes convert2int returns a list (slice == copy) -- confirm.
            temp_seq = mut_seq[:]
            temp_seq[y] = z
            temp_energy = eval_log_energy(peptide, temp_seq)
            ratio = np.exp(-1.0*beta*(temp_energy-mut_energy))
            prob_trans = min(1, ratio)
            x = np.random.uniform()
            if x < prob_trans:
                mut_energy = temp_energy
                mut_seq = temp_seq
                if verbose:
                    print("Run number: {}\n".format(i))
                    print("Uniform {} Ratio {} Prob_Trans {} ".format(x, ratio, prob_trans))
                    print("Accepted {} {} {} {} \n".format(temp_seq, temp_energy, PDZ_Data.convert2seq(temp_seq), PDZ_Data.convert2seq(mut_seq)))
                sim_results.append({'Sequence': temp_seq, 'Energy': temp_energy, 'Status': 'Accepted'})
                # Record the accepted sequence; without this peptide.mutations
                # would only ever contain empty lists.
                sequences_accepted.append(temp_seq)
                for_plot.append(temp_energy)
            else:
                if verbose and print_reject:
                    print("Run number: {}\n".format(i))
                    print("Uniform {} Ratio {} Prob_Trans {} ".format(x, ratio, prob_trans))
                    print("Rejected {} {} {} {}\n".format(temp_seq, temp_energy, PDZ_Data.convert2seq(temp_seq), PDZ_Data.convert2seq(mut_seq)))
                sim_results.append({'Sequence': temp_seq, 'Energy': temp_energy, 'Status': 'Rejected'})
            mutated_sequences.append(temp_seq)
            mutated_energies.append(temp_energy)
        peptide.mutations.append(sequences_accepted)
        # np.min / np.argmin raise on an empty list (nb_runs == 0).
        if mutated_energies:
            best = np.argmin(mutated_energies)
            print("Lowest Energy {} Sequence {}\n".format(np.min(mutated_energies), PDZ_Data.convert2seq(mutated_sequences[best])))
        if plot:
            plt.figure(j)
            # The `hold` keyword is not passed: Matplotlib >= 3.0 rejects it.
            plt.axhline(y=peptide.energy_ground, c='r', linewidth=0.5)
            if len(for_plot) == 1:
                # A single point is invisible with plt.plot, so draw a line.
                plt.axhline(y=for_plot[0], c='b', linewidth=1.5)
            plt.plot(for_plot)
            plt.show()
        sims.append({'Results': sim_results, 'Mutated sequences': mutated_sequences, 'Mutated Energies': mutated_energies})
    peptide.sims = sims
    print(" Completed run for {}\n".format(peptide.name))
### Functions to be used once the simulations have been run
def compute_unique_mut(peptide):
    """
    Return the distinct entries found across all lists in
    `peptide.mutations`, in order of first appearance.
    """
    seen = []
    flattened = [entry for cycle in peptide.mutations for entry in cycle]
    for entry in flattened:
        if entry in seen:
            continue
        seen.append(entry)
    return seen
def compute_freq_matrix(pep):
    """
    Count residue usage per position over all accepted sequences.

    Reads `pep.sims` and builds a 5 x 20 matrix whose entry [i, a] is the
    number of accepted sequences with residue index `a` at position `i`,
    plus a row-normalized copy.  Both are also stored on the peptide as
    `pep.freq_matrix` and `pep.freq_matrix_normalized`.

    Returns:
        [freq_matrix, freq_matrix_normalized]

    When nothing was accepted the normalized matrix is all zeros
    (rather than NaN from a 0/0 division).
    """
    accepted = [
        step['Sequence']
        for cycle in pep.sims
        for step in cycle['Results']
        if step['Status'] == 'Accepted'
    ]
    freq_matrix = np.zeros((5, 20))
    for mut in accepted:
        for i in range(5):
            freq_matrix[i, mut[i]] += 1
    row_totals = freq_matrix.sum(axis=1)[:, np.newaxis]
    # Rows with no counts are divided by 1 so they stay at zero.
    safe_totals = np.where(row_totals > 0, row_totals, 1.0)
    freq_matrix_normalized = freq_matrix / safe_totals
    pep.freq_matrix = freq_matrix
    pep.freq_matrix_normalized = freq_matrix_normalized
    return [freq_matrix, freq_matrix_normalized]
def compute_proba_pos(pep, pos):
    """
    Return 1 minus the normalized frequency, among accepted sequences,
    of the peptide's own residue at position `pos`.

    The sequence is encoded with PDZ_Data.convert2int, the same
    converter the rest of this file uses.
    """
    normalized = compute_freq_matrix(pep)[1]
    native_acid = PDZ_Data.convert2int(pep.sequence_bis)[pos]
    return 1 - normalized[pos, native_acid]
def plot_freq_matrix(pep, normalized=True):
    """
    Show the peptide's frequency matrix as a heat map.

    Plots the row-normalized matrix from compute_freq_matrix when
    `normalized` is true and the raw counts otherwise, with the entries
    of PDZ_Data.aminoacids as x-axis tick labels.
    """
    raw_counts, fractions = compute_freq_matrix(pep)
    matrix = fractions if normalized else raw_counts
    print("Sequence of Peptide {}".format(pep.sequence_bis))
    figure_options = dict(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
    plt.figure(**figure_options)
    plt.title("Frequency matrix for Peptide {}".format(pep.name))
    tick_positions = range(len(PDZ_Data.aminoacids))
    plt.xticks(tick_positions, PDZ_Data.aminoacids, fontsize=12)
    plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.Blues)
    plt.colorbar()
    plt.show()
def print_seq(pep):
    """
    Prints each entry of pep.sequence_bis followed by its acid_dict
    value, one entry per line.
    """
    line_template = "{} {} "
    for residue in pep.sequence_bis:
        print(line_template.format(residue, acid_dict[residue]))
def compute_accepted_sequences(peptide):
    """
    Collect, in order, the 'Sequence' of every result in `peptide.sims`
    whose 'Status' is 'Accepted', across all cycles.
    Helper for the joint probability calculations.
    """
    return [
        step['Sequence']
        for cycle in peptide.sims
        for step in cycle['Results']
        if step['Status'] == 'Accepted'
    ]
## Need to edit the calculation for the entropy
def compute_entropy_sequence(peptide):
    """
    Per-position entropy of the normalized frequency matrix.

    For each row of compute_freq_matrix(peptide)[1] the value is
    -sum(p * log(p)) over the non-zero entries.  The resulting list is
    stored as `peptide.entropy_sequence` and returned.
    """
    probabilities = compute_freq_matrix(peptide)[1]
    n_rows, n_cols = probabilities.shape
    entropies = []
    for row in range(n_rows):
        # Zero entries contribute nothing (p * log p -> 0) and are skipped.
        weighted_logs = (
            probabilities[row][col] * np.log(probabilities[row][col])
            for col in range(n_cols)
            if probabilities[row][col] != 0
        )
        entropies.append(-1.0 * sum(weighted_logs, 0.0))
    peptide.entropy_sequence = entropies
    return entropies
def compute_coincidence_count(peptide, acid_a, acid_b, pos_i, pos_j):
    """
    Count the accepted sequences in `peptide.sims` that have `acid_a` at
    position `pos_i` and `acid_b` at position `pos_j`.
    """
    accepted = [
        step['Sequence']
        for cycle in peptide.sims
        for step in cycle['Results']
        if step['Status'] == 'Accepted'
    ]
    hits = 0
    for candidate in accepted:
        matches_i = candidate[pos_i] == acid_a
        matches_j = candidate[pos_j] == acid_b
        if matches_i and matches_j:
            hits += 1
    return hits
def compute_joint_proba(peptide, acid_a, acid_b, pos_i, pos_j):
    """
    Fraction of accepted sequences that have `acid_a` at position
    `pos_i` and `acid_b` at position `pos_j`.

    The denominator is the number of sequences returned by
    compute_accepted_sequences(peptide), i.e. the same list the count is
    taken over.  Returns 0.0 when no sequence was accepted.
    """
    accepted = compute_accepted_sequences(peptide)
    if not accepted:
        return 0.0
    count = 0
    for sequence in accepted:
        if sequence[pos_i] == acid_a and sequence[pos_j] == acid_b:
            count += 1
    return 1.0 * count / len(accepted)
def compute_mutual_information(peptide):
    """
    Mutual information between every pair of the five positions.

    Entry [i, j] of the returned 5 x 20-residue-based 5 x 5 matrix is

        sum over (a, b) of  P_ij(a, b) * log( P_ij(a, b) / (P_i(a) * P_j(b)) )

    where P_ij comes from compute_joint_proba and the marginals P_i, P_j
    are rows of the normalized matrix from compute_freq_matrix.  Terms
    in which any of the three probabilities is zero are skipped.
    """
    mi_matrix = np.zeros((5, 5))
    freq_matrix = compute_freq_matrix(peptide)[1]
    for pos_i, pos_j in itertools.product(range(5), range(5)):
        total = 0.0
        for acid_a, acid_b in itertools.product(range(20), range(20)):
            joint_proba = compute_joint_proba(peptide, acid_a, acid_b, pos_i, pos_j)
            proba_a = freq_matrix[pos_i, acid_a]
            proba_b = freq_matrix[pos_j, acid_b]
            if joint_proba == 0 or proba_a == 0 or proba_b == 0:
                # log(0) is undefined; such terms contribute nothing.
                continue
            total += joint_proba * (np.log(joint_proba) - np.log(proba_a) - np.log(proba_b))
        mi_matrix[pos_i, pos_j] = total
    return mi_matrix
def correlation(peptide, pos_i, pos_j):
    """
    Return a 20 x 20 matrix whose entry [a, b] is the joint probability
    of residue `a` at `pos_i` and residue `b` at `pos_j` (from
    compute_joint_proba) minus the product of the two marginal
    frequencies from compute_freq_matrix.
    """
    marginals = compute_freq_matrix(peptide)[1]
    result = np.zeros((20, 20))
    for acid_a in range(20):
        for acid_b in range(20):
            joint = compute_joint_proba(peptide, acid_a, acid_b, pos_i, pos_j)
            independent = marginals[pos_i, acid_a] * marginals[pos_j, acid_b]
            result[acid_a, acid_b] = joint - independent
    return result
def most_probable_sequence(peptide):
    """
    Pick, for each of the five positions, the residue index with the
    highest normalized frequency.  Returns that list of indices together
    with its PDZ_Data.convert2seq conversion.
    """
    normalized = compute_freq_matrix(peptide)[1]
    best_indices = [np.argmax(normalized[position]) for position in range(5)]
    return best_indices, PDZ_Data.convert2seq(best_indices)
def distance_hamming(seq1, seq2):
    """
    Number of positions at which two equal-length sequences differ.
    Asserts that both sequences have the same length.
    """
    assert len(seq1) == len(seq2)
    return sum(1 for left, right in zip(seq1, seq2) if left != right)