-
Notifications
You must be signed in to change notification settings - Fork 0
/
auxilary_function.py
285 lines (232 loc) · 11.7 KB
/
auxilary_function.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 9 11:37:50 2015
@author: pubuntu
"""
import os
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from Reader import Reader
from sklearn.metrics import confusion_matrix
from Traj_creator import Traj_data
from Randomforest import RandomForest_Autotunner,plot_matrix,Measure,check_rotate
def MitoseClassif(obj_norm,
                  y_name_3state="Type",classif_Mitose="MitoseOrNot",
                  num_str="0015"):
    """Train a binary mitosis-or-not random forest and predict on obj_norm.

    Loads (or builds and caches as "H2b_data.csv") the unnormalized H2b
    trajectory data, trains a tuned random forest on the
    "MitoseClassif.arff" training set, then writes per-instance
    predictions into the `classif_Mitose` column of `obj_norm.data`.
    Rows that do not belong to a trajectory are dropped from both the
    normalized and unnormalized data as a side effect.

    Parameters
    ----------
    obj_norm : Traj_data-like object exposing .data, .names and .update();
        modified in place.
    y_name_3state : str
        Name of the ground-truth 3-state column (not used in this
        function; presumably kept for a uniform pipeline signature).
    classif_Mitose : str
        Column of obj_norm.data that receives the predictions.
    num_str : str
        Experiment/position identifier used when extracting the hdf5 file.

    Returns
    -------
    (obj_norm, model_1)
        The updated object and the fitted RandomForest_Autotunner.
    """
    print "\n We first load the unnormalized data: \n"
    # Building the trajectory data is slow, so reuse the cached CSV if present.
    if os.path.isfile("H2b_data.csv"):
        print "The file existed so I loaded it."
        H2b = Traj_data(file_name="H2b_data.csv",pkl_traj_file="./Pkl_file")
    else:
        H2b=Traj_data()
        H2b.extracting(num_str,"both_channels_0015.hdf5",'primary')
        ## Extracting the hdf5 file for the primary channel (H2b)
        H2b.Add_traj(normalize=False)## ,num_traj=10) ## (you can reduce the number of traj)
        ## Adding Alice's work on tracking to have trajectories
        file_loc="0015_PCNA.xml"
        H2b.label_finder(file_loc)
        ## Finding associated labels by minimizing distance by click and distance of cell
        H2b.renaming_and_merge()
        ## renaming the labels to have G1=="1", S=="S", G2=="2" and M=="M"
        #This procedure may take a long time.
        H2b.data.to_csv('H2b_data.csv',index=False,header=True)
    print "\n We train a classifier for mitosis or not: \n"
    obj_unnorm=H2b
    train_file="MitoseClassif.arff"
    train_1=Reader()
    train_1.arrf_read(train_file)
    # Collapse the training labels down to mitosis vs not-mitosis.
    train_1.renaming_for_mitosis()
    # NOTE(review): value_counts() result is discarded; likely a leftover
    # from interactive/notebook use.
    train_1.data["label"].value_counts()
    kfold=3
    # Handle missing-value features only if the first "missing" variable is
    # still among the training columns.
    if train_1.Var_missing[0] in train_1.data.columns:
        train_1.missing_features_data()
    # Hyper-parameter grid for the autotuner: 100, 110, ..., 240
    # (presumably the number of trees — confirm in RandomForest_Autotunner).
    values=[100 + i*10 for i in range(15)]
    model_1=RandomForest_Autotunner(values)
    model_1.tunning(train_1.data[train_1.names],train_1.data["label"],kfold,plot=True,fit_new_model=True)
    plt.show()
    # Row-normalize the confusion matrix so each true class sums to 1.
    model_1.cm_normalized = model_1.cm.astype('float') / model_1.cm.sum(axis=1)[:, np.newaxis]
    plot_matrix(model_1.cm_normalized,title="Normalized confusion matrix",names=["M","O","S"])
    plt.show()
    ## To reduce computation and none useless things, we remove instances that do not belong to trajectories.
    obj_norm.data=obj_norm.data.ix[pd.notnull(obj_norm.data["traj"]),obj_norm.data.columns]
    obj_unnorm.data=obj_unnorm.data.ix[pd.notnull(obj_unnorm.data["traj"]),obj_unnorm.data.columns]
    obj_norm.update()
    obj_unnorm.update()
    ## Predicting model 1
    # Only rows with no missing features can be predicted.
    index_no_missing=obj_norm.data[obj_norm.names].dropna(axis=0, how='any').index
    obj_norm.data.ix[index_no_missing,classif_Mitose]=model_1.predict(obj_unnorm.data.ix[index_no_missing,train_1.names])
    ## Carefull, we put the unnormalized data in the above prediction.
    print "\n A bit of statistics on the overall predictions: \n"
    print "Frequency of predicted values for the Mitosis or not classifier: \n"
    print obj_norm.data[classif_Mitose].value_counts()
    print "\n We were however not able to predict %d instances because of missing values" % (obj_norm.data.shape[0]-len(index_no_missing))
    obj_norm.data  # NOTE(review): no-op expression, likely a notebook leftover.
    obj_norm.update()
    ### Giving priority to the first classif...
    model_1.names_to_give=train_1.names
    return(obj_norm,model_1)
def EmissionMat(model_1,model_n_m):
print "\n We compute the emission state probability matrix from the confusion matrix for the first classifier: \n"
X3=model_n_m.cm_normalized
X3=X3.T
X3=np.array([X3[0],X3[2],X3[1]])
X3=X3.T
EmissionMat=np.zeros(shape=(5,5))
EmissionMat[0,0]=model_1.cm_normalized[0,0]
EmissionMat[4,4]=model_1.cm_normalized[0,0]
EmissionMat[1:5,0]=(1-model_1.cm_normalized[0,0])/3
EmissionMat[0:4,4]=(1-model_1.cm_normalized[0,0])/3
### Bricolage
EmissionMat[1:4,1:4]=X3
EmissionMat[1:4,1:3]+=-EmissionMat[3,0]*2/3
### On modifie car la diag n'est pas assez bonne...
EmissionMat[3,2:4]=[0.4,0.5]
EmissionMat[0,1:4]=sum(model_1.cm_normalized[0,1:3])/3
EmissionMat[4,1:4]=sum(model_1.cm_normalized[0,1:3])/3
EmissionMat=abs(EmissionMat).astype('float') / abs(EmissionMat).sum(axis=1)[:, np.newaxis]
## Put something better then abs...
plot_matrix(EmissionMat,title="Emission matrix",names=["M_B","G1","S","G2","M_E"])
plt.show()
return(EmissionMat)
def prep_for_R(obj_norm,classif_3state="3state",classif_final="Pred_fusion",classif_Mitose="MitoseOrNot",num_str="0015"):
    """Fuse the mitosis and 3-state predictions and encode them as the
    integer-string labels expected by the downstream R/HMM script.

    The fused 4-state prediction gives priority to an "M" from the mitosis
    classifier, otherwise keeps the 3-state label. Within each trajectory,
    "M" labels in the first half are relabeled 'B' (beginning mitosis) and
    the remaining "M" labels become 'E' (ending mitosis). Finally the
    labels are remapped to strings: B->"1", G1('1')->"2", S->"3",
    G2('2')->"4", E->"5".

    Parameters
    ----------
    obj_norm : object exposing .data (DataFrame), .update(), and — on the
        single-well code path — .trajectories; modified in place.
    classif_3state, classif_final, classif_Mitose : str
        Column names of the 3-state prediction, the fused output, and the
        mitosis prediction respectively.
    num_str : str
        Experiment id; the presence of num_str+"_id_frame" in the columns
        selects the single-well code path, otherwise a multi-well
        (Well/Frame) layout is assumed.

    Returns
    -------
    (obj_norm, data)
        The updated object and a DataFrame restricted to the columns the
        R script needs, with classif_final encoded as "1".."5".
    """
    def f(value_1,value_2):
        # Priority to the mitosis classifier: keep its "M", else the
        # 3-state label.
        if value_1=="M":
            return(value_1)
        else:
            return(value_2)
    obj_norm.data[classif_final]=obj_norm.data.apply(lambda r: f(r[classif_Mitose],r[classif_3state]),axis=1)
    print "\n We prioritize our predictor of mitosis events before the 3 state classfier giving \n us a four state classifier. \n"
    print "Frequency of predicted values for the 4 state classifier: \n"
    print obj_norm.data[classif_final].value_counts()
    obj_norm.update()
    ##First we are going to seperate beginning M's and ending M's
    if num_str+"_id_frame" in obj_norm.data.columns:
        # Single-well layout: trajectories are ordered by frame id.
        obj_norm.data=obj_norm.data.sort_values(['traj', num_str+"_id_frame"], ascending=[1, 1])
        ##First we are going to seperate beginning M's and ending M's
        for i in range(len(obj_norm.trajectories)):
            new_obs=np.array(obj_norm.data.ix[obj_norm.data["traj"]==i,classif_final])
            n_obs=len(new_obs)
            # Python 2 integer division: 'M' in the first half -> beginning.
            for j in range(n_obs/2):
                if new_obs[j]=='M':
                    new_obs[j]='B' #Beginning
            obj_norm.data.ix[obj_norm.data["traj"]==i,classif_final]=new_obs
        # Any 'M' left (second halves) is an ending mitosis.
        obj_norm.data.ix[obj_norm.data[classif_final]=='M',classif_final]='E' #Ending
    else:
        # Multi-well layout: a trajectory is identified by (Well, traj).
        obj_norm.data=obj_norm.data.sort_values(["Well",'traj',"Frame"], ascending=[1, 1, 1])
        subset=obj_norm.data[["Well","traj"]].drop_duplicates()
        tuples = [tuple(x) for x in subset.values]
        for i_well,i_traj in tuples:
            new_obs=np.array(obj_norm.data.ix[(obj_norm.data["Well"]==i_well) & (obj_norm.data["traj"]==i_traj),classif_final])
            n_obs=len(new_obs)
            # Python 2 integer division: 'M' in the first half -> beginning.
            for j in range(n_obs/2):
                if new_obs[j]=='M':
                    new_obs[j]='B' #Beginning
            obj_norm.data.ix[(obj_norm.data["Well"]==i_well) & (obj_norm.data["traj"]==i_traj),classif_final]=new_obs
        obj_norm.data.ix[obj_norm.data[classif_final]=='M',classif_final]='E' #Ending
    # Keep only the columns the R script needs, for rows inside trajectories.
    if "Well" in obj_norm.data.columns:
        data=obj_norm.data.ix[pd.notnull(obj_norm.data["traj"]),["Well","traj","Frame",classif_final]]
    else:
        data=obj_norm.data.ix[pd.notnull(obj_norm.data["traj"]),["traj",num_str+"_id_frame",classif_final]]
    # Remap to the R encoding. The order matters: '2' must become "4"
    # before '1' becomes "2", and '1' before 'B' becomes "1", otherwise
    # freshly written values would be remapped again.
    data.ix[data[classif_final]=='2',classif_final]="4"
    data.ix[data[classif_final]=='1',classif_final]="2"
    data.ix[data[classif_final]=='B',classif_final]="1"
    data.ix[data[classif_final]=='E',classif_final]="5"
    data.ix[data[classif_final]=='S',classif_final]="3"
    return(obj_norm,data)
def final_classif_HMM(data,obj_norm,
                      y_name_3state="Type",classif_Mitose="MitoseOrNot",
                      classif_3state="3state",classif_final="Pred_fusion",
                      classif_hmm="HMM",
                      ratio=5.9/60,
                      plot=True,
                      obs_number=0):
    """Join the HMM-corrected labels (from R) back onto obj_norm and
    measure cell-cycle phase lengths.

    The integer HMM states in `data.HMM` are decoded back to labels
    (1->"M", 2->"1", 3->"S", 4->"2", 5->"M"), joined to obj_norm.data by
    integer index, and then the lengths of the G1, S, G2 phases and of the
    full cell cycle are measured per trajectory and converted to hours.

    Parameters
    ----------
    data : DataFrame with an "HMM" integer column, indexed so that its
        index matches obj_norm.data's integer index.
    obj_norm : object exposing .data, .update(), .Group_of_traj and
        optionally .train; modified in place.
    y_name_3state, classif_Mitose, classif_3state, classif_final,
    classif_hmm : str column names.
    ratio : float
        Frames-to-hours conversion factor (default 5.9/60, i.e. a frame
        every 5.9 minutes — presumably the acquisition interval).
    plot : bool
        If True, plot a histogram of each phase-length distribution.
    obs_number : int
        Index of one trajectory to print for a quick visual check.

    Returns
    -------
    (obj_norm, DataFrame)
        The updated object and a summary DataFrame (mean, standard
        deviation and accepted-trajectory count per phase).
    """
    print "Here we are going to join the corrected data (from R) to our current data in Python \n "
    # Decode the HMM integer states; both 1 and 5 map back to "M"
    # (beginning and ending mitosis).
    data.ix[data.HMM==1,classif_hmm]="M"
    data.ix[data.HMM==2,classif_hmm]="1"
    data.ix[data.HMM==3,classif_hmm]="S"
    data.ix[data.HMM==4,classif_hmm]="2"
    data.ix[data.HMM==5,classif_hmm]="M"
    to_join=pd.Series(data[classif_hmm])
    # The R round-trip may have stringified the index; force it back to int
    # so the join aligns with obj_norm.data.
    to_join.index=[int(el) for el in to_join.index]
    obj_norm.data=obj_norm.data.join(to_join)
    obj_norm.update()
    if hasattr(obj_norm, 'train'):
        print "Recap of our data: \n "
        print obj_norm.train[["traj",y_name_3state,classif_Mitose,classif_3state,classif_final,classif_hmm]].head()
    i=0
    G1=[]   # G1 lengths in frames, one entry per trajectory (-1 = rejected)
    S=[]    # S lengths
    G2=[]   # G2 lengths
    CC=[]   # full cell-cycle lengths
    print "We are going to count the lengths of the G1 phase, the S phase and the G2 phase: \n"
    print "To quickly asses we print the trajectory and his corrected trajectory, for sequence number:" + str(obs_number)
    for el in obj_norm.Group_of_traj:
        new_obs=el[1][classif_hmm]
        # Print one sample trajectory before/after HMM correction.
        if i==obs_number:
            test=np.array(el[1][classif_final])
            test_hmm=np.array(el[1][classif_hmm])
            print classif_final+": \n"
            print test
            print "\n Corrected HMM: \n"
            print test_hmm
        i+=1
        # check_rotate presumably validates the state ordering of the
        # sequence; try the full sequence, then with the last one or two
        # observations dropped, before rejecting the trajectory (-1).
        if not check_rotate(new_obs):
            G1.append(Measure(new_obs,'1',_last=True))
            S.append(Measure(new_obs,'S',_last=True,_first=True))
            G2.append(Measure(new_obs,'2',_first=True))
            CC.append(Measure(new_obs,'M'))
        elif not check_rotate(new_obs[:-1]):
            G1.append(Measure(new_obs[:-1],'1',_last=True))
            S.append(Measure(new_obs[:-1],'S',_last=True,_first=True))
            G2.append(Measure(new_obs[:-1],'2',_first=True))
            CC.append(Measure(new_obs[:-1],'M'))
        elif not check_rotate(new_obs[:-2]):
            G1.append(Measure(new_obs[:-2],'1',_last=True))
            S.append(Measure(new_obs[:-2],'S',_last=True,_first=True))
            G2.append(Measure(new_obs[:-2],'2',_first=True))
            CC.append(Measure(new_obs[:-2],'M'))
        else:
            G1.append(-1)
            S.append(-1)
            G2.append(-1)
            CC.append(-1)
    # Convert accepted lengths (>-1) from frames to hours.
    G1_p=[el*ratio for el in G1 if el>-1]
    S_p= [el*ratio for el in S if el>-1]
    G2_p=[el*ratio for el in G2 if el>-1]
    CC_p=[el*ratio for el in CC if el>-1]
    res = {'mean' : pd.Series([np.mean(G1_p), np.mean(S_p), np.mean(G2_p),np.mean(CC_p)], index=['G1', 'S', 'G2','CellCycle']),
           'Standard deviation' : pd.Series([np.std(G1_p),np.std(S_p),np.std(G2_p),np.std(CC_p)], index=['G1', 'S', 'G2','CellCycle']),
           'Accepted trajectories': pd.Series([len(G1_p),len(S_p),len(G2_p),len(CC_p)], index=['G1', 'S', 'G2','CellCycle'])
           }
    print "Number of total trajectories: "+str(len(G1))
    G1_p={"val":G1_p,"name":"G1"}
    S_p={"val":S_p,"name":"S"}
    G2_p={"val":G2_p,"name":"G2"}
    CC_p={"val":CC_p,"name":"CellCycle"}
    if plot:
        for el in [G1_p,S_p,G2_p,CC_p]:
            plt.hist(el["val"],bins=int(0.75*len(el["val"])))
            plt.title(el["name"]+" lengths distribution")
            plt.xlabel("times (hours)")
            plt.ylabel("Frequency")
            plt.show()
    # When ground truth is available, report accuracy and the confusion
    # matrix of the HMM-corrected classification.
    if hasattr(obj_norm, 'train'):
        temp_X=obj_norm.train.ix[pd.notnull(obj_norm.train[classif_hmm]),[classif_hmm,y_name_3state]]
        print temp_X[y_name_3state].value_counts()
        cm=confusion_matrix(temp_X[y_name_3state],temp_X[classif_hmm])
        print "We reach an accuracy of %5.3f \n" %(float(cm.trace())/cm.sum())
        cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        plot_matrix(cm_normalized,title="Confusion matrix for the HMM classification")
    return(obj_norm,pd.DataFrame(res))
def Modify_transProbs(transProbs):
    """Derive a coarser transition matrix from a per-step one.

    For each state i (except the last), the new probability of advancing
    to state i+1 aggregates the four length-4 paths that advance exactly
    once: stay^(3-k) * advance * stay_next^k for k in 0..3, where `stay`
    and `stay_next` are the diagonal entries of rows i and i+1 and
    `advance` is the off-diagonal entry [i, i+1]. Each row is completed so
    it sums to 1, and the final state is made absorbing.

    Parameters
    ----------
    transProbs : numpy.ndarray of shape (n, n)
        Upper-bidiagonal transition probability matrix.

    Returns
    -------
    numpy.ndarray of the same shape.
    """
    n_states, n_cols = transProbs.shape
    coarsened = np.zeros(shape=(n_states, n_cols))
    for state in range(n_states - 1):
        stay = transProbs[state, state]
        advance = transProbs[state, state + 1]
        stay_next = transProbs[state + 1, state + 1]
        # Sum of the four advance-once path weights of length 4.
        leave = advance * sum(stay ** (3 - k) * stay_next ** k for k in range(4))
        coarsened[state, state + 1] = leave
        coarsened[state, state] = 1 - leave
    # Last state is absorbing.
    coarsened[n_states - 1, n_states - 1] = 1
    return coarsened