"""
usage: python select_features.py
No command line arguments. This program expects to find two data files in a
subdirectory called 'data'.
"""
import numpy as np
import sklearn.svm
import sklearn.preprocessing
import sklearn.model_selection  # sklearn.grid_search and sklearn.cross_validation were removed in scikit-learn 0.20
import sklearn.linear_model

import mothur_files
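
# mothur_files is a local helper module. Judging from its use below (an
# assumption inferred from the call sites, not from documentation),
# load_shared_file returns label names, group names, OTU column names, and an
# (n_samples, n_otus) abundance matrix, and load_design_file returns group
# names and partition names.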

def select_features():
    shared_data_file_path = 'data/Stool.0.03.subsample.0.03.filter.shared'
    design_data_file_path = 'data/Stool.0.03.subsample.0.03.filter.mix.design'
    shared_label_names, shared_group_names, otu_column_names, shared_data = \
        mothur_files.load_shared_file(shared_data_file_path)
    design_group_names, design_partition_names = \
        mothur_files.load_design_file(design_data_file_path)

    elastic_net(shared_data, otu_column_names, design_partition_names)
    linear_support_vector_machine(shared_data, otu_column_names, design_partition_names)
    rbf_support_vector_machine(shared_data, otu_column_names, design_partition_names)
"""
This function performs feature selection using elastic net. The hyperparameters
l1_ratio and alpha are determined by cross validation.
"""
def elastic_net(shared_data, otu_column_names, design_partition_names):
# scale the data to have mean 0 and std 1 in each feature
scaler = sklearn.preprocessing.StandardScaler()
X = scaler.fit_transform(shared_data)
# convert the last character of each partition name to a float - not elegant
y = np.array(map(lambda x: float(x[-1]), design_partition_names))
##############################################################################
n_train = 400
best_model = None
best_model_score = 0.0
# I could not get ElasticNetCV to choose the best l1_ratio from a list
# so I am doing that explictly with a loop
for l1_ratio in [.1, .2, .3, .4]:
model = sklearn.linear_model.ElasticNetCV(
l1_ratio=l1_ratio,
cv=sklearn.cross_validation.StratifiedKFold(y[n_train:], 10),
verbose=False
).fit(X[n_train:, :], y[n_train:])
model_score = model.score(X[:n_train,:], y[:n_train])
if model_score > best_model_score:
best_model = model
best_model_score = model_score
print('l1_ratio: {}'.format(model.l1_ratio_))
print('alpha: {}'.format(model.alpha_))
print('model score: {}'.format(model_score))
print('feature ranking by elastic net:')
print(' OTU Rank')
enet_top_feature_list = get_enet_top_features(model)
for n, (feature_ndx, rank) in enumerate(enet_top_feature_list[:50]):
print('{:2d} {} {:4.2f}'.format(n, otu_column_names[feature_ndx], rank))
print('best l1_ratio: {}'.format(best_model.l1_ratio_))
print('best alpha: {}'.format(best_model.alpha_))
print('best model score: {}'.format(best_model_score))
print('')
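
# Note: current scikit-learn releases accept a list for l1_ratio directly and
# pick the best value by internal cross-validation, which could replace the
# explicit loop above. A minimal sketch, untested against this data:
#
#     model = sklearn.linear_model.ElasticNetCV(
#         l1_ratio=[0.1, 0.2, 0.3, 0.4],
#         cv=sklearn.model_selection.StratifiedKFold(n_splits=10),
#     ).fit(X[n_train:, :], y[n_train:])
#     print(model.l1_ratio_, model.alpha_)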

def get_enet_top_features(enet_model):
    """Return (feature index, share of total coefficient weight) pairs, sorted
    from largest to smallest absolute coefficient. The share is a percentage
    of the sum of absolute coefficients."""
    sorted_coef_ndx = np.argsort(np.abs(enet_model.coef_))[::-1]
    abs_coef_sum = np.sum(np.abs(enet_model.coef_))
    return [(i, 100.0 * np.abs(enet_model.coef_[i]) / abs_coef_sum) for i in sorted_coef_ndx]
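
# For example, a fitted model with coef_ = [0.5, -1.5] would give
# [(1, 75.0), (0, 25.0)]: feature 1 carries 1.5 of the 2.0 total absolute
# coefficient weight, i.e. 75 percent.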
"""
This function fits a SVM model but no feature selection is done here. This
is really just to determine the classification performance.
"""
def rbf_support_vector_machine(shared_data, otu_column_names, design_partition_names):
# scale the data to have mean 0 and std 1 in each feature
scaler = sklearn.preprocessing.StandardScaler()
X = scaler.fit_transform(shared_data)
# convert the last character of each partition name to a float - not elegant
y = np.array(map(lambda x: float(x[-1]), design_partition_names))
C_range = 10.0 ** np.arange(-1, 2)
gamma_range = 10.0 ** np.arange(-5, -3)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = sklearn.cross_validation.StratifiedKFold(y=y, n_folds=10)
grid = sklearn.grid_search.GridSearchCV(
sklearn.svm.SVC(kernel='rbf'),
param_grid=param_grid,
cv=cv,
verbose=False
)
grid.fit(X, y)
print("The best RBF SVM classifier is: ", grid.best_estimator_)
print('best classifier score: {}'.format(grid.best_score_))
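
# If more detail on the search is wanted, GridSearchCV records the mean
# cross-validated score of every parameter combination in cv_results_.
# A minimal sketch:
#
#     for params, score in zip(grid.cv_results_['params'],
#                              grid.cv_results_['mean_test_score']):
#         print(params, score)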
"""
I tried the following code using the scikit-learn RFE but it takes a long
time and the results were not consistent with classify.shared and elastic
net
rfecv = sklearn.feature_selection.RFECV(
estimator=sklearn.svm.SVC(C=0.01, kernel='linear'),
cv=3,
verbose=True
)
rfecv.fit(X, y)
sorted_ranking_ndx = np.argsort(rfecv.ranking_)
for n,i in enumerate(sorted_ranking_ndx[:20]):
print('{} {} {}'.format(n, otu_column_names[i], rfecv.ranking_[i]))
This function performs feature selection using SVM-RFE. I found a good C
parameter by cross-validation and then hard-coded it.
"""
def linear_support_vector_machine(shared_data, otu_column_names, design_partition_names):
scaler = sklearn.preprocessing.StandardScaler()
# the scaler returns a copy by default
X = scaler.fit_transform(shared_data)
# convert the last character of each partition name to a float - not elegant
y = np.array(map(lambda x: float(x[-1]), design_partition_names))
remaining_otu_list = np.arange(len(otu_column_names))
n_train = 400
removed_feature_list = []
while len(remaining_otu_list) > 0:
svc = sklearn.svm.SVC(C=0.01, kernel='linear')
svc.fit(X[:n_train, remaining_otu_list], y[:n_train])
#print('linear svm score: {}'.format(svc.score(X[n_train:, remaining_otu_list], y[n_train:])))
#w_squared = svc.coef_.sum(axis=0)**2
w_squared = (svc.coef_**2).sum(axis=0)
w_squared_min_ndx = np.argmin(w_squared)
otu_to_remove_ndx = remaining_otu_list[w_squared_min_ndx]
otu_to_remove = otu_column_names[otu_to_remove_ndx]
#print('removing {}'.format(otu_to_remove))
remaining_otu_list = np.delete(remaining_otu_list, w_squared_min_ndx)
removed_feature_list.append(otu_to_remove)
removed_feature_list.reverse()
# calculate a rank value by removing each feature
svc = sklearn.svm.SVC(C=0.01, kernel='linear')
svc.fit(X[:n_train, :], y[:n_train])
all_features_score = svc.score(X[n_train:, :], y[n_train:])
print('linear SVM score {}'.format(all_features_score))
print('features ranked by linear SVM-RFE:')
print(' n OTU')
for n, otu_name in enumerate(removed_feature_list[:50]):
print('{:2d} {}'.format(n, otu_name))
#svc = sklearn.svm.SVC(C=0.01, kernel='linear')
#otu_ndx = otu_column_names.index(otu_name)
#print('otu_ndx for {}: {}'.format(otu_name, otu_ndx))
#reduced_otu_list = range(len(otu_column_names))
#reduced_otu_list.remove(otu_ndx)
#svc.fit(X[:n_train, np.array([1, otu_ndx])], y[:n_train])
#score = svc.score(X[n_train:, np.array([1, otu_ndx])], y[n_train:])
#print('{:2d} {} {:4.2f}'.format(n, otu_name, all_features_score/score))
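
# The loop above is the standard SVM-RFE procedure. scikit-learn's RFE class
# (as opposed to RFECV) implements the same one-at-a-time elimination and may
# serve as a drop-in replacement. A minimal sketch, assuming the same train
# split (requires import sklearn.feature_selection):
#
#     rfe = sklearn.feature_selection.RFE(
#         estimator=sklearn.svm.SVC(C=0.01, kernel='linear'),
#         n_features_to_select=1,
#         step=1)
#     rfe.fit(X[:n_train, :], y[:n_train])
#     # ranking_[i] == 1 marks the last surviving feature; larger values
#     # were eliminated earlier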

if __name__ == '__main__':
    select_features()