-
Notifications
You must be signed in to change notification settings - Fork 0
/
first_weka_module.py
241 lines (188 loc) · 7.3 KB
/
first_weka_module.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
import glob
import os
import shutil
import time
import sys
import weka.core.converters
from weka.core.converters import Loader
import weka.core.jvm as jvm
from weka.classifiers import Classifier
from weka.attribute_selection import ASSearch, ASEvaluation, AttributeSelection
from weka.filters import Filter
from weka.experiments import Tester, ResultMatrix, SimpleCrossValidationExperiment
from weka_variables import set_weka_options
##### FUNCTIONS ####
####################
# see the docstring below for more detailed documentation
# selects attributes and returns a string containing the filtered data
def use_filter(data, str_eval, str_search):
    """
    Run Weka's supervised AttributeSelection filter on a dataset.

    :param data: the dataset to filter
    :type data: Instances
    :param str_eval: class name of the attribute evaluator,
        e.g. weka.attributeSelection.CfsSubsetEval
    :type str_eval: str
    :param str_search: class name of the search method,
        e.g. weka.attributeSelection.GreedyStepwise; it is always
        instantiated with the "-B" option
    :type str_search: str
    :return: the filtered dataset converted with ``str()``
    :rtype: str
    """
    selection = Filter(
        classname="weka.filters.supervised.attribute.AttributeSelection")
    evaluator = ASEvaluation(classname=str_eval)
    searcher = ASSearch(classname=str_search, options=["-B"])

    # Plug the evaluator and the search strategy into the filter.
    for prop_name, component in (("evaluator", evaluator), ("search", searcher)):
        selection.set_property(prop_name, component.jobject)

    # The filter has to see the input format before it can process the data.
    selection.inputformat(data)
    return str(selection.filter(data))
# creates temporary files containing the selected attributes and returns the list of their paths
def create_temp_filtered_files(datasets, str_eval, str_search, tmp_dir):
    """
    Filter every dataset and store each result in a temporary ARFF file.

    Each output file is named ``<Evaluator>-<Search>_<original file name>``,
    where the evaluator/search parts are the last component of the dotted
    class names.

    :param datasets: paths of the ARFF files to filter
    :type datasets: list
    :param str_eval: class name of the attribute evaluator
    :type str_eval: str
    :param str_search: class name of the search method
    :type str_search: str
    :param tmp_dir: directory in which the filtered files are written
    :type tmp_dir: str
    :return: one path per dataset, of the form ``tmpFiles/<file name>``
    :rtype: list
    """
    prefix = str_eval.split(".")[-1] + "-" + str_search.split(".")[-1] + "_"
    loader = Loader(classname="weka.core.converters.ArffLoader")
    list_temp_files = []
    for ds in datasets:
        data_filtered = use_filter(loader.load_file(ds), str_eval, str_search)
        # basename() copes with the platform's path separator, unlike split("/").
        full_name = prefix + os.path.basename(ds)
        # join() works whether or not tmp_dir ends with a separator.
        with open(os.path.join(tmp_dir, full_name), "w") as text_file:
            print(data_filtered, file=text_file)
        # NOTE(review): the returned path uses the literal "tmpFiles/" prefix
        # (relative to the working directory) rather than tmp_dir. It is kept
        # as-is because experimenter_filtered parses paths of this form;
        # tmp_dir must therefore resolve to ./tmpFiles/.
        list_temp_files.append("tmpFiles/" + full_name)
    return list_temp_files
# creates the filtered datasets for every attribute-selection configuration
def attribute_selection(list_attribute_selection, datasets):
    """
    Build the filtered datasets for every attribute-selection configuration.

    :param list_attribute_selection: sequence of configurations; item 0 of
        each is the evaluator class name, item 1 the search class name
    :type list_attribute_selection: list
    :param datasets: paths of the ARFF files to filter
    :type datasets: list
    :return: one list of filtered-file paths per configuration
    :rtype: list

    The files are written to the module-level ``tmp_dir``.
    """
    return [
        create_temp_filtered_files(datasets, config[0], config[1], tmp_dir)
        for config in list_attribute_selection
    ]
# runs the experimenter on the unfiltered datasets
def experimenter(datasets, base_res, nb_runs, nb_folds, classifiers):
    """
    Run one cross-validation classification experiment.

    :param datasets: paths of the datasets to evaluate
    :type datasets: list
    :param base_res: prefix of the result file path
    :type base_res: str
    :param nb_runs: number of runs
    :type nb_runs: int
    :param nb_folds: number of cross-validation folds
    :type nb_folds: int
    :param classifiers: classifiers handed to the experiment
    :type classifiers: list
    :return: single-element list holding the result file path,
        ``<base_res>_<nb_folds>folds.arff``
    :rtype: list
    """
    result = "_".join([base_res, str(nb_folds) + "folds.arff"])
    settings = {
        "classification": True,
        "runs": nb_runs,
        "folds": nb_folds,
        "datasets": datasets,
        "classifiers": classifiers,
        "result": result,
    }
    experiment = SimpleCrossValidationExperiment(**settings)
    experiment.setup()
    experiment.run()
    return [result]
#run experimenter on filtered datasets
def experimenter_filtered(all_datasets_filtered, base_res, nb_runs, nb_folds, classifiers_for_filtered):
    """
    Run one cross-validation experiment per group of filtered datasets.

    The attribute-selection tag of a group is read from the file name of its
    last dataset: everything before the first underscore.

    :param all_datasets_filtered: groups of filtered dataset paths, one group
        per attribute-selection configuration
    :type all_datasets_filtered: list
    :param base_res: prefix of the result file paths
    :type base_res: str
    :param nb_runs: number of runs
    :type nb_runs: int
    :param nb_folds: number of cross-validation folds
    :type nb_folds: int
    :param classifiers_for_filtered: classifiers handed to each experiment
    :type classifiers_for_filtered: list
    :return: result file paths, ``<base_res>_<tag>_<nb_folds>folds.arff``,
        one per non-empty group
    :rtype: list
    """
    result_files = []
    for ds in all_datasets_filtered:
        if not ds:
            # An empty group has no file to derive a tag from and nothing to
            # evaluate; skip it instead of failing on ds[-1].
            continue
        # Take the file name first so an underscore in a directory name
        # cannot truncate the tag.
        attrib_name = os.path.basename(ds[-1]).split("_")[0]
        res = base_res + "_" + attrib_name + "_" + str(nb_folds) + "folds.arff"
        exp = SimpleCrossValidationExperiment(
            classification=True,
            runs=nb_runs,
            folds=nb_folds,
            datasets=ds,
            classifiers=classifiers_for_filtered,
            result=res)
        exp.setup()
        exp.run()
        result_files.append(res)
    return result_files
# display results of one experiment according to a comparison metric
def expe_printer(res_file, comparison_metric):
    """
    Print the results of one experiment as plain text for a given metric.

    :param res_file: path of the experiment result file
    :type res_file: str
    :param comparison_metric: name of the result attribute to compare on
    :type comparison_metric: str
    """
    data = weka.core.converters.loader_for_file(res_file).load_file(res_file)
    comparison_col = data.attribute_by_name(comparison_metric).index

    tester = Tester(classname="weka.experiment.PairedCorrectedTTester")
    tester.resultmatrix = ResultMatrix(
        classname="weka.experiment.ResultMatrixPlainText")
    tester.instances = data

    print(tester.header(comparison_col))
    # First argument 0: presumably the index of the baseline result set --
    # TODO confirm against the Tester API.
    print(tester.multi_resultset_full(0, comparison_col))
def expe_printer_to_latex(res_file, comparison_metric):
    """
    Render the results of one experiment as LaTeX for a given metric.

    :param res_file: path of the experiment result file
    :type res_file: str
    :param comparison_metric: name of the result attribute to compare on
    :type comparison_metric: str
    :return: the tester header followed by the full result table
    :rtype: str
    """
    data = weka.core.converters.loader_for_file(res_file).load_file(res_file)
    comparison_col = data.attribute_by_name(comparison_metric).index

    tester = Tester(classname="weka.experiment.PairedCorrectedTTester")
    tester.resultmatrix = ResultMatrix(
        classname="weka.experiment.ResultMatrixLatex")
    tester.instances = data

    header = tester.header(comparison_col)
    table = tester.multi_resultset_full(0, comparison_col)
    return "" + header + table
#display results of a list of experiments for several comparison metrics
def full_expe_printer(list_of_res_files, list_of_comparison_metric, destination):
    """
    Print every experiment result for every metric and save the LaTeX tables.

    For each (result file, metric) pair the plain-text table is printed and
    the LaTeX table is collected; all LaTeX tables are then written, in
    order, to ``destination``.

    :param list_of_res_files: paths of the experiment result files
    :type list_of_res_files: list
    :param list_of_comparison_metric: names of the metrics to compare on
    :type list_of_comparison_metric: list
    :param destination: path of the file receiving the LaTeX output
    :type destination: str
    """
    fragments = []
    for res_file in list_of_res_files:
        for metric in list_of_comparison_metric:
            expe_printer(res_file, metric)
            fragments.append(expe_printer_to_latex(res_file, metric))

    with open(destination, "w") as text_file:
        print("".join(fragments), file=text_file)
# function to run Auto-WEKA
# examples of calls (arguments: data, duration, metric, nb_folds):
# autoweka(data, "1", "areaUnderROC", "5")
# autoweka(data, "1", "fMeasure", "5")
def autoweka(data, duration, metric, nb_folds):
    """
    Build an Auto-WEKA classifier on a dataset and print it.

    :param data: the dataset to train on
    :type data: Instances
    :param duration: value passed to the ``-timeLimit`` option
    :type duration: str or int
    :param metric: value passed to the ``-metric`` option,
        e.g. "areaUnderROC" or "fMeasure"
    :type metric: str
    :param nb_folds: value passed to the ``-x`` option
    :type nb_folds: str or int
    """
    # Option values are converted with str() so that numeric arguments
    # (e.g. an int fold count) can be passed directly.
    options = [
        "-x", str(nb_folds),
        "-timeLimit", str(duration),
        "-metric", str(metric),
    ]
    classifier = Classifier(
        classname="weka.classifiers.meta.AutoWEKAClassifier",
        options=options)
    classifier.build_classifier(data)
    print(classifier)
# GLOBAL VARIABLES
# Cross-validation settings.
nb_runs = 1
nb_folds = 5
# Input datasets are looked up relative to the working directory.
data_dir = "../datasets/weka/"
# Temporary and result directories live under the working directory.
path = os.getcwd()
tmp_dir, res_dir = (path + suffix for suffix in ('/tmpFiles/', "/results/"))
res_exp_dir, res_latex_dir = (
    res_dir + sub_dir for sub_dir in ("weka_exp_arff/", "weka_latex_tabs/")
)
##### GET FILES ####
####################
begin = time.time()
# Usage: <script> [pattern] [nb_folds]
# Missing arguments fall back to the defaults ("T3_VOC" and the nb_folds
# value set above) instead of failing with an IndexError.
pattern = sys.argv[1] if len(sys.argv) > 1 else "T3_VOC"
print(pattern)
if len(sys.argv) > 2:
    nb_folds = int(sys.argv[2])
# Prefix shared by all result files of this run.
base_res = res_exp_dir + pattern
# recursive=True only matters when the pattern itself contains "**".
datasets = glob.glob(data_dir + pattern + "*.arff", recursive=True)
#### CREATE DIRECTORIES ####
############################
# exist_ok=True makes re-runs safe and removes the check-then-create race of
# testing os.path.exists() before os.mkdir(). res_dir comes first because it
# is the parent of the two result sub-directories.
for directory in (res_dir, res_exp_dir, res_latex_dir, tmp_dir):
    os.makedirs(directory, exist_ok=True)
##### START ########
###################
jvm.start(packages=True)
try:
    ##### LIST OF CLASSIFIERS ####
    ####################
    classifiers, classifiers_for_filtered, list_attribute_selection, list_of_comparison_metric = set_weka_options()

    #### ATTRIBUTE SELECTION ####
    #############################
    print("-- ATTRIBUTE SELECTION")
    # to convert files before using function use_filter
    all_datasets_filtered = attribute_selection(list_attribute_selection, datasets)
    end = time.time()
    print("---- duration of first phase: " + str(end-begin))

    ##### EXPERIMENTER ####
    ####################
    results = list()
    print("-- EXPERIMENTER FULL")
    begin = time.time()
    results += experimenter(datasets, base_res, nb_runs, nb_folds, classifiers)
    end = time.time()
    print("---- duration of phase: " + str(end-begin))

    print("-- EXPERIMENTER FILTERED")
    begin = time.time()
    results += experimenter_filtered(all_datasets_filtered, base_res, nb_runs, nb_folds, classifiers_for_filtered)
    end = time.time()
    print("---- duration of phase: " + str(end-begin))

    #### VISUALISATION + RESULTS #####
    ##################################
    #TODO: check possibilities
    full_expe_printer(results, list_of_comparison_metric, res_latex_dir+ pattern + "_" + str(nb_folds) + "folds_latex")
finally:
    # Always stop the JVM and remove the temporary filtered files, even when
    # one of the phases above raises.
    jvm.stop()
    shutil.rmtree(tmp_dir, ignore_errors=True)
print("-- THE END --")