def process_by_ml_name(ml):
    """Cross-validate each estimator in *ml* on the first configured dataset.

    Args:
        ml: iterable of scikit-learn-style estimators.

    Returns:
        list of mean 5-fold cross-validation scores, one per estimator,
        in iteration order.
    """
    from dataset_loader import DataSetLoader
    from sklearn import cross_validation

    print('start {}'.format(ml))
    loader = DataSetLoader()
    # loadData() returns a dict of datasets keyed by name; use the first
    # configured dataset name (see DataSetLoader.dataset_name).
    x, y = loader.loadData()[DataSetLoader.dataset_name[0]]
    score_lst = []
    # BUG FIX: the original loop was `for ml in ml:`, rebinding the parameter
    # name to each element and shadowing the input list — use a distinct name.
    for estimator in ml:
        print('start cross val')
        scores = cross_validation.cross_val_score(estimator, x, y, cv=5)
        print('end cross val')
        score_lst.append(scores.mean())
    return score_lst
def load_dataset(self):
    """Load all configured datasets.

    Returns:
        The mapping produced by DataSetLoader.loadData() — dataset name
        to its (x, y) data pair.
    """
    return DataSetLoader().loadData()
def process(self):
    """Run the full SVM benchmark for ``self.dataset_name``.

    For each configured split size in ``self.data_size``: cross-validate an
    SVM on one random split, then repeat fit/predict ``self.loop`` times on
    fresh random splits, collecting accuracy, F1, precision, recall,
    prediction time and test-set size.  Per-size means are accumulated,
    stored in ``self.result[self.dataset_name]``, pickled to
    ``<dataset>_svm_result_<loop>.obj`` and passed to ``report_all``.
    """
    ml_lst = self.gen_ml_lst()
    dataset_lst = self.load_dataset()
    ml_value = ml_lst['svm']
    self.log_debug.info('*************************************** ' + self.dataset_name)
    all_data = []
    self.log_debug.info('***** start ' + self.dataset_name)

    # Select the raw (x, y) arrays: 'shuttle' and 'segment' use dedicated
    # loaders; every other dataset comes from load_dataset().
    if self.dataset_name == 'shuttle':
        d_loader = DataSetLoader()
        data_value = d_loader.svm_shuttle('data/statlog/shuttle.data')['shuttle']
        x_data = data_value[0]
        y_data = data_value[1]
        if is_run_missing:  # NOTE(review): presumably a module-level flag — confirm
            print('before************** ', x_data[0], y_data)
            x_data, y_data = self.svm_uni_remove(x_data, y_data)
            print('after****************', x_data[0], y_data)
    elif self.dataset_name == 'segment':
        d_loader = DataSetLoader()
        data_value = d_loader.load_segment()
        x_data = data_value[0]
        y_data = data_value[1]
    else:
        data_value = dataset_lst[self.dataset_name]
        x_data = data_value[0]
        y_data = data_value[1]
        if is_run_missing:
            print('before************** ', x_data[0], y_data)
            x_data, y_data = self.remove_by_chi2_process(x_data, y_data)
            print('after****************', x_data[0], y_data)

    datasets_data_lst = []
    ml = None
    for d_size in self.data_size:
        self.log_debug.info('***** start size ' + str(d_size))
        ran_num = random.randint(1, 100)
        x_train, x_test, y_train, y_test = train_test_split(
            x_data, y_data, test_size=d_size, random_state=ran_num)
        print('x train ', x_train)
        self.log_debug.info('********* start cross validation')
        ml = self.cross_validation(ml_value, x_train, y_train)
        self.log_kernel.info('************* kernel : ' + str(ml.kernel) + " | degree : " + str(ml.degree))
        self.log_debug.info('************* end cross validation')

        acc_lst = []
        f1_lst = []
        time_pred = []
        total_ins = []
        precision_lst = []
        recall_lst = []
        for i in range(0, self.loop):
            self.log_debug.info('loop {} size {} data set {} ml {}'.format(
                i, d_size, self.dataset_name, 'svm'))
            ran_num = random.randint(1, 10000)
            x_train, x_test, y_train, y_test = train_test_split(
                x_data, y_data, test_size=d_size, random_state=ran_num)
            try:
                ml_c = copy.deepcopy(ml)
                ml_c.fit(x_train, y_train)
                start = time.time()  # time only the predict call
                y_pred = ml_c.predict(x_test)
            except Exception as e:
                # BUG FIX: the original logged and fell through, then used
                # `start`/`y_pred` — undefined on the first failure, stale on
                # later ones.  Skip this iteration instead.
                self.log_error.info(str(e))
                continue
            total_time = time.time() - start
            acc = accuracy_score(y_test, y_pred)
            print('y_test ', y_test)
            print('y_pred ', y_pred)
            fsc = f1_score(y_test, y_pred)
            acc_lst.append(acc)
            f1_lst.append(fsc)
            time_pred.append(total_time)
            total_ins.append(len(y_test))
            precision_lst.append(precision_score(y_test, y_pred))
            recall_lst.append(recall_score(y_test, y_pred))
            self.log_debug.info('------------- end loop -----')

        # Aggregate this split size: mean accuracy, F1 (5 d.p.), predict time
        # and test-set size are appended onto one running list.
        datasets_data_lst.append(np.mean(acc_lst))
        datasets_data_lst.append(float("{:.5f}".format(np.mean(f1_lst))))
        datasets_data_lst.append(np.mean(time_pred))
        datasets_data_lst.append(np.mean(total_ins))
        self.log.info('---------------------------------------------')
        self.log.info('data size ' + str(d_size) + ' data set ' + self.dataset_name)
        self.log.info(acc_lst)
        self.log.info(f1_lst)
        self.log.info(time_pred)
        self.log.info(total_ins)
        self.log.info('---------------------------------------------')
        self.log_debug.info('*********** end size')

    self.log.info('ml type ' + str(ml.kernel))
    all_data.append(datasets_data_lst)
    self.log_debug.info('******* end data set')
    self.result[self.dataset_name] = all_data
    self.log_debug.info('************ end ml')
    file_name = '{}_svm_result_{}.obj'.format(self.dataset_name, self.loop)
    # BUG FIX: use a context manager so the pickle file handle is not leaked.
    with open(file_name, 'wb') as fh:
        pickle.dump(self.result, fh)
    # BUG FIX: the original passed a local `result = {}` that was never
    # populated; the collected data lives in self.result.
    self.report_all(self.result)
# NOTE(review): the next three statements are the tail of a method whose
# `def` line is outside this chunk — presumably `score(self, X, y)`; it
# scores predict(X) against y as the mean of accuracy and F1.  TODO confirm
# against the full file.
        y_pred = self.predict(X)
        average_score = (accuracy_score(y, y_pred) + f1_score(y, y_pred)) / 2.0
        return average_score

    def predict(self, x):
        """Predict labels for *x* by shelling out to the libsvm CLI.

        Writes *x* (with dummy all-zero labels) to ``self.path_test_data``,
        runs ``svm-predict`` with the trained model at
        ``self.path_model_result``, and reads the predicted labels back
        from ``self.path_result``.
        """
        f_result = open(self.path_test_data, 'w')
        self.__write_data_file(f_result, x, [0] * len(x))
        f_result.close()
        # External command: svm-predict <test file> <model file> <output file>.
        create_predict = libsvm_path + '/svm-predict' + ' {} {} {}'.format(self.path_test_data, self.path_model_result, self.path_result)
        print create_predict
        # NOTE(review): os.system with a string built from paths — fine for
        # trusted local paths, but not safe for untrusted input.
        os.system(create_predict)
        return self.__read_result()

    def get_params(self, deep=True):
        # scikit-learn estimator API hook; only the kernel id is exposed here.
        return {"kernel": self.kernel}

    def set_params(self, **parameters):
        # scikit-learn estimator API hook: set each supplied keyword as an
        # attribute on this estimator.
        for parameter, value in parameters.items():
            setattr(self, parameter, value)


if __name__ == '__main__':
    # Smoke test: fit the libsvm wrapper (linear kernel, id 0) on the
    # 'heart' dataset using 25% of the data for training.
    from dataset_loader import DataSetLoader
    from sklearn.cross_validation import train_test_split
    loader = DataSetLoader()
    x, y = loader.loadData()['heart']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.75, random_state=42)
    ml = LibSVMWrapper(kernel=0)
    ml.fit(x_train, y_train)