def gbdt(i_time=10):
    '''
    Train and evaluate a gradient boosting classifier ``i_time`` times.

    Each run builds a fresh ``GradientBoostingClassifier`` from the 'gbdt'
    hyper-parameter section, obtains its data from ``read_data.trans_data``
    and scores the fitted model with ``evaluate.calc_all``. The per-run
    results are summed with ``tools.dict_add`` and finally divided by
    ``i_time`` with ``tools.dict_div``.

    Parameters:
        i_time: number of models to train; the returned metrics are the
            accumulated values divided by this number.

    Returns:
        The accumulated-and-divided result object produced by
        ``evaluate.calc_all`` (it is JSON-serialised, so presumably a dict
        of metrics).

    Side effects:
        Rewrites the module-level ``result_file_path`` on the first run,
        appends to it on later runs, and appends the final JSON line.
    '''
    one_hot_key = read_config['gbdt']['one_hot_key']
    standard_scaler = read_config['gbdt']['standard_scaler']
    hyper_paras = get_section('gbdt')

    for i in range(i_time):
        gbdt_clf = GradientBoostingClassifier(
            learning_rate=hyper_paras['learning_rate'],
            n_estimators=hyper_paras['n_estimators'],
            max_depth=hyper_paras['max_depth'],
            min_samples_split=hyper_paras['min_samples_split'],
        )
        temp_file_name = os.path.join(
            file_path, '../output_data/gbdt/gbdt{}.csv'.format(i))
        X, y, X_test, y_test = read_data.trans_data(
            one_hot_key=one_hot_key,
            standard_scaler=standard_scaler,
            filename=temp_file_name)

        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for measuring elapsed time.
        start = time.perf_counter()
        gbdt_clf.fit(X, y)
        train_time = time.perf_counter() - start

        if i == 0:
            # First run truncates the results file and also saves the image.
            with open(result_file_path, 'w') as file_describe:
                result = evaluate.calc_all(gbdt_clf,
                                           X_test,
                                           y_test,
                                           train_time,
                                           file_describe,
                                           save_img=True,
                                           filename="gbdt.png",
                                           result_filename=temp_file_name)
        else:
            # Later runs append and are folded into the running totals.
            with open(result_file_path, 'a') as file_describe:
                tools.dict_add(
                    result,
                    evaluate.calc_all(gbdt_clf,
                                      X_test,
                                      y_test,
                                      train_time,
                                      file_describe,
                                      result_filename=temp_file_name))
        # NOTE(review): presumably reports progress of one finished run.
        tools.add_complete()

    tools.dict_div(result, i_time)
    with open(result_file_path, 'a') as file_describe:
        file_describe.write(json.dumps(result))
        file_describe.write('\n')
    return result
def logistic(X=None, y=None, X_test=None, y_test=None, i_time=10):
    '''
    Train and evaluate a logistic regression model ``i_time`` times.

    Each run builds a ``LogisticRegression`` from the 'logistic'
    hyper-parameter section with a random ``random_state`` drawn from
    ``[0, i_time * 10)``, fits it and scores it with
    ``evaluate.calc_all``. Per-run results are summed with
    ``tools.dict_add`` and finally divided by ``i_time``.

    Parameters:
        X, y, X_test, y_test: accepted for interface compatibility.
            NOTE(review): they are currently always overwritten by
            ``read_data.trans_data`` inside the loop, so passing them has
            no effect.
        i_time: number of models to train and average over.

    Returns:
        The accumulated-and-divided result object produced by
        ``evaluate.calc_all`` (JSON-serialisable, presumably a dict).

    Side effects:
        Rewrites the module-level ``result_file_path`` on the first run,
        appends to it afterwards, and appends the final JSON line.
    '''
    one_hot_key = read_config['logistic']['one_hot_key']
    standard_scaler = read_config['logistic']['standard_scaler']
    hyper_paras = get_section('logistic')

    for i in range(i_time):
        lg_reg = LogisticRegression(
            C=hyper_paras['c'],
            max_iter=hyper_paras['max_iter'],
            penalty=hyper_paras['penalty'],
            random_state=np.random.randint(0, i_time * 10))
        temp_file_name = os.path.join(
            file_path, '../output_data/logistic/logistic{}.csv'.format(i))
        X, y, X_test, y_test = read_data.trans_data(
            one_hot_key=one_hot_key,
            standard_scaler=standard_scaler,
            filename=temp_file_name)

        # time.clock() was removed in Python 3.8; use perf_counter().
        start = time.perf_counter()
        lg_reg.fit(X, y)
        train_time = time.perf_counter() - start

        if i == 0:
            # First run truncates the results file and also saves the image.
            with open(result_file_path, 'w') as file_describe:
                result = evaluate.calc_all(lg_reg,
                                           X_test,
                                           y_test,
                                           train_time,
                                           file_describe,
                                           save_img=True,
                                           filename="logistic.png",
                                           result_filename=temp_file_name)
        else:
            with open(result_file_path, 'a') as file_describe:
                tools.dict_add(
                    result,
                    evaluate.calc_all(lg_reg,
                                      X_test,
                                      y_test,
                                      train_time,
                                      file_describe,
                                      result_filename=temp_file_name))
        # NOTE(review): presumably reports progress of one finished run.
        tools.add_complete()

    tools.dict_div(result, i_time)
    with open(result_file_path, 'a') as file_describe:
        file_describe.write(json.dumps(result))
        file_describe.write('\n')
    return result
def ada_boost(i_time=10):
    '''
    Train and evaluate an AdaBoost classifier ``i_time`` times.

    Each run builds an ``AdaBoostClassifier`` from the 'ada_boost'
    hyper-parameter section, obtains its data from
    ``read_data.trans_data`` and scores the fitted model with
    ``evaluate.calc_all``. Per-run results are summed with
    ``tools.dict_add`` and finally divided by ``i_time``.

    Parameters:
        i_time: number of models to train and average over.

    Returns:
        The accumulated-and-divided result object produced by
        ``evaluate.calc_all`` (JSON-serialisable, presumably a dict).

    Side effects:
        Rewrites the module-level ``result_file_path`` on the first run,
        appends to it afterwards, and appends the final JSON line.
    '''
    # NOTE(review): hyper-parameters are read from section 'ada_boost' but
    # the preprocessing flags from read_config['adaboost'] -- confirm that
    # both spellings exist in the configuration.
    hyper_paras = get_section('ada_boost')
    one_hot_key = read_config['adaboost']['one_hot_key']
    standard_scaler = read_config['adaboost']['standard_scaler']

    for i in range(i_time):
        ada_clf = AdaBoostClassifier(
            n_estimators=hyper_paras['n_estimators'],
            learning_rate=hyper_paras['learning_rate'],
            algorithm=hyper_paras['algorithm'],
        )
        temp_file_name = os.path.join(
            file_path, '../output_data/adaboost/adaboost{}.csv'.format(i))
        X, y, X_test, y_test = read_data.trans_data(
            one_hot_key=one_hot_key,
            standard_scaler=standard_scaler,
            filename=temp_file_name)

        # time.clock() was removed in Python 3.8; use perf_counter().
        start = time.perf_counter()
        ada_clf.fit(X, y)
        train_time = time.perf_counter() - start

        if i == 0:
            # First run truncates the results file and also saves the image.
            with open(result_file_path, 'w') as file_describe:
                result = evaluate.calc_all(ada_clf,
                                           X_test,
                                           y_test,
                                           train_time,
                                           file_describe,
                                           save_img=True,
                                           filename="adaboost.png",
                                           result_filename=temp_file_name)
        else:
            with open(result_file_path, 'a') as file_describe:
                tools.dict_add(
                    result,
                    evaluate.calc_all(ada_clf,
                                      X_test,
                                      y_test,
                                      train_time,
                                      file_describe,
                                      result_filename=temp_file_name))
        # NOTE(review): presumably reports progress of one finished run.
        tools.add_complete()

    tools.dict_div(result, i_time)
    with open(result_file_path, 'a') as file_describe:
        file_describe.write(json.dumps(result))
        file_describe.write('\n')
    return result
def knn(i_time=10):
    '''
    Train and evaluate a k-nearest-neighbours classifier ``i_time`` times.

    Each run builds a ``KNeighborsClassifier`` from the 'knn'
    hyper-parameter section, obtains its data from
    ``read_data.trans_data`` and scores the fitted model with
    ``evaluate.calc_all``. Per-run results are summed with
    ``tools.dict_add`` and finally divided by ``i_time``.

    Parameters:
        i_time: number of models to train and average over.

    Returns:
        The accumulated-and-divided result object produced by
        ``evaluate.calc_all`` (JSON-serialisable, presumably a dict).

    Side effects:
        Rewrites the module-level ``result_file_path`` on the first run,
        appends to it afterwards, and appends the final JSON line.
    '''
    one_hot_key = read_config['knn']['one_hot_key']
    standard_scaler = read_config['knn']['standard_scaler']
    hyper_paras = get_section('knn')

    for i in range(i_time):
        knn_clf = KNeighborsClassifier(
            algorithm=hyper_paras['algorithm'],
            leaf_size=hyper_paras['leaf_size'],
            n_neighbors=hyper_paras['n_neighbors'])
        temp_file_name = os.path.join(
            file_path, '../output_data/knn/knn{}.csv'.format(i))
        X, y, X_test, y_test = read_data.trans_data(
            one_hot_key=one_hot_key,
            standard_scaler=standard_scaler,
            filename=temp_file_name)

        # time.clock() was removed in Python 3.8; use perf_counter().
        start = time.perf_counter()
        knn_clf.fit(X, y)
        train_time = time.perf_counter() - start

        if i == 0:
            # First run truncates the results file and also saves the image.
            with open(result_file_path, 'w') as file_describe:
                result = evaluate.calc_all(knn_clf,
                                           X_test,
                                           y_test,
                                           train_time,
                                           file_describe,
                                           save_img=True,
                                           filename="knn.png",
                                           result_filename=temp_file_name)
        else:
            with open(result_file_path, 'a') as file_describe:
                tools.dict_add(
                    result,
                    evaluate.calc_all(knn_clf,
                                      X_test,
                                      y_test,
                                      train_time,
                                      file_describe,
                                      result_filename=temp_file_name))
        # NOTE(review): presumably reports progress of one finished run.
        tools.add_complete()

    tools.dict_div(result, i_time)
    with open(result_file_path, 'a') as file_describe:
        file_describe.write(json.dumps(result))
        file_describe.write('\n')
    return result
def svm(X=None, y=None, X_test=None, y_test=None, i_time=10):
    '''
    Sweep the positive-class weight of a binary SVM and print the result.

    Labels 2 and 3 are merged into class 1, then an ``SVC`` with
    ``class_weight={0: 1, 1: w}`` is fitted for 250 values of ``w``
    evenly spaced in [3, 13]. Each fitted model is scored with
    ``evaluate.calc_all``.

    Parameters:
        X, y, X_test, y_test: optional train/test data. If any of them is
            ``None`` all four are loaded through ``read_data`` using the
            'svm' one-hot setting and ``standard_scaler=True``.
        i_time: divisor applied to the final result via
            ``tools.dict_div``.

    Returns:
        None. The final result is printed.
    '''
    # `== None` on numpy arrays is element-wise and makes the `or` chain
    # raise ValueError; identity comparison is what is intended here.
    if X is None or y is None or X_test is None or y_test is None:
        one_hot_key = read_config['svm']['one_hot_key']
        standard_scaler = True
        X, y = read_data.get_train_data(one_hot_key=one_hot_key,
                                        standard_scaler=standard_scaler)
        X_test, y_test = read_data.get_test_data(
            one_hot_key=one_hot_key, standard_scaler=standard_scaler)

    # Work on copies so that label merging does not silently rewrite the
    # arrays owned by the caller.
    y = np.copy(y)
    y_test = np.copy(y_test)
    for merged_label in (2, 3):
        y[y == merged_label] = 1
        y_test[y_test == merged_label] = 1

    # Currently unused: the classifier below is configured only through
    # class_weight. The call is kept so a missing 'svm' section still
    # surfaces here.
    hyper_paras = get_section('svm')

    for i in np.linspace(3, 13, 250):
        # Manual class balancing: weight of class 1 relative to class 0.
        svm_clf = SVC(class_weight={0: 1, 1: i})
        # time.clock() was removed in Python 3.8; use perf_counter().
        start = time.perf_counter()
        svm_clf.fit(X, y)
        train_time = time.perf_counter() - start
        result = evaluate.calc_all(svm_clf, X_test, y_test, train_time)

    # NOTE(review): only the result of the last weight survives the loop,
    # and it is still divided by i_time -- confirm this is intended.
    tools.dict_div(result, i_time)
    print(result)
def k_mean(X=None, y=None, X_test=None, y_test=None, i_time=10):
    '''
    Fit and evaluate a 4-cluster k-means model ``i_time`` times.

    Each run fits ``KMeans(n_clusters=4)`` on the training features,
    predicts the test set, scores the model with ``evaluate.calc_all``
    and overrides the 'precision' entry with ``_cal_precision``. Every
    run's result is written as one JSON line to the module-level
    ``real_result_file_path``; the totals divided by ``i_time`` are
    appended as the last line.

    Parameters:
        X, y, X_test, y_test: accepted for interface compatibility.
            NOTE(review): they are currently always overwritten by
            ``read_data.trans_data`` inside the loop.
        i_time: number of models to fit and average over.

    Returns:
        The accumulated-and-divided result object (JSON-serialisable).
    '''
    one_hot_key = read_config['k_mean']['one_hot_key']
    standard_scaler = read_config['k_mean']['standard_scaler']

    for i in range(i_time):
        k_mean_clf = KMeans(n_clusters=4)
        temp_file_name = os.path.join(
            file_path, '../output_data/k_mean/k_mean{}.csv'.format(i))
        X, y, X_test, y_test = read_data.trans_data(
            one_hot_key=one_hot_key,
            standard_scaler=standard_scaler,
            filename=temp_file_name)

        # time.clock() was removed in Python 3.8; use perf_counter().
        # The measured interval covers both fitting and prediction.
        start = time.perf_counter()
        k_mean_clf.fit(X)
        y_predict = k_mean_clf.predict(X_test)
        train_time = time.perf_counter() - start

        if i == 0:
            with open(result_file_path, 'w') as file_describe:
                result = evaluate.calc_all(k_mean_clf,
                                           X_test,
                                           y_test,
                                           train_time,
                                           file_describe,
                                           result_filename=temp_file_name)
            result['precision'] = _cal_precision(y_test, y_predict)
            with open(real_result_file_path, 'w') as file_describe:
                file_describe.write(json.dumps(result))
                file_describe.write('\n')
        else:
            with open(result_file_path, 'a') as file_describe:
                temp_result = evaluate.calc_all(
                    k_mean_clf,
                    X_test,
                    y_test,
                    train_time,
                    file_describe,
                    save_img=True,
                    filename="k_mean.png",
                    result_filename=temp_file_name)
            temp_result['precision'] = _cal_precision(y_test, y_predict)
            with open(real_result_file_path, 'a') as file_describe:
                # Log this run's own metrics; previously the running
                # total `result` was dumped here by mistake.
                file_describe.write(json.dumps(temp_result))
                file_describe.write('\n')
            tools.dict_add(result, temp_result)
        # NOTE(review): presumably reports progress of one finished run.
        tools.add_complete()

    tools.dict_div(result, i_time)
    with open(real_result_file_path, 'a') as file_describe:
        file_describe.write(json.dumps(result))
        file_describe.write('\n')
    return result
def random_forest(X=None, y=None, X_test=None, y_test=None, i_time=10):
    '''
    Train and evaluate a random forest classifier ``i_time`` times.

    Each run builds a ``RandomForestClassifier`` from the 'random_forest'
    hyper-parameter section with a random ``random_state`` drawn from
    ``[0, i_time * 10)``, fits it and scores it with
    ``evaluate.calc_all``. Per-run results are summed with
    ``tools.dict_add`` and finally divided by ``i_time``.

    Parameters:
        X, y, X_test, y_test: accepted for interface compatibility.
            NOTE(review): they are currently always overwritten by
            ``read_data.trans_data`` inside the loop.
        i_time: number of models to train and average over.

    Returns:
        The accumulated-and-divided result object produced by
        ``evaluate.calc_all`` (JSON-serialisable, presumably a dict).

    Side effects:
        Rewrites the module-level ``result_file_path`` on the first run,
        appends to it afterwards, and appends the final JSON line.
    '''
    one_hot_key = read_config['random_forest']['one_hot_key']
    standard_scaler = read_config['random_forest']['standard_scaler']
    hyper_paras = get_section('random_forest')

    for i in range(i_time):
        forest_clf = RandomForestClassifier(
            max_depth=hyper_paras['max_depth'],
            max_features=hyper_paras['max_features'],
            min_samples_split=hyper_paras['min_samples_split'],
            n_estimators=hyper_paras['n_estimators'],
            min_samples_leaf=hyper_paras['min_samples_leaf'],
            random_state=np.random.randint(0, i_time * 10))
        temp_file_name = os.path.join(
            file_path,
            '../output_data/random_forest/random_forest{}.csv'.format(i))
        X, y, X_test, y_test = read_data.trans_data(
            one_hot_key=one_hot_key,
            standard_scaler=standard_scaler,
            filename=temp_file_name)

        # time.clock() was removed in Python 3.8; use perf_counter().
        start = time.perf_counter()
        forest_clf.fit(X, y)
        train_time = time.perf_counter() - start

        if i == 0:
            # First run truncates the results file and also saves the image.
            with open(result_file_path, 'w') as file_describe:
                result = evaluate.calc_all(forest_clf,
                                           X_test,
                                           y_test,
                                           train_time,
                                           file_describe,
                                           save_img=True,
                                           filename="random_forest.png",
                                           result_filename=temp_file_name)
        else:
            with open(result_file_path, 'a') as file_describe:
                tools.dict_add(
                    result,
                    evaluate.calc_all(forest_clf,
                                      X_test,
                                      y_test,
                                      train_time,
                                      file_describe,
                                      result_filename=temp_file_name))
        # NOTE(review): presumably reports progress of one finished run.
        tools.add_complete()

    tools.dict_div(result, i_time)
    with open(result_file_path, 'a') as file_describe:
        file_describe.write(json.dumps(result))
        file_describe.write('\n')
    return result
def net(i_time=10):
    '''
    Train and evaluate the TensorFlow network ``i_time`` times.

    Each run opens a fresh ``tf.Session``, runs ``init``, trains for
    ``n_epoches`` epochs of mini-batches drawn with ``next_batch``, then
    evaluates ``logists`` on the test set. The predicted class per sample
    is the index of the largest output; predictions are wrapped in
    ``Temp_result_model`` and scored with ``evaluate.calc_all``. Per-run
    results are summed with ``tools.dict_add`` and finally divided by
    ``i_time``.

    The graph objects (``init``, ``training_op``, ``extra_update_ops``,
    ``training``, ``X_tensor``, ``y_tensor``, ``logists``) and the
    ``n_epoches`` / ``batch_size`` settings are not defined here and must
    exist at module level.

    Parameters:
        i_time: number of training runs to average over.

    Returns:
        The accumulated-and-divided result object produced by
        ``evaluate.calc_all`` (JSON-serialisable, presumably a dict).

    Side effects:
        Rewrites the module-level ``result_file_path`` on the first run,
        appends to it afterwards, and appends the final JSON line.
    '''
    one_hot_key = read_config['net']['one_hot_key']
    standard_scaler = read_config['net']['standard_scaler']

    for i in range(i_time):
        temp_file_name = os.path.join(
            file_path, '../output_data/net/net{}.csv'.format(i))
        X, y, X_test, y_test = read_data.trans_data(
            one_hot_key=one_hot_key,
            standard_scaler=standard_scaler,
            filename=temp_file_name)

        # time.clock() was removed in Python 3.8; use perf_counter().
        start = time.perf_counter()
        with tf.Session() as sess:
            init.run()
            for _epoch in range(n_epoches):
                for _step in range(len(X) // batch_size):
                    X_batch, y_batch = next_batch(batch_size, X, y)
                    sess.run([training_op, extra_update_ops],
                             feed_dict={
                                 training: True,
                                 X_tensor: X_batch,
                                 y_tensor: y_batch
                             })
            # Evaluated inside the session so that it is the default one.
            output = logists.eval(feed_dict={
                X_tensor: X_test,
                y_tensor: y_test
            })
            # argmax yields the first maximal index, so tied scores no
            # longer break the int() conversion of a multi-element array.
            resultx = [int(np.argmax(row)) for row in output]
            model = Temp_result_model(resultx)
        train_time = time.perf_counter() - start

        if i == 0:
            with open(result_file_path, 'w') as file_describe:
                result = evaluate.calc_all(model,
                                           X_test,
                                           y_test,
                                           train_time,
                                           file_describe,
                                           save_img=False,
                                           filename="net.png",
                                           result_filename=temp_file_name)
        else:
            with open(result_file_path, 'a') as file_describe:
                tools.dict_add(
                    result,
                    evaluate.calc_all(model,
                                      X_test,
                                      y_test,
                                      train_time,
                                      file_describe,
                                      result_filename=temp_file_name))
        # NOTE(review): presumably reports progress of one finished run.
        tools.add_complete()

    tools.dict_div(result, i_time)
    with open(result_file_path, 'a') as file_describe:
        file_describe.write(json.dumps(result))
        file_describe.write('\n')
    return result
def _tree_cls(method, i_time=10):
    '''
    Train and evaluate a decision tree for ``method`` ``i_time`` times.

    The tree variant is selected purely through configuration: the
    criterion, depth and split settings come from the hyper-parameter
    section named ``method``. Each fitted ``DecisionTreeClassifier`` is
    scored with ``evaluate.calc_all``; per-run results are summed with
    ``tools.dict_add`` and finally divided by ``i_time``.

    Parameters:
        method: one of 'id3', 'cart' or 'c4_5'. It selects the config
            section, the output directory, the image name and the
            results file '../frontend/data/<method>.json'.
        i_time: number of models to train and average over.

    Returns:
        The accumulated-and-divided result object produced by
        ``evaluate.calc_all`` (JSON-serialisable, presumably a dict).

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    '''
    if method not in ['id3', 'cart', 'c4_5']:
        raise ValueError('wrong input,just suuport id3,c4_5,cart.')

    one_hot_key = read_config[method]['one_hot_key']
    standard_scaler = read_config[method]['standard_scaler']
    result_file_path = os.path.join(
        file_path, '../frontend/data/{}.json'.format(method))
    img_path = '{}.png'.format(method)
    hyper_paras = get_section(method)

    for i in range(i_time):
        tree_clf = DecisionTreeClassifier(
            criterion=hyper_paras['criterion'],
            max_depth=hyper_paras['max_depth'],
            min_samples_split=hyper_paras['min_samples_split'])
        temp_file_name = os.path.join(
            file_path,
            '../output_data/{}/{}{}.csv'.format(method, method, i))
        X, y, X_test, y_test = read_data.trans_data(
            one_hot_key=one_hot_key,
            standard_scaler=standard_scaler,
            filename=temp_file_name)

        # time.clock() was removed in Python 3.8; use perf_counter().
        start = time.perf_counter()
        tree_clf.fit(X, y)
        train_time = time.perf_counter() - start

        if i == 0:
            # First run truncates the results file and also saves the image.
            with open(result_file_path, 'w') as file_describe:
                result = evaluate.calc_all(tree_clf,
                                           X_test,
                                           y_test,
                                           train_time,
                                           file_describe,
                                           save_img=True,
                                           filename=img_path,
                                           result_filename=temp_file_name)
        else:
            with open(result_file_path, 'a') as file_describe:
                tools.dict_add(
                    result,
                    evaluate.calc_all(tree_clf,
                                      X_test,
                                      y_test,
                                      train_time,
                                      file_describe,
                                      result_filename=temp_file_name))
        # NOTE(review): presumably reports progress of one finished run.
        tools.add_complete()

    tools.dict_div(result, i_time)
    with open(result_file_path, 'a') as file_describe:
        file_describe.write(json.dumps(result))
        file_describe.write('\n')
    return result