예제 #1
0
def gbdt(i_time=10):
    '''
    Train a gradient-boosting classifier ``i_time`` times and return the
    averaged evaluation result.

    Every round builds a fresh ``GradientBoostingClassifier`` from the
    ``gbdt`` hyper-parameter section, loads data through
    ``read_data.trans_data``, times ``fit`` and hands the fitted model to
    ``evaluate.calc_all``. The first round truncates ``result_file_path``
    and asks ``calc_all`` to save ``gbdt.png``; later rounds append to the
    same file. The final result is appended to that file as one JSON line.

    Parameters:
        i_time: number of models to train; the per-round results are
            combined with ``tools.dict_add`` and scaled with
            ``tools.dict_div(result, i_time)``.

    Returns:
        The accumulated, JSON-serialisable result object.

    Raises:
        ValueError: if ``i_time`` is smaller than 1 (there would be no
            result to average).
    '''
    if i_time < 1:
        raise ValueError(
            'i_time must be at least 1, got {!r}'.format(i_time))

    one_hot_key = read_config['gbdt']['one_hot_key']
    standard_scaler = read_config['gbdt']['standard_scaler']
    hyper_paras = get_section('gbdt')

    result = None
    for i in range(i_time):
        gbdt_clf = GradientBoostingClassifier(
            learning_rate=hyper_paras['learning_rate'],
            n_estimators=hyper_paras['n_estimators'],
            max_depth=hyper_paras['max_depth'],
            min_samples_split=hyper_paras['min_samples_split'],
        )
        temp_file_name = os.path.join(
            file_path, '../output_data/gbdt/gbdt{}.csv'.format(i))
        X, y, X_test, y_test = read_data.trans_data(
            one_hot_key=one_hot_key,
            standard_scaler=standard_scaler,
            filename=temp_file_name)

        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for measuring elapsed intervals.
        start = time.perf_counter()
        gbdt_clf.fit(X, y)
        train_time = time.perf_counter() - start

        first_round = i == 0
        # Only the first round truncates the result file and saves the image.
        extra_args = ({'save_img': True, 'filename': "gbdt.png"}
                      if first_round else {})
        # The context manager closes the handle after every round instead of
        # leaving one open file object behind per iteration.
        with open(result_file_path, 'w' if first_round else 'a') as file_describe:
            round_result = evaluate.calc_all(gbdt_clf,
                                             X_test,
                                             y_test,
                                             train_time,
                                             file_describe,
                                             result_filename=temp_file_name,
                                             **extra_args)
        if first_round:
            result = round_result
        else:
            # dict_add's return value is unused, so it is expected to update
            # ``result`` in place.
            tools.dict_add(result, round_result)
        tools.add_complete()

    tools.dict_div(result, i_time)
    with open(result_file_path, 'a') as file_describe:
        file_describe.write(json.dumps(result))
        file_describe.write('\n')
    return result
예제 #2
0
def logistic(X=None, y=None, X_test=None, y_test=None, i_time=10):
    '''
    Train a logistic-regression model ``i_time`` times and return the
    averaged evaluation result.

    Every round builds a fresh ``LogisticRegression`` from the ``logistic``
    hyper-parameter section (with a random ``random_state`` drawn from
    ``[0, i_time * 10)``), loads data through ``read_data.trans_data``,
    times ``fit`` and hands the fitted model to ``evaluate.calc_all``. The
    first round truncates ``result_file_path`` and asks ``calc_all`` to save
    ``logistic.png``; later rounds append. The final result is appended to
    that file as one JSON line.

    Parameters:
        X, y, X_test, y_test: accepted for interface compatibility only;
            whatever is passed is overwritten by the data returned from
            ``read_data.trans_data`` on every round.
        i_time: number of models to train; the per-round results are
            combined with ``tools.dict_add`` and scaled with
            ``tools.dict_div(result, i_time)``.

    Returns:
        The accumulated, JSON-serialisable result object.

    Raises:
        ValueError: if ``i_time`` is smaller than 1 (there would be no
            result to average).
    '''
    if i_time < 1:
        raise ValueError(
            'i_time must be at least 1, got {!r}'.format(i_time))

    one_hot_key = read_config['logistic']['one_hot_key']
    standard_scaler = read_config['logistic']['standard_scaler']
    hyper_paras = get_section('logistic')

    result = None
    for i in range(i_time):
        lg_reg = LogisticRegression(C=hyper_paras['c'],
                                    max_iter=hyper_paras['max_iter'],
                                    penalty=hyper_paras['penalty'],
                                    random_state=np.random.randint(
                                        0, i_time * 10))
        temp_file_name = os.path.join(
            file_path, '../output_data/logistic/logistic{}.csv'.format(i))
        X, y, X_test, y_test = read_data.trans_data(
            one_hot_key=one_hot_key,
            standard_scaler=standard_scaler,
            filename=temp_file_name)

        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for measuring elapsed intervals.
        start = time.perf_counter()
        lg_reg.fit(X, y)
        train_time = time.perf_counter() - start

        first_round = i == 0
        # Only the first round truncates the result file and saves the image.
        extra_args = ({'save_img': True, 'filename': "logistic.png"}
                      if first_round else {})
        # The context manager closes the handle after every round instead of
        # leaving one open file object behind per iteration.
        with open(result_file_path, 'w' if first_round else 'a') as file_describe:
            round_result = evaluate.calc_all(lg_reg,
                                             X_test,
                                             y_test,
                                             train_time,
                                             file_describe,
                                             result_filename=temp_file_name,
                                             **extra_args)
        if first_round:
            result = round_result
        else:
            # dict_add's return value is unused, so it is expected to update
            # ``result`` in place.
            tools.dict_add(result, round_result)
        tools.add_complete()

    tools.dict_div(result, i_time)
    with open(result_file_path, 'a') as file_describe:
        file_describe.write(json.dumps(result))
        file_describe.write('\n')
    return result
예제 #3
0
def ada_boost(i_time=10):
    '''
    Train an AdaBoost classifier ``i_time`` times and return the averaged
    evaluation result.

    Every round builds a fresh ``AdaBoostClassifier`` from the ``ada_boost``
    hyper-parameter section, loads data through ``read_data.trans_data``,
    times ``fit`` and hands the fitted model to ``evaluate.calc_all``. The
    first round truncates ``result_file_path`` and asks ``calc_all`` to save
    ``adaboost.png``; later rounds append. The final result is appended to
    that file as one JSON line.

    Parameters:
        i_time: number of models to train; the per-round results are
            combined with ``tools.dict_add`` and scaled with
            ``tools.dict_div(result, i_time)``.

    Returns:
        The accumulated, JSON-serialisable result object.

    Raises:
        ValueError: if ``i_time`` is smaller than 1 (there would be no
            result to average).
    '''
    if i_time < 1:
        raise ValueError(
            'i_time must be at least 1, got {!r}'.format(i_time))

    # NOTE(review): hyper-parameters are read from section 'ada_boost' while
    # the preprocessing flags come from 'adaboost' -- confirm both names
    # exist in the configuration and that the difference is intentional.
    hyper_paras = get_section('ada_boost')
    one_hot_key = read_config['adaboost']['one_hot_key']
    standard_scaler = read_config['adaboost']['standard_scaler']

    result = None
    for i in range(i_time):
        ada_clf = AdaBoostClassifier(
            n_estimators=hyper_paras['n_estimators'],
            learning_rate=hyper_paras['learning_rate'],
            algorithm=hyper_paras['algorithm'],
        )
        temp_file_name = os.path.join(
            file_path, '../output_data/adaboost/adaboost{}.csv'.format(i))
        X, y, X_test, y_test = read_data.trans_data(
            one_hot_key=one_hot_key,
            standard_scaler=standard_scaler,
            filename=temp_file_name)

        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for measuring elapsed intervals.
        start = time.perf_counter()
        ada_clf.fit(X, y)
        train_time = time.perf_counter() - start

        first_round = i == 0
        # Only the first round truncates the result file and saves the image.
        extra_args = ({'save_img': True, 'filename': "adaboost.png"}
                      if first_round else {})
        # The context manager closes the handle after every round instead of
        # leaving one open file object behind per iteration.
        with open(result_file_path, 'w' if first_round else 'a') as file_describe:
            round_result = evaluate.calc_all(ada_clf,
                                             X_test,
                                             y_test,
                                             train_time,
                                             file_describe,
                                             result_filename=temp_file_name,
                                             **extra_args)
        if first_round:
            result = round_result
        else:
            # dict_add's return value is unused, so it is expected to update
            # ``result`` in place.
            tools.dict_add(result, round_result)
        tools.add_complete()

    tools.dict_div(result, i_time)
    with open(result_file_path, 'a') as file_describe:
        file_describe.write(json.dumps(result))
        file_describe.write('\n')
    return result
예제 #4
0
def knn(i_time=10):
    '''
    Train a k-nearest-neighbours classifier ``i_time`` times and return the
    averaged evaluation result.

    Every round builds a fresh ``KNeighborsClassifier`` from the ``knn``
    hyper-parameter section, loads data through ``read_data.trans_data``,
    times ``fit`` and hands the fitted model to ``evaluate.calc_all``. The
    first round truncates ``result_file_path`` and asks ``calc_all`` to save
    ``knn.png``; later rounds append. The final result is appended to that
    file as one JSON line.

    Parameters:
        i_time: number of models to train; the per-round results are
            combined with ``tools.dict_add`` and scaled with
            ``tools.dict_div(result, i_time)``.

    Returns:
        The accumulated, JSON-serialisable result object.

    Raises:
        ValueError: if ``i_time`` is smaller than 1 (there would be no
            result to average).
    '''
    if i_time < 1:
        raise ValueError(
            'i_time must be at least 1, got {!r}'.format(i_time))

    one_hot_key = read_config['knn']['one_hot_key']
    standard_scaler = read_config['knn']['standard_scaler']
    hyper_paras = get_section('knn')

    result = None
    for i in range(i_time):
        knn_clf = KNeighborsClassifier(algorithm=hyper_paras['algorithm'],
                                       leaf_size=hyper_paras['leaf_size'],
                                       n_neighbors=hyper_paras['n_neighbors'])
        temp_file_name = os.path.join(file_path,
                                      '../output_data/knn/knn{}.csv'.format(i))
        X, y, X_test, y_test = read_data.trans_data(
            one_hot_key=one_hot_key,
            standard_scaler=standard_scaler,
            filename=temp_file_name)

        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for measuring elapsed intervals.
        start = time.perf_counter()
        knn_clf.fit(X, y)
        train_time = time.perf_counter() - start

        first_round = i == 0
        # Only the first round truncates the result file and saves the image.
        extra_args = ({'save_img': True, 'filename': "knn.png"}
                      if first_round else {})
        # The context manager closes the handle after every round instead of
        # leaving one open file object behind per iteration.
        with open(result_file_path, 'w' if first_round else 'a') as file_describe:
            round_result = evaluate.calc_all(knn_clf,
                                             X_test,
                                             y_test,
                                             train_time,
                                             file_describe,
                                             result_filename=temp_file_name,
                                             **extra_args)
        if first_round:
            result = round_result
        else:
            # dict_add's return value is unused, so it is expected to update
            # ``result`` in place.
            tools.dict_add(result, round_result)
        tools.add_complete()

    tools.dict_div(result, i_time)
    with open(result_file_path, 'a') as file_describe:
        file_describe.write(json.dumps(result))
        file_describe.write('\n')
    return result
예제 #5
0
def svm(X=None, y=None, X_test=None, y_test=None, i_time=10):
    '''
    Sweep the class-1 weight of an ``SVC`` and print the evaluation result
    for each weight.

    For each of the 250 evenly spaced weights in ``[3, 13]`` a default
    ``SVC`` with ``class_weight={0: 1, 1: weight}`` is fitted, timed and
    evaluated with ``evaluate.calc_all``; the result is printed.

    Parameters:
        X, y, X_test, y_test: training and test data. If any of them is
            ``None`` all four are loaded through
            ``read_data.get_train_data`` / ``read_data.get_test_data`` with
            standard scaling forced on, and labels 2 and 3 are rewritten to
            1 in the loaded label arrays. Data passed in by the caller is
            used as given, without that relabelling.
        i_time: divisor handed to ``tools.dict_div`` for every result.

    Returns:
        None. Results are only printed.
    '''
    # Identity checks are required here: ``X == None`` on a NumPy array
    # compares element-wise, and the resulting array cannot be used as an
    # operand of ``or`` (ValueError: truth value is ambiguous).
    if X is None or y is None or X_test is None or y_test is None:
        one_hot_key = read_config['svm']['one_hot_key']
        standard_scaler = True
        X, y = read_data.get_train_data(one_hot_key=one_hot_key,
                                        standard_scaler=standard_scaler)
        X_test, y_test = read_data.get_test_data(
            one_hot_key=one_hot_key, standard_scaler=standard_scaler)

        # Merge labels 2 and 3 into class 1; the class_weight mapping used
        # below only names classes 0 and 1.
        y[y == 2] = 1
        y[y == 3] = 1
        y_test[y_test == 2] = 1
        y_test[y_test == 3] = 1

    # Not used by the sweep below (the SVC runs with library defaults apart
    # from class_weight); the call is retained in case get_section has side
    # effects or validates the section.
    hyper_paras = get_section('svm')

    for class_one_weight in np.linspace(3, 13, 250):
        svm_clf = SVC(class_weight={0: 1, 1: class_one_weight})

        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for measuring elapsed intervals.
        start = time.perf_counter()
        svm_clf.fit(X, y)
        train_time = time.perf_counter() - start

        result = evaluate.calc_all(svm_clf, X_test, y_test, train_time)
        # NOTE(review): unlike the other drivers, nothing is accumulated
        # before this call, so a single run's result is passed to dict_div
        # with i_time -- confirm this scaling is intended.
        tools.dict_div(result, i_time)
        print(result)
def random_forest(X=None, y=None, X_test=None, y_test=None, i_time=10):
    '''
    Train a random-forest classifier ``i_time`` times and return the
    averaged evaluation result.

    Every round builds a fresh ``RandomForestClassifier`` from the
    ``random_forest`` hyper-parameter section (with a random
    ``random_state`` drawn from ``[0, i_time * 10)``), loads data through
    ``read_data.trans_data``, times ``fit`` and hands the fitted model to
    ``evaluate.calc_all``. The first round truncates ``result_file_path``
    and asks ``calc_all`` to save ``random_forest.png``; later rounds
    append. The final result is appended to that file as one JSON line.

    Parameters:
        X, y, X_test, y_test: accepted for interface compatibility only;
            whatever is passed is overwritten by the data returned from
            ``read_data.trans_data`` on every round.
        i_time: number of models to train; the per-round results are
            combined with ``tools.dict_add`` and scaled with
            ``tools.dict_div(result, i_time)``.

    Returns:
        The accumulated, JSON-serialisable result object.

    Raises:
        ValueError: if ``i_time`` is smaller than 1 (there would be no
            result to average).
    '''
    if i_time < 1:
        raise ValueError(
            'i_time must be at least 1, got {!r}'.format(i_time))

    one_hot_key = read_config['random_forest']['one_hot_key']
    standard_scaler = read_config['random_forest']['standard_scaler']
    hyper_paras = get_section('random_forest')

    result = None
    for i in range(i_time):
        forest_clf = RandomForestClassifier(
            max_depth=hyper_paras['max_depth'],
            max_features=hyper_paras['max_features'],
            min_samples_split=hyper_paras['min_samples_split'],
            n_estimators=hyper_paras['n_estimators'],
            min_samples_leaf=hyper_paras['min_samples_leaf'],
            random_state=np.random.randint(0, i_time * 10))
        temp_file_name = os.path.join(
            file_path,
            '../output_data/random_forest/random_forest{}.csv'.format(i))
        X, y, X_test, y_test = read_data.trans_data(
            one_hot_key=one_hot_key,
            standard_scaler=standard_scaler,
            filename=temp_file_name)

        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for measuring elapsed intervals.
        start = time.perf_counter()
        forest_clf.fit(X, y)
        train_time = time.perf_counter() - start

        first_round = i == 0
        # Only the first round truncates the result file and saves the image.
        extra_args = ({'save_img': True, 'filename': "random_forest.png"}
                      if first_round else {})
        # The context manager closes the handle after every round instead of
        # leaving one open file object behind per iteration.
        with open(result_file_path, 'w' if first_round else 'a') as file_describe:
            round_result = evaluate.calc_all(forest_clf,
                                             X_test,
                                             y_test,
                                             train_time,
                                             file_describe,
                                             result_filename=temp_file_name,
                                             **extra_args)
        if first_round:
            result = round_result
        else:
            # dict_add's return value is unused, so it is expected to update
            # ``result`` in place.
            tools.dict_add(result, round_result)
        tools.add_complete()

    tools.dict_div(result, i_time)
    with open(result_file_path, 'a') as file_describe:
        file_describe.write(json.dumps(result))
        file_describe.write('\n')
    return result
def _tree_cls(method, i_time=10):
    '''
    Train a decision-tree classifier ``i_time`` times for the given method
    and return the averaged evaluation result.

    The tree itself is always a ``DecisionTreeClassifier``; ``method`` only
    selects which configuration section supplies ``criterion``,
    ``max_depth`` and ``min_samples_split``, and which output paths are
    used (``../frontend/data/<method>.json``, ``<method>.png`` and
    ``../output_data/<method>/<method><i>.csv``). The first round truncates
    the JSON result file and asks ``evaluate.calc_all`` to save the image;
    later rounds append. The final result is appended to that file as one
    JSON line.

    Parameters:
        method: one of ``'id3'``, ``'cart'`` or ``'c4_5'``.
        i_time: number of models to train; the per-round results are
            combined with ``tools.dict_add`` and scaled with
            ``tools.dict_div(result, i_time)``.

    Returns:
        The accumulated, JSON-serialisable result object.

    Raises:
        ValueError: if ``method`` is not one of the supported names, or if
            ``i_time`` is smaller than 1 (there would be no result to
            average).
    '''
    if method not in ('id3', 'cart', 'c4_5'):
        raise ValueError('wrong input,just suuport id3,c4_5,cart.')
    if i_time < 1:
        raise ValueError(
            'i_time must be at least 1, got {!r}'.format(i_time))

    one_hot_key = read_config[method]['one_hot_key']
    standard_scaler = read_config[method]['standard_scaler']
    result_file_path = os.path.join(
        file_path, '../frontend/data/{}.json'.format(method))
    img_path = '{}.png'.format(method)
    hyper_paras = get_section(method)

    result = None
    for i in range(i_time):
        tree_clf = DecisionTreeClassifier(
            criterion=hyper_paras['criterion'],
            max_depth=hyper_paras['max_depth'],
            min_samples_split=hyper_paras['min_samples_split'])
        temp_file_name = os.path.join(
            file_path, '../output_data/{}/{}{}.csv'.format(method, method, i))
        X, y, X_test, y_test = read_data.trans_data(
            one_hot_key=one_hot_key,
            standard_scaler=standard_scaler,
            filename=temp_file_name)

        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for measuring elapsed intervals.
        start = time.perf_counter()
        tree_clf.fit(X, y)
        train_time = time.perf_counter() - start

        first_round = i == 0
        # Only the first round truncates the result file and saves the image.
        extra_args = ({'save_img': True, 'filename': img_path}
                      if first_round else {})
        # The context manager closes the handle after every round instead of
        # leaving one open file object behind per iteration.
        with open(result_file_path, 'w' if first_round else 'a') as file_describe:
            round_result = evaluate.calc_all(tree_clf,
                                             X_test,
                                             y_test,
                                             train_time,
                                             file_describe,
                                             result_filename=temp_file_name,
                                             **extra_args)
        if first_round:
            result = round_result
        else:
            # dict_add's return value is unused, so it is expected to update
            # ``result`` in place.
            tools.dict_add(result, round_result)
        tools.add_complete()

    tools.dict_div(result, i_time)
    with open(result_file_path, 'a') as file_describe:
        file_describe.write(json.dumps(result))
        file_describe.write('\n')
    return result