Example #1
def gbdt_lr(para):
    print("gbdt_lr")
    x_train = para[0]
    x_train_lr = para[1]
    x_test = para[2]
    y_train = para[3]
    y_train_lr = para[4]
    y_test = para[5]
    maxleafnodes = 11
    gbc = GBDT(max_leaf_nodes=maxleafnodes - 1,
               n_estimators=600,
               min_samples_leaf=5,
               max_depth=3,
               learning_rate=0.02,
               subsample=0.2,
               max_features=0.1)
    gbc.fit(x_train, y_train)
    # One-hot encode the leaf indices from apply(); for a binary classifier
    # apply() returns a 3-D (n_samples, n_estimators, n_classes) array, so
    # take the single class slice [:, :, 0]
    ohe = OHE()
    ohe.fit(gbc.apply(x_train)[:, :, 0])
    li = gbc.apply(x_train_lr)[:, :, 0]
    x_train_lr_gbc = ohe.transform(li)
    # x_train_lr_gbc = myTransform(li, max_leaf_nodes=maxleafnodes)
    li = gbc.apply(x_test)[:, :, 0]
    x_test_gbc = ohe.transform(li)
    # x_test_gbc = myTransform(li, max_leaf_nodes=maxleafnodes)
    del li
    # Linear model over the encoded leaves (n_iter is the parameter name in
    # older scikit-learn; newer releases call it max_iter)
    lr = sgd(n_iter=50)
    lr.fit(x_train_lr_gbc, y_train_lr)
    yp = lr.predict(x_test_gbc)
    print("GBDT+SGD: " + str(auc(y_test, yp)))
    return (gbc, yp)
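This snippet relies on aliases defined elsewhere in its source file. A plausible import block (an assumption; only the GBDT alias is confirmed, by Example #5 below):

from sklearn.ensemble import GradientBoostingClassifier as GBDT
from sklearn.preprocessing import OneHotEncoder as OHE
from sklearn.linear_model import SGDClassifier as sgd
from sklearn.metrics import roc_auc_score as auc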
Example #2
def GBDT_train(train_data, test_data):
    train_y = train_data[:, 0]
    train_x = train_data[:, 1:]

    # param_test = {'n_estimators': range(50, 1000, 50)}
    gbdt_model = GBDT(learning_rate=0.05,
                      n_estimators=250,
                      max_leaf_nodes=8,
                      min_samples_split=6,
                      max_depth=3)
    # gsearch = GridSearchCV(estimator=gbdt_model, param_grid=param_test, scoring='accuracy', cv=5)
    # gsearch.fit(train_x, train_y)
    # print(gsearch.best_params_, gsearch.best_score_)

    # bagging_gbdt = BaggingClassifier(gbdt_model, max_samples=0.8)
    print("GBDT cross score:")
    print(
        cross_val_score(gbdt_model, train_x, train_y, cv=5,
                        scoring='accuracy'))
    #print(cross_val_score(bagging_gbdt, train_x, train_y, cv=5))
    gbdt_model.fit(train_x, train_y)
    test_y = gbdt_model.predict(test_data)
    return test_y
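A minimal usage sketch for GBDT_train, assuming the GBDT alias and the cross_val_score import from the surrounding file are in scope; make_classification is only a stand-in dataset:

import numpy as np
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
train = np.hstack([y[:150, None], X[:150]])  # label goes in column 0
print(GBDT_train(train, X[150:])[:10])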
Example #3
def gbdt_test(para):
    x_train = para[0]
    x_train_lr = para[1]
    x_test = para[2]
    y_train = para[3]
    y_train_lr = para[4]
    y_test = para[5]
    # Stack both training splits and merge their labels
    xt = vstack([para[0], para[1]])
    yt = merge_y(y_train, y_train_lr)
    # para[0] = 0; para[1] = 0
    clf = GBDT()
    clf.subsample = 0.1
    clf.max_features = 0.05
    clf.min_samples_leaf = 5
    clf.n_estimators = 200
    clf.learning_rate = 0.03
    clf.fit(xt, yt)
    yp_gbdt = clf.predict(para[2].toarray())
    print("GBDT: " + str(auc(y_test, yp_gbdt)))
    return clf
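merge_y and vstack come from elsewhere in the source file. Given that the feature matrices are stacked with a sparse vstack, a plausible stand-in for merge_y (an assumption) simply concatenates the two label arrays:

import numpy as np

def merge_y(y_a, y_b):
    # Concatenate labels in the same order the features were stacked
    return np.concatenate([np.asarray(y_a), np.asarray(y_b)])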
Example #4
from sklearn.ensemble import AdaBoostClassifier as AdaBoost
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier as etc
from sklearn.neighbors import KNeighborsClassifier as knc
from sklearn.neural_network import MLPClassifier as mlp

valid_data = data[3200:].reset_index()

clf_gender = mlp(hidden_layer_sizes=(2, 1), verbose=0, activation='tanh')
clf_gender.fit(f_tfidf[:3200], data.gender[:3200])
valid_data.gender = clf_gender.predict(f_tfidf[3200:])

# clf_age_pre = LR()
# clf_age_pre.fit(f_tfidf[:3200], data.age[:3200])

clf_age = GBDT(n_estimators=300, verbose=1)
clf_age.fit(f_tfidf[:3200], data.age[:3200])
valid_data.age = clf_age.predict(f_tfidf[3200:])

clf_location = GBDT(n_estimators=300, verbose=1)
clf_location.fit(f_tfidf[:3200], data.location[:3200])
valid_data.location = clf_location.predict(f_tfidf[3200:])

# # Output to temp.csv

# In[7]:

valid_data.loc[:, ['id', 'age', 'gender', 'location']].to_csv(
    'result/gender_mlp_2_1_age_gbdt_n_est_300_loc_gbdt_n_est_300.csv',
    index=False)
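f_tfidf and the GBDT alias are defined earlier in the source file. A typical construction for f_tfidf (a sketch under that assumption; the 'text' column name is hypothetical):

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=20000)
f_tfidf = vectorizer.fit_transform(data.text)  # sparse TF-IDF matrix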
Example #5
from sklearn.ensemble import GradientBoostingClassifier as GBDT
import pickle
import numpy as np

if __name__ == "__main__":
    with open("VGG16_feature_dataset.pkl", "rb") as f:
        p = pickle.load(f)
    train_x = p["train_conv_feature"]
    test_x = p["test_conv_feature"]
    train_y = np.asarray(np.argmax(p["train_label"], axis=1), dtype=np.float32)
    test_y = np.asarray(np.argmax(p["test_label"], axis=1), dtype=np.float32)
    clf = GBDT()
    print(train_x.shape, train_y.shape)
    clf.fit(train_x, train_y)
    print(np.mean(test_y == clf.predict(test_x)))
Example #6
File: svm.py Project: caoyw/FAN
    print("精准率:", precision_score(y_test, y_pred))
    print("召回率:", recall_score(y_test, y_pred))
    print("F1:", f1_score(y_test, y_pred))
    print("ROC:", roc_auc_score(y_test, y_pred))
    confusion_mat = confusion_matrix(y_test, y_pred)
    endtime = time.time()
    totaltime = endtime - starttime
    print("XGB的时间:", totaltime)
    print("   ")

    # ax = model.plot_tree(model, tree_index=1, figsize=(20, 8), show_info=['split_gain'])
    # plt.show()


    starttime = time.time()
    clf = GBDT(n_estimators=100)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("GBDT accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1:", f1_score(y_test, y_pred))
    print("ROC:", roc_auc_score(y_test, y_pred))
    endtime = time.time()
    totaltime = endtime - starttime
    print("GBDT time:", totaltime)
    print("   ")

    # Plot ROC
    # false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
    # roc_auc = auc(false_positive_rate, true_positive_rate)
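Imports this fragment appears to assume (an assumption; the top of svm.py is not shown here):

import time
from sklearn.ensemble import GradientBoostingClassifier as GBDT
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix)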
Example #7
    assert (len(y_gold) == len(y_pred))

    ap = []
    for gold, pred in zip(y_gold, y_pred):
        pred = pred[:len(gold)]  # keep only the top-len(gold) predictions
        if len(gold) > 0:
            ap.append(average_precision(pred, gold))

    precision = sum(ap) / len(ap)
    return precision


classifiers = {
    'svm': svm.SVC(kernel='linear', probability=True),
    'rf': RF(n_estimators=200, n_jobs=5),
    'gbdt': GBDT()
}

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('train', help='training data file')
    parser.add_argument('test', help='test data file')
    parser.add_argument(
        'multilabel',
        help='whether this is a multilabel classification problem')
    parser.add_argument('model', help='specify the classifier to use')
    options = parser.parse_args()
    print(options)
    if options.model not in classifiers:
        print('Invalid model:', options.model)
        print('Available models:')
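average_precision is defined elsewhere in the source file. A standard implementation of average precision over a ranked prediction list, sketched here as an assumption:

def average_precision(pred, gold):
    # Mean of precision@k at each rank k where a relevant item appears
    gold = set(gold)
    hits, score = 0, 0.0
    for k, item in enumerate(pred, start=1):
        if item in gold:
            hits += 1
            score += hits / k
    return score / len(gold) if gold else 0.0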
Example #8
def fit_model(features, sumstats, train_genes, test_genes, model='logit'):
    """
    Fit classifier to train_genes and calculate RMSE on test_genes
    """

    all_genes = train_genes + test_genes

    # Join sumstats with features for logistic regression, subset to
    # genes of interest, and drop genes with NaN BFDPs
    full_df = sumstats.merge(features,
                             how='left',
                             left_index=True,
                             right_index=True)
    full_df = full_df.loc[full_df.index.isin(all_genes), :].dropna()
    train_df = full_df.loc[full_df.index.isin(train_genes), :].\
                   drop(labels='chrom', axis=1)
    test_df = full_df.loc[full_df.index.isin(test_genes), :].\
                  drop(labels='chrom', axis=1)

    # Instantiate classifier dependent on model
    if model == 'logit':
        grid_params = {
            'C': [10**x for x in range(-2, 3, 1)],
            'l1_ratio': [x / 10 for x in range(0, 11, 1)]
        }
        base_class = logit(solver='saga', penalty='elasticnet')
    elif model == 'svm':
        grid_params = {'C': [10**x for x in range(-2, 2, 1)]}
        base_class = SVC(random_state=0,
                         probability=True,
                         break_ties=True,
                         kernel='rbf')
    elif model == 'randomforest':
        grid_params = {
            'n_estimators': [50, 100, 500],
            'criterion': ['gini', 'entropy']
        }
        base_class = RFC(random_state=0, bootstrap=True, oob_score=True)
    elif model == 'lda':
        grid_params = {
            'shrinkage': [None, 0, 0.5, 1, 'auto'],
            'solver': ['svd', 'lsqr', 'eigen']
        }
        base_class = LDAC()
    elif model == 'naivebayes':
        grid_params = {'var_smoothing': [10**x for x in range(-4, -11, -1)]}
        base_class = GNBC()
    elif model == 'neuralnet':
        grid_params = {
            'hidden_layer_sizes': [(10, 5, 2), (20, 10, 5), (20, 10, 5, 2),
                                   (50, 20, 10), (50, 20, 10, 5),
                                   (50, 20, 10, 5, 2)],
            'alpha': [10**x for x in range(-4, 5, 1)]
        }
        base_class = MLPC(activation='relu',
                          solver='adam',
                          early_stopping=True,
                          random_state=0)
    elif model == 'gbdt':
        grid_params = {'n_estimators': [50, 100], 'subsample': [0.5, 1]}
        base_class = GBDT(random_state=0)
    elif model == 'knn':
        grid_params = {
            'n_neighbors': [10, 50, 100, 500],
            'weights': ['uniform', 'distance'],
            'leaf_size': [5, 10, 25, 50, 100]
        }
        base_class = KNN()

    # Learn best parameters for classifier using cross-validated grid search
    classifier = GridSearchCV(base_class, grid_params, verbose=1, n_jobs=-1)

    # Fit sklearn model & predict on test set
    # (Models parameterized by grid search need to be treated separately)
    if isinstance(classifier, GridSearchCV):
        fitted_model = classifier.fit(train_df.drop(labels='bfdp', axis=1),
                                      np.round(train_df.bfdp)).best_estimator_
    else:
        fitted_model = classifier.fit(train_df.drop(labels='bfdp', axis=1),
                                      np.round(train_df.bfdp))
    test_bfdps = pd.Series(fitted_model.predict_proba(
        test_df.drop(labels='bfdp', axis=1))[:, 1],
                           name='pred',
                           index=test_df.index)

    # Compute RMSE of bfdps for test set
    test_vals = test_df.merge(test_bfdps, left_index=True, right_index=True).\
                    loc[:, 'bfdp pred'.split()]
    test_rmse = rmse(test_vals.to_records(index=False))

    return fitted_model, test_rmse
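rmse is imported from elsewhere in the project. Since it is called on records of (bfdp, pred) pairs, a plausible stand-in (an assumption) is:

import numpy as np

def rmse(pairs):
    # pairs: iterable of (true_value, predicted_value) records
    arr = np.asarray([(a, b) for a, b in pairs], dtype=float)
    return float(np.sqrt(np.mean((arr[:, 0] - arr[:, 1]) ** 2)))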
Example #9
def new_gbdt():
    args = {"n_estimators": 400,
            "max_depth": 10,
            "max_features": "sqrt",
            }
    return GBDT(**args)
Example #10
def final_sim_pred(sqlContext,
                   database_name,
                   windowx=3,
                   kfold_use=False,
                   version='v1'):
    """v2.0版本,加入 unrichness特征并使用模型预测"""
    mix_model_data = sqlContext.sql(
        'select * from {0}.synonyms_mix_model_data'.format(
            database_name)).toPandas()
    label_data = sqlContext.sql(
        'select * from {0}.synonyms_label_sample'.format(
            database_name)).toPandas()
    #
    ## 1) join features and labels
    df0 = pds.merge(mix_model_data,
                    label_data,
                    how='left',
                    on=['target_word', 'sim_word'])
    df0.set_index(['target_word', 'sim_word'], inplace=True)
    feature_col = df0.columns[0:-1].tolist()
    label_col = df0.columns[-1]
    #df0.head(100)
    #
    ## 2) rule-based fusion result: rule_pred
    rel_max_n_thr = df0.tf_max.describe(
        percentiles=[0.95])['95%'] / 5 * 0.1  # term-frequency threshold
    df0['rule_pred'] = df0.apply(lambda x: cuple_wd_is_sim(
        x=x, unrelated_check=True, rel_max_n=rel_max_n_thr, windowx=windowx),
                                 axis=1)
    #
    ## 3) model-based fusion prediction: mdl_pred
    if version == 'v2':
        ### split into train & test data
        iloc_index = split_data_random_by_index(
            dfx=df0.loc[df0.is_sim.notna(), :].copy(),
            part_num=2,
            split_type='weight',
            weights=[0.7, 0.3])
        train_data = df0.loc[df0.is_sim.notna(), :].iloc[
            iloc_index[0], :][[label_col] + feature_col].copy()  # label in first column
        test_data = df0.loc[df0.is_sim.notna(), :].iloc[
            iloc_index[1], :][[label_col] + feature_col].copy()  # label in first column
        ### single model
        mdl = GBDT(learning_rate=0.1, n_estimators=50, max_depth=3)  # 87%, 88%
        mdl.fit(train_data[feature_col], train_data[label_col])
        df0['mdl_pred'] = mdl.predict(df0[feature_col])
        ### multi-model k-fold stacking -- not recommended when labeled samples are scarce
        # kfold_use=False  # default: not run
        if kfold_use:
            k = 4
            model_classes = [(LR, {
                'penalty': 'l2'
            }),
                             (GBDT, {
                                 'learning_rate': 0.1,
                                 'n_estimators': 100,
                                 'max_depth': 6
                             })]
            model_stacking_weight = [0.7, 0.7]
            confidence = 0.05
            uniform_voting = False
            kfold_train_pred, kfold_test_pred, iloc_index = k_fold_cross_fit(
                train_data=train_data,
                test_data=df0[[label_col] + feature_col],
                model_classes=model_classes,
                model_stacking_weight=model_stacking_weight,
                confidence=confidence,
                k=k,
                stacking_type='DT',
                uniform_voting=uniform_voting)
            # .iloc does not accept column labels, so create the column first
            # and assign by integer position
            df0['kfold_pred'] = ''
            df0.iloc[iloc_index, df0.columns.get_loc('kfold_pred')] = kfold_test_pred
        else:
            df0['kfold_pred'] = ''
    else:
        df0['mdl_pred'] = ''
        df0['kfold_pred'] = ''
    #
    # 4) storage
    try:
        df0['mdl_pred'] = df0.mdl_pred.astype('int')
    except (ValueError, TypeError):
        pass  # leave as empty strings when the model branch did not run
    try:
        df0['kfold_pred'] = df0.kfold_pred.astype('int')
    except (ValueError, TypeError):
        pass  # leave as empty strings when k-fold stacking did not run
    sim_recog = sqlContext.createDataFrame(df0.reset_index())
    sqlContext.sql('drop table if exists {0}.word_semantic_similarity'.format(
        database_name))
    sim_recog.write.saveAsTable(
        '{0}.word_semantic_similarity'.format(database_name), mode='overwrite')
Example #11
def get_model(model_name, feature):
    clf = " "
    if model_name == "lr":
        if feature == "word":
            clf = LogisticRegression(penalty='l2',
                                     dual=True,
                                     fit_intercept=True,
                                     C=1,
                                     tol=0.0001,
                                     class_weight=None,
                                     random_state=None,
                                     intercept_scaling=0.1)
        elif feature == "length":
            clf = LogisticRegression(penalty='l2',
                                     dual=True,
                                     fit_intercept=True,
                                     C=0.09,
                                     tol=0.0001,
                                     class_weight=None,
                                     random_state=None,
                                     intercept_scaling=0.1)
        elif feature == "struct":
            clf = LogisticRegression(penalty='l2',
                                     dual=True,
                                     fit_intercept=True,
                                     C=2,
                                     tol=0.0001,
                                     class_weight=None,
                                     random_state=None,
                                     intercept_scaling=0.1)
        elif feature == "lsa":
            clf = LogisticRegression(penalty='l2',
                                     dual=True,
                                     fit_intercept=True,
                                     C=2,
                                     tol=0.0001,
                                     class_weight=None,
                                     random_state=None,
                                     intercept_scaling=0.1)
        else:
            sp = feature.split(',')
            if set(sp) == set(["word", "length", "struct"]):
                clf = LogisticRegression(penalty='l2',
                                         dual=True,
                                         fit_intercept=True,
                                         C=1,
                                         tol=0.0001,
                                         class_weight=None,
                                         random_state=None,
                                         intercept_scaling=0.2)
            elif set(sp) == set(["word", "length", "lsa"]):
                clf = LogisticRegression(penalty='l2',
                                         dual=True,
                                         fit_intercept=True,
                                         C=0.8,
                                         tol=0.0001,
                                         class_weight=None,
                                         random_state=None,
                                         intercept_scaling=0.2)
            elif set(sp) == set(["struct", "length", "lsa"]):
                clf = LogisticRegression(penalty='l2',
                                         dual=True,
                                         fit_intercept=True,
                                         C=2,
                                         tol=0.0001,
                                         class_weight=None,
                                         random_state=None,
                                         intercept_scaling=0.3)
            elif set(sp) == set(["struct", "length", "lsa", "word"]):
                clf = LogisticRegression(penalty='l2',
                                         dual=False,
                                         fit_intercept=True,
                                         C=3,
                                         tol=0.0001,
                                         class_weight=None,
                                         random_state=None,
                                         intercept_scaling=2)

            elif "word" in sp and "length" in sp:
                clf = LogisticRegression(penalty='l2',
                                         dual=True,
                                         fit_intercept=True,
                                         C=0.2,
                                         tol=0.0001,
                                         class_weight=None,
                                         random_state=None,
                                         intercept_scaling=0.2)
            elif "word" in sp and "struct" in sp:
                clf = LogisticRegression(penalty='l2',
                                         dual=True,
                                         fit_intercept=True,
                                         C=5,
                                         tol=0.0001,
                                         class_weight=None,
                                         random_state=None,
                                         intercept_scaling=0.2)
            elif "word" in sp and "lsa" in sp:
                clf = LogisticRegression(penalty='l2',
                                         dual=True,
                                         fit_intercept=True,
                                         C=2,
                                         tol=0.0001,
                                         class_weight=None,
                                         random_state=None,
                                         intercept_scaling=0.2)
            elif "length" in sp and "struct" in sp:
                clf = LogisticRegression(penalty='l2',
                                         dual=True,
                                         fit_intercept=True,
                                         C=0.08,
                                         tol=0.0001,
                                         class_weight=None,
                                         random_state=None,
                                         intercept_scaling=0.2)
            elif "length" in sp and "lsa" in sp:
                clf = LogisticRegression(penalty='l2',
                                         dual=True,
                                         fit_intercept=True,
                                         C=0.3,
                                         tol=0.0001,
                                         class_weight=None,
                                         random_state=None,
                                         intercept_scaling=0.2)
            elif "struct" in sp and "lsa" in sp:
                clf = LogisticRegression(penalty='l2',
                                         dual=True,
                                         fit_intercept=True,
                                         C=2.5,
                                         tol=0.0001,
                                         class_weight=None,
                                         random_state=None,
                                         intercept_scaling=0.2)
            else:
                clf = LogisticRegression(penalty='l2',
                                         dual=True,
                                         fit_intercept=True,
                                         C=0.09,
                                         tol=0.0001,
                                         class_weight=None,
                                         random_state=None,
                                         intercept_scaling=0.1)

    elif model_name == "nb":
        clf = NB()
    elif model_name == "knn":
        if feature == "lsa":
            clf = KNN(n_neighbors=60)
        else:
            clf = KNN(n_neighbors=120)

    elif model_name == "rf":
        clf = RF(n_estimators=1000,
                 max_features="auto",
                 max_depth=8,
                 min_samples_split=10,
                 min_samples_leaf=2)

    elif model_name == "gbdt":
        clf = GBDT(n_estimators=400,
                   max_features="auto",
                   max_depth=8,
                   min_samples_split=10,
                   min_samples_leaf=2)

    elif model_name == "svm":
        if feature == "word" or feature == "length":
            clf = svm.SVC(C=0.8, kernel='rbf', gamma=0.08)
        elif feature == "structure":
            clf = svm.SVC(C=0.1, kernel='rbf', gamma=0.08)
        else:
            sp = feature.split(',')
            if "struct" in sp and "lsa" in sp:
                clf = svm.SVC(C=0.9, kernel='rbf', gamma=0.08)
            else:
                clf = svm.SVC(C=3, kernel='rbf', gamma=0.08)
    else:
        print("model_name must be one of: lr, nb, knn, rf, gbdt, svm")
        sys.exit(1)
    return clf
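A minimal usage sketch, assuming the NB alias used above (e.g. GaussianNB) is in scope; the feature argument only affects the lr/knn/svm branches:

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=300, n_features=20, random_state=0)
clf = get_model("nb", "word")
clf.fit(X, y)
print(clf.score(X, y))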
Example #12
model_tree = DecisionTreeClassifier(random_state=0)  # decision-tree classifier
selector_4 = feature_selection.SelectFromModel(model_tree)
sel_features4 = selector_4.fit_transform(x, y)  # fit and transform the data
print(sel_features4.shape)  # print the shape
print(sel_features4[:3])  # print the first 3 records

# Dimensionality reduction with sklearn's LDA
model_lda = LDA()  # create the LDA model object
model_lda.fit(x, y)  # fit the model on the dataset
convert_features = model_lda.transform(x)  # transform the data
print(convert_features.shape)  # print the shape
print(model_lda.explained_variance_ratio_)  # explained variance ratio per component
print(convert_features[:3])  # print the first 3 records

# Feature combination with sklearn's GBDT: use each sample's leaf index
# in every tree as a new categorical feature
model_gbdt = GBDT()
model_gbdt.fit(x, y)
combine_features = model_gbdt.apply(x)[:, :, 0]
print(combine_features.shape)  # print the shape
print(combine_features[0])  # print the first record
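# The leaf indices above are categorical, so to feed them to a linear model
# (as Example #1 does) they should be one-hot encoded. A minimal sketch:
from sklearn.preprocessing import OneHotEncoder

leaf_onehot = OneHotEncoder().fit_transform(combine_features)
print(leaf_onehot.shape)  # one binary column per (tree, leaf) pair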

# Feature combination with sklearn's PolynomialFeatures
model_plf = plf(2)
plf_features = model_plf.fit_transform(x)
print(plf_features.shape)  # print the shape
print(plf_features[0])  # print the first record

# Feature combination with gplearn's genetic method
data = datasets.load_boston()  # load dataset (note: removed in scikit-learn 1.2)
x, y = data.data, data.target  # split into x and y
print(x.shape)  # check the shape of x
Example #13
def new_gbdt(k):
    args = {
        "n_estimators": k,
    }
    return GBDT(**args)