Example #1
def model_stack2():
    _, test_df, train_label = data_process.get_person_data()
    train_data, test_data = data_process.get_scale_data()
    X_train, X_val, y_train, y_val = train_test_split(train_data,
                                                      train_label,
                                                      test_size=0.2,
                                                      random_state=66)
    id_list = list(test_df.pop('ID'))
    model1 = gbt.XGBRegressor(n_estimators=1000,
                              subsample=0.8,
                              learning_rate=0.25,
                              objective='reg:squarederror')  # 'reg:linear' is a deprecated alias
    model2 = gbt.XGBRegressor(n_estimators=1000,
                              subsample=0.8,
                              learning_rate=0.25,
                              objective='reg:gamma')
    model3 = gbt.XGBRegressor(n_estimators=1000,
                              subsample=0.8,
                              learning_rate=0.25,
                              objective='reg:tweedie')
    model4 = svm.SVR()
    # All the base models are regressors, so stack them with mlxtend's
    # StackingRegressor; StackingClassifier expects class labels.
    stack_model = StackingRegressor(
        regressors=[model1, model2, model3, model4], meta_regressor=model3)
    stack_model.fit(train_data, train_label)
    yHat = stack_model.predict(test_data)
    result = pd.DataFrame({'id': id_list, 'yhat': yHat})
    result.to_csv('result/result6.csv',
                  index=False,
                  header=None,
                  encoding='utf-8')
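Example #1 stacks regressors, so the fixed code above uses mlxtend's regressor-side API. As a reference for the two parallel APIs, here is a minimal sketch (the class and parameter names are real mlxtend API; the base models are arbitrary placeholders):

from mlxtend.classifier import StackingClassifier  # classifiers=..., meta_classifier=...
from mlxtend.regressor import StackingRegressor    # regressors=...,  meta_regressor=...
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC, SVR

# Regression stack: continuous targets, regressor base models.
reg_stack = StackingRegressor(regressors=[SVR()],
                              meta_regressor=LinearRegression())

# Classification stack: class-label targets, classifier base models.
clf_stack = StackingClassifier(classifiers=[SVC()],
                               meta_classifier=LogisticRegression())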
Example #2

    @classmethod
    def run(cls) -> StackingClassifier:
        """
        Run a Stacking Classifier using all registered models
        """
        sc = cls()
        X, y = sc.load_train()

        # Define the StackingClassifier using all models registered.
        classifiers = [Model() for Model in sc._models if Model.__name__ != 'DumbModel']

        clf = StackingClassifier(classifiers=classifiers,
                                 meta_classifier=LogisticRegression(),
                                 verbose=1,
                                 average_probas=False,
                                 use_probas=True
                                 )

        # Run cross-val to get an idea of what to expect for final output
        #scores = cross_val_score(clf, X.copy(), y.copy(), scoring='neg_log_loss', cv=2)

        #print('\n---------\nCross validation (3) --> StackingClassifier - Avg Log Loss: {:.8f} - STD: {:.4f}\n---------'
        #      .format(scores.mean(), scores.std())
        #      )

        # Finally, refit clf to entire dataset
        print('Fitting Stacking Classifier to entire training dataset...')
        clf.fit(X.copy(), y.copy())
        return clf
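The run() classmethod above assumes a registry class that exposes _models and load_train(); those names come from the snippet itself, while everything below is a hypothetical sketch of that scaffolding, just enough context to make the method readable:

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

class StackRunner:
    # Hypothetical registry of model classes; run() filters out 'DumbModel'.
    _models = [RandomForestClassifier, GaussianNB]

    def load_train(self):
        # Hypothetical loader returning a feature matrix and labels.
        X = np.random.rand(100, 4)
        y = np.random.randint(0, 2, size=100)
        return X, y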
Example #3
class ClassifierBlender:
    def __init__(self, x_train, x_test, y_train, y_test=None):
        x_train.drop(['Unnamed: 0', 'Id'], axis=1, inplace=True)
        x_test.drop(['Unnamed: 0', 'Id'], axis=1, inplace=True)
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train['y'].values
        if y_test is not None:  # the original tested self.y_train by mistake
            self.y_test = y_test['y'].values

    def clf_blend(self):
        meta_clf = LinearRegression()
        clf1 = model.svm_regressor()
        clf2 = model.randomforest_regressor()
        clf3 = model.xgb_regressor()
        # Regressor base models and a regression meta-model, so use
        # mlxtend's StackingRegressor rather than StackingClassifier.
        self.blend = StackingRegressor(regressors=[clf1, clf2, clf3],
                                       meta_regressor=meta_clf)
        self.blend.fit(self.x_train, self.y_train)
        return self.blend

    def score(self):
        scores = cross_val_score(self.blend,
                                 X=self.x_train,
                                 y=self.y_train,
                                 cv=10,
                                 verbose=2)
        return scores

    def prediction(self):
        y_pred = self.blend.predict(self.x_test)
        return y_pred
Example #4
def stacking2():
    from sklearn.datasets import load_iris
    from mlxtend.classifier import StackingClassifier
    from mlxtend.feature_selection import ColumnSelector
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import LogisticRegression
    from sklearn import model_selection

    iris = load_iris()
    X = iris.data
    y = iris.target

    pipe1 = make_pipeline(ColumnSelector(cols=(0, 2)), LogisticRegression())
    pipe2 = make_pipeline(ColumnSelector(cols=(1, 2, 3)), LogisticRegression())
    sclf = StackingClassifier(classifiers=[pipe1, pipe2],
                              meta_classifier=LogisticRegression(),
                              use_features_in_secondary=True,
                              store_train_meta_features=True)
    sclf.fit(X, y)
    scores = model_selection.cross_val_score(sclf,
                                             X,
                                             y,
                                             cv=5,
                                             scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
Example #5
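The test functions in this and several later examples come from mlxtend's own test suite and rely on module-level fixtures that the excerpts do not show. A minimal sketch of the assumed setup follows (the imports and X, y, iris follow directly from how the tests use them; the exact y2 definition is an assumption):

import random
import numpy as np
import pytest
from scipy import sparse
from numpy.testing import assert_almost_equal
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.classifier import StackingClassifier
from mlxtend.data import iris_data

X, y = iris_data()            # 150 samples, 4 features, 3 classes
iris = load_iris()            # used by the test_verbose variants
y2 = np.column_stack((y, y))  # 2-column target for test_multivariate_class (assumption)

With use_probas=True, each base classifier contributes one probability column per class: averaging the two classifiers keeps 3 columns whose rows sum to 1, while concatenating yields 2 x 3 = 6 columns whose rows sum to 2, which is exactly what the asserts below check.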
def test_StackingClassifier_avg_vs_concat():
    np.random.seed(123)
    lr1 = LogisticRegression()
    sclf1 = StackingClassifier(classifiers=[lr1, lr1],
                               use_probas=True,
                               average_probas=True,
                               meta_classifier=lr1)

    sclf1.fit(X, y)
    r1 = sclf1.predict_meta_features(X[:2])
    assert r1.shape == (2, 3)
    assert_almost_equal(np.sum(r1[0]), 1.0, decimal=6)
    assert_almost_equal(np.sum(r1[1]), 1.0, decimal=6)

    sclf2 = StackingClassifier(classifiers=[lr1, lr1],
                               use_probas=True,
                               average_probas=False,
                               meta_classifier=lr1)

    sclf2.fit(X, y)
    r2 = sclf2.predict_meta_features(X[:2])
    assert r2.shape == (2, 6)
    assert_almost_equal(np.sum(r2[0]), 2.0, decimal=6)
    assert_almost_equal(np.sum(r2[1]), 2.0, decimal=6)
    assert np.array_equal(r2[0][:3], r2[0][3:])
Example #6
def stacking3Model(model1, model2, metamodel, xtr, ytr, xts, yts):
    model = StackingClassifier(classifiers=[model1, model2], meta_classifier=metamodel)
    train, test = scaling(xtr, xts, MaxAbsScaler())
    model.fit(train, ytr)
    predict = model.predict(test)
    acc = accuracy_score(yts, predict)
    return acc, predict
Example #7

def create_stacked(dataset, x_train, y_train):
    for i, y in enumerate(dataset.y_true):
        dataset.y_true[i] = dataset.class_labels.index(y)

    for i, y in enumerate(y_train):
        y_train[i] = dataset.class_labels.index(y)
    dataset.class_labels = range(0, len(dataset.class_labels))

    clf1 = RandomForestClassifier(n_estimators=1000,
                                  n_jobs=-1,
                                  random_state=42)
    clf2 = KNeighborsClassifier(n_neighbors=10)
    clf3 = GaussianNB()
    clf4 = MLPClassifier(activation='relu',
                         max_iter=100000,
                         hidden_layer_sizes=(50, 50, 50, 50, 50))
    clf5 = MLPClassifier(activation='relu',
                         max_iter=1000000,
                         hidden_layer_sizes=(500, 500))
    clf6 = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
    clf_meta = LogisticRegression()
    clf = StackingClassifier(classifiers=[clf1, clf2, clf3, clf4, clf5, clf6],
                             meta_classifier=clf_meta,
                             use_probas=True)

    clf.fit(x_train, y_train)

    return clf
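The index-mapping loops at the top of create_stacked() re-implement label encoding by hand. For reference, a sketch of the equivalent using scikit-learn's standard LabelEncoder (not part of the original code):

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)    # class labels -> 0..n_classes-1
y_true_encoded = le.transform(dataset.y_true)  # same mapping applied to y_true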
Example #8

def kfold_train(mode):
    acc_list, loss_list = [], []
    prediction = np.zeros((x_test.shape[0], 4))
    for i in range(10):
        print(str(i + 1) + ' th kfold ' + '*' * 50)
        kf = KFold(n_splits=5, shuffle=True, random_state=i)
        kfold_list = []
        for k, (train_index, test_index) in enumerate(kf.split(x_train)):
            print(str(k + 1) + 'fold--------------')
            train_x, train_y = x_train[train_index], labels[train_index]
            test_x, test_y = x_train[test_index], labels[test_index]
            # train
            if mode == 'cat':
                model = catboost_model()
                model.fit(
                    train_x,
                    np.argmax(train_y, 1),
                    eval_set=(test_x, np.argmax(test_y, 1)),
                    #early_stopping_rounds=1000, verbose=False
                )
                #print(pd.DataFrame({'column': features, 'importance': model.feature_importances_}).sort_values(by='importance'))
            if mode == 'lgb':
                model = lightgbm_model()
                model.fit(
                    train_x,
                    np.argmax(train_y, 1),
                    eval_set=(test_x, np.argmax(test_y, 1)),
                    # early_stopping_rounds=50, verbose=True
                    verbose=False)
            if mode == 'xgb':
                model = xgboost_model()
                model.fit(train_x, np.argmax(train_y, 1), verbose=True)
            if mode == 'stack':
                model = StackingClassifier(classifiers=[
                    xgboost_model(),
                    catboost_model(),
                    lightgbm_model(),
                    adaboost_model()
                ],
                                           use_probas=True,
                                           average_probas=False,
                                           meta_classifier=lr)  # lr: meta-learner assumed defined globally

                model.fit(train_x, np.argmax(train_y, 1))
            # test
            pred = model.predict_proba(test_x)
            acc = accuracy_score(np.argmax(test_y, 1), np.argmax(pred, 1))
            loss = log_loss(test_y, pred)
            acc_list.append(acc)
            loss_list.append(loss)
            kfold_list.append(loss)
            print('test acc: %f, test loss: %f' % (acc, loss))
            # predict
            prediction += model.predict_proba(x_test)
        print('this fold mean loss:', np.mean(kfold_list))
    print('*' * 50)
    print('mean acc: %f, mean loss: %f' %
          (np.mean(acc_list), np.mean(loss_list)))
    prediction = prediction / 50.  # average over 10 repeats x 5 folds = 50 models
    return prediction
Example #9

def test_StackingClassifier_avg_vs_concat():
    np.random.seed(123)
    lr1 = LogisticRegression(solver='liblinear', multi_class='ovr')
    sclf1 = StackingClassifier(classifiers=[lr1, lr1],
                               use_probas=True,
                               average_probas=True,
                               meta_classifier=lr1)

    sclf1.fit(X, y)
    r1 = sclf1.predict_meta_features(X[:2])
    assert r1.shape == (2, 3)
    assert_almost_equal(np.sum(r1[0]), 1.0, decimal=6)
    assert_almost_equal(np.sum(r1[1]), 1.0, decimal=6)

    sclf2 = StackingClassifier(classifiers=[lr1, lr1],
                               use_probas=True,
                               average_probas=False,
                               meta_classifier=lr1)

    sclf2.fit(X, y)
    r2 = sclf2.predict_meta_features(X[:2])
    assert r2.shape == (2, 6)
    assert_almost_equal(np.sum(r2[0]), 2.0, decimal=6)
    assert_almost_equal(np.sum(r2[1]), 2.0, decimal=6)
    assert np.array_equal(r2[0][:3], r2[0][3:])
Example #10

def test_sample_weight():
    # Make sure that:
    #    prediction with weight
    # != prediction with no weight
    # == prediction with weight ones
    random.seed(87)
    w = np.array([random.random() for _ in range(len(y))])

    np.random.seed(123)
    meta = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf1 = RandomForestClassifier(n_estimators=10)
    clf2 = GaussianNB()
    sclf = StackingClassifier(classifiers=[clf1, clf2], meta_classifier=meta)
    prob1 = sclf.fit(X, y, sample_weight=w).predict_proba(X)

    np.random.seed(123)
    meta = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf1 = RandomForestClassifier(n_estimators=10)
    clf2 = GaussianNB()
    sclf = StackingClassifier(classifiers=[clf1, clf2], meta_classifier=meta)
    prob2 = sclf.fit(X, y, sample_weight=None).predict_proba(X)

    maxdiff = np.max(np.abs(prob1 - prob2))
    assert maxdiff > 1e-3, "max diff is %.4f" % maxdiff

    np.random.seed(123)
    meta = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf1 = RandomForestClassifier(n_estimators=10)
    clf2 = GaussianNB()
    sclf = StackingClassifier(classifiers=[clf1, clf2], meta_classifier=meta)
    prob3 = sclf.fit(X, y, sample_weight=np.ones(len(y))).predict_proba(X)

    maxdiff = np.max(np.abs(prob2 - prob3))
    assert maxdiff < 1e-3, "max diff is %.4f" % maxdiff
Example #11
File: main.py  Project: JolosJoestar/ml_01
def stacking_clf(train_x, train_y):
    clf1 = RandomForestClassifier(n_estimators=300,
                                  max_features="sqrt",
                                  min_samples_split=20,
                                  min_samples_leaf=15,
                                  max_depth=6,
                                  bootstrap=True,
                                  n_jobs=8)
    clf2 = svm.SVC(C=10)
    clf3 = xgb.XGBClassifier(n_estimators=300,
                             learning_rate=0.1,
                             n_jobs=8,
                             object="multi:softmax",
                             colsample_bylevel=0.8,
                             reg_lambda=1,
                             max_depth=6,
                             min_child_weight=1)

    clf4 = GradientBoostingClassifier(n_estimators=300,
                                      learning_rate=0.1,
                                      min_samples_split=20,
                                      min_samples_leaf=15,
                                      max_depth=6,
                                      max_features="sqrt")

    clf5 = LogisticRegression(penalty='l2', C=100, multi_class='ovr')

    sclf = StackingClassifier(
        classifiers=[clf1, clf3, clf4],
        meta_classifier=clf5,
    )
    sclf.fit(train_x, train_y)
    return sclf
Example #12
def stacking_prediction2(m1, m2, meta):
    # model_train, model_test = stacking(clf, Xtrain2, ytrain2, Xtest2)
    # model.fit(model_train, ytrain2)
    tr, ts = scaling(Xtrain2, Xtest2, MaxAbsScaler())
    m = StackingClassifier(classifiers=[m1, m2], meta_classifier=meta)
    m.fit(tr, ytrain2)
    predict_mm = m.predict(ts)
    return predict_mm
Example #13
File: model.py  Project: zhuNanyang/sensor
    def stacking(self):
        train_data, test_data = self.Extract_feature.extract_count()
        from sklearn.svm import SVC  # the code below uses SVC, not SVR
        from sklearn.pipeline import make_pipeline
        from sklearn.preprocessing import RobustScaler, MinMaxScaler
        from sklearn.preprocessing import StandardScaler
        from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
        from xgboost import XGBClassifier
        import lightgbm as lgb
        from lightgbm import LGBMClassifier
        import xgboost as xgb
        from mlxtend.classifier import StackingClassifier

        import scipy as sc
        from sklearn import model_selection

        svc = make_pipeline(SVC(C=2.1, gamma=0.005))  # renamed from the misleading 'lasso'
        rforest = make_pipeline(
            RandomForestClassifier(random_state=0, n_estimators=6))
        Gboost = GradientBoostingClassifier(n_estimators=500,
                                            learning_rate=0.01,
                                            max_depth=12,
                                            max_features="sqrt",
                                            min_samples_leaf=15,
                                            min_samples_split=97,
                                            random_state=200)
        model_xgb = xgb.XGBClassifier(colsample_bytree=0.4603,
                                      gamma=10,
                                      learning_rate=0.01,
                                      max_depth=11,
                                      n_estimators=500,
                                      reg_alpha=0.01,
                                      reg_lambda=5,
                                      subsample=0.5213,
                                      seed=1024,
                                      nthread=-1)

        lr = LogisticRegression()
        classifiers = [rforest, svc, Gboost, model_xgb, lr]
        stregr = StackingClassifier(classifiers=classifiers,
                                    meta_classifier=lr)
        stregr.fit(train_data, self.train_label)

        prediction = stregr.predict(test_data)
        classification = classification_report(y_true=self.test_label,
                                               y_pred=prediction)
        print("classification:{}".format(classification))
        print("测试集的score:{}".format(stregr.score(test_data, self.test_label)))
        for clf, label in zip(
            [rforest, svc, Gboost, lr, model_xgb, stregr],
            ['rf', 'svc', 'gboost', 'lr', 'xgb', 'stackingclassifier']):
            scores = model_selection.cross_val_score(clf,
                                                     train_data,
                                                     self.train_label,
                                                     cv=3,
                                                     scoring='accuracy')
            print("Accuracy: %0.2f (+/- %0.2f) [%s]" %
                  (scores.mean(), scores.std(), label))
Example #14
def test_weight_unsupported_no_weight():
    # This is okay since we do not pass sample weight
    meta = LogisticRegression()
    clf1 = RandomForestClassifier()
    clf2 = GaussianNB()
    clf3 = KNeighborsClassifier()
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                              meta_classifier=meta)
    sclf.fit(X, y)
Example #15
def data_ensemble(cancer_type,feat):
	data_dir = "/home/ubuntu/cancer/"
	data_file = data_dir + cancer_type + "_matrix.csv"
	features = data_dir + cancer_type + "_output.txt"
	output_file = data_dir + cancer_type + "_accuracy.txt"
	file = open(features, "r")
	o_file = open(output_file, "w")
	line = file.readline()
	line = file.readline()
	df = pd.read_csv(data_file)
	df = shuffle(df)
	file_ids=df.pop('file_id')
	y = df.pop('label').values
	dataf=df.pop(line[:-1])
	#dataframe consisting of only important features
	for x in range(feat):
		line = file.readline()
		dataf=np.column_stack((dataf,df.pop(line[:-1])))
	X=normalize(dataf)
	X=scale(X)
	pca=PCA()
	pca.fit(X)
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
	#multiple classifiers
	clf1 = RandomForestClassifier(random_state=1,n_estimators=100)
	clf2 = GradientBoostingClassifier(n_estimators=1200,subsample=0.5,random_state=3)
	clf3 = SVC(gamma='auto')
	clf4 = KNeighborsClassifier(n_neighbors=1)
	clf5 = DecisionTreeClassifier(random_state=0)
	lr = LogisticRegression(solver='lbfgs')
	#stacking for data ensemble
	sclf = StackingClassifier(classifiers=[clf1, clf2, clf3, clf4, clf5], meta_classifier=lr)
	clf1.fit(X_train,y_train)
	clf2.fit(X_train,y_train)
	clf3.fit(X_train,y_train)
	clf4.fit(X_train,y_train)
	clf5.fit(X_train,y_train)
	sclf.fit(X_train,y_train)
	y_test_predict=sclf.predict(X_test)
	precision = precision_score(y_test, y_test_predict)
	accuracy = accuracy_score(y_test, y_test_predict)
	f1 = f1_score(y_test, y_test_predict)
	recall = recall_score(y_test, y_test_predict)
	scores = [precision, accuracy, f1, recall]
	labels = ['RF', 'GBDT', 'SVM', 'KNN', 'DT', 'Stacking']
	clf_list = [clf1, clf2, clf3, clf4, clf5, sclf]
	#score calculation
	for clf, label in zip(clf_list, labels):
		y_test_predict = clf.predict(X_test)
		tn, fp, fn, tp = confusion_matrix(y_test, y_test_predict).ravel()
		specificity = tn / (tn+fp)
		recall = tp / (tp+fn)
		precision = tp / (tp+fp)
		accuracy = (tp + tn) / (tp+tn+fp+fn)
		f1 = 2*tp / (2*tp+fp+fn)
		o_file.write("\nAccuracy: %.2f [%s] \nPrecision: %.2f [%s] \nRecall: %.2f [%s] \nF1 score: %.2f [%s] \nSpecificity: %.2f [%s]\n" %(accuracy,label,precision, label, recall, label, f1, label, specificity, label))
Example #16

def test_weight_unsupported_no_weight():
    # This is okay since we do not pass sample weight
    np.random.seed(123)
    meta = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf1 = RandomForestClassifier(n_estimators=10)
    clf2 = GaussianNB()
    clf3 = KNeighborsClassifier()
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                              meta_classifier=meta)
    sclf.fit(X, y)
Example #17
def test_verbose():
    np.random.seed(123)
    meta = LogisticRegression()
    clf1 = RandomForestClassifier()
    clf2 = GaussianNB()
    sclf = StackingClassifier(classifiers=[clf1, clf2],
                              use_probas=True,
                              meta_classifier=meta,
                              verbose=3)
    sclf.fit(iris.data, iris.target)
Example #18
def stackingPerformanceEditor():
    nb_clf = GaussianNB()
    rf_clf = RandomForestClassifier(n_estimators=100, max_depth=400, random_state=5)  # originally named svm_clf despite being a random forest
    mlp_clf = MLPClassifier(hidden_layer_sizes=(500, 500))
    label = ["NB", "RF", "MLP"]

    clf = StackingClassifier(classifiers=[nb_clf, rf_clf, mlp_clf],
                             meta_classifier=rf_clf)
    clf.fit(Xtrain2, ytrain2)
    acc = accuracy_score(ytest, clf.predict(Xtest2))
    return acc
Example #19
def test_verbose():
    np.random.seed(123)
    meta = LogisticRegression()
    clf1 = RandomForestClassifier()
    clf2 = GaussianNB()
    sclf = StackingClassifier(classifiers=[clf1, clf2],
                              use_probas=True,
                              meta_classifier=meta,
                              verbose=3)
    sclf.fit(iris.data, iris.target)
Example #20
 def stacking(self):
     from sklearn.svm import SVC
     from sklearn.pipeline import make_pipeline
     from sklearn.preprocessing import RobustScaler, MinMaxScaler
     from sklearn.preprocessing import StandardScaler
     from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
     from lightgbm import LGBMClassifier
     import xgboost as xgb
     from mlxtend.classifier import StackingClassifier
     import scipy as sc
     svc = make_pipeline(SVC(kernel='rbf', C=2.8, gamma=2))
     rf = RandomForestClassifier(random_state=590, n_estimators=6)
     GBoost = GradientBoostingClassifier(n_estimators=500,
                                         learning_rate=0.01,
                                         max_depth=12,
                                         max_features='sqrt',
                                         min_samples_leaf=15,
                                         min_samples_split=97,
                                         random_state=200)
     model_xgb = xgb.XGBClassifier(colsample_bytree=0.4603,
                                   gamma=10,
                                   learning_rate=0.01,
                                   max_depth=11,
                                   min_child_weight=1.7817,
                                   n_estimators=500,
                                   reg_alpha=0.01,
                                   reg_lambda=5,
                                   subsample=0.5213,
                                   silent=1,
                                   seed=1024,
                                   nthread=-1)
     # The original passed objective='regression' to LGBMClassifier;
     # the classifier sets its own classification objective.
     model_lgb = LGBMClassifier(num_leaves=5,
                                learning_rate=0.05,
                                n_estimators=550,
                                max_bin=25,
                                bagging_fraction=1,
                                bagging_freq=5,
                                feature_fraction=0.7,
                                feature_fraction_seed=9,
                                bagging_seed=9,
                                min_data_in_leaf=42,
                                min_sum_hessian_in_leaf=40)
     regressors = [rf, svc, GBoost, model_lgb, model_xgb]
     stregr = StackingClassifier(classifiers=regressors,
                                 meta_classifier=model_xgb,
                                 verbose=1)
     stregr.fit(self.X_train, self.y_train)
     print(
         "the model is stregr and the valid's f1 is: ",
         f1_score(self.y_test, stregr.predict(self.X_test),
                  average="macro"))
     # print("the model is stregr and the valid's precision_score is: ", precision_score(self.y_test, stregr.predict(self.X_test),average="macro"))
     # print("the model is stregr and the valid's recall_score is: ", recall_score(self.y_test, stregr.predict(self.X_test),average="macro"))
     return stregr
Example #21
def test_weight_unsupported_no_weight():
    # This is okay since we do not pass sample weight
    np.random.seed(123)
    meta = LogisticRegression(solver='liblinear',
                              multi_class='ovr')
    clf1 = RandomForestClassifier(n_estimators=10)
    clf2 = GaussianNB()
    clf3 = KNeighborsClassifier()
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                              meta_classifier=meta)
    sclf.fit(X, y)
Example #22

def train3():
    iris = datasets.load_iris()
    x = iris.data
    y = iris.target

    pipe1 = make_pipeline(ColumnSelector(cols=(0, 2)), LogisticRegression())
    pipe2 = make_pipeline(ColumnSelector(cols=(1, 2, 3)), LogisticRegression())

    sclf = StackingClassifier(classifiers=[pipe1, pipe2], meta_classifier=LogisticRegression())

    sclf.fit(x, y)
Example #23
def test_train_meta_features_():
    knn = KNeighborsClassifier()
    lr = LogisticRegression()
    gnb = GaussianNB()
    stclf = StackingClassifier(classifiers=[knn, gnb],
                               meta_classifier=lr,
                               store_train_meta_features=True)
    X_train, X_test, y_train,  y_test = train_test_split(X, y, test_size=0.3)
    stclf.fit(X_train, y_train)
    train_meta_features = stclf.train_meta_features_
    assert train_meta_features.shape == (X_train.shape[0], 2)
Example #24
def test_train_meta_features_():
    knn = KNeighborsClassifier()
    lr = LogisticRegression()
    gnb = GaussianNB()
    stclf = StackingClassifier(classifiers=[knn, gnb],
                               meta_classifier=lr,
                               store_train_meta_features=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    stclf.fit(X_train, y_train)
    train_meta_features = stclf.train_meta_features_
    assert train_meta_features.shape == (X_train.shape[0], 2)
Example #25

def test_verbose():
    np.random.seed(123)
    meta = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf1 = RandomForestClassifier(n_estimators=10)
    clf2 = GaussianNB()
    sclf = StackingClassifier(classifiers=[clf1, clf2],
                              use_probas=True,
                              meta_classifier=meta,
                              verbose=3)
    X, y = iris_data()
    sclf.fit(X, y)
Example #26
def test_weight_unsupported():
    # Error since KNN does not support sample_weight
    meta = LogisticRegression()
    clf1 = RandomForestClassifier()
    clf2 = GaussianNB()
    clf3 = KNeighborsClassifier()
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                              meta_classifier=meta)
    random.seed(87)
    w = np.array([random.random() for _ in range(len(y))])

    # KNeighborsClassifier.fit does not accept sample_weight, so this raises.
    with pytest.raises(TypeError):
        sclf.fit(X, y, sample_weight=w)  # originally misspelled 'sample_seight'
Example #27

def classifier_stacking(data_file, alertgroup_name, classifier_list):
    classifiers = {'KNN':KNeighborsClassifier(),
                   # n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric_params=None, n_jobs=1),
                   # 'LR': LogisticRegression(),
                   'RF':  RandomForestClassifier(),
                   # n_estimators=60,max_depth=13,min_samples_split=120,min_samples_leaf=20,random_state=10
                   'DT': tree.DecisionTreeClassifier(),
                   # criterion='gini',splitter=random,max_features=None,max_depth=13,min_samples_leaf=2
                   'GBDT': GradientBoostingClassifier()
                       # loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0, min_samples_split=2, min_samples_leaf=1,max_depth=3,verbose=0,presort='auto')
                   # 'XGB':xgboost_classifier
                   }
    all_data = pd.read_csv(data_file, sep=',', dtype=str)
    for alertgroup, group in all_data.groupby('alertgroup'):
        if alertgroup == alertgroup_name:
            train_x, test_x, train_y, test_y = get_data(group, split=True)
            arr_x = train_x.values
            arr_y = train_y.values
            max_fs = 0
            best_model = None
            stratified_folder = StratifiedKFold(n_splits=3, shuffle=False)

            # Use fold-local names so the held-out test_x/test_y from
            # get_data() are not overwritten inside the loop.
            for train_index, val_index in stratified_folder.split(arr_x, arr_y):
                fold_train_x = arr_x[train_index]
                fold_train_y = arr_y[train_index]
                fold_val_x = arr_x[val_index]
                fold_val_y = arr_y[val_index]
                classifiers_list = [classifiers[cl] for cl in classifier_list]
                stack_model = StackingClassifier(classifiers=classifiers_list,
                                                 use_probas=True,
                                                 average_probas=True,
                                                 meta_classifier=classifiers['RF'])

                stack_model.fit(fold_train_x, fold_train_y)
                predict = stack_model.predict(fold_val_x)
                fbetascore = fbeta_score(fold_val_y, predict, beta=1)
                print(' f1score:' + str(fbetascore))
                if fbetascore > max_fs:
                    max_fs = fbetascore
                    best_model = stack_model

            stack_model = best_model
            predict = stack_model.predict(test_x)
            precision = metrics.precision_score(test_y, predict)
            recall = metrics.recall_score(test_y, predict)
            fbetascore = fbeta_score(test_y, predict, beta=0.5)
            accuracy = metrics.accuracy_score(test_y, predict)
            print('final performance:')
            print(alertgroup_name)
            print('precision: %.6f' % (100 *precision))
            print('recall: %.6f' % (100 * recall))
            print('f0.5score: %.6f' % (100 * fbetascore))
            print('accuracy: %.6f%%' % (100 * accuracy))

            return best_model
Example #28
def test_verbose():
    np.random.seed(123)
    meta = LogisticRegression(solver='liblinear',
                              multi_class='ovr')
    clf1 = RandomForestClassifier(n_estimators=10)
    clf2 = GaussianNB()
    sclf = StackingClassifier(classifiers=[clf1, clf2],
                              use_probas=True,
                              meta_classifier=meta,
                              verbose=3)
    X, y = iris_data()
    sclf.fit(X, y)
Example #29

def test_train_meta_features_():
    np.random.seed(123)
    knn = KNeighborsClassifier()
    lr = LogisticRegression(solver='liblinear', multi_class='ovr')
    gnb = GaussianNB()
    stclf = StackingClassifier(classifiers=[knn, gnb],
                               meta_classifier=lr,
                               store_train_meta_features=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    stclf.fit(X_train, y_train)
    train_meta_features = stclf.train_meta_features_
    assert train_meta_features.shape == (X_train.shape[0], 2)
Example #30
def test_predict_meta_features():
    knn = KNeighborsClassifier()
    lr = LogisticRegression()
    gnb = GaussianNB()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    #  test default (class labels)
    stclf = StackingClassifier(classifiers=[knn, gnb],
                               meta_classifier=lr,
                               store_train_meta_features=True)
    stclf.fit(X_train, y_train)
    test_meta_features = stclf.predict(X_test)
    assert test_meta_features.shape == (X_test.shape[0], )
Example #31
File: train.py  Project: li-phone/Nopimal
 def stack_models(self, x1, y1, x2, y2, meta):
     classifiers = [v['model'] for k, v in self.fine_models.items()]
     meta_classifier = self.fine_models[meta]['model']
     sclf_lr = StackingClassifier(classifiers=classifiers,
                                  meta_classifier=meta_classifier,
                                  use_probas=True,
                                  average_probas=True,
                                  use_features_in_secondary=True)
     sclf_lr.fit(x1, y1.values)
     d = dict(model=sclf_lr, name='stacking_models', meta_classifier=meta)
     save_name = os.path.join(self.cfg.model_dir, 'stacking_models.m')
     joblib.dump(d, save_name, compress=5)
     return sclf_lr
Example #32
def test_predict_meta_features():
    knn = KNeighborsClassifier()
    lr = LogisticRegression()
    gnb = GaussianNB()
    X_train, X_test, y_train,  y_test = train_test_split(X, y, test_size=0.3)

    #  test default (class labels)
    stclf = StackingClassifier(classifiers=[knn, gnb],
                               meta_classifier=lr,
                               store_train_meta_features=True)
    stclf.fit(X_train, y_train)
    test_meta_features = stclf.predict(X_test)
    assert test_meta_features.shape == (X_test.shape[0],)
Example #33

def stack_model(X_train, Y_train, X_test, expert_model, n_estimator):
    estimators = [('DT', DecisionTreeClassifier()), ('MLP', MLPClassifier())]
    if expert_model == "DT":
        model = StackingClassifier(estimators=estimators,
                                   final_estimator=DecisionTreeClassifier())
    if expert_model == "MLP":
        model = StackingClassifier(estimators=estimators,
                                   final_estimator=MLPClassifier())
    model.fit(X_train, Y_train)

    Y_pred = model.predict(X_test)

    return Y_pred
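Unlike the mlxtend examples elsewhere on this page, Example #33 uses scikit-learn's own StackingClassifier, whose constructor takes named (name, estimator) pairs and a final_estimator. A minimal sketch contrasting the two real APIs (the base models here are arbitrary placeholders):

from sklearn.ensemble import StackingClassifier as SkStackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from mlxtend.classifier import StackingClassifier as MlxStackingClassifier

# scikit-learn: (name, estimator) pairs plus final_estimator.
sk_stack = SkStackingClassifier(estimators=[('dt', DecisionTreeClassifier())],
                                final_estimator=LogisticRegression())

# mlxtend: a plain estimator list plus meta_classifier.
mlx_stack = MlxStackingClassifier(classifiers=[DecisionTreeClassifier()],
                                  meta_classifier=LogisticRegression())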
Example #34
def test_train_meta_features_():
    np.random.seed(123)
    knn = KNeighborsClassifier()
    lr = LogisticRegression(solver='liblinear',
                            multi_class='ovr')
    gnb = GaussianNB()
    stclf = StackingClassifier(classifiers=[knn, gnb],
                               meta_classifier=lr,
                               store_train_meta_features=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    stclf.fit(X_train, y_train)
    train_meta_features = stclf.train_meta_features_
    assert train_meta_features.shape == (X_train.shape[0], 2)
Example #35

def test_use_features_in_secondary_sparse_input_predict_proba():
    np.random.seed(123)
    meta = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf1 = RandomForestClassifier(n_estimators=10)
    sclf = StackingClassifier(classifiers=[clf1],
                              use_features_in_secondary=True,
                              meta_classifier=meta)

    sclf.fit(sparse.csr_matrix(X), y)
    idx = [0, 1, 2]
    y_pred = sclf.predict_proba(sparse.csr_matrix(X[idx]))[:, 0]
    expect = np.array([0.910, 0.829, 0.882])
    np.testing.assert_almost_equal(y_pred, expect, 3)
Example #36

def test_weight_unsupported():
    # Error since KNN does not support sample_weight
    np.random.seed(123)
    meta = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf1 = RandomForestClassifier(n_estimators=10)
    clf2 = GaussianNB()
    clf3 = KNeighborsClassifier()
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                              meta_classifier=meta)
    random.seed(87)
    w = np.array([random.random() for _ in range(len(y))])

    with pytest.raises(TypeError):
        sclf.fit(X, y, sample_weight=w)  # KNN's fit() rejects sample_weight
Example #37
def test_use_features_in_secondary_predict_proba():
    np.random.seed(123)
    meta = LogisticRegression()
    clf1 = RandomForestClassifier()
    clf2 = GaussianNB()
    sclf = StackingClassifier(classifiers=[clf1, clf2],
                              use_features_in_secondary=True,
                              meta_classifier=meta)

    sclf.fit(X, y)
    idx = [0, 1, 2]
    y_pred = sclf.predict_proba(X[idx])[:, 0]
    expect = np.array([0.911, 0.829, 0.885])
    np.testing.assert_almost_equal(y_pred, expect, 3)
Example #38
def test_use_features_in_secondary_predict_proba():
    np.random.seed(123)
    meta = LogisticRegression()
    clf1 = RandomForestClassifier()
    clf2 = GaussianNB()
    sclf = StackingClassifier(classifiers=[clf1, clf2],
                              use_features_in_secondary=True,
                              meta_classifier=meta)

    sclf.fit(X, y)
    idx = [0, 1, 2]
    y_pred = sclf.predict_proba(X[idx])[:, 0]
    expect = np.array([0.911, 0.829, 0.885])
    np.testing.assert_almost_equal(y_pred, expect, 3)
Example #39
def test_weight_unsupported():
    # Error since KNN does not support sample_weight
    np.random.seed(123)
    meta = LogisticRegression(solver='liblinear',
                              multi_class='ovr')
    clf1 = RandomForestClassifier(n_estimators=10)
    clf2 = GaussianNB()
    clf3 = KNeighborsClassifier()
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                              meta_classifier=meta)
    random.seed(87)
    w = np.array([random.random() for _ in range(len(y))])

    with pytest.raises(TypeError):
        sclf.fit(X, y, sample_weight=w)  # KNN's fit() rejects sample_weight
Example #40
    def model_test(self, model, best_params):

        print('Model Test')
        print('Start:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        
        lr = self.model_init(model)
        clf1 = self.model_init('KNN')
        clf2 = self.model_init('RFC')
        clf3 = self.model_init('GNB')
        sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],meta_classifier=lr)
        
        sclf.set_params(**best_params)
        
        train_data = self.train.values.copy() 
        train_label = self.train_label['label'].values.copy()
        
        sclf.fit(train_data, train_label)
        
        if model.upper() == 'LR':
            coef = sclf.coef_.reshape(sclf.coef_.shape[1])  # was 'clf.coef_', an undefined name
            ind = coef.argsort()
            att = self.train.columns[ind[-30:]].tolist()
            print(att)
        elif model.upper() == 'RFC':
            imp = sclf.feature_importances_
            print(imp)
            ind = imp.argsort()
            att = self.train.columns[ind[-30:]].tolist()
            print(att)
        elif model.upper() == 'XGB':
            imp = sclf.feature_importances_
            print(imp)
            ind = imp.argsort()
            att = self.train.columns[ind[-30:]].tolist()
            print(att)
            
        test_data = self.test.values.copy()
        test_label = self.test_label['label'].values.copy()
        test_label = test_label.reshape(test_label.shape[0])
            
        res_proba=sclf.predict_proba(test_data)              
        res_auc=roc_auc_score(test_label,res_proba[:,1])
        
        print('Model: {0} ; Test: {1}'.format(model, res_auc))
                
        print('End:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        
        return res_auc
Example #41
def test_use_features_in_secondary_sparse_input_predict_proba():
    np.random.seed(123)
    meta = LogisticRegression(solver='liblinear',
                              multi_class='ovr')
    clf1 = RandomForestClassifier(n_estimators=10)
    sclf = StackingClassifier(classifiers=[clf1],
                              use_features_in_secondary=True,
                              meta_classifier=meta)

    sclf.fit(sparse.csr_matrix(X), y)
    idx = [0, 1, 2]
    y_pred = sclf.predict_proba(
        sparse.csr_matrix(X[idx])
    )[:, 0]
    expect = np.array([0.910, 0.829, 0.882])
    np.testing.assert_almost_equal(y_pred, expect, 3)
Example #42
def test_use_features_in_secondary_predict_proba():
    np.random.seed(123)
    X, y = iris_data()
    meta = LogisticRegression(solver='liblinear',
                              multi_class='ovr',
                              random_state=1)
    clf1 = RandomForestClassifier(n_estimators=10, random_state=1)
    clf2 = GaussianNB()
    sclf = StackingClassifier(classifiers=[clf1, clf2],
                              use_features_in_secondary=True,
                              meta_classifier=meta)

    sclf.fit(X, y)
    idx = [0, 1, 2]
    y_pred = sclf.predict_proba(X[idx])[:, 0]
    expect = np.array([0.916, 0.828, 0.889])
    np.testing.assert_almost_equal(y_pred, expect, 3)
Example #43
def test_multivariate_class():
    np.random.seed(123)
    meta = KNeighborsClassifier()
    clf1 = RandomForestClassifier()
    clf2 = KNeighborsClassifier()
    sclf = StackingClassifier(classifiers=[clf1, clf2],
                              meta_classifier=meta)
    y_pred = sclf.fit(X, y2).predict(X)
    ca = .973
    assert round((y_pred == y2).mean(), 3) == ca
Example #44
def model_processing(X_train,X_test,y_train,y_test):
    log_reg = LogisticRegression(C=0.01, penalty='l2')
    svc = SVC(C=0.7, kernel='linear')
    tree_clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5)
    rf_clf = RandomForestClassifier(n_estimators=70,criterion='entropy', max_features='auto',min_samples_leaf=6)
    xgb = XGBClassifier(gamma=0.3, max_depth=4, min_child_weight=8,reg_alpha=0.05)
    
    sclf = StackingClassifier(classifiers=[log_reg,svc,tree_clf,rf_clf],meta_classifier=xgb)
    sclf.fit(X_train,y_train)
    y_pred_train = sclf.predict(X_train)
    y_pred = sclf.predict(X_test)
    
    print('*' * 30, 'scores on the training set')

    accuracy = accuracy_score(y_train, y_pred_train)
    precision = precision_score(y_train, y_pred_train)
    f1 = f1_score(y_train, y_pred_train)
    recall = recall_score(y_train, y_pred_train)
    auc = roc_auc_score(y_train, y_pred_train)
    model_name = 'stacking model (train)'

    print('{} accuracy: {:.2f}'.format(model_name, accuracy))
    print('{} precision: {:.2f}'.format(model_name, precision))
    print('{} F1 score: {:.2f}'.format(model_name, f1))
    print('{} recall: {:.2f}'.format(model_name, recall))
    print('{} AUC: {:.2f}'.format(model_name, auc))
    
    
    print('*' * 30, 'scores on the test set')

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    model_name = 'stacking model (test)'

    print('{} accuracy: {:.2f}'.format(model_name, accuracy))
    print('{} precision: {:.2f}'.format(model_name, precision))
    print('{} F1 score: {:.2f}'.format(model_name, f1))
    print('{} recall: {:.2f}'.format(model_name, recall))
    print('{} AUC: {:.2f}'.format(model_name, auc))
Example #45
def test_StackingClassifier_drop_last_proba():
    np.random.seed(123)
    lr1 = LogisticRegression(solver='liblinear',
                             multi_class='ovr')
    sclf1 = StackingClassifier(classifiers=[lr1, lr1],
                               use_probas=True,
                               drop_last_proba=False,
                               meta_classifier=lr1)

    sclf1.fit(X, y)
    r1 = sclf1.predict_meta_features(X[:2])
    assert r1.shape == (2, 6)

    sclf2 = StackingClassifier(classifiers=[lr1, lr1],
                               use_probas=True,
                               drop_last_proba=True,
                               meta_classifier=lr1)

    sclf2.fit(X, y)
    r2 = sclf2.predict_meta_features(X[:2])
    assert r2.shape == (2, 4), r2.shape

    sclf3 = StackingClassifier(classifiers=[lr1, lr1],
                               use_probas=True,
                               drop_last_proba=True,
                               meta_classifier=lr1)

    sclf3.fit(X[0:100], y[0:100])  # only 2 classes
    r3 = sclf3.predict_meta_features(X[:2])
    assert r3.shape == (2, 2), r3.shape
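The expected widths follow from the meta-feature arithmetic: with use_probas=True each of the two base classifiers contributes one probability column per class, so three iris classes give 2 x 3 = 6 columns; drop_last_proba=True removes the last (redundant, since rows sum to 1) column per classifier, giving 2 x (3 - 1) = 4, and with only two classes 2 x (2 - 1) = 2.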
Example #46
def test_sample_weight():
    # Make sure that:
    #    prediction with weight
    # != prediction with no weight
    # == prediction with weight ones
    random.seed(87)
    w = np.array([random.random() for _ in range(len(y))])

    np.random.seed(123)
    meta = LogisticRegression(solver='liblinear',
                              multi_class='ovr')
    clf1 = RandomForestClassifier(n_estimators=10)
    clf2 = GaussianNB()
    sclf = StackingClassifier(classifiers=[clf1, clf2],
                              meta_classifier=meta)
    prob1 = sclf.fit(X, y, sample_weight=w).predict_proba(X)

    np.random.seed(123)
    meta = LogisticRegression(solver='liblinear',
                              multi_class='ovr')
    clf1 = RandomForestClassifier(n_estimators=10)
    clf2 = GaussianNB()
    sclf = StackingClassifier(classifiers=[clf1, clf2],
                              meta_classifier=meta)
    prob2 = sclf.fit(X, y, sample_weight=None).predict_proba(X)

    maxdiff = np.max(np.abs(prob1 - prob2))
    assert maxdiff > 1e-3, "max diff is %.4f" % maxdiff

    np.random.seed(123)
    meta = LogisticRegression(solver='liblinear',
                              multi_class='ovr')
    clf1 = RandomForestClassifier(n_estimators=10)
    clf2 = GaussianNB()
    sclf = StackingClassifier(classifiers=[clf1, clf2],
                              meta_classifier=meta)
    prob3 = sclf.fit(X, y, sample_weight=np.ones(len(y))).predict_proba(X)

    maxdiff = np.max(np.abs(prob2 - prob3))
    assert maxdiff < 1e-3, "max diff is %.4f" % maxdiff
Example #47
def predictor_ev():
    # Legacy Keras 1.x code (output_dim, W_regularizer, nb_epoch); only the
    # Python 2 print statements and xrange are modernized here.
    print("Building Neural Net classifiers for devices with events")
    n_input = X_train_ev.shape[1]
    n_train = X_train_ev.shape[0]
    
    from keras.models import Sequential
    from keras.layers import Dense, Activation
    from keras.layers.core import Dropout
    from keras.layers.advanced_activations import PReLU
    from keras.regularizers import l2
    from keras.optimizers import Adadelta
    from keras.optimizers import SGD
    from keras.wrappers.scikit_learn import KerasClassifier
    from keras.callbacks import ModelCheckpoint
    
    def create_model(n_hidden_layers=1, nodes=[50], reg=1.0, dropouts=[.5], acts=['relu']):
        n_in = n_input    
        model = Sequential()
        for i in range(n_hidden_layers):
            n_out = nodes[i]
            dropout = dropouts[i]
            act = acts[i]
            model.add(Dense(output_dim=n_out, input_dim=n_in, W_regularizer=l2(reg)))
            model.add(Activation(act))
            model.add(Dropout(dropout))
            n_in = n_out
        model.add(Dense(output_dim=12, W_regularizer=l2(reg)))
        model.add(Activation("softmax"))
        # Compile model
        adadelta = Adadelta(lr=1.0, rho=0.95, epsilon=1e-08)
        sgd = SGD(lr=0.05, decay=1e-6, momentum=0.9, nesterov=True)
        model.compile(loss='categorical_crossentropy', optimizer=adadelta, metrics=['accuracy'])
        return model
    
    class KerasClassifier2(KerasClassifier):
            
        def __init__(self, build_fn, fn_args, random_state=0, nb_epoch=10, batch_size=500, verbose=2):
            self.random_state = random_state
            self.nb_epoch = nb_epoch
            self.batch_size = batch_size
            self.verbose = verbose
            super(KerasClassifier2, self).__init__(build_fn, **fn_args)
            self.classes_= np.arange(12)
            self.n_classes_ = 12
            self.model = build_fn(**fn_args)
            
        def fit(self, X, y, sample_weight=None):
            return super(KerasClassifier2, self).fit(X, indicator(y),
                             verbose = self.verbose, sample_weight=sample_weight,
                             validation_data=(X_cv_ev, indicator(y_cv_ev)),
                             nb_epoch=self.nb_epoch, batch_size=self.batch_size)
    
    
        def predict_proba(self, X):
            return super(KerasClassifier2, self).predict_proba(X, batch_size=500, verbose=0)
            
        def predict(self, X):
            return super(KerasClassifier2, self).predict_proba(X, batch_size=500, verbose=0)            
    
    nn1_args = {'n_hidden_layers': 2, 'nodes': [600, 400], 'reg': 1.8,
                'dropouts': [.3, .4], 'acts': ['relu', 'relu']}
    nn2_args = {'n_hidden_layers': 3, 'nodes': [300, 100, 50], 'reg': 2.0,
                'dropouts': [.2, .4, .5], 'acts': ['relu', 'relu', 'relu']}
    nn3_args = {'n_hidden_layers': 4, 'nodes': [1001, 511, 245, 99], 'reg': 2.0,
                'dropouts': [.2, .3, .2, .3], 'acts': ['relu', 'relu', 'relu', 'relu']}
    nn4_args = {'n_hidden_layers': 1, 'nodes': [500], 'reg': 1.2,
                'dropouts': [.25], 'acts': ['relu']}
    nn5_args = {'n_hidden_layers': 5, 'nodes': [1343, 1012, 757, 539, 117],
                'reg': 2.5, 'dropouts': [.2, .3, .4, .4, .4],
                'acts': ['relu', 'relu', 'relu', 'relu', 'relu']}
    
    clfNN1 = KerasClassifier2(create_model, nn1_args, random_state=5, nb_epoch=5)
    clfNN2 = KerasClassifier2(create_model, nn2_args, random_state=23, nb_epoch=11)
    clfNN3 = KerasClassifier2(create_model, nn3_args, random_state=710, nb_epoch=6)
    clfNN4 = KerasClassifier2(create_model, nn4_args, random_state=5072, nb_epoch=6)
    clfNN5 = KerasClassifier2(create_model, nn5_args, random_state=2016, nb_epoch=12)
    
    print "Building XGBoost classifiers for devices with events"
    xgb_params = {
    "objective": "multi:softprob",
    "num_class": 12,
    "booster": "gblinear",
    "max_depth": 6,
    "eval_metric": "mlogloss",
    "eta": 0.07,
    "silent": 1,
    "alpha": 3.5,
    }
    
    class XGBClassifier2(xgb.XGBClassifier):
    
        def __init__(self, max_depth=xgb_params['max_depth'],
                     objective='multi:softprob', missing=None, 
                     learning_rate=xgb_params['eta'], n_estimators=40, subsample=1,
                     reg_alpha=xgb_params['alpha'], seed=2016, booster='gblinear'):
            super(XGBClassifier2, self).__init__(max_depth=max_depth, seed=seed,
                        objective=objective, missing=missing,
                        learning_rate=learning_rate, n_estimators=n_estimators,
                        subsample=subsample, reg_alpha=reg_alpha)
            self.booster = booster  # was xgb_params['booster'], which ignored the argument
            
        def fit(self, X, y):
            super(XGBClassifier2, self).fit(X.tocsc(), y, eval_metric='mlogloss',
                                            eval_set=[(X_cv_ev.tocsc(), y_cv_ev)])
            return self  # follow sklearn's fit-returns-self convention
    
    gbm1 = XGBClassifier2(seed=0, booster='gblinear', n_estimators=28)
    gbm2 = XGBClassifier2(seed=6, booster='gblinear', n_estimators=28)
    gbm3 = XGBClassifier2(seed=151, booster='gbtree', n_estimators=28)
    gbm4 = XGBClassifier2(seed=1047, booster='gbtree', n_estimators=28)
    gbm5 = XGBClassifier2(seed=22, booster='dart', n_estimators=28)
    
    print "Building Logistic Regression classifier for devices with events"
    clfLR = LogisticRegression(C=.02, random_state=2016, multi_class='multinomial', solver='newton-cg')
    
    # Combine results of classifiers
    print("Stacking classifiers for devices with events")
    clf_ls = [gbm1,gbm2,gbm3,gbm4,gbm5,clfNN1,clfNN2,clfNN3,clfNN4,clfNN5,clfLR]
    meta = LogisticRegression()
    stack = StackingClassifier(classifiers=clf_ls, meta_classifier=meta, use_probas=True, verbose=1)
    
    stack.fit(X_train_ev, y_train_ev)
    print(log_loss(y_cv_ev, stack.predict_proba(X_cv_ev)))
    y_pred_ev = stack.predict_proba(X_test_ev)
    #y_pre = (pred_prob_nn+y_pre)/2.0
    return y_pred_ev
Example #48
#clfKNN = KNeighborsClassifier(n_neighbors=5)
#clfKNN.fit(X_train_noev, y_train_noev)
#print log_loss(y_cv_noev, clfKNN.predict_proba(X_cv_noev))
#
##NB
#clfNB = MultinomialNB(alpha=1.0)
#clfNB.fit(X_train_noev, y_train_noev)
#print log_loss(y_cv_noev, clfNB.predict_proba(X_cv_noev))

# Combine results of classifiers
print("Stacking classifiers for devices with no events")
clf_ls = [gbm1, gbm2, gbm3, gbm4, gbm5, clfNN1, clfNN2, clfNN3, clfNN4, clfNN5, clfLR]
meta = LogisticRegression()
stack = StackingClassifier(classifiers=clf_ls, meta_classifier=meta, use_probas=True, verbose=1)

stack.fit(X_train_noev, y_train_noev)
print(log_loss(y_cv_noev, stack.predict_proba(X_cv_noev)))
y_pred_noev = stack.predict_proba(X_test_noev)
    #y_pre = (pred_prob_nn+y_pre)/2.0
#    return y_pred_noev

y_pred_ev = predictor_ev()
#y_pred_noev = predictor_noev()

# Write results
result = pd.DataFrame(np.vstack((y_pred_ev, y_pred_noev)), columns=le.classes_)  # was np.hstack(y_pred_ev, y_pred_noev); the two row blocks must be stacked vertically
result["device_id"] = test_dev
result = result.set_index("device_id")
result.to_csv('stacking_1.gz', index=True,
              index_label='device_id', compression="gzip")