Пример #1
0
# Hold out 25% of the samples for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, test_size=0.25)

xgb = XGBClassifier()

# Exhaustive search over the number of trees and the tree depth, scored by
# accuracy under shuffled, stratified 5-fold cross-validation.
cv = StratifiedKFold(n_splits=5, shuffle=True)
param = {
    'n_estimators': list(range(10, 201, 20)),
    'max_depth': list(range(1, 5)),
}

grid = GridSearchCV(
    estimator=xgb,
    param_grid=param,
    cv=cv,
    scoring='accuracy',
    verbose=1,
)
grid.fit(X_train, y_train)
model = grid.best_estimator_

# Evaluate the fitted search on the held-out split.
y_pred = grid.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

save_model(
    model_name='xgb_new',
    model=model,
    accuracy=accuracy,
    features=features,
)
Пример #2
0
def _load_pickled_model(path):
    """Return the object unpickled from *path*, closing the file afterwards."""
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file -- only load model files that come from a trusted location.
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)


# Load the pre-trained base models from the 'full/' directory.
path_to_models = ['full/' + name for name in path_to_models]
models = [_load_pickled_model(path) for path in path_to_models]

# Restrict every frame to the selected feature columns.
df_base = df_base.loc[:, features]
df_base.reset_index(inplace=True, drop=True)

df_non = df_non.loc[:, features]
df_label_0 = df_label_0.loc[:, features]
df_label_0.reset_index(inplace=True, drop=True)

df = pd.concat([df_base, df_label_0, df_non], ignore_index=True)
X, y = df.drop('y', axis=1).values, df['y'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.25)
# NOTE(review): cv is not passed to stacking() below, which is configured
# through n_folds/stratified/shuffle instead; kept in case later code uses it.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)

# Turn the base models' class predictions into features for the meta-model.
S_train, S_test = stacking(models, X_train, y_train, X_test,
                           regression=False,
                           mode='oof_pred_bag',
                           needs_proba=False,
                           save_dir=None,
                           metric=accuracy_score,
                           n_folds=5,
                           stratified=True,
                           shuffle=True,
                           verbose=2)

# Fit the meta-model on the stacked predictions.
model = LogisticRegression()
model.fit(S_train, y_train)

# Score the held-out split once and reuse the value for printing and saving.
meta_accuracy = accuracy_score(y_test, model.predict(S_test))
print(meta_accuracy)

meta_feat = ['rf_predict', 'xgb_predict', 'xgb_new_predict', 'xgb_new_model_predict']
save_model('Meta_model', model, meta_accuracy, meta_feat)
Пример #3
0
# Assemble the full dataset and split off 25% for the final evaluation.
df = pd.concat([df_base, df_label_0, df_non], ignore_index=True)
X = df.drop('y', axis=1).values
y = df['y'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, test_size=0.25)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)

# Randomized search (50 draws) over the tree-shape hyper-parameters of a
# 1000-tree random forest, scored by accuracy.
forest = RandomForestClassifier(n_estimators=1000)
params = {
    'max_depth': list(range(3, 12)),
    'min_samples_split': list(range(2, 6)),
    'min_samples_leaf': list(range(1, 4)),
}

search = RandomizedSearchCV(
    estimator=forest,
    param_distributions=params,
    n_iter=50,
    scoring='accuracy',
    cv=cv,
)
print("TRAINING")
search.fit(X_train, y_train)
model = search.best_estimator_
print("BEST ESTIMATOR {}".format(model))

# Evaluate the best estimator on the held-out split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

save_model(
    model_name='rf',
    model=model,
    accuracy=accuracy,
    features=features,
)
Пример #4
0
# Restrict every frame to the selected feature columns.
df_base = df_base.loc[:, features].reset_index(drop=True)
df_non = df_non.loc[:, features]
df_label_0 = df_label_0.loc[:, features].reset_index(drop=True)

# Assemble the full dataset and split off 25% for the final evaluation.
frames = [df_base, df_label_0, df_non, df4, df5, df6, df7, df8, df9]
df = pd.concat(frames, ignore_index=True)
X = df.drop('y', axis=1).values
y = df['y'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, test_size=0.25)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)

# Randomized search (50 draws) over the tree-shape hyper-parameters of a
# 1000-tree random forest, scored by recall.
forest = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
params = {
    'max_depth': list(range(3, 12)),
    'min_samples_split': list(range(2, 6)),
    'min_samples_leaf': list(range(1, 4)),
}

search = RandomizedSearchCV(
    estimator=forest,
    param_distributions=params,
    n_iter=50,
    scoring='recall',
    cv=cv,
)
print("TRAINING")
search.fit(X_train, y_train)
model = search.best_estimator_
print("BEST ESTIMATOR {}".format(model))

# Despite its name, `accuracy` holds the recall values returned with
# average=None; it is printed only, and None is what gets saved.
y_pred = model.predict(X_test)
accuracy = recall_score(y_test, y_pred, average=None)
print(accuracy)

save_model(
    model_name='rf_fake_without_cw',
    model=model,
    accuracy=None,
    features=features,
)
Пример #5
0
# Restrict the frames to the selected feature columns.
df_non = df_non.loc[:, features]
df_label_0 = df_label_0.loc[:, features]

frames = [df_base, df_label_0, df_non, df4, df5, df6, df7, df8, df9]
df = pd.concat(frames, ignore_index=True)

X = df.drop('y', axis=1).values
y = df['y'].values

# Samples labelled 1 get weight 1.5, every other sample weight 1.0.
weights = np.where(y == 1, 1.5, 1.0)

# Split the weights together with the data so they stay aligned row by row.
(X_train, X_test,
 y_train, y_test,
 weights_train, weights_test) = train_test_split(
    X, y, weights, shuffle=True, test_size=0.25)

xgb = XGBClassifier()

# Exhaustive search over the number of trees and the tree depth, scored by
# recall under shuffled, stratified 5-fold cross-validation.
cv = StratifiedKFold(n_splits=5, shuffle=True)
param = {
    'n_estimators': list(range(10, 51, 5)),
    'max_depth': list(range(1, 5)),
}

grid = GridSearchCV(
    estimator=xgb,
    param_grid=param,
    cv=cv,
    scoring='recall',
    verbose=1,
)
grid.fit(X_train, y_train, sample_weight=weights_train)
model = grid.best_estimator_

# Despite its name, `accuracy` holds the recall values returned with
# average=None; it is printed only, and None is what gets saved.
y_pred = grid.predict(X_test)
accuracy = recall_score(y_test, y_pred, average=None)
print(accuracy)

save_model('xgb_recall', model, None, features)