Example #1
# Metrics analysis

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
print('Accuracy Score : ' + str(accuracy_score(Y_chegada, Y_chegada_pred_lgbm)))
print('Precision Score : ' + str(precision_score(Y_chegada, Y_chegada_pred_lgbm)))
print('Recall Score : ' + str(recall_score(Y_chegada, Y_chegada_pred_lgbm)))
print('F1 Score : ' + str(f1_score(Y_chegada, Y_chegada_pred_lgbm)))

# Training and prediction with the X_chegada and Y_chegada datasets

from lightgbm import LGBMClassifier

classifier_lgbm_chegada = LGBMClassifier( max_depth = 1000, 
                                          learning_rate = 0.01,
                                          num_leaves = 2000,
                                          min_data_in_leaf = 200,
                                          n_estimators = 5000,
                                          objective = 'binary',
                                          metric = 'binary_logloss' )

classifier_lgbm_chegada.fit(X_chegada_train, Y_chegada_train)

Y_chegada_pred_lgbm = classifier_lgbm_chegada.predict(X_chegada_test)

# Metrics analysis

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
print('Accuracy Score : ' + str(accuracy_score(Y_chegada_test, Y_chegada_pred_lgbm)))
print('Precision Score : ' + str(precision_score(Y_chegada_test, Y_chegada_pred_lgbm)))
print('Recall Score : ' + str(recall_score(Y_chegada_test, Y_chegada_pred_lgbm)))
print('F1 Score : ' + str(f1_score(Y_chegada_test, Y_chegada_pred_lgbm)))
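
# A quick cross-check of the four metrics above (not in the original snippet): print the
# confusion matrix and classification report, reusing the same test labels and predictions.
from sklearn.metrics import confusion_matrix, classification_report
print('Confusion Matrix :\n' + str(confusion_matrix(Y_chegada_test, Y_chegada_pred_lgbm)))
print(classification_report(Y_chegada_test, Y_chegada_pred_lgbm))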
Example #2
def cv_lgbm_scores(df_, num_folds, params, 
                   target_name = 'TARGET', index_name = 'SK_ID_CURR',
                   stratified = False, rs = 1001, verbose = -1):
    
    warnings.simplefilter('ignore')
    
    # Cleaning and defining parameters for LGBM
    params = int_lgbm_params(params)
    clf = LGBMClassifier(**params, n_estimators = 20000, nthread = 4, n_jobs = -1)

    # Divide in training/validation and test data
    df_train_ = df_[df_[target_name].notnull()]
    df_test_ = df_[df_[target_name].isnull()]
    print("Starting LightGBM cross-validation at {}".format(time.ctime()))
    print("Train shape: {}, test shape: {}".format(df_train_.shape, df_test_.shape))

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits = num_folds, shuffle = True, random_state = rs)
    else:
        folds = KFold(n_splits = num_folds, shuffle = True, random_state = rs)
        
    # Create arrays to store results
    train_pred = np.zeros(df_train_.shape[0])
    train_pred_proba = np.zeros(df_train_.shape[0])

    test_pred = np.zeros(df_train_.shape[0])
    test_pred_proba = np.zeros(df_train_.shape[0])
    
    prediction = np.zeros(df_test_.shape[0]) # prediction for test set
    
    feats = df_train_.columns.drop([target_name, index_name])
    
    df_feat_imp_ = pd.DataFrame(index = feats)
    
    # Cross-validation cycle
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(df_train_[feats], df_train_[target_name])):
        print('--- Fold {} started at {}'.format(n_fold, time.ctime()))
        
        train_x, train_y = df_train_[feats].iloc[train_idx], df_train_[target_name].iloc[train_idx]
        valid_x, valid_y = df_train_[feats].iloc[valid_idx], df_train_[target_name].iloc[valid_idx]

        clf.fit(train_x, train_y, 
                eval_set = [(valid_x, valid_y)], eval_metric = 'auc', 
                verbose = verbose, early_stopping_rounds = 100)

        train_pred[train_idx] = clf.predict(train_x, num_iteration = clf.best_iteration_)
        train_pred_proba[train_idx] = clf.predict_proba(train_x, num_iteration = clf.best_iteration_)[:, 1]
        test_pred[valid_idx] = clf.predict(valid_x, num_iteration = clf.best_iteration_)
        test_pred_proba[valid_idx] = clf.predict_proba(valid_x, num_iteration = clf.best_iteration_)[:, 1]
        
        prediction += clf.predict_proba(df_test_[feats], 
                                        num_iteration = clf.best_iteration_)[:, 1] / folds.n_splits

        df_feat_imp_[n_fold] = pd.Series(clf.feature_importances_, index = feats)
        
        del train_x, train_y, valid_x, valid_y
        gc.collect()

    # Computation of metrics
    roc_auc_train = roc_auc_score(df_train_[target_name], train_pred_proba)
    precision_train = precision_score(df_train_[target_name], train_pred, average = None)
    recall_train = recall_score(df_train_[target_name], train_pred, average = None)
    
    roc_auc_test = roc_auc_score(df_train_[target_name], test_pred_proba)
    precision_test = precision_score(df_train_[target_name], test_pred, average = None)
    recall_test = recall_score(df_train_[target_name], test_pred, average = None)

    print('Full AUC score {:.6f}'.format(roc_auc_test))
    
    # Filling the feature_importance table
    df_feat_imp_.fillna(0, inplace = True)
    df_feat_imp_['mean'] = df_feat_imp_.mean(axis = 1)
    
    # Preparing results of prediction for saving
    prediction_train = df_train_[[index_name]]
    prediction_train[target_name] = test_pred_proba
    prediction_test = df_test_[[index_name]]
    prediction_test[target_name] = prediction
    
    del df_train_, df_test_
    gc.collect()
    
    # Returning the results and metrics in format for scores' table
    return df_feat_imp_, prediction_train, prediction_test, \
           [roc_auc_train, roc_auc_test,
            precision_train[0], precision_test[0], precision_train[1], precision_test[1],
            recall_train[0], recall_test[0], recall_train[1], recall_test[1], 0]
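
# Hedged usage sketch (not part of the original function). The frame name df_all, the fold
# count and the parameter values below are assumptions; df_all is expected to contain both
# train rows (TARGET not null) and test rows (TARGET null) plus an SK_ID_CURR column.
example_params = {'learning_rate': 0.02, 'num_leaves': 32, 'colsample_bytree': 0.9,
                  'subsample': 0.87, 'max_depth': 8, 'reg_alpha': 0.04, 'reg_lambda': 0.07}
feat_imp, pred_train, pred_test, fold_scores = cv_lgbm_scores(df_all, num_folds=5,
                                                              params=example_params,
                                                              stratified=True)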
Example #3
test_pca = pca.transform(test)

# use stratifiedkfold(n_splits=3) to fit_predict the test set 3 times
# and get the proba

lgb_params = dict()
lgb_params['learning_rate'] = 0.01
lgb_params['n_estimators'] = 1000
# lgb_params['max_depth'] = 10
lgb_params['max_bin'] = 10
lgb_params['subsample'] = 0.8
lgb_params['subsample_freq'] = 10
lgb_params['colsample_bytree'] = 0.8
lgb_params['min_child_samples'] = 500

lgb = LGBMClassifier(**lgb_params)

skf = StratifiedKFold(n_splits=3, shuffle=True)

predictions = np.zeros((test_pca.shape[0], 3))
# note: the fold counter must advance with each split, so enumerate() is used here
for i, (train_index, test_index) in enumerate(skf.split(train_pca, train_target)):
    lgb_train = train_pca[train_index]
    lgb_target = train_target[train_index]
    lgb.fit(lgb_train, lgb_target)
    y_pred = lgb.predict_proba(test_pca)[:, 1]
    predictions[:, i] = y_pred

# write the result to a csv
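# Hedged sketch of the CSV step promised above (not in the original snippet). It assumes
# pandas is imported as pd and that `test` is a DataFrame whose index identifies the rows;
# the output file name and column names are placeholders.
mean_pred = predictions.mean(axis=1)
pd.DataFrame({'id': test.index, 'target': mean_pred}).to_csv('lgbm_pca_predictions.csv', index=False)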
Example #4
    def fit(self, X: pd.DataFrame, y: pd.Series) -> tuple:
        # process cat cols
        if self.cat_validation == "None":
            encoder = MultipleEncoder(cols=self.cat_cols,
                                      encoders_names_tuple=self.encoders_names)
            X = encoder.fit_transform(X, y)

        for n_fold, (train_idx,
                     val_idx) in enumerate(self.model_validation.split(X, y)):
            X_train, X_val = (
                X.iloc[train_idx].reset_index(drop=True),
                X.iloc[val_idx].reset_index(drop=True),
            )
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            if self.cat_validation == "Single":
                encoder = MultipleEncoder(
                    cols=self.cat_cols,
                    encoders_names_tuple=self.encoders_names)
                X_train = encoder.fit_transform(X_train, y_train)
                X_val = encoder.transform(X_val)

            if self.cat_validation == "Double":
                encoder = DoubleValidationEncoderNumerical(
                    cols=self.cat_cols,
                    encoders_names_tuple=self.encoders_names)
                X_train = encoder.fit_transform(X_train, y_train)
                X_val = encoder.transform(X_val)
            self.encoders_list.append(encoder)

            # check for OrdinalEncoder encoding
            for col in [
                    col for col in X_train.columns if "OrdinalEncoder" in col
            ]:
                X_train[col] = X_train[col].astype("category")
                X_val[col] = X_val[col].astype("category")

            # fit model
            model = LGBMClassifier(**self.model_params)
            model.fit(
                X_train,
                y_train,
                eval_set=[(X_train, y_train), (X_val, y_val)],
                early_stopping_rounds=50,
                verbose=False,
            )
            self.models_trees.append(model.best_iteration_)
            self.models_list.append(model)

            y_hat = model.predict_proba(X_train)[:, 1]
            score_train = roc_auc_score(y_train, y_hat)
            self.scores_list_train.append(score_train)
            y_hat = model.predict_proba(X_val)[:, 1]
            score_val = roc_auc_score(y_val, y_hat)
            self.scores_list_val.append(score_val)

        mean_score_train = np.mean(self.scores_list_train)
        mean_score_val = np.mean(self.scores_list_val)
        avg_num_trees = int(np.mean(self.models_trees))
        print(f"Mean score train : {np.round(mean_score_train, 4)}")
        print(f"Mean score val : {np.round(mean_score_val, 4)}")
        return mean_score_train, mean_score_val, avg_num_trees
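
    # Hedged sketch (not part of the original class): an inference counterpart to fit(),
    # assuming self.encoders_list and self.models_list were filled as above. The method name
    # and the simple averaging over folds are illustrative assumptions, not the library's API.
    def predict_proba_mean(self, X: pd.DataFrame) -> np.ndarray:
        preds = np.zeros(X.shape[0])
        for encoder, model in zip(self.encoders_list, self.models_list):
            X_enc = encoder.transform(X.reset_index(drop=True))
            # mirror the category casting done for OrdinalEncoder columns during training
            for col in [c for c in X_enc.columns if "OrdinalEncoder" in c]:
                X_enc[col] = X_enc[col].astype("category")
            preds += model.predict_proba(X_enc)[:, 1] / len(self.models_list)
        return preds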
Example #5
def without_cv_transfer_a_to_b_modeling():
    """

    :return:
    """

    '''Data input'''
    data_a_train = pd.read_csv('../data/A_train_final.csv', index_col='no')
    data_b_train = pd.read_csv('../data/B_train_final.csv', index_col='no')
    y_of_b_train = data_b_train['flag']
    data_b_test = pd.read_csv('../data/B_test_final.csv', index_col='no')

    '''Feature engineering for A_train'''
    data_a_train_without_label = data_a_train.drop('flag', axis=1)

    data_a_train_without_label['UserInfo_222x82'] = data_a_train_without_label['UserInfo_82'] * data_a_train_without_label['UserInfo_222']

    '''Fill missing values'''
    data_a_train_filled = data_a_train_without_label.fillna(value=10)

    '''Feature names'''
    feature_name = list(data_a_train_without_label.columns.values)
    data_b_test_user_id = list(data_b_test.index.values)

    '''Build the training and test sets'''
    x_temp = data_a_train_filled.iloc[:, :].values  # independent variables
    y = data_a_train.iloc[:, -1].values  # dependent variable (label)

    '''Feature selection. Note: if more features are added, feature_name must be updated as well'''
    X, dropped_feature_name, len_feature_choose = lgb_feature_selection(feature_name, x_temp, y, "0.1*mean")

    '''Feature engineering for B_train'''
    data_b_train_without_label = data_b_train.drop('flag', axis=1)

    data_b_train_without_label['UserInfo_222x82'] = data_b_train_without_label['UserInfo_82'] * data_b_train_without_label['UserInfo_222']
    data_b_train_filled = data_b_train_without_label.fillna(value=10)

    '''Feature engineering for B_test'''
    data_b_test['UserInfo_222x82'] = data_b_test['UserInfo_82'] * data_b_test['UserInfo_222']
    data_b_test_filled = data_b_test.fillna(value=10)

    '''Feature filtering'''
    data_b_train_filled_after_feature_selection = data_test_feature_drop(data_b_train_filled, dropped_feature_name)
    data_b_test_filled_after_feature_selection = data_test_feature_drop(data_b_test_filled, dropped_feature_name)

    '''Train a model on A_train and predict B_train'''

    print('Start time')
    print(time.clock() * 1.0 / 60)

    parameter_n_estimators = 400
    classifier = LGBMClassifier(n_estimators=parameter_n_estimators)

    a_model = classifier.fit(X, y)

    prob_of_b_train = a_model.predict_proba(data_b_train_filled_after_feature_selection)

    print('Training end time')
    print(time.clock() * 1.0 / 60)

    '''Plot the ROC curve'''
    fpr, tpr, thresholds = roc_curve(y_of_b_train, prob_of_b_train[:, 1])

    roc_auc = auc(fpr, tpr)

    print('\nauc=' + str(roc_auc))

    '''Predict B_test'''

    prob_of_b_test = a_model.predict_proba(data_b_test_filled_after_feature_selection)

    result_file_name = '../result/B_test_predict_using_A_LGBLGB_without_cv_fillna_10' + '_N_' + str(parameter_n_estimators) + '_features_' + \
                       str(len_feature_choose) + '_offline_'+str(roc_auc)+'.csv'

    write_predict_results_to_csv(result_file_name, data_b_test_user_id, prob_of_b_test[:, 1].tolist())
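
# Hedged sketch (not part of this snippet) of what the helper used above might look like;
# the real write_predict_results_to_csv in this project may differ, and the column names
# 'no' and 'pred' are assumptions.
def write_predict_results_to_csv(file_name, user_ids, probabilities):
    result_df = pd.DataFrame({'no': user_ids, 'pred': probabilities})
    result_df.to_csv(file_name, index=False)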
Example #6
def kfold_lightgbm(df, debug=False):
    # Divide in training/validation and test data

    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()

    folds = KFold(n_splits=10, shuffle=True, random_state=1001)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])  # predicted valid_y
    sub_preds = np.zeros(test_df.shape[0])  # submission preds
    feature_importance_df = pd.DataFrame()  # feature importance

    fold_auc_best_df = pd.DataFrame(columns=["FOLD", "AUC", "BEST_ITER"])  # holding best iter to save model
    feats = [f for f in train_df.columns if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index',
                                                      "APP_index", "BURO_index", "PREV_index", "INSTAL_index",
                                                      "CC_index", "POS_index"]]

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]
        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            n_jobs=-1,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1, )

        clf.fit(train_x,
                train_y,
                eval_set=[(train_x, train_y),
                          (valid_x, valid_y)],
                eval_metric='auc',
                verbose=200,
                early_stopping_rounds=200)

        # predicted valid_y
        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]

        # submission preds: predict the test set for each fold and average over all folds.
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        # fold, auc and best iteration
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))

        # best auc & iteration
        fold_auc_best_df = fold_auc_best_df.append({'FOLD': int(n_fold + 1),
                                                    'AUC': roc_auc_score(valid_y, oof_preds[valid_idx]),
                                                    "BEST_ITER": clf.best_iteration_}, ignore_index=True)

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    # OUTPUTS
    print(fold_auc_best_df)
    print(feature_importance_df)

    # save the feature importances as a pickled DataFrame
    feature_importance_df.to_pickle("outputs/features/feature_importance_df.pkl")
    fold_auc_best_df.to_pickle("outputs/features/fold_auc_best_df.pkl")

    # Final Model
    best_iter_1 = int(fold_auc_best_df.sort_values(by="AUC", ascending=False)[:1]["BEST_ITER"].values)

    y_train = train_df["TARGET"]
    x_train = train_df[feats]

    final_model = LGBMClassifier(
        n_jobs=-1,
        n_estimators=best_iter_1,
        learning_rate=0.02,
        num_leaves=34,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.041545473,
        reg_lambda=0.0735294,
        min_split_gain=0.0222415,
        min_child_weight=39.3259775,
        silent=-1,
        verbose=-1).fit(x_train, y_train)

    cur_dir = os.getcwd()
    os.chdir('models/reference/')
    pickle.dump(final_model, open("lightgbm_final_model.pkl", 'wb'))  # model
    os.chdir(cur_dir)

    # the valid_y values predicted in each fold are, taken together, out-of-fold predictions for the training set's y.
    cowsay.cow('Full Train(Validation) AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # Write submission file and plot feature importance
    if not debug:
        cur_dir = os.getcwd()
        os.chdir('outputs/predictions/')
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv("reference_submission.csv", index=False)
        os.chdir(cur_dir)
    display_importances(feature_importance_df)
    del x_train, y_train

    return feature_importance_df
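
# Hedged usage sketch (not in the original function): reload the pickled final model and score
# fresh rows; new_df is an assumed DataFrame with the same feature columns used for training.
with open('models/reference/lightgbm_final_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
new_scores = loaded_model.predict_proba(new_df)[:, 1]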
Example #7
for i in categorical_features:
    class_le.fit(alldata.iloc[:, i].values)
    alldata.iloc[:, i] = class_le.transform(alldata.iloc[:, i].values)
alldata.head()

# In[49]:

x_train = alldata.iloc[:, 1:]
y_train = alldata.iloc[:, 0]

# In[59]:

start = time.time()
estimator = LGBMClassifier(objective='binary',
                           colsample_bytree=0.8,
                           subsample=0.8,
                           eval_metric='auc',
                           learning_rate=0.3,
                           n_estimators=25)
param_grid = {
    'max_depth': range(6, 18, 3),
    'num_leaves': range(1000, 10000, 2000)
}
gs = GridSearchCV(estimator, param_grid, cv=3)
print(gs.fit(x_train.head(100000), y_train.head(100000)))
print('{:.2f}'.format(time.time() - start) + ' sec')

# In[10]:

# Grid search over max_depth and num_leaves, keeping num_leaves < 2**max_depth
estimator = LGBMClassifier(objective='binary',
                           colsample_bytree=0.8,
Example #8
'''

# for fixing LightGBMError: Do not support special JSON characters in feature name.
#train_data.columns       = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in train_data.columns]
#train_data_label.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in train_data_label.columns]

# LightGBM Classifier  
lgbm_param = {
            'max_depth':[30,35,40,45,50,55,60],
            'min_child_samples':[10,15,20,30,40,45,50],
            'n_estimators':[200,300,400,500,600,650,700,800],
            'learning_rate':stats.uniform(0.2, 0.5),
            'num_leaves':[40,45,50,55,60,65,70,80]}


lgbm = LGBMClassifier()
start = time()
random_search = RandomizedSearchCV(lgbm, param_distributions=lgbm_param,n_iter=N_ITER,n_jobs=4)
random_search.fit(train_data,mapped_labels )
print("RandomizedSearchCV took %.2f seconds for LGBM." % (time() - start))
report(random_search.cv_results_)
'''
# XGboost Classifier 

xg_params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'learning_rate': stats.uniform(0, 0.02),
        'max_depth': [5,6,7,8,9,10,11,12],
Example #9
rfc_predict = rfc.predict(X_test_std)
rfc_predict_proba = rfc.predict_proba(X_test_std)[:,1]
get_scores(y_test,rfc_predict,rfc_predict_proba)
print('')

#GBDT
print('GBDT:')
gdbt = GradientBoostingClassifier(random_state=2018)
gdbt.fit(X_train_std,y_train)
gdbt_predict = gdbt.predict(X_test_std)
gdbt_predict_proba = gdbt.predict_proba(X_test_std)[:,1]
get_scores(y_test,gdbt_predict,gdbt_predict_proba)
print('')

#XGBoost
print('XGBoost:')
xgbs = XGBClassifier(random_state=2018)
xgbs.fit(X_train_std,y_train)
xgbs_predict = xgbs.predict(X_test_std)
xgbs_predict_proba = xgbs.predict_proba(X_test_std)[:,1]
get_scores(y_test,xgbs_predict,xgbs_predict_proba)
print('')

#LightGBM
print('LightGBM:')
lgbm = LGBMClassifier(random_state=2018)
lgbm.fit(X_train_std,y_train)
lgbm_predict = lgbm.predict(X_test_std)
lgbm_predict_proba = lgbm.predict_proba(X_test_std)[:,1]
get_scores(y_test,lgbm_predict,lgbm_predict_proba)
Example #10
# 3 Split out the test set
drop_feature = [
    'risk_time', 'consumer_no', 'month_status', 'first_status', 'data_type'
]
test_x = test_data.loc[:, ~test_data.columns.isin(drop_feature)]
test_y = test_data.loc[:, 'month_status']

# 4 Training on imbalanced samples
numericFeature = train_x.columns.tolist()
OversampleRandom5 = {'RandomSample': {'ratio': 0.5, 'random_state': 10}}
OversampleRandom4 = {'SMOTEENN': {'ratio': 0.4, 'random_state': 10}}
OversampleRandom3 = {'Smote': {'ratio': 0.3, 'random_state': 10}}


lgb = LGBMClassifier(boosting_type='gbdt', learning_rate=0.1, max_depth=2, n_estimators=500,
                     n_jobs=-1, objective='binary', importance_type='gain',
                     random_state=10)
imbRandom5 = imbalanceOversampleProcess(numericFeature, OversampleRandom5, lgb)
imbRandom4 = imbalanceOversampleProcess(numericFeature, OversampleRandom4, lgb)
imbRandom3 = imbalanceOversampleProcess(numericFeature, OversampleRandom3, lgb)

# 5 Evaluation pipeline
data_dict = {
    'train': {
        'X': train_x,
        'y': train_y
    },
    'test': {
        'X': test_x,
        'y': test_y
    }
Example #11
CASE = 1

titanic = pd.read_pickle('tests/data/clean_titanic.pkl')
if CASE == 1:
    features = ['Pclass', 'Survived', 'Embarked', 'Sex']
    encoder = one_hot.OneHotEncoder(titanic, cols=['Embarked', 'Sex'])
    X = titanic[features]
    y = titanic['Age'].to_frame()
    model = LGBMRegressor()

elif CASE == 2:
    features = ['Pclass', 'Age', 'Embarked', 'Sex']
    encoder = one_hot.OneHotEncoder(titanic, cols=['Embarked', 'Sex'])
    X = titanic[features]
    y = titanic['Survived'].to_frame()
    model = LGBMClassifier()

else:
    features = ['Survived', 'Age', 'Embarked', 'Sex']
    encoder = one_hot.OneHotEncoder(titanic, cols=['Embarked', 'Sex'])
    X = titanic[features]
    y = titanic['Pclass'].to_frame()
    model = LGBMClassifier()

titanic_enc = encoder.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    titanic_enc,
    y,
    test_size=0.2,
)
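
# Hedged continuation (not in the original snippet): fit whichever model the CASE block
# selected and report a simple holdout score; .values.ravel() flattens the one-column target frame.
model.fit(X_train, y_train.values.ravel())
print('holdout score:', model.score(X_test, y_test.values.ravel()))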
Example #12
def lgbm_modeling_cross_validation(params,
                                   full_train,
                                   y,
                                   classes,
                                   class_weights,
                                   nr_fold=10,
                                   random_state=7):

    unique_y = np.unique(y)
    class_map = dict()
    for i, val in enumerate(unique_y):
        class_map[val] = i

    # y = np.array([class_map[val] for val in y])
    y = y.apply(lambda x: class_map[x])

    # Compute weights
    w = y.value_counts()
    weights = {i: np.sum(w) / w[i] for i in w.index}

    clfs = []
    importances = pd.DataFrame()
    folds = StratifiedKFold(n_splits=nr_fold,
                            shuffle=True,
                            random_state=random_state)

    oof_preds = np.zeros((len(full_train), np.unique(y).shape[0]))
    for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
        trn_x, trn_y = full_train.iloc[trn_], y.iloc[trn_]
        val_x, val_y = full_train.iloc[val_], y.iloc[val_]

        trn_xa, trn_y, val_xa, val_y = smoteAdataset(trn_x.values,
                                                     trn_y.values,
                                                     val_x.values,
                                                     val_y.values)
        trn_x = pd.DataFrame(data=trn_xa, columns=trn_x.columns)

        val_x = pd.DataFrame(data=val_xa, columns=val_x.columns)

        clf = LGBMClassifier(**params)
        clf.fit(trn_x,
                trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)],
                eval_metric=lgbm_multi_weighted_logloss,
                verbose=100,
                early_stopping_rounds=50,
                sample_weight=trn_y.map(weights))

        clf.my_name = "lgbm"

        clfs.append(clf)

        oof_preds[val_, :] = clf.predict_proba(
            val_x)  #, num_iteration=clf.best_iteration_)
        print('no {}-fold loss: {}'.format(
            fold_ + 1,
            multi_weighted_logloss(val_y, oof_preds[val_, :], classes,
                                   class_weights)))

        imp_df = pd.DataFrame({
            'feature': full_train.columns,
            'gain': clf.feature_importances_,
            'fold': [fold_ + 1] * len(full_train.columns),
        })
        importances = pd.concat([importances, imp_df], axis=0, sort=False)

    score = multi_weighted_logloss(y_true=y,
                                   y_preds=oof_preds,
                                   classes=classes,
                                   class_weights=class_weights)
    print('MULTI WEIGHTED LOG LOSS: {:.5f}'.format(score))
    df_importances = save_importances(importances_=importances)
    df_importances.to_csv('lgbm_importances.csv', index=False)

    cnf = confusion_matrix(y, np.argmax(oof_preds, axis=1))
    plot_confusion_matrix(cnf,
                          classes=classes,
                          normalize=True,
                          filename="lgbm")

    return clfs, score, oof_preds
Example #13
''' for each day build a model '''
for i in tqdm(range(start - 1, end, 1)):

    output = ''
    ''' import dataframe '''
    train = pd.read_csv(train_path[i],
                        dtype=train_dtypes,
                        usecols=train_fields)
    X_train = train.loc[:, train.columns != 'skip_2']
    y_train = train['skip_2']

    X_test = pd.read_csv(test_path[i], dtype=test_dtypes, usecols=test_fields)

    clf = LGBMClassifier(n_estimators=100,
                         objective='binary',
                         learning_rate=0.05,
                         n_jobs=-1,
                         random_state=42)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    ''' file format '''
    position = X_test['session_position'].tolist()
    length = X_test['session_length'].tolist()

    for pos in range(len(y_pred)):
        output += str(y_pred[pos])
        if position[pos] == length[pos]:
            output += '\n'
    '''save file'''
    output_path = 'output_1224/' + date_of_name[i] + '.txt'
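    # Hedged completion of the '''save file''' step (not in the original snippet): write the
    # accumulated prediction string to the per-day file, assuming the output_1224 directory exists.
    with open(output_path, 'w') as f:
        f.write(output)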
Example #14
X_test = test.drop(columns='id')

from sklearn.preprocessing import StandardScaler
_ = StandardScaler().fit_transform(X_test)
X_test = pd.DataFrame(_, columns=X_test.columns)

clf_feature_selection = LGBMClassifier(boosting_type='gbdt',
                                       class_weight='balanced',
                                       colsample_bytree=1.0,
                                       importance_type='split',
                                       learning_rate=0.1,
                                       max_depth=10,
                                       min_child_samples=20,
                                       min_child_weight=0.001,
                                       min_split_gain=0.01,
                                       n_estimators=200,
                                       n_jobs=-1,
                                       num_leaves=31,
                                       objective='binary',
                                       random_state=42,
                                       reg_alpha=0.5,
                                       reg_lambda=0,
                                       silent=True,
                                       subsample=1.0,
                                       subsample_for_bin=200000,
                                       subsample_freq=0)

rfecv = RFECV(estimator=clf_feature_selection, step=1, cv=5, scoring='roc_auc')
params = {
    'random_state': [42],
    'objective': ['binary'],
    'class_weight': ['balanced', None],
Example #15
real = pd.read_pickle(os.path.join(data_dir, "eval_real.p"))
train_users = pd.read_pickle(os.path.join(data_dir, "train_users.p"))
test_users = pd.read_pickle(os.path.join(data_dir, "test_users.p"))
trainset = pd.read_pickle(os.path.join(data_dir, "train.p"))
evalset = pd.read_pickle(os.path.join(data_dir, "eval.p"))

real = real.loc[train_users]
evalset = evalset[evalset.user_id.isin(train_users)]

mds = [3, 5, 8, 10]
eval_ps = [1 / i for i in range(1, 20)]
res = pd.DataFrame([], index=mds, columns=eval_ps)

for md in mds:
    learner = LGBMClassifier(n_estimators=10000, max_depth=md)

    learner.fit(trainset.drop("reordered", axis=1),
                trainset.reordered,
                eval_metric="auc",
                early_stopping_rounds=10,
                eval_set=[(trainset.drop("reordered",
                                         axis=1), trainset.reordered),
                          (evalset.drop("reordered",
                                        axis=1), evalset.reordered)])

    preds = learner.predict_proba(evalset.drop("reordered", axis=1))[:, -1]

    for p in eval_ps:
        ppreds = evalset[preds > p]
        ppreds = ppreds.groupby("user_id").product_id.apply(set)
Example #16
def kfold_lightgbm(df, num_folds, stratified=False, debug=False):
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    del df
    gc.collect()
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=47)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=47)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [
        f for f in train_df.columns if f not in
        ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']
    ]

    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[
            'TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[
            'TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=4,
            #is_unbalance=True,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=32,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.04,
            reg_lambda=0.073,
            min_split_gain=0.0222415,
            min_child_weight=40,
            silent=-1,
            verbose=-1,
            #scale_pos_weight=11
        )

        clf.fit(train_x,
                train_y,
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric='auc',
                verbose=100,
                early_stopping_rounds=200)

        oof_preds[valid_idx] = clf.predict_proba(
            valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(
            test_df[feats],
            num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' %
              (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # Write submission file and plot feature importance
    if not debug:
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name,
                                                 index=False)
    display_importances(feature_importance_df)
    return feature_importance_df
Example #17
 def __buildModel__(self,modelParams):
     
     # note that lgbm does not support multi-output classification, hence the MultiOutputClassifier wrapper.
     self.model=MultiOutputClassifier(LGBMClassifier(n_jobs=-1,
                                                     **modelParams))
     
Example #18
# print(train_y)

# Define the feature labels
feat_labels = train_x.columns[:]
# print(feat_labels)

# Split into training and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_x,
                                                    train_y,
                                                    test_size=0.2,
                                                    random_state=0)

# Modeling / training
from lightgbm import LGBMClassifier
LGBM = LGBMClassifier()
LGBM.fit(X_train, y_train, verbose=True)

# Standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
test_x_scaled = scaler.transform(test_x)
print(X_train_scaled)

# Visualization
import matplotlib.pyplot as plt
plt.hist(X_train_scaled)
plt.title('StandardScaler')
Example #19
GB_Classifier.fit(X_train, Y_train)

# AdaBoost
AD_Classifier = ske.AdaBoostClassifier(n_estimators=100)
AD_Classifier.fit(X_train, Y_train)

# GaussianNB
GS_Classifier = GaussianNB()
GS_Classifier.fit(X_train, Y_train)

# XGBoost
XGB_Classifier = xgb.XGBClassifier()
XGB_Classifier.fit(X_train, Y_train)

# LightGBM
lgbm_Classifier = LGBMClassifier()
lgbm_Classifier.fit(X_train, Y_train)

models = []
models.append(('Naive Bayes Classifier', BNB_Classifier))
models.append(('Decision Tree Classifier', DTC_Classifier))
models.append(('KNeighborsClassifier', KNN_Classifier))
models.append(('LogisticRegression', LGR_Classifier))
models.append(('RandomForest', RD_Classifier))
models.append(('GradientBoosting', GB_Classifier))
models.append(('AdaBoost', AD_Classifier))
models.append(('GaussianNB', GS_Classifier))
models.append(('XGBoost', XGB_Classifier))
models.append(('LightGBM', lgbm_Classifier))

for i, v in models:
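    # Hedged sketch of the missing loop body (not in the original snippet): report each
    # fitted model's training-set accuracy as a quick comparison.
    print('{} train accuracy: {:.4f}'.format(i, v.score(X_train, Y_train)))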
Example #20
    xgb.fit(x_train, y_train)
    test_xg_prob = xgb.predict_proba(x_test)
    train_xg_prob = xgb.predict_proba(x_train)
    print('xgboost training-set log loss', log_loss(y_train, train_xg_prob))
    print('xgboost test-set log loss', log_loss(y_test, test_xg_prob))
    time2_1 = time.time()
    print('xgboost computation time', time2_1 - time2_0)

# Build the model with LightGBM
if flag == 3 or flag == 0:
    print("开始lgbm训练")
    time3_0 = time.time()
    lgb = LGBMClassifier(objective='binary',
                         learning_rate=0.02,
                         n_estimators=100,
                         num_leaves=45,
                         max_depth=12,
                         colsample_bytree=0.8,
                         min_child_samples=14,
                         subsample=0.9)
    lgb.fit(x_train, y_train)
    test_lgb_prob = lgb.predict_proba(x_test)
    train_lgb_prob = lgb.predict_proba(x_train)
    print('lightgbm training-set log loss', log_loss(y_train, train_lgb_prob))
    print('lightgbm test-set log loss', log_loss(y_test, test_lgb_prob))
    time3_1 = time.time()
    print('lightgbm computation time', time3_1 - time3_0)
'''
# Output validation-set results for online testing
import getFearures01
path_test = '../data/round1_ijcai_18_test_b_20180418.txt'
test_df = getFearures01.cpfeature(path_test)
Example #21
    'Arbitration_ID', 'Data0', 'Data1', 'Data2', 'Data3', 'Data4', 'Data5',
    'Data6', 'Data7'
]]
#test_ys = test_s_df['Class']
'''
scaler = StandardScaler()
scaler.fit(df_x)
scaler.fit(test_x)
x_test_scaled = scaler.transform(test_x)
x_scaled = scaler.transform(df_x)
'''

model_d = LGBMClassifier(random_state=0,
                         metric='binary_error',
                         boosting_type='gbdt',
                         learning_rate=0.1,
                         n_estimators=100,
                         num_leaves=16,
                         objective='binary')
model_d.fit(df_xd, df_yd, verbose=2)
pred_yd = model_d.predict(test_xd)

model_s = LGBMClassifier(random_state=0,
                         metric='binary_error',
                         boosting_type='gbdt',
                         learning_rate=0.1,
                         n_estimators=100,
                         num_leaves=16,
                         objective='binary')
model_s.fit(df_xs, df_ys, verbose=2)
pred_ys = model_s.predict(test_xs)
Example #22
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

from lightgbm import LGBMClassifier
import concurrent.futures
import numpy as np
import sys
sys.path.insert(1,'../paragrid')
#%%

from paradec import parallel
@parallel
def ml_model(X,y, model):
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.33,
                                                        random_state=42)
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    return np.sum(y_pred==y_test)/len(y_test)
# spaces
space_gpdt = {'learning_rate': [0.001, 0.1, ],
              'n_estimators': [2, 70, 5],
              'max_depth': [2, 50, 4]}


# Classification
breast_cancer = load_breast_cancer()
X, y = breast_cancer.data, breast_cancer.target    

args  = [[X,y,LGBMClassifier(n_estimators=i)] for i in [5,10,15,25]]
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = executor.map(ml_model, args)
for i in results:
    print(i)
Example #23
def fill_nan_lgbm(dataframe, feature, target):

    from sklearn import preprocessing
    from lightgbm import LGBMClassifier
    from lightgbm import LGBMRegressor
    from sklearn.model_selection import KFold
    PREDICT_NAME = 'predict'

    df = dataframe
    df = df.reset_index(drop=True)

    feature_list = [
        f_ for f_ in dataframe.columns if f_ != feature and f_ != target
    ]
    train = df[feature_list]
    train, _ = one_hot_encoder(train, True)
    for f_ in train.columns:
        train, _ = fill_nan_mean(train, f_, intern=True)

    train = pd.concat([train, df[feature]], axis=1)
    train_df = train[train[feature].notnull()].drop(feature, axis=1)
    train_target = train.loc[train[feature].notnull(), feature]
    test_df = train[train[feature].isnull()].drop(feature, axis=1)
    test_target = train[train[feature].isnull()][[feature]]
    valid = train[train[feature].notnull()][[feature]]
    valid[PREDICT_NAME] = 0
    valid.reset_index(inplace=True)
    folds = KFold(n_splits=5, shuffle=True, random_state=1001)

    for n_fold, (train_idx,
                 valid_idx) in enumerate(folds.split(train_df, train_target)):
        train_x, train_y = train_df.iloc[train_idx], train_target.iloc[
            train_idx]
        valid_x, valid_y = train_df.iloc[valid_idx], train_target.iloc[
            valid_idx]

        # Fix new label in valid_y
        if train_y.dtype == 'object':
            train_y_value_list = train_y.unique()
            train_y_value_mode = train_y.mode().values[0]
            valid_y = valid_y.apply(lambda x: x if x in train_y_value_list\
                                    else train_y_value_mode)

        lgbm = 0
        if train_target.dtype == 'object':
            lgbm = LGBMClassifier(
                nthread=4,
                n_estimators=1000,
                learning_rate=0.02,
                #            num_leaves=34,
                num_leaves=6,
                colsample_bytree=0.95,
                subsample=0.87,
                #            max_depth=8,
                reg_alpha=0.04,
                reg_lambda=0.07,
                min_split_gain=0.02,
                min_child_weight=40,
                silent=-1,
                verbose=-1,
            )

        else:
            lgbm = LGBMRegressor(
                nthread=4,
                n_estimators=1000,
                learning_rate=0.02,
                #            num_leaves=34,
                num_leaves=6,
                colsample_bytree=0.95,
                subsample=0.87,
                #            max_depth=8,
                reg_alpha=0.04,
                reg_lambda=0.07,
                min_split_gain=0.02,
                min_child_weight=40,
                silent=-1,
                verbose=-1,
            )

        debug('++++++++++++++++++++LGBM++++++++++' + feature +
              '+++++++++++++++++++++++++++++++++')
        lgbm.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], \
                                             verbose= 1000, early_stopping_rounds= 200)
        valid.loc[valid_idx, [PREDICT_NAME]] = lgbm.predict(valid_x)
        if test_target.shape[0] > 0:
            test_target.loc[:, PREDICT_NAME] = lgbm.predict(test_df)

    acc = fill_na_performance(valid, feature, PREDICT_NAME)
    if test_target.shape[0] > 0:
        for i in test_target.index.values:
            df.loc[i, feature] = test_target.loc[i, PREDICT_NAME]

    trace('fill_nan_lgbm ' + feature + ' acc: ' + str(acc))
    return df, acc
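
# Hedged usage sketch (not part of the original function): impute one categorical column with
# the helper above; the frame name application_df and the column names are illustrative assumptions.
df_imputed, imputation_acc = fill_nan_lgbm(application_df, feature='OCCUPATION_TYPE', target='TARGET')
print('imputation accuracy:', imputation_acc)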
Example #24
lgb_params4 = {}
lgb_params4['n_estimators'] = 1450
lgb_params4['max_bin'] = 20
lgb_params4['max_depth'] = 6
lgb_params4['learning_rate'] = 0.25  # shrinkage_rate
lgb_params4['boosting_type'] = 'gbdt'
lgb_params4['objective'] = 'binary'
lgb_params4['min_data'] = 500  # min_data_in_leaf
lgb_params4['min_hessian'] = 0.05  # min_sum_hessian_in_leaf
lgb_params4['num_leaves'] = 64
lgb_params4['verbose'] = 0
lgb_params4['device'] = 'gpu'
lgb_params4['gpu_platform_id'] = 0
lgb_params4['gpu_device_id'] = 0

lgb_model = LGBMClassifier(**lgb_params)

lgb_model2 = LGBMClassifier(**lgb_params2)

lgb_model3 = LGBMClassifier(**lgb_params3)

lgb_model4 = LGBMClassifier(**lgb_params4)

log_model = LogisticRegression()

stack = Ensemble(n_splits=5,
                 stacker=log_model,
                 base_models=(lgb_model, lgb_model2, lgb_model3, lgb_model4))

y_pred = stack.fit_predict(train, target_train, test)
Example #25
                             min_samples_leaf=10,
                             n_estimators=300,
                             verbose=True)
clf.fit(train_x, train_y)
selector = SelectFromModel(clf, prefit=True)
new_train_x = selector.transform(train_x)

# Train stacking model
rf = RandomForestClassifier(max_depth=6,
                            random_state=0,
                            min_samples_leaf=10,
                            n_estimators=300)
lgb1 = LGBMClassifier(n_estimators=400,
                      num_leaves=50,
                      max_depth=6,
                      learning_rate=0.03,
                      subsample=0.8,
                      reg_alpha=1.0,
                      reg_lambda=0.5,
                      n_jobs=6)
lgb2 = LGBMClassifier(n_estimators=300,
                      num_leaves=60,
                      max_depth=3,
                      learning_rate=0.07,
                      subsample=0.8,
                      reg_alpha=0,
                      reg_lambda=1,
                      n_jobs=6)
estimators = [('rf', rf), ('lgbt1', lgb1), ('lgbt2', lgb2)]
clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=400,
Example #26
                                      min_child_weight=10,
                                      zero_as_missing=True,
                                      learning_rate=0.01,
                                      num_leaves=100,
                                      feature_fraction=0.7,
                                      bagging_fraction=0.7,
                                      n_estimators=800,
                                      n_jobs=-1,
                                      min_child_samples=30)
            else:
                model = LGBMClassifier(reg_alpha=0.3,
                                       reg_lambda=0.1,
                                       min_child_weight=10,
                                       zero_as_missing=True,
                                       learning_rate=0.01,
                                       num_leaves=100,
                                       feature_fraction=0.7,
                                       bagging_fraction=0.7,
                                       n_estimators=800,
                                       n_jobs=-1,
                                       min_child_samples=30)

            train_x, test_x, train_y, test_y = df_X[train], df_X[test], df_y[
                train], df_y[test]
            # train_test_split(df_X, df_y, test_size=0.15)

            model.fit(train_x,
                      train_y,
                      eval_set=(test_x, test_y),
                      early_stopping_rounds=7)
            models.append(model)
Example #27
def get_model_from_name(model_name, training_params=None, is_hp_search=False):
    # For Keras
    epochs = 1000
    # if os.environ.get('is_test_suite', 0) == 'True' and model_name[:12] == 'DeepLearning':
    #     print('Heard that this is the test suite. Limiting number of epochs, which will increase '
    #           'training speed dramatically at the expense of model accuracy')
    #     epochs = 100

    all_model_params = {
        'LogisticRegression': {},
        'RandomForestClassifier': {
            'n_jobs': -2,
            'n_estimators': 30
        },
        'ExtraTreesClassifier': {
            'n_jobs': -1
        },
        'AdaBoostClassifier': {},
        'SGDClassifier': {
            'n_jobs': -1
        },
        'Perceptron': {
            'n_jobs': -1
        },
        'LinearSVC': {
            'dual': False
        },
        'LinearRegression': {
            'n_jobs': -2
        },
        'RandomForestRegressor': {
            'n_jobs': -2,
            'n_estimators': 30
        },
        'LinearSVR': {
            'dual': False,
            'loss': 'squared_epsilon_insensitive'
        },
        'ExtraTreesRegressor': {
            'n_jobs': -1
        },
        'MiniBatchKMeans': {
            'n_clusters': 8
        },
        'GradientBoostingRegressor': {
            'presort': False,
            'learning_rate': 0.1,
            'warm_start': True
        },
        'GradientBoostingClassifier': {
            'presort': False,
            'learning_rate': 0.1,
            'warm_start': True
        },
        'SGDRegressor': {
            'shuffle': False
        },
        'PassiveAggressiveRegressor': {
            'shuffle': False
        },
        'AdaBoostRegressor': {},
        'LGBMRegressor': {
            'n_estimators': 2000,
            'learning_rate': 0.15,
            'num_leaves': 8,
            'lambda_l2': 0.001,
            'histogram_pool_size': 16384
        },
        'LGBMClassifier': {
            'n_estimators': 2000,
            'learning_rate': 0.15,
            'num_leaves': 8,
            'lambda_l2': 0.001,
            'histogram_pool_size': 16384
        },
        'DeepLearningRegressor': {
            'epochs': epochs,
            'batch_size': 50,
            'verbose': 2
        },
        'DeepLearningClassifier': {
            'epochs': epochs,
            'batch_size': 50,
            'verbose': 2
        },
        'CatBoostRegressor': {},
        'CatBoostClassifier': {}
    }

    # if os.environ.get('is_test_suite', 0) == 'True':
    #     all_model_params

    model_params = all_model_params.get(model_name, None)
    if model_params is None:
        model_params = {}

    if is_hp_search is True:
        if model_name[:12] == 'DeepLearning':
            model_params['epochs'] = 50
        if model_name[:4] == 'LGBM':
            model_params['n_estimators'] = 500

    if training_params is not None:
        print('Now using the model training_params that you passed in:')
        print(training_params)
        # Overwrite our stock params with what the user passes in
        # (i.e., if the user wants 10,000 trees, we will let them do it)
        model_params.update(training_params)
        print(
            'After overwriting our defaults with your values, here are the final params that will '
            'be used to initialize the model:')
        print(model_params)

    model_map = {
        # Classifiers
        'LogisticRegression': LogisticRegression(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RidgeClassifier': RidgeClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'LinearSVC': LinearSVC(),

        # Regressors
        'LinearRegression': LinearRegression(),
        'RandomForestRegressor': RandomForestRegressor(),
        'Ridge': Ridge(),
        'LinearSVR': LinearSVR(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'RANSACRegressor': RANSACRegressor(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'Lasso': Lasso(),
        'ElasticNet': ElasticNet(),
        'LassoLars': LassoLars(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'BayesianRidge': BayesianRidge(),
        'ARDRegression': ARDRegression(),

        # Clustering
        'MiniBatchKMeans': MiniBatchKMeans(),
    }

    try:
        model_map['SGDClassifier'] = SGDClassifier(max_iter=1000, tol=0.001)
        model_map['Perceptron'] = Perceptron(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier(
            max_iter=1000, tol=0.001)
        model_map['SGDRegressor'] = SGDRegressor(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor(
            max_iter=1000, tol=0.001)
    except TypeError:
        model_map['SGDClassifier'] = SGDClassifier()
        model_map['Perceptron'] = Perceptron()
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier(
        )
        model_map['SGDRegressor'] = SGDRegressor()
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor()

    if xgb_installed:
        model_map['XGBClassifier'] = XGBClassifier()
        model_map['XGBRegressor'] = XGBRegressor()

    if lgb_installed:
        model_map['LGBMRegressor'] = LGBMRegressor()
        model_map['LGBMClassifier'] = LGBMClassifier()

    if catboost_installed:
        model_map['CatBoostRegressor'] = CatBoostRegressor()
        model_map['CatBoostClassifier'] = CatBoostClassifier()

    if model_name[:12] == 'DeepLearning':
        if keras_installed is False:
            # Suppress some level of logs if TF is installed (but allow it to not be installed,
            # and use Theano instead)
            try:
                os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '3'
                os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
                from tensorflow import logging
                logging.set_verbosity(logging.INFO)
            except:
                # TODO: Fix bare Except
                pass

        model_map['DeepLearningClassifier'] = KerasClassifier(
            build_fn=make_deep_learning_classifier)
        model_map['DeepLearningRegressor'] = KerasRegressor(
            build_fn=make_deep_learning_model)

    try:
        model_without_params = model_map[model_name]
    except KeyError as e:
        print(
            'It appears you are trying to use a library that is not available when we try to '
            'import it, or using a value for model_names that we do not recognize.'
        )
        raise e

    if os.environ.get('is_test_suite', False) == 'True':
        if 'n_jobs' in model_params:
            model_params['n_jobs'] = 1
    model_with_params = model_without_params.set_params(**model_params)

    return model_with_params
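
# Hedged usage sketch (not in the original function): build a configured LightGBM classifier by
# name; the n_estimators override is an arbitrary illustration value.
clf = get_model_from_name('LGBMClassifier', training_params={'n_estimators': 500})
print(clf)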
Example #28
import shap
import os
import time

from matplotlib import pyplot as plt
import matplotlib
matplotlib.use('TkAgg')

lgbm = LGBMClassifier(boosting_type='gbdt',
                      num_leaves=31,
                      max_depth=-1,
                      learning_rate=0.001,
                      n_estimators=2000,
                      objective=None,
                      min_split_gain=0,
                      min_child_weight=3,
                      min_child_samples=10,
                      subsample=0.8,
                      subsample_freq=1,
                      colsample_bytree=0.7,
                      reg_alpha=0.3,
                      reg_lambda=0,
                      seed=17)

path_to_features = r'C:\Users\kotov-d\Documents\TASKS\feature_selection\\features_to_calc'
path_to_calculated = r'\\Zstorage\!z\Shuranov\calculated_features'

for xXx in os.listdir(path_to_features):
    print(xXx[:-4])
    start = time.time()
    with open(os.path.join(path_to_features, xXx), "rb") as f:
Example #29
    'mean_word_len'
]]

feature_inputs = X_data.columns
label = data['encoded_sentiment']
X_train, X_test, y_train, y_test = train_test_split(X_data,
                                                    label,
                                                    test_size=0.3,
                                                    random_state=42)

clf = LGBMClassifier(
    n_estimators=10000,
    learning_rate=0.10,
    num_leaves=30,
    subsample=.9,
    max_depth=7,
    reg_alpha=.1,
    reg_lambda=.1,
    min_split_gain=.01,
    min_child_weight=2,
    silent=-1,
    verbose=-1,
)

print('starting training')
LGBM_model = clf.fit(X_train, y_train)
filename = 'lgbm.pkl'
with open(filename, 'wb') as file:
    pickle.dump(LGBM_model, file)

print('Model saved successfully')
Example #30
def trainModel(x,y):

    # Custom evaluation metric for lightgbm/xgboost
    def self_metric(y_true, y_pred):
        score = f1_score(y_true, 1*(y_pred>=0.5))  # y_pred arrives as probabilities, so binarize at 0.5 before computing F1
        return 'f1', score, False

    from sklearn.ensemble import BaggingClassifier
    params = {"num_leaves":81, "n_estimators":100, "learning_rate":0.2,#绝对需要的参数
              "subsample":0.9,"class_weight":{1:1,0:1},"reg_lambda":2 #仅做尝试
              }
    x_train,x_test,y_train,y_test = train_test_split(x,y,train_size=0.8,random_state=345)
    lgb = LGBMClassifier(**params)
    """
    boosting_type='gbdt', 默认是 gdbt 梯度提升树 dart dropout和MArt的结合 后者就是多层加法树模型(multiple additive regression tree)goss(基于梯度的单侧采样) rf 随机森林
    num_leaves=31, 基础学习器最多的叶子节点
    max_depth=-1,  基础学习器的最大树深  小于等于0意味着没有限制
    learning_rate=0.1, boosting的缩放系数
    n_estimators=100, 学习器的数量
    subsample_for_bin=200000, 多少样本构建分箱
    objective=None, 指定具体的任务类型 如果是是分类就是 binary或者multiple 回归就是regression 排序就是lambdarank
    class_weight=None, 样本权重 不同类别的样本权重可能不一样
    min_split_gain=0., 
    min_child_weight=1e-3, 
    min_child_samples=20,
    subsample=1., 
    subsample_freq=0, 
    colsample_bytree=1.,
    reg_alpha=0., 
    reg_lambda=0.,
    random_state=None,
    n_jobs=-1, 
    silent=True, 
    importance_type='split'
    
    
    """
    # model = BaggingClassifier(base_estimator=lg, n_estimators=100, max_samples=0.8, max_features=0.8)
    model  = lgb
    model.fit(x_train, y_train, eval_metric=self_metric, eval_set=[(x_train, y_train),(x_test, y_test)]) # F1 is not a built-in metric, so the custom one is used
    # model.fit(x_train, y_train)
    """
    sample_weight=None, 
    init_score=None,
    eval_set=None, 
    eval_names=None, 
    eval_sample_weight=None,
    eval_class_weight=None, 
    eval_init_score=None, 
    eval_metric=None,
    early_stopping_rounds=None,  正常的是应该在测试集效果越来越好,如果连续n轮效果越来越差 就提前结束训练
    verbose=True,
    feature_name='auto', 
    categorical_feature='auto', 
    callbacks=None
    
    
    
    """

    """
    如果设置early_stopping这个参数 那么要将迭代的轮数回传给模型,把迭代效果不好的轮次不要掉 model.n_estimators = model.best_iteration_
    
    """

    # Key step: pick a sensible threshold to drive the F1 score
    # TODO: for more precision, sweep many candidate thresholds (2000+) and compute F1 directly for each to find the best one
    pre_train = model.predict_proba(x_train)[:,1]
    pre_test = model.predict_proba(x_test)[:,1]
    fpr, tpr, thresholds = roc_curve(y_train, pre_train)
    thre_index = (tpr - fpr).argmax()
    thres = thresholds[thre_index]


    print("训练集阈值",thres)
    pre_train = 1*(pre_train>=thres)
    pre_test = 1 * (pre_test >= thres)
    print("train f1_score",f1_score(y_train, pre_train))
    print("test f1_score", f1_score(y_test, pre_test))
    print("train recall_score",recall_score(y_train, pre_train))
    print("test recall_score", recall_score(y_test, pre_test))
    print("train precision_score",precision_score(y_train, pre_train))
    print("test precision_score", precision_score(y_test, pre_test))
    return model,thres
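
# Hedged usage sketch (not part of the original function): train, then apply the learned
# threshold to new data; x, y and new_x are assumed to be prepared feature/label arrays.
model, thres = trainModel(x, y)
new_pred = 1 * (model.predict_proba(new_x)[:, 1] >= thres)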