def lgb_feature_selection(fe_name, matrix_x_temp, label_y, th):
    # SelectFromModel
    clf = LGBMClassifier(n_estimators=400)
    clf.fit(matrix_x_temp, label_y)
    sfm = SelectFromModel(clf, prefit=True, threshold=th)
    matrix_x = sfm.transform(matrix_x_temp)

    # Count how many features have non-zero importance
    feature_score_dict = {}
    for fn, s in zip(fe_name, clf.feature_importances_):
        feature_score_dict[fn] = s
    m = 0
    for k in feature_score_dict:
        if feature_score_dict[k] == 0.0:
            m += 1
    print('number of not-zero features:' + str(len(feature_score_dict) - m))

    # Print the feature importances
    feature_score_dict_sorted = sorted(feature_score_dict.items(),
                                       key=lambda d: d[1], reverse=True)
    print('feature_importance:')
    for ii in range(len(feature_score_dict_sorted)):
        print(feature_score_dict_sorted[ii][0], feature_score_dict_sorted[ii][1])
    print('\n')

    f = open('../eda/lgb_feature_importance.txt', 'w')
    f.write(str(th))  # th may be a float or a "scale*mean" string, so cast it
    f.write('\nRank\tFeature Name\tFeature Importance\n')
    for i in range(len(feature_score_dict_sorted)):
        f.write(str(i) + '\t' + str(feature_score_dict_sorted[i][0]) + '\t' +
                str(feature_score_dict_sorted[i][1]) + '\n')
    f.close()

    # Print exactly which features were kept
    how_long = matrix_x.shape[1]  # matrix_x is the input matrix after feature selection
    feature_used_dict_temp = feature_score_dict_sorted[:how_long]
    feature_used_name = []
    for ii in range(len(feature_used_dict_temp)):
        feature_used_name.append(feature_used_dict_temp[ii][0])
    print('feature_chosen:')
    for ii in range(len(feature_used_name)):
        print(feature_used_name[ii])
    print('\n')

    f = open('../eda/lgb_feature_chose.txt', 'w')
    f.write('Feature Chose Name :\n')
    for i in range(len(feature_used_name)):
        f.write(str(feature_used_name[i]) + '\n')
    f.close()

    # Collect the names of the features that were not used
    feature_not_used_name = []
    for i in range(len(fe_name)):
        if fe_name[i] not in feature_used_name:
            feature_not_used_name.append(fe_name[i])

    return matrix_x, feature_not_used_name[:], len(feature_used_name)
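# A minimal usage sketch of the selector above (not from the original source):
# synthetic data, hypothetical names, and an illustrative "0.5*mean" threshold
# string (a form scikit-learn's SelectFromModel accepts). Note the function
# writes report files into ../eda/, which must exist.
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.feature_selection import SelectFromModel

rng = np.random.RandomState(0)
X_demo = rng.rand(200, 10)                    # 200 samples, 10 features
y_demo = (X_demo[:, 0] > 0.5).astype(int)     # label driven by feature 0
names = ['f%d' % i for i in range(10)]
X_sel, unused, n_used = lgb_feature_selection(names, X_demo, y_demo, '0.5*mean')
print(n_used, 'features kept;', len(unused), 'dropped')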
def main():
    # load the data
    print('\nloading...')
    wd = '/Users/ewenwang/Documents/credit/data'
    os.chdir(wd)
    dataFile = 'creditcard.csv'
    dataset = pd.read_csv(dataFile, low_memory=False)

    # set target and predictors
    target = 'Class'
    predictors = [x for x in dataset.columns if x not in [target]]

    # split the data into training and test sets
    seed = 2017
    dtrain, dtest = train_test_split(dataset, test_size=0.33, random_state=seed)

    # build the classifier
    gbm = LGBMClassifier(
        learning_rate=0.01,
        n_estimators=5000,
        objective='binary',
        metric='auc',
        max_depth=10,
        subsample=0.83,
        colsample_bytree=0.63,
        save_binary=True,
        is_unbalance=True,
        random_state=seed)

    # train the model
    print('\nfitting...')
    gbm.fit(dtrain[predictors], dtrain[target])

    # report
    report(gbm, dtrain, dtest, predictors, target)
    return None
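# `report` is not defined in this snippet; a minimal sketch of a compatible
# helper (train/test AUC from the fitted sklearn-style classifier above) --
# the real implementation may differ:
from sklearn.metrics import roc_auc_score

def report(model, dtrain, dtest, predictors, target):
    for name, d in [('train', dtrain), ('test', dtest)]:
        proba = model.predict_proba(d[predictors])[:, 1]
        print('%s AUC: %.6f' % (name, roc_auc_score(d[target], proba)))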
#     param_distributions={
#         "n_estimators": np.arange(10, 100, 10),
#         "learning_rate": np.arange(0.1, 1, 0.1)
#     },
#     n_iter=1,
#     scoring=SCORING,
#     refit=SCORING[0],
#     cv=CV,
#     return_train_score=False,
#     random_state=RANDOM_STATE
# )
# model.add_model("AdaBoost", adb_rs)

# 4. LightGBM
lgbm_rs = RandomizedSearchCV(
    estimator=LGBMClassifier(),
    param_distributions={
        # "n_estimators": np.arange(40, 80, 10),
        # "learning_rate": np.arange(0.1, 0.4, 0.1),
        # "max_depth": np.arange(30, 60, 10),
        "n_estimators": [60],
        "learning_rate": [0.2],
        "max_depth": [60],
    },
    n_iter=1,
    scoring=SCORING,
    refit=SCORING[0],
    cv=CV,
    return_train_score=False,
    random_state=RANDOM_STATE
)
data = train_set[['label_list', 'course_vecs2']]
labels = data['label_list'].values.tolist()
y = [item for elem in labels for item in elem]
course_info = data['course_vecs2'].values.tolist()
course_list = [item for elem in course_info for item in elem]

model_lgb = LGBMClassifier(boosting_type='gbdt',
                           num_leaves=64,
                           learning_rate=0.01,
                           n_estimators=2000,
                           max_bin=425,
                           subsample_for_bin=50000,
                           objective='binary',
                           min_split_gain=0,
                           min_child_weight=5,
                           min_child_samples=10,
                           subsample=0.8,
                           subsample_freq=1,
                           colsample_bytree=1,
                           reg_alpha=3,
                           reg_lambda=5,
                           seed=1000,
                           n_jobs=-1,
                           silent=True)
# note: early stopping is evaluated on the training set itself here, so it
# will rarely trigger; a held-out eval_set gives meaningful early stopping
model_lgb.fit(course_list, y,
              eval_names=['train'],
              eval_metric=['logloss', 'auc'],
              eval_set=[(course_list, y)],
              early_stopping_rounds=10)
import numpy
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from skl2onnx import convert_sklearn                    # needed for the conversion below
from skl2onnx.common.data_types import FloatTensorType  # ditto

data = load_iris()
X = data.data[:, :2]
y = data.target

ind = numpy.arange(X.shape[0])
numpy.random.shuffle(ind)
X = X[ind, :].copy()
y = y[ind].copy()

pipe = Pipeline([('scaler', StandardScaler()),
                 ('lgbm', LGBMClassifier(n_estimators=1, max_depth=1))])
pipe.fit(X, y)

##################################
# The conversion happens here and fails.

try:
    model_onnx = convert_sklearn(pipe, 'pipeline',
                                 [('input', FloatTensorType([1, 2]))])
except Exception as e:
    print(e)

###################################
# *sklearn-onnx* needs to know the appropriate converter
# for class *LGBMClassifier*, the converter needs to be registered.
# The converter comes with two pieces: a shape calculator which
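# The registration step the comment above describes looks roughly like this
# (a sketch following the skl2onnx documentation: a classifier shape
# calculator plus the LightGBM converter shipped with onnxmltools):
from skl2onnx import update_registered_converter
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import convert_lightgbm

update_registered_converter(
    LGBMClassifier, 'LightGbmLGBMClassifier',
    calculate_linear_classifier_output_shapes, convert_lightgbm,
    options={'nocl': [True, False], 'zipmap': [True, False]})
# After registration, the same convert_sklearn call above succeeds.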
params = {
    'n_estimators': 376,
    'reg_alpha': 1.05,
    'reg_lambda': 2.53,
    'objective': 'multiclass',
    'boosting_type': 'gbdt',
    'subsample': 0.7,
    'random_state': 42,
    'colsample_bytree': 0.7
}

X_train, X_test, y_train, y_test = train_test_split(ttrain, train['Segmentation'],
                                                    test_size=0.2, random_state=42)
clf = LGBMClassifier(**params)
clf.fit(X_train, y_train)
p = clf.predict(X_test)
accuracy_score(y_test, p)
confusion_matrix(y_test, p)
plt.barh(colx, clf.feature_importances_)

clf.fit(ttrain, train["Segmentation"])
pred = clf.predict(ttest)
testx["Segmentation"] = pred
test = pd.merge(test, testx[['ID', 'Segmentation']], on='ID', how='left')
test["Segmentation"] = np.where(test["Segmentation_x"].isnull(),
                                test["Segmentation_y"],
                                test["Segmentation_x"])
# use a forward slash (or os.path.join); "\o" in the original was not a valid
# path separator and is fragile across platforms
test[["ID", "Segmentation"]].to_csv(link + "/output.csv", index=False)
X = final_train[cols]
y = final_train['like']

folds = KFold(n_splits=10, shuffle=True, random_state=123)
oof_preds = np.zeros(final_train.shape[0])
sub_preds = np.zeros(final_test.shape[0])

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(X)):
    trn_x, trn_y = X.iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    print(trn_x.shape, trn_y.shape, val_x.shape, val_y.shape)

    clf = LGBMClassifier(
        n_estimators=2000,
        learning_rate=0.01,
        num_leaves=100,
        colsample_bytree=.8,
        subsample=.9,
        max_depth=20,
        reg_alpha=.1,
        reg_lambda=.1,
        min_split_gain=.01,
        min_child_weight=2
    )
    clf.fit(trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_metric='auc',
            verbose=500,
            early_stopping_rounds=400)

    oof_preds[val_idx] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]
    sub_preds += clf.predict_proba(final_test[cols], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
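# After the loop, the out-of-fold predictions give an overall CV score
# (a one-line addition, not in the original snippet):
print('Full OOF AUC : %.6f' % roc_auc_score(y, oof_preds))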
vals = [0.26, 0.27, 0.28, 0.29, .30]
lgb_test_preds = []
for val in vals:
    f1_threshold = val
    mod = LGBMClassifier(boosting_type='gbdt', class_weight=None,
                         colsample_bytree=0.5, learning_rate=0.1, max_depth=-1,
                         metric='None', min_child_samples=20, min_child_weight=20,
                         min_split_gain=0.0, n_estimators=10000, n_jobs=8,
                         num_leaves=30, objective=None, random_state=None,
                         reg_alpha=0.0, reg_lambda=0.0, silent=True,
                         subsample=1.0, subsample_for_bin=200000, subsample_freq=1)
    n = preds(df_train=fe_train, y=y, seed=100, df_test=fe_test, mod=mod)
    lgb_test_preds.append(
        (pd.Series(np.column_stack(n[1]).mean(axis=1)) > f1_threshold).astype('int'))
    print((pd.Series(np.column_stack(n[1]).mean(axis=1)) > f1_threshold).value_counts(1))
    print(pd.Series(n[0] > f1_threshold).value_counts(1))
def kfold_lightgbm(df, num_folds, stratified=False, debug=False):
    # Divide into training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    del df
    gc.collect()

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=47)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=47)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns
             if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']]

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=4,
            is_unbalance=False,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=30,
            colsample_bytree=0.05,
            subsample=1,
            max_depth=8,
            reg_alpha=0,
            reg_lambda=100,
            min_split_gain=0.5,
            min_child_weight=70,
            silent=-1,
            verbose=-1,
            max_bin=300,
            subsample_freq=1)
        clf.fit(train_x, train_y,
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric='auc', verbose=1000, early_stopping_rounds=200)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # Write submission file and plot feature importance
    if not debug:
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index=False)
    display_importances(feature_importance_df)
    return feature_importance_df
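# `display_importances` is not defined in this snippet; a minimal sketch of a
# compatible helper (mean importance across folds, top 40 plotted) -- the
# original implementation may differ:
import matplotlib.pyplot as plt
import seaborn as sns

def display_importances(feature_importance_df):
    cols = (feature_importance_df[["feature", "importance"]]
            .groupby("feature").mean()
            .sort_values(by="importance", ascending=False)[:40].index)
    best = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature",
                data=best.sort_values(by="importance", ascending=False))
    plt.title('LightGBM feature importance (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances.png')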
def classification(X_train, X_test, y_train, y_test, label_type, pca_dim=100):
    scaler = StandardScaler()
    scaler.fit(np.vstack([X_train, X_test]))
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    if label_type != 'labels':
        scaler = MinMaxScaler()
        temp1 = y_train.loc[:, ['valence', 'activation']]
        temp2 = y_test.loc[:, ['valence', 'activation']]
        temp = pd.concat((temp1, temp2))
        scaler.fit(temp.values)
        y_train = scaler.transform(temp1.values)
        y_test = scaler.transform(temp2.values)
    else:
        y_train = y_train.loc[:, 'cur_label'].values
        y_test = y_test.loc[:, 'cur_label'].values

    if pca_dim > 0:
        pca_model = PCA(n_components=min(pca_dim, X_train.shape[1])).fit(np.array(X_train))
        X_train = pca_model.transform(np.array(X_train))
        X_test = pca_model.transform(np.array(X_test))

    with open('train_test_data' + '.pickle', 'rb') as f:
        [train_data, test_data] = pickle.load(f)
    # with open('best_rf_cl' + '.pickle', 'rb') as f:
    #     clf = pickle.load(f)

    clf = LGBMClassifier(boosting_type='gbdt', num_leaves=31, max_depth=-1,
                         learning_rate=0.001, n_estimators=1000, objective=None,
                         min_split_gain=0, min_child_weight=3, min_child_samples=10,
                         subsample=0.8, subsample_freq=1, colsample_bytree=0.7,
                         reg_alpha=0.3, reg_lambda=0, seed=17)

    if label_type == 'valence':
        print('VALENCE')
        for i in combs:
            print('train {}, test {}'.format(cleaned(i[0]), cleaned(i[1])))
            [X_temp_train, y_temp_train] = cut_extreme_values(y_train[:, 0], X_train,
                                                              i[0][0], i[0][1])
            y_temp_train = [extreme_features(x) for x in y_temp_train]
            [X_temp_test, y_temp_test] = cut_extreme_values(y_test[:, 0], X_test,
                                                            i[1][0], i[1][1])
            y_temp_test = [extreme_features(x) for x in y_temp_test]
            clf.fit(X_temp_train, y_temp_train)
            y_pred = clf.predict(X_temp_test)
            print('f1_score= {}'.format(
                round(f1_score(y_pred, y_temp_test, average='macro'), 3)))
            # print(classification_report(y_pred, y_test))
            # y_test = [extreme_features(x) for x in y_test]
    elif label_type == 'arousal':
        print('AROUSAL')
        for i in combs:
            print('train {}, test {}'.format(cleaned(i[0]), cleaned(i[1])))
            [X_temp_train, y_temp_train] = cut_extreme_values(y_train[:, 1], X_train,
                                                              i[0][0], i[0][1])
            y_temp_train = [extreme_features(x) for x in y_temp_train]
            [X_temp_test, y_temp_test] = cut_extreme_values(y_test[:, 1], X_test,
                                                            i[1][0], i[1][1])
            y_temp_test = [extreme_features(x) for x in y_temp_test]
            clf.fit(X_temp_train, y_temp_train)
            y_pred = clf.predict(X_temp_test)
            print('f1_score= {}'.format(
                round(f1_score(y_pred, y_temp_test, average='macro'), 3)))
            # print(classification_report(y_pred, y_test))
    elif label_type == 'labels':
        # STRATIFICATION
        # X = np.hstack((X_train, y_train[:, np.newaxis]))
        # X_pd = pd.DataFrame(X)
        # X_new = np.zeros((1, X.shape[1]))
        # max_num = max(X_pd.iloc[:, -1].value_counts())
        # for label in np.unique(y_train):
        #     indexes_to_add = np.random.choice(a=X_pd[X_pd.iloc[:, -1] == label].index,
        #                                       size=(max_num,))
        #     X_additional = X_pd.loc[indexes_to_add, :]
        #     X_new = np.vstack((X_new, X_additional.values))
        # X_new = X_new[1:, :]
        # X_train = X_new[:, :-1]
        # y_train = X_new[:, -1]
        # ============================================================
        # shuffle
        combined = list(zip(X_train, y_train))
        random.shuffle(combined)
        X_train[:], y_train[:] = zip(*combined)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print('f1_score= {}'.format(f1_score(y_pred, y_test, average='macro')))
    else:
        raise Exception('label_type is mistaken')
def gbdt_lgb_cv_modeling():
    """
    :return:
    """
    '''Data input'''
    data_train = pd.read_csv('../data/train.csv', index_col='ID')
    data_predict = pd.read_csv('../data/pred.csv', index_col='ID')

    '''Feature engineering on the training data'''
    data_train_without_label = data_train.drop('Label', axis=1)
    # del data_train_without_label['V17']
    # data_train_without_label['V14×V17'] = data_train_without_label['V14'] * data_train_without_label['V17']
    # data_train_without_label['V14×V4'] = data_train_without_label['V14'] * data_train_without_label['V4']
    # data_train_without_label['V14×V20'] = data_train_without_label['V14'] * data_train_without_label['V20']
    # data_train_without_label['V14×V7'] = data_train_without_label['V14'] * data_train_without_label['V7']
    # data_train_without_label['V14×V10'] = data_train_without_label['V14'] * data_train_without_label['V10']
    #
    # data_train_without_label['V17×V4'] = data_train_without_label['V17'] * data_train_without_label['V4']
    # data_train_without_label['V17×V20'] = data_train_without_label['V17'] * data_train_without_label['V20']
    # data_train_without_label['V17×V7'] = data_train_without_label['V17'] * data_train_without_label['V7']
    # data_train_without_label['V17×V10'] = data_train_without_label['V17'] * data_train_without_label['V10']
    #
    # data_train_without_label['V4×V20'] = data_train_without_label['V4'] * data_train_without_label['V20']
    # data_train_without_label['V4×V7'] = data_train_without_label['V4'] * data_train_without_label['V7']
    # data_train_without_label['V4×V10'] = data_train_without_label['V4'] * data_train_without_label['V10']
    #
    # data_train_without_label['V20×V7'] = data_train_without_label['V20'] * data_train_without_label['V7']
    # data_train_without_label['V20×V10'] = data_train_without_label['V20'] * data_train_without_label['V10']
    #
    # data_train_without_label['V7×V10'] = data_train_without_label['V7'] * data_train_without_label['V10']

    feature_name = list(data_train_without_label.columns.values)
    data_predict_user_id = list(data_predict.index.values)

    '''Fill missing values'''
    frames = [data_train_without_label, data_predict]
    data_all = pd.concat(frames)
    data_train_filled = data_train_without_label.fillna(value=data_all.median())

    '''Build the training matrices'''
    x_temp = data_train_filled.iloc[:, :].values  # predictors (.as_matrix() was removed in recent pandas)
    y = data_train.iloc[:, -1].values  # target

    '''Feature selection'''
    X, dropped_feature_name, len_feature_choose = gbdt_feature_selection(
        feature_name, x_temp, y, '0.0005*mean')
    # '0.1*mean' selects 10 features; '0.00001*mean' selects 14 features

    '''Prepare the hold-out set B_test'''
    # del data_predict['V17']
    data_predict_filled = data_predict.fillna(value=data_all.median())
    data_predict_filled_after_feature_selection = data_test_feature_drop(
        data_predict_filled, dropped_feature_name)

    '''Split train/test data sets'''
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)  # stratified cross-validation

    '''Choose a classification model'''
    parameter_n_estimators = 100
    classifier = LGBMClassifier(n_estimators=parameter_n_estimators, learning_rate=0.1)

    '''Model fit, predict and ROC'''
    colors = cycle(['cyan', 'indigo', 'seagreen', 'orange', 'blue'])
    lw = 2
    mean_f1 = 0.0
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 500)
    i_of_roc = 0
    a = 0
    th = 0.3

    for (train_indice, test_indice), color in zip(cv.split(X, y), colors):
        a_model = classifier.fit(X[train_indice], y[train_indice])
        # y_predict_label = a_model.predict(X[test_indice])
        probas_ = a_model.predict_proba(X[test_indice])
        fpr, tpr, thresholds = roc_curve(y[test_indice], probas_[:, 1])
        a += 1  # fold counter
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=lw, color=color,
                 label='ROC fold %d (area = %0.4f)' % (i_of_roc, roc_auc))
        i_of_roc += 1

        label_transformed = probas_[:, 1]
        for i in range(len(label_transformed)):
            if label_transformed[i] > th:
                label_transformed[i] = 1
            else:
                label_transformed[i] = 0
        lt = label_transformed.astype('int32')
        f1 = f1_score(y[test_indice], lt)
        mean_f1 += f1  # 0.7739

    plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck')
    mean_tpr /= cv.get_n_splits(X, y)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    print('mean_auc=' + str(mean_auc))
    print('mean_f1=' + str(mean_f1 / 5))

    plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
             label='Mean ROC (area = %0.4f)' % mean_auc, lw=lw)
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate mean_f1:' + str(mean_f1))
    plt.ylabel('True Positive Rate')
    plt.title('ROC_gbdt_' + str(len_feature_choose) + '_features_f1_' + str(mean_f1 / 5))
    plt.legend(loc="lower right")
    plt.savefig('../result/pred_ROC_GL' + '_N_' + str(parameter_n_estimators) +
                '_features_' + str(len_feature_choose) +
                '_proba_to_label_using_th_' + str(th) + '.png')
    # plt.show()

    a_model = classifier.fit(X, y)
    # label_predict = a_model.predict(data_predict_filled_after_feature_selection)  # predict labels for B_test
    proba_predict = a_model.predict_proba(data_predict_filled_after_feature_selection)

    '''Write out the predicted probabilities'''
    result_file_name = '../result/pred_result_GL_N_' + str(parameter_n_estimators) + \
        '_features_' + str(len_feature_choose) + '_proba.csv'
    write_predict_results_to_csv(result_file_name, data_predict_user_id,
                                 proba_predict[:, 1].tolist())

    '''Write out the submission labels'''
    label_transformed = proba_predict[:, 1]
    sum_of_1 = 0
    for i in range(len(label_transformed)):
        if label_transformed[i] > th:
            label_transformed[i] = 1
            sum_of_1 += 1
        else:
            label_transformed[i] = 0
    lt = label_transformed.astype('int32')
    result_file_name = '../result/pred_result_GL_N_' + str(parameter_n_estimators) + \
        '_features_' + str(len_feature_choose) + \
        '_proba_to_label_using_th_' + str(th) + '_' + str(sum_of_1) + '.csv'
    write_predict_results_to_csv(result_file_name, data_predict_user_id, lt.tolist())
probility = np.zeros((len(test_df), label.shape[1]))
i = 0
model_type = 'ensemble'
# model_type = 'single'

# K-fold cross-validation training
for train_index, valid_index in kf.split(train_df, label):
    print("\nFold {}".format(i + 1))
    i += 1
    X_train, label_train = train_df[train_index], label[train_index]
    X_valid, label_valid = train_df[valid_index], label[valid_index]

    clf1 = OneVsRestClassifier(
        XGBClassifier(eval_metric='mlogloss', use_label_encoder=False, n_estimators=150))
    clf2 = LGBMClassifier()
    clf3 = LogisticRegression(max_iter=500, n_jobs=20)

    # Ensembling method 1
    if model_type == 'ensemble':
        # The single XGB model outperforms the other two, hence the 2:1:1 weights
        model = OneVsRestClassifier(
            EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                   weights=[2, 1, 1],
                                   voting='soft',
                                   verbose=2))
    # Ensembling method 2
    elif model_type == 'stacking':
        lr = LogisticRegression()
        base = StackingClassifier(classifiers=[clf1, clf2, clf3],
                                  use_probas=True,
                                  average_probas=False,
param_test = {
    'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
    'scale_pos_weight': [100, 200, 300, 400, 500, 600]}

fit_params = {"early_stopping_rounds": 30,
              "eval_metric": 'auc',
              "eval_set": [(X_test, y_test)],
              'eval_names': ['valid'],
              # 'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
              'verbose': 100,
              'categorical_feature': 'auto'}

from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(n_jobs=-1, scale_pos_weight=578)
lgbm_gs = RandomizedSearchCV(
    lgbm,
    param_distributions=param_test,
    n_iter=100,
    scoring=scorer,
    cv=k_fold,
    refit=True,
    n_jobs=-1,
)
# note: fit_params is defined above but never forwarded; pass it as
# lgbm_gs.fit(X_train, y_train, **fit_params) to actually apply it
lgbm_gs.fit(X_train, y_train)

logger.info(f'LGBM finetuned best scores: {lgbm_gs.best_score_}')
logger.info(f'LGBM finetuned best params: {lgbm_gs.best_params_}')

logger.info('Evaluating models on test set...')
logger.info('LR score:')
logger.info(f'{rs.score(X_test, y_test)}')
logger.info('LGBM score:')
logger.info(f'{lgbm_gs.score(X_test, y_test)}')
def HyperOptPipeline(algo, n_iter=-1):
    if algo in ['linreg', 'logreg', 'svr', 'svc']:
        ss = StandardScaler()
        mms = MinMaxScaler()
    if algo == 'linreg':
        model_linreg = LinearRegression()
        model_lasso = Lasso()
        model_ridge = Ridge()
        model_elasticnet = ElasticNet()
        params = [
            {
                'scaler': [ss, mms],
                'estimator': [model_linreg]
            }, {
                'scaler': [ss, mms],
                'estimator__alpha': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
                'estimator': [model_lasso]
            }, {
                'scaler': [ss, mms],
                'estimator__alpha': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
                'estimator': [model_ridge]
            }, {
                'scaler': [ss, mms],
                'estimator__alpha': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
                'estimator__l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
                'estimator': [model_elasticnet]
            }
        ]
        pipeline = Pipeline([('scaler', ss), ('estimator', model_linreg)])
    if algo == 'logreg':
        model_logreg = LogisticRegression(class_weight='balanced', solver='saga',
                                          max_iter=100_000)
        params = [
            {
                'scaler': [ss, mms],
                'estimator__penalty': ['l1', 'l2'],
                'estimator__C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
            },
            {
                'scaler': [ss, mms],
                'estimator__penalty': ['elasticnet'],
                'estimator__C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
                'estimator__l1_ratio': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
            },
            {
                'scaler': [ss, mms],
                'estimator__penalty': ['none'],
            },
        ]
        pipeline = Pipeline([('scaler', ss), ('estimator', model_logreg)])
    if algo in ['svc', 'svr']:
        model = SVC(class_weight='balanced') if algo == 'svc' else SVR()
        params = [
            {
                'scaler': [ss, mms],
                'estimator__kernel': ['linear'],
                'estimator__C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
            },
            {
                'scaler': [ss, mms],
                'estimator__kernel': ['rbf', 'sigmoid'],
                'estimator__gamma': ['scale', 'auto'],
                'estimator__C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
            },
            {
                'scaler': [ss, mms],
                'estimator__kernel': ['poly'],
                'estimator__gamma': ['scale', 'auto'],
                'estimator__C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
                'estimator__degree': [2, 3, 4, 5]
            }
        ]
        pipeline = Pipeline([('scaler', ss), ('estimator', model)])
    if algo in ['ctree', 'rtree']:
        if algo == 'ctree':
            model_rf = RandomForestClassifier(class_weight='balanced')
            model_gb = GradientBoostingClassifier()
            model_et = ExtraTreesClassifier(class_weight='balanced')
            model_xgb = XGBClassifier()
            model_xgbrf = XGBRFClassifier()
            model_cb = CatBoostClassifier(bootstrap_type='Bernoulli')
            model_lgbm = LGBMClassifier(class_weight='balanced')
        else:
            model_rf = RandomForestRegressor()
            model_gb = GradientBoostingRegressor()
            model_et = ExtraTreesRegressor()
            model_xgb = XGBRegressor()
            model_xgbrf = XGBRFRegressor()
            model_cb = CatBoostRegressor(bootstrap_type='Bernoulli')
            model_lgbm = LGBMRegressor()
        params = [
            {
                'estimator': [model_rf],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__max_depth': [5, 10, 15, 25, 30, None],
                'estimator__min_samples_split': [1.0, 2, 5, 10, 15, 100],
                'estimator__min_samples_leaf': [1, 2, 5, 10],
            },
            {
                'estimator': [model_gb],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1],
                'estimator__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
                'estimator__max_depth': [3, 5, 7, 9, 12, 15, 17, 25],
                'estimator__min_samples_split': [1.0, 2, 5, 10, 15, 100],
                'estimator__min_samples_leaf': [1, 2, 5, 10],
            },
            {
                'estimator': [model_et],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__max_depth': [5, 10, 15, 25, 30, None],
                'estimator__min_samples_split': [1.0, 2, 5, 10, 15, 100],
                'estimator__min_samples_leaf': [1, 2, 5, 10],
            },
            {
                'estimator': [model_xgb],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1],
                'estimator__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
                'estimator__max_depth': [3, 5, 7, 9, 12, 15, 17, 25],
                'estimator__gamma': [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
                'estimator__min_child_weight': [1, 3, 5, 7],
                'estimator__reg_lambda': [0.01, 0.1, 1.0],
                'estimator__reg_alpha': [0, 0.1, 0.5, 1.0],
            },
            {
                'estimator': [model_xgbrf],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1],
                'estimator__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
                'estimator__max_depth': [3, 5, 7, 9, 12, 15, 17, 25],
                'estimator__gamma': [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
                'estimator__min_child_weight': [1, 3, 5, 7],
                'estimator__reg_lambda': [0.01, 0.1, 1.0],
                'estimator__reg_alpha': [0, 0.1, 0.5, 1.0],
            },
            {
                'estimator': [model_cb],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1],
                'estimator__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
                'estimator__max_depth': [3, 5, 7, 9, 12, 15, 16],
                'estimator__reg_lambda': [0.01, 0.1, 1.0],
            },
            {
                'estimator': [model_lgbm],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1],
                'estimator__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
                'estimator__min_child_samples': [1, 2, 5, 10, 15, 100],
                'estimator__min_child_weight': [1, 3, 5, 7],
                'estimator__reg_lambda': [0.01, 0.1, 1.0],
                'estimator__reg_alpha': [0, 0.1, 0.5, 1.0],
            }
        ]
        pipeline = Pipeline([('estimator', model_rf)])

    n_params = 0
    for param_dict in params:
        n = 1
        for v in param_dict.values():
            n *= len(v)
        n_params += n
    print(n_params, 'parameter settings identified')

    if n_iter == -1:
        return GridSearchCV(pipeline, params,
                            cv=ShuffleSplit(test_size=0.1, n_splits=1, random_state=19))
    return RandomizedSearchCV(pipeline, params, n_iter=n_iter,
                              cv=ShuffleSplit(test_size=0.1, n_splits=1, random_state=19),
                              random_state=19)
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=10000)
all_review = tfidf.fit_transform(clean_reviews)
all_review = all_review.toarray()
print(len(y))
X_train = all_review[:len(y)]

# Using LGBM
from lightgbm import LGBMClassifier

lgbm_model = LGBMClassifier(n_estimators=220, learning_rate=0.2,
                            num_leaves=120, random_state=77)
lgbm_model.fit(X_train, y)

X_test = all_review[len(y):]
preds = lgbm_model.predict(X_test)

submit = pd.DataFrame(all["id"][len(y):])
submit['sentiment'] = preds
submit.head()

# Function that strips the surrounding "" from the id column
def remove_d(word):
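# Side note on the TF-IDF pipeline above (not in the original): LGBMClassifier
# accepts scipy sparse matrices directly, so the dense .toarray() conversion
# can be skipped to save memory on a 10000-column matrix, e.g.:
# all_review = tfidf.fit_transform(clean_reviews)   # keep it sparse
# lgbm_model.fit(all_review[:len(y)], y)            # row-slicing works on CSR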
print('transforming...')
count_vec = TfidfVectorizer()
data_ip = count_vec.fit_transform(data['item_property_list'])

train = data[data.is_trade.notnull()]
train_index = list(train[train.day < 24].index)
test_index = list(train[train.day == 24].index)
ip_train = data_ip[train_index, :]
ip_test = data_ip[test_index, :]

gbm = LGBMClassifier(
    objective='binary',
    num_leaves=24,
    max_depth=3,
    learning_rate=0.1,
    seed=2018,
    colsample_bytree=0.3,
    subsample=0.8,
    n_jobs=-1,
    n_estimators=2000
)
print('fitting...')
gbm.fit(ip_train, train.loc[train_index, 'is_trade'],
        eval_set=[(ip_test, train.loc[test_index, 'is_trade'])],
        early_stopping_rounds=10)

property_df = pd.DataFrame(columns=['instance_id', 'item_property_prob'])
property_df['instance_id'] = data['instance_id']
property_df['item_property_prob'] = gbm.predict_proba(data_ip)[:, 1]

def NatureLP(data, columns):
def OOFPreds(X, y, test_X, params, n_splits=5, random_state=23, clf='lgb'):
    """
    Expects DataFrames as input.
    Returns Series.
    """
    # Kept for later feature-importance analysis
    feature_importance = pd.DataFrame(columns=['feature', 'importance', 'fold'])
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # oof holds the cross-validation predictions; sub holds the test-set predictions
    oof_preds, sub_preds = np.zeros(X.shape[0]), np.zeros(test_X.shape[0])
    oof_train = np.zeros(X.shape[0])
    print(X.shape, test_X.shape)

    valid_scores = []
    train_scores = []
    for n_fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        trn_x, trn_y = X.iloc[trn_idx], y.iloc[trn_idx]
        val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

        # Constant init scores (these were commented out in the original but are
        # referenced by fit() below, so they must be defined for the code to run)
        trn_init_score = pd.Series([0.95] * len(trn_x), index=trn_x.index)
        val_init_score = pd.Series([0.95] * len(val_x), index=val_x.index)

        # Model building and prediction
        if clf == 'lgb':
            with timer('fold {} training time:'.format(n_fold)) as time:
                gbm = LGBMClassifier(**params)
                gbm.fit(trn_x, trn_y,
                        init_score=trn_init_score,
                        eval_set=[(trn_x, trn_y), (val_x, val_y)],
                        eval_init_score=[trn_init_score, val_init_score],
                        eval_metric='auc',
                        verbose=30,
                        early_stopping_rounds=100)
                print('best iteration: {}'.format(gbm.best_iteration_))
                print('time per 100 iterations: {:.3f}'.format(time * 100 / gbm.best_iteration_))

            pred_val = gbm.predict_proba(val_x, num_iteration=gbm.best_iteration_)[:, 1]
            pred_test = gbm.predict_proba(test_X, num_iteration=gbm.best_iteration_)[:, 1]

            # Record scores and predictions
            oof_preds[val_idx] = pred_val
            sub_preds += pred_test / folds.n_splits
            print(gbm.best_score_)
            valid_score = gbm.best_score_['valid_1']['auc']
            train_score = gbm.best_score_['training']['auc']
            valid_scores.append(valid_score)
            train_scores.append(train_score)
            feature_importance = feature_importance.append(pd.DataFrame({
                'importance': gbm.feature_importances_,
                'fold': [n_fold + 1] * X.shape[1],
                'feature': X.columns.tolist()}))
        else:
            # Plug in your own model here:
            # task 1: build the model and predict
            # task 2: record the scores and predictions
            # task 3: record the feature importances
            clf = LogisticRegression(**params)
            clf.fit(trn_x, trn_y)
            pred_train = clf.predict_proba(trn_x)[:, 1]
            pred_val = clf.predict_proba(val_x)[:, 1]
            pred_test = clf.predict_proba(test_X)[:, 1]

            oof_preds[val_idx] = pred_val
            sub_preds += pred_test / folds.n_splits
            valid_score = roc_auc_score(val_y, pred_val)
            train_score = roc_auc_score(trn_y, pred_train)
            valid_scores.append(valid_score)
            train_scores.append(train_score)
            feature_importance = feature_importance.append(pd.DataFrame({
                'importance': clf.coef_[0],
                'fold': [n_fold + 1] * X.shape[1],
                'feature': X.columns.tolist()}))

        print('Fold {:02d} train AUC: {:.6f} valid AUC: {:.6f}'.format(
            n_fold + 1, train_score, valid_score))
        del trn_x, trn_y, val_x, val_y
        gc.collect()

    feature_importance['importance'] = feature_importance['importance'].astype(float)
    fold_names = list(range(folds.n_splits))
    fold_names.append('overall')
    valid_auc = roc_auc_score(y, oof_preds)
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    # Build the score-tracking DataFrame
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})
    oof_preds = pd.Series(oof_preds.flatten(), index=X.index).rename('TARGET')
    sub_preds = pd.Series(sub_preds.flatten(), index=test_X.index).rename('TARGET')
    return oof_preds, sub_preds, feature_importance, metrics
'regression': LGBMRegressor(boosting_type='gbdt',
                            learning_rate=0.05,
                            num_iterations=1200,
                            max_depth=5,
                            n_estimators=1000,
                            verbose=-1,
                            num_leaves=2**5,
                            silent=True,
                            n_jobs=4),
'classification': LGBMClassifier(boosting_type='gbdt',
                                 learning_rate=0.05,
                                 num_iterations=1200,
                                 max_depth=5,
                                 n_estimators=1000,
                                 verbose=-1,
                                 num_leaves=2**5,
                                 silent=True,
                                 n_jobs=4),
'type': 'gbdt'
},
#{'classification': CatBoostClassifier(
#     **{**CATBOOST_PARAMS, **{
#         'loss_function': 'MultiClass',
#         'verbose': False,
#         'thread_count': 4,
#         'random_seed': 0}
#     }
#), 'regression': CatBoostRegressor(
train = get_interval_ratio_feat(train)
test = get_interval_ratio_feat(test)

cate_feature = [
    'gender', 'age', 'edu', 'play_mday', 'play_weekday', 'play_isweekend',
    'fav_click_type', 'fav_like_type', 'fav_play_type', 'fav_show_type'
]
feature = list(train.columns)

lgb_model = LGBMClassifier(
    boosting_type="gbdt",
    num_leaves=64,
    reg_alpha=3,
    reg_lambda=3,
    max_depth=-1,
    n_estimators=10000,
    subsample=0.8,
    colsample_bytree=0.8,
    subsample_freq=1,
    learning_rate=0.01,
    random_state=1230,
    n_jobs=-1,
)

predict_result = pd.DataFrame()
predict_result['userid'] = test_id
predict_result['rentention_rate'] = 0

best_score = []
skf = StratifiedKFold(n_splits=5, random_state=2018, shuffle=True)
st = time.time()
for index, (train_index, test_index) in enumerate(skf.split(train, y)):
    print('Start', index + 1, ' Fold')
    train_x, test_x, train_y, test_y = train.loc[train_index], train.loc[
##KNeighborsClassifier(n_neighbors=1)
rnf = RandomForestClassifier(n_estimators=181, max_features='sqrt',
                             bootstrap=False, max_depth=60,
                             min_samples_split=2, min_samples_leaf=1,
                             random_state=1)
etr = ExtraTreesClassifier(n_estimators=500, max_features=X_copy.shape[1],
                           min_samples_split=5, min_samples_leaf=1,
                           random_state=1)
lgb = LGBMClassifier(objective='multiclass', num_class=7, learning_rate=0.2,
                     num_leaves=X_copy.shape[1], random_state=1)  # num_leaves=109,
lrg = LogisticRegression(C=1000, multi_class='multinomial',
                         solver='newton-cg', random_state=1)
mlp = MLPClassifier(activation='logistic', max_iter=500)
#xgb = xgb.XGBClassifier(objective='multi:softmax')
#------------------------------------------------------------------------------
rf_param = {
    'n_estimators': [250, 300, 350, 400],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [None, 50, 60, 70, 80, 90],
    #'min_samples_split' : [2, 5, 10],
    #'min_samples_leaf' : [1, 2, 4],
def train_model(train):
    excluded_features = [
        'target', 'user_hash',
        # 'City_post_HOME', 'City_post_WORK',
        # 'Raion_post_HOME', 'Raion_post_WORK',
        # 'City_post_HOME', 'City_post_WORK',
        'lat_quad_home', 'lat_quad_work', 'lon_quad_home', 'lon_quad_work',
        'LAT_WORK', 'LAT_HOME', 'LON_WORK', 'LON_HOME'
    ]  # , 'data_type_3_m1', 'data_type_1_m1', 'data_type_2_m1']
    train_features = [x for x in train.columns if x not in excluded_features]

    cats = list(train.dtypes[train.dtypes == 'object'].index.values)
    cats = [x for x in cats if x not in excluded_features]
    for f in cats:
        train[f], indexer = pd.factorize(train[f])

    importances = pd.DataFrame()
    importances['feature'] = train_features
    importances['gain'] = 0

    n_splits = 5
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    for (train_index, valid_index) in kf.split(train, train['target']):
        trn_x, trn_y = train[train_features].iloc[train_index], train['target'].iloc[train_index]
        val_x, val_y = train[train_features].iloc[valid_index], train['target'].iloc[valid_index]

        clf = LGBMClassifier(
            objective='multiclass',
            num_class=6,
            num_leaves=16,
            max_depth=5,
            learning_rate=0.06,
            n_estimators=1000,
            subsample=.9,
            colsample_bytree=.8,
            # lambda_l1=10,
            # lambda_l2=0.01,
            random_state=1)
        clf.fit(trn_x, trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)],
                # eval_names=['train', 'valid'],
                early_stopping_rounds=50,
                verbose=50,
                categorical_feature=cats)

        importances['gain'] += clf.booster_.feature_importance(
            importance_type='gain') / n_splits
        y_pred = clf.predict(val_x)
        acc = accuracy_score(val_y, y_pred)
        print(f'accuracy_score={acc}')

    plt.figure(figsize=(12, 16))
    sns.barplot(x='gain', y='feature',
                data=importances.sort_values('gain', ascending=False)[:60])
    plt.savefig('importance.png')
    'PROVINCE_NAME_广西壮族自治区', 'LCENTERTYPERANK_3.0', 'LCENTERTYPERANK_4.0',
    'm1_maxmin_LOG_DAY', 'm0_maxmin_KJ_CLICK_NUM', 'm4_LM_CLICK_NUM', 'SEX_1',
    'MARRIAGE_其它', 'm5_maxmin_LOGIN_DURATION', 'm0_KJ_CLICK_NUM',
    'm2_maxmin_LOG_DAY', 'LCENTERTYPERANK_2.0', 'PROVINCE_NAME_江西省',
    'm5_maxmin_LOG_DAY', 'm3_KJ_CLICK_NUM', 'm2_LOGIN_NUM', 'LCENTERTYPENAME_直属',
    'm1_KJ_CLICK_NUM', 'm4_LOGIN_NUM', 'm3_LOGIN_NUM', 'STD_LM_CLICK_NUM',
    'm1_maxmin_LOGIN_DURATION', 'm4_KJ_CLICK_NUM', 'LCENTERTYPERANK_1.0',
    'PROVINCE_NAME_福建省', 'PROVINCE_NAME_内蒙古自治区', 'm0_maxmin_LOG_DAY',
    'm1_LOGIN_NUM', 'm3_maxmin_LOG_DAY', 'm2_maxmin_LOGIN_NUM', 'm0_LOGIN_NUM',
    'm5_maxmin_LOGIN_NUM', 'm5_LOGIN_DURATION', 'DAY_MEAN_LM_CLICK_NUM',
    'm4_maxmin_LOG_DAY', 'm1_maxmin_LOGIN_NUM', 'mean_LOG_DAY',
    'm3_maxmin_LOGIN_NUM', 'm2_maxmin_LOGIN_DURATION', 'm3_maxmin_LOGIN_DURATION',
    'm2_LOGIN_DURATION', 'm0_maxmin_LOGIN_NUM', 'w_mLOGIN_NUM', 'mean_LOGIN_NUM',
    'w_stdKJ_CLICK_NUM', 'm4_maxmin_LOGIN_NUM', 'm4_maxmin_LOGIN_DURATION',
    'w_mLOG_DAY', 'm0_maxmin_LOGIN_DURATION', 'w_mKJ_CLICK_NUM',
    'mean_KJ_CLICK_NUM', 'm3_LOGIN_DURATION', 'w_stdLOGIN_NUM',
    'm1_LOGIN_DURATION', 'w_mLOGIN_DURATION', 'STD_LOG_DAY', 'm4_LOGIN_DURATION',
    'mean_LOGIN_DURATION', 'w_stdLOGIN_DURATION']
]

# lightgbm
clf = LGBMClassifier(num_leaves=40, learning_rate=0.05, max_depth=20,
                     n_estimators=300, subsample=0.8, colsample_bytree=1,
                     min_child_weight=1)

# Compute feature importances
clf.fit(X=data, y=data_y)
score = clf.feature_importances_
score = [(data.columns[i], score[i]) for i in range(len(score))]
score = sorted(score, key=lambda k: k[1], reverse=True)
for i in range(len(score)):
    print(i, score[i])

# start = time.time()
# # Cross-validation
# score_name = 'roc_auc'
# score = model_selection.cross_val_score(
#     estimator=clf, X=data, y=data_y,
#     cv=model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=8),
#     scoring=score_name, groups=data_y)
# print(score)
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import lightgbm
from lightgbm import LGBMClassifier, Dataset, train as train_lgbm
import onnxruntime as rt
import skl2onnx
import onnxmltools
from onnxconverter_common.data_types import FloatTensorType
from onnxmltools.convert import convert_lightgbm

iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y)
clr = LGBMClassifier()
clr.fit(X_train, y_train)
print(clr)

###########################
# Convert a model into ONNX
# +++++++++++++++++++++++++

initial_type = [('float_input', FloatTensorType([None, 4]))]
onx = convert_lightgbm(clr, initial_types=initial_type)

###################################
# Compute the predictions with onnxruntime
# ++++++++++++++++++++++++++++++++++++++++

sess = rt.InferenceSession(onx.SerializeToString())
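# The snippet stops before actually running the session; the standard
# onnxruntime calls look like this (input/output names are taken from the
# converted graph above):
import numpy as np
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name
pred_onx = sess.run([label_name], {input_name: X_test.astype(np.float32)})[0]
print(pred_onx[:5])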
score = []  # fold scores (initialization not shown in the original fragment)
start = time.time()
for train_index, test_index in skf.split(data, data_y):
    # training split
    train_data = data.iloc[train_index]
    train_y = data_y[train_index]
    # test split
    test_data = data.iloc[test_index]
    test_y = data_y[test_index]
    print(len(train_data), len(test_data))

    # train the model
    model = LGBMClassifier(num_leaves=6, learning_rate=0.05, max_depth=6,
                           n_estimators=200, subsample=1, colsample_bytree=1,
                           min_child_weight=1)
    model.fit(X=train_data, y=train_y)

    # predict
    pred = model.predict(test_data)
    # tmp_score = metrics.roc_auc_score(y_true=test_y, y_score=pred)
    tmp_score = metrics.f1_score(y_true=test_y, y_pred=pred, average='macro')
    print(tmp_score)
    score.append(tmp_score)

print(score)
print('f1:', sum(score) / len(score), 'time:', time.time() - start)
images = images.reshape(images.shape[0], SHAPE_SIZE_X * SHAPE_SIZE_Y)
labels = labels.astype('int')
images_validation = images_validation.reshape(images_validation.shape[0],
                                              SHAPE_SIZE_X * SHAPE_SIZE_Y)
labels_validation = labels_validation.astype('int')

images, labels = reduced_dataset(images, labels)

# # Train the light GBM
# model = LGBMClassifier()
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# n_scores = cross_val_score(model, images, labels, scoring='accuracy', cv=cv,
#                            n_jobs=-1, error_score='raise')
# print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# fit the model on the whole dataset
model = LGBMClassifier(objective="binary", class_weight="balanced")
start_time = time.time()
model = model.fit(images, labels)
print("Train Light GBM --- %s seconds ---" % (time.time() - start_time))

start_time = time.time()
basic_score = model.score(images_validation, labels_validation)
print("Validation Light GBM --- %s seconds ---" % (time.time() - start_time))
print("Light GBM scikit learn basic score: %0.4f" % basic_score)

# Validating the model and evaluation
start_time = time.time()
scores = cross_validate(model,
config['cat_columns'] = cat_feat

if os.path.exists('../trans_data/train_1000_10.pkl'):
    train = pickle.load(open('../trans_data/train_1000_10.pkl', 'rb'))
    test = pickle.load(open('../trans_data/test_1000_10.pkl', 'rb'))
else:
    d = {'add': '+', 'sub': '-', 'mul': '*', 'div': '/'}
    feat0 = feat.copy()
    for i in trange(len(feat)):
        df_temp = train[feat0].copy()
        for j in range(i + 1, len(feat)):
            df_temp['%s|%s|add' % (feat[i], feat[j])] = train[feat[i]] + train[feat[j]]
            df_temp['%s|%s|sub' % (feat[i], feat[j])] = train[feat[i]] - train[feat[j]]
            df_temp['%s|%s|mul' % (feat[i], feat[j])] = train[feat[i]] * train[feat[j]]
            df_temp['%s|%s|div' % (feat[i], feat[j])] = train[feat[i]] / train[feat[j]]
        model = LGBMClassifier(n_estimators=1000, learning_rate=0.08, max_depth=7,
                               subsample=0.8, colsample_bytree=0.6, n_jobs=4)
        model.fit(df_temp.values, train_y)
        qq = pd.Series(model.feature_importances_, index=df_temp.columns).sort_values()
        for col in set(qq.loc[qq > 10].index) - set(feat0):
            f0, f1, f2 = col.split('|')
            train[col] = df_temp[col]
            test[col] = eval("test['%s']%stest['%s']" % (f0, d[f2], f1))
        feat0.extend(list(set(qq.loc[qq > 10].index) - set(feat0)))
    pickle.dump(train, open('../trans_data/train_1000_10.pkl', 'wb'))
    pickle.dump(test, open('../trans_data/test_1000_10.pkl', 'wb'))

def gen_feat(data):
    for col in cat_feat:
        data[col] = data[col].fillna('empty').astype(str)
    for col in data.columns:
        if '年' not in col and '|' not in col and data[col].isna().sum() > 0:
for file in test_list:
    df_action = pd.read_csv(file)
    df_test.append(df_action)
df_test = pd.concat(df_test, axis=0, ignore_index=True)

print(df_train['sign'].value_counts())
print(df_test['sign'].value_counts())
time.sleep(2)

# Prepare data
y_train = df_train.pop('sign')
x_train = df_train.values
y_test = df_test.pop('sign')
x_test = df_test.values

# Model
# print('Training random forest')
# model = RandomForestClassifier(n_estimators=9)
# model.fit(x_train, y_train)
model = LGBMClassifier(n_estimators=50)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy of LightGBM: ', accuracy)  # the fitted model is LightGBM, not the commented-out random forest

# Save model
with open('../model/md.pickle', 'wb') as f:
    pickle.dump(model, f)
lgb_params['min_child_samples'] = 500
lgb_params['seed'] = 99

lgb_params2 = {}
lgb_params2['n_estimators'] = 1090
lgb_params2['learning_rate'] = 0.02
lgb_params2['colsample_bytree'] = 0.3
lgb_params2['subsample'] = 0.7
lgb_params2['subsample_freq'] = 2
lgb_params2['num_leaves'] = 16
lgb_params2['seed'] = 99

log_params = {}
log_params['class_weight'] = {0: 1, 1: 4.5}
log_params['random_state'] = 99

lgb_model = LGBMClassifier(**lgb_params)
lgb_model2 = LGBMClassifier(**lgb_params2)
log_model = LogisticRegression(**log_params)

stack = Ensemble(n_splits=3, stacker=log_model,
                 base_models=(lgb_model, lgb_model2))

train = train.drop([
    'PRXYDATA_98', 'PRXYDATA_97', 'PRXYDATA_2', 'PRXRETRY_98', 'PRXRETRY_97',
    'PRVHLTIN_98', 'PRVHLTIN_97', 'PRVHLTIN_85', 'MEDICARE_98', 'MEDICARE_97',
    'MEDICARE_85', 'IRWELMOS_7', 'IRWELMOS_10', 'IRPRVHLT_2', 'IRPINC3_7',
    'IRPINC3_6', 'IROTHHLT_99', 'IROTHHLT_1', 'IIOTHHLT_3', 'IIOTHHLT_1',
    'IIMEDICR_3', 'IIHHSIZ2_3', 'IIHHSIZ2_1', 'HLTINNOS_99', 'HLTINNOS_98',
    'HLTINNOS_97', 'HLTINNOS_94', 'HLTINNOS_2', 'HLTINNOS_1', 'HLNVSOR_98',
from imblearn.under_sampling import OneSidedSelection
from imblearn.under_sampling import NeighbourhoodCleaningRule
from imblearn.under_sampling import InstanceHardnessThreshold
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
from collections import Counter

r = 1001
models = [
    DecisionTreeClassifier(random_state=r),
    BaggingClassifier(random_state=r),
    RandomForestClassifier(random_state=r),
    GradientBoostingClassifier(random_state=r),
    LGBMClassifier(),
    XGBClassifier(random_state=r),
    CatBoostClassifier(random_state=r, verbose=False),
]
names = [
    "Decision Tree",
    "Ensemble-Bagging",
    "Ensemble-Random Forest",
    "Ensemble-Gradient Boosting",
    "Light Gradient Boosting",
    "XG Boost",
    "Cat Boost",
]
samplers = [
    # imbalanced-learn Over
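# A sketch of how models/names/samplers lists like the ones above are usually
# combined (not from the original): wrap each sampler and model in an imblearn
# Pipeline so resampling happens only on the training folds. `X`, `y`, and the
# SMOTEENN choice here are illustrative.
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import cross_val_score

for name, model in zip(names, models):
    pipe = ImbPipeline([('sampler', SMOTEENN(random_state=r)), ('clf', model)])
    scores = cross_val_score(pipe, X, y, scoring='f1', cv=5)
    print(name, scores.mean())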
#cat_params['l2_leaf_reg'] = 3.5
#cat_params['border_count'] = 8
#cat_params['gradient_iterations'] = 4

# Regularized Greedy Forest params
#rgf_params = {}
#rgf_params['max_leaf'] = 2000
#rgf_params['learning_rate'] = 0.5
#rgf_params['algorithm'] = "RGF_Sib"
#rgf_params['test_interval'] = 100
#rgf_params['min_samples_leaf'] = 3
#rgf_params['reg_depth'] = 1.0
#rgf_params['l2'] = 0.5
#rgf_params['sl2'] = 0.005

lgb_model_1 = LGBMClassifier(**lgb_params_1)
lgb_model_2 = LGBMClassifier(**lgb_params_2)
lgb_model_3 = LGBMClassifier(**lgb_params_3)
#rf_model = RandomForestClassifier(**rf_params)
#et_model = ExtraTreesClassifier(**et_params)
#xgb_model = XGBClassifier(**xgb_params)
#cat_model = CatBoostClassifier(**cat_params)
#rgf_model = RGFClassifier(**rgf_params)
def compute_part_of_the_stacking(X_train_nn, X_train_sk, X_train_basic,
                                 X_train_GoQ, y_train, X_test_nn, X_test_sk,
                                 X_test_basic, X_test_GoQ, max_size,
                                 embedding_matrix, glove_embedding,
                                 word_vector_dim, drop_rate, my_optimizer,
                                 batch_size, nb_epoch, my_patience=0):
    '''
    Compute the stacking features from the given input matrices.
    '''
    X_train_basic_GoQ = np.concatenate((X_train_basic, X_train_GoQ), axis=1)
    X_test_basic_GoQ = np.concatenate((X_test_basic, X_test_GoQ), axis=1)

    mcp1 = ModelCheckpoint('weights.stack.hdf5', monitor="val_acc",
                           save_best_only=True, save_weights_only=False)
    model = create_nn_model(max_size, embedding_matrix, word_vector_dim,
                            drop_rate, my_optimizer)
    model.fit(X_train_nn, y_train, batch_size=batch_size, epochs=nb_epoch,
              validation_split=0.1, callbacks=[mcp1])
    keras.backend.clear_session()

    mcp2 = ModelCheckpoint('weights.stack_gv.hdf5', monitor="val_acc",
                           save_best_only=True, save_weights_only=False)
    print('Computing Neural Network for Glove features')
    model_gv = create_nn_model(max_size, glove_embedding, 200, drop_rate, my_optimizer)
    model_gv.fit(X_train_nn, y_train, batch_size=batch_size, epochs=nb_epoch,
                 validation_split=0.1, callbacks=[mcp2])
    keras.backend.clear_session()

    X_train_nn_GoQ = X_train_nn + [X_train_GoQ]
    X_test_nn_GoQ = X_test_nn + [X_test_GoQ]

    mcp3 = ModelCheckpoint('weights.stack_1.hdf5', monitor="val_acc",
                           save_best_only=True, save_weights_only=False)
    model_v2 = create_nn_model_v2(max_size, embedding_matrix, word_vector_dim,
                                  drop_rate, my_optimizer, X_train_GoQ.shape[1])
    model_v2.fit(X_train_nn_GoQ, y_train, batch_size=batch_size, epochs=nb_epoch,
                 validation_split=0.1, callbacks=[mcp3])
    keras.backend.clear_session()

    mcp4 = ModelCheckpoint('weights.stack_gv_1.hdf5', monitor="val_acc",
                           save_best_only=True, save_weights_only=False)
    model_gv_v2 = create_nn_model_v2(max_size, glove_embedding, 200, drop_rate,
                                     my_optimizer, X_train_GoQ.shape[1])
    model_gv_v2.fit(X_train_nn_GoQ, y_train, batch_size=batch_size, epochs=nb_epoch,
                    validation_split=0.1, callbacks=[mcp4])
    keras.backend.clear_session()

    X_train_nn_basic = X_train_nn + [X_train_basic]
    X_test_nn_basic = X_test_nn + [X_test_basic]

    mcp5 = ModelCheckpoint('weights.stack_2.hdf5', monitor="val_acc",
                           save_best_only=True, save_weights_only=False)
    model_v3 = create_nn_model_v2(max_size, embedding_matrix, word_vector_dim,
                                  drop_rate, my_optimizer, X_train_basic.shape[1])
    model_v3.fit(X_train_nn_basic, y_train, batch_size=batch_size, epochs=nb_epoch,
                 validation_split=0.1, callbacks=[mcp5])
    keras.backend.clear_session()

    mcp6 = ModelCheckpoint('weights.stack_gv_2.hdf5', monitor="val_acc",
                           save_best_only=True, save_weights_only=False)
    model_gv_v3 = create_nn_model_v2(max_size, glove_embedding, 200, drop_rate,
                                     my_optimizer, X_train_basic.shape[1])
    model_gv_v3.fit(X_train_nn_basic, y_train, batch_size=batch_size, epochs=nb_epoch,
                    validation_split=0.1, callbacks=[mcp6])
    keras.backend.clear_session()

    X_train_nn_basic_GoQ = X_train_nn + [X_train_basic_GoQ]
    X_test_nn_basic_GoQ = X_test_nn + [X_test_basic_GoQ]

    mcp7 = ModelCheckpoint('weights.stack_3.hdf5', monitor="val_acc",
                           save_best_only=True, save_weights_only=False)
    model_v4 = create_nn_model_v2(max_size, embedding_matrix, word_vector_dim,
                                  drop_rate, my_optimizer, X_train_basic_GoQ.shape[1])
    model_v4.fit(X_train_nn_basic_GoQ, y_train, batch_size=batch_size, epochs=nb_epoch,
                 validation_split=0.1, callbacks=[mcp7])
    keras.backend.clear_session()

    mcp8 = ModelCheckpoint('weights.stack_gv_3.hdf5', monitor="val_acc",
                           save_best_only=True, save_weights_only=False)
    model_gv_v4 = create_nn_model_v2(max_size, glove_embedding, 200, drop_rate,
                                     my_optimizer, X_train_basic_GoQ.shape[1])
    model_gv_v4.fit(X_train_nn_basic_GoQ, y_train, batch_size=batch_size,
                    epochs=nb_epoch, validation_split=0.1, callbacks=[mcp8])
    keras.backend.clear_session()

    print('Computing skmodels')
    rf_model = RandomForestClassifier(n_estimators=50)
    elastic_net_model = ElasticNet()
    log_reg_model = LogisticRegression()
    lin_reg_model = LinearRegression()
    km_model = KNeighborsClassifier(n_neighbors=50)
    svm_model = SVC(probability=True)
    xgb_model = XGBClassifier(max_depth=6, n_estimators=100, reg_lambda=1, seed=5)
    lightgbm_model = LGBMClassifier(max_depth=6, n_estimators=100, reg_lambda=1, seed=5)

    rf_model_GoQ = RandomForestClassifier(n_estimators=50)
    elastic_net_model_GoQ = ElasticNet()
    log_reg_model_GoQ = LogisticRegression()
    lin_reg_model_GoQ = LinearRegression()
    km_model_GoQ = KNeighborsClassifier(n_neighbors=50)
    svm_model_GoQ = SVC(probability=True)
    xgb_model_GoQ = XGBClassifier(max_depth=6, n_estimators=100, reg_lambda=1, seed=5)
    lightgbm_model_GOQ = LGBMClassifier(max_depth=6, n_estimators=100, reg_lambda=1, seed=5)

    rf_model_basic = RandomForestClassifier(n_estimators=50)
    elastic_net_model_basic = ElasticNet()
    log_reg_model_basic = LogisticRegression()
    lin_reg_model_basic = LinearRegression()
    km_model_basic = KNeighborsClassifier(n_neighbors=50)
    svm_model_basic = SVC(probability=True)
    xgb_model_basic = XGBClassifier(max_depth=6, n_estimators=100, reg_lambda=1, seed=5)
    lightgbm_model_basic = LGBMClassifier(max_depth=6, n_estimators=100, reg_lambda=1, seed=5)

    rf_model_basic_GoQ = RandomForestClassifier(n_estimators=200)
    elastic_net_model_basic_GoQ = ElasticNet()
    log_reg_model_basic_GoQ = LogisticRegression()
    lin_reg_model_basic_GoQ = LinearRegression()
    km_model_basic_GoQ = KNeighborsClassifier(n_neighbors=50)
    svm_model_basic_GoQ = SVC(probability=True)
    xgb_model_basic_GoQ = XGBClassifier(max_depth=6, n_estimators=150, reg_lambda=1, seed=5)
    lightgbm_model_basic_GoQ = LGBMClassifier(max_depth=6, n_estimators=150, reg_lambda=1, seed=5)

    print('XGB Model')
    xgb_model.fit(X_train_sk, y_train)
    xgb_model_basic.fit(X_train_basic, y_train)
    xgb_model_GoQ.fit(X_train_GoQ, y_train)
    xgb_model_basic_GoQ.fit(X_train_basic_GoQ, y_train)
    print('Light GBM')
    lightgbm_model.fit(X_train_sk, y_train)
    lightgbm_model_basic.fit(X_train_basic, y_train)
    lightgbm_model_GOQ.fit(X_train_GoQ, y_train)
    lightgbm_model_basic_GoQ.fit(X_train_basic_GoQ, y_train)
    print('Train RF Model')
    rf_model.fit(X_train_sk, y_train)
    rf_model_basic.fit(X_train_basic, y_train)
    rf_model_GoQ.fit(X_train_GoQ, y_train)
    rf_model_basic_GoQ.fit(X_train_basic_GoQ, y_train)
    print('ElasticNet Model')
    elastic_net_model.fit(X_train_sk, y_train)
    elastic_net_model_basic.fit(X_train_basic, y_train)
    elastic_net_model_GoQ.fit(X_train_GoQ, y_train)
    elastic_net_model_basic_GoQ.fit(X_train_basic_GoQ, y_train)
    print('Logistic Reg Model')
    log_reg_model.fit(X_train_sk, y_train)
    log_reg_model_basic.fit(X_train_basic, y_train)
    log_reg_model_GoQ.fit(X_train_GoQ, y_train)
    log_reg_model_basic_GoQ.fit(X_train_basic_GoQ, y_train)
    print('Linear Reg Model')
    lin_reg_model.fit(X_train_sk, y_train)
    lin_reg_model_basic.fit(X_train_basic, y_train)
    lin_reg_model_GoQ.fit(X_train_GoQ, y_train)
    lin_reg_model_basic_GoQ.fit(X_train_basic_GoQ, y_train)
    print('KNN Model')  # km_model is a KNeighborsClassifier, not k-means
    km_model.fit(X_train_sk, y_train)
    km_model_basic.fit(X_train_basic, y_train)
    km_model_GoQ.fit(X_train_GoQ, y_train)
    km_model_basic_GoQ.fit(X_train_basic_GoQ, y_train)

    '''# Too long to compute
    print('SVM Model')
    svm_model.fit(X_train_sk, y_train)
    svm_model_basic.fit(X_train_basic, y_train)
    svm_model_GoQ.fit(X_train_GoQ, y_train)
    svm_model_basic_GoQ.fit(X_train_basic_GoQ, y_train)
    '''

    print('Predict Output Test')
    model_gv_v4 = keras.models.load_model('weights.stack_gv_3.hdf5')
    model_v4 = keras.models.load_model('weights.stack_3.hdf5')
    model_gv_v3 = keras.models.load_model('weights.stack_gv_2.hdf5')
    model_v3 = keras.models.load_model('weights.stack_2.hdf5')
    model_gv_v2 = keras.models.load_model('weights.stack_gv_1.hdf5')
    model_v2 = keras.models.load_model('weights.stack_1.hdf5')
    model_gv = keras.models.load_model('weights.stack_gv.hdf5')
    model = keras.models.load_model('weights.stack.hdf5')

    outcome_nn_test = model.predict(X_test_nn)
    outcome_nn_test_gv = model_gv.predict(X_test_nn)
    outcome_nn_test_v2 = model_v2.predict(X_test_nn_GoQ)
    outcome_nn_test_gv_v2 = model_gv_v2.predict(X_test_nn_GoQ)
    outcome_nn_test_v3 = model_v3.predict(X_test_nn_basic)
    outcome_nn_test_gv_v3 = model_gv_v3.predict(X_test_nn_basic)
    outcome_nn_test_v4 = model_v4.predict(X_test_nn_basic_GoQ)
    outcome_nn_test_gv_v4 = model_gv_v4.predict(X_test_nn_basic_GoQ)
    keras.backend.clear_session()

    outcome_rf_test = rf_model.predict_proba(X_test_sk)[:, 1].reshape((-1, 1))
    outcome_ada_test = elastic_net_model.predict(X_test_sk).reshape((-1, 1))
    outcome_log_reg_model_test = log_reg_model.predict_proba(X_test_sk)[:, 1].reshape((-1, 1))
    outcme_lin_model_test = lin_reg_model.predict(X_test_sk).reshape((-1, 1))
    outcome_kmeans_test = km_model.predict_proba(X_test_sk)[:, 1].reshape((-1, 1))
    # outcome_svm_test = svm_model.predict(X_test_sk).reshape((-1, 1))
    outcome_xgb_test = xgb_model.predict_proba(X_test_sk)[:, 1].reshape((-1, 1))
    outcome_lgb_test = lightgbm_model.predict_proba(X_test_sk)[:, 1].reshape((-1, 1))

    outcome_rf_test_basic = rf_model_basic.predict_proba(X_test_basic)[:, 1].reshape((-1, 1))
    outcome_ada_test_basic = elastic_net_model_basic.predict(X_test_basic).reshape((-1, 1))
    outcome_log_reg_model_test_basic = log_reg_model_basic.predict_proba(X_test_basic)[:, 1].reshape((-1, 1))
    outcme_lin_mode_testl_basic = lin_reg_model_basic.predict(X_test_basic).reshape((-1, 1))
    outcome_kmeans_test_basic = km_model_basic.predict_proba(X_test_basic)[:, 1].reshape((-1, 1))
    # outcome_svm_test_basic = svm_model_basic.predict(X_test_basic).reshape((-1, 1))
    outcome_xgb_test_basic = xgb_model_basic.predict_proba(X_test_basic)[:, 1].reshape((-1, 1))
    outcome_lgb_test_basic = lightgbm_model_basic.predict_proba(X_test_basic)[:, 1].reshape((-1, 1))

    outcome_rf_test_GoQ = rf_model_GoQ.predict_proba(X_test_GoQ)[:, 1].reshape((-1, 1))
    outcome_ada_test_GoQ = elastic_net_model_GoQ.predict(X_test_GoQ).reshape((-1, 1))
    outcome_log_reg_model_test_GoQ = log_reg_model_GoQ.predict_proba(X_test_GoQ)[:, 1].reshape((-1, 1))
    outcme_lin_mode_testl_GoQ = lin_reg_model_GoQ.predict(X_test_GoQ).reshape((-1, 1))
    outcome_kmeans_test_GoQ = km_model_GoQ.predict_proba(X_test_GoQ)[:, 1].reshape((-1, 1))
    # outcome_svm_test_GoQ = svm_model_GoQ.predict(X_test_GoQ).reshape((-1, 1))
    outcome_xgb_test_GoQ = xgb_model_GoQ.predict_proba(X_test_GoQ)[:, 1].reshape((-1, 1))
    outcome_lgb_test_GoQ = lightgbm_model_GOQ.predict_proba(X_test_GoQ)[:, 1].reshape((-1, 1))

    outcome_rf_test_basic_GoQ = rf_model_basic_GoQ.predict_proba(X_test_basic_GoQ)[:, 1].reshape((-1, 1))
    outcome_ada_test_basic_GoQ = elastic_net_model_basic_GoQ.predict(X_test_basic_GoQ).reshape((-1, 1))
    outcome_log_reg_model_test_basic_GoQ = log_reg_model_basic_GoQ.predict_proba(X_test_basic_GoQ)[:, 1].reshape((-1, 1))
    outcme_lin_mode_testl_basic_GoQ = lin_reg_model_basic_GoQ.predict(X_test_basic_GoQ).reshape((-1, 1))
    outcome_kmeans_test_basic_GoQ = km_model_basic_GoQ.predict_proba(X_test_basic_GoQ)[:, 1].reshape((-1, 1))
    # outcome_svm_test_basic_GoQ = svm_model_basic_GoQ.predict(X_test_basic_GoQ).reshape((-1, 1))
    outcome_xgb_test_basic_GoQ = xgb_model_basic_GoQ.predict_proba(X_test_basic_GoQ)[:, 1].reshape((-1, 1))
    outcome_lgb_test_basic_GoQ = lightgbm_model_basic_GoQ.predict_proba(X_test_basic_GoQ)[:, 1].reshape((-1, 1))

    '''X_test = np.concatenate([outcome_nn_test_gv, outcome_nn_test,
        outcome_svm_test, outcome_rf_test, outcome_ada_test,
        outcome_log_reg_model_test, outcme_lin_model_test, outcome_kmeans_test,
        outcome_rf_test_basic, outcome_ada_test_basic,
        outcome_log_reg_model_test_basic, outcme_lin_mode_testl_basic,
        outcome_kmeans_test_basic, outcome_svm_test_basic,
        outcome_rf_test_GoQ, outcome_ada_test_GoQ, outcome_log_reg_model_test_GoQ,
        outcme_lin_mode_testl_GoQ, outcome_kmeans_test_GoQ, outcome_svm_test_GoQ,
        outcome_rf_test_basic_GoQ, outcome_ada_test_basic_GoQ,
        outcome_log_reg_model_test_basic_GoQ, outcme_lin_mode_testl_basic_GoQ,
        outcome_kmeans_test_basic_GoQ, outcome_svm_test_basic_GoQ], axis=1)'''

    X_test = np.concatenate(
        [outcome_nn_test_gv, outcome_nn_test, outcome_nn_test_v2, outcome_nn_test_gv_v2,
         outcome_nn_test_v3, outcome_nn_test_gv_v3, outcome_nn_test_v4, outcome_nn_test_gv_v4,
         outcome_rf_test, outcome_ada_test, outcome_log_reg_model_test,
         outcme_lin_model_test, outcome_kmeans_test,
         outcome_rf_test_basic, outcome_ada_test_basic, outcome_log_reg_model_test_basic,
         outcme_lin_mode_testl_basic, outcome_kmeans_test_basic,
         outcome_rf_test_GoQ, outcome_ada_test_GoQ, outcome_log_reg_model_test_GoQ,
         outcme_lin_mode_testl_GoQ, outcome_kmeans_test_GoQ,
         outcome_rf_test_basic_GoQ, outcome_ada_test_basic_GoQ,
         outcome_log_reg_model_test_basic_GoQ, outcme_lin_mode_testl_basic_GoQ,
         outcome_kmeans_test_basic_GoQ, outcome_xgb_test, outcome_lgb_test,
         outcome_xgb_test_basic, outcome_lgb_test_basic, outcome_xgb_test_GoQ,
         outcome_lgb_test_GoQ, outcome_xgb_test_basic_GoQ, outcome_lgb_test_basic_GoQ],
        axis=1)
    return X_test
# Save the classification model
ml_model_name = model_folder + label + "_clf.model"
pickle.dump(clf, open(ml_model_name, 'wb'))

labels = [
    'CVSS2_Conf', 'CVSS2_Integrity', 'CVSS2_Avail', 'CVSS2_AccessVect',
    'CVSS2_AccessComp', 'CVSS2_Auth', 'CVSS2_Severity'
]

clfs = {
    'CVSS2_Conf': {
        'LGBM': LGBMClassifier(num_leaves=100, max_depth=-1,
                               objective='multiclass', n_jobs=-1,
                               random_state=42)
    },
    'CVSS2_Integrity': {
        'XGB': XGBClassifier(objective='multi:softprob', max_depth=0,
                             max_leaves=100, grow_policy='lossguide',
                             n_jobs=-1, random_state=42,
                             tree_method='hist')
    },
    'CVSS2_Avail': {
        'LGBM':
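# A minimal, hypothetical sketch of how the labels/clfs tables above could
# drive per-label training and persistence; X, y_frame (one target column per
# CVSS2 label) and model_folder are assumed names, not from the original.
import pickle

for label in labels:
    for clf_name, clf in clfs[label].items():
        clf.fit(X, y_frame[label])
        with open(model_folder + label + "_clf.model", "wb") as fh:
            pickle.dump(clf, fh)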
    predicted_lr = model_lr.predict(X_test)
    print("LogisticRegression", metrics.accuracy_score(Y_test, predicted_lr), "\n")
    #aa = model_lr.coef_

if cond01 == 3:
    from sklearn.naive_bayes import GaussianNB  # Gaussian Naive Bayes
    model_nb = GaussianNB()
    model_nb.fit(X_train, Y_train)
    predicted_nb = model_nb.predict(X_test)
    print("Gaussian Naive Bayes", metrics.accuracy_score(Y_test, predicted_nb), "\n")

if cond01 == 4:
    from sklearn.ensemble import GradientBoostingClassifier  # GradientBoosting
    model_gb = GradientBoostingClassifier()
    model_gb.fit(X_train, Y_train)
    predicted_gb = model_gb.predict(X_test)
    print("GradientBoosting", metrics.accuracy_score(Y_test, predicted_gb), "\n")

if cond01 == 5:
    from lightgbm import LGBMClassifier  # LightGBM
    model_lgbm = LGBMClassifier()
    model_lgbm.fit(X_train, Y_train)
    predicted_lgbm = model_lgbm.predict(X_test)
    print("LightGBM", metrics.accuracy_score(Y_test, predicted_lgbm), "\n")

#
##http://myenigma.hatenablog.com/entry/2015/10/09/223629
#import seaborn as sns
#iris = sns.load_dataset("iris")  # sample dataset
##sns.pairplot(iris)
#sns.pairplot(iris, hue="species")
#sns.plt.savefig("iris.png")
#sns.plt.show()
#
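# An alternative sketch, not from the original: the repeated cond01 branches
# above can be collapsed into a dict-based dispatch. X_train, X_test, Y_train,
# and Y_test are assumed to be defined as in the surrounding code.
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier

models = {
    3: ("Gaussian Naive Bayes", GaussianNB()),
    4: ("GradientBoosting", GradientBoostingClassifier()),
    5: ("LightGBM", LGBMClassifier()),
}
if cond01 in models:
    name, clf = models[cond01]
    clf.fit(X_train, Y_train)
    print(name, metrics.accuracy_score(Y_test, clf.predict(X_test)), "\n")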
"./model2/sample_mydata_model_xgboost{}.pickle.dat".format( cnt), "wb")) else: print("LGBMClassifier") model = LGBMClassifier( boosting_type='gbdt', task='train', num_leaves=2**depth - 1, num_iterations=steps, learning_rate=0.01, n_estimators=2000, max_bin=425, subsample_for_bin=50000, objective='binary', min_split_gain=0, min_child_weight=5, min_child_samples=10, feature_fraction=0.9, feature_fraction_bynode=0.8, drop_rate=0.05, subsample=0.8, subsample_freq=1, colsample_bytree=1, reg_alpha=3, reg_lambda=5, seed=1000, # n_jobs=4, silent=True) # 建议使用CV的方式训练预测。 model.fit( train_x, train_y,
    else:
        pipeline.fit(audit_X, audit_y)
    if isinstance(classifier, XGBClassifier):
        pipeline.verify(audit_X.sample(n = 3, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
    else:
        pipeline.verify(audit_X.sample(n = 3, random_state = 13))
    pipeline.configure(**pmml_options)
    store_pmml(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
    adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
    store_csv(pandas.concat((adjusted, adjusted_proba), axis = 1), name)

if "Audit" in datasets:
    build_audit(DecisionTreeClassifier(min_samples_leaf = 7, random_state = 13), "DecisionTreeAudit", compact = False, flat = True)
    build_audit(GradientBoostingClassifier(n_estimators = 71, random_state = 13), "GradientBoostingAudit")
    build_audit(LGBMClassifier(objective = "binary", n_estimators = 71, random_state = 13), "LightGBMAudit")
    build_audit(LogisticRegression(multi_class = "ovr", solver = "liblinear", random_state = 13), "LogisticRegressionAudit")
    build_audit(RandomForestClassifier(n_estimators = 17, random_state = 13), "RandomForestAudit", compact = False, flat = False)
    build_audit(XGBClassifier(objective = "binary:logistic", ntree_limit = 71, random_state = 13), "XGBoostAudit")
    sparsify("Audit")

audit_X, audit_y = load_audit("AuditNA")

if ("Audit" in datasets) or ("AuditNA" in datasets):
    build_audit(LGBMClassifier(objective = "binary", n_estimators = 71, random_state = 13), "LightGBMAuditNA")
    build_audit(XGBClassifier(objective = "binary:logistic", ntree_limit = 71, random_state = 13), "XGBoostAuditNA")

def load_sentiment(name):
    df = load_csv(name)
    return (df["Sentence"], df["Score"])
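# Hypothetical sketches of the store_pmml / store_csv helpers used above; they
# are assumed to wrap the sklearn2pmml exporter and pandas CSV output, and the
# output paths are illustrative.
from sklearn2pmml import sklearn2pmml

def store_pmml(pipeline, name):
    # Export the fitted PMMLPipeline to a PMML document.
    sklearn2pmml(pipeline, "pmml/" + name + ".pmml")

def store_csv(df, name):
    # Persist the predictions next to the exported model.
    df.to_csv("csv/" + name + ".csv", index = False)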
def without_cv_transfer_a_to_b_modeling():
    """Train on A_train and transfer to B without cross-validation.
    :return:
    """
    '''Data input'''
    data_a_train = pd.read_csv('../data/A_train_final.csv', index_col='no')
    data_b_train = pd.read_csv('../data/B_train_final.csv', index_col='no')
    y_of_b_train = data_b_train['flag']
    data_b_test = pd.read_csv('../data/B_test_final.csv', index_col='no')

    '''A-train feature engineering'''
    data_a_train_without_label = data_a_train.drop('flag', axis=1)
    data_a_train_without_label['UserInfo_222x82'] = data_a_train_without_label['UserInfo_82'] * data_a_train_without_label['UserInfo_222']

    '''Fill missing values'''
    data_a_train_filled = data_a_train_without_label.fillna(value=10)

    '''Feature names'''
    feature_name = list(data_a_train_without_label.columns.values)
    data_b_test_user_id = list(data_b_test.index.values)

    '''Build the training and test sets'''
    x_temp = data_a_train_filled.iloc[:, :].as_matrix()  # predictors
    y = data_a_train.iloc[:, -1].as_matrix()  # target

    '''Feature selection. Note: if features are added, feature_name must be updated accordingly'''
    X, dropped_feature_name, len_feature_choose = lgb_feature_selection(feature_name, x_temp, y, "0.1*mean")

    '''B-train feature engineering'''
    data_b_train_without_label = data_b_train.drop('flag', axis=1)
    data_b_train_without_label['UserInfo_222x82'] = data_b_train_without_label['UserInfo_82'] * data_b_train_without_label['UserInfo_222']
    data_b_train_filled = data_b_train_without_label.fillna(value=10)

    '''B-test feature engineering'''
    data_b_test['UserInfo_222x82'] = data_b_test['UserInfo_82'] * data_b_test['UserInfo_222']
    data_b_test_filled = data_b_test.fillna(value=10)

    '''Feature filtering'''
    data_b_train_filled_after_feature_selection = data_test_feature_drop(data_b_train_filled, dropped_feature_name)
    data_b_test_filled_after_feature_selection = data_test_feature_drop(data_b_test_filled, dropped_feature_name)

    '''Model on A_train, predict B_train'''
    print 'start time (min):'
    print time.clock()*1.0/60
    parameter_n_estimators = 400
    classifier = LGBMClassifier(n_estimators=parameter_n_estimators)
    a_model = classifier.fit(X, y)
    prob_of_b_train = a_model.predict_proba(data_b_train_filled_after_feature_selection)
    print 'training end time (min):'
    print time.clock()*1.0/60

    '''Plot the ROC curve'''
    fpr, tpr, thresholds = roc_curve(y_of_b_train, prob_of_b_train[:, 1])
    roc_auc = auc(fpr, tpr)
    print '\nauc=' + str(roc_auc)

    '''Predict B_test'''
    prob_of_b_test = a_model.predict_proba(data_b_test_filled_after_feature_selection)
    result_file_name = '../result/B_test_predict_using_A_LGBLGB_without_cv_fillna_10' + '_N_' + str(parameter_n_estimators) + '_features_' + \
        str(len_feature_choose) + '_offline_' + str(roc_auc) + '.csv'
    write_predict_results_to_csv(result_file_name, data_b_test_user_id, prob_of_b_test[:, 1].tolist())
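# Hypothetical sketches of the two helpers called above but not shown in this
# file; the signatures and behavior are inferred from the call sites only.
import csv

def data_test_feature_drop(data, dropped_feature_name):
    # Drop the columns that lgb_feature_selection marked as unused, so the
    # B-side matrices line up with the features selected on A_train.
    return data.drop(dropped_feature_name, axis=1).values

def write_predict_results_to_csv(file_name, user_ids, probs):
    # Write one (no, prediction) row per B_test sample.
    with open(file_name, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['no', 'pred'])
        writer.writerows(zip(user_ids, probs))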