def tune_params():
    """Grid-search four LightGBM hyper-parameters and log the F1 of each combination.

    Every combination of ``max_depth`` (6..14), ``subsample``,
    ``colsample_bytree`` and ``reg_alpha`` is fitted on the module-level
    ``X_t`` / ``y_t`` and scored on both ``X_t`` / ``y_t`` and
    ``X_v`` / ``y_v``.  The parameters and the two scores are printed and
    appended to two text files under the hard-coded ``accu_save`` directory.

    Returns
    -------
    tuple of (list, list)
        Micro-averaged F1 on the training data and on the validation data,
        one entry per combination, in grid order (``max_depth`` outermost).
    """
    from itertools import product

    save_dir = 'D:\\workspace python\\contest\\accu_save\\'
    params_log = save_dir + 'lgbbase_saveparams_f1_0418.txt'
    scores_log = save_dir + 'lgbbase_tunparms_f1_0418.txt'

    f1_t_total, f1_v_total = [], []
    grid = product(range(6, 15), [0.6, 0.7, 0.8], [0.6, 0.7, 0.8], [0.1, 1, 10])
    for max_depth, subsample, colsample_bytree, reg_alpha in grid:
        # NOTE(review): LightGBM only applies `subsample` when
        # `subsample_freq` > 0 (default 0), so this axis of the grid may be
        # inert - confirm whether bagging was meant to be enabled.
        lgb_base = LGBMClassifier(n_estimators=150,
                                  objective='binary',
                                  random_state=1234,
                                  n_jobs=3,
                                  colsample_bytree=colsample_bytree,
                                  reg_alpha=reg_alpha,
                                  max_depth=max_depth,
                                  subsample=subsample)
        _params = {
            'max_depth': max_depth,
            'subsample': subsample,
            'colsample_bytree': colsample_bytree,
            'reg_alpha': reg_alpha,
        }
        lgb_base.fit(X_t, y_t)
        y_t_pre = lgb_base.predict(X_t)
        y_v_pre = lgb_base.predict(X_v)
        f1_t_each = f1_score(y_t, y_t_pre, average='micro')
        f1_v_each = f1_score(y_v, y_v_pre, average='micro')
        f1_t_total.append(f1_t_each)
        f1_v_total.append(f1_v_each)

        print(_params)
        # `with` guarantees the handle is closed even if the write fails.
        with open(params_log, 'a', encoding='utf-8') as params_file:
            print(_params['max_depth'], _params['subsample'],
                  _params['colsample_bytree'], _params['reg_alpha'],
                  file=params_file)

        print(f1_t_each, f1_v_each)
        with open(scores_log, 'a', encoding='utf-8') as scores_file:
            print(f1_t_each, ',', f1_v_each, file=scores_file)
    return f1_t_total, f1_v_total
def baseline_xiong(self, profile: Profile, shared: Storage, logger: Logger,
                   converted):
    """Fit a default LGBMClassifier on hand-crafted statistics and log its scores.

    ``converted[0]`` is used as the label vector; ``converted[1]``,
    ``converted[2]`` and ``converted[3]`` are reduced over their last axis
    into per-sample statistics.  The first 70% of the samples train the
    model, the remainder validates it.  Accuracy plus per-class precision
    and recall are written to ``logger``; nothing is returned.

    NOTE(review): the ``a_`` / ``m_`` / ``g_`` prefixes suggest accelerometer,
    magnetometer and gyroscope channels shaped (sample, axis, time) -
    confirm against the caller.
    """

    def magnitude_std(signal):
        # Std over the last axis of the 3-component L2 norm, kept as a column.
        norm = np.sqrt(signal[:, 0, :]**2 + signal[:, 1, :]**2 +
                       signal[:, 2, :]**2)
        return norm.std(-1)[:, np.newaxis]

    first, second, third = converted[1], converted[2], converted[3]
    columns = (
        first.std(-1),
        third.mean(-1),
        third.std(-1),
        (second >= 0.0).sum(-1).astype(np.float32),
        first.mean(-1),
        magnitude_std(first),
        magnitude_std(second),
    )
    features = np.concatenate(columns, axis=1)
    labels = converted[0]

    # Chronological 70/30 split: no shuffling, the tail is the validation set.
    split = int(labels.shape[0] * 0.7)
    classifier = LGBMClassifier()
    classifier.fit(features[:split], labels[:split])
    validate_y = labels[split:]
    predict_y = classifier.predict(features[split:])

    logger.info('Xiong')
    logger.info(f'Accuracy: {accuracy_score(validate_y, predict_y)}')
    logger.info(
        f'Precision: {precision_score(validate_y, predict_y, average=None)}')
    logger.info(
        f'Recall: {recall_score(validate_y, predict_y, average=None)}')
def criteo_gdbtlr(X_idx, X_value, y):
    """Train a GBDT + logistic-regression model and print its training metrics.

    A LightGBM classifier is fitted on ``X_idx``; the leaf reached in every
    tree is one-hot encoded and fed to a logistic regression.  The fitted
    GBDT is pickled to ``gbdtlr_model1.pt`` under the module-level
    ``pwd_path``.  All metrics are computed on the training data itself.

    Parameters
    ----------
    X_idx : object exposing ``.values`` (e.g. a DataFrame)
        Feature matrix used for training.
    X_value : any
        Unused; kept for interface compatibility.
    y : object exposing ``.values``
        Binary labels.
    """
    import pickle

    import numpy as np
    from lightgbm.sklearn import LGBMClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, roc_auc_score

    X_idx = X_idx.values.tolist()
    y = y.values.tolist()
    num_leaves = 31
    model = LGBMClassifier(num_leaves=num_leaves)
    model.fit(X_idx, y)
    model_path = os.path.join(pwd_path, 'gbdtlr_model1.pt')

    # One leaf index per (sample, tree).
    leaf_index = np.asarray(model.predict(X_idx, pred_leaf=True))
    y_pred_gbdt = model.predict(X_idx, pred_leaf=False)

    print("gbdt train acc:", model.score(X_idx, y))
    # AUC needs a ranking score; hard 0/1 labels collapse the ROC curve to a
    # single operating point, so use the positive-class probability.
    print('gbdt auc:', roc_auc_score(y, model.predict_proba(X_idx)[:, 1]))
    print('gbdt train acc:', accuracy_score(y, y_pred_gbdt))

    # Save the model (the target directory must already exist).
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)

    # One-hot encode the leaves: tree t owns columns
    # [t * num_leaves, (t + 1) * num_leaves).  Leaf indices are 0-based, so
    # no "- 1" offset is applied; the previous offset sent leaf 0 of the
    # first tree to column -1 and let neighbouring trees share a column.
    n_samples, n_trees = leaf_index.shape
    transformed_matrix = np.zeros((n_samples, n_trees * num_leaves),
                                  dtype=np.int64)
    columns = np.arange(n_trees) * num_leaves + leaf_index
    transformed_matrix[np.arange(n_samples)[:, np.newaxis], columns] = 1

    lr_model = LogisticRegression()
    lr_model.fit(transformed_matrix, y)
    y_pred_lr = lr_model.predict(transformed_matrix)
    print("truth_y:", y[:100], 'y_pred_lr:', y_pred_lr[:100])
    s = roc_auc_score(y, lr_model.predict_proba(transformed_matrix)[:, 1])
    print('auc:', s)
def score(params, skf=skf, sample_weight=sample_weight):
    """Cross-validate one hyper-parameter candidate and report its log-loss.

    ``params`` supplies ``max_depth``, ``subsample``, ``colsample_bytree``
    and ``num_leaves``; the two integer-valued ones are cast with ``int``.
    For every fold produced by ``skf`` over ``self.X`` / ``self.y`` a
    500-tree classifier is fitted with the fold's sample weights, then
    weighted accuracy and weighted log-loss are measured on the held-out part.

    Returns a dict with the mean log-loss under ``'loss'``, ``STATUS_OK``
    under ``'status'`` and the (mean, min, max) accuracy under
    ``'localCV_acc'``.
    """
    # NOTE(review): LightGBM only applies `subsample` when `subsample_freq`
    # > 0 (default 0) - confirm this parameter is actually being tuned.
    model_kwargs = {
        "max_depth": int(params["max_depth"]),
        "subsample": params["subsample"],
        "colsample_bytree": params['colsample_bytree'],
        "num_leaves": int(params['num_leaves']),
        "n_jobs": -2,
    }
    clf = LGBMClassifier(n_estimators=500, learning_rate=0.05, **model_kwargs)

    fold_accuracy = []
    fold_logloss = []
    for train, val in skf.split(self.X, self.y):
        X_train, y_train = self.X[train], self.y[train]
        X_val, y_val = self.X[val], self.y[val]
        weight_train, weight_val = sample_weight[train], sample_weight[val]

        clf.fit(X_train,
                y_train,
                sample_weight=weight_train,
                eval_sample_weight=[weight_val],
                eval_set=[(X_val, y_val)],
                eval_metric="logloss",
                early_stopping_rounds=0,
                verbose=False)

        fold_accuracy.append(
            accuracy_score(y_val, clf.predict(X_val),
                           sample_weight=weight_val))
        fold_logloss.append(
            log_loss(y_val, clf.predict_proba(X_val),
                     sample_weight=weight_val))
        # Collecting best_iteration_ per fold was tried and disabled here:
        # n_estimators=0 makes .fit() fail.

    score_acc = (np.mean(fold_accuracy), np.min(fold_accuracy),
                 np.max(fold_accuracy))
    return {
        'loss': np.mean(fold_logloss),
        'status': STATUS_OK,
        'localCV_acc': score_acc,
    }
def lgb(x_train, y_train, x_val, y_val):
    """Fit a fixed-configuration LGBMClassifier and return its validation F1.

    Parameters
    ----------
    x_train, y_train
        Training features and labels.
    x_val, y_val
        Validation features and labels.

    Returns
    -------
    float
        ``f1_score`` with its default settings, computed on the validation set.
    """
    # Local name differs from the function so the function is not shadowed.
    model = LGBMClassifier(n_estimators=1000,
                           max_depth=10,
                           subsample=0.7,
                           colsample_bytree=0.7,
                           learning_rate=0.01,
                           random_state=2020)
    model.fit(x_train, y_train)
    predictions = model.predict(x_val)
    # scikit-learn metrics take (y_true, y_pred); the original passed them
    # reversed, which swaps precision and recall.
    return f1_score(y_val, predictions)
def find_best_cv(self):
    """Compare the fold counts in ``self.n_folds_list`` by validation accuracy.

    ``Util.split_cv`` is called first with ``self.X``, ``self.y``, the fold
    counts and ``ORG_DATA_DIR``.  For each fold count the per-fold CSV files
    are read from ``<ORG_DATA_DIR>/n_folds_<k>/``, a 20-tree binary
    LGBMClassifier is trained with the sample weights from ``self._calc_w``,
    and the accuracy on the fold's validation part is recorded.  Mean and
    variance of the accuracies are logged per fold count and again in a
    final summary.  Nothing is returned.
    """
    Util.split_cv(self.X, self.y, self.n_folds_list, ORG_DATA_DIR)

    def read_table(cv_dir, template, fold):
        # The fold number is substituted after the path has been joined.
        path = os.path.join(cv_dir, template) % fold
        return pd.read_csv(path, header=None, sep="\t").values

    def as_vector(column):
        # The label files load as a two-dimensional array; flatten it.
        n_rows, _ = column.shape
        return column.reshape(n_rows, )

    mean_per_setting = []
    var_per_setting = []
    for num_of_fold in self.n_folds_list:
        print("============")
        logger.info("==evaluating %s fold==" % num_of_fold)
        cv_dir = os.path.join(ORG_DATA_DIR, "n_folds_%s/" % num_of_fold)
        fold_accuracy = []
        for fold in range(num_of_fold):
            logger.info("loading %s th cv data in %s folds" %
                        (fold, num_of_fold))
            X_train = read_table(cv_dir, "X_train_%s.csv", fold)
            X_val = read_table(cv_dir, "X_val_%s.csv", fold)
            y_train = as_vector(read_table(cv_dir, "y_train_%s.csv", fold))
            y_val = as_vector(read_table(cv_dir, "y_val_%s.csv", fold))
            logger.info("end loading %s th cv data in %s folds" %
                        (fold, num_of_fold))
            logger.info("X_train.shape: %s %s" % X_train.shape)
            logger.info("X_val.shape: %s %s" % X_val.shape)
            logger.info("y_train.shape: %s" % y_train.shape)
            logger.info("y_val.shape: %s" % y_val.shape)

            clf = LGBMClassifier(objective="binary", n_estimators=20)
            weight_train = self._calc_w(y_train)
            clf.fit(X_train,
                    y_train,
                    sample_weight=weight_train,
                    eval_set=[(X_val, y_val)],
                    verbose=True)
            y_pred = clf.predict(X_val)
            logger.info("acc socore: %s folds, %s iteration" %
                        (num_of_fold, fold))
            fold_accuracy.append(accuracy_score(y_val, y_pred))

        mean_acc = np.mean(fold_accuracy)
        var_acc = np.var(fold_accuracy)
        logger.info("mean acc score of %s folds is %s" %
                    (num_of_fold, mean_acc))
        mean_per_setting.append(mean_acc)
        logger.info("variance of acc score of %s folds is %s" %
                    (num_of_fold, var_acc))
        var_per_setting.append(var_acc)

    summary = zip(self.n_folds_list, mean_per_setting, var_per_setting)
    for folds, mean_acc, var_acc in summary:
        logger.info("===%s_folds=== mean acc:%s, var acc: %s " %
                    (folds, mean_acc, var_acc))
def get_ntree():
    """Sweep ``n_estimators`` from 10 to 800 in steps of 10 and log the F1 scores.

    Each model is fitted on the module-level ``X_t`` / ``y_t`` and scored on
    both ``X_t`` / ``y_t`` and ``X_v`` / ``y_v``.  Every pair of scores is
    appended to ``lgbbase_810_2.txt`` in the hard-coded ``accu_save``
    directory.

    Returns
    -------
    tuple of (list, list)
        Micro-averaged F1 on the training data and on the validation data,
        one entry per tree count, in sweep order.
    """
    log_path = 'D:\\workspace python\\contest\\accu_save\\' + 'lgbbase_810_2.txt'
    f1_t_total, f1_v_total = [], []
    for ntree in range(10, 810, 10):
        lgb_base = LGBMClassifier(n_estimators=ntree,
                                  objective='binary',
                                  random_state=1234,
                                  n_jobs=2,
                                  colsample_bytree=0.8,
                                  reg_alpha=1,
                                  max_depth=15,
                                  subsample=0.8)
        print('此时 ntree = %s' % ntree)
        lgb_base.fit(X_t, y_t)
        y_t_pre = lgb_base.predict(X_t)
        y_v_pre = lgb_base.predict(X_v)
        f1_t_each = f1_score(y_t, y_t_pre, average='micro')
        f1_v_each = f1_score(y_v, y_v_pre, average='micro')
        f1_t_total.append(f1_t_each)
        f1_v_total.append(f1_v_each)
        # `with` guarantees the handle is closed even if the write fails.
        with open(log_path, 'a', encoding='utf-8') as log_file:
            print(f1_t_each, ',', f1_v_each, file=log_file)
    return f1_t_total, f1_v_total
def evaluate_age():
    """Train an age classifier on the combined feature CSV and persist it.

    The ``age`` column is the target; ``user_id``, ``age`` and ``gender`` are
    removed from the feature frame.  20% of the rows are held out, used both
    as the early-stopping evaluation set and for the printed classification
    report.  The fitted model is dumped to ``data/lgb_age``.
    """
    frame = pd.read_csv(
        'data/combine_feature/part-00000-380aaa4b-c838-43f4-8cb7-80164a4256f2-c000.csv'
    )
    target = frame.age.values
    frame = frame.drop(columns=['user_id', 'age', 'gender'])
    print(frame.shape)

    X_train, X_test, y_train, y_test = train_test_split(frame,
                                                        target,
                                                        test_size=0.2)
    model = LGBMClassifier(n_estimators=200,
                           num_leaves=100,
                           feature_fraction=0.75,
                           bagging_fraction=0.75,
                           learning_rate=0.1)
    model.fit(X_train,
              y_train,
              eval_set=[(X_test, y_test)],
              early_stopping_rounds=5)
    print(classification_report(y_test, model.predict(X_test)))
    joblib.dump(model, 'data/lgb_age')
class PHSICAdasynLGBM(BaseEstimator):
    """
    An estimator upsampling minority classes, finding a small set of stable
    biomarkers, and fitting a gradient boosting model over them

    Parameters
    ----------
    n_features : int, optional (default=30)
        Max. number of biomarkers (important features) to be selected
    adasyn_neighbors : int, optional (default=10)
        K neighbors for ADASYN upsampling algorithm; values <= 0 skip the
        upsampling step
    B : int, optional (default=20)
        Block size for Block HSIC Lasso
    M : int, optional (default=10)
        Max allowed permutations of samples for Block HSIC Lasso
    hsic_splits : int, optional (default=3)
        number of folds for verifying feature stability
    stability_minimum_across_splits : int, optional (default=2)
        a feature is kept when it is selected in strictly more than this
        many splits; the bar is lowered by one at a time while no feature
        qualifies
    feature_neighbor_threshold : float, optional (default=0.4)
        threshold for considering neighbors of important features in
        stability check
    """

    def __init__(self,
                 n_features=30,
                 adasyn_neighbors=10,
                 B=20,
                 M=10,
                 hsic_splits=3,
                 stability_minimum_across_splits=2,
                 feature_neighbor_threshold=0.4):
        self.n_features = n_features
        self.adasyn_neighbors = adasyn_neighbors
        self.M = M
        self.B = B
        self.hsic_splits = hsic_splits
        # Stored under the constructor argument's own name: BaseEstimator's
        # get_params() (and therefore clone() and repr()) looks it up that way.
        self.feature_neighbor_threshold = feature_neighbor_threshold
        self.stability_minimum_across_splits = stability_minimum_across_splits

    @property
    def neighbor_threshold(self):
        """Backward-compatible alias of ``feature_neighbor_threshold``."""
        return self.feature_neighbor_threshold

    @neighbor_threshold.setter
    def neighbor_threshold(self, value):
        self.feature_neighbor_threshold = value

    def fit(self, X, y):
        """Select stable features, upsample with ADASYN and fit the classifier.

        Sets ``hsic_idx_`` (selected column indices of ``X``) and ``clf_``
        (the fitted LGBMClassifier).  Returns ``self``.

        Raises
        ------
        ValueError
            If HSIC Lasso proposes no feature in any split.
        """
        if X.shape[1] > 10000:
            # Very wide input: keep only columns a quick GBM actually uses.
            clf = LGBMClassifier(n_estimators=1000, n_jobs=-1).fit(X, y)
            relevant = np.where(clf.feature_importances_ > 0)[0]
            print("relevant ft:", len(relevant), "/", X.shape[1])
        else:
            relevant = np.arange(X.shape[1])

        X_relevant = X[:, relevant]
        sss = StratifiedShuffleSplit(n_splits=self.hsic_splits,
                                     random_state=42)
        idxs = []
        for train_index, _ in sss.split(X, y):
            hsic_lasso2 = HSICLasso()
            hsic_lasso2.input(X_relevant[train_index], y[train_index])
            hsic_lasso2.classification(self.n_features, B=self.B, M=self.M)

            # not just best features - get their neighbors (similar features) too
            all_ft_idx = np.array(hsic_lasso2.get_index(), dtype=int).ravel()
            for i in range(len(all_ft_idx)):
                idx = np.array(hsic_lasso2.get_index_neighbors(
                    feat_index=i, num_neighbors=10), dtype=int)
                # Scores stay floating point: an int cast would truncate
                # them before the comparison with a fractional threshold.
                score = np.array(hsic_lasso2.get_index_neighbors_score(
                    feat_index=i, num_neighbors=10), dtype=float)
                idx = idx[np.where(score > self.feature_neighbor_threshold)[0]]
                all_ft_idx = np.concatenate((all_ft_idx, idx))
            all_ft_idx = np.unique(all_ft_idx)
            # Map positions within `relevant` back to columns of X.
            idxs.append(relevant[all_ft_idx])

        featurecandidates = np.unique(np.concatenate(idxs))
        if len(featurecandidates) == 0:
            # Without this guard the relaxation loop below would never end.
            raise ValueError("HSIC Lasso selected no features in any split")
        occurrences = np.sum(
            [np.isin(featurecandidates, idx) for idx in idxs], axis=0)

        self.hsic_idx_ = []
        stability_concession = 0
        while len(self.hsic_idx_) == 0:
            required = (self.stability_minimum_across_splits -
                        stability_concession)
            self.hsic_idx_ = list(featurecandidates[occurrences > required])
            # failed to find commonly occurring features - reduce threshold
            stability_concession += 1

        print("HSIC done.", len(self.hsic_idx_), "(out of ",
              len(featurecandidates), " candidates)")

        print("Upsampling with ADASYN... (features: " +
              str(len(self.hsic_idx_)) + ")")
        sm = ADASYN(sampling_strategy="minority",
                    n_neighbors=self.adasyn_neighbors,
                    n_jobs=-1)
        sX, sy = X[:, self.hsic_idx_], y
        if self.adasyn_neighbors > 0:
            try:
                sX, sy = sm.fit_resample(X[:, self.hsic_idx_], y)
                # One further pass per remaining class; each pass upsamples
                # whichever class is currently the smallest.
                for _ in range(len(np.unique(y)) - 1):
                    sX, sy = sm.fit_resample(sX, sy)
            except Exception as err:
                # Best effort: keep whatever was resampled so far, but say so.
                print("ADASYN stopped early:", err)
        print("ADASYN done. Starting clf")

        self.clf_ = LGBMClassifier(n_estimators=1000).fit(sX, sy)
        print("done")
        return self

    def predict_proba(self, X):
        """Return class probabilities using only the selected features."""
        return self.clf_.predict_proba(X[:, self.hsic_idx_])

    def predict(self, X):
        """Return class predictions using only the selected features."""
        return self.clf_.predict(X[:, self.hsic_idx_])
subsample_for_bin=800, n_jobs=4) # # specify your configurations as a dict # param_grid_xgboost={'min_child_samples':np.arange(10,100,10)} # start_time=time.clock() # grid_lgb=GridSearchCV(lgb,param_grid_xgboost,cv=5,scoring='accuracy') # grid_lgb.fit(X,y) # endtime=time.clock() # print('score',grid_lgb.grid_scores_) # print('Xgboost_best_estimator_',grid_lgb.best_estimator_) # print('Xgboost_best_score_',grid_lgb.best_score_) # print('Xgboost_best_params_',grid_lgb.best_params_) # print("run_time",endtime-start_time) start_time = time.clock() score_all = 0 kf = KFold(n_splits=5, shuffle=True) for train, test in kf.split(X): print(len(train), len(test)) X_train = X[train] X_test = X[test] y_train = y[train] y_test = y[test] lgb.fit(X_train, y_train) preds = lgb.predict(X_test) score = accuracy_score(y_test, preds) print("score:", score) score_all = score_all + score print("score_all", score_all / 5) endtime = time.clock() print("run_time", endtime - start_time)
def zip_process(zip):
    """Look up ``zip_dic`` with the integer part of *zip* before any '-'."""
    prefix = str(zip).split('-')[0]
    return zip_dic[int(prefix)]


# Encode the raw columns with their per-column converters.
df['gender'] = df['gender'].apply(gender_process)
df['age'] = df['age'].apply(age_process)
df['genres'] = df['genres'].apply(genres_process)
df['zip'] = df['zip'].apply(zip_process)

# Separate the target, then drop it together with the columns that are not
# used as features.
y = df['label'].values
df.drop(columns=[
    'user_id', 'movie_id', 'rating', 'timestamp', 'title', 'label'
],
        inplace=True)
x = df.values

# Ordered 90/10 split: the last tenth of the rows is the test set.
length = int(len(x) * 0.9)
x_train, x_test = x[:length], x[length:]
y_train, y_test = y[:length], y[length:]

model = LGBMClassifier(n_estimators=1200)
model.fit(x_train, y_train)
prediction = model.predict(x_test)
acc = accuracy_score(y_test, prediction)
print(acc)
subsample_freq=1, subsample=0.8, colsample_bytree=0.8, min_child_weight=5, random_state=2020, n_jobs=24, ) clf.fit( X_trn, Y_trn, eval_set=[(X_val, Y_val)], early_stopping_rounds=500, verbose=200, ) print('val_acc: {:.5f}'.format( accuracy_score(Y_val, clf.predict(X_val)))) oof[val_idx] = clf.predict_proba(X_val) sub += clf.predict_proba(X_sub) / skf.n_splits print('cv_acc : {:.5f}'.format(accuracy_score(Y_train, oof.argmax(axis=1)))) print( classification_report(Y_train, oof.argmax(axis=1), target_names=lbl.classes_)) oof_files = [ 'bert_oof0_2', 'cnn_oof0_2', 'mlp_oof0_2', 'bert_oof1', 'cnn_oof1', 'mlp_oof1' ] sub_files = [
print(
    "Select best LGB model with n_estimators = {} with best_score={}".format(
        best_clf.best_params_['n_estimators'], best_clf.best_score_))
#%%
# Test-set AUC for a grid of tree counts and L2 regularisation strengths.
for a in [100, 300, 600, 1000]:
    for b in [0.0001, 0.001, 0.01, 0.1, 1, 10]:
        LBMclf = LGBMClassifier(random_state=50,
                                n_jobs=-1,
                                n_estimators=a,
                                reg_lambda=b)
        LBMclf.fit(train_data, train_label)
        # ROC AUC is a ranking metric: score it on the positive-class
        # probability, as the 'roc_auc' scorer in the grid search does,
        # rather than on hard 0/1 predictions.
        print(
            "The reuslt AUC_ROC of the lightGBM with n_estimators={} and reg_lambda={} on test data is"
            .format(a, b),
            roc_auc_score(test_label.tolist(),
                          LBMclf.predict_proba(test_data)[:, 1].tolist()))
#%%
# Make the model with the specified regularization parameter
clf = LogisticRegression()
best_clf = GridSearchCV(clf,
                        scoring='roc_auc',
                        cv=5,
                        n_jobs=-1,
                        param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100]})
best_clf.fit(train_data, train_label)
print("Select best Logistic Regression model with C = {} with best_score={}".
      format(best_clf.best_params_['C'], best_clf.best_score_))
#%%
for c in [0.001, 0.01, 0.1, 1, 10, 100]:
    test_clf = LogisticRegression(C=c)
drop_first=True)).reshape(X.shape[0]) #veriyi bölme X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.21, random_state=42) #modeli kurma LightGBM = LGBMClassifier() LightGBM.fit(X_train, y_train) #modelden tahmin tapma pred = LightGBM.predict(X_test) #ilkel başarı değeri print(f"İlkel başarı değeri : {accuracy_score(y_test,pred)}") #hiperparametre seçelim hiperparams = { 'max_depth': np.arange(2, 10, 2), 'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1], 'n_estimators': np.arange(200, 1000, 200) } model_cv = GridSearchCV(LightGBM, hiperparams, cv=10, n_jobs=-1).fit(X_train, y_train) print(model_cv.best_params_)
# Hold out 20% of the data for testing (fixed seed for reproducibility).
x_train, x_test, y_train, y_test = train_test_split(data_features_part,
                                                    data_target_part,
                                                    test_size=0.2,
                                                    random_state=2021)
## Import the LightGBM model
from lightgbm.sklearn import LGBMClassifier
# Define the LightGBM model
clf = LGBMClassifier()
# Train the LightGBM model on the training set
clf.fit(x_train, y_train)
# Predict on both the training set and the test set with the trained model
train_predict = clf.predict(x_train)
test_predict = clf.predict(x_test)
from sklearn import metrics
# Evaluate with accuracy: the share of samples that were predicted correctly.
# The messages name the model that is actually trained here.
print('The train accuracy of the LightGBM model is:',
      metrics.accuracy_score(y_train, train_predict))
print('The test accuracy of the LightGBM model is:',
      metrics.accuracy_score(y_test, test_predict))
# Confusion matrix.  scikit-learn expects (y_true, y_pred), which puts the
# true classes on the rows and the predicted classes on the columns.
confusion_matrix_result = metrics.confusion_matrix(y_test, test_predict)
print('The confusion matrix result:\n', confusion_matrix_result)
# Visualise the result as a heat map
def classes(data, label, test):
    """Fit a default LGBMClassifier on (data, label) and predict *test*.

    ``estimate(model, data)`` is called after predicting, before the
    predictions are returned.
    """
    model = LGBMClassifier()
    model.fit(data, label)
    predictions = model.predict(test)
    estimate(model, data)
    return predictions
n_folds = 3
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)
# Materialise the folds once; every base model is trained on the same splits.
l = list(skf.split(x_train, y_train))

# One column of out-of-fold predictions per base model.
train_sets = np.zeros((x_train.shape[0], len(clfs)))
test_sets = np.zeros((x_test.shape[0], len(clfs)))
for j, clf in enumerate(clfs):
    # Train each base model in turn.
    print(j, clf)
    test_j = np.zeros((x_test.shape[0], len(l)))
    for i, (trn_idx, val_idx) in enumerate(l):
        # Hold out part i, train on the rest, and use the predictions for
        # part i as its new feature.
        trn_x, trn_y = x_train[trn_idx], y_train[trn_idx]
        val_x, val_y = x_train[val_idx], y_train[val_idx]
        clf.fit(trn_x, trn_y)
        # predict() returns one label per row, so "[:, 1]" cannot index it;
        # the positive-class probability comes from predict_proba().
        y_submission = clf.predict_proba(val_x)[:, 1]
        train_sets[val_idx, j] = y_submission
        test_j[:, i] = clf.predict_proba(x_test)[:, 1]
    # For the test set, the mean over the k fold models is the new feature.
    test_sets[:, j] = test_j.mean(axis=1)

print(train_sets)
print(test_sets)

# Second-level model trained on the stacked predictions.
lgb.fit(train_sets, y_train)
result = lgb.predict_proba(test_sets)[:, 1]
# Min-max scale the final scores.
result = (result - result.min()) / (result.max() - result.min())
print(result)
print(result.shape)
class PHSICAdasynLGBM(BaseEstimator):
    """
    An estimator upsampling minority classes, finding a small set of stable
    biomarkers, and fitting a gradient boosting model over them

    Parameters
    ----------
    n_features : int, optional (default=30)
        Max. number of biomarkers (important features) to be selected
    adasyn_neighbors : int, optional (default=10)
        K neighbors for ADASYN upsampling algorithm; values <= 0 skip the
        upsampling step
    B : int, optional (default=20)
        Block size for Block HSIC Lasso
    M : int, optional (default=10)
        Max allowed permutations of samples for Block HSIC Lasso
    hsic_splits : int, optional (default=5)
        number of folds for verifying feature stability
    feature_neighbor_threshold : float, optional (default=0.4)
        threshold for considering neighbors of important features in
        stability check
    """

    def __init__(self,
                 n_features=30,
                 adasyn_neighbors=10,
                 B=20,
                 M=10,
                 hsic_splits=5,
                 feature_neighbor_threshold=0.4):
        self.n_features = n_features
        self.adasyn_neighbors = adasyn_neighbors
        self.M = M
        self.B = B
        self.hsic_splits = hsic_splits
        # Stored under the constructor argument's own name: BaseEstimator's
        # get_params() (and therefore clone() and repr()) looks it up that way.
        self.feature_neighbor_threshold = feature_neighbor_threshold

    @property
    def neighbor_threshold(self):
        """Backward-compatible alias of ``feature_neighbor_threshold``."""
        return self.feature_neighbor_threshold

    @neighbor_threshold.setter
    def neighbor_threshold(self, value):
        self.feature_neighbor_threshold = value

    def fit(self, X, y):
        """Select stable features, upsample with ADASYN and fit the classifier.

        Sets ``hsic_idx_`` (the features selected in every split) and
        ``clf_`` (the fitted LGBMClassifier).  Returns ``self``.
        """
        sss = StratifiedShuffleSplit(n_splits=self.hsic_splits,
                                     random_state=42)
        idxs = []
        for train_index, _ in sss.split(X, y):
            hsic_lasso2 = HSICLasso()
            hsic_lasso2.input(X[train_index], y[train_index])
            hsic_lasso2.classification(self.n_features, B=self.B, M=self.M)

            # not just best features - get their neighbors (similar features) too
            all_ft_idx = np.array(hsic_lasso2.get_index(), dtype=int).ravel()
            for i in range(len(all_ft_idx)):
                idx = np.array(hsic_lasso2.get_index_neighbors(
                    feat_index=i, num_neighbors=10), dtype=int)
                # Scores stay floating point: an int cast would truncate
                # them before the comparison with a fractional threshold.
                score = np.array(hsic_lasso2.get_index_neighbors_score(
                    feat_index=i, num_neighbors=10), dtype=float)
                idx = idx[np.where(score > self.feature_neighbor_threshold)[0]]
                all_ft_idx = np.concatenate((all_ft_idx, idx))
            all_ft_idx = np.unique(all_ft_idx)
            idxs.append(all_ft_idx)

            # Running intersection: keep only features chosen in every split.
            if len(idxs) == 1:
                self.hsic_idx_ = idxs[0]
            else:
                self.hsic_idx_ = np.intersect1d(idxs[-1], self.hsic_idx_)
        print("HSIC done.", len(self.hsic_idx_))

        print("Upsampling with ADASYN... (features: " +
              str(len(self.hsic_idx_)) + ")")
        sm = ADASYN(sampling_strategy="minority",
                    n_neighbors=self.adasyn_neighbors,
                    n_jobs=-1)
        sX, sy = X[:, self.hsic_idx_], y
        if self.adasyn_neighbors > 0:
            try:
                sX, sy = sm.fit_resample(X[:, self.hsic_idx_], y)
                # One further pass per remaining class; each pass upsamples
                # whichever class is currently the smallest.
                for _ in range(len(np.unique(y)) - 1):
                    sX, sy = sm.fit_resample(sX, sy)
            except Exception as err:
                # Best effort: keep whatever was resampled so far, but say so.
                print("ADASYN stopped early:", err)
        print("ADASYN done. Starting clf")

        self.clf_ = LGBMClassifier(n_estimators=1000).fit(sX, sy)
        print("done")
        return self

    def predict_proba(self, X):
        """Return class probabilities using only the selected features."""
        return self.clf_.predict_proba(X[:, self.hsic_idx_])

    def predict(self, X):
        """Return class predictions using only the selected features."""
        return self.clf_.predict(X[:, self.hsic_idx_])
#"""# 调参后训练 time1 = time.time() lgb = LGBMClassifier(boosting_type='gbdt', learning_rate=l_r, n_estimators=n_e, num_leaves=num_leaves, subsample=ss, colsample_bytree=c_b, objective='binary', random_state=10) lgb = joblib.load('LightGBM_model.pkl') dtime1 = time.time() - time1 # 预测测试集1 time2 = time.time() test_pre = lgb.predict(data['test_data']) dtime2 = time.time() - time2 test_preb = lgb.predict_proba(data['test_data']) acc = metrics.accuracy_score(data['test_label'], test_pre) t = [i[1] for i in test_preb] auc = metrics.roc_auc_score(data['test_label'], t) recall = metrics.recall_score(data['test_label'], test_pre) prec = metrics.precision_score(data['test_label'], test_pre) f1 = metrics.f1_score(data['test_label'], test_pre) print(lgb.get_params()) print(lgb.feature_importances_) print('''train time: %d predict time: %d acc: %f auc: %f recall: %f
def loadDataFrame():
    """Return the iris data as a DataFrame with a trailing 'target' column."""
    iris = load_iris()
    frame = pd.DataFrame(iris.data, columns=iris.feature_names)
    frame['target'] = pd.Series(iris.target)
    return frame


if __name__ == "__main__":
    iris_df = loadDataFrame()
    # Show how many samples each class has.
    value_counts = iris_df['target'].value_counts()
    print(value_counts)

    # Every column but the last is a feature; the last one is the label.
    X_iris = iris_df.iloc[:, :-1].values
    y_iris = iris_df.iloc[:, -1].values
    X_train, X_test, y_train, y_test = train_test_split(X_iris,
                                                        y_iris,
                                                        test_size=0.3,
                                                        stratify=y_iris)

    model = LGBMClassifier()
    model.fit(X_train, y_train)
    print(model.get_params())

    y_pred = model.predict(X_test)
    # Accuracy is the fraction of matching predictions.
    print("accuracy: %s" % np.mean(y_pred == y_test))
def multi_machine_learing_models(data_train, data_cv):
    """Train six classifiers, save each one, and report a weighted vote.

    Labels are binarised as ``'good'`` -> 0 and anything else -> 1; the
    ``URL`` and ``label`` columns are dropped from the features.  Each model
    is fitted, scored on ``data_cv`` (accuracy, precision, recall), added to
    the vote with its weight and dumped into the ``classifier_model``
    directory.  A sample is finally voted positive when its weighted vote
    total reaches 3.

    Parameters
    ----------
    data_train, data_cv : DataFrame
        Frames containing ``URL`` and ``label`` columns plus the features.
    """
    import os

    print('正在训练模型!')
    # NOTE(review): the validation rows are appended to the training data, so
    # every metric printed below is measured on rows the models were fitted
    # on - confirm this is intended.
    data_train = pd.concat([data_train, data_cv], axis=0)
    y_train = data_train['label'].apply(lambda x: 0 if x == 'good' else 1)
    y_test = data_cv['label'].apply(lambda x: 0 if x == 'good' else 1)
    X_train = data_train.drop(['URL', 'label'], axis=1)
    X_test = data_cv.drop(['URL', 'label'], axis=1)

    # Paths are built with os.path.join instead of 'classifier_model\c_...'
    # literals, whose '\c' is an invalid escape sequence.
    model_dir = 'classifier_model'
    # (accuracy banner, estimator, vote weight, file name)
    candidates = [
        ('\nbayes模型的准确度:', BernoulliNB(), 1, 'c_bayes.model'),
        ('LGBMClassifier模型的准确度:',
         LGBMClassifier(n_estimators=200, objective='binary'), 3,
         'c_LGB.model'),
        # 100 boosting iterations
        ('ada模型的准确度:', AdaBoostClassifier(n_estimators=100), 2,
         'c_ada.model'),
        ('\nrf模型的准确度:',
         RandomForestClassifier(n_estimators=100, oob_score=True), 3,
         'c_rf.model'),
        ('\ndecision_tree模型的准确度:', tree.DecisionTreeClassifier(), 2,
         'c_decision_tree.model'),
        ('\nLogisticRegression模型的准确度:', LogisticRegression(), 2,
         'c_lgs.model'),
    ]

    vote = [0] * len(y_test)
    for banner, model, weight, file_name in candidates:
        model.fit(X_train, y_train)
        print(banner, model.score(X_test, y_test))
        predict = model.predict(X_test)
        # Each positive prediction adds the model's weight to the tally.
        vote = [weight * p + v for p, v in zip(predict, vote)]
        precision = metrics.precision_score(y_test, predict)
        recall = metrics.recall_score(y_test, predict)
        print("precison:", precision)
        print("recall:", recall)
        joblib.dump(model, os.path.join(model_dir, file_name))

    print('\n投票结果:')
    # A total of 3 is reached by one weight-3 model alone or by any two models.
    vote_r = [1 if total >= 3 else 0 for total in vote]
    precision = metrics.precision_score(y_test, vote_r)
    recall = metrics.recall_score(y_test, vote_r)
    acc = metrics.accuracy_score(y_test, vote_r)
    print('准确度:', acc)
    print("precison:", precision)
    print("recall:", recall)