def main():
    """Build a synthetic dataset, plot its true labels, then fit and plot
    predictions for each classifier under comparison.

    Relies on module-level helpers (make_X, make_target, real_labels,
    prd_labels) and the lgbm / xgb / plt imports at the top of the file.
    """
    X = make_X(200)
    target = make_target(X)
    # Show the ground-truth labelling once, before any model output.
    real_labels(X, target)
    # Classifiers to compare; fixed random_state keeps runs reproducible.
    # (Previously a large slab of commented-out alternatives lived here —
    # removed as dead code.)
    clf_list = [
        lgbm.LGBMClassifier(n_estimators=200, random_state=0),
        xgb.XGBClassifier(n_estimators=200, max_depth=5, random_state=0),
    ]
    for clf in clf_list:
        prd_labels(X, target, clf)
    plt.show()
def get_model(algorithm_name):
    """Return an unfitted classifier selected by its short algorithm name.

    Parameters
    ----------
    algorithm_name : str
        One of 'knn', 'DT', 'SGD', 'MNB', 'GBM', 'GNB', 'BNB'.

    Returns
    -------
    A scikit-learn-compatible estimator instance.

    Raises
    ------
    ValueError
        For an unrecognized name. (The original printed 'wrong input!' and
        killed the whole process with exit(), which is hostile to callers.)
    """
    # Lazy factories: nothing is instantiated until the chosen key is called.
    factories = {
        'knn': lambda: KNeighborsClassifier(n_neighbors=5),
        'DT': lambda: DecisionTreeClassifier(),
        'SGD': lambda: SGDClassifier(),
        'MNB': lambda: MultinomialNB(),
        # NOTE(review): alias 'gbm' — confirm it is the lightgbm import.
        'GBM': lambda: gbm.LGBMClassifier(),
        'GNB': lambda: GaussianNB(),
        'BNB': lambda: BernoulliNB(),
    }
    try:
        return factories[algorithm_name]()
    except KeyError:
        raise ValueError(
            'wrong input! unknown algorithm: {!r}'.format(algorithm_name)
        ) from None
def run(self):
    """Train a LightGBM classifier on Abhishek features for this CV fold,
    score the fold's validation split, and persist predictions plus a
    feature-importance report.

    Reads self.fold; writes .npy prediction files under cache/abhishek/lgbm/
    and a text report to self.output(). Looks like a luigi-style Task.run —
    TODO confirm against the enclosing class.
    """
    self.output().makedirs()
    # Load the training features/labels for this fold.
    X = abhishek_feats.AbhishekFeatures().load('train', self.fold)
    y = xval_dataset.BaseDataset().load('train', self.fold).squeeze()
    cls = lgbsklearn.LGBMClassifier(num_leaves=1024, n_estimators=1024, is_unbalance=True)
    # Carve out 5% of the training fold as an early-stopping eval set.
    X_tr, X_va, y_tr, y_va = model_selection.train_test_split( X, y, test_size=0.05)
    # NOTE(review): eval_set is conventionally a list of (X, y) tuples; a
    # bare tuple may rely on lightgbm normalizing it — confirm for the
    # pinned lightgbm version (early_stopping_rounds as a fit kwarg is also
    # removed in newer releases).
    cls.fit(X_tr, y_tr, sample_weight=core.weight_from(y_tr), eval_set=(X_va, y_va), early_stopping_rounds=10)
    # Score this fold's validation split and cache the probabilities.
    validX = abhishek_feats.AbhishekFeatures().load('valid', self.fold)
    y = xval_dataset.BaseDataset().load('valid', self.fold).squeeze()
    y_pred = cls.predict_proba(validX)[:, 1]
    scorestr = "{:s} = {:f}".format(repr(self), core.score_data(y, y_pred))
    print(colors.green | colors.bold | scorestr)
    np.save('cache/abhishek/lgbm/{:d}/valid.npy'.format(self.fold), y_pred)
    # Predict the test set (the variable is named trainX but loads 'test').
    trainX = abhishek_feats.AbhishekFeatures().load('test', None)
    pred = cls.predict_proba(trainX)[:, 1]
    np.save('cache/abhishek/lgbm/{:d}/test.npy'.format(self.fold), pred)
    # Write sorted feature importances followed by the score line.
    with self.output().open('w') as f:
        cols = abhishek_feats.AbhishekFeatures().load('valid', self.fold, as_df=True).columns
        v = pandas.Series(cls.feature_importances_, index=cols).sort_values()
        v.to_csv(f)
        f.write("\n\n")
        f.write(scorestr)
        f.write("\n")
def get_model(algorithm_name):
    """Return an unfitted classifier selected by its short algorithm name.

    Parameters
    ----------
    algorithm_name : str
        One of 'knn_5', 'RF_1000', 'DT', 'SGD', 'MNB', 'GBM', 'GNB', 'BNB'.

    Returns
    -------
    A scikit-learn-compatible estimator instance.

    Raises
    ------
    ValueError
        For an unrecognized name. (The original printed 'wrong input!' and
        killed the whole process with exit(), which is hostile to callers.)
    """
    # Lazy factories: nothing is instantiated until the chosen key is called.
    factories = {
        'knn_5': lambda: KNeighborsClassifier(n_neighbors=5),
        'RF_1000': lambda: RandomForestClassifier(
            n_estimators=1000, random_state=0, n_jobs=7),
        'DT': lambda: DecisionTreeClassifier(),
        'SGD': lambda: SGDClassifier(),
        'MNB': lambda: MultinomialNB(),
        # NOTE(review): alias 'gbm' — confirm it is the lightgbm import.
        'GBM': lambda: gbm.LGBMClassifier(),
        'GNB': lambda: GaussianNB(),
        'BNB': lambda: BernoulliNB(),
    }
    try:
        return factories[algorithm_name]()
    except KeyError:
        raise ValueError(
            'wrong input! unknown algorithm: {!r}'.format(algorithm_name)
        ) from None
def train_save(pred_period=20, is_high=True, is_clf=False):
    """Train LightGBM and XGBoost models on the generated dataset, save
    them, and return the predictions produced during training.

    Parameters
    ----------
    pred_period : int
        Horizon passed to gen_dataset.
    is_high : bool
        Forwarded to gen_dataset / save_model.
    is_clf : bool
        If True, train classifiers with class-imbalance weighting;
        otherwise train regressors.

    Returns
    -------
    Whatever train() returns for the fitted models (y_pred_list).
    """
    data = gen_dataset(is_high=is_high, is_clf=is_clf, pred_period=pred_period)
    if is_clf:
        _, y_train = data["train"]
        # Weight positives by the observed negative/positive ratio.
        # BUG FIX: this value was previously computed and then ignored —
        # a hard-coded scale_pos_weight=0.1 was passed instead.
        scale_pos_weight = sum(y_train == 0) / sum(y_train == 1)
        models = [
            lgbm.LGBMClassifier(n_estimators=300,
                                scale_pos_weight=scale_pos_weight,
                                num_leaves=100, max_depth=8, random_state=0),
            xgb.XGBClassifier(n_estimators=300,
                              scale_pos_weight=scale_pos_weight,
                              max_depth=5, random_state=0),
        ]
    else:
        models = [
            lgbm.LGBMRegressor(n_estimators=300, num_leaves=100,
                               max_depth=8, random_state=0),
            xgb.XGBRegressor(n_estimators=300, max_depth=5, random_state=0),
        ]
    y_pred_list = train(data, models, is_clf=is_clf)
    # Persist each fitted model for later inference.
    for model in models:
        save_model(model, pred_period, is_high)
    return y_pred_list
def generate_model_package(training_data_path, id_cols, target_cols,
                           fields_config_file, param_grid, model_name,
                           target_var):
    """Grid-search a LightGBM classifier on Spark and package the result.

    Parameters
    ----------
    training_data_path : str
        Path to a parquet file containing id, feature and target columns.
    id_cols : list[str]
        Identifier columns (excluded from the feature matrix).
    target_cols : list[str]
        Target columns (excluded from the feature matrix).
    fields_config_file : object
        Unused here — NOTE(review): confirm whether it can be dropped.
    param_grid : dict
        Parameter grid for SparkGridSearchCV.
    model_name : str
        Used to name the Spark application and the output package.
    target_var : str
        Column of target_cols used as the label.

    Returns
    -------
    dict
        Keys: model_name, model_class (best estimator), optimized_parameters,
        feature_ranking, metrics, confusion_data.
    """
    pyspark_app_nm = "train_" + model_name + "_" + secrets.token_hex(nbytes=4)
    logging.info("Starting process: " + pyspark_app_nm)

    # Create the Spark session/context used for parallel grid search.
    logging.info("Instantiating pyspark.")
    app_pyspark_conf = SparkConf()
    app_pyspark_conf.setAppName(pyspark_app_nm)
    spark = SparkSession.builder.config(conf=app_pyspark_conf).getOrCreate()
    sc = spark.sparkContext

    # Load training data.
    logging.info("Beginning data load.")
    training_df = pd.read_parquet(training_data_path, engine='pyarrow')

    # Feature columns = everything that is neither a target nor an id.
    logging.info("Creating column lists")
    all_cols = training_df.columns.tolist()
    x_cols = list(set(all_cols) - set(target_cols + id_cols))

    X = training_df[x_cols]
    y = training_df[target_cols]

    # Stratified 10% holdout used for early stopping and evaluation.
    logging.info("Creating holdout data")
    x_train, x_test, y_train, y_test = train_test_split(
        X, y[target_var], test_size=0.1, stratify=y[target_var])
    # Class-imbalance ratio; informational only while scale_pos_weight stays
    # commented out of fit_params below.
    wts = y_test.value_counts()
    wtrat = (wts[0] / wts[1])

    gbm = lgb.LGBMClassifier()
    # ear_stop_eval_mtr / ear_stop_rnds look like module-level settings —
    # NOTE(review): confirm they are defined at import time.
    fit_params = {
        "eval_set": [(x_test, y_test)],
        "eval_metric": ear_stop_eval_mtr,
        "early_stopping_rounds": ear_stop_rnds
        # ,"scale_pos_weight": wtrat
    }
    grid_search = SparkGridSearchCV(sc, estimator=gbm, param_grid=param_grid,
                                    fit_params=fit_params)
    grid_search.fit(x_train, y_train)
    best_model = grid_search.best_estimator_
    optimized_parameters = best_model.get_params()

    # Build a per-customer frame of true vs. predicted labels.
    y_true = pd.DataFrame(y_test)
    y_true = y_true.reset_index()
    y_true.columns.values[0] = "CUSTOMER_KEY"
    y_true.columns.values[1] = "Y_TRUE"
    # BUG FIX: predict() takes only X. The old second positional argument
    # (y_test.tolist()) was silently consumed as LGBMClassifier.predict's
    # raw_score parameter; a non-empty list is truthy, so "predictions"
    # were raw margin scores instead of class labels.
    y_pred = pd.DataFrame(best_model.predict(x_test), columns=["Y_PRED"])
    confusion_data = pd.merge(left=y_true, right=y_pred,
                              left_index=True, right_index=True)

    # Feature importances, largest first.
    fr_col_nam_map = {0: "feature_nm", 1: "feature_importance"}
    feature_ranking = pd.DataFrame(
        [X.columns, best_model.feature_importances_]).T
    feature_ranking = feature_ranking.rename(columns=fr_col_nam_map)
    # BUG FIX: rank by importance, not alphabetically by feature name.
    feature_ranking = feature_ranking.sort_values("feature_importance",
                                                  ascending=False)

    # NOTE(review): precision_recall_curve and roc_curve are fed hard 0/1
    # labels; for meaningful curves they should receive predict_proba
    # scores — left as-is to preserve the output contract.
    metrics = {
        "precision_score":
            precision_score(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "roc_auc_score":
            roc_auc_score(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "classification_report":
            classification_report(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "confusion_matrix":
            confusion_matrix(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "accuracy_score":
            accuracy_score(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "precision_recall_curve":
            precision_recall_curve(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "recall_score":
            recall_score(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "roc_curve":
            roc_curve(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
    }

    output = {
        "model_name": model_name,                       # string with model name
        "model_class": best_model,                      # grid_search.best_estimator_
        "optimized_parameters": optimized_parameters,   # best_model.get_params()
        "feature_ranking": feature_ranking,             # best_model.feature_importances_
        "metrics": metrics,
        "confusion_data": confusion_data,
    }
    return output