Example #1
def main():
    X = make_X(200)
    target = make_target(X)

    real_labels(X, target)

    clf_list = [
        # nn.MLPClassifier(hidden_layer_sizes=(2,), random_state=0),
        # nn.MLPClassifier(hidden_layer_sizes=(3,), random_state=0),
        # nn.MLPClassifier(hidden_layer_sizes=(4,), random_state=0),
        # nn.MLPClassifier(hidden_layer_sizes=(10,), random_state=0),
        lgbm.LGBMClassifier(n_estimators=200, random_state=0),
        xgb.XGBClassifier(n_estimators=200, max_depth=5, random_state=0),

        # nn.MLPClassifier(hidden_layer_sizes=(200,)),
        # nn.MLPClassifier(hidden_layer_sizes=(300,)),
        # nn.MLPClassifier(hidden_layer_sizes=(200, 100)),
        # xgb.XGBClassifier(n_estimators=30, max_depth=3),
        # xgb.XGBClassifier(n_estimators=5, max_depth=3),
        # ensemble.AdaBoostClassifier(n_estimators=30, random_state=0)
    ]
    for clf in clf_list:
        prd_labels(X, target, clf)

    plt.show()
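
The helpers make_X, make_target, real_labels, and prd_labels are not shown. A minimal sketch of what they could look like, assuming a 2-D toy dataset and matplotlib scatter plots (all four bodies are assumptions, not the original implementations):

import matplotlib.pyplot as plt
import numpy as np


def make_X(n, seed=0):
    # assumption: n random points in the unit square
    rng = np.random.default_rng(seed)
    return rng.uniform(size=(n, 2))


def make_target(X):
    # assumption: label 1 inside a circle around (0.5, 0.5), else 0
    return (((X - 0.5) ** 2).sum(axis=1) < 0.1).astype(int)


def real_labels(X, target):
    # scatter plot of the ground-truth labels
    plt.figure()
    plt.scatter(X[:, 0], X[:, 1], c=target)
    plt.title("true labels")


def prd_labels(X, target, clf):
    # fit the classifier and plot its predictions on the same points
    clf.fit(X, target)
    plt.figure()
    plt.scatter(X[:, 0], X[:, 1], c=clf.predict(X))
    plt.title(type(clf).__name__)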
Example #2
def get_model(algorithm_name):
    if algorithm_name == 'knn':  # pick the classifier that matches the algorithm name
        clf = KNeighborsClassifier(n_neighbors=5)
    elif algorithm_name == 'DT':
        clf = DecisionTreeClassifier()
    elif algorithm_name == 'SGD':
        clf = SGDClassifier()
    elif algorithm_name == 'MNB':
        clf = MultinomialNB()
    elif algorithm_name == 'GBM':
        clf = gbm.LGBMClassifier()
    elif algorithm_name == 'GNB':
        clf = GaussianNB()
    elif algorithm_name == 'BNB':
        clf = BernoulliNB()
    else:
        print('wrong input!')  # unknown algorithm name: report and exit
        exit()
    return clf
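
get_model only builds the estimator; fitting is left to the caller. A usage sketch under the assumption that the referenced classifiers are imported as follows (the iris data is purely illustrative):

import lightgbm as gbm
from sklearn.datasets import load_iris
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
clf = get_model('GBM')
print(cross_val_score(clf, X, y, cv=5).mean())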
Example #3
    def run(self):
        self.output().makedirs()

        X = abhishek_feats.AbhishekFeatures().load('train', self.fold)
        y = xval_dataset.BaseDataset().load('train', self.fold).squeeze()
        cls = lgbsklearn.LGBMClassifier(num_leaves=1024,
                                        n_estimators=1024,
                                        is_unbalance=True)
        X_tr, X_va, y_tr, y_va = model_selection.train_test_split(
            X, y, test_size=0.05)
        cls.fit(X_tr,
                y_tr,
                sample_weight=core.weight_from(y_tr),
                eval_set=[(X_va, y_va)],  # eval_set expects a list of (X, y) pairs
                early_stopping_rounds=10)

        validX = abhishek_feats.AbhishekFeatures().load('valid', self.fold)
        y = xval_dataset.BaseDataset().load('valid', self.fold).squeeze()
        y_pred = cls.predict_proba(validX)[:, 1]

        scorestr = "{:s} = {:f}".format(repr(self), core.score_data(y, y_pred))
        print(colors.green | colors.bold | scorestr)

        np.save('cache/abhishek/lgbm/{:d}/valid.npy'.format(self.fold), y_pred)

        testX = abhishek_feats.AbhishekFeatures().load('test', None)
        pred = cls.predict_proba(testX)[:, 1]
        np.save('cache/abhishek/lgbm/{:d}/test.npy'.format(self.fold), pred)

        with self.output().open('w') as f:
            cols = abhishek_feats.AbhishekFeatures().load('valid',
                                                          self.fold,
                                                          as_df=True).columns
            v = pandas.Series(cls.feature_importances_,
                              index=cols).sort_values()
            v.to_csv(f)
            f.write("\n\n")
            f.write(scorestr)
            f.write("\n")
Example #4
def get_model(algorithm_name):
    if algorithm_name == 'knn_5':  # pick the classifier that matches the algorithm name
        clf = KNeighborsClassifier(n_neighbors=5)
    elif algorithm_name == 'RF_1000':
        clf = RandomForestClassifier(n_estimators=1000,
                                     random_state=0,
                                     n_jobs=7)
    elif algorithm_name == 'DT':
        clf = DecisionTreeClassifier()
    elif algorithm_name == 'SGD':
        clf = SGDClassifier()
    elif algorithm_name == 'MNB':
        clf = MultinomialNB()
    elif algorithm_name == 'GBM':
        clf = gbm.LGBMClassifier()
    elif algorithm_name == 'GNB':
        clf = GaussianNB()
    elif algorithm_name == 'BNB':
        clf = BernoulliNB()
    else:
        print('wrong input!')  # unknown algorithm name: report and exit
        exit()
    return clf
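
Examples #2 and #4 share the same if/elif dispatch. An alternative sketch that replaces the chain with a dict lookup and raises instead of exiting (same imports assumed as in the examples above; behavior is otherwise equivalent):

MODELS = {
    'knn_5': lambda: KNeighborsClassifier(n_neighbors=5),
    'RF_1000': lambda: RandomForestClassifier(n_estimators=1000,
                                              random_state=0,
                                              n_jobs=7),
    'DT': DecisionTreeClassifier,
    'SGD': SGDClassifier,
    'MNB': MultinomialNB,
    'GBM': gbm.LGBMClassifier,
    'GNB': GaussianNB,
    'BNB': BernoulliNB,
}


def get_model(algorithm_name):
    # look up the factory and build a fresh estimator per call
    try:
        return MODELS[algorithm_name]()
    except KeyError:
        raise ValueError('unknown algorithm: %s' % algorithm_name)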
Example #5
def train_save(pred_period=20, is_high=True, is_clf=False):

    data = gen_dataset(is_high=is_high, is_clf=is_clf, pred_period=pred_period)

    if is_clf:
        _, y_train = data["train"]
        # negative/positive ratio; note the classifiers below use a hand-tuned 0.1 instead
        scale_pos_weight = sum(y_train == 0) / sum(y_train == 1)

    if not is_clf:
        models = [
            lgbm.LGBMRegressor(n_estimators=300,
                               num_leaves=100,
                               max_depth=8,
                               random_state=0),
            xgb.XGBRegressor(n_estimators=300, max_depth=5, random_state=0)
        ]
    else:
        models = [
            lgbm.LGBMClassifier(n_estimators=300,
                                scale_pos_weight=0.1,
                                num_leaves=100,
                                max_depth=8,
                                random_state=0),
            xgb.XGBClassifier(
                n_estimators=300,
                scale_pos_weight=0.1,
                max_depth=5,
                random_state=0,
            )
        ]
    y_pred_list = train(data, models, is_clf=is_clf)

    # save model
    for model in models:
        save_model(model, pred_period, is_high)

    return y_pred_list
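
scale_pos_weight in both LightGBM and XGBoost is the weight applied to the positive class, conventionally set to the negative/positive count ratio as computed in train_save above. A small worked example of that arithmetic (the numbers are illustrative):

import numpy as np

y_train = np.array([0] * 90 + [1] * 10)  # 90 negatives, 10 positives
scale_pos_weight = sum(y_train == 0) / sum(y_train == 1)
print(scale_pos_weight)  # 9.0: each positive example counts as much as nine negatives
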
def generate_model_package(training_data_path, id_cols, target_cols,
                           fields_config_file, param_grid, model_name,
                           target_var):
    """
            training_data_path
            ,id_cols
            ,target_cols
            ,fields_config_file
            ,param_grid
            ,model_name
            ,target_var
    """

    pyspark_app_nm = "train_" + model_name + "_" + secrets.token_hex(nbytes=4)

    logging.info("Starting process: " + pyspark_app_nm)

    # create the Spark session and context for parallel learning
    logging.info("Instantiating pyspark.")
    app_pyspark_conf = SparkConf()
    app_pyspark_conf.setAppName(pyspark_app_nm)
    #     app_pyspark_conf.set('spark.executor.memory',spark_executor_memory)
    #     app_pyspark_conf.set('spark.executor.cores', spark_executor_cores)

    spark = SparkSession.builder.config(conf=app_pyspark_conf).getOrCreate()
    sc = spark.sparkContext

    # load data
    logging.info("Beginning data load.")
    training_df = pd.read_parquet(training_data_path, engine='pyarrow')
    # sampling down
    #     training_df_1 = training_df[training_df[target_var]==1].sample(20)
    #     training_df_0 = training_df[training_df[target_var]==0].sample(40)
    #     training_df = pd.concat([training_df_0,training_df_1])

    # column handling
    logging.info("Creating column lists")
    all_cols = training_df.columns.tolist()
    x_cols = list(set(all_cols) - set(target_cols + id_cols))

    # dataframe setup
    X = training_df[x_cols]
    y = training_df[target_cols]

    # create holdout data
    logging.info("Creating holdout data")
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        y[target_var],
                                                        test_size=0.1,
                                                        stratify=y[target_var])

    wts = y_test.value_counts()
    wtrat = wts[0] / wts[1]  # negative/positive ratio (only used by the commented-out scale_pos_weight below)

    # instantiate model
    gbm = lgb.LGBMClassifier()

    fit_params = {
        "eval_set": [(x_test, y_test)],
        "eval_metric": ear_stop_eval_mtr,  # early-stopping metric, assumed to be a module-level setting
        "early_stopping_rounds": ear_stop_rnds  # early-stopping patience, assumed to be a module-level setting
        #         ,"scale_pos_weight": wtrat
    }

    grid_search = SparkGridSearchCV(sc,
                                    estimator=gbm,
                                    param_grid=param_grid,
                                    fit_params=fit_params)

    grid_search.fit(x_train, y_train)

    best_model = grid_search.best_estimator_
    optimized_parameters = best_model.get_params()

    # create confusion dataframe
    y_true = pd.DataFrame(y_test).reset_index()
    y_true.columns = ["CUSTOMER_KEY", "Y_TRUE"]

    # predict takes only the feature matrix; the stray y_test argument was being
    # consumed as LightGBM's raw_score flag
    y_pred = pd.DataFrame(best_model.predict(x_test), columns=["Y_PRED"])

    confusion_data = pd.merge(left=y_true,
                              right=y_pred,
                              left_index=True,
                              right_index=True)

    # summary statistics and metrics; the AUC/curve metrics below use hard 0/1
    # predictions, so predict_proba scores would give more informative curves

    fr_col_nam_map = {0: "feature_nm", 1: "feature_importance"}
    feature_ranking = pd.DataFrame(
        [X.columns, best_model.feature_importances_]).T
    feature_ranking = feature_ranking.rename(columns=fr_col_nam_map)
    feature_ranking = feature_ranking.sort_values("feature_importance",
                                                  ascending=False)

    metrics = {
        "precision_score":
        precision_score(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "roc_auc_score":
        roc_auc_score(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "classification_report":
        classification_report(confusion_data['Y_TRUE'],
                              confusion_data['Y_PRED']),
        "confusion_matrix":
        confusion_matrix(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "accuracy_score":
        accuracy_score(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "precision_recall_curve":
        precision_recall_curve(confusion_data['Y_TRUE'],
                               confusion_data['Y_PRED']),
        "recall_score":
        recall_score(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "roc_curve":
        roc_curve(confusion_data['Y_TRUE'], confusion_data['Y_PRED'])
    }

    output = {
        "model_name": model_name,  # string with model name
        "model_class": best_model,  # grid_search.best_estimator_
        "optimized_parameters": optimized_parameters,  # best_model.get_params()
        "feature_ranking": feature_ranking,  # best_model.feature_importances_
        "metrics": metrics,
        "confusion_data": confusion_data
    }

    return output
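
A hypothetical invocation of generate_model_package (every path, column name, and grid value below is a placeholder, not taken from the original project):

if __name__ == "__main__":
    package = generate_model_package(
        training_data_path="data/train.parquet",  # placeholder path
        id_cols=["CUSTOMER_KEY"],
        target_cols=["CHURN_FLAG"],
        fields_config_file=None,  # not used in the function body shown
        param_grid={"num_leaves": [31, 63], "learning_rate": [0.05, 0.1]},
        model_name="churn_lgbm",
        target_var="CHURN_FLAG",
    )
    print(package["metrics"]["classification_report"])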