Example #1
class SklearnGeneralModel(ModelBase):
    def __init__(self, is_normalize, model, searchCV=False):
        self.is_normalize = is_normalize
        self.model = model
        self.searchCV = searchCV

    def build_model(self, config_args=None):

        if config_args is None:
            config_args = {}

        if not self.searchCV:
            self.model = self.model(**config_args)
        else:
            self.model = BayesSearchCV(estimator=self.model(), **config_args)

    def train(self, x, y):
        if self.is_normalize:
            self.scaler = Normalizer()
            x = self.scaler.fit_transform(x)

        with warnings.catch_warnings():
            warnings.simplefilter('ignore', UserWarning)
            self.model.fit(x, y)

    def predict(self, x):
        if self.is_normalize:
            x = self.scaler.transform(x)
        return self.model.predict(x)

    def feature_based_metrics(self, columns=None, index=None):
        # best_estimator_ only exists after a BayesSearchCV run; fall back to
        # the plain estimator when the wrapper was built with searchCV=False.
        estimator = self.model.best_estimator_ if self.searchCV else self.model
        feature_importance = estimator.feature_importances_
        feature_importance = feature_importance / np.sum(feature_importance)
        return pd.DataFrame(feature_importance, index=columns, columns=index).T
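A minimal usage sketch for this wrapper, assuming a scikit-learn estimator class and an illustrative search space (the estimator, ranges, and iteration counts below are assumptions, not from the original):

from sklearn.ensemble import RandomForestClassifier

clf = SklearnGeneralModel(is_normalize=True, model=RandomForestClassifier, searchCV=True)
clf.build_model(config_args={
    'search_spaces': {'n_estimators': (50, 300), 'max_depth': (2, 10)},
    'n_iter': 20,
    'cv': 3,
})
# clf.train(X_train, y_train) followed by clf.predict(X_test)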
Example #2
    def test_it_solves_the_easy_dataset_when_tuned(self):
        baseline = BayesSearchCV(style.StyleRankerBaseline,
                                 style.STYLE_RANKER_HYPER_PARAMETERS,
                                 n_iter=16,
                                 n_points=2,
                                 cv=4,
                                 n_jobs=1)
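        # 16 Bayesian-optimization steps, 2 candidate settings evaluated per batch, 4-fold CV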
        baseline.fit(self.train_easy[['action0', 'action1']],
                     self.train_easy['label'])
        predictions = baseline.predict(self.dev_easy[['action0', 'action1']])

        # check that the accuracy is 100%
        self.assertEqual(
            metrics.accuracy_score(y_true=self.dev_easy['label'],
                                   y_pred=predictions), 1.)
def nested_cv(
    estimator,
    search_spaces,
    X,
    y,
    scoring="neg_mean_squared_error",
    inner_cv=5,
    outer_cv=10,
    random_state=42,
):
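    # Nested CV: hold out 10% as a final test set, tune with BayesSearchCV on the
    # inner folds, then estimate generalization by cross-validating the tuned
    # search object on the outer folds.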
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.1,
        random_state=random_state,
    )

    opt = BayesSearchCV(
        estimator=estimator,
        search_spaces=search_spaces,
        scoring=scoring,  # use the caller's scoring (default neg MSE, which the RMSE maths below assume)
        n_iter=25,
        cv=inner_cv,
        verbose=0,
        n_jobs=4,
        random_state=random_state,
    )
    opt.fit(X_train, y_train)
    print("Best params:\n%s" % opt.best_params_)
    inner_cv_rmse = np.round(np.sqrt(opt.best_score_ * -1), 2)
    print(f"Inner loop RMSE: {inner_cv_rmse}")

    nested_score = cross_val_score(
        opt,
        X_train,
        y_train,
        cv=outer_cv,
        scoring=scoring,
        n_jobs=4,
    )
    outer_cv_rmse = np.round(np.sqrt(nested_score.mean() * -1), 2)
    print(f"Outer loop RMSE: {outer_cv_rmse}")

    y_pred = opt.predict(X_test)
    rmse = np.round(mean_squared_error(y_test, y_pred, squared=False), 2)
    print(f"Validation RMSE: {rmse}")
    return opt
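A hedged example of how this helper might be called; the Ridge regressor, its alpha range, and the pre-loaded X/y arrays are illustrative assumptions, not part of the original:

from sklearn.linear_model import Ridge
from skopt.space import Real

# X and y are assumed to be an already-loaded feature matrix and target vector
opt = nested_cv(
    estimator=Ridge(),
    search_spaces={'alpha': Real(1e-3, 1e+2, prior='log-uniform')},
    X=X,
    y=y,
)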
Example #4
def get_best_SVM_params(X_train, y_train, X_test, y_test):
    search_spaces = {
        "kernel": Categorical(["linear", "poly", "rbf", "sigmoid"]),
        "C": Real(1e-1, 1e+1, "uniform"),
        "gamma": Real(1e-4, 1e+4, "log-uniform")
    }

    best_accuracy = 0
    best_model = None
    for i in range(5):
        grid = BayesSearchCV(SVC(), search_spaces, n_iter=10, cv=3, n_jobs=-1)
        grid.fit(X_train, y_train)
        accuracy = accuracy_score(y_test, grid.predict(X_test))
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = grid

    return best_model.best_params_
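Example #5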
class Regressor(BaseEstimator):
    def __init__(self, regressor, params):
        self.model = BayesSearchCV(estimator=regressor,
                                   search_spaces=params,
                                   scoring='neg_mean_squared_error',  # 'mean_squared_error' is not a valid sklearn scorer name
                                   cv=TimeSeriesSplit(n_splits=3),
                                   n_jobs=3,
                                   n_iter=10,
                                   verbose=3000,
                                   refit=True,
                                   random_state=42)

    def fit(self, X, y):
        self.model.fit(X, y)
        filename = '/home/mejri/Desktop/TELECOM_PARISTECH_MASTER_X_DATASCIENCE/MACHINE_LEARNING_BUSINESS_CASE/rossmann-store-sales/finalized_model.sav'
        pickle.dump(self.model, open(filename, 'wb'))

    def predict(self, X):
        yres = self.model.predict(X)
        return yres
Example #6
    def test_it_solves_scruples_easy_when_tuned(self):
        baseline = BayesSearchCV(self.BASELINE_MODEL,
                                 self.BASELINE_HYPER_PARAMETERS,
                                 n_iter=16,
                                 n_points=2,
                                 cv=4,
                                 n_jobs=1,
                                 refit=True)

        # train the model, tuning hyper-parameters
        _, train_features, train_labels, train_label_scores =\
            self.dataset.train
        baseline.fit(train_features, train_labels)

        # predict with the model on dev
        _, dev_features, dev_labels, dev_label_scores =\
            self.dataset.dev
        predictions = baseline.predict(dev_features)

        # check that the accuracy is 100%
        self.assertEqual(
            metrics.accuracy_score(y_true=dev_labels, y_pred=predictions), 1.)
Example #7
def get_best_ensemble_params(X_train, y_train, X_test, y_test):
    search_spaces = {
        "max_samples": Real(0.5, 1, "uniform"),
        "max_features": Real(0.5, 1, "uniform"),
        "kernel": Categorical(["linear", "poly", "rbf", "sigmoid"]),
        "C": Real(1e-1, 1e+1, "uniform"),
    }

    best_accuracy = 0
    best_model = None
    for i in range(5):
        grid = BayesSearchCV(SVMEnsemble(),
                             search_spaces,
                             n_iter=10,
                             cv=3,
                             n_jobs=-1)
        grid.fit(X_train, y_train)
        accuracy = accuracy_score(y_test, grid.predict(X_test))
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = grid

    return best_model.best_params_
Example #8
        'min_child_weight': (1, 10),
        'subsample': (0.5, 1.0, 'log-uniform'),
        'colsample_bytree': (0.5, 1.0, 'log-uniform'),
        'n_estimators': (100, 1000)
    },
    n_iter=32,
    random_state=42,
    cv=3
)

xgb_opt.fit(X_train, Y_train)

xgb_opt.score(X_train, Y_train)

#Accuracy of the model on the validation set
y_pred = xgb_opt.predict(X_test)
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred))

#Load the testSpike data
X_test = mat['testSpike']
X_test.shape

#Pre-processing of the testSpike data
l = 2
X = numpy.array([])
X = numpy.mean(X_test[:, 0: 1], axis = 1)[numpy.newaxis].T
for i in range(2, len(X_test[0])+1):
  if l != prev_data:
#     print(l,i)
    a = numpy.mean(X_test[:, i-l: i], axis = 1)[numpy.newaxis].T
Example #9
def main():
    mlflow.start_run(run_name=NAME)

    if "X_train.pkl" not in os.listdir():
        print("processing the data")
        X, y, encoder = preprocess_data("TOTAL_TRAIN.csv", process_cat=False)
        print(X.shape)

        with open(f"label_encoder_{NAME}.pkl", "wb") as f:
            pickle.dump(encoder, f)
        print(
            f"##################### The shape of X is {X.shape} #######################"
        )
        y = y.astype("int")
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.15,
                                                            random_state=15,
                                                            stratify=y)
        with open("X_train.pkl", "wb") as f:
            pickle.dump(X_train, f)
        with open("X_test.pkl", "wb") as f:
            pickle.dump(X_test, f)
        with open("y_train.pkl", "wb") as f:
            pickle.dump(y_train, f)
        with open("y_test.pkl", "wb") as f:
            pickle.dump(y_test, f)

        print(X_train.shape)

    else:
        with open("X_train.pkl", "rb") as f:
            X_train = pickle.load(f)
        with open("X_test.pkl", "rb") as f:
            X_test = pickle.load(f)
        with open("y_train.pkl", "rb") as f:
            y_train = pickle.load(f)
        with open("y_test.pkl", "rb") as f:
            y_test = pickle.load(f)
        with open(f"label_encoder_XGB1704.pkl", "rb") as f:
            encoder = pickle.load(f)
        print("######### fitting the cat encoder ############")

    cols_cat = ["ruido", "CODIGO_POSTAL", "ZONA_METROPOLITANA", "CALIDAD_AIRE"]
    cols_float = [col for col in X_train.columns if col not in cols_cat]
    X_train[cols_float] = X_train[cols_float].astype("float")
    X_test[cols_float] = X_test[cols_float].astype("float")

    labs_names = [c for c in encoder.classes_]

    model = LGBMClassifier(
        class_weight="balanced",
        objective="multiclass",  # LightGBM's multiclass (softmax) objective
        n_jobs=-1,
        random_state=100,
        silent=True,
    )

    if MODE != "INDIVIDUAL":
        params = {
            "reg_alpha": (1e-3, 5.0, "log-uniform"),
            "reg_lambda": (1e-2, 50.0, "log-uniform"),
            "n_estimators": (600, 4500),
            "learning_rate": (5e-3, 1.0, "log-uniform"),
            "num_leaves": (20, 80),
            "boosting_type": ["gbdt", "goss"],
            "colsample_bytree": (0.1, 1.0, "uniform"),
            "subsample": (0.1, 1.0, "uniform"),
            "min_child_samples": (1, 25),
            "min_child_weight": (1e-6, 0.1, "log-uniform"),
        }

        print(params)

        cb = CatBoostEncoder(cols=cols_cat)
        X_train = cb.fit_transform(X_train, y_train)
        X_test = cb.transform(X_test)
        fit_params = {
            ### fit params ###
            "eval_set": [(X_test, y_test)],
            "eval_metric": lgb_f1_score,
            "early_stopping_rounds": 300,
        }

        pipeline = Pipeline(steps=[("clas_encoder",
                                    CatBoostEncoder(
                                        cols=cols_cat)), ("model", model)])

        best_model = BayesSearchCV(
            model,
            params,
            n_iter=N_ITER,
            n_points=1,
            cv=cv,
            scoring=f2_scorer,
            random_state=100,
            optimizer_kwargs={"n_initial_points": 10},
            fit_params=fit_params,
        )

    def on_step(optim_result):
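        # skopt callback: runs after every optimization step; returning True stops
        # the search early (here, once the best CV score reaches 0.98).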
        score = best_model.best_score_
        results = best_model.cv_results_
        try:
            results_df = pd.DataFrame(results)
            results_df.to_csv(f"results_{NAME}.csv", header=True, index=False)
            print(
                f"############ {results_df.shape[0]} trials so far #################"
            )
            print(f"CV results so far: {results_df}")
        except Exception:
            print("Unable to convert cv results to pandas dataframe")
        mlflow.log_metric("best_score", score)
        with open(f"./best_{NAME}_params.pkl", "wb") as f:
            pickle.dump(best_model.best_params_, f)

        print("best score: %s" % score)
        if score >= 0.98:
            print("Interrupting!")
            return True

    print("fitting the model")
    if MODE != "INDIVIDUAL":
        print(X_train.dtypes)
        best_model.fit(X_train, y_train, callback=[on_step])
        with open(f"./best_{NAME}_model.pkl", "wb") as f:
            pickle.dump(best_model, f)
        preds = best_model.predict(X_test)
    else:
        if NAME not in os.listdir():
            os.mkdir(NAME)

        cat_encoder = CatBoostEncoder(cols=cols_cat)
        X_train = cat_encoder.fit_transform(X_train, y_train)
        X_test = cat_encoder.transform(X_test)
        best_model = BalancedBaggingClassifier(
            base_estimator=HistGradientBoostingClassifier(
                max_iter=3000,
                random_state=42,
                learning_rate=0.1,
                max_leaf_nodes=54,
                min_samples_leaf=2,
                scoring=f2_scorer,
                validation_fraction=0.1,
                n_iter_no_change=50,
            ),
            n_estimators=5,
            random_state=42,
            n_jobs=-1,
            max_features=0.7,
            sampling_strategy={5: int(dict(Counter(y_train))[5] * 0.11)},
        )
        best_model.fit(X_train, y_train)
        preds = best_model.predict(X_test)
        print(
            f'F1 SCORE IS {f1_score(y_test, preds, average="macro")}, precision is {precision_score(y_test, preds, average="macro")}, recall is {recall_score(y_test, preds, average="macro")}, accuracy is {accuracy_score(y_test, preds)}'
        )
        print(
            f"F2 SCORE IS {fbeta_score(y_test, preds, average='macro', beta=2)}"
        )
        print(
            f"F05 SCORE IS {fbeta_score(y_test, preds, average='macro', beta=0.5)}"
        )
        cm = confusion_matrix(y_test, preds)
        grafico_conf_matrix = print_confusion_matrix(cm,
                                                     class_names=labs_names)
        grafico_conf_matrix.savefig(f"{NAME}/norm_NO_PIPELINE")

        with open(f"best_model_{NAME}.pkl", "wb") as f:
            pickle.dump(best_model, f)

    print("logging metrics")
    mlflow.log_metrics(
        metrics={
            "f1": f1_score(y_test, preds, average="macro"),
            "precision": precision_score(y_test, preds, average="macro"),
            "recall": recall_score(y_test, preds, average="macro"),
            "accuracy": accuracy_score(y_test, preds),
            "f05": fbeta_score(y_test, preds, beta=0.5, average="macro"),
            "f2": fbeta_score(y_test, preds, beta=2, average="macro"),
        })
    if MODE != "INDIVIDUAL":
        best_params = best_model.best_params_
        for param in best_params.keys():
            mlflow.log_param(param, best_params[param])
    cm = confusion_matrix(y_test, preds)
    grafico_conf_matrix = print_confusion_matrix(cm, class_names=labs_names)
    grafico_conf_matrix.savefig(NAME)
    grafico_norm = print_confusion_matrix(cm,
                                          class_names=labs_names,
                                          normalize=False)
    grafico_norm.savefig(f"{NAME}_no_norm")
    mlflow.end_run()
                        X = np.asarray(train['rep'].values.tolist())
                        y = train['target'].values.astype('float64')

                        model.fit(X, y)
                        joblib.dump(model, f'./models/auc__{d}__{rep}__{fam_name}__model.pkl')
                    else:
                        model = joblib.load(f'./models/auc__{d}__{rep}__{fam_name}__model.pkl')

                    predictions = model.predict(np.asarray(validate['rep'].values.tolist()))

                    r_results[fam_name] = {
                            'roc_score':roc_auc_score(validate['target'], predictions),
                            'roc50_score':get_roc(validate['target'].values, predictions, 50)
                        }
                    print('roc_score', r_results[fam_name]['roc_score'])
                    print('roc50_score', r_results[fam_name]['roc50_score'])
                    if to_train:
                        if model_name != 'NaiveBayes':
                            r_params[fam_name] = model.best_params_
                            print(model.best_params_)
#
                except Exception as e:
                 'depth': Integer(1, 8),
                 'learning_rate': Real(0.01, 1.0, 'log-uniform'),
                 'random_strength': Real(1e-9, 10, 'log-uniform'),
                 'bagging_temperature': Real(0.0, 1.0),
                 'border_count': Integer(1, 255),
                 'l2_leaf_reg': Integer(2, 30),
                 'scale_pos_weight':Real(0.01, 1.0, 'uniform')}
cb_bs = BayesSearchCV(cb, cb_param_grid, scoring = 'roc_auc', n_iter = 100, n_jobs = 1,
                      return_train_score = False, refit = True, optimizer_kwargs = {'base_estimator': 'GP'}, 
                      random_state = 123)
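# optimizer_kwargs={'base_estimator': 'GP'} has skopt model the objective with a Gaussian-process surrogate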

cb_bs.fit(x_train, y_train)

y_probs = cb_bs.predict_proba(x_test)
y_probs = y_probs[:, 1]
y_pred = cb_bs.predict(x_test)

print(classification_report(y_test, y_pred))
print(roc_auc_score(y_test, y_probs)) ### 0.903

fpr, tpr, thresholds = roc_curve(y_test, y_probs)
plot_roc_curve(fpr, tpr)

# Find the best parameters
cb_bs.best_params_
# Use the parameters to re-run the model
cb_tuned = CatBoostClassifier(iterations = 1000, depth = 8,
                 learning_rate = 0.11574, random_strength = 1e-9,
                 bagging_temperature = 1.0,
                 border_count = 178,
                 l2_leaf_reg = 2,
            (
                "model",
                LGBMClassifier(n_jobs=-1, boosting_type="gbdt").set_params(
                    **{
                        k.replace("final_estimator__model__", ""): v
                        for k, v in params.items()
                    }),
            ),
        ]),
        verbose=1,
        n_jobs=-1,
        cv=3,
    )

    best_model = model.fit(X_train, y_train)
    preds = best_model.predict(X_test)
    print("logging metrics")
    mlflow.log_metrics(
        metrics={
            "f1": f1_score(y_test, preds, average="macro"),
            "precision": precision_score(y_test, preds, average="macro"),
            "recall": recall_score(y_test, preds, average="macro"),
            "accuracy": accuracy_score(y_test, preds),
            "f05": fbeta_score(y_test, preds, beta=0.5, average="macro"),
            "f2": fbeta_score(y_test, preds, beta=2, average="macro"),
        })

    best_params = params
    for param in best_params.keys():
        mlflow.log_param(param, best_params[param])
    cm = confusion_matrix(y_test, preds)
    'min_child_weight': (0, 5),
    'n_estimators': (50, 100),
    'scale_pos_weight': (1e-6, 500, 'log-uniform')
}

opt = BayesSearchCV(model,
                    params,
                    n_iter=5,
                    cv=schema,
                    refit=True,
                    scoring='f1')
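# refit=True refits the best parameter set on the full training data, so `opt` can be used directly for prediction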

# %%
X_train, y_train = train.drop(columns=['Prediction']).astype(
    np.float32), train.Prediction.astype(int)  # np.int was removed from recent NumPy
opt.fit(X_train, y_train)
context.io.save('xente_xgb', opt)

# %%
X_test, y_test = test.drop(columns=['Prediction']).astype(
    np.float32), test.Prediction.astype(int)
y_pred = opt.predict(X_test)

# %%
xente_sample_submission = context.io.load('xente_sample_submission').assign(
    Prediction=y_pred.astype(int))

context.io.save('xente_y_submission', xente_sample_submission)

# %%
Example #14
    'regressor__model__min_child_weight': (10, 500, 'log-uniform'),  # integer range, log-uniform prior
    'regressor__model__n_estimators': (1, 8),  # integer valued parameter
    'regressor__model__reg_alpha': (1, 8, 'log-uniform'),  # integer range, log-uniform prior
    'regressor__model__reg_lambda': (1, 8, 'log-uniform'),  # integer range, log-uniform prior
    'regressor__model__subsample': (1, 8, 'log-uniform'),  # integer range, log-uniform prior

}
#%%
# Since sksurv outputs log hazard ratios (here relative to 0 on predictors)
# we must use 'output_margin=True' for comparability.
estimator = CoxPHSurvivalAnalysis().fit(data_x, data_y)
gbm = xgb.XGBRegressor(objective='survival:cox',
                       booster='gblinear',
                       base_score=1,
                       n_estimators=1000)

search = BayesSearchCV(gbm, params, n_iter=3, cv=3)
search.fit(data_x, data_y_xgb)

#%%
prediction_sksurv = estimator.predict(data_x)
predictions_xgb = search.predict(data_x)
d = pd.DataFrame({'xgb': predictions_xgb,
                  'sksurv': prediction_sksurv})
d.head()

# %%
context.io.save('xente_xgb', gbm)

# %%
Example #15
    def tune_parameter(self,
                       estimator,
                       tp_manner,
                       params,
                       X,
                       y,
                       scoring="neg_log_loss"):
        estimator_name = (self.get_default_params_and_name(estimator))[0]
        print("tune parameters for " + estimator_name)
        if tp_manner == "bayes":
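            # Choose skopt's surrogate model: tree ensembles get a random-forest
            # or gradient-boosted surrogate; everything else falls back to a Gaussian process.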
            if estimator_name in ["rf", "et"]:
                base_estimator = "RF"
            elif estimator_name in [
                    "adaboost", "xgb", "lgb", "gbm", "catboost"
            ]:
                base_estimator = "GBRT"
            else:
                base_estimator = "GP"
            tp = BayesSearchCV(
                estimator=estimator,
                search_spaces=params,
                optimizer_kwargs={"base_estimator": base_estimator},
                scoring=scoring,
                n_iter=60,
                verbose=2,
                n_jobs=-1,
                cv=3,
                refit=True,
                random_state=1234)
        elif tp_manner == "gs":
            tp = GridSearchCV(estimator=estimator,
                              param_grid=params,
                              scoring=scoring,
                              n_jobs=-1,
                              cv=3,
                              refit=True,
                              verbose=2)
        elif tp_manner == "random":
            tp = RandomizedSearchCV(estimator=estimator,
                                    param_distributions=params,
                                    scoring=scoring,
                                    n_jobs=-1,
                                    n_iter=60,
                                    cv=3,
                                    refit=True,
                                    verbose=2,
                                    random_state=1234)
        elif tp_manner == "hpopt":
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.3, random_state=42, shuffle=True)
            space = params

            def objective(space):
                clf = estimator
                clf.set_params(**space)
                clf.fit(X=X_train, y=y_train)
                loss = self.get_loss(clf, X_test, y_test, scoring)
                return loss

            best_param = fmin(fn=objective,
                              space=space,
                              algo=tpe.suggest,
                              max_evals=60)

            str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
            tp = estimator.set_params(**best_param)
            tp.fit(X, y)
            y_hat = tp.predict(X)
            metrics_dict = self.get_metrics(y_hat, y)
            print(estimator_name, best_param)
            print(metrics_dict)
            model_name = estimator_name + str_time + ".pkl"
            print("save metrics to tp_log.csv:", estimator_name)
            with open("tp_log.csv", 'a', newline='') as f:
                writer = csv.writer(f)
                writer.writerow(
                    [estimator_name, model_name, " best params : ", str_time])
                for key, value in metrics_dict.items():
                    writer.writerow([key, value])
                for key, value in best_param.items():
                    writer.writerow([key, value])
            # tp.save_model(model_name)
            joblib.dump(tp, model_name)
            return {estimator_name: tp}
        else:
            #todo
            return
        if estimator_name == "catboost":
            tp.fit(X=X, y=y, cat_features=self.cat_features)
        elif estimator_name == "lgb" and self.cat_features:
            tp.fit(X=X, y=y, categorical_feature=self.cat_features)
        else:
            tp.fit(X, y)
        best_param = tp.best_params_
        best_score = tp.best_score_
        y_hat = tp.predict(X)
        metrics_dict = self.get_metrics(y_hat, y)
        print(estimator_name, best_param)
        print("best score:", best_score)
        print(metrics_dict)
        str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
        model_name = estimator_name + str_time + ".pkl"
        print("save metrics to tp_log.csv:", estimator_name)
        with open("tp_log.csv", 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(
                [estimator_name, model_name, " best params : ", str_time])
            for key, value in metrics_dict.items():
                writer.writerow([key, value])
            for key, value in best_param.items():
                writer.writerow([key, value])
        joblib.dump(tp, model_name)
        return {estimator_name: tp}
Example #16
def demand(df, test_start_month=201901, test_end_month=201924,
           model=RandomForestRegressor(n_estimators=100),
           scale_x=preprocessing.StandardScaler(),
           scale_y=preprocessing.StandardScaler(),
           offset=8, demand_lag=9, tuning=True, scale=True,
           dummy=True, demand_lag_param=True):

    df = df.sort_values(['Part_No', 'Order_Month'])
    df = df.reset_index()
    df = df.drop('index', axis=1)

    if dummy == True:
        df_D = pd.get_dummies(df, columns = qual_preds, drop_first= True) 
    else:
        df_D = df
    df_D = df_D.replace(np.nan,0)
    
    cat = list((set(df_D.columns) - set(df.columns)))
    predictors.extend(cat)
    final_vbls =  list(set(predictors) - set(qual_preds))
    
    
    test_data_period = df[(df['Order_Month'] > test_start_month) & (df['Order_Month'] < test_end_month)].sort_values(['Part_No','Order_Month'])
    
    final = pd.DataFrame()
    pred = pd.DataFrame()
    feature_importances = []
    
    feature_importance_matrix = pd.DataFrame()
    
    if tuning == True:
        model = BayesSearchCV(
                estimator = RandomForestRegressor(
                    n_jobs = 1,
                    criterion='mse',
                ),
                search_spaces = {
                    'min_weight_fraction_leaf': (1e-9, 0.5, 'uniform'),
                    'max_depth': (1, 50),
                    'max_leaf_nodes': (2, 20),
                    'min_impurity_decrease': (0.01, 1.0, 'uniform'),
                    'min_impurity_split': (0.01, 1.0, 'uniform'),
                    'ccp_alpha': (1e-9, 1.0, 'log-uniform'),
                    'n_estimators': (50, 300)
                },
                cv = KFold(
                    n_splits=3,
                    shuffle=True,
                    random_state=42
                ),
                n_iter=20,
                verbose = 1,
                return_train_score = True
            )
        y_train = df[df['Order_Month']<test_start_month][target]
        X_train = df[df['Order_Month']<test_start_month][final_vbls]
        model.fit(X_train, y_train)
        model = model.best_estimator_
        print(model.get_params)
        

    
    for planning_month in test_data_period['Order_Month'].sort_values().unique():
            df_D['Demand_copy'] =df_D['Demand']   
            print('planning_month:',planning_month,'\n')
            if demand_lag_param == True:

                for lag in range(1,demand_lag+1):

                    demand_lag_str = 'demand_lag_' + str(lag)
                    df_D[demand_lag_str] = df_D.groupby(['Part_No'])[target].shift(lag)
                    df_D[demand_lag_str] = df_D[demand_lag_str].fillna(0)


            for month in range(0,offset):
                target_month = planning_month + month
                
                print('target_month:',target_month)
                if target_month>test_end_month:
                    continue
                
                for lag in range(1,demand_lag+1):

                    demand_lag_str = 'demand_lag_' + str(lag)
                    df_D[demand_lag_str] = df_D.groupby(['Part_No'])[target].shift(lag)
                    df_D[demand_lag_str] = df_D[demand_lag_str].fillna(0)
                
                train_data_org = df_D[(df_D['Order_Month']< target_month)]
                train_data = df_D[(df_D['Order_Month'] < target_month)]
                
                test_data = df_D[(df_D['Order_Month'] == target_month)]
                test_data_org = df_D[(df_D['Order_Month'] == target_month)]
                

                
                if scale == True:
                    X_train = pd.DataFrame(scale_x.fit_transform(train_data[final_vbls]))
                    X_train.columns = final_vbls
                    
                    y_train = pd.DataFrame(scale_y.fit_transform(train_data[target]))
                    X_test = pd.DataFrame(scale_x.transform(test_data[final_vbls]))
                    
                    X_test.columns = final_vbls
         
                    
                    
                    model.fit(X_train, y_train)
                    print(model.get_params)
                    
                    y_pred = model.predict(X_test)
                    y_pred = pd.DataFrame(y_pred)
                    y_pred = scale_y.inverse_transform(y_pred)
                    y_pred = pd.Series(pd.DataFrame(y_pred)[0])
                    y_pred = y_pred.round()
                    y_pred[y_pred < 0] = 0
                
                else:
                    y_train = train_data[target]
                    X_train = train_data[final_vbls]
                    X_test  = test_data[final_vbls]
                    model.fit(X_train, y_train)
                    print(model.get_params)
                    
                    y_pred = model.predict(X_test)
                    y_pred = pd.Series(y_pred)
                    y_pred = y_pred.round()
                    y_pred[y_pred < 0] = 0
                
                

                                
                if X_test.shape[0] == 0:
                    continue

                print(X_test.columns)

                feature_importances.append(model.feature_importances_)

                df_D.loc[test_data.index, target[0]] = y_pred.values

                pred = pred.append(pd.DataFrame({
                    "Part_No": test_data_org['Part_No'].values,
                    'Actual': test_data_org['Demand'].values,
                    "Offset": [month] * test_data_org.shape[0],
                    "planning_month": [planning_month] * test_data_org.shape[0],
                    "target_month": [target_month] * test_data_org.shape[0],
                    'Fcst': y_pred
                }), ignore_index=True)

    y_true, y_pred = np.array(pred['Actual']), np.array(pred['Fcst'])
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    accuracy = 100 - mape  # mape is expressed in percent, so accuracy is its complement out of 100


    feature_importances = [sum(x)/len(feature_importances) for x in zip(*feature_importances)]
    feature_importance_matrix = pd.DataFrame({'Columns':X_train.columns, 'f_imp' : feature_importances})

    return pred,df_D,feature_importance_matrix,mape,accuracy     
Example #17
def executeML(X,
              y,
              X_test,
              y_test,
              n_jobs,
              feature_labels,
              class_labels,
              pipe,
              parameters,
              ml_type,
              bayesOpt=False,
              search_space=None,
              n_iter=32,
              acq_func=""):
    if bayesOpt:
        # Example search space: { 'C': Real(1e-6, 1e+6, prior='log-uniform'), 'gamma': Real(1e-6, 1e+1, prior='log-uniform'), 'degree': Integer(1,8), 'kernel': Categorical(['linear', 'poly', 'rbf']), }
        # Fall back to 'gp_hedge' when no recognised acquisition function is given.
        acq_funcs = ("LCB", "EI", "PI", "gp_hedge")
        chosen_acq = acq_func if acq_func in acq_funcs else "gp_hedge"
        optimizer_kwargs = {'acq_func': chosen_acq}
        cv = BayesSearchCV(pipe,
                           search_space,
                           verbose=verbose,
                           n_iter=n_iter,
                           n_jobs=n_jobs,
                           optimizer_kwargs=optimizer_kwargs,
                           scoring=kappa_scorer)
    else:
        cv = GridSearchCV(pipe,
                          parameters,
                          verbose=verbose,
                          n_jobs=n_jobs,
                          scoring=kappa_scorer)
    tick1 = datetime.datetime.now()
    cv.fit(X, y)
    tick2 = datetime.datetime.now()
    print("\n{0} fitting time: {1}".format(ml_type, tick2 - tick1))
    print("{0} best params {1}".format(ml_type, cv.best_params_))
    print("{0} best score {1}".format(ml_type, cv.best_score_))
    tick3 = datetime.datetime.now()
    y_predict = cv.predict(X_test)
    tick4 = datetime.datetime.now()
    print("{0} test set accuracy: {1}".format(
        ml_type, accuracy_score(y_test, y_predict)))
    print("{0} test set cohen kappa: {1}".format(
        ml_type, cohen_kappa_score(y_test, y_predict)))
    print("{0} prediction time: {1}".format(ml_type, tick4 - tick3))
    print(
        classification_report(y_test,
                              y_predict,
                              target_names=class_labels,
                              digits=8))
    cnf_matrix = confusion_matrix(y_test, y_predict)
    print("")
    print(cnf_matrix)
    pickle.dump(
        cnf_matrix,
        open(os.path.join(SAVE_DIR, "cnf_matrix.best." + ml_type + ".pck"),
             'wb'))
    pickle.dump(
        cv.best_estimator_,
        open(os.path.join(SAVE_DIR, "classifier.best." + ml_type + ".pck"),
             'wb'))
    return cv.best_estimator_
Example #18
        random_state=42
    ),
    n_jobs = 1,
    n_iter = 100,   
    verbose = 0,
    refit = True,
    random_state = 42
)


# Fit the model
result = bayes_cv_tuner.fit(data[features], target, callback=status_print)
with open('best_params.txt', 'w+') as fo:
    fo.write(str(bayes_cv_tuner.best_params_))
    
pred = bayes_cv_tuner.predict(data[features])
data['pred'] = pred
data['pred'].to_csv('../result/train_pre.csv')
data['combine'] = data.loc[data['leak_tsne'].isnan()
    
nrows = None
test = pd.read_csv('../input/test.csv', nrows = nrows)
#test = add_leak(test, 'leak', '../input/test_leak.csv')
#test = add_leak(test, 'leak6', '../input/test_leak_new6.csv')
#test = add_leak(test, 'leak16', '../input/test_leak_new16.csv')
#test = add_leak(test, 'leak22', '../input/test_leak_new22.csv')
#test = add_leak(test, 'leak_tsne', '../input/test_leak_tsne.csv')
test = add_leak(test, 'leak_tsne', '../input/test_leak_tsne.csv')
#test = add_leak(test, 'leak_tsne11', '../input/test_leak_tsne_11.3.csv')
#data, cols = add_bulk_leak(data, '../input/bunk_leak_test.csv')
Example #19
def run_shallow(data_dir: str, results_dir: str, splits: List[str],
                metric: str, n_iter: int, n_points: int, n_folds: int,
                n_jobs: int) -> None:
    """Evaluate shallow baselines on the scruples resource.

    Train shallow baseline models on the scruples resource, reading
    the dataset from DATA_DIR, and writing trained models, logs, and
    other results to RESULTS_DIR. Performance is reported for each split
    provided as an argument.
    """
    # Step 1: Manage and construct paths.

    logger.info('Creating the results directory.')

    os.makedirs(results_dir)
    model_paths = {}
    metrics_paths = collections.defaultdict(dict)
    predictions_paths = collections.defaultdict(dict)
    for baseline in baselines.resource.SHALLOW_BASELINES.keys():
        os.makedirs(os.path.join(results_dir, baseline))
        model_paths[baseline] = os.path.join(results_dir, baseline,
                                             'model.pkl')
        for split in splits:
            os.makedirs(os.path.join(results_dir, baseline, split))
            metrics_paths[baseline][split] = os.path.join(
                results_dir, baseline, split, 'metrics.json')
            predictions_paths[baseline][split] = os.path.join(
                results_dir, baseline, split, 'predictions.jsonl')

    # Step 2: Load the data.

    logger.info(f'Loading the data from {data_dir}.')

    dataset = ScruplesResource(data_dir=data_dir)

    # Step 3: Run the baselines.

    logger.info('Running the baselines.')

    for baseline, (Model, hyper_parameter_space) in tqdm.tqdm(
            baselines.resource.SHALLOW_BASELINES.items(),
            **settings.TQDM_KWARGS):
        # tune the hyper-parameters and train the model
        ids, features, labels, label_scores = dataset.train
        if hyper_parameter_space:
            model = BayesSearchCV(
                Model,
                hyper_parameter_space,
                scoring=make_scorer(score_func=METRICS[metric][1],
                                    **METRICS[metric][2]),
                n_iter=n_iter,
                n_points=n_points,
                cv=n_folds,
                n_jobs=os.cpu_count() if n_jobs == 0 else n_jobs,
                refit=True)
        else:
            model = Model
        model.fit(features, labels)

        # Step 4: Save the model.

        with open(model_paths[baseline], 'wb') as model_file:
            dill.dump(model, model_file)

        # Step 5: Run evaluation on the splits.

        for split in splits:
            ids, features, labels, label_scores = getattr(dataset, split)

            predictions = model.predict(features)
            probabilities = model.predict_proba(features)

            with open(metrics_paths[baseline][split], 'w') as metrics_file:
                json.dump(
                    {
                        key: metric(
                            y_true=labels,
                            y_pred=probabilities
                            if scorer_kwargs['needs_proba'] else predictions)
                        for key, (_, metric, scorer_kwargs) in METRICS.items()
                    }, metrics_file)

            with open(predictions_paths[baseline][split], 'w')\
                 as predictions_file:
                for id_, probs, prediction in zip(ids, probabilities,
                                                  predictions):
                    predictions_file.write(
                        json.dumps({
                            'id': id_,
                            'label': prediction.tolist(),
                            'label_scores': probs.tolist()
                        }) + '\n')
Example #20
                      cv=10,
                      n_jobs=N_JOBS, 
                      verbose=0, 
                      error_score=-9999, 
                      scoring=spearman_scorer, 
                      random_state=RANDOM_STATE,
                      return_train_score=True, 
                      n_iter=3)

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    search.fit(train, y_train) 

test = X_test[features]

predicted = search.predict(test)
model_test_score = spearman_scorer(search, test, y_test)

estimator = search.best_estimator_.named_steps['estimator']
imputer = search.best_estimator_.named_steps['imputer']


def multiproc_iter_func(max_workers, an_iter, func, item_kwarg, **kwargs):
    """
    A helper function that applies a function to each item in an iterable using
    multiple processes. 'item_kwarg' is the keyword argument for the item in the
    iterable that we pass to the function.
    """
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        future_results = [executor.submit(func, **{item_kwarg: item}, **kwargs)
                          for item in an_iter]