import warnings

import numpy as np
import pandas as pd
from skopt import BayesSearchCV
from sklearn.preprocessing import Normalizer


class SklearnGeneralModel(ModelBase):
    def __init__(self, is_normalize, model, searchCV=False):
        self.is_normalize = is_normalize
        self.model = model
        self.searchCV = searchCV

    def build_model(self, config_args=None):
        if config_args is None:
            config_args = {}
        if not self.searchCV:
            self.model = self.model(**config_args)
        else:
            self.model = BayesSearchCV(estimator=self.model(), **config_args)

    def train(self, x, y):
        if self.is_normalize:
            self.scaler = Normalizer()
            x = self.scaler.fit_transform(x)
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', UserWarning)
            self.model.fit(x, y)

    def predict(self, x):
        if self.is_normalize:
            x = self.scaler.transform(x)
        return self.model.predict(x)

    def feature_based_metrics(self, columns=None, index=None):
        # only meaningful when searchCV=True: best_estimator_ exists only on
        # a fitted BayesSearchCV
        feature_importance = self.model.best_estimator_.feature_importances_
        feature_importance = feature_importance / np.sum(feature_importance)
        return pd.DataFrame(feature_importance, index=columns, columns=index).T
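# Usage sketch for the wrapper above: tuning a RandomForestClassifier through
# the searchCV path. The estimator choice, search space, and toy data are
# illustrative assumptions, not from the source.
from sklearn.ensemble import RandomForestClassifier
from skopt.space import Integer

wrapper = SklearnGeneralModel(is_normalize=True,
                              model=RandomForestClassifier,
                              searchCV=True)
wrapper.build_model(config_args={
    'search_spaces': {'n_estimators': Integer(10, 200)},
    'n_iter': 8,
    'cv': 3,
})
X = np.random.rand(60, 4)
y = np.random.randint(0, 2, size=60)
wrapper.train(X, y)
print(wrapper.predict(X[:5]))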
def test_it_solves_the_easy_dataset_when_tuned(self):
    baseline = BayesSearchCV(
        style.StyleRankerBaseline,
        style.STYLE_RANKER_HYPER_PARAMETERS,
        n_iter=16,
        n_points=2,
        cv=4,
        n_jobs=1)
    baseline.fit(
        self.train_easy[['action0', 'action1']],
        self.train_easy['label'])
    predictions = baseline.predict(self.dev_easy[['action0', 'action1']])

    # check that the accuracy is 100%
    self.assertEqual(
        metrics.accuracy_score(
            y_true=self.dev_easy['label'],
            y_pred=predictions),
        1.)
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from skopt import BayesSearchCV


def nested_cv(
    estimator,
    search_spaces,
    X,
    y,
    scoring="neg_mean_squared_error",
    inner_cv=5,
    outer_cv=10,
    random_state=42,
):
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.1,
        random_state=random_state,
    )
    opt = BayesSearchCV(
        estimator=estimator,
        search_spaces=search_spaces,
        scoring=scoring,  # was hard-coded to "neg_mean_squared_error"
        n_iter=25,
        cv=inner_cv,
        verbose=0,
        n_jobs=4,
        random_state=random_state,
    )
    opt.fit(X_train, y_train)
    print("Best params:\n%s" % opt.best_params_)
    # the RMSE conversions below assume a negated-MSE scoring
    inner_cv_rmse = np.round(np.sqrt(opt.best_score_ * -1), 2)
    print(f"Inner loop RMSE: {inner_cv_rmse}")
    nested_score = cross_val_score(
        opt,
        X_train,
        y_train,
        cv=outer_cv,
        scoring=scoring,
        n_jobs=4,
    )
    outer_cv_rmse = np.round(np.sqrt(nested_score.mean() * -1), 2)
    print(f"Outer loop RMSE: {outer_cv_rmse}")

    y_pred = opt.predict(X_test)
    rmse = np.round(mean_squared_error(y_test, y_pred, squared=False), 2)
    print(f"Validation RMSE: {rmse}")
    return opt
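# Usage sketch for nested_cv: a Ridge regressor on the diabetes toy dataset.
# The estimator and its search space are illustrative assumptions.
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from skopt.space import Real

X, y = load_diabetes(return_X_y=True)
opt = nested_cv(
    estimator=Ridge(),
    search_spaces={'alpha': Real(1e-3, 1e+2, prior='log-uniform')},
    X=X,
    y=y,
)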
from skopt import BayesSearchCV
from skopt.space import Categorical, Real
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC


def get_best_SVM_params(X_train, y_train, X_test, y_test):
    search_spaces = {
        "kernel": Categorical(["linear", "poly", "rbf", "sigmoid"]),
        "C": Real(1e-1, 1e+1, "uniform"),
        "gamma": Real(1e-4, 1e+4, "log-uniform"),
    }
    best_accuracy = 0
    best_model = None
    # re-run the search several times and keep the run that scores best on
    # the held-out test set
    for i in range(5):
        grid = BayesSearchCV(SVC(), search_spaces, n_iter=10, cv=3, n_jobs=-1)
        grid.fit(X_train, y_train)
        accuracy = accuracy_score(y_test, grid.predict(X_test))
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = grid
    return best_model.best_params_
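# Illustrative follow-up: refit an SVC with the tuned parameters. The iris
# data and train/test split are assumptions for the demo.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
best_params = get_best_SVM_params(X_tr, y_tr, X_te, y_te)
final_model = SVC(**best_params).fit(X_tr, y_tr)
print(final_model.score(X_te, y_te))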
class Regressor(BaseEstimator):
    def __init__(self, regressor, params):
        self.model = BayesSearchCV(
            estimator=regressor,
            search_spaces=params,
            # 'mean_squared_error' is not a valid sklearn scorer name
            scoring='neg_mean_squared_error',
            cv=TimeSeriesSplit(n_splits=3),
            n_jobs=3,
            n_iter=10,
            verbose=3000,
            refit=True,
            random_state=42)

    def fit(self, X, y):
        self.model.fit(X, y)
        filename = '/home/mejri/Desktop/TELECOM_PARISTECH_MASTER_X_DATASCIENCE/MACHINE_LEARNING_BUSINESS_CASE/rossmann-store-sales/finalized_model.sav'
        with open(filename, 'wb') as f:
            pickle.dump(self.model, f)

    def predict(self, X):
        return self.model.predict(X)
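# Usage sketch, assuming an XGBRegressor base estimator; the search space is
# an illustrative assumption. Note that fit() pickles the tuned search to the
# hard-coded path above, so it only succeeds where that directory exists.
from xgboost import XGBRegressor

reg = Regressor(
    regressor=XGBRegressor(objective='reg:squarederror'),
    params={'max_depth': (2, 8), 'learning_rate': (1e-3, 0.3, 'log-uniform')},
)
# reg.fit(X, y)   # rows must be in time order for TimeSeriesSplit to be valid
# y_hat = reg.predict(X)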
def test_it_solves_scruples_easy_when_tuned(self):
    baseline = BayesSearchCV(
        self.BASELINE_MODEL,
        self.BASELINE_HYPER_PARAMETERS,
        n_iter=16,
        n_points=2,
        cv=4,
        n_jobs=1,
        refit=True)

    # train the model, tuning hyper-parameters
    _, train_features, train_labels, train_label_scores =\
        self.dataset.train
    baseline.fit(train_features, train_labels)

    # predict with the model on dev
    _, dev_features, dev_labels, dev_label_scores =\
        self.dataset.dev
    predictions = baseline.predict(dev_features)

    # check that the accuracy is 100%
    self.assertEqual(
        metrics.accuracy_score(y_true=dev_labels, y_pred=predictions),
        1.)
from skopt import BayesSearchCV
from skopt.space import Categorical, Real
from sklearn.metrics import accuracy_score


def get_best_ensemble_params(X_train, y_train, X_test, y_test):
    # SVMEnsemble is a project-specific estimator assumed to be defined
    # elsewhere; the search space mixes bagging and SVM kernel parameters
    search_spaces = {
        "max_samples": Real(0.5, 1, "uniform"),
        "max_features": Real(0.5, 1, "uniform"),
        "kernel": Categorical(["linear", "poly", "rbf", "sigmoid"]),
        "C": Real(1e-1, 1e+1, "uniform"),
    }
    best_accuracy = 0
    best_model = None
    # re-run the search several times and keep the run that scores best on
    # the held-out test set
    for i in range(5):
        grid = BayesSearchCV(SVMEnsemble(), search_spaces,
                             n_iter=10, cv=3, n_jobs=-1)
        grid.fit(X_train, y_train)
        accuracy = accuracy_score(y_test, grid.predict(X_test))
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = grid
    return best_model.best_params_
        'min_child_weight': (1, 10),
        'subsample': (0.5, 1.0, 'log-uniform'),
        'colsample_bytree': (0.5, 1.0, 'log-uniform'),
        'n_estimators': (100, 1000)
    },
    n_iter=32,
    random_state=42,
    cv=3
)
xgb_opt.fit(X_train, Y_train)
xgb_opt.score(X_train, Y_train)

# Accuracy of the model on the validation set
y_pred = xgb_opt.predict(X_test)
from sklearn import metrics
print("Accuracy:", metrics.accuracy_score(Y_test, y_pred))

# Load the testSpike data
X_test = mat['testSpike']
X_test.shape

# Pre-processing of the testSpike data: sliding-window means over the spike
# columns. Note: prev_data is not defined in this excerpt; it must be set
# before this loop (presumably the window size used for the training data).
l = 2
X = numpy.array([])
X = numpy.mean(X_test[:, 0:1], axis=1)[numpy.newaxis].T
for i in range(2, len(X_test[0]) + 1):
    if l != prev_data:
        # print(l, i)
        a = numpy.mean(X_test[:, i - l:i], axis=1)[numpy.newaxis].T
def main():
    mlflow.start_run(run_name=NAME)
    if "X_train.pkl" not in os.listdir():
        print("processing the data")
        X, y, encoder = preprocess_data("TOTAL_TRAIN.csv", process_cat=False)
        print(X.shape)
        with open(f"label_encoder_{NAME}.pkl", "wb") as f:
            pickle.dump(encoder, f)
        print(
            f"##################### The shape of X is {X.shape} #######################"
        )
        y = y.astype("int")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.15, random_state=15, stratify=y)
        with open("X_train.pkl", "wb") as f:
            pickle.dump(X_train, f)
        with open("X_test.pkl", "wb") as f:
            pickle.dump(X_test, f)
        with open("y_train.pkl", "wb") as f:
            pickle.dump(y_train, f)
        with open("y_test.pkl", "wb") as f:
            pickle.dump(y_test, f)
        print(X_train.shape)
    else:
        with open("X_train.pkl", "rb") as f:
            X_train = pickle.load(f)
        with open("X_test.pkl", "rb") as f:
            X_test = pickle.load(f)
        with open("y_train.pkl", "rb") as f:
            y_train = pickle.load(f)
        with open("y_test.pkl", "rb") as f:
            y_test = pickle.load(f)
        with open("label_encoder_XGB1704.pkl", "rb") as f:
            encoder = pickle.load(f)

    print("######### fitting cat encoder ############")
    cols_cat = ["ruido", "CODIGO_POSTAL", "ZONA_METROPOLITANA", "CALIDAD_AIRE"]
    cols_float = [col for col in X_train.columns if col not in cols_cat]
    X_train[cols_float] = X_train[cols_float].astype("float")
    X_test[cols_float] = X_test[cols_float].astype("float")
    labs_names = [c for c in encoder.classes_]

    model = LGBMClassifier(
        class_weight="balanced",
        objective="multiclass",  # "multiclass:softmax" is XGBoost syntax, not LightGBM's
        n_jobs=-1,
        random_state=100,
        silent=True,
    )

    if MODE != "INDIVIDUAL":
        params = {
            "reg_alpha": (1e-3, 5.0, "log-uniform"),
            "reg_lambda": (1e-2, 50.0, "log-uniform"),
            "n_estimators": (600, 4500),
            "learning_rate": (5e-3, 1.0, "log-uniform"),
            "num_leaves": (20, 80),
            "boosting_type": ["gbdt", "goss"],
            "colsample_bytree": (0.1, 1.0, "uniform"),
            "subsample": (0.1, 1.0, "uniform"),
            "min_child_samples": (1, 25),
            "min_child_weight": (1e-6, 0.1, "log-uniform"),
        }
        print(params)
        cb = CatBoostEncoder(cols=cols_cat)
        X_train = cb.fit_transform(X_train, y_train)
        X_test = cb.transform(X_test)
        fit_params = {
            "eval_set": [(X_test, y_test)],
            "eval_metric": lgb_f1_score,
            "early_stopping_rounds": 300,
        }
        # note: this pipeline is built but unused; the search below tunes
        # `model` on the already-encoded matrices
        pipeline = Pipeline(steps=[("clas_encoder", CatBoostEncoder(cols=cols_cat)),
                                   ("model", model)])
        best_model = BayesSearchCV(
            model,
            params,
            n_iter=N_ITER,
            n_points=1,
            cv=cv,
            scoring=f2_scorer,
            random_state=100,
            optimizer_kwargs={"n_initial_points": 10},
            fit_params=fit_params,
        )

        def on_step(optim_result):
            score = best_model.best_score_
            results = best_model.cv_results_
            try:
                results_df = pd.DataFrame(results)
                results_df.to_csv(f"results_{NAME}.csv", header=True, index=False)
                print(
                    f"############ {results_df.shape[0]} trials so far #################"
                )
                print(f"cv results so far: {results_df}")
            except Exception:
                print("Unable to convert cv results to pandas dataframe")
            mlflow.log_metric("best_score", score)
            with open(f"./best_{NAME}_params.pkl", "wb") as f:
                pickle.dump(best_model.best_params_, f)
            print("best score: %s" % score)
            if score >= 0.98:
                print("Interrupting!")
                return True

    print("fitting the model")
    if MODE != "INDIVIDUAL":
        print(X_train.dtypes)
        best_model.fit(X_train, y_train, callback=[on_step])
        with open(f"./best_{NAME}_model.pkl", "wb") as f:
            pickle.dump(best_model, f)
        preds = best_model.predict(X_test)
    else:
        if NAME not in os.listdir():
            os.mkdir(NAME)
        cat_encoder = CatBoostEncoder(cols=cols_cat)
        X_train = cat_encoder.fit_transform(X_train, y_train)
        X_test = cat_encoder.transform(X_test)
        best_model = BalancedBaggingClassifier(
            base_estimator=HistGradientBoostingClassifier(
                max_iter=3000,
                random_state=42,
                learning_rate=0.1,
                max_leaf_nodes=54,
                min_samples_leaf=2,
                scoring=f2_scorer,
                validation_fraction=0.1,
                n_iter_no_change=50,
            ),
            n_estimators=5,
            random_state=42,
            n_jobs=-1,
            max_features=0.7,
            sampling_strategy={5: int(dict(Counter(y_train))[5] * 0.11)},
        )
        best_model.fit(X_train, y_train)
        preds = best_model.predict(X_test)
        print(
            f'F1 SCORE IS {f1_score(y_test, preds, average="macro")}, '
            f'precision is {precision_score(y_test, preds, average="macro")}, '
            f'recall is {recall_score(y_test, preds, average="macro")}, '
            f'accuracy is {accuracy_score(y_test, preds)}'
        )
        print(
            f"F2 SCORE IS {fbeta_score(y_test, preds, average='macro', beta=2)}"
        )
        print(
            f"F05 SCORE IS {fbeta_score(y_test, preds, average='macro', beta=0.5)}"  # was beta=2
        )
        cm = confusion_matrix(y_test, preds)
        grafico_conf_matrix = print_confusion_matrix(cm, class_names=labs_names)
        grafico_conf_matrix.savefig(f"{NAME}/norm_NO_PIPELINE")
        with open(f"best_model_{NAME}.pkl", "wb") as f:
            pickle.dump(best_model, f)

    print("logging metrics")
    mlflow.log_metrics(
        metrics={
            "f1": f1_score(y_test, preds, average="macro"),
            "precision": precision_score(y_test, preds, average="macro"),
            "recall": recall_score(y_test, preds, average="macro"),
            "accuracy": accuracy_score(y_test, preds),
            "f05": fbeta_score(y_test, preds, beta=0.5, average="macro"),
            "f2": fbeta_score(y_test, preds, beta=2, average="macro"),
        })
    if MODE != "INDIVIDUAL":
        best_params = best_model.best_params_
        for param in best_params.keys():
            mlflow.log_param(param, best_params[param])
    cm = confusion_matrix(y_test, preds)
    grafico_conf_matrix = print_confusion_matrix(cm, class_names=labs_names)
    grafico_conf_matrix.savefig(NAME)
    grafico_norm = print_confusion_matrix(cm, class_names=labs_names,
                                          normalize=False)
    grafico_norm.savefig(f"{NAME}_no_norm")
    mlflow.end_run()
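# Standalone sketch of the callback-based early stopping used in on_step
# above: skopt calls each callback after every iteration and stops the search
# when one returns True. The toy data, estimator, and threshold are
# assumptions for the demo.
from skopt import BayesSearchCV
from skopt.space import Integer
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = make_classification(n_samples=150, random_state=0)
stopper_search = BayesSearchCV(
    DecisionTreeClassifier(random_state=0),
    {'max_depth': Integer(1, 12)},
    n_iter=20,
    cv=3,
    random_state=0,
)

def stop_when_good_enough(optim_result):
    # interrupt the search once the best CV score crosses a threshold
    return stopper_search.best_score_ >= 0.95

stopper_search.fit(X_demo, y_demo, callback=[stop_when_good_enough])
print(stopper_search.best_score_, stopper_search.best_params_)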
    X = np.asarray(train['rep'].values.tolist())
    y = train['target'].values.astype('float64')
    model.fit(X, y)
    joblib.dump(model, f'./models/auc__{d}__{rep}__{fam_name}__model.pkl')
else:
    model = joblib.load(f'./models/auc__{d}__{rep}__{fam_name}__model.pkl')

predictions = model.predict(
    np.asarray(validate['rep'].values.tolist()),
)
r_results[fam_name] = {
    'roc_score': roc_auc_score(validate['target'], predictions),
    'roc50_score': get_roc(validate['target'].values, predictions, 50),
}
print('roc_score', r_results[fam_name]['roc_score'])
print('roc50_score', r_results[fam_name]['roc50_score'])
if to_train:
    if model_name != 'NaiveBayes':
        r_params[fam_name] = model.best_params_
        print(model.best_params_)
# except Exception as e:
cb_param_grid = {
    'depth': Integer(1, 8),
    'learning_rate': Real(0.01, 1.0, 'log-uniform'),
    'random_strength': Real(1e-9, 10, 'log-uniform'),
    'bagging_temperature': Real(0.0, 1.0),
    'border_count': Integer(1, 255),
    'l2_leaf_reg': Integer(2, 30),
    'scale_pos_weight': Real(0.01, 1.0, 'uniform')}

cb_bs = BayesSearchCV(cb, cb_param_grid,
                      scoring='roc_auc',
                      n_iter=100,
                      n_jobs=1,
                      return_train_score=False,
                      refit=True,
                      optimizer_kwargs={'base_estimator': 'GP'},
                      random_state=123)
cb_bs.fit(x_train, y_train)

y_probs = cb_bs.predict_proba(x_test)
y_probs = y_probs[:, 1]
y_pred = cb_bs.predict(x_test)
print(classification_report(y_test, y_pred))
print(roc_auc_score(y_test, y_probs))  # ~0.903

fpr, tpr, thresholds = roc_curve(y_test, y_probs)
plot_roc_curve(fpr, tpr)

# Find the best parameters
cb_bs.best_params_

# Use the parameters to re-run the model
cb_tuned = CatBoostClassifier(iterations=1000,
                              depth=8,
                              learning_rate=0.11574,
                              random_strength=1e-9,
                              bagging_temperature=1.0,
                              border_count=178,
                              l2_leaf_reg=2,
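# Illustrative alternative to retyping the tuned values above: CatBoost can be
# re-instantiated straight from the search results (cb_bs is the fitted
# BayesSearchCV from this snippet; keeping iterations=1000 is an assumption).
cb_tuned_auto = CatBoostClassifier(iterations=1000, **cb_bs.best_params_)
cb_tuned_auto.fit(x_train, y_train)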
( "model", LGBMClassifier(n_jobs=-1, boosting_type="gbdt").set_params( **{ k.replace("final_estimator__model__", ""): v for k, v in params.items() }), ), ]), verbose=1, n_jobs=-1, cv=3, ) best_model = model.fit(X_train, y_train) preds = best_model.predict(X_test) print("loggeando movidas") mlflow.log_metrics( metrics={ "f1": f1_score(y_test, preds, average="macro"), "precision": precision_score(y_test, preds, average="macro"), "recall": recall_score(y_test, preds, average="macro"), "accuracy": accuracy_score(y_test, preds), "f05": fbeta_score(y_test, preds, beta=0.5, average="macro"), "f2": fbeta_score(y_test, preds, beta=2, average="macro"), }) best_params = params for param in best_params.keys(): mlflow.log_param(param, best_params[param]) cm = confusion_matrix(y_test, preds)
    'min_child_weight': (0, 5),
    'n_estimators': (50, 100),
    'scale_pos_weight': (1e-6, 500, 'log-uniform')
}
opt = BayesSearchCV(model, params, n_iter=5, cv=schema, refit=True,
                    scoring='f1')

# %%
X_train, y_train = (train.drop(columns=['Prediction']).astype(np.float32),
                    train.Prediction.astype(int))  # np.int is removed in NumPy >= 1.24
opt.fit(X_train, y_train)
context.io.save('xente_xgb', opt)

# %%
X_test, y_test = (test.drop(columns=['Prediction']).astype(np.float32),
                  test.Prediction.astype(int))
y_pred = opt.predict(X_test)

# %%
xente_sample_submission = context.io.load('xente_sample_submission').assign(
    Prediction=y_pred.astype(int))
context.io.save('xente_y_submission', xente_sample_submission)

# %%
    'regressor__model__min_child_weight': (10, 500, 'log-uniform'),
    'regressor__model__n_estimators': (1, 8),  # integer-valued parameter
    'regressor__model__reg_alpha': (1, 8, 'log-uniform'),
    'regressor__model__reg_lambda': (1, 8, 'log-uniform'),
    'regressor__model__subsample': (1, 8, 'log-uniform'),
}

#%%
# Since sksurv outputs log hazard ratios (here relative to 0 on predictors)
# we must use 'output_margin=True' for comparability.
estimator = CoxPHSurvivalAnalysis().fit(data_x, data_y)
gbm = xgb.XGBRegressor(objective='survival:cox',
                       booster='gblinear',
                       base_score=1,
                       n_estimators=1000)
search = BayesSearchCV(gbm, params, n_iter=3, cv=3)
search.fit(data_x, data_y_xgb)

#%%
prediction_sksurv = estimator.predict(data_x)
predictions_xgb = search.predict(data_x)
d = pd.DataFrame({'xgb': predictions_xgb, 'sksurv': prediction_sksurv})
d.head()

# %%
context.io.save('xente_xgb', gbm)  # note: saves the base gbm, not the fitted search

# %%
def tune_parameter(self, estimator, tp_manner, params, X, y,
                   scoring="neg_log_loss"):
    estimator_name = (self.get_default_params_and_name(estimator))[0]
    print("tune parameters for " + estimator_name)
    if tp_manner == "bayes":
        # pick a surrogate model suited to the estimator family
        if estimator_name in ["rf", "et"]:
            base_estimator = "RF"
        elif estimator_name in ["adaboost", "xgb", "lgb", "gbm", "catboost"]:
            base_estimator = "GBRT"
        else:
            base_estimator = "GP"
        tp = BayesSearchCV(
            estimator=estimator,
            search_spaces=params,
            optimizer_kwargs={"base_estimator": base_estimator},
            scoring=scoring,
            n_iter=60,
            verbose=2,
            n_jobs=-1,
            cv=3,
            refit=True,
            random_state=1234)
    elif tp_manner == "gs":
        tp = GridSearchCV(estimator=estimator,
                          param_grid=params,
                          scoring=scoring,
                          n_jobs=-1,
                          cv=3,
                          refit=True,
                          verbose=2)
    elif tp_manner == "random":
        tp = RandomizedSearchCV(estimator=estimator,
                                param_distributions=params,
                                scoring=scoring,
                                n_jobs=-1,
                                n_iter=60,
                                cv=3,
                                refit=True,
                                verbose=2,
                                random_state=1234)
    elif tp_manner == "hpopt":
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42, shuffle=True)
        space = params

        def objective(space):
            clf = estimator
            clf.set_params(**space)
            clf.fit(X=X_train, y=y_train)
            return self.get_loss(clf, X_test, y_test, scoring)

        best_param = fmin(fn=objective, space=space, algo=tpe.suggest,
                          max_evals=60)
        str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
        tp = estimator.set_params(**best_param)
        tp.fit(X, y)
        y_hat = tp.predict(X)
        metrics_dict = self.get_metrics(y_hat, y)
        print(estimator_name, best_param)
        print(metrics_dict)
        model_name = estimator_name + str_time + ".pkl"
        print("save metrics to tp_log.csv:", estimator_name)
        with open("tp_log.csv", 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(
                [estimator_name, model_name, " best params : ", str_time])
            for key, value in metrics_dict.items():
                writer.writerow([key, value])
            for key, value in best_param.items():
                writer.writerow([key, value])
        # tp.save_model(model_name)
        joblib.dump(tp, model_name)
        return {estimator_name: tp}
    else:
        # todo
        return

    if estimator_name == "catboost":
        tp.fit(X=X, y=y, cat_features=self.cat_features)
    elif estimator_name == "lgb" and self.cat_features:
        tp.fit(X=X, y=y, categorical_feature=self.cat_features)
    else:
        tp.fit(X, y)
    best_param = tp.best_params_
    best_score = tp.best_score_
    y_hat = tp.predict(X)
    metrics_dict = self.get_metrics(y_hat, y)
    print(estimator_name, best_param)
    print("best score:", best_score)
    print(metrics_dict)
    str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
    model_name = estimator_name + str_time + ".pkl"
    print("save metrics to tp_log.csv:", estimator_name)
    with open("tp_log.csv", 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(
            [estimator_name, model_name, " best params : ", str_time])
        for key, value in metrics_dict.items():
            writer.writerow([key, value])
        for key, value in best_param.items():
            writer.writerow([key, value])
    joblib.dump(tp, model_name)
    return {estimator_name: tp}
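# Standalone illustration of the surrogate-model choice made above:
# BayesSearchCV accepts "RF", "GBRT", or "GP" through optimizer_kwargs.
# The toy data and search space are assumptions for the demo.
from skopt import BayesSearchCV
from skopt.space import Integer
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X_demo, y_demo = make_classification(n_samples=120, random_state=0)
surrogate_search = BayesSearchCV(
    RandomForestClassifier(random_state=0),
    {'n_estimators': Integer(10, 100)},
    optimizer_kwargs={'base_estimator': 'RF'},  # tree surrogate for a tree model
    n_iter=8,
    cv=3,
    random_state=0,
)
surrogate_search.fit(X_demo, y_demo)
print(surrogate_search.best_params_)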
def demand(df, test_start_month=201901, test_end_month=201924,
           model=RandomForestRegressor(n_estimators=100),
           scale_x=preprocessing.StandardScaler(),
           scale_y=preprocessing.StandardScaler(),
           offset=8, demand_lag=9, tuning=True, scale=True, dummy=True,
           demand_lag_param=True):
    df = df.sort_values(['Part_No', 'Order_Month'])
    df = df.reset_index(drop=True)
    if dummy:
        df_D = pd.get_dummies(df, columns=qual_preds, drop_first=True)
    else:
        df_D = df
    df_D = df_D.replace(np.nan, 0)
    cat = list(set(df_D.columns) - set(df.columns))
    predictors.extend(cat)
    final_vbls = list(set(predictors) - set(qual_preds))
    test_data_period = df[(df['Order_Month'] > test_start_month) &
                          (df['Order_Month'] < test_end_month)
                          ].sort_values(['Part_No', 'Order_Month'])
    final = pd.DataFrame()
    pred = pd.DataFrame()
    feature_importances = []
    feature_importance_matrix = pd.DataFrame()

    if tuning:
        model = BayesSearchCV(
            estimator=RandomForestRegressor(
                n_jobs=1,
                criterion='mse',  # renamed 'squared_error' in sklearn >= 1.2
            ),
            search_spaces={
                'min_weight_fraction_leaf': (1e-9, 0.5, 'uniform'),
                'max_depth': (1, 50),
                'max_leaf_nodes': (2, 20),
                # duplicate 'min_impurity_decrease' key removed; note that
                # 'min_impurity_split' was dropped from sklearn >= 1.0
                'min_impurity_decrease': (0.01, 1.0, 'uniform'),
                'min_impurity_split': (0.01, 1.0, 'uniform'),
                'ccp_alpha': (1e-9, 1.0, 'log-uniform'),
                'n_estimators': (50, 300),
            },
            cv=KFold(n_splits=3, shuffle=True, random_state=42),
            n_iter=20,
            verbose=1,
            return_train_score=True,
        )
        y_train = df[df['Order_Month'] < test_start_month][target]
        X_train = df[df['Order_Month'] < test_start_month][final_vbls]
        model.fit(X_train, y_train)
        model = model.best_estimator_
        print(model.get_params)

    for planning_month in test_data_period['Order_Month'].sort_values().unique():
        df_D['Demand_copy'] = df_D['Demand']
        print('planning_month:', planning_month, '\n')
        if demand_lag_param:
            for lag in range(1, demand_lag + 1):
                demand_lag_str = 'demand_lag_' + str(lag)
                df_D[demand_lag_str] = df_D.groupby(['Part_No'])[target].shift(lag)
                df_D[demand_lag_str] = df_D[demand_lag_str].fillna(0)
        for month in range(0, offset):
            target_month = planning_month + month
            print('target_month:', target_month)
            if target_month > test_end_month:
                continue
            for lag in range(1, demand_lag + 1):
                demand_lag_str = 'demand_lag_' + str(lag)
                df_D[demand_lag_str] = df_D.groupby(['Part_No'])[target].shift(lag)
                df_D[demand_lag_str] = df_D[demand_lag_str].fillna(0)
            train_data_org = df_D[df_D['Order_Month'] < target_month]
            train_data = df_D[df_D['Order_Month'] < target_month]
            test_data = df_D[df_D['Order_Month'] == target_month]
            test_data_org = df_D[df_D['Order_Month'] == target_month]
            if scale:
                X_train = pd.DataFrame(scale_x.fit_transform(train_data[final_vbls]))
                X_train.columns = final_vbls
                y_train = pd.DataFrame(scale_y.fit_transform(train_data[target]))
                X_test = pd.DataFrame(scale_x.transform(test_data[final_vbls]))
                X_test.columns = final_vbls
                model.fit(X_train, y_train)
                print(model.get_params)
                y_pred = model.predict(X_test)
                y_pred = pd.DataFrame(y_pred)
                y_pred = scale_y.inverse_transform(y_pred)
                y_pred = pd.Series(pd.DataFrame(y_pred)[0])
                y_pred = y_pred.round()
                y_pred[y_pred < 0] = 0
            else:
                y_train = train_data[target]
                X_train = train_data[final_vbls]
                X_test = test_data[final_vbls]
                model.fit(X_train, y_train)
                print(model.get_params)
                y_pred = model.predict(X_test)
                y_pred = pd.Series(y_pred)
                y_pred = y_pred.round()
                y_pred[y_pred < 0] = 0
            if X_test.shape[0] == 0:
                continue
            print(X_test.columns)
            feature_importances.append(model.feature_importances_)
            df_D.loc[test_data.index, target[0]] = y_pred.values
            # DataFrame.append was removed in pandas >= 2.0; use pd.concat
            pred = pd.concat([pred, pd.DataFrame({
                'Part_No': test_data_org['Part_No'].values,
                'Actual': test_data_org['Demand'].values,
                'Offset': [month] * test_data_org.shape[0],
                'planning_month': [planning_month] * test_data_org.shape[0],
                'target_month': [target_month] * test_data_org.shape[0],
                'Fcst': y_pred,
            })], ignore_index=True)

    y_true, y_pred = np.array(pred['Actual']), np.array(pred['Fcst'])
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    accuracy = 100 - mape  # mape is in percent, so accuracy = 100 - mape
    feature_importances = [sum(x) / len(feature_importances)
                           for x in zip(*feature_importances)]
    feature_importance_matrix = pd.DataFrame({'Columns': X_train.columns,
                                              'f_imp': feature_importances})
    return pred, df_D, feature_importance_matrix, mape, accuracy
def executeML(X, y, X_test, y_test, n_jobs, feature_labels, class_labels,
              pipe, parameters, ml_type, bayesOpt=False, search_space=None,
              n_iter=32, acq_func=""):
    if bayesOpt:
        # Example search space:
        # {
        #     'C': Real(1e-6, 1e+6, prior='log-uniform'),
        #     'gamma': Real(1e-6, 1e+1, prior='log-uniform'),
        #     'degree': Integer(1, 8),
        #     'kernel': Categorical(['linear', 'poly', 'rbf']),
        # }
        acq_funcs = ("LCB", "EI", "PI", "gp_hedge")
        # fall back to "gp_hedge" when acq_func is not recognized (the
        # original index loop could never reach its fallback branch)
        if acq_func in acq_funcs:
            optimizer_kwargs = {'acq_func': acq_func}
        else:
            optimizer_kwargs = {'acq_func': acq_funcs[-1]}
        cv = BayesSearchCV(pipe,
                           search_space,
                           verbose=verbose,
                           n_iter=n_iter,
                           n_jobs=n_jobs,
                           optimizer_kwargs=optimizer_kwargs,
                           scoring=kappa_scorer)
    else:
        cv = GridSearchCV(pipe,
                          parameters,
                          verbose=verbose,
                          n_jobs=n_jobs,
                          scoring=kappa_scorer)
    tick1 = datetime.datetime.now()
    cv.fit(X, y)
    tick2 = datetime.datetime.now()
    print("\n{0} fitting time: {1}".format(ml_type, tick2 - tick1))
    print("{0} best params {1}".format(ml_type, cv.best_params_))
    print("{0} best score {1}".format(ml_type, cv.best_score_))
    tick3 = datetime.datetime.now()
    y_predict = cv.predict(X_test)
    tick4 = datetime.datetime.now()
    print("{0} test set accuracy: {1}".format(
        ml_type, accuracy_score(y_test, y_predict)))
    print("{0} test set cohen kappa: {1}".format(
        ml_type, cohen_kappa_score(y_test, y_predict)))
    print("{0} prediction time: {1}".format(ml_type, tick4 - tick3))
    print(
        classification_report(y_test,
                              y_predict,
                              target_names=class_labels,
                              digits=8))
    cnf_matrix = confusion_matrix(y_test, y_predict)
    print("")
    print(cnf_matrix)
    pickle.dump(
        cnf_matrix,
        open(os.path.join(SAVE_DIR, "cnf_matrix.best." + ml_type + ".pck"),
             'wb'))
    pickle.dump(
        cv.best_estimator_,
        open(os.path.join(SAVE_DIR, "classifier.best." + ml_type + ".pck"),
             'wb'))
    return cv.best_estimator_
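# Standalone sketch of the acquisition-function plumbing above: any of "LCB",
# "EI", "PI", or "gp_hedge" can be passed through optimizer_kwargs. The SVC
# search space and toy data are assumptions for the demo.
from skopt import BayesSearchCV
from skopt.space import Real
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X_demo, y_demo = make_classification(n_samples=100, random_state=0)
acq_search = BayesSearchCV(
    SVC(),
    {'C': Real(1e-3, 1e+3, prior='log-uniform')},
    optimizer_kwargs={'acq_func': 'EI'},  # expected improvement
    n_iter=8,
    cv=3,
    random_state=0,
)
acq_search.fit(X_demo, y_demo)
print(acq_search.best_params_)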
        random_state=42
    ),
    n_jobs=1,
    n_iter=100,
    verbose=0,
    refit=True,
    random_state=42
)

# Fit the model
result = bayes_cv_tuner.fit(data[features], target, callback=status_print)
with open('best_params.txt', 'w+') as fo:
    fo.write(str(bayes_cv_tuner.best_params_))
pred = bayes_cv_tuner.predict(data[features])
data['pred'] = pred
data['pred'].to_csv('../result/train_pre.csv')
# incomplete in the source (and .isnan() is not a pandas method; .isna() is):
# data['combine'] = data.loc[data['leak_tsne'].isna()

nrows = None
test = pd.read_csv('../input/test.csv', nrows=nrows)
#test = add_leak(test, 'leak', '../input/test_leak.csv')
#test = add_leak(test, 'leak6', '../input/test_leak_new6.csv')
#test = add_leak(test, 'leak16', '../input/test_leak_new16.csv')
#test = add_leak(test, 'leak22', '../input/test_leak_new22.csv')
#test = add_leak(test, 'leak_tsne', '../input/test_leak_tsne.csv')
test = add_leak(test, 'leak_tsne', '../input/test_leak_tsne.csv')
#test = add_leak(test, 'leak_tsne11', '../input/test_leak_tsne_11.3.csv')
#data, cols = add_bulk_leak(data, '../input/bunk_leak_test.csv')
def run_shallow(data_dir: str, results_dir: str, splits: List[str],
                metric: str, n_iter: int, n_points: int, n_folds: int,
                n_jobs: int) -> None:
    """Evaluate shallow baselines on the scruples resource.

    Train shallow baseline models on the scruples resource, reading the
    dataset from DATA_DIR, and writing trained models, logs, and other
    results to RESULTS_DIR. Performance is reported for each split provided
    as an argument.
    """
    # Step 1: Manage and construct paths.
    logger.info('Creating the results directory.')
    os.makedirs(results_dir)
    model_paths = {}
    metrics_paths = collections.defaultdict(dict)
    predictions_paths = collections.defaultdict(dict)
    for baseline in baselines.resource.SHALLOW_BASELINES.keys():
        os.makedirs(os.path.join(results_dir, baseline))
        model_paths[baseline] = os.path.join(
            results_dir, baseline, 'model.pkl')
        for split in splits:
            os.makedirs(os.path.join(results_dir, baseline, split))
            metrics_paths[baseline][split] = os.path.join(
                results_dir, baseline, split, 'metrics.json')
            predictions_paths[baseline][split] = os.path.join(
                results_dir, baseline, split, 'predictions.jsonl')

    # Step 2: Load the data.
    logger.info(f'Loading the data from {data_dir}.')
    dataset = ScruplesResource(data_dir=data_dir)

    # Step 3: Run the baselines.
    logger.info('Running the baselines.')
    for baseline, (Model, hyper_parameter_space) in tqdm.tqdm(
            baselines.resource.SHALLOW_BASELINES.items(),
            **settings.TQDM_KWARGS):
        # tune the hyper-parameters and train the model
        ids, features, labels, label_scores = dataset.train
        if hyper_parameter_space:
            model = BayesSearchCV(
                Model,
                hyper_parameter_space,
                scoring=make_scorer(
                    score_func=METRICS[metric][1],
                    **METRICS[metric][2]),
                n_iter=n_iter,
                n_points=n_points,
                cv=n_folds,
                n_jobs=os.cpu_count() if n_jobs == 0 else n_jobs,
                refit=True)
        else:
            model = Model
        model.fit(features, labels)

        # Step 4: Save the model.
        with open(model_paths[baseline], 'wb') as model_file:
            dill.dump(model, model_file)

        # Step 5: Run evaluation on the splits.
        for split in splits:
            ids, features, labels, label_scores = getattr(dataset, split)

            predictions = model.predict(features)
            probabilities = model.predict_proba(features)

            with open(metrics_paths[baseline][split], 'w') as metrics_file:
                json.dump(
                    {
                        key: metric(
                            y_true=labels,
                            y_pred=probabilities
                                if scorer_kwargs['needs_proba']
                                else predictions)
                        for key, (_, metric, scorer_kwargs)
                        in METRICS.items()
                    },
                    metrics_file)
            with open(predictions_paths[baseline][split], 'w')\
                    as predictions_file:
                for id_, probs, prediction in zip(
                        ids, probabilities, predictions):
                    predictions_file.write(
                        json.dumps({
                            'id': id_,
                            'label': prediction.tolist(),
                            'label_scores': probs.tolist()
                        }) + '\n')
    cv=10,
    n_jobs=N_JOBS,
    verbose=0,
    error_score=-9999,
    scoring=spearman_scorer,
    random_state=RANDOM_STATE,
    return_train_score=True,
    n_iter=3)

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    search.fit(train, y_train)

test = X_test[features]
predicted = search.predict(test)
model_test_score = spearman_scorer(search, test, y_test)
estimator = search.best_estimator_.named_steps['estimator']
imputer = search.best_estimator_.named_steps['imputer']


def multiproc_iter_func(max_workers, an_iter, func, item_kwarg, **kwargs):
    """
    A helper function that applies a function to each item in an iterable
    using multiple processes. 'item_kwarg' is the keyword argument for the
    item in the iterable that we pass to the function.
    """
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        future_results = [
            executor.submit(func, **{item_kwarg: item}, **kwargs)
            for item in an_iter
        ]
        # gather the results in submission order (assumed completion of the
        # function body, which is truncated in the source)
        return [future.result() for future in future_results]
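# Usage sketch for multiproc_iter_func (the power function and inputs are
# illustrative; the __main__ guard matters because ProcessPoolExecutor
# re-imports the module in worker processes).
def power(base, exponent=2):
    return base ** exponent

if __name__ == '__main__':
    cubes = multiproc_iter_func(
        max_workers=2,
        an_iter=[1, 2, 3, 4],
        func=power,
        item_kwarg='base',
        exponent=3,
    )
    print(cubes)  # [1, 8, 27, 64]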