def test_transition_features(): expl = Explanation( estimator='some estimator', targets=[ TargetExplanation('class1', feature_weights=FeatureWeights( pos=[FeatureWeight('pos', 13, value=1)], neg=[], )), TargetExplanation('class2', feature_weights=FeatureWeights( pos=[FeatureWeight('pos', 13, value=1)], neg=[], )), ], transition_features=TransitionFeatureWeights( class_names=['class2', 'class1'], # reverse on purpose coef=np.array([[1.5, 2.5], [3.5, 4.5]]), )) df_dict = format_as_dataframes(expl) assert isinstance(df_dict, dict) assert set(df_dict) == {'targets', 'transition_features'} assert df_dict['targets'].equals(format_as_dataframe(expl.targets)) df = df_dict['transition_features'] print(df) print(format_as_text(expl)) assert str(df) == ('to class2 class1\n' 'from \n' 'class2 1.5 2.5\n' 'class1 3.5 4.5') with pytest.warns(UserWarning): single_df = format_as_dataframe(expl) assert single_df.equals(df)
def test_transition_features(): expl = Explanation( estimator='some estimator', targets=[ TargetExplanation('class1', feature_weights=FeatureWeights( pos=[FeatureWeight('pos', 13, value=1)], neg=[], )), TargetExplanation('class2', feature_weights=FeatureWeights( pos=[FeatureWeight('pos', 13, value=1)], neg=[], )), ], transition_features=TransitionFeatureWeights( class_names=['class2', 'class1'], # reverse on purpose coef=np.array([[1.5, 2.5], [3.5, 4.5]]), )) df_dict = format_as_dataframes(expl) assert isinstance(df_dict, dict) assert set(df_dict) == {'targets', 'transition_features'} assert df_dict['targets'].equals(format_as_dataframe(expl.targets)) df = df_dict['transition_features'] print(df) print(format_as_text(expl)) expected = pd.DataFrame([ { 'from': 'class2', 'to': 'class2', 'coef': 1.5 }, { 'from': 'class2', 'to': 'class1', 'coef': 2.5 }, { 'from': 'class1', 'to': 'class2', 'coef': 3.5 }, { 'from': 'class1', 'to': 'class1', 'coef': 4.5 }, ], columns=['from', 'to', 'coef']) assert df.equals(expected) with pytest.warns(UserWarning): single_df = format_as_dataframe(expl) assert single_df.equals(df)
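Both versions of test_transition_features pin down the same eli5 contract: format_as_dataframes() splits a multi-part Explanation into one DataFrame per part, while format_as_dataframe() on a multi-part explanation can only return a single frame and therefore emits a UserWarning. A minimal sketch of that contract, reusing the expl object built in the tests above:

import warnings

from eli5.formatters.as_dataframe import format_as_dataframe, format_as_dataframes

parts = format_as_dataframes(expl)  # {'targets': ..., 'transition_features': ...}
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    single = format_as_dataframe(expl)  # multi-part input: picks one part and warns
assert any(issubclass(w.category, UserWarning) for w in caught)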
def pred(self): self._prepData() self.pred_oh_intr = eli5.format_as_dataframe( eli5.explain_prediction_xgboost(self.model_oh.get_booster(), self.df_pred_oh.iloc[0], feature_filter=self._filter_func)) self.pred_hel_intr = eli5.format_as_dataframe( eli5.explain_prediction_xgboost(self.model_hel.get_booster(), self.df_pred_hel.iloc[0], feature_filter=self._filter_func)) self.pred_oh = self.model_oh.predict_proba(self.df_pred_oh) self.pred_hel = self.model_hel.predict_proba(self.df_pred_hel)
def test_feature_importances(with_std, with_value): expl = Explanation(estimator='some estimator', feature_importances=FeatureImportances( importances=[ FeatureWeight('a', 1, std=0.1 if with_std else None, value=1 if with_value else None), FeatureWeight('b', 2, std=0.2 if with_std else None, value=3 if with_value else None), ], remaining=10, )) df_dict = format_as_dataframes(expl) assert isinstance(df_dict, dict) assert list(df_dict) == ['feature_importances'] df = df_dict['feature_importances'] expected_df = pd.DataFrame({'weight': [1, 2]}, index=['a', 'b']) if with_std: expected_df['std'] = [0.1, 0.2] if with_value: expected_df['value'] = [1, 3] print(df, expected_df, sep='\n') assert expected_df.equals(df) single_df = format_as_dataframe(expl) assert expected_df.equals(single_df)
def main():
    df = pd.read_excel('data/mr_vs_fr_30.xlsx')
    df = df.sample(frac=1, random_state=seed)
    df['text_lemmatized'] = df['text'].apply(morphText)
    X_train, X_test, y_train, y_test = train_test_split(
        df['text_lemmatized'], df['label'], test_size=0.3,
        random_state=42, stratify=df['label'])
    flag_test = True
    get_pipe(X_train, y_train, flag_test, X_test, y_test)
    flag_test = False
    pipe = get_pipe(df['text_lemmatized'], df['label'], flag_test)
    k = 0
    words = []
    for index, row in df.iterrows():
        te5 = TextExplainer(clf=DecisionTreeClassifier(max_depth=5), random_state=seed)
        te5.fit(row['text_lemmatized'], pipe.predict_proba)
        df_eli5_w = eli5.format_as_dataframe(te5.explain_weights())
        print('class {}'.format('male' if row['label'] == 0 else 'female'))
        print('predict:')
        print(df_eli5_w)
        print(100 * '*')
        temp_m = ', '.join(df_eli5_w[df_eli5_w['weight'] > 0]['feature'].tolist())
        if temp_m:
            words.append(temp_m)
        else:
            words.append('')
        k += 1
    df['words'] = words
    df.to_excel('mr_vs_fr_words_30.xlsx', index=False)
def test_targets_with_value(): expl = Explanation( estimator='some estimator', targets=[ TargetExplanation('y', feature_weights=FeatureWeights( pos=[ FeatureWeight('a', 13, value=1), FeatureWeight('b', 5, value=2) ], neg=[ FeatureWeight('neg1', -10, value=3), FeatureWeight('neg2', -1, value=4) ], )), TargetExplanation('y2', feature_weights=FeatureWeights( pos=[FeatureWeight('f', 1, value=5)], neg=[], )), ], ) df = format_as_dataframe(expl) expected_df = pd.DataFrame( { 'weight': [13, 5, -1, -10, 1], 'value': [1, 2, 4, 3, 5] }, columns=['weight', 'value'], index=pd.MultiIndex.from_tuples([('y', 'a'), ('y', 'b'), ('y', 'neg2'), ('y', 'neg1'), ('y2', 'f')], names=['target', 'feature'])) print(df, expected_df, sep='\n') assert expected_df.equals(df)
def print_eli5(click_data, category):
    pred = pd.read_csv(category + '_xy.csv')
    model = joblib.load(category + ".h5")
    pred = pred.loc[(pred['grid_x'] == click_data['points'][0]['lon'])
                    & (pred['grid_y'] == click_data['points'][0]['lat']), :]
    pred_sqr = pred['eurogrid_0250_1'].values[0]
    dane_model = df.loc[df['eurogrid_0250_1'] == pred_sqr, :]
    dict_ = eli5.format_as_dataframe(eli5.explain_weights(model))
    cols = dict_['feature'].values
    mapping = {}
    for i in range(len(cols)):
        mapping['x' + str(i)] = cols[i]
    # print(dane_model.columns)
    expl = dane_model.loc[:, cols]
    # print(expl.head())
    all_cols = itertools.permutations(cols)
    for cols in all_cols:
        try:
            expl = expl.loc[:, list(cols)]
            expl = eli5.formatters.format_as_dataframe(
                eli5.explain_prediction(model, expl))
            break
        except Exception:
            continue
    expl['feature'] = expl['feature'].apply(lambda x: map_x(x, mapping))
    return generate_table(expl)
def test_targets(with_std, with_value): expl = Explanation( estimator='some estimator', targets=[ TargetExplanation( 'y', feature_weights=FeatureWeights( pos=[ FeatureWeight('a', 13, std=0.13 if with_std else None, value=2 if with_value else None), FeatureWeight('b', 5, std=0.5 if with_std else None, value=1 if with_value else None) ], neg=[ FeatureWeight('neg1', -10, std=0.2 if with_std else None, value=5 if with_value else None), FeatureWeight('neg2', -1, std=0.3 if with_std else None, value=4 if with_value else None) ], )), TargetExplanation('y2', feature_weights=FeatureWeights( pos=[FeatureWeight('f', 1)], neg=[], )), ], ) df_dict = format_as_dataframes(expl) assert isinstance(df_dict, dict) assert list(df_dict) == ['targets'] df = df_dict['targets'] expected_df = pd.DataFrame( { 'target': ['y', 'y', 'y', 'y', 'y2'], 'feature': ['a', 'b', 'neg2', 'neg1', 'f'], 'weight': [13, 5, -1, -10, 1] }, columns=['target', 'feature', 'weight']) if with_std: expected_df['std'] = [0.13, 0.5, 0.3, 0.2, None] if with_value: expected_df['value'] = [2, 1, 4, 5, None] print(df, expected_df, sep='\n') assert expected_df.equals(df) single_df = format_as_dataframe(expl) assert expected_df.equals(single_df)
def test_explain_prediction(boston_train): X, y, feature_names = boston_train reg = LinearRegression() reg.fit(X, y) expl = explain_prediction(reg, X[0]) df = format_as_dataframe(expl) check_prediction_df(df, expl) check_prediction_df(explain_prediction_df(reg, X[0]), expl) df_dict = explain_prediction_dfs(reg, X[0]) assert set(df_dict.keys()) == {'targets'} check_prediction_df(df_dict['targets'], expl)
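The explain_prediction_df / explain_prediction_dfs helpers used in this test are thin wrappers that chain explain_prediction with the DataFrame formatters. A short sketch of the equivalence, assuming the fitted reg and X from the test above:

import eli5

df = eli5.explain_prediction_df(reg, X[0])        # single DataFrame
df_dict = eli5.explain_prediction_dfs(reg, X[0])  # dict keyed by explanation part
assert df.equals(df_dict['targets'])              # a regression explanation only has 'targets'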
def test_explain_weights_fi(boston_train): X, y, feature_names = boston_train reg = ExtraTreesRegressor() reg.fit(X, y) expl = explain_weights(reg) df = format_as_dataframe(expl) assert list(df.columns) == ['weight', 'std'] for fw in expl.feature_importances.importances: df_fw = df.loc[fw.feature] assert np.isclose(df_fw['weight'], fw.weight) assert np.isclose(df_fw['std'], fw.std)
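The weights side has a matching convenience wrapper; a minimal sketch assuming the fitted reg from the test above, where eli5.explain_weights_df(reg) produces the same frame as format_as_dataframe(explain_weights(reg)):

import eli5

wdf = eli5.explain_weights_df(reg)  # feature importances, indexed by feature
assert {'weight', 'std'} <= set(wdf.columns)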
def prediction(model, test_feat, test_lab, cross_valid, name, pred_prob):
    if not pred_prob:
        pred = model.predict(test_feat)
        return results(test_lab, pred, cross_valid, name)
    else:
        pred = model.predict(test_feat)
        results(test_lab, pred, cross_valid, name)
        if name in ("Decision Tree", "Random Forest"):
            expl = explain_prediction_tree_classifier(model, test_feat.iloc[0])
            expl_df = format_as_dataframe(expl)
            if name == "Decision Tree":
                expl_df.to_csv('/.../expl_dt.csv')
            else:
                expl_df.to_csv('/.../expl_rf.csv')
                expfi = explain_rf_feature_importance(model, feature_names=list(test_feat))
                expfi_df = format_as_dataframe(expfi)
                expfi_df.to_csv('/.../expfi_rf.csv')
        elif name == "XGBoost":
            expl = explain_prediction_xgboost(model, test_feat.iloc[0])
            expl_df = format_as_dataframe(expl)
            expl_df.to_csv('/.../expl_xgb.csv')
            expfi = explain_weights_xgboost(model, feature_names=list(test_feat))
            expfi_df = format_as_dataframe(expfi)
            expfi_df.to_csv('/.../expfi_xgb.csv')
        else:
            print("//")
def explain(): data = request.get_json(force=True) # app.logger.info(data) df = pd.DataFrame(data, index=[0]) data_array = transformer.transform(df) exp = explain_prediction(model, data_array[0], feature_names=all_feature_names, top=(5, 5), targets=[True]) output = format_as_dataframe(exp).to_dict() return jsonify(output)
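Since the endpoint above serializes the explanation with DataFrame.to_dict(), clients receive a nested {column: {row_index: value}} mapping. A hypothetical client; the URL and the feature names in the payload are placeholders, not part of the original service:

import requests

payload = {'feature_a': 1.0, 'feature_b': 0.0}  # one raw observation (placeholder names)
resp = requests.post('http://localhost:5000/explain', json=payload)
out = resp.json()  # DataFrame serialized via .to_dict(): {column: {row: value}}
print(out['feature'], out['weight'])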
def st_lime_explanation( text: str, predict_func: Callable[[List[str]], np.ndarray], unique_labels: List[str], n_samples: int, position_dependent: bool = True, ): # TODO just use ELI5's built-in visualization when streamlit supports it: # https://github.com/streamlit/streamlit/issues/779 with st.spinner("Generating LIME explanations..."): te = TextExplainer( random_state=1, n_samples=n_samples, position_dependent=position_dependent ) te.fit(text, predict_func) st.json(te.metrics_) explanation = te.explain_prediction() explanation_df = eli5.format_as_dataframe(explanation) for target_ndx, target in enumerate( sorted(explanation.targets, key=lambda t: -t.proba) ): target_explanation_df = explanation_df[ explanation_df["target"] == target_ndx ].copy() target_explanation_df["contribution"] = ( target_explanation_df["weight"] * target_explanation_df["value"] ) target_explanation_df["abs_contribution"] = abs( target_explanation_df["contribution"] ) target_explanation_df = ( target_explanation_df.drop("target", axis=1) .sort_values(by="abs_contribution", ascending=False) .reset_index(drop=True) ) st.subheader( f"Target: {unique_labels[target_ndx]} (probability {target.proba:.4f}, score {target.score:.4f})" ) st.dataframe(target_explanation_df)
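A hypothetical invocation of st_lime_explanation, assuming a fitted scikit-learn text pipeline pipe whose predict_proba accepts a list of strings (the names here are placeholders, not from the original app):

st_lime_explanation(
    text='document to explain',
    predict_func=pipe.predict_proba,
    unique_labels=[str(c) for c in pipe.classes_],
    n_samples=500,
)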
def main(): training_labels, testing_labels = bring_in_labels() # bring in actual datasets, 18 data is given training labels # Team name removed to avoid error in ML fitting url18 = 'https://www.basketball-reference.com/leagues/NBA_2019.html' url19 = 'https://www.basketball-reference.com/leagues/NBA_2020.html' rstats_18 = data_prep.scrape_regular(url18) rstats_18 = rstats_18.merge(training_labels, how='left', on='Team') rstats_18 = rstats_18.loc[:, rstats_18.columns != 'Team'] astats_18 = data_prep.scrape_advanced(url18) astats_18 = astats_18.merge(training_labels, how='left', on='Team') astats_18 = astats_18.loc[:, astats_18.columns != 'Team'] # 19 data is given testing labels rstats_19 = data_prep.scrape_regular(url19) rstats_19 = rstats_19.merge(testing_labels, how='left', on='Team') rstats_19 = rstats_19.loc[:, rstats_19.columns != 'Team'] astats_19 = data_prep.scrape_advanced(url19) astats_19 = astats_19.merge(testing_labels, how='left', on='Team') astats_19 = astats_19.loc[:, astats_19.columns != 'Team'] # Train both models on their 2018 data r_model = train_model(rstats_18) a_model = train_model(astats_18) # Test the r_models ability to predict rtest_labels = rstats_19['W/L%'] rtest_features = rstats_19.loc[:, rstats_18.columns != 'W/L%'] r_model_predictions = r_model.predict(rtest_features) r_model_mse = mean_squared_error(rtest_labels, r_model_predictions) # test the a_models ability to predict atest_labels = astats_19['W/L%'] atest_features = astats_19.loc[:, astats_18.columns != 'W/L%'] a_model_predictions = a_model.predict(atest_features) a_model_mse = mean_squared_error(atest_labels, a_model_predictions) # Stands for regular feature names rf_names = rstats_18.columns rf_names = list(rf_names[:len(rf_names) - 1]) r_imp_features = eli5.format_as_dataframe(eli5.explain_weights( r_model, top=10, feature_names=rf_names)) # Stands for advanced feature names af_names = astats_18.columns af_names = list(af_names[:len(af_names) - 1]) a_imp_features = eli5.format_as_dataframe(eli5.explain_weights( a_model, top=10, feature_names=af_names)) plot_MSE_diffs(r_model_mse, a_model_mse) plot_rif(r_imp_features) plot_aif(a_imp_features)
def main(input_file_path, output_file_path, tgt="Oil_norm", n_splits=5): input_file_name = os.path.join(input_file_path, "Train_final.pck") input_file_name_test = os.path.join(input_file_path, "Test_final.pck") input_file_name_val = os.path.join(input_file_path, "Validation_final.pck") output_file_name = os.path.join(output_file_path, f"models_lgbm_{tgt}.pck") df = pd.read_pickle(input_file_name).drop(exclude_cols, axis=1) df_test = pd.read_pickle(input_file_name_test) df_val = pd.read_pickle(input_file_name_val).drop(exclude_cols, axis=1) ids = df_test["EPAssetsId"] ids_uwi = df_test["UWI"] df_test = df_test.drop(exclude_cols, axis=1) cv = KFold(n_splits=n_splits, shuffle=False) models = [] scores = [] scores_dm = [] y = df.loc[~df[tgt].isna(), tgt] X = df.loc[~df[tgt].isna(), :].drop( [ "Oil_norm", "Gas_norm", "Water_norm", "EPAssetsId", "_Normalized`IP`BOE/d" ], axis=1, ) X_test = df_test.copy().drop("EPAssetsId", axis=1) X_holdout, y_holdout = ( df_val.loc[~df_val[tgt].isna(), :].drop( [ "Oil_norm", "Gas_norm", "Water_norm", "EPAssetsId", "_Normalized`IP`BOE/d", ], axis=1, ), df_val.loc[~df_val[tgt].isna(), tgt], ) preds_test = np.zeros((n_splits, df_test.shape[0])) preds_holdout = np.zeros((n_splits, X_holdout.shape[0])) for k, (train_index, test_index) in enumerate(cv.split(X, y)): X_train, X_val = X.iloc[train_index, :], X.iloc[test_index, :] # model = LGBMRegressor(num_leaves=16, learning_rate=0.1, n_estimators=300, reg_lambda=30, reg_alpha=30, # objective='mae',random_state=123) model = LogLGBM( num_leaves=16, learning_rate=0.05, n_estimators=900, reg_lambda=0, reg_alpha=0, objective="mae", random_state=123, feature_fraction=0.7, ) y_train, y_val = y.iloc[train_index], y.iloc[test_index] geom_mean = gmean(y_train) dm = DummyRegressor(strategy="constant", constant=geom_mean) model.fit(X_train, y_train, categorical_feature=CAT_COLUMNS) # model.fit(X_train, y_train) dm.fit(X_train, y_train) score = mean_absolute_error(y_holdout, model.predict(X_holdout)) score_dm = mean_absolute_error(y_val, dm.predict(X_val)) # logging.info(f' Score = {score}') models.append(model) scores.append(score) scores_dm.append((score_dm)) logger.warning(f"Holdout score = {score}") preds_test[k, :] = model.predict(X_test).reshape(1, -1) preds_holdout[k, :] = model.predict(X_holdout).reshape(1, -1) with open(output_file_name, "wb") as f: pickle.dump(models, f) logger.info(scores) logger.info(f"Mean scores LGBM = {np.mean(scores)}") logger.info(f"Mean scores Dummy = {np.mean(scores_dm)}") preds_df = pd.DataFrame({ "EPAssetsID": ids, "UWI": ids_uwi, tgt: preds_test.mean(axis=0) }) preds_df_val = pd.DataFrame({ tgt: preds_holdout.mean(axis=0), "gt": y_holdout }) score_holdout = mean_absolute_error(preds_df_val["gt"], preds_df_val[tgt]) logger.warning(f"Final score on holdout: {score_holdout}") print(eli5.format_as_dataframe(eli5.explain_weights(model))) return preds_df, score_holdout, preds_df_val
def main(
    input_file_path,
    output_file_path,
    tgt="Oil_norm",
    interim_file_path=None,
    n_splits=7,
):
    input_file_name = os.path.join(input_file_path, "Train_final.pck")
    input_file_name_test = os.path.join(input_file_path, "Test_final.pck")
    input_file_name_val = os.path.join(input_file_path, "Validation_final.pck")
    exclude_cols = exclude_cols_dict.get(tgt)
    output_file_name = os.path.join(output_file_path, f"models_lgbm_{tgt}.pck")

    df = pd.read_pickle(input_file_name).drop(exclude_cols, axis=1)
    df_test = pd.read_pickle(input_file_name_test)
    df_val = pd.read_pickle(input_file_name_val).drop(exclude_cols, axis=1)
    df_all = pd.concat([df, df_val], axis=0)
    df_all[tgt] = df_all[tgt].fillna(value=0)

    ids = df_test["EPAssetsId"]
    ids_uwi = df_test["UWI"]
    df_test = df_test.drop(exclude_cols, axis=1)

    cv = KFold(n_splits=n_splits, shuffle=False)
    models = []
    scores = []
    scores_dm = []

    y = df_all.loc[~df_all[tgt].isna(), tgt]
    id_X = df_all.loc[~df_all[tgt].isna(), ["EPAssetsId"]]
    X = df_all.loc[~df_all[tgt].isna(), :].drop(
        ["Oil_norm", "Gas_norm", "Water_norm", "EPAssetsId", "_Normalized`IP`BOE/d"],
        axis=1,
    )
    X_test = df_test.copy().drop("EPAssetsId", axis=1)

    preds_test = np.zeros((n_splits, df_test.shape[0]))
    preds_holdout = []
    y_true = []
    id_list = []
    np.random.seed(123)

    best_params = pd.read_csv(
        os.path.join(output_file_path, f"LGBM_{tgt}_feats_final_Trials.csv")
    ).head(20)
    datasets = {}

    for k, (train_index, test_index) in enumerate(cv.split(X, y)):
        X_train, X_holdout = X.iloc[train_index, :], X.iloc[test_index, :]
        id_X_holdout = id_X.iloc[test_index]
        # model = LGBMRegressor(num_leaves=16, learning_rate=0.1, n_estimators=300,
        #                       reg_lambda=30, reg_alpha=30, objective='mae', random_state=123)
        params = best_params.iloc[0, :].to_dict()
        model = LogLGBM(
            learning_rate=0.05,
            n_estimators=3500,
            objective="mse",
            num_leaves=int(params["num_leaves"]),
            feature_fraction=params["feature_fraction"],
            min_data_in_leaf=int(params["min_data_in_leaf"]),
            bagging_fraction=params["bagging_fraction"],
            lambda_l1=params["lambda_l1"],
            lambda_l2=params["lambda_l2"],
            random_state=k,
        )
        y_train, y_holdout = y.iloc[train_index], y.iloc[test_index]
        geom_mean = gmean(y_train)
        dm = DummyRegressor(strategy="constant", constant=geom_mean)
        model.fit(
            X_train,
            y_train,
            categorical_feature=set(CAT_COLUMNS) - set(exclude_cols),
            eval_set=(X_holdout, y_holdout),
            early_stopping_rounds=150,
            verbose=200,
        )
        # model.fit(X_train, y_train)
        dm.fit(X_train, y_train)

        score = mean_absolute_error(y_holdout, model.predict(X_holdout))
        score_dm = mean_absolute_error(y_holdout, dm.predict(X_holdout))
        # logging.info(f' Score = {score}')
        models.append(model)
        scores.append(score)
        scores_dm.append(score_dm)
        logger.warning(f"Holdout score = {score}")
        preds_test[k, :] = model.predict(X_test)
        preds_holdout.append(model.predict(X_holdout).reshape(1, -1))
        y_true.append(y_holdout.values.reshape(1, -1))
        print(
            mean_absolute_error(
                y_holdout.values.reshape(1, -1), model.predict(X_holdout).reshape(1, -1)
            )
        )

    with open(output_file_name, "wb") as f:
        pickle.dump(models, f)

    logger.info(scores)
    logger.info(f"Mean scores LGBM = {np.mean(scores)}")
    logger.info(f"Mean scores Dummy = {np.mean(scores_dm)}")

    preds_df = pd.DataFrame(
        {"EPAssetsID": ids, "UWI": ids_uwi, tgt: preds_test.mean(axis=0)}
    )
    n_points = np.hstack(y_true).shape[0]
    preds_df_val = pd.DataFrame(
        {tgt: np.hstack(preds_holdout)[0, :], f"gt_{tgt}": np.hstack(y_true)[0, :]}
    )
    logger.warning(f"Final scores on holdout: {np.mean(scores)} +- {np.std(scores)}")
    logger.warning(
        f"Final scores on full holdout: "
        f"{mean_absolute_error(preds_df_val[f'gt_{tgt}'], preds_df_val[tgt])}"
    )
    print(eli5.format_as_dataframe(eli5.explain_weights(model, top=60)))
    return preds_df, preds_df_val, np.mean(scores)
def test_bad_list(): with pytest.raises(ValueError): format_as_dataframe([1])
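format_as_dataframe rejects arbitrary lists like the one above, but a list of TargetExplanation objects is a supported input, which is what test_transition_features exercises via format_as_dataframe(expl.targets). A minimal sketch:

from eli5.base import FeatureWeight, FeatureWeights, TargetExplanation
from eli5.formatters.as_dataframe import format_as_dataframe

targets = [TargetExplanation('y', feature_weights=FeatureWeights(
    pos=[FeatureWeight('a', 1.0)], neg=[]))]
df = format_as_dataframe(targets)  # one row per feature weight, no warning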
import eli5
from eli5.sklearn import PermutationImportance
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# fetch best performing model
best_model = RF_gscv.best_estimator_
best_model2 = MLP_gscv.best_estimator_

# fit permutation importance on test data
perm = PermutationImportance(best_model).fit(test_img, test_lab)
perm2 = PermutationImportance(best_model2).fit(test_img, test_lab)

# show weights
wghts = eli5.format_as_dataframe(eli5.explain_weights(perm))
wghts2 = eli5.format_as_dataframe(eli5.explain_weights(perm2))

# write dataframes to csv
wghts.to_csv(
    'D:/studies/phd/WV3_Data_July2019/010039360030_01/L_Sabie_subset/rf_permImportance.csv',
    encoding='utf-8', index=False)
wghts2.to_csv(
    'D:/studies/phd/WV3_Data_July2019/010039360030_01/L_Sabie_subset/mlp_permImportance.csv',
    encoding='utf-8', index=False)

gLawn = mlp_map_prob[:, 3]
w = x_img_arr[:, -9]
plt.scatter(w, gLawn)
def train(data, *regs, save_to=None, concat_features=False, explain=False): coords = utils.load_coords() concated_xs = np.concatenate(data['xs'], axis=1) all_rmse, all_patch_rmse, all_baselines = [], [], [] regs_name = ', '.join(type(reg).__name__ for reg in regs) fitted_regs = [] expl_by_cls = defaultdict(list) for cls in range(utils.N_CLASSES): ids = data['ids'][cls] scales = data['scales'][cls] ys = data['ys'][cls] xs = input_features(concated_xs if concat_features else data['xs'][cls]) # indices = np.array(sorted(range(len(ids)), key=lambda i: (scales[i], ids[i]))) # ids, xs, ys = ids[indices], xs[indices], ys[indices] pred, fitted = train_predict(regs, xs, ys, ids) ys_by_id, pred_by_id = [], [] unique_ids = sorted(set(ids)) pred_by_id = get_pred_by_id(ids, pred, unique_ids) for img_id in unique_ids: try: ys_by_id.append((coords.loc[[img_id]].cls == cls).sum()) except KeyError: ys_by_id.append(0) pred_by_id = round_prediction(pred_by_id) patch_rmse = np.sqrt(metrics.mean_squared_error(ys, pred)) rmse = np.sqrt(metrics.mean_squared_error(ys_by_id, pred_by_id)) baseline_rmse = np.sqrt(metrics.mean_squared_error( cross_val_predict(DummyRegressor(), [[0]] * len(ys_by_id), ys_by_id, cv=5), ys_by_id)) print('cls {}, patch mean {:.3f}, patch RMSE {:.3f}, ' 'image mean {:.2f}, image RMSE {:.2f}, baseline RMSE {:.2f}' .format(cls, np.mean(ys), patch_rmse, np.mean(ys_by_id), rmse, baseline_rmse)) all_rmse.append(rmse) all_patch_rmse.append(patch_rmse) all_baselines.append(baseline_rmse) if save_to: fitted_regs.append(fitted) if explain: for reg in fitted: expl = eli5.explain_weights(reg, feature_names=FEATURE_NAMES) expl_by_cls[cls].append(expl) print(type(reg).__name__, format_as_text( expl, show=('method', 'targets', 'feature_importances'))) print('{} with {} features: mean patch RMSE {:.3f}, mean image RMSE {:.2f}, ' 'mean baseline RMSE {:.2f}' .format(regs_name, ', '.join(FEATURE_NAMES), np.mean(all_patch_rmse), np.mean(all_rmse), np.mean(all_baselines))) if save_to: joblib.dump(fitted_regs, save_to) print('Saved to', save_to) if explain: dfs = [] for cls, expls in expl_by_cls.items(): for expl in expls: df = eli5.format_as_dataframe(expl) df['cls'] = cls df['estimator'] = expl.estimator.split('(')[0] dfs.append(df) df = pd.concat(dfs) df.reset_index(inplace=True) df['feature'] = df['index'] del df['index'] df = df[['feature', 'cls', 'estimator', 'std', 'weight']] df.to_csv('feature_importances.csv', index=None)
def main_run_linear_models(train_ds, val_ds, test_ds, data_props, max_backlooking=None,
                           layer_type='dense', activation_funcs=['sigmoid', 'relu', 'tanh'],
                           max_serach_iterations=200, NN_max_depth=3, MAX_EPOCHS=800,
                           patience=25, model_name='linear', examples=None,
                           return_permutation_importances=True, redo_serach_best_model=False):
    mlflow.set_experiment(model_name)
    experiment_date_time = int(datetime.datetime.now().strftime("%Y%m%d%H%M%S"))

    flatten_input = True if layer_type == 'dense' else False

    def _extract_just_important_data_props(data_props):
        kwargs = {}
        kwargs['dataset_cols_X_just_these'] = data_props['third_filter']['cols_just_these']
        kwargs['dataset_cols_X_exclude'] = data_props['third_filter']['cols_drop']
        kwargs['dataset_cols_y'] = data_props['third_filter']['y_cols_just_these']
        kwargs['dataset_hash_input'] = int(data_props['first_step']['dataset'])
        kwargs['dataset_hash_first'] = data_props['first_step_data_hash']
        kwargs['dataset_hash_second'] = data_props['second_step_data_hash']
        kwargs['dataset_split_method'] = data_props['second_step']['split_method']
        kwargs['dataset_split_steps_train'] = data_props['second_step']['split_props']['train_time_steps']
        kwargs['dataset_split_steps_val'] = data_props['second_step']['split_props']['val_time_steps']
        kwargs['dataset_split_steps_test'] = data_props['second_step']['split_props']['test_time_steps']
        kwargs['dataset_iter_step'] = data_props['iter_step']
        kwargs['dataset_normalization'] = data_props['second_step']['normalize_method']
        kwargs['dataset_window_backlooking'] = data_props['first_step']['window_input_width']
        kwargs['dataset_window_prediction'] = data_props['first_step']['window_pred_width']
        kwargs['dataset_window_shift'] = data_props['first_step']['window_shift']
        return kwargs

    def _hp_tranform_param_dict(param_dict):
        new_param_dict = {}
        for key, value in param_dict.items():
            if type(value) == list:
                new_param_dict[key] = hp.choice(key, value)
            elif type(value) == set:
                new_param_dict[key] = hp.uniform(key, *value)
            else:
                new_param_dict[key] = value
        return new_param_dict

    max_backlooking = (data_props['first_step']['window_input_width']
                       if max_backlooking is None else max_backlooking)

    param_grid = dict(
        n_layers=list(range(1, NN_max_depth + 1)),
        first_layer_nodes=[0] if NN_max_depth == 1 else [128, 64, 32, 16, 8],
        last_layer_nodes=[0] if NN_max_depth == 1 else [64, 32, 16, 8, 4],
        activation_func=activation_funcs,
        backlooking_window=list(range(1, max_backlooking + 1)))

    hp_param_dict = _hp_tranform_param_dict(param_dict=param_grid)
    hp_param_dict['model_name'] = model_name
    hp_param_dict['data_props'] = data_props
    hp_param_dict['layer_type'] = layer_type

    def _optimize_objective(*args, **kwargs):
        if args != ():
            kwargs = args[0]  # if called positionally, expect the first argument to be a dict with all kwargs
        if type(kwargs) != dict:
            raise Exception(f'kwargs is not dict - it is {type(kwargs)} with values: {kwargs}')

        backlooking_window = kwargs.pop('backlooking_window')
        n_layers = kwargs.pop('n_layers')
        first_layer_nodes = kwargs.pop('first_layer_nodes')
        last_layer_nodes = kwargs.pop('last_layer_nodes')
        activation_func = kwargs.pop('activation_func')
        return_everything = kwargs.pop('return_everything', False)
        verbose = kwargs.pop('verbose', 0)
        model_name = kwargs.pop('model_name', 'linear')
        data_props = kwargs.pop('data_props')
        layer_type = kwargs.pop('layer_type', 'dense')

        dataset = _get_prep_data(train_ds, val_ds, test_ds, flatten=flatten_input,
                                 keep_last_n_periods=backlooking_window)

        now = datetime.datetime.now()
        date_time = str(now.strftime("%y%m%d%H%M%S"))
        model_name = f"{date_time}_{model_name}_w{backlooking_window}_l{n_layers}_a{activation_func}"

        kwargs = dict(
            model_name=model_name,
            n_layers=n_layers,
            first_layer_nodes=first_layer_nodes,
            last_layer_nodes=last_layer_nodes,
            activation_func=activation_func,
            input_size=(dataset['input_shape'] if layer_type == 'dense'
                        else tuple(list(train_ds.element_spec[0].shape)[1:])),
            output_size=dataset['output_shape'],
            backlooking_window=backlooking_window,
            layer_type=layer_type)

        model = createmodel(**kwargs)
        history, mlflow_additional_params = compile_and_fit(model=model,
                                                            train=dataset['train_ds'],
                                                            val=dataset['val_ds'],
                                                            MAX_EPOCHS=MAX_EPOCHS,
                                                            patience=patience,
                                                            model_name=model_name,
                                                            verbose=verbose)

        # Get all data props for documentation in MLflow
        kwargs.update(_extract_just_important_data_props(data_props))
        kwargs['run'] = experiment_date_time
        mlflow_additional_params['kwargs'] = kwargs

        train_performance = dict(zip(model.metrics_names,
                                     evaluate_model(model=model, tf_data=dataset['train_ds'])))
        val_performance = dict(zip(model.metrics_names,
                                   evaluate_model(model=model, tf_data=dataset['val_ds'])))
        test_performance = dict(zip(model.metrics_names,
                                    evaluate_model(model=model, tf_data=dataset['test_ds'],
                                                   mlflow_additional_params=mlflow_additional_params)))
        mlflow_additional_params['data_props'] = data_props

        # Only save model if close to 15% best models
        try:
            best_loss = float(trials.best_trial['result']['loss'])
            current_loss = min(history.history['val_loss'])
            save_model = current_loss <= best_loss * (1 + 0.15)
        except Exception:
            save_model = True
        mlflow_saved = my_helpers.mlflow_last_run_add_param(param_dict=mlflow_additional_params,
                                                            save_model=save_model)

        tf.keras.backend.clear_session()

        return_metrics = dict(loss=val_performance['loss'],
                              all_metrics={'train': train_performance,
                                           'val': val_performance,
                                           'test': test_performance},
                              status=STATUS_OK,
                              mlflow=mlflow_saved,
                              model_name=model_name)
        if return_everything:
            return_metrics['model'] = model
            return_metrics['history'] = history
        return return_metrics

    ###### Get old best model records ######
    storage_file_path = os.path.join(my_helpers.get_project_directories(key='cache_dir'),
                                     'storage_best_model.json')
    if not os.path.exists(storage_file_path):
        best_model_storage = {}
    else:
        with open(storage_file_path) as json_file:
            best_model_storage = json.load(json_file)

    ######## Search for best model ########
    if (redo_serach_best_model or model_name not in best_model_storage
            or data_props['iter_step'] not in best_model_storage[model_name]):
        warnings.filterwarnings('ignore')
        trials = Trials()
        best = fmin(fn=_optimize_objective,
                    space=hp_param_dict,
                    algo=tpe.suggest,
                    max_evals=max_serach_iterations,
                    trials=trials,
                    early_stop_fn=no_progress_loss(iteration_stop_count=int(max_serach_iterations / 4),
                                                   percent_increase=0.025))
        warnings.simplefilter('always')

        # getting all parameters for best model storage
        mlflow_best_model = trials.best_trial['result']['mlflow']
        best_params = {}
        for key, idx in best.items():
            best_params[key] = param_grid[key][idx]

        coef_names_ = list(data_props['look_ups']['out_lookup_col_name']['X'].keys())
        coef_names_ = coef_names_ + [col + f'_sft_{i}'
                                     for i in range(1, best_params['backlooking_window'])
                                     for col in coef_names_]

        # Saving best model to storage
        if model_name not in best_model_storage:
            best_model_storage[model_name] = {}
        if data_props['iter_step'] not in best_model_storage[model_name]:
            best_model_storage[model_name][data_props['iter_step']] = {
                'best_model': {'result': {'loss': 10**10}},
                'history': {}}

        best_model_param = dict(
            result={'loss': trials.best_trial['result']['loss'],
                    'all_metrics': trials.best_trial['result']['all_metrics']},
            model_name=trials.best_trial['result']['model_name'],
            model_id=trials.best_trial['result']['mlflow']['model_id'],
            run_id=experiment_date_time,
            input_coefs=coef_names_,
            path_saved_model=trials.best_trial['result']['mlflow']['saved_model_path'],
            status=trials.best_trial['result']['status'],
            params=best_params,
            data=_extract_just_important_data_props(data_props))

        best_model_storage[model_name][data_props['iter_step']]['history'][experiment_date_time] = best_model_param
        if (trials.best_trial['result']['loss']
                < best_model_storage[model_name][data_props['iter_step']]['best_model']['result']['loss']):
            best_model_storage[model_name][data_props['iter_step']]['best_model'] = best_model_param

        with open(storage_file_path, 'w') as outfile:
            json.dump(best_model_storage, outfile)
    else:
        # Get best model from storage
        best_model_param = best_model_storage[model_name][data_props['iter_step']]['best_model']

    ######## Get Best model again ########
    best_model = tf.keras.models.load_model(best_model_param['path_saved_model'])
    best_model.compile(loss=tf.losses.MeanAbsoluteError(),
                       optimizer=tf.optimizers.Adam(),
                       metrics=[tf.metrics.MeanAbsoluteError(),
                                CustomMeanDirectionalAccuracy(),
                                tf.losses.Huber(),
                                tf.metrics.MeanAbsolutePercentageError(),
                                tf.metrics.MeanSquaredError(),
                                tf.metrics.MeanSquaredLogarithmicError()])

    print('Best model is:', best_model_param)
    out = dict(best_model_param)

    ####### Get examples for plotting #######
    if examples is not None:
        example_X = examples['X']
        periods = best_model_param['params']['backlooking_window']
        if layer_type == 'dense':
            example_X = tf.data.Dataset.from_tensors(
                np.reshape(example_X[:, -periods:, :], (example_X.shape[0], -1)))
        else:
            example_X = tf.data.Dataset.from_tensors(example_X)
        out['examples_pred_y'] = best_model.predict(example_X)

    ###### For 1 layer dense/linear models get coef & p-values ######
    if NN_max_depth == 1 and isinstance(best_model.layers[0], tf.keras.layers.Dense):
        # Get coefs
        intercept_ = best_model.layers[0].bias.numpy()
        coef_ = best_model.layers[0].weights[0].numpy()
        out['coef_'] = pd.Series(dict(zip(['intercept_'] + best_model_param['input_coefs'],
                                          intercept_.tolist() + coef_.squeeze().tolist())))

        dataset = _get_prep_data(train_ds, val_ds, test_ds, flatten=True,
                                 keep_last_n_periods=best_model_param['params']['backlooking_window'])

        # get p-values
        import app.d_prediction.my_custom_pvalue_calc as my_p_lib
        out['p_values'] = {}
        for data_set in ['train', 'val', 'test']:
            y_pred = best_model.predict(dataset[f'{data_set}_X'])
            y_pred = np.reshape(y_pred, (-1, 1))
            try:
                p_values = my_p_lib.coef_pval(dataset[f'{data_set}_X'], dataset[f'{data_set}_y'],
                                              coef_, intercept_, y_pred)
                p_values = pd.Series(dict(zip(best_model_param['input_coefs'], p_values)))
                out['p_values'][data_set] = p_values
            except Exception:
                warnings.warn("P-Values: ValueError: Input contains infinity or nan.")
                out['p_values'][data_set] = pd.Series(dict(zip(best_model_param['input_coefs'],
                                                               ['error'] * len(best_model_param['input_coefs']))))
        out['p_values'] = pd.DataFrame(out['p_values'])

    ##### Get Column Feature Importance #####
    if return_permutation_importances:
        if 'feature_importance' in best_model_param:
            out['feature_importance'] = best_model_param['feature_importance']
        else:
            import eli5
            from eli5.sklearn import PermutationImportance

            sklearn_model = KerasRegressor(build_fn=best_model)
            sklearn_model.model = best_model
            dataset = _get_prep_data(train_ds, val_ds, test_ds, flatten=flatten_input,
                                     keep_last_n_periods=best_model_param['params']['backlooking_window'])

            out['feature_importance'] = {}
            for data_set in ['train', 'val']:
                # Calculate actual feature importance
                try:
                    perm = PermutationImportance(sklearn_model, cv='prefit').fit(
                        dataset[f'{data_set}_X'].numpy(),
                        np.reshape(dataset[f'{data_set}_y'].numpy(), (-1, 1)))
                    feature_importances = eli5.format_as_dataframe(
                        eli5.explain_weights(perm,
                                             feature_names=best_model_param['input_coefs'],
                                             top=10**10))
                    out['feature_importance'][data_set] = feature_importances.set_index('feature').to_dict()
                except Exception:
                    warnings.warn("PermutationImportance: ValueError: Input contains infinity "
                                  "or a value too large for dtype('float16').")

            if out['feature_importance'] != {}:
                best_model_param['feature_importance'] = out['feature_importance']
                best_model_storage[model_name][data_props['iter_step']]['best_model']['feature_importance'] = out['feature_importance']
                best_model_storage[model_name][data_props['iter_step']]['history'][experiment_date_time]['feature_importance'] = out['feature_importance']
                with open(storage_file_path, 'w') as outfile:
                    json.dump(best_model_storage, outfile)

    out['status'] = 'ok'
    return out
def process(self, inputs):
    max_features = self._campaign_configuration['FeatureSelection']['max_features']

    # setting parameters for XGBoost design space exploration
    xgboost_parameters = copy.deepcopy(self._campaign_configuration)
    xgboost_parameters['General']['techniques'] = ['XGBoost']
    xgboost_parameters['General']['run_num'] = 1
    local_root_directory = self._campaign_configuration['General']['output']
    for token in self._prefix:
        local_root_directory = os.path.join(local_root_directory, token)
    xgboost_parameters['General']['output'] = local_root_directory
    del xgboost_parameters['FeatureSelection']

    model_building_var = model_building.model_building.ModelBuilding(0)

    if 'XGBoost' not in xgboost_parameters:
        # default parameters if not provided in the ini file
        xgboost_parameters['XGBoost'] = {}
        xgboost_parameters['XGBoost']['min_child_weight'] = [1, 3]
        xgboost_parameters['XGBoost']['gamma'] = [0, 1]
        xgboost_parameters['XGBoost']['n_estimators'] = [50, 100, 150, 250]
        xgboost_parameters['XGBoost']['learning_rate'] = [0.01, 0.05, 0.1]
        xgboost_parameters['XGBoost']['max_depth'] = [1, 2, 3, 5, 9, 13]

    best_conf = model_building_var.process(xgboost_parameters, inputs,
                                           int(self._campaign_configuration['General']['j']))

    # best_conf is an XGBoost configuration experiment
    xgb_regressor = best_conf.get_regressor()

    # top = None means all
    expl = eli5.xgboost.explain_weights_xgboost(xgb_regressor,
                                                feature_names=inputs.x_columns,
                                                top=max_features,
                                                importance_type='gain')

    # text version
    expl_weights = eli5.format_as_text(expl)
    self._logger.debug("XGBoost feature scores:\n%s", str(expl_weights))

    df = eli5.format_as_dataframe(expl)  # data frame version
    xgb_sorted_features = df['feature'].values.tolist()  # features list
    features_sig = df['weight'].values.tolist()  # significance score weights

    cumulative_significance = 0
    tolerance = self._campaign_configuration['FeatureSelection']['XGBoost_tolerance']
    index = 0
    while cumulative_significance < tolerance and index < len(features_sig):
        cumulative_significance += features_sig[index]
        index += 1
    feat_res = xgb_sorted_features[0:index]

    self._logger.info("XGBoost selected features: %s", str(feat_res))
    data = inputs
    data.x_columns = feat_res
    return data
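The while loop above walks the weights in the descending order eli5 returns them and keeps features until their cumulative weight first reaches XGBoost_tolerance, including the feature that crosses the threshold. A toy illustration of the same cutoff with made-up weights:

import pandas as pd

toy = pd.DataFrame({'feature': ['a', 'b', 'c', 'd'],
                    'weight': [0.5, 0.3, 0.15, 0.05]})  # already sorted by weight
tolerance = 0.8
before = toy['weight'].cumsum().shift(fill_value=0)  # cumulative weight before each row
selected = toy.loc[before < tolerance, 'feature'].tolist()
# tolerance=0.8 selects ['a', 'b']: 0.5 + 0.3 reaches the threshold, so 'c' is dropped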
def classify(features, labels, model='all', resample_method=None,
             scoring='roc_auc_ovo', cv=10, n_iter=10):
    '''
    A nested function to apply machine learning classification

    Args:
        features: A pandas dataframe containing the features
        labels: A pandas dataframe containing the labels
        model: Options are:
               'rf' - Random Forest
               'gbm' - Gradient Boosting
               'dt' - Decision Tree
               'et' - Extremely Randomized Tree
               'log_sgd' - Logistic Regression with Stochastic Gradient Descent learning
               'all' - Tests out all five of the models and identifies which model
                       is the best based on the cross-validation score
               Default is 'all'
        resample_method: Resampling to deal with imbalanced data
                         Reference: https://imbalanced-learn.readthedocs.io/en/stable/combine.html#bpm2004
                         Options are:
                         'smote_tomek' - https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.combine.SMOTETomek.html#imblearn.combine.SMOTETomek
                         'smote_enn' - https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.combine.SMOTEENN.html#imblearn.combine.SMOTEENN
                         Default is None
        scoring: The metric for evaluating model performance
                 Reference: https://scikit-learn.org/stable/modules/model_evaluation.html
                 Default is 'roc_auc_ovo'
        cv: The number of splits for cross-validation
            Default is 10
        n_iter: The number of parameter settings that are sampled
                Reference: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
                Default is 10

    Returns:
        the tuned classifier
        a report containing the f1-score, precision, and recall for each class
        the Matthews Correlation Coefficient value
        the log loss (cross-entropy loss)
        a pandas dataframe of the features predicted to be of a certain class
        given its weight and value
        a confusion matrix figure
        a ROC curve figure
    '''
    # Make sure the features and labels are of type pandas dataframes
    if not isinstance(features, pd.DataFrame) or not isinstance(labels, pd.DataFrame):
        raise TypeError('The features and labels are not of a pandas dataframe type.')

    # Make sure the number of rows are the same in features and labels
    assert features.shape[0] == labels.shape[0], 'Unequal number of rows.'

    # Get the names of the features
    feature_names = features.columns.values

    def standardize(X):
        '''
        Custom standardization function

        Args:
            X: the features as a numpy array

        Returns the standardized features
        '''
        return (X - np.mean(X)) / np.std(X)

    def which_model(X, y, model='all'):
        '''
        Using the baseline models (default parameters) of Random Forest,
        Gradient Boosting, Decision Tree, Extremely Randomized Tree, and
        Logistic Regression with Stochastic Gradient Descent learning on the
        entire dataset (with cross-validation) to determine which model is
        best. The user can either test the 5 models individually or test all
        of them by setting model = 'all'

        Args:
            X: The numpy array containing the features
            y: The numpy array containing the labels
            model: Options are:
                   'rf' - Random Forest
                   'gbm' - Gradient Boosting
                   'dt' - Decision Tree
                   'et' - Extremely Randomized Tree
                   'log_sgd' - Logistic Regression with Stochastic Gradient Descent learning
                   'all' - Tests out all five of the models and identifies which
                           model is the best based on the cross-validation score
                   Default is 'all'

        Returns best model
        '''
        if model == 'all':
            # Pipelines help prevent data leakage
            pipelines = []
            pipelines.append(('Random Forest', skl_pipeline([
                ('Standardization', FunctionTransformer(standardize)),
                ('RF', RandomForestClassifier(random_state=9999))])))
            pipelines.append(('Gradient Boosting', skl_pipeline([
                ('Standardization', FunctionTransformer(standardize)),
                ('GBM', GradientBoostingClassifier(random_state=9999))])))
            pipelines.append(('Decision Tree', skl_pipeline([
                ('Standardization', FunctionTransformer(standardize)),
                ('DT', DecisionTreeClassifier(random_state=9999))])))
            pipelines.append(('Extra Trees', skl_pipeline([
                ('Standardization', FunctionTransformer(standardize)),
                ('ET', ExtraTreesClassifier(random_state=9999))])))
            pipelines.append(('Logistic Regression (SGD)', skl_pipeline([
                ('Standardization', FunctionTransformer(standardize)),
                ('LOGSGD', SGDClassifier(loss='log', random_state=9999))])))

            print('\nSelecting model...')
            print('\nModel\tScore')
            print('-------------')
            results = []
            names = []
            for name, pipe in pipelines:
                # Apply cross validation
                cv_results = cross_val_score(pipe, X, y, cv=cv, scoring=scoring)
                results.append(np.mean(cv_results))
                names.append(name)
                print('{}: {:.4f} ± {:.4f}'.format(name, np.mean(cv_results),
                                                   np.std(cv_results)))
            names_results = list(zip(names, results))

            # Return model with highest score value
            selected_model = max(names_results, key=lambda item: item[1])
            print('\nThe selected model is', selected_model)
            if 'Gradient Boosting' in selected_model:
                return GradientBoostingClassifier()
            elif 'Random Forest' in selected_model:
                return RandomForestClassifier()
            elif 'Decision Tree' in selected_model:
                return DecisionTreeClassifier()
            elif 'Extra Trees' in selected_model:
                return ExtraTreesClassifier()
            elif 'Logistic Regression (SGD)' in selected_model:
                return SGDClassifier(loss='log')
        elif model == 'rf':
            rf_pipe = skl_pipeline([
                ('Standardization', FunctionTransformer(standardize)),
                ('RF', RandomForestClassifier(random_state=9999))])
            rf_score = cross_val_score(rf_pipe, X, y, cv=cv, scoring=scoring)
            print('\nRandom Forest score: {:.4f} ± {:.4f}'.format(
                np.mean(rf_score), np.std(rf_score)))
            return RandomForestClassifier()
        elif model == 'gbm':
            gb_pipe = skl_pipeline([
                ('Standardization', FunctionTransformer(standardize)),
                ('GBM', GradientBoostingClassifier(random_state=9999))])
            gb_score = cross_val_score(gb_pipe, X, y, cv=cv, scoring=scoring)
            print('\nGradient Boosting score: {:.4f} ± {:.4f}'.format(
                np.mean(gb_score), np.std(gb_score)))
            return GradientBoostingClassifier()
        elif model == 'dt':
            dt_pipe = skl_pipeline([
                ('Standardization', FunctionTransformer(standardize)),
                ('DT', DecisionTreeClassifier(random_state=9999))])
            dt_score = cross_val_score(dt_pipe, X, y, cv=cv, scoring=scoring)
            print('\nDecision Tree score: {:.4f} ± {:.4f}'.format(
                np.mean(dt_score), np.std(dt_score)))
            return DecisionTreeClassifier()
        elif model == 'et':
            et_pipe = skl_pipeline([
                ('Standardization', FunctionTransformer(standardize)),
                ('ET', ExtraTreesClassifier(random_state=9999))])
            et_score = cross_val_score(et_pipe, X, y, cv=cv, scoring=scoring)
            print('\nExtra Trees score: {:.4f} ± {:.4f}'.format(
                np.mean(et_score), np.std(et_score)))
            return ExtraTreesClassifier()
        elif model == 'log_sgd':
            log_pipe = skl_pipeline([
                ('Standardization', FunctionTransformer(standardize)),
                ('LOGSGD', SGDClassifier(loss='log', random_state=9999))])
            log_score = cross_val_score(log_pipe, X, y, cv=cv, scoring=scoring)
            print('\nLogistic Regression (SGD) score: {:.4f} ± {:.4f}'.format(
                np.mean(log_score), np.std(log_score)))
            return SGDClassifier(loss='log')

    def train(selected_model, X_train, y_train, resample_method=None):
        '''
        Train and tune the hyperparameters of the selected model

        Random Search is used because the parameter search space is large and
        performs as well as Grid Search. Hyperparameter tuning is more art
        than science as it is based on expertise and experience

        Args:
            selected_model: The model from which_model()
            X_train: The training features
            y_train: The training labels

        Returns the tuned classifier
        '''
        print('\nStarting training...')

        n_estimators = [int(x) for x in np.linspace(start=100, stop=1000, num=10)]
        max_depths = [int(x) for x in np.linspace(10, 110, num=11)]

        # One randomized-search grid per classifier
        param_grids = {
            'GradientBoostingClassifier': {
                'clf__n_estimators': n_estimators,
                'clf__subsample': [0.7, 0.8],
                'clf__learning_rate': [0.001, 0.01, 0.1],
                'clf__max_depth': max_depths,
                'clf__max_features': ['sqrt', 'log2'],
                'clf__min_samples_split': [2, 5, 10],
                'clf__min_samples_leaf': [1, 2, 4],
                'clf__loss': ['deviance'],
                'clf__random_state': [9999]},
            'RandomForestClassifier': {
                'clf__n_estimators': n_estimators,
                'clf__max_depth': max_depths,
                'clf__max_features': ['sqrt', 'auto'],
                'clf__min_samples_leaf': [1, 2, 4],
                'clf__min_samples_split': [2, 5, 10],
                'clf__bootstrap': [True],
                'clf__class_weight': ['balanced', None],
                'clf__random_state': [9999]},
            'DecisionTreeClassifier': {
                'clf__criterion': ['gini'],
                'clf__splitter': ['best', 'random'],
                'clf__max_depth': max_depths,
                'clf__min_samples_leaf': [1, 2, 4],
                'clf__max_features': ['sqrt', 'auto'],
                'clf__min_samples_split': [2, 5, 10],
                'clf__class_weight': ['balanced', None],
                'clf__random_state': [9999]},
            'ExtraTreesClassifier': {
                'clf__criterion': ['gini'],
                'clf__n_estimators': n_estimators,
                'clf__bootstrap': [True, False],
                'clf__max_depth': max_depths,
                'clf__min_samples_leaf': [1, 2, 4],
                'clf__max_features': ['sqrt', 'auto'],
                'clf__min_samples_split': [2, 5, 10],
                'clf__class_weight': ['balanced', None],
                'clf__random_state': [9999]},
            'SGDClassifier': {
                'clf__loss': ['log'],
                'clf__penalty': ['l2', 'l1', 'elasticnet'],
                'clf__alpha': [0.01, 0.001, 0.0001],
                'clf__max_iter': [1000, 5000],
                'clf__class_weight': ['balanced', None],
                'clf__random_state': [9999]},
        }
        grid = dict(param_grids[selected_model.__class__.__name__])

        # Build the pipeline; resampling (if any) happens inside the pipeline
        # so that it is refit on every cross-validation split
        steps = [('Standardization', FunctionTransformer(standardize))]
        if resample_method == 'smote_tomek':
            steps.append(('SMOTETOMEK', SMOTETomek()))
            grid['SMOTETOMEK__random_state'] = [9999]
        elif resample_method == 'smote_enn':
            steps.append(('SMOTENN', SMOTEENN()))
            grid['SMOTENN__random_state'] = [9999]
        steps.append(('clf', selected_model))
        pipe = skl_pipeline(steps) if resample_method is None else imbl_pipeline(steps)

        start = time()
        clf = RandomizedSearchCV(pipe, grid, cv=cv, n_iter=n_iter,
                                 scoring=scoring, n_jobs=-1, random_state=9999)
        clf.fit(X_train, y_train)
        time_elapsed = time() - start

        print('\nThe best {}-fold cross validation score is {:.4f}.'.format(
            cv, clf.best_score_))
        print('The best parameters are:\n',
              clf.best_estimator_.get_params()['clf'])
        print('Training took {:.0f}m {:.0f}s.'.format(time_elapsed // 60,
                                                      time_elapsed % 60))
        return clf

    def evaluate(clf, X_test, y_test):
        '''
        Evaluate the tuned classifier's performance on the testing set

        Args:
            clf: The tuned classifier from train()
            X_test: The test data features
            y_test: The test data labels

        Returns:
            a report containing the f1-score, precision, and recall of each class
            the Matthews Correlation Coefficient
            the log loss (cross-entropy loss)
            a confusion matrix figure
            a ROC curve figure
        '''

        def plot_confusion_matrix(y_test, y_pred):
            '''
            Confusion matrix

            Args:
                y_test: The test set labels
                y_pred: The predicted labels

            Returns confusion matrix figure
            '''
            cm = confusion_matrix(y_test, y_pred)
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            df_cm = pd.DataFrame(cm, columns=np.unique(y_test),
                                 index=np.unique(y_test))
            df_cm.index.name = 'True Labels'
            df_cm.columns.name = 'Predicted Labels'
            cm_fig = sns.heatmap(df_cm, cmap='Blues', annot=True, cbar=False)
            for _, spine in cm_fig.spines.items():
                spine.set_visible(True)
            plt.title('{} Confusion Matrix'.format(model_name))
            plt.yticks(rotation=0)
            return cm_fig

        y_pred = clf.predict(X_test)
        report = classification_report(y_test, y_pred)
        mcc = matthews_corrcoef(y_test, y_pred)
        if 'log' in clf.best_estimator_.get_params().values():
            model_name = 'Logistic Regression (SGD)'
            y_probas = clf.best_estimator_['clf'].predict_proba(X_test)
        else:
            model_name = selected_model.__class__.__name__
            y_probas = clf.predict_proba(X_test)
        conf_mat = plot_confusion_matrix(y_test, y_pred)

        # Binary classification vs multi-class classification ROC curve
        if len(np.unique(y_test)) > 2:
            roc_curve = plot_roc(y_test, y_probas,
                                 title='{} ROC curve'.format(model_name))
        else:
            roc_curve = plot_roc_curve(clf, X_test, y_test, name=model_name)
            roc_curve.figure_.suptitle('{} ROC curve'.format(model_name))
        loss_score = log_loss(y_test, y_probas)

        # Get the features of their predicted class based on weight and value
        if 'log' in selected_model.get_params().values():
            feat_imp = eli5.sklearn.explain_prediction_linear_classifier(
                clf.best_estimator_['clf'], X_test[1], feature_names=feature_names)
        else:
            feat_imp = eli5.sklearn.explain_prediction.explain_prediction_tree_classifier(
                clf.best_estimator_['clf'], X_test[1], feature_names=feature_names)
        return report, mcc, loss_score, feat_imp, conf_mat, roc_curve

    X = features.to_numpy()
    y = labels.to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                        test_size=0.2,
                                                        random_state=9999)
    selected_model = which_model(X, y, model=model)
    tuned_clf = train(selected_model, X_train, y_train, resample_method=resample_method)
= train(selected_model, X_train, y_train, resample_method=resample_method) report, mcc, loss_score, feat_imp, conf_mat, roc_curve = evaluate( tuned_clf, X_test, y_test) feat_imp = eli5.format_as_dataframe(feat_imp) return tuned_clf, report, mcc, loss_score, feat_imp, conf_mat, roc_curve
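The eli5 round-trip used in evaluate() above can be exercised standalone. The sketch below is illustrative only: the RandomForestClassifier, the synthetic X_demo/y_demo data, and the feature names are stand-ins invented for the example, not part of the pipeline. It shows explain_prediction on a tree classifier followed by format_as_dataframe, which typically yields one row per contributing feature (plus the '<BIAS>' term) with target, feature, weight, and value columns.

# Minimal sketch (assumed stand-ins): explain one prediction of a tree
# classifier and convert the explanation to a DataFrame, as evaluate() does.
import eli5
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X_demo, y_demo = make_classification(n_samples=200, n_features=5,
                                     random_state=0)
feature_names = ['f{}'.format(i) for i in range(5)]
rf = RandomForestClassifier(n_estimators=50, random_state=0)
rf.fit(X_demo, y_demo)

# Explain a single sample, mirroring the X_test[1] call in evaluate()
expl = eli5.explain_prediction(rf, X_demo[1], feature_names=feature_names)
df = eli5.format_as_dataframe(expl)
print(df)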
def main(input_file_path,
         output_file_path,
         tgt="Oil_norm",
         interim_file_path=None,
         n_splits=11):
    condition = condition_dict[tgt]
    input_file_name = os.path.join(input_file_path, "Train_final.pck")
    input_file_name_test = os.path.join(input_file_path, "Test_final.pck")
    input_file_name_val = os.path.join(input_file_path, "Validation_final.pck")
    exclude_cols = exclude_cols_dict.get(tgt)
    output_file_name = os.path.join(output_file_path, f"models_lgbm_{tgt}.pck")

    df = pd.read_pickle(input_file_name).drop(exclude_cols, axis=1)
    df_test = pd.read_pickle(input_file_name_test)
    df_val = pd.read_pickle(input_file_name_val).drop(exclude_cols, axis=1)
    df_all = pd.concat([df, df_val], axis=0)

    ids = df_test["EPAssetsId"].copy()
    ids_uwi = df_test["UWI"].copy()
    df_test = df_test.drop(exclude_cols, axis=1)

    cv = KFold(n_splits=n_splits, shuffle=False)
    models = []
    scores = []
    scores_dm = []

    y = df_all.loc[~df_all[tgt].isna(), tgt]
    id_X = df_all.loc[~df_all[tgt].isna(), ["EPAssetsId"]]
    X = df_all.loc[~df_all[tgt].isna(), :].drop(
        [
            "Oil_norm", "Gas_norm", "Water_norm", "EPAssetsId",
            "_Normalized`IP`BOE/d"
        ],
        axis=1,
    )
    # Filter large vals
    # condition = y < threshold_dict[tgt]
    # X = X.loc[condition, :]
    # y = y.loc[condition]

    X_test = df_test.copy().drop("EPAssetsId", axis=1)
    preds_test = np.zeros((n_splits, df_test.shape[0]))
    preds_holdout = []
    y_true = []
    id_list = []
    np.random.seed(123)
    best_params = pd.read_csv(
        os.path.join(output_file_path,
                     f'LGBM_{tgt}_feats_final_Trials.csv')).head(20)
    datasets = {}
    for k, (train_index, test_index) in enumerate(cv.split(X, y)):
        X_train, X_holdout = X.iloc[train_index, :], X.iloc[test_index, :]
        print(X_train.shape)
        id_X_holdout = id_X.iloc[test_index]
        # model = LGBMRegressor(num_leaves=16, learning_rate=0.1,
        #                       n_estimators=300, reg_lambda=30, reg_alpha=30,
        #                       objective='mae', random_state=123)
        if tgt == 'Oil_norm':
            params = best_params.iloc[k, :].to_dict()
        else:
            params = best_params.iloc[0, :].to_dict()
        # NOTE: params is loaded per fold but currently unused; the fold
        # models below are LogRF instances with fixed hyperparameters.
        model = LogRF(target=tgt, max_depth=17, n_estimators=200)
        y_train, y_holdout = y.iloc[train_index], y.iloc[test_index]
        idx = (y_train > condition[0]) & (y_train < condition[1])
        X_train, y_train = X_train.loc[idx, :], y_train.loc[idx]

        # Calculate a fill value:
        # target_log_mean = np.median(np.log(y_train[y_train > 0]))
        # target_fill_val = np.exp(target_log_mean)
        target_fill_val = 0
        y_train = y_train.fillna(value=target_fill_val)
        logging.info(f'Filling {tgt} with {target_fill_val}')
        y_holdout = y_holdout.fillna(value=0)

        # Baseline: a constant predictor at the geometric mean of the target
        geom_mean = gmean(y_train)
        dm = DummyRegressor(strategy="constant", constant=geom_mean)
        X_train = X_train.fillna(-999)
        X_holdout = X_holdout.fillna(-999)
        X_test = X_test.fillna(-999)
        model.fit(X_train, y_train)
        dm.fit(X_train, y_train)

        y_pred_holdout = model.predict(X_holdout)
        score = mean_absolute_error(y_holdout, y_pred_holdout)
        score_dm = mean_absolute_error(y_holdout, dm.predict(X_holdout))
        models.append(model)
        scores.append(score)
        scores_dm.append(score_dm)
        logger.warning(f"Holdout score = {score}")
        preds_test[k, :] = model.predict(X_test)
        preds_holdout.append(y_pred_holdout.reshape(1, -1))
        y_true.append(y_holdout.values.reshape(1, -1))
        id_list.append(id_X_holdout.values.reshape(1, -1))
        print(
            mean_absolute_error(y_holdout.values.reshape(1, -1),
                                y_pred_holdout.reshape(1, -1)))

    with open(output_file_name, "wb") as f:
        pickle.dump(models, f)
    logger.info(scores)
    logger.info(f"Mean scores LGBM = {np.mean(scores)}")
    logger.info(f"Mean scores Dummy = {np.mean(scores_dm)}")
    # preds_df = pd.DataFrame(
    #     {"EPAssetsID": ids, "UWI": ids_uwi, tgt: preds_test.mean(axis=0)}
    # )
    preds_df = pd.DataFrame({
        "EPAssetsID": ids,
        "UWI": ids_uwi,
        tgt: mean_log(preds_test)
    })
    n_points = np.hstack(y_true).shape[0]
    preds_df_val = pd.DataFrame({
        tgt: np.hstack(preds_holdout)[0, :],
        f"gt_{tgt}": np.hstack(y_true)[0, :],
        'EPAssetsId': np.hstack(id_list)[0, :]
    })
    logger.warning(
        f"Final scores on holdout: {np.mean(scores)} +- {np.std(scores)}")
    logger.warning(
        f"Final scores on full holdout: {mean_absolute_error(preds_df_val[f'gt_{tgt}'], preds_df_val[tgt])}"
    )
    print(eli5.format_as_dataframe(eli5.explain_weights(model, top=60)))
    return preds_df, preds_df_val, np.mean(scores)