from yellowbrick.model_selection import FeatureImportances


def get_feat_importance_logreg(mdl_logreg, x, y):
    """
    Calculate feature importance for logistic regression,
    analogous to the random forest case.

    Parameters
    ----------
    mdl_logreg : sklearn classifier
        Logistic regression classifier.
    x : pandas DataFrame
        Features.
    y : pandas DataFrame
        Labels.

    Returns
    -------
    feat_importance : ndarray
        Feature importances, in reverse order.
    visualizer : FeatureImportances
        The fitted yellowbrick visualizer.
    """
    visualizer = FeatureImportances(mdl_logreg, title='Logistic regression')
    visualizer.fit(x, y)
    visualizer.ax.remove()  # drop the axes; only the importance values are needed
    feat_importance = visualizer.feature_importances_[::-1]
    return feat_importance, visualizer
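# A minimal usage sketch for the helper above. The dataset, estimator, and
# `max_iter` value are illustrative assumptions, not part of the original code.
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

data = load_breast_cancer(as_frame=True)  # any tabular dataset works the same way
x_demo, y_demo = data.data, data.target

importances, viz = get_feat_importance_logreg(LogisticRegression(max_iter=5000), x_demo, y_demo)
print(importances[:5])  # inspect the first few values of the reversed array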
def create_feature_importance_chart(regressor, X_train, y_train):
    """Create feature importance chart.

    Tip:
        Check Sklearn-Neptune integration
        `documentation <https://docs-beta.neptune.ai/essentials/integrations/machine-learning-frameworks/sklearn>`_
        for the full example.

    Args:
        regressor (:obj:`regressor`):
            | Fitted sklearn regressor object
        X_train (:obj:`ndarray`):
            | Training data matrix
        y_train (:obj:`ndarray`):
            | The regression target for training

    Returns:
        ``neptune.types.File`` object that you can assign to run's ``base_namespace``.

    Examples:
        .. code:: python3

            import neptune.new.integrations.sklearn as npt_utils

            rfr = RandomForestRegressor()
            rfr.fit(X_train, y_train)

            run = neptune.init(project='my_workspace/my_project')
            run['visuals/feature_importance'] = npt_utils.create_feature_importance_chart(rfr, X_train, y_train)
    """
    assert is_regressor(regressor), 'regressor should be sklearn regressor.'

    chart = None

    try:
        fig, ax = plt.subplots()
        visualizer = FeatureImportances(regressor, is_fitted=True, ax=ax)
        visualizer.fit(X_train, y_train)
        visualizer.finalize()

        chart = neptune.types.File.as_image(fig)
        plt.close(fig)
    except Exception as e:
        print('Did not log feature importance chart. Error: {}'.format(e))

    return chart
def log_feature_importance_chart(regressor, X_train, y_train, experiment=None):
    """Log feature importance chart.

    Make sure you created an experiment by using ``neptune.create_experiment()`` before you use this method.

    Tip:
        Check `Neptune documentation <https://docs.neptune.ai/integrations/scikit_learn.html>`_
        for the full example.

    Args:
        regressor (:obj:`regressor`):
            | Fitted sklearn regressor object
        X_train (:obj:`ndarray`):
            | Training data matrix
        y_train (:obj:`ndarray`):
            | The regression target for training
        experiment (:obj:`neptune.experiments.Experiment`, optional, default is ``None``):
            | Neptune ``Experiment`` object to control to which experiment you log the data.
            | If ``None``, log to currently active, and most recent experiment.

    Returns:
        ``None``

    Examples:
        .. code:: python3

            rfr = RandomForestRegressor()
            rfr.fit(X_train, y_train)

            neptune.init('my_workspace/my_project')
            neptune.create_experiment()

            log_feature_importance_chart(rfr, X_train, y_train)
    """
    assert is_regressor(regressor), 'regressor should be sklearn regressor.'
    exp = _validate_experiment(experiment)

    try:
        fig, ax = plt.subplots()
        visualizer = FeatureImportances(regressor, is_fitted=True, ax=ax)
        visualizer.fit(X_train, y_train)
        visualizer.finalize()
        exp.log_image('charts_sklearn', fig, image_name='Feature Importance')
        plt.close(fig)
    except Exception as e:
        print('Did not log feature importance chart. Error: {}'.format(e))
from pathlib import Path
from typing import List, Tuple

from matplotlib import pyplot as plt
from pandas import DataFrame, Series
from sklearn.base import BaseEstimator
from yellowbrick.model_selection import FeatureImportances


def show_FeatureImportances(
    est: BaseEstimator,
    conf_mat_labels: List,
    X: DataFrame,
    y: Series,
    fig_size: Tuple = (8, 8),
    savefig: Path = Path().cwd() / "reports" / "figures" / "feats_imps.png",
    save_pref: bool = False,
) -> None:
    """Show stacked, absolute feature importances and optionally save the figure."""
    fig, ax = plt.subplots(figsize=fig_size)
    viz = FeatureImportances(est, stack=True, labels=conf_mat_labels, relative=False, ax=ax)
    viz.fit(X, y)
    viz.show()
    # Save only when requested and the target file does not already exist.
    if save_pref and not savefig.is_file():
        fig.savefig(savefig, bbox_inches="tight", dpi=300)
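# A minimal call sketch; `class_names`, `X_df`, and `y_ser` are hypothetical
# stand-ins for a label list, feature frame, and label series.
from sklearn.linear_model import LogisticRegression

show_FeatureImportances(
    LogisticRegression(max_iter=1000),
    conf_mat_labels=class_names,
    X=X_df,
    y=y_ser,
    save_pref=True,  # writes reports/figures/feats_imps.png if it does not already exist
)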
# Balanced accuracy for the logistic regression predictions
logistic_score = balanced_accuracy_score(y_test, y_pred_Logisticregression.round(), adjusted=False)
scores.append(logistic_score)
print(logistic_score)

# Collect per-commodity scores and export them
data_score = pd.DataFrame(columns=['Commodity', 'score'])
data_score['Commodity'] = y_location_trains.columns
data_score['score'] = scores
print(data_score)
data_score.to_csv('/Users/monalisa/Downloads/mmai823-project-master/out/linear_scores.csv')

print(X_train.columns)

# Feature importance
from matplotlib import pyplot as plt
%matplotlib inline

viz_features = FeatureImportances(logistic, labels=X_train.columns)
viz_features.fit(X_train, y_train)
plt.tight_layout()  # apply layout before rendering inline
viz_features.show()
)  # Citation - xgboost.readthedocs.io xgboost API reference documentation

selector = RFE(
    estimator, verbose=0, n_features_to_select=40
)  # Citation - scikit-learn.org sklearn API reference documentation
selector.fit(X, y)

# Select the best features
selected = [
    X.columns[i] for i in selector.get_support(indices=True)
]  # Citation - scikit-learn.org sklearn API reference documentation
selected

X = X[selected]

plt.figure(figsize=(11, 9))
ax = plt.gca()  # Citation - matplotlib.org matplotlib API reference documentation

# Title-case the features for better display and create the visualizer
model = RandomForestClassifier(
    n_jobs=-1
)  # Citation - scikit-learn.org sklearn API reference documentation
labels = list(map(lambda s: s.title(), X.columns))
viz = FeatureImportances(
    model, labels=labels, relative=True
)  # Citation - scikit-yb.org yellowbrick documentation

# Fit and show the feature importances
viz.fit(X, y)
viz.show(ax=ax)
def importances():
    X, y = load_occupancy()
    oz = FeatureImportances(RandomForestClassifier(), ax=newfig())
    oz.fit(X, y)
    savefig(oz, "feature_importances")
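# This gallery-style snippet relies on local `newfig` and `savefig` helpers.
# A minimal sketch of what they might look like; an assumption, not the
# original implementation:
import matplotlib.pyplot as plt


def newfig():
    # Create a fresh figure and hand its axes to the visualizer.
    _, ax = plt.subplots(figsize=(9, 6))
    return ax


def savefig(viz, name, outdir="images"):
    # Finalize the visualizer and write the rendered figure to disk.
    viz.finalize()
    viz.fig.savefig(f"{outdir}/{name}.png", bbox_inches="tight", dpi=150)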
# Task 8: Feature importance and evaluation metrics
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from yellowbrick.model_selection import FeatureImportances

plt.rcParams['figure.figsize'] = (12, 8)
plt.style.use("ggplot")

rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                            max_depth=5, max_features='auto', max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=1, min_samples_split=2,
                            min_weight_fraction_leaf=0.0, n_estimators=100,
                            n_jobs=-1, oob_score=False, random_state=1,
                            verbose=False, warm_start=False)

viz = FeatureImportances(rf)
viz.fit(X_train, y_train)
viz.show();

dt = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                            max_features=None, max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=1, min_samples_split=2,
                            min_weight_fraction_leaf=0.0, presort=False,
                            random_state=0, splitter='best')

viz = FeatureImportances(dt)
viz.fit(X_train, y_train)
viz.show();
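# To compare the two models at a glance, a minimal sketch placing both
# importance plots on one figure (assumes the `rf`, `dt`, `X_train`, and
# `y_train` defined above):
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
for est, ax in zip((rf, dt), axes):
    side_viz = FeatureImportances(est, ax=ax)
    side_viz.fit(X_train, y_train)
    side_viz.finalize()  # draw without opening a separate window per visualizer
plt.tight_layout()
plt.show()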
def train_model(context: MLClientCtx,
                dataset: DataItem,
                model_pkg_class: str,
                label_column: str = "label",
                train_validation_size: float = 0.75,
                sample: float = 1.0,
                models_dest: str = "models",
                test_set_key: str = "test_set",
                plots_dest: str = "plots",
                dask_key: str = "dask_key",
                dask_persist: bool = False,
                scheduler_key: str = '',
                file_ext: str = "parquet",
                random_state: int = 42) -> None:
    """
    Train a sklearn classifier with Dask

    :param context:                 Function context.
    :param dataset:                 Raw data file.
    :param model_pkg_class:         Model to train, e.g. "sklearn.ensemble.RandomForestClassifier",
                                    or json model config.
    :param label_column:            (label) Ground-truth y labels.
    :param train_validation_size:   (0.75) Train validation set proportion out of the full dataset.
    :param sample:                  (1.0) Select sample from dataset (n-rows/% of total); randomize rows by default.
    :param models_dest:             (models) Models subfolder on artifact path.
    :param test_set_key:            (test_set) Mlrun db key of held-out data in artifact store.
    :param plots_dest:              (plots) Plot subfolder on artifact path.
    :param dask_key:                (dask key) Key of dataframe in dask client "datasets" attribute.
    :param dask_persist:            (False) Should the data be persisted (through `client.persist`).
    :param scheduler_key:           (scheduler) Dask scheduler configuration, json also logged as an artifact.
    :param file_ext:                (parquet) Format for test_set_key held-out data.
    :param random_state:            (42) sklearn seed.
    """
    if scheduler_key:
        client = Client(scheduler_key)
    else:
        client = Client()

    context.logger.info("Read Data")
    df = dataset.as_df(df_module=dd)

    context.logger.info("Prep Data")
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    df = df.select_dtypes(include=numerics)

    if df.isna().any().any().compute() == True:
        raise Exception('NA values found')

    df_header = df.columns

    df = df.sample(frac=sample).reset_index(drop=True)
    encoder = LabelEncoder()
    encoder = encoder.fit(df[label_column])
    X = df.drop(label_column, axis=1).to_dask_array(lengths=True)
    y = encoder.transform(df[label_column])

    classes = df[label_column].drop_duplicates()  # no unique() in dask
    classes = [str(i) for i in classes]

    context.logger.info("Split and Train")
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, train_size=train_validation_size, random_state=random_state)

    scaler = StandardScaler()
    scaler = scaler.fit(X_train)
    X_train_transformed = scaler.transform(X_train)
    X_test_transformed = scaler.transform(X_test)

    model_config = gen_sklearn_model(model_pkg_class, context.parameters.items())
    model_config["FIT"].update({"X": X_train_transformed, "y": y_train})

    ClassifierClass = create_class(model_config["META"]["class"])
    model = ClassifierClass(**model_config["CLASS"])

    with joblib.parallel_backend("dask"):
        model = model.fit(**model_config["FIT"])

    artifact_path = context.artifact_subpath(models_dest)
    plots_path = context.artifact_subpath(models_dest, plots_dest)

    context.logger.info("Evaluate")
    extra_data_dict = {}
    for report in (ROCAUC, ClassificationReport, ConfusionMatrix):
        report_name = str(report.__name__)
        plt.cla()
        plt.clf()
        plt.close()

        viz = report(model, classes=classes, per_class=True, is_fitted=True)
        viz.fit(X_train_transformed, y_train)            # Fit the training data to the visualizer
        viz.score(X_test_transformed, y_test.compute())  # Evaluate the model on the test data

        plot = context.log_artifact(PlotArtifact(report_name, body=viz.fig, title=report_name),
                                    db_key=False)
        extra_data_dict[str(report)] = plot

        if report_name == 'ROCAUC':
            context.log_results({
                "micro": viz.roc_auc.get("micro"),
                "macro": viz.roc_auc.get("macro")
            })
        elif report_name == 'ClassificationReport':
            for score_name in viz.scores_:
                for score_class in viz.scores_[score_name]:
                    context.log_results({
                        score_name + "-" + score_class: viz.scores_[score_name].get(score_class)
                    })

    viz = FeatureImportances(model,
                             classes=classes,
                             per_class=True,
                             is_fitted=True,
                             labels=df_header.delete(df_header.get_loc(label_column)))
    viz.fit(X_train_transformed, y_train)
    viz.score(X_test_transformed, y_test)

    plot = context.log_artifact(PlotArtifact("FeatureImportances",
                                             body=viz.fig,
                                             title="FeatureImportances"),
                                db_key=False)
    extra_data_dict[str("FeatureImportances")] = plot

    plt.cla()
    plt.clf()
    plt.close()

    context.logger.info("Log artifacts")
    artifact_path = context.artifact_subpath(models_dest)
    plots_path = context.artifact_subpath(models_dest, plots_dest)

    context.set_label('class', model_pkg_class)

    context.log_model("model",
                      body=dumps(model),
                      artifact_path=artifact_path,
                      model_file="model.pkl",
                      extra_data=extra_data_dict,
                      metrics=context.results,
                      labels={"class": model_pkg_class})

    context.log_artifact("standard_scaler",
                         body=dumps(scaler),
                         artifact_path=artifact_path,
                         model_file="scaler.gz",
                         label="standard_scaler")

    context.log_artifact("label_encoder",
                         body=dumps(encoder),
                         artifact_path=artifact_path,
                         model_file="encoder.gz",
                         label="label_encoder")

    df_to_save = delayed(np.column_stack)((X_test, y_test)).compute()
    context.log_dataset(test_set_key,
                        df=pd.DataFrame(df_to_save, columns=df_header),  # improve log dataset ability
                        format=file_ext,
                        index=False,
                        labels={"data-type": "held-out"},
                        artifact_path=context.artifact_subpath('data'))

    context.logger.info("Done!")
print( f"Root Mean Square Error for Random Forest = {round(rmse_random_forest, 3)}" ) print( f"R^2(coefficient of determination) on training set = {round(score_random_forest_train, 3)}" ) print( f"R^2(coefficient of determination) on testing set = {round(score_random_forest_test, 3)}" ) print("Classification Report") print(classification_report(y_test, pred_random_forest)) print("Confusion Matrix:") print(confusion_matrix(y_test, pred_random_forest)) plt.figure(figsize=(16, 10)) viz = FeatureImportances(random_forest) viz.fit(x_train, y_train) viz.show() # Plot learning curve def plot_learning_curve(estimator, title, x, y, ylim=None, cv=None, n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5)): plt.figure() plt.title(title)