def display_dataset_analysis(
        self,
        global_analysis: bool = True,
        univariate_analysis: bool = True,
        target_analysis: bool = True,
        multivariate_analysis: bool = True
):
    """
    Performs and displays an exploration of the given data. It allows comparing
    train and test values within each part of the analysis. The parameters of the
    method allow filtering which parts to display.

    Parameters
    ----------
    global_analysis : bool
        Whether or not to display the global analysis part.
    univariate_analysis : bool
        Whether or not to display the univariate analysis part.
    target_analysis : bool
        Whether or not to display the target analysis part, which plots the
        distribution of the target variable.
    multivariate_analysis : bool
        Whether or not to display the multivariate analysis part.
    """
    if global_analysis:
        print_md("### Global analysis")
        self._display_dataset_analysis_global()

    if univariate_analysis:
        print_md("### Univariate analysis")
        self._perform_and_display_analysis_univariate(
            df=self.df_train_test,
            col_splitter="data_train_test",
            split_values=["test", "train"],
            names=["Prediction dataset", "Training dataset"],
            group_id='univariate'
        )

    if target_analysis:
        df_target = self._create_train_test_df(
            test=pd.DataFrame({self.target_name: self.y_test},
                              index=range(len(self.y_test))) if self.y_test is not None else None,
            train=pd.DataFrame({self.target_name: self.y_train},
                               index=range(len(self.y_train))) if self.y_train is not None else None
        )
        if df_target is not None:
            print_md("### Target analysis")
            self._perform_and_display_analysis_univariate(
                df=df_target,
                col_splitter="data_train_test",
                split_values=["test", "train"],
                names=["Prediction dataset", "Training dataset"],
                group_id='target'
            )

    if multivariate_analysis:
        print_md("### Multivariate analysis")
        fig_corr = generate_correlation_matrix_fig(self.df_train_test, max_features=20)
        print_html(convert_fig_to_html(fig=fig_corr))

    print_md('---')
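# Usage sketch (illustrative only): assuming `report` is an instance of this report
# class with `df_train_test`, `y_train` / `y_test` and the explainer already set,
# each section can be toggled independently, e.g. to skip the correlation matrix:
#
#     report.display_dataset_analysis(multivariate_analysis=False)
#
# The `report` variable name is hypothetical and not defined in this module.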
def _perform_and_display_analysis_univariate(
        self,
        df: pd.DataFrame,
        col_splitter: str,
        split_values: list,
        names: list,
        group_id: str
):
    """
    Computes univariate statistics for each feature of ``df``, split on the
    ``col_splitter`` column, and displays one plot and one statistics table
    per feature using the 'univariate.html' template.
    """
    col_types = compute_col_types(df)
    n_splits = df[col_splitter].nunique()
    test_stats_univariate = perform_univariate_dataframe_analysis(
        df.loc[df[col_splitter] == split_values[0]], col_types=col_types)
    if n_splits > 1:
        train_stats_univariate = perform_univariate_dataframe_analysis(
            df.loc[df[col_splitter] == split_values[1]], col_types=col_types)

    univariate_template = template_env.get_template("univariate.html")
    univariate_features_desc = list()
    list_cols_labels = [
        self.explainer.features_dict.get(col, col)
        for col in df.drop(col_splitter, axis=1).columns.to_list()
    ]
    # Iterate over features sorted by their display label
    for col_label in sorted(list_cols_labels):
        col = self.explainer.inv_features_dict.get(col_label, col_label)
        fig = generate_fig_univariate(df_all=df, col=col, hue=col_splitter, type=col_types[col])
        df_col_stats = self._stats_to_table(
            test_stats=test_stats_univariate[col],
            train_stats=train_stats_univariate[col] if n_splits > 1 else None,
            names=names
        )
        univariate_features_desc.append({
            'feature_index': int(self.explainer.inv_columns_dict.get(col, 0)),
            'name': col,
            'type': str(series_dtype(df[col])),
            'description': col_label,
            'table': df_col_stats.to_html(classes="greyGridTable"),
            'image': convert_fig_to_html(fig)
        })
    print_html(univariate_template.render(features=univariate_features_desc, groupId=group_id))
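# Input sketch (illustrative, not part of the module): the DataFrame passed to
# `_perform_and_display_analysis_univariate` is expected to hold the feature
# columns plus one extra column (`col_splitter`) whose values are exactly the
# entries of `split_values`, e.g. built along these lines:
#
#     df = pd.concat([
#         x_test.assign(data_train_test="test"),
#         x_train.assign(data_train_test="train"),
#     ])
#
# `x_test` / `x_train` are hypothetical names used only for this sketch.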
def display_model_performance(self):
    """
    Displays the performance of the model. The metrics are computed using the config dict.

    Metrics should be given as a list of dict. Each dict contains the following keys :
    'path' (path to the metric function, ex: 'sklearn.metrics.mean_absolute_error'),
    'name' (optional, name of the metric as displayed in the report),
    and 'use_proba_values' (optional, possible values are False (default) or True
    if the metric uses proba values instead of predicted values).

    For example :
    config['metrics'] = [
            {
                'path': 'sklearn.metrics.mean_squared_error',
                'name': 'Mean squared error',  # Optional : name that will be displayed next to the metric
            },
            {
                'path': 'Scoring_AP.utils.lift10',  # Custom function path
                'name': 'Lift10',
                'use_proba_values': True  # Use proba values instead of predicted values
            }
        ]
    """
    if self.y_test is None:
        logging.info("No labels given for test set. Skipping model performance part")
        return

    print_md("### Univariate analysis of target variable")
    df = pd.concat([
        pd.DataFrame({self.target_name: self.y_pred}).assign(_dataset="pred"),
        pd.DataFrame({self.target_name: self.y_test}).assign(_dataset="true")
        if self.y_test is not None else None
    ])
    self._perform_and_display_analysis_univariate(
        df=df,
        col_splitter="_dataset",
        split_values=["pred", "true"],
        names=["Prediction values", "True values"],
        group_id='target-distribution'
    )

    if 'metrics' not in self.config.keys():
        logging.info("No 'metrics' key found in report config dict. Skipping model performance part.")
        return
    print_md("### Metrics")

    for metric in self.config['metrics']:
        if 'name' not in metric.keys():
            metric['name'] = metric['path']

        if metric['path'] in ['confusion_matrix', 'sklearn.metrics.confusion_matrix'] or \
                metric['name'] == 'confusion_matrix':
            print_md(f"**{metric['name']} :**")
            print_html(
                convert_fig_to_html(
                    generate_confusion_matrix_plot(y_true=self.y_test, y_pred=self.y_pred)
                )
            )
        else:
            try:
                metric_fn = get_callable(path=metric['path'])
                # Check whether proba values should be used instead of predicted values
                if 'use_proba_values' in metric.keys() and metric['use_proba_values'] is True:
                    y_pred = self.explainer.proba_values
                else:
                    y_pred = self.y_pred
                res = metric_fn(self.y_test, y_pred)
            except Exception as e:
                logging.info(f"Could not compute following metric : {metric['path']}. \n{e}")
                continue

            if isinstance(res, Number):
                res = display_value(round_to_k(res, 3))
                print_md(f"**{metric['name']} :** {res}")
            elif isinstance(res, (list, tuple, np.ndarray)):
                print_md(f"**{metric['name']} :**")
                print_html(pd.DataFrame(res).to_html(classes="greyGridTable"))
            elif isinstance(res, str):
                print_md(f"**{metric['name']} :**")
                print_html(f"<pre>{res}</pre>")
            else:
                logging.info(
                    f"Could not compute following metric : {metric['path']}. \n"
                    f"Result of type {type(res)} cannot be displayed"
                )
    print_md('---')
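# Config sketch (illustrative only, assuming `config` is the dict held in `self.config`):
# each entry's 'path' is resolved through `get_callable`, so scikit-learn metrics and
# custom dotted paths both work; setting 'use_proba_values': True makes the metric
# receive `self.explainer.proba_values` instead of `self.y_pred` as its second argument.
#
#     config['metrics'] = [
#         {'path': 'sklearn.metrics.mean_squared_error'},   # name defaults to the path
#         {'path': 'sklearn.metrics.log_loss',
#          'name': 'Log loss',
#          'use_proba_values': True},                        # metric computed on proba values
#     ]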