Example #1
    def display_dataset_analysis(self,
                                 global_analysis: bool = True,
                                 univariate_analysis: bool = True,
                                 target_analysis: bool = True,
                                 multivariate_analysis: bool = True):
        """
        This method performs and displays an exploration of the data given.
        It allows to compare train and test values for each part of the analysis.

        The parameters of the method allow to filter which part to display or not.

        Parameters
        ----------
        global_analysis : bool
            Whether or not to display the global analysis part.
        univariate_analysis : bool
            Whether or not to display the univariate analysis part.
        target_analysis : bool
            Whether or not to display the target analysis part that plots
            the distribution of the target variable.
        multivariate_analysis : bool
            Whether or not to display the multivariate analysis part
        """
        if global_analysis:
            print_md("### Global analysis")
            self._display_dataset_analysis_global()

        if univariate_analysis:
            print_md("### Univariate analysis")
            self._perform_and_display_analysis_univariate(
                df=self.df_train_test,
                col_splitter="data_train_test",
                split_values=["test", "train"],
                names=["Prediction dataset", "Training dataset"],
                group_id='univariate')
        if target_analysis:
            df_target = self._create_train_test_df(
                test=pd.DataFrame({self.target_name: self.y_test},
                                  index=range(len(self.y_test)))
                if self.y_test is not None else None,
                train=pd.DataFrame({self.target_name: self.y_train},
                                   index=range(len(self.y_train)))
                if self.y_train is not None else None)
            if df_target is not None:
                print_md("### Target analysis")
                self._perform_and_display_analysis_univariate(
                    df=df_target,
                    col_splitter="data_train_test",
                    split_values=["test", "train"],
                    names=["Prediction dataset", "Training dataset"],
                    group_id='target')
        if multivariate_analysis:
            print_md("### Mutlivariate analysis")
            fig_corr = generate_correlation_matrix_fig(self.df_train_test,
                                                       max_features=20)
            print_html(convert_fig_to_html(fig=fig_corr))
        print_md('---')
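
A minimal usage sketch, assuming an already-built report object (here called `report`, a hypothetical name) that exposes the method above; only `display_dataset_analysis` and its flags come from the code shown:

# 'report' is assumed to be an existing report instance exposing display_dataset_analysis.
# The boolean flags filter which sections are rendered, e.g. skip the multivariate part:
report.display_dataset_analysis(
    global_analysis=True,
    univariate_analysis=True,
    target_analysis=True,
    multivariate_analysis=False,
)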
Example #2
    def _perform_and_display_analysis_univariate(self, df: pd.DataFrame,
                                                 col_splitter: str,
                                                 split_values: list,
                                                 names: list, group_id: str):
        col_types = compute_col_types(df)
        n_splits = df[col_splitter].nunique()
        test_stats_univariate = perform_univariate_dataframe_analysis(
            df.loc[df[col_splitter] == split_values[0]], col_types=col_types)
        if n_splits > 1:
            train_stats_univariate = perform_univariate_dataframe_analysis(
                df.loc[df[col_splitter] == split_values[1]],
                col_types=col_types)

        univariate_template = template_env.get_template("univariate.html")
        univariate_features_desc = list()
        list_cols_labels = [
            self.explainer.features_dict.get(col, col)
            for col in df.drop(col_splitter, axis=1).columns.to_list()
        ]
        for col_label in sorted(list_cols_labels):
            col = self.explainer.inv_features_dict.get(col_label, col_label)
            fig = generate_fig_univariate(df_all=df,
                                          col=col,
                                          hue=col_splitter,
                                          type=col_types[col])
            df_col_stats = self._stats_to_table(
                test_stats=test_stats_univariate[col],
                train_stats=train_stats_univariate[col]
                if n_splits > 1 else None,
                names=names)
            univariate_features_desc.append({
                'feature_index': int(self.explainer.inv_columns_dict.get(col, 0)),
                'name': col,
                'type': str(series_dtype(df[col])),
                'description': col_label,
                'table': df_col_stats.to_html(classes="greyGridTable"),
                'image': convert_fig_to_html(fig)
            })
        print_html(
            univariate_template.render(features=univariate_features_desc,
                                       groupId=group_id))
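
The loop above sorts columns by their display label and then maps each label back to the raw column name before computing stats and figures. A standalone sketch of that round-trip, with a hypothetical features_dict standing in for explainer.features_dict:

# Hypothetical label mapping mirroring explainer.features_dict / inv_features_dict.
features_dict = {"age": "Customer age", "amt": "Basket amount"}
inv_features_dict = {label: col for col, label in features_dict.items()}

columns = ["amt", "age", "zip"]
# The display label falls back to the raw column name when none is defined.
labels = [features_dict.get(col, col) for col in columns]
for label in sorted(labels):
    # Map the label back to the raw column used for stats and plotting.
    col = inv_features_dict.get(label, label)
    print(f"{label} -> {col}")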
Example #3
    def display_model_performance(self):
        """
        Displays the performance of the model. The metrics are computed using the config dict.

        Metrics should be given as a list of dict. Each dict contains they following keys :
        'path' (path to the metric function, ex: 'sklearn.metrics.mean_absolute_error'),
        'name' (optional, name of the metric as displayed in the report),
        and 'use_proba_values' (optional, possible values are False (default) or True
        if the metric uses proba values instead of predicted values).

        For example :
        config['metrics'] = [
                {
                    'path': 'sklearn.metrics.mean_squared_error',
                    'name': 'Mean absolute error',  # Optional : name that will be displayed next to the metric
                    'y_pred': 'predicted_values'  # Optional
                },
                {
                    'path': 'Scoring_AP.utils.lift10',  # Custom function path
                    'name': 'Lift10',
                    'y_pred': 'proba_values'  # Use proba values instead of predicted values
                }
            ]
        """
        if self.y_test is None:
            logging.info(
                "No labels given for test set. Skipping model performance part"
            )
            return

        print_md("### Univariate analysis of target variable")
        df = pd.concat([
            pd.DataFrame({self.target_name: self.y_pred}).assign(_dataset="pred"),
            pd.DataFrame({self.target_name: self.y_test}).assign(_dataset="true")
        ])
        self._perform_and_display_analysis_univariate(
            df=df,
            col_splitter="_dataset",
            split_values=["pred", "true"],
            names=["Prediction values", "True values"],
            group_id='target-distribution')

        if 'metrics' not in self.config.keys():
            logging.info(
                "No 'metrics' key found in report config dict. Skipping model performance part."
            )
            return
        print_md("### Metrics")

        for metric in self.config['metrics']:
            if 'name' not in metric.keys():
                metric['name'] = metric['path']

            if metric['path'] in ['confusion_matrix', 'sklearn.metrics.confusion_matrix'] or \
                    metric['name'] == 'confusion_matrix':
                print_md(f"**{metric['name']} :**")
                print_html(
                    convert_fig_to_html(
                        generate_confusion_matrix_plot(y_true=self.y_test,
                                                       y_pred=self.y_pred)))
            else:
                try:
                    metric_fn = get_callable(path=metric['path'])
                    # Use proba values instead of predicted values if requested
                    if metric.get('use_proba_values', False) is True:
                        y_pred = self.explainer.proba_values
                    else:
                        y_pred = self.y_pred
                    res = metric_fn(self.y_test, y_pred)
                except Exception as e:
                    logging.info(
                        f"Could not compute the following metric: {metric['path']}. \n{e}"
                    )
                    continue
                if isinstance(res, Number):
                    res = display_value(round_to_k(res, 3))
                    print_md(f"**{metric['name']} :** {res}")
                elif isinstance(res, (list, tuple, np.ndarray)):
                    print_md(f"**{metric['name']} :**")
                    print_html(
                        pd.DataFrame(res).to_html(classes="greyGridTable"))
                elif isinstance(res, str):
                    print_md(f"**{metric['name']} :**")
                    print_html(f"<pre>{res}</pre>")
                else:
                    logging.info(
                        f"Could not display the following metric: {metric['path']}. \n"
                        f"Result of type {type(res)} cannot be displayed")
        print_md('---')
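
As an illustration of the config format described in the docstring, a metrics configuration might look like the sketch below; the custom 'my_package.metrics.lift10' path is hypothetical, and the final call assumes a report object whose config dict already contains this entry:

# Sketch of a 'metrics' entry in the report config, following the docstring above.
config = {
    'metrics': [
        {'path': 'sklearn.metrics.accuracy_score'},     # 'name' defaults to the path
        {'path': 'sklearn.metrics.confusion_matrix'},   # rendered as a matrix plot
        {
            'path': 'my_package.metrics.lift10',        # hypothetical custom metric function
            'name': 'Lift10',
            'use_proba_values': True                    # scored on proba values
        },
    ]
}
# report.display_model_performance()  # 'report' assumed to hold this config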