def perform_global_dataframe_analysis(df: Optional[pd.DataFrame]) -> dict:
    """
    Returns a python dict containing global information about a pandas DataFrame:
    number of features, number of observations, missing values...

    Parameters
    ----------
    df : pd.DataFrame, optional
        The dataframe used to compute the global information.

    Returns
    -------
    global_d : dict
        Dictionary containing a set of global statistics about the input dataframe.
    """
    if df is None:
        return dict()
    missing_values = df.isna().sum().sum()
    global_d = {
        'number of features': len(df.columns),
        'number of observations': df.shape[0],
        'missing values': missing_values,
        '% missing values': missing_values / (df.shape[0] * df.shape[1]),
    }
    for stat in global_d.keys():
        if stat == 'number of observations':
            global_d[stat] = int(global_d[stat])  # Keep the exact count
        elif isinstance(global_d[stat], float):
            global_d[stat] = round_to_k(global_d[stat], 3)  # Round to 3 significant figures
    replace_dict_values(global_d, display_value, ',', '.')
    return global_d
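# Illustrative usage sketch (the _example_* helper below is hypothetical, not
# part of the module). One of six cells is missing, so '% missing values' is
# 1/6, rounded by round_to_k to 0.167 and then formatted by display_value.
def _example_global_analysis():
    df = pd.DataFrame({'a': [1, 2, None], 'b': [4.0, 5.0, 6.0]})
    stats = perform_global_dataframe_analysis(df)
    # Expected shape of the result, before display formatting:
    # {'number of features': 2, 'number of observations': 3,
    #  'missing values': 1, '% missing values': 0.167}
    print(stats)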
def perform_univariate_dataframe_analysis(df: Optional[pd.DataFrame], col_types: dict) -> dict:
    """
    Returns a python dict containing information about each column of a pandas
    DataFrame. The computed information depends on the type of the column.

    Parameters
    ----------
    df : pd.DataFrame, optional
        The dataframe on which the analysis will be performed.
    col_types : dict
        Dict mapping each column name to its type.

    Returns
    -------
    d : dict
        A dict with column names as keys and the corresponding dict of
        statistics for each column as values.
    """
    if df is None:
        return dict()
    d = df.describe().to_dict()
    for col in df.columns:
        if col_types[col] == VarType.TYPE_CAT:
            d[col] = {
                'distinct values': df[col].nunique(),
                'missing values': df[col].isna().sum()
            }
    for col in d.keys():
        for stat in d[col].keys():
            if stat in ['count', 'distinct values']:
                d[col][stat] = int(d[col][stat])  # Keep the exact count here
            elif isinstance(d[col][stat], float):
                d[col][stat] = round_to_k(d[col][stat], 3)  # Round to 3 significant figures
    replace_dict_values(d, display_value, ',', '.')
    return d
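# Illustrative usage sketch (hypothetical helper; assumes VarType also exposes
# a TYPE_NUM member alongside the TYPE_CAT member used above).
def _example_univariate_analysis():
    df = pd.DataFrame({
        'age': [22, 35, None, 41],
        'city': ['Paris', 'Lyon', 'Paris', None],
    })
    col_types = {'age': VarType.TYPE_NUM, 'city': VarType.TYPE_CAT}
    stats = perform_univariate_dataframe_analysis(df, col_types)
    # Numeric columns keep their describe() statistics (count, mean, std, ...);
    # categorical columns are summarized as distinct/missing value counts:
    # stats['city'] -> {'distinct values': 2, 'missing values': 1}
    print(stats)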
def display_model_performance(self):
    """
    Displays the performance of the model. The metrics are computed using the
    config dict.

    Metrics should be given as a list of dicts. Each dict contains the following
    keys: 'path' (path to the metric function, e.g. 'sklearn.metrics.mean_absolute_error'),
    'name' (optional, name of the metric as displayed in the report), and
    'use_proba_values' (optional, False by default, or True if the metric uses
    proba values instead of predicted values).

    For example:

    config['metrics'] = [
        {
            'path': 'sklearn.metrics.mean_squared_error',
            'name': 'Mean squared error',  # Optional: name displayed next to the metric
            'use_proba_values': False  # Optional
        },
        {
            'path': 'Scoring_AP.utils.lift10',  # Custom function path
            'name': 'Lift10',
            'use_proba_values': True  # Use proba values instead of predicted values
        }
    ]
    """
    if self.y_test is None:
        logging.info("No labels given for test set. Skipping model performance part")
        return

    print_md("### Univariate analysis of target variable")
    df = pd.concat([
        pd.DataFrame({self.target_name: self.y_pred}).assign(_dataset="pred"),
        pd.DataFrame({self.target_name: self.y_test}).assign(_dataset="true")
    ])
    self._perform_and_display_analysis_univariate(
        df=df,
        col_splitter="_dataset",
        split_values=["pred", "true"],
        names=["Prediction values", "True values"],
        group_id='target-distribution')

    if 'metrics' not in self.config.keys():
        logging.info("No 'metrics' key found in report config dict. Skipping model performance part.")
        return
    print_md("### Metrics")
    for metric in self.config['metrics']:
        if 'name' not in metric.keys():
            metric['name'] = metric['path']

        if metric['path'] in ['confusion_matrix', 'sklearn.metrics.confusion_matrix'] or \
                metric['name'] == 'confusion_matrix':
            print_md(f"**{metric['name']} :**")
            print_html(
                convert_fig_to_html(
                    generate_confusion_matrix_plot(y_true=self.y_test, y_pred=self.y_pred)))
        else:
            try:
                metric_fn = get_callable(path=metric['path'])
                # Check whether proba values should be used instead of predicted values
                if metric.get('use_proba_values', False) is True:
                    y_pred = self.explainer.proba_values
                else:
                    y_pred = self.y_pred
                res = metric_fn(self.y_test, y_pred)
            except Exception as e:
                logging.info(f"Could not compute the following metric : {metric['path']}. \n{e}")
                continue
            if isinstance(res, Number):
                res = display_value(round_to_k(res, 3))
                print_md(f"**{metric['name']} :** {res}")
            elif isinstance(res, (list, tuple, np.ndarray)):
                print_md(f"**{metric['name']} :**")
                print_html(pd.DataFrame(res).to_html(classes="greyGridTable"))
            elif isinstance(res, str):
                print_md(f"**{metric['name']} :**")
                print_html(f"<pre>{res}</pre>")
            else:
                logging.info(
                    f"Could not display the following metric : {metric['path']}. \n"
                    f"Result of type {type(res)} cannot be displayed")
    print_md('---')
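# A sketch of a custom metric usable with 'use_proba_values': True, in the
# spirit of the 'Scoring_AP.utils.lift10' path shown in the docstring above
# (the real lift10 may differ). The report calls it as metric_fn(y_test, y_pred),
# where y_pred holds proba values when 'use_proba_values' is True.
def lift10(y_true, y_proba):
    """Lift in the top decile: positive rate among the 10% highest-scored
    observations divided by the overall positive rate."""
    y_true = np.asarray(y_true).ravel()
    scores = np.asarray(y_proba)
    if scores.ndim == 2:  # (n_samples, n_classes) proba matrix: keep the last class
        scores = scores[:, -1]
    n_top = max(1, len(scores) // 10)  # top 10% of observations, at least one
    top_idx = np.argsort(scores)[::-1][:n_top]
    return y_true[top_idx].mean() / y_true.mean()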
def test_round_to_k_1(self):
    """Large integer rounded to 3 significant figures."""
    x = 123456789
    expected_r_x = 123000000
    assert round_to_k(x, 3) == expected_r_x

def test_round_to_k_6(self):
    """Small float rounded to 3 significant figures."""
    x = 0.0000123456789
    expected_r_x = 0.0000123
    assert round_to_k(x, 3) == expected_r_x

def test_round_to_k_4(self):
    """Float with an integer part rounded to 3 significant figures yields an int."""
    x = 123.456789
    expected_r_x = 123
    assert round_to_k(x, 3) == expected_r_x

def test_round_to_k_3(self):
    """Large integer rounded to a single significant figure."""
    x = 123456789
    expected_r_x = 100000000
    assert round_to_k(x, 1) == expected_r_x
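# For reference, a minimal re-implementation consistent with the tests above
# (sketch only; the library's round_to_k may handle more edge cases).
import math

def round_to_k_sketch(x, k):
    """Round x to k significant figures, returning an int when exact."""
    if x == 0:
        return 0
    # Number of decimal places that keeps exactly k significant figures.
    ndigits = k - 1 - math.floor(math.log10(abs(x)))
    rounded = round(float(x), ndigits)
    return int(rounded) if rounded == int(rounded) else rounded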
def __init__(self, explainer):
    """
    Initializes, on class instantiation, everything needed to run the app on a server.

    Parameters
    ----------
    explainer : SmartExplainer
        SmartExplainer object
    """
    # APP
    self.server = Flask(__name__)
    self.app = dash.Dash(
        server=self.server,
        external_stylesheets=[dbc.themes.BOOTSTRAP],
    )
    self.app.title = 'Shapash Monitor'
    if explainer.title_story:
        self.app.title += ' - ' + explainer.title_story
    self.explainer = explainer

    # SETTINGS
    self.logo = self.app.get_asset_url('shapash-fond-fonce.png')
    self.color = '#f4c000'
    self.bkg_color = "#343736"
    self.settings_ini = {
        'rows': 1000,
        'points': 1000,
        'violin': 10,
        'features': 20,
    }
    self.settings = self.settings_ini.copy()
    self.predict_col = ['_predict_']
    self.explainer.features_imp = self.explainer.state.compute_features_import(
        self.explainer.contributions)
    if self.explainer._case == 'classification':
        self.label = self.explainer.check_label_name(
            len(self.explainer._classes) - 1, 'num')[1]
        self.selected_feature = self.explainer.features_imp[-1].idxmax()
        # Max contribution across all classes, rounded to 1 significant figure
        self.max_threshold = int(
            max([
                contrib.applymap(lambda value: round_to_k(value, k=1)).max().max()
                for contrib in self.explainer.contributions
            ]))
    else:
        self.label = None
        self.selected_feature = self.explainer.features_imp.idxmax()
        self.max_threshold = int(
            self.explainer.contributions.applymap(
                lambda value: round_to_k(value, k=1)).max().max())
    self.list_index = []
    self.subset = None

    # DATA
    self.dataframe = pd.DataFrame()
    self.round_dataframe = pd.DataFrame()
    self.init_data()

    # COMPONENTS
    self.components = {
        'menu': {},
        'table': {},
        'graph': {},
        'filter': {},
        'settings': {}
    }
    self.init_components()

    # LAYOUT
    self.skeleton = {'navbar': {}, 'body': {}}
    self.make_skeleton()
    self.app.layout = html.Div([self.skeleton['navbar'], self.skeleton['body']])

    # CALLBACK
    self.callback_fullscreen_buttons()
    self.init_callback_settings()
    self.callback_generator()
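# Hypothetical launch sketch (illustrative only): assumes this __init__ belongs
# to shapash's SmartApp class and that the explainer was already compiled.
def _example_run_app(explainer, host='0.0.0.0', port=8050):
    """Serve the monitor app from an already-compiled SmartExplainer."""
    smart_app = SmartApp(explainer)
    # dash.Dash exposes run_server to start the underlying Flask server.
    smart_app.app.run_server(host=host, port=port)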