def test_filter(self):
    """`Report.filter` keeps only rows matching the given categories or slices."""
    # Filter by category: only "Cat B" rows should survive.
    report = Report(
        self.data,
        self.cols,
        model_name=self.model_name,
        dataset_name=self.dataset_name,
    )
    report.filter(categories=["Cat B"])
    wanted = pd.DataFrame([
        ["Cat B", "Slice B", 0.4, 20, [0.5, 0.4, 0.1], 812],
        ["Cat B", "Slice D", 0.5, 25, [0.3, 0.2, 0.5], 13312],
    ])
    self.assertTrue(report.data.equals(wanted))

    # Filter by slice: rows whose slice is "Slice A" or "Slice C".
    report = Report(
        self.data,
        self.cols,
        model_name=self.model_name,
        dataset_name=self.dataset_name,
    )
    report.filter(slices=["Slice A", "Slice C"])
    wanted = pd.DataFrame([
        ["Cat A", "Slice C", 0.1, 5, [0.1, 0.2, 0.7], 300],
        ["Cat C", "Slice A", 0.2, 10, [0.4, 0.2, 0.4], 3],
        ["Cat A", "Slice A", 0.3, 15, [0.1, 0, 0.9], 5000],
    ])
    self.assertTrue(report.data.equals(wanted))
def test_init(self):
    """A Report stores its data verbatim and forwards config kwargs."""
    # Basic construction must leave the data untouched.
    plain = Report(
        self.data,
        self.cols,
        model_name=self.model_name,
        dataset_name=self.dataset_name,
    )
    self.assertTrue(self.data.equals(plain.data))

    # Extra keyword arguments should land in the report's config dict.
    scheme = ["#000000"]
    configured = Report(
        self.data,
        self.cols,
        model_name=self.model_name,
        dataset_name=self.dataset_name,
        color_scheme=scheme,
    )
    self.assertEqual(scheme, configured.config["color_scheme"])
def test_set_range(self):
    """`set_range` updates min/max bounds on the named score column."""
    report = Report(
        self.data,
        self.cols,
        model_name=self.model_name,
        dataset_name=self.dataset_name,
    )
    report.set_range("f1", 0.1, 0.3)
    # Only columns titled "f1" are expected to carry the new bounds.
    f1_columns = [column for column in report.columns if column.title == "f1"]
    for column in f1_columns:
        self.assertEqual((column.min_val, column.max_val), (0.1, 0.3))
def test_set_class_codes(self):
    """`set_class_codes` relabels every ClassDistributionColumn."""
    report = Report(
        self.data,
        self.cols,
        model_name=self.model_name,
        dataset_name=self.dataset_name,
    )
    codes = ["A", "B", "C"]
    report.set_class_codes(codes)
    for column in report.columns:
        # Non-distribution columns are unaffected by this setter.
        if not isinstance(column, ClassDistributionColumn):
            continue
        self.assertEqual(column.class_codes, codes)
def test_figure(self):
    """`figure` rejects unsorted data; sorting clears the error."""
    report = Report(
        self.data,
        self.cols,
        model_name=self.model_name,
        dataset_name=self.dataset_name,
    )
    # Unsorted data must raise.
    self.assertRaises(ValueError, report.figure)

    # After sorting, figure creation should succeed without raising.
    report.sort()
    try:
        report.figure()
    except ValueError:
        self.fail("report.figure() raised ValueError unexpectedly!")
def test_display(self):
    """Smoke test: render a figure before and after mutating the report."""
    report = Report(
        self.data,
        self.cols,
        model_name=self.model_name,
        dataset_name=self.dataset_name,
    )
    report.sort()
    report.figure().show()

    # Exercise the full mutation API, then render again with a title.
    report.sort(category_order={"Cat C": 1, "Cat A": 2, "Cat B": 3})
    report.rename(slice_map={"Slice A": "A"}, category_map={"Cat B": "B"})
    report.filter(slices=["A", "Slice B", "Slice C"])
    report.set_range("f1", 0.05, 0.45)
    report.update_config(font_size_heading=16)
    report.figure(show_title=True).show()
def test_rename(self):
    """`rename` remaps category and slice labels throughout the data."""
    report = Report(
        self.data,
        self.cols,
        model_name=self.model_name,
        dataset_name=self.dataset_name,
    )
    report.rename(
        category_map={"Cat C": "Cat D"},
        slice_map={"Slice A": "Slice D"},
    )
    renamed = pd.DataFrame([
        ["Cat A", "Slice C", 0.1, 5, [0.1, 0.2, 0.7], 300],
        ["Cat D", "Slice D", 0.2, 10, [0.4, 0.2, 0.4], 3],
        ["Cat A", "Slice D", 0.3, 15, [0.1, 0, 0.9], 5000],
        ["Cat B", "Slice B", 0.4, 20, [0.5, 0.4, 0.1], 812],
        ["Cat B", "Slice D", 0.5, 25, [0.3, 0.2, 0.5], 13312],
    ])
    self.assertTrue(report.data.equals(renamed))
def test_display_2(self):
    """Smoke test: render a report mixing score, distribution and size columns."""
    # Three evaluation rows identical except for the slice name.
    rows = [
        [
            "Eval",
            name,
            0.8799999952316284,
            0.876409113407135,
            [0.368, 0.304, 0.328],
            [0.344, 0.288, 0.368],
            125,
        ]
        for name in ("snli1", "snli2", "snli3")
    ]
    columns = [
        ScoreColumn("F1", min_val=0, max_val=1, is_0_to_1=True),
        ScoreColumn("Accuracy", min_val=0, max_val=1, is_0_to_1=True),
        ClassDistributionColumn("Class Dist", ["e", "n", "c"]),
        ClassDistributionColumn("Pred Dist", ["e", "n", "c"]),
        NumericColumn("Size"),
    ]
    Report(pd.DataFrame(rows), columns).figure().show()
def create_report(
    self,
    model: Union[Model, str],
    metric_ids: List[str] = None,
) -> Report:
    """Generate report from cached metrics for a model.

    Args:
        model: Model or model id. Metrics must have already been computed
            for this model.
        metric_ids (optional): list of metric ids to include in desired
            order. If None, take metrics from sample slice.

    Returns:
        report

    Raises:
        ValueError: if the testbench has no slices, or metrics have not
            been computed for `model` or for one of its slices.
    """
    if len(self.slices) == 0:
        raise ValueError("Cannot create report for empty testbench")

    if isinstance(model, Model):
        model = model.identifier

    if model not in self.metrics:
        raise ValueError(
            f"Metrics for model {model} have not been computed yet."
            f" You must first execute one of "
            "the following methods for this model: 'evaluate', "
            "'add_predictions', 'add_metrics'")

    # TODO(Jesse): Need a category for test set
    model_metrics = self.metrics[model]

    # TODO(Jesse): where to put this? Should only need to be called once
    self._human_readable_identifiers()

    if metric_ids is None:
        # Borrow the metric ids from an arbitrary slice; avoid
        # materializing the whole collection just to peek at one element.
        sample_slice = next(iter(self.slices)).identifier
        metric_ids = list(model_metrics[sample_slice].keys())

    # Sort metric ids alphabetically but keep the distribution metrics last.
    sorted_metric_ids = sorted(
        metric_id for metric_id in metric_ids
        if metric_id not in ("class_dist", "pred_dist"))
    if "class_dist" in metric_ids:
        sorted_metric_ids.append("class_dist")
    if "pred_dist" in metric_ids:
        sorted_metric_ids.append("pred_dist")
    metric_ids = sorted_metric_ids

    # Populate columns
    columns = []
    for metric_id in metric_ids:
        if metric_id in ("class_dist", "pred_dist"):
            if self.task is None:
                class_cds = None
            else:
                # One-letter class codes derived from the task's class names.
                class_names = self.task.output_schema.features[list(
                    self.task.output_schema.columns)[0]].names
                class_cds = [name[0].upper() for name in class_names]
            columns.append(ClassDistributionColumn(metric_id, class_cds))
        else:
            columns.append(
                ScoreColumn(metric_id, min_val=0, max_val=1, is_0_to_1=True))
    columns.append(NumericColumn("Size"))

    # Display names for the slice categories.
    category_names = {
        GENERIC: "DataPanel",
        SUBPOPULATION: "SubPop",
        ATTACK: "Attack",
        AUGMENTATION: "Augment",
        CURATION: "Eval",
    }

    # Populate data: one row per slice.
    data = []
    for sl in self.slices:
        slice_name = self.ident_mapping[sl.identifier]
        slice_size = len(sl)
        slice_category = category_names.get(sl.category,
                                            sl.category.capitalize())

        row = [slice_category, slice_name]

        if sl.identifier not in model_metrics:
            # BUGFIX: the two fragments previously concatenated without a
            # separating space ("...identifier}have not...").
            raise ValueError(
                f"Metrics for model {model} and slice {sl.identifier}"
                f" have not yet been computed.")
        slice_metrics = model_metrics[sl.identifier]

        for metric_id in metric_ids:
            row.append(slice_metrics[metric_id])
        row.append(slice_size)
        data.append(row)

        # TODO(karan): generalize aggregation
        # slice_metrics = tz.merge_with(np.mean, slice_metrics)

        # Task-dependent model predictions
        # TODO(karan): e.g. average class distribution predicted, figure out
        #  how to put this in

        # Task-dependent sl information
        # TODO(karan): e.g. class distribution

    df = pd.DataFrame(data)

    report = Report(data=df,
                    columns=columns,
                    model_name=model,
                    dataset_name=self.dataset_id)
    report.sort(category_order=dict((cat, i) for i, cat in enumerate(
        [SUBPOPULATION, AUGMENTATION, CURATION, ATTACK, GENERIC])))
    return report
def create_report(
    self,
    models: List[str] = None,
    # NOTE: annotation corrected — values are (ReportColumn subclass, kwargs
    # dict) tuples, as the docstring and the unpacking below show; the old
    # `Dict[str, ReportColumn]` hint contradicted both.
    aggregator_columns: Dict[str, tuple] = None,
) -> Report:
    """Generate a report for models in the bench.

    Args:
        models (List[str]): names of one or more models that are in the
            devbench.
        aggregator_columns (Dict[str, (ReportColumn, dict)]): dict mapping
            aggregator names to a tuple. The first entry of the tuple is the
            ReportColumn that should be used for visualization. The second
            entry is a dict of kwargs that will be passed to the ReportColumn
            using `ReportColumn.__init__(..., **kwargs)`.

            For instance,

            >>> devbench.create_report(
            >>>     models=['BERT'],
            >>>     aggregator_columns={
            >>>         'accuracy': (ScoreColumn, {'min_val': 0.3})
            >>>     }
            >>> )

            By default, aggregators will be displayed as a ScoreColumn with
            `min_val=0`, `max_val=1` and `is_0_to_1=True`.

    Returns:
        a Report, summarizing the performance of the models.

    Raises:
        ValueError: if the bench has no slices, or a requested model has no
            computed metrics.
    """
    if len(self.slices) == 0:
        raise ValueError("No slices found in Bench. Cannot create report.")

    if models is not None:
        # Validate with a real exception: `assert` is stripped under -O,
        # and the sibling create_report also raises ValueError here.
        for model in models:
            if model not in self.metrics:
                raise ValueError(f"Model {model} not found.")
    else:
        # Use all the models that are available
        models = list(self.metrics.keys())

    # Set identifiers to be human readable
    self._human_readable_identifiers()

    # Get the list of aggregators that are shared by `models`
    shared_aggregators = list(self._shared_aggregators(models))

    # Populate columns: one column per (model, aggregator) pair.
    columns = []
    for model in models:
        for aggregator in shared_aggregators:
            if aggregator_columns and aggregator in aggregator_columns:
                column_type, column_kwargs = aggregator_columns[aggregator]
            else:
                # Default visualization: a 0-to-1 score column.
                column_type = ScoreColumn
                column_kwargs = dict(min_val=0, max_val=1, is_0_to_1=True)
            columns.append(column_type(f"{model}-{aggregator}",
                                       **column_kwargs))
    columns.append(NumericColumn("Size"))

    # Display names for the slice categories.
    category_names = {
        GENERIC: "Slice",
        SUBPOPULATION: "SubPop",
        ATTACK: "Attack",
        AUGMENTATION: "Augment",
        CURATION: "Eval",
    }

    # Populate data: one row per slice, metrics for all models side by side.
    data = []
    for sl in self.slices:
        slice_name = self.ident_mapping[sl.identifier]
        slice_size = len(sl)
        slice_category = category_names.get(sl.category,
                                            sl.category.capitalize())

        row = [slice_category, slice_name]
        for model in models:
            model_metrics = self.metrics[model]
            if sl.identifier not in model_metrics:
                # Best effort: skip models with no metrics for this slice.
                continue
            slice_metrics = model_metrics[sl.identifier]
            for agg in shared_aggregators:
                row.append(slice_metrics[agg])
        row.append(slice_size)
        data.append(row)

    df = pd.DataFrame(data)

    report = Report(
        data=df,
        columns=columns,
    )
    report.sort(
        category_order=dict(
            (cat, i) for i, cat in enumerate(
                [SUBPOPULATION, AUGMENTATION, CURATION, ATTACK, GENERIC]
            )
        )
    )
    return report
def test_sort(self):
    """`sort` orders rows alphabetically or by explicit orderings."""

    def fresh_report():
        # Each scenario starts from an unsorted copy of the fixture data.
        return Report(
            self.data,
            self.cols,
            model_name=self.model_name,
            dataset_name=self.dataset_name,
        )

    # Sort alphabetically
    report = fresh_report()
    report.sort()
    expected = pd.DataFrame([
        ["Cat A", "Slice A", 0.3, 15, [0.1, 0, 0.9], 5000],
        ["Cat A", "Slice C", 0.1, 5, [0.1, 0.2, 0.7], 300],
        ["Cat B", "Slice B", 0.4, 20, [0.5, 0.4, 0.1], 812],
        ["Cat B", "Slice D", 0.5, 25, [0.3, 0.2, 0.5], 13312],
        ["Cat C", "Slice A", 0.2, 10, [0.4, 0.2, 0.4], 3],
    ])
    self.assertTrue(report.data.equals(expected))

    # Sort by specified category order
    report = fresh_report()
    report.sort(category_order={"Cat B": 0, "Cat C": 2, "Cat A": 1})
    expected = pd.DataFrame([
        ["Cat B", "Slice B", 0.4, 20, [0.5, 0.4, 0.1], 812],
        ["Cat B", "Slice D", 0.5, 25, [0.3, 0.2, 0.5], 13312],
        ["Cat A", "Slice A", 0.3, 15, [0.1, 0, 0.9], 5000],
        ["Cat A", "Slice C", 0.1, 5, [0.1, 0.2, 0.7], 300],
        ["Cat C", "Slice A", 0.2, 10, [0.4, 0.2, 0.4], 3],
    ])
    self.assertTrue(report.data.equals(expected))

    # Sort by specified slice order
    report = fresh_report()
    report.sort(slice_order={
        "Slice D": 0,
        "Slice C": 1,
        "Slice B": 2,
        "Slice A": 3,
    })
    expected = pd.DataFrame([
        ["Cat A", "Slice C", 0.1, 5, [0.1, 0.2, 0.7], 300],
        ["Cat A", "Slice A", 0.3, 15, [0.1, 0, 0.9], 5000],
        ["Cat B", "Slice D", 0.5, 25, [0.3, 0.2, 0.5], 13312],
        ["Cat B", "Slice B", 0.4, 20, [0.5, 0.4, 0.1], 812],
        ["Cat C", "Slice A", 0.2, 10, [0.4, 0.2, 0.4], 3],
    ])
    self.assertTrue(report.data.equals(expected))

    # Sort by specified category order and slice order
    report = fresh_report()
    report.sort(
        category_order={"Cat B": 0, "Cat C": 2, "Cat A": 1},
        slice_order={
            "Slice D": 0,
            "Slice C": 1,
            "Slice B": 2,
            "Slice A": 3,
        },
    )
    expected = pd.DataFrame([
        ["Cat B", "Slice D", 0.5, 25, [0.3, 0.2, 0.5], 13312],
        ["Cat B", "Slice B", 0.4, 20, [0.5, 0.4, 0.1], 812],
        ["Cat A", "Slice C", 0.1, 5, [0.1, 0.2, 0.7], 300],
        ["Cat A", "Slice A", 0.3, 15, [0.1, 0, 0.9], 5000],
        ["Cat C", "Slice A", 0.2, 10, [0.4, 0.2, 0.4], 3],
    ])
    self.assertTrue(report.data.equals(expected))