def _aggregate_running_metrics(self, model):
    """Calculate the running overall and task specific metrics.

    :param model: the model whose per-task scorers are used to compute the
        running training metrics
    :return: dict mapping metric identifiers (``task/data/split/metric``) to
        running metric values
    """
    metric_dict = dict()

    total_count = 0
    # Log task specific loss (average loss per collected example).
    for identifier in self.running_uids.keys():
        count = len(self.running_uids[identifier])
        if count > 0:
            metric_dict[identifier + "/loss"] = (
                self.running_losses[identifier] / count
            )
            total_count += count

    # Calculate average micro loss over all tasks/examples.
    if total_count > 0:
        total_loss = sum(self.running_losses.values())
        metric_dict["model/all/train/loss"] = total_loss / total_count

    micro_score_dict = defaultdict(list)
    macro_score_dict = defaultdict(list)

    # Calculate training metric per task.
    for identifier in self.running_uids.keys():
        task_name, data_name, split = identifier.split("/")
        # Fix: skip tasks that have no scorer or no collected golds/probs yet;
        # scoring them would raise (same guard as the refined variant of this
        # method elsewhere in this file).
        if not (
            model.scorers[task_name]
            and self.running_golds[identifier]
            and self.running_probs[identifier]
        ):
            continue
        metric_score = model.scorers[task_name].score(
            self.running_golds[identifier],
            self.running_probs[identifier],
            prob_to_pred(self.running_probs[identifier]),
            self.running_uids[identifier],
        )
        for metric_name, metric_value in metric_score.items():
            metric_dict[f"{identifier}/{metric_name}"] = metric_value

        # Collect average score; the loop variable is deliberately rebound to
        # the new "average" identifier here.
        identifier = construct_identifier(task_name, data_name, split, "average")
        metric_dict[identifier] = np.mean(list(metric_score.values()))

        micro_score_dict[split].extend(list(metric_score.values()))
        macro_score_dict[split].append(metric_dict[identifier])

    # Collect split-wise micro/macro average score.
    for split in micro_score_dict.keys():
        identifier = construct_identifier("model", "all", split, "micro_average")
        metric_dict[identifier] = np.mean(micro_score_dict[split])
        identifier = construct_identifier("model", "all", split, "macro_average")
        metric_dict[identifier] = np.mean(macro_score_dict[split])

    # Log the learning rate (first param group; presumably all groups share
    # one lr here — confirm against optimizer setup).
    metric_dict["model/all/train/lr"] = self.optimizer.param_groups[0]["lr"]

    return metric_dict
def score(self, dataloaders, return_average=True):
    """Score the data from dataloader with the model.

    :param dataloaders: the dataloader(s) that perform scoring
    :type dataloaders: dataloader or list of dataloaders
    :param return_average: Whether return average scores
    :type return_average: bool
    :return: dict mapping metric identifiers to scores
    """
    self.eval()

    if not isinstance(dataloaders, list):
        dataloaders = [dataloaders]

    metric_score_dict = dict()

    if return_average:
        # Per-split accumulators used for micro/macro averaging below.
        micro_score_dict = defaultdict(list)
        macro_score_dict = defaultdict(list)
        macro_loss_dict = defaultdict(list)

    for dataloader in dataloaders:
        predictions = self.predict(dataloader, return_preds=True)
        for task_name in predictions["golds"].keys():
            metric_score = self.scorers[task_name].score(
                predictions["golds"][task_name],
                predictions["probs"][task_name],
                predictions["preds"][task_name],
                predictions["uids"][task_name],
            )
            for metric_name, metric_value in metric_score.items():
                identifier = construct_identifier(
                    task_name, dataloader.data_name, dataloader.split, metric_name
                )
                metric_score_dict[identifier] = metric_value

            # Store the loss
            identifier = construct_identifier(
                task_name, dataloader.data_name, dataloader.split, "loss"
            )
            # Fix: losses are collected per example/batch; reduce to a scalar
            # so the stored metric (and the macro loss average below) is a
            # number rather than a list — consistent with the other score()
            # variants in this file.
            metric_score_dict[identifier] = np.mean(
                predictions["losses"][task_name]
            )

            if return_average:
                # Collect average score
                identifier = construct_identifier(
                    task_name, dataloader.data_name, dataloader.split, "average"
                )
                metric_score_dict[identifier] = np.mean(
                    list(metric_score.values())
                )
                micro_score_dict[dataloader.split].extend(
                    list(metric_score.values())
                )
                macro_score_dict[dataloader.split].append(
                    metric_score_dict[identifier]
                )

                # Store the loss
                identifier = construct_identifier(
                    task_name, dataloader.data_name, dataloader.split, "loss"
                )
                macro_loss_dict[dataloader.split].append(
                    metric_score_dict[identifier]
                )

    if return_average:
        # Collect split-wise micro/macro average score
        for split in micro_score_dict.keys():
            identifier = construct_identifier(
                "model", "all", split, "micro_average"
            )
            metric_score_dict[identifier] = np.mean(micro_score_dict[split])
            identifier = construct_identifier(
                "model", "all", split, "macro_average"
            )
            metric_score_dict[identifier] = np.mean(macro_score_dict[split])
            identifier = construct_identifier("model", "all", split, "loss")
            metric_score_dict[identifier] = np.mean(macro_loss_dict[split])

        # Collect overall micro/macro average score/loss.
        # Fix: flatten the per-split lists before averaging; np.mean over a
        # ragged list of lists is not a valid reduction.
        identifier = construct_identifier("model", "all", "all", "micro_average")
        metric_score_dict[identifier] = np.mean(
            [value for values in micro_score_dict.values() for value in values]
        )
        identifier = construct_identifier("model", "all", "all", "macro_average")
        metric_score_dict[identifier] = np.mean(
            [value for values in macro_score_dict.values() for value in values]
        )
        identifier = construct_identifier("model", "all", "all", "loss")
        metric_score_dict[identifier] = np.mean(
            [value for values in macro_loss_dict.values() for value in values]
        )

    # TODO: have a better way to handle global evaluation metric
    if Meta.config["learner_config"]["global_evaluation_metric_dict"]:
        global_evaluation_metric_dict = Meta.config["learner_config"][
            "global_evaluation_metric_dict"
        ]
        for metric_name, metric in global_evaluation_metric_dict.items():
            metric_score_dict[metric_name] = metric(metric_score_dict)

    return metric_score_dict
def score(
    self,
    dataloaders: Union[EmmentalDataLoader, List[EmmentalDataLoader]],
    return_average: bool = True,
) -> Dict[str, float]:
    """Score the data from dataloader.

    Metric keys are identifiers of the form ``task/data/split/metric``; with
    ``return_average`` the dict also contains per-split and overall
    micro/macro averages and losses under the ``model/all/...`` prefix.

    Args:
        dataloaders: The dataloaders to score.
        return_average: Whether to return average score.

    Returns:
        Score dict.
    """
    self.eval()

    if not isinstance(dataloaders, list):
        dataloaders = [dataloaders]

    metric_score_dict = dict()

    if return_average:
        # Per-split accumulators feeding the micro/macro aggregation below.
        micro_score_dict: defaultdict = defaultdict(list)
        macro_score_dict: defaultdict = defaultdict(list)
        macro_loss_dict: defaultdict = defaultdict(list)

    for dataloader in dataloaders:
        # Skip dataloaders without gold labels — they cannot be scored.
        if not dataloader.is_learnable:
            logger.warning(
                f"Dataloader {dataloader.data_name} doesn't have gold data, "
                f"continue..."
            )
            continue
        # Only request probs/preds from predict() if at least one task in
        # this dataloader needs them for evaluation.
        return_probs = False
        return_preds = False
        for task_name in dataloader.task_to_label_dict:
            return_probs = return_probs or self.require_prob_for_evals[task_name]
            return_preds = return_preds or self.require_pred_for_evals[task_name]
        predictions = self.predict(
            dataloader,
            return_probs=return_probs,
            return_preds=return_preds,
            return_action_outputs=False,
        )
        for task_name in predictions["uids"].keys():
            # Probs/preds are passed as None when not requested above; the
            # scorer is expected to handle that.
            metric_score = self.scorers[task_name].score(
                predictions["golds"][task_name],
                predictions["probs"][task_name] if return_probs else None,
                predictions["preds"][task_name] if return_preds else None,
                predictions["uids"][task_name],
            )
            for metric_name, metric_value in metric_score.items():
                identifier = construct_identifier(
                    task_name, dataloader.data_name, dataloader.split, metric_name
                )
                metric_score_dict[identifier] = metric_value

            # Store the loss (mean over the collected per-batch losses).
            identifier = construct_identifier(
                task_name, dataloader.data_name, dataloader.split, "loss"
            )
            metric_score_dict[identifier] = np.mean(  # type: ignore
                predictions["losses"][task_name]
            )

            if return_average:
                # Collect average score across this task's metrics.
                identifier = construct_identifier(
                    task_name, dataloader.data_name, dataloader.split, "average"
                )
                metric_score_dict[identifier] = np.mean(  # type: ignore
                    list(metric_score.values())
                )
                micro_score_dict[dataloader.split].extend(
                    list(metric_score.values())
                )
                macro_score_dict[dataloader.split].append(
                    metric_score_dict[identifier]
                )

                # Store the loss for the per-split/overall loss averages.
                identifier = construct_identifier(
                    task_name, dataloader.data_name, dataloader.split, "loss"
                )
                macro_loss_dict[dataloader.split].append(
                    metric_score_dict[identifier]
                )

    if return_average:
        # Collect split-wise micro/macro average score.
        for split in micro_score_dict.keys():
            identifier = construct_identifier(
                "model", "all", split, "micro_average"
            )
            metric_score_dict[identifier] = np.mean(  # type: ignore
                micro_score_dict[split]
            )
            identifier = construct_identifier(
                "model", "all", split, "macro_average"
            )
            metric_score_dict[identifier] = np.mean(  # type: ignore
                macro_score_dict[split]
            )
            identifier = construct_identifier("model", "all", split, "loss")
            metric_score_dict[identifier] = np.mean(  # type: ignore
                macro_loss_dict[split]
            )

        # Collect overall micro/macro average score/loss. The len() guards
        # avoid np.mean([]) (NaN + RuntimeWarning) when nothing was scored;
        # the per-split lists are flattened before averaging.
        if len(micro_score_dict):
            identifier = construct_identifier(
                "model", "all", "all", "micro_average"
            )
            metric_score_dict[identifier] = np.mean(  # type: ignore
                list(itertools.chain.from_iterable(micro_score_dict.values()))
            )
        if len(macro_score_dict):
            identifier = construct_identifier(
                "model", "all", "all", "macro_average"
            )
            metric_score_dict[identifier] = np.mean(  # type: ignore
                list(itertools.chain.from_iterable(macro_score_dict.values()))
            )
        if len(macro_loss_dict):
            identifier = construct_identifier("model", "all", "all", "loss")
            metric_score_dict[identifier] = np.mean(  # type: ignore
                list(itertools.chain.from_iterable(macro_loss_dict.values()))
            )

    # TODO: have a better to handle global evaluation metric
    # Global metrics see the whole metric_score_dict and may combine entries.
    if Meta.config["learner_config"]["global_evaluation_metric_dict"]:
        global_evaluation_metric_dict = Meta.config["learner_config"][
            "global_evaluation_metric_dict"
        ]
        for metric_name, metric in global_evaluation_metric_dict.items():
            metric_score_dict[metric_name] = metric(metric_score_dict)

    return metric_score_dict
def test_construct_identifier(caplog):
    """Unit test of construct_identifier."""
    caplog.set_level(logging.INFO)

    # Table-driven: args tuple -> expected identifier string.
    cases = {
        ("1", "2", "3", "4"): "1/2/3/4",
        ("1", "2", "3"): "1/2/3",
    }
    for args, expected in cases.items():
        assert construct_identifier(*args) == expected
def _aggregate_running_metrics(
    self, model: EmmentalModel, calc_running_scores: bool = False
) -> Dict[str, float]:
    """Calculate the running overall and task specific metrics.

    Always reports the running per-task loss, the micro-averaged overall
    loss, and the current learning rate; task scores and micro/macro
    averages are computed only when ``calc_running_scores`` is set.

    Args:
        model: The model to evaluate.
        calc_running_scores: Whether to calc running scores

    Returns:
        The score dict.
    """
    metric_dict: Dict[str, float] = dict()

    total_count = 0
    # Log task specific loss (running loss averaged over collected uids).
    for identifier in self.running_uids.keys():
        count = len(self.running_uids[identifier])
        if count > 0:
            metric_dict[identifier + "/loss"] = float(
                self.running_losses[identifier] / count
            )
            total_count += count

    # Calculate average micro loss across all tasks/examples.
    if total_count > 0:
        total_loss = sum(self.running_losses.values())
        metric_dict["model/all/train/loss"] = float(total_loss / total_count)

    if calc_running_scores:
        micro_score_dict: Dict[str, List[float]] = defaultdict(list)
        macro_score_dict: Dict[str, List[float]] = defaultdict(list)

        # Calculate training metric per task.
        for identifier in self.running_uids.keys():
            task_name, data_name, split = identifier.split("/")
            # Only score tasks that have a scorer and collected golds/probs.
            if (
                model.scorers[task_name]
                and self.running_golds[identifier]
                and self.running_probs[identifier]
            ):
                metric_score = model.scorers[task_name].score(
                    self.running_golds[identifier],
                    self.running_probs[identifier],
                    prob_to_pred(self.running_probs[identifier]),
                    self.running_uids[identifier],
                )
                for metric_name, metric_value in metric_score.items():
                    metric_dict[f"{identifier}/{metric_name}"] = metric_value

                # Collect average score; note the loop variable is rebound
                # to the new "average" identifier here.
                identifier = construct_identifier(
                    task_name, data_name, split, "average"
                )
                metric_dict[identifier] = np.mean(list(metric_score.values()))

                micro_score_dict[split].extend(
                    list(metric_score.values())  # type: ignore
                )
                macro_score_dict[split].append(metric_dict[identifier])

        # Collect split-wise micro/macro average score.
        for split in micro_score_dict.keys():
            identifier = construct_identifier(
                "model", "all", split, "micro_average"
            )
            metric_dict[identifier] = np.mean(
                micro_score_dict[split]  # type: ignore
            )
            identifier = construct_identifier(
                "model", "all", split, "macro_average"
            )
            metric_dict[identifier] = np.mean(
                macro_score_dict[split]  # type: ignore
            )

    # Log the learning rate (first param group; presumably all groups share
    # one lr here — confirm against optimizer setup).
    metric_dict["model/all/train/lr"] = self.optimizer.param_groups[0][
        "lr"
    ]

    return metric_dict
def score(
    self,
    dataloaders: Union[EmmentalDataLoader, List[EmmentalDataLoader]],
    return_average: bool = True,
) -> Dict[str, float]:
    """Score the data from dataloader.

    Args:
        dataloaders(EmmentalDataLoader or List[EmmentalDataLoader]): The
          dataloaders to score.
        return_average(bool): Whether to return average score.

    Returns:
        dict: Score dict.
    """
    self.eval()

    if not isinstance(dataloaders, list):
        dataloaders = [dataloaders]

    metric_score_dict = dict()

    if return_average:
        # Per-split accumulators feeding the micro/macro aggregation below.
        micro_score_dict: defaultdict = defaultdict(list)
        macro_score_dict: defaultdict = defaultdict(list)
        macro_loss_dict: defaultdict = defaultdict(list)

    for dataloader in dataloaders:
        predictions = self.predict(dataloader, return_preds=True)
        for task_name in predictions["golds"].keys():
            metric_score = self.scorers[task_name].score(
                predictions["golds"][task_name],
                predictions["probs"][task_name],
                predictions["preds"][task_name],
                predictions["uids"][task_name],
            )
            for metric_name, metric_value in metric_score.items():
                identifier = construct_identifier(
                    task_name, dataloader.data_name, dataloader.split, metric_name
                )
                metric_score_dict[identifier] = metric_value

            # Store the loss (mean over the collected per-batch losses).
            identifier = construct_identifier(
                task_name, dataloader.data_name, dataloader.split, "loss"
            )
            metric_score_dict[identifier] = np.mean(
                predictions["losses"][task_name]
            )

            if return_average:
                # Collect average score across this task's metrics.
                identifier = construct_identifier(
                    task_name, dataloader.data_name, dataloader.split, "average"
                )
                metric_score_dict[identifier] = np.mean(list(metric_score.values()))
                micro_score_dict[dataloader.split].extend(
                    list(metric_score.values())
                )
                macro_score_dict[dataloader.split].append(
                    metric_score_dict[identifier]
                )

                # Store the loss for the per-split/overall loss averages.
                identifier = construct_identifier(
                    task_name, dataloader.data_name, dataloader.split, "loss"
                )
                macro_loss_dict[dataloader.split].append(
                    metric_score_dict[identifier]
                )

    if return_average:
        # Collect split-wise micro/macro average score
        for split in micro_score_dict.keys():
            identifier = construct_identifier(
                "model", "all", split, "micro_average"
            )
            metric_score_dict[identifier] = np.mean(micro_score_dict[split])
            identifier = construct_identifier(
                "model", "all", split, "macro_average"
            )
            metric_score_dict[identifier] = np.mean(macro_score_dict[split])
            identifier = construct_identifier("model", "all", split, "loss")
            metric_score_dict[identifier] = np.mean(macro_loss_dict[split])

        # Collect overall micro/macro average score/loss.
        # Fix: guard against empty accumulators — np.mean([]) yields NaN and
        # a RuntimeWarning when nothing was scored (same guard as the refined
        # variant of this method elsewhere in this file).
        if len(micro_score_dict):
            identifier = construct_identifier(
                "model", "all", "all", "micro_average"
            )
            metric_score_dict[identifier] = np.mean(
                list(itertools.chain.from_iterable(micro_score_dict.values()))
            )
        if len(macro_score_dict):
            identifier = construct_identifier(
                "model", "all", "all", "macro_average"
            )
            metric_score_dict[identifier] = np.mean(
                list(itertools.chain.from_iterable(macro_score_dict.values()))
            )
        if len(macro_loss_dict):
            identifier = construct_identifier("model", "all", "all", "loss")
            metric_score_dict[identifier] = np.mean(
                list(itertools.chain.from_iterable(macro_loss_dict.values()))
            )

    # TODO: have a better way to handle global evaluation metric
    if Meta.config["learner_config"]["global_evaluation_metric_dict"]:
        global_evaluation_metric_dict = Meta.config["learner_config"][
            "global_evaluation_metric_dict"
        ]
        for metric_name, metric in global_evaluation_metric_dict.items():
            metric_score_dict[metric_name] = metric(metric_score_dict)

    return metric_score_dict