def train_and_score(learner, train_examples, test_examples, metric):
    """
    A utility function to train a given learner instance on the given
    training examples, generate predictions on the training set itself
    as well as on the given test set, and score those predictions using
    the given metric. The function returns the train and test scores.

    Note that this needs to be a top-level function since it is called
    from within ``joblib.Parallel()`` and, therefore, needs to be
    picklable, which it would not be as an instance method of the
    ``Learner`` class.

    Parameters
    ----------
    learner : skll.Learner
        A SKLL ``Learner`` instance.
    train_examples : array-like, with shape (n_samples, n_features)
        The training examples.
    test_examples : array-like, of length n_samples
        The test examples.
    metric : str
        The scoring function passed to ``use_score_func()``.

    Returns
    -------
    train_score : float
        Output of the score function applied to predictions of
        ``learner`` on ``train_examples``.
    test_score : float
        Output of the score function applied to predictions of
        ``learner`` on ``test_examples``.
    """
    _ = learner.train(train_examples, grid_search=False, shuffle=False)

    # get the train and test class indices (not labels)
    train_predictions = learner.predict(train_examples, class_labels=False)
    test_predictions = learner.predict(test_examples, class_labels=False)

    # now get the training and test labels and convert them to indices
    # but make sure to include any unseen labels in the test data
    if learner.model_type._estimator_type == 'classifier':
        test_label_list = np.unique(test_examples.labels).tolist()
        train_and_test_label_dict = add_unseen_labels(learner.label_dict,
                                                      test_label_list)
        train_labels = np.array([train_and_test_label_dict[label]
                                 for label in train_examples.labels])
        test_labels = np.array([train_and_test_label_dict[label]
                                for label in test_examples.labels])
    else:
        train_labels = train_examples.labels
        test_labels = test_examples.labels

    # now compute and return the scores
    train_score = use_score_func(metric, train_labels, train_predictions)
    test_score = use_score_func(metric, test_labels, test_predictions)
    return train_score, test_score
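The ``add_unseen_labels()`` helper called above merges labels that appear only in the test data into the learner's label-to-index mapping. It is not shown in this section; a minimal sketch, assuming behavior equivalent to the inlined logic in the variant of ``train_and_score()`` below, might look like this:

def add_unseen_labels(train_label_dict, test_label_list):
    # A hypothetical sketch of the helper, reconstructed from the
    # inlined logic in the second variant of train_and_score();
    # the actual SKLL implementation may differ.

    # find labels in the test data that were never seen during training
    unseen_test_label_list = [label for label in test_label_list
                              if label not in train_label_dict]

    # assign new indices to the unseen labels, starting right after
    # the indices already used for the training labels
    unseen_label_dict = {label: i for i, label in
                         enumerate(unseen_test_label_list,
                                   start=len(train_label_dict))}

    # combine the two dictionaries
    train_and_test_label_dict = train_label_dict.copy()
    train_and_test_label_dict.update(unseen_label_dict)
    return train_and_test_label_dict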
def train_and_score(learner, train_examples, test_examples, metric):
    """
    A utility function to train a given learner instance on the given
    training examples, generate predictions on the training set itself
    as well as on the given test set, and score those predictions using
    the given metric. The function returns the train and test scores.

    Note that this needs to be a top-level function since it is called
    from within ``joblib.Parallel()`` and, therefore, needs to be
    picklable, which it would not be as an instance method of the
    ``Learner`` class.

    Parameters
    ----------
    learner : skll.Learner
        A SKLL ``Learner`` instance.
    train_examples : array-like, with shape (n_samples, n_features)
        The training examples.
    test_examples : array-like, of length n_samples
        The test examples.
    metric : str
        The scoring function passed to ``use_score_func()``.

    Returns
    -------
    train_score : float
        Output of the score function applied to predictions of
        ``learner`` on ``train_examples``.
    test_score : float
        Output of the score function applied to predictions of
        ``learner`` on ``test_examples``.
    """
    _ = learner.train(train_examples, grid_search=False, shuffle=False)

    train_predictions = learner.predict(train_examples)
    test_predictions = learner.predict(test_examples)

    if learner.model_type._estimator_type == 'classifier':
        test_label_list = np.unique(test_examples.labels).tolist()

        # find any labels in the test data that were not seen during
        # training and assign them indices after the training labels
        unseen_test_label_list = [label for label in test_label_list
                                  if label not in learner.label_list]
        unseen_label_dict = {label: i for i, label in
                             enumerate(unseen_test_label_list,
                                       start=len(learner.label_list))}

        # combine the two dictionaries
        train_and_test_label_dict = learner.label_dict.copy()
        train_and_test_label_dict.update(unseen_label_dict)
        train_labels = np.array([train_and_test_label_dict[label]
                                 for label in train_examples.labels])
        test_labels = np.array([train_and_test_label_dict[label]
                                for label in test_examples.labels])
    else:
        train_labels = train_examples.labels
        test_labels = test_examples.labels

    train_score = use_score_func(metric, train_labels, train_predictions)
    test_score = use_score_func(metric, test_labels, test_predictions)
    return train_score, test_score
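A usage sketch for ``train_and_score()``: the file paths, learner name, and metric below are illustrative assumptions, not values from the source.

# hypothetical usage: paths, learner name, and metric are assumptions
from skll.data import Reader
from skll.learner import Learner

train_fs = Reader.for_path("train.jsonlines").read()
test_fs = Reader.for_path("test.jsonlines").read()
learner = Learner("LogisticRegression")

# train on the training set and score predictions on both sets
train_score, test_score = train_and_score(learner, train_fs, test_fs,
                                          "accuracy")
print(f"train: {train_score:.3f}, test: {test_score:.3f}")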
def test_register_custom_metric_values():
    """Test that custom metrics compute the expected values."""
    # register two metrics defined in the same file
    metric_dir = join(_my_dir, "other")
    custom_metrics_file = join(metric_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file, "f075_macro")
    register_custom_metric(custom_metrics_file, "ratio_of_ones")

    # check that the values that SKLL computes match what we expect;
    # note that `beta` is keyword-only in recent versions of scikit-learn
    y_true = [1, 1, 1, 0, 2, 1, 2, 0, 1]
    y_pred = [0, 1, 1, 0, 1, 2, 0, 1, 2]
    skll_value = use_score_func("f075_macro", y_true, y_pred)
    sklearn_value = fbeta_score(y_true, y_pred, beta=0.75, average='macro')
    eq_(skll_value, sklearn_value)

    y_true = [1, 1, 1, 0]
    y_pred = [0, 1, 1, 0]
    skll_value = use_score_func("ratio_of_ones", y_true, y_pred)
    true_ones = len([true for true in y_true if true == 1])
    pred_ones = len([pred for pred in y_pred if pred == 1])
    expected_value = pred_ones / (true_ones + pred_ones)
    eq_(skll_value, expected_value)
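The test assumes a ``custom_metrics.py`` fixture defining the two metrics. A minimal sketch of such a file, inferred from the expected values computed in the test (the actual fixture in the SKLL test suite may differ):

# hypothetical contents of custom_metrics.py, inferred from the
# expected values in the test above; the actual fixture may differ
from sklearn.metrics import fbeta_score


def f075_macro(y_true, y_pred):
    # macro-averaged F-beta score with beta = 0.75
    return fbeta_score(y_true, y_pred, beta=0.75, average='macro')


def ratio_of_ones(y_true, y_pred):
    # ratio of predicted ones to the total number of true and
    # predicted ones
    true_ones = [label for label in y_true if label == 1]
    pred_ones = [label for label in y_pred if label == 1]
    return len(pred_ones) / (len(true_ones) + len(pred_ones))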
def compute_eval_from_predictions(examples_file, predictions_file,
                                  metric_names):
    '''
    Compute evaluation metrics from prediction files after you have run
    an experiment.

    :param examples_file: a SKLL examples file (in .jsonlines or other
                          format)
    :param predictions_file: a SKLL predictions output TSV file with id
                             and prediction column names
    :param metric_names: a list of SKLL metric names
                         (e.g., [pearson, unweighted_kappa])

    :returns: a dictionary mapping metric names to values
    '''
    # read gold standard labels
    data = load_examples(examples_file)
    gold = dict(zip(data.ids, data.classes))

    # read predictions
    pred = {}
    with open(predictions_file) as pred_file:
        reader = csv.reader(pred_file, dialect=csv.excel_tab)
        next(reader)  # skip the header
        for row in reader:
            pred[row[0]] = float(row[1])

    # make a sorted list of example ids in order to match up
    # labels and predictions
    if set(gold.keys()) != set(pred.keys()):
        raise ValueError('The example and prediction IDs do not match.')
    example_ids = sorted(gold.keys())

    res = {}
    for metric_name in metric_names:
        score = use_score_func(metric_name,
                               [gold[ex_id] for ex_id in example_ids],
                               [pred[ex_id] for ex_id in example_ids])
        res[metric_name] = score
    return res
def compute_eval_from_predictions(examples_file, predictions_file,
                                  metric_names, prediction_method=None):
    """
    Compute evaluation metrics from prediction files after you have run
    an experiment.

    Parameters
    ----------
    examples_file : str
        Path to a SKLL examples file (in .jsonlines or other format).
    predictions_file : str
        Path to a SKLL predictions output TSV file with id and
        prediction column names.
    metric_names : list of str
        A list of SKLL metric names (e.g., [pearson, unweighted_kappa]).
    prediction_method : str or None
        Indicates how to get a single class prediction from the
        probabilities. Currently supported options are "highest", which
        selects the class with the highest probability, and
        "expected_value", which calculates an expected value over
        integer classes and rounds to the nearest int. If the
        predictions file does not contain probabilities, this should be
        set to None.

    Returns
    -------
    dict
        Maps metric names to corresponding values.

    Raises
    ------
    ValueError
        If the requested prediction method is 'expected_value' but the
        class names cannot be converted to ints.
    """
    # read gold standard labels
    data = Reader.for_path(examples_file).read()
    gold = dict(zip(data.ids, data.labels))

    # read predictions
    pred = {}
    with open(predictions_file) as pred_file:
        reader = csv.reader(pred_file, dialect=csv.excel_tab)
        header = next(reader)

        # If there are more than two columns, assume column 0 contains
        # the ids, and columns 1-n contain class probabilities. Convert
        # them to a class prediction using the specified method.
        if len(header) > 2:
            classes = [c for c in header[1:] if c]
            if prediction_method is None:
                prediction_method = "highest"
                logger.info("No prediction method specified. Using "
                            "'highest'.")
            if prediction_method == 'expected_value':
                # this conversion raises the documented ValueError if
                # the class names cannot be converted to ints
                classes = [int(c) for c in classes]
            for row in reader:
                probabilities = [safe_float(p) for p in row[1:]]
                prediction = get_prediction_from_probabilities(
                    classes, probabilities, prediction_method)
                pred[row[0]] = safe_float(prediction)
        else:
            if prediction_method is not None:
                logger.warning("A prediction method was provided, but "
                               "the predictions file doesn't contain "
                               "probabilities. Ignoring prediction "
                               "method '{}'.".format(prediction_method))
            for row in reader:
                pred[row[0]] = safe_float(row[1])

    # make a sorted list of example ids in order to match up
    # labels and predictions
    if set(gold.keys()) != set(pred.keys()):
        raise ValueError('The example and prediction IDs do not match.')
    example_ids = sorted(gold.keys())

    res = {}
    for metric_name in metric_names:
        score = use_score_func(metric_name,
                               [gold[ex_id] for ex_id in example_ids],
                               [pred[ex_id] for ex_id in example_ids])
        res[metric_name] = score
    return res
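A usage sketch for the probability case: the file names and metric names below are illustrative assumptions, and the predictions file is assumed to have an id column followed by one probability column per integer class.

# hypothetical usage: file names and metric names are assumptions
scores = compute_eval_from_predictions(
    "test.jsonlines",
    "predictions.tsv",
    ["pearson", "unweighted_kappa"],
    prediction_method="expected_value")
print(scores)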
def compute_evaluation_metrics(metrics,
                               labels,
                               predictions,
                               model_type,
                               label_dict=None,
                               grid_objective=None,
                               probability=False,
                               logger=None):
    """
    Compute the given metrics to evaluate the given predictions,
    generated by the given type of estimator, against the given true
    labels.

    Parameters
    ----------
    metrics : list of str
        List of metrics to compute.
    labels : array-like
        True labels to be used for computing the metrics.
    predictions : array-like
        The predictions to be used for computing the metrics.
    model_type : str
        One of "classifier" or "regressor".
    label_dict : dict, optional
        Dictionary mapping class labels to indices for classification.
        Defaults to ``None``.
    grid_objective : str, optional
        The objective used for tuning the hyper-parameters of the model
        that generated the predictions. If ``None``, it means that no
        grid search was done.
        Defaults to ``None``.
    probability : bool, optional
        Does the model output class probabilities?
        Defaults to ``False``.
    logger : logging.Logger, optional
        A logger instance to use for logging messages and warnings.
        If ``None``, a new one is created.
        Defaults to ``None``.

    Returns
    -------
    res : 5-tuple
        The confusion matrix, the overall accuracy, the per-label PRFs,
        the grid search objective function score, and the additional
        evaluation metrics, if any. For regressors, the first two
        elements are ``None``.
    """
    # set up the logger
    logger = logger if logger else logging.getLogger(__name__)

    # warn if the grid objective was also specified in the metrics
    if len(metrics) > 0 and grid_objective in metrics:
        logger.warning(f"The grid objective '{grid_objective}' is also "
                       f"specified as an evaluation metric. Since its "
                       f"value is already included in the results as the "
                       f"objective score, it will not be printed "
                       f"again in the list of metrics.")
        metrics = [metric for metric in metrics if metric != grid_objective]

    # initialize a dictionary that will hold all of the metric scores
    metric_scores = {metric: None for metric in metrics}

    # if we are a classifier and in probability mode, then
    # `yhat` are probabilities so we need to compute the
    # class indices separately and save them too
    if probability and model_type == 'classifier':
        class_probs = predictions
        predictions = np.argmax(class_probs, axis=1)
    # if we are a regressor or a classifier not in probability
    # mode, then we have the class indices already and there
    # are no probabilities
    else:
        class_probs = None

    # make a single list of metrics including the grid objective
    # since it's easier to compute everything together
    metrics_to_compute = [grid_objective] + metrics
    for metric in metrics_to_compute:

        # skip the None if we are not doing grid search
        if not metric:
            continue

        # CASE 1: in probability mode for classification, which means we
        # need to either use the probabilities directly or infer the
        # labels from them depending on the metric
        if probability:

            # there are three possible cases here:
            # (a) if we are using a correlation metric or
            #     `average_precision` or `roc_auc` in a binary
            #     classification scenario, then we need to explicitly
            #     pass in the probabilities of the positive class
            # (b) if we are using `neg_log_loss`, then we
            #     just pass in the full probability array
            # (c) we compute the most likely labels from the
            #     probabilities via argmax and use those
            #     for all other metrics
            if (len(label_dict) == 2 and
                    (metric in CORRELATION_METRICS or
                     metric in ['average_precision', 'roc_auc']) and
                    metric != grid_objective):
                logger.info(f"using probabilities for the positive class "
                            f"to compute '{metric}' for evaluation.")
                preds_for_metric = class_probs[:, 1]
            elif metric == 'neg_log_loss':
                preds_for_metric = class_probs
            else:
                preds_for_metric = predictions

        # CASE 2: no probability mode for classifier or regressor,
        # in which case we just use the predictions as they are
        else:
            preds_for_metric = predictions

        try:
            metric_scores[metric] = use_score_func(metric, labels,
                                                   preds_for_metric)
        except ValueError:
            metric_scores[metric] = float('NaN')

    # now separate out the grid objective score from the additional
    # metric scores if a grid objective was actually passed in; if no
    # objective was passed in, then that score should just be ``None``
    objective_score = None
    additional_scores = metric_scores.copy()
    if grid_objective:
        objective_score = metric_scores[grid_objective]
        del additional_scores[grid_objective]

    # compute some basic statistics for regressors
    if model_type == 'regressor':
        result_dict = {'descriptive': defaultdict(dict)}
        for table_label, y in zip(['actual', 'predicted'],
                                  [labels, predictions]):
            result_dict['descriptive'][table_label]['min'] = min(y)
            result_dict['descriptive'][table_label]['max'] = max(y)
            result_dict['descriptive'][table_label]['avg'] = np.mean(y)
            result_dict['descriptive'][table_label]['std'] = np.std(y)
        result_dict['pearson'] = use_score_func('pearson', labels,
                                                predictions)
        res = (None, None, result_dict, objective_score, additional_scores)
    else:
        # compute the confusion matrix and precision/recall/f1;
        # note that we are using the class indices here
        # and not the actual class labels themselves
        num_labels = len(label_dict)
        conf_mat = confusion_matrix(labels,
                                    predictions,
                                    labels=list(range(num_labels)))

        # calculate the metrics
        overall_accuracy = accuracy_score(labels, predictions)
        result_matrix = precision_recall_fscore_support(
            labels, predictions, labels=list(range(num_labels)),
            average=None)

        # store the results
        result_dict = defaultdict(dict)
        for actual_label in sorted(label_dict):
            col = label_dict[actual_label]
            result_dict[actual_label]["Precision"] = result_matrix[0][col]
            result_dict[actual_label]["Recall"] = result_matrix[1][col]
            result_dict[actual_label]["F-measure"] = result_matrix[2][col]

        res = (conf_mat.tolist(), overall_accuracy, result_dict,
               objective_score, additional_scores)
    return res
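A usage sketch for ``compute_evaluation_metrics()`` in the classifier-with-probabilities case; the arrays, label dictionary, and metric names below are illustrative assumptions, not values from the source.

# hypothetical usage: arrays, label dict, and metrics are assumptions
import numpy as np

true_indices = np.array([0, 1, 1, 0])  # class indices, not labels
class_probs = np.array([[0.9, 0.1],    # one probability column per class
                        [0.2, 0.8],
                        [0.4, 0.6],
                        [0.7, 0.3]])

(conf_mat, accuracy, prf_dict,
 objective_score, additional_scores) = compute_evaluation_metrics(
     ['f1_score_macro'],
     true_indices,
     class_probs,
     'classifier',
     label_dict={'no': 0, 'yes': 1},
     grid_objective='accuracy',
     probability=True)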
def check_f05_metrics(metric_name, average_method):
    y_true = [1, 1, 1, 0, 0, 0]
    y_pred = [0, 1, 1, 1, 0, 0]
    skll_value = use_score_func(metric_name, y_true, y_pred)
    # `beta` is keyword-only in recent versions of scikit-learn
    sklearn_value = fbeta_score(y_true, y_pred, beta=0.5,
                                average=average_method)
    eq_(skll_value, sklearn_value)
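This checker is presumably driven by a generator over the averaging methods, in the nose style used elsewhere in these tests; a sketch, assuming metric names that follow the ``f05`` naming pattern (not confirmed from the source):

# hypothetical nose-style test generator; the metric names are
# assumptions based on the f05 naming pattern, not confirmed values
def test_f05_metrics():
    for metric_name, average_method in [("f05_score_macro", "macro"),
                                        ("f05_score_micro", "micro"),
                                        ("f05_score_weighted", "weighted")]:
        yield check_f05_metrics, metric_name, average_method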
def evaluate(self, examples, prediction_prefix=None, append=False,
             grid_objective=None):
    '''
    Evaluates a given model on a given dev or test example set.

    :param examples: The examples to evaluate the performance of the
                     model on.
    :type examples: ExamplesTuple
    :param prediction_prefix: If saving the predictions, this is the
                              prefix that will be used for the filename.
                              It will be followed by ".predictions"
    :type prediction_prefix: str
    :param append: Should we append the current predictions to the file
                   if it exists?
    :type append: bool
    :param grid_objective: The name of the objective function that was
                           used when doing the grid search.
    :type grid_objective: str

    :return: The confusion matrix, the overall accuracy, the per-class
             PRFs, the model parameters, and the grid search objective
             function score.
    :rtype: 5-tuple
    '''
    # initialize grid score
    grid_score = None

    # make the prediction on the test data
    yhat = self.predict(examples, prediction_prefix=prediction_prefix,
                        append=append)

    # extract actual labels (transformed for classification tasks)
    if self._model_type not in _REGRESSION_MODELS:
        ytest = np.array([self.label_dict[label]
                          for label in examples.classes])
    else:
        ytest = examples.classes

    # if run in probability mode, convert yhat to a list of
    # predicted classes
    if self.probability:
        # if we're using a correlation grid objective, calculate it here
        if grid_objective and grid_objective in _CORRELATION_METRICS:
            try:
                grid_score = use_score_func(grid_objective, ytest,
                                            yhat[:, 1])
            except ValueError:
                grid_score = float('NaN')
        yhat = np.array([max(range(len(row)), key=lambda i: row[i])
                         for row in yhat])

    # calculate the grid search objective function score, if specified
    if (grid_objective and
            (grid_objective not in _CORRELATION_METRICS or
             not self.probability)):
        try:
            grid_score = use_score_func(grid_objective, ytest, yhat)
        except ValueError:
            grid_score = float('NaN')

    if self._model_type in _REGRESSION_MODELS:
        result_dict = {'descriptive': defaultdict(dict)}
        for table_label, y in zip(['actual', 'predicted'], [ytest, yhat]):
            result_dict['descriptive'][table_label]['min'] = min(y)
            result_dict['descriptive'][table_label]['max'] = max(y)
            result_dict['descriptive'][table_label]['avg'] = np.mean(y)
            result_dict['descriptive'][table_label]['std'] = np.std(y)
        result_dict['pearson'] = SCORERS['pearson']._score_func(ytest, yhat)
        res = (None, None, result_dict, self._model.get_params(),
               grid_score)
    else:
        # compute the confusion matrix
        num_labels = len(self.label_list)
        conf_mat = confusion_matrix(ytest, yhat,
                                    labels=list(range(num_labels)))

        # calculate the metrics
        overall_accuracy = accuracy_score(ytest, yhat)
        result_matrix = precision_recall_fscore_support(
            ytest, yhat, labels=list(range(num_labels)), average=None)

        # store the results
        result_dict = defaultdict(dict)
        for actual_class in sorted(self.label_list):
            c_num = self.label_dict[actual_class]
            result_dict[actual_class]["Precision"] = result_matrix[0][c_num]
            result_dict[actual_class]["Recall"] = result_matrix[1][c_num]
            result_dict[actual_class]["F-measure"] = result_matrix[2][c_num]

        res = (conf_mat.tolist(), overall_accuracy, result_dict,
               self._model.get_params(), grid_score)
    return res
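A usage sketch for this older ``Learner.evaluate()`` API; the example file, objective name, and the already-trained ``learner`` instance are illustrative assumptions, not values from the source.

# hypothetical usage of the older API: the path and objective name
# are assumptions; `learner` is assumed to be a trained Learner
dev_examples = load_examples("dev.jsonlines")
(conf_mat, accuracy, prf_dict,
 model_params, grid_score) = learner.evaluate(
    dev_examples,
    prediction_prefix="dev",
    grid_objective="f1_score_micro")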