Example #1
    def test_binarize_forecast_probs(self):
        """Ensures correct output from binarize_forecast_probs."""

        these_forecast_labels = model_eval.binarize_forecast_probs(
            FORECAST_PROBABILITIES, BINARIZATION_THRESHOLD_HALF)
        self.assertTrue(
            numpy.array_equal(these_forecast_labels,
                              FORECAST_LABELS_THRESHOLD_HALF))

    def test_binarize_forecast_probs(self):
        """Ensures correct output from binarize_forecast_probs."""

        these_forecast_labels = model_eval.binarize_forecast_probs(
            forecast_probabilities=FORECAST_PROBABILITIES,
            binarization_threshold=0.5)

        self.assertTrue(
            numpy.array_equal(these_forecast_labels,
                              FORECAST_LABELS_THRESHOLD_HALF))
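
Both test methods rely on module-level constants defined elsewhere in the test file. The sketch below shows what those constants might look like; the actual arrays live in the original test module, and the values here are made up for illustration (assuming that probabilities at or above the threshold map to label 1).

import numpy

# Hypothetical values -- the real test module defines its own arrays.
FORECAST_PROBABILITIES = numpy.array([0.1, 0.3, 0.5, 0.7, 0.9])
BINARIZATION_THRESHOLD_HALF = 0.5

# Expected labels if probabilities >= 0.5 are mapped to 1 and the rest to 0.
FORECAST_LABELS_THRESHOLD_HALF = numpy.array([0, 0, 1, 1, 1], dtype=int)
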
def _compute_scores(forecast_probabilities,
                    observed_labels,
                    num_bootstrap_reps,
                    output_file_name,
                    best_prob_threshold=None,
                    downsampling_dict=None):
    """Computes evaluation scores.

    E = number of examples (storm objects)

    :param forecast_probabilities: length-E numpy array of forecast event
        probabilities.
    :param observed_labels: length-E numpy array of observations (1 for event,
        0 for non-event).
    :param num_bootstrap_reps: Number of bootstrap replicates.
    :param output_file_name: Path to output file (will be written by
        `model_evaluation.write_evaluation`).
    :param best_prob_threshold: Best probability threshold.  If None, will be
        determined on the fly.
    :param downsampling_dict: Dictionary with downsampling fractions.  See doc
        for `deep_learning_utils.sample_by_class`.  If this is None,
        downsampling will not be used.
    """

    num_examples = len(observed_labels)
    num_examples_by_class = numpy.unique(observed_labels,
                                         return_counts=True)[-1]

    print('Number of examples by class (no downsampling): {0:s}'.format(
        str(num_examples_by_class)))

    positive_example_indices = numpy.where(observed_labels == 1)[0]
    negative_example_indices = numpy.where(observed_labels == 0)[0]

    if downsampling_dict is None:
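        # Without downsampling, keep every example (indices 0 through E - 1).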
        these_indices = numpy.linspace(0,
                                       num_examples - 1,
                                       num=num_examples,
                                       dtype=int)
    else:
        these_indices = dl_utils.sample_by_class(
            sampling_fraction_by_class_dict=downsampling_dict,
            target_name=DUMMY_TARGET_NAME,
            target_values=observed_labels,
            num_examples_total=num_examples)

        this_num_ex_by_class = numpy.unique(observed_labels[these_indices],
                                            return_counts=True)[-1]

        print('Number of examples by class (after downsampling): {0:s}'.format(
            str(this_num_ex_by_class)))

    all_prob_thresholds = model_eval.get_binarization_thresholds(
        threshold_arg=model_eval.THRESHOLD_ARG_FOR_UNIQUE_FORECASTS,
        forecast_probabilities=forecast_probabilities[these_indices],
        forecast_precision=FORECAST_PRECISION)

    if best_prob_threshold is None:
        best_prob_threshold, best_csi = (
            model_eval.find_best_binarization_threshold(
                forecast_probabilities=forecast_probabilities[these_indices],
                observed_labels=observed_labels[these_indices],
                threshold_arg=all_prob_thresholds,
                criterion_function=model_eval.get_csi,
                optimization_direction=model_eval.MAX_OPTIMIZATION_STRING))
    else:
        these_forecast_labels = model_eval.binarize_forecast_probs(
            forecast_probabilities=forecast_probabilities[these_indices],
            binarization_threshold=best_prob_threshold)

        this_contingency_dict = model_eval.get_contingency_table(
            forecast_labels=these_forecast_labels,
            observed_labels=observed_labels[these_indices])

        best_csi = model_eval.get_csi(this_contingency_dict)

    print(
        ('Best probability threshold = {0:.4f} ... corresponding CSI = {1:.4f}'
         ).format(best_prob_threshold, best_csi))

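    # Count the examples in each forecast-probability bin; this array is saved
    # with the other results at the end of this method.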
    num_examples_by_forecast_bin = model_eval.get_points_in_reliability_curve(
        forecast_probabilities=forecast_probabilities[these_indices],
        observed_labels=observed_labels[these_indices],
        num_forecast_bins=model_eval.DEFAULT_NUM_RELIABILITY_BINS)[-1]

    list_of_evaluation_tables = []

    for i in range(num_bootstrap_reps):
        print(('Computing scores for {0:d}th of {1:d} bootstrap replicates...'
               ).format(i + 1, num_bootstrap_reps))

        if num_bootstrap_reps == 1:
            if downsampling_dict is None:
                these_indices = numpy.linspace(0,
                                               num_examples - 1,
                                               num=num_examples,
                                               dtype=int)
            else:
                these_indices = dl_utils.sample_by_class(
                    sampling_fraction_by_class_dict=downsampling_dict,
                    target_name=DUMMY_TARGET_NAME,
                    target_values=observed_labels,
                    num_examples_total=num_examples)
        else:
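            # Bootstrap replicate: resample positive and negative examples
            # separately (with replacement), so that each replicate contains
            # both classes whenever both are present in the original data.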
            if len(positive_example_indices) > 0:
                these_positive_indices = bootstrapping.draw_sample(
                    positive_example_indices)[0]
            else:
                these_positive_indices = numpy.array([], dtype=int)

            these_negative_indices = bootstrapping.draw_sample(
                negative_example_indices)[0]

            these_indices = numpy.concatenate(
                (these_positive_indices, these_negative_indices))

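            # If downsampling is requested, apply it to this bootstrap
            # replicate as well.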
            if downsampling_dict is not None:
                these_subindices = dl_utils.sample_by_class(
                    sampling_fraction_by_class_dict=downsampling_dict,
                    target_name=DUMMY_TARGET_NAME,
                    target_values=observed_labels[these_indices],
                    num_examples_total=num_examples)

                these_indices = these_indices[these_subindices]

        if downsampling_dict is not None:
            this_num_ex_by_class = numpy.unique(observed_labels[these_indices],
                                                return_counts=True)[-1]

            print('Number of examples by class: {0:s}'.format(
                str(this_num_ex_by_class)))

        this_evaluation_table = model_eval.run_evaluation(
            forecast_probabilities=forecast_probabilities[these_indices],
            observed_labels=observed_labels[these_indices],
            best_prob_threshold=best_prob_threshold,
            all_prob_thresholds=all_prob_thresholds,
            climatology=numpy.mean(observed_labels[these_indices]))

        list_of_evaluation_tables.append(this_evaluation_table)

        if i == num_bootstrap_reps - 1:
            print(SEPARATOR_STRING)
        else:
            print(MINOR_SEPARATOR_STRING)

        if i == 0:
            continue

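        # Align columns of the newest table with those of the first table, so
        # that all tables can be concatenated row-wise after the loop.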
        list_of_evaluation_tables[-1] = list_of_evaluation_tables[-1].align(
            list_of_evaluation_tables[0], axis=1)[0]

    evaluation_table = pandas.concat(list_of_evaluation_tables,
                                     axis=0,
                                     ignore_index=True)

    print('Writing results to: "{0:s}"...'.format(output_file_name))

    model_eval.write_evaluation(
        pickle_file_name=output_file_name,
        forecast_probabilities=forecast_probabilities,
        observed_labels=observed_labels,
        best_prob_threshold=best_prob_threshold,
        all_prob_thresholds=all_prob_thresholds,
        num_examples_by_forecast_bin=num_examples_by_forecast_bin,
        downsampling_dict=downsampling_dict,
        evaluation_table=evaluation_table)
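
As a rough, self-contained illustration of how _compute_scores might be driven (not taken from the original script), the snippet below feeds it synthetic forecast-observation pairs; the array sizes, random seed, and output path are all made up.

import numpy

if __name__ == '__main__':
    # Synthetic forecast-observation pairs, purely for illustration.
    numpy.random.seed(6695)
    forecast_probabilities = numpy.random.uniform(size=1000)
    observed_labels = (
        numpy.random.uniform(size=1000) < forecast_probabilities
    ).astype(int)

    _compute_scores(
        forecast_probabilities=forecast_probabilities,
        observed_labels=observed_labels,
        num_bootstrap_reps=100,
        output_file_name='model_evaluation.p',  # hypothetical path
        best_prob_threshold=None,
        downsampling_dict=None)
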
Example #4
def run_evaluation(forecast_probabilities, observed_labels, output_dir_name):
    """Evaluates forecast-observation pairs from any forecasting method.

    Specifically, this method does the following:

    - creates ROC (receiver operating characteristic) curve
    - creates performance diagram
    - creates attributes diagram
    - saves each of the figures listed above to a .jpg file
    - computes many performance metrics and saves them to a Pickle file

    :param forecast_probabilities: length-N numpy array of forecast event
        probabilities.
    :param observed_labels: length-N numpy array of observed labels (1 for
        "yes", 0 for "no").
    :param output_dir_name: Name of output directory.
    """

    file_system_utils.mkdir_recursive_if_necessary(
        directory_name=output_dir_name)

    # TODO(thunderhoser): Make binarization threshold an input argument to this
    # method.
    binarization_threshold, best_csi = (
        model_eval.find_best_binarization_threshold(
            forecast_probabilities=forecast_probabilities,
            observed_labels=observed_labels,
            threshold_arg=model_eval.THRESHOLD_ARG_FOR_UNIQUE_FORECASTS,
            criterion_function=model_eval.get_csi,
            optimization_direction=model_eval.MAX_OPTIMIZATION_DIRECTION,
            unique_forecast_precision=FORECAST_PRECISION_FOR_THRESHOLDS))

    print(
        'Best binarization threshold = {0:.4f} ... corresponding CSI = {1:.4f}'
        .format(binarization_threshold, best_csi))

    print('Binarizing forecast probabilities...')
    forecast_labels = model_eval.binarize_forecast_probs(
        forecast_probabilities=forecast_probabilities,
        binarization_threshold=binarization_threshold)

    print('Creating contingency table...')
    contingency_table_as_dict = model_eval.get_contingency_table(
        forecast_labels=forecast_labels, observed_labels=observed_labels)
    print('{0:s}\n'.format(str(contingency_table_as_dict)))

    print('Computing performance metrics...')
    pod = model_eval.get_pod(contingency_table_as_dict)
    pofd = model_eval.get_pofd(contingency_table_as_dict)
    success_ratio = model_eval.get_success_ratio(contingency_table_as_dict)
    focn = model_eval.get_focn(contingency_table_as_dict)
    accuracy = model_eval.get_accuracy(contingency_table_as_dict)
    csi = model_eval.get_csi(contingency_table_as_dict)
    frequency_bias = model_eval.get_frequency_bias(contingency_table_as_dict)
    peirce_score = model_eval.get_peirce_score(contingency_table_as_dict)
    heidke_score = model_eval.get_heidke_score(contingency_table_as_dict)

    print((
        'POD = {0:.4f} ... POFD = {1:.4f} ... success ratio = {2:.4f} ... '
        'FOCN = {3:.4f} ... accuracy = {4:.4f} ... CSI = {5:.4f} ... frequency '
        'bias = {6:.4f} ... Peirce score = {7:.4f} ... Heidke score = {8:.4f}\n'
    ).format(pod, pofd, success_ratio, focn, accuracy, csi, frequency_bias,
             peirce_score, heidke_score))

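    # Each helper below writes one figure to output_dir_name and returns the
    # corresponding summary metric(s).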
    auc, scikit_learn_auc = _create_roc_curve(
        forecast_probabilities=forecast_probabilities,
        observed_labels=observed_labels, output_dir_name=output_dir_name)
    print('\n')

    bss_dict = _create_attributes_diagram(
        forecast_probabilities=forecast_probabilities,
        observed_labels=observed_labels, output_dir_name=output_dir_name)
    print('\n')

    aupd = _create_performance_diagram(
        forecast_probabilities=forecast_probabilities,
        observed_labels=observed_labels, output_dir_name=output_dir_name)
    print('\n')

    evaluation_file_name = '{0:s}/model_evaluation.p'.format(output_dir_name)
    print('Writing results to: "{0:s}"...'.format(evaluation_file_name))
    model_eval.write_results(
        forecast_probabilities=forecast_probabilities,
        observed_labels=observed_labels,
        binarization_threshold=binarization_threshold, pod=pod, pofd=pofd,
        success_ratio=success_ratio, focn=focn, accuracy=accuracy, csi=csi,
        frequency_bias=frequency_bias, peirce_score=peirce_score,
        heidke_score=heidke_score, auc=auc, scikit_learn_auc=scikit_learn_auc,
        aupd=aupd, bss_dict=bss_dict, pickle_file_name=evaluation_file_name)
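
A minimal driver for run_evaluation might look like the sketch below. In the original script the probabilities and labels are read from prediction files; the random inputs and output directory name here are assumptions for illustration only.

import numpy

if __name__ == '__main__':
    # Hypothetical inputs -- in practice these come from a prediction file.
    numpy.random.seed(1)
    forecast_probabilities = numpy.random.uniform(size=500)
    observed_labels = numpy.random.randint(low=0, high=2, size=500)

    run_evaluation(
        forecast_probabilities=forecast_probabilities,
        observed_labels=observed_labels,
        output_dir_name='model_evaluation_output')  # hypothetical directory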