Пример #1
0
    def evaluate_period_forecasts(self):
        """
        Scores the full-period neighborhood probabilities against the full-period observations.

        For each neighbor radius and size threshold, the observation field is thresholded, convolved with a
        disk(neighbor_radius) footprint and capped at 1. Each smoothing radius then selects one forecast field,
        which is verified with a DistributedROC and a DistributedReliability object.

        Returns:
            A pandas DataFrame labelled by forecast variable name, holding the run metadata together with the
            ROC and Reliability objects for every neighbor radius, size threshold and smoothing radius.
        """
        all_scores = pd.DataFrame(columns=["Run_Date", "Ensemble Name", "Model_Name", "Forecast_Variable",
                                           "Neighbor_Radius", "Smoothing_Radius", "Size_Threshold",
                                           "ROC", "Reliability"])
        coord_mask = None
        if self.coordinate_file is not None:
            # Indices of the points inside the lon/lat box that also have a positive mask value.
            in_domain = self.coordinates["lon"] >= self.lon_bounds[0]
            in_domain = in_domain & (self.coordinates["lon"] <= self.lon_bounds[1])
            in_domain = in_domain & (self.coordinates["lat"] >= self.lat_bounds[0])
            in_domain = in_domain & (self.coordinates["lat"] <= self.lat_bounds[1])
            in_domain = in_domain & (self.period_obs[self.mask_variable] > 0)
            coord_mask = np.where(in_domain)

        def select_points(grid):
            # Observations and forecasts must be reduced to the same set of points:
            # everything when no mask is requested, otherwise the masked points only.
            if not self.obs_mask:
                return grid.ravel()
            if self.coordinate_file is None:
                return grid[self.period_obs[self.mask_variable] > 0]
            return grid[coord_mask[0], coord_mask[1]]

        for neighbor_radius in self.neighbor_radii:
            footprint = disk(neighbor_radius)
            for s, size_threshold in enumerate(self.size_thresholds):
                exceeds = self.period_obs[self.mrms_variable] >= self.obs_thresholds[s]
                obs_grid = fftconvolve(exceeds, footprint, mode="same")
                obs_grid[obs_grid > 1] = 1
                obs_points = select_points(obs_grid)
                for smoothing_radius in self.smoothing_radii:
                    print("Eval period forecast {0} {1} {2} {3} {4} {5}".format(
                        self.model_name, self.forecast_variable, self.run_date,
                        neighbor_radius, size_threshold, smoothing_radius))
                    period_hours = self.end_hour - self.start_hour + 1
                    period_var = "neighbor_prob_{0:d}-hour_r_{1:d}_s_{2:d}_{3}_{4:0.2f}".format(
                        period_hours, neighbor_radius, smoothing_radius, self.forecast_variable, size_threshold)
                    forecast_points = select_points(self.period_forecasts[period_var])
                    roc = DistributedROC(thresholds=self.probability_levels, obs_threshold=0.5)
                    roc.update(forecast_points, obs_points)
                    rel = DistributedReliability(thresholds=self.probability_levels, obs_threshold=0.5)
                    rel.update(forecast_points, obs_points)
                    all_scores.loc[period_var] = [self.run_date, self.ensemble_name, self.model_name,
                                                  self.forecast_variable, neighbor_radius, smoothing_radius,
                                                  size_threshold, roc, rel]
        return all_scores
Пример #2
0
    def evaluate_hourly_forecasts(self):
        """
        Calculates ROC curves and Reliability scores for each forecast hour.

        For every hour, neighbor radius and size threshold, the raw observation
        field is thresholded, convolved with a disk(neighbor_radius) footprint
        and reduced to a 0/1 field. Every smoothing radius then selects one
        hourly forecast field that is verified against those observations.

        Returns:
            A pandas DataFrame containing forecast metadata as well as
            DistributedROC and DistributedReliability objects. Rows are
            labelled with the forecast variable name followed by the hour.
        """
        score_columns = [
            "Run_Date", "Forecast_Hour", "Ensemble Name", "Model_Name",
            "Forecast_Variable", "Neighbor_Radius", "Smoothing_Radius",
            "Size_Threshold", "ROC", "Reliability"
        ]
        all_scores = pd.DataFrame(columns=score_columns)
        for h, hour in enumerate(range(self.start_hour, self.end_hour + 1)):
            # The mask only depends on the hour, so build it once here
            # instead of once per observation and forecast field.
            hour_mask = None
            if self.obs_mask:
                hour_mask = self.raw_obs[self.mask_variable][h] > 0
            for neighbor_radius in self.neighbor_radii:
                n_filter = disk(neighbor_radius)
                for s, size_threshold in enumerate(self.size_thresholds):
                    # size_threshold is printed without a format code: "d"
                    # raises ValueError for float thresholds, which the
                    # "{3:0.2f}" field of hour_var below explicitly allows.
                    print(
                        "Eval hourly forecast {0:02d} {1} {2} {3} {4:d} {5}".
                        format(hour, self.model_name, self.forecast_variable,
                               self.run_date, neighbor_radius, size_threshold))
                    hour_obs = fftconvolve(self.raw_obs[self.mrms_variable][h]
                                           >= self.obs_thresholds[s],
                                           n_filter,
                                           mode="same")
                    # fftconvolve returns the neighbor counts with floating
                    # point round-off, so a count of exactly one may come back
                    # marginally below 1. Splitting at 0.5 keeps such points
                    # as events while still discarding the near-zero noise.
                    hour_obs = (hour_obs >= 0.5).astype(float)
                    if hour_mask is not None:
                        hour_obs = hour_obs[hour_mask]
                    for smoothing_radius in self.smoothing_radii:
                        hour_var = "neighbor_prob_r_{0:d}_s_{1:d}_{2}_{3:0.2f}".format(
                            neighbor_radius, smoothing_radius,
                            self.forecast_variable, size_threshold)
                        hour_forecast = self.hourly_forecasts[hour_var][h]
                        if hour_mask is not None:
                            hour_forecast = hour_forecast[hour_mask]
                        roc = DistributedROC(
                            thresholds=self.probability_levels,
                            obs_threshold=0.5)
                        roc.update(hour_forecast, hour_obs)
                        rel = DistributedReliability(
                            thresholds=self.probability_levels,
                            obs_threshold=0.5)
                        rel.update(hour_forecast, hour_obs)
                        row = [
                            self.run_date, hour, self.ensemble_name,
                            self.model_name, self.forecast_variable,
                            neighbor_radius, smoothing_radius, size_threshold,
                            roc, rel
                        ]
                        all_scores.loc[hour_var + "_{0:d}".format(hour)] = row
        return all_scores
Пример #3
0
    def roc(self,
            model_type,
            model_name,
            intensity_threshold,
            prob_thresholds,
            query=None):
        """
        Builds a ROC curve for one model at a single intensity threshold.

        Args:
            model_type: type of model being evaluated (e.g. size).
            model_name: machine learning model being evaluated
            intensity_threshold: forecast bin used as the split point for evaluation
            prob_thresholds: Array of probability thresholds being evaluated.
            query: str to filter forecasts based on values of forecasts, obs, and metadata.

        Returns:
             A DistributedROC object. It is returned without any update when
             no forecasts remain after filtering.
        """
        roc_obj = DistributedROC(prob_thresholds, 0.5)
        sub_forecasts = self.matched_forecasts[model_type][model_name]
        if query is not None:
            sub_forecasts = sub_forecasts.query(query).reset_index(drop=True)
        num_cases = sub_forecasts.shape[0]
        obs_values = np.zeros(num_cases)
        if num_cases == 0:
            return roc_obj

        bins = self.forecast_bins[model_type]
        obs_cols = self.type_cols[model_type]
        if model_type == "dist":
            # Both forecast and observation columns hold argument tuples for
            # gamma_sf, which is evaluated at the intensity threshold.
            forecast_values = np.array([
                gamma_sf(intensity_threshold, *params)
                for params in sub_forecasts[bins].values
            ])
            obs_probs = np.array([
                gamma_sf(intensity_threshold, *params)
                for params in sub_forecasts[obs_cols].values
            ])
            obs_values[obs_probs >= 0.01] = 1
        else:
            if len(bins) > 1:
                # One minus the cumulative probability up to the bin that is
                # closest to the intensity threshold.
                fbin = np.argmin(np.abs(bins - intensity_threshold))
                cumulative = sub_forecasts[bins.astype(str)].values.cumsum(
                    axis=1)
                forecast_values = 1 - cumulative[:, fbin]
            else:
                forecast_values = sub_forecasts[bins.astype(str)[0]].values
            is_event = sub_forecasts[obs_cols].values >= intensity_threshold
            obs_values[is_event] = 1
        roc_obj.update(forecast_values, obs_values)
        return roc_obj
Пример #4
0
    def evaluate_hourly_forecasts(self):
        """
        Calculates ROC curves and Reliability scores for each forecast hour.

        For every hour, neighbor radius and size threshold, the raw observation field is thresholded, convolved
        with a disk(neighbor_radius) footprint and reduced to a 0/1 field. Every smoothing radius then selects one
        hourly forecast field that is verified against those observations.

        Returns:
            A pandas DataFrame containing forecast metadata as well as DistributedROC and DistributedReliability
            objects. Rows are labelled with the forecast variable name followed by the hour.
        """
        score_columns = ["Run_Date", "Forecast_Hour", "Ensemble Name", "Model_Name", "Forecast_Variable",
                         "Neighbor_Radius", "Smoothing_Radius", "Size_Threshold", "ROC", "Reliability"]
        all_scores = pd.DataFrame(columns=score_columns)
        for h, hour in enumerate(range(self.start_hour, self.end_hour + 1)):
            # The mask only depends on the hour, so it is built once per hour rather than per field.
            hour_mask = self.raw_obs[self.mask_variable][h] > 0 if self.obs_mask else None
            for neighbor_radius in self.neighbor_radii:
                n_filter = disk(neighbor_radius)
                for s, size_threshold in enumerate(self.size_thresholds):
                    # size_threshold carries no format code: "d" raises ValueError for float thresholds,
                    # which the "{3:0.2f}" field of hour_var below explicitly allows.
                    print("Eval hourly forecast {0:02d} {1} {2} {3} {4:d} {5}".format(hour, self.model_name,
                                                                                      self.forecast_variable,
                                                                                      self.run_date, neighbor_radius,
                                                                                      size_threshold))
                    hour_obs = fftconvolve(self.raw_obs[self.mrms_variable][h] >= self.obs_thresholds[s],
                                           n_filter, mode="same")
                    # fftconvolve returns the neighbor counts with floating point round-off, so a count of
                    # exactly one may come back marginally below 1. Splitting at 0.5 keeps such points as
                    # events while still discarding the near-zero noise.
                    hour_obs = (hour_obs >= 0.5).astype(float)
                    if hour_mask is not None:
                        hour_obs = hour_obs[hour_mask]
                    for smoothing_radius in self.smoothing_radii:
                        hour_var = "neighbor_prob_r_{0:d}_s_{1:d}_{2}_{3:0.2f}".format(neighbor_radius,
                                                                                       smoothing_radius,
                                                                                       self.forecast_variable,
                                                                                       size_threshold)
                        hour_forecast = self.hourly_forecasts[hour_var][h]
                        if hour_mask is not None:
                            hour_forecast = hour_forecast[hour_mask]
                        roc = DistributedROC(thresholds=self.probability_levels, obs_threshold=0.5)
                        roc.update(hour_forecast, hour_obs)
                        rel = DistributedReliability(thresholds=self.probability_levels, obs_threshold=0.5)
                        rel.update(hour_forecast, hour_obs)
                        row = [self.run_date, hour, self.ensemble_name, self.model_name, self.forecast_variable,
                               neighbor_radius, smoothing_radius, size_threshold, roc, rel]
                        all_scores.loc[hour_var + "_{0:d}".format(hour)] = row
        return all_scores
Пример #5
0
 def test_roc(self):
     """Check the AUC of the perfect and the random forecast fixtures."""
     auc_by_case = {}
     # Build, update and integrate one ROC curve per fixture, perfect first.
     for case in ("perfect", "random"):
         curve = DistributedROC(self.thresholds, self.obs_threshold)
         curve.update(self.forecasts[case], self.observations[case])
         auc_by_case[case] = curve.auc()
     perfect_auc = auc_by_case["perfect"]
     random_auc = auc_by_case["random"]

     perfect_msg = "Perfect AUC not 1, is actually {0:0.2f}".format(perfect_auc)
     self.assertEqual(perfect_auc, 1, msg=perfect_msg)

     # The random forecast only needs to land within 0.1 of an AUC of 0.5.
     random_msg = "Random AUC not 0.5, actually {0:0.3f}".format(random_auc)
     self.assertLessEqual(np.abs(random_auc - 0.5), 0.1, msg=random_msg)

     self.assertGreater(perfect_auc, random_auc,
                        msg="Perfect AUC is not greater than random.")
Пример #6
0
    def roc(self, model_type, model_name, intensity_threshold, prob_thresholds, query=None):
        """
        Builds a ROC curve for one model at a single intensity threshold.

        Args:
            model_type: type of model being evaluated (e.g. size).
            model_name: machine learning model being evaluated
            intensity_threshold: forecast bin used as the split point for evaluation
            prob_thresholds: Array of probability thresholds being evaluated.
            query: str to filter forecasts based on values of forecasts, obs, and metadata.

        Returns:
             A DistributedROC object. It is returned without any update when no forecasts remain after filtering.
        """
        roc_obj = DistributedROC(prob_thresholds, 0.5)
        sub_forecasts = self.matched_forecasts[model_type][model_name]
        if query is not None:
            sub_forecasts = sub_forecasts.query(query).reset_index(drop=True)
        num_cases = sub_forecasts.shape[0]
        obs_values = np.zeros(num_cases)
        if num_cases == 0:
            return roc_obj

        bins = self.forecast_bins[model_type]
        obs_cols = self.type_cols[model_type]

        def threshold_sf(columns):
            # Evaluate gamma_sf at the intensity threshold with each row of the given columns as its arguments.
            return np.array([gamma_sf(intensity_threshold, *params) for params in sub_forecasts[columns].values])

        if model_type == "dist":
            forecast_values = threshold_sf(bins)
            obs_probs = threshold_sf(obs_cols)
            obs_values[obs_probs >= 0.01] = 1
        else:
            if len(bins) > 1:
                # One minus the cumulative probability up to the bin that is closest to the intensity threshold.
                fbin = np.argmin(np.abs(bins - intensity_threshold))
                cumulative = sub_forecasts[bins.astype(str)].values.cumsum(axis=1)
                forecast_values = 1 - cumulative[:, fbin]
            else:
                forecast_values = sub_forecasts[bins.astype(str)[0]].values
            is_event = sub_forecasts[obs_cols].values >= intensity_threshold
            obs_values[is_event] = 1
        roc_obj.update(forecast_values, obs_values)
        return roc_obj
Пример #7
0
    def evaluate_period_forecasts(self):
        """
        Scores the full-period neighborhood probabilities against the
        full-period observations.

        For each neighbor radius and size threshold, the observation field is
        thresholded, convolved with a disk(neighbor_radius) footprint and
        capped at 1. Each smoothing radius then selects one forecast field,
        which is verified with a DistributedROC and a DistributedReliability
        object.

        Returns:
            A pandas DataFrame labelled by forecast variable name, holding the
            run metadata together with the ROC and Reliability objects for
            every neighbor radius, size threshold and smoothing radius.
        """
        all_scores = pd.DataFrame(columns=[
            "Run_Date", "Ensemble Name", "Model_Name", "Forecast_Variable",
            "Neighbor_Radius", "Smoothing_Radius", "Size_Threshold", "ROC",
            "Reliability"
        ])
        coord_mask = None
        if self.coordinate_file is not None:
            # Indices of the points inside the lon/lat box that also have a
            # positive mask value.
            keep = self.coordinates["lon"] >= self.lon_bounds[0]
            keep = keep & (self.coordinates["lon"] <= self.lon_bounds[1])
            keep = keep & (self.coordinates["lat"] >= self.lat_bounds[0])
            keep = keep & (self.coordinates["lat"] <= self.lat_bounds[1])
            keep = keep & (self.period_obs[self.mask_variable] > 0)
            coord_mask = np.where(keep)

        def pick_points(field):
            # Observations and forecasts must be reduced to the same points:
            # everything when no mask is requested, masked points otherwise.
            if not self.obs_mask:
                return field.ravel()
            if self.coordinate_file is None:
                return field[self.period_obs[self.mask_variable] > 0]
            return field[coord_mask[0], coord_mask[1]]

        for neighbor_radius in self.neighbor_radii:
            footprint = disk(neighbor_radius)
            for s, size_threshold in enumerate(self.size_thresholds):
                above = (self.period_obs[self.mrms_variable] >=
                         self.obs_thresholds[s])
                obs_field = fftconvolve(above, footprint, mode="same")
                obs_field[obs_field > 1] = 1
                obs_points = pick_points(obs_field)
                for smoothing_radius in self.smoothing_radii:
                    message = "Eval period forecast {0} {1} {2} {3} {4} {5}"
                    print(
                        message.format(self.model_name,
                                       self.forecast_variable, self.run_date,
                                       neighbor_radius, size_threshold,
                                       smoothing_radius))
                    num_hours = self.end_hour - self.start_hour + 1
                    period_var = "neighbor_prob_{0:d}-hour_r_{1:d}_s_{2:d}_{3}_{4:0.2f}".format(
                        num_hours, neighbor_radius, smoothing_radius,
                        self.forecast_variable, size_threshold)
                    forecast_points = pick_points(
                        self.period_forecasts[period_var])
                    roc = DistributedROC(thresholds=self.probability_levels,
                                         obs_threshold=0.5)
                    roc.update(forecast_points, obs_points)
                    rel = DistributedReliability(
                        thresholds=self.probability_levels, obs_threshold=0.5)
                    rel.update(forecast_points, obs_points)
                    all_scores.loc[period_var] = [
                        self.run_date, self.ensemble_name, self.model_name,
                        self.forecast_variable, neighbor_radius,
                        smoothing_radius, size_threshold, roc, rel
                    ]
        return all_scores