Пример #1
0
 def run(self):
     """Run the whole pipeline: preprocess, forecast, scan, then aggregate.

     Every stage stores its output on the instance (``processed``,
     ``forecast``, ``all_results``, ``grid_results``) and the next stage
     consumes it.
     """
     # Stage 1 - preprocessing. The arguments are passed positionally, so
     # their order here must follow data_preprocessor's signature.
     preprocessing_args = (
         self.readings,
         self.percentage_missing,
         self.max_anom_per_day,
         self.N_sigma,
         self.repeats,
         self.rolling_hours,
         self.fap_threshold,
         self.consecutive_missing_threshold,
         self.global_threshold,
         self.drop_sparse,
         self.drop_anomalous,
         self.drop_aperiodic,
         self.drop_consecutives,
     )
     self.processed = data_preprocessor(*preprocessing_args)

     # Stage 2 - baseline forecast built from the preprocessed data.
     baseline_args = (
         self.processed,
         self.days_in_past,
         self.days_in_future,
         self.ts_method,
     )
     baseline_options = {
         "alpha": self.alpha,
         "beta": self.beta,
         "gamma": self.gamma,
         "kern": self.kernel,
     }
     self.forecast = count_baseline(*baseline_args, **baseline_options)

     # Stage 3 - scan the forecast, then aggregate the scan output.
     scan_output = scan(self.forecast, self.grid_resolution)
     self.all_results = scan_output
     self.grid_results = database_results(scan_output)
Пример #2
0
def _count_detectors_in_box(df, x_min, x_max, y_min, y_max):
    """Count distinct `detector_id` values in `df` lying inside a lon/lat box.

    Both edges of the box are inclusive (the default of `Series.between`).
    An inverted box (min > max) matches nothing and yields 0.
    """
    inside = df["lon"].between(x_min, x_max) & df["lat"].between(y_min, y_max)
    return len(set(df.loc[inside, "detector_id"]))


def _safe_ratio(numerator, denominator):
    """Return `numerator / denominator`, or NaN when `denominator` is zero."""
    return numerator / denominator if denominator else np.nan


def results_builder(
    outbreak_df: pd.DataFrame,
    outbreak_detectors: pd.DataFrame,
    outbreak_start,
    hist_sim_data,
    days_in_past: int,
    days_in_future: int,
    method: str,
    grid_partition: int,
    scan_type: str,
    show_plots=True,
):
    """Builds daily results from the scan up over a number of days, determined by
    the size of outbreak_df. e.g. if outbreak_df has N days worth of data, the
    first `days_in_past` + `days_in_future` days are needed before the first
    analysis day. This function then runs forecast + scan once per remaining
    day, using the settings described below. Basically - simulating what will
    be stored in the database over a period of time.

    Args:
        outbreak_df: simulated from `simulate_outbreak()`. Must contain the
            columns `measurement_start_utc`, `measurement_end_utc`, `lon`,
            `lat` and `detector_id`.
        outbreak_detectors: dataframe of affected detectors, with columns
            `detector_id`, `lon` and `lat`.
        outbreak_start: outbreak start time; subtracted from each analysis day
            to get `days_since_outbreak`.
        hist_sim_data: Dataframe of F(S) scores from historic data; the
            `l_score_EBP` column is used to derive detection thresholds.
        days_in_past: as in count_baseline
        days_in_future: as in count_baseline
        method: as in count_baseline
        grid_partition: as in scan()
        scan_type: as in scan()
        show_plots: when True, plot the forecast and the top-ranked region for
            every analysis day.

    Returns:
        A tuple of two dataframes:
        - the concatenated per-day frames produced by `database_results`,
        - one row per analysis day describing the highest scoring region,
          including spatial precision/recall (NaN when undefined) and the
          threshold/detection flags for each false-positive rate.

    Raises:
        ValueError: if outbreak_df does not span enough days to leave at least
            one analysis day.
    """

    t_min = outbreak_df["measurement_start_utc"].min()
    t_max = outbreak_df["measurement_end_utc"].max()

    # Get outbreak characteristics: detector count and bounding box
    num_outbreak_detectors = len(set(outbreak_detectors["detector_id"]))
    ob_x_min = outbreak_detectors["lon"].min()
    ob_x_max = outbreak_detectors["lon"].max()
    ob_y_min = outbreak_detectors["lat"].min()
    ob_y_max = outbreak_detectors["lat"].max()

    total_num_days = (t_max - t_min).days
    print("Total number of days in dataframe: ", total_num_days)

    first_analysis_day = (t_min + np.timedelta64(days_in_past, "D") +
                          np.timedelta64(days_in_future, "D"))

    num_forecast_days = (t_max - first_analysis_day).days + 1

    print("Producing forecasts and scans for {} days in total.\n".format(
        num_forecast_days))

    print("Outbreak begins at {}.".format(outbreak_start))

    # Fail early with a clear message; otherwise the loop below never runs and
    # pd.concat() raises an opaque "No objects to concatenate" ValueError.
    if num_forecast_days <= 0:
        raise ValueError(
            "outbreak_df spans too few days: no analysis day is left after "
            "reserving days_in_past + days_in_future days.")

    # False-Positive rates to check
    fps = [0.01, 0.05, 0.10, 0.25, 0.50]
    # Two-decimal labels used in the output column names, e.g. "0.10"
    fp_labels = ["{:.2f}".format(x) for x in fps]

    # Threshold of EBP score required to be detected
    threshs = [
        np.percentile(hist_sim_data["l_score_EBP"], 100 * (1 - x)) for x in fps
    ]

    # Columns of the top-ranked scan result kept in the daily summary
    region_columns = [
        "x_min",
        "x_max",
        "y_min",
        "y_max",
        "t_min",
        "t_max",
        "l_score_EBP",
        "l_score_000",
        "l_score_025",
        "l_score_050",
        "posterior_bbayes",
    ]

    dataframe_list = []

    daily_highest_scoring_regions = {}
    today = first_analysis_day
    for i in range(num_forecast_days):

        print("\nAnalysis day: {}. Looking back at last {} hours.".format(
            today, 24 * days_in_future))

        # Only data that has finished being measured by `today` is visible
        available_today = outbreak_df[
            outbreak_df["measurement_end_utc"] <= today].copy()

        forecast_df = count_baseline(
            available_today,
            days_in_past=days_in_past,
            days_in_future=days_in_future,
            method=method,
        )

        forecast_df = cleanse_forecast_data(forecast_df)

        if show_plots:
            CB_plot(forecast_df)

        res_df = scan(forecast_df,
                      grid_partition=grid_partition,
                      scan_type=scan_type)

        if show_plots:
            plot_region_by_rank(0,
                                res_df,
                                forecast_df,
                                plot_type="count",
                                add_legend=False)

        # Highest scoring region.
        # NOTE(review): assumes scan() returns rows ranked best-first - confirm.
        highest_region = res_df.iloc[0][region_columns].to_dict()

        # Add some Spatial analysis
        x_min = highest_region["x_min"]
        x_max = highest_region["x_max"]
        y_min = highest_region["y_min"]
        y_max = highest_region["y_max"]

        num_detectors_in_highest_region = _count_detectors_in_box(
            outbreak_df, x_min, x_max, y_min, y_max)

        # Intersection of the detected box with the true outbreak bounding box
        num_detectors_in_highest_region_and_true = _count_detectors_in_box(
            outbreak_df,
            max([x_min, ob_x_min]),
            min([x_max, ob_x_max]),
            max([y_min, ob_y_min]),
            min([y_max, ob_y_max]),
        )

        # Calculate Spatial Precision and Recall. Either is NaN when its
        # denominator is zero (empty detected region / no outbreak detectors)
        # rather than aborting the whole simulation.
        highest_region["precision"] = _safe_ratio(
            num_detectors_in_highest_region_and_true,
            num_detectors_in_highest_region)
        highest_region["recall"] = _safe_ratio(
            num_detectors_in_highest_region_and_true, num_outbreak_detectors)
        highest_region["day"] = today

        # =================================
        # How significant is today's score?
        # =================================
        highest_region["days_since_outbreak"] = (today - outbreak_start).days

        # One threshold column and one detection flag per false-positive rate,
        # e.g. "F_thresh_fp=0.05" / "detected_fp=0.05".
        for fp, label, thresh in zip(fps, fp_labels, threshs):
            highest_region["F_thresh_fp=" + label] = thresh
            highest_region["detected_fp=" + label] = is_outbreak_detected(
                hist_sim_data, highest_region["l_score_EBP"], fp_rate=fp)

        daily_highest_scoring_regions[i] = highest_region

        #  =========================
        # Simulate Database Storage
        #  =========================
        database_df = database_results(res_df)

        # Updates data correctly with most reliable average likelihood scores.
        # i.e. today is wednesday, and days_in_future = 2
        # We are getting scores for monday and tuesday, and append it in a list.
        # Now, the next day, we get scores for tuesday and wednesday. We throw
        # away the old tuesday, and keep the new one.
        num_stale = days_in_future - 1
        if len(dataframe_list) >= num_stale:
            # Explicit end index (not a negative slice) so num_stale == 0
            # keeps the whole list.
            dataframe_list = dataframe_list[:len(dataframe_list) - num_stale]

        # Split today's results by day-of-month of their start time
        days_dict = dict(
            iter(database_df.groupby(database_df["start_time_utc"].dt.day)))

        for j in range(days_in_future):
            forecast_day = (today -
                            np.timedelta64(days_in_future - j, "D")).day
            dataframe_list.append(days_dict[forecast_day])

        today += np.timedelta64(1, "D")

    # Return list of highest scoring regions too - add to plot?
    return (
        pd.concat(dataframe_list, ignore_index=True),
        pd.DataFrame.from_dict(daily_highest_scoring_regions, "index"),
    )
Пример #3
0
 def rerun_scan(self):
     """Redo the scan and aggregation steps from the stored forecast.

     ``self.forecast`` is used as-is; nothing upstream of the scan is
     recomputed here, so it must already be populated.
     """
     print('Using cached processed and forecast data to rebuild scan')
     fresh_results = scan(self.forecast, self.grid_resolution)
     self.all_results = fresh_results
     self.grid_results = database_results(fresh_results)
Пример #4
0
 def run(self):
     """Scan the stored forecast and aggregate the outcome.

     Passes ``self.forecast``, ``self.grid_resolution`` and ``self.grid_dict``
     to ``scan``; the raw output is kept in ``self.all_results`` and its
     ``database_results`` aggregation in ``self.grid_results``.
     """
     scan_inputs = (self.forecast, self.grid_resolution, self.grid_dict)
     raw_results = scan(*scan_inputs)
     self.all_results = raw_results
     self.grid_results = database_results(raw_results)