Example #1
def plot_shap_value_maps(
    X_train,
    masked_shap_arrs,
    masked_shap_arrs_std,
    vmin,
    vmax,
    std_vmin,
    std_vmax,
    map_figure_saver,
):
    for i, feature in enumerate(tqdm(X_train.columns, desc="Mapping SHAP values")):
        fig = cube_plotting(
            masked_shap_arrs[i],
            fig=plt.figure(figsize=(5.1, 2.8)),
            title=f"Mean SHAP value for '{shorten_features(feature)}'",
            cmap="Spectral_r",
            nbins=7,
            cmap_midpoint=0,
            cmap_symmetric=True,
            vmin=vmin,
            vmax=vmax,
            log=True,
            log_auto_bins=False,
            min_edge=1e-3,
            extend="neither",
            colorbar_kwargs={
                "format": "%0.1e",
                "label": f"SHAP ('{shorten_features(feature)}')",
            },
            coastline_kwargs={"linewidth": 0.3},
        )
        map_figure_saver.save_figure(
            fig, f"shap_value_map_{feature}", sub_directory="shap_map"
        )
        fig = cube_plotting(
            masked_shap_arrs_std[i],
            fig=plt.figure(figsize=(5.1, 2.8)),
            title=f"STD SHAP value for '{shorten_features(feature)}'",
            cmap="YlOrRd",
            nbins=7,
            vmin=std_vmin,
            vmax=std_vmax,
            log=True,
            log_auto_bins=False,
            min_edge=1e-3,
            extend="neither",
            colorbar_kwargs={
                "format": "%0.1e",
                "label": f"SHAP ('{shorten_features(feature)}')",
            },
            coastline_kwargs={"linewidth": 0.3},
        )
        map_figure_saver.save_figure(
            fig, f"shap_value_std_map_{feature}", sub_directory="shap_map_std"
        )
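
A minimal usage sketch for the function above (illustrative, not from the original source): the masked mean/std arrays are assumed to have been computed beforehand, one per column of `X_train`, e.g. by mapping per-sample SHAP values back onto the masked grid with a helper like `get_map_data` (used in the later examples) and reducing over the time axis. Shared colour limits keep the maps comparable across features.

import numpy as np

# Hypothetical preprocessing: shap_values has shape (n_samples, n_features).
masked_shap_arrs = []
masked_shap_arrs_std = []
for i in range(len(X_train.columns)):
    mapped = get_map_data(shap_values[:, i], master_mask)  # (time, lat, lon)
    masked_shap_arrs.append(np.ma.mean(mapped, axis=0))
    masked_shap_arrs_std.append(np.ma.std(mapped, axis=0))

# Shared, symmetric colour limits across all features.
vmax = max(np.ma.abs(arr).max() for arr in masked_shap_arrs)
vmin = -vmax
std_vmin = min(arr.min() for arr in masked_shap_arrs_std)
std_vmax = max(arr.max() for arr in masked_shap_arrs_std)

plot_shap_value_maps(
    X_train,
    masked_shap_arrs,
    masked_shap_arrs_std,
    vmin,
    vmax,
    std_vmin,
    std_vmax,
    map_figure_saver,
)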
Example #2
def outputs_plotting(thres, outputs):
    """Plotting of fire season statistics.

    Args:
        thres (float): Threshold used to generate the data.
        outputs: Output of `wildfires.analysis.thres_fire_season_stats`.

    """
    enable_logging()
    FigureSaver.debug = True
    FigureSaver.directory = os.path.join(os.path.expanduser("~"), "tmp",
                                         "fire_season")
    os.makedirs(FigureSaver.directory, exist_ok=True)

    for dataset_outputs in outputs:
        name = dataset_outputs[0]
        starts = dataset_outputs[1]
        ends = dataset_outputs[2]
        sizes = dataset_outputs[3]
        # Index 4 is not used by this function.
        fractions = dataset_outputs[5]

        for plot_type, data, cmap, boundaries in zip(
            ("start (month)", "end (month)", "length (months)", "fraction (1)"),
            (starts, ends, sizes, fractions),
            # Cyclic colormap for the cyclic start/end months.
            ("twilight", "twilight", "brewer_RdYlBu_11_r", "brewer_RdYlBu_11_r"),
            (np.arange(0, 12), np.arange(0, 12), np.arange(0, 12), None),
        ):
            with FigureSaver(
                    f"{name}_thres_{str(thres).replace('.', '_')}_{plot_type}"
            ):
                mpl.rc("figure", figsize=(7.4, 3.3))
                cube_plotting(
                    data,
                    coastline_kwargs=dict(linewidth=0.5),
                    cmap=cmap,
                    label=plot_type,
                    title=name,
                    boundaries=boundaries,
                )
            # Close all figures after saving.
            plt.close("all")
Example #3

def outputs_plotting(thres, outputs):
    """Plotting of fire season fractions in South Africa.

    Args:
        thres (float): Threshold used to generate the data.
        outputs: Output of `wildfires.analysis.thres_fire_season_stats`.

    """
    enable_logging()
    FigureSaver.debug = True
    FigureSaver.directory = os.path.join(os.path.expanduser("~"), "tmp",
                                         "south_africa_fire_season_fraction")
    os.makedirs(FigureSaver.directory, exist_ok=True)

    for dataset_outputs in outputs:
        name = dataset_outputs[0]
        # Optionally restrict plotting to a single dataset, e.g.:
        # if name != "GFEDv4":
        #     continue
        fractions = dataset_outputs[5]

        for plot_type, data, cmap, boundaries in zip(
            ("fraction (1)",),
            (fractions,),
            ("brewer_RdYlBu_11_r",),
            (None,),
        ):
            # Restrict the data to South Africa.
            data.mask |= get_south_africa_mask()
            with FigureSaver(
                    f"{name}_thres_{str(thres).replace('.', '_')}_{plot_type}"
            ):
                mpl.rc("figure", figsize=(7.4, 3.3))
                cube_plotting(
                    data,
                    coastline_kwargs=dict(linewidth=0.5),
                    cmap=cmap,
                    label=plot_type,
                    title=name,
                    boundaries=boundaries,
                    select_valid=True,
                )
            # Close all figures after saving.
            plt.close("all")
Example #4
def buffered_leave_one_out(
    exog_data,
    endog_data,
    master_mask,
    radius,
    max_rad,
    seed=0,
    max_tries=50,
    extrapolation_check=False,
    verbose=False,
    dpi=400,
):
    """Split data with a single test sample surrounded by ignored data.

    Data is excluded using a given number of radii up to a given maximum radius (in
    units of pixels, i.e. grid cells).

    Args:
        exog_data, endog_data (pd.DataFrame, pd.Series): Predictor and target
            variables.
        master_mask (numpy.ndarray): Mask controlling mapping from `exog_data`,
            `endog_data` to mapped data.
        radius (float): Radius (number of pixels) to exclude.
        max_rad (float): Maximum radius to be attempted.
        seed (int): Random number generator seed used to dictate where test samples
            are located.
        max_tries (int): Number of allowed attempts to find a test sample that
            lies within the range of the train observations, given the maximum
            excluded radius `max_rad`.
        extrapolation_check (bool): If True, require that no extrapolation is done
            during testing relative to the training set.
        verbose (bool): Plot the training and test masks.
        dpi (int): Figure dpi. Only used if `verbose` is True.

    Returns:
        test_index (numpy.ndarray): (row, column) index of the test sample.
        n_ignored (int): Number of ignored samples.
        n_train (int): Number of train samples.
        n_hold_out (int): Number of test samples.
        total_samples (int): Total number of samples.
        hold_out_y: Test target data.
        predicted_y: Model predictions for the test sample.

    Raises:
        ValueError: If `master_mask` is not a rank 3 array.
        ValueError: If `master_mask` is not identical across each slice along its
            first (temporal) dimension.
        ValueError: If `radius` is larger than `max_rad`.
        ValueError: If fewer training samples are available than required.
        RuntimeError: If no suitable test site can be found within `max_tries`.

    """
    if master_mask.ndim != 3:
        raise ValueError(f"Expected a rank 3 array, got: {master_mask.ndim}.")

    if not np.all(np.all(master_mask[:1] == master_mask, axis=0)):
        raise ValueError("'master_mask' was not identical across each temporal slice.")

    if radius > (max_rad + 1e-7):
        raise ValueError("'radius' was larger than 'max_rad'")

    rng = np.random.default_rng(seed)

    collapsed_master_mask = master_mask[0]
    single_total_samples = np.sum(~collapsed_master_mask)
    total_samples = single_total_samples * master_mask.shape[0]

    possible_indices = np.array(list(zip(*np.where(~collapsed_master_mask))))

    def get_structure(radius):
        # Generate a rank 2 structure.
        N = math.ceil(radius * 2)
        if N % 2 == 0:
            # Ensure there is an odd number of elements. This results in a symmetric
            # structure.
            N += 1

        if N > 1:
            # Calculate the differences to the central index.
            diffs = (np.arange(N) - N // 2) ** 2
            structure = np.sqrt(diffs[np.newaxis] + diffs[:, np.newaxis]) <= (
                radius + 1e-7
            )
        else:
            structure = np.array([[True]])

        # Trim excess False elements.
        if not np.any(structure[0]):
            structure = structure[1:]
        if not np.any(structure[-1]):
            structure = structure[:-1]
        if not np.any(structure[:, 0]):
            structure = structure[:, 1:]
        if not np.any(structure[:, -1]):
            structure = structure[:, :-1]

        if verbose:
            plt.figure()
            plt.imshow(structure, cmap="Greys", vmin=0, vmax=1)
            plt.axis("off")
            plt.title(f"N={N}, Total={np.sum(structure)}")

        return structure

    structure = get_structure(radius)
    max_rad_structure = get_structure(max_rad)

    n_train_samples = single_total_samples - np.sum(max_rad_structure)

    tries = 0

    while tries < max_tries:
        # Select a single test sample.
        test_indices = possible_indices[rng.integers(len(possible_indices), size=(1,))]
        hold_out_selection = np.zeros_like(collapsed_master_mask)
        hold_out_selection[(test_indices[:, 0], test_indices[:, 1])] = True

        # Select data around the test sample to ignore.
        ignored_data = apply_structure(hold_out_selection, structure) & (
            ~hold_out_selection
        )
        max_rad_ignored_data = apply_structure(
            hold_out_selection, max_rad_structure
        ) & (~hold_out_selection)

        # The remaining data is then available for training.
        possible_train_selection = (
            ~(hold_out_selection | ignored_data) & ~collapsed_master_mask
        )
        max_rad_possible_train_selection = (
            ~(hold_out_selection | max_rad_ignored_data) & ~collapsed_master_mask
        )

        possible_train_indices = np.array(
            list(zip(*np.where(possible_train_selection)))
        )

        if len(possible_train_indices) < n_train_samples:
            raise ValueError(
                f"Need at least {n_train_samples} samples, but only have "
                f"{len(possible_train_indices)}."
            )

        # Select train data.
        train_indices = possible_train_indices[
            rng.choice(
                np.arange(len(possible_train_indices)),
                size=n_train_samples,
                replace=False,
            )
        ]

        train_selection = np.zeros_like(collapsed_master_mask)
        train_selection[(train_indices[:, 0], train_indices[:, 1])] = True

        max_rad_train_selection = train_selection & max_rad_possible_train_selection

        # Apply the master_mask to the training and test data to arrive at the final 3D mask.
        train_selection = train_selection[None] & (~master_mask)
        hold_out_selection = hold_out_selection[None] & (~master_mask)

        max_rad_train_selection = max_rad_train_selection[None] & (~master_mask)

        if verbose:
            # Plot a map of the selections.
            mask_vis = np.zeros_like(master_mask, dtype=np.int32)
            mask_vis[hold_out_selection] = 1
            mask_vis[train_selection] = 2
            cube_plotting(
                np.mean(mask_vis, axis=0),
                title=str(seed),
                fig=plt.figure(dpi=dpi),
            )

        # Transform X, y to 3D arrays before selecting using the above masks.
        mm_endog = get_map_data(endog_data.values, master_mask)
        train_y = mm_endog.data[train_selection]
        hold_out_y = mm_endog.data[hold_out_selection]

        # Repeat for all columns in X.
        train_X_data = {}
        hold_out_X_data = {}

        max_rad_train_X_data = {}

        for col in exog_data.columns:
            mm_x_col = get_map_data(exog_data[col].values, master_mask)
            train_X_data[col] = mm_x_col.data[train_selection]
            hold_out_X_data[col] = mm_x_col.data[hold_out_selection]

            max_rad_train_X_data[col] = mm_x_col.data[max_rad_train_selection]

        train_X = pd.DataFrame(train_X_data)
        hold_out_X = pd.DataFrame(hold_out_X_data)

        max_rad_train_X = pd.DataFrame(max_rad_train_X_data)

        # Verify that the test data for the largest radius lie within the range
        # of observations in the train data: for each test sample, require at
        # least one train sample where ALL variables are >= the test values, and
        # at least one train sample where ALL variables are <= the test values.
        for test_values in hold_out_X.values:
            if extrapolation_check and (
                not np.any(np.all(test_values <= max_rad_train_X.values, axis=1))
                or not np.any(np.all(test_values >= max_rad_train_X.values, axis=1))
            ):
                logger.warning("Data range test failed.")
                break
        else:
            # If the test above passed, i.e. 'break' was never encountered.
            n_ignored = np.sum(ignored_data[None] & (~master_mask))
            n_train = np.sum(train_selection)
            n_hold_out = np.sum(hold_out_selection)

            assert np.sum(max_rad_train_selection) <= n_train

            predicted_y = threading_get_model_predict(
                X_train=train_X,
                y_train=train_y,
                predict_X=hold_out_X,
            )

            return (
                test_indices[0],
                n_ignored,
                n_train,
                n_hold_out,
                total_samples,
                hold_out_y,
                predicted_y,
            )
        tries += 1
        logger.warning(f"Trying another sample location ({tries} failed tries).")

    raise RuntimeError("No suitable site could be found.")
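
`apply_structure` is not defined in this snippet. Given that Example #5 performs the same exclusion step with `ndimage.binary_dilation`, a minimal sketch under the assumption that it is a thin wrapper:

from scipy import ndimage

def apply_structure(selection, structure):
    # Assumed helper: dilate the boolean selection with the given footprint,
    # mirroring the ndimage.binary_dilation call in Example #5.
    return ndimage.binary_dilation(selection, structure=structure)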
Example #5
def random_binary_dilation_split(
    exog_data,
    endog_data,
    structure,
    master_mask,
    test_frac=0.05,
    train_frac=None,
    seed=0,
    verbose=False,
    dpi=400,
):
    """Split data with the test data surrounded by ignored data.

    The shape and quantity of ignored data is dictated by `structure`.

    Note: A dpi of roughly 1400 is needed to see the divisions clearly when
    `verbose=True`.

    Args:
        exog_data, endog_data (pd.DataFrame, pd.Series): Predictor and target
            variables.
        structure ((N, N) numpy.ndarray): Structure used to dictate ignored data
            around the test samples.
        master_mask (numpy.ndarray): Mask controlling mapping from `exog_data`,
            `endog_data` to mapped data.
        test_frac (float): Fraction of samples to reserve for testing.
        train_frac (float or None): Fraction of samples to use for training. If `None`
            is given, all possible samples will be used.
        seed (int): Random number generator seed used to dictate where test samples
            are located.
        verbose (bool): Plot the training and test masks.
        dpi (int): Figure dpi. Only used if `verbose` is True.

    Returns:
        desc_str: Descriptive string.
        (total_samples, n_ignored, n_train, n_hold_out): Split statistics.
        train_X: Training predictor data.
        hold_out_X: Test predictor data.
        train_y: Training target data.
        hold_out_y: Test target data.

    Raises:
        ValueError: If `master_mask` is not a rank 3 array.
        ValueError: If `master_mask` is not identical across each slice along its
            first (temporal) dimension.
        ValueError: If `train_frac` cannot be satisfied, e.g. because too many samples
            are being used for testing and exclusion zones around test samples.

    """
    if master_mask.ndim != 3:
        raise ValueError(f"Expected a rank 3 array, got: {master_mask.ndim}.")

    if not np.all(np.all(master_mask[:1] == master_mask, axis=0)):
        raise ValueError("'master_mask' was not identical across each temporal slice.")

    rng = np.random.default_rng(seed)
    collapsed_master_mask = master_mask[0]
    single_total_samples = np.sum(~collapsed_master_mask)
    total_samples = single_total_samples * master_mask.shape[0]

    possible_indices = np.array(list(zip(*np.where(~collapsed_master_mask))))

    # Per time slice.
    n_test_samples = round(single_total_samples * test_frac)

    # Select test data.
    test_indices = possible_indices[
        rng.choice(np.arange(len(possible_indices)), size=n_test_samples, replace=False)
    ]

    hold_out_selection = np.zeros_like(collapsed_master_mask)
    hold_out_selection[(test_indices[:, 0], test_indices[:, 1])] = True

    # Select data around the test data to ignore.
    ignored_data = ndimage.binary_dilation(hold_out_selection, structure) & (
        ~hold_out_selection
    )

    # The remaining data is then used for training, depending on train_frac.
    possible_train_selection = (
        ~(hold_out_selection | ignored_data) & ~collapsed_master_mask
    )

    if train_frac is None:
        train_selection = possible_train_selection
    else:
        possible_train_indices = np.array(
            list(zip(*np.where(possible_train_selection)))
        )
        n_train_samples = round(single_total_samples * train_frac)

        if len(possible_train_indices) < n_train_samples:
            raise ValueError(
                f"Need at least {n_train_samples} samples to satisfy train_frac: "
                f"{train_frac}, but only have {len(possible_train_indices)} "
                f"({len(possible_train_indices) / single_total_samples:0.4f})."
            )

        # Select train data.
        train_indices = possible_train_indices[
            rng.choice(
                np.arange(len(possible_train_indices)),
                size=n_train_samples,
                replace=False,
            )
        ]

        train_selection = np.zeros_like(collapsed_master_mask)
        train_selection[(train_indices[:, 0], train_indices[:, 1])] = True

    # Apply the master_mask to the training and test data to arrive at the final 3D mask.
    train_selection = train_selection[None] & (~master_mask)
    hold_out_selection = hold_out_selection[None] & (~master_mask)

    if verbose:
        # Plot a map of the selections.
        mask_vis = np.zeros_like(master_mask, dtype=np.int32)
        mask_vis[hold_out_selection] = 1
        mask_vis[train_selection] = 2
        cube_plotting(
            np.mean(mask_vis, axis=0),
            title=str(seed),
            fig=plt.figure(dpi=dpi),
        )

    # Transform X, y to 3D arrays before selecting using the above masks.
    mm_endog = get_map_data(endog_data.values, master_mask)
    train_y = mm_endog.data[train_selection]
    hold_out_y = mm_endog.data[hold_out_selection]

    # Repeat for all columns in X.
    train_X_data = {}
    hold_out_X_data = {}
    for col in exog_data.columns:
        mm_x_col = get_map_data(exog_data[col].values, master_mask)
        train_X_data[col] = mm_x_col.data[train_selection]
        hold_out_X_data[col] = mm_x_col.data[hold_out_selection]

    train_X = pd.DataFrame(train_X_data)
    hold_out_X = pd.DataFrame(hold_out_X_data)

    n_ignored = np.sum(ignored_data[None] & (~master_mask))
    n_train = np.sum(train_selection)
    n_hold_out = np.sum(hold_out_selection)

    desc_str = (
        f"Total samples: {total_samples:0.1e}, "
        f"Ignored: {100 * n_ignored / total_samples: 0.1f}%, "
        f"Train: {100 * n_train / total_samples: 0.1f}%, "
        f"Test: {100 * n_hold_out / total_samples: 0.1f}%"
    )

    return (
        desc_str,
        (total_samples, n_ignored, n_train, n_hold_out),
        train_X,
        hold_out_X,
        train_y,
        hold_out_y,
    )
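
A minimal usage sketch (illustrative values; `exog_data`, `endog_data`, and `master_mask` are assumed to come from the surrounding pipeline). The disk-shaped exclusion footprint mirrors `get_structure` from Example #4:

import numpy as np

# Disk-shaped footprint with a radius of 3 pixels.
radius = 3
diffs = (np.arange(2 * radius + 1) - radius) ** 2
structure = np.sqrt(diffs[np.newaxis] + diffs[:, np.newaxis]) <= radius

desc_str, counts, train_X, hold_out_X, train_y, hold_out_y = (
    random_binary_dilation_split(
        exog_data,
        endog_data,
        structure,
        master_mask,
        test_frac=0.05,
        train_frac=0.5,
        seed=0,
    )
)
print(desc_str)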