Example #1
def create_dataset_group(spec):
    """Create a dataset group from its specification."""
    group = []
    for dataset_name, selected_variables in spec.items():
        # Select the relevant dataset.
        matching_datasets = [
            d for d in original_datasets if d.__name__ == dataset_name
        ]
        if len(matching_datasets) != 1:
            raise ValueError(
                f"Expected 1 matching dataset for '{dataset_name}', "
                f"got {matching_datasets}.")
        # Instantiate the matching Dataset.
        matching_dataset = matching_datasets[0]()
        if selected_variables:
            # There are variables to select.
            group.append(
                Datasets(matching_dataset).select_variables(
                    selected_variables).dataset)
        else:
            # There is nothing to select.
            group.append(matching_dataset)
    return group
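
The lookup above resolves dataset classes by comparing each class's __name__
attribute against the spec keys. Below is a minimal, self-contained sketch of the
same pattern; the stub classes and the find_dataset_class helper are hypothetical
stand-ins for the real dataset registry.

class GFEDv4:
    pass

class HYDE:
    pass

# Hypothetical stand-in for the module-level tuple of dataset classes.
original_datasets = (GFEDv4, HYDE)

def find_dataset_class(dataset_name):
    """Resolve a dataset class by name, mirroring create_dataset_group."""
    matching = [d for d in original_datasets if d.__name__ == dataset_name]
    if len(matching) != 1:
        raise ValueError(
            f"Expected 1 matching dataset for '{dataset_name}', got {matching}.")
    return matching[0]

print(find_dataset_class("GFEDv4"))  # <class '__main__.GFEDv4'>
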
Example #2
def get_data(
    shift_months=[1, 3, 6, 9, 12, 18, 24],
    selection_variables=None,
    masks=None,
    n_months=n_months,
):
    target_variable = "GFED4 BA"

    # Variables required for the above.
    required_variables = [target_variable]

    # Dataset selection.

    selection_datasets = [
        AvitabileThurnerAGB(),
        # Copernicus_SWI(),
        ERA5_Temperature(),
        ESA_CCI_Landcover_PFT(),
        GFEDv4(),
        HYDE(),
        WWLLN(),
    ]

    # Datasets subject to temporal interpolation.
    temporal_interp_datasets = [
        Datasets(Copernicus_SWI()).select_variables(("SWI(1)", )).dataset
    ]

    # Datasets subject to interpolation and shifting.
    shift_and_interp_datasets = [
        Datasets(MOD15A2H_LAI_fPAR()).select_variables(
            ("FAPAR", "LAI")).dataset,
        Datasets(VODCA()).select_variables(("VOD Ku-band", )).dataset,
        Datasets(GlobFluo_SIF()).select_variables(("SIF", )).dataset,
    ]

    # These datasets may be shifted.
    datasets_to_shift = [
        Datasets(ERA5_DryDayPeriod()).select_variables(
            ("Dry Day Period", )).dataset
    ]

    # Determine shared temporal extent of the data.
    min_time, max_time = dataset_times(selection_datasets +
                                       temporal_interp_datasets +
                                       shift_and_interp_datasets +
                                       datasets_to_shift)[:2]
    interp_min_time, interp_max_time = dataset_times(
        temporal_interp_datasets + shift_and_interp_datasets)[:2]
    target_timespan = (
        max(min_time, interp_min_time + relativedelta(months=+n_months)),
        min(max_time, interp_max_time - relativedelta(months=+n_months)),
    )

    # Carry out the temporal NN interpolation.
    for i, dataset in enumerate(temporal_interp_datasets):
        temporal_interp_datasets[
            i] = dataset.get_temporally_interpolated_dataset(
                target_timespan, n_months)
    for i, dataset in enumerate(shift_and_interp_datasets):
        shift_and_interp_datasets[
            i] = dataset.get_temporally_interpolated_dataset(
                target_timespan, n_months)

    datasets_to_shift.extend(shift_and_interp_datasets)
    selection_datasets += datasets_to_shift
    selection_datasets += temporal_interp_datasets

    if shift_months is not None:
        for shift in shift_months:
            for shift_dataset in datasets_to_shift:
                selection_datasets.append(
                    shift_dataset.get_temporally_shifted_dataset(months=-shift,
                                                                 deep=False))

    if selection_variables is None:
        selection_variables = [
            "AGB Tree",
            "Diurnal Temp Range",
            "Dry Day Period",
            f"FAPAR {n_months}NN",
            f"LAI {n_months}NN",
            "Max Temp",
            f"SIF {n_months}NN",
            f"SWI(1) {n_months}NN",
            "ShrubAll",
            "TreeAll",
            f"VOD Ku-band {n_months}NN",
            "lightning",
            "pftCrop",
            "pftHerb",
            "popd",
        ]
        if shift_months is not None:
            for shift in shift_months:
                selection_variables.extend([
                    f"LAI {n_months}NN {-shift} Month",
                    f"FAPAR {n_months}NN {-shift} Month",
                    f"Dry Day Period {-shift} Month",
                    f"VOD Ku-band {n_months}NN {-shift} Month",
                    f"SIF {n_months}NN {-shift} Month",
                ])

    selection_variables = list(
        set(selection_variables).union(required_variables))

    selection = Datasets(selection_datasets).select_variables(
        selection_variables)
    (
        endog_data,
        exog_data,
        master_mask,
        filled_datasets,
        masked_datasets,
        land_mask,
    ) = data_processing(
        selection,
        which="climatology",
        transformations={},
        deletions=[],
        use_lat_mask=False,
        use_fire_mask=False,
        target_variable=target_variable,
        masks=masks,
    )
    return (
        endog_data,
        exog_data,
        master_mask,
        filled_datasets,
        masked_datasets,
        land_mask,
    )
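
The target_timespan computed above trims n_months from either end of the
interpolated datasets' shared extent and intersects that with the overall extent,
presumably so the nearest-neighbour filling always has data on both sides of the
window. A standalone sketch of that clamping arithmetic, using illustrative dates
in place of the dataset_times() results:

from datetime import datetime

from dateutil.relativedelta import relativedelta

n_months = 3  # Illustrative NN interpolation window.

# Hypothetical extents standing in for the dataset_times() output.
min_time, max_time = datetime(2010, 1, 1), datetime(2015, 4, 1)
interp_min_time, interp_max_time = datetime(2009, 6, 1), datetime(2015, 12, 1)

target_timespan = (
    max(min_time, interp_min_time + relativedelta(months=+n_months)),
    min(max_time, interp_max_time - relativedelta(months=+n_months)),
)
print(target_timespan)  # 2010-01-01 to 2015-04-01
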
Example #3
def get_data(shift_months=[1, 3, 6, 9, 12, 18, 24],
             selection_variables=None,
             masks=None):
    target_variable = "GFED4 BA"

    # Variables required for the above.
    required_variables = [target_variable]

    # Dataset selection.

    selection_datasets = [
        AvitabileThurnerAGB(),
        Copernicus_SWI(),
        ERA5_Temperature(),
        ESA_CCI_Landcover_PFT(),
        GFEDv4(),
        HYDE(),
        WWLLN(),
    ]
    # These datasets will potentially be shifted.
    datasets_to_shift = [
        ERA5_DryDayPeriod(),
        MOD15A2H_LAI_fPAR(),
        VODCA(),
        GlobFluo_SIF(),
    ]
    selection_datasets += datasets_to_shift
    if shift_months is not None:
        for shift in shift_months:
            for shift_dataset in datasets_to_shift:
                selection_datasets.append(
                    shift_dataset.get_temporally_shifted_dataset(months=-shift,
                                                                 deep=False))

    if selection_variables is None:
        selection_variables = [
            "AGB Tree",
            "Diurnal Temp Range",
            "Dry Day Period",
            "FAPAR",
            "LAI",
            "Max Temp",
            "SIF",
            "SWI(1)",
            "ShrubAll",
            "TreeAll",
            "VOD Ku-band",
            "lightning",
            "pftCrop",
            "pftHerb",
            "popd",
        ]
        if shift_months is not None:
            for shift in shift_months:
                selection_variables.extend([
                    f"LAI {-shift} Month",
                    f"FAPAR {-shift} Month",
                    f"Dry Day Period {-shift} Month",
                    f"VOD Ku-band {-shift} Month",
                    f"SIF {-shift} Month",
                ])

    selection_variables = list(
        set(selection_variables).union(required_variables))

    selection = Datasets(selection_datasets).select_variables(
        selection_variables)
    (
        endog_data,
        exog_data,
        master_mask,
        filled_datasets,
        masked_datasets,
        land_mask,
    ) = data_processing(
        selection,
        which="climatology",
        transformations={},
        deletions=[],
        use_lat_mask=False,
        use_fire_mask=False,
        target_variable=target_variable,
        masks=masks,
    )
    return (
        endog_data,
        exog_data,
        master_mask,
        filled_datasets,
        masked_datasets,
        land_mask,
    )
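
The shifted datasets appended above are selected later through the
"<variable> <-shift> Month" naming convention that get_data mirrors when it
extends selection_variables. A short sketch of how that list expands for the
default shifts:

shift_months = [1, 3, 6, 9, 12, 18, 24]
base_variables = ["LAI", "FAPAR", "Dry Day Period", "VOD Ku-band", "SIF"]

shifted_selection = [
    # Mirrors the f"{var} {-shift} Month" names built above.
    f"{var} {-shift} Month"
    for shift in shift_months
    for var in base_variables
]

print(len(shifted_selection))  # 35 shifted variables.
print(shifted_selection[:2])  # ['LAI -1 Month', 'FAPAR -1 Month']
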
Example #4
def _nn_basis_func(
    *,
    check_max_time,
    check_min_time,
    check_shift_min_time,
    exp_features,
    normal_mask_ignore_n,
    shift_mask_ignore_n,
    max_time=None,
    min_time=None,
    n_months,
    normal_n_time,
    shift_n_time,
    spec_datasets_to_shift,
    spec_selection_datasets,
    spec_shift_and_interp_datasets,
    spec_temporal_interp_datasets,
    target_var,
    which,
    all_shifted_variables=variable.shifted_variables,
    # Store this at definition time, since Dataset.datasets changes as new
    # datasets (e.g. filled datasets) are derived from the original datasets.
    original_datasets=tuple(
        sorted(Dataset.datasets, key=attrgetter("__name__"))),
):
    target_variable = target_var.name

    required_variables = [target_variable]

    shifted_variables = {var.parent for var in exp_features if var.shift != 0}
    assert all(shifted_var in all_shifted_variables
               for shifted_var in shifted_variables)

    shift_months = [
        shift for shift in sorted({var.shift
                                   for var in exp_features}) if shift != 0
    ]

    def year_month_datetime(dt):
        """Use only year and month information to construct a datetime."""
        return datetime(dt.year, dt.month, 1)

    def create_dataset_group(spec):
        """Create a dataset group from its specification."""
        group = []
        for dataset_name, selected_variables in spec.items():
            # Select the relevant dataset.
            matching_datasets = [
                d for d in original_datasets if d.__name__ == dataset_name
            ]
            if len(matching_datasets) != 1:
                raise ValueError(
                    f"Expected 1 matching dataset for '{dataset_name}', "
                    f"got {matching_datasets}.")
            # Instantiate the matching Dataset.
            matching_dataset = matching_datasets[0]()
            if selected_variables:
                # There are variables to select.
                group.append(
                    Datasets(matching_dataset).select_variables(
                        selected_variables).dataset)
            else:
                # There is nothing to select.
                group.append(matching_dataset)
        return group

    selection_datasets = create_dataset_group(spec_selection_datasets)
    temporal_interp_datasets = create_dataset_group(
        spec_temporal_interp_datasets)
    shift_and_interp_datasets = create_dataset_group(
        spec_shift_and_interp_datasets)
    datasets_to_shift = create_dataset_group(spec_datasets_to_shift)

    all_datasets = (selection_datasets + temporal_interp_datasets +
                    shift_and_interp_datasets + datasets_to_shift)

    # Determine shared temporal extent of the data.
    _min_time, _max_time, _times_df = dataset_times(all_datasets)

    print(_times_df)

    if min_time is None:
        min_time = _min_time
    if max_time is None:
        max_time = _max_time

    assert min_time >= _min_time
    assert max_time <= _max_time

    if shift_months:
        _shift_min_time = year_month_datetime(min_time) - relativedelta(
            months=shift_months[-1])
        shift_min_time = PartialDateTime(year=_shift_min_time.year,
                                         month=_shift_min_time.month)
    else:
        shift_min_time = min_time

    # Sanity check.
    assert min_time == check_min_time
    assert shift_min_time == check_shift_min_time
    assert max_time == check_max_time

    for dataset in datasets_to_shift:
        # Apply longer time limit to the datasets to be shifted.
        dataset.limit_months(shift_min_time, max_time)

        for cube in dataset:
            assert cube.shape[0] == shift_n_time

    for dataset in selection_datasets:
        # Apply time limit.
        dataset.limit_months(min_time, max_time)

        if dataset.frequency == "monthly":
            for cube in dataset:
                assert cube.shape[0] == normal_n_time

    for dataset in shift_and_interp_datasets:
        # Apply longer time limit to the datasets to be shifted.
        dataset.limit_months(
            year_month_datetime(shift_min_time) -
            relativedelta(months=+n_months),
            year_month_datetime(max_time) + relativedelta(months=+n_months),
        )

        for cube in dataset:
            assert cube.shape[0] == shift_n_time + 2 * n_months

    for dataset in temporal_interp_datasets:
        # Apply time limit.
        dataset.limit_months(
            year_month_datetime(min_time) - relativedelta(months=+n_months),
            year_month_datetime(max_time) + relativedelta(months=+n_months),
        )

        if dataset.frequency == "monthly":
            for cube in dataset:
                assert cube.shape[0] == normal_n_time + 2 * n_months

    for dataset in all_datasets:
        # Regrid each dataset to the common grid.
        dataset.regrid()

    # Calculate and apply the shared mask.
    total_masks = []

    for dataset in temporal_interp_datasets:
        for cube in dataset.cubes:
            # Ignore areas that are always masked, e.g. water.
            ignore_mask = np.all(cube.data.mask, axis=0)
            # Also ignore those areas with low data availability.
            ignore_mask |= np.sum(cube.data.mask,
                                  axis=0) > normal_mask_ignore_n
            total_masks.append(ignore_mask)

    for dataset in shift_and_interp_datasets:
        for cube in dataset.cubes:
            # Ignore areas that are always masked, e.g. water.
            ignore_mask = np.all(cube.data.mask, axis=0)
            # Also ignore those areas with low data availability.
            ignore_mask |= np.sum(cube.data.mask, axis=0) > shift_mask_ignore_n
            total_masks.append(ignore_mask)

    combined_mask = reduce(np.logical_or, total_masks)

    # Apply mask to all datasets.
    for dataset in all_datasets:
        dataset.apply_masks(combined_mask)

    # Carry out the nearest-neighbour filling.
    for i, dataset in enumerate(temporal_interp_datasets):
        temporal_interp_datasets[
            i] = dataset.get_temporally_interpolated_dataset(
                target_timespan=tuple(
                    map(year_month_datetime, (min_time, max_time))),
                n_months=n_months,
                verbose=True,
            )
    for i, dataset in enumerate(shift_and_interp_datasets):
        shift_and_interp_datasets[
            i] = dataset.get_temporally_interpolated_dataset(
                target_timespan=tuple(
                    map(year_month_datetime, (shift_min_time, max_time))),
                n_months=n_months,
                verbose=True,
            )

    datasets_to_shift.extend(shift_and_interp_datasets)
    selection_datasets += datasets_to_shift
    selection_datasets += temporal_interp_datasets

    if shift_months is not None:
        for shift in shift_months:
            for shift_dataset in datasets_to_shift:
                # Remove any temporal coordinates other than 'time' here if needed,
                # since these would otherwise become misaligned when the data is
                # shifted below.
                for cube in shift_dataset:
                    for prune_coord in ("month_number", "year"):
                        if cube.coords(prune_coord):
                            cube.remove_coord(prune_coord)

                selection_datasets.append(
                    shift_dataset.get_temporally_shifted_dataset(months=-shift,
                                                                 deep=False))

    selection_variables = list(
        set(map(lambda v: v.get_standard().raw_nn_filled,
                exp_features)).union(required_variables))

    selection = Datasets(selection_datasets).select_variables(
        selection_variables)
    (
        endog_data,
        exog_data,
        master_mask,
        _,  # We don't need the `filled_datasets`.
        masked_datasets,
        land_mask,
    ) = data_processing(
        selection,
        which=which,
        transformations={},
        deletions=[],
        use_lat_mask=False,
        use_fire_mask=False,
        target_variable=target_variable,
        masks=None,
    )

    def _pandas_string_labels_to_variables(
        x,
        target_var,
        all_features=selected_features[Experiment.ALL],
    ):
        """Transform series names or columns labels to variable.Variable instances."""

        all_variables = tuple(
            # Get the instantaneous variables corresponding to all variables.
            list(map(methodcaller("get_standard"), all_features)) +
            [target_var])
        all_variable_names = tuple(
            map(attrgetter("raw_nn_filled"), all_variables))
        if isinstance(x, pd.Series):
            x.name = all_variables[all_variable_names.index(x.name)]
        elif isinstance(x, pd.DataFrame):
            x.columns = [
                all_variables[all_variable_names.index(c)] for c in x.columns
            ]
        else:
            raise TypeError(
                f"Expected either a pandas.Series or pandas.DataFrame. Got '{x}'."
            )

    _pandas_string_labels_to_variables(endog_data, target_var)
    _pandas_string_labels_to_variables(exog_data, target_var)

    assert exog_data.shape[1] == len(exp_features)

    # Calculate anomalies for large lags.
    # NOTE: Modifies `exog_data` inplace.
    to_delete = []

    for var in exog_data:
        if var.shift < 12:
            continue

        new_var = var.get_offset()
        comp_var = variable.get_matching(exog_data.columns,
                                         name=new_var.name,
                                         shift=new_var.comp_shift)
        print(f"{var} - {comp_var} -> {new_var}")
        exog_data[new_var] = exog_data[var] - exog_data[comp_var]
        to_delete.append(var)

    for column in to_delete:
        del exog_data[column]

    # Check again.
    assert exog_data.shape[1] == len(exp_features)

    return (
        endog_data,
        exog_data,
        master_mask,
        masked_datasets,
        land_mask,
        set(exog_data.columns),
    )
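
The shared-mask step above folds the per-cube ignore masks into a single array
with reduce(np.logical_or, total_masks). A self-contained numpy sketch of that
availability-based masking, using toy (time, space) boolean masks in place of the
iris cube masks:

from functools import reduce

import numpy as np

mask_ignore_n = 1  # Tolerate at most 1 masked month per pixel (illustrative).

# Toy (time, space) masks standing in for cube.data.mask.
cube_masks = [
    np.array([[True, False, False], [True, True, False]]),
    np.array([[False, False, True], [False, False, False]]),
]

total_masks = []
for cube_mask in cube_masks:
    # Ignore areas that are always masked, e.g. water.
    ignore_mask = np.all(cube_mask, axis=0)
    # Also ignore those areas with low data availability.
    ignore_mask |= np.sum(cube_mask, axis=0) > mask_ignore_n
    total_masks.append(ignore_mask)

combined_mask = reduce(np.logical_or, total_masks)
print(combined_mask)  # [ True False False]
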
Example #5
def get_data(
    shift_months=[1, 3, 6, 9, 12, 18, 24],
    selection_variables=None,
    masks=None,
    n_months=n_months,
):
    target_variable = "GFED4 BA"

    # Variables required for the above.
    required_variables = [target_variable]

    # Dataset selection.

    selection_datasets = [
        AvitabileThurnerAGB(),
        ERA5_Temperature(),
        ESA_CCI_Landcover_PFT(),
        GFEDv4(),
        HYDE(),
        WWLLN(),
    ]

    # Datasets subject to temporal interpolation (filling).
    temporal_interp_datasets = [
        Datasets(Copernicus_SWI()).select_variables(("SWI(1)",)).dataset
    ]

    # Datasets subject to temporal interpolation and shifting.
    shift_and_interp_datasets = [
        Datasets(MOD15A2H_LAI_fPAR()).select_variables(("FAPAR", "LAI")).dataset,
        Datasets(VODCA()).select_variables(("VOD Ku-band",)).dataset,
        Datasets(GlobFluo_SIF()).select_variables(("SIF",)).dataset,
    ]

    # Datasets subject to temporal shifting.
    datasets_to_shift = [
        Datasets(ERA5_DryDayPeriod()).select_variables(("Dry Day Period",)).dataset
    ]

    all_datasets = (
        selection_datasets
        + temporal_interp_datasets
        + shift_and_interp_datasets
        + datasets_to_shift
    )

    # Determine shared temporal extent of the data.
    min_time, max_time = dataset_times(all_datasets)[:2]
    shift_min_time = min_time - relativedelta(years=2)

    interp_min_time, interp_max_time = dataset_times(
        temporal_interp_datasets + shift_and_interp_datasets
    )[:2]
    target_timespan = (
        max(shift_min_time, interp_min_time + relativedelta(months=+n_months)),
        min(max_time, interp_max_time - relativedelta(months=+n_months)),
    )

    # Sanity check.
    assert min_time == datetime(2010, 1, 1)
    assert shift_min_time == datetime(2008, 1, 1)
    assert max_time == datetime(2015, 4, 1)

    # Carry out the temporal NN interpolation.
    for datasets in (temporal_interp_datasets, shift_and_interp_datasets):
        for i, dataset in enumerate(datasets):
            datasets[i] = dataset.get_temporally_interpolated_dataset(
                target_timespan, n_months
            )

    datasets_to_shift.extend(shift_and_interp_datasets)
    selection_datasets += datasets_to_shift
    selection_datasets += temporal_interp_datasets

    if shift_months is not None:
        for shift in shift_months:
            for shift_dataset in datasets_to_shift:
                selection_datasets.append(
                    shift_dataset.get_temporally_shifted_dataset(
                        months=-shift, deep=False
                    )
                )

    if selection_variables is None:
        selection_variables = get_filled_names(
            [
                "AGB Tree",
                "Diurnal Temp Range",
                "Dry Day Period",
                "FAPAR",
                "LAI",
                "Max Temp",
                "SIF",
                "SWI(1)",
                "ShrubAll",
                "TreeAll",
                "VOD Ku-band",
                "lightning",
                "pftCrop",
                "pftHerb",
                "popd",
            ]
        )
        if shift_months is not None:
            for shift in shift_months:
                selection_variables.extend(
                    [
                        f"{var} {-shift} Month"
                        for var in get_filled_names(
                            ["LAI", "FAPAR", "Dry Day Period", "VOD Ku-band", "SIF"]
                        )
                    ]
                )

    selection_variables = list(set(selection_variables).union(required_variables))

    selection = Datasets(selection_datasets).select_variables(selection_variables)

    # Ensure correct number of samples (in time).
    overall_min_time, overall_max_time = dataset_times(selection)[:2]
    for dataset in selection:
        dataset.limit_months(overall_min_time, overall_max_time)

        if dataset.frequency == "monthly":
            for cube in dataset:
                assert cube.shape[0] == 61

    (
        endog_data,
        exog_data,
        master_mask,
        filled_datasets,
        masked_datasets,
        land_mask,
    ) = data_processing(
        selection,
        which="climatology",
        transformations={},
        deletions=[],
        use_lat_mask=False,
        use_fire_mask=False,
        target_variable=target_variable,
        masks=masks,
    )
    return (
        endog_data,
        exog_data,
        master_mask,
        filled_datasets,
        masked_datasets,
        land_mask,
    )
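
The final loop asserts 61 monthly samples once dataset_times is recomputed over
the full selection; the shifted datasets presumably start later than the
originals, so the shared window is narrower than the Jan 2010 to Apr 2015 extent
asserted earlier. A small helper (hypothetical, not from the source) showing the
inclusive month counting behind such shape checks:

from datetime import datetime

def n_monthly_samples(start, end):
    """Inclusive count of monthly samples between two month-resolution dates."""
    return (end.year - start.year) * 12 + (end.month - start.month) + 1

# The sanity checks above pin the unshifted extent to Jan 2010 - Apr 2015.
print(n_monthly_samples(datetime(2010, 1, 1), datetime(2015, 4, 1)))  # 64
# A window starting three months later matches the 61 asserted above.
print(n_monthly_samples(datetime(2010, 4, 1), datetime(2015, 4, 1)))  # 61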