Exemplo n.º 1
0
def experiment_step(params: Dict,
                    smoke_test: bool = False,
                    subsample: Optional[int] = None) -> pd.DataFrame:
    """Run one similarity experiment for a single parameter combination.

    Loads the datacube for ``params["variable"]``, subsets it by region and
    period, builds density cubes, standardizes them, and scores the first
    density-cube column against the remaining columns.

    Args:
        params: Experiment configuration. The keys read here are
            ``"variable"``, ``"region"`` and ``"period"`` (objects exposing
            ``.name``) and ``"dimensions"`` (object exposing ``.spatial``,
            ``.temporal`` and ``.dimensions``).
        smoke_test: Forwarded unchanged to ``get_similarity_scores``.
        subsample: Forwarded unchanged to ``get_similarity_scores``.

    Returns:
        A one-row DataFrame holding the experiment settings plus every entry
        of the mapping returned by ``get_similarity_scores``.
    """
    # ======================
    # experiment - Data
    # ======================
    # Get DataCube
    datacube = get_dataset(params["variable"])[params["variable"]]

    # subset datacube (spatially)
    datacube = select_region(xr_data=datacube,
                             bbox=params["region"])[params["variable"]]

    # subset datacube (temporally)
    # NOTE(review): .compute() presumably materializes a lazily evaluated
    # cube before the density cubes are built - confirm.
    datacube = select_period(xr_data=datacube,
                             period=params["period"]).compute()

    # get density cubes
    density_cube_df = get_density_cubes(
        data=datacube,
        spatial=params["dimensions"].spatial,
        temporal=params["dimensions"].temporal,
    )

    # Split into reference (first column, as an (n, 1) array) and the
    # remaining columns. Index the underlying array rather than the Series:
    # ``series[:, np.newaxis]`` is no longer supported by pandas >= 2.0.
    X = density_cube_df.iloc[:, 0].values[:, np.newaxis]
    Y = density_cube_df.iloc[:, 1:]

    # standardize data
    X, Y = standardizer_data(X=X, Y=Y)

    # ======================
    # experiment - Methods
    # ======================
    res = get_similarity_scores(X_ref=X,
                                Y_compare=Y,
                                smoke_test=smoke_test,
                                subsample=subsample)

    # Save Results
    results_df = pd.DataFrame(
        {
            "region": params["region"].name,
            "period": params["period"].name,
            "variable": params["variable"],
            "spatial": params["dimensions"].spatial,
            "temporal": params["dimensions"].temporal,
            "n_dimensions": params["dimensions"].dimensions,
            **res,
        },
        index=[0],
    )
    return results_df
Exemplo n.º 2
0
def experiment_step(params: Dict, smoke_test: bool = False) -> pd.DataFrame:
    """Run one similarity experiment comparing reference and density cubes.

    Loads the datacube for ``params["variable"]``, subsets it by region and
    period, derives a reference cube and density cubes from it, aligns the
    two via ``get_common_indices``, standardizes them, and computes the
    similarity scores.

    Args:
        params: Experiment configuration. The keys read here are
            ``"variable"``, ``"region"`` and ``"period"`` (objects exposing
            ``.name``), ``"spatial"`` and ``"temporal"``.
        smoke_test: Forwarded unchanged to ``get_similarity_scores``.

    Returns:
        A one-row DataFrame holding the experiment settings plus every entry
        of the mapping returned by ``get_similarity_scores``.
    """
    # ======================
    # experiment - Data
    # ======================
    # Get DataCube
    datacube = get_dataset(params["variable"])

    # subset datacube (spatially)
    datacube = select_region(xr_data=datacube, bbox=params["region"])[
        params["variable"]
    ]

    # subset datacube (temporally)
    datacube = select_period(xr_data=datacube, period=params["period"])

    # get reference cube
    reference_cube_df = get_reference_cube(data=datacube)

    # get density cubes
    density_cube_df = get_density_cubes(
        data=datacube, spatial=params["spatial"], temporal=params["temporal"]
    )

    # align the reference and density frames
    X, Y = get_common_indices(
        reference_df=reference_cube_df, density_df=density_cube_df
    )

    # standardize data
    X, Y = standardizer_data(X=X, Y=Y)

    # ======================
    # experiment - Methods
    # ======================
    res = get_similarity_scores(X_ref=X, Y_compare=Y, smoke_test=smoke_test)

    # Save Results
    results_df = pd.DataFrame(
        {
            "region": params["region"].name,
            "period": params["period"].name,
            "variable": params["variable"],
            "spatial": params["spatial"],
            "temporal": params["temporal"],
            **res,
        },
        index=[0],
    )
    return results_df
Exemplo n.º 3
0
def experiment_step(params: Dict,
                    smoke_test: bool = False,
                    subsample: Optional[int] = None) -> pd.DataFrame:
    """Estimate the RBIG entropy of the density cubes for one configuration.

    Args:
        params: Experiment configuration. The keys read here are
            ``"variable"``, ``"region"`` (either the string ``"world"`` or an
            object exposing ``.name``), ``"period"`` (object exposing
            ``.name``) and ``"dimensions"`` (object exposing ``.spatial``,
            ``.temporal`` and ``.dimensions``).
        smoke_test: When true, only the first 10,000 density-cube rows are
            kept.
        subsample: Forwarded unchanged to ``rbig_h_measures``.

    Returns:
        A one-row DataFrame with the experiment settings, the number of
        samples, the entropy value and the elapsed time of the entropy call.
    """
    variable = params["variable"]

    # ---- load the cube -------------------------------------------------
    cube = get_dataset([variable])

    # ---- spatial subset (skipped for the whole world) ------------------
    if params["region"] in ["world"]:
        region_name = "world"
    else:
        region_name = params["region"].name
        cube = select_region(xr_data=cube, bbox=params["region"])[variable]

    # ---- remove climatology; second return value is not used -----------
    cube, _ = remove_climatology(cube)

    # The "world" path never selected the variable, so do it here.
    if isinstance(cube, xr.Dataset):
        cube = cube[variable]

    # ---- temporal subset -----------------------------------------------
    cube = select_period(xr_data=cube, period=params["period"])

    # ---- density cubes -------------------------------------------------
    dims = params["dimensions"]
    cubes_df = get_density_cubes(
        data=cube,
        spatial=dims.spatial,
        temporal=dims.temporal,
    )

    if smoke_test:
        cubes_df = cubes_df.iloc[:10_000]
        logging.info(f"Total data (smoke-test): {cubes_df.shape}")

    # ---- standardize, keeping the original labels ----------------------
    scaler = StandardScaler().fit(cubes_df.values)
    normed_df = pd.DataFrame(
        data=scaler.transform(cubes_df.values),
        columns=cubes_df.columns.values,
        index=cubes_df.index,
    )

    # ---- model: Gaussianization ----------------------------------------
    started = time.time()
    entropy = rbig_h_measures(normed_df.values,
                              subsample=subsample,
                              random_state=123)
    elapsed = time.time() - started

    # ---- collect results -----------------------------------------------
    record = {
        "region": region_name,
        "period": params["period"].name,
        "variable": variable,
        "spatial": dims.spatial,
        "temporal": dims.temporal,
        "n_dimensions": dims.dimensions,
        "n_samples": normed_df.shape[0],
        "entropy": entropy,
        "time": elapsed,
    }
    return pd.DataFrame(record, index=[0])
Exemplo n.º 4
0
def experiment_step(parameters: Dict, args: argparse.Namespace,) -> pd.DataFrame:
    """Estimate the RBIG entropy of the density cubes for one configuration.

    Args:
        parameters: Experiment configuration. The keys read here are
            ``"variable"``, ``"region"`` (either the string ``"world"`` or an
            object exposing ``.name``), ``"period"`` (object exposing
            ``.name``) and ``"dimensions"`` (object exposing ``.spatial``,
            ``.temporal`` and ``.dimensions``).
        args: Command-line options. The attributes read here are
            ``resample``, ``remove_climatology``, ``subsample`` and
            ``method``.

    Returns:
        A one-row DataFrame with the experiment settings, the number of
        samples used, the entropy value and the elapsed time of the entropy
        call.
    """
    variable = parameters["variable"]

    # ---- load the cube -------------------------------------------------
    cube = get_dataset([variable])

    # ---- optional temporal resampling (mean per bin) -------------------
    if args.resample:
        cube = cube.resample(time=args.resample).mean()

    # ---- spatial subset (skipped for the whole world) ------------------
    if parameters["region"] in ["world"]:
        region_name = "world"
    else:
        region_name = parameters["region"].name
        cube = select_region(xr_data=cube, bbox=parameters["region"])[variable]

    # ---- optional climatology removal; second return value unused ------
    if args.remove_climatology:
        cube, _ = remove_climatology(cube)

    # ---- temporal subset -----------------------------------------------
    cube = select_period(xr_data=cube, period=parameters["period"])

    # ---- density cubes -------------------------------------------------
    # The "world" path never selected the variable, so do it here.
    if isinstance(cube, xr.Dataset):
        cube = cube[variable]

    dims = parameters["dimensions"]
    cubes_df = get_density_cubes(
        data=cube,
        spatial=dims.spatial,
        temporal=dims.temporal,
    )

    # ---- standardize, keeping the original labels ----------------------
    scaler = StandardScaler().fit(cubes_df.values)
    normed_df = pd.DataFrame(
        data=scaler.transform(cubes_df.values),
        columns=cubes_df.columns.values,
        index=cubes_df.index,
    )

    # ---- optional subsampling ------------------------------------------
    # ``picked`` stays None both when no subsample was requested and when
    # subset_indices itself returns None; either way all rows are used.
    picked = None
    if args.subsample is not None:
        picked = subset_indices(
            normed_df.values, subsample=args.subsample, random_state=100
        )

    if picked is None:
        samples = normed_df.values
    else:
        samples = normed_df.iloc[picked, :].values

    # ---- model: Gaussianization ----------------------------------------
    started = time.time()
    entropy = rbig_h_measures(samples, random_state=123, method=args.method)
    elapsed = time.time() - started

    # ---- collect results -----------------------------------------------
    record = {
        "region": region_name,
        "period": parameters["period"].name,
        "variable": variable,
        "spatial": dims.spatial,
        "temporal": dims.temporal,
        "n_dimensions": dims.dimensions,
        "n_samples": samples.shape[0],
        "entropy": entropy,
        "time": elapsed,
    }
    return pd.DataFrame(record, index=[0])
Exemplo n.º 5
0
def experiment_step(args: argparse.Namespace, ) -> Union[Any, Any]:
    """Fit an RBIG model on density cubes and return probability cubes.

    The parameters are derived from ``args`` via ``get_parameters``. The
    datacube is loaded, optionally resampled, subset by region and period,
    turned into density cubes and standardized. An RBIG model is fitted on
    the (optionally subsampled) data and used to predict probabilities,
    which are converted with ``get_information_cubes``.

    Args:
        args: Command-line options. The attributes read here are
            ``resample``, ``clima``, ``spatial``, ``temporal``,
            ``smoke_test``, ``subsample``, ``method``, ``add_noise`` and
            ``temporal_mean``.

    Returns:
        The object returned by ``get_information_cubes``, with its ``attrs``
        set to the parameter dictionary (including the ``"input_shape"``,
        ``"rbig_fit_time"``, ``"prob_size"`` and ``"rbig_predict_time"``
        entries added here).
    """
    logging.info("Extracting Parameters")
    parameters = get_parameters(args)
    # ======================
    # experiment - Data
    # ======================
    # Get DataCube
    logging.info(f"Loading '{parameters['variable'][0]}' variable")
    datacube = get_dataset(parameters["variable"])

    # ======================
    # RESAMPLE
    # ======================
    if args.resample:
        logging.info("Resampling datacube...")
        datacube = datacube.resample(time="1MS").mean()

    # ======================
    # SPATIAL SUBSET
    # ======================
    # Best-effort: any failure to select the region (e.g. the region has no
    # ``.name``) falls back to the full cube. ``except Exception`` keeps that
    # fallback while letting KeyboardInterrupt / SystemExit propagate, which
    # the previous bare ``except:`` swallowed.
    try:
        logging.info(f"Selecting region '{parameters['region'].name}'")
        datacube = select_region(
            xr_data=datacube,
            bbox=parameters["region"])[parameters["variable"]]
    except Exception:
        logging.info("Selecting region 'world'")

    # ======================
    # CLIMATOLOGY (TEMPORAL)
    # ======================
    if args.clima:
        logging.info("Removing climatology...")
        datacube, _ = remove_climatology(datacube)

    # ======================
    # TEMPORAL SUBSET
    # ======================
    logging.info(f"Selecting temporal period: '{parameters['period'].name}'")
    datacube = select_period(xr_data=datacube, period=parameters["period"])

    # ======================
    # DENSITY CUBES
    # ======================
    logging.info(
        f"Getting density cubes: S: {args.spatial}, T: {args.temporal}")
    if isinstance(datacube, xr.Dataset):
        datacube = datacube[parameters["variable"][0]]

    density_cube_df = get_density_cubes(
        data=datacube,
        spatial=args.spatial,
        temporal=args.temporal,
    )
    logging.info(f"Total data: {density_cube_df.shape}")

    # ======================
    # STANDARDIZE DATA
    # ======================
    logging.info("Standardizing data...")
    x_transformer = StandardScaler().fit(density_cube_df.values)

    density_cube_df_norm = pd.DataFrame(
        data=x_transformer.transform(density_cube_df.values),
        columns=density_cube_df.columns.values,
        index=density_cube_df.index,
    )

    # ======================
    # SUBSAMPLE DATA
    # ======================
    # ``X`` is what the model is fitted on; ``index`` labels the rows that
    # are later passed to the probability predictions.
    if args.smoke_test:
        logging.info("Smoke Test...")
        logging.info("Subsampling datacube...")
        idx = subset_indices(density_cube_df_norm.values,
                             subsample=1000,
                             random_state=100)
        X = density_cube_df_norm.iloc[idx, :].values
        # Smoke test predicts on the subsample, so keep the matching index.
        index = density_cube_df_norm.iloc[idx, :].index
    elif args.subsample is not None:
        logging.info("Subsampling datacube...")
        idx = subset_indices(density_cube_df_norm.values,
                             subsample=args.subsample,
                             random_state=100)
        X = density_cube_df_norm.iloc[idx, :].values
        # Only the fit uses the subsample; predictions cover every row, so
        # the full index is kept.
        index = density_cube_df_norm.index
    else:
        X = density_cube_df_norm.values
        index = density_cube_df_norm.index

    logging.info(f"Input shape: {X.shape}")
    parameters["input_shape"] = X.shape

    # =========================
    # Model - Gaussianization
    # =========================
    # Gaussianize the data
    logging.info("Gaussianizing data...")
    t0 = time.time()
    rbig_model = get_rbig_model(X=X, method=args.method)

    rbig_model.fit(X)
    t1 = time.time() - t0
    logging.info(f"Time Taken: {t1:.2f} secs")
    parameters["rbig_fit_time"] = t1

    # =========================
    # PROB ESTIMATES
    # =========================
    logging.info("Getting probability estimates...")
    t0 = time.time()

    # Smoke test predicts on the small fitted subsample in small batches;
    # otherwise predict on every standardized row.
    if args.smoke_test:
        prob_inputs, batchsize = X, 100
    else:
        prob_inputs, batchsize = density_cube_df_norm.values, 10_000

    # add noise
    if args.add_noise:
        logging.info("Adding noise to values for probability...")
        # Build a new array instead of ``df.values += ...``: DataFrame.values
        # has no setter, so the augmented assignment raised AttributeError,
        # and the noise never reached the smoke-test inputs anyway.
        prob_inputs = prob_inputs + 1e-1 * RNG.rand(*prob_inputs.shape)

    logging.info("Parallel predictions...")
    X_prob = parallel_predictions(
        X=prob_inputs,
        func=rbig_model.predict_proba,
        batchsize=batchsize,
        n_jobs=-1,
        verbose=1,
    )

    t1 = time.time() - t0
    logging.info(f"Time Taken: {t1:.2f} secs")
    # NOTE(review): this records the full standardized shape even in smoke
    # test mode, where predictions ran on the subsample - confirm intent.
    parameters["prob_size"] = density_cube_df_norm.values.shape
    parameters["rbig_predict_time"] = t1

    X_prob = pd.DataFrame(data=X_prob, index=index, columns=["probability"])

    # returning density cubes
    logging.info("Getting information cubes.")
    X_prob = get_information_cubes(X_prob, time=args.temporal_mean)

    X_prob.attrs = parameters
    return X_prob
Exemplo n.º 6
0
def experiment_step(
    params: Dict, smoke_test: bool = False, subsample: Optional[int] = None
) -> Union[Any, Any, Any, Any]:
    """Fit an RBIG model on density cubes and return mean probabilities.

    Loads the datacube for ``params["variable"]``, subsets it by region and
    period after removing the climatology, builds and standardizes the
    density cubes, obtains an RBIG model, and predicts probabilities on a
    noise-perturbed copy of the standardized data.

    Args:
        params: Experiment configuration. The keys read here are
            ``"variable"``, ``"region"`` and ``"period"`` (objects exposing
            ``.name``), ``"spatial"``, ``"temporal"`` and, optionally,
            ``"subsample"``.
        smoke_test: When true, only the first 1,000 density-cube rows are
            kept.
        subsample: Fallback subsample size handed to ``get_rbig_model`` when
            ``params`` has no ``"subsample"`` key. If the key is present it
            takes precedence, as before.

    Returns:
        A 4-tuple ``(rbig_model, x_transformer, X_prob, density_cube_df)``:
        the model, the fitted ``StandardScaler``, the probabilities averaged
        per ``("lat", "lon")`` index level, and the un-standardized density
        cubes.
    """
    # ======================
    # experiment - Data
    # ======================
    # Get DataCube
    logging.info(f"Loading '{params['variable']}' variable")
    datacube = get_dataset(params["variable"])

    # subset datacube (spatially)
    # Best-effort: any failure to select the region (e.g. the region has no
    # ``.name``) falls back to the full cube. ``except Exception`` keeps that
    # fallback while letting KeyboardInterrupt / SystemExit propagate, which
    # the previous bare ``except:`` swallowed.
    try:
        logging.info(f"Selecting region '{params['region'].name}'")
        datacube = select_region(xr_data=datacube, bbox=params["region"])[
            params["variable"]
        ]
    except Exception:
        logging.info("Selecting region 'world'")

    # remove climatology; the second return value is not used
    logging.info("Removing climatology...")
    datacube, _ = remove_climatology(datacube)

    # subset datacube (temporally)
    logging.info(f"Selecting temporal period: '{params['period'].name}'")
    datacube = select_period(xr_data=datacube, period=params["period"])

    # get density cubes
    logging.info(
        f"Getting density cubes: S: {params['spatial']}, T: {params['temporal']}"
    )
    if isinstance(datacube, xr.Dataset):
        # NOTE(review): indexing with [0] assumes params["variable"] is a
        # sequence of variable names - confirm against the caller.
        datacube = datacube[params["variable"][0]]

    density_cube_df = get_density_cubes(
        data=datacube, spatial=params["spatial"], temporal=params["temporal"],
    )
    logging.info(f"Total data: {density_cube_df.shape}")

    if smoke_test:
        density_cube_df = density_cube_df.iloc[:1_000]
        logging.info(f"Total data (smoke-test): {density_cube_df.shape}")

    # standardize data
    logging.info("Standardizing data...")
    x_transformer = StandardScaler().fit(density_cube_df.values)

    density_cube_df_norm = pd.DataFrame(
        data=x_transformer.transform(density_cube_df.values),
        columns=density_cube_df.columns.values,
        index=density_cube_df.index,
    )

    # =========================
    # Model - Gaussianization
    # =========================

    # Gaussianize the data
    logging.info("Gaussianizing data...")
    t0 = time.time()
    # params["subsample"] wins when present (previous behavior); otherwise
    # the ``subsample`` argument is used instead of raising KeyError.
    rbig_model = get_rbig_model(
        X=density_cube_df_norm.values,
        subsample=params.get("subsample", subsample),
    )
    t1 = time.time() - t0
    logging.info(f"Time Taken: {t1:.2f} secs")

    # get the probability estimates
    logging.info("Getting probability estimates...")
    t0 = time.time()

    # add noise to a copy; the standardized frame itself is left untouched
    prob_inputs = density_cube_df_norm.values + 1e-1 * RNG.rand(
        *density_cube_df_norm.values.shape
    )
    logging.info("Parallel predictions...")
    X_prob = parallel_predictions(
        X=prob_inputs,
        func=rbig_model.predict_proba,
        batchsize=10_000,
        n_jobs=-1,
        verbose=1,
    )

    t1 = time.time() - t0
    logging.info(f"Time Taken: {t1:.2f} secs")

    X_prob = pd.DataFrame(data=X_prob, index=density_cube_df_norm.index,)

    # average the probabilities per (lat, lon) index level
    logging.info("Computing Mean...")
    X_prob = X_prob.groupby(level=["lat", "lon"]).mean()
    return rbig_model, x_transformer, X_prob, density_cube_df