def get_predictions(
    plevels: List[float],
    filepath: Union[os.PathLike, str] = "/data/cees/zespinos/runs/feature_experiments/40_levels/year_three/evaluate/gwfu/data_avail",
):
    # Month boundaries in timesteps: 60 timesteps per bin, 24 bins per year of data
    months = [60 * i for i in range(24 * 1 + 1)]
    data_avail = ["three", "six", "nine", "full_features"]
    data_predictions = []
    for avail in data_avail:
        avail_metrics = from_pickle(os.path.join(filepath, avail, "metrics.pkl"))
        avail_pred = from_pickle(os.path.join(filepath, avail, "predictions.pkl"))

        # Reshape flat samples to (time, plevels, lat, lon), then slice the plevel axis
        avail_truth = avail_pred["targets"].T
        avail_truth = avail_truth.reshape(33, 1440, 64, 128).swapaxes(1, 0)
        avail_truth = avail_truth[:, LOWEST_PLEVEL:LAST_PLEVEL, :, :]

        avail_pred = avail_pred["predictions"].T
        avail_pred = avail_pred.reshape(33, 1440, 64, 128).swapaxes(1, 0)
        avail_pred = avail_pred[:, LOWEST_PLEVEL:LAST_PLEVEL, :, :]

        avail_pred = generate_monthly_averages(avail_pred, months)
        data_predictions.append(avail_pred)

        print(f"{avail}: ", avail_metrics["r_squared"])
        avail_pred = None

    # The targets are identical across experiments, so append them once, after the loop
    avail_truth = generate_monthly_averages(avail_truth, months)
    data_predictions.append(avail_truth)

    data_predictions = np.concatenate(data_predictions, axis=1)
    return data_predictions
def predicted_qbo(
    plevels: List[float],
    filepath: Union[os.PathLike, str] = "/data/cees/zespinos/runs/feature_experiments/40_levels",
):
    # NOTE: this order determines the concatenation order along the time axis
    years = ["year_two", "year_one", "year_three", "year_four", "year_five"]
    year_predictions = []
    for year in years:
        year_metrics = from_pickle(
            os.path.join(filepath, f"{year}/evaluate/gwfu/full_features/metrics.pkl"))
        year_data = from_pickle(
            os.path.join(filepath, f"{year}/evaluate/gwfu/full_features/predictions.pkl"))

        # Reshape flat samples to (time, plevels, lat, lon), then slice the plevel axis
        year_data = year_data["predictions"].T
        year_data = year_data.reshape(33, 1440, 64, 128).swapaxes(1, 0)
        year_predictions.append(year_data[:, LOWEST_PLEVEL:LAST_PLEVEL, :, :])

        print(f"{year}: ", year_metrics["r_squared"])
        year_data = None

    # Month boundaries in timesteps: 60 timesteps per bin, 24 bins
    # per year of data (1440 timesteps)
    months = [60 * i for i in range(24 * len(year_predictions) + 1)]
    year_predictions = np.concatenate(year_predictions, axis=0)
    year_predictions = generate_monthly_averages(year_predictions, months)
    return year_predictions
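# Example usage -- a minimal sketch, not part of the original module. The
# `plevels` list is hypothetical; note that `plevels` is accepted but never
# read inside either function body as written, so both calls rely on the
# default `filepath` arguments and module-level LOWEST_PLEVEL/LAST_PLEVEL:
#
#   qbo = predicted_qbo(plevels=[10.0, 30.0, 50.0, 70.0])
#   experiments = get_predictions(plevels=[10.0, 30.0, 50.0, 70.0])
#   print(qbo.shape, experiments.shape)  # monthly-averaged (time, plevel, lat, lon)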
def __init__(self, scaler: Dict[str, Union[str, bool]],
             save_path: Union[os.PathLike, str]):
    self.save_path = save_path

    if scaler["load"]:
        # Load previously fit scalers from disk
        self.tensors_scaler = from_pickle(os.path.join(scaler["path"], TENSORS_SCALER_FN))
        self.gwfu_scaler = from_pickle(os.path.join(scaler["path"], GWFU_SCALER_FN))
        self.gwfv_scaler = from_pickle(os.path.join(scaler["path"], GWFV_SCALER_FN))
    else:
        # Otherwise start from fresh, unfit scalers
        self.tensors_scaler = StandardScaler()
        self.gwfu_scaler = StandardScaler()
        self.gwfv_scaler = StandardScaler()
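# Expected shape of the `scaler` argument -- a minimal sketch inferred from
# the keys used above; the path is hypothetical:
#
#   scaler = {"load": True, "path": "runs/massive/split"}  # load fitted scalers
#   scaler = {"load": False, "path": None}                 # fit new scalers later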
def get_num_samples(source_path: Union[str, os.PathLike]) -> int:
    """
    Returns the total number of samples recorded in source_path/metadata.pkl
    """
    metadata = from_pickle(os.path.join(source_path, "metadata.pkl"))
    return metadata["total_samples"]
def get_metadata(source_path: Union[os.PathLike, str]):
    """
    Loads the metadata pickle from source_path
    """
    metadata_fn = os.path.join(source_path, "metadata.pkl")
    return from_pickle(metadata_fn)
def get_eval(path):
    data = from_pickle(path)
    # Reshape flat samples to (time, plevels, lat, lon)
    pred = data["predictions"].T
    pred = pred.reshape(33, 1440, 64, 128).swapaxes(1, 0)
    targets = data["targets"].T
    targets = targets.reshape(33, 1440, 64, 128).swapaxes(1, 0)
    return targets, pred
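# Example usage -- a minimal sketch with a hypothetical path:
#
#   targets, pred = get_eval("runs/year_three/evaluate/gwfu/full_features/predictions.pkl")
#   # Both arrays are (1440 timesteps, 33 plevels, 64 lat, 128 lon)
#   rmse = np.sqrt(np.mean((pred - targets) ** 2))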
def save_metadata(
    save_path: Union[os.PathLike, str],
    source_path: Union[os.PathLike, str],
    metadata: Any,
):
    prev_metadata = from_pickle(os.path.join(source_path, "metadata.pkl"))
    # Shallow merge: keys in `metadata` override keys in `prev_metadata`
    metadata = {**prev_metadata, **metadata}
    to_pickle(path=os.path.join(save_path, "metadata.pkl"), obj=metadata)
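# The merge is shallow: nested dictionaries are replaced wholesale, not merged
# recursively. For example (hypothetical keys):
#
#   prev = {"total_samples": 100, "shapes": {"in": 10, "out": 5}}
#   new = {"shapes": {"in": 12}}
#   {**prev, **new}  # -> {"total_samples": 100, "shapes": {"in": 12}}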
def aggregate_experiment_results(metrics_path, experiments):
    u_experiments, v_experiments = defaultdict(dict), defaultdict(dict)
    for experiment, label in experiments.items():
        # Experiments evaluated only on gwfv have no zonal (gwfu) metrics
        if experiment not in ["vtemp", "vhght", "vlatlon"]:
            metrics = from_pickle(
                os.path.join(metrics_path, "gwfu", experiment, "metrics.pkl"))
            u_experiments[label]["maes"] = metrics["maes"]
            u_experiments[label]["rmse"] = metrics["rmse"]
            u_experiments[label]["r_squared"] = metrics["r_squared"]
        # Experiments evaluated only on gwfu have no meridional (gwfv) metrics
        if experiment not in ["utemp", "uhght", "ulatlon"]:
            metrics = from_pickle(
                os.path.join(metrics_path, "gwfv", experiment, "metrics.pkl"))
            v_experiments[label]["maes"] = metrics["maes"]
            v_experiments[label]["rmse"] = metrics["rmse"]
            v_experiments[label]["r_squared"] = metrics["r_squared"]
    return u_experiments, v_experiments
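# Example usage -- a minimal sketch; the experiment keys and labels shown
# here are hypothetical:
#
#   experiments = {"full_features": "All Features", "utemp": "U + T"}
#   u_results, v_results = aggregate_experiment_results("runs/evaluate", experiments)
#   print(u_results["All Features"]["r_squared"])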
def __init__(self,
             source_path: Union[os.PathLike, str],
             scaler_path: Union[os.PathLike, str],
             num_samples: Union[None, int],
             target: str,
             save_path: Union[os.PathLike, str],
             model,
             ) -> None:
    X_fp = os.path.join(source_path, "tensors.csv")
    Y_fp = os.path.join(source_path, f"{target}.csv")

    # Get Scalers
    X_scaler_fp = os.path.join(scaler_path, "tensors_scaler.pkl")
    self.X_scaler = from_pickle(X_scaler_fp)
    Y_scaler_fp = os.path.join(scaler_path, f"{target}_scaler.pkl")
    self.Y_scaler = from_pickle(Y_scaler_fp)

    # NOTE: assumes num_samples is an int; with chunksize=None, pd.read_csv
    # returns a DataFrame rather than an iterator and this zip would misbehave
    for X, Y in zip(
            pd.read_csv(X_fp, header=None, chunksize=num_samples),
            pd.read_csv(Y_fp, header=None, chunksize=num_samples),
    ):
        # Tensors
        self.X_raw = X.to_numpy()
        self.X = self.X_scaler.transform(self.X_raw)
        # Targets
        self.Y_raw = Y.to_numpy()
        self.Y = self.Y_scaler.transform(self.Y_raw)
        # Predictions
        self.Y_pred = self.predict(model)
        # Only the first num_samples rows are used; without this break, each
        # subsequent chunk would silently overwrite the previous one
        break
def main(**params):
    """
    Train Model
    """
    with tracking(
            experiment="train",
            params=params,
            local_dir=params["save_path"],
            tracking=params["tracking"],
    ):
        target = params["target"]
        os.makedirs(params["save_path"], exist_ok=True)
        metadata = get_metadata(params["source_path"][0])

        # Get Model
        if params["model_path"] is None:
            logger.info("Training new model")
            Model = get_model(params["model"])
            model = Model.build((metadata["input_shape"],),
                                metadata["output_shape"],
                                params["learning_rate"])
        else:
            model_path = params["model_path"]
            logger.info(f"Training model from {model_path}")
            model = load_model(params["model_path"], params["learning_rate"])
        model.summary()

        # Get scalers
        tensors_scaler = from_pickle(os.path.join(params["scaler_path"], "tensors_scaler.pkl"))
        target_scaler = from_pickle(os.path.join(params["scaler_path"], f"{target}_scaler.pkl"))

        # Create data generators
        # NOTE: assumes every source path contains the same number of samples
        num_samples = metadata["total_samples"] * len(params["source_path"])
        train_generator = DataGenerator(
            tensors_filepath=[os.path.join(path, "train_tensors.csv") for path in params["source_path"]],
            target_filepath=[os.path.join(path, f"train_{target}.csv") for path in params["source_path"]],
            batch_size=params["batch_size"],
            chunk_size=params["chunk_size"],
            num_samples=num_samples,
            tensors_scaler=tensors_scaler,
            target_scaler=target_scaler,
            name="train",
            train_with_random=params["train_with_random"],
        )
        val_generator = DataGenerator(
            tensors_filepath=[os.path.join(path, "val_tensors.csv") for path in params["source_path"]],
            target_filepath=[os.path.join(path, f"val_{target}.csv") for path in params["source_path"]],
            batch_size=params["batch_size"],
            chunk_size=params["chunk_size"],
            num_samples=num_samples,
            tensors_scaler=tensors_scaler,
            target_scaler=target_scaler,
            name="val",
            train_with_random=params["train_with_random"],
        )

        # Fit Model
        callbacks = get_callbacks(params["save_path"], params["model"])
        # model.run_eagerly = True
        history = model.fit(
            x=train_generator,
            validation_data=val_generator,
            steps_per_epoch=params["steps_per_epoch"],
            validation_steps=params["validation_steps"],
            epochs=params["epochs"],
            verbose=params["verbose"],
            callbacks=callbacks,
            use_multiprocessing=params["use_multiprocessing"],
        )
def __init__(
        self,
        source_path: Union[os.PathLike, str],
        scaler_path: Union[os.PathLike, str],
        num_samples: Union[None, float],
        target: str,
        remove_outliers: Union[str, float],
        save_path: Union[os.PathLike, str],
        model,
        evaluate_with_random: bool = False,
) -> None:
    test_tensors_fp = os.path.join(source_path, "tensors.csv")
    test_targets_fp = os.path.join(source_path, f"{target}.csv")

    # Get Scalers
    tensors_scaler_fp = os.path.join(scaler_path, "tensors_scaler.pkl")
    tensors_scaler = from_pickle(tensors_scaler_fp)
    target_scaler_fp = os.path.join(scaler_path, f"{target}_scaler.pkl")
    target_scaler = from_pickle(target_scaler_fp)

    self.predictions = []
    self.targets = []
    chunksize = 100000
    num_total_predictions = 0
    if num_samples is not None and int(num_samples) < chunksize:
        num_samples = int(num_samples)
        chunksize = num_samples

    for test_tensors, test_targets in tqdm(
            zip(
                pd.read_csv(test_tensors_fp, header=None, chunksize=chunksize),
                pd.read_csv(test_targets_fp, header=None, chunksize=chunksize),
            ), "Load test data"):
        if num_samples is not None and num_total_predictions >= int(num_samples):
            break

        test_tensors = test_tensors.to_numpy()
        test_targets = test_targets.to_numpy()

        # Scale inputs; optionally replace them with standard-normal noise
        # to measure a random baseline
        test_tensors = tensors_scaler.transform(test_tensors)
        if evaluate_with_random:
            test_tensors = np.random.normal(loc=0.0, scale=1.0, size=test_tensors.shape)

        self.targets.append(test_targets)
        self.predictions.append(
            self.predict(
                model=model,
                tensors=test_tensors,
                target_scaler=target_scaler,
            ))
        # Count the rows actually read: the final chunk may be smaller than chunksize
        num_total_predictions += test_tensors.shape[0]

    # Concatenate the chunk lists directly; wrapping them in np.array first
    # fails when the final chunk has a different length
    self.predictions = np.concatenate(self.predictions, axis=0)
    self.targets = np.concatenate(self.targets, axis=0)

    # Removes outliers and returns dictionary keyed on each pressure level
    self.plevel_predictions, self.plevel_targets = self.split_predictions_on_plevel(
        predictions=self.predictions,
        targets=self.targets,
        outliers=remove_outliers,
    )

    # Save unaltered predictions and targets
    to_pickle(
        path=os.path.join(save_path, "predictions.pkl"),
        obj={
            "predictions": self.predictions,
            "targets": self.targets,
        })
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from lrgwd.utils.io import from_pickle

gwfu_scaler = from_pickle("../runs/massive/split/gwfu_scaler.pkl")

# Read only the first chunk of the training data
for gwfu_chunk, tensors_chunk in zip(
        pd.read_csv("../runs/massive/split/train_gwfu.csv", chunksize=100000),
        pd.read_csv("../runs/massive/split/train_tensors.csv", chunksize=100000)):
    gwfu_chunk = gwfu_chunk.to_numpy()
    tensors_chunk = tensors_chunk.to_numpy()
    break

scaled_gwfu_chunk = gwfu_scaler.transform(gwfu_chunk)

# One histogram per pressure level, from the bottom of the column up
plevels = gwfu_chunk[0].shape[0]
for plevel in reversed(range(plevels)):
    # raw_gwfu = gwfu_chunk[:, plevel]
    scaled_gwfu = scaled_gwfu_chunk[:, plevel]

    fig = plt.figure(figsize=(8, 6))
    plt.hist([scaled_gwfu], bins=1000, label=["scaled_gwfu"])
    # Standardized values are dimensionless, so no m/s^2 unit on this axis
    plt.xlabel("scaled gwfu (standardized)", size=14)
    plt.ylabel("Count", size=14)
    plt.title(f"Histogram scaled_gwfu for Plevel {plevel}")
    plt.legend(loc="upper right")
    plt.show()