def generate_metrics(
    predictions: np.ndarray,
    targets: np.ndarray,
    plevel_predictions: Dict[str, np.ndarray],
    plevel_targets: Dict[str, np.ndarray],
    save_path: Union[os.PathLike, str],
) -> Dict[str, Any]:
    # Pressure-level-specific metrics
    metrics = {
        "maes": mean_absolute_error(targets, predictions, multioutput="raw_values"),
        "rmse": mean_squared_error(targets, predictions, multioutput="raw_values", squared=False),
        "stds": np.std(targets, axis=1),
        "mins": np.min(targets, axis=1),
        "maxes": np.max(targets, axis=1),
        "means": np.mean(targets, axis=1),
        "medians": np.median(targets, axis=1),
    }
    metrics["r_squared"] = calculate_r_squared(
        test_predictions=plevel_predictions,
        test_targets=plevel_targets,
    )
    to_pickle(os.path.join(save_path, "metrics.pkl"), metrics)
    return metrics
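# Hypothetical usage sketch (the shapes, dict keys, and save path below are
# illustrative only, not taken from the pipeline):
#
#   preds = np.random.rand(100, 33)
#   targs = np.random.rand(100, 33)
#   metrics = generate_metrics(
#       predictions=preds,
#       targets=targs,
#       plevel_predictions={"plevel_0": preds[:, 0]},
#       plevel_targets={"plevel_0": targs[:, 0]},
#       save_path="/tmp/run",
#   )
#   metrics["rmse"]  # one RMSE per output column, via multioutput="raw_values"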
def save_metadata(
    path: os.PathLike,
    total_samples: int,
    features: List[str],
    plevels: int,
    output_shape: Tuple[int, ...],
    indx: List[int],
) -> None:
    """
    Pickles metadata:
        input_shape: shape of the train tensor for a vertical column
        output_shape: shape of the gwfu or gwfv tensor for a vertical column
        indx: indices that we used to shuffle the total samples extracted
    """
    # Features in VERTICAL_COLUMN_FEATURES contribute a single value per
    # sample; every other feature contributes one value per pressure level.
    input_shape = len(features)
    num_vc_feat = 0
    for vc_feat in VERTICAL_COLUMN_FEATURES:
        if vc_feat in features:
            input_shape -= 1
            num_vc_feat += 1
    input_shape = input_shape * plevels + num_vc_feat
    to_pickle(
        path=path,
        obj={
            "total_samples": total_samples,
            "input_shape": input_shape,
            "output_shape": output_shape,
            "indx": indx,
        },
    )
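# Worked example of the input_shape arithmetic above (feature names are
# illustrative): with features = ["ucomp", "temp", "slp"], plevels = 40, and
# "slp" in VERTICAL_COLUMN_FEATURES, the two per-level features contribute
# 2 * 40 = 80 values while "slp" contributes a single scalar, so
# input_shape = 2 * 40 + 1 = 81.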
def save_metadata(
    save_path: Union[os.PathLike, str],
    source_path: Union[os.PathLike, str],
    metadata: Any,
) -> None:
    prev_metadata = from_pickle(os.path.join(source_path, "metadata.pkl"))
    # Shallow merge: keys in `metadata` override keys in `prev_metadata`
    metadata = {**prev_metadata, **metadata}
    to_pickle(path=os.path.join(save_path, "metadata.pkl"), obj=metadata)
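# Shallow-merge semantics: on a key collision the new metadata wins, e.g.
# {**{"total_samples": 10, "indx": [0]}, **{"total_samples": 20}} evaluates
# to {"total_samples": 20, "indx": [0]}; nested values are not merged.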
def extract_tensors(
    data: Data,
    save_path: Union[os.PathLike, str],
    num_samples: Union[int, None],
    plevels: int,
    batch_size: int,
) -> None:
    """
    Extracts feature tensors and target columns from raw data.

    Arguments:
    ----------
        data (Data): wrapper around the raw dataset; exposes raw_data plus the
            time, lat, and lon dimension sizes
        save_path (Union[os.PathLike, str]): path to save all files
        num_samples (Union[int, None]): number of samples to extract from data.
            If None, extracts all samples.
        plevels (int): number of pressure levels to include in tensors. Use to
            ignore low-altitude pressure levels.
        batch_size (int): number of samples to gather before writing to disk.
            Larger batches suit environments with more memory; smaller batches
            lose less work if the job is terminated early.

    Returns:
    --------
        None
    """
    raw_data = data.raw_data

    # If num_samples is not set (or exceeds the data), default to all samples
    max_samples = data.time * data.lat * data.lon
    if not num_samples or num_samples > max_samples:
        logger.warning("Extracting all possible samples")
        num_samples = max_samples

    first_batch = True
    tensors, tensors_labels = (pd.DataFrame(), pd.DataFrame())
    targets_gwfu, targets_gwfv = (pd.DataFrame(), pd.DataFrame())
    for i in tqdm(range(num_samples), "Extracting Tensors"):
        # Map the flat sample index onto the (time, lat, lon) grid
        t, lat, lon = np.unravel_index(i, (data.time, data.lat, data.lon))
        tensor, tensor_labels = (pd.DataFrame(), pd.DataFrame())
        for feat in TENSOR:
            if feat == "slp":
                # Labels
                labels = pd.DataFrame([f"slp_{t}_{lat}_{lon}"])
                tensor_labels = pd.concat([tensor_labels, labels], copy=False)
                # Single surface-level value
                slp = pd.DataFrame(data=[raw_data[feat][t, lat, lon]])
                tensor = pd.concat([tensor, slp], copy=False)
            elif feat in TRAIN_FEATURES:
                # Labels
                labels = pd.DataFrame([
                    f"{feat}_{t}_{plevel}_{lat}_{lon}"
                    for plevel in range(plevels)
                ])
                # Vertical column
                vertical_column = pd.DataFrame(data=raw_data[feat][t, :plevels, lat, lon])
                tensor = pd.concat([tensor, vertical_column], copy=False)
                tensor_labels = pd.concat([tensor_labels, labels], copy=False)
            elif feat in TARGET_FEATURES:
                # Vertical column
                vertical_column = pd.DataFrame(
                    data=raw_data[feat][t, :NON_ZERO_GWD_PLEVELS, lat, lon],
                )
                if feat == "gwfu_cgwd":
                    targets_gwfu = pd.concat([targets_gwfu, vertical_column], axis=1)
                else:
                    targets_gwfv = pd.concat([targets_gwfv, vertical_column], axis=1)
            else:
                logger.warning("Unused attribute: %s", feat)

        # Concat this sample's tensor to the current batch
        tensors = pd.concat([tensors, tensor], axis=1)
        tensors_labels = pd.concat([tensors_labels, tensor_labels], axis=1)
        if tensors.shape[1] == batch_size:
            save_batch(
                tensors=tensors,
                labels=tensors_labels,
                targets_gwfu=targets_gwfu,
                targets_gwfv=targets_gwfv,
                save_path=save_path,
                include_header=first_batch,
            )
            if first_batch:
                to_pickle(
                    path=os.path.join(save_path, "metadata.pkl"),
                    obj={
                        "total_samples": num_samples,
                        "input_shape": tensors.iloc[:, 0].shape,
                        "output_shape": targets_gwfu.iloc[:, 0].shape,
                    },
                )
                first_batch = False
            tensors, tensors_labels = (pd.DataFrame(), pd.DataFrame())
            targets_gwfu, targets_gwfv = (pd.DataFrame(), pd.DataFrame())

    # Flush any final partial batch so trailing samples are not silently
    # dropped when batch_size does not divide num_samples
    if tensors.shape[1] > 0:
        save_batch(
            tensors=tensors,
            labels=tensors_labels,
            targets_gwfu=targets_gwfu,
            targets_gwfv=targets_gwfv,
            save_path=save_path,
            include_header=first_batch,
        )
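# Indexing sketch: np.unravel_index maps the flat sample index onto the
# (time, lat, lon) grid in row-major order. With illustrative dimensions
# time=2, lat=3, lon=4:
#
#   t, lat, lon = np.unravel_index(5, (2, 3, 4))
#   # 5 = 0*(3*4) + 1*4 + 1, so (t, lat, lon) == (0, 1, 1)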
def __init__(
    self,
    source_path: Union[os.PathLike, str],
    scaler_path: Union[os.PathLike, str],
    num_samples: Union[None, float],
    target: str,
    remove_outliers: Union[str, float],
    save_path: Union[os.PathLike, str],
    model,
    evaluate_with_random: bool = False,
) -> None:
    test_tensors_fp = os.path.join(source_path, "tensors.csv")
    test_targets_fp = os.path.join(source_path, f"{target}.csv")

    # Get scalers fit on the training set
    tensors_scaler_fp = os.path.join(scaler_path, "tensors_scaler.pkl")
    tensors_scaler = from_pickle(tensors_scaler_fp)
    target_scaler_fp = os.path.join(scaler_path, f"{target}_scaler.pkl")
    target_scaler = from_pickle(target_scaler_fp)

    self.predictions = []
    self.targets = []
    chunksize = 100000
    num_total_predictions = 0
    if num_samples is not None and int(num_samples) < chunksize:
        num_samples = int(num_samples)
        chunksize = num_samples
    for test_tensors, test_targets in tqdm(
            zip(
                pd.read_csv(test_tensors_fp, header=None, chunksize=chunksize),
                pd.read_csv(test_targets_fp, header=None, chunksize=chunksize),
            ), "Load test data"):
        if num_samples is not None and num_total_predictions >= int(num_samples):
            break
        test_tensors = test_tensors.to_numpy()
        test_targets = test_targets.to_numpy()
        # Scale the input tensors with the training-set scaler
        test_tensors = tensors_scaler.transform(test_tensors)
        if evaluate_with_random:
            # Sanity-check baseline: replace inputs with standard-normal noise
            test_tensors = np.random.normal(loc=0.0, scale=1.0, size=test_tensors.shape)
        self.targets.append(test_targets)
        self.predictions.append(
            self.predict(
                model=model,
                tensors=test_tensors,
                target_scaler=target_scaler,
            ))
        # Count rows actually read; the final chunk may be smaller than chunksize
        num_total_predictions += len(test_tensors)
    self.predictions = np.concatenate(self.predictions, axis=0)
    self.targets = np.concatenate(self.targets, axis=0)

    # Removes outliers and returns dictionaries keyed on each pressure level
    self.plevel_predictions, self.plevel_targets = self.split_predictions_on_plevel(
        predictions=self.predictions,
        targets=self.targets,
        outliers=remove_outliers,
    )

    # Save unaltered predictions and targets
    to_pickle(
        path=os.path.join(save_path, "predictions.pkl"),
        obj={
            "predictions": self.predictions,
            "targets": self.targets,
        })