def test_min_max_scale(self):
    combined = combine_patch_data(self.input_data, self.input_variables)
    scaled_data, scale_values = min_max_scale(combined)
    inverse_data = min_max_inverse_scale(scaled_data, scale_values)
    self.assertEqual(combined.max(), inverse_data.max())
    self.assertEqual(combined.min(), inverse_data.min())
    self.assertEqual(scaled_data.max(), 1)
    self.assertEqual(scaled_data.min(), 0)
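# Aside: a minimal, self-contained sketch of the round trip the test above exercises:
# scale each variable to [0, 1] along a "var_name" dimension and invert the transform
# exactly. The function names, the xarray layout, and the variable names below are
# illustrative assumptions, not the package's actual implementation.
import numpy as np
import xarray as xr


def min_max_scale_sketch(data, scale_values=None):
    # Compute per-variable minima/maxima unless pre-computed values are supplied.
    if scale_values is None:
        other_dims = [d for d in data.dims if d != "var_name"]
        scale_values = xr.Dataset({"min": data.min(dim=other_dims),
                                   "max": data.max(dim=other_dims)})
    scaled = (data - scale_values["min"]) / (scale_values["max"] - scale_values["min"])
    return scaled, scale_values


def min_max_inverse_scale_sketch(scaled, scale_values):
    # Undo the scaling, recovering the original units.
    return scaled * (scale_values["max"] - scale_values["min"]) + scale_values["min"]


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    data = xr.DataArray(rng.normal(size=(5, 8, 8, 2)),
                        dims=("p", "row", "col", "var_name"),
                        coords={"var_name": ["var_a", "var_b"]})
    scaled, sv = min_max_scale_sketch(data)
    restored = min_max_inverse_scale_sketch(scaled, sv)
    assert float(scaled.max()) == 1.0 and float(scaled.min()) == 0.0
    assert np.allclose(data, restored)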
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("config", help="Name of the config file.")
    parser.add_argument("-i", "--interp", action="store_true", help="Run interpretation.")
    parser.add_argument("-p", "--plot", action="store_true", help="Plot interpretation results.")
    args = parser.parse_args()
    if not exists(args.config):
        raise FileNotFoundError(args.config + " not found.")
    with open(args.config, "r") as config_file:
        config = yaml.load(config_file, Loader=yaml.Loader)
    # Load the storm patch inputs, outputs, and metadata for the run period
    input_data, output, meta = load_patch_files(config["run_start_date"],
                                                config["run_end_date"],
                                                config["data_path"],
                                                config["input_variables"],
                                                config["output_variables"],
                                                config["meta_variables"],
                                                config["patch_radius"])
    input_combined = combine_patch_data(input_data, config["input_variables"])
    # Reuse the scale values saved during training so new data are scaled consistently
    scale_values = pd.read_csv(join(config["out_path"], "scale_values.csv"))
    input_scaled, scale_values = min_max_scale(input_combined, scale_values)
    out_max = storm_max_value(output[config["output_variables"][0]], meta["masks"])
    meta_df = get_meta_scalars(meta)
    if config["classifier"]:
        labels = np.where(out_max >= config["classifier_threshold"], 1, 0)
    else:
        labels = out_max
    models = {}
    neuron_activations = {}
    if not exists(config["activation_path"]):
        makedirs(config["activation_path"])
    for model_name in config["models"]:
        model_out_path = join(config["out_path"], model_name)
        models[model_name] = load_conv_net(model_out_path, model_name)
        neuron_columns = [f"neuron_{n:03d}" for n in range(models[model_name].dense_neurons)]
        neuron_activations[model_name] = pd.merge(meta_df,
                                                  pd.DataFrame(0, columns=neuron_columns, index=meta_df.index),
                                                  left_index=True, right_index=True)
        neuron_activations[model_name].loc[:, neuron_columns] = \
            models[model_name].output_hidden_layer(input_scaled)
        # Write one activation file per model run date
        run_dates = pd.DatetimeIndex(neuron_activations[model_name]["run_date"].unique())
        for run_date in run_dates:
            rdi = neuron_activations[model_name]["run_date"] == run_date
            run_date_str = run_date.strftime(config["date_format"])
            na_file = join(config["activation_path"],
                           f"neuron_activations_{model_name}_{run_date_str}.csv")
            neuron_activations[model_name].loc[rdi].to_csv(na_file, index_label="index")
    return
def main():
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("config", help="Name of the config file.")
    parser.add_argument("-t", "--train", action="store_true", help="Run neural network training.")
    parser.add_argument("-i", "--interp", action="store_true", help="Run interpretation.")
    parser.add_argument("-p", "--plot", action="store_true", help="Plot interpretation results.")
    args = parser.parse_args()
    if not exists(args.config):
        raise FileNotFoundError(args.config + " not found.")
    with open(args.config, "r") as config_file:
        config = yaml.load(config_file, Loader=yaml.Loader)
    # Load training data
    print(f"Loading training data period: {config['train_start_date']} to {config['train_end_date']}")
    input_data = {}
    output = {}
    out_max = {}
    labels = {}
    meta = {}
    meta_df = {}
    input_combined = {}
    input_scaled = {}
    scale_values = {}
    predictions = {}
    modes = ["train", "val", "test"]
    # Load training, validation, and testing data
    for mode in modes:
        input_data[mode], output[mode], meta[mode] = load_patch_files(config[mode + "_start_date"],
                                                                      config[mode + "_end_date"],
                                                                      config["data_path"],
                                                                      config["input_variables"],
                                                                      config["output_variables"],
                                                                      config["meta_variables"],
                                                                      config["patch_radius"])
        input_combined[mode] = combine_patch_data(input_data[mode], config["input_variables"])
        # Fit scaling on the training period only; reuse those scale values for val/test
        if mode == "train":
            input_scaled[mode], scale_values[mode] = min_max_scale(input_combined[mode])
        else:
            input_scaled[mode], scale_values[mode] = min_max_scale(input_combined[mode], scale_values["train"])
        out_max[mode] = storm_max_value(output[mode][config["output_variables"][0]], meta[mode]["masks"])
        meta_df[mode] = get_meta_scalars(meta[mode])
        print(meta_df[mode].columns)
        if config["classifier"]:
            labels[mode] = np.where(out_max[mode] >= config["classifier_threshold"], 1, 0)
        else:
            labels[mode] = out_max[mode]
    if not exists(config["out_path"]):
        makedirs(config["out_path"])
    scale_values["train"].to_csv(join(config["out_path"], "scale_values.csv"), index_label="variable")
    # Allow TensorFlow GPU memory to grow as needed instead of pre-allocating it all
    if "get_visible_devices" in dir(tf.config.experimental):
        gpus = tf.config.experimental.get_visible_devices("GPU")
    else:
        gpus = tf.config.get_visible_devices("GPU")
    for device in gpus:
        tf.config.experimental.set_memory_growth(device, True)
    models = {}
    neuron_activations = {}
    neuron_scores = {}
    saliency = {}
    if args.train:
        print("Begin model training")
        for mode in modes:
            predictions[mode] = pd.DataFrame(0, index=meta_df[mode].index,
                                             columns=list(config["models"].keys()))
            predictions[mode] = pd.merge(meta_df[mode], predictions[mode],
                                         left_index=True, right_index=True)
        for model_name, model_config in config["models"].items():
            model_out_path = join(config["out_path"], model_name)
            if not exists(model_out_path):
                makedirs(model_out_path)
            scale_values["train"].to_csv(join(model_out_path, "scale_values_" + model_name + ".csv"),
                                         index_label="variable")
            models[model_name] = BaseConvNet(**model_config)
            models[model_name].fit(input_scaled["train"].values, labels["train"],
                                   val_x=input_scaled["val"].values, val_y=labels["val"])
            models[model_name].save_model(model_out_path, model_name)
            for mode in modes:
                predictions[mode].loc[:, model_name] = models[model_name].predict(input_scaled[mode].values)
        for mode in modes:
            predictions[mode].to_csv(join(config["out_path"], f"predictions_{mode}.csv"), index_label="index")
        print("Calculate metrics")
        if config["classifier"]:
            model_scores = classifier_metrics(labels["test"],
                                              predictions["test"][list(config["models"].keys())])
            model_scores.to_csv(join(config["out_path"], "model_test_scores.csv"), index_label="model_name")
    if args.interp:
        for model_name, model_config in config["models"].items():
            if model_name not in models.keys():
                model_out_path = join(config["out_path"], model_name)
                models[model_name] = load_conv_net(model_out_path, model_name)
            neuron_columns = [f"neuron_{n:03d}" for n in range(models[model_name].dense_neurons)]
            neuron_activations[model_name] = {}
            neuron_scores[model_name] = pd.DataFrame(0, columns=neuron_columns, index=modes)
            saliency[model_name] = {}
            for mode in modes:
                neuron_activations[model_name][mode] = pd.merge(meta_df[mode],
                                                                pd.DataFrame(0, columns=neuron_columns,
                                                                             index=meta_df[mode].index),
                                                                left_index=True, right_index=True)
                neuron_activations[model_name][mode].loc[:, neuron_columns] = \
                    models[model_name].output_hidden_layer(input_scaled[mode].values)
                neuron_activations[model_name][mode].to_csv(
                    join(config["out_path"], f"neuron_activations_{model_name}_{mode}.csv"),
                    index_label="index")
                saliency[model_name][mode] = models[model_name].saliency(input_scaled[mode])
                saliency[model_name][mode].to_netcdf(
                    join(config["out_path"], f"neuron_saliency_{model_name}_{mode}.nc"),
                    encoding={"saliency": {"zlib": True, "complevel": 4, "shuffle": True}})
                if config["classifier"]:
                    neuron_scores[model_name].loc[mode] = score_neurons(
                        labels[mode], neuron_activations[model_name][mode][neuron_columns].values)
                else:
                    neuron_scores[model_name].loc[mode] = score_neurons(
                        labels[mode], neuron_activations[model_name][mode][neuron_columns].values,
                        metric="r")
            neuron_scores[model_name].to_csv(join(config["out_path"], f"neuron_scores_{model_name}.csv"),
                                             index_label="mode")
    if args.plot:
        for model_name, model_config in config["models"].items():
            if model_name not in models.keys():
                model_out_path = join(config["out_path"], model_name)
                models[model_name] = load_conv_net(model_out_path, model_name)
                neuron_activations[model_name] = {}
                neuron_scores[model_name] = pd.read_csv(join(config["out_path"],
                                                             f"neuron_scores_{model_name}.csv"),
                                                        index_col="mode")
                saliency[model_name] = {}
            for mode in modes:
                if mode not in neuron_activations[model_name].keys():
                    neuron_activations[model_name][mode] = pd.read_csv(
                        join(config["out_path"], f"neuron_activations_{model_name}_{mode}.csv"),
                        index_col="index")
                    saliency[model_name][mode] = xr.open_dataarray(
                        join(config["out_path"], f"neuron_saliency_{model_name}_{mode}.nc"))
                for variable_name in config["input_variables"]:
                    plot_neuron_composites(config["out_path"], model_name + "_" + mode,
                                           input_combined[mode],
                                           neuron_activations[model_name][mode].values,
                                           neuron_scores[model_name].loc[mode].values,
                                           variable_name)
                    plot_saliency_composites(config["out_path"], model_name + "_" + mode,
                                             saliency[model_name][mode],
                                             neuron_activations[model_name][mode].values,
                                             neuron_scores[model_name].loc[mode].values,
                                             variable_name)
    return
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("config", help="Name of the config file.")
    args = parser.parse_args()
    if not exists(args.config):
        raise FileNotFoundError(args.config + " not found.")
    with open(args.config, "r") as config_file:
        config = yaml.load(config_file, Loader=yaml.Loader)
    input_data, output, meta = load_patch_files(config["run_start_date"],
                                                config["run_end_date"],
                                                config["data_path"],
                                                config["input_variables"],
                                                config["output_variables"],
                                                config["meta_variables"],
                                                config["patch_radius"])
    input_combined = combine_patch_data(input_data, config["input_variables"])
    out_max = storm_max_value(output[config["output_variables"][0]], meta["masks"])
    print(list(meta.variables.keys()))
    meta_df = get_meta_scalars(meta)
    models = {}
    neuron_activations = {}
    if not exists(config["activation_path"]):
        makedirs(config["activation_path"])
    for model_name in config["models"]:
        print(model_name)
        scale_values = pd.read_csv(join(config["out_path"], model_name, f"scale_values_{model_name}.csv"),
                                   index_col="variable")
        input_scaled, scale_values = min_max_scale(input_combined, scale_values)
        print("Input shape", input_scaled.shape)
        model_out_path = join(config["out_path"], model_name)
        models[model_name] = load_conv_net(model_out_path, model_name)
        print(models[model_name].model_.summary())
        neuron_columns = [f"neuron_{n:03d}" for n in range(models[model_name].dense_neurons)]
        neuron_activations[model_name] = pd.merge(meta_df,
                                                  pd.DataFrame(0, columns=neuron_columns, index=meta_df.index),
                                                  left_index=True, right_index=True)
        neuron_activations[model_name].loc[:, neuron_columns] = \
            models[model_name].output_hidden_layer(input_scaled.values)
        print("Neuron activation shape:", neuron_activations[model_name].shape)
        run_dates = pd.DatetimeIndex(neuron_activations[model_name]["run_date"].unique())
        print(run_dates)
        for run_date in run_dates:
            print(run_date)
            rdi = neuron_activations[model_name]["run_date"] == run_date
            print("Run storm count", np.count_nonzero(rdi))
            run_date_str = run_date.strftime(config["date_format"])
            na_file = join(config["activation_path"], f"neuron_activations_{model_name}_{run_date_str}.csv")
            neuron_activations[model_name].loc[rdi].to_csv(na_file, index_label="index")
            meta_run = meta.isel(p=np.where(rdi)[0])
            print(run_date.strftime("%Y-%m-%d") + " plot all storms")
            plot_storm_mode_analysis_map(neuron_activations[model_name].loc[rdi], meta_run,
                                         config["models"][model_name], run_date, 1, 35,
                                         model_name, config["activation_path"], period_total=True)
            plot_storm_mode_analysis_map(neuron_activations[model_name].loc[rdi], meta_run,
                                         config["models"][model_name], run_date, 12, 35,
                                         model_name, config["activation_path"], period_total=True)
            print(run_date.strftime("%Y-%m-%d") + " plot hourly storms")
            plot_storm_mode_analysis_map(neuron_activations[model_name].loc[rdi], meta_run,
                                         config["models"][model_name], run_date,
                                         config["start_hour"], config["end_hour"],
                                         model_name, config["activation_path"], period_total=False)
    return
def main():
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("config", help="Name of the config file.")
    parser.add_argument("-t", "--train", action="store_true", help="Run neural network training.")
    parser.add_argument("-i", "--interp", action="store_true", help="Run interpretation.")
    parser.add_argument("-u", "--train_gmm", action="store_true", help="Run unsupervised model training.")
    parser.add_argument("-p", "--plot", action="store_true", help="Plot interpretation results.")
    parser.add_argument("-p2", "--plot2", action="store_true", help="Plot additional interpretation results.")
    args = parser.parse_args()
    if not exists(args.config):
        raise FileNotFoundError(args.config + " not found.")
    with open(args.config, "r") as config_file:
        config = yaml.load(config_file, Loader=yaml.Loader)
    # Seed numpy, Python, and TensorFlow for reproducibility
    np.random.seed(config["random_seed"])
    random.seed(config["random_seed"])
    tf.random.set_seed(config["random_seed"])
    # Load training data
    print(f"Loading training data period: {config['train_start_date']} to {config['train_end_date']}")
    data_input = {}
    output = {}
    out_max = {}
    labels = {}
    meta = {}
    meta_df = {}
    input_combined = {}
    input_scaled = {}
    scale_values = {}
    predictions = {}
    if 'mask' in config:
        mask = config['mask']
    else:
        mask = False
    modes = ["train", "val", "test"]
    # Load training, validation, and testing data
    for mode in modes:
        data_input[mode], output[mode], meta[mode] = load_patch_files(config[mode + "_start_date"],
                                                                      config[mode + "_end_date"],
                                                                      None,
                                                                      config["data_path"],
                                                                      config["input_variables"],
                                                                      config["output_variables"],
                                                                      config["meta_variables"],
                                                                      config["patch_radius"],
                                                                      mask)
        input_combined[mode] = combine_patch_data(data_input[mode], config["input_variables"])
        if mode == "train":
            input_scaled[mode], scale_values[mode] = min_max_scale(input_combined[mode])
        else:
            input_scaled[mode], scale_values[mode] = min_max_scale(input_combined[mode], scale_values["train"])
        out_max[mode] = storm_max_value(output[mode][config["output_variables"][0]], meta[mode]["masks"])
        meta_df[mode] = get_meta_scalars(meta[mode])
        print(meta_df[mode].columns)
        if config["classifier"]:
            labels[mode] = np.where(out_max[mode] >= config["classifier_threshold"], 1, 0)
        else:
            labels[mode] = out_max[mode]
    del data_input, out_max
    for folder in ['models', 'plots', 'data', 'metrics', 'labels']:
        makedirs(join(config["out_path"], folder), exist_ok=True)
    with open(join(config['out_path'], 'full_config.yml'), "w") as config_file:
        yaml.dump(config, config_file)
    # Allow TensorFlow GPU memory to grow as needed instead of pre-allocating it all
    if "get_visible_devices" in dir(tf.config.experimental):
        gpus = tf.config.experimental.get_visible_devices("GPU")
    else:
        gpus = tf.config.get_visible_devices("GPU")
    for device in gpus:
        tf.config.experimental.set_memory_growth(device, True)
    models = {}
    neuron_activations = {}
    neuron_scores = {}
    saliency = {}
    if args.train:
        print("Begin model training")
        for mode in modes:
            predictions[mode] = pd.DataFrame(0, index=meta_df[mode].index,
                                             columns=list(config["models"].keys()))
            predictions[mode] = pd.merge(meta_df[mode], predictions[mode],
                                         left_index=True, right_index=True)
        for model_name, model_config in config["models"].items():
            model_out_path = join(config["out_path"], "models", model_name)
            if not exists(model_out_path):
                makedirs(model_out_path)
            scale_values["train"].to_csv(join(model_out_path, "scale_values_" + model_name + ".csv"),
                                         index_label="variable")
            models[model_name] = BaseConvNet(**model_config)
            models[model_name].fit(input_scaled["train"].values, labels["train"],
                                   val_x=input_scaled["val"].values, val_y=labels["val"])
            models[model_name].save_model(model_out_path, model_name)
            for mode in modes:
                predictions[mode].loc[:, model_name] = models[model_name].predict(input_scaled[mode].values)
        for mode in modes:
            predictions[mode].to_csv(join(config["out_path"], "metrics", f"predictions_{mode}.csv"),
                                     index_label="index")
        print("Calculate metrics")
        if config["classifier"]:
            model_scores = classifier_metrics(labels["test"],
                                              predictions["test"][list(config["models"].keys())])
            model_scores.to_csv(join(config["out_path"], "metrics", "model_test_scores.csv"),
                                index_label="model_name")
    if args.interp:
        for model_name, model_config in config["models"].items():
            if model_name not in models.keys():
                model_out_path = join(config["out_path"], "models", model_name)
                models[model_name] = load_conv_net(model_out_path, model_name)
            neuron_columns = [f"neuron_{n:03d}" for n in range(models[model_name].dense_neurons)]
            neuron_activations[model_name] = {}
            neuron_scores[model_name] = pd.DataFrame(0, columns=neuron_columns, index=modes)
            saliency[model_name] = {}
            for mode in modes:
                neuron_activations[model_name][mode] = pd.merge(meta_df[mode],
                                                                pd.DataFrame(0, columns=neuron_columns,
                                                                             index=meta_df[mode].index),
                                                                left_index=True, right_index=True)
                neuron_activations[model_name][mode].loc[:, neuron_columns] = \
                    models[model_name].output_hidden_layer(input_scaled[mode].values)
                neuron_activations[model_name][mode].to_csv(
                    join(config["out_path"], "data", f"neuron_activations_{model_name}_{mode}.csv"),
                    index_label="index")
                saliency[model_name][mode] = models[model_name].saliency(input_scaled[mode])
                saliency[model_name][mode].to_netcdf(
                    join(config["out_path"], "data", f"neuron_saliency_{model_name}_{mode}.nc"),
                    encoding={"saliency": {"zlib": True, "complevel": 4, "shuffle": True,
                                           "least_significant_digit": 3}})
                if config["classifier"]:
                    neuron_scores[model_name].loc[mode] = score_neurons(
                        labels[mode], neuron_activations[model_name][mode][neuron_columns].values)
                else:
                    neuron_scores[model_name].loc[mode] = score_neurons(
                        labels[mode], neuron_activations[model_name][mode][neuron_columns].values,
                        metric="r")
                # Free saliency memory once it has been written out
                del saliency[model_name][mode]
            neuron_scores[model_name].to_csv(join(config["out_path"], "metrics",
                                                  f"neuron_scores_{model_name}.csv"),
                                             index_label="mode")
            del models[model_name], neuron_activations[model_name]
    if args.train_gmm:
        print('Begin Training Gaussian Mixture Model(s)')
        cluster_df = {}
        GMM = {}
        for model_name, model_config in config["models"].items():
            neuron_activations[model_name] = {}
            for mode in modes:
                neuron_activations[model_name][mode] = pd.read_csv(
                    join(config["out_path"], "data", f"neuron_activations_{model_name}_{mode}.csv"))
                X = neuron_activations[model_name][mode].loc[
                    :, neuron_activations[model_name][mode].columns.str.contains('neuron')]
                for GMM_mod_name, GMM_config in config["GMM_models"].items():
                    # Fit each Gaussian mixture on the training activations only
                    if mode == "train":
                        GMM[GMM_mod_name] = GaussianMixture(**GMM_config).fit(X)
                        cluster_df[GMM_mod_name] = {}
                    cluster_df[GMM_mod_name][mode] = pd.DataFrame(
                        GMM[GMM_mod_name].predict_proba(X),
                        columns=[f"cluster {i}" for i in range(GMM_config['n_components'])])
                    cluster_df[GMM_mod_name][mode]['label prob'] = cluster_df[GMM_mod_name][mode].max(axis=1)
                    cluster_df[GMM_mod_name][mode]['label'] = GMM[GMM_mod_name].predict(X)
                    neuron_activations[model_name][mode].merge(
                        cluster_df[GMM_mod_name][mode], right_index=True, left_index=True).to_csv(
                        join(config["out_path"], "data",
                             f"{model_name}_{GMM_mod_name}_{mode}_clusters.csv"), index=False)
                    joblib.dump(GMM[GMM_mod_name],
                                join(config["out_path"], "models", f'{model_name}_{GMM_mod_name}.mod'))
    if args.plot:
        print("Begin plotting")
        if "plot_kwargs" not in config.keys():
            config["plot_kwargs"] = {}
        for model_name, model_config in config["models"].items():
            print(model_name)
            if model_name not in models.keys():
                model_out_path = join(config["out_path"], "models", model_name)
                models[model_name] = load_conv_net(model_out_path, model_name)
                neuron_activations[model_name] = {}
                neuron_scores[model_name] = pd.read_csv(join(config["out_path"], "metrics",
                                                             f"neuron_scores_{model_name}.csv"),
                                                        index_col="mode")
                saliency[model_name] = {}
            for mode in modes:
                print(mode)
                if mode not in neuron_activations[model_name].keys():
                    neuron_activations[model_name][mode] = pd.read_csv(
                        join(config["out_path"], "data", f"neuron_activations_{model_name}_{mode}.csv"),
                        index_col="index")
                    saliency[model_name][mode] = xr.open_dataarray(
                        join(config["out_path"], "data", f"neuron_saliency_{model_name}_{mode}.nc"))
                for variable_name in config["input_variables"]:
                    print(variable_name)
                    if variable_name not in config["plot_kwargs"].keys():
                        plot_kwargs = None
                    else:
                        plot_kwargs = config["plot_kwargs"][variable_name]
                    plot_out_path = join(config["out_path"], "plots")
                    plot_neuron_composites(plot_out_path, model_name + "_" + mode,
                                           input_combined[mode],
                                           neuron_activations[model_name][mode].values,
                                           neuron_scores[model_name].loc[mode].values,
                                           variable_name, plot_kwargs=plot_kwargs)
                    plot_saliency_composites(plot_out_path, model_name + "_" + mode,
                                             saliency[model_name][mode],
                                             neuron_activations[model_name][mode].values,
                                             neuron_scores[model_name].loc[mode].values,
                                             variable_name)
                    plot_top_activations(plot_out_path, model_name + "_" + mode,
                                         input_combined[mode], meta_df[mode],
                                         neuron_activations[model_name][mode],
                                         neuron_scores[model_name].loc[mode].values,
                                         saliency[model_name][mode],
                                         variable_name, plot_kwargs=plot_kwargs)
                del saliency[model_name][mode]
    if args.plot2:
        print("Additional Plotting...")
        for model_name in config["models"].keys():
            for mode in ["val"]:
                for GMM_mod_name, GMM_config in config["GMM_models"].items():
                    plot_out_path = join(config["out_path"], "plots", model_name, GMM_mod_name)
                    if not exists(plot_out_path):
                        makedirs(plot_out_path, exist_ok=True)
                    cluster_df = pd.read_csv(join(config["out_path"], "data",
                                                  f"{model_name}_{GMM_mod_name}_{mode}_clusters.csv"))
                    plot_prob_dist(cluster_df, plot_out_path, GMM_mod_name, GMM_config["n_components"])
                    plot_prob_cdf(cluster_df, plot_out_path, GMM_mod_name, GMM_config["n_components"])
                    cape_shear_modes(cluster_df, plot_out_path, config["data_path"], mode, model_name,
                                     gmm_name=GMM_mod_name, cluster=True, num_storms=1000)
                    spatial_neuron_activations(cluster_df, plot_out_path, mode, model_name,
                                               gmm_name=GMM_mod_name, cluster=True)
                    diurnal_neuron_activations(cluster_df, plot_out_path, mode, model_name,
                                               gmm_name=GMM_mod_name, cluster=True)
                    for prob_type in ['highest', 'lowest']:
                        plot_storm_clusters(config['data_path'], plot_out_path, cluster_df,
                                            n_storms=25, patch_radius=config["patch_radius"],
                                            prob_type=prob_type, seed=config["random_seed"])
    return
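# Aside: a minimal sketch of the GMM clustering step used in the --train_gmm branch
# above. With made-up data, it shows how the "cluster {i}" columns, "label prob", and
# "label" relate to scikit-learn's GaussianMixture: predict_proba gives each cluster's
# posterior probability, "label prob" is its row-wise maximum, and "label" is the most
# probable cluster. The array shapes and names here are illustrative assumptions, not
# values from the project config.
import numpy as np
import pandas as pd
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
activations = rng.normal(size=(200, 8))          # 200 storms x 8 hidden-layer neurons
gmm = GaussianMixture(n_components=3, random_state=0).fit(activations)

cluster_probs = pd.DataFrame(gmm.predict_proba(activations),
                             columns=[f"cluster {i}" for i in range(3)])
cluster_probs["label prob"] = cluster_probs.max(axis=1)   # confidence in the assigned cluster
cluster_probs["label"] = gmm.predict(activations)         # same as the argmax cluster
assert (cluster_probs["label"] ==
        cluster_probs[[f"cluster {i}" for i in range(3)]].values.argmax(axis=1)).all()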
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("config", help="Name of the config file.")
    parser.add_argument("-p", "--plot_activation", action="store_true",
                        help="Plot storms by specified active neurons.")
    args = parser.parse_args()
    if not exists(args.config):
        raise FileNotFoundError(args.config + " not found.")
    with open(args.config, "r") as config_file:
        config = yaml.load(config_file, Loader=yaml.Loader)
    for path in [config["activation_path"], config["labels_path"]]:
        if not exists(path):
            makedirs(path)
    models, gmms, neuron_activations = {}, {}, {}
    # Build date strings for output file naming
    if config["run_start_date"] == "today":
        if config['run_freq'] == 'hourly':
            start_str = (pd.Timestamp(config["run_start_date"], tz="UTC") - pd.Timedelta(hours=3)).strftime("%Y%m%d-%H00")
            end_str = (pd.Timestamp(config["run_end_date"], tz="UTC") - pd.Timedelta(hours=3)).strftime("%Y%m%d-%H00")
        elif config['run_freq'] == 'daily':
            start_str = (pd.Timestamp(config["run_start_date"], tz="UTC")).strftime("%Y%m%d-0000")
            end_str = (pd.Timestamp(config["run_end_date"], tz="UTC")).strftime("%Y%m%d-0000")
    else:
        start_str = (pd.Timestamp(config["run_start_date"], tz="UTC")).strftime("%Y%m%d-%H00")
        end_str = (pd.Timestamp(config["run_end_date"], tz="UTC")).strftime("%Y%m%d-%H00")
    if start_str != end_str:
        date_str = start_str + '_' + end_str
    else:
        date_str = start_str
    for model_type, model_dict in config["models"].items():
        for model_name in model_dict.keys():
            scale_values = pd.read_csv(join(config["model_path"], model_name,
                                            f"scale_values_{model_name}.csv"))
            scale_values['variable'] = model_dict[model_name]['input_variables']
            scale_values = scale_values.set_index('variable')
            print('Loading storm patches...')
            input_data, output, meta = load_patch_files(config["run_start_date"],
                                                        config["run_end_date"],
                                                        config["run_freq"],
                                                        config["data_path"],
                                                        model_dict[model_name]["input_variables"],
                                                        model_dict[model_name]["output_variables"],
                                                        config["meta_variables"],
                                                        model_dict[model_name]["patch_radius"])
            input_combined = combine_patch_data(input_data, model_dict[model_name]["input_variables"])
            print('COMBINED VARNAMES: ', input_combined['var_name'])
            input_scaled, scale_values = min_max_scale(input_combined, scale_values)
            print("Input shape:", input_scaled.shape)
            meta_df = get_meta_scalars(meta)
            geometry_df, skips = get_contours(meta)
            model_out_path = join(config["model_path"], model_name)
            models[model_name] = load_conv_net(model_out_path, model_name)
            print(model_name, f'({model_type})')
            print(models[model_name].model_.summary())
            if model_type == 'semi_supervised':
                neuron_columns = [f"neuron_{n:03d}" for n in range(models[model_name].dense_neurons)]
                neuron_activations[model_name] = pd.merge(meta_df,
                                                          pd.DataFrame(0, columns=neuron_columns,
                                                                       index=meta_df.index),
                                                          left_index=True, right_index=True)
                neuron_activations[model_name].loc[:, neuron_columns] = \
                    models[model_name].output_hidden_layer(input_scaled.values)
                neuron_activations[model_name].to_csv(
                    join(config["activation_path"], f'{model_name}_activations_{date_str}.csv'),
                    index=False)
                gmms[model_name] = joblib.load(join(config["model_path"], model_name,
                                                    f'{model_name}.gmm'))
                cluster_assignments = joblib.load(join(config["model_path"], model_name,
                                                       f'{model_name}_gmm_labels.dict'))
                labels = predict_labels_gmm(neuron_activations[model_name], neuron_columns,
                                            gmms[model_name], cluster_assignments)
                labels = pd.merge(labels, geometry_df)
            elif model_type == 'supervised':
                labels = predict_labels_cnn(input_scaled, meta_df, models[model_name])
                labels = pd.merge(labels, geometry_df)
            # Attach aggregated storm attributes from the matching CSV files
            l = []
            for d in pd.date_range(start_str.replace('-', ''), end_str.replace('-', ''),
                                   freq=config['run_freq'][0]):
                agg_df = pd.read_csv(join(config['data_path'].replace('nc', 'csv'),
                                          f'{config["csv_model_prefix"]}{d.strftime("%Y%m%d-%H00")}.csv'))
                l.append(agg_df)
            agg_storm_data = pd.concat(l).reset_index(drop=True)
            labels['MAX_UPHL'] = pd.merge(labels, agg_storm_data.drop(skips),
                                          on=labels.index)[config["agg_variables"]]
            if config['output_format'] == 'csv':
                labels.to_csv(join(config["labels_path"], f'{model_name}_labels_{date_str}.csv'),
                              index_label=False)
            elif config['output_format'] == 'parquet':
                labels.to_parquet(join(config["labels_path"], f'{model_name}_labels_{date_str}.parquet'))
            else:
                raise ValueError(f'File format {config["output_format"]} not supported. '
                                 f'Please use "parquet" or "csv".')
            print('Wrote', join(config["labels_path"],
                                f'{model_name}_labels_{date_str}.{config["output_format"]}'))
    print("Completed.")
    return
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("config", help="Name of the config file.")
    parser.add_argument("-e", "--eval", action="store_true", help="Evaluate conditional probabilities.")
    args = parser.parse_args()
    if not exists(args.config):
        raise FileNotFoundError(args.config + " not found.")
    with open(args.config, "r") as config_file:
        config = yaml.load(config_file, Loader=yaml.Loader)
    models, gmms, neuron_activations, labels = {}, {}, {}, {}
    # Build date strings for output file naming
    if config["run_start_date"] == "today":
        if config['run_freq'] == 'hourly':
            start_str = (pd.Timestamp(config["run_start_date"], tz="UTC") - pd.Timedelta(hours=3)).strftime("%Y%m%d-%H00")
            end_str = (pd.Timestamp(config["run_end_date"], tz="UTC") - pd.Timedelta(hours=3)).strftime("%Y%m%d-%H00")
        elif config['run_freq'] == 'daily':
            start_str = (pd.Timestamp(config["run_start_date"], tz="UTC")).strftime("%Y%m%d-0000")
            end_str = (pd.Timestamp(config["run_end_date"], tz="UTC")).strftime("%Y%m%d-0000")
    else:
        start_str = (pd.Timestamp(config["run_start_date"], tz="UTC")).strftime("%Y%m%d-%H00")
        end_str = (pd.Timestamp(config["run_end_date"], tz="UTC")).strftime("%Y%m%d-%H00")
    if start_str != end_str:
        date_str = start_str + '_' + end_str
    else:
        date_str = start_str
    # Gather aggregated storm attribute CSVs for the run period
    l = []
    for d in pd.date_range(start_str.replace('-', ''), end_str.replace('-', ''),
                           freq=config['run_freq'][0]):
        file_path = join(config["data_path"].replace('_nc/', '_csv/'),
                         f'{config["csv_model_prefix"]}{d.strftime("%Y%m%d-%H00")}.csv')
        if exists(file_path):
            df = pd.read_csv(file_path)
            l.append(df)
    storm_data = pd.concat(l).reset_index(drop=True)
    for model_type, model_dict in config["models"].items():
        for model_name in model_dict.keys():
            model_path = join(model_dict[model_name]["model_path"], "models", model_name)
            label_path = join(model_dict[model_name]["model_path"], 'labels')
            if not exists(label_path):
                makedirs(label_path)
            scale_values = pd.read_csv(join(model_path, f"scale_values_{model_name}.csv"))
            scale_values['variable'] = model_dict[model_name]['input_variables']
            scale_values = scale_values.set_index('variable')
            print('Loading storm patches...')
            input_data, output, meta = load_patch_files(config["run_start_date"],
                                                        config["run_end_date"],
                                                        config["run_freq"],
                                                        config["data_path"],
                                                        model_dict[model_name]["input_variables"],
                                                        model_dict[model_name]["output_variables"],
                                                        config["meta_variables"],
                                                        model_dict[model_name]["patch_radius"])
            input_combined = combine_patch_data(input_data, model_dict[model_name]["input_variables"])
            print('COMBINED VARNAMES: ', input_combined['var_name'])
            input_scaled, scale_values = min_max_scale(input_combined, scale_values)
            print("Input shape:", input_scaled.shape)
            meta_df = get_meta_scalars(meta)
            models[model_name] = load_conv_net(model_path, model_name)
            print(model_name, f'({model_type})')
            print(models[model_name].model_.summary())
            if model_type == 'semi_supervised':
                neuron_columns = [f"neuron_{n:03d}" for n in range(models[model_name].dense_neurons)]
                neuron_activations[model_name] = pd.merge(meta_df,
                                                          pd.DataFrame(0, columns=neuron_columns,
                                                                       index=meta_df.index),
                                                          left_index=True, right_index=True)
                neuron_activations[model_name].loc[:, neuron_columns] = \
                    models[model_name].output_hidden_layer(input_scaled.values)
                gmms[model_name] = joblib.load(join(f"{model_path}_GMM_1.mod"))
                cluster_assignments = joblib.load(join(model_path, f'{model_name}_GMM_1_gmm_labels.dict'))
                labels[model_name] = predict_labels_gmm(neuron_activations[model_name], neuron_columns,
                                                        gmms[model_name], cluster_assignments)
                labels[model_name] = pd.merge(labels[model_name], meta_df)
            elif model_type == 'supervised':
                labels[model_name] = predict_labels_cnn(input_scaled, meta_df, models[model_name])
                labels[model_name] = pd.merge(labels[model_name], meta_df)
            labels[model_name][config["agg_variables"]] = pd.merge(labels[model_name], storm_data,
                                                                   on=labels[model_name].index)[config["agg_variables"]]
            file_name = join(model_dict[model_name]["model_path"], "labels",
                             f"{model_name}_labels_{date_str}.{config['output_format']}")
            save_labels(labels=labels[model_name], file_path=file_name, format=config["output_format"])
            if args.eval:
                storm_report_path = config["storm_report_path"]
                if not path.exists(storm_report_path):
                    makedirs(storm_report_path, exist_ok=False)
                start_date = (pd.Timestamp(config["run_start_date"], tz="UTC")).strftime("%Y%m%d0000")
                end_date = (pd.Timestamp(config["run_end_date"], tz="UTC")).strftime("%Y%m%d0000")
                for report_type in ['filtered_torn', 'filtered_hail', 'filtered_wind']:
                    print(f'Downloading SPC storm reports from {start_date} through {end_date} for {report_type}')
                    fetch_storm_reports(start_date, end_date, storm_report_path, report_type)
                if not isfile(join(model_dict[model_name]["model_path"], "labels", f"obs_{date_str}.nc")):
                    print('Aggregating storm reports to a grid.')
                    obs = generate_obs_grid(beg=start_date, end=end_date,
                                            storm_report_path=storm_report_path,
                                            model_grid_path=config["model_grid_path"])
                    file_name = join(model_dict[model_name]["model_path"], "labels", f"obs_{date_str}.nc")
                    obs.to_netcdf(file_name)
                    print(f"Wrote {file_name}.")
                print("Aggregating storm mode labels to a grid.")
                data = generate_mode_grid(beg=start_date, end=end_date, labels=labels[model_name],
                                          model_grid_path=config["model_grid_path"],
                                          min_lead_time=1, max_lead_time=24,
                                          run_date_freq='1d', bin_width=None)
                file_name = join(model_dict[model_name]["model_path"], "labels",
                                 config["physical_model"],
                                 f"{model_name}_gridded_labels_{date_str}.nc")
                data.to_netcdf(file_name)
                print(f"Wrote {file_name}.")
    return