Example #1
 def test_min_max_scale(self):
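     # Scaling to [0, 1] and applying the inverse should recover the original min and max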
     combined = combine_patch_data(self.input_data, self.input_variables)
     scaled_data, scale_values = min_max_scale(combined)
     inverse_data = min_max_inverse_scale(scaled_data, scale_values)
     self.assertEqual(combined.max(), inverse_data.max())
     self.assertEqual(combined.min(), inverse_data.min())
     self.assertEqual(scaled_data.max(), 1)
     self.assertEqual(scaled_data.min(), 0)
Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("config", help="Name of the config file.")
    parser.add_argument("-i", "--interp", action="store_true", help="Run interpretation.")
    parser.add_argument("-p", "--plot", action="store_true", help="Plot interpretation results.")
    args = parser.parse_args()
    if not exists(args.config):
        raise FileNotFoundError(args.config + " not found.")
    with open(args.config, "r") as config_file:
        config = yaml.load(config_file, Loader=yaml.Loader)
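    # Load storm patch data for the run period and rescale it with the saved training scale values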
    input, output, meta = load_patch_files(config["run_start_date"],
                                           config["run_end_date"],
                                           config["data_path"],
                                           config["input_variables"],
                                           config["output_variables"],
                                           config["meta_variables"],
                                           config["patch_radius"])
    input_combined = combine_patch_data(input, config["input_variables"])
    scale_values = pd.read_csv(join(config["out_path"], "scale_values.csv"))
    input_scaled, scale_values = min_max_scale(input_combined, scale_values)
    out_max = storm_max_value(output[config["output_variables"][0]], meta["masks"])
    meta_df = get_meta_scalars(meta)
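    # Derive labels: binary exceedance of the threshold for classifiers, raw storm maxima otherwise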
    if config["classifier"]:
        labels = np.where(out_max >= config["classifier_threshold"], 1, 0)
    else:
        labels = out_max
    models = {}
    neuron_activations = {}
    if not exists(config["activation_path"]):
        makedirs(config["activation_path"])
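    # Load each trained CNN, compute hidden-layer neuron activations, and write them out per run date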
    for model_name in config["models"]:
        model_out_path = join(config["out_path"], model_name)
        models[model_name] = load_conv_net(model_out_path, model_name)
        neuron_columns = [f"neuron_{n:03d}" for n in range(models[model_name].dense_neurons)]
        neuron_activations[model_name] = pd.merge(meta_df, pd.DataFrame(0, columns=neuron_columns,
                                                                          index=meta_df.index),
                                                    left_index=True, right_index=True)
        neuron_activations[model_name].loc[:, neuron_columns] = models[model_name].output_hidden_layer(input_scaled.values)
        run_dates = pd.DatetimeIndex(neuron_activations[model_name]["run_date"].unique())
        for run_date in run_dates:
            rdi = neuron_activations[model_name]["run_date"] == run_date
            run_date_str = run_date.strftime(config["date_format"])
            na_file = join(config["activation_path"],
                           f"neuron_activations_{model_name}_{run_date_str}.csv")
            neuron_activations[model_name].loc[rdi].to_csv(na_file, index_label="index")
    return
Example #3
def main():
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("config", help="Name of the config file.")
    parser.add_argument("-t",
                        "--train",
                        action="store_true",
                        help="Run neural network training.")
    parser.add_argument("-i",
                        "--interp",
                        action="store_true",
                        help="Run interpretation.")
    parser.add_argument("-p",
                        "--plot",
                        action="store_true",
                        help="Plot interpretation results.")
    args = parser.parse_args()
    if not exists(args.config):
        raise FileNotFoundError(args.config + " not found.")
    with open(args.config, "r") as config_file:
        config = yaml.load(config_file, Loader=yaml.Loader)
    # Load training data
    print(
        f"Loading training data period: {config['train_start_date']} to {config['train_end_date']}"
    )
    input = {}
    output = {}
    out_max = {}
    labels = {}
    meta = {}
    meta_df = {}
    input_combined = {}
    input_scaled = {}
    scale_values = {}
    predictions = {}
    modes = ["train", "val", "test"]
    # Load training, validation, and testing data
    for mode in modes:
        input[mode], output[mode], meta[mode] = load_patch_files(
            config[mode + "_start_date"], config[mode + "_end_date"],
            config["data_path"], config["input_variables"],
            config["output_variables"], config["meta_variables"],
            config["patch_radius"])
        input_combined[mode] = combine_patch_data(input[mode],
                                                  config["input_variables"])
        if mode == "train":
            input_scaled[mode], scale_values[mode] = min_max_scale(
                input_combined[mode])
        else:
            input_scaled[mode], scale_values[mode] = min_max_scale(
                input_combined[mode], scale_values["train"])
        out_max[mode] = storm_max_value(
            output[mode][config["output_variables"][0]], meta[mode]["masks"])
        meta_df[mode] = get_meta_scalars(meta[mode])
        print(meta_df[mode].columns)
        if config["classifier"]:
            labels[mode] = np.where(
                out_max[mode] >= config["classifier_threshold"], 1, 0)
        else:
            labels[mode] = out_max[mode]
    if not exists(config["out_path"]):
        makedirs(config["out_path"])
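    # Save the training-period scale values for reuse at inference time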
    scale_values["train"].to_csv(join(config["out_path"], "scale_values.csv"),
                                 index_label="variable")
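    # Let TensorFlow grow GPU memory as needed instead of pre-allocating it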
    if "get_visible_devices" in dir(tf.config.experimental):
        gpus = tf.config.experimental.get_visible_devices("GPU")
    else:
        gpus = tf.config.get_visible_devices("GPU")
    for device in gpus:
        tf.config.experimental.set_memory_growth(device, True)
    models = {}
    neuron_activations = {}
    neuron_scores = {}
    saliency = {}
    if args.train:
        print("Begin model training")
        for mode in modes:
            predictions[mode] = pd.DataFrame(0,
                                             index=meta_df[mode].index,
                                             columns=list(
                                                 config["models"].keys()))
            predictions[mode] = pd.merge(meta_df[mode],
                                         predictions[mode],
                                         left_index=True,
                                         right_index=True)
        for model_name, model_config in config["models"].items():
            model_out_path = join(config["out_path"], model_name)
            if not exists(model_out_path):
                makedirs(model_out_path)
            scale_values["train"].to_csv(join(
                model_out_path, "scale_values_" + model_name + ".csv"),
                                         index_label="variable")
            models[model_name] = BaseConvNet(**model_config)
            models[model_name].fit(input_scaled["train"].values,
                                   labels["train"],
                                   val_x=input_scaled["val"].values,
                                   val_y=labels["val"])
            models[model_name].save_model(model_out_path, model_name)
            for mode in modes:
                predictions[mode].loc[:,
                                      model_name] = models[model_name].predict(
                                          input_scaled[mode].values)
        for mode in modes:
            predictions[mode].to_csv(join(config["out_path"],
                                          f"predictions_{mode}.csv"),
                                     index_label="index")
        print("Calculate metrics")
        if config["classifier"]:
            model_scores = classifier_metrics(
                labels["test"],
                predictions["test"][list(config["models"].keys())])
            model_scores.to_csv(join(config["out_path"],
                                     "model_test_scores.csv"),
                                index_label="model_name")
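    # Interpretation: hidden-layer activations, saliency maps, and per-neuron skill scores for each split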
    if args.interp:
        for model_name, model_config in config["models"].items():
            if model_name not in models.keys():
                model_out_path = join(config["out_path"], model_name)
                models[model_name] = load_conv_net(model_out_path, model_name)
            neuron_columns = [
                f"neuron_{n:03d}"
                for n in range(models[model_name].dense_neurons)
            ]
            neuron_activations[model_name] = {}
            neuron_scores[model_name] = pd.DataFrame(0,
                                                     columns=neuron_columns,
                                                     index=modes)
            saliency[model_name] = {}
            for mode in modes:
                neuron_activations[model_name][mode] = pd.merge(
                    meta_df[mode],
                    pd.DataFrame(0,
                                 columns=neuron_columns,
                                 index=meta_df[mode].index),
                    left_index=True,
                    right_index=True)
                neuron_activations[model_name][
                    mode].loc[:, neuron_columns] = models[
                        model_name].output_hidden_layer(
                            input_scaled[mode].values)
                neuron_activations[model_name][mode].to_csv(
                    join(config["out_path"],
                         f"neuron_activations_{model_name}_{mode}.csv"),
                    index_label="index")
                saliency[model_name][mode] = models[model_name].saliency(
                    input_scaled[mode])

                saliency[model_name][mode].to_netcdf(join(
                    config["out_path"],
                    f"neuron_saliency_{model_name}_{mode}.nc"),
                                                     encoding={
                                                         "saliency": {
                                                             "zlib": True,
                                                             "complevel": 4,
                                                             "shuffle": True
                                                         }
                                                     })
                if config["classifier"]:
                    neuron_scores[model_name].loc[mode] = score_neurons(
                        labels[mode], neuron_activations[model_name][mode]
                        [neuron_columns].values)
                else:
                    neuron_scores[model_name].loc[mode] = score_neurons(
                        labels[mode],
                        neuron_activations[model_name][mode]
                        [neuron_columns].values,
                        metric="r")
            neuron_scores[model_name].to_csv(join(
                config["out_path"], f"neuron_scores_{model_name}.csv"),
                                             index_label="mode")
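    # Plot neuron and saliency composites for each model, split, and input variable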
    if args.plot:
        for model_name, model_config in config["models"].items():
            if model_name not in models.keys():
                model_out_path = join(config["out_path"], model_name)
                models[model_name] = load_conv_net(model_out_path, model_name)
                neuron_activations[model_name] = {}
                neuron_scores[model_name] = pd.read_csv(join(
                    config["out_path"], f"neuron_scores_{model_name}.csv"),
                                                        index_col="mode")
                saliency[model_name] = {}
            for mode in modes:
                if mode not in neuron_activations[model_name].keys():
                    neuron_activations[model_name][mode] = pd.read_csv(
                        join(config["out_path"],
                             f"neuron_activations_{model_name}_{mode}.csv"),
                        index_col="index")
                    saliency[model_name][mode] = xr.open_dataarray(
                        join(config["out_path"],
                             f"neuron_saliency_{model_name}_{mode}.nc"))

                for variable_name in config["input_variables"]:
                    plot_neuron_composites(
                        config["out_path"], model_name + "_" + mode,
                        input_combined[mode],
                        neuron_activations[model_name][mode].values,
                        neuron_scores[model_name].loc[mode].values,
                        variable_name)
                    plot_saliency_composites(
                        config["out_path"], model_name + "_" + mode,
                        saliency[model_name][mode],
                        neuron_activations[model_name][mode].values,
                        neuron_scores[model_name].loc[mode].values,
                        variable_name)
    return
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("config", help="Name of the config file.")
    args = parser.parse_args()
    if not exists(args.config):
        raise FileNotFoundError(args.config + " not found.")
    with open(args.config, "r") as config_file:
        config = yaml.load(config_file, Loader=yaml.Loader)
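    # Load storm patch data for the run period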
    input_data, output, meta = load_patch_files(config["run_start_date"],
                                           config["run_end_date"],
                                           config["data_path"],
                                           config["input_variables"],
                                           config["output_variables"],
                                           config["meta_variables"],
                                           config["patch_radius"])
    input_combined = combine_patch_data(input_data, config["input_variables"])
    out_max = storm_max_value(output[config["output_variables"][0]], meta["masks"])
    print(list(meta.variables.keys()))
    meta_df = get_meta_scalars(meta)
    models = {}
    neuron_activations = {}
    if not exists(config["activation_path"]):
        makedirs(config["activation_path"])
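    # For each model: rescale inputs with its saved scale values, compute neuron activations, and map them by run date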
    for model_name in config["models"]:
        print(model_name)
        scale_values = pd.read_csv(join(config["out_path"], model_name, f"scale_values_{model_name}.csv"), index_col="variable")

        input_scaled, scale_values = min_max_scale(input_combined, scale_values)
        print("Input shape", input_scaled.shape)
        model_out_path = join(config["out_path"], model_name)
        models[model_name] = load_conv_net(model_out_path, model_name)
        print(models[model_name].model_.summary())
        neuron_columns = [f"neuron_{n:03d}" for n in range(models[model_name].dense_neurons)]
        neuron_activations[model_name] = pd.merge(meta_df, pd.DataFrame(0, columns=neuron_columns,
                                                                          index=meta_df.index),
                                                    left_index=True, right_index=True)
        neuron_activations[model_name].loc[:, neuron_columns] = models[model_name].output_hidden_layer(input_scaled.values)
        print("Neuron activation shape:", neuron_activations[model_name].shape)
        run_dates = pd.DatetimeIndex(neuron_activations[model_name]["run_date"].unique())
        print(run_dates)
        for run_date in run_dates:
            print(run_date)
            rdi = neuron_activations[model_name]["run_date"] == run_date
            print("Run storm count", np.count_nonzero(rdi))
            run_date_str = run_date.strftime(config["date_format"])
            na_file = join(config["activation_path"],
                           f"neuron_activations_{model_name}_{run_date_str}.csv")
            neuron_activations[model_name].loc[rdi].to_csv(na_file, index_label="index")
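            # Subset the metadata for this run date and plot period-total and hourly storm-mode maps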
            meta_run = meta.isel(p=np.where(rdi)[0])
            print(run_date.strftime("%Y-%m-%d") + " plot all storms")
            plot_storm_mode_analysis_map(neuron_activations[model_name].loc[rdi],
                                         meta_run, config["models"][model_name],
                                         run_date, 1, 35, model_name, config["activation_path"], period_total=True)
            plot_storm_mode_analysis_map(neuron_activations[model_name].loc[rdi],
                                         meta_run, config["models"][model_name],
                                         run_date, 12, 35, model_name, config["activation_path"], period_total=True)
            print(run_date.strftime("%Y-%m-%d") + " plot hourly storms")
            plot_storm_mode_analysis_map(neuron_activations[model_name].loc[rdi],
                                         meta_run, config["models"][model_name],
                                         run_date, config["start_hour"], config["end_hour"], model_name,
                                         config["activation_path"],
                                         period_total=False)
    return
Example #5
def main():
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("config", help="Name of the config file.")
    parser.add_argument("-t", "--train", action="store_true", help="Run neural network training.")
    parser.add_argument("-i", "--interp", action="store_true", help="Run interpretation.")
    parser.add_argument("-u", "--train_gmm", action="store_true", help="Run unsupervised model training.")
    parser.add_argument("-p", "--plot", action="store_true", help="Plot interpretation results.")
    parser.add_argument("-p2", "--plot2", action="store_true", help="Plot additional interpretation results.")
    args = parser.parse_args()
    if not exists(args.config):
        raise FileNotFoundError(args.config + " not found.")
    with open(args.config, "r") as config_file:
        config = yaml.load(config_file, Loader=yaml.Loader)
    np.random.seed(config["random_seed"])
    random.seed(config["random_seed"])
    tf.random.set_seed(config["random_seed"])
    # Load training data
    print(f"Loading training data period: {config['train_start_date']} to {config['train_end_date']}")
    data_input = {}
    output = {}
    out_max = {}
    labels = {}
    meta = {}
    meta_df = {}
    input_combined = {}
    input_scaled = {}
    scale_values = {}
    predictions = {}
    if 'mask' in config:
        mask = config['mask']
    else:
        mask = False
    modes = ["train", "val", "test"]
    # Load training, validation, and testing data
    for mode in modes:
        data_input[mode], output[mode], meta[mode] = load_patch_files(config[mode + "_start_date"],
                                                                      config[mode + "_end_date"],
                                                                      None,
                                                                      config["data_path"],
                                                                      config["input_variables"],
                                                                      config["output_variables"],
                                                                      config["meta_variables"],
                                                                      config["patch_radius"],
                                                                      mask)
        input_combined[mode] = combine_patch_data(data_input[mode], config["input_variables"])
        if mode == "train":
            input_scaled[mode], scale_values[mode] = min_max_scale(input_combined[mode])
        else:
            input_scaled[mode], scale_values[mode] = min_max_scale(input_combined[mode], scale_values["train"])
        out_max[mode] = storm_max_value(output[mode][config["output_variables"][0]], meta[mode]["masks"])
        meta_df[mode] = get_meta_scalars(meta[mode])
        print(meta_df[mode].columns)
        if config["classifier"]:
            labels[mode] = np.where(out_max[mode] >= config["classifier_threshold"], 1, 0)
        else:
            labels[mode] = out_max[mode]
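    # Free the raw patch data once labels have been derived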
    del data_input, out_max
    for folder in ['models', 'plots', 'data', 'metrics', 'labels']:
        makedirs(join(config["out_path"], folder), exist_ok=True)
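    # Save the fully resolved configuration alongside the outputs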
    with open(join(config['out_path'], 'full_config.yml'), "w") as config_file:
        yaml.dump(config, config_file)
    if "get_visible_devices" in dir(tf.config.experimental):
        gpus = tf.config.experimental.get_visible_devices("GPU")
    else:
        gpus = tf.config.get_visible_devices("GPU")
    for device in gpus:
        tf.config.experimental.set_memory_growth(device, True)
    models = {}
    neuron_activations = {}
    neuron_scores = {}
    saliency = {}
    if args.train:
        print("Begin model training")
        for mode in modes:
            predictions[mode] = pd.DataFrame(0, index=meta_df[mode].index,
                                             columns=list(config["models"].keys()))
            predictions[mode] = pd.merge(meta_df[mode], predictions[mode], left_index=True, right_index=True)
        for model_name, model_config in config["models"].items():
            model_out_path = join(config["out_path"], "models", model_name)
            if not exists(model_out_path):
                makedirs(model_out_path)
            scale_values["train"].to_csv(join(model_out_path, "scale_values_" + model_name + ".csv"),
                                         index_label="variable")
            models[model_name] = BaseConvNet(**model_config)
            models[model_name].fit(input_scaled["train"].values, labels["train"],
                                   val_x=input_scaled["val"].values, val_y=labels["val"])
            models[model_name].save_model(model_out_path, model_name)
            for mode in modes:
                predictions[mode].loc[:, model_name] = models[model_name].predict(input_scaled[mode].values)
        for mode in modes:
            predictions[mode].to_csv(
                join(config["out_path"], "metrics", f"predictions_{mode}.csv"), index_label="index")

        print("Calculate metrics")
        if config["classifier"]:
            model_scores = classifier_metrics(labels["test"], predictions["test"][list(config["models"].keys())])
            model_scores.to_csv(join(config["out_path"], "metrics", "model_test_scores.csv"), index_label="model_name")
    if args.interp:
        for model_name, model_config in config["models"].items():
            if model_name not in models.keys():
                model_out_path = join(config["out_path"], "models", model_name)
                models[model_name] = load_conv_net(model_out_path, model_name)
            neuron_columns = [f"neuron_{n:03d}" for n in range(models[model_name].dense_neurons)]
            neuron_activations[model_name] = {}
            neuron_scores[model_name] = pd.DataFrame(0, columns=neuron_columns, index=modes)
            saliency[model_name] = {}
            for mode in modes:
                neuron_activations[model_name][mode] = pd.merge(meta_df[mode], pd.DataFrame(0, columns=neuron_columns,
                                                                                            index=meta_df[mode].index),
                                                                left_index=True, right_index=True)
                neuron_activations[model_name][mode].loc[:, neuron_columns] = models[model_name].output_hidden_layer(
                    input_scaled[mode].values)
                neuron_activations[model_name][mode].to_csv(join(config["out_path"], "data",
                                                                 f"neuron_activations_{model_name}_{mode}.csv"),
                                                            index_label="index")
                saliency[model_name][mode] = models[model_name].saliency(input_scaled[mode])

                saliency[model_name][mode].to_netcdf(join(config["out_path"], "data",
                                                          f"neuron_saliency_{model_name}_{mode}.nc"),
                                                     encoding={"saliency": {"zlib": True,
                                                                            "complevel": 4,
                                                                            "shuffle": True,
                                                                            "least_significant_digit": 3}})
                if config["classifier"]:
                    neuron_scores[model_name].loc[mode] = score_neurons(labels[mode],
                                                                        neuron_activations[model_name][mode][
                                                                            neuron_columns].values)
                else:
                    neuron_scores[model_name].loc[mode] = score_neurons(labels[mode],
                                                                        neuron_activations[model_name][mode][
                                                                            neuron_columns].values,
                                                                        metric="r")
                del saliency[model_name][mode]
            neuron_scores[model_name].to_csv(join(config["out_path"], "metrics",
                                                  f"neuron_scores_{model_name}.csv"), index_label="mode")
            del models[model_name], neuron_activations[model_name]

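    # Fit each configured GMM on the training-split neuron activations, then write cluster probabilities and labels for every split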
    if args.train_gmm:
        print('Begin Training Gaussian Mixture Model(s)')
        cluster_df = {}
        GMM = {}
        for model_name, model_config in config["models"].items():
            neuron_activations[model_name] = {}
            for mode in modes:
                neuron_activations[model_name][mode] = pd.read_csv(join(config["out_path"], "data",
                                                                        f"neuron_activations_{model_name}_{mode}.csv"))
                X = neuron_activations[model_name][mode].loc[
                    :, neuron_activations[model_name][mode].columns.str.contains('neuron')]
                for GMM_mod_name, GMM_config in config["GMM_models"].items():
                    if mode == "train":
                        GMM[GMM_mod_name] = GaussianMixture(**GMM_config).fit(X)
                    cluster_df[GMM_mod_name] = {}
                    cluster_df[GMM_mod_name][mode] = pd.DataFrame(GMM[GMM_mod_name].predict_proba(X),
                                                                  columns=[f"cluster {i}" for i in range(
                                                                      GMM_config['n_components'])])
                    cluster_df[GMM_mod_name][mode]['label prob'] = cluster_df[GMM_mod_name][mode].max(axis=1)
                    cluster_df[GMM_mod_name][mode]['label'] = GMM[GMM_mod_name].predict(X)
                    neuron_activations[model_name][mode].merge(
                        cluster_df[GMM_mod_name][mode], right_index=True, left_index=True).to_csv(join(
                        config["out_path"], "data", f"{model_name}_{GMM_mod_name}_{mode}_clusters.csv"), index=False)
                    joblib.dump(GMM[GMM_mod_name], join(
                        config["out_path"], "models", f'{model_name}_{GMM_mod_name}.mod'))

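    # Plot neuron composites, saliency composites, and top activations per model, split, and variable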
    if args.plot:
        print("Begin plotting")
        if "plot_kwargs" not in config.keys():
            config["plot_kwargs"] = {}
        for model_name, model_config in config["models"].items():
            print(model_name)
            if model_name not in models.keys():
                model_out_path = join(config["out_path"], "models", model_name)
                models[model_name] = load_conv_net(model_out_path, model_name)
                neuron_activations[model_name] = {}
                neuron_scores[model_name] = pd.read_csv(join(config["out_path"], "metrics",
                                                             f"neuron_scores_{model_name}.csv"), index_col="mode")
                saliency[model_name] = {}
            for mode in modes:
                print(mode)
                if mode not in neuron_activations[model_name].keys():
                    neuron_activations[model_name][mode] = pd.read_csv(join(config["out_path"], "data",
                                                                            f"neuron_activations_{model_name}_{mode}.csv"),
                                                                       index_col="index")
                    saliency[model_name][mode] = xr.open_dataarray(join(config["out_path"], "data",
                                                                        f"neuron_saliency_{model_name}_{mode}.nc"))
                for variable_name in config["input_variables"]:
                    print(variable_name)
                    if variable_name not in config["plot_kwargs"].keys():
                        plot_kwargs = None
                    else:
                        plot_kwargs = config["plot_kwargs"][variable_name]
                    plot_out_path = join(config["out_path"], "plots")
                    plot_neuron_composites(plot_out_path,
                                           model_name + "_" + mode,
                                           input_combined[mode],
                                           neuron_activations[model_name][mode].values,
                                           neuron_scores[model_name].loc[mode].values,
                                           variable_name, plot_kwargs=plot_kwargs)
                    plot_saliency_composites(plot_out_path,
                                             model_name + "_" + mode,
                                             saliency[model_name][mode], neuron_activations[model_name][mode].values,
                                             neuron_scores[model_name].loc[mode].values,
                                             variable_name)
                    plot_top_activations(plot_out_path,
                                         model_name + "_" + mode,
                                         input_combined[mode], meta_df[mode],
                                         neuron_activations[model_name][mode],
                                         neuron_scores[model_name].loc[mode].values,
                                         saliency[model_name][mode],
                                         variable_name, plot_kwargs=plot_kwargs)
                del saliency[model_name][mode]
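    # Cluster diagnostics: probability distributions and CDFs, CAPE/shear composites, spatial and diurnal activation patterns, and example storm patches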
    if args.plot2:
        print("Additional Plotting...")
        for model_name in config["models"].keys():
            for mode in ["val"]:
                for GMM_mod_name, GMM_config in config["GMM_models"].items():
                    plot_out_path = join(config["out_path"], "plots", model_name, GMM_mod_name)
                    if not exists(plot_out_path):
                        makedirs(plot_out_path, exist_ok=True)
                    cluster_df = pd.read_csv(join(
                        config["out_path"], "data", f"{model_name}_{GMM_mod_name}_{mode}_clusters.csv"))
                    plot_prob_dist(cluster_df, plot_out_path, GMM_mod_name, GMM_config["n_components"])
                    plot_prob_cdf(cluster_df, plot_out_path, GMM_mod_name, GMM_config["n_components"])
                    cape_shear_modes(cluster_df, plot_out_path, config["data_path"], mode, model_name,
                                     gmm_name=GMM_mod_name, cluster=True, num_storms=1000)
                    spatial_neuron_activations(cluster_df, plot_out_path, mode, model_name,
                                               gmm_name=GMM_mod_name, cluster=True)
                    diurnal_neuron_activations(cluster_df, plot_out_path, mode, model_name,
                                               gmm_name=GMM_mod_name, cluster=True)
                    for prob_type in ['highest', 'lowest']:
                        plot_storm_clusters(config['data_path'], plot_out_path, cluster_df,
                                            n_storms=25,
                                            patch_radius=config["patch_radius"],
                                            prob_type=prob_type,
                                            seed=config["random_seed"])

    return
Example #6
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument("config", help="Name of the config file.")
    parser.add_argument("-p",
                        "--plot_activation",
                        action="store_true",
                        help="Plot storms by specified active neurons.")
    args = parser.parse_args()
    if not exists(args.config):
        raise FileNotFoundError(args.config + " not found.")
    with open(args.config, "r") as config_file:
        config = yaml.load(config_file, Loader=yaml.Loader)
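    # Create output directories for neuron activations and storm-mode labels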
    for path in [config["activation_path"], config["labels_path"]]:
        if not exists(path):
            makedirs(path)
    models, gmms, neuron_activations = {}, {}, {}
    if config["run_start_date"] == "today":
        if config['run_freq'] == 'hourly':
            start_str = (pd.Timestamp(config["run_start_date"], tz="UTC") -
                         pd.Timedelta(hours=3)).strftime("%Y%m%d-%H00")
            end_str = (pd.Timestamp(config["run_end_date"], tz="UTC") -
                       pd.Timedelta(hours=3)).strftime("%Y%m%d-%H00")
        elif config['run_freq'] == 'daily':
            start_str = (pd.Timestamp(config["run_start_date"],
                                      tz="UTC")).strftime("%Y%m%d-0000")
            end_str = (pd.Timestamp(config["run_end_date"],
                                    tz="UTC")).strftime("%Y%m%d-0000")
    else:
        start_str = (pd.Timestamp(config["run_start_date"],
                                  tz="UTC")).strftime("%Y%m%d-%H00")
        end_str = (pd.Timestamp(config["run_end_date"],
                                tz="UTC")).strftime("%Y%m%d-%H00")
    if start_str != end_str:
        date_str = start_str + '_' + end_str
    else:
        date_str = start_str
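    # For each model: rescale inputs, compute activations or CNN predictions, assign storm-mode labels, and write them out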
    for model_type, model_dict in config["models"].items():
        for model_name in model_dict.keys():

            scale_values = pd.read_csv(
                join(config["model_path"], model_name,
                     f"scale_values_{model_name}.csv"))
            scale_values['variable'] = model_dict[model_name][
                'input_variables']
            scale_values = scale_values.set_index('variable')

            print('Loading storm patches...')
            input_data, output, meta = load_patch_files(
                config["run_start_date"], config["run_end_date"],
                config["run_freq"], config["data_path"],
                model_dict[model_name]["input_variables"],
                model_dict[model_name]["output_variables"],
                config["meta_variables"],
                model_dict[model_name]["patch_radius"])

            input_combined = combine_patch_data(
                input_data, model_dict[model_name]["input_variables"])
            print('COMBINED VARNAMES: ', input_combined['var_name'])
            input_scaled, scale_values = min_max_scale(input_combined,
                                                       scale_values)
            print("Input shape:", input_scaled.shape)
            meta_df = get_meta_scalars(meta)
            geometry_df, skips = get_contours(meta)
            model_out_path = join(config["model_path"], model_name)
            models[model_name] = load_conv_net(model_out_path, model_name)
            print(model_name, f'({model_type})')
            print(models[model_name].model_.summary())

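            # Semi-supervised models get GMM cluster labels from hidden-layer activations; supervised models predict labels directly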
            if model_type == 'semi_supervised':

                neuron_columns = [
                    f"neuron_{n:03d}"
                    for n in range(models[model_name].dense_neurons)
                ]
                neuron_activations[model_name] = pd.merge(
                    meta_df,
                    pd.DataFrame(0,
                                 columns=neuron_columns,
                                 index=meta_df.index),
                    left_index=True,
                    right_index=True)
                neuron_activations[model_name].loc[:, neuron_columns] = \
                    models[model_name].output_hidden_layer(input_scaled.values)

                neuron_activations[model_name].to_csv(join(
                    config["activation_path"],
                    f'{model_name}_activations_{date_str}.csv'),
                                                      index=False)

                gmms[model_name] = joblib.load(
                    join(config["model_path"], model_name,
                         f'{model_name}.gmm'))
                cluster_assignments = joblib.load(
                    join(config["model_path"], model_name,
                         f'{model_name}_gmm_labels.dict'))
                labels = predict_labels_gmm(neuron_activations[model_name],
                                            neuron_columns, gmms[model_name],
                                            cluster_assignments)
                labels = pd.merge(labels, geometry_df)

            elif model_type == 'supervised':

                labels = predict_labels_cnn(input_scaled, meta_df,
                                            models[model_name])
                labels = pd.merge(labels, geometry_df)

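            # Attach the aggregated storm attributes from the hourly model CSVs to the labels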
            agg_frames = []
            for d in pd.date_range(start_str.replace('-', ''),
                                   end_str.replace('-', ''),
                                   freq=config['run_freq'][0]):
                agg_df = pd.read_csv(
                    join(
                        config['data_path'].replace('nc', 'csv'),
                        f'{config["csv_model_prefix"]}{d.strftime("%Y%m%d-%H00")}.csv'
                    ))
                agg_frames.append(agg_df)
            agg_storm_data = pd.concat(agg_frames).reset_index(drop=True)

            labels['MAX_UPHL'] = pd.merge(
                labels, agg_storm_data.drop(skips),
                on=labels.index)[config["agg_variables"]]

            if config['output_format'] == 'csv':
                labels.to_csv(join(config["labels_path"],
                                   f'{model_name}_labels_{date_str}.csv'),
                              index_label=False)
            elif config['output_format'] == 'parquet':
                labels.to_parquet(
                    join(config["labels_path"],
                         f'{model_name}_labels_{date_str}.parquet'))
            else:
                raise ValueError(
                    f'File format {config["output_format"]} not found. Please use "parquet" or "csv"'
                )
            print(
                'Wrote',
                join(
                    config["labels_path"],
                    f'{model_name}_labels_{date_str}.{config["output_format"]}'
                ))

    print("Completed.")

    return
Example #7
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument("config", help="Name of the config file.")
    parser.add_argument("-e", "--eval", action="store_true", help="Evaluate conditional probabilities.")
    args = parser.parse_args()
    if not exists(args.config):
        raise FileNotFoundError(args.config + " not found.")
    with open(args.config, "r") as config_file:
        config = yaml.load(config_file, Loader=yaml.Loader)
    models, gmms, neuron_activations, labels = {}, {}, {}, {}
    if config["run_start_date"] == "today":
        if config['run_freq'] == 'hourly':
            start_str = (pd.Timestamp(config["run_start_date"], tz="UTC") - pd.Timedelta(hours=3)).strftime("%Y%m%d-%H00")
            end_str = (pd.Timestamp(config["run_end_date"], tz="UTC") - pd.Timedelta(hours=3)).strftime("%Y%m%d-%H00")
        elif config['run_freq'] == 'daily':
            start_str = (pd.Timestamp(config["run_start_date"], tz="UTC")).strftime("%Y%m%d-0000")
            end_str = (pd.Timestamp(config["run_end_date"], tz="UTC")).strftime("%Y%m%d-0000")
    else:
        start_str = (pd.Timestamp(config["run_start_date"], tz="UTC")).strftime("%Y%m%d-%H00")
        end_str = (pd.Timestamp(config["run_end_date"], tz="UTC")).strftime("%Y%m%d-%H00")
    if start_str != end_str:
        date_str = start_str + '_' + end_str
    else:
        date_str = start_str

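    # Aggregate the hourly storm-attribute CSVs available over the run period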
    storm_frames = []
    for d in pd.date_range(start_str.replace('-', ''), end_str.replace('-', ''), freq=config['run_freq'][0]):
        file_path = join(config["data_path"].replace('_nc/', '_csv/'),
                         f'{config["csv_model_prefix"]}{d.strftime("%Y%m%d-%H00")}.csv')
        if exists(file_path):
            df = pd.read_csv(file_path)
            storm_frames.append(df)

    storm_data = pd.concat(storm_frames).reset_index(drop=True)

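    # Label storms with each configured model: GMM on neuron activations (semi-supervised) or direct CNN predictions (supervised)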
    for model_type, model_dict in config["models"].items():
        for model_name in model_dict.keys():
            model_path = join(model_dict[model_name]["model_path"], "models", model_name)
            label_path = join(model_dict[model_name]["model_path"], 'labels')
            if not exists(label_path):
                makedirs(label_path)
            scale_values = pd.read_csv(join(model_path, f"scale_values_{model_name}.csv"))
            scale_values['variable'] = model_dict[model_name]['input_variables']
            scale_values = scale_values.set_index('variable')

            print('Loading storm patches...')
            input_data, output, meta = load_patch_files(config["run_start_date"],
                                                        config["run_end_date"],
                                                        config["run_freq"],
                                                        config["data_path"],
                                                        model_dict[model_name]["input_variables"],
                                                        model_dict[model_name]["output_variables"],
                                                        config["meta_variables"],
                                                        model_dict[model_name]["patch_radius"])

            input_combined = combine_patch_data(input_data, model_dict[model_name]["input_variables"])
            print('COMBINED VARNAMES: ', input_combined['var_name'])
            input_scaled, scale_values = min_max_scale(input_combined, scale_values)
            print("Input shape:", input_scaled.shape)
            meta_df = get_meta_scalars(meta)
            models[model_name] = load_conv_net(model_path, model_name)
            print(model_name, f'({model_type})')
            print(models[model_name].model_.summary())

            if model_type == 'semi_supervised':

                neuron_columns = [f"neuron_{n:03d}" for n in range(models[model_name].dense_neurons)]
                neuron_activations[model_name] = pd.merge(meta_df, pd.DataFrame(0, columns=neuron_columns,
                                                          index=meta_df.index), left_index=True, right_index=True)
                neuron_activations[model_name].loc[:, neuron_columns] = \
                    models[model_name].output_hidden_layer(input_scaled.values)

                gmms[model_name] = joblib.load(f"{model_path}_GMM_1.mod")
                cluster_assignments = joblib.load(join(model_path, f'{model_name}_GMM_1_gmm_labels.dict'))

                labels[model_name] = predict_labels_gmm(neuron_activations[model_name], neuron_columns, gmms[model_name],
                                                        cluster_assignments)
                labels[model_name] = pd.merge(labels[model_name], meta_df)

            elif model_type == 'supervised':

                labels[model_name] = predict_labels_cnn(input_scaled, meta_df, models[model_name])
                labels[model_name] = pd.merge(labels[model_name], meta_df)

            labels[model_name][config["agg_variables"]] = pd.merge(labels[model_name], storm_data,
                                                                   on=labels[model_name].index)[config["agg_variables"]]
            file_name = join(model_dict[model_name]["model_path"], "labels",
                                       f"{model_name}_labels_{date_str}.{config['output_format']}")
            save_labels(labels=labels[model_name],
                        file_path=file_name,
                        format=config["output_format"])

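            # Optional evaluation: download SPC storm reports and grid both observations and predicted labels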
            if args.eval:
                storm_report_path = config["storm_report_path"]
                if not exists(storm_report_path):
                    makedirs(storm_report_path, exist_ok=False)
                start_date = (pd.Timestamp(config["run_start_date"], tz="UTC")).strftime("%Y%m%d0000")
                end_date = (pd.Timestamp(config["run_end_date"], tz="UTC")).strftime("%Y%m%d0000")
                for report_type in ['filtered_torn', 'filtered_hail', 'filtered_wind']:
                    print(f'Downloading SPC storm reports from {start_date} through {end_date} for {report_type}')
                    fetch_storm_reports(start_date, end_date, storm_report_path, report_type)

                if not isfile(join(model_dict[model_name]["model_path"], "labels", f"obs_{date_str}.nc")):
                    print('Aggregating storm reports to a grid.')
                    obs = generate_obs_grid(beg=start_date,
                                            end=end_date,
                                            storm_report_path=storm_report_path,
                                            model_grid_path=config["model_grid_path"])
                    file_name = join(model_dict[model_name]["model_path"], "labels", f"obs_{date_str}.nc")
                    obs.to_netcdf(file_name)
                    print(f"Wrote {file_name}.")

                print("Aggregating storm mode labels to a grid.")
                data = generate_mode_grid(beg=start_date,
                                          end=end_date,
                                          labels=labels[model_name],
                                          model_grid_path=config["model_grid_path"],
                                          min_lead_time=1,
                                          max_lead_time=24,
                                          run_date_freq='1d',
                                          bin_width=None)
                file_name = join(model_dict[model_name]["model_path"], "labels", config["physical_model"],
                                 f"{model_name}_gridded_labels_{date_str}.nc")
                data.to_netcdf(file_name)
                print(f"Wrote {file_name}.")
    return