Example #1
def run_variational_hp(dump_dir, debug, seed):
    """Variational dropout and weight decay tests
        Default configurations used when not specified
        e.g. no redshift
    """

    lu.print_green(f"SEED {seed}: VARIATIONAL HP")

    if seed != LIST_SEED[0]:
        return

    list_dropout_values = [0.01, 0.05, 0.1]
    list_weight_decay = [0, 1E-7, 1E-5]

    if debug is True:
        list_dropout_values = list_dropout_values[:1]
        list_weight_decay = list_weight_decay[:1]

    for (dropout, weight_decay) in product(
        list_dropout_values, list_weight_decay
    ):
        cmd = (
            f"python -W ignore run.py --train_rnn "
            f"--model variational "
            f"--dump_dir {dump_dir} "
            f"--cyclic "
            f"--data_fraction 0.2 "
            f"--num_inference_samples 10 "
            f"--dropout {dropout_values} "
            f"--weight_decay {weight_decay} "
        )
        run_cmd(cmd, debug, seed)
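# All of these runners delegate to run_cmd, which is not shown in this
# section. Below is a minimal sketch of what it is assumed to do, inferred
# from the inline debug handling in run_speed (Example #8): append the seed
# and, in debug mode, shorten the run. Hypothetical, not the actual code.
import shlex
import subprocess


def run_cmd_sketch(cmd, debug, seed):
    cmd += f" --seed {seed} "
    if debug is True:
        # Run for 1 epoch only, on a tiny network, with few inference samples
        cmd = cmd.replace("--cyclic ", " ")
        cmd += " --nb_epoch 1 "
        if "num_inference_samples" not in cmd:
            cmd += "--num_inference_samples 2 "
        if "hidden_dim" not in cmd:
            cmd += "--hidden_dim 2 "
    subprocess.check_call(shlex.split(cmd))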
Example #2
def run_representative(dump_dir, debug, seed):
    """Validate saltfit-trained models on the photometry test set"""

    lu.print_green(f"SEED {seed}: REPRESENTATIVE")

    # Get all trained models with saltfit + DF == 1
    list_saltfit = (Path(dump_dir) / "models").glob("**/*saltfit_DF_1.0*.pt")

    # Only run representative on models which have a photometry counterpart
    list_saltfit = [
        m
        for m in list_saltfit
        if Path(str(m).replace("saltfit_DF_1.0", "photometry_DF_0.43")).exists()
    ]

    for model_file in list_saltfit:

        # Validate saltfit model on photometry test set
        cmd = (
            f"python -W ignore run.py "
            f"--validate_rnn "
            f"--override_source_data photometry "
            f"--model_files {model_file} "
            f"--dump_dir {dump_dir} "
        )

        run_cmd(cmd, debug, seed)
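# The saltfit/photometry pairing above is a plain substring swap on the model
# path. A toy illustration with a hypothetical filename (the real naming
# scheme is not shown in this section):
from pathlib import Path

m = Path("models/vanilla/vanilla_saltfit_DF_1.0.pt")  # hypothetical path
counterpart = Path(str(m).replace("saltfit_DF_1.0", "photometry_DF_0.43"))
print(counterpart)  # models/vanilla/vanilla_photometry_DF_0.43.pt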
Example #3
def run_baseline_tmp(dump_dir, debug, seed):
    """Baseline peak normalization and random start tests"""

    lu.print_green(f"SEED {seed}: BASELINE TMP")

    if seed != LIST_SEED[0]:
        return

    list_peak_norm = [None, "basic", "log"]
    list_random_start = [False, True]

    for (peak_norm, random_start) in product(
        list_peak_norm, list_random_start
    ):
        cmd = (
            f"python -W ignore run.py --train_rnn "
            f"--dump_dir {dump_dir} "
            f"--cyclic "
        )
        if peak_norm:
            cmd += f"--peak_norm {peak_norm} "
        if random_start:
            cmd += "--random_start "
        run_cmd(cmd, debug, seed)
Example #4
def run_baseline_hp(dump_dir, debug, seed):
    """Baseline RNN hyperparameter grid search"""

    lu.print_green(f"SEED {seed}: BASELINE HP")

    if seed != LIST_SEED[0]:
        return

    list_batch_size = [64, 128, 512]
    list_dropout = [0.05, 0.1, 0.2]
    list_num_layers = [1, 2]
    list_layer_type = ["gru", "lstm"]
    list_bidirectional = [True, False]
    list_rnn_output_option = ["standard", "mean"]
    list_random_length = [True, False]
    list_hidden_dim = [16, 32]

    if debug is True:
        list_batch_size = list_batch_size[:1]
        list_dropout = list_dropout[:1]
        list_hidden_dim = list_hidden_dim[:1]

    for (
        batch_size,
        dropout,
        num_layers,
        layer_type,
        bidirectional,
        rnn_output_option,
        random_length,
        hidden_dim,
    ) in product(
        list_batch_size,
        list_dropout,
        list_num_layers,
        list_layer_type,
        list_bidirectional,
        list_rnn_output_option,
        list_random_length,
        list_hidden_dim,
    ):
        cmd = (
            f"python -W ignore run.py --train_rnn "
            f"--dump_dir {dump_dir} "
            f"--cyclic "
            f"--data_fraction 0.2 "
            f"--dropout {dropout} "
            f"--batch_size {batch_size} "
            f"--layer_type {layer_type} "
            f"--num_layers {num_layers} "
            f"--bidirectional {bidirectional} "
            f"--random_length {random_length} "
            f"--rnn_output_option {rnn_output_option} "
            f"--hidden_dim {hidden_dim} "
        )
        run_cmd(cmd, debug, seed)
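# Without the debug truncation, the grid above multiplies out to
# 3 * 3 * 2 * 2 * 2 * 2 * 2 * 2 = 576 training runs. A quick sanity check:
from math import prod

grid_sizes = [3, 3, 2, 2, 2, 2, 2, 2]  # lengths of the eight lists above
assert prod(grid_sizes) == 576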
Example #5
def save_randomforest_model(settings, clf):
    """Save RandomForest model

    Args:
        settings (ExperimentSettings): controls experiment hyperparameters
        clf (RandomForestClassifier): RandomForest model
    """

    filename = f"{settings.rf_dir}/{settings.randomforest_model_name}.pickle"
    with open(filename, "wb") as f:
        pickle.dump(clf, f)
    lu.print_green("Saved model")
Example #6
def run_variational_best(dump_dir, debug, seed):
    """Variational dropout and weight decay tests
        Default configurations used when not specified
        e.g. no redshift
    """

    lu.print_green(f"SEED {seed}: VARIATIONAL BEST")

    list_nb_classes = [2, 3, 7]
    list_redshift = [None, "zpho", "zspe"]
    list_data_fraction = [0.43, 1.0]
    for (nb_classes, redshift, data_fraction) in product(
        list_nb_classes, list_redshift, list_data_fraction
    ):

        # Carry out representativeness only in binary classification
        if data_fraction == 0.43 and nb_classes != 2:
            continue

        cmd = (
            f"python -W ignore run.py --train_rnn "
            f"--model variational "
            f"--source_data photometry "
            f"--dump_dir {dump_dir} "
            f"--cyclic "
            f"--data_fraction {data_fraction} "
            f"--dropout 0.01 "
            f"--weight_decay 1e-7 "
            f"--nb_classes {nb_classes} "
        )
        if redshift is not None:
            cmd += f" --redshift {redshift} "

        run_cmd(cmd, debug, seed)

    list_data_fraction = [0.5, 1.0]
    for (redshift, data_fraction) in product(list_redshift, list_data_fraction):
        cmd = (
            f"python -W ignore run.py --train_rnn "
            f"--model variational "
            f"--source_data saltfit "
            f"--dump_dir {dump_dir} "
            f"--cyclic "
            f"--data_fraction {data_fraction} "
            f"--dropout 0.01 "
            f"--weight_decay 1e-7 "
        )
        if redshift is not None:
            cmd += f" --redshift {redshift} "

        run_cmd(cmd, debug, seed)
Example #7
def run_benchmark_cyclic(dump_dir, debug, seed):
    """Benchmark cyclic learning-rate training against standard training"""

    lu.print_green(f"SEED {seed}: BENCHMARK CYCLIC")

    if seed != LIST_SEED[0]:
        return

    models = ["vanilla"]
    training_method = ["cyclic", ""]
    data_fraction = [0.25, 0.5, 1.0]
    cyclic_phases = [["5", "10", "15"], ["10", "20", "25"]]
    source_data = "saltfit"
    purpose = "train_rnn"

    if debug is True:
        cyclic_phases = [["1", "2", "3"]]
        data_fraction = data_fraction[:1]

    for model, training_method, data_fraction in product(
        models, training_method, data_fraction
    ):

        cmd = (
            f"python -W ignore run.py --{purpose} "
            f"--model {model} "
            f"--dump_dir {dump_dir} "
            f"--source_data {source_data} "
            f"--data_fraction {data_fraction} "
        )
        if training_method == "cyclic":
            cmd += "--cyclic "
            for cyclic_phase in cyclic_phases:
                cmd += f"--cyclic_phases {' '.join(cyclic_phase)} "
                # Set debug to False to avoid overriding
                run_cmd(cmd, False, seed)
        else:
            run_cmd(cmd, debug, seed)

    # Load and save results
    list_logs = (Path(dump_dir) / "models").glob("**/training_log.json")
    list_df = []
    for log_file in list_logs:
        with open(log_file, "r") as f:
            df_cyclic = pd.DataFrame.from_dict(json.load(f), orient="index")
            list_df.append(df_cyclic)

    df_cyclic = pd.concat(list_df)

    with open(Path(dump_dir) / "stats/cyclic_stats.tex", "w") as tf:
        tf.write(df_cyclic.to_latex())
Example #8
def run_speed(dump_dir, debug, seed):
    """Benchmark model speed with and without CUDA"""

    lu.print_green(f"SEED {seed}: SPEED")

    if seed != LIST_SEED[0]:
        return

    models = ["vanilla", "variational", "bayesian"]
    use_cuda = [True, False]
    source_data = "saltfit"
    purpose = "speed"

    for model, use_cuda in product(models, use_cuda):

        # No cuda benchmark if cuda is not available
        if use_cuda is True and not torch.cuda.is_available():
            continue

        cmd = (
            f"python -W ignore run.py --{purpose} "
            f"--model {model} "
            f"--cyclic "
            f"--dump_dir {dump_dir} "
            f"--source_data {source_data} "
        )
        if use_cuda:
            cmd += "--use_cuda "

        # Call subprocess directly: going through run_cmd
        # may otherwise interfere with the GPU options
        if debug is True:
            # Run for 1 epoch only
            cmd = cmd.replace("--cyclic ", " ")
            cmd = cmd + " --nb_epoch 1 "

            if "num_inference_samples" not in cmd:
                # Make inference faster
                cmd = cmd + "--num_inference_samples 2 "

            if "hidden_dim" not in cmd:
                # Decrease NN size
                cmd = cmd + "--hidden_dim 2 "

        subprocess.check_call(shlex.split(cmd))

    # Create plots with the results
    plot_speed_benchmark(dump_dir)
Example #9
def run_performance(dump_dir, debug, seed):
    """Performance and plots
    """

    lu.print_green(f"SEED {seed}: PERFORMANCE")

    # Make sure all PRED files have accompanying METRICS file
    list_predictions = (Path(dump_dir) / "models").glob("**/*PRED*.pickle")
    prediction_files_str = " ".join(list(map(str, list_predictions)))

    cmd = (
        f"python run.py --metrics "
        f"--prediction_files {prediction_files_str} "
        f"--dump_dir {dump_dir} "
    )
    run_cmd(cmd, debug, seed)

    # Aggregate all metrics
    cmd = f"python -W ignore run.py --performance --dump_dir {dump_dir} "
    run_cmd(cmd, debug, seed)
Example #10
def load_randomforest_model(settings, model_file=None):
    """Load RandomForest model

    Args:
        settings (ExperimentSettings): controls experiment hyperparameters
        model_file (str): path to saved randomforest model. Default: ``None``

    Returns:
        (RandomForestClassifier) RandomForest model
    """

    if model_file is None:
        model_file = f"{settings.rf_dir}/{settings.randomforest_model_name}.pickle"
    assert os.path.isfile(model_file), f"Model file not found: {model_file}"
    with open(model_file, "rb") as f:
        clf = pickle.load(f)
    lu.print_green("Loaded model")

    return clf
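# A round-trip usage sketch for the save/load pair above. The settings
# stand-in is hypothetical: it provides only the two attributes these
# functions read, and the directory is created here, whereas the real
# ExperimentSettings presumably handles that itself.
import os
from types import SimpleNamespace

from sklearn.ensemble import RandomForestClassifier

settings = SimpleNamespace(rf_dir="/tmp/rf", randomforest_model_name="rf_model")
os.makedirs(settings.rf_dir, exist_ok=True)

clf = RandomForestClassifier(n_estimators=10)
save_randomforest_model(settings, clf)
clf_loaded = load_randomforest_model(settings)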
Example #11
def run_bayesian_hp(dump_dir, debug, seed):
    """Bayesian scale tests
        Default configurations used when not specified
        e.g. data_fraction(1),no redshift
    """

    lu.print_green(f"SEED {seed}: BAYESIAN HP")

    if seed != LIST_SEED[0]:
        return

    list_params = [[-2, -7, 4, 3], [-1, -7, 4, 3], [-2, -1, 20, 5]]
    list_params_output = [[-1, -0.5, 2, 1], [-0.5, -0.1, 2, 1], [-0.5, -0.1, 3, 2]]

    if debug is True:
        list_params = list_params[:1]
        list_params_output = list_params_output[:1]

    for (params, params_output) in product(list_params, list_params_output):
        log_sigma1, log_sigma2, rho_scale_lower, rho_scale_upper = params
        (
            log_sigma1_output,
            log_sigma2_output,
            rho_scale_lower_output,
            rho_scale_upper_output,
        ) = params_output

        cmd = (
            f"python -W ignore run.py --train_rnn "
            f"--model bayesian "
            f"--dump_dir {dump_dir} "
            f"--data_fraction 0.2 "
            f"--num_inference_samples 10 "
            f"--log_sigma1 {log_sigma1} "
            f"--log_sigma2 {log_sigma2} "
            f"--rho_scale_lower {rho_scale_lower} "
            f"--rho_scale_upper {rho_scale_upper} "
            f"--log_sigma1_output {log_sigma1_output} "
            f"--log_sigma2_output {log_sigma2_output} "
            f"--rho_scale_lower_output {rho_scale_lower_output} "
            f"--rho_scale_upper_output {rho_scale_upper_output} "
        )
        run_cmd(cmd, debug, seed)
Example #12
def train_and_evaluate_randomforest_model(clf, X_train, y_train, X_val, y_val):
    """Train a RandomForestClassifier and evaluate AUC, precision, accuracy
    on a validation set

    Args:
        clf (RandomForestClassifier): RandomForest model to fit and evaluate
        X_train (np.array): the training features
        y_train (np.array): the training target
        X_val (np.array): the validation features
        y_val (np.array): the validation target

    Returns:
        (RandomForestClassifier) the fitted model
    """
    lu.print_green("Fitting RandomForest...")
    clf = clf.fit(X_train, y_train)
    lu.print_green("Fitting complete")

    # Evaluate our classifier
    probas_ = clf.predict_proba(X_val)
    # Compute AUC and precision
    fpr, tpr, thresholds = metrics.roc_curve(y_val, probas_[:, 1])
    roc_auc = metrics.auc(fpr, tpr)
    pscore = metrics.precision_score(y_val,
                                     clf.predict(X_val),
                                     average="binary")
    lu.print_green("Validation AUC", roc_auc)
    lu.print_green("Validation precision score", pscore)

    lu.print_green(
        "Train data accuracy",
        100 * (sum(clf.predict(X_train) == y_train)) / X_train.shape[0],
    )
    lu.print_green("Val data accuracy",
                   100 * (sum(clf.predict(X_val) == y_val)) / X_val.shape[0])

    return clf
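# A self-contained usage sketch with synthetic binary data. The target must
# be 0/1 because precision_score is called with average="binary" above.
import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.RandomState(0)
X = rng.randn(200, 5)
y = (X[:, 0] + 0.1 * rng.randn(200) > 0).astype(int)

clf = RandomForestClassifier(n_estimators=50, random_state=0)
clf = train_and_evaluate_randomforest_model(clf, X[:150], y[:150], X[150:], y[150:])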
Example #13
def load_HDF5(settings, test=False):
    """Load data from HDF5

    Args:
        settings (ExperimentSettings): controls experiment hyperparameters
        test (bool): If True: load data for test. Default: ``False``

    Returns:
        list_data_test (list) test data tuples if test is True

        or

        Tuple containing
            - list_data_train (list): training data tuples
            - list_data_val (list): validation data tuples
    """
    file_name = f"{settings.processed_dir}/database.h5"
    lu.print_green(f"Loading {file_name}")

    with h5py.File(file_name, "r") as hf:

        list_data_train = []
        list_data_val = []

        config_name = f"{settings.source_data}_{settings.nb_classes}classes"

        dataset_split_key = f"dataset_{config_name}"
        target_key = f"target_{settings.nb_classes}classes"

        if any([settings.train_plasticc, settings.predict_plasticc]):
            target_key = "target"
            dataset_split_key = "dataset"

        if test:
            # Failsafe in case the dataset and the model were built with
            # different class counts; a 2-class split is always available
            try:
                idxs_test = np.where(hf[dataset_split_key][:] == 2)[0]
            except KeyError:
                idxs_test = np.where(
                    hf["dataset_photometry_2classes"][:] != 100)[0]
        else:
            idxs_train = np.where(hf[dataset_split_key][:] == 0)[0]
            idxs_val = np.where(hf[dataset_split_key][:] == 1)[0]
            idxs_test = np.where(hf[dataset_split_key][:] == 2)[0]

            # Shuffle for good measure
            np.random.shuffle(idxs_train)
            np.random.shuffle(idxs_val)
            np.random.shuffle(idxs_test)

            idxs_train = idxs_train[:int(settings.data_fraction *
                                         len(idxs_train))]

        n_features = hf["data"].attrs["n_features"]

        training_features = " ".join(hf["features"][:][settings.idx_features])
        lu.print_green("Features used", training_features)

        arr_data = hf["data"][:]
        if test:
            # Same failsafe as above: fall back to the 2-class target
            try:
                arr_target = hf[target_key][:]
            except KeyError:
                arr_target = hf["target_2classes"][:]
        else:
            arr_target = hf[target_key][:]
        arr_SNID = hf["SNID"][:]

        if test is True:
            return fill_data_list(
                idxs_test,
                arr_data,
                arr_target,
                arr_SNID,
                settings,
                n_features,
                "Loading Test Set",
                test,
            )
        else:

            list_data_train = fill_data_list(
                idxs_train,
                arr_data,
                arr_target,
                arr_SNID,
                settings,
                n_features,
                "Loading Training Set",
            )
            list_data_val = fill_data_list(
                idxs_val,
                arr_data,
                arr_target,
                arr_SNID,
                settings,
                n_features,
                "Loading Validation Set",
            )

        return list_data_train, list_data_val
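# load_HDF5 expects a specific layout in database.h5. A toy file with the
# keys read above; the shapes are illustrative only (the real file stores
# ragged light-curve arrays and more metadata).
import h5py
import numpy as np

n, n_features = 100, 4
with h5py.File("/tmp/database.h5", "w") as hf:
    data = hf.create_dataset("data", data=np.random.randn(n, n_features))
    data.attrs["n_features"] = n_features
    hf.create_dataset("features", data=np.array(["f0", "f1", "f2", "f3"], dtype="S8"))
    hf.create_dataset("SNID", data=np.arange(n))
    hf.create_dataset("target_2classes", data=np.random.randint(0, 2, n))
    # 0 = train, 1 = validation, 2 = test, matching the splits read above
    hf.create_dataset("dataset_saltfit_2classes", data=np.random.randint(0, 3, n))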
Example #14
def run_bayesian_best(dump_dir, debug, seed):
    """Bayesian scale tests
        Default configurations used when not specified
        e.g. data_fraction(1),no redshift
    """

    lu.print_green(f"SEED {seed}: BAYESIAN BEST")

    list_nb_classes = [2, 3, 7]
    list_redshift = [None, "zpho", "zspe"]
    list_data_fraction = [0.43, 1.0]
    for (nb_classes, redshift, data_fraction) in product(
        list_nb_classes, list_redshift, list_data_fraction
    ):

        # Carry out representativeness only in binary classification
        if data_fraction == 0.43 and nb_classes != 2:
            continue

        cmd = (
            f"python -W ignore run.py --train_rnn "
            f"--model bayesian "
            f"--source_data photometry "
            f"--dump_dir {dump_dir} "
            f"--data_fraction {data_fraction} "
            f"--nb_classes {nb_classes} "
            f"--log_sigma1 -1 "
            f"--log_sigma2 -7 "
            f"--rho_scale_lower 4 "
            f"--rho_scale_upper 3 "
            f"--log_sigma1_output -0.5 "
            f"--log_sigma2_output -0.1 "
            f"--rho_scale_lower_output 3 "
            f"--rho_scale_upper_output 2 "
        )
        if redshift is not None:
            cmd += f" --redshift {redshift} "

        run_cmd(cmd, debug, seed)

    list_data_fraction = [0.5, 1.0]
    for (redshift, data_fraction) in product(list_redshift, list_data_fraction):
        cmd = (
            f"python -W ignore run.py --train_rnn "
            f"--model bayesian "
            f"--source_data saltfit "
            f"--dump_dir {dump_dir} "
            f"--data_fraction {data_fraction} "
            f"--log_sigma1 -1 "
            f"--log_sigma2 -7 "
            f"--rho_scale_lower 4 "
            f"--rho_scale_upper 3 "
            f"--log_sigma1_output -0.5 "
            f"--log_sigma2_output -0.1 "
            f"--rho_scale_lower_output 3 "
            f"--rho_scale_upper_output 2 "
        )
        if redshift is not None:
            cmd += f" --redshift {redshift} "

        run_cmd(cmd, debug, seed)
Example #15
def run_baseline(dump_dir, debug, seed):
    """Baseline/Random Forest Accuracy vs. number of supernovae
        Default configurations used when not specified
        e.g. source_data(saltfit),modelrnn(vanilla),norm(global)
    """

    lu.print_green(f"SEED {seed}: TRAINING")

    #################################
    # Train baseline models on SALT #
    #################################
    list_data_fraction = [0.5, 1.0] if debug else [0.05, 0.2, 0.3, 0.5, 0.7, 0.9, 1.0]
    list_redshift = [None, "zpho", "zspe"]

    # Train RF models
    for (data_fraction, redshift) in product(list_data_fraction, list_redshift):
        cmd = (
            f"python -W ignore run.py --train_rf "
            f"--data_fraction {data_fraction} "
            f"--dump_dir {dump_dir} "
        )
        if redshift is not None:
            cmd += f" --redshift {redshift} "
        run_cmd(cmd, debug, seed)

    # Train RNN models
    for (data_fraction, redshift) in product(list_data_fraction, list_redshift):
        cmd = (
            f"python -W ignore run.py --train_rnn "
            f"--data_fraction {data_fraction} "
            f"--cyclic "
            f"--dump_dir {dump_dir} "
        )
        if redshift is not None:
            cmd += f" --redshift {redshift} "
        run_cmd(cmd, debug, seed)

    # Train RNN models, varying normalization strategy
    for norm in ["perfilter", "none"]:
        cmd = (
            f"python -W ignore run.py --train_rnn "
            f"--data_fraction 0.5 "
            f"--norm {norm} "
            f"--cyclic "
            f"--dump_dir {dump_dir} "
        )
        run_cmd(cmd, debug, seed)

    #######################################
    # Train baseline models on COMPLETE   #
    # goal: representativeness            #
    #######################################

    for data_fraction, redshift in product([0.43, 0.5], list_redshift):
        cmd = (
            f"python -W ignore run.py --train_rnn "
            f"--data_fraction {data_fraction} "
            f"--cyclic "
            f"--source_data photometry "
            f"--dump_dir {dump_dir} "
        )
        if redshift is not None:
            cmd += f" --redshift {redshift} "
        run_cmd(cmd, debug, seed)

    #######################################
    # Train baseline models on COMPLETE   #
    # goal: multiclass                    #
    #######################################

    list_nb_classes = [2, 3, 7]
    for (redshift, nb_classes) in product(list_redshift, list_nb_classes):
        cmd = (
            f"python -W ignore run.py --train_rnn "
            f"--nb_classes {nb_classes} "
            f"--dump_dir {dump_dir} "
            f"--source_data photometry "
            f"--cyclic "
        )
        if redshift is not None:
            cmd += f" --redshift {redshift} "
        run_cmd(cmd, debug, seed)
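# These runners are presumably driven by a loop over LIST_SEED; several of
# them guard with `if seed != LIST_SEED[0]: return` so that seed-independent
# grids run only once. A hypothetical driver (all values illustrative):
LIST_SEED = [0, 100, 1000]
dump_dir, debug = "/tmp/dump", True

for seed in LIST_SEED:
    run_baseline(dump_dir, debug, seed)
    run_baseline_hp(dump_dir, debug, seed)  # no-op except for the first seed
    run_variational_hp(dump_dir, debug, seed)
    run_performance(dump_dir, debug, seed)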