type=str,
        nargs="+",
        default=AVAILABLE_MODELS,
        choices=AVAILABLE_MODELS,
        help="Determine the models which are being used for this experiment.",
    )
    parser.add_argument(
        "--result-dir",
        type=str,
        default=RESULT_DIR,
        help="Define the directory that results should be saved to.",
    )
    args = parser.parse_args()

    # Loading the data
    dh = DataHandler(args.data_origin)
    feature_names = dh.load_feature_names()
    train_data, test_data, val_data = dh.load_data_splits()

    y_name = dh.load_target_name()

    pipe = pipeline.Pipeline([("scaler", StandardScaler()),
                              ("imputer", SimpleImputer())])

    pipe.fit(train_data[feature_names])
    X_train = pipe.transform(train_data[feature_names])
    X_test = pipe.transform(test_data[feature_names])
    X_val = pipe.transform(val_data[feature_names])

    uncertainties = defaultdict(list)
def perform_hyperparameter_search(
    data_origin: str, models: List[str], result_dir: str, save_top_n: int = 10
):
    """
    Perform hyperparameter search for a list of models and save the results into a directory.

    Parameters
    ----------
    data_origin: str
        Name of the data set that models should be evaluated on.
    models: List[str]
        List specifying the names of models.
    result_dir: str
        Directory that results should be saved to.
    save_top_n: int
        Save the top n parameter configurations. Default is 10.
    """

    data_loader = load_data_from_origin(data_origin)
    dh = DataHandler(**data_loader)

    train_data, _, val_data = dh.load_data_splits()
    feat_names = dh.load_feature_names()
    target_name = dh.load_target_name()

    with tqdm(total=get_num_runs(models)) as progress_bar:

        for model_name in models:

            X_train = train_data[feat_names].values
            X_val = val_data[feat_names].values

            # Scale and impute
            if model_name != "HI-VAE":
                pipe = pipeline.Pipeline(
                    [("scaler", StandardScaler()), ("imputer", SimpleImputer())]
                )
                X_train = pipe.fit_transform(X_train)
                X_val = pipe.transform(X_val)

            y_train, y_val = (
                train_data[target_name].values,
                val_data[target_name].values,
            )

            progress_bar.postfix = f"(model: {model_name})"
            progress_bar.update()
            scores = {}
            model_type = MODEL_CLASSES[model_name]

            sampled_params = sample_hyperparameters(model_name, data_origin)

            for run, param_set in enumerate(sampled_params):

                if model_name in NEURAL_MODELS - DEEP_KERNELS:
                    param_set.update(input_size=len(feat_names))

                model = model_type(**param_set)

                try:
                    try:
                        model.fit(X_train, y_train, **TRAIN_PARAMS[model_name])
                    except AttributeError:
                        model.train(X_train, y_train, **TRAIN_PARAMS[model_name])

                    preds = model.predict(X_val)

                    # Neural predictors: Use the AUC-ROC score
                    if model_name in NEURAL_PREDICTORS | DEEP_KERNELS:
                        # When model training goes completely awry
                        if np.isnan(preds).all():
                            score = 0

                        else:
                            preds = preds[:, 1]
                            score = roc_auc_score(
                                y_true=y_val[~np.isnan(preds)],
                                y_score=preds[~np.isnan(preds)],
                            )
                            print(f"Score: {score}")

                    # Auto-encoders: Use the mean negative reconstruction error (because scores are sorted in descending order)
                    elif model_name in AUTOENCODERS:
                        score = -float(preds.mean())

                    # PPCA: Just use the (mean) log-likelihood
                    else:
                        score = preds.mean()

                # In case of NaNs due to bad training parameters
                except (ValueError, RuntimeError) as e:
                    print(f"There was an error: '{str(e)}', run aborted.")
                    score = -np.inf

                if np.isnan(score):
                    score = -np.inf

                scores[run] = {"score": score, "hyperparameters": param_set}
                progress_bar.update(1)

                # Rank and save results
                # Do after every experiment in case anything goes wrong
                sorted_scores = dict(
                    list(
                        sorted(
                            scores.items(),
                            key=lambda run: run[1]["score"],
                            reverse=True,
                        )
                    )[:save_top_n]
                )
                model_result_dir = f"{result_dir}/{data_origin}/"

                if not os.path.exists(model_result_dir):
                    os.makedirs(model_result_dir)

                with open(f"{model_result_dir}/{model_name}.json", "w") as result_file:
                    result_file.write(json.dumps(sorted_scores, indent=4, default=str))
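
A minimal usage sketch for perform_hyperparameter_search, assuming an entry point like the ones in the surrounding snippets; the flag names and the AVAILABLE_MODELS / RESULT_DIR defaults are borrowed from those snippets rather than confirmed for this particular script:

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--data-origin", type=str, default="MIMIC", help="Which data to use")
    parser.add_argument(
        "--models",
        type=str,
        nargs="+",
        default=AVAILABLE_MODELS,
        choices=AVAILABLE_MODELS,
        help="Determine the models which are being used for this experiment.",
    )
    parser.add_argument("--result-dir", type=str, default=RESULT_DIR)
    args = parser.parse_args()

    # Run the search with the parsed command-line options
    perform_hyperparameter_search(
        data_origin=args.data_origin,
        models=args.models,
        result_dir=args.result_dir,
    )
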
    parser.add_argument(
        "--data-origin", type=str, default="MIMIC", help="Which data to use",
    )
    parser.add_argument(
        "--stats-dir",
        type=str,
        default=STATS_DIR,
        help="Define the directory that results should be saved to.",
    )
    args = parser.parse_args()

    df = defaultdict(lambda: defaultdict(dict))
    data_origin = args.data_origin

    for data_origin in BASE_ORIGINS:
        dh = DataHandler(data_origin)
        feature_names = dh.load_feature_names()
        train_data, test_data, val_data = dh.load_data_splits()
        y_name = dh.load_target_name()
        ood_mappings = dh.load_ood_mappings()
        rel_sizes = {}
        percentage_sigs = {}

        if data_origin == "MIMIC":

            train_ood, test_ood, val_ood = dh.load_newborns()
            all_ood = pd.concat([train_ood, test_ood, val_ood])

            df[data_origin]["Newborn"]["Count"] = len(all_ood)
            df[data_origin]["Newborn"]["Mortality rate"] = round(all_ood["y"].mean(), 3)
Example #4
        nargs="+",
        default={"LOF", "DUE"},
        choices=AVAILABLE_MODELS,
        help="Determine the models which are being used for this experiment.",
    )
    parser.add_argument(
        "--result-dir",
        type=str,
        default=RESULT_DIR,
        help="Define the directory that results should be saved to.",
    )
    args = parser.parse_args()

    # Loading the data
    data_loader = load_data_from_origin(args.data_origin)
    dh = DataHandler(**data_loader)
    feature_names = dh.load_feature_names()
    train_data, test_data, val_data = dh.load_data_splits()
    y_name = dh.load_target_name()

    for ne, scoring_funcs, name in init_models(input_dim=len(feature_names),
                                               selection=args.models,
                                               origin=args.data_origin):
        print(name)
        nov_an = NoveltyAnalyzer(
            ne,
            train_data[feature_names],
            test_data[feature_names],
            val_data[feature_names],
            train_data[y_name],
            test_data[y_name],
    )
Example #5
    parser.add_argument(
        "--result-dir",
        type=str,
        default=RESULT_DIR,
        help="Define the directory that results should be saved to.",
    )
    parser.add_argument(
        "--stats-dir",
        type=str,
        default=STATS_DIR,
        help="Define the directory that results should be saved to.",
    )
    args = parser.parse_args()

    dh_mimic = DataHandler("MIMIC_for_DA")
    feature_names_mimic = dh_mimic.load_feature_names()
    train_mimic, test_mimic, val_mimic = dh_mimic.load_data_splits()
    y_mimic = dh_mimic.load_target_name()

    mimic_data = ood_utils.DomainData(train_mimic, test_mimic, val_mimic,
                                      feature_names_mimic, y_mimic, "MIMIC")

    dh_eicu = DataHandler("eICU_for_DA")
    feature_names_eicu = dh_eicu.load_feature_names()
    train_eicu, test_eicu, val_eicu = dh_eicu.load_data_splits()
    y_eicu = dh_eicu.load_target_name()

    eicu_data = ood_utils.DomainData(train_eicu, test_eicu, val_eicu,
                                     feature_names_eicu, y_eicu, "eICU")
Example #6
        nargs="+",
        default=AVAILABLE_MODELS,
        choices=AVAILABLE_MODELS,
        help="Determine the models which are being used for this experiment.",
    )
    parser.add_argument(
        "--result-dir",
        type=str,
        default=RESULT_DIR,
        help="Define the directory that results should be saved to.",
    )
    args = parser.parse_args()

    # Loading the data
    data_loader = load_data_from_origin(args.data_origin)
    dh = DataHandler(**data_loader)
    feature_names = dh.load_feature_names()

    train_data, test_data, val_data = dh.load_data_splits()
    y_name = dh.load_target_name()

    if args.data_origin in MIMIC_ORIGINS:
        train_newborns, test_newborns, val_newborns = dh.load_other_groups("newborns")

    ood_mappings = dh.load_ood_mappings()

    # loop over the different methods
    for model_info in init_models(
        input_dim=len(feature_names), selection=args.models, origin=args.data_origin
    ):
        print(model_info[2])
Example #7
from src.models.hi_vae import infer_types
from src.utils.datahandler import DataHandler, BASE_ORIGINS, load_data_from_origin

# CONST
FEAT_TYPES_DIR = "../../data/feature_types"

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--feat-types-dir",
        type=str,
        default=FEAT_TYPES_DIR,
        help="Define the directory that results should be saved to.",
    )
    args = parser.parse_args()

    for data_origin in BASE_ORIGINS:
        data_loader = load_data_from_origin(data_origin)
        dh = DataHandler(**data_loader)
        feature_names = dh.load_feature_names()
        train_data, _, _ = dh.load_data_splits()

        feat_types = infer_types(train_data[feature_names].to_numpy(),
                                 feature_names)

        mappings = OrderedDict(zip(feature_names, feat_types))

        with open(f"{args.feat_types_dir}/feat_types_{data_origin}.json",
                  "w") as result_file:
            result_file.write(json.dumps(mappings, indent=4))
Example #8
from src.models.hi_vae import infer_types
from src.utils.datahandler import DataHandler, BASE_ORIGINS

# CONST
FEAT_TYPES_DIR = "../../data/feature_types"


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--feat-types-dir",
        type=str,
        default=FEAT_TYPES_DIR,
        help="Define the directory that results should be saved to.",
    )
    args = parser.parse_args()

    for data_origin in BASE_ORIGINS:
        dh = DataHandler(data_origin)
        feature_names = dh.load_feature_names()
        train_data, _, _ = dh.load_data_splits()

        feat_types = infer_types(train_data[feature_names].to_numpy(), feature_names)

        mappings = OrderedDict(zip(feature_names, feat_types))

        with open(
            f"{args.feat_types_dir}/feat_types_{data_origin}.json", "w"
        ) as result_file:
            result_file.write(json.dumps(mappings, indent=4))
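
A hedged sketch of how the saved mapping could be read back, assuming the feat_types_{data_origin}.json files written above; json.load with object_pairs_hook=OrderedDict preserves the feature order, and the variable names are illustrative only:

import json
from collections import OrderedDict

with open(f"{args.feat_types_dir}/feat_types_{data_origin}.json") as result_file:
    # Re-create the ordered feature name -> feature type mapping
    feat_type_mappings = json.load(result_file, object_pairs_hook=OrderedDict)

feature_names = list(feat_type_mappings.keys())
feat_types = list(feat_type_mappings.values())
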
Example #9
        "--result-dir",
        type=str,
        default=RESULT_DIR,
        help="Define the directory that results should be saved to.",
    )
    parser.add_argument(
        "--stats-dir",
        type=str,
        default=STATS_DIR,
        help="Define the directory that results should be saved to.",
    )
    args = parser.parse_args()

    # MIMIC
    data_loader = load_data_from_origin("MIMIC_for_DA")
    dh_mimic = DataHandler(**data_loader)

    feature_names_mimic = dh_mimic.load_feature_names()
    train_mimic, test_mimic, val_mimic = dh_mimic.load_data_splits()
    y_mimic = dh_mimic.load_target_name()

    mimic_data = ood_utils.DomainData(
        train_mimic, test_mimic, val_mimic, feature_names_mimic, y_mimic, "MIMIC"
    )

    # eICU
    data_loader = load_data_from_origin("eICU_for_DA")
    dh_eicu = DataHandler(**data_loader)

    feature_names_eicu = dh_eicu.load_feature_names()
    train_eicu, test_eicu, val_eicu = dh_eicu.load_data_splits()