type=str, nargs="+", default=AVAILABLE_MODELS, choices=AVAILABLE_MODELS, help="Determine the models which are being used for this experiment.", ) parser.add_argument( "--result-dir", type=str, default=RESULT_DIR, help="Define the directory that results should be saved to.", ) args = parser.parse_args() # Loading the data dh = DataHandler(args.data_origin) feature_names = dh.load_feature_names() train_data, test_data, val_data = dh.load_data_splits() y_name = dh.load_target_name() pipe = pipeline.Pipeline([("scaler", StandardScaler()), ("imputer", SimpleImputer())]) pipe.fit(train_data[feature_names]) X_train = pipe.transform(train_data[feature_names]) X_test = pipe.transform(test_data[feature_names]) X_val = pipe.transform(val_data[feature_names]) uncertainties = defaultdict(list)
def perform_hyperparameter_search(
    data_origin: str, models: List[str], result_dir: str, save_top_n: int = 10
):
    """
    Perform hyperparameter search for a list of models and save the results
    into a directory.

    Parameters
    ----------
    data_origin: str
        Name of data set models should be evaluated on.
    models: List[str]
        List specifying the names of models.
    result_dir: str
        Directory that results should be saved to.
    save_top_n: int
        Save the top n parameter configurations. Default is 10.
    """
    # BUG FIX: the original called load_data_from_origin(args.data_origin),
    # reading a module-level `args` object instead of the function's own
    # `data_origin` parameter — callers passing a different origin were
    # silently ignored (and the function crashed when no `args` existed).
    data_loader = load_data_from_origin(data_origin)
    dh = DataHandler(**data_loader)
    train_data, _, val_data = dh.load_data_splits()
    feat_names = dh.load_feature_names()
    target_name = dh.load_target_name()

    with tqdm(total=get_num_runs(models)) as progress_bar:
        for model_name in models:
            X_train = train_data[feat_names].values
            X_val = val_data[feat_names].values

            # Scale and impute (HI-VAE consumes the raw heterogeneous data
            # itself, so it is exempt from this preprocessing).
            if model_name != "HI-VAE":
                pipe = pipeline.Pipeline(
                    [("scaler", StandardScaler()), ("imputer", SimpleImputer())]
                )
                X_train = pipe.fit_transform(X_train)
                X_val = pipe.transform(X_val)

            y_train, y_val = (
                train_data[target_name].values,
                val_data[target_name].values,
            )

            progress_bar.postfix = f"(model: {model_name})"
            progress_bar.update()
            scores = {}
            model_type = MODEL_CLASSES[model_name]
            sampled_params = sample_hyperparameters(model_name, data_origin)

            for run, param_set in enumerate(sampled_params):
                # Neural models (except deep kernels) need the input size
                # injected into their hyperparameters.
                if model_name in NEURAL_MODELS - DEEP_KERNELS:
                    param_set.update(input_size=len(feat_names))

                model = model_type(**param_set)

                try:
                    # Some model classes expose .fit, others .train — try
                    # the sklearn-style API first and fall back.
                    try:
                        model.fit(X_train, y_train, **TRAIN_PARAMS[model_name])
                    except AttributeError:
                        model.train(X_train, y_train, **TRAIN_PARAMS[model_name])

                    preds = model.predict(X_val)

                    # Neural predictors: use the AUC-ROC score.
                    if model_name in NEURAL_PREDICTORS | DEEP_KERNELS:
                        # When model training goes completely awry
                        if np.isnan(preds).all():
                            score = 0
                        else:
                            preds = preds[:, 1]
                            # Drop NaN predictions so roc_auc_score does not
                            # fail on partially-degenerate runs.
                            valid = ~np.isnan(preds)
                            score = roc_auc_score(
                                y_true=y_val[valid],
                                y_score=preds[valid],
                            )
                            print(f"Score: {score}")

                    # Auto-encoders: use mean negative reconstruction error
                    # (scores are sorted descendingly, so negate the error).
                    elif model_name in AUTOENCODERS:
                        score = -float(preds.mean())

                    # PPCA: just use the (mean) log-likelihood.
                    else:
                        score = preds.mean()

                # In case of NaNs due to bad training parameters.
                except (ValueError, RuntimeError) as e:
                    print(f"There was an error: '{str(e)}', run aborted.")
                    score = -np.inf

                if np.isnan(score):
                    score = -np.inf

                scores[run] = {"score": score, "hyperparameters": param_set}
                progress_bar.update(1)

            # Rank and save results.
            # Do after every experiment in case anything goes wrong.
            sorted_scores = dict(
                list(
                    sorted(
                        scores.items(),
                        key=lambda run: run[1]["score"],
                        reverse=True,
                    )
                )[:save_top_n]
            )
            model_result_dir = f"{result_dir}/{data_origin}/"

            if not os.path.exists(model_result_dir):
                os.makedirs(model_result_dir)

            with open(f"{model_result_dir}/{model_name}.json", "w") as result_file:
                result_file.write(json.dumps(sorted_scores, indent=4, default=str))
# --- CLI arguments (the argparse parser is created above this chunk) ---
parser.add_argument(
    "--data-origin",
    type=str,
    default="MIMIC",
    help="Which data to use",
)
parser.add_argument(
    "--stats-dir",
    type=str,
    default=STATS_DIR,
    help="Define the directory that results should be saved to.",
)
args = parser.parse_args()

# Nested result table: df[data_origin][group][statistic] -> value.
df = defaultdict(lambda: defaultdict(dict))
# NOTE(review): this assignment is immediately shadowed by the loop variable
# below and looks vestigial — confirm whether --data-origin is still used.
data_origin = args.data_origin

for data_origin in BASE_ORIGINS:
    dh = DataHandler(data_origin)
    feature_names = dh.load_feature_names()
    train_data, test_data, val_data = dh.load_data_splits()
    y_name = dh.load_target_name()
    ood_mappings = dh.load_ood_mappings()
    rel_sizes = {}
    percentage_sigs = {}

    # The newborn cohort only exists for MIMIC.
    if data_origin == "MIMIC":
        train_ood, test_ood, val_ood = dh.load_newborns()
        all_ood = pd.concat([train_ood, test_ood, val_ood])
        df[data_origin]["Newborn"]["Count"] = len(all_ood)
        df[data_origin]["Newborn"]["Mortality rate"] = round(all_ood["y"].mean(), 3)
nargs="+", default={"LOF", "DUE"}, choices=AVAILABLE_MODELS, help="Determine the models which are being used for this experiment.", ) parser.add_argument( "--result-dir", type=str, default=RESULT_DIR, help="Define the directory that results should be saved to.", ) args = parser.parse_args() # Loading the data data_loader = load_data_from_origin(args.data_origin) dh = DataHandler(**data_loader) feature_names = dh.load_feature_names() train_data, test_data, val_data = dh.load_data_splits() y_name = dh.load_target_name() for ne, scoring_funcs, name in init_models(input_dim=len(feature_names), selection=args.models, origin=args.data_origin): print(name) nov_an = NoveltyAnalyzer( ne, train_data[feature_names], test_data[feature_names], val_data[feature_names], train_data[y_name], test_data[y_name],
# Closes an `add_argument` call opened above this chunk.
)
parser.add_argument(
    "--result-dir",
    type=str,
    default=RESULT_DIR,
    help="Define the directory that results should be saved to.",
)
parser.add_argument(
    "--stats-dir",
    type=str,
    default=STATS_DIR,
    help="Define the directory that results should be saved to.",
)
args = parser.parse_args()

# MIMIC side of the domain-adaptation pair ("_for_DA" variants share a
# common feature space — presumably; confirm in DataHandler).
dh_mimic = DataHandler("MIMIC_for_DA")
feature_names_mimic = dh_mimic.load_feature_names()
train_mimic, test_mimic, val_mimic = dh_mimic.load_data_splits()
y_mimic = dh_mimic.load_target_name()
mimic_data = ood_utils.DomainData(
    train_mimic, test_mimic, val_mimic, feature_names_mimic, y_mimic, "MIMIC"
)

# eICU side of the pair.
dh_eicu = DataHandler("eICU_for_DA")
feature_names_eicu = dh_eicu.load_feature_names()
train_eicu, test_eicu, val_eicu = dh_eicu.load_data_splits()
y_eicu = dh_eicu.load_target_name()
eicu_data = ood_utils.DomainData(
    train_eicu, test_eicu, val_eicu, feature_names_eicu, y_eicu, "eICU"
)
nargs="+", default=AVAILABLE_MODELS, choices=AVAILABLE_MODELS, help="Determine the models which are being used for this experiment.", ) parser.add_argument( "--result-dir", type=str, default=RESULT_DIR, help="Define the directory that results should be saved to.", ) args = parser.parse_args() # Loading the data data_loader = load_data_from_origin(args.data_origin) dh = DataHandler(**data_loader) feature_names = dh.load_feature_names() train_data, test_data, val_data = dh.load_data_splits() y_name = dh.load_target_name() if args.data_origin in MIMIC_ORIGINS: train_newborns, test_newborns, val_newborns = dh.load_other_groups("newborns") ood_mappings = dh.load_ood_mappings() # loop over the different methods for model_info in init_models( input_dim=len(feature_names), selection=args.models, origin=args.data_origin ): print(model_info[2])
from src.models.hi_vae import infer_types
from src.utils.datahandler import DataHandler, BASE_ORIGINS, load_data_from_origin

# CONST
FEAT_TYPES_DIR = "../../data/feature_types"

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--feat-types-dir",
        type=str,
        default=FEAT_TYPES_DIR,
        help="Define the directory that results should be saved to.",
    )
    args = parser.parse_args()

    # Infer and persist HI-VAE feature types for every base data origin.
    for data_origin in BASE_ORIGINS:
        # BUG FIX: the original called load_data_from_origin(args.data_origin),
        # but this parser defines no --data-origin flag (AttributeError), and
        # even with one it would ignore the loop variable and process a single
        # origin repeatedly. Use the loop variable, matching the sibling script.
        data_loader = load_data_from_origin(data_origin)
        dh = DataHandler(**data_loader)
        feature_names = dh.load_feature_names()
        train_data, _, _ = dh.load_data_splits()

        # Types are inferred from the training split only.
        feat_types = infer_types(train_data[feature_names].to_numpy(), feature_names)
        mappings = OrderedDict(zip(feature_names, feat_types))

        with open(
            f"{args.feat_types_dir}/feat_types_{data_origin}.json", "w"
        ) as result_file:
            result_file.write(json.dumps(mappings, indent=4))
from src.models.hi_vae import infer_types
from src.utils.datahandler import DataHandler, BASE_ORIGINS

# CONST
FEAT_TYPES_DIR = "../../data/feature_types"

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--feat-types-dir",
        type=str,
        default=FEAT_TYPES_DIR,
        help="Define the directory that results should be saved to.",
    )
    args = parser.parse_args()

    # For every base data origin, infer the HI-VAE feature-type mapping from
    # the training split and write it out as a JSON file.
    for data_origin in BASE_ORIGINS:
        handler = DataHandler(data_origin)
        names = handler.load_feature_names()
        train_split, _, _ = handler.load_data_splits()

        inferred = infer_types(train_split[names].to_numpy(), names)
        # Preserve the feature order in the serialized mapping.
        mappings = OrderedDict(zip(names, inferred))

        out_path = f"{args.feat_types_dir}/feat_types_{data_origin}.json"
        with open(out_path, "w") as result_file:
            result_file.write(json.dumps(mappings, indent=4))
"--result-dir", type=str, default=RESULT_DIR, help="Define the directory that results should be saved to.", ) parser.add_argument( "--stats-dir", type=str, default=STATS_DIR, help="Define the directory that results should be saved to.", ) args = parser.parse_args() # MIMIC data_loader = load_data_from_origin("MIMIC_for_DA") dh_mimic = DataHandler(**data_loader) feature_names_mimic = dh_mimic.load_feature_names() train_mimic, test_mimic, val_mimic = dh_mimic.load_data_splits() y_mimic = dh_mimic.load_target_name() mimic_data = ood_utils.DomainData( train_mimic, test_mimic, val_mimic, feature_names_mimic, y_mimic, "MIMIC" ) # eICU data_loader = load_data_from_origin("eICU_for_DA") dh_eicu = DataHandler(**data_loader) feature_names_eicu = dh_eicu.load_feature_names() train_eicu, test_eicu, val_eicu = dh_eicu.load_data_splits()