def run_100(task, ori_df, clf_model, params, args, threshold):
    """Train and evaluate a classifier over 100 random group splits.

    For each seed, split `ori_df` by `hadm_id` groups, fit a TF-IDF
    vectorizer on the train notes, train `clf_model(**params)`, pickle the
    fitted model, and collect targets / predictions / positive-class
    probabilities across all runs into `{task}_preds.pkl`.

    Args:
        task: Task name; used as prefix for label column and output files.
        ori_df: DataFrame with 'hadm_id', 'processed_note' and
            f'{task}_label' columns — presumably one row per note; verify
            against caller.
        clf_model: Classifier class exposing fit/predict_proba (sklearn API).
        params: Keyword arguments for `clf_model`.
        args: Namespace with `start_seed`, `modeldir`, `workdir` (Paths).
        threshold: Probability cutoff for the positive class.
    """
    preds, targs, probs = [], [], []
    seeds = list(range(args.start_seed, args.start_seed + 100))
    for seed in tqdm(seeds, desc=f'{task} Runs'):
        # BUG FIX: the original referenced undefined name `task_df`; the
        # DataFrame parameter of this function is `ori_df`.
        df = set_group_splits(ori_df.copy(), group_col='hadm_id', seed=seed)
        vectorizer = TfidfVectorizer(sublinear_tf=True, ngram_range=(1, 2),
                                     binary=True, max_features=60_000)
        x_train = vectorizer.fit_transform(
            df.loc[df['split'] == 'train']['processed_note'])
        x_test = vectorizer.transform(
            df.loc[df['split'] == 'test']['processed_note'])
        y_train = df.loc[df['split'] == 'train'][f'{task}_label'].to_numpy()
        y_test = df.loc[df['split'] == 'test'][f'{task}_label'].to_numpy()
        targs.append(y_test)

        clf = clf_model(**params)
        clf.fit(x_train, y_train)
        # FIX: use a context manager instead of `pickle.dump(clf, open(...))`,
        # which leaked the file handle.
        with open(args.modeldir / f'{task}_seed_{seed}.pkl', 'wb') as f:
            pickle.dump(clf, f)

        pos_prob = clf.predict_proba(x_test)[:, 1]
        probs.append(pos_prob)
        y_pred = (pos_prob > threshold).astype(np.int64)
        preds.append(y_pred)

    # Three sequential dumps into one file; readers must pickle.load() three
    # times in the same order (targets, predictions, probabilities).
    with open(args.workdir / f'{task}_preds.pkl', 'wb') as f:
        pickle.dump(targs, f)
        pickle.dump(preds, f)
        pickle.dump(probs, f)
def run_100(task, task_df, args, threshold):
    """Train a skorch MLP binary classifier over 100 random group splits.

    For each seed, split `task_df` by `hadm_id` groups, fit a TF-IDF
    vectorizer on the train notes, densify to float32, and train an
    `MLPModule` via `NeuralNetBinaryClassifier` with early stopping, LR
    reduction, and per-seed checkpointing under `args.modeldir`.

    Args:
        task: Task name; used as prefix for label column, checkpoint dirs,
            and progress-bar text.
        task_df: DataFrame with 'hadm_id', 'processed_note' and
            f'{task}_label' columns.
        args: Namespace with `start_seed`, `modeldir`, `hidden_dim`,
            `dropout_p`, `max_epochs`, `lr`, `device`, `wd`, `batch_size`.
        threshold: Decision threshold passed to the skorch classifier.
    """
    # Halve the LR after one epoch without validation-loss improvement.
    # NOTE(review): a single scheduler instance is shared across all seeds;
    # skorch re-initializes callbacks on each fit() so state should not leak
    # between runs — confirm against the skorch version in use.
    reduce_lr = LRScheduler(
        policy='ReduceLROnPlateau',
        mode='min',
        factor=0.5,
        patience=1,
    )
    seeds = list(range(args.start_seed, args.start_seed + 100))
    for seed in tqdm(seeds, desc=f'{task} Runs'):
        # FIX: corrected "Spliting" typo in the log message.
        logger.info(f"Splitting with seed {seed}")
        checkpoint = Checkpoint(dirname=args.modeldir / f'{task}_seed_{seed}')
        df = set_group_splits(task_df.copy(), group_col='hadm_id', seed=seed)

        vectorizer = TfidfVectorizer(sublinear_tf=True, ngram_range=(1, 2),
                                     binary=True, max_features=60_000)
        # torch consumes dense float32 input, so densify after vectorizing.
        x_train = vectorizer.fit_transform(
            df.loc[df['split'] == 'train']['processed_note']).astype(np.float32)
        x_train = np.asarray(x_train.todense())
        vocab_sz = len(vectorizer.vocabulary_)
        y_train = df.loc[df['split'] == 'train'][f'{task}_label'].to_numpy()
        # FIX: dropped the unused x_test/y_test computation — the original
        # also densified the (up to 60k-feature) test matrix every run and
        # then never used it; evaluation is not performed in this function.

        clf = MLPModule(input_units=vocab_sz,
                        output_units=1,
                        hidden_units=args.hidden_dim,
                        num_hidden=1,
                        dropout=args.dropout_p,
                        squeeze_output=True)
        net = NeuralNetBinaryClassifier(
            clf,
            max_epochs=args.max_epochs,
            lr=args.lr,
            device=args.device,
            optimizer=optim.Adam,
            optimizer__weight_decay=args.wd,
            batch_size=args.batch_size,
            verbose=1,
            callbacks=[EarlyStopping, ProgressBar, checkpoint, reduce_lr],
            train_split=CVSplit(cv=0.15, stratified=True),
            iterator_train__shuffle=True,
            threshold=threshold,
        )
        # Disable the default validation-accuracy callback; loss is the
        # monitored metric for early stopping / LR scheduling.
        net.set_params(callbacks__valid_acc=None)
        # BCE loss requires float targets.
        net.fit(x_train, y_train.astype(np.float32))
if len(sys.argv) != 2: logger.error(f"Usage: {sys.argv[0]} task_name (ia|ps)") sys.exit(1) task = sys.argv[1] ori_df = pd.read_csv(args.dataset_csv, usecols=args.cols, parse_dates=args.dates) if task == 'ia': logger.info(f"Running hyperparameter search for Imminent Admission Prediction task") task_df = ori_df.loc[(ori_df['imminent_adm_label'] != -1)][args.imminent_adm_cols].reset_index(drop=True) label = 'imminent_adm_label' if task == 'ps': logger.info(f"Running hyperparameter search for Prolonged Stay Prediction task ") task_df = ori_df[args.prolonged_stay_cols].copy() label = 'prolonged_stay_label' df = set_group_splits(task_df.copy(), group_col='hadm_id', seed=42) vectorizer = TfidfVectorizer(min_df=args.min_freq, analyzer=str.split, sublinear_tf=True, ngram_range=(2,2)) x_train = vectorizer.fit_transform(df.loc[(df['split'] == 'train')]['processed_note']) x_test = vectorizer.transform(df.loc[(df['split'] == 'test')]['processed_note']) y_train = df.loc[(df['split'] == 'train')][label].to_numpy() y_test = df.loc[(df['split'] == 'test')][label].to_numpy() clf_params = { 'solver': 'liblinear', 'multi_class': 'ovr', } clf = LogisticRegression(**clf_params) param_space = {