id_cols, time_col, sequence_feature_cols)

# impute missing values in the test features:
# first forward-fill within each stay, then fall back to the test-set mean,
# and finally to the train-set mean for columns still entirely missing in test
curr_sequence_features_df = curr_sequence_features_df.groupby(id_cols).apply(
    lambda x: x.ffill()).copy()
for feature_col in sequence_feature_cols:
    curr_sequence_features_df[feature_col] = curr_sequence_features_df[
        feature_col].fillna(curr_sequence_features_df[feature_col].mean())
for feature_col in sequence_feature_cols:
    curr_sequence_features_df[feature_col] = curr_sequence_features_df[
        feature_col].fillna(x_train_df[feature_col].mean())

# load test data with TidySequentialDataCSVLoader
test_vitals = TidySequentialDataCSVLoader(
    x_csv_path=curr_sequence_features_df,
    y_csv_path=chosen_stay_outcomes_df,
    x_col_names=feature_cols_with_mask_features,
    idx_col_names=id_cols,
    y_col_name=args.outcome_column_name,
    y_label_type='per_sequence')

# predict on test data, rescaling each feature with the scaling saved at train time
x_test, y_test = test_vitals.get_batch_data(batch_id=0)
per_feature_scaling = np.load(
    os.path.join(args.rnn_models_dir, 'per_feature_scaling.npy'))
for f in range(x_test.shape[2]):
    x_test[:, :, f] = x_test[:, :, f] / per_feature_scaling[f]

# count features missing over time via the mask columns
mask_feature_cols = [i for i in feature_cols_with_mask_features if 'mask' in i]
total_missing_features_over_time[q] = (
    curr_sequence_features_df[mask_feature_cols] == 0).sum().sum()
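# --- Illustration (not part of the pipeline above): a minimal, self-contained
# sketch of the same two-stage imputation. Column names ('subject_id',
# 'heart_rate') and the train mean are made up for the example.
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    'subject_id': [1, 1, 1, 2, 2],
    'heart_rate': [80.0, np.nan, 82.0, np.nan, np.nan],
})
train_mean = 75.0  # stand-in for x_train_df['heart_rate'].mean()

# forward-fill within each subject's sequence
toy['heart_rate'] = toy.groupby('subject_id')['heart_rate'].ffill()
# subject 2 has no earlier observation, so fall back to the train mean
toy['heart_rate'] = toy['heart_rate'].fillna(train_mean)
print(toy)  # index 1 -> 80.0 (forward-filled); indices 3-4 -> 75.0 (train mean)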
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch RNN with variable-length numeric sequences wrapper')
    parser.add_argument('--outcome_col_name', type=str, required=True)
    parser.add_argument('--train_csv_files', type=str, required=True)
    parser.add_argument('--test_csv_files', type=str, required=True)
    parser.add_argument('--data_dict_files', type=str, required=True)
    parser.add_argument('--batch_size', type=int, default=1024,
                        help='Number of sequences per minibatch')
    parser.add_argument('--epochs', type=int, default=50,
                        help='Number of epochs')
    parser.add_argument('--hidden_units', type=int, default=32,
                        help='Number of hidden units')
    parser.add_argument('--hidden_layers', type=int, default=1,
                        help='Number of hidden layers')
    parser.add_argument('--lr', type=float, default=0.0005,
                        help='Learning rate for the optimizer')
    parser.add_argument('--dropout', type=float, default=0,
                        help='dropout probability for the RNN module')
    parser.add_argument('--weight_decay', type=float, default=0.0001,
                        help='weight decay for the optimizer')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--validation_size', type=float, default=0.15,
                        help='validation split size')
    parser.add_argument('--is_data_simulated', type=bool, default=False,
                        help='boolean to check if data is simulated or from mimic')
    parser.add_argument('--simulated_data_dir', type=str,
                        default='simulated_data/2-state/',
                        help='dir in which simulated data is saved. '
                             'Must be provided if is_data_simulated = True')
    parser.add_argument('--output_dir', type=str, default=None,
                        help='directory where trained model and loss curves over epochs are saved')
    parser.add_argument('--output_filename_prefix', type=str, default=None,
                        help='prefix for the training history jsons and trained classifier')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    device = 'cpu'

    x_train_csv_filename, y_train_csv_filename = args.train_csv_files.split(',')
    x_test_csv_filename, y_test_csv_filename = args.test_csv_files.split(',')
    x_dict, y_dict = args.data_dict_files.split(',')
    x_data_dict = load_data_dict_json(x_dict)

    # get the id and feature columns
    id_cols = parse_id_cols(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)

    # extract data
    train_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_train_csv_filename,
        y_csv_path=y_train_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_sequence')

    test_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_test_csv_filename,
        y_csv_path=y_test_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_sequence')

    X_train, y_train = train_vitals.get_batch_data(batch_id=0)
    X_test, y_test = test_vitals.get_batch_data(batch_id=0)
    _, T, F = X_train.shape
    print('number of time points : %s\nnumber of features : %s\n' % (T, F))

    # set class weights as 1/(number of samples in class) for each class
    # to handle class imbalance
    class_weights = torch.tensor(
        [1 / (y_train == 0).sum(), 1 / (y_train == 1).sum()]).double()

    # scale features
    # X_train = standard_scaler_3d(X_train)
    # X_test = standard_scaler_3d(X_test)

    # callback to compute gradient norm
    compute_grad_norm = ComputeGradientNorm(norm_type=2)

    # LSTM
    if args.output_filename_prefix is None:
        output_filename_prefix = (
            'hiddens=%s-layers=%s-lr=%s-dropout=%s-weight_decay=%s' %
            (args.hidden_units, args.hidden_layers, args.lr, args.dropout,
             args.weight_decay))
    else:
        output_filename_prefix = args.output_filename_prefix

    print('RNN parameters : ' + output_filename_prefix)

    # from IPython import embed; embed()
    rnn = RNNBinaryClassifier(
        max_epochs=args.epochs,
        batch_size=args.batch_size,
        device=device,
        lr=args.lr,
        callbacks=[
            EpochScoring('roc_auc', lower_is_better=False, on_train=True,
                         name='aucroc_score_train'),
            EpochScoring('roc_auc', lower_is_better=False, on_train=False,
                         name='aucroc_score_valid'),
            EarlyStopping(monitor='aucroc_score_valid', patience=20,
                          threshold=0.002, threshold_mode='rel',
                          lower_is_better=False),
            LRScheduler(policy=ReduceLROnPlateau, mode='max',
                        monitor='aucroc_score_valid', patience=10),
            compute_grad_norm,
            GradientNormClipping(gradient_clip_value=0.3,
                                 gradient_clip_norm_type=2),
            Checkpoint(monitor='aucroc_score_valid',
                       f_history=os.path.join(
                           args.output_dir, output_filename_prefix + '.json')),
            TrainEndCheckpoint(dirname=args.output_dir,
                               fn_prefix=output_filename_prefix),
        ],
        criterion=torch.nn.CrossEntropyLoss,
        criterion__weight=class_weights,
        train_split=skorch.dataset.CVSplit(args.validation_size),
        module__rnn_type='LSTM',
        module__n_layers=args.hidden_layers,
        module__n_hiddens=args.hidden_units,
        module__n_inputs=X_train.shape[-1],
        module__dropout_proba=args.dropout,
        optimizer=torch.optim.Adam,
        optimizer__weight_decay=args.weight_decay)

    clf = rnn.fit(X_train, y_train)

    y_pred_proba = clf.predict_proba(X_train)
    y_pred_proba_neg, y_pred_proba_pos = zip(*y_pred_proba)
    auroc_train_final = roc_auc_score(y_train, y_pred_proba_pos)
    print('AUROC with LSTM (Train) : %.2f' % auroc_train_final)

    y_pred_proba = clf.predict_proba(X_test)
    y_pred_proba_neg, y_pred_proba_pos = zip(*y_pred_proba)
    auroc_test_final = roc_auc_score(y_test, y_pred_proba_pos)
    print('AUROC with LSTM (Test) : %.2f' % auroc_test_final)
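# --- Illustration (not part of the script above): how the inverse-frequency
# class weights behave with CrossEntropyLoss, on toy labels. weight[c]
# multiplies the loss of every class-c example, so the rare class counts more.
import torch

y = torch.tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 1])  # 8 negatives, 2 positives
w = torch.tensor([1.0 / (y == 0).sum().item(), 1.0 / (y == 1).sum().item()])
print(w)  # tensor([0.1250, 0.5000])

loss_fn = torch.nn.CrossEntropyLoss(weight=w)
logits = torch.zeros(10, 2)  # uninformative predictions
print(loss_fn(logits, y))  # weighted mean NLL, ~0.6931 here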
def main():
    parser = argparse.ArgumentParser(
        description='Keras CNN with variable-length numeric sequences wrapper')
    parser.add_argument('--outcome_col_name', type=str, required=True)
    parser.add_argument('--train_csv_files', type=str, required=True)
    parser.add_argument('--test_csv_files', type=str, required=True)
    parser.add_argument('--data_dict_files', type=str, required=True)
    parser.add_argument('--batch_size', type=int, default=1024,
                        help='Number of sequences per minibatch')
    parser.add_argument('--epochs', type=int, default=50,
                        help='Number of epochs')
    parser.add_argument('--n_filters', type=int, default=32,
                        help='Number of filters')
    parser.add_argument('--kernel_size', type=int, default=1,
                        help='size of each kernel')
    parser.add_argument('--n_conv_layers', type=int, default=1,
                        help='number of convolutional layers')
    parser.add_argument('--stride', type=int, default=1, help='stride')
    parser.add_argument('--pool_size', type=int, default=4,
                        help='max pool size')
    parser.add_argument('--dense_units', type=int, default=128,
                        help='number of units in fully connected layer')
    parser.add_argument('--lr', type=float, default=0.0005,
                        help='Learning rate for the optimizer')
    parser.add_argument('--dropout', type=float, default=0,
                        help='dropout probability')
    parser.add_argument('--weight_decay', type=float, default=0.0001,
                        help='weight decay for optimizer')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--validation_size', type=float, default=0.15,
                        help='validation split size')
    parser.add_argument('--output_dir', type=str, default=None,
                        help='directory where trained model and loss curves over epochs are saved')
    parser.add_argument('--output_filename_prefix', type=str, default=None,
                        help='prefix for the training history jsons and trained classifier')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    device = 'cpu'

    x_train_csv_filename, y_train_csv_filename = args.train_csv_files.split(',')
    x_test_csv_filename, y_test_csv_filename = args.test_csv_files.split(',')
    x_dict, y_dict = args.data_dict_files.split(',')
    x_data_dict = load_data_dict_json(x_dict)

    # get the id and feature columns
    id_cols = parse_id_cols(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)

    # extract data
    train_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_train_csv_filename,
        y_csv_path=y_train_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_sequence')

    test_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_test_csv_filename,
        y_csv_path=y_test_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_sequence')

    X_train, y_train = train_vitals.get_batch_data(batch_id=0)
    X_test, y_test = test_vitals.get_batch_data(batch_id=0)
    N, T, F = X_train.shape

    # add class weights; Keras' model.fit expects a {class_index: weight} dict
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced', classes=np.unique(y_train), y=y_train)
    class_weights = dict(zip(range(len(class_weights)), class_weights))

    # convert y_train to categorical
    y_train = keras.utils.to_categorical(y_train)
    y_test = keras.utils.to_categorical(y_test)

    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=args.validation_size, random_state=213)

    print('number of time points : %s\nnumber of features : %s\n' % (T, F))
    set_random_seed(args.seed)

    model = keras.Sequential()
    for i in range(args.n_conv_layers):
        model.add(keras.layers.Conv1D(filters=args.n_filters,
                                      kernel_size=args.kernel_size,
                                      activation='relu',
                                      strides=args.stride))
    model.add(keras.layers.Dropout(args.dropout))
    model.add(keras.layers.MaxPooling1D(pool_size=args.pool_size))
    model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(args.dense_units, activation='relu'))
    model.add(keras.layers.Dense(2, activation='softmax'))

    # set optimizer
    opt = keras.optimizers.Adam(learning_rate=args.lr)
    model.compile(loss='categorical_crossentropy', optimizer=opt,
                  metrics=['accuracy', keras.metrics.AUC()])

    # set early stopping
    early_stopping = EarlyStopping(monitor='val_auc', patience=20,
                                   mode='max', verbose=1)

    model.fit(X_train, y_train, epochs=args.epochs,
              validation_data=(X_val, y_val), callbacks=[early_stopping],
              class_weight=class_weights, batch_size=args.batch_size)

    y_score_val = model.predict(X_val)
    val_auc = roc_auc_score(y_val, y_score_val)
    print('AUC on val set : %.4f' % val_auc)

    y_score_test = model.predict(X_test)
    test_auc = roc_auc_score(y_test, y_score_test)
    print('AUC on test set : %.4f' % test_auc)

    # save the model history
    training_hist_df = pd.DataFrame(model.history.history)
    training_hist_df.loc[:, 'test_auc'] = test_auc
    training_hist_csv = os.path.join(args.output_dir,
                                     args.output_filename_prefix + '.csv')
    training_hist_df.to_csv(training_hist_csv, index=False)

    # save the model
    model_file = os.path.join(args.output_dir,
                              args.output_filename_prefix + '.model')
    model.save(model_file)
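# --- Illustration (not part of the script above): compute_class_weight
# returns an array ordered by class label, but Keras' model.fit expects a
# {class_index: weight} dict; the zip below is the conversion used above.
import numpy as np
from sklearn.utils import class_weight

y = np.array([0, 0, 0, 0, 0, 0, 1, 1])
weights = class_weight.compute_class_weight(class_weight='balanced',
                                            classes=np.unique(y), y=y)
class_weight_dict = dict(zip(range(len(weights)), weights))
print(class_weight_dict)  # {0: 0.666..., 1: 2.0}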
                    type=int, default=10, metavar='N',
                    help='number of epochs')
parser.add_argument('--seed', type=int, default=1111, help='random seed')
parser.add_argument('--save', type=str, default='model.pt',
                    help='path to save the final model')
args = parser.parse_args()

torch.manual_seed(args.seed)
device = 'cpu'

dataset = TidySequentialDataCSVLoader('my_dataset.csv')
X, y = dataset.get_batch_data(batch_id=0)

rnn = RNNBinaryClassifier(
    max_epochs=args.epochs,
    batch_size=args.batch_size,
    device=device,
    callbacks=[
        # skorch.callbacks.Checkpoint(),
        skorch.callbacks.ProgressBar(),
    ],
    module__rnn_type='ELMAN+relu',
    module__n_inputs=X.shape[-1],
    module__n_hiddens=10,
    module__n_layers=1,
    optimizer=torch.optim.SGD,
                    metavar='N', help='number of epochs')
parser.add_argument('--seed', type=int, default=1111, help='random seed')
parser.add_argument('--save', type=str, default='model.pt',
                    help='path to save the final model')
args = parser.parse_args()

torch.manual_seed(args.seed)
device = 'cpu'

dataset = TidySequentialDataCSVLoader(
    per_tstep_csv_path='eeg_rnn_data/eeg_train_balanced.csv',
    idx_col_names='chunk_id',
    x_col_names=['eeg_signal'],
    y_col_name='seizure_binary_label',
    y_label_type='per_tstep')
X, y = dataset.get_batch_data(batch_id=0)

rnn = RNNBinaryClassifier(
    max_epochs=args.epochs,
    batch_size=args.batch_size,
    device=device,
    callbacks=[
        # skorch.callbacks.Checkpoint(),
        skorch.callbacks.ProgressBar(),
    ],
    module__rnn_type='ELMAN+relu',
    # module__rnn_type='LSTM',
    module__n_inputs=X.shape[-1],
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch RNN with variable-length numeric sequences wrapper')
    parser.add_argument('--train_vitals_csv', type=str,
                        help='Location of vitals data for training')
    parser.add_argument('--test_vitals_csv', type=str,
                        help='Location of vitals data for testing')
    parser.add_argument('--metadata_csv', type=str,
                        help='Location of metadata for testing and training')
    parser.add_argument('--data_dict', type=str)
    parser.add_argument('--batch_size', type=int, default=256,
                        help='Number of sequences per minibatch')
    parser.add_argument('--epochs', type=int, default=100000,
                        help='Number of epochs')
    parser.add_argument('--hidden_units', type=int, default=10,
                        help='Number of hidden units')
    parser.add_argument('--lr', type=float, default=1e-2,
                        help='Learning rate for the optimizer')
    parser.add_argument('--dropout', type=float, default=0.3,
                        help='dropout probability')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--save', type=str, default='RNNmodel.pt',
                        help='path to save the final model')
    parser.add_argument('--report_dir', type=str, default='html',
                        help='dir in which to save results report')
    parser.add_argument('--simulated_data_dir', type=str,
                        default='simulated_data/2-state/',
                        help='dir in which simulated data is saved')
    parser.add_argument('--is_data_simulated', type=bool, default=False,
                        help='boolean to check if data is simulated or from mimic')
    parser.add_argument('--output_filename_prefix', type=str,
                        default='current_config',
                        help='file to save the loss and validation over epochs')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    device = 'cpu'

    # hyperparameter space
    # learning_rate = [1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1]
    # hyperparameters = dict(lr=learning_rate)

    # extract data
    if not args.is_data_simulated:
        # ---------------- Loaded from TidySequentialDataCSVLoader ---------------- #
        train_vitals = TidySequentialDataCSVLoader(
            per_tstep_csv_path=args.train_vitals_csv,
            per_seq_csv_path=args.metadata_csv,
            idx_col_names=['subject_id', 'episode_id'],
            x_col_names='__all__',
            y_col_name='inhospital_mortality',
            y_label_type='per_tstep')
        test_vitals = TidySequentialDataCSVLoader(
            per_tstep_csv_path=args.test_vitals_csv,
            per_seq_csv_path=args.metadata_csv,
            idx_col_names=['subject_id', 'episode_id'],
            x_col_names='__all__',
            y_col_name='inhospital_mortality',
            y_label_type='per_tstep')

    X_train_with_time_appended, y_train = train_vitals.get_batch_data(batch_id=0)
    X_test_with_time_appended, y_test = test_vitals.get_batch_data(batch_id=0)
    _, T, F = X_train_with_time_appended.shape
    if T > 1:
        X_train = X_train_with_time_appended[:, :, 1:]  # removing hours column
        X_test = X_test_with_time_appended[:, :, 1:]  # removing hours column
    else:  # account for collapsed features across time
        X_train = X_train_with_time_appended
        X_test = X_test_with_time_appended

    # set class weights as (1-Beta)/(1-Beta^(number of training samples in class))
    # beta = (len(y_train)-1)/len(y_train)
    # class_weights = torch.tensor(np.asarray([(1-beta)/(1-beta**((y_train==0).sum())), (1-beta)/(1-beta**((y_train==1).sum()))]))

    # set class weights as 1/(number of samples in class) for each class
    # to handle class imbalance
    class_weights = torch.tensor(
        [1 / (y_train == 0).sum(), 1 / (y_train == 1).sum()]).double()

    # define an AUC scorer function and pass it as a skorch callback
    # to track training and validation AUROC
    roc_auc_scorer = make_scorer(roc_auc_score, greater_is_better=True,
                                 needs_threshold=True)
    # use only last time step as feature for LR debugging
    # X_train = X_train[:,-1,:][:,np.newaxis,:]
    # X_test = X_test[:,-1,:][:,np.newaxis,:]

    # use time steps * features as vectorized feature into RNN for LR debugging
    # X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]*X_train.shape[2]))
    # X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]*X_test.shape[2]))

    # ---------------------------------------------------------------------#
    # Pseudo LSTM (hand engineered features through LSTM, collapsed across time)
    # ---------------------------------------------------------------------#
    # instantiate RNN
    rnn = RNNBinaryClassifier(
        max_epochs=args.epochs,
        batch_size=args.batch_size,
        device=device,
        criterion=torch.nn.CrossEntropyLoss,
        criterion__weight=class_weights,
        train_split=skorch.dataset.CVSplit(4),
        callbacks=[
            skorch.callbacks.GradientNormClipping(gradient_clip_value=0.4,
                                                  gradient_clip_norm_type=2),
            skorch.callbacks.EpochScoring(roc_auc_scorer, lower_is_better=False,
                                          on_train=True,
                                          name='aucroc_score_train'),
            skorch.callbacks.EpochScoring(roc_auc_scorer, lower_is_better=False,
                                          on_train=False,
                                          name='aucroc_score_valid'),
            ComputeGradientNorm(
                norm_type=2,
                f_history=args.report_dir +
                '/%s_running_rnn_classifer_gradient_norm_history.csv' %
                args.output_filename_prefix),
            # LSTMtoLogReg(),  # transformation to log reg for debugging
            skorch.callbacks.EarlyStopping(monitor='aucroc_score_valid',
                                           patience=1000, threshold=1e-10,
                                           threshold_mode='rel',
                                           lower_is_better=False),
            skorch.callbacks.Checkpoint(
                monitor='train_loss',
                f_history=args.report_dir +
                '/%s_running_rnn_classifer_history.json' %
                args.output_filename_prefix),
            # skorch.callbacks.Checkpoint(monitor='aucroc_score_valid', f_pickle=args.report_dir + '/%s_running_rnn_classifer_model' % args.output_filename_prefix),
            skorch.callbacks.PrintLog(floatfmt='.2f')
        ],
        module__rnn_type='LSTM',
        module__n_inputs=X_train.shape[-1],
        module__n_hiddens=args.hidden_units,
        module__n_layers=1,
        # module__dropout_proba_non_recurrent=args.dropout,
        # module__dropout_proba=args.dropout,
        optimizer=torch.optim.SGD,
        optimizer__weight_decay=1e-2,
        # optimizer__momentum=0.9,
        # optimizer=torch.optim.Adam,
        lr=args.lr)

    # from IPython import embed; embed()

    # scale input features
    X_train = standard_scaler_3d(X_train)
    X_test = standard_scaler_3d(X_test)

    rnn.fit(X_train, y_train)

    # get the training history
    epochs, train_loss, validation_loss, aucroc_score_train, aucroc_score_valid = \
        get_loss_plots_from_training_history(rnn.history)

    # plot the validation and training error plots and save
    f = plt.figure()
    plt.plot(epochs, train_loss, 'r-.', label='Train Loss')
    plt.plot(epochs, validation_loss, 'b-.', label='Validation Loss')
    plt.plot(epochs, aucroc_score_train, 'g-.', label='AUCROC score (Train)')
    plt.plot(epochs, aucroc_score_valid, 'm-.', label='AUCROC score (Valid)')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.title('Training Performance (learning rate : %s, hidden units : %s)' %
              (str(args.lr), str(args.hidden_units)))
    f.savefig(args.report_dir +
              '/%s_training_performance_plots.png' % args.output_filename_prefix)
    plt.close()

    # save the training and validation loss in a csv
    train_perf_df = pd.DataFrame(
        data=np.stack([epochs, train_loss, validation_loss]).T,
        columns=['epochs', 'train_loss', 'validation_loss'])
    train_perf_df.to_csv(args.report_dir +
                         '/%s_perf_metrics.csv' % args.output_filename_prefix)

    # save classifier history to later evaluate early stopping for this model
    dump(rnn, args.report_dir +
         '/%s_rnn_classifer.pkl' % args.output_filename_prefix)
    y_pred_proba = rnn.predict_proba(X_test)
    y_pred = convert_proba_to_binary(y_pred_proba)
    y_pred_proba_neg, y_pred_proba_pos = zip(*y_pred_proba)
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba_pos)
    roc_area = roc_auc_score(y_test, y_pred_proba_pos)

    # from IPython import embed; embed()

    # Brief Summary
    # print('Best lr:', rnn.best_estimator_.get_params()['lr'])
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('Balanced Accuracy:', balanced_accuracy_score(y_test, y_pred))
    print('Log Loss:', log_loss(y_test, y_pred_proba))
    print('AUC ROC:', roc_area)

    conf_matrix = confusion_matrix(y_test, y_pred)
    true_neg = conf_matrix[0][0]
    true_pos = conf_matrix[1][1]
    false_neg = conf_matrix[1][0]
    false_pos = conf_matrix[0][1]
    print('True Positive Rate:', float(true_pos) / (true_pos + false_neg))
    print('True Negative Rate:', float(true_neg) / (true_neg + false_pos))
    print('Positive Predictive Value:',
          float(true_pos) / (true_pos + false_pos))
    # NPV = TN / (TN + FN)
    print('Negative Predictive Value:',
          float(true_neg) / (true_neg + false_neg))

    create_html_report(args.report_dir, args.output_filename_prefix, y_test,
                       y_pred, y_pred_proba, args.lr)
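# --- Illustration (not part of the script above): reading the four rates off
# sklearn's 2x2 confusion matrix, which is ordered [[TN, FP], [FN, TP]].
# Note that NPV divides by TN + FN.
import numpy as np
from sklearn.metrics import confusion_matrix

y_true = np.array([0, 0, 0, 1, 1, 1])
y_hat = np.array([0, 1, 0, 1, 1, 0])
tn, fp, fn, tp = confusion_matrix(y_true, y_hat).ravel()
print('TPR:', tp / (tp + fn))  # sensitivity / recall
print('TNR:', tn / (tn + fp))  # specificity
print('PPV:', tp / (tp + fp))  # precision
print('NPV:', tn / (tn + fn))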
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch RNN with variable-length numeric sequences wrapper')
    parser.add_argument('--outcome_col_name', type=str, required=True)
    parser.add_argument('--train_csv_files', type=str, required=True)
    parser.add_argument('--valid_csv_files', type=str, required=True)
    parser.add_argument('--test_csv_files', type=str, required=True)
    parser.add_argument('--data_dict_files', type=str, required=True)
    parser.add_argument('--batch_size', type=int, default=1024,
                        help='Number of sequences per minibatch')
    parser.add_argument('--epochs', type=int, default=50,
                        help='Number of epochs')
    parser.add_argument('--hidden_units', type=int, default=32,
                        help='Number of hidden units')
    parser.add_argument('--hidden_layers', type=int, default=1,
                        help='Number of hidden layers')
    parser.add_argument('--lr', type=float, default=0.0005,
                        help='Learning rate for the optimizer')
    parser.add_argument('--dropout', type=float, default=0,
                        help='dropout probability')
    parser.add_argument('--weight_decay', type=float, default=0.0001,
                        help='weight decay for optimizer')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--validation_size', type=float, default=0.15,
                        help='validation split size')
    parser.add_argument('--is_data_simulated', type=bool, default=False,
                        help='boolean to check if data is simulated or from mimic')
    parser.add_argument('--output_dir', type=str, default=None,
                        help='directory where trained model and loss curves over epochs are saved')
    parser.add_argument('--output_filename_prefix', type=str, default=None,
                        help='prefix for the training history jsons and trained classifier')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    device = 'cpu'

    x_train_csv_filename, y_train_csv_filename = args.train_csv_files.split(',')
    x_valid_csv_filename, y_valid_csv_filename = args.valid_csv_files.split(',')
    x_test_csv_filename, y_test_csv_filename = args.test_csv_files.split(',')
    x_dict, y_dict = args.data_dict_files.split(',')
    x_data_dict = load_data_dict_json(x_dict)

    # get the id and feature columns
    id_cols = parse_id_cols(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)

    # extract data
    train_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_train_csv_filename,
        y_csv_path=y_train_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_tstep')

    valid_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_valid_csv_filename,
        y_csv_path=y_valid_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_tstep')

    test_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_test_csv_filename,
        y_csv_path=y_test_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_tstep')

    X_train, y_train = train_vitals.get_batch_data(batch_id=0)
    X_valid, y_valid = valid_vitals.get_batch_data(batch_id=0)
    X_test, y_test = test_vitals.get_batch_data(batch_id=0)
    N, T, F = X_train.shape

    # from IPython import embed; embed()
    # X_train = (X_train - np.min(X_train))/(np.max(X_train)-np.min(X_train))
    # X_valid = (X_valid - np.min(X_train))/(np.max(X_train)-np.min(X_train))
    # X_test = (X_test - np.min(X_train))/(np.max(X_train)-np.min(X_train))

    valid_ds = Dataset(X_valid, y_valid)
    print('number of time points : %s\nnumber of features : %s\n' % (T, F))

    # set class weights as 1/(number of samples in class) for each class
    # to handle class imbalance
    class_weights = torch.tensor(
        [1 / (y_train == 0).sum(), 1 / (y_train == 1).sum()]).float()
    print('Number of training sequences : %s' % N)
    print('Number of test sequences : %s' % X_test.shape[0])
    print('Ratio positive in train : %.2f' %
          ((y_train == 1).sum() / len(y_train)))
    print('Ratio positive in test : %.2f' % ((y_test == 1).sum() / len(y_test)))

    # callback to compute gradient norm
    compute_grad_norm = ComputeGradientNorm(norm_type=2)

    # GRU
    if args.output_filename_prefix is None:
        output_filename_prefix = (
            'hiddens=%s-layers=%s-lr=%s-dropout=%s-weight_decay=%s' %
            (args.hidden_units, args.hidden_layers, args.lr, args.dropout,
             args.weight_decay))
    else:
        output_filename_prefix = args.output_filename_prefix

    print('RNN parameters : ' + output_filename_prefix)

    loss_early_stopping_cp = EarlyStopping(monitor='valid_loss', patience=15,
                                           threshold=0.002,
                                           threshold_mode='rel',
                                           lower_is_better=True)

    rnn = RNNPerTStepBinaryClassifier(
        max_epochs=250,
        batch_size=args.batch_size,
        device=device,
        lr=args.lr,
        callbacks=[
            EpochScoring(calc_auprc, lower_is_better=False, on_train=True,
                         name='auprc_train'),
            EpochScoring(calc_auprc, lower_is_better=False, on_train=False,
                         name='auprc_valid'),
            EpochScoring(calc_auroc, lower_is_better=False, on_train=True,
                         name='auroc_train'),
            EpochScoring(calc_auroc, lower_is_better=False, on_train=False,
                         name='auroc_valid'),
            # EpochScoring(calc_precision, lower_is_better=False, on_train=True, name='precision_train'),
            # EpochScoring(calc_precision, lower_is_better=False, on_train=False, name='precision_valid'),
            # EpochScoring(calc_recall, lower_is_better=False, on_train=True, name='recall_train'),
            # EpochScoring(calc_recall, lower_is_better=False, on_train=False, name='recall_valid'),
            # EarlyStopping(monitor='auprc_valid', patience=5, threshold=0.002, threshold_mode='rel', lower_is_better=False),
            # LRScheduler(policy=ReduceLROnPlateau, mode='max', monitor='aucroc_score_valid', patience=10),
            # compute_grad_norm,
            # GradientNormClipping(gradient_clip_value=0.5, gradient_clip_norm_type=2),
            loss_early_stopping_cp,
            Checkpoint(monitor='auprc_valid',
                       f_history=os.path.join(
                           args.output_dir, output_filename_prefix + '.json')),
            TrainEndCheckpoint(dirname=args.output_dir,
                               fn_prefix=output_filename_prefix),
        ],
        # criterion=torch.nn.CrossEntropyLoss,
        # criterion__weight=class_weights,
        train_split=predefined_split(valid_ds),
        module__rnn_type='GRU',
        module__n_layers=args.hidden_layers,
        module__n_hiddens=args.hidden_units,
        module__n_inputs=X_train.shape[-1],
        module__dropout_proba=args.dropout,
        optimizer=torch.optim.Adam,
        optimizer__weight_decay=args.weight_decay)

    clf = rnn.fit(X_train, y_train)

    # get threshold with max recall at fixed precision
    fixed_precision = 0.1

    # get predicted probabilities for y=1 on the validation set,
    # keeping only time steps that are not all-NaN padding
    keep_inds_va = torch.logical_not(
        torch.all(torch.isnan(torch.FloatTensor(X_valid)), dim=-1))
    y_va_pred_proba = clf.predict_proba(
        X_valid)[keep_inds_va][:, 1].detach().numpy()

    unique_probas = np.unique(y_va_pred_proba)
    thr_grid_G = np.linspace(np.percentile(unique_probas, 1),
                             max(unique_probas), 100)
    precision_scores_G = np.zeros(thr_grid_G.size)
    recall_scores_G = np.zeros(thr_grid_G.size)
    for gg, thr in enumerate(thr_grid_G):
        # reuse the validation probabilities instead of re-running predict_proba
        curr_thr_y_preds = y_va_pred_proba >= thr
        precision_scores_G[gg] = precision_score(y_valid[keep_inds_va],
                                                 curr_thr_y_preds)
        recall_scores_G[gg] = recall_score(y_valid[keep_inds_va],
                                           curr_thr_y_preds)

    keep_inds = precision_scores_G >= fixed_precision
    if keep_inds.sum() > 0:
        print('Choosing threshold with precision >= %.3f' % fixed_precision)
    else:
        # no threshold reaches the target precision; fall back to the
        # 99th percentile of achieved precisions
        fixed_precision_old = fixed_precision
        fixed_precision = np.percentile(precision_scores_G, 99)
        keep_inds = precision_scores_G >= fixed_precision
        print('Could not find threshold with precision >= %.3f\n'
              'Choosing threshold to maximize recall at precision %.3f' %
              (fixed_precision_old, fixed_precision))

    thr_grid_G = thr_grid_G[keep_inds]
    precision_scores_G = precision_scores_G[keep_inds]
    recall_scores_G = recall_scores_G[keep_inds]
    thr_perf_df = pd.DataFrame(
        np.vstack([
            thr_grid_G[np.newaxis, :], precision_scores_G[np.newaxis, :],
            recall_scores_G[np.newaxis, :]
        ]).T,
        columns=['thr', 'precision_score', 'recall_score'])
    print(thr_perf_df)

    best_ind = np.argmax(recall_scores_G)
    best_thr = thr_grid_G[best_ind]
    print('chosen threshold : %.3f' % best_thr)

    splits = ['train', 'valid', 'test']
    auroc_per_split = np.zeros(len(splits))
    auprc_per_split = np.zeros(len(splits))
    precisions_per_split = np.zeros(len(splits))
    recalls_per_split = np.zeros(len(splits))
    for ii, (X, y) in enumerate([(X_train, y_train), (X_valid, y_valid),
                                 (X_test, y_test)]):
        # drop all-NaN padding time steps before scoring
        keep_inds = torch.logical_not(
            torch.all(torch.isnan(torch.FloatTensor(X)), dim=-1))
        y_pred_proba_pos = clf.predict_proba(X)[keep_inds][:, 1].detach().numpy()
        auroc_per_split[ii] = roc_auc_score(y[keep_inds], y_pred_proba_pos)
        auprc_per_split[ii] = average_precision_score(y[keep_inds],
                                                      y_pred_proba_pos)
        y_pred = y_pred_proba_pos >= best_thr
        precisions_per_split[ii] = precision_score(y[keep_inds], y_pred)
        recalls_per_split[ii] = recall_score(y[keep_inds], y_pred)

    auroc_train, auroc_valid, auroc_test = auroc_per_split
    auprc_train, auprc_valid, auprc_test = auprc_per_split
    precision_train, precision_valid, precision_test = precisions_per_split
    recall_train, recall_valid, recall_test = recalls_per_split

    # save performance
    perf_dict = {
        'auroc_train': auroc_train,
        'auroc_valid': auroc_valid,
        'auroc_test': auroc_test,
        'auprc_train': auprc_train,
        'auprc_valid': auprc_valid,
        'auprc_test': auprc_test,
        'precision_train': precision_train,
        'precision_valid': precision_valid,
        'precision_test': precision_test,
        'recall_train': recall_train,
        'recall_valid': recall_valid,
        'recall_test': recall_test,
        'threshold': best_thr
    }
    perf_df = pd.DataFrame([perf_dict])
    perf_csv = os.path.join(args.output_dir, output_filename_prefix + '.csv')
    print('Final performance on train, valid and test :\n')
    print(perf_df)
    print('Final performance saved to %s' % perf_csv)
    perf_df.to_csv(perf_csv, index=False)
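# --- Illustration (not part of the script above): the same
# max-recall-at-fixed-precision threshold search on toy validation scores.
# All names and numbers here are made up; this assumes at least one threshold
# meets the precision floor (the script above handles the fallback case).
import numpy as np
from sklearn.metrics import precision_score, recall_score

rng = np.random.RandomState(0)
y_va = rng.binomial(1, 0.2, size=200)
scores = 0.2 * y_va + rng.uniform(0.0, 0.8, size=200)  # positives score higher

fixed_precision = 0.3
thr_grid = np.linspace(scores.min(), scores.max(), 50)
prec = np.array([precision_score(y_va, scores >= t, zero_division=0)
                 for t in thr_grid])
rec = np.array([recall_score(y_va, scores >= t) for t in thr_grid])

# among thresholds meeting the precision floor, take the one with max recall
ok = prec >= fixed_precision
best_thr = thr_grid[ok][np.argmax(rec[ok])]
print('chosen threshold : %.3f' % best_thr)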
    # Get the tstops_df for each patient-stay-slice
    tstops_df = pd.read_csv(
        os.path.join(args.tstops_dir, 'TSLICE={tslice}',
                     'tstops_filtered_{tslice}_hours.csv.gz').format(
                         tslice=tslice))

    x_train_curr_tslice, y_train_curr_tslice = get_tslice_x_y(
        x_train, y_train, tstops_df, id_cols, time_col)
    x_test_curr_tslice, y_test_curr_tslice = get_tslice_x_y(
        x_test, y_test, tstops_df, id_cols, time_col)

    # limit sequence length
    reduced_T = 200

    print('Getting train and test sets for all patient stay slices...')

    # pass each dataframe through the dataset loader to build one tensor apiece
    train_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_train_curr_tslice,
        y_csv_path=y_train_curr_tslice,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name='clinical_deterioration_outcome',
        y_label_type='per_sequence',
        batch_size=45000,
        max_seq_len=reduced_T)

    test_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_test_curr_tslice,
        y_csv_path=y_test_curr_tslice,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name='clinical_deterioration_outcome',
        y_label_type='per_sequence',
        batch_size=10,
        max_seq_len=reduced_T)
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch RNN with variable-length numeric sequences wrapper')
    parser.add_argument('--train_vitals_csv', type=str,
                        help='Location of vitals data for training')
    parser.add_argument('--test_vitals_csv', type=str,
                        help='Location of vitals data for testing')
    parser.add_argument('--metadata_csv', type=str,
                        help='Location of metadata for testing and training')
    parser.add_argument('--data_dict', type=str)
    parser.add_argument('--epochs', type=int, default=1000,
                        help='Number of epochs')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--report_dir', type=str, default='html',
                        help='dir in which to save results report')
    parser.add_argument('--simulated_data_dir', type=str,
                        default='simulated_data/2-state/',
                        help='dir in which simulated data is saved')
    parser.add_argument('--is_data_simulated', type=bool, default=False,
                        help='boolean to check if data is simulated or from mimic')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    device = 'cpu'

    # extract data
    if not args.is_data_simulated:
        train_vitals = TidySequentialDataCSVLoader(
            per_tstep_csv_path=args.train_vitals_csv,
            per_seq_csv_path=args.metadata_csv,
            idx_col_names=['subject_id', 'episode_id'],
            x_col_names='__all__',
            y_col_name='inhospital_mortality',
            y_label_type='per_tstep')
        test_vitals = TidySequentialDataCSVLoader(
            per_tstep_csv_path=args.test_vitals_csv,
            per_seq_csv_path=args.metadata_csv,
            idx_col_names=['subject_id', 'episode_id'],
            x_col_names='__all__',
            y_col_name='inhospital_mortality',
            y_label_type='per_tstep')

    X_train_with_time_appended, y_train = train_vitals.get_batch_data(batch_id=0)
    X_test_with_time_appended, y_test = test_vitals.get_batch_data(batch_id=0)
    _, T, F = X_train_with_time_appended.shape
    if T > 1:
        X_train = X_train_with_time_appended[:, :, 1:]  # removing hours column
        X_test = X_test_with_time_appended[:, :, 1:]  # removing hours column
    else:  # account for collapsed features across time
        X_train = X_train_with_time_appended
        X_test = X_test_with_time_appended

    # set class weights as 1/(number of samples in class) for each class
    # to handle class imbalance
    class_weights = torch.tensor(
        [1 / (y_train == 0).sum(), 1 / (y_train == 1).sum()]).double()

    # define an AUC scorer function and pass it as a skorch callback
    # to track training and validation AUROC
    roc_auc_scorer = make_scorer(roc_auc_score, greater_is_better=True,
                                 needs_threshold=True)

    # scale features
    X_train = standard_scaler_3d(X_train)
    X_test = standard_scaler_3d(X_test)

    # Define parameter grid
    params = {
        'lr': [0.0001, 0.0005, 0.001, 0.005, 0.01],
        'optimizer__weight_decay': [0.0001, 0.001, 0.01, 0.1, 1, 10]
    }

    # ---------------------------------------------------------------------#
    # LSTM with GridSearchCV
    # ---------------------------------------------------------------------#
    print('-------------------------------------------------------------------')
    print('Running LSTM converted to logistic regression on collapsed features')
    model_name = 'logreg_hist'
    save_cv_results = SaveCVResults(dirname=args.report_dir,
                                    f_history=model_name + '.json')
    rnn = RNNBinaryClassifier(
        max_epochs=args.epochs,
        batch_size=-1,
        device=device,
        callbacks=[
            skorch.callbacks.EpochScoring(roc_auc_scorer, lower_is_better=False,
                                          on_train=True,
                                          name='aucroc_score_train'),
            skorch.callbacks.EpochScoring(roc_auc_scorer, lower_is_better=False,
                                          on_train=False,
                                          name='aucroc_score_valid'),
            skorch.callbacks.EarlyStopping(monitor='aucroc_score_valid',
                                           patience=5, threshold=1e-10,
                                           threshold_mode='rel',
                                           lower_is_better=False),
            save_cv_results,
        ],
        criterion=torch.nn.NLLLoss,
        criterion__weight=class_weights,
        train_split=skorch.dataset.CVSplit(0.2),
        module__rnn_type='LSTM',
        module__n_layers=1,
        module__n_hiddens=X_train.shape[-1],
        module__n_inputs=X_train.shape[-1],
        module__convert_to_log_reg=True,
        optimizer=torch.optim.Adam)

    gs = GridSearchCV(rnn, params, scoring=roc_auc_scorer, refit=True,
                      cv=ShuffleSplit(n_splits=1, test_size=0.2,
                                      random_state=14232))
    lr_cv = gs.fit(X_train, y_train)

    y_pred_proba = lr_cv.best_estimator_.predict_proba(X_train)
    y_pred_proba_neg, y_pred_proba_pos = zip(*y_pred_proba)
    auroc_train_final = roc_auc_score(y_train, y_pred_proba_pos)
    print('AUROC with logistic regression (Train) : %.3f' % auroc_train_final)

    y_pred_proba = lr_cv.best_estimator_.predict_proba(X_test)
    y_pred_proba_neg, y_pred_proba_pos = zip(*y_pred_proba)
    auroc_test_final = roc_auc_score(y_test, y_pred_proba_pos)
    print('AUROC with logistic regression (Test) : %.3f' % auroc_test_final)

    # get the loss plots for logistic regression
    plot_training_history(model_name='logreg_hist',
                          model_alias='Logistic Regression',
                          report_dir=args.report_dir, params=params,
                          auroc_train_final=auroc_train_final,
                          auroc_test_final=auroc_test_final)

    # LSTM
    print('-------------------------------------------------------------------')
    print('Running LSTM on Collapsed Features')
    model_name = 'lstm_hist'
    save_cv_results = SaveCVResults(dirname=args.report_dir,
                                    f_history=model_name + '.json')
    rnn = RNNBinaryClassifier(
        max_epochs=args.epochs,
        batch_size=-1,
        device=device,
        callbacks=[
            save_cv_results,
            EpochScoring('roc_auc', lower_is_better=False, on_train=True,
                         name='aucroc_score_train'),
            EpochScoring('roc_auc', lower_is_better=False, on_train=False,
                         name='aucroc_score_valid'),
            EarlyStopping(monitor='aucroc_score_valid', patience=5,
                          threshold=0.002, threshold_mode='rel',
                          lower_is_better=False)
        ],
        criterion=torch.nn.CrossEntropyLoss,
        criterion__weight=class_weights,
        train_split=skorch.dataset.CVSplit(0.2),
        module__rnn_type='LSTM',
        module__n_layers=1,
        module__n_hiddens=X_train.shape[-1],
        module__n_inputs=X_train.shape[-1],
        module__convert_to_log_reg=False,
        optimizer=torch.optim.Adam)

    gs = GridSearchCV(rnn, params, scoring='roc_auc',
                      cv=ShuffleSplit(n_splits=1, test_size=0.2,
                                      random_state=14232))
    rnn_cv = gs.fit(X_train, y_train)

    y_pred_proba = rnn_cv.predict_proba(X_train)
    y_pred_proba_neg, y_pred_proba_pos = zip(*y_pred_proba)
    auroc_train_final = roc_auc_score(y_train, y_pred_proba_pos)
    print('AUROC with LSTM (Train) : %.2f' % auroc_train_final)

    y_pred_proba = rnn_cv.predict_proba(X_test)
    y_pred_proba_neg, y_pred_proba_pos = zip(*y_pred_proba)
    auroc_test_final = roc_auc_score(y_test, y_pred_proba_pos)
    print('AUROC with LSTM (Test) : %.2f' % auroc_test_final)

    # get the loss plots for LSTM
    plot_training_history(model_name='lstm_hist', model_alias='LSTM',
                          report_dir=args.report_dir, params=params,
                          auroc_train_final=auroc_train_final,
                          auroc_test_final=auroc_test_final)
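# --- Illustration (not part of the script above): the GridSearchCV-over-skorch
# pattern used above, shrunk to a stand-alone runnable sketch. TinyNet and the
# data are made up; the real script tunes RNNBinaryClassifier instead.
import numpy as np
import torch
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from skorch import NeuralNetClassifier

class TinyNet(torch.nn.Module):
    def __init__(self, n_hiddens=8):
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Linear(5, n_hiddens), torch.nn.ReLU(),
            torch.nn.Linear(n_hiddens, 2))

    def forward(self, X):
        return self.net(X)  # logits; monotone in probability, so AUC is unaffected

X = np.random.RandomState(0).randn(200, 5).astype(np.float32)
y = (X[:, 0] > 0).astype(np.int64)

# skorch nets expose get_params/set_params, so sklearn's GridSearchCV can
# clone them and tune lr and optimizer__weight_decay exactly as above
net = NeuralNetClassifier(TinyNet, criterion=torch.nn.CrossEntropyLoss,
                          max_epochs=5, train_split=None, verbose=0)
params = {'lr': [0.01, 0.1], 'optimizer__weight_decay': [0.0, 0.01]}
gs = GridSearchCV(net, params, scoring='roc_auc', refit=True,
                  cv=ShuffleSplit(n_splits=1, test_size=0.2, random_state=0))
gs.fit(X, y)
print(gs.best_params_, gs.best_score_)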