                        required=False, default=None)
    parser.add_argument('--dynamic_mews_data_dict', type=str,
                        required=False, default=None)
    parser.add_argument('--dynamic_outcomes_csv', type=str,
                        required=False, default=None)
    args = parser.parse_args()

    print('reading features...')
    ts_df = pd.read_csv(args.input)
    data_dict = load_data_dict_json(args.data_dict)
    print('done reading features...')

    print('reading outcomes...')
    outcomes_df = pd.read_csv(args.outcomes)
    data_dict_outcomes = load_data_dict_json(args.data_dict_outcomes)
    print('done reading outcomes...')

    # define the mews score dataframe
    max_val = np.inf
    mews_list = [['systolic_blood_pressure', 0, 70, 3],
                 ['systolic_blood_pressure', 70, 80, 2],
                 ['systolic_blood_pressure', 80, 100, 1],
                 ['systolic_blood_pressure', 100, 199, 0],
                 ['systolic_blood_pressure', 199, max_val, 2],
                 ['heart_rate', 0, 40, 2],
                 ['heart_rate', 40, 50, 1],
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch RNN with variable-length numeric sequences wrapper')
    parser.add_argument('--outcome_col_name', type=str, required=True)
    parser.add_argument('--train_csv_files', type=str, required=True)
    parser.add_argument('--test_csv_files', type=str, required=True)
    parser.add_argument('--data_dict_files', type=str, required=True)
    parser.add_argument('--batch_size', type=int, default=1024,
                        help='Number of sequences per minibatch')
    parser.add_argument('--epochs', type=int, default=50,
                        help='Number of epochs')
    parser.add_argument('--hidden_units', type=int, default=32,
                        help='Number of hidden units')
    parser.add_argument('--hidden_layers', type=int, default=1,
                        help='Number of hidden layers')
    parser.add_argument('--lr', type=float, default=0.0005,
                        help='Learning rate for the optimizer')
    parser.add_argument('--dropout', type=float, default=0,
                        help='dropout probability')
    parser.add_argument('--weight_decay', type=float, default=0.0001,
                        help='weight decay for the optimizer')
    parser.add_argument('--seed', type=int, default=1111,
                        help='random seed')
    parser.add_argument('--validation_size', type=float, default=0.15,
                        help='validation split size')
    parser.add_argument('--is_data_simulated', type=bool, default=False,
                        help='boolean to check if data is simulated or from mimic')
    parser.add_argument('--simulated_data_dir', type=str,
                        default='simulated_data/2-state/',
                        help='dir in which simulated data is saved. Must be provided if is_data_simulated = True')
    parser.add_argument('--output_dir', type=str, default=None,
                        help='directory where trained model and loss curves over epochs are saved')
    parser.add_argument('--output_filename_prefix', type=str, default=None,
                        help='prefix for the training history jsons and trained classifier')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    device = 'cpu'

    x_train_csv_filename, y_train_csv_filename = args.train_csv_files.split(',')
    x_test_csv_filename, y_test_csv_filename = args.test_csv_files.split(',')
    x_dict, y_dict = args.data_dict_files.split(',')
    x_data_dict = load_data_dict_json(x_dict)

    # get the id and feature columns
    id_cols = parse_id_cols(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)

    # extract data
    train_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_train_csv_filename,
        y_csv_path=y_train_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_sequence')

    test_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_test_csv_filename,
        y_csv_path=y_test_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_sequence')

    X_train, y_train = train_vitals.get_batch_data(batch_id=0)
    X_test, y_test = test_vitals.get_batch_data(batch_id=0)
    _, T, F = X_train.shape

    print('number of time points : %s\nnumber of features : %s\n' % (T, F))

    # set class weights as 1/(number of samples in class) for each class to handle class imbalance
    class_weights = torch.tensor([1 / (y_train == 0).sum(),
                                  1 / (y_train == 1).sum()]).double()

    # scale features
    # X_train = standard_scaler_3d(X_train)
    # X_test = standard_scaler_3d(X_test)

    # callback to compute gradient norm
    compute_grad_norm = ComputeGradientNorm(norm_type=2)

    # LSTM
    # NOTE: max_epochs is fixed at 50 below; args.epochs is parsed but not used here
    if args.output_filename_prefix is None:
        output_filename_prefix = (
            'hiddens=%s-layers=%s-lr=%s-dropout=%s-weight_decay=%s' %
            (args.hidden_units, args.hidden_layers, args.lr,
             args.dropout, args.weight_decay))
    else:
        output_filename_prefix = args.output_filename_prefix

    print('RNN parameters : ' + output_filename_prefix)

    # from IPython import embed; embed()
    rnn = RNNBinaryClassifier(
        max_epochs=50,
        batch_size=args.batch_size,
        device=device,
        lr=args.lr,
        callbacks=[
            EpochScoring('roc_auc', lower_is_better=False, on_train=True,
                         name='aucroc_score_train'),
            EpochScoring('roc_auc', lower_is_better=False, on_train=False,
                         name='aucroc_score_valid'),
            EarlyStopping(monitor='aucroc_score_valid', patience=20,
                          threshold=0.002, threshold_mode='rel',
                          lower_is_better=False),
            LRScheduler(policy=ReduceLROnPlateau, mode='max',
                        monitor='aucroc_score_valid', patience=10),
            compute_grad_norm,
            GradientNormClipping(gradient_clip_value=0.3,
                                 gradient_clip_norm_type=2),
            Checkpoint(monitor='aucroc_score_valid',
                       f_history=os.path.join(
                           args.output_dir, output_filename_prefix + '.json')),
            TrainEndCheckpoint(dirname=args.output_dir,
                               fn_prefix=output_filename_prefix),
        ],
        criterion=torch.nn.CrossEntropyLoss,
        criterion__weight=class_weights,
        train_split=skorch.dataset.CVSplit(args.validation_size),
        module__rnn_type='LSTM',
        module__n_layers=args.hidden_layers,
        module__n_hiddens=args.hidden_units,
        module__n_inputs=X_train.shape[-1],
        module__dropout_proba=args.dropout,
        optimizer=torch.optim.Adam,
        optimizer__weight_decay=args.weight_decay)

    clf = rnn.fit(X_train, y_train)

    # evaluate AUROC on train and test
    y_pred_proba = clf.predict_proba(X_train)
    y_pred_proba_neg, y_pred_proba_pos = zip(*y_pred_proba)
    auroc_train_final = roc_auc_score(y_train, y_pred_proba_pos)
    print('AUROC with LSTM (Train) : %.2f' % auroc_train_final)

    y_pred_proba = clf.predict_proba(X_test)
    y_pred_proba_neg, y_pred_proba_pos = zip(*y_pred_proba)
    auroc_test_final = roc_auc_score(y_test, y_pred_proba_pos)
    print('AUROC with LSTM (Test) : %.2f' % auroc_test_final)
        clf_model_file = os.path.join(args.clf_models_dir, model,
                                      '%s_trained_model.joblib' % model)
        clf_model = load(clf_model_file)
        clf_models_dict[model] = clf_model

    clf_models_dict['mews'] = pd.read_csv(
        os.path.join(args.clf_models_dir, 'mews', 'mews_best_threshold.csv'))

    ## get the test patient id's
    # get the test set's csv and dict
    y_test_df = pd.read_csv(
        os.path.join(args.clf_train_test_split_dir, 'y_test.csv'))
    y_test_dict_file = os.path.join(args.clf_train_test_split_dir, 'y_dict.json')

    # import the y dict to get the id cols
    y_test_dict = load_data_dict_json(y_test_dict_file)
    id_cols = parse_id_cols(y_test_dict)

    tslice_folders = os.path.join(args.tslice_folder, 'TSLICE=')
    collapsed_tslice_folders = os.path.join(args.collapsed_tslice_folder, 'TSLICE=')
    outcome_col = args.outcome_column_name
    tslices_list = args.evaluation_tslices.split(' ')

    y_test_ids_df = y_test_df[id_cols].drop_duplicates(
        subset=id_cols).reset_index(drop=True)

    # get demographics csv and data_dict
    # for each patient get their vitals, labs, demographics
    _, _, _, _, demographics_df, demographics_data_dict, _, _ = get_preprocessed_data(
        args.preproc_data_dir)
    parser.add_argument('--preproc_data_dir', type=str,
                        help='folder where the preprocessed data is stored')
    parser.add_argument('--outcome_column_name',
                        default='clinical_deterioration_outcome', type=str,
                        help='name of outcome column in test dataframe')
    parser.add_argument('--output_dir', default='', type=str,
                        help='dir to save plots')
    args = parser.parse_args()

    ## get the test patient id's
    # get the test set's csv and dict
    y_test_df = pd.read_csv(
        os.path.join(args.shallow_clf_train_test_split_dir, 'y_test.csv'))
    y_test_dict_file = os.path.join(args.shallow_clf_train_test_split_dir, 'y_dict.json')
    y_test_dict = load_data_dict_json(y_test_dict_file)
    id_cols = parse_id_cols(y_test_dict)
    # rnn_train_test_split_dir = args.rnn_train_test_split_dir.replace(' ', '')

    ## get the data dict of sequence features with mask features
    x_test_dict_file = os.path.join(args.rnn_train_test_split_dir, 'x_dict.json')
    x_test_dict = load_data_dict_json(x_test_dict_file)
    feature_cols_with_mask_features = parse_feature_cols(x_test_dict)
    x_train_df = pd.read_csv(os.path.join(args.rnn_train_test_split_dir, 'x_train.csv'))

    # load shallow models
    shallow_models = ['logistic_regression', 'random_forest']
    clf_models_dict = dict.fromkeys(shallow_models)
    for model in shallow_models:
        clf_model_file = os.path.join(args.shallow_clf_models_dir, model,
                                      '%s_trained_model.joblib' % model)
        clf_model = load(clf_model_file)
    args = parser.parse_args()

    print('Loading mews scores...')

    # Get collapsed features
    DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH = args.dynamic_collapsed_features_folder
    dynamic_mews_df = pd.read_csv(
        os.path.join(DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH,
                     'MewsDynamic.csv.gz'))
    demographics_df = pd.read_csv(
        os.path.join(args.static_data_dict_dir, 'demographics_before_icu.csv.gz'))

    # get data dicts of collapsed features
    demographics_dd = load_data_dict_json(
        os.path.join(args.static_data_dict_dir, 'Spec-Demographics.json'))
    outcomes_dd = load_data_dict_json(
        os.path.join(args.static_data_dict_dir, 'Spec-Outcomes_TransferToICU.json'))

    # merge vitals, labs and medications
    id_cols = parse_id_cols(demographics_dd)

    print('Merging demographics...')
    # merge demographics
    dynamic_mews_df = pd.merge(dynamic_mews_df, demographics_df,
                               on=id_cols, how='left')

    # Set the dynamic outputs to be the same as the vitals dynamic outputs,
    # because all stays contain at least 1 vital
                        help='fraction of features considered at each split of rf')
    parser.add_argument('--n_splits', type=int, default=2)
    parser.add_argument('--n_estimators', type=int, default=25)
    parser.add_argument('--merge_x_y', default=True,
                        type=lambda x: (str(x).lower() == 'true'),
                        required=False)
    args = parser.parse_args()

    # read the data dictionaries
    print('Reading train-test data...')

    # read the data dict JSONs and parse the feature and outcome columns
    x_data_dict_file, y_data_dict_file = args.data_dict_files.split(',')
    x_data_dict = load_data_dict_json(x_data_dict_file)
    y_data_dict = load_data_dict_json(y_data_dict_file)

    feature_cols = parse_feature_cols(x_data_dict)
    key_cols = parse_id_cols(x_data_dict)

    df_by_split = dict()
    for split_name, csv_files in [('train', args.train_csv_files.split(',')),
                                  ('test', args.test_csv_files.split(','))]:
        cur_df = None
        for csv_file in csv_files:
            # TODO use json data dict to load specific columns as desired types
            more_df = pd.read_csv(csv_file)
            if cur_df is None:
                cur_df = more_df
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_test_split_dir', type=str)
    parser.add_argument('--output_dir', type=str)
    parser.add_argument('--normalization', type=str, default='minmax')
    args = parser.parse_args()

    # get the train test features
    x_train_csv = os.path.join(args.train_test_split_dir, 'x_train.csv.gz')
    x_valid_csv = os.path.join(args.train_test_split_dir, 'x_valid.csv.gz')
    x_test_csv = os.path.join(args.train_test_split_dir, 'x_test.csv.gz')
    x_dict_json = os.path.join(args.train_test_split_dir, 'x_dict.json')

    # impute values by carry forward and then pop mean on train and test sets separately
    x_data_dict = load_data_dict_json(x_dict_json)
    x_train_df = pd.read_csv(x_train_csv)
    x_valid_df = pd.read_csv(x_valid_csv)
    x_test_df = pd.read_csv(x_test_csv)

    id_cols = parse_id_cols(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)
    time_col = parse_time_col(x_data_dict)

    # add mask features
    non_medication_feature_cols = [
        feature_col for feature_col in feature_cols
        if 'medication' not in feature_col
    ]
    medication_feature_cols = [
        feature_col for feature_col in feature_cols
def main():
    parser = argparse.ArgumentParser(
        description='Keras CNN with variable-length numeric sequences wrapper')
    parser.add_argument('--outcome_col_name', type=str, required=True)
    parser.add_argument('--train_csv_files', type=str, required=True)
    parser.add_argument('--test_csv_files', type=str, required=True)
    parser.add_argument('--data_dict_files', type=str, required=True)
    parser.add_argument('--batch_size', type=int, default=1024,
                        help='Number of sequences per minibatch')
    parser.add_argument('--epochs', type=int, default=50,
                        help='Number of epochs')
    parser.add_argument('--n_filters', type=int, default=32,
                        help='Number of filters')
    parser.add_argument('--kernel_size', type=int, default=1,
                        help='size of each kernel')
    parser.add_argument('--n_conv_layers', type=int, default=1,
                        help='number of convolutional layers')
    parser.add_argument('--stride', type=int, default=1,
                        help='stride')
    parser.add_argument('--pool_size', type=int, default=4,
                        help='max pool size')
    parser.add_argument('--dense_units', type=int, default=128,
                        help='number of units in fully connected layer')
    parser.add_argument('--lr', type=float, default=0.0005,
                        help='Learning rate for the optimizer')
    parser.add_argument('--dropout', type=float, default=0,
                        help='dropout probability')
    parser.add_argument('--weight_decay', type=float, default=0.0001,
                        help='weight decay for the optimizer')
    parser.add_argument('--seed', type=int, default=1111,
                        help='random seed')
    parser.add_argument('--validation_size', type=float, default=0.15,
                        help='validation split size')
    parser.add_argument('--output_dir', type=str, default=None,
                        help='directory where trained model and loss curves over epochs are saved')
    parser.add_argument('--output_filename_prefix', type=str, default=None,
                        help='prefix for the training history jsons and trained classifier')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    device = 'cpu'

    x_train_csv_filename, y_train_csv_filename = args.train_csv_files.split(',')
    x_test_csv_filename, y_test_csv_filename = args.test_csv_files.split(',')
    x_dict, y_dict = args.data_dict_files.split(',')
    x_data_dict = load_data_dict_json(x_dict)

    # get the id and feature columns
    id_cols = parse_id_cols(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)

    # extract data
    train_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_train_csv_filename,
        y_csv_path=y_train_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_sequence')

    test_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_test_csv_filename,
        y_csv_path=y_test_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_sequence')

    X_train, y_train = train_vitals.get_batch_data(batch_id=0)
    X_test, y_test = test_vitals.get_batch_data(batch_id=0)
    N, T, F = X_train.shape

    # add class weights
    class_weights = class_weight.compute_class_weight('balanced',
                                                      np.unique(y_train),
                                                      y_train)
    # class_weights = dict(zip(range(len(class_weights)), class_weights))

    # convert y_train to categorical
    y_train = keras.utils.to_categorical(y_train)
    y_test = keras.utils.to_categorical(y_test)

    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=args.validation_size, random_state=213)

    print('number of time points : %s\nnumber of features : %s\n' % (T, F))

    set_random_seed(args.seed)

    model = keras.Sequential()
    for i in range(args.n_conv_layers):
        model.add(
            keras.layers.Conv1D(filters=args.n_filters,
                                kernel_size=args.kernel_size,
                                activation='relu',
                                strides=args.stride))
    model.add(keras.layers.Dropout(args.dropout))
    model.add(keras.layers.MaxPooling1D(pool_size=args.pool_size))
    model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(args.dense_units, activation='relu'))
    model.add(keras.layers.Dense(2, activation='softmax'))

    # set optimizer
    opt = keras.optimizers.Adam(learning_rate=args.lr)
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy', keras.metrics.AUC()])

    # set early stopping
    early_stopping = EarlyStopping(monitor='val_auc', patience=20,
                                   mode='max', verbose=1)

    model.fit(X_train, y_train,
              epochs=100,
              validation_data=(X_val, y_val),
              callbacks=[early_stopping],
              class_weight=class_weights,
              batch_size=args.batch_size)

    y_score_val = model.predict_proba(X_val)
    val_auc = roc_auc_score(y_val, y_score_val)
    print('AUC on val set : %.4f' % val_auc)

    y_score_test = model.predict_proba(X_test)
    test_auc = roc_auc_score(y_test, y_score_test)
    print('AUC on test set : %.4f' % test_auc)

    # save the model history
    training_hist_df = pd.DataFrame(model.history.history)
    training_hist_df.loc[:, 'test_auc'] = test_auc
    training_hist_csv = os.path.join(args.output_dir,
                                     args.output_filename_prefix + '.csv')
    training_hist_df.to_csv(training_hist_csv, index=False)

    # save the model
    model_file = os.path.join(args.output_dir,
                              args.output_filename_prefix + '.model')
    model.save(model_file)
    # Get collapsed features
    DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH = args.dynamic_collapsed_features_folder
    dynamic_collapsed_vitals_df = pd.read_csv(
        os.path.join(DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH,
                     'CollapsedVitalsDynamic.csv.gz'))
    dynamic_collapsed_labs_df = pd.read_csv(
        os.path.join(DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH,
                     'CollapsedLabsDynamic.csv.gz'))
    demographics_df = pd.read_csv(
        os.path.join(args.static_data_dict_dir, 'demographics_before_icu.csv.gz'))

    # get data dicts of collapsed features
    vitals_dd = load_data_dict_json(
        os.path.join(DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH,
                     'Spec_CollapsedVitalsDynamic.json'))
    labs_dd = load_data_dict_json(
        os.path.join(DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH,
                     'Spec_CollapsedLabsDynamic.json'))
    demographics_dd = load_data_dict_json(
        os.path.join(args.static_data_dict_dir, 'Spec-Demographics.json'))
    outcomes_dd = load_data_dict_json(
        os.path.join(args.static_data_dict_dir, 'Spec-Outcomes_TransferToICU.json'))

    # get dynamic outputs
    vitals_output = pd.read_csv(
        os.path.join(DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH,
                     'OutputsDynamicVitals.csv.gz'))
    labs_output = pd.read_csv(
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch RNN with variable-length numeric sequences wrapper')
    parser.add_argument('--outcome_col_name', type=str, required=True)
    parser.add_argument('--train_csv_files', type=str, required=True)
    parser.add_argument('--valid_csv_files', type=str, required=True)
    parser.add_argument('--test_csv_files', type=str, required=True)
    parser.add_argument('--data_dict_files', type=str, required=True)
    parser.add_argument('--batch_size', type=int, default=1024,
                        help='Number of sequences per minibatch')
    parser.add_argument('--epochs', type=int, default=50,
                        help='Number of epochs')
    parser.add_argument('--hidden_units', type=int, default=32,
                        help='Number of hidden units')
    parser.add_argument('--hidden_layers', type=int, default=1,
                        help='Number of hidden layers')
    parser.add_argument('--lr', type=float, default=0.0005,
                        help='Learning rate for the optimizer')
    parser.add_argument('--dropout', type=float, default=0,
                        help='dropout probability')
    parser.add_argument('--weight_decay', type=float, default=0.0001,
                        help='weight decay for the optimizer')
    parser.add_argument('--seed', type=int, default=1111,
                        help='random seed')
    parser.add_argument('--validation_size', type=float, default=0.15,
                        help='validation split size')
    parser.add_argument('--is_data_simulated', type=bool, default=False,
                        help='boolean to check if data is simulated or from mimic')
    parser.add_argument('--output_dir', type=str, default=None,
                        help='directory where trained model and loss curves over epochs are saved')
    parser.add_argument('--output_filename_prefix', type=str, default=None,
                        help='prefix for the training history jsons and trained classifier')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    device = 'cpu'

    x_train_csv_filename, y_train_csv_filename = args.train_csv_files.split(',')
    x_valid_csv_filename, y_valid_csv_filename = args.valid_csv_files.split(',')
    x_test_csv_filename, y_test_csv_filename = args.test_csv_files.split(',')
    x_dict, y_dict = args.data_dict_files.split(',')
    x_data_dict = load_data_dict_json(x_dict)

    # get the id and feature columns
    id_cols = parse_id_cols(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)

    # extract data
    train_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_train_csv_filename,
        y_csv_path=y_train_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_tstep')

    valid_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_valid_csv_filename,
        y_csv_path=y_valid_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_tstep')

    test_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_test_csv_filename,
        y_csv_path=y_test_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_tstep')

    X_train, y_train = train_vitals.get_batch_data(batch_id=0)
    X_valid, y_valid = valid_vitals.get_batch_data(batch_id=0)
    X_test, y_test = test_vitals.get_batch_data(batch_id=0)
    N, T, F = X_train.shape

    # from IPython import embed; embed()
    # X_train = (X_train - np.min(X_train))/(np.max(X_train)-np.min(X_train))
    # X_valid = (X_valid - np.min(X_train))/(np.max(X_train)-np.min(X_train))
    # X_test = (X_test - np.min(X_train))/(np.max(X_train)-np.min(X_train))

    valid_ds = Dataset(X_valid, y_valid)

    print('number of time points : %s\nnumber of features : %s\n' % (T, F))

    # set class weights as 1/(number of samples in class) for each class to handle class imbalance
    class_weights = torch.tensor([1 / (y_train == 0).sum(),
                                  1 / (y_train == 1).sum()]).float()

    print('Number of training sequences : %s' % N)
    print('Number of test sequences : %s' % X_test.shape[0])
    print('Ratio positive in train : %.2f' % ((y_train == 1).sum() / len(y_train)))
    print('Ratio positive in test : %.2f' % ((y_test == 1).sum() / len(y_test)))

    # callback to compute gradient norm
    compute_grad_norm = ComputeGradientNorm(norm_type=2)

    # build output filename prefix from the hyperparameters if none is given
    if args.output_filename_prefix is None:
        output_filename_prefix = (
            'hiddens=%s-layers=%s-lr=%s-dropout=%s-weight_decay=%s' %
            (args.hidden_units, args.hidden_layers, args.lr,
             args.dropout, args.weight_decay))
    else:
        output_filename_prefix = args.output_filename_prefix

    print('RNN parameters : ' + output_filename_prefix)

    loss_early_stopping_cp = EarlyStopping(monitor='valid_loss',
                                           patience=15,
                                           threshold=0.002,
                                           threshold_mode='rel',
                                           lower_is_better=True)

    rnn = RNNPerTStepBinaryClassifier(
        max_epochs=250,
        batch_size=args.batch_size,
        device=device,
        lr=args.lr,
        callbacks=[
            EpochScoring(calc_auprc, lower_is_better=False, on_train=True,
                         name='auprc_train'),
            EpochScoring(calc_auprc, lower_is_better=False, on_train=False,
                         name='auprc_valid'),
            EpochScoring(calc_auroc, lower_is_better=False, on_train=True,
                         name='auroc_train'),
            EpochScoring(calc_auroc, lower_is_better=False, on_train=False,
                         name='auroc_valid'),
            # EpochScoring(calc_precision, lower_is_better=False, on_train=True, name='precision_train'),
            # EpochScoring(calc_precision, lower_is_better=False, on_train=False, name='precision_valid'),
            # EpochScoring(calc_recall, lower_is_better=False, on_train=True, name='recall_train'),
            # EpochScoring(calc_recall, lower_is_better=False, on_train=False, name='recall_valid'),
            # EpochScoring('roc_auc', lower_is_better=False, on_train=True, name='aucroc_score_train'),
            # EpochScoring('roc_auc', lower_is_better=False, on_train=False, name='aucroc_score_valid'),
            # EarlyStopping(monitor='auprc_valid', patience=5, threshold=0.002, threshold_mode='rel',
            #               lower_is_better=False),
            # LRScheduler(policy=ReduceLROnPlateau, mode='max', monitor='aucroc_score_valid', patience=10),
            # compute_grad_norm,
            # GradientNormClipping(gradient_clip_value=0.5, gradient_clip_norm_type=2),
            loss_early_stopping_cp,
            Checkpoint(monitor='auprc_valid',
                       f_history=os.path.join(
                           args.output_dir, output_filename_prefix + '.json')),
            TrainEndCheckpoint(dirname=args.output_dir,
                               fn_prefix=output_filename_prefix),
        ],
        # criterion=torch.nn.CrossEntropyLoss,
        # criterion__weight=class_weights,
        train_split=predefined_split(valid_ds),
        module__rnn_type='GRU',
        module__n_layers=args.hidden_layers,
        module__n_hiddens=args.hidden_units,
        module__n_inputs=X_train.shape[-1],
        module__dropout_proba=args.dropout,
        optimizer=torch.optim.Adam,
        optimizer__weight_decay=args.weight_decay)

    # N = len(X_train)
    # X_train = X_train[:N]
    # y_train = y_train[:N]
    clf = rnn.fit(X_train, y_train)

    # get threshold with max recall at fixed precision
    fixed_precision = 0.1

    # get predict probas for y=1 on validation set, keeping only observed timesteps
    keep_inds_va = torch.logical_not(
        torch.all(torch.isnan(torch.FloatTensor(X_valid)), dim=-1))
    y_va_pred_proba = clf.predict_proba(X_valid)[keep_inds_va][:, 1].detach().numpy()

    unique_probas = np.unique(y_va_pred_proba)
    thr_grid_G = np.linspace(np.percentile(unique_probas, 1),
                             max(unique_probas), 100)

    precision_scores_G, recall_scores_G = [
        np.zeros(thr_grid_G.size), np.zeros(thr_grid_G.size)
    ]
    for gg, thr in enumerate(thr_grid_G):
        # logistic_clf.module_.linear_transform_layer.bias.data = torch.tensor(thr_grid[gg]).double()
        curr_thr_y_preds = clf.predict_proba(
            torch.FloatTensor(X_valid))[keep_inds_va][:, 1] >= thr_grid_G[gg]
        precision_scores_G[gg] = precision_score(y_valid[keep_inds_va],
                                                 curr_thr_y_preds)
        recall_scores_G[gg] = recall_score(y_valid[keep_inds_va],
                                           curr_thr_y_preds)

    keep_inds = precision_scores_G >= fixed_precision
    if keep_inds.sum() > 0:
        print('Choosing threshold with precision >= %.3f' % fixed_precision)
    else:
        fixed_precision_old = fixed_precision
        fixed_precision = np.percentile(precision_scores_G, 99)
        keep_inds = precision_scores_G >= fixed_precision
        print('Could not find threshold with precision >= %.3f \n Choosing threshold to maximize recall at precision %.3f'
              % (fixed_precision_old, fixed_precision))

    thr_grid_G = thr_grid_G[keep_inds]
    precision_scores_G = precision_scores_G[keep_inds]
    recall_scores_G = recall_scores_G[keep_inds]

    thr_perf_df = pd.DataFrame(
        np.vstack([
            thr_grid_G[np.newaxis, :],
            precision_scores_G[np.newaxis, :],
            recall_scores_G[np.newaxis, :]
        ]).T,
        columns=['thr', 'precision_score', 'recall_score'])
    print(thr_perf_df)

    best_ind = np.argmax(recall_scores_G)
    best_thr = thr_grid_G[best_ind]
    print('chosen threshold : %.3f' % best_thr)

    splits = ['train', 'valid', 'test']
    # data_splits = ((x_tr, y_tr), (x_va, y_va), (X_test, y_test))
    auroc_per_split, auprc_per_split, precisions_per_split, recalls_per_split = [
        np.zeros(len(splits)), np.zeros(len(splits)),
        np.zeros(len(splits)), np.zeros(len(splits))
    ]
    for ii, (X, y) in enumerate([(X_train, y_train), (X_valid, y_valid),
                                 (X_test, y_test)]):
        keep_inds = torch.logical_not(
            torch.all(torch.isnan(torch.FloatTensor(X)), dim=-1))
        y_pred_proba_pos = clf.predict_proba(X)[keep_inds][:, 1].detach().numpy()
        # y_pred_proba_neg, y_pred_proba_pos = zip(*y_pred_proba)
        auroc_per_split[ii] = roc_auc_score(y[keep_inds], y_pred_proba_pos)
        # y_pred_proba_pos = np.asarray(y_pred_proba_pos)
        auprc_per_split[ii] = average_precision_score(y[keep_inds],
                                                      y_pred_proba_pos)
        y_pred = y_pred_proba_pos >= best_thr
        precisions_per_split[ii] = precision_score(y[keep_inds], y_pred)
        recalls_per_split[ii] = recall_score(y[keep_inds], y_pred)

    auroc_train, auroc_valid, auroc_test = auroc_per_split
    auprc_train, auprc_valid, auprc_test = auprc_per_split
    precision_train, precision_valid, precision_test = precisions_per_split
    recall_train, recall_valid, recall_test = recalls_per_split

    # save performance
    perf_dict = {
        'auroc_train': auroc_train,
        'auroc_valid': auroc_valid,
        'auroc_test': auroc_test,
        'auprc_train': auprc_train,
        'auprc_valid': auprc_valid,
        'auprc_test': auprc_test,
        'precision_train': precision_train,
        'precision_valid': precision_valid,
        'precision_test': precision_test,
        'recall_train': recall_train,
        'recall_valid': recall_valid,
        'recall_test': recall_test,
        'threshold': best_thr
    }
    perf_df = pd.DataFrame([perf_dict])
    perf_csv = os.path.join(args.output_dir, output_filename_prefix + '.csv')
    print('Final performance on train, valid and test :\n')
    print(perf_df)
    print('Final performance saved to %s' % perf_csv)
    perf_df.to_csv(perf_csv, index=False)
                        help='validation split size')
    parser.add_argument('--pretrained_model_dir', type=str, default=None,
                        help='load pretrained model from this dir if not None. If None, then start from scratch')
    parser.add_argument('--output_dir', type=str, default=None,
                        help='directory where trained model and loss curves over epochs are saved')
    parser.add_argument('--output_filename_prefix', type=str, default=None,
                        help='prefix for the training history jsons and trained classifier')
    args = parser.parse_args()

    # Load x-train, ytrain and x-test, ytest
    print('Loading full sequence train-test data...')
    x_train = pd.read_csv(os.path.join(args.train_test_split_dir, 'x_train.csv.gz'))
    x_test = pd.read_csv(os.path.join(args.train_test_split_dir, 'x_test.csv.gz'))
    y_train = pd.read_csv(os.path.join(args.train_test_split_dir, 'y_train.csv.gz'))
    y_test = pd.read_csv(os.path.join(args.train_test_split_dir, 'y_test.csv.gz'))
    x_data_dict = load_data_dict_json(os.path.join(args.train_test_split_dir, 'x_dict.json'))
    y_data_dict = load_data_dict_json(os.path.join(args.train_test_split_dir, 'y_dict.json'))

    max_T_train = 0
    max_T_test = 0

    id_cols = parse_id_cols(x_data_dict)
    time_col = parse_time_col(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)

    x_train[feature_cols] = x_train[feature_cols].astype(np.float32)
    x_test[feature_cols] = x_test[feature_cols].astype(np.float32)

    # Get 2 different train and test dataframes divided by slice
    train_tensors_per_tslice_list = []
    test_tensors_per_tslice_list = []
def main():
    parser = argparse.ArgumentParser(description="Script for collapsing "
                                                 "time features or adding "
                                                 "new features.")
    parser.add_argument('--input', type=str, required=True,
                        help='Path to csv dataframe of readings')
    parser.add_argument('--data_dict', type=str, required=True,
                        help='Path to json data dictionary file')
    parser.add_argument('--outcomes', type=str, required=True,
                        help='Path to csv dataframe of outcomes')
    parser.add_argument('--data_dict_outcomes', type=str, required=True,
                        help='Path to json data dictionary file for outcomes')
    parser.add_argument('--dynamic_collapsed_features_csv', type=str,
                        required=False, default=None)
    parser.add_argument('--dynamic_collapsed_features_data_dict', type=str,
                        required=False, default=None)
    parser.add_argument('--dynamic_outcomes_csv', type=str,
                        required=False, default=None)
    # parser.add_argument('--dynamic_outcomes_data_dict', type=str, required=False, default=None)
    parser.add_argument('--collapse_features', type=str, required=False,
                        default='count mean median std min max',
                        help="Enclose options with 's, choose "
                             "from mean, std, min, max, "
                             "median, slope, count, present")
    parser.add_argument('--collapse_range_features', type=str, required=False,
                        default='slope std',
                        help="Enclose options with 's, choose "
                             "from mean, std, min, max, "
                             "median, slope, count, present, skew, hours_since_measured")
    parser.add_argument('--range_pairs', type=str, required=False,
                        default='[(0, 10), (0, 25), (0, 50), (50, 100), (75, 100), (90, 100), (0, 100)]',
                        help="Enclose pairs list with 's and [], list all desired ranges in "
                             "parentheses like this: '[(0, 50), (25, 75), (50, 100)]'")
    args = parser.parse_args()

    print('reading features...')
    ts_df = pd.read_csv(args.input)
    data_dict = load_data_dict_json(args.data_dict)
    print('done reading features...')

    print('reading outcomes...')
    outcomes_df = pd.read_csv(args.outcomes)
    data_dict_outcomes = load_data_dict_json(args.data_dict_outcomes)
    print('done reading outcomes...')

    # transform data
    t1 = time.time()
    dynamic_collapsed_df, dynamic_outcomes_df = collapse_dynamic(
        ts_df=ts_df,
        data_dict=data_dict,
        collapse_range_features=args.collapse_range_features,
        range_pairs=args.range_pairs,
        outcomes_df=outcomes_df,
        data_dict_outcomes=data_dict_outcomes)

    dynamic_collapsed_features_data_dict = update_data_dict_collapse(
        data_dict, args.collapse_range_features, args.range_pairs)
    t2 = time.time()
    print('done collapsing data..')
    print('time taken to collapse data : {} seconds'.format(t2 - t1))

    # save data to file
    dynamic_collapsed_df.to_csv(args.dynamic_collapsed_features_csv,
                                index=False, compression='gzip')
    print('Saved dynamic collapsed features to :\n%s' %
          args.dynamic_collapsed_features_csv)

    dynamic_outcomes_df.to_csv(args.dynamic_outcomes_csv,
                               index=False, compression='gzip')
    print('Saved dynamic outcomes to :\n%s' % args.dynamic_outcomes_csv)

    # save data dictionary to file
    with open(args.dynamic_collapsed_features_data_dict, 'w') as f:
        json.dump(dynamic_collapsed_features_data_dict, f, indent=4)
    print('Saved dynamic collapsed features dict to :\n%s' %
          args.dynamic_collapsed_features_data_dict)
def main():
    parser = argparse.ArgumentParser(description="Script for collapsing "
                                                 "time features or adding "
                                                 "new features.")
    parser.add_argument('--input', type=str, required=True,
                        help='Path to csv dataframe of readings')
    parser.add_argument('--ts_data_dict', type=str, required=False,
                        help='Path to json data dictionary file')
    parser.add_argument('--outcomes', type=str, required=False,
                        help='Path to csv dataframe of outcomes')
    parser.add_argument('--outcomes_data_dict', type=str, required=False,
                        help='Path to json data dictionary file for outcomes')
    parser.add_argument('--dynamic_collapsed_features_csv', type=str,
                        required=False, default=None)
    parser.add_argument('--dynamic_collapsed_features_data_dict', type=str,
                        required=False, default=None)
    parser.add_argument('--dynamic_outcomes_csv', type=str,
                        required=False, default=None)
    parser.add_argument('--features_to_summarize', type=str, required=False,
                        default='slope std',
                        help="Enclose options with 's, choose "
                             "from mean, std, min, max, "
                             "median, slope, count, present, hours_since_measured")
    parser.add_argument('--percentile_ranges_to_summarize', type=str, required=False,
                        default='[(0, 10), (0, 25), (0, 50), (50, 100), (75, 100), (90, 100), (0, 100)]',
                        help="Enclose pairs list with 's and [], list all desired ranges in "
                             "parentheses like this: '[(0, 50), (25, 75), (50, 100)]'")
    args = parser.parse_args()

    if not os.path.exists(args.input):
        # fall back to a small synthetic dataset when the input csv does not exist
        is_fake = True
        ts_df, ts_data_dict, outcomes_df, outcomes_data_dict = make_fake_input_data(
            n_seqs=10, n_features=100, min_duration=24.0, max_duration=240.0)
    else:
        is_fake = False
        print('reading features...')
        ts_df = pd.read_csv(args.input)
        ts_data_dict = load_data_dict_json(args.ts_data_dict)
        print('done reading features...')

        print('reading outcomes...')
        outcomes_df = pd.read_csv(args.outcomes)
        outcomes_data_dict = load_data_dict_json(args.outcomes_data_dict)
        print('done reading outcomes...')

    # transform data
    t1 = time.time()
    dynamic_collapsed_df, dynamic_outcomes_df = featurize_stack_of_many_time_series(
        ts_df=ts_df,
        ts_data_dict=ts_data_dict,
        outcomes_df=outcomes_df,
        outcomes_data_dict=outcomes_data_dict,
        summary_ops=args.features_to_summarize,
        percentile_slices_to_featurize=args.percentile_ranges_to_summarize,
    )
    t2 = time.time()
    print('done collapsing data..')
    print('time taken to collapse data : {} seconds'.format(t2 - t1))

    if is_fake:
        sys.exit()

    # save data to file
    dynamic_collapsed_df.to_csv(args.dynamic_collapsed_features_csv,
                                index=False, compression='gzip')
    print('Saved dynamic collapsed features to :\n%s' %
          args.dynamic_collapsed_features_csv)

    dynamic_outcomes_df.to_csv(args.dynamic_outcomes_csv,
                               index=False, compression='gzip')
    print('Saved dynamic outcomes to :\n%s' % args.dynamic_outcomes_csv)

    # save data dictionary to file
    dynamic_collapsed_features_data_dict = update_data_dict_collapse(
        ts_data_dict, args.features_to_summarize,
        args.percentile_ranges_to_summarize)
    with open(args.dynamic_collapsed_features_data_dict, 'w') as f:
        json.dump(dynamic_collapsed_features_data_dict, f, indent=4)
    print('Saved dynamic collapsed features dict to :\n%s' %
          (args.dynamic_collapsed_features_data_dict))