def get_all_features_data(labs_df, labs_data_dict, vitals_df, vitals_data_dict,
                          demographics_df, demographics_data_dict,
                          medications_df, medications_data_dict,
                          include_medications=True):
    '''Merge the labs, vitals, demographics (and optionally medications) features
    into a single table, and return it along with the merged data dict.'''
    time_col = parse_time_col(vitals_data_dict)
    id_cols = parse_id_cols(vitals_data_dict)

    # merge the labs, vitals and medications
    if include_medications:
        highfreq_df = pd.merge(
            pd.merge(vitals_df, labs_df, on=id_cols + [time_col], how='outer'),
            medications_df, on=id_cols + [time_col], how='outer')

        # forward fill medications because the patient is / is not on medication
        # at new time points created by the outer join
        medication_features = parse_feature_cols(medications_data_dict)
        highfreq_df[id_cols + medication_features] = highfreq_df[
            id_cols + medication_features].groupby(id_cols).apply(
                lambda x: x.fillna(method='pad')).copy()
        highfreq_df[id_cols + medication_features] = highfreq_df[
            id_cols + medication_features].fillna(0)

        highfreq_data_dict = merge_data_dicts(
            [labs_data_dict, vitals_data_dict, medications_data_dict])
    else:
        highfreq_df = pd.merge(vitals_df, labs_df, on=id_cols + [time_col], how='outer')
        highfreq_data_dict = merge_data_dicts([labs_data_dict, vitals_data_dict])

    highfreq_data_dict['fields'] = highfreq_data_dict['schema']['fields']
    cols_to_keep = parse_id_cols(highfreq_data_dict) + [
        parse_time_col(highfreq_data_dict)] + parse_feature_cols(highfreq_data_dict)
    highfreq_df = highfreq_df[cols_to_keep].copy()

    # merge the high-frequency features with the static features
    features_df = pd.merge(highfreq_df, demographics_df, on=id_cols, how='inner')
    features_data_dict = merge_data_dicts(
        [highfreq_data_dict, demographics_data_dict])
    features_data_dict['fields'] = features_data_dict['schema']['fields']

    return features_df, features_data_dict

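# Minimal sketch (not part of the pipeline) of why the medication forward fill above
# is needed: an outer join on time creates rows where the medication columns are NaN,
# and padding within each stay recovers the "still on / still off medication" status.
# Column names here are hypothetical.
#
#   import pandas as pd
#   vitals = pd.DataFrame({'stay_id': [1, 1, 1], 'hours': [0, 2, 4], 'heart_rate': [80, 90, 85]})
#   meds = pd.DataFrame({'stay_id': [1, 1], 'hours': [0, 3], 'on_vasopressor': [0, 1]})
#   merged = pd.merge(vitals, meds, on=['stay_id', 'hours'], how='outer').sort_values(['stay_id', 'hours'])
#   # rows at hours 2 and 4 have NaN for on_vasopressor after the outer join
#   merged['on_vasopressor'] = merged.groupby('stay_id')['on_vasopressor'].ffill().fillna(0)
#   print(merged)
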
def update_data_dict_mews(args):
    data_dict = args.data_dict
    id_cols = parse_id_cols(args.data_dict)
    feature_cols = parse_feature_cols(args.data_dict)

    new_fields = []
    for name in id_cols:
        for col in data_dict['fields']:
            if col['name'] == name:
                new_fields.append(col)
    new_fields.append({
        'name': 'mews_score',
        'role': 'feature',
        'type': 'numeric',
        'description': 'Modified Early Warning Score',
        'units': 'NONE',
        'constraints': {'required': 'FALSE', 'minimum': '0', 'maximum': 'INF'}})

    new_data_dict = copy.deepcopy(data_dict)
    if 'schema' in new_data_dict:
        new_data_dict['schema']['fields'] = new_fields
        del new_data_dict['fields']
    else:
        new_data_dict['fields'] = new_fields
    return new_data_dict

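# Illustrative shape of a data dictionary as consumed above and by the parse_* helpers.
# The authoritative schema is whatever the project's Spec_*.json files contain; the
# field names and the 'id'/'time' role strings below are my own assumptions.
example_data_dict = {
    'schema': {
        'fields': [
            {'name': 'subject_id', 'role': 'id', 'type': 'numeric'},
            {'name': 'hours_since_admission', 'role': 'time', 'type': 'numeric'},
            {'name': 'heart_rate', 'role': 'feature', 'type': 'numeric',
             'units': 'beats/min',
             'constraints': {'required': 'FALSE', 'minimum': '0', 'maximum': 'INF'}},
        ]
    }
}
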
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch RNN with variable-length numeric sequences wrapper')
    parser.add_argument('--outcome_col_name', type=str, required=True)
    parser.add_argument('--train_csv_files', type=str, required=True)
    parser.add_argument('--test_csv_files', type=str, required=True)
    parser.add_argument('--data_dict_files', type=str, required=True)
    parser.add_argument('--batch_size', type=int, default=1024,
                        help='Number of sequences per minibatch')
    parser.add_argument('--epochs', type=int, default=50, help='Number of epochs')
    parser.add_argument('--hidden_units', type=int, default=32, help='Number of hidden units')
    parser.add_argument('--hidden_layers', type=int, default=1, help='Number of hidden layers')
    parser.add_argument('--lr', type=float, default=0.0005, help='Learning rate for the optimizer')
    parser.add_argument('--dropout', type=float, default=0, help='Dropout probability')
    parser.add_argument('--weight_decay', type=float, default=0.0001, help='Weight decay for the optimizer')
    parser.add_argument('--seed', type=int, default=1111, help='Random seed')
    parser.add_argument('--validation_size', type=float, default=0.15, help='Validation split size')
    parser.add_argument('--is_data_simulated', type=bool, default=False,
                        help='Boolean to check whether data is simulated or from MIMIC')
    parser.add_argument('--simulated_data_dir', type=str, default='simulated_data/2-state/',
                        help='Dir in which simulated data is saved. Must be provided if is_data_simulated = True')
    parser.add_argument('--output_dir', type=str, default=None,
                        help='Directory where the trained model and loss curves over epochs are saved')
    parser.add_argument('--output_filename_prefix', type=str, default=None,
                        help='Prefix for the training history JSONs and trained classifier')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    device = 'cpu'

    x_train_csv_filename, y_train_csv_filename = args.train_csv_files.split(',')
    x_test_csv_filename, y_test_csv_filename = args.test_csv_files.split(',')
    x_dict, y_dict = args.data_dict_files.split(',')
    x_data_dict = load_data_dict_json(x_dict)

    # get the id and feature columns
    id_cols = parse_id_cols(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)

    # extract data
    train_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_train_csv_filename,
        y_csv_path=y_train_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_sequence')

    test_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_test_csv_filename,
        y_csv_path=y_test_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_sequence')

    X_train, y_train = train_vitals.get_batch_data(batch_id=0)
    X_test, y_test = test_vitals.get_batch_data(batch_id=0)
    _, T, F = X_train.shape

    print('number of time points : %s\nnumber of features : %s\n' % (T, F))

    # set class weights as 1/(number of samples in class) for each class to handle class imbalance
    class_weights = torch.tensor(
        [1 / (y_train == 0).sum(), 1 / (y_train == 1).sum()]).double()

    # scale features
    # X_train = standard_scaler_3d(X_train)
    # X_test = standard_scaler_3d(X_test)

    # callback to compute gradient norm
    compute_grad_norm = ComputeGradientNorm(norm_type=2)

    # LSTM
    if args.output_filename_prefix is None:
        output_filename_prefix = (
            'hiddens=%s-layers=%s-lr=%s-dropout=%s-weight_decay=%s' % (
                args.hidden_units, args.hidden_layers, args.lr,
                args.dropout, args.weight_decay))
    else:
        output_filename_prefix = args.output_filename_prefix

    print('RNN parameters : ' + output_filename_prefix)

    rnn = RNNBinaryClassifier(
        max_epochs=50,
        batch_size=args.batch_size,
        device=device,
        lr=args.lr,
        callbacks=[
            EpochScoring('roc_auc', lower_is_better=False, on_train=True,
                         name='aucroc_score_train'),
            EpochScoring('roc_auc', lower_is_better=False, on_train=False,
                         name='aucroc_score_valid'),
            EarlyStopping(monitor='aucroc_score_valid', patience=20, threshold=0.002,
                          threshold_mode='rel', lower_is_better=False),
            LRScheduler(policy=ReduceLROnPlateau, mode='max',
                        monitor='aucroc_score_valid', patience=10),
            compute_grad_norm,
            GradientNormClipping(gradient_clip_value=0.3, gradient_clip_norm_type=2),
            Checkpoint(monitor='aucroc_score_valid',
                       f_history=os.path.join(args.output_dir, output_filename_prefix + '.json')),
            TrainEndCheckpoint(dirname=args.output_dir, fn_prefix=output_filename_prefix),
        ],
        criterion=torch.nn.CrossEntropyLoss,
        criterion__weight=class_weights,
        train_split=skorch.dataset.CVSplit(args.validation_size),
        module__rnn_type='LSTM',
        module__n_layers=args.hidden_layers,
        module__n_hiddens=args.hidden_units,
        module__n_inputs=X_train.shape[-1],
        module__dropout_proba=args.dropout,
        optimizer=torch.optim.Adam,
        optimizer__weight_decay=args.weight_decay)

    clf = rnn.fit(X_train, y_train)

    y_pred_proba = clf.predict_proba(X_train)
    y_pred_proba_neg, y_pred_proba_pos = zip(*y_pred_proba)
    auroc_train_final = roc_auc_score(y_train, y_pred_proba_pos)
    print('AUROC with LSTM (Train) : %.2f' % auroc_train_final)

    y_pred_proba = clf.predict_proba(X_test)
    y_pred_proba_neg, y_pred_proba_pos = zip(*y_pred_proba)
    auroc_test_final = roc_auc_score(y_test, y_pred_proba_pos)
    print('AUROC with LSTM (Test) : %.2f' % auroc_test_final)

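# Standalone sketch (my own toy example, not part of the script) of the inverse-frequency
# class-weight rule used above: each class is weighted by 1 / (its sample count), so the
# rarer class contributes more to the loss.
#
#   import torch
#   y = torch.tensor([0, 0, 0, 0, 1])            # toy, heavily imbalanced labels
#   counts = torch.bincount(y, minlength=2)      # tensor([4, 1])
#   class_weights = 1.0 / counts.double()        # tensor([0.2500, 1.0000])
#   print(class_weights)
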
    parser.add_argument('--merge_x_y', default=True,
                        type=lambda x: (str(x).lower() == 'true'), required=False)
    args = parser.parse_args()

    # read the data dictionaries
    print('Reading train-test data...')

    # read the data dict JSONs and parse the feature and outcome columns
    x_data_dict_file, y_data_dict_file = args.data_dict_files.split(',')
    x_data_dict = load_data_dict_json(x_data_dict_file)
    y_data_dict = load_data_dict_json(y_data_dict_file)

    feature_cols = parse_feature_cols(x_data_dict)
    key_cols = parse_id_cols(x_data_dict)

    df_by_split = dict()
    for split_name, csv_files in [('train', args.train_csv_files.split(',')),
                                  ('test', args.test_csv_files.split(','))]:
        cur_df = None
        for csv_file in csv_files:
            # TODO use json data dict to load specific columns as desired types
            more_df = pd.read_csv(csv_file)
            if cur_df is None:
                cur_df = more_df
            else:
                if args.merge_x_y:
                    cur_df = cur_df.merge(more_df, on=key_cols)
    else:
        print('Merging collapsed vitals, collapsed labs, demographics and outcomes data dicts '
              'into a single features data dict and a single outcomes data dict...')
        features_dict_merged = (
            collapsed_labs_data_dict['schema']['fields']
            + collapsed_vitals_data_dict['schema']['fields']
            + demographics_data_dict['schema']['fields'])
        for feat_dict in features_dict_merged:
            if feat_dict['name'] not in feat_names:
                features_data_dict['schema']['fields'].append(feat_dict)
                feat_names.append(feat_dict['name'])

    # convert the features to numpy float32 to avoid memory issues
    feature_cols = parse_feature_cols(features_data_dict['schema'])
    feature_type_dict = dict.fromkeys(feature_cols)
    for k in feature_type_dict.keys():
        feature_type_dict[k] = np.float32
    features_df_all_slices = features_df_all_slices.astype(feature_type_dict)

    # save to disk
    features_csv = os.path.join(args.output_dir, 'features.csv.gz')
    outcomes_csv = os.path.join(args.output_dir, 'outcomes.csv.gz')
    mews_csv = os.path.join(args.output_dir, 'mews.csv.gz')
    features_json = os.path.join(args.output_dir, 'Spec_features.json')
    outcomes_json = os.path.join(args.output_dir, 'Spec_outcomes.json')
    mews_json = os.path.join(args.output_dir, 'Spec_mews.json')

    print('saving features and outcomes to :\n%s\n%s\n%s' % (features_csv, outcomes_csv, mews_csv))
        type=str, default=2,
        help='''Slice of data to be extracted. If tslice is provided with a % sign
        (e.g. 20%), then the script extracts the first tslice% of data from the stay.
        If tslice is an int (e.g. 5), then the script extracts the first tslice hours
        of data. If tslice is negative (e.g. -5), then the script extracts the data
        until tslice hours before deterioration/discharge.''')
    parser.add_argument('--output_dir', type=str)
    args = parser.parse_args()

    labs_df, labs_data_dict, vitals_df, vitals_data_dict, \
        demographics_df, demographics_data_dict, outcomes_df, outcomes_data_dict = \
        get_preprocessed_data(args.preproc_data_dir)

    id_cols = parse_id_cols(vitals_data_dict)
    labs_feature_cols = parse_feature_cols(labs_data_dict)
    vitals_feature_cols = parse_feature_cols(vitals_data_dict)

    # get lengths of stay for each admission
    vitals_df_with_stay_lengths = pd.merge(
        vitals_df, outcomes_df[id_cols + ['stay_length']], on=id_cols, how='inner')
    labs_df_with_stay_lengths = pd.merge(
        labs_df, outcomes_df[id_cols + ['stay_length']], on=id_cols, how='inner')
    # demographics_df_with_stay_lengths = pd.merge(
    #     demographics_df, outcomes_df[id_cols + ['stay_length']], on=id_cols, how='inner')
def main():
    parser = argparse.ArgumentParser(
        description='Keras CNN with variable-length numeric sequences wrapper')
    parser.add_argument('--outcome_col_name', type=str, required=True)
    parser.add_argument('--train_csv_files', type=str, required=True)
    parser.add_argument('--test_csv_files', type=str, required=True)
    parser.add_argument('--data_dict_files', type=str, required=True)
    parser.add_argument('--batch_size', type=int, default=1024,
                        help='Number of sequences per minibatch')
    parser.add_argument('--epochs', type=int, default=50, help='Number of epochs')
    parser.add_argument('--n_filters', type=int, default=32, help='Number of filters')
    parser.add_argument('--kernel_size', type=int, default=1, help='Size of each kernel')
    parser.add_argument('--n_conv_layers', type=int, default=1, help='Number of convolutional layers')
    parser.add_argument('--stride', type=int, default=1, help='Stride')
    parser.add_argument('--pool_size', type=int, default=4, help='Max pool size')
    parser.add_argument('--dense_units', type=int, default=128,
                        help='Number of units in the fully connected layer')
    parser.add_argument('--lr', type=float, default=0.0005, help='Learning rate for the optimizer')
    parser.add_argument('--dropout', type=float, default=0, help='Dropout probability')
    parser.add_argument('--weight_decay', type=float, default=0.0001, help='Weight decay for the optimizer')
    parser.add_argument('--seed', type=int, default=1111, help='Random seed')
    parser.add_argument('--validation_size', type=float, default=0.15, help='Validation split size')
    parser.add_argument('--output_dir', type=str, default=None,
                        help='Directory where the trained model and loss curves over epochs are saved')
    parser.add_argument('--output_filename_prefix', type=str, default=None,
                        help='Prefix for the training history files and trained classifier')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    device = 'cpu'

    x_train_csv_filename, y_train_csv_filename = args.train_csv_files.split(',')
    x_test_csv_filename, y_test_csv_filename = args.test_csv_files.split(',')
    x_dict, y_dict = args.data_dict_files.split(',')
    x_data_dict = load_data_dict_json(x_dict)

    # get the id and feature columns
    id_cols = parse_id_cols(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)

    # extract data
    train_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_train_csv_filename,
        y_csv_path=y_train_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_sequence')

    test_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_test_csv_filename,
        y_csv_path=y_test_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_sequence')

    X_train, y_train = train_vitals.get_batch_data(batch_id=0)
    X_test, y_test = test_vitals.get_batch_data(batch_id=0)
    N, T, F = X_train.shape

    # add class weights
    class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
    # class_weights = dict(zip(range(len(class_weights)), class_weights))

    # convert y_train to categorical
    y_train = keras.utils.to_categorical(y_train)
    y_test = keras.utils.to_categorical(y_test)

    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=args.validation_size, random_state=213)

    print('number of time points : %s\nnumber of features : %s\n' % (T, F))

    set_random_seed(args.seed)
    model = keras.Sequential()
    for i in range(args.n_conv_layers):
        model.add(keras.layers.Conv1D(filters=args.n_filters,
                                      kernel_size=args.kernel_size,
                                      activation='relu',
                                      strides=args.stride))
    model.add(keras.layers.Dropout(args.dropout))
    model.add(keras.layers.MaxPooling1D(pool_size=args.pool_size))
    model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(args.dense_units, activation='relu'))
    model.add(keras.layers.Dense(2, activation='softmax'))

    # set optimizer
    opt = keras.optimizers.Adam(learning_rate=args.lr)
    model.compile(loss='categorical_crossentropy', optimizer=opt,
                  metrics=['accuracy', keras.metrics.AUC()])

    # set early stopping
    early_stopping = EarlyStopping(monitor='val_auc', patience=20, mode='max', verbose=1)

    model.fit(X_train, y_train, epochs=100, validation_data=(X_val, y_val),
              callbacks=[early_stopping], class_weight=class_weights,
              batch_size=args.batch_size)

    y_score_val = model.predict_proba(X_val)
    val_auc = roc_auc_score(y_val, y_score_val)
    print('AUC on val set : %.4f' % val_auc)

    y_score_test = model.predict_proba(X_test)
    test_auc = roc_auc_score(y_test, y_score_test)
    print('AUC on test set : %.4f' % test_auc)

    # save the model history
    training_hist_df = pd.DataFrame(model.history.history)
    training_hist_df.loc[:, 'test_auc'] = test_auc
    training_hist_csv = os.path.join(args.output_dir, args.output_filename_prefix + '.csv')
    training_hist_df.to_csv(training_hist_csv, index=False)

    # save the model
    model_file = os.path.join(args.output_dir, args.output_filename_prefix + '.model')
    model.save(model_file)

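# Rough sanity check (my own sketch, not part of the training script): with Keras'
# default padding='valid', Conv1D and MaxPooling1D shrink the time axis, so it is
# worth confirming the chosen kernel_size / stride / pool_size leave at least one step.
#
#   def conv1d_valid_out_len(T, kernel_size, stride):
#       return (T - kernel_size) // stride + 1
#
#   def maxpool1d_out_len(T, pool_size):
#       return (T - pool_size) // pool_size + 1
#
#   T = 48                                                  # e.g. 48 hourly time points
#   L = conv1d_valid_out_len(T, kernel_size=1, stride=1)    # 48
#   L = maxpool1d_out_len(L, pool_size=4)                   # 12
#   print(L)
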
    args = parser.parse_args()

    # get the train/test features
    x_train_csv = os.path.join(args.train_test_split_dir, 'x_train.csv.gz')
    x_valid_csv = os.path.join(args.train_test_split_dir, 'x_valid.csv.gz')
    x_test_csv = os.path.join(args.train_test_split_dir, 'x_test.csv.gz')
    x_dict_json = os.path.join(args.train_test_split_dir, 'x_dict.json')

    # impute values by carry forward and then pop mean on train and test sets separately
    x_data_dict = load_data_dict_json(x_dict_json)
    x_train_df = pd.read_csv(x_train_csv)
    x_valid_df = pd.read_csv(x_valid_csv)
    x_test_df = pd.read_csv(x_test_csv)

    id_cols = parse_id_cols(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)
    time_col = parse_time_col(x_data_dict)

    # add mask features
    non_medication_feature_cols = [
        feature_col for feature_col in feature_cols if 'medication' not in feature_col]
    medication_feature_cols = [
        feature_col for feature_col in feature_cols if 'medication' in feature_col]

    print('Adding missing values mask as features...')
    for feature_col in non_medication_feature_cols:
        x_train_df.loc[:, 'mask_' +
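# The truncated loop above appears to add a binary missingness indicator per non-medication
# column before imputation. A minimal sketch of that pattern on a toy frame; the column
# name and the exact encoding (1.0 = observed) are my own assumptions, not the project's.
#
#   import numpy as np
#   import pandas as pd
#   toy_df = pd.DataFrame({'heart_rate': [80.0, np.nan, 85.0]})
#   toy_df['mask_heart_rate'] = toy_df['heart_rate'].notna().astype(np.float32)
#   print(toy_df)
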
def get_all_features_data(labs_df, labs_data_dict, vitals_df, vitals_data_dict,
                          demographics_df, demographics_data_dict):
    '''Returns the merged labs, vitals and demographics features into a single table and the data dict'''
    time_col = parse_time_col(vitals_data_dict)
    id_cols = parse_id_cols(vitals_data_dict)

    # merge the labs and vitals
    highfreq_df = pd.merge(vitals_df, labs_df, on=id_cols + [time_col], how='outer')
    highfreq_data_dict = merge_data_dicts([labs_data_dict, vitals_data_dict])
    highfreq_data_dict['fields'] = highfreq_data_dict['schema']['fields']

    cols_to_keep = parse_id_cols(highfreq_data_dict) + [
        parse_time_col(highfreq_data_dict)] + parse_feature_cols(highfreq_data_dict)
    highfreq_df = highfreq_df[cols_to_keep].copy()

    # merge the high-frequency features with the static features
    features_df = pd.merge(highfreq_df, demographics_df, on=id_cols, how='inner')
    features_data_dict = merge_data_dicts([highfreq_data_dict, demographics_data_dict])
    features_data_dict['fields'] = features_data_dict['schema']['fields']

    return features_df, features_data_dict

def main():
    parser = argparse.ArgumentParser(
        description='PyTorch RNN with variable-length numeric sequences wrapper')
    parser.add_argument('--outcome_col_name', type=str, required=True)
    parser.add_argument('--train_csv_files', type=str, required=True)
    parser.add_argument('--valid_csv_files', type=str, required=True)
    parser.add_argument('--test_csv_files', type=str, required=True)
    parser.add_argument('--data_dict_files', type=str, required=True)
    parser.add_argument('--batch_size', type=int, default=1024,
                        help='Number of sequences per minibatch')
    parser.add_argument('--epochs', type=int, default=50, help='Number of epochs')
    parser.add_argument('--hidden_units', type=int, default=32, help='Number of hidden units')
    parser.add_argument('--hidden_layers', type=int, default=1, help='Number of hidden layers')
    parser.add_argument('--lr', type=float, default=0.0005, help='Learning rate for the optimizer')
    parser.add_argument('--dropout', type=float, default=0, help='Dropout probability')
    parser.add_argument('--weight_decay', type=float, default=0.0001, help='Weight decay for the optimizer')
    parser.add_argument('--seed', type=int, default=1111, help='Random seed')
    parser.add_argument('--validation_size', type=float, default=0.15, help='Validation split size')
    parser.add_argument('--is_data_simulated', type=bool, default=False,
                        help='Boolean to check whether data is simulated or from MIMIC')
    parser.add_argument('--output_dir', type=str, default=None,
                        help='Directory where the trained model and loss curves over epochs are saved')
    parser.add_argument('--output_filename_prefix', type=str, default=None,
                        help='Prefix for the training history JSONs and trained classifier')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    device = 'cpu'

    x_train_csv_filename, y_train_csv_filename = args.train_csv_files.split(',')
    x_valid_csv_filename, y_valid_csv_filename = args.valid_csv_files.split(',')
    x_test_csv_filename, y_test_csv_filename = args.test_csv_files.split(',')
    x_dict, y_dict = args.data_dict_files.split(',')
    x_data_dict = load_data_dict_json(x_dict)

    # get the id and feature columns
    id_cols = parse_id_cols(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)

    # extract data
    train_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_train_csv_filename,
        y_csv_path=y_train_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_tstep')

    valid_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_valid_csv_filename,
        y_csv_path=y_valid_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_tstep')

    test_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_test_csv_filename,
        y_csv_path=y_test_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_tstep')

    X_train, y_train = train_vitals.get_batch_data(batch_id=0)
    X_valid, y_valid = valid_vitals.get_batch_data(batch_id=0)
    X_test, y_test = test_vitals.get_batch_data(batch_id=0)
    N, T, F = X_train.shape

    # optional min-max scaling with train-set statistics
    # X_train = (X_train - np.min(X_train)) / (np.max(X_train) - np.min(X_train))
    # X_valid = (X_valid - np.min(X_train)) / (np.max(X_train) - np.min(X_train))
    # X_test = (X_test - np.min(X_train)) / (np.max(X_train) - np.min(X_train))

    valid_ds = Dataset(X_valid, y_valid)

    print('number of time points : %s\nnumber of features : %s\n' % (T, F))

    # set class weights as 1/(number of samples in class) for each class to handle class imbalance
    class_weights = torch.tensor(
        [1 / (y_train == 0).sum(), 1 / (y_train == 1).sum()]).float()

    print('Number of training sequences : %s' % N)
    print('Number of test sequences : %s' % X_test.shape[0])
    print('Ratio positive in train : %.2f' % ((y_train == 1).sum() / len(y_train)))
    print('Ratio positive in test : %.2f' % ((y_test == 1).sum() / len(y_test)))

    # callback to compute gradient norm
    compute_grad_norm = ComputeGradientNorm(norm_type=2)

    # GRU trained with per-time-step labels
    if args.output_filename_prefix is None:
        output_filename_prefix = (
            'hiddens=%s-layers=%s-lr=%s-dropout=%s-weight_decay=%s' % (
                args.hidden_units, args.hidden_layers, args.lr,
                args.dropout, args.weight_decay))
    else:
        output_filename_prefix = args.output_filename_prefix

    print('RNN parameters : ' + output_filename_prefix)

    loss_early_stopping_cp = EarlyStopping(monitor='valid_loss', patience=15,
                                           threshold=0.002, threshold_mode='rel',
                                           lower_is_better=True)

    rnn = RNNPerTStepBinaryClassifier(
        max_epochs=250,
        batch_size=args.batch_size,
        device=device,
        lr=args.lr,
        callbacks=[
            EpochScoring(calc_auprc, lower_is_better=False, on_train=True, name='auprc_train'),
            EpochScoring(calc_auprc, lower_is_better=False, on_train=False, name='auprc_valid'),
            EpochScoring(calc_auroc, lower_is_better=False, on_train=True, name='auroc_train'),
            EpochScoring(calc_auroc, lower_is_better=False, on_train=False, name='auroc_valid'),
            # EpochScoring(calc_precision, lower_is_better=False, on_train=True, name='precision_train'),
            # EpochScoring(calc_precision, lower_is_better=False, on_train=False, name='precision_valid'),
            # EpochScoring(calc_recall, lower_is_better=False, on_train=True, name='recall_train'),
            # EpochScoring(calc_recall, lower_is_better=False, on_train=False, name='recall_valid'),
            # EarlyStopping(monitor='auprc_valid', patience=5, threshold=0.002,
            #               threshold_mode='rel', lower_is_better=False),
            # LRScheduler(policy=ReduceLROnPlateau, mode='max', monitor='aucroc_score_valid', patience=10),
            # compute_grad_norm,
            # GradientNormClipping(gradient_clip_value=0.5, gradient_clip_norm_type=2),
            loss_early_stopping_cp,
            Checkpoint(monitor='auprc_valid',
                       f_history=os.path.join(args.output_dir, output_filename_prefix + '.json')),
            TrainEndCheckpoint(dirname=args.output_dir, fn_prefix=output_filename_prefix),
        ],
        # criterion=torch.nn.CrossEntropyLoss,
        # criterion__weight=class_weights,
        train_split=predefined_split(valid_ds),
        module__rnn_type='GRU',
        module__n_layers=args.hidden_layers,
        module__n_hiddens=args.hidden_units,
        module__n_inputs=X_train.shape[-1],
        module__dropout_proba=args.dropout,
        optimizer=torch.optim.Adam,
        optimizer__weight_decay=args.weight_decay)

    clf = rnn.fit(X_train, y_train)

    # choose the decision threshold that maximizes recall at a fixed precision
    fixed_precision = 0.1

    # get predicted probabilities for y=1 on the validation set,
    # keeping only time steps that are not entirely NaN (padding)
    keep_inds_va = torch.logical_not(
        torch.all(torch.isnan(torch.FloatTensor(X_valid)), dim=-1))
    y_va_pred_proba = clf.predict_proba(X_valid)[keep_inds_va][:, 1].detach().numpy()

    unique_probas = np.unique(y_va_pred_proba)
    thr_grid_G = np.linspace(np.percentile(unique_probas, 1), max(unique_probas), 100)

    precision_scores_G, recall_scores_G = [
        np.zeros(thr_grid_G.size), np.zeros(thr_grid_G.size)]
    for gg, thr in enumerate(thr_grid_G):
        curr_thr_y_preds = clf.predict_proba(
            torch.FloatTensor(X_valid))[keep_inds_va][:, 1] >= thr_grid_G[gg]
        precision_scores_G[gg] = precision_score(y_valid[keep_inds_va], curr_thr_y_preds)
        recall_scores_G[gg] = recall_score(y_valid[keep_inds_va], curr_thr_y_preds)

    keep_inds = precision_scores_G >= fixed_precision
    if keep_inds.sum() > 0:
        print('Choosing threshold with precision >= %.3f' % fixed_precision)
    else:
        fixed_precision_old = fixed_precision
        fixed_precision = np.percentile(precision_scores_G, 99)
        keep_inds = precision_scores_G >= fixed_precision
        print('Could not find threshold with precision >= %.3f\n'
              'Choosing threshold to maximize recall at precision %.3f' % (
                  fixed_precision_old, fixed_precision))

    thr_grid_G = thr_grid_G[keep_inds]
    precision_scores_G = precision_scores_G[keep_inds]
    recall_scores_G = recall_scores_G[keep_inds]

    thr_perf_df = pd.DataFrame(
        np.vstack([thr_grid_G[np.newaxis, :],
                   precision_scores_G[np.newaxis, :],
                   recall_scores_G[np.newaxis, :]]).T,
        columns=['thr', 'precision_score', 'recall_score'])
    print(thr_perf_df)

    best_ind = np.argmax(recall_scores_G)
    best_thr = thr_grid_G[best_ind]
    print('chosen threshold : %.3f' % best_thr)

    # evaluate on the train, valid and test splits with the chosen threshold
    splits = ['train', 'valid', 'test']
    auroc_per_split, auprc_per_split, precisions_per_split, recalls_per_split = [
        np.zeros(len(splits)), np.zeros(len(splits)),
        np.zeros(len(splits)), np.zeros(len(splits))]

    for ii, (X, y) in enumerate([(X_train, y_train), (X_valid, y_valid), (X_test, y_test)]):
        keep_inds = torch.logical_not(
            torch.all(torch.isnan(torch.FloatTensor(X)), dim=-1))
        y_pred_proba_pos = clf.predict_proba(X)[keep_inds][:, 1].detach().numpy()
        auroc_per_split[ii] = roc_auc_score(y[keep_inds], y_pred_proba_pos)
        auprc_per_split[ii] = average_precision_score(y[keep_inds], y_pred_proba_pos)
        y_pred = y_pred_proba_pos >= best_thr
        precisions_per_split[ii] = precision_score(y[keep_inds], y_pred)
        recalls_per_split[ii] = recall_score(y[keep_inds], y_pred)

    auroc_train, auroc_valid, auroc_test = auroc_per_split
    auprc_train, auprc_valid, auprc_test = auprc_per_split
    precision_train, precision_valid, precision_test = precisions_per_split
    recall_train, recall_valid, recall_test = recalls_per_split

    # save performance
    perf_dict = {
        'auroc_train': auroc_train, 'auroc_valid': auroc_valid, 'auroc_test': auroc_test,
        'auprc_train': auprc_train, 'auprc_valid': auprc_valid, 'auprc_test': auprc_test,
        'precision_train': precision_train, 'precision_valid': precision_valid,
        'precision_test': precision_test,
        'recall_train': recall_train, 'recall_valid': recall_valid, 'recall_test': recall_test,
        'threshold': best_thr}
    perf_df = pd.DataFrame([perf_dict])
    perf_csv = os.path.join(args.output_dir, output_filename_prefix + '.csv')
    print('Final performance on train, valid and test :\n')
    print(perf_df)
    print('Final performance saved to %s' % perf_csv)
    perf_df.to_csv(perf_csv, index=False)

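# Standalone illustration (my own toy arrays, not from the repo) of the threshold-selection
# rule above: among thresholds whose validation precision clears a floor, pick the one with
# the highest recall.
#
#   import numpy as np
#   from sklearn.metrics import precision_score, recall_score
#   y_true = np.array([0, 0, 0, 1, 1, 0, 1, 0])
#   proba = np.array([0.1, 0.2, 0.35, 0.4, 0.6, 0.55, 0.8, 0.3])
#   thr_grid = np.linspace(proba.min(), proba.max(), 20)
#   prec = np.array([precision_score(y_true, proba >= t, zero_division=0) for t in thr_grid])
#   rec = np.array([recall_score(y_true, proba >= t) for t in thr_grid])
#   keep = prec >= 0.5                      # precision floor (0.1 in the script above)
#   best_thr = thr_grid[keep][np.argmax(rec[keep])]
#   print('chosen threshold : %.3f' % best_thr)
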
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--preproc_data_dir',
                        help='directory where the labs, vitals, demographics and outcomes are stored')
    parser.add_argument('--tslice', type=str, default=2,
                        help='''Slice of data to be extracted. If tslice is provided with a % sign
                        (e.g. 20%), then the script extracts the first tslice% of data from the stay.
                        If tslice is an int (e.g. 5), then the script extracts the first tslice hours
                        of data. If tslice is negative (e.g. -5), then the script extracts the data
                        until tslice hours before deterioration/discharge.''')
    parser.add_argument('--output_dir', type=str)
    args = parser.parse_args()

    labs_df, labs_data_dict, vitals_df, vitals_data_dict, \
        demographics_df, demographics_data_dict, medications_df, medications_data_dict, \
        outcomes_df, outcomes_data_dict = get_preprocessed_data(args.preproc_data_dir)

    id_cols = parse_id_cols(vitals_data_dict)
    labs_feature_cols = parse_feature_cols(labs_data_dict)
    vitals_feature_cols = parse_feature_cols(vitals_data_dict)
    medications_feature_cols = parse_feature_cols(medications_data_dict)

    # get lengths of stay for each admission
    vitals_df_with_stay_lengths = pd.merge(
        vitals_df, outcomes_df[id_cols + ['stay_length']], on=id_cols, how='inner')
    labs_df_with_stay_lengths = pd.merge(
        labs_df, outcomes_df[id_cols + ['stay_length']], on=id_cols, how='inner')
    medications_df_with_stay_lengths = pd.merge(
        medications_df, outcomes_df[id_cols + ['stay_length']], on=id_cols, how='inner')
    # demographics_df_with_stay_lengths = pd.merge(
    #     demographics_df, outcomes_df[id_cols + ['stay_length']], on=id_cols, how='inner')

    # find stays that satisfy the minimum stay length
    censor_start = 504
    tstops_df = outcomes_df[id_cols].copy()
    if '%' in args.tslice:
        min_stay_length = 0
        print('Including EHR measured in the first %s percent of patient stays having '
              'at least %s hours of data' % (args.tslice, min_stay_length))
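# My own sketch (not from the repo) of the three --tslice interpretations described in
# the help text above, applied to a single stay of known length; the helper name and
# return convention are illustrative only.
#
#   def tslice_to_hours(tslice, stay_length_hours):
#       """Return (start_hour, stop_hour) of the slice to keep for one stay."""
#       if '%' in str(tslice):                      # e.g. '20%': first 20% of the stay
#           frac = float(str(tslice).rstrip('%')) / 100.0
#           return 0.0, frac * stay_length_hours
#       elif float(tslice) < 0:                     # e.g. -5: everything until 5h before the end
#           return 0.0, stay_length_hours + float(tslice)
#       else:                                       # e.g. 5: first 5 hours
#           return 0.0, float(tslice)
#
#   print(tslice_to_hours('20%', 50))   # (0.0, 10.0)
#   print(tslice_to_hours(-5, 50))      # (0.0, 45.0)
#   print(tslice_to_hours(5, 50))       # (0.0, 5.0)
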
def featurize_stack_of_many_time_series(
        ts_df=None,
        ts_data_dict=None,
        outcomes_df=None,
        outcomes_data_dict=None,
        summary_ops=['mean', 'min', 'max'],
        percentile_slices_to_featurize=[(0, 100)],
        outcome_col='clinical_deterioration_outcome',
        outcome_seq_duration_col='stay_length',
        start_time_of_each_sequence=-24.0,
        max_time_of_each_sequence=504,
        start_time_of_endpoints=0.0,
        time_between_endpoints=12,
        prediction_horizon=24,
        verbose=True,
        ):
    '''Featurize many patient-stay slices and extract the outcome for each slice.

    Args
    ----
    ts_df : pandas DataFrame
        Each row provides all measurements at one time of a single patient-stay.
        Must contain one column already converted to numerical time.
    ts_data_dict : dict
        Provides specification for every column of ts_df.
    outcomes_df : pandas DataFrame
        Each row provides the outcome of a single patient-stay.
    outcomes_data_dict : dict
        Provides specification for each column of outcomes_df.
    summary_ops : list of strings
        Identifies the summary functions we wish to apply to each variable's time series.
    percentile_slices_to_featurize : list of tuples
        Indicates the percentile range of all subwindows we will featurize.
        Example: [(0, 100), (0, 50)]

    Returns
    -------
    all_feat_df : DataFrame
        One row per featurized window of any patient-stay slice.
        Key columns: ids + ['start', 'stop']
        Value columns: one per extracted feature.
    all_outcomes_df : DataFrame
        One row per featurized window of any patient-stay slice.
        Key columns: ids + ['start', 'stop']
        Value columns: just one, the outcome column.

    Examples
    --------
    >>> args = make_fake_input_data(n_seqs=25, n_features=10, max_duration=50.0)
    >>> feat_df, outcome_df = featurize_stack_of_many_time_series(*args,
    ...     summary_ops=['mean', 'slope'],
    ...     start_time_of_each_sequence=0,
    ...     start_time_of_endpoints=0.0,
    ...     time_between_endpoints=12.0,
    ...     verbose=False,
    ...     )
    >>> feat_df.shape
    (95, 24)
    '''
    # Parse desired slices to featurize at each window
    # This allows command-line specification of ranges as a string
    if isinstance(percentile_slices_to_featurize, str):
        percentile_slices_to_featurize = ast.literal_eval(percentile_slices_to_featurize)
    if isinstance(summary_ops, str):
        summary_ops = summary_ops.split(' ')

    # Parse provided data dictionary
    # Recover specific columns for each of the different roles
    id_cols = parse_id_cols(ts_data_dict)
    id_cols = remove_col_names_from_list_if_not_in_df(id_cols, ts_df)
    feature_cols = parse_feature_cols(ts_data_dict)
    feature_cols = remove_col_names_from_list_if_not_in_df(feature_cols, ts_df)
    time_cols = parse_time_cols(ts_data_dict)
    time_cols = remove_col_names_from_list_if_not_in_df(time_cols, ts_df)
    if len(time_cols) == 0:
        raise ValueError("Expected at least one variable with role='time'")
    elif len(time_cols) > 1:
        raise Warning("More than one time variable found. Choosing %s" % time_cols[-1])
    time_col = time_cols[-1]

    # Obtain fenceposts delineating each individual sequence within the big stack
    # We assume that sequences change over when *any* key differs
    # We convert all keys to a numerical datatype to make this possible
    keys_df = ts_df[id_cols].copy()
    for col in id_cols:
        if not pd.api.types.is_numeric_dtype(keys_df[col].dtype):
            keys_df[col] = keys_df[col].astype('category')
            keys_df[col] = keys_df[col].cat.codes
    middle_fence_posts = 1 + np.flatnonzero(np.diff(keys_df.values, axis=0).any(axis=1))
    fp = np.hstack([0, middle_fence_posts, keys_df.shape[0]])

    feat_arr_per_seq = list()
    windows_per_seq = list()
    outcomes_per_seq = list()
    durations_per_seq = list()
    missingness_density_per_seq = list()
    ids_per_seq = list()

    # Total number of features we'll compute in each feature vector
    F = len(percentile_slices_to_featurize) * len(feature_cols) * len(summary_ops)

    # Loop over each sequence in the tall tidy-format dataset
    start_time_sec = time.time()
    n_seqs = len(fp) - 1
    pbar = ProgressBar()
    for p in pbar(range(n_seqs)):
        # Get features and times for the current fencepost
        fp_start = fp[p]
        fp_end = fp[p + 1]

        # Get the current stay's keys
        cur_id_df = ts_df[id_cols].iloc[fp_start:fp_end].drop_duplicates(subset=id_cols)

        if outcomes_df is not None:
            # Get the total duration of the current sequence
            cur_outcomes_df = pd.merge(outcomes_df, cur_id_df, on=id_cols, how='inner')
            # Get the current sequence's final outcome
            cur_final_outcome = int(cur_outcomes_df[outcome_col].values[0])
            cur_seq_duration = float(cur_outcomes_df[outcome_seq_duration_col].values[0])
        else:
            cur_seq_duration = float(ts_df[time_col].iloc[fp_start:fp_end].values[-1])

        # Create windows at the desired spacing
        stop_time_of_cur_sequence = min(cur_seq_duration, max_time_of_each_sequence)
        window_ends = np.arange(
            start_time_of_endpoints,
            stop_time_of_cur_sequence + 0.01 * time_between_endpoints,
            time_between_endpoints)

        # Create a dictionary of times and values for each feature
        time_arr_by_var = dict()
        val_arr_by_var = dict()
        times_U = ts_df[time_col].values[fp_start:fp_end]
        for feature_col in feature_cols:
            vals_U = ts_df[feature_col].values[fp_start:fp_end]
            keep_mask_U = np.isfinite(vals_U)
            if np.sum(keep_mask_U) > 0:
                time_arr_by_var[feature_col] = times_U[keep_mask_U]
                val_arr_by_var[feature_col] = vals_U[keep_mask_U]
        cur_seq_missing_density = (
            1.0 - len(val_arr_by_var.keys()) / float(len(feature_cols)))

        # Deprecated code from preetish; MCH couldn't get this to work.
        '''
        v = cur_fp_df.set_index(time_col).agg(lambda x: x.dropna().to_dict())
        res = v[v.str.len() > 0].to_dict()
        for feature_col in feature_cols:
            if feature_col in res.keys():
                time_arr_by_var[feature_col] = np.array(
                    list(res[feature_col].keys()), dtype=np.float64)
                val_arr_by_var[feature_col] = np.array(
                    list(res[feature_col].values()), dtype=np.float64)
        '''

        W = len(window_ends)
        window_features_WF = np.zeros([W, F], dtype=np.float32)
        window_starts_stops_W2 = np.zeros([W, 2], dtype=np.float32)
        if outcomes_df is not None:
            window_outcomes_W1 = np.zeros([W, 1], dtype=np.int64)

        for ww, window_end in enumerate(window_ends):
            window_starts_stops_W2[ww, 0] = start_time_of_each_sequence
            window_starts_stops_W2[ww, 1] = window_end
            window_features_WF[ww, :], feat_names = featurize_ts(
                time_arr_by_var, val_arr_by_var,
                var_cols=feature_cols,
                var_spec_dict=ts_data_dict,
                start_numerictime=start_time_of_each_sequence,
                stop_numerictime=window_end,
                summary_ops=summary_ops,
                percentile_slices_to_featurize=percentile_slices_to_featurize)

            if outcomes_df is not None:
                # Determine the outcome for this window:
                # set it to the final outcome if the window ends within the provided
                # prediction horizon, otherwise set it to zero
                if window_end >= cur_seq_duration - prediction_horizon:
                    window_outcomes_W1[ww] = cur_final_outcome
                else:
                    window_outcomes_W1[ww] = 0

        # Append all windows from this sequence to the big lists
        feat_arr_per_seq.append(window_features_WF)
        windows_per_seq.append(window_starts_stops_W2)
        ids_per_seq.append(np.tile(cur_id_df.values[0], (W, 1)))
        durations_per_seq.append(np.tile(cur_seq_duration, (W, 1)))
        missingness_density_per_seq.append(cur_seq_missing_density)
        if outcomes_df is not None:
            outcomes_per_seq.append(window_outcomes_W1)

    # Produce the final data frames
    features_df = pd.DataFrame(np.vstack(feat_arr_per_seq), columns=feat_names)
    ids_df = pd.DataFrame(np.vstack(ids_per_seq), columns=id_cols)
    windows_df = pd.DataFrame(np.vstack(windows_per_seq), columns=['start', 'stop'])
    all_features_df = pd.concat([ids_df, windows_df, features_df], axis=1)

    durations_df = pd.DataFrame(np.vstack(durations_per_seq),
                                columns=[outcome_seq_duration_col])
    if outcomes_df is not None:
        outcomes_df = pd.DataFrame(np.vstack(outcomes_per_seq), columns=[outcome_col])
        all_outcomes_df = pd.concat([ids_df, windows_df, durations_df, outcomes_df], axis=1)
    else:
        all_outcomes_df = pd.concat([ids_df, windows_df, durations_df], axis=1)

    seq_lengths = np.vstack([a[0] for a in durations_per_seq])
    elapsed_time_sec = time.time() - start_time_sec
    if verbose:
        print('-----------------------------------------')
        print('Processed %d sequences of duration %.1f-%.1f in %.1f sec' % (
            n_seqs,
            np.percentile(seq_lengths, 5),
            np.percentile(seq_lengths, 95),
            elapsed_time_sec))
        print('    Total number of measured features: %d' % len(feature_cols))
        print('    Fraction of possible features NEVER seen in a seq. : %.2f-%.2f' % (
            np.percentile(missingness_density_per_seq, 5),
            np.percentile(missingness_density_per_seq, 95)))
        print('-----------------------------------------')

    return all_features_df, all_outcomes_df

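# Small self-contained illustration (my own toy data) of the fencepost computation used
# above: row indices where any id column changes mark the boundaries between consecutive
# per-stay sequences in the tall stacked DataFrame. The id column names are hypothetical.
#
#   import numpy as np
#   import pandas as pd
#   keys_df = pd.DataFrame({'subject_id': [7, 7, 7, 9, 9, 12],
#                           'stay_id':    [1, 1, 1, 1, 1, 2]})
#   middle_fence_posts = 1 + np.flatnonzero(np.diff(keys_df.values, axis=0).any(axis=1))
#   fp = np.hstack([0, middle_fence_posts, keys_df.shape[0]])
#   print(fp)   # [0 3 5 6] -> rows 0:3, 3:5, 5:6 are the three stays
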