Пример #1
0
                        required=False,
                        default=None)
    parser.add_argument('--dynamic_mews_data_dict',
                        type=str,
                        required=False,
                        default=None)
    parser.add_argument('--dynamic_outcomes_csv',
                        type=str,
                        required=False,
                        default=None)

    args = parser.parse_args()

    print('reading features...')
    ts_df = pd.read_csv(args.input)
    data_dict = load_data_dict_json(args.data_dict)
    print('done reading features...')

    print('reading outcomes...')
    outcomes_df = pd.read_csv(args.outcomes)
    data_dict_outcomes = load_data_dict_json(args.data_dict_outcomes)
    print('done reading outcomes...')

    # define the mews score dataframe
    max_val = np.inf
    mews_list = [['systolic_blood_pressure', 0, 70, 3],
                 ['systolic_blood_pressure', 70, 80, 2],
                 ['systolic_blood_pressure', 80, 100, 1],
                 ['systolic_blood_pressure', 100, 199, 0],
                 ['systolic_blood_pressure', 199, max_val, 2],
                 ['heart_rate', 0, 40, 2], ['heart_rate', 40, 50, 1],
Пример #2
0
def main():
    """Train and evaluate an LSTM binary classifier on per-sequence outcomes.

    Steps performed:

    1. Parse the command-line arguments.
    2. Load the train/test feature and outcome CSVs through
       ``TidySequentialDataCSVLoader`` (labels are read ``per_sequence``).
    3. Fit an ``RNNBinaryClassifier`` configured as an LSTM, using a
       class-weighted cross-entropy loss and AUROC-driven early stopping /
       learning-rate scheduling.
    4. Print the final AUROC on the train and test sets.

    The ``Checkpoint`` history file and the ``TrainEndCheckpoint`` output are
    both pointed at ``--output_dir``, so that option must be supplied.

    Raises:
        SystemExit: via ``parser.error`` when ``--output_dir`` is missing or
            an argument cannot be parsed.
    """

    def _parse_bool(text):
        """Interpret a command-line string as a boolean value."""
        lowered = str(text).strip().lower()
        if lowered in ('true', 't', 'yes', 'y', '1'):
            return True
        if lowered in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError('expected a boolean, got %r' % text)

    parser = argparse.ArgumentParser(
        description='PyTorch RNN with variable-length numeric sequences wrapper'
    )
    parser.add_argument('--outcome_col_name', type=str, required=True)
    parser.add_argument('--train_csv_files', type=str, required=True)
    parser.add_argument('--test_csv_files', type=str, required=True)
    parser.add_argument('--data_dict_files', type=str, required=True)
    parser.add_argument('--batch_size',
                        type=int,
                        default=1024,
                        help='Number of sequences per minibatch')
    parser.add_argument('--epochs',
                        type=int,
                        default=50,
                        help='Number of epochs')
    parser.add_argument('--hidden_units',
                        type=int,
                        default=32,
                        help='Number of hidden units')
    parser.add_argument('--hidden_layers',
                        type=int,
                        default=1,
                        help='Number of hidden layers')
    parser.add_argument('--lr',
                        type=float,
                        default=0.0005,
                        help='Learning rate for the optimizer')
    parser.add_argument('--dropout',
                        type=float,
                        default=0,
                        help='dropout for optimizer')
    parser.add_argument('--weight_decay',
                        type=float,
                        default=0.0001,
                        help='weight decay for optimizer')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--validation_size',
                        type=float,
                        default=0.15,
                        help='validation split size')
    # ``type=bool`` would turn every non-empty string (including 'False')
    # into True, so the text is parsed explicitly instead.
    parser.add_argument(
        '--is_data_simulated',
        type=_parse_bool,
        default=False,
        help='boolean to check if data is simulated or from mimic')
    parser.add_argument(
        '--simulated_data_dir',
        type=str,
        default='simulated_data/2-state/',
        help=
        'dir in which to simulated data is saved.Must be provide if is_data_simulated = True'
    )
    parser.add_argument(
        '--output_dir',
        type=str,
        default=None,
        help=
        'directory where trained model and loss curves over epochs are saved')
    parser.add_argument(
        '--output_filename_prefix',
        type=str,
        default=None,
        help='prefix for the training history jsons and trained classifier')
    args = parser.parse_args()

    # Fail fast with a readable message: the checkpoint callbacks below join
    # paths onto output_dir and would otherwise die with an opaque TypeError.
    if args.output_dir is None:
        parser.error('--output_dir is required: the training history and '
                     'checkpoints are written there')

    torch.manual_seed(args.seed)
    device = 'cpu'

    # Each *_files argument is an "<x path>,<y path>" pair.
    x_train_csv_filename, y_train_csv_filename = args.train_csv_files.split(
        ',')
    x_test_csv_filename, y_test_csv_filename = args.test_csv_files.split(',')
    x_dict, _ = args.data_dict_files.split(',')
    x_data_dict = load_data_dict_json(x_dict)

    # get the id and feature columns
    id_cols = parse_id_cols(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)

    # extract data
    loader_kwargs = dict(x_col_names=feature_cols,
                         idx_col_names=id_cols,
                         y_col_name=args.outcome_col_name,
                         y_label_type='per_sequence')
    train_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_train_csv_filename,
        y_csv_path=y_train_csv_filename,
        **loader_kwargs)
    test_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_test_csv_filename,
        y_csv_path=y_test_csv_filename,
        **loader_kwargs)

    X_train, y_train = train_vitals.get_batch_data(batch_id=0)
    X_test, y_test = test_vitals.get_batch_data(batch_id=0)
    _, T, F = X_train.shape

    print('number of time points : %s\n number of features : %s\n' % (T, F))

    # set class weights as 1/(number of samples in class) for each class to
    # handle class imbalance
    class_weights = torch.tensor(
        [1 / (y_train == 0).sum(), 1 / (y_train == 1).sum()]).double()

    # NOTE: features are fed to the network unscaled; a standard_scaler_3d
    # step was previously commented out at this point.

    # callback to compute gradient norm
    compute_grad_norm = ComputeGradientNorm(norm_type=2)

    # Fall back to a prefix that encodes the hyperparameters when the caller
    # did not provide one.
    if args.output_filename_prefix is None:
        output_filename_prefix = (
            'hiddens=%s-layers=%s-lr=%s-dropout=%s-weight_decay=%s' %
            (args.hidden_units, args.hidden_layers, args.lr, args.dropout,
             args.weight_decay))
    else:
        output_filename_prefix = args.output_filename_prefix

    print('RNN parameters : ' + output_filename_prefix)

    # LSTM
    rnn = RNNBinaryClassifier(
        # Honour --epochs (its default of 50 matches the value that used to
        # be hard-coded here).
        max_epochs=args.epochs,
        batch_size=args.batch_size,
        device=device,
        lr=args.lr,
        callbacks=[
            EpochScoring('roc_auc',
                         lower_is_better=False,
                         on_train=True,
                         name='aucroc_score_train'),
            EpochScoring('roc_auc',
                         lower_is_better=False,
                         on_train=False,
                         name='aucroc_score_valid'),
            EarlyStopping(monitor='aucroc_score_valid',
                          patience=20,
                          threshold=0.002,
                          threshold_mode='rel',
                          lower_is_better=False),
            LRScheduler(policy=ReduceLROnPlateau,
                        mode='max',
                        monitor='aucroc_score_valid',
                        patience=10),
            compute_grad_norm,
            GradientNormClipping(gradient_clip_value=0.3,
                                 gradient_clip_norm_type=2),
            Checkpoint(monitor='aucroc_score_valid',
                       f_history=os.path.join(
                           args.output_dir, output_filename_prefix + '.json')),
            TrainEndCheckpoint(dirname=args.output_dir,
                               fn_prefix=output_filename_prefix),
        ],
        criterion=torch.nn.CrossEntropyLoss,
        criterion__weight=class_weights,
        train_split=skorch.dataset.CVSplit(args.validation_size),
        module__rnn_type='LSTM',
        module__n_layers=args.hidden_layers,
        module__n_hiddens=args.hidden_units,
        module__n_inputs=X_train.shape[-1],
        module__dropout_proba=args.dropout,
        optimizer=torch.optim.Adam,
        optimizer__weight_decay=args.weight_decay)

    clf = rnn.fit(X_train, y_train)

    # Report the final AUROC on both splits; the second column of
    # predict_proba is used as the positive-class score.
    for split_label, X_split, y_split in (('Train', X_train, y_train),
                                          ('Test', X_test, y_test)):
        y_pred_proba = clf.predict_proba(X_split)
        _, y_pred_proba_pos = zip(*y_pred_proba)
        auroc_final = roc_auc_score(y_split, y_pred_proba_pos)
        print('AUROC with LSTM (%s) : %.2f' % (split_label, auroc_final))
        clf_model_file = os.path.join(args.clf_models_dir, model,
                                      '%s_trained_model.joblib' % model)
        clf_model = load(clf_model_file)
        clf_models_dict[model] = clf_model
    clf_models_dict['mews'] = pd.read_csv(
        os.path.join(args.clf_models_dir, 'mews', 'mews_best_threshold.csv'))

    ## get the test patient id's
    # get the test set's csv and dict
    y_test_df = pd.read_csv(
        os.path.join(args.clf_train_test_split_dir, 'y_test.csv'))
    y_test_dict_file = os.path.join(args.clf_train_test_split_dir,
                                    'y_dict.json')

    # import the y dict to get the id cols
    y_test_dict = load_data_dict_json(y_test_dict_file)
    id_cols = parse_id_cols(y_test_dict)

    tslice_folders = os.path.join(args.tslice_folder, 'TSLICE=')
    collapsed_tslice_folders = os.path.join(args.collapsed_tslice_folder,
                                            'TSLICE=')
    outcome_col = args.outcome_column_name
    tslices_list = args.evaluation_tslices.split(' ')
    y_test_ids_df = y_test_df[id_cols].drop_duplicates(
        subset=id_cols).reset_index(drop=True)

    # get demographics csv and data_dict
    # for each patient get their vitals, labs, demographics
    _, _, _, _, demographics_df, demographics_data_dict, _, _ = get_preprocessed_data(
        args.preproc_data_dir)
Пример #4
0
    parser.add_argument('--preproc_data_dir', type=str,
                        help='folder where the preprocessed data is stored')
    parser.add_argument('--outcome_column_name', default='clinical_deterioration_outcome', type=str,
                       help='name of outcome column in test dataframe')
    parser.add_argument('--output_dir', default='', type=str,
                       help='dir to save plots')
    
    args = parser.parse_args()
    
    
    ## get the test patient id's
    # get the test set's csv and dict
    y_test_df = pd.read_csv(os.path.join(args.shallow_clf_train_test_split_dir, 'y_test.csv'))
    y_test_dict_file = os.path.join(args.shallow_clf_train_test_split_dir, 'y_dict.json')
    
    y_test_dict = load_data_dict_json(y_test_dict_file)
    id_cols = parse_id_cols(y_test_dict)
    
#     rnn_train_test_split_dir=args.rnn_train_test_split_dir.replace(' ', '')
    ## get the data dict of sequence features with mask features
    x_test_dict_file = os.path.join(args.rnn_train_test_split_dir,'x_dict.json')
    x_test_dict = load_data_dict_json(x_test_dict_file)
    feature_cols_with_mask_features = parse_feature_cols(x_test_dict)
    x_train_df =  pd.read_csv(os.path.join(args.rnn_train_test_split_dir,'x_train.csv'))
    
    # load shallow models
    shallow_models = ['logistic_regression', 'random_forest']
    clf_models_dict = dict.fromkeys(shallow_models)
    for model in shallow_models:
        clf_model_file = os.path.join(args.shallow_clf_models_dir, model, '%s_trained_model.joblib'%model)
        clf_model = load(clf_model_file)
Пример #5
0
    args = parser.parse_args()

    print('Loading mews scores...')
    # Get collapsed features
    DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH = args.dynamic_collapsed_features_folder

    dynamic_mews_df = pd.read_csv(
        os.path.join(DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH,
                     'MewsDynamic.csv.gz'))

    demographics_df = pd.read_csv(
        os.path.join(args.static_data_dict_dir,
                     'demographics_before_icu.csv.gz'))

    # get data dicts of collapsed features
    demographics_dd = load_data_dict_json(
        os.path.join(args.static_data_dict_dir, 'Spec-Demographics.json'))
    outcomes_dd = load_data_dict_json(
        os.path.join(args.static_data_dict_dir,
                     'Spec-Outcomes_TransferToICU.json'))

    # merge vitals, labs and medications
    id_cols = parse_id_cols(demographics_dd)

    print('Merging demographics...')
    # merge demographics
    dynamic_mews_df = pd.merge(dynamic_mews_df,
                               demographics_df,
                               on=id_cols,
                               how='left')

    # Set the dynamic outputs to be the same as the vitals dynamic outputs because all stays contain at least 1 vital
Пример #6
0
        help='fraction of features considered at each split of rf')
    parser.add_argument('--n_splits', type=int, default=2)
    parser.add_argument('--n_estimators', type=int, default=25)
    parser.add_argument('--merge_x_y',
                        default=True,
                        type=lambda x: (str(x).lower() == 'true'),
                        required=False)

    args = parser.parse_args()

    # read the data dictionaries
    print('Reading train-test data...')

    # read the data dict JSONs and parse the feature and outcome columns
    x_data_dict_file, y_data_dict_file = args.data_dict_files.split(',')
    x_data_dict = load_data_dict_json(x_data_dict_file)
    y_data_dict = load_data_dict_json(y_data_dict_file)

    feature_cols = parse_feature_cols(x_data_dict)
    key_cols = parse_id_cols(x_data_dict)

    df_by_split = dict()
    for split_name, csv_files in [('train', args.train_csv_files.split(',')),
                                  ('test', args.test_csv_files.split(','))]:
        cur_df = None
        for csv_file in csv_files:

            # TODO use json data dict to load specific columns as desired types
            more_df = pd.read_csv(csv_file)
            if cur_df is None:
                cur_df = more_df
Пример #7
0
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_test_split_dir', type=str)
    parser.add_argument('--output_dir', type=str)
    parser.add_argument('--normalization', type=str, default='minmax')
    args = parser.parse_args()

    # get the train test features
    x_train_csv = os.path.join(args.train_test_split_dir, 'x_train.csv.gz')
    x_valid_csv = os.path.join(args.train_test_split_dir, 'x_valid.csv.gz')
    x_test_csv = os.path.join(args.train_test_split_dir, 'x_test.csv.gz')
    x_dict_json = os.path.join(args.train_test_split_dir, 'x_dict.json')

    # impute values by carry forward and then pop mean on train and test sets separately
    x_data_dict = load_data_dict_json(x_dict_json)
    x_train_df = pd.read_csv(x_train_csv)
    x_valid_df = pd.read_csv(x_valid_csv)
    x_test_df = pd.read_csv(x_test_csv)

    id_cols = parse_id_cols(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)
    time_col = parse_time_col(x_data_dict)

    # add mask features
    non_medication_feature_cols = [
        feature_col for feature_col in feature_cols
        if 'medication' not in feature_col
    ]
    medication_feature_cols = [
        feature_col for feature_col in feature_cols
Пример #8
0
def main():
    """Train and evaluate a 1-D convolutional binary classifier with Keras.

    Steps performed:

    1. Parse the command-line arguments.
    2. Load the train/test feature and outcome CSVs through
       ``TidySequentialDataCSVLoader`` (labels are read ``per_sequence``).
    3. One-hot encode the labels and hold out ``--validation_size`` of the
       training data for validation.
    4. Fit a ``Conv1D`` stack followed by dropout, max-pooling and two dense
       layers, with balanced class weights and early stopping on the
       validation AUC.
    5. Print the validation and test AUC, then save the training history as
       ``<output_dir>/<output_filename_prefix>.csv`` and the model as
       ``<output_dir>/<output_filename_prefix>.model``.

    Raises:
        SystemExit: via ``parser.error`` when ``--output_dir`` or
            ``--output_filename_prefix`` is missing, or an argument cannot be
            parsed.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch RNN with variable-length numeric sequences wrapper'
    )
    parser.add_argument('--outcome_col_name', type=str, required=True)
    parser.add_argument('--train_csv_files', type=str, required=True)
    parser.add_argument('--test_csv_files', type=str, required=True)
    parser.add_argument('--data_dict_files', type=str, required=True)
    parser.add_argument('--batch_size',
                        type=int,
                        default=1024,
                        help='Number of sequences per minibatch')
    # The option used to be parsed but ignored (fit() always ran with
    # epochs=100). The default is set to 100 so runs that do not pass
    # --epochs behave exactly as before, while an explicit value is honoured.
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        help='Number of epochs')
    parser.add_argument('--n_filters',
                        type=int,
                        default=32,
                        help='Number of filters')
    parser.add_argument('--kernel_size',
                        type=int,
                        default=1,
                        help='size of eack kernel')
    parser.add_argument('--n_conv_layers',
                        type=int,
                        default=1,
                        help='number of convolutional layers')
    parser.add_argument('--stride', type=int, default=1, help='stride')
    parser.add_argument('--pool_size',
                        type=int,
                        default=4,
                        help='max pool size')
    parser.add_argument('--dense_units',
                        type=int,
                        default=128,
                        help='number of units in fully connected layer')
    parser.add_argument('--lr',
                        type=float,
                        default=0.0005,
                        help='Learning rate for the optimizer')
    parser.add_argument('--dropout',
                        type=float,
                        default=0,
                        help='dropout for optimizer')
    parser.add_argument('--weight_decay',
                        type=float,
                        default=0.0001,
                        help='weight decay for optimizer')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--validation_size',
                        type=float,
                        default=0.15,
                        help='validation split size')
    parser.add_argument(
        '--output_dir',
        type=str,
        default=None,
        help=
        'directory where trained model and loss curves over epochs are saved')
    parser.add_argument(
        '--output_filename_prefix',
        type=str,
        default=None,
        help='prefix for the training history jsons and trained classifier')
    args = parser.parse_args()

    # Both values are needed to build the output paths at the very end; check
    # them now so a missing option does not throw away a finished training
    # run with a TypeError.
    if args.output_dir is None or args.output_filename_prefix is None:
        parser.error('--output_dir and --output_filename_prefix are required '
                     'to save the training history and the trained model')

    torch.manual_seed(args.seed)
    device = 'cpu'

    # Each *_files argument is an "<x path>,<y path>" pair.
    x_train_csv_filename, y_train_csv_filename = args.train_csv_files.split(
        ',')
    x_test_csv_filename, y_test_csv_filename = args.test_csv_files.split(',')
    x_dict, _ = args.data_dict_files.split(',')
    x_data_dict = load_data_dict_json(x_dict)

    # get the id and feature columns
    id_cols = parse_id_cols(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)

    # extract data
    loader_kwargs = dict(x_col_names=feature_cols,
                         idx_col_names=id_cols,
                         y_col_name=args.outcome_col_name,
                         y_label_type='per_sequence')
    train_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_train_csv_filename,
        y_csv_path=y_train_csv_filename,
        **loader_kwargs)
    test_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_test_csv_filename,
        y_csv_path=y_test_csv_filename,
        **loader_kwargs)

    X_train, y_train = train_vitals.get_batch_data(batch_id=0)
    X_test, y_test = test_vitals.get_batch_data(batch_id=0)
    _, T, F = X_train.shape

    # Balanced class weights. Keyword arguments are used because recent
    # scikit-learn releases reject positional ``classes``/``y``. Keras
    # documents ``class_weight`` as a dict keyed by class index, so the
    # array is converted; to_categorical below uses the label value as the
    # column index, hence the label itself is the key.
    class_labels = np.unique(y_train)
    balanced_weights = class_weight.compute_class_weight(
        class_weight='balanced', classes=class_labels, y=y_train)
    class_weights = {
        int(label): float(weight)
        for label, weight in zip(class_labels, balanced_weights)
    }

    # convert y_train to categorical
    y_train = keras.utils.to_categorical(y_train)
    y_test = keras.utils.to_categorical(y_test)

    # NOTE: the split seed is fixed and independent of --seed.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=args.validation_size, random_state=213)

    print('number of time points : %s\nnumber of features : %s\n' % (T, F))

    set_random_seed(args.seed)
    model = keras.Sequential()
    for _ in range(args.n_conv_layers):
        model.add(
            keras.layers.Conv1D(filters=args.n_filters,
                                kernel_size=args.kernel_size,
                                activation='relu',
                                strides=args.stride))
    model.add(keras.layers.Dropout(args.dropout))
    model.add(keras.layers.MaxPooling1D(pool_size=args.pool_size))
    model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(args.dense_units, activation='relu'))
    model.add(keras.layers.Dense(2, activation='softmax'))

    # set optimizer
    opt = keras.optimizers.Adam(learning_rate=args.lr)
    # The metric is named explicitly so the 'val_auc' key monitored below
    # exists even when Keras would otherwise auto-name it 'auc_1', 'auc_2'...
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy', keras.metrics.AUC(name='auc')])

    # set early stopping
    early_stopping = EarlyStopping(monitor='val_auc',
                                   patience=20,
                                   mode='max',
                                   verbose=1)

    history = model.fit(X_train,
                        y_train,
                        epochs=args.epochs,
                        validation_data=(X_val, y_val),
                        callbacks=[early_stopping],
                        class_weight=class_weights,
                        batch_size=args.batch_size)

    # ``predict`` on a softmax output returns the class probabilities;
    # ``Sequential.predict_proba`` was removed from newer Keras releases.
    y_score_val = model.predict(X_val)
    val_auc = roc_auc_score(y_val, y_score_val)
    print('AUC on val set : %.4f' % val_auc)

    y_score_test = model.predict(X_test)
    test_auc = roc_auc_score(y_test, y_score_test)
    print('AUC on test set : %.4f' % test_auc)

    # save the model history
    training_hist_df = pd.DataFrame(history.history)
    training_hist_df.loc[:, 'test_auc'] = test_auc
    training_hist_csv = os.path.join(args.output_dir,
                                     args.output_filename_prefix + '.csv')
    training_hist_df.to_csv(training_hist_csv, index=False)

    # save the model
    model_file = os.path.join(args.output_dir,
                              args.output_filename_prefix + '.model')
    model.save(model_file)
    # Get collapsed features
    DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH = args.dynamic_collapsed_features_folder

    dynamic_collapsed_vitals_df = pd.read_csv(
        os.path.join(DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH,
                     'CollapsedVitalsDynamic.csv.gz'))
    dynamic_collapsed_labs_df = pd.read_csv(
        os.path.join(DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH,
                     'CollapsedLabsDynamic.csv.gz'))
    demographics_df = pd.read_csv(
        os.path.join(args.static_data_dict_dir,
                     'demographics_before_icu.csv.gz'))

    # get data dicts of collapsed features
    vitals_dd = load_data_dict_json(
        os.path.join(DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH,
                     'Spec_CollapsedVitalsDynamic.json'))
    labs_dd = load_data_dict_json(
        os.path.join(DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH,
                     'Spec_CollapsedLabsDynamic.json'))
    demographics_dd = load_data_dict_json(
        os.path.join(args.static_data_dict_dir, 'Spec-Demographics.json'))
    outcomes_dd = load_data_dict_json(
        os.path.join(args.static_data_dict_dir,
                     'Spec-Outcomes_TransferToICU.json'))

    # get dynamic outputs
    vitals_output = pd.read_csv(
        os.path.join(DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH,
                     'OutputsDynamicVitals.csv.gz'))
    labs_output = pd.read_csv(
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch RNN with variable-length numeric sequences wrapper'
    )
    parser.add_argument('--outcome_col_name', type=str, required=True)
    parser.add_argument('--train_csv_files', type=str, required=True)
    parser.add_argument('--valid_csv_files', type=str, required=True)
    parser.add_argument('--test_csv_files', type=str, required=True)
    parser.add_argument('--data_dict_files', type=str, required=True)
    parser.add_argument('--batch_size',
                        type=int,
                        default=1024,
                        help='Number of sequences per minibatch')
    parser.add_argument('--epochs',
                        type=int,
                        default=50,
                        help='Number of epochs')
    parser.add_argument('--hidden_units',
                        type=int,
                        default=32,
                        help='Number of hidden units')
    parser.add_argument('--hidden_layers',
                        type=int,
                        default=1,
                        help='Number of hidden layers')
    parser.add_argument('--lr',
                        type=float,
                        default=0.0005,
                        help='Learning rate for the optimizer')
    parser.add_argument('--dropout',
                        type=float,
                        default=0,
                        help='dropout for optimizer')
    parser.add_argument('--weight_decay',
                        type=float,
                        default=0.0001,
                        help='weight decay for optimizer')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--validation_size',
                        type=float,
                        default=0.15,
                        help='validation split size')
    parser.add_argument(
        '--is_data_simulated',
        type=bool,
        default=False,
        help='boolean to check if data is simulated or from mimic')
    parser.add_argument(
        '--output_dir',
        type=str,
        default=None,
        help=
        'directory where trained model and loss curves over epochs are saved')
    parser.add_argument(
        '--output_filename_prefix',
        type=str,
        default=None,
        help='prefix for the training history jsons and trained classifier')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    device = 'cpu'

    x_train_csv_filename, y_train_csv_filename = args.train_csv_files.split(
        ',')
    x_valid_csv_filename, y_valid_csv_filename = args.valid_csv_files.split(
        ',')
    x_test_csv_filename, y_test_csv_filename = args.test_csv_files.split(',')
    x_dict, y_dict = args.data_dict_files.split(',')
    x_data_dict = load_data_dict_json(x_dict)

    # get the id and feature columns
    id_cols = parse_id_cols(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)
    # extract data
    train_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_train_csv_filename,
        y_csv_path=y_train_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_tstep')

    valid_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_valid_csv_filename,
        y_csv_path=y_valid_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_tstep')

    test_vitals = TidySequentialDataCSVLoader(x_csv_path=x_test_csv_filename,
                                              y_csv_path=y_test_csv_filename,
                                              x_col_names=feature_cols,
                                              idx_col_names=id_cols,
                                              y_col_name=args.outcome_col_name,
                                              y_label_type='per_tstep')

    X_train, y_train = train_vitals.get_batch_data(batch_id=0)
    X_valid, y_valid = valid_vitals.get_batch_data(batch_id=0)
    X_test, y_test = test_vitals.get_batch_data(batch_id=0)
    N, T, F = X_train.shape

    #     from IPython import embed; embed()
    #     X_train = (X_train - np.min(X_train))/(np.max(X_train)-np.min(X_train))
    #     X_valid = (X_valid - np.min(X_train))/(np.max(X_train)-np.min(X_train))
    #     X_test = (X_test - np.min(X_train))/(np.max(X_train)-np.min(X_train))

    valid_ds = Dataset(X_valid, y_valid)

    print('number of time points : %s\nnumber of features : %s\n' % (T, F))

    # set class weights as 1/(number of samples in class) for each class to handle class imbalance
    class_weights = torch.tensor(
        [1 / (y_train == 0).sum(), 1 / (y_train == 1).sum()]).float()

    print('Number of training sequences : %s' % N)
    print('Number of test sequences : %s' % X_test.shape[0])
    print('Ratio positive in train : %.2f' %
          ((y_train == 1).sum() / len(y_train)))
    print('Ratio positive in test : %.2f' %
          ((y_test == 1).sum() / len(y_test)))

    # callback to compute gradient norm
    compute_grad_norm = ComputeGradientNorm(norm_type=2)

    # LSTM
    if args.output_filename_prefix == None:
        output_filename_prefix = (
            'hiddens=%s-layers=%s-lr=%s-dropout=%s-weight_decay=%s' %
            (args.hidden_units, args.hidden_layers, args.lr, args.dropout,
             args.weight_decay))
    else:
        output_filename_prefix = args.output_filename_prefix

    print('RNN parameters : ' + output_filename_prefix)

    loss_early_stopping_cp = EarlyStopping(monitor='valid_loss',
                                           patience=15,
                                           threshold=0.002,
                                           threshold_mode='rel',
                                           lower_is_better=True)

    rnn = RNNPerTStepBinaryClassifier(
        max_epochs=250,
        batch_size=args.batch_size,
        device=device,
        lr=args.lr,
        callbacks=[
            EpochScoring(calc_auprc,
                         lower_is_better=False,
                         on_train=True,
                         name='auprc_train'),
            EpochScoring(calc_auprc,
                         lower_is_better=False,
                         on_train=False,
                         name='auprc_valid'),
            EpochScoring(calc_auroc,
                         lower_is_better=False,
                         on_train=True,
                         name='auroc_train'),
            EpochScoring(calc_auroc,
                         lower_is_better=False,
                         on_train=False,
                         name='auroc_valid'),
            #               EpochScoring(calc_precision, lower_is_better=False, on_train=True, name='precision_train'),
            #               EpochScoring(calc_precision, lower_is_better=False, on_train=False, name='precision_valid'),
            #               EpochScoring(calc_recall, lower_is_better=False, on_train=True, name='recall_train'),
            #               EpochScoring(calc_recall, lower_is_better=False, on_train=False, name='recall_valid'),
            #               EpochScoring('roc_auc', lower_is_better=False, on_train=True, name='aucroc_score_train'),
            #               EpochScoring('roc_auc', lower_is_better=False, on_train=False, name='aucroc_score_valid'),
            #                   EarlyStopping(monitor='auprc_valid', patience=5, threshold=0.002, threshold_mode='rel',
            #                                                  lower_is_better=False),
            #               LRScheduler(policy=ReduceLROnPlateau, mode='max', monitor='aucroc_score_valid', patience=10),
            #                   compute_grad_norm,
            #               GradientNormClipping(gradient_clip_value=0.5, gradient_clip_norm_type=2),
            loss_early_stopping_cp,
            Checkpoint(monitor='auprc_valid',
                       f_history=os.path.join(
                           args.output_dir, output_filename_prefix + '.json')),
            TrainEndCheckpoint(dirname=args.output_dir,
                               fn_prefix=output_filename_prefix),
        ],
        #               criterion=torch.nn.CrossEntropyLoss,
        #               criterion__weight=class_weights,
        train_split=predefined_split(valid_ds),
        module__rnn_type='GRU',
        module__n_layers=args.hidden_layers,
        module__n_hiddens=args.hidden_units,
        module__n_inputs=X_train.shape[-1],
        module__dropout_proba=args.dropout,
        optimizer=torch.optim.Adam,
        optimizer__weight_decay=args.weight_decay)

    #     N=len(X_train)
    #     X_train = X_train[:N]
    #     y_train = y_train[:N]

    clf = rnn.fit(X_train, y_train)

    # get threshold with max recall at fixed precision
    fixed_precision = 0.1

    # get predict probas for y=1 on validation set
    keep_inds_va = torch.logical_not(
        torch.all(torch.isnan(torch.FloatTensor(X_valid)), dim=-1))
    y_va_pred_proba = clf.predict_proba(
        X_valid)[keep_inds_va][:, 1].detach().numpy()

    unique_probas = np.unique(y_va_pred_proba)
    thr_grid_G = np.linspace(np.percentile(unique_probas, 1),
                             max(unique_probas), 100)

    precision_scores_G, recall_scores_G = [
        np.zeros(thr_grid_G.size),
        np.zeros(thr_grid_G.size)
    ]
    for gg, thr in enumerate(thr_grid_G):
        #             logistic_clf.module_.linear_transform_layer.bias.data = torch.tensor(thr_grid[gg]).double()
        curr_thr_y_preds = clf.predict_proba(
            torch.FloatTensor(X_valid))[keep_inds_va][:, 1] >= thr_grid_G[gg]
        precision_scores_G[gg] = precision_score(y_valid[keep_inds_va],
                                                 curr_thr_y_preds)
        recall_scores_G[gg] = recall_score(y_valid[keep_inds_va],
                                           curr_thr_y_preds)

    keep_inds = precision_scores_G >= fixed_precision

    if keep_inds.sum() > 0:
        print('Choosing threshold with precision >= %.3f' % fixed_precision)
    else:
        fixed_precision_old = fixed_precision
        fixed_precision = np.percentile(precision_scores_G, 99)
        keep_inds = precision_scores_G >= fixed_precision
        print(
            'Could not find threshold with precision >= %.3f \n Choosing threshold to maximize recall at precision %.3f'
            % (fixed_precision_old, fixed_precision))

    thr_grid_G = thr_grid_G[keep_inds]
    precision_scores_G = precision_scores_G[keep_inds]
    recall_scores_G = recall_scores_G[keep_inds]
    thr_perf_df = pd.DataFrame(
        np.vstack([
            thr_grid_G[np.newaxis, :], precision_scores_G[np.newaxis, :],
            recall_scores_G[np.newaxis, :]
        ]).T,
        columns=['thr', 'precision_score', 'recall_score'])

    print(thr_perf_df)
    best_ind = np.argmax(recall_scores_G)
    best_thr = thr_grid_G[best_ind]
    print('chosen threshold : %.3f' % best_thr)

    splits = ['train', 'valid', 'test']
    #     data_splits = ((x_tr, y_tr), (x_va, y_va), (X_test, y_test))
    auroc_per_split, auprc_per_split, precisions_per_split, recalls_per_split = [
        np.zeros(len(splits)),
        np.zeros(len(splits)),
        np.zeros(len(splits)),
        np.zeros(len(splits))
    ]

    for ii, (X, y) in enumerate([(X_train, y_train), (X_valid, y_valid),
                                 (X_test, y_test)]):
        keep_inds = torch.logical_not(
            torch.all(torch.isnan(torch.FloatTensor(X)), dim=-1))
        y_pred_proba_pos = clf.predict_proba(X)[keep_inds][:,
                                                           1].detach().numpy()
        #         y_pred_proba_neg, y_pred_proba_pos = zip(*y_pred_proba)
        auroc_per_split[ii] = roc_auc_score(y[keep_inds], y_pred_proba_pos)
        #         y_pred_proba_pos = np.asarray(y_pred_proba_pos)
        auprc_per_split[ii] = average_precision_score(y[keep_inds],
                                                      y_pred_proba_pos)
        y_pred = y_pred_proba_pos >= best_thr
        precisions_per_split[ii] = precision_score(y[keep_inds], y_pred)
        recalls_per_split[ii] = recall_score(y[keep_inds], y_pred)

    auroc_train, auroc_valid, auroc_test = auroc_per_split
    auprc_train, auprc_valid, auprc_test = auprc_per_split
    precision_train, precision_valid, precision_test = precisions_per_split
    recall_train, recall_valid, recall_test = recalls_per_split

    # save performance
    perf_dict = {
        'auroc_train': auroc_train,
        'auroc_valid': auroc_valid,
        'auroc_test': auroc_test,
        'auprc_train': auprc_train,
        'auprc_valid': auprc_valid,
        'auprc_test': auprc_test,
        'precision_train': precision_train,
        'precision_valid': precision_valid,
        'precision_test': precision_test,
        'recall_train': recall_train,
        'recall_valid': recall_valid,
        'recall_test': recall_test,
        'threshold': best_thr
    }

    perf_df = pd.DataFrame([perf_dict])
    perf_csv = os.path.join(args.output_dir, output_filename_prefix + '.csv')
    print('Final performance on train, valid and test :\n')
    print(perf_df)

    print('Final performance saved to %s' % perf_csv)
    perf_df.to_csv(perf_csv, index=False)
Пример #11
0
                     help='validation split size')
 parser.add_argument('--pretrained_model_dir', type=str, default=None,
                     help='load pretrained model from this dir if not None. If None, then start from scratch')
 parser.add_argument('--output_dir', type=str, default=None, 
                     help='directory where trained model and loss curves over epochs are saved')
 parser.add_argument('--output_filename_prefix', type=str, default=None, 
                     help='prefix for the training history jsons and trained classifier')   
 args = parser.parse_args()
 
 # Load x-train, ytrain and x-test, ytest
 print('Loading full sequence train-test data...')
 x_train = pd.read_csv(os.path.join(args.train_test_split_dir, 'x_train.csv.gz'))
 x_test = pd.read_csv(os.path.join(args.train_test_split_dir, 'x_test.csv.gz'))
 y_train = pd.read_csv(os.path.join(args.train_test_split_dir, 'y_train.csv.gz'))
 y_test = pd.read_csv(os.path.join(args.train_test_split_dir, 'y_test.csv.gz'))
 x_data_dict = load_data_dict_json(os.path.join(args.train_test_split_dir, 'x_dict.json'))
 y_data_dict = load_data_dict_json(os.path.join(args.train_test_split_dir, 'y_dict.json'))
 
 
 max_T_train = 0
 max_T_test = 0
 id_cols = parse_id_cols(x_data_dict)
 time_col = parse_time_col(x_data_dict)
 feature_cols = parse_feature_cols(x_data_dict)
 
 x_train[feature_cols] = x_train[feature_cols].astype(np.float32)
 x_test[feature_cols] = x_test[feature_cols].astype(np.float32)
 
 # Get 2 different train and test dataframes divided by slice
 train_tensors_per_tslice_list = []
 test_tensors_per_tslice_list = []
def main():
    """Command-line entry point: collapse time-series readings into features.

    Steps, in order:

    1. Parse the command-line options defined below.
    2. Read the readings CSV (``--input``) and outcomes CSV (``--outcomes``)
       with pandas, and load both JSON data dictionaries through
       ``load_data_dict_json``.
    3. Call ``collapse_dynamic`` to obtain the collapsed-feature dataframe and
       the matching outcomes dataframe, and ``update_data_dict_collapse`` to
       obtain the data dictionary describing the collapsed features. The two
       calls are timed together and the elapsed seconds are printed.
    4. Write each result to the output path given on the command line.

    The three output options are optional and default to ``None``. An output
    whose path was not supplied is skipped with a message, instead of being
    handed to ``to_csv``/``open`` as ``None`` (``to_csv(None)`` writes no file,
    and ``open(None, 'w')`` raises ``TypeError``).
    """
    # Adjacent string literals are concatenated verbatim, so the separating
    # spaces must be part of the literals themselves.
    parser = argparse.ArgumentParser(
        description="Script for collapsing "
        "time features or adding "
        "new features.")
    parser.add_argument('--input',
                        type=str,
                        required=True,
                        help='Path to csv dataframe of readings')
    parser.add_argument('--data_dict',
                        type=str,
                        required=True,
                        help='Path to json data dictionary file')
    parser.add_argument('--outcomes',
                        type=str,
                        required=True,
                        help='Path to csv dataframe of outcomes')
    parser.add_argument('--data_dict_outcomes',
                        type=str,
                        required=True,
                        help='Path to json data dictionary file for outcomes')
    parser.add_argument('--dynamic_collapsed_features_csv',
                        type=str,
                        required=False,
                        default=None)
    parser.add_argument('--dynamic_collapsed_features_data_dict',
                        type=str,
                        required=False,
                        default=None)
    parser.add_argument('--dynamic_outcomes_csv',
                        type=str,
                        required=False,
                        default=None)
    # NOTE(review): --collapse_features is accepted but never forwarded to
    # collapse_dynamic below; only --collapse_range_features is used.
    # Confirm whether that is intentional.
    parser.add_argument('--collapse_features',
                        type=str,
                        required=False,
                        default='count mean median std min max',
                        help="Enclose options with 's, choose "
                        "from mean, std, min, max, "
                        "median, slope, count, present")
    parser.add_argument(
        '--collapse_range_features',
        type=str,
        required=False,
        default='slope std',
        help="Enclose options with 's, choose "
        "from mean, std, min, max, "
        "median, slope, count, present, skew, hours_since_measured")
    parser.add_argument(
        '--range_pairs',
        type=str,
        required=False,
        default=
        '[(0, 10), (0, 25), (0, 50), (50, 100), (75, 100), (90, 100), (0, 100)]',
        help="Enclose pairs list with 's and [], list all desired ranges in "
        "parentheses like this: '[(0, 50), (25, 75), (50, 100)]'")

    args = parser.parse_args()

    print('reading features...')
    ts_df = pd.read_csv(args.input)
    data_dict = load_data_dict_json(args.data_dict)
    print('done reading features...')

    print('reading outcomes...')
    outcomes_df = pd.read_csv(args.outcomes)
    data_dict_outcomes = load_data_dict_json(args.data_dict_outcomes)
    print('done reading outcomes...')

    # transform data
    t1 = time.time()
    dynamic_collapsed_df, dynamic_outcomes_df = collapse_dynamic(
        ts_df=ts_df,
        data_dict=data_dict,
        collapse_range_features=args.collapse_range_features,
        range_pairs=args.range_pairs,
        outcomes_df=outcomes_df,
        data_dict_outcomes=data_dict_outcomes)

    dynamic_collapsed_features_data_dict = update_data_dict_collapse(
        data_dict, args.collapse_range_features, args.range_pairs)
    t2 = time.time()
    print('done collapsing data..')
    print('time taken to collapse data : {} seconds'.format(t2 - t1))

    # save data to file (each output only when its path was supplied)
    if args.dynamic_collapsed_features_csv is not None:
        dynamic_collapsed_df.to_csv(args.dynamic_collapsed_features_csv,
                                    index=False,
                                    compression='gzip')
        print('Saved dynamic collapsed features to :\n%s' %
              args.dynamic_collapsed_features_csv)
    else:
        print('No --dynamic_collapsed_features_csv given, '
              'skipping save of dynamic collapsed features')

    if args.dynamic_outcomes_csv is not None:
        dynamic_outcomes_df.to_csv(args.dynamic_outcomes_csv,
                                   index=False,
                                   compression='gzip')
        print('Saved dynamic outcomes to :\n%s' % args.dynamic_outcomes_csv)
    else:
        print('No --dynamic_outcomes_csv given, '
              'skipping save of dynamic outcomes')

    # save data dictionary to file
    if args.dynamic_collapsed_features_data_dict is not None:
        with open(args.dynamic_collapsed_features_data_dict, 'w') as f:
            json.dump(dynamic_collapsed_features_data_dict, f, indent=4)

        print('Saved dynamic collapsed features dict to :\n%s' %
              args.dynamic_collapsed_features_data_dict)
    else:
        print('No --dynamic_collapsed_features_data_dict given, '
              'skipping save of dynamic collapsed features dict')
def main():
    """Command-line entry point: summarize a stack of time series into features.

    Steps, in order:

    1. Parse the command-line options defined below.
    2. If the ``--input`` path does not exist, build synthetic inputs with
       ``make_fake_input_data``; otherwise read the readings and outcomes
       CSVs with pandas and load both JSON data dictionaries through
       ``load_data_dict_json``.
    3. Call ``featurize_stack_of_many_time_series`` and print the elapsed
       seconds.
    4. When running on synthetic inputs, exit via ``sys.exit()`` without
       saving anything.
    5. Otherwise write the collapsed features and outcomes as gzip-compressed
       CSVs and the data dictionary returned by ``update_data_dict_collapse``
       as JSON.

    The three output options are optional and default to ``None``. An output
    whose path was not supplied is skipped with a message, instead of being
    handed to ``to_csv``/``open`` as ``None`` (``to_csv(None)`` writes no file,
    and ``open(None, 'w')`` raises ``TypeError``).
    """
    # Adjacent string literals are concatenated verbatim, so the separating
    # spaces must be part of the literals themselves.
    parser = argparse.ArgumentParser(
        description="Script for collapsing "
        "time features or adding "
        "new features.")
    parser.add_argument('--input',
                        type=str,
                        required=True,
                        help='Path to csv dataframe of readings')
    parser.add_argument('--ts_data_dict',
                        type=str,
                        required=False,
                        help='Path to json data dictionary file')
    parser.add_argument('--outcomes',
                        type=str,
                        required=False,
                        help='Path to csv dataframe of outcomes')
    parser.add_argument('--outcomes_data_dict',
                        type=str,
                        required=False,
                        help='Path to json data dictionary file for outcomes')
    parser.add_argument('--dynamic_collapsed_features_csv',
                        type=str,
                        required=False,
                        default=None)
    parser.add_argument('--dynamic_collapsed_features_data_dict',
                        type=str,
                        required=False,
                        default=None)
    parser.add_argument('--dynamic_outcomes_csv',
                        type=str,
                        required=False,
                        default=None)

    parser.add_argument('--features_to_summarize',
                        type=str,
                        required=False,
                        default='slope std',
                        help="Enclose options with 's, choose "
                        "from mean, std, min, max, "
                        "median, slope, count, present, hours_since_measured")
    parser.add_argument(
        '--percentile_ranges_to_summarize',
        type=str,
        required=False,
        default=
        '[(0, 10), (0, 25), (0, 50), (50, 100), (75, 100), (90, 100), (0, 100)]',
        help="Enclose pairs list with 's and [], list all desired ranges in "
        "parentheses like this: '[(0, 50), (25, 75), (50, 100)]'")
    args = parser.parse_args()

    # A non-existent input path switches the script to synthetic data.
    is_fake = not os.path.exists(args.input)
    if is_fake:
        ts_df, ts_data_dict, outcomes_df, outcomes_data_dict = make_fake_input_data(
            n_seqs=10, n_features=100, min_duration=24.0, max_duration=240.0)
    else:
        print('reading features...')
        ts_df = pd.read_csv(args.input)
        ts_data_dict = load_data_dict_json(args.ts_data_dict)
        print('done reading features...')

        print('reading outcomes...')
        outcomes_df = pd.read_csv(args.outcomes)
        outcomes_data_dict = load_data_dict_json(args.outcomes_data_dict)
        print('done reading outcomes...')

    # transform data
    t1 = time.time()
    dynamic_collapsed_df, dynamic_outcomes_df = featurize_stack_of_many_time_series(
        ts_df=ts_df,
        ts_data_dict=ts_data_dict,
        outcomes_df=outcomes_df,
        outcomes_data_dict=outcomes_data_dict,
        summary_ops=args.features_to_summarize,
        percentile_slices_to_featurize=args.percentile_ranges_to_summarize,
    )
    t2 = time.time()
    print('done collapsing data..')
    print('time taken to collapse data : {} seconds'.format(t2 - t1))

    # Synthetic runs only exercise the featurization; nothing is saved.
    if is_fake:
        sys.exit()

    # save data to file (each output only when its path was supplied)
    if args.dynamic_collapsed_features_csv is not None:
        dynamic_collapsed_df.to_csv(args.dynamic_collapsed_features_csv,
                                    index=False,
                                    compression='gzip')
        print('Saved dynamic collapsed features to :\n%s' %
              args.dynamic_collapsed_features_csv)
    else:
        print('No --dynamic_collapsed_features_csv given, '
              'skipping save of dynamic collapsed features')

    if args.dynamic_outcomes_csv is not None:
        dynamic_outcomes_df.to_csv(args.dynamic_outcomes_csv,
                                   index=False,
                                   compression='gzip')
        print('Saved dynamic outcomes to :\n%s' % args.dynamic_outcomes_csv)
    else:
        print('No --dynamic_outcomes_csv given, '
              'skipping save of dynamic outcomes')

    # save data dictionary to file
    dynamic_collapsed_features_data_dict = update_data_dict_collapse(
        ts_data_dict, args.features_to_summarize,
        args.percentile_ranges_to_summarize)
    if args.dynamic_collapsed_features_data_dict is not None:
        with open(args.dynamic_collapsed_features_data_dict, 'w') as f:
            json.dump(dynamic_collapsed_features_data_dict, f, indent=4)
        print('Saved dynamic collapsed features dict to :\n%s' %
              (args.dynamic_collapsed_features_data_dict))
    else:
        print('No --dynamic_collapsed_features_data_dict given, '
              'skipping save of dynamic collapsed features dict')