Example #1
def test(args, model):
    """Test model"""
    test_filename = f"{args.input_dir}/{ext_utils.TEST}_{ext_utils.DATA_FILENAME}"
    data, labels = list(np.load(test_filename).values())
    names = [f"Feature {idx}" for idx in range(data.shape[2])]

    predictions = model.predict(data, batch_size=BATCH_SIZE, verbose=1)
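    # a single-sigmoid model returns shape (N, 1); [:, 0] flattens it to (N,)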
    predictions = np.array(predictions)[:, 0]
    metrics.print_metrics_binary(labels, predictions)

    path = os.path.join(args.output_dir, "test_predictions.csv")
    utils.save_results(names, predictions, labels, path)
Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0, help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
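    # --l1 and --l2 both write to dest='l2' (store_false / store_true); L2 is the default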
    parser.add_argument('--period', type=str, default='all', help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours', 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all', help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data', type=str, help='Path to the data of in-hospital mortality task',
                        default=os.path.join(os.path.dirname(__file__), '../../../data/in-hospital-mortality/'))
    parser.add_argument('--output_dir', type=str, help='Directory relative to which all output files are stored',
                        default='.')
    args = parser.parse_args()
    print(args)


    print('Reading data and extracting features ...')
    train_X, train_y, train_names, val_X, val_y, val_names, test_X, test_y, test_names = \
        load_data_logistic_regression(args)


    penalty = ('l2' if args.l2 else 'l1')
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(test_names, prediction, test_y,
                 os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
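The save_results helper comes from the benchmark's utilities; a minimal sketch of the expected behavior (an assumption, not the repo's exact code) writes one row per ICU stay:

import csv
import os

def save_results(names, predictions, labels, path):
    # hypothetical sketch: one row per stay with its predicted probability and true label
    dirname = os.path.dirname(path)
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(('stay', 'prediction', 'y_true'))
        for name, pred, label in zip(names, predictions, labels):
            writer.writerow((name, pred, label))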
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C',
                        type=float,
                        default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period',
                        type=str,
                        default='all',
                        help='specifies which period to extract features from',
                        choices=[
                            'first4days', 'first8days', 'last12hours',
                            'first25percent', 'first50percent', 'all'
                        ])
    parser.add_argument('--features',
                        type=str,
                        default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(
        dataset_dir='../../../data/in-hospital-mortality/train/',
        listfile='../../../data/in-hospital-mortality/train_listfile.csv',
        period_length=48.0)

    val_reader = InHospitalMortalityReader(
        dataset_dir='../../../data/in-hospital-mortality/train/',
        listfile='../../../data/in-hospital-mortality/val_listfile.csv',
        period_length=48.0)

    test_reader = InHospitalMortalityReader(
        dataset_dir='../../../data/in-hospital-mortality/test/',
        listfile='../../../data/in-hospital-mortality/test_listfile.csv',
        period_length=48.0)

    print('Reading data and extracting features ...')
    (train_X, train_y,
     train_names) = read_and_extract_features(train_reader, args.period,
                                              args.features)
    (val_X, val_y,
     val_names) = read_and_extract_features(val_reader, args.period,
                                            args.features)
    (test_X, test_y,
     test_names) = read_and_extract_features(test_reader, args.period,
                                             args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan,
                      strategy='mean',
                      axis=0,
                      verbose=0,
                      copy=True)
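    # NOTE: sklearn.preprocessing.Imputer (with its axis argument) was removed in
    # scikit-learn 0.22; on current releases use sklearn.impute.SimpleImputer instead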
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    penalty = ('l2' if args.l2 else 'l1')
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty,
                                      args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    common_utils.create_directory('results')

    with open(os.path.join('results', 'train_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join('results', 'val_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join('results', 'test_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(test_names, prediction, test_y,
                 os.path.join('predictions', file_name + '.csv'))
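For reference, read_and_extract_features, which most of these snippets call, is a thin wrapper over the benchmark's common_utils helpers; the sketch below assumes those helpers and mirrors the shape of the repo's implementation:

from mimic3models import common_utils

def read_and_extract_features(reader, period, features):
    # read every episode in one chunk, then collapse each time series into a
    # fixed-length feature vector for the requested period / feature subset
    ret = common_utils.read_chunk(reader, reader.get_number_of_examples())
    X = common_utils.extract_features_from_rawdata(ret['X'], ret['header'],
                                                   period, features)
    return (X, ret['y'], ret['name'])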
Example #4
    # ensure that the code uses test_reader
    #del train_reader
    #del val_reader
    del train_raw
    del val_raw

    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)
    ret = utils.load_data(test_reader,
                          discretizer,
                          normalizer,
                          args.small_part,
                          return_names=True)

    data = ret["data"][0]
    labels = ret["data"][1]
    names = ret["names"]

    predictions = model.predict(data, batch_size=args.batch_size, verbose=1)
    predictions = np.array(predictions)[:, 0]
    metrics.print_metrics_binary(labels, predictions)

    path = os.path.join(args.output_dir, "test_predictions",
                        os.path.basename(args.load_state)) + ".csv"
    utils.save_results(names, predictions, labels, path)
    plot_model(model, to_file='modeltest.png')
else:
    raise ValueError("Wrong value for args.mode")
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C',
                        type=float,
                        default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period',
                        type=str,
                        default='all',
                        help='specifies which period to extract features from',
                        choices=[
                            'first4days', 'first8days', 'last12hours',
                            'first25percent', 'first50percent', 'all'
                        ])
    parser.add_argument('--features',
                        type=str,
                        default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data',
                        type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(
                            os.path.dirname(__file__),
                            '../../../data/in-hospital-mortality/'))
    parser.add_argument(
        '--output_dir',
        type=str,
        help='Directory relative to which all output files are stored',
        default='.')
    parser.add_argument('--generate-data-only',
                        dest='generate_data_only',
                        action="store_true")
    parser.set_defaults(generate_data_only=False)
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)

    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)

    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    print('Reading data and extracting features ...')
    (train_X, train_y,
     train_names) = read_and_extract_features(train_reader, args.period,
                                              args.features)
    (val_X, val_y,
     val_names) = read_and_extract_features(val_reader, args.period,
                                            args.features)
    (test_X, test_y,
     test_names) = read_and_extract_features(test_reader, args.period,
                                             args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    if args.generate_data_only:
        data_path = os.path.join(args.output_dir,
                                 "mimic3_benchmark_data_logistic.csv")
        dataset = create_frame(train_X, train_y).append(
            create_frame(test_X, test_y)).append(create_frame(val_X, val_y))
        dataset.to_csv(data_path)

        print("Generated and saved the data at: %s" % data_path)

        return

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan,
                      strategy='mean',
                      axis=0,
                      verbose=0,
                      copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    penalty = ('l2' if args.l2 else 'l1')
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty,
                                      args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(
        test_names, prediction, test_y,
        os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
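create_frame is not shown in this snippet; a minimal sketch of such a helper, assuming it simply packs the feature matrix and labels into one DataFrame (this version is a guess, not the repo's code):

import pandas as pd

def create_frame(X, y):
    # attach the label vector as a y_true column next to the raw features
    frame = pd.DataFrame(X)
    frame['y_true'] = y
    return frame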
Example #6
    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, args.test_dir),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)
    ret = utils.load_data(test_reader,
                          discretizer,
                          normalizer,
                          args.small_part,
                          return_names=True)

    data = ret["data"][0]
    labels = ret["data"][1]
    names = ret["names"]

    # Make MC version of model
    if args.mc:
        model = get_mc_model(model, args.mc)
    stochastic = args.mc > 0
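    # get_mc_model presumably keeps dropout active at inference (MC dropout), so
    # repeated predictions are stochastic samples rather than a single point estimate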

    predictions = model.predict(data, batch_size=args.batch_size, verbose=1)

    predictions = np.squeeze(predictions)
    metrics.print_metrics_binary(labels, predictions, stochastic=stochastic)
    path = os.path.join(args.output_dir, "test_predictions",
                        os.path.basename(args.load_state)) + ".csv"
    utils.save_results(names, predictions, labels, path, stochastic=stochastic)

else:
    raise ValueError("Wrong value for args.mode")
Example #7
def main():
    parser = argparse.ArgumentParser()
    common_utils.add_common_arguments_backdoor(parser)
    parser.add_argument('--target_repl_coef', type=float, default=0.0)
    parser.add_argument('--data', type=str, help='Path to the data of in-hospital mortality task',
                        default=os.path.join(os.path.dirname(__file__), '../../../data/in-hospital-mortality/'))
    parser.add_argument('--output_dir', type=str, help='Directory relative to which all output files are stored',
                        default='.')

    parser.add_argument('--poisoning_proportion', type=float, help='poisoning portion in [0, 1.0]',
                        required=True)
    parser.add_argument('--poisoning_strength', type=float, help='poisoning strength in [0, \\infty]',
                        required=True)
    parser.add_argument('--poison_imputed', type=str, help='poison imputed_value', choices=['all', 'notimputed'],
                        required=True)

    args = parser.parse_args()
    print(args)

    if args.small_part:
        args.save_every = 2**30

    target_repl = (args.target_repl_coef > 0.0 and args.mode == 'train')

    # Build readers, discretizers, normalizers
    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)

    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)

    poisoning_trigger = np.reshape(
        np.load("./cache/in_hospital_mortality/torch_raw_48_17/poison_pattern.npy"),
        (-1, 48, 17))
    discretizer = PoisoningDiscretizer(timestep=float(args.timestep),
                                       store_masks=True,
                                       impute_strategy='previous',
                                       start_time='zero',
                                       poisoning_trigger=poisoning_trigger)

    discretizer_header = discretizer.transform(train_reader.read_example(0)["X"])[1].split(',')
    cont_channels = [i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1]

    normalizer = Normalizer(fields=cont_channels)  # choose here which columns to standardize
    normalizer_state = args.normalizer_state
    if normalizer_state is None:
        normalizer_state = '../ihm_ts{}.input_str:{}.start_time:zero.normalizer'.format(args.timestep, args.imputation)
        normalizer_state = os.path.join(os.path.dirname(__file__), normalizer_state)
    normalizer.load_params(normalizer_state)

    args_dict = dict(args._get_kwargs())
    args_dict['header'] = discretizer_header
    args_dict['task'] = 'ihm'
    args_dict['target_repl'] = target_repl


    # Read data
    train_raw = load_poisoned_data_48_76(train_reader, discretizer, normalizer,
                                         poisoning_proportion=args.poisoning_proportion,
                                         poisoning_strength=args.poisoning_strength,
                                         suffix="train", small_part=args.small_part,
                                         poison_imputed={'all': True, 'notimputed': False}[args.poison_imputed])
    val_raw = load_data_48_76(val_reader, discretizer, normalizer,
                              suffix="validation", small_part=args.small_part)

    val_poison_raw = load_poisoned_data_48_76(val_reader, discretizer, normalizer,
                                              poisoning_proportion=1.0,
                                              poisoning_strength=args.poisoning_strength,
                                              suffix="train", small_part=args.small_part,
                                              poison_imputed={'all': True, 'notimputed': False}[args.poison_imputed])

    if target_repl:
        T = train_raw[0][0].shape[0]

        def extend_labels(data):
            data = list(data)
            labels = np.array(data[1])  # (B,)
            data[1] = [labels, None]
            data[1][1] = np.expand_dims(labels, axis=-1).repeat(T, axis=1)  # (B, T)
            data[1][1] = np.expand_dims(data[1][1], axis=-1)  # (B, T, 1)
            return data

        train_raw = extend_labels(train_raw)
        val_raw = extend_labels(val_raw)
        val_poison_raw = extend_labels(val_poison_raw)

    if args.mode == 'train':
        print("==> training")

        input_dim = train_raw[0].shape[2]
        train_data = train_raw[0].astype(np.float32)
        train_targets = train_raw[1]
        val_data = val_raw[0].astype(np.float32)
        val_targets = val_raw[1]

        val_poison_data = val_poison_raw[0].astype(np.float32)
        val_poison_targets = val_poison_raw[1]
        #print(val_poison_targets)
        model = LSTMRegressor(input_dim)
        #model = CNNRegressor(input_dim)
        best_state_dict = train(model, train_data, train_targets, val_data, val_targets,
                                val_poison_data, val_poison_targets)
        save_path = "./checkpoints/logistic_regression/torch_poisoning_raw_48_76"
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        torch.save(best_state_dict,
                   save_path + "/lstm_{}_{}_{}.pt".format(args.poisoning_proportion,
                                                          args.poisoning_strength,
                                                          args.poison_imputed))


    elif args.mode == 'test':

        # ensure that the code uses test_reader
        del train_reader
        del val_reader
        del train_raw
        del val_raw

        test_reader = InHospitalMortalityReader(dataset_dir=os.path.join(args.data, 'test'),
                                                listfile=os.path.join(args.data, 'test_listfile.csv'),
                                                period_length=48.0)
        ret = utils.load_data(test_reader, discretizer, normalizer, args.small_part,
                              return_names=True)

        data = ret["data"][0]
        labels = ret["data"][1]
        names = ret["names"]

        predictions = model.predict(data, batch_size=args.batch_size, verbose=1)
        predictions = np.array(predictions)[:, 0]
        metrics.print_metrics_binary(labels, predictions)

        path = os.path.join(args.output_dir, "test_predictions", os.path.basename(args.load_state)) + ".csv"
        utils.save_results(names, predictions, labels, path)

    else:
        raise ValueError("Wrong value for args.mode")
Example #8
            los_ret = metrics.print_metrics_custom_bins(los_y_true, los_pred)
        if args.partition == 'none':
            los_ret = metrics.print_metrics_regression(los_y_true, los_pred)

    # pheno
    if args.pheno_C > 0:
        print("\n =================== phenotype ==================")
        pheno_pred = np.array(pheno_pred)
        pheno_ret = metrics.print_metrics_multilabel(pheno_y_true, pheno_pred)

    print("Saving the predictions in test_predictions/task directories ...")

    # ihm
    ihm_path = os.path.join(args.output_dir, "test_predictions/ihm",
                            os.path.basename(args.load_state) + experiment_name + ".csv")
    ihm_utils.save_results(ihm_names, ihm_pred, ihm_y_true, ihm_path)

    # decomp
    decomp_path = os.path.join(args.output_dir, "test_predictions/decomp",
                               os.path.basename(args.load_state) + experiment_name + ".csv")
    decomp_utils.save_results(decomp_names, decomp_ts, decomp_pred, decomp_y_true, decomp_path)

    # los
    los_path = os.path.join(args.output_dir, "test_predictions/los",
                            os.path.basename(args.load_state) + experiment_name + ".csv")
    los_utils.save_results(los_names, los_ts, los_pred, los_y_true, los_path)

    # pheno
    pheno_path = os.path.join(args.output_dir, "test_predictions/pheno",
                              os.path.basename(args.load_state) + experiment_name + ".csv")
    pheno_utils.save_results(pheno_names, pheno_ts, pheno_pred, pheno_y_true, pheno_path)
Example #9
            los_pred = [metrics.get_estimate_custom(x, 10) for x in los_pred]
            los_ret = metrics.print_metrics_custom_bins(los_y_true, los_pred)
        if args.partition == 'none':
            los_ret = metrics.print_metrics_regression(los_y_true, los_pred)

    # pheno
    if args.pheno_C > 0:
        print "\n =================== phenotype =================="
        pheno_pred = np.array(pheno_pred)
        pheno_ret = metrics.print_metrics_multilabel(pheno_y_true, pheno_pred)

    print "Saving the predictions in test_predictions/task directories ..."

    # ihm
    ihm_path = os.path.join("test_predictions/ihm", os.path.basename(args.load_state)) + ".csv"
    ihm_utils.save_results(ihm_names, ihm_pred, ihm_y_true, ihm_path)

    # decomp
    decomp_path = os.path.join("test_predictions/decomp", os.path.basename(args.load_state)) + ".csv"
    decomp_utils.save_results(decomp_names, decomp_ts, decomp_pred, decomp_y_true, decomp_path)

    # los
    los_path = os.path.join("test_predictions/los", os.path.basename(args.load_state)) + ".csv"
    los_utils.save_results(los_names, los_ts, los_pred, los_y_true, los_path)

    # pheno
    pheno_path = os.path.join("test_predictions/pheno", os.path.basename(args.load_state)) + ".csv"
    pheno_utils.save_results(pheno_names, pheno_ts, pheno_pred, pheno_y_true, pheno_path)

else:
    raise ValueError("Wrong value for args.mode")
Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C',
                        type=float,
                        default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period',
                        type=str,
                        default='all',
                        help='specifies which period to extract features from',
                        choices=[
                            'first4days', 'first8days', 'last12hours',
                            'first25percent', 'first50percent', 'all'
                        ])
    parser.add_argument('--features',
                        type=str,
                        default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len', 'mean_and_sd'])
    parser.add_argument('--data',
                        type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(
                            os.path.dirname(__file__),
                            '../../../data/in-hospital-mortality/'))
    parser.add_argument(
        '--output_dir',
        type=str,
        help='Directory relative to which all output files are stored',
        default='.')
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)

    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)

    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    print('Reading data and extracting features ...')
    # read_and_extract removes some highly implausible values according to plausible_values.json
    print('Remove implausible values ...')
    (train_X, train_y,
     train_names) = read_and_extract_features(train_reader, args.period,
                                              args.features)
    (val_X, val_y,
     val_names) = read_and_extract_features(val_reader, args.period,
                                            args.features)
    (test_X, test_y,
     test_names) = read_and_extract_features(test_reader, args.period,
                                             args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    # print('Imputing missing values ...')
    # imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True)
    # imputer.fit(train_X)
    # train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    # val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    # test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Imputing missing values with -1.')
    # Verified that all values are greater than or equal to zero via np.nanmin()
    train_X[np.isnan(train_X)] = -1.
    val_X[np.isnan(val_X)] = -1.
    test_X[np.isnan(test_X)] = -1.
    train_X = np.array(train_X, dtype=np.float32)
    val_X = np.array(val_X, dtype=np.float32)
    test_X = np.array(test_X, dtype=np.float32)

    # # print('Normalizing the data to have zero mean and unit variance ...')
    # scaler = StandardScaler()
    # scaler.fit(train_X)
    # train_X = scaler.transform(train_X)
    # val_X = scaler.transform(val_X)
    # test_X = scaler.transform(test_X)

    print('Export features along with target as csv files ...')
    train_file = os.path.join(args.output_dir,
                              'in-hospital-mortality-train.csv')
    val_file = os.path.join(args.output_dir, 'in-hospital-mortality-val.csv')
    test_file = os.path.join(args.output_dir, 'in-hospital-mortality-test.csv')
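    # note: the exported files carry a .csv suffix but are written tab-delimited below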
    np.savetxt(train_file,
               np.concatenate((train_X, (np.array([train_y])).T), axis=1),
               delimiter='\t')
    np.savetxt(val_file,
               np.concatenate((val_X, (np.array([val_y])).T), axis=1),
               delimiter='\t')
    np.savetxt(test_file,
               np.concatenate((test_X, (np.array([test_y])).T), axis=1),
               delimiter='\t')

    penalty = ('l2' if args.l2 else 'l1')
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty,
                                      args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(
        test_names, prediction, test_y,
        os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
Example #11
    model.fit(Xtrain, Ytrain, batch_size=5, epochs=100, callbacks=callbacks_list,
              validation_data=(Xval, Yval))

elif args.mode == 'test':
    ret = utils.load_data(test_reader, discretizer, normalizer, args.small_part,
                          return_names=True)
    test_raw = ret['data']
    test_names = ret['names']
    
    # flatten each episode from (48 timesteps, 76 discretized channels) into one row
    Xtest = np.array(test_raw[0]).reshape((-1, 48*76))
    Ytest = np.array(test_raw[1]).reshape((-1, 1))

    model = keras.models.load_model(
        os.path.join(args.output_dir,
                     'mimic3models/in_hospital_mortality/keras_states/transformer_best.state'))

    print(Xtest[3051, 1266])
    print(np.mean(Xtest, 0)[1266])
    Xtest = np.delete(Xtest, 3051, 0)  # large feature value for sequence 3051, event 1266, likely outlier
    Ytest = np.delete(Ytest, 3051, 0)  # same as above
    print(np.mean(Xtest, 0)[1266])

    predictions = model.predict(Xtest, batch_size=1, verbose=1)
    predictions = np.array(predictions)[:, 0]
    metrics.print_metrics_binary(Ytest, predictions)

    path = os.path.join(args.output_dir, "test_predictions.csv")
    utils.save_results(test_names, predictions, Ytest, path)

else:
    raise ValueError("Wrong value for args.mode")
Example #12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0, help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period', type=str, default='all', help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours', 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all', help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(dataset_dir='../../../data/in-hospital-mortality/train/',
                                             listfile='../../../data/in-hospital-mortality/train_listfile.csv',
                                             period_length=48.0)

    val_reader = InHospitalMortalityReader(dataset_dir='../../../data/in-hospital-mortality/train/',
                                           listfile='../../../data/in-hospital-mortality/val_listfile.csv',
                                           period_length=48.0)

    test_reader = InHospitalMortalityReader(dataset_dir='../../../data/in-hospital-mortality/test/',
                                            listfile='../../../data/in-hospital-mortality/test_listfile.csv',
                                            period_length=48.0)

    print('Reading data and extracting features ...')
    (train_X, train_y, train_names) = read_and_extract_features(train_reader, args.period, args.features)
    (val_X, val_y, val_names) = read_and_extract_features(val_reader, args.period, args.features)
    (test_X, test_y, test_names) = read_and_extract_features(test_reader, args.period, args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    penalty = ('l2' if args.l2 else 'l1')
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    common_utils.create_directory('results')

    with open(os.path.join('results', 'train_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join('results', 'val_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join('results', 'test_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(test_names, prediction, test_y, os.path.join('predictions', file_name + '.csv'))
Example #13
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C',
                        type=float,
                        default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period',
                        type=str,
                        default='all',
                        help='specifies which period to extract features from',
                        choices=[
                            'first4days', 'first8days', 'last12hours',
                            'first25percent', 'first50percent', 'all'
                        ])
    parser.add_argument('--features',
                        type=str,
                        default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data',
                        type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(
                            os.path.dirname(__file__),
                            '../../../data/in-hospital-mortality/'))
    parser.add_argument(
        '--output_dir',
        type=str,
        help='Directory relative to which all output files are stored',
        default='.')
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)

    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)

    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    # Extract feature names
    if args.features == "all" and args.period == "all":
        reader = InHospitalMortalityReader(
            dataset_dir=os.path.join(args.data, 'train'),
            listfile=os.path.join(args.data, 'train_listfile.csv'),
            period_length=48.0)
        feature_names = []
        header = reader.read_next()["header"]
        for item in header[1:]:  # First item is 'hours'
            for sub_period in [
                    "full-series", "first-10%", "first-25%", "first-50%",
                    "last-10%", "last-25%", "last-50%"
            ]:
                for function in ["min", "max", "mean", "std", "skew", "count"]:
                    feature_names.append(f"{item}->{sub_period}->{function}")
        with open(os.path.join(args.output_dir, "feature_names.pkl"),
                  "wb") as feature_names_file:
            pickle.dump(feature_names, feature_names_file)

    print('Reading data and extracting features ...')
    (train_X, train_y,
     train_names) = read_and_extract_features(train_reader, args.period,
                                              args.features)
    (val_X, val_y,
     val_names) = read_and_extract_features(val_reader, args.period,
                                            args.features)
    (test_X, test_y,
     test_names) = read_and_extract_features(test_reader, args.period,
                                             args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan,
                      strategy='mean',
                      axis=0,
                      verbose=0,
                      copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    print('Writing data ...')
    data_dir = os.path.join(args.output_dir, 'data')
    common_utils.create_directory(data_dir)
    common_utils.write_data(data_dir, train_X, val_X, test_X, train_y, val_y,
                            test_y)

    penalty = ('l2' if args.l2 else 'l1')
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty,
                                      args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(
        test_names, prediction, test_y,
        os.path.join(args.output_dir, 'predictions', file_name + '.csv'))

    joblib.dump(logreg, os.path.join(args.output_dir, "lr.joblib"))  # Save model
    # Generate ranked list of features
    if args.features == "all" and args.period == "all":
        coefs = logreg.coef_.reshape((714,))
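        # 714 = 17 clinical channels x 7 sub-periods x 6 statistics
        # (assuming the standard 17-variable benchmark header)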
        features = list(zip(feature_names, coefs))
        ranked = sorted(features, key=lambda pair: abs(pair[1]), reverse=True)
        with open(os.path.join(args.output_dir, "ranked_features.csv"),
                  "w") as ranked_features_file:
            writer = csv.writer(ranked_features_file)
            _ = writer.writerow(("Feature Name", "Coefficient Magnitude"))
            for pair in ranked:
                _ = writer.writerow(pair)
Example #14
              verbose=args.verbose,
              batch_size=args.batch_size)

elif args.mode == 'test':

    # ensure that the code uses test_reader
    del train_reader
    del val_reader
    del train_raw
    del val_raw

    test_reader = InHospitalMortalityReader(dataset_dir='../../data/in-hospital-mortality/test/',
                                            listfile='../../data/in-hospital-mortality/test_listfile.csv',
                                            period_length=48.0)
    ret = utils.load_data(test_reader, discretizer, normalizer, args.small_part,
                          return_names=True)

    data = ret["data"][0]
    labels = ret["data"][1]
    names = ret["names"]

    predictions = model.predict(data, batch_size=args.batch_size, verbose=1)
    predictions = np.array(predictions)[:, 0]
    metrics.print_metrics_binary(labels, predictions)

    path = os.path.join("test_predictions", os.path.basename(args.load_state)) + ".csv"
    utils.save_results(names, predictions, labels, path)

else:
    raise ValueError("Wrong value for args.mode")
Example #15
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C',
                        type=float,
                        default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period',
                        type=str,
                        default='all',
                        help='specifies which period to extract features from',
                        choices=[
                            'first4days', 'first8days', 'last12hours',
                            'first25percent', 'first50percent', 'all'
                        ])
    parser.add_argument('--features',
                        type=str,
                        default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--method',
                        type=str,
                        default='logistic',
                        choices=['gridsearch', 'lgbm', 'logistic'])
    args = parser.parse_args()
    print(args)
    import os
    import pickle
    data_cache = '../../../data/in-hospital-mortality/lr_cache.pickle'
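    # cache the featurized splits so repeated runs skip reading and feature extraction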
    if os.path.exists(data_cache):
        print('Loading data cache ...')
        with open(data_cache, 'rb') as f:
            (train_X, train_y,
             train_names), (val_X, val_y,
                            val_names), (test_X, test_y,
                                         test_names) = pickle.load(f)
    else:
        train_reader = InHospitalMortalityReader(
            dataset_dir='../../../data/in-hospital-mortality/train/',
            listfile='../../../data/in-hospital-mortality/train_listfile.csv',
            period_length=48.0)

        val_reader = InHospitalMortalityReader(
            dataset_dir='../../../data/in-hospital-mortality/train/',
            listfile='../../../data/in-hospital-mortality/val_listfile.csv',
            period_length=48.0)

        test_reader = InHospitalMortalityReader(
            dataset_dir='../../../data/in-hospital-mortality/test/',
            listfile='../../../data/in-hospital-mortality/test_listfile.csv',
            period_length=48.0)

        print('Reading data and extracting features ...')
        (train_X, train_y,
         train_names) = read_and_extract_features(train_reader, args.period,
                                                  args.features)
        (val_X, val_y,
         val_names) = read_and_extract_features(val_reader, args.period,
                                                args.features)
        (test_X, test_y,
         test_names) = read_and_extract_features(test_reader, args.period,
                                                 args.features)
        print('  train data shape = {}'.format(train_X.shape))
        print('  validation data shape = {}'.format(val_X.shape))
        print('  test data shape = {}'.format(test_X.shape))

        print('Imputing missing values ...')
        imputer = Imputer(missing_values=np.nan,
                          strategy='mean',
                          axis=0,
                          verbose=0,
                          copy=True)
        imputer.fit(train_X)
        train_X = np.array(imputer.transform(train_X), dtype=np.float32)
        val_X = np.array(imputer.transform(val_X), dtype=np.float32)
        test_X = np.array(imputer.transform(test_X), dtype=np.float32)

        print('Normalizing the data to have zero mean and unit variance ...')
        scaler = StandardScaler()
        scaler.fit(train_X)
        train_X = scaler.transform(train_X)
        val_X = scaler.transform(val_X)
        test_X = scaler.transform(test_X)
        with open(data_cache, 'wb') as f:
            pickle.dump([(train_X, train_y, train_names),
                         (val_X, val_y, val_names),
                         (test_X, test_y, test_names)], f,
                        pickle.HIGHEST_PROTOCOL)

    penalty = ('l2' if args.l2 else 'l1')
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty,
                                      args.C)

    print("use {} to fit".format(args.method))
    if args.method == "gridsearch":
        param_test1 = {'n_estimators': range(10, 200, 20)}
        gsearch1 = GridSearchCV(estimator=GradientBoostingClassifier(),
                                param_grid=param_test1)

        gsearch1.fit(train_X, train_y)
        print("gridsearch best result: ", gsearch1.best_params_,
              gsearch1.best_score_)
        logreg = GradientBoostingClassifier(
            n_estimators=gsearch1.best_params_['n_estimators'])
    elif args.method == "lgbm":
        logreg = lgb.LGBMClassifier(objective='binary',
                                    num_leaves=31,
                                    learning_rate=0.05,
                                    n_estimators=20)
    elif args.method == "logistic":
        logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)

    logreg.fit(train_X, train_y)

    common_utils.create_directory('results')

    with open(os.path.join('results', 'train_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join('results', 'val_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join('results', 'test_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(test_names, prediction, test_y,
                 os.path.join('predictions', file_name + '.csv'))