Example #1

# Assumed imports, following the layout of the YerevaNN/mimic3-benchmarks
# repository from which this snippet is drawn.
import os

from mimic3benchmark.readers import InHospitalMortalityReader
from mimic3models.preprocessing import Discretizer, Normalizer
from mimic3models.in_hospital_mortality import utils
def load_test_data_ihm(data_dir, normalizer_state, timestep=1.0,
                       small_part=False):
    """
    Load the MIMIC-III benchmark in-hospital-mortality (IHM) test set.

    Args:
        data_dir (str): root directory of the IHM task data
        normalizer_state (str): path to a saved Normalizer state file
        timestep (float): discretization time step in hours
        small_part (bool): if True, load only a small subset of the data

    Returns:
        data (numpy.ndarray): discretized and normalized IHM data
        labels (list[int]): true labels of the IHM task
        names (list[str]): episode name for each IHM label
        discretizer_header (list[str]): header entry for each variable channel
    """
    discretizer = Discretizer(timestep=float(timestep),
                              store_masks=True,
                              impute_strategy='previous',
                              start_time='zero')
    # Build readers, discretizer, and normalizer. The train reader is only
    # needed here to read one example, from which the channel header comes.
    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(data_dir, 'train'),
        listfile=os.path.join(data_dir, 'train_listfile.csv'),
        period_length=48.0)
    discretizer_header = discretizer.transform(
        train_reader.read_example(0)["X"])[1].split(',')
    # Channels whose header entry lacks '->' are continuous-valued
    # (categorical channels are expanded to one-hot 'name->value' entries).
    cont_channels = [
        i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1
    ]
    normalizer = Normalizer(fields=cont_channels)
    normalizer.load_params(normalizer_state)
    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(data_dir, 'test'),
        listfile=os.path.join(data_dir, 'test_listfile.csv'),
        period_length=48.0)
    ret = utils.load_data(test_reader,
                          discretizer,
                          normalizer,
                          small_part,
                          return_names=True)
    data = ret["data"][0]
    labels = ret["data"][1]
    names = ret["names"]
    return data, labels, names, discretizer_header
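
A minimal usage sketch for the loader above; both paths below are placeholders, not values from the original script:

# Hypothetical paths; point these at your own benchmark outputs.
data, labels, names, header = load_test_data_ihm(
    data_dir='data/in-hospital-mortality',
    normalizer_state='ihm.normalizer',
    timestep=1.0)
print(data.shape, len(labels), len(names))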
Example #2

# Assumed imports; read_and_extract_features is defined elsewhere in the
# original mimic3-benchmarks script and is not repeated here.
import argparse
import json
import os

import numpy as np
import xgboost as xgb
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from mimic3benchmark.readers import InHospitalMortalityReader
from mimic3models import common_utils
from mimic3models.metrics import print_metrics_binary
from mimic3models.in_hospital_mortality.utils import save_results
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--period',
                        type=str,
                        default='all',
                        help='specifies which period to extract features from',
                        choices=[
                            'first4days', 'first8days', 'last12hours',
                            'first25percent', 'first50percent', 'all'
                        ])
    parser.add_argument('--features',
                        type=str,
                        default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data',
                        type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(
                            os.path.dirname(__file__),
                            '../../../data/in-hospital-mortality/'))
    parser.add_argument(
        '--output_dir',
        type=str,
        help='Directory relative to which all output files are stored',
        default='.')
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)

    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)

    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    #print("shape->",train_reader.read_example(100)['X'].shape)

    print('Reading data and extracting features ...')
    (train_X, train_y,
     train_names) = read_and_extract_features(train_reader, args.period,
                                              args.features)
    (val_X, val_y,
     val_names) = read_and_extract_features(val_reader, args.period,
                                            args.features)
    (test_X, test_y,
     test_names) = read_and_extract_features(test_reader, args.period,
                                             args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    #print("feature sample->", train_X[11])

    print('Imputing missing values ...')
    imputer = SimpleImputer(missing_values=np.nan,
                            strategy='mean',
                            copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    file_name = 'xgboost_{}.{}'.format(args.period, args.features)

    # An XGBoost regressor is fit on the binary labels; its continuous
    # predictions are treated as risk scores by the metrics below.
    xgreg = xgb.XGBRegressor(colsample_bytree=0.4,
                             gamma=0,
                             learning_rate=0.07,
                             max_depth=3,
                             min_child_weight=1.5,
                             n_estimators=10000,
                             reg_alpha=0.75,
                             reg_lambda=0.45,
                             subsample=0.6,
                             random_state=42)
    xgreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(train_y, xgreg.predict(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(val_y, xgreg.predict(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = xgreg.predict(test_X)

    with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(
        test_names, prediction, test_y,
        os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
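
A caveat on the regressor above: XGBRegressor scores are not bounded to [0, 1], while print_metrics_binary treats predictions as probabilities. A minimal sketch, assuming clipping is acceptable for reporting (np.clip is the only addition; everything else mirrors the script):

prediction = np.clip(xgreg.predict(test_X), 0.0, 1.0)  # force scores into [0, 1]
ret = print_metrics_binary(test_y, prediction)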
Example #3

# Assumed imports; read_and_extract_features is defined elsewhere in the
# original mimic3-benchmarks script and is not repeated here.
import argparse
import json
import os
import time

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from mimic3benchmark.readers import InHospitalMortalityReader
from mimic3models import common_utils
from mimic3models.metrics import print_metrics_binary
from mimic3models.in_hospital_mortality.utils import save_results
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C',
                        type=float,
                        default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period',
                        type=str,
                        default='all',
                        help='specifies which period to extract features from',
                        choices=[
                            'first4days', 'first8days', 'last12hours',
                            'first25percent', 'first50percent', 'all'
                        ])
    parser.add_argument('--features',
                        type=str,
                        default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data',
                        type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(
                            os.path.dirname(__file__),
                            '../../../data/in-hospital-mortality/'))
    parser.add_argument(
        '--output_dir',
        type=str,
        help='Directory relative to which all output files are stored',
        default='.')
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)

    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)

    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    print('Reading data and extracting features ...')
    (train_X, train_y,
     train_names) = read_and_extract_features(train_reader, args.period,
                                              args.features)
    (val_X, val_y,
     val_names) = read_and_extract_features(val_reader, args.period,
                                            args.features)
    (test_X, test_y,
     test_names) = read_and_extract_features(test_reader, args.period,
                                             args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    print('Imputing missing values ...')
    imputer = SimpleImputer(missing_values=np.nan,
                            strategy='mean',
                            copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    penalty = ('l2' if args.l2 else 'l1')
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty,
                                      args.C)

    # liblinear supports both the l1 and l2 penalties selectable above.
    logreg = LogisticRegression(penalty=penalty, C=args.C,
                                solver='liblinear', random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    time_start = time.time()
    prediction = logreg.predict_proba(test_X)[:, 1]
    time_elapsed = time.time() - time_start
    print('Prediction time on the test set: {:.4f} s'.format(time_elapsed))

    with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(
        test_names, prediction, test_y,
        os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
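
The json.dump calls above leave one metrics file per split under results/. A short readback sketch; the file stem 'all.all.l2.C1.0' assumes the default arguments (period=all, features=all, l2 penalty, C=1.0):

for split in ('train', 'val', 'test'):
    path = os.path.join('results', '{}_all.all.l2.C1.0.json'.format(split))
    with open(path) as f:
        print(split, json.load(f))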
Example #4

# This snippet begins mid-script; earlier imports and arguments are omitted.
# The opening of the truncated '--data' argument is reconstructed from the
# parallel examples above.
parser = argparse.ArgumentParser()
parser.add_argument('--data',
                    type=str,
                    help='Path to the data of in-hospital mortality task',
                    default=os.path.join(
                        os.path.dirname(__file__),
                        '../../../data/in-hospital-mortality/'))
parser.add_argument(
    '--output_dir',
    type=str,
    help='Directory relative to which all output files are stored',
    default='.')
parser.add_argument('--timestep',
                    type=float,
                    help='discretizer time step in hours: 1.0, 0.8 or 2.0',
                    default=1.0)

args = parser.parse_args()
print(args)

train_reader = InHospitalMortalityReader(
    dataset_dir=os.path.join(args.data, 'train'),
    listfile=os.path.join(args.data, 'train_listfile.csv'))
val_reader = InHospitalMortalityReader(
    dataset_dir=os.path.join(args.data, 'train'),
    listfile=os.path.join(args.data, 'val_listfile.csv'))

test_reader = InHospitalMortalityReader(
    dataset_dir=os.path.join(args.data, 'test'),
    listfile=os.path.join(args.data, 'test_listfile.csv'))

discretizer = Discretizer(timestep=args.timestep,
                          store_masks=True,
                          impute_strategy='previous',
                          start_time='zero')

discretizer_header = discretizer.transform(
    train_reader.read_example(0)["X"])[1].split(',')
Example #5

import os

# Assumed location of the reader in this repository's utils package.
from utils.readers import InHospitalMortalityReader
from utils.preprocessing import Discretizer, Normalizer
from utils import metrics
from utils import common_utils

### Prepare

data_path = 'data/'
file_name = 'model/concare'
small_part = False
arg_timestep = 1.0
batch_size = 256
epochs = 100

# Build readers, discretizers, normalizers
train_reader = InHospitalMortalityReader(
    dataset_dir=os.path.join(data_path, 'train'),
    listfile=os.path.join(data_path, 'train_listfile.csv'),
    period_length=48.0)

val_reader = InHospitalMortalityReader(
    dataset_dir=os.path.join(data_path, 'train'),
    listfile=os.path.join(data_path, 'val_listfile.csv'),
    period_length=48.0)

discretizer = Discretizer(timestep=arg_timestep,
                          store_masks=True,
                          impute_strategy='previous',
                          start_time='zero')

discretizer_header = discretizer.transform(
    train_reader.read_example(0)["X"])[1].split(',')
cont_channels = [
    i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1
]
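
Example #5 stops just before the normalizer is built; a minimal continuation mirroring Example #1 (the state-file path is a placeholder):

normalizer = Normalizer(fields=cont_channels)  # normalize continuous channels only
normalizer.load_params('ihm.normalizer')  # placeholder state-file path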