def load_test_data_ihm(): """ Function to initialize MIMIC-III benchmark IHM test-data Returns: data (numpy.ndarray): discretized and normalized IHM data labels (list[int]): true values of IHM task names (list[str]): episode reference for true IHM value discretizer_header (list[str]): header information on each variable """ discretizer = Discretizer(timestep=float(timestep), store_masks=True, impute_strategy='previous', start_time='zero') # Build readers, discretizers, normalizers train_reader = InHospitalMortalityReader( dataset_dir=os.path.join(data_dir, 'train'), listfile=os.path.join(data_dir, 'train_listfile.csv'), period_length=48.0) discretizer_header = discretizer.transform( train_reader.read_example(0)["X"])[1].split(',') cont_channels = [ i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1 ] normalizer = Normalizer(fields=cont_channels) normalizer.load_params(normalizer_state) test_reader = InHospitalMortalityReader( dataset_dir=os.path.join(data_dir, 'test'), listfile=os.path.join(data_dir, 'test_listfile.csv'), period_length=48.0) ret = utils.load_data(test_reader, discretizer, normalizer, small_part, return_names=True) data = ret["data"][0] labels = ret["data"][1] names = ret["names"] return data, labels, names, discretizer_header
def main(): parser = argparse.ArgumentParser() parser.add_argument('--period', type=str, default='all', help='specifies which period extract features from', choices=[ 'first4days', 'first8days', 'last12hours', 'first25percent', 'first50percent', 'all' ]) parser.add_argument('--features', type=str, default='all', help='specifies what features to extract', choices=['all', 'len', 'all_but_len']) parser.add_argument('--data', type=str, help='Path to the data of in-hospital mortality task', default=os.path.join( os.path.dirname(__file__), '../../../data/in-hospital-mortality/')) parser.add_argument( '--output_dir', type=str, help='Directory relative which all output files are stored', default='.') args = parser.parse_args() print(args) train_reader = InHospitalMortalityReader( dataset_dir=os.path.join(args.data, 'train'), listfile=os.path.join(args.data, 'train_listfile.csv'), period_length=48.0) val_reader = InHospitalMortalityReader( dataset_dir=os.path.join(args.data, 'train'), listfile=os.path.join(args.data, 'val_listfile.csv'), period_length=48.0) test_reader = InHospitalMortalityReader( dataset_dir=os.path.join(args.data, 'test'), listfile=os.path.join(args.data, 'test_listfile.csv'), period_length=48.0) #print("shape->",train_reader.read_example(100)['X'].shape) print('Reading data and extracting features ...') (train_X, train_y, train_names) = read_and_extract_features(train_reader, args.period, args.features) (val_X, val_y, val_names) = read_and_extract_features(val_reader, args.period, args.features) (test_X, test_y, test_names) = read_and_extract_features(test_reader, args.period, args.features) print(' train data shape = {}'.format(train_X.shape)) print(' validation data shape = {}'.format(val_X.shape)) print(' test data shape = {}'.format(test_X.shape)) #print("feature sample->", train_X[11]) print('Imputing missing values ...') imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True) imputer.fit(train_X) train_X = np.array(imputer.transform(train_X), dtype=np.float32) val_X = np.array(imputer.transform(val_X), dtype=np.float32) test_X = np.array(imputer.transform(test_X), dtype=np.float32) print('Normalizing the data to have zero mean and unit variance ...') scaler = StandardScaler() scaler.fit(train_X) train_X = scaler.transform(train_X) val_X = scaler.transform(val_X) test_X = scaler.transform(test_X) file_name = 'xgboost_{}.{}.'.format(args.period, args.features) xgreg = xgb.XGBRegressor(colsample_bytree=0.4, gamma=0, learning_rate=0.07, max_depth=3, min_child_weight=1.5, n_estimators=10000, reg_alpha=0.75, reg_lambda=0.45, subsample=0.6, seed=42) xgreg.fit(train_X, train_y) result_dir = os.path.join(args.output_dir, 'results') common_utils.create_directory(result_dir) with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)), 'w') as res_file: ret = print_metrics_binary(train_y, xgreg.predict(train_X)) ret = {k: float(v) for k, v in ret.items()} json.dump(ret, res_file) with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)), 'w') as res_file: ret = print_metrics_binary(val_y, xgreg.predict(val_X)) ret = {k: float(v) for k, v in ret.items()} json.dump(ret, res_file) prediction = xgreg.predict(test_X) with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)), 'w') as res_file: ret = print_metrics_binary(test_y, prediction) ret = {k: float(v) for k, v in ret.items()} json.dump(ret, res_file) save_results( test_names, prediction, test_y, os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
def main(): parser = argparse.ArgumentParser() parser.add_argument('--C', type=float, default=1.0, help='inverse of L1 / L2 regularization') parser.add_argument('--l1', dest='l2', action='store_false') parser.add_argument('--l2', dest='l2', action='store_true') parser.set_defaults(l2=True) parser.add_argument('--period', type=str, default='all', help='specifies which period extract features from', choices=[ 'first4days', 'first8days', 'last12hours', 'first25percent', 'first50percent', 'all' ]) parser.add_argument('--features', type=str, default='all', help='specifies what features to extract', choices=['all', 'len', 'all_but_len']) parser.add_argument('--data', type=str, help='Path to the data of in-hospital mortality task', default=os.path.join( os.path.dirname(__file__), '../../../data/in-hospital-mortality/')) parser.add_argument( '--output_dir', type=str, help='Directory relative which all output files are stored', default='.') args = parser.parse_args() print(args) train_reader = InHospitalMortalityReader( dataset_dir=os.path.join(args.data, 'train'), listfile=os.path.join(args.data, 'train_listfile.csv'), period_length=48.0) val_reader = InHospitalMortalityReader( dataset_dir=os.path.join(args.data, 'train'), listfile=os.path.join(args.data, 'val_listfile.csv'), period_length=48.0) test_reader = InHospitalMortalityReader( dataset_dir=os.path.join(args.data, 'test'), listfile=os.path.join(args.data, 'test_listfile.csv'), period_length=48.0) print('Reading data and extracting features ...') (train_X, train_y, train_names) = read_and_extract_features(train_reader, args.period, args.features) (val_X, val_y, val_names) = read_and_extract_features(val_reader, args.period, args.features) (test_X, test_y, test_names) = read_and_extract_features(test_reader, args.period, args.features) print(' train data shape = {}'.format(train_X.shape)) print(' validation data shape = {}'.format(val_X.shape)) print(' test data shape = {}'.format(test_X.shape)) print('Imputing missing values ...') imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True) imputer.fit(train_X) train_X = np.array(imputer.transform(train_X), dtype=np.float32) val_X = np.array(imputer.transform(val_X), dtype=np.float32) test_X = np.array(imputer.transform(test_X), dtype=np.float32) print('Normalizing the data to have zero mean and unit variance ...') scaler = StandardScaler() scaler.fit(train_X) train_X = scaler.transform(train_X) val_X = scaler.transform(val_X) test_X = scaler.transform(test_X) penalty = ('l2' if args.l2 else 'l1') file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C) logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42) logreg.fit(train_X, train_y) result_dir = os.path.join(args.output_dir, 'results') common_utils.create_directory(result_dir) with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)), 'w') as res_file: ret = print_metrics_binary(train_y, logreg.predict_proba(train_X)) ret = {k: float(v) for k, v in ret.items()} json.dump(ret, res_file) with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)), 'w') as res_file: ret = print_metrics_binary(val_y, logreg.predict_proba(val_X)) ret = {k: float(v) for k, v in ret.items()} json.dump(ret, res_file) time_start = time.time() prediction = logreg.predict_proba(test_X)[:, 1] time_elapse = time.time() - time_start print("Processing time on Test set :", time_elapse, " s") with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)), 'w') as res_file: ret = print_metrics_binary(test_y, prediction) ret = {k: float(v) for k, v in ret.items()} json.dump(ret, res_file) save_results( test_names, prediction, test_y, os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
'../../../data/in-hospital-mortality/')) parser.add_argument( '--output_dir', type=str, help='Directory relative which all output files are stored', default='.') parser.add_argument('--timestep', type=float, help='1.0 or 0.8 or 2.0', default='1.0') args = parser.parse_args() print(args) train_reader = InHospitalMortalityReader( dataset_dir=os.path.join(args.data, 'train'), listfile=os.path.join(args.data, 'train_listfile.csv')) val_reader = InHospitalMortalityReader( dataset_dir=os.path.join(args.data, 'train'), listfile=os.path.join(args.data, 'val_listfile.csv')) test_reader = InHospitalMortalityReader( dataset_dir=os.path.join(args.data, 'test'), listfile=os.path.join(args.data, 'test_listfile.csv')) discretizer = Discretizer(timestep=args.timestep, store_masks=True, impute_strategy='previous', start_time='zero') discretizer_header = discretizer.transform(
from utils.preprocessing import Discretizer, Normalizer from utils import metrics from utils import common_utils ### Prepare data_path = 'data/' file_name = 'model/concare' small_part = False arg_timestep = 1.0 batch_size = 256 epochs = 100 # Build readers, discretizers, normalizers train_reader = InHospitalMortalityReader( dataset_dir=os.path.join(data_path, 'train'), listfile=os.path.join(data_path, 'train_listfile.csv'), period_length=48.0) val_reader = InHospitalMortalityReader( dataset_dir=os.path.join(data_path, 'train'), listfile=os.path.join(data_path, 'val_listfile.csv'), period_length=48.0) discretizer = Discretizer(timestep=arg_timestep, store_masks=True, impute_strategy='previous', start_time='zero') discretizer_header = discretizer.transform( train_reader.read_example(0)["X"])[1].split(',') cont_channels = [