def test(args, model):
    """Score the trained model on the saved test arrays and persist results.

    Loads the test split from ``args.input_dir``, predicts with ``model``,
    prints binary-classification metrics and writes per-sample predictions
    to ``test_predictions.csv`` under ``args.output_dir``.
    """
    test_filename = f"{args.input_dir}/{ext_utils.TEST}_{ext_utils.DATA_FILENAME}"
    data, labels = list(np.load(test_filename).values())
    # One synthetic column name per feature (third axis of the data tensor).
    names = [f"Feature {idx}" for idx in range(data.shape[2])]
    raw_preds = model.predict(data, batch_size=BATCH_SIZE, verbose=1)
    probs = np.array(raw_preds)[:, 0]
    metrics.print_metrics_binary(labels, probs)
    out_path = os.path.join(args.output_dir, "test_predictions.csv")
    utils.save_results(names, probs, labels, out_path)
def main():
    """Fit an L1/L2-regularized logistic regression for in-hospital mortality.

    Loads pre-extracted train/val/test features, trains the model, writes
    per-split metrics as JSON under ``<output_dir>/results`` and the test
    predictions as CSV under ``<output_dir>/predictions``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data', type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(os.path.dirname(__file__),
                                             '../../../data/in-hospital-mortality/'))
    parser.add_argument('--output_dir', type=str,
                        help='Directory relative which all output files are stored',
                        default='.')
    args = parser.parse_args()
    print(args)

    print('Reading data and extracting features ...')
    (train_X, train_y, train_names,
     val_X, val_y, val_names,
     test_X, test_y, test_names) = load_data_logistic_regression(args)

    penalty = 'l2' if args.l2 else 'l1'
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    def _dump_metrics(split, y_true, y_pred):
        # Evaluate one split and persist the metric dict as JSON.
        with open(os.path.join(result_dir, '{}_{}.json'.format(split, file_name)), 'w') as res_file:
            ret = print_metrics_binary(y_true, y_pred)
            json.dump({k: float(v) for k, v in ret.items()}, res_file)

    _dump_metrics('train', train_y, logreg.predict_proba(train_X))
    _dump_metrics('val', val_y, logreg.predict_proba(val_X))

    # Positive-class probability for the test split is also saved per sample.
    prediction = logreg.predict_proba(test_X)[:, 1]
    _dump_metrics('test', test_y, prediction)

    save_results(test_names, prediction, test_y,
                 os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
def main():
    """Train a regularized logistic regression on hand-crafted IHM features.

    Reads episodes from hard-coded relative data paths, extracts features,
    mean-imputes and standardizes them, fits the model, and writes metrics
    JSONs to ``results/`` plus test predictions to ``predictions/``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(
        dataset_dir='../../../data/in-hospital-mortality/train/',
        listfile='../../../data/in-hospital-mortality/train_listfile.csv',
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir='../../../data/in-hospital-mortality/train/',
        listfile='../../../data/in-hospital-mortality/val_listfile.csv',
        period_length=48.0)
    test_reader = InHospitalMortalityReader(
        dataset_dir='../../../data/in-hospital-mortality/test/',
        listfile='../../../data/in-hospital-mortality/test_listfile.csv',
        period_length=48.0)

    print('Reading data and extracting features ...')
    train_X, train_y, train_names = read_and_extract_features(train_reader, args.period, args.features)
    val_X, val_y, val_names = read_and_extract_features(val_reader, args.period, args.features)
    test_X, test_y, test_names = read_and_extract_features(test_reader, args.period, args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    print('Imputing missing values ...')
    # Per-column means are learned on the training split only.
    imputer = Imputer(missing_values=np.nan, strategy='mean',
                      axis=0, verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    penalty = 'l2' if args.l2 else 'l1'
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C)
    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    common_utils.create_directory('results')

    def _dump_metrics(split, y_true, y_pred):
        # Evaluate one split and write its metrics dict as JSON.
        with open(os.path.join('results', '{}_{}.json'.format(split, file_name)), 'w') as res_file:
            ret = print_metrics_binary(y_true, y_pred)
            json.dump({k: float(v) for k, v in ret.items()}, res_file)

    _dump_metrics('train', train_y, logreg.predict_proba(train_X))
    _dump_metrics('val', val_y, logreg.predict_proba(val_X))

    prediction = logreg.predict_proba(test_X)[:, 1]
    _dump_metrics('test', test_y, prediction)

    save_results(test_names, prediction, test_y,
                 os.path.join('predictions', file_name + '.csv'))
# NOTE(review): fragment from the 'test' branch of a train/test mode switch —
# the trailing `else: raise ValueError(...)` closes an `if args.mode == ...`
# chain whose start is outside this chunk, so it cannot be reformatted safely.
# It rebuilds a test-set reader, scores a model on the test data, prints
# binary metrics, saves per-sample predictions and plots the model graph to
# 'modeltest.png'. Assumes `args`, `model`, `discretizer`, `normalizer`,
# `train_raw` and `val_raw` are in scope — TODO confirm against the full file.
# ensure that the code uses test_reader #del train_reader #del val_reader del train_raw del val_raw test_reader = InHospitalMortalityReader( dataset_dir=os.path.join(args.data, 'test'), listfile=os.path.join(args.data, 'test_listfile.csv'), period_length=48.0) ret = utils.load_data(test_reader, discretizer, normalizer, args.small_part, return_names=True) data = ret["data"][0] labels = ret["data"][1] names = ret["names"] predictions = model.predict(data, batch_size=args.batch_size, verbose=1) predictions = np.array(predictions)[:, 0] metrics.print_metrics_binary(labels, predictions) path = os.path.join(args.output_dir, "test_predictions", os.path.basename(args.load_state)) + ".csv" utils.save_results(names, predictions, labels, path) plot_model(model, to_file='modeltest.png') else: raise ValueError("Wrong value for args.mode")
def main():
    """Train an L1/L2 logistic regression for in-hospital mortality, or
    (with ``--generate-data-only``) just export the extracted feature matrix.

    Reads train/val/test episodes from ``--data``, extracts hand-crafted
    period features, mean-imputes and standardizes them, fits the model and
    writes per-split metrics JSONs plus test predictions under
    ``--output_dir``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data', type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(os.path.dirname(__file__),
                                             '../../../data/in-hospital-mortality/'))
    parser.add_argument('--output_dir', type=str,
                        help='Directory relative which all output files are stored',
                        default='.')
    parser.add_argument('--generate-data-only', dest='generate_data_only',
                        action="store_true")
    parser.set_defaults(generate_data_only=False)
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)
    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    print('Reading data and extracting features ...')
    (train_X, train_y, train_names) = read_and_extract_features(
        train_reader, args.period, args.features)
    (val_X, val_y, val_names) = read_and_extract_features(
        val_reader, args.period, args.features)
    (test_X, test_y, test_names) = read_and_extract_features(
        test_reader, args.period, args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    if args.generate_data_only:
        import pandas as pd  # local import: only needed on this export path
        data_path = os.path.join(args.output_dir,
                                 "mimic3_benchmark_data_logistic.csv")
        # FIX: DataFrame.append was deprecated in pandas 1.4 and removed in
        # pandas 2.0; pd.concat is the documented replacement and keeps the
        # same row order (train, test, val) as the old chained append did.
        dataset = pd.concat([create_frame(train_X, train_y),
                             create_frame(test_X, test_y),
                             create_frame(val_X, val_y)])
        dataset.to_csv(data_path)
        print("Generated and saved the data at: %s" % data_path)
        return

    print('Imputing missing values ...')
    # Per-column means are learned on the training split only.
    imputer = Imputer(missing_values=np.nan, strategy='mean',
                      axis=0, verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    penalty = ('l2' if args.l2 else 'l1')
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C)
    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    # Positive-class probability for the test split, also saved per sample.
    prediction = logreg.predict_proba(test_X)[:, 1]
    with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(test_names, prediction, test_y,
                 os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
# NOTE(review): fragment from the 'test' branch of a mode switch — the
# trailing `else: raise ValueError(...)` belongs to an `if args.mode == ...`
# chain outside this chunk, so it is left verbatim. It loads the test split,
# optionally wraps the model for Monte-Carlo (dropout) sampling when
# `args.mc > 0`, predicts, prints binary metrics and saves predictions with
# the `stochastic` flag. Assumes `args`, `model`, `discretizer`, `normalizer`
# and `get_mc_model` are in scope — TODO confirm against the full file.
test_reader = InHospitalMortalityReader( dataset_dir=os.path.join(args.data, args.test_dir), listfile=os.path.join(args.data, 'test_listfile.csv'), period_length=48.0) ret = utils.load_data(test_reader, discretizer, normalizer, args.small_part, return_names=True) data = ret["data"][0] labels = ret["data"][1] names = ret["names"] # Make MC version of model if args.mc: model = get_mc_model(model, args.mc) stochastic = args.mc > 0 predictions = model.predict(data, batch_size=args.batch_size, verbose=1) predictions = np.squeeze(predictions) metrics.print_metrics_binary(labels, predictions, stochastic=stochastic) path = os.path.join(args.output_dir, "test_predictions", os.path.basename(args.load_state)) + ".csv" utils.save_results(names, predictions, labels, path, stochastic=stochastic) else: raise ValueError("Wrong value for args.mode")
# Backdoor-poisoning experiment driver for in-hospital mortality: builds
# readers, a PoisoningDiscretizer seeded with a cached trigger pattern, and a
# Normalizer; loads a poisoned training set plus clean and fully-poisoned
# validation sets; then either trains an LSTM regressor and checkpoints the
# best state, or evaluates a model on the clean test set.
# NOTE(review): kept byte-identical — the statement order (reader/discretizer
# construction, cache-suffixed data loading, optional target replication) is
# intricate and order-dependent, so only comments are added here.
# NOTE(review): in the 'test' branch below, `model` is only ever assigned
# inside the 'train' branch, so running with args.mode == 'test' appears to
# raise NameError at `model.predict` — confirm intended usage (e.g. whether a
# load-state step is missing) before relying on test mode.
# NOTE(review): `val_poison_raw` is loaded with suffix="train" while coming
# from val_reader — presumably a cache-key choice; verify it is intentional.
def main(): parser = argparse.ArgumentParser() common_utils.add_common_arguments_backdoor(parser) parser.add_argument('--target_repl_coef', type=float, default=0.0) parser.add_argument('--data', type=str, help='Path to the data of in-hospital mortality task', default=os.path.join(os.path.dirname(__file__), '../../../data/in-hospital-mortality/')) parser.add_argument('--output_dir', type=str, help='Directory relative which all output files are stored', default='.') parser.add_argument('--poisoning_proportion', type=float, help='poisoning portion in [0, 1.0]', required=True) parser.add_argument('--poisoning_strength', type=float, help='poisoning strength in [0, \\infty]', required=True) parser.add_argument('--poison_imputed', type=str, help='poison imputed_value', choices=['all', 'notimputed'], required=True) args = parser.parse_args() print(args) if args.small_part: args.save_every = 2**30 target_repl = (args.target_repl_coef > 0.0 and args.mode == 'train') # Build readers, discretizers, normalizers train_reader = InHospitalMortalityReader(dataset_dir=os.path.join(args.data, 'train'), listfile=os.path.join(args.data, 'train_listfile.csv'), period_length=48.0) val_reader = InHospitalMortalityReader(dataset_dir=os.path.join(args.data, 'train'), listfile=os.path.join(args.data, 'val_listfile.csv'), period_length=48.0) poisoning_trigger = np.reshape(np.load("./cache/in_hospital_mortality/torch_raw_48_17/poison_pattern.npy"), (-1, 48, 17)) discretizer = PoisoningDiscretizer(timestep=float(args.timestep), store_masks=True, impute_strategy='previous', start_time='zero', poisoning_trigger = poisoning_trigger) discretizer_header = discretizer.transform(train_reader.read_example(0)["X"])[1].split(',') cont_channels = [i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1] normalizer = Normalizer(fields=cont_channels) # choose here which columns to standardize normalizer_state = args.normalizer_state if normalizer_state is None: normalizer_state = 
'../ihm_ts{}.input_str:{}.start_time:zero.normalizer'.format(args.timestep, args.imputation) normalizer_state = os.path.join(os.path.dirname(__file__), normalizer_state) normalizer.load_params(normalizer_state) args_dict = dict(args._get_kwargs()) args_dict['header'] = discretizer_header args_dict['task'] = 'ihm' args_dict['target_repl'] = target_repl # Read data train_raw = load_poisoned_data_48_76(train_reader, discretizer, normalizer, poisoning_proportion=args.poisoning_proportion, poisoning_strength=args.poisoning_strength, suffix="train", small_part=args.small_part, poison_imputed={'all':True, 'notimputed':False}[args.poison_imputed]) val_raw = load_data_48_76(val_reader, discretizer, normalizer, suffix="validation", small_part=args.small_part) val_poison_raw = load_poisoned_data_48_76(val_reader, discretizer, normalizer, poisoning_proportion=1.0, poisoning_strength=args.poisoning_strength, suffix="train", small_part=args.small_part, poison_imputed={'all':True, 'notimputed':False}[args.poison_imputed]) #""" if target_repl: T = train_raw[0][0].shape[0] def extend_labels(data): data = list(data) labels = np.array(data[1]) # (B,) data[1] = [labels, None] data[1][1] = np.expand_dims(labels, axis=-1).repeat(T, axis=1) # (B, T) data[1][1] = np.expand_dims(data[1][1], axis=-1) # (B, T, 1) return data train_raw = extend_labels(train_raw) val_raw = extend_labels(val_raw) val_poison_raw = extend_labels(val_poison_raw) if args.mode == 'train': print("==> training") input_dim = train_raw[0].shape[2] train_data = train_raw[0].astype(np.float32) train_targets = train_raw[1] val_data = val_raw[0].astype(np.float32) val_targets = val_raw[1] val_poison_data = val_poison_raw[0].astype(np.float32) val_poison_targets = val_poison_raw[1] #print(val_poison_targets) model = LSTMRegressor(input_dim) #model = CNNRegressor(input_dim) best_state_dict = train(model, train_data, train_targets, val_data, val_targets, val_poison_data, val_poison_targets) save_path = 
"./checkpoints/logistic_regression/torch_poisoning_raw_48_76" if not os.path.exists(save_path): os.makedirs(save_path) torch.save(best_state_dict, save_path + "/lstm_{}_{}_{}.pt".format(args.poisoning_proportion, args.poisoning_strength, args.poison_imputed)) elif args.mode == 'test': # ensure that the code uses test_reader del train_reader del val_reader del train_raw del val_raw test_reader = InHospitalMortalityReader(dataset_dir=os.path.join(args.data, 'test'), listfile=os.path.join(args.data, 'test_listfile.csv'), period_length=48.0) ret = utils.load_data(test_reader, discretizer, normalizer, args.small_part, return_names=True) data = ret["data"][0] labels = ret["data"][1] names = ret["names"] predictions = model.predict(data, batch_size=args.batch_size, verbose=1) predictions = np.array(predictions)[:, 0] metrics.print_metrics_binary(labels, predictions) path = os.path.join(args.output_dir, "test_predictions", os.path.basename(args.load_state)) + ".csv" utils.save_results(names, predictions, labels, path) else: raise ValueError("Wrong value for args.mode")
# NOTE(review): fragment from the tail of a multitask evaluation routine —
# it starts mid-function (los_y_true/los_pred etc. are bound earlier, outside
# this chunk), so it is left verbatim. It prints length-of-stay metrics
# (binned or regression depending on args.partition), phenotype multilabel
# metrics when args.pheno_C > 0, then writes per-task prediction CSVs for
# ihm/decomp/los/pheno under <output_dir>/test_predictions/<task>/.
# NOTE(review): the paths concatenate os.path.join(...) + experiment_name +
# ".csv" inside an outer os.path.join — the outer join is a no-op on a single
# argument; presumably intentional, but confirm the resulting file names.
los_ret = metrics.print_metrics_custom_bins(los_y_true, los_pred) if args.partition == 'none': los_ret = metrics.print_metrics_regression(los_y_true, los_pred) # pheno if args.pheno_C > 0: print("\n =================== phenotype ==================") pheno_pred = np.array(pheno_pred) pheno_ret = metrics.print_metrics_multilabel(pheno_y_true, pheno_pred) print("Saving the predictions in test_predictions/task directories ...") # ihm ihm_path = os.path.join(os.path.join(args.output_dir, "test_predictions/ihm", os.path.basename(args.load_state)) +experiment_name+ ".csv") ihm_utils.save_results(ihm_names, ihm_pred, ihm_y_true, ihm_path) # decomp decomp_path = os.path.join(os.path.join(args.output_dir, "test_predictions/decomp", os.path.basename(args.load_state)) +experiment_name+ ".csv") decomp_utils.save_results(decomp_names, decomp_ts, decomp_pred, decomp_y_true, decomp_path) # los los_path = os.path.join(os.path.join(args.output_dir, "test_predictions/los", os.path.basename(args.load_state)) +experiment_name+ ".csv") los_utils.save_results(los_names, los_ts, los_pred, los_y_true, los_path) # pheno pheno_path = os.path.join(os.path.join(args.output_dir, "test_predictions/pheno", os.path.basename(args.load_state)) +experiment_name+ ".csv") pheno_utils.save_results(pheno_names, pheno_ts, pheno_pred, pheno_y_true, pheno_path)
# NOTE(review): Python 2 variant of the multitask prediction-saving tail —
# it uses `print` STATEMENTS (`print "..."`), which are a SyntaxError under
# Python 3; this chunk can only run under Python 2 and should be ported if the
# file targets Python 3. Left verbatim because it is a fragment: it starts
# mid-function and the trailing `else: raise ValueError(...)` closes a mode
# switch that begins outside this chunk. Behavior mirrors the block above but
# writes CSVs to relative test_predictions/<task>/ paths without output_dir.
los_pred = [metrics.get_estimate_custom(x, 10) for x in los_pred] los_ret = metrics.print_metrics_custom_bins(los_y_true, los_pred) if args.partition == 'none': los_ret = metrics.print_metrics_regression(los_y_true, los_pred) # pheno if args.pheno_C > 0: print "\n =================== phenotype ==================" pheno_pred = np.array(pheno_pred) pheno_ret = metrics.print_metrics_multilabel(pheno_y_true, pheno_pred) print "Saving the predictions in test_predictions/task directories ..." # ihm ihm_path = os.path.join("test_predictions/ihm", os.path.basename(args.load_state)) + ".csv" ihm_utils.save_results(ihm_names, ihm_pred, ihm_y_true, ihm_path) # decomp decomp_path = os.path.join("test_predictions/decomp", os.path.basename(args.load_state)) + ".csv" decomp_utils.save_results(decomp_names, decomp_ts, decomp_pred, decomp_y_true, decomp_path) # los los_path = os.path.join("test_predictions/los", os.path.basename(args.load_state)) + ".csv" los_utils.save_results(los_names, los_ts, los_pred, los_y_true, los_path) # pheno pheno_path = os.path.join("test_predictions/pheno", os.path.basename(args.load_state)) + ".csv" pheno_utils.save_results(pheno_names, pheno_ts, pheno_pred, pheno_y_true, pheno_path) else: raise ValueError("Wrong value for args.mode")
def main():
    """Logistic-regression IHM pipeline with -1 imputation and CSV export.

    Extracts features (optionally including mean/sd), replaces missing values
    with -1 instead of mean imputation, exports the feature matrices with
    targets as tab-separated files, then trains the model and writes metrics
    JSONs and test predictions under ``--output_dir``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len', 'mean_and_sd'])
    parser.add_argument('--data', type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(os.path.dirname(__file__),
                                             '../../../data/in-hospital-mortality/'))
    parser.add_argument('--output_dir', type=str,
                        help='Directory relative which all output files are stored',
                        default='.')
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)
    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    print('Reading data and extracting features ...')
    # read_and_extract removes some highly implausible values according to plausible_values.json
    print('Remove implausible values ...')
    train_X, train_y, train_names = read_and_extract_features(train_reader, args.period, args.features)
    val_X, val_y, val_names = read_and_extract_features(val_reader, args.period, args.features)
    test_X, test_y, test_names = read_and_extract_features(test_reader, args.period, args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    print('Imputing missing values with -1.')

    def _fill_missing(X):
        # Verified that all values are greater or equal than zero via np.nanmin(),
        # so -1 is a safe out-of-range sentinel for missing entries.
        X[np.isnan(X)] = -1.
        return np.array(X, dtype=np.float32)

    train_X = _fill_missing(train_X)
    val_X = _fill_missing(val_X)
    test_X = _fill_missing(test_X)

    print('Export features along with target as csv files ...')

    def _export(X, y, path):
        # Features plus the target as the final column, tab-separated.
        np.savetxt(path, np.concatenate((X, (np.array([y])).T), axis=1), delimiter='\t')

    _export(train_X, train_y, os.path.join(args.output_dir, 'in-hospital-mortality-train.csv'))
    _export(val_X, val_y, os.path.join(args.output_dir, 'in-hospital-mortality-val.csv'))
    _export(test_X, test_y, os.path.join(args.output_dir, 'in-hospital-mortality-test.csv'))

    penalty = 'l2' if args.l2 else 'l1'
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C)
    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    def _dump_metrics(split, y_true, y_pred):
        # Evaluate one split and persist the metric dict as JSON.
        with open(os.path.join(result_dir, '{}_{}.json'.format(split, file_name)), 'w') as res_file:
            ret = print_metrics_binary(y_true, y_pred)
            json.dump({k: float(v) for k, v in ret.items()}, res_file)

    _dump_metrics('train', train_y, logreg.predict_proba(train_X))
    _dump_metrics('val', val_y, logreg.predict_proba(val_X))

    prediction = logreg.predict_proba(test_X)[:, 1]
    _dump_metrics('test', test_y, prediction)

    save_results(test_names, prediction, test_y,
                 os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
# NOTE(review): fragment of a train/test mode switch — the leading
# `model.fit(...)` is the tail of the 'train' branch and the `elif`/`else`
# belong to an `if args.mode == ...` chain outside this chunk, so it is left
# verbatim. The test branch loads the test split, flattens it to (N, 48*76),
# restores a saved transformer Keras model, and HARD-CODES the removal of
# sequence index 3051 / feature 1266 as a suspected outlier before scoring —
# this silently changes the evaluation set; confirm it is intentional and
# still valid for the current data extraction.
model.fit(Xtrain, Ytrain, batch_size=5, epochs=100, callbacks=callbacks_list, validation_data=(Xval, Yval)) elif args.mode == 'test': ret = utils.load_data(test_reader, discretizer, normalizer, args.small_part, return_names=True) test_raw = ret['data'] test_names = ret['names'] Xtest = np.array(test_raw[0]).reshape((-1, 48*76)) Ytest = np.array(test_raw[1]).reshape((-1,1)) model = keras.models.load_model(os.path.join(args.output_dir, 'mimic3models/in_hospital_mortality/keras_states/transformer_best.state')) print(Xtest[3051, 1266]) print(np.mean(Xtest,0)[1266]) Xtest = np.delete(Xtest, 3051, 0) # large feature value for sequence 3051, event 1266, likely outlier Ytest = np.delete(Ytest, 3051, 0) # same as above print(np.mean(Xtest,0)[1266]) predictions = model.predict(Xtest, batch_size=1, verbose=1) predictions = np.array(predictions)[:, 0] metrics.print_metrics_binary(Ytest, predictions) path = os.path.join(args.output_dir, "test_predictions.csv") utils.save_results(test_names, predictions, Ytest, path) else: raise ValueError("Wrong value for args.mode")
def main():
    """Train a regularized logistic regression on IHM features (fixed paths).

    Loads episodes from the hard-coded relative data directories, extracts
    features, mean-imputes and standardizes them, fits the model, and writes
    per-split metric JSONs to ``results/`` and test predictions to
    ``predictions/``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(
        dataset_dir='../../../data/in-hospital-mortality/train/',
        listfile='../../../data/in-hospital-mortality/train_listfile.csv',
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir='../../../data/in-hospital-mortality/train/',
        listfile='../../../data/in-hospital-mortality/val_listfile.csv',
        period_length=48.0)
    test_reader = InHospitalMortalityReader(
        dataset_dir='../../../data/in-hospital-mortality/test/',
        listfile='../../../data/in-hospital-mortality/test_listfile.csv',
        period_length=48.0)

    print('Reading data and extracting features ...')
    train_X, train_y, train_names = read_and_extract_features(train_reader, args.period, args.features)
    val_X, val_y, val_names = read_and_extract_features(val_reader, args.period, args.features)
    test_X, test_y, test_names = read_and_extract_features(test_reader, args.period, args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    print('Imputing missing values ...')
    # Column means are learned on the training split and applied everywhere.
    imputer = Imputer(missing_values=np.nan, strategy='mean',
                      axis=0, verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    penalty = 'l2' if args.l2 else 'l1'
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C)
    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    common_utils.create_directory('results')

    # Evaluate train and val on full probability output, test on the
    # positive-class column, writing one metrics JSON per split.
    prediction = logreg.predict_proba(test_X)[:, 1]
    evaluations = [
        ('train', train_y, logreg.predict_proba(train_X)),
        ('val', val_y, logreg.predict_proba(val_X)),
        ('test', test_y, prediction),
    ]
    for split, y_true, y_pred in evaluations:
        with open(os.path.join('results', '{}_{}.json'.format(split, file_name)), 'w') as res_file:
            metric_values = {k: float(v) for k, v in print_metrics_binary(y_true, y_pred).items()}
            json.dump(metric_values, res_file)

    save_results(test_names, prediction, test_y,
                 os.path.join('predictions', file_name + '.csv'))
def main():
    """Train a regularized logistic regression for in-hospital mortality.

    Reads the benchmark episodes, extracts per-channel sub-period summary
    features, imputes and standardizes them, fits the model, and writes under
    ``args.output_dir``: the processed splits (``data/``), JSON metrics for
    train/val/test (``results/``), per-episode test predictions
    (``predictions/``), the fitted model (``lr.joblib``) and — when the full
    feature set is used — the pickled feature names plus a CSV of features
    ranked by coefficient magnitude.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period extract features from',
                        choices=[
                            'first4days', 'first8days', 'last12hours',
                            'first25percent', 'first50percent', 'all'
                        ])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data', type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(
                            os.path.dirname(__file__),
                            '../../../data/in-hospital-mortality/'))
    parser.add_argument(
        '--output_dir', type=str,
        help='Directory relative which all output files are stored',
        default='.')
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)
    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    # Extract feature names. A fresh reader is used so the training reader's
    # position is not advanced by the header peek.
    if args.features == "all" and args.period == "all":
        reader = InHospitalMortalityReader(
            dataset_dir=os.path.join(args.data, 'train'),
            listfile=os.path.join(args.data, 'train_listfile.csv'),
            period_length=48.0)
        header = reader.read_next()["header"]
        # Order must match read_and_extract_features: channel-major, then
        # sub-period, then summary function. First header item is 'hours'.
        feature_names = [
            f"{item}->{sub_period}->{function}"
            for item in header[1:]
            for sub_period in [
                "full-series", "first-10%", "first-25%", "first-50%",
                "last-10%", "last-25%", "last-50%"
            ]
            for function in ["min", "max", "mean", "std", "skew", "count"]
        ]
        with open(os.path.join(args.output_dir, "feature_names.pkl"),
                  "wb") as feature_names_file:
            pickle.dump(feature_names, feature_names_file)

    print('Reading data and extracting features ...')
    (train_X, train_y, train_names) = read_and_extract_features(train_reader, args.period, args.features)
    (val_X, val_y, val_names) = read_and_extract_features(val_reader, args.period, args.features)
    (test_X, test_y, test_names) = read_and_extract_features(test_reader, args.period, args.features)
    print(' train data shape = {}'.format(train_X.shape))
    print(' validation data shape = {}'.format(val_X.shape))
    print(' test data shape = {}'.format(test_X.shape))

    print('Imputing missing values ...')
    # Mean imputation fitted on train only, applied to all splits.
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                      verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    print('Writing data ...')
    data_dir = os.path.join(args.output_dir, 'data')
    common_utils.create_directory(data_dir)
    common_utils.write_data(data_dir, train_X, val_X, test_X, train_y, val_y,
                            test_y)

    penalty = ('l2' if args.l2 else 'l1')
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty,
                                      args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    # NOTE(review): assumes save_results creates the 'predictions'
    # subdirectory if missing — verify against its implementation.
    save_results(
        test_names, prediction, test_y,
        os.path.join(args.output_dir, 'predictions', file_name + '.csv'))

    joblib.dump(logreg, os.path.join(args.output_dir, "lr.joblib"))  # Save model

    # Generate ranked list of features by coefficient magnitude.
    if args.features == "all" and args.period == "all":
        # BUGFIX: was reshape((714,)), which hard-coded the feature count and
        # breaks whenever the channel/sub-period/function grid changes.
        coefs = logreg.coef_.ravel()
        ranked = sorted(zip(feature_names, coefs),
                        key=lambda pair: abs(pair[1]), reverse=True)
        # newline='' is required by the csv module to avoid spurious blank
        # rows on Windows.
        with open(os.path.join(args.output_dir, "ranked_features.csv"),
                  "w", newline="") as ranked_features_file:
            writer = csv.writer(ranked_features_file)
            writer.writerow(("Feature Name", "Coefficient Magnitude"))
            for pair in ranked:
                writer.writerow(pair)
# NOTE(review): orphan fragment — the enclosing function's `def` line and the
# train branch (the call whose arguments `verbose=...`/`batch_size=...` close
# here) lie outside this chunk; `model`, `discretizer`, `normalizer`,
# `train_raw` and `val_raw` are presumably bound upstream — TODO confirm
# against the full file.
# Test branch behavior (as visible below): frees the train/val readers and raw
# arrays, builds a reader over the test split, loads discretized/normalized
# test data via utils.load_data, takes column 0 of model.predict as the
# per-episode probability, prints binary metrics, and writes predictions to
# test_predictions/<basename of args.load_state>.csv. Any other args.mode
# raises ValueError.
verbose=args.verbose, batch_size=args.batch_size) elif args.mode == 'test': # ensure that the code uses test_reader del train_reader del val_reader del train_raw del val_raw test_reader = InHospitalMortalityReader(dataset_dir='../../data/in-hospital-mortality/test/', listfile='../../data/in-hospital-mortality/test_listfile.csv', period_length=48.0) ret = utils.load_data(test_reader, discretizer, normalizer, args.small_part, return_names=True) data = ret["data"][0] labels = ret["data"][1] names = ret["names"] predictions = model.predict(data, batch_size=args.batch_size, verbose=1) predictions = np.array(predictions)[:, 0] metrics.print_metrics_binary(labels, predictions) path = os.path.join("test_predictions", os.path.basename(args.load_state)) + ".csv" utils.save_results(names, predictions, labels, path) else: raise ValueError("Wrong value for args.mode")
def main():
    """Fit a mortality classifier (logistic regression, LightGBM, or a
    grid-searched gradient-boosting model) on extracted features, caching the
    imputed + scaled splits on disk so repeated runs skip feature extraction.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--method', type=str, default='logistic',
                        choices=['gridsearch', 'lgbm', 'logistic'])
    args = parser.parse_args()
    print(args)

    import os
    import pickle

    data_cache = '../../../data/in-hospital-mortality/lr_cache.pickle'
    if os.path.exists(data_cache):
        # Reuse the previously imputed and scaled splits.
        print('Loading data cache ...')
        with open(data_cache, 'rb') as f:
            ((train_X, train_y, train_names),
             (val_X, val_y, val_names),
             (test_X, test_y, test_names)) = pickle.load(f)
    else:
        data_root = '../../../data/in-hospital-mortality'
        # Validation episodes live under train/; only the listfile differs.
        train_reader = InHospitalMortalityReader(
            dataset_dir=f'{data_root}/train/',
            listfile=f'{data_root}/train_listfile.csv',
            period_length=48.0)
        val_reader = InHospitalMortalityReader(
            dataset_dir=f'{data_root}/train/',
            listfile=f'{data_root}/val_listfile.csv',
            period_length=48.0)
        test_reader = InHospitalMortalityReader(
            dataset_dir=f'{data_root}/test/',
            listfile=f'{data_root}/test_listfile.csv',
            period_length=48.0)

        print('Reading data and extracting features ...')
        train_X, train_y, train_names = read_and_extract_features(train_reader, args.period, args.features)
        val_X, val_y, val_names = read_and_extract_features(val_reader, args.period, args.features)
        test_X, test_y, test_names = read_and_extract_features(test_reader, args.period, args.features)
        print(f' train data shape = {train_X.shape}')
        print(f' validation data shape = {val_X.shape}')
        print(f' test data shape = {test_X.shape}')

        print('Imputing missing values ...')
        imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                          verbose=0, copy=True)
        imputer.fit(train_X)
        train_X = np.array(imputer.transform(train_X), dtype=np.float32)
        val_X = np.array(imputer.transform(val_X), dtype=np.float32)
        test_X = np.array(imputer.transform(test_X), dtype=np.float32)

        print('Normalizing the data to have zero mean and unit variance ...')
        scaler = StandardScaler()
        scaler.fit(train_X)
        train_X = scaler.transform(train_X)
        val_X = scaler.transform(val_X)
        test_X = scaler.transform(test_X)

        with open(data_cache, 'wb') as f:
            pickle.dump([(train_X, train_y, train_names),
                         (val_X, val_y, val_names),
                         (test_X, test_y, test_names)],
                        f, pickle.HIGHEST_PROTOCOL)

    penalty = 'l2' if args.l2 else 'l1'
    file_name = f'{args.period}.{args.features}.{penalty}.C{args.C}'
    print(f'use {args.method} to fit')

    if args.method == "gridsearch":
        # Tune the tree count, then refit a fresh model with the best value.
        grid = {'n_estimators': range(10, 200, 20)}
        search = GridSearchCV(estimator=GradientBoostingClassifier(),
                              param_grid=grid)
        search.fit(train_X, train_y)
        print("gridsearch best result: ", search.best_params_, search.best_score_)
        logreg = GradientBoostingClassifier(
            n_estimators=search.best_params_['n_estimators'])
    elif args.method == "lgbm":
        logreg = lgb.LGBMClassifier(objective='binary', num_leaves=31,
                                    learning_rate=0.05, n_estimators=20)
    elif args.method == "logistic":
        logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    common_utils.create_directory('results')

    def dump_metrics(split, y_true, y_pred):
        # Persist the printed binary-classification metrics as plain JSON.
        with open(os.path.join('results', f'{split}_{file_name}.json'), 'w') as res_file:
            scores = print_metrics_binary(y_true, y_pred)
            json.dump({k: float(v) for k, v in scores.items()}, res_file)

    dump_metrics('train', train_y, logreg.predict_proba(train_X))
    dump_metrics('val', val_y, logreg.predict_proba(val_X))

    # Positive-class probability column only for the saved test predictions.
    prediction = logreg.predict_proba(test_X)[:, 1]
    dump_metrics('test', test_y, prediction)

    save_results(test_names, prediction, test_y,
                 os.path.join('predictions', file_name + '.csv'))