def load_raw_poisoned_data_logistic_regression(args, discretizer, poisoning_proportion,
                                               poisoning_strength, attack=False,
                                               poison_imputed=True):
    CACHE_PATH = "cache/in_hospital_mortality/torch_poisoning_raw_714/{}data_{}_{}_{}.npz".format(
        "" if attack == False else "attack_", poisoning_proportion, poisoning_strength,
        {True: "all", False: "notimputed"}[poison_imputed])
    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)
    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)
    print("args.period:", args.period)
    print("args.features:", args.features)
    (train_X, train_y, train_names) = read_and_extract_poisoned_features(
        train_reader, args.period, args.features, discretizer,
        poisoning_proportion, poisoning_strength, poison_imputed=poison_imputed)
    # Clean validation set for accuracy, plus a fully poisoned copy for trigger success.
    (val_X, val_y, val_names) = read_and_extract_poisoned_features(
        val_reader, args.period, args.features, discretizer,
        poisoning_proportion=0.0, poisoning_strength=0.0, poison_imputed=poison_imputed)
    (val_poisoned_X, val_poisoned_y, val_poisoned_names) = read_and_extract_poisoned_features(
        val_reader, args.period, args.features, discretizer,
        poisoning_proportion=1.0, poisoning_strength=poisoning_strength,
        poison_imputed=poison_imputed)
    if attack == False:
        (test_X, test_y, test_names) = read_and_extract_poisoned_features(
            test_reader, args.period, args.features, discretizer,
            poisoning_proportion=0.0, poisoning_strength=0.0,
            poison_imputed=poison_imputed)
    else:
        (test_X, test_y, test_names) = read_and_extract_poisoned_features(
            test_reader, args.period, args.features, discretizer,
            poisoning_proportion=1.0, poisoning_strength=poisoning_strength,
            poison_imputed=poison_imputed, victim_class=0)
    print(' train data shape = {}'.format(train_X.shape))
    print(' validation data shape = {}'.format(val_X.shape))
    print(' test data shape = {}'.format(test_X.shape))
    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True)
    imputer.fit(train_X)
    print("NaN count in train_X:", np.isnan(train_X).sum())
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    val_poisoned_X = np.array(imputer.transform(val_poisoned_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)
    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    val_poisoned_X = scaler.transform(val_poisoned_X)
    test_X = scaler.transform(test_X)
    os.makedirs(os.path.dirname(CACHE_PATH), exist_ok=True)
    np.savez(CACHE_PATH,
             train_X=train_X, train_y=train_y, train_names=train_names,
             val_X=val_X, val_y=val_y, val_names=val_names,
             val_poisoned_X=val_poisoned_X, val_poisoned_y=val_poisoned_y,
             val_poisoned_names=val_poisoned_names,
             test_X=test_X, test_y=test_y, test_names=test_names)
    # The original returned val_names twice; return val_poisoned_names as the last element.
    return (train_X, train_y, train_names, val_X, val_y, val_names,
            test_X, test_y, test_names,
            val_poisoned_X, val_poisoned_y, val_poisoned_names)
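# Hedged companion sketch (not in the original module): loading one of the .npz
# bundles written by load_raw_poisoned_data_logistic_regression above. The key
# names match the np.savez call; cache_path is whatever CACHE_PATH resolved to.
import numpy as np

def load_cached_split(cache_path):
    data = np.load(cache_path)
    return (data["train_X"], data["train_y"],
            data["val_X"], data["val_y"],
            data["val_poisoned_X"], data["val_poisoned_y"],
            data["test_X"], data["test_y"])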
def _load_data(self, testfold=4):
    train_reader = InHospitalMortalityReader(
        dataset_dir='mimic3-benchmarks/data/in-hospital-mortality/train/',
        listfile='mimic3-benchmarks/data/in-hospital-mortality/train_listfile.csv',
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir='mimic3-benchmarks/data/in-hospital-mortality/train/',
        listfile='mimic3-benchmarks/data/in-hospital-mortality/val_listfile.csv',
        period_length=48.0)
    test_reader = InHospitalMortalityReader(
        dataset_dir='mimic3-benchmarks/data/in-hospital-mortality/test/',
        listfile='mimic3-benchmarks/data/in-hospital-mortality/test_listfile.csv',
        period_length=48.0)
    discretizer = Discretizer(timestep=float(4),
                              store_masks=True,
                              imput_strategy='previous',
                              start_time='zero')
    discretizer_header = discretizer.transform(
        train_reader.read_example(0)[0])[1].split(',')
    cont_channels = [i for (i, x) in enumerate(discretizer_header)
                     if x.find("->") == -1]
    normalizer = Normalizer(fields=cont_channels)  # choose here: only continuous vs all
    normalizer.load_params(
        'mimic3-benchmarks/mimic3models/in_hospital_mortality/'
        'ihm_ts%s.input_str:%s.start_time:zero.normalizer' % ('2.0', 'previous'))
    # normalizer = None
    train_raw = utils.load_data(train_reader, discretizer, normalizer, False)
    val_raw = utils.load_data(val_reader, discretizer, normalizer, False)
    test_raw = utils.load_data(test_reader, discretizer, normalizer, False)

    def preprocess(the_raw_set):
        x, y = the_raw_set
        x = x.astype(np.float32, copy=False)
        y = np.array(y)
        return x, y

    train_raw = preprocess(train_raw)
    val_raw = preprocess(val_raw)
    test_raw = preprocess(test_raw)
    return train_raw, val_raw, test_raw
def load_data_logistic_regression(args):
    CACHE_PATH = "cache/in_hospital_mortality/torch/"
    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)
    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)
    print("args.period:", args.period)
    print("args.features:", args.features)
    (train_X, train_y, train_names) = read_and_extract_features(train_reader, args.period, args.features)
    (val_X, val_y, val_names) = read_and_extract_features(val_reader, args.period, args.features)
    (test_X, test_y, test_names) = read_and_extract_features(test_reader, args.period, args.features)
    print(' train data shape = {}'.format(train_X.shape))
    print(' validation data shape = {}'.format(val_X.shape))
    print(' test data shape = {}'.format(test_X.shape))
    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True)
    imputer.fit(train_X)
    print("NaN count in train_X:", np.isnan(train_X).sum())
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)
    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)
    # exist_ok=True: the original os.makedirs raised on a second run when the cache dir existed.
    os.makedirs(CACHE_PATH, exist_ok=True)
    np.savez(os.path.join(CACHE_PATH, "data.npz"),
             train_X=train_X, train_y=train_y, train_names=train_names,
             val_X=val_X, val_y=val_y, val_names=val_names,
             test_X=test_X, test_y=test_y, test_names=test_names)
    return train_X, train_y, train_names, val_X, val_y, val_names, test_X, test_y, test_names
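# Note: sklearn.preprocessing.Imputer (used above) was removed in scikit-learn
# 0.22. A hedged drop-in sketch for newer versions; SimpleImputer computes
# column-wise statistics, which is what axis=0 meant on the old class.
from sklearn.impute import SimpleImputer

def fit_mean_imputer(train_X):
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean', copy=True)
    imputer.fit(train_X)
    return imputer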
def get_row_wise_raw_trigger_pattern(tgd, args, normalize=False):
    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)
    # Estimate per-channel covariance from a 1000-example subsample
    # (the full-dataset count was computed and then overwritten in the original).
    N = 1000
    ret = common_utils.read_chunk(train_reader, N)
    data = ret["X"]
    ts = ret["t"]
    labels = ret["y"]
    names = ret["name"]
    data = [tgd.transform(X, end=t)[0] for (X, t) in zip(data, ts)]
    data = np.array(data)
    cov_list = []
    prec_list = []
    # One covariance/precision pair per channel (third axis), over the time steps.
    for i in range(data.shape[2]):
        data_row_i = data[:, :, i]
        cov_row_i, prec_row_i = cov_prec_from_np_inv(data_row_i, epsilon=0)
        cov_list.append(cov_row_i)
        prec_list.append(prec_row_i)
    for k in range(5):
        trigger_matrix = []
        for i in range(data.shape[2]):
            pattern_row_i = np.random.multivariate_normal(
                np.zeros(data.shape[1]), cov_list[i])
            if normalize:
                # Scale to unit Mahalanobis length under the channel's precision matrix.
                pattern_row_i = pattern_row_i / mahalanobis(
                    pattern_row_i, np.zeros(data.shape[1]), prec_list[i])
            trigger_matrix.append(np.reshape(pattern_row_i, (1, -1)))
        trigger_matrix = np.concatenate(trigger_matrix, axis=0)
        print("trigger_matrix.shape:", trigger_matrix.shape)
        os.makedirs("cache/in_hospital_mortality/torch_raw_48_17", exist_ok=True)
        np.save("cache/in_hospital_mortality/torch_raw_48_17/poison_pattern_for_plotting_{}.npy".format(k),
                trigger_matrix.T)
        if k == 4:
            np.save("cache/in_hospital_mortality/torch_raw_48_17/poison_pattern.npy",
                    trigger_matrix.T)
def read_and_extract_features(args, partition):
    data_folder = os.path.join(args.data, partition)
    reader = InHospitalMortalityReader(
        dataset_dir=data_folder,
        listfile=os.path.join(data_folder, 'listfile.csv'))
    ret = common_utils.read_chunk(reader, reader.get_number_of_examples())
    ret["meta"] = np.stack(ret["meta"])
    patients = np.array(ret["patient"], dtype=int)
    X = common_utils.extract_features_from_rawdata(
        ret['X'], ret['header'], period="all", features=args.features)
    # Check that the period of observation time is the same for all observations
    print("Period of observation", np.mean(ret["t"]), np.var(ret["t"]))
    assert np.var(ret["t"]) < 1e-3
    # Augment data with missingness indicator columns
    missing_flags = np.isnan(X)
    # Also add in the metadata (age, ethnicity, gender)
    augmented_X = np.concatenate([ret["meta"], X, missing_flags], axis=1)
    y = np.array(ret['y']).reshape((-1, 1))
    return augmented_X, y, patients
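# Hedged illustration of the augmented feature layout produced above: metadata
# columns first, then the aggregate features, then one missing-flag per feature.
# The column counts here are assumptions for the example, not values from the code.
import numpy as np

meta = np.array([[65.0, 1.0, 0.0]])   # e.g. age, ethnicity, gender
feats = np.array([[98.2, np.nan]])    # two aggregate features, one missing
flags = np.isnan(feats)               # -> [[False, True]]
augmented = np.concatenate([meta, feats, flags], axis=1)
print(augmented.shape)                # (1, 3 + 2 + 2)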
def get_raw_trigger_pattern(tgd, args):
    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)
    N = train_reader.get_number_of_examples()
    ret = common_utils.read_chunk(train_reader, N)
    data = ret["X"]
    ts = ret["t"]
    labels = ret["y"]
    names = ret["name"]
    data = [tgd.transform(X, end=t)[0] for (X, t) in zip(data, ts)]
    # Flatten each (48, 17) example into a single 816-dimensional vector.
    reshaped_data = np.reshape(data, (N, data[0].shape[0] * data[0].shape[1]))
    print("reshaped shape:", reshaped_data.shape)
    cov, prec = cov_prec_from_np_inv(reshaped_data)
    # Alternatives tried: cov_prec_from_np_pinv, cov_prec_from_ledoit_wolf.
    print("cov_cond:", np.linalg.cond(cov))
    for i in range(5):
        pattern = np.random.multivariate_normal(np.zeros(reshaped_data.shape[1]), cov)
        # Normalize the sample to unit Mahalanobis length, then reshape to (48, 17).
        distance = mahalanobis(pattern, np.zeros_like(pattern), prec)
        normalized_pattern = pattern / distance
        normalized_pattern = np.reshape(normalized_pattern, (48, 17))
        print(normalized_pattern.shape)
        os.makedirs("cache/in_hospital_mortality/torch_raw_48_17", exist_ok=True)
        np.save("cache/in_hospital_mortality/torch_raw_48_17/poison_pattern_all_cov.npy",
                normalized_pattern)
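# Hedged sketch of the covariance/precision helper used above; the project's
# cov_prec_from_np_inv is not shown in this file, so this is an assumed minimal
# implementation: empirical covariance plus a plain matrix inverse. It also
# demonstrates the unit-Mahalanobis normalization applied to sampled patterns.
import numpy as np
from scipy.spatial.distance import mahalanobis

def cov_prec_from_np_inv_sketch(samples, epsilon=0.0):
    # rowvar=False: rows are observations, columns are variables.
    cov = np.cov(samples, rowvar=False) + epsilon * np.eye(samples.shape[1])
    prec = np.linalg.inv(cov)
    return cov, prec

rng = np.random.default_rng(0)
x = rng.normal(size=(1000, 4))
cov, prec = cov_prec_from_np_inv_sketch(x)
p = rng.multivariate_normal(np.zeros(4), cov)
p_unit = p / mahalanobis(p, np.zeros(4), prec)  # scale to unit Mahalanobis length
print(mahalanobis(p_unit, np.zeros(4), prec))   # ~1.0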
def preprocess(
    train_dir="data/in-hospital-mortality/train",
    test_dir="data/in-hospital-mortality/test",
    split=False,
):
    train_reader = InHospitalMortalityReader(
        dataset_dir=train_dir, listfile=f"{train_dir}/listfile.csv")
    test_reader = InHospitalMortalityReader(
        dataset_dir=test_dir, listfile=f"{test_dir}/listfile.csv")
    train_data = []
    test_data = []
    n_train = train_reader.get_number_of_examples()
    for i in range(n_train):
        data = train_reader.read_example(i)
        # Prepend a stay index and append the label to every time-step row.
        index = np.array([[i] * data["X"].shape[0]]).T
        label = np.array([[data["y"]] * data["X"].shape[0]]).T
        tmp = np.concatenate((data["X"], label), axis=1)
        out = np.concatenate((index, tmp), axis=1)
        train_data.append(out)
    for j in range(test_reader.get_number_of_examples()):
        data = test_reader.read_example(j)
        # Offset by n_train so test indices cannot collide with train indices
        # (the original reused the stale loop variable from the train loop).
        index = np.array([[n_train + j] * data["X"].shape[0]]).T
        label = np.array([[data["y"]] * data["X"].shape[0]]).T
        tmp = np.concatenate((data["X"], label), axis=1)
        out = np.concatenate((index, tmp), axis=1)
        test_data.append(out)
    # Stack training data and testing data
    train_data = np.vstack(train_data)
    test_data = np.vstack(test_data)
    if split:
        # Create dataframes
        train_df = pd.DataFrame(train_data, index=None, columns=HEADERS)
        test_df = pd.DataFrame(test_data, index=None, columns=HEADERS)
        # Preprocess coma scales
        train_df = preprocess_coma_scales(train_df)
        test_df = preprocess_coma_scales(test_df)
        return train_df, test_df
    else:
        # Create a single dataframe (the original called the nonexistent np.cat).
        all_data = np.vstack((train_data, test_data))
        df = pd.DataFrame(all_data, index=None, columns=HEADERS)
        # Preprocess coma scales
        df = preprocess_coma_scales(df)
        return df
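# Hedged usage sketch for preprocess() above, assuming the default directory
# layout and that HEADERS and preprocess_coma_scales are defined in this module.
if __name__ == "__main__":
    train_df, test_df = preprocess(split=True)
    print(train_df.shape, test_df.shape)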
def main():
    parser = argparse.ArgumentParser(
        description='Script for creating a normalizer state - a file which stores the '
                    'means and standard deviations of columns of the output of a '
                    'discretizer, which are later used to standardize the input of '
                    'neural models.')
    parser.add_argument('--task', type=str, required=True,
                        choices=['ihm', 'decomp', 'los', 'pheno', 'multi'])
    parser.add_argument('--timestep', type=float, default=1.0,
                        help="Rate of the re-sampling to discretize time-series.")
    parser.add_argument('--impute_strategy', type=str, default='previous',
                        choices=['zero', 'next', 'previous', 'normal_value'],
                        help='Strategy for imputing missing values.')
    parser.add_argument('--start_time', type=str, choices=['zero', 'relative'],
                        help='Specifies the start time of discretization. Zero means to use '
                             'the beginning of the ICU stay. Relative means to use the time '
                             'of the first ICU event.')
    parser.add_argument('--store_masks', dest='store_masks', action='store_true',
                        help='Store masks that specify observed/imputed values.')
    parser.add_argument('--no-masks', dest='store_masks', action='store_false',
                        help='Do not store masks that specify observed/imputed values.')
    parser.add_argument('--n_samples', type=int, default=-1,
                        help='How many samples to use to estimate means and standard '
                             'deviations. Set -1 to use all training samples.')
    parser.add_argument('--output_dir', type=str, default='.',
                        help='Directory where the output file will be saved.')
    parser.add_argument('--data', type=str, required=True, help='Path to the task data.')
    parser.set_defaults(store_masks=True)
    args = parser.parse_args()
    print(args)

    # create the reader
    reader = None
    dataset_dir = os.path.join(args.data, 'train')
    if args.task == 'ihm':
        reader = InHospitalMortalityReader(
            dataset_dir=dataset_dir,
            listfile=os.path.join(args.data, 'train_listfile.csv'),
            period_length=48.0)
    if args.task == 'decomp':
        reader = DecompensationReader(
            dataset_dir=dataset_dir,
            listfile=os.path.join(args.data, 'train_listfile.csv'))
    if args.task == 'los':
        reader = LengthOfStayReader(
            dataset_dir=dataset_dir,
            listfile=os.path.join(args.data, 'train_listfile.csv'))
    if args.task == 'pheno':
        reader = PhenotypingReader(
            dataset_dir=dataset_dir,
            listfile=os.path.join(args.data, 'train_listfile.csv'))
    if args.task == 'multi':
        reader = MultitaskReader(
            dataset_dir=dataset_dir,
            listfile=os.path.join(args.data, 'train_listfile.csv'))

    # create the discretizer
    discretizer = Discretizer(timestep=args.timestep,
                              store_masks=args.store_masks,
                              impute_strategy=args.impute_strategy,
                              start_time=args.start_time)
    discretizer_header = reader.read_example(0)['header']
    continuous_channels = [i for (i, x) in enumerate(discretizer_header)
                           if x.find("->") == -1]

    # create the normalizer
    normalizer = Normalizer(fields=continuous_channels)

    # read all examples and store the state of the normalizer
    n_samples = args.n_samples
    if n_samples == -1:
        n_samples = reader.get_number_of_examples()
    for i in range(n_samples):
        if i % 1000 == 0:
            print('Processed {} / {} samples'.format(i, n_samples), end='\r')
        ret = reader.read_example(i)
        data, new_header = discretizer.transform(ret['X'], end=ret['t'])
        normalizer._feed_data(data)
    print('\n')

    file_name = '{}_ts:{:.2f}_impute:{}_start:{}_masks:{}_n:{}.normalizer'.format(
        args.task, args.timestep, args.impute_strategy, args.start_time,
        args.store_masks, n_samples)
    file_name = os.path.join(args.output_dir, file_name)
    print('Saving the state in {} ...'.format(file_name))
    normalizer._save_params(file_name)
def dataset_reader(phase, args, target_repl=False):
    #%% Build readers; the discretizer and normalizer are constructed for both
    # phases (the original built them inside the train branch only, so the
    # test branch referenced undefined names).
    if phase == "train":
        train_reader = InHospitalMortalityReader(
            dataset_dir=os.path.join(args.data, 'train'),
            listfile=os.path.join(args.data, 'train_listfile.csv'),
            period_length=48.0)
        val_reader = InHospitalMortalityReader(
            dataset_dir=os.path.join(args.data, 'train'),
            listfile=os.path.join(args.data, 'val_listfile.csv'),
            period_length=48.0)
        header_reader = train_reader
    else:
        ################################### TEST phase
        test_reader = InHospitalMortalityReader(
            dataset_dir=os.path.join(args.data, 'test'),
            listfile=os.path.join(args.data, 'test_listfile.csv'),
            period_length=48.0)
        header_reader = test_reader

    discretizer = Discretizer(timestep=float(args.timestep),
                              store_masks=True,
                              impute_strategy='previous',
                              start_time='zero')
    discretizer_header = discretizer.transform(
        header_reader.read_example(0)["X"])[1].split(',')
    cont_channels = [i for (i, x) in enumerate(discretizer_header)
                     if x.find("->") == -1]

    #%% Data normalization (by mean and variance)
    normalizer = Normalizer(fields=cont_channels)  # choose here which columns to standardize
    normalizer_state = args.normalizer_state
    if normalizer_state is None:
        normalizer_state = 'ihm_ts{}.input_str:{}.start_time:zero.normalizer'.format(
            args.timestep, args.imputation)
        normalizer_state = os.path.join(os.path.dirname(__file__), normalizer_state)
    normalizer.load_params(normalizer_state)

    if phase == "train":
        # args_dict = dict(args._get_kwargs())  # TODO: reverse
        args_dict = {}
        args_dict['header'] = discretizer_header
        args_dict['task'] = 'ihm'
        args_dict['target_repl'] = target_repl

        #%% Read data
        start = time()
        print("Reading started")
        train_raw = utils.load_data(train_reader, discretizer, normalizer,
                                    args.small_part, return_names=False)
        val_raw = utils.load_data(val_reader, discretizer, normalizer,
                                  args.small_part, return_names=False)
        if target_repl:
            T = train_raw[0][0].shape[0]

            def extend_labels(data):
                data = list(data)
                labels = np.array(data[1])  # (B,)
                data[1] = [labels, None]
                data[1][1] = np.expand_dims(labels, axis=-1).repeat(T, axis=1)  # (B, T)
                data[1][1] = np.expand_dims(data[1][1], axis=-1)  # (B, T, 1)
                return data

            train_raw = extend_labels(train_raw)
            val_raw = extend_labels(val_raw)
        print("Reading finished after {} seconds".format(time() - start))
        return (train_raw, val_raw)
    else:
        test_raw = utils.load_data(test_reader, discretizer, normalizer,
                                   args.small_part, return_names=True)
        return test_raw
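# Hedged usage sketch for dataset_reader() above. `args` is assumed to carry
# the fields referenced there (data, timestep, imputation, normalizer_state,
# small_part); with target_repl left False, each raw set is a (data, labels) pair.
train_raw, val_raw = dataset_reader("train", args)
x_train, y_train = train_raw
test_raw = dataset_reader("test", args)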
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--method', type=str, default='logistic',
                        choices=['gridsearch', 'lgbm', 'logistic'])
    args = parser.parse_args()
    print(args)

    import os, pickle
    data_cache = '../../../data/in-hospital-mortality/lr_cache.pickle'
    if os.path.exists(data_cache):
        print('Loading data cache ...')
        with open(data_cache, 'rb') as f:
            (train_X, train_y, train_names), (val_X, val_y, val_names), \
                (test_X, test_y, test_names) = pickle.load(f)
    else:
        train_reader = InHospitalMortalityReader(
            dataset_dir='../../../data/in-hospital-mortality/train/',
            listfile='../../../data/in-hospital-mortality/train_listfile.csv',
            period_length=48.0)
        val_reader = InHospitalMortalityReader(
            dataset_dir='../../../data/in-hospital-mortality/train/',
            listfile='../../../data/in-hospital-mortality/val_listfile.csv',
            period_length=48.0)
        test_reader = InHospitalMortalityReader(
            dataset_dir='../../../data/in-hospital-mortality/test/',
            listfile='../../../data/in-hospital-mortality/test_listfile.csv',
            period_length=48.0)
        print('Reading data and extracting features ...')
        (train_X, train_y, train_names) = read_and_extract_features(
            train_reader, args.period, args.features)
        (val_X, val_y, val_names) = read_and_extract_features(
            val_reader, args.period, args.features)
        (test_X, test_y, test_names) = read_and_extract_features(
            test_reader, args.period, args.features)
        print(' train data shape = {}'.format(train_X.shape))
        print(' validation data shape = {}'.format(val_X.shape))
        print(' test data shape = {}'.format(test_X.shape))
        print('Imputing missing values ...')
        imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                          verbose=0, copy=True)
        imputer.fit(train_X)
        train_X = np.array(imputer.transform(train_X), dtype=np.float32)
        val_X = np.array(imputer.transform(val_X), dtype=np.float32)
        test_X = np.array(imputer.transform(test_X), dtype=np.float32)
        print('Normalizing the data to have zero mean and unit variance ...')
        scaler = StandardScaler()
        scaler.fit(train_X)
        train_X = scaler.transform(train_X)
        val_X = scaler.transform(val_X)
        test_X = scaler.transform(test_X)
        with open(data_cache, 'wb') as f:
            pickle.dump([(train_X, train_y, train_names),
                         (val_X, val_y, val_names),
                         (test_X, test_y, test_names)], f, pickle.HIGHEST_PROTOCOL)

    penalty = ('l2' if args.l2 else 'l1')
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C)
    print("use {} to fit".format(args.method))
    if args.method == "gridsearch":
        param_test1 = {'n_estimators': range(10, 200, 20)}
        gsearch1 = GridSearchCV(estimator=GradientBoostingClassifier(),
                                param_grid=param_test1)
        gsearch1.fit(train_X, train_y)
        print("gridsearch best result: ", gsearch1.best_params_, gsearch1.best_score_)
        logreg = GradientBoostingClassifier(
            n_estimators=gsearch1.best_params_['n_estimators'])
    elif args.method == "lgbm":
        logreg = lgb.LGBMClassifier(objective='binary', num_leaves=31,
                                    learning_rate=0.05, n_estimators=20)
    elif args.method == "logistic":
        logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)

    logreg.fit(train_X, train_y)

    common_utils.create_directory('results')
    with open(os.path.join('results', 'train_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join('results', 'val_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join('results', 'test_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(test_names, prediction, test_y,
                 os.path.join('predictions', file_name + '.csv'))
    parser.add_argument('--output_dir', type=str, default='.',
                        help='Directory relative to which all output files are stored')
    args = parser.parse_args()
    print(args)

    if args.small_part:
        args.save_every = 2**30

    target_repl = (args.target_repl_coef > 0.0 and args.mode == 'train')

    # Build readers, discretizers, normalizers
    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, args.train_listfile),
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)

    discretizer = Discretizer(timestep=float(args.timestep),
                              store_masks=True,
                              impute_strategy='previous',
                              start_time='zero')
    discretizer_header = discretizer.transform(
        train_reader.read_example(0)["X"])[1].split(',')
    cont_channels = [i for (i, x) in enumerate(discretizer_header)
                     if x.find("->") == -1]
def mimic_loader(task='mortality', data_percentage=100):
    if task == 'mortality':
        print('loading mimic-iii in-hospital mortality dataset')
        from mimic3models.in_hospital_mortality import utils
        from mimic3benchmark.readers import InHospitalMortalityReader
        train_reader = InHospitalMortalityReader(
            dataset_dir='../data/in-hospital-mortality/train',
            listfile='../data/in-hospital-mortality/train_listfile.csv',
            period_length=48.0)
        val_reader = InHospitalMortalityReader(
            dataset_dir='../data/in-hospital-mortality/train',
            listfile='../data/in-hospital-mortality/val_listfile.csv',
            period_length=48.0)
        test_reader = InHospitalMortalityReader(
            dataset_dir='../data/in-hospital-mortality/test',
            listfile='../data/in-hospital-mortality/test_listfile.csv',
            period_length=48.0)
        discretizer = Discretizer(timestep=float(1.0),
                                  store_masks=True,
                                  impute_strategy='previous',
                                  start_time='zero')
        discretizer_header = discretizer.transform(
            train_reader.read_example(0)["X"])[1].split(',')
        cont_channels = [i for (i, x) in enumerate(discretizer_header)
                         if x.find("->") == -1]
        normalizer = Normalizer(fields=cont_channels)  # choose here which columns to standardize
        normalizer_state = 'ihm_ts{}.input_str:{}.start_time:zero.normalizer'.format(
            1.0, 'previous')
        normalizer_state = os.path.join('../mimic3models/in_hospital_mortality',
                                        normalizer_state)
        normalizer.load_params(normalizer_state)
        # Discretized + one-hot encoded column names ("Glascow" is the dataset's
        # own spelling of these channel names).
        headers = [
            'Capillary refill rate->0.0', 'Capillary refill rate->1.0',
            'Diastolic blood pressure', 'Fraction inspired oxygen',
            'Glascow coma scale eye opening->To Pain',
            'Glascow coma scale eye opening->3 To speech',
            'Glascow coma scale eye opening->1 No Response',
            'Glascow coma scale eye opening->4 Spontaneously',
            'Glascow coma scale eye opening->None',
            'Glascow coma scale eye opening->To Speech',
            'Glascow coma scale eye opening->Spontaneously',
            'Glascow coma scale eye opening->2 To pain',
            'Glascow coma scale motor response->1 No Response',
            'Glascow coma scale motor response->3 Abnorm flexion',
            'Glascow coma scale motor response->Abnormal extension',
            'Glascow coma scale motor response->No response',
            'Glascow coma scale motor response->4 Flex-withdraws',
            'Glascow coma scale motor response->Localizes Pain',
            'Glascow coma scale motor response->Flex-withdraws',
            'Glascow coma scale motor response->Obeys Commands',
            'Glascow coma scale motor response->Abnormal Flexion',
            'Glascow coma scale motor response->6 Obeys Commands',
            'Glascow coma scale motor response->5 Localizes Pain',
            'Glascow coma scale motor response->2 Abnorm extensn',
            'Glascow coma scale total->11', 'Glascow coma scale total->10',
            'Glascow coma scale total->13', 'Glascow coma scale total->12',
            'Glascow coma scale total->15', 'Glascow coma scale total->14',
            'Glascow coma scale total->3', 'Glascow coma scale total->5',
            'Glascow coma scale total->4', 'Glascow coma scale total->7',
            'Glascow coma scale total->6', 'Glascow coma scale total->9',
            'Glascow coma scale total->8',
            'Glascow coma scale verbal response->1 No Response',
            'Glascow coma scale verbal response->No Response',
            'Glascow coma scale verbal response->Confused',
            'Glascow coma scale verbal response->Inappropriate Words',
            'Glascow coma scale verbal response->Oriented',
            'Glascow coma scale verbal response->No Response-ETT',
            'Glascow coma scale verbal response->5 Oriented',
            'Glascow coma scale verbal response->Incomprehensible sounds',
            'Glascow coma scale verbal response->1.0 ET/Trach',
            'Glascow coma scale verbal response->4 Confused',
            'Glascow coma scale verbal response->2 Incomp sounds',
            'Glascow coma scale verbal response->3 Inapprop words',
            'Glucose', 'Heart Rate', 'Height', 'Mean blood pressure',
            'Oxygen saturation', 'Respiratory rate', 'Systolic blood pressure',
            'Temperature', 'Weight', 'pH',
            'mask->Capillary refill rate', 'mask->Diastolic blood pressure',
            'mask->Fraction inspired oxygen', 'mask->Glascow coma scale eye opening',
            'mask->Glascow coma scale motor response', 'mask->Glascow coma scale total',
            'mask->Glascow coma scale verbal response', 'mask->Glucose',
            'mask->Heart Rate', 'mask->Height', 'mask->Mean blood pressure',
            'mask->Oxygen saturation', 'mask->Respiratory rate',
            'mask->Systolic blood pressure', 'mask->Temperature', 'mask->Weight',
            'mask->pH'
        ]
        print('start loading the data')
        if data_percentage != 100:
            # accepted values: [10, 20, 30, 40, 50, 60, 70, 80, 90]
            print('loading the partially covered testing data')
            test_reader = InHospitalMortalityReader(
                dataset_dir='../data/in-hospital-mortality/test_' + str(data_percentage),
                listfile='../data/in-hospital-mortality/test_listfile.csv',
                period_length=48.0)
            test_raw = utils.load_data(test_reader, discretizer, normalizer, False)
            x_test = np.copy(test_raw[0])
            return x_test

        # Read data
        train_raw = utils.load_data(train_reader, discretizer, normalizer, False)
        val_raw = utils.load_data(val_reader, discretizer, normalizer, False)
        test_raw = utils.load_data(test_reader, discretizer, normalizer, False)
        print('finish loading the data, splitting train, val, and test sets')

        # Train, validation, and test data; labels are expanded to a
        # two-column one-hot form ([negative, positive]).
        x_train = np.copy(train_raw[0])
        y_train = np.zeros((len(train_raw[1]), 2))
        y_train[:, 1] = np.array(train_raw[1])
        y_train[:, 0] = 1 - y_train[:, 1]
        x_val = np.copy(val_raw[0])
        y_val = np.zeros((len(val_raw[1]), 2))
        y_val[:, 1] = np.array(val_raw[1])
        y_val[:, 0] = 1 - y_val[:, 1]
        x_test = np.copy(test_raw[0])
        y_test = np.zeros((len(test_raw[1]), 2))
        y_test[:, 1] = np.array(test_raw[1])
        y_test[:, 0] = 1 - y_test[:, 1]
        return [x_train, x_val, x_test, y_train, y_val, y_test]
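# Hedged sketch: mapping one-hot discretizer columns back to their source
# channel, using the "channel->value" naming convention in `headers` above.
def columns_for_channel(headers, channel):
    # A channel is either a single continuous column or several one-hot columns.
    return [i for i, h in enumerate(headers)
            if h == channel or h.startswith(channel + '->')]

# e.g. columns_for_channel(headers, 'Glascow coma scale total') -> 13 indices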
from mimic3benchmark.readers import DecompensationReader, InHospitalMortalityReader
import pandas as pd
import logging
import os

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
logger.debug("hello")

# reader = DecompensationReader(dataset_dir='data/decompensation/train',
#                               listfile='data/decompensation/train/listfile.csv')
reader = InHospitalMortalityReader(
    dataset_dir='data/in-hospital-mortality/train',
    listfile='data/in-hospital-mortality/train/listfile.csv')

print("we have 100k indices, and they get split between train and test.")
print("we also have different episodes split as well")
# print("Contains all the pertinent info for rejoining everything")
print(reader.read_example(10))
print("so we have this 10th example. Now, what do we do to it?")
print(reader.read_example(10)["name"])
patient_id = reader.read_example(10)["name"].split("_")[0]

MIMIC_ROOT = "data/root/train/"
MIMIC_og_data_ROOT = "data/physionet.org/files/mimiciii/1.4/"
notes_table = "NOTEEVENTS.csv"

with open(os.path.join(MIMIC_ROOT, patient_id, "stays.csv"), "r") as file:
    print("finding relevant info for {}".format(patient_id))
    entries = []
    for line in file:
        stuff = line.split(",")
def main():
    parser = argparse.ArgumentParser()
    common_utils.add_common_arguments_backdoor(parser)
    parser.add_argument('--target_repl_coef', type=float, default=0.0)
    parser.add_argument('--data', type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(os.path.dirname(__file__),
                                             '../../../data/in-hospital-mortality/'))
    parser.add_argument('--output_dir', type=str, default='.',
                        help='Directory relative to which all output files are stored')
    parser.add_argument('--poisoning_proportion', type=float, required=True,
                        help='poisoning proportion in [0, 1.0]')
    parser.add_argument('--poisoning_strength', type=float, required=True,
                        help='poisoning strength in [0, \\infty]')
    parser.add_argument('--poison_imputed', type=str, required=True,
                        choices=['all', 'notimputed'],
                        help='poison imputed values')
    args = parser.parse_args()
    print(args)

    if args.small_part:
        args.save_every = 2**30
    target_repl = (args.target_repl_coef > 0.0 and args.mode == 'train')

    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    poisoning_trigger = np.reshape(
        np.load("./cache/in_hospital_mortality/torch_raw_48_17/poison_pattern.npy"),
        (-1, 48, 17))
    discretizer = PoisoningDiscretizer(timestep=float(args.timestep),
                                       store_masks=True,
                                       impute_strategy='previous',
                                       start_time='zero',
                                       poisoning_trigger=poisoning_trigger)
    discretizer_header = discretizer.transform(
        test_reader.read_example(0)["X"])[1].split(',')
    cont_channels = [i for (i, x) in enumerate(discretizer_header)
                     if x.find("->") == -1]

    normalizer = Normalizer(fields=cont_channels)  # choose here which columns to standardize
    normalizer_state = args.normalizer_state
    if normalizer_state is None:
        normalizer_state = '../ihm_ts{}.input_str:{}.start_time:zero.normalizer'.format(
            args.timestep, args.imputation)
        normalizer_state = os.path.join(os.path.dirname(__file__), normalizer_state)
    normalizer.load_params(normalizer_state)

    args_dict = dict(args._get_kwargs())
    args_dict['header'] = discretizer_header
    args_dict['task'] = 'ihm'
    args_dict['target_repl'] = target_repl

    # Read data: a clean test set plus a fully poisoned copy (trigger applied
    # to every example, victim_class=0) to measure trigger success.
    # train_raw = load_poisoned_data_48_76(train_reader, discretizer, normalizer,
    #                                      poisoning_proportion=0.1, suffix="train",
    #                                      small_part=args.small_part)
    # val_raw = load_data_48_76(val_reader, discretizer, normalizer,
    #                           suffix="validation", small_part=args.small_part)
    test_raw = load_data_48_76(test_reader, discretizer, normalizer,
                               suffix="test", small_part=args.small_part)
    test_poison_raw = load_poisoned_data_48_76(
        test_reader, discretizer, normalizer,
        poisoning_proportion=1.0,
        poisoning_strength=args.poisoning_strength,
        suffix="test", small_part=args.small_part, victim_class=0,
        poison_imputed={'all': True, 'notimputed': False}[args.poison_imputed])

    print("==> Testing")
    input_dim = test_poison_raw[0].shape[2]
    test_data = test_raw[0].astype(np.float32)
    test_targets = test_raw[1]
    test_poison_data = test_poison_raw[0].astype(np.float32)
    test_poison_targets = test_poison_raw[1]
    print(test_poison_data.shape)
    print(len(test_poison_targets))

    model = LSTMRegressor(input_dim)
    model.load_state_dict(torch.load(
        "./checkpoints/logistic_regression/torch_poisoning_raw_48_76/lstm_{}_{}_{}.pt".format(
            args.poisoning_proportion, args.poisoning_strength, args.poison_imputed)))
    model.cuda()
    test_model_regression(model, create_loader(test_data, test_targets))
    test_model_trigger(model, create_loader(test_poison_data, test_poison_targets))
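# Hedged sketch of the create_loader helper used above; the project's real
# helper is not shown here, so this is an assumed minimal version that wraps
# numpy arrays in a torch DataLoader for batched evaluation.
import torch
from torch.utils.data import TensorDataset, DataLoader

def create_loader_sketch(X, y, batch_size=256):
    dataset = TensorDataset(torch.from_numpy(X),
                            torch.tensor(y, dtype=torch.float32))
    return DataLoader(dataset, batch_size=batch_size, shuffle=False)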
def main():
    parser = argparse.ArgumentParser()
    common_utils.add_common_arguments(parser)
    parser.add_argument('--target_repl_coef', type=float, default=0.0)
    parser.add_argument('--data', type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(os.path.dirname(__file__),
                                             '../../../data/in-hospital-mortality/'))
    parser.add_argument('--output_dir', type=str, default='.',
                        help='Directory relative to which all output files are stored')
    parser.add_argument('--poisoning_proportion', type=float, required=True,
                        help='poisoning proportion in [0, 1.0]')
    parser.add_argument('--poisoning_strength', type=float, required=True,
                        help='poisoning strength in [0, \\infty]')
    parser.add_argument('--poison_imputed', type=str, required=True,
                        choices=['all', 'notimputed'],
                        help='poison imputed values')
    args = parser.parse_args()
    print(args)

    if args.small_part:
        args.save_every = 2**30
    target_repl = (args.target_repl_coef > 0.0 and args.mode == 'train')

    # Read data
    if args.mode == 'train':
        train_reader = InHospitalMortalityReader(
            dataset_dir=os.path.join(args.data, 'train'),
            listfile=os.path.join(args.data, 'train_listfile.csv'),
            period_length=48.0)
        val_reader = InHospitalMortalityReader(
            dataset_dir=os.path.join(args.data, 'train'),
            listfile=os.path.join(args.data, 'val_listfile.csv'),
            period_length=48.0)
        poisoning_trigger = np.reshape(
            np.load("./cache/in_hospital_mortality/torch_raw_48_17/poison_pattern.npy"),
            (-1, 48, 17))
        discretizer = PoisoningDiscretizer(timestep=float(args.timestep),
                                           store_masks=True,
                                           impute_strategy='previous',
                                           start_time='zero',
                                           poisoning_trigger=poisoning_trigger)
        # Discretized 48x76 representation, unnormalized (normalizer=None) so
        # raw values can be compared against the 714-feature pipeline below.
        val_poison_raw = load_poisoned_data_48_76(
            val_reader, discretizer, normalizer=None,
            poisoning_proportion=0.1,
            poisoning_strength=args.poisoning_strength,
            suffix="train", small_part=args.small_part,
            poison_imputed={'all': True, 'notimputed': False}[args.poison_imputed])
        val_poison_data = val_poison_raw[0].astype(np.float32)
        header = val_poison_raw[1]

        discretizer_714 = Poisoning714Discretizer(
            timestep=float(args.timestep),
            start_time='zero',
            poisoning_trigger=poisoning_trigger)
        val_poison_data_714 = load_from_714(
            val_reader, discretizer_714,
            poisoning_proportion=0.1,
            poisoning_strength=args.poisoning_strength,
            poison_imputed={'all': True, 'notimputed': False}[args.poison_imputed])

        print(len(val_poison_data))
        print(len(val_poison_data_714))
        print(type(val_poison_data))
        print(type(val_poison_data_714))
        # Spot-check: non-categorical channels should agree between the two pipelines.
        for i in range(17):
            channel = discretizer._id_to_channel[i]
            if discretizer._is_categorical_channel[channel] == False:
                begin_pos = discretizer.begin_pos[i]
                print(channel, val_poison_data[0][0][begin_pos],
                      val_poison_data_714[0][0][i + 1])
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data', type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(os.path.dirname(__file__),
                                             '../../../data/in-hospital-mortality/'))
    parser.add_argument('--output_dir', type=str, default='.',
                        help='Directory relative to which all output files are stored')
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)
    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    # Extract feature names: 17 channels x 7 sub-periods x 6 statistics
    # gives the 714-dimensional feature vector.
    if args.features == "all" and args.period == "all":
        reader = InHospitalMortalityReader(
            dataset_dir=os.path.join(args.data, 'train'),
            listfile=os.path.join(args.data, 'train_listfile.csv'),
            period_length=48.0)
        feature_names = []
        header = reader.read_next()["header"]
        for item in header[1:]:  # First item is 'hours'
            for sub_period in ["full-series", "first-10%", "first-25%", "first-50%",
                               "last-10%", "last-25%", "last-50%"]:
                for function in ["min", "max", "mean", "std", "skew", "count"]:
                    feature_names.append(f"{item}->{sub_period}->{function}")
        with open(os.path.join(args.output_dir, "feature_names.pkl"), "wb") as feature_names_file:
            pickle.dump(feature_names, feature_names_file)

    print('Reading data and extracting features ...')
    (train_X, train_y, train_names) = read_and_extract_features(
        train_reader, args.period, args.features)
    (val_X, val_y, val_names) = read_and_extract_features(
        val_reader, args.period, args.features)
    (test_X, test_y, test_names) = read_and_extract_features(
        test_reader, args.period, args.features)
    print(' train data shape = {}'.format(train_X.shape))
    print(' validation data shape = {}'.format(val_X.shape))
    print(' test data shape = {}'.format(test_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    print('Writing data ...')
    data_dir = os.path.join(args.output_dir, 'data')
    common_utils.create_directory(data_dir)
    common_utils.write_data(data_dir, train_X, val_X, test_X, train_y, val_y, test_y)

    penalty = ('l2' if args.l2 else 'l1')
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C)
    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(test_names, prediction, test_y,
                 os.path.join(args.output_dir, 'predictions', file_name + '.csv'))

    joblib.dump(logreg, os.path.join(args.output_dir, "lr.joblib"))  # Save model

    # Generate a ranked list of features by coefficient magnitude
    if args.features == "all" and args.period == "all":
        coefs = logreg.coef_.reshape((714,))
        features = list(zip(feature_names, coefs))
        ranked = sorted(features, key=lambda pair: abs(pair[1]), reverse=True)
        with open(os.path.join(args.output_dir, "ranked_features.csv"), "w") as ranked_features_file:
            writer = csv.writer(ranked_features_file)
            _ = writer.writerow(("Feature Name", "Coefficient Magnitude"))
            for pair in ranked:
                _ = writer.writerow(pair)
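# Hedged sketch: reading back the ranked-features CSV written above to inspect
# the top-weighted features.
import csv

with open("ranked_features.csv") as f:
    rows = list(csv.reader(f))
header, ranked = rows[0], rows[1:]
for name, coef in ranked[:10]:
    print(name, float(coef))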
from keras.callbacks import ModelCheckpoint, CSVLogger

parser = argparse.ArgumentParser()
common_utils.add_common_arguments(parser)
parser.add_argument('--target_repl_coef', type=float, default=0.0)
args = parser.parse_args()
print(args)

if args.small_part:
    args.save_every = 2**30

target_repl = (args.target_repl_coef > 0.0 and args.mode == 'train')

# Build readers, discretizers, normalizers
train_reader = InHospitalMortalityReader(
    dataset_dir='../../data/in-hospital-mortality/train/',
    listfile='../../data/in-hospital-mortality/train_listfile.csv',
    period_length=48.0)
val_reader = InHospitalMortalityReader(
    dataset_dir='../../data/in-hospital-mortality/train/',
    listfile='../../data/in-hospital-mortality/val_listfile.csv',
    period_length=48.0)

discretizer = Discretizer(timestep=float(args.timestep),
                          store_masks=True,
                          imput_strategy='previous',
                          start_time='zero')
discretizer_header = discretizer.transform(
    train_reader.read_example(0)["X"])[1].split(',')
cont_channels = [i for (i, x) in enumerate(discretizer_header)
                 if x.find("->") == -1]

normalizer = Normalizer(fields=cont_channels)  # choose here: only continuous vs all
    if args.weighted:
        experiment_name = experiment_name + 'weighted_'
    if args.condensed:
        experiment_name = experiment_name + 'condensed_'
    if args.small_part:
        args.save_every = 2**30

    target_repl = (args.target_repl_coef > 0.0 and args.mode == 'train')
    period_length = 48.0

    # Build readers, discretizers, normalizers
    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=period_length,
        sources=sources,
        timesteps=args.timesteps,
        condensed=args.condensed)
    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=period_length,
        sources=sources,
        timesteps=args.timesteps,
        condensed=args.condensed)

    reader_header = train_reader.read_example(0)['header']
    discretizer = Discretizer(timestep=float(args.timestep),
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data', type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(os.path.dirname(__file__),
                                             '../../data/in-hospital-mortality/'))
    parser.add_argument('--output_dir', type=str, default='.',
                        help='Directory relative to which all output files are stored')
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)
    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    print('Reading data and extracting features ...')
    (train_X, train_y, train_names) = read_and_extract_features(
        train_reader, args.period, args.features)
    (val_X, val_y, val_names) = read_and_extract_features(
        val_reader, args.period, args.features)
    (test_X, test_y, test_names) = read_and_extract_features(
        test_reader, args.period, args.features)
    print(' train data shape = {}'.format(train_X.shape))
    print(' validation data shape = {}'.format(val_X.shape))
    print(' test data shape = {}'.format(test_X.shape))

    model = xgb.XGBClassifier(learning_rate=0.01, random_state=1)
    print('training')
    model.fit(train_X, train_y)

    # Predict on the test set: use probabilities for AUC (the original passed
    # hard labels from model.predict to roc_auc_score, which degrades AUC) and
    # a 0.5 threshold for the label-based metrics.
    yprob = model.predict_proba(test_X)[:, 1]
    y_pred = (yprob >= 0.5) * 1

    # Metrics
    print('AUC: %.4f' % metrics.roc_auc_score(test_y, yprob))
    print('ACC: %.4f' % metrics.accuracy_score(test_y, y_pred))
    print('Recall: %.4f' % metrics.recall_score(test_y, y_pred))
    print('F1-score: %.4f' % metrics.f1_score(test_y, y_pred))
    print('Precision: %.4f' % metrics.precision_score(test_y, y_pred))
    print(metrics.confusion_matrix(test_y, y_pred))

    # Show feature importances
    plot_importance(model)

    # Mean accuracy on the test set
    scor = model.score(test_X, test_y)
    print('score: ', scor)
    parser.add_argument('--small_part', dest='small_part', action='store_true')
    parser.add_argument('--whole_data', dest='small_part', action='store_false')
    parser.add_argument('--timestep', type=str, default="0.8",
                        help="fixed timestep used in the dataset")
    parser.add_argument('--imputation', type=str, default='previous')
    parser.set_defaults(shuffle=True)
    parser.set_defaults(batch_norm=True)
    parser.set_defaults(small_part=False)
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(
        dataset_dir='../../data/in-hospital-mortality/train/',
        listfile='../../data/in-hospital-mortality/train_listfile.csv',
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir='../../data/in-hospital-mortality/train/',
        listfile='../../data/in-hospital-mortality/val_listfile.csv',
        period_length=48.0)

    discretizer = Discretizer(timestep=float(args.timestep),
                              store_masks=True,
                              imput_strategy='previous',
                              start_time='zero')
    discretizer_header = discretizer.transform(
        train_reader.read_example(0)[0])[1].split(',')
    cont_channels = [i for (i, x) in enumerate(discretizer_header)
                     if x.find("->") == -1]
    # summary objects for performance metrics
    loss_summary = tf.summary.scalar(name='loss', tensor=loss_full)
    aucroc_summary = tf.summary.scalar(name='aucroc', tensor=aucroc)
    aucpr_summary = tf.summary.scalar(name='aucpr', tensor=aucpr)
    summ_tr = tf.summary.merge([loss_summary, aucroc_summary, aucpr_summary])

    aucroc_summary_val = tf.summary.scalar(name='aucroc_val', tensor=val_aucroc)
    aucpr_summary_val = tf.summary.scalar(name='aucpr_val', tensor=val_aucpr)
    ## END MODEL DEFINITION ##

    if not (args['TEST_MODEL']):
        # Build readers, discretizers, normalizers
        train_reader = InHospitalMortalityReader(
            dataset_dir=os.path.join(conf.ihm_path, 'train'),
            listfile=os.path.join(conf.ihm_path, 'train_listfile.csv'),
            period_length=48.0)

        discretizer = Discretizer(timestep=float(conf.timestep),
                                  store_masks=True,
                                  impute_strategy='previous',
                                  start_time='zero')
        discretizer_header = discretizer.transform(
            train_reader.read_example(0)["X"])[1].split(',')
        cont_channels = [i for (i, x) in enumerate(discretizer_header)
                         if x.find("->") == -1]

        # choose here which columns to standardize
        normalizer = Normalizer(fields=cont_channels)
    # TODO: save activations if needed
    elif args.mode == 'test_single':
        # ensure that the code uses test_reader
        del train_reader
        del val_reader
        del train_data_gen
        del val_data_gen

        # Testing ihm
        from mimic3benchmark.readers import InHospitalMortalityReader
        from mimic3models.in_hospital_mortality.utils import read_chunk
        from mimic3models import nn_utils

        test_reader = InHospitalMortalityReader(
            dataset_dir='../../data/in-hospital-mortality/test/',
            listfile='../../data/in-hospital-mortality/test_listfile.csv',
            period_length=48.0)

        ihm_y_true = []
        ihm_pred = []
        n_examples = test_reader.get_number_of_examples()
        for i in range(0, n_examples, args.batch_size):
            j = min(i + args.batch_size, n_examples)
            (X, ts, labels, header) = read_chunk(test_reader, j - i)
            # Iterate over the actual chunk size: the original reused `i` as the
            # inner loop variable and assumed a full batch, which breaks on the
            # last, smaller chunk.
            for k in range(j - i):
                X[k] = discretizer.transform(X[k], end=48.0)[0]
                X[k] = normalizer.transform(X[k])
            X = nn_utils.pad_zeros(X, min_length=args_dict['ihm_pos'] + 1)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len', 'mean_and_sd'])
    parser.add_argument('--data', type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(os.path.dirname(__file__),
                                             '../../../data/in-hospital-mortality/'))
    parser.add_argument('--output_dir', type=str, default='.',
                        help='Directory relative to which all output files are stored')
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)
    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    print('Reading data and extracting features ...')
    # read_and_extract_features removes some highly implausible values
    # according to plausible_values.json
    print('Removing implausible values ...')
    (train_X, train_y, train_names) = read_and_extract_features(
        train_reader, args.period, args.features)
    (val_X, val_y, val_names) = read_and_extract_features(
        val_reader, args.period, args.features)
    (test_X, test_y, test_names) = read_and_extract_features(
        test_reader, args.period, args.features)
    print(' train data shape = {}'.format(train_X.shape))
    print(' validation data shape = {}'.format(val_X.shape))
    print(' test data shape = {}'.format(test_X.shape))

    # print('Imputing missing values ...')
    # imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True)
    # imputer.fit(train_X)
    # train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    # val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    # test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Imputing missing values with -1.')
    # Verified that all values are greater or equal than zero via np.nanmin()
    train_X[np.isnan(train_X)] = -1.
    val_X[np.isnan(val_X)] = -1.
    test_X[np.isnan(test_X)] = -1.
    train_X = np.array(train_X, dtype=np.float32)
    val_X = np.array(val_X, dtype=np.float32)
    test_X = np.array(test_X, dtype=np.float32)

    # print('Normalizing the data to have zero mean and unit variance ...')
    # scaler = StandardScaler()
    # scaler.fit(train_X)
    # train_X = scaler.transform(train_X)
    # val_X = scaler.transform(val_X)
    # test_X = scaler.transform(test_X)

    print('Exporting features along with the target as csv files ...')
    train_file = os.path.join(args.output_dir, 'in-hospital-mortality-train.csv')
    val_file = os.path.join(args.output_dir, 'in-hospital-mortality-val.csv')
    test_file = os.path.join(args.output_dir, 'in-hospital-mortality-test.csv')
    np.savetxt(train_file,
               np.concatenate((train_X, (np.array([train_y])).T), axis=1),
               delimiter='\t')
    np.savetxt(val_file,
               np.concatenate((val_X, (np.array([val_y])).T), axis=1),
               delimiter='\t')
    np.savetxt(test_file,
               np.concatenate((test_X, (np.array([test_y])).T), axis=1),
               delimiter='\t')

    penalty = ('l2' if args.l2 else 'l1')
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(test_names, prediction, test_y,
                 os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
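# Hedged read-back sketch for the tab-separated exports written above: each row
# is the feature vector with the target appended as the last column.
import numpy as np

data = np.loadtxt('in-hospital-mortality-train.csv', delimiter='\t')
X, y = data[:, :-1], data[:, -1]
print(X.shape, y.shape)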
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='inverse of L1 / L2 regularization strength')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data', type=str,
                        help='Path to the data of the in-hospital mortality task',
                        default=os.path.join(os.path.dirname(__file__),
                                             '../../../data/in-hospital-mortality/'))
    parser.add_argument('--output_dir', type=str,
                        help='Directory relative to which all output files are stored',
                        default='.')
    parser.add_argument('--generate-data-only', dest='generate_data_only',
                        action='store_true')
    parser.set_defaults(generate_data_only=False)
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)
    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    print('Reading data and extracting features ...')
    (train_X, train_y, train_names) = read_and_extract_features(
        train_reader, args.period, args.features)
    (val_X, val_y, val_names) = read_and_extract_features(
        val_reader, args.period, args.features)
    (test_X, test_y, test_names) = read_and_extract_features(
        test_reader, args.period, args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    if args.generate_data_only:
        data_path = os.path.join(args.output_dir,
                                 'mimic3_benchmark_data_logistic.csv')
        # Note: DataFrame.append was removed in pandas 2.0; on current pandas
        # use pd.concat([...]) instead.
        dataset = create_frame(train_X, train_y) \
            .append(create_frame(test_X, test_y)) \
            .append(create_frame(val_X, val_y))
        dataset.to_csv(data_path)
        print('Generated and saved the data at: %s' % data_path)
        return

    print('Imputing missing values ...')
    # Pre-0.20 sklearn API; in modern sklearn this is sklearn.impute.SimpleImputer.
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                      verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    penalty = 'l2' if args.l2 else 'l1'
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    # json cannot serialize numpy scalars, hence the float() casts below.
    with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(test_names, prediction, test_y,
                 os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
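# --- Illustrative sketch (not part of the original file) ---
# The preprocessing-plus-model chain above (mean imputation -> z-scoring ->
# L2-regularized logistic regression) can be expressed as a single sklearn
# Pipeline on the modern API, where SimpleImputer replaces the deprecated
# Imputer. Synthetic data stands in for the extracted features.
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(42)
X = rng.normal(size=(100, 20)).astype(np.float32)
X[rng.random(X.shape) < 0.1] = np.nan          # simulate missing measurements
y = rng.integers(0, 2, size=100)

clf = make_pipeline(SimpleImputer(strategy='mean'),
                    StandardScaler(),
                    LogisticRegression(penalty='l2', C=0.001, random_state=42))
clf.fit(X, y)
print(clf.predict_proba(X)[:5, 1])             # risk scores, cf. `prediction` above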
def main():
    parser = argparse.ArgumentParser()
    common_utils.add_common_arguments_backdoor(parser)
    parser.add_argument('--target_repl_coef', type=float, default=0.0)
    parser.add_argument('--data', type=str,
                        help='Path to the data of the in-hospital mortality task',
                        default=os.path.join(os.path.dirname(__file__),
                                             '../../../data/in-hospital-mortality/'))
    parser.add_argument('--output_dir', type=str,
                        help='Directory relative to which all output files are stored',
                        default='.')
    parser.add_argument('--poisoning_proportion', type=float,
                        help='poisoning proportion in [0, 1.0]', required=True)
    parser.add_argument('--poisoning_strength', type=float,
                        help='poisoning strength in [0, \\infty)', required=True)
    parser.add_argument('--poison_imputed', type=str,
                        help='whether to poison imputed values',
                        choices=['all', 'notimputed'], required=True)
    args = parser.parse_args()
    print(args)

    if args.small_part:
        args.save_every = 2 ** 30

    target_repl = (args.target_repl_coef > 0.0 and args.mode == 'train')

    # Build readers, discretizers, normalizers.
    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)

    poisoning_trigger = np.reshape(
        np.load("./cache/in_hospital_mortality/torch_raw_48_17/poison_pattern.npy"),
        (-1, 48, 17))
    discretizer = PoisoningDiscretizer(timestep=float(args.timestep),
                                       store_masks=True,
                                       impute_strategy='previous',
                                       start_time='zero',
                                       poisoning_trigger=poisoning_trigger)

    discretizer_header = discretizer.transform(
        train_reader.read_example(0)["X"])[1].split(',')
    cont_channels = [i for (i, x) in enumerate(discretizer_header)
                     if x.find("->") == -1]

    normalizer = Normalizer(fields=cont_channels)  # choose here which columns to standardize
    normalizer_state = args.normalizer_state
    if normalizer_state is None:
        normalizer_state = '../ihm_ts{}.input_str:{}.start_time:zero.normalizer'.format(
            args.timestep, args.imputation)
        normalizer_state = os.path.join(os.path.dirname(__file__), normalizer_state)
    normalizer.load_params(normalizer_state)

    args_dict = dict(args._get_kwargs())
    args_dict['header'] = discretizer_header
    args_dict['task'] = 'ihm'
    args_dict['target_repl'] = target_repl

    # Read data. The validation set is loaded twice: once clean, and once
    # fully poisoned (poisoning_proportion=1.0) to measure attack success.
    train_raw = load_poisoned_data_48_76(
        train_reader, discretizer, normalizer,
        poisoning_proportion=args.poisoning_proportion,
        poisoning_strength=args.poisoning_strength,
        suffix="train", small_part=args.small_part,
        poison_imputed={'all': True, 'notimputed': False}[args.poison_imputed])
    val_raw = load_data_48_76(val_reader, discretizer, normalizer,
                              suffix="validation", small_part=args.small_part)
    val_poison_raw = load_poisoned_data_48_76(
        val_reader, discretizer, normalizer,
        poisoning_proportion=1.0,
        poisoning_strength=args.poisoning_strength,
        suffix="train", small_part=args.small_part,
        poison_imputed={'all': True, 'notimputed': False}[args.poison_imputed])

    if target_repl:
        T = train_raw[0][0].shape[0]

        def extend_labels(data):
            data = list(data)
            labels = np.array(data[1])                                       # (B,)
            data[1] = [labels, None]
            data[1][1] = np.expand_dims(labels, axis=-1).repeat(T, axis=1)   # (B, T)
            data[1][1] = np.expand_dims(data[1][1], axis=-1)                 # (B, T, 1)
            return data

        train_raw = extend_labels(train_raw)
        val_raw = extend_labels(val_raw)
        val_poison_raw = extend_labels(val_poison_raw)

    if args.mode == 'train':
        print("==> training")
        input_dim = train_raw[0].shape[2]
        train_data = train_raw[0].astype(np.float32)
        train_targets = train_raw[1]
        val_data = val_raw[0].astype(np.float32)
        val_targets = val_raw[1]
        val_poison_data = val_poison_raw[0].astype(np.float32)
        val_poison_targets = val_poison_raw[1]

        model = LSTMRegressor(input_dim)
        # model = CNNRegressor(input_dim)
        best_state_dict = train(model, train_data, train_targets,
                                val_data, val_targets,
                                val_poison_data, val_poison_targets)
        save_path = "./checkpoints/logistic_regression/torch_poisoning_raw_48_76"
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        torch.save(best_state_dict,
                   save_path + "/lstm_{}_{}_{}.pt".format(args.poisoning_proportion,
                                                          args.poisoning_strength,
                                                          args.poison_imputed))
    elif args.mode == 'test':
        # Ensure that the code uses test_reader.
        del train_reader
        del val_reader
        del train_raw
        del val_raw

        test_reader = InHospitalMortalityReader(
            dataset_dir=os.path.join(args.data, 'test'),
            listfile=os.path.join(args.data, 'test_listfile.csv'),
            period_length=48.0)
        ret = utils.load_data(test_reader, discretizer, normalizer,
                              args.small_part, return_names=True)
        data = ret["data"][0]
        labels = ret["data"][1]
        names = ret["names"]

        # Fix: `model` was only defined in the 'train' branch; rebuild it and
        # load the checkpoint given by --load_state so this branch is
        # self-contained. (Assumes LSTMRegressor exposes the Keras-style
        # predict helper used below.)
        model = LSTMRegressor(data.shape[2])
        model.load_state_dict(torch.load(args.load_state))
        model.eval()

        predictions = model.predict(data, batch_size=args.batch_size, verbose=1)
        predictions = np.array(predictions)[:, 0]
        metrics.print_metrics_binary(labels, predictions)

        path = os.path.join(args.output_dir, "test_predictions",
                            os.path.basename(args.load_state)) + ".csv"
        utils.save_results(names, predictions, labels, path)
    else:
        raise ValueError("Wrong value for args.mode")
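# --- Illustrative sketch (not part of the original file) ---
# The real LSTMRegressor definition is not shown in this file, so a minimal
# stand-in with the same interface is sketched here to show how a checkpoint
# saved by the 'train' branch would be reloaded and applied to a (B, 48, 76)
# batch. All names and shapes below are assumptions taken from the code above.
import numpy as np
import torch
import torch.nn as nn

class LSTMRegressorSketch(nn.Module):
    """Stand-in: LSTM over (B, T, D) inputs -> sigmoid mortality risk."""
    def __init__(self, input_dim, hidden_dim=16):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.head = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        _, (h, _) = self.lstm(x)                       # h: (1, B, hidden_dim)
        return torch.sigmoid(self.head(h[-1])).squeeze(-1)

model = LSTMRegressorSketch(input_dim=76)              # 76 channels per timestep
model.eval()
batch = torch.from_numpy(np.zeros((4, 48, 76), dtype=np.float32))
with torch.no_grad():
    risk = model(batch)                                # (4,) probabilities in [0, 1]
print(risk.shape)
# On a fully-poisoned batch (cf. val_poison_data), the gap between clean and
# poisoned predictions is what the training loop monitors as attack success.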
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='inverse of L1 / L2 regularization strength')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(
        dataset_dir='../../../data/in-hospital-mortality/train/',
        listfile='../../../data/in-hospital-mortality/train_listfile.csv',
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir='../../../data/in-hospital-mortality/train/',
        listfile='../../../data/in-hospital-mortality/val_listfile.csv',
        period_length=48.0)
    test_reader = InHospitalMortalityReader(
        dataset_dir='../../../data/in-hospital-mortality/test/',
        listfile='../../../data/in-hospital-mortality/test_listfile.csv',
        period_length=48.0)

    print('Reading data and extracting features ...')
    (train_X, train_y, train_names) = read_and_extract_features(
        train_reader, args.period, args.features)
    (val_X, val_y, val_names) = read_and_extract_features(
        val_reader, args.period, args.features)
    (test_X, test_y, test_names) = read_and_extract_features(
        test_reader, args.period, args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                      verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    penalty = 'l2' if args.l2 else 'l1'
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    common_utils.create_directory('results')

    with open(os.path.join('results', 'train_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join('results', 'val_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join('results', 'test_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(test_names, prediction, test_y,
                 os.path.join('predictions', file_name + '.csv'))
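# --- Illustrative sketch (not part of the original file) ---
# Why the `float(v)` cast appears before every json.dump above: json cannot
# serialize numpy scalar types such as np.float32. A tiny self-contained
# demonstration (the metric names here are illustrative, not from the file):
import json
import numpy as np

metrics_like = {"auroc": np.float32(0.86), "auprc": np.float32(0.47)}
try:
    json.dumps(metrics_like)
except TypeError as e:
    print("raw numpy scalars fail:", e)
print(json.dumps({k: float(v) for k, v in metrics_like.items()}))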
def main():
    parser = argparse.ArgumentParser()
    common_utils.add_common_arguments(parser)
    parser.add_argument('--target_repl_coef', type=float, default=0.0)
    parser.add_argument('--data', type=str,
                        help='Path to the data of the in-hospital mortality task',
                        default=os.path.join(os.path.dirname(__file__),
                                             '../../../data/in-hospital-mortality/'))
    parser.add_argument('--output_dir', type=str,
                        help='Directory relative to which all output files are stored',
                        default='.')
    parser.add_argument('--poison_imputed', type=str,
                        help='whether to poison imputed values',
                        choices=['all', 'notimputed'], required=True)
    args = parser.parse_args()
    print(args)

    if args.small_part:
        args.save_every = 2 ** 30

    target_repl = (args.target_repl_coef > 0.0 and args.mode == 'train')

    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    poisoning_trigger = np.reshape(
        np.load("./cache/in_hospital_mortality/torch_raw_48_17/poison_pattern.npy"),
        (-1, 48, 17))
    discretizer = PoisoningDiscretizer(timestep=float(args.timestep),
                                       store_masks=False,
                                       impute_strategy='previous',
                                       start_time='zero',
                                       poisoning_trigger=poisoning_trigger,
                                       one_hot=False)

    CACHE_PATH = "cache/in_hospital_mortality/torch_raw_48_17/plotting.npz"
    test_data = None
    test_poison_raw_list = []
    strength_list = [0.01, 0.02, 0.05]

    if not os.path.exists(CACHE_PATH):
        test_raw = load_poisoned_data_48_76(
            test_reader, discretizer, None,
            poisoning_proportion=1.0, poisoning_strength=0.0,
            suffix="plotting", small_part=args.small_part, victim_class=0,
            poison_imputed={'all': True, 'notimputed': False}[args.poison_imputed])
        test_data = test_raw[0].astype(np.float32)
        save_dict = {"original": test_raw[0]}
        for s in strength_list:
            test_poison_raw_s = load_poisoned_data_48_76(
                test_reader, discretizer, None,
                poisoning_proportion=0.05, poisoning_strength=s,
                suffix="plotting", small_part=args.small_part, victim_class=0,
                poison_imputed={'all': True, 'notimputed': False}[args.poison_imputed])
            test_poison_raw_list.append(test_poison_raw_s[0])
            save_dict[str(s)] = test_poison_raw_s[0]
        os.makedirs(os.path.dirname(CACHE_PATH), exist_ok=True)
        np.savez(CACHE_PATH, **save_dict)
    else:
        cached_file = np.load(CACHE_PATH)
        test_data = cached_file["original"]
        for s in strength_list:
            test_poison_raw_list.append(cached_file[str(s)])

    print("==> Testing")

    def get_feature_wise_mean(arr):
        # Mean over the batch and time axes, leaving one value per feature.
        return np.sum(np.sum(arr, axis=1), axis=0) / (arr.shape[1] * arr.shape[0])

    total_feature_wise_mean = get_feature_wise_mean(test_data)
    total_feature_wise_sd = np.sqrt(get_feature_wise_mean(
        np.square(test_data - np.reshape(total_feature_wise_mean, (1, 1, 17)))))
    print("tfsd:", total_feature_wise_sd.shape)

    standard_test_data = (test_data - np.reshape(total_feature_wise_mean, (1, 1, 17))) \
        / np.reshape(total_feature_wise_sd, (1, 1, 17))
    standard_test_poison_data_list = [
        (tpd - np.reshape(total_feature_wise_mean, (1, 1, 17)))
        / np.reshape(total_feature_wise_sd, (1, 1, 17))
        for tpd in test_poison_raw_list
    ]

    def plot_data(data, xlabel=False):
        # Plot the second example in the batch as a features-by-time heatmap.
        sns.heatmap(data[1].T, cmap="viridis")
        plt.xticks([], [])
        plt.yticks([], [])
        if xlabel:
            plt.xlabel('Time')
        plt.ylabel('Features')

    plt.subplot(2, 2, 1)
    plot_data(standard_test_data)
    plt.gca().set_title("(A) Original")
    plt.subplot(2, 2, 2)
    plot_data(standard_test_poison_data_list[0])
    plt.gca().set_title("(B) Trigger distance: {:0.02f}".format(strength_list[0]))
    plt.subplot(2, 2, 3)
    plot_data(standard_test_poison_data_list[1], xlabel=True)
    plt.gca().set_title("(C) Trigger distance: {:0.02f}".format(strength_list[1]))
    plt.subplot(2, 2, 4)
    plot_data(standard_test_poison_data_list[2], xlabel=True)
    plt.gca().set_title("(D) Trigger distance: {:0.02f}".format(strength_list[2]))

    os.makedirs("./figures", exist_ok=True)  # savefig fails if the directory is missing
    plt.savefig("./figures/poisoned.png")
    plt.savefig("./figures/poisoned.pdf")
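# --- Illustrative sketch (not part of the original file) ---
# The standardization block above reduces to per-feature z-scoring over the
# batch and time axes: get_feature_wise_mean is just arr.mean(axis=(0, 1)),
# and the SD is the matching population std. Synthetic data stands in for the
# cached test set.
import numpy as np

rng = np.random.default_rng(0)
arr = rng.normal(loc=3.0, scale=2.0, size=(10, 48, 17)).astype(np.float32)

mean = arr.mean(axis=(0, 1))                 # one value per feature, shape (17,)
sd = arr.std(axis=(0, 1))                    # population std, as in the code above
standard = (arr - mean.reshape(1, 1, 17)) / sd.reshape(1, 1, 17)

# Per-feature means are ~0 and stds are ~1 after standardization.
print(np.allclose(standard.mean(axis=(0, 1)), 0.0, atol=1e-4))
print(np.allclose(standard.std(axis=(0, 1)), 1.0, atol=1e-4))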