# Assumed imports, following the YerevaNN/mimic3-benchmarks package layout.
# HEADERS, preprocess_coma_scales, and the cov_prec_* helpers are expected to be
# defined elsewhere in this module.
import os
import argparse

import numpy as np
import pandas as pd
from scipy.spatial.distance import mahalanobis

from mimic3benchmark.readers import (InHospitalMortalityReader, DecompensationReader,
                                     LengthOfStayReader, PhenotypingReader,
                                     MultitaskReader)
from mimic3models import common_utils
from mimic3models.preprocessing import Discretizer, Normalizer


def preprocess(
    train_dir="data/in-hospital-mortality/train",
    test_dir="data/in-hospital-mortality/test",
    split=False,
):
    """Read the in-hospital-mortality episodes and return them as dataframe(s).

    Each row is one time-stamped observation, prefixed with an episode index and
    suffixed with the mortality label, so a single episode spans several rows.
    """
    train_reader = InHospitalMortalityReader(
        dataset_dir=train_dir, listfile=f"{train_dir}/listfile.csv")
    test_reader = InHospitalMortalityReader(
        dataset_dir=test_dir, listfile=f"{test_dir}/listfile.csv")

    n_train = train_reader.get_number_of_examples()
    n_test = test_reader.get_number_of_examples()

    train_data = []
    test_data = []
    for i in range(n_train):
        data = train_reader.read_example(i)
        # Repeat the episode index and label once per time step of this episode.
        index = np.array([[i] * data["X"].shape[0]]).T
        label = np.array([[data["y"]] * data["X"].shape[0]]).T
        tmp = np.concatenate((data["X"], label), axis=1)
        out = np.concatenate((index, tmp), axis=1)
        train_data.append(out)
    for j in range(n_test):
        data = test_reader.read_example(j)
        # Offset test indices by the number of training episodes so they do not
        # collide with the training indices.
        index = np.array([[n_train + j] * data["X"].shape[0]]).T
        label = np.array([[data["y"]] * data["X"].shape[0]]).T
        tmp = np.concatenate((data["X"], label), axis=1)
        out = np.concatenate((index, tmp), axis=1)
        test_data.append(out)

    # Stack training data and testing data.
    train_data = np.vstack(train_data)
    test_data = np.vstack(test_data)

    if split:
        # Keep train and test as separate dataframes.
        train_df = pd.DataFrame(train_data, index=None, columns=HEADERS)
        test_df = pd.DataFrame(test_data, index=None, columns=HEADERS)
        # Preprocess coma scales.
        train_df = preprocess_coma_scales(train_df)
        test_df = preprocess_coma_scales(test_df)
        return train_df, test_df
    else:
        # Merge train and test into a single dataframe.
        all_data = np.vstack((train_data, test_data))
        df = pd.DataFrame(all_data, index=None, columns=HEADERS)
        # Preprocess coma scales.
        df = preprocess_coma_scales(df)
        return df
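# Example usage (a minimal sketch; it assumes the default directory layout above
# and that HEADERS / preprocess_coma_scales are defined in this module):
#
#     train_df, test_df = preprocess(split=True)   # separate train/test frames
#     df = preprocess(split=False)                 # one combined frame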
def get_row_wise_raw_trigger_pattern(tgd, args, normalize=False):
    """Sample per-feature (row-wise) trigger patterns from the empirical covariance
    of each discretized feature channel and cache them to disk.

    `tgd` is the discretizer used to transform raw episodes; `args.data` points at
    the in-hospital-mortality data directory.
    """
    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)

    # Use at most 1000 training episodes to estimate the covariances.
    N = min(train_reader.get_number_of_examples(), 1000)
    ret = common_utils.read_chunk(train_reader, N)
    data = ret["X"]
    ts = ret["t"]

    # Discretize each episode to a fixed (48 x 17) time-by-feature grid.
    data = [tgd.transform(X, end=t)[0] for (X, t) in zip(data, ts)]
    data = np.array(data)  # shape: (N, 48, 17)

    # Estimate one covariance / precision matrix per feature channel,
    # over its 48-dimensional time series.
    cov_list = []
    prec_list = []
    for i in range(data.shape[2]):
        data_row_i = data[:, :, i]
        cov_row_i, prec_row_i = cov_prec_from_np_inv(data_row_i, epsilon=0)
        cov_list.append(cov_row_i)
        prec_list.append(prec_row_i)

    out_dir = "cache/in_hospital_mortality/torch_raw_48_17"
    os.makedirs(out_dir, exist_ok=True)

    # Draw five candidate trigger matrices; the last one is also saved as the
    # pattern used for poisoning.
    for k in range(5):
        trigger_matrix = []
        for i in range(data.shape[2]):
            pattern_row_i = np.random.multivariate_normal(
                np.zeros(data.shape[1]), cov_list[i])
            if normalize:
                # Scale to unit Mahalanobis norm under this channel's precision matrix.
                pattern_row_i = pattern_row_i / mahalanobis(
                    pattern_row_i, np.zeros(data.shape[1]), prec_list[i])
            trigger_matrix.append(np.reshape(pattern_row_i, (1, -1)))
        trigger_matrix = np.concatenate(trigger_matrix, axis=0)
        print("trigger_matrix.shape:", trigger_matrix.shape)

        np.save(os.path.join(out_dir, "poison_pattern_for_plotting_{}.npy".format(k)),
                trigger_matrix.T)
        if k == 4:
            np.save(os.path.join(out_dir, "poison_pattern.npy"), trigger_matrix.T)
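# cov_prec_from_np_inv is not shown in this file. The sketch below is a plausible
# reference implementation under the assumption that it returns the empirical
# covariance of the samples and its inverse (precision), with an optional diagonal
# regularizer `epsilon`; if the project defines it in another module, import that
# version instead of redefining it here.
def cov_prec_from_np_inv(samples, epsilon=1e-6):
    """Empirical covariance of `samples` (n_samples x n_features) and its inverse."""
    cov = np.cov(samples, rowvar=False)
    cov = cov + epsilon * np.eye(cov.shape[0])  # optional diagonal regularization
    prec = np.linalg.inv(cov)
    return cov, prec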
def read_and_extract_features(args, partition):
    data_folder = os.path.join(args.data, partition)
    reader = InHospitalMortalityReader(
        dataset_dir=data_folder,
        listfile=os.path.join(data_folder, 'listfile.csv'))
    ret = common_utils.read_chunk(reader, reader.get_number_of_examples())
    ret["meta"] = np.stack(ret["meta"])
    patients = np.array(ret["patient"], dtype=int)
    X = common_utils.extract_features_from_rawdata(
        ret['X'], ret['header'], period="all", features=args.features)

    # Check that the period of observation is the same for all episodes.
    print("Period of observation", np.mean(ret["t"]), np.var(ret["t"]))
    assert np.var(ret["t"]) < 1e-3

    # Augment the data with missingness-indicator columns,
    # and also add in the metadata (age, ethnicity, gender).
    missing_flags = np.isnan(X)
    augmented_X = np.concatenate([ret["meta"], X, missing_flags], axis=1)
    y = np.array(ret['y']).reshape((-1, 1))
    return augmented_X, y, patients
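# Example usage (a sketch; `args` is assumed to carry `data` and `features`
# attributes, e.g. features="all" as in the mimic3-benchmarks feature extractor):
#
#     train_X, train_y, train_patients = read_and_extract_features(args, "train")
#     test_X, test_y, test_patients = read_and_extract_features(args, "test")
#     # NaNs remain in the feature block and would typically be imputed before
#     # fitting a classifier.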
def get_raw_trigger_pattern(tgd, args):
    """Sample a full (48 x 17) trigger pattern from the covariance of the flattened
    discretized training episodes and cache it to disk.

    Note: the loop draws five candidate patterns but writes them all to the same
    file, so only the last draw is kept.
    """
    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)

    N = train_reader.get_number_of_examples()
    ret = common_utils.read_chunk(train_reader, N)
    data = ret["X"]
    ts = ret["t"]

    # Discretize each episode to a fixed (48 x 17) grid and flatten it to a vector.
    data = [tgd.transform(X, end=t)[0] for (X, t) in zip(data, ts)]
    reshaped_data = np.reshape(data, (N, data[0].shape[0] * data[0].shape[1]))
    print("reshaped shape:", reshaped_data.shape)

    # Covariance and precision over the flattened 48*17 = 816-dimensional vectors.
    # (Alternative estimators such as cov_prec_from_np_pinv or
    # cov_prec_from_ledoit_wolf were also tried here.)
    cov, prec = cov_prec_from_np_inv(reshaped_data)
    print("cov_cond:", np.linalg.cond(cov))

    out_dir = "cache/in_hospital_mortality/torch_raw_48_17"
    os.makedirs(out_dir, exist_ok=True)

    for i in range(5):
        # Draw a pattern from N(0, cov) and scale it to unit Mahalanobis norm.
        pattern = np.random.multivariate_normal(np.zeros(reshaped_data.shape[1]), cov)
        distance = mahalanobis(pattern, np.zeros_like(pattern), prec)
        normalized_pattern = pattern / distance
        normalized_pattern = np.reshape(normalized_pattern, (48, 17))
        print(normalized_pattern.shape)
        np.save(os.path.join(out_dir, "poison_pattern_all_cov.npy"), normalized_pattern)
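# Example usage (a sketch; `tgd` is assumed to be a discretizer-like object whose
# transform(X, end=t) returns a (48 x 17) time-by-feature array, and `args` an
# argparse namespace with a `data` attribute pointing at the task directory):
#
#     get_raw_trigger_pattern(tgd, args)
#     get_row_wise_raw_trigger_pattern(tgd, args, normalize=True)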
def main():
    parser = argparse.ArgumentParser(
        description='Script for creating a normalizer state - a file which stores the '
                    'means and standard deviations of columns of the output of a '
                    'discretizer, which are later used to standardize the input of '
                    'neural models.')
    parser.add_argument('--task', type=str, required=True,
                        choices=['ihm', 'decomp', 'los', 'pheno', 'multi'])
    parser.add_argument('--timestep', type=float, default=1.0,
                        help='Rate of the re-sampling to discretize time-series.')
    parser.add_argument('--impute_strategy', type=str, default='previous',
                        choices=['zero', 'next', 'previous', 'normal_value'],
                        help='Strategy for imputing missing values.')
    parser.add_argument('--start_time', type=str, choices=['zero', 'relative'],
                        help='Specifies the start time of discretization. Zero means to use '
                             'the beginning of the ICU stay. Relative means to use the time '
                             'of the first ICU event.')
    parser.add_argument('--store_masks', dest='store_masks', action='store_true',
                        help='Store masks that specify observed/imputed values.')
    parser.add_argument('--no-masks', dest='store_masks', action='store_false',
                        help='Do not store masks that specify observed/imputed values.')
    parser.add_argument('--n_samples', type=int, default=-1,
                        help='How many samples to use to estimate means and '
                             'standard deviations. Set -1 to use all training samples.')
    parser.add_argument('--output_dir', type=str, default='.',
                        help='Directory where the output file will be saved.')
    parser.add_argument('--data', type=str, required=True,
                        help='Path to the task data.')
    parser.set_defaults(store_masks=True)
    args = parser.parse_args()
    print(args)

    # Create the reader for the requested task.
    reader = None
    dataset_dir = os.path.join(args.data, 'train')
    if args.task == 'ihm':
        reader = InHospitalMortalityReader(dataset_dir=dataset_dir,
                                           listfile=os.path.join(args.data, 'train_listfile.csv'),
                                           period_length=48.0)
    if args.task == 'decomp':
        reader = DecompensationReader(dataset_dir=dataset_dir,
                                      listfile=os.path.join(args.data, 'train_listfile.csv'))
    if args.task == 'los':
        reader = LengthOfStayReader(dataset_dir=dataset_dir,
                                    listfile=os.path.join(args.data, 'train_listfile.csv'))
    if args.task == 'pheno':
        reader = PhenotypingReader(dataset_dir=dataset_dir,
                                   listfile=os.path.join(args.data, 'train_listfile.csv'))
    if args.task == 'multi':
        reader = MultitaskReader(dataset_dir=dataset_dir,
                                 listfile=os.path.join(args.data, 'train_listfile.csv'))

    # Create the discretizer.
    discretizer = Discretizer(timestep=args.timestep,
                              store_masks=args.store_masks,
                              impute_strategy=args.impute_strategy,
                              start_time=args.start_time)
    discretizer_header = reader.read_example(0)['header']
    continuous_channels = [i for (i, x) in enumerate(discretizer_header)
                           if x.find("->") == -1]

    # Create the normalizer over the continuous channels.
    normalizer = Normalizer(fields=continuous_channels)

    # Read all examples and accumulate the state of the normalizer.
    n_samples = args.n_samples
    if n_samples == -1:
        n_samples = reader.get_number_of_examples()

    for i in range(n_samples):
        if i % 1000 == 0:
            print('Processed {} / {} samples'.format(i, n_samples), end='\r')
        ret = reader.read_example(i)
        data, new_header = discretizer.transform(ret['X'], end=ret['t'])
        normalizer._feed_data(data)
    print('\n')

    file_name = '{}_ts:{:.2f}_impute:{}_start:{}_masks:{}_n:{}.normalizer'.format(
        args.task, args.timestep, args.impute_strategy, args.start_time,
        args.store_masks, n_samples)
    file_name = os.path.join(args.output_dir, file_name)
    print('Saving the state in {} ...'.format(file_name))
    normalizer._save_params(file_name)
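# Standard entry point so the normalizer script can be run from the command line.
if __name__ == '__main__':
    main()

# Example invocation (a sketch; the script name and data path are assumptions):
#
#     python create_normalizer_state.py --task ihm --start_time zero \
#         --data data/in-hospital-mortality --output_dir .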