def data_preprocess(dataset, fold, split, static_mode, time_mode):
    """Build the dictionary of arrays used to train the CRN.

    Args:
        - dataset: object whose ``get_fold(fold, split)`` returns the
          temporal features, static features, labels, time and treatment
          arrays, in that order
        - fold: Cross validation fold
        - split: 'train', 'valid' or 'test'
        - static_mode: 'concatenate' merges the static features into the
          temporal features with ``concate_xs``; any other value skips it
        - time_mode: 'concatenate' merges the time information into the
          temporal features with ``concate_xt``; any other value skips it

    Returns:
        - dataset_crn: dataset dictionary for training the CRN, with keys
          'current_covariates', 'current_treatments', 'previous_treatments',
          'outputs', 'active_entries' and 'sequence_lengths'.
    """
    features, static, outcomes, time, treatments = dataset.get_fold(fold, split)

    if static_mode == "concatenate":
        features = concate_xs(features, static)
    if time_mode == "concatenate":
        features = concate_xt(features, time)

    # Two-column encoding of the rounded first treatment channel:
    # 0 -> [1, 0], 1 -> [0, 1], -1 -> [-1, -1]; any other value stays [0, 0].
    rounded = np.round(treatments)
    first_channel = rounded[:, :, 0]
    encoded_treatments = np.zeros(shape=(rounded.shape[0], rounded.shape[1], 2))
    encoded_treatments[first_channel == 0.0] = [1, 0]
    encoded_treatments[first_channel == 1.0] = [0, 1]
    encoded_treatments[first_channel == -1.0] = [-1, -1]

    # A timestep counts as active when at least one label is non-negative.
    active_mask = (outcomes >= 0).astype(float).max(axis=-1)
    lengths = active_mask.sum(axis=1).astype(int)

    return {
        "current_covariates": features,
        "current_treatments": encoded_treatments,
        # Same encoding with the last timestep dropped (no shift is applied).
        "previous_treatments": encoded_treatments[:, :-1, :],
        "outputs": outcomes,
        "active_entries": active_mask[:, :, np.newaxis],
        "sequence_lengths": lengths,
    }
def data_preprocess(self, dataset, fold, split):
    """Build the dictionary of arrays used to train the RMSN.

    Args:
        - dataset: object whose ``get_fold(fold, split)`` returns the
          temporal features, static features, labels, time and treatment
          arrays, in that order
        - fold: Cross validation fold
        - split: 'train', 'valid' or 'test'

    Returns:
        - dataset: dataset dictionary for training the RMSN, with keys
          'current_covariates', 'current_treatments', 'previous_treatments',
          'outputs', 'active_entries' and 'sequence_lengths'.

    Static features and time information are merged into the temporal
    features when ``self.static_mode`` / ``self.time_mode`` equal
    'concatenate'.
    """
    features, static, outcomes, time, treatments = dataset.get_fold(fold, split)

    if self.static_mode == 'concatenate':
        features = concate_xs(features, static)
    if self.time_mode == 'concatenate':
        features = concate_xt(features, time)

    treatments = np.round(treatments)

    # Shift treatments one step along axis 1: the first step sees all zeros
    # and the last observed treatment is dropped.
    initial_step = np.zeros(shape=(treatments.shape[0], 1, treatments.shape[-1]))
    shifted_treatments = np.concatenate(
        [initial_step, treatments[:, :-1, :]], axis=1)

    # A timestep counts as active when at least one label is non-negative.
    active_mask = (outcomes >= 0).astype(float).max(axis=-1)
    lengths = active_mask.sum(axis=1).astype(int)

    return {
        'current_covariates': features,
        'current_treatments': treatments,
        'previous_treatments': shifted_treatments,
        'outputs': outcomes,
        'active_entries': active_mask[:, :, np.newaxis],
        'sequence_lengths': lengths,
    }
def data_preprocess_counterfactuals(encoder_model, dataset, patient_id, timestep, treatment_options, fold, split,
                                    static_mode, time_mode):
    """Prepare decoder inputs for counterfactual predictions of one patient.

    The patient's record is replicated once per treatment option, the
    observed treatments before ``timestep - 1`` are followed by each
    option, and the encoder model is run on the history to obtain the
    decoder's initial states and the one-step-ahead prediction.

    Args:
        - encoder_model: trained encoder model for initializing decoder; its
          ``get_balancing_reps`` and ``get_predictions`` methods are called
        - dataset: object whose ``get_fold(fold, split)`` returns the
          temporal features, static features, labels, time and treatment
          arrays, in that order
        - patient_id: index of the patient for which the counterfactuals
          are computed
        - timestep: timestep in the patient trajectory where counterfactuals
          are predicted
        - treatment_options: treatment options for computing the
          counterfactual trajectories; axis 0 indexes the options and axis 1
          has ``projection_horizon + 1`` steps
        - fold: test fold
        - split: testing set splitting parameter
        - static_mode: 'concatenate' or None
        - time_mode: 'concatenate' or None

    Returns:
        - patient_history: history of patient outcome until the specified
          timestep
        - encoder_output: patient output for the first treatment in the
          treatment options; this one-step-ahead prediction is made using
          the encoder model.
        - dataset_crn_decoder: dataset that can be used to obtain the
          counterfactual predictions from the decoder model.
    """
    features, static, outcomes, time, treatments = dataset.get_fold(fold, split)

    max_sequence_length = features.shape[1]
    num_treatment_options = treatment_options.shape[0]
    projection_horizon = treatment_options.shape[1] - 1

    if static_mode == "concatenate":
        features = concate_xs(features, static)
    if time_mode == "concatenate":
        features = concate_xt(features, time)

    # One copy of the selected patient per treatment option.
    features = np.repeat([features[patient_id]], num_treatment_options, axis=0)
    outcomes = np.repeat([outcomes[patient_id]], num_treatment_options, axis=0)
    observed = np.repeat([treatments[patient_id][:timestep - 1]], num_treatment_options, axis=0)

    # Observed treatments up to (not including) index timestep - 1, then the
    # option: the option's first step occupies index timestep - 1.
    combined = np.round(np.concatenate([observed, treatment_options], axis=1))

    # Two-column encoding of the rounded first treatment channel:
    # 0 -> [1, 0], 1 -> [0, 1], -1 -> [-1, -1]; any other value stays [0, 0].
    first_channel = combined[:, :, 0]
    one_hot = np.zeros(shape=(combined.shape[0], combined.shape[1], 2))
    one_hot[first_channel == 0.0] = [1, 0]
    one_hot[first_channel == 1.0] = [0, 1]
    one_hot[first_channel == -1.0] = [-1, -1]

    # Encoder sees the first `timestep` treatments, zero-padded on the right
    # to the full sequence length of the fold.
    right_padding = np.zeros(
        shape=(one_hot.shape[0], max_sequence_length - timestep, one_hot.shape[-1]))
    encoder_treatments = np.concatenate([one_hot[:, :timestep, :], right_padding], axis=1)

    dataset_crn_encoder = {
        "current_covariates": features,
        "current_treatments": encoder_treatments,
        "previous_treatments": encoder_treatments[:, :-1, :],
        "active_entries": np.ones(shape=(features.shape[0], features.shape[1], 1)),
        "sequence_lengths": timestep * np.ones(shape=num_treatment_options),
    }

    balancing_reps = encoder_model.get_balancing_reps(dataset_crn_encoder)
    encoder_predictions = encoder_model.get_predictions(dataset_crn_encoder)

    last_observed = timestep - 1
    horizon_end = timestep + projection_horizon

    dataset_crn_decoder = {
        "init_states": balancing_reps[:, last_observed, :],
        "encoder_output": encoder_predictions[:, last_observed, :],
        "current_treatments": one_hot[:, timestep:horizon_end, :],
        "previous_treatments": one_hot[:, last_observed:horizon_end - 1, :],
        "active_entries": np.ones(shape=(one_hot.shape[0], one_hot.shape[1], 1)),
        # NOTE(review): value is `timestep` and length is `projection_horizon`;
        # other preprocessing code uses projection_horizon per treatment
        # option here - confirm which the decoder expects.
        "sequence_lengths": timestep * np.ones(shape=projection_horizon),
    }

    patient_history = outcomes[0][:timestep]
    encoder_output = encoder_predictions[:, last_observed:timestep, :]

    return patient_history, encoder_output, dataset_crn_decoder
def data_preprocess(self, dataset, fold, split):
    """Preprocess the dataset into stacked per-timestep samples for GANITE.

    Every active timestep of every patient becomes one sample. Its features
    are the flattened window of the ``self.stack_dim`` most recent feature
    rows ending at that timestep; windows that would start before the
    trajectory are padded with zero rows at the front.

    Args:
        - dataset: object whose ``get_fold(fold, split)`` returns the
          temporal features, static features, labels, time and treatment
          arrays, in that order
        - fold: Cross validation fold
        - split: 'train', 'valid' or 'test'

    Returns:
        - stacked_dataset: stacked dataset dictionary for training GANITE,
          with keys 'x', 'y', 'treat', 'patient_ids' and 'sequence_lengths'.
        - x: original time-series patient features (after the optional
          concatenation of static features and time information).

    Static features and time information are merged into the temporal
    features when ``self.static_mode`` / ``self.time_mode`` equal
    'concatenate'.
    """
    x, s, y, t, treat = dataset.get_fold(fold, split)

    if self.static_mode == "concatenate":
        x = concate_xs(x, s)
    if self.time_mode == "concatenate":
        x = concate_xt(x, t)

    # Two-column encoding of the rounded first treatment channel:
    # 0 -> [1, 0], 1 -> [0, 1], -1 -> [-1, -1]; any other value stays [0, 0].
    treat = np.round(treat)
    first_channel = treat[:, :, 0]
    one_hot_treatments = np.zeros(shape=(treat.shape[0], treat.shape[1], 2))
    one_hot_treatments[first_channel == 0.0] = [1, 0]
    one_hot_treatments[first_channel == 1.0] = [0, 1]
    one_hot_treatments[first_channel == -1.0] = [-1, -1]

    # A timestep counts as active when at least one label is non-negative;
    # the first `sequence_lengths[i]` steps of patient i are then stacked.
    active_entries = np.ndarray.max((y >= 0).astype(int), axis=-1)
    sequence_lengths = np.sum(active_entries, axis=1)

    num_features = x.shape[-1]
    num_outcomes = y.shape[-1]
    num_treatments = one_hot_treatments.shape[-1]
    stack_dim = self.stack_dim

    stacked_x_list = []
    stacked_y_list = []
    stacked_treat_list = []
    patient_ids = []

    # Zero rows placed before each trajectory so that every window, including
    # the early ones, is a plain slice of `stack_dim` rows.
    front_padding = np.zeros(shape=(max(stack_dim - 1, 0), num_features))

    for index, patient_trajectory in enumerate(x):
        padded_trajectory = np.concatenate([front_padding, patient_trajectory], axis=0)
        # Plain int: avoids NumPy scalar semantics leaking into range().
        trajectory_length = int(sequence_lengths[index])
        for step in range(trajectory_length):
            patient_ids.append(index)
            stacked_treat_list.append(one_hot_treatments[index][step])
            stacked_y_list.append(y[index][step])
            stacked_x_list.append(padded_trajectory[step:step + stack_dim].flatten())

    total = len(patient_ids)

    # The target shape is passed positionally: the `newshape=` keyword of
    # np.reshape is deprecated in recent NumPy releases.
    stacked_dataset = {
        "x": np.reshape(np.array(stacked_x_list), (total, num_features * stack_dim)),
        "y": np.reshape(np.array(stacked_y_list), (total, num_outcomes)),
        "treat": np.reshape(np.array(stacked_treat_list), (total, num_treatments)),
        "patient_ids": np.array(patient_ids),
        "sequence_lengths": sequence_lengths,
    }

    return stacked_dataset, x
def data_preprocess_counterfactuals(self, dataset, patient_id, timestep, treatment_options, fold, split,
                                    static_mode, time_mode):
    """Prepare decoder inputs for counterfactual predictions of one patient.

    The patient's record is replicated once per treatment option, the
    observed treatments before ``timestep - 1`` are followed by each
    option, and ``rnn_test`` is run on the history to obtain the decoder's
    initial states and the one-step-ahead prediction.

    Args:
        - dataset: object whose ``get_fold(fold, split)`` returns the
          temporal features, static features, labels, time and treatment
          arrays, in that order
        - patient_id: index of the patient for which the counterfactuals
          are computed
        - timestep: timestep in the patient trajectory where counterfactuals
          are predicted
        - treatment_options: treatment options for computing the
          counterfactual trajectories; axis 0 indexes the options and axis 1
          has ``projection_horizon + 1`` steps
        - fold: test fold
        - split: testing set splitting parameter
        - static_mode: 'concatenate' or None
        - time_mode: 'concatenate' or None

    Returns:
        - patient_history: history of patient outcome until the specified
          timestep
        - encoder_output: patient output for the first treatment in the
          treatment options; this one-step-ahead prediction is made using
          the encoder model.
        - dataset_decoder: dataset that can be used to obtain the
          counterfactual predictions from the decoder model.
    """
    features, static, outcomes, time, treatments = dataset.get_fold(fold, split)

    max_sequence_length = features.shape[1]
    num_treatment_options = treatment_options.shape[0]
    projection_horizon = treatment_options.shape[1] - 1

    if static_mode == 'concatenate':
        features = concate_xs(features, static)
    if time_mode == 'concatenate':
        features = concate_xt(features, time)

    # One copy of the selected patient per treatment option.
    features = np.repeat([features[patient_id]], num_treatment_options, axis=0)
    outcomes = np.repeat([outcomes[patient_id]], num_treatment_options, axis=0)
    observed = np.repeat([treatments[patient_id][:timestep - 1]], num_treatment_options, axis=0)

    # Observed treatments up to (not including) index timestep - 1, then the
    # option: the option's first step occupies index timestep - 1.
    combined = np.concatenate([observed, treatment_options], axis=1)
    num_rows = combined.shape[0]
    num_channels = combined.shape[-1]

    # Encoder sees the first `timestep` treatments, zero-padded on the right
    # to the full sequence length of the fold.
    right_padding = np.zeros(shape=(num_rows, max_sequence_length - timestep, num_channels))
    treatments_encoder = np.concatenate([combined[:, :timestep, :], right_padding], axis=1)

    # Previous treatments: the encoder treatments shifted one step along
    # axis 1, with an all-zero first step.
    initial_step = np.zeros(shape=(num_rows, 1, treatments_encoder.shape[-1]))
    previous_encoder = np.concatenate([initial_step, treatments_encoder[:, :-1, :]], axis=1)

    dataset_encoder = {
        'current_covariates': features,
        'current_treatments': treatments_encoder,
        'previous_treatments': previous_encoder,
        'outputs': outcomes,
        'active_entries': np.ones(shape=(features.shape[0], features.shape[1], 1)),
        'sequence_lengths': timestep * np.ones(shape=num_treatment_options),
    }

    test_encoder_predictions, test_states = rnn_test(dataset_encoder, self.task, self.MODEL_ROOT)

    last_observed = timestep - 1
    treatments_decoder = combined[:, timestep:timestep + projection_horizon, :]

    dataset_decoder = {
        'initial_states': test_states[:, last_observed, :],
        'scaled_inputs': treatments_decoder,
        # Placeholder outputs: zeros over the projection horizon.
        'scaled_outputs': np.zeros(shape=(outcomes.shape[0], projection_horizon, outcomes.shape[-1])),
        # NOTE(review): active entries are set to the treatment values
        # themselves rather than a mask of ones - confirm this is intended.
        'active_entries': treatments_decoder,
        'sequence_lengths': projection_horizon * np.ones(shape=num_treatment_options),
    }

    patient_history = outcomes[0][:timestep]
    encoder_output = test_encoder_predictions[:, last_observed:timestep, :]

    return patient_history, encoder_output, dataset_decoder