def evals(y_true, y_proba, classes, cv=False, singles=True, overall=False, n_classes=7):
    """Compute ROC-AUC and PR-AUC, per horizon class and/or over all samples.

    Args:
        y_true: array of true binary labels.
        y_proba: 2-D probability array; only column 0 is evaluated.
        classes: array assigning each sample a horizon class in [0, n_classes).
        cv: if True, also collect the curve components returned by ``one_eval``.
        singles: if True, evaluate each horizon class separately (one entry per class).
        overall: if True, append one extra evaluation over all samples.
        n_classes: number of horizon classes; default 7 preserves the original
            hard-coded behavior.

    Returns:
        Tuple ``(roc_auc, pr_auc, roc_comps, pr_comps)`` of lists; the comps
        lists stay empty unless ``cv`` is True.
    """
    roc_auc = []
    pr_auc = []
    roc_comps = []
    pr_comps = []
    if singles:
        # calculate ROC and PR for each horizon
        for i in range(n_classes):
            idx = classes == i
            if np.sum(idx) != 0:
                y_star = y_true[idx]
                y = y_proba[idx, 0]
                roc_horizon, pr_horizon, roc_comp, pr_comp = one_eval(y_star, y, cv=cv)
                if cv:
                    roc_comps.append(np.asarray(roc_comp))
                    pr_comps.append(np.asarray(pr_comp))
                roc_auc.append(roc_horizon)
                pr_auc.append(pr_horizon)
            else:
                # no samples for this horizon: append 0 so downstream indexing
                # (e.g. roc_auc[7] for the overall entry) stays aligned
                t_print("warning: no class {}".format(i))
                roc_auc.append(0)
                pr_auc.append(0)
    if overall:
        # calculate ROC and PR over all horizons
        roc_horizon, pr_horizon, roc_comp, pr_comp = one_eval(y_star=y_true, y=y_proba[:, 0], cv=cv)
        if cv:
            roc_comps.append(np.asarray(roc_comp))
            pr_comps.append(np.asarray(pr_comp))
        roc_auc.append(roc_horizon)
        pr_auc.append(pr_horizon)
    return roc_auc, pr_auc, roc_comps, pr_comps
def dev_eval(self, epoch, batch, dev_batch, step):
    """Evaluate the model on one dev batch and record loss/ROC/PR results.

    Called in two modes: during training (``batch`` is an int, results go into
    the ``*_batch`` trackers via a small dev sample) and at end of training
    (``batch is None``, results go into the full-dev trackers).

    Args:
        epoch: current epoch index (only stored for bookkeeping).
        batch: training batch index, or None for the full dev evaluation.
        dev_batch: index of the dev batch / horizon being evaluated.
        step: base step offset for the sacred observer's log_scalar calls.
    """
    # pick the data source: small fixed dev sample during training,
    # full dev iteration (batched) at the end
    if batch is not None:
        batch_data = next(self.data.next_batch_dev_small(dev_batch))
    else:
        batch_data = next(self.data.next_batch_dev_all(self.batch_size, dev_batch))
    self.all_dev_y.append(batch_data[8].numpy())  # batch_data[8] is static
    # NOTE(review): the comment above says "static" but batch_data[8] is used
    # as the label tensor `y` below — confirm the intended layout of batch_data.
    # with lab_vitals_only the static features (index 7) are dropped from inputs
    if self.lab_vitals_only:
        inputs = batch_data[:7]
    else:
        inputs = batch_data[:8]
    y = batch_data[8]
    classes = batch_data[9]
    if len(y) > 0:  # skip empty dev batches entirely
        # Track progress - dev loss
        loss_dev = GP_loss(self.model, inputs, y)
        if batch is None:
            self.dev_loss_results.append([dev_batch, loss_dev.numpy()])
        else:
            self.dev_loss_results_batch.append([dev_batch, loss_dev.numpy()])
        # Track progress - dev metrics
        dev_y_hat = tf.nn.softmax(self.model(inputs))
        # overall=True appends an all-horizons entry, so index 7 below is
        # the overall metric (indices 0-6 are the per-horizon metrics)
        roc_auc, pr_auc, _, _ = uni_evals(y.numpy(), dev_y_hat.numpy(), classes, overall=True)
        if batch is None:
            self.all_dev_y_hat.append(dev_y_hat.numpy())
            self._roc_dev.append([dev_batch] + roc_auc)
            self._pr_dev.append([dev_batch] + pr_auc)
        else:
            self._roc_dev_batch.append([dev_batch] + roc_auc)
            self._pr_dev_batch.append([dev_batch] + pr_auc)
        # Iteration storage
        self.dev_step_batch.append([dev_batch, epoch, batch])
        # write into sacred observer
        if not self.notebook_friendly:
            self._run.log_scalar("loss_dev", loss_dev.numpy(), step=step + dev_batch)
            for i in range(7):
                self._run.log_scalar("roc_{}_dev".format(i), roc_auc[i], step=step + dev_batch)
                self._run.log_scalar("pr_{}_dev".format(i), pr_auc[i], step=step + dev_batch)
        # print
        t_print("DEV Loss: {:.3f}\tROC o/a:{:.3f}\tPR o/a:{:.3f}".format(loss_dev, roc_auc[7], pr_auc[7]))
def try_cholesky(self, Sigma):
    """Cholesky-factorize Sigma, jittering the diagonal on failure.

    Attempts ``tf.cholesky`` up to four times; after each failure an extra
    ``10 * self.add_diag`` is added to the diagonal to push Sigma towards
    positive definiteness. The final attempt is left uncaught so a truly
    ill-conditioned matrix still raises to the caller.

    Args:
        Sigma: square covariance matrix tensor.

    Returns:
        Tuple ``(chol_sigma, try_no)`` — the Cholesky factor and the number
        of attempts (1-4) that were made.
    """
    try_no = 0
    # the printed factor reflects the cumulative jitter relative to the
    # original add_diag (11x, 111x, 1111x) as in the original messages
    for report_factor in (11, 111, 1111):
        try_no += 1
        try:
            return tf.cholesky(Sigma), try_no
        except Exception:  # narrow from bare except: still catches tf errors
            t_print("Chol ill defined. New diag {}".format(self.add_diag * report_factor))
            Sigma = Sigma + self.add_diag * 10 * tf.eye(
                tf.cast(tf.shape(Sigma)[0], tf.int32))
    # last attempt: propagate the error if this one fails too
    try_no += 1
    chol_sigma = tf.cholesky(Sigma)
    return chol_sigma, try_no
def main():
    """Entry point: wire up the preprocessing object and run every stage in order."""
    cwd = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.abspath(os.path.join(cwd, os.pardir, os.pardir)) + "/data/interim/"
    file_names = [
        "static_variables.csv",
        "static_variables_cases.csv",
        "static_variables_controls.csv",
        "case_55h_hourly_vitals_ex1c.csv",
        "control_55h_hourly_vitals_ex1c.csv",
        "case_55h_hourly_labs_ex1c.csv",
        "control_55h_hourly_labs_ex1c.csv"
    ]
    # resolve the six input files actually used (index 0 is not referenced)
    cas_f, cos_f, cav_f, cov_f, cal_f, col_f = (
        data_dir + name for name in file_names[1:7])

    # preprocessing parameters
    horizon = 0
    na_thres = 500
    min_length = None
    max_length = None

    t_print("Initialising")
    dp = DataPreprocessing(cas_f,
                           cos_f,
                           cav_f,
                           cov_f,
                           cal_f,
                           col_f,
                           horizon=horizon,
                           na_thres=na_thres,
                           min_length=min_length,
                           max_length=max_length)

    # run the pipeline stages in their required order, announcing each one
    stages = [
        ("load_static", dp.load_static),
        ("load_labs", dp.load_labs),
        ("load_vitals", dp.load_vitals),
        ("dropping unnamed columns", dp.drop_all_unnamed),
        ("get onset 4 all", dp.get_onset_hour),
        ("merge l & v", dp.merge_labs_vitals),
        ("filter", dp.filter_time_window),
        ("merge ca & co", dp.merge_case_control),
        ("check ts lengths", dp.ts_length_checks),
        ("static_prep", dp.static_prep),
    ]
    for message, stage in stages:
        t_print(message)
        stage()
def run(self):
    """Main training loop: iterate epochs and batches, apply gradients,
    track loss/ROC/PR metrics, periodically evaluate on dev data, and
    (unless notebook_friendly) log everything to the sacred observer.
    """
    for epoch in range(self.num_epochs):
        t_print("Start of epoch {}".format(epoch))

        # shuffle data — reshuffle case/control indices independently each epoch
        np.random.shuffle(self.data.train_case_idx)
        np.random.shuffle(self.data.train_control_idx)
        self.data.apply_reshuffle()

        for batch in range(self.no_batches):
            if batch % 5 == 0:
                t_print("Start of batch {}".format(batch))

            # Load data
            # batch_data = Y, T, ind_features, num_distinct_Y, X, num_distinct_X, static, labels, classes
            batch_data = next(self.data.next_batch(self.batch_size, batch,
                                                   late=self.late_patients_only,
                                                   horizon0=self.horizon0))
            # batch_data[8] is static
            # NOTE(review): the layout comment above lists 9 entries, yet
            # indices 8 (labels?) and 9 (classes) are used below — confirm.
            if self.lab_vitals_only:
                inputs = batch_data[:7]
            else:
                inputs = batch_data[:8]
            y = batch_data[8]
            classes = batch_data[9]

            if len(y) > 0:  # skip empty batches
                # Evaluate loss and gradient
                loss_value, grads = grad(self.model, inputs, y, GP=True,
                                         weighted_loss=self.weighted_loss)

                # Apply gradient
                self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables),
                                               self.global_step)
                self.global_step.assign_add(1)

                # Track progress - loss
                self.train_loss_results_batch.append(loss_value.numpy())

                # Track progress - metrics (overall=True appends an
                # all-horizons entry at index 7)
                y_hat = tf.nn.softmax(self.model(inputs))
                roc_auc, pr_auc, _, _ = uni_evals(y.numpy(), y_hat.numpy(), classes, overall=True)
                self._roc_batch.append(roc_auc)
                self._pr_batch.append(pr_auc)

                # write into sacred observer — step spaced by no_dev_batches
                # so dev evaluations can interleave their own offsets
                step = (epoch * self.no_batches + batch) * self.no_dev_batches
                if not self.notebook_friendly:
                    self._run.log_scalar("loss", loss_value.numpy(), step=step)
                    for i in range(8):
                        self._run.log_scalar("roc_{}".format(i), roc_auc[i], step=step)
                        self._run.log_scalar("pr_{}".format(i), pr_auc[i], step=step)

                if batch % self.eval_every == 0:
                    t_print("Epoch {:03d} -- Batch {:03d}: Loss: {:.3f}\tROC o/a:{:.3f}\tPR o/a:{:.3f}".format(
                        epoch, batch, loss_value.numpy(), roc_auc[7], pr_auc[7]))
                    if not self.train_only:
                        # iterate over all horizons
                        for dev_batch in range(7):
                            self.dev_eval(epoch, batch, dev_batch, step)

        # end of batch loop — aggregate per-batch trackers into epoch-level ones
        self.train_loss_results.append(np.mean(self.train_loss_results_batch))
        self._roc.append(np.mean(np.asarray(self._roc_batch), axis=0))
        self._pr.append(np.mean(np.asarray(self._pr_batch), axis=0))
        t_print("End of epoch {:03d}: Loss: {:.3f}\tROC o/a:{:.3f}\tPR o/a:{:.3f}".format(
            epoch, self.train_loss_results[-1], self._roc[-1][7], self._pr[-1][7]))

        if not self.train_only:
            # save all outputs — reset accumulators, then run the full dev
            # evaluation (batch=None mode of dev_eval)
            self.all_dev_y = []
            self.all_dev_y_hat = []
            for dev_batch in range(self.no_dev_batches):
                step = (self.num_epochs * self.no_batches) * self.no_dev_batches
                self.dev_eval(self.num_epochs, None, dev_batch, step)
            if not self.notebook_friendly:
                # checkpoint predictions and weights as a sacred artifact
                # NOTE(review): bare `head` (not self.head) — presumably a
                # module-level global; confirm it is defined.
                _to_save = {"epoch": epoch,
                            "y_hat": self.all_dev_y_hat,
                            "weights": self.model.get_weights()}
                with open(head + "/save_temp.pkl", "wb") as f:
                    pickle.dump(_to_save, f)
                self._run.add_artifact(head + "/save_temp.pkl", "epoch_{}_dict.pkl".format(epoch))
def long_load(self, to_save, features):
    """Load the train/val/test pickles and fully prepare them.

    Pipeline: load -> truncate overly long time series -> pad -> augment to
    all prediction horizons -> drop overly short time series -> re-index.

    Args:
        to_save: if True, pickle the prepared splits back to disk.
        features: optional feature-set tag embedded in the pickle file names;
            None selects the default "GP_prep_v2.pkl" files.
    """
    t_print("DataGenerator -- loading data")

    # the file name pattern is identical for all three splits, so build it
    # once instead of duplicating the three with-blocks per branch
    if features is None:
        fname = "GP_prep_v2.pkl"
    else:
        fname = "GP_prep_{}_v2.pkl".format(features)

    def _load(split):
        # one pickle per split under <head>/data/<split>/
        with open(self.head + "/data/{}/{}".format(split, fname), "rb") as f:
            return pickle.load(f)

    self.train_data = _load("train")
    self.val_data = _load("val")
    self.test_data = _load("test")

    # shorten TS too long
    self.train_data, no = reduce_data(self.train_data, n_max=self.max_no_dtpts)
    self.val_data, no = reduce_data(self.val_data, n_max=self.max_no_dtpts)
    self.test_data, no = reduce_data(self.test_data, n_max=self.max_no_dtpts)

    # pad data to have same shape
    self.train_data = pad_raw_data(self.train_data)
    self.val_data = pad_raw_data(self.val_data)
    self.test_data = pad_raw_data(self.test_data)

    # augment data to cater for all prediction horizons
    self.train_data = all_horizons(self.train_data)
    self.val_data = all_horizons(self.val_data)
    self.test_data = all_horizons(self.test_data)

    # remove TS too short, remembering how many patients each split lost
    temp = []
    self.train_data, no = reduce_data(self.train_data, n_min=self.min_no_dtpts)
    temp.append(no)
    self.val_data, no = reduce_data(self.val_data, n_min=self.min_no_dtpts)
    temp.append(no)
    self.test_data, no = reduce_data(self.test_data, n_min=self.min_no_dtpts)
    temp.append(no)
    t_print("""Removed patients out of the bound {4} < no_datapoints < {0}.
Train removed: {1} Train remaining: {5}
Val removed: {2} Val remaining: {6}
Test removed: {3} Test remaining: {7}""".format(
        self.max_no_dtpts, temp[0], temp[1], temp[2], self.min_no_dtpts,
        len(self.train_data[4]), len(self.val_data[4]), len(self.test_data[4])))
    del temp

    # extract new indices
    self.train_data = new_indices(self.train_data)
    self.val_data = new_indices(self.val_data)
    self.test_data = new_indices(self.test_data)
    # new data format
    # data = [Y, T, ind_K_D, ind_T, len_T, X, len_X, labels, static, classes, ids, ind_Y]
    # data = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

    if to_save:
        All = {
            "train": self.train_data,
            "val": self.val_data,
            "test": self.test_data
        }
        # date tag distinguishes default vs feature-specific preparations
        if features is None:
            date = "19-08-12"
        else:
            date = '19-08-30-{}'.format(features)
        for split in ["train", "val", "test"]:
            # NOTE(review): bare `head` (not self.head) as in the original —
            # presumably a module-level global; confirm it is defined,
            # otherwise this raises NameError.
            path = head + "/data/{}/{}-prep-data-min{}-max{}.pkl".format(
                split, date, self.min_no_dtpts, self.max_no_dtpts)
            with open(path, "wb") as f:
                pickle.dump(All[split], f)
def __init__(self,
             no_mc_samples=10,
             max_no_dtpts=None,
             min_no_dtpts=None,
             batch_size=10,
             fast_load=False,
             to_save=False,
             debug=False,
             fixed_idx_per_class=False,
             features=None):
    """Set up the data generator: load splits, resample, and build dev indices.

    Args:
        no_mc_samples: number of Monte Carlo samples (stored, not used here).
        max_no_dtpts: upper bound on time-series length passed to loaders.
        min_no_dtpts: lower bound on time-series length passed to loaders.
        batch_size: used only to size the fixed dev set (2 * batch_size per class).
        fast_load: if True, call self.fast_load(features) instead of long_load.
        to_save: forwarded to long_load; re-pickle the prepared splits.
        debug: if False, IDs and the debugging category are stripped from data.
        fixed_idx_per_class: if True, use the hard-coded dev indices below
            instead of a fresh random draw per class.
        features: optional feature-set tag forwarded to the loaders.
    """
    t_print("DataGenerator -- init")
    # repository root: two directories up from this file
    cwd = os.path.dirname(os.path.abspath(__file__))
    self.head = os.path.abspath(os.path.join(cwd, os.pardir, os.pardir))
    self.no_mc_samples = no_mc_samples
    self.max_no_dtpts = max_no_dtpts
    self.min_no_dtpts = min_no_dtpts
    self.debug = debug
    """ Data loader for MIMIC III data preprocessed according to """
    # NOTE(review): the `fast_load` flag shadows the method of the same name
    # that it dispatches to — confirm self.fast_load(features) resolves to
    # the bound method, not the argument.
    if fast_load:
        self.fast_load(features)
    else:
        self.long_load(to_save, features)
    # data = [Y, T, ind_K_D, ind_T, len_T, X, len_X, labels, static, classes, ids, ind_Y]
    # data = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
    if debug == False:  # NOTE(review): idiomatic form would be `if not debug:`
        # remove IDs & debugging cat
        self.train_data = self.train_data[:-2]
        self.val_data = self.val_data[:-2]
        self.test_data = self.test_data[:-2]
        # data = [Y, T, ind_K_D, ind_T, len_T, X, len_X, labels, static, classes]
        # data = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

    # separating two prediction classes
    self.train_case_data, self.train_control_data = separating_and_resampling(
        self.train_data)
    self.len_data = len(self.train_case_data)
    # index vectors used for per-epoch shuffling / batching
    self.train_case_idx = np.arange(len(self.train_case_data[-1]))
    self.train_control_idx = np.arange(len(self.train_control_data[-1]))
    self.val_idx = np.arange(len(self.val_data[-1]))

    # creating a small dev set — either a fixed, reproducible index table
    # (7 classes x 20 patients) or a fresh random draw per class
    if fixed_idx_per_class:
        self.idx_per_class = np.asarray(
            [[
                343, 3476, 4378, 1297, 2695, 1498, 1119, 2788, 5468, 5217,
                3505, 5441, 3895, 4177, 5678, 1108, 5739, 1510, 7, 5055
            ],
             [
                 5311, 2932, 2091, 6683, 568, 6851, 6273, 2796, 4336, 5342,
                 3150, 1835, 7040, 7106, 3495, 2538, 6053, 2949, 64, 2382
             ],
             [
                 1976, 2652, 4208, 1472, 3718, 4287, 3972, 2683, 1112, 2083,
                 3960, 5617, 403, 6244, 4370, 886, 3416, 5687, 5226, 6358
             ],
             [
                 2597, 1086, 6930, 286, 2492, 3794, 21, 1794, 4680, 4477,
                 6460, 6293, 4636, 4788, 5134, 6544, 7139, 2516, 2617, 351
             ],
             [
                 2812, 1503, 1677, 6553, 6333, 7023, 4310, 5546, 7054, 4522,
                 4473, 1218, 422, 242, 6286, 944, 109, 4896, 3611, 4737
             ],
             [
                 4837, 3445, 4256, 465, 2720, 7117, 2665, 4109, 590, 5680,
                 2672, 6070, 5697, 3772, 4219, 1298, 6515, 2965, 1788, 3352
             ],
             [
                 5496, 1159, 3029, 4189, 848, 4778, 2966, 4159, 2101, 6102,
                 4191, 7135, 349, 7003, 483, 4068, 4420, 2885, 2103, 2460
             ]])
    else:
        self.idx_per_class = np.zeros((7, batch_size * 2), dtype=np.int32)
        for k in range(7):
            # draw without replacement; a class may have fewer than
            # 2 * batch_size validation patients, hence the min()
            self.idx_per_class[k] = np.random.choice(
                np.where(self.val_data[9] == k)[0],
                min(batch_size * 2, len(np.where(self.val_data[9] == k)[0])),
                replace=False,
                p=None)

    # list of patients present at horizon 6
    # train
    self.late_case_patients = list(
        self.train_case_data[10][self.train_case_data[9] == 6])
    self.late_control_patients = list(
        self.train_control_data[10][self.train_control_data[9] == 6])
    # NOTE(review): later_case_patients is identical to late_case_patients —
    # possibly a leftover; confirm whether a different horizon was intended.
    self.later_case_patients = list(
        self.train_case_data[10][self.train_case_data[9] == 6])
    # val
    self.late_val_patients = list(self.val_data[10][self.val_data[9] == 6])
    # boolean mask over all val rows: does this row's patient appear at horizon 6?
    late_val_pat_id = [
        self.val_data[10][i] in self.late_val_patients
        for i in range(len(self.val_data[9]))
    ]
    self.late_val_pat_id = np.where(late_val_pat_id)[0]
    # rows at horizon 0 belonging to patients who also reach horizon 6
    self.horizon0_val_patients = np.where(late_val_pat_id
                                          & (self.val_data[9] == 0))[0]