Example #1
def evals(y_true, y_proba, classes, cv=False, singles=True, overall=False):
    """Compute ROC and PR AUCs for each of the 7 prediction horizons and,
    optionally, over all horizons pooled together."""
    roc_auc = []
    pr_auc = []
    roc_comps = []
    pr_comps = []
    if singles:
        # calculate ROC and PR for each horizon
        for i in range(7):
            idx = classes == i
            if np.sum(idx) != 0:
                y_star = y_true[idx]
                y = y_proba[idx, 0]
                roc_horizon, pr_horizon, roc_comp, pr_comp = one_eval(y_star, y, cv=cv)
                if cv:
                    roc_comps.append(np.asarray(roc_comp))
                    pr_comps.append(np.asarray(pr_comp))
                roc_auc.append(roc_horizon)
                pr_auc.append(pr_horizon)
            else:
                t_print("warning: no class {}".format(i))
                roc_auc.append(0)
                pr_auc.append(0)
    if overall:
        # calculate ROC and PR over all horizons
        roc_horizon, pr_horizon, roc_comp, pr_comp = one_eval(y_star=y_true, y=y_proba[:, 0], cv=cv)
        if cv:
            roc_comps.append(np.asarray(roc_comp))
            pr_comps.append(np.asarray(pr_comp))
        roc_auc.append(roc_horizon)
        pr_auc.append(pr_horizon)
    return roc_auc, pr_auc, roc_comps, pr_comps
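A minimal usage sketch for evals; one_eval and t_print below are illustrative stand-ins (the real helpers are external to this snippet), assuming one_eval returns ROC AUC, PR AUC and their curve components:

import numpy as np
from sklearn.metrics import roc_auc_score, average_precision_score

def one_eval(y_star, y, cv=False):
    # hypothetical stand-in for the external helper used by evals
    return roc_auc_score(y_star, y), average_precision_score(y_star, y), [], []

t_print = print  # stand-in for the timestamped print helper

y_true = np.random.randint(0, 2, size=700)   # binary labels
y_proba = np.random.rand(700, 2)             # column 0 is the score evals reads
classes = np.random.randint(0, 7, size=700)  # prediction horizon per sample
roc_auc, pr_auc, _, _ = evals(y_true, y_proba, classes, overall=True)
print(roc_auc)  # 7 per-horizon AUCs followed by the overall AUC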
Example #2
    def dev_eval(self, epoch, batch, dev_batch, step):
        if batch is not None:
            batch_data = next(self.data.next_batch_dev_small(dev_batch))
        else:
            batch_data = next(self.data.next_batch_dev_all(self.batch_size, dev_batch))
            self.all_dev_y.append(batch_data[8].numpy())
        # batch_data[7] is static; it is dropped when training on labs and vitals only
        if self.lab_vitals_only:
            inputs = batch_data[:7]
        else:
            inputs = batch_data[:8]
        y = batch_data[8]
        classes = batch_data[9]
        if len(y) > 0:
            # Track progress - dev loss
            loss_dev = GP_loss(self.model, inputs, y)
            if batch is None:
                self.dev_loss_results.append([dev_batch, loss_dev.numpy()])
            else:
                self.dev_loss_results_batch.append([dev_batch, loss_dev.numpy()])

            # Track progress - dev metrics
            dev_y_hat = tf.nn.softmax(self.model(inputs))
            roc_auc, pr_auc, _, _ = uni_evals(y.numpy(), dev_y_hat.numpy(), classes, overall=True)
            if batch is None:
                self.all_dev_y_hat.append(dev_y_hat.numpy())
                self._roc_dev.append([dev_batch] + roc_auc)
                self._pr_dev.append([dev_batch] + pr_auc)
            else:
                self._roc_dev_batch.append([dev_batch] + roc_auc)
                self._pr_dev_batch.append([dev_batch] + pr_auc)

                # Iteration storage
                self.dev_step_batch.append([dev_batch, epoch, batch])

                # write into sacred observer
                if not self.notebook_friendly:
                    self._run.log_scalar("loss_dev", loss_dev.numpy(), step=step + dev_batch)
                    for i in range(7):
                        self._run.log_scalar("roc_{}_dev".format(i), roc_auc[i], step=step + dev_batch)
                        self._run.log_scalar("pr_{}_dev".format(i), pr_auc[i], step=step + dev_batch)

            # print
            t_print("DEV Loss: {:.3f}\tROC o/a:{:.3f}\tPR  o/a:{:.3f}".format(loss_dev, roc_auc[7], pr_auc[7]))
Example #3
    def try_cholesky(self, Sigma, max_tries=4):
        """Cholesky factorisation with retries: on failure, add exponentially
        growing jitter to the diagonal and try again."""
        n = tf.shape(Sigma)[0]
        diag_factor = 1  # Sigma is assumed to carry add_diag once already
        for try_no in range(1, max_tries + 1):
            try:
                chol_sigma = tf.cholesky(Sigma)
                return chol_sigma, try_no
            except Exception:
                if try_no == max_tries:
                    raise
                # grow the jitter tenfold per retry, so the announced totals
                # (11x, 111x, 1111x add_diag) match what is actually added
                diag_factor += 10 ** try_no
                t_print("Chol ill defined. New diag {}".format(
                    self.add_diag * diag_factor))
                Sigma = Sigma + self.add_diag * 10 ** try_no * tf.eye(n)
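For reference, the same grow-the-jitter retry pattern as a standalone NumPy sketch; names and defaults here are illustrative, not from the source:

import numpy as np

def cholesky_with_jitter(sigma, add_diag=1e-6, max_tries=4):
    # add exponentially growing jitter to the diagonal until the
    # factorisation succeeds, mirroring try_cholesky above
    for try_no in range(1, max_tries + 1):
        try:
            return np.linalg.cholesky(sigma), try_no
        except np.linalg.LinAlgError:
            if try_no == max_tries:
                raise
            sigma = sigma + add_diag * 10 ** try_no * np.eye(sigma.shape[0])

sigma = np.ones((3, 3))                  # rank-1, so plain Cholesky fails
chol, tries = cholesky_with_jitter(sigma)
print(tries)                             # 2: one failure, one jittered success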
Example #4
def main():
    cwd = os.path.dirname(os.path.abspath(__file__))
    root = os.path.abspath(os.path.join(cwd, os.pardir, os.pardir))
    path = os.path.join(root, "data", "interim")
    files = [
        "static_variables.csv", "static_variables_cases.csv",
        "static_variables_controls.csv", "case_55h_hourly_vitals_ex1c.csv",
        "control_55h_hourly_vitals_ex1c.csv", "case_55h_hourly_labs_ex1c.csv",
        "control_55h_hourly_labs_ex1c.csv"
    ]
    cas_f = os.path.join(path, files[1])  # static variables, cases
    cos_f = os.path.join(path, files[2])  # static variables, controls
    cav_f = os.path.join(path, files[3])  # hourly vitals, cases
    cov_f = os.path.join(path, files[4])  # hourly vitals, controls
    cal_f = os.path.join(path, files[5])  # hourly labs, cases
    col_f = os.path.join(path, files[6])  # hourly labs, controls
    horizon = 0
    na_thres = 500
    min_length = None
    max_length = None
    t_print("Initialising")
    dp = DataPreprocessing(cas_f,
                           cos_f,
                           cav_f,
                           cov_f,
                           cal_f,
                           col_f,
                           horizon=horizon,
                           na_thres=na_thres,
                           min_length=min_length,
                           max_length=max_length)
    t_print("load_static")
    dp.load_static()
    t_print("load_labs")
    dp.load_labs()
    t_print("load_vitals")
    dp.load_vitals()
    t_print("dropping unnamed columns")
    dp.drop_all_unnamed()
    t_print("get onset 4 all")
    dp.get_onset_hour()
    t_print("merge l & v")
    dp.merge_labs_vitals()
    t_print("filter")
    dp.filter_time_window()
    t_print("merge ca & co")
    dp.merge_case_control()
    t_print("check ts lengths")
    dp.ts_length_checks()
    t_print("static_prep")
    dp.static_prep()
Example #5
    def run(self):
        for epoch in range(self.num_epochs):
            t_print("Start of epoch {}".format(epoch))
            # shuffle data
            np.random.shuffle(self.data.train_case_idx)
            np.random.shuffle(self.data.train_control_idx)
            self.data.apply_reshuffle()

            for batch in range(self.no_batches):
                if batch % 5 == 0:
                    t_print("Start of batch {}".format(batch))
                # Load data
                # batch_data holds the model inputs first; per the slicing
                # below, index 7 is static, index 8 is labels, index 9 is classes
                batch_data = next(self.data.next_batch(self.batch_size, batch, late=self.late_patients_only,
                                                       horizon0=self.horizon0))
                # batch_data[7] is static; it is dropped when training
                # on labs and vitals only
                if self.lab_vitals_only:
                    inputs = batch_data[:7]
                else:
                    inputs = batch_data[:8]
                y = batch_data[8]
                classes = batch_data[9]
                if len(y) > 0:

                    # Evaluate loss and gradient
                    loss_value, grads = grad(self.model, inputs, y, GP=True, weighted_loss=self.weighted_loss)
                    # Apply gradient
                    self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables), self.global_step)
                    self.global_step.assign_add(1)

                    # Track progress - loss
                    self.train_loss_results_batch.append(loss_value.numpy())

                    # Track progress - metrics
                    y_hat = tf.nn.softmax(self.model(inputs))
                    roc_auc, pr_auc, _, _ = uni_evals(y.numpy(), y_hat.numpy(), classes, overall=True)
                    self._roc_batch.append(roc_auc)
                    self._pr_batch.append(pr_auc)

                    # write into sacred observer
                    step = (epoch * self.no_batches + batch) * self.no_dev_batches
                    if not self.notebook_friendly:
                        self._run.log_scalar("loss", loss_value.numpy(), step=step)
                        for i in range(8):
                            self._run.log_scalar("roc_{}".format(i), roc_auc[i], step=step)
                            self._run.log_scalar("pr_{}".format(i), pr_auc[i], step=step)

                    if batch % self.eval_every == 0:
                        t_print("Epoch {:03d} -- Batch {:03d}: Loss: {:.3f}\tROC o/a:{:.3f}\tPR  o/a:{:.3f}".format(
                            epoch, batch, loss_value.numpy(), roc_auc[7], pr_auc[7]))
                        if not self.train_only:
                            # iterate over all horizons
                            for dev_batch in range(7):
                                self.dev_eval(epoch, batch, dev_batch, step)

            # end of batch loop; note that the *_batch accumulators are never
            # reset, so these are running means over all batches seen so far
            self.train_loss_results.append(np.mean(self.train_loss_results_batch))
            self._roc.append(np.mean(np.asarray(self._roc_batch), axis=0))
            self._pr.append(np.mean(np.asarray(self._pr_batch), axis=0))
            t_print("End of epoch {:03d}: Loss: {:.3f}\tROC o/a:{:.3f}\tPR  o/a:{:.3f}".format(
                epoch, self.train_loss_results[-1], self._roc[-1][7], self._pr[-1][7]))

            if not self.train_only:
                # save all outputs
                self.all_dev_y = []
                self.all_dev_y_hat = []
                for dev_batch in range(self.no_dev_batches):
                    step = (self.num_epochs * self.no_batches) * self.no_dev_batches
                    self.dev_eval(self.num_epochs, None, dev_batch, step)
                if not self.notebook_friendly:
                    _to_save = {"epoch": epoch,
                                "y_hat": self.all_dev_y_hat,
                                "weights": self.model.get_weights()}
                    with open(head + "/save_temp.pkl", "wb") as f:
                        pickle.dump(_to_save, f)
                    self._run.add_artifact(head + "/save_temp.pkl", "epoch_{}_dict.pkl".format(epoch))
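The step arithmetic in run spaces training steps no_dev_batches apart, so the per-horizon dev_eval calls (which log at step + dev_batch) never collide with neighbouring training steps in the sacred observer. A worked example with assumed sizes:

# assumed values, for illustration only
no_batches, no_dev_batches = 100, 7
epoch, batch = 2, 5
step = (epoch * no_batches + batch) * no_dev_batches  # 1435
# dev_eval logs at steps 1435 .. 1441, one per horizon; the next
# training batch starts at (2 * 100 + 6) * 7 = 1442, so nothing overlaps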
Example #6
    def long_load(self, to_save, features):
        t_print("DataGenerator -- loading data")
        if features is None:
            fname = "GP_prep_v2.pkl"
        else:
            fname = "GP_prep_{}_v2.pkl".format(features)
        loaded = {}
        for split in ["train", "val", "test"]:
            path = self.head + "/data/{}/{}".format(split, fname)
            with open(path, "rb") as f:
                loaded[split] = pickle.load(f)
        self.train_data = loaded["train"]
        self.val_data = loaded["val"]
        self.test_data = loaded["test"]

        # truncate time series that are too long
        self.train_data, no = reduce_data(self.train_data,
                                          n_max=self.max_no_dtpts)
        self.val_data, no = reduce_data(self.val_data, n_max=self.max_no_dtpts)
        self.test_data, no = reduce_data(self.test_data,
                                         n_max=self.max_no_dtpts)

        # pad data to have same shape
        self.train_data = pad_raw_data(self.train_data)
        self.val_data = pad_raw_data(self.val_data)
        self.test_data = pad_raw_data(self.test_data)

        # augment data to cater for all prediction horizons
        self.train_data = all_horizons(self.train_data)
        self.val_data = all_horizons(self.val_data)
        self.test_data = all_horizons(self.test_data)

        # remove time series that are too short
        temp = []
        self.train_data, no = reduce_data(self.train_data,
                                          n_min=self.min_no_dtpts)
        temp.append(no)
        self.val_data, no = reduce_data(self.val_data, n_min=self.min_no_dtpts)
        temp.append(no)
        self.test_data, no = reduce_data(self.test_data,
                                         n_min=self.min_no_dtpts)
        temp.append(no)
        t_print("""Removed patients out of the bound {4} < no_datapoints < {0}.
            Train removed: {1}      Train remaining: {5}
            Val removed:   {2}      Val remaining:   {6}
            Test removed:  {3}      Test remaining:  {7}""".format(
            self.max_no_dtpts, temp[0], temp[1], temp[2], self.min_no_dtpts,
            len(self.train_data[4]), len(self.val_data[4]),
            len(self.test_data[4])))
        del temp

        # extract new indices
        self.train_data = new_indices(self.train_data)
        self.val_data = new_indices(self.val_data)
        self.test_data = new_indices(self.test_data)

        # new data format
        # data = [Y, T, ind_K_D, ind_T, len_T, X, len_X, labels, static, classes, ids, ind_Y]
        # data = [0, 1,       2,     3,     4, 5,     6,      7,      8,       9,  10,    11]
        if to_save:
            All = {
                "train": self.train_data,
                "val": self.val_data,
                "test": self.test_data
            }
            if features is None:
                date = "19-08-12"
            else:
                date = '19-08-30-{}'.format(features)
            for split in ["train", "val", "test"]:
                path = self.head + "/data/{}/{}-prep-data-min{}-max{}.pkl".format(
                    split, date, self.min_no_dtpts, self.max_no_dtpts)
                with open(path, "wb") as f:
                    pickle.dump(All[split], f)
Example #7
    def __init__(self,
                 no_mc_samples=10,
                 max_no_dtpts=None,
                 min_no_dtpts=None,
                 batch_size=10,
                 fast_load=False,
                 to_save=False,
                 debug=False,
                 fixed_idx_per_class=False,
                 features=None):
        t_print("DataGenerator -- init")
        cwd = os.path.dirname(os.path.abspath(__file__))
        self.head = os.path.abspath(os.path.join(cwd, os.pardir, os.pardir))
        self.no_mc_samples = no_mc_samples
        self.max_no_dtpts = max_no_dtpts
        self.min_no_dtpts = min_no_dtpts
        self.debug = debug
        """
        Data loader for MIMIC III data preprocessed according to 
        """
        if fast_load:
            self.fast_load(features)
        else:
            self.long_load(to_save, features)

        # data = [Y, T, ind_K_D, ind_T, len_T, X, len_X, labels, static, classes, ids, ind_Y]
        # data = [0, 1,       2,     3,     4, 5,     6,      7,      8,       9,  10,    11]
        if not debug:
            # remove IDs & debugging cat
            self.train_data = self.train_data[:-2]
            self.val_data = self.val_data[:-2]
            self.test_data = self.test_data[:-2]
        # data = [Y, T, ind_K_D, ind_T, len_T, X, len_X, labels, static, classes]
        # data = [0, 1,       2,     3,     4, 5,     6,      7,      8,       9]
        # separating two prediction classes
        self.train_case_data, self.train_control_data = separating_and_resampling(
            self.train_data)
        self.len_data = len(self.train_case_data)
        self.train_case_idx = np.arange(len(self.train_case_data[-1]))
        self.train_control_idx = np.arange(len(self.train_control_data[-1]))
        self.val_idx = np.arange(len(self.val_data[-1]))
        # creating a small dev set
        if fixed_idx_per_class:
            self.idx_per_class = np.asarray(
                [[
                    343, 3476, 4378, 1297, 2695, 1498, 1119, 2788, 5468, 5217,
                    3505, 5441, 3895, 4177, 5678, 1108, 5739, 1510, 7, 5055
                ],
                 [
                     5311, 2932, 2091, 6683, 568, 6851, 6273, 2796, 4336, 5342,
                     3150, 1835, 7040, 7106, 3495, 2538, 6053, 2949, 64, 2382
                 ],
                 [
                     1976, 2652, 4208, 1472, 3718, 4287, 3972, 2683, 1112,
                     2083, 3960, 5617, 403, 6244, 4370, 886, 3416, 5687, 5226,
                     6358
                 ],
                 [
                     2597, 1086, 6930, 286, 2492, 3794, 21, 1794, 4680, 4477,
                     6460, 6293, 4636, 4788, 5134, 6544, 7139, 2516, 2617, 351
                 ],
                 [
                     2812, 1503, 1677, 6553, 6333, 7023, 4310, 5546, 7054,
                     4522, 4473, 1218, 422, 242, 6286, 944, 109, 4896, 3611,
                     4737
                 ],
                 [
                     4837, 3445, 4256, 465, 2720, 7117, 2665, 4109, 590, 5680,
                     2672, 6070, 5697, 3772, 4219, 1298, 6515, 2965, 1788, 3352
                 ],
                 [
                     5496, 1159, 3029, 4189, 848, 4778, 2966, 4159, 2101, 6102,
                     4191, 7135, 349, 7003, 483, 4068, 4420, 2885, 2103, 2460
                 ]])
        else:
            # sample up to 2 * batch_size validation indices per class
            self.idx_per_class = np.zeros((7, batch_size * 2), dtype=np.int32)
            for k in range(7):
                class_idx = np.where(self.val_data[9] == k)[0]
                chosen = np.random.choice(
                    class_idx,
                    min(batch_size * 2, len(class_idx)),
                    replace=False)
                # guard against classes with fewer than 2 * batch_size members
                self.idx_per_class[k, :len(chosen)] = chosen
        # list of patients present at horizon 6
        # train
        self.late_case_patients = list(
            self.train_case_data[10][self.train_case_data[9] == 6])
        self.late_control_patients = list(
            self.train_control_data[10][self.train_control_data[9] == 6])
        # NOTE: currently identical to late_case_patients above
        self.later_case_patients = list(
            self.train_case_data[10][self.train_case_data[9] == 6])
        # val
        self.late_val_patients = list(self.val_data[10][self.val_data[9] == 6])
        late_val_pat_id = np.asarray([
            self.val_data[10][i] in self.late_val_patients
            for i in range(len(self.val_data[9]))
        ])
        self.late_val_pat_id = np.where(late_val_pat_id)[0]
        self.horizon0_val_patients = np.where(
            late_val_pat_id & (self.val_data[9] == 0))[0]
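The last two statements combine a patient-membership mask with a horizon filter; a self-contained toy illustration of the same pattern, with made-up names and values:

import numpy as np

ids = np.array([11, 22, 33, 22, 11])     # patient id per row
horizons = np.array([0, 6, 0, 0, 6])     # prediction class per row
late_patients = set(ids[horizons == 6])  # patients still present at horizon 6
mask = np.asarray([pid in late_patients for pid in ids])
print(np.where(mask)[0])                    # all rows of late patients: [0 1 3 4]
print(np.where(mask & (horizons == 0))[0])  # their horizon-0 rows: [0 3]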