Exemplo n.º 1
0
 def calc_metrics(self, data, history, dataset, logs):
     """Run batched prediction over ``data`` and record binary metrics.

     ``data[0]`` holds the inputs. The labels are ``data[1]`` or, when
     ``self.target_repl`` is set, ``data[1][0]`` (``data[1][1]`` is sliced
     alongside but not used here). With target replication only the first
     model output is scored.

     Every metric returned by ``metrics.print_metrics_binary`` is written
     to ``logs`` under ``dataset + '_' + name`` and the metrics dict itself
     is appended to ``history``.
     """
     step = self.batch_size
     labels = []
     scores = []
     for start in range(0, len(data[0]), step):
         if self.verbose == 1:
             print("\tdone {}/{}".format(start, len(data[0])), end='\r')

         window = slice(start, start + step)
         inputs = data[0][window]
         if self.target_repl:
             targets = data[1][0][window]
             # Sliced to mirror the batch layout; the value is not scored.
             _replicated_targets = data[1][1][window]
         else:
             targets = data[1][window]

         outputs = self.model.predict(inputs, batch_size=step)
         # With target replication the model returns several outputs;
         # only the first one is evaluated.
         scored = outputs[0] if self.target_repl else outputs
         scores.extend(np.array(scored).flatten())
         labels.extend(np.array(targets).flatten())
     print('\n')

     # Expand the positive-class scores into two columns: [1 - p, p].
     positive = np.array(scores)
     two_column = np.stack([1 - positive, positive], axis=1)
     ret = metrics.print_metrics_binary(labels, two_column)
     for name, value in ret.items():
         logs[dataset + '_' + name] = value
     history.append(ret)
Exemplo n.º 2
0
 def calc_metrics(self, data_gen, history, dataset, logs):
     """Predict on ``data_gen.steps`` batches and record binary metrics.

     Each batch is pulled with ``next(data_gen)`` as ``(x, y)``. When
     ``self.deep_supervision`` is set, ``x[1]`` acts as an element-wise
     mask and only positions where it equals 1 are scored; otherwise all
     flattened targets and predictions are scored.

     Every metric returned by ``metrics.print_metrics_binary`` is written
     to ``logs`` under ``dataset + '_' + name`` and the metrics dict itself
     is appended to ``history``.
     """
     labels = []
     scores = []
     for step in range(data_gen.steps):
         if self.verbose == 1:
             print("\tdone {}/{}".format(step, data_gen.steps), end='\r')

         x, y = next(data_gen)
         pred = self.model.predict(x, batch_size=self.batch_size)

         if not self.deep_supervision:
             labels += list(y.flatten())
             scores += list(pred.flatten())
             continue

         # Deep supervision: keep only the positions flagged by the mask.
         masked = zip(x[1].flatten(), y.flatten(), pred.flatten())
         for flag, target, score in masked:
             if np.equal(flag, 1):
                 labels.append(target)
                 scores.append(score)
     print('\n')

     # Expand the positive-class scores into two columns: [1 - p, p].
     positive = np.array(scores)
     two_column = np.stack([1 - positive, positive], axis=1)
     ret = metrics.print_metrics_binary(labels, two_column)
     for name, value in ret.items():
         logs[dataset + '_' + name] = value
     history.append(ret)
Exemplo n.º 3
0
def main():
    """Train an XGBoost regressor on in-hospital-mortality features.

    Parses the command line, reads the train/validation/test splits,
    mean-imputes and standardises the features (both fitted on the
    training split only), fits the model, writes one JSON metrics file per
    split under ``<output_dir>/results`` and saves the test predictions
    under ``<output_dir>/predictions``.
    """
    default_data_dir = os.path.join(os.path.dirname(__file__),
                                    '../../../data/in-hospital-mortality/')
    period_choices = [
        'first4days', 'first8days', 'last12hours',
        'first25percent', 'first50percent', 'all'
    ]

    parser = argparse.ArgumentParser()
    parser.add_argument('--period',
                        type=str,
                        default='all',
                        help='specifies which period extract features from',
                        choices=period_choices)
    parser.add_argument('--features',
                        type=str,
                        default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data',
                        type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=default_data_dir)
    parser.add_argument(
        '--output_dir',
        type=str,
        help='Directory relative which all output files are stored',
        default='.')
    args = parser.parse_args()
    print(args)

    def build_reader(subdir, listfile):
        # All three splits share the same 48-hour observation window.
        return InHospitalMortalityReader(
            dataset_dir=os.path.join(args.data, subdir),
            listfile=os.path.join(args.data, listfile),
            period_length=48.0)

    # The validation list file points into the 'train' directory.
    train_reader = build_reader('train', 'train_listfile.csv')
    val_reader = build_reader('train', 'val_listfile.csv')
    test_reader = build_reader('test', 'test_listfile.csv')

    print('Reading data and extracting features ...')
    train_X, train_y, train_names = read_and_extract_features(
        train_reader, args.period, args.features)
    val_X, val_y, val_names = read_and_extract_features(
        val_reader, args.period, args.features)
    test_X, test_y, test_names = read_and_extract_features(
        test_reader, args.period, args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan,
                      strategy='mean',
                      axis=0,
                      verbose=0,
                      copy=True)
    imputer.fit(train_X)
    train_X, val_X, test_X = (
        np.array(imputer.transform(split), dtype=np.float32)
        for split in (train_X, val_X, test_X))

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X, val_X, test_X = (
        scaler.transform(split) for split in (train_X, val_X, test_X))

    file_name = 'xgboost_{}.{}.'.format(args.period, args.features)

    hyper_params = dict(colsample_bytree=0.4,
                        gamma=0,
                        learning_rate=0.07,
                        max_depth=3,
                        min_child_weight=1.5,
                        n_estimators=10000,
                        reg_alpha=0.75,
                        reg_lambda=0.45,
                        subsample=0.6,
                        seed=42)
    xgreg = xgb.XGBRegressor(**hyper_params)
    xgreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    # Train and validation metrics: predict inside the open file context,
    # then dump the metrics as plain floats.
    fitted_splits = (
        ('train_{}.json', train_X, train_y),
        ('val_{}.json', val_X, val_y),
    )
    for template, features, targets in fitted_splits:
        metrics_path = os.path.join(result_dir, template.format(file_name))
        with open(metrics_path, 'w') as res_file:
            ret = print_metrics_binary(targets, xgreg.predict(features))
            json.dump({k: float(v) for k, v in ret.items()}, res_file)

    # Test predictions are kept because they are also written to CSV below.
    prediction = xgreg.predict(test_X)

    test_metrics_path = os.path.join(result_dir,
                                     'test_{}.json'.format(file_name))
    with open(test_metrics_path, 'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        json.dump({k: float(v) for k, v in ret.items()}, res_file)

    predictions_path = os.path.join(args.output_dir, 'predictions',
                                    file_name + '.csv')
    save_results(test_names, prediction, test_y, predictions_path)
Exemplo n.º 4
0
    def calc_metrics(self, data_gen, history, dataset, logs):
        """Evaluate the multitask model on ``data_gen`` and log per-task metrics.

        For each of ``data_gen.steps`` batches the model is run and the
        targets/predictions of the four tasks are accumulated:

        * in-hospital mortality (ihm) and decompensation: only positions
          whose mask (``X[1]`` / ``X[2]``) equals 1 are kept;
        * length of stay (los): masked by ``X[3]``; the true value comes from
          the ``los_y_reg`` returned by the generator rather than the label.
          A last prediction dimension of 1 is treated as regression,
          anything else is reshaped to rows of 10 values;
        * phenotyping: targets and predictions are reshaped to rows of 25.

        Metrics are written into ``logs`` under
        ``dataset + '_<task>_' + name`` and ``logs`` is then appended to
        ``history``.

        Raises:
            ValueError: if ``self.partition`` is not 'log', 'custom' or
                'none'. Previously such a value fell through every branch
                and the decompensation metrics were logged under the
                ``_los_`` keys.
        """
        # Validate before the (potentially long) prediction loop.
        if self.partition not in ('log', 'custom', 'none'):
            raise ValueError("Wrong value for partition: {}".format(
                self.partition))

        def collect_masked(mask, targets, preds, true_out, pred_out):
            # Keep the (target, prediction) pairs whose mask entry equals 1.
            for (m, t, p) in zip(mask.flatten(), targets, preds):
                if np.equal(m, 1):
                    true_out.append(t)
                    pred_out.append(p)

        ihm_y_true = []
        decomp_y_true = []
        los_y_true = []
        pheno_y_true = []

        ihm_pred = []
        decomp_pred = []
        los_pred = []
        pheno_pred = []

        for i in range(data_gen.steps):
            if self.verbose == 1:
                print("\tdone {}/{}".format(i, data_gen.steps), end='\r')
            (X, y, los_y_reg) = data_gen.next(return_y_true=True)
            outputs = self.model.predict(X, batch_size=self.batch_size)

            ihm_M = X[1]
            decomp_M = X[2]
            los_M = X[3]

            if not data_gen.target_repl:  # no target replication
                (ihm_p, decomp_p, los_p, pheno_p) = outputs
                (ihm_t, decomp_t, los_t, pheno_t) = y
            else:  # target replication: skip the replicated outputs
                (ihm_p, _, decomp_p, los_p, pheno_p, _) = outputs
                (ihm_t, _, decomp_t, los_t, pheno_t, _) = y

            los_t = los_y_reg  # real value not the label

            # ihm
            collect_masked(ihm_M, ihm_t.flatten(), ihm_p.flatten(),
                           ihm_y_true, ihm_pred)

            # decomp
            collect_masked(decomp_M, decomp_t.flatten(), decomp_p.flatten(),
                           decomp_y_true, decomp_pred)

            # los
            if los_p.shape[-1] == 1:  # regression
                los_rows = los_p.flatten()
            else:  # classification
                los_rows = los_p.reshape((-1, 10))
            collect_masked(los_M, los_t.flatten(), los_rows, los_y_true,
                           los_pred)

            # pheno
            for (t, p) in zip(pheno_t.reshape((-1, 25)),
                              pheno_p.reshape((-1, 25))):
                pheno_y_true.append(t)
                pheno_pred.append(p)
        print('\n')

        # ihm
        print("\n ================= 48h mortality ================")
        ihm_pred = np.array(ihm_pred)
        ihm_pred = np.stack([1 - ihm_pred, ihm_pred], axis=1)
        ret = metrics.print_metrics_binary(ihm_y_true, ihm_pred)
        for k, v in ret.items():
            logs[dataset + '_ihm_' + k] = v

        # decomp
        print("\n ================ decompensation ================")
        decomp_pred = np.array(decomp_pred)
        decomp_pred = np.stack([1 - decomp_pred, decomp_pred], axis=1)
        ret = metrics.print_metrics_binary(decomp_y_true, decomp_pred)
        for k, v in ret.items():
            logs[dataset + '_decomp_' + k] = v

        # los
        print("\n ================ length of stay ================")
        if self.partition == 'log':
            los_pred = [metrics.get_estimate_log(x, 10) for x in los_pred]
            ret = metrics.print_metrics_log_bins(los_y_true, los_pred)
        elif self.partition == 'custom':
            los_pred = [metrics.get_estimate_custom(x, 10) for x in los_pred]
            ret = metrics.print_metrics_custom_bins(los_y_true, los_pred)
        else:  # 'none' -- guaranteed by the validation at the top
            ret = metrics.print_metrics_regression(los_y_true, los_pred)
        for k, v in ret.items():
            logs[dataset + '_los_' + k] = v

        # pheno
        print("\n =================== phenotype ==================")
        pheno_pred = np.array(pheno_pred)
        ret = metrics.print_metrics_multilabel(pheno_y_true, pheno_pred)
        for k, v in ret.items():
            logs[dataset + '_pheno_' + k] = v

        # The whole logs mapping (all tasks) is recorded, not just `ret`.
        history.append(logs)
Exemplo n.º 5
0
# Script fragment: fit `xgreg` with early stopping, then write train/val
# metrics as JSON and time the prediction on the test split. `xgreg`, `args`,
# `file_name` and the `*_raw` / `*_raw_reshape` arrays are defined outside
# this excerpt.

# (features, labels) pairs handed to `fit` for evaluation during training.
eval_set = [(train_raw_reshape, train_raw[1]), (val_raw_reshape, val_raw[1])]

# NOTE(review): `eval_metric` is given as the single string 'auc,auroc'.
# XGBoost documents multiple metrics as a list of strings, and 'auroc' is not
# among the metric names listed there -- confirm this is accepted by the
# XGBoost version in use.
xgreg.fit(train_raw_reshape,
          train_raw[1],
          eval_metric='auc,auroc',
          eval_set=eval_set,
          verbose=True,
          early_stopping_rounds=80)

# Metrics are written under <output_dir>/results.
result_dir = os.path.join(args.output_dir, 'results')
common_utils.create_directory(result_dir)

# Training-split metrics, converted to plain floats before being dumped.
with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)),
          'w') as res_file:
    ret = print_metrics_binary(train_raw[1], xgreg.predict(train_raw_reshape))
    ret = {k: float(v) for k, v in ret.items()}
    json.dump(ret, res_file)

# Validation-split metrics, same format as above.
with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)),
          'w') as res_file:
    ret = print_metrics_binary(val_raw[1], xgreg.predict(val_raw_reshape))
    ret = {k: float(v) for k, v in ret.items()}
    json.dump(ret, res_file)

# Wall-clock time of the test-set prediction call only.
time_start = time.time()
prediction = xgreg.predict(test_raw_reshape)
time_elapse = time.time() - time_start
print("Processing time on Test set :", time_elapse, " s")

with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)),
Exemplo n.º 6
0
                    los_pred.append(p)

        # pheno
        pheno_names += list(names)
        pheno_ts += list(ret["pheno_ts"])
        for (t, p) in zip(pheno_t.reshape((-1, 25)), pheno_p.reshape(
            (-1, 25))):
            pheno_y_true.append(t)
            pheno_pred.append(p)
    print('\n')

    # ihm
    if args.ihm_C > 0:
        print("\n ================= 48h mortality ================")
        ihm_pred = np.array(ihm_pred)
        ihm_ret = metrics.print_metrics_binary(ihm_y_true, ihm_pred)

    # decomp
    if args.decomp_C > 0:
        print("\n ================ decompensation ================")
        decomp_pred = np.array(decomp_pred)
        decomp_ret = metrics.print_metrics_binary(decomp_y_true, decomp_pred)

    # los
    if args.los_C > 0:
        print("\n ================ length of stay ================")
        if args.partition == 'log':
            los_pred = [metrics.get_estimate_log(x, 10) for x in los_pred]
            los_ret = metrics.print_metrics_log_bins(los_y_true, los_pred)
        if args.partition == 'custom':
            los_pred = [metrics.get_estimate_custom(x, 10) for x in los_pred]
Exemplo n.º 7
0
def main():
    """Train a logistic-regression baseline for in-hospital mortality.

    Parses the command line, reads the train/validation/test splits,
    mean-imputes and standardises the features (both fitted on the
    training split only), fits the classifier, writes one JSON metrics
    file per split under ``<output_dir>/results``, reports the test-set
    prediction time and saves the test predictions under
    ``<output_dir>/predictions``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--C',
                        type=float,
                        default=1.0,
                        help='inverse of L1 / L2 regularization')
    # `--l1` and `--l2` toggle the same destination; L2 is the default.
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period',
                        type=str,
                        default='all',
                        help='specifies which period extract features from',
                        choices=[
                            'first4days', 'first8days', 'last12hours',
                            'first25percent', 'first50percent', 'all'
                        ])
    parser.add_argument('--features',
                        type=str,
                        default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data',
                        type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(
                            os.path.dirname(__file__),
                            '../../../data/in-hospital-mortality/'))
    parser.add_argument(
        '--output_dir',
        type=str,
        help='Directory relative which all output files are stored',
        default='.')
    args = parser.parse_args()
    print(args)

    data_root = args.data
    # The validation list file points into the 'train' directory.
    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(data_root, 'train'),
        listfile=os.path.join(data_root, 'train_listfile.csv'),
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(data_root, 'train'),
        listfile=os.path.join(data_root, 'val_listfile.csv'),
        period_length=48.0)
    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(data_root, 'test'),
        listfile=os.path.join(data_root, 'test_listfile.csv'),
        period_length=48.0)

    print('Reading data and extracting features ...')
    train_X, train_y, train_names = read_and_extract_features(
        train_reader, args.period, args.features)
    val_X, val_y, val_names = read_and_extract_features(
        val_reader, args.period, args.features)
    test_X, test_y, test_names = read_and_extract_features(
        test_reader, args.period, args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan,
                      strategy='mean',
                      axis=0,
                      verbose=0,
                      copy=True)
    imputer.fit(train_X)

    def impute(matrix):
        return np.array(imputer.transform(matrix), dtype=np.float32)

    train_X = impute(train_X)
    val_X = impute(val_X)
    test_X = impute(test_X)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    if args.l2:
        penalty = 'l2'
    else:
        penalty = 'l1'
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty,
                                      args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    def dump_metrics(res_file, targets, scores):
        # Print the metrics and store them as plain floats in `res_file`.
        ret = print_metrics_binary(targets, scores)
        json.dump({k: float(v) for k, v in ret.items()}, res_file)

    train_path = os.path.join(result_dir, 'train_{}.json'.format(file_name))
    with open(train_path, 'w') as res_file:
        dump_metrics(res_file, train_y, logreg.predict_proba(train_X))

    val_path = os.path.join(result_dir, 'val_{}.json'.format(file_name))
    with open(val_path, 'w') as res_file:
        dump_metrics(res_file, val_y, logreg.predict_proba(val_X))

    # Only the test prediction is timed; column 1 of predict_proba is kept.
    time_start = time.time()
    prediction = logreg.predict_proba(test_X)[:, 1]
    time_elapse = time.time() - time_start
    print("Processing time on Test set :", time_elapse, " s")

    test_path = os.path.join(result_dir, 'test_{}.json'.format(file_name))
    with open(test_path, 'w') as res_file:
        dump_metrics(res_file, test_y, prediction)

    predictions_path = os.path.join(args.output_dir, 'predictions',
                                    file_name + '.csv')
    save_results(test_names, prediction, test_y, predictions_path)
Exemplo n.º 8
0
            model_batch_loss.append(model_loss.cpu().detach().numpy())
            decov_batch_loss.append(decov_loss.cpu().detach().numpy())
            y_pred += list(output.cpu().detach().numpy().flatten())
            y_true += list(batch_y.cpu().numpy().flatten())

    valid_loss.append(np.mean(np.array(batch_loss)))
    valid_model_loss.append(np.mean(np.array(model_batch_loss)))
    valid_decov_loss.append(np.mean(np.array(decov_batch_loss)))

    print("\n==>Predicting on validation")
    print('Valid Loss = %.4f' % (valid_loss[-1]))
    print('valid_model Loss = %.4f' % (valid_model_loss[-1]))
    print('valid_decov Loss = %.4f' % (valid_decov_loss[-1]))
    y_pred = np.array(y_pred)
    y_pred = np.stack([1 - y_pred, y_pred], axis=1)
    ret = metrics.print_metrics_binary(y_true, y_pred)
    history.append(ret)
    print()

    cur_auroc = ret['auroc']

    if cur_auroc > max_roc:
        max_roc = cur_auroc
        state = {
            'net': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epoch': each_epoch
        }
        torch.save(state, file_name)
        print('\n------------ Save best model ------------\n')
Exemplo n.º 9
0
                test_loss = torch.neg(torch.sum(test_loss))
                cur_test_loss.append(test_loss.cpu().detach().numpy())

                for m, t, p in zip(
                        test_mask.cpu().numpy().flatten(),
                        test_y.cpu().numpy().flatten(),
                        test_output.cpu().detach().numpy().flatten()):
                    if np.equal(m, 1):
                        test_true.append(t)
                        test_pred.append(p)

            print('Test loss = %.4f' % (np.mean(np.array(cur_test_loss))))
            print('\n')
            test_pred = np.array(test_pred)
            test_pred = np.stack([1 - test_pred, test_pred], axis=1)
            test_ret = metrics.print_metrics_binary(test_true, test_pred)

    else:
        ''' Prepare training data'''
        print('Preparing training data ... ')
        train_data_loader = common_utils.DeepSupervisionDataLoader(
            dataset_dir=os.path.join(args.data_path, 'train'),
            listfile=os.path.join(args.data_path, 'train_listfile.csv'),
            small_part=args.small_part)
        val_data_loader = common_utils.DeepSupervisionDataLoader(
            dataset_dir=os.path.join(args.data_path, 'train'),
            listfile=os.path.join(args.data_path, 'val_listfile.csv'),
            small_part=args.small_part)
        discretizer = Discretizer(timestep=1.0,
                                  store_masks=True,
                                  impute_strategy='previous',
Exemplo n.º 10
0
            test_reader,
            discretizer,
            normalizer,
            args.batch_size,
            None,
            shuffle=False,
            return_names=True)  # put steps = None for a full test

        for i in range(test_data_gen.steps):
            print("predicting {} / {}".format(i, test_data_gen.steps),
                  end='\r')
            ret = next(test_data_gen)
            x, y = ret["data"]
            cur_names = ret["names"]
            cur_ts = ret["ts"]

            x = np.array(x)
            pred = model.predict_on_batch(x)[:, 0]
            predictions += list(pred)
            labels += list(y)
            names += list(cur_names)
            ts += list(cur_ts)

    metrics.print_metrics_binary(labels, predictions)
    path = os.path.join(args.output_dir, 'test_predictions',
                        os.path.basename(args.load_state)) + '.csv'
    preprocessing.save_results(names, ts, predictions, labels, path)

else:
    raise ValueError("Wrong value for args.mode")
Exemplo n.º 11
0
                cur_val_loss.append(loss.cpu().detach().numpy())

                for t, p in zip(
                        val_y.cpu().numpy().flatten(),
                        val_output.cpu().detach().numpy().flatten(),
                ):
                    val_true.append(t)
                    val_pred.append(p)
            cur_val_loss = np.mean(np.array(cur_val_loss))
            scheduler.step(cur_val_loss)
            print("Validation loss = {:.6f}".format(cur_val_loss))
            val_loss.append(cur_val_loss)
            print("\n")
            val_pred = np.array(val_pred)
            val_pred = np.stack([1 - val_pred, val_pred], axis=1)
            ret = metrics.print_metrics_binary(val_true, val_pred)
            cur_auroc = ret["auroc"]
            if cur_auroc > max_auroc:
                max_auroc = cur_auroc
                state = {
                    "net": model.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "epoch": epoch,
                    "params": model_para,
                    "train_loss": train_loss,
                    "val_loss": val_loss,
                }
                torch.save(state, file_name)
                print("\n------------ Save the best model ------------\n")
    end_time = time.time()
    print("total used time = {}".format(end_time - start_time))