示例#1
0
def _tag_example(example, index):
    """Prefix an example's rows with `index` and suffix them with its label.

    `example` is the dict returned by `read_example`; only its "X" (2-D, one
    row per event) and "y" entries are used. The returned array has the
    layout [index | X columns | y], with `index` and `y` repeated on every row.
    """
    n_rows = example["X"].shape[0]
    index_col = np.full((n_rows, 1), index)
    label_col = np.full((n_rows, 1), example["y"])
    return np.concatenate((index_col, example["X"], label_col), axis=1)


def preprocess(
    train_dir="data/in-hospital-mortality/train",
    test_dir="data/in-hospital-mortality/test",
    split=False,
):
    """Load the in-hospital-mortality train and test sets into DataFrames.

    Every example is read through `InHospitalMortalityReader`, each of its
    rows is tagged with a running example index and the example's label, and
    all rows are stacked. Column names come from the module-level `HEADERS`,
    and the result is passed through `preprocess_coma_scales`.

    Args:
        train_dir: Directory holding the training examples and `listfile.csv`.
        test_dir: Directory holding the test examples and `listfile.csv`.
        split: When True, return separate train and test frames; otherwise
            return a single frame with the training rows followed by the
            test rows.

    Returns:
        `(train_df, test_df)` when `split` is True, else one combined
        DataFrame.
    """
    train_reader = InHospitalMortalityReader(
        dataset_dir=train_dir, listfile=f"{train_dir}/listfile.csv")
    test_reader = InHospitalMortalityReader(
        dataset_dir=test_dir, listfile=f"{test_dir}/listfile.csv")

    n_train = train_reader.get_number_of_examples()
    n_test = test_reader.get_number_of_examples()

    train_data = np.vstack([
        _tag_example(train_reader.read_example(i), i)
        for i in range(n_train)
    ])
    # Test indices continue after the last training index so that every
    # example keeps a unique index once both sets are combined.
    test_data = np.vstack([
        _tag_example(test_reader.read_example(j), n_train + j)
        for j in range(n_test)
    ])

    if split:
        train_df = pd.DataFrame(train_data, index=None, columns=HEADERS)
        test_df = pd.DataFrame(test_data, index=None, columns=HEADERS)
        # Preprocess coma scales
        train_df = preprocess_coma_scales(train_df)
        test_df = preprocess_coma_scales(test_df)
        return train_df, test_df

    # Single frame: training rows first, then test rows.
    all_data = np.vstack((train_data, test_data))
    df = pd.DataFrame(all_data, index=None, columns=HEADERS)
    # Preprocess coma scales
    return preprocess_coma_scales(df)
示例#2
0
def get_row_wise_raw_trigger_pattern(tgd, args, normalize=False):
    """Sample per-channel Gaussian trigger patterns and save them as .npy files.

    Up to 1000 training examples are read and transformed with
    `tgd.transform`. For each index along the last axis of the stacked data,
    a covariance/precision pair is estimated from that slice with
    `cov_prec_from_np_inv`. Five trigger matrices are then drawn, one
    zero-mean multivariate-normal row per slice, and written under
    `cache/in_hospital_mortality/torch_raw_48_17`. The fifth draw is also
    saved as `poison_pattern.npy`.

    Args:
        tgd: Object whose `transform(X, end=t)` returns a sequence with the
            transformed sample as its first element.
        args: Namespace with a `data` attribute pointing at the task data.
        normalize: When True, each sampled row is divided by its Mahalanobis
            distance from the origin under that slice's precision matrix.

    Returns:
        None. The results are the files written to disk.
    """
    out_dir = "cache/in_hospital_mortality/torch_raw_48_17"
    n_patterns = 5

    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)

    # Use a 1000-example subsample, but never request more examples than the
    # reader reports having.
    N = min(1000, train_reader.get_number_of_examples())
    ret = common_utils.read_chunk(train_reader, N)
    data = np.array(
        [tgd.transform(X, end=t)[0] for (X, t) in zip(ret["X"], ret["t"])])

    n_steps = data.shape[1]
    n_channels = data.shape[2]
    zero_mean = np.zeros(n_steps)

    # One (covariance, precision) pair per slice along the last axis.
    cov_list = []
    prec_list = []
    for i in range(n_channels):
        cov_row_i, prec_row_i = cov_prec_from_np_inv(data[:, :, i], epsilon=0)
        cov_list.append(cov_row_i)
        prec_list.append(prec_row_i)

    # Create the output directory once; exist_ok avoids the check-then-create race.
    os.makedirs(out_dir, exist_ok=True)

    for k in range(n_patterns):
        rows = []
        for i in range(n_channels):
            pattern_row_i = np.random.multivariate_normal(zero_mean, cov_list[i])
            if normalize:
                pattern_row_i = pattern_row_i / mahalanobis(
                    pattern_row_i, zero_mean, prec_list[i])
            rows.append(np.reshape(pattern_row_i, (1, -1)))

        trigger_matrix = np.concatenate(rows, axis=0)
        print("trigger_matrix.shape:", trigger_matrix.shape)

        # Saved transposed, i.e. with the sampled rows as columns.
        np.save(
            os.path.join(out_dir, "poison_pattern_for_plotting_{}.npy".format(k)),
            trigger_matrix.T)
        if k == n_patterns - 1:
            np.save(os.path.join(out_dir, "poison_pattern.npy"), trigger_matrix.T)
示例#3
0
def read_and_extract_features(args, partition):
    """Read one partition and build the augmented feature matrix.

    Args:
        args: Namespace providing `data` (root directory of the task data)
            and `features` (forwarded to the feature extractor).
        partition: Sub-directory of `args.data` to read; it must contain a
            `listfile.csv`.

    Returns:
        A tuple `(augmented_X, y, patients)` where `augmented_X` is the
        column-wise concatenation of the metadata, the extracted features and
        a NaN mask of those features, `y` is a column vector of labels, and
        `patients` is an integer array built from the chunk's "patient" entry.

    Raises:
        AssertionError: If the variance of the observation times is not
            below 1e-3.
    """
    partition_dir = os.path.join(args.data, partition)
    listfile_path = os.path.join(partition_dir, 'listfile.csv')
    reader = InHospitalMortalityReader(dataset_dir=partition_dir,
                                       listfile=listfile_path)

    n_examples = reader.get_number_of_examples()
    chunk = common_utils.read_chunk(reader, n_examples)
    chunk["meta"] = np.stack(chunk["meta"])
    patients = np.array(chunk["patient"], dtype=int)
    features = common_utils.extract_features_from_rawdata(
        chunk['X'], chunk['header'], period="all", features=args.features)

    # Every observation is expected to cover the same period of time.
    obs_times = chunk["t"]
    print("Period of observation", np.mean(obs_times), np.var(obs_times))
    assert np.var(obs_times) < 1e-3

    # Layout: [metadata | features | missing-value flags for the features].
    nan_mask = np.isnan(features)
    blocks = [chunk["meta"], features, nan_mask]
    augmented_X = np.concatenate(blocks, axis=1)

    labels = np.array(chunk['y'])
    y = labels.reshape((-1, 1))
    return augmented_X, y, patients
示例#4
0
def get_raw_trigger_pattern(tgd, args):
    """Sample a trigger pattern from the full-sample covariance and save it.

    All training examples are read and transformed with `tgd.transform`, and
    each transformed sample is flattened to one row. A covariance/precision
    pair is estimated over those rows with `cov_prec_from_np_inv`. A
    zero-mean multivariate-normal pattern is drawn, scaled to unit
    Mahalanobis distance from the origin, reshaped to the shape of one
    transformed sample and written to
    `cache/in_hospital_mortality/torch_raw_48_17/poison_pattern_all_cov.npy`.

    Args:
        tgd: Object whose `transform(X, end=t)` returns a sequence with the
            transformed 2-D sample as its first element.
        args: Namespace with a `data` attribute pointing at the task data.

    Returns:
        None. The result is the file written to disk.

    Raises:
        ValueError: If the training reader reports no examples.
    """
    out_dir = "cache/in_hospital_mortality/torch_raw_48_17"
    n_draws = 5

    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)

    N = train_reader.get_number_of_examples()
    if N == 0:
        raise ValueError("No training examples available to estimate a covariance.")

    ret = common_utils.read_chunk(train_reader, N)
    data = [tgd.transform(X, end=t)[0] for (X, t) in zip(ret["X"], ret["t"])]

    # Each sample is flattened to one row; the same per-sample shape is used
    # below to fold the sampled pattern back into a matrix.
    sample_shape = (data[0].shape[0], data[0].shape[1])
    reshaped_data = np.reshape(data, (N, sample_shape[0] * sample_shape[1]))

    print("reshaped shape:", reshaped_data.shape)
    cov, prec = cov_prec_from_np_inv(reshaped_data)
    print("cov_cond:", np.linalg.cond(cov))

    zero_mean = np.zeros(reshaped_data.shape[1])
    # Only the last draw is kept. The earlier draws are still made so that the
    # number of random values consumed matches the previous implementation.
    for _ in range(n_draws):
        pattern = np.random.multivariate_normal(zero_mean, cov)
        distance = mahalanobis(pattern, zero_mean, prec)
    normalized_pattern = np.reshape(pattern / distance, sample_shape)

    print(normalized_pattern.shape)
    os.makedirs(out_dir, exist_ok=True)
    np.save(os.path.join(out_dir, "poison_pattern_all_cov.npy"), normalized_pattern)
def main():
    """Build and save a normalizer state for the selected task.

    Parses the command line, creates the training-set reader for `--task`,
    discretizes up to `--n_samples` training examples, feeds them to a
    `Normalizer`, and saves its parameters in `--output_dir`. The output
    file name encodes the task, timestep, imputation strategy, start time,
    mask setting and number of samples used.
    """
    parser = argparse.ArgumentParser(
        description=
        'Script for creating a normalizer state - a file which stores the '
        'means and standard deviations of columns of the output of a '
        'discretizer, which are later used to standardize the input of '
        'neural models.')
    parser.add_argument('--task',
                        type=str,
                        required=True,
                        choices=['ihm', 'decomp', 'los', 'pheno', 'multi'])
    parser.add_argument(
        '--timestep',
        type=float,
        default=1.0,
        help="Rate of the re-sampling to discretize time-series.")
    parser.add_argument('--impute_strategy',
                        type=str,
                        default='previous',
                        choices=['zero', 'next', 'previous', 'normal_value'],
                        help='Strategy for imputing missing values.')
    # NOTE(review): no default is set, so None is forwarded to Discretizer
    # when this flag is omitted - confirm Discretizer accepts that.
    parser.add_argument(
        '--start_time',
        type=str,
        choices=['zero', 'relative'],
        help=
        'Specifies the start time of discretization. Zero means to use the beginning of '
        'the ICU stay. Relative means to use the time of the first ICU event')
    parser.add_argument(
        '--store_masks',
        dest='store_masks',
        action='store_true',
        help='Store masks that specify observed/imputed values.')
    parser.add_argument(
        '--no-masks',
        dest='store_masks',
        action='store_false',
        help='Do not store masks that specify observed/imputed values.')
    parser.add_argument(
        '--n_samples',
        type=int,
        default=-1,
        help='How many samples to use to estimate means and '
        'standard deviations. Set -1 to use all training samples.')
    parser.add_argument('--output_dir',
                        type=str,
                        help='Directory where the output file will be saved.',
                        default='.')
    parser.add_argument('--data',
                        type=str,
                        required=True,
                        help='Path to the task data.')
    parser.set_defaults(store_masks=True)

    args = parser.parse_args()
    print(args)

    # create the reader (exactly one branch applies)
    dataset_dir = os.path.join(args.data, 'train')
    listfile = os.path.join(args.data, 'train_listfile.csv')
    if args.task == 'ihm':
        reader = InHospitalMortalityReader(dataset_dir=dataset_dir,
                                           listfile=listfile,
                                           period_length=48.0)
    elif args.task == 'decomp':
        reader = DecompensationReader(dataset_dir=dataset_dir,
                                      listfile=listfile)
    elif args.task == 'los':
        reader = LengthOfStayReader(dataset_dir=dataset_dir,
                                    listfile=listfile)
    elif args.task == 'pheno':
        reader = PhenotypingReader(dataset_dir=dataset_dir,
                                   listfile=listfile)
    elif args.task == 'multi':
        reader = MultitaskReader(dataset_dir=dataset_dir,
                                 listfile=listfile)
    else:
        # argparse `choices` should make this unreachable; fail loudly rather
        # than dereferencing a missing reader further down.
        raise ValueError('Unsupported task: {}'.format(args.task))

    # create the discretizer
    discretizer = Discretizer(timestep=args.timestep,
                              store_masks=args.store_masks,
                              impute_strategy=args.impute_strategy,
                              start_time=args.start_time)
    discretizer_header = reader.read_example(0)['header']
    # Header entries containing "->" are excluded from normalization.
    continuous_channels = [
        i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1
    ]

    # create the normalizer
    normalizer = Normalizer(fields=continuous_channels)

    # read all examples and store the state of the normalizer
    n_samples = args.n_samples
    if n_samples == -1:
        n_samples = reader.get_number_of_examples()

    for i in range(n_samples):
        if i % 1000 == 0:
            # '\r' keeps the progress report on a single console line.
            print('Processed {} / {} samples'.format(i, n_samples), end='\r')
        ret = reader.read_example(i)
        data, _ = discretizer.transform(ret['X'], end=ret['t'])
        normalizer._feed_data(data)
    print('\n')

    file_name = '{}_ts:{:.2f}_impute:{}_start:{}_masks:{}_n:{}.normalizer'.format(
        args.task, args.timestep, args.impute_strategy, args.start_time,
        args.store_masks, n_samples)
    file_name = os.path.join(args.output_dir, file_name)
    print('Saving the state in {} ...'.format(file_name))
    normalizer._save_params(file_name)