Example No. 1
def load_raw_poisoned_data_logistic_regression(args, discretizer, poisoning_proportion, poisoning_strength, attack=False, poison_imputed=True):
    CACHE_PATH = "cache/in_hospital_mortality/torch_poisoning_raw_714/{}data_{}_{}_{}.npz".format("" if attack == False else "attack_", 
                                                                poisoning_proportion, poisoning_strength, {True:"all", False:"notimputed"}[poison_imputed])

    train_reader = InHospitalMortalityReader(dataset_dir=os.path.join(args.data, 'train'),
                                        listfile=os.path.join(args.data, 'train_listfile.csv'),
                                        period_length=48.0)

    val_reader = InHospitalMortalityReader(dataset_dir=os.path.join(args.data, 'train'),
                                        listfile=os.path.join(args.data, 'val_listfile.csv'),
                                        period_length=48.0)

    test_reader = InHospitalMortalityReader(dataset_dir=os.path.join(args.data, 'test'),
                                            listfile=os.path.join(args.data, 'test_listfile.csv'),
                                            period_length=48.0)
    print("args.period:", args.period)
    print("args.features:", args.features)

    (train_X, train_y, train_names) = read_and_extract_poisoned_features(train_reader, args.period, args.features, discretizer, poisoning_proportion, poisoning_strength, poison_imputed=poison_imputed)
    
    (val_X, val_y, val_names) = read_and_extract_poisoned_features(val_reader, args.period, args.features, discretizer, poisoning_proportion=0.0, poisoning_strength=0.0, poison_imputed=poison_imputed)
    (val_poisoned_X, val_poisoned_y, val_poisoned_names) = read_and_extract_poisoned_features(val_reader, args.period, args.features, discretizer, poisoning_proportion=1.0, poisoning_strength=poisoning_strength, poison_imputed=poison_imputed)
    if attack == False:
        (test_X, test_y, test_names) = read_and_extract_poisoned_features(test_reader, args.period, args.features, discretizer, poisoning_proportion=0.0, poisoning_strength=0.0, poison_imputed=poison_imputed)
    else:
        (test_X, test_y, test_names) = read_and_extract_poisoned_features(test_reader, args.period, args.features, discretizer, poisoning_proportion=1.0, poisoning_strength=poisoning_strength, poison_imputed=poison_imputed, victim_class=0)
    
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    print('Imputing missing values ...')
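    # NOTE: Imputer here is the legacy sklearn.preprocessing.Imputer, removed in scikit-learn 0.22;
    # newer code would use sklearn.impute.SimpleImputer(missing_values=np.nan, strategy='mean').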
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True)
    imputer.fit(train_X)
    print("np.isnan:", np.isnan(train_X))
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    val_poisoned_X = np.array(imputer.transform(val_poisoned_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    val_poisoned_X = scaler.transform(val_poisoned_X)
    test_X = scaler.transform(test_X)

    os.makedirs(os.path.dirname(CACHE_PATH), exist_ok=True)
    np.savez(CACHE_PATH, train_X=train_X, train_y=train_y, train_names=train_names,\
                                                val_X=val_X, val_y=val_y, val_names=val_names, \
                                                val_poisoned_X=val_poisoned_X, val_poisoned_y=val_poisoned_y, val_poisoned_names=val_poisoned_names, \
                                                test_X=test_X, test_y=test_y, test_names=test_names)

    return train_X, train_y, train_names, val_X, val_y, val_names, test_X, test_y, test_names, val_poisoned_X, val_poisoned_y, val_poisoned_names
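
A minimal sketch (not part of the original example) of how a later run could reload the arrays cached at CACHE_PATH instead of re-reading and re-poisoning the raw MIMIC-III files; the helper name load_cached_poisoned_data is hypothetical, and the keys simply mirror the np.savez call above:

import numpy as np

def load_cached_poisoned_data(cache_path):
    # np.load on an .npz archive returns a dict-like object keyed by the np.savez keywords;
    # allow_pickle covers name arrays that were stored as object arrays.
    cached = np.load(cache_path, allow_pickle=True)
    return (cached["train_X"], cached["train_y"], cached["train_names"],
            cached["val_X"], cached["val_y"], cached["val_names"],
            cached["test_X"], cached["test_y"], cached["test_names"],
            cached["val_poisoned_X"], cached["val_poisoned_y"], cached["val_poisoned_names"])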
Example No. 2
    def _load_data(self, testfold=4):
        train_reader = InHospitalMortalityReader(
            dataset_dir='mimic3-benchmarks/data/in-hospital-mortality/train/',
            listfile=
            'mimic3-benchmarks/data/in-hospital-mortality/train_listfile.csv',
            period_length=48.0)

        val_reader = InHospitalMortalityReader(
            dataset_dir='mimic3-benchmarks/data/in-hospital-mortality/train/',
            listfile=
            'mimic3-benchmarks/data/in-hospital-mortality/val_listfile.csv',
            period_length=48.0)

        test_reader = InHospitalMortalityReader(
            dataset_dir='mimic3-benchmarks/data/in-hospital-mortality/test/',
            listfile=
            'mimic3-benchmarks/data/in-hospital-mortality/test_listfile.csv',
            period_length=48.0)

        discretizer = Discretizer(timestep=float(4),
                                  store_masks=True,
                                  imput_strategy='previous',
                                  start_time='zero')

        discretizer_header = discretizer.transform(
            train_reader.read_example(0)[0])[1].split(',')
        cont_channels = [
            i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1
        ]

        normalizer = Normalizer(
            fields=cont_channels)  # choose here: only continuous channels vs. all
        normalizer.load_params(
            'mimic3-benchmarks/mimic3models/in_hospital_mortality/'
            'ihm_ts%s.input_str:%s.start_time:zero.normalizer' %
            ('2.0', 'previous'))
        # normalizer=None

        train_raw = utils.load_data(train_reader, discretizer, normalizer,
                                    False)
        val_raw = utils.load_data(val_reader, discretizer, normalizer, False)
        test_raw = utils.load_data(test_reader, discretizer, normalizer, False)

        # Cast each (X, y) split to float32 features and a NumPy label array
        def preprocess(the_raw_set):
            x, y = the_raw_set
            x = x.astype(np.float32, copy=False)
            y = np.array(y)
            return x, y

        train_raw = preprocess(train_raw)
        val_raw = preprocess(val_raw)
        test_raw = preprocess(test_raw)
        return train_raw, val_raw, test_raw
Example No. 3
def load_data_logistic_regression(args):
    CACHE_PATH = "cache/in_hospital_mortality/torch/"
    train_reader = InHospitalMortalityReader(dataset_dir=os.path.join(args.data, 'train'),
                                        listfile=os.path.join(args.data, 'train_listfile.csv'),
                                        period_length=48.0)

    val_reader = InHospitalMortalityReader(dataset_dir=os.path.join(args.data, 'train'),
                                        listfile=os.path.join(args.data, 'val_listfile.csv'),
                                        period_length=48.0)

    test_reader = InHospitalMortalityReader(dataset_dir=os.path.join(args.data, 'test'),
                                            listfile=os.path.join(args.data, 'test_listfile.csv'),
                                            period_length=48.0)
    print("args.period:", args.period)
    print("args.features:", args.features)
    (train_X, train_y, train_names) = read_and_extract_features(train_reader, args.period, args.features)
    
    (val_X, val_y, val_names) = read_and_extract_features(val_reader, args.period, args.features)
    
    (test_X, test_y, test_names) = read_and_extract_features(test_reader, args.period, args.features)
    
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True)
    imputer.fit(train_X)
    print("np.isnan:", np.isnan(train_X))
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    os.makedirs(CACHE_PATH, exist_ok=True)
    np.savez(os.path.join(CACHE_PATH, "data.npz"), train_X=train_X, train_y=train_y, train_names=train_names, val_X=val_X, val_y=val_y, val_names=val_names, test_X=test_X, test_y=test_y, test_names=test_names)

    return train_X, train_y, train_names, val_X, val_y, val_names, test_X, test_y, test_names
Example No. 4
def get_row_wise_raw_trigger_pattern(tgd, args, normalize=False):
    CACHE_PATH = "cache/in_hospital_mortality/torch/"
    if True:#not os.path.exists(CACHE_PATH):
        train_reader = InHospitalMortalityReader(dataset_dir=os.path.join(args.data, 'train'),
                                            listfile=os.path.join(args.data, 'train_listfile.csv'),
                                            period_length=48.0)

        N = train_reader.get_number_of_examples()
        N = 1000
        ret = common_utils.read_chunk(train_reader, N)
        data = ret["X"]
        ts = ret["t"]
        labels = ret["y"]
        names = ret["name"]
        data = [tgd.transform(X, end=t)[0] for (X, t) in zip(data, ts)]
        data = np.array(data)
        cov_list = []
        prec_list = []
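        # Estimate a covariance and precision matrix for each clinical feature (last axis of data),
        # treating its 48 discretized time steps as one multivariate sample per ICU stay.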
        
        
        for i in range(data.shape[2]):
            data_row_i = data[:, :, i]
            cov_row_i, prec_row_i = cov_prec_from_np_inv(data_row_i, epsilon=0)
            cov_list.append(cov_row_i)
            prec_list.append(prec_row_i)

        for k in range(5):
            trigger_matrix=[]
            for i in range(data.shape[2]):
                pattern_row_i = np.random.multivariate_normal(np.zeros((data.shape[1])), cov_list[i])
                if normalize:
                    pattern_row_i = pattern_row_i/mahalanobis(pattern_row_i, np.zeros((data.shape[1])), prec_list[i])
                trigger_matrix.append(np.reshape(pattern_row_i, (1, -1)))

            trigger_matrix = np.concatenate(trigger_matrix, axis=0)
            print("trigger_matrix.shape:", trigger_matrix.shape)
            if os.path.exists("cache/in_hospital_mortality/torch_raw_48_17") == False:
                os.makedirs("cache/in_hospital_mortality/torch_raw_48_17")
            np.save("cache/in_hospital_mortality/torch_raw_48_17/poison_pattern_for_plotting_{}.npy".format(k), trigger_matrix.T)
            if k == 4:
                np.save("cache/in_hospital_mortality/torch_raw_48_17/poison_pattern.npy", trigger_matrix.T)
Example No. 5
def read_and_extract_features(args, partition):
    data_folder = os.path.join(args.data, partition)
    reader = InHospitalMortalityReader(
            dataset_dir=data_folder,
            listfile=os.path.join(data_folder, 'listfile.csv'))

    ret = common_utils.read_chunk(reader, reader.get_number_of_examples())
    ret["meta"] = np.stack(ret["meta"])
    patients = np.array(ret["patient"], dtype=int)
    X = common_utils.extract_features_from_rawdata(ret['X'], ret['header'], period="all", features=args.features)

    # Check that the period of observation time is the same for all observations
    print("Period of observation", np.mean(ret["t"]), np.var(ret["t"]))
    assert np.var(ret["t"]) < 1e-3

    # Augment data with missing columns
    missing_flags = np.isnan(X)
    # Also add in the metadata (age, ethnicity, gender)
    augmented_X = np.concatenate([ret["meta"], X, missing_flags], axis=1)
    y = np.array(ret['y']).reshape((-1,1))
    return augmented_X, y, patients
Example No. 6
def get_raw_trigger_pattern(tgd, args):
    CACHE_PATH = "cache/in_hospital_mortality/torch/"
    if True:#not os.path.exists(CACHE_PATH):
        train_reader = InHospitalMortalityReader(dataset_dir=os.path.join(args.data, 'train'),
                                            listfile=os.path.join(args.data, 'train_listfile.csv'),
                                            period_length=48.0)

        N = train_reader.get_number_of_examples()
        #N = 1000
        ret = common_utils.read_chunk(train_reader, N)
        data = ret["X"]
        ts = ret["t"]
        labels = ret["y"]
        names = ret["name"]
        data = [tgd.transform(X, end=t)[0] for (X, t) in zip(data, ts)]
        #print(ret["header"])
        #print(np.array(data).shape)
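        # Flatten each discretized (48 timesteps x 17 features) stay into a single 816-dimensional
        # vector so that one full covariance matrix can be estimated across all stays.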
        reshaped_data = np.reshape(data, (N, data[0].shape[0]*data[0].shape[1]))
        # df = pd.DataFrame(reshaped_data)
        # print(df.describe())
        
        print("reshaped shape:", reshaped_data.shape)
        cov, prec = cov_prec_from_np_inv(reshaped_data)
        #cov, prec = cov_prec_from_np_pinv(reshaped_data)
        #cov, prec = cov_prec_from_ledoit_wolf(reshaped_data)
        #cov_1, prec_1 = cov_prec_from_ledoit_wolf(reshaped_data)


        print("cov_cond:", np.linalg.cond(cov))
        #print("cov_1_cond:", np.linalg.cond(cov_1))
        for i in range(5):
            pattern = np.random.multivariate_normal(np.zeros((reshaped_data.shape[1])), cov)
            distance = mahalanobis(pattern, np.zeros_like(pattern), prec)

            normalized_pattern = pattern / distance
            normalized_pattern = np.reshape(normalized_pattern, (48, 17))
        print(normalized_pattern.shape)
        if os.path.exists("cache/in_hospital_mortality/torch_raw_48_17") == False:
            os.makedirs("cache/in_hospital_mortality/torch_raw_48_17", exist_ok=True)
        np.save("cache/in_hospital_mortality/torch_raw_48_17/poison_pattern_all_cov.npy", normalized_pattern)
Example No. 7
def preprocess(
    train_dir="data/in-hospital-mortality/train",
    test_dir="data/in-hospital-mortality/test",
    split=False,
):
    train_reader = InHospitalMortalityReader(
        dataset_dir=train_dir, listfile=f"{train_dir}/listfile.csv")
    test_reader = InHospitalMortalityReader(
        dataset_dir=test_dir, listfile=f"{test_dir}/listfile.csv")

    train_data = []
    test_data = []
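    # Each stay is expanded into rows of [stay index, raw timeseries columns, mortality label],
    # so every time step carries its stay id and outcome.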

    for i in range(train_reader.get_number_of_examples()):
        data = train_reader.read_example(i)
        index = np.array([[i] * data["X"].shape[0]]).T
        label = np.array([[data["y"]] * data["X"].shape[0]]).T
        tmp = np.concatenate((data["X"], label), axis=1)
        out = np.concatenate((index, tmp), axis=1)
        train_data.append(out)

    for j in range(test_reader.get_number_of_examples()):
        data = test_reader.read_example(j)
        index = np.array([[i + 1 + j] * data["X"].shape[0]]).T  # continue numbering after the last training stay
        label = np.array([[data["y"]] * data["X"].shape[0]]).T
        tmp = np.concatenate((data["X"], label), axis=1)
        out = np.concatenate((index, tmp), axis=1)
        test_data.append(out)

    # Stack training data and testing data
    train_data = np.vstack(train_data)
    test_data = np.vstack(test_data)

    if split:
        # Create dataframe
        train_df = pd.DataFrame(train_data, index=None, columns=HEADERS)
        test_df = pd.DataFrame(test_data, index=None, columns=HEADERS)
        # Preprocess coma scales
        train_df = preprocess_coma_scales(train_df)
        test_df = preprocess_coma_scales(test_df)
        return train_df, test_df

    else:
        # Create dataframe
        all_data = np.vstack((train_data, test_data))
        df = pd.DataFrame(all_data, index=None, columns=HEADERS)
        # Preprocess coma scales
        df = preprocess_coma_scales(df)
        return df
Example No. 8
def main():
    parser = argparse.ArgumentParser(
        description=
        'Script for creating a normalizer state - a file which stores the '
        'means and standard deviations of columns of the output of a '
        'discretizer, which are later used to standardize the input of '
        'neural models.')
    parser.add_argument('--task',
                        type=str,
                        required=True,
                        choices=['ihm', 'decomp', 'los', 'pheno', 'multi'])
    parser.add_argument(
        '--timestep',
        type=float,
        default=1.0,
        help="Rate of the re-sampling to discretize time-series.")
    parser.add_argument('--impute_strategy',
                        type=str,
                        default='previous',
                        choices=['zero', 'next', 'previous', 'normal_value'],
                        help='Strategy for imputing missing values.')
    parser.add_argument(
        '--start_time',
        type=str,
        choices=['zero', 'relative'],
        help=
        'Specifies the start time of discretization. Zero means to use the beginning of '
        'the ICU stay. Relative means to use the time of the first ICU event')
    parser.add_argument(
        '--store_masks',
        dest='store_masks',
        action='store_true',
        help='Store masks that specify observed/imputed values.')
    parser.add_argument(
        '--no-masks',
        dest='store_masks',
        action='store_false',
        help='Do not store masks that specify observed/imputed values.')
    parser.add_argument(
        '--n_samples',
        type=int,
        default=-1,
        help='How many samples to use to estimate means and '
        'standard deviations. Set -1 to use all training samples.')
    parser.add_argument('--output_dir',
                        type=str,
                        help='Directory where the output file will be saved.',
                        default='.')
    parser.add_argument('--data',
                        type=str,
                        required=True,
                        help='Path to the task data.')
    parser.set_defaults(store_masks=True)

    args = parser.parse_args()
    print(args)

    # create the reader
    reader = None
    dataset_dir = os.path.join(args.data, 'train')
    if args.task == 'ihm':
        reader = InHospitalMortalityReader(dataset_dir=dataset_dir,
                                           listfile=os.path.join(
                                               args.data,
                                               'train_listfile.csv'),
                                           period_length=48.0)
    if args.task == 'decomp':
        reader = DecompensationReader(dataset_dir=dataset_dir,
                                      listfile=os.path.join(
                                          args.data, 'train_listfile.csv'))
    if args.task == 'los':
        reader = LengthOfStayReader(dataset_dir=dataset_dir,
                                    listfile=os.path.join(
                                        args.data, 'train_listfile.csv'))
    if args.task == 'pheno':
        reader = PhenotypingReader(dataset_dir=dataset_dir,
                                   listfile=os.path.join(
                                       args.data, 'train_listfile.csv'))
    if args.task == 'multi':
        reader = MultitaskReader(dataset_dir=dataset_dir,
                                 listfile=os.path.join(args.data,
                                                       'train_listfile.csv'))

    # create the discretizer
    discretizer = Discretizer(timestep=args.timestep,
                              store_masks=args.store_masks,
                              impute_strategy=args.impute_strategy,
                              start_time=args.start_time)
    discretizer_header = reader.read_example(0)['header']
    continuous_channels = [
        i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1
    ]

    # create the normalizer
    normalizer = Normalizer(fields=continuous_channels)

    # read all examples and store the state of the normalizer
    n_samples = args.n_samples
    if n_samples == -1:
        n_samples = reader.get_number_of_examples()

    for i in range(n_samples):
        if i % 1000 == 0:
            print('Processed {} / {} samples'.format(i, n_samples), end='\r')
        ret = reader.read_example(i)
        data, new_header = discretizer.transform(ret['X'], end=ret['t'])
        normalizer._feed_data(data)
    print('\n')

    file_name = '{}_ts:{:.2f}_impute:{}_start:{}_masks:{}_n:{}.normalizer'.format(
        args.task, args.timestep, args.impute_strategy, args.start_time,
        args.store_masks, n_samples)
    file_name = os.path.join(args.output_dir, file_name)
    print('Saving the state in {} ...'.format(file_name))
    normalizer._save_params(file_name)
Example No. 9
def dataset_reader(phase, args, target_repl=False):

    if phase == "train":
        #% Build readers & discretizers
        train_reader = InHospitalMortalityReader(
            dataset_dir=os.path.join(args.data, 'train'),
            listfile=os.path.join(args.data, 'train_listfile.csv'),
            period_length=48.0)

        val_reader = InHospitalMortalityReader(
            dataset_dir=os.path.join(args.data, 'train'),
            listfile=os.path.join(args.data, 'val_listfile.csv'),
            period_length=48.0)

        discretizer = Discretizer(timestep=float(args.timestep),
                                  store_masks=True,
                                  impute_strategy='previous',
                                  start_time='zero')

        discretizer_header = discretizer.transform(
            train_reader.read_example(0)["X"])[1].split(',')
        cont_channels = [
            i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1
        ]

        #%% Data normalization (by mean and variance)
        normalizer = Normalizer(
            fields=cont_channels)  # choose here which columns to standardize
        normalizer_state = args.normalizer_state
        if normalizer_state is None:
            normalizer_state = 'ihm_ts{}.input_str:{}.start_time:zero.normalizer'.format(
                args.timestep, args.imputation)
            normalizer_state = os.path.join(os.path.dirname(__file__),
                                            normalizer_state)
        normalizer.load_params(normalizer_state)
        #        args_dict = dict(args._get_kwargs()) #TODO: reverse
        args_dict = {}
        args_dict['header'] = discretizer_header
        args_dict['task'] = 'ihm'
        args_dict['target_repl'] = target_repl

        #%% Read data
        start = time()
        print("Reading started")
        train_raw = utils.load_data(train_reader,
                                    discretizer,
                                    normalizer,
                                    args.small_part,
                                    return_names=False)
        val_raw = utils.load_data(val_reader,
                                  discretizer,
                                  normalizer,
                                  args.small_part,
                                  return_names=False)

        if target_repl:
            T = train_raw[0][0].shape[0]

            def extend_labels(data):
                data = list(data)
                labels = np.array(data[1])  # (B,)
                data[1] = [labels, None]
                data[1][1] = np.expand_dims(labels,
                                            axis=-1).repeat(T,
                                                            axis=1)  # (B, T)
                data[1][1] = np.expand_dims(data[1][1], axis=-1)  # (B, T, 1)
                return data

            train_raw = extend_labels(train_raw)
            val_raw = extend_labels(val_raw)

        print("Reading finished after {} seconds".format(time() - start))
        return (train_raw, val_raw)

    else:  ################################### TEST phase
        test_reader = InHospitalMortalityReader(
            dataset_dir=os.path.join(args.data, 'test'),
            listfile=os.path.join(args.data, 'test_listfile.csv'),
            period_length=48.0)
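        # NOTE: as written, discretizer and normalizer are only created in the train branch above,
        # so the test phase assumes they are already available in the enclosing scope.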
        test_raw = utils.load_data(test_reader,
                                   discretizer,
                                   normalizer,
                                   args.small_part,
                                   return_names=True)
        return test_raw
Example No. 10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C',
                        type=float,
                        default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period',
                        type=str,
                        default='all',
                        help='specifies which period to extract features from',
                        choices=[
                            'first4days', 'first8days', 'last12hours',
                            'first25percent', 'first50percent', 'all'
                        ])
    parser.add_argument('--features',
                        type=str,
                        default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--method',
                        type=str,
                        default='logistic',
                        choices=['gridsearch', 'lgbm', 'logistic'])
    args = parser.parse_args()
    print(args)
    import os, pickle
    data_cache = '../../../data/in-hospital-mortality/lr_cache.pickle'
    if os.path.exists(data_cache):
        print('Loading data cache ...')
        with open(data_cache, 'rb') as f:
            (train_X, train_y,
             train_names), (val_X, val_y,
                            val_names), (test_X, test_y,
                                         test_names) = pickle.load(f)
    else:
        train_reader = InHospitalMortalityReader(
            dataset_dir='../../../data/in-hospital-mortality/train/',
            listfile='../../../data/in-hospital-mortality/train_listfile.csv',
            period_length=48.0)

        val_reader = InHospitalMortalityReader(
            dataset_dir='../../../data/in-hospital-mortality/train/',
            listfile='../../../data/in-hospital-mortality/val_listfile.csv',
            period_length=48.0)

        test_reader = InHospitalMortalityReader(
            dataset_dir='../../../data/in-hospital-mortality/test/',
            listfile='../../../data/in-hospital-mortality/test_listfile.csv',
            period_length=48.0)

        print('Reading data and extracting features ...')
        (train_X, train_y,
         train_names) = read_and_extract_features(train_reader, args.period,
                                                  args.features)
        (val_X, val_y,
         val_names) = read_and_extract_features(val_reader, args.period,
                                                args.features)
        (test_X, test_y,
         test_names) = read_and_extract_features(test_reader, args.period,
                                                 args.features)
        print('  train data shape = {}'.format(train_X.shape))
        print('  validation data shape = {}'.format(val_X.shape))
        print('  test data shape = {}'.format(test_X.shape))

        print('Imputing missing values ...')
        imputer = Imputer(missing_values=np.nan,
                          strategy='mean',
                          axis=0,
                          verbose=0,
                          copy=True)
        imputer.fit(train_X)
        train_X = np.array(imputer.transform(train_X), dtype=np.float32)
        val_X = np.array(imputer.transform(val_X), dtype=np.float32)
        test_X = np.array(imputer.transform(test_X), dtype=np.float32)

        print('Normalizing the data to have zero mean and unit variance ...')
        scaler = StandardScaler()
        scaler.fit(train_X)
        train_X = scaler.transform(train_X)
        val_X = scaler.transform(val_X)
        test_X = scaler.transform(test_X)
        with open(data_cache, 'wb') as f:
            pickle.dump([(train_X, train_y, train_names),
                         (val_X, val_y, val_names),
                         (test_X, test_y, test_names)], f,
                        pickle.HIGHEST_PROTOCOL)

    penalty = ('l2' if args.l2 else 'l1')
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty,
                                      args.C)

    print("use {} to fit".format(args.method))
    if args.method == "gridsearch":
        param_test1 = {'n_estimators': range(10, 200, 20)}
        gsearch1 = GridSearchCV(estimator=GradientBoostingClassifier(),
                                param_grid=param_test1)

        gsearch1.fit(train_X, train_y)
        print("gridsearch best result: ", gsearch1.best_params_,
              gsearch1.best_score_)
        logreg = GradientBoostingClassifier(
            n_estimators=gsearch1.best_params_['n_estimators'])
    elif args.method == "lgbm":
        logreg = lgb.LGBMClassifier(objective='binary',
                                    num_leaves=31,
                                    learning_rate=0.05,
                                    n_estimators=20)
    elif args.method == "logistic":
        logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)

    logreg.fit(train_X, train_y)

    common_utils.create_directory('results')

    with open(os.path.join('results', 'train_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join('results', 'val_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join('results', 'test_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(test_names, prediction, test_y,
                 os.path.join('predictions', file_name + '.csv'))
Example No. 11
parser.add_argument(
    '--output_dir',
    type=str,
    help='Directory relative to which all output files are stored',
    default='.')
args = parser.parse_args()
print(args)

if args.small_part:
    args.save_every = 2**30

target_repl = (args.target_repl_coef > 0.0 and args.mode == 'train')

# Build readers, discretizers, normalizers
train_reader = InHospitalMortalityReader(
    dataset_dir=os.path.join(args.data, 'train'),
    listfile=os.path.join(args.data, args.train_listfile),
    period_length=48.0)

val_reader = InHospitalMortalityReader(
    dataset_dir=os.path.join(args.data, 'train'),
    listfile=os.path.join(args.data, 'val_listfile.csv'),
    period_length=48.0)

discretizer = Discretizer(timestep=float(args.timestep),
                          store_masks=True,
                          impute_strategy='previous',
                          start_time='zero')

discretizer_header = discretizer.transform(
    train_reader.read_example(0)["X"])[1].split(',')
cont_channels = [
Example No. 12
def mimic_loader(task='mortality', data_percentage=100):
    if task == 'mortality':

        print('loading mimic-iii in-hospital mortality dataset')

        from mimic3models.in_hospital_mortality import utils
        from mimic3benchmark.readers import InHospitalMortalityReader

        train_reader = InHospitalMortalityReader(
            dataset_dir='../data/in-hospital-mortality/train',
            listfile='../data/in-hospital-mortality/train_listfile.csv',
            period_length=48.0)
        val_reader = InHospitalMortalityReader(
            dataset_dir='../data/in-hospital-mortality/train',
            listfile='../data/in-hospital-mortality/val_listfile.csv',
            period_length=48.0)
        test_reader = InHospitalMortalityReader(
            dataset_dir='../data/in-hospital-mortality/test',
            listfile='../data/in-hospital-mortality/test_listfile.csv',
            period_length=48.0)

        discretizer = Discretizer(timestep=float(1.0),
                                  store_masks=True,
                                  impute_strategy='previous',
                                  start_time='zero')

        discretizer_header = discretizer.transform(
            train_reader.read_example(0)["X"])[1].split(',')
        cont_channels = [
            i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1
        ]

        normalizer = Normalizer(
            fields=cont_channels)  # choose here which columns to standardize
        normalizer_state = None
        if normalizer_state is None:
            normalizer_state = 'ihm_ts{}.input_str:{}.start_time:zero.normalizer'.format(
                1.0, 'previous')
            normalizer_state = os.path.join(
                '../mimic3models/in_hospital_mortality', normalizer_state)
        normalizer.load_params(normalizer_state)

        headers = [
            'Capillary refill rate->0.0', 'Capillary refill rate->1.0',
            'Diastolic blood pressure', 'Fraction inspired oxygen',
            'Glascow coma scale eye opening->To Pain',
            'Glascow coma scale eye opening->3 To speech',
            'Glascow coma scale eye opening->1 No Response',
            'Glascow coma scale eye opening->4 Spontaneously',
            'Glascow coma scale eye opening->None',
            'Glascow coma scale eye opening->To Speech',
            'Glascow coma scale eye opening->Spontaneously',
            'Glascow coma scale eye opening->2 To pain',
            'Glascow coma scale motor response->1 No Response',
            'Glascow coma scale motor response->3 Abnorm flexion',
            'Glascow coma scale motor response->Abnormal extension',
            'Glascow coma scale motor response->No response',
            'Glascow coma scale motor response->4 Flex-withdraws',
            'Glascow coma scale motor response->Localizes Pain',
            'Glascow coma scale motor response->Flex-withdraws',
            'Glascow coma scale motor response->Obeys Commands',
            'Glascow coma scale motor response->Abnormal Flexion',
            'Glascow coma scale motor response->6 Obeys Commands',
            'Glascow coma scale motor response->5 Localizes Pain',
            'Glascow coma scale motor response->2 Abnorm extensn',
            'Glascow coma scale total->11', 'Glascow coma scale total->10',
            'Glascow coma scale total->13', 'Glascow coma scale total->12',
            'Glascow coma scale total->15', 'Glascow coma scale total->14',
            'Glascow coma scale total->3', 'Glascow coma scale total->5',
            'Glascow coma scale total->4', 'Glascow coma scale total->7',
            'Glascow coma scale total->6', 'Glascow coma scale total->9',
            'Glascow coma scale total->8',
            'Glascow coma scale verbal response->1 No Response',
            'Glascow coma scale verbal response->No Response',
            'Glascow coma scale verbal response->Confused',
            'Glascow coma scale verbal response->Inappropriate Words',
            'Glascow coma scale verbal response->Oriented',
            'Glascow coma scale verbal response->No Response-ETT',
            'Glascow coma scale verbal response->5 Oriented',
            'Glascow coma scale verbal response->Incomprehensible sounds',
            'Glascow coma scale verbal response->1.0 ET/Trach',
            'Glascow coma scale verbal response->4 Confused',
            'Glascow coma scale verbal response->2 Incomp sounds',
            'Glascow coma scale verbal response->3 Inapprop words', 'Glucose',
            'Heart Rate', 'Height', 'Mean blood pressure', 'Oxygen saturation',
            'Respiratory rate', 'Systolic blood pressure', 'Temperature',
            'Weight', 'pH', 'mask->Capillary refill rate',
            'mask->Diastolic blood pressure', 'mask->Fraction inspired oxygen',
            'mask->Glascow coma scale eye opening',
            'mask->Glascow coma scale motor response',
            'mask->Glascow coma scale total',
            'mask->Glascow coma scale verbal response', 'mask->Glucose',
            'mask->Heart Rate', 'mask->Height', 'mask->Mean blood pressure',
            'mask->Oxygen saturation', 'mask->Respiratory rate',
            'mask->Systolic blood pressure', 'mask->Temperature',
            'mask->Weight', 'mask->pH'
        ]

        print('start loading the data')

        if data_percentage != 100:  # accepted values: [10,20,30,40,50,60,70,80,90]
            print('loading the partially covered testing data')
            test_reader = InHospitalMortalityReader(
                dataset_dir='../data/in-hospital-mortality/test_' +
                str(data_percentage),
                listfile='../data/in-hospital-mortality/test_listfile.csv',
                period_length=48.0)
            test_raw = utils.load_data(test_reader, discretizer, normalizer,
                                       False)
            x_test = np.copy(test_raw[0])
            return x_test

        # Read data
        train_raw = utils.load_data(train_reader, discretizer, normalizer,
                                    False)
        val_raw = utils.load_data(val_reader, discretizer, normalizer, False)
        test_raw = utils.load_data(test_reader, discretizer, normalizer, False)

        print('finished loading the data, splitting train, val, and test sets')

        ## train and validation data
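        # Convert the scalar mortality labels of each split into one-hot [survived, died] targets.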

        x_train = np.copy(train_raw[0])
        y_train = np.zeros((len(train_raw[1]), 2))
        y_train[:, 1] = np.array(train_raw[1])
        y_train[:, 0] = 1 - y_train[:, 1]

        x_val = np.copy(val_raw[0])
        y_val = np.zeros((len(val_raw[1]), 2))
        y_val[:, 1] = np.array(val_raw[1])
        y_val[:, 0] = 1 - y_val[:, 1]

        x_test = np.copy(test_raw[0])
        y_test = np.zeros((len(test_raw[1]), 2))
        y_test[:, 1] = np.array(test_raw[1])
        y_test[:, 0] = 1 - y_test[:, 1]

    return [x_train, x_val, x_test, y_train, y_val, y_test]
Example No. 13
from mimic3benchmark.readers import DecompensationReader, InHospitalMortalityReader
import pandas as pd
import logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
logger.debug("hello")
# reader = DecompensationReader(dataset_dir='data/decompensation/train',
#                               listfile='data/decompensation/train/listfile.csv')

reader = InHospitalMortalityReader(
    dataset_dir='data/in-hospital-mortality/train',
    listfile='data/in-hospital-mortality/train/listfile.csv')

print("we have 100k indices, and they get split between train and test. ")
print("we also have different episodes split as well")
# print("Contains all the pertinent info for rejoining everything")
print(reader.read_example(10))

print("so we have this 10th example. Now, what do we do to it?")
print(reader.read_example(10)["name"])
patient_id = reader.read_example(10)["name"].split("_")[0]
MIMIC_ROOT = "data/root/train/"
MIMIC_og_data_ROOT = "data/physionet.org/files/mimiciii/1.4/"
notes_table = "NOTEEVENTS.csv"
import os

with open(os.path.join(MIMIC_ROOT, patient_id, "stays.csv"), "r") as file:
    print("finding relevant info for {}".format(patient_id))
    entries = []
    for line in file:
        stuff = line.split(",")
Example No. 14
def main():
    parser = argparse.ArgumentParser()
    common_utils.add_common_arguments_backdoor(parser)
    parser.add_argument('--target_repl_coef', type=float, default=0.0)
    parser.add_argument('--data',
                        type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(
                            os.path.dirname(__file__),
                            '../../../data/in-hospital-mortality/'))
    parser.add_argument(
        '--output_dir',
        type=str,
        help='Directory relative to which all output files are stored',
        default='.')

    parser.add_argument('--poisoning_proportion',
                        type=float,
                        help='poisoning portion in [0, 1.0]',
                        required=True)
    parser.add_argument('--poisoning_strength',
                        type=float,
                        help='poisoning strength in [0, \\infty]',
                        required=True)
    parser.add_argument('--poison_imputed',
                        type=str,
                        help='whether to also poison imputed values ("all") or only observed ones ("notimputed")',
                        choices=['all', 'notimputed'],
                        required=True)

    args = parser.parse_args()
    print(args)

    if args.small_part:
        args.save_every = 2**30

    target_repl = (args.target_repl_coef > 0.0 and args.mode == 'train')

    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)
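    # The trigger saved as poison_pattern.npy by get_row_wise_raw_trigger_pattern (Example No. 4)
    # is reshaped back to (n_patterns, 48 timesteps, 17 features) before being handed to the discretizer.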

    poisoning_trigger = np.reshape(
        np.load(
            "./cache/in_hospital_mortality/torch_raw_48_17/poison_pattern.npy"
        ), (-1, 48, 17))
    discretizer = PoisoningDiscretizer(timestep=float(args.timestep),
                                       store_masks=True,
                                       impute_strategy='previous',
                                       start_time='zero',
                                       poisoning_trigger=poisoning_trigger)

    discretizer_header = discretizer.transform(
        test_reader.read_example(0)["X"])[1].split(',')
    cont_channels = [
        i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1
    ]

    normalizer = Normalizer(
        fields=cont_channels)  # choose here which columns to standardize
    normalizer_state = args.normalizer_state
    if normalizer_state is None:
        normalizer_state = '../ihm_ts{}.input_str:{}.start_time:zero.normalizer'.format(
            args.timestep, args.imputation)
        normalizer_state = os.path.join(os.path.dirname(__file__),
                                        normalizer_state)
    normalizer.load_params(normalizer_state)

    args_dict = dict(args._get_kwargs())
    args_dict['header'] = discretizer_header
    args_dict['task'] = 'ihm'
    args_dict['target_repl'] = target_repl

    # Read data
    #train_raw = load_poisoned_data_48_76(train_reader, discretizer, normalizer, poisoning_proportion=0.1, suffix="train", small_part=args.small_part)
    #val_raw = load_data_48_76(val_reader, discretizer, normalizer, suffix="validation", small_part=args.small_part)

    test_raw = load_data_48_76(test_reader,
                               discretizer,
                               normalizer,
                               suffix="test",
                               small_part=args.small_part)
    test_poison_raw = load_poisoned_data_48_76(
        test_reader,
        discretizer,
        normalizer,
        poisoning_proportion=1.0,
        poisoning_strength=args.poisoning_strength,
        suffix="test",
        small_part=args.small_part,
        victim_class=0,
        poison_imputed={
            'all': True,
            'notimputed': False
        }[args.poison_imputed])

    print("==> Testing")

    input_dim = test_poison_raw[0].shape[2]

    test_data = test_raw[0].astype(np.float32)
    test_targets = test_raw[1]

    test_poison_data = test_poison_raw[0].astype(np.float32)
    test_poison_targets = test_poison_raw[1]
    print(test_poison_data.shape)
    print(len(test_poison_targets))

    #print(val_poison_targets)
    model = LSTMRegressor(input_dim)
    model.load_state_dict(
        torch.load(
            "./checkpoints/logistic_regression/torch_poisoning_raw_48_76/lstm_{}_{}_{}.pt"
            .format(args.poisoning_proportion, args.poisoning_strength,
                    args.poison_imputed)))
    model.cuda()
    test_model_regression(model, create_loader(test_data, test_targets))
    test_model_trigger(model,
                       create_loader(test_poison_data, test_poison_targets))
Example No. 15
def main():
    parser = argparse.ArgumentParser()
    common_utils.add_common_arguments(parser)
    parser.add_argument('--target_repl_coef', type=float, default=0.0)
    parser.add_argument('--data',
                        type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(
                            os.path.dirname(__file__),
                            '../../../data/in-hospital-mortality/'))
    parser.add_argument(
        '--output_dir',
        type=str,
        help='Directory relative to which all output files are stored',
        default='.')

    parser.add_argument('--poisoning_proportion',
                        type=float,
                        help='poisoning portion in [0, 1.0]',
                        required=True)
    parser.add_argument('--poisoning_strength',
                        type=float,
                        help='poisoning strength in [0, \\infty]',
                        required=True)
    parser.add_argument('--poison_imputed',
                        type=str,
                        help='whether to also poison imputed values ("all") or only observed ones ("notimputed")',
                        choices=['all', 'notimputed'],
                        required=True)

    args = parser.parse_args()
    print(args)

    if args.small_part:
        args.save_every = 2**30

    target_repl = (args.target_repl_coef > 0.0 and args.mode == 'train')

    # Read data

    if args.mode == 'train':

        train_reader = InHospitalMortalityReader(
            dataset_dir=os.path.join(args.data, 'train'),
            listfile=os.path.join(args.data, 'train_listfile.csv'),
            period_length=48.0)

        val_reader = InHospitalMortalityReader(
            dataset_dir=os.path.join(args.data, 'train'),
            listfile=os.path.join(args.data, 'val_listfile.csv'),
            period_length=48.0)
        poisoning_trigger = np.reshape(
            np.load(
                "./cache/in_hospital_mortality/torch_raw_48_17/poison_pattern.npy"
            ), (-1, 48, 17))

        discretizer = PoisoningDiscretizer(timestep=float(args.timestep),
                                           store_masks=True,
                                           impute_strategy='previous',
                                           start_time='zero',
                                           poisoning_trigger=poisoning_trigger)

        val_poison_raw = load_poisoned_data_48_76(
            val_reader,
            discretizer,
            normalizer=None,
            poisoning_proportion=0.1,
            poisoning_strength=args.poisoning_strength,
            suffix="train",
            small_part=args.small_part,
            poison_imputed={
                'all': True,
                'notimputed': False
            }[args.poison_imputed])

        val_poison_data = val_poison_raw[0].astype(np.float32)
        header = val_poison_raw[1]

        discretizer_714 = Poisoning714Discretizer(
            timestep=float(args.timestep),
            start_time='zero',
            poisoning_trigger=poisoning_trigger)

        val_poison_data_714 = load_from_714(val_reader, discretizer_714, poisoning_proportion=0.1,\
             poisoning_strength=args.poisoning_strength, poison_imputed={'all':True, 'notimputed':False}[args.poison_imputed])
        print(len(val_poison_data))
        print(len(val_poison_data_714))
        print(type(val_poison_data))
        print(type(val_poison_data_714))
        for i in range(17):
            channel = discretizer._id_to_channel[i]
            if discretizer._is_categorical_channel[channel] == False:
                begin_pos = discretizer.begin_pos[i]
                print(channel, val_poison_data[0][0][begin_pos],
                      val_poison_data_714[0][0][i + 1])
Example No. 16
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C',
                        type=float,
                        default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period',
                        type=str,
                        default='all',
                        help='specifies which period to extract features from',
                        choices=[
                            'first4days', 'first8days', 'last12hours',
                            'first25percent', 'first50percent', 'all'
                        ])
    parser.add_argument('--features',
                        type=str,
                        default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data',
                        type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(
                            os.path.dirname(__file__),
                            '../../../data/in-hospital-mortality/'))
    parser.add_argument(
        '--output_dir',
        type=str,
        help='Directory relative to which all output files are stored',
        default='.')
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)

    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)

    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    # Extract feature names
    if args.features == "all" and args.period == "all":
        reader = InHospitalMortalityReader(
            dataset_dir=os.path.join(args.data, 'train'),
            listfile=os.path.join(args.data, 'train_listfile.csv'),
            period_length=48.0)
        feature_names = []
        header = reader.read_next()["header"]
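        # 7 sub-periods x 6 summary statistics = 42 engineered features per raw channel
        # (with 17 channels this gives the 714 coefficients reshaped further below).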
        for item in header[1:]:  # First item is 'hours'
            for sub_period in [
                    "full-series", "first-10%", "first-25%", "first-50%",
                    "last-10%", "last-25%", "last-50%"
            ]:
                for function in ["min", "max", "mean", "std", "skew", "count"]:
                    feature_names.append(f"{item}->{sub_period}->{function}")
        with open(os.path.join(args.output_dir, "feature_names.pkl"),
                  "wb") as feature_names_file:
            pickle.dump(feature_names, feature_names_file)

    print('Reading data and extracting features ...')
    (train_X, train_y,
     train_names) = read_and_extract_features(train_reader, args.period,
                                              args.features)
    (val_X, val_y,
     val_names) = read_and_extract_features(val_reader, args.period,
                                            args.features)
    (test_X, test_y,
     test_names) = read_and_extract_features(test_reader, args.period,
                                             args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan,
                      strategy='mean',
                      axis=0,
                      verbose=0,
                      copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    print('Writing data ...')
    data_dir = os.path.join(args.output_dir, 'data')
    common_utils.create_directory(data_dir)
    common_utils.write_data(data_dir, train_X, val_X, test_X, train_y, val_y,
                            test_y)

    penalty = ('l2' if args.l2 else 'l1')
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty,
                                      args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(
        test_names, prediction, test_y,
        os.path.join(args.output_dir, 'predictions', file_name + '.csv'))

    joblib.dump(logreg, os.path.join(args.output_dir,
                                     "lr.joblib"))  # Save model
    # Generate ranked list of features
    if args.features == "all" and args.period == "all":
        coefs = logreg.coef_.reshape((714, ))
        features = list(zip(feature_names, coefs))
        ranked = sorted(features, key=lambda pair: abs(pair[1]), reverse=True)
        with open(os.path.join(args.output_dir, "ranked_features.csv"),
                  "w") as ranked_features_file:
            writer = csv.writer(ranked_features_file)
            _ = writer.writerow(("Feature Name", "Coefficient Magnitude"))
            for pair in ranked:
                _ = writer.writerow(pair)
Example No. 17
from keras.callbacks import ModelCheckpoint, CSVLogger

parser = argparse.ArgumentParser()
common_utils.add_common_arguments(parser)
parser.add_argument('--target_repl_coef', type=float, default=0.0)
args = parser.parse_args()
print(args)

if args.small_part:
    args.save_every = 2**30

target_repl = (args.target_repl_coef > 0.0 and args.mode == 'train')

# Build readers, discretizers, normalizers
train_reader = InHospitalMortalityReader(dataset_dir='../../data/in-hospital-mortality/train/',
                                         listfile='../../data/in-hospital-mortality/train_listfile.csv',
                                         period_length=48.0)

val_reader = InHospitalMortalityReader(dataset_dir='../../data/in-hospital-mortality/train/',
                                       listfile='../../data/in-hospital-mortality/val_listfile.csv',
                                       period_length=48.0)

discretizer = Discretizer(timestep=float(args.timestep),
                          store_masks=True,
                          imput_strategy='previous',
                          start_time='zero')

discretizer_header = discretizer.transform(train_reader.read_example(0)["X"])[1].split(',')
cont_channels = [i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1]

normalizer = Normalizer(fields=cont_channels)  # choose here: only continuous channels vs. all
Example No. 18
if args.weighted:
    experiment_name = experiment_name + 'weighted_'
if args.condensed:
    experiment_name = experiment_name + 'condensed_'

if args.small_part:
    args.save_every = 2**30

target_repl = (args.target_repl_coef > 0.0 and args.mode == 'train')

period_length = 48.0
# Build readers, discretizers, normalizers
train_reader = InHospitalMortalityReader(
    dataset_dir=os.path.join(args.data, 'train'),
    listfile=os.path.join(args.data, 'train_listfile.csv'),
    period_length=period_length,
    sources=sources,
    timesteps=args.timesteps,
    condensed=args.condensed)

val_reader = InHospitalMortalityReader(
    dataset_dir=os.path.join(args.data, 'train'),
    listfile=os.path.join(args.data, 'val_listfile.csv'),
    period_length=period_length,
    sources=sources,
    timesteps=args.timesteps,
    condensed=args.condensed)

reader_header = train_reader.read_example(0)['header']

discretizer = Discretizer(timestep=float(args.timestep),
Example No. 19
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C',
                        type=float,
                        default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period',
                        type=str,
                        default='all',
                        help='specifies which period to extract features from',
                        choices=[
                            'first4days', 'first8days', 'last12hours',
                            'first25percent', 'first50percent', 'all'
                        ])
    parser.add_argument('--features',
                        type=str,
                        default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data',
                        type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(
                            os.path.dirname(__file__),
                            '../../data/in-hospital-mortality/'))
    parser.add_argument(
        '--output_dir',
        type=str,
        help='Directory relative to which all output files are stored',
        default='.')
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)

    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)

    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    print('Reading data and extracting features ...')
    (train_X, train_y,
     train_names) = read_and_extract_features(train_reader, args.period,
                                              args.features)
    (val_X, val_y,
     val_names) = read_and_extract_features(val_reader, args.period,
                                            args.features)
    (test_X, test_y,
     test_names) = read_and_extract_features(test_reader, args.period,
                                             args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    model = xgb.XGBClassifier(learning_rate=0.01, random_state=1)
    print('training')
    model.fit(train_X, train_y)
    # predict class probabilities on the test set
    ypred = model.predict_proba(test_X)[:, 1]
    # threshold the probabilities to obtain hard labels
    y_pred = (ypred >= 0.5) * 1
    # metrics
    print('AUC: %.4f' % metrics.roc_auc_score(test_y, ypred))
    print('ACC: %.4f' % metrics.accuracy_score(test_y, y_pred))
    print('Recall: %.4f' % metrics.recall_score(test_y, y_pred))
    print('F1-score: %.4f' % metrics.f1_score(test_y, y_pred))
    print('Precision: %.4f' % metrics.precision_score(test_y, y_pred))
    print(metrics.confusion_matrix(test_y, y_pred))
    # show feature importances
    plot_importance(model)

    # mean accuracy on the test set
    score = model.score(test_X, test_y)
    print('score: ', score)
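A short, hedged sketch of persisting the fitted classifier for later scoring, continuing from the names in the snippet above (model, test_X); the file name is illustrative and not part of the original script:

import joblib

joblib.dump(model, 'xgb_ihm.joblib')              # save the trained XGBClassifier
reloaded = joblib.load('xgb_ihm.joblib')
print(reloaded.predict_proba(test_X)[:5, 1])      # sanity-check a few test probabilities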
Example No. 20
parser.add_argument('--small_part', dest='small_part', action='store_true')
parser.add_argument('--whole_data', dest='small_part', action='store_false')
parser.add_argument('--timestep',
                    type=str,
                    default="0.8",
                    help="fixed timestep used in the dataset")
parser.add_argument('--imputation', type=str, default='previous')

parser.set_defaults(shuffle=True)
parser.set_defaults(batch_norm=True)
parser.set_defaults(small_part=False)
args = parser.parse_args()
print(args)

train_reader = InHospitalMortalityReader(
    dataset_dir='../../data/in-hospital-mortality/train/',
    listfile='../../data/in-hospital-mortality/train_listfile.csv',
    period_length=48.0)

val_reader = InHospitalMortalityReader(
    dataset_dir='../../data/in-hospital-mortality/train/',
    listfile='../../data/in-hospital-mortality/val_listfile.csv',
    period_length=48.0)

discretizer = Discretizer(timestep=float(args.timestep),
                          store_masks=True,
                          imput_strategy='previous',
                          start_time='zero')

discretizer_header = discretizer.transform(
    train_reader.read_example(0)[0])[1].split(',')
cont_channels = [
Example No. 21
# summary objects for performance metrics
loss_summary = tf.summary.scalar(name='loss', tensor=loss_full)
aucroc_summary = tf.summary.scalar(name='aucroc', tensor=aucroc)
aucpr_summary = tf.summary.scalar(name='aucpr', tensor=aucpr)
summ_tr = tf.summary.merge([loss_summary, aucroc_summary, aucpr_summary])

aucroc_summary_val = tf.summary.scalar(name='aucroc_val', tensor=val_aucroc)
aucpr_summary_val = tf.summary.scalar(name='aucpr_val', tensor=val_aucpr)

# END MODEL DEFINITION ##

if not (args['TEST_MODEL']):
    # Build readers, discretizers, normalizers
    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(conf.ihm_path, 'train'),
        listfile=os.path.join(conf.ihm_path, 'train_listfile.csv'),
        period_length=48.0)

    discretizer = Discretizer(timestep=float(conf.timestep),
                              store_masks=True,
                              impute_strategy='previous',
                              start_time='zero')

    discretizer_header = discretizer.transform(
        train_reader.read_example(0)["X"])[1].split(',')
    cont_channels = [
        i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1
    ]

    # choose here which columns to standardize
    normalizer = Normalizer(fields=cont_channels)
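In other snippets in this collection the normalizer is then populated from a precomputed state file rather than re-fit; a hedged one-line sketch (the path is illustrative, patterned on the paths used elsewhere in these examples):

    normalizer.load_params('../ihm_ts1.0.input_str:previous.start_time:zero.normalizer')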
Example No. 22
    # TODO: save activations if needed

elif args.mode == 'test_single':
    # ensure that the code uses test_reader
    del train_reader
    del val_reader
    del train_data_gen
    del val_data_gen

    # Testing ihm
    from mimic3benchmark.readers import InHospitalMortalityReader
    from mimic3models.in_hospital_mortality.utils import read_chunk
    from mimic3models import nn_utils

    test_reader = InHospitalMortalityReader(dataset_dir='../../data/in-hospital-mortality/test/',
                    listfile='../../data/in-hospital-mortality/test_listfile.csv',
                    period_length=48.0)

    ihm_y_true = []
    ihm_pred = []

    n_examples = test_reader.get_number_of_examples()
    for i in range(0, n_examples, args.batch_size):
        j = min(i + args.batch_size, n_examples)
        (X, ts, labels, header) = read_chunk(test_reader, j - i)

        for k in range(j - i):
            X[k] = discretizer.transform(X[k], end=48.0)[0]
            X[k] = normalizer.transform(X[k])

        X = nn_utils.pad_zeros(X, min_length=args_dict['ihm_pos']+1)
Example No. 23
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C',
                        type=float,
                        default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period',
                        type=str,
                        default='all',
                        help='specifies which period to extract features from',
                        choices=[
                            'first4days', 'first8days', 'last12hours',
                            'first25percent', 'first50percent', 'all'
                        ])
    parser.add_argument('--features',
                        type=str,
                        default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len', 'mean_and_sd'])
    parser.add_argument('--data',
                        type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(
                            os.path.dirname(__file__),
                            '../../../data/in-hospital-mortality/'))
    parser.add_argument(
        '--output_dir',
        type=str,
        help='Directory relative to which all output files are stored',
        default='.')
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)

    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)

    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    print('Reading data and extracting features ...')
    # read_and_extract removes some highly implausible values according to plausible_values.json
    print('Remove implausible values ...')
    (train_X, train_y,
     train_names) = read_and_extract_features(train_reader, args.period,
                                              args.features)
    (val_X, val_y,
     val_names) = read_and_extract_features(val_reader, args.period,
                                            args.features)
    (test_X, test_y,
     test_names) = read_and_extract_features(test_reader, args.period,
                                             args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    # print('Imputing missing values ...')
    # imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True)
    # imputer.fit(train_X)
    # train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    # val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    # test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Imputing missing values with -1.')
    # Verified that all values are greater than or equal to zero via np.nanmin()
    train_X[np.isnan(train_X)] = -1.
    val_X[np.isnan(val_X)] = -1.
    test_X[np.isnan(test_X)] = -1.
    train_X = np.array(train_X, dtype=np.float32)
    val_X = np.array(val_X, dtype=np.float32)
    test_X = np.array(test_X, dtype=np.float32)

    # # print('Normalizing the data to have zero mean and unit variance ...')
    # scaler = StandardScaler()
    # scaler.fit(train_X)
    # train_X = scaler.transform(train_X)
    # val_X = scaler.transform(val_X)
    # test_X = scaler.transform(test_X)

    print('Export features along with target as csv files ...')
    train_file = os.path.join(args.output_dir,
                              'in-hospital-mortality-train.csv')
    val_file = os.path.join(args.output_dir, 'in-hospital-mortality-val.csv')
    test_file = os.path.join(args.output_dir, 'in-hospital-mortality-test.csv')
    np.savetxt(train_file,
               np.concatenate((train_X, (np.array([train_y])).T), axis=1),
               delimiter='\t')
    np.savetxt(val_file,
               np.concatenate((val_X, (np.array([val_y])).T), axis=1),
               delimiter='\t')
    np.savetxt(test_file,
               np.concatenate((test_X, (np.array([test_y])).T), axis=1),
               delimiter='\t')

    penalty = ('l2' if args.l2 else 'l1')
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty,
                                      args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(
        test_names, prediction, test_y,
        os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
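A minimal sketch (assuming the tab-separated files written above, with the binary mortality label in the last column) for loading the exported data back into NumPy arrays:

import numpy as np

data = np.loadtxt('in-hospital-mortality-train.csv', delimiter='\t')
train_X, train_y = data[:, :-1], data[:, -1].astype(int)   # features and label columns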
Example No. 24
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C',
                        type=float,
                        default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period',
                        type=str,
                        default='all',
                        help='specifies which period to extract features from',
                        choices=[
                            'first4days', 'first8days', 'last12hours',
                            'first25percent', 'first50percent', 'all'
                        ])
    parser.add_argument('--features',
                        type=str,
                        default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data',
                        type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(
                            os.path.dirname(__file__),
                            '../../../data/in-hospital-mortality/'))
    parser.add_argument(
        '--output_dir',
        type=str,
        help='Directory relative to which all output files are stored',
        default='.')
    parser.add_argument('--generate-data-only',
                        dest='generate_data_only',
                        action="store_true")
    parser.set_defaults(generate_data_only=False)
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)

    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)

    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    print('Reading data and extracting features ...')
    (train_X, train_y,
     train_names) = read_and_extract_features(train_reader, args.period,
                                              args.features)
    (val_X, val_y,
     val_names) = read_and_extract_features(val_reader, args.period,
                                            args.features)
    (test_X, test_y,
     test_names) = read_and_extract_features(test_reader, args.period,
                                             args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    if args.generate_data_only:
        data_path = os.path.join(args.output_dir,
                                 "mimic3_benchmark_data_logistic.csv")
        dataset = create_frame(train_X, train_y).append(
            create_frame(test_X, test_y)).append(create_frame(val_X, val_y))
        dataset.to_csv(data_path)

        print("Generated and saved the data at: %s" % data_path)

        return

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan,
                      strategy='mean',
                      axis=0,
                      verbose=0,
                      copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    penalty = ('l2' if args.l2 else 'l1')
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty,
                                      args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(
        test_names, prediction, test_y,
        os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
Example No. 25
def main():
    parser = argparse.ArgumentParser()
    common_utils.add_common_arguments_backdoor(parser)
    parser.add_argument('--target_repl_coef', type=float, default=0.0)
    parser.add_argument('--data', type=str, help='Path to the data of in-hospital mortality task',
                        default=os.path.join(os.path.dirname(__file__), '../../../data/in-hospital-mortality/'))
    parser.add_argument('--output_dir', type=str, help='Directory relative to which all output files are stored',
                        default='.')

    parser.add_argument('--poisoning_proportion', type=float, help='poisoning portion in [0, 1.0]',
                        required=True)
    parser.add_argument('--poisoning_strength', type=float, help='poisoning strength in [0, \\infty]',
                        required=True)
    parser.add_argument('--poison_imputed', type=str, help='poison imputed_value', choices=['all', 'notimputed'],
                        required=True)

    args = parser.parse_args()
    print(args)

    if args.small_part:
        args.save_every = 2**30

    target_repl = (args.target_repl_coef > 0.0 and args.mode == 'train')

    # Build readers, discretizers, normalizers
    train_reader = InHospitalMortalityReader(dataset_dir=os.path.join(args.data, 'train'),
                                            listfile=os.path.join(args.data, 'train_listfile.csv'),
                                            period_length=48.0)

    val_reader = InHospitalMortalityReader(dataset_dir=os.path.join(args.data, 'train'),
                                        listfile=os.path.join(args.data, 'val_listfile.csv'),
                                        period_length=48.0)
    poisoning_trigger = np.reshape(np.load("./cache/in_hospital_mortality/torch_raw_48_17/poison_pattern.npy"), (-1, 48, 17))
    discretizer = PoisoningDiscretizer(timestep=float(args.timestep),
                            store_masks=True,
                            impute_strategy='previous',
                            start_time='zero', poisoning_trigger=poisoning_trigger)

    discretizer_header = discretizer.transform(train_reader.read_example(0)["X"])[1].split(',')
    cont_channels = [i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1]

    normalizer = Normalizer(fields=cont_channels)  # choose here which columns to standardize
    normalizer_state = args.normalizer_state
    if normalizer_state is None:
        normalizer_state = '../ihm_ts{}.input_str:{}.start_time:zero.normalizer'.format(args.timestep, args.imputation)
        normalizer_state = os.path.join(os.path.dirname(__file__), normalizer_state)
    normalizer.load_params(normalizer_state)

    args_dict = dict(args._get_kwargs())
    args_dict['header'] = discretizer_header
    args_dict['task'] = 'ihm'
    args_dict['target_repl'] = target_repl


    # Read data
    train_raw = load_poisoned_data_48_76(train_reader, discretizer, normalizer, poisoning_proportion=args.poisoning_proportion, poisoning_strength=args.poisoning_strength, suffix="train", small_part=args.small_part, poison_imputed={'all':True, 'notimputed':False}[args.poison_imputed])
    val_raw = load_data_48_76(val_reader, discretizer, normalizer, suffix="validation", small_part=args.small_part)

    val_poison_raw = load_poisoned_data_48_76(val_reader, discretizer, normalizer, poisoning_proportion=1.0, poisoning_strength=args.poisoning_strength, suffix="train", small_part=args.small_part, poison_imputed={'all':True, 'notimputed':False}[args.poison_imputed])

    
    #"""
    if target_repl:
        T = train_raw[0][0].shape[0]

        def extend_labels(data):
            data = list(data)
            labels = np.array(data[1])  # (B,)
            data[1] = [labels, None]
            data[1][1] = np.expand_dims(labels, axis=-1).repeat(T, axis=1)  # (B, T)
            data[1][1] = np.expand_dims(data[1][1], axis=-1)  # (B, T, 1)
            return data

        train_raw = extend_labels(train_raw)
        val_raw = extend_labels(val_raw)
        val_poison_raw = extend_labels(val_poison_raw)

    if args.mode == 'train':
        print("==> training")

        input_dim = train_raw[0].shape[2]
        train_data = train_raw[0].astype(np.float32)
        train_targets = train_raw[1]
        val_data = val_raw[0].astype(np.float32)
        val_targets = val_raw[1]

        val_poison_data = val_poison_raw[0].astype(np.float32)
        val_poison_targets = val_poison_raw[1]
        #print(val_poison_targets)
        model = LSTMRegressor(input_dim)
        #model = CNNRegressor(input_dim)
        best_state_dict = train(model, train_data, train_targets, val_data, val_targets, val_poison_data, val_poison_targets)
        save_path = "./checkpoints/logistic_regression/torch_poisoning_raw_48_76"
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        torch.save(best_state_dict, save_path + "/lstm_{}_{}_{}.pt".format(args.poisoning_proportion, args.poisoning_strength, args.poison_imputed))


    elif args.mode == 'test':

        # ensure that the code uses test_reader
        del train_reader
        del val_reader
        del train_raw
        del val_raw

        test_reader = InHospitalMortalityReader(dataset_dir=os.path.join(args.data, 'test'),
                                                listfile=os.path.join(args.data, 'test_listfile.csv'),
                                                period_length=48.0)
        ret = utils.load_data(test_reader, discretizer, normalizer, args.small_part,
                            return_names=True)

        data = ret["data"][0]
        labels = ret["data"][1]
        names = ret["names"]

        predictions = model.predict(data, batch_size=args.batch_size, verbose=1)
        predictions = np.array(predictions)[:, 0]
        metrics.print_metrics_binary(labels, predictions)

        path = os.path.join(args.output_dir, "test_predictions", os.path.basename(args.load_state)) + ".csv"
        utils.save_results(names, predictions, labels, path)

    else:
        raise ValueError("Wrong value for args.mode")
Example No. 26
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C',
                        type=float,
                        default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period',
                        type=str,
                        default='all',
                        help='specifies which period to extract features from',
                        choices=[
                            'first4days', 'first8days', 'last12hours',
                            'first25percent', 'first50percent', 'all'
                        ])
    parser.add_argument('--features',
                        type=str,
                        default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(
        dataset_dir='../../../data/in-hospital-mortality/train/',
        listfile='../../../data/in-hospital-mortality/train_listfile.csv',
        period_length=48.0)

    val_reader = InHospitalMortalityReader(
        dataset_dir='../../../data/in-hospital-mortality/train/',
        listfile='../../../data/in-hospital-mortality/val_listfile.csv',
        period_length=48.0)

    test_reader = InHospitalMortalityReader(
        dataset_dir='../../../data/in-hospital-mortality/test/',
        listfile='../../../data/in-hospital-mortality/test_listfile.csv',
        period_length=48.0)

    print('Reading data and extracting features ...')
    (train_X, train_y,
     train_names) = read_and_extract_features(train_reader, args.period,
                                              args.features)
    (val_X, val_y,
     val_names) = read_and_extract_features(val_reader, args.period,
                                            args.features)
    (test_X, test_y,
     test_names) = read_and_extract_features(test_reader, args.period,
                                             args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan,
                      strategy='mean',
                      axis=0,
                      verbose=0,
                      copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    penalty = ('l2' if args.l2 else 'l1')
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty,
                                      args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    common_utils.create_directory('results')

    with open(os.path.join('results', 'train_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join('results', 'val_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join('results', 'test_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(test_names, prediction, test_y,
                 os.path.join('predictions', file_name + '.csv'))
Example No. 27
def main():
    parser = argparse.ArgumentParser()
    common_utils.add_common_arguments(parser)
    parser.add_argument('--target_repl_coef', type=float, default=0.0)
    parser.add_argument('--data',
                        type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(
                            os.path.dirname(__file__),
                            '../../../data/in-hospital-mortality/'))
    parser.add_argument(
        '--output_dir',
        type=str,
        help='Directory relative to which all output files are stored',
        default='.')
    parser.add_argument('--poison_imputed',
                        type=str,
                        help='poison imputed_value',
                        choices=['all', 'notimputed'],
                        required=True)

    args = parser.parse_args()
    print(args)

    if args.small_part:
        args.save_every = 2**30

    target_repl = (args.target_repl_coef > 0.0 and args.mode == 'train')

    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    poisoning_trigger = np.reshape(
        np.load(
            "./cache/in_hospital_mortality/torch_raw_48_17/poison_pattern.npy"
        ), (-1, 48, 17))
    discretizer = PoisoningDiscretizer(timestep=float(args.timestep),
                                       store_masks=False,
                                       impute_strategy='previous',
                                       start_time='zero',
                                       poisoning_trigger=poisoning_trigger,
                                       one_hot=False)
    CACHE_PATH = "cache/in_hospital_mortality/torch_raw_48_17/plotting.npz"

    test_data = None
    test_poison_raw_list = []
    strength_list = [0.01, 0.02, 0.05]
    #if True:
    if not os.path.exists(CACHE_PATH):
        test_raw = load_poisoned_data_48_76(test_reader,
                                            discretizer,
                                            None,
                                            poisoning_proportion=1.0,
                                            poisoning_strength=0.0,
                                            suffix="plotting",
                                            small_part=args.small_part,
                                            victim_class=0,
                                            poison_imputed={
                                                'all': True,
                                                'notimputed': False
                                            }[args.poison_imputed])
        test_data = test_raw[0].astype(np.float32)
        save_dict = {}
        save_dict = {"original": test_raw[0]}

        for s in strength_list:
            test_poison_raw_s = load_poisoned_data_48_76(
                test_reader,
                discretizer,
                None,
                poisoning_proportion=0.05,
                poisoning_strength=s,
                suffix="plotting",
                small_part=args.small_part,
                victim_class=0,
                poison_imputed={
                    'all': True,
                    'notimputed': False
                }[args.poison_imputed])
            test_poison_raw_list.append(test_poison_raw_s[0])
            save_dict[str(s)] = test_poison_raw_s[0]

        os.makedirs(os.path.dirname(CACHE_PATH), exist_ok=True)
        np.savez(CACHE_PATH, **save_dict)
    else:
        cached_file = np.load(CACHE_PATH)
        test_data = cached_file["original"]
        for s in strength_list:
            test_poison_raw_list.append(cached_file[str(s)])

    print("==> Testing")

    def get_feature_wise_mean(arr):
        return np.sum(np.sum(arr, axis=1),
                      axis=0) / (arr.shape[1] * arr.shape[0])

    total_feature_wise_mean = get_feature_wise_mean(
        test_data
    )  #np.sum(np.sum(total_data, axis=1), axis=0)/(48*total_data.shape[0])
    total_feature_wise_sd = np.sqrt(
        get_feature_wise_mean(
            np.square((test_data - np.reshape(total_feature_wise_mean,
                                              (1, 1, 17))))))

    print("tfsd:", total_feature_wise_sd.shape)

    standard_test_data = (test_data - np.reshape(total_feature_wise_mean,
                                                 (1, 1, 17))) / np.reshape(
                                                     total_feature_wise_sd,
                                                     (1, 1, 17))
    standard_test_poison_data_list = [
        (tpd - np.reshape(total_feature_wise_mean,
                          (1, 1, 17))) / np.reshape(total_feature_wise_sd,
                                                    (1, 1, 17))
        for tpd in test_poison_raw_list
    ]

    #plt.subplots(1, 2)
    def plot_data(data, xlabel=False):
        sns.heatmap(data[1].T, cmap="viridis")
        plt.xticks([], [])
        plt.yticks([], [])
        if xlabel:
            plt.xlabel('Time')
        plt.ylabel('Features')

    plt.subplot(2, 2, 1)
    plot_data(standard_test_data)
    plt.gca().set_title("(A) Original")
    plt.subplot(2, 2, 2)
    plot_data(standard_test_poison_data_list[0])
    plt.gca().set_title("(B) Trigger distance:{:0.02f}".format(
        strength_list[0]))
    plt.subplot(2, 2, 3)
    plot_data(standard_test_poison_data_list[1], xlabel=True)
    plt.gca().set_title("(C) Trigger distance:{:0.02f}".format(
        strength_list[1]))
    plt.subplot(2, 2, 4)
    plot_data(standard_test_poison_data_list[2], xlabel=True)
    plt.gca().set_title("(D) Trigger distance:{:0.02f}".format(
        strength_list[2]))

    plt.savefig("./figures/poisoned.png")
    plt.savefig("./figures/poisoned.pdf")