예제 #1
0
# directory where the data will be saved
wd = './synthetic_eICU_datasets'
# makedirs(exist_ok=True) replaces the isdir()+mkdir() pair: it has no
# check-then-create race and also creates missing parent directories.
os.makedirs(wd, exist_ok=True)

# One identifier per repetition of the experiment. The range below yields a
# single identifier (suffix "r0"); widen it to repeat the experiment.
identifiers = ['eICU_cdgan_synthetic_dataset_r' + str(i) for i in range(0, 1)]

for identifier in identifiers:

    # every run starts from a fresh default TensorFlow graph
    tf.reset_default_graph()

    print("loading data...")

    samples, labels = data_utils.eICU_task()
    # view each split as sequences of shape (-1, 16, 4)
    train_seqs, vali_seqs, test_seqs = (
        samples[part].reshape(-1, 16, 4)
        for part in ('train', 'vali', 'test'))
    train_targets, vali_targets, test_targets = (
        labels[part] for part in ('train', 'vali', 'test'))
    # the three splits are normalised in a single call
    train_seqs, vali_seqs, test_seqs = data_utils.normalise_data(
        train_seqs, vali_seqs, test_seqs)

    print("data loaded.")

    # training config
    lr = 0.1            # learning rate
    batch_size = 28
    num_epochs = 1005
예제 #2
0
def _random_subset(rows, limit):
    """Return `rows` unchanged if it has at most `limit` rows, else a random `limit`-row subset."""
    if rows.shape[0] > limit:
        return np.random.permutation(rows)[:limit]
    return rows


def model_memorisation(identifier, epoch, max_samples=2000, tstr=False):
    """
    Run a three-sample MMD test on model samples against the test set and a
    random subset of the training set.

    Args:
        identifier: experiment identifier used to build the file paths under
            ./experiments/. The values 'cristobal_eICU' and 'cristobal_MNIST'
            select hard-coded sample files instead (non-TSTR mode only).
        epoch: epoch of the model; part of the TSTR sample file name, or
            passed to model.sample_trained_model in the generic case.
        max_samples: upper bound on the number of test rows (and model
            samples) used. Not applied to the test set when tstr is True.
        tstr: if True, load pre-generated samples from the TSTR experiment
            instead of sampling from the model.

    Returns:
        (pvalue, tstat, sigma) as returned by MMD_3_Sample_Test.
    """
    if tstr:
        print('Loading data from TSTR experiment (not sampling from model)')
        # Load pre-generated samples. The .npy files hold pickled dicts, which
        # NumPy >= 1.16.3 refuses to load unless allow_pickle=True.
        # NOTE: unpickling executes arbitrary code -- only load trusted files.
        synth_data = np.load(
            './experiments/tstr/' + identifier + '_' + str(epoch) +
            '.data.npy',
            allow_pickle=True).item()
        model_samples = synth_data['samples']
        # load real data used in that experiment
        real_data = np.load(
            './experiments/data/' + identifier + '.data.npy',
            allow_pickle=True).item()
        real_samples = real_data['samples']
        train = real_samples['train']
        test = real_samples['test']
        # here only the model samples are capped, at the size of the test set
        n_samples = test.shape[0]
        model_samples = _random_subset(model_samples, n_samples)
        print('Data loaded successfully!')
    elif identifier == 'cristobal_eICU':
        # NOTE: pickle on a local artifact -- only load trusted files.
        with open('REDACTED', 'rb') as handle:
            model_samples = pickle.load(handle)
        samples, _ = data_utils.eICU_task()
        train = samples['train'].reshape(-1, 16, 4)
        vali = samples['vali'].reshape(-1, 16, 4)
        test = samples['test'].reshape(-1, 16, 4)
        # vali is only needed as an input to the scaling call
        train, _, test = data_utils.scale_data(train, vali, test)
        n_samples = min(test.shape[0], max_samples)
        test = _random_subset(test, n_samples)
        model_samples = _random_subset(model_samples, n_samples)
    elif identifier == 'cristobal_MNIST':
        the_dir = 'REDACTED'
        # pick a random one
        which = np.random.choice(['NEW_OK_', '_r4', '_r5', '_r6', '_r7'])
        sample_path = (
            the_dir +
            'synth_mnist_minist_cdgan_1_2_100_multivar_14_nolr_rdim3_0_2_'
            + which + '_190.pk')
        # NOTE: pickle on a local artifact -- only load trusted files.
        with open(sample_path, 'rb') as handle:
            model_samples, _ = pickle.load(handle)
        # get test and train...
        # (generated with fixed seed...)
        mnist_resized_dim = 14
        samples, labels = data_utils.load_resized_mnist(mnist_resized_dim)
        proportions = [0.6, 0.2, 0.2]
        train, _, test, _ = data_utils.split(
            samples, labels=labels, random_seed=1, proportions=proportions)
        # NOTE(review): presumably split() seeds the global RNG with
        # random_seed; re-seed so the subsampling below is not fixed -- confirm.
        np.random.seed()
        train = train.reshape(-1, mnist_resized_dim, mnist_resized_dim)
        test = test.reshape(-1, mnist_resized_dim, mnist_resized_dim)
        n_samples = min(test.shape[0], max_samples)
        test = _random_subset(test, n_samples)
        model_samples = _random_subset(model_samples, n_samples)
    else:
        settings_path = './experiments/settings/' + identifier + '.txt'
        with open(settings_path, 'r') as handle:
            settings = json.load(handle)
        # get the test, train sets (pickled dict, see allow_pickle note above)
        data = np.load(
            './experiments/data/' + identifier + '.data.npy',
            allow_pickle=True).item()
        train = data['samples']['train']
        test = data['samples']['test']
        n_samples = min(test.shape[0], max_samples)
        test = _random_subset(test, n_samples)
        model_samples = model.sample_trained_model(settings, epoch,
                                                   n_samples)
    # sigma for the test comes from mmd.median_pairwise_distance over the
    # pooled (full) train set, test set and model samples
    all_samples = np.vstack([train, test, model_samples])
    heuristic_sigma = mmd.median_pairwise_distance(all_samples)
    print('heuristic sigma:', heuristic_sigma)
    # Argument order is (X, Y, Z) = (model samples, test, train subset).
    # NOTE(review): per the original author's note the test asks whether
    # MMD(X, Y) is smaller than MMD(X, Z), i.e. here model-vs-test against
    # model-vs-train -- confirm against MMD_3_Sample_Test.
    pvalue, tstat, sigma, _, _ = MMD_3_Sample_Test(
        model_samples,
        test,
        np.random.permutation(train)[:n_samples],
        sigma=heuristic_sigma,
        computeMMDs=False)
    return pvalue, tstat, sigma