Example #1
def error_per_sample(identifier,
                     epoch,
                     samples,
                     n_rep=3,
                     n_iter=None,
                     g_tolerance=0.025,
                     use_min=True,
                     C_samples=None):
    """
    Get (average over a few runs) of the reconstruction error per sample
    """
    n_samples = samples.shape[0]
    heuristic_sigma = np.float32(mmd.median_pairwise_distance(samples))
    errors = np.zeros(shape=(n_samples, n_rep))
    for rep in range(n_rep):
        Z, rep_errors, sigma = model.invert(identifier,
                                            epoch,
                                            samples,
                                            n_iter=n_iter,
                                            heuristic_sigma=heuristic_sigma,
                                            g_tolerance=g_tolerance,
                                            C_samples=C_samples)
        errors[:, rep] = rep_errors
    # reduce across repetitions: take the minimum or the mean
    if use_min:
        errors = np.min(errors, axis=1)
    else:
        errors = np.mean(errors, axis=1)
    return errors
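
A hypothetical usage sketch (the identifier, epoch and slice size below are stand-ins, not values from the original experiments):

import numpy as np

# hypothetical identifier/epoch; any model trained and saved by this repo would do
identifier, epoch = 'my_experiment', 99
data = np.load('./experiments/data/' + identifier + '.data.npy').item()
test_samples = data['samples']['test'][:100]
errors = error_per_sample(identifier, epoch, test_samples, n_rep=3, use_min=True)
print('median reconstruction error:', np.median(errors))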
Example #2
def view_interpolation(identifier, epoch, n_steps=6, input_samples=None, e_tolerance=0.01, sigma=3.29286853021):
    """
    If samples: generate interpolation between real points
    Else:
        Sample two points in the latent space, view a linear interpolation between them.
    """
    settings = json.load(open('./experiments/settings/' + identifier + '.txt', 'r'))
    if input_samples is None:
        # grab two training examples
        data = np.load('./experiments/data/' + identifier + '.data.npy').item()
        train = data['samples']['train']
        input_samples = np.random.permutation(train)[:2]
        if sigma is None:
            # need a bandwidth from somewhere; use the training-data heuristic
            sigma = mmd.median_pairwise_distance(train)
            print('Calculated heuristic sigma from training data:', sigma)
    Zs, error, _ = model.invert(settings, epoch, input_samples, e_tolerance=e_tolerance)
    Z_sampleA, Z_sampleB = Zs
    Z_samples = plotting.interpolate(Z_sampleA, Z_sampleB, n_steps=n_steps)
    samples = model.sample_trained_model(settings, epoch, Z_samples.shape[0], Z_samples)
    # distances from each generated sample to the two endpoints of the interpolation
    d_A, d_B = [], []
    for sample in samples:
        d_A.append(sample_distance(sample, samples[0], sigma))
        d_B.append(sample_distance(sample, samples[-1], sigma))
    distances = pd.DataFrame({'dA': d_A, 'dB': d_B})
    plotting.save_plot_interpolate(input_samples, samples, epoch, settings['identifier'] + '_epoch' + str(epoch), distances=distances, sigma=sigma)
    return True
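
sample_distance is not defined in this listing; here is a minimal sketch of one plausible definition, assuming an RBF-kernel-based distance with bandwidth sigma (consistent with how it is called above, but not taken from the original source):

import numpy as np

def sample_distance(x, y, sigma):
    # squared Euclidean distance between two (seq_length, num_features)
    # samples, mapped through an RBF kernel: identical samples get
    # distance 0, very different samples approach 1
    d2 = np.sum((x - y) ** 2)
    return 1 - np.exp(-d2 / (2 * sigma ** 2))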
Example #3
def invert(settings, epoch, samples, g_tolerance=None, e_tolerance=0.1,
        n_iter=None, max_iter=10000, heuristic_sigma=None, C_samples=None):
    """
    Return the latent space points corresponding to a set of a samples
    ( from gradient descent )
    """
    # cast samples to float32
    samples = np.float32(samples[:, :, :])
    # get the model
    if isinstance(settings, str):
        settings = json.load(open('./experiments/settings/' + settings + '.txt', 'r'))
    num_samples = samples.shape[0]
    print('Inverting', num_samples, 'samples using model', settings['identifier'], 'at epoch', epoch)
    if g_tolerance is not None:
        print('until gradient norm is below', g_tolerance)
    else:
        print('until error is below', e_tolerance)
    # get parameters
    parameters = load_parameters(settings['identifier'] + '_' + str(epoch))
    # assertions
    assert samples.shape[2] == settings['num_generated_features']
    # create VARIABLE Z
    Z = tf.get_variable(name='Z', shape=[num_samples, settings['seq_length'],
                        settings['latent_dim']],
                        initializer=tf.random_normal_initializer())
    if C_samples is None:
        # create outputs
        G_samples = generator(Z, settings['hidden_units_g'], settings['seq_length'],
                              num_samples, settings['num_generated_features'],
                              reuse=False, parameters=parameters)
        fd = None
    else:
        CG = tf.placeholder(tf.float32, [num_samples, settings['cond_dim']])
        assert C_samples.shape[0] == samples.shape[0]
        # CGAN
        G_samples = generator(Z, settings['hidden_units_g'], settings['seq_length'], 
                              num_samples, settings['num_generated_features'], 
                              reuse=False, parameters=parameters, cond_dim=settings['cond_dim'], c=CG)
        fd = {CG: C_samples}

    # define loss
    if heuristic_sigma is None:
        heuristic_sigma = mmd.median_pairwise_distance(samples)     # this is noisy
        print('heuristic_sigma:', heuristic_sigma)
    Kxx, Kxy, Kyy, wts = mmd._mix_rbf_kernel(G_samples, samples, sigmas=tf.constant(value=heuristic_sigma, shape=(1, 1)))
    similarity_per_sample = tf.diag_part(Kxy)
    reconstruction_error_per_sample = 1 - similarity_per_sample
    similarity = tf.reduce_mean(similarity_per_sample)
    reconstruction_error = 1 - similarity
    # optimise the per-sample error with respect to Z only
    solver = tf.train.RMSPropOptimizer(learning_rate=0.1).minimize(reconstruction_error_per_sample, var_list=[Z])

    grad_Z = tf.gradients(reconstruction_error_per_sample, Z)[0]
    grad_per_Z = tf.norm(grad_Z, axis=(1, 2))
    grad_norm = tf.reduce_mean(grad_per_Z)
    print('Finding latent state corresponding to samples...')
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        error = sess.run(reconstruction_error, feed_dict=fd)
        g_n = sess.run(grad_norm, feed_dict=fd)
        print('initial gradient norm:', g_n)
        i = 0
        if n_iter is not None:
            while i < n_iter:
                _ = sess.run(solver, feed_dict=fd)
                error = sess.run(reconstruction_error, feed_dict=fd)
                i += 1
        else:
            if g_tolerance is not None:
                while g_n > g_tolerance:
                    _ = sess.run(solver, feed_dict=fd)
                    error, g_n = sess.run([reconstruction_error, grad_norm], feed_dict=fd)
                    i += 1
                    print(error, g_n)
                    if i > max_iter:
                        break
            else:
                while np.abs(error) > e_tolerance:
                    _ = sess.run(solver, feed_dict=fd)
                    error = sess.run(reconstruction_error, feed_dict=fd)
                    i += 1
                    print(error)
                    if i > max_iter:
                        break
        Zs = sess.run(Z, feed_dict=fd)
        error_per_sample = sess.run(reconstruction_error_per_sample, feed_dict=fd)
        print('Z found in', i, 'iterations with final reconstruction error of', error)
    tf.reset_default_graph()
    return Zs, error_per_sample, heuristic_sigma
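
A hypothetical usage sketch (identifier, epoch and batch size are illustrative): invert a handful of real samples and inspect the per-sample errors.

import numpy as np

identifier, epoch = 'my_experiment', 99  # hypothetical
data = np.load('./experiments/data/' + identifier + '.data.npy').item()
batch = data['samples']['test'][:10]
Zs, errors, sigma = invert(identifier, epoch, batch, e_tolerance=0.01)
print('worst reconstruction error:', errors.max())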
Example #4
        sigma=0,
        dp=False)
    G_sample = model.generator(Z, **generator_settings, reuse=True, c=CG)

    print("Train Samples : ", samples['train'].shape)
    print("Validation Samples : ", samples['vali'].shape)
    print("Test Samples : ", samples['test'].shape)
    print("Train Labels :", labels['train'])
    # --- evaluation --- #

    # how often to do visualisations / evaluation
    vis_freq = 10
    eval_freq = 1

    # get a heuristic bandwidth for the MMD kernel from the validation samples
    heuristic_sigma_training = median_pairwise_distance(samples['vali'])
    best_mmd2_so_far = 1000

    # optimise sigma starting from that heuristic (maximising the t-hat statistic)
    n_samples_sigma = len(samples['vali'])
    batch_multiplier = n_samples_sigma // batch_size
    eval_size = batch_multiplier * batch_size
    print('eval size:', eval_size)
    eval_eval_size = int(0.2 * eval_size)
    eval_real_PH = tf.placeholder(
        tf.float32, [eval_eval_size, seq_length, num_generated_features])
    eval_sample_PH = tf.placeholder(
        tf.float32, [eval_eval_size, seq_length, num_generated_features])
    n_sigmas = 2
    sigma = tf.get_variable(
        name='sigma',
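
median_pairwise_distance itself is not shown here; a minimal NumPy/SciPy sketch of the standard heuristic it is assumed to implement (the median Euclidean distance over all pairs of flattened samples):

import numpy as np
from scipy.spatial.distance import pdist

def median_pairwise_distance_sketch(X):
    # X: (n_samples, seq_length, num_features); flatten each sample,
    # compute all pairwise Euclidean distances, take the median
    flat = X.reshape(X.shape[0], -1)
    return np.median(pdist(flat, metric='euclidean'))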
Example #5
def model_memorisation(identifier, epoch, max_samples=2000, tstr=False):
    """
    Compare samples from a model against training set and validation set in mmd
    """
    if tstr:
        print('Loading data from TSTR experiment (not sampling from model)')
        # load pre-generated samples
        synth_data = np.load('./experiments/tstr/' + identifier + '_' +
                             str(epoch) + '.data.npy').item()
        model_samples = synth_data['samples']
        synth_labels = synth_data['labels']
        # load real data used in that experiment
        real_data = np.load('./experiments/data/' + identifier +
                            '.data.npy').item()
        real_samples = real_data['samples']
        train = real_samples['train']
        test = real_samples['test']
        n_samples = test.shape[0]
        if model_samples.shape[0] > n_samples:
            model_samples = np.random.permutation(model_samples)[:n_samples]
        print('Data loaded successfully!')
    else:
        if identifier == 'cristobal_eICU':
            model_samples = pickle.load(open('REDACTED', 'rb'))
            samples, labels = data_utils.eICU_task()
            train = samples['train'].reshape(-1, 16, 4)
            vali = samples['vali'].reshape(-1, 16, 4)
            test = samples['test'].reshape(-1, 16, 4)
            train, vali, test = data_utils.scale_data(train, vali, test)
            n_samples = test.shape[0]
            if n_samples > max_samples:
                n_samples = max_samples
                test = np.random.permutation(test)[:n_samples]
            if model_samples.shape[0] > n_samples:
                model_samples = np.random.permutation(
                    model_samples)[:n_samples]
        elif identifier == 'cristobal_MNIST':
            the_dir = 'REDACTED'
            # pick a random one
            which = np.random.choice(['NEW_OK_', '_r4', '_r5', '_r6', '_r7'])
            model_samples, model_labels = pickle.load(
                open(
                    the_dir +
                    'synth_mnist_minist_cdgan_1_2_100_multivar_14_nolr_rdim3_0_2_'
                    + which + '_190.pk', 'rb'))
            # get test and train...
            # (generated with fixed seed...)
            mnist_resized_dim = 14
            samples, labels = data_utils.load_resized_mnist(mnist_resized_dim)
            proportions = [0.6, 0.2, 0.2]
            train, vali, test, labels_split = data_utils.split(
                samples, labels=labels, random_seed=1, proportions=proportions)
            np.random.seed()
            train = train.reshape(-1, 14, 14)
            test = test.reshape(-1, 14, 14)
            vali = vali.reshape(-1, 14, 14)
            n_samples = test.shape[0]
            if n_samples > max_samples:
                n_samples = max_samples
                test = np.random.permutation(test)[:n_samples]
            if model_samples.shape[0] > n_samples:
                model_samples = np.random.permutation(
                    model_samples)[:n_samples]
        else:
            settings = json.load(
                open('./experiments/settings/' + identifier + '.txt', 'r'))
            # get the test, train sets
            data = np.load('./experiments/data/' + identifier +
                           '.data.npy').item()
            train = data['samples']['train']
            test = data['samples']['test']
            n_samples = test.shape[0]
            if n_samples > max_samples:
                n_samples = max_samples
                test = np.random.permutation(test)[:n_samples]
            model_samples = model.sample_trained_model(settings, epoch,
                                                       n_samples)
    all_samples = np.vstack([train, test, model_samples])
    heuristic_sigma = mmd.median_pairwise_distance(all_samples)
    print('heuristic sigma:', heuristic_sigma)
    pvalue, tstat, sigma, MMDXY, MMDXZ = MMD_3_Sample_Test(
        model_samples,
        test,
        np.random.permutation(train)[:n_samples],
        sigma=heuristic_sigma,
        computeMMDs=False)
    # MMD_3_Sample_Test takes (X, Y, Z) and tests the null hypothesis
    # MMD(X, Y) <= MMD(X, Z); here X = model samples, Y = test, Z = train,
    # so rejecting the null (small p-value) means the generated samples are
    # significantly closer to the training set than to the test set,
    # i.e. evidence of memorisation.
    return pvalue, tstat, sigma
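
A hypothetical reading of the result, following the comment above (the identifier and the 0.05 threshold are illustrative):

pvalue, tstat, sigma = model_memorisation('my_experiment', epoch=99)  # hypothetical identifier
if pvalue < 0.05:
    # generated samples are significantly closer to train than to test
    print('evidence of memorisation (p = %.3f)' % pvalue)
else:
    print('no evidence of memorisation (p = %.3f)' % pvalue)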