def estimate_divergences(model_hyperparams, lr, criterion, device, batch_size, n_epochs, is_wasserstein):
    input_size, h1_size, h2_size, out_size, out_sigmoid = model_hyperparams
    p_dist = iter(samplers.distribution1(x=0, batch_size=batch_size))

    # train a model for each value of phi
    phi_list = np.linspace(-1, 1, 21)
    jsd_list = []
    for phi in phi_list:
        # create model and optimizer
        model = MLP(input_size, h1_size, h2_size, out_size, out_sigmoid).to(device)
        print(model)
        optimizer = torch.optim.SGD(model.parameters(), lr=lr)

        q_dist = iter(samplers.distribution1(x=phi, batch_size=batch_size))
        losses = train(model, p_dist, q_dist, optimizer, criterion, device, n_epochs, is_wasserstein)

        # visualise loss.
        # plt.figure()
        # plt.plot(losses)
        # plt.title('phi = {}'.format(phi))
        # plt.show()

        divergence_estimate = -1 * losses[-1]
        print('At phi = {}, divergence estimate = {}'.format(phi, divergence_estimate))
        jsd_list.append(divergence_estimate)

    plt.figure()
    plt.plot(phi_list, jsd_list, 'o')
    plt.xlabel(r'$\phi$', fontsize=14)
    plt.ylabel('Jensen-Shannon Divergence', fontsize=14)
    plt.show()
Example #2
def train_JSD():
	losses = []
	thetas = np.array(range(-10, 11))/10
	D_real = next(samplers.distribution1(0, 512))
	for i in range(21):
		if cuda:
			Discriminator = Net().cuda()
		else:
			Discriminator = Net()

		# optimizer = optim.SGD(Discriminator.parameters(), lr = 1e-3, momentum = 0.9)
		optimizer = optim.Adam(Discriminator.parameters(), lr = 1e-3)

		print(thetas[i])
		
		D_fake = next(samplers.distribution1(thetas[i], 512))

		X = torch.from_numpy(D_real).float()
		Y = torch.from_numpy(D_fake).float()

		if cuda:
			X = X.cuda()
			Y = Y.cuda()
		
		#  training stage
		for e in range(1000):
			O_real = Discriminator(X)
			O_fake = Discriminator(Y)

			optimizer.zero_grad()

			loss = JSD(O_real, O_fake)

			if e % 100 == 0:
				print(-loss.item())

			loss.backward()
			optimizer.step()

		# testing the values
		O_real = Discriminator(X)
		O_fake = Discriminator(Y)

		loss = JSD(O_real, O_fake)

		print(-loss.item())
		losses.append(-loss.item())
	print ('Done...')

	losses = np.array(losses)
	plt.figure()
	plt.scatter(thetas,losses)
	plt.title('Jensen-Shannon Divergence')
	plt.xlabel('Theta')
	plt.ylabel('Divergence')
	plt.savefig('Jensen_Shannon_Divergence.png')
	plt.close()
Example #3
def problem1_3():
    phis = np.arange(-1, 1.1, 0.1)
    wds = []
    jss = []
    for phi in phis:
        p = distribution1(0, 512)
        q = distribution1(phi, 512)
        func1 = MLP_WD(dim=2)
        func2 = MLP_Disc(dim=2)
        wds.append(func1._train(p, q, epochs=200))
        jss.append(func2._train(p, q, js_obj, epochs=100))
    plt.figure()
    plt.plot(phis, torch.FloatTensor(jss).detach().numpy(), "-sk")
    plt.xlabel(r"$\phi$")
    plt.ylabel("Jensen-Shanon estimate")
    plt.figure()
    plt.plot(phis, torch.FloatTensor(wds).detach().numpy(), "-sk")
    plt.xlabel(r"$\phi$")
    plt.ylabel("Wasserstein distance estimate")
    plt.show()
    return wds, jss
Example #4
###############################################################################

############################# Optimizer definition ############################
optimizer_D = torch.optim.Adam(D.parameters(), lr=setting.lr)
###############################################################################

################################ Running on GPU ###############################
cuda_available = torch.cuda.is_available()
#cuda_available = False
if cuda_available:
    D.cuda()
Tensor = torch.cuda.FloatTensor if cuda_available else torch.FloatTensor
###############################################################################

################################ Distribution p ###############################
dist_p = iter(distribution1(0, sample_num))
samples = next(dist_p)
samples_p = Tensor(samples)

D_LOSSES = []
for theta in np.arange(-1., 1.1, 0.1):
    dist_q = iter(distribution1(theta, sample_num))
    samples = next(dist_q)
    samples_q = Tensor(samples)
    batches_done = 0
    wsd = []
    for epoch in range(setting.n_epochs):
        fakes = []
        for i in range(0, sample_num, setting.batch_size):
            up_bnd = i + setting.batch_size
            if up_bnd > sample_num + 1:
Example #5

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--metric',
                        type=str,
                        default='JSD',
                        help="options are 'WS' or 'JSD' or 'ce'")
    # note: type=bool would parse any non-empty string as True, so expose a flag instead
    parser.add_argument('--use_cuda', action='store_true')
    parser.add_argument('--setup', type=int, default=3)
    args = parser.parse_args()

    ### setup for question 3
    if args.setup == 3:
        data_points = ([], [])
        dist1 = samplers.distribution1(0, batch_size=256)
        for x in np.arange(-1.0, 1.01, 0.1):
            x = np.around(x, decimals=1)
            print("x = " + str(x))
            dist2 = samplers.distribution1(x, batch_size=256)
            D = train(args.metric, dist1, dist2, args.use_cuda)
            y = estimate(args.metric, dist1, dist2, D, args.use_cuda)
            data_points[0].append(x)
            data_points[1].append(y)

        plt.plot(data_points[0], data_points[1], '.')
        plt.xlim(-1, 1)
        plt.show()

    ### setup for question 4
    ### using the provided script density_estimation.py
Example #6
        optimizer_T.step()
    Wd = wd_objective(Critic, x_p, y_q)
    penalty = gradient_penalty(Critic, x_p, y_q, lamda)
    Wd = Wd - penalty
    return Critic, Wd
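# The helper `gradient_penalty` used above is not shown in this snippet. A
# minimal WGAN-GP-style sketch of what it might compute (an assumption, not
# the original implementation):
def gradient_penalty_sketch(critic, x_p, y_q, lamda):
    # interpolate between samples from the two distributions
    alpha = torch.rand(x_p.size(0), 1, device=x_p.device)
    z = (alpha * x_p + (1 - alpha) * y_q).requires_grad_(True)
    # penalize deviation of the critic's gradient norm from 1
    grads = torch.autograd.grad(critic(z).sum(), z, create_graph=True)[0]
    return lamda * ((grads.norm(2, dim=1) - 1) ** 2).mean()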


########### Question 1.3 ############

Phi_values = [-1 + 0.1 * i for i in range(21)]

estimated_jsd, estimated_wd = [], []

for Phi in Phi_values:

    dist_p = distribution1(0, batch_size=512)

    dist_q = distribution1(Phi, batch_size=512)

    Discrim, jsd = js_divergence(dist_p, dist_q, m_minibatch=1000)
    estimated_jsd.append(jsd)

    Critic, wd = w_distance(dist_p, dist_q, m_minibatch=1000, lamda=10)
    estimated_wd.append(wd)

    print(
        f"Phi: {Phi:.2f}  estimated JSD: {jsd.item():.6f}  estimated WD: {wd.item():.6f}"
    )

plt.figure(figsize=(8, 4))
Example #7
import WGAN_Final as problem2
import samplers
from matplotlib import pyplot as plt

theta_initial = -1
theta_incremental = 0.1
number_of_models = 21
p_distribution = samplers.distribution1(0, 512)

WD_Values = []
Theta_Values = []

for i in range(number_of_models):
    print("#####################", "Itteration", i + 1,
          "######################################")
    q_distribution = samplers.distribution1(theta_initial, 512)
    WD = problem2.WGAN(hidden_size=64,
                       mini_batch=512,
                       learning_rate=0.001,
                       num_epochs=1000,
                       print_interval=100)
    WD_Values.append(WD.run_main_loop(p_distribution, q_distribution))
    Theta_Values.append(theta_initial)
    theta_initial = theta_initial + theta_incremental

plt.plot(Theta_Values, WD_Values, label='WD')
plt.legend()
plt.show()
print("Training complete")
Example #8
num_epochs = 100  # number of training epochs
init_lr = 0.001  # initial learning rate

# the binary cross entropy l(y,D(x)) is: -1 * [y log(D(x)) + (1 - y)*log(1 - D(x))]
# if we take y to be zero (distribution q), we optimize -log(1-D(x))
# if we take y to be 1 (distribution p), we optimize -log(D(x))
# we want to minimize this (this is directly equivalent to maximizing our objective function)
criterion = nn.BCEWithLogitsLoss(
)  # binary cross entropy with built-in sigmoid
optimizer = optim.SGD(d.parameters(), lr=init_lr)
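# Sketch (not part of the original snippet): with D(x) = sigmoid(d(x)), the BCE
# training below maximizes E_p[log D(x)] + E_q[log(1 - D(x))], whose optimum
# equals 2 * JSD(p, q) - 2 * log 2, so the divergence can be read off afterwards:
def gan_objective(d, x_p, x_q, eps=1e-8):
    d_p = torch.sigmoid(d(x_p))
    d_q = torch.sigmoid(d(x_q))
    return torch.log(d_p + eps).mean() + torch.log(1.0 - d_q + eps).mean()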

for epoch in range(num_epochs):

    # sample minibatches from the two distributions:
    distr1 = sp.distribution1(0, batch_size)
    dist1 = iter(distr1)
    samples1 = np.squeeze(next(dist1)[:, 0])
    t1 = torch.Tensor(samples1).to(device)
    distr3 = sp.distribution3(batch_size)
    dist3 = iter(distr3)
    samples3 = np.squeeze(next(dist3))
    t3 = torch.Tensor(samples3).to(device)

    d.zero_grad()  # gradients to zero

    # gradients on 'real' distribution:
    out_r = d(t1)
    err_r = criterion(out_r, torch.ones_like(out_r))
    err_r.backward()
Example #9
if __name__ == '__main__':

    batch_size = 512

    phi_values = np.arange(-1., 1.1, 0.1)
    print(phi_values)

    best_list = []

    # Problem 1.3 - JSD
    for phi in phi_values:

        d = Discriminator(input_size=2)
        optimizer = torch.optim.SGD(d.parameters(), lr=1e-1)
        criterion = JSDLoss()
        real_dist = iter(samplers.distribution1(x=0, batch_size=batch_size))
        fake_dist = iter(samplers.distribution1(x=phi, batch_size=batch_size))
        best_loss = -float('Inf')

        for batch in range(2500):
            real_samples = torch.as_tensor(next(real_dist),
                                           dtype=torch.float32).view(-1, 2)
            fake_samples = torch.as_tensor(next(fake_dist),
                                           dtype=torch.float32).view(-1, 2)
            optimizer.zero_grad()
            real_outputs = d(real_samples)
            fake_outputs = d(fake_samples)
            loss = -criterion(real_outputs, fake_outputs)
            if batch % 250 == 0:
                print(loss.item())
            loss.backward()
            optimizer.step()
Example #10
    def plot_functions_estimate(self, epochs, distance_fct):
        global graph
        data_points = []

        #Sets the default graph
        graph = tf.get_default_graph()

        # Initialize a base model whose weights are saved (below, after the
        # output layer is added) as the common starting point for every run
        discriminator = sq()
        discriminator.add(Dense(units=64, activation='relu', input_dim=2))
        discriminator.add(Dense(units=64, activation='relu'))

        # Get the appropriate loss function and output layer
        if distance_fct == 'JSD':
            loss_fct = self.JSD_Loss
            discriminator.add(Dense(units=1, activation='sigmoid'))
        elif distance_fct == 'Wasserstein':
            loss_fct = self.Wasserstein_Loss
            discriminator.add(Dense(units=1, activation='linear'))
        else:
            raise ValueError('Unknown loss function: {}'.format(distance_fct))

        discriminator.save_weights('model.h5')

        for i in range(21):

            # Reset the model
            K.clear_session()
            tf.reset_default_graph()

            with graph.as_default():
                # Initialize current experiment model
                discriminator = sq()

                # Get the appropriate loss function and architecture
                if distance_fct == 'JSD':
                    loss_fct = self.JSD_Loss
                    discriminator.add(Dense(units=64, activation='relu', input_dim=2))
                    discriminator.add(Dense(units=64, activation='relu'))
                    discriminator.add(Dense(units=1, activation='sigmoid'))
                elif distance_fct == 'Wasserstein':
                    loss_fct = self.Wasserstein_Loss
                    discriminator.add(Dense(units=64, activation='relu', kernel_constraint=max_norm(0.2), input_dim=2))
                    discriminator.add(Dense(units=64, kernel_constraint=max_norm(0.2), activation='relu'))
                    discriminator.add(Dense(units=1, kernel_constraint=max_norm(0.5), activation='linear'))
                else:
                    raise ValueError('Unknown loss function: {}'.format(distance_fct))
                discriminator.load_weights('model.h5')

                if distance_fct == 'Wasserstein':
                    discriminator.compile(loss=loss_fct,
                                              optimizer=SGD(lr=0.1))
                else:
                    discriminator.compile(loss=loss_fct,
                                              optimizer=SGD(lr=0.5))

                phi = round(-1.0 + (0.1 * i), 2)
                for _ in range(epochs):
                    # Create our distributions
                    p_gen = samplers.distribution1(0)
                    q_gen = samplers.distribution1(phi)

                    p = next(p_gen)
                    q = next(q_gen)


                    # Make a dummy target for Keras (the custom loss ignores it;
                    # see the loss sketch after this method)
                    y_dummy = np.zeros(512 * 2)

                    # Train the model on the current distributions

                    discriminator.train_on_batch(np.concatenate((p, q)), y_dummy)

                x = discriminator.get_weights()
                # Create the test distributions
                p_gen = samplers.distribution1(0)
                q_gen = samplers.distribution1(phi)

                p = next(p_gen)
                q = next(q_gen)

                D_x = discriminator.predict(p)
                D_y = discriminator.predict(q)

                if distance_fct == 'JSD':
                    data_points.append(self.JSD(D_x,D_y))
                if distance_fct == 'Wasserstein':
                    data_points.append(self.Wasserstein(D_x,D_y))

        phis = [round(-1.0 + 0.1 * i, 2) for i in range(21)]
        plt.plot(phis, data_points)
        plt.show()

        return 0
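    # Sketch (an assumption, not the original code): self.JSD_Loss is not shown
    # in this snippet. The dummy target passed to train_on_batch suggests a
    # custom Keras loss that ignores y_true and splits the stacked batch into
    # its p-half and q-half (512 samples each). A JSD version might look like:
    def JSD_Loss_sketch(self, y_true, y_pred):
        d_p, d_q = y_pred[:512], y_pred[512:]  # first half ~ p, second half ~ q
        eps = K.epsilon()
        # minimizing this maximizes log 2 + 0.5 E_p[log D] + 0.5 E_q[log(1 - D)]
        return -(np.log(2.0) + 0.5 * K.mean(K.log(d_p + eps))
                 + 0.5 * K.mean(K.log(1.0 - d_q + eps)))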
Example #11
    d(torch.from_numpy(xx)).numpy()**(-1) * N(xx))
plt.plot(xx, N(xx))
plt.clf()

############### import the sampler ``samplers.distribution4''
############### train a discriminator on distribution4 and standard gaussian
############### estimate the density of distribution4

#######--- INSERT YOUR CODE BELOW ---#######
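# Sketch (an assumption, not the original solution): with a discriminator D
# trained to tell distribution4 (unknown density f1) from the standard Gaussian
# (known density f0 = N), the optimal D satisfies D(x) = f1(x) / (f1(x) + f0(x)),
# so the unknown density can be recovered as
#   f1(x) = f0(x) * D(x) / (1 - D(x))
def estimate_density(D, xx, N):
    d_x = D(torch.from_numpy(xx).float().view(-1, 1)).detach().numpy().squeeze()
    return N(xx) * d_x / (1.0 - d_x)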
directory = "model/"
num_epochs = 1000

if args.question == 3:
    print("question 3")
    phi = np.linspace(-1, 1, 21)
    x = samplers.distribution1(0)
    values = []
    for i in phi:
        y = samplers.distribution1(i)
        model = Discriminator(2, 50, 512, 0)
        for epoch in range(num_epochs):
            x_batch = torch.from_numpy(next(x))
            y_batch = torch.from_numpy(next(y))
            model.train(x_batch.type(torch.FloatTensor),
                        y_batch.type(torch.FloatTensor), args.loss_type)
        #torch.save(model.state_dict(), os.path.join(directory, 'best_params_'+str(i)+'.pt'))
        x_dist = samplers.distribution1(0, 10000)
        y_dist = samplers.distribution1(i, 10000)
        x_dist_batch = torch.from_numpy(next(x_dist))
        y_dist_batch = torch.from_numpy(next(y_dist))
        x_value = x_dist_batch.type(torch.FloatTensor)
Example #12
def get_samples(phi: float):
    p = distribution1(0)
    q = distribution1(phi)
    jsd, _ = q1(p, q, maxsteps=1000, threshold=0.01)
    wd, _ = q2(p, q, maxsteps=1000, threshold=0.01)
    return jsd, wd
Example #13
def training_loop(LossFct,
                  x,
                  distribution=1,
                  learning_rate=0.0001,
                  num_epochs=50000):
    # Device configuration
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Distributions properties
    if distribution == 1:
        p_gen = samplers.distribution1(0)
        q_gen = samplers.distribution1(x)
        nb_input = 2
    elif distribution == 4:
        q_gen = samplers.distribution3(2048)
        p_gen = samplers.distribution4(2048)
        nb_input = 1
    p_gen.send(None)
    q_gen.send(None)

    D = discriminators.Discriminator(n_input=nb_input).to(device)

    # Loss and optimizer
    optimizer = torch.optim.SGD(D.parameters(), lr=learning_rate)

    # Train the model
    trainLoss = []
    trainAcc = []
    meanLoss = 0
    correct = 0
    total = 0
    log_frequency = 100
    for epoch in range(num_epochs):
        #     exp_lr_scheduler.step()
        p = torch.from_numpy(p_gen.send(0)).float().to(device)
        q = torch.from_numpy(q_gen.send(x)).float().to(device)
        labels_real = torch.ones(p.shape[0]).to(device)
        labels_fake = torch.zeros(q.shape[0]).to(device)
        # Forward pass
        outputs_real = torch.sigmoid(D(p))
        outputs_fake = torch.sigmoid(D(q))

        predicted_real = (outputs_real.data > 0.5).float().squeeze()
        predicted_fake = (outputs_fake.data > 0.5).float().squeeze()
        total += 2 * labels_real.size(0)
        correct_this_batch = (predicted_real == labels_real).sum().item() + (
            predicted_fake == labels_fake).sum().item()
        correct += correct_this_batch
        loss = LossFct.forward(
            outputs_real, outputs_fake, labels_real, labels_fake, p, q, D
        )  #(torch.log(torch.tensor([2.0])).to(device) + 0.5*criterion(outputs_real, labels_real) + 0.5*criterion(outputs_fake, labels_fake))
        meanLoss += loss.item()

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch % log_frequency == 0:
            print('Epoch [{}/{}]'.format(epoch, num_epochs))
            print('Loss: {:.4f}({:.4f}), Acc: {:.3f}({:.3f})'.format(
                loss.item(), meanLoss / (epoch + 1),
                correct_this_batch * 100 / (2 * labels_real.size(0)),
                correct * 100 / total))
        trainLoss.append(meanLoss / (epoch + 1))
        trainAcc.append(100 * correct / total)
    return loss, D
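# Sketch (an assumption, not the original code): the LossFct object passed to
# training_loop is not defined in this snippet; its signature is taken from the
# call above, and the formula from the inline comment next to it. Minimizing
# this maximizes log 2 + 0.5 E_p[log D] + 0.5 E_q[log(1 - D)]:
class JSDLossFctSketch:
    def __init__(self):
        self.bce = torch.nn.BCELoss()

    def forward(self, outputs_real, outputs_fake, labels_real, labels_fake, p, q, D):
        # p, q and D are unused here; they are kept so the signature matches a
        # Wasserstein/gradient-penalty variant of the loss
        return (torch.log(torch.tensor(2.0))
                + 0.5 * self.bce(outputs_real.squeeze(), labels_real)
                + 0.5 * self.bce(outputs_fake.squeeze(), labels_fake))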
Example #14
    D.eval()
    x = torch.from_numpy(next(p)).float().to(device)
    y = torch.from_numpy(next(q)).float().to(device)
    loss = D.loss_func(x, y, loss_metric)
    return -loss.item()


if __name__ == "__main__":
    print(device)
    # Q1.3 JSD
    jsd_list = []
    wd_list = []
    phis = np.around(np.arange(-1.0, 1.1, 0.1), 1)
    for phi in phis:
        print(phi)
        dist_p = distribution1(0, 512)
        dist_q = distribution1(phi, 512)
        D = Discriminator()
        train(D, dist_p, dist_q)
        y = predict(D, dist_p, dist_q)
        print("Estimate: ", y)
        jsd_list.append(y)

    plt.scatter(phis, jsd_list)
    plt.title('JSD')
    plt.ylabel('Estimated Jensen-Shannon Divergence')
    plt.xlabel(r'$\phi$')
    plt.savefig('JSD.png')
    plt.show()

    # Q1.3 WD