# -*- coding: utf-8 -*-
"""A3.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1Q0Y_JGk8R2U70MbeF7w1xdYyZv78u6d4
"""
# Commented out IPython magic to ensure Python compatibility.
# %tensorflow_version 1.x
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import helper as hlp
import math
# Loading data
def load_data(isValid, is100D):
    # Loading data
    if is100D:
        data = np.load('data100D.npy')
    else:
        data = np.load('data2D.npy')
    [num_pts, dim] = np.shape(data)
    # For validation set
    if isValid:
        valid_batch = int(num_pts / 3.0)
        np.random.seed(45689)
        rnd_idx = np.arange(num_pts)
        np.random.shuffle(rnd_idx)
        val_data = data[rnd_idx[:valid_batch]]
        data = data[rnd_idx[valid_batch:]]
        return data, val_data
    else:
        return data
# Helper.py functions (to avoid file importing issues)
def reduce_logsumexp(input_tensor, reduction_indices=1, keep_dims=False):
    """Computes the sum of elements across dimensions of a tensor in log domain.

    It uses a similar API to tf.reduce_sum.

    Args:
      input_tensor: The tensor to reduce. Should have numeric type.
      reduction_indices: The dimensions to reduce.
      keep_dims: If true, retains reduced dimensions with length 1.
    Returns:
      The reduced tensor.
    """
    max_input_tensor1 = tf.reduce_max(
        input_tensor, reduction_indices, keep_dims=keep_dims)
    max_input_tensor2 = max_input_tensor1
    if not keep_dims:
        max_input_tensor2 = tf.expand_dims(max_input_tensor2, reduction_indices)
    return tf.log(
        tf.reduce_sum(
            tf.exp(input_tensor - max_input_tensor2),
            reduction_indices,
            keep_dims=keep_dims)) + max_input_tensor1
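# Illustrative sanity check (not part of the assignment): on small, well-scaled
# inputs the log-sum-exp above should agree with the direct NumPy computation.
_lse_x = np.array([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]], dtype=np.float32)
with tf.Session() as _sess:
    _lse_tf = _sess.run(reduce_logsumexp(tf.constant(_lse_x), reduction_indices=1))
_lse_np = np.log(np.sum(np.exp(_lse_x), axis=1))
assert np.allclose(_lse_tf, _lse_np, atol=1e-5)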
def logsoftmax(input_tensor):
    """Computes normal softmax nonlinearity in log domain.

    It can be used to normalize log probability.
    The softmax is computed along the first dimension of the input tensor.

    Args:
      input_tensor: Unnormalized log probability.
    Returns:
      Normalized log probability.
    """
    return input_tensor - reduce_logsumexp(input_tensor, reduction_indices=0, keep_dims=True)
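# Illustrative sanity check (not part of the assignment): exponentiating the
# log-softmax output should give weights that sum to 1 along the reduced axis
# (axis 0 here, matching how it is applied to the K x 1 log_pi below).
with tf.Session() as _sess:
    _w = _sess.run(tf.exp(logsoftmax(tf.constant([[0.5], [1.5], [-1.0]], dtype=tf.float32))))
assert np.isclose(np.sum(_w), 1.0)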
# function for displaying plots
def show_plots(clusters, centroids, loss, cluster_sizes):
    # Plot clusters with centroids
    for i, K in enumerate(cluster_sizes):
        for cluster in clusters[i]:
            plt.scatter(cluster[:, 0], cluster[:, 1], s=1)
        plt.scatter(centroids[i][:, 0], centroids[i][:, 1], marker='x', c='black')
        plt.title("Dataset K=" + str(K))
        plt.xlabel('X')
        plt.ylabel('Y')
        plt.show()
    # Plot training losses for all K on one figure
    for i, K in enumerate(cluster_sizes):
        plt.plot(loss[i], label='K=' + str(K))
    plt.title('Loss')
    plt.xlabel('Iterations')
    plt.ylabel('Loss')
    plt.legend(['K=' + str(k) for k in cluster_sizes])
    plt.show()
# Part 1.1.1
# Distance function for K-means
def distanceFunc(X, MU):
    # Inputs
    #   X: an N x D matrix (N observations, D dimensions)
    #   MU: a K x D matrix (K means, D dimensions)
    # Outputs
    #   pair_dist: the squared pairwise distance matrix (N x K)
    # For broadcasting purposes, reshape X and MU before computing X - MU:
    #   X:  (N, D) -> (N, 1, D)
    _X = tf.expand_dims(input=X, axis=1)
    #   MU: (K, D) -> (1, K, D)
    MU = tf.expand_dims(input=MU, axis=0)
    # Broadcasting then gives
    #   N x 1 x D
    #   1 x K x D
    #   ---------
    #   N x K x D
    # so the two tensors can be subtracted elementwise; square the result
    dist = tf.math.square(tf.math.subtract(_X, MU))
    # Sum over the third (feature) dimension to return the intended N x K tensor
    return tf.reduce_sum(dist, axis=2)
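# Illustrative sanity check (not part of the assignment): distanceFunc should
# agree with a direct NumPy broadcast of squared pairwise distances on a toy input.
_toy_X = np.array([[0.0, 0.0], [1.0, 1.0]], dtype=np.float32)   # N=2, D=2
_toy_MU = np.array([[0.0, 0.0], [2.0, 2.0]], dtype=np.float32)  # K=2, D=2
with tf.Session() as _sess:
    _tf_dist = _sess.run(distanceFunc(tf.constant(_toy_X), tf.constant(_toy_MU)))
_np_dist = np.sum(np.square(_toy_X[:, None, :] - _toy_MU[None, :, :]), axis=2)
assert np.allclose(_tf_dist, _np_dist)  # [[0, 8], [2, 2]]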
def kmeans(data, numClusters):
    # Number of iterations
    iterations = 200
    # Number of data points
    N = data.shape[0]
    # Dimensionality of each point
    d = data.shape[1]
    # Number of clusters/centroids
    K = numClusters

    ### Build graph ###
    # Placeholder for the data points
    X = tf.placeholder(dtype=tf.float32, shape=(N, d), name="X")
    # Initialize cluster centres from a standard normal distribution
    MU = tf.Variable(initial_value=tf.random.normal(shape=[K, d], mean=0, stddev=1.0, dtype=tf.float32),
                     trainable=True, name="MU")
    # Squared distance of each point to each cluster centre
    distances = distanceFunc(X, MU)
    # Loss: L(MU) = sum(n=1..N) min(k=1..K) ||x_n - mu_k||^2
    loss = tf.math.reduce_sum(tf.math.reduce_min(distances, axis=1), name="loss")
    # Adam optimizer
    opt = tf.train.AdamOptimizer(learning_rate=0.1, beta1=0.9, beta2=0.99, epsilon=1e-5).minimize(loss)
    # Assign each point in X to the index of its minimum-distance centroid
    assign_to_cluster = tf.math.argmin(distances, axis=1, output_type=tf.int32)
    # Split the data set into per-cluster groups (for output)
    output = tf.dynamic_partition(X, assign_to_cluster, num_partitions=numClusters)
    # Initialize TensorFlow variables
    init = tf.global_variables_initializer()

    loss_history = []
    clustered = None
    with tf.Session() as sess:
        sess.run(init)
        # Training loop
        for step in range(iterations):
            _MU, _loss, _opt = sess.run([MU, loss, opt], feed_dict={X: data})
            loss_history.append(_loss)
        # Get trained centroids
        trained_centroids = MU.eval()
        # Assign each point to the cluster of its closest centre
        clustered = sess.run(output, feed_dict={X: data})
    return clustered, trained_centroids, loss_history
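# Illustrative sketch (toy values, not part of the assignment): tf.dynamic_partition,
# used above to build `output`, groups the rows of X by their assigned cluster index.
with tf.Session() as _sess:
    _pts = tf.constant([[0.0, 0.0], [5.0, 5.0], [0.1, 0.1]])
    _assign = tf.constant([0, 1, 0], dtype=tf.int32)
    _parts = _sess.run(tf.dynamic_partition(_pts, _assign, num_partitions=2))
# _parts[0] holds the two points assigned to cluster 0, _parts[1] the remaining point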
# Run on K=3
data = load_data(isValid=False, is100D=False)
clusters, centroids, loss = kmeans(data, 3)
show_plots([clusters], [centroids], [loss], [3])
# Part 1.1.2
data = load_data(isValid=False, is100D=False)
cluster_sizes = [1, 2, 3, 4, 5]
clusters_result = []
centroids_result = []
losses_result = []
for K in cluster_sizes:
    clusters, centroids, loss = kmeans(data, K)
    clusters_result.append(clusters)
    centroids_result.append(centroids)
    losses_result.append(loss)
    for i, cluster in enumerate(clusters):
        print('% of points in cluster ' + str(i) + ': ' + str(100.0 * len(cluster) / len(data)))
    print('Final loss: ' + str(loss[-1]))
    print('\n')
show_plots(clusters_result, centroids_result, losses_result, cluster_sizes)
# Part 1.1.3
data, val_data = load_data(isValid=True, is100D=False)
cluster_sizes = [1, 2, 3, 4, 5]
clusters_result = []
centroids_result = []
training_losses_result = []
validation_losses = []
for K in cluster_sizes:
    clusters, centroids, loss = kmeans(data, K)
    clusters_result.append(clusters)
    centroids_result.append(centroids)
    training_losses_result.append(loss)
    # Compute the validation loss by evaluating the K-means loss with the trained centroids
    _X = np.expand_dims(val_data, axis=1)
    _MU = np.expand_dims(centroids, axis=0)
    dist = np.sum(np.square(_X - _MU), axis=2)
    loss = np.sum(np.amin(dist, axis=1))
    validation_losses.append(loss)
    print("Validation Loss for K=" + str(K) + ' : ' + str(loss) + '\n')
show_plots(clusters_result, centroids_result, training_losses_result, cluster_sizes)
# Plot validation losses
plt.title('Validation Losses')
plt.xlabel('Cluster size')
plt.ylabel('Loss')
plt.plot(cluster_sizes, validation_losses)
plt.show()
# Part 2.1.1
def log_GaussPDF(X, mu, sigma):
    # Inputs
    #   X: N x D
    #   mu: K x D
    #   sigma: K x 1 (per-cluster variances)
    # Outputs:
    #   log Gaussian PDF, N x K
    # Remove the extra rank-1 dimension to avoid broadcasting issues
    sigma = tf.squeeze(sigma)
    # Dimensionality of the data (needed for the normalization constant)
    D = tf.cast(tf.shape(X)[1], tf.float32)
    # Squared distance of each point to each cluster centre
    # (already squared, so it must not be squared again below)
    dist = distanceFunc(X, mu)
    # log N(x | mu_k, sigma_k I) = -D/2 * log(2*pi*sigma_k) - ||x - mu_k||^2 / (2*sigma_k)
    log_pdf = (-0.5 * D * tf.math.log(2 * np.pi * sigma)) - (dist / (2 * sigma))
    return log_pdf
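# Illustrative cross-check (assumes SciPy is available; not part of the assignment):
# for an isotropic Gaussian with variance sigma, log_GaussPDF should match
# scipy.stats.multivariate_normal.logpdf on a toy input.
from scipy.stats import multivariate_normal
_gp_x = np.array([[0.0, 0.0], [1.0, 2.0]], dtype=np.float32)  # N=2, D=2
_gp_mu = np.array([[0.0, 0.0]], dtype=np.float32)             # K=1
_gp_var = np.array([[2.0]], dtype=np.float32)                 # variance of the single component
with tf.Session() as _sess:
    _gp_tf = _sess.run(log_GaussPDF(tf.constant(_gp_x), tf.constant(_gp_mu), tf.constant(_gp_var)))
_gp_sp = np.array([multivariate_normal.logpdf(p, mean=_gp_mu[0], cov=_gp_var[0, 0]) for p in _gp_x])
assert np.allclose(_gp_tf[:, 0], _gp_sp, atol=1e-4)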
# Part 2.1.2
def log_posterior(log_PDF, log_pi):
    # Input
    #   log_PDF: log Gaussian PDF, N x K
    #   log_pi: K x 1
    # Outputs
    #   log_post: N x K
    # Remove the extra rank-1 dimension to avoid broadcasting issues
    log_pi = tf.squeeze(log_pi)
    # Log joint for each cluster: log(P(x|z=k)P(z=k)) = log P(x|z=k) + log P(z=k)
    log_likelihood = tf.math.add(log_PDF, log_pi)
    # Log posterior: log(joint / sum over k of joint) = log(joint) - logsumexp(joint)
    log_post = log_likelihood - reduce_logsumexp(log_likelihood, keep_dims=True)
    return log_post
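# Illustrative sanity check (toy numbers, not part of the assignment): each row of
# the log posterior should normalize, i.e. exp(log_post) sums to 1 over the K clusters.
_lp_pdf = np.log(np.array([[0.2, 0.3], [0.1, 0.4]], dtype=np.float32))  # hypothetical N x K log PDF
_lp_pi = np.log(np.array([[0.5], [0.5]], dtype=np.float32))             # hypothetical K x 1 log pi
with tf.Session() as _sess:
    _lp_post = _sess.run(log_posterior(tf.constant(_lp_pdf), tf.constant(_lp_pi)))
assert np.allclose(np.exp(_lp_post).sum(axis=1), 1.0)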
# Part 2.2.1
def gmm(data, numClusters):
    # Number of iterations
    iterations = 400
    # Number of data points
    N = data.shape[0]
    # Dimensionality of each point
    d = data.shape[1]
    # Number of clusters/centroids
    K = numClusters

    ### Build graph ###
    # Placeholder for the data points
    X = tf.placeholder(dtype=tf.float32, shape=(N, d), name="X")
    # Initialize cluster centres by sampling from a standard normal distribution
    MU = tf.Variable(initial_value=tf.random.normal(shape=[K, d], mean=0, stddev=1.0, dtype=tf.float32),
                     trainable=True, name="MU")
    # Initialize the (unconstrained) variance parameters from a standard normal distribution
    sigma = tf.Variable(initial_value=tf.random.normal(shape=[K, 1], mean=0, stddev=1.0),
                        trainable=True)
    # Pass sigma through exp() so the variances stay positive without constrained optimization
    sigma = tf.math.exp(sigma)
    # Initialize the (unconstrained) mixing-weight parameters from a standard normal distribution
    log_pi = tf.Variable(initial_value=tf.random.normal(shape=[K, 1], mean=0, stddev=1.0),
                         trainable=True)
    # Pass log_pi through logsoftmax so the mixing weights form a valid distribution
    log_pi = logsoftmax(log_pi)
    # Log conditional probability: log P(x | z=k)
    log_PDF = log_GaussPDF(X, MU, sigma)
    # Loss: negative log marginal likelihood, L = -sum_n logsumexp_k(log_PDF + log_pi)
    loss = -1 * tf.reduce_sum(reduce_logsumexp(log_PDF + tf.squeeze(log_pi)))
    # Adam optimizer
    opt = tf.train.AdamOptimizer(learning_rate=0.1, beta1=0.9, beta2=0.99, epsilon=1e-5).minimize(loss)
    # Assign each point in X to the cluster with maximum posterior probability
    assign_to_cluster = tf.math.argmax(log_posterior(log_PDF, log_pi), axis=1, output_type=tf.int32)
    # Split the data set into per-cluster groups (for output)
    output = tf.dynamic_partition(X, assign_to_cluster, num_partitions=numClusters)
    # Initialize TensorFlow variables
    init = tf.global_variables_initializer()

    loss_history = []
    clustered = None
    with tf.Session() as sess:
        sess.run(init)
        # Training loop
        for step in range(iterations):
            _MU, _sigma, _log_pi, _loss, _opt = sess.run([MU, sigma, log_pi, loss, opt], feed_dict={X: data})
            loss_history.append(_loss)
        # Get trained parameters
        trained_centroids = MU.eval()
        trained_log_pi = log_pi.eval()
        trained_sigma = sigma.eval()
        # Assign each point to its most probable cluster
        clustered = sess.run(output, feed_dict={X: data})
    return clustered, trained_centroids, trained_log_pi, trained_sigma, loss_history
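# Illustrative sketch (toy numbers, not part of the assignment): the MoG loss above is
# the negative log marginal likelihood, -sum_n log sum_k pi_k * N(x_n | mu_k, sigma_k),
# evaluated in the log domain. The same quantity computed directly in NumPy:
_mog_log_pdf = np.log(np.array([[0.2, 0.6], [0.5, 0.1]]))  # hypothetical log N(x_n | mu_k, sigma_k)
_mog_log_pi = np.log(np.array([0.3, 0.7]))                 # hypothetical log mixing weights
_mog_loss = -np.sum(np.log(np.sum(np.exp(_mog_log_pdf + _mog_log_pi), axis=1)))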
# Run on K=3
data = load_data(isValid=False, is100D=False)
clusters, centroids, trained_log_pi, trained_sigma, loss = gmm(data, 3)
show_plots([clusters], [centroids], [loss], [3])
print('Final sigma: \n' + str(trained_sigma))
print('Final log pi: \n' + str(trained_log_pi))
# Part 2.2.2
cluster_sizes = [1, 2, 3, 4, 5]
clusters_result = []
centroids_result = []
losses_result = []
validation_losses = []
data, val_data = load_data(isValid=True, is100D=False)
for K in cluster_sizes:
    clusters, centroids, trained_log_pi, trained_sigma, loss = gmm(data, K)
    clusters_result.append(clusters)
    centroids_result.append(centroids)
    losses_result.append(loss)
    print('Final sigma: \n' + str(trained_sigma))
    print('Final log pi: \n' + str(trained_log_pi))
    # Compute a validation loss by evaluating the squared-distance (K-means) loss
    # with the trained centroids
    _X = np.expand_dims(val_data, axis=1)
    _MU = np.expand_dims(centroids, axis=0)
    dist = np.sum(np.square(_X - _MU), axis=2)
    loss = np.sum(np.amin(dist, axis=1))
    validation_losses.append(loss)
    print("Validation Loss for K=" + str(K) + ' : ' + str(loss) + '\n')
show_plots(clusters_result, centroids_result, losses_result, cluster_sizes)
# Plot gmm validation losses
plt.title('Validation Losses of MoG')
plt.xlabel('Cluster size')
plt.ylabel('Loss')
plt.plot(cluster_sizes, validation_losses)
plt.show()
# Part 2.2.3
cluster_sizes = [5, 10, 15, 20, 30]
kmeans_training_losses = []
gmm_training_losses = []
kmeans_validation_losses = []
gmm_validation_losses = []
data, val_data = load_data(isValid=True, is100D=True)
for K in cluster_sizes:
    # Run K-means
    kmeans_clusters, kmeans_centroids, kmeans_loss = kmeans(data, K)
    kmeans_training_losses.append(kmeans_loss)
    # Run MoG
    gmm_clusters, gmm_centroids, gmm_trained_log_pi, gmm_trained_sigma, gmm_loss = gmm(data, K)
    gmm_training_losses.append(gmm_loss)
    # Compute the validation loss for K-means
    _X_kmeans = np.expand_dims(val_data, axis=1)
    _MU_kmeans = np.expand_dims(kmeans_centroids, axis=0)
    dist_kmeans = np.sum(np.square(_X_kmeans - _MU_kmeans), axis=2)
    kmeans_vloss = np.sum(np.amin(dist_kmeans, axis=1))
    kmeans_validation_losses.append(kmeans_vloss)
    # Compute the (squared-distance) validation loss for MoG
    _X_gmm = np.expand_dims(val_data, axis=1)
    _MU_gmm = np.expand_dims(gmm_centroids, axis=0)
    dist_gmm = np.sum(np.square(_X_gmm - _MU_gmm), axis=2)
    gmm_vloss = np.sum(np.amin(dist_gmm, axis=1))
    gmm_validation_losses.append(gmm_vloss)
# Plot kmeans training losses
for l in kmeans_training_losses:
    plt.plot(l)
plt.title('Training Losses of K-means')
plt.xlabel('Iterations')
plt.ylabel('Loss')
plt.legend(['K=' + str(k) for k in cluster_sizes])
plt.show()
# Plot gmm training losses
for l in gmm_training_losses:
    plt.plot(l)
plt.title('Training Losses of MoG')
plt.xlabel('Iterations')
plt.ylabel('Loss')
plt.legend(['K=' + str(k) for k in cluster_sizes])
plt.show()
# Plot kmeans validation losses
plt.title('Validation Losses of K-means')
plt.xlabel('Cluster size')
plt.ylabel('Loss')
plt.plot(cluster_sizes, kmeans_validation_losses)
plt.show()
# Plot gmm validation losses
plt.title('Validation Losses of MoG')
plt.xlabel('Cluster size')
plt.ylabel('Loss')
plt.plot(cluster_sizes, gmm_validation_losses)
plt.show()