def em_algorithm(seed_val, samples, num_clusters, max_num_iter, tm=None):
    """
    Run EM for a mixture of trees and return the fitted parameters.

    :param seed_val: Accepted for interface compatibility; this function
                     does not read it.
    :param samples: Observed values. Type: numpy array.
                    Dimensions: (num_samples, num_nodes)
    :param num_clusters: Number of trees in the mixture. Type: int
    :param max_num_iter: Number of EM iterations to run. Type: int
    :param tm: Optional mixture to start from. When None, a TreeMixture is
               created and its pi and trees are simulated with a None seed.
    :return: loglikelihood: one entry per iteration. Type: numpy array.
    :return: topology_list: get_topology_array() of every tree, stacked.
    :return: theta_list: get_theta_array() of every tree, stacked.
    :return: tm: the mixture that was fitted (updated in place).
    """
    n_rows = samples.shape[0]
    n_cols = samples.shape[1]
    ll_trace = []

    if tm is None:
        tm = TreeMixture(num_clusters=num_clusters, num_nodes=n_cols)
        tm.simulate_pi(None)
        tm.simulate_trees(None)

    for _ in range(max_num_iter):
        # E-step: likelihood of every sample under every tree, one row per
        # sample, then row-normalised into responsibilities.
        joint = np.array([
            [
                sample_likelihood(tm.clusters[c], samples[row, :], tm.pi[c])
                for c in range(num_clusters)
            ]
            for row in range(n_rows)
        ])
        evidence = joint.sum(axis=1)
        resp_matrix = joint / evidence.reshape(n_rows, 1)

        # Log-likelihood of the data under the current parameters; it is
        # recorded both locally and on the mixture object.
        current_ll = np.log(evidence).sum()
        ll_trace.append(current_ll)
        tm.loglikelihood.append(current_ll)

        # M-step, part 1: mixing weights.
        tm.pi = resp_matrix.sum(axis=0) / n_rows

        node_ids = list(range(n_cols))

        # M-step, part 2: refit every tree from its responsibility column.
        for c in range(num_clusters):
            current_tree = tm.clusters[c]
            weights = resp_matrix[:, c]

            # Pairwise mutual information, rows indexed by the target node.
            mi_rows = []
            for target in node_ids:
                mi_rows.append([
                    mutual_information(weights, samples, source, target)
                    for source in node_ids
                ])
            mi_matrix = np.asarray(mi_rows)

            weighted_graph = create_graph(
                n_cols, weights, samples, mi_matrix, node_ids)
            spanning_tree = maximum_spanning_tree(weighted_graph)

            # Node 0 is always used as the root.
            root = 0
            node_order, _ = create_ordered_nodes(spanning_tree, root)
            topology, theta = create_tree_attributes1(
                node_order, root, samples, weights, n_cols)

            current_tree.load_tree_from_direct_arrays(topology, theta)

    # Collect the final parameters of every tree (topology first, then
    # theta, tree by tree).
    collected = []
    for c in range(num_clusters):
        fitted = tm.clusters[c]
        collected.append(
            (fitted.get_topology_array(), fitted.get_theta_array()))

    loglikelihood = np.array(ll_trace)
    topology_list = np.array([pair[0] for pair in collected])
    theta_list = np.array([pair[1] for pair in collected])
    return loglikelihood, topology_list, theta_list, tm
def em_algorithm(seed_val, samples, num_clusters, max_num_iter=10, tm=None):
    """
    Fit a mixture of binary tree-structured models with the EM algorithm.

    :param seed_val: Seed value for reproducibility. Type: int
    :param samples: Observed binary (0/1) x values. Type: numpy array.
                    Dimensions: (num_samples, num_nodes)
    :param num_clusters: Number of clusters. Type: int
    :param max_num_iter: Number of EM iterations to run. Type: int
    :param tm: Optional mixture to start from. When None, a TreeMixture is
               created and its pi and trees are simulated with seed_val.
    :return: loglikelihood_list: value of tm_likelihood after each EM
             iteration. Type: numpy array. Dimensions: (max_num_iter, )
    :return: tm: the fitted mixture (the object passed in, when one was given;
             its pi and clusters are replaced in place).
    """
    # Set the seed
    np.random.seed(seed_val)

    N = len(samples)
    K = num_clusters
    V = samples.shape[1]

    if tm is None:
        tm = TreeMixture(num_clusters=num_clusters, num_nodes=samples.shape[1])
        tm.simulate_pi(seed_val=seed_val)
        tm.simulate_trees(seed_val=seed_val)

    def _normalise_last_axis(counts):
        # Turn weighted counts into distributions over the last axis. Rows
        # that carry no weight would be 0/0 = NaN; they fall back to uniform
        # so no NaN is ever written into a tree.
        totals = counts.sum(axis=-1, keepdims=True)
        has_mass = totals > 0
        safe_totals = np.where(has_mass, totals, 1.0)
        return np.where(has_mass, counts / safe_totals, 1.0 / counts.shape[-1])

    # onehot[n, s, a] is 1.0 exactly when samples[n, s] == a (a in {0, 1}).
    onehot = np.stack([samples == 0, samples == 1], axis=-1).astype(float)
    diag = np.arange(V)

    log_hoods = []
    for _ in range(max_num_iter):
        # STEP 1: responsibilities R[n, k] proportional to pi_k * p(x_n | tree_k)
        R = np.zeros(shape=(N, K))
        for n in range(N):
            for k in range(K):
                R[n, k] = tm.pi[k] * tree_sample_likelihood(
                    tm.clusters[k], samples[n])
        R = normalize(R, axis=1, norm='l1')

        # STEP 2: new mixing weights are the mean responsibilities.
        tm.pi = R.sum(axis=0) / N

        for k in range(K):
            # STEP 3: responsibility-weighted sufficient statistics.
            weights = R[:, k]
            total = weights.sum()

            # pair_counts[s, t, a, b] = sum_n r_nk * [x_ns == a] * [x_nt == b]
            pair_counts = np.einsum(
                'n,nsa,ntb->stab', weights, onehot, onehot)
            pair_counts[diag, diag] = 0.0  # a node is never paired with itself
            # node_counts[s, a] = sum_n r_nk * [x_ns == a]
            node_counts = np.einsum('n,nsa->sa', weights, onehot)

            if total > 0:
                joint = pair_counts / total
            else:
                joint = np.zeros_like(pair_counts)
            marginal = _normalise_last_axis(node_counts)
            # conditional[s, t, a, b] = p(Xt = b | Xs = a)  (used in step 5)
            conditional = _normalise_last_axis(pair_counts)

            # Mutual information between every pair of vertices. Only terms
            # with joint > 0 contribute: testing the ratio instead lets
            # 0/0 = NaN through (NaN != 0) and poisons the whole matrix.
            independent = (marginal[:, None, :, None]
                           * marginal[None, :, None, :])
            with np.errstate(divide='ignore', invalid='ignore'):
                terms = np.where(
                    joint > 0, joint * np.log(joint / independent), 0.0)
            info = terms.sum(axis=(2, 3))

            # STEP 4: maximum spanning tree of the mutual-information graph.
            g = Graph(V)
            for Xs in range(V):
                for Xt in range(V):
                    if Xs != Xt:
                        g.addEdge(Xs, Xt, info[Xs, Xt])
            mst = g.maximum_spanning_tree()

            neighbours = {}
            for u, v, _weight in mst:
                neighbours.setdefault(u, []).append(v)
                neighbours.setdefault(v, []).append(u)

            # STEP 5: orient the tree away from the root (node 0) and read
            # off parents and CPDs.
            root = 0
            topology_array = np.full(V, np.nan)
            theta_array = np.empty(V, dtype=object)
            theta_array[root] = marginal[root, :]

            # Iterative traversal: no recursion limit, O(1) visited checks.
            visited = {root}
            pending = [root]
            while pending:
                parent = pending.pop()
                for child in neighbours.get(parent, []):
                    if child in visited:
                        continue
                    visited.add(child)
                    topology_array[child] = parent
                    theta_array[child] = conditional[parent, child]
                    pending.append(child)

            new_tree = Tree()
            new_tree.load_tree_from_direct_arrays(topology_array, theta_array)
            tm.clusters[k] = new_tree

        # Log-likelihood is evaluated after the parameters were updated.
        log_hoods.append(tm_likelihood(tm, samples, N, num_clusters))

    loglikelihood_list = np.array(log_hoods)
    return loglikelihood_list, tm
def em_algorithm(seed_val, samples, num_clusters, max_num_iter=100):
    """
    Fit a mixture of binary tree-structured models with the EM algorithm.

    :param seed_val: Seed value for reproducibility. Type: int
    :param samples: Observed binary (0/1) x values, used as indices into the
                    node CPDs. Type: numpy array.
                    Dimensions: (num_samples, num_nodes)
    :param num_clusters: Number of clusters. Type: int
    :param max_num_iter: Number of EM iterations to run. Type: int
    :return: loglikelihood: log-likelihood recorded at the start of each EM
             iteration. Type: numpy array. Dimensions: (max_num_iter, )
    :return: topology_list: get_topology_array() of every fitted tree.
             Type: numpy array.
    :return: theta_list: get_theta_array() of every fitted tree.
             Type: numpy array.
    """
    from collections import deque
    import sys

    from Kruskal_v1 import Graph

    # Set the seed
    np.random.seed(seed_val)

    print("Running EM algorithm...")

    num_samples = samples.shape[0]
    num_nodes = samples.shape[1]

    tm = TreeMixture(num_clusters=num_clusters, num_nodes=num_nodes)
    tm.simulate_pi(seed_val=seed_val)
    tm.simulate_trees(seed_val=seed_val)
    # NOTE(review): the drawn samples are never read below; the call is kept
    # so any side effect it has on tm or the RNG stays the same - confirm.
    tm.sample_mixtures(num_samples=num_samples, seed_val=seed_val)

    # Smallest positive normal float: keeps every responsibility above zero.
    eps = sys.float_info.min

    def _weighted_tree_likelihood(tree, sample, prior):
        # prior * p(sample | tree), multiplying node factors breadth-first.
        prob = prior
        queue = deque([tree.root])
        while queue:
            node = queue.popleft()
            queue.extend(node.descendants)
            value = sample[int(node.name)]
            if node.ancestor is None:  # root node
                prob *= node.cat[value]
            else:
                prob *= node.cat[sample[int(node.ancestor.name)]][value]
        return prob

    def _as_distribution(row):
        # Normalise one CPD row; a row with no mass would be 0/0 = NaN, so
        # it falls back to uniform instead.
        total = row.sum()
        if total > 0:
            return row / total
        return np.full(row.shape, 1.0 / row.size)

    # onehot[n, s, a] is 1.0 exactly when samples[n, s] == a (a in {0, 1}).
    onehot = np.stack([samples == 0, samples == 1], axis=-1).astype(float)

    loglikelihood = []
    for _ in range(max_num_iter):
        # E-step: r[i, j] proportional to pi_j * p(sample_i | tree_j)
        r = np.ones((num_samples, num_clusters))
        for i, sample in enumerate(samples):
            for j, tree in enumerate(tm.clusters):
                r[i, j] = _weighted_tree_likelihood(tree, sample, tm.pi[j])
        r += eps
        evidence = np.sum(r, axis=1).reshape(num_samples, 1)
        r /= evidence
        loglikelihood.append(np.sum(np.log(evidence)))

        # M-step: mixing weights.
        den = np.sum(r, axis=0)
        tm.pi = den / num_samples

        # pair_q[s, t, a, b, k]: weighted frequency of (Xs = a, Xt = b)
        # node_q[s, a, k]:       weighted frequency of  Xs = a
        pair_q = np.einsum('nk,nsa,ntb->stabk', r, onehot, onehot) / den
        node_q = np.einsum('nk,nsa->sak', r, onehot) / den

        # Mutual information per pair and cluster. The positivity guard is
        # applied per cluster rather than requiring all clusters at once.
        with np.errstate(divide='ignore', invalid='ignore'):
            ratio = ((pair_q / node_q[:, None, :, None, :])
                     / node_q[None, :, None, :, :])
            terms = np.where(pair_q > 0, pair_q * np.log(ratio), 0.0)
        mutual_info = terms.sum(axis=(2, 3))

        for k in range(num_clusters):
            g = Graph(num_nodes)
            for s in range(num_nodes):
                for t in range(s + 1, num_nodes):
                    g.addEdge(s, t, mutual_info[s, t, k])

            # Undirected adjacency from the first two entries of each edge.
            neighbours = {node: [] for node in range(num_nodes)}
            for edge in g.maximum_spanning_tree():
                u, v = int(edge[0]), int(edge[1])
                neighbours[u].append(v)
                neighbours[v].append(u)

            # Orient the tree away from node 0 (the root, parent = NaN).
            topology_array = np.zeros(num_nodes)
            topology_array[0] = np.nan
            seen = {0}
            queue = deque([0])
            while queue:
                parent = queue.popleft()
                for child in neighbours[parent]:
                    if child in seen:
                        continue
                    seen.add(child)
                    topology_array[child] = parent
                    queue.append(child)

            new_tree = Tree()
            new_tree.load_tree_from_direct_arrays(topology_array)
            new_tree.alpha = [1.0] * 2
            new_tree.k = 2

            # Fill in the CPD of every node: marginal for the root,
            # p(child | parent value) rows for everything else.
            queue = deque([new_tree.root])
            while queue:
                node = queue.popleft()
                queue.extend(node.descendants)
                if node.ancestor is None:
                    node.cat = node_q[int(node.name), :, k].tolist()
                else:
                    counts = pair_q[
                        int(node.ancestor.name), int(node.name), :, :, k]
                    node.cat = [_as_distribution(counts[0]),
                                _as_distribution(counts[1])]

            tm.clusters[k] = new_tree

    topology_list = []
    theta_list = []
    for tree in tm.clusters:
        topology_list.append(tree.get_topology_array())
        theta_list.append(tree.get_theta_array())

    loglikelihood = np.array(loglikelihood)
    topology_list = np.array(topology_list)
    theta_list = np.array(theta_list)

    return loglikelihood, topology_list, theta_list