Пример #1
0
def em_algorithm(seed_val, samples, num_clusters, max_num_iter, tm=None):
    """
    Run a fixed number of EM iterations for a mixture of trees.

    :param seed_val: Accepted for interface compatibility; this function
        never reads it (the simulators are called with ``None``).
    :param samples: Observations, one row per sample and one column per node.
    :param num_clusters: Number of trees in the mixture.
    :param max_num_iter: Number of EM iterations to perform.
    :param tm: Optional mixture to start from. When ``None`` a new
        ``TreeMixture`` is created and initialised with ``simulate_pi`` and
        ``simulate_trees``.
    :return: ``(loglikelihood, topology_list, theta_list, tm)`` where
        ``loglikelihood`` has one entry per iteration (evaluated before that
        iteration's update), the two lists stack the per-cluster topology and
        theta arrays, and ``tm`` is the updated mixture.
    """
    n_obs, n_vars = samples.shape[0], samples.shape[1]

    if tm is None:
        tm = TreeMixture(num_clusters=num_clusters, num_nodes=n_vars)
        tm.simulate_pi(None)
        tm.simulate_trees(None)

    ll_history = []
    for _ in range(max_num_iter):
        # E-step: joint[row, c] is whatever sample_likelihood returns for
        # sample `row`, tree `c` and mixing weight pi[c].
        joint = np.array([
            [sample_likelihood(tm.clusters[c], samples[row, :], tm.pi[c])
             for c in range(num_clusters)]
            for row in range(n_obs)
        ])
        per_sample = np.sum(joint, axis=1)
        resp_matrix = np.divide(joint, np.reshape(per_sample, (n_obs, 1)))

        # Log-likelihood of the data under the mixture used for this E-step.
        ll = np.sum(np.log(per_sample), axis=None)
        ll_history.append(ll)
        tm.loglikelihood.append(ll)

        # M-step, part 1: new mixing weights.
        tm.pi = np.sum(resp_matrix, axis=0) / n_obs
        vertices = list(range(n_vars))

        # M-step, part 2: re-fit every tree from its responsibility column.
        for c in range(num_clusters):
            current_tree = tm.clusters[c]
            weights = resp_matrix[:, c]

            # Pairwise mutual information between all vertices.
            mi_rows = []
            for t_idx in vertices:
                mi_rows.append([
                    mutual_information(weights, samples, s_idx, t_idx)
                    for s_idx in vertices
                ])
            mi_matrix = np.asarray(mi_rows)

            weighted_graph = create_graph(n_vars, weights, samples,
                                          mi_matrix, vertices)
            spanning_tree = maximum_spanning_tree(weighted_graph)

            # The tree is always rooted at node 0.
            root_name = 0
            ordered_nodes, _ = create_ordered_nodes(spanning_tree, root_name)
            new_topology, new_theta = create_tree_attributes1(
                ordered_nodes, root_name, samples, weights, n_vars)
            current_tree.load_tree_from_direct_arrays(new_topology, new_theta)

    # Collect the final structure and parameters of every tree.
    per_cluster = [
        (tm.clusters[c].get_topology_array(), tm.clusters[c].get_theta_array())
        for c in range(num_clusters)
    ]
    topology_list = np.array([topo for topo, _ in per_cluster])
    theta_list = np.array([theta for _, theta in per_cluster])

    return np.array(ll_history), topology_list, theta_list, tm
Пример #2
0
def em_algorithm(seed_val, samples, num_clusters, max_num_iter=10, tm=None):
    """
    Fit a mixture of tree-structured models over binary variables with EM.

    Every iteration computes responsibilities from the current mixture, then
    re-estimates the mixing weights and, for each cluster, a new tree: the
    maximum spanning tree of the responsibility-weighted pairwise mutual
    information, rooted at vertex 0, with CPDs taken from the weighted counts.

    :param seed_val: Seed value for reproducibility. Type: int
    :param samples: Observed x values; entries are used directly as 0/1
        indices, so the array must have an integer dtype. Type: numpy array.
        Dimensions: (num_samples, num_nodes)
    :param num_clusters: Number of clusters. Type: int
    :param max_num_iter: Number of EM iterations to run (there is no early
        stopping). Type: int
    :param tm: Optional mixture to start from. When None, a TreeMixture is
        created and initialised with simulate_pi / simulate_trees using
        seed_val.
    :return: loglikelihood_list: Value returned by tm_likelihood after each
        iteration (presumably the data log-likelihood). Type: numpy array.
        Dimensions: (max_num_iter, )
    :return: tm: The mixture with updated pi and clusters.
    """
    # Set the seed
    np.random.seed(seed_val)

    N = len(samples)
    K = num_clusters
    V = samples.shape[1]
    if tm is None:
        tm = TreeMixture(num_clusters=num_clusters, num_nodes=V)
        tm.simulate_pi(seed_val=seed_val)
        tm.simulate_trees(seed_val=seed_val)
    log_hoods = []

    for _ in range(max_num_iter):

        # STEP 1: responsibilities R[n, k] proportional to
        # pi_k * likelihood of sample n under tree k.
        R = np.zeros(shape=(N, K))
        for n in range(N):
            for k in range(K):
                R[n, k] = tm.pi[k] * tree_sample_likelihood(
                    tm.clusters[k], samples[n])
        R = normalize(R, axis=1, norm='l1')

        # STEP 2: mixing weights are the mean responsibility per cluster.
        tm.pi = R.sum(axis=0) / N

        for k in range(K):
            # STEP 3: responsibility-weighted counts.
            # Nstab[s, t, a, b] = sum_n R[n, k] * [x_ns == a] * [x_nt == b]
            # Nsa[s, a]         = sum_n R[n, k] * [x_ns == a]
            # The diagonal s == t of Nstab is never filled nor used.
            Nstab = np.zeros(shape=(V, V, 2, 2))
            Nsa = np.zeros(shape=(V, 2))
            for n in range(N):
                r_nk = R[n, k]
                x = samples[n]
                for Xs in range(V):
                    a = x[Xs]
                    Nsa[Xs, a] += r_nk
                    for Xt in range(V):
                        if Xs != Xt:
                            Nstab[Xs, Xt, a, x[Xt]] += r_nk

            # Joint distribution q(Xs=a, Xt=b); loop-invariant denominator
            # computed once. A cluster without any weight yields all zeros.
            total_weight = R[:, k].sum()
            if total_weight > 0:
                Qstab = Nstab / total_weight
            else:
                Qstab = np.zeros_like(Nstab)

            # Marginal q(Xs=a); falls back to uniform when the vertex
            # received no weight at all, instead of producing NaN.
            marginal_mass = Nsa.sum(axis=1, keepdims=True)
            Qsa = np.divide(Nsa, marginal_mass,
                            out=np.full_like(Nsa, 0.5),
                            where=marginal_mass > 0)

            # Mutual information between every ordered vertex pair, using
            # the convention 0 * log(0) = 0. Checking the factors directly
            # avoids the 0/0 = NaN that slips through a "ratio != 0" test.
            Info = np.zeros(shape=(V, V))
            for Xs in range(V):
                for Xt in range(V):
                    if Xs == Xt:
                        continue
                    for a in range(2):
                        for b in range(2):
                            qab = Qstab[Xs, Xt, a, b]
                            qa = Qsa[Xs, a]
                            qb = Qsa[Xt, b]
                            if qab > 0 and qa > 0 and qb > 0:
                                Info[Xs, Xt] += qab * log(qab / (qa * qb))

            # Conditional p(Xt=b | Xs=a) for step 5. A parent value that was
            # never observed with positive weight gets a uniform row rather
            # than NaN, which would otherwise corrupt later likelihoods.
            conditional_mass = Nstab.sum(axis=3, keepdims=True)
            Qcond_stab = np.divide(Nstab, conditional_mass,
                                   out=np.full_like(Nstab, 0.5),
                                   where=conditional_mass > 0)

            # STEP 4: maximum spanning tree of the complete graph weighted
            # by mutual information (edges added in both directions).
            g = Graph(V)
            for Xs in range(V):
                for Xt in range(V):
                    if Xs != Xt:
                        g.addEdge(Xs, Xt, Info[Xs, Xt])
            mst = sorted(g.maximum_spanning_tree(), key=lambda x: x[0])

            # STEP 5: orient the tree away from the root and attach CPDs.
            topology_array = np.full(V, np.nan)
            theta_array = np.empty(V, dtype=object)  # filled with None
            root = 0
            theta_array[root] = Qsa[root, :]

            # Undirected adjacency list of the spanning tree.
            neighbours = {}
            for u, v, _w in mst:
                neighbours.setdefault(u, []).append(v)
                neighbours.setdefault(v, []).append(u)

            # Iterative traversal: every vertex reached from the root gets
            # the vertex it was reached from as its parent. In a tree this
            # assignment does not depend on the visiting order.
            visited = {root}
            stack = [root]
            while stack:
                parent = stack.pop()
                for child in neighbours.get(parent, ()):
                    if child in visited:
                        continue
                    visited.add(child)
                    theta_array[child] = Qcond_stab[parent, child]
                    topology_array[child] = parent
                    stack.append(child)

            new_tree = Tree()
            new_tree.load_tree_from_direct_arrays(topology_array, theta_array)

            tm.clusters[k] = new_tree

        # Score the updated mixture on the data.
        log_hoods.append(tm_likelihood(tm, samples, N, num_clusters))

    loglikelihood_list = np.array(log_hoods)

    return loglikelihood_list, tm
Пример #3
0
def em_algorithm(seed_val, samples, num_clusters, max_num_iter=100):
    """
    Fit a mixture of tree-structured models over binary variables with EM.

    :param seed_val: Seed value for reproducibility. Type: int
    :param samples: Observed x values; entries are used as 0/1 indices.
        Type: numpy array. Dimensions: (num_samples, num_nodes)
    :param num_clusters: Number of clusters. Type: int
    :param max_num_iter: Number of EM iterations to run (there is no early
        stopping). Type: int
    :return: loglikelihood: Log-likelihood recorded in each iteration,
        evaluated on the mixture as it was before that iteration's update.
        Type: numpy array. Dimensions: (max_num_iter, )
    :return: topology_list: Topology of each final tree. Type: numpy array.
        Dimensions: (num_clusters, num_nodes)
    :return: theta_list: CPDs of each final tree, as returned by
        get_theta_array. Type: numpy array, one entry per cluster.
    """
    # Set the seed
    np.random.seed(seed_val)

    print("Running EM algorithm...")

    from Kruskal_v1 import Graph
    import sys
    from collections import deque

    num_samples = samples.shape[0]
    num_nodes = samples.shape[1]

    tm = TreeMixture(num_clusters=num_clusters, num_nodes=num_nodes)
    tm.simulate_pi(seed_val=seed_val)
    tm.simulate_trees(seed_val=seed_val)
    # NOTE(review): the result of this call is not used below; it is kept
    # because its side effects on `tm` cannot be seen from here.
    tm.sample_mixtures(num_samples=num_samples, seed_val=seed_val)

    # Smallest positive normalised float; keeps responsibilities non-zero.
    eps = sys.float_info.min
    loglikelihood = []

    for _ in range(max_num_iter):
        # E-step: r[i, j] = pi_j * p(sample_i | tree_j), accumulated by a
        # breadth-first walk over the nodes of tree j.
        r = np.ones((num_samples, num_clusters))
        for i, sample in enumerate(samples):
            for j, tree in enumerate(tm.clusters):
                r[i, j] *= tm.pi[j]
                queue = deque([tree.root])
                while queue:
                    node = queue.popleft()
                    queue.extend(node.descendants)
                    value = sample[int(node.name)]
                    if node.ancestor is None:  # root: marginal
                        r[i, j] *= node.cat[value]
                    else:  # child: conditional on the parent's value
                        parent_value = sample[int(node.ancestor.name)]
                        r[i, j] *= node.cat[parent_value][value]

        r += eps
        rn = np.sum(r, axis=1).reshape(num_samples, 1)
        r /= rn
        loglikelihood.append(np.sum(np.log(rn)))

        # M-step: mixing weights.
        tm.pi = np.sum(r, axis=0) / num_samples
        den = np.sum(r, axis=0)

        # Weighted joint q_k(Xs=a, Xt=b), last axis indexes the cluster.
        NominatorQk = np.zeros((num_nodes, num_nodes, 2, 2, num_clusters))
        for s in range(num_nodes):
            for t in range(num_nodes):
                for a in range(2):
                    for b in range(2):
                        matched_index = np.where(
                            (samples[:, (s, t)] == [a, b]).all(1))[0]
                        NominatorQk[s, t, a,
                                    b] = np.sum(r[matched_index], axis=0) / den

        # Weighted marginal q_k(Xs=a).
        DenominatorQk = np.zeros((num_nodes, 2, num_clusters))
        for s in range(num_nodes):
            for a in range(2):
                matched_index = np.where((samples[:, s] == a))
                DenominatorQk[s, a] = np.sum(r[matched_index], axis=0) / den

        # Mutual information per vertex pair and cluster. A term is skipped
        # unless the joint is positive for every cluster; since r is made
        # positive by eps, that is expected to fail only when no sample
        # matches (a, b), in which case the term is zero for all clusters.
        Iqst = np.zeros((num_nodes, num_nodes, num_clusters))
        for s in range(num_nodes):
            for t in range(num_nodes):
                for a in range(2):
                    for b in range(2):
                        joint = NominatorQk[s, t, a, b]
                        if np.all(joint > 0):
                            Iqst[s, t] += joint * np.log(
                                (joint / DenominatorQk[s, a]) /
                                DenominatorQk[t, b])

        for k in range(num_clusters):
            g = Graph(num_nodes)
            for s in range(num_nodes):
                for t in range(s + 1, num_nodes):
                    g.addEdge(s, t, Iqst[s, t, k])

            # Keep only the two endpoint columns of every MST edge.
            mst_edges = np.array(g.maximum_spanning_tree())[:, [0, 1]]

            # Orient the tree away from node 0: each edge touching the
            # current node yields a child, after which the edge is removed
            # so it cannot be followed back.
            topology_array = np.zeros(num_nodes)
            topology_array[0] = np.nan
            frontier = deque([0])
            while frontier:
                parent = frontier.popleft()
                hits = np.array(np.where(mst_edges == [parent])).T
                for row, col in hits:
                    child = mst_edges[row][1 - col]
                    topology_array[int(child)] = parent
                    frontier.append(child)
                if np.size(hits) != 0:
                    mst_edges = np.delete(mst_edges, hits[:, 0], 0)

            new_tree = Tree()
            new_tree.load_tree_from_direct_arrays(topology_array)
            new_tree.alpha = [1.0] * 2
            new_tree.k = 2

            # Attach CPDs: marginal at the root, row-normalised joint
            # (i.e. p(child | parent)) everywhere else.
            pending = deque([new_tree.root])
            while pending:
                node = pending.popleft()
                pending.extend(node.descendants)

                if node.ancestor is None:
                    node.cat = DenominatorQk[int(node.name), :, k].tolist()
                else:
                    joint = NominatorQk[int(node.ancestor.name),
                                        int(node.name), :, :, k]
                    cpd = []
                    for parent_value in range(2):
                        row = joint[parent_value]
                        mass = np.sum(row)
                        # A parent value never observed would give 0/0;
                        # use a uniform row instead of NaN.
                        if mass > 0:
                            cpd.append(row / mass)
                        else:
                            cpd.append(np.full(2, 0.5))
                    node.cat = cpd

            tm.clusters[k] = new_tree

    # Report the trees of the final mixture only, one entry per cluster.
    topology_list = np.array([t.get_topology_array() for t in tm.clusters])
    theta_list = np.array([t.get_theta_array() for t in tm.clusters])
    loglikelihood = np.array(loglikelihood)
    return loglikelihood, topology_list, theta_list