示例#1
0
def remove_orphans(cc):  ## cc - current clusters
    """Keep only the current clusters that best match some initial cluster.

    Every item obtained by iterating the result of
    ``find_initial_clusters_and_their_info_from_table2()`` is treated as a
    comma-separated profile combination.  For each of them the current
    cluster with the highest MI (as computed by ``prog4.calc_MI_fast``) is
    selected.  Current clusters that are never selected are dropped, and
    clusters selected more than once are reported only once.

    Args:
        cc: The current clusters; iterated once per initial cluster.
            Described by the original author as a list of sets.

    Returns:
        A list of newly built sets, one per distinct selected cluster, in
        the order in which each was first selected.

    Note:
        The search starts from an MI of -100 and an empty cluster, so when
        no current cluster scores above -100 (for example when ``cc`` is
        empty) an empty set is selected for that initial cluster and ends
        up in the result.
    """
    initial_clusters = find_initial_clusters_and_their_info_from_table2()
    import prog4
    profile_freq = prog4.get_profile_freq()
    profile_pair_freq = prog4.get_profile_pair_freq()

    seen = set()  # sorted-member tuples of clusters already emitted
    kept = []
    for i_c in initial_clusters:
        # The split does not depend on the inner loop, so do it once.
        initial_profiles = set(i_c.split(","))
        best_MI = -100
        best_cluster = set()
        for c in cc:
            # MI between the initial profile combination and this current one.
            MI = prog4.calc_MI_fast(
                initial_profiles, c, profile_pair_freq, profile_freq
            )
            # Strict comparison: on ties the earliest cluster in cc wins.
            if MI > best_MI:
                best_MI = MI
                best_cluster = c

        # A sorted tuple is a hashable, order-independent identity for the
        # cluster, which makes duplicate detection a constant-time lookup.
        key = tuple(sorted(best_cluster))
        if key not in seen:
            seen.add(key)
            kept.append(set(key))
    return kept
示例#2
0
input:
no input, but program prog4.py should be run first

output:
n/MI_matrix.csv - MI matrix of step n (before combining two clusters)
n/clusters.csv - current set of clusters of step n (before combining two clusters)
tree.csv - each line in this file corresponds to the two clusters that are going to be combined and the MI between them
"""
import os
import math
import prog4
import common
# Path prefix for this program's output, taken from the shared `common`
# module; the step number is appended to it directly to form a path.
WORK_DIR = common.WORK_DIR

# Module-level inputs fetched from prog4 once, at import time; the functions
# below read them as globals.
# NOTE(review): presumably per-profile frequencies, per-profile-pair
# frequencies and the starting collection of clusters -- confirm against prog4.
profile_freq = prog4.get_profile_freq()
profile_pair_freq = prog4.get_profile_pair_freq()
clusters = prog4.get_clusters()


def calc_MI_matrix():
    """Compute the pairwise MI values for the module-level ``clusters``.

    Only pairs below the diagonal are evaluated: the result maps each index
    pair ``(i, j)`` with ``j < i`` to the value of ``prog4.calc_MI_fast`` for
    clusters ``i`` and ``j``, rounded to 4 decimal places.

    Returns:
        A dict keyed by ``(i, j)`` tuples, filled row by row.
    """
    return {
        (row, col): round(
            prog4.calc_MI_fast(
                row_cluster, col_cluster, profile_pair_freq, profile_freq
            ),
            4,
        )
        for row, row_cluster in enumerate(clusters)
        for col, col_cluster in enumerate(clusters)
        if col < row  # skip the diagonal and the mirrored upper triangle
    }


def create_MI_file_for_this_step():
    if not os.path.exists(WORK_DIR + str(step)):