def remove_orphans(cc):
    """Keep only the current clusters that best match some initial cluster.

    Every initial cluster obtained from
    ``find_initial_clusters_and_their_info_from_table2()`` is compared with
    every current cluster using ``prog4.calc_MI_fast``; the current cluster
    with the highest score is selected for it. The selected clusters are
    then de-duplicated, preserving the order in which they were first
    selected.

    Args:
        cc: The current clusters; a list of sets according to the original
            author's note. It is iterated once per initial cluster.

    Returns:
        A list of sets containing the distinct selected clusters. When no
        current cluster scores above -100 for an initial cluster (for
        example when ``cc`` is empty), an empty set is selected for it.
    """
    initial_clusters = find_initial_clusters_and_their_info_from_table2()
    import prog4  # function-local import, kept where the original had it
    profile_freq = prog4.get_profile_freq()
    profile_pair_freq = prog4.get_profile_pair_freq()

    seen = set()  # sorted member tuples of clusters already emitted
    unique_clusters = []
    for initial in initial_clusters:
        # Each initial cluster is a comma-separated string of profiles.
        # Built once per initial cluster instead of once per candidate;
        # assumes calc_MI_fast does not mutate its arguments - TODO confirm.
        initial_profiles = set(initial.split(","))

        # NOTE(review): -100 acts as the "nothing chosen yet" floor; scores
        # at or below it never win. Kept as-is because the value range of
        # calc_MI_fast is not visible from here.
        best_mi = -100
        best_cluster = set()
        for candidate in cc:
            mi = prog4.calc_MI_fast(
                initial_profiles, candidate, profile_pair_freq, profile_freq
            )
            # Strict comparison: on ties the earliest candidate is kept.
            if mi > best_mi:
                best_mi = mi
                best_cluster = candidate

        # A sorted tuple is a hashable, order-independent identity for the
        # cluster, giving constant-time duplicate detection.
        key = tuple(sorted(best_cluster))
        if key not in seen:
            seen.add(key)
            unique_clusters.append(set(key))
    return unique_clusters
input: no input, but program prog4.py shoul be run first output: n/MI_matrix.csv - MI matrix of step n (before combining two clusters) n/clusters.csv - current set of clusters of step n (before combining two clusters) tree.csv - each line in this file corresponds to two clusters that are going to be combined and MI between them """ import os import math import prog4 import common WORK_DIR = common.WORK_DIR profile_freq = prog4.get_profile_freq() profile_pair_freq = prog4.get_profile_pair_freq() clusters = prog4.get_clusters() def calc_MI_matrix(): matrix = {} for i, c1 in enumerate(clusters): for j, c2 in enumerate(clusters): if j >= i: continue matrix[(i, j)] = round( prog4.calc_MI_fast(c1, c2, profile_pair_freq, profile_freq), 4) return matrix def create_MI_file_for_this_step():