def calc_MI_matrix():
    """Build the pairwise score table for the module-level ``clusters``.

    Every cluster is paired with each cluster that precedes it in
    ``clusters``, so only index pairs ``(i, j)`` with ``j < i`` are stored
    (the strictly lower triangle; no diagonal, no mirrored entries).

    Returns a dict mapping ``(i, j)`` to the value returned by
    ``prog4.calc_MI_fast`` for the two clusters, rounded to 4 decimals.
    Reads the globals ``clusters``, ``profile_pair_freq`` and
    ``profile_freq``.
    """
    matrix = {}
    for row, later_cluster in enumerate(clusters):
        # clusters[:row] holds exactly the clusters with a smaller index.
        for col, earlier_cluster in enumerate(clusters[:row]):
            score = prog4.calc_MI_fast(
                later_cluster, earlier_cluster, profile_pair_freq, profile_freq
            )
            matrix[(row, col)] = round(score, 4)
    return matrix
def remove_orphans(cc):
    """Keep only the current clusters that best match some initial cluster.

    For every initial cluster (the items yielded by
    ``find_initial_clusters_and_their_info_from_table2()``, each a
    comma-separated string of profile names) the current cluster with the
    highest ``prog4.calc_MI_fast`` score is selected.  Current clusters
    that are never selected (the "orphans") are dropped.

    cc -- current clusters, a list of sets.

    Returns a list of new sets without duplicates, in the order in which
    the clusters were first selected.  An initial cluster for which no
    current cluster scores above the -100 sentinel (for example when
    ``cc`` is empty) contributes nothing; previously it contributed a
    spurious empty set.
    """
    initial_clusters = find_initial_clusters_and_their_info_from_table2()
    import prog4
    profile_freq = prog4.get_profile_freq()
    profile_pair_freq = prog4.get_profile_pair_freq()

    seen = set()  # frozenset keys of the clusters already kept
    kept = []
    for i_c in initial_clusters:
        # The profile set of the initial cluster does not change while we
        # scan the current clusters, so build it once.
        initial_profiles = set(i_c.split(","))
        best_MI = -100
        best_cluster = None
        for c in cc:
            # MI between the initial profile combination and this current one
            MI = prog4.calc_MI_fast(
                initial_profiles, c, profile_pair_freq, profile_freq
            )
            if MI > best_MI:
                best_MI = MI
                best_cluster = c
        if best_cluster is None:
            # Nothing matched: do not invent an empty cluster.
            continue
        key = frozenset(best_cluster)
        if key not in seen:
            seen.add(key)
            kept.append(set(best_cluster))
    return kept
def remove_orphans():
    """Prune the global ``clusters`` list down to the "covered" clusters.

    For every entry of the global ``initial_clusters`` the current cluster
    with the highest ``prog4.calc_MI_fast`` score is selected.  Afterwards
    ``clusters`` is rewritten in place (same list object) so that it holds
    one new set per selected cluster, without duplicates, in order of
    first selection.  Clusters that were never selected are dropped.

    An initial cluster for which no current cluster scores above the -100
    sentinel (for example when ``clusters`` is empty) contributes nothing;
    previously it put a spurious empty set into ``clusters``.

    Reads the globals ``initial_clusters``, ``profile_pair_freq`` and
    ``profile_freq``; returns None.
    """
    seen = set()  # frozenset keys of the clusters already kept
    kept = []
    for i_c in initial_clusters:
        best_MI = -100
        best_cluster = None
        for c in clusters:
            # MI between the initial profile combination and this current one
            MI = prog4.calc_MI_fast(i_c, c, profile_pair_freq, profile_freq)
            if MI > best_MI:
                best_MI = MI
                best_cluster = c
        if best_cluster is None:
            # Nothing matched: do not invent an empty cluster.
            continue
        key = frozenset(best_cluster)
        if key not in seen:
            seen.add(key)
            kept.append(set(best_cluster))
    # Slice assignment keeps the identity of the shared list object.
    clusters[:] = kept
new_file = open(WORK_DIR + "active_clusters.csv", "w") new_file.write("number_of_clusters\tnumber_of_active_clusters\n") initial_clusters = prog8.find_initial_clusters_and_their_info_from_table2() for step in range(260, 0, -1): try: current_clusters = prog8.find_clusters_on_a_given_step(step) profile_freq = prog4.get_profile_freq() profile_pair_freq = prog4.get_profile_pair_freq() covered_clusters = set() for i_c in initial_clusters: best_MI = -100 best_final_cluster = "" for c in current_clusters: MI = prog4.calc_MI_fast( set(i_c.split(",")), set(c.split(",")), profile_pair_freq, profile_freq ) # calculate MI between initial profile combination (i_c) and each of the current combinations (c) if MI > best_MI: best_MI = MI best_final_cluster = c covered_clusters.add(best_final_cluster) print step, " step" new_file.write( str(len(current_clusters)) + '\t' + str(len(covered_clusters)) + "\n") except IOError: print step, 'step doesn\'t exist'
for step in range(260, 0, -1): try: print step, " step" current_clusters = prog8.find_clusters_on_a_given_step(step) new_file.write(str(len(current_clusters)) + "\t") best_subgroup_of_final_cluster = { } # here I will store the name of the "nearest" subgroup for each of the current clusters. For instance, in figure 6A (https://www.dropbox.com/s/bew8l8vszgkgjs5/fig6.png), p3 is the nearest for C1, and p4 is the nearest for C2. for c in current_clusters: best_MI = -100 # best MI so far for this current cluster best_subgroups_in_current_clusters = {} for i_c in initial_clusters: MI = prog4.calc_MI_fast( set(i_c.split(",")), set(c.split(",")), profile_pair_freq, profile_freq ) # calculate MI between an initial cluster and a current cluster if MI > best_MI and initial_clusters[i_c][ 'subgroups'] != {}: # if MI is higher than the best MI so far, and there exist any proteins with known subgroup in this initial cluster best_MI = MI best_subgroups_in_current_clusters = {} if MI == best_MI: # there can be several initial clusters with equally short distance (MI) to the current cluster for subgr in initial_clusters[i_c]['subgroups'].keys( ): # keys are the names of the subgroups, values are the numbers of proteins from the given subgroup if subgr not in best_subgroups_in_current_clusters: best_subgroups_in_current_clusters[subgr] = 0 best_subgroups_in_current_clusters[ subgr] += initial_clusters[i_c]['subgroups'][subgr] best_subgr = max(