예제 #1
0
파일: prog1exp.py 프로젝트: neksa/Enolase
def calc_MI_matrix():
    """Build the pairwise MI table for the clusters in the enclosing scope.

    Reads ``clusters``, ``profile_pair_freq`` and ``profile_freq`` from the
    enclosing module scope.  Only index pairs ``(i, j)`` with ``j < i`` are
    stored (the strict lower triangle); each value is the result of
    ``prog4.calc_MI_fast(clusters[i], clusters[j], ...)`` rounded to four
    decimal places.

    Returns:
        dict mapping ``(i, j)`` tuples to the rounded score.
    """
    return {
        (row, col): round(
            prog4.calc_MI_fast(
                row_cluster, col_cluster, profile_pair_freq, profile_freq
            ),
            4,
        )
        for row, row_cluster in enumerate(clusters)
        for col, col_cluster in enumerate(clusters)
        if col < row  # skip the diagonal and the mirrored upper triangle
    }
예제 #2
0
def remove_orphans(cc):  ## cc - current clusters
    """Keep only the current clusters that best match some initial cluster.

    For every initial cluster (iterated from
    ``find_initial_clusters_and_their_info_from_table2()`` as a
    comma-separated string) the current cluster with the highest
    ``prog4.calc_MI_fast`` score is selected.  Current clusters that are
    never selected (the "orphans") are dropped.

    Args:
        cc: the current clusters, a list of sets.  It is not modified.

    Returns:
        A new list of sets holding the selected clusters without
        duplicates, in the order in which they were first selected.  The
        list is empty when no current cluster was selected at all (for
        example when ``cc`` is empty).
    """
    initial_clusters = find_initial_clusters_and_their_info_from_table2()
    import prog4  # kept as a function-scope import, as in the original
    profile_freq = prog4.get_profile_freq()
    profile_pair_freq = prog4.get_profile_pair_freq()

    seen = set()  # frozensets of clusters already emitted (O(1) duplicate check)
    kept = []
    for i_c in initial_clusters:
        # Loop-invariant for the inner loop: split the profile string once.
        initial_profiles = set(i_c.split(","))
        best_MI = -100  # floor: scores at or below this are never selected
        best_final_cluster = None
        for c in cc:
            # MI between the initial profile combination and this current one
            MI = prog4.calc_MI_fast(
                initial_profiles, c, profile_pair_freq, profile_freq
            )
            if MI > best_MI:  # strict: the first of several equal maxima wins
                best_MI = MI
                best_final_cluster = c
        if best_final_cluster is None:
            # Nothing was selected; do not emit a spurious empty cluster.
            continue
        key = frozenset(best_final_cluster)
        if key not in seen:
            seen.add(key)
            kept.append(set(key))
    return kept
예제 #3
0
파일: prog1exp.py 프로젝트: neksa/Enolase
def remove_orphans():
    """Drop, in place, every cluster that is not the best match of an initial cluster.

    Reads ``initial_clusters``, ``clusters``, ``profile_pair_freq`` and
    ``profile_freq`` from the enclosing module scope.  For each initial
    cluster the entry of ``clusters`` with the highest
    ``prog4.calc_MI_fast`` score is selected; ``clusters`` is then
    overwritten in place with the selected clusters (as fresh sets, without
    duplicates, in first-selected order).  If nothing was selected at all
    (for example ``clusters`` was empty) the list ends up empty.

    Returns:
        None; the result is delivered by mutating ``clusters``.
    """
    seen = set()  # frozensets of clusters already kept (O(1) duplicate check)
    kept = []
    for i_c in initial_clusters:
        best_MI = -100  # floor: scores at or below this are never selected
        best_final_cluster = None
        for c in clusters:
            # MI between the initial profile combination and this current one
            MI = prog4.calc_MI_fast(i_c, c, profile_pair_freq, profile_freq)
            if MI > best_MI:  # strict: the first of several equal maxima wins
                best_MI = MI
                best_final_cluster = c
        if best_final_cluster is None:
            # Nothing was selected; do not emit a spurious empty cluster.
            continue
        key = frozenset(best_final_cluster)
        if key not in seen:
            seen.add(key)
            kept.append(set(key))

    # Slice-assign so that every existing reference to the list sees the result.
    clusters[:] = kept
예제 #4
0
파일: prog15.py 프로젝트: neksa/Enolase
# For every clustering step from 260 down to 1, record how many clusters
# exist at that step and how many of them are "active", i.e. are the
# best-scoring match (per prog4.calc_MI_fast) of at least one initial cluster.
# The result is written as tab-separated text to WORK_DIR/active_clusters.csv.
#
# The ``with`` block guarantees the output file is flushed and closed even if
# a step fails part-way through.
with open(WORK_DIR + "active_clusters.csv", "w") as new_file:
    new_file.write("number_of_clusters\tnumber_of_active_clusters\n")
    initial_clusters = prog8.find_initial_clusters_and_their_info_from_table2()
    # Initial clusters are comma-separated profile strings; split each once
    # instead of once per (initial, current) pair.
    initial_profile_sets = [set(i_c.split(",")) for i_c in initial_clusters]

    for step in range(260, 0, -1):
        # Only the data-loading calls are guarded, so an IOError raised while
        # writing the output is no longer misreported as a missing step.
        try:
            current_clusters = prog8.find_clusters_on_a_given_step(step)
            profile_freq = prog4.get_profile_freq()
            profile_pair_freq = prog4.get_profile_pair_freq()
        except IOError:
            # Single-argument parenthesised print: same output on Python 2,
            # and also valid syntax on Python 3.
            print("%s step doesn't exist" % step)
            continue

        # Pair every current cluster string with its profile set, built once.
        current_profile_sets = [
            (c, set(c.split(","))) for c in current_clusters
        ]

        covered_clusters = set()
        for initial_profiles in initial_profile_sets:
            best_MI = -100  # floor: scores at or below this are never selected
            best_final_cluster = None
            for c, current_profiles in current_profile_sets:
                # MI between the initial profile combination and this current one
                MI = prog4.calc_MI_fast(
                    initial_profiles, current_profiles, profile_pair_freq,
                    profile_freq
                )
                if MI > best_MI:  # strict: the first of several equal maxima wins
                    best_MI = MI
                    best_final_cluster = c
            if best_final_cluster is not None:
                # Skip the "nothing selected" case so that an empty step is
                # not counted as having one active cluster.
                covered_clusters.add(best_final_cluster)

        print("%s  step" % step)
        new_file.write(
            "%d\t%d\n" % (len(current_clusters), len(covered_clusters)))
예제 #5
0
for step in range(260, 0, -1):
    try:
        print step, " step"
        current_clusters = prog8.find_clusters_on_a_given_step(step)
        new_file.write(str(len(current_clusters)) + "\t")

        best_subgroup_of_final_cluster = {
        }  # here I will store the name of the "nearest" subgroup for each of the current clusters. For instance, in figure 6A (https://www.dropbox.com/s/bew8l8vszgkgjs5/fig6.png), p3 is the nearest for C1, and p4 is the nearest for C2.

        for c in current_clusters:
            best_MI = -100  # best MI so far for this current cluster
            best_subgroups_in_current_clusters = {}

            for i_c in initial_clusters:
                MI = prog4.calc_MI_fast(
                    set(i_c.split(",")), set(c.split(",")), profile_pair_freq,
                    profile_freq
                )  # calculate MI between an initial cluster and a current cluster
                if MI > best_MI and initial_clusters[i_c][
                        'subgroups'] != {}:  # if MI is higher than the best MI so far, and there exist any proteins with known subgroup in this initial cluster
                    best_MI = MI
                    best_subgroups_in_current_clusters = {}

                if MI == best_MI:  # there can be several initial clusters with equally short distance (MI) to the current cluster
                    for subgr in initial_clusters[i_c]['subgroups'].keys(
                    ):  # keys are the names of the subgroups, values are the numbers of proteins from the given subgroup
                        if subgr not in best_subgroups_in_current_clusters:
                            best_subgroups_in_current_clusters[subgr] = 0
                        best_subgroups_in_current_clusters[
                            subgr] += initial_clusters[i_c]['subgroups'][subgr]

            best_subgr = max(