Example #1
0
def start_top_hits(num_hits, stage, smotif_index):
    """
    generate run seq, a seq list of pairs of
    indexes of profiles for job scheduling
    """
    map_route = []
    ss_profiles = io.readPickle("ss_profiles.pickle")
    if os.path.isfile("contacts_route.pickle"):
        map_route = io.readPickle("contacts_route.pickle")
    elif os.path.isfile("pcs_route.pickle"):
        map_route = io.readPickle("pcs_route.pickle")
        map_route_alt = io.readPickle("pcs_route_alt.pickle")
    elif os.path.isfile("rdc_route.pickle"):
        map_route = io.readPickle("rdc_route.pickle")
        map_route_alt = io.readPickle("rdc_route_alt.pickle")

    alt_smotif_defs = map_route_alt[smotif_index]

    top_hits = []
    top_hit_file = str(smotif_index - 1) + "_refined_tophits.gzip"
    if os.path.isfile(top_hit_file):
        top_hits = io.readGzipPickle(top_hit_file)

        print "loading from prevously assembled refined_tophits.pickle file"
        print "# hits :", len(top_hits)
    else:
        top_hit_file = str(smotif_index - 1) + "_tophits.gzip"
        if os.path.isfile(top_hit_file):
            top_hits = io.readGzipPickle(top_hit_file)
            print "loading from prevously assembled tophits.pickle file"
            print "# hits :", len(top_hits)
        else:
            print "No previous tophits file found, Generating a new one"
            return "exception"

    if not top_hits:
        return False, False

    run_seq = []
    for next_smotif in alt_smotif_defs:
        print next_smotif
        direction = next_smotif[-1]
        if direction == 'left':
            next_ss_list = ss_profiles[next_smotif[0]]
        else:
            next_ss_list = ss_profiles[next_smotif[1]]

        for i in range(len(top_hits)):
            for j in range(len(next_ss_list)):
                run_seq.append([i, j, next_smotif])

    return run_seq, smotif_index
Example #2
0
def getPreviousSmotif(index, next_index):
    """
    Fetch a single top hit assembled at the previous Smotif index.

    Looks first for the refined tophits file, then falls back to the raw
    one; returns False when neither file exists on disk.

    :param index: position of the wanted hit within the previous tophits
    :param next_index: index of the Smotif currently being assembled
    :return: the stored hit, or False when no tophits file is found
    """
    prev_index = next_index - 1
    # Refined hits take priority over the raw tophits dump.
    for suffix in ("_refined_tophits.gzip", "_tophits.gzip"):
        candidate = str(prev_index) + suffix
        if os.path.isfile(candidate):
            # Read in the previous index's hits and pull out the requested one.
            return io.readGzipPickle(candidate)[index]
    return False
Example #3
0
def makeTopPickle(previous_smotif_index, num_hits, stage):
    """
    Concatenate hit data from all of the worker threads for one Smotif
    index, bucket hits by score, remove redundant sequences, rank, and
    dump the top `num_hits` entries to "<index>_tophits.pickle".

    :param previous_smotif_index: index whose per-thread "<index>_*_*.gzip"
        result files are gathered
    :param num_hits: maximum number of non-redundant hits to keep
    :param stage: forwarded to getNchiSum() when scoring PCS data
    :return: range(count_top_hits), one int per hit actually dumped
    """
    hits = []
    # Gather every per-thread result file for this Smotif index.
    # regex = str(previous_smotif_index) + "_*_*.pickle"
    regex = str(previous_smotif_index) + "_*_*.gzip"
    file_list = glob.glob(regex)
    for f in file_list:
        t_hits = io.readGzipPickle(f)
        for t_hit in t_hits:
            hits.append(t_hit)
    """
    identifiers: smotif, smotif_def, seq_filter, contacts_filter, PCS_filter, qcp_rmsd, Evofilter
                 RDC_filter, NOE_filter
    """

    # Bucket hits by score so equal-scoring hits stay grouped.
    new_dict = collections.defaultdict(list)
    pcs_filter = False
    contact_filter = False
    rdc_filter = False
    noe_filter = False
    ref_rmsd_filter = False
    global_noe_filter = False
    for hit in hits:
        # thread_data contains data from each search and filter thread.
        for data_filter in hit:
            if data_filter[0] == 'PCS_filter':
                pcs_filter = True
                pcs_data = data_filter
                Nchi = getNchiSum(pcs_data, stage)
                # new_dict.setdefault(Nchi, []).append(entry)
                new_dict[Nchi].append(hit)

            if data_filter[0] == 'Evofilter':
                contact_filter = True
                new_dict[data_filter[1]].append(hit)

            if data_filter[0] == 'Ref_RMSD':
                ref_rmsd_filter = True
                new_dict[data_filter[1]].append(hit)
            # NOTE(review): a hit matching more than one of the filters above
            # is appended under several score keys — presumably intended, but
            # worth confirming.
            """
            if data_filter[0] == 'RDC_filter':
                rdc_filter = True
                rdc_data = data_filter
                Nchi = rdcSumChi(rdc_data, stage)
                for filter in hit:
                    if filter[0] == 'NOE_filter':
                        noe_filter = True
                        noe_fmeasure = filter[1]
                        no_of_noes = filter[2]
                        if stage == 99:
                            Nchi = Nchi
                        else:
                            # Nchi = Nchi / math.pow(10, noe_fmeasure * no_of_noes)
                            Nchi = Nchi / math.pow(10, noe_fmeasure )
                            #new_dict[Nchi].append(hit)
                    if filter[0] == 'GlobalNoe_filter':
                        global_noe_filter = True
                        noe_percent = filter[1]
                        total_no_of_noes = filter[2]
                        Nchi = total_no_of_noes
                    new_dict[Nchi].append(hit)

                if not noe_filter:
                    new_dict[Nchi].append(hit)
            """
    # ************************************************
    # Exclude the redundant entries and rank top hits
    # ************************************************

    # Python 2: dict.keys() returns a list, so in-place sort works.
    keys = new_dict.keys()
    keys.sort()
    if contact_filter or global_noe_filter:
        # Contact filter data should be as high as possible
        print "Global NOEs are used"
        keys.reverse()

    # Exclude the redundant data.

    # non_redundant = {}
    non_redundant = collections.defaultdict(list)
    seqs = []
    smotif_seq = ''
    Nchi = 0.0
    count_hits = 0
    # Walk buckets best-score-first; keep only the first hit per sequence.
    for i in range(0, len(keys)):
        entries = new_dict[keys[i]]
        for entry in entries:
            # Pull name/sequence/score fields out of this hit's filter records.
            # NOTE(review): Nchi and smotif_seq persist across iterations, so
            # an entry lacking these records reuses the previous entry's values.
            for ent in entry:
                if ent[0] == 'smotif':
                    name = ent[1][0]
                if ent[0] == 'seq_filter':
                    seq_filter = ent
                    smotif_seq = seq_filter[1]
                if ent[0] == 'PCS_filter':
                    pcs_data = ent
                    Nchi = getNchiSum(pcs_data, stage)
                if ent[0] == 'Evofilter':
                    Nchi = ent[1]
                if ent[0] == 'Ref_RMSD':
                    # Ref_RMSD overwrites the previous Nchis
                    Nchi = ent[1]
                """
                if ent[0] == 'RDC_filter':
                    rdc_data = ent
                    Nchi = rdcSumChi(rdc_data, stage)
                    if noe_filter:
                        for ent in entry:
                            if ent[0] == 'NOE_filter':
                                noe_fmeasure = ent[1]
                                no_of_noes = ent[2]
                                if stage == 99:
                                    Nchi = Nchi
                                else:
                                    # Nchi = Nchi / math.pow(10, noe_fmeasure * no_of_noes)
                                    Nchi = Nchi / math.pow(10, noe_fmeasure)
                                    #Nchi = Nchi / math.pow(10, noe_fmeasure * 10)
                    else:
                        Nchi = rdcSumChi(rdc_data, stage)
                """

            # First occurrence of a sequence wins; later duplicates are dropped.
            if smotif_seq not in seqs:
                seqs.append(smotif_seq)
                # non_redundant.setdefault(Nchi, []).append(entry)
                non_redundant[Nchi].append(entry)
                count_hits += 1
        if count_hits >= num_hits:
            break

    # Rank top hits and dump the data
    keys = non_redundant.keys()
    keys.sort()
    # Contact scores rank high-to-low, but PCS chi-squared ranks low-to-high
    # and takes precedence when both are present.
    if contact_filter and not pcs_filter:
        keys.reverse()
    dump_pickle = []
    print "Dumping data to disk"
    count_top_hits = 0
    # NOTE(review): the while loop runs at most once — every path below
    # breaks out of it on the first pass.
    while (True):
        for key in keys:
            if key == 999.999:
                # Do not work on these entries
                continue
            entries = non_redundant[key]
            for entry in entries:
                dump_pickle.append(entry)
                print "final sele", entry[0][1][0][0], key
                count_top_hits += 1
            if count_top_hits >= num_hits:
                break
        if count_top_hits >= num_hits:
            break
        else:
            print "could only extract ", count_top_hits
            break

    io.dumpPickle(str(previous_smotif_index) + "_tophits.pickle", dump_pickle)
    print "actual number in top hits ", len(dump_pickle)
    return range(count_top_hits)