Пример #1
0
def oligo_dup_output(out, parse_dic):
    """Write one line per query listing its strongly binding hits.

    Keys of ``parse_dic`` are visited in sorted order.  For an entry holding
    more than one element, the first element is skipped, the elements
    between the first and the last are unpacked as ``(hit_id, hit_score)``
    pairs, and the last element is indexed in parallel as the alignment text
    of each hit.  The first HSP of every hit is parsed with
    ``blast_parse.parse_align`` and scored by the external ``./code/energy``
    program.  Scanning of a query's hits stops at the first hit whose energy
    is >= -10.  The hits kept so far are written in ascending energy order as
    ``hit_id<TAB>energy<TAB>``.  Every query, with or without kept hits,
    terminates its line with a newline.

    Args:
        out: object exposing ``write(str)``.
        parse_dic: mapping ``query -> [first_item, (hit_id, hit_score), ...,
            alignments]``.

    Raises:
        ValueError: if the last line printed by ``./code/energy`` is not a
            number (for example when the program produced no output).
    """
    # sorted() works on Python 2 and 3; dict.keys().sort() is Python 2 only.
    for key in sorted(parse_dic):
        entry = parse_dic[key]
        if len(entry) > 1:
            hit_list = entry[1:-1]
            alignment = entry[-1]
            scored_hits = []
            for i, hit in enumerate(hit_list):
                hit_id, _hit_score = hit
                # Text after the first " Score =" marker is the 1st HSP.
                align = alignment[i].split(" Score =")[1]
                (plusline, minusline,
                 _p_start, _m_start, _p_end, _m_end) = blast_parse.parse_align(align)

                # The sequences travel through the environment and are
                # expanded inside double quotes by the shell, so their
                # content is never re-parsed as shell syntax.
                os.environ["plusline"] = plusline
                os.environ["minusline"] = minusline
                pipe = os.popen('./code/energy "$plusline" "$minusline" 1')
                try:
                    ret_data = pipe.read()
                finally:
                    # Always release the pipe, even if reading fails.
                    pipe.close()

                # The energy is the last line of the program's output.
                energy = float(ret_data.strip().split("\n")[-1])
                if energy >= -10:
                    break
                scored_hits.append((energy, hit_id))

            scored_hits.sort()
            for energy, hit_id in scored_hits:
                out.write(hit_id + "\t" + str(energy) + "\t")
        out.write("\n")
Пример #2
0
def Parse_Blast (id , output, Group_dic_same,  OLIGOLEN):
    """Map every oligo start position of the query to its cross-hit energies.

    ``output`` is handed to ``blast_parse.parse_alignment`` and only the
    first parsed entry is used: its first element is taken as the query
    length, its last element as the per-hit alignment texts, and the
    elements in between as the hit list.  Every HSP of every hit is scanned
    with a window of ``OLIGOLEN`` columns and, for each query position the
    window can start at, the lowest energy seen for that hit is kept.

    HSPs that correspond to a record of ``Group_dic_same`` for this query
    (same hit id, same strand, matching or enclosing coordinates) are
    skipped and reported on stdout.

    Relies on names that are not defined in this function and must exist at
    module level: ``string``, ``WithInRange``, ``compute_energy2``, ``ADD``,
    ``SUB`` and ``WORD``.

    Args:
        id: query identifier; spaces and tabs are removed before it is used
            as a key into ``Group_dic_same``.
        output: BLAST result passed unchanged to
            ``blast_parse.parse_alignment``.
        Group_dic_same: mapping ``id -> [[hit_id, hit_strand, qstart, qend,
            hstart, hend], ...]``; a missing id means no record.
        OLIGOLEN: oligo (window) length in alignment columns.

    Returns:
        ``{}`` when there are no hits, otherwise
        ``{pos: [min_energy, [hit_id, energy, hit_location], ...]}``.

    Raises:
        IndexError: if ``parse_alignment`` left ``parse_dic`` empty.
    """
    import blast_parse
    # Debug output (Python 2 print statements).
    print 0.0
    print output
    parse_dic = {}
    blast_parse.parse_alignment(output, parse_dic, 1)
    #now we need the information to check that not only that we hit the same id but in the corresponding region too
    new_id = id[:]
    new_id = string.replace(new_id,' ','')
    new_id = string.replace(new_id,'\t','')

    try:
        Group = Group_dic_same[new_id] # [[hit_id, hit_strand, qstart, qend,hstart,hend,...],[hit_id, hit_strand, qstart,qend, hstart, hend,...]]
        print Group
    except KeyError:
        # No group record for this query: nothing will be ignored below.
        Group = []
    GroupIds=[] # the ids
    for groupHit in Group:
        hitid = groupHit[0]
        GroupIds.append(hitid)
    
    ret = parse_dic.values()[0] #only one entry there for the entire result
    Query_len = ret[0]
    # Last query position at which a full-length oligo can still start.
    gene_end = Query_len - OLIGOLEN 
    hit_list = ret[1:-1]
    alignment = ret[-1]

    energy_dic={}
    # Template with one slot per possible oligo start; [0,-1] marks "untouched".
    empty_dic ={}
    for pos in range (0, gene_end+1):
        empty_dic[pos] =[ 0,-1] #energy, location

    if len(hit_list) == 0:
        return {}
    else:
        # NOTE(review): hitGroup is assigned but never read in this function.
        hitGroup =[]
        for j in range (0, len(hit_list)):
            hit_id = hit_list[j][0] #hit_id , hit_len 
            hit_id = string.replace(hit_id,' ','')
            hit_id = string.replace(hit_id,'\t','')
            
            # Everything after each ' Score =' marker is one HSP of this hit.
            aligns = string.split(alignment [j], ' Score =')[1:]
            # NOTE(review): coverArea is never appended to in this function;
            # unless WithInRange fills it, the check below always sees [].
            coverArea = []
            # Shallow copy is sufficient: slots are only ever replaced with
            # new lists below, never mutated in place.
            temp_energy_dic = empty_dic.copy()
            for i in range(0, len(aligns)): #individual alignment of individual hit
                plusline, minusline, p_start, m_start, p_end, m_end  =  blast_parse.parse_align(aligns[i])

                #we need to check if the hit can be ignored because it is the query itself
                #according to the Group_Same_Dic
                ignore =0
                if hit_id in GroupIds: # found id , possible
                    for hitInfo  in Group: #hitInfo is [hitid, hitstrand(+/-), qstart, qend, hitstart, hitend]
                        HITid, HITstrand, Qstart, Qend, Hstart, Hend = hitInfo
                        #check if id match
                        if HITid != hit_id:
                            continue

                        #check if strand information match
                        # NOTE(review): tolerant_diff is never read; the
                        # tolerance test further down hard-codes 5.
                        tolerant_diff = 5
                        if m_start < m_end: #+ strand in  the alignment
                            if HITstrand =='-':
                                continue
                        else:
                            if HITstrand =='+':
                                continue
                        # a key point is if the hit is itself it must be detected as a complete unit, which is almost the same as the hit in the GroupDicSame # + strand in the alignment
                        #if abs(Qstart - p_start)<=5  and abs(Qend- p_end)<=5 and abs(Hstart-m_start)<=5 and abs(Hend -m_end)<=5:
                        # Qstart, Qend : in group file
                        # p_end,hit_id: in blast
                        

                        # Case 1: the HSP lies completely inside the group
                        # record, on the query and on the hit (hit bounds are
                        # compared in the direction given by the strand).
                        if ((Qstart <=  p_start)  and ( Qend >=  p_end ))  :
                            if (( HITstrand =='+') and (Hstart <= m_start) and (Hend >= m_end)):
                                ignore = 1
                            elif (( HITstrand =='-') and (Hstart >= m_start) and (Hend <= m_end)):
                                ignore =1
                            if ignore:
                                print "ignored segment: query:",p_start,p_end,hit_id,":", Hstart,Hend
                                break

                        # Case 2: all four endpoints are within 5 positions
                        # of the group record.
                        if abs(Qstart - p_start)<=5  and abs(Qend- p_end)<=5 and abs(Hstart-m_start)<=5 and abs(Hend -m_end)<=5:
                            ignore = 1
                            print "ignored segment: query:",p_start,p_end,hit_id,":", Hstart,Hend
                            break

                if ignore :
                    continue

                if WithInRange(coverArea, p_start, p_end):
                    continue
                a_length = len(plusline)
                
                # Shift the query coordinates down by one (presumably 1-based
                # BLAST coordinates to 0-based dictionary keys -- confirm).
                p_start = p_start -1
                p_end = p_end -1
                # NOTE(review): hit coordinates are shifted by -1 on the plus
                # strand but by +1 on the minus strand -- confirm intended.
                if (m_start< m_end):
                    m_start = m_start-1
                    m_end = m_end  - 1
                else:
                    m_start = m_start +1
                    m_end = m_end +1
                
                energy_list = compute_energy2 (plusline, minusline)
                #start energy
                # Energy of the window starting at the first alignment column:
                # constant 3.4 plus the ADD term of each covered column.
                # NOTE(review): presumably 3.4 is an initiation energy -- confirm.
                start_energy = 3.4
                # Re-using the name i is harmless: the enclosing for loop
                # re-binds it from its own iterator on the next pass.
                for i in range (0, min(OLIGOLEN, a_length)):
                    start_energy = start_energy + energy_list[i][ADD]

                #position = p_start
                if (p_start <= gene_end):
                    if start_energy < temp_energy_dic[p_start][0]:
                        temp_energy_dic[p_start] = [ start_energy, m_start]

                poffset = 0 #check gap
                moffset = 0
                #windows starting inside the alignment, after its first column
                energy = start_energy
                for pos in range(1,a_length - WORD):
                    # A '-' in the column just left behind means that sequence
                    # did not advance there, so its real coordinate lags.
                    if pos != 0 and plusline[pos-1] == '-': #check gap
                        poffset = poffset - 1
                    if pos!=0 and minusline[pos-1] =='-':
                        moffset = moffset - 1
                        
                    position = p_start+pos
                    preal_position = position + poffset #compensate for gap

                    # Hit coordinate grows on the plus strand and shrinks on
                    # the minus strand as the window moves right.
                    if (m_start< m_end):
                        position = m_start+pos
                        mreal_position = position +moffset
                    else:
                        position = m_start -pos
                        mreal_position = position - moffset
                    
                    if (preal_position > gene_end) :#or (real_position < 0): #check bound
                        break

                    else:
                        # Slide by one column: apply the SUB term of the column
                        # leaving the window and, if one exists, the ADD term
                        # of the column entering it.
                        energy = energy + energy_list[pos-1][SUB]
                        end = pos+OLIGOLEN -1
                        if end < a_length:
                            energy = energy + energy_list[end][ADD]

                        if energy < temp_energy_dic[preal_position][0]:
                            temp_energy_dic[preal_position] = [energy,mreal_position]

                poffset = 0  #no gap
                moffset =0
                end = min(OLIGOLEN, a_length)

                energy = start_energy
                #the alignment is completely inside of the oligo selection
                # (alignment not longer than the oligo: every window that
                # contains the whole alignment gets the full start_energy)
                if (end == a_length):
                    start = p_start+end-OLIGOLEN  
                    # max(0,-(start)) keeps the first window at position >= 0.
                    for i in range(max(0,-(start) ), OLIGOLEN-end):
                        preal_position = start + i
                        if preal_position >gene_end:
                            break

                        if energy < temp_energy_dic[preal_position][0]:
                            if m_start < m_end:
                                m_location = m_start + preal_position - p_start
                            else:
                                m_location = m_start - (preal_position - p_start)
                            temp_energy_dic[preal_position] = [energy,m_location]

                #before the alignment 
                # Windows that start left of the alignment and cover only its
                # first pos columns; the ADD term of each column that drops
                # out is removed as the window moves further left.  The lower
                # bound keeps the overlap at WORD columns or more and the
                # window start at position >= 0.
                for pos in range(end-1, max(WORD, OLIGOLEN - p_start)-1 , -1): #version 2.7 correction
                    preal_position = p_start+ pos -OLIGOLEN
                    if preal_position > gene_end:
                        break
                    energy = energy - energy_list[pos][ADD]
                    if energy < temp_energy_dic[preal_position][0]:
                        if m_start < m_end:
                            m_location = m_start + preal_position - p_start
                        else:
                            m_location = m_start - (preal_position - p_start)
                        temp_energy_dic[preal_position] = [energy,m_location]
            
            # Merge this hit into the result: slot 0 tracks the minimum energy
            # over all hits, followed by one record per hit touching the position.
            for key in temp_energy_dic.keys():
                if temp_energy_dic[key] != [0,-1]:
                    try:
                        energy_dic[key][0] = min(energy_dic[key][0], temp_energy_dic[key][0])
                    except KeyError:
                        energy_dic[key] = [temp_energy_dic[key][0]]
                    energy_dic[key].append([hit_id, temp_energy_dic[key][0], temp_energy_dic[key][1]])                   
    return energy_dic #{pos:[min_energy,[hit_id, energy, location],[hit_id, energy, location],...], ....}