def oligo_dup_output(out, parse_dic):
    """Write one line of strongly binding hits for each query in parse_dic.

    For every key of ``parse_dic`` (visited in sorted order) whose record has
    more than one element, the record is read as
    ``[<first element>, hit, hit, ..., alignments]``: ``record[1:-1]`` are the
    hits, each a ``(hit_id, hit_score)`` pair, and ``record[-1]`` is a
    sequence of alignment texts indexed in parallel with the hits.

    For each hit, the text after the first ``" Score ="`` marker (the first
    HSP) is parsed with ``blast_parse.parse_align`` and the two aligned
    strings are handed to the external program ``./code/energy``.  The last
    line of that program's output is read as the energy.  Scanning of a
    record stops at the first hit whose energy is ``>= -10``; the hits seen
    before it are written to ``out`` as ``hit_id<TAB>energy<TAB>`` in
    ascending (energy, hit_id) order, followed by a newline.

    Args:
        out: object with a ``write(str)`` method.
        parse_dic: mapping of query key -> record as described above.

    Raises:
        OSError: if ``./code/energy`` cannot be launched.
        ValueError: if the helper's output does not end in a number.
        IndexError: if an alignment text has no ``" Score ="`` marker.
    """
    # Function-scope import keeps this block self-contained; the module's
    # import section is not visible from here.
    import subprocess

    for key in sorted(parse_dic):
        record = parse_dic[key]
        if len(record) <= 1:
            continue

        hit_list = record[1:-1]
        alignment = record[-1]
        scored_hits = []
        for i, hit in enumerate(hit_list):
            hit_id, _hit_score = hit
            first_hsp = alignment[i].split(" Score =")[1]  # 1st HSP only
            (plusline, minusline,
             _p_start, _m_start, _p_end, _m_end) = blast_parse.parse_align(first_hsp)

            # Pass the aligned strings directly as argv entries.  This
            # replaces the former os.environ + shell round trip, so nothing
            # is left behind in the process environment and no shell quoting
            # is involved.
            proc = subprocess.Popen(
                ["./code/energy", plusline, minusline, "1"],
                stdout=subprocess.PIPE,
                universal_newlines=True
            )
            ret_data = proc.communicate()[0]
            energy = float(ret_data.strip().split("\n")[-1])

            # Stop at the first weak hit; later hits of this record are not
            # evaluated at all.
            if energy >= -10:
                break
            scored_hits.append((energy, hit_id))

        scored_hits.sort()
        for energy, hit_id in scored_hits:
            out.write(hit_id + "\t" + str(energy) + "\t")
        # NOTE(review): the newline is emitted only for records that passed
        # the length check above -- confirm this matches the intended
        # one-line-per-query layout.
        out.write("\n")
def Parse_Blast (id , output, Group_dic_same, OLIGOLEN):
    """Map each candidate oligo start position on the query to its cross-hybridisation energies.

    The BLAST report text ``output`` is parsed with
    ``blast_parse.parse_alignment`` and the first value of the resulting dict
    is used: element 0 is taken as the query length, elements ``1:-1`` as the
    hit list, and the last element as a list of alignment texts indexed in
    parallel with the hits.

    For every hit, each alignment block (the pieces following ``' Score ='``)
    is examined.  Blocks that match a region listed for this query in
    ``Group_dic_same`` are skipped and reported on stdout.  For the remaining
    blocks, energies are accumulated from ``compute_energy2`` for oligo start
    positions that overlap the block, and the lowest energy per position is
    kept for that hit.

    Parameters:
        id             -- query identifier; spaces and tabs are removed before
                          it is used as a key into ``Group_dic_same``.
        output         -- BLAST output passed to ``blast_parse.parse_alignment``.
        Group_dic_same -- dict: stripped id -> list of
                          ``[hit_id, strand('+'/'-'), qstart, qend, hstart, hend]``.
                          A missing key is treated as an empty list.
        OLIGOLEN       -- oligo length; positions ``0 .. Query_len - OLIGOLEN``
                          are considered.

    Returns:
        ``{}`` when there are no hits; otherwise a dict
        ``{pos: [min_energy, [hit_id, energy, location], ...]}`` containing
        only positions for which some hit produced an entry.

    Relies on names that are not defined in this block: ``string``,
    ``WithInRange``, ``compute_energy2``, ``ADD``, ``SUB`` and ``WORD``.
    Also prints diagnostic output (``0.0``, ``output``, the group list, and
    each ignored segment).

    NOTE(review): the nesting below was reconstructed from a source whose
    indentation had been lost -- confirm against a pristine copy.
    """
    import blast_parse
    print 0.0
    print output
    parse_dic = {}
    blast_parse.parse_alignment(output, parse_dic, 1)
    # Besides hitting the same id, a self-hit must also fall in the
    # corresponding region, so fetch the region list for this query.
    new_id = id[:]
    new_id = string.replace(new_id,' ','')
    new_id = string.replace(new_id,'\t','')
    try:
        Group = Group_dic_same[new_id]  # [[hit_id, hit_strand, qstart, qend, hstart, hend, ...], ...]
        print Group
    except KeyError:
        Group = []
    GroupIds=[]  # the hit ids listed in Group, for a quick membership test
    for groupHit in Group:
        hitid = groupHit[0]
        GroupIds.append(hitid)
    ret = parse_dic.values()[0]  # only one entry is expected for the entire result
    Query_len = ret[0]
    gene_end = Query_len - OLIGOLEN  # last position at which a full oligo still fits
    hit_list = ret[1:-1]
    alignment = ret[-1]
    energy_dic={}
    # Template of "no information yet" entries, one per candidate position.
    empty_dic ={}
    for pos in range (0, gene_end+1):
        empty_dic[pos] =[ 0,-1]  # energy, location
    if len(hit_list) == 0:
        return {}
    else:
        hitGroup =[]  # not used below
        for j in range (0, len(hit_list)):
            hit_id = hit_list[j][0]  # hit entries start with the hit id (hit_id, hit_len)
            hit_id = string.replace(hit_id,' ','')
            hit_id = string.replace(hit_id,'\t','')
            aligns = string.split(alignment [j], ' Score =')[1:]
            coverArea = []
            # Shallow copy is sufficient: entries are replaced below, never
            # modified in place.
            temp_energy_dic = empty_dic.copy()
            for i in range(0, len(aligns)):  # individual alignment of an individual hit
                plusline, minusline, p_start, m_start, p_end, m_end = blast_parse.parse_align(aligns[i])
                # Check whether this alignment can be ignored because it is
                # the query itself according to Group_dic_same.
                ignore =0
                if hit_id in GroupIds:  # id found, so a self-hit is possible
                    for hitInfo in Group:  # hitInfo is [hitid, hitstrand(+/-), qstart, qend, hitstart, hitend]
                        HITid, HITstrand, Qstart, Qend, Hstart, Hend = hitInfo
                        # check that the id matches
                        if HITid != hit_id:
                            continue
                        # check that the strand information matches
                        tolerant_diff = 5  # not referenced; the tolerance below is the literal 5
                        if m_start < m_end:  # + strand in the alignment
                            if HITstrand =='-':
                                continue
                        else:
                            if HITstrand =='+':
                                continue
                        # Key point: a self-hit must be detected as a complete
                        # unit, i.e. almost the same as the entry in
                        # Group_dic_same.
                        # Qstart, Qend : from the group file
                        # p_start, p_end, m_start, m_end : from BLAST
                        # First test: the alignment lies entirely inside the
                        # listed region on both query and hit.
                        if ((Qstart <= p_start) and ( Qend >= p_end )) :
                            if (( HITstrand =='+') and (Hstart <= m_start) and (Hend >= m_end)):
                                ignore = 1
                            elif (( HITstrand =='-') and (Hstart >= m_start) and (Hend <= m_end)):
                                ignore =1
                        if ignore:
                            print "ignored segment: query:",p_start,p_end,hit_id,":", Hstart,Hend
                            break
                        # Second test: all four end points agree with the
                        # listed region to within 5 positions.
                        if abs(Qstart - p_start)<=5 and abs(Qend- p_end)<=5 and abs(Hstart-m_start)<=5 and abs(Hend -m_end)<=5:
                            ignore = 1
                            print "ignored segment: query:",p_start,p_end,hit_id,":", Hstart,Hend
                            break
                if ignore :
                    continue
                # NOTE(review): coverArea is never appended to in this block;
                # presumably WithInRange updates it -- confirm.
                if WithInRange(coverArea, p_start, p_end):
                    continue
                a_length = len(plusline)
                # Shift the parsed coordinates by one; for a reversed hit
                # (m_start >= m_end) the shift goes the other way.
                # NOTE(review): presumably a 1-based to 0-based conversion -- confirm.
                p_start = p_start -1
                p_end = p_end -1
                if (m_start< m_end):
                    m_start = m_start-1
                    m_end = m_end - 1
                else:
                    m_start = m_start +1
                    m_end = m_end +1
                energy_list = compute_energy2 (plusline, minusline)
                # Start energy: constant 3.4 plus the ADD term of the first
                # min(OLIGOLEN, a_length) alignment columns.
                # NOTE(review): meaning of the 3.4 constant is not shown here.
                start_energy = 3.4
                for i in range (0, min(OLIGOLEN, a_length)):
                    start_energy = start_energy + energy_list[i][ADD]
                # Oligo starting exactly at the alignment start.
                if (p_start <= gene_end):
                    if start_energy < temp_energy_dic[p_start][0]:
                        temp_energy_dic[p_start] = [ start_energy, m_start]
                poffset = 0  # gap correction for the query line
                moffset = 0  # gap correction for the hit line
                # Oligo start positions after the start of the alignment.
                energy = start_energy
                for pos in range(1,a_length - WORD):
                    if pos != 0 and plusline[pos-1] == '-':  # gap in the query line
                        poffset = poffset - 1
                    if pos!=0 and minusline[pos-1] =='-':  # gap in the hit line
                        moffset = moffset - 1
                    position = p_start+pos
                    preal_position = position + poffset  # compensate for gaps
                    if (m_start< m_end):
                        position = m_start+pos
                        mreal_position = position +moffset
                    else:
                        position = m_start -pos
                        mreal_position = position - moffset
                    if (preal_position > gene_end) :#or (real_position < 0): #check bound
                        break
                    else:
                        # Advance by one column: apply the SUB term of the
                        # column left behind and, if still inside the
                        # alignment, the ADD term of the column newly covered.
                        energy = energy + energy_list[pos-1][SUB]
                        end = pos+OLIGOLEN -1
                        if end < a_length:
                            energy = energy + energy_list[end][ADD]
                        if energy < temp_energy_dic[preal_position][0]:
                            temp_energy_dic[preal_position] = [energy,mreal_position]
                poffset = 0  # no gap handling from here on
                moffset =0
                end = min(OLIGOLEN, a_length)
                energy = start_energy
                # The alignment is shorter than the oligo, so it can lie
                # completely inside the oligo: those start positions all get
                # the full start energy.
                if (end == a_length):
                    start = p_start+end-OLIGOLEN
                    for i in range(max(0,-(start) ), OLIGOLEN-end):
                        preal_position = start + i
                        if preal_position >gene_end:
                            break
                        if energy < temp_energy_dic[preal_position][0]:
                            if m_start < m_end:
                                m_location = m_start + preal_position - p_start
                            else:
                                m_location = m_start - (preal_position - p_start)
                            temp_energy_dic[preal_position] = [energy,m_location]
                # Oligo start positions before the alignment: the oligo covers
                # only the first `pos` columns, so the ADD terms of the columns
                # no longer covered are removed one at a time.
                for pos in range(end-1, max(WORD, OLIGOLEN - p_start)-1 , -1):  # version 2.7 correction
                    preal_position = p_start+ pos -OLIGOLEN
                    if preal_position > gene_end:
                        break
                    energy = energy - energy_list[pos][ADD]
                    if energy < temp_energy_dic[preal_position][0]:
                        if m_start < m_end:
                            m_location = m_start + preal_position - p_start
                        else:
                            m_location = m_start - (preal_position - p_start)
                        temp_energy_dic[preal_position] = [energy,m_location]
            # Merge this hit's per-position results into the overall result;
            # positions still holding the [0, -1] template are skipped.
            for key in temp_energy_dic.keys():
                if temp_energy_dic[key] != [0,-1]:
                    try:
                        energy_dic[key][0] = min(energy_dic[key][0], temp_energy_dic[key][0])
                    except KeyError:
                        energy_dic[key] = [temp_energy_dic[key][0]]
                    energy_dic[key].append([hit_id, temp_energy_dic[key][0], temp_energy_dic[key][1]])
        return energy_dic  # {pos: [min_energy, [hit_id, energy, location], [hit_id, energy, location], ...], ...}