def parsing_each_query(content,fout):
    #1. have to be 100 similarity
    #2. different segment add up together to be the complete length
    #3. check strand constrains
    #4. check no overlapping constrains
    content = string.strip(string.join(content,''))
    if content =='':
        return
    
    parse_dic = blast_parse.parse_query_alignment(content)
    
    query = parse_dic.keys()[0]
    print query,
    hit_candidate_segment =[] #[[hitid, strand, pstart, pend, hstart, hend],...]
    group_list =[]
    query_len= parse_dic[query][0]
    hit_dic = parse_dic[query][1] #only two
    hit_list = hit_dic.keys()

    for hitid in hit_list:
        #to get the id
        hit_align_list = hit_dic[hitid]

        first_100 =0
        for seg_list in hit_align_list:
            strand, qstart, qend, hstart, hend, percentage = seg_list
            if percentage < 100 :#99.5:
                continue
            if abs(qend - qstart) +1 <query_len:
                continue
            hit_candidate_segment.append([hitid, strand,qstart,qend, hstart,hend])
         
    #write to file of the group list
    fout.write('['+query+']\n')
    #write the query itself
    fout.write(query+'\n')
    fout.write("%s %s %s %s %s\n" % ( '+','1',str(query_len),'1',str(query_len)))
    hit_dic={}

    if gfdir !="":
        string_start = len(gfdir)+1
    else:
        string_start = 0
        
    for entry in hit_candidate_segment:
        hitid, hitstrand, qstart,qend, hstart,hend = entry
        hitid = hitid[string_start:]
        if gfdir !="":
            string_end = string.find(hitid, ".nib:")
            hitid = hitid[:string_end]
        fout.write(hitid+'\n')
        fout.write("%s %s %s %s %s\n" % (hitstrand, str(qstart), str(qend), str(hstart), str(hend)))
    fout.write('\n')
    print "done"
# Example no. 2
# 0
def parsing_each_query(content,fout):
    #1. have to be 100 similarity
    #2. different segment add up together to be the complete length
    #3. check strand constrains
    #4. check no overlapping constrains
    content = string.strip(string.join(content,''))
    if content =='':
        return
    
    parse_dic = blast_parse.parse_query_alignment(content)
    
    query = parse_dic.keys()[0]
    print query,
    hit_candidate_segment =[] #[[hitid, strand, pstart, pend, hstart, hend],...]
    group_list =[]
    query_len= parse_dic[query][0]
    hit_dic = parse_dic[query][1] #only two
    hit_list = hit_dic.keys()

    for hitid in hit_list:
        #to get the id
        hit_align_list = hit_dic[hitid]

        first_100 =0
        for seg_list in hit_align_list:
            strand, qstart, qend, hstart, hend, percentage = seg_list
            if percentage < 100 :#99.5:
                continue

            if first_100 == 0:
                first_100 =1
                #inital test to see if the hits is likely to be real - large first hsp 100%
                if abs(qend - qstart) <50:
                    break
            hit_candidate_segment.append([hitid, strand,qstart,qend, hstart,hend])

    parse_dic ={} #save memory only one enty in the parse_dic
        
    #generate the combination constrains dictionary using the list position as the segment
    #id to form the dic as id:okid_list
    rule_dic, must_cover_list = rules(hit_candidate_segment,query_len)

    #generate all the combinations that do not conflict with the rule_dic
    combo_list = generate_combination(rule_dic, must_cover_list, hit_candidate_segment, query_len)
    #now check if any combination of the segment make a complete query
    # also need to check strand, overlapping constrains.
    #[[hitid, strand, pstart, pend, hstart, hend],...]
    #use 1,2,3,... segment of the list to reconstruct and then check constrains

    for i in range(0, len(combo_list)):
        candidate_list =[]
        for pos in combo_list[i]:
            candidate_list.append( hit_candidate_segment[pos])
        passConstrain = constrains(query_len,candidate_list)
        if passConstrain:
            group_list.extend(candidate_list)
         
    #write to file of the group list
    fout.write('['+query+']\n')
    #write the query itself
    fout.write(query+'\n')
    fout.write("%s %s %s %s %s\n" % ( '+','1',str(query_len),'1',str(query_len)))
    hit_dic={}

    if gfdir !="":
        string_start = len(gfdir)+1
    else:
        string_start = 0
        
    for entry in group_list:
        hitid, hitstrand, qstart,qend, hstart,hend = entry
        hitid = hitid[string_start:]
        if gfdir !="":
            string_end = string.find(hitid, ".nib:")
            hitid = hitid[:string_end]
        fout.write(hitid+'\n')
        fout.write("%s %s %s %s %s\n" % (hitstrand, str(qstart), str(qend), str(hstart), str(hend)))
    fout.write('\n')
    print "done"