def _create_hsp(hid, qid, hspd):
    """Build one HSP object from the given parsed HSP values.

    One HSPFragment is created per entry of ``hspd['query_ranges']``; the
    entry at the same position of ``hspd['hit_ranges']`` supplies the hit
    coordinates.  A fragment whose 'similarity' annotation contains '#' is
    handed to ``_split_fragment`` and the fragments it returns are used in
    its place.

    Arguments:
     - hid  - hit identifier, passed through to HSPFragment.
     - qid  - query identifier, passed through to HSPFragment.
     - hspd - dictionary of parsed values.  Keys read here: 'query_ranges',
       'hit_ranges', 'query_strand', 'hit_strand' (required) and 'hit',
       'query', 'aln_annotation' plus the HSP-level attributes copied at
       the end (optional).

    Returns the HSP object wrapping all created fragments.

    """
    # these lookups do not depend on the fragment index, so do them once
    hit_seqs = hspd.get('hit')
    query_seqs = hspd.get('query')
    annotations = hspd.get('aln_annotation', {})

    fragments = []
    # query_ranges drives the iteration; hit_ranges is indexed in parallel
    for pos, query_range in enumerate(hspd['query_ranges']):
        # sequences default to the empty string when none were parsed
        hit_seq = hit_seqs[pos] if hit_seqs is not None else ''
        query_seq = query_seqs[pos] if query_seqs is not None else ''
        fragment = HSPFragment(hid, qid, hit=hit_seq, query=query_seq)

        # coordinates
        fragment.query_start = query_range[0]
        fragment.query_end = query_range[1]
        hit_range = hspd['hit_ranges'][pos]
        fragment.hit_start = hit_range[0]
        fragment.hit_end = hit_range[1]

        # alignment annotation; copying stops quietly at the first
        # annotation that has no entry for this fragment
        try:
            for name, per_fragment in annotations.items():
                fragment.aln_annotation[name] = per_fragment[pos]
        except IndexError:
            pass

        # strands
        fragment.query_strand = hspd['query_strand']
        fragment.hit_strand = hspd['hit_strand']

        # a '#' in the similarity line means the fragment has to be split;
        # the resulting pieces replace it in the list
        similarity = fragment.aln_annotation.get('similarity')
        if similarity is not None and '#' in similarity:
            fragments.extend(_split_fragment(fragment))
            continue

        # try to set frame if there are translations in the alignment
        has_translation = (
            len(fragment.aln_annotation) > 1
            or fragment.query_strand == 0
            or ('vulgar_comp' in hspd
                and re.search(_RE_TRANS, hspd['vulgar_comp']))
        )
        if has_translation:
            _set_frame(fragment)

        fragments.append(fragment)

    # if the query is protein, the hit and query sequences need to be
    # changed from three-letter amino acid codes to one letter, with their
    # coordinates adjusted accordingly
    if len(fragments[0].aln_annotation) == 2:  # 2 annotations == protein query
        fragments = _adjust_aa_seq(fragments)

    hsp = HSP(fragments)
    # copy over whichever hsp-specific attributes were parsed
    hsp_attrs = ('score', 'hit_split_codons', 'query_split_codons', 'model',
                 'vulgar_comp', 'cigar_comp', 'alphabet')
    for name in hsp_attrs:
        if name in hspd:
            setattr(hsp, name, hspd[name])

    return hsp
def _create_hsp(hid, qid, hspd):
    """Build one HSP object from the given parsed HSP values.

    One HSPFragment is created per entry of ``hspd['query_ranges']``; the
    entry at the same position of ``hspd['hit_ranges']`` supplies the hit
    coordinates.  A fragment whose 'similarity' annotation contains '#' is
    handed to ``_split_fragment`` and the fragments it returns are used in
    its place.

    Arguments:
     - hid  - hit identifier, passed through to HSPFragment.
     - qid  - query identifier, passed through to HSPFragment.
     - hspd - dictionary of parsed values.  Keys read here: 'query_ranges',
       'hit_ranges', 'query_strand', 'hit_strand' (required) and 'hit',
       'query', 'aln_annotation' plus the HSP-level attributes copied at
       the end (optional).

    Returns the HSP object wrapping all created fragments.

    """
    # these lookups do not depend on the fragment index, so do them once
    hit_seqs = hspd.get('hit')
    query_seqs = hspd.get('query')
    annotations = hspd.get('aln_annotation', {})

    fragments = []
    # query_ranges drives the iteration; hit_ranges is indexed in parallel
    for pos, query_range in enumerate(hspd['query_ranges']):
        # sequences default to the empty string when none were parsed
        hit_seq = hit_seqs[pos] if hit_seqs is not None else ''
        query_seq = query_seqs[pos] if query_seqs is not None else ''
        fragment = HSPFragment(hid, qid, hit=hit_seq, query=query_seq)

        # coordinates
        fragment.query_start = query_range[0]
        fragment.query_end = query_range[1]
        hit_range = hspd['hit_ranges'][pos]
        fragment.hit_start = hit_range[0]
        fragment.hit_end = hit_range[1]

        # alignment annotation; copying stops quietly at the first
        # annotation that has no entry for this fragment
        try:
            for name, per_fragment in annotations.items():
                fragment.aln_annotation[name] = per_fragment[pos]
        except IndexError:
            pass

        # strands
        fragment.query_strand = hspd['query_strand']
        fragment.hit_strand = hspd['hit_strand']

        # a '#' in the similarity line means the fragment has to be split;
        # the resulting pieces replace it in the list
        similarity = fragment.aln_annotation.get('similarity')
        if similarity is not None and '#' in similarity:
            fragments.extend(_split_fragment(fragment))
            continue

        # try to set frame if there are translations in the alignment
        has_translation = (
            len(fragment.aln_annotation) > 1
            or fragment.query_strand == 0
            or ('vulgar_comp' in hspd
                and re.search(_RE_TRANS, hspd['vulgar_comp']))
        )
        if has_translation:
            _set_frame(fragment)

        fragments.append(fragment)

    # if the query is protein, the hit and query sequences need to be
    # changed from three-letter amino acid codes to one letter, with their
    # coordinates adjusted accordingly
    if len(fragments[0].aln_annotation) == 2:  # 2 annotations == protein query
        fragments = _adjust_aa_seq(fragments)

    hsp = HSP(fragments)
    # copy over whichever hsp-specific attributes were parsed
    hsp_attrs = ('score', 'hit_split_codons', 'query_split_codons', 'model',
                 'vulgar_comp', 'cigar_comp', 'alphabet')
    for name in hsp_attrs:
        if name in hspd:
            setattr(hsp, name, hspd[name])

    return hsp
def _create_hsp(hid, qid, psl):
    """Build one HSP object from the parsed values in ``psl``.

    One HSPFragment is created per alignment block.  Block coordinates are
    derived from the block starts and ``psl['blocksizes']``; the resulting
    HSP is then checked against the overall start/end values in ``psl``.

    Arguments:
     - hid - hit identifier, passed through to HSPFragment.
     - qid - query identifier, passed through to HSPFragment.
     - psl - dictionary of parsed values.  Keys read here: 'strand',
       'qstarts', 'tstarts', 'blocksizes', 'qsize', 'tsize', 'qstart',
       'qend', 'tstart', 'tend', the match/insert counters, and
       optionally 'tseqs' and 'qseqs'.

    Returns the HSP object.  An AssertionError is raised by the internal
    consistency checks when block counts or coordinates do not agree.

    """
    is_protein = _is_protein(psl)

    # query strand: a protein query has strand 0, otherwise the first
    # character of the strand field decides
    if is_protein:
        query_strand = 0
    elif psl['strand'][0] == '+':
        query_strand = 1
    else:
        query_strand = -1

    # hit strand: second character of the strand field, if there is one
    try:
        hit_sign = psl['strand'][1]
    except IndexError:
        hit_strand = 1  # hit strand defaults to plus
    else:
        hit_strand = 1 if hit_sign == '+' else -1

    block_sizes = psl['blocksizes']

    # query block starts
    query_starts = _reorient_starts(psl['qstarts'], block_sizes,
                                    psl['qsize'], query_strand)
    # hit block starts are only reoriented when a hit strand was given
    if len(psl['strand']) == 2:
        hit_starts = _reorient_starts(psl['tstarts'], block_sizes,
                                      psl['tsize'], hit_strand)
    else:
        hit_starts = psl['tstarts']

    # set query and hit coords
    # this assumes each block has no gaps (which seems to be the case)
    assert len(query_starts) == len(hit_starts) == len(block_sizes)
    query_ranges = [(start, start + size)
                    for start, size in zip(query_starts, block_sizes)]
    hit_ranges = [(start, start + size)
                  for start, size in zip(hit_starts, block_sizes)]

    # check length of sequences and coordinates, all must match
    if 'tseqs' in psl and 'qseqs' in psl:
        assert len(psl['tseqs']) == len(psl['qseqs']) == \
            len(query_ranges) == len(hit_ranges)
    else:
        assert len(query_ranges) == len(hit_ranges)

    # sequence lists are the same for every block, so fetch them once
    hit_seqs = psl.get('tseqs')
    query_seqs = psl.get('qseqs')

    fragments = []
    # iterating over query_ranges, but hit_ranges works just as well
    for pos, query_range in enumerate(query_ranges):
        # missing or empty sequence lists give empty sequences
        hit_seq = hit_seqs[pos] if hit_seqs else ''
        query_seq = query_seqs[pos] if query_seqs else ''
        fragment = HSPFragment(hid, qid, hit=hit_seq, query=query_seq)
        # set alphabet
        fragment.alphabet = generic_dna
        # set coordinates
        fragment.query_start = query_range[0]
        fragment.query_end = query_range[1]
        hit_range = hit_ranges[pos]
        fragment.hit_start = hit_range[0]
        fragment.hit_end = hit_range[1]
        # and strands
        fragment.query_strand = query_strand
        fragment.hit_strand = hit_strand
        fragments.append(fragment)

    # create hsp object
    hsp = HSP(fragments)

    # check if start and end are set correctly
    assert hsp.query_start == psl['qstart']
    assert hsp.query_end == psl['qend']
    assert hsp.hit_start == psl['tstart']
    assert hsp.hit_end == psl['tend']
    # and check block spans as well
    assert hsp.query_span_all == hsp.hit_span_all == psl['blocksizes']

    # attributes copied straight from the parsed values
    direct_attrs = (
        ('match_num', 'matches'),
        ('mismatch_num', 'mismatches'),
        ('match_rep_num', 'repmatches'),
        ('n_num', 'ncount'),
        ('query_gapopen_num', 'qnuminsert'),
        ('query_gap_num', 'qbaseinsert'),
        ('hit_gapopen_num', 'tnuminsert'),
        ('hit_gap_num', 'tbaseinsert'),
    )
    for attr_name, psl_key in direct_attrs:
        setattr(hsp, attr_name, psl[psl_key])

    # attributes derived from several parsed values
    hsp.ident_num = psl['matches'] + psl['repmatches']
    hsp.gapopen_num = psl['qnuminsert'] + psl['tnuminsert']
    hsp.gap_num = psl['qbaseinsert'] + psl['tbaseinsert']
    hsp.query_is_protein = is_protein
    hsp.ident_pct = 100.0 - _calc_millibad(psl, is_protein) * 0.1
    hsp.score = _calc_score(psl, is_protein)

    # helper flag, for writing
    hsp._has_hit_strand = len(psl['strand']) == 2

    return hsp
def _create_hsp(hid, qid, psl):
    """Build one HSP object from the parsed values in ``psl``.

    One HSPFragment is created per alignment block.  Block coordinates are
    derived from the block starts and ``psl['blocksizes']``; the resulting
    HSP is then checked against the overall start/end values in ``psl``.

    Arguments:
     - hid - hit identifier, passed through to HSPFragment.
     - qid - query identifier, passed through to HSPFragment.
     - psl - dictionary of parsed values.  Keys read here: 'strand',
       'qstarts', 'tstarts', 'blocksizes', 'qsize', 'tsize', 'qstart',
       'qend', 'tstart', 'tend', the match/insert counters, and
       optionally 'tseqs' and 'qseqs'.

    Returns the HSP object.  An AssertionError is raised by the internal
    consistency checks when block counts or coordinates do not agree.

    """
    is_protein = _is_protein(psl)

    # query strand: a protein query has strand 0, otherwise the first
    # character of the strand field decides
    if is_protein:
        query_strand = 0
    elif psl['strand'][0] == '+':
        query_strand = 1
    else:
        query_strand = -1

    # hit strand: second character of the strand field, if there is one
    try:
        hit_sign = psl['strand'][1]
    except IndexError:
        hit_strand = 1  # hit strand defaults to plus
    else:
        hit_strand = 1 if hit_sign == '+' else -1

    block_sizes = psl['blocksizes']

    # query block starts
    query_starts = _reorient_starts(psl['qstarts'], block_sizes,
                                    psl['qsize'], query_strand)
    # hit block starts are only reoriented when a hit strand was given
    if len(psl['strand']) == 2:
        hit_starts = _reorient_starts(psl['tstarts'], block_sizes,
                                      psl['tsize'], hit_strand)
    else:
        hit_starts = psl['tstarts']

    # set query and hit coords
    # this assumes each block has no gaps (which seems to be the case)
    assert len(query_starts) == len(hit_starts) == len(block_sizes)
    query_ranges = [(start, start + size)
                    for start, size in zip(query_starts, block_sizes)]
    hit_ranges = [(start, start + size)
                  for start, size in zip(hit_starts, block_sizes)]

    # check length of sequences and coordinates, all must match
    if 'tseqs' in psl and 'qseqs' in psl:
        assert len(psl['tseqs']) == len(psl['qseqs']) == \
            len(query_ranges) == len(hit_ranges)
    else:
        assert len(query_ranges) == len(hit_ranges)

    # sequence lists are the same for every block, so fetch them once
    hit_seqs = psl.get('tseqs')
    query_seqs = psl.get('qseqs')

    fragments = []
    # iterating over query_ranges, but hit_ranges works just as well
    for pos, query_range in enumerate(query_ranges):
        # missing or empty sequence lists give empty sequences
        hit_seq = hit_seqs[pos] if hit_seqs else ''
        query_seq = query_seqs[pos] if query_seqs else ''
        fragment = HSPFragment(hid, qid, hit=hit_seq, query=query_seq)
        # set alphabet
        fragment.alphabet = generic_dna
        # set coordinates
        fragment.query_start = query_range[0]
        fragment.query_end = query_range[1]
        hit_range = hit_ranges[pos]
        fragment.hit_start = hit_range[0]
        fragment.hit_end = hit_range[1]
        # and strands
        fragment.query_strand = query_strand
        fragment.hit_strand = hit_strand
        fragments.append(fragment)

    # create hsp object
    hsp = HSP(fragments)

    # check if start and end are set correctly
    assert hsp.query_start == psl['qstart']
    assert hsp.query_end == psl['qend']
    assert hsp.hit_start == psl['tstart']
    assert hsp.hit_end == psl['tend']
    # and check block spans as well
    assert hsp.query_span_all == hsp.hit_span_all == psl['blocksizes']

    # attributes copied straight from the parsed values
    direct_attrs = (
        ('match_num', 'matches'),
        ('mismatch_num', 'mismatches'),
        ('match_rep_num', 'repmatches'),
        ('n_num', 'ncount'),
        ('query_gapopen_num', 'qnuminsert'),
        ('query_gap_num', 'qbaseinsert'),
        ('hit_gapopen_num', 'tnuminsert'),
        ('hit_gap_num', 'tbaseinsert'),
    )
    for attr_name, psl_key in direct_attrs:
        setattr(hsp, attr_name, psl[psl_key])

    # attributes derived from several parsed values
    hsp.ident_num = psl['matches'] + psl['repmatches']
    hsp.gapopen_num = psl['qnuminsert'] + psl['tnuminsert']
    hsp.gap_num = psl['qbaseinsert'] + psl['tbaseinsert']
    hsp.query_is_protein = is_protein
    hsp.ident_pct = 100.0 - _calc_millibad(psl, is_protein) * 0.1
    hsp.score = _calc_score(psl, is_protein)

    # helper flag, for writing
    hsp._has_hit_strand = len(psl['strand']) == 2

    return hsp