def _create_hsp(hid, qid, psl):
    """Build an HSP object from one parsed PSL row (PRIVATE).

    ``hid`` and ``qid`` are handed unchanged to every HSPFragment; ``psl``
    is the mapping of parsed PSL columns. One fragment is created per
    alignment block, the fragments are wrapped in an HSP, the resulting
    coordinates are checked against the PSL columns, and the match / gap
    statistics are copied onto the HSP, which is returned.
    """
    query_is_protein = _is_protein(psl)
    strand = psl["strand"]

    # Protein queries carry no strand (0); otherwise the first strand
    # character decides between plus (1) and minus (-1).
    if query_is_protein:
        query_strand = 0
    elif strand[0] == "+":
        query_strand = 1
    else:
        query_strand = -1

    # The hit strand is the optional second character; plus when absent.
    try:
        hit_strand_char = strand[1]
    except IndexError:
        hit_strand = 1
    else:
        hit_strand = 1 if hit_strand_char == "+" else -1

    has_hit_strand = len(strand) == 2
    # Hit blocks are three times as long as query blocks for protein queries.
    hit_scale = 3 if query_is_protein else 1
    block_sizes = psl["blocksizes"]

    query_starts = _reorient_starts(
        psl["qstarts"], block_sizes, psl["qsize"], query_strand
    )
    if has_hit_strand:
        hit_starts = _reorient_starts(
            psl["tstarts"],
            [hit_scale * size for size in block_sizes],
            psl["tsize"],
            hit_strand,
        )
    else:
        hit_starts = psl["tstarts"]

    # Blocks are assumed to be gap-free, so each range is start + block size.
    assert len(query_starts) == len(hit_starts) == len(block_sizes)
    query_ranges = [
        (start, start + size) for start, size in zip(query_starts, block_sizes)
    ]
    hit_ranges = [
        (start, start + size * hit_scale)
        for start, size in zip(hit_starts, block_sizes)
    ]

    # Every per-block collection must have the same number of entries.
    lengths = {len(query_ranges), len(hit_ranges)}
    if "tseqs" in psl and "qseqs" in psl:
        lengths.update((len(psl["tseqs"]), len(psl["qseqs"])))
    assert len(lengths) == 1

    hit_seqs = psl.get("tseqs")
    query_seqs = psl.get("qseqs")
    fragments = []
    for idx, (query_start, query_end) in enumerate(query_ranges):
        fragment = HSPFragment(
            hid,
            qid,
            hit=hit_seqs[idx] if hit_seqs else "",
            query=query_seqs[idx] if query_seqs else "",
        )
        fragment.alphabet = generic_dna
        fragment.query_start = query_start
        fragment.query_end = query_end
        fragment.hit_start, fragment.hit_end = hit_ranges[idx]
        fragment.query_strand = query_strand
        fragment.hit_strand = hit_strand
        fragments.append(fragment)

    hsp = HSP(fragments)

    # The assembled HSP must agree with the overall PSL coordinates ...
    for attr_name, column in (
        ("query_start", "qstart"),
        ("query_end", "qend"),
        ("hit_start", "tstart"),
        ("hit_end", "tend"),
    ):
        assert getattr(hsp, attr_name) == psl[column]
    # ... and with the per-block sizes once the hit spans are scaled back.
    scaled_hit_spans = [span / hit_scale for span in hsp.hit_span_all]
    assert scaled_hit_spans == hsp.query_span_all == block_sizes

    # Statistics copied straight from a single PSL column.
    for attr_name, column in (
        ("match_num", "matches"),
        ("mismatch_num", "mismatches"),
        ("match_rep_num", "repmatches"),
        ("n_num", "ncount"),
        ("query_gapopen_num", "qnuminsert"),
        ("query_gap_num", "qbaseinsert"),
        ("hit_gapopen_num", "tnuminsert"),
        ("hit_gap_num", "tbaseinsert"),
    ):
        setattr(hsp, attr_name, psl[column])

    # Statistics derived from more than one column.
    hsp.ident_num = psl["matches"] + psl["repmatches"]
    hsp.gapopen_num = psl["qnuminsert"] + psl["tnuminsert"]
    hsp.gap_num = psl["qbaseinsert"] + psl["tbaseinsert"]

    hsp.query_is_protein = query_is_protein
    hsp.ident_pct = 100.0 - _calc_millibad(psl, query_is_protein) * 0.1
    hsp.score = _calc_score(psl, query_is_protein)
    # Helper flag for writing: whether the strand column held a hit strand.
    hsp._has_hit_strand = has_hit_strand

    return hsp
def _create_hsp(hid, qid, psl):
    """Create a high scoring pair object from one parsed PSL row (PRIVATE).

    Arguments:
     - hid - hit identifier, passed to every HSPFragment.
     - qid - query identifier, passed to every HSPFragment.
     - psl - mapping of parsed PSL columns ('strand', 'qstarts', 'tstarts',
       'blocksizes', 'qsize', 'tsize', 'qstart', 'qend', 'tstart', 'tend',
       the match / insert counters, and optionally 'qseqs' / 'tseqs').

    Returns an HSP made of one HSPFragment per alignment block, with the
    PSL statistics copied onto it. AssertionError is raised when the
    coordinates computed from the blocks disagree with the PSL columns.
    """
    # protein flag
    is_protein = _is_protein(psl)
    # strand
    # if query is protein, strand is 0
    if is_protein:
        qstrand = 0
    else:
        qstrand = 1 if psl['strand'][0] == '+' else -1
    # try to get hit strand, if it exists
    try:
        hstrand = 1 if psl['strand'][1] == '+' else -1
    except IndexError:
        hstrand = 1  # hit strand defaults to plus

    # Block sizes are expressed in query units. For a protein query the
    # hit side is scaled by three (one residue per three hit bases).
    # NOTE(review): assumes _is_protein() is only true for protein queries
    # aligned to translated nucleotide hits -- confirm against that helper.
    blocksize_multiplier = 3 if is_protein else 1

    # query block starts
    qstarts = _reorient_starts(psl['qstarts'], psl['blocksizes'],
                               psl['qsize'], qstrand)
    # hit block starts
    if len(psl['strand']) == 2:
        hstarts = _reorient_starts(
            psl['tstarts'],
            [blocksize_multiplier * size for size in psl['blocksizes']],
            psl['tsize'], hstrand)
    else:
        hstarts = psl['tstarts']

    # set query and hit coords
    # this assumes each block has no gaps (which seems to be the case)
    assert len(qstarts) == len(hstarts) == len(psl['blocksizes'])
    # Materialise the ranges as lists: they are measured with len() and
    # indexed below, which a bare zip() iterator does not support.
    query_range_all = [(start, start + size)
                       for start, size in zip(qstarts, psl['blocksizes'])]
    hit_range_all = [(start, start + size * blocksize_multiplier)
                     for start, size in zip(hstarts, psl['blocksizes'])]

    # check length of sequences and coordinates, all must match
    if 'tseqs' in psl and 'qseqs' in psl:
        assert (len(psl['tseqs']) == len(psl['qseqs'])
                == len(query_range_all) == len(hit_range_all))
    else:
        assert len(query_range_all) == len(hit_range_all)

    # the sequence lists do not change per block, so look them up once
    hseqlist = psl.get('tseqs')
    qseqlist = psl.get('qseqs')

    frags = []
    # iterating over query_range_all, but hit_range_all works just as well
    for idx, qcoords in enumerate(query_range_all):
        hseq = '' if not hseqlist else hseqlist[idx]
        qseq = '' if not qseqlist else qseqlist[idx]
        frag = HSPFragment(hid, qid, hit=hseq, query=qseq)
        # set alphabet
        frag.alphabet = generic_dna
        # set coordinates
        frag.query_start = qcoords[0]
        frag.query_end = qcoords[1]
        frag.hit_start = hit_range_all[idx][0]
        frag.hit_end = hit_range_all[idx][1]
        # and strands
        frag.query_strand = qstrand
        frag.hit_strand = hstrand
        frags.append(frag)

    # create hsp object
    hsp = HSP(frags)
    # check if start and end are set correctly
    assert hsp.query_start == psl['qstart']
    assert hsp.query_end == psl['qend']
    assert hsp.hit_start == psl['tstart']
    assert hsp.hit_end == psl['tend']
    # and check block spans as well; hit spans are compared against the
    # scaled block sizes so no division is needed
    assert hsp.query_span_all == psl['blocksizes']
    assert hsp.hit_span_all == [size * blocksize_multiplier
                                for size in psl['blocksizes']]

    # set its attributes
    hsp.match_num = psl['matches']
    hsp.mismatch_num = psl['mismatches']
    hsp.match_rep_num = psl['repmatches']
    hsp.n_num = psl['ncount']
    hsp.query_gapopen_num = psl['qnuminsert']
    hsp.query_gap_num = psl['qbaseinsert']
    hsp.hit_gapopen_num = psl['tnuminsert']
    hsp.hit_gap_num = psl['tbaseinsert']

    hsp.ident_num = psl['matches'] + psl['repmatches']
    hsp.gapopen_num = psl['qnuminsert'] + psl['tnuminsert']
    hsp.gap_num = psl['qbaseinsert'] + psl['tbaseinsert']

    hsp.query_is_protein = is_protein
    hsp.ident_pct = 100.0 - _calc_millibad(psl, is_protein) * 0.1
    hsp.score = _calc_score(psl, is_protein)
    # helper flag, for writing
    hsp._has_hit_strand = len(psl['strand']) == 2

    return hsp
def _create_hsp(hid, qid, psl):
    """Create a high scoring pair object from one parsed PSL row (PRIVATE).

    Arguments:
     - hid - hit identifier, passed to every HSPFragment.
     - qid - query identifier, passed to every HSPFragment.
     - psl - mapping of parsed PSL columns; 'qseqs' and 'tseqs' are
       optional, the coordinate and counter columns read below are not.

    One HSPFragment is built per alignment block and the fragments are
    wrapped in the returned HSP. AssertionError is raised when the block
    coordinates are inconsistent with the overall PSL coordinates.
    """
    # protein flag
    is_protein = _is_protein(psl)
    strand = psl['strand']
    has_hit_strand = len(strand) == 2

    # strand
    # if query is protein, strand is 0
    if is_protein:
        qstrand = 0
    else:
        qstrand = 1 if strand[0] == '+' else -1
    # try to get hit strand, if it exists
    try:
        hstrand = 1 if strand[1] == '+' else -1
    except IndexError:
        hstrand = 1  # hit strand defaults to plus

    # PSL block sizes are in query units, so for a protein query each hit
    # block is three times as long as its query block.
    # NOTE(review): relies on _is_protein() flagging only protein queries
    # aligned to translated nucleotide hits -- confirm against that helper.
    blocksize_multiplier = 3 if is_protein else 1
    blocksizes = psl['blocksizes']
    hit_blocksizes = [blocksize_multiplier * size for size in blocksizes]

    # query block starts
    qstarts = _reorient_starts(psl['qstarts'], blocksizes, psl['qsize'],
                               qstrand)
    # hit block starts
    if has_hit_strand:
        hstarts = _reorient_starts(psl['tstarts'], hit_blocksizes,
                                   psl['tsize'], hstrand)
    else:
        hstarts = psl['tstarts']

    # set query and hit coords
    # this assumes each block has no gaps (which seems to be the case)
    assert len(qstarts) == len(hstarts) == len(blocksizes)
    query_range_all = [(start, start + size)
                       for start, size in zip(qstarts, blocksizes)]
    hit_range_all = [(start, start + size)
                     for start, size in zip(hstarts, hit_blocksizes)]

    # check length of sequences and coordinates, all must match
    if 'tseqs' in psl and 'qseqs' in psl:
        assert (len(psl['tseqs']) == len(psl['qseqs'])
                == len(query_range_all) == len(hit_range_all))
    else:
        assert len(query_range_all) == len(hit_range_all)

    # loop-invariant lookups, done once instead of once per block
    hseqlist = psl.get('tseqs')
    qseqlist = psl.get('qseqs')

    frags = []
    # iterating over query_range_all, but hit_range_all works just as well
    for idx, qcoords in enumerate(query_range_all):
        hseq = '' if not hseqlist else hseqlist[idx]
        qseq = '' if not qseqlist else qseqlist[idx]
        frag = HSPFragment(hid, qid, hit=hseq, query=qseq)
        # set alphabet
        frag.alphabet = generic_dna
        # set coordinates
        frag.query_start = qcoords[0]
        frag.query_end = qcoords[1]
        frag.hit_start = hit_range_all[idx][0]
        frag.hit_end = hit_range_all[idx][1]
        # and strands
        frag.query_strand = qstrand
        frag.hit_strand = hstrand
        frags.append(frag)

    # create hsp object
    hsp = HSP(frags)
    # check if start and end are set correctly
    assert hsp.query_start == psl['qstart']
    assert hsp.query_end == psl['qend']
    assert hsp.hit_start == psl['tstart']
    assert hsp.hit_end == psl['tend']
    # and check block spans as well (hit spans against the scaled sizes)
    assert hsp.query_span_all == blocksizes
    assert hsp.hit_span_all == hit_blocksizes

    # set its attributes
    hsp.match_num = psl['matches']
    hsp.mismatch_num = psl['mismatches']
    hsp.match_rep_num = psl['repmatches']
    hsp.n_num = psl['ncount']
    hsp.query_gapopen_num = psl['qnuminsert']
    hsp.query_gap_num = psl['qbaseinsert']
    hsp.hit_gapopen_num = psl['tnuminsert']
    hsp.hit_gap_num = psl['tbaseinsert']

    hsp.ident_num = psl['matches'] + psl['repmatches']
    hsp.gapopen_num = psl['qnuminsert'] + psl['tnuminsert']
    hsp.gap_num = psl['qbaseinsert'] + psl['tbaseinsert']

    hsp.query_is_protein = is_protein
    hsp.ident_pct = 100.0 - _calc_millibad(psl, is_protein) * 0.1
    hsp.score = _calc_score(psl, is_protein)
    # helper flag, for writing
    hsp._has_hit_strand = has_hit_strand

    return hsp