def build_fwtrack(self, opt):
    """Build a FWTrackII from every alignment group and return it.

    Alignments are read from ``self._group_by_name(self.fhd)``, which yields
    one group per read.  Each group entry unpacks as
    ``(chromosome, fpos, strand, qualstr, mismatches)``.

    Handling per group:

    * exactly one alignment: stored as a unique read (alignment index 0).
    * multi-read and ``opt.no_multi_reads``: the read is skipped.
    * multi-read and ``opt.random_select_one_multi``: one alignment, chosen
      with ``random_range``, is stored with index 0.
    * otherwise: every alignment is stored under its own index (starting at
      1; 0 is reserved for unique reads) and the group's initial
      probabilities are appended to ``fwtrack.prob_aligns`` and
      ``fwtrack.prior_aligns``.  They are uniform when
      ``opt.no_prior_prob_map`` is set, else derived from the base
      qualities and mismatch positions.  ``opt.min_score`` is appended to
      ``fwtrack.enrich_scores`` once per alignment.

    If ``opt.qual_scale`` is ``'auto'`` it is replaced in place by
    ``self._guess_qual_scale()``.

    Returns the populated FWTrackII; its ``total`` is set to the number of
    groups read.

    Raises ValueError if quality-based priors are needed but
    ``opt.qual_scale`` is neither ``'sanger+33'`` nor ``'illumina+64'``.
    Raises AssertionError if the last base quality of an alignment is below
    the scale offset, or if an alignment's probability leaves [0, 1].
    """
    # NOTE(review): this file contains a second definition of build_fwtrack
    # further down; the later definition replaces this one -- confirm which
    # copy should be kept.
    fwtrack = FWTrackII()
    i = 0  # alignments seen since the last progress message
    m = 0  # progress messages emitted so far (one per million alignments)
    read_total = 0
    random_select_one_multi = opt.random_select_one_multi
    no_multi_reads = opt.no_multi_reads
    min_score = opt.min_score
    prior_prob_snp = opt.prior_prob_snp
    no_prior_prob_map = opt.no_prior_prob_map
    if opt.qual_scale == 'auto':
        opt.qual_scale = self._guess_qual_scale()
    if opt.qual_scale == 'sanger+33':
        qual_offset = 33
    elif opt.qual_scale == 'illumina+64':
        qual_offset = 64
    else:
        # Only an error if quality-based priors are actually computed below;
        # the other modes never look at the offset.
        qual_offset = None
    # Bind attribute lookups used in the hot loop to locals.
    group_starts_append = fwtrack.group_starts.append
    fwtrack_add_loc = fwtrack.add_loc
    # Cache of per-base probabilities keyed by (is_mismatch, quality value):
    # {(True, q): p(map | mismatch, q), (False, q): p(map | match, q)}
    match_probs = {}
    for grouplines in self._group_by_name(self.fhd):
        read_total += 1  # in ratios, only count reads, not total alignments

        if len(grouplines) == 1:
            # uniquely mapping read
            i += 1
            if i == 1000000:
                m += 1
                logging.info(" %d alignments read." % (m * 1000000))
                i = 0
            chromosome, fpos, strand, qualstr, mismatches = grouplines[0]
            fwtrack_add_loc(chromosome, fpos, strand, 0)  # 0'th index => unique
            continue

        if no_multi_reads:
            # throw away multi-reads
            # NOTE(review): this decrement is overwritten by the final
            # ``fwtrack.total = read_total`` and read_total has already
            # counted this read -- confirm whether dropped multi-reads are
            # meant to be included in the total.
            fwtrack.total -= 1
            continue

        if random_select_one_multi:
            # choose one alignment at random and treat it as unique
            i += 1
            if i == 1000000:
                m += 1
                logging.info(" %d alignments read." % (m * 1000000))
                i = 0
            randline = grouplines[random_range(len(grouplines))]
            chromosome, fpos, strand, qualstr, mismatches = randline
            fwtrack_add_loc(chromosome, fpos, strand, 0)
            continue

        # use all alignments probabilistically
        group_starts_append(fwtrack.total_multi + 1)  # starts at 1 (0 reserved for unique reads)
        if no_prior_prob_map:
            # don't use map quality; just assume uniform priors
            for (chromosome, fpos, strand, qualstr, mismatches) in grouplines:
                i += 1
                if i == 1000000:
                    m += 1
                    logging.info(" %d alignments read." % (m * 1000000))
                    i = 0
                fwtrack.total_multi += 1
                fwtrack_add_loc(chromosome, fpos, strand, fwtrack.total_multi)
            normed_probs = [1. / len(grouplines)] * len(grouplines)
        else:
            if qual_offset is None:
                raise ValueError(
                    "Unrecognized quality scale %r; expected 'sanger+33' "
                    "or 'illumina+64'" % (opt.qual_scale,))
            # TODO: might want to be working in log space-- if many
            # mismatches, we'll lose precision
            group_total_prob = 0.
            group_probs = []
            group_probs_append = group_probs.append
            for (chromosome, fpos, strand, qualstr, mismatches) in grouplines:
                i += 1
                if i == 1000000:
                    m += 1
                    logging.info(" %d alignments read." % (m * 1000000))
                    i = 0
                fwtrack.total_multi += 1
                fwtrack_add_loc(chromosome, fpos, strand, fwtrack.total_multi)
                mismatches = set(mismatches)  # O(1) membership per base
                read_prob = 1.
                # P(SNP) = prior probability a SNP occurs at any base
                # P(SE) = probability there was a sequencing error (from PHRED)
                # _P(Map|SNP,SE)__MATCH__SNP__SE_
                #       0          0      0    0   # can't map here without explanation
                #       1          0      0    1
                #       1          0      1    0
                #       1          0      1    1
                #       1          1      0    0
                #       1          1      0    1
                #       0          1      1    0   # wouldn't map here if SNP, but sequencer read reference
                #       1          1      1    1
                # we are interested in P(Mapping | Match), which is equivalent to:
                # \Sum_{SNP \in {0,1}, SE \in {0,1}} p(SNP) * p(SE) * p(Map|SE,SNP), or:
                # p(Map|match = 0):
                #   p(SE) + p(SNP) + p(SE)*p(SNP)
                # p(Map|match = 1):
                #   1 - (p(SE) + p(SE)*p(SNP))
                #
                # Quality values must support ``value - int``; presumably
                # qualstr is a sequence of integer codes -- confirm against
                # the parser that fills grouplines.
                n_bases = 0
                for b, qual in enumerate(qualstr):
                    n_bases += 1
                    key = (b in mismatches, qual)
                    if key in match_probs:
                        prob = match_probs[key]
                    else:
                        p_seq_error = 10. ** ((qual - qual_offset) / -10.)
                        if key[0]:  # mismatch
                            prob = (p_seq_error + prior_prob_snp
                                    + p_seq_error * prior_prob_snp)
                        else:  # match
                            prob = 1. - (p_seq_error
                                         + p_seq_error * prior_prob_snp)
                        match_probs[key] = prob
                    read_prob *= prob
                # quick & dirty check-- only looking at last base.  Skipped
                # for an empty quality string, which has no last base.
                if n_bases:
                    assert qual >= qual_offset, (
                        "Specified quality scale yielded a negative phred "
                        "score! You probably have the wrong PHRED scale!")
                assert 0. <= read_prob <= 1., "error with map qualities"
                group_probs_append(read_prob)
                group_total_prob += read_prob
            if group_total_prob > 0.:
                normed_probs = [p / group_total_prob for p in group_probs]
            else:
                # Every alignment's prior underflowed to zero, so there is
                # nothing to normalise by; fall back to a uniform prior
                # instead of dividing by zero.
                normed_probs = [1. / len(group_probs) for _ in group_probs]
        fwtrack.prob_aligns.extend(normed_probs)
        fwtrack.prior_aligns.extend(normed_probs)
        fwtrack.enrich_scores.extend([min_score] * len(grouplines))
    fwtrack.total = read_total  # overwrite the running total, counting each read once
    return fwtrack
def build_fwtrack(self, opt):
    """Build a FWTrackII from every alignment group and return it.

    Alignments are read from ``self._group_by_name(self.fhd)``, which yields
    one group per read.  Each group entry unpacks as
    ``(chromosome, fpos, strand, qualstr, mismatches)``.

    Handling per group:

    * exactly one alignment: stored as a unique read (alignment index 0).
    * multi-read and ``opt.no_multi_reads``: the read is skipped.
    * multi-read and ``opt.random_select_one_multi``: one alignment, chosen
      with ``random_range``, is stored with index 0.
    * otherwise: every alignment is stored under its own index (starting at
      1; 0 is reserved for unique reads) and the group's initial
      probabilities are appended to ``fwtrack.prob_aligns`` and
      ``fwtrack.prior_aligns``.  They are uniform when
      ``opt.no_prior_prob_map`` is set, else derived from the base
      qualities and mismatch positions.  ``opt.min_score`` is appended to
      ``fwtrack.enrich_scores`` once per alignment.

    If ``opt.qual_scale`` is ``'auto'`` it is replaced in place by
    ``self._guess_qual_scale()``.

    Returns the populated FWTrackII; its ``total`` is set to the number of
    groups read.

    Raises ValueError if quality-based priors are needed but
    ``opt.qual_scale`` is neither ``'sanger+33'`` nor ``'illumina+64'``.
    Raises AssertionError if the last base quality of an alignment is below
    the scale offset, or if an alignment's probability leaves [0, 1].
    """
    # NOTE(review): an earlier definition of build_fwtrack precedes this one
    # in the file; this later definition is the one that takes effect --
    # confirm the earlier copy can be removed.
    fwtrack = FWTrackII()
    i = 0  # alignments seen since the last progress message
    m = 0  # progress messages emitted so far (one per million alignments)
    read_total = 0
    random_select_one_multi = opt.random_select_one_multi
    no_multi_reads = opt.no_multi_reads
    min_score = opt.min_score
    prior_prob_snp = opt.prior_prob_snp
    no_prior_prob_map = opt.no_prior_prob_map
    if opt.qual_scale == 'auto':
        opt.qual_scale = self._guess_qual_scale()
    if opt.qual_scale == 'sanger+33':
        qual_offset = 33
    elif opt.qual_scale == 'illumina+64':
        qual_offset = 64
    else:
        # Only an error if quality-based priors are actually computed below;
        # the other modes never look at the offset.
        qual_offset = None
    # Bind attribute lookups used in the hot loop to locals.
    group_starts_append = fwtrack.group_starts.append
    fwtrack_add_loc = fwtrack.add_loc
    # Cache of per-base probabilities keyed by (is_mismatch, quality value):
    # {(True, q): p(map | mismatch, q), (False, q): p(map | match, q)}
    match_probs = {}
    for grouplines in self._group_by_name(self.fhd):
        read_total += 1  # in ratios, only count reads, not total alignments

        if len(grouplines) == 1:
            # uniquely mapping read
            i += 1
            if i == 1000000:
                m += 1
                logging.info(" %d alignments read." % (m * 1000000))
                i = 0
            chromosome, fpos, strand, qualstr, mismatches = grouplines[0]
            fwtrack_add_loc(chromosome, fpos, strand, 0)  # 0'th index => unique
            continue

        if no_multi_reads:
            # throw away multi-reads
            # NOTE(review): this decrement is overwritten by the final
            # ``fwtrack.total = read_total`` and read_total has already
            # counted this read -- confirm whether dropped multi-reads are
            # meant to be included in the total.
            fwtrack.total -= 1
            continue

        if random_select_one_multi:
            # choose one alignment at random and treat it as unique
            i += 1
            if i == 1000000:
                m += 1
                logging.info(" %d alignments read." % (m * 1000000))
                i = 0
            randline = grouplines[random_range(len(grouplines))]
            chromosome, fpos, strand, qualstr, mismatches = randline
            fwtrack_add_loc(chromosome, fpos, strand, 0)
            continue

        # use all alignments probabilistically
        group_starts_append(fwtrack.total_multi + 1)  # starts at 1 (0 reserved for unique reads)
        if no_prior_prob_map:
            # don't use map quality; just assume uniform priors
            for (chromosome, fpos, strand, qualstr, mismatches) in grouplines:
                i += 1
                if i == 1000000:
                    m += 1
                    logging.info(" %d alignments read." % (m * 1000000))
                    i = 0
                fwtrack.total_multi += 1
                fwtrack_add_loc(chromosome, fpos, strand, fwtrack.total_multi)
            normed_probs = [1. / len(grouplines)] * len(grouplines)
        else:
            if qual_offset is None:
                raise ValueError(
                    "Unrecognized quality scale %r; expected 'sanger+33' "
                    "or 'illumina+64'" % (opt.qual_scale,))
            # TODO: might want to be working in log space-- if many
            # mismatches, we'll lose precision
            group_total_prob = 0.
            group_probs = []
            group_probs_append = group_probs.append
            for (chromosome, fpos, strand, qualstr, mismatches) in grouplines:
                i += 1
                if i == 1000000:
                    m += 1
                    logging.info(" %d alignments read." % (m * 1000000))
                    i = 0
                fwtrack.total_multi += 1
                fwtrack_add_loc(chromosome, fpos, strand, fwtrack.total_multi)
                mismatches = set(mismatches)  # O(1) membership per base
                read_prob = 1.
                # P(SNP) = prior probability a SNP occurs at any base
                # P(SE) = probability there was a sequencing error (from PHRED)
                # _P(Map|SNP,SE)__MATCH__SNP__SE_
                #       0          0      0    0   # can't map here without explanation
                #       1          0      0    1
                #       1          0      1    0
                #       1          0      1    1
                #       1          1      0    0
                #       1          1      0    1
                #       0          1      1    0   # wouldn't map here if SNP, but sequencer read reference
                #       1          1      1    1
                # we are interested in P(Mapping | Match), which is equivalent to:
                # \Sum_{SNP \in {0,1}, SE \in {0,1}} p(SNP) * p(SE) * p(Map|SE,SNP), or:
                # p(Map|match = 0):
                #   p(SE) + p(SNP) + p(SE)*p(SNP)
                # p(Map|match = 1):
                #   1 - (p(SE) + p(SE)*p(SNP))
                #
                # Quality values must support ``value - int``; presumably
                # qualstr is a sequence of integer codes -- confirm against
                # the parser that fills grouplines.
                n_bases = 0
                for b, qual in enumerate(qualstr):
                    n_bases += 1
                    key = (b in mismatches, qual)
                    if key in match_probs:
                        prob = match_probs[key]
                    else:
                        p_seq_error = 10. ** ((qual - qual_offset) / -10.)
                        if key[0]:  # mismatch
                            prob = (p_seq_error + prior_prob_snp
                                    + p_seq_error * prior_prob_snp)
                        else:  # match
                            prob = 1. - (p_seq_error
                                         + p_seq_error * prior_prob_snp)
                        match_probs[key] = prob
                    read_prob *= prob
                # quick & dirty check-- only looking at last base.  Skipped
                # for an empty quality string, which has no last base.
                if n_bases:
                    assert qual >= qual_offset, (
                        "Specified quality scale yielded a negative phred "
                        "score! You probably have the wrong PHRED scale!")
                assert 0. <= read_prob <= 1., "error with map qualities"
                group_probs_append(read_prob)
                group_total_prob += read_prob
            if group_total_prob > 0.:
                normed_probs = [p / group_total_prob for p in group_probs]
            else:
                # Every alignment's prior underflowed to zero, so there is
                # nothing to normalise by; fall back to a uniform prior
                # instead of dividing by zero.
                normed_probs = [1. / len(group_probs) for _ in group_probs]
        fwtrack.prob_aligns.extend(normed_probs)
        fwtrack.prior_aligns.extend(normed_probs)
        fwtrack.enrich_scores.extend([min_score] * len(grouplines))
    fwtrack.total = read_total  # overwrite the running total, counting each read once
    return fwtrack