示例#1
0
    def build_fwtrack(self, opt):
        """Build FWTrackII from all lines, return a FWTrackII object.

        Alignments are consumed one read at a time from
        ``self._group_by_name(self.fhd)``; every group is the list of
        ``(chromosome, fpos, strand, qualstr, mismatches)`` tuples of one
        read.  A group of length one is a uniquely mapping read and is
        stored under alignment index 0.  Any other group is a multi-read,
        handled in one of three ways:

        * ``opt.no_multi_reads``: the read is thrown away and is not
          counted in ``fwtrack.total``.
        * ``opt.random_select_one_multi``: one alignment is chosen with
          ``random_range`` and stored like a unique read (index 0).
        * otherwise: every alignment is stored under its own index
          (starting at 1) with an initial alignment probability, either
          uniform (``opt.no_prior_prob_map``) or computed from the base
          qualities and mismatch positions.

        Arguments:
        opt -- options object.  Reads ``random_select_one_multi``,
               ``no_multi_reads``, ``min_score``, ``prior_prob_snp``,
               ``no_prior_prob_map`` and ``qual_scale``.  When
               ``qual_scale`` is 'auto' it is replaced in place by the
               result of ``self._guess_qual_scale()``.

        Returns the populated FWTrackII.  ``fwtrack.total`` is the number
        of reads kept (not the number of alignments).

        Raises ValueError if quality-based priors are needed but
        ``opt.qual_scale`` is neither 'sanger+33' nor 'illumina+64'.
        Raises AssertionError if a read's last base quality is below the
        scale offset or a read probability falls outside [0, 1].
        """
        fwtrack = FWTrackII()
        random_select_one_multi = opt.random_select_one_multi
        no_multi_reads = opt.no_multi_reads
        min_score = opt.min_score
        prior_prob_snp = opt.prior_prob_snp
        no_prior_prob_map = opt.no_prior_prob_map
        if opt.qual_scale == 'auto':
            opt.qual_scale = self._guess_qual_scale()
        if opt.qual_scale == 'sanger+33':
            qual_offset = 33
        elif opt.qual_scale == 'illumina+64':
            qual_offset = 64
        else:
            # Unknown scale: only an error if qualities are actually needed,
            # so the check is deferred to the quality-based branch below.
            qual_offset = None
        # bind hot-loop attribute lookups to locals
        group_starts_append = fwtrack.group_starts.append
        fwtrack_add_loc = fwtrack.add_loc
        # cache: (is_mismatch, raw quality value) -> per-base probability
        match_probs = {}

        report_every = 1000000
        next_report = report_every  # alignment count at the next log line
        alignments_seen = 0         # alignments stored so far
        read_total = 0              # reads kept so far

        for grouplines in self._group_by_name(self.fhd):
            n_aligns = len(grouplines)
            if n_aligns == 1:
                # uniquely mapping read
                chromosome, fpos, strand, qualstr, mismatches = grouplines[0]
                fwtrack_add_loc(chromosome, fpos, strand,
                                0)  # 0'th index => unique
                n_used = 1
            elif no_multi_reads:
                # throw away multi-reads; skipping before read_total is
                # bumped keeps discarded reads out of fwtrack.total
                continue
            elif random_select_one_multi:
                # choose one alignment at random and treat it as unique
                randline = grouplines[random_range(n_aligns)]
                chromosome, fpos, strand, qualstr, mismatches = randline
                fwtrack_add_loc(chromosome, fpos, strand, 0)
                n_used = 1
            else:
                # use all alignments probabilistically
                # group indexes start at 1 (0 is reserved for unique reads)
                group_starts_append(fwtrack.total_multi + 1)
                if no_prior_prob_map:
                    # don't use map quality; just assume uniform priors
                    for (chromosome, fpos, strand, qualstr,
                         mismatches) in grouplines:
                        fwtrack.total_multi += 1
                        fwtrack_add_loc(chromosome, fpos, strand,
                                        fwtrack.total_multi)
                    normed_probs = [1. / n_aligns] * n_aligns
                else:
                    if qual_offset is None:
                        raise ValueError(
                            "Unknown quality scale %r: expected 'sanger+33' "
                            "or 'illumina+64'" % (opt.qual_scale,))
                    # TODO: might want to be working in log space-- if many
                    # mismatches, we'll lose precision
                    group_total_prob = 0.
                    group_probs = []
                    group_probs_append = group_probs.append
                    for (chromosome, fpos, strand, qualstr,
                         mismatches) in grouplines:
                        fwtrack.total_multi += 1
                        fwtrack_add_loc(chromosome, fpos, strand,
                                        fwtrack.total_multi)
                        mismatches = set(mismatches)
                        read_prob = 1.
                        # P(SNP) = prior probability a SNP occurs at any base
                        # P(SE) = probability there was a sequencing error (from PHRED)
                        # _P(Map|SNP,SE)__MATCH__SNP__SE_
                        #       0           0     0    0    # can't map here without explanation
                        #       1           0     0    1
                        #       1           0     1    0
                        #       1           0     1    1
                        #       1           1     0    0
                        #       1           1     0    1
                        #       0           1     1    0    # wouldn't map here if SNP, but sequencer read reference
                        #       1           1     1    1
                        # we are interested in P(Mapping | Match), computed as:
                        # p(Map|match = 0):
                        #     p(SE) + p(SNP) + p(SE)*p(SNP)
                        # p(Map|match = 1):
                        #    1 - (p(SE) + p(SE)*p(SNP))
                        # NOTE(review): these closed forms are kept exactly as
                        # in the original code; they appear not to equal the
                        # literal sum over the table above (which would give
                        # p(SE) + p(SNP) - p(SE)*p(SNP) for a mismatch) --
                        # confirm before changing.
                        for b, qual in enumerate(qualstr):
                            key = (b in mismatches, qual)
                            prob = match_probs.get(key)
                            if prob is None:
                                # qual is an integer quality code; subtracting
                                # the scale offset gives the PHRED score
                                p_seq_error = 10.**((qual - qual_offset) / -10.)
                                if key[0]:  # mismatch
                                    prob = (p_seq_error + prior_prob_snp +
                                            p_seq_error * prior_prob_snp)
                                else:  # match
                                    prob = 1. - (p_seq_error +
                                                 p_seq_error * prior_prob_snp)
                                match_probs[key] = prob
                            read_prob *= prob
                        # quick & dirty check-- only looking at last base
                        # (skipped for an empty quality string)
                        if len(qualstr) > 0:
                            assert qualstr[-1] >= qual_offset, (
                                "Specified quality scale yielded a negative "
                                "phred score!  You probably have the wrong "
                                "PHRED scale!")
                        assert 0. <= read_prob <= 1., "error with map qualities"
                        group_probs_append(read_prob)
                        group_total_prob += read_prob
                    if group_total_prob > 0.:
                        normed_probs = [
                            p / group_total_prob for p in group_probs
                        ]
                    else:
                        # every alignment probability underflowed to zero;
                        # fall back to uniform priors instead of dividing by 0
                        normed_probs = [1. / n_aligns] * n_aligns
                fwtrack.prob_aligns.extend(normed_probs)
                fwtrack.prior_aligns.extend(normed_probs)
                fwtrack.enrich_scores.extend([min_score] * n_aligns)
                n_used = n_aligns

            read_total += 1  # in ratios, only count reads, not total alignments
            alignments_seen += n_used
            # one progress line per million stored alignments
            while alignments_seen >= next_report:
                logging.info(" %d alignments read." % next_report)
                next_report += report_every

        fwtrack.total = read_total  # overwrite the running total, counting each kept read once
        return fwtrack
示例#2
0
    def build_fwtrack(self, opt):
        """Build FWTrackII from all lines, return a FWTrackII object.

        Alignments are consumed one read at a time from
        ``self._group_by_name(self.fhd)``; every group is the list of
        ``(chromosome, fpos, strand, qualstr, mismatches)`` tuples of one
        read.  A group of length one is a uniquely mapping read and is
        stored under alignment index 0.  Any other group is a multi-read,
        handled in one of three ways:

        * ``opt.no_multi_reads``: the read is thrown away and is not
          counted in ``fwtrack.total``.
        * ``opt.random_select_one_multi``: one alignment is chosen with
          ``random_range`` and stored like a unique read (index 0).
        * otherwise: every alignment is stored under its own index
          (starting at 1) with an initial alignment probability, either
          uniform (``opt.no_prior_prob_map``) or computed from the base
          qualities and mismatch positions.

        Arguments:
        opt -- options object.  Reads ``random_select_one_multi``,
               ``no_multi_reads``, ``min_score``, ``prior_prob_snp``,
               ``no_prior_prob_map`` and ``qual_scale``.  When
               ``qual_scale`` is 'auto' it is replaced in place by the
               result of ``self._guess_qual_scale()``.

        Returns the populated FWTrackII.  ``fwtrack.total`` is the number
        of reads kept (not the number of alignments).

        Raises ValueError if quality-based priors are needed but
        ``opt.qual_scale`` is neither 'sanger+33' nor 'illumina+64'.
        Raises AssertionError if a read's last base quality is below the
        scale offset or a read probability falls outside [0, 1].
        """
        fwtrack = FWTrackII()
        random_select_one_multi = opt.random_select_one_multi
        no_multi_reads = opt.no_multi_reads
        min_score = opt.min_score
        prior_prob_snp = opt.prior_prob_snp
        no_prior_prob_map = opt.no_prior_prob_map
        if opt.qual_scale == 'auto':
            opt.qual_scale = self._guess_qual_scale()
        if opt.qual_scale == 'sanger+33':
            qual_offset = 33
        elif opt.qual_scale == 'illumina+64':
            qual_offset = 64
        else:
            # Unknown scale: only an error if qualities are actually needed,
            # so the check is deferred to the quality-based branch below.
            qual_offset = None
        # bind hot-loop attribute lookups to locals
        group_starts_append = fwtrack.group_starts.append
        fwtrack_add_loc = fwtrack.add_loc
        # cache: (is_mismatch, raw quality value) -> per-base probability
        match_probs = {}

        report_every = 1000000
        next_report = report_every  # alignment count at the next log line
        alignments_seen = 0         # alignments stored so far
        read_total = 0              # reads kept so far

        for grouplines in self._group_by_name(self.fhd):
            n_aligns = len(grouplines)
            if n_aligns == 1:
                # uniquely mapping read
                chromosome, fpos, strand, qualstr, mismatches = grouplines[0]
                fwtrack_add_loc(chromosome, fpos, strand,
                                0)  # 0'th index => unique
                n_used = 1
            elif no_multi_reads:
                # throw away multi-reads; skipping before read_total is
                # bumped keeps discarded reads out of fwtrack.total
                continue
            elif random_select_one_multi:
                # choose one alignment at random and treat it as unique
                randline = grouplines[random_range(n_aligns)]
                chromosome, fpos, strand, qualstr, mismatches = randline
                fwtrack_add_loc(chromosome, fpos, strand, 0)
                n_used = 1
            else:
                # use all alignments probabilistically
                # group indexes start at 1 (0 is reserved for unique reads)
                group_starts_append(fwtrack.total_multi + 1)
                if no_prior_prob_map:
                    # don't use map quality; just assume uniform priors
                    for (chromosome, fpos, strand, qualstr,
                         mismatches) in grouplines:
                        fwtrack.total_multi += 1
                        fwtrack_add_loc(chromosome, fpos, strand,
                                        fwtrack.total_multi)
                    normed_probs = [1. / n_aligns] * n_aligns
                else:
                    if qual_offset is None:
                        raise ValueError(
                            "Unknown quality scale %r: expected 'sanger+33' "
                            "or 'illumina+64'" % (opt.qual_scale,))
                    # TODO: might want to be working in log space-- if many
                    # mismatches, we'll lose precision
                    group_total_prob = 0.
                    group_probs = []
                    group_probs_append = group_probs.append
                    for (chromosome, fpos, strand, qualstr,
                         mismatches) in grouplines:
                        fwtrack.total_multi += 1
                        fwtrack_add_loc(chromosome, fpos, strand,
                                        fwtrack.total_multi)
                        mismatches = set(mismatches)
                        read_prob = 1.
                        # P(SNP) = prior probability a SNP occurs at any base
                        # P(SE) = probability there was a sequencing error (from PHRED)
                        # _P(Map|SNP,SE)__MATCH__SNP__SE_
                        #       0           0     0    0    # can't map here without explanation
                        #       1           0     0    1
                        #       1           0     1    0
                        #       1           0     1    1
                        #       1           1     0    0
                        #       1           1     0    1
                        #       0           1     1    0    # wouldn't map here if SNP, but sequencer read reference
                        #       1           1     1    1
                        # we are interested in P(Mapping | Match), computed as:
                        # p(Map|match = 0):
                        #     p(SE) + p(SNP) + p(SE)*p(SNP)
                        # p(Map|match = 1):
                        #    1 - (p(SE) + p(SE)*p(SNP))
                        # NOTE(review): these closed forms are kept exactly as
                        # in the original code; they appear not to equal the
                        # literal sum over the table above (which would give
                        # p(SE) + p(SNP) - p(SE)*p(SNP) for a mismatch) --
                        # confirm before changing.
                        for b, qual in enumerate(qualstr):
                            key = (b in mismatches, qual)
                            prob = match_probs.get(key)
                            if prob is None:
                                # qual is an integer quality code; subtracting
                                # the scale offset gives the PHRED score
                                p_seq_error = 10.**((qual - qual_offset) / -10.)
                                if key[0]:  # mismatch
                                    prob = (p_seq_error + prior_prob_snp +
                                            p_seq_error * prior_prob_snp)
                                else:  # match
                                    prob = 1. - (p_seq_error +
                                                 p_seq_error * prior_prob_snp)
                                match_probs[key] = prob
                            read_prob *= prob
                        # quick & dirty check-- only looking at last base
                        # (skipped for an empty quality string)
                        if len(qualstr) > 0:
                            assert qualstr[-1] >= qual_offset, (
                                "Specified quality scale yielded a negative "
                                "phred score!  You probably have the wrong "
                                "PHRED scale!")
                        assert 0. <= read_prob <= 1., "error with map qualities"
                        group_probs_append(read_prob)
                        group_total_prob += read_prob
                    if group_total_prob > 0.:
                        normed_probs = [
                            p / group_total_prob for p in group_probs
                        ]
                    else:
                        # every alignment probability underflowed to zero;
                        # fall back to uniform priors instead of dividing by 0
                        normed_probs = [1. / n_aligns] * n_aligns
                fwtrack.prob_aligns.extend(normed_probs)
                fwtrack.prior_aligns.extend(normed_probs)
                fwtrack.enrich_scores.extend([min_score] * n_aligns)
                n_used = n_aligns

            read_total += 1  # in ratios, only count reads, not total alignments
            alignments_seen += n_used
            # one progress line per million stored alignments
            while alignments_seen >= next_report:
                logging.info(" %d alignments read." % next_report)
                next_report += report_every

        fwtrack.total = read_total  # overwrite the running total, counting each kept read once
        return fwtrack