Example #1
    def build_fwtrack (self, opt):
        """Build FWTrackII from all lines, return a FWTrackII object.

        Note: all locations will be merged (duplicate locations are
        removed) and then sorted after the track is built.

        If both_strand is True, strand information will be stored in
        the FWTrackII object.

        If do_merge is False, duplicate locations will not be merged
        after the track is built.
        """
        fwtrack = FWTrackII()
        i = 0
        m = 0
        for thisline in self.fhd:
            (chromosome, fpos, strand) = self._fw_parse_line(thisline)
            # report progress every million lines
            i += 1
            if i == 1000000:
                m += 1
                logging.info(" %d" % (m * 1000000))
                i = 0
            # skip lines that could not be parsed
            if not fpos or not chromosome:
                continue
            fwtrack.add_loc(chromosome,fpos,strand)
        return fwtrack
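
The loop above delegates line parsing to a per-format _fw_parse_line that returns a (chromosome, 5'-position, strand) tuple. Below is a minimal, self-contained sketch of what such a parser could look like for a BED-like line; the field layout and the 0/1 strand encoding are illustrative assumptions, not the actual MACS parser:

def _fw_parse_line(thisline):
    # hypothetical BED-style fields: chrom, start, end, name, score, strand
    thisline = thisline.rstrip()
    if not thisline or thisline.startswith(("#", "track", "browser")):
        return (None, None, None)
    fields = thisline.split("\t")
    chromosome = fields[0]
    if len(fields) > 5 and fields[5] == "-":
        # minus-strand tag: take the region end as the 5' position
        return (chromosome, int(fields[2]), 1)
    else:
        # plus-strand tag: take the region start as the 5' position
        return (chromosome, int(fields[1]), 0)

print(_fw_parse_line("chr1\t100\t135\ttag1\t0\t-"))   # ('chr1', 135, 1)
print(_fw_parse_line("chr1\t200\t235\ttag2\t0\t+"))   # ('chr1', 200, 0)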
Example #2
    def build_fwtrack (self, opt, dist=200):
        """Build FWTrackII from all lines, return a FWTrackII object.

        lfhd: the file handle for the left tag file
        rfhd: the file handle for the right tag file
        dist: the best distance between the two tags in a pair

        The scoring system for pairing two tags:

        score = abs(abs(rtag-ltag)-200)+error4lefttag+error4righttag

        The smaller the score, the better the pairing. If the score
        for a pairing is bigger than 200, the pairing is discarded.

        Note only the best pair is kept. If there are more than two
        best pairings, this pair of left and right tags is discarded.

        Note, the orders in the left tag file and the right tag file
        must match, i.e., the Nth left tag must have the same name as
        the Nth right tag.

        Note, comment lines must be removed beforehand.
        """
        fwtrack = FWTrackII()
        i = 0
        m = 0
        lnext = self.lfhd.next
        rnext = self.rfhd.next
        self.dist = dist
        try:
            while 1:
                lline = lnext()
                rline = rnext()
                (chromname,fpos,strand) = self._fw_parse_line(lline,rline)

                i+=1
                if i == 1000000:
                    m += 1
                    logging.info(" %d" % (m*1000000))
                    i=0
                if not fpos or not chromname:
                    continue

                try:
                    # strip a trailing ".fa" suffix from the chromosome name, if present
                    chromname = chromname[:chromname.rindex(".fa")]
                except ValueError:
                    pass
                
                fwtrack.add_loc(chromname,fpos,strand)

        except StopIteration:
            # one of the two tag files is exhausted
            pass
        return fwtrack
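
The pairing rule spelled out in the docstring can be checked with a small worked example. The helper and the candidate tuples below are made up for illustration and are not part of the parser; the 200 bp target distance comes from the score formula above:

def pairing_score(ltag, rtag, err_left, err_right, dist=200):
    # score = abs(abs(rtag - ltag) - dist) + error4lefttag + error4righttag
    return abs(abs(rtag - ltag) - dist) + err_left + err_right

# (left position, right position, mismatches on left, mismatches on right)
candidates = [
    (1000, 1195, 0, 1),   # distance 195 -> score 5 + 1 = 6
    (1000, 1600, 0, 0),   # distance 600 -> score 400 > 200, discarded
    (5000, 5210, 2, 0),   # distance 210 -> score 10 + 2 = 12
]

scored = [(pairing_score(l, r, el, er), (l, r)) for (l, r, el, er) in candidates]
kept = [(s, pair) for (s, pair) in scored if s <= 200]
if not kept:
    print("no pairing within the score cutoff; discarded")
else:
    best_score = min(s for s, pair in kept)
    best = [pair for (s, pair) in kept if s == best_score]
    if len(best) == 1:
        print("kept pair %s with score %d" % (best[0], best_score))
    else:
        # ties for the best score mean the pairing is ambiguous and is discarded
        print("ambiguous best pairing; discarded")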
Example #3
    def build_fwtrack (self, opt):
        """Build FWTrackII from all lines, return a FWTrackII object.

        """
        fwtrack = FWTrackII()
        i = 0
        m = 0
        for thisline in self.fhd:
            (chromosome,fpos,strand) = self._fw_parse_line(thisline)
            i+=1
            if i == 1000000:
                m += 1
                logging.info(" %d" % (m*1000000))
                i=0
            if not fpos or not chromosome:
                continue
            fwtrack.add_loc(chromosome,fpos,strand)
        return fwtrack
Example #4
    def build_fwtrack(self, opt):
        """Build FWTrackII from all lines, return a FWTrackII object.
    
        Note only the unique match for a tag is kept.
        """
        fwtrack = FWTrackII()
        i = 0
        m = 0
        references = []
        fseek = self.fhd.seek
        fread = self.fhd.read
        ftell = self.fhd.tell
        #move past the 4-byte magic string; the header length field comes next
        fseek(4)
        header_len = struct.unpack('<i', fread(4))[0]
        fseek(header_len + ftell())
        #get the number of chromosomes (reference sequences)
        nc = struct.unpack('<i', fread(4))[0]
        for x in range(nc):
            # read each chromosome name; drop its trailing NUL byte
            nlength = struct.unpack('<i', fread(4))[0]
            references.append(fread(nlength)[:-1])
            # jump over the chromosome size, we don't need it
            fseek(ftell() + 4)

        while 1:
            try:
                entrylength = struct.unpack('<i', fread(4))[0]
            except struct.error:
                break
            (chrid, fpos, strand) = self._fw_binary_parse(fread(entrylength))
            i += 1
            if i == 1000000:
                m += 1
                logging.info(" %d" % (m * 1000000))
                i = 0
            if fpos >= 0:
                fwtrack.add_loc(references[chrid], fpos, strand)
        self.fhd.close()
        return fwtrack
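
The header-parsing steps above follow a fixed little-endian layout: a 4-byte magic string, an int32 header length, the header text, an int32 reference count, and then, per reference, an int32 name length, a NUL-terminated name and an int32 sequence length (consistent with the BAM header layout). The sketch below builds a synthetic buffer with that layout and replays the same struct calls; BytesIO merely stands in for self.fhd and the reference names are made up:

import struct
from io import BytesIO

# build a tiny synthetic header
header_text = b"@HD\tVN:1.0\n"
buf = b"BAM\x01"
buf += struct.pack('<i', len(header_text)) + header_text
refs = [(b"chr1", 249250621), (b"chr2", 243199373)]
buf += struct.pack('<i', len(refs))
for name, length in refs:
    buf += struct.pack('<i', len(name) + 1) + name + b"\x00"
    buf += struct.pack('<i', length)

fhd = BytesIO(buf)
fhd.seek(4)                                    # skip the magic string
header_len = struct.unpack('<i', fhd.read(4))[0]
fhd.seek(header_len + fhd.tell())              # skip the header text
nc = struct.unpack('<i', fhd.read(4))[0]       # number of references
references = []
for _ in range(nc):
    nlength = struct.unpack('<i', fhd.read(4))[0]
    references.append(fhd.read(nlength)[:-1])  # drop the trailing NUL
    fhd.seek(fhd.tell() + 4)                   # skip the reference length
print(references)                              # [b'chr1', b'chr2']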
Example #5
    def build_fwtrack (self, opt):
        """Build FWTrackII from all lines, return a FWTrackII object.

        Note only the unique match for a tag is kept.
        """
        fwtrack = FWTrackII()
        i = 0
        m = 0
        references = []
        fseek = self.fhd.seek
        fread = self.fhd.read
        ftell = self.fhd.tell
        #move past the 4-byte magic string; the header length field comes next
        fseek(4)
        header_len = struct.unpack('<i', fread(4))[0]
        fseek(header_len + ftell())
        #get the number of chromosomes (reference sequences)
        nc = struct.unpack('<i', fread(4))[0]
        for x in range(nc):
            # read each chromosome name
            nlength = struct.unpack('<i', fread(4))[0]
            references.append(fread(nlength)[:-1])
            # jump over chromosome size, we don't need it
            fseek(ftell() + 4)

        while 1:
            try:
                entrylength = struct.unpack('<i', fread(4))[0]
            except struct.error:
                break
            (chrid,fpos,strand) = self._fw_binary_parse(fread(entrylength))
            i+=1
            if i == 1000000:
                m += 1
                logging.info(" %d" % (m*1000000))
                i=0
            if fpos >= 0:
                fwtrack.add_loc(references[chrid],fpos,strand)
        self.fhd.close()
        return fwtrack
Example #6
    def build_fwtrack(self, opt):
        """Build FWTrackII from all lines, return a FWTrackII object.

        Handle multi-reads here by building a probability and enrichment index
        or selecting only one alignment from each multi-read.
        Initial alignment probabilities are set from read/mismatch qualities
        or from a uniform distribution.
        """
        fwtrack = FWTrackII()
        i = 0
        m = 0
        read_total = 0
        recent_tags = []
        random_select_one_multi = opt.random_select_one_multi
        no_multi_reads = opt.no_multi_reads
        min_score = opt.min_score
        prior_prob_snp = opt.prior_prob_snp
        no_prior_prob_map = opt.no_prior_prob_map
        if opt.qual_scale == 'auto':
            opt.qual_scale = self._guess_qual_scale()
        if opt.qual_scale == 'sanger+33':
            qual_offset = 33
        elif opt.qual_scale == 'illumina+64':
            qual_offset = 64
        group_starts_append = fwtrack.group_starts.append
        fwtrack_add_loc = fwtrack.add_loc
        match_probs = {}  # {(True,30): p(mismatch|phred=30), (False,30): p(match|phred=30)}

        for grouplines in self._group_by_name(self.fhd):
            read_total += 1  # in ratios, only count reads, not total alignments
            if len(grouplines) == 1:
                # uniquely mapping reads
                i += 1
                if i == 1000000:
                    m += 1
                    logging.info(" %d alignments read." % (m * 1000000))
                    i = 0
                chromosome, fpos, strand, qualstr, mismatches = grouplines[0]
                fwtrack_add_loc(chromosome, fpos, strand,
                                0)  # 0'th index => unique
            else:
                if no_multi_reads:  # throw away multi-reads
                    fwtrack.total -= 1
                    continue
                elif random_select_one_multi:  # choose one alignment at random
                    i += 1
                    if i == 1000000:
                        m += 1
                        logging.info(" %d alignments read." % (m * 1000000))
                        i = 0
                    randline = grouplines[random_range(len(grouplines))]
                    chromosome, fpos, strand, qualstr, mismatches = randline
                    fwtrack_add_loc(chromosome, fpos, strand, 0)
                else:  # use all alignments probabilistically
                    group_starts_append(
                        fwtrack.total_multi +
                        1)  # starts at 1 (0 reserved for unique reads)
                    if no_prior_prob_map:
                        # don't use map quality; just assume uniform priors
                        for (chromosome, fpos, strand, qualstr,
                             mismatches) in grouplines:
                            i += 1
                            if i == 1000000:
                                m += 1
                                logging.info(" %d alignments read." %
                                             (m * 1000000))
                                i = 0
                            fwtrack.total_multi += 1
                            fwtrack_add_loc(chromosome, fpos, strand,
                                            fwtrack.total_multi)
                        normed_probs = [1. / len(grouplines)] * len(grouplines)
                    else:
                        # TODO: might want to be working in log space-- if many mismatches, we'll lose precision
                        qualstr = grouplines[0][
                            3]  # all quality strings are shared across the group
                        group_total_prob = 0.
                        group_probs = []
                        group_probs_append = group_probs.append
                        for (chromosome, fpos, strand, qualstr,
                             mismatches) in grouplines:
                            i += 1
                            if i == 1000000:
                                m += 1
                                logging.info(" %d alignments read." %
                                             (m * 1000000))
                                i = 0
                            fwtrack.total_multi += 1
                            fwtrack_add_loc(chromosome, fpos, strand,
                                            fwtrack.total_multi)
                            mismatches = set(mismatches)
                            read_prob = 1.
                            # P(SNP) = prior probability a SNP occurs at any base
                            # P(SE) = probability there was a sequencing error (from PHRED)
                            # _P(Map|SNP,SE)__MATCH__SNP__SE_
                            #       0           0     0    0    # can't map here without explanation
                            #       1           0     0    1
                            #       1           0     1    0
                            #       1           0     1    1
                            #       1           1     0    0
                            #       1           1     0    1
                            #       0           1     1    0    # wouldn't map here if SNP, but sequencer read reference
                            #       1           1     1    1
                            # we are interested in P(Mapping | Match), which is equivalent to:
                            # \Sum_{SNP \in {0,1}, SE \in {0,1}} p(SNP) * p(SE) * p(Map|SE,SNP), or:
                            # p(Map|match = 0):
                            #     p(SE) + p(SNP) + p(SE)*p(SNP)
                            # p(Map|match = 1):
                            #    1 - (p(SE) + p(SE)*p(SNP))
                            for b in xrange(len(qualstr)):
                                tup = (b in mismatches, qualstr[b])
                                if tup in match_probs:
                                    prob = match_probs[tup]
                                elif tup[0]:  # mismatch
                                    p_seq_error = 10.**(
                                        (qualstr[b] - qual_offset) / -10.)
                                    prob = p_seq_error + prior_prob_snp + p_seq_error * prior_prob_snp
                                    match_probs[tup] = prob
                                else:  # match
                                    p_seq_error = 10.**(
                                        (qualstr[b] - qual_offset) / -10.)
                                    prob = 1. - (p_seq_error +
                                                 p_seq_error * prior_prob_snp)
                                    match_probs[tup] = prob
                                read_prob *= prob
                            # quick & dirty check-- only looking at last base
                            assert qualstr[
                                b] >= qual_offset  # Specified quality scale yielded a negative phred score!  You probably have the wrong PHRED scale!
                            assert 0. <= read_prob <= 1.  # error with map qualities
                            #raise BaseQualityError("Specified quality scale yielded a negative phred score!  You probably have the wrong PHRED scale!")
                            group_probs_append(read_prob)
                            group_total_prob += read_prob
                        normed_probs = [
                            p / group_total_prob for p in group_probs
                        ]
                    fwtrack.prob_aligns.extend(normed_probs)
                    fwtrack.prior_aligns.extend(normed_probs)
                    fwtrack.enrich_scores.extend([min_score] * len(grouplines))
        fwtrack.total = read_total  # overwrite the running total, counting each read once
        return fwtrack
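
The per-base probability model used for multi-reads above can be condensed into a short, self-contained sketch: a mismatching base contributes p(SE) + p(SNP) + p(SE)*p(SNP), a matching base contributes 1 - (p(SE) + p(SE)*p(SNP)), with p(SE) = 10**(-phred/10); the per-alignment products are then normalized within the group to give the initial alignment probabilities. The helper names, SNP prior and quality string below are made up for illustration:

def p_seq_error(q, qual_offset=33):
    # probability of a sequencing error from a PHRED score stored with an ASCII offset
    return 10. ** ((q - qual_offset) / -10.)

def alignment_prob(quals, mismatches, prior_prob_snp=0.001, qual_offset=33):
    # product over bases of p(map | match or mismatch), as in the example above
    prob = 1.
    mismatches = set(mismatches)
    for b in range(len(quals)):
        pe = p_seq_error(quals[b], qual_offset)
        if b in mismatches:
            prob *= pe + prior_prob_snp + pe * prior_prob_snp
        else:
            prob *= 1. - (pe + pe * prior_prob_snp)
    return prob

# one multi-read with two candidate alignments sharing the same quality string
quals = [ord(c) for c in "IIIIIIIIII"]      # PHRED 40 at every base (Sanger+33)
group = [[], [3, 7]]                        # mismatch positions per alignment
probs = [alignment_prob(quals, mm) for mm in group]
total = sum(probs)
normed = [p / total for p in probs]         # initial alignment probabilities
print(normed)  # the mismatch-free alignment gets nearly all of the probability mass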
Example #7
    def build_fwtrack (self, opt):
        """Build FWTrackII from all lines, return a FWTrackII object.

        Handle multi-reads here by building a probability and enrichment index
        or selecting only one alignment from each multi-read.
        Initial alignment probabilities are set from read/mismatch qualities
        or from a uniform distribution.
        """
        fwtrack = FWTrackII()
        i = 0
        m = 0
        read_total = 0
        recent_tags = []
        random_select_one_multi = opt.random_select_one_multi
        no_multi_reads = opt.no_multi_reads
        min_score = opt.min_score
        prior_prob_snp = opt.prior_prob_snp
        no_prior_prob_map = opt.no_prior_prob_map
        if opt.qual_scale == 'auto':
            opt.qual_scale = self._guess_qual_scale()
        if opt.qual_scale == 'sanger+33':
            qual_offset = 33
        elif opt.qual_scale == 'illumina+64':
            qual_offset = 64
        group_starts_append = fwtrack.group_starts.append
        fwtrack_add_loc = fwtrack.add_loc
        match_probs = {}  # {(True,30): p(mismatch|phred=30), (False,30): p(match|phred=30)}

        for grouplines in self._group_by_name(self.fhd):
            read_total += 1  # in ratios, only count reads, not total alignments
            if len(grouplines) == 1:
                # uniquely mapping reads
                i+=1
                if i == 1000000:
                    m += 1
                    logging.info(" %d alignments read." % (m*1000000))
                    i=0
                chromosome, fpos, strand, qualstr, mismatches = grouplines[0]
                fwtrack_add_loc(chromosome,fpos,strand,0) # 0'th index => unique
            else:
                if no_multi_reads:  # throw away multi-reads
                    fwtrack.total -= 1
                    continue
                elif random_select_one_multi:  # choose one alignment at random
                    i+=1
                    if i == 1000000:
                        m += 1
                        logging.info(" %d alignments read." % (m*1000000))
                        i=0
                    randline = grouplines[random_range(len(grouplines))]
                    chromosome,fpos,strand,qualstr,mismatches = randline
                    fwtrack_add_loc(chromosome,fpos,strand,0)
                else:  # use all alignments probabilistically
                    group_starts_append(fwtrack.total_multi + 1)  # starts at 1 (0 reserved for unique reads)
                    if no_prior_prob_map:
                        # don't use map quality; just assume uniform priors
                        for (chromosome,fpos,strand, qualstr,mismatches) in grouplines:
                            i+=1
                            if i == 1000000:
                                m += 1
                                logging.info(" %d alignments read." % (m*1000000))
                                i=0
                            fwtrack.total_multi += 1
                            fwtrack_add_loc(chromosome,fpos,strand,
                                            fwtrack.total_multi)
                        normed_probs = [1./len(grouplines)] * len(grouplines)
                    else:
                        # TODO: might want to be working in log space-- if many mismatches, we'll lose precision
                        qualstr = grouplines[0][3]  # all quality strings are shared across the group
                        group_total_prob = 0.
                        group_probs = []
                        group_probs_append = group_probs.append
                        for (chromosome,fpos,strand, qualstr, mismatches) in grouplines:
                            i+=1
                            if i == 1000000:
                                m += 1
                                logging.info(" %d alignments read." % (m*1000000))
                                i=0
                            fwtrack.total_multi += 1
                            fwtrack_add_loc(chromosome,fpos,strand,
                                            fwtrack.total_multi)
                            mismatches = set(mismatches)
                            read_prob = 1.
                            # P(SNP) = prior probability a SNP occurs at any base
                            # P(SE) = probability there was a sequencing error (from PHRED)
                            # _P(Map|SNP,SE)__MATCH__SNP__SE_
                            #       0           0     0    0    # can't map here without explanation
                            #       1           0     0    1
                            #       1           0     1    0
                            #       1           0     1    1
                            #       1           1     0    0
                            #       1           1     0    1
                            #       0           1     1    0    # wouldn't map here if SNP, but sequencer read reference
                            #       1           1     1    1
                            # we are interested in P(Mapping | Match), which is equivalent to:
                            # \Sum_{SNP \in {0,1}, SE \in {0,1}} p(SNP) * p(SE) * p(Map|SE,SNP), or:
                            # p(Map|match = 0):
                            #     p(SE) + p(SNP) + p(SE)*p(SNP)
                            # p(Map|match = 1):
                            #    1 - (p(SE) + p(SE)*p(SNP))
                            for b in xrange(len(qualstr)):
                                tup = (b in mismatches,qualstr[b])
                                if tup in match_probs:
                                    prob = match_probs[tup]
                                elif tup[0]:  # mismatch
                                    p_seq_error = 10. ** ((qualstr[b]-qual_offset)/-10.)
                                    prob = p_seq_error + prior_prob_snp + p_seq_error * prior_prob_snp
                                    match_probs[tup] = prob
                                else:  # match
                                    p_seq_error = 10. ** ((qualstr[b]-qual_offset)/-10.)
                                    prob = 1. - (p_seq_error + p_seq_error * prior_prob_snp)
                                    match_probs[tup] = prob
                                read_prob *= prob
                            # quick & dirty check-- only looking at last base
                            assert qualstr[b] >= qual_offset # Specified quality scale yielded a negative phred score!  You probably have the wrong PHRED scale!
                            assert 0.<=read_prob<=1.  # error with map qualities
                            #raise BaseQualityError("Specified quality scale yielded a negative phred score!  You probably have the wrong PHRED scale!")
                            group_probs_append(read_prob)
                            group_total_prob += read_prob
                        normed_probs = [p / group_total_prob for p in group_probs]
                    fwtrack.prob_aligns.extend(normed_probs)
                    fwtrack.prior_aligns.extend(normed_probs)
                    fwtrack.enrich_scores.extend([min_score] * len(grouplines))
        fwtrack.total = read_total  # overwrite the running total, counting each read once
        return fwtrack