예제 #1
0
def bam_innerdist(bam1, bam2, summaryout=None):
    """Tally inner distances and strand orientations for two paired BAM inputs.

    Reads are pulled in lockstep from ``bam1`` and ``bam2``.  Records whose
    query name equals the name processed in the previous round are skipped,
    so each name is compared once per input.  Iteration stops as soon as
    either input is exhausted.

    Args:
        bam1: First input, handed to ``bam_iter``.
        bam2: Second input, handed to ``bam_iter`` with ``quiet=True``.
        summaryout: Optional object with a ``write`` method; receives one
            distance per line for every counted pair.

    Returns:
        A tuple ``(total, proper, mean, stdev, orientation_count)``:
        ``total`` is the number of name-matched pairs seen, ``proper`` the
        number where both reads are mapped to the same reference,
        ``mean``/``stdev`` are whatever ``counts_mean_stdev`` returns for the
        distance histogram, and ``orientation_count`` maps
        ``"<strand1>/<strand2>"`` to the number of counted pairs.

    Raises:
        ValueError: If the current reads of the two inputs have different
            query names.
    """
    iter1 = bam_iter(bam1)
    iter2 = bam_iter(bam2, quiet=True)

    distances = {}
    total = 0
    proper = 0

    orientation_count = {"+/-": 0, "-/+": 0, "+/+": 0, "-/-": 0}

    read1_last = None
    read2_last = None
    read1 = None
    read2 = None

    while True:
        try:
            # The next() builtin works on Python 2.6+ and Python 3, whereas
            # the .next() method only exists on Python 2 iterators.
            while not read1 or read1_last == read1.qname:
                read1 = next(iter1)
            while not read2 or read2_last == read2.qname:
                read2 = next(iter2)
        except StopIteration:
            break

        if read1.qname != read2.qname:
            raise ValueError("Error: BAM files aren't properly paired! (%s, %s)\n" % (read1.qname, read2.qname))

        read1_last = read1.qname
        read2_last = read2.qname

        total += 1

        # Only pairs with both mates mapped to the same reference are measured.
        if read1.is_unmapped or read2.is_unmapped or read1.tid != read2.tid:
            continue

        proper += 1

        # Distance from the end of the leftmost read to the start of the other.
        if read1.pos < read2.pos:
            dist = read2.pos - read1.aend
        else:
            dist = read1.pos - read2.aend

        if summaryout:
            summaryout.write("%s\n" % dist)

        distances[dist] = distances.get(dist, 0) + 1

        orientation = "%s/%s" % ("-" if read1.is_reverse else "+", "-" if read2.is_reverse else "+")
        orientation_count[orientation] += 1

    mean, stdev = counts_mean_stdev(distances)

    return total, proper, mean, stdev, orientation_count
예제 #2
0
def bam_renamepair(infile, outfile, delim='/'):
    """Copy a BAM file, passing every read through ``read_renamepair``.

    Args:
        infile: Path of the BAM file to read.
        outfile: Path of the BAM file to write; it is opened with the input
            file as its template.
        delim: Forwarded unchanged to ``read_renamepair``.
    """
    bam = pysam.Samfile(infile, "rb")
    try:
        out = pysam.Samfile(outfile, "wb", template=bam)
        try:
            for read in bam_iter(bam):
                read_renamepair(read, delim)
                out.write(read)
        finally:
            # Close the output even if a read fails, so the handle is not leaked.
            out.close()
    finally:
        bam.close()
예제 #3
0
def bam_renamepair(infile, outfile, delim='/'):
    """Rewrite ``infile`` to ``outfile`` after applying ``read_renamepair``.

    Every read yielded by ``bam_iter`` is handed to ``read_renamepair``
    together with ``delim`` and then written to the output file.

    Args:
        infile: Path of the source BAM file.
        outfile: Path of the destination BAM file (opened with the source as
            template).
        delim: Delimiter argument forwarded to ``read_renamepair``.
    """
    source = pysam.Samfile(infile, "rb")
    try:
        sink = pysam.Samfile(outfile, "wb", template=source)
        try:
            for read in bam_iter(source):
                read_renamepair(read, delim)
                sink.write(read)
        finally:
            # Guarantee both handles are released on the error path too.
            sink.close()
    finally:
        source.close()
예제 #4
0
def bam_filter(infile, outfile, criteria, failedfile=None, verbose=False):
    """Copy the reads of ``infile`` that pass every criterion to ``outfile``.

    Each read is checked against ``criteria`` in order; the first criterion
    whose ``filter(bamfile, read)`` returns a false value rejects the read.
    A kept/failed summary is written to stdout once all reads are processed.

    Args:
        infile: Path of the BAM file to read.
        outfile: Path of the BAM file to write (opened with the input as
            template).
        criteria: Objects providing ``filter(bamfile, read)`` and ``close()``.
            The collection is iterated several times, so it must be
            re-iterable (e.g. a list).
        failedfile: Optional path; each rejected read is logged there as a
            tab-separated "qname, criterion" line.
        verbose: When true, echo the settings to stderr before starting.
    """
    if verbose:
        sys.stderr.write('Input file  : %s\n' % infile)
        sys.stderr.write('Output file : %s\n' % outfile)
        if failedfile:
            sys.stderr.write('Failed reads: %s\n' % failedfile)
        sys.stderr.write('Criteria:\n')
        for criterion in criteria:
            sys.stderr.write('    %s\n' % criterion)

        sys.stderr.write('\n')

    passed = 0
    failed = 0

    try:
        bamfile = pysam.Samfile(infile, "rb")
        # Separate names for the output path and its handle; the parameter is
        # no longer rebound to a different type.
        bam_out = None
        failed_out = None
        try:
            bam_out = pysam.Samfile(outfile, "wb", template=bamfile)
            if failedfile:
                failed_out = open(failedfile, 'w')

            for read in bam_iter(bamfile, quiet=True):
                for criterion in criteria:
                    if not criterion.filter(bamfile, read):
                        failed += 1
                        if failed_out:
                            failed_out.write('%s\t%s\n' % (read.qname, criterion))
                        break
                else:
                    # No criterion rejected the read.
                    passed += 1
                    bam_out.write(read)
        finally:
            bamfile.close()
            if bam_out is not None:
                bam_out.close()
            if failed_out is not None:
                failed_out.close()

        sys.stdout.write("%s kept\n%s failed\n" % (passed, failed))
    finally:
        # Criteria are released even when filtering aborts with an exception.
        for criterion in criteria:
            criterion.close()
예제 #5
0
def bam_export(bam, mapped=True, unmapped=True, whitelist=None, blacklist=None, fields=None, out=sys.stdout, quiet=False):
    """Export selected reads of ``bam`` through ``export_read``.

    A read is skipped when a non-empty ``whitelist`` does not contain its
    query name, or when a non-empty ``blacklist`` does.  Of the remaining
    reads, mapped ones are exported when ``mapped`` is true and unmapped ones
    when ``unmapped`` is true.  An ``IOError`` raised while exporting ends the
    loop silently.

    Args:
        bam: Input handed to ``bam_iter`` and to ``export_read``.
        mapped: Export mapped reads.
        unmapped: Export unmapped reads.
        whitelist: Optional container of query names to keep.
        blacklist: Optional container of query names to drop.
        fields: Forwarded to ``export_read``.
        out: Forwarded to ``export_read``.
        quiet: Forwarded to ``bam_iter``.
    """
    for aln in bam_iter(bam, quiet=quiet):
        name_rejected = (
            (whitelist and aln.qname not in whitelist)
            or (blacklist and aln.qname in blacklist)
        )
        if name_rejected:
            continue

        try:
            # Pick the flag that governs this read's mapping state.
            selected = unmapped if aln.is_unmapped else mapped
            if selected:
                export_read(bam, aln, fields, out)
        except IOError:
            break
예제 #6
0
def bam_filter(infile, outfile, criteria, failedfile=None, verbose=False):
    """Write the reads of ``infile`` that satisfy all ``criteria`` to ``outfile``.

    Criteria are evaluated in order for every read; evaluation stops at the
    first one whose ``filter(bamfile, read)`` result is false, and that read
    is counted as failed.  After processing, "<n> kept" and "<n> failed"
    lines are written to stdout.

    Args:
        infile: Path of the BAM file to read.
        outfile: Path of the BAM file to create, using the input as template.
        criteria: Re-iterable collection of objects exposing
            ``filter(bamfile, read)`` and ``close()``.
        failedfile: Optional path of a text file that receives one
            tab-separated "qname, criterion" line per rejected read.
        verbose: When true, print the configuration to stderr first.
    """
    if verbose:
        sys.stderr.write('Input file  : %s\n' % infile)
        sys.stderr.write('Output file : %s\n' % outfile)
        if failedfile:
            sys.stderr.write('Failed reads: %s\n' % failedfile)
        sys.stderr.write('Criteria:\n')
        for criterion in criteria:
            sys.stderr.write('    %s\n' % criterion)

        sys.stderr.write('\n')

    passed = 0
    failed = 0

    try:
        bamfile = pysam.Samfile(infile, "rb")
        kept_out = None
        failed_out = None
        try:
            # The output handle gets its own name instead of overwriting the
            # ``outfile`` path parameter.
            kept_out = pysam.Samfile(outfile, "wb", template=bamfile)
            if failedfile:
                failed_out = open(failedfile, 'w')

            for read in bam_iter(bamfile, quiet=True):
                rejected_by = None
                for criterion in criteria:
                    if not criterion.filter(bamfile, read):
                        rejected_by = criterion
                        break

                if rejected_by is None:
                    passed += 1
                    kept_out.write(read)
                    continue

                failed += 1
                if failed_out:
                    failed_out.write('%s\t%s\n' % (read.qname, rejected_by))
        finally:
            # Release every handle that was opened, even after an exception.
            bamfile.close()
            if kept_out is not None:
                kept_out.close()
            if failed_out is not None:
                failed_out.close()

        sys.stdout.write("%s kept\n%s failed\n" % (passed, failed))
    finally:
        for criterion in criteria:
            criterion.close()
예제 #7
0
def bam_cleancigar(infile, outfile):
    """Copy a BAM file, applying ``read_cleancigar`` to every read.

    All reads are written to the output.  Reads for which ``read_cleancigar``
    returns a true value are counted as altered.  A summary with the number
    of reads written and altered goes to stderr.

    Args:
        infile: Path of the BAM file to read.
        outfile: Path of the BAM file to write (opened with the input as
            template).
    """
    total = 0
    count = 0

    bam = pysam.Samfile(infile, "rb")
    try:
        out = pysam.Samfile(outfile, "wb", template=bam)
        try:
            for read in bam_iter(bam):
                if read_cleancigar(read):
                    count += 1

                total += 1
                out.write(read)
        finally:
            # Close handles on the error path as well.
            out.close()
    finally:
        bam.close()

    sys.stderr.write('Wrote: %s reads\nAltered: %s\n' % (total, count))
예제 #8
0
def bam_cleancigar(infile, outfile):
    """Run ``read_cleancigar`` over every read of ``infile`` and save the result.

    Each read is passed to ``read_cleancigar`` and then written to
    ``outfile`` regardless of the outcome; a true return value increments the
    "Altered" counter reported on stderr alongside the total written.

    Args:
        infile: Path of the source BAM file.
        outfile: Path of the destination BAM file, created with the source as
            template.
    """
    written = 0
    altered = 0

    source = pysam.Samfile(infile, "rb")
    try:
        sink = pysam.Samfile(outfile, "wb", template=source)
        try:
            for read in bam_iter(source):
                if read_cleancigar(read):
                    altered += 1
                sink.write(read)
                written += 1
        finally:
            # try/finally keeps the handles from leaking if a read fails.
            sink.close()
    finally:
        source.close()

    sys.stderr.write('Wrote: %s reads\nAltered: %s\n' % (written, altered))
예제 #9
0
파일: split.py 프로젝트: xuwei684/ngsutils
def bam_split(infile,
              out_template,
              read_count=1000000,
              reference=False,
              quiet=False):
    """Split a BAM file into several smaller BAM files.

    In the default mode a new file ``<out_template>.<n>.bam`` (``n`` starting
    at 1) is started whenever the current one holds ``read_count`` reads.
    With ``reference=True`` a new file ``<out_template>.<refname>.bam`` is
    started whenever the reference id changes between consecutive reads;
    reads with a negative reference id are not written anywhere.

    NOTE: in reference mode the output is opened with mode "wb" on every
    change of reference, so a reference that reappears later in the input
    reopens its file from scratch.

    Args:
        infile: Path of the BAM file to split.
        out_template: Prefix of the output file names.
        read_count: Maximum number of reads per file (ignored in reference
            mode).
        reference: Split by reference instead of by read count.
        quiet: Suppress the final summary on stderr.
    """
    bamfile = pysam.Samfile(infile, "rb")
    outfile = None

    file_count = 0
    count = 0
    lastref = -1

    try:
        for read in bam_iter(bamfile):
            if reference:
                # ``lastref`` starts at -1, so leading reads without a
                # reference do not trigger a rollover.
                rollover = lastref != read.tid
            else:
                rollover = outfile is None or count >= read_count

            if rollover:
                if outfile is not None:
                    outfile.close()
                    outfile = None
                count = 0

                if not reference:
                    fname = '%s.%s.bam' % (out_template, file_count + 1)
                elif read.tid >= 0:
                    fname = '%s.%s.bam' % (out_template,
                                           bamfile.getrname(read.tid))
                else:
                    fname = None

                if fname:
                    outfile = pysam.Samfile(fname, "wb", template=bamfile)
                    # Count only files that were actually opened; previously
                    # every read without a reference bumped the counter.
                    file_count += 1

            if outfile is not None:
                outfile.write(read)
                count += 1

            lastref = read.tid
    finally:
        if outfile is not None:
            outfile.close()
        bamfile.close()

    if not quiet:
        sys.stderr.write("Split into %s files" % (file_count))
예제 #10
0
def bam_tofastx(fname, colorspace=False, show_mapped=True,
                show_unmapped=True, fastq=True, read1=True, read2=True,
                proper=False):
    """Write the reads of a BAM file through ``write_fastq`` or ``write_fasta``.

    Nothing happens when both ``show_mapped`` and ``show_unmapped`` are
    ``False``.  Otherwise the file is opened with ``bam_open`` and each read
    is written unless it is excluded by the ``read1``/``read2``/``proper``
    switches, repeats the (name, sequence) of the last written read, or its
    mapping state is not selected.

    Args:
        fname: Passed to ``bam_open``.
        colorspace: Forwarded to the writer function.
        show_mapped: Include mapped reads.
        show_unmapped: Include unmapped reads.
        fastq: Use ``write_fastq`` when true, ``write_fasta`` otherwise.
        read1: When false, skip reads flagged as first in pair.
        read2: When false, skip reads flagged as second in pair.
        proper: When true, skip reads that are not flagged as a proper pair.
    """
    if show_mapped is False and show_unmapped is False:
        return

    sam = bam_open(fname)
    previous = None

    for aln in bam_iter(sam):
        if (not read1 and aln.is_read1) or (not read2 and aln.is_read2):
            continue
        if proper and not aln.is_proper_pair:
            continue

        # Consecutive records with the same name and sequence are emitted once.
        key = (aln.qname, aln.seq)
        if previous == key:
            continue

        wanted = show_unmapped if aln.is_unmapped else show_mapped
        if not wanted:
            continue

        emit = write_fastq if fastq else write_fasta
        emit(aln, colorspace=colorspace)

        previous = key
예제 #11
0
def bam_junction_count(bam, ref=None, start=None, end=None, out=sys.stdout, quiet=False):
    """Count the distinct read names supporting each splice junction.

    For every mapped read, the CIGAR is walked up to its first skipped
    region (op 3, ``N``); reads without one are ignored.  The junction is
    keyed as ``<refname>:<gap start>-<gap end>`` and the read's query name is
    added to that junction's set.  Counts are written to ``out`` as
    tab-separated "junction, count" lines each time the reference id
    changes, and once more after the last read.

    Args:
        bam: Input handed to ``bam_iter``; also provides ``references``.
        ref: Forwarded to ``bam_iter``.
        start: Forwarded to ``bam_iter``.
        end: Forwarded to ``bam_iter``.
        out: Object with a ``write`` method that receives the report.
        quiet: Forwarded to ``bam_iter``.
    """
    def _flush(pending):
        for name in pending:
            out.write('%s\t%s\n' % (name, len(pending[name])))

    last_tid = None
    junctions = {}
    for read in bam_iter(bam, ref=ref, start=start, end=end, quiet=quiet):
        if read.is_unmapped:
            continue

        if read.tid != last_tid:
            # Track the reference on every change.  Updating it only while
            # junctions were pending caused a premature flush right after
            # the first junction was recorded.
            _flush(junctions)
            junctions = {}
            last_tid = read.tid

        pos = read.pos
        gap_end = None  # kept separate from the ``end`` region parameter
        for op, size in read.cigar or ():
            if op == 3:
                # N: skipped region; only the first one per read is used.
                gap_end = pos + size
                break
            if op in (0, 2, 7, 8):
                # M, D, = and X consume reference bases per the SAM
                # specification; insertions and clipping do not.
                pos += size

        if gap_end is None:
            continue

        junction = '%s:%s-%s' % (bam.references[read.tid], pos, gap_end)
        if junction not in junctions:
            junctions[junction] = set()

        junctions[junction].add(read.qname)

    _flush(junctions)
예제 #12
0
def bam_removeclipping(infile, outfile):
    """Copy a BAM file, applying ``read_removeclipping`` to every read.

    Every read is written to the output after the call.  The return code of
    ``read_removeclipping`` drives the statistics reported on stderr: code 1
    is counted under "Unmapped" and code 2 under "Altered".

    Args:
        infile: Path of the BAM file to read.
        outfile: Path of the BAM file to write (opened with the input as
            template).
    """
    total = 0
    count = 0
    unmapped = 0

    bam = pysam.Samfile(infile, "rb")
    try:
        out = pysam.Samfile(outfile, "wb", template=bam)
        try:
            for read in bam_iter(bam):
                code = read_removeclipping(read)

                if code == 1:
                    unmapped += 1
                elif code == 2:
                    count += 1

                total += 1
                out.write(read)
        finally:
            # Close handles even when processing a read raises.
            out.close()
    finally:
        bam.close()

    sys.stderr.write('Wrote: %s reads\nAltered: %s\nUnmapped: %s\n' % (total, count, unmapped))
예제 #13
0
파일: export.py 프로젝트: xuwei684/ngsutils
def bam_export(bam,
               mapped=True,
               unmapped=True,
               whitelist=None,
               blacklist=None,
               fields=None,
               out=sys.stdout,
               quiet=False):
    """Send the selected reads of ``bam`` to ``export_read``.

    Name filtering comes first: with a non-empty ``whitelist`` only listed
    query names continue, and names found in a non-empty ``blacklist`` are
    dropped.  A surviving read is exported when the flag matching its mapping
    state (``mapped`` or ``unmapped``) is true.  The loop ends without error
    if exporting raises ``IOError``.

    Args:
        bam: Input for ``bam_iter``; also passed to ``export_read``.
        mapped: Whether mapped reads are exported.
        unmapped: Whether unmapped reads are exported.
        whitelist: Optional container of query names to allow.
        blacklist: Optional container of query names to exclude.
        fields: Passed through to ``export_read``.
        out: Passed through to ``export_read``.
        quiet: Passed through to ``bam_iter``.
    """
    for record in bam_iter(bam, quiet=quiet):
        if whitelist:
            if record.qname not in whitelist:
                continue
        if blacklist:
            if record.qname in blacklist:
                continue

        try:
            if record.is_unmapped:
                wanted = unmapped
            else:
                wanted = mapped

            if wanted:
                export_read(bam, record, fields, out)
        except IOError:
            break
예제 #14
0
    def get_counts(self, bam, ref=None, start=None, end=None, quiet=False):
        """Feed the reads of ``bam`` into the per-position counter.

        Unmapped reads are skipped, as are reads on the opposite strand when
        ``self.strand`` is '+' or '-'.  Whenever the reference id differs
        from ``self.cur_tid``, pending counts are flushed (if any) and a
        zero-filled unsigned-int array sized from ``bam.lengths`` is started.
        Each accepted read goes to ``self._add_read``; ``self.flush()`` is
        called once more after the last read.

        Args:
            bam: Input for ``bam_iter``; must provide ``lengths`` and
                ``references``.
            ref: Forwarded to ``bam_iter``.
            start: Forwarded to ``bam_iter``.
            end: Forwarded to ``bam_iter``.
            quiet: Forwarded to ``bam_iter``.
        """
        def _progress(aln):
            # Progress label: current chromosome, read position, buffer size.
            size = len(self.pos_counts) if self.pos_counts else 0
            return '%s:%s (%s)' % (self.cur_chrom, aln.pos, size)

        reads = bam_iter(bam, ref=ref, start=start, end=end, quiet=quiet, callback=_progress)
        for aln in reads:
            if aln.is_unmapped:
                continue

            wrong_strand = (
                (self.strand == '+' and aln.is_reverse)
                or (self.strand == '-' and not aln.is_reverse)
            )
            if self.strand and wrong_strand:
                continue

            if self.cur_tid is None or self.cur_tid != aln.tid:
                if self.pos_counts:
                    self.flush()

                ref_length = bam.lengths[aln.tid]
                self.pos_counts = array('I', [0] * ref_length)

                self.cur_tid = aln.tid
                self.cur_chrom = bam.references[aln.tid]

            self._add_read(aln)

        self.flush()
예제 #15
0
def bam_split(infile, out_template, read_count=1000000, reference=False, quiet=False):
    """Break a BAM file up into multiple output BAM files.

    By default reads are written in chunks of ``read_count`` to files named
    ``<out_template>.<n>.bam`` with ``n`` counting from 1.  When
    ``reference`` is true, a file named ``<out_template>.<refname>.bam`` is
    opened each time the reference id differs from the previous read's;
    reads with a negative reference id are skipped.

    NOTE: each reference change opens its file with mode "wb", so if the
    same reference shows up again later its file is started over.

    Args:
        infile: Path of the BAM file to split.
        out_template: Prefix used to build the output file names.
        read_count: Reads per output file in the default mode.
        reference: Split on reference changes instead of read count.
        quiet: When true, do not report the number of files on stderr.
    """
    bamfile = pysam.Samfile(infile, "rb")
    outfile = None

    file_count = 0
    count = 0
    lastref = -1

    try:
        for read in bam_iter(bamfile):
            if reference:
                # Initial ``lastref`` of -1 means reads without a reference
                # at the start of the input never force a new file.
                need_new_file = read.tid != lastref
            else:
                need_new_file = outfile is None or count >= read_count

            if need_new_file:
                if outfile is not None:
                    outfile.close()
                    outfile = None
                count = 0

                fname = None
                if reference:
                    if read.tid >= 0:
                        fname = '%s.%s.bam' % (out_template, bamfile.getrname(read.tid))
                else:
                    fname = '%s.%s.bam' % (out_template, file_count + 1)

                if fname:
                    outfile = pysam.Samfile(fname, "wb", template=bamfile)
                    # Only real files are counted.  The counter used to be
                    # incremented for every read that had no reference.
                    file_count += 1

            if outfile is not None:
                outfile.write(read)
                count += 1

            lastref = read.tid
    finally:
        # Release both handles on success and on error.
        if outfile is not None:
            outfile.close()
        bamfile.close()

    if not quiet:
        sys.stderr.write("Split into %s files" % (file_count))
예제 #16
0
파일: tofastq.py 프로젝트: erlevy/ngsutils
def bam_tofastx(fname, colorspace=False, show_mapped=True, show_unmapped=True, fastq=True, read1=True, read2=True, proper=False):
    """Convert the reads of a BAM file to FASTQ or FASTA records.

    Returns immediately when ``show_mapped`` and ``show_unmapped`` are both
    ``False``.  Otherwise reads from ``bam_open(fname)`` are filtered by the
    pair-member switches, the proper-pair switch and their mapping state,
    and a read whose (name, sequence) equals that of the last written read
    is dropped.  Accepted reads go to ``write_fastq`` or ``write_fasta``.

    Args:
        fname: Passed to ``bam_open``.
        colorspace: Forwarded to the writer.
        show_mapped: Emit mapped reads.
        show_unmapped: Emit unmapped reads.
        fastq: Choose ``write_fastq`` (true) or ``write_fasta`` (false).
        read1: Emit reads flagged as first in pair.
        read2: Emit reads flagged as second in pair.
        proper: Restrict output to reads flagged as a proper pair.
    """
    if show_mapped is False and show_unmapped is False:
        return

    handle = bam_open(fname)

    last_written = None

    for record in bam_iter(handle):
        skip_member = False
        if not read1 and record.is_read1:
            skip_member = True
        elif not read2 and record.is_read2:
            skip_member = True
        if skip_member:
            continue

        if proper and not record.is_proper_pair:
            continue

        identity = (record.qname, record.seq)
        if last_written == identity:
            continue

        # Choose the switch that applies to this read's mapping state.
        if record.is_unmapped:
            visible = show_unmapped
        else:
            visible = show_mapped
        if not visible:
            continue

        if fastq:
            write_fastq(record, colorspace=colorspace)
        else:
            write_fasta(record, colorspace=colorspace)

        last_written = identity
예제 #17
0
파일: pcrdup.py 프로젝트: xuwei684/ngsutils
def pcrdup_mark(inbam, outbam, fragment=False, countfile=None):
    """Scan ``inbam`` for reads sharing a start position and report duplicates.

    Reads are buffered per start position ``(tid, pos)`` and grouped by a
    duplicate key: ``(tid, pos, '')`` in fragment mode, otherwise
    ``(tid, pos, isize)``.  The first read of a key counts as unique, later
    ones as duplicates.  Buffers are handed to ``__flush_cur_reads`` whenever
    the start position changes, when an unmapped read is met, and at the end.
    Totals are written to stdout.

    Args:
        inbam: Input handed to ``bam_iter`` and to ``__flush_cur_reads``.
        outbam: Optional output with a ``write`` method; unmapped reads and,
            in paired mode, reads that are unpaired, improperly paired, have
            an unmapped mate or a negative ``isize`` are written directly.
        fragment: Ignore pairing and key duplicates on position only.
        countfile: Forwarded to ``__flush_cur_reads``.
    """
    cur_pos = None
    cur_reads = {}
    idx = 0

    total = 0
    unique = 0
    duplicates = 0

    # Names of reads marked as duplicates, so the mate can be flagged later.
    dup_list = set()

    def callback(read):
        return '%s, %s, %s - %s' % (total, unique, duplicates, read.qname)

    # Iterate the handle that was passed in.  The loop used to read an
    # undefined ``bamfile`` name, which only worked if the caller happened to
    # define a module-level variable with that name.
    for read in bam_iter(inbam, callback=callback):
        if not read.is_paired or read.is_read1:
            total += 1

        if read.is_unmapped:
            # NOTE(review): ``cur_reads`` is not reset here; whether buffered
            # reads can be flushed twice depends on __flush_cur_reads —
            # confirm against that helper.
            __flush_cur_reads(cur_reads, outbam, inbam, countfile)
            if outbam:
                outbam.write(read)
            continue

        start_pos = (read.tid, read.pos)

        if fragment:
            dup_pos = (read.tid, read.pos, '')
        else:
            # isize is the insert length, which if this is the first read, will
            # be the right most part of the second read. If the ends of the reads
            # are trimmed for QC reasons, only the 5' pos of the first read and the 3'
            # pos of the second read will be accurate.
            dup_pos = (read.tid, read.pos, read.isize)

        if not cur_pos or start_pos != cur_pos:
            __flush_cur_reads(cur_reads, outbam, inbam, countfile)

            cur_pos = start_pos
            cur_reads = {}
            idx = 0

        if not fragment and (read.mate_is_unmapped or not read.is_paired or not read.is_proper_pair or read.isize < 0):
            # this is a paired file, but the mate isn't paired or proper or mapped
            # just write it out, no flags to set.

            if read.qname in dup_list:
                read.is_duplicate = True
                dup_list.remove(read.qname)

            if outbam:
                outbam.write(read)
        elif dup_pos in cur_reads:
            duplicates += 1
            if not fragment:
                dup_list.add(read.qname)
            cur_reads[dup_pos].append((read.mapq, -idx, read))
        else:
            unique += 1
            cur_reads[dup_pos] = [(read.mapq, -idx, read)]

        idx += 1

    __flush_cur_reads(cur_reads, outbam, inbam, countfile)

    sys.stdout.write('Total reads:\t%s\n' % total)
    sys.stdout.write('Unique reads:\t%s\n' % unique)
    sys.stdout.write('PCR duplicates:\t%s\n' % duplicates)
예제 #18
0
 def _gen2():
     """Yield every read of ``self.bam``, supplying a progress callback.

     ``self`` comes from the enclosing scope.
     """
     def _progress(aln):
         # Label: read location, buffer size and the current region.
         chrom = self.bam.getrname(aln.tid)
         return '%s:%s (%s) %s:%s-%s' % (chrom, aln.pos, len(self.buffer), self.cur_chrom, self.cur_start, self.cur_end)

     reads = bam_iter(self.bam, quiet=self.quiet, callback=_progress)
     for aln in reads:
         yield aln
예제 #19
0
def bam_tobed(fname, out=sys.stdout):
    """Pass every read of a BAM file to ``write_read`` with its reference name.

    Args:
        fname: Path of the BAM file to read.
        out: Forwarded to ``write_read`` as the output target.
    """
    bamfile = pysam.Samfile(fname, "rb")
    try:
        for read in bam_iter(bamfile):
            # ``tid`` is the reference id; ``rname`` is pysam's deprecated
            # alias for the same value.
            write_read(read, bamfile.getrname(read.tid), out)
    finally:
        # Close the input even if writing fails part-way.
        bamfile.close()
예제 #20
0
파일: tag.py 프로젝트: erlevy/ngsutils
    def filter(self):
        """Yield every read of ``self.bamfile``, then close the file.

        The file is closed when iteration finishes, when it raises, and when
        the generator is closed before being exhausted.
        """
        try:
            for read in bam_iter(self.bamfile):
                yield read
        finally:
            # Previously the close was skipped unless the generator ran to
            # completion.
            self.bamfile.close()
예제 #21
0
파일: tag.py 프로젝트: xuwei684/ngsutils
 def filter(self, bam):
     """Yield each read that ``bam_iter`` produces for ``bam``, unchanged."""
     reads = bam_iter(bam)
     for aln in reads:
         yield aln
예제 #22
0
파일: stats.py 프로젝트: hjanime/ngsutils
 def _foo2():
     """Yield every read of ``bamfile`` (a name from the enclosing scope)."""
     source = bam_iter(bamfile)
     for record in source:
         yield record
예제 #23
0
 def _foo2():
     """Generator that re-yields the reads ``bam_iter`` returns for ``bamfile``.

     ``bamfile`` is resolved from the surrounding scope.
     """
     for aln in bam_iter(bamfile):
         yield aln
예제 #24
0
파일: basecall.py 프로젝트: erlevy/ngsutils
 def _gen2():
     """Generator over the reads of ``self.bam`` with progress reporting.

     ``self`` is resolved from the surrounding scope; the nested function is
     given to ``bam_iter`` as its ``callback``.
     """
     def describe(record):
         # Read location, buffer length, then the region being processed.
         fields = (
             self.bam.getrname(record.tid),
             record.pos,
             len(self.buffer),
             self.cur_chrom,
             self.cur_start,
             self.cur_end,
         )
         return '%s:%s (%s) %s:%s-%s' % fields

     for record in bam_iter(self.bam, quiet=self.quiet, callback=describe):
         yield record
예제 #25
0
파일: tag.py 프로젝트: ZhangQiuxue/ngsutils
 def filter(self, bam):
     """Pass-through generator: re-yield the reads of ``bam`` from ``bam_iter``."""
     for record in bam_iter(bam):
         yield record
예제 #26
0
def bam_innerdist(bam1, bam2, summaryout=None):
    """Measure the inner distance between reads paired across two BAM inputs.

    The two inputs are advanced together.  A record is skipped while its
    query name equals the one handled in the previous round, and the loop
    ends when either input runs out.  For each pair with both reads mapped to
    the same reference, the gap between the end of the leftmost read and the
    start of the other is added to a histogram and the strand combination is
    counted.

    Args:
        bam1: First input for ``bam_iter``.
        bam2: Second input for ``bam_iter`` (iterated with ``quiet=True``).
        summaryout: Optional writable object; gets each distance on its own
            line.

    Returns:
        ``(total, proper, mean, stdev, orientation_count)`` where ``total``
        counts all name-matched pairs, ``proper`` those that were measured,
        ``mean`` and ``stdev`` are the values returned by
        ``counts_mean_stdev`` for the histogram, and ``orientation_count``
        maps ``'<strand1>/<strand2>'`` to a count.

    Raises:
        ValueError: If the two inputs present different query names.
    """
    iter1 = bam_iter(bam1)
    iter2 = bam_iter(bam2, quiet=True)

    distances = {}
    total = 0
    proper = 0

    orientation_count = {
        '+/-': 0,
        '-/+': 0,
        '+/+': 0,
        '-/-': 0,
    }

    read1_last = None
    read2_last = None
    read1 = None
    read2 = None

    while True:
        try:
            # Use the next() builtin rather than the Python 2-only .next()
            # method so the loop runs on Python 2.6+ and Python 3.
            while not read1 or read1_last == read1.qname:
                read1 = next(iter1)
            while not read2 or read2_last == read2.qname:
                read2 = next(iter2)
        except StopIteration:
            break

        if read1.qname != read2.qname:
            raise ValueError(
                "Error: BAM files aren't properly paired! (%s, %s)\n" %
                (read1.qname, read2.qname))

        read1_last = read1.qname
        read2_last = read2.qname

        total += 1

        # Skip pairs that cannot be measured on a single reference.
        if read1.is_unmapped or read2.is_unmapped or read1.tid != read2.tid:
            continue

        proper += 1

        if read1.pos < read2.pos:
            dist = read2.pos - read1.aend
        else:
            dist = read1.pos - read2.aend

        if summaryout:
            summaryout.write('%s\n' % dist)

        distances[dist] = distances.get(dist, 0) + 1

        orientation = '%s/%s' % ('-' if read1.is_reverse else '+',
                                 '-' if read2.is_reverse else '+')

        orientation_count[orientation] += 1

    mean, stdev = counts_mean_stdev(distances)

    return total, proper, mean, stdev, orientation_count
예제 #27
0
파일: tobed.py 프로젝트: xuwei684/ngsutils
def bam_tobed(fname, out=sys.stdout):
    """Call ``write_read`` for every read of ``fname`` with its reference name.

    Args:
        fname: Path of the BAM file to read.
        out: Output target forwarded to ``write_read``.
    """
    bamfile = pysam.Samfile(fname, "rb")
    try:
        for read in bam_iter(bamfile):
            # Look the name up via ``tid``; ``rname`` is a deprecated pysam
            # alias for the same reference id.
            refname = bamfile.getrname(read.tid)
            write_read(read, refname, out)
    finally:
        # The handle is released even when a read cannot be written.
        bamfile.close()