Exemplo n.º 1
0
 def __init__(self, fname=None, fileobj=None):
     """Open the BED source backing this model.

     A truthy ``fileobj`` takes precedence over ``fname``; in that case
     ``self.fname`` is set to the placeholder ``'*fileobj*'``.
     ``Model.__init__`` is invoked once the BED source is in place.
     """
     if not fileobj:
         self.bed = BedFile(fname)
         self.fname = fname
     else:
         self.bed = BedFile(fileobj=fileobj)
         self.fname = '*fileobj*'
     Model.__init__(self)
Exemplo n.º 2
0
    def __init__(self, fname, nostrand=None):
        """Open the BED file *fname* for this filter.

        Strand-agnostic mode is switched on only when *nostrand* is exactly
        the string ``'nostrand'``; any other value leaves it off.
        """
        self.fname = fname
        self.nostrand = (nostrand == 'nostrand')
        self.regions = {}  # BED regions keyed by (chrom, bin)
        self.bed = BedFile(fname)
Exemplo n.º 3
0
def bam_extract(inbam, outbam, bedfile, nostrand=False, quiet=False):
    """Copy the reads of *inbam* that fall in the regions of *bedfile* to *outbam*.

    Regions whose chromosome is not listed in ``inbam.references`` are
    skipped.  The region strand is passed to ``bam_extract_reads`` unless
    *nostrand* is set, in which case ``None`` is passed instead.  Unless
    *quiet* is set, progress is reported through an ``ETA`` object and the
    number of extracted reads is written to stderr at the end.
    """
    bed = BedFile(bedfile)
    # Progress is tracked against the size of the BED file on disk.
    eta = None if quiet else ETA(os.stat(bedfile).st_size, fileobj=bed)

    passed = 0

    for region in bed:
        if eta:
            eta.print_status(extra="extracted:%s" % (passed))

        if region.chrom not in inbam.references:
            continue

        strand = None if nostrand else region.strand

        matching = bam_extract_reads(inbam, region.chrom, region.start,
                                     region.end, strand)
        for read in matching:
            outbam.write(read)
            passed += 1

    if not quiet:
        eta.done()
        sys.stderr.write("%s extracted\n" % (passed, ))
Exemplo n.º 4
0
    def __init__(self, fname, nostrand=None):
        """Open the BED file *fname* for this filter.

        Strand-agnostic mode is switched on only when *nostrand* is exactly
        the string ``'nostrand'``; any other value leaves it off.
        """
        self.fname = fname
        self.nostrand = (nostrand == 'nostrand')
        self.regions = {}  # BED regions keyed by (chrom, bin)
        self.bed = BedFile(fname)
Exemplo n.º 5
0
    def testBaseCallRegionStrand(self):
        #  confirm that BED strand *isn't* used
        bam = MockBam(['test2'])
        bam.add_read('foo1', 'atcgatcg', '........', 0, 0, cigar='8M')
        bam.add_read('foo2', 'atcgatcg', 'AAAAAAAA', 0, 4, cigar='8M')
        bam.add_read('foo3',
                     'accgatcg',
                     '########',
                     0,
                     4,
                     cigar='8M',
                     is_reverse=True)
        bam.add_read('foo4', 'atcgactgatcg', '############', 0, 0, cigar='12M')

        bed = BedFile(fileobj=StringIO.StringIO('''\
test2|4|7|foo|1|-
'''.replace('|', '\t')))

        #  remember: bed is 0-based, basecall is 1-based, so 4->7, corresponds to 5->8

        out = StringIO.StringIO('')
        ngsutils.bam.basecall.bam_basecall(bam,
                                           os.path.join(
                                               os.path.dirname(__file__),
                                               'test.fa'),
                                           showstrand=True,
                                           regions=bed,
                                           out=out)

        valid = '''chrom|pos|ref|count|consensus call|minor call|ave mappings|entropy|A|C|G|T|N|Deletions|Gaps|Insertions|Inserts|+ strand %|A minor %|C minor %|G minor %|T minor %|N minor %|Deletion minor %|Insertion minor %
test2|5|A|4|A||1.0|1.87433193885|4|0|0|0|0|0|0|0||0.75|0.25|0.0|0.0|0.0|0.0|0.0|0.0
test2|6|T|4|T/C||1.0|2.94335833285|0|2|0|2|0|0|0|0||0.75|0.0|0.5|0.0|0.0|0.0|0.0|0.0
test2|7|C|4|C|T|1.0|3.45990045591|0|3|0|1|0|0|0|0||0.75|0.0|0.333333333333|0.0|0.0|0.0|0.0|0.0
test2|8|G|4|G||1.0|1.87433193885|0|0|4|0|0|0|0|0||0.75|0.0|0.0|0.25|0.0|0.0|0.0|0.0
'''.replace('|', '\t')

        self.assertEqual(valid, out.getvalue())
Exemplo n.º 6
0
    bed = None
    ref = None
    stranded = True
    include_name = False

    last = None
    for arg in sys.argv[1:]:
        if last == '-min':
            min_size = int(arg)
            last = None
        elif arg in ['-min']:
            last = arg
        elif arg == '-name':
            include_name = True
        elif arg == '-ns':
            stranded = False
        elif not bed and os.path.exists(arg):
            bed = arg
        elif not ref and os.path.exists(arg):
            ref = arg

    if not bed or not ref:
        usage()
        sys.exit(1)

    bed_tofasta(BedFile(bed),
                ref,
                min_size=min_size,
                stranded=stranded,
                include_name=include_name)
Exemplo n.º 7
0
#!/usr/bin/env python
'''
Tests for bedutils reduce
'''

import unittest
import StringIO

import ngsutils.bed.reduce
from ngsutils.bed import BedFile
from ngsutils.bam.t import _matches

# Module-level BED input for the tests below: five records on chr1, written
# with '|' as the column separator and converted to tabs before parsing.
bedtest = BedFile(fileobj=StringIO.StringIO('''\
chr1|100|150|foo1|10|+
chr1|140|200|foo2|10|+
chr1|500|550|foo3|10|+
chr1|560|600|foo4|10|-
chr1|600|700|foo5|10|+
'''.replace('|', '\t')))


class ReduceTest(unittest.TestCase):
    def testReduce(self):
        valids = '''\
chr1|100|200|foo1,foo2|20|+
chr1|500|550|foo3|10|+
chr1|560|700|foo4,foo5|20|+
'''.replace('|', '\t').split('\n')

        out = StringIO.StringIO('')
        ngsutils.bed.reduce.bed_reduce(bedtest,
Exemplo n.º 8
0
if __name__ == '__main__':
    # Defaults; overridden by the command-line switches parsed below.
    qbed_fname = None
    refbed_fname = None
    maxdist = 100000
    match = False
    nostrand = False

    # Name of a switch that is still waiting for its value, if any.
    last = None

    for arg in sys.argv[1:]:
        if last == '-max':
            maxdist = int(arg)
            last = None
        elif arg == '-max':
            last = arg
        elif arg == '-match':
            match = True
        elif arg == '-nostrand':
            nostrand = True
        elif not qbed_fname and (arg == '-' or os.path.exists(arg)):
            # First positional argument: query BED ('-' is accepted here).
            qbed_fname = arg
        elif not refbed_fname and os.path.exists(arg):
            # Second positional argument: reference BED (must exist on disk).
            refbed_fname = arg

    if not (qbed_fname and refbed_fname):
        usage()

    qbed = BedFile(qbed_fname)
    refbed = BedFile(refbed_fname)
    find_nearest(qbed, refbed, maxdist, match, nostrand)
Exemplo n.º 9
0
 if last == '-qual':
     min_qual = int(arg)
     last = None
 elif last == '-ref':
     if os.path.exists(arg) and os.path.exists('%s.fai' % arg):
         ref = arg
     else:
         print "Missing FASTA file or index: %s" % arg
         usage()
     last = None
 elif last == '-count':
     min_count = int(arg)
     last = None
 elif last == '-bed':
     if os.path.exists(arg):
         regions = BedFile(arg)
     else:
         print "BED file: %s not found!" % arg
         usage()
     last = None
 elif last == '-mask':
     mask = int(arg)
     last = None
 elif last == '-minorpct':
     minorpct = float(arg)
     last = None
 elif last == '-profile':
     profile = arg
     last = None
 elif arg == '-h':
     usage()
Exemplo n.º 10
0
def usage():
    """Print the module docstring and the usage line, then exit with status 1."""
    # A single parenthesised argument prints identically under the
    # Python 2 print statement used by this file.
    print(__doc__)
    print("""\
Usage: bedutils sizes bedfile

""")
    sys.exit(1)


def bed_size(bed, out=sys.stdout):
    """Write the size (``end - start``) of every region in *bed* to *out*.

    One size is written per line, in iteration order.
    """
    for region in bed:
        length = region.end - region.start
        out.write('%s\n' % length)


if __name__ == '__main__':
    # The only positional argument: an existing BED file or '-'.
    fname = None

    for arg in sys.argv[1:]:
        if arg == '-h':
            usage()
        if fname or not (arg == '-' or os.path.exists(arg)):
            # Either a second positional argument or something unrecognised.
            print("Unknown option: %s" % arg)
            usage()
        else:
            fname = arg

    if not fname:
        usage()

    bed_size(BedFile(fname))
Exemplo n.º 11
0
                out.write('%s\n' % '\t'.join(cols))


if __name__ == '__main__':
    # Two positional arguments are required; each may be a path or '-'.
    fname1 = None
    fname2 = None
    stranded = True

    for arg in sys.argv[1:]:
        if arg == '-h':
            usage()
        elif arg == '-nostrand':
            stranded = False
        elif not fname1 and (arg == '-' or os.path.exists(arg)):
            fname1 = arg
        elif not fname2 and (arg == '-' or os.path.exists(arg)):
            fname2 = arg
        else:
            print("Unknown option: %s" % arg)
            usage()

    # Both inputs are mandatory.  The previous test (`not fname1 or fname2`)
    # was inverted for the second file: it rejected every invocation that
    # supplied both files and accepted one that omitted the second.
    if not fname1 or not fname2:
        usage()

    if fname1 == '-' and fname2 == '-':
        usage("Both input files can't be from stdin!")

    bed2 = BedFile(fname2)
    # NOTE(review): the first input is handed over as a file name rather
    # than an opened BED object -- confirm this matches bed_subtract().
    bed_subtract(fname1, bed2, stranded)
    bed2.close()
Exemplo n.º 12
0
if __name__ == '__main__':
    # Two positional arguments are required; each may be a path or '-'.
    fname1 = None
    fname2 = None
    stranded = True

    for arg in sys.argv[1:]:
        if arg == '-h':
            usage()
        elif arg == '-nostrand':
            stranded = False
        elif not fname1 and (os.path.exists(arg) or arg == '-'):
            fname1 = arg
        elif not fname2 and (os.path.exists(arg) or arg == '-'):
            fname2 = arg
        else:
            print("Unknown option: %s" % arg)
            usage()

    if not (fname1 and fname2):
        usage()

    # Only one of the two inputs can be read from stdin.
    if fname1 == '-' and fname2 == '-':
        usage("Both input files can't be from stdin!")

    # The first input is streamed; the second is opened as a BedFile.
    bed1 = BedStreamer(fname1)
    bed2 = BedFile(fname2)

    bed_subtract(bed1, bed2, stranded)

    bed2.close()
Exemplo n.º 13
0
        if arg == '-h':
            usage()
        if last == '-name':
            name = arg
            last = None
        elif last == '-score':
            score = arg
            last = None
        elif last == '-strand':
            strand = arg
            last = None
        elif last == '-rgb':
            if not rgb_name:
                rgb_name = arg
            else:
                rgb[rgb_name] = arg
                rgb_name = None
                last = None
        elif arg in ['-name', '-score', '-strand', '-rgb']:
            last = arg
        elif not fname and (os.path.exists(arg) or arg == '-'):
            fname = arg
        else:
            print "Unknown option: %s" % arg
            usage()

    if not fname:
        usage()

    bed_annotate(BedFile(fname), name=name, score=score, strand=strand, rgb=rgb)
Exemplo n.º 14
0
#!/usr/bin/env python
'''
Tests for bedutils extend
'''

import unittest
import StringIO

from ngsutils.bed import BedFile
import ngsutils.bed.extend

# Module-level BED input for the tests below; '|' stands in for the tab
# separator.  The literal opens with a newline (no trailing backslash after
# the quotes), so the first line handed to BedFile is empty.
testbed = BedFile(fileobj=StringIO.StringIO('''
chr1|10|90|foo|1|+
chr1|10|90|foo|1|-
chr1|100|150|foo|1|+
chr1|200|250|foo|1|-
'''.replace('|', '\t')))


class ExtendTest(unittest.TestCase):
    """Checks for ``ngsutils.bed.extend.bed_extend``."""

    def testAbsolute(self):
        """Extend the shared regions by 100 with relative=False and compare."""
        expected = '''chr1|10|110|foo|1|+
chr1|0|90|foo|1|-
chr1|100|200|foo|1|+
chr1|150|250|foo|1|-
'''.replace('|', '\t')

        buf = StringIO.StringIO('')
        ngsutils.bed.extend.bed_extend(testbed, 100, relative=False, out=buf)
        actual = buf.getvalue()
        self.assertEqual(actual, expected)
Exemplo n.º 15
0
#!/usr/bin/env python
'''
Tests for bedutils tobedgraph
'''

import unittest

import ngsutils.bed.tobedgraph
import StringIO
from ngsutils.bed import BedFile

# Module-level BED input for the test below: four records on test1, written
# with '|' as the column separator and converted to tabs before parsing.
bedtest = BedFile(fileobj=StringIO.StringIO('''\
test1|10|20|foo1|10|+
test1|10|20|foo2|10|-
test1|15|25|foo3|10|+
test1|100|150|foo4|1|+
'''.replace('|', '\t')))


class BedGraphTest(unittest.TestCase):
    """Checks for ``ngsutils.bed.tobedgraph.bed_tobedgraph``."""

    def testBedGraph(self):
        """Convert the shared BED input and compare the bedgraph text."""
        expected = '''\
test1|10|15|2
test1|15|20|3
test1|20|25|1
test1|100|150|1
'''.replace('|', '\t')
        buf = StringIO.StringIO("")
        ngsutils.bed.tobedgraph.bed_tobedgraph(bedtest, out=buf)
        actual = buf.getvalue()
        self.assertEqual(expected, actual)
        buf.close()
Exemplo n.º 16
0
                present += 1

        out.write('\t%s\n' % present)


if __name__ == '__main__':
    refname = None
    bedfiles = []
    stranded = True
    last = None

    for arg in sys.argv[1:]:
        if arg == '-h':
            usage()
        elif arg == '-ns':
            stranded = False
        elif os.path.exists(arg):
            # The first existing path is the reference BED; every later
            # one is opened as an additional BED file to count against it.
            if not refname:
                refname = arg
            else:
                bedfiles.append(BedFile(arg))
        else:
            print("Bad argument: %s" % arg)
            usage()
            sys.exit(1)

    if not (refname and bedfiles):
        usage()
        sys.exit(1)

    bed_refcount(BedFile(refname), bedfiles, stranded=stranded)
Exemplo n.º 17
0

if __name__ == "__main__":
    bed = None
    strand = None
    norm = None

    # Name of a switch that is still waiting for its value, if any.
    last = None
    for arg in sys.argv[1:]:
        if arg == '-h':
            usage()
        if last == '-norm':
            norm = float(arg)
            last = None
        elif arg == '-norm':
            last = arg
        elif arg == '-plus':
            strand = '+'
        elif arg == '-minus':
            strand = '-'
        elif not bed and os.path.exists(arg):
            bed = arg
        else:
            print("Unknown option or missing index: %s" % arg)
            usage()

    if not bed:
        usage()

    bed_tobedgraph(BedFile(bed), strand, norm)
Exemplo n.º 18
0
#!/usr/bin/env python
'''
Tests for bedutils refcount
'''

import unittest
import StringIO

import ngsutils.bed.refcount
from ngsutils.bed import BedFile

# Module-level BED inputs for the refcount tests.  Each BedFile is given a
# name as its first argument and reads its records from an in-memory stream;
# '|' stands in for the tab separator.
bedtest1 = BedFile('test1',
                   fileobj=StringIO.StringIO('''\
chr1|100|150|foo1|10|+
chr1|140|200|foo2|10|+
chr1|500|550|foo3|10|+
chr1|560|600|foo4|10|-
chr1|600|700|foo5|10|+
'''.replace('|', '\t')))

# Second input: regions on chr1 named bar1..bar5.
bedtest2 = BedFile('test2',
                   fileobj=StringIO.StringIO('''\
chr1|90|120|bar1|10|+
chr1|140|200|bar2|10|+
chr1|510|520|bar3|10|+
chr1|500|660|bar4|10|-
chr1|520|570|bar5|10|+
'''.replace('|', '\t')))

bedtest3 = BedFile('test3',
                   fileobj=StringIO.StringIO('''\
Exemplo n.º 19
0
def bed_reduce(target_bed, query_bed, stranded=True, exact=False, out=sys.stdout):
    """Write each query region that has a matching region in *target_bed*.

    For every region of *query_bed*, ``target_bed.fetch`` is asked for the
    regions at the same chromosome and coordinates (and strand, when
    *stranded* is true; ``None`` otherwise).  The query region is written to
    *out* at most once: on the first fetched region when *exact* is false,
    or on the first fetched region with identical start and end when
    *exact* is true.
    """
    for qregion in query_bed:
        strand = qregion.strand if stranded else None
        candidates = target_bed.fetch(qregion.chrom, qregion.start,
                                      qregion.end, strand)
        for tregion in candidates:
            if exact and (qregion.start != tregion.start
                          or qregion.end != tregion.end):
                continue
            qregion.write(out)
            break


if __name__ == '__main__':
    # Positional arguments, in order: target BED, then query BED.
    fnames = []
    stranded = True
    exact = False

    for arg in sys.argv[1:]:
        if arg == '-h':
            usage()
        if arg == '-nostrand':
            stranded = False
        elif arg == '-exact':
            exact = True
        elif os.path.exists(arg):
            fnames.append(arg)
        else:
            print("Unknown option: %s" % arg)
            usage()

    if len(fnames) < 2:
        usage()

    # The target is opened as a BedFile; the query is streamed.
    bed_reduce(BedFile(fnames[0]), BedStreamer(fnames[1]), stranded, exact)
Exemplo n.º 20
0
class ExcludeBED(object):
    """Read filter that rejects mapped reads overlapping a BED region.

    ``filter`` returns False for a read that should be dropped and True for
    a read that should be kept.
    """

    def __init__(self, fname, nostrand=None):
        """Open the BED file *fname*.

        Strand-agnostic matching is enabled only when *nostrand* is exactly
        the string ``'nostrand'``.
        """
        # Left over from the earlier binned implementation; nothing in this
        # class reads it any more, but the attribute is kept for callers.
        self.regions = {}
        self.fname = fname
        self.nostrand = (nostrand == 'nostrand')
        self.bed = BedFile(fname)

    def filter(self, bam, read):
        """Return False if *read* overlaps any BED region, True otherwise.

        Unmapped reads have no position to compare against the regions, so
        they are kept.
        """
        if read.is_unmapped:
            # This path used to fall off the end of the method and return
            # None (falsy) instead of an explicit verdict; the previous
            # bin-based implementation returned True here.
            return True

        if self.nostrand:
            strand = None
        elif read.is_reverse:
            strand = '-'
        else:
            strand = '+'

        chrom = bam.getrname(read.tid)
        for _ in self.bed.fetch(chrom, read.pos, read.aend, strand):
            # At least one overlapping region: exclude the read.
            return False
        return True

    def __repr__(self):
        return 'Excluding from BED: %s%s' % (self.fname, ' nostrand'
                                             if self.nostrand else '')

    def close(self):
        """Nothing to release; present so every filter offers ``close``."""
        pass
Exemplo n.º 21
0
''')

    sys.exit(1)


if __name__ == '__main__':
    # Positional arguments, in order: BAM file, then BED file.
    bam_fname = None
    bed_fname = None
    maxdist = 100000

    # Name of a switch that is still waiting for its value, if any.
    last = None

    for arg in sys.argv[1:]:
        if last == '-max':
            maxdist = int(arg)
            last = None
        elif arg == '-max':
            last = arg
        elif not bam_fname and os.path.exists(arg):
            bam_fname = arg
        elif not bed_fname and os.path.exists(arg):
            bed_fname = arg

    if not (bam_fname and bed_fname):
        usage()

    bed = BedFile(bed_fname)
    bam = ngsutils.bam.bam_open(bam_fname)

    find_nearest(bam, bed, maxdist)
Exemplo n.º 22
0
#!/usr/bin/env python
'''
Tests for bedutils tofasta
'''

import os
import unittest

import ngsutils.bed.tofasta
import StringIO
from ngsutils.bed import BedFile

# Module-level BED input for the tests below; '|' stands in for the tab
# separator.  Regions are on test1 and test3.
bedtest = BedFile(fileobj=StringIO.StringIO('''\
test1|0|10|foo1|10|+
test1|10|20|foo2|10|-
test1|0|5|foo1|10|+
test3|0|50|foo1|10|+
'''.replace('|', '\t')))

# Path of the FASTA file 'test.fa', located next to this test module.
fasta = os.path.join(os.path.dirname(__file__), 'test.fa')


class FASTATest(unittest.TestCase):
    def testBedFASTA(self):

        valid = '''\
>test1:0-10
aaaaaaaaaa
>test1:10-20
cccccccccc
'''
Exemplo n.º 23
0
class ExcludeBED(object):
    """Read filter that rejects mapped reads overlapping a BED region.

    ``filter`` returns False for a read that should be dropped and True for
    a read that should be kept.
    """

    def __init__(self, fname, nostrand=None):
        """Open the BED file *fname*.

        Strand-agnostic matching is enabled only when *nostrand* is exactly
        the string ``'nostrand'``.
        """
        # Left over from the earlier binned implementation; nothing in this
        # class reads it any more, but the attribute is kept for callers.
        self.regions = {}
        self.fname = fname
        self.nostrand = (nostrand == 'nostrand')
        self.bed = BedFile(fname)

    def filter(self, bam, read):
        """Return False if *read* overlaps any BED region, True otherwise.

        Unmapped reads have no position to compare against the regions, so
        they are kept.
        """
        if read.is_unmapped:
            # This path used to fall off the end of the method and return
            # None (falsy) instead of an explicit verdict; the previous
            # bin-based implementation returned True here.
            return True

        if self.nostrand:
            strand = None
        elif read.is_reverse:
            strand = '-'
        else:
            strand = '+'

        chrom = bam.getrname(read.tid)
        for _ in self.bed.fetch(chrom, read.pos, read.aend, strand):
            # At least one overlapping region: exclude the read.
            return False
        return True

    def __repr__(self):
        return 'Excluding from BED: %s%s' % (self.fname, ' nostrand' if self.nostrand else '')

    def close(self):
        """Nothing to release; present so every filter offers ``close``."""
        pass
Exemplo n.º 24
0
    def testBedFile(self):
        """Read test.bed from disk and compare every parsed region."""
        fname = os.path.join(os.path.dirname(__file__), 'test.bed')
        expected = [
            'chr1|100|150|foo|1|+',
            'chr1|100|150|foo|1|-',
            'chr1|200|250|foo|1|+',
            'chr1|300|350|foo|1|-',
        ]

        found = []
        for region in BedFile(fname):
            found.append('%s|%s|%s|%s|%s|%s' % (region.chrom, region.start,
                                                region.end, region.name,
                                                region.score_int,
                                                region.strand))
        self.assertTrue(_matches(expected, found))
Exemplo n.º 25
0
def bed_stats(infile, gtf_file=None, out=sys.stdout, quiet=False, names=False):
    """Build a ``BedStats`` report for the BED file *infile* and write it to *out*.

    *gtf_file* and *names* are passed through to ``BedStats``.  Unless
    *quiet* is set, a one-line notice is written to stderr first.
    """
    if not quiet:
        sys.stderr.write('Calculating BED region stats...\n')

    bed = BedFile(infile)
    BedStats(bed, gtf_file, names=names).write(out)
Exemplo n.º 26
0
    def testBedFileObj(self):
        valid = ['chr1|100|150|foo|1|+',
                   'chr1|100|150|foo|1|-',
                ]

        instr = StringIO.StringIO('''
chr1|100|150|foo|1|+
chr1|100|150|foo|1|-
'''.replace('|', '\t'))

        regions = ['%s|%s|%s|%s|%s|%s' % (x.chrom, x.start, x.end, x.name, x.score, x.strand) for x in BedFile(fileobj=instr)]
        self.assertTrue(_matches(valid, regions))
Exemplo n.º 27
0
    def testBedRegion(self):
        """A ``region=`` string must yield the single expected region."""
        expected = ['chr1|100|150']

        found = []
        for region in BedFile(region="chr1:101-150"):
            found.append('%s|%s|%s' % (region.chrom, region.start, region.end))
        self.assertTrue(_matches(expected, found))
Exemplo n.º 28
0
Converts the "score" field to be an integer

"""
    sys.exit(1)


def bed_clean(bed, out=sys.stdout):
    """Write every region of *bed* to *out* with an integer score capped at 1000.

    Each region's ``score`` is replaced in place by ``int(score)``, limited
    to a maximum of 1000, and the region is then written on its own line
    using its string form.
    """
    for region in bed:
        score = int(region.score)
        region.score = 1000 if score > 1000 else score

        out.write('%s\n' % region)

if __name__ == '__main__':
    # The only positional argument: an existing BED file or '-'.
    fname = None

    for arg in sys.argv[1:]:
        if arg == '-h':
            usage()
        if fname or not (arg == '-' or os.path.exists(arg)):
            # Either a second positional argument or something unrecognised.
            print("Unknown option: %s" % arg)
            usage()
        else:
            fname = arg

    if not fname:
        usage()

    bed_clean(BedFile(fname))