Exemplo n.º 1
0
def readBedIntervals(bedPath, ncol = 3,
                     chrom = None, start = None, end = None,
                     sort = False, ignoreBed12 = True):
    """ Load the intervals of a bed file into a list of tuples.

    Every tuple is (chrom, start, end), followed by the name when
    ncol >= 4 and by the score when ncol >= 5.  If chrom is given then
    start and end must be given too, and only the result of intersecting
    the file with that range is returned.

    The file is sorted only when sort is True, and bed6() is applied to it
    only when ignoreBed12 is False.

    Raises RuntimeError when bedPath is not an existing file."""

    if not os.path.isfile(bedPath):
        raise RuntimeError("Bed interval file %s not found" % bedPath)
    assert ncol in (3, 4, 5)
    logger.debug("readBedIntervals(%s)" % bedPath)

    source = BedTool(bedPath)
    if sort is True:
        source = source.sort()
        logger.debug("sortBed(%s)" % bedPath)
    if ignoreBed12 is False:
        source = source.bed6()
        logger.debug("bed6(%s)" % bedPath)

    # without a chromosome the whole (possibly sorted / bed6'd) file is used
    features = source
    if chrom is not None:
        assert start is not None and end is not None
        region = Interval(chrom, start, end)
        logger.debug("intersecting (%s,%d,%d) and %s" % (chrom, start, end,
                                                          bedPath))
        # intersect() (which creates a temp file) is used in place of
        # all_hits(), because all_hits seemed to leak a ton of memory
        # for big files
        regionTool = BedTool(str(region), from_string = True)
        features = source.intersect(regionTool)
        regionTool.delete_temporary_history(ask=False)

    def asRecord(feat):
        # name / score are only touched when the caller asked for them
        record = [feat.chrom, feat.start, feat.end]
        if ncol >= 4:
            record.append(feat.name)
        if ncol >= 5:
            record.append(feat.score)
        return tuple(record)

    logger.debug("appending bed intervals")
    records = [asRecord(feat) for feat in features]
    logger.debug("finished readBedIntervals(%s)" % bedPath)

    return records
Exemplo n.º 2
0
def getMergedBedIntervals(bedPath, ncol=3, sort = False, ignoreBed12 = True):
    """ Return the intervals of a bed file after running merge() on it.

    Every returned tuple is (chrom, start, end), followed by the name when
    ncol >= 4 and by the score when ncol >= 5.  The file is sorted first
    only when sort is True, and bed6() is applied only when ignoreBed12 is
    False.

    Raises RuntimeError when bedPath is not an existing file."""

    if not os.path.isfile(bedPath):
        raise RuntimeError("Bed interval file %s not found" % bedPath)
    logger.debug("mergeBedIntervals(%s)" % bedPath)

    tool = BedTool(bedPath)
    if sort is True:
        tool = tool.sort()
        logger.debug("sortBed(%s)" % bedPath)
    if ignoreBed12 is False:
        logger.debug("bed6(%s)" % bedPath)
        tool = tool.bed6()

    merged = []
    for feat in tool.merge():
        # start from the three mandatory columns and extend on request
        fields = [feat.chrom, feat.start, feat.end]
        if ncol >= 4:
            fields.append(feat.name)
        if ncol >= 5:
            fields.append(feat.score)
        merged.append(tuple(fields))
    logger.debug("finished mergeBedIntervals(%s)" % bedPath)

    return merged
Exemplo n.º 3
0
def main(argv=None):
    """Filter overlapping intervals out of a bed file, writing to stdout.

    The input is sorted and scanned in order.  Whenever an interval starts
    before the end of the previously written interval on the same
    chromosome, its start is moved up to that end; an interval left empty
    by this is dropped.

    With --bed12, bed6() is applied to the sorted input first.  With --rm,
    the input is first split into TE / non-TE intervals (as labeled by the
    rm2State.sh script), everything in non-TE that overlaps TE is removed,
    and the remainder is added back to TE, so that TE intervals are never
    cut by non-TE ones.

    argv: full argument vector, program name included (defaults to
          sys.argv).
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Filter overlapping intervals out")
    parser.add_argument("inputBed", help="Bed file to filter")
    parser.add_argument("--bed12", help="Use bed12 exons instead of start/end"
                        " if present (equivalent to running bed12ToBed6 on"
                        " input first).", action="store_true", default=False)
    parser.add_argument("--rm", help="Make sure intervals that are labeled as TE "
                        "by rm2State.sh script are never cut by ones that are not",
                        default=False, action='store_true')
    
    addLoggingOptions(parser)
    # parse the argv we were given (minus the program name) rather than
    # silently falling back to sys.argv
    args = parser.parse_args(argv[1:])
    setLoggingFromOptions(args)
    assert os.path.isfile(args.inputBed)
    tempBedToolPath = initBedTool()

    inputPath = args.inputBed
    try:
        # do the --rm filter.  by splitting into TE / non-TE
        # then removing everything in non-TE that overlaps
        # TE.  Then adding the remainder back to TE.
        if args.rm is True:
            # tempPath is scratch space reused for both label sets
            tempPath = getLocalTempPath("Temp_", ".bed")
            tePath = getLocalTempPath("Temp_te_", ".bed")
            runShellCommand("rm2State.sh %s |grep TE | sortBed > %s" % (
                args.inputBed, tempPath))
            runShellCommand("intersectBed -a %s -b %s | sortBed > %s" %(
                args.inputBed, tempPath, tePath))
            otherPath = getLocalTempPath("Temp_other_", ".bed")
            runShellCommand("rm2State.sh %s |grep -v TE | sortBed > %s" % (
                args.inputBed, tempPath))
            runShellCommand("intersectBed -a %s -b %s | sortBed > %s" %(
                args.inputBed, tempPath, otherPath))
            # only rewrite the input when both sets are non-empty;
            # otherwise the original file is used unchanged
            if os.path.getsize(tePath) > 0  and\
               os.path.getsize(otherPath) > 0:
                filterPath = getLocalTempPath("Temp_filter_", ".bed")
                runShellCommand("subtractBed -a %s -b %s | sortBed > %s" % (
                    otherPath, tePath, filterPath))
                inputPath = getLocalTempPath("Temp_input_", ".bed")
                runShellCommand("cat %s %s | sortBed > %s" % (
                    tePath, filterPath, inputPath))
                runShellCommand("rm -f %s" % filterPath)
            runShellCommand("rm -f %s %s %s" % (tePath, otherPath, tempPath))

        bedIntervals = BedTool(inputPath).sort()
        if args.bed12 is True:
            bedIntervals = bedIntervals.bed6()

        prevInterval = None

        # this code has been way too buggy for something so simple
        # keep extra list to check for sure even though it's a waste of
        # time and space
        sanity = []

        for interval in bedIntervals:
            # clip the start of an interval that runs into the previously
            # written interval on the same chromosome
            if (prevInterval is not None and
                interval.chrom == prevInterval.chrom and
                interval.start < prevInterval.end):
                logger.debug("Replace %d bases of \n%s with\n%s" % (
                    prevInterval.end - interval.start,
                    str(interval), str(prevInterval)))
                interval.start = prevInterval.end

            # intervals that became empty after clipping are not written and
            # do not become the new previous interval
            if interval.end > interval.start:
                sys.stdout.write("%s" % str(interval))
                sanity.append(interval)
                prevInterval = interval

        # adjacent written intervals on one chromosome must not overlap
        for earlier, later in zip(sanity, sanity[1:]):
            if earlier.chrom == later.chrom:
                assert later.start >= earlier.end
    finally:
        # run the cleanup even when something above failed, so temporary
        # files are not left behind
        cleanBedTool(tempBedToolPath)
        if args.inputBed != inputPath:
            runShellCommand("rm -f %s" % inputPath)