예제 #1
0
def writeScaledTrack(trackData, track, args):
    """ Go base-by-base, writing the unscaled value to the output.

    Writes one BED line per base (chrom, pos, pos + 1, unscaled value) to
    <outputDir>/<trackFileName>_scale.bed, then attempts to convert that
    file to a BigWig copy with the external bedGraphToBigWig tool.
    """
    fname, fext = os.path.splitext(os.path.basename(track.getPath()))
    outBed = os.path.join(args.outputDir, fname + "_scale" + ".bed")
    outBigWig = os.path.join(args.outputDir, fname + "_scale" + ".bw")

    trackNo = track.getNumber()
    valMap = track.getValueMap()

    # try/finally guarantees the output handle is closed even if a table
    # lookup raises part-way through
    outFile = open(outBed, "w")
    try:
        for trackTable in trackData.getTrackTableList():
            chrom = trackTable.getChrom()
            start = trackTable.getStart()
            for i in xrange(len(trackTable)):
                binnedVal = trackTable[i][trackNo]
                # map the binned (scaled) value back to the original value
                unbinnedVal = valMap.getMapBack(binnedVal)
                outFile.write("%s\t%d\t%d\t%f\n" %
                              (chrom, start + i, start + i + 1, unbinnedVal))
    finally:
        outFile.close()

    # make a .bw copy; failure here is non-fatal (the BED output remains)
    try:
        runShellCommand("bedGraphToBigWig %s %s %s" %
                        (outBed, args.chromSizes, outBigWig))
    except Exception:
        # narrowed from a bare except: which also swallowed SystemExit /
        # KeyboardInterrupt; also fixed message typo "big bigwig"
        logger.warning("Unable to make bigwig from %s" % outBed)
예제 #2
0
def writeScaledTrack(trackData, track, args):
    """ Go base-by-base, writing the unscaled value to the output.

    Emits one BED line per base to <outputDir>/<name>_scale.bed and then
    tries to build a BigWig copy via the external bedGraphToBigWig tool.
    """
    fname, fext = os.path.splitext(os.path.basename(track.getPath()))
    outBed = os.path.join(args.outputDir, fname + "_scale" + ".bed")
    outBigWig = os.path.join(args.outputDir, fname + "_scale" + ".bw")

    trackNo = track.getNumber()
    valMap = track.getValueMap()

    outFile = open(outBed, "w")
    try:
        for trackTable in trackData.getTrackTableList():
            chrom = trackTable.getChrom()
            start = trackTable.getStart()
            for i in xrange(len(trackTable)):
                binnedVal = trackTable[i][trackNo]
                # invert the binning to recover the original value
                unbinnedVal = valMap.getMapBack(binnedVal)
                outFile.write("%s\t%d\t%d\t%f\n" % (
                    chrom,
                    start + i,
                    start + i + 1,
                    unbinnedVal))
    finally:
        # ensure the handle is closed even on error
        outFile.close()

    # make a .bw copy; treat failure as a warning only
    try:
        runShellCommand("bedGraphToBigWig %s %s %s" % (outBed, args.chromSizes,
                        outBigWig))
    except Exception:
        # was a bare except: (caught everything); message typo fixed too
        logger.warning("Unable to make bigwig from %s" % outBed)
예제 #3
0
def main(argv=None):
    """ Command-line entry point: annotate each interval of an input BED
    with the (mode or binned-average) value of a single annotation track
    over that interval, writing the annotated intervals to an output BED. """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Set the score column of each bed interval in input to "
        "(MODE, BINNED) average value of the intersection region in another track). "
        "Can be used, for instance, to assign a copy number of each RepeatModeler "
        "prediction...")

    parser.add_argument("tracksInfo",
                        help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("inBed", help="BED file to annotate")
    parser.add_argument("track", help="Track to use for annotation")
    parser.add_argument("outBed", help="Path for output, annotated BED file")
    parser.add_argument("--name",
                        help="Set ID field (column 4 instead of 5)",
                        action="store_true",
                        default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # read the tracks list
    trackList = TrackList(args.tracksInfo)
    track = trackList.getTrackByName(args.track)
    if track is None:
        raise RuntimeError("Can't find track %s" % args.track)
    # make temporary tracks list with just our track so we can keep using
    # tracks list interface but not read unecessary crap.
    singleListPath = getLocalTempPath("Temp_secScore", ".bed")
    trackList.trackList = [track]
    trackList.saveXML(singleListPath)

    obFile = open(args.outBed, "w")

    # trackData interface not so great at cherry picking intervals.
    # need to merge them up and use segmentation interface
    filledIntervals, mergedIntervals = fillGaps(args.inBed)

    # read track into trackData
    trackData = TrackData()
    logger.info("loading track %s" % singleListPath)
    trackData.loadTrackData(singleListPath,
                            mergedIntervals,
                            segmentIntervals=filledIntervals,
                            applyMasking=False)

    # finally, write the annotation
    writeAnnotatedIntervals(trackData, filledIntervals, mergedIntervals,
                            obFile, args)

    # remove the temporary single-track list and release bedtool scratch
    runShellCommand("rm -f %s" % singleListPath)
    obFile.close()
    cleanBedTool(tempBedToolPath)
예제 #4
0
def _checkSortedNonOverlapping(intervals, label, errorMessage):
    """ Raise RuntimeError (using errorMessage's xxx placeholder) if the
    given interval list self-overlaps or is out of order.  label names the
    input ("input1"/"input2") in the error text. """
    for i in xrange(1, len(intervals)):
        if intersectSize(intervals[i - 1], intervals[i]) != 0:
            raise RuntimeError(
                errorMessage.replace(
                    "xxx", "Overlapping intervals %s and %s found in %s" %
                    (intervals[i - 1], intervals[i], label)))
        if intervals[i - 1] > intervals[i]:
            raise RuntimeError(
                errorMessage.replace(
                    "xxx", "Out of order intervals %s and %s found in %s" %
                    (intervals[i - 1], intervals[i], label)))

def checkExactOverlap(bed1, bed2):
    """ make sure two bed files cover same region exactly: a requirement for all
    code based on the comparisons in this module.

    Raises RuntimeError when either file is empty, self-overlapping,
    unsorted, or covers bases the other does not.
    """

    errorMessage = (
        "Bed files %s and %s cannot be compared. xxx. "
        " Input files must be both sorted, cover the exact same region,"
        " and contain no self-overlaps.") % (bed1, bed2)

    # empty file may break downstream comparisons
    size1 = os.path.getsize(bed1)
    size2 = os.path.getsize(bed2)
    if size1 == 0 or size2 == 0:
        raise RuntimeError(
            errorMessage.replace("xxx", "one or both inputs empty"))

    # test self-overlap and sorting of each input
    _checkSortedNonOverlapping(
        readBedIntervals(bed1, sort=False), "input1", errorMessage)
    # BUGFIX: the original re-read bed1 here, so bed2 was never validated
    _checkSortedNonOverlapping(
        readBedIntervals(bed2, sort=False), "input2", errorMessage)

    # test intersection size: the symmetric difference must be empty
    tempFile = getLocalTempPath("Temp_test", ".bed")
    runShellCommand("subtractBed -a %s -b %s > %s" % (bed1, bed2, tempFile))
    if os.path.getsize(tempFile) != 0:
        runShellCommand("rm -f %s" % tempFile)
        raise RuntimeError(
            errorMessage.replace("xxx",
                                 "Input1 covers regions outside input2"))
    runShellCommand("subtractBed -a %s -b %s > %s" % (bed2, bed1, tempFile))
    if os.path.getsize(tempFile) != 0:
        runShellCommand("rm -f %s" % tempFile)
        raise RuntimeError(
            errorMessage.replace("xxx",
                                 "Input2 covers regions outside input1"))
    runShellCommand("rm -f %s" % tempFile)
예제 #5
0
def main(argv=None):
    """ Command-line entry point: prepend a UCSC track header line onto a
    BED file, rewriting the file in place (via a temp file + mv). """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Prepend track header onto bed file")

    parser.add_argument("inputBed", help="Path of bed file to add header to")
    parser.add_argument("name", help="Name of track")
    parser.add_argument("description", help="Track description")
    parser.add_argument("--useScore",
                        help="Use score",
                        action="store_true",
                        default=False)
    parser.add_argument("--rgb",
                        help="Enable rgb colours.  These must be "
                        "present in the bed file data (can be added using"
                        "addBedColours.py",
                        action="store_true",
                        default=False)

    args = parser.parse_args()

    # hack together a temporary file path in same directory as input
    S = string.ascii_uppercase + string.digits
    tag = ''.join(random.choice(S) for x in range(5))
    tempPath = os.path.splitext(os.path.basename(args.inputBed))[0] \
                   + "_temp%s.bed" % tag

    score = 1 if args.useScore else 0
    rgb = "\titemRgb=\"On\"" if args.rgb else ""

    # with-blocks guarantee both handles are closed even if a write fails
    with open(tempPath, "w") as tempFile:
        # put the header in the file
        tempFile.write("track\tname=\"%s\"\tdescription=\"%s\"\tuseScore=%d%s\n" %
                       (args.name, args.description, score, rgb))

        # copy the bed file to the temp file, skipping the first existing
        # track header line if one is found
        with open(args.inputBed, "r") as bedFile:
            skippedTrack = False
            for line in bedFile:
                if not skippedTrack and line.startswith("track\tname="):
                    skippedTrack = True
                else:
                    tempFile.write(line)

    # move the tempfile back onto the original bed file
    runShellCommand("mv %s %s" % (tempPath, args.inputBed))
def main(argv=None):
    """ Command-line entry point: set the score (or, with --name, the id)
    column of each interval in inBed from the value of a single annotation
    track over that interval, writing the result to outBed. """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Set the score column of each bed interval in input to "
        "(MODE, BINNED) average value of the intersection region in another track). "
        "Can be used, for instance, to assign a copy number of each RepeatModeler "
        "prediction...")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("inBed", help="BED file to annotate")
    parser.add_argument("track", help="Track to use for annotation")
    parser.add_argument("outBed", help="Path for output, annotated BED file")
    parser.add_argument("--name", help="Set ID field (column 4 instead of 5)",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # read the tracks list
    trackList = TrackList(args.tracksInfo)
    track = trackList.getTrackByName(args.track)
    if track is None:
        raise RuntimeError("Can't find track %s" % args.track)
    # make temporary tracks list with just our track so we can keep using
    # tracks list interface but not read unecessary crap.
    singleListPath = getLocalTempPath("Temp_secScore", ".bed")
    trackList.trackList = [track]
    trackList.saveXML(singleListPath)

    obFile = open(args.outBed, "w")

    # trackData interface not so great at cherry picking intervals.
    # need to merge them up and use segmentation interface
    filledIntervals, mergedIntervals = fillGaps(args.inBed)

    # read track into trackData
    trackData = TrackData()
    logger.info("loading track %s" % singleListPath)
    trackData.loadTrackData(singleListPath, mergedIntervals,
                            segmentIntervals=filledIntervals,
                            applyMasking=False)

    # finally, write the annotation
    writeAnnotatedIntervals(trackData, filledIntervals, mergedIntervals, obFile,
                             args)

    # remove the temporary single-track list and close everything down
    runShellCommand("rm -f %s" % singleListPath)
    obFile.close()
    cleanBedTool(tempBedToolPath)
예제 #7
0
def main(argv=None):
    """ Wrapper around compareBedStates.py: compare every test bed against
    every truth bed (in parallel) and collect the results into a table.

    NOTE(review): argv is accepted but never consulted here; parse_args()
    always reads sys.argv.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="another wrapper for compareBedStates.py that will compare many files"
        " and make a decent table output")

    parser.add_argument("tracksList", help="XML tracks list")
    parser.add_argument("truthBeds", help="comma-separated references to benchmark against (ex repet)")
    parser.add_argument("testBeds", help="comma-spearated test beds")
    parser.add_argument("workDir", help="folder to write comparision outputs")
    parser.add_argument("outCSV", help="path for output")
    parser.add_argument("--state", help="state name", default="TE")
    parser.add_argument("--delMask", help="see help for compareBedStates.py", default=None, type=int)
    parser.add_argument("--proc", help="number of prcesses", default=1, type=int)
    parser.add_argument("--truthNames", help="comma-separated list of truth names", default =None)
    parser.add_argument("--testNames", help="comma-separated list of test names", default =None)

    args = parser.parse_args()

    truths = args.truthBeds.split(",")
    tests = args.testBeds.split(",")

    # display names default to the input file basenames
    if args.truthNames is not None:
        truthNames = args.truthNames.split(",")
    else:
        truthNames = [os.path.splitext(os.path.basename(x))[0] for x in truths]
    if args.testNames is not None:
        testNames = args.testNames.split(",")
    else:
        testNames = [os.path.splitext(os.path.basename(x))[0] for x in tests]

    if not os.path.isdir(args.workDir):
        runShellCommand("mkdir %s" % args.workDir)

    assert len(tests) == len(testNames)
    assert len(truths) == len(truthNames)

    # build one compareBedStates.py command per (test, truth) pair
    compCmds = []
    for i in xrange(len(tests)):
        for j in xrange(len(truths)):
            opath = os.path.join(args.workDir, "%s_vs_%s.txt" % (testNames[i], truthNames[j]))
            flags = "--tl %s" % args.tracksList
            if args.delMask is not None:
                flags += " --delMask %d" % args.delMask
            cmd = "compareBedStates.py %s %s %s > %s" % (truths[j], tests[i], flags, opath)
            compCmds.append(cmd)

    runParallelShellCommands(compCmds, args.proc)

    # munging ############
    # NOTE: Python 2 tuple-parameter unpacking in the signature below
    def prettyAcc((prec, rec), spec):
        """ format (precision, recall, f1, specificity) as 4-decimal strings """
        f1 = 0.
        if prec + rec > 0:
            f1 = (2. * prec * rec) / (prec + rec)
        return ("%.4f" % prec, "%.4f" % rec, "%.4f" % f1, "%.4f" % spec)
예제 #8
0
def makeRow(bed, header, args):
    """ Build one tab-separated table row (id + comparison + probability
    columns) for the given bed file. """
    baseName = os.path.basename(bed)
    truthBed = getTrueBed(bed)
    fixedBed = os.path.join(args.workdir, baseName + ".fix")
    comparisonPath = os.path.join(args.workdir, baseName + ".comp")

    # map the predicted state names onto the truth's, then compare
    runShellCommand("fitStateNames.py %s %s %s" % (truthBed, bed, fixedBed))
    runShellCommand("compareBedStates.py %s %s > %s" % (truthBed, fixedBed,
                                                        comparisonPath))

    return (extractIdRow(bed, header) +
            extractCompRow(comparisonPath, header, args) +
            extractProbRow(bed, header, args))
예제 #9
0
def main(argv=None):
    """ Command-line entry point: prepend a UCSC track header line onto a
    BED file, in place (written via a temp file then moved back). """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Prepend track header onto bed file")

    parser.add_argument("inputBed", help="Path of bed file to add header to")
    parser.add_argument("name", help="Name of track")
    parser.add_argument("description", help="Track description")
    parser.add_argument("--useScore", help="Use score", action="store_true",
                        default=False)
    parser.add_argument("--rgb", help="Enable rgb colours.  These must be "
                        "present in the bed file data (can be added using"
                        "addBedColours.py", action="store_true",
                        default=False)

    args = parser.parse_args()

    # hack together a temporary file path in same directory as input
    S = string.ascii_uppercase + string.digits
    tag = ''.join(random.choice(S) for x in range(5))
    tempPath = os.path.splitext(os.path.basename(args.inputBed))[0] \
                   + "_temp%s.bed" % tag

    score = 1 if args.useScore else 0
    rgb = "\titemRgb=\"On\"" if args.rgb else ""

    # context managers ensure both handles are closed even on error
    with open(tempPath, "w") as tempFile:
        # put the header in the file
        tempFile.write("track\tname=\"%s\"\tdescription=\"%s\"\tuseScore=%d%s\n" % (
            args.name, args.description, score, rgb))

        # copy the bed data over, skipping the first track header if found
        with open(args.inputBed, "r") as bedFile:
            skippedTrack = False
            for line in bedFile:
                if not skippedTrack and line.startswith("track\tname="):
                    skippedTrack = True
                else:
                    tempFile.write(line)

    # move the tempfile back onto the original bed file
    runShellCommand("mv %s %s" % (tempPath, args.inputBed))
예제 #10
0
def makeRow(bed, header, args):
    """ build the tab-separated table row for one bed file """
    truth = getTrueBed(bed)
    base = os.path.basename(bed)
    fitted = os.path.join(args.workdir, base + ".fix")
    comparison = os.path.join(args.workdir, base + ".comp")

    # rename predicted states to match the truth, then run the comparison
    runShellCommand("fitStateNames.py %s %s %s" % (truth, bed, fitted))
    runShellCommand("compareBedStates.py %s %s > %s" %
                    (truth, fitted, comparison))

    row = extractIdRow(bed, header)
    row = row + extractCompRow(comparison, header, args)
    row = row + extractProbRow(bed, header, args)
    return row
예제 #11
0
def checkExactOverlap(bed1, bed2):
    """ make sure two bed files cover same region exactly: a requirement for all
    code based on the comparisons in this module.

    Raises RuntimeError when either file is empty, self-overlapping,
    unsorted, or covers bases the other does not.
    """

    errorMessage = ("Bed files %s and %s cannot be compared. xxx. "
    " Input files must be both sorted, cover the exact same region,"
    " and contain no self-overlaps.") % (bed1, bed2)

    # empty file may break downstream comparisons
    size1 = os.path.getsize(bed1)
    size2 = os.path.getsize(bed2)
    if size1 == 0 or size2 == 0:
        raise RuntimeError(errorMessage.replace("xxx", "one or both inputs empty"))

    # test self-overlap and sorting of input1
    intervals1 = readBedIntervals(bed1, sort=False)
    for i in xrange(1, len(intervals1)):
        if intersectSize(intervals1[i-1], intervals1[i]) != 0:
            raise RuntimeError(errorMessage.replace(
                "xxx", "Overlapping intervals %s and %s found in input1" % (
                    intervals1[i-1], intervals1[i])))
        if intervals1[i-1] > intervals1[i]:
            raise RuntimeError(errorMessage.replace(
                "xxx", "Out of order intervals %s and %s found in input1" % (
                    intervals1[i-1], intervals1[i])))

    # test self-overlap and sorting of input2
    # BUGFIX: this used to read bed1 again, so bed2 was never validated
    intervals2 = readBedIntervals(bed2, sort=False)
    for i in xrange(1, len(intervals2)):
        if intersectSize(intervals2[i-1], intervals2[i]) != 0:
            raise RuntimeError(errorMessage.replace(
                "xxx", "Overlapping intervals %s and %s found in input2" % (
                    intervals2[i-1], intervals2[i])))
        if intervals2[i-1] > intervals2[i]:
            raise RuntimeError(errorMessage.replace(
                "xxx", "Out of order intervals %s and %s found in input2" % (
                    intervals2[i-1], intervals2[i])))

    # test intersection size: the symmetric difference must be empty
    tempFile = getLocalTempPath("Temp_test", ".bed")
    runShellCommand("subtractBed -a %s -b %s > %s" % (bed1, bed2, tempFile))
    if os.path.getsize(tempFile) != 0:
        runShellCommand("rm -f %s" % tempFile)
        raise RuntimeError(errorMessage.replace(
            "xxx", "Input1 covers regions outside input2"))
    runShellCommand("subtractBed -a %s -b %s > %s" % (bed2, bed1, tempFile))
    if os.path.getsize(tempFile) != 0:
        runShellCommand("rm -f %s" % tempFile)
        raise RuntimeError(errorMessage.replace(
            "xxx", "Input2 covers regions outside input1"))
    runShellCommand("rm -f %s" % tempFile)
예제 #12
0
def cutBedRegion(bedInterval, cutTrackPath, inBed, outBed):
    """ Intersect inBed with the given (chrom, start, end) interval, then
    subtract the intervals in cutTrackPath, writing the sorted result to
    outBed. """
    tempPath = getLocalTempPath("Temp_cut", ".bed")
    tempPath2 = getLocalTempPath("Temp_cut", ".bed")
    runShellCommand("rm -f %s" % outBed)
    # write the query interval with Python I/O instead of shell echo: the
    # original echoed a string containing an embedded newline, leaving a
    # stray blank line in the BED file (and relied on shell quoting)
    regionFile = open(tempPath2, "w")
    try:
        regionFile.write("%s\t%s\t%s\n" %
                         (bedInterval[0], bedInterval[1], bedInterval[2]))
    finally:
        regionFile.close()
    runShellCommand("intersectBed -a %s -b %s | sortBed > %s" %
                    (inBed, tempPath2, tempPath))
    runShellCommand("subtractBed -a %s -b %s | sortBed > %s" %
                    (tempPath, cutTrackPath, outBed))
    runShellCommand("rm -f %s %s" % (tempPath, tempPath2))
예제 #13
0
def filterEmptyRegions(genomePath, regions, outDir, cutTrackPath):
    """ Do a trial cut on each region; return the list of regions that
    aren't empty after the cut.

    outDir is unused but kept for interface compatibility with callers.
    """
    filteredRegions = []
    for region in regions:
        # NOTE(review): dropped unused regionName = getRegionName(region, i)
        # (assumed to be a side-effect-free getter)
        tempPath1 = getLocalTempPath("Temp", ".bed")
        cutBedRegion(region, cutTrackPath, genomePath, tempPath1)
        intervals = bedRead(tempPath1)
        runShellCommand("rm -f %s" % tempPath1)
        # keep only regions with at least one interval surviving the cut
        if len(intervals) > 0:
            filteredRegions.append(region)
    return filteredRegions
예제 #14
0
def filterEmptyRegions(genomePath, regions, outDir, cutTrackPath):
    """ Do a trial cut on each region; return only the regions that still
    contain intervals after the cut.

    outDir is accepted but unused (kept so existing callers still work).
    """
    filteredRegions = []
    for region in regions:
        # NOTE(review): removed the unused regionName/getRegionName local
        # (assumed side-effect free)
        tempPath1 = getLocalTempPath("Temp", ".bed")
        cutBedRegion(region, cutTrackPath, genomePath, tempPath1)
        intervals = bedRead(tempPath1)
        runShellCommand("rm -f %s" % tempPath1)
        if len(intervals) > 0:
            filteredRegions.append(region)
    return filteredRegions
예제 #15
0
def runVennMaker(args0):
    """ Convert each input bed to per-base intervals and run venn_maker
    on the converted files, then remove the temporaries. """
    # BUGFIX: the body referenced `args` while the parameter is `args0`,
    # silently depending on a module-level global; bind them explicitly
    args = args0
    # venn_maker seems designed to run on intervals (and looks pretty broken doing this).
    # try converting to base intervals.
    todie = []
    for i, f in enumerate(args.inputFiles):
        tempFile = getLocalTempPath("Temp_%d" % i, ".bed")
        todie.append(tempFile)
        baserize(f, tempFile)
        args.inputFiles[i] = tempFile

    venn_maker(args.inputFiles, args.names, args.outTiff, "venn.R",
               additional_args=None, run=True)

    # clean up the temporary per-base files
    for f in todie:
        runShellCommand("rm -f %s" % f)
예제 #16
0
def runScaling(args, tempTracksInfo):
    """ Run setTrackScaling on the temporary track list, or simply copy
    it through unchanged when scaling is disabled. """
    tracksArg = args.scaleTracks if args.scaleTracks is not None else ""
    skipArg = args.skipScale if args.skipScale is not None else ""

    if args.noScale is False:
        cmd = ("setTrackScaling.py %s %s %s --numBins %d --logLevel %s %s %s"
               % (tempTracksInfo, args.allBed, args.outTracksInfo,
                  args.numBins, getLogLevelString(), tracksArg, skipArg))
    else:
        cmd = "cp %s %s" % (tempTracksInfo, args.outTracksInfo)
    runShellCommand(cmd)
예제 #17
0
def cutBedRegion(bedInterval, cutTrackPath, inBed, outBed):
    """ Intersect inBed with the (chrom, start, end) interval, subtract
    the cut track, and write the sorted result to outBed. """
    tempPath = getLocalTempPath("Temp_cut", ".bed")
    tempPath2 = getLocalTempPath("Temp_cut", ".bed")
    runShellCommand("rm -f %s" % outBed)
    # write the interval directly rather than via shell echo; the echo'd
    # string contained an embedded newline, producing a stray blank line
    # in the BED file and depending on fragile shell quoting
    regionFile = open(tempPath2, "w")
    try:
        regionFile.write("%s\t%s\t%s\n" % (bedInterval[0],
                                           bedInterval[1],
                                           bedInterval[2]))
    finally:
        regionFile.close()
    runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (inBed,
                                                                 tempPath2,
                                                                 tempPath))
    runShellCommand("subtractBed -a %s -b %s | sortBed > %s" % (tempPath,
                                                                cutTrackPath,
                                                                outBed))
    runShellCommand("rm -f %s %s" % (tempPath, tempPath2))
예제 #18
0
def runScaling(args, tempTracksInfo):
    """ Run setTrackScaling on the temp track list; when --noScale is set,
    just copy the list through to the output path. """
    tracksArg = "" if args.scaleTracks is None else args.scaleTracks
    skipArg = "" if args.skipScale is None else args.skipScale

    if args.noScale is False:
        cmd = "setTrackScaling.py %s %s %s --numBins %d --logLevel %s %s %s" \
              % (tempTracksInfo, args.allBed, args.outTracksInfo,
                 args.numBins, getLogLevelString(), tracksArg, skipArg)
    else:
        cmd = "cp %s %s" % (tempTracksInfo, args.outTracksInfo)
    runShellCommand(cmd)
예제 #19
0
def runVennMaker(args0):
    """ Convert each input bed to per-base intervals, run venn_maker on
    the converted files, then delete the temporaries. """
    # BUGFIX: bind the parameter to the `args` name the body uses; the
    # original relied on a module-level `args` global instead of args0
    args = args0
    # venn_maker seems designed to run on intervals (and looks pretty broken doing this).
    # try converting to base intervals.
    todie = []
    for i, f in enumerate(args.inputFiles):
        tempFile = getLocalTempPath("Temp_%d" % i, ".bed")
        todie.append(tempFile)
        baserize(f, tempFile)
        args.inputFiles[i] = tempFile

    venn_maker(args.inputFiles,
               args.names,
               args.outTiff,
               "venn.R",
               additional_args=None,
               run=True)

    # clean up the temporary per-base files
    for f in todie:
        runShellCommand("rm -f %s" % f)
예제 #20
0
def extractNaive(tracksPath, benchDir, benchInputBedPath, args):
    """ use naiveTrackCombine.py to get a score instead of teHmmBenchmark.py """
    baseName = os.path.splitext(os.path.basename(benchInputBedPath))[0]
    naiveEvalPath = os.path.join(benchDir, baseName + "_naiveEval.bed")
    naiveFitPath = os.path.join(benchDir, baseName + "_naiveEval_Fit.bed")
    naiveCompPath = os.path.join(benchDir, baseName + "_naive_comp.txt")

    # predict, fit the state names to the truth, then compare to the truth
    runShellCommand("naiveTrackCombine.py %s %s %s" %
                    (tracksPath, args.truth, naiveEvalPath))
    runShellCommand("fitStateNames.py %s %s %s" %
                    (args.truth, naiveEvalPath, naiveFitPath))
    runShellCommand("compareBedStates.py %s %s > %s" %
                    (args.truth, naiveFitPath, naiveCompPath))

    score = extractScore(benchDir,
                         naiveCompPath.replace("_naive_comp.txt",
                                               "_naive.bed"),
                         args)
    return score
예제 #21
0
def runParallel(args, bedIntervals):
    """ Quick hack to rerun parallel jobs on different interval subsets. """
    # optional filter: only process intervals whose name (column 4) is in
    # the comma-separated args.names list
    nameSet = None
    if args.names is not None:
        nameSet = set(args.names.split(","))

    # chunk up BED input
    numIntervals = 0
    for interval in bedIntervals:
        name = None
        if len(interval) > 3:
            name = interval[3]
        if nameSet is None or name in nameSet:
            numIntervals += 1
    # Python 2 integer division: roughly numIntervals/numProc per chunk
    jobSize = 1 + (numIntervals / args.numProc)
    logger.info("Dviding %d intervals into %d processes (%d intervals per)" %
                (numIntervals, args.numProc, jobSize))
    tempBeds = []
    # start "full" so the first accepted interval opens a new chunk file
    curSize = sys.maxint
    curFile = None
    for interval in bedIntervals:
        name = None
        if len(interval) > 3:
            name = interval[3]
        if nameSet is None or name in nameSet:
            if curSize >= jobSize:
                if curFile is not None:
                    curFile.close()
                tempBed = getLocalTempPath("TempTsdFinderIn", ".bed")
                tempBeds.append(tempBed)
                curFile = open(tempBed, "w")
                curSize = 0
            curFile.write("\t".join([str(s) for s in interval]))
            curFile.write("\n")
            curSize += 1
    if curFile is not None:
        curFile.close()

    # map jobs
    assert len(tempBeds) <= args.numProc
    tempOuts = []
    jobCmds = []
    for tempBed in tempBeds:
        # rebuild this script's own command line, pointed at the chunk
        # input/output and forced down to a single process
        cmdLine = " ".join(sys.argv)
        cmdLine = cmdLine.replace("--numProc %d" % args.numProc, "--numProc 1")
        cmdLine = cmdLine.replace(args.inBed, tempBed)
        tempOut = getLocalTempPath("TempTsdFinderOut", ".bed")
        cmdLine = cmdLine.replace(args.outBed, tempOut)
        tempOuts.append(tempOut)
        jobCmds.append(cmdLine)

    runParallelShellCommands(jobCmds, args.numProc)

    # reduce: concatenate the chunk outputs back into the single output bed
    for i, tempOut in enumerate(tempOuts):
        if i == 0:
            runShellCommand("mv %s %s" % (tempOut, args.outBed))
        else:
            runShellCommand("cat %s >> %s" % (tempOut, args.outBed))
            runShellCommand("rm -f %s" % (tempOut))
예제 #22
0
def runParallel(args, bedIntervals):
    """ Quick hack to rerun parallel jobs on different interval subsets. """
    # only intervals whose name (column 4) appears in args.names (if given)
    # are processed
    nameSet = None
    if args.names is not None:
        nameSet = set(args.names.split(","))

    # chunk up BED input
    numIntervals = 0
    for interval in bedIntervals:
        name = None
        if len(interval) > 3:
            name = interval[3]
        if nameSet is None or name in nameSet:
            numIntervals += 1
    # Python 2 integer division determines the per-process chunk size
    jobSize = 1 + (numIntervals / args.numProc)
    logger.info("Dviding %d intervals into %d processes (%d intervals per)" % (
        numIntervals, args.numProc, jobSize))
    tempBeds = []
    # sys.maxint sentinel forces a new chunk file on the first interval
    curSize = sys.maxint
    curFile = None
    for interval in bedIntervals:
        name = None
        if len(interval) > 3:
            name = interval[3]
        if nameSet is None or name in nameSet:
            if curSize >= jobSize:
                if curFile is not None:
                    curFile.close()
                tempBed = getLocalTempPath("TempTsdFinderIn", ".bed")
                tempBeds.append(tempBed)
                curFile = open(tempBed, "w")
                curSize = 0
            curFile.write("\t".join([str(s) for s in interval]))
            curFile.write("\n")
            curSize += 1
    if curFile is not None:
        curFile.close()

    # map jobs
    assert len(tempBeds) <= args.numProc
    tempOuts = []
    jobCmds = []
    for tempBed in tempBeds:
        # re-issue this script's own command line against each chunk,
        # forcing a single process per job
        cmdLine = " ".join(sys.argv)
        cmdLine = cmdLine.replace("--numProc %d" % args.numProc,"--numProc 1")
        cmdLine = cmdLine.replace(args.inBed, tempBed)
        tempOut = getLocalTempPath("TempTsdFinderOut", ".bed")
        cmdLine = cmdLine.replace(args.outBed, tempOut)
        tempOuts.append(tempOut)
        jobCmds.append(cmdLine)

    runParallelShellCommands(jobCmds, args.numProc)

    # reduce: merge the per-chunk outputs back into args.outBed
    for i, tempOut in enumerate(tempOuts):
        if i == 0:
            runShellCommand("mv %s %s" % (tempOut, args.outBed))
        else:
            runShellCommand("cat %s >> %s" % (tempOut, args.outBed))
            runShellCommand("rm -f %s" % (tempOut))
예제 #23
0
def runCleaning(args, tempTracksInfo):
    """Run the cleaning scripts (cleanRM.py, cleanTermini.py,
    removeBedOverlaps.py, cleanLtrFinderID.py) on every track whose
    preprocess attribute requests one, point each track at its cleaned
    output file, and save the updated track list to tempTracksInfo."""
    trackList = TrackList(args.tracksInfo)

    for track in trackList:
        if track.getPreprocess() is None:
            continue

        # convert bigbed/wig input to bed so the cleaning scripts can read it
        inFile = track.getPath()
        tempBed1 = None
        if inFile[-3:] == ".bb" or inFile[-3:] == ".bw":
            tempBed1 = getLocalTempPath("Temp_%s" % track.getName(), ".bed")
            if inFile[-3:] == ".bb":
                runShellCommand("bigBedToBed %s %s" % (inFile, tempBed1))
            else:
                runShellCommand("bigWigToBedGraph %s %s" % (inFile, tempBed1))
            inFile = tempBed1

        # run cleanRM.py on all tracks with rm or rmu preprocessor
        if track.getPreprocess() == "rm" or track.getPreprocess() == "rmu":
            flag = ""
            if track.getPreprocess() == "rmu":
                # bugfix: was "flag == ..." (a no-op comparison), so the
                # --keepUnderscore option was never actually passed
                flag = "--keepUnderscore"
            # bugfix: keep using the (possibly converted) inFile; the
            # original re-read track.getPath() here, discarding the
            # bigbed/bigwig conversion done above
            outFile = cleanPath(args, track)
            tempBed = getLocalTempPath("Temp_%s" % track.getName(), ".bed")
            runShellCommand("cleanRM.py %s %s > %s" % (inFile, flag, tempBed))
            runShellCommand("removeBedOverlaps.py --rm %s > %s" %
                            (tempBed, outFile))
            runShellCommand("rm -f %s" % tempBed)
            track.setPath(outFile)

        # run cleanTermini.py
        elif track.getPreprocess() == "termini":
            outFile = cleanPath(args, track)
            runShellCommand("cleanTermini.py %s %s" % (inFile, outFile))
            track.setPath(outFile)

        # run removeBedOverlaps
        elif track.getPreprocess() == "overlap":
            outFile = cleanPath(args, track)
            runShellCommand("removeBedOverlaps.py %s > %s" % (inFile, outFile))
            track.setPath(outFile)

        # run cleanLtrFinder.py
        elif track.getPreprocess() == "ltr_finder":
            outFile = cleanPath(args, track)
            # note: overlaps now removed in cleanLtrFinderID script
            runShellCommand("cleanLtrFinderID.py %s %s" % (inFile, outFile))
            track.setPath(outFile)

        # remove the temporary conversion file, if any
        if tempBed1 is not None:
            runShellCommand("rm -f %s" % tempBed1)

    # save a temporary xml with the updated (cleaned) paths
    trackList.saveXML(tempTracksInfo)
예제 #24
0
def main(argv=None):
    """Thin wrapper of teHmmTrain.py / teHmmEval.py: train and evaluate an
    HMM for every combination of training bed and state count (or, with
    --initTrans/--numReps/--numIter, the overridden parameter list), then
    write a CSV of BIC statistics to <outDir>/bictable.csv.

    All heavy lifting is shelled out to teHmmTrain.py / teHmmEval.py and
    dispatched in parallel with runParallelShellCommands().
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=" Thin wrapper of teHmmTrain.py and teHmmEval.py "
        "to generate a table of Number-of-HMM-states VS BIC. Lower BIC"
        " is better")

    parser.add_argument("tracks", help="tracks xml used for training and eval")
    parser.add_argument(
        "trainingBeds",
        help="comma-separated list of training regions"
        " (training region size will be a variable in output table). "
        "if segmentation is activated, these must also be the "
        "segmented beds...")
    parser.add_argument("evalBed", help="eval region")
    parser.add_argument("trainOpts", help="all teHmmTrain options in quotes")
    parser.add_argument("evalOpts", help="all teHmmEval options in quotes")
    parser.add_argument("states",
                        help="comma separated-list of numbers of states"
                        " to try")
    parser.add_argument("outDir", help="output directory")
    parser.add_argument("--reps",
                        help="number of replicates",
                        type=int,
                        default=1)
    parser.add_argument("--proc",
                        help="maximum number of processors to use"
                        " in parallel",
                        type=int,
                        default=1)
    parser.add_argument("--resume",
                        help="try not to rewrite existing files",
                        action="store_true",
                        default=False)
    parser.add_argument(
        "--initTrans",
        help="the states argument is overridden"
        " to specify a list of transition initialization files "
        "instead of state numbers",
        action="store_true",
        default=False)
    parser.add_argument("--numReps",
                        help="the states argument is overridden"
                        " to specifiy a list of replicate numbers (--reps)"
                        " arguments",
                        action="store_true",
                        default=False)
    parser.add_argument("--numIter",
                        help="the states argument is overridden"
                        " to specifiy a list of iteration counts (--iter)"
                        " arugments",
                        action="store_true",
                        default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # the three "states argument override" modes are mutually exclusive
    if sum([int(i) for i in [args.initTrans, args.numReps, args.numIter]]) > 1:
        raise RuntimeError("only one of {--initTrans, --numReps, --numIter} "
                           "can be used at a time")

    if not os.path.isdir(args.outDir):
        runShellCommand("mkdir %s" % args.outDir)

    # get the sizes of the training beds (total bases covered per bed)
    trainingSizes = []
    trainingBeds = []
    for tb in args.trainingBeds.split(","):
        if len(tb) > 0:
            trainingBeds.append(tb)
    for bed in trainingBeds:
        assert os.path.isfile(bed)
        bedLen = 0
        for interval in readBedIntervals(bed):
            bedLen += interval[2] - interval[1]
        trainingSizes.append(bedLen)

    # make sure --bed not in teHmmEval options and --numStates not in train
    # options (they are injected per-job below)
    trainOpts = args.trainOpts.split()
    if "--numStates" in args.trainOpts and not args.numReps and not args.numIter:
        nsIdx = trainOpts.index("--numStates")
        assert nsIdx < len(trainOpts) - 1
        del trainOpts[nsIdx]
        del trainOpts[nsIdx]
    if "--initTransProbs" in args.trainOpts:
        tpIdx = trainOpts.index("--initTransProbs")
        assert tpIdx < len(trainOpts) - 1
        del trainOpts[tpIdx]
        # bugfix: was "del trianOpts[tpIdx]" -- a NameError whenever
        # --initTransProbs appeared in the training options
        del trainOpts[tpIdx]
    trainProcs = 1
    if "--numThreads" in args.trainOpts:
        npIdx = trainOpts.index("--numThreads")
        assert npIdx < len(trainOpts) - 1
        trainProcs = int(trainOpts[npIdx + 1])
    segOptIdx = -1
    if "--segment" in args.trainOpts:
        segIdx = trainOpts.index("--segment")
        assert segIdx < len(trainOpts) - 1
        segOptIdx = segIdx + 1
    if args.numReps and "--reps" in args.trainOpts:
        repsIdx = trainOpts.index("--reps")
        assert repsIdx < len(trainOpts) - 1
        del trainOpts[repsIdx]
        del trainOpts[repsIdx]
    if args.numIter and "--iter" in args.trainOpts:
        iterIdx = trainOpts.index("--iter")
        assert iterIdx < len(trainOpts) - 1
        del trainOpts[iterIdx]
        del trainOpts[iterIdx]
    evalOpts = args.evalOpts.split()
    if "--bed" in args.evalOpts:
        bedIdx = evalOpts.index("--bed")
        assert bedIdx < len(evalOpts) - 1
        del evalOpts[bedIdx]
        del evalOpts[bedIdx]
    if "--bic" in args.evalOpts:
        bicIdx = evalOpts.index("--bic")
        assert bicIdx < len(evalOpts) - 1
        del evalOpts[bicIdx]
        del evalOpts[bicIdx]

    # hack in support for --initTrans option by munging out model sizes
    # from the text files
    if args.initTrans is True:
        transFiles = args.states.split(",")
        states = []
        for tf in transFiles:
            stateSet = set()
            with open(tf) as f:
                for line in f:
                    toks = line.split()
                    # was a stray debug "print toks"; use the logger instead
                    logger.debug("initTrans tokens: %s" % str(toks))
                    if len(toks) > 1 and toks[0][0] != "#":
                        stateSet.add(toks[0])
                        stateSet.add(toks[1])
            states.append(len(stateSet))
    else:
        states = args.states.split(",")

    trainCmds = []
    evalCmds = []
    prevSize = -1
    sameSizeCount = 0
    for trainingSize, trainingBed in zip(trainingSizes, trainingBeds):
        # hack to take into account we may have different inputs with same
        # same size, so their corresponding results need unique filenames
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        # was a stray debug "print"; use the logger instead
        logger.debug("%s %s %s" % (prevSize, trainingSize, sameSizeCount))
        for numStates in states:
            for rep in xrange(args.reps):
                outMod = os.path.join(
                    args.outDir, "hmm_%d.%d.%d.%d.mod" %
                    (trainingSize, sameSizeCount, int(numStates), int(rep)))
                if segOptIdx != -1:
                    trainOpts[segOptIdx] = trainingBed
                if args.initTrans is True:
                    statesOpt = "--initTransProbs %s" % transFiles[
                        states.index(numStates)]
                elif args.numIter is True:
                    # states argument overridden by iterations
                    statesOpt = "--iter %d" % int(numStates)
                elif args.numReps is True:
                    # states argument overridden by reps
                    statesOpt = "--reps %d" % int(numStates)
                else:
                    statesOpt = "--numStates %d" % int(numStates)
                trainCmd = "teHmmTrain.py %s %s %s %s %s" % (
                    args.tracks, trainingBed, outMod, " ".join(trainOpts),
                    statesOpt)
                # with --resume, skip jobs whose output already looks valid
                if not args.resume or not os.path.isfile(outMod) or \
                   os.path.getsize(outMod) < 100:
                    trainCmds.append(trainCmd)

                outBic = outMod.replace(".mod", ".bic")
                outBed = outMod.replace(".mod", "_eval.bed")
                evalCmd = "teHmmEval.py %s %s %s --bed %s --bic %s %s" % (
                    args.tracks, outMod, args.evalBed, outBed, outBic,
                    " ".join(evalOpts))
                if not args.resume or not os.path.isfile(outBic) or \
                   os.path.getsize(outBic) < 2:
                    evalCmds.append(evalCmd)

    # run the training (each training job may itself use trainProcs threads)
    runParallelShellCommands(trainCmds, max(1, args.proc / trainProcs))

    # run the eval
    runParallelShellCommands(evalCmds, args.proc)

    # make the table header
    tableFile = open(os.path.join(args.outDir, "bictable.csv"), "w")
    stateColName = "states"
    if args.numIter is True:
        # bugfix: was "statesColName" (typo), so the header column was
        # never renamed when --numIter was used
        stateColName = "iter"
    elif args.numReps is True:
        stateColName = "reps"
    tableFile.write("trainFile, trainSize, %s, meanBic, minBic, maxBic" %
                    stateColName)
    for i in xrange(args.reps):
        tableFile.write(", bic.%d" % i)
    tableFile.write("\n")

    # make the table body
    prevSize = -1
    sameSizeCount = 0
    for (trainingSize, trainingBed) in zip(trainingSizes, trainingBeds):
        # hack to take into account we may have different inputs with same
        # same size, so their corresponding results need unique filenames
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        for numStates in states:
            bics = []
            printBics = []
            for rep in xrange(args.reps):
                outMod = os.path.join(
                    args.outDir, "hmm_%d.%d.%d.%d.mod" %
                    (trainingSize, sameSizeCount, int(numStates), int(rep)))
                outBic = outMod.replace(".mod", ".bic")
                try:
                    # the first whitespace token of the bic file is the score
                    with open(outBic, "r") as obFile:
                        for line in obFile:
                            bic = float(line.split()[0])
                            break
                    bics.append(bic)
                    printBics.append(bic)
                except Exception:
                    # narrowed from a bare except; a missing or malformed
                    # bic file is reported but does not abort the table
                    logger.warning("Couldn't find bic %s" % outBic)
                    printBics.append("ERROR")
            # write row
            tableFile.write("%s, %d, %d" %
                            (trainingBed, int(trainingSize), int(numStates)))
            if len(bics) > 0:
                tableFile.write(", %f, %f, %f" %
                                (np.mean(bics), np.min(bics), np.max(bics)))
            else:
                tableFile.write(", ERROR, ERROR, ERROR")
            for pb in printBics:
                tableFile.write(", %s" % pb)
            tableFile.write("\n")
    tableFile.close()

    cleanBedTool(tempBedToolPath)
예제 #25
0
def runTrial(tracksList, iteration, newTrackName, args):
    """ compute a score for a given set of tracks using teHmmBenchmark.py

    All output for this trial goes under <outDir>/iter<N>/<track>_bench.
    Returns a (score, bic, naive, slope, rsq) tuple extracted from the
    benchmark output via the extract* helpers.
    """
    benchDir = os.path.join(args.outDir, "iter%d" % iteration)
    benchDir = os.path.join(benchDir, "%s_bench" % newTrackName)
    if not os.path.exists(benchDir):
        os.makedirs(benchDir)

    trainingPath = args.training
    truthPath = args.truth

    # write the candidate track set where the downstream tools can read it
    tracksPath =  os.path.join(benchDir, "tracks.xml")
    tracksList.saveXML(tracksPath)

    # log of the segmentation commands actually run (for reproducibility)
    segLogPath = os.path.join(benchDir, "segment_cmd.txt")
    segLog = open(segLogPath, "w")

    if args.segTracks == args.tracks:
        segTracksPath = tracksPath
    # pull out desired tracks from segment tracks XML if specified
    else:
        segTracksIn = TrackList(args.segTracks)
        segTracks = TrackList()
        for track in tracksList:
            segTrack = segTracksIn.getTrackByName(track.getName())
            if segTrack is not None:
                segTracks.addTrack(segTrack)
            else:
                logger.warning("track %s not found in segment tracks %s" % (
                    track.getName(), args.segTracks))
        segTracksPath = os.path.join(benchDir, "seg_tracks.xml")
        segTracks.saveXML(segTracksPath)

    # segment training
    segTrainingPath = os.path.join(benchDir,
                                   os.path.splitext(
                                       os.path.basename(trainingPath))[0]+
                                   "_trainSeg.bed")
    segmentCmd = "segmentTracks.py %s %s %s %s" % (segTracksPath,
                                                   trainingPath,
                                                   segTrainingPath,
                                                   args.segOpts)

    if args.fullSegment is False:
        runShellCommand(segmentCmd)
        segLog.write(segmentCmd + "\n")
    else:
        # reuse a precomputed full segmentation via symlink
        runShellCommand("ln -f -s %s %s" % (args.fullSegTrainPath, segTrainingPath))

    # segment eval
    segEvalPath = os.path.join(benchDir,
                                os.path.splitext(os.path.basename(truthPath))[0]+
                                "_evalSeg.bed")
    segmentCmd = "segmentTracks.py %s %s %s %s" % (segTracksPath,
                                                   truthPath,
                                                   segEvalPath,
                                                   args.segOpts)
    # when train and eval regions are the same file, just link rather than
    # re-running the segmentation
    if trainingPath == truthPath:
        segmentCmd = "ln -f -s %s %s" % (os.path.abspath(segTrainingPath), segEvalPath)
    if args.fullSegment is False:
        runShellCommand(segmentCmd)
        segLog.write(segmentCmd + "\n")
    else:
        runShellCommand("ln -f -s %s %s" % (args.fullSegEvalPath, segEvalPath))

    segLog.close()

    segPathOpts = " --eval %s --truth %s" % (segEvalPath, truthPath)

    benchCmd = "teHmmBenchmark.py %s %s %s %s" % (tracksPath,
                                                  benchDir,
                                                  segTrainingPath,
                                                  args.benchOpts + segPathOpts)
    runShellCommand(benchCmd)

    # collect the various metrics from the benchmark output
    score = extractScore(benchDir, segTrainingPath, args)
    bic = extractBIC(benchDir, segTrainingPath, args)
    naive = 0
    if args.doNaive is True:
        naive = extractNaive(tracksPath, benchDir, segTrainingPath, args)
    slope, rsq = extractF1ProbSlope(benchDir, segTrainingPath, args)

    # clean up big files?

    return score, bic, naive, slope, rsq
예제 #26
0
def main(argv=None):
    """Re-label a predicted bed file's state names so they best match a
    target annotation, via a confusion matrix built with the same logic as
    compareBedStates.py.  Optionally applies the mapping to a model file
    and writes the confusion matrix as a heatmap."""
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=" Given two bed files: a prediction and a true (or target)"
        " annotation, re-label the prediction's state names so that they "
        " best match the true annotation.  Usees same logic as "
        " compareBedStates.py for determining accuracy")

    parser.add_argument("tgtBed", help="Target bed file")
    parser.add_argument("predBed", help="Predicted bed file to re-label. ")
    parser.add_argument("outBed", help="Output bed (relabeling of predBed)")
    parser.add_argument("--col",
                        help="Column of bed files to use for state"
                        " (currently only support 4(name) or 5(score))",
                        default=4,
                        type=int)
    parser.add_argument(
        "--intThresh",
        help="Threshold to consider interval from"
        " tgtBed covered by predBed.  If not specified, then base"
        " level statistics will be used. Value in range (0,1]",
        type=float,
        default=None)
    parser.add_argument("--noFrag",
                        help="Dont allow fragmented interval matches ("
                        "see help for --frag in compareBedStates.py).  Only"
                        " relevant with --intThresh",
                        action="store_true",
                        default=False)
    parser.add_argument(
        "--qualThresh",
        help="Minimum match ratio between truth"
        " and prediction to relabel prediction.  Example, if"
        " predicted state X overlaps target state LTR 25 pct of "
        "the time, then qualThresh must be at least 0.25 to "
        "label X as LTR in the output.  Value in range (0, 1]",
        type=float,
        default=0.1)
    parser.add_argument("--ignore",
                        help="Comma-separated list of stateNames to"
                        " ignore (in prediction)",
                        default=None)
    parser.add_argument("--ignoreTgt",
                        help="Comma-separated list of stateNames to"
                        " ignore (int target)",
                        default=None)
    parser.add_argument("--tgt",
                        help="Comma-separated list of stateNames to "
                        " consider (in target).  All others will be ignored",
                        default=None)
    parser.add_argument(
        "--unique",
        help="If more than one predicted state maps"
        " to the same target state, add a unique id (numeric "
        "suffix) to the output so that they can be distinguished",
        action="store_true",
        default=False)
    parser.add_argument("--model",
                        help="Apply state name mapping to the model"
                        " in the specified path (it is strongly advised to"
                        " make a backup of the model first)",
                        default=None)
    parser.add_argument("--noMerge",
                        help="By default, adjacent intervals"
                        " with the same state name in the output are "
                        "automatically merged into a single interval.  This"
                        " flag disables this.",
                        action="store_true",
                        default=False)
    parser.add_argument("--hm",
                        help="Write confusion matrix as heatmap in PDF"
                        " format to specified file",
                        default=None)
    parser.add_argument("--old",
                        help="Use old name mapping logic which just "
                        "takes biggest overlap in forward confusion matrix.  "
                        "faster than new default logic which does the greedy"
                        " f1 optimization",
                        action="store_true",
                        default=False)
    parser.add_argument("--fdr",
                        help="Use FDR cutoff instead of (default)"
                        " greedy F1 optimization for state labeling",
                        type=float,
                        default=None)
    parser.add_argument("--tl",
                        help="Path to tracks XML file.  Used to cut "
                        "out mask tracks so they are removed from comparison."
                        " (convenience option to not have to manually run "
                        "subtractBed everytime...)",
                        default=None)
    parser.add_argument(
        "--colOrder",
        help="List of states used to force"
        " ordering in heatmap (otherwise alphabetical) columns. These"
        " states will correspond to the tgtBed when --old used and"
        " --predBed otherwise.",
        default=None)
    parser.add_argument(
        "--hmCovRow",
        help="Path to write 1-row heatmap of "
        "state coverage (fraction of bases). only works with --hm",
        default=None)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # normalize the comma-separated filter options into sets
    if args.ignore is not None:
        args.ignore = set(args.ignore.split(","))
    else:
        args.ignore = set()
    if args.ignoreTgt is not None:
        args.ignoreTgt = set(args.ignoreTgt.split(","))
    else:
        args.ignoreTgt = set()
    if args.tgt is not None:
        args.tgt = set(args.tgt.split(","))
        if args.old is True:
            raise RuntimeError("--tgt option not implemented for --old")
    else:
        args.tgt = set()
    if args.old is True and args.fdr is not None:
        raise RuntimeError("--old and --fdr options are exclusive")

    assert args.col == 4 or args.col == 5

    # optionally subtract out masked intervals before comparing
    tempFiles = []
    if args.tl is not None:
        cutBedTgt = cutOutMaskIntervals(args.tgtBed, -1, sys.maxint, args.tl)
        cutBedPred = cutOutMaskIntervals(args.predBed, -1, sys.maxint, args.tl)

        if cutBedTgt is not None:
            assert cutBedPred is not None
            tempFiles += [cutBedTgt, cutBedPred]
            args.tgtBed = cutBedTgt
            args.predBed = cutBedPred

    checkExactOverlap(args.tgtBed, args.predBed)

    intervals1 = readBedIntervals(args.tgtBed, ncol=args.col)
    intervals2 = readBedIntervals(args.predBed, ncol=args.col)
    cfName = "reverse"

    # --old compares in the forward direction (pred vs target swapped)
    if args.old is True:
        intervals1, intervals2 = intervals2, intervals1
        cfName = "forward"

    # generate confusion matrix based on accuracy comparison using
    # base or interval stats as desired
    if args.intThresh is not None:
        logger.info("Computing interval %s confusion matrix" % cfName)
        confMat = compareIntervalsOneSided(intervals2, intervals1,
                                           args.col - 1, args.intThresh, False,
                                           not args.noFrag)[1]
    else:
        logger.info("Computing base %s confusion matrix" % cfName)
        confMat = compareBaseLevel(intervals2, intervals1, args.col - 1)[1]

    logger.info("%s Confusion Matrix:\n%s" % (cfName, str(confMat)))

    # find the best "true" match for each predicted state
    if args.old is True:
        intervals1, intervals2 = intervals2, intervals1
        stateMap = getStateMapFromConfMatrix_simple(confMat)
    else:
        stateMap = getStateMapFromConfMatrix(confMat, args.tgt, args.ignoreTgt,
                                             args.ignore, args.qualThresh,
                                             args.fdr)

    # filter the stateMap to take into account the command-line options
    # notably --ignore, --ignoreTgt, --qualThresh, and --unique
    filterStateMap(stateMap, args)

    logger.info("State Map:\n%s", str(stateMap))

    # write the model if specified
    if args.model is not None:
        applyNamesToModel(stateMap, args.model)

    # generate the output bed using the statemap
    writeFittedBed(intervals2, stateMap, args.outBed, args.col - 1,
                   args.noMerge, args.ignoreTgt)

    # write the confusion matrix as heatmap
    if args.hm is not None:
        if canPlot is False:
            raise RuntimeError("Unable to write heatmap.  Maybe matplotlib is "
                               "not installed?")
        writeHeatMap(confMat, args.hm, args.colOrder, args.hmCovRow)

    if len(tempFiles) > 0:
        runShellCommand("rm -f %s" % " ".join(tempFiles))
    cleanBedTool(tempBedToolPath)
예제 #27
0
def parallelDispatch(argv, args):
    """Chunk the input by chromosome, recursively launch one child
    evaluation per chromosome in parallel, then concatenate the per-chrom
    results into args.outBed (and args.stats when requested)."""
    commands = []
    chroms = readBedIntervals(args.chroms, sort=True)
    chromBeds = []
    regionBeds = []
    segBeds = []
    statsBeds = []
    runningOffset = args.co
    for chromInterval in chroms:
        toks = copy.deepcopy(argv)
        # blank out the --chrom option so the child runs on one region only
        chromOptIdx = toks.index("--chrom")
        toks[chromOptIdx + 1] = ""
        toks[chromOptIdx] = ""

        # one-line bed describing just this chromosome
        chromBed = getLocalTempPath("TempChromPath", ".bed")
        with open(chromBed, "w") as chromFile:
            chromFile.write("%s\t%d\t%d\t0\t0\t.\n" % (chromInterval[0],
                                                       chromInterval[1],
                                                       chromInterval[2]))

        # restrict the full input to this chromosome
        regionBed = getLocalTempPath("Temp", ".bed")
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" %
                        (args.allBed, chromBed, regionBed))

        # nothing on this chromosome: skip it entirely
        if os.path.getsize(regionBed) < 2:
            continue

        runningOffset += int(chromInterval[2]) - int(chromInterval[1])

        regionBeds.append(regionBed)
        chromBeds.append(chromBed)

        toks[2] = regionBed

        segBed = getLocalTempPath("Temp", ".bed")
        toks[3] = segBed
        segBeds.append(segBed)

        # pass the accumulated coordinate offset down to the child
        if "--co" in toks:
            toks[toks.index("--co") + 1] = str(runningOffset)
        else:
            toks.append("--co")
            toks.append(str(runningOffset))

        if args.stats is not None:
            statsBed = getLocalTempPath("Temp", ".bed")
            toks[toks.index("--stats") + 1] = statsBed
            statsBeds.append(statsBed)
        commands.append(" ".join(toks))

    runParallelShellCommands(commands, args.proc)

    # merge: first chunk overwrites the output, the rest append
    for jobIdx, segBed in enumerate(segBeds):
        ct = ">" if jobIdx == 0 else ">>"
        runShellCommand("cat %s %s %s" % (segBed, ct, args.outBed))
        if len(statsBeds) > 0:
            runShellCommand("cat %s %s %s" % (statsBeds[jobIdx], ct, args.stats))

    # remove all temporary per-chromosome files
    for tempFile in itertools.chain(chromBeds, regionBeds, segBeds, statsBeds):
        runShellCommand("rm %s" % tempFile)
예제 #28
0
# output table path used by later steps of this script
outCsvPath = "cross.csv"

setLogLevel("INFO")
addLoggingFileHandler("log.txt", False)

# map a short track name -> cleaned BED file for that annotation source
bedFiles=dict()
bedFiles["hollister"] = "alyrata_hollister_clean.bed"
bedFiles["modeler"] = "alyrata_repeatmodeler_clean.bed"
bedFiles["chaux"] = "alyrata_chaux_clean.bed"
bedFiles["hmm"] = "hmm_1_clean_2state.bed"
bedFiles["trf"] = "alyrata_trf_clean.bed"
bedFiles["fgenesh"] = "alyrata_fgenesh_clean.bed"

# region of interest that every input gets intersected with
regionPath = "region1c4.bed"

# recreate the scratch directory from a clean slate
# NOTE(review): workPath is presumably defined earlier in the file -- not
# visible in this chunk; confirm before editing
runShellCommand("rm -rf %s; mkdir %s" % (workPath, workPath))

def bedPath(name, s):
    """Return the working-file path <workPath>/<name>_<s>.bed."""
    fileName = "%s_%s.bed" % (name, s)
    return os.path.join(workPath, fileName)

# make working files
# _out : intersection
# _te : TE-state renamed TE and everything else removed
# _gap : _out with gaps added
# _gap_te : _te with gaps added
for name, path in bedFiles.items():
    tPath = bedPath(name, "temp")
    outPath = bedPath(name, "out")
    # clip each input annotation to the region of interest
    runShellCommand("intersectBed -a %s -b %s > %s" % (path, regionPath, tPath))
    # NOTE(review): tracksPath and copyTrack are presumably defined earlier
    # in the file -- not visible in this chunk; confirm before editing
    runShellCommand("setScoreFromTrackIntersection.py %s %s %s %s" % (
        tracksPath, tPath, copyTrack, outPath))
예제 #29
0
def main(argv=None):
    """Strip ltr_finder ids from the name (4th) column of a bed file and
    write the cleaned result; with --all, also write several derived
    variants (_sym, _tsd_as_gap, _tsd_as_ltr, _single, ...)."""
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Remove ltr_finder ids from 4th column")
    parser.add_argument("inBed", help="bed with ltr results to process")
    parser.add_argument("outBed",
                        help="bed to write output to.  Will also "
                        "write outBed_sym.bed outBed_tsd_as_gap.bed etc.")
    parser.add_argument("--keepOl",
                        help="by default, if LTR elements "
                        "overlap, the one with the highest score (length "
                        "in event of tie) is kept. This option disables"
                        " this logic.",
                        action="store_true",
                        default=False)
    parser.add_argument("--all",
                        help="write _sym, _tsd_as_gap, etc. versions"
                        " of output",
                        action="store_true",
                        default=False)
    parser.add_argument("--weak",
                        help="score threshold such that any elemetns"
                        " with a score lower or equal to will be assigned the"
                        " prefix WEAK_ to their names.",
                        type=float,
                        default=-1)
    parser.add_argument(
        "--weakIgnore",
        help="dont apply --weak to state names"
        " that contain given keywords (defined as comma-separated"
        " list",
        default=None)

    args = parser.parse_args()
    tempBedToolPath = initBedTool()
    assert os.path.exists(args.inBed)
    baseOut, ext = os.path.splitext(args.outBed)
    if args.weakIgnore is not None:
        args.weakIgnore = args.weakIgnore.split(",")
    else:
        args.weakIgnore = []

    inBed = args.inBed

    # unless --keepOl, collapse overlapping elements first (into a temp file)
    toRm = []
    if not args.keepOl:
        inBed = getLocalTempPath("Temp", ".bed")
        removeOverlaps(args.inBed, inBed, args)
        toRm.append(inBed)

    # strip the "|LTR_TE|<id>" and "|-" suffixes from the name column
    os.system("sed -e \"s/|LTR_TE|[0-9]*//g\" -e \"s/|-//g\" %s > %s" %
              (inBed, args.outBed))

    if args.all:
        # _sym: drop the |left / |right orientation suffixes
        symBed = baseOut + "_sym" + ext
        os.system("sed -e \"s/|left//g\" -e \"s/|right//g\" %s > %s" %
                  (args.outBed, symBed))

        # _tsd_as_gap: drop TSD intervals entirely
        tsd_as_gapsBed = baseOut + "_tsd_as_gap" + ext
        os.system("grep -v TSD %s > %s" % (args.outBed, tsd_as_gapsBed))

        sym_tsd_as_gapsBed = baseOut + "_sym_tsd_as_gap" + ext
        os.system("grep -v TSD %s > %s" % (symBed, sym_tsd_as_gapsBed))

        # _tsd_as_ltr: relabel TSD intervals as LTR
        tsd_as_ltrBed = baseOut + "_tsd_as_ltr" + ext
        os.system("sed -e \"s/TSD/LTR/g\" %s > %s" %
                  (args.outBed, tsd_as_ltrBed))

        sym_tsd_as_ltrBed = baseOut + "_sym_tsd_as_ltr" + ext
        os.system("sed -e \"s/TSD/LTR/g\" %s > %s" %
                  (symBed, sym_tsd_as_ltrBed))

        # _single: collapse all LTR labels into a single "inside" state
        singleBed = baseOut + "_single" + ext
        os.system("sed -e \"s/LTR/inside/g\" %s > %s" %
                  (sym_tsd_as_ltrBed, singleBed))

    for path in toRm:
        runShellCommand("rm -f %s" % path)

    cleanBedTool(tempBedToolPath)
예제 #30
0
def main(argv=None):
    """ Sweep HMM model size vs. BIC: train one model per (training bed,
    state count, replicate) with teHmmTrain.py, evaluate each with
    teHmmEval.py --bic, and write the scores to <outDir>/bictable.csv.
    Lower BIC is better.  The "states" argument can be reinterpreted as a
    list of transition-init files (--initTrans), replicate counts
    (--numReps) or iteration counts (--numIter).

    Fixes vs. previous revision: `del trianOpts[tpIdx]` NameError typo,
    `statesColName` dead-assignment typo in --numIter header, removed two
    leftover debug prints, narrowed a bare except. """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=" Thin wrapper of teHmmTrain.py and teHmmEval.py "
        "to generate a table of Number-of-HMM-states VS BIC. Lower BIC"
        " is better")

    parser.add_argument("tracks", help="tracks xml used for training and eval")
    parser.add_argument("trainingBeds", help="comma-separated list of training regions"
                        " (training region size will be a variable in output table). "
                        "if segmentation is activated, these must also be the "
                        "segmented beds...")
    parser.add_argument("evalBed", help="eval region")
    parser.add_argument("trainOpts", help="all teHmmTrain options in quotes")
    parser.add_argument("evalOpts", help="all teHmmEval options in quotes")
    parser.add_argument("states", help="comma separated-list of numbers of states"
                        " to try")
    parser.add_argument("outDir", help="output directory")
    parser.add_argument("--reps", help="number of replicates", type = int,
                        default=1)
    parser.add_argument("--proc", help="maximum number of processors to use"
                        " in parallel", type = int, default = 1)
    parser.add_argument("--resume", help="try not to rewrite existing files",
                        action="store_true", default=False)
    parser.add_argument("--initTrans", help="the states argument is overridden"
                        " to specify a list of transition initialization files "
                        "instead of state numbers", action="store_true",
                        default=False)
    parser.add_argument("--numReps", help="the states argument is overridden"
                        " to specifiy a list of replicate numbers (--reps)"
                        " arguments", action="store_true", default=False)
    parser.add_argument("--numIter", help="the states argument is overridden"
                        " to specifiy a list of iteration counts (--iter)"
                        " arugments", action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # the three "states means something else" modes are mutually exclusive
    if sum([int(i) for i in [args.initTrans, args.numReps, args.numIter]]) > 1:
        raise RuntimeError("only one of {--initTrans, --numReps, --numIter} "
                           "can be used at a time")

    if not os.path.isdir(args.outDir):
        runShellCommand("mkdir %s" % args.outDir)

    # get the sizes of the training beds (used to build unique output
    # filenames and as a column of the output table)
    trainingSizes = []
    trainingBeds = []
    for tb in args.trainingBeds.split(","):
        if len(tb) > 0:
            trainingBeds.append(tb)
    for bed in trainingBeds:
        assert os.path.isfile(bed)
        bedLen = 0
        for interval in readBedIntervals(bed):
            bedLen += interval[2] - interval[1]
        trainingSizes.append(bedLen)

    # strip from the pass-through option strings any option that this
    # script manages itself (--numStates / --initTransProbs / --reps /
    # --iter for training; --bed / --bic for eval), so they are not
    # specified twice on the generated command lines
    trainOpts = args.trainOpts.split()
    if "--numStates" in args.trainOpts and not args.numReps and not args.numIter:
        nsIdx = trainOpts.index("--numStates")
        assert nsIdx < len(trainOpts) - 1
        del trainOpts[nsIdx]
        del trainOpts[nsIdx]
    if "--initTransProbs" in args.trainOpts:
        tpIdx = trainOpts.index("--initTransProbs")
        assert tpIdx < len(trainOpts) - 1
        del trainOpts[tpIdx]
        # second delete removes the option's value (was a NameError typo:
        # "del trianOpts[tpIdx]")
        del trainOpts[tpIdx]
    trainProcs = 1
    if "--numThreads" in args.trainOpts:
        npIdx = trainOpts.index("--numThreads")
        assert npIdx < len(trainOpts) - 1
        trainProcs = int(trainOpts[npIdx + 1])
    segOptIdx = -1
    if "--segment" in args.trainOpts:
        segIdx = trainOpts.index("--segment")
        assert segIdx < len(trainOpts) - 1
        segOptIdx = segIdx + 1
    if args.numReps and "--reps" in args.trainOpts:
        repsIdx = trainOpts.index("--reps")
        assert repsIdx < len(trainOpts) - 1
        del trainOpts[repsIdx]
        del trainOpts[repsIdx]
    if args.numIter and "--iter" in args.trainOpts:
        iterIdx = trainOpts.index("--iter")
        assert iterIdx < len(trainOpts) - 1
        del trainOpts[iterIdx]
        del trainOpts[iterIdx]
    evalOpts = args.evalOpts.split()
    if "--bed" in args.evalOpts:
        bedIdx = evalOpts.index("--bed")
        assert bedIdx < len(evalOpts) - 1
        del evalOpts[bedIdx]
        del evalOpts[bedIdx]
    if "--bic" in args.evalOpts:
        bicIdx = evalOpts.index("--bic")
        assert bicIdx < len(evalOpts) - 1
        del evalOpts[bicIdx]
        del evalOpts[bicIdx]

    # hack in support for --initTrans option by munging out model sizes
    # from the text files (number of distinct state names mentioned in
    # each transition file)
    if args.initTrans is True:
        transFiles = args.states.split(",")
        states = []
        for tf in transFiles:
            stateSet = set()
            with open(tf) as f:
                for line in f:
                    toks = line.split()
                    if len(toks) > 1 and toks[0][0] != "#":
                        stateSet.add(toks[0])
                        stateSet.add(toks[1])
            states.append(len(stateSet))
    else:
        states = args.states.split(",")

    # build all training and eval command lines up front so they can be
    # dispatched in parallel below
    trainCmds = []
    evalCmds = []
    prevSize = -1
    sameSizeCount = 0
    for trainingSize, trainingBed in zip(trainingSizes, trainingBeds):
        # hack to take into account we may have different inputs with the
        # same size, so their corresponding results need unique filenames
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        for numStates in states:
            for rep in xrange(args.reps):
                outMod = os.path.join(args.outDir, "hmm_%d.%d.%d.%d.mod" % (
                    trainingSize, sameSizeCount, int(numStates), int(rep)))
                if segOptIdx != -1:
                    trainOpts[segOptIdx] = trainingBed
                if args.initTrans is True:
                    statesOpt = "--initTransProbs %s" % transFiles[states.index(numStates)]
                elif args.numIter is True:
                    # states argument overridden by iterations
                    statesOpt = "--iter %d" % int(numStates)
                elif args.numReps is True:
                    # states argument overridden by reps
                    statesOpt = "--reps %d" % int(numStates)
                else:
                    statesOpt = "--numStates %d" % int(numStates)
                trainCmd = "teHmmTrain.py %s %s %s %s %s" % (
                    args.tracks, trainingBed, outMod, " ".join(trainOpts),
                    statesOpt)
                # --resume: skip jobs whose output already looks complete
                if not args.resume or not os.path.isfile(outMod) or \
                   os.path.getsize(outMod) < 100:
                    trainCmds.append(trainCmd)

                outBic = outMod.replace(".mod", ".bic")
                outBed = outMod.replace(".mod", "_eval.bed")
                evalCmd = "teHmmEval.py %s %s %s --bed %s --bic %s %s" % (
                    args.tracks, outMod, args.evalBed, outBed, outBic,
                    " ".join(evalOpts))
                if not args.resume or not os.path.isfile(outBic) or \
                   os.path.getsize(outBic) < 2:
                    evalCmds.append(evalCmd)

    # run the training (each job may itself use --numThreads processors,
    # so divide the processor budget accordingly)
    runParallelShellCommands(trainCmds, max(1, args.proc / trainProcs))

    # run the eval
    runParallelShellCommands(evalCmds, args.proc)

    # make the table header
    tableFile = open(os.path.join(args.outDir, "bictable.csv"), "w")
    stateColName = "states"
    if args.numIter is True:
        # fixed: previous revision assigned "statesColName" (typo) here,
        # so --numIter tables were mislabelled "states"
        stateColName = "iter"
    elif args.numReps is True:
        stateColName = "reps"
    tableFile.write("trainFile, trainSize, %s, meanBic, minBic, maxBic" % stateColName)
    for i in xrange(args.reps):
        tableFile.write(", bic.%d" % i)
    tableFile.write("\n")

    # make the table body: one row per (training bed, state setting), with
    # per-replicate BIC values read back from the eval outputs
    prevSize = -1
    sameSizeCount = 0
    for (trainingSize, trainingBed) in zip(trainingSizes, trainingBeds):
        # same unique-filename hack as above; must mirror it exactly so the
        # paths here match the ones the jobs wrote
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        for numStates in states:
            bics = []
            printBics = []
            for rep in xrange(args.reps):
                outMod = os.path.join(args.outDir, "hmm_%d.%d.%d.%d.mod" % (
                    trainingSize, sameSizeCount, int(numStates), int(rep)))
                outBic = outMod.replace(".mod", ".bic")
                try:
                    # BIC is the first token of the first line of the file
                    with open(outBic, "r") as obFile:
                        for line in obFile:
                            bic = float(line.split()[0])
                            break
                    bics.append(bic)
                    printBics.append(bic)
                except Exception:
                    logger.warning("Couldn't find bic %s" % outBic)
                    printBics.append("ERROR")
            # write row
            tableFile.write("%s, %d, %d" % (trainingBed, int(trainingSize), int(numStates)))
            if len(bics) > 0:
                tableFile.write(", %f, %f, %f" % (np.mean(bics), np.min(bics),
                                                  np.max(bics)))
            else:
                tableFile.write(", ERROR, ERROR, ERROR")
            for pb in printBics:
                tableFile.write(", %s" % pb)
            tableFile.write("\n")
    tableFile.close()

    cleanBedTool(tempBedToolPath)
예제 #31
0
def runTsd(args, tempTracksInfo):
    """Build the TSD track by running addTsdTrack.py over the termini, TIR,
    and repeat_modeler inputs, folding each result into the tracks XML at
    tempTracksInfo.  Does nothing when --noTsd was requested."""
    if args.noTsd is True:
        return

    srcTrackList = TrackList(args.tracksInfo)
    dstTrackList = TrackList(tempTracksInfo)

    scratchFiles = []   # temporary beds to delete at the end
    inputPaths = []     # one input bed per addTsdTrack invocation
    inputNames = []     # track name matching each input path

    # preprocess the two lastz-derived inputs: convert .bb to .bed when
    # needed, then run fillTermini.py on each
    seedTracks = [
        srcTrackList.getTrackByName(args.ltr_termini),
        srcTrackList.getTrackByName(args.tir)
    ]
    for seedTrack in seedTracks:
        if seedTrack is None:
            logger.warning("Could not find termini track")
            continue
        srcPath = seedTrack.getPath()
        filledPath = getLocalTempPath("Temp_fill", ".bed")
        convertedBed = None
        if srcPath[-3:] == ".bb":
            convertedBed = getLocalTempPath("Temp_termini", ".bed")
            runShellCommand("bigBedToBed %s %s" % (srcPath, convertedBed))
            srcPath = convertedBed
        runShellCommand("fillTermini.py %s %s" % (srcPath, filledPath))
        inputPaths.append(filledPath)
        inputNames.append(seedTrack.getName())
        scratchFiles.append(filledPath)
        if convertedBed is not None:
            runShellCommand("rm -f %s" % convertedBed)

    # the repeat_modeler track is used as-is
    modelerTrack = dstTrackList.getTrackByName(args.repeat_modeler)
    if modelerTrack is not None:
        inputPaths.append(modelerTrack.getPath())
        inputNames.append(modelerTrack.getName())

    assert len(inputPaths) == len(inputNames)

    # really rough hardcoded params based on
    # (A unified classification system for eukaryotic transposable elements
    # Wicker et. al 2007); entries are checked in order, first match wins
    optionTable = [
        (args.repeat_modeler, [" --names LINE,SINE,Unknown", " --maxScore 20",
                               " --left 20", " --right 20", " --min 5",
                               " --max 20", " --overlap 20"]),
        (args.ltr_termini, [" --maxScore 3", " --left 8", " --right 8",
                            " --min 3", " --max 6"]),
        (args.tir, [" --maxScore 3", " --left 15", " --right 15",
                    " --min 3", " --max 12"]),
    ]

    # run addTsdTrack once per input (appending except the first time);
    # note we override input track paths in each case
    for idx in xrange(len(inputPaths)):
        pieces = []
        if idx > 0:
            pieces.append(" --append")
        for trackKey, trackOpts in optionTable:
            if inputNames[idx] == trackKey:
                pieces.extend(trackOpts)
                break
        optString = "".join(pieces)

        tempXMLOut = getLocalTempPath("Temp_tsd_xml", ".xml")
        runShellCommand(
            "addTsdTrack.py %s %s %s %s %s %s --inPath %s %s %s --numProc %d" %
            (tempTracksInfo, args.cleanTrackPath, tempXMLOut,
             inputNames[idx], args.sequence, args.tsd, inputPaths[idx],
             optString, args.logOpString, args.numProc))

        runShellCommand("mv %s %s" % (tempXMLOut, tempTracksInfo))

    for scratch in scratchFiles:
        runShellCommand("rm %s" % scratch)
예제 #32
0
def cutOutMaskIntervals(inBed, minLength, maxLength, tracksInfoPath):
    """Cut mask-track intervals out of inBed entirely, returning the path of
    a new temporary bed.  The idea is that it makes less sense to simply
    ignore, say, giant stretches of N's (like centromeres), as we would by
    masking them normally, than to remove them outright, splitting the
    genome into multiple chunks.  Also usable during comparison to drop all
    masked intervals.  Returns None when the tracks XML declares no mask
    tracks; raises RuntimeError when nothing survives the subtraction."""
    outPath = getLocalTempPath("Tempcut", ".bed")
    maskBeds = [track.getPath()
                for track in TrackList(tracksInfoPath).getMaskTracks()]
    if not maskBeds:
        return None
    scratchA = getLocalTempPath("Tempcut1", ".bed")
    scratchB = getLocalTempPath("Tempcut2", ".bed")
    runShellCommand("cp %s %s" % (inBed, outPath))
    # pile the first three columns of every mask bed into one scratch file
    for maskBed in maskBeds:
        runShellCommand("cat %s | awk \'{print $1\"\t\"$2\"\t\"$3}\' >> %s" %
                        (maskBed, scratchA))
    if os.path.getsize(scratchA) > 0:
        # sort and merge the mask intervals, select by length (the +1/-1
        # presumably make the [minLength, maxLength] bounds exclusive --
        # confirm against filterBedLengths.py), then subtract the survivors
        runShellCommand("sortBed -i %s > %s ; mergeBed -i %s > %s" %
                        (scratchA, scratchB, scratchB, scratchA))
        runShellCommand("filterBedLengths.py %s %d %d > %s" %
                        (scratchA, minLength + 1, maxLength - 1, scratchB))
        runShellCommand("subtractBed -a %s -b %s | sortBed > %s" %
                        (outPath, scratchB, scratchA))
        runShellCommand("mv %s %s" % (scratchA, outPath))
    runShellCommand("rm -f %s %s" % (scratchA, scratchB))
    if os.path.getsize(outPath) == 0:
        raise RuntimeError(
            "cutOutMaskIntervals removed everything.  Can't continue."
            " probably best to rerun calling script on bigger region?")
    return outPath
예제 #33
0
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Compare two bed files where Model states are represented"
        " in a column.  Used to determine sensitivity and specificity.  NOTE"
        " that both bed files must be sorted and cover the exact same regions"
        " of the same genome.")

    parser.add_argument("bed1", help="Bed file (TRUTH)")
    parser.add_argument("bed2",
                        help="Bed file covering same regions in same"
                        " order as bed1")
    parser.add_argument("--col",
                        help="Column of bed files to use for state"
                        " (currently only support 4(name) or 5(score))",
                        default=4,
                        type=int)
    parser.add_argument("--thresh",
                        help="Threshold to consider interval from"
                        " bed1 covered by bed2.",
                        type=float,
                        default=0.8)
    parser.add_argument("--plot",
                        help="Path of file to write Precision/Recall"
                        " graphs to in PDF format",
                        default=None)
    parser.add_argument("--ignore",
                        help="Comma-separated list of stateNames to"
                        " ignore",
                        default=None)
    parser.add_argument(
        "--strictPrec",
        help="By default, precision is computed"
        " in a manner strictly symmetric to recall.  So calling"
        " compareBedStates.py A.bed B.bed would give the exact"
        " same output as compareBedStates.py B.bed A.bed except"
        " precision and recall values would be swapped.  With "
        " this option, a predicted element only counts toward"
        " precision if it overlaps with 80pct of the true"
        " element, as opposed to only needing 80pct of itself"
        " overlapping with the true element. ",
        action="store_true",
        default=False)
    parser.add_argument("--noBase",
                        help="Skip base-level stats (and only show"
                        " interval stats).  Runs faster",
                        action="store_true",
                        default=False)
    parser.add_argument("--noFrag",
                        help="Do not allow fragmented matches in"
                        "interval predictions.  ie if a single truth interval"
                        " is covered by a series of predicted intervals, only "
                        "the best match will be counted if this flag is used",
                        action="store_true",
                        default=False)
    parser.add_argument("--tl",
                        help="Path to tracks XML file.  Used to cut "
                        "out mask tracks so they are removed from comparison."
                        " (convenience option to not have to manually run "
                        "subtractBed everytime...)",
                        default=None)
    parser.add_argument("--delMask",
                        help="Entirely remove intervals from "
                        "mask tracks that are > given length.  Probably "
                        "only want to set to non-zero value K if using"
                        " with a prediction that was processed with "
                        "interpolateMaskedRegions.py --max K",
                        type=int,
                        default=0)
    parser.add_argument(
        "--window",
        help="A comma-delimited 5-tuple of "
        "windowSize,stateName,compType,score,outBed.  "
        "Where windowSize  is the sliding window size "
        "(overlap .5), stateName is target stateName,"
        " compType is in {base,interval,weighted}, sore is"
        " in {f1,precision,recall} and "
        "outBed is the path of a bedFile to write positional"
        " accuracy to.  For example, --window 1000000,TE,base,f1"
        ",acc.bed will write base-level f1 for 1MB sliding windows"
        " to acc.bed.  These can be viewed on the browser by first"
        " converting to BigWig.",
        default=None)

    args = parser.parse_args()
    tempBedToolPath = initBedTool()

    if args.ignore is not None:
        args.ignore = set(args.ignore.split(","))
    else:
        args.ignore = set()

    assert args.col == 4 or args.col == 5
    print "Commandline %s" % " ".join(sys.argv)
    origArgs = copy.deepcopy(args)

    tempFiles = []
    if args.tl is not None:
        cutBed1 = cutOutMaskIntervals(args.bed1, args.delMask, sys.maxint,
                                      args.tl)
        cutBed2 = cutOutMaskIntervals(args.bed2, args.delMask, sys.maxint,
                                      args.tl)
        if cutBed1 is not None:
            assert cutBed2 is not None
            tempFiles += [cutBed1, cutBed2]
            args.bed1 = cutBed1
            args.bed2 = cutBed2

    checkExactOverlap(args.bed1, args.bed2)

    if args.window is not None:
        runPositionalComparison(argv, origArgs)

    intervals1 = readBedIntervals(args.bed1, ncol=args.col)
    intervals2 = readBedIntervals(args.bed2, ncol=args.col)

    if args.noBase is False:
        stats = compareBaseLevel(intervals1, intervals2, args.col - 1)[0]

        totalRight, totalWrong, accMap = summarizeBaseComparision(
            stats, args.ignore)
        print "Base counts [False Negatives, False Positives, True Positives]:"
        print stats
        totalBoth = totalRight + totalWrong
        accuracy = float(totalRight) / float(totalBoth)
        print "Accuaracy: %d / %d = %f" % (totalRight, totalBoth, accuracy)
        print "State-by-state (Precision, Recall):"
        print "Base-by-base Accuracy"
        print accMap

    trueStats = compareIntervalsOneSided(intervals1, intervals2, args.col - 1,
                                         args.thresh, False,
                                         not args.noFrag)[0]
    predStats = compareIntervalsOneSided(intervals2, intervals1, args.col - 1,
                                         args.thresh, args.strictPrec,
                                         not args.noFrag)[0]
    intAccMap = summarizeIntervalComparison(trueStats, predStats, False,
                                            args.ignore)
    intAccMapWeighted = summarizeIntervalComparison(trueStats, predStats, True,
                                                    args.ignore)
    print "\nInterval Accuracy"
    print intAccMap
    print ""

    print "\nWeighted Interval Accuracy"
    print intAccMapWeighted
    print ""

    # print some row data to be picked up by scrapeBenchmarkRow.py
    if args.noBase is False:
        header, row = summaryRow(accuracy, stats, accMap)
        print " ".join(header)
        print " ".join(row)

    # make graph
    if args.plot is not None:
        if canPlot is False:
            raise RuntimeError("Unable to write plots.  Maybe matplotlib is "
                               "not installed?")
        writeAccPlots(accuracy, accMap, intAccMap, intAccMapWeighted,
                      args.thresh, args.plot)

    if len(tempFiles) > 0:
        runShellCommand("rm -f %s" % " ".join(tempFiles))
    cleanBedTool(tempBedToolPath)
예제 #34
0
def main(argv=None):
    """Generate accuracy results for a set of HMM prediction beds named
    ``*_<trainsize>.<stateNum>.bed`` (e.g. statesVsBic.py output): fit state
    names against a reference with fitStateNames.py (one --qualThresh run
    plus one run per FDR value), interpolate masked gaps, then score each
    result against a truth bed with compareBedStates.py.

    NOTE(review): this excerpt appears truncated -- ``outFile`` is opened
    but never written or closed in the visible body, which ends right after
    the nested ``prettyAcc`` helper is defined.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        " Generate some accuracy results.  To be used on output of statesVsBic.py"
        "(or some set of hmm prediction beds of the form *_trainsize.stateNum.bed"
    )

    parser.add_argument("tracksList", help="XML tracks list")
    parser.add_argument("truthBed",
                        help="reference to benchmark against (ex repet)")
    parser.add_argument("fitBed", help="predition to fit against (ex modeler)")
    parser.add_argument("outDir", help="output directory")
    parser.add_argument("beds",
                        help="one or more bed files to evaluate",
                        nargs="*")
    parser.add_argument("--proc",
                        help="number of parallel processes",
                        type=int,
                        default=1)
    parser.add_argument("--maskGap",
                        help="interpolate masked gaps smaller than this",
                        type=int,
                        default=5000)
    parser.add_argument("--exploreFdr",
                        help="try a bunch of fdr values",
                        action="store_true",
                        default=False)
    parser.add_argument(
        "--compWindow",
        help="intersect with this file before running comparison",
        default=None)

    args = parser.parse_args()

    # preloop to check files: every bed basename must parse as
    # *_<trainsize>.<stateNum>...; the int() calls are the validation,
    # tSize/nStates themselves are discarded here
    for bed in args.beds:
        toks = "_".join(os.path.basename(bed).split(".")).split("_")
        tSize, nStates = int(toks[1]), int(toks[3])

    if not os.path.isdir(args.outDir):
        runShellCommand("mkdir %s" % args.outDir)

    # NOTE(review): opened but never written/closed in this excerpt
    outFile = open(os.path.join(args.outDir, "accuracy.csv"), "w")

    # clip the truth bed down to the comparison window, if one was given
    truthBed = args.truthBed
    if args.compWindow is not None:
        truthBed = os.path.join(args.outDir, "clippedTruth.bed")
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" %
                        (args.truthBed, args.compWindow, truthBed))

    # FDR values for the fitStateNames sweep: full grid or a single default
    if args.exploreFdr is True:
        fdrs = [
            0, .05, .1, .15, .20, .25, .30, .35, .40, .45, .50, .55, .60, .65,
            .70, .75, .80, .85, .90, .95, 1
        ]
    else:
        fdrs = [.65]

    # do two kinds of fitting vs modeler: one --qualThresh run plus one
    # --fdr run per value in fdrs
    fitCmds = []
    for bed in args.beds:
        toks = "_".join(os.path.basename(bed).split(".")).split("_")
        tSize, nStates = int(toks[1]), int(toks[3])
        fitOut = os.path.join(
            args.outDir,
            os.path.basename(bed).replace(".bed", "_fit.bed"))
        fitLog = fitOut.replace(".bed", "_log.txt")
        cmd = "fitStateNames.py %s %s %s --tl %s --tgt TE --qualThresh 0.1 --logDebug --logFile %s" % (
            args.fitBed, bed, fitOut, args.tracksList, fitLog)
        fitCmds.append(cmd)
        for fdr in fdrs:
            fitOutFdr = fitOut.replace(".bed", "Fdr%f.bed" % fdr)
            fitLogFdr = fitOutFdr.replace(".bed", "_log.txt")
            cmdFdr = "fitStateNames.py %s %s %s --tl %s --tgt TE --fdr %f --logDebug --logFile %s" % (
                args.fitBed, bed, fitOutFdr, args.tracksList, fdr, fitLogFdr)
            fitCmds.append(cmdFdr)

    # interpolate the gaps (masked regions up to --maskGap long) in every
    # fit output
    interpolateCmds = []
    for bed in args.beds:
        toks = "_".join(os.path.basename(bed).split(".")).split("_")
        tSize, nStates = int(toks[1]), int(toks[3])
        fitOut = os.path.join(
            args.outDir,
            os.path.basename(bed).replace(".bed", "_fit.bed"))
        fitOutMI = os.path.join(
            args.outDir,
            os.path.basename(bed).replace(".bed", "_fitMI.bed"))
        cmd = "interpolateMaskedRegions.py %s %s %s %s --maxLen %d" % (
            args.tracksList, args.truthBed, fitOut, fitOutMI, args.maskGap)
        interpolateCmds.append(cmd)

        for fdr in fdrs:
            fitOutFdr = fitOut.replace(".bed", "Fdr%f.bed" % fdr)
            fitOutFdrMI = fitOutMI.replace(".bed", "Fdr%f.bed" % fdr)
            cmdFdr = "interpolateMaskedRegions.py %s %s %s %s --maxLen %d" % (
                args.tracksList, args.truthBed, fitOutFdr, fitOutFdrMI,
                args.maskGap)
            interpolateCmds.append(cmdFdr)

    # run the comparison (optionally clipping each prediction to the
    # comparison window first)
    compareCmds = []
    for bed in args.beds:
        toks = "_".join(os.path.basename(bed).split(".")).split("_")
        tSize, nStates = int(toks[1]), int(toks[3])
        fitOutMI = os.path.join(
            args.outDir,
            os.path.basename(bed).replace(".bed", "_fitMI.bed"))
        comp = os.path.join(args.outDir,
                            os.path.basename(bed).replace(".bed", "_comp.txt"))
        cmd = ""
        fitOutMIClipped = fitOutMI
        if args.compWindow is not None:
            fitOutMIClipped = fitOutMI.replace(".bed", "_clipped.bed")
            cmd += "intersectBed -a %s -b %s | sortBed > %s && " % (
                fitOutMI, args.compWindow, fitOutMIClipped)
        cmd += "compareBedStates.py %s %s --tl %s --delMask %d > %s" % (
            args.truthBed, fitOutMIClipped, args.tracksList, args.maskGap,
            comp)
        compareCmds.append(cmd)
        for fdr in fdrs:
            fitOutFdrMI = fitOutMI.replace(".bed", "Fdr%f.bed" % fdr)
            compFdr = comp.replace(".txt", "Fdr%f.txt" % fdr)
            cmdFdr = ""
            fitOutFdrMIClipped = fitOutFdrMI
            if args.compWindow is not None:
                fitOutFdrMIClipped = fitOutFdrMI.replace(
                    ".bed", "_clipped.bed")
                cmdFdr += "intersectBed -a %s -b %s | sortBed > %s &&" % (
                    fitOutFdrMI, args.compWindow, fitOutFdrMIClipped)
            cmdFdr += "compareBedStates.py %s %s --tl %s --delMask %d > %s" % (
                args.truthBed, fitOutFdrMIClipped, args.tracksList,
                args.maskGap, compFdr)
            compareCmds.append(cmdFdr)

    # the three phases depend on each other's outputs, so run them in order
    runParallelShellCommands(fitCmds, args.proc)
    runParallelShellCommands(interpolateCmds, args.proc)
    runParallelShellCommands(compareCmds, args.proc)
    # got a weird crash before where comp file wasn't found
    # maybe this will help?
    runShellCommand("sleep 10")

    # munging ############
    # note: Python-2-only tuple-parameter syntax; returns a CSV fragment
    # "prec, rec, f1, spec" with four decimal places
    def prettyAcc((prec, rec), spec):
        f1 = 0.
        if prec + rec > 0:
            f1 = (2. * prec * rec) / (prec + rec)
        return "%.4f, %.4f, %.4f, %.4f" % (prec, rec, f1, spec)
예제 #35
0
def main(argv=None):
    """Accuracy-results driver for HMM prediction beds named
    ``*_<trainsize>.<stateNum>.bed`` (e.g. statesVsBic.py output): fit state
    names with fitStateNames.py (a --qualThresh run plus one --fdr run per
    FDR value), interpolate masked gaps, and compare each result to a truth
    bed via compareBedStates.py.

    NOTE(review): this excerpt appears truncated -- ``outFile`` is opened
    but never written or closed in the visible body, which ends right after
    the nested ``prettyAcc`` helper is defined.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=" Generate some accuracy results.  To be used on output of statesVsBic.py"
        "(or some set of hmm prediction beds of the form *_trainsize.stateNum.bed")

    parser.add_argument("tracksList", help="XML tracks list")
    parser.add_argument("truthBed", help="reference to benchmark against (ex repet)")
    parser.add_argument("fitBed", help="predition to fit against (ex modeler)")
    parser.add_argument("outDir", help="output directory")
    parser.add_argument("beds", help="one or more bed files to evaluate", nargs="*")
    parser.add_argument("--proc", help="number of parallel processes", type=int, default=1)
    parser.add_argument("--maskGap", help="interpolate masked gaps smaller than this", type=int, default=5000)
    parser.add_argument("--exploreFdr", help="try a bunch of fdr values", action="store_true", default=False)
    parser.add_argument("--compWindow", help="intersect with this file before running comparison", default=None)

    args = parser.parse_args()

    # preloop to check files: every bed basename must parse as
    # *_<trainsize>.<stateNum>...; the int() calls are the validation,
    # tSize/nStates themselves are discarded here
    for bed in args.beds:
        toks = "_".join(os.path.basename(bed).split(".")).split("_")
        tSize, nStates = int(toks[1]), int(toks[3])

    if not os.path.isdir(args.outDir):
        runShellCommand("mkdir %s" % args.outDir)

    # NOTE(review): opened but never written/closed in this excerpt
    outFile = open(os.path.join(args.outDir, "accuracy.csv"), "w")

    # clip the truth bed down to the comparison window, if one was given
    truthBed = args.truthBed
    if args.compWindow is not None:
        truthBed = os.path.join(args.outDir, "clippedTruth.bed")
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (args.truthBed, args.compWindow, truthBed))

    # FDR values for the fitStateNames sweep: full grid or a single default
    if args.exploreFdr is True:
        fdrs = [0, .05, .1, .15, .20, .25, .30, .35, .40, .45, .50, .55, .60, .65, .70, .75, .80, .85, .90, .95, 1]
    else:
        fdrs = [.65]

    # do two kinds of fitting vs modeler: one --qualThresh run plus one
    # --fdr run per value in fdrs
    fitCmds = []
    for bed in args.beds:
        toks = "_".join(os.path.basename(bed).split(".")).split("_")
        tSize, nStates = int(toks[1]), int(toks[3])
        fitOut = os.path.join(args.outDir, os.path.basename(bed).replace(".bed", "_fit.bed"))
        fitLog = fitOut.replace(".bed", "_log.txt")
        cmd = "fitStateNames.py %s %s %s --tl %s --tgt TE --qualThresh 0.1 --logDebug --logFile %s" % (args.fitBed, bed, fitOut, args.tracksList, fitLog)
        fitCmds.append(cmd)
        for fdr in fdrs:
            fitOutFdr = fitOut.replace(".bed", "Fdr%f.bed" % fdr)
            fitLogFdr = fitOutFdr.replace(".bed", "_log.txt")
            cmdFdr = "fitStateNames.py %s %s %s --tl %s --tgt TE --fdr %f --logDebug --logFile %s" % (args.fitBed, bed, fitOutFdr, args.tracksList, fdr, fitLogFdr)
            fitCmds.append(cmdFdr)

    # interpolate the gaps (masked regions up to --maskGap long) in every
    # fit output
    interpolateCmds = []
    for bed in args.beds:
        toks = "_".join(os.path.basename(bed).split(".")).split("_")
        tSize, nStates = int(toks[1]), int(toks[3])
        fitOut = os.path.join(args.outDir, os.path.basename(bed).replace(".bed", "_fit.bed"))
        fitOutMI = os.path.join(args.outDir, os.path.basename(bed).replace(".bed", "_fitMI.bed"))
        cmd = "interpolateMaskedRegions.py %s %s %s %s --maxLen %d" % (args.tracksList, args.truthBed, fitOut, fitOutMI, args.maskGap)
        interpolateCmds.append(cmd)

        for fdr in fdrs:
            fitOutFdr = fitOut.replace(".bed", "Fdr%f.bed" % fdr)
            fitOutFdrMI = fitOutMI.replace(".bed", "Fdr%f.bed" % fdr)
            cmdFdr = "interpolateMaskedRegions.py %s %s %s %s --maxLen %d" % (args.tracksList, args.truthBed, fitOutFdr, fitOutFdrMI, args.maskGap)
            interpolateCmds.append(cmdFdr)

    # run the comparison (optionally clipping each prediction to the
    # comparison window first)
    compareCmds = []
    for bed in args.beds:
        toks = "_".join(os.path.basename(bed).split(".")).split("_")
        tSize, nStates = int(toks[1]), int(toks[3])
        fitOutMI = os.path.join(args.outDir, os.path.basename(bed).replace(".bed", "_fitMI.bed"))
        comp = os.path.join(args.outDir, os.path.basename(bed).replace(".bed", "_comp.txt"))
        cmd = ""
        fitOutMIClipped = fitOutMI
        if args.compWindow is not None:
            fitOutMIClipped = fitOutMI.replace(".bed", "_clipped.bed")
            cmd += "intersectBed -a %s -b %s | sortBed > %s && " % (fitOutMI, args.compWindow, fitOutMIClipped)
        cmd += "compareBedStates.py %s %s --tl %s --delMask %d > %s" % (args.truthBed, fitOutMIClipped, args.tracksList, args.maskGap, comp)
        compareCmds.append(cmd)
        for fdr in fdrs:
            fitOutFdrMI = fitOutMI.replace(".bed", "Fdr%f.bed" % fdr)
            compFdr = comp.replace(".txt", "Fdr%f.txt" % fdr)
            cmdFdr = ""
            fitOutFdrMIClipped = fitOutFdrMI
            if args.compWindow is not None:
                fitOutFdrMIClipped = fitOutFdrMI.replace(".bed", "_clipped.bed")
                cmdFdr += "intersectBed -a %s -b %s | sortBed > %s &&" % (fitOutFdrMI, args.compWindow, fitOutFdrMIClipped)
            cmdFdr += "compareBedStates.py %s %s --tl %s --delMask %d > %s" % (args.truthBed, fitOutFdrMIClipped, args.tracksList, args.maskGap, compFdr)
            compareCmds.append(cmdFdr)

    # the three phases depend on each other's outputs, so run them in order
    runParallelShellCommands(fitCmds, args.proc)
    runParallelShellCommands(interpolateCmds, args.proc)
    runParallelShellCommands(compareCmds, args.proc)
    # got a weird crash before where comp file wasn't found
    # maybe this will help?
    runShellCommand("sleep 10")

    # munging ############
    # note: Python-2-only tuple-parameter syntax; returns a CSV fragment
    # "prec, rec, f1, spec" with four decimal places
    def prettyAcc((prec, rec), spec):
        f1 = 0.
        if prec + rec > 0:
            f1 = (2. * prec * rec) / (prec + rec)
        return "%.4f, %.4f, %.4f, %.4f" % (prec, rec, f1, spec)
예제 #36
0
def parallelDispatch(argv, args):
    """ Chunk the input up per chromosome (one line of args.chroms each),
    recursively launch one child job per chromosome with --chrom disabled,
    then concatenate the children's outputs into args.outBed (and
    args.stats when requested) and remove all temporary files. """
    jobList = []
    chromIntervals = readBedIntervals(args.chroms, sort=True)
    chromFiles = []
    regionFiles = []
    segFiles = []
    statsFiles = []
    # running segment-label offset handed down to children via --co
    offset = args.co
    for chrom in chromIntervals:
        # clone the original command line and blank out the --chrom option
        # so the child job does not recurse again
        cmdToks = copy.deepcopy(argv)
        cmdToks[cmdToks.index("--chrom") + 1] = ""
        cmdToks[cmdToks.index("--chrom")] = ""

        # one-interval BED file describing this chromosome/region
        chromPath = getLocalTempPath("TempChromPath", ".bed")
        cpFile = open(chromPath, "w")
        cpFile.write("%s\t%d\t%d\t0\t0\t.\n" % (chrom[0], chrom[1], chrom[2]))
        cpFile.close()

        # clip the full input regions down to this chromosome
        regionPath = getLocalTempPath("Temp", ".bed")
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (args.allBed,
                                                                     chromPath,
                                                                     regionPath))

        # skip chromosomes that have no overlap with the input (file is
        # effectively empty when under 2 bytes)
        if os.path.getsize(regionPath) < 2:
            continue

        # NOTE(review): offset is advanced by this chromosome's base length
        # *before* being passed to its own child, so the first child already
        # starts at a non-zero offset -- confirm this is the intended
        # labelling scheme
        offset += int(chrom[2]) - int(chrom[1])

        regionFiles.append(regionPath)
        chromFiles.append(chromPath)

        # positional argv[2] is the input bed consumed by the child
        cmdToks[2] = regionPath

        # positional argv[3] is the child's output bed
        segPath =  getLocalTempPath("Temp", ".bed")
        cmdToks[3] = segPath
        segFiles.append(segPath)

        if "--co" in cmdToks:
            cmdToks[cmdToks.index("--co")+1] = str(offset)
        else:
            cmdToks.append("--co")
            cmdToks.append(str(offset))

        if args.stats is not None:
            # each child writes stats to its own temp file
            statsPath = getLocalTempPath("Temp", ".bed")
            cmdToks[cmdToks.index("--stats")+1] = statsPath
            statsFiles.append(statsPath)
        cmd = " ".join(cmdToks)
        jobList.append(cmd)

    runParallelShellCommands(jobList, args.proc)

    # merge child outputs in chromosome order: ">" truncates on the first
    # file, ">>" appends for the rest
    for i in xrange(len(jobList)):
        if i == 0:
            ct = ">"
        else:
            ct = ">>"
        runShellCommand("cat %s %s %s" % (segFiles[i], ct, args.outBed))
        if len(statsFiles) > 0:
            runShellCommand("cat %s %s %s" % (statsFiles[i], ct, args.stats))

    # clean up every temporary file created above
    for i in itertools.chain(chromFiles, regionFiles, segFiles, statsFiles):
        runShellCommand("rm %s" % i)            
예제 #37
0
    cmdEval = "segmentTracks.py %s %s %s %s --stats %s" % (segTracksPath, evalRegionPath, evalSegPath, segOpts,
                                                           evalSegPath.replace(".bed", ".segStats"))
    runParallelShellCommands([cmdEval, cmdTrain], 2)

# train ############
# Train an HMM on the training segmentation.  startPoint lets a rerun
# resume the pipeline at a later stage; variables such as trainTracksPath,
# segLen, numStates, threads, thresh and logOpts are defined earlier in
# this script (outside this excerpt).
modelPath = "hmm.mod"
if startPoint <=2:
    cmd = "teHmmTrain.py %s %s %s %s" % (trainTracksPath, trainSegPath, modelPath, logOpts)
    cmd += " --fixStart"
    cmd += " --segLen %d" % segLen
    cmd += " --numStates %d" % numStates
    # one EM restart per worker thread
    cmd += " --reps %d --numThreads %d" % (threads, threads)
    cmd += " --emThresh %f" % thresh
    # NOTE: "iter" is a script-level variable defined earlier that shadows
    # the builtin iter()
    cmd += " --iter %d" % iter
    cmd += " --segment %s" % trainSegPath
    runShellCommand(cmd)

# eval ############
# Evaluate (Viterbi decode) the trained model over the eval segments.
evalPath = "eval.bed"
if startPoint <=3:
    cmd = "teHmmEval.py %s %s %s --bed %s --segment %s" % (trainTracksPath, modelPath, evalSegPath, evalPath, logOpts)
    runShellCommand(cmd)

# fit ############
# Build the label file: modeler annotation clipped to the merged footprint
# of the eval segments (so masked-out regions are excluded).
fitPath = "fit.bed"
fitFdrPath = "fitFdr.bed"
labelPath = "label.bed"
if startPoint <=4:
    tempPath = getLocalTempPath("Tempmask", ".bed")
    # merged, sorted footprint of the evaluated segments
    runShellCommand("mergeBed -i %s | sortBed > %s" % (evalSegPath, tempPath))
    # clip the modeler labels down to that footprint
    runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (modelerPath, tempPath, labelPath))
예제 #38
0
non-LTR
TSD|right

"""

import sys
import os
from teHmm.common import runShellCommand, getLogLevelString, getLocalTempPath

# usage: <script> <inBed> <outBed>
assert len(sys.argv) == 3

infile = sys.argv[1]
outfile = sys.argv[2]
# scratch path; the filters below ping-pong between outfile and tempfile
# (NOTE: the name shadows the stdlib "tempfile" module)
tempfile = outfile + "_temp"

# get rid of any orphan TIR termini (not next to DNA)
runShellCommand("filterPredictions.py %s --mustBefore \"TIR|left,DNA\" --mustAfter \"TIR|right,DNA\" > %s" % (infile, outfile))

# merge up DNA elements into one
runShellCommand("filterPredictions.py %s --mergeBefore \"TSD|left,TIR|left\" --mergeAfter \"TSD|right,TIR|right\" > %s" % (outfile, tempfile))

# second merge pass: fold TIR termini into their adjacent DNA element
runShellCommand("filterPredictions.py %s --mergeBefore \"TIR|left,DNA\" --mergeAfter \"TIR|right,DNA\" > %s" % (tempfile, outfile))

# get rid of orphan LTR termini (not next to inside)
runShellCommand("filterPredictions.py %s --mustBefore \"LTR|left,inside\" --mustAfter \"LTR|right,inside\" > %s" % (outfile, tempfile))

# get rid of orphan TSD termini (not next to ltr or non-ltr)
runShellCommand("filterPredictions.py %s --mustBefore \"TSD|left,LTR|left\" --mustAfter \"TSD|right,LTR|right\" > %s" % (tempfile, outfile))

# final result is in outfile; drop the scratch file
runShellCommand("rm -f %s" % tempfile)
예제 #39
0
def main(argv=None):
    """Convert a BED12 gene file into a BED6 file whose records are the
    exon blocks (named by --exon) and the gaps between / around the blocks
    within [chromStart, chromEnd) (named by --intron).  Non-BED12 input is
    copied through unchanged."""
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        description="Transform a Bed12 file into a Bed6 file where the names"
        " are mapped to one of Intron/Exon (Bed12 blocks mapped to exons"
        " / remaining region covered by chromStart - chromEnd mapped to introns)"
    )
    parser.add_argument("inBed", help="bed with chaux results to process")
    parser.add_argument("outBed",
                        help="output bed (will be copy of input)"
                        " if bed12 not detected")
    parser.add_argument("--keepName",
                        help="keep gene names as prefix.  ie"
                        " output will be of form geneName_intron etc.",
                        action="store_true",
                        default=False)
    parser.add_argument("--intron", help="intron name", default="intron")
    parser.add_argument("--exon", help="exon name", default="exon")

    args = parser.parse_args()
    assert os.path.exists(args.inBed)
    tempBedToolPath = initBedTool()
    outFile = open(args.outBed, "w")

    # convert bigbed if necessary
    inBed = args.inBed
    if args.inBed[-3:] == ".bb":
        inBed = getLocalTempPath("Temp_cleanGenes", ".bed")
        runShellCommand("bigBedToBed %s %s" % (args.inBed, inBed))

    for interval in BedTool(inBed).sort():
        if len(interval.fields) < 12:
            # fewer than 12 columns: not BED12, so fall back to copying
            # NOTE(review): this copies args.inBed, which is still the .bb
            # file when a bigBed was converted above, and it overwrites the
            # path that outFile holds open -- confirm the converted inBed
            # was not intended here
            logger.warning("Input not bed12.. just copying")
            runShellCommand("cp %s %s" % (args.inBed, args.outBed))
            break
        else:
            # BED12 columns 10-12: blockCount, blockSizes, blockStarts
            numBlocks = int(interval.fields[9])
            blockSizes = [
                int(x) for x in interval.fields[10].split(",")[:numBlocks]
            ]
            blockOffsets = [
                int(x) for x in interval.fields[11].split(",")[:numBlocks]
            ]
            icopy = copy.deepcopy(interval)
            intron = args.intron
            exon = args.exon

            if args.keepName is True:
                # prefix output names with the gene name, e.g. gene_intron
                intron = "%s_%s" % (icopy.name, intron)
                exon = "%s_%s" % (icopy.name, exon)

            # edge cases that probably violate bed format
            if numBlocks == 0:
                # no blocks --> one big intron
                icopy.name = intron
                outFile.write(bed6String(icopy))
                continue

            if blockOffsets[0] > 0:
                # gap between start and first block --> intron
                icopy.end = icopy.start + blockOffsets[0]
                icopy.name = intron
                outFile.write(bed6String(icopy))

            for i in xrange(numBlocks):
                # write block as exon
                icopy.name = exon
                icopy.start = interval.start + blockOffsets[i]
                icopy.end = icopy.start + blockSizes[i]
                outFile.write(bed6String(icopy))

                if i < numBlocks - 1:
                    # gap between the end of this block and the next one
                    gap = blockOffsets[i + 1] - (blockOffsets[i] +
                                                 blockSizes[i])
                    if gap > 0:
                        # room for intron before next block
                        icopy.name = intron
                        icopy.start = icopy.end
                        icopy.end = icopy.start + gap
                        outFile.write(bed6String(icopy))

            # gap between the last block and the interval end
            gap = interval.end - (interval.start + blockOffsets[-1] +
                                  blockSizes[-1])
            if gap > 0:
                # room for intron after last block
                icopy.name = intron
                icopy.start = interval.start + blockOffsets[-1] + blockSizes[-1]
                icopy.end = interval.end
                outFile.write(bed6String(icopy))

    outFile.close()
    cleanBedTool(tempBedToolPath)
    # remove the temporary bed produced by bigBedToBed, if any
    if inBed != args.inBed:
        runShellCommand("rm %s" % inBed)
예제 #40
0
def main(argv=None):
    """Clean ltr_finder output: strip the ltr_finder id suffixes from the
    name (4th) column and, with --all, also write the _sym, _tsd_as_gap,
    _tsd_as_ltr, _sym_tsd_as_gap, _sym_tsd_as_ltr and _single variants.

    Consistency fix: the sed transformations now go through
    runShellCommand() (which surfaces a failing exit status) instead of
    os.system(), whose return value was silently ignored; the grep -v
    filters stay best-effort (see comment below).
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Remove ltr_finder ids from 4th column")
    parser.add_argument("inBed", help="bed with ltr results to process")
    parser.add_argument("outBed", help="bed to write output to.  Will also "
                        "write outBed_sym.bed outBed_tsd_as_gap.bed etc.")
    parser.add_argument("--keepOl", help="by default, if LTR elements "
                        "overlap, the one with the highest score (length "
                        "in event of tie) is kept. This option disables"
                        " this logic.", action="store_true", default=False)
    parser.add_argument("--all", help="write _sym, _tsd_as_gap, etc. versions"
                        " of output", action="store_true", default=False)
    parser.add_argument("--weak", help="score threshold such that any elemetns"
                        " with a score lower or equal to will be assigned the"
                        " prefix WEAK_ to their names.", type=float,
                        default=-1)
    parser.add_argument("--weakIgnore", help="dont apply --weak to state names"
                        " that contain given keywords (defined as comma-separated"
                        " list", default=None)

    args = parser.parse_args()
    tempBedToolPath = initBedTool()
    assert os.path.exists(args.inBed)
    baseOut, ext = os.path.splitext(args.outBed)
    if args.weakIgnore is not None:
        args.weakIgnore = args.weakIgnore.split(",")
    else:
        args.weakIgnore = []

    inBed = args.inBed

    toRm = []
    if not args.keepOl:
        # resolve overlapping elements first (keep highest score, then length)
        inBed = getLocalTempPath("Temp", ".bed")
        removeOverlaps(args.inBed, inBed, args)
        toRm.append(inBed)

    # strip ltr_finder id suffixes ("|LTR_TE|<n>" and "|-") from the names
    runShellCommand("sed -e \"s/|LTR_TE|[0-9]*//g\" -e \"s/|-//g\" %s > %s" % (
        inBed, args.outBed))

    if args.all:
        # _sym: drop the |left / |right orientation markers
        symBed = baseOut + "_sym" + ext
        runShellCommand("sed -e \"s/|left//g\" -e \"s/|right//g\" %s > %s" % (
            args.outBed, symBed))

        # _tsd_as_gap: remove TSD records entirely.  grep -v exits nonzero
        # when it selects no lines, so keep these two calls best-effort via
        # os.system rather than a raising runShellCommand.
        tsd_as_gapsBed = baseOut + "_tsd_as_gap" + ext
        os.system("grep -v TSD %s > %s" % (args.outBed, tsd_as_gapsBed))

        sym_tsd_as_gapsBed = baseOut + "_sym_tsd_as_gap" + ext
        os.system("grep -v TSD %s > %s" % (symBed, sym_tsd_as_gapsBed))

        # _tsd_as_ltr: rename TSD records to LTR
        tsd_as_ltrBed = baseOut + "_tsd_as_ltr" + ext
        runShellCommand("sed -e \"s/TSD/LTR/g\" %s > %s" % (args.outBed,
                                                            tsd_as_ltrBed))

        sym_tsd_as_ltrBed = baseOut + "_sym_tsd_as_ltr" + ext
        runShellCommand("sed -e \"s/TSD/LTR/g\" %s > %s" % (symBed,
                                                            sym_tsd_as_ltrBed))

        # _single: collapse LTR into the inside state
        singleBed = baseOut + "_single" + ext
        runShellCommand("sed -e \"s/LTR/inside/g\" %s > %s" % (sym_tsd_as_ltrBed,
                                                               singleBed))

    # remove the overlap-filtered temp file, if one was made
    for path in toRm:
        runShellCommand("rm -f %s" % path)

    cleanBedTool(tempBedToolPath)
예제 #41
0
def main(argv=None):
    """Segment genome annotation tracks into atomic intervals for the HMM.

    Parses the command line, optionally fans out one parallel job per
    chromosome (--chroms), loads the track data over the query intervals,
    translates the --cutTracks / --cutUnscaled / --cutMultinomial /
    --cutNonGaussian and --ignore options into per-track flag arrays
    (args.cutList / args.ignoreList), then runs the segmentation and
    writes args.outBed plus the optional --stats file.

    Fixes: string comparison now uses != instead of the identity test
    'is not' (which only worked via CPython string interning), and the
    removed numpy alias np.int is replaced by the builtin int dtype.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Produce a bed file of genome segments which are atomic"
        " elements with resepect to the hmm. ie each segment emits a single"
        " state. Mask tracks always cut.  "
        "Output intervals are assigned name 0 1 0 1 etc.")

    parser.add_argument("tracksInfo",
                        help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("allBed", help="Bed file spanning entire genome")
    parser.add_argument("outBed", help="Output segments")
    parser.add_argument("--thresh",
                        help="Number of tracks that can change "
                        "before a new segment formed.  Increasing this value"
                        " increases the expected lengths of output segments",
                        type=int,
                        default=1)
    parser.add_argument("--cutTracks",
                        help="Create a new segment if something"
                        " changes in one of these tracks (as specified by "
                        "comman-separated list), overriding --thresh options"
                        " if necessary.  For example, --cutTracks tsd,chaux"
                        " would invoke a new segment everytime the value at"
                        "either of these tracks changed",
                        default=None)
    parser.add_argument("--cutUnscaled",
                        help="Cut on all unscaled (used as "
                        "a proxy for non-numeric) tracks",
                        default=False,
                        action="store_true")
    parser.add_argument("--cutMultinomial",
                        help="Cut non-gaussian, non-binary"
                        " tracks everytime",
                        default=False,
                        action="store_true")
    parser.add_argument("--cutNonGaussian",
                        help="Cut all but guassian tracks",
                        default=False,
                        action="store_true")
    parser.add_argument("--comp",
                        help="Strategy for comparing columns for the "
                        "threshold cutoff.  Options are [first, prev], where"
                        " first compares with first column of segment and "
                        "prev compares with column immediately left",
                        default="first")
    parser.add_argument("--ignore",
                        help="Comma-separated list of tracks to "
                        "ignore (the FASTA DNA sequence would be a good "
                        "candidate",
                        default="sequence")
    parser.add_argument("--maxLen",
                        help="Maximum length of a segment (<= 0 means"
                        " no max length applied",
                        type=int,
                        default=0)
    parser.add_argument(
        "--fixLen",
        help="Just make segments of specifed fixed "
        "length ignoring other parameters and logic (<= 0 means"
        " no fixed length applied",
        type=int,
        default=0)
    parser.add_argument("--stats",
                        help="Write some statistics to specified "
                        "file. Of the form <trackName> <Diff> <DiffPct> "
                        " where <Diff> is the number of times a track differs"
                        " between two consecutive segments, and <DiffPct> "
                        " is the average perecentage of all such differences "
                        "accounted for by the track",
                        default=None)
    parser.add_argument(
        "--delMask",
        help="Entirely remove intervals from "
        "mask tracks that are > given length (otherwise "
        "they would just be ignored by HMM tools). The difference"
        " here is that removed intervals will break contiguity.",
        type=int,
        default=None)
    parser.add_argument(
        "--chroms",
        help="list of chromosomes, or regions, to run in parallel"
        " (in BED format).  input regions will be intersected with each line"
        " in this file, and the result will correspsond to an individual job",
        default=None)
    parser.add_argument(
        "--proc",
        help="number of processes (use in conjunction with --chroms)",
        type=int,
        default=1)
    parser.add_argument(
        "--co",
        help="count offset for segment labels.  only used internally",
        type=int,
        default=0)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    if args.comp != "first" and args.comp != "prev":
        raise RuntimeError("--comp must be either first or prev")

    if args.chroms is not None:
        # hack to allow chroms argument to chunk and rerun
        parallelDispatch(argv, args)
        cleanBedTool(tempBedToolPath)
        return 0

    # read query intervals from the bed file
    tempFiles = []
    if args.delMask is not None:
        # cut long masked intervals out entirely so they break contiguity
        cutBed = cutOutMaskIntervals(args.allBed, args.delMask, sys.maxint,
                                     args.tracksInfo)
        if cutBed is not None:
            tempFiles.append(cutBed)
            args.allBed = cutBed
    logger.info("loading segment region intervals from %s" % args.allBed)
    mergedIntervals = getMergedBedIntervals(args.allBed, ncol=4)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.allBed)

    # read the tracks, while intersecting them with the query intervals
    logger.info("loading tracks %s" % args.tracksInfo)
    trackData = TrackData()
    trackData.loadTrackData(args.tracksInfo,
                            mergedIntervals,
                            treatMaskAsBinary=True)

    # process the --cutTracks option
    trackList = trackData.getTrackList()
    # builtin int dtype (np.int was a deprecated alias for int and has been
    # removed from numpy >= 1.24; behavior is identical)
    cutList = np.zeros((len(trackList)), int)
    if args.cutTracks is not None:
        cutNames = args.cutTracks.split(",")
        for name in cutNames:
            track = trackList.getTrackByName(name)
            if track is None:
                raise RuntimeError("cutTrack %s not found" % name)
            trackNo = track.getNumber()
            assert trackNo < len(cutList)
            cutList[trackNo] = 1
    args.cutList = cutList

    # make sure mask tracks count as cut tracks
    for track in trackList:
        if track.getDist() == 'mask':
            args.cutList[track.getNumber()] = 1

    # process the --ignore option
    ignoreList = np.zeros((len(trackList)), int)
    if args.ignore is not None:
        ignoreNames = args.ignore.split(",")
        for name in ignoreNames:
            track = trackList.getTrackByName(name)
            if track is None:
                # bugfix: compare by value -- the previous 'is not' tested
                # object identity and only suppressed this warning for
                # "sequence" by accident of CPython string interning
                if name != "sequence":
                    logger.warning("ignore track %s not found" % name)
                continue
            trackNo = track.getNumber()
            assert trackNo < len(ignoreList)
            ignoreList[trackNo] = 1
            if args.cutList[trackNo] == 1:
                raise RuntimeError("Same track (%s) cant be cut and ignored" %
                                   name)
    args.ignoreList = ignoreList

    #process the --cutUnscaled option: no scale/shift/logScale attributes
    # is used as a proxy for "non-numeric"
    if args.cutUnscaled is True:
        for track in trackList:
            trackNo = track.getNumber()
            if track.scale is None and track.shift is None and\
              track.logScale is None and\
              args.ignoreList[trackNo] == 0:
                assert trackNo < len(cutList)
                cutList[trackNo] = 1

    #process the --cutMultinomial option
    if args.cutMultinomial is True:
        for track in trackList:
            trackNo = track.getNumber()
            if track.dist == "multinomial" and\
              args.ignoreList[trackNo] == 0:
                assert trackNo < len(cutList)
                cutList[trackNo] = 1

    #process the --cutNonGaussian option
    if args.cutNonGaussian is True:
        for track in trackList:
            trackNo = track.getNumber()
            if track.dist != "gaussian" and\
              args.ignoreList[trackNo] == 0:
                assert trackNo < len(cutList)
                cutList[trackNo] = 1

    # segment the tracks
    stats = dict()
    segmentTracks(trackData, args, stats)
    writeStats(trackData, args, stats)

    if len(tempFiles) > 0:
        runShellCommand("rm -f %s" % " ".join(tempFiles))
    cleanBedTool(tempBedToolPath)
예제 #42
0
# CSV file collecting the pairwise comparison results
outCsvPath = "cross.csv"

setLogLevel("INFO")
addLoggingFileHandler("log.txt", False)

# annotation BED files to cross-compare, keyed by a short track name
bedFiles = dict()
bedFiles["hollister"] = "alyrata_hollister_clean.bed"
bedFiles["modeler"] = "alyrata_repeatmodeler_clean.bed"
bedFiles["chaux"] = "alyrata_chaux_clean.bed"
bedFiles["hmm"] = "hmm_1_clean_2state.bed"
bedFiles["trf"] = "alyrata_trf_clean.bed"
bedFiles["fgenesh"] = "alyrata_fgenesh_clean.bed"

# region of interest that every annotation is clipped to
regionPath = "region1c4.bed"

# start from a fresh working directory (workPath is defined earlier in
# this script, outside this excerpt)
runShellCommand("rm -rf %s; mkdir %s" % (workPath, workPath))
def bedPath(name, s):
    """Return the working-file path <workPath>/<name>_<s>.bed for the
    track called *name* and stage suffix *s* (e.g. "temp", "out")."""
    leaf = "%s_%s.bed" % (name, s)
    return os.path.join(workPath, leaf)


# make working files
# _out : intersection
# _te : TE-state renamed TE and everything else removed
# _gap : _out with gaps added
# _gap_te : _te with gaps added
for name, path in bedFiles.items():
    tPath = bedPath(name, "temp")
    outPath = bedPath(name, "out")
    runShellCommand("intersectBed -a %s -b %s > %s" %
예제 #43
0
def parallelDispatch(argv, args):
    """ Chunk the input regions per chromosome (one line of args.chroms
    each), recursively launch one child job per chromosome with --chrom
    disabled, then merge each requested child output (--bed, --pd, --ed,
    --bic) back into the paths given on the original command line and
    remove all temporary files. """
    jobList = []
    chromIntervals = readBedIntervals(args.chroms, sort=True)
    chromFiles = []
    regionFiles = []
    bedFiles = []
    pdFiles = []
    bicFiles = []
    edFiles = []
    for chrom in chromIntervals:
        # clone the original command line and blank out the --chrom option
        # so the child job does not recurse again
        cmdToks = copy.deepcopy(argv)
        cmdToks[cmdToks.index("--chrom") + 1] = ""
        cmdToks[cmdToks.index("--chrom")] = ""

        # one-interval BED file describing this chromosome/region
        chromPath = getLocalTempPath("Temp", ".bed")
        cpFile = open(chromPath, "w")
        cpFile.write("%s\t%d\t%d\t0\t0\t.\n" % (chrom[0], chrom[1], chrom[2]))
        cpFile.close()

        # clip the full input regions down to this chromosome
        regionPath = getLocalTempPath("Temp", ".bed")
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (args.bedRegions,
                                                                     chromPath,
                                                                     regionPath))

        # skip chromosomes that have no overlap with the input (file is
        # effectively empty when under 2 bytes)
        if os.path.getsize(regionPath) < 2:
            continue
        
        regionFiles.append(regionPath)
        chromFiles.append(chromPath)

        # positional argv[3] is the regions bed consumed by the child
        cmdToks[3] = regionPath

        # redirect each requested output option to a per-chromosome temp file
        if args.bed is not None:
            bedPath =  getLocalTempPath("Temp", ".bed")
            cmdToks[cmdToks.index("--bed")+1] = bedPath
            bedFiles.append(bedPath)
        if args.pd is not None:
            pdPath = getLocalTempPath("Temp", ".bed")
            cmdToks[cmdToks.index("--pd")+1] = pdPath
            pdFiles.append(pdPath)
        if args.ed is not None:
            edPath = getLocalTempPath("Temp", ".bed")
            cmdToks[cmdToks.index("--ed")+1] = edPath
            edFiles.append(edPath)
        if args.bic is not None:
            bicPath = getLocalTempPath("Temp", ".bic")
            cmdToks[cmdToks.index("--bic")+1] = bicPath
            bicFiles.append(bicPath)
        cmd = " ".join(cmdToks)
        jobList.append(cmd)

    runParallelShellCommands(jobList, args.proc)

    # merge child outputs in chromosome order: ">" truncates on the first
    # job, ">>" appends for the rest
    for i in xrange(len(jobList)):
        if i == 0:
            ct = ">"
        else:
            ct = ">>"
        if len(bedFiles) > 0:
            runShellCommand("cat %s %s %s" % (bedFiles[i], ct, args.bed))
        if len(pdFiles) > 0:
            runShellCommand("cat %s %s %s" % (pdFiles[i], ct, args.pd))
        if len(edFiles) > 0:
            runShellCommand("cat %s %s %s" % (edFiles[i], ct, args.ed))
        if len(bicFiles) > 0:
            runShellCommand("cat %s %s %s" % (bicFiles[i], ct, args.bic))

    # clean up every temporary file created above
    for i in itertools.chain(chromFiles, regionFiles, bedFiles, pdFiles, edFiles,
                             bicFiles):
        runShellCommand("rm %s" % i)            
예제 #44
0
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Compare two bed files where Model states are represented"
        " in a column.  Used to determine sensitivity and specificity.  NOTE"
        " that both bed files must be sorted and cover the exact same regions"
        " of the same genome.")

    parser.add_argument("bed1", help="Bed file (TRUTH)")
    parser.add_argument("bed2", help="Bed file covering same regions in same"
                        " order as bed1")
    parser.add_argument("--col", help="Column of bed files to use for state"
                        " (currently only support 4(name) or 5(score))",
                        default = 4, type = int)
    parser.add_argument("--thresh", help="Threshold to consider interval from"
                        " bed1 covered by bed2.",
                        type=float, default=0.8)
    parser.add_argument("--plot", help="Path of file to write Precision/Recall"
                        " graphs to in PDF format", default=None)
    parser.add_argument("--ignore", help="Comma-separated list of stateNames to"
                        " ignore", default=None)
    parser.add_argument("--strictPrec", help="By default, precision is computed"
                        " in a manner strictly symmetric to recall.  So calling"
                        " compareBedStates.py A.bed B.bed would give the exact"
                        " same output as compareBedStates.py B.bed A.bed except"
                        " precision and recall values would be swapped.  With "
                        " this option, a predicted element only counts toward"
                        " precision if it overlaps with 80pct of the true"
                        " element, as opposed to only needing 80pct of itself"
                        " overlapping with the true element. ",
                        action="store_true", default = False)
    parser.add_argument("--noBase", help="Skip base-level stats (and only show"
                        " interval stats).  Runs faster", action="store_true",
                        default=False)
    parser.add_argument("--noFrag", help="Do not allow fragmented matches in"
                        "interval predictions.  ie if a single truth interval"
                        " is covered by a series of predicted intervals, only "
                        "the best match will be counted if this flag is used", 
                        action="store_true", default=False)
    parser.add_argument("--tl", help="Path to tracks XML file.  Used to cut "
                        "out mask tracks so they are removed from comparison."
                        " (convenience option to not have to manually run "
                        "subtractBed everytime...)", default=None)
    parser.add_argument("--delMask", help="Entirely remove intervals from "
                        "mask tracks that are > given length.  Probably "
                        "only want to set to non-zero value K if using"
                        " with a prediction that was processed with "
                        "interpolateMaskedRegions.py --max K",
                        type=int, default=0)
    parser.add_argument("--window", help="A comma-delimited 5-tuple of "
                        "windowSize,stateName,compType,score,outBed.  "
                        "Where windowSize  is the sliding window size "
                        "(overlap .5), stateName is target stateName,"
                        " compType is in {base,interval,weighted}, sore is"
                        " in {f1,precision,recall} and "
                        "outBed is the path of a bedFile to write positional"
                        " accuracy to.  For example, --window 1000000,TE,base,f1"
                        ",acc.bed will write base-level f1 for 1MB sliding windows"
                        " to acc.bed.  These can be viewed on the browser by first"
                        " converting to BigWig.", default=None)

    args = parser.parse_args()
    tempBedToolPath = initBedTool()

    if args.ignore is not None:
        args.ignore = set(args.ignore.split(","))
    else:
        args.ignore = set()

    assert args.col == 4 or args.col == 5
    print "Commandline %s" % " ".join(sys.argv)
    origArgs = copy.deepcopy(args)
    
    tempFiles = []
    if args.tl is not None:
        cutBed1 = cutOutMaskIntervals(args.bed1, args.delMask,
                                      sys.maxint, args.tl)
        cutBed2 = cutOutMaskIntervals(args.bed2, args.delMask,
                                      sys.maxint, args.tl)
        if cutBed1 is not None:
            assert cutBed2 is not None
            tempFiles += [cutBed1, cutBed2]
            args.bed1 = cutBed1
            args.bed2 = cutBed2

    checkExactOverlap(args.bed1, args.bed2)

    if args.window is not None:
        runPositionalComparison(argv, origArgs)

    intervals1 = readBedIntervals(args.bed1, ncol = args.col)
    intervals2 = readBedIntervals(args.bed2, ncol = args.col)

    if args.noBase is False:
        stats = compareBaseLevel(intervals1, intervals2, args.col - 1)[0]

        totalRight, totalWrong, accMap = summarizeBaseComparision(stats, args.ignore)
        print "Base counts [False Negatives, False Positives, True Positives]:"
        print stats
        totalBoth = totalRight + totalWrong
        accuracy = float(totalRight) / float(totalBoth)
        print "Accuaracy: %d / %d = %f" % (totalRight, totalBoth, accuracy)
        print "State-by-state (Precision, Recall):"
        print "Base-by-base Accuracy"    
        print accMap

    trueStats = compareIntervalsOneSided(intervals1, intervals2, args.col -1,
                                         args.thresh, False, not args.noFrag)[0]
    predStats = compareIntervalsOneSided(intervals2, intervals1, args.col -1,
                                         args.thresh, args.strictPrec,
                                         not args.noFrag)[0]
    intAccMap = summarizeIntervalComparison(trueStats, predStats, False,
                                            args.ignore)
    intAccMapWeighted = summarizeIntervalComparison(trueStats, predStats, True,
                                                     args.ignore)
    print "\nInterval Accuracy"
    print intAccMap
    print ""

    print "\nWeighted Interval Accuracy"
    print intAccMapWeighted
    print ""


    # print some row data to be picked up by scrapeBenchmarkRow.py
    if args.noBase is False:
        header, row = summaryRow(accuracy, stats, accMap)
        print " ".join(header)
        print " ".join(row)

    # make graph
    if args.plot is not None:
        if canPlot is False:
            raise RuntimeError("Unable to write plots.  Maybe matplotlib is "
                               "not installed?")
        writeAccPlots(accuracy, accMap, intAccMap, intAccMapWeighted,
                      args.thresh, args.plot)

    if len(tempFiles) > 0:
        runShellCommand("rm -f %s" % " ".join(tempFiles))
    cleanBedTool(tempBedToolPath)
예제 #45
0
def greedyRank(args):
    """ Iteratively add best track to a (initially empty) tracklist according
    to some metric

    Greedy forward selection: each iteration runs one trial (runTrial) per
    remaining candidate track and permanently adds the winner.  Progress is
    written to <outDir>/ranking.txt, a per-track log file, and a cumulative
    tracks.xml snapshot under <outDir>/iter<N>/.
    """
    inputTrackList = TrackList(args.tracks)
    rankedTrackList = TrackList()
    # optionally seed the ranking with user-supplied start tracks
    # (deep-copied so the input list is left untouched)
    if args.startTracks is not None:
        for startTrack in args.startTracks.split(","):
            track = inputTrackList.getTrackByName(startTrack)
            if track is None:
                logger.warning("Start track %s not found in tracks XML" %
                               startTrack)
            else:
                rankedTrackList.addTrack(copy.deepcopy(track))
            
    # number of greedy iterations needed to rank the remaining tracks
    numTracks = len(inputTrackList) - len(rankedTrackList)
    # NOTE(review): currentScore/currentBIC appear unused below -- candidates
    # are compared against the per-iteration bestIt* values instead
    currentScore, currentBIC = 0.0, sys.maxint

    # compute full segmentation if --fullSegment is True
    if args.fullSegment is True:
        args.fullSegTrainPath = os.path.abspath(os.path.join(args.outDir,
                                                             "fullSegTrain.bed"))
        segmentCmd = "segmentTracks.py %s %s %s %s" % (args.segTracks,
                                                       args.training,
                                                       args.fullSegTrainPath,
                                                       args.segOpts)
        runShellCommand(segmentCmd)
        args.fullSegEvalPath = os.path.abspath(os.path.join(args.outDir,
                                                            "fullSegEval.bed"))
        segmentCmd = "segmentTracks.py %s %s %s %s" % (args.segTracks,
                                                       args.truth,
                                                       args.fullSegEvalPath,
                                                       args.segOpts)
        runShellCommand(segmentCmd)

    # header (opened with "w" to truncate any previous ranking file)
    rankFile = open(os.path.join(args.outDir, "ranking.txt"), "w")
    rankFile.write("It.\tTrack\tF1\tBIC\tNaiveF1\tAccProbSlop\tAccProbR2\n")
    rankFile.close()
    
    # baseline score if we not starting from scratch
    baseIt = 0
    if args.startTracks is not None:
        curTrackList = copy.deepcopy(rankedTrackList)
        score,bic,naive,slope,rsq = runTrial(curTrackList, baseIt, "baseline_test", args)
        rankFile = open(os.path.join(args.outDir, "ranking.txt"), "a")
        rankFile.write("%d\t%s\t%s\t%s\t%s\t%s\t%s\n" % (baseIt, args.startTracks,
                                        score, bic, naive,slope,rsq))
        rankFile.close()
        baseIt += 1
        
    for iteration in xrange(baseIt, baseIt + numTracks):
        # per-iteration bests: higher score / lower BIC / higher naive win
        bestItScore = -sys.maxint
        bestItBic = sys.maxint
        bestItNaive = -sys.maxint
        bestNextTrack = None
        bestSlope = None
        bestR = None
        for nextTrack in inputTrackList:
            # skip candidates already committed in earlier iterations
            if rankedTrackList.getTrackByName(nextTrack.getName()) is not None:
                continue
            curTrackList = copy.deepcopy(rankedTrackList)
            curTrackList.addTrack(nextTrack)
            score,bic,naive,slope,rsq = runTrial(curTrackList, iteration, nextTrack.getName(),
                                args)
            # selection metric: BIC if --bic, naive F1 if --naive, otherwise
            # F1 score; ties broken by the secondary metric
            best = False
            if args.bic is True:
                if bic < bestItBic or (bic == bestItBic and score > bestItScore):
                    best = True
            elif args.naive is True:
                if naive > bestItNaive or (naive == bestItNaive and score > bestItScore):
                    best = True
            elif score > bestItScore or (score == bestItScore and bic < bestItBic):
                    best = True
            if best is True:
                bestItScore, bestItBic, bestItNaive, bestSlope, bestR, bestNextTrack =\
                       score, bic, naive, slope, rsq, nextTrack
            # per-track log: overwrite on first iteration, append afterwards
            flags = "a"
            if iteration == baseIt:
                flags = "w"      
            trackLogFile = open(os.path.join(args.outDir, nextTrack.getName() +
                                             ".txt"), flags)
            trackLogFile.write("%d\t%f\t%f\t%f\t%f\t%f\n" % (iteration, score, bic, naive,
                                                             slope, rsq))
            trackLogFile.close()
        # commit this iteration's winner and snapshot the cumulative list
        rankedTrackList.addTrack(copy.deepcopy(bestNextTrack))
        rankedTrackList.saveXML(os.path.join(args.outDir, "iter%d" % iteration,
                                "tracks.xml"))
        
        rankFile = open(os.path.join(args.outDir, "ranking.txt"), "a")
        rankFile.write("%d\t%s\t%s\t%s\t%s\t%s\t%s\n" % (iteration, bestNextTrack.getName(),
                                            bestItScore, bestItBic, bestItNaive,
                                            bestSlope, bestR))
        rankFile.close()
예제 #46
0
def main(argv=None):
    """ Command-line entry point: turn a raw tracks XML into an HMM-usable
    one by running the cleaning scripts, optionally generating a TSD track,
    and scaling (see the argparse description below for the exact scripts
    invoked).

    argv -- argument vector; defaults to sys.argv when None
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Generate HMM-usable tracklist from raw tracklist. EX "
        "used to transform mustang_alyrata_tracks.xml -> "
        "mustang_alyrata_clean.xml.  Runs cleanRM.py cleanLtrFinder.py and "
        " cleanTermini.py and addTsdTrack.py and setTrackScaling.py (also runs "
        " removeBedOverlaps.py before each of the clean scripts)")
    
    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("allBed", help="Bed file spanning entire genome")
    parser.add_argument("cleanTrackPath", help="Directory to write cleaned BED"
                        " tracks to")
    parser.add_argument("outTracksInfo", help="Path to write modified tracks XML"
                        " to.")
    parser.add_argument("--numBins", help="Maximum number of bins after scaling",
                        default=10, type=int)
    parser.add_argument("--scaleTracks", help="Comma-separated list of tracks "
                        "to process for scaling. If not set, all"
                        " tracks listed as having a multinomial distribution"
                        " (since this is the default value, this includes "
                        "tracks with no distribution attribute) or gaussian "
                        "distribution will be processed.", default=None)
    parser.add_argument("--skipScale", help="Comma-separated list of tracks to "
                        "skip for scaling.", default=None)
    parser.add_argument("--ltr_termini", help="Name of termini track (appy tsd)",
                        default="ltr_termini")
    parser.add_argument("--repeat_modeler", help="Name of repeat_modeler track (appy tsd)",
                        default="repeat_modeler")
    parser.add_argument("--sequence", help="Name of fasta sequence track",
                        default="sequence")
    parser.add_argument("--tsd", help="Name of tsd track to generate (appy cleanTermini.py)",
                        default="tsd")
    parser.add_argument("--tir", help="Name of tir_termini track (appy cleanTermini.py)",
                        default="tir_termini")
    parser.add_argument("--noScale", help="Dont do any scaling", default=False,
                        action="store_true")
    parser.add_argument("--noTsd", help="Dont generate TSD track.  NOTE:"
                        " TSD track is hardcoded to be generated from "
                        "termini and (non-LTR elements of ) chaux",
                        default=False, action="store_true")
    parser.add_argument("--numProc", help="Number of processes to use for tsdFinder.py",
                        default=1, type=int)
    
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()
    # logging options forwarded verbatim to every child script invocation
    args.logOpString = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        args.logOpString += " --logFile %s" % args.logFile

    # create the cleaned-track output directory if it doesn't already exist
    # (makedirs raises when the directory exists; existence verified below)
    try:
        os.makedirs(args.cleanTrackPath)
    except:
        pass
    if not os.path.isdir(args.cleanTrackPath):
        raise RuntimeError("Unable to find or create cleanTrack dir %s" %
                           args.cleanTrackPath)

    # pipeline: clean -> tsd -> scale, each stage updating a temporary
    # tracks XML in place
    tempTracksInfo = getLocalTempPath("Temp_mustang_alyrata_clean", "xml")
    runCleaning(args, tempTracksInfo)
    assert os.path.isfile(tempTracksInfo)

    runTsd(args, tempTracksInfo)
    
    runScaling(args, tempTracksInfo)

    runShellCommand("rm -f %s" % tempTracksInfo)

    cleanBedTool(tempBedToolPath)
예제 #47
0
def cutOutMaskIntervals(inBed, minLength, maxLength, tracksInfoPath):
    """ Remove mask-track intervals with lengths in the given range from
    inBed, returning the path of a new temporary bed file (or None when the
    track list defines no mask tracks).  Rather than merely ignoring huge
    masked stretches (e.g. centromeric runs of N's) as normal masking
    would, this cuts them out entirely, splitting the genome into separate
    chunks; it is also handy during comparison for dropping all masked
    intervals.  Caller is responsible for deleting the returned file. """
    outPath = getLocalTempPath("Tempcut", ".bed")
    maskBeds = [track.getPath()
                for track in TrackList(tracksInfoPath).getMaskTracks()]
    if not maskBeds:
        return None
    scratchBedA = getLocalTempPath("Tempcut1", ".bed")
    scratchBedB = getLocalTempPath("Tempcut2", ".bed")
    # start from a copy of the input; mask intervals get subtracted from it
    runShellCommand("cp %s %s" % (inBed, outPath))
    # pool the first three columns of every mask track into one scratch bed
    for maskBed in maskBeds:
        runShellCommand("cat %s | awk \'{print $1\"\t\"$2\"\t\"$3}\' >> %s" % (
            maskBed, scratchBedA))
    if os.path.getsize(scratchBedA) > 0:
        # sort and merge the pooled mask intervals (result back in A)
        runShellCommand("sortBed -i %s > %s ; mergeBed -i %s > %s" % (
            scratchBedA, scratchBedB, scratchBedB, scratchBedA))
        # keep only the mask intervals whose lengths fall in range
        runShellCommand("filterBedLengths.py %s %d %d > %s" % (
            scratchBedA, minLength+1, maxLength-1, scratchBedB))
        # cut those intervals out of the output bed
        runShellCommand("subtractBed -a %s -b %s | sortBed > %s" % (
            outPath, scratchBedB, scratchBedA))
        runShellCommand("mv %s %s" % (scratchBedA, outPath))
    runShellCommand("rm -f %s %s" % (scratchBedA, scratchBedB))
    if os.path.getsize(outPath) == 0:
        raise RuntimeError("cutOutMaskIntervals removed everything.  Can't continue."
                           " probably best to rerun calling script on bigger region?")
    return outPath
예제 #48
0
#!/usr/bin/env python
"""

1) flatten LTR elements from 5 states to 1 (inside)
2) flatten non-LTR elements from 3 states to 1 (non-LTR)

Usage: <script> <inBed> <outBed>
"""

import sys
import os
from teHmm.common import runShellCommand, getLogLevelString, getLocalTempPath

# explicit argument validation instead of an assert, which would be
# silently stripped when running under "python -O"
if len(sys.argv) != 3:
    sys.stderr.write("Usage: %s <inBed> <outBed>\n" % sys.argv[0])
    sys.exit(1)

infile = sys.argv[1]
outfile = sys.argv[2]
# intermediate file holding the result of the first merge pass
tempfile = outfile + "_temp"

try:
    # merge up TSD elements
    runShellCommand(
        "filterPredictions.py %s --mergeBefore \"TSD|left,LTR|left,non-LTR\" --mergeAfter \"TSD|right,LTR|right,non-LTR\" > %s"
        % (infile, tempfile))

    # merge up LTR elements
    runShellCommand(
        "filterPredictions.py %s --mergeBefore \"LTR|left,inside\" --mergeAfter \"LTR|right,inside\" > %s"
        % (tempfile, outfile))
finally:
    # always remove the intermediate file, even if a command above fails
    runShellCommand("rm -f %s" % tempfile)
예제 #49
0
def runTsdFinder(faPath, inBedPath, outBedPath, args):
    """ call tsdFinder and either overwrite or append output.  also call
    removeBedOverlaps on final output to make sure it is clean

    faPath -- fasta sequence file passed to tsdFinder.py
    inBedPath -- candidate intervals (.bed, or .bb which is converted first)
    outBedPath -- final output bed (overwritten; merged with existing
                  contents when args.append is True)
    args -- must provide tsdFinderOptions (string) and append (bool)
    """

    # convert input to bed if necessary
    tempBed = None
    if os.path.splitext(inBedPath)[1].lower() == ".bb":
        tempBed = getLocalTempPath("Temp_addTsdTrack", ".bed")
        # BUG FIX: previously referenced undefined name "inFile", raising a
        # NameError whenever a bigbed (.bb) input was given
        runShellCommand("bigBedToBed %s %s" % (inBedPath, tempBed))
        inBedPath = tempBed

    # run tsdfinder on input
    tempOut = getLocalTempPath("Temp_addTsdTrack", ".bed")
    runShellCommand("tsdFinder.py %s %s %s %s" %
                    (faPath, inBedPath, tempOut, args.tsdFinderOptions))
    if tempBed is not None:
        runShellCommand("rm %s" % tempBed)

    # merge with existing track
    if os.path.isfile(outBedPath) and args.append is True:
        runShellCommand("cat %s >> %s" % (outBedPath, tempOut))

    # remove overlaps into final output
    runShellCommand("removeBedOverlaps.py %s > %s" % (tempOut, outBedPath))

    runShellCommand("rm %s" % tempOut)
예제 #50
0
def main(argv=None):
    """ Command-line entry point: relabel a predicted bed's state names so
    they best match a target annotation, using compareBedStates.py-style
    accuracy logic (see argparse description below).

    argv -- argument vector; defaults to sys.argv when None
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=" Given two bed files: a prediction and a true (or target)"
         " annotation, re-label the prediction's state names so that they "
         " best match the true annotation.  Usees same logic as "
         " compareBedStates.py for determining accuracy")

    parser.add_argument("tgtBed", help="Target bed file")
    parser.add_argument("predBed", help="Predicted bed file to re-label. ")
    parser.add_argument("outBed", help="Output bed (relabeling of predBed)")
    parser.add_argument("--col", help="Column of bed files to use for state"
                        " (currently only support 4(name) or 5(score))",
                        default = 4, type = int)
    parser.add_argument("--intThresh", help="Threshold to consider interval from"
                        " tgtBed covered by predBed.  If not specified, then base"
                        " level statistics will be used. Value in range (0,1]",
                        type=float, default=None)
    parser.add_argument("--noFrag", help="Dont allow fragmented interval matches ("
                        "see help for --frag in compareBedStates.py).  Only"
                        " relevant with --intThresh", action="store_true",
                        default=False)
    parser.add_argument("--qualThresh", help="Minimum match ratio between truth"
                        " and prediction to relabel prediction.  Example, if"
                        " predicted state X overlaps target state LTR 25 pct of "
                        "the time, then qualThresh must be at least 0.25 to "
                        "label X as LTR in the output.  Value in range (0, 1]",
                        type=float, default=0.1)
    parser.add_argument("--ignore", help="Comma-separated list of stateNames to"
                        " ignore (in prediction)", default=None)
    parser.add_argument("--ignoreTgt", help="Comma-separated list of stateNames to"
                        " ignore (int target)", default=None)
    parser.add_argument("--tgt", help="Comma-separated list of stateNames to "
                        " consider (in target).  All others will be ignored",
                        default=None)
    parser.add_argument("--unique", help="If more than one predicted state maps"
                        " to the same target state, add a unique id (numeric "
                        "suffix) to the output so that they can be distinguished",
                        action="store_true", default=False)
    parser.add_argument("--model", help="Apply state name mapping to the model"
                        " in the specified path (it is strongly advised to"
                        " make a backup of the model first)", default=None)
    parser.add_argument("--noMerge", help="By default, adjacent intervals"
                        " with the same state name in the output are "
                        "automatically merged into a single interval.  This"
                        " flag disables this.", action="store_true",
                        default=False)
    parser.add_argument("--hm", help="Write confusion matrix as heatmap in PDF"
                        " format to specified file", default = None)
    parser.add_argument("--old", help="Use old name mapping logic which just "
                        "takes biggest overlap in forward confusion matrix.  "
                        "faster than new default logic which does the greedy"
                        " f1 optimization", action="store_true", default=False)
    parser.add_argument("--fdr", help="Use FDR cutoff instead of (default)"
                        " greedy F1 optimization for state labeling",
                        type=float, default=None)
    parser.add_argument("--tl", help="Path to tracks XML file.  Used to cut "
                        "out mask tracks so they are removed from comparison."
                        " (convenience option to not have to manually run "
                        "subtractBed everytime...)", default=None)
    parser.add_argument("--colOrder", help="List of states used to force"
                        " ordering in heatmap (otherwise alphabetical) columns. These"
                        " states will correspond to the tgtBed when --old used and"
                        " --predBed otherwise.", default=None)
    parser.add_argument("--hmCovRow", help="Path to write 1-row heatmap of "
                        "state coverage (fraction of bases). only works with --hm",
                        default=None)

    
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # normalize the comma-separated name options into (possibly empty) sets
    if args.ignore is not None:
        args.ignore = set(args.ignore.split(","))
    else:
        args.ignore = set()
    if args.ignoreTgt is not None:
        args.ignoreTgt = set(args.ignoreTgt.split(","))
    else:
        args.ignoreTgt = set()
    if args.tgt is not None:
        args.tgt = set(args.tgt.split(","))
        if args.old is True:
            raise RuntimeError("--tgt option not implemented for --old")
    else:
        args.tgt = set()
    if args.old is True and args.fdr is not None:
        raise RuntimeError("--old and --fdr options are exclusive")

    assert args.col == 4 or args.col == 5

    # optionally cut mask-track intervals out of both inputs before
    # comparing (temp files cleaned up at the end)
    tempFiles = []
    if args.tl is not None:
        cutBedTgt = cutOutMaskIntervals(args.tgtBed, -1, sys.maxint, args.tl)                                
        cutBedPred = cutOutMaskIntervals(args.predBed, -1, sys.maxint, args.tl)
        
        if cutBedTgt is not None:
            assert cutBedPred is not None
            tempFiles += [cutBedTgt, cutBedPred]
            args.tgtBed = cutBedTgt
            args.predBed = cutBedPred

    checkExactOverlap(args.tgtBed, args.predBed)

    intervals1 = readBedIntervals(args.tgtBed, ncol = args.col)
    intervals2 = readBedIntervals(args.predBed, ncol = args.col)
    cfName = "reverse"

    # --old uses the forward confusion matrix, so swap the interval lists
    # (swapped back below before writing output)
    if args.old is True:
        intervals1, intervals2 = intervals2, intervals1
        cfName = "forward"

    # generate confusion matrix based on accuracy comparison using
    # base or interval stats as desired
    if args.intThresh is not None:
        logger.info("Computing interval %s confusion matrix" % cfName)
        confMat = compareIntervalsOneSided(intervals2, intervals1, args.col -1,
                                            args.intThresh, False,
                                           not args.noFrag)[1]
    else:
        logger.info("Computing base %s confusion matrix" % cfName)
        confMat = compareBaseLevel(intervals2, intervals1, args.col - 1)[1]

    logger.info("%s Confusion Matrix:\n%s" % (cfName, str(confMat)))

    # find the best "true" match for each predicted state    
    if args.old is True:
        intervals1, intervals2 = intervals2, intervals1
        stateMap = getStateMapFromConfMatrix_simple(confMat)
    else:
        stateMap = getStateMapFromConfMatrix(confMat, args.tgt, args.ignoreTgt,
                                             args.ignore, args.qualThresh,
                                             args.fdr)

    # filter the stateMap to take into account the command-line options
    # notably --ignore, --ignoreTgt, --qualThresh, and --unique
    filterStateMap(stateMap, args)

    logger.info("State Map:\n%s", str(stateMap))
        
    # write the model if spefied
    if args.model is not None:
        applyNamesToModel(stateMap, args.model)
    
    # generate the output bed using the statemap
    writeFittedBed(intervals2, stateMap, args.outBed, args.col-1, args.noMerge,
                   args.ignoreTgt)

    # write the confusion matrix as heatmap
    if args.hm is not None:
        if canPlot is False:
            raise RuntimeError("Unable to write heatmap.  Maybe matplotlib is "
                               "not installed?")
        writeHeatMap(confMat, args.hm, args.colOrder, args.hmCovRow)

    # remove any mask-cut temp files created above
    if len(tempFiles) > 0:
        runShellCommand("rm -f %s" % " ".join(tempFiles))
    cleanBedTool(tempBedToolPath)
예제 #51
0
def runTsd(args, tempTracksInfo):
    """ run addTsdTrack on termini and chaux to generate tsd track

    args -- parsed command-line options (noTsd, tracksInfo, ltr_termini,
            tir, repeat_modeler, sequence, tsd, cleanTrackPath,
            logOpString, numProc)
    tempTracksInfo -- path of the working tracks XML; updated in place
                      after each addTsdTrack invocation
    """
    if args.noTsd is True:
        return

    origTrackList = TrackList(args.tracksInfo)
    outTrackList = TrackList(tempTracksInfo)

    tempFiles = []
    tsdInputFiles = []
    tsdInputTracks = []
        
    # preprocess termini
    # each termini track is converted from bigbed if needed, then run
    # through fillTermini.py into a temp bed used as tsdFinder input
    lastzTracks = [origTrackList.getTrackByName(args.ltr_termini),
                  origTrackList.getTrackByName(args.tir)]
    for terminiTrack in lastzTracks:
        if terminiTrack is not None:
            inFile = terminiTrack.getPath()
            fillFile = getLocalTempPath("Temp_fill", ".bed")
            tempBed = None
            if inFile[-3:] == ".bb":
                tempBed = getLocalTempPath("Temp_termini", ".bed")
                runShellCommand("bigBedToBed %s %s" % (inFile, tempBed))
                inFile = tempBed
            runShellCommand("fillTermini.py %s %s" % (inFile, fillFile))
            tsdInputFiles.append(fillFile)
            tsdInputTracks.append(terminiTrack.getName())
            tempFiles.append(fillFile)
            if tempBed is not None:
                runShellCommand("rm -f %s" % tempBed)
        else:
            logger.warning("Could not find termini track")

    # add repeat_modeler
    # NOTE(review): repeat_modeler is looked up in outTrackList while the
    # termini tracks come from origTrackList -- confirm this asymmetry is
    # intended
    repeat_modelerTrack = outTrackList.getTrackByName(args.repeat_modeler)
    if repeat_modelerTrack is not None:
        tsdInputFiles.append(repeat_modelerTrack.getPath())
        tsdInputTracks.append(repeat_modelerTrack.getName())

    # run addTsdTrack (appending except first time)
    # note we override input track paths in each case
    assert len(tsdInputFiles) == len(tsdInputTracks)
    for i in xrange(len(tsdInputFiles)):
        optString = ""
        if i > 0:
            optString += " --append"
        # really rough hardcoded params based on
        # (A unified classification system for eukaryotic transposable elements
        # Wicker et. al 2007)
        if tsdInputTracks[i] == args.repeat_modeler:
            optString += " --names LINE,SINE,Unknown"
            optString += " --maxScore 20"
            optString += " --left 20"
            optString += " --right 20"
            optString += " --min 5"
            optString += " --max 20"
            optString += " --overlap 20"
        elif tsdInputTracks[i] == args.ltr_termini:
            optString += " --maxScore 3"
            optString += " --left 8"
            optString += " --right 8"
            optString += " --min 3"
            optString += " --max 6"
        elif tsdInputTracks[i] == args.tir:
            optString += " --maxScore 3"
            optString += " --left 15"
            optString += " --right 15"
            optString += " --min 3"
            optString += " --max 12"

        # addTsdTrack writes an updated tracks XML, which then replaces the
        # working copy so the next iteration sees the new tsd track
        tempXMLOut = getLocalTempPath("Temp_tsd_xml", ".xml")
        runShellCommand("addTsdTrack.py %s %s %s %s %s %s --inPath %s %s %s --numProc %d" % (
            tempTracksInfo,
            args.cleanTrackPath,
            tempXMLOut,
            tsdInputTracks[i],
            args.sequence,
            args.tsd,
            tsdInputFiles[i],
            optString,
            args.logOpString,
            args.numProc))
        
        runShellCommand("mv %s %s" % (tempXMLOut, tempTracksInfo))

    # clean up the filled termini temp beds
    for i in xrange(len(tempFiles)):
        runShellCommand("rm %s" % tempFiles[i])
예제 #52
0
def runPositionalComparison(argv, args):
    """ hack to recursively exectute compareBedStates.py on a sliding window of the two
    inputs and report accuracy in a BED file

    argv -- the full original argument vector (needed to rebuild the
            recursive compareBedStates.py command line minus --window)
    args -- parsed options; args.window must be
            "windowSize,stateName,compType,score,outBed" where compType is
            base/interval/weighted and score is f1/precision/recall
    """
    # parse and validate the 5-field --window option
    try:
        windowToks = args.window.split(",")
        assert len(windowToks) == 5
        windowSize = int(windowToks[0])
        stateName = windowToks[1]
        compType = windowToks[2]
        score = windowToks[3]
        outBed = windowToks[4]
    except:
        raise RuntimeError("value passed to --window is not in valid format")
    # compIdx selects which stats table extractCompStatsFromFile returns
    if compType == "base":
        compIdx = 0
    elif compType == "interval":
        compIdx = 1
    elif compType == "weighted":
        compIdx = 2
    else:
        raise RuntimeError("invalid compType, %s, passed to --window" % compType)
    if score != "f1" and score != "precision" and score != "recall":
        raise RuntimeError("invalid score, %s, passed to --window" % score)
    try:
        outFile = open(outBed, "w")
    except:
        raise RuntimeError("invalid outBed, %s, passed to --window" % outBed)

    # chunk the merged extent of bed1 into half-overlapping windows
    tempBed = getLocalTempPath("Temp_region", ".bed")
    runShellCommand("mergeBed -i %s > %s" % (args.bed1, tempBed))
    chunkBed = getLocalTempPath("Temp_chunkBed", ".bed")
    runShellCommand("chunkBedRegions.py %s %d --overlap .5 > %s" % (
        tempBed, windowSize, chunkBed))
    window = getLocalTempPath("Temp_window", ".bed")
    slice1 = getLocalTempPath("Temp_slice1", ".bed")
    slice2 = getLocalTempPath("Temp_slice2", ".bed")
    compFile = getLocalTempPath("Temp_compFile", ".bed")
    # rebuild the original command-line options, dropping --window and its
    # value so the recursive calls don't loop forever
    compOpts = ""
    winIdx = argv.index("--window")
    assert winIdx > 0 and winIdx < len(argv) -1 and argv[winIdx + 1] == args.window
    for i in xrange(3, len(argv)):
        if i != winIdx and i != winIdx + 1:
            compOpts += " " + argv[i]
    
    # for each window: slice both beds down to it, run compareBedStates.py,
    # and write the requested score for stateName as the BED score column
    for chunk in readBedIntervals(chunkBed):
        runShellCommand("echo \"%s\t%d\t%d\" > %s" % (chunk[0], chunk[1], chunk[2],
                                                   window))
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (
            args.bed1, window, slice1))
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (
            args.bed2, window, slice2))
        runShellCommand("compareBedStates.py %s %s %s > %s" % (
            slice1, slice2, compOpts, compFile))
        stats = extractCompStatsFromFile(compFile)[compIdx]
        # state absent from this window counts as zero precision/recall
        if stateName not in stats:
            stats[stateName] = (0,0)
        f1 = 0.
        prec, rec = stats[stateName]
        if prec + rec > 0:
            f1 = (2. * prec * rec) / (prec + rec)
        val = f1
        if score == "precision":
            val = prec
        elif score == "recall":
            val = rec
        outFile.write("%s\t%d\t%d\t%f\n" % (chunk[0], chunk[1], chunk[2], val))

    runShellCommand("rm -f %s %s %s %s %s %s" % (tempBed, chunkBed, window,
                                                 slice1, slice2, compFile))
    outFile.close()
예제 #53
0
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Fill in masked intervals of an hmm prediction "
        "(from teHmmEval.py) with state corresponding to surrounding"
        " intervals.")

    parser.add_argument("tracksXML", help="XML track list (used to id masking"
                        " tracks")
    parser.add_argument("allBed", help="Target scope.  Masked intervals outside"
                        " of these regions will not be included")
    parser.add_argument("inBed", help="TE prediction BED file.  State labels"
                        " should probably be mapped (ie with fitStateNames.py)")
    parser.add_argument("outBed", help="Output BED.  Will be equivalent to"
                        " the input bed except all gaps corresponding to "
                        "masked intervals will be filled")
    parser.add_argument("--maxLen", help="Maximum length of a masked interval"
                        " to fill (inclusive). Use --delMask option with same value"
                        "if running compareBedStates.py after.",
                        type=int, default=sys.maxint)
    parser.add_argument("--default", help="Default label to give to masked "
                        "region if no label can be determined", default="0")
    parser.add_argument("--tgts", help="Only relabel gaps that "
                        "are flanked on both sides by the same state, and this state"
                        " is in this comma- separated list. --default used for other"
                        " gaps.  If not targetst specified then all states checked.",
                        default=None)
    parser.add_argument("--oneSidedTgts", help="Only relabel gaps that "
                        "are flanked on at least one side by a state in this comma-"
                        "separated list --default used for other gaps",
                         default=None)
    parser.add_argument("--onlyDefault", help="Add the default state (--default) no"
                        " no all masked gaps no matter what. ie ignoring all other "
                        "logic", action="store_true", default=False)
    parser.add_argument("--cut", help="Cut out gaps for masked tracks from the input."
                        " By default, the input is expected to come from the HMM "
                        "with mask intervals already absent, and will crash on with"
                        " an assertion error if an overlap is detected.",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # make sets
    tgtSet = set()
    if args.tgts is not None:
        tgtSet = set(args.tgts.split(","))
    oneSidedTgtSet = set()
    if args.oneSidedTgts is not None:
        oneSidedTgtSet = set(args.oneSidedTgts.split(","))
    assert len(tgtSet.intersection(oneSidedTgtSet)) == 0

    # read the track list
    trackList = TrackList(args.tracksXML)
    maskTracks = trackList.getMaskTracks()

    # read the input bed
    inBed = args.inBed
    if args.cut is True:
        inBed = cutOutMaskIntervals(inBed, -1, args.maxLen + 1, args.tracksXML)
    inputIntervals = readBedIntervals(inBed, ncol = 4, sort = True)
    if args.cut is True:
        runShellCommand("rm -f %s" % inBed)
    if len(maskTracks) == 0 or len(inputIntervals) == 0:
        runShellCommand("cp %s %s" % (args.inBed, args.outBed))
        logger.warning("No mask tracks located in %s or"
                       " %s empty" % (args.tracksXML, args.inBed))
        return 0


    # make a temporary, combined, merged masking bed file
    tempMaskBed = getLocalTempPath("Temp_mb", ".bed")
    for maskTrack in maskTracks:
        assert os.path.isfile(maskTrack.getPath())
        runShellCommand("cat %s | setBedCol.py 3 mask | awk \'{print $1\"\t\"$2\"\t\"$3}\'>> %s" % (
            maskTrack.getPath(), tempMaskBed))
    maskedIntervals = getMergedBedIntervals(tempMaskBed, sort = True)
    resolvedMasks = 0

    if len(inputIntervals) == 0:
        logger.warning("No mask tracks located in %s" % args.tracksXML)
        return
    inputIdx = 0
    rightFlank = inputIntervals[inputIdx]

    tempOutMask = getLocalTempPath("Temp_om", ".bed")
    tempOutMaskFile = open(tempOutMask, "w")

    for maskIdx, maskInterval in enumerate(maskedIntervals):
        if maskInterval[2] - maskInterval[1] > args.maxLen:
            continue
        # find candidate right flank
        while rightFlank < maskInterval:
            if inputIdx == len(inputIntervals) - 1:
                rightFlank = None
                break
            else:
                inputIdx += 1
                rightFlank = inputIntervals[inputIdx]

        # candidate left flank
        leftFlank = None
        if inputIdx > 0:
            leftFlank = inputIntervals[inputIdx - 1]

        # identify flanking states if the intervals perfectly abut
        leftState = None
        if leftFlank is not None:
            if leftFlank[0] == maskInterval[0] and leftFlank[2] == maskInterval[1]:
                leftState = str(leftFlank[3])
            else:
                assert intersectSize(leftFlank, maskInterval) == 0
        rightState = None
        if rightFlank is not None:
            if rightFlank[0] == maskInterval[0] and rightFlank[1] == maskInterval[2]:
                rightState = str(rightFlank[3])
            else:
                assert intersectSize(rightFlank, maskInterval) == 0
            
        # choose a state for the mask interval
        maskState = str(args.default)
        if args.onlyDefault is True:
            pass
        elif leftState is not None and leftState == rightState:
            if len(tgtSet) == 0 or leftState in tgtSet:
                maskState = leftState
        elif leftState in oneSidedTgtSet:
            maskState = leftState
        elif rightState in oneSidedTgtSet:
            maskState = rightState
        
        # write our mask interval
        tempOutMaskFile.write("%s\t%d\t%d\t%s\n" % (maskInterval[0], maskInterval[1],
                                                    maskInterval[2], maskState))

    
    tempOutMaskFile.close()    
    tempMergePath1 = getLocalTempPath("Temp_mp", ".bed")
    tempMergePath2 = getLocalTempPath("Temp_mp", ".bed")
    runShellCommand("cp %s %s ; cat %s >> %s" % (args.inBed, tempMergePath1,
                                                 tempOutMask, tempMergePath1))
    runShellCommand("cat %s | sortBed > %s" % (tempMergePath1, tempMergePath2))
    tempScopePath = getLocalTempPath("temp_all", ".bed")
    runShellCommand("mergeBed -i %s |sortBed > %s" % (args.allBed, tempScopePath))
    runShellCommand("intersectBed -a %s -b %s > %s" % (tempMergePath2, tempScopePath,
                                                       args.outBed))

    runShellCommand("rm -f %s" % " ".join([tempMaskBed, tempOutMask, tempMergePath1,
                                      tempMergePath2, tempScopePath]))
    cleanBedTool(tempBedToolPath)
예제 #54
0
def main(argv=None):
    """Command-line entry point: segment genome tracks into atomic intervals.

    Parses options, loads the annotation tracks intersected with the query
    intervals, builds the cut/ignore track lists, then delegates the actual
    segmentation to segmentTracks() and writes optional statistics.

    argv -- command-line argument list (defaults to sys.argv)
    Returns 0 on the parallel-dispatch path; None otherwise.
    Raises RuntimeError on invalid options or unreadable input.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Produce a bed file of genome segments which are atomic"
        " elements with respect to the hmm. ie each segment emits a single"
        " state. Mask tracks always cut.  "
        "Output intervals are assigned name 0 1 0 1 etc.")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("allBed", help="Bed file spanning entire genome")
    parser.add_argument("outBed", help="Output segments")
    parser.add_argument("--thresh", help="Number of tracks that can change "
                        "before a new segment formed.  Increasing this value"
                        " increases the expected lengths of output segments",
                        type=int, default=1)
    parser.add_argument("--cutTracks", help="Create a new segment if something"
                        " changes in one of these tracks (as specified by "
                        "comma-separated list), overriding --thresh options"
                        " if necessary.  For example, --cutTracks tsd,chaux"
                        " would invoke a new segment every time the value at"
                        " either of these tracks changed", default=None)
    parser.add_argument("--cutUnscaled", help="Cut on all unscaled (used as "
                        "a proxy for non-numeric) tracks", default=False,
                        action="store_true")
    parser.add_argument("--cutMultinomial", help="Cut non-gaussian, non-binary"
                        " tracks every time", default=False,
                        action="store_true")
    parser.add_argument("--cutNonGaussian", help="Cut all but gaussian tracks",
                        default=False, action="store_true")
    parser.add_argument("--comp", help="Strategy for comparing columns for the "
                        "threshold cutoff.  Options are [first, prev], where"
                        " first compares with first column of segment and "
                        "prev compares with column immediately left",
                        default="first")
    parser.add_argument("--ignore", help="Comma-separated list of tracks to "
                        "ignore (the FASTA DNA sequence would be a good "
                        "candidate)", default="sequence")
    parser.add_argument("--maxLen", help="Maximum length of a segment (<= 0 "
                        "means no max length applied)",
                        type=int, default=0)
    parser.add_argument("--fixLen", help="Just make segments of specified fixed "
                        "length ignoring other parameters and logic (<= 0 means"
                        " no fixed length applied)",
                        type=int, default=0)
    parser.add_argument("--stats", help="Write some statistics to specified "
                        "file. Of the form <trackName> <Diff> <DiffPct> "
                        " where <Diff> is the number of times a track differs"
                        " between two consecutive segments, and <DiffPct> "
                        " is the average percentage of all such differences "
                        "accounted for by the track", default=None)
    parser.add_argument("--delMask", help="Entirely remove intervals from "
                        "mask tracks that are > given length (otherwise "
                        "they would just be ignored by HMM tools). The difference"
                        " here is that removed intervals will break contiguity.",
                        type=int, default=None)
    parser.add_argument("--chroms", help="list of chromosomes, or regions, to run in parallel"
                        " (in BED format).  input regions will be intersected with each line"
                        " in this file, and the result will correspond to an individual job",
                        default=None)
    parser.add_argument("--proc", help="number of processes (use in conjunction with --chroms)",
                        type=int, default=1)
    parser.add_argument("--co", help="count offset for segment labels.  only used internally",
                        type=int, default=0)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    if args.comp != "first" and args.comp != "prev":
        raise RuntimeError("--comp must be either first or prev")

    if args.chroms is not None:
        # hack to allow chroms argument to chunk and rerun
        parallelDispatch(argv, args)
        cleanBedTool(tempBedToolPath)
        return 0

    # read query intervals from the bed file, optionally deleting long
    # mask intervals first (which breaks contiguity on purpose)
    tempFiles = []
    if args.delMask is not None:
        cutBed = cutOutMaskIntervals(args.allBed, args.delMask, sys.maxint,
                                     args.tracksInfo)
        if cutBed is not None:
            tempFiles.append(cutBed)
            args.allBed = cutBed
    logger.info("loading segment region intervals from %s" % args.allBed)
    mergedIntervals = getMergedBedIntervals(args.allBed, ncol=4)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.allBed)

    # read the tracks, while intersecting them with the query intervals
    logger.info("loading tracks %s" % args.tracksInfo)
    trackData = TrackData()
    trackData.loadTrackData(args.tracksInfo, mergedIntervals,
                            treatMaskAsBinary=True)

    # process the --cutTracks option: a 0/1 flag per track number
    trackList = trackData.getTrackList()
    # np.int was a deprecated alias for the builtin int (removed in NumPy
    # 1.24); using int directly is behavior-identical
    cutList = np.zeros((len(trackList)), dtype=int)
    if args.cutTracks is not None:
        cutNames = args.cutTracks.split(",")
        for name in cutNames:
            track = trackList.getTrackByName(name)
            if track is None:
                raise RuntimeError("cutTrack %s not found" % name)
            trackNo = track.getNumber()
            assert trackNo < len(cutList)
            cutList[trackNo] = 1
    args.cutList = cutList

    # make sure mask tracks count as cut tracks
    for track in trackList:
        if track.getDist() == 'mask':
            args.cutList[track.getNumber()] = 1

    # process the --ignore option: a 0/1 flag per track number
    ignoreList = np.zeros((len(trackList)), dtype=int)
    if args.ignore is not None:
        ignoreNames = args.ignore.split(",")
        for name in ignoreNames:
            track = trackList.getTrackByName(name)
            if track is None:
                # BUGFIX: was `name is not "sequence"` -- identity comparison
                # on a string literal is implementation-dependent; use !=.
                # "sequence" is the default --ignore value, so its absence is
                # not worth a warning.
                if name != "sequence":
                    logger.warning("ignore track %s not found" % name)
                continue
            trackNo = track.getNumber()
            assert trackNo < len(ignoreList)
            ignoreList[trackNo] = 1
            if args.cutList[trackNo] == 1:
                raise RuntimeError("Same track (%s) cant be cut and ignored" %
                                  name)
    args.ignoreList = ignoreList

    # process the --cutUnscaled option (no scale attributes is used as a
    # proxy for a non-numeric track)
    if args.cutUnscaled is True:
        for track in trackList:
            trackNo = track.getNumber()
            if track.scale is None and track.shift is None and\
               track.logScale is None and\
               args.ignoreList[trackNo] == 0:
                assert trackNo < len(cutList)
                cutList[trackNo] = 1

    # process the --cutMultinomial option
    if args.cutMultinomial is True:
        for track in trackList:
            trackNo = track.getNumber()
            if track.dist == "multinomial" and\
               args.ignoreList[trackNo] == 0:
                assert trackNo < len(cutList)
                cutList[trackNo] = 1

    # process the --cutNonGaussian option
    if args.cutNonGaussian is True:
        for track in trackList:
            trackNo = track.getNumber()
            if track.dist != "gaussian" and\
               args.ignoreList[trackNo] == 0:
                assert trackNo < len(cutList)
                cutList[trackNo] = 1

    # segment the tracks and write optional statistics
    stats = dict()
    segmentTracks(trackData, args, stats)
    writeStats(trackData, args, stats)

    if len(tempFiles) > 0:
        runShellCommand("rm -f %s" % " ".join(tempFiles))
    cleanBedTool(tempBedToolPath)
예제 #55
0
def main(argv=None):
    """Command-line entry point: generate an HMM-usable tracklist from a raw
    tracklist by running the cleaning, TSD, and scaling pipelines in order.

    argv -- command-line argument list (defaults to sys.argv)
    Raises RuntimeError if the clean-track directory cannot be created.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Generate HMM-usable tracklist from raw tracklist. EX "
        "used to transform mustang_alyrata_tracks.xml -> "
        "mustang_alyrata_clean.xml.  Runs cleanRM.py cleanLtrFinder.py and "
        " cleanTermini.py and addTsdTrack.py and setTrackScaling.py (also runs "
        " removeBedOverlaps.py before each of the clean scripts)")

    parser.add_argument("tracksInfo",
                        help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("allBed", help="Bed file spanning entire genome")
    parser.add_argument("cleanTrackPath",
                        help="Directory to write cleaned BED"
                        " tracks to")
    parser.add_argument("outTracksInfo",
                        help="Path to write modified tracks XML"
                        " to.")
    parser.add_argument("--numBins",
                        help="Maximum number of bins after scaling",
                        default=10,
                        type=int)
    parser.add_argument("--scaleTracks",
                        help="Comma-separated list of tracks "
                        "to process for scaling. If not set, all"
                        " tracks listed as having a multinomial distribution"
                        " (since this is the default value, this includes "
                        "tracks with no distribution attribute) or gaussian "
                        "distribution will be processed.",
                        default=None)
    parser.add_argument("--skipScale",
                        help="Comma-separated list of tracks to "
                        "skip for scaling.",
                        default=None)
    parser.add_argument("--ltr_termini",
                        help="Name of termini track (apply tsd)",
                        default="ltr_termini")
    parser.add_argument("--repeat_modeler",
                        help="Name of repeat_modeler track (apply tsd)",
                        default="repeat_modeler")
    parser.add_argument("--sequence",
                        help="Name of fasta sequence track",
                        default="sequence")
    parser.add_argument(
        "--tsd",
        help="Name of tsd track to generate (apply cleanTermini.py)",
        default="tsd")
    parser.add_argument(
        "--tir",
        help="Name of tir_termini track (apply cleanTermini.py)",
        default="tir_termini")
    parser.add_argument("--noScale",
                        help="Dont do any scaling",
                        default=False,
                        action="store_true")
    parser.add_argument("--noTsd",
                        help="Dont generate TSD track.  NOTE:"
                        " TSD track is hardcoded to be generated from "
                        "termini and (non-LTR elements of ) chaux",
                        default=False,
                        action="store_true")
    parser.add_argument("--numProc",
                        help="Number of processes to use for tsdFinder.py",
                        default=1,
                        type=int)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()
    # pass our own logging configuration down to the child scripts
    args.logOpString = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        args.logOpString += " --logFile %s" % args.logFile

    try:
        os.makedirs(args.cleanTrackPath)
    except OSError:
        # BUGFIX: narrowed from a bare except (which also swallowed
        # KeyboardInterrupt/SystemExit).  The directory may already exist;
        # any real failure is caught by the isdir check below.
        pass
    if not os.path.isdir(args.cleanTrackPath):
        raise RuntimeError("Unable to find or create cleanTrack dir %s" %
                           args.cleanTrackPath)

    # NOTE(review): other getLocalTempPath call sites pass the extension with
    # a leading dot (".bed"); confirm whether "xml" should be ".xml" here.
    tempTracksInfo = getLocalTempPath("Temp_mustang_alyrata_clean", "xml")
    # run the pipeline stages in order, each reading/writing tempTracksInfo
    runCleaning(args, tempTracksInfo)
    assert os.path.isfile(tempTracksInfo)

    runTsd(args, tempTracksInfo)

    runScaling(args, tempTracksInfo)

    runShellCommand("rm -f %s" % tempTracksInfo)

    cleanBedTool(tempBedToolPath)
예제 #56
0
def runPositionalComparison(argv, args):
    """Hack to recursively execute compareBedStates.py on a sliding window of
    the two inputs and report accuracy in a BED file.

    argv -- the original command line (used to strip out the --window option
            before re-invoking compareBedStates.py on each window)
    args -- parsed arguments; args.window must be a comma-separated 5-tuple
            "windowSize,stateName,compType,score,outBed"
    Raises RuntimeError if any component of --window is invalid.
    """
    try:
        windowToks = args.window.split(",")
        assert len(windowToks) == 5
        windowSize = int(windowToks[0])
        stateName = windowToks[1]
        compType = windowToks[2]
        score = windowToks[3]
        outBed = windowToks[4]
    except (AssertionError, ValueError, IndexError):
        # BUGFIX: narrowed from a bare except, which also swallowed
        # KeyboardInterrupt/SystemExit
        raise RuntimeError("value passed to --window is not in valid format")
    # compIdx selects which stats table extractCompStatsFromFile returns
    if compType == "base":
        compIdx = 0
    elif compType == "interval":
        compIdx = 1
    elif compType == "weighted":
        compIdx = 2
    else:
        raise RuntimeError("invalid compType, %s, passed to --window" %
                           compType)
    if score != "f1" and score != "precision" and score != "recall":
        raise RuntimeError("invalid score, %s, passed to --window" % score)
    try:
        outFile = open(outBed, "w")
    except (IOError, OSError):
        # BUGFIX: narrowed from a bare except
        raise RuntimeError("invalid outBed, %s, passed to --window" % outBed)

    # carve the merged extent of bed1 into overlapping windows
    tempBed = getLocalTempPath("Temp_region", ".bed")
    runShellCommand("mergeBed -i %s > %s" % (args.bed1, tempBed))
    chunkBed = getLocalTempPath("Temp_chunkBed", ".bed")
    runShellCommand("chunkBedRegions.py %s %d --overlap .5 > %s" %
                    (tempBed, windowSize, chunkBed))
    window = getLocalTempPath("Temp_window", ".bed")
    slice1 = getLocalTempPath("Temp_slice1", ".bed")
    slice2 = getLocalTempPath("Temp_slice2", ".bed")
    compFile = getLocalTempPath("Temp_compFile", ".bed")
    # rebuild the compareBedStates.py option string minus --window itself
    compOpts = ""
    winIdx = argv.index("--window")
    assert winIdx > 0 and winIdx < len(argv) - 1 and argv[winIdx +
                                                          1] == args.window
    for i in xrange(3, len(argv)):
        if i != winIdx and i != winIdx + 1:
            compOpts += " " + argv[i]

    for chunk in readBedIntervals(chunkBed):
        # restrict both inputs to the current window and compare them
        runShellCommand("echo \"%s\t%d\t%d\" > %s" %
                        (chunk[0], chunk[1], chunk[2], window))
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" %
                        (args.bed1, window, slice1))
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" %
                        (args.bed2, window, slice2))
        runShellCommand("compareBedStates.py %s %s %s > %s" %
                        (slice1, slice2, compOpts, compFile))
        stats = extractCompStatsFromFile(compFile)[compIdx]
        if stateName not in stats:
            # state absent from this window: count as zero precision/recall
            stats[stateName] = (0, 0)
        f1 = 0.
        prec, rec = stats[stateName]
        if prec + rec > 0:
            f1 = (2. * prec * rec) / (prec + rec)
        val = f1
        if score == "precision":
            val = prec
        elif score == "recall":
            val = rec
        outFile.write("%s\t%d\t%d\t%f\n" % (chunk[0], chunk[1], chunk[2], val))

    runShellCommand("rm -f %s %s %s %s %s %s" %
                    (tempBed, chunkBed, window, slice1, slice2, compFile))
    outFile.close()