def runTsdFinder(faPath, inBedPath, outBedPath, args):
    """ call tsdFinder and either overwrite or append output.  also call
    removeBedOverlaps on final output to make sure it is clean

    faPath: passed through as the first argument of tsdFinder.py
    inBedPath: input intervals.  A path ending in ".bb" (any case) is first
               converted to a temporary BED file with bigBedToBed
    outBedPath: final output.  If it already exists and args.append is True
                its contents are merged with the new results
    args: options object; args.tsdFinderOptions and args.append are read
    """
    # convert input to bed if necessary
    tempBed = None
    if os.path.splitext(inBedPath)[1].lower() == ".bb":
        tempBed = getLocalTempPath("Temp_addTsdTrack", ".bed")
        # convert the path we were given (no other input name exists here)
        runShellCommand("bigBedToBed %s %s" % (inBedPath, tempBed))
        inBedPath = tempBed

    # run tsdfinder on input
    tempOut = getLocalTempPath("Temp_addTsdTrack", ".bed")
    runShellCommand("tsdFinder.py %s %s %s %s" % (
        faPath, inBedPath, tempOut, args.tsdFinderOptions))
    if tempBed is not None:
        runShellCommand("rm %s" % tempBed)

    # merge with existing track: the old output is appended to the new
    # results so that both go through the overlap removal below
    if os.path.isfile(outBedPath) and args.append is True:
        runShellCommand("cat %s >> %s" % (outBedPath, tempOut))

    # remove overlaps into final output
    runShellCommand("removeBedOverlaps.py %s > %s" % (tempOut, outBedPath))
    runShellCommand("rm %s" % tempOut)
def runCleaning(args, tempTracksInfo):
    """ run scripts for cleaning chaux, ltr_finder, and termini

    Each track of args.tracksInfo that declares a preprocess step is run
    through the matching script and its path is replaced by the cleaned
    output (as named by cleanPath).  Tracks without a preprocess step, or
    with one that is not handled below, keep their path.  The resulting
    track list is saved as XML to tempTracksInfo.
    """
    trackList = TrackList(args.tracksInfo)

    for track in trackList:
        preprocess = track.getPreprocess()
        if preprocess is None:
            continue

        # convert bigbed/wig
        inFile = track.getPath()
        tempBed1 = None
        if inFile[-3:] == ".bb" or inFile[-3:] == ".bw":
            tempBed1 = getLocalTempPath("Temp_%s" % track.getName(), ".bed")
            if inFile[-3:] == ".bb":
                runShellCommand("bigBedToBed %s %s" % (inFile, tempBed1))
            else:
                runShellCommand("bigWigToBedGraph %s %s" % (inFile, tempBed1))
            # the cleaning scripts below must read the converted text file,
            # so inFile is not reset to the binary track path afterwards
            inFile = tempBed1

        # run cleanRM.py on all tracks with rm or rmu preprocessor
        if preprocess == "rm" or preprocess == "rmu":
            flag = ""
            if preprocess == "rmu":
                # assignment (a comparison here would leave the flag empty)
                flag = "--keepUnderscore"
            outFile = cleanPath(args, track)
            tempBed = getLocalTempPath("Temp_%s" % track.getName(), ".bed")
            runShellCommand("cleanRM.py %s %s > %s" % (inFile, flag, tempBed))
            runShellCommand("removeBedOverlaps.py --rm %s > %s" % (
                tempBed, outFile))
            runShellCommand("rm -f %s" % tempBed)
            track.setPath(outFile)

        # run cleanTermini.py
        elif preprocess == "termini":
            outFile = cleanPath(args, track)
            runShellCommand("cleanTermini.py %s %s" % (inFile, outFile))
            track.setPath(outFile)

        # run removeBedOverlaps
        elif preprocess == "overlap":
            outFile = cleanPath(args, track)
            runShellCommand("removeBedOverlaps.py %s > %s" % (inFile, outFile))
            track.setPath(outFile)

        # run cleanLtrFinder.py
        elif preprocess == "ltr_finder":
            outFile = cleanPath(args, track)
            # note: overlaps now removed in cleanLtrFinderID script
            runShellCommand("cleanLtrFinderID.py %s %s" % (inFile, outFile))
            track.setPath(outFile)

        if tempBed1 is not None:
            runShellCommand("rm -f %s" % tempBed1)

    # save a temporary xml
    trackList.saveXML(tempTracksInfo)
def runParallel(args, bedIntervals):
    """ Quick hack to rerun parallel jobs on different interval subsets.

    The intervals selected by args.names (comma-separated names matched
    against the 4th column; all intervals when args.names is None) are split
    into at most args.numProc temporary BED files.  The current command line
    (sys.argv) is then re-issued once per chunk with --numProc, args.inBed
    and args.outBed substituted, and the per-chunk outputs are concatenated
    into args.outBed.
    """
    nameSet = None
    if args.names is not None:
        nameSet = set(args.names.split(","))

    # select once, so that a one-shot iterable is not consumed by counting
    selected = []
    for interval in bedIntervals:
        name = None
        if len(interval) > 3:
            name = interval[3]
        if nameSet is None or name in nameSet:
            selected.append(interval)
    numIntervals = len(selected)

    # floor division: the chunk size must stay an integer
    jobSize = 1 + (numIntervals // args.numProc)
    logger.info("Dviding %d intervals into %d processes (%d intervals per)" % (
        numIntervals, args.numProc, jobSize))

    # chunk up BED input
    tempBeds = []
    for start in range(0, numIntervals, jobSize):
        tempBed = getLocalTempPath("TempTsdFinderIn", ".bed")
        tempBeds.append(tempBed)
        with open(tempBed, "w") as chunkFile:
            for interval in selected[start:start + jobSize]:
                chunkFile.write("\t".join([str(s) for s in interval]))
                chunkFile.write("\n")

    # map jobs
    assert len(tempBeds) <= args.numProc
    tempOuts = []
    jobCmds = []
    baseCmd = " ".join(sys.argv)
    baseCmd = baseCmd.replace("--numProc %d" % args.numProc, "--numProc 1")
    for tempBed in tempBeds:
        cmdLine = baseCmd.replace(args.inBed, tempBed)
        tempOut = getLocalTempPath("TempTsdFinderOut", ".bed")
        cmdLine = cmdLine.replace(args.outBed, tempOut)
        tempOuts.append(tempOut)
        jobCmds.append(cmdLine)

    runParallelShellCommands(jobCmds, args.numProc)

    # reduce
    for i, tempOut in enumerate(tempOuts):
        if i == 0:
            runShellCommand("mv %s %s" % (tempOut, args.outBed))
        else:
            runShellCommand("cat %s >> %s" % (tempOut, args.outBed))
        runShellCommand("rm -f %s" % (tempOut))

    # the chunk inputs are no longer needed once every job has finished
    for tempBed in tempBeds:
        runShellCommand("rm -f %s" % tempBed)
def cutOutMaskIntervals(inBed, minLength, maxLength, tracksInfoPath):
    """ Filter out intervals of mask tracks from inBed with lengths outside
    given range.  Idea is that it makes less sense to simply ignore, say,
    giant stretches of N's (like centromeres), as we would by masking them
    normally, than it does to remove them entirely, splitting the genome
    into multiple chunks.  Can also be used during comparison to get rid of
    all masked intervals.

    Returns the path of a new temporary BED file, or None when the track
    list has no mask tracks.  Raises RuntimeError if the result is empty.
    """
    cutPath = getLocalTempPath("Tempcut", ".bed")
    maskPaths = [t.getPath()
                 for t in TrackList(tracksInfoPath).getMaskTracks()]
    if not maskPaths:
        return None

    scratchA = getLocalTempPath("Tempcut1", ".bed")
    scratchB = getLocalTempPath("Tempcut2", ".bed")
    runShellCommand("cp %s %s" % (inBed, cutPath))

    # pool the first three columns of every mask track into one file
    for maskPath in maskPaths:
        runShellCommand("cat %s | awk \'{print $1\"\t\"$2\"\t\"$3}\' >> %s" % (maskPath, scratchA))

    if os.path.getsize(scratchA) > 0:
        # sort + merge the pooled masks, length-filter them, then subtract
        # what remains from the working copy of the input
        runShellCommand("sortBed -i %s > %s ; mergeBed -i %s > %s" % (
            scratchA, scratchB, scratchB, scratchA))
        runShellCommand("filterBedLengths.py %s %d %d > %s" % (
            scratchA, minLength + 1, maxLength - 1, scratchB))
        runShellCommand("subtractBed -a %s -b %s | sortBed > %s" % (
            cutPath, scratchB, scratchA))
        runShellCommand("mv %s %s" % (scratchA, cutPath))
    runShellCommand("rm -f %s %s" % (scratchA, scratchB))

    if os.path.getsize(cutPath) == 0:
        raise RuntimeError(
            "cutOutMaskIntervals removed everything.  Can't continue."
            " probably best to rerun calling script on bigger region?")
    return cutPath
def filterCutTrack(genomePath, fragmentFilterLen, trackListPath, cutTrackName,
                   cutTrackLenFilter):
    """ return path of length filtered cut track

    The named track is length-filtered with cutTrackLenFilter.  Pieces of
    genomePath left between the filtered cut intervals are filtered with
    fragmentFilterLen, and the selected pieces are folded back into the cut
    track before it is sorted and merged.
    """
    # NOTE(review): outDir is not a parameter of this function; presumably a
    # module-level name defined elsewhere -- confirm.
    cutTrack = TrackList(trackListPath).getTrackByName(cutTrackName)
    assert cutTrack is not None
    originalPath = cutTrack.getPath()
    filteredPath = getOutPath(originalPath, outDir,
                              "filter%d" % cutTrackLenFilter)
    runShellCommand("filterBedLengths.py %s %s > %s" % (
        originalPath, cutTrackLenFilter, filteredPath))

    scratchA = getLocalTempPath("Temp", ".bed")
    runShellCommand("subtractBed -a %s -b %s | sortBed > %s" % (
        genomePath, filteredPath, scratchA))
    scratchB = getLocalTempPath("Temp", ".bed")

    # random 200 character marker used with --rename and then grep'd for
    alphabet = string.ascii_uppercase + string.digits
    marker = ''.join([random.choice(alphabet) for _ in range(200)])
    runShellCommand("filterBedLengths.py %s %d --rename %s |grep %s | sortBed> %s" % (
        scratchA, fragmentFilterLen, marker, marker, scratchB))

    # same column rewrite for the grep'd fragments and for the cut track;
    # the first overwrites scratchA, the second appends to it
    for srcPath, redirect in ((scratchB, ">"), (filteredPath, ">>")):
        runShellCommand("cat %s | setBedCol.py 3 N | setBedCol.py 4 0 | setBedCol.py 5 . " % srcPath
                        + redirect + " %s" % scratchA)

    runShellCommand("sortBed -i %s > %s" % (scratchA, scratchB))
    runShellCommand("mergeBed -i %s > %s" % (scratchB, filteredPath))
    runShellCommand("rm -f %s %s" % (scratchA, scratchB))
    return filteredPath
def cutOutMaskIntervals(inBed, minLength, maxLength, tracksInfoPath):
    """ Filter out intervals of mask tracks from inBed with lengths outside
    given range.  Idea is that it makes less sense to simply ignore, say,
    giant stretches of N's (like centromeres), as we would by masking them
    normally, than it does to remove them entirely, splitting the genome
    into multiple chunks.  Can also be used during comparison to get rid of
    all masked intervals.

    inBed: BED file to cut (it is copied, not modified)
    minLength, maxLength: handed to filterBedLengths.py as minLength + 1 and
                          maxLength - 1
    tracksInfoPath: track list whose mask tracks are cut out
    returns: path of the temporary result, or None if there are no mask
             tracks
    """
    outPath = getLocalTempPath("Tempcut", ".bed")
    trackList = TrackList(tracksInfoPath)
    maskPaths = []
    for maskTrack in trackList.getMaskTracks():
        maskPaths.append(maskTrack.getPath())
    if len(maskPaths) < 1:
        return None

    allMasks = getLocalTempPath("Tempcut1", ".bed")
    work = getLocalTempPath("Tempcut2", ".bed")
    runShellCommand("cp %s %s" % (inBed, outPath))
    for maskPath in maskPaths:
        appendCmd = "cat %s | awk \'{print $1\"\t\"$2\"\t\"$3}\' >> %s" % (
            maskPath, allMasks)
        runShellCommand(appendCmd)

    haveMasks = os.path.getsize(allMasks) > 0
    if haveMasks:
        mergeCmd = "sortBed -i %s > %s ; mergeBed -i %s > %s" % (
            allMasks, work, work, allMasks)
        runShellCommand(mergeCmd)
        filterCmd = "filterBedLengths.py %s %d %d > %s" % (
            allMasks, minLength + 1, maxLength - 1, work)
        runShellCommand(filterCmd)
        subtractCmd = "subtractBed -a %s -b %s | sortBed > %s" % (
            outPath, work, allMasks)
        runShellCommand(subtractCmd)
        # the subtraction result becomes the new output
        runShellCommand("mv %s %s" % (allMasks, outPath))
    runShellCommand("rm -f %s %s" % (allMasks, work))

    if os.path.getsize(outPath) != 0:
        return outPath
    raise RuntimeError("cutOutMaskIntervals removed everything.  Can't continue."
                       " probably best to rerun calling script on bigger region?")
def filterCutTrack(genomePath, fragmentFilterLen, trackListPath, cutTrackName,
                   cutTrackLenFilter):
    """ return path of length filtered cut track

    genomePath: BED the cut track is subtracted from to obtain fragments
    fragmentFilterLen: length argument of filterBedLengths.py for fragments
    trackListPath: track list containing the cut track
    cutTrackName: name of the cut track (must exist in the list)
    cutTrackLenFilter: length argument of filterBedLengths.py for the track
    """
    tracks = TrackList(trackListPath)
    track = tracks.getTrackByName(cutTrackName)
    assert track is not None
    sourcePath = track.getPath()
    # NOTE(review): outDir is read here but is not an argument; presumably a
    # global of the enclosing script -- confirm.
    suffix = "filter%d" % cutTrackLenFilter
    resultPath = getOutPath(sourcePath, outDir, suffix)
    runShellCommand("filterBedLengths.py %s %s > %s" % (
        sourcePath, cutTrackLenFilter, resultPath))

    fragmentsPath = getLocalTempPath("Temp", ".bed")
    runShellCommand("subtractBed -a %s -b %s | sortBed > %s" % (
        genomePath, resultPath, fragmentsPath))
    selectedPath = getLocalTempPath("Temp", ".bed")

    # tag selected fragments with a random 200 character name so that grep
    # can pick them out of the filter's output
    symbols = string.ascii_uppercase + string.digits
    tagChars = []
    for _ in range(200):
        tagChars.append(random.choice(symbols))
    tag = ''.join(tagChars)
    runShellCommand("filterBedLengths.py %s %d --rename %s |grep %s | sortBed> %s" % (
        fragmentsPath, fragmentFilterLen, tag, tag, selectedPath))

    # fragmentsPath is reused as scratch: first the selected fragments ...
    runShellCommand("cat %s | setBedCol.py 3 N | setBedCol.py 4 0 | setBedCol.py 5 . > %s" % (
        selectedPath, fragmentsPath))
    # ... then the filtered cut track is appended in the same column layout
    runShellCommand("cat %s | setBedCol.py 3 N | setBedCol.py 4 0 | setBedCol.py 5 . >> %s" % (
        resultPath, fragmentsPath))

    runShellCommand("sortBed -i %s > %s" % (fragmentsPath, selectedPath))
    runShellCommand("mergeBed -i %s > %s" % (selectedPath, resultPath))
    runShellCommand("rm -f %s %s" % (fragmentsPath, selectedPath))
    return resultPath
def runParallel(args, bedIntervals):
    """ Quick hack to rerun parallel jobs on different interval subsets.

    args: options object; names, numProc, inBed and outBed are read
    bedIntervals: sequence of BED intervals.  Only those whose 4th column is
                  listed in args.names are used (all when args.names is None)

    The selected intervals are written to at most args.numProc temporary
    BED files; sys.argv is re-run once per file with --numProc 1 and the
    input / output paths swapped for temporary ones; results are then
    gathered into args.outBed.
    """
    nameSet = None if args.names is None else set(args.names.split(","))

    def isSelected(interval):
        """ True if the interval passes the optional --names filter """
        name = interval[3] if len(interval) > 3 else None
        return nameSet is None or name in nameSet

    # materialize so counting cannot exhaust a one-shot iterable
    chosen = [interval for interval in bedIntervals if isSelected(interval)]
    numIntervals = len(chosen)
    # explicit floor division keeps jobSize an int under true division too
    jobSize = 1 + (numIntervals // args.numProc)
    logger.info("Dviding %d intervals into %d processes (%d intervals per)" % (
        numIntervals, args.numProc, jobSize))

    # chunk up BED input
    tempBeds = []
    curSize = jobSize  # "full", so the first interval opens the first file
    curFile = None
    try:
        for interval in chosen:
            if curSize >= jobSize:
                if curFile is not None:
                    curFile.close()
                tempBed = getLocalTempPath("TempTsdFinderIn", ".bed")
                tempBeds.append(tempBed)
                curFile = open(tempBed, "w")
                curSize = 0
            curFile.write("\t".join([str(s) for s in interval]))
            curFile.write("\n")
            curSize += 1
    finally:
        # never leave the last chunk open, even if a write failed
        if curFile is not None:
            curFile.close()

    # map jobs
    assert len(tempBeds) <= args.numProc
    tempOuts = []
    jobCmds = []
    for tempBed in tempBeds:
        cmdLine = " ".join(sys.argv)
        cmdLine = cmdLine.replace("--numProc %d" % args.numProc,
                                  "--numProc 1")
        cmdLine = cmdLine.replace(args.inBed, tempBed)
        tempOut = getLocalTempPath("TempTsdFinderOut", ".bed")
        cmdLine = cmdLine.replace(args.outBed, tempOut)
        tempOuts.append(tempOut)
        jobCmds.append(cmdLine)

    runParallelShellCommands(jobCmds, args.numProc)

    # reduce
    for i, tempOut in enumerate(tempOuts):
        if i == 0:
            runShellCommand("mv %s %s" % (tempOut, args.outBed))
        else:
            runShellCommand("cat %s >> %s" % (tempOut, args.outBed))
        runShellCommand("rm -f %s" % (tempOut))

    # chunk inputs would otherwise be left behind in the temp directory
    if len(tempBeds) > 0:
        runShellCommand("rm -f %s" % " ".join(tempBeds))
def cutBedRegion(bedInterval, cutTrackPath, inBed, outBed):
    """ intersect with a given interval

    inBed is clipped to the first three fields of bedInterval, the cut
    track is subtracted from the result, and the sorted output is written
    to outBed (which is removed first).
    """
    clippedPath = getLocalTempPath("Temp_cut", ".bed")
    regionPath = getLocalTempPath("Temp_cut", ".bed")
    runShellCommand("rm -f %s" % outBed)

    # one-line BED describing the region
    chrom = bedInterval[0]
    start = bedInterval[1]
    end = bedInterval[2]
    runShellCommand("echo \"%s\t%s\t%s\n\" > %s" % (chrom, start, end, regionPath))

    runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (
        inBed, regionPath, clippedPath))
    runShellCommand("subtractBed -a %s -b %s | sortBed > %s" % (
        clippedPath, cutTrackPath, outBed))
    runShellCommand("rm -f %s %s" % (clippedPath, regionPath))
def combineTrack(track, outPath, tempRegionPath, iter, args):
    """ merge track with outPath

    The track is rewritten as chrom / start / end / state, restricted to
    tempRegionPath, padded with the args.outside state, fitted to the names
    already present in outPath (skipped when iter is 0) and finally appended
    to outPath, which is then passed through removeBedOverlaps.py.
    """
    # make sure track is of form chrom start end state
    colPath = getLocalTempPath("Temp", "_col.bed")
    colFile = open(colPath, "w")
    valCol = track.getValCol() + 1
    isBinary = track.getDist() == "binary"
    if isBinary:
        assert track.getName() != args.outside
        valCol = 3
    # state name = track name for binary track
    lineEnd = "\t%s\n" % track.getName() if isBinary else "\n"
    for bedInterval in readBedIntervals(track.getPath(), valCol, sort = True):
        colFile.write("\t".join([str(x) for x in bedInterval]) + lineEnd)
    colFile.close()

    # intersect the target region
    intersectPath = getLocalTempPath("Temp", "_int.bed")
    runShellCommand("intersectBed -a %s -b %s > %s" % (
        colPath, tempRegionPath, intersectPath))

    # add the outside states
    gappedPath = getLocalTempPath("Temp", "_gap.bed")
    runShellCommand("addBedGaps.py --state %s %s %s %s" % (
        args.outside, tempRegionPath, intersectPath, gappedPath))

    # fit the names with previous iterations' result
    fitPath = getLocalTempPath("Temp", "_fit.bed")
    if iter != 0:
        runShellCommand("fitStateNames.py %s %s %s --qualThresh %f --ignoreTgt %s" % (
            outPath, gappedPath, fitPath, args.fitThresh, args.outside))
    else:
        runShellCommand("cp %s %s" % (gappedPath, fitPath))

    # now merge into outPath (colPath is reused as scratch for the result)
    runShellCommand("cat %s >> %s" % (fitPath, outPath))
    runShellCommand("removeBedOverlaps.py %s > %s" % (outPath, colPath))
    runShellCommand("mv %s %s" % (colPath, outPath))

    # clean up (note colPath should already be gone after the mv)
    for leftover in (colPath, intersectPath, gappedPath, fitPath):
        runShellCommand("rm -f %s" % leftover)
def cutBedRegion(bedInterval, cutTrackPath, inBed, outBed):
    """ intersect with a given interval

    bedInterval: indexable with chrom, start, end in positions 0..2
    cutTrackPath: BED subtracted from the clipped input
    inBed: BED to clip
    outBed: destination (removed before anything else is run)
    """
    tempPath = getLocalTempPath("Temp_cut", ".bed")
    tempPath2 = getLocalTempPath("Temp_cut", ".bed")

    # every command only depends on the paths above, so the whole pipeline
    # can be laid out first and then run in order
    pipeline = [
        "rm -f %s" % outBed,
        "echo \"%s\t%s\t%s\n\" > %s" % (bedInterval[0], bedInterval[1],
                                        bedInterval[2], tempPath2),
        "intersectBed -a %s -b %s | sortBed > %s" % (inBed, tempPath2,
                                                     tempPath),
        "subtractBed -a %s -b %s | sortBed > %s" % (tempPath, cutTrackPath,
                                                    outBed),
        "rm -f %s %s" % (tempPath, tempPath2),
    ]
    for command in pipeline:
        runShellCommand(command)
def checkExactOverlap(bed1, bed2):
    """ make sure two bed files cover same region exactly: a requirement for
    all code based on the comparisons in this module.

    Raises RuntimeError if either file is empty, if either file has
    consecutive intervals that overlap or are out of order, or if one file
    covers a region the other does not.
    """
    errorMessage = ("Bed files %s and %s cannot be compared. xxx. "
                    " Input files must be both sorted, cover the exact same region,"
                    " and contain no self-overlaps.") % (bed1, bed2)

    # empty file may break downstream comparisons
    size1 = os.path.getsize(bed1)
    size2 = os.path.getsize(bed2)
    if size1 == 0 or size2 == 0:
        raise RuntimeError(errorMessage.replace("xxx", "one or both inputs empty"))

    # test self-overlap and sorting of each input (bed2 must be read for
    # the second pass, otherwise it is never validated)
    _checkSortedNoSelfOverlap(readBedIntervals(bed1, sort=False),
                              "input1", errorMessage)
    _checkSortedNoSelfOverlap(readBedIntervals(bed2, sort=False),
                              "input2", errorMessage)

    # test intersection size
    tempFile = getLocalTempPath("Temp_test", ".bed")
    try:
        runShellCommand("subtractBed -a %s -b %s > %s" % (bed1, bed2, tempFile))
        if os.path.getsize(tempFile) != 0:
            raise RuntimeError(errorMessage.replace(
                "xxx", "Input1 covers regions outside input2"))
        runShellCommand("subtractBed -a %s -b %s > %s" % (bed2, bed1, tempFile))
        if os.path.getsize(tempFile) != 0:
            raise RuntimeError(errorMessage.replace(
                "xxx", "Input2 covers regions outside input1"))
    finally:
        # single cleanup point for success and for every failure above
        runShellCommand("rm -f %s" % tempFile)


def _checkSortedNoSelfOverlap(intervals, label, errorMessage):
    """ raise RuntimeError (built from errorMessage, naming label) if two
    consecutive intervals overlap or are out of order """
    for prev, cur in zip(intervals, intervals[1:]):
        if intersectSize(prev, cur) != 0:
            raise RuntimeError(errorMessage.replace(
                "xxx", "Overlapping intervals %s and %s found in %s" % (
                    prev, cur, label)))
        if prev > cur:
            raise RuntimeError(errorMessage.replace(
                "xxx", "Out of order intervals %s and %s found in %s" % (
                    prev, cur, label)))
def main(argv=None):
    """ Command-line entry point: annotate the intervals of a BED file with
    values taken from one track of a tracks info file and write the result
    to the output BED. """
    if argv is None:
        argv = sys.argv

    description = ("Set the score column of each bed interval in input to "
                   "(MODE, BINNED) average value of the intersection region in another track). "
                   "Can be used, for instance, to assign a copy number of each RepeatModeler "
                   "prediction...")
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("inBed", help="BED file to annotate")
    parser.add_argument("track", help="Track to use for annotation")
    parser.add_argument("outBed", help="Path for output, annotated BED file")
    parser.add_argument("--name", action="store_true", default=False,
                        help="Set ID field (column 4 instead of 5)")

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    bedToolDir = initBedTool()

    # read the tracks list
    trackList = TrackList(args.tracksInfo)
    annotTrack = trackList.getTrackByName(args.track)
    if annotTrack is None:
        raise RuntimeError("Can't find track %s" % args.track)

    # write a temporary tracks list holding only our track, so the tracks
    # list interface can still be used without loading everything else
    oneTrackListPath = getLocalTempPath("Temp_secScore", ".bed")
    trackList.trackList = [annotTrack]
    trackList.saveXML(oneTrackListPath)

    outFile = open(args.outBed, "w")

    # trackData interface not so great at cherry picking intervals.
    # need to merge them up and use segmentation interface
    filledIntervals, mergedIntervals = fillGaps(args.inBed)

    # read track into trackData
    trackData = TrackData()
    logger.info("loading track %s" % oneTrackListPath)
    trackData.loadTrackData(oneTrackListPath, mergedIntervals,
                            segmentIntervals=filledIntervals,
                            applyMasking=False)

    # finally, write the annotation
    writeAnnotatedIntervals(trackData, filledIntervals, mergedIntervals,
                            outFile, args)

    runShellCommand("rm -f %s" % oneTrackListPath)
    outFile.close()
    cleanBedTool(bedToolDir)
def checkExactOverlap(bed1, bed2):
    """ make sure two bed files cover same region exactly: a requirement for
    all code based on the comparisons in this module.

    bed1, bed2: paths of the BED files to validate
    raises: RuntimeError when an input is empty, has overlapping or
            unsorted consecutive intervals, or covers a region that the
            other input does not
    """
    errorMessage = ("Bed files %s and %s cannot be compared. xxx. "
                    " Input files must be both sorted, cover the exact same region,"
                    " and contain no self-overlaps.") % (bed1, bed2)

    def fail(reason):
        """ raise the standard error with reason substituted in """
        raise RuntimeError(errorMessage.replace("xxx", reason))

    # empty file may break downstream comparisons
    if os.path.getsize(bed1) == 0 or os.path.getsize(bed2) == 0:
        fail("one or both inputs empty")

    # test self-overlap and sorting, each input read from its own file
    for label, bedPath in (("input1", bed1), ("input2", bed2)):
        intervals = readBedIntervals(bedPath, sort=False)
        for i in range(1, len(intervals)):
            prev, cur = intervals[i - 1], intervals[i]
            if intersectSize(prev, cur) != 0:
                fail("Overlapping intervals %s and %s found in %s" % (
                    prev, cur, label))
            if prev > cur:
                fail("Out of order intervals %s and %s found in %s" % (
                    prev, cur, label))

    # test intersection size in both directions
    tempFile = getLocalTempPath("Temp_test", ".bed")
    try:
        for first, second, reason in (
                (bed1, bed2, "Input1 covers regions outside input2"),
                (bed2, bed1, "Input2 covers regions outside input1")):
            runShellCommand("subtractBed -a %s -b %s > %s" % (
                first, second, tempFile))
            if os.path.getsize(tempFile) != 0:
                fail(reason)
    finally:
        # removed on success and on every error path
        runShellCommand("rm -f %s" % tempFile)
def filterEmptyRegions(genomePath, regions, outDir, cutTrackPath):
    """ do a trial cut on each region.  return a list of those that aren't
    empty after cut (in their original order).  outDir is accepted but not
    used here. """

    def survivesCut(index, region):
        """ True if cutting the genome down to region leaves intervals """
        getRegionName(region, index)  # result is not used
        trialPath = getLocalTempPath("Temp", ".bed")
        cutBedRegion(region, cutTrackPath, genomePath, trialPath)
        remaining = bedRead(trialPath)
        runShellCommand("rm -f %s" % trialPath)
        return len(remaining) > 0

    return [region for index, region in enumerate(regions)
            if survivesCut(index, region)]
def runVennMaker(args0):
    """ run venn_maker on base-level versions of args0.inputFiles

    args0: options object; inputFiles, names and outTiff are read.  It is
           not modified: the converted files are passed to venn_maker in a
           separate list.
    """
    # venn_maker seems designed to run on intervals (and looks pretty broken
    # doing this).  try converting to base intervals.
    todie = []
    try:
        for i, f in enumerate(args0.inputFiles):
            tempFile = getLocalTempPath("Temp_%d" % i, ".bed")
            todie.append(tempFile)
            baserize(f, tempFile)

        # use the function's own argument (the body has no other options
        # object in scope) and hand over a copy of the temp file list
        venn_maker(list(todie), args0.names, args0.outTiff, "venn.R",
                   additional_args=None, run=True)
    finally:
        # temp files are removed even if conversion or plotting fails
        for f in todie:
            runShellCommand("rm -f %s" % f)
cmd += " --iter %d" % iter cmd += " --segment %s" % trainSegPath runShellCommand(cmd) # eval ############ evalPath = "eval.bed" if startPoint <=3: cmd = "teHmmEval.py %s %s %s --bed %s --segment %s" % (trainTracksPath, modelPath, evalSegPath, evalPath, logOpts) runShellCommand(cmd) # fit ############ fitPath = "fit.bed" fitFdrPath = "fitFdr.bed" labelPath = "label.bed" if startPoint <=4: tempPath = getLocalTempPath("Tempmask", ".bed") runShellCommand("mergeBed -i %s | sortBed > %s" % (evalSegPath, tempPath)) runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (modelerPath, tempPath, labelPath)) runShellCommand("rm -f %s" % tempPath) fitCmd = "fitStateNames.py %s %s %s %s" % (labelPath, evalPath, fitPath, fitFlags) fitFdrCmd = "fitStateNames.py %s %s %s %s" % (labelPath, evalPath, fitFdrPath, fitFlagsFdr) runParallelShellCommands([fitCmd, fitFdrCmd], 2) # compare ############ compDir = "comp" if not os.path.exists(compDir): runShellCommand("mkdir %s" % compDir) def getTruthPath(idx): return os.path.join(compDir, truthNames[idx] + ".bed") fitPathMI = "fitMI.bed"
def main(argv=None):
    """ Command-line entry point: fill masked intervals of an hmm prediction
    with a state derived from the intervals that flank them.

    Mask tracks are read from the track list, merged, and every masked
    interval no longer than --maxLen is given a state (see --tgts,
    --oneSidedTgts, --onlyDefault and --default).  The labelled mask
    intervals are added to the input and the result, restricted to allBed,
    is written to outBed.  Returns 0 early (after copying inBed to outBed)
    when there are no mask tracks or no input intervals.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Fill in masked intervals of an hmm prediction "
        "(from teHmmEval.py) with state corresponding to surrounding"
        " intervals.")

    parser.add_argument("tracksXML", help="XML track list (used to id masking"
                        " tracks")
    parser.add_argument("allBed", help="Target scope.  Masked intervals outside"
                        " of these regions will not be included")
    parser.add_argument("inBed", help="TE prediction BED file.  State labels"
                        " should probably be mapped (ie with fitStateNames.py)")
    parser.add_argument("outBed", help="Output BED.  Will be equivalent to"
                        " the input bed except all gaps corresponding to "
                        "masked intervals will be filled")
    parser.add_argument("--maxLen", help="Maximum length of a masked interval"
                        " to fill (inclusive). Use --delMask option with same value"
                        "if running compareBedStates.py after.",
                        type=int, default=sys.maxint)
    parser.add_argument("--default", help="Default label to give to masked "
                        "region if no label can be determined", default="0")
    parser.add_argument("--tgts", help="Only relabel gaps that "
                        "are flanked on both sides by the same state, and this state"
                        " is in this comma- separated list. --default used for other"
                        " gaps.  If not targetst specified then all states checked.",
                        default=None)
    parser.add_argument("--oneSidedTgts", help="Only relabel gaps that "
                        "are flanked on at least one side by a state in this comma-"
                        "separated list --default used for other gaps",
                        default=None)
    parser.add_argument("--onlyDefault", help="Add the default state (--default) no"
                        " no all masked gaps no matter what. ie ignoring all other "
                        "logic", action="store_true", default=False)
    parser.add_argument("--cut", help="Cut out gaps for masked tracks from the input."
                        " By default, the input is expected to come from the HMM "
                        "with mask intervals already absent, and will crash on with"
                        " an assertion error if an overlap is detected.",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # make sets
    tgtSet = set()
    if args.tgts is not None:
        tgtSet = set(args.tgts.split(","))
    oneSidedTgtSet = set()
    if args.oneSidedTgts is not None:
        oneSidedTgtSet = set(args.oneSidedTgts.split(","))
    assert len(tgtSet.intersection(oneSidedTgtSet)) == 0

    # read the track list
    trackList = TrackList(args.tracksXML)
    maskTracks = trackList.getMaskTracks()

    # read the input bed
    inBed = args.inBed
    cutBed = None
    if args.cut is True:
        # cutOutMaskIntervals returns None when the track list has no mask
        # tracks; in that case keep reading the original input
        cutBed = cutOutMaskIntervals(inBed, -1, args.maxLen + 1, args.tracksXML)
        if cutBed is not None:
            inBed = cutBed
    inputIntervals = readBedIntervals(inBed, ncol = 4, sort = True)
    if cutBed is not None:
        runShellCommand("rm -f %s" % cutBed)
    if len(maskTracks) == 0 or len(inputIntervals) == 0:
        runShellCommand("cp %s %s" % (args.inBed, args.outBed))
        logger.warning("No mask tracks located in %s or"
                       " %s empty" % (args.tracksXML, args.inBed))
        # the early exit must release the bedtools temp area as well
        cleanBedTool(tempBedToolPath)
        return 0

    # make a temporary, combined, merged masking bed file
    tempMaskBed = getLocalTempPath("Temp_mb", ".bed")
    for maskTrack in maskTracks:
        assert os.path.isfile(maskTrack.getPath())
        runShellCommand("cat %s | setBedCol.py 3 mask | awk \'{print $1\"\t\"$2\"\t\"$3}\'>> %s" % (
            maskTrack.getPath(), tempMaskBed))
    maskedIntervals = getMergedBedIntervals(tempMaskBed, sort = True)
    resolvedMasks = 0

    # inputIntervals is known to be non-empty at this point
    inputIdx = 0
    rightFlank = inputIntervals[inputIdx]

    tempOutMask = getLocalTempPath("Temp_om", ".bed")
    tempOutMaskFile = open(tempOutMask, "w")

    for maskIdx, maskInterval in enumerate(maskedIntervals):
        if maskInterval[2] - maskInterval[1] > args.maxLen:
            continue
        # find candidate right flank: the first input interval that does
        # not sort before the mask.  None means every input interval sorts
        # before it (inputIdx then stays on the last interval).
        while rightFlank is not None and rightFlank < maskInterval:
            if inputIdx == len(inputIntervals) - 1:
                rightFlank = None
            else:
                inputIdx += 1
                rightFlank = inputIntervals[inputIdx]

        # candidate left flank: the interval just before the right flank,
        # or the last interval when there is no right flank
        leftFlank = None
        if rightFlank is None:
            leftFlank = inputIntervals[inputIdx]
        elif inputIdx > 0:
            leftFlank = inputIntervals[inputIdx - 1]

        # identify flanking states if the intervals perfectly abut
        leftState = None
        if leftFlank is not None:
            if leftFlank[0] == maskInterval[0] and leftFlank[2] == maskInterval[1]:
                leftState = str(leftFlank[3])
            else:
                assert intersectSize(leftFlank, maskInterval) == 0
        rightState = None
        if rightFlank is not None:
            if rightFlank[0] == maskInterval[0] and rightFlank[1] == maskInterval[2]:
                rightState = str(rightFlank[3])
            else:
                assert intersectSize(rightFlank, maskInterval) == 0

        # choose a state for the mask interval
        maskState = str(args.default)
        if args.onlyDefault is True:
            pass
        elif leftState is not None and leftState == rightState:
            if len(tgtSet) == 0 or leftState in tgtSet:
                maskState = leftState
        elif leftState in oneSidedTgtSet:
            maskState = leftState
        elif rightState in oneSidedTgtSet:
            maskState = rightState

        # write our mask interval
        tempOutMaskFile.write("%s\t%d\t%d\t%s\n" % (maskInterval[0],
                                                    maskInterval[1],
                                                    maskInterval[2],
                                                    maskState))

    tempOutMaskFile.close()

    # input + labelled masks, sorted, then restricted to the target scope
    tempMergePath1 = getLocalTempPath("Temp_mp", ".bed")
    tempMergePath2 = getLocalTempPath("Temp_mp", ".bed")
    runShellCommand("cp %s %s ; cat %s >> %s" % (args.inBed, tempMergePath1,
                                                 tempOutMask, tempMergePath1))
    runShellCommand("cat %s | sortBed > %s" % (tempMergePath1, tempMergePath2))
    tempScopePath = getLocalTempPath("temp_all", ".bed")
    runShellCommand("mergeBed -i %s |sortBed > %s" % (args.allBed, tempScopePath))
    runShellCommand("intersectBed -a %s -b %s > %s" % (tempMergePath2, tempScopePath,
                                                       args.outBed))

    runShellCommand("rm -f %s" % " ".join([tempMaskBed, tempOutMask, tempMergePath1,
                                           tempMergePath2, tempScopePath]))
    cleanBedTool(tempBedToolPath)
def runPositionalComparison(argv, args):
    """ hack to recursively execute compareBedStates.py on a sliding window
    of the two inputs and report accuracy in a BED file

    args.window must be "windowSize,stateName,compType,score,outBed" where
    compType is base, interval or weighted and score is f1, precision or
    recall.  Every other option found in argv from position 3 on is passed
    through to the per-window compareBedStates.py call.
    """
    try:
        fields = args.window.split(",")
        assert len(fields) == 5
        windowSize = int(fields[0])
        stateName, compType, score, outBed = fields[1:]
    except:
        raise RuntimeError("value passed to --window is not in valid format")

    # position of each comparison type in the extracted stats
    compIndices = {"base": 0, "interval": 1, "weighted": 2}
    if compType not in compIndices:
        raise RuntimeError("invalid compType, %s, passed to --window" % compType)
    compIdx = compIndices[compType]

    if score not in ("f1", "precision", "recall"):
        raise RuntimeError("invalid score, %s, passed to --window" % score)

    try:
        outFile = open(outBed, "w")
    except:
        raise RuntimeError("invalid outBed, %s, passed to --window" % outBed)

    regionBed = getLocalTempPath("Temp_region", ".bed")
    runShellCommand("mergeBed -i %s > %s" % (args.bed1, regionBed))
    chunkBed = getLocalTempPath("Temp_chunkBed", ".bed")
    runShellCommand("chunkBedRegions.py %s %d --overlap .5 > %s" % (
        regionBed, windowSize, chunkBed))
    window = getLocalTempPath("Temp_window", ".bed")
    slice1 = getLocalTempPath("Temp_slice1", ".bed")
    slice2 = getLocalTempPath("Temp_slice2", ".bed")
    compFile = getLocalTempPath("Temp_compFile", ".bed")

    # forward everything but --window and its value
    winIdx = argv.index("--window")
    assert winIdx > 0 and winIdx < len(argv) -1 and argv[winIdx + 1] == args.window
    skipped = (winIdx, winIdx + 1)
    compOpts = "".join([" " + tok for pos, tok in enumerate(argv)
                        if pos >= 3 and pos not in skipped])

    for chunk in readBedIntervals(chunkBed):
        runShellCommand("echo \"%s\t%d\t%d\" > %s" % (chunk[0], chunk[1], chunk[2],
                                                      window))
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (
            args.bed1, window, slice1))
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (
            args.bed2, window, slice2))
        runShellCommand("compareBedStates.py %s %s %s > %s" % (
            slice1, slice2, compOpts, compFile))

        stats = extractCompStatsFromFile(compFile)[compIdx]
        # a state missing from the stats counts as (0, 0)
        prec, rec = stats[stateName] if stateName in stats else (0, 0)
        f1 = 0.
        if prec + rec > 0:
            f1 = (2. * prec * rec) / (prec + rec)

        if score == "precision":
            val = prec
        elif score == "recall":
            val = rec
        else:
            val = f1
        outFile.write("%s\t%d\t%d\t%f\n" % (chunk[0], chunk[1], chunk[2], val))

    runShellCommand("rm -f %s %s %s %s %s %s" % (regionBed, chunkBed, window,
                                                 slice1, slice2, compFile))
    outFile.close()
def main(argv=None):
    """ Strip ltr_finder ids from the name column of inBed and write outBed.

    Unless --keepOl is given, removeOverlaps() is applied to the input
    first.  With --all, the _sym, _tsd_as_gap, _sym_tsd_as_gap, _tsd_as_ltr,
    _sym_tsd_as_ltr and _single variants are derived from outBed with
    sed / grep.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Remove ltr_finder ids from 4th column")
    parser.add_argument("inBed", help="bed with ltr results to process")
    parser.add_argument(
        "outBed",
        help="bed to write output to. Will also "
        "write outBed_sym.bed outBed_tsd_as_gap.bed etc.")
    parser.add_argument(
        "--keepOl", action="store_true", default=False,
        help="by default, if LTR elements "
        "overlap, the one with the highest score (length "
        "in event of tie) is kept. This option disables"
        " this logic.")
    parser.add_argument(
        "--all", action="store_true", default=False,
        help="write _sym, _tsd_as_gap, etc. versions"
        " of output")
    parser.add_argument(
        "--weak", type=float, default=-1,
        help="score threshold such that any elemetns"
        " with a score lower or equal to will be assigned the"
        " prefix WEAK_ to their names.")
    parser.add_argument(
        "--weakIgnore", default=None,
        help="dont apply --weak to state names"
        " that contain given keywords (defined as comma-separated"
        " list")
    args = parser.parse_args()

    tempBedToolPath = initBedTool()
    assert os.path.exists(args.inBed)
    baseOut, ext = os.path.splitext(args.outBed)

    # downstream code expects a (possibly empty) list of keywords
    if args.weakIgnore is None:
        args.weakIgnore = []
    else:
        args.weakIgnore = args.weakIgnore.split(",")

    # overlap removal goes through a scratch file that is deleted at the end
    scratchBed = None
    sourceBed = args.inBed
    if not args.keepOl:
        scratchBed = getLocalTempPath("Temp", ".bed")
        sourceBed = scratchBed
        removeOverlaps(args.inBed, scratchBed, args)

    os.system("sed -e \"s/|LTR_TE|[0-9]*//g\" -e \"s/|-//g\" %s > %s" % (
        sourceBed, args.outBed))

    if args.all:
        symBed = baseOut + "_sym" + ext
        symLtrBed = baseOut + "_sym_tsd_as_ltr" + ext
        # (shell template, source, destination), run in this order because
        # later entries read files written by earlier ones
        derived = [
            ("sed -e \"s/|left//g\" -e \"s/|right//g\" %s > %s",
             args.outBed, symBed),
            ("grep -v TSD %s > %s",
             args.outBed, baseOut + "_tsd_as_gap" + ext),
            ("grep -v TSD %s > %s",
             symBed, baseOut + "_sym_tsd_as_gap" + ext),
            ("sed -e \"s/TSD/LTR/g\" %s > %s",
             args.outBed, baseOut + "_tsd_as_ltr" + ext),
            ("sed -e \"s/TSD/LTR/g\" %s > %s",
             symBed, symLtrBed),
            ("sed -e \"s/LTR/inside/g\" %s > %s",
             symLtrBed, baseOut + "_single" + ext),
        ]
        for template, src, dst in derived:
            os.system(template % (src, dst))

    if scratchBed is not None:
        runShellCommand("rm -f %s" % scratchBed)
    cleanBedTool(tempBedToolPath)
def main(argv=None):
    """ Combine the non-numeric BED tracks of a tracks XML into one BED file.

    Tracks are accumulated one at a time with combineTrack().  If no track
    qualifies, the whole (merged) region is written out with the --outside
    label.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Combine a bunch of non-numeric BED tracks into"
        " single file using fitStateNames.py to try to keep names "
        "consistent. Idea is to be used as baseline to compare"
        " hmm to (via base-by-base statistics, primarily, since"
        " this procedure could induce some fragmentation)")
    parser.add_argument(
        "tracksXML",
        help="Path of Tracks Info file "
        "containing paths to genome annotation tracks")
    parser.add_argument(
        "regionBed",
        help="BED file representing "
        "target region (best if whole genome)")
    parser.add_argument("outBed", help="Output bed")
    parser.add_argument(
        "--tracks", default=None,
        help="Comma-separated list of "
        "track names to use. All tracks will be"
        " used by default")
    parser.add_argument(
        "--outside", default="Outside",
        help="Name to give non-annotated"
        "regions")
    parser.add_argument(
        "--fitThresh", type=float, default=0.5,
        help="Min map percentage (0,1)"
        " in order to rename (see --qualThresh option"
        "of fitStateNames.py")
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    inputTrackList = TrackList(args.tracksXML)

    # region intervals, merged where possible, dumped to a scratch BED
    regionIntervals = getMergedBedIntervals(args.regionBed, sort=True)
    tempRegionPath = getLocalTempPath("Temp", "_reg.bed")
    with open(tempRegionPath, "w") as regionFile:
        for region in regionIntervals:
            regionFile.write("\t".join([str(field) for field in region]) +
                             "\n")

    # None means "use every track"
    wantedNames = None
    if args.tracks is not None:
        wantedNames = args.tracks.split(",")

    # accumulate tracks in temp file
    tempOutPath = getLocalTempPath("Temp", "_out.bed")
    numCombined = 0
    for track in inputTrackList:
        isNumeric = (track.shift is not None or
                     track.scale is not None or
                     track.logScale is not None or
                     track.dist == "gaussian" or
                     os.path.splitext(track.getPath())[1].lower() != ".bed")
        if isNumeric:
            logger.warning("Skipping numeric track %s" % track.getName())
            continue
        if wantedNames is None or track.getName() in wantedNames:
            combineTrack(track, tempOutPath, tempRegionPath, numCombined,
                         args)
            numCombined += 1

    # nothing got written, make everything outside
    if numCombined == 0:
        with open(tempOutPath, "w") as outsideFile:
            for region in regionIntervals:
                outsideFile.write("%s\t%s\t%s\t%s\n" % (
                    region[0], region[1], region[2], args.outside))

    runShellCommand("mv %s %s" % (tempOutPath, args.outBed))
    runShellCommand("rm -f %s" % (tempRegionPath))
    cleanBedTool(tempBedToolPath)
def runTsd(args, tempTracksInfo):
    """ Build the tsd track by calling addTsdTrack.py once per input track.

    Inputs are the ltr_termini and tir tracks of the original track list
    (passed through fillTermini.py first) followed by the repeat_modeler
    track of the temporary track list.  Each call rewrites tempTracksInfo.
    Does nothing when args.noTsd is set.
    """
    if args.noTsd is True:
        return

    origTrackList = TrackList(args.tracksInfo)
    outTrackList = TrackList(tempTracksInfo)
    filledPaths = []   # fillTermini.py outputs, deleted at the end
    tsdInputs = []     # (track name, input bed path), in processing order

    # preprocess termini
    terminiCandidates = (origTrackList.getTrackByName(args.ltr_termini),
                         origTrackList.getTrackByName(args.tir))
    for track in terminiCandidates:
        if track is None:
            logger.warning("Could not find termini track")
            continue
        srcPath = track.getPath()
        filledPath = getLocalTempPath("Temp_fill", ".bed")
        convertedPath = None
        if srcPath[-3:] == ".bb":
            # fillTermini.py is given plain BED, so convert bigBed first
            convertedPath = getLocalTempPath("Temp_termini", ".bed")
            runShellCommand("bigBedToBed %s %s" % (srcPath, convertedPath))
            srcPath = convertedPath
        runShellCommand("fillTermini.py %s %s" % (srcPath, filledPath))
        tsdInputs.append((track.getName(), filledPath))
        filledPaths.append(filledPath)
        if convertedPath is not None:
            runShellCommand("rm -f %s" % convertedPath)

    # add repeat_modeler
    modelerTrack = outTrackList.getTrackByName(args.repeat_modeler)
    if modelerTrack is not None:
        tsdInputs.append((modelerTrack.getName(), modelerTrack.getPath()))

    # run addTsdTrack (appending except first time)
    # note we override input track paths in each case
    for position, (trackName, trackPath) in enumerate(tsdInputs):
        options = " --append" if position > 0 else ""
        # really rough hardcoded params based on
        # (A unified classification system for eukaryotic transposable
        # elements Wicker et. al 2007)
        if trackName == args.repeat_modeler:
            options += (" --names LINE,SINE,Unknown"
                        " --maxScore 20"
                        " --left 20"
                        " --right 20"
                        " --min 5"
                        " --max 20"
                        " --overlap 20")
        elif trackName == args.ltr_termini:
            options += (" --maxScore 3"
                        " --left 8"
                        " --right 8"
                        " --min 3"
                        " --max 6")
        elif trackName == args.tir:
            options += (" --maxScore 3"
                        " --left 15"
                        " --right 15"
                        " --min 3"
                        " --max 12")

        tempXMLOut = getLocalTempPath("Temp_tsd_xml", ".xml")
        runShellCommand(
            "addTsdTrack.py %s %s %s %s %s %s --inPath %s %s %s --numProc %d"
            % (tempTracksInfo, args.cleanTrackPath, tempXMLOut, trackName,
               args.sequence, args.tsd, trackPath, options,
               args.logOpString, args.numProc))
        runShellCommand("mv %s %s" % (tempXMLOut, tempTracksInfo))

    for path in filledPaths:
        runShellCommand("rm %s" % path)
def main(argv=None):
    """ Turn a raw track list into an HMM-usable one.

    Parses the command line, makes sure the cleanTrackPath directory exists,
    then runs runCleaning(), runTsd() and runScaling() in that order on a
    temporary tracks XML.  The temporary XML and the BedTool scratch area
    are removed even when one of the steps fails.

    Raises RuntimeError if cleanTrackPath neither exists nor can be created.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Generate HMM-usable tracklist from raw tracklist. EX "
        "used to transform mustang_alyrata_tracks.xml -> "
        "mustang_alyrata_clean.xml. Runs cleanRM.py cleanLtrFinder.py and "
        " cleanTermini.py and addTsdTrack.py and setTrackScaling.py (also runs "
        " removeBedOverlaps.py before each of the clean scripts)")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("allBed", help="Bed file spanning entire genome")
    parser.add_argument("cleanTrackPath", help="Directory to write cleaned BED"
                        " tracks to")
    parser.add_argument("outTracksInfo", help="Path to write modified tracks XML"
                        " to.")
    parser.add_argument("--numBins", help="Maximum number of bins after scaling",
                        default=10, type=int)
    parser.add_argument("--scaleTracks", help="Comma-separated list of tracks "
                        "to process for scaling. If not set, all"
                        " tracks listed as having a multinomial distribution"
                        " (since this is the default value, this includes "
                        "tracks with no distribution attribute) or gaussian "
                        "distribution will be processed.", default=None)
    parser.add_argument("--skipScale", help="Comma-separated list of tracks to "
                        "skip for scaling.", default=None)
    parser.add_argument("--ltr_termini", help="Name of termini track (appy tsd)",
                        default="ltr_termini")
    parser.add_argument("--repeat_modeler",
                        help="Name of repeat_modeler track (appy tsd)",
                        default="repeat_modeler")
    parser.add_argument("--sequence", help="Name of fasta sequence track",
                        default="sequence")
    parser.add_argument("--tsd",
                        help="Name of tsd track to generate (appy cleanTermini.py)",
                        default="tsd")
    parser.add_argument("--tir",
                        help="Name of tir_termini track (appy cleanTermini.py)",
                        default="tir_termini")
    parser.add_argument("--noScale", help="Dont do any scaling", default=False,
                        action="store_true")
    parser.add_argument("--noTsd", help="Dont generate TSD track. NOTE:"
                        " TSD track is hardcoded to be generated from "
                        "termini and (non-LTR elements of ) chaux",
                        default=False, action="store_true")
    parser.add_argument("--numProc",
                        help="Number of processes to use for tsdFinder.py",
                        default=1, type=int)
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()
    tempTracksInfo = None
    try:
        # logging options forwarded to the child scripts
        args.logOpString = "--logLevel %s" % getLogLevelString()
        if args.logFile is not None:
            args.logOpString += " --logFile %s" % args.logFile

        try:
            os.makedirs(args.cleanTrackPath)
        except OSError:
            # typically "already exists"; the isdir() test below is the
            # real check, so any creation failure is reported there
            pass
        if not os.path.isdir(args.cleanTrackPath):
            raise RuntimeError("Unable to find or create cleanTrack dir %s" %
                               args.cleanTrackPath)

        # NOTE(review): suffix is "xml" without a dot, unlike the ".xml"
        # used for other temp XML files; left unchanged pending confirmation
        tempTracksInfo = getLocalTempPath("Temp_mustang_alyrata_clean", "xml")
        runCleaning(args, tempTracksInfo)
        assert os.path.isfile(tempTracksInfo)
        runTsd(args, tempTracksInfo)
        runScaling(args, tempTracksInfo)
    finally:
        if tempTracksInfo is not None:
            runShellCommand("rm -f %s" % tempTracksInfo)
        cleanBedTool(tempBedToolPath)
def main(argv=None):
    """ Remove ltr_finder ids from the name column of a BED file.

    inBed is (unless --keepOl) first passed through removeOverlaps(), then
    sed strips the id suffixes into outBed.  --all additionally writes the
    _sym / _tsd_as_gap / _tsd_as_ltr / _single flavours next to outBed.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Remove ltr_finder ids from 4th column")
    parser.add_argument("inBed", help="bed with ltr results to process")
    parser.add_argument("outBed",
                        help="bed to write output to. Will also "
                        "write outBed_sym.bed outBed_tsd_as_gap.bed etc.")
    parser.add_argument("--keepOl",
                        default=False,
                        action="store_true",
                        help="by default, if LTR elements "
                        "overlap, the one with the highest score (length "
                        "in event of tie) is kept. This option disables"
                        " this logic.")
    parser.add_argument("--all",
                        default=False,
                        action="store_true",
                        help="write _sym, _tsd_as_gap, etc. versions"
                        " of output")
    parser.add_argument("--weak",
                        default=-1,
                        type=float,
                        help="score threshold such that any elemetns"
                        " with a score lower or equal to will be assigned the"
                        " prefix WEAK_ to their names.")
    parser.add_argument("--weakIgnore",
                        default=None,
                        help="dont apply --weak to state names"
                        " that contain given keywords (defined as comma-separated"
                        " list")
    args = parser.parse_args()

    tempBedToolPath = initBedTool()
    assert os.path.exists(args.inBed)
    baseOut, ext = os.path.splitext(args.outBed)

    # normalise --weakIgnore to a list before args is handed on
    ignoreSpec = args.weakIgnore
    args.weakIgnore = ignoreSpec.split(",") if ignoreSpec is not None else []

    cleanupPaths = []
    workingBed = args.inBed
    if not args.keepOl:
        workingBed = getLocalTempPath("Temp", ".bed")
        removeOverlaps(args.inBed, workingBed, args)
        cleanupPaths.append(workingBed)

    os.system("sed -e \"s/|LTR_TE|[0-9]*//g\" -e \"s/|-//g\" %s > %s" %
              (workingBed, args.outBed))

    if args.all:
        def derive(tag, template, src):
            # run one sed/grep filter from src into <baseOut><tag><ext>
            dst = baseOut + tag + ext
            os.system(template % (src, dst))
            return dst

        symBed = derive(
            "_sym", "sed -e \"s/|left//g\" -e \"s/|right//g\" %s > %s",
            args.outBed)
        derive("_tsd_as_gap", "grep -v TSD %s > %s", args.outBed)
        derive("_sym_tsd_as_gap", "grep -v TSD %s > %s", symBed)
        derive("_tsd_as_ltr", "sed -e \"s/TSD/LTR/g\" %s > %s", args.outBed)
        symLtrBed = derive("_sym_tsd_as_ltr",
                           "sed -e \"s/TSD/LTR/g\" %s > %s", symBed)
        derive("_single", "sed -e \"s/LTR/inside/g\" %s > %s", symLtrBed)

    for stalePath in cleanupPaths:
        runShellCommand("rm -f %s" % stalePath)
    cleanBedTool(tempBedToolPath)
def main(argv=None):
    """ Filter overlapping intervals out of a BED file, writing to stdout.

    Intervals are visited in sorted order; an interval that starts inside
    the previously written interval on the same chromosome has its start
    moved to that interval's end, and is dropped if nothing is left.

    With --rm the input is first split with rm2State.sh into TE and non-TE
    intervals, and the non-TE part has everything overlapping TE subtracted
    before the two are concatenated again.
    With --bed12 the sorted input is converted with bed6() first.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Filter overlapping intervals out")
    parser.add_argument("inputBed", help="Bed file to filter")
    parser.add_argument("--bed12", help="Use bed12 exons instead of start/end"
                        " if present (equivalent to running bed12ToBed6 on"
                        " input first).", action="store_true", default=False)
    parser.add_argument("--rm", help="Make sure intervals that are labeled as TE "
                        "by rm2State.sh script are never cut by ones that are not",
                        default=False, action='store_true')
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    assert os.path.isfile(args.inputBed)
    tempBedToolPath = initBedTool()

    # do the --rm filter by splitting into TE / non-TE, then removing
    # everything in non-TE that overlaps TE, then adding the remainder
    # back to TE.
    inputPath = args.inputBed
    if args.rm is True:
        tempPath = getLocalTempPath("Temp_", ".bed")
        tePath = getLocalTempPath("Temp_te_", ".bed")
        runShellCommand("rm2State.sh %s |grep TE | sortBed > %s" % (
            args.inputBed, tempPath))
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (
            args.inputBed, tempPath, tePath))
        otherPath = getLocalTempPath("Temp_other_", ".bed")
        runShellCommand("rm2State.sh %s |grep -v TE | sortBed > %s" % (
            args.inputBed, tempPath))
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (
            args.inputBed, tempPath, otherPath))
        # only worth filtering when both classes are present
        if os.path.getsize(tePath) > 0 and os.path.getsize(otherPath) > 0:
            filterPath = getLocalTempPath("Temp_filter_", ".bed")
            runShellCommand("subtractBed -a %s -b %s | sortBed > %s" % (
                otherPath, tePath, filterPath))
            inputPath = getLocalTempPath("Temp_input_", ".bed")
            runShellCommand("cat %s %s | sortBed > %s" % (
                tePath, filterPath, inputPath))
            runShellCommand("rm -f %s" % filterPath)
        runShellCommand("rm -f %s %s %s" % (tePath, otherPath, tempPath))

    bedIntervals = BedTool(inputPath).sort()
    if args.bed12 is True:
        bedIntervals = bedIntervals.bed6()

    prevInterval = None
    # this code has been way too buggy for something so simple:
    # keep extra list to check for sure even though it's a waste of
    # time and space
    sanity = []
    for interval in bedIntervals:
        if (prevInterval is not None and
                interval.chrom == prevInterval.chrom and
                interval.start < prevInterval.end):
            logger.debug("Replace %d bases of \n%s with\n%s" % (
                prevInterval.end - interval.start,
                str(interval), str(prevInterval)))
            interval.start = prevInterval.end
        if interval.end > interval.start:
            sys.stdout.write("%s" % str(interval))
            sanity.append(interval)
            # Only an interval that was actually written may become the
            # reference.  An interval trimmed away to nothing has an end
            # that lies before the last written end, and using it would let
            # the next interval overlap what is already on stdout.
            prevInterval = interval

    for current, following in zip(sanity, sanity[1:]):
        if current.chrom == following.chrom:
            assert following.start >= current.end

    cleanBedTool(tempBedToolPath)
    if args.inputBed != inputPath:
        runShellCommand("rm -f %s" % inputPath)
def main(argv=None):
    """ Fill masked gaps of an HMM prediction with a state label.

    All mask tracks of tracksXML are merged into one set of intervals.  For
    each masked interval no longer than --maxLen, the input intervals that
    abut it exactly on the left and right are looked up and a label chosen:
      * --onlyDefault: always --default
      * both flanks carry the same state: that state, provided --tgts is
        empty or contains it
      * otherwise the left, then the right, flank state if it is listed in
        --oneSidedTgts
      * otherwise --default
    The labelled masks are appended to inBed, sorted and intersected with
    allBed to give outBed.

    Returns 0 (after copying inBed to outBed) when there are no mask tracks
    or no input intervals; otherwise returns None.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Fill in masked intervals of an hmm prediction "
        "(from teHmmEval.py) with state corresponding to surrounding"
        " intervals.")

    parser.add_argument("tracksXML", help="XML track list (used to id masking"
                        " tracks")
    parser.add_argument("allBed", help="Target scope. Masked intervals outside"
                        " of these regions will not be included")
    parser.add_argument("inBed", help="TE prediction BED file. State labels"
                        " should probably be mapped (ie with fitStateNames.py)")
    parser.add_argument("outBed", help="Output BED. Will be equivalent to"
                        " the input bed except all gaps corresponding to "
                        "masked intervals will be filled")
    parser.add_argument("--maxLen", help="Maximum length of a masked interval"
                        " to fill (inclusive). Use --delMask option with same value"
                        "if running compareBedStates.py after.",
                        type=int, default=sys.maxint)
    parser.add_argument("--default", help="Default label to give to masked "
                        "region if no label can be determined", default="0")
    parser.add_argument("--tgts", help="Only relabel gaps that "
                        "are flanked on both sides by the same state, and this state"
                        " is in this comma- separated list. --default used for other"
                        " gaps. If not targetst specified then all states checked.",
                        default=None)
    parser.add_argument("--oneSidedTgts", help="Only relabel gaps that "
                        "are flanked on at least one side by a state in this comma-"
                        "separated list --default used for other gaps",
                        default=None)
    parser.add_argument("--onlyDefault", help="Add the default state (--default) no"
                        " no all masked gaps no matter what. ie ignoring all other "
                        "logic", action="store_true", default=False)
    parser.add_argument("--cut", help="Cut out gaps for masked tracks from the input."
                        " By default, the input is expected to come from the HMM "
                        "with mask intervals already absent, and will crash on with"
                        " an assertion error if an overlap is detected.",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # make sets
    tgtSet = set()
    if args.tgts is not None:
        tgtSet = set(args.tgts.split(","))
    oneSidedTgtSet = set()
    if args.oneSidedTgts is not None:
        oneSidedTgtSet = set(args.oneSidedTgts.split(","))
    assert len(tgtSet.intersection(oneSidedTgtSet)) == 0

    # read the track list
    trackList = TrackList(args.tracksXML)
    maskTracks = trackList.getMaskTracks()

    # read the input bed
    inBed = args.inBed
    if args.cut is True:
        inBed = cutOutMaskIntervals(inBed, -1, args.maxLen + 1, args.tracksXML)
    inputIntervals = readBedIntervals(inBed, ncol=4, sort=True)
    if args.cut is True:
        runShellCommand("rm -f %s" % inBed)
    if len(maskTracks) == 0 or len(inputIntervals) == 0:
        runShellCommand("cp %s %s" % (args.inBed, args.outBed))
        logger.warning("No mask tracks located in %s or"
                       " %s empty" % (args.tracksXML, args.inBed))
        # release the BedTool scratch area on this path as well
        cleanBedTool(tempBedToolPath)
        return 0

    # make a temporary, combined, merged masking bed file
    tempMaskBed = getLocalTempPath("Temp_mb", ".bed")
    for maskTrack in maskTracks:
        assert os.path.isfile(maskTrack.getPath())
        runShellCommand("cat %s | setBedCol.py 3 mask | awk \'{print $1\"\t\"$2\"\t\"$3}\'>> %s" % (
            maskTrack.getPath(), tempMaskBed))
    maskedIntervals = getMergedBedIntervals(tempMaskBed, sort=True)

    # inputIntervals is known to be non-empty here (checked above)
    inputIdx = 0
    rightFlank = inputIntervals[inputIdx]
    tempOutMask = getLocalTempPath("Temp_om", ".bed")
    with open(tempOutMask, "w") as tempOutMaskFile:
        for maskInterval in maskedIntervals:
            if maskInterval[2] - maskInterval[1] > args.maxLen:
                continue

            # find candidate right flank: first input interval that does
            # not sort before the mask (None once the input is exhausted)
            while rightFlank is not None and rightFlank < maskInterval:
                if inputIdx == len(inputIntervals) - 1:
                    rightFlank = None
                else:
                    inputIdx += 1
                    rightFlank = inputIntervals[inputIdx]

            # candidate left flank: the interval just before the right
            # flank.  When the input is exhausted every interval sorts
            # before the mask, so the left flank is the last one.
            if rightFlank is None:
                leftFlank = inputIntervals[-1]
            elif inputIdx > 0:
                leftFlank = inputIntervals[inputIdx - 1]
            else:
                leftFlank = None

            # identify flanking states if the intervals perfectly abut
            leftState = None
            if leftFlank is not None:
                if leftFlank[0] == maskInterval[0] and\
                   leftFlank[2] == maskInterval[1]:
                    leftState = str(leftFlank[3])
                else:
                    assert intersectSize(leftFlank, maskInterval) == 0
            rightState = None
            if rightFlank is not None:
                if rightFlank[0] == maskInterval[0] and\
                   rightFlank[1] == maskInterval[2]:
                    rightState = str(rightFlank[3])
                else:
                    assert intersectSize(rightFlank, maskInterval) == 0

            # choose a state for the mask interval
            maskState = str(args.default)
            if args.onlyDefault is True:
                pass
            elif leftState is not None and leftState == rightState:
                if len(tgtSet) == 0 or leftState in tgtSet:
                    maskState = leftState
            elif leftState in oneSidedTgtSet:
                maskState = leftState
            elif rightState in oneSidedTgtSet:
                maskState = rightState

            # write our mask interval
            tempOutMaskFile.write("%s\t%d\t%d\t%s\n" % (
                maskInterval[0], maskInterval[1], maskInterval[2], maskState))

    # input + labelled masks -> sorted -> clipped to the target scope
    tempMergePath1 = getLocalTempPath("Temp_mp", ".bed")
    tempMergePath2 = getLocalTempPath("Temp_mp", ".bed")
    runShellCommand("cp %s %s ; cat %s >> %s" % (
        args.inBed, tempMergePath1, tempOutMask, tempMergePath1))
    runShellCommand("cat %s | sortBed > %s" % (tempMergePath1,
                                               tempMergePath2))
    tempScopePath = getLocalTempPath("temp_all", ".bed")
    runShellCommand("mergeBed -i %s |sortBed > %s" % (args.allBed,
                                                     tempScopePath))
    runShellCommand("intersectBed -a %s -b %s > %s" % (
        tempMergePath2, tempScopePath, args.outBed))

    runShellCommand("rm -f %s" % " ".join([
        tempMaskBed, tempOutMask, tempMergePath1, tempMergePath2,
        tempScopePath]))
    cleanBedTool(tempBedToolPath)
def parallelDispatch(argv, args):
    """ chunk up input with chrom option.  recursively launch eval.  merge
    results

    argv -- original command line tokens; a copy is edited per chunk
            (--chrom blanked, tokens 2 and 3 replaced by the chunk's region
            and segment paths, --co and --stats redirected).
    args -- parsed options; reads chroms, allBed, co, stats, outBed, proc.

    One job is built per interval of args.chroms that intersects
    args.allBed.  After all jobs ran, their outputs are concatenated in job
    order into args.outBed (and args.stats when given) and every temp file
    is removed.
    """
    jobList = []
    chromIntervals = readBedIntervals(args.chroms, sort=True)
    chromFiles = []
    regionFiles = []
    segFiles = []
    statsFiles = []
    offset = args.co
    for chrom in chromIntervals:
        cmdToks = copy.deepcopy(argv)
        # blank out "--chrom <file>" so the child does not dispatch again
        chromOptIdx = cmdToks.index("--chrom")
        cmdToks[chromOptIdx + 1] = ""
        cmdToks[chromOptIdx] = ""

        chromPath = getLocalTempPath("TempChromPath", ".bed")
        with open(chromPath, "w") as cpFile:
            cpFile.write("%s\t%d\t%d\t0\t0\t.\n" % (chrom[0], chrom[1],
                                                    chrom[2]))
        regionPath = getLocalTempPath("Temp", ".bed")
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (
            args.allBed, chromPath, regionPath))
        if os.path.getsize(regionPath) < 2:
            # nothing of allBed on this chrom: drop the two temp files now,
            # they are not tracked for the final cleanup
            runShellCommand("rm -f %s %s" % (chromPath, regionPath))
            continue

        # the offset is advanced by this chrom's length before it is handed
        # to the chunk
        offset += int(chrom[2]) - int(chrom[1])
        regionFiles.append(regionPath)
        chromFiles.append(chromPath)
        cmdToks[2] = regionPath
        segPath = getLocalTempPath("Temp", ".bed")
        cmdToks[3] = segPath
        segFiles.append(segPath)
        if "--co" in cmdToks:
            cmdToks[cmdToks.index("--co") + 1] = str(offset)
        else:
            cmdToks.append("--co")
            cmdToks.append(str(offset))
        if args.stats is not None:
            statsPath = getLocalTempPath("Temp", ".bed")
            cmdToks[cmdToks.index("--stats") + 1] = statsPath
            statsFiles.append(statsPath)
        jobList.append(" ".join(cmdToks))

    runParallelShellCommands(jobList, args.proc)

    # first chunk truncates the destination, the rest append
    for i, segPath in enumerate(segFiles):
        ct = ">" if i == 0 else ">>"
        runShellCommand("cat %s %s %s" % (segPath, ct, args.outBed))
        if len(statsFiles) > 0:
            runShellCommand("cat %s %s %s" % (statsFiles[i], ct, args.stats))

    for path in itertools.chain(chromFiles, regionFiles, segFiles,
                                statsFiles):
        runShellCommand("rm %s" % path)
def parallelDispatch(argv, args):
    """ chunk up input with chrom option.  recursively launch eval.  merge
    results

    argv -- original command line tokens; a copy is edited per chunk
            (--chrom blanked, token 3 replaced by the chunk's region path,
            --bed / --pd / --ed / --bic redirected to temp files).
    args -- parsed options; reads chroms, bedRegions, bed, pd, ed, bic, proc.

    One job is built per interval of args.chroms that intersects
    args.bedRegions.  After all jobs ran, each requested output is
    concatenated in job order into its final destination and every temp
    file is removed.
    """
    jobList = []
    chromIntervals = readBedIntervals(args.chroms, sort=True)
    chromFiles = []
    regionFiles = []
    # (command line flag, final destination or None, temp file suffix)
    outputs = [("--bed", args.bed, ".bed"),
               ("--pd", args.pd, ".bed"),
               ("--ed", args.ed, ".bed"),
               ("--bic", args.bic, ".bic")]
    # per-flag list of chunk outputs, index-aligned with jobList
    chunkFiles = dict((flag, []) for flag, _, _ in outputs)

    for chrom in chromIntervals:
        cmdToks = copy.deepcopy(argv)
        # blank out "--chrom <file>" so the child does not dispatch again
        chromOptIdx = cmdToks.index("--chrom")
        cmdToks[chromOptIdx + 1] = ""
        cmdToks[chromOptIdx] = ""

        chromPath = getLocalTempPath("Temp", ".bed")
        with open(chromPath, "w") as cpFile:
            cpFile.write("%s\t%d\t%d\t0\t0\t.\n" % (chrom[0], chrom[1],
                                                    chrom[2]))
        regionPath = getLocalTempPath("Temp", ".bed")
        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (
            args.bedRegions, chromPath, regionPath))
        if os.path.getsize(regionPath) < 2:
            # nothing of bedRegions on this chrom: drop the two temp files
            # now, they are not tracked for the final cleanup
            runShellCommand("rm -f %s %s" % (chromPath, regionPath))
            continue

        regionFiles.append(regionPath)
        chromFiles.append(chromPath)
        cmdToks[3] = regionPath

        for flag, destination, suffix in outputs:
            if destination is not None:
                chunkPath = getLocalTempPath("Temp", suffix)
                cmdToks[cmdToks.index(flag) + 1] = chunkPath
                chunkFiles[flag].append(chunkPath)
        jobList.append(" ".join(cmdToks))

    runParallelShellCommands(jobList, args.proc)

    # first chunk truncates each destination, the rest append
    for i in range(len(jobList)):
        ct = ">" if i == 0 else ">>"
        for flag, destination, _ in outputs:
            if len(chunkFiles[flag]) > 0:
                runShellCommand("cat %s %s %s" % (chunkFiles[flag][i], ct,
                                                  destination))

    tempLists = [chromFiles, regionFiles]
    tempLists.extend(chunkFiles[flag] for flag, _, _ in outputs)
    for path in itertools.chain(*tempLists):
        runShellCommand("rm %s" % path)
def runPositionalComparison(argv, args):
    """ hack to recursively execute compareBedStates.py on a sliding window
    of the two inputs and report accuracy in a BED file

    argv -- full command line as a list of tokens.  Every token from index 3
            on, except "--window" and its value, is passed through to each
            compareBedStates.py call.
    args -- parsed options; reads args.window, args.bed1 and args.bed2.
            args.window is "windowSize,stateName,compType,score,outBed" with
            compType one of base/interval/weighted and score one of
            f1/precision/recall.

    For every window listed by chunkBedRegions.py a line
    "chrom<TAB>start<TAB>end<TAB>score" is written to outBed.

    Raises RuntimeError when --window is malformed or outBed cannot be
    opened for writing.
    """
    try:
        windowToks = args.window.split(",")
        if len(windowToks) != 5:
            raise ValueError("expected 5 comma-separated fields")
        windowSize = int(windowToks[0])
    except (AttributeError, ValueError):
        raise RuntimeError("value passed to --window is not in valid format")
    stateName, compType, score, outBed = windowToks[1:]

    # position of each comparison type in extractCompStatsFromFile()'s result
    compIndexes = {"base": 0, "interval": 1, "weighted": 2}
    if compType not in compIndexes:
        raise RuntimeError("invalid compType, %s, passed to --window" %
                           compType)
    compIdx = compIndexes[compType]
    if score not in ("f1", "precision", "recall"):
        raise RuntimeError("invalid score, %s, passed to --window" % score)
    try:
        outFile = open(outBed, "w")
    except (IOError, OSError):
        raise RuntimeError("invalid outBed, %s, passed to --window" % outBed)

    tempPaths = []

    def newTemp(prefix):
        # remember every temp path so the finally block can remove it
        path = getLocalTempPath(prefix, ".bed")
        tempPaths.append(path)
        return path

    try:
        tempBed = newTemp("Temp_region")
        runShellCommand("mergeBed -i %s > %s" % (args.bed1, tempBed))
        chunkBed = newTemp("Temp_chunkBed")
        runShellCommand("chunkBedRegions.py %s %d --overlap .5 > %s" %
                        (tempBed, windowSize, chunkBed))
        window = newTemp("Temp_window")
        slice1 = newTemp("Temp_slice1")
        slice2 = newTemp("Temp_slice2")
        compFile = newTemp("Temp_compFile")

        # forward all options except --window (and its value) to the
        # recursive calls, otherwise they would recurse forever
        winIdx = argv.index("--window")
        assert winIdx > 0 and winIdx < len(argv) - 1 and\
            argv[winIdx + 1] == args.window
        compOpts = "".join(" " + tok for i, tok in enumerate(argv)
                           if i >= 3 and i not in (winIdx, winIdx + 1))

        for chunk in readBedIntervals(chunkBed):
            runShellCommand("echo \"%s\t%d\t%d\" > %s" %
                            (chunk[0], chunk[1], chunk[2], window))
            runShellCommand("intersectBed -a %s -b %s | sortBed > %s" %
                            (args.bed1, window, slice1))
            runShellCommand("intersectBed -a %s -b %s | sortBed > %s" %
                            (args.bed2, window, slice2))
            runShellCommand("compareBedStates.py %s %s %s > %s" %
                            (slice1, slice2, compOpts, compFile))
            stats = extractCompStatsFromFile(compFile)[compIdx]
            # a state absent from this window scores zero
            prec, rec = stats.get(stateName, (0, 0))
            if score == "precision":
                val = prec
            elif score == "recall":
                val = rec
            elif prec + rec > 0:
                val = (2. * prec * rec) / (prec + rec)
            else:
                val = 0.
            outFile.write("%s\t%d\t%d\t%f\n" %
                          (chunk[0], chunk[1], chunk[2], val))
    finally:
        # always release the output handle and the scratch files, even when
        # one of the shell commands above fails
        outFile.close()
        if tempPaths:
            runShellCommand("rm -f %s" % " ".join(tempPaths))
def runTsd(args, tempTracksInfo):
    """ Run addTsdTrack.py over the termini tracks and the repeat_modeler
    track to produce the tsd track.

    The ltr_termini and tir tracks come from the original track list and go
    through fillTermini.py; repeat_modeler comes from the temporary track
    list.  tempTracksInfo is replaced after every addTsdTrack.py call.
    Returns immediately when args.noTsd is set.
    """
    if args.noTsd is True:
        return

    origTrackList = TrackList(args.tracksInfo)
    outTrackList = TrackList(tempTracksInfo)
    scratchFiles = []
    inputNames = []
    inputPaths = []

    # preprocess termini
    for terminiTrack in [origTrackList.getTrackByName(args.ltr_termini),
                         origTrackList.getTrackByName(args.tir)]:
        if terminiTrack is None:
            logger.warning("Could not find termini track")
        else:
            bedPath = terminiTrack.getPath()
            fillPath = getLocalTempPath("Temp_fill", ".bed")
            isBigBed = bedPath[-3:] == ".bb"
            if isBigBed:
                bigBedPath = bedPath
                bedPath = getLocalTempPath("Temp_termini", ".bed")
                runShellCommand("bigBedToBed %s %s" % (bigBedPath, bedPath))
            runShellCommand("fillTermini.py %s %s" % (bedPath, fillPath))
            inputPaths.append(fillPath)
            inputNames.append(terminiTrack.getName())
            scratchFiles.append(fillPath)
            if isBigBed:
                # bedPath is the converted temp copy at this point
                runShellCommand("rm -f %s" % bedPath)

    # add repeat_modeler
    repeatTrack = outTrackList.getTrackByName(args.repeat_modeler)
    if repeatTrack is not None:
        inputPaths.append(repeatTrack.getPath())
        inputNames.append(repeatTrack.getName())

    # really rough hardcoded params based on
    # (A unified classification system for eukaryotic transposable elements
    # Wicker et. al 2007); first matching name wins
    presets = [
        (args.repeat_modeler,
         " --names LINE,SINE,Unknown --maxScore 20 --left 20 --right 20"
         " --min 5 --max 20 --overlap 20"),
        (args.ltr_termini,
         " --maxScore 3 --left 8 --right 8 --min 3 --max 6"),
        (args.tir,
         " --maxScore 3 --left 15 --right 15 --min 3 --max 12"),
    ]

    # run addTsdTrack (appending except first time)
    # note we override input track paths in each case
    assert len(inputPaths) == len(inputNames)
    firstCall = True
    for name, path in zip(inputNames, inputPaths):
        optString = ""
        if not firstCall:
            optString += " --append"
        firstCall = False
        for presetName, presetOpts in presets:
            if name == presetName:
                optString += presetOpts
                break
        xmlOut = getLocalTempPath("Temp_tsd_xml", ".xml")
        command = (
            "addTsdTrack.py %s %s %s %s %s %s --inPath %s %s %s --numProc %d"
            % (tempTracksInfo, args.cleanTrackPath, xmlOut, name,
               args.sequence, args.tsd, path, optString, args.logOpString,
               args.numProc))
        runShellCommand(command)
        runShellCommand("mv %s %s" % (xmlOut, tempTracksInfo))

    for scratch in scratchFiles:
        runShellCommand("rm %s" % scratch)
cmd += " --segment %s" % trainSegPath runShellCommand(cmd) # eval ############ evalPath = "eval.bed" if startPoint <= 3: cmd = "teHmmEval.py %s %s %s --bed %s --segment %s" % ( trainTracksPath, modelPath, evalSegPath, evalPath, logOpts) runShellCommand(cmd) # fit ############ fitPath = "fit.bed" fitFdrPath = "fitFdr.bed" labelPath = "label.bed" if startPoint <= 4: tempPath = getLocalTempPath("Tempmask", ".bed") runShellCommand("mergeBed -i %s | sortBed > %s" % (evalSegPath, tempPath)) runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (modelerPath, tempPath, labelPath)) runShellCommand("rm -f %s" % tempPath) fitCmd = "fitStateNames.py %s %s %s %s" % (labelPath, evalPath, fitPath, fitFlags) fitFdrCmd = "fitStateNames.py %s %s %s %s" % (labelPath, evalPath, fitFdrPath, fitFlagsFdr) runParallelShellCommands([fitCmd, fitFdrCmd], 2) # compare ############ compDir = "comp" if not os.path.exists(compDir): runShellCommand("mkdir %s" % compDir)
def _stripName(name, keepUnderscore, keepSlash, leaveNumbers):
    """ Cut name off at the first "|", "?", "_" (unless keepUnderscore),
    "/" (unless keepSlash) and first digit (unless leaveNumbers), in that
    order, and return what is left. """
    name = name.partition("|")[0]
    name = name.partition("?")[0]
    if not keepUnderscore:
        name = name.partition("_")[0]
    if not keepSlash:
        name = name.partition("/")[0]
    if not leaveNumbers:
        # everything from the first digit on is dropped
        m = re.search(r"\d", name)
        if m is not None:
            name = name[:m.start()]
    return name


def main(argv=None):
    """ Clean the name column of a BED file and print the result to stdout.

    Intervals whose score lies outside [--minScore, --maxScore] are dropped
    (intervals without a usable numeric score are kept).  Names matching a
    --mapPrefix entry are replaced by that prefix; all other names are cut
    off at the first |, ?, _, / or digit as controlled by the options.
    Unless --overlap is given the result is passed through
    "removeBedOverlaps.py --rm".
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Cut names off at first |, /, ?, or _")
    parser.add_argument("inBed", help="bed with chaux results to process")
    parser.add_argument("--keepSlash", help="dont strip anything after slash "
                        "ex: DNA/HELITRONY1C -> DNA", action="store_true",
                        default=False)
    parser.add_argument("--keepUnderscore", help="dont strip anything after _ ",
                        action="store_true", default=False)
    # nargs="?" lets the option work as the bare switch its help text
    # describes while still accepting the "--leaveNumbers <value>" form
    # that was previously mandatory
    parser.add_argument("--leaveNumbers", help="by default, numbers as the end"
                        " of names are trimmed off. ex: DNA/HELITRONY1C -> "
                        " DNA/HELITRONY. This option disables this behaviour",
                        nargs="?", const=True, default=False)
    parser.add_argument("--mapPrefix", help="Rename all strings with given "
                        "prefix to just the prefix. ex: --mapPrefix DNA/HELI"
                        " would cause any instance of DNA/HELITRONY1C or "
                        "HELITRON2 to be mapped to just DNA/HELI. This option"
                        " overrides --keepSlash and --leaveNumbers for the"
                        " elements to which it applies. This option can be"
                        " specified more than once. ex --mapPrefix DNA/HELI "
                        "--maxPrefix DNA/ASINE.", action="append")
    parser.add_argument("--minScore", help="Minimum score value to not filter"
                        " out", default=-sys.maxint, type=float)
    parser.add_argument("--maxScore", help="Maximum score value to not filter"
                        " out", default=sys.maxint, type=float)
    parser.add_argument("--overlap", help="Dont run removeBedOverlaps.py",
                        action="store_true", default=False)

    args = parser.parse_args()
    assert os.path.exists(args.inBed)
    assert args.minScore <= args.maxScore
    tempBedToolPath = initBedTool()

    tempPath = getLocalTempPath("Temp_cleanOut", ".bed")
    tempPath2 = getLocalTempPath("Temp2_", ".bed")
    with open(tempPath, "w") as tempFile:
        for interval in BedTool(args.inBed).sort():
            # filter score if exists; a missing or non-numeric score means
            # the interval is kept
            try:
                score = interval.score
                if score is not None:
                    score = float(score)
            except Exception:
                score = None
            if score is not None and (score < args.minScore or
                                      score > args.maxScore):
                continue

            prefix = findPrefix(interval.name, args.mapPrefix)
            if prefix is not None:
                # prefix was specified with --mapPrefix, that's what we use
                interval.name = prefix
            else:
                interval.name = _stripName(
                    interval.name,
                    keepUnderscore=args.keepUnderscore is not False,
                    keepSlash=args.keepSlash is not False,
                    leaveNumbers=args.leaveNumbers is not False)
            tempFile.write(str(interval))

    if not args.overlap:
        runShellCommand("removeBedOverlaps.py %s --rm > %s" % (tempPath,
                                                                tempPath2))
        # the overlap-free file is the one to print
        tempPath, tempPath2 = tempPath2, tempPath

    with open(tempPath, "r") as tempFile:
        for line in tempFile:
            sys.stdout.write(line)

    runShellCommand("rm -f %s %s" % (tempPath, tempPath2))
    cleanBedTool(tempBedToolPath)
def main(argv=None):
    """ Command-line entry point: turn a raw tracks XML into a cleaned one.

    Parses the command line, makes sure the cleaned-track output directory
    exists, then calls runCleaning(), runTsd() and runScaling() in that
    order, each with the parsed args and the path of a temporary tracks XML.

    argv: full argument vector including the program name (defaults to
          sys.argv).  Everything after the first element is parsed.

    Raises RuntimeError if args.cleanTrackPath can neither be found nor
    created.  The temporary tracks XML and the BedTool temp path are
    cleaned up even when one of the stages raises.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Generate HMM-usable tracklist from raw tracklist. EX "
        "used to transform mustang_alyrata_tracks.xml -> "
        "mustang_alyrata_clean.xml. Runs cleanRM.py cleanLtrFinder.py and "
        " cleanTermini.py and addTsdTrack.py and setTrackScaling.py (also runs "
        " removeBedOverlaps.py before each of the clean scripts)")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("allBed", help="Bed file spanning entire genome")
    parser.add_argument("cleanTrackPath", help="Directory to write cleaned BED"
                        " tracks to")
    parser.add_argument("outTracksInfo", help="Path to write modified tracks XML"
                        " to.")
    parser.add_argument("--numBins", help="Maximum number of bins after scaling",
                        default=10, type=int)
    parser.add_argument("--scaleTracks", help="Comma-separated list of tracks "
                        "to process for scaling. If not set, all"
                        " tracks listed as having a multinomial distribution"
                        " (since this is the default value, this includes "
                        "tracks with no distribution attribute) or gaussian "
                        "distribution will be processed.", default=None)
    parser.add_argument("--skipScale", help="Comma-separated list of tracks to "
                        "skip for scaling.", default=None)
    parser.add_argument("--ltr_termini",
                        help="Name of termini track (appy tsd)",
                        default="ltr_termini")
    parser.add_argument("--repeat_modeler",
                        help="Name of repeat_modeler track (appy tsd)",
                        default="repeat_modeler")
    parser.add_argument("--sequence", help="Name of fasta sequence track",
                        default="sequence")
    parser.add_argument(
        "--tsd",
        help="Name of tsd track to generate (appy cleanTermini.py)",
        default="tsd")
    parser.add_argument(
        "--tir",
        help="Name of tir_termini track (appy cleanTermini.py)",
        default="tir_termini")
    parser.add_argument("--noScale", help="Dont do any scaling", default=False,
                        action="store_true")
    parser.add_argument("--noTsd", help="Dont generate TSD track. NOTE:"
                        " TSD track is hardcoded to be generated from "
                        "termini and (non-LTR elements of ) chaux",
                        default=False, action="store_true")
    parser.add_argument("--numProc",
                        help="Number of processes to use for tsdFinder.py",
                        default=1, type=int)

    addLoggingOptions(parser)
    # honour an explicitly supplied argv instead of silently re-reading
    # sys.argv (identical behaviour when argv was left as None)
    args = parser.parse_args(argv[1:])
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    try:
        # logging options in command-line form, stored on args
        # (presumably forwarded to the sub-scripts -- not visible here)
        args.logOpString = "--logLevel %s" % getLogLevelString()
        if args.logFile is not None:
            args.logOpString += " --logFile %s" % args.logFile

        # best-effort create: an existing directory is fine, and any other
        # failure is reported by the isdir check just below
        try:
            os.makedirs(args.cleanTrackPath)
        except OSError:
            pass
        if not os.path.isdir(args.cleanTrackPath):
            raise RuntimeError("Unable to find or create cleanTrack dir %s" %
                               args.cleanTrackPath)

        # NOTE(review): other getLocalTempPath callers pass a dotted
        # extension (".bed"); "xml" is kept as-is here -- confirm intent
        tempTracksInfo = getLocalTempPath("Temp_mustang_alyrata_clean", "xml")
        try:
            # runCleaning writes the intermediate tracks XML that the
            # following stages are handed
            runCleaning(args, tempTracksInfo)
            assert os.path.isfile(tempTracksInfo)
            runTsd(args, tempTracksInfo)
            runScaling(args, tempTracksInfo)
        finally:
            # never leave the intermediate XML behind, even on failure
            runShellCommand("rm -f %s" % tempTracksInfo)
    finally:
        cleanBedTool(tempBedToolPath)
def parallelDispatch(argv, args):
    """ Chunk up the input by chromosome, recursively launch one job per
    chunk, then merge the results.

    For every interval read from args.chroms the original command line
    (argv) is copied, its "--chrom" option and value are blanked out, and
    its input/output paths are redirected to temporary files:

      * token 2 becomes the intersection of args.allBed with the interval
      * token 3 becomes a temporary output BED
      * "--co" is set (or appended) to the running offset
      * the value after "--stats" becomes a temporary file when args.stats
        is set

    Intervals whose intersection with args.allBed is empty are skipped.
    The jobs are run with runParallelShellCommands using args.proc, after
    which the per-job outputs are concatenated, in job order, into
    args.outBed (and args.stats), and all temporary files are removed.

    argv: argument vector of the current invocation; tokens 2 and 3 are
          overwritten, so it is assumed they hold the input and output BED
          paths (verify against the caller's parser)
    args: parsed options; reads chroms, co, allBed, outBed, stats, proc
    """
    jobList = []
    chromIntervals = readBedIntervals(args.chroms, sort=True)
    chromFiles = []
    regionFiles = []
    segFiles = []
    statsFiles = []
    offset = args.co
    for chrom in chromIntervals:
        cmdToks = copy.deepcopy(argv)
        # blank (rather than delete) the option so positional indices
        # 2 and 3 used below are not shifted
        # NOTE(review): the flag is matched literally as "--chrom" while
        # its value is read from args.chroms -- confirm the spelling
        # agrees with the parser
        cmdToks[cmdToks.index("--chrom") + 1] = ""
        cmdToks[cmdToks.index("--chrom")] = ""

        chromPath = getLocalTempPath("TempChromPath", ".bed")
        with open(chromPath, "w") as cpFile:
            cpFile.write("%s\t%d\t%d\t0\t0\t.\n" % (chrom[0], chrom[1],
                                                    chrom[2]))
        regionPath = getLocalTempPath("Temp", ".bed")
        # register both temp files for deletion right away so they are
        # also removed when this chromosome is skipped below
        chromFiles.append(chromPath)
        regionFiles.append(regionPath)

        runShellCommand("intersectBed -a %s -b %s | sortBed > %s" % (
            args.allBed, chromPath, regionPath))
        if os.path.getsize(regionPath) < 2:
            # nothing from allBed falls inside this interval: no job
            continue

        # the offset is advanced by this interval's length before it is
        # handed to the interval's own job
        offset += int(chrom[2]) - int(chrom[1])
        cmdToks[2] = regionPath

        segPath = getLocalTempPath("Temp", ".bed")
        cmdToks[3] = segPath
        segFiles.append(segPath)

        if "--co" in cmdToks:
            cmdToks[cmdToks.index("--co") + 1] = str(offset)
        else:
            cmdToks.append("--co")
            cmdToks.append(str(offset))

        if args.stats is not None:
            statsPath = getLocalTempPath("Temp", ".bed")
            cmdToks[cmdToks.index("--stats") + 1] = statsPath
            statsFiles.append(statsPath)

        jobList.append(" ".join(cmdToks))

    runParallelShellCommands(jobList, args.proc)

    # segFiles (and statsFiles, when used) hold exactly one entry per job,
    # in job order; the first job truncates the outputs, the rest append
    for i, segPath in enumerate(segFiles):
        ct = ">" if i == 0 else ">>"
        runShellCommand("cat %s %s %s" % (segPath, ct, args.outBed))
        if len(statsFiles) > 0:
            runShellCommand("cat %s %s %s" % (statsFiles[i], ct, args.stats))

    for tempPath in itertools.chain(chromFiles, regionFiles, segFiles,
                                    statsFiles):
        runShellCommand("rm %s" % tempPath)