def cutOutMaskIntervals(inBed, minLength, maxLength, tracksInfoPath):
    """ Subtract length-filtered mask intervals from inBed.

    Rationale: rather than simply ignoring, say, giant stretches of N's
    (like centromeres) by masking them, it can make more sense to remove
    them entirely, splitting the genome into multiple chunks.  Can also be
    used during comparison to get rid of all masked intervals.

    The first three columns of every mask track in tracksInfoPath are
    pooled, sorted and merged, passed through filterBedLengths.py with the
    bounds (minLength + 1, maxLength - 1), and the result is subtracted
    from a copy of inBed.

    Returns None if the track list contains no mask tracks, otherwise the
    path of a new temporary BED file (not deleted here).  Raises
    RuntimeError if the resulting file is empty. """
    resultBed = getLocalTempPath("Tempcut", ".bed")
    maskBeds = []
    for maskTrack in TrackList(tracksInfoPath).getMaskTracks():
        maskBeds.append(maskTrack.getPath())
    if not maskBeds:
        return None

    scratchA = getLocalTempPath("Tempcut1", ".bed")
    scratchB = getLocalTempPath("Tempcut2", ".bed")
    runShellCommand("cp %s %s" % (inBed, resultBed))

    # pool chrom/start/end of all mask tracks into a single file
    for maskBed in maskBeds:
        poolCmd = "cat %s | awk \'{print $1\"\t\"$2\"\t\"$3}\' >> %s" % (
            maskBed, scratchA)
        runShellCommand(poolCmd)

    if os.path.getsize(scratchA) > 0:
        # merge, length-filter, then cut the survivors out of the input
        pipeline = [
            "sortBed -i %s > %s ; mergeBed -i %s > %s" % (
                scratchA, scratchB, scratchB, scratchA),
            "filterBedLengths.py %s %d %d > %s" % (
                scratchA, minLength + 1, maxLength - 1, scratchB),
            "subtractBed -a %s -b %s | sortBed > %s" % (
                resultBed, scratchB, scratchA),
            "mv %s %s" % (scratchA, resultBed),
        ]
        for step in pipeline:
            runShellCommand(step)

    runShellCommand("rm -f %s %s" % (scratchA, scratchB))

    if os.path.getsize(resultBed) == 0:
        raise RuntimeError(
            "cutOutMaskIntervals removed everything. Can't continue."
            " probably best to rerun calling script on bigger region?")
    return resultBed
def filterCutTrack(genomePath, fragmentFilterLen, trackListPath, cutTrackName,
                   cutTrackLenFilter):
    """ Build a length-filtered copy of the cut track and return its path.

    The track named cutTrackName is looked up in trackListPath and run
    through filterBedLengths.py with cutTrackLenFilter.  The filtered track
    is subtracted from genomePath; the remaining fragments are passed
    through filterBedLengths.py with fragmentFilterLen and a random rename
    tag, and only lines carrying that tag are kept.  Those fragments are
    added to the cut track (columns 3-5 set to N, 0 and .), and the result
    is sorted and merged back into the output path.

    Note: the output directory is read from the module-level name outDir. """
    trackList = TrackList(trackListPath)
    cutTrack = trackList.getTrackByName(cutTrackName)
    assert cutTrack is not None
    sourceBed = cutTrack.getPath()
    filteredBed = getOutPath(sourceBed, outDir,
                             "filter%d" % cutTrackLenFilter)
    runShellCommand("filterBedLengths.py %s %s > %s" % (
        sourceBed, cutTrackLenFilter, filteredBed))

    # what is left of the genome once the filtered cut track is removed
    scratchA = getLocalTempPath("Temp", ".bed")
    runShellCommand("subtractBed -a %s -b %s | sortBed > %s" % (
        genomePath, filteredBed, scratchA))

    # long random tag: grep then keeps only the lines that were renamed
    scratchB = getLocalTempPath("Temp", ".bed")
    alphabet = string.ascii_uppercase + string.digits
    tagChars = [random.choice(alphabet) for _ in range(200)]
    tag = ''.join(tagChars)
    runShellCommand(
        "filterBedLengths.py %s %d --rename %s |grep %s | sortBed> %s" % (
            scratchA, fragmentFilterLen, tag, tag, scratchB))

    # normalise columns of both inputs and pool them in scratchA
    runShellCommand(
        "cat %s | setBedCol.py 3 N | setBedCol.py 4 0 | setBedCol.py 5 . > %s" % (
            scratchB, scratchA))
    runShellCommand(
        "cat %s | setBedCol.py 3 N | setBedCol.py 4 0 | setBedCol.py 5 . >> %s" % (
            filteredBed, scratchA))

    runShellCommand("sortBed -i %s > %s" % (scratchA, scratchB))
    runShellCommand("mergeBed -i %s > %s" % (scratchB, filteredBed))
    runShellCommand("rm -f %s %s" % (scratchA, scratchB))
    return filteredBed
def cutOutMaskIntervals(inBed, minLength, maxLength, tracksInfoPath):
    """ Cut mask-track intervals out of inBed, returning a new BED path.

    Instead of ignoring huge masked stretches (e.g. centromeric N's) the
    way ordinary masking would, they are removed outright so the genome is
    split into several chunks.  The same routine can be used before a
    comparison to drop every masked interval.

    Steps: copy inBed to a temp file; append columns 1-3 of each mask
    track to a pool; if the pool is non-empty, sort + merge it, run
    filterBedLengths.py with (minLength + 1, maxLength - 1), subtract the
    output from the copy and sort.

    Returns None when tracksInfoPath has no mask tracks.  Raises
    RuntimeError when the output file ends up empty. """
    outPath = getLocalTempPath("Tempcut", ".bed")
    maskTracks = TrackList(tracksInfoPath).getMaskTracks()
    maskPaths = [maskTrack.getPath() for maskTrack in maskTracks]
    if len(maskPaths) < 1:
        return None

    pooled = getLocalTempPath("Tempcut1", ".bed")
    spare = getLocalTempPath("Tempcut2", ".bed")

    runShellCommand("cp %s %s" % (inBed, outPath))
    for path in maskPaths:
        runShellCommand(
            "cat %s | awk \'{print $1\"\t\"$2\"\t\"$3}\' >> %s" % (path, pooled))

    havePooledIntervals = os.path.getsize(pooled) > 0
    if havePooledIntervals:
        mergeCmd = "sortBed -i %s > %s ; mergeBed -i %s > %s" % (
            pooled, spare, spare, pooled)
        runShellCommand(mergeCmd)

        lowerBound = minLength + 1
        upperBound = maxLength - 1
        filterCmd = "filterBedLengths.py %s %d %d > %s" % (
            pooled, lowerBound, upperBound, spare)
        runShellCommand(filterCmd)

        subtractCmd = "subtractBed -a %s -b %s | sortBed > %s" % (
            outPath, spare, pooled)
        runShellCommand(subtractCmd)

        # the subtraction result becomes the output
        runShellCommand("mv %s %s" % (pooled, outPath))

    runShellCommand("rm -f %s %s" % (pooled, spare))

    if os.path.getsize(outPath) != 0:
        return outPath
    raise RuntimeError("cutOutMaskIntervals removed everything. Can't continue."
                       " probably best to rerun calling script on bigger region?")
def filterCutTrack(genomePath, fragmentFilterLen, trackListPath, cutTrackName,
                   cutTrackLenFilter):
    """ Return the path of a length-filtered version of the cut track.

    cutTrackName must exist in the track list at trackListPath (asserted).
    The output path comes from getOutPath() using the module-level outDir
    and the suffix "filter<cutTrackLenFilter>".  After the initial
    filterBedLengths.py pass, genome fragments left between cut intervals
    are filtered with fragmentFilterLen (only lines tagged by --rename are
    kept) and folded back into the cut track via sortBed / mergeBed. """
    cutTrack = TrackList(trackListPath).getTrackByName(cutTrackName)
    assert cutTrack is not None

    originalPath = cutTrack.getPath()
    outPath = getOutPath(originalPath, outDir, "filter%d" % cutTrackLenFilter)

    def sh(template, *values):
        # format a shell command template and execute it
        runShellCommand(template % values)

    sh("filterBedLengths.py %s %s > %s",
       originalPath, cutTrackLenFilter, outPath)

    fragments = getLocalTempPath("Temp", ".bed")
    sh("subtractBed -a %s -b %s | sortBed > %s",
       genomePath, outPath, fragments)

    kept = getLocalTempPath("Temp", ".bed")
    # random marker used to recognise the lines renamed by the filter
    symbols = string.ascii_uppercase + string.digits
    marker = ''
    for _ in range(200):
        marker += random.choice(symbols)
    sh("filterBedLengths.py %s %d --rename %s |grep %s | sortBed> %s",
       fragments, fragmentFilterLen, marker, marker, kept)

    # overwrite fragments with normalised kept lines, then append cut track
    sh("cat %s | setBedCol.py 3 N | setBedCol.py 4 0 | setBedCol.py 5 . > %s",
       kept, fragments)
    sh("cat %s | setBedCol.py 3 N | setBedCol.py 4 0 | setBedCol.py 5 . >> %s",
       outPath, fragments)

    sh("sortBed -i %s > %s", fragments, kept)
    sh("mergeBed -i %s > %s", kept, outPath)
    sh("rm -f %s %s", fragments, kept)
    return outPath
def runCleaning(args, tempTracksInfo):
    """ Run the cleaning script selected by each track's preprocess value.

    For every track in args.tracksInfo whose preprocess attribute is set:
      * "rm" / "rmu"  -> cleanRM.py (with --keepUnderscore for "rmu")
                         followed by removeBedOverlaps.py --rm
      * "termini"     -> cleanTermini.py
      * "overlap"     -> removeBedOverlaps.py
      * "ltr_finder"  -> cleanLtrFinderID.py
    Tracks whose path ends in .bb / .bw are first converted to a temporary
    BED (bigBedToBed / bigWigToBedGraph) and that BED is what the cleaning
    script reads.  The track's path is then pointed at the cleaned output
    (as named by cleanPath) and the modified track list is saved to
    tempTracksInfo.

    args           -- parsed options; args.tracksInfo is read here and args
                      is forwarded to cleanPath()
    tempTracksInfo -- path the updated track list XML is written to
    """
    trackList = TrackList(args.tracksInfo)

    for track in trackList:
        preprocess = track.getPreprocess()
        if preprocess is None:
            continue

        # convert bigbed/wig; inFile then refers to the converted BED and
        # must not be reset to track.getPath() by the branches below
        inFile = track.getPath()
        tempBed1 = None
        if inFile[-3:] == ".bb" or inFile[-3:] == ".bw":
            tempBed1 = getLocalTempPath("Temp_%s" % track.getName(), ".bed")
            if inFile[-3:] == ".bb":
                runShellCommand("bigBedToBed %s %s" % (inFile, tempBed1))
            else:
                runShellCommand("bigWigToBedGraph %s %s" % (inFile, tempBed1))
            inFile = tempBed1

        # run cleanRM.py on all tracks with rm or rmu preprocessor
        if preprocess == "rm" or preprocess == "rmu":
            flag = ""
            if preprocess == "rmu":
                # assignment (the option was previously never passed)
                flag = "--keepUnderscore"
            outFile = cleanPath(args, track)
            tempBed = getLocalTempPath("Temp_%s" % track.getName(), ".bed")
            runShellCommand("cleanRM.py %s %s > %s" % (inFile, flag, tempBed))
            runShellCommand("removeBedOverlaps.py --rm %s > %s" % (tempBed,
                                                                    outFile))
            runShellCommand("rm -f %s" % tempBed)
            track.setPath(outFile)

        # run cleanTermini.py
        elif preprocess == "termini":
            outFile = cleanPath(args, track)
            runShellCommand("cleanTermini.py %s %s" % (inFile, outFile))
            track.setPath(outFile)

        # run removeBedOverlaps
        elif preprocess == "overlap":
            outFile = cleanPath(args, track)
            runShellCommand("removeBedOverlaps.py %s > %s" % (inFile, outFile))
            track.setPath(outFile)

        # run cleanLtrFinder.py
        elif preprocess == "ltr_finder":
            outFile = cleanPath(args, track)
            # note: overlaps now removed in cleanLtrFinderID script
            runShellCommand("cleanLtrFinderID.py %s %s" % (inFile, outFile))
            track.setPath(outFile)

        # the converted BED is only needed while the cleaning script runs
        if tempBed1 is not None:
            runShellCommand("rm -f %s" % tempBed1)

    # save a temporary xml
    trackList.saveXML(tempTracksInfo)
def main(argv=None):
    """ Command-line entry point: annotate a BED file from one track.

    Reads the track named by the "track" argument from the tracks info
    file, loads it over the (gap-filled, merged) intervals of inBed and
    writes the annotated intervals to outBed via writeAnnotatedIntervals.
    With --name the ID column is set instead of the score column (the
    option is only forwarded here, through args).

    Raises RuntimeError if the requested track is not in the tracks list.
    The temporary single-track list, the output file handle and the
    bedtools temp directory are released even when an error occurs.

    NOTE: options are parsed from the process command line by
    parser.parse_args(); the argv parameter is accepted but not used for
    parsing.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Set the score column of each bed interval in input to "
        "(MODE, BINNED) average value of the intersection region in another track). "
        "Can be used, for instance, to assign a copy number of each RepeatModeler "
        "prediction...")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("inBed", help="BED file to annotate")
    parser.add_argument("track", help="Track to use for annotation")
    parser.add_argument("outBed", help="Path for output, annotated BED file")
    parser.add_argument("--name", help="Set ID field (column 4 instead of 5)",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    singleListPath = None
    try:
        # read the tracks list
        trackList = TrackList(args.tracksInfo)
        track = trackList.getTrackByName(args.track)
        if track is None:
            raise RuntimeError("Can't find track %s" % args.track)

        # make temporary tracks list with just our track so we can keep
        # using the tracks list interface without reading everything else
        singleListPath = getLocalTempPath("Temp_secScore", ".bed")
        trackList.trackList = [track]
        trackList.saveXML(singleListPath)

        # the output is opened before the input is read, as before
        with open(args.outBed, "w") as obFile:
            # trackData interface not so great at cherry picking intervals.
            # need to merge them up and use segmentation interface
            filledIntervals, mergedIntervals = fillGaps(args.inBed)

            # read track into trackData
            trackData = TrackData()
            logger.info("loading track %s" % singleListPath)
            trackData.loadTrackData(singleListPath, mergedIntervals,
                                    segmentIntervals=filledIntervals,
                                    applyMasking=False)

            # finally, write the annotation
            writeAnnotatedIntervals(trackData, filledIntervals,
                                    mergedIntervals, obFile, args)
    finally:
        # always drop the temporaries, also on failure
        if singleListPath is not None:
            runShellCommand("rm -f %s" % singleListPath)
        cleanBedTool(tempBedToolPath)
def subsetTrackList(trackList, sizeRange, mandTracks):
    """ Generate track lists for all combinations of tracks in the input.

    trackList  -- source TrackList; tracks are deep-copied into each
                  generated list, so the input is not modified
    sizeRange  -- (min, max) pair used as range(min, max), i.e. max is
                  exclusive; max is clamped to len(trackList) + 1.  None
                  means every size from 1 to len(trackList).  min must be
                  greater than 0 (asserted).
    mandTracks -- collection of track names; only combinations containing
                  all of them are yielded.  None is treated as empty.

    So, for example, given input list [t1, t2, t3] and sizeRange=None this
    will generate
    [t1] [t2] [t3] [t1,t2] [t1,t3] [t2,t3] [t1,t2,t3]

    Yields a new TrackList per accepted combination.
    """
    numTracks = len(trackList)
    if sizeRange is None:
        # documented default: try every subset size
        sizeRange = (1, numTracks + 1)
    if mandTracks is None:
        mandTracks = ()
    assert sizeRange[0] > 0
    sizeRange = (sizeRange[0], min(sizeRange[1], numTracks + 1))

    for outLen in range(*sizeRange):
        for perm in itertools.combinations(range(numTracks), outLen):
            permList = TrackList()
            mandFound = 0
            for trackNo in perm:
                track = copy.deepcopy(trackList.getTrackByNumber(trackNo))
                permList.addTrack(track)
                if track.getName() in mandTracks:
                    mandFound += 1
            # keep the combination only if every mandatory track is in it
            if mandFound == len(mandTracks):
                yield permList
def main(argv=None):
    """ Command-line entry point: write one benchmark summary row.

    Loads the full and the row-local track lists, obtains a (header, row)
    pair from scrapeRow() and writes both to outRow as two comma-separated
    lines (header first).

    NOTE: options come from parser.parse_args(), i.e. the process command
    line; argv is accepted but not used for parsing.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Make benchmark summary row. Called from within "
        "teHmmBenchmark.py")

    positionals = [
        ("tracksInfo", "Path of Tracks Info file that"
         " teHmmBenchmark.py was run on."),
        ("localTracksInfo", "Path of Tracks Info file for"
         " row (could be a subset of above)"),
        ("evalBed", "Bed file created by teHmmEval. Used"
         " for the Viterbi score in comment at top"),
        ("compBed", "Results of comparison script"),
        ("outRow", "File to write row information to"),
    ]
    for argName, argHelp in positionals:
        parser.add_argument(argName, help=argHelp)

    args = parser.parse_args()

    fullTracks = TrackList(args.tracksInfo)
    rowTracks = TrackList(args.localTracksInfo)
    header, row = scrapeRow(fullTracks, rowTracks, args.evalBed, args.compBed)

    # everything is written as text
    headerLine = ",".join([str(field) for field in header]) + "\n"
    rowLine = ",".join([str(field) for field in row]) + "\n"

    with open(args.outRow, "w") as outFile:
        outFile.write(headerLine)
        outFile.write(rowLine)
def runTrial(tracksList, iteration, newTrackName, args):
    """ Score one candidate set of tracks with teHmmBenchmark.py.

    Everything is written under <args.outDir>/iter<iteration>/
    <newTrackName>_bench.  The track list is saved there, the training and
    truth beds are segmented (or symlinked to the precomputed full
    segmentations when args.fullSegment is set), teHmmBenchmark.py is run,
    and the results are read back with the extract* helpers.

    Segmentation commands that are actually run are logged to
    segment_cmd.txt in the bench directory.

    Returns the tuple (score, bic, naive, slope, rsq); naive is 0 unless
    args.doNaive is True.
    """
    iterDir = os.path.join(args.outDir, "iter%d" % iteration)
    benchDir = os.path.join(iterDir, "%s_bench" % newTrackName)
    if not os.path.exists(benchDir):
        os.makedirs(benchDir)

    trainingPath = args.training
    truthPath = args.truth

    tracksPath = os.path.join(benchDir, "tracks.xml")
    tracksList.saveXML(tracksPath)

    trainStem = os.path.splitext(os.path.basename(trainingPath))[0]
    truthStem = os.path.splitext(os.path.basename(truthPath))[0]

    with open(os.path.join(benchDir, "segment_cmd.txt"), "w") as segLog:
        if args.segTracks != args.tracks:
            # pull out desired tracks from segment tracks XML if specified
            allSegTracks = TrackList(args.segTracks)
            chosenSegTracks = TrackList()
            for track in tracksList:
                match = allSegTracks.getTrackByName(track.getName())
                if match is None:
                    logger.warning("track %s not found in segment tracks %s" % (
                        track.getName(), args.segTracks))
                else:
                    chosenSegTracks.addTrack(match)
            segTracksPath = os.path.join(benchDir, "seg_tracks.xml")
            chosenSegTracks.saveXML(segTracksPath)
        else:
            segTracksPath = tracksPath

        # segment training
        segTrainingPath = os.path.join(benchDir, trainStem + "_trainSeg.bed")
        trainCmd = "segmentTracks.py %s %s %s %s" % (
            segTracksPath, trainingPath, segTrainingPath, args.segOpts)
        if args.fullSegment is False:
            runShellCommand(trainCmd)
            segLog.write(trainCmd + "\n")
        else:
            runShellCommand("ln -f -s %s %s" % (args.fullSegTrainPath,
                                                segTrainingPath))

        # segment eval (just link the training segmentation if same input)
        segEvalPath = os.path.join(benchDir, truthStem + "_evalSeg.bed")
        if trainingPath == truthPath:
            evalCmd = "ln -f -s %s %s" % (os.path.abspath(segTrainingPath),
                                          segEvalPath)
        else:
            evalCmd = "segmentTracks.py %s %s %s %s" % (
                segTracksPath, truthPath, segEvalPath, args.segOpts)
        if args.fullSegment is False:
            runShellCommand(evalCmd)
            segLog.write(evalCmd + "\n")
        else:
            runShellCommand("ln -f -s %s %s" % (args.fullSegEvalPath,
                                                segEvalPath))

    segPathOpts = " --eval %s --truth %s" % (segEvalPath, truthPath)
    runShellCommand("teHmmBenchmark.py %s %s %s %s" % (
        tracksPath, benchDir, segTrainingPath, args.benchOpts + segPathOpts))

    score = extractScore(benchDir, segTrainingPath, args)
    bic = extractBIC(benchDir, segTrainingPath, args)
    if args.doNaive is True:
        naive = extractNaive(tracksPath, benchDir, segTrainingPath, args)
    else:
        naive = 0
    slope, rsq = extractF1ProbSlope(benchDir, segTrainingPath, args)

    # clean up big files?
    return score, bic, naive, slope, rsq
def greedyRank(args):
    """ Greedily grow a track list, adding the best-scoring track each round.

    Starts from the tracks named in args.startTracks (or an empty list)
    and, for every remaining track in args.tracks, runs runTrial() with
    that track appended.  The winner of a round is chosen by BIC (lowest)
    when args.bic is set, by naive score (highest) when args.naive is set,
    otherwise by score (highest); ties fall back to score, or to BIC in
    the default mode.

    Output in args.outDir: ranking.txt (one line per round), <track>.txt
    (per-candidate log) and iter<N>/tracks.xml (ranked list after round N).
    """
    inputTrackList = TrackList(args.tracks)
    rankedTrackList = TrackList()
    if args.startTracks is not None:
        for startName in args.startTracks.split(","):
            startTrack = inputTrackList.getTrackByName(startName)
            if startTrack is not None:
                rankedTrackList.addTrack(copy.deepcopy(startTrack))
            else:
                logger.warning("Start track %s not found in tracks XML" %
                               startName)

    numTracks = len(inputTrackList) - len(rankedTrackList)
    currentScore, currentBIC = 0.0, sys.maxint

    # compute full segmentation if --fullSegment is True
    if args.fullSegment is True:
        args.fullSegTrainPath = os.path.abspath(
            os.path.join(args.outDir, "fullSegTrain.bed"))
        runShellCommand("segmentTracks.py %s %s %s %s" % (
            args.segTracks, args.training, args.fullSegTrainPath,
            args.segOpts))
        args.fullSegEvalPath = os.path.abspath(
            os.path.join(args.outDir, "fullSegEval.bed"))
        runShellCommand("segmentTracks.py %s %s %s %s" % (
            args.segTracks, args.truth, args.fullSegEvalPath, args.segOpts))

    rankPath = os.path.join(args.outDir, "ranking.txt")

    # header
    with open(rankPath, "w") as rankFile:
        rankFile.write("It.\tTrack\tF1\tBIC\tNaiveF1\tAccProbSlop\tAccProbR2\n")

    # baseline score if we not starting from scratch
    baseIt = 0
    if args.startTracks is not None:
        baselineList = copy.deepcopy(rankedTrackList)
        score, bic, naive, slope, rsq = runTrial(baselineList, baseIt,
                                                 "baseline_test", args)
        with open(rankPath, "a") as rankFile:
            rankFile.write("%d\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
                baseIt, args.startTracks, score, bic, naive, slope, rsq))
        baseIt += 1

    for iteration in xrange(baseIt, baseIt + numTracks):
        bestItScore = -sys.maxint
        bestItBic = sys.maxint
        bestItNaive = -sys.maxint
        bestNextTrack = None
        bestSlope = None
        bestR = None

        for nextTrack in inputTrackList:
            # already ranked: not a candidate any more
            if rankedTrackList.getTrackByName(nextTrack.getName()) is not None:
                continue

            candidateList = copy.deepcopy(rankedTrackList)
            candidateList.addTrack(nextTrack)
            score, bic, naive, slope, rsq = runTrial(
                candidateList, iteration, nextTrack.getName(), args)

            if args.bic is True:
                best = bic < bestItBic or (
                    bic == bestItBic and score > bestItScore)
            elif args.naive is True:
                best = naive > bestItNaive or (
                    naive == bestItNaive and score > bestItScore)
            else:
                best = score > bestItScore or (
                    score == bestItScore and bic < bestItBic)

            if best:
                bestItScore = score
                bestItBic = bic
                bestItNaive = naive
                bestSlope = slope
                bestR = rsq
                bestNextTrack = nextTrack

            # per-track log: started fresh on the first ranking round
            mode = "w" if iteration == baseIt else "a"
            trackLogPath = os.path.join(args.outDir,
                                        nextTrack.getName() + ".txt")
            with open(trackLogPath, mode) as trackLogFile:
                trackLogFile.write("%d\t%f\t%f\t%f\t%f\t%f\n" % (
                    iteration, score, bic, naive, slope, rsq))

        rankedTrackList.addTrack(copy.deepcopy(bestNextTrack))
        rankedTrackList.saveXML(os.path.join(args.outDir,
                                             "iter%d" % iteration,
                                             "tracks.xml"))

        with open(rankPath, "a") as rankFile:
            rankFile.write("%d\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
                iteration, bestNextTrack.getName(), bestItScore, bestItBic,
                bestItNaive, bestSlope, bestR))
def main(argv=None):
    """ Command-line entry point: combine non-numeric BED tracks into one.

    The region BED is merged and written to a temporary file; every
    eligible track from the tracks XML is then folded into a temporary
    output with combineTrack(), which finally replaces outBed.  A track is
    skipped (with a warning) when it has shift, scale or logScale set, has
    dist == "gaussian", or does not have a .bed extension.  With --tracks
    only the listed track names are combined.  If no track was combined,
    every region interval is written with the --outside name.

    NOTE: options come from parser.parse_args(), i.e. the process command
    line; argv is accepted but not used.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Combine a bunch of non-numeric BED tracks into"
        " single file using fitStateNames.py to try to keep names "
        "consistent. Idea is to be used as baseline to compare"
        " hmm to (via base-by-base statistics, primarily, since"
        " this procedure could induce some fragmentation)")

    parser.add_argument("tracksXML", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("regionBed", help="BED file representing "
                        "target region (best if whole genome)")
    parser.add_argument("outBed", help="Output bed")
    parser.add_argument("--tracks", help="Comma-separated list of "
                        "track names to use. All tracks will be"
                        " used by default", default=None)
    parser.add_argument("--outside", help="Name to give non-annotated"
                        " regions", default="Outside")
    parser.add_argument("--fitThresh", help="Min map percentage (0,1)"
                        " in order to rename (see --qualThresh option"
                        " of fitStateNames.py)", type=float, default=0.5)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    inputTrackList = TrackList(args.tracksXML)
    numCombined = 0

    # names requested with --tracks (None means: use every track)
    wantedNames = None
    if args.tracks is not None:
        wantedNames = args.tracks.split(",")

    # get regionBed where all intervals are merged when possible
    regionIntervals = getMergedBedIntervals(args.regionBed, sort=True)
    tempRegionPath = getLocalTempPath("Temp", "_reg.bed")
    with open(tempRegionPath, "w") as tempRegionFile:
        for interval in regionIntervals:
            tempRegionFile.write("\t".join([str(x) for x in interval]) + "\n")

    # accumulate tracks in temp file
    tempOutPath = getLocalTempPath("Temp", "_out.bed")
    for track in inputTrackList:
        if track.shift is not None or track.scale is not None or\
           track.logScale is not None or track.dist == "gaussian" or\
           os.path.splitext(track.getPath())[1].lower() != ".bed":
            logger.warning("Skipping numeric track %s" % track.getName())
        elif wantedNames is None or track.getName() in wantedNames:
            # the running count doubles as the iteration index passed on
            combineTrack(track, tempOutPath, tempRegionPath, numCombined, args)
            numCombined += 1

    # nothing got written, make everything outside
    if numCombined == 0:
        with open(tempOutPath, "w") as tempOutFile:
            for interval in regionIntervals:
                tempOutFile.write("%s\t%s\t%s\t%s\n" % (interval[0],
                                                        interval[1],
                                                        interval[2],
                                                        args.outside))

    runShellCommand("mv %s %s" % (tempOutPath, args.outBed))
    runShellCommand("rm -f %s" % (tempRegionPath))
    cleanBedTool(tempBedToolPath)
def main(argv=None):
    """ Command-line entry point: train, evaluate, then compare an HMM.

    For every track combination produced by subsetTrackList() and every
    input bed, one shell command line is assembled that chains
    teHmmTrain.py (or "ls" of an existing --mod model), teHmmView.py,
    teHmmEval.py, optionally fitStateNames.py, compareBedStates.py and
    scrapeBenchmarkRow.py.  Each command is saved to <bed>_cmd.txt in its
    output directory; all commands are then run through
    runParallelShellCommands() with --numProc, and the per-bed row files
    are combined by writeTables().

    Training options given here are forwarded to teHmmTrain.py, including
    --fixTrans; --forceEmProbs is forwarded once.

    Raises RuntimeError if --supervised and --segment are both given.

    NOTE: options come from parser.parse_args(), i.e. the process command
    line; argv is only used to record the command in teHmmBenchmark_cmd.txt.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Train, evalaute, then compare hmm model on input")

    parser.add_argument("trainingTracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks used "
                        "for training")
    parser.add_argument("outputDir", help="directory to write output")
    parser.add_argument("inBeds", nargs="*", help="list of training beds")
    parser.add_argument("--evalTracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks used"
                        " for evaluation (only need if different from"
                        " trainingTracksInfo", default=None)
    parser.add_argument("--numProc", help="Max number of processors to use",
                        type=int, default=1)
    parser.add_argument("--allTrackCombinations", help="Rerun with all"
                        " possible combinations of tracks from the input"
                        " tracksInfo file. Note that this number gets big"
                        " pretty fast.", action="store_true", default=False)
    parser.add_argument("--emStates", help="By default the supervised mode"
                        " of teHmmTrain is activated. This option overrides"
                        " that and uses the EM mode and the given number of "
                        "states instead", type=int, default=None)
    parser.add_argument("--cross", help="Do 50/50 cross validation by training"
                        " on first half input and validating on second",
                        action="store_true", default=False)
    parser.add_argument("--emFac", help="Normalization factor for weighting"
                        " emission probabilities because when there are "
                        "many tracks, the transition probabilities can get "
                        "totally lost. 0 = no normalization. 1 ="
                        " divide by number of tracks. k = divide by number "
                        "of tracks / k", type=int, default=0)
    parser.add_argument("--mod", help="Path to trained model. This will "
                        "bypass the training phase that would normally be done"
                        " and just skip to the evaluation. Note that the user"
                        " must make sure that the trained model has the "
                        "states required to process the input data",
                        default=None)
    parser.add_argument("--iter", help="Number of EM iterations. Needs to be"
                        " used in conjunction with --emStates to specify EM"
                        " training", type=int, default=None)
    parser.add_argument("--initTransProbs", help="Path of text file where each "
                        "line has three entries: FromState ToState Probability"
                        ". This file (all other transitions get probability 0)"
                        " is used to specifiy the initial transition model."
                        " The names and number of states will be initialized "
                        "according to this file (overriding --numStates)",
                        default=None)
    parser.add_argument("--fixTrans", help="Do not learn transition parameters"
                        " (best used with --initTransProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initEmProbs", help="Path of text file where each "
                        "line has four entries: State Track Symbol Probability"
                        ". This file (all other emissions get probability 0)"
                        " is used to specifiy the initial emission model. All "
                        "states specified in this file must appear in the file"
                        " specified with --initTransProbs (but not vice versa).",
                        default=None)
    parser.add_argument("--fixEm", help="Do not learn emission parameters"
                        " (best used with --initEmProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initStartProbs", help="Path of text file where each "
                        "line has two entries: State Probability"
                        ". This file (all other start probs get probability 0)"
                        " is used to specifiy the initial start dist. All "
                        "states specified in this file must appear in the file"
                        " specified with --initTransProbs (but not vice versa).",
                        default=None)
    parser.add_argument("--fixStart", help="Do not learn start parameters"
                        " (best used with --initStartProbs)",
                        action="store_true", default=False)
    parser.add_argument("--forceTransProbs",
                        help="Path of text file where each "
                        "line has three entries: FromState ToState Probability"
                        ". These transition probabilities will override any "
                        " learned probabilities after training (unspecified "
                        "will not be set to 0 in this case. the learned values"
                        " will be kept, but normalized as needed",
                        default=None)
    parser.add_argument("--forceEmProbs", help="Path of text file where each "
                        "line has four entries: State Track Symbol Probability"
                        ". These "
                        "emission probabilities will override any learned"
                        " probabilities after training (unspecified "
                        "will not be set to 0 in this case. the learned values"
                        " will be kept, but normalized as needed.",
                        default=None)
    parser.add_argument("--flatEm", help="Use a flat emission distribution as "
                        "a baseline. If not specified, the initial emission "
                        "distribution will be randomized by default. Emission"
                        " probabilities specified with --initEmpProbs or "
                        "--forceEmProbs will never be affected by randomizaiton"
                        ". The randomization is important for Baum Welch "
                        "training, since if two states dont have at least one"
                        " different emission or transition probability to begin"
                        " with, they will never learn to be different.",
                        action="store_true", default=False)
    parser.add_argument("--emRandRange", help="When randomly initialzing a"
                        " multinomial emission distribution, constrain"
                        " the values to the given range (pair of "
                        "comma-separated numbers). Overridden by "
                        "--initEmProbs and --forceEmProbs when applicable."
                        " Completely overridden by --flatEm (which is equivalent"
                        " to --emRandRange .5,.5.). Actual values used will"
                        " always be normalized.", default=None)
    parser.add_argument("--mandTracks", help="Mandatory track names for use "
                        "with --allTrackCombinations in comma-separated list",
                        default=None)
    parser.add_argument("--combinationRange", help="in form MIN,MAX: Only "
                        "explore track combination in given (closed) range. "
                        "A more refined version of --allTrackCombinations.",
                        default=None)
    parser.add_argument("--supervised", help="Use name (4th) column of "
                        "<traingingBed> for the true hidden states of the"
                        " model. Transition parameters will be estimated"
                        " directly from this information rather than EM."
                        " NOTE: The number of states will be determined "
                        "from the bed.",
                        action="store_true", default=False)
    parser.add_argument("--segment", help="Input bed files are also used to "
                        "segment data. Ie teHmmTrain is called with --segment"
                        " set to the input file. Not currently working with "
                        " --supervised",
                        action="store_true", default=False)
    parser.add_argument("--segLen", help="Effective segment length used for"
                        " normalizing input segments (specifying 0 means no"
                        " normalization applied) in training", type=int,
                        default=None)
    parser.add_argument("--truth", help="Use specifed file instead of "
                        "input file(s) for truth comparison. Makes sense"
                        " when --segment is specified and only one input"
                        " bed specified", default=None)
    parser.add_argument("--eval", help="Bed file used for evaluation. It should"
                        " cover same region in same order as --truth. Option "
                        "exists mostly to specify segmentation of --truth",
                        default=None)
    parser.add_argument("--seed", help="Seed for random number generator"
                        " which will be used to initialize emissions "
                        "(if --flatEM and --supervised not specified)",
                        default=None, type=int)
    parser.add_argument("--reps", help="Number of training replicates (with "
                        " different"
                        " random initializations) to run. The replicate"
                        " with the highest likelihood will be chosen for the"
                        " output", default=None, type=int)
    parser.add_argument("--numThreads", help="Number of threads to use when"
                        " running training replicates (see --rep) in parallel.",
                        type=int, default=None)
    parser.add_argument("--emThresh", help="Threshold used for convergence"
                        " in baum welch training. IE delta log likelihood"
                        " must be bigger than this number (which should be"
                        " positive) for convergence", type=float,
                        default=None)
    parser.add_argument("--fit", help="Run fitStateNames.py to automap names"
                        " before running comparison", action="store_true",
                        default=False)
    parser.add_argument("--fitOpts", help="Options to pass to fitStateNames.py"
                        " (only effective if used with --fit)", default=None)
    parser.add_argument("--saveAllReps", help="Save all replicates (--reps)"
                        " models to disk, instead of just the best one"
                        ". Format is <outputModel>.repN. There will be "
                        " --reps -1 such models saved as the best output"
                        " counts as a replicate. Comparison statistics"
                        " will be generated for each rep.",
                        action="store_true", default=False)
    parser.add_argument("--maxProb", help="Gaussian distributions and/or"
                        " segment length corrections can cause probability"
                        " to *decrease* during BW iteration. Use this option"
                        " to remember the parameters with the highest probability"
                        " rather than returning the parameters after the final "
                        "iteration.", action="store_true",
                        default=False)
    parser.add_argument("--maxProbCut", help="Use with --maxProb option to stop"
                        " training if a given number of iterations go by without"
                        " hitting a new maxProb", default=None, type=int)
    parser.add_argument("--transMatEpsilons", help="By default, epsilons are"
                        " added to all transition probabilities to prevent "
                        "converging on 0 due to rounding error only for fully"
                        " unsupervised training. Use this option to force this"
                        " behaviour for supervised and semisupervised modes",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    logOps = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        logOps += " --logFile %s" % args.logFile

    if not os.path.exists(args.outputDir):
        os.makedirs(args.outputDir)
    if args.evalTracksInfo is None:
        args.evalTracksInfo = args.trainingTracksInfo

    trainingTrackList = TrackList(args.trainingTracksInfo)
    evalTrackList = TrackList(args.evalTracksInfo)
    checkTrackListCompatible(trainingTrackList, evalTrackList)

    # default: a single "combination" made of all the tracks
    sizeRange = (len(trainingTrackList), len(trainingTrackList) + 1)
    if args.allTrackCombinations is True:
        sizeRange = (1, len(trainingTrackList) + 1)
    if args.combinationRange is not None:
        # MIN,MAX is closed on the command line, half-open internally
        toks = args.combinationRange.split(",")
        sizeRange = int(toks[0]), int(toks[1]) + 1
        logger.debug("manual range (%d, %d) " % sizeRange)
    mandTracks = set()
    if args.mandTracks is not None:
        mandTracks = set(args.mandTracks.split(","))
        logger.debug("mandatory set %s" % str(mandTracks))

    # options forwarded to teHmmTrain.py
    trainFlags = ""
    if args.emStates is not None:
        trainFlags += " --numStates %d" % args.emStates
    if args.supervised is True:
        trainFlags += " --supervised"
        if args.segment is True:
            raise RuntimeError("--supervised not currently compatible with "
                               "--segment")
    trainFlags += " --emFac %d" % args.emFac
    if args.iter is not None:
        assert args.emStates is not None or args.initTransProbs is not None
        trainFlags += " --iter %d" % args.iter
    if args.initTransProbs is not None:
        trainFlags += " --initTransProbs %s" % args.initTransProbs
    if args.fixTrans is True:
        # this option used to be parsed but never passed on
        trainFlags += " --fixTrans"
    if args.initEmProbs is not None:
        trainFlags += " --initEmProbs %s" % args.initEmProbs
    if args.fixEm is True:
        trainFlags += " --fixEm"
    if args.initStartProbs is not None:
        trainFlags += " --initStartProbs %s" % args.initStartProbs
    if args.fixStart is True:
        trainFlags += " --fixStart"
    if args.forceTransProbs is not None:
        trainFlags += " --forceTransProbs %s" % args.forceTransProbs
    if args.forceEmProbs is not None:
        trainFlags += " --forceEmProbs %s" % args.forceEmProbs
    if args.flatEm is True:
        trainFlags += " --flatEm"
    if args.emRandRange is not None:
        trainFlags += " --emRandRange %s" % args.emRandRange
    if args.segLen is not None:
        trainFlags += " --segLen %d" % args.segLen
    if args.seed is not None:
        trainFlags += " --seed %d" % args.seed
    if args.reps is not None:
        trainFlags += " --reps %d" % args.reps
    if args.numThreads is not None:
        trainFlags += " --numThreads %d" % args.numThreads
    if args.emThresh is not None:
        trainFlags += " --emThresh %f" % args.emThresh
    if args.saveAllReps is True:
        trainFlags += " --saveAllReps"
    if args.maxProb is True:
        trainFlags += " --maxProb"
    if args.transMatEpsilons is True:
        trainFlags += " --transMatEpsilons"
    if args.maxProbCut is not None:
        trainFlags += " --maxProbCut %d" % args.maxProbCut

    # write out command line for posterity's sake
    cmdPath = os.path.join(args.outputDir, "teHmmBenchmark_cmd.txt")
    with open(cmdPath, "w") as cmdFile:
        cmdFile.write(" ".join(argv) + "\n")

    #todo: try to get timing for each command
    commands = []
    rows = dict()
    for pn, pList in enumerate(subsetTrackList(trainingTrackList, sizeRange,
                                               mandTracks)):
        # the full track set goes in the top directory, subsets in permN
        if len(pList) == len(trainingTrackList):
            outDir = args.outputDir
        else:
            outDir = os.path.join(args.outputDir, "perm%d" % pn)
        if not os.path.exists(outDir):
            os.makedirs(outDir)
        trainingTrackPath = os.path.join(outDir, "training_tracks.xml")
        evalTrackPath = os.path.join(outDir, "eval_tracks.xml")
        for maskTrack in trainingTrackList.getMaskTracks():
            pList.addTrack(copy.deepcopy(maskTrack))
        pList.saveXML(trainingTrackPath)
        epList = TrackList()
        for track in pList:
            t = copy.deepcopy(evalTrackList.getTrackByName(track.getName()))
            epList.addTrack(t)
        for maskTrack in trainingTrackList.getMaskTracks():
            epList.addTrack(copy.deepcopy(maskTrack))
        epList.saveXML(evalTrackPath)

        for inBed in args.inBeds:
            base = os.path.basename(inBed)
            stem = os.path.splitext(base)[0]
            truthBed = inBed
            testBed = inBed
            if args.cross is True:
                truthBed = os.path.join(outDir, stem + "_truth_temp.bed")
                testBed = os.path.join(outDir, stem + "_test_temp.bed")
                splitBed(inBed, truthBed, testBed)

            # train
            if args.mod is not None:
                modPath = args.mod
                command = "ls %s" % modPath
            else:
                modPath = os.path.join(outDir, stem + ".mod")
                command = "teHmmTrain.py %s %s %s %s %s" % (trainingTrackPath,
                                                            truthBed,
                                                            modPath,
                                                            logOps,
                                                            trainFlags)
                if args.segment is True:
                    command += " --segment %s" % truthBed

            # view
            viewPath = os.path.join(outDir, stem + "_view.txt")
            command += " && teHmmView.py %s > %s" % (modPath, viewPath)

            # evaluate
            numReps = 1
            if args.reps is not None and args.saveAllReps is True:
                numReps = args.reps
                assert numReps > 0
            # little hack to repeat evaluation for each training replicate:
            # -1 stands for the main model, 0..numReps-2 for the .repN files
            for repNum in xrange(-1, numReps-1):
                if repNum == -1:
                    repSuffix = ""
                else:
                    repSuffix = ".rep%d" % repNum
                evalBed = os.path.join(outDir, stem + "_eval.bed" + repSuffix)
                hmmEvalInputBed = testBed
                if args.eval is not None:
                    hmmEvalInputBed = args.eval
                bicPath = os.path.join(outDir, stem + "_bic.txt" + repSuffix)

                command += " && teHmmEval.py %s %s %s --bed %s %s --bic %s" % (
                    evalTrackPath,
                    modPath + repSuffix,
                    hmmEvalInputBed,
                    evalBed,
                    logOps,
                    bicPath)

                if args.segment is True:
                    command += " --segment"

                # fit
                compTruth = testBed
                if args.truth is not None:
                    compTruth = args.truth
                compareInputBed = evalBed
                if args.fit is True:
                    fitBed = os.path.join(outDir,
                                          stem + "_eval_fit.bed" + repSuffix)
                    command += " && fitStateNames.py %s %s %s --tl %s" % (
                        compTruth,
                        evalBed,
                        fitBed,
                        evalTrackPath)
                    if args.fitOpts is not None:
                        command += " " + args.fitOpts
                    compareInputBed = fitBed

                # compare
                compPath = os.path.join(outDir, stem + "_comp.txt" + repSuffix)
                command += " && compareBedStates.py %s %s --tl %s > %s" % (
                    compTruth,
                    compareInputBed,
                    evalTrackPath,
                    compPath)

                # make table row (main model only, not the replicates)
                if repSuffix == "":
                    rowPath = os.path.join(outDir, stem + "_row.txt")
                    if inBed in rows:
                        rows[inBed].append(rowPath)
                    else:
                        rows[inBed] = [rowPath]
                    command += " && scrapeBenchmarkRow.py %s %s %s %s %s" % (
                        args.trainingTracksInfo,
                        trainingTrackPath,
                        evalBed,
                        compPath,
                        rowPath)

            # remember command
            inCmdPath = os.path.join(outDir, stem + "_cmd.txt")
            with open(inCmdPath, "w") as inCmdFile:
                inCmdFile.write(command + "\n")
            commands.append(command)

    runParallelShellCommands(commands, args.numProc)
    writeTables(args.outputDir, rows)
def main(argv=None):
    """ Fill masked gaps of an HMM prediction BED with flanking states.

    The mask tracks listed in the tracks XML are concatenated and merged.
    For every merged mask interval no longer than --maxLen a new BED interval
    is written whose label comes from the input intervals that exactly abut
    it (subject to --tgts / --oneSidedTgts / --onlyDefault), falling back to
    --default.  The new intervals are appended to the input BED, sorted,
    intersected with the merged allBed scope and written to outBed.

    argv defaults to sys.argv but is not forwarded to argparse, which parses
    sys.argv on its own.

    Returns 0 when there is nothing to fill (no mask tracks or no input
    intervals; the input is then copied to the output unchanged), None
    otherwise.  Raises AssertionError if --tgts and --oneSidedTgts share a
    state, if a mask track path is not a file, or if a neighbouring input
    interval that does not exactly abut a mask overlaps it.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Fill in masked intervals of an hmm prediction "
        "(from teHmmEval.py) with state corresponding to surrounding"
        " intervals.")

    parser.add_argument("tracksXML", help="XML track list (used to id masking"
                        " tracks")
    parser.add_argument("allBed", help="Target scope. Masked intervals outside"
                        " of these regions will not be included")
    parser.add_argument("inBed", help="TE prediction BED file. State labels"
                        " should probably be mapped (ie with fitStateNames.py)")
    parser.add_argument("outBed", help="Output BED. Will be equivalent to"
                        " the input bed except all gaps corresponding to "
                        "masked intervals will be filled")
    parser.add_argument("--maxLen", help="Maximum length of a masked interval"
                        " to fill (inclusive). Use --delMask option with same value"
                        "if running compareBedStates.py after.",
                        type=int, default=sys.maxint)
    parser.add_argument("--default", help="Default label to give to masked "
                        "region if no label can be determined", default="0")
    parser.add_argument("--tgts", help="Only relabel gaps that "
                        "are flanked on both sides by the same state, and this state"
                        " is in this comma- separated list. --default used for other"
                        " gaps. If not targetst specified then all states checked.",
                        default=None)
    parser.add_argument("--oneSidedTgts", help="Only relabel gaps that "
                        "are flanked on at least one side by a state in this comma-"
                        "separated list --default used for other gaps",
                        default=None)
    parser.add_argument("--onlyDefault", help="Add the default state (--default) no"
                        " no all masked gaps no matter what. ie ignoring all other "
                        "logic", action="store_true", default=False)
    parser.add_argument("--cut", help="Cut out gaps for masked tracks from the input."
                        " By default, the input is expected to come from the HMM "
                        "with mask intervals already absent, and will crash on with"
                        " an assertion error if an overlap is detected.",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # try/finally so the bedtools temp area is released on the early return
    # and on errors, not only when the function runs to completion.
    try:
        # make sets
        tgtSet = set()
        if args.tgts is not None:
            tgtSet = set(args.tgts.split(","))
        oneSidedTgtSet = set()
        if args.oneSidedTgts is not None:
            oneSidedTgtSet = set(args.oneSidedTgts.split(","))
        assert len(tgtSet.intersection(oneSidedTgtSet)) == 0

        # read the track list
        trackList = TrackList(args.tracksXML)
        maskTracks = trackList.getMaskTracks()

        # read the input bed.  cutOutMaskIntervals() returns None when there
        # are no mask tracks, so only cut when there is something to cut.
        inBed = args.inBed
        cutBed = None
        if args.cut is True and len(maskTracks) > 0:
            cutBed = cutOutMaskIntervals(inBed, -1, args.maxLen + 1,
                                         args.tracksXML)
            inBed = cutBed
        inputIntervals = readBedIntervals(inBed, ncol=4, sort=True)
        if cutBed is not None:
            runShellCommand("rm -f %s" % cutBed)

        if len(maskTracks) == 0 or len(inputIntervals) == 0:
            runShellCommand("cp %s %s" % (args.inBed, args.outBed))
            logger.warning("No mask tracks located in %s or"
                           " %s empty" % (args.tracksXML, args.inBed))
            return 0

        # make a temporary, combined, merged masking bed file
        tempMaskBed = getLocalTempPath("Temp_mb", ".bed")
        for maskTrack in maskTracks:
            assert os.path.isfile(maskTrack.getPath())
            runShellCommand(
                "cat %s | setBedCol.py 3 mask | awk \'{print $1\"\t\"$2\"\t\"$3}\'>> %s" % (
                    maskTrack.getPath(), tempMaskBed))
        maskedIntervals = getMergedBedIntervals(tempMaskBed, sort=True)

        # inputIdx walks forward through the sorted input intervals as the
        # sorted mask intervals are visited; rightFlank is the first input
        # interval that does not sort before the current mask (None once the
        # input is exhausted).
        inputIdx = 0
        rightFlank = inputIntervals[inputIdx]
        lastInputIdx = len(inputIntervals) - 1

        tempOutMask = getLocalTempPath("Temp_om", ".bed")
        with open(tempOutMask, "w") as tempOutMaskFile:
            for maskInterval in maskedIntervals:
                if maskInterval[2] - maskInterval[1] > args.maxLen:
                    continue

                # find candidate right flank (explicit None test instead of
                # relying on how None orders against a tuple)
                while rightFlank is not None and rightFlank < maskInterval:
                    if inputIdx == lastInputIdx:
                        rightFlank = None
                    else:
                        inputIdx += 1
                        rightFlank = inputIntervals[inputIdx]

                # candidate left flank.  When the input is exhausted the
                # interval at inputIdx itself sorts before the mask, so it
                # (not its predecessor) is the left neighbour.
                leftFlank = None
                if rightFlank is None:
                    leftFlank = inputIntervals[inputIdx]
                elif inputIdx > 0:
                    leftFlank = inputIntervals[inputIdx - 1]

                # identify flanking states if the intervals perfectly abut
                leftState = None
                if leftFlank is not None:
                    if leftFlank[0] == maskInterval[0] and \
                       leftFlank[2] == maskInterval[1]:
                        leftState = str(leftFlank[3])
                    else:
                        assert intersectSize(leftFlank, maskInterval) == 0
                rightState = None
                if rightFlank is not None:
                    if rightFlank[0] == maskInterval[0] and \
                       rightFlank[1] == maskInterval[2]:
                        rightState = str(rightFlank[3])
                    else:
                        assert intersectSize(rightFlank, maskInterval) == 0

                # choose a state for the mask interval
                # NOTE(review): the one-sided branches are read as part of
                # the outer chain (only reached when the two flanks differ
                # or one is missing) -- confirm against the intended nesting.
                maskState = str(args.default)
                if args.onlyDefault is True:
                    pass
                elif leftState is not None and leftState == rightState:
                    if len(tgtSet) == 0 or leftState in tgtSet:
                        maskState = leftState
                elif leftState in oneSidedTgtSet:
                    maskState = leftState
                elif rightState in oneSidedTgtSet:
                    maskState = rightState

                # write our mask interval
                tempOutMaskFile.write("%s\t%d\t%d\t%s\n" % (
                    maskInterval[0], maskInterval[1], maskInterval[2],
                    maskState))

        # NOTE(review): the original args.inBed (not the --cut version) is
        # what gets merged with the filled intervals -- confirm intended.
        tempMergePath1 = getLocalTempPath("Temp_mp", ".bed")
        tempMergePath2 = getLocalTempPath("Temp_mp", ".bed")
        runShellCommand("cp %s %s ; cat %s >> %s" % (
            args.inBed, tempMergePath1, tempOutMask, tempMergePath1))
        runShellCommand("cat %s | sortBed > %s" % (tempMergePath1,
                                                   tempMergePath2))
        tempScopePath = getLocalTempPath("temp_all", ".bed")
        runShellCommand("mergeBed -i %s |sortBed > %s" % (args.allBed,
                                                          tempScopePath))
        runShellCommand("intersectBed -a %s -b %s > %s" % (
            tempMergePath2, tempScopePath, args.outBed))

        runShellCommand("rm -f %s" % " ".join([
            tempMaskBed, tempOutMask, tempMergePath1, tempMergePath2,
            tempScopePath]))
    finally:
        cleanBedTool(tempBedToolPath)
def runTsd(args, tempTracksInfo):
    """ Generate the TSD track by calling addTsdTrack.py once per input.

    Inputs are the args.ltr_termini and args.tir tracks from args.tracksInfo
    (each first passed through fillTermini.py, after bigBedToBed if the path
    ends in .bb) and the args.repeat_modeler track from tempTracksInfo.
    Every addTsdTrack.py call writes a temporary XML that is then moved over
    tempTracksInfo; all calls after the first add --append.  Does nothing if
    args.noTsd is set.
    """
    if args.noTsd is True:
        return

    origTrackList = TrackList(args.tracksInfo)
    outTrackList = TrackList(tempTracksInfo)

    # (track name, bed path) for every addTsdTrack.py invocation, in order
    tsdJobs = []
    # filled termini beds created here that must be deleted at the end
    filledBeds = []

    # preprocess termini
    terminiTracks = [origTrackList.getTrackByName(trackName)
                     for trackName in (args.ltr_termini, args.tir)]
    for terminiTrack in terminiTracks:
        if terminiTrack is None:
            logger.warning("Could not find termini track")
            continue
        srcPath = terminiTrack.getPath()
        filledPath = getLocalTempPath("Temp_fill", ".bed")
        convertedPath = None
        if srcPath.endswith(".bb"):
            # fillTermini.py is fed a plain bed, so convert bigBed first
            convertedPath = getLocalTempPath("Temp_termini", ".bed")
            runShellCommand("bigBedToBed %s %s" % (srcPath, convertedPath))
            srcPath = convertedPath
        runShellCommand("fillTermini.py %s %s" % (srcPath, filledPath))
        tsdJobs.append((terminiTrack.getName(), filledPath))
        filledBeds.append(filledPath)
        if convertedPath is not None:
            runShellCommand("rm -f %s" % convertedPath)

    # add repeat_modeler
    rmTrack = outTrackList.getTrackByName(args.repeat_modeler)
    if rmTrack is not None:
        tsdJobs.append((rmTrack.getName(), rmTrack.getPath()))

    # run addTsdTrack (appending except first time)
    # note we override input track paths in each case
    for jobIdx, (trackName, bedPath) in enumerate(tsdJobs):
        optParts = []
        if jobIdx > 0:
            optParts.append(" --append")
        # really rough hardcoded params based on
        # (A unified classification system for eukaryotic transposable
        # elements, Wicker et. al 2007)
        if trackName == args.repeat_modeler:
            optParts.extend([" --names LINE,SINE,Unknown",
                             " --maxScore 20",
                             " --left 20",
                             " --right 20",
                             " --min 5",
                             " --max 20",
                             " --overlap 20"])
        elif trackName == args.ltr_termini:
            optParts.extend([" --maxScore 3",
                             " --left 8",
                             " --right 8",
                             " --min 3",
                             " --max 6"])
        elif trackName == args.tir:
            optParts.extend([" --maxScore 3",
                             " --left 15",
                             " --right 15",
                             " --min 3",
                             " --max 12"])
        optString = "".join(optParts)

        tempXMLOut = getLocalTempPath("Temp_tsd_xml", ".xml")
        tsdCmd = "addTsdTrack.py %s %s %s %s %s %s --inPath %s %s %s --numProc %d" % (
            tempTracksInfo, args.cleanTrackPath, tempXMLOut, trackName,
            args.sequence, args.tsd, bedPath, optString, args.logOpString,
            args.numProc)
        runShellCommand(tsdCmd)
        runShellCommand("mv %s %s" % (tempXMLOut, tempTracksInfo))

    for filledPath in filledBeds:
        runShellCommand("rm %s" % filledPath)
def main(argv=None):
    """ Write starting transition and emission models from a BED annotation.

    The named track is read from the tracks info file, restricted to the
    merged intervals of queryBed.  The distinct values found in the track's
    value column become state names, to which args.numOut "outside" states
    are added (or enough to reach --numTot).  writeTransitions() and
    writeEmissions() then produce the two output files.

    Raises RuntimeError for an invalid or unimplemented --mode, a missing
    track, a track whose distribution is neither multinomial nor gaussian,
    a track with a scale, or a query BED that yields no intervals.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Create starting transition and emission distributions "
        "from a candidate BED annotation, which can"
        " be used with teHmmTrain.py using the --initTransProbs and "
        "--initEmProbs options, respectively. The distributions created here"
        " are extremely simple, but this can be a good shortcut to at least "
        "getting the state names into the init files, which can be further "
        "tweeked by hand.")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("trackName", help="Name of Track to use as initial"
                        " annotation")
    parser.add_argument("queryBed", help="Bed file with regions to query")
    parser.add_argument("outTransProbs", help="File to write transition model"
                        " to")
    parser.add_argument("outEmProbs", help="File to write emission model to")
    parser.add_argument("--numOut", help="Number of \"outside\" states to add"
                        " to the model.", default=1, type=int)
    parser.add_argument("--numTot", help="Add x \"outside\" states such "
                        "that total states is this. (overrieds --numOut)",
                        default=0, type=int)
    parser.add_argument("--outName", help="Name of outside states (will have"
                        " numeric suffix if more than 1)", default="Outside")
    parser.add_argument("--mode", help="Strategy for initializing the "
                        "transition graph: {\'star\': all states are connected"
                        " to the oustide state(s) but not each other; "
                        " \'data\': transitions estimated from input bed; "
                        " \'full\': dont write edges and let teHmmTrain.py "
                        "initialize as a clique}", default="star")
    parser.add_argument("--selfTran", help="This script will always write all"
                        " the self-transition probabilities to the output file. "
                        "They will all be set to the specified value using this"
                        " option, or estimated from the data if -1",
                        default=-1., type=float)
    parser.add_argument("--em", help="Emission probability for input track ("
                        "ie probability that state emits itself)",
                        type=float, default=0.95)
    parser.add_argument("--outEmNone", help="Add None emission probabilities"
                        " for target track for Outside states",
                        action="store_true", default=None)

    addLoggingOptions(parser)
    args = parser.parse_args()

    # validate the options before any temp state is created
    mode = args.mode
    if mode == "star" and args.numOut < 1:
        raise RuntimeError("--numOut must be at least 1 if --mode star is used")
    if mode not in ("star", "data", "full"):
        raise RuntimeError("--mode must be one of {star, data, full}")
    if mode == "data":
        raise RuntimeError("--data not implemented yet")
    assert os.path.isfile(args.tracksInfo)

    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # look up the annotation track in the tracks info file
    track = TrackList(args.tracksInfo).getTrackByName(args.trackName)
    if track is None:
        raise RuntimeError("Track %s not found in tracksInfo" % args.trackName)
    trackPath = track.getPath()
    if track.getDist() not in ("multinomial", "gaussian"):
        raise RuntimeError("Track %s does not have multinomial or "
                           "gaussian distribution" % args.trackName)
    hasScale = track.getScale() is not None or track.getLogScale() is not None
    if hasScale:
        raise RuntimeError("Track %s must not have scale" % args.trackName)

    # read query intervals from the bed file
    logger.info("loading query intervals from %s" % args.queryBed)
    mergedIntervals = getMergedBedIntervals(args.queryBed, ncol=4)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.queryBed)

    # read the track one query region at a time, keeping enough columns to
    # reach the value column
    bedIntervals = []
    for queryInterval in mergedIntervals:
        chrom, start, end = queryInterval[0], queryInterval[1], queryInterval[2]
        bedIntervals += readBedIntervals(trackPath,
                                         ncol=track.getValCol() + 1,
                                         chrom=chrom, start=start, end=end)

    # 1st pass to collect set of names
    nameMap = CategoryMap(reserved=0)
    for bedInterval in bedIntervals:
        nameMap.update(bedInterval[track.getValCol()])

    # outside states: --numTot, when given, decides how many are added
    outNameMap = CategoryMap(reserved=0)
    if args.numTot > 0:
        args.numOut = max(0, args.numTot - len(nameMap))
    for outIdx in xrange(args.numOut):
        if args.numOut > 1:
            outName = args.outName + str(outIdx)
        else:
            outName = args.outName
        assert nameMap.has(outName) is False
        outNameMap.update(outName)

    # write the transition model for use with teHmmTrain.py --initTransProbs
    writeTransitions(bedIntervals, nameMap, outNameMap, args)

    # write the emission model for use with teHmmTrain.py --initEmProbs
    writeEmissions(bedIntervals, nameMap, outNameMap, args)

    cleanBedTool(tempBedToolPath)
def main(argv=None):
    """ Set the scale of numeric tracks and write the updated tracks XML.

    Every track in the tracks info file is passed to setTrackScale() unless
    it is listed in --skip, is missing from a non-empty --tracks list, has a
    distribution other than multinomial / sparse_multinomial / gaussian, or
    has a path whose extension starts with ".fa".  A ValueError raised by
    setTrackScale() is logged as a warning and the track is left alone.  The
    track list is then saved to outputTracks.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Automatically set the scale attributes of numeric tracks"
        " within a given tracks.xml function using some simple heuristics. ")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("allBed", help="Bed file spanning entire genome")
    parser.add_argument("outputTracks", help="Path to write modified tracks XML"
                        " to.")
    parser.add_argument("--numBins", help="Maximum number of bins after scaling",
                        default=10, type=int)
    parser.add_argument("--tracks", help="Comma-separated list of tracks "
                        "to process. If not set, all"
                        " tracks listed as having a multinomial distribution"
                        " (since this is the default value, this includes "
                        "tracks with no distribution attribute) or gaussian"
                        " distribution will be processed.", default=None)
    parser.add_argument("--skip", help="Comma-separated list of tracks to "
                        "skip.", default=None)
    parser.add_argument("--noLog", help="Never use log scaling",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    trackNames = []
    if args.tracks is not None:
        trackNames = args.tracks.split(",")
    skipNames = []
    if args.skip is not None:
        skipNames = args.skip.split(",")

    # The tracks are modified in place and this same list is saved below,
    # so no separate copy of the track list is needed.
    trackList = TrackList(args.tracksInfo)
    allIntervals = getMergedBedIntervals(args.allBed)

    scalableDists = ("multinomial", "sparse_multinomial", "gaussian")
    for track in trackList:
        trackExt = os.path.splitext(track.getPath())[1]
        # slicing keeps extensions such as .fasta / .fa.gz-less variants
        # matching on their first three characters only
        isFasta = trackExt[:3].lower() == ".fa"
        trackName = track.getName()

        if trackName in skipNames:
            continue
        if len(trackNames) > 0 and trackName not in trackNames:
            continue
        if track.getDist() not in scalableDists or isFasta:
            continue

        try:
            setTrackScale(track, args.numBins, allIntervals, args.noLog)
        except ValueError as e:
            logger.warning("Skipping (non-numeric?) track %s due to: %s" % (
                track.getName(), str(e)))

    trackList.saveXML(args.outputTracks)
    cleanBedTool(tempBedToolPath)
def main(argv=None):
    """ Fill masked gaps of an HMM prediction BED with flanking states.

    The mask tracks listed in the tracks XML are concatenated and merged.
    For every merged mask interval no longer than --maxLen a new BED interval
    is written whose label comes from the input intervals that exactly abut
    it (subject to --tgts / --oneSidedTgts / --onlyDefault), falling back to
    --default.  The new intervals are appended to the input BED, sorted,
    intersected with the merged allBed scope and written to outBed.

    argv defaults to sys.argv but is not forwarded to argparse, which parses
    sys.argv on its own.

    Returns 0 when there is nothing to fill (no mask tracks or no input
    intervals; the input is then copied to the output unchanged), None
    otherwise.  Raises AssertionError if --tgts and --oneSidedTgts share a
    state, if a mask track path is not a file, or if a neighbouring input
    interval that does not exactly abut a mask overlaps it.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Fill in masked intervals of an hmm prediction "
        "(from teHmmEval.py) with state corresponding to surrounding"
        " intervals.")

    parser.add_argument("tracksXML", help="XML track list (used to id masking"
                        " tracks")
    parser.add_argument("allBed", help="Target scope. Masked intervals outside"
                        " of these regions will not be included")
    parser.add_argument(
        "inBed", help="TE prediction BED file. State labels"
        " should probably be mapped (ie with fitStateNames.py)")
    parser.add_argument("outBed", help="Output BED. Will be equivalent to"
                        " the input bed except all gaps corresponding to "
                        "masked intervals will be filled")
    parser.add_argument(
        "--maxLen", help="Maximum length of a masked interval"
        " to fill (inclusive). Use --delMask option with same value"
        "if running compareBedStates.py after.",
        type=int, default=sys.maxint)
    parser.add_argument("--default", help="Default label to give to masked "
                        "region if no label can be determined", default="0")
    parser.add_argument(
        "--tgts", help="Only relabel gaps that "
        "are flanked on both sides by the same state, and this state"
        " is in this comma- separated list. --default used for other"
        " gaps. If not targetst specified then all states checked.",
        default=None)
    parser.add_argument(
        "--oneSidedTgts", help="Only relabel gaps that "
        "are flanked on at least one side by a state in this comma-"
        "separated list --default used for other gaps",
        default=None)
    parser.add_argument(
        "--onlyDefault", help="Add the default state (--default) no"
        " no all masked gaps no matter what. ie ignoring all other "
        "logic", action="store_true", default=False)
    parser.add_argument(
        "--cut", help="Cut out gaps for masked tracks from the input."
        " By default, the input is expected to come from the HMM "
        "with mask intervals already absent, and will crash on with"
        " an assertion error if an overlap is detected.",
        action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # try/finally so the bedtools temp area is released on the early return
    # and on errors, not only when the function runs to completion.
    try:
        # make sets
        tgtSet = set()
        if args.tgts is not None:
            tgtSet = set(args.tgts.split(","))
        oneSidedTgtSet = set()
        if args.oneSidedTgts is not None:
            oneSidedTgtSet = set(args.oneSidedTgts.split(","))
        assert len(tgtSet.intersection(oneSidedTgtSet)) == 0

        # read the track list
        trackList = TrackList(args.tracksXML)
        maskTracks = trackList.getMaskTracks()

        # read the input bed.  cutOutMaskIntervals() returns None when there
        # are no mask tracks, so only cut when there is something to cut.
        inBed = args.inBed
        cutBed = None
        if args.cut is True and len(maskTracks) > 0:
            cutBed = cutOutMaskIntervals(inBed, -1, args.maxLen + 1,
                                         args.tracksXML)
            inBed = cutBed
        inputIntervals = readBedIntervals(inBed, ncol=4, sort=True)
        if cutBed is not None:
            runShellCommand("rm -f %s" % cutBed)

        if len(maskTracks) == 0 or len(inputIntervals) == 0:
            runShellCommand("cp %s %s" % (args.inBed, args.outBed))
            logger.warning("No mask tracks located in %s or"
                           " %s empty" % (args.tracksXML, args.inBed))
            return 0

        # make a temporary, combined, merged masking bed file
        tempMaskBed = getLocalTempPath("Temp_mb", ".bed")
        for maskTrack in maskTracks:
            assert os.path.isfile(maskTrack.getPath())
            runShellCommand(
                "cat %s | setBedCol.py 3 mask | awk \'{print $1\"\t\"$2\"\t\"$3}\'>> %s"
                % (maskTrack.getPath(), tempMaskBed))
        maskedIntervals = getMergedBedIntervals(tempMaskBed, sort=True)

        # inputIdx walks forward through the sorted input intervals as the
        # sorted mask intervals are visited; rightFlank is the first input
        # interval that does not sort before the current mask (None once the
        # input is exhausted).
        inputIdx = 0
        rightFlank = inputIntervals[inputIdx]
        lastInputIdx = len(inputIntervals) - 1

        tempOutMask = getLocalTempPath("Temp_om", ".bed")
        with open(tempOutMask, "w") as tempOutMaskFile:
            for maskInterval in maskedIntervals:
                if maskInterval[2] - maskInterval[1] > args.maxLen:
                    continue

                # find candidate right flank (explicit None test instead of
                # relying on how None orders against a tuple)
                while rightFlank is not None and rightFlank < maskInterval:
                    if inputIdx == lastInputIdx:
                        rightFlank = None
                    else:
                        inputIdx += 1
                        rightFlank = inputIntervals[inputIdx]

                # candidate left flank.  When the input is exhausted the
                # interval at inputIdx itself sorts before the mask, so it
                # (not its predecessor) is the left neighbour.
                leftFlank = None
                if rightFlank is None:
                    leftFlank = inputIntervals[inputIdx]
                elif inputIdx > 0:
                    leftFlank = inputIntervals[inputIdx - 1]

                # identify flanking states if the intervals perfectly abut
                leftState = None
                if leftFlank is not None:
                    if leftFlank[0] == maskInterval[0] and \
                       leftFlank[2] == maskInterval[1]:
                        leftState = str(leftFlank[3])
                    else:
                        assert intersectSize(leftFlank, maskInterval) == 0
                rightState = None
                if rightFlank is not None:
                    if rightFlank[0] == maskInterval[0] and \
                       rightFlank[1] == maskInterval[2]:
                        rightState = str(rightFlank[3])
                    else:
                        assert intersectSize(rightFlank, maskInterval) == 0

                # choose a state for the mask interval
                # NOTE(review): the one-sided branches are read as part of
                # the outer chain (only reached when the two flanks differ
                # or one is missing) -- confirm against the intended nesting.
                maskState = str(args.default)
                if args.onlyDefault is True:
                    pass
                elif leftState is not None and leftState == rightState:
                    if len(tgtSet) == 0 or leftState in tgtSet:
                        maskState = leftState
                elif leftState in oneSidedTgtSet:
                    maskState = leftState
                elif rightState in oneSidedTgtSet:
                    maskState = rightState

                # write our mask interval
                tempOutMaskFile.write(
                    "%s\t%d\t%d\t%s\n" % (maskInterval[0], maskInterval[1],
                                          maskInterval[2], maskState))

        # NOTE(review): the original args.inBed (not the --cut version) is
        # what gets merged with the filled intervals -- confirm intended.
        tempMergePath1 = getLocalTempPath("Temp_mp", ".bed")
        tempMergePath2 = getLocalTempPath("Temp_mp", ".bed")
        runShellCommand("cp %s %s ; cat %s >> %s" %
                        (args.inBed, tempMergePath1, tempOutMask,
                         tempMergePath1))
        runShellCommand("cat %s | sortBed > %s" % (tempMergePath1,
                                                   tempMergePath2))
        tempScopePath = getLocalTempPath("temp_all", ".bed")
        runShellCommand("mergeBed -i %s |sortBed > %s" % (args.allBed,
                                                          tempScopePath))
        runShellCommand("intersectBed -a %s -b %s > %s" %
                        (tempMergePath2, tempScopePath, args.outBed))

        runShellCommand("rm -f %s" % " ".join([
            tempMaskBed, tempOutMask, tempMergePath1, tempMergePath2,
            tempScopePath
        ]))
    finally:
        cleanBedTool(tempBedToolPath)
def main(argv=None):
    """ Add (or extend) a TSD track in a tracks XML file.

    The TSD-finder related options are collected into args.tsdFinderOptions,
    the output directory is created if needed, and runTsdFinder() is run on
    the fasta track and the input track (whose path is replaced by --inPath
    when given).  If the output track does not exist yet a new Track is
    created with a path inside tsdTrackDir and added to the list; the
    resulting track list is saved to outTracksInfo.

    Raises RuntimeError if the output directory cannot be found or created,
    if the input or fasta track is missing or has an unexpected extension,
    or if --append is used but the output track does not exist.  Raises
    AssertionError if --inPath is given and is not a file.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Add a TSD track (or modify an existing one) based on a "
        "given track")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("tsdTrackDir", help="Directory to write cleaned BED"
                        " tracks to")
    parser.add_argument("outTracksInfo", help="Path to write modified tracks XML"
                        " to.")
    parser.add_argument("inputTrack", help="Name of track to createTSDs from")
    parser.add_argument("fastaTrack", help="Name of track for fasta sequence")
    parser.add_argument("outputTrack", help="Name of tsd track to add. Will"
                        " overwrite if it already exists (or append with"
                        " --append option)")
    parser.add_argument("--append", help="Add onto existing TSD track if exists",
                        default=False, action="store_true")
    parser.add_argument("--inPath", help="Use given file instead of inputTrack"
                        " path to generate TSD", default=None)

    ############ TSDFINDER OPTIONS ##############
    parser.add_argument("--min", help="Minimum length of a TSD",
                        default=None, type=int)
    parser.add_argument("--max", help="Maximum length of a TSD",
                        default=None, type=int)
    parser.add_argument("--all", help="Report all matches in region (as opposed"
                        " to only the nearest to the BED element which is the "
                        "default behaviour", action="store_true", default=False)
    parser.add_argument("--maxScore", help="Only report matches with given "
                        "score or smaller. The score is definied as the "
                        "maximum distance between the (two) TSD intervals and "
                        "the query interval",
                        default=None, type=int)
    parser.add_argument("--left", help="Number of bases immediately left of the "
                        "BED element to search for the left TSD",
                        default=None, type=int)
    parser.add_argument("--right", help="Number of bases immediately right of "
                        "the BED element to search for the right TSD",
                        default=None, type=int)
    parser.add_argument("--overlap", help="Number of bases overlapping the "
                        "BED element to include in search (so total space "
                        "on each side will be --left + overlap, and --right + "
                        "--overlap", default=None, type=int)
    parser.add_argument("--leftName", help="Name of left TSDs in output Bed",
                        default=None)
    parser.add_argument("--rightName", help="Name of right TSDs in output Bed",
                        default=None)
    parser.add_argument("--id", help="Assign left/right pairs of TSDs a unique"
                        " matching ID", action="store_true", default=False)
    parser.add_argument("--names", help="Only apply to bed interval whose "
                        "name is in (comma-separated) list. If not specified"
                        " then all intervals are processed", default=None)
    parser.add_argument("--numProc", help="Number of jobs to run in parallel."
                        " (parallization done on different sequences in FASTA"
                        "file", type=int, default=1)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # copy out all options for call to tsd finder: flags that are True are
    # passed bare, options with a value as "--name value", and unset ones
    # (None / False) are left out.
    args.tsdFinderOptions = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        args.tsdFinderOptions += " --logFile %s" % args.logFile
    for option in ["min", "max", "all", "maxScore", "left", "right", "overlap",
                   "leftName", "rightName", "id", "names", "numProc"]:
        val = getattr(args, option)
        if val is True:
            args.tsdFinderOptions += " --%s" % option
        elif val is not None and val is not False:
            args.tsdFinderOptions += " --%s %s" % (option, val)

    # Best-effort creation: failure (e.g. the directory already exists) is
    # tolerated here because the isdir check below is the real test.  Only
    # OSError is swallowed so interrupts and programming errors still surface.
    try:
        os.makedirs(args.tsdTrackDir)
    except OSError:
        pass
    if not os.path.isdir(args.tsdTrackDir):
        raise RuntimeError("Unable to find or create tsdTrack dir %s" %
                           args.tsdTrackDir)

    trackList = TrackList(args.tracksInfo)
    outTrackList = copy.deepcopy(trackList)

    inputTrack = trackList.getTrackByName(args.inputTrack)
    if inputTrack is None:
        raise RuntimeError("Track %s not found" % args.inputTrack)
    if args.inPath is not None:
        assert os.path.isfile(args.inPath)
        # only the working copy is redirected; outTrackList keeps the
        # original path because it was deep-copied above
        inputTrack.setPath(args.inPath)
    inTrackExt = os.path.splitext(inputTrack.getPath())[1].lower()
    if inTrackExt != ".bb" and inTrackExt != ".bed":
        raise RuntimeError("Track %s has non-bed extension %s" % (
            args.inputTrack, inTrackExt))

    fastaTrack = trackList.getTrackByName(args.fastaTrack)
    if fastaTrack is None:
        raise RuntimeError("Fasta Track %s not found" % args.fastaTrack)
    faTrackExt = os.path.splitext(fastaTrack.getPath())[1].lower()
    if faTrackExt[:3] != ".fa":
        raise RuntimeError("Fasta Track %s has non-fasta extension %s" % (
            args.fastaTrack, faTrackExt))

    tsdTrack = outTrackList.getTrackByName(args.outputTrack)
    if tsdTrack is None:
        if args.append is True:
            raise RuntimeError("TSD track %s not found. Cannot append" % (
                args.outputTrack))
        tsdTrack = Track()
        tsdTrack.name = args.outputTrack
        tsdTrack.path = os.path.join(
            args.tsdTrackDir,
            args.inputTrack + "_" + args.outputTrack + ".bed")

    runTsdFinder(fastaTrack.getPath(), inputTrack.getPath(),
                 tsdTrack.getPath(), args)

    if outTrackList.getTrackByName(tsdTrack.getName()) is None:
        outTrackList.addTrack(tsdTrack)
    outTrackList.saveXML(args.outTracksInfo)

    cleanBedTool(tempBedToolPath)
def main(argv=None):
    """ Rename, filter and sort the tracks of a tracks XML file.

    Tracks (regular and mask) whose name is in the removal set are dropped
    with a warning; the others are renamed through the name table when an
    entry exists (a warning is logged otherwise).  The kept tracks are then
    sorted by a key that puts RM-RepeatModeler first, then other "RM"
    tracks, then REPET, then softmask / tigr / te-domains tracks, then
    everything else by lower-cased name, with mask-distribution tracks last.
    The result is written to outTracksInfo.

    Raises RuntimeError if a track's path does not exist.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="fix up track names and sort alphabetically. easier to do"
        " here on xml than at end for paper.")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("outTracksInfo", help="Path to write modified tracks XML")

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()
    args.logOpString = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        args.logOpString += " --logFile %s" % args.logFile

    # input track name -> name to write out
    nm = {
        "hollister": "RM-RepBase-Hollister",
        "chaux": "RM-RepBase-deLaChaux",
        "repeat_modeler": "RM-RepeatModeler",
        "repbase": "RM-RepBase",
        "repet": "REPET",
        "ltr_finder": "LTR_FINDER",
        "ltr_harvest": "LTR_Harvest",
        "ltr_termini": "lastz-Termini",
        "lastz-Termini": "lastz-LTRTermini",
        "tir_termini": "lastz-InvTermini",
        "irf": "IRF",
        "palindrome": "lastz-Palindrome",
        "overlap": "lastz-Overlap",
        "mitehunter": "MITE-Hunter",
        "helitronscanner": "HelitronScanner",
        "cov_80-": "lastz-SelfLowId",
        "cov_80-90": "lastz-SelfMedId",
        "cov_90+": "lastz-SelfHighId",
        "left_peak_80-": "lastz-SelfPeakLeftLow",
        "left_peak_80-90": "lastz-SelfPeakLeftMed",
        "left_peak_90+": "lastz-SelfPeakLeftHigh",
        "right_peak_80-": "lastz-SelfPeakRightLow",
        "right_peak_80-90": "lastz-SelfPeakRightMed",
        "right_peak_90+": "lastz-SelfPeakRightHigh",
        "cov_maxPId": "lastz-SelfPctMaxId",
        "te_domains": "TE-Domains",
        "fgenesh": "Genes",
        "mrna": "mRNA",
        "srna": "sRNA",
        "ortho_depth": "Alignment-Depth",
        "gcpct": "GC",
        "trf": "TRF",
        "windowmasker": "WindowMasker",
        "polyN": "Ns",
        "phastcons_ce": "Conservation",
    }
    # alternative input names that map to the same output name as an
    # entry above: (alias, existing key)
    aliases = [
        ("lastz-SelfLowLeftPeak", "left_peak_80-"),
        ("lastz-SelfMedLeftPeak", "left_peak_80-90"),
        ("lastz-SelfHighLeftPeak", "left_peak_90+"),
        ("lastz-SelfLowRightPeak", "right_peak_80-"),
        ("lastz-SelfMedRightPeak", "right_peak_80-90"),
        ("lastz-SelfHighRightPeak", "right_peak_90+"),
        ("lastz-SelfMaxPctId", "cov_maxPId"),
        ("genes", "fgenesh"),
        ("refseq", "fgenesh"),
        ("orthology", "ortho_depth"),
        ("chain_depth", "ortho_depth"),
        ("alignment_depth", "ortho_depth"),
        ("phastcons", "phastcons_ce"),
        ("PhastCons", "phastcons_ce"),
        ("phyloP", "phastcons_ce"),
        ("phylop", "phastcons_ce"),
    ]
    for alias, key in aliases:
        nm[alias] = nm[key]

    # tracks that are removed from the output
    rtracks = frozenset(["tantan", "polyA", "transposon_psi", "transposonpsi",
                         "repbase_censor", "tsd", "repbase_default",
                         "dustmasker"])

    inTracks = TrackList(args.tracksInfo)
    outTracks = TrackList()
    outList = []

    for track in itertools.chain(inTracks.trackList, inTracks.maskTrackList):
        if not os.path.exists(track.path):
            raise RuntimeError("Track DNE %s" % track.path)
        if track.name in rtracks:
            logger.warning("Deleted track %s" % track.name)
            continue
        if track.name in nm:
            track.name = nm[track.name]
        else:
            logger.warning("Did not map track %s" % track.name)
        outList.append(track)

    # sort the list
    def sortComp(x):
        # String sort key: a run of leading "a"s pulls a group to the front
        # and "zzzz" pushes mask tracks to the back.
        lname = x.name.lower()
        if x.name == "RM-RepeatModeler":
            return "aaaaa" + lname
        elif "RM" in x.name:
            return "aaaa" + lname
        elif "REPET" in x.name:
            return "aaa" + lname
        elif "softmask" in lname or "tigr" in lname or "te-domains" in lname:
            return "aa" + lname
        elif x.getDist() == "mask":
            # getDist must be called: comparing the bound method itself with
            # a string is never equal, so this branch could not be reached.
            return "zzzz" + lname
        else:
            return lname

    outList = sorted(outList, key=sortComp)

    for track in outList:
        outTracks.addTrack(track)

    outTracks.saveXML(args.outTracksInfo)

    cleanBedTool(tempBedToolPath)
def main(argv=None):
    """ Rename, filter and sort the tracks of a tracks XML file.

    Tracks (regular and mask) whose name is in the removal set are dropped
    with a warning; the others are renamed through the name table when an
    entry exists (a warning is logged otherwise).  The kept tracks are then
    sorted by a key that puts RM-RepeatModeler first, then other "RM"
    tracks, then REPET, then softmask / tigr / te-domains tracks, then
    everything else by lower-cased name, with mask-distribution tracks last.
    The result is written to outTracksInfo.

    Raises RuntimeError if a track's path does not exist.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="fix up track names and sort alphabetically. easier to do"
        " here on xml than at end for paper.")

    parser.add_argument("tracksInfo",
                        help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("outTracksInfo",
                        help="Path to write modified tracks XML")

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()
    args.logOpString = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        args.logOpString += " --logFile %s" % args.logFile

    # input track name -> name to write out
    nm = {
        "hollister": "RM-RepBase-Hollister",
        "chaux": "RM-RepBase-deLaChaux",
        "repeat_modeler": "RM-RepeatModeler",
        "repbase": "RM-RepBase",
        "repet": "REPET",
        "ltr_finder": "LTR_FINDER",
        "ltr_harvest": "LTR_Harvest",
        "ltr_termini": "lastz-Termini",
        "lastz-Termini": "lastz-LTRTermini",
        "tir_termini": "lastz-InvTermini",
        "irf": "IRF",
        "palindrome": "lastz-Palindrome",
        "overlap": "lastz-Overlap",
        "mitehunter": "MITE-Hunter",
        "helitronscanner": "HelitronScanner",
        "cov_80-": "lastz-SelfLowId",
        "cov_80-90": "lastz-SelfMedId",
        "cov_90+": "lastz-SelfHighId",
        "left_peak_80-": "lastz-SelfPeakLeftLow",
        "left_peak_80-90": "lastz-SelfPeakLeftMed",
        "left_peak_90+": "lastz-SelfPeakLeftHigh",
        "right_peak_80-": "lastz-SelfPeakRightLow",
        "right_peak_80-90": "lastz-SelfPeakRightMed",
        "right_peak_90+": "lastz-SelfPeakRightHigh",
        "cov_maxPId": "lastz-SelfPctMaxId",
        "te_domains": "TE-Domains",
        "fgenesh": "Genes",
        "mrna": "mRNA",
        "srna": "sRNA",
        "ortho_depth": "Alignment-Depth",
        "gcpct": "GC",
        "trf": "TRF",
        "windowmasker": "WindowMasker",
        "polyN": "Ns",
        "phastcons_ce": "Conservation",
    }
    # alternative input names that map to the same output name as an
    # entry above: (alias, existing key)
    aliases = [
        ("lastz-SelfLowLeftPeak", "left_peak_80-"),
        ("lastz-SelfMedLeftPeak", "left_peak_80-90"),
        ("lastz-SelfHighLeftPeak", "left_peak_90+"),
        ("lastz-SelfLowRightPeak", "right_peak_80-"),
        ("lastz-SelfMedRightPeak", "right_peak_80-90"),
        ("lastz-SelfHighRightPeak", "right_peak_90+"),
        ("lastz-SelfMaxPctId", "cov_maxPId"),
        ("genes", "fgenesh"),
        ("refseq", "fgenesh"),
        ("orthology", "ortho_depth"),
        ("chain_depth", "ortho_depth"),
        ("alignment_depth", "ortho_depth"),
        ("phastcons", "phastcons_ce"),
        ("PhastCons", "phastcons_ce"),
        ("phyloP", "phastcons_ce"),
        ("phylop", "phastcons_ce"),
    ]
    for alias, key in aliases:
        nm[alias] = nm[key]

    # tracks that are removed from the output
    rtracks = frozenset([
        "tantan", "polyA", "transposon_psi", "transposonpsi",
        "repbase_censor", "tsd", "repbase_default", "dustmasker"
    ])

    inTracks = TrackList(args.tracksInfo)
    outTracks = TrackList()
    outList = []

    for track in itertools.chain(inTracks.trackList, inTracks.maskTrackList):
        if not os.path.exists(track.path):
            raise RuntimeError("Track DNE %s" % track.path)
        if track.name in rtracks:
            logger.warning("Deleted track %s" % track.name)
            continue
        if track.name in nm:
            track.name = nm[track.name]
        else:
            logger.warning("Did not map track %s" % track.name)
        outList.append(track)

    # sort the list
    def sortComp(x):
        # String sort key: a run of leading "a"s pulls a group to the front
        # and "zzzz" pushes mask tracks to the back.
        lname = x.name.lower()
        if x.name == "RM-RepeatModeler":
            return "aaaaa" + lname
        elif "RM" in x.name:
            return "aaaa" + lname
        elif "REPET" in x.name:
            return "aaa" + lname
        elif "softmask" in lname or "tigr" in lname or "te-domains" in lname:
            return "aa" + lname
        elif x.getDist() == "mask":
            # getDist must be called: comparing the bound method itself with
            # a string is never equal, so this branch could not be reached.
            return "zzzz" + lname
        else:
            return lname

    outList = sorted(outList, key=sortComp)

    for track in outList:
        outTracks.addTrack(track)

    outTracks.saveXML(args.outTracksInfo)

    cleanBedTool(tempBedToolPath)
def runTsd(args, tempTracksInfo):
    """Generate the TSD track by running addTsdTrack.py on the termini
    tracks and on the repeat_modeler track.

    The two termini tracks (``args.ltr_termini`` and ``args.tir``) are looked
    up in the original tracks file (``args.tracksInfo``), converted from
    bigBed when their path ends in ".bb", and passed through fillTermini.py.
    The repeat_modeler track is looked up in ``tempTracksInfo`` and used
    as-is.  addTsdTrack.py is then run once per input (with ``--append`` for
    every run but the first), and after each run its XML output replaces
    ``tempTracksInfo``.

    Does nothing when ``args.noTsd`` is True.

    Args:
        args: parsed options; reads noTsd, tracksInfo, ltr_termini, tir,
            repeat_modeler, cleanTrackPath, sequence, tsd, logOpString and
            numProc.
        tempTracksInfo: path of the tracks XML file that is updated in place.
    """
    if args.noTsd is True:
        return

    origTrackList = TrackList(args.tracksInfo)
    outTrackList = TrackList(tempTracksInfo)

    # temporary files to delete once all the TSD runs are done (or failed)
    tempFiles = []
    # (track name, path of BED file to use as input) for each TSD run
    tsdInputs = []

    try:
        # preprocess termini
        lastzTracks = [
            origTrackList.getTrackByName(args.ltr_termini),
            origTrackList.getTrackByName(args.tir),
        ]
        for terminiTrack in lastzTracks:
            if terminiTrack is None:
                logger.warning("Could not find termini track")
                continue
            inFile = terminiTrack.getPath()
            fillFile = getLocalTempPath("Temp_fill", ".bed")
            # registered before it is written so it is removed on failure too
            tempFiles.append(fillFile)
            tempBed = None
            try:
                if inFile.endswith(".bb"):
                    tempBed = getLocalTempPath("Temp_termini", ".bed")
                    runShellCommand("bigBedToBed %s %s" % (inFile, tempBed))
                    inFile = tempBed
                runShellCommand("fillTermini.py %s %s" % (inFile, fillFile))
            finally:
                # the converted bed is only needed as input to fillTermini
                if tempBed is not None:
                    runShellCommand("rm -f %s" % tempBed)
            tsdInputs.append((terminiTrack.getName(), fillFile))

        # add repeat_modeler
        repeat_modelerTrack = outTrackList.getTrackByName(args.repeat_modeler)
        if repeat_modelerTrack is not None:
            tsdInputs.append((repeat_modelerTrack.getName(),
                              repeat_modelerTrack.getPath()))

        # run addTsdTrack (appending except first time)
        # note we override input track paths in each case
        for i, (trackName, inPath) in enumerate(tsdInputs):
            optString = ""
            if i > 0:
                optString += " --append"
            # really rough hardcoded params based on
            # (A unified classification system for eukaryotic transposable
            # elements, Wicker et. al 2007)
            if trackName == args.repeat_modeler:
                optString += " --names LINE,SINE,Unknown"
                optString += " --maxScore 20"
                optString += " --left 20"
                optString += " --right 20"
                optString += " --min 5"
                optString += " --max 20"
                optString += " --overlap 20"
            elif trackName == args.ltr_termini:
                optString += " --maxScore 3"
                optString += " --left 8"
                optString += " --right 8"
                optString += " --min 3"
                optString += " --max 6"
            elif trackName == args.tir:
                optString += " --maxScore 3"
                optString += " --left 15"
                optString += " --right 15"
                optString += " --min 3"
                optString += " --max 12"

            tempXMLOut = getLocalTempPath("Temp_tsd_xml", ".xml")
            runShellCommand(
                "addTsdTrack.py %s %s %s %s %s %s --inPath %s %s %s --numProc %d"
                % (tempTracksInfo, args.cleanTrackPath, tempXMLOut, trackName,
                   args.sequence, args.tsd, inPath, optString,
                   args.logOpString, args.numProc))
            # the freshly written XML becomes the input of the next run
            runShellCommand("mv %s %s" % (tempXMLOut, tempTracksInfo))
    finally:
        # -f: a file may not exist yet if we are cleaning up after a failure
        for tempFile in tempFiles:
            runShellCommand("rm -f %s" % tempFile)
def main(argv=None):
    """Add a TSD track (or modify an existing one) based on a given track.

    Parses the command line, runs the TSD finder on the input track using the
    fasta track's sequence, and writes a tracks XML file that contains the
    TSD track in addition to the tracks of the input tracks file.

    When the output track does not exist yet it is created with path
    ``<tsdTrackDir>/<inputTrack>_<outputTrack>.bed``; otherwise the existing
    track (and its path) is reused.

    Args:
        argv: full argument vector *including* the program name at index 0.
            Defaults to ``sys.argv``.

    Raises:
        RuntimeError: if the output directory cannot be found or created, a
            named track is missing or has an unexpected file extension,
            --inPath is not a file, or --append is given but the output
            track does not exist.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Add a TSD track (or modify an existing one) based on a "
        "given track")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("tsdTrackDir", help="Directory to write cleaned BED"
                        " tracks to")
    parser.add_argument("outTracksInfo", help="Path to write modified tracks XML"
                        " to.")
    parser.add_argument("inputTrack", help="Name of track to createTSDs from")
    parser.add_argument("fastaTrack", help="Name of track for fasta sequence")
    parser.add_argument("outputTrack", help="Name of tsd track to add. Will"
                        " overwrite if it already exists (or append with"
                        " --append option)")
    parser.add_argument("--append", help="Add onto existing TSD track if exists",
                        default=False, action="store_true")
    parser.add_argument("--inPath", help="Use given file instead of inputTrack"
                        " path to generate TSD", default=None)

    ############ TSDFINDER OPTIONS ##############
    parser.add_argument("--min", help="Minimum length of a TSD",
                        default=None, type=int)
    parser.add_argument("--max", help="Maximum length of a TSD",
                        default=None, type=int)
    parser.add_argument("--all", help="Report all matches in region (as opposed"
                        " to only the nearest to the BED element which is the "
                        "default behaviour", action="store_true", default=False)
    parser.add_argument("--maxScore", help="Only report matches with given "
                        "score or smaller. The score is definied as the "
                        "maximum distance between the (two) TSD intervals and "
                        "the query interval",
                        default=None, type=int)
    parser.add_argument("--left", help="Number of bases immediately left of the "
                        "BED element to search for the left TSD",
                        default=None, type=int)
    parser.add_argument("--right", help="Number of bases immediately right of "
                        "the BED element to search for the right TSD",
                        default=None, type=int)
    parser.add_argument("--overlap", help="Number of bases overlapping the "
                        "BED element to include in search (so total space "
                        "on each side will be --left + overlap, and --right + "
                        "--overlap", default=None, type=int)
    parser.add_argument("--leftName", help="Name of left TSDs in output Bed",
                        default=None)
    parser.add_argument("--rightName", help="Name of right TSDs in output Bed",
                        default=None)
    parser.add_argument("--id", help="Assign left/right pairs of TSDs a unique"
                        " matching ID", action="store_true", default=False)
    parser.add_argument("--names", help="Only apply to bed interval whose "
                        "name is in (comma-separated) list. If not specified"
                        " then all intervals are processed", default=None)
    parser.add_argument("--numProc", help="Number of jobs to run in parallel."
                        " (parallization done on different sequences in FASTA"
                        "file", type=int, default=1)

    addLoggingOptions(parser)
    # honour the argv parameter (it used to be accepted but ignored)
    args = parser.parse_args(argv[1:])
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # make sure the bedtools temp state is cleaned up even when we fail
    try:
        # copy out all options for call to tsd finder
        args.tsdFinderOptions = "--logLevel %s" % getLogLevelString()
        if args.logFile is not None:
            args.tsdFinderOptions += " --logFile %s" % args.logFile
        for option in ["min", "max", "all", "maxScore", "left", "right",
                       "overlap", "leftName", "rightName", "id", "names",
                       "numProc"]:
            val = getattr(args, option)
            if val is True:
                # boolean flag that was switched on
                args.tsdFinderOptions += " --%s" % option
            elif val is not None and val is not False:
                # option with a value that was set (or has a default)
                args.tsdFinderOptions += " --%s %s" % (option, val)

        try:
            os.makedirs(args.tsdTrackDir)
        except OSError:
            # most likely the directory already exists; the isdir check
            # below reports the cases where it really could not be created
            pass
        if not os.path.isdir(args.tsdTrackDir):
            raise RuntimeError("Unable to find or create tsdTrack dir %s" %
                               args.tsdTrackDir)

        trackList = TrackList(args.tracksInfo)
        # copied before the --inPath override below, so the override does
        # not end up in the saved XML
        outTrackList = copy.deepcopy(trackList)

        inputTrack = trackList.getTrackByName(args.inputTrack)
        if inputTrack is None:
            raise RuntimeError("Track %s not found" % args.inputTrack)
        if args.inPath is not None:
            # explicit check instead of assert, which is stripped under -O
            if not os.path.isfile(args.inPath):
                raise RuntimeError("Input path %s is not a file" %
                                   args.inPath)
            inputTrack.setPath(args.inPath)
        inTrackExt = os.path.splitext(inputTrack.getPath())[1].lower()
        if inTrackExt != ".bb" and inTrackExt != ".bed":
            raise RuntimeError("Track %s has non-bed extension %s" % (
                args.inputTrack, inTrackExt))

        fastaTrack = trackList.getTrackByName(args.fastaTrack)
        if fastaTrack is None:
            raise RuntimeError("Fasta Track %s not found" % args.fastaTrack)
        faTrackExt = os.path.splitext(fastaTrack.getPath())[1].lower()
        if faTrackExt[:3] != ".fa":
            raise RuntimeError("Fasta Track %s has non-fasta extension %s" % (
                args.fastaTrack, faTrackExt))

        tsdTrack = outTrackList.getTrackByName(args.outputTrack)
        if tsdTrack is None:
            if args.append is True:
                raise RuntimeError("TSD track %s not found. Cannot append" % (
                    args.outputTrack))
            tsdTrack = Track()
            tsdTrack.name = args.outputTrack
            tsdTrack.path = os.path.join(args.tsdTrackDir, args.inputTrack +
                                         "_" + args.outputTrack + ".bed")

        runTsdFinder(fastaTrack.getPath(), inputTrack.getPath(),
                     tsdTrack.getPath(), args)

        # only a newly created track needs to be added to the output list
        if outTrackList.getTrackByName(tsdTrack.getName()) is None:
            outTrackList.addTrack(tsdTrack)
        outTrackList.saveXML(args.outTracksInfo)
    finally:
        cleanBedTool(tempBedToolPath)