def writeScaledTrack(trackData, track, args):
    """ Go base-by-base, writing the unscaled value to the output """
    fname, fext = os.path.splitext(os.path.basename(track.getPath()))
    outBed = os.path.join(args.outputDir, fname + "_scale" + ".bed")
    outBigWig = os.path.join(args.outputDir, fname + "_scale" + ".bw")
    outFile = open(outBed, "w")
    trackNo = track.getNumber()
    valMap = track.getValueMap()
    for trackTable in trackData.getTrackTableList():
        chrom = trackTable.getChrom()
        start = trackTable.getStart()
        for i in xrange(len(trackTable)):
            binnedVal = trackTable[i][trackNo]
            unbinnedVal = valMap.getMapBack(binnedVal)
            outFile.write("%s\t%d\t%d\t%f\n" % (
                chrom, start + i, start + i + 1, unbinnedVal))
    outFile.close()

    # make a .bw copy
    try:
        runShellCommand("bedGraphToBigWig %s %s %s" % (outBed, args.chromSizes,
                                                       outBigWig))
    except:
        logger.warning("Unable to make bigwig from %s" % outBed)
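# Hedged sketch (not part of the original module): getMapBack() above inverts
# whatever binning was applied when the track was loaded.  Assuming a simple
# linear scale/shift binning, the round trip would look roughly like the
# helper below; the real mapping lives in the track library and may use log
# scaling instead.
def _exampleMapBack(binnedVal, scale=1.0, shift=0.0):
    # invert val -> (val + shift) * scale
    return float(binnedVal) / scale - shift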
def extractScore(benchDir, benchInputBedPath, args, repSuffix=""):
    """ Reduce entire benchmark output into a single score value """
    compPath = os.path.join(benchDir,
                            os.path.splitext(
                                os.path.basename(benchInputBedPath))[0] +
                            "_comp.txt" + repSuffix)
    baseStats, intStats, weightedStats = extractCompStatsFromFile(compPath)
    stats = intStats
    if args.base is True:
        stats = baseStats
    f1List = []
    for state in args.states.split(","):
        if state not in stats:
            logger.warning("State %s not found in stats %s; giving 0" % (
                state, str(stats)))
            f1List.append(0)
            continue
        prec = stats[state][0]
        rec = stats[state][1] * args.recallSkew
        f1 = 0
        if prec + rec > 0:
            f1 = 2. * ((prec * rec) / (prec + rec))
        if args.score == "prec":
            f1List.append(prec)
        elif args.score == "rec":
            f1List.append(rec)
        else:
            f1List.append(f1)
    avgF1 = np.mean(f1List)
    return avgF1
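# Worked example with hypothetical numbers: for precision 0.8 and recall 0.5,
# the per-state score above is the standard F1 when --recallSkew is 1:
#   f1 = 2 * (0.8 * 0.5) / (0.8 + 0.5) ~= 0.615
# With --recallSkew 2, recall is first doubled to 1.0, giving
#   f1 = 2 * (0.8 * 1.0) / (0.8 + 1.0) ~= 0.889
# so skews > 1 reward high-recall track sets when ranking.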
def getPosteriorsMask(pdStates, hmm):
    """ returns array mask where mask[i] == 1 iff state i is part of our
    desired posterior distribution """
    stateMap = hmm.getStateNameMap()
    if stateMap is None:
        stateMap = CategoryMap(reserved=0)
        for i in xrange(hmm.getEmissionModel().getNumStates()):
            stateMap.update(str(i))
    mask = np.zeros((len(stateMap)), dtype=np.int8)
    for state in pdStates.split(","):
        if not stateMap.has(state):
            logger.warning("Posterior (or Emission) Distribution state %s"
                           " not found in model" % state)
        else:
            stateNumber = stateMap.getMap(state)
            mask[stateNumber] = 1
    return mask
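# Minimal usage sketch (state names are hypothetical): for a model whose
# states include "LTR" and "inside", a mask selecting just those two states
# would be built as
#   mask = getPosteriorsMask("LTR,inside", hmm)
# after which mask[hmm.getStateNameMap().getMap("LTR")] == 1 and all other
# entries are 0.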
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Produce a bed file of genome segments which are atomic"
        " elements with respect to the hmm, ie each segment emits a single"
        " state. Mask tracks always cut. "
        "Output intervals are assigned name 0 1 0 1 etc.")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("allBed", help="Bed file spanning entire genome")
    parser.add_argument("outBed", help="Output segments")
    parser.add_argument("--thresh", help="Number of tracks that can change "
                        "before a new segment is formed. Increasing this value"
                        " increases the expected lengths of output segments",
                        type=int, default=1)
    parser.add_argument("--cutTracks", help="Create a new segment if something"
                        " changes in one of these tracks (as specified by "
                        "comma-separated list), overriding --thresh options"
                        " if necessary. For example, --cutTracks tsd,chaux"
                        " would invoke a new segment every time the value at"
                        " either of these tracks changed", default=None)
    parser.add_argument("--cutUnscaled", help="Cut on all unscaled (used as "
                        "a proxy for non-numeric) tracks", default=False,
                        action="store_true")
    parser.add_argument("--cutMultinomial", help="Cut non-gaussian, non-binary"
                        " tracks every time", default=False,
                        action="store_true")
    parser.add_argument("--cutNonGaussian", help="Cut all but gaussian tracks",
                        default=False, action="store_true")
    parser.add_argument("--comp", help="Strategy for comparing columns for the"
                        " threshold cutoff. Options are [first, prev], where"
                        " first compares with first column of segment and "
                        "prev compares with column immediately left",
                        default="first")
    parser.add_argument("--ignore", help="Comma-separated list of tracks to "
                        "ignore (the FASTA DNA sequence would be a good "
                        "candidate)", default="sequence")
    parser.add_argument("--maxLen", help="Maximum length of a segment (<= 0"
                        " means no max length applied)", type=int, default=0)
    parser.add_argument("--fixLen", help="Just make segments of specified"
                        " fixed length, ignoring other parameters and logic"
                        " (<= 0 means no fixed length applied)",
                        type=int, default=0)
    parser.add_argument("--stats", help="Write some statistics to specified "
                        "file. Of the form <trackName> <Diff> <DiffPct>, "
                        "where <Diff> is the number of times a track differs"
                        " between two consecutive segments, and <DiffPct> "
                        "is the average percentage of all such differences "
                        "accounted for by the track", default=None)
    parser.add_argument("--delMask", help="Entirely remove intervals from "
                        "mask tracks that are > given length (otherwise "
                        "they would just be ignored by HMM tools). The"
                        " difference here is that removed intervals will"
                        " break contiguity.", type=int, default=None)
    parser.add_argument("--chroms", help="List of chromosomes, or regions, to"
                        " run in parallel (in BED format). Input regions will"
                        " be intersected with each line in this file, and the"
                        " result will correspond to an individual job",
                        default=None)
    parser.add_argument("--proc", help="Number of processes (use in"
                        " conjunction with --chroms)", type=int, default=1)
    parser.add_argument("--co", help="Count offset for segment labels. "
                        "Only used internally", type=int, default=0)
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    if args.comp != "first" and args.comp != "prev":
        raise RuntimeError("--comp must be either first or prev")

    if args.chroms is not None:
        # hack to allow chroms argument to chunk and rerun
        parallelDispatch(argv, args)
        cleanBedTool(tempBedToolPath)
        return 0

    # read query intervals from the bed file
    tempFiles = []
    if args.delMask is not None:
        cutBed = cutOutMaskIntervals(args.allBed, args.delMask, sys.maxint,
                                     args.tracksInfo)
        if cutBed is not None:
            tempFiles.append(cutBed)
            args.allBed = cutBed
    logger.info("loading segment region intervals from %s" % args.allBed)
    mergedIntervals = getMergedBedIntervals(args.allBed, ncol=4)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.allBed)

    # read the tracks, while intersecting them with the query intervals
    logger.info("loading tracks %s" % args.tracksInfo)
    trackData = TrackData()
    trackData.loadTrackData(args.tracksInfo, mergedIntervals,
                            treatMaskAsBinary=True)

    # process the --cutTracks option
    trackList = trackData.getTrackList()
    cutList = np.zeros((len(trackList)), np.int)
    if args.cutTracks is not None:
        cutNames = args.cutTracks.split(",")
        for name in cutNames:
            track = trackList.getTrackByName(name)
            if track is None:
                raise RuntimeError("cutTrack %s not found" % name)
            trackNo = track.getNumber()
            assert trackNo < len(cutList)
            cutList[trackNo] = 1
    args.cutList = cutList

    # make sure mask tracks count as cut tracks
    for track in trackList:
        if track.getDist() == 'mask':
            args.cutList[track.getNumber()] = 1

    # process the --ignore option
    ignoreList = np.zeros((len(trackList)), np.int)
    if args.ignore is not None:
        ignoreNames = args.ignore.split(",")
        for name in ignoreNames:
            track = trackList.getTrackByName(name)
            if track is None:
                if name != "sequence":
                    logger.warning("ignore track %s not found" % name)
                continue
            trackNo = track.getNumber()
            assert trackNo < len(ignoreList)
            ignoreList[trackNo] = 1
            if args.cutList[trackNo] == 1:
                raise RuntimeError("Same track (%s) can't be cut and ignored" %
                                   name)
    args.ignoreList = ignoreList

    # process the --cutUnscaled option
    if args.cutUnscaled is True:
        for track in trackList:
            trackNo = track.getNumber()
            if track.scale is None and track.shift is None and\
              track.logScale is None and\
              args.ignoreList[trackNo] == 0:
                assert trackNo < len(cutList)
                cutList[trackNo] = 1

    # process the --cutMultinomial option
    if args.cutMultinomial is True:
        for track in trackList:
            trackNo = track.getNumber()
            if track.dist == "multinomial" and\
              args.ignoreList[trackNo] == 0:
                assert trackNo < len(cutList)
                cutList[trackNo] = 1

    # process the --cutNonGaussian option
    if args.cutNonGaussian is True:
        for track in trackList:
            trackNo = track.getNumber()
            if track.dist != "gaussian" and\
              args.ignoreList[trackNo] == 0:
                assert trackNo < len(cutList)
                cutList[trackNo] = 1

    # segment the tracks
    stats = dict()
    segmentTracks(trackData, args, stats)
    writeStats(trackData, args, stats)

    if len(tempFiles) > 0:
        runShellCommand("rm -f %s" % " ".join(tempFiles))
    cleanBedTool(tempBedToolPath)
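# Example invocation (paths are hypothetical), cutting on every unscaled
# track and comparing each column against the first column of the open
# segment:
#   segmentTracks.py tracks.xml genome_regions.bed segments.bed \
#       --comp first --thresh 1 --cutUnscaled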
def main(argv=None):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Helper script to rank a list of tracks based on how well"
        " they improve some measure of HMM accuracy, by wrapping "
        "teHmmBenchmark.py")

    parser.add_argument("tracks", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("training", help="BED of training regions passed to"
                        " teHmmTrain.py")
    parser.add_argument("truth", help="BED truth used for scoring")
    parser.add_argument("states", help="States (in truth) to use for"
                        " average F1 score (comma-separated)")
    parser.add_argument("outDir", help="Directory to place all results")
    parser.add_argument("--benchOpts", help="Options to pass to "
                        "teHmmBenchmark.py (wrap in double quotes)",
                        default="")
    parser.add_argument("--startTracks", help="Comma-separated list of "
                        "tracks to start off with", default=None)
    parser.add_argument("--segOpts", help="Options to pass to "
                        "segmentTracks.py (wrap in double quotes)",
                        default="--comp first --thresh 1 --cutUnscaled")
    parser.add_argument("--fullSegment", help="Only use segmentation"
                        " based on entire track list for each iteration"
                        " rather than compute segmentation each time (as"
                        " done by default)", action="store_true",
                        default=False)
    parser.add_argument("--bic", help="Rank by BIC instead of score"
                        " (both always present in output table though)",
                        action="store_true", default=False)
    parser.add_argument("--base", help="Use base-level F1 instead of "
                        "interval-level", default=False, action="store_true")
    parser.add_argument("--naive", help="Rank by \"naive\" score",
                        action="store_true", default=False)
    parser.add_argument("--doNaive", help="Compute naive stats. Will be "
                        "turned on by default if --naive is used",
                        default=False, action="store_true")
    parser.add_argument("--segTracks", help="Tracks XML to use for"
                        " segmentation (by default will be same as tracks)",
                        default=None)
    parser.add_argument("--recallSkew", help="When computing F1, multiply"
                        " recall by this number (hack to favour larger"
                        " recall)", default=1., type=float)
    parser.add_argument("--score", help="Accuracy score to use from "
                        "{f1, prec, rec}", default="f1")
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    # make sure no no-no options in benchOpts
    if "--eval" in args.benchOpts or "--truth" in args.benchOpts:
        raise RuntimeError("--eval and --truth cannot be passed through to "
                           "teHmmBenchmark.py as they are generated from "
                           "<training> and <truth> args from this script")

    # don't want to keep track of extra logic required for not segmenting
    if "--segment" not in args.benchOpts:
        args.benchOpts += " --segment"
        logger.warning("Adding --segment to teHmmBenchmark.py options")

    if args.bic is True and args.naive is True:
        raise RuntimeError("--bic and --naive are mutually incompatible")
    if args.naive is True:
        args.doNaive = True
    if args.segTracks is None:
        args.segTracks = args.tracks

    if not os.path.exists(args.outDir):
        os.makedirs(args.outDir)

    greedyRank(args)
def runTrial(tracksList, iteration, newTrackName, args):
    """ Compute a score for a given set of tracks using teHmmBenchmark.py """
    benchDir = os.path.join(args.outDir, "iter%d" % iteration)
    benchDir = os.path.join(benchDir, "%s_bench" % newTrackName)
    if not os.path.exists(benchDir):
        os.makedirs(benchDir)

    trainingPath = args.training
    truthPath = args.truth

    tracksPath = os.path.join(benchDir, "tracks.xml")
    tracksList.saveXML(tracksPath)

    segLogPath = os.path.join(benchDir, "segment_cmd.txt")
    segLog = open(segLogPath, "w")

    if args.segTracks == args.tracks:
        segTracksPath = tracksPath
    else:
        # pull out desired tracks from segment tracks XML if specified
        segTracksIn = TrackList(args.segTracks)
        segTracks = TrackList()
        for track in tracksList:
            segTrack = segTracksIn.getTrackByName(track.getName())
            if segTrack is not None:
                segTracks.addTrack(segTrack)
            else:
                logger.warning("track %s not found in segment tracks %s" % (
                    track.getName(), args.segTracks))
        segTracksPath = os.path.join(benchDir, "seg_tracks.xml")
        segTracks.saveXML(segTracksPath)

    # segment training
    segTrainingPath = os.path.join(benchDir,
                                   os.path.splitext(
                                       os.path.basename(trainingPath))[0] +
                                   "_trainSeg.bed")
    segmentCmd = "segmentTracks.py %s %s %s %s" % (segTracksPath,
                                                   trainingPath,
                                                   segTrainingPath,
                                                   args.segOpts)
    if args.fullSegment is False:
        runShellCommand(segmentCmd)
        segLog.write(segmentCmd + "\n")
    else:
        runShellCommand("ln -f -s %s %s" % (args.fullSegTrainPath,
                                            segTrainingPath))

    # segment eval
    segEvalPath = os.path.join(benchDir,
                               os.path.splitext(
                                   os.path.basename(truthPath))[0] +
                               "_evalSeg.bed")
    segmentCmd = "segmentTracks.py %s %s %s %s" % (segTracksPath, truthPath,
                                                   segEvalPath, args.segOpts)
    if trainingPath == truthPath:
        segmentCmd = "ln -f -s %s %s" % (os.path.abspath(segTrainingPath),
                                         segEvalPath)
    if args.fullSegment is False:
        runShellCommand(segmentCmd)
        segLog.write(segmentCmd + "\n")
    else:
        runShellCommand("ln -f -s %s %s" % (args.fullSegEvalPath,
                                            segEvalPath))

    segLog.close()

    segPathOpts = " --eval %s --truth %s" % (segEvalPath, truthPath)

    benchCmd = "teHmmBenchmark.py %s %s %s %s" % (tracksPath, benchDir,
                                                  segTrainingPath,
                                                  args.benchOpts + segPathOpts)
    runShellCommand(benchCmd)

    score = extractScore(benchDir, segTrainingPath, args)
    bic = extractBIC(benchDir, segTrainingPath, args)
    naive = 0
    if args.doNaive is True:
        naive = extractNaive(tracksPath, benchDir, segTrainingPath, args)
    slope, rsq = extractF1ProbSlope(benchDir, segTrainingPath, args)

    # clean up big files?

    return score, bic, naive, slope, rsq
def greedyRank(args):
    """ Iteratively add the best track to an (initially empty) tracklist
    according to some metric """
    inputTrackList = TrackList(args.tracks)
    rankedTrackList = TrackList()
    if args.startTracks is not None:
        for startTrack in args.startTracks.split(","):
            track = inputTrackList.getTrackByName(startTrack)
            if track is None:
                logger.warning("Start track %s not found in tracks XML" %
                               startTrack)
            else:
                rankedTrackList.addTrack(copy.deepcopy(track))

    numTracks = len(inputTrackList) - len(rankedTrackList)
    currentScore, currentBIC = 0.0, sys.maxint

    # compute full segmentation if --fullSegment is True
    if args.fullSegment is True:
        args.fullSegTrainPath = os.path.abspath(
            os.path.join(args.outDir, "fullSegTrain.bed"))
        segmentCmd = "segmentTracks.py %s %s %s %s" % (args.segTracks,
                                                       args.training,
                                                       args.fullSegTrainPath,
                                                       args.segOpts)
        runShellCommand(segmentCmd)
        args.fullSegEvalPath = os.path.abspath(
            os.path.join(args.outDir, "fullSegEval.bed"))
        segmentCmd = "segmentTracks.py %s %s %s %s" % (args.segTracks,
                                                       args.truth,
                                                       args.fullSegEvalPath,
                                                       args.segOpts)
        runShellCommand(segmentCmd)

    # header
    rankFile = open(os.path.join(args.outDir, "ranking.txt"), "w")
    rankFile.write("It.\tTrack\tF1\tBIC\tNaiveF1\tAccProbSlope\tAccProbR2\n")
    rankFile.close()

    # baseline score if we're not starting from scratch
    baseIt = 0
    if args.startTracks is not None:
        curTrackList = copy.deepcopy(rankedTrackList)
        score, bic, naive, slope, rsq = runTrial(curTrackList, baseIt,
                                                 "baseline_test", args)
        rankFile = open(os.path.join(args.outDir, "ranking.txt"), "a")
        rankFile.write("%d\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
            baseIt, args.startTracks, score, bic, naive, slope, rsq))
        rankFile.close()
        baseIt += 1

    for iteration in xrange(baseIt, baseIt + numTracks):
        bestItScore = -sys.maxint
        bestItBic = sys.maxint
        bestItNaive = -sys.maxint
        bestNextTrack = None
        bestSlope = None
        bestR = None
        for nextTrack in inputTrackList:
            if rankedTrackList.getTrackByName(nextTrack.getName()) is not None:
                continue
            curTrackList = copy.deepcopy(rankedTrackList)
            curTrackList.addTrack(nextTrack)
            score, bic, naive, slope, rsq = runTrial(curTrackList, iteration,
                                                     nextTrack.getName(),
                                                     args)
            best = False
            if args.bic is True:
                if bic < bestItBic or (bic == bestItBic and
                                       score > bestItScore):
                    best = True
            elif args.naive is True:
                if naive > bestItNaive or (naive == bestItNaive and
                                           score > bestItScore):
                    best = True
            elif score > bestItScore or (score == bestItScore and
                                         bic < bestItBic):
                best = True
            if best is True:
                bestItScore, bestItBic, bestItNaive, bestSlope, bestR,\
                  bestNextTrack = score, bic, naive, slope, rsq, nextTrack
            flags = "a"
            if iteration == baseIt:
                flags = "w"
            trackLogFile = open(os.path.join(args.outDir,
                                             nextTrack.getName() + ".txt"),
                                flags)
            trackLogFile.write("%d\t%f\t%f\t%f\t%f\t%f\n" % (
                iteration, score, bic, naive, slope, rsq))
            trackLogFile.close()
        rankedTrackList.addTrack(copy.deepcopy(bestNextTrack))
        rankedTrackList.saveXML(os.path.join(args.outDir,
                                             "iter%d" % iteration,
                                             "tracks.xml"))

        rankFile = open(os.path.join(args.outDir, "ranking.txt"), "a")
        rankFile.write("%d\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
            iteration, bestNextTrack.getName(), bestItScore, bestItBic,
            bestItNaive, bestSlope, bestR))
        rankFile.close()
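# The selection rule above, pulled out as a standalone sketch for clarity
# (argument names are mine, not from this module): lower BIC wins under
# --bic, otherwise higher score wins, with the secondary metric breaking
# ties.  The --naive branch follows the same pattern with naive in place
# of bic.
def _isBetter(score, bic, bestScore, bestBic, rankByBic):
    if rankByBic:
        return bic < bestBic or (bic == bestBic and score > bestScore)
    return score > bestScore or (score == bestScore and bic < bestBic)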
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Create a teHMM")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("trainingBed", help="Path of BED file containing"
                        " genome regions to train model on. If --supervised "
                        "is used, the names in this bed file will be treated "
                        "as the true annotation (otherwise it is only used"
                        " for interval coordinates)")
    parser.add_argument("outputModel", help="Path of output hmm")
    parser.add_argument("--numStates", help="Number of states in model",
                        type=int, default=2)
    parser.add_argument("--iter", help="Number of EM iterations",
                        type=int, default=100)
    parser.add_argument("--supervised", help="Use name (4th) column of "
                        "<trainingBed> for the true hidden states of the"
                        " model. Transition parameters will be estimated"
                        " directly from this information rather than EM."
                        " NOTE: The number of states will be determined "
                        "from the bed.", action="store_true", default=False)
    parser.add_argument("--cfg", help="Use Context Free Grammar instead of "
                        "HMM. Only works with --supervised for now",
                        action="store_true", default=False)
    parser.add_argument("--saPrior", help="Confidence in self alignment "
                        "track for CFG. Probability of pair emission "
                        "is multiplied by this number if the bases are"
                        " aligned and its complement if bases are not"
                        " aligned. Must be between [0,1].",
                        default=0.95, type=float)
    parser.add_argument("--pairStates", help="Comma-separated list of states"
                        " (from trainingBed) that are treated as"
                        " pair-emitters for the CFG", default=None)
    parser.add_argument("--emFac", help="Normalization factor for weighting"
                        " emission probabilities because when there are "
                        "many tracks, the transition probabilities can get "
                        "totally lost. 0 = no normalization. 1 ="
                        " divide by number of tracks. k = divide by number "
                        "of tracks / k", type=int, default=0)
    parser.add_argument("--initTransProbs", help="Path of text file where"
                        " each line has three entries: FromState ToState"
                        " Probability. This file (all other transitions get"
                        " probability 0) is used to specify the initial"
                        " transition model. The names and number of states"
                        " will be initialized according to this file"
                        " (overriding --numStates)", default=None)
    parser.add_argument("--fixTrans", help="Do not learn transition"
                        " parameters (best used with --initTransProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initEmProbs", help="Path of text file where each "
                        "line has four entries: State Track Symbol"
                        " Probability. This file (all other emissions get"
                        " probability 0) is used to specify the initial"
                        " emission model. All states specified in this file"
                        " must appear in the file specified with"
                        " --initTransProbs (but not vice versa).",
                        default=None)
    parser.add_argument("--fixEm", help="Do not learn emission parameters"
                        " (best used with --initEmProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initStartProbs", help="Path of text file where"
                        " each line has two entries: State Probability."
                        " This file (all other start probs get probability"
                        " 0) is used to specify the initial start dist. All"
                        " states specified in this file must appear in the"
                        " file specified with --initTransProbs (but not vice"
                        " versa).", default=None)
    parser.add_argument("--fixStart", help="Do not learn start parameters"
                        " (best used with --initStartProbs)",
                        action="store_true", default=False)
    parser.add_argument("--forceTransProbs", help="Path of text file where"
                        " each line has three entries: FromState ToState"
                        " Probability. These transition probabilities will"
                        " override any learned probabilities after each"
                        " training iteration (unspecified entries will not"
                        " be set to 0 in this case; the learned values will"
                        " be kept, but normalized as needed)", default=None)
    parser.add_argument("--forceEmProbs", help="Path of text file where each"
                        " line has four entries: State Track Symbol"
                        " Probability. These emission probabilities will"
                        " override any learned probabilities after each"
                        " training iteration (unspecified entries will not"
                        " be set to 0 in this case; the learned values will"
                        " be kept, but normalized as needed)", default=None)
    parser.add_argument("--flatEm", help="Use a flat emission distribution"
                        " as a baseline. If not specified, the initial"
                        " emission distribution will be randomized by"
                        " default. Emission probabilities specified with"
                        " --initEmProbs or --forceEmProbs will never be"
                        " affected by randomization. The randomization is"
                        " important for Baum-Welch training, since if two"
                        " states don't have at least one different emission"
                        " or transition probability to begin with, they will"
                        " never learn to be different.",
                        action="store_true", default=False)
    parser.add_argument("--emRandRange", help="When randomly initializing an"
                        " emission distribution, constrain the values to the"
                        " given range (pair of comma-separated numbers)."
                        " Overridden by --initEmProbs and --forceEmProbs"
                        " when applicable. Completely overridden by --flatEm"
                        " (which is equivalent to --emRandRange .5,.5)."
                        " Actual values used will always be normalized.",
                        default="0.2,0.8")
    parser.add_argument("--segment", help="Bed file of segments to treat as "
                        "single columns for HMM (ie as created with "
                        "segmentTracks.py). IMPORTANT: this file must cover "
                        "the same regions as the trainingBed file. Unless in "
                        "supervised mode, probably best to use the same bed"
                        " file as both trainingBed and --segment argument."
                        " Otherwise use intersectBed to make sure the overlap"
                        " is exact", default=None)
    parser.add_argument("--segLen", help="Effective segment length used for"
                        " normalizing input segments (specifying 0 means no"
                        " normalization applied)", type=int, default=0)
    parser.add_argument("--seed", help="Seed for random number generator"
                        " which will be used to initialize emissions "
                        "(if --flatEm and --supervised not specified)",
                        default=None, type=int)
    parser.add_argument("--reps", help="Number of replicates (with different"
                        " random initializations) to run. The replicate"
                        " with the highest likelihood will be chosen for the"
                        " output", default=1, type=int)
    parser.add_argument("--numThreads", help="Number of threads to use when"
                        " running replicates (see --reps) in parallel.",
                        type=int, default=1)
    parser.add_argument("--emThresh", help="Threshold used for convergence"
                        " in Baum-Welch training, ie delta log likelihood"
                        " must be bigger than this number (which should be"
                        " positive) for convergence", type=float,
                        default=0.001)
    parser.add_argument("--saveAllReps", help="Save all replicate (--reps)"
                        " models to disk, instead of just the best one."
                        " Format is <outputModel>.repN. There will be"
                        " --reps - 1 such models saved, as the best output"
                        " counts as a replicate", action="store_true",
                        default=False)
    parser.add_argument("--maxProb", help="Gaussian distributions and/or"
                        " segment length corrections can cause probability"
                        " to *decrease* during BW iteration. Use this option"
                        " to remember the parameters with the highest"
                        " probability rather than returning the parameters"
                        " after the final iteration.", action="store_true",
                        default=False)
    parser.add_argument("--maxProbCut", help="Use with --maxProb option to"
                        " stop training if a given number of iterations go"
                        " by without hitting a new maxProb", default=None,
                        type=int)
    parser.add_argument("--transMatEpsilons", help="By default, epsilons are"
                        " added to all transition probabilities to prevent "
                        "converging on 0 due to rounding error only for fully"
                        " unsupervised training. Use this option to force"
                        " this behaviour for supervised and semisupervised"
                        " modes", action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    if args.cfg is True:
        assert args.supervised is True
        assert args.saPrior >= 0. and args.saPrior <= 1.
    if args.pairStates is not None:
        assert args.cfg is True
    if args.initTransProbs is not None or args.fixTrans is True or\
      args.initEmProbs is not None or args.fixEm is True:
        if args.cfg is True:
            raise RuntimeError("--initTransProbs, --fixTrans, --initEmProbs,"
                               " --fixEm are not currently compatible with"
                               " --cfg.")
    if args.fixTrans is True and args.supervised is True:
        raise RuntimeError("--fixTrans option not compatible with"
                           " --supervised")
    if args.fixEm is True and args.supervised is True:
        raise RuntimeError("--fixEm option not compatible with --supervised")
    if (args.forceTransProbs is not None or args.forceEmProbs is not None) \
      and args.cfg is True:
        raise RuntimeError("--forceTransProbs and --forceEmProbs are not "
                           "currently compatible with --cfg")
    if args.flatEm is True and args.supervised is False and\
      args.initEmProbs is None and args.initTransProbs is None:
        raise RuntimeError("--flatEm must be used with --initEmProbs and or"
                           " --initTransProbs")
    if args.initEmProbs is not None and args.initTransProbs is None:
        raise RuntimeError("--initEmProbs can only be used in conjunction"
                           " with --initTransProbs")
    if args.emRandRange is not None:
        args.emRandRange = args.emRandRange.split(",")
        try:
            assert len(args.emRandRange) == 2
            args.emRandRange = (float(args.emRandRange[0]),
                                float(args.emRandRange[1]))
        except:
            raise RuntimeError("Invalid --emRandRange specified")
    if args.transMatEpsilons is False:
        # old logic here.  now overridden with above options
        args.transMatEpsilons = (args.supervised is False and
                                 args.initTransProbs is None and
                                 args.forceTransProbs is None)

    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # read training intervals from the bed file
    logger.info("loading training intervals from %s" % args.trainingBed)
    mergedIntervals = getMergedBedIntervals(args.trainingBed, ncol=4)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.trainingBed)

    # read segment intervals
    segIntervals = None
    if args.segment is not None:
        logger.info("loading segment intervals from %s" % args.segment)
        try:
            checkExactOverlap(args.trainingBed, args.segment)
        except:
            raise RuntimeError("bed file passed with --segment option"
                               " must exactly overlap trainingBed")
        segIntervals = readBedIntervals(args.segment, sort=True)
    elif args.segLen > 0:
        raise RuntimeError("--segLen can only be used with --segment")
    if args.segLen <= 0:
        args.segLen = None
    if args.segLen > 0 and args.segLen != 1:
        logger.warning("--segLen should be 0 (no correction) or 1 (base"
                       " correction). Values > 1 may cause bias.")

    # read the tracks, while intersecting them with the training intervals
    logger.info("loading tracks %s" % args.tracksInfo)
    trackData = TrackData()
    trackData.loadTrackData(args.tracksInfo, mergedIntervals,
                            segmentIntervals=segIntervals)

    catMap = None
    userTrans = None
    if args.supervised is False and args.initTransProbs is not None:
        logger.debug("initializing transition model with user data")
        catMap = stateNamesFromUserTrans(args.initTransProbs)
        # state number is overridden by the transProbs file
        args.numStates = len(catMap)

    truthIntervals = None
    # state number is overridden by the input bed file in supervised mode
    if args.supervised is True:
        logger.info("processing supervised state names")
        # we reload because we don't want to be merging them here
        truthIntervals = readBedIntervals(args.trainingBed, ncol=4)
        catMap = mapStateNames(truthIntervals)
        args.numStates = len(catMap)

    # train the model
    seeds = [random.randint(0, 4294967294)]
    if args.seed is not None:
        seeds = [args.seed]
        random.seed(args.seed)
    seeds += [random.randint(0, sys.maxint) for x in xrange(1, args.reps)]

    def trainClosure(randomSeed):
        return trainModel(randomSeed, trackData=trackData, catMap=catMap,
                          userTrans=userTrans,
                          truthIntervals=truthIntervals, args=args)

    modelList = runParallelShellCommands(argList=seeds,
                                         numProc=args.numThreads,
                                         execFunction=trainClosure,
                                         useThreads=True)

    # select best model
    logmsg = ""
    bestModel = (-1, LOGZERO)
    for i in xrange(len(modelList)):
        curModel = (i, modelList[i].getLastLogProb())
        if curModel[1] > bestModel[1]:
            bestModel = curModel
        if curModel[1] is not None:
            logmsg += "Rep %i: TotalProb: %f\n" % curModel
    if len(modelList) > 1:
        logging.info("Training Replicates Statistics:\n%s" % logmsg)
        logging.info("Selecting best replicate (%d, %f)" % bestModel)
    model = modelList[bestModel[0]]

    # write the model to a pickle
    logger.info("saving trained model to %s" % args.outputModel)
    saveModel(args.outputModel, model)

    # write all replicates
    writtenCount = 0
    if args.saveAllReps is True:
        for i, repModel in enumerate(modelList):
            if i != bestModel[0]:
                repPath = "%s.rep%d" % (args.outputModel, writtenCount)
                logger.info("saving replicate model to %s" % repPath)
                saveModel(repPath, repModel)
                writtenCount += 1

    cleanBedTool(tempBedToolPath)
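# Standalone sketch of the replicate-selection rule used above (LOGZERO is
# approximated with -inf here; the codebase defines its own constant): keep
# the model whose final Baum-Welch log-likelihood is largest.
def _pickBestReplicate(logProbs):
    best = (-1, float("-inf"))
    for i, lp in enumerate(logProbs):
        if lp > best[1]:
            best = (i, lp)
    return best
# e.g. _pickBestReplicate([-1200.5, -1100.2, -1150.0]) -> (1, -1100.2)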
def main(argv=None):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Combine a bunch of non-numeric BED tracks into a"
        " single file, using fitStateNames.py to try to keep names "
        "consistent. Idea is to be used as a baseline to compare"
        " the hmm to (via base-by-base statistics, primarily, since"
        " this procedure could induce some fragmentation)")

    parser.add_argument("tracksXML", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("regionBed", help="BED file representing "
                        "target region (best if whole genome)")
    parser.add_argument("outBed", help="Output bed")
    parser.add_argument("--tracks", help="Comma-separated list of "
                        "track names to use. All tracks will be"
                        " used by default", default=None)
    parser.add_argument("--outside", help="Name to give non-annotated "
                        "regions", default="Outside")
    parser.add_argument("--fitThresh", help="Min map percentage (0,1)"
                        " in order to rename (see --qualThresh option"
                        " of fitStateNames.py)", type=float, default=0.5)
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    inputTrackList = TrackList(args.tracksXML)
    iter = 0

    # get regionBed where all intervals are merged when possible
    regionIntervals = getMergedBedIntervals(args.regionBed, sort=True)
    tempRegionPath = getLocalTempPath("Temp", "_reg.bed")
    tempRegionFile = open(tempRegionPath, "w")
    for interval in regionIntervals:
        tempRegionFile.write("\t".join([str(x) for x in interval]) + "\n")
    tempRegionFile.close()

    # accumulate tracks in temp file
    tempOutPath = getLocalTempPath("Temp", "_out.bed")

    for track in inputTrackList:
        if track.shift is not None or track.scale is not None or\
          track.logScale is not None or track.dist == "gaussian" or\
          os.path.splitext(track.getPath())[1].lower() != ".bed":
            logger.warning("Skipping numeric track %s" % track.getName())
        elif args.tracks is None or track.getName() in args.tracks.split(","):
            combineTrack(track, tempOutPath, tempRegionPath, iter, args)
            iter += 1

    # nothing got written, make everything outside
    if iter == 0:
        tempOutFile = open(tempOutPath, "w")
        for interval in regionIntervals:
            tempOutFile.write("%s\t%s\t%s\t%s\n" % (interval[0], interval[1],
                                                    interval[2],
                                                    args.outside))
        tempOutFile.close()

    runShellCommand("mv %s %s" % (tempOutPath, args.outBed))
    runShellCommand("rm -f %s" % tempRegionPath)
    cleanBedTool(tempBedToolPath)
def main(argv=None):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Thin wrapper of teHmmTrain.py and teHmmEval.py "
        "to generate a table of Number-of-HMM-states VS BIC. Lower BIC"
        " is better")

    parser.add_argument("tracks", help="tracks xml used for training and"
                        " eval")
    parser.add_argument("trainingBeds", help="comma-separated list of"
                        " training regions (training region size will be a"
                        " variable in output table). If segmentation is"
                        " activated, these must also be the segmented"
                        " beds...")
    parser.add_argument("evalBed", help="eval region")
    parser.add_argument("trainOpts", help="all teHmmTrain options in quotes")
    parser.add_argument("evalOpts", help="all teHmmEval options in quotes")
    parser.add_argument("states", help="comma-separated list of numbers of"
                        " states to try")
    parser.add_argument("outDir", help="output directory")
    parser.add_argument("--reps", help="number of replicates", type=int,
                        default=1)
    parser.add_argument("--proc", help="maximum number of processors to use"
                        " in parallel", type=int, default=1)
    parser.add_argument("--resume", help="try not to rewrite existing files",
                        action="store_true", default=False)
    parser.add_argument("--initTrans", help="the states argument is"
                        " overridden to specify a list of transition"
                        " initialization files instead of state numbers",
                        action="store_true", default=False)
    parser.add_argument("--numReps", help="the states argument is overridden"
                        " to specify a list of replicate numbers (--reps)"
                        " arguments", action="store_true", default=False)
    parser.add_argument("--numIter", help="the states argument is overridden"
                        " to specify a list of iteration counts (--iter)"
                        " arguments", action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    if sum([int(i) for i in [args.initTrans, args.numReps,
                             args.numIter]]) > 1:
        raise RuntimeError("only one of {--initTrans, --numReps, --numIter} "
                           "can be used at a time")

    if not os.path.isdir(args.outDir):
        runShellCommand("mkdir %s" % args.outDir)

    # get the sizes of the training beds
    trainingSizes = []
    trainingBeds = []
    for tb in args.trainingBeds.split(","):
        if len(tb) > 0:
            trainingBeds.append(tb)
    for bed in trainingBeds:
        assert os.path.isfile(bed)
        bedLen = 0
        for interval in readBedIntervals(bed):
            bedLen += interval[2] - interval[1]
        trainingSizes.append(bedLen)

    # make sure --bed not in teHmmEval options and --numStates not in train
    # options
    trainOpts = args.trainOpts.split()
    if "--numStates" in args.trainOpts and not args.numReps and\
      not args.numIter:
        nsIdx = trainOpts.index("--numStates")
        assert nsIdx < len(trainOpts) - 1
        del trainOpts[nsIdx]
        del trainOpts[nsIdx]
    if "--initTransProbs" in args.trainOpts:
        tpIdx = trainOpts.index("--initTransProbs")
        assert tpIdx < len(trainOpts) - 1
        del trainOpts[tpIdx]
        del trainOpts[tpIdx]
    trainProcs = 1
    if "--numThreads" in args.trainOpts:
        npIdx = trainOpts.index("--numThreads")
        assert npIdx < len(trainOpts) - 1
        trainProcs = int(trainOpts[npIdx + 1])
    segOptIdx = -1
    if "--segment" in args.trainOpts:
        segIdx = trainOpts.index("--segment")
        assert segIdx < len(trainOpts) - 1
        segOptIdx = segIdx + 1
    if args.numReps and "--reps" in args.trainOpts:
        repsIdx = trainOpts.index("--reps")
        assert repsIdx < len(trainOpts) - 1
        del trainOpts[repsIdx]
        del trainOpts[repsIdx]
    if args.numIter and "--iter" in args.trainOpts:
        iterIdx = trainOpts.index("--iter")
        assert iterIdx < len(trainOpts) - 1
        del trainOpts[iterIdx]
        del trainOpts[iterIdx]
    evalOpts = args.evalOpts.split()
    if "--bed" in args.evalOpts:
        bedIdx = evalOpts.index("--bed")
        assert bedIdx < len(evalOpts) - 1
        del evalOpts[bedIdx]
        del evalOpts[bedIdx]
    if "--bic" in args.evalOpts:
        bicIdx = evalOpts.index("--bic")
        assert bicIdx < len(evalOpts) - 1
        del evalOpts[bicIdx]
        del evalOpts[bicIdx]

    # hack in support for --initTrans option by munging out model sizes
    # from the text files
    if args.initTrans is True:
        transFiles = args.states.split(",")
        states = []
        for tf in transFiles:
            stateSet = set()
            with open(tf) as f:
                for line in f:
                    toks = line.split()
                    if len(toks) > 1 and toks[0][0] != "#":
                        stateSet.add(toks[0])
                        stateSet.add(toks[1])
            states.append(len(stateSet))
    else:
        states = args.states.split(",")

    trainCmds = []
    evalCmds = []
    prevSize = -1
    sameSizeCount = 0
    for trainingSize, trainingBed in zip(trainingSizes, trainingBeds):
        # hack to take into account we may have different inputs with the
        # same size, so their corresponding results need unique filenames
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        for numStates in states:
            for rep in xrange(args.reps):
                outMod = os.path.join(args.outDir, "hmm_%d.%d.%d.%d.mod" % (
                    trainingSize, sameSizeCount, int(numStates), int(rep)))
                if segOptIdx != -1:
                    trainOpts[segOptIdx] = trainingBed
                if args.initTrans is True:
                    statesOpt = "--initTransProbs %s" % transFiles[
                        states.index(numStates)]
                elif args.numIter is True:
                    # states argument overridden by iterations
                    statesOpt = "--iter %d" % int(numStates)
                elif args.numReps is True:
                    # states argument overridden by reps
                    statesOpt = "--reps %d" % int(numStates)
                else:
                    statesOpt = "--numStates %d" % int(numStates)
                trainCmd = "teHmmTrain.py %s %s %s %s %s" % (
                    args.tracks, trainingBed, outMod, " ".join(trainOpts),
                    statesOpt)
                if not args.resume or not os.path.isfile(outMod) or \
                  os.path.getsize(outMod) < 100:
                    trainCmds.append(trainCmd)

                outBic = outMod.replace(".mod", ".bic")
                outBed = outMod.replace(".mod", "_eval.bed")
                evalCmd = "teHmmEval.py %s %s %s --bed %s --bic %s %s" % (
                    args.tracks, outMod, args.evalBed, outBed, outBic,
                    " ".join(evalOpts))
                if not args.resume or not os.path.isfile(outBic) or \
                  os.path.getsize(outBic) < 2:
                    evalCmds.append(evalCmd)

    # run the training
    runParallelShellCommands(trainCmds, max(1, args.proc / trainProcs))

    # run the eval
    runParallelShellCommands(evalCmds, args.proc)

    # make the table header
    tableFile = open(os.path.join(args.outDir, "bictable.csv"), "w")
    stateColName = "states"
    if args.numIter is True:
        stateColName = "iter"
    elif args.numReps is True:
        stateColName = "reps"
    tableFile.write("trainFile, trainSize, %s, meanBic, minBic, maxBic" %
                    stateColName)
    for i in xrange(args.reps):
        tableFile.write(", bic.%d" % i)
    tableFile.write("\n")

    # make the table body
    prevSize = -1
    sameSizeCount = 0
    for trainingSize, trainingBed in zip(trainingSizes, trainingBeds):
        # hack to take into account we may have different inputs with the
        # same size, so their corresponding results need unique filenames
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        for numStates in states:
            bics = []
            printBics = []
            for rep in xrange(args.reps):
                outMod = os.path.join(args.outDir, "hmm_%d.%d.%d.%d.mod" % (
                    trainingSize, sameSizeCount, int(numStates), int(rep)))
                outBic = outMod.replace(".mod", ".bic")
                try:
                    with open(outBic, "r") as obFile:
                        for line in obFile:
                            bic = float(line.split()[0])
                            break
                    bics.append(bic)
                    printBics.append(bic)
                except:
                    logger.warning("Couldn't find bic %s" % outBic)
                    printBics.append("ERROR")
            # write row
            tableFile.write("%s, %d, %d" % (trainingBed, int(trainingSize),
                                            int(numStates)))
            if len(bics) > 0:
                tableFile.write(", %f, %f, %f" % (np.mean(bics),
                                                  np.min(bics),
                                                  np.max(bics)))
            else:
                tableFile.write(", ERROR, ERROR, ERROR")
            for pb in printBics:
                tableFile.write(", %s" % pb)
            tableFile.write("\n")
    tableFile.close()
    cleanBedTool(tempBedToolPath)
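# For reference, the statistic tabulated above is the standard Bayesian
# Information Criterion (the exact computation is delegated to
# teHmmEval.py --bic):
#   BIC = k * ln(n) - 2 * ln(L)
# where k is the number of free model parameters, n the number of
# observations, and L the maximized likelihood; lower values are better.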
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Fix up track names and sort alphabetically. Easier to"
        " do here on the xml than at the end for the paper.")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("outTracksInfo", help="Path to write modified tracks"
                        " XML")
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()
    args.logOpString = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        args.logOpString += " --logFile %s" % args.logFile

    nm = dict()
    nm["hollister"] = "RM-RepBase-Hollister"
    nm["chaux"] = "RM-RepBase-deLaChaux"
    nm["repeat_modeler"] = "RM-RepeatModeler"
    nm["repbase"] = "RM-RepBase"
    nm["repet"] = "REPET"
    nm["ltr_finder"] = "LTR_FINDER"
    nm["ltr_harvest"] = "LTR_Harvest"
    nm["ltr_termini"] = "lastz-Termini"
    nm["lastz-Termini"] = "lastz-LTRTermini"
    nm["tir_termini"] = "lastz-InvTermini"
    nm["irf"] = "IRF"
    nm["palindrome"] = "lastz-Palindrome"
    nm["overlap"] = "lastz-Overlap"
    nm["mitehunter"] = "MITE-Hunter"
    nm["helitronscanner"] = "HelitronScanner"
    nm["cov_80-"] = "lastz-SelfLowId"
    nm["cov_80-90"] = "lastz-SelfMedId"
    nm["cov_90+"] = "lastz-SelfHighId"
    nm["left_peak_80-"] = "lastz-SelfPeakLeftLow"
    nm["lastz-SelfLowLeftPeak"] = nm["left_peak_80-"]
    nm["left_peak_80-90"] = "lastz-SelfPeakLeftMed"
    nm["lastz-SelfMedLeftPeak"] = nm["left_peak_80-90"]
    nm["left_peak_90+"] = "lastz-SelfPeakLeftHigh"
    nm["lastz-SelfHighLeftPeak"] = nm["left_peak_90+"]
    nm["right_peak_80-"] = "lastz-SelfPeakRightLow"
    nm["lastz-SelfLowRightPeak"] = nm["right_peak_80-"]
    nm["right_peak_80-90"] = "lastz-SelfPeakRightMed"
    nm["lastz-SelfMedRightPeak"] = nm["right_peak_80-90"]
    nm["right_peak_90+"] = "lastz-SelfPeakRightHigh"
    nm["lastz-SelfHighRightPeak"] = nm["right_peak_90+"]
    nm["cov_maxPId"] = "lastz-SelfPctMaxId"
    nm["lastz-SelfMaxPctId"] = nm["cov_maxPId"]
    nm["te_domains"] = "TE-Domains"
    nm["fgenesh"] = "Genes"
    nm["genes"] = nm["fgenesh"]
    nm["refseq"] = nm["fgenesh"]
    nm["mrna"] = "mRNA"
    nm["srna"] = "sRNA"
    nm["ortho_depth"] = "Alignment-Depth"
    nm["orthology"] = nm["ortho_depth"]
    nm["chain_depth"] = nm["ortho_depth"]
    nm["alignment_depth"] = nm["ortho_depth"]
    nm["gcpct"] = "GC"
    nm["trf"] = "TRF"
    nm["windowmasker"] = "WindowMasker"
    nm["polyN"] = "Ns"
    nm["phastcons_ce"] = "Conservation"
    nm["phastcons"] = nm["phastcons_ce"]
    nm["PhastCons"] = nm["phastcons_ce"]
    nm["phyloP"] = nm["phastcons_ce"]
    nm["phylop"] = nm["phastcons_ce"]

    rtracks = dict()
    rtracks["tantan"] = True
    rtracks["polyA"] = True
    rtracks["transposon_psi"] = True
    rtracks["transposonpsi"] = True
    rtracks["repbase_censor"] = True
    rtracks["tsd"] = True
    rtracks["repbase_default"] = True
    rtracks["dustmasker"] = True

    inTracks = TrackList(args.tracksInfo)
    outTracks = TrackList()
    outList = []

    for track in itertools.chain(inTracks.trackList, inTracks.maskTrackList):
        if not os.path.exists(track.path):
            raise RuntimeError("Track DNE %s" % track.path)
        if track.name not in rtracks:
            if track.name in nm:
                track.name = nm[track.name]
            else:
                logger.warning("Did not map track %s" % track.name)
            outList.append(track)
        else:
            logger.warning("Deleted track %s" % track.name)

    # sort the list
    def sortComp(x):
        lname = x.name.lower()
        if x.name == "RM-RepeatModeler":
            return "aaaaa" + lname
        elif "RM" in x.name:
            return "aaaa" + lname
        elif "REPET" in x.name:
            return "aaa" + lname
        elif "softmask" in lname or "tigr" in lname or "te-domains" in lname:
            return "aa" + lname
        elif x.getDist() == "mask":
            return "zzzz" + lname
        else:
            return lname

    outList = sorted(outList, key=lambda track: sortComp(track))
    for track in outList:
        outTracks.addTrack(track)

    outTracks.saveXML(args.outTracksInfo)
    cleanBedTool(tempBedToolPath)
def main(argv=None):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Thin wrapper of teHmmTrain.py and teHmmEval.py "
        "to generate a table of Number-of-HMM-states VS BIC. Lower BIC"
        " is better")

    parser.add_argument("tracks", help="tracks xml used for training and eval")
    parser.add_argument("trainingBeds", help="comma-separated list of training regions"
                        " (training region size will be a variable in output table). "
                        "if segmentation is activated, these must also be the "
                        "segmented beds...")
    parser.add_argument("evalBed", help="eval region")
    parser.add_argument("trainOpts", help="all teHmmTrain options in quotes")
    parser.add_argument("evalOpts", help="all teHmmEval options in quotes")
    parser.add_argument("states", help="comma-separated list of numbers of states"
                        " to try")
    parser.add_argument("outDir", help="output directory")
    parser.add_argument("--reps", help="number of replicates", type=int,
                        default=1)
    parser.add_argument("--proc", help="maximum number of processors to use"
                        " in parallel", type=int, default=1)
    parser.add_argument("--resume", help="try not to rewrite existing files",
                        action="store_true", default=False)
    parser.add_argument("--initTrans", help="the states argument is overridden"
                        " to specify a list of transition initialization files "
                        "instead of state numbers", action="store_true",
                        default=False)
    parser.add_argument("--numReps", help="the states argument is overridden"
                        " to specify a list of replicate numbers (--reps)"
                        " arguments", action="store_true", default=False)
    parser.add_argument("--numIter", help="the states argument is overridden"
                        " to specify a list of iteration counts (--iter)"
                        " arguments", action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    if sum([int(i) for i in [args.initTrans, args.numReps, args.numIter]]) > 1:
        raise RuntimeError("only one of {--initTrans, --numReps, --numIter} "
                           "can be used at a time")

    if not os.path.isdir(args.outDir):
        runShellCommand("mkdir %s" % args.outDir)

    # get the sizes of the training beds
    trainingSizes = []
    trainingBeds = []
    for tb in args.trainingBeds.split(","):
        if len(tb) > 0:
            trainingBeds.append(tb)
    for bed in trainingBeds:
        assert os.path.isfile(bed)
        bedLen = 0
        for interval in readBedIntervals(bed):
            bedLen += interval[2] - interval[1]
        trainingSizes.append(bedLen)

    # make sure --bed not in teHmmEval options and --numStates not in train
    # options
    trainOpts = args.trainOpts.split()
    if "--numStates" in args.trainOpts and not args.numReps and not args.numIter:
        nsIdx = trainOpts.index("--numStates")
        assert nsIdx < len(trainOpts) - 1
        del trainOpts[nsIdx]
        del trainOpts[nsIdx]
    if "--initTransProbs" in args.trainOpts:
        tpIdx = trainOpts.index("--initTransProbs")
        assert tpIdx < len(trainOpts) - 1
        del trainOpts[tpIdx]
        del trainOpts[tpIdx]
    trainProcs = 1
    if "--numThreads" in args.trainOpts:
        npIdx = trainOpts.index("--numThreads")
        assert npIdx < len(trainOpts) - 1
        trainProcs = int(trainOpts[npIdx + 1])
    segOptIdx = -1
    if "--segment" in args.trainOpts:
        segIdx = trainOpts.index("--segment")
        assert segIdx < len(trainOpts) - 1
        segOptIdx = segIdx + 1
    if args.numReps and "--reps" in args.trainOpts:
        repsIdx = trainOpts.index("--reps")
        assert repsIdx < len(trainOpts) - 1
        del trainOpts[repsIdx]
        del trainOpts[repsIdx]
    if args.numIter and "--iter" in args.trainOpts:
        iterIdx = trainOpts.index("--iter")
        assert iterIdx < len(trainOpts) - 1
        del trainOpts[iterIdx]
        del trainOpts[iterIdx]
    evalOpts = args.evalOpts.split()
    if "--bed" in args.evalOpts:
        bedIdx = evalOpts.index("--bed")
        assert bedIdx < len(evalOpts) - 1
        del evalOpts[bedIdx]
        del evalOpts[bedIdx]
    if "--bic" in args.evalOpts:
        bicIdx = evalOpts.index("--bic")
        assert bicIdx < len(evalOpts) - 1
        del evalOpts[bicIdx]
        del evalOpts[bicIdx]

    # hack in support for --initTrans option by munging out model sizes
    # from the text files
    if args.initTrans is True:
        transFiles = args.states.split(",")
        states = []
        for tf in transFiles:
            stateSet = set()
            with open(tf) as f:
                for line in f:
                    toks = line.split()
                    if len(toks) > 1 and toks[0][0] != "#":
                        stateSet.add(toks[0])
                        stateSet.add(toks[1])
            states.append(len(stateSet))
    else:
        states = args.states.split(",")

    trainCmds = []
    evalCmds = []
    prevSize = -1
    sameSizeCount = 0
    for trainingSize, trainingBed in zip(trainingSizes, trainingBeds):
        # hack to take into account we may have different inputs with the
        # same size, so their corresponding results need unique filenames
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        for numStates in states:
            for rep in xrange(args.reps):
                outMod = os.path.join(args.outDir, "hmm_%d.%d.%d.%d.mod" % (
                    trainingSize, sameSizeCount, int(numStates), int(rep)))
                if segOptIdx != -1:
                    trainOpts[segOptIdx] = trainingBed
                if args.initTrans is True:
                    statesOpt = "--initTransProbs %s" % transFiles[states.index(numStates)]
                elif args.numIter is True:
                    # states argument overridden by iterations
                    statesOpt = "--iter %d" % int(numStates)
                elif args.numReps is True:
                    # states argument overridden by reps
                    statesOpt = "--reps %d" % int(numStates)
                else:
                    statesOpt = "--numStates %d" % int(numStates)
                trainCmd = "teHmmTrain.py %s %s %s %s %s" % (
                    args.tracks, trainingBed, outMod,
                    " ".join(trainOpts), statesOpt)
                if not args.resume or not os.path.isfile(outMod) or \
                   os.path.getsize(outMod) < 100:
                    trainCmds.append(trainCmd)

                outBic = outMod.replace(".mod", ".bic")
                outBed = outMod.replace(".mod", "_eval.bed")
                evalCmd = "teHmmEval.py %s %s %s --bed %s --bic %s %s" % (
                    args.tracks, outMod, args.evalBed, outBed, outBic,
                    " ".join(evalOpts))
                if not args.resume or not os.path.isfile(outBic) or \
                   os.path.getsize(outBic) < 2:
                    evalCmds.append(evalCmd)

    # run the training
    runParallelShellCommands(trainCmds, max(1, args.proc / trainProcs))
    # run the eval
    runParallelShellCommands(evalCmds, args.proc)

    # make the table header
    tableFile = open(os.path.join(args.outDir, "bictable.csv"), "w")
    stateColName = "states"
    if args.numIter is True:
        stateColName = "iter"
    elif args.numReps is True:
        stateColName = "reps"
    tableFile.write("trainFile, trainSize, %s, meanBic, minBic, maxBic" %
                    stateColName)
    for i in xrange(args.reps):
        tableFile.write(", bic.%d" % i)
    tableFile.write("\n")

    # make the table body
    prevSize = -1
    sameSizeCount = 0
    for (trainingSize, trainingBed) in zip(trainingSizes, trainingBeds):
        # hack to take into account we may have different inputs with the
        # same size, so their corresponding results need unique filenames
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        for numStates in states:
            bics = []
            printBics = []
            for rep in xrange(args.reps):
                outMod = os.path.join(args.outDir, "hmm_%d.%d.%d.%d.mod" % (
                    trainingSize, sameSizeCount, int(numStates), int(rep)))
                outBic = outMod.replace(".mod", ".bic")
                try:
                    with open(outBic, "r") as obFile:
                        for line in obFile:
                            bic = float(line.split()[0])
                            break
                    bics.append(bic)
                    printBics.append(bic)
                except:
                    logger.warning("Couldn't find bic %s" % outBic)
                    printBics.append("ERROR")
            # write row
            tableFile.write("%s, %d, %d" % (trainingBed, int(trainingSize),
                                            int(numStates)))
            if len(bics) > 0:
                tableFile.write(", %f, %f, %f" % (np.mean(bics), np.min(bics),
                                                  np.max(bics)))
            else:
                tableFile.write(", ERROR, ERROR, ERROR")
            for pb in printBics:
                tableFile.write(", %s" % pb)
            tableFile.write("\n")
    tableFile.close()
    cleanBedTool(tempBedToolPath)
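# The sketch below is not part of the original tools: it is a minimal example
# of consuming the bictable.csv written above, picking the row (state count,
# iteration count, or rep count, depending on the header) with lowest mean BIC.
def bestRowFromBicTable(tablePath):
    """ return (statesColumnValue, meanBic) for the row with lowest mean BIC """
    best = None
    with open(tablePath) as f:
        f.readline()  # skip the header row written by main() above
        for line in f:
            toks = [t.strip() for t in line.split(",")]
            # rows whose BIC files could not be read carry ERROR placeholders
            if len(toks) < 4 or toks[3] == "ERROR":
                continue
            meanBic = float(toks[3])
            if best is None or meanBic < best[1]:
                best = (toks[2], meanBic)
    return best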
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Fill in masked intervals of an hmm prediction "
        "(from teHmmEval.py) with state corresponding to surrounding"
        " intervals.")
    parser.add_argument("tracksXML", help="XML track list (used to id masking"
                        " tracks)")
    parser.add_argument("allBed", help="Target scope. Masked intervals outside"
                        " of these regions will not be included")
    parser.add_argument("inBed", help="TE prediction BED file. State labels"
                        " should probably be mapped (ie with fitStateNames.py)")
    parser.add_argument("outBed", help="Output BED. Will be equivalent to"
                        " the input bed except all gaps corresponding to "
                        "masked intervals will be filled")
    parser.add_argument("--maxLen", help="Maximum length of a masked interval"
                        " to fill (inclusive). Use --delMask option with same"
                        " value if running compareBedStates.py after.",
                        type=int, default=sys.maxint)
    parser.add_argument("--default", help="Default label to give to masked "
                        "region if no label can be determined", default="0")
    parser.add_argument("--tgts", help="Only relabel gaps that "
                        "are flanked on both sides by the same state, and this state"
                        " is in this comma-separated list. --default used for other"
                        " gaps. If no targets specified then all states checked.",
                        default=None)
    parser.add_argument("--oneSidedTgts", help="Only relabel gaps that "
                        "are flanked on at least one side by a state in this comma-"
                        "separated list. --default used for other gaps",
                        default=None)
    parser.add_argument("--onlyDefault", help="Add the default state (--default)"
                        " to all masked gaps no matter what, ie ignoring all other "
                        "logic", action="store_true", default=False)
    parser.add_argument("--cut", help="Cut out gaps for masked tracks from the input."
                        " By default, the input is expected to come from the HMM "
                        "with mask intervals already absent, and will crash with"
                        " an assertion error if an overlap is detected.",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # make sets
    tgtSet = set()
    if args.tgts is not None:
        tgtSet = set(args.tgts.split(","))
    oneSidedTgtSet = set()
    if args.oneSidedTgts is not None:
        oneSidedTgtSet = set(args.oneSidedTgts.split(","))
    assert len(tgtSet.intersection(oneSidedTgtSet)) == 0

    # read the track list
    trackList = TrackList(args.tracksXML)
    maskTracks = trackList.getMaskTracks()

    # read the input bed
    inBed = args.inBed
    if args.cut is True:
        inBed = cutOutMaskIntervals(inBed, -1, args.maxLen + 1, args.tracksXML)
    inputIntervals = readBedIntervals(inBed, ncol=4, sort=True)
    if args.cut is True:
        runShellCommand("rm -f %s" % inBed)
    if len(maskTracks) == 0 or len(inputIntervals) == 0:
        runShellCommand("cp %s %s" % (args.inBed, args.outBed))
        logger.warning("No mask tracks located in %s or"
                       " %s empty" % (args.tracksXML, args.inBed))
        return 0

    # make a temporary, combined, merged masking bed file
    tempMaskBed = getLocalTempPath("Temp_mb", ".bed")
    for maskTrack in maskTracks:
        assert os.path.isfile(maskTrack.getPath())
        runShellCommand("cat %s | setBedCol.py 3 mask | awk \'{print $1\"\t\"$2\"\t\"$3}\' >> %s" % (
            maskTrack.getPath(), tempMaskBed))
    maskedIntervals = getMergedBedIntervals(tempMaskBed, sort=True)
    resolvedMasks = 0

    if len(inputIntervals) == 0:
        logger.warning("No mask tracks located in %s" % args.tracksXML)
        return
    inputIdx = 0
    rightFlank = inputIntervals[inputIdx]

    tempOutMask = getLocalTempPath("Temp_om", ".bed")
    tempOutMaskFile = open(tempOutMask, "w")

    for maskIdx, maskInterval in enumerate(maskedIntervals):
        if maskInterval[2] - maskInterval[1] > args.maxLen:
            continue
        # find candidate right flank
        while rightFlank < maskInterval:
            if inputIdx == len(inputIntervals) - 1:
                rightFlank = None
                break
            else:
                inputIdx += 1
                rightFlank = inputIntervals[inputIdx]

        # candidate left flank
        leftFlank = None
        if inputIdx > 0:
            leftFlank = inputIntervals[inputIdx - 1]

        # identify flanking states if the intervals perfectly abut
        leftState = None
        if leftFlank is not None:
            if leftFlank[0] == maskInterval[0] and leftFlank[2] == maskInterval[1]:
                leftState = str(leftFlank[3])
            else:
                assert intersectSize(leftFlank, maskInterval) == 0
        rightState = None
        if rightFlank is not None:
            if rightFlank[0] == maskInterval[0] and rightFlank[1] == maskInterval[2]:
                rightState = str(rightFlank[3])
            else:
                assert intersectSize(rightFlank, maskInterval) == 0

        # choose a state for the mask interval
        maskState = str(args.default)
        if args.onlyDefault is True:
            pass
        elif leftState is not None and leftState == rightState:
            if len(tgtSet) == 0 or leftState in tgtSet:
                maskState = leftState
        elif leftState in oneSidedTgtSet:
            maskState = leftState
        elif rightState in oneSidedTgtSet:
            maskState = rightState

        # write our mask interval
        tempOutMaskFile.write("%s\t%d\t%d\t%s\n" % (maskInterval[0],
                                                    maskInterval[1],
                                                    maskInterval[2],
                                                    maskState))

    tempOutMaskFile.close()
    tempMergePath1 = getLocalTempPath("Temp_mp", ".bed")
    tempMergePath2 = getLocalTempPath("Temp_mp", ".bed")
    runShellCommand("cp %s %s ; cat %s >> %s" % (args.inBed, tempMergePath1,
                                                 tempOutMask, tempMergePath1))
    runShellCommand("cat %s | sortBed > %s" % (tempMergePath1, tempMergePath2))
    tempScopePath = getLocalTempPath("temp_all", ".bed")
    runShellCommand("mergeBed -i %s | sortBed > %s" % (args.allBed,
                                                       tempScopePath))
    runShellCommand("intersectBed -a %s -b %s > %s" % (tempMergePath2,
                                                       tempScopePath,
                                                       args.outBed))

    runShellCommand("rm -f %s" % " ".join([tempMaskBed, tempOutMask,
                                           tempMergePath1, tempMergePath2,
                                           tempScopePath]))
    cleanBedTool(tempBedToolPath)
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Make some tables of statistics from a BED file. All"
        " output will be written in one big CSV table to be viewed in a "
        "spreadsheet.")
    parser.add_argument("inBed", help="Input bed file")
    parser.add_argument("outCsv", help="Path to write output in CSV format")
    parser.add_argument("--ignore", help="Comma-separated list of names"
                        " to ignore", default="")
    parser.add_argument("--numBins", help="Number of (linear) bins for "
                        "histograms", type=int, default=10)
    parser.add_argument("--logHist", help="Apply log-transform to data for "
                        "histogram", action="store_true", default=False)
    parser.add_argument("--histRange", help="Histogram range as comma-"
                        "separated pair of numbers", default=None)
    parser.add_argument("--noHist", help="Skip histograms",
                        action="store_true", default=False)
    parser.add_argument("--noScore", help="Just do length stats",
                        action="store_true", default=False)
    parser.add_argument("--noLen", help="Just do score stats",
                        action="store_true", default=False)
    parser.add_argument("--nearness", help="Compute nearness stats (instead "
                        "of normal stats) of input bed with given BED. Output"
                        " will be a BED instead of CSV, with nearness in the "
                        "score position", default=None)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    if args.histRange is not None:
        args.histRange = args.histRange.split(",")
        assert len(args.histRange) == 2
        args.histRange = int(args.histRange[0]), int(args.histRange[1])

    outFile = open(args.outCsv, "w")
    args.ignoreSet = set(args.ignore.split(","))

    intervals = readBedIntervals(args.inBed, ncol=5,
                                 sort=args.nearness is not None)
    csvStats = ""

    # nearness stats
    if args.nearness is not None:
        args.noScore = True
        csvStats = makeNearnessBED(intervals, args)
    # length stats
    elif args.noLen is False:
        csvStats = makeCSV(intervals, args, lambda x: int(x[2]) - int(x[1]),
                           "Length")
    # score stats
    try:
        if args.noScore is False:
            csvStats += "\n" + makeCSV(intervals, args, lambda x: float(x[4]),
                                       "Score")
            csvStats += "\n" + makeCSV(intervals, args,
                                       lambda x: float(x[4]) * (
                                           float(x[2]) - float(x[1])),
                                       "Score*Length")
    except Exception as e:
        logger.warning("Couldn't make score stats because %s" % str(e))

    outFile.write(csvStats)
    outFile.write("\n")
    outFile.close()
    cleanBedTool(tempBedToolPath)
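# Note (illustrative, not in the original): makeCSV builds each section from a
# per-interval extractor over 5-column BED tuples, so further statistics could
# be appended the same way, e.g. a hypothetical midpoint section:
#   csvStats += "\n" + makeCSV(intervals, args,
#                              lambda x: (int(x[1]) + int(x[2])) / 2.,
#                              "Midpoint")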
def runTsd(args, tempTracksInfo):
    """ run addTsdTrack on termini and chaux to generate tsd track """
    if args.noTsd is True:
        return

    origTrackList = TrackList(args.tracksInfo)
    outTrackList = TrackList(tempTracksInfo)

    tempFiles = []
    tsdInputFiles = []
    tsdInputTracks = []

    # preprocess termini
    lastzTracks = [origTrackList.getTrackByName(args.ltr_termini),
                   origTrackList.getTrackByName(args.tir)]
    for terminiTrack in lastzTracks:
        if terminiTrack is not None:
            inFile = terminiTrack.getPath()
            fillFile = getLocalTempPath("Temp_fill", ".bed")
            tempBed = None
            if inFile[-3:] == ".bb":
                tempBed = getLocalTempPath("Temp_termini", ".bed")
                runShellCommand("bigBedToBed %s %s" % (inFile, tempBed))
                inFile = tempBed
            runShellCommand("fillTermini.py %s %s" % (inFile, fillFile))
            tsdInputFiles.append(fillFile)
            tsdInputTracks.append(terminiTrack.getName())
            tempFiles.append(fillFile)
            if tempBed is not None:
                runShellCommand("rm -f %s" % tempBed)
        else:
            logger.warning("Could not find termini track")

    # add repeat_modeler
    repeat_modelerTrack = outTrackList.getTrackByName(args.repeat_modeler)
    if repeat_modelerTrack is not None:
        tsdInputFiles.append(repeat_modelerTrack.getPath())
        tsdInputTracks.append(repeat_modelerTrack.getName())

    # run addTsdTrack (appending except first time)
    # note we override input track paths in each case
    assert len(tsdInputFiles) == len(tsdInputTracks)
    for i in xrange(len(tsdInputFiles)):
        optString = ""
        if i > 0:
            optString += " --append"
        # really rough hardcoded params based on
        # (A unified classification system for eukaryotic transposable
        # elements, Wicker et al. 2007)
        if tsdInputTracks[i] == args.repeat_modeler:
            optString += " --names LINE,SINE,Unknown"
            optString += " --maxScore 20"
            optString += " --left 20"
            optString += " --right 20"
            optString += " --min 5"
            optString += " --max 20"
            optString += " --overlap 20"
        elif tsdInputTracks[i] == args.ltr_termini:
            optString += " --maxScore 3"
            optString += " --left 8"
            optString += " --right 8"
            optString += " --min 3"
            optString += " --max 6"
        elif tsdInputTracks[i] == args.tir:
            optString += " --maxScore 3"
            optString += " --left 15"
            optString += " --right 15"
            optString += " --min 3"
            optString += " --max 12"

        tempXMLOut = getLocalTempPath("Temp_tsd_xml", ".xml")
        runShellCommand("addTsdTrack.py %s %s %s %s %s %s --inPath %s %s %s --numProc %d" % (
            tempTracksInfo, args.cleanTrackPath, tempXMLOut, tsdInputTracks[i],
            args.sequence, args.tsd, tsdInputFiles[i], optString,
            args.logOpString, args.numProc))
        runShellCommand("mv %s %s" % (tempXMLOut, tempTracksInfo))

    for i in xrange(len(tempFiles)):
        runShellCommand("rm %s" % tempFiles[i])
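# Summary of the hardcoded addTsdTrack parameter sets used above (values taken
# directly from the code; loosely based on Wicker et al. 2007):
#   track            maxScore  left  right  min  max  overlap
#   repeat_modeler         20    20     20    5   20       20
#   ltr_termini             3     8      8    3    6        -
#   tir                     3    15     15    3   12        -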
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Automatically set the scale attributes of numeric tracks"
        " within a given tracks.xml file using some simple heuristics.")
    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("allBed", help="Bed file spanning entire genome")
    parser.add_argument("outputTracks", help="Path to write modified tracks XML"
                        " to.")
    parser.add_argument("--numBins", help="Maximum number of bins after scaling",
                        default=10, type=int)
    parser.add_argument("--tracks", help="Comma-separated list of tracks "
                        "to process. If not set, all"
                        " tracks listed as having a multinomial distribution"
                        " (since this is the default value, this includes "
                        "tracks with no distribution attribute) or gaussian"
                        " distribution will be processed.", default=None)
    parser.add_argument("--skip", help="Comma-separated list of tracks to "
                        "skip.", default=None)
    parser.add_argument("--noLog", help="Never use log scaling",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    trackNames = []
    if args.tracks is not None:
        trackNames = args.tracks.split(",")
    skipNames = []
    if args.skip is not None:
        skipNames = args.skip.split(",")

    trackList = TrackList(args.tracksInfo)
    outTrackList = copy.deepcopy(trackList)

    allIntervals = getMergedBedIntervals(args.allBed)

    for track in trackList:
        trackExt = os.path.splitext(track.getPath())[1]
        isFasta = len(trackExt) >= 3 and trackExt[:3].lower() == ".fa"
        if track.getName() not in skipNames and\
           (track.getName() in trackNames or len(trackNames) == 0) and\
           (track.getDist() == "multinomial" or
            track.getDist() == "sparse_multinomial" or
            track.getDist() == "gaussian") and\
           not isFasta:
            try:
                setTrackScale(track, args.numBins, allIntervals, args.noLog)
            except ValueError as e:
                logger.warning("Skipping (non-numeric?) track %s due to: %s" % (
                    track.getName(), str(e)))

    trackList.saveXML(args.outputTracks)
    cleanBedTool(tempBedToolPath)
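# Hypothetical invocation of the scaling script above (the script name and
# file paths here are illustrative assumptions, not taken from the original):
#   setTrackScaling.py tracks.xml genome.bed tracks_scaled.xml --numBins 15 --skip sequence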