def applyNamesToModel(stateMap, modelPath):
    """Rewrite a stored HMM model so that it uses new state names.

    This is currently a stub: the model at ``modelPath`` is loaded and its
    state name map is fetched, after which a RuntimeError is raised
    unconditionally.  ``stateMap`` is not consulted yet.
    """
    # the model is expected to have been created with teHmmTrain.py
    loadMessage = "loading model %s" % modelPath
    logger.debug(loadMessage)
    hmmModel = loadModel(modelPath)
    # fetched but not used until the renaming itself is implemented
    currentNames = hmmModel.getStateNameMap()
    raise RuntimeError("Not Implemented")
def extractTotalProb(benchDir, benchInputBedPath, args, repSuffix=""):
    """Return the last total log probability recorded in a benchmark model.

    The model file is looked up in ``benchDir``; its name is the base name of
    ``benchInputBedPath`` with the extension dropped, followed by ".mod" and
    ``repSuffix``.  ``args`` is accepted but not used here.
    """
    bedName = os.path.basename(benchInputBedPath)
    stem, _ = os.path.splitext(bedName)
    modelFile = stem + ".mod" + repSuffix
    trainedModel = loadModel(os.path.join(benchDir, modelFile))
    return trainedModel.getLastLogProb()
def extractProbRow(bed, header, args):
    """Return the total and Viterbi log probabilities for a prediction bed.

    The total probability is read from the model that belongs to ``bed``; the
    Viterbi probability is the third whitespace-separated token on the first
    line of ``bed``.

    bed:    path of the prediction bed file
    header: when true, return the column titles instead of the values (the
            model and bed file are still read and checked)
    args:   accepted but not used here

    Returns ["TotLogProb", "VitLogProb"] when ``header`` is true, otherwise
    [totalProb, vitProb].

    Raises AssertionError if the model file does not exist or the Viterbi
    probability is positive.
    """
    # note directory structure hardcoded in line below:
    modPath = bed.replace("predictions", "models")
    # drop everything from the last underscore onwards
    modPath = modPath[:modPath.rfind("_")]
    if ".mod" not in modPath:
        modPath += ".mod"
    assert os.path.exists(modPath)
    model = loadModel(modPath)
    totalProb = model.getLastLogProb()

    # only the first line is needed; the context manager guarantees the
    # handle is released (it used to be leaked on every call)
    with open(bed, "r") as bedFile:
        line0 = bedFile.readline()
    vitProb = float(line0.split()[2])
    assert vitProb <= 0.

    if header:
        return ["TotLogProb", "VitLogProb"]
    return [totalProb, vitProb]
def main(argv=None):
    """Write bootstrap transition and emission files from a trained model.

    Parses the command line, loads the model given as ``inputModel`` and
    passes it, along with a state name map and the parsed options, to
    writeTransitions() and writeEmissions().

    argv: complete command line with the program name at index 0; sys.argv
          is used when None.

    Raises RuntimeError when both --numAdd and --numTotal are non-zero.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Create starting transition and emission distributions "
        " that be used with teHmmTrain.py using the --initTransProbs and "
        "--initEmProbs options, respectively. The distributions will be "
        " derived by an already-trained model. This tool is written to"
        " allow combining supervised and unsupervised training. IE a "
        " supervised model is created (teHmmTrain.py with --supervised "
        " option). This tool can then be used to create the necessary "
        " files to bootstrap an unsupervised training run with a subset"
        " of the parameters.")

    parser.add_argument("inputModel", help="Path of input model to use "
                        "for bootstrap parameter creation")
    parser.add_argument("outTransProbs", help="File to write transition model"
                        " to (for use with teHmmTrain.py --initTransProbs and"
                        " --forceTransProbs)")
    parser.add_argument("outEmProbs", help="File to write emission model to"
                        " (for use with teHmmTrain.py --initEmProbs and "
                        " --forceEmProbs)")
    parser.add_argument("--ignore", help="comma-separated list of states to ignore from"
                        " inputModel", default=None)
    parser.add_argument("--numAdd", help="Number of \"unlabeled\" states to add"
                        " to the model.", default=0, type=int)
    parser.add_argument("--numTotal", help="Add unlabeled states such that"
                        " output model has given number of states. If input "
                        "model already has a greater number of states then"
                        " none added", default=0, type=int)
    parser.add_argument("--stp", help="Self-transition probality assigned to"
                        " added states.", default=0.9, type=float)
    parser.add_argument("--allTrans", help="By default only self-transitions"
                        " are written. Use this option to write entire "
                        "transition matrix (excluding ignroed states)",
                        default=False, action="store_true")

    addLoggingOptions(parser)
    # honour the argv parameter (it used to be ignored in favour of sys.argv)
    args = parser.parse_args(argv[1:])
    if args.numAdd != 0 and args.numTotal != 0:
        raise RuntimeError("--numAdd and --numTotal mutually exclusive")

    # load model created with teHmmTrain.py
    logger.info("loading model %s" % args.inputModel)
    model = loadModel(args.inputModel)

    # parse ignore states
    if args.ignore is None:
        args.ignore = set()
    else:
        args.ignore = set(args.ignore.split(","))

    # make sure we have a state name for every state (should really
    # be done within hmm...)
    stateMap = model.getStateNameMap()
    if stateMap is None:
        # no names stored in the model: name each state after its index
        stateMap = CategoryMap(reserved=0)
        for i in xrange(model.getEmissionModel().getNumStates()):
            stateMap.getMap(str(i), update=True)

    # write the transition probabilities
    writeTransitions(model, stateMap, args)

    # write the emission probabilities
    writeEmissions(model, stateMap, args)
def main(argv=None):
    """Derive starting parameters for teHmmTrain.py from a trained model.

    The model named by the ``inputModel`` argument is loaded, a state name
    map is obtained (or built from the state indexes when the model has
    none), and writeTransitions() / writeEmissions() are called with the
    model, the map and the parsed options.

    argv: full command line, program name first; falls back to sys.argv
          when None.

    Raises RuntimeError if --numAdd and --numTotal are both non-zero.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Create starting transition and emission distributions "
        " that be used with teHmmTrain.py using the --initTransProbs and "
        "--initEmProbs options, respectively. The distributions will be "
        " derived by an already-trained model. This tool is written to"
        " allow combining supervised and unsupervised training. IE a "
        " supervised model is created (teHmmTrain.py with --supervised "
        " option). This tool can then be used to create the necessary "
        " files to bootstrap an unsupervised training run with a subset"
        " of the parameters.")

    # positional arguments
    parser.add_argument(
        "inputModel",
        help="Path of input model to use "
        "for bootstrap parameter creation")
    parser.add_argument(
        "outTransProbs",
        help="File to write transition model"
        " to (for use with teHmmTrain.py --initTransProbs and"
        " --forceTransProbs)")
    parser.add_argument(
        "outEmProbs",
        help="File to write emission model to"
        " (for use with teHmmTrain.py --initEmProbs and "
        " --forceEmProbs)")

    # options
    parser.add_argument(
        "--ignore",
        help="comma-separated list of states to ignore from"
        " inputModel", default=None)
    parser.add_argument(
        "--numAdd",
        help="Number of \"unlabeled\" states to add"
        " to the model.", default=0, type=int)
    parser.add_argument(
        "--numTotal",
        help="Add unlabeled states such that"
        " output model has given number of states. If input "
        "model already has a greater number of states then"
        " none added", default=0, type=int)
    parser.add_argument(
        "--stp",
        help="Self-transition probality assigned to"
        " added states.", default=0.9, type=float)
    parser.add_argument(
        "--allTrans",
        help="By default only self-transitions"
        " are written. Use this option to write entire "
        "transition matrix (excluding ignroed states)",
        default=False, action="store_true")

    addLoggingOptions(parser)
    # parse the argv that was handed in rather than silently using sys.argv
    args = parser.parse_args(argv[1:])
    if args.numAdd != 0 and args.numTotal != 0:
        raise RuntimeError("--numAdd and --numTotal mutually exclusive")

    # load model created with teHmmTrain.py
    logger.info("loading model %s" % args.inputModel)
    model = loadModel(args.inputModel)

    # turn the comma-separated ignore list into a set of state names
    args.ignore = (set(args.ignore.split(","))
                   if args.ignore is not None else set())

    # make sure we have a state name for every state (should really
    # be done within hmm...)
    stateMap = model.getStateNameMap()
    if stateMap is None:
        stateMap = CategoryMap(reserved=0)
        numStates = model.getEmissionModel().getNumStates()
        for stateIndex in xrange(numStates):
            stateMap.getMap(str(stateIndex), update=True)

    # transition probabilities first, then emission probabilities
    writeTransitions(model, stateMap, args)
    writeEmissions(model, stateMap, args)
def main(argv=None):
    """Evaluate a data set with a trained model and report its score.

    Loads the model and the tracks for the requested intervals, runs the
    decoder (Viterbi, or posterior decoding with --maxPost), prints the
    summed log score and optionally writes the decoded states (--bed),
    posterior / emission distributions (--pd / --ed) and a BIC score (--bic).
    When --chroms is given the work is handed to parallelDispatch() instead.

    argv: complete command line with the program name at index 0; sys.argv
          is used when None.

    Returns 0 after a --chroms dispatch, otherwise None.

    Raises RuntimeError for incompatible option combinations, for
    --maxPost on a CFG model, and when no intervals can be read.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Evaluate a given data set with a trained HMM. Display"
        " the log probability of the input data given the model, and "
        "optionally output the most likely sequence of hidden states.")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("inputModel", help="Path of hmm created with"
                        "teHmmTrain.py")
    parser.add_argument("bedRegions", help="Intervals to process")
    parser.add_argument("--bed", help="path of file to write viterbi "
                        "output to (most likely sequence of hidden states)",
                        default=None)
    parser.add_argument("--numThreads", help="Number of threads to use (only"
                        " applies to CFG parser for the moment)",
                        type=int, default=1)
    parser.add_argument("--slice", help="Make sure that regions are sliced"
                        " to a maximum length of the given value. Most "
                        "useful when model is a CFG to keep memory down. "
                        "When 0, no slicing is done",
                        type=int, default=0)
    parser.add_argument("--segment", help="Use the intervals in bedRegions"
                        " as segments which each count as a single column"
                        " for evaluattion. Note the model should have been"
                        " trained with the --segment option pointing to this"
                        " same bed file.", action="store_true", default=False)
    parser.add_argument("--segLen", help="Effective segment length used for"
                        " normalizing input segments (specifying 0 means no"
                        " normalization applied)", type=int, default=0)
    parser.add_argument("--maxPost", help="Use maximum posterior decoding instead"
                        " of Viterbi for evaluation", action="store_true",
                        default=False)
    parser.add_argument("--pd", help="Output BED file for posterior distribution. Must"
                        " be used in conjunction with --pdStates (View on the "
                        "browser via bedGraphToBigWig)", default=None)
    parser.add_argument("--pdStates", help="comma-separated list of state names to use"
                        " for computing posterior distribution. For example: "
                        " --pdStates inside,LTR_left,LTR_right will compute the probability"
                        ", for each observation, that the hidden state is inside OR LTR_left"
                        " OR LTR_right. Must be used with --pd to specify output "
                        "file.", default=None)
    parser.add_argument("--bic", help="save Bayesian Information Criterion (BIC) score"
                        " in given file", default=None)
    parser.add_argument("--ed", help="Output BED file for emission distribution. Must"
                        " be used in conjunction with --edStates (View on the "
                        "browser via bedGraphToBigWig)", default=None)
    parser.add_argument("--edStates", help="comma-separated list of state names to use"
                        " for computing emission distribution. For example: "
                        " --edStates inside,LTR_left for each obsercation the probability "
                        " that inside emitted that observaiton plus the probabillity that"
                        " LTR_left emitted it. If more than one state is selected, this "
                        " is not a distribution, but a sum of distributions (and values"
                        " can exceed 1). Mostly for debugging purposes. Note output in LOG",
                        default=None)
    parser.add_argument("--chroms", help="list of chromosomes, or regions, to run in parallel"
                        " (in BED format). input regions will be intersected with each line"
                        " in this file, and the result will correspsond to an individual job",
                        default=None)
    parser.add_argument("--proc", help="number of processes (use in conjunction with --chroms)",
                        type=int, default=1)
    addLoggingOptions(parser)
    # honour the argv parameter (it used to be ignored in favour of sys.argv)
    args = parser.parse_args(argv[1:])
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # validate option combinations
    if args.slice <= 0:
        # 0 (or less) means "no slicing": use the largest possible length
        args.slice = sys.maxint
    elif args.segment is True:
        raise RuntimeError("--slice and --segment options are not compatible at "
                           "this time")
    if (args.pd is not None) ^ (args.pdStates is not None):
        raise RuntimeError("--pd requires --pdStates and vice versa")
    if (args.ed is not None) ^ (args.edStates is not None):
        raise RuntimeError("--ed requires --edStates and vice versa")
    if args.bed is None and (args.pd is not None or args.ed is not None):
        raise RuntimeError("Both --ed and --pd only usable in conjunction with"
                           " --bed")

    if args.chroms is not None:
        # hack to allow chroms argument to chunk and rerun
        parallelDispatch(argv, args)
        cleanBedTool(tempBedToolPath)
        return 0

    # load model created with teHmmTrain.py
    logger.info("loading model %s" % args.inputModel)
    model = loadModel(args.inputModel)

    if isinstance(model, MultitrackCfg):
        if args.maxPost is True:
            # was a misspelled exception name (NameError) citing a
            # non-existent --post option
            raise RuntimeError("--maxPost not supported on CFG models")

    # apply the effective segment length
    if args.segLen > 0:
        assert args.segment is True
        model.getEmissionModel().effectiveSegmentLength = args.segLen

    # read intervals from the bed file
    logger.info("loading target intervals from %s" % args.bedRegions)
    mergedIntervals = getMergedBedIntervals(args.bedRegions, ncol=4)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.bedRegions)

    # slice if desired
    choppedIntervals = list(slicedIntervals(mergedIntervals, args.slice))

    # read segment intervals
    segIntervals = None
    if args.segment is True:
        logger.info("loading segment intervals from %s" % args.bedRegions)
        segIntervals = readBedIntervals(args.bedRegions, sort=True)

    # load the input
    # read the tracks, while intersecting them with the given interval
    trackData = TrackData()
    # note we pass in the trackList that was saved as part of the model
    # because we do not want to generate a new one.
    logger.info("loading tracks %s" % args.tracksInfo)
    trackData.loadTrackData(args.tracksInfo, choppedIntervals,
                            model.getTrackList(),
                            segmentIntervals=segIntervals)

    # report which decoding algorithm is about to run
    if isinstance(model, MultitrackHmm):
        algname = "viterbi"
        if args.maxPost is True:
            algname = "posterior decoding"
        logger.info("running %s algorithm" % algname)
    elif isinstance(model, MultitrackCfg):
        logger.info("running CYK algorithm")

    vitOutFile = None
    if args.bed is not None:
        vitOutFile = open(args.bed, "w")
    totalScore = 0
    totalDatapoints = 0

    # Note: in general there's room to save on memory by only computing single
    # track table at once (just need to add table by table interface to hmm...)
    posteriors = [None] * trackData.getNumTrackTables()
    posteriorsFile = None
    posteriorsMask = None
    if args.pd is not None:
        posteriors = model.posteriorDistribution(trackData)
        posteriorsFile = open(args.pd, "w")
        posteriorsMask = getPosteriorsMask(args.pdStates, model)
        assert len(posteriors[0][0]) == len(posteriorsMask)
    emProbs = [None] * trackData.getNumTrackTables()
    emissionsFile = None
    emissionsMask = None
    if args.ed is not None:
        emProbs = model.emissionDistribution(trackData)
        emissionsFile = open(args.ed, "w")
        emissionsMask = getPosteriorsMask(args.edStates, model)
        assert len(emProbs[0][0]) == len(emissionsMask)

    decodeFunction = model.viterbi
    if args.maxPost is True:
        decodeFunction = model.posteriorDecode

    for i, (vitLogProb, vitStates) in enumerate(
            decodeFunction(trackData, numThreads=args.numThreads)):
        totalScore += vitLogProb
        # the i-th decode result belongs to the i-th track table; look it
        # up unconditionally so the datapoint count below is right even
        # when no --bed output was requested (needed by --bic)
        trackTable = trackData.getTrackTableList()[i]
        if args.bed is not None or args.pd is not None:
            if args.bed is not None:
                vitOutFile.write("#Viterbi Score: %f\n" % (vitLogProb))
            statesToBed(trackTable, vitStates, vitOutFile,
                        posteriors[i], posteriorsMask, posteriorsFile,
                        emProbs[i], emissionsMask, emissionsFile)
        totalDatapoints += len(vitStates) * trackTable.getNumTracks()

    print("Viterbi (log) score: %f" % totalScore)
    if isinstance(model, MultitrackHmm) and model.current_iteration is not None:
        print("Number of EM iterations: %d" % model.current_iteration)
    if args.bed is not None:
        vitOutFile.close()
    if posteriorsFile is not None:
        posteriorsFile.close()
    if emissionsFile is not None:
        emissionsFile.close()

    if args.bic is not None:
        # http://en.wikipedia.org/wiki/Bayesian_information_criterion
        lnL = float(totalScore)
        try:
            k = float(model.getNumFreeParameters())
        except Exception:
            # numFreeParameters still not done for semi-supervised
            # just pass through a 0 instead of crashing for now
            k = 0.0
        n = float(totalDatapoints)
        bic = -2.0 * lnL + k * (np.log(n) + np.log(2 * np.pi))
        numStates = model.getEmissionModel().getNumStates()
        numTracks = model.getEmissionModel().getNumTracks()
        with open(args.bic, "w") as bicFile:
            bicFile.write("%f\n" % bic)
            bicFile.write(
                "# = -2.0 * lnL + k * (lnN + ln(2 * np.pi))\n"
                "# where lnL=%f k=%d (%d states) N=%d (%d obs * %d tracks) lnN=%f\n" % (
                    lnL, int(k), numStates, int(totalDatapoints),
                    totalDatapoints / numTracks, numTracks, np.log(n)))

    cleanBedTool(tempBedToolPath)
def main(argv=None):
    """Rename the states (or one track) of a saved model.

    Loads ``inputModel``, builds a list of new state names either from
    --newNames or from --teNumbers (optionally ordered by coverage taken
    from --sizes), stores them in the model as a new CategoryMap and saves
    the result to ``outputModel``.  With --bed, the intervals of that file
    are printed to stdout with their fourth column renamed, merging
    adjacent same-name intervals unless --noMerge is set.  With
    --changeTrackName only a single track is renamed.

    argv: complete command line with the program name at index 0; sys.argv
          is used when None.

    Returns 0 after a --changeTrackName run, otherwise None.

    Raises AssertionError when input and output paths are equal, when not
    exactly one of --newNames / --teNumbers is given, or when the number of
    names does not match the number of states in the model.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Rename HMM states.")

    parser.add_argument("inputModel", help="Path of teHMM model created with"
                        " teHmmTrain.py")
    parser.add_argument("outputModel", help="Path of model with renamed states")
    parser.add_argument(
        "--newNames", help="comma-separated list of state names to"
        " apply. This list must have exactly the same number of"
        " states as the model. The ith name in the list will be "
        "assigned to the ith name of the model...", default=None)
    parser.add_argument(
        "--teNumbers", help="comma-separated list of state numbers"
        " that will be assigned TE states, with everything else"
        " assigned Other. This is less flexible but maybe more"
        " convenient at times than --newNames.", default=None)
    parser.add_argument("--bed", help="apply naming to bed file and print "
                        "results to stdout", default=None)
    parser.add_argument(
        "--sizes", help="bedFile to use for computing state numbering"
        " by using decreasing order in total coverage (only works"
        " with --teNumbers)", default=None)
    parser.add_argument("--noMerge", help="dont merge adjacent intervals with same"
                        " name with --bed option", action="store_true",
                        default=False)
    parser.add_argument(
        "--changeTrackName", help="dont do anything else, just change"
        " the name of one track. specified value should be of form"
        " currentNAme, newName", default=None)

    # honour the argv parameter (it used to be ignored in favour of sys.argv)
    args = parser.parse_args(argv[1:])
    assert args.inputModel != args.outputModel

    # load model created with teHmmTrain.py
    model = loadModel(args.inputModel)

    # trackChangeName logic hacked in completely separate from everything else
    if args.changeTrackName is not None:
        oldName, newName = args.changeTrackName.split(",")
        track = model.getTrackList().getTrackByName(oldName)
        track.setName(newName)
        saveModel(args.outputModel, model)
        return 0

    # exactly one naming scheme must be selected
    assert (args.newNames is None) != (args.teNumbers is None)

    # names manually specified
    if args.newNames is not None:
        names = args.newNames.split(",")

    # names computed using simple scheme from set of "TE" state numbers (as found from
    # log output of fitStateNames.py)
    elif args.teNumbers is not None:
        teNos = set(int(x) for x in args.teNumbers.split(","))
        teCount, otherCount = 0, 0
        numStates = model.getEmissionModel().getNumStates()

        # re-order from sizing info
        if args.sizes is not None:
            bedIntervals = readBedIntervals(args.sizes, ncol=4)
            # total number of bases covered by each state number
            sizeMap = defaultdict(int)
            for interval in bedIntervals:
                sizeMap[int(interval[3])] += interval[2] - interval[1]
            stateNumbers = sorted(xrange(numStates),
                                  reverse=True, key=lambda x: sizeMap[x])
        else:
            stateNumbers = list(xrange(numStates))

        # counters are handed out in stateNumbers order, names are stored
        # at the state's own index
        names = [""] * numStates
        for i in stateNumbers:
            if i in teNos:
                name = "TE-%.2d" % teCount
                teCount += 1
            else:
                name = "Other-%.2d" % otherCount
                otherCount += 1
            names[i] = name
        assert teCount == len(teNos) and teCount + otherCount == len(names)

    assert len(names) == model.getEmissionModel().getNumStates()

    # throw names in the mapping object and stick into model
    catMap = CategoryMap(reserved=0)
    for name in names:
        catMap.getMap(name, update=True)
    model.stateNameMap = catMap

    # save model
    saveModel(args.outputModel, model)

    # process optional bed file
    if args.bed is not None:
        prevInterval = None
        bedIntervals = readBedIntervals(args.bed, ncol=4)
        for interval in bedIntervals:
            oldName = interval[3]
            newName = names[int(oldName)]
            newInterval = list(interval)
            newInterval[3] = newName
            if args.noMerge:
                # write interval
                print("\t".join(str(x) for x in newInterval))
            else:
                if prevInterval is None:
                    # update prev interval first time
                    prevInterval = newInterval
                elif newInterval[3] == prevInterval[3] and\
                     newInterval[0] == prevInterval[0] and\
                     newInterval[1] == prevInterval[2]:
                    # glue onto prev interval
                    prevInterval[2] = newInterval[2]
                else:
                    # write and update prev
                    print("\t".join(str(x) for x in prevInterval))
                    prevInterval = newInterval
        # flush the interval still being accumulated
        if prevInterval is not None:
            print("\t".join(str(x) for x in prevInterval))
def main(argv=None):
    """Print the parameters of a saved model and optionally write plots.

    The model is loaded and printed.  --nameMap additionally prints the
    value maps of every track, --ec / --ecn / --pca / --hm write emission
    plots and --t writes the transition graph.

    argv: complete command line with the program name at index 0; sys.argv
          is used when None.

    Raises RuntimeError when a plot option is given while ``canPlot`` is
    False.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Print out paramaters of a teHMM")

    parser.add_argument("inputModel", help="Path of teHMM model created with"
                        " teHmmTrain.py")
    parser.add_argument("--nameMap", help="Print out name map tables",
                        action="store_true", default=False)
    parser.add_argument("--ec", help="Print emission distribution clusterings"
                        " to given file in PDF format", default=None)
    parser.add_argument("--ecn", help="Like --ec option but only print non"
                        " numeric tracks", default=None)
    parser.add_argument("--pca", help="Print emission pca scatters"
                        " to given file in PDF format", default=None)
    parser.add_argument("--hm", help="Print heatmap of emission distribution means"
                        " for (only) numeric tracks", default=None)
    parser.add_argument("--t", help="Print transition matrix to given"
                        " file in GRAPHVIZ DOT format. Convert to PDF with "
                        " dot <file> -Tpdf > <outFile>", default=None)
    parser.add_argument("--minTP", help="Minimum tranisition probability "
                        "to include in transition matrix output from --t option.",
                        type=float, default=EPSILON)
    parser.add_argument("--minTPns", help="Minimum transition probability after "
                        "self transition is normalized out (ie after dividing by 1-self)",
                        type=float, default=EPSILON)
    parser.add_argument("--teStates", help="comma-separated list of state names"
                        " to consider TE-1, TE-2, ... etc", default=None)

    # honour the argv parameter (it used to be ignored in favour of sys.argv)
    args = parser.parse_args(argv[1:])

    # load model created with teHmmTrain.py
    model = loadModel(args.inputModel)

    if args.teStates is not None:
        args.teStates = set(args.teStates.split(","))

    # crappy print method
    print(model)

    if args.nameMap is True:
        print("State Maps:")
        trackList = model.trackList
        if trackList is None:
            print("TrackList: None")
        else:
            for track in trackList:
                print("Track: %s" % track.getName())
                print(" map %s " % track.getValueMap().catMap)
                print(" pam %s " % track.getValueMap().catMapBack)

    # every emission plot needs plotting support; check once up front
    # instead of repeating the same guard before each writer
    plotOptions = (args.ec, args.ecn, args.pca, args.hm)
    if canPlot is False and any(opt is not None for opt in plotOptions):
        raise RuntimeError("Unable to write plots. Maybe matplotlib is "
                           "not installed?")

    if args.ec is not None:
        writeEmissionClusters(model, args, False)

    if args.ecn is not None:
        writeEmissionClusters(model, args, True)

    if args.pca is not None:
        writeEmissionScatters(model, args)

    if args.hm is not None:
        writeEmissionHeatMap(model, args)

    if args.t is not None:
        writeTransitionGraph(model, args)
def main(argv=None):
    """Apply new state names to a saved model and, optionally, a bed file.

    The model at ``inputModel`` is loaded and given a new CategoryMap of
    state names, taken from --newNames or generated from --teNumbers
    ("TE-NN" for listed state numbers, "Other-NN" for the rest, numbered in
    plain or --sizes coverage order), then saved to ``outputModel``.  If
    --bed is given its intervals are printed to stdout with the fourth
    column replaced by the new name; touching intervals with the same name
    and chromosome are merged unless --noMerge is set.  --changeTrackName
    bypasses all of this and renames one track only.

    argv: full command line, program name first; falls back to sys.argv
          when None.

    Returns 0 after a --changeTrackName run, otherwise None.

    Raises AssertionError when input and output paths are equal, when not
    exactly one of --newNames / --teNumbers is given, or when the number of
    names differs from the number of states in the model.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Rename HMM states.")

    parser.add_argument("inputModel", help="Path of teHMM model created with"
                        " teHmmTrain.py")
    parser.add_argument("outputModel", help="Path of model with renamed states")
    parser.add_argument("--newNames", help="comma-separated list of state names to"
                        " apply. This list must have exactly the same number of"
                        " states as the model. The ith name in the list will be "
                        "assigned to the ith name of the model...", default=None)
    parser.add_argument("--teNumbers", help="comma-separated list of state numbers"
                        " that will be assigned TE states, with everything else"
                        " assigned Other. This is less flexible but maybe more"
                        " convenient at times than --newNames.", default=None)
    parser.add_argument("--bed", help="apply naming to bed file and print "
                        "results to stdout", default=None)
    parser.add_argument("--sizes", help="bedFile to use for computing state numbering"
                        " by using decreasing order in total coverage (only works"
                        " with --teNumbers)", default=None)
    parser.add_argument("--noMerge", help="dont merge adjacent intervals with same"
                        " name with --bed option", action="store_true",
                        default=False)
    parser.add_argument("--changeTrackName", help="dont do anything else, just change"
                        " the name of one track. specified value should be of form"
                        " currentNAme, newName", default=None)

    # parse the argv that was handed in rather than silently using sys.argv
    args = parser.parse_args(argv[1:])
    assert args.inputModel != args.outputModel

    # load model created with teHmmTrain.py
    model = loadModel(args.inputModel)

    # trackChangeName logic hacked in completely separate from everything else
    if args.changeTrackName is not None:
        oldName, newName = args.changeTrackName.split(",")
        track = model.getTrackList().getTrackByName(oldName)
        track.setName(newName)
        saveModel(args.outputModel, model)
        return 0

    # one and only one of the two naming schemes may be used
    assert (args.newNames is None) != (args.teNumbers is None)

    # names manually specified
    if args.newNames is not None:
        names = args.newNames.split(",")

    # names computed using simple scheme from set of "TE" state numbers (as found from
    # log output of fitStateNames.py)
    elif args.teNumbers is not None:
        teNos = set(int(tok) for tok in args.teNumbers.split(","))
        teCount, otherCount = 0, 0
        numStates = model.getEmissionModel().getNumStates()

        # re-order from sizing info
        if args.sizes is not None:
            # accumulate covered length per state number
            sizeMap = defaultdict(int)
            for interval in readBedIntervals(args.sizes, ncol=4):
                sizeMap[int(interval[3])] += interval[2] - interval[1]
            stateNumbers = sorted(xrange(numStates), reverse=True,
                                  key=lambda x: sizeMap[x])
        else:
            stateNumbers = list(xrange(numStates))

        names = [""] * numStates
        for i in stateNumbers:
            if i in teNos:
                names[i] = "TE-%.2d" % teCount
                teCount += 1
            else:
                names[i] = "Other-%.2d" % otherCount
                otherCount += 1
        assert teCount == len(teNos) and teCount + otherCount == len(names)

    assert len(names) == model.getEmissionModel().getNumStates()

    # throw names in the mapping object and stick into model
    catMap = CategoryMap(reserved=0)
    for name in names:
        catMap.getMap(name, update=True)
    model.stateNameMap = catMap

    # save model
    saveModel(args.outputModel, model)

    # process optional bed file
    if args.bed is not None:
        prevInterval = None
        for interval in readBedIntervals(args.bed, ncol=4):
            # fourth column holds the old (numeric) state name
            newInterval = list(interval)
            newInterval[3] = names[int(interval[3])]

            if args.noMerge:
                # write interval as is
                print("\t".join(str(x) for x in newInterval))
                continue

            if prevInterval is None:
                # update prev interval first time
                prevInterval = newInterval
            elif (newInterval[3] == prevInterval[3] and
                  newInterval[0] == prevInterval[0] and
                  newInterval[1] == prevInterval[2]):
                # glue onto prev interval
                prevInterval[2] = newInterval[2]
            else:
                # write and update prev
                print("\t".join(str(x) for x in prevInterval))
                prevInterval = newInterval

        # write whatever is still pending after the last interval
        if prevInterval is not None:
            print("\t".join(str(x) for x in prevInterval))