Пример #1
0
def applyNamesToModel(stateMap, modelPath):
    """Placeholder for renaming the states of a saved HMM model.

    The model at modelPath is loaded and its current state name map is
    fetched, but stateMap is never applied: the function always raises.

    Raises:
        RuntimeError: always ("Not Implemented").
    """
    # the model file is expected to have been written by teHmmTrain.py
    loadingMessage = "loading model %s" % modelPath
    logger.debug(loadingMessage)
    hmm = loadModel(modelPath)
    currentNames = hmm.getStateNameMap()
    raise RuntimeError("Not Implemented")
Пример #2
0
def applyNamesToModel(stateMap, modelPath):
    """Stub: intended to make a stored HMM model use new state names.

    Only the loading half exists.  The model found at modelPath is read
    and asked for its state name map, after which the function raises
    unconditionally; stateMap is not consulted.

    Raises:
        RuntimeError: always ("Not Implemented").
    """
    # model files are expected to come from teHmmTrain.py
    logger.debug("loading model %s" % modelPath)
    loadedModel = loadModel(modelPath)
    existingMap = loadedModel.getStateNameMap()
    raise RuntimeError("Not Implemented")
Пример #3
0
def extractTotalProb(benchDir, benchInputBedPath, args, repSuffix=""):
    """Return the last log probability recorded in a benchmark model.

    The model file is looked up inside benchDir.  Its name is the base
    name of benchInputBedPath with the extension replaced by ".mod",
    followed by repSuffix.  args is accepted but not used here.
    """
    bedName = os.path.basename(benchInputBedPath)
    stem = os.path.splitext(bedName)[0]
    modelFile = os.path.join(benchDir, stem + ".mod" + repSuffix)
    return loadModel(modelFile).getLastLogProb()
Пример #4
0
def extractProbRow(bed, header, args):
    """Get the total prob from the model and the viterbi prob from the bed.

    The model path is derived from the bed path: "predictions" is replaced
    by "models", everything from the last "_" onward is dropped, and ".mod"
    is appended unless the result already contains it.

    Args:
        bed: path of a predictions bed file.  The third whitespace-separated
            token of its first line is parsed as the viterbi log prob.
        header: when true, return the column names instead of the values
            (the model and the bed file are still loaded and checked).
        args: unused.

    Returns:
        ["TotLogProb", "VitLogProb"] if header is true, otherwise
        [totalProb, vitProb].

    Raises:
        AssertionError: if the model file does not exist or the viterbi
            log prob is positive.
        IndexError: if the first bed line has fewer than three tokens.
    """
    # note directory structure hardcoded in line below:
    modPath = bed.replace("predictions", "models")
    # NOTE(review): when there is no "_" rfind gives -1 and this drops the
    # last character instead -- confirm bed names always contain "_"
    modPath = modPath[:modPath.rfind("_")]
    if ".mod" not in modPath:
        modPath += ".mod"
    assert os.path.exists(modPath)
    model = loadModel(modPath)
    totalProb = model.getLastLogProb()
    # only the first line is needed; the context manager guarantees the
    # handle is released on every return path
    with open(bed, "r") as bedFile:
        line0 = bedFile.readline()
    vitProb = float(line0.split()[2])
    assert vitProb <= 0.
    if header:
        return ["TotLogProb", "VitLogProb"]
    return [totalProb, vitProb]
Пример #5
0
def extractProbRow(bed, header, args):
    """Get the total prob from the model and the viterbi prob from the bed.

    The model path is derived from the bed path: "predictions" is replaced
    by "models", everything from the last "_" onward is dropped, and ".mod"
    is appended unless the result already contains it.

    Args:
        bed: path of a predictions bed file.  The third whitespace-separated
            token of its first line is parsed as the viterbi log prob.
        header: when true, return the column names instead of the values
            (the model and the bed file are still loaded and checked).
        args: unused.

    Returns:
        ["TotLogProb", "VitLogProb"] if header is true, otherwise
        [totalProb, vitProb].

    Raises:
        AssertionError: if the model file does not exist or the viterbi
            log prob is positive.
        IndexError: if the first bed line has fewer than three tokens.
    """
    # note directory structure hardcoded in line below:
    modPath = bed.replace("predictions", "models")
    # NOTE(review): when there is no "_" rfind gives -1 and this drops the
    # last character instead -- confirm bed names always contain "_"
    modPath = modPath[:modPath.rfind("_")]
    if ".mod" not in modPath:
        modPath += ".mod"
    assert os.path.exists(modPath)
    model = loadModel(modPath)
    totalProb = model.getLastLogProb()
    # read just the first line and let the context manager close the file,
    # so the handle is never left open whichever branch returns
    with open(bed, "r") as bedFile:
        line0 = bedFile.readline()
    vitProb = float(line0.split()[2])
    assert vitProb <= 0.
    if header:
        return ["TotLogProb", "VitLogProb"]
    return [totalProb, vitProb]
Пример #6
0
def main(argv=None):
    """Write bootstrap transition and emission files from a trained model.

    Parses the command line, loads the model named by inputModel, makes
    sure a state name map exists (building one from the state indices when
    the model has none), then calls writeTransitions and writeEmissions
    with the model, the name map and the parsed options.

    argv is only defaulted to sys.argv; the parser itself is invoked
    without arguments.

    Raises:
        RuntimeError: if --numAdd and --numTotal are both non-zero.
    """
    if argv is None:
        argv = sys.argv

    summary = ("Create starting transition and emission distributions "
               " that be used with teHmmTrain.py using the --initTransProbs and "
               "--initEmProbs options, respectively.  The distributions will be "
               " derived by an already-trained model.  This tool is written to"
               " allow combining supervised and unsupervised training.  IE a "
               " supervised model is created (teHmmTrain.py with --supervised "
               " option).  This tool can then be used to create the necessary "
               " files to bootstrap an unsupervised training run with a subset"
               " of the parameters.")
    parser = argparse.ArgumentParser(
        description=summary,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    addOption = parser.add_argument

    # positional arguments
    addOption("inputModel",
              help="Path of input model to use "
              "for bootstrap parameter creation")
    addOption("outTransProbs",
              help="File to write transition model"
              " to (for use with teHmmTrain.py --initTransProbs and"
              " --forceTransProbs)")
    addOption("outEmProbs",
              help="File to write emission model to"
              " (for use with teHmmTrain.py --initEmProbs and "
              " --forceEmProbs)")

    # optional arguments
    addOption("--ignore", default=None,
              help="comma-separated list of states to ignore from"
              " inputModel")
    addOption("--numAdd", type=int, default=0,
              help="Number of \"unlabeled\" states to add"
              " to the model.")
    addOption("--numTotal", type=int, default=0,
              help="Add unlabeled states such that"
              " output model has given number of states.  If input "
              "model already has a greater number of states then"
              " none added")
    addOption("--stp", type=float, default=0.9,
              help="Self-transition probality assigned to"
              " added states.")
    addOption("--allTrans", action="store_true", default=False,
              help="By default only self-transitions"
              " are written.  Use this option to write entire "
              "transition matrix (excluding ignroed states)")

    addLoggingOptions(parser)
    args = parser.parse_args()
    # the two ways of asking for extra states cannot be combined
    if 0 not in (args.numAdd, args.numTotal):
        raise RuntimeError("--numAdd and --numTotal mutually exclusive")

    # the input is a model written by teHmmTrain.py
    logger.info("loading model %s" % args.inputModel)
    hmm = loadModel(args.inputModel)

    # turn the comma-separated ignore list into a set (empty when absent)
    if args.ignore is None:
        ignored = set()
    else:
        ignored = set(args.ignore.split(","))
    args.ignore = ignored

    # guarantee a name for every state, falling back to the state index
    # (should really be done within hmm...)
    names = hmm.getStateNameMap()
    if names is None:
        names = CategoryMap(reserved=0)
        numStates = hmm.getEmissionModel().getNumStates()
        for stateNo in xrange(numStates):
            names.getMap(str(stateNo), update=True)

    # transitions first, then emissions
    writeTransitions(hmm, names, args)
    writeEmissions(hmm, names, args)
Пример #7
0
def main(argv=None):
    """Create bootstrap parameter files from an already-trained model.

    After parsing the command line the model given as inputModel is
    loaded, a state name map is obtained (one is built from the state
    indices if the model does not carry one), and writeTransitions and
    writeEmissions are each called with the model, the map and the
    parsed options.

    argv is only defaulted to sys.argv; parsing is done by
    parser.parse_args() with no arguments.

    Raises:
        RuntimeError: if --numAdd and --numTotal are both non-zero.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Create starting transition and emission distributions "
        " that be used with teHmmTrain.py using the --initTransProbs and "
        "--initEmProbs options, respectively.  The distributions will be "
        " derived by an already-trained model.  This tool is written to"
        " allow combining supervised and unsupervised training.  IE a "
        " supervised model is created (teHmmTrain.py with --supervised "
        " option).  This tool can then be used to create the necessary "
        " files to bootstrap an unsupervised training run with a subset"
        " of the parameters.")

    # (name, keyword settings) for every argument, in registration order
    optionTable = [
        ("inputModel", {
            "help": "Path of input model to use "
                    "for bootstrap parameter creation"}),
        ("outTransProbs", {
            "help": "File to write transition model"
                    " to (for use with teHmmTrain.py --initTransProbs and"
                    " --forceTransProbs)"}),
        ("outEmProbs", {
            "help": "File to write emission model to"
                    " (for use with teHmmTrain.py --initEmProbs and "
                    " --forceEmProbs)"}),
        ("--ignore", {
            "help": "comma-separated list of states to ignore from"
                    " inputModel",
            "default": None}),
        ("--numAdd", {
            "help": "Number of \"unlabeled\" states to add"
                    " to the model.",
            "default": 0,
            "type": int}),
        ("--numTotal", {
            "help": "Add unlabeled states such that"
                    " output model has given number of states.  If input "
                    "model already has a greater number of states then"
                    " none added",
            "default": 0,
            "type": int}),
        ("--stp", {
            "help": "Self-transition probality assigned to"
                    " added states.",
            "default": 0.9,
            "type": float}),
        ("--allTrans", {
            "help": "By default only self-transitions"
                    " are written.  Use this option to write entire "
                    "transition matrix (excluding ignroed states)",
            "default": False,
            "action": "store_true"}),
    ]
    for optionName, settings in optionTable:
        parser.add_argument(optionName, **settings)

    addLoggingOptions(parser)
    args = parser.parse_args()
    bothGiven = args.numAdd != 0 and args.numTotal != 0
    if bothGiven:
        raise RuntimeError("--numAdd and --numTotal mutually exclusive")

    # the input is a model written by teHmmTrain.py
    logger.info("loading model %s" % args.inputModel)
    trainedModel = loadModel(args.inputModel)

    # comma-separated ignore list becomes a set (empty when not given)
    args.ignore = (set() if args.ignore is None
                   else set(args.ignore.split(",")))

    # every state needs a name; fall back to its index as a string
    # (should really be done within hmm...)
    stateNames = trainedModel.getStateNameMap()
    if stateNames is None:
        stateNames = CategoryMap(reserved=0)
        stateCount = trainedModel.getEmissionModel().getNumStates()
        for index in xrange(stateCount):
            stateNames.getMap(str(index), update=True)

    # write the transition probabilities, then the emission probabilities
    writeTransitions(trainedModel, stateNames, args)
    writeEmissions(trainedModel, stateNames, args)
Пример #8
0
def main(argv=None):
    """Evaluate a data set with a trained model and print its log score.

    Parses the command line, loads the model and the track data restricted
    to the requested intervals, runs the decoder (viterbi, or posterior
    decoding with --maxPost) and prints the summed score.  Depending on the
    options it also writes the decoded states (--bed), posterior and
    emission distributions (--pd / --ed) and a BIC score (--bic).

    When --chroms is given the work is handed to parallelDispatch and the
    function returns 0 right afterwards; otherwise it returns None.

    argv is defaulted to sys.argv and passed on to parallelDispatch; the
    parser itself is invoked without arguments.

    Raises:
        RuntimeError: on incompatible option combinations, when --maxPost
            is used with a CFG model, or when no intervals can be read
            from bedRegions.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Evaluate a given data set with a trained HMM. Display"
        " the log probability of the input data given the model, and "
        "optionally output the most likely sequence of hidden states.")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("inputModel", help="Path of hmm created with"
                        "teHmmTrain.py")
    parser.add_argument("bedRegions", help="Intervals to process")
    parser.add_argument("--bed", help="path of file to write viterbi "
                        "output to (most likely sequence of hidden states)",
                        default=None)
    parser.add_argument("--numThreads", help="Number of threads to use (only"
                        " applies to CFG parser for the moment)",
                        type=int, default=1)
    parser.add_argument("--slice", help="Make sure that regions are sliced"
                        " to a maximum length of the given value.  Most "
                        "useful when model is a CFG to keep memory down. "
                        "When 0, no slicing is done",
                        type=int, default=0)
    parser.add_argument("--segment", help="Use the intervals in bedRegions"
                        " as segments which each count as a single column"
                        " for evaluattion.  Note the model should have been"
                        " trained with the --segment option pointing to this"
                        " same bed file.", action="store_true", default=False)
    parser.add_argument("--segLen", help="Effective segment length used for"
                        " normalizing input segments (specifying 0 means no"
                        " normalization applied)", type=int, default=0)
    parser.add_argument("--maxPost", help="Use maximum posterior decoding instead"
                        " of Viterbi for evaluation", action="store_true",
                        default=False)
    parser.add_argument("--pd", help="Output BED file for posterior distribution. Must"
                        " be used in conjunction with --pdStates (View on the "
                        "browser via bedGraphToBigWig)", default=None)
    parser.add_argument("--pdStates", help="comma-separated list of state names to use"
                        " for computing posterior distribution.  For example: "
                        " --pdStates inside,LTR_left,LTR_right will compute the probability"
                        ", for each observation, that the hidden state is inside OR LTR_left"
                        " OR LTR_right.  Must be used with --pd to specify output "
                        "file.", default=None)
    parser.add_argument("--bic", help="save Bayesian Information Criterion (BIC) score"
                        " in given file", default=None)
    parser.add_argument("--ed", help="Output BED file for emission distribution. Must"
                        " be used in conjunction with --edStates (View on the "
                        "browser via bedGraphToBigWig)", default=None)
    parser.add_argument("--edStates", help="comma-separated list of state names to use"
                        " for computing emission distribution.  For example: "
                        " --edStates inside,LTR_left for each obsercation the probability "
                        " that inside emitted that observaiton plus the probabillity that"
                        " LTR_left emitted it. If more than one state is selected, this "
                        " is not a distribution, but a sum of distributions (and values"
                        " can exceed 1).  Mostly for debugging purposes. Note output in LOG",
                         default=None)
    parser.add_argument("--chroms", help="list of chromosomes, or regions, to run in parallel"
                        " (in BED format).  input regions will be intersected with each line"
                        " in this file, and the result will correspsond to an individual job",
                        default=None)
    parser.add_argument("--proc", help="number of processes (use in conjunction with --chroms)",
                        type=int, default=1)
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # validate option combinations before doing any expensive work
    if args.slice <= 0:
        # a non-positive slice length means "do not slice"
        args.slice = sys.maxint
    elif args.segment is True:
        raise RuntimeError("--slice and --segment options are not compatible at "
                           "this time")
    if (args.pd is not None) ^ (args.pdStates is not None):
        raise RuntimeError("--pd requires --pdStates and vice versa")
    if (args.ed is not None) ^ (args.edStates is not None):
        raise RuntimeError("--ed requires --edStates and vice versa")
    if args.bed is None and (args.pd is not None or args.ed is not None):
        raise RuntimeError("Both --ed and --pd only usable in conjunction with"
                           " --bed")

    if args.chroms is not None:
        # hack to allow chroms argument to chunk and rerun
        parallelDispatch(argv, args)
        cleanBedTool(tempBedToolPath)
        return 0

    # load model created with teHmmTrain.py
    logger.info("loading model %s" % args.inputModel)
    model = loadModel(args.inputModel)

    if isinstance(model, MultitrackCfg):
        if args.maxPost is True:
            raise RuntimeError("--maxPost not supported on CFG models")

    # apply the effective segment length
    if args.segLen > 0:
        assert args.segment is True
        model.getEmissionModel().effectiveSegmentLength = args.segLen

    # read intervals from the bed file
    logger.info("loading target intervals from %s" % args.bedRegions)
    mergedIntervals = getMergedBedIntervals(args.bedRegions, ncol=4)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.bedRegions)

    # slice if desired
    choppedIntervals = [x for x in slicedIntervals(mergedIntervals, args.slice)]

    # read segment intervals
    segIntervals = None
    if args.segment is True:
        logger.info("loading segment intervals from %s" % args.bedRegions)
        segIntervals = readBedIntervals(args.bedRegions, sort=True)

    # load the input
    # read the tracks, while intersecting them with the given interval
    trackData = TrackData()
    # note we pass in the trackList that was saved as part of the model
    # because we do not want to generate a new one.
    logger.info("loading tracks %s" % args.tracksInfo)
    trackData.loadTrackData(args.tracksInfo, choppedIntervals,
                            model.getTrackList(),
                            segmentIntervals=segIntervals)

    # do the viterbi algorithm
    if isinstance(model, MultitrackHmm):
        algname = "viterbi"
        if args.maxPost is True:
            algname = "posterior decoding"
        logger.info("running %s algorithm" % algname)
    elif isinstance(model, MultitrackCfg):
        logger.info("running CYK algorithm")

    vitOutFile = None
    if args.bed is not None:
        vitOutFile = open(args.bed, "w")
    totalScore = 0
    totalDatapoints = 0

    # Note: in general there's room to save on memory by only computing single
    # track table at once (just need to add table by table interface to hmm...)

    # one placeholder per track table so posteriors[i] / emProbs[i] are
    # valid even when the distributions were not requested
    posteriors = [None] * trackData.getNumTrackTables()
    posteriorsFile = None
    posteriorsMask = None
    if args.pd is not None:
        posteriors = model.posteriorDistribution(trackData)
        posteriorsFile = open(args.pd, "w")
        posteriorsMask = getPosteriorsMask(args.pdStates, model)
        assert len(posteriors[0][0]) == len(posteriorsMask)
    emProbs = [None] * trackData.getNumTrackTables()
    emissionsFile = None
    emissionsMask = None
    if args.ed is not None:
        emProbs = model.emissionDistribution(trackData)
        emissionsFile = open(args.ed, "w")
        emissionsMask = getPosteriorsMask(args.edStates, model)
        assert len(emProbs[0][0]) == len(emissionsMask)

    decodeFunction = model.viterbi
    if args.maxPost is True:
        decodeFunction = model.posteriorDecode

    for i, (vitLogProb, vitStates) in enumerate(decodeFunction(trackData,
                                                numThreads=args.numThreads)):
        totalScore += vitLogProb
        # the decoder is assumed to yield one result per track table, in
        # order (posteriors and emProbs are indexed the same way)
        trackTable = trackData.getTrackTableList()[i]
        # count every observation, not only those written out, so that
        # --bic has a non-zero N even when --bed is not given
        totalDatapoints += len(vitStates) * trackTable.getNumTracks()
        if args.bed is not None or args.pd is not None:
            if args.bed is not None:
                vitOutFile.write("#Viterbi Score: %f\n" % (vitLogProb))
            statesToBed(trackTable,
                        vitStates, vitOutFile, posteriors[i], posteriorsMask,
                        posteriorsFile, emProbs[i], emissionsMask, emissionsFile)

    print("Viterbi (log) score: %f" % totalScore)
    if isinstance(model, MultitrackHmm) and model.current_iteration is not None:
        print("Number of EM iterations: %d" % model.current_iteration)
    if args.bed is not None:
        vitOutFile.close()
    if posteriorsFile is not None:
        posteriorsFile.close()
    if emissionsFile is not None:
        emissionsFile.close()

    if args.bic is not None:
        # http://en.wikipedia.org/wiki/Bayesian_information_criterion
        lnL = float(totalScore)
        try:
            k = float(model.getNumFreeParameters())
        except Exception:
            # numFreeParameters still not done for semi-supervised
            # just pass through a 0 instead of crashing for now
            k = 0.0
        n = float(totalDatapoints)
        bic = -2.0 * lnL + k * (np.log(n) + np.log(2 * np.pi))
        with open(args.bic, "w") as bicFile:
            bicFile.write("%f\n" % bic)
            bicFile.write("# = -2.0 * lnL + k * (lnN + ln(2 * np.pi))\n"
                          "# where lnL=%f  k=%d (%d states)  N=%d (%d obs * %d tracks)  lnN=%f\n" % (
                lnL, int(k), model.getEmissionModel().getNumStates(), int(totalDatapoints),
                totalDatapoints / model.getEmissionModel().getNumTracks(),
                model.getEmissionModel().getNumTracks(), np.log(n)))

    cleanBedTool(tempBedToolPath)
Пример #9
0
def main(argv=None):
    """Rename the states of a saved model and optionally relabel a bed file.

    The new names come either from --newNames (used as given) or from
    --teNumbers, where listed state numbers become "TE-NN" and all others
    "Other-NN", numbered in state order or, with --sizes, in decreasing
    order of total coverage in that bed file.  The renamed model is saved
    to outputModel.  With --bed the fourth column of each interval is
    replaced by the new name and the result printed, gluing together
    touching intervals of the same name unless --noMerge is set.

    --changeTrackName short-circuits all of the above: one track is
    renamed, the model is saved and 0 is returned.  Otherwise the function
    returns None.

    argv is only defaulted to sys.argv; the parser is invoked without it.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        description="Rename HMM states.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    addOption = parser.add_argument
    addOption("inputModel",
              help="Path of teHMM model created with"
              " teHmmTrain.py")
    addOption("outputModel",
              help="Path of model with renamed states")
    addOption("--newNames", default=None,
              help="comma-separated list of state names to"
              " apply.  This list must have exactly the same number of"
              " states as the model.  The ith name in the list will be "
              "assigned to the ith name of the model...")
    addOption("--teNumbers", default=None,
              help="comma-separated list of state numbers"
              " that will be assigned TE states, with everything else"
              " assigned Other.  This is less flexible but maybe more"
              " convenient at times than --newNames.")
    addOption("--bed", default=None,
              help="apply naming to bed file and print "
              "results to stdout")
    addOption("--sizes", default=None,
              help="bedFile to use for computing state numbering"
              " by using decreasing order in total coverage (only works"
              " with --teNumbers)")
    addOption("--noMerge", default=False, action="store_true",
              help="dont merge adjacent intervals with same"
              " name with --bed option")
    addOption("--changeTrackName", default=None,
              help="dont do anything else, just change"
              " the name of one track.  specified value should be of form"
              " currentNAme, newName")

    args = parser.parse_args()
    assert args.inputModel != args.outputModel

    # the input is a model written by teHmmTrain.py
    model = loadModel(args.inputModel)

    # track renaming is a separate mode that skips everything else
    if args.changeTrackName is not None:
        oldName, newName = args.changeTrackName.split(",")
        model.getTrackList().getTrackByName(oldName).setName(newName)
        saveModel(args.outputModel, model)
        return 0

    # exactly one naming scheme has to be chosen
    assert (args.newNames is None) != (args.teNumbers is None)

    if args.newNames is not None:
        # names manually specified
        names = args.newNames.split(",")
    elif args.teNumbers is not None:
        # names derived from the set of "TE" state numbers (as found from
        # log output of fitStateNames.py)
        teNos = set(int(token) for token in args.teNumbers.split(","))
        numStates = model.getEmissionModel().getNumStates()

        numberingOrder = list(xrange(numStates))
        if args.sizes is not None:
            # number the states by decreasing total coverage
            coverage = defaultdict(int)
            for interval in readBedIntervals(args.sizes, ncol=4):
                coverage[int(interval[3])] += interval[2] - interval[1]
            numberingOrder.sort(key=lambda state: coverage[state],
                                reverse=True)

        # separate running counters for the two name families
        counts = {"TE": 0, "Other": 0}
        names = [""] * numStates
        for state in numberingOrder:
            family = "TE" if state in teNos else "Other"
            names[state] = "%s-%.2d" % (family, counts[family])
            counts[family] += 1
        assert (counts["TE"] == len(teNos) and
                counts["TE"] + counts["Other"] == len(names))

    assert len(names) == model.getEmissionModel().getNumStates()

    # register the names, in order, in a fresh map and attach it to the model
    catMap = CategoryMap(reserved=0)
    for name in names:
        catMap.getMap(name, update=True)
    model.stateNameMap = catMap

    saveModel(args.outputModel, model)

    # the bed file is optional
    if args.bed is None:
        return

    pending = None  # interval held back in case the next one extends it
    for interval in readBedIntervals(args.bed, ncol=4):
        renamed = list(interval)
        renamed[3] = names[int(interval[3])]
        if args.noMerge:
            # every interval is written as is
            print("\t".join(str(x) for x in renamed))
            continue
        if pending is None:
            # first interval: nothing to compare against yet
            pending = renamed
        elif (renamed[3] == pending[3] and
              renamed[0] == pending[0] and
              renamed[1] == pending[2]):
            # same name, same sequence, touching: extend the held interval
            pending[2] = renamed[2]
        else:
            # flush the held interval and start a new one
            print("\t".join(str(x) for x in pending))
            pending = renamed
    if pending is not None:
        print("\t".join(str(x) for x in pending))
Пример #10
0
def main(argv=None):
    """Print a saved model and optionally write plots of its parameters.

    The model is always printed.  --nameMap additionally prints each
    track's value maps.  --ec and --ecn call writeEmissionClusters (with
    False and True as the last argument respectively), --pca calls
    writeEmissionScatters, --hm calls writeEmissionHeatMap and --t calls
    writeTransitionGraph.

    argv is only defaulted to sys.argv; the parser is invoked without it.

    Raises:
        RuntimeError: if --ec, --ecn, --pca or --hm is requested while
            canPlot is False.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        description="Print out paramaters of a teHMM",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    addOption = parser.add_argument

    addOption("inputModel", help="Path of teHMM model created with"
              " teHmmTrain.py")
    addOption("--nameMap", action="store_true", default=False,
              help="Print out name map tables")
    addOption("--ec", default=None,
              help="Print emission distribution clusterings"
              " to given file in PDF format")
    addOption("--ecn", default=None,
              help="Like --ec option but only print non"
              " numeric tracks")
    addOption("--pca", default=None,
              help="Print emission pca scatters"
              " to given file in PDF format")
    addOption("--hm", default=None,
              help="Print heatmap of emission distribution means"
              " for (only) numeric tracks")
    addOption("--t", default=None,
              help="Print transition matrix to given"
              " file in GRAPHVIZ DOT format.  Convert to PDF with "
              " dot <file> -Tpdf > <outFile>")
    addOption("--minTP", type=float, default=EPSILON,
              help="Minimum tranisition probability "
              "to include in transition matrix output from --t option.")
    addOption("--minTPns", type=float, default=EPSILON,
              help="Minimum transition probability after "
              "self transition is normalized out (ie after dividing by 1-self)")
    addOption("--teStates", default=None,
              help="comma-separated list of state names"
              " to consider TE-1, TE-2, ... etc")

    args = parser.parse_args()

    # the input is a model written by teHmmTrain.py
    model = loadModel(args.inputModel)

    # comma-separated state names become a set
    if args.teStates is not None:
        args.teStates = set(args.teStates.split(","))

    def requirePlotting():
        # every plot option shares the same precondition and message
        if canPlot is False:
            raise RuntimeError("Unable to write plots.  Maybe matplotlib is "
                               "not installed?")

    # crappy print method
    print(model)

    if args.nameMap is True:
        print("State Maps:")
        tracks = model.trackList
        if tracks is None:
            print("TrackList: None")
        else:
            for track in tracks:
                print("Track: %s" % track.getName())
                print(" map %s " % track.getValueMap().catMap)
                print(" pam %s " % track.getValueMap().catMapBack)

    if args.ec is not None:
        requirePlotting()
        writeEmissionClusters(model, args, False)

    if args.ecn is not None:
        requirePlotting()
        writeEmissionClusters(model, args, True)

    if args.pca is not None:
        requirePlotting()
        writeEmissionScatters(model, args)

    if args.hm is not None:
        requirePlotting()
        writeEmissionHeatMap(model, args)

    if args.t is not None:
        writeTransitionGraph(model, args)
Пример #11
0
def main(argv=None):
    """Give the states of a saved model new names.

    Names are taken verbatim from --newNames, or generated from
    --teNumbers: the listed state numbers are named "TE-NN" and the rest
    "Other-NN", counted in state order or, when --sizes is given, in
    decreasing order of the total interval length per state in that bed
    file.  The model with the new name map is saved to outputModel.  If
    --bed is given its intervals are printed with the fourth column
    renamed; touching intervals with the same name and first column are
    merged unless --noMerge is set.

    With --changeTrackName only a single track is renamed, the model is
    saved and 0 is returned; in every other case the return value is None.

    argv is only defaulted to sys.argv; the parser is invoked without it.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        description="Rename HMM states.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    register = parser.add_argument
    register("inputModel", help="Path of teHMM model created with"
             " teHmmTrain.py")
    register("outputModel", help="Path of model with renamed states")
    register("--newNames", default=None,
             help="comma-separated list of state names to"
             " apply.  This list must have exactly the same number of"
             " states as the model.  The ith name in the list will be "
             "assigned to the ith name of the model...")
    register("--teNumbers", default=None,
             help="comma-separated list of state numbers"
             " that will be assigned TE states, with everything else"
             " assigned Other.  This is less flexible but maybe more"
             " convenient at times than --newNames.")
    register("--bed", default=None,
             help="apply naming to bed file and print "
             "results to stdout")
    register("--sizes", default=None,
             help="bedFile to use for computing state numbering"
             " by using decreasing order in total coverage (only works"
             " with --teNumbers)")
    register("--noMerge", action="store_true", default=False,
             help="dont merge adjacent intervals with same"
             " name with --bed option")
    register("--changeTrackName", default=None,
             help="dont do anything else, just change"
             " the name of one track.  specified value should be of form"
             " currentNAme, newName")

    args = parser.parse_args()
    assert args.inputModel != args.outputModel

    # the input is a model written by teHmmTrain.py
    model = loadModel(args.inputModel)

    # renaming a track is handled on its own, apart from state renaming
    if args.changeTrackName is not None:
        oldName, newName = args.changeTrackName.split(",")
        renamedTrack = model.getTrackList().getTrackByName(oldName)
        renamedTrack.setName(newName)
        saveModel(args.outputModel, model)
        return 0

    # one, and only one, of the two naming options must be present
    assert (args.newNames is None) != (args.teNumbers is None)

    if args.newNames is not None:
        # names manually specified
        names = args.newNames.split(",")
    elif args.teNumbers is not None:
        # names computed using simple scheme from set of "TE" state numbers
        # (as found from log output of fitStateNames.py)
        teNos = set(int(number) for number in args.teNumbers.split(","))
        teCount = 0
        otherCount = 0
        numStates = model.getEmissionModel().getNumStates()

        order = list(xrange(numStates))
        if args.sizes is not None:
            # re-order from sizing info: largest total length first
            totalLength = defaultdict(int)
            for sizedInterval in readBedIntervals(args.sizes, ncol=4):
                state = int(sizedInterval[3])
                totalLength[state] += sizedInterval[2] - sizedInterval[1]
            order.sort(reverse=True, key=lambda s: totalLength[s])

        names = [""] * numStates
        for state in order:
            if state in teNos:
                names[state] = "TE-%.2d" % teCount
                teCount += 1
            else:
                names[state] = "Other-%.2d" % otherCount
                otherCount += 1
        assert teCount == len(teNos) and teCount + otherCount == len(names)

    assert len(names) == model.getEmissionModel().getNumStates()

    # build a fresh name map, in state order, and attach it to the model
    catMap = CategoryMap(reserved=0)
    for stateName in names:
        catMap.getMap(stateName, update=True)
    model.stateNameMap = catMap

    saveModel(args.outputModel, model)

    # nothing more to do without a bed file
    if args.bed is None:
        return

    held = None  # last interval not yet printed (merge mode only)
    for interval in readBedIntervals(args.bed, ncol=4):
        current = list(interval)
        current[3] = names[int(interval[3])]
        if args.noMerge:
            # no merging: print straight away
            print("\t".join(str(x) for x in current))
        elif held is None:
            # first interval seen
            held = current
        elif (current[3] == held[3] and
              current[0] == held[0] and
              current[1] == held[2]):
            # continues the held interval: just move its end
            held[2] = current[2]
        else:
            # different name or a gap: print the held one, hold this one
            print("\t".join(str(x) for x in held))
            held = current
    if held is not None:
        print("\t".join(str(x) for x in held))