def writeScaledTrack(trackData, track, args):
    """ Go base-by-base, writing the unscaled value to the output """
    fname, fext = os.path.splitext(os.path.basename(track.getPath()))
    outBed = os.path.join(args.outputDir, fname + "_scale" + ".bed")
    outBigWig = os.path.join(args.outputDir, fname + "_scale" + ".bw")
    outFile = open(outBed, "w")
    trackNo = track.getNumber()
    valMap = track.getValueMap()
    for trackTable in trackData.getTrackTableList():
        chrom = trackTable.getChrom()
        start = trackTable.getStart()
        for i in xrange(len(trackTable)):
            binnedVal = trackTable[i][trackNo]
            unbinnedVal = valMap.getMapBack(binnedVal)
            outFile.write("%s\t%d\t%d\t%f\n" % (
                chrom, start + i, start + i + 1, unbinnedVal))
    outFile.close()

    # make a .bw copy
    try:
        runShellCommand("bedGraphToBigWig %s %s %s" % (outBed, args.chromSizes,
                                                       outBigWig))
    except:
        logger.warning("Unable to make bigwig from %s" % outBed)
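# Hedged sketch (not part of the original module): getMapBack() above inverts
# whatever binning was applied when the track was loaded.  Assuming a simple
# linear scale/shift binning, the round trip would look roughly like the
# helper below; the real mapping lives in the track library and may use log
# scaling instead.
def _exampleMapBack(binnedVal, scale=1.0, shift=0.0):
    # invert val -> (val + shift) * scale
    return float(binnedVal) / scale - shift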
def extractScore(benchDir, benchInputBedPath, args, repSuffix=""):
    """ Reduce entire benchmark output into a single score value """
    compPath = os.path.join(benchDir,
                            os.path.splitext(
                                os.path.basename(benchInputBedPath))[0] +
                            "_comp.txt" + repSuffix)
    baseStats, intStats, weightedStats = extractCompStatsFromFile(compPath)
    stats = intStats
    if args.base is True:
        stats = baseStats
    f1List = []
    for state in args.states.split(","):
        if state not in stats:
            logger.warning("State %s not found in stats %s; giving 0" % (
                state, str(stats)))
            f1List.append(0)
            continue
        prec = stats[state][0]
        rec = stats[state][1] * args.recallSkew
        f1 = 0
        if prec + rec > 0:
            f1 = 2. * ((prec * rec) / (prec + rec))
        if args.score == "prec":
            f1List.append(prec)
        elif args.score == "rec":
            f1List.append(rec)
        else:
            f1List.append(f1)
    avgF1 = np.mean(f1List)
    return avgF1
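# Worked example with hypothetical numbers: for precision 0.8 and recall 0.5,
# the per-state score above is the standard F1 when --recallSkew is 1:
#   f1 = 2 * (0.8 * 0.5) / (0.8 + 0.5) ~= 0.615
# With --recallSkew 2, recall is first doubled to 1.0, giving
#   f1 = 2 * (0.8 * 1.0) / (0.8 + 1.0) ~= 0.889
# so skews > 1 reward high-recall track sets when ranking.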
def getPosteriorsMask(pdStates, hmm):
    """ returns array mask where mask[i] == 1 iff state i is part of our
    desired posterior distribution """
    stateMap = hmm.getStateNameMap()
    if stateMap is None:
        stateMap = CategoryMap(reserved=0)
        for i in xrange(hmm.getEmissionModel().getNumStates()):
            stateMap.update(str(i))
    mask = np.zeros((len(stateMap)), dtype=np.int8)
    for state in pdStates.split(","):
        if not stateMap.has(state):
            logger.warning("Posterior (or Emission) Distribution state %s"
                           " not found in model" % state)
        else:
            stateNumber = stateMap.getMap(state)
            mask[stateNumber] = 1
    return mask
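# Minimal usage sketch (state names are hypothetical): for a model whose
# states include "LTR" and "inside", a mask selecting just those two states
# would be built as
#   mask = getPosteriorsMask("LTR,inside", hmm)
# after which mask[hmm.getStateNameMap().getMap("LTR")] == 1 and all other
# entries are 0.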
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Produce a bed file of genome segments which are atomic"
        " elements with respect to the hmm, ie each segment emits a single"
        " state. Mask tracks always cut. "
        "Output intervals are assigned name 0 1 0 1 etc.")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("allBed", help="Bed file spanning entire genome")
    parser.add_argument("outBed", help="Output segments")
    parser.add_argument("--thresh", help="Number of tracks that can change "
                        "before a new segment is formed. Increasing this value"
                        " increases the expected lengths of output segments",
                        type=int, default=1)
    parser.add_argument("--cutTracks", help="Create a new segment if something"
                        " changes in one of these tracks (as specified by "
                        "comma-separated list), overriding --thresh options"
                        " if necessary. For example, --cutTracks tsd,chaux"
                        " would invoke a new segment every time the value at"
                        " either of these tracks changed", default=None)
    parser.add_argument("--cutUnscaled", help="Cut on all unscaled (used as "
                        "a proxy for non-numeric) tracks", default=False,
                        action="store_true")
    parser.add_argument("--cutMultinomial", help="Cut non-gaussian, non-binary"
                        " tracks every time", default=False,
                        action="store_true")
    parser.add_argument("--cutNonGaussian", help="Cut all but gaussian tracks",
                        default=False, action="store_true")
    parser.add_argument("--comp", help="Strategy for comparing columns for the"
                        " threshold cutoff. Options are [first, prev], where"
                        " first compares with first column of segment and "
                        "prev compares with column immediately left",
                        default="first")
    parser.add_argument("--ignore", help="Comma-separated list of tracks to "
                        "ignore (the FASTA DNA sequence would be a good "
                        "candidate)", default="sequence")
    parser.add_argument("--maxLen", help="Maximum length of a segment (<= 0"
                        " means no max length applied)", type=int, default=0)
    parser.add_argument("--fixLen", help="Just make segments of specified"
                        " fixed length, ignoring other parameters and logic"
                        " (<= 0 means no fixed length applied)",
                        type=int, default=0)
    parser.add_argument("--stats", help="Write some statistics to specified "
                        "file. Of the form <trackName> <Diff> <DiffPct>, "
                        "where <Diff> is the number of times a track differs"
                        " between two consecutive segments, and <DiffPct> "
                        "is the average percentage of all such differences "
                        "accounted for by the track", default=None)
    parser.add_argument("--delMask", help="Entirely remove intervals from "
                        "mask tracks that are > given length (otherwise "
                        "they would just be ignored by HMM tools). The"
                        " difference here is that removed intervals will"
                        " break contiguity.", type=int, default=None)
    parser.add_argument("--chroms", help="List of chromosomes, or regions, to"
                        " run in parallel (in BED format). Input regions will"
                        " be intersected with each line in this file, and the"
                        " result will correspond to an individual job",
                        default=None)
    parser.add_argument("--proc", help="Number of processes (use in"
                        " conjunction with --chroms)", type=int, default=1)
    parser.add_argument("--co", help="Count offset for segment labels. "
                        "Only used internally", type=int, default=0)
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    if args.comp != "first" and args.comp != "prev":
        raise RuntimeError("--comp must be either first or prev")

    if args.chroms is not None:
        # hack to allow chroms argument to chunk and rerun
        parallelDispatch(argv, args)
        cleanBedTool(tempBedToolPath)
        return 0

    # read query intervals from the bed file
    tempFiles = []
    if args.delMask is not None:
        cutBed = cutOutMaskIntervals(args.allBed, args.delMask, sys.maxint,
                                     args.tracksInfo)
        if cutBed is not None:
            tempFiles.append(cutBed)
            args.allBed = cutBed
    logger.info("loading segment region intervals from %s" % args.allBed)
    mergedIntervals = getMergedBedIntervals(args.allBed, ncol=4)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.allBed)

    # read the tracks, while intersecting them with the query intervals
    logger.info("loading tracks %s" % args.tracksInfo)
    trackData = TrackData()
    trackData.loadTrackData(args.tracksInfo, mergedIntervals,
                            treatMaskAsBinary=True)

    # process the --cutTracks option
    trackList = trackData.getTrackList()
    cutList = np.zeros((len(trackList)), np.int)
    if args.cutTracks is not None:
        cutNames = args.cutTracks.split(",")
        for name in cutNames:
            track = trackList.getTrackByName(name)
            if track is None:
                raise RuntimeError("cutTrack %s not found" % name)
            trackNo = track.getNumber()
            assert trackNo < len(cutList)
            cutList[trackNo] = 1
    args.cutList = cutList

    # make sure mask tracks count as cut tracks
    for track in trackList:
        if track.getDist() == 'mask':
            args.cutList[track.getNumber()] = 1

    # process the --ignore option
    ignoreList = np.zeros((len(trackList)), np.int)
    if args.ignore is not None:
        ignoreNames = args.ignore.split(",")
        for name in ignoreNames:
            track = trackList.getTrackByName(name)
            if track is None:
                if name != "sequence":
                    logger.warning("ignore track %s not found" % name)
                continue
            trackNo = track.getNumber()
            assert trackNo < len(ignoreList)
            ignoreList[trackNo] = 1
            if args.cutList[trackNo] == 1:
                raise RuntimeError("Same track (%s) can't be cut and ignored" %
                                   name)
    args.ignoreList = ignoreList

    # process the --cutUnscaled option
    if args.cutUnscaled is True:
        for track in trackList:
            trackNo = track.getNumber()
            if track.scale is None and track.shift is None and\
              track.logScale is None and\
              args.ignoreList[trackNo] == 0:
                assert trackNo < len(cutList)
                cutList[trackNo] = 1

    # process the --cutMultinomial option
    if args.cutMultinomial is True:
        for track in trackList:
            trackNo = track.getNumber()
            if track.dist == "multinomial" and\
              args.ignoreList[trackNo] == 0:
                assert trackNo < len(cutList)
                cutList[trackNo] = 1

    # process the --cutNonGaussian option
    if args.cutNonGaussian is True:
        for track in trackList:
            trackNo = track.getNumber()
            if track.dist != "gaussian" and\
              args.ignoreList[trackNo] == 0:
                assert trackNo < len(cutList)
                cutList[trackNo] = 1

    # segment the tracks
    stats = dict()
    segmentTracks(trackData, args, stats)
    writeStats(trackData, args, stats)

    if len(tempFiles) > 0:
        runShellCommand("rm -f %s" % " ".join(tempFiles))
    cleanBedTool(tempBedToolPath)
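# Example invocation (paths are hypothetical), cutting on every unscaled
# track and comparing each column against the first column of the open
# segment:
#   segmentTracks.py tracks.xml genome_regions.bed segments.bed \
#       --comp first --thresh 1 --cutUnscaled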
def main(argv=None):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Helper script to rank a list of tracks based on how well"
        " they improve some measure of HMM accuracy, by wrapping "
        "teHmmBenchmark.py")

    parser.add_argument("tracks", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("training", help="BED of training regions passed to"
                        " teHmmTrain.py")
    parser.add_argument("truth", help="BED truth used for scoring")
    parser.add_argument("states", help="States (in truth) to use for"
                        " average F1 score (comma-separated)")
    parser.add_argument("outDir", help="Directory to place all results")
    parser.add_argument("--benchOpts", help="Options to pass to "
                        "teHmmBenchmark.py (wrap in double quotes)",
                        default="")
    parser.add_argument("--startTracks", help="Comma-separated list of "
                        "tracks to start off with", default=None)
    parser.add_argument("--segOpts", help="Options to pass to "
                        "segmentTracks.py (wrap in double quotes)",
                        default="--comp first --thresh 1 --cutUnscaled")
    parser.add_argument("--fullSegment", help="Only use segmentation"
                        " based on entire track list for each iteration"
                        " rather than compute segmentation each time (as"
                        " done by default)", action="store_true",
                        default=False)
    parser.add_argument("--bic", help="Rank by BIC instead of score"
                        " (both always present in output table though)",
                        action="store_true", default=False)
    parser.add_argument("--base", help="Use base-level F1 instead of "
                        "interval-level", default=False, action="store_true")
    parser.add_argument("--naive", help="Rank by \"naive\" score",
                        action="store_true", default=False)
    parser.add_argument("--doNaive", help="Compute naive stats. Will be "
                        "turned on by default if --naive is used",
                        default=False, action="store_true")
    parser.add_argument("--segTracks", help="Tracks XML to use for"
                        " segmentation (by default will be same as tracks)",
                        default=None)
    parser.add_argument("--recallSkew", help="When computing F1, multiply"
                        " recall by this number (hack to favour larger"
                        " recall)", default=1., type=float)
    parser.add_argument("--score", help="Accuracy score to use from "
                        "{f1, prec, rec}", default="f1")
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    # make sure no no-no options in benchOpts
    if "--eval" in args.benchOpts or "--truth" in args.benchOpts:
        raise RuntimeError("--eval and --truth cannot be passed through to "
                           "teHmmBenchmark.py as they are generated from "
                           "<training> and <truth> args from this script")

    # don't want to keep track of extra logic required for not segmenting
    if "--segment" not in args.benchOpts:
        args.benchOpts += " --segment"
        logger.warning("Adding --segment to teHmmBenchmark.py options")

    if args.bic is True and args.naive is True:
        raise RuntimeError("--bic and --naive are mutually incompatible")
    if args.naive is True:
        args.doNaive = True
    if args.segTracks is None:
        args.segTracks = args.tracks

    if not os.path.exists(args.outDir):
        os.makedirs(args.outDir)

    greedyRank(args)
def runTrial(tracksList, iteration, newTrackName, args):
    """ Compute a score for a given set of tracks using teHmmBenchmark.py """
    benchDir = os.path.join(args.outDir, "iter%d" % iteration)
    benchDir = os.path.join(benchDir, "%s_bench" % newTrackName)
    if not os.path.exists(benchDir):
        os.makedirs(benchDir)

    trainingPath = args.training
    truthPath = args.truth

    tracksPath = os.path.join(benchDir, "tracks.xml")
    tracksList.saveXML(tracksPath)

    segLogPath = os.path.join(benchDir, "segment_cmd.txt")
    segLog = open(segLogPath, "w")

    if args.segTracks == args.tracks:
        segTracksPath = tracksPath
    else:
        # pull out desired tracks from segment tracks XML if specified
        segTracksIn = TrackList(args.segTracks)
        segTracks = TrackList()
        for track in tracksList:
            segTrack = segTracksIn.getTrackByName(track.getName())
            if segTrack is not None:
                segTracks.addTrack(segTrack)
            else:
                logger.warning("track %s not found in segment tracks %s" % (
                    track.getName(), args.segTracks))
        segTracksPath = os.path.join(benchDir, "seg_tracks.xml")
        segTracks.saveXML(segTracksPath)

    # segment training
    segTrainingPath = os.path.join(benchDir,
                                   os.path.splitext(
                                       os.path.basename(trainingPath))[0] +
                                   "_trainSeg.bed")
    segmentCmd = "segmentTracks.py %s %s %s %s" % (segTracksPath,
                                                   trainingPath,
                                                   segTrainingPath,
                                                   args.segOpts)
    if args.fullSegment is False:
        runShellCommand(segmentCmd)
        segLog.write(segmentCmd + "\n")
    else:
        runShellCommand("ln -f -s %s %s" % (args.fullSegTrainPath,
                                            segTrainingPath))

    # segment eval
    segEvalPath = os.path.join(benchDir,
                               os.path.splitext(
                                   os.path.basename(truthPath))[0] +
                               "_evalSeg.bed")
    segmentCmd = "segmentTracks.py %s %s %s %s" % (segTracksPath, truthPath,
                                                   segEvalPath, args.segOpts)
    if trainingPath == truthPath:
        segmentCmd = "ln -f -s %s %s" % (os.path.abspath(segTrainingPath),
                                         segEvalPath)
    if args.fullSegment is False:
        runShellCommand(segmentCmd)
        segLog.write(segmentCmd + "\n")
    else:
        runShellCommand("ln -f -s %s %s" % (args.fullSegEvalPath,
                                            segEvalPath))

    segLog.close()

    segPathOpts = " --eval %s --truth %s" % (segEvalPath, truthPath)

    benchCmd = "teHmmBenchmark.py %s %s %s %s" % (tracksPath, benchDir,
                                                  segTrainingPath,
                                                  args.benchOpts + segPathOpts)
    runShellCommand(benchCmd)

    score = extractScore(benchDir, segTrainingPath, args)
    bic = extractBIC(benchDir, segTrainingPath, args)
    naive = 0
    if args.doNaive is True:
        naive = extractNaive(tracksPath, benchDir, segTrainingPath, args)
    slope, rsq = extractF1ProbSlope(benchDir, segTrainingPath, args)

    # clean up big files?

    return score, bic, naive, slope, rsq
def greedyRank(args):
    """ Iteratively add the best track to an (initially empty) tracklist
    according to some metric """
    inputTrackList = TrackList(args.tracks)
    rankedTrackList = TrackList()
    if args.startTracks is not None:
        for startTrack in args.startTracks.split(","):
            track = inputTrackList.getTrackByName(startTrack)
            if track is None:
                logger.warning("Start track %s not found in tracks XML" %
                               startTrack)
            else:
                rankedTrackList.addTrack(copy.deepcopy(track))

    numTracks = len(inputTrackList) - len(rankedTrackList)
    currentScore, currentBIC = 0.0, sys.maxint

    # compute full segmentation if --fullSegment is True
    if args.fullSegment is True:
        args.fullSegTrainPath = os.path.abspath(
            os.path.join(args.outDir, "fullSegTrain.bed"))
        segmentCmd = "segmentTracks.py %s %s %s %s" % (args.segTracks,
                                                       args.training,
                                                       args.fullSegTrainPath,
                                                       args.segOpts)
        runShellCommand(segmentCmd)
        args.fullSegEvalPath = os.path.abspath(
            os.path.join(args.outDir, "fullSegEval.bed"))
        segmentCmd = "segmentTracks.py %s %s %s %s" % (args.segTracks,
                                                       args.truth,
                                                       args.fullSegEvalPath,
                                                       args.segOpts)
        runShellCommand(segmentCmd)

    # header
    rankFile = open(os.path.join(args.outDir, "ranking.txt"), "w")
    rankFile.write("It.\tTrack\tF1\tBIC\tNaiveF1\tAccProbSlope\tAccProbR2\n")
    rankFile.close()

    # baseline score if we're not starting from scratch
    baseIt = 0
    if args.startTracks is not None:
        curTrackList = copy.deepcopy(rankedTrackList)
        score, bic, naive, slope, rsq = runTrial(curTrackList, baseIt,
                                                 "baseline_test", args)
        rankFile = open(os.path.join(args.outDir, "ranking.txt"), "a")
        rankFile.write("%d\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
            baseIt, args.startTracks, score, bic, naive, slope, rsq))
        rankFile.close()
        baseIt += 1

    for iteration in xrange(baseIt, baseIt + numTracks):
        bestItScore = -sys.maxint
        bestItBic = sys.maxint
        bestItNaive = -sys.maxint
        bestNextTrack = None
        bestSlope = None
        bestR = None
        for nextTrack in inputTrackList:
            if rankedTrackList.getTrackByName(nextTrack.getName()) is not None:
                continue
            curTrackList = copy.deepcopy(rankedTrackList)
            curTrackList.addTrack(nextTrack)
            score, bic, naive, slope, rsq = runTrial(curTrackList, iteration,
                                                     nextTrack.getName(),
                                                     args)
            best = False
            if args.bic is True:
                if bic < bestItBic or (bic == bestItBic and
                                       score > bestItScore):
                    best = True
            elif args.naive is True:
                if naive > bestItNaive or (naive == bestItNaive and
                                           score > bestItScore):
                    best = True
            elif score > bestItScore or (score == bestItScore and
                                         bic < bestItBic):
                best = True
            if best is True:
                bestItScore, bestItBic, bestItNaive, bestSlope, bestR,\
                  bestNextTrack = score, bic, naive, slope, rsq, nextTrack
            flags = "a"
            if iteration == baseIt:
                flags = "w"
            trackLogFile = open(os.path.join(args.outDir,
                                             nextTrack.getName() + ".txt"),
                                flags)
            trackLogFile.write("%d\t%f\t%f\t%f\t%f\t%f\n" % (
                iteration, score, bic, naive, slope, rsq))
            trackLogFile.close()
        rankedTrackList.addTrack(copy.deepcopy(bestNextTrack))
        rankedTrackList.saveXML(os.path.join(args.outDir,
                                             "iter%d" % iteration,
                                             "tracks.xml"))

        rankFile = open(os.path.join(args.outDir, "ranking.txt"), "a")
        rankFile.write("%d\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
            iteration, bestNextTrack.getName(), bestItScore, bestItBic,
            bestItNaive, bestSlope, bestR))
        rankFile.close()
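# The selection rule above, pulled out as a standalone sketch for clarity
# (argument names are mine, not from this module): lower BIC wins under
# --bic, otherwise higher score wins, with the secondary metric breaking
# ties.  The --naive branch follows the same pattern with naive in place
# of bic.
def _isBetter(score, bic, bestScore, bestBic, rankByBic):
    if rankByBic:
        return bic < bestBic or (bic == bestBic and score > bestScore)
    return score > bestScore or (score == bestScore and bic < bestBic)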
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Create a teHMM")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("trainingBed", help="Path of BED file containing"
                        " genome regions to train model on. If --supervised "
                        "is used, the names in this bed file will be treated "
                        "as the true annotation (otherwise it is only used"
                        " for interval coordinates)")
    parser.add_argument("outputModel", help="Path of output hmm")
    parser.add_argument("--numStates", help="Number of states in model",
                        type=int, default=2)
    parser.add_argument("--iter", help="Number of EM iterations",
                        type=int, default=100)
    parser.add_argument("--supervised", help="Use name (4th) column of "
                        "<trainingBed> for the true hidden states of the"
                        " model. Transition parameters will be estimated"
                        " directly from this information rather than EM."
                        " NOTE: The number of states will be determined "
                        "from the bed.", action="store_true", default=False)
    parser.add_argument("--cfg", help="Use Context Free Grammar instead of "
                        "HMM. Only works with --supervised for now",
                        action="store_true", default=False)
    parser.add_argument("--saPrior", help="Confidence in self alignment "
                        "track for CFG. Probability of pair emission "
                        "is multiplied by this number if the bases are"
                        " aligned and its complement if bases are not"
                        " aligned. Must be between [0,1].",
                        default=0.95, type=float)
    parser.add_argument("--pairStates", help="Comma-separated list of states"
                        " (from trainingBed) that are treated as"
                        " pair-emitters for the CFG", default=None)
    parser.add_argument("--emFac", help="Normalization factor for weighting"
                        " emission probabilities because when there are "
                        "many tracks, the transition probabilities can get "
                        "totally lost. 0 = no normalization. 1 ="
                        " divide by number of tracks. k = divide by number "
                        "of tracks / k", type=int, default=0)
    parser.add_argument("--initTransProbs", help="Path of text file where"
                        " each line has three entries: FromState ToState"
                        " Probability. This file (all other transitions get"
                        " probability 0) is used to specify the initial"
                        " transition model. The names and number of states"
                        " will be initialized according to this file"
                        " (overriding --numStates)", default=None)
    parser.add_argument("--fixTrans", help="Do not learn transition"
                        " parameters (best used with --initTransProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initEmProbs", help="Path of text file where each "
                        "line has four entries: State Track Symbol"
                        " Probability. This file (all other emissions get"
                        " probability 0) is used to specify the initial"
                        " emission model. All states specified in this file"
                        " must appear in the file specified with"
                        " --initTransProbs (but not vice versa).",
                        default=None)
    parser.add_argument("--fixEm", help="Do not learn emission parameters"
                        " (best used with --initEmProbs)",
                        action="store_true", default=False)
    parser.add_argument("--initStartProbs", help="Path of text file where"
                        " each line has two entries: State Probability."
                        " This file (all other start probs get probability"
                        " 0) is used to specify the initial start dist. All"
                        " states specified in this file must appear in the"
                        " file specified with --initTransProbs (but not vice"
                        " versa).", default=None)
    parser.add_argument("--fixStart", help="Do not learn start parameters"
                        " (best used with --initStartProbs)",
                        action="store_true", default=False)
    parser.add_argument("--forceTransProbs", help="Path of text file where"
                        " each line has three entries: FromState ToState"
                        " Probability. These transition probabilities will"
                        " override any learned probabilities after each"
                        " training iteration (unspecified entries will not"
                        " be set to 0 in this case; the learned values will"
                        " be kept, but normalized as needed)", default=None)
    parser.add_argument("--forceEmProbs", help="Path of text file where each"
                        " line has four entries: State Track Symbol"
                        " Probability. These emission probabilities will"
                        " override any learned probabilities after each"
                        " training iteration (unspecified entries will not"
                        " be set to 0 in this case; the learned values will"
                        " be kept, but normalized as needed)", default=None)
    parser.add_argument("--flatEm", help="Use a flat emission distribution"
                        " as a baseline. If not specified, the initial"
                        " emission distribution will be randomized by"
                        " default. Emission probabilities specified with"
                        " --initEmProbs or --forceEmProbs will never be"
                        " affected by randomization. The randomization is"
                        " important for Baum-Welch training, since if two"
                        " states don't have at least one different emission"
                        " or transition probability to begin with, they will"
                        " never learn to be different.",
                        action="store_true", default=False)
    parser.add_argument("--emRandRange", help="When randomly initializing an"
                        " emission distribution, constrain the values to the"
                        " given range (pair of comma-separated numbers)."
                        " Overridden by --initEmProbs and --forceEmProbs"
                        " when applicable. Completely overridden by --flatEm"
                        " (which is equivalent to --emRandRange .5,.5)."
                        " Actual values used will always be normalized.",
                        default="0.2,0.8")
    parser.add_argument("--segment", help="Bed file of segments to treat as "
                        "single columns for HMM (ie as created with "
                        "segmentTracks.py). IMPORTANT: this file must cover "
                        "the same regions as the trainingBed file. Unless in "
                        "supervised mode, probably best to use the same bed"
                        " file as both trainingBed and --segment argument."
                        " Otherwise use intersectBed to make sure the overlap"
                        " is exact", default=None)
    parser.add_argument("--segLen", help="Effective segment length used for"
                        " normalizing input segments (specifying 0 means no"
                        " normalization applied)", type=int, default=0)
    parser.add_argument("--seed", help="Seed for random number generator"
                        " which will be used to initialize emissions "
                        "(if --flatEm and --supervised not specified)",
                        default=None, type=int)
    parser.add_argument("--reps", help="Number of replicates (with different"
                        " random initializations) to run. The replicate"
                        " with the highest likelihood will be chosen for the"
                        " output", default=1, type=int)
    parser.add_argument("--numThreads", help="Number of threads to use when"
                        " running replicates (see --reps) in parallel.",
                        type=int, default=1)
    parser.add_argument("--emThresh", help="Threshold used for convergence"
                        " in Baum-Welch training, ie delta log likelihood"
                        " must be bigger than this number (which should be"
                        " positive) for convergence", type=float,
                        default=0.001)
    parser.add_argument("--saveAllReps", help="Save all replicate (--reps)"
                        " models to disk, instead of just the best one."
                        " Format is <outputModel>.repN. There will be"
                        " --reps - 1 such models saved, as the best output"
                        " counts as a replicate", action="store_true",
                        default=False)
    parser.add_argument("--maxProb", help="Gaussian distributions and/or"
                        " segment length corrections can cause probability"
                        " to *decrease* during BW iteration. Use this option"
                        " to remember the parameters with the highest"
                        " probability rather than returning the parameters"
                        " after the final iteration.", action="store_true",
                        default=False)
    parser.add_argument("--maxProbCut", help="Use with --maxProb option to"
                        " stop training if a given number of iterations go"
                        " by without hitting a new maxProb", default=None,
                        type=int)
    parser.add_argument("--transMatEpsilons", help="By default, epsilons are"
                        " added to all transition probabilities to prevent "
                        "converging on 0 due to rounding error only for fully"
                        " unsupervised training. Use this option to force"
                        " this behaviour for supervised and semisupervised"
                        " modes", action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    if args.cfg is True:
        assert args.supervised is True
        assert args.saPrior >= 0. and args.saPrior <= 1.
    if args.pairStates is not None:
        assert args.cfg is True
    if args.initTransProbs is not None or args.fixTrans is True or\
      args.initEmProbs is not None or args.fixEm is True:
        if args.cfg is True:
            raise RuntimeError("--initTransProbs, --fixTrans, --initEmProbs,"
                               " --fixEm are not currently compatible with"
                               " --cfg.")
    if args.fixTrans is True and args.supervised is True:
        raise RuntimeError("--fixTrans option not compatible with"
                           " --supervised")
    if args.fixEm is True and args.supervised is True:
        raise RuntimeError("--fixEm option not compatible with --supervised")
    if (args.forceTransProbs is not None or args.forceEmProbs is not None) \
      and args.cfg is True:
        raise RuntimeError("--forceTransProbs and --forceEmProbs are not "
                           "currently compatible with --cfg")
    if args.flatEm is True and args.supervised is False and\
      args.initEmProbs is None and args.initTransProbs is None:
        raise RuntimeError("--flatEm must be used with --initEmProbs and or"
                           " --initTransProbs")
    if args.initEmProbs is not None and args.initTransProbs is None:
        raise RuntimeError("--initEmProbs can only be used in conjunction"
                           " with --initTransProbs")
    if args.emRandRange is not None:
        args.emRandRange = args.emRandRange.split(",")
        try:
            assert len(args.emRandRange) == 2
            args.emRandRange = (float(args.emRandRange[0]),
                                float(args.emRandRange[1]))
        except:
            raise RuntimeError("Invalid --emRandRange specified")
    if args.transMatEpsilons is False:
        # old logic here.  now overridden with above options
        args.transMatEpsilons = (args.supervised is False and
                                 args.initTransProbs is None and
                                 args.forceTransProbs is None)

    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # read training intervals from the bed file
    logger.info("loading training intervals from %s" % args.trainingBed)
    mergedIntervals = getMergedBedIntervals(args.trainingBed, ncol=4)
    if mergedIntervals is None or len(mergedIntervals) < 1:
        raise RuntimeError("Could not read any intervals from %s" %
                           args.trainingBed)

    # read segment intervals
    segIntervals = None
    if args.segment is not None:
        logger.info("loading segment intervals from %s" % args.segment)
        try:
            checkExactOverlap(args.trainingBed, args.segment)
        except:
            raise RuntimeError("bed file passed with --segment option"
                               " must exactly overlap trainingBed")
        segIntervals = readBedIntervals(args.segment, sort=True)
    elif args.segLen > 0:
        raise RuntimeError("--segLen can only be used with --segment")
    if args.segLen <= 0:
        args.segLen = None
    if args.segLen > 0 and args.segLen != 1:
        logger.warning("--segLen should be 0 (no correction) or 1 (base"
                       " correction). Values > 1 may cause bias.")

    # read the tracks, while intersecting them with the training intervals
    logger.info("loading tracks %s" % args.tracksInfo)
    trackData = TrackData()
    trackData.loadTrackData(args.tracksInfo, mergedIntervals,
                            segmentIntervals=segIntervals)

    catMap = None
    userTrans = None
    if args.supervised is False and args.initTransProbs is not None:
        logger.debug("initializing transition model with user data")
        catMap = stateNamesFromUserTrans(args.initTransProbs)
        # state number is overridden by the transProbs file
        args.numStates = len(catMap)

    truthIntervals = None
    # state number is overridden by the input bed file in supervised mode
    if args.supervised is True:
        logger.info("processing supervised state names")
        # we reload because we don't want to be merging them here
        truthIntervals = readBedIntervals(args.trainingBed, ncol=4)
        catMap = mapStateNames(truthIntervals)
        args.numStates = len(catMap)

    # train the model
    seeds = [random.randint(0, 4294967294)]
    if args.seed is not None:
        seeds = [args.seed]
        random.seed(args.seed)
    seeds += [random.randint(0, sys.maxint) for x in xrange(1, args.reps)]

    def trainClosure(randomSeed):
        return trainModel(randomSeed, trackData=trackData, catMap=catMap,
                          userTrans=userTrans,
                          truthIntervals=truthIntervals, args=args)

    modelList = runParallelShellCommands(argList=seeds,
                                         numProc=args.numThreads,
                                         execFunction=trainClosure,
                                         useThreads=True)

    # select best model
    logmsg = ""
    bestModel = (-1, LOGZERO)
    for i in xrange(len(modelList)):
        curModel = (i, modelList[i].getLastLogProb())
        if curModel[1] > bestModel[1]:
            bestModel = curModel
        if curModel[1] is not None:
            logmsg += "Rep %i: TotalProb: %f\n" % curModel
    if len(modelList) > 1:
        logging.info("Training Replicates Statistics:\n%s" % logmsg)
        logging.info("Selecting best replicate (%d, %f)" % bestModel)
    model = modelList[bestModel[0]]

    # write the model to a pickle
    logger.info("saving trained model to %s" % args.outputModel)
    saveModel(args.outputModel, model)

    # write all replicates
    writtenCount = 0
    if args.saveAllReps is True:
        for i, repModel in enumerate(modelList):
            if i != bestModel[0]:
                repPath = "%s.rep%d" % (args.outputModel, writtenCount)
                logger.info("saving replicate model to %s" % repPath)
                saveModel(repPath, repModel)
                writtenCount += 1

    cleanBedTool(tempBedToolPath)
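# Standalone sketch of the replicate-selection rule used above (LOGZERO is
# approximated with -inf here; the codebase defines its own constant): keep
# the model whose final Baum-Welch log-likelihood is largest.
def _pickBestReplicate(logProbs):
    best = (-1, float("-inf"))
    for i, lp in enumerate(logProbs):
        if lp > best[1]:
            best = (i, lp)
    return best
# e.g. _pickBestReplicate([-1200.5, -1100.2, -1150.0]) -> (1, -1100.2)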
def main(argv=None):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Combine a bunch of non-numeric BED tracks into a"
        " single file, using fitStateNames.py to try to keep names "
        "consistent. Idea is to be used as a baseline to compare"
        " the hmm to (via base-by-base statistics, primarily, since"
        " this procedure could induce some fragmentation)")

    parser.add_argument("tracksXML", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("regionBed", help="BED file representing "
                        "target region (best if whole genome)")
    parser.add_argument("outBed", help="Output bed")
    parser.add_argument("--tracks", help="Comma-separated list of "
                        "track names to use. All tracks will be"
                        " used by default", default=None)
    parser.add_argument("--outside", help="Name to give non-annotated "
                        "regions", default="Outside")
    parser.add_argument("--fitThresh", help="Min map percentage (0,1)"
                        " in order to rename (see --qualThresh option"
                        " of fitStateNames.py)", type=float, default=0.5)
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    inputTrackList = TrackList(args.tracksXML)
    iter = 0

    # get regionBed where all intervals are merged when possible
    regionIntervals = getMergedBedIntervals(args.regionBed, sort=True)
    tempRegionPath = getLocalTempPath("Temp", "_reg.bed")
    tempRegionFile = open(tempRegionPath, "w")
    for interval in regionIntervals:
        tempRegionFile.write("\t".join([str(x) for x in interval]) + "\n")
    tempRegionFile.close()

    # accumulate tracks in temp file
    tempOutPath = getLocalTempPath("Temp", "_out.bed")

    for track in inputTrackList:
        if track.shift is not None or track.scale is not None or\
          track.logScale is not None or track.dist == "gaussian" or\
          os.path.splitext(track.getPath())[1].lower() != ".bed":
            logger.warning("Skipping numeric track %s" % track.getName())
        elif args.tracks is None or track.getName() in args.tracks.split(","):
            combineTrack(track, tempOutPath, tempRegionPath, iter, args)
            iter += 1

    # nothing got written, make everything outside
    if iter == 0:
        tempOutFile = open(tempOutPath, "w")
        for interval in regionIntervals:
            tempOutFile.write("%s\t%s\t%s\t%s\n" % (interval[0], interval[1],
                                                    interval[2],
                                                    args.outside))
        tempOutFile.close()

    runShellCommand("mv %s %s" % (tempOutPath, args.outBed))
    runShellCommand("rm -f %s" % tempRegionPath)
    cleanBedTool(tempBedToolPath)
def main(argv=None):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Thin wrapper of teHmmTrain.py and teHmmEval.py "
        "to generate a table of Number-of-HMM-states VS BIC. Lower BIC"
        " is better")

    parser.add_argument("tracks", help="tracks xml used for training and"
                        " eval")
    parser.add_argument("trainingBeds", help="comma-separated list of"
                        " training regions (training region size will be a"
                        " variable in output table). If segmentation is"
                        " activated, these must also be the segmented"
                        " beds...")
    parser.add_argument("evalBed", help="eval region")
    parser.add_argument("trainOpts", help="all teHmmTrain options in quotes")
    parser.add_argument("evalOpts", help="all teHmmEval options in quotes")
    parser.add_argument("states", help="comma-separated list of numbers of"
                        " states to try")
    parser.add_argument("outDir", help="output directory")
    parser.add_argument("--reps", help="number of replicates", type=int,
                        default=1)
    parser.add_argument("--proc", help="maximum number of processors to use"
                        " in parallel", type=int, default=1)
    parser.add_argument("--resume", help="try not to rewrite existing files",
                        action="store_true", default=False)
    parser.add_argument("--initTrans", help="the states argument is"
                        " overridden to specify a list of transition"
                        " initialization files instead of state numbers",
                        action="store_true", default=False)
    parser.add_argument("--numReps", help="the states argument is overridden"
                        " to specify a list of replicate numbers (--reps)"
                        " arguments", action="store_true", default=False)
    parser.add_argument("--numIter", help="the states argument is overridden"
                        " to specify a list of iteration counts (--iter)"
                        " arguments", action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    if sum([int(i) for i in [args.initTrans, args.numReps,
                             args.numIter]]) > 1:
        raise RuntimeError("only one of {--initTrans, --numReps, --numIter} "
                           "can be used at a time")

    if not os.path.isdir(args.outDir):
        runShellCommand("mkdir %s" % args.outDir)

    # get the sizes of the training beds
    trainingSizes = []
    trainingBeds = []
    for tb in args.trainingBeds.split(","):
        if len(tb) > 0:
            trainingBeds.append(tb)
    for bed in trainingBeds:
        assert os.path.isfile(bed)
        bedLen = 0
        for interval in readBedIntervals(bed):
            bedLen += interval[2] - interval[1]
        trainingSizes.append(bedLen)

    # make sure --bed not in teHmmEval options and --numStates not in train
    # options
    trainOpts = args.trainOpts.split()
    if "--numStates" in args.trainOpts and not args.numReps and\
      not args.numIter:
        nsIdx = trainOpts.index("--numStates")
        assert nsIdx < len(trainOpts) - 1
        del trainOpts[nsIdx]
        del trainOpts[nsIdx]
    if "--initTransProbs" in args.trainOpts:
        tpIdx = trainOpts.index("--initTransProbs")
        assert tpIdx < len(trainOpts) - 1
        del trainOpts[tpIdx]
        del trainOpts[tpIdx]
    trainProcs = 1
    if "--numThreads" in args.trainOpts:
        npIdx = trainOpts.index("--numThreads")
        assert npIdx < len(trainOpts) - 1
        trainProcs = int(trainOpts[npIdx + 1])
    segOptIdx = -1
    if "--segment" in args.trainOpts:
        segIdx = trainOpts.index("--segment")
        assert segIdx < len(trainOpts) - 1
        segOptIdx = segIdx + 1
    if args.numReps and "--reps" in args.trainOpts:
        repsIdx = trainOpts.index("--reps")
        assert repsIdx < len(trainOpts) - 1
        del trainOpts[repsIdx]
        del trainOpts[repsIdx]
    if args.numIter and "--iter" in args.trainOpts:
        iterIdx = trainOpts.index("--iter")
        assert iterIdx < len(trainOpts) - 1
        del trainOpts[iterIdx]
        del trainOpts[iterIdx]
    evalOpts = args.evalOpts.split()
    if "--bed" in args.evalOpts:
        bedIdx = evalOpts.index("--bed")
        assert bedIdx < len(evalOpts) - 1
        del evalOpts[bedIdx]
        del evalOpts[bedIdx]
    if "--bic" in args.evalOpts:
        bicIdx = evalOpts.index("--bic")
        assert bicIdx < len(evalOpts) - 1
        del evalOpts[bicIdx]
        del evalOpts[bicIdx]

    # hack in support for --initTrans option by munging out model sizes
    # from the text files
    if args.initTrans is True:
        transFiles = args.states.split(",")
        states = []
        for tf in transFiles:
            stateSet = set()
            with open(tf) as f:
                for line in f:
                    toks = line.split()
                    if len(toks) > 1 and toks[0][0] != "#":
                        stateSet.add(toks[0])
                        stateSet.add(toks[1])
            states.append(len(stateSet))
    else:
        states = args.states.split(",")

    trainCmds = []
    evalCmds = []
    prevSize = -1
    sameSizeCount = 0
    for trainingSize, trainingBed in zip(trainingSizes, trainingBeds):
        # hack to take into account we may have different inputs with the
        # same size, so their corresponding results need unique filenames
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        for numStates in states:
            for rep in xrange(args.reps):
                outMod = os.path.join(args.outDir, "hmm_%d.%d.%d.%d.mod" % (
                    trainingSize, sameSizeCount, int(numStates), int(rep)))
                if segOptIdx != -1:
                    trainOpts[segOptIdx] = trainingBed
                if args.initTrans is True:
                    statesOpt = "--initTransProbs %s" % transFiles[
                        states.index(numStates)]
                elif args.numIter is True:
                    # states argument overridden by iterations
                    statesOpt = "--iter %d" % int(numStates)
                elif args.numReps is True:
                    # states argument overridden by reps
                    statesOpt = "--reps %d" % int(numStates)
                else:
                    statesOpt = "--numStates %d" % int(numStates)
                trainCmd = "teHmmTrain.py %s %s %s %s %s" % (
                    args.tracks, trainingBed, outMod, " ".join(trainOpts),
                    statesOpt)
                if not args.resume or not os.path.isfile(outMod) or \
                  os.path.getsize(outMod) < 100:
                    trainCmds.append(trainCmd)

                outBic = outMod.replace(".mod", ".bic")
                outBed = outMod.replace(".mod", "_eval.bed")
                evalCmd = "teHmmEval.py %s %s %s --bed %s --bic %s %s" % (
                    args.tracks, outMod, args.evalBed, outBed, outBic,
                    " ".join(evalOpts))
                if not args.resume or not os.path.isfile(outBic) or \
                  os.path.getsize(outBic) < 2:
                    evalCmds.append(evalCmd)

    # run the training
    runParallelShellCommands(trainCmds, max(1, args.proc / trainProcs))

    # run the eval
    runParallelShellCommands(evalCmds, args.proc)

    # make the table header
    tableFile = open(os.path.join(args.outDir, "bictable.csv"), "w")
    stateColName = "states"
    if args.numIter is True:
        stateColName = "iter"
    elif args.numReps is True:
        stateColName = "reps"
    tableFile.write("trainFile, trainSize, %s, meanBic, minBic, maxBic" %
                    stateColName)
    for i in xrange(args.reps):
        tableFile.write(", bic.%d" % i)
    tableFile.write("\n")

    # make the table body
    prevSize = -1
    sameSizeCount = 0
    for trainingSize, trainingBed in zip(trainingSizes, trainingBeds):
        # hack to take into account we may have different inputs with the
        # same size, so their corresponding results need unique filenames
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        for numStates in states:
            bics = []
            printBics = []
            for rep in xrange(args.reps):
                outMod = os.path.join(args.outDir, "hmm_%d.%d.%d.%d.mod" % (
                    trainingSize, sameSizeCount, int(numStates), int(rep)))
                outBic = outMod.replace(".mod", ".bic")
                try:
                    with open(outBic, "r") as obFile:
                        for line in obFile:
                            bic = float(line.split()[0])
                            break
                    bics.append(bic)
                    printBics.append(bic)
                except:
                    logger.warning("Couldn't find bic %s" % outBic)
                    printBics.append("ERROR")
            # write row
            tableFile.write("%s, %d, %d" % (trainingBed, int(trainingSize),
                                            int(numStates)))
            if len(bics) > 0:
                tableFile.write(", %f, %f, %f" % (np.mean(bics),
                                                  np.min(bics),
                                                  np.max(bics)))
            else:
                tableFile.write(", ERROR, ERROR, ERROR")
            for pb in printBics:
                tableFile.write(", %s" % pb)
            tableFile.write("\n")
    tableFile.close()
    cleanBedTool(tempBedToolPath)
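# For reference, the statistic tabulated above is the standard Bayesian
# Information Criterion (the exact computation is delegated to
# teHmmEval.py --bic):
#   BIC = k * ln(n) - 2 * ln(L)
# where k is the number of free model parameters, n the number of
# observations, and L the maximized likelihood; lower values are better.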
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Fix up track names and sort alphabetically. Easier to"
        " do here on the xml than at the end for the paper.")

    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("outTracksInfo", help="Path to write modified tracks"
                        " XML")
    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()
    args.logOpString = "--logLevel %s" % getLogLevelString()
    if args.logFile is not None:
        args.logOpString += " --logFile %s" % args.logFile

    nm = dict()
    nm["hollister"] = "RM-RepBase-Hollister"
    nm["chaux"] = "RM-RepBase-deLaChaux"
    nm["repeat_modeler"] = "RM-RepeatModeler"
    nm["repbase"] = "RM-RepBase"
    nm["repet"] = "REPET"
    nm["ltr_finder"] = "LTR_FINDER"
    nm["ltr_harvest"] = "LTR_Harvest"
    nm["ltr_termini"] = "lastz-Termini"
    nm["lastz-Termini"] = "lastz-LTRTermini"
    nm["tir_termini"] = "lastz-InvTermini"
    nm["irf"] = "IRF"
    nm["palindrome"] = "lastz-Palindrome"
    nm["overlap"] = "lastz-Overlap"
    nm["mitehunter"] = "MITE-Hunter"
    nm["helitronscanner"] = "HelitronScanner"
    nm["cov_80-"] = "lastz-SelfLowId"
    nm["cov_80-90"] = "lastz-SelfMedId"
    nm["cov_90+"] = "lastz-SelfHighId"
    nm["left_peak_80-"] = "lastz-SelfPeakLeftLow"
    nm["lastz-SelfLowLeftPeak"] = nm["left_peak_80-"]
    nm["left_peak_80-90"] = "lastz-SelfPeakLeftMed"
    nm["lastz-SelfMedLeftPeak"] = nm["left_peak_80-90"]
    nm["left_peak_90+"] = "lastz-SelfPeakLeftHigh"
    nm["lastz-SelfHighLeftPeak"] = nm["left_peak_90+"]
    nm["right_peak_80-"] = "lastz-SelfPeakRightLow"
    nm["lastz-SelfLowRightPeak"] = nm["right_peak_80-"]
    nm["right_peak_80-90"] = "lastz-SelfPeakRightMed"
    nm["lastz-SelfMedRightPeak"] = nm["right_peak_80-90"]
    nm["right_peak_90+"] = "lastz-SelfPeakRightHigh"
    nm["lastz-SelfHighRightPeak"] = nm["right_peak_90+"]
    nm["cov_maxPId"] = "lastz-SelfPctMaxId"
    nm["lastz-SelfMaxPctId"] = nm["cov_maxPId"]
    nm["te_domains"] = "TE-Domains"
    nm["fgenesh"] = "Genes"
    nm["genes"] = nm["fgenesh"]
    nm["refseq"] = nm["fgenesh"]
    nm["mrna"] = "mRNA"
    nm["srna"] = "sRNA"
    nm["ortho_depth"] = "Alignment-Depth"
    nm["orthology"] = nm["ortho_depth"]
    nm["chain_depth"] = nm["ortho_depth"]
    nm["alignment_depth"] = nm["ortho_depth"]
    nm["gcpct"] = "GC"
    nm["trf"] = "TRF"
    nm["windowmasker"] = "WindowMasker"
    nm["polyN"] = "Ns"
    nm["phastcons_ce"] = "Conservation"
    nm["phastcons"] = nm["phastcons_ce"]
    nm["PhastCons"] = nm["phastcons_ce"]
    nm["phyloP"] = nm["phastcons_ce"]
    nm["phylop"] = nm["phastcons_ce"]

    rtracks = dict()
    rtracks["tantan"] = True
    rtracks["polyA"] = True
    rtracks["transposon_psi"] = True
    rtracks["transposonpsi"] = True
    rtracks["repbase_censor"] = True
    rtracks["tsd"] = True
    rtracks["repbase_default"] = True
    rtracks["dustmasker"] = True

    inTracks = TrackList(args.tracksInfo)
    outTracks = TrackList()
    outList = []

    for track in itertools.chain(inTracks.trackList, inTracks.maskTrackList):
        if not os.path.exists(track.path):
            raise RuntimeError("Track DNE %s" % track.path)
        if track.name not in rtracks:
            if track.name in nm:
                track.name = nm[track.name]
            else:
                logger.warning("Did not map track %s" % track.name)
            outList.append(track)
        else:
            logger.warning("Deleted track %s" % track.name)

    # sort the list
    def sortComp(x):
        lname = x.name.lower()
        if x.name == "RM-RepeatModeler":
            return "aaaaa" + lname
        elif "RM" in x.name:
            return "aaaa" + lname
        elif "REPET" in x.name:
            return "aaa" + lname
        elif "softmask" in lname or "tigr" in lname or "te-domains" in lname:
            return "aa" + lname
        elif x.getDist() == "mask":
            return "zzzz" + lname
        else:
            return lname

    outList = sorted(outList, key=lambda track: sortComp(track))
    for track in outList:
        outTracks.addTrack(track)

    outTracks.saveXML(args.outTracksInfo)
    cleanBedTool(tempBedToolPath)
def main(argv=None):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Thin wrapper of teHmmTrain.py and teHmmEval.py "
        "to generate a table of Number-of-HMM-states VS BIC. Lower BIC"
        " is better")

    parser.add_argument("tracks", help="tracks xml used for training and eval")
    parser.add_argument("trainingBeds", help="comma-separated list of training regions"
                        " (training region size will be a variable in output table). "
                        "if segmentation is activated, these must also be the "
                        "segmented beds...")
    parser.add_argument("evalBed", help="eval region")
    parser.add_argument("trainOpts", help="all teHmmTrain options in quotes")
    parser.add_argument("evalOpts", help="all teHmmEval options in quotes")
    parser.add_argument("states", help="comma-separated list of numbers of states"
                        " to try")
    parser.add_argument("outDir", help="output directory")
    parser.add_argument("--reps", help="number of replicates", type=int,
                        default=1)
    parser.add_argument("--proc", help="maximum number of processors to use"
                        " in parallel", type=int, default=1)
    parser.add_argument("--resume", help="try not to rewrite existing files",
                        action="store_true", default=False)
    parser.add_argument("--initTrans", help="the states argument is overridden"
                        " to specify a list of transition initialization files "
                        "instead of state numbers", action="store_true",
                        default=False)
    parser.add_argument("--numReps", help="the states argument is overridden"
                        " to specify a list of replicate numbers (--reps)"
                        " arguments", action="store_true", default=False)
    parser.add_argument("--numIter", help="the states argument is overridden"
                        " to specify a list of iteration counts (--iter)"
                        " arguments", action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    if sum([int(i) for i in [args.initTrans, args.numReps, args.numIter]]) > 1:
        raise RuntimeError("only one of {--initTrans, --numReps, --numIter} "
                           "can be used at a time")

    if not os.path.isdir(args.outDir):
        runShellCommand("mkdir %s" % args.outDir)

    # get the sizes of the training beds
    trainingSizes = []
    trainingBeds = []
    for tb in args.trainingBeds.split(","):
        if len(tb) > 0:
            trainingBeds.append(tb)
    for bed in trainingBeds:
        assert os.path.isfile(bed)
        bedLen = 0
        for interval in readBedIntervals(bed):
            bedLen += interval[2] - interval[1]
        trainingSizes.append(bedLen)

    # make sure --bed not in teHmmEval options and --numStates not in train
    # options
    trainOpts = args.trainOpts.split()
    if "--numStates" in args.trainOpts and not args.numReps and not args.numIter:
        nsIdx = trainOpts.index("--numStates")
        assert nsIdx < len(trainOpts) - 1
        del trainOpts[nsIdx]
        del trainOpts[nsIdx]
    if "--initTransProbs" in args.trainOpts:
        tpIdx = trainOpts.index("--initTransProbs")
        assert tpIdx < len(trainOpts) - 1
        del trainOpts[tpIdx]
        del trainOpts[tpIdx]
    trainProcs = 1
    if "--numThreads" in args.trainOpts:
        npIdx = trainOpts.index("--numThreads")
        assert npIdx < len(trainOpts) - 1
        trainProcs = int(trainOpts[npIdx + 1])
    segOptIdx = -1
    if "--segment" in args.trainOpts:
        segIdx = trainOpts.index("--segment")
        assert segIdx < len(trainOpts) - 1
        segOptIdx = segIdx + 1
    if args.numReps and "--reps" in args.trainOpts:
        repsIdx = trainOpts.index("--reps")
        assert repsIdx < len(trainOpts) - 1
        del trainOpts[repsIdx]
        del trainOpts[repsIdx]
    if args.numIter and "--iter" in args.trainOpts:
        iterIdx = trainOpts.index("--iter")
        assert iterIdx < len(trainOpts) - 1
        del trainOpts[iterIdx]
        del trainOpts[iterIdx]
    evalOpts = args.evalOpts.split()
    if "--bed" in args.evalOpts:
        bedIdx = evalOpts.index("--bed")
        assert bedIdx < len(evalOpts) - 1
        del evalOpts[bedIdx]
        del evalOpts[bedIdx]
    if "--bic" in args.evalOpts:
        bicIdx = evalOpts.index("--bic")
        assert bicIdx < len(evalOpts) - 1
        del evalOpts[bicIdx]
        del evalOpts[bicIdx]

    # hack in support for --initTrans option by munging out model sizes
    # from the text files
    if args.initTrans is True:
        transFiles = args.states.split(",")
        states = []
        for tf in transFiles:
            stateSet = set()
            with open(tf) as f:
                for line in f:
                    toks = line.split()
                    if len(toks) > 1 and toks[0][0] != "#":
                        stateSet.add(toks[0])
                        stateSet.add(toks[1])
            states.append(len(stateSet))
    else:
        states = args.states.split(",")

    trainCmds = []
    evalCmds = []
    prevSize = -1
    sameSizeCount = 0
    for trainingSize, trainingBed in zip(trainingSizes, trainingBeds):
        # hack to take into account we may have different inputs with the
        # same size, so their corresponding results need unique filenames
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        for numStates in states:
            for rep in xrange(args.reps):
                outMod = os.path.join(args.outDir, "hmm_%d.%d.%d.%d.mod" % (
                    trainingSize, sameSizeCount, int(numStates), int(rep)))
                if segOptIdx != -1:
                    trainOpts[segOptIdx] = trainingBed
                if args.initTrans is True:
                    statesOpt = "--initTransProbs %s" % transFiles[states.index(numStates)]
                elif args.numIter is True:
                    # states argument overridden by iterations
                    statesOpt = "--iter %d" % int(numStates)
                elif args.numReps is True:
                    # states argument overridden by reps
                    statesOpt = "--reps %d" % int(numStates)
                else:
                    statesOpt = "--numStates %d" % int(numStates)
                trainCmd = "teHmmTrain.py %s %s %s %s %s" % (
                    args.tracks, trainingBed, outMod,
                    " ".join(trainOpts), statesOpt)
                if not args.resume or not os.path.isfile(outMod) or \
                   os.path.getsize(outMod) < 100:
                    trainCmds.append(trainCmd)

                outBic = outMod.replace(".mod", ".bic")
                outBed = outMod.replace(".mod", "_eval.bed")
                evalCmd = "teHmmEval.py %s %s %s --bed %s --bic %s %s" % (
                    args.tracks, outMod, args.evalBed, outBed, outBic,
                    " ".join(evalOpts))
                if not args.resume or not os.path.isfile(outBic) or \
                   os.path.getsize(outBic) < 2:
                    evalCmds.append(evalCmd)

    # run the training
    runParallelShellCommands(trainCmds, max(1, args.proc / trainProcs))
    # run the eval
    runParallelShellCommands(evalCmds, args.proc)

    # make the table header
    tableFile = open(os.path.join(args.outDir, "bictable.csv"), "w")
    stateColName = "states"
    if args.numIter is True:
        stateColName = "iter"
    elif args.numReps is True:
        stateColName = "reps"
    tableFile.write("trainFile, trainSize, %s, meanBic, minBic, maxBic" %
                    stateColName)
    for i in xrange(args.reps):
        tableFile.write(", bic.%d" % i)
    tableFile.write("\n")

    # make the table body
    prevSize = -1
    sameSizeCount = 0
    for (trainingSize, trainingBed) in zip(trainingSizes, trainingBeds):
        # hack to take into account we may have different inputs with the
        # same size, so their corresponding results need unique filenames
        if trainingSize == prevSize:
            sameSizeCount += 1
        else:
            sameSizeCount = 0
        prevSize = trainingSize
        for numStates in states:
            bics = []
            printBics = []
            for rep in xrange(args.reps):
                outMod = os.path.join(args.outDir, "hmm_%d.%d.%d.%d.mod" % (
                    trainingSize, sameSizeCount, int(numStates), int(rep)))
                outBic = outMod.replace(".mod", ".bic")
                try:
                    with open(outBic, "r") as obFile:
                        for line in obFile:
                            bic = float(line.split()[0])
                            break
                    bics.append(bic)
                    printBics.append(bic)
                except:
                    logger.warning("Couldn't find bic %s" % outBic)
                    printBics.append("ERROR")
            # write row
            tableFile.write("%s, %d, %d" % (trainingBed, int(trainingSize),
                                            int(numStates)))
            if len(bics) > 0:
                tableFile.write(", %f, %f, %f" % (np.mean(bics), np.min(bics),
                                                  np.max(bics)))
            else:
                tableFile.write(", ERROR, ERROR, ERROR")
            for pb in printBics:
                tableFile.write(", %s" % pb)
            tableFile.write("\n")
    tableFile.close()
    cleanBedTool(tempBedToolPath)
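# The sketch below is not part of the original tools: it is a minimal example
# of consuming the bictable.csv written above, picking the row (state count,
# iteration count, or rep count, depending on the header) with lowest mean BIC.
def bestRowFromBicTable(tablePath):
    """ return (statesColumnValue, meanBic) for the row with lowest mean BIC """
    best = None
    with open(tablePath) as f:
        f.readline()  # skip the header row written by main() above
        for line in f:
            toks = [t.strip() for t in line.split(",")]
            # rows whose BIC files could not be read carry ERROR placeholders
            if len(toks) < 4 or toks[3] == "ERROR":
                continue
            meanBic = float(toks[3])
            if best is None or meanBic < best[1]:
                best = (toks[2], meanBic)
    return best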
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Fill in masked intervals of an hmm prediction "
        "(from teHmmEval.py) with state corresponding to surrounding"
        " intervals.")
    parser.add_argument("tracksXML", help="XML track list (used to id masking"
                        " tracks)")
    parser.add_argument("allBed", help="Target scope. Masked intervals outside"
                        " of these regions will not be included")
    parser.add_argument("inBed", help="TE prediction BED file. State labels"
                        " should probably be mapped (ie with fitStateNames.py)")
    parser.add_argument("outBed", help="Output BED. Will be equivalent to"
                        " the input bed except all gaps corresponding to "
                        "masked intervals will be filled")
    parser.add_argument("--maxLen", help="Maximum length of a masked interval"
                        " to fill (inclusive). Use --delMask option with same"
                        " value if running compareBedStates.py after.",
                        type=int, default=sys.maxint)
    parser.add_argument("--default", help="Default label to give to masked "
                        "region if no label can be determined", default="0")
    parser.add_argument("--tgts", help="Only relabel gaps that "
                        "are flanked on both sides by the same state, and this state"
                        " is in this comma-separated list. --default used for other"
                        " gaps. If no targets specified then all states checked.",
                        default=None)
    parser.add_argument("--oneSidedTgts", help="Only relabel gaps that "
                        "are flanked on at least one side by a state in this comma-"
                        "separated list. --default used for other gaps",
                        default=None)
    parser.add_argument("--onlyDefault", help="Add the default state (--default)"
                        " to all masked gaps no matter what, ie ignoring all other "
                        "logic", action="store_true", default=False)
    parser.add_argument("--cut", help="Cut out gaps for masked tracks from the input."
                        " By default, the input is expected to come from the HMM "
                        "with mask intervals already absent, and will crash with"
                        " an assertion error if an overlap is detected.",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    # make sets
    tgtSet = set()
    if args.tgts is not None:
        tgtSet = set(args.tgts.split(","))
    oneSidedTgtSet = set()
    if args.oneSidedTgts is not None:
        oneSidedTgtSet = set(args.oneSidedTgts.split(","))
    assert len(tgtSet.intersection(oneSidedTgtSet)) == 0

    # read the track list
    trackList = TrackList(args.tracksXML)
    maskTracks = trackList.getMaskTracks()

    # read the input bed
    inBed = args.inBed
    if args.cut is True:
        inBed = cutOutMaskIntervals(inBed, -1, args.maxLen + 1, args.tracksXML)
    inputIntervals = readBedIntervals(inBed, ncol=4, sort=True)
    if args.cut is True:
        runShellCommand("rm -f %s" % inBed)
    if len(maskTracks) == 0 or len(inputIntervals) == 0:
        runShellCommand("cp %s %s" % (args.inBed, args.outBed))
        logger.warning("No mask tracks located in %s or"
                       " %s empty" % (args.tracksXML, args.inBed))
        return 0

    # make a temporary, combined, merged masking bed file
    tempMaskBed = getLocalTempPath("Temp_mb", ".bed")
    for maskTrack in maskTracks:
        assert os.path.isfile(maskTrack.getPath())
        runShellCommand("cat %s | setBedCol.py 3 mask | awk \'{print $1\"\t\"$2\"\t\"$3}\' >> %s" % (
            maskTrack.getPath(), tempMaskBed))
    maskedIntervals = getMergedBedIntervals(tempMaskBed, sort=True)
    resolvedMasks = 0

    if len(inputIntervals) == 0:
        logger.warning("No mask tracks located in %s" % args.tracksXML)
        return
    inputIdx = 0
    rightFlank = inputIntervals[inputIdx]

    tempOutMask = getLocalTempPath("Temp_om", ".bed")
    tempOutMaskFile = open(tempOutMask, "w")

    for maskIdx, maskInterval in enumerate(maskedIntervals):
        if maskInterval[2] - maskInterval[1] > args.maxLen:
            continue
        # find candidate right flank
        while rightFlank < maskInterval:
            if inputIdx == len(inputIntervals) - 1:
                rightFlank = None
                break
            else:
                inputIdx += 1
                rightFlank = inputIntervals[inputIdx]

        # candidate left flank
        leftFlank = None
        if inputIdx > 0:
            leftFlank = inputIntervals[inputIdx - 1]

        # identify flanking states if the intervals perfectly abut
        leftState = None
        if leftFlank is not None:
            if leftFlank[0] == maskInterval[0] and leftFlank[2] == maskInterval[1]:
                leftState = str(leftFlank[3])
            else:
                assert intersectSize(leftFlank, maskInterval) == 0
        rightState = None
        if rightFlank is not None:
            if rightFlank[0] == maskInterval[0] and rightFlank[1] == maskInterval[2]:
                rightState = str(rightFlank[3])
            else:
                assert intersectSize(rightFlank, maskInterval) == 0

        # choose a state for the mask interval
        maskState = str(args.default)
        if args.onlyDefault is True:
            pass
        elif leftState is not None and leftState == rightState:
            if len(tgtSet) == 0 or leftState in tgtSet:
                maskState = leftState
        elif leftState in oneSidedTgtSet:
            maskState = leftState
        elif rightState in oneSidedTgtSet:
            maskState = rightState

        # write our mask interval
        tempOutMaskFile.write("%s\t%d\t%d\t%s\n" % (maskInterval[0],
                                                    maskInterval[1],
                                                    maskInterval[2],
                                                    maskState))

    tempOutMaskFile.close()
    tempMergePath1 = getLocalTempPath("Temp_mp", ".bed")
    tempMergePath2 = getLocalTempPath("Temp_mp", ".bed")
    runShellCommand("cp %s %s ; cat %s >> %s" % (args.inBed, tempMergePath1,
                                                 tempOutMask, tempMergePath1))
    runShellCommand("cat %s | sortBed > %s" % (tempMergePath1, tempMergePath2))
    tempScopePath = getLocalTempPath("temp_all", ".bed")
    runShellCommand("mergeBed -i %s | sortBed > %s" % (args.allBed,
                                                       tempScopePath))
    runShellCommand("intersectBed -a %s -b %s > %s" % (tempMergePath2,
                                                       tempScopePath,
                                                       args.outBed))

    runShellCommand("rm -f %s" % " ".join([tempMaskBed, tempOutMask,
                                           tempMergePath1, tempMergePath2,
                                           tempScopePath]))
    cleanBedTool(tempBedToolPath)
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Make some tables of statistics from a BED file. All"
        " output will be written in one big CSV table to be viewed in a "
        "spreadsheet.")
    parser.add_argument("inBed", help="Input bed file")
    parser.add_argument("outCsv", help="Path to write output in CSV format")
    parser.add_argument("--ignore", help="Comma-separated list of names"
                        " to ignore", default="")
    parser.add_argument("--numBins", help="Number of (linear) bins for "
                        "histograms", type=int, default=10)
    parser.add_argument("--logHist", help="Apply log-transform to data for "
                        "histogram", action="store_true", default=False)
    parser.add_argument("--histRange", help="Histogram range as comma-"
                        "separated pair of numbers", default=None)
    parser.add_argument("--noHist", help="Skip histograms",
                        action="store_true", default=False)
    parser.add_argument("--noScore", help="Just do length stats",
                        action="store_true", default=False)
    parser.add_argument("--noLen", help="Just do score stats",
                        action="store_true", default=False)
    parser.add_argument("--nearness", help="Compute nearness stats (instead "
                        "of normal stats) of input bed with given BED. Output"
                        " will be a BED instead of CSV, with nearness in the "
                        "score position", default=None)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    if args.histRange is not None:
        args.histRange = args.histRange.split(",")
        assert len(args.histRange) == 2
        args.histRange = int(args.histRange[0]), int(args.histRange[1])

    outFile = open(args.outCsv, "w")
    args.ignoreSet = set(args.ignore.split(","))

    intervals = readBedIntervals(args.inBed, ncol=5,
                                 sort=args.nearness is not None)
    csvStats = ""

    # nearness stats
    if args.nearness is not None:
        args.noScore = True
        csvStats = makeNearnessBED(intervals, args)
    # length stats
    elif args.noLen is False:
        csvStats = makeCSV(intervals, args, lambda x: int(x[2]) - int(x[1]),
                           "Length")
    # score stats
    try:
        if args.noScore is False:
            csvStats += "\n" + makeCSV(intervals, args, lambda x: float(x[4]),
                                       "Score")
            csvStats += "\n" + makeCSV(intervals, args,
                                       lambda x: float(x[4]) * (
                                           float(x[2]) - float(x[1])),
                                       "Score*Length")
    except Exception as e:
        logger.warning("Couldn't make score stats because %s" % str(e))

    outFile.write(csvStats)
    outFile.write("\n")
    outFile.close()
    cleanBedTool(tempBedToolPath)
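# Note (illustrative, not in the original): makeCSV builds each section from a
# per-interval extractor over 5-column BED tuples, so further statistics could
# be appended the same way, e.g. a hypothetical midpoint section:
#   csvStats += "\n" + makeCSV(intervals, args,
#                              lambda x: (int(x[1]) + int(x[2])) / 2.,
#                              "Midpoint")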
def runTsd(args, tempTracksInfo):
    """ run addTsdTrack on termini and chaux to generate tsd track """
    if args.noTsd is True:
        return

    origTrackList = TrackList(args.tracksInfo)
    outTrackList = TrackList(tempTracksInfo)

    tempFiles = []
    tsdInputFiles = []
    tsdInputTracks = []

    # preprocess termini
    lastzTracks = [origTrackList.getTrackByName(args.ltr_termini),
                   origTrackList.getTrackByName(args.tir)]
    for terminiTrack in lastzTracks:
        if terminiTrack is not None:
            inFile = terminiTrack.getPath()
            fillFile = getLocalTempPath("Temp_fill", ".bed")
            tempBed = None
            if inFile[-3:] == ".bb":
                tempBed = getLocalTempPath("Temp_termini", ".bed")
                runShellCommand("bigBedToBed %s %s" % (inFile, tempBed))
                inFile = tempBed
            runShellCommand("fillTermini.py %s %s" % (inFile, fillFile))
            tsdInputFiles.append(fillFile)
            tsdInputTracks.append(terminiTrack.getName())
            tempFiles.append(fillFile)
            if tempBed is not None:
                runShellCommand("rm -f %s" % tempBed)
        else:
            logger.warning("Could not find termini track")

    # add repeat_modeler
    repeat_modelerTrack = outTrackList.getTrackByName(args.repeat_modeler)
    if repeat_modelerTrack is not None:
        tsdInputFiles.append(repeat_modelerTrack.getPath())
        tsdInputTracks.append(repeat_modelerTrack.getName())

    # run addTsdTrack (appending except first time)
    # note we override input track paths in each case
    assert len(tsdInputFiles) == len(tsdInputTracks)
    for i in xrange(len(tsdInputFiles)):
        optString = ""
        if i > 0:
            optString += " --append"
        # really rough hardcoded params based on
        # (A unified classification system for eukaryotic transposable
        # elements, Wicker et al. 2007)
        if tsdInputTracks[i] == args.repeat_modeler:
            optString += " --names LINE,SINE,Unknown"
            optString += " --maxScore 20"
            optString += " --left 20"
            optString += " --right 20"
            optString += " --min 5"
            optString += " --max 20"
            optString += " --overlap 20"
        elif tsdInputTracks[i] == args.ltr_termini:
            optString += " --maxScore 3"
            optString += " --left 8"
            optString += " --right 8"
            optString += " --min 3"
            optString += " --max 6"
        elif tsdInputTracks[i] == args.tir:
            optString += " --maxScore 3"
            optString += " --left 15"
            optString += " --right 15"
            optString += " --min 3"
            optString += " --max 12"

        tempXMLOut = getLocalTempPath("Temp_tsd_xml", ".xml")
        runShellCommand("addTsdTrack.py %s %s %s %s %s %s --inPath %s %s %s --numProc %d" % (
            tempTracksInfo, args.cleanTrackPath, tempXMLOut, tsdInputTracks[i],
            args.sequence, args.tsd, tsdInputFiles[i], optString,
            args.logOpString, args.numProc))
        runShellCommand("mv %s %s" % (tempXMLOut, tempTracksInfo))

    for i in xrange(len(tempFiles)):
        runShellCommand("rm %s" % tempFiles[i])
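# Summary of the hardcoded addTsdTrack parameter sets used above (values taken
# directly from the code; loosely based on Wicker et al. 2007):
#   track            maxScore  left  right  min  max  overlap
#   repeat_modeler         20    20     20    5   20       20
#   ltr_termini             3     8      8    3    6        -
#   tir                     3    15     15    3   12        -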
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Automatically set the scale attributes of numeric tracks"
        " within a given tracks.xml file using some simple heuristics.")
    parser.add_argument("tracksInfo", help="Path of Tracks Info file "
                        "containing paths to genome annotation tracks")
    parser.add_argument("allBed", help="Bed file spanning entire genome")
    parser.add_argument("outputTracks", help="Path to write modified tracks XML"
                        " to.")
    parser.add_argument("--numBins", help="Maximum number of bins after scaling",
                        default=10, type=int)
    parser.add_argument("--tracks", help="Comma-separated list of tracks "
                        "to process. If not set, all"
                        " tracks listed as having a multinomial distribution"
                        " (since this is the default value, this includes "
                        "tracks with no distribution attribute) or gaussian"
                        " distribution will be processed.", default=None)
    parser.add_argument("--skip", help="Comma-separated list of tracks to "
                        "skip.", default=None)
    parser.add_argument("--noLog", help="Never use log scaling",
                        action="store_true", default=False)

    addLoggingOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)
    tempBedToolPath = initBedTool()

    trackNames = []
    if args.tracks is not None:
        trackNames = args.tracks.split(",")
    skipNames = []
    if args.skip is not None:
        skipNames = args.skip.split(",")

    trackList = TrackList(args.tracksInfo)
    outTrackList = copy.deepcopy(trackList)

    allIntervals = getMergedBedIntervals(args.allBed)

    for track in trackList:
        trackExt = os.path.splitext(track.getPath())[1]
        isFasta = len(trackExt) >= 3 and trackExt[:3].lower() == ".fa"
        if track.getName() not in skipNames and\
           (track.getName() in trackNames or len(trackNames) == 0) and\
           (track.getDist() == "multinomial" or
            track.getDist() == "sparse_multinomial" or
            track.getDist() == "gaussian") and\
           not isFasta:
            try:
                setTrackScale(track, args.numBins, allIntervals, args.noLog)
            except ValueError as e:
                logger.warning("Skipping (non-numeric?) track %s due to: %s" % (
                    track.getName(), str(e)))

    trackList.saveXML(args.outputTracks)
    cleanBedTool(tempBedToolPath)
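# Hypothetical invocation of the scaling script above (the script name and
# file paths here are illustrative assumptions, not taken from the original):
#   setTrackScaling.py tracks.xml genome.bed tracks_scaled.xml --numBins 15 --skip sequence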