Exemplo n.º 1
0
def getHalTreeMutations(halPath, args, rootName=None):
    """Process every branch of the HAL tree below a given genome.

    The walk starts at ``rootName``; when that is None the name returned by
    getHalRootName(halPath) is used instead.  Each child is first passed to
    getHalBranchMutations and then its own subtree is visited recursively
    (pre-order traversal).
    """
    start = getHalRootName(halPath) if rootName is None else rootName
    children = getHalChildrenNames(halPath, start)
    for childName in children:
        # handle the branch leading to this child, then descend below it
        getHalBranchMutations(halPath, childName, args)
        getHalTreeMutations(halPath, args, rootName=childName)
Exemplo n.º 2
0
def getHalTreeMutations(halPath, args, rootName=None):
    """Recursively run getHalBranchMutations on all descendants of a genome.

    ``rootName`` selects the genome whose subtree is traversed; if it is
    None, getHalRootName(halPath) supplies the starting genome.  Children
    are handled in the order getHalChildrenNames yields them, each one
    before its own descendants.
    """
    if rootName is not None:
        subtreeRoot = rootName
    else:
        subtreeRoot = getHalRootName(halPath)

    for name in getHalChildrenNames(halPath, subtreeRoot):
        getHalBranchMutations(halPath, name, args)
        # recurse so that the subtree under this child is covered as well
        getHalTreeMutations(halPath, args, name)
Exemplo n.º 3
0
def main(argv=None):
    """Command-line entry point: validate the options and run
    computeTreePhyloP.

    Args:
        argv: full argument vector, program name first (same layout as
            sys.argv).  Defaults to sys.argv when None.

    Raises:
        RuntimeError: if the input hal or mod file does not exist, the
            output directory cannot be found after attempting to create it,
            or the requested root / subtree genome is not among the genomes
            returned by getHalGenomes.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Compute PhyloP scores (in wig format) for each genome in"
        " an alignment.  Scores are computed once per column with "
        "halPhyloPMP.py and iteratively lifted down the tree using "
        "halWiggleLiftover (starting at the root).")

    parser.add_argument("hal", help="input hal")
    parser.add_argument("mod", help="model file for PhyloP.  Can be created "
                        "with halPhyloPTrain.py")
    parser.add_argument("outWigDir", help="directory where output wig files"
                        " will be written")
    parser.add_argument("--root", help="Name of root.  If not specified the"
                        " HAL root will be used", default=None)
    parser.add_argument("--numProc",
                        help="Maximum number of processes.",
                        type=int, default=1)
    parser.add_argument("--bigWig",
                        help="Run wigToBigWig on each generated wiggle",
                        action="store_true", default=False)
    parser.add_argument("--subtree",
                        help="Run clade-specific acceleration/conservation on subtree below this node",
                        default=None)
    parser.add_argument("--prec",
                        help="Number of decimal places in wig output", type=int,
                        default=None)

    # need phyloP options here:

    # Parse the vector we were given (minus the program name) instead of
    # silently falling back to sys.argv, so main(argv) is actually honoured.
    args = parser.parse_args(argv[1:])

    if not os.path.isfile(args.hal):
        raise RuntimeError("Input hal file %s not found" % args.hal)
    if not os.path.isfile(args.mod):
        raise RuntimeError("Input mod file %s not found" % args.mod)
    if not os.path.isdir(args.outWigDir):
        os.makedirs(args.outWigDir)
    if not os.path.isdir(args.outWigDir):
        raise RuntimeError("%s not found" % args.outWigDir)

    args.halGenomes = getHalGenomes(args.hal)
    if args.root is None:
        args.root = getHalRootName(args.hal)

    if args.root not in args.halGenomes:
        raise RuntimeError("Root genome %s not found." % args.root)

    # Validate the subtree genome itself (the previous check re-tested
    # args.root and therefore could never trigger).
    if args.subtree is not None and args.subtree not in args.halGenomes:
        raise RuntimeError("Subtree root %s not found." % args.subtree)

    # make a little id tag for temporary maf slices
    S = string.ascii_uppercase + string.digits
    args.tempID = 'halTreePhyloP' + ''.join(random.choice(S) for _ in range(5))

    computeTreePhyloP(args)
Exemplo n.º 4
0
def runParallelSlices(options):
    """Partition the reference into slices, run one command per slice in
    parallel, then hand the results to concatenateSlices.

    The reference genome is options.refGenome, or the name returned by
    getHalRootName(options.halFile) when that is None.  Two partitioning
    modes exist:

    * sequence coordinates -- used when options.splitBySequence is True or
      options.refSequence is set; every matching sequence reported by
      getHalSequenceStats is sliced separately;
    * genome coordinates -- otherwise; the whole genome length is sliced
      and, if no slice size was given and more than one process is
      available, the slice size is derived from options.numProc.

    Side effects: sets options.smallFile and options.firstSmallFile on the
    passed-in options object.
    """
    refGenome = options.refGenome
    if refGenome is None:
        refGenome = getHalRootName(options.halFile)
    refSequenceStats = getHalSequenceStats(options.halFile, refGenome)
    options.smallFile = False
    options.firstSmallFile = True
    sliceCmds = []
    sliceOpts = []
    # we are going to deal with sequence coordinates
    if options.splitBySequence is True or options.refSequence is not None:
        for sequence, seqLen, nt, nb in refSequenceStats:
            if options.refSequence is None or sequence == options.refSequence:
                seqOpts = copy.deepcopy(options)
                if seqLen < options.smallSize:
                    seqOpts.smallFile = True
                seqOpts.refGenome = refGenome
                seqOpts.refSequence = sequence
                for sStart, sLen, sIdx in computeSlices(seqOpts, seqLen):
                    seqOpts.start = sStart
                    seqOpts.length = sLen
                    seqOpts.sliceNumber = sIdx
                    sliceCmds.append(getHal2MafCmd(seqOpts))
                    # snapshot: seqOpts keeps being mutated by later slices
                    sliceOpts.append(copy.deepcopy(seqOpts))
                # after a non-empty small sequence, later copies of options
                # no longer carry firstSmallFile=True
                if seqOpts.smallFile is True and seqLen > 0:
                    options.firstSmallFile = False
    # we are slicing the genome coordinates directly
    else:
        seqOpts = copy.deepcopy(options)
        assert seqOpts.splitBySequence is False
        genomeLen = getHalGenomeLength(seqOpts.halFile, refGenome)
        # auto compute slice size from numprocs
        if seqOpts.sliceSize is None and seqOpts.numProc > 1:
            refLen = genomeLen
            if seqOpts.length is not None and seqOpts.length > 0:
                refLen = seqOpts.length
            # NOTE(review): the ceil only rounds up if `/` is true division
            # (Python 3 or `from __future__ import division`) -- confirm.
            seqOpts.sliceSize = int(math.ceil(refLen / seqOpts.numProc))

        for sStart, sLen, sIdx in computeSlices(seqOpts, genomeLen):
            seqOpts.start = sStart
            seqOpts.length = sLen
            seqOpts.sliceNumber = sIdx
            sliceCmds.append(getHal2MafCmd(seqOpts))
            sliceOpts.append(copy.deepcopy(seqOpts))

    # run in parallel
    runParallelShellCommands(sliceCmds, options.numProc)

    # concatenate into output if desired
    concatenateSlices(sliceOpts, sliceCmds)
Exemplo n.º 5
0
def runParallelSlices(options):
    """Build one getHal2MafCmd command per slice of the reference, execute
    them with runParallelShellCommands, and pass them to concatenateSlices.

    Reference genome: options.refGenome, falling back to
    getHalRootName(options.halFile) when it is None.

    Slicing mode:
        * If options.splitBySequence is True or options.refSequence is set,
          each sequence from getHalSequenceStats (restricted to
          options.refSequence when given) is sliced on its own, and
          sequences shorter than options.smallSize are flagged with
          smallFile=True.
        * Otherwise the genome as a whole is sliced; when no sliceSize is
          set and numProc > 1, sliceSize is computed from the length to
          cover and numProc.

    Side effects: options.smallFile and options.firstSmallFile are
    (re)assigned on the caller's options object.
    """
    refGenome = options.refGenome
    if refGenome is None:
        refGenome = getHalRootName(options.halFile)
    refSequenceStats = getHalSequenceStats(options.halFile, refGenome)
    options.smallFile = False
    options.firstSmallFile = True
    sliceCmds = []
    sliceOpts = []
    # we are going to deal with sequence coordinates
    if options.splitBySequence is True or options.refSequence is not None:
        for sequence, seqLen, nt, nb in refSequenceStats:
            if options.refSequence is None or sequence == options.refSequence:
                seqOpts = copy.deepcopy(options)
                if seqLen < options.smallSize:
                    seqOpts.smallFile = True
                seqOpts.refGenome = refGenome
                seqOpts.refSequence = sequence
                for sStart, sLen, sIdx in computeSlices(seqOpts, seqLen):
                    seqOpts.start = sStart
                    seqOpts.length = sLen
                    seqOpts.sliceNumber = sIdx
                    sliceCmds.append(getHal2MafCmd(seqOpts))
                    # store a copy, since seqOpts is reused for the next slice
                    sliceOpts.append(copy.deepcopy(seqOpts))
                # clear the flag on the shared options so that sequences
                # processed afterwards are copied with firstSmallFile=False
                if seqOpts.smallFile is True and seqLen > 0:
                    options.firstSmallFile = False
    # we are slicing the genome coordinates directly
    else:
        seqOpts = copy.deepcopy(options)
        assert seqOpts.splitBySequence is False
        genomeLen = getHalGenomeLength(seqOpts.halFile, refGenome)
        # auto compute slice size from numprocs
        if seqOpts.sliceSize is None and seqOpts.numProc > 1:
            refLen = genomeLen
            if seqOpts.length is not None and seqOpts.length > 0:
                refLen = seqOpts.length
            # NOTE(review): rounding up relies on `/` being true division
            # (Python 3 semantics) -- confirm for the target interpreter.
            seqOpts.sliceSize = int(math.ceil(refLen / seqOpts.numProc))

        for sStart, sLen, sIdx in computeSlices(seqOpts, genomeLen):
            seqOpts.start = sStart
            seqOpts.length = sLen
            seqOpts.sliceNumber = sIdx
            sliceCmds.append(getHal2MafCmd(seqOpts))
            sliceOpts.append(copy.deepcopy(seqOpts))

    # run in parallel
    runParallelShellCommands(sliceCmds, options.numProc)

    # concatenate into output if desired
    concatenateSlices(sliceOpts, sliceCmds)
Exemplo n.º 6
0
def runParallelSlices(options):
    """Slice the reference, run one getHalPhyloPCmd command per slice in
    parallel, concatenate the results and write the chrom sizes.

    The reference genome is options.refGenome, or the name returned by
    getHalRootName(options.halFile) when that is None.  The length to slice
    is the length of options.refSequence when one is given, otherwise the
    value returned by getHalGenomeLength.

    Side effects: sets options.smallFile and options.firstSmallFile.

    Raises:
        RuntimeError: if options.refSequence is set but does not match
            exactly one row of the sequence statistics.
    """
    refGenome = options.refGenome
    if refGenome is None:
        refGenome = getHalRootName(options.halFile)
    refSequenceStats = getHalSequenceStats(options.halFile, refGenome)
    options.smallFile = False
    options.firstSmallFile = True
    sliceCmds = []
    sliceOpts = []
    if options.refSequence is not None:
        # Each stats row is read as (name, length, ...): index 1 is used as
        # the length below, so the name to match on is index 0.
        refStat = [x for x in refSequenceStats if x[0] ==
                   options.refSequence]
        if len(refStat) != 1:
            # report the genome actually queried (options.refGenome may be
            # None when the root was used by default)
            raise RuntimeError("Sequence %s not found in genome %s" % (
                options.refSequence, refGenome))
        totalLength = int(refStat[0][1])
    else:
        totalLength = getHalGenomeLength(options.halFile, refGenome)

    seqOpts = copy.deepcopy(options)

    # auto compute slice size from numprocs
    if seqOpts.sliceSize is None and seqOpts.numProc > 1:
        refLen = totalLength
        if seqOpts.length is not None and seqOpts.length > 0:
            refLen = seqOpts.length
        seqOpts.sliceSize = int(math.ceil(refLen / seqOpts.numProc))

    for sStart, sLen, sIdx in computeSlices(seqOpts, totalLength):
        seqOpts.start = sStart
        seqOpts.length = sLen
        seqOpts.sliceNumber = sIdx
        sliceCmds.append(getHalPhyloPCmd(seqOpts))
        # snapshot: seqOpts is mutated again for the next slice
        sliceOpts.append(copy.deepcopy(seqOpts))

    # run in parallel
    runParallelShellCommands(sliceCmds, options.numProc)

    # concatenate into output if desired
    concatenateSlices(sliceOpts, sliceCmds)

    writeChromSizes(options)
Exemplo n.º 7
0
def runParallelSlices(options):
    """Run getHalPhyloPCmd over the reference in parallel slices.

    Steps: resolve the reference genome (options.refGenome, or
    getHalRootName(options.halFile) if None), determine the total length to
    cover (the selected options.refSequence, or the whole genome), build one
    command per slice from computeSlices, run them through
    runParallelShellCommands, then call concatenateSlices and
    writeChromSizes.

    Side effects: sets options.smallFile and options.firstSmallFile.

    Raises:
        RuntimeError: if options.refSequence is given and is not matched by
            exactly one entry of getHalSequenceStats.
    """
    refGenome = options.refGenome
    if refGenome is None:
        refGenome = getHalRootName(options.halFile)
    refSequenceStats = getHalSequenceStats(options.halFile, refGenome)
    options.smallFile = False
    options.firstSmallFile = True
    sliceCmds = []
    sliceOpts = []
    if options.refSequence is not None:
        # Stats rows are treated as (name, length, ...): the length is taken
        # from index 1 below, so the sequence name is matched at index 0.
        refStat = [x for x in refSequenceStats if x[0] == options.refSequence]
        if len(refStat) != 1:
            # use the resolved genome name; options.refGenome can be None
            raise RuntimeError("Sequence %s not found in genome %s" %
                               (options.refSequence, refGenome))
        totalLength = int(refStat[0][1])
    else:
        totalLength = getHalGenomeLength(options.halFile, refGenome)

    seqOpts = copy.deepcopy(options)

    # auto compute slice size from numprocs
    if seqOpts.sliceSize is None and seqOpts.numProc > 1:
        refLen = totalLength
        if seqOpts.length is not None and seqOpts.length > 0:
            refLen = seqOpts.length
        seqOpts.sliceSize = int(math.ceil(refLen / seqOpts.numProc))

    for sStart, sLen, sIdx in computeSlices(seqOpts, totalLength):
        seqOpts.start = sStart
        seqOpts.length = sLen
        seqOpts.sliceNumber = sIdx
        sliceCmds.append(getHalPhyloPCmd(seqOpts))
        # keep a copy because seqOpts is overwritten on the next iteration
        sliceOpts.append(copy.deepcopy(seqOpts))

    # run in parallel
    runParallelShellCommands(sliceCmds, options.numProc)

    # concatenate into output if desired
    concatenateSlices(sliceOpts, sliceCmds)

    writeChromSizes(options)
Exemplo n.º 8
0
def runParallelSlices(options):
    """Partition the job, run the slice commands in parallel, clean up and
    concatenate.

    The partitioning helper is chosen from the options: partitionRefTargets
    when options.refTargets is set, partitionBySeqCoords when splitting by
    sequence or when a reference sequence is given, and
    partitionByGenomeCoords otherwise.  Each helper returns the pair
    (sliceCmds, sliceOpts).

    Side effects: sets options.smallFile and options.firstSmallFile, and
    deletes any existing file named by a slice's refTargets attribute.
    """
    if options.refGenome is None:
        refGenome = getHalRootName(options.halFile)
    else:
        refGenome = options.refGenome
    options.smallFile = False
    options.firstSmallFile = True

    if options.refTargets:
        partitioned = partitionRefTargets(options)
    elif options.splitBySequence is not True and options.refSequence is None:
        partitioned = partitionByGenomeCoords(options, refGenome)
    else:
        partitioned = partitionBySeqCoords(options, refGenome)
    sliceCmds, sliceOpts = partitioned

    # run in parallel
    runParallelShellCommands(sliceCmds, options.numProc)

    # clean up temporary bed files (if present)
    for sliceOpt in sliceOpts:
        bedPath = sliceOpt.refTargets
        if not bedPath or not os.path.isfile(bedPath):
            continue
        os.remove(bedPath)

    # concatenate into output if desired
    concatenateSlices(sliceOpts, sliceCmds)