def main(argv=None):
    """Parse the command line, validate the inputs and run computeTreePhyloP.

    The output wig directory is created when it does not exist yet. When
    --root is omitted, the root name reported by getHalRootName is used.

    Note: ``argv`` is accepted for symmetry with other entry points but the
    parser is invoked without arguments, so options are always read from the
    process command line.

    Raises:
        RuntimeError: if the hal or mod file does not exist, the output
            directory is still missing after the creation attempt, or the
            root / subtree genome is not among the genomes of the HAL file.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Compute PhyloP scores (in wig format) for each genome in"
        " an alignment. Scores are computed once per column with "
        "halPhyloPMP.py and iteratively lifted down the tree using "
        "halWiggleLiftover (starting at the root).")

    parser.add_argument("hal", help="input hal")
    parser.add_argument("mod", help="model file for PhyloP. Can be created "
                        "with halPhyloPTrain.py")
    parser.add_argument("outWigDir", help="directory where output wig files"
                        " will be written")
    parser.add_argument("--root", help="Name of root. If not specified the"
                        " HAL root will be used", default=None)
    parser.add_argument("--numProc",
                        help="Maximum number of processes.",
                        type=int, default=1)
    parser.add_argument("--bigWig",
                        help="Run wigToBigWig on each generated wiggle",
                        action="store_true", default=False)
    parser.add_argument("--subtree",
                        help="Run clade-specific acceleration/conservation on subtree below this node",
                        default=None)
    parser.add_argument("--prec",
                        help="Number of decimal places in wig output",
                        type=int, default=None)

    # need phyloP options here:

    args = parser.parse_args()

    if not os.path.isfile(args.hal):
        raise RuntimeError("Input hal file %s not found" % args.hal)
    if not os.path.isfile(args.mod):
        raise RuntimeError("Input mod file %s not found" % args.mod)

    # Create the output directory on demand, then verify it really exists.
    if not os.path.isdir(args.outWigDir):
        os.makedirs(args.outWigDir)
    if not os.path.isdir(args.outWigDir):
        raise RuntimeError("%s not found" % args.outWigDir)

    args.halGenomes = getHalGenomes(args.hal)
    if args.root is None:
        args.root = getHalRootName(args.hal)
    if args.root not in args.halGenomes:
        raise RuntimeError("Root genome %s not found." % args.root)
    # The subtree name itself must be validated here; the previous code
    # re-tested args.root, so an unknown --subtree was never reported.
    if args.subtree is not None and args.subtree not in args.halGenomes:
        raise RuntimeError("Subtree root %s not found." % args.subtree)

    # make a little id tag for temporary maf slices
    S = string.ascii_uppercase + string.digits
    args.tempID = 'halTreePhyloP' + ''.join(random.choice(S) for x in
                                            range(5))

    computeTreePhyloP(args)
def getScanTime(inHalPath, outDir, step):
    """Time a single halBranchMutations run and return it in a list.

    For step 0 the input HAL file itself is scanned; for any later step the
    path produced by makePath(inHalPath, outDir, step, "lod", "hal") is
    scanned instead. The genome passed to the tool is the second entry
    returned by getHalGenomes(inHalPath).

    Returns:
        A one-element list holding the elapsed wall-clock seconds.
    """
    scanPath = (makePath(inHalPath, outDir, step, "lod", "hal")
                if step > 0 else inHalPath)

    halGenomes = getHalGenomes(inHalPath)
    assert len(halGenomes) > 1
    targetGenome = halGenomes[1]
    refBedPath = makePath(inHalPath, outDir, step, targetGenome, "bed")

    startTime = time.time()
    runShellCommand("halBranchMutations %s %s --refFile %s" %
                    (scanPath, targetGenome, refBedPath))
    return [time.time() - startTime]
def getScanTime(inHalPath, outDir, step):
    """Measure how long halBranchMutations takes for one step.

    The scanned file is inHalPath when step is 0, otherwise the "lod" HAL
    path built by makePath for that step. The second genome listed by
    getHalGenomes(inHalPath) is used as the genome argument, and the
    --refFile path is built by makePath from that genome name.

    Returns:
        [seconds]: a list with the elapsed time as its only element.
    """
    if step > 0:
        halToScan = makePath(inHalPath, outDir, step, "lod", "hal")
    else:
        halToScan = inHalPath

    genomeNames = getHalGenomes(inHalPath)
    assert len(genomeNames) > 1
    scannedGenome = genomeNames[1]
    outBed = makePath(inHalPath, outDir, step, scannedGenome, "bed")

    began = time.time()
    runShellCommand("halBranchMutations %s %s --refFile %s" % (
        halToScan, scannedGenome, outBed))
    seconds = time.time() - began
    return [seconds]
def main(argv=None):
    """Parse and validate the command line, then call computeModel.

    Besides the parsed options, the following attributes are set on ``args``
    before computeModel is called: bedFiles, halGenomes (comma-joined
    string), tree, outDir, outName, outMafName, outMafPath, outMafAllPaths
    and outMafSS.

    Note: ``argv`` is accepted but the parser is invoked without arguments,
    so options are always read from the process command line.

    Raises:
        RuntimeError: if the hal file or BED path is missing, the
            substitution model is unknown, the output does not end in
            ".mod", or the reference / target genomes are not found.
        IOError/OSError: if the output model path cannot be opened for
            writing.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Compute a neutral substitution model for use with "
        "phyloP or halPhlyoP")

    parser.add_argument("hal", help="input hal")
    parser.add_argument("refGenome", help="Name of reference genome")
    parser.add_argument("bedDir", help="BED file or directory containing BED "
                        "files. By "
                        "default, these files are interpreted to contain only"
                        " coordinates of coding exons, and fourfold degenerate"
                        " sites will automatically be extracted from them."
                        " To disable this behaviour and train on the entire "
                        " file, use the --no4d option.", default=None)
    parser.add_argument("outMod", help="Path to output model file")
    parser.add_argument("--no4d", help="Do not extract fourfold degenerate"
                        " positions from the input bed files. Rather use "
                        "all bases they contain.",
                        default=False, action="store_true")
    parser.add_argument("--numProc",
                        help="Maximum number of processes for hal2maf.",
                        type=int, default=1)
    parser.add_argument("--noAncestors",
                        help="Don't write ancestral genomes in hal2maf",
                        action="store_true", default=False)
    parser.add_argument("--maxBedLines",
                        help="Split bed files so they have at most this many"
                        " lines",
                        type=int, default=None)
    parser.add_argument("--sliceSize",
                        help="Slice size for hal2maf.",
                        type=int, default=None)
    parser.add_argument("--tree",
                        help="String describing phylogeny in NEWICK format "
                        "that will be used instead of the tree stored in the"
                        " HAL file. This tree should contain all the species"
                        " in the alignment. Note that it is best to enclose"
                        " this string in quotes", default=None)
    parser.add_argument("--targetGenomes", default=None, nargs='+',
                        help="space separated list of targetGenomes to pass to "
                        "hal2maf. If used, the tree given to --tree should match.")
    parser.add_argument("--substMod", help="Substitution model for phyloFit"
                        ": valid options are JC69|F81|HKY85|HKY85+Gap|REV|"
                        "SSREV|UNREST|R2|R2S|U2|U2S|R3|R3S|U3|U3S",
                        default="SSREV")
    parser.add_argument("--noModFreqs", help="By default, equilibrium "
                        "frequencies for the nucleotides of the trained model"
                        " are corrected with the observed frequencies of "
                        "the reference genome (using the PHAST modFreqs"
                        " tool. This flag disables this step, and keeps the"
                        " trained frequencies", action="store_true",
                        default=False)
    parser.add_argument("--error", help="File in which to output confidence"
                        " intervals for the parameters in the model",
                        default=None)

    args = parser.parse_args()

    # validate inputs
    if not os.path.isfile(args.hal):
        raise RuntimeError("Input hal file %s not found" % args.hal)
    if not os.path.exists(args.bedDir):
        raise RuntimeError("%s not found" % args.bedDir)

    # validate substitution model
    if args.substMod not in "JC69|F81|HKY85|HKY85+Gap|REV|SSREV|UNREST|R2|R2S|U2|U2S|R3|R3S|U3|U3S".split("|"):
        raise RuntimeError("Invalid substitution model: %s" % args.substMod)

    # validate BEDs: a directory contributes every regular file inside it
    if os.path.isdir(args.bedDir):
        args.bedFiles = [os.path.join(args.bedDir, f) for f
                         in os.listdir(args.bedDir)
                         if os.path.isfile(os.path.join(args.bedDir, f))]
    else:
        args.bedFiles = [args.bedDir]

    # Check the extension first so that a wrongly named output path is never
    # created or truncated by the writability probe below.
    if os.path.splitext(args.outMod)[1] != ".mod":
        raise RuntimeError("Output model must have .mod extension")
    # Writability probe. open() raises on failure, and the handle is closed
    # right away instead of being left open for the rest of the run.
    with open(args.outMod, "w"):
        pass

    # if targetGenomes is set, use those. Otherwise, extract from HAL
    if args.targetGenomes is not None:
        args.halGenomes = args.targetGenomes
    else:
        args.halGenomes = getHalGenomes(args.hal)

    # if tree is set, use that. Otherwise, extract from HAL
    if args.tree is None:
        args.tree = getHalTree(args.hal)

    # Make sure that all members of halGenomes and tree are in the actual HAL
    # NOTE(review): if getHalTree returns a NEWICK string these are
    # substring tests, not exact genome-name matches -- confirm.
    halTree = getHalTree(args.hal)
    if args.refGenome not in halTree:
        raise RuntimeError("Reference genome %s not found." % args.refGenome)
    for targetGenome in args.halGenomes:
        if targetGenome not in halTree:
            raise RuntimeError("Target genome %s not in HAL." % targetGenome)
        if targetGenome not in args.tree:
            raise RuntimeError("Target genome %s not in --tree." %
                               targetGenome)

    args.halGenomes = ','.join(args.halGenomes)
    args.outDir = os.path.dirname(args.outMod)
    args.outName = os.path.splitext(os.path.basename(args.outMod))[0]
    # Temporary MAF / SS names are derived from the output model name.
    args.outMafName = args.outName + "_halPhyloPTrain_temp.maf"
    args.outMafPath = os.path.join(args.outDir, args.outMafName)
    args.outMafAllPaths = args.outMafPath.replace("_halPhyloPTrain_temp.maf",
                                                  "_halPhyloPTrain_temp*.maf")
    args.outMafSS = args.outMafPath.replace("_halPhyloPTrain_temp.maf",
                                            "_halPhyloPTrain_temp.ss")

    computeModel(args)
def getHalTotalSegments(halPath):
    """Add up the segment counts of every genome in the HAL file.

    getHalNumSegments is queried once per genome returned by getHalGenomes;
    elements 0 and 1 of each result are accumulated separately.

    Returns:
        A 2-tuple (sum of element 0, sum of element 1); (0, 0) when the
        HAL file lists no genomes.
    """
    firstTotal = 0
    secondTotal = 0
    for genomeName in getHalGenomes(halPath):
        counts = getHalNumSegments(halPath, genomeName)
        firstTotal = firstTotal + counts[0]
        secondTotal = secondTotal + counts[1]
    return (firstTotal, secondTotal)
def main(argv=None):
    """Command-line entry point of the multi-process halPhyloP wrapper.

    Parses the options, checks that the HAL and model files exist and that
    the reference genome is listed by getHalGenomes, probes that the output
    paths are writable (each probe file is removed again), attaches a random
    ``tempID`` tag to the options and hands them to runParallelSlices.

    Note: ``argv`` is accepted but the parser is invoked without arguments,
    so options are always read from the process command line.

    Raises:
        RuntimeError: if the hal file, the mod file or the reference genome
            cannot be found.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Multi-Process wrapper for halPhyloP.")

    parser.add_argument("halFile", help="Input HAL file")
    parser.add_argument("refGenome", help="Reference genome to scan")
    parser.add_argument(
        "modFile",
        help="Neutral model for PhyloP. Can be "
        "generated with halPhyloPTrain.py")
    parser.add_argument("wiggleFile", help="Output Wiggle file")
    parser.add_argument(
        "--numProc",
        help="Maximum number of processes to create. If "
        " neither --sliceSize or --splitBySequence are "
        " specified, then the reference genome will be "
        "sliced to require --numProc jobs",
        type=int, default=1)
    parser.add_argument(
        "--sliceSize",
        help="Maximum slice of reference sequence to process "
        "in a single process.",
        type=int, default=None)
    parser.add_argument(
        "--chromSizes",
        help="Path of file to output chromosome sizes to. "
        "Necessary for wigToBigWig",
        default=None)

    ##################################################################
    #HDF5 OPTIONS (as copied from hal/api/hdf5_impl/hdf5CLParser.cpp)
    ##################################################################
    hdf5Grp = parser.add_argument_group('HDF5 HAL Options')
    hdf5Grp.add_argument(
        "--cacheMDC",
        help="number of metadata slots in hdf5 cache",
        type=int, default=None)
    hdf5Grp.add_argument(
        "--cacheRDC",
        help="number of regular slots in hdf5 cache. "
        "should be"
        " a prime number ~= 10 * DefaultCacheRDCBytes / "
        "chunk",
        type=int, default=None)
    hdf5Grp.add_argument(
        "--cacheBytes",
        help="maximum size in bytes of regular hdf5 cache",
        type=int, default=None)
    hdf5Grp.add_argument(
        "--cacheW0",
        help="w0 parameter fro hdf5 cache",
        type=int, default=None)
    hdf5Grp.add_argument(
        "--inMemory",
        help="load all data in memory (& disable hdf5 cache)",
        action="store_true", default=False)

    ##################################################################
    #HALPHYLOP OPTIONS (as copied from hal/maf/impl/hal2maf.cpp)
    ##################################################################
    hppGrp = parser.add_argument_group('halPhyloP Options')
    hppGrp.add_argument(
        "--refSequence",
        help="name of reference sequence within reference "
        "genome (all sequences if empty)",
        default=None)
    hppGrp.add_argument(
        "--start",
        help="coordinate within reference genome (or sequence"
        " if specified) to start at",
        type=int, default=None)
    hppGrp.add_argument(
        "--length",
        help="length of the reference genome (or sequence"
        " if specified) to convert. If set to 0,"
        " the entire thing is converted",
        type=int, default=None)
    hppGrp.add_argument(
        "--targetGenomes",
        help="comma-separated (no spaces) list of target "
        "genomes (others are excluded) (vist all if empty)",
        default=None)
    hppGrp.add_argument(
        "--dupType",
        help="Which duplications to mask according to dupMask "
        "option. Choices are: "
        "\"all\": Any duplicated region; or "
        "\"ambiguous\": Regions within duplications where "
        "alignments from the same species do not contain"
        " the same base.",
        default=None)
    hppGrp.add_argument(
        "--dupMask",
        help="What to do with duplicated regions. Choices are: "
        "\"hard\": mask entire alignment column if any "
        "duplications occur; or "
        "\"soft\": mask species where duplications occur.",
        default=None)
    hppGrp.add_argument(
        "--step",
        help="step size",
        type=int, default=None)
    hppGrp.add_argument(
        "--refBed",
        help="Bed file with coordinates to annotate in the "
        "reference genome to stream from standard "
        " input.",
        default=None)
    hppGrp.add_argument(
        "--subtree",
        help="Subtree root for lineage-specific acceleration/conservation",
        default=None)
    hppGrp.add_argument(
        "--prec",
        help="Number of decimal places in wig output",
        type=int, default=None)

    args = parser.parse_args()

    if not os.path.isfile(args.halFile):
        raise RuntimeError("Input hal file %s not found" % args.halFile)
    if not os.path.isfile(args.modFile):
        raise RuntimeError("Input mod file %s not found" % args.modFile)

    args.halGenomes = getHalGenomes(args.halFile)
    if args.refGenome not in args.halGenomes:
        raise RuntimeError("Reference genome %s not found." % args.refGenome)

    # Probe every output path up front: write a newline, close, delete.
    # The chromosome-sizes path is only probed when it was requested.
    probePaths = [args.wiggleFile]
    if args.chromSizes is not None:
        probePaths.append(args.chromSizes)
    for probePath in probePaths:
        probe = open(probePath, "w")
        probe.write("\n")
        probe.close()
        os.remove(probePath)

    # make a little id tag for temporary slices
    alphabet = string.ascii_uppercase + string.digits
    tagChars = [random.choice(alphabet) for _ in range(5)]
    args.tempID = 'halPhyloPTemp' + ''.join(tagChars)

    runParallelSlices(args)
def main(argv=None):
    """Parse and validate the command line, then call computeModel.

    Besides the parsed options, the following attributes are set on ``args``
    before computeModel is called: bedFiles, halGenomes (comma-joined
    string), tree, outDir, outName, outMafName, outMafPath, outMafAllPaths
    and outMafSS. Temporary file names carry a random 7-letter suffix so
    that two runs do not collide.

    Note: ``argv`` is accepted but the parser is invoked without arguments,
    so options are always read from the process command line.

    Raises:
        RuntimeError: if the hal file or BED path is missing, the
            substitution model is unknown, the output does not end in
            ".mod", or the reference / target genomes are not found.
        IOError/OSError: if the output model path cannot be opened for
            writing.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Compute a neutral substitution model for use with "
        "phyloP or halPhlyoP")

    parser.add_argument("hal", help="input hal")
    parser.add_argument("refGenome", help="Name of reference genome")
    parser.add_argument("bedDir", help="BED file or directory containing BED "
                        "files. By "
                        "default, these files are interpreted to contain only"
                        " coordinates of coding exons, and fourfold degenerate"
                        " sites will automatically be extracted from them."
                        " To disable this behaviour and train on the entire "
                        " file, use the --no4d option.", default=None)
    parser.add_argument("outMod", help="Path to output model file")
    parser.add_argument("--no4d", help="Do not extract fourfold degenerate"
                        " positions from the input bed files. Rather use "
                        "all bases they contain.",
                        default=False, action="store_true")
    parser.add_argument("--numProc",
                        help="Maximum number of processes for hal2maf.",
                        type=int, default=1)
    parser.add_argument("--noAncestors",
                        help="Don't write ancestral genomes in hal2maf",
                        action="store_true", default=False)
    parser.add_argument("--maxBedLines",
                        help="Split bed files so they have at most this many"
                        " lines",
                        type=int, default=None)
    parser.add_argument("--tree",
                        help="String describing phylogeny in NEWICK format "
                        "that will be used instead of the tree stored in the"
                        " HAL file. This tree should contain all the species"
                        " in the alignment. Note that it is best to enclose"
                        " this string in quotes", default=None)
    parser.add_argument(
        "--targetGenomes", default=None, nargs='+',
        help="space separated list of targetGenomes to pass to "
        "hal2maf. If used, the tree given to --tree should match.")
    parser.add_argument("--substMod", help="Substitution model for phyloFit"
                        ": valid options are JC69|F81|HKY85|HKY85+Gap|REV|"
                        "SSREV|UNREST|R2|R2S|U2|U2S|R3|R3S|U3|U3S",
                        default="SSREV")
    parser.add_argument("--noModFreqs", help="By default, equilibrium "
                        "frequencies for the nucleotides of the trained model"
                        " are corrected with the observed frequencies of "
                        "the reference genome (using the PHAST modFreqs"
                        " tool. This flag disables this step, and keeps the"
                        " trained frequencies", action="store_true",
                        default=False)
    parser.add_argument("--precision",
                        help="Precision to pass to phyloFit (default MED)",
                        choices=["HIGH", "MED", "LOW"], default="MED")
    parser.add_argument("--error", help="File in which to output confidence"
                        " intervals for the parameters in the model",
                        default=None)

    args = parser.parse_args()

    # validate inputs
    if not os.path.isfile(args.hal):
        raise RuntimeError("Input hal file %s not found" % args.hal)
    if not os.path.exists(args.bedDir):
        raise RuntimeError("%s not found" % args.bedDir)

    # validate substitution model
    if args.substMod not in "JC69|F81|HKY85|HKY85+Gap|REV|SSREV|UNREST|R2|R2S|U2|U2S|R3|R3S|U3|U3S".split("|"):
        raise RuntimeError("Invalid substitution model: %s" % args.substMod)

    # validate BEDs: a directory contributes every regular file inside it
    if os.path.isdir(args.bedDir):
        args.bedFiles = [
            os.path.join(args.bedDir, f) for f in os.listdir(args.bedDir)
            if os.path.isfile(os.path.join(args.bedDir, f))
        ]
    else:
        args.bedFiles = [args.bedDir]

    # Check the extension first so that a wrongly named output path is never
    # created or truncated by the writability probe below.
    if os.path.splitext(args.outMod)[1] != ".mod":
        raise RuntimeError("Output model must have .mod extension")
    # Writability probe. open() raises on failure, and the handle is closed
    # right away instead of being left open for the rest of the run.
    with open(args.outMod, "w"):
        pass

    # if targetGenomes is set, use those. Otherwise, extract from HAL
    if args.targetGenomes is not None:
        args.halGenomes = args.targetGenomes
    else:
        args.halGenomes = getHalGenomes(args.hal)

    # if tree is set, use that. Otherwise, extract from HAL
    if args.tree is None:
        args.tree = getHalTree(args.hal)

    # Make sure that all members of halGenomes and tree are in the actual HAL
    # NOTE(review): if getHalTree returns a NEWICK string these are
    # substring tests, not exact genome-name matches -- confirm.
    halTree = getHalTree(args.hal)
    if args.refGenome not in halTree:
        raise RuntimeError("Reference genome %s not found." % args.refGenome)
    for targetGenome in args.halGenomes:
        if targetGenome not in halTree:
            raise RuntimeError("Target genome %s not in HAL." % targetGenome)
        if targetGenome not in args.tree:
            raise RuntimeError("Target genome %s not in --tree." %
                               targetGenome)

    args.halGenomes = ','.join(args.halGenomes)
    args.outDir = os.path.dirname(args.outMod)
    args.outName = os.path.splitext(os.path.basename(args.outMod))[0]
    # Random suffix so two runs don't collide. range() is used instead of
    # xrange() so the script also runs on Python 3.
    suffix = "".join(
        [random.choice(string.ascii_uppercase) for _ in range(7)])
    args.outMafName = args.outName + "_halPhyloPTrain_temp_%s.maf" % suffix
    args.outMafPath = os.path.join(args.outDir, args.outMafName)
    args.outMafAllPaths = args.outMafPath.replace(
        "_halPhyloPTrain_temp_%s.maf" % suffix,
        "_halPhyloPTrain_temp_%s*.maf" % suffix)
    # replace .maf suffix with .ss
    args.outMafSS = args.outMafPath[:-4] + ".ss"

    computeModel(args)
def main(argv=None):
    """Parse and validate the command line, then call computeModel.

    Besides the parsed options, the following attributes are set on ``args``
    before computeModel is called: bedFiles, halGenomes, outDir, outName,
    outMafName, outMafPath, outMafAllPaths and outMafSS.

    Note: ``argv`` is accepted but the parser is invoked without arguments,
    so options are always read from the process command line.

    Raises:
        RuntimeError: if the hal file or BED path is missing, the output
            does not end in ".mod", the substitution model is unknown, or
            the reference genome is not listed by getHalGenomes.
        IOError/OSError: if the output model path cannot be opened for
            writing.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Compute a neutral substitution model for use with "
        "phyloP or halPhlyoP")

    parser.add_argument("hal", help="input hal")
    parser.add_argument("refGenome", help="Name of reference genome")
    parser.add_argument("bedDir", help="BED file or directory containing BED "
                        "files. By "
                        "default, these files are interpreted to contain only"
                        " coordinates of coding exons, and fourfold degenerate"
                        " sites will automatically be extracted from them."
                        " To disable this behaviour and train on the entire "
                        " file, use the --no4d option.", default=None)
    parser.add_argument("outMod", help="Path to output model file")
    parser.add_argument("--no4d", help="Do not extract fourfold degenerate"
                        " positions from the input bed files. Rather use "
                        "all bases they contain.",
                        default=False, action="store_true")
    parser.add_argument("--numProc",
                        help="Maximum number of processes for hal2maf.",
                        type=int, default=1)
    parser.add_argument("--noAncestors",
                        help="Don't write ancestral genomes in hal2maf",
                        action="store_true", default=False)
    parser.add_argument("--maxBedLines",
                        help="Split bed files so they have at most this many"
                        " lines",
                        type=int, default=None)
    parser.add_argument("--sliceSize",
                        help="Slice size for hal2maf.",
                        type=int, default=None)
    parser.add_argument("--tree",
                        help="String describing phylogeny in NEWICK format "
                        "that will be used instead of the tree stored in the"
                        " HAL file. This tree should contain all the species"
                        " in the alignment. Note that it is best to enclose"
                        " this string in quotes", default=None)
    parser.add_argument("--substMod", help="Substitution model for phyloFit"
                        ": valid options are JC69|F81|HKY85|HKY85+Gap|REV|"
                        "SSREV|UNREST|R2|R2S|U2|U2S|R3|R3S|U3|U3S",
                        default="SSREV")
    parser.add_argument("--noModFreqs", help="By default, equilibrium "
                        "frequencies for the nucleotides of the trained model"
                        " are corrected with the observed frequencies of "
                        "the reference genome (using the PHAST modFreqs"
                        " tool. This flag disables this step, and keeps the"
                        " trained frequencies", action="store_true",
                        default=False)
    parser.add_argument("--error", help="File in which to output confidence"
                        " intervals for the parameters in the model",
                        default=None)

    args = parser.parse_args()

    if not os.path.isfile(args.hal):
        raise RuntimeError("Input hal file %s not found" % args.hal)
    if not os.path.exists(args.bedDir):
        raise RuntimeError("%s not found" % args.bedDir)

    # A directory contributes every regular file directly inside it.
    if os.path.isdir(args.bedDir):
        args.bedFiles = [os.path.join(args.bedDir, f) for f
                         in os.listdir(args.bedDir)
                         if os.path.isfile(os.path.join(args.bedDir, f))]
    else:
        args.bedFiles = [args.bedDir]

    # Run the option checks that need no file access before the output
    # path is touched, so a rejected invocation never creates or truncates
    # the output file.
    if os.path.splitext(args.outMod)[1] != ".mod":
        raise RuntimeError("Output model must have .mod extension")
    if args.substMod not in "JC69|F81|HKY85|HKY85+Gap|REV|SSREV|UNREST|R2|R2S|U2|U2S|R3|R3S|U3|U3S".split("|"):
        raise RuntimeError("Invalid substitution model: %s" % args.substMod)

    args.halGenomes = getHalGenomes(args.hal)
    if args.refGenome not in args.halGenomes:
        raise RuntimeError("Reference genome %s not found." % args.refGenome)

    # Writability probe. open() raises on failure, and the handle is closed
    # right away instead of being left open for the rest of the run.
    with open(args.outMod, "w"):
        pass

    args.outDir = os.path.dirname(args.outMod)
    args.outName = os.path.splitext(os.path.basename(args.outMod))[0]
    # Temporary MAF / SS names are derived from the output model name.
    args.outMafName = args.outName + "_halPhyloPTrain_temp.maf"
    args.outMafPath = os.path.join(args.outDir, args.outMafName)
    args.outMafAllPaths = args.outMafPath.replace("_halPhyloPTrain_temp.maf",
                                                  "_halPhyloPTrain_temp*.maf")
    args.outMafSS = args.outMafPath.replace("_halPhyloPTrain_temp.maf",
                                            "_halPhyloPTrain_temp.ss")

    computeModel(args)
def main(argv=None):
    """Entry point: validate the options and start runParallelSlices.

    After parsing, the HAL and model files must exist and the reference
    genome must be among the genomes returned by getHalGenomes. The wiggle
    output path (and the chromosome-sizes path, when given) is test-written
    and deleted again. A random 5-character ``tempID`` tag is stored on the
    options before they are passed to runParallelSlices.

    Note: ``argv`` is accepted but the parser is invoked without arguments,
    so options are always read from the process command line.

    Raises:
        RuntimeError: if the hal file, the mod file or the reference genome
            cannot be found.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Multi-Process wrapper for halPhyloP.")

    parser.add_argument("halFile", help="Input HAL file")
    parser.add_argument("refGenome", help="Reference genome to scan")
    parser.add_argument("modFile",
                        help="Neutral model for PhyloP. Can be "
                        "generated with halPhyloPTrain.py")
    parser.add_argument("wiggleFile", help="Output Wiggle file")
    parser.add_argument("--numProc", type=int, default=1,
                        help="Maximum number of processes to create. If "
                        " neither --sliceSize or --splitBySequence are "
                        " specified, then the reference genome will be "
                        "sliced to require --numProc jobs")
    parser.add_argument("--sliceSize", type=int, default=None,
                        help="Maximum slice of reference sequence to process "
                        "in a single process.")
    parser.add_argument("--chromSizes", default=None,
                        help="Path of file to output chromosome sizes to. "
                        "Necessary for wigToBigWig")

    ##################################################################
    #HDF5 OPTIONS (as copied from hal/api/hdf5_impl/hdf5CLParser.cpp)
    ##################################################################
    hdf5Grp = parser.add_argument_group('HDF5 HAL Options')
    hdf5Grp.add_argument("--cacheMDC", type=int, default=None,
                         help="number of metadata slots in hdf5 cache")
    hdf5Grp.add_argument("--cacheRDC", type=int, default=None,
                         help="number of regular slots in hdf5 cache. "
                         "should be"
                         " a prime number ~= 10 * DefaultCacheRDCBytes / "
                         "chunk")
    hdf5Grp.add_argument("--cacheBytes", type=int, default=None,
                         help="maximum size in bytes of regular hdf5 cache")
    hdf5Grp.add_argument("--cacheW0", type=int, default=None,
                         help="w0 parameter fro hdf5 cache")
    hdf5Grp.add_argument("--inMemory", action="store_true", default=False,
                         help="load all data in memory (& disable hdf5 cache)")

    ##################################################################
    #HALPHYLOP OPTIONS (as copied from hal/maf/impl/hal2maf.cpp)
    ##################################################################
    hppGrp = parser.add_argument_group('halPhyloP Options')
    hppGrp.add_argument("--refSequence", default=None,
                        help="name of reference sequence within reference "
                        "genome (all sequences if empty)")
    hppGrp.add_argument("--start", type=int, default=None,
                        help="coordinate within reference genome (or sequence"
                        " if specified) to start at")
    hppGrp.add_argument("--length", type=int, default=None,
                        help="length of the reference genome (or sequence"
                        " if specified) to convert. If set to 0,"
                        " the entire thing is converted")
    hppGrp.add_argument("--targetGenomes", default=None,
                        help="comma-separated (no spaces) list of target "
                        "genomes (others are excluded) (vist all if empty)")
    hppGrp.add_argument("--dupType", default=None,
                        help="Which duplications to mask according to dupMask "
                        "option. Choices are: "
                        "\"all\": Any duplicated region; or "
                        "\"ambiguous\": Regions within duplications where "
                        "alignments from the same species do not contain"
                        " the same base.")
    hppGrp.add_argument("--dupMask", default=None,
                        help="What to do with duplicated regions. Choices are: "
                        "\"hard\": mask entire alignment column if any "
                        "duplications occur; or "
                        "\"soft\": mask species where duplications occur.")
    hppGrp.add_argument("--step", type=int, default=None,
                        help="step size")
    hppGrp.add_argument("--refBed", default=None,
                        help="Bed file with coordinates to annotate in the "
                        "reference genome to stream from standard "
                        " input.")
    hppGrp.add_argument("--subtree", default=None,
                        help="Subtree root for lineage-specific acceleration/conservation")
    hppGrp.add_argument("--prec", type=int, default=None,
                        help="Number of decimal places in wig output")

    args = parser.parse_args()

    # Both input files have to be present before anything else is done.
    if not os.path.isfile(args.halFile):
        raise RuntimeError("Input hal file %s not found" % args.halFile)
    if not os.path.isfile(args.modFile):
        raise RuntimeError("Input mod file %s not found" % args.modFile)

    args.halGenomes = getHalGenomes(args.halFile)
    if args.refGenome not in args.halGenomes:
        raise RuntimeError("Reference genome %s not found." % args.refGenome)

    # Test-write the wiggle output, then remove the probe file again.
    wiggleProbe = open(args.wiggleFile, "w")
    wiggleProbe.write("\n")
    wiggleProbe.close()
    os.remove(args.wiggleFile)

    # Same probe for the optional chromosome-sizes output.
    if args.chromSizes is not None:
        sizesProbe = open(args.chromSizes, "w")
        sizesProbe.write("\n")
        sizesProbe.close()
        os.remove(args.chromSizes)

    # make a little id tag for temporary slices
    tagAlphabet = string.ascii_uppercase + string.digits
    randomTag = ''.join(random.choice(tagAlphabet) for _ in range(5))
    args.tempID = 'halPhyloPTemp' + randomTag

    runParallelSlices(args)