def count(args):
    """
    %prog count bamfile gtf

    Count the number of reads mapped using `htseq-count`.
    """
    p = OptionParser(count.__doc__)
    p.add_option("--type", default="exon",
                 help="Only count feature type")
    p.set_cpus(cpus=8)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, gtf = args
    cpus = opts.cpus
    pf = bamfile.split(".")[0]
    countfile = pf + ".count"
    if not need_update(bamfile, countfile):
        return

    nsorted = pf + "_nsorted"
    nsortedbam, nsortedsam = nsorted + ".bam", nsorted + ".sam"
    if need_update(bamfile, nsortedsam):
        # Name-sort the BAM (htseq-count expects read pairs adjacent),
        # using the modern `-o` output syntax as elsewhere in this module
        cmd = "samtools sort -@ {0} -n {1} -o {2}".format(cpus, bamfile, nsortedbam)
        sh(cmd)
        cmd = "samtools view -@ {0} -h {1}".format(cpus, nsortedbam)
        sh(cmd, outfile=nsortedsam)

    if need_update(nsortedsam, countfile):
        cmd = "htseq-count --stranded=no --minaqual=10"
        cmd += " -t {0}".format(opts.type)
        cmd += " {0} {1}".format(nsortedsam, gtf)
        sh(cmd, outfile=countfile)
def index(args):
    """
    %prog index samfile/bamfile

    If SAM file, convert to BAM, sort and then index, using SAMTOOLS
    """
    p = OptionParser(index.__doc__)
    p.add_option("--fasta", dest="fasta", default=None,
                 help="add @SQ header to the BAM file [default: %default]")
    p.add_option("--unique", default=False, action="store_true",
                 help="only retain uniquely mapped reads [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samfile, = args
    cpus = opts.cpus
    fastafile = opts.fasta
    if fastafile:
        assert op.exists(fastafile)

    bamfile = samfile.replace(".sam", ".bam")
    if fastafile:
        faifile = fastafile + ".fai"
        if need_update(fastafile, faifile):
            sh("samtools faidx {0}".format(fastafile))
        cmd = "samtools view -bt {0} {1} -o {2}".\
                format(faifile, samfile, bamfile)
    else:
        cmd = "samtools view -bS {0} -o {1}".\
                format(samfile, bamfile)

    cmd += " -@ {0}".format(cpus)
    if opts.unique:
        cmd += " -q 1"

    if samfile.endswith(".sam") and need_update(samfile, bamfile):
        sh(cmd)

    # Already sorted?
    if bamfile.endswith(".sorted.bam"):
        sortedbamfile = bamfile
    else:
        prefix = bamfile.replace(".bam", "")
        sortedbamfile = prefix + ".sorted.bam"

    if need_update(bamfile, sortedbamfile):
        cmd = "samtools sort {0} -o {1}".format(bamfile, sortedbamfile)
        cmd += " -@ {0}".format(cpus)
        sh(cmd)

    baifile = sortedbamfile + ".bai"
    if need_update(sortedbamfile, baifile):
        sh("samtools index {0}".format(sortedbamfile))

    return sortedbamfile
def frompsl(args):
    """
    %prog frompsl old.new.psl old.fasta new.fasta

    Generate chain file from psl file. The pipeline is described in:
    <http://genomewiki.ucsc.edu/index.php/Minimal_Steps_For_LiftOver>
    """
    from maize.formats.sizes import Sizes

    p = OptionParser(frompsl.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    pslfile, oldfasta, newfasta = args
    pf = oldfasta.split(".")[0]

    # Chain together alignments using axtChain
    chainfile = pf + ".chain"
    twobitfiles = []
    for fastafile in (oldfasta, newfasta):
        tbfile = faToTwoBit(fastafile)
        twobitfiles.append(tbfile)
    oldtwobit, newtwobit = twobitfiles

    if need_update(pslfile, chainfile):
        cmd = "axtChain -linearGap=medium -psl {0}".format(pslfile)
        cmd += " {0} {1} {2}".format(oldtwobit, newtwobit, chainfile)
        sh(cmd)

    # Sort chain files
    sortedchain = chainfile.rsplit(".", 1)[0] + ".sorted.chain"
    if need_update(chainfile, sortedchain):
        cmd = "chainSort {0} {1}".format(chainfile, sortedchain)
        sh(cmd)

    # Make alignment nets from chains
    netfile = pf + ".net"
    oldsizes = Sizes(oldfasta).filename
    newsizes = Sizes(newfasta).filename
    if need_update((sortedchain, oldsizes, newsizes), netfile):
        cmd = "chainNet {0} {1} {2}".format(sortedchain, oldsizes, newsizes)
        cmd += " {0} /dev/null".format(netfile)
        sh(cmd)

    # Create liftOver chain file
    liftoverfile = pf + ".liftover.chain"
    if need_update((netfile, sortedchain), liftoverfile):
        cmd = "netChainSubset {0} {1} {2}".\
                format(netfile, sortedchain, liftoverfile)
        sh(cmd)
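# Follow-up sketch (not part of frompsl itself): once the .liftover.chain
# file exists, coordinates can be projected with UCSC `liftOver`. The helper
# name and file names here are hypothetical placeholders.
def liftover_bed(bedfile, chainfile, newbedfile, unmappedfile="unmapped.bed"):
    # liftOver usage: liftOver oldFile map.chain newFile unMapped
    cmd = "liftOver {0} {1} {2} {3}".format(
            bedfile, chainfile, newbedfile, unmappedfile)
    sh(cmd)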
def fpkm(args):
    """
    %prog fpkm fastafile *.bam

    Calculate FPKM values from BAM file.
    """
    p = OptionParser(fpkm.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    fastafile = args[0]
    bamfiles = args[1:]

    # Create a DUMMY gff file for cuffdiff
    gffile = fastafile.rsplit(".", 1)[0] + ".gff"
    if need_update(fastafile, gffile):
        fw = open(gffile, "w")
        f = Fasta(fastafile, lazy=True)
        for key, size in f.itersizes_ordered():
            print("\t".join(str(x) for x in (key, "dummy", "transcript",
                    1, size, ".", ".", ".", "ID=" + key)), file=fw)
        fw.close()
        logging.debug("Dummy GFF created: {0}".format(gffile))

    cmd = "cuffdiff {0} {1}".format(gffile, " ".join(bamfiles))
    sh(cmd)
def first(args):
    """
    %prog first N fastqfile(s)

    Get first N reads from file.
    """
    from maize.apps.base import need_update

    p = OptionParser(first.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    N = int(args[0])
    nlines = N * 4
    fastqfiles = args[1:]
    fastqfile = fastqfiles[0]
    outfile = opts.outfile
    if not need_update(fastqfiles, outfile):
        logging.debug("File `{0}` exists. Will not overwrite.".format(outfile))
        return

    gz = fastqfile.endswith(".gz")
    for fastqfile in fastqfiles:
        if gz:
            cmd = "zcat {0} | head -n {1}".format(fastqfile, nlines)
        else:
            cmd = "head -n {0} {1}".format(nlines, fastqfile)
        sh(cmd, outfile=outfile, append=True)
def filter(args):
    """
    %prog filter <deltafile|coordsfile>

    Produce a new delta/coords file and filter based on id% or cov%.
    Use `delta-filter` for .delta file.
    """
    p = OptionParser(filter.__doc__)
    p.set_align(pctid=0, hitlen=0)
    p.add_option("--overlap", default=False, action="store_true",
                 help="Print overlap status (e.g. terminal, contained)")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    pctid = opts.pctid
    hitlen = opts.hitlen
    filename, = args
    if pctid == 0 and hitlen == 0:
        return filename

    pf, suffix = filename.rsplit(".", 1)
    outfile = "".join((pf, ".P{0}L{1}.".format(int(pctid), int(hitlen)), suffix))
    if not need_update(filename, outfile):
        return outfile

    if suffix == "delta":
        cmd = "delta-filter -i {0} -l {1} {2}".format(pctid, hitlen, filename)
        sh(cmd, outfile=outfile)
        return outfile

    fp = open(filename)
    fw = must_open(outfile, "w")
    for row in fp:
        try:
            c = CoordsLine(row)
        except AssertionError:
            continue
        if c.identity < pctid:
            continue
        if c.len2 < hitlen:
            continue
        if opts.overlap and not c.overlap:
            continue

        outrow = row.rstrip()
        if opts.overlap:
            ov = Overlap_types[c.overlap]
            outrow += "\t" + ov
        print(outrow, file=fw)

    return outfile
def __init__(self, filename, index=False):
    super(Maf, self).__init__(filename)

    indexfile = filename + ".idx"
    if index:
        if need_update(filename, indexfile):
            self.build_index(filename, indexfile)
        self.index = maf.Index(filename, indexfile)

    fp = open(filename)
    self.reader = maf.Reader(fp)
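# Usage sketch for the reader above, assuming `maf` is bx-python's
# `bx.align.maf` module (the file name argument is a placeholder). Iterating
# the reader yields alignment blocks; each component carries source, start,
# size and strand.
def print_maf_components(filename):
    m = Maf(filename)
    for block in m.reader:
        for c in block.components:
            # c.src is "genome.chrom"; c.start/c.size are ungapped, 0-based
            print(c.src, c.start, c.size, c.strand)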
def fasta(args):
    """
    %prog fasta fastqfiles

    Convert fastq to fasta and qual file.
    """
    p = OptionParser(fasta.__doc__)
    p.add_option("--seqtk", default=False, action="store_true",
                 help="Use seqtk to convert")
    p.set_outdir()
    p.set_outfile(outfile=None)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = args
    outdir = opts.outdir
    if outdir and outdir != ".":
        mkdir(outdir)

    fastqfile = fastqfiles[0]
    pf = op.basename(fastqfile)
    gzinput = pf.endswith(".gz")
    if gzinput:
        pf = pf.rsplit(".", 1)[0]

    pf, sf = pf.rsplit(".", 1)
    if sf not in ("fq", "fastq"):
        logging.debug("Assumed FASTA: suffix not `fq` or `fastq`")
        return fastqfile, None

    fastafile, qualfile = pf + ".fasta", pf + ".qual"
    outfile = opts.outfile or fastafile
    outfile = op.join(outdir, outfile)

    if opts.seqtk:
        if need_update(fastqfiles, outfile):
            for i, fastqfile in enumerate(fastqfiles):
                cmd = "seqtk seq -A {0} -L 30 -l 70".format(fastqfile)
                # First one creates file, following ones append to it
                sh(cmd, outfile=outfile, append=i)
        else:
            logging.debug("Outfile `{0}` already exists.".format(outfile))
        return outfile, None

    for fastqfile in fastqfiles:
        SeqIO.convert(fastqfile, "fastq", fastafile, "fasta")
        SeqIO.convert(fastqfile, "fastq", qualfile, "qual")

    return fastafile, qualfile
def coverage(args):
    """
    %prog coverage fastafile bamfile

    Calculate coverage for BAM file. BAM file will be sorted unless with
    --nosort.
    """
    p = OptionParser(coverage.__doc__)
    p.add_option("--format", default="bigwig",
                 choices=("bedgraph", "bigwig", "coverage"),
                 help="Output format")
    p.add_option("--nosort", default=False, action="store_true",
                 help="Do not sort BAM")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, bamfile = args
    format = opts.format
    if opts.nosort:
        logging.debug("BAM sorting skipped")
    else:
        bamfile = index([bamfile, "--fasta={0}".format(fastafile)])

    pf = bamfile.rsplit(".", 2)[0]
    sizesfile = Sizes(fastafile).filename
    cmd = "genomeCoverageBed -ibam {0} -g {1}".format(bamfile, sizesfile)
    if format in ("bedgraph", "bigwig"):
        cmd += " -bg"
        bedgraphfile = pf + ".bedgraph"
        sh(cmd, outfile=bedgraphfile)

        if format == "bedgraph":
            return bedgraphfile

        bigwigfile = pf + ".bigwig"
        cmd = "bedGraphToBigWig {0} {1} {2}".\
                format(bedgraphfile, sizesfile, bigwigfile)
        sh(cmd)
        return bigwigfile

    coveragefile = pf + ".coverage"
    if need_update(fastafile, coveragefile):
        sh(cmd, outfile=coveragefile)

    gcf = GenomeCoverageFile(coveragefile)
    fw = must_open(opts.outfile, "w")
    for seqid, cov in gcf.iter_coverage_seqid():
        print("\t".join((seqid, "{0:.1f}".format(cov))), file=fw)
    fw.close()
def __init__(self, filename, select=None):
    assert op.exists(filename), "File `{0}` not found".format(filename)

    # filename can be both .sizes file or FASTA formatted file
    sizesname = filename

    if not filename.endswith(".sizes"):
        sizesname = filename + ".sizes"
        filename = get_abs_path(filename)
        if need_update(filename, sizesname):
            cmd = "faSize"
            if which(cmd):
                cmd += " -detailed {0}".format(filename)
                sh(cmd, outfile=sizesname)
            else:
                from maize.formats.fasta import Fasta

                f = Fasta(filename)
                fw = open(sizesname, "w")
                for k, size in f.itersizes_ordered():
                    fw.write("\t".join((k, str(size))) + "\n")
                fw.close()

        filename = sizesname

    assert filename.endswith(".sizes")

    super(Sizes, self).__init__(filename)
    self.fp = open(filename)
    self.filename = filename

    # get sizes for individual contigs, both in list and dict
    # this is to preserve the input order in the sizes file
    sizes = list(self.iter_sizes())
    if select:
        assert select > 0
        sizes = [x for x in sizes if x[1] >= select]
    self.sizes_mapping = dict(sizes)

    # get cumulative sizes, both in list and dict
    ctgs, sizes = zip(*sizes)
    self.sizes = sizes
    cumsizes = np.cumsum([0] + list(sizes))
    self.ctgs = ctgs
    self.cumsizes = cumsizes
    self.cumsizes_mapping = dict(zip(ctgs, cumsizes))
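# Illustrative sketch of the cumulative-size bookkeeping above (the helper
# name and "genome.fasta" are placeholders): cumsizes_mapping turns a
# per-contig coordinate into a position on one concatenated axis, which is
# what whole-genome plots typically need.
def concat_position(sizes, ctg, pos):
    # global coordinate = cumulative offset of the contig + local position
    return sizes.cumsizes_mapping[ctg] + pos

# sizes = Sizes("genome.fasta")
# print(concat_position(sizes, sizes.ctgs[0], 100))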
def blast(args):
    """
    %prog blast <deltafile|coordsfile>

    Convert delta or coordsfile to BLAST tabular output.
    """
    p = OptionParser(blast.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    deltafile, = args
    blastfile = deltafile.rsplit(".", 1)[0] + ".blast"

    if need_update(deltafile, blastfile):
        coords = Coords(deltafile)
        fw = open(blastfile, "w")
        for c in coords:
            print(c.blastline, file=fw)
        fw.close()
def __init__(self, filename, sorted=False, header=False):
    if filename.endswith(".delta"):
        coordsfile = filename.rsplit(".", 1)[0] + ".coords"
        if need_update(filename, coordsfile):
            fromdelta([filename])
        filename = coordsfile

    super(Coords, self).__init__(filename)

    fp = open(filename)
    if header:
        self.cmd = next(fp)

    for row in fp:
        try:
            self.append(CoordsLine(row))
        except AssertionError:
            pass

    if sorted:
        self.ref_sort()
def pairs(args):
    """
    See __doc__ for OptionParser.set_pairs().
    """
    import maize.formats.bed

    p = OptionParser(pairs.__doc__)
    p.set_pairs()
    opts, targs = p.parse_args(args)

    if len(targs) != 1:
        sys.exit(not p.print_help())

    samfile, = targs
    bedfile = samfile.rsplit(".", 1)[0] + ".bed"
    if need_update(samfile, bedfile):
        cmd = "bamToBed -i {0}".format(samfile)
        sh(cmd, outfile=bedfile)

    # Swap the SAM file for the BED file in the original argument list,
    # then delegate to the BED-level pairs routine
    args[args.index(samfile)] = bedfile

    return maize.formats.bed.pairs(args)
def wrapper(*args, **kwargs):
    assert outfile in kwargs, \
            "You need to specify `outfile=` on function call"
    # Guard against NameError below when no `infile=` is supplied
    infilename = []
    if infile in kwargs:
        infilename = listify(kwargs[infile])
        for x in infilename:
            assert op.exists(x), \
                    "The specified infile `{0}` does not exist".format(x)

    outfilename = kwargs[outfile]
    if need_update(infilename, outfilename):
        return func(*args, **kwargs)
    else:
        msg = "File `{0}` exists. Computation skipped." \
                .format(outfilename)
        logging.debug(msg)

    outfilename = listify(outfilename)
    for x in outfilename:
        assert op.exists(x), \
                "Something went wrong, `{0}` not found".format(x)

    return outfilename
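# The wrapper above reads as the inner closure of a dependency-checking
# decorator. Below is a minimal reconstruction under that assumption (the
# name `depends` and the captured `infile`/`outfile` keyword names are
# inferred from the closure, not confirmed by this fragment).
def depends(func):
    infile = "infile"
    outfile = "outfile"

    def wrapper(*args, **kwargs):
        infilename = listify(kwargs.get(infile, []))
        outfilename = kwargs[outfile]
        if need_update(infilename, outfilename):
            return func(*args, **kwargs)
        return outfilename

    return wrapper

# Usage sketch: the wrapped function runs only when `outfile` is missing or
# older than `infile`; otherwise the existing output path is returned.
@depends
def run_faidx(infile=None, outfile=None):
    sh("samtools faidx {0}".format(infile))

# run_faidx(infile="genome.fasta", outfile="genome.fasta.fai")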
def faToTwoBit(fastafile):
    twobitfile = fastafile.rsplit(".", 1)[0] + ".2bit"
    cmd = "faToTwoBit {0} {1}".format(fastafile, twobitfile)
    if need_update(fastafile, twobitfile):
        sh(cmd)
    return twobitfile
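# Companion sketch (a helper this module does not define, assuming the UCSC
# tools are on PATH): the reverse conversion, mirroring faToTwoBit above
# with the same need_update guard.
def twoBitToFa(twobitfile):
    fastafile = twobitfile.rsplit(".", 1)[0] + ".fa"
    cmd = "twoBitToFa {0} {1}".format(twobitfile, fastafile)
    if need_update(twobitfile, fastafile):
        sh(cmd)
    return fastafile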
def mstmap(args):
    """
    %prog mstmap bcffile/vcffile > matrixfile

    Convert bcf/vcf format to mstmap input.
    """
    from maize.assembly.geneticmap import MSTMatrix

    p = OptionParser(mstmap.__doc__)
    p.add_option("--dh", default=False, action="store_true",
                 help="Double haploid population, no het [default: %default]")
    p.add_option("--freq", default=.2, type="float",
                 help="Allele must be above frequency [default: %default]")
    p.add_option("--mindepth", default=3, type="int",
                 help="Only trust genotype calls with depth [default: %default]")
    p.add_option("--missing_threshold", default=.25, type="float",
                 help="Fraction missing must be below")
    p.add_option("--noheader", default=False, action="store_true",
                 help="Do not print MSTmap run parameters [default: %default]")
    p.add_option("--pv4", default=False, action="store_true",
                 help="Enable filtering strand-bias, tail distance bias, etc. "
                      "[default: %default]")
    p.add_option("--freebayes", default=False, action="store_true",
                 help="VCF output from freebayes")
    p.set_sep(sep=".", help="Use separator to simplify individual names")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    vcffile, = args
    if vcffile.endswith(".bcf"):
        bcffile = vcffile
        vcffile = bcffile.rsplit(".", 1)[0] + ".vcf"
        cmd = "bcftools view {0}".format(bcffile)
        cmd += " | vcfutils.pl varFilter"
        if not opts.pv4:
            cmd += " -1 0 -2 0 -3 0 -4 0 -e 0"
        if need_update(bcffile, vcffile):
            sh(cmd, outfile=vcffile)

    freq = opts.freq
    sep = opts.sep
    depth_index = 1 if opts.freebayes else 2
    ptype = "DH" if opts.dh else "RIL6"
    nohet = ptype == "DH"

    fp = open(vcffile)
    genotypes = []
    for row in fp:
        if row[:2] == "##":
            continue
        atoms = row.split()
        if row[0] == '#':
            ind = [x.split(sep)[0] for x in atoms[9:]]
            nind = len(ind)
            mh = ["locus_name"] + ind
            continue

        marker = "{0}.{1}".format(*atoms[:2])
        geno = atoms[9:]
        geno = [encode_genotype(x, mindepth=opts.mindepth,
                                depth_index=depth_index,
                                nohet=nohet) for x in geno]
        assert len(geno) == nind
        f = 1. / nind

        if geno.count("A") * f < freq:
            continue
        if geno.count("B") * f < freq:
            continue
        if geno.count("-") * f > opts.missing_threshold:
            continue

        genotype = [marker] + geno
        genotypes.append(genotype)

    mm = MSTMatrix(genotypes, mh, ptype, opts.missing_threshold)
    mm.write(opts.outfile, header=(not opts.noheader))
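# `encode_genotype` is defined elsewhere in this module. The sketch below is
# NOT the real implementation, only an illustration of the contract mstmap
# relies on: map one VCF sample field to "A" (hom ref), "B" (hom alt),
# "X" (het; folded into missing when nohet=True) or "-" (missing/low depth).
def encode_genotype_sketch(field, mindepth=3, depth_index=2, nohet=False):
    atoms = field.split(":")
    if len(atoms) <= depth_index:
        return "-"
    depth = atoms[depth_index]
    if not depth.isdigit() or int(depth) < mindepth:
        return "-"
    gt = atoms[0].replace("|", "/")
    if gt == "0/0":
        return "A"
    if gt == "1/1":
        return "B"
    if gt in ("0/1", "1/0"):
        return "-" if nohet else "X"
    return "-"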