def blast(args):
    """
    %prog blast ref.fasta query.fasta

    Calls blast and then filters the BLAST hits. Default is megablast.
    """
    task_choices = ("blastn", "blastn-short", "dc-megablast",
                    "megablast", "vecscreen")
    p = OptionParser(blast.__doc__)
    p.set_align(pctid=None, evalue=.01)
    p.add_option("--wordsize", type="int", help="Word size [default: %default]")
    p.add_option("--best", default=1, type="int",
                 help="Only look for best N hits [default: %default]")
    p.add_option("--task", default="megablast", choices=task_choices,
                 help="Task of the blastn [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    reffasta, queryfasta = args
    q = op.basename(queryfasta).split(".")[0]
    r = op.basename(reffasta).split(".")[0]
    blastfile = "{0}.{1}.blast".format(q, r)

    run_megablast(infile=queryfasta, outfile=blastfile, db=reffasta,
                  wordsize=opts.wordsize, pctid=opts.pctid, evalue=opts.evalue,
                  hitlen=None, best=opts.best, task=opts.task, cpus=opts.cpus)

    return blastfile
def batchoverlap(args): """ %prog batchoverlap pairs.txt outdir Check overlaps between pairs of sequences. """ p = OptionParser(batchoverlap.__doc__) p.set_cpus() opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) pairsfile, outdir = args fp = open(pairsfile) cmds = [] mkdir("overlaps") for row in fp: a, b = row.split()[:2] oa = op.join(outdir, a + ".fa") ob = op.join(outdir, b + ".fa") cmd = "python -m jcvi.assembly.goldenpath overlap {0} {1}".format(oa, ob) cmd += " -o overlaps/{0}_{1}.ov".format(a, b) cmds.append(cmd) print "\n".join(cmds)
def filtervcf(args): """ %prog filtervcf NA12878.hg38.vcf.gz Filter lobSTR VCF using script shipped in lobSTR. Input file can be a list of vcf files. """ p = OptionParser(filtervcf.__doc__) p.set_home("lobstr", default="/mnt/software/lobSTR") p.set_aws_opts(store="hli-mv-data-science/htang/str") p.set_cpus() opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) samples, = args lhome = opts.lobstr_home store = opts.output_path if samples.endswith((".vcf", ".vcf.gz")): vcffiles = [samples] else: vcffiles = [x.strip() for x in must_open(samples)] vcffiles = [x for x in vcffiles if ".filtered." not in x] run_args = [(x, lhome, x.startswith("s3://") and store) for x in vcffiles] cpus = min(opts.cpus, len(run_args)) p = Pool(processes=cpus) for res in p.map_async(run_filter, run_args).get(): continue
def cufflinks(args): """ %prog cufflinks folder reference Run cufflinks on a folder containing tophat results. """ p = OptionParser(cufflinks.__doc__) p.add_option("--gtf", help="Reference annotation [default: %default]") p.set_cpus() opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) folder, reference = args os.chdir(folder) bams = glob("*tophat/accepted_hits.bam") for bam in bams: pf, ab = op.split(bam) outdir = op.join(pf, "cufflinks") if op.exists(outdir): logging.debug("Directory {0} found. Skipping.".format(outdir)) continue cmd = "cufflinks" cmd += " -o {0}".format(outdir) cmd += " -p {0}".format(opts.cpus) if opts.gtf: cmd += " -g {0}".format(opts.gtf) cmd += " --frag-bias-correct {0}".format(reference) cmd += " --multi-read-correct" cmd += " {0}".format(bam) sh(cmd)
def blat(args): """ %prog blat ref.fasta query.fasta Calls blat and filters BLAST hits. """ p = OptionParser(blat.__doc__) p.set_align(pctid=95, hitlen=30) p.set_cpus() opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) reffasta, queryfasta = args blastfile = get_outfile(reffasta, queryfasta, suffix="blat") run_blat( infile=queryfasta, outfile=blastfile, db=reffasta, pctid=opts.pctid, hitlen=opts.hitlen, cpus=opts.cpus, overwrite=False, ) return blastfile
def filterdata(args): """ %prog filterdata data.bin samples.ids STR.ids allele_freq remove.ids final.ids Filter subset of data after dropping remove.ids. """ p = OptionParser(filterdata.__doc__) p.set_cpus() opts, args = p.parse_args(args) if len(args) != 6: sys.exit(not p.print_help()) binfile, sampleids, strids, af, remove, final = args df, m, samples, loci = read_binfile(binfile, sampleids, strids) remove = [x.strip() for x in open(remove)] removes = set(remove) final = [x.strip() for x in open(final)] assert len(loci) == len(remove) + len(final) fp = open(af) percentiles = {} for row in fp: sname, counts = row.split() countsd = af_to_counts(counts) percentile = counts_to_percentile(countsd) percentiles[sname] = percentile run_args = [] for i, sname in enumerate(loci): if sname in removes: continue a = m[:, i] percentile = percentiles[sname] run_args.append((i, a, percentile)) cpus = min(opts.cpus, len(run_args)) p = Pool(processes=cpus) res = [] for r in p.map_async(convert_to_percentile, run_args).get(): res.append(r) res.sort() # Write mask (P-value) matrix ii, pvalues = zip(*res) m = np.vstack(pvalues).T write_csv("final.mask.tsv", m, samples, final) df.drop(remove, inplace=True, axis=1) df.columns = final # Save a copy of the raw numpy array filtered_bin = "filtered.bin" m = df.as_matrix() m[m < 0] = -1 m.tofile(filtered_bin) logging.debug("Binary matrix written to `{}`".format(filtered_bin)) # Write data output df.to_csv("final.data.tsv", sep="\t", index_label="SampleKey")
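# filterdata() above relies on counts_to_percentile(), defined elsewhere in
# this module. A minimal sketch of the conversion it is assumed to perform
# (hypothetical helper, not the shipped implementation): map each allele to
# the cumulative percentage of observations at or below it.
def counts_to_percentile_sketch(countsd):
    total = sum(countsd.values())
    percentile, cum = {}, 0
    for allele in sorted(countsd):
        cum += countsd[allele]
        percentile[allele] = cum * 100. / total
    return percentile

# Example: {10: 1, 12: 3} => {10: 25.0, 12: 100.0}
assert counts_to_percentile_sketch({10: 1, 12: 3}) == {10: 25.0, 12: 100.0}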
def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-est` to remove duplicate sequences.
    """
    p = OptionParser(deduplicate.__doc__)
    p.set_align(pctid=98)
    p.add_option("--reads", default=False, action="store_true",
                 help="Use `cd-hit-454` to deduplicate [default: %default]")
    p.add_option("--samestrand", default=False, action="store_true",
                 help="Enforce same strand alignment [default: %default]")
    p.set_home("cdhit")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    identity = opts.pctid / 100.

    cmd = "cd-hit-454" if opts.reads else "cd-hit-est"
    cmd = op.join(opts.cdhit_home, cmd)
    cmd += " -c {0}".format(identity)
    cmd += " -d 0"  # include complete defline
    if opts.samestrand:
        cmd += " -r 0"
    cmd += " -M 0 -T {0} -i {1} -o {1}.cdhit".format(opts.cpus, fastafile)
    sh(cmd)

    dd = fastafile + ".cdhit"
    return dd
def count(args): """ %prog count bamfile gtf Count the number of reads mapped using `htseq-count`. """ p = OptionParser(count.__doc__) p.add_option("--type", default="exon", help="Only count feature type") p.set_cpus(cpus=8) opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) bamfile, gtf = args cpus = opts.cpus pf = bamfile.split(".")[0] countfile = pf + ".count" if not need_update(bamfile, countfile): return nsorted = pf + "_nsorted" nsortedbam, nsortedsam = nsorted + ".bam", nsorted + ".sam" if need_update(bamfile, nsortedsam): cmd = "samtools sort -@ {0} -n {1} {2}".format(cpus, bamfile, nsorted) sh(cmd) cmd = "samtools view -@ {0} -h {1}".format(cpus, nsortedbam) sh(cmd, outfile=nsortedsam) if need_update(nsortedsam, countfile): cmd = "htseq-count --stranded=no --minaqual=10" cmd += " -t {0}".format(opts.type) cmd += " {0} {1}".format(nsortedsam, gtf) sh(cmd, outfile=countfile)
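# need_update() (imported from jcvi.apps.base) gates nearly every shell call
# in these wrappers. A simplified sketch of the freshness test it is assumed
# to apply (hypothetical, single source/target; the real helper also accepts
# lists of files on both sides):
import os.path as op

def need_update_sketch(source, target):
    """True if target is missing or older than its source."""
    if not op.exists(target):
        return True
    return op.getmtime(target) < op.getmtime(source)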
def alignextend(args): """ %prog alignextend ref.fasta read.1.fastq read.2.fastq Wrapper around AMOS alignextend. """ p = OptionParser(alignextend.__doc__) p.add_option("--nosuffix", default=False, action="store_true", help="Do not add /1/2 suffix to the read [default: %default]") p.set_home("amos") p.set_cpus() opts, args = p.parse_args(args) if len(args) != 3: sys.exit(not p.print_help()) ref, r1, r2 = args pf = op.basename(r1).split(".")[0] cmd = op.join(opts.amos_home, "src/Experimental/alignextend.pl") if not opts.nosuffix: cmd += " -suffix" bwa_idx = "{0}.ref.fa.sa".format(pf) if not need_update(ref, bwa_idx): cmd += " -noindex" cmd += " -threads {0}".format(opts.cpus) offset = guessoffset([r1]) if offset == 64: cmd += " -I" cmd += " ".join(("", pf, ref, r1, r2)) sh(cmd)
def align(args): """ %prog align database.fasta read1.fq read2.fq Wrapper for `gsnap` single-end or paired-end, depending on the number of args. """ from jcvi.formats.fasta import join from jcvi.formats.fastq import guessoffset from jcvi.projects.tgbs import snp p = OptionParser(align.__doc__) p.add_option("--join", default=False, action="store_true", help="Join sequences with padded 50Ns") p.add_option("--rnaseq", default=False, action="store_true", help="Input is RNA-seq reads, turn splicing on") p.add_option("--snp", default=False, action="store_true", help="Call SNPs after GSNAP") p.set_cpus() opts, args = p.parse_args(args) if len(args) == 2: logging.debug("Single-end alignment") elif len(args) == 3: logging.debug("Paired-end alignment") else: sys.exit(not p.print_help()) dbfile, readfile = args[0:2] if opts.join: dbfile = join([dbfile, "--gapsize=50", "--newid=chr1"]) assert op.exists(dbfile) and op.exists(readfile) prefix = get_prefix(readfile, dbfile) logfile = prefix + ".log" gsnapfile = prefix + ".gsnap" if not need_update((dbfile, readfile), gsnapfile): logging.error("`{0}` exists. `gsnap` already run.".format(gsnapfile)) else: dbdir, dbname = check_index(dbfile) cmd = "gsnap -D {0} -d {1}".format(dbdir, dbname) cmd += " -B 5 -m 0.1 -i 2 -n 3" # memory, mismatch, indel penalty, nhits if opts.rnaseq: cmd += " -N 1" cmd += " -t {0}".format(opts.cpus) cmd += " --gmap-mode none --nofails" if readfile.endswith(".gz"): cmd += " --gunzip" try: offset = "sanger" if guessoffset([readfile]) == 33 else "illumina" cmd += " --quality-protocol {0}".format(offset) except AssertionError: pass cmd += " " + " ".join(args[1:]) sh(cmd, outfile=gsnapfile, errfile=logfile) if opts.snp: snp([gsnapfile, "--cpus={0}".format(opts.cpus)]) return gsnapfile, logfile
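# The quality-protocol mapping used in align() above, in isolation: Phred+33
# scores are "sanger" and Phred+64 are "illumina" for gsnap's
# --quality-protocol flag; guessoffset() inspects the FASTQ to pick one.
def quality_protocol(offset):
    return "sanger" if offset == 33 else "illumina"

assert quality_protocol(33) == "sanger"
assert quality_protocol(64) == "illumina"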
def extract(args): """ %prog extract bamfile contig Extract sub-bam for just one contig. """ p = OptionParser(extract.__doc__) p.set_cpus() opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) bamfile, contig = args cpus = opts.cpus pf = bamfile.split(".")[0] outfile = ".".join((contig.split("|")[0], pf, "bam")) if op.exists(outfile): logging.error("Output name exists: `{}`".format(outfile)) return if need_update(bamfile, outfile): cmd = 'samtools view {} "{}" -@ {}'.format(bamfile, contig, cpus) cmd += " -b -o {}".format(outfile) sh(cmd) index([outfile, "--cpus={}".format(cpus)])
def cib(args): """ %prog cib bamfile samplekey Convert BAM to CIB (a binary storage of int8 per base). """ p = OptionParser(cib.__doc__) p.add_option("--prefix", help="Report seqids with this prefix only") p.set_cpus() opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) bamfile, samplekey = args mkdir(samplekey) bam = pysam.AlignmentFile(bamfile, "rb") refs = [x for x in bam.header["SQ"]] prefix = opts.prefix if prefix: refs = [x for x in refs if x["SN"].startswith(prefix)] task_args = [] for r in refs: task_args.append((bamfile, r, samplekey)) cpus = min(opts.cpus, len(task_args)) logging.debug("Use {} cpus".format(cpus)) p = Pool(processes=cpus) for res in p.imap(bam_to_cib, task_args): continue
def gcn(args):
    """
    %prog gcn gencode.v26.exonunion.bed data/*.vcf.gz

    Compile gene copy number based on CANVAS results.
    """
    p = OptionParser(gcn.__doc__)
    p.set_cpus()
    p.set_tmpdir(tmpdir="tmp")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    exonbed = args[0]
    canvasvcfs = args[1:]
    tsvfile = opts.outfile

    tmpdir = opts.tmpdir
    mkdir(tmpdir)
    set_tempdir(tmpdir)

    df = vcf_to_df(canvasvcfs, exonbed, opts.cpus)
    for suffix in (".avgcn", ".medcn"):
        df_to_tsv(df, tsvfile, suffix)
def blat(args): """ %prog blat old.fasta new.fasta Generate psl file using blat. """ p = OptionParser(blat.__doc__) p.add_option("--minscore", default=100, type="int", help="Matches minus mismatches gap penalty [default: %default]") p.add_option("--minid", default=98, type="int", help="Minimum sequence identity [default: %default]") p.set_cpus() opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) oldfasta, newfasta = args twobitfiles = [] for fastafile in args: tbfile = faToTwoBit(fastafile) twobitfiles.append(tbfile) oldtwobit, newtwobit = twobitfiles cmd = "pblat -threads={0}".format(opts.cpus) if which("pblat") else "blat" cmd += " {0} {1}".format(oldtwobit, newfasta) cmd += " -tileSize=12 -minScore={0} -minIdentity={1} ".\ format(opts.minscore, opts.minid) pslfile = "{0}.{1}.psl".format(*(op.basename(x).split('.')[0] \ for x in (newfasta, oldfasta))) cmd += pslfile sh(cmd)
def bam(args):
    """
    %prog bam input.gsnap ref.fasta

    Convert GSNAP output to BAM.
    """
    from jcvi.formats.sizes import Sizes
    from jcvi.formats.sam import index

    p = OptionParser(bam.__doc__)
    p.set_home("eddyyeh")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gsnapfile, fastafile = args
    EYHOME = opts.eddyyeh_home
    pf = gsnapfile.rsplit(".", 1)[0]
    uniqsam = pf + ".unique.sam"
    if need_update((gsnapfile, fastafile), uniqsam):
        cmd = op.join(EYHOME, "gsnap2gff3.pl")
        sizesfile = Sizes(fastafile).filename
        cmd += " --format sam -i {0} -o {1}".format(gsnapfile, uniqsam)
        cmd += " -u -l {0} -p {1}".format(sizesfile, opts.cpus)
        sh(cmd)

    index([uniqsam])
def snp(args): """ %prog snp input.gsnap Run SNP calling on GSNAP output after apps.gsnap.align(). """ p = OptionParser(snp.__doc__) p.set_home("eddyyeh") p.set_cpus() opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) gsnapfile, = args EYHOME = opts.eddyyeh_home pf = gsnapfile.rsplit(".", 1)[0] nativefile = pf + ".native" if need_update(gsnapfile, nativefile): cmd = op.join(EYHOME, "convert2native.pl") cmd += " --gsnap {0} -o {1}".format(gsnapfile, nativefile) cmd += " -proc {0}".format(opts.cpus) sh(cmd) snpfile = pf + ".snp" if need_update(nativefile, snpfile): cmd = op.join(EYHOME, "SNPs/SNP_Discovery-short.pl") cmd += " --native {0} -o {1}".format(nativefile, snpfile) cmd += " -a 2 -ac 0.3 -c 0.8" sh(cmd)
def density(args):
    """
    %prog density test.clm

    Estimate link density of contigs.
    """
    p = OptionParser(density.__doc__)
    p.add_option("--save", default=False, action="store_true",
                 help="Write log densities of contigs to file")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    clmfile, = args
    clm = CLMFile(clmfile)
    pf = clmfile.rsplit(".", 1)[0]

    if opts.save:
        logdensities = clm.calculate_densities()
        densityfile = pf + ".density"
        fw = open(densityfile, "w")
        for name, logd in logdensities.items():
            s = clm.tig_to_size[name]
            print >> fw, "\t".join(str(x) for x in (name, s, logd))
        fw.close()
        logging.debug("Density written to `{}`".format(densityfile))

    tourfile = clmfile.rsplit(".", 1)[0] + ".tour"
    tour = clm.activate(tourfile=tourfile, backuptour=False)
    clm.flip_all(tour)
    clm.flip_whole(tour)
    clm.flip_one(tour)
def beagle(args): """ %prog beagle input.vcf 1 Use BEAGLE4.1 to impute vcf on chromosome 1. """ p = OptionParser(beagle.__doc__) p.set_home("beagle") p.set_ref() p.set_cpus() opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) vcffile, chr = args pf = vcffile.rsplit(".", 1)[0] outpf = pf + ".beagle" outfile = outpf + ".vcf.gz" mm = MakeManager() beagle_cmd = opts.beagle_home kg = op.join(opts.ref, "1000GP_Phase3") cmd = beagle_cmd + " gt={0}".format(vcffile) cmd += " ref={0}/chr{1}.1kg.phase3.v5a.bref".format(kg, chr) cmd += " map={0}/plink.chr{1}.GRCh37.map".format(kg, chr) cmd += " out={0}".format(outpf) cmd += " nthreads=16 gprobs=true" mm.add(vcffile, outfile, cmd) mm.write()
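# beagle() and several wrappers below queue commands through MakeManager
# instead of running them directly. A simplified sketch of that pattern
# (hypothetical, not the shipped jcvi.apps.grid implementation): each add()
# records a Makefile rule (target: sources + commands) and write() emits the
# Makefile, so `make -j` supplies dependency tracking and parallelism.
class MakeManagerSketch(object):

    def __init__(self):
        self.rules = []

    def add(self, sources, target, cmd):
        if isinstance(sources, str):
            sources = [sources]
        if isinstance(cmd, str):
            cmd = [cmd]
        self.rules.append((target, sources, cmd))

    def write(self, filename="Makefile"):
        fw = open(filename, "w")
        for target, sources, cmds in self.rules:
            fw.write("{0}: {1}\n".format(target, " ".join(sources)))
            for c in cmds:
                fw.write("\t{0}\n".format(c))
        fw.close()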
def jellyfish(args): """ %prog jellyfish [*.fastq|*.fasta] Run jellyfish to dump histogram to be used in kmer.histogram(). """ from jcvi.apps.base import getfilesize from jcvi.utils.cbook import human_size p = OptionParser(jellyfish.__doc__) p.add_option("-K", default=23, type="int", help="K-mer size [default: %default]") p.add_option("--coverage", default=40, type="int", help="Expected sequence coverage [default: %default]") p.add_option("--prefix", default="jf", help="Database prefix [default: %default]") p.add_option("--nohist", default=False, action="store_true", help="Do not print histogram [default: %default]") p.set_cpus() opts, args = p.parse_args(args) if len(args) < 1: sys.exit(not p.print_help()) fastqfiles = args K = opts.K coverage = opts.coverage totalfilesize = sum(getfilesize(x) for x in fastqfiles) fq = fastqfiles[0] pf = opts.prefix gzip = fq.endswith(".gz") hashsize = totalfilesize / coverage logging.debug("Total file size: {0}, hashsize (-s): {1}".\ format(human_size(totalfilesize, a_kilobyte_is_1024_bytes=True), hashsize)) jfpf = "{0}-K{1}".format(pf, K) jfdb = jfpf fastqfiles = " ".join(fastqfiles) cmd = "jellyfish count -t {0} -C -o {1}".format(opts.cpus, jfpf) cmd += " -s {0} -m {1}".format(hashsize, K) if gzip: cmd = "gzip -dc {0} | ".format(fastqfiles) + cmd + " /dev/fd/0" else: cmd += " " + fastqfiles if need_update(fastqfiles, jfdb): sh(cmd) if opts.nohist: return jfhisto = jfpf + ".histogram" cmd = "jellyfish histo -t 64 {0} -o {1}".format(jfdb, jfhisto) if need_update(jfdb, jfhisto): sh(cmd)
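# The hash-size heuristic in jellyfish() above, worked through: -s is set to
# total input bytes divided by expected coverage, a rough proxy for the
# number of distinct k-mers (i.e. the genome size).
def estimate_hashsize(filesizes, coverage=40):
    return sum(filesizes) // coverage

# Two 20 GB FASTQ files at 40x coverage suggest a ~1 Gb genome/hash.
assert estimate_hashsize([20 * 10 ** 9, 20 * 10 ** 9], coverage=40) == 10 ** 9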
def impute(args):
    """
    %prog impute input.vcf hs37d5.fa 1

    Use IMPUTE2 to impute vcf on chromosome 1.
    """
    from pyfaidx import Fasta

    p = OptionParser(impute.__doc__)
    p.set_home("shapeit")
    p.set_home("impute")
    p.set_ref()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    vcffile, fastafile, chr = args
    mm = MakeManager()
    pf = vcffile.rsplit(".", 1)[0]
    hapsfile = pf + ".haps"
    kg = op.join(opts.ref, "1000GP_Phase3")
    shapeit_phasing(mm, chr, vcffile, opts)

    fasta = Fasta(fastafile)
    size = len(fasta[chr])
    binsize = 5000000
    bins = size / binsize  # 5Mb bins
    if size % binsize:
        bins += 1
    impute_cmd = op.join(opts.impute_home, "impute2")
    chunks = []
    # bins already rounds up, so xrange(bins) covers [1, size] exactly
    for x in xrange(bins):
        chunk_start = x * binsize + 1
        chunk_end = min(chunk_start + binsize - 1, size)
        outfile = pf + ".chunk{0:02d}.impute2".format(x)
        mapfile = "{0}/genetic_map_chr{1}_combined_b37.txt".format(kg, chr)
        rpf = "{0}/1000GP_Phase3_chr{1}".format(kg, chr)
        cmd = impute_cmd + " -m {0}".format(mapfile)
        cmd += " -known_haps_g {0}".format(hapsfile)
        cmd += " -h {0}.hap.gz -l {0}.legend.gz".format(rpf)
        cmd += " -Ne 20000 -int {0} {1}".format(chunk_start, chunk_end)
        cmd += " -o {0} -allow_large_regions -seed 367946".format(outfile)
        cmd += " && touch {0}".format(outfile)
        mm.add(hapsfile, outfile, cmd)
        chunks.append(outfile)

    # Combine all the files
    imputefile = pf + ".impute2"
    cmd = "cat {0} > {1}".format(" ".join(chunks), imputefile)
    mm.add(chunks, imputefile, cmd)

    # Convert to vcf
    vcffile = pf + ".impute2.vcf"
    cmd = "python -m jcvi.formats.vcf fromimpute2 {0} {1} {2} > {3}".\
        format(imputefile, fastafile, chr, vcffile)
    mm.add(imputefile, vcffile, cmd)
    mm.write()
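# The 5 Mb windowing in impute() above, as a standalone sketch (written with
# floor division so it behaves the same under Python 2 and 3):
def chunk_intervals(size, binsize=5000000):
    """Yield 1-based, inclusive (start, end) windows covering [1, size]."""
    bins = size // binsize + (1 if size % binsize else 0)
    for x in range(bins):
        start = x * binsize + 1
        yield start, min(start + binsize - 1, size)

assert list(chunk_intervals(12000000)) == \
    [(1, 5000000), (5000001, 10000000), (10000001, 12000000)]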
def last(args):
    """
    %prog last database.fasta query.fasta

    Run LAST by calling LASTDB and LASTAL. LAST program available:
    <http://last.cbrc.jp>

    Works with LAST-719.
    """
    p = OptionParser(last.__doc__)
    p.add_option("--path", help="Specify LAST path")
    p.add_option("--mask", default=False, action="store_true",
                 help="Invoke -c in lastdb")
    p.add_option("--format", default="BlastTab",
                 choices=("TAB", "MAF", "BlastTab", "BlastTab+"),
                 help="Output format")
    p.add_option("--minlen", default=0, type="int",
                 help="Filter alignments by how many bases match")
    p.add_option("--minid", default=0, type="int",
                 help="Minimum sequence identity")
    p.set_cpus()
    p.set_params()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    path = opts.path
    cpus = opts.cpus
    getpath = lambda x: op.join(path, x) if path else x
    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")

    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(infile=subject, outfile=subjectdb + ".prj",
               mask=opts.mask, lastdb_bin=lastdb_bin)

    u = 2 if opts.mask else 0
    cmd = "{0} -u {1}".format(lastal_bin, u)
    cmd += " -P {0} -i3G".format(cpus)
    cmd += " -f {0}".format(opts.format)
    cmd += " {0} {1}".format(subjectdb, query)

    minlen = opts.minlen
    minid = opts.minid
    extra = opts.extra
    assert minid != 100, "Perfect match not yet supported"
    mm = minid / (100 - minid)

    if minlen:
        extra += " -e{0}".format(minlen)
    if minid:
        extra += " -r1 -q{0} -a{0} -b{0}".format(mm)
    if extra:
        cmd += " " + extra.strip()

    lastfile = get_outfile(subject, query, suffix="last")
    sh(cmd, outfile=lastfile)
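# The identity-to-penalty conversion in last() above, worked through: with
# match reward r=1 and mismatch penalty q = minid / (100 - minid), an
# alignment breaks even exactly at minid percent identity.
def mismatch_penalty(minid):
    """Penalty making alignments break even at `minid` percent identity."""
    return minid // (100 - minid)

# e.g. 95% identity: 19 matches (+19) are cancelled by 1 mismatch (-19),
# and 19 matches out of 20 columns is exactly 95%.
assert mismatch_penalty(95) == 19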
def index(args):
    """
    %prog index samfile/bamfile

    If SAM file, convert to BAM, sort and then index, using SAMTOOLS
    """
    p = OptionParser(index.__doc__)
    p.add_option("--fasta", dest="fasta", default=None,
                 help="add @SQ header to the BAM file [default: %default]")
    p.add_option("--unique", default=False, action="store_true",
                 help="only retain uniquely mapped reads [default: %default]")
    p.set_cpus(cpus=8)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samfile, = args
    cpus = opts.cpus
    fastafile = opts.fasta
    if fastafile:
        assert op.exists(fastafile)

    bamfile = samfile.replace(".sam", ".bam")
    if fastafile:
        faifile = fastafile + ".fai"
        if need_update(fastafile, faifile):
            sh("samtools faidx {0}".format(fastafile))
        cmd = "samtools view -bt {0} {1} -F 4 -o {2}".\
            format(faifile, samfile, bamfile)
    else:
        cmd = "samtools view -bS {0} -F 4 -o {1}".\
            format(samfile, bamfile)

    cmd += " -@ {0}".format(cpus)
    if opts.unique:
        cmd += " -q 1"

    if samfile.endswith(".sam") and need_update(samfile, bamfile):
        sh(cmd)

    # Already sorted?
    if bamfile.endswith(".sorted.bam"):
        sortedbamfile = bamfile
    else:
        prefix = bamfile.replace(".bam", "")
        sortedbamfile = prefix + ".sorted.bam"

    if need_update(bamfile, sortedbamfile):
        cmd = "samtools sort {0} {1}.sorted".format(bamfile, prefix)
        cmd += " -@ {0}".format(cpus)
        sh(cmd)

    baifile = sortedbamfile + ".bai"
    if need_update(sortedbamfile, baifile):
        sh("samtools index {0}".format(sortedbamfile))

    return sortedbamfile
def mito(args):
    """
    %prog mito chrM.fa input.bam

    Identify mitochondrial deletions.
    """
    p = OptionParser(mito.__doc__)
    p.set_aws_opts(store="hli-mv-data-science/htang/mito-deletions")
    p.add_option("--realignonly", default=False, action="store_true",
                 help="Realign only")
    p.add_option("--svonly", default=False, action="store_true",
                 help="Run Realign => SV calls only")
    p.add_option("--support", default=1, type="int",
                 help="Minimum number of supporting reads")
    p.set_home("speedseq", default="/mnt/software/speedseq/bin")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    chrMfa, bamfile = args
    store = opts.output_path
    cleanup = not opts.nocleanup

    if not op.exists(chrMfa):
        logging.debug("File `{}` missing. Exiting.".format(chrMfa))
        return

    chrMfai = chrMfa + ".fai"
    if not op.exists(chrMfai):
        # faidx (not index) builds the .fai for a FASTA reference
        cmd = "samtools faidx {}".format(chrMfa)
        sh(cmd)

    if not bamfile.endswith(".bam"):
        bamfiles = [x.strip() for x in open(bamfile)]
    else:
        bamfiles = [bamfile]

    if store:
        computed = ls_s3(store)
        computed = [op.basename(x).split('.')[0] for x in computed
                    if x.endswith(".depth")]
        remaining_samples = [x for x in bamfiles
                             if op.basename(x).split(".")[0] not in computed]

        logging.debug("Already computed on `{}`: {}".
                      format(store, len(bamfiles) - len(remaining_samples)))
        bamfiles = remaining_samples

    logging.debug("Total samples: {}".format(len(bamfiles)))

    for bamfile in bamfiles:
        run_mito(chrMfa, bamfile, opts,
                 realignonly=opts.realignonly,
                 svonly=opts.svonly,
                 store=store, cleanup=cleanup)
def tophat(args): """ %prog tophat folder reference Run tophat on a folder of reads. """ from jcvi.apps.bowtie import check_index from jcvi.formats.fastq import guessoffset p = OptionParser(tophat.__doc__) p.add_option("--gtf", help="Reference annotation [default: %default]") p.add_option("--single", default=False, action="store_true", help="Single end mapping") p.add_option("--intron", default=15000, type="int", help="Max intron size [default: %default]") p.add_option("--dist", default=-50, type="int", help="Mate inner distance [default: %default]") p.add_option("--stdev", default=50, type="int", help="Mate standard deviation [default: %default]") p.set_phred() p.set_cpus() opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) num = 1 if opts.single else 2 folder, reference = args reference = check_index(reference) for p, prefix in iter_project(folder, n=num): outdir = "{0}_tophat".format(prefix) outfile = op.join(outdir, "accepted_hits.bam") if op.exists(outfile): logging.debug("File `{0}` found. Skipping.".format(outfile)) continue cmd = "tophat -p {0}".format(opts.cpus) if opts.gtf: cmd += " -G {0}".format(opts.gtf) cmd += " -o {0}".format(outdir) if num == 1: # Single-end a, = p else: # Paired-end a, b = p cmd += " --max-intron-length {0}".format(opts.intron) cmd += " --mate-inner-dist {0}".format(opts.dist) cmd += " --mate-std-dev {0}".format(opts.stdev) phred = opts.phred or str(guessoffset([a])) if phred == "64": cmd += " --phred64-quals" cmd += " {0} {1}".format(reference, " ".join(p)) sh(cmd)
def prepare(args):
    """
    %prog prepare genomesize *.fastq

    Prepare MERACULOUS configuration file. Genome size should be entered in Mb.
    """
    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("-K", default=51, type="int", help="K-mer size")
    p.set_cpus(cpus=32)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    genomesize = float(args[0]) / 1000
    fnames = args[1:]
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    s = comment_banner("Meraculous params file") + "\n"
    s += comment_banner("Basic parameters") + "\n"
    s += "# Describe the libraries ( one line per library )\n"
    s += "# " + " ".join(header.split()) + "\n"

    libs = get_libs(fnames)
    lib_seqs = []
    rank = 0
    for lib, fs in libs:
        size = lib.size
        if size == 0:
            continue
        rank += 1
        library_name = lib.library_name
        name = library_name.replace("-", "")
        wildcard = "{0}*.1.*,{0}*.2.*".format(library_name)
        rl = max(readlen([x]) for x in fs)
        lib_seq = lib.get_lib_seq(wildcard, name, rl, rank)
        lib_seqs.append(lib_seq)

    s += "\n" + "\n".join(load_csv(None, lib_seqs, sep=" ")) + "\n"
    params = [("genome_size", genomesize),
              ("is_diploid", 0),
              ("mer_size", opts.K),
              ("num_prefix_blocks", 1),
              ("no_read_validation", 0),
              ("local_num_procs", opts.cpus)]
    s += "\n" + "\n".join(load_csv(None, params, sep=" ")) + "\n"

    cfgfile = "meraculous.config"
    write_file(cfgfile, s, tee=True)

    s = "~/export/meraculous/bin/run_meraculous.sh -c {0}".format(cfgfile)
    runsh = "run.sh"
    write_file(runsh, s)
def deduplicate(args): """ %prog deduplicate fastafile Wraps `cd-hit-est` to remove duplicate sequences. """ p = OptionParser(deduplicate.__doc__) p.set_align(pctid=96, pctcov=0) p.add_option("--fast", default=False, action="store_true", help="Place sequence in the first cluster") p.add_option("--consensus", default=False, action="store_true", help="Compute consensus sequences") p.add_option("--reads", default=False, action="store_true", help="Use `cd-hit-454` to deduplicate [default: %default]") p.add_option("--samestrand", default=False, action="store_true", help="Enforce same strand alignment") p.set_home("cdhit") p.set_cpus() opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) fastafile, = args identity = opts.pctid / 100. fastafile, qualfile = fasta([fastafile, "--seqtk"]) ocmd = "cd-hit-454" if opts.reads else "cd-hit-est" cmd = op.join(opts.cdhit_home, ocmd) cmd += " -c {0}".format(identity) if ocmd == "cd-hit-est": cmd += " -d 0" # include complete defline if opts.samestrand: cmd += " -r 0" if not opts.fast: cmd += " -g 1" if opts.pctcov != 0: cmd += " -aL {0} -aS {0}".format(opts.pctcov / 100.) dd = fastafile + ".P{0}.cdhit".format(opts.pctid) clstr = dd + ".clstr" cmd += " -M 0 -T {0} -i {1} -o {2}".format(opts.cpus, fastafile, dd) if need_update(fastafile, (dd, clstr)): sh(cmd) if opts.consensus: cons = dd + ".consensus" cmd = op.join(opts.cdhit_home, "cdhit-cluster-consensus") cmd += " clustfile={0} fastafile={1} output={2} maxlen=1".\ format(clstr, fastafile, cons) if need_update((clstr, fastafile), cons): sh(cmd) return dd
def blasr(args):
    """
    %prog blasr ref.fasta fofn

    Run blasr on a set of PacBio reads. This is based on a divide-and-conquer
    strategy described below.
    """
    from jcvi.apps.grid import MakeManager
    from jcvi.utils.iter import grouper

    p = OptionParser(blasr.__doc__)
    p.set_cpus(cpus=8)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    reffasta, fofn = args
    flist = sorted([x.strip() for x in open(fofn)])
    h5list = []
    mm = MakeManager()
    for i, fl in enumerate(grouper(flist, 3)):
        chunkname = "chunk{0:03d}".format(i)
        fn = chunkname + ".fofn"
        h5 = chunkname + ".cmp.h5"
        fw = open(fn, "w")
        # grouper() pads the final chunk with None; drop the padding
        print >> fw, "\n".join(x for x in fl if x)
        fw.close()
        cmd = "pbalign {0} {1} {2}".format(fn, reffasta, h5)
        cmd += " --nproc {0} --forQuiver --tmpDir .".format(opts.cpus)
        mm.add((fn, reffasta), h5, cmd)
        h5list.append(h5)

    # Merge h5, sort and repack
    allh5 = "all.cmp.h5"
    tmph5 = "tmp.cmp.h5"
    cmd_merge = "cmph5tools.py merge --outFile {0}".format(allh5)
    cmd_merge += " " + " ".join(h5list)
    cmd_sort = "cmph5tools.py sort --deep {0} --tmpDir .".format(allh5)
    cmd_repack = "h5repack -f GZIP=1 {0} {1}".format(allh5, tmph5)
    cmd_repack += " && mv {0} {1}".format(tmph5, allh5)
    mm.add(h5list, allh5, [cmd_merge, cmd_sort, cmd_repack])

    # Quiver
    pf = reffasta.rsplit(".", 1)[0]
    variantsgff = pf + ".variants.gff"
    consensusfasta = pf + ".consensus.fasta"
    cmd_faidx = "samtools faidx {0}".format(reffasta)
    cmd = "quiver -j 32 {0}".format(allh5)
    cmd += " -r {0} -o {1} -o {2}".format(reffasta, variantsgff, consensusfasta)
    mm.add(allh5, consensusfasta, [cmd_faidx, cmd])

    mm.write()
def optimize(args): """ %prog optimize test.clm Optimize the contig order and orientation, based on CLM file. """ p = OptionParser(optimize.__doc__) p.add_option("--skiprecover", default=False, action="store_true", help="Do not import 'recover' contigs") p.add_option("--startover", default=False, action="store_true", help="Do not resume from existing tour file") p.add_option("--skipGA", default=False, action="store_true", help="Skip GA step") p.set_outfile(outfile=None) p.set_cpus() opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) clmfile, = args startover = opts.startover runGA = not opts.skipGA cpus = opts.cpus # Load contact map clm = CLMFile(clmfile, skiprecover=opts.skiprecover) tourfile = opts.outfile or clmfile.rsplit(".", 1)[0] + ".tour" if startover: tourfile = None tour = clm.activate(tourfile=tourfile) fwtour = open(tourfile, "w") # Store INIT tour print_tour(fwtour, clm.tour, "INIT", clm.active_contigs, clm.oo, signs=clm.signs) if runGA: for phase in range(1, 3): tour = optimize_ordering(fwtour, clm, phase, cpus) tour = clm.prune_tour(tour, cpus) # Flip orientations phase = 1 while True: tag1, tag2 = optimize_orientations(fwtour, clm, phase, cpus) if tag1 == REJECT and tag2 == REJECT: logging.debug("Terminating ... no more {}".format(ACCEPT)) break phase += 1 fwtour.close()
def cluster(args): """ %prog cluster prefix fastqfiles Use `vsearch` to remove duplicate reads. This routine is heavily influenced by PyRAD: <https://github.com/dereneaton/pyrad>. """ p = OptionParser(cluster.__doc__) add_consensus_options(p) p.set_align(pctid=95) p.set_outdir() p.set_cpus() opts, args = p.parse_args(args) if len(args) < 2: sys.exit(not p.print_help()) prefix = args[0] fastqfiles = args[1:] cpus = opts.cpus pctid = opts.pctid mindepth = opts.mindepth minlength = opts.minlength fastafile, qualfile = fasta(fastqfiles + ["--seqtk", "--outdir={0}".format(opts.outdir), "--outfile={0}".format(prefix + ".fasta")]) prefix = op.join(opts.outdir, prefix) pf = prefix + ".P{0}".format(pctid) derepfile = prefix + ".derep" if need_update(fastafile, derepfile): derep(fastafile, derepfile, minlength, cpus) userfile = pf + ".u" notmatchedfile = pf + ".notmatched" if need_update(derepfile, userfile): cluster_smallmem(derepfile, userfile, notmatchedfile, minlength, pctid, cpus) clustfile = pf + ".clust" if need_update((derepfile, userfile, notmatchedfile), clustfile): makeclust(derepfile, userfile, notmatchedfile, clustfile, mindepth=mindepth) clustSfile = pf + ".clustS" if need_update(clustfile, clustSfile): parallel_musclewrap(clustfile, cpus) statsfile = pf + ".stats" if need_update(clustSfile, statsfile): makestats(clustSfile, statsfile, mindepth=mindepth)
def cufflinks(args): """ %prog cufflinks folder reference Run cufflinks on a folder containing tophat results. """ p = OptionParser(cufflinks.__doc__) p.add_option("--gtf", help="Reference annotation [default: %default]") p.set_cpus() opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) folder, reference = args cpus = opts.cpus gtf = opts.gtf transcripts = "transcripts.gtf" mm = MakeManager() gtfs = [] for bam in iglob(folder, "*.bam"): pf = op.basename(bam).split(".")[0] outdir = pf + "_cufflinks" cmd = "cufflinks" cmd += " -o {0}".format(outdir) cmd += " -p {0}".format(cpus) if gtf: cmd += " -g {0}".format(gtf) cmd += " --frag-bias-correct {0}".format(reference) cmd += " --multi-read-correct" cmd += " {0}".format(bam) cgtf = op.join(outdir, transcripts) mm.add(bam, cgtf, cmd) gtfs.append(cgtf) assemblylist = "assembly_list.txt" cmd = 'find . -name "{0}" > {1}'.format(transcripts, assemblylist) mm.add(gtfs, assemblylist, cmd) mergedgtf = "merged/merged.gtf" cmd = "cuffmerge" cmd += " -o merged" cmd += " -p {0}".format(cpus) if gtf: cmd += " -g {0}".format(gtf) cmd += " -s {0}".format(reference) cmd += " {0}".format(assemblylist) mm.add(assemblylist, mergedgtf, cmd) mm.write()
def mcluster(args): """ %prog mcluster *.consensus Cluster across samples using consensus sequences. """ p = OptionParser(mcluster.__doc__) add_consensus_options(p) p.set_cpus() opts, args = p.parse_args(args) if len(args) < 1: sys.exit(not p.print_help()) consensusfiles = args minlength = opts.minlength cpus = opts.cpus pf = opts.prefix pctid = find_pctid(consensusfiles) pf += ".P{0}".format(pctid) consensusfile = pf + ".consensus.fasta" if need_update(consensusfiles, consensusfile): fw_cons = must_open(consensusfile, "w") totalseqs = 0 for cf in consensusfiles: nseqs = 0 s = op.basename(cf).split(".")[0] for name, seq in parse_fasta(cf): name = ".".join((s, name)) print(">{0}\n{1}".format(name, seq), file=fw_cons) nseqs += 1 logging.debug("Read `{0}`: {1} seqs".format(cf, nseqs)) totalseqs += nseqs logging.debug("Total: {0} seqs".format(totalseqs)) fw_cons.close() userfile = pf + ".u" notmatchedfile = pf + ".notmatched" if need_update(consensusfile, userfile): cluster_smallmem(consensusfile, userfile, notmatchedfile, minlength, pctid, cpus) clustfile = pf + ".clust" if need_update((consensusfile, userfile, notmatchedfile), clustfile): makeclust(consensusfile, userfile, notmatchedfile, clustfile) clustSfile = pf + ".clustS" if need_update(clustfile, clustSfile): parallel_musclewrap(clustfile, cpus, minsamp=opts.minsamp)
def compilevcf(args): """ %prog compilevcf samples.csv Compile vcf results into master spreadsheet. """ p = OptionParser(compilevcf.__doc__) p.add_option("--db", default="hg38", help="Use these lobSTR db") p.add_option("--stutter", default=False, action="store_true", help="Count stutter reads on chrY") p.add_option("--nofilter", default=False, action="store_true", help="Do not filter the variants") p.set_home("lobstr") p.set_cpus() p.set_aws_opts(store="hli-mv-data-science/htang/str-data") opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) samples, = args workdir = opts.workdir store = opts.output_path stutter = opts.stutter cleanup = not opts.nocleanup filtered = not opts.nofilter dbs = opts.db.split(",") cwd = os.getcwd() mkdir(workdir) os.chdir(workdir) samples = op.join(cwd, samples) stridsfile = "STR.ids" vcffiles = [x.strip() for x in must_open(samples)] if not op.exists(stridsfile): ids = [] for db in dbs: ids.extend(STRFile(opts.lobstr_home, db=db).ids) uids = uniqify(ids) logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids))) fw = open(stridsfile, "w") print >> fw, "\n".join(uids) fw.close() run_args = [(x, filtered, cleanup, store, stutter) for x in vcffiles] cpus = min(opts.cpus, len(run_args)) p = Pool(processes=cpus) for res in p.map_async(run_compile, run_args).get(): continue
def align(args): """ %prog align clustfile Align clustfile to clustSfile. Useful for benchmarking aligners. """ p = OptionParser(align.__doc__) p.set_cpus() opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) clustfile, = args parallel_musclewrap(clustfile, opts.cpus)
def mappability(args):
    """
    %prog mappability reference.fasta

    Generate 50mer mappability for reference genome. Commands are based on gem
    mapper. See instructions:
    <https://github.com/xuefzhao/Reference.Mappability>
    """
    p = OptionParser(mappability.__doc__)
    p.add_option("--mer", default=50, type="int", help="Mer size")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    ref, = args
    K = opts.mer
    pf = ref.rsplit(".", 1)[0]
    mm = MakeManager()

    gem = pf + ".gem"
    cmd = "gem-indexer -i {} -o {}".format(ref, pf)
    mm.add(ref, gem, cmd)

    mer = pf + ".{}mer".format(K)
    mapb = mer + ".mappability"
    cmd = "gem-mappability -I {} -l {} -o {} -T {}".\
        format(gem, K, mer, opts.cpus)
    mm.add(gem, mapb, cmd)

    wig = mer + ".wig"
    cmd = "gem-2-wig -I {} -i {} -o {}".format(gem, mapb, mer)
    mm.add(mapb, wig, cmd)

    bw = mer + ".bw"
    cmd = "wigToBigWig {} {}.sizes {}".format(wig, mer, bw)
    mm.add(wig, bw, cmd)

    bg = mer + ".bedGraph"
    cmd = "bigWigToBedGraph {} {}".format(bw, bg)
    mm.add(bw, bg, cmd)

    merged = mer + ".filtered-1.merge.bed"
    cmd = "python -m jcvi.formats.bed filterbedgraph {} 1".format(bg)
    mm.add(bg, merged, cmd)

    mm.write()
def layout(args): """ %prog layout query.subject.simple query.seqids subject.seqids Compute optimal seqids order in a second genome, based on seqids on one genome, given the pairwise blocks in .simple format. """ from jcvi.algorithms.ec import GA_setup, GA_run p = OptionParser(layout.__doc__) p.set_beds() p.set_cpus(cpus=32) opts, args = p.parse_args(args) if len(args) != 3: sys.exit(not p.print_help()) simplefile, qseqids, sseqids = args qbed, sbed, qorder, sorder, is_self = check_beds(simplefile, p, opts) qseqids = qseqids.strip().split(",") sseqids = sseqids.strip().split(",") qseqids_ii = dict((s, i) for i, s in enumerate(qseqids)) sseqids_ii = dict((s, i) for i, s in enumerate(sseqids)) blocks = SimpleFile(simplefile).blocks scores = defaultdict(int) for a, b, c, d, score, orientation, hl in blocks: qi, q = qorder[a] si, s = sorder[c] qseqid, sseqid = q.seqid, s.seqid if sseqid not in sseqids: continue scores[sseqids_ii[sseqid], qseqid] += score data = [] for (a, b), score in sorted(scores.items()): if b not in qseqids_ii: continue data.append((qseqids_ii[b], score)) tour = range(len(qseqids)) toolbox = GA_setup(tour) toolbox.register("evaluate", colinear_evaluate_weights, data=data) tour, fitness = GA_run(toolbox, ngen=100, npop=100, cpus=opts.cpus) tour = [qseqids[x] for x in tour] print ",".join(tour)
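# layout() above scores candidate orders with colinear_evaluate_weights(),
# defined elsewhere. A minimal sketch of the fitness it is assumed to compute
# (hypothetical, O(n^2)): the heaviest increasing subsequence of
# (position, weight) hits taken in tour order, so colinear orders score
# highest.
def colinear_weight_sketch(hits):
    best = []  # (position, best subsequence weight ending here)
    for pos, w in hits:
        prev = max([b for p, b in best if p < pos] or [0])
        best.append((pos, prev + w))
    return max([b for _, b in best] or [0])

# (1,5) then (2,4) is the heaviest increasing run: 5 + 4 = 9
assert colinear_weight_sketch([(1, 5), (3, 2), (2, 4)]) == 9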
def bes(args): """ %prog bes bacfasta clonename Use the clone name to download BES gss sequences from Genbank, map and then visualize. """ from jcvi.apps.align import run_blat p = OptionParser(bes.__doc__) p.set_cpus() opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) bacfasta, clonename = args entrez([clonename, "--database=nucgss", "--skipcheck"]) besfasta = clonename + ".fasta" blatfile = clonename + ".bes.blat" run_blat(infile=besfasta, outfile=blatfile, db=bacfasta, \ pctid=95, hitlen=100, cpus=opts.cpus) aid, asize = Fasta(bacfasta).itersizes().next() width = 50 msg = "=" * width msg += " " + aid print >> sys.stderr, msg ratio = width * 1. / asize _ = lambda x: int(round(x * ratio, 0)) blasts = [BlastLine(x) for x in open(blatfile)] for b in blasts: if b.orientation == '+': msg = " " * _(b.sstart) + "->" else: msg = " " * (_(b.sstop) - 2) + "<-" msg += " " * (width - len(msg) + 2) msg += b.query if b.orientation == '+': msg += " (hang={0})".format(b.sstart - 1) else: msg += " (hang={0})".format(asize - b.sstop) print >> sys.stderr, msg
def alignextend(args): """ %prog alignextend ref.fasta read.1.fastq read.2.fastq Wrapper around AMOS alignextend. """ choices = "prepare,align,filter,rmdup,genreads".split(",") p = OptionParser(alignextend.__doc__) p.add_option("--nosuffix", default=False, action="store_true", help="Do not add /1/2 suffix to the read [default: %default]") p.add_option("--rc", default=False, action="store_true", help="Reverse complement the reads before alignment") p.add_option("--len", default=100, type="int", help="Extend to this length") p.add_option("--stage", default="prepare", choices=choices, help="Start from certain stage") p.add_option("--dup", default=10, type="int", help="Filter duplicates with coordinates within this distance") p.add_option("--maxdiff", default=1, type="int", help="Maximum number of differences") p.set_home("amos") p.set_cpus() opts, args = p.parse_args(args) if len(args) != 3: sys.exit(not p.print_help()) ref, r1, r2 = args pf = op.basename(r1).split(".")[0] cmd = op.join(opts.amos_home, "src/Experimental/alignextend.pl") if not opts.nosuffix: cmd += " -suffix" bwa_idx = "{0}.ref.fa.sa".format(pf) if not need_update(ref, bwa_idx): cmd += " -noindex" cmd += " -threads {0}".format(opts.cpus) offset = guessoffset([r1]) if offset == 64: cmd += " -I" if opts.rc: cmd += " -rc" cmd += " -allow -len {0} -dup {1}".format(opts.len, opts.dup) cmd += " -min {0} -max {1}".format(2 * opts.len, 20 * opts.len) cmd += " -maxdiff {0}".format(opts.maxdiff) cmd += " -stage {0}".format(opts.stage) cmd += " ".join(("", pf, ref, r1, r2)) sh(cmd)
def gmap(args): """ %prog gmap database.fasta fastafile Wrapper for `gmap`. """ p = OptionParser(gmap.__doc__) p.add_option("--cross", default=False, action="store_true", help="Cross-species alignment") p.add_option( "--npaths", default=0, type="int", help="Maximum number of paths to show." " If set to 0, prints two paths if chimera" " detected, else one.", ) p.set_cpus() opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) dbfile, fastafile = args assert op.exists(dbfile) and op.exists(fastafile) prefix = get_prefix(fastafile, dbfile) logfile = prefix + ".log" gmapfile = prefix + ".gmap.gff3" if not need_update((dbfile, fastafile), gmapfile): logging.error("`{0}` exists. `gmap` already run.".format(gmapfile)) else: dbdir, dbname = check_index(dbfile) cmd = "gmap -D {0} -d {1}".format(dbdir, dbname) cmd += " -f 2 --intronlength=100000" # Output format 2 cmd += " -t {0}".format(opts.cpus) cmd += " --npaths {0}".format(opts.npaths) if opts.cross: cmd += " --cross-species" cmd += " " + fastafile sh(cmd, outfile=gmapfile, errfile=logfile) return gmapfile, logfile
def clean(args):
    """
    %prog clean 1.fastq 2.fastq [insertsize]

    Clean and dedup paired FASTQ files.
    """
    p = OptionParser(clean.__doc__)
    p.add_option("-a", default=0, type="int",
                 help="Trim length at 5' end [default: %default]")
    p.add_option("-b", default=50, type="int",
                 help="Trim length at 3' end [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) == 2:
        p1, p2 = args
        size = get_size(p1)
    elif len(args) == 3:
        p1, p2, size = args
        size = int(size)
    else:
        sys.exit(not p.print_help())

    pf = p1.split(".")[0]
    cpus = opts.cpus

    offset = guessoffset([p1])
    a, b = opts.a, opts.b

    p1_clean = p1 + ".clean"
    p1_cleangz = p1_clean + ".gz"
    p2_clean = p2 + ".clean"
    p2_cleangz = p2_clean + ".gz"
    if need_update([p1, p2], [p1_cleangz, p2_cleangz]):
        cmd = "SOAPfilter_v2.0 -t {0} -m 2000000 -p -y -z -g".format(cpus)
        cmd += " -q {0} -w 10 -B 50 -f 0".format(offset)
        cmd += " -l {0} -a {1} -b {2} -c {1} -d {2}".format(size, a, b)
        cmd += " {0} {1} {2}.clean.stat {3} {4}".\
            format(p1, p2, pf, p1_clean, p2_clean)
        sh(cmd)
def mergebam(args):
    """
    %prog mergebam dir1 dir2 homo_outdir
    or
    %prog mergebam dir1 dir2/20.bam het_outdir

    Merge sets of BAMs to make diploid. Two modes:
    - Homozygous mode: pair-up the bams in the two folders and merge
    - Heterozygous mode: pair the bams in first folder with a particular bam
    """
    p = OptionParser(mergebam.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    idir1, idir2, outdir = args
    dir1 = [idir1] if idir1.endswith(".bam") else iglob(idir1, "*.bam")
    dir2 = [idir2] if idir2.endswith(".bam") else iglob(idir2, "*.bam")
    nbams1 = len(dir1)
    nbams2 = len(dir2)
    # Make sure the first pile contains at least as many bams; keep the
    # counts in sync with the lists when swapping
    if nbams1 < nbams2:
        dir1, dir2 = dir2, dir1
        nbams1, nbams2 = nbams2, nbams1

    if nbams1 == nbams2:
        logging.debug("Homozygous mode")
    elif nbams1 > nbams2:
        assert nbams2 == 1, "Second pile must contain a single bam"
        dir2 = dir2 * nbams1

    assert len(dir1) == len(dir2), "Two piles must contain same number of bams"
    cmd = "samtools merge {} {} {} && samtools index {}"
    cmds = []
    mkdir(outdir)
    for a, b in zip(dir1, dir2):
        ia = op.basename(a).split(".")[0]
        ib = op.basename(b).split(".")[0]
        outfile = op.join(outdir, "{}_{}.bam".format(ia, ib))
        cmds.append(cmd.format(outfile, a, b, outfile))

    p = Parallel(cmds, cpus=opts.cpus)
    p.run()
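# The pairing rule in mergebam() above, in isolation: homozygous mode zips
# the two piles one-to-one; heterozygous mode broadcasts the single bam in
# the second pile across the first.
def pair_bams(dir1, dir2):
    if len(dir2) == 1:
        dir2 = dir2 * len(dir1)
    return zip(dir1, dir2)

assert list(pair_bams(["a.bam", "b.bam"], ["x.bam"])) == \
    [("a.bam", "x.bam"), ("b.bam", "x.bam")]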
def fill(args): """ %prog fill frag_reads_corr.fastb Run FillFragments on `frag_reads_corr.fastb`. """ p = OptionParser(fill.__doc__) p.add_option( "--stretch", default=3, type="int", help="MAX_STRETCH to pass to FillFragments", ) p.set_cpus() opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) (fastb, ) = args assert fastb == "frag_reads_corr.fastb" pcfile = "frag_reads_corr.k28.pc.info" nthreads = " NUM_THREADS={0}".format(opts.cpus) maxstretch = " MAX_STRETCH={0}".format(opts.stretch) if need_update(fastb, pcfile): cmd = "PathReads READS_IN=frag_reads_corr" cmd += nthreads sh(cmd) filledfastb = "filled_reads.fastb" if need_update(pcfile, filledfastb): cmd = "FillFragments PAIRS_OUT=frag_reads_corr_cpd" cmd += " PRECORRECT_LIBSTATS=True" cmd += maxstretch cmd += nthreads sh(cmd) filledfasta = "filled_reads.fasta" if need_update(filledfastb, filledfasta): cmd = "Fastb2Fasta IN=filled_reads.fastb OUT=filled_reads.fasta" sh(cmd)
def cp(args): """ %prog cp "s3://hli-mv-data-science/htang/str/*.csv" . Copy files to folder. Accepts list of s3 addresses as input. """ p = OptionParser(cp.__doc__) p.add_option("--force", default=False, action="store_true", help="Force overwrite if exists") p.set_cpus() opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) store, folder = args force = opts.force cpus = opts.cpus if op.exists(store): contents = [x.strip().split(",") for x in open(store)] else: contents = glob_s3(store) tasks = [] for c in contents: if isinstance(c, basestring): oc = op.basename(c) tc = op.join(folder, oc) else: if len(c) == 2: c, tc = c else: c, = c tc = op.basename(c) tasks.append((c, tc, force)) worker_pool = Pool(cpus) worker_pool.map(worker, tasks) worker_pool.close() worker_pool.join()
def nucmer(args): """ %prog nucmer ref.fasta query.fasta Run NUCMER using query against reference. Parallel implementation derived from: <https://github.com/fritzsedlazeck/sge_mummer> """ from itertools import product from jcvi.apps.grid import MakeManager from jcvi.formats.base import split p = OptionParser(nucmer.__doc__) p.add_option("--chunks", type="int", help="Split both query and subject into chunks") p.set_params(prog="nucmer", params="-g 5000 -l 24 -c 500") p.set_cpus() opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) ref, query = args cpus = opts.cpus nrefs = nqueries = opts.chunks or int(cpus**.5) refdir = ref.split(".")[0] + "-outdir" querydir = query.split(".")[0] + "-outdir" reflist = split([ref, refdir, str(nrefs)]).names querylist = split([query, querydir, str(nqueries)]).names mm = MakeManager() for i, (r, q) in enumerate(product(reflist, querylist)): pf = "{0:04d}".format(i) cmd = "nucmer -maxmatch" cmd += " {0}".format(opts.extra) cmd += " {0} {1} -p {2}".format(r, q, pf) deltafile = pf + ".delta" mm.add((r, q), deltafile, cmd) print cmd mm.write()
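# The default chunking in nucmer() above, worked through: without --chunks,
# both inputs are split into ~sqrt(cpus) pieces, so the all-vs-all grid of
# chunk pairs yields roughly one nucmer job per cpu.
def default_chunk_jobs(cpus):
    chunks = int(cpus ** .5)
    return chunks * chunks

assert default_chunk_jobs(16) == 16  # 4 ref x 4 query chunks, one job per cpu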
def blast(args):
    """
    %prog blast ref.fasta query.fasta

    Calls blast and then filters the BLAST hits. Default is megablast.
    """
    task_choices = ("blastn", "blastn-short", "dc-megablast",
                    "megablast", "vecscreen")
    p = OptionParser(blast.__doc__)
    p.set_align(pctid=0, evalue=.01)
    p.add_option("--wordsize", type="int", help="Word size [default: %default]")
    p.add_option("--best", default=1, type="int",
                 help="Only look for best N hits [default: %default]")
    p.add_option("--task", default="megablast", choices=task_choices,
                 help="Task of the blastn [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    reffasta, queryfasta = args
    blastfile = get_outfile(reffasta, queryfasta)

    run_megablast(infile=queryfasta, outfile=blastfile, db=reffasta,
                  wordsize=opts.wordsize, pctid=opts.pctid, evalue=opts.evalue,
                  hitlen=None, best=opts.best, task=opts.task, cpus=opts.cpus)

    return blastfile
def compile(args):
    """
    %prog compile samples.csv

    Compile vcf results into master spreadsheet.
    """
    p = OptionParser(compile.__doc__)
    p.add_option("--db", default="hg38", help="Use these lobSTR db")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samples, = args
    workdir = opts.workdir
    # set_aws_opts() registers --output_path and --nocleanup, as used by
    # filtervcf() and compilevcf() in this module
    store = opts.output_path
    cleanup = not opts.nocleanup
    dbs = opts.db.split(",")
    cwd = os.getcwd()
    mkdir(workdir)
    os.chdir(workdir)
    samples = op.join(cwd, samples)

    stridsfile = "STR.ids"
    vcffiles = [x.strip() for x in must_open(samples)]
    if not op.exists(stridsfile):
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        fw = open(stridsfile, "w")
        print >> fw, "\n".join(uids)
        fw.close()

    p = Pool(processes=opts.cpus)
    run_args = [(x, store, cleanup) for x in vcffiles]
    for res in p.map_async(run, run_args).get():
        continue
def close(args): """ %prog close scaffolds.fasta PE*.fastq Run GapFiller to fill gaps. """ p = OptionParser(close.__doc__) p.set_home("gapfiller") p.set_cpus() opts, args = p.parse_args(args) if len(args) < 1: sys.exit(not p.print_help()) scaffolds = args[0] libtxt = write_libraries(args[1:], aligner="bwa") cmd = "perl " + op.join(opts.gapfiller_home, "GapFiller.pl") cmd += " -l {0} -s {1} -T {2}".format(libtxt, scaffolds, opts.cpus) runsh = "run.sh" write_file(runsh, cmd)
def scaffold(args): """ %prog scaffold contigs.fasta MP*.fastq Run SSPACE scaffolding. """ p = OptionParser(scaffold.__doc__) p.set_home("sspace") p.set_cpus() opts, args = p.parse_args(args) if len(args) < 1: sys.exit(not p.print_help()) contigs = args[0] libtxt = write_libraries(args[1:]) cmd = "perl " + op.join(opts.sspace_home, "SSPACE_Basic_v2.0.pl") cmd += " -l {0} -s {1} -T {2}".format(libtxt, contigs, opts.cpus) runsh = "run.sh" write_file(runsh, cmd)
def prepare(args): """ %prog prepare alignAssembly.config est.fasta ref.fasta Generate PASA run script. """ p = OptionParser(prepare.__doc__) p.set_home("pasa") p.set_cpus() opts, args = p.parse_args(args) if len(args) != 3: sys.exit(not p.print_help()) cfg, est, ref = args phome = opts.pasa_home cmd = op.join(phome, "scripts/Launch_PASA_pipeline.pl") cmd += " -c {0} --CPU {1}".format(cfg, opts.cpus) cmd += " -C -R --ALIGNERS blat,gmap" cmd += " -t {0} -g {1}".format(est, ref) runfile = "run.sh" write_file(runfile, cmd, meta="run script")
def correct(args): """ %prog correct *.fastq Correct reads using ErrorCorrection. Only PE will be used to build the K-mer table. """ p = OptionParser(correct.__doc__) p.set_cpus() opts, args = p.parse_args(args) if len(args) < 1: sys.exit(not p.print_help()) lstfile = "reads2cor.lst" fw = open(lstfile, "w") print("\n".join(x for x in args if x[:2] == "PE"), file=fw) fw.close() p1 = args[0] offset = guessoffset([p1]) cpus = opts.cpus freq = "output.freq.cz" freqlen = freq + ".len" if need_update(args, (freq, freqlen)): cmd = "KmerFreq_AR_v2.0 -k 17 -c -1 -q {0}".format(offset) cmd += " -m 1 -t {0}".format(cpus) cmd += " -p output {0}".format(lstfile) sh(cmd) fw = open(lstfile, "w") print("\n".join(args), file=fw) fw.close() cmd = "Corrector_AR_v2.0 -k 17 -l 3 -m 5 -c 5 -a 0 -e 1 -w 0 -r 45" cmd += " -Q {0} -q 30 -x 8 -t {1} -o 1 ".format(offset, cpus) cmd += " {0} {1} {2}".format(freq, freqlen, lstfile) sh(cmd)
def kmc(args): """ %prog kmc folder Run kmc3 on Illumina reads. """ p = OptionParser(kmc.__doc__) p.add_option("-k", default=21, type="int", help="Kmer size") p.add_option("--ci", default=2, type="int", help="Minimum value of a counter") p.add_option("--cs", default=2, type="int", help="Maximal value of a counter") p.add_option("--single", default=False, action="store_true", help="Input is single-end data, only one FASTQ") p.set_cpus() opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) folder, = args K = opts.k n = 1 if opts.single else 2 mm = MakeManager() for p, pf in iter_project(folder, n=n, commonprefix=False): pf = pf.split("_")[0] + ".ms{}".format(K) infiles = pf + ".infiles" fw = open(infiles, "w") print >> fw, "\n".join(p) fw.close() cmd = "kmc -k{} -m64 -t{}".format(K, opts.cpus) cmd += " -ci{} -cs{}".format(opts.ci, opts.cs) cmd += " @{} {} .".format(infiles, pf) outfile = pf + ".kmc_suf" mm.add(p, outfile, cmd) mm.write()
def genemark(args):
    """
    %prog genemark species fastafile

    Train GENEMARK model given fastafile. GENEMARK self-trains so no training
    model gff file is needed.
    """
    p = OptionParser(genemark.__doc__)
    p.add_option("--junctions", help="Path to `junctions.bed` from Tophat2")
    p.set_home("gmes")
    p.set_cpus(cpus=32)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    species, fastafile = args
    junctions = opts.junctions
    mhome = opts.gmes_home
    license = op.expanduser("~/.gm_key")
    assert op.exists(license), "License key ({0}) not found!".format(license)
    cmd = "{0}/gmes_petap.pl --sequence {1}".format(mhome, fastafile)
    cmd += " --cores {0}".format(opts.cpus)
    if junctions:
        intronsgff = "introns.gff"
        if need_update(junctions, intronsgff):
            jcmd = "{0}/bed_to_gff.pl".format(mhome)
            jcmd += " --bed {0} --gff {1} --label Tophat2".\
                format(junctions, intronsgff)
            sh(jcmd)
        cmd += " --ET {0} --et_score 10".format(intronsgff)
    else:
        cmd += " --ES"
    sh(cmd)

    logging.debug("GENEMARK matrix written to `output/gmhmm.mod`")
def meryl(args): """ %prog meryl folder Run meryl on Illumina reads. """ p = OptionParser(meryl.__doc__) p.add_option("-k", default=19, type="int", help="Kmer size") p.set_cpus() opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) (folder, ) = args K = opts.k cpus = opts.cpus mm = MakeManager() for p, pf in iter_project(folder): cmds = [] mss = [] for i, ip in enumerate(p): ms = "{}{}.ms{}".format(pf, i + 1, K) mss.append(ms) cmd = "meryl -B -C -m {} -threads {}".format(K, cpus) cmd += " -s {} -o {}".format(ip, ms) cmds.append(cmd) ams, bms = mss pms = "{}.ms{}".format(pf, K) cmd = "meryl -M add -s {} -s {} -o {}".format(ams, bms, pms) cmds.append(cmd) cmd = "rm -f {}.mcdat {}.mcidx {}.mcdat {}.mcidx".format( ams, ams, bms, bms) cmds.append(cmd) mm.add(p, pms + ".mcdat", cmds) mm.write()
def compare(args): """ %prog compare NA12878_array_hg38.bed *.seg Compare cnv output to known ground truths. """ p = OptionParser(compare.__doc__) p.set_cpus() opts, args = p.parse_args(args) if len(args) < 2: sys.exit(not p.print_help()) truths = args[0] cnvoutputs = args[1:] cpus = min(len(cnvoutputs), opts.cpus) p = Pool(processes=cpus) results = [] files = [(x, truths) for x in cnvoutputs] r = p.map_async(compare_worker, files, callback=results.append) r.wait() for res in results: print("\n".join(res))
def scaffold(args): """ %prog scaffold contigs.fasta MP*.fastq Run SSPACE scaffolding. """ p = OptionParser(scaffold.__doc__) p.set_aligner(aligner="bwa") p.set_home("sspace") p.set_cpus() opts, args = p.parse_args(args) if len(args) < 1: sys.exit(not p.print_help()) contigs = args[0] libtxt = write_libraries(args[1:], aligner=opts.aligner) # Requires getopts.pl which may be missing download("http://web.vims.edu/bridge/bridge2/aw/lib/getopts.pl") cmd = "perl " + op.join(opts.sspace_home, "SSPACE_Standard_v3.0.pl") cmd += " -l {0} -s {1} -T {2}".format(libtxt, contigs, opts.cpus) runsh = "run.sh" write_file(runsh, cmd)
def batchlobstr(args): """ %prog batchlobstr bamlist Run lobSTR on a list of BAMs. The corresponding batch command for TREDPARSE: $ tred.py --toy bamlist --haploid CHR4 --workdir tredparse_results """ p = OptionParser(batchlobstr.__doc__) p.add_option("--haploid", default="chrY,chrM", help="Use haploid model for these chromosomes") p.set_cpus() opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) bamlist, = args cmd = "python -m jcvi.variation.str lobstr TOY" cmd += " --input_bam_path {}" cmd += " --haploid {}".format(opts.haploid) cmds = [cmd.format(x.strip()) for x in open(bamlist).readlines()] p = Parallel(cmds, cpus=opts.cpus) p.run()
def jellyfish(args): """ %prog jellyfish [*.fastq|*.fasta] Run jellyfish to dump histogram to be used in kmer.histogram(). """ from jcvi.apps.base import getfilesize from jcvi.utils.cbook import human_size p = OptionParser(jellyfish.__doc__) p.add_option("-K", default=23, type="int", help="K-mer size") p.add_option( "--coverage", default=40, type="int", help="Expected sequence coverage", ) p.add_option("--prefix", default="jf", help="Database prefix") p.add_option( "--nohist", default=False, action="store_true", help="Do not print histogram", ) p.set_home("jellyfish") p.set_cpus() opts, args = p.parse_args(args) if len(args) < 1: sys.exit(not p.print_help()) fastqfiles = args K = opts.K coverage = opts.coverage totalfilesize = sum(getfilesize(x) for x in fastqfiles) fq = fastqfiles[0] pf = opts.prefix gzip = fq.endswith(".gz") hashsize = totalfilesize / coverage logging.debug("Total file size: {0}, hashsize (-s): {1}".format( human_size(totalfilesize, a_kilobyte_is_1024_bytes=True), hashsize)) jfpf = "{0}-K{1}".format(pf, K) jfdb = jfpf fastqfiles = " ".join(fastqfiles) jfcmd = op.join(opts.jellyfish_home, "jellyfish") cmd = jfcmd cmd += " count -t {0} -C -o {1}".format(opts.cpus, jfpf) cmd += " -s {0} -m {1}".format(hashsize, K) if gzip: cmd = "gzip -dc {0} | ".format(fastqfiles) + cmd + " /dev/fd/0" else: cmd += " " + fastqfiles if need_update(fastqfiles, jfdb): sh(cmd) if opts.nohist: return jfhisto = jfpf + ".histogram" cmd = jfcmd + " histo -t 64 {0} -o {1}".format(jfdb, jfhisto) if need_update(jfdb, jfhisto): sh(cmd)