def batchccn(args):
    """
    %prog batchccn test.csv

    Run CCN script in batch. Write makefile.
    """
    p = OptionParser(batchccn.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (csvfile,) = args
    mm = MakeManager()
    pf = op.basename(csvfile).split(".")[0]
    mkdir(pf)

    # Peek at the first line to decide whether the CSV has a header row: a
    # first line ending in ".bam" is treated as data, so no header is assumed.
    # The builtin next() works on both Python 2.6+ and Python 3 (file.next()
    # does not exist on Python 3), and the `with` block closes the handle.
    with open(csvfile) as fp:
        first_line = next(fp)
    header = None if first_line.strip().endswith(".bam") else "infer"
    logging.debug("Header={}".format(header))
    df = pd.read_csv(csvfile, header=header)

    # Only the output prefix is substituted here; the two remaining "{}"
    # placeholders (sample key, BAM path) are filled in per row below.
    cmd = "perl /mnt/software/ccn_gcn_hg38_script/ccn_gcn_hg38.pl"
    cmd += " -n {} -b {}"
    cmd += " -o {} -r hg38".format(pf)
    for _, (sample_key, bam) in df.iterrows():
        cmdi = cmd.format(sample_key, bam)
        outfile = "{}/{}/{}.ccn".format(pf, sample_key, sample_key)
        mm.add(csvfile, outfile, cmdi)
    mm.write()
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa

    Make lobSTR index. Make sure the FASTA contain only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    p = OptionParser(lobstrindex.__doc__)
    p.add_option(
        "--notreds",
        default=False,
        action="store_true",
        help="Remove TREDs from the bed file",
    )
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trfbed, fastafile = args
    pf = fastafile.split(".")[0]
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.notreds:
        # Rewrite the bed file keeping only the first record seen for each
        # `longname`. Both handles are managed by `with` so they are closed
        # even if parsing a row raises.
        newbedfile = trfbed + ".new"
        retained = total = 0
        seen = set()
        with open(trfbed) as fp, open(newbedfile, "w") as newbed:
            for row in fp:
                r = STRLine(row)
                total += 1
                name = r.longname
                if name in seen:
                    continue
                seen.add(name)
                print(r, file=newbed)
                retained += 1
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    mm = MakeManager()

    # Target 1: the lobSTR reference index
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    # Target 2: the STR info table
    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    # Target 3: a copy of the (possibly filtered) bed file inside the index dir
    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(newbedfile, infofile)
    mm.add(trfbed, infofile, cmd)
    mm.write()
def impute(args):
    """
    %prog impute input.vcf hs37d5.fa 1

    Use IMPUTE2 to impute vcf on chromosome 1.
    """
    from pyfaidx import Fasta

    p = OptionParser(impute.__doc__)
    p.set_home("shapeit")
    p.set_home("impute")
    p.set_ref()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    # `chrom` rather than `chr` so the builtin is not shadowed
    vcffile, fastafile, chrom = args
    mm = MakeManager()
    pf = vcffile.rsplit(".", 1)[0]
    hapsfile = pf + ".haps"
    kg = op.join(opts.ref, "1000GP_Phase3")
    shapeit_phasing(mm, chrom, vcffile, opts)

    fasta = Fasta(fastafile)
    size = len(fasta[chrom])
    binsize = 5000000  # 5Mb bins
    # Ceiling division with integer arithmetic. Plain "/" yields a float on
    # Python 3, which range() rejects.
    bins = (size + binsize - 1) // binsize

    impute_cmd = op.join(opts.impute_home, "impute2")
    # These two paths do not depend on the chunk, so compute them once
    mapfile = "{0}/genetic_map_chr{1}_combined_b37.txt".format(kg, chrom)
    rpf = "{0}/1000GP_Phase3_chr{1}".format(kg, chrom)

    chunks = []
    # Exactly `bins` windows cover [1, size]. The previous `bins + 1` always
    # produced one extra window whose start lay beyond the chromosome end
    # (chunk_start > chunk_end).
    for x in range(bins):
        chunk_start = x * binsize + 1
        chunk_end = min(chunk_start + binsize - 1, size)
        outfile = pf + ".chunk{0:02d}.impute2".format(x)
        cmd = impute_cmd + " -m {0}".format(mapfile)
        cmd += " -known_haps_g {0}".format(hapsfile)
        cmd += " -h {0}.hap.gz -l {0}.legend.gz".format(rpf)
        cmd += " -Ne 20000 -int {0} {1}".format(chunk_start, chunk_end)
        cmd += " -o {0} -allow_large_regions -seed 367946".format(outfile)
        # touch guarantees the make target exists after a successful run
        cmd += " && touch {0}".format(outfile)
        mm.add(hapsfile, outfile, cmd)
        chunks.append(outfile)

    # Combine all the files
    imputefile = pf + ".impute2"
    cmd = "cat {0} > {1}".format(" ".join(chunks), imputefile)
    mm.add(chunks, imputefile, cmd)

    # Convert to vcf
    vcffile = pf + ".impute2.vcf"
    cmd = "python -m jcvi.formats.vcf fromimpute2 {0} {1} {2} > {3}".format(
        imputefile, fastafile, chrom, vcffile
    )
    mm.add(imputefile, vcffile, cmd)
    mm.write()
def minimap(args):
    """
    %prog minimap ref.fasta query.fasta

    Wrap minimap2 aligner using query against sequences. When query and ref
    is the same, we are in "self-scan" mode (e.g. useful for finding internal
    duplications resulted from mis-assemblies).
    """
    from jcvi.apps.grid import MakeManager
    from jcvi.formats.fasta import Fasta

    p = OptionParser(minimap.__doc__)
    p.add_option(
        "--chunks",
        type="int",
        default=2000000,
        help="Split ref.fasta into chunks of size in self-scan mode",
    )
    p.set_outdir(outdir="outdir")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, query = args
    step = opts.chunks
    outdir = opts.outdir

    # Only "self-scan" mode (ref == query) is implemented
    if ref != query:
        raise NotImplementedError

    # Index the reference up front so that parallel make jobs do not race
    # to build the .fai themselves
    sh("samtools faidx {}".format(ref))
    sizes = Fasta(ref)
    mkdir(outdir)
    mm = MakeManager()
    for name, size in sizes.itersizes():
        # Window ends are multiples of the chunk size that are strictly below
        # the sequence length; each window starts one chunk before its end.
        # NOTE(review): the tail after the last multiple (and any sequence not
        # longer than one chunk) gets no window -- confirm this is intended.
        for end in range(step, size, step):
            begin = end - step + 1
            stem = op.join(outdir, "{}_{}_{}".format(name, begin, end))
            fafile = stem + ".fa"
            paffile = stem + ".paf"
            epsfile = stem + ".eps"

            # extract the window, self-align it, then draw the dot plot
            extract_cmd = "samtools faidx {} {}:{}-{} -o {}".format(
                ref, name, begin, end, fafile
            )
            mm.add(ref, fafile, extract_cmd)

            align_cmd = "minimap2 -P {} {} > {}".format(fafile, fafile, paffile)
            mm.add(fafile, paffile, align_cmd)

            plot_cmd = "minidot {} > {}".format(paffile, epsfile)
            mm.add(paffile, epsfile, plot_cmd)
    mm.write()
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa hg38

    Make lobSTR index. Make sure the FASTA contain only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    p = OptionParser(lobstrindex.__doc__)
    p.add_option("--fixseq", action="store_true", default=False,
                 help="Scan sequences to extract perfect STRs")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    trfbed, fastafile, pf = args
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.fixseq:
        # Expand every input record into the records yielded by
        # iter_exact_str() and keep only those reporting is_valid().
        genome = pyfasta.Fasta(fastafile)
        newbedfile = trfbed + ".new"
        retained = total = 0
        # `with` closes both handles even if a row fails to parse. write() is
        # used instead of the Python-2-only "print >> handle" statement.
        with open(trfbed) as fp, open(newbedfile, "w") as newbed:
            for row in fp:
                s = STRLine(row)
                total += 1
                for ns in s.iter_exact_str(genome):
                    if not ns.is_valid():
                        continue
                    newbed.write(str(ns) + "\n")
                    retained += 1
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    mm = MakeManager()

    # Target 1: the lobSTR reference index
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    # Target 2: the STR info table
    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    # Target 3: the original (unfiltered) bed file is copied as index.info
    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(trfbed, infofile)
    mm.add(trfbed, infofile, cmd)
    mm.write()
def trf(args):
    """
    %prog trf outdir

    Run TRF on FASTA files.
    """
    from jcvi.apps.base import iglob

    p = OptionParser(trf.__doc__)
    p.add_option("--mismatch", default=31, type="int",
                 help="Mismatch and gap penalty")
    p.add_option("--minscore", default=MINSCORE, type="int",
                 help="Minimum score to report")
    p.add_option("--period", default=6, type="int",
                 help="Maximum period to report")
    p.add_option("--telomeres", default=False, action="store_true",
                 help="Run telomere search: minscore=140 period=7")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (outdir,) = args
    mm = MakeManager()

    # --telomeres overrides whatever minscore/period were given
    if opts.telomeres:
        opts.minscore = 140
        opts.period = 7

    # The TRF parameters appear space-separated on the command line and
    # dot-separated in the name of the .dat file the command is expected
    # to produce.
    params = "2 {0} {0} 80 10 {1} {2}".format(
        opts.mismatch, opts.minscore, opts.period
    ).split()
    params_cli = " ".join(params)
    params_suffix = ".".join(params)

    per_fasta_beds = []
    for fastafile in natsorted(iglob(outdir, "*.fa,*.fasta")):
        fasta_name = op.basename(fastafile)
        pf = fasta_name.split(".")[0]
        datfile = fasta_name + "." + params_suffix + ".dat"
        out_bed = "{0}.trf.bed".format(pf)

        run_trf = "trf {0} {1} -d -h".format(fastafile, params_cli)

        # Filter the .dat rows, convert spaces to tabs, and prepend the prefix
        to_bed = "".join(
            [
                "cat {} | awk '($8 <= {} && $9 >= 0)'".format(datfile, READLEN),
                " | sed 's/ /\\t/g'",
                " | awk '{{print \"{0}\\t\" $0}}' > {1}".format(pf, out_bed),
            ]
        )

        mm.add(fastafile, datfile, run_trf)
        mm.add(datfile, out_bed, to_bed)
        per_fasta_beds.append(out_bed)

    # Concatenate the per-FASTA bed files into a single trf.bed
    merged_bed = "trf.bed"
    cat_cmd = "cat {0} > {1}".format(" ".join(natsorted(per_fasta_beds)), merged_bed)
    mm.add(per_fasta_beds, merged_bed, cat_cmd)

    mm.write()
def blasr(args):
    """
    %prog blasr ref.fasta fofn

    Run blasr on a set of PacBio reads. This is based on a divide-and-conquer
    strategy described below.
    """
    from jcvi.apps.grid import MakeManager
    from jcvi.utils.iter import grouper

    p = OptionParser(blasr.__doc__)
    p.set_cpus(cpus=8)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    reffasta, fofn = args
    # Read the file-of-filenames with a context manager so the handle is closed
    with open(fofn) as fp:
        flist = sorted(x.strip() for x in fp)

    h5list = []
    mm = MakeManager()
    # Align the inputs three files at a time, one .cmp.h5 per chunk
    for i, fl in enumerate(grouper(flist, 3)):
        chunkname = "chunk{0:03d}".format(i)
        fn = chunkname + ".fofn"
        h5 = chunkname + ".cmp.h5"
        # NOTE(review): grouper presumably pads the last group with None when
        # the file count is not a multiple of 3 -- confirm against
        # jcvi.utils.iter. Dropping None is a no-op if it does not pad.
        members = [x for x in fl if x is not None]
        # write() replaces the Python-2-only "print >> fw" statement
        with open(fn, "w") as fw:
            fw.write("\n".join(members) + "\n")

        cmd = "pbalign {0} {1} {2}".format(fn, reffasta, h5)
        cmd += " --nproc {0} --forQuiver --tmpDir .".format(opts.cpus)
        mm.add((fn, reffasta), h5, cmd)
        h5list.append(h5)

    # Merge h5, sort and repack
    allh5 = "all.cmp.h5"
    tmph5 = "tmp.cmp.h5"
    cmd_merge = "cmph5tools.py merge --outFile {0}".format(allh5)
    cmd_merge += " " + " ".join(h5list)
    cmd_sort = "cmph5tools.py sort --deep {0} --tmpDir .".format(allh5)
    cmd_repack = "h5repack -f GZIP=1 {0} {1}".format(allh5, tmph5)
    cmd_repack += " && mv {0} {1}".format(tmph5, allh5)
    mm.add(h5list, allh5, [cmd_merge, cmd_sort, cmd_repack])

    # Quiver
    pf = reffasta.rsplit(".", 1)[0]
    variantsgff = pf + ".variants.gff"
    consensusfasta = pf + ".consensus.fasta"
    cmd_faidx = "samtools faidx {0}".format(reffasta)
    cmd = "quiver -j 32 {0}".format(allh5)
    cmd += " -r {0} -o {1} -o {2}".format(reffasta, variantsgff, consensusfasta)
    mm.add(allh5, consensusfasta, [cmd_faidx, cmd])

    mm.write()
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa

    Make lobSTR index. Make sure the FASTA contain only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    p = OptionParser(lobstrindex.__doc__)
    p.add_option("--notreds", default=False, action="store_true",
                 help="Remove TREDs from the bed file")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trfbed, fastafile = args
    pf = fastafile.split(".")[0]
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.notreds:
        # Keep only the first record seen for each `longname`
        newbedfile = trfbed + ".new"
        retained = total = 0
        seen = set()
        # `with` closes both handles even on error. write() replaces the
        # Python-2-only "print >> handle" statement and runs unchanged on
        # Python 2 and 3.
        with open(trfbed) as fp, open(newbedfile, "w") as newbed:
            for row in fp:
                r = STRLine(row)
                total += 1
                name = r.longname
                if name in seen:
                    continue
                seen.add(name)
                newbed.write(str(r) + "\n")
                retained += 1
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    mm = MakeManager()

    # Target 1: the lobSTR reference index
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    # Target 2: the STR info table
    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    # Target 3: a copy of the (possibly filtered) bed file inside the index dir
    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(newbedfile, infofile)
    mm.add(trfbed, infofile, cmd)
    mm.write()
def cufflinks(args):
    """
    %prog cufflinks folder reference

    Run cufflinks on a folder containing tophat results.
    """
    p = OptionParser(cufflinks.__doc__)
    p.add_option("--gtf", help="Reference annotation [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    folder, reference = args
    cpus = opts.cpus
    gtf = opts.gtf
    transcripts = "transcripts.gtf"

    mm = MakeManager()

    # One cufflinks run per BAM; each run writes <prefix>_cufflinks/transcripts.gtf
    gtfs = []
    for bam in iglob(folder, "*.bam"):
        outdir = op.basename(bam).split(".")[0] + "_cufflinks"
        pieces = ["cufflinks", "-o {0}".format(outdir), "-p {0}".format(cpus)]
        if gtf:
            pieces.append("-g {0}".format(gtf))
        pieces.append("--frag-bias-correct {0}".format(reference))
        pieces.append("--multi-read-correct")
        pieces.append("{0}".format(bam))
        per_sample_gtf = op.join(outdir, transcripts)
        mm.add(bam, per_sample_gtf, " ".join(pieces))
        gtfs.append(per_sample_gtf)

    # List every transcripts.gtf found under the current directory
    assemblylist = "assembly_list.txt"
    find_cmd = 'find . -name "{0}" > {1}'.format(transcripts, assemblylist)
    mm.add(gtfs, assemblylist, find_cmd)

    # Merge the listed assemblies with cuffmerge
    mergedgtf = "merged/merged.gtf"
    pieces = ["cuffmerge", "-o merged", "-p {0}".format(cpus)]
    if gtf:
        pieces.append("-g {0}".format(gtf))
    pieces.append("-s {0}".format(reference))
    pieces.append("{0}".format(assemblylist))
    mm.add(assemblylist, mergedgtf, " ".join(pieces))

    mm.write()
def kmc(args):
    """
    %prog kmc folder

    Run kmc3 on Illumina reads.
    """
    p = OptionParser(kmc.__doc__)
    p.add_option("-k", default=21, type="int", help="Kmer size")
    p.add_option("--ci", default=2, type="int",
                 help="Exclude kmers with less than ci counts")
    p.add_option("--cs", default=2, type="int",
                 help="Maximal value of a counter")
    p.add_option("--cx", default=None, type="int",
                 help="Exclude kmers with more than cx counts")
    p.add_option("--single", default=False, action="store_true",
                 help="Input is single-end data, only one FASTQ/FASTA")
    p.add_option("--fasta", default=False, action="store_true",
                 help="Input is FASTA instead of FASTQ")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (folder,) = args
    K = opts.k
    n = 1 if opts.single else 2
    pattern = (
        "*.fa,*.fa.gz,*.fasta,*.fasta.gz"
        if opts.fasta
        else "*.fq,*.fq.gz,*.fastq,*.fastq.gz"
    )

    mm = MakeManager()
    # `reads` (not `p`) so the option parser above is not shadowed
    for reads, pf in iter_project(folder, pattern=pattern, n=n, commonprefix=False):
        pf = pf.split("_")[0] + ".ms{}".format(K)
        infiles = pf + ".infiles"
        # kmc reads its input list from a file via the "@file" syntax below.
        # write() replaces the Python-2-only "print >> fw" statement, and
        # `with` closes the handle even on error.
        with open(infiles, "w") as fw:
            fw.write("\n".join(reads) + "\n")

        cmd = "kmc -k{} -m64 -t{}".format(K, opts.cpus)
        cmd += " -ci{} -cs{}".format(opts.ci, opts.cs)
        if opts.cx:
            cmd += " -cx{}".format(opts.cx)
        if opts.fasta:
            cmd += " -fm"
        cmd += " @{} {} .".format(infiles, pf)
        outfile = pf + ".kmc_suf"
        mm.add(reads, outfile, cmd)

    mm.write()
def lastgenome(args):
    """
    %prog lastgenome genome_A.fasta genome_B.fasta

    Run LAST by calling LASTDB, LASTAL. The script runs the following steps:
    $ lastdb -P0 -uNEAR -R01 Chr10A-NEAR Chr10A.fa
    $ lastal -E0.05 -C2 Chr10A-NEAR Chr10A.fa -fTAB > Chr10A.Chr10A.tab
    $ last-dotplot Chr10A.Chr10A.tab
    """
    from jcvi.apps.grid import MakeManager

    p = OptionParser(lastgenome.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gA, gB = args
    mm = MakeManager()

    def bb(x):
        # file name without directory and without its last extension
        return op.basename(x).rsplit(".", 1)[0]

    gA_pf, gB_pf = bb(gA), bb(gB)

    # Build LASTDB
    # lastdb takes the database *name* and lastal is given that same name.
    # The ".suf" path is only the make target used to track the build, so it
    # must not be passed to lastdb (as documented in the docstring above).
    dbname = "-".join((gA_pf, "NEAR"))
    dbfile = dbname + ".suf"
    build_db_cmd = "lastdb -P0 -uNEAR -R01 {} {}".format(dbname, gA)
    mm.add(gA, dbfile, build_db_cmd)

    # Run LASTAL
    tabfile = "{}.{}.tab".format(gA_pf, gB_pf)
    lastal_cmd = "lastal -E0.05 -C2 {} {}".format(dbname, gB)
    lastal_cmd += " -fTAB > {}".format(tabfile)
    mm.add([dbfile, gB], tabfile, lastal_cmd)

    mm.write()
def batch(args):
    """
    %prog batch all.cds *.anchors

    Compute Ks values for a set of anchors file. This will generate a bunch of
    work directories for each comparisons. The anchorsfile should be in the form
    of specie1.species2.anchors.
    """
    from jcvi.apps.grid import MakeManager

    p = OptionParser(batch.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    cdsfile, anchors = args[0], args[1:]

    # The work directory is named after the first two dot-separated fields of
    # each anchors file name; create them all before queuing any job.
    jobs = []
    for anchorsfile in anchors:
        fields = op.basename(anchorsfile).split(".")
        jobs.append((".".join(fields[:2]), anchorsfile))
    for workdir, _ in jobs:
        mkdir(workdir)

    mm = MakeManager()
    for workdir, anchorsfile in jobs:
        # Step 1: extract the CDS pairs for this comparison
        pairscdsfile = workdir + ".cds.fasta"
        prepare_cmd = "python -m jcvi.apps.ks prepare {} {} -o {}".format(
            anchorsfile, cdsfile, pairscdsfile
        )
        mm.add((anchorsfile, cdsfile), pairscdsfile, prepare_cmd)

        # Step 2: compute Ks inside the work directory
        ksfile = workdir + ".ks"
        calc_cmd = "python -m jcvi.apps.ks calc {} -o {} --workdir {}".format(
            pairscdsfile, ksfile, workdir
        )
        mm.add(pairscdsfile, ksfile, calc_cmd)
    mm.write()
def lastgenomeuniq(args):
    """
    %prog lastgenomeuniq genome_A.fasta genome_B.fasta

    Run LAST by calling LASTDB, LASTAL and LAST-SPLIT. The recipe is based on
    tutorial here:

    <https://github.com/mcfrith/last-genome-alignments>

    The script runs the following steps:
    $ lastdb -P0 -uNEAR -R01 Chr10A-NEAR Chr10A.fa
    $ lastal -E0.05 -C2 Chr10A-NEAR Chr10B.fa | last-split -m1 | maf-swap | last-split -m1 -fMAF > Chr10A.Chr10B.1-1.maf
    $ maf-convert -n blasttab Chr10A.Chr10B.1-1.maf > Chr10A.Chr10B.1-1.blast

    Works with LAST v959.
    """
    from jcvi.apps.grid import MakeManager

    # Use this function's own docstring for the usage text, not a sibling's
    p = OptionParser(lastgenomeuniq.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gA, gB = args
    mm = MakeManager()

    def bb(x):
        # file name without directory and without its last extension
        return op.basename(x).rsplit(".", 1)[0]

    gA_pf, gB_pf = bb(gA), bb(gB)

    # Build LASTDB
    # lastdb takes the database *name* and lastal is given that same name.
    # The ".suf" path is only the make target used to track the build.
    dbname = "-".join((gA_pf, "NEAR"))
    dbfile = dbname + ".suf"
    build_db_cmd = "lastdb -P0 -uNEAR -R01 {} {}".format(dbname, gA)
    mm.add(gA, dbfile, build_db_cmd)

    # Run LASTAL
    stem = "{}.{}.1-1".format(gA_pf, gB_pf)
    maffile = stem + ".maf"
    lastal_cmd = "lastal -E0.05 -C2 {} {}".format(dbname, gB)
    lastal_cmd += " | last-split -m1"
    lastal_cmd += " | maf-swap"
    lastal_cmd += " | last-split -m1 -fMAF > {}".format(maffile)
    mm.add([dbfile, gB], maffile, lastal_cmd)

    # Convert to BLAST format
    # Build the name from the stem: str.replace(".maf", ...) would also
    # rewrite a ".maf" occurring inside a genome prefix.
    blastfile = stem + ".blast"
    convert_cmd = "maf-convert -n blasttab {} > {}".format(maffile, blastfile)
    mm.add(maffile, blastfile, convert_cmd)

    mm.write()
def trf(args):
    """
    %prog trf outdir

    Run TRF on FASTA files.
    """
    from jcvi.apps.base import iglob

    p = OptionParser(trf.__doc__)
    p.add_option("--mismatch", default=31, type="int",
                 help="Mismatch and gap penalty")
    p.add_option("--minscore", default=MINSCORE, type="int",
                 help="Minimum score to report")
    p.add_option("--period", default=6, type="int",
                 help="Maximum period to report")
    # Floor division keeps the default an integer on Python 3 as well. With
    # "/" the default becomes a float, which optparse does not coerce and
    # which would then appear in the awk filter below.
    p.add_option("--minlength", default=MINSCORE // 2, type="int",
                 help="Minimum length of repeat tract")
    p.add_option("--telomeres", default=False, action="store_true",
                 help="Run telomere search: minscore=140 period=7")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (outdir,) = args
    minlength = opts.minlength

    mm = MakeManager()
    # --telomeres overrides whatever minscore/period were given
    if opts.telomeres:
        opts.minscore, opts.period = 140, 7

    # The TRF parameters appear space-separated on the command line and
    # dot-separated in the name of the expected .dat output.
    params = "2 {0} {0} 80 10 {1} {2}".format(
        opts.mismatch, opts.minscore, opts.period
    ).split()

    bedfiles = []
    for fastafile in natsorted(iglob(outdir, "*.fa,*.fasta")):
        pf = op.basename(fastafile).split(".")[0]
        cmd1 = "trf {0} {1} -d -h".format(fastafile, " ".join(params))
        datfile = op.basename(fastafile) + "." + ".".join(params) + ".dat"
        bedfile = "{0}.trf.bed".format(pf)
        # Keep rows whose column 8 lies in [minlength, READLEN - minlength]
        cmd2 = "cat {} | awk '($8 >= {} && $8 <= {})'".format(
            datfile, minlength, READLEN - minlength
        )
        cmd2 += " | sed 's/ /\\t/g'"
        cmd2 += " | awk '{{print \"{0}\\t\" $0}}' > {1}".format(pf, bedfile)
        mm.add(fastafile, datfile, cmd1)
        mm.add(datfile, bedfile, cmd2)
        bedfiles.append(bedfile)

    # Concatenate the per-FASTA bed files into a single trf.bed
    bedfile = "trf.bed"
    cmd = "cat {0} > {1}".format(" ".join(natsorted(bedfiles)), bedfile)
    mm.add(bedfiles, bedfile, cmd)

    mm.write()
def lastgenome(args):
    """
    %prog lastgenome genome_A.fasta genome_B.fasta

    Run LAST by calling LASTDB, LASTAL and LAST-SPLIT. The recipe is based on
    tutorial here:

    <https://github.com/mcfrith/last-genome-alignments>

    The script runs the following steps:
    $ lastdb -P0 -uNEAR -R01 Chr10A-NEAR Chr10A.fa
    $ lastal -E0.05 -C2 Chr10A-NEAR Chr10B.fa | last-split -m1 | maf-swap | last-split -m1 -fMAF > Chr10A.Chr10B.1-1.maf
    $ maf-convert -n blasttab Chr10A.Chr10B.1-1.maf > Chr10A.Chr10B.1-1.blast

    Works with LAST v959.
    """
    from jcvi.apps.grid import MakeManager

    p = OptionParser(lastgenome.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gA, gB = args
    mm = MakeManager()

    def bb(x):
        # file name without directory and without its last extension
        return op.basename(x).rsplit(".", 1)[0]

    gA_pf, gB_pf = bb(gA), bb(gB)

    # Build LASTDB
    # lastdb takes the database *name* and lastal is given that same name.
    # The ".suf" path is only the make target used to track the build, so it
    # must not be passed to lastdb (see the documented command above).
    dbname = "-".join((gA_pf, "NEAR"))
    dbfile = dbname + ".suf"
    build_db_cmd = "lastdb -P0 -uNEAR -R01 {} {}".format(dbname, gA)
    mm.add(gA, dbfile, build_db_cmd)

    # Run LASTAL
    stem = "{}.{}.1-1".format(gA_pf, gB_pf)
    maffile = stem + ".maf"
    lastal_cmd = "lastal -E0.05 -C2 {} {}".format(dbname, gB)
    lastal_cmd += " | last-split -m1"
    lastal_cmd += " | maf-swap"
    lastal_cmd += " | last-split -m1 -fMAF > {}".format(maffile)
    mm.add([dbfile, gB], maffile, lastal_cmd)

    # Convert to BLAST format
    # Build the name from the stem: str.replace(".maf", ...) would also
    # rewrite a ".maf" occurring inside a genome prefix.
    blastfile = stem + ".blast"
    convert_cmd = "maf-convert -n blasttab {} > {}".format(maffile, blastfile)
    mm.add(maffile, blastfile, convert_cmd)

    mm.write()
def cyntenator(args):
    """
    %prog cyntenator athaliana.athaliana.last athaliana.bed

    Prepare input for Cyntenator.
    """
    p = OptionParser(cyntenator.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    lastfile = args[0]
    bedfiles = args[1:]

    # Write a three-column (query, subject, score) file, skipping self-hits.
    # write() replaces the Python-2-only "print >> fw" statement, and `with`
    # closes both handles.
    filteredlastfile = lastfile + ".blast"
    with open(lastfile) as fp, open(filteredlastfile, "w") as fw:
        for row in fp:
            b = BlastLine(row)
            if b.query == b.subject:
                continue
            fw.write("\t".join((b.query, b.subject, str(b.score))) + "\n")

    # The subject of the first alignment decides which bed file is the db.
    # next(fp) works on Python 2.6+ and 3; fp.next() is Python-2-only.
    with open(lastfile) as fp:
        subject = BlastLine(next(fp)).subject

    db = None
    txtfiles = []
    for bedfile in bedfiles:
        order = Bed(bedfile).order
        if subject in order:
            db = op.basename(bedfile).split(".")[0][:20]
            logging.debug("Found db: {0}".format(db))
        txtfile = write_txt(bedfile)
        txtfiles.append(txtfile)

    # Previously this case surfaced as an UnboundLocalError on `db`
    if db is None:
        logging.error(
            "Subject `{0}` not found in any of the bed files".format(subject)
        )
        sys.exit(1)
    db += ".txt"

    mm = MakeManager()
    for txtfile in txtfiles:
        outfile = txtfile + ".alignment"
        cmd = 'cyntenator -t "({0} {1})" -h blast {2} > {3}'.format(
            txtfile, db, filteredlastfile, outfile
        )
        mm.add((txtfile, db, filteredlastfile), outfile, cmd)
    mm.write()
def star(args):
    """
    %prog star folder reference

    Run star on a folder with reads.
    """
    p = OptionParser(star.__doc__)
    p.add_option("--single", default=False, action="store_true",
                 help="Single end mapping")
    p.set_fastq_names()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    folder, reference = args
    threads = opts.cpus
    mm = MakeManager()

    # one read file per sample for single-end data, two otherwise
    reads_per_sample = 1 if opts.single else 2

    gd = "GenomeDir"
    mkdir(gd)
    # Prefix shared by the index-building and the alignment commands
    star_base = "STAR --runThreadN {0} --genomeDir {1}".format(threads, gd)

    # Step 0: build genome index
    genomeidx = op.join(gd, "Genome")
    if need_update(reference, genomeidx):
        index_cmd = "".join(
            [
                star_base,
                " --runMode genomeGenerate",
                " --genomeFastaFiles {0}".format(reference),
            ]
        )
        mm.add(reference, genomeidx, index_cmd)

    # Step 1: align
    for reads, prefix in iter_project(folder, opts.names, reads_per_sample):
        pf = "{0}_star".format(prefix)
        bamfile = pf + "Aligned.sortedByCoord.out.bam"

        parts = [star_base, " --readFilesIn {0}".format(" ".join(reads))]
        # gzipped input is decompressed on the fly
        if reads[0].endswith(".gz"):
            parts.append(" --readFilesCommand zcat")
        parts.append(" --outSAMtype BAM SortedByCoordinate")
        parts.append(" --outFileNamePrefix {0}".format(pf))
        parts.append(" --twopassMode Basic")
        # Compatibility for cufflinks
        parts.append(" --outSAMstrandField intronMotif")
        parts.append(" --outFilterIntronMotifs RemoveNoncanonical")
        mm.add(reads, bamfile, "".join(parts))

    mm.write()
def nucmer(args):
    """
    %prog nucmer ref.fasta query.fasta

    Run NUCMER using query against reference. Parallel implementation derived
    from: <https://github.com/fritzsedlazeck/sge_mummer>
    """
    from itertools import product

    from jcvi.apps.grid import MakeManager
    from jcvi.formats.base import split

    p = OptionParser(nucmer.__doc__)
    p.add_option("--chunks", type="int",
                 help="Split both query and subject into chunks")
    p.set_params(prog="nucmer", params="-l 100 -c 500")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, query = args
    cpus = opts.cpus
    # Without --chunks, split each side into int(sqrt(cpus)) pieces so the
    # number of ref x query jobs does not exceed the CPU count
    nrefs = nqueries = opts.chunks or int(cpus ** .5)
    refdir = ref.split(".")[0] + "-outdir"
    querydir = query.split(".")[0] + "-outdir"
    reflist = split([ref, refdir, str(nrefs)]).names
    querylist = split([query, querydir, str(nqueries)]).names

    mm = MakeManager()
    # One nucmer job for every (reference chunk, query chunk) pair
    for i, (r, q) in enumerate(product(reflist, querylist)):
        pf = "{0:04d}".format(i)
        cmd = "nucmer -maxmatch"
        cmd += " {0}".format(opts.extra)
        cmd += " {0} {1} -p {2}".format(r, q, pf)
        deltafile = pf + ".delta"
        mm.add((r, q), deltafile, cmd)
        # Call form of print: valid on Python 3, same output on Python 2
        print(cmd)

    mm.write()
def batchccn(args):
    """
    %prog batchccn test.csv

    Run CCN script in batch. Write makefile.
    """
    p = OptionParser(batchccn.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (csvfile,) = args
    mm = MakeManager()
    pf = op.basename(csvfile).split(".")[0]
    mkdir(pf)

    # Sniff the first line: if it already ends in ".bam" it is a data row, so
    # pandas is told there is no header. Otherwise pandas infers the header.
    # The `with` block closes the handle, which next(open(...)) never did.
    with open(csvfile) as fp:
        first_line = next(fp)
    header = None if first_line.strip().endswith(".bam") else "infer"
    logging.debug("Header={}".format(header))
    df = pd.read_csv(csvfile, header=header)

    # The output prefix is substituted now; the "-n {} -b {}" placeholders
    # stay in the template and are filled with each row's values.
    cmd = "perl /mnt/software/ccn_gcn_hg38_script/ccn_gcn_hg38.pl"
    cmd += " -n {} -b {}"
    cmd += " -o {} -r hg38".format(pf)
    for _, (sample_key, bam) in df.iterrows():
        cmdi = cmd.format(sample_key, bam)
        outfile = "{}/{}/{}.ccn".format(pf, sample_key, sample_key)
        mm.add(csvfile, outfile, cmdi)
    mm.write()
def beagle(args):
    """
    %prog beagle input.vcf 1

    Use BEAGLE4.1 to impute vcf on chromosome 1.
    """
    p = OptionParser(beagle.__doc__)
    p.set_home("beagle")
    p.set_ref()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    vcffile, chrom = args
    # Output prefix is the input name minus its last extension, plus ".beagle"
    outpf = vcffile.rsplit(".", 1)[0] + ".beagle"
    outfile = outpf + ".vcf.gz"

    mm = MakeManager()
    panel_dir = op.join(opts.ref, "1000GP_Phase3")
    # NOTE(review): the thread count is fixed at 16 here even though
    # p.set_cpus() is called above -- confirm whether that is intended.
    cmd = opts.beagle_home + "".join(
        [
            " gt={0}".format(vcffile),
            " ref={0}/chr{1}.1kg.phase3.v5a.bref".format(panel_dir, chrom),
            " map={0}/plink.chr{1}.GRCh37.map".format(panel_dir, chrom),
            " out={0}".format(outpf),
            " nthreads=16 gprobs=true",
        ]
    )
    mm.add(vcffile, outfile, cmd)
    mm.write()
def batch(args):
    """
    %prog batch database.fasta project_dir output_dir

    Run bwa in batch mode.
    """
    # The usage line above must say "%prog": optparse only substitutes that
    # exact token, so the former "%proj" was printed literally.
    p = OptionParser(batch.__doc__)
    set_align_options(p)
    p.set_sam_options()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ref_fasta, proj_dir, outdir = args
    outdir = outdir.rstrip("/")
    # For an s3:// destination, stage the results in a local directory named
    # after the last path component and upload from there
    s3dir = None
    if outdir.startswith("s3://"):
        s3dir = outdir
        outdir = op.basename(outdir)
        mkdir(outdir)

    mm = MakeManager()
    # `reads` (not `p`) so the option parser above is not shadowed
    for reads, pf in iter_project(proj_dir):
        targs = [ref_fasta] + reads
        cmd1, bamfile = mem(targs, opts)
        if cmd1:
            cmd1 = output_bam(cmd1, bamfile)
        nbamfile = op.join(outdir, bamfile)
        cmd2 = "mv {} {}".format(bamfile, nbamfile)
        cmds = [cmd1, cmd2]

        if s3dir:
            cmd = "aws s3 cp {} {} --sse".format(nbamfile, op.join(s3dir, bamfile))
            cmds.append(cmd)

        mm.add(reads, nbamfile, cmds)

    mm.write()
def kmc(args):
    """
    %prog kmc folder

    Run kmc3 on Illumina reads.
    """
    p = OptionParser(kmc.__doc__)
    p.add_option("-k", default=21, type="int", help="Kmer size")
    p.add_option("--ci", default=2, type="int",
                 help="Exclude kmers with less than ci counts")
    p.add_option("--cs", default=2, type="int",
                 help="Maximal value of a counter")
    p.add_option("--cx", default=None, type="int",
                 help="Exclude kmers with more than cx counts")
    p.add_option("--single", default=False, action="store_true",
                 help="Input is single-end data, only one FASTQ/FASTA")
    p.add_option("--fasta", default=False, action="store_true",
                 help="Input is FASTA instead of FASTQ")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (folder,) = args
    K = opts.k
    n = 1 if opts.single else 2
    pattern = (
        "*.fa,*.fa.gz,*.fasta,*.fasta.gz"
        if opts.fasta
        else "*.fq,*.fq.gz,*.fastq,*.fastq.gz"
    )

    mm = MakeManager()
    # `reads` (not `p`) so the option parser above is not shadowed
    for reads, pf in iter_project(folder, pattern=pattern, n=n, commonprefix=False):
        pf = pf.split("_")[0] + ".ms{}".format(K)
        infiles = pf + ".infiles"
        # kmc reads its input list from a file via the "@file" syntax below.
        # The context manager closes the handle even if the write fails.
        with open(infiles, "w") as fw:
            print("\n".join(reads), file=fw)

        cmd = "kmc -k{} -m64 -t{}".format(K, opts.cpus)
        cmd += " -ci{} -cs{}".format(opts.ci, opts.cs)
        if opts.cx:
            cmd += " -cx{}".format(opts.cx)
        if opts.fasta:
            cmd += " -fm"
        cmd += " @{} {} .".format(infiles, pf)
        outfile = pf + ".kmc_suf"
        mm.add(reads, outfile, cmd)

    mm.write()
def kmc(args):
    """
    %prog kmc folder

    Run kmc3 on Illumina reads.
    """
    p = OptionParser(kmc.__doc__)
    p.add_option("-k", default=19, type="int", help="Kmer size")
    p.add_option("-c", default=2, type="int",
                 help="Maximal value of a counter")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (folder,) = args
    K = opts.k

    mm = MakeManager()
    # `reads` (not `p`) so the option parser above is not shadowed
    for reads, pf in iter_project(folder):
        pf = pf.split("_")[0] + ".ms{}".format(K)
        infiles = pf + ".infiles"
        # kmc reads its input list from a file via the "@file" syntax below.
        # write() replaces the Python-2-only "print >> fw" statement, and
        # `with` closes the handle even on error.
        with open(infiles, "w") as fw:
            fw.write("\n".join(reads) + "\n")

        cmd = "kmc -k{} -m64 -t{} -cs{}".format(K, opts.cpus, opts.c)
        cmd += " @{} {} .".format(infiles, pf)
        outfile = pf + ".kmc_suf"
        mm.add(reads, outfile, cmd)

    mm.write()
def meryl(args):
    """
    %prog meryl folder

    Run meryl on Illumina reads.
    """
    p = OptionParser(meryl.__doc__)
    p.add_option("-k", default=19, type="int", help="Kmer size")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (folder,) = args
    K = opts.k
    cpus = opts.cpus

    mm = MakeManager()
    for reads, pf in iter_project(folder):
        # One count database per read file, numbered from 1
        mss = ["{}{}.ms{}".format(pf, idx, K) for idx, _ in enumerate(reads, start=1)]
        cmds = [
            "meryl -B -C -m {} -threads {}".format(K, cpus)
            + " -s {} -o {}".format(readfile, ms)
            for readfile, ms in zip(reads, mss)
        ]

        # Exactly two databases are expected; unpacking fails otherwise
        ams, bms = mss
        pms = "{}.ms{}".format(pf, K)
        # Add the two databases together, then delete the per-file ones
        cmds.append("meryl -M add -s {} -s {} -o {}".format(ams, bms, pms))
        cmds.append(
            "rm -f {}.mcdat {}.mcidx {}.mcdat {}.mcidx".format(ams, ams, bms, bms)
        )
        mm.add(reads, pms + ".mcdat", cmds)

    mm.write()
def merge(args):
    """
    %prog merge merged_bams bams1_dir bams2_dir ...

    Merge BAM files. Treat the bams with the same prefix as a set.
    Output the commands first.
    """
    from jcvi.apps.grid import MakeManager

    p = OptionParser(merge.__doc__)
    p.set_sep(sep="_", help="Separator to group per prefix")
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    merged_bams, bamdirs = args[0], args[1:]
    mkdir(merged_bams)

    # Collect every *.bam under the input directories, skipping any path
    # that contains "nsorted"
    bams = [
        bam
        for bamdir in bamdirs
        for bam in glob(op.join(bamdir, "*.bam"))
        if "nsorted" not in bam
    ]
    logging.debug("Found a total of {0} BAM files.".format(len(bams)))

    sep = opts.sep

    def prefix_of(path):
        # grouping key: base name up to the first separator
        return op.basename(path).split(sep)[0]

    # groupby only groups adjacent items, so sort by the same key first
    bams.sort(key=prefix_of)
    mm = MakeManager()
    for _, group in groupby(bams, key=prefix_of):
        members = sorted(group)
        target = op.join(merged_bams, op.basename(members[0]))
        if len(members) != 1:
            merge_cmd = "samtools merge -@ 8 {0} {1}".format(
                target, " ".join(members)
            )
            mm.add(members, target, merge_cmd, remove=True)
            continue

        # A lone BAM is symlinked rather than merged
        link_cmd = "ln -s {0} {1}".format(get_abs_path(members[0]), target)
        mm.add("", target, link_cmd)
    mm.write()
def merge(args):
    """
    %prog merge merged_bams bams1_dir bams2_dir ...

    Merge BAM files. Treat the bams with the same prefix as a set.
    Output the commands first.
    """
    from jcvi.apps.softlink import get_abs_path
    from jcvi.apps.grid import MakeManager

    p = OptionParser(merge.__doc__)
    p.add_option("--sep", default="_",
                 help="Separator to group per prefix")
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    merged_bams = args[0]
    bamdirs = args[1:]

    mkdir(merged_bams)
    # Collect every *.bam under the input directories, skipping any path
    # that contains "nsorted"
    bams = []
    for x in bamdirs:
        bams += glob(op.join(x, "*.bam"))
    bams = [x for x in bams if "nsorted" not in x]

    logging.debug("Found a total of {0} BAM files.".format(len(bams)))

    sep = opts.sep

    def key(x):
        # grouping key: base name up to the first separator
        return op.basename(x).split(sep)[0]

    # groupby only groups adjacent items, so sort by the same key first
    bams.sort(key=key)
    mm = MakeManager()
    for prefix, files in groupby(bams, key=key):
        files = sorted(files)
        nfiles = len(files)
        source = " ".join(files)
        target = op.join(merged_bams, op.basename(files[0]))
        if nfiles == 1:
            # A lone BAM is symlinked rather than merged
            source = get_abs_path(source)
            cmd = "ln -s {0} {1}".format(source, target)
            mm.add("", target, cmd)
        else:
            # "rm -f" rather than "rm": on a first build the target does not
            # exist, and a failing rm would stop the recipe before
            # samtools merge runs.
            cmds = [
                "rm -f {0}".format(target),
                "samtools merge {0} {1}".format(target, source),
            ]
            mm.add(files, target, cmds)
    mm.write()
def cyntenator(args):
    """
    %prog cyntenator athaliana.athaliana.last athaliana.bed

    Prepare input for Cyntenator.
    """
    p = OptionParser(cyntenator.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    lastfile = args[0]
    bedfiles = args[1:]

    # Write a three-column (query, subject, score) file, skipping self-hits.
    # Both handles are managed by `with`; the input handle was never closed
    # before.
    filteredlastfile = lastfile + ".blast"
    with open(lastfile) as fp, open(filteredlastfile, "w") as fw:
        for row in fp:
            b = BlastLine(row)
            if b.query == b.subject:
                continue
            print("\t".join((b.query, b.subject, str(b.score))), file=fw)

    # The subject of the first alignment decides which bed file is the db
    with open(lastfile) as fp:
        subject = BlastLine(next(fp)).subject

    db = None
    txtfiles = []
    for bedfile in bedfiles:
        order = Bed(bedfile).order
        if subject in order:
            db = op.basename(bedfile).split(".")[0][:20]
            logging.debug("Found db: {0}".format(db))
        txtfile = write_txt(bedfile)
        txtfiles.append(txtfile)

    # Previously this case surfaced as an UnboundLocalError on `db`
    if db is None:
        logging.error(
            "Subject `{0}` not found in any of the bed files".format(subject)
        )
        sys.exit(1)
    db += ".txt"

    mm = MakeManager()
    for txtfile in txtfiles:
        outfile = txtfile + ".alignment"
        cmd = 'cyntenator -t "({0} {1})" -h blast {2} > {3}'.format(
            txtfile, db, filteredlastfile, outfile
        )
        mm.add((txtfile, db, filteredlastfile), outfile, cmd)
    mm.write()
def nucmer(args):
    """
    %prog nucmer ref.fasta query.fasta

    Run NUCMER using query against reference. Parallel implementation derived
    from: <https://github.com/fritzsedlazeck/sge_mummer>
    """
    from itertools import product

    from jcvi.apps.grid import MakeManager
    from jcvi.formats.base import split

    # The docstring above is handed to the parser as its usage text.
    p = OptionParser(nucmer.__doc__)
    p.add_option("--chunks", type="int",
                 help="Split both query and subject into chunks")
    p.set_params(prog="nucmer", params="-g 5000 -l 24 -c 500")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, query = args
    cpus = opts.cpus
    # Without --chunks, split each side into int(sqrt(cpus)) pieces.
    nrefs = nqueries = opts.chunks or int(cpus ** .5)
    refdir = ref.split(".")[0] + "-outdir"
    querydir = query.split(".")[0] + "-outdir"
    reflist = split([ref, refdir, str(nrefs)]).names
    querylist = split([query, querydir, str(nqueries)]).names

    mm = MakeManager()
    # One nucmer job per (reference chunk, query chunk) pair, with a
    # zero-padded running index as the output prefix.
    for i, (r, q) in enumerate(product(reflist, querylist)):
        pf = "{0:04d}".format(i)
        cmd = "nucmer -maxmatch"
        cmd += " {0}".format(opts.extra)
        cmd += " {0} {1} -p {2}".format(r, q, pf)
        deltafile = pf + ".delta"
        mm.add((r, q), deltafile, cmd)
        # Function-call form of print works on Python 2 and 3 alike.
        print(cmd)

    mm.write()
def kmc(args):
    """
    %prog kmc folder

    Run kmc3 on Illumina reads.
    """
    # The docstring above is handed to the parser as its usage text.
    p = OptionParser(kmc.__doc__)
    p.add_option("-k", default=21, type="int", help="Kmer size")
    p.add_option("--ci", default=2, type="int",
                 help="Minimum value of a counter")
    p.add_option("--cs", default=2, type="int",
                 help="Maximal value of a counter")
    p.add_option("--single", default=False, action="store_true",
                 help="Input is single-end data, only one FASTQ")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    K = opts.k
    n = 1 if opts.single else 2
    mm = MakeManager()
    # The loop variable is deliberately not called `p`, so the parser above
    # is not shadowed.
    for fastqs, pf in iter_project(folder, n=n, commonprefix=False):
        pf = pf.split("_")[0] + ".ms{}".format(K)
        infiles = pf + ".infiles"
        # kmc reads its input list from "@file"; write one path per line.
        # A plain write() replaces the Python 2-only `print >> fw` form.
        with open(infiles, "w") as fw:
            fw.write("\n".join(fastqs) + "\n")

        cmd = "kmc -k{} -m64 -t{}".format(K, opts.cpus)
        cmd += " -ci{} -cs{}".format(opts.ci, opts.cs)
        cmd += " @{} {} .".format(infiles, pf)
        outfile = pf + ".kmc_suf"
        mm.add(fastqs, outfile, cmd)

    mm.write()
def meryl(args):
    """
    %prog meryl folder

    Run meryl on Illumina reads.
    """
    # The docstring above is handed to the parser as its usage text.
    p = OptionParser(meryl.__doc__)
    p.add_option("-k", default=19, type="int", help="Kmer size")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder = args[0]
    K, cpus = opts.k, opts.cpus

    mm = MakeManager()
    for fastqs, pf in iter_project(folder):
        databases = []
        recipe = []
        # Count each input file into its own database, numbered from 1.
        for idx, fastq in enumerate(fastqs, 1):
            database = "{}{}.ms{}".format(pf, idx, K)
            databases.append(database)
            recipe.append(
                "meryl -B -C -m {} -threads {} -s {} -o {}".format(
                    K, cpus, fastq, database
                )
            )

        # Exactly two per-file databases are expected here; they are added
        # together and the intermediates removed afterwards.
        first, second = databases
        combined = "{}.ms{}".format(pf, K)
        recipe.append(
            "meryl -M add -s {} -s {} -o {}".format(first, second, combined)
        )
        recipe.append(
            "rm -f {0}.mcdat {0}.mcidx {1}.mcdat {1}.mcidx".format(first, second)
        )
        mm.add(fastqs, combined + ".mcdat", recipe)

    mm.write()
def lobstr(args):
    """
    %prog lobstr lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices. In
    addition, bamfile can be S3 location and --lobstr_home can be S3 location
    (e.g. s3://hli-mv-data-science/htang/str-build/lobSTR/)
    """
    # The docstring above is handed to the parser as its usage text.
    p = OptionParser(lobstr.__doc__)
    p.add_option("--haploid", default="chrY,chrM",
                 help="Use haploid model for these chromosomes")
    p.add_option("--chr", help="Run only this chromosome")
    p.add_option("--simulation", default=False, action="store_true",
                 help="Simulation mode")
    p.set_home("lobstr", default="s3://hli-mv-data-science/htang/str-build/lobSTR/")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)
    bamfile = opts.input_bam_path

    if len(args) < 1 or bamfile is None:
        sys.exit(not p.print_help())

    lbindices = args
    if opts.simulation:  # Simulation mode
        # Run a single allelotype command directly (no makefile), keep the
        # VCF under lobstr_results/ and discard the stats file.
        cmd, vcf_file = allelotype_on_chr(bamfile, "chr4", "/mnt/software/lobSTR/",
                                          "TREDs", haploid=opts.haploid)
        stats_file = vcf_file.rsplit(".", 1)[0] + ".allelotype.stats"
        results_dir = "lobstr_results"
        mkdir(results_dir)
        sh(cmd)
        sh("mv {} {}/ && rm {}".format(vcf_file, results_dir, stats_file))
        return

    s3mode = bamfile.startswith("s3")
    store = opts.output_path
    cleanup = not opts.nocleanup
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)

    lhome = opts.lobstr_home
    if lhome.startswith("s3://"):
        lhome = pull_from_s3(lhome, overwrite=False)

    # Output prefix: workflow/sample ids when given, else the BAM base name.
    exec_id, sample_id = opts.workflow_execution_id, opts.sample_id
    prefix = [x for x in (exec_id, sample_id) if x]
    if prefix:
        pf = "_".join(prefix)
    else:
        pf = bamfile.split("/")[-1].split(".")[0]

    if s3mode:
        # Skip everything when the result for the last index already exists.
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug("Object `{0}` exists. Computation skipped."
                          .format(remotegzfile))
            return

        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)

        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            # Try "<bam>.bai" first, then "<bam minus extension>.bai";
            # build the index locally when neither exists.
            remotebaifile = bamfile + ".bai"
            if check_exists_s3(remotebaifile):
                pull_from_s3(remotebaifile, localbaifile)
            else:
                # maxsplit=1 strips only the final extension; without it the
                # name was cut at the first dot anywhere in the S3 path.
                remotebaifile = bamfile.rsplit(".", 1)[0] + ".bai"
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                else:
                    logging.debug("BAM index cannot be found in S3!")
                    sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    # list(...) is required on Python 3, where range() cannot be
    # concatenated with a list.
    chrs = [opts.chr] if opts.chr else (list(range(1, 23)) + ["X", "Y"])
    for lbidx in lbindices:
        makefile = "makefile.{0}".format(lbidx)
        mm = MakeManager(filename=makefile)
        vcffiles = []
        for chrom in chrs:
            cmd, vcffile = allelotype_on_chr(bamfile, chrom, lhome, lbidx,
                                             haploid=opts.haploid)
            mm.add(bamfile, vcffile, cmd)
            filteredvcffile = vcffile.replace(".vcf", ".filtered.vcf")
            cmd = "python -m jcvi.variation.str filtervcf {}".format(vcffile)
            cmd += " --lobstr_home {}".format(lhome)
            mm.add(vcffile, filteredvcffile, cmd)
            vcffiles.append(filteredvcffile)

        # Concatenate the per-chromosome filtered VCFs into one bgzipped file.
        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)

        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

    if cleanup:
        # `mm` is the manager of the last index processed in the loop above.
        mm.clean()
        sh("rm -f {} {} *.bai *.stats".format(bamfile, mm.makefile))
def gatk(args):
    """
    %prog gatk bamfile reference.fasta

    Call SNPs based on GATK best practices.
    """
    # The docstring above is handed to the parser as its usage text.
    p = OptionParser(gatk.__doc__)
    p.add_option("--indelrealign", default=False, action="store_true",
                 help="Perform indel realignment")
    p.set_home("gatk")
    p.set_home("picard")
    p.set_phred()
    p.set_cpus(cpus=24)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, ref = args
    pf = bamfile.rsplit(".", 1)[0]
    mm = MakeManager()
    # Shared command prefixes for the two Java tools.
    picard = "java -Xmx32g -jar {0}/picard.jar".format(opts.picard_home)
    tk = "java -Xmx32g -jar {0}/GenomeAnalysisTK.jar -R {1}".format(
        opts.gatk_home, ref
    )

    # Step 0 - build reference
    dictfile = ref.rsplit(".", 1)[0] + ".dict"
    build_dict = " ".join([
        picard,
        "CreateSequenceDictionary",
        "R={0} O={1}".format(ref, dictfile),
    ])
    build_fai = "samtools faidx {0}".format(ref)
    mm.add(ref, dictfile, (build_dict, build_fai))

    # Step 1 - sort bam
    sortedbamfile = pf + ".sorted.bam"
    sort_cmd = " ".join([
        picard,
        "SortSam",
        "INPUT={0} OUTPUT={1}".format(bamfile, sortedbamfile),
        "SORT_ORDER=coordinate CREATE_INDEX=true",
    ])
    mm.add(bamfile, sortedbamfile, sort_cmd)

    # Step 2 - mark duplicates
    dedupbamfile = pf + ".dedup.bam"
    dedup_cmd = " ".join([
        picard,
        "MarkDuplicates",
        "INPUT={0} OUTPUT={1}".format(sortedbamfile, dedupbamfile),
        "METRICS_FILE=dedup.log CREATE_INDEX=true",
    ])
    mm.add(sortedbamfile, dedupbamfile, dedup_cmd)

    # Without --indelrealign the deduplicated BAM feeds the caller directly.
    realignedbamfile = dedupbamfile
    if opts.indelrealign:
        # Step 3 - create indel realignment targets
        intervals = pf + ".intervals"
        targets_cmd = " ".join([
            tk,
            "-T RealignerTargetCreator",
            "-I {0} -o {1}".format(dedupbamfile, intervals),
        ])
        mm.add(dedupbamfile, intervals, targets_cmd)

        # Step 4 - indel realignment
        realignedbamfile = pf + ".realigned.bam"
        realign_cmd = " ".join([
            tk,
            "-T IndelRealigner",
            "-targetIntervals {0}".format(intervals),
            "-I {0} -o {1}".format(dedupbamfile, realignedbamfile),
        ])
        mm.add((dictfile, intervals), realignedbamfile, realign_cmd)

    # Step 5 - SNP calling
    vcf = pf + ".vcf"
    call_parts = [
        tk,
        "-T HaplotypeCaller",
        "-I {0}".format(realignedbamfile),
        "--genotyping_mode DISCOVERY",
        "-stand_emit_conf 10 -stand_call_conf 30",
        "-nct {0}".format(opts.cpus),
        "-o {0}".format(vcf),
    ]
    if opts.phred == "64":
        call_parts.append("--fix_misencoded_quality_scores")
    mm.add(realignedbamfile, vcf, " ".join(call_parts))

    # Step 6 - SNP filtering
    filtered_vcf = pf + ".filtered.vcf"
    filter_cmd = " ".join([
        tk,
        "-T VariantFiltration",
        "-V {0}".format(vcf),
        '--filterExpression "DP < 10 || DP > 300 || QD < 2.0 || FS > 60.0 || MQ < 40.0"',
        '--filterName "LOWQUAL"',
        '--genotypeFilterExpression "isHomVar == 1"',
        '--genotypeFilterName "HOMOVAR"',
        '--genotypeFilterExpression "isHet == 1"',
        '--genotypeFilterName "HET"',
        "-o {0}".format(filtered_vcf),
    ])
    mm.add(vcf, filtered_vcf, filter_cmd)

    mm.write()
def snpflow(args):
    """
    %prog snpflow trimmed reference.fasta

    Run SNP calling pipeline until allele_counts are generated. This includes
    generation of native files, SNP_Het file. Speedup for fragmented genomes
    are also supported.
    """
    # The docstring above is handed to the parser as its usage text.
    p = OptionParser(snpflow.__doc__)
    p.set_fastq_names()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trimmed, ref = args
    nseqs = len(Fasta(ref))
    # References with 1000 or more sequences go through the "supercat" path.
    supercat = nseqs >= 1000
    if supercat:
        logging.debug("Total seqs in ref: {0} (supercat={1})".
                      format(nseqs, supercat))

    reads, samples = scan_read_files(trimmed, opts.names)

    # Set up directory structure
    nativedir, countsdir = "native", "allele_counts"
    for d in (nativedir, countsdir):
        mkdir(d)

    mm = MakeManager()
    # Step 0 - index database
    db = op.join(*check_index(ref, supercat=supercat, go=False))
    cmd = "python -m jcvi.apps.gmap index {0}".format(ref)
    if supercat:
        cmd += " --supercat"
        coordsfile = db + ".coords"
        supercatfile = ref.rsplit(".", 1)[0] + ".supercat.fasta"
        mm.add(ref, (db, coordsfile), cmd)
    else:
        mm.add(ref, db, cmd)

    # Step 1 - GSNAP alignment and conversion to native file
    allnatives = []
    allsamstats = []
    gmapdb = supercatfile if supercat else ref
    for f in reads:
        prefix = get_prefix(f, ref)
        gsnapfile = op.join(nativedir, prefix + ".gsnap")
        nativefile = op.join(nativedir, prefix + ".unique.native")
        samstatsfile = op.join(nativedir, prefix + ".unique.sam.stats")
        cmd = "python -m jcvi.apps.gmap align {0} {1}".format(gmapdb, f)
        cmd += " --outdir={0} --native --cpus=1".format(nativedir)
        mm.add((f, db), nativefile, cmd)

        cmd = "python -m jcvi.apps.gmap bam {0} {1} --cpus=1".\
            format(gsnapfile, gmapdb)
        mm.add(nativefile, samstatsfile, cmd)
        allnatives.append(nativefile)
        allsamstats.append(samstatsfile)

    # Step 2 - call SNP discovery
    if supercat:
        # Convert native files back to original coordinates, then run the
        # generated speedup script; later steps read the converted folder.
        nativeconverted = nativedir + "-converted"
        mkdir(nativeconverted)
        allnativesc = [op.join(nativeconverted, op.basename(x))
                       for x in allnatives]
        cmd = "tGBS-Convert_Pseudo_Genome_NATIVE_Coordinates.pl"
        cmd += " -i {0}/*.native -o {1}".format(nativedir, nativeconverted)
        cmd += " -c {0}".format(coordsfile)
        cmds = ["rm -rf {0}".format(nativeconverted), cmd]
        mm.add(allnatives + [coordsfile], allnativesc, cmds)

        runfile = "speedup.sh"
        write_file(runfile, speedupsh.format(nativeconverted, opts.cpus))
        nativedir = nativeconverted
        allsnps = [op.join(nativedir, "{0}.SNPs_Het.txt".format(x))
                   for x in samples]
        mm.add(allnativesc, allsnps, "./{0}".format(runfile))
    else:
        for s in samples:
            snpfile = op.join(nativedir, "{0}.SNPs_Het.txt".format(s))
            cmd = "SNP_Discovery-short.pl"
            cmd += " -native {0}/{1}.*unique.native".format(nativedir, s)
            cmd += " -o {0} -a 2 -ac 0.3 -c 0.8".format(snpfile)
            flist = [x for x in allnatives
                     if op.basename(x).split(".")[0] == s]
            mm.add(flist, snpfile, cmd)

    # Step 3 - generate equal file
    allsnps = [op.join(nativedir, "{0}.SNPs_Het.txt".format(x))
               for x in samples]
    for s in samples:
        equalfile = op.join(nativedir, "{0}.equal".format(s))
        cmd = "extract_reference_alleles.pl"
        cmd += " --native {0}/{1}.*unique.native".format(nativedir, s)
        cmd += " --genotype {0}/{1}.SNPs_Het.txt".format(nativedir, s)
        cmd += " --allgenotypes {0}/*.SNPs_Het.txt".format(nativedir)
        cmd += " --fasta {0} --output {1}".format(ref, equalfile)
        mm.add(allsnps, equalfile, cmd)

    # Step 4 - generate snp matrix
    allequals = [op.join(nativedir, "{0}.equal".format(x)) for x in samples]
    matrix = "snps.matrix.txt"
    cmd = "generate_matrix.pl"
    cmd += " --tables {0}/*SNPs_Het.txt --equal {0}/*equal".format(nativedir)
    cmd += " --fasta {0} --output {1}".format(ref, matrix)
    mm.add(allsnps + allequals, matrix, cmd)

    # Step 5 - generate allele counts
    allcounts = []
    for s in samples:
        allele_counts = op.join(countsdir,
                                "{0}.SNPs_Het.allele_counts".format(s))
        cmd = "count_reads_per_allele.pl -m snps.matrix.txt"
        cmd += " -s {0} --native {1}/{0}.*unique.native".format(s, nativedir)
        cmd += " -o {0}".format(allele_counts)
        mm.add(matrix, allele_counts, cmd)
        allcounts.append(allele_counts)

    # Step 6 - generate raw snps
    rawsnps = "Genotyping.H3.txt"
    # NOTE(review): the option previously read `--h**o`, which the shell
    # would treat as a glob pattern; restored to `--homo` to match the "H3"
    # output name. Confirm against SamplesGenotyping.pl.
    cmd = "/home/shared/scripts/delin/SamplesGenotyping.pl --homo 3"
    cmd += " -pf allele_counts -f {0} --outfile {1}".format(countsdir, rawsnps)
    cmds = ["rm -f {0}".format(rawsnps), cmd]
    mm.add(allcounts, rawsnps, cmds)

    # Step 7 - generate alignment report
    sam_summary = "sam.summary"
    cmd = "/home/shared/scripts/eddyyeh/alignment_stats.pl"
    cmd += " -f {0} -o {1}".format(" ".join(allsamstats), sam_summary)
    mm.add(allsamstats, sam_summary, cmd)

    native_summary = "native.summary"
    cmd = "/home/shared/scripts/eddyyeh/alignment_stats.pl"
    cmd += " -n {0} -o {1}".format(" ".join(allnatives), native_summary)
    mm.add(allnatives, native_summary, cmd)

    mm.write()
def novo2(args):
    """
    %prog novo2 trimmed projectname

    Reference-free tGBS pipeline v2.
    """
    # The docstring above is handed to the parser as its usage text.
    p = OptionParser(novo2.__doc__)
    p.set_fastq_names()
    p.set_align(pctid=95)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trimmed, pf = args
    pctid = opts.pctid
    reads, samples = scan_read_files(trimmed, opts.names)

    # Set up directory structure
    clustdir, acdir = "uclust", "allele_counts"
    for folder in (clustdir, acdir):
        mkdir(folder)

    mm = MakeManager()
    uclust = "python -m jcvi.apps.uclust"

    # Step 0 - clustering within sample
    clustfiles = []
    for sample in samples:
        sample_reads = [
            read for read in reads
            if op.basename(read).split(".")[0] == sample
        ]
        clustfile = op.join(clustdir, sample + ".P{0}.clustS".format(pctid))
        cmd = " ".join([
            uclust + " cluster --cpus=8",
            "{0} {1}".format(sample, " ".join(sample_reads)),
            "--outdir={0}".format(clustdir),
            "--pctid={0}".format(pctid),
        ])
        mm.add(sample_reads, clustfile, cmd)
        clustfiles.append(clustfile)

    # Step 1 - make consensus within sample
    allcons = []
    for sample, clustfile in zip(samples, clustfiles):
        consensus = op.join(clustdir, sample + ".P{0}.consensus".format(pctid))
        cmd = uclust + " consensus" + " {0}".format(clustfile)
        mm.add(clustfile, consensus, cmd)
        allcons.append(consensus)

    joined_cons = " ".join(allcons)
    prefix_opt = " --prefix={0}".format(pf)

    # Step 2 - clustering across samples
    clustSfile = pf + ".P{0}.clustS".format(pctid)
    cmd = uclust + " mcluster {0}".format(joined_cons) + prefix_opt
    mm.add(allcons, clustSfile, cmd)

    # Step 3 - make consensus across samples
    locifile = pf + ".P{0}.loci".format(pctid)
    cmd = uclust + " mconsensus {0}".format(joined_cons) + prefix_opt
    mm.add(allcons + [clustSfile], locifile, cmd)

    mm.write()
def trf(args):
    """
    %prog trf outdir

    Run TRF on FASTA files.
    """
    from jcvi.apps.base import iglob

    cparams = "1 1 2 80 5 200 2000"

    # The docstring above is handed to the parser as its usage text.
    p = OptionParser(trf.__doc__)
    p.add_option("--mismatch", default=31, type="int",
                 help="Mismatch and gap penalty")
    p.add_option("--minscore", default=MINSCORE, type="int",
                 help="Minimum score to report")
    p.add_option("--period", default=6, type="int",
                 help="Maximum period to report")
    p.add_option("--lobstr", default=False, action="store_true",
                 help="Generate output for lobSTR")
    p.add_option("--telomeres", default=False, action="store_true",
                 help="Run telomere search: minscore=140 period=7")
    p.add_option("--centromeres", default=False, action="store_true",
                 help="Run centromere search: {}".format(cparams))
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    outdir, = args
    # Floor division keeps the threshold integral on Python 3 (plain `/`
    # would put a float into the awk filter below). It is derived from the
    # --minscore value before the telomere preset overrides it.
    minlength = opts.minscore // 2
    mm = MakeManager()
    if opts.telomeres:
        opts.minscore, opts.period = 140, 7

    params = "2 {0} {0} 80 10 {1} {2}".\
        format(opts.mismatch, opts.minscore, opts.period).split()
    if opts.centromeres:
        # The centromere preset replaces the parameter list wholesale.
        params = cparams.split()

    bedfiles = []
    for fastafile in natsorted(iglob(outdir, "*.fa,*.fasta")):
        pf = op.basename(fastafile).rsplit(".", 1)[0]
        # A recipe line starting with "-" tells make to ignore its errors
        cmd1 = "-trf {0} {1} -d -h".format(fastafile, " ".join(params))
        datfile = op.basename(fastafile) + "." + ".".join(params) + ".dat"
        bedfile = "{0}.trf.bed".format(pf)
        cmd2 = "cat {} | grep -v ^Parameters".format(datfile)
        if opts.lobstr:
            cmd2 += " | awk '($8 >= {} && $8 <= {})'".\
                format(minlength, READLEN - minlength)
        else:
            cmd2 += " | awk '($8 >= 0)'"
        # Turn spaces into tabs and prepend the sequence prefix as a column.
        cmd2 += " | sed 's/ /\\t/g'"
        cmd2 += " | awk '{{print \"{0}\\t\" $0}}' > {1}".format(pf, bedfile)
        mm.add(fastafile, datfile, cmd1)
        mm.add(datfile, bedfile, cmd2)
        bedfiles.append(bedfile)

    # Final target: all per-file BED outputs concatenated in natural order.
    bedfile = "trf.bed"
    cmd = "cat {0} > {1}".format(" ".join(natsorted(bedfiles)), bedfile)
    mm.add(bedfiles, bedfile, cmd)

    mm.write()
def minimac(args):
    """
    %prog batchminimac input.txt

    Use MINIMAC3 to impute vcf on all chromosomes.
    """
    # The docstring above is handed to the parser as its usage text.
    p = OptionParser(minimac.__doc__)
    p.set_home("shapeit")
    p.set_home("minimac")
    p.set_outfile()
    p.set_chr()
    p.set_ref()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    txtfile = args[0]
    ref = opts.ref
    mm = MakeManager()
    pf = txtfile.split(".")[0]
    input_is_vcf = txtfile.endswith(".vcf")
    allrawvcf, alloutvcf = [], []

    for chrom in opts.chr.split(","):
        px = CM[chrom]
        chrvcf = pf + ".{0}.vcf".format(px)

        # Per-chromosome VCF: sliced out of a VCF input, otherwise converted
        # from the 23andme text file.
        if input_is_vcf:
            extract = "".join([
                "vcftools --vcf {0} --chr {1}".format(txtfile, chrom),
                " --out {0}.{1} --recode".format(pf, px),
                " && mv {0}.{1}.recode.vcf {2}".format(pf, px, chrvcf),
            ])
        else:  # 23andme
            extract = "".join([
                "python -m jcvi.formats.vcf from23andme {0} {1}".format(
                    txtfile, chrom
                ),
                " --ref {0}".format(ref),
            ])
        mm.add(txtfile, chrvcf, extract)

        chrvcf_hg38 = pf + ".{0}.23andme.hg38.vcf".format(px)
        minimac_liftover(mm, chrvcf, chrvcf_hg38, opts)
        allrawvcf.append(chrvcf_hg38)

        # X has its own helper, Y and MT are passed through, and everything
        # else goes to the autosome helper.
        minimacvcf = "{0}.{1}.minimac.dose.vcf".format(pf, px)
        if chrom == "X":
            minimac_X(mm, chrom, chrvcf, opts)
        elif chrom in ("Y", "MT"):
            passthrough = "python -m jcvi.variation.impute passthrough"
            passthrough += " {0} {1}".format(chrvcf, minimacvcf)
            mm.add(chrvcf, minimacvcf, passthrough)
        else:
            minimac_autosome(mm, chrom, chrvcf, opts)

        # keep the best line for multi-allelic markers
        uniqvcf = "{0}.{1}.minimac.uniq.vcf".format(pf, px)
        dedup = "python -m jcvi.formats.vcf uniq {0} > {1}".format(
            minimacvcf, uniqvcf
        )
        mm.add(minimacvcf, uniqvcf, dedup)

        minimacvcf_hg38 = "{0}.{1}.minimac.hg38.vcf".format(pf, px)
        minimac_liftover(mm, uniqvcf, minimacvcf_hg38, opts)
        alloutvcf.append(minimacvcf_hg38)

    # Combined archives are only built when more than one chromosome ran.
    if len(allrawvcf) > 1:
        rawhg38vcfgz = pf + ".all.23andme.hg38.vcf.gz"
        concat = "vcf-concat {0} | bgzip > {1}".format(
            " ".join(allrawvcf), rawhg38vcfgz
        )
        mm.add(allrawvcf, rawhg38vcfgz, concat)

    if len(alloutvcf) > 1:
        outhg38vcfgz = pf + ".all.minimac.hg38.vcf.gz"
        concat = "vcf-concat {0} | bgzip > {1}".format(
            " ".join(alloutvcf), outhg38vcfgz
        )
        mm.add(alloutvcf, outhg38vcfgz, concat)

    mm.write()
def mappability(args):
    """
    %prog mappability reference.fasta

    Generate 50mer mappability for reference genome. Commands are based on gem
    mapper. See instructions:
    <https://github.com/xuefzhao/Reference.Mappability>
    """
    # The docstring above is handed to the parser as its usage text.
    p = OptionParser(mappability.__doc__)
    p.add_option("--mer", default=50, type="int", help="User mer size")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    ref = args[0]
    K = opts.mer
    pf = ref.rsplit(".", 1)[0]
    mer = pf + ".{}mer".format(K)

    # Each stage's output is the next stage's input.
    gem = pf + ".gem"
    mapb = mer + ".mappability"
    wig = mer + ".wig"
    bw = mer + ".bw"
    bg = mer + ".bedGraph"
    merged = mer + ".filtered-1.merge.bed"

    stages = (
        (ref, gem, "gem-indexer -i {} -o {}".format(ref, pf)),
        (gem, mapb, "gem-mappability -I {} -l {} -o {} -T {}".format(
            gem, K, mer, opts.cpus)),
        (mapb, wig, "gem-2-wig -I {} -i {} -o {}".format(gem, mapb, mer)),
        (wig, bw, "wigToBigWig {} {}.sizes {}".format(wig, mer, bw)),
        (bw, bg, "bigWigToBedGraph {} {}".format(bw, bg)),
        (bg, merged,
         "python -m jcvi.formats.bed filterbedgraph {} 1".format(bg)),
    )

    mm = MakeManager()
    for source, target, cmd in stages:
        mm.add(source, target, cmd)

    mm.write()
def lobstr(args):
    """
    %prog lobstr lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices. In
    addition, bamfile can be S3 location and --lobstr_home can be S3 location
    (e.g. s3://hli-mv-data-science/htang/str-build/lobSTR/)
    """
    # The docstring above is handed to the parser as its usage text.
    p = OptionParser(lobstr.__doc__)
    p.add_option("--chr", help="Run only this chromosome")
    p.set_home("lobstr", default="s3://hli-mv-data-science/htang/str-build/lobSTR/")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)
    bamfile = opts.input_bam_path

    if len(args) < 1 or bamfile is None:
        sys.exit(not p.print_help())

    lbindices = args
    s3mode = bamfile.startswith("s3")
    store = opts.output_path
    cleanup = not opts.nocleanup
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)

    lhome = opts.lobstr_home
    if lhome.startswith("s3://"):
        lhome = pull_from_s3(lhome, overwrite=False)

    # Output prefix: workflow/sample ids when given, else the BAM base name.
    exec_id, sample_id = opts.workflow_execution_id, opts.sample_id
    prefix = [x for x in (exec_id, sample_id) if x]
    if prefix:
        pf = "_".join(prefix)
    else:
        pf = bamfile.split("/")[-1].split(".")[0]

    if s3mode:
        # Skip everything when the result for the last index already exists.
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug("Object `{0}` exists. Computation skipped."
                          .format(remotegzfile))
            return

        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)

        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            # Try "<bam>.bai" first, then "<bam minus extension>.bai";
            # build the index locally when neither exists.
            remotebaifile = bamfile + ".bai"
            if check_exists_s3(remotebaifile):
                pull_from_s3(remotebaifile, localbaifile)
            else:
                # maxsplit=1 strips only the final extension; without it the
                # name was cut at the first dot anywhere in the S3 path.
                remotebaifile = bamfile.rsplit(".", 1)[0] + ".bai"
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                else:
                    logging.debug("BAM index cannot be found in S3!")
                    sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    # list(...) is required on Python 3, where range() cannot be
    # concatenated with a list.
    chrs = [opts.chr] if opts.chr else (list(range(1, 23)) + ["X", "Y"])
    for lbidx in lbindices:
        makefile = "makefile.{0}".format(lbidx)
        mm = MakeManager(filename=makefile)
        vcffiles = []
        for chrom in chrs:
            cmd, vcffile = allelotype_on_chr(bamfile, chrom, lhome, lbidx)
            mm.add(bamfile, vcffile, cmd)
            filteredvcffile = vcffile.replace(".vcf", ".filtered.vcf")
            cmd = "python -m jcvi.variation.str filtervcf {}".format(vcffile)
            cmd += " --lobstr_home {}".format(lhome)
            mm.add(vcffile, filteredvcffile, cmd)
            vcffiles.append(filteredvcffile)

        # Concatenate the per-chromosome filtered VCFs into one bgzipped file.
        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)

        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

    if cleanup:
        # `mm` is the manager of the last index processed in the loop above.
        mm.clean()
        sh("rm -f {} {} *.bai *.stats".format(bamfile, mm.makefile))
def lobstr(args):
    """
    %prog lobstr bamfile lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices.
    """
    # The docstring above is handed to the parser as its usage text.
    p = OptionParser(lobstr.__doc__)
    p.add_option("--chr", help="Run only this chromosome")
    p.add_option("--prefix", help="Use prefix file name")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    bamfile = args[0]
    lbindices = args[1:]
    s3mode = bamfile.startswith("s3")
    store = opts.store
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)

    # Output prefix: --prefix when given, else the BAM base name.
    pf = opts.prefix or bamfile.split("/")[-1].split(".")[0]

    if s3mode:
        # Skip everything when the result for the last index already exists.
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "s3://{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug("Object `{0}` exists. Computation skipped."
                          .format(remotegzfile))
            return

        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)

        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            # Try "<bam>.bai" first, then "<bam minus extension>.bai";
            # build the index locally when neither exists.
            remotebaifile = bamfile + ".bai"
            if check_exists_s3(remotebaifile):
                pull_from_s3(remotebaifile, localbaifile)
            else:
                # maxsplit=1 strips only the final extension; without it the
                # name was cut at the first dot anywhere in the S3 path.
                remotebaifile = bamfile.rsplit(".", 1)[0] + ".bai"
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                else:
                    logging.debug("BAM index cannot be found in S3!")
                    sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    lhome = opts.lobstr_home
    # list(...) is required on Python 3, where range() cannot be
    # concatenated with a list.
    chrs = [opts.chr] if opts.chr else (list(range(1, 23)) + ["X", "Y"])
    for lbidx in lbindices:
        mm = MakeManager(filename="makefile.{0}".format(lbidx))
        vcffiles = []
        for chrom in chrs:
            cmd, vcffile = allelotype_on_chr(bamfile, chrom, lhome, lbidx)
            mm.add(bamfile, vcffile, cmd)
            vcffiles.append(vcffile)

        # Concatenate the per-chromosome VCFs into one bgzipped file.
        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)

        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

    if opts.cleanup:
        # Runs inside workdir (see os.chdir above) and removes every file
        # in it.
        sh("rm -f *")
def novo2(args):
    """
    %prog novo2 trimmed projectname

    Reference-free tGBS pipeline v2.
    """
    # The docstring above is handed to the parser as its usage text.
    p = OptionParser(novo2.__doc__)
    p.set_fastq_names()
    p.set_align(pctid=94)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trimmed, pf = args
    pctid = opts.pctid
    reads, samples = scan_read_files(trimmed, opts.names)

    # Set up directory structure
    clustdir, acdir = "uclust", "allele_counts"
    for folder in (clustdir, acdir):
        mkdir(folder)

    mm = MakeManager()
    uclust = "python -m jcvi.apps.uclust"

    # Step 0 - clustering within sample
    clustfiles = []
    for sample in samples:
        sample_reads = [
            read for read in reads
            if op.basename(read).split(".")[0] == sample
        ]
        clustfile = op.join(clustdir, sample + ".P{0}.clustS".format(pctid))
        cmd = " ".join([
            uclust + " cluster --cpus=8",
            "{0} {1}".format(sample, " ".join(sample_reads)),
            "--outdir={0}".format(clustdir),
            "--pctid={0}".format(pctid),
        ])
        mm.add(sample_reads, clustfile, cmd)
        clustfiles.append(clustfile)

    # Step 1 - make consensus within sample
    allcons = []
    for sample, clustfile in zip(samples, clustfiles):
        consensus = op.join(clustdir, sample + ".P{0}.consensus".format(pctid))
        cmd = uclust + " consensus" + " {0}".format(clustfile)
        mm.add(clustfile, consensus, cmd)
        allcons.append(consensus)

    joined_cons = " ".join(allcons)
    prefix_opt = " --prefix={0}".format(pf)

    # Step 2 - clustering across samples
    clustSfile = pf + ".P{0}.clustS".format(pctid)
    cmd = uclust + " mcluster {0}".format(joined_cons) + prefix_opt
    mm.add(allcons, clustSfile, cmd)

    # Step 3 - make consensus across samples
    locifile = pf + ".P{0}.loci".format(pctid)
    cmd = uclust + " mconsensus {0}".format(joined_cons) + prefix_opt
    mm.add(allcons + [clustSfile], locifile, cmd)

    mm.write()