def star(args):
    """
    %prog star folder reference

    Run star on a folder with reads.
    """
    p = OptionParser(star.__doc__)
    p.add_option("--single", default=False, action="store_true",
                 help="Single end mapping")
    p.set_fastq_names()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    folder, reference = args
    cpus = opts.cpus
    mm = MakeManager()

    num = 1 if opts.single else 2
    gd = "GenomeDir"
    mkdir(gd)
    STAR = "STAR --runThreadN {0} --genomeDir {1}".format(cpus, gd)

    # Step 0: build genome index
    genomeidx = op.join(gd, "Genome")
    if need_update(reference, genomeidx):
        cmd = STAR + " --runMode genomeGenerate"
        cmd += " --genomeFastaFiles {0}".format(reference)
        mm.add(reference, genomeidx, cmd)

    # Step 1: align
    # p is the tuple of read files for each sample prefix
    for p, prefix in iter_project(folder, opts.names, num):
        pf = "{0}_star".format(prefix)
        bamfile = pf + "Aligned.sortedByCoord.out.bam"
        cmd = STAR + " --readFilesIn {0}".format(" ".join(p))
        if p[0].endswith(".gz"):
            cmd += " --readFilesCommand zcat"
        cmd += " --outSAMtype BAM SortedByCoordinate"
        cmd += " --outFileNamePrefix {0}".format(pf)
        cmd += " --twopassMode Basic"
        # Compatibility for cufflinks
        cmd += " --outSAMstrandField intronMotif"
        cmd += " --outFilterIntronMotifs RemoveNoncanonical"
        mm.add(p, bamfile, cmd)

    mm.write()
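
# A minimal usage sketch for star() above. The module path and file names are
# hypothetical; only the subcommand arguments come from the docstring.
#
#   python -m jcvi.projects.rnaseq star reads_folder/ genome.fasta --cpus=16
#   make -j 16
#
# star() does not run STAR itself: MakeManager collects source/target/command
# triples (genome index, then per-sample alignments) and writes them out as a
# Makefile, so the actual work happens when make is invoked and only stale
# targets are rebuilt.
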
def snpflow(args):
    """
    %prog snpflow trimmed reference.fasta

    Run SNP calling pipeline until allele_counts are generated. This includes
    generation of native files and the SNP_Het file. Speedup for fragmented
    genomes is also supported.
    """
    p = OptionParser(snpflow.__doc__)
    p.set_fastq_names()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trimmed, ref = args
    nseqs = len(Fasta(ref))
    supercat = nseqs >= 1000
    if supercat:
        logging.debug("Total seqs in ref: {0} (supercat={1})".
                      format(nseqs, supercat))

    reads, samples = scan_read_files(trimmed, opts.names)

    # Set up directory structure
    nativedir, countsdir = "native", "allele_counts"
    for d in (nativedir, countsdir):
        mkdir(d)

    mm = MakeManager()
    # Step 0 - index database
    db = op.join(*check_index(ref, supercat=supercat, go=False))
    cmd = "python -m jcvi.apps.gmap index {0}".format(ref)
    if supercat:
        cmd += " --supercat"
        coordsfile = db + ".coords"
        supercatfile = ref.rsplit(".", 1)[0] + ".supercat.fasta"
        mm.add(ref, (db, coordsfile), cmd)
    else:
        mm.add(ref, db, cmd)

    # Step 1 - GSNAP alignment and conversion to native file
    allnatives = []
    allsamstats = []
    gmapdb = supercatfile if supercat else ref
    for f in reads:
        prefix = get_prefix(f, ref)
        gsnapfile = op.join(nativedir, prefix + ".gsnap")
        nativefile = op.join(nativedir, prefix + ".unique.native")
        samstatsfile = op.join(nativedir, prefix + ".unique.sam.stats")
        cmd = "python -m jcvi.apps.gmap align {0} {1}".format(gmapdb, f)
        cmd += " --outdir={0} --native --cpus=1".format(nativedir)
        mm.add((f, db), nativefile, cmd)

        cmd = "python -m jcvi.apps.gmap bam {0} {1} --cpus=1".\
                format(gsnapfile, gmapdb)
        mm.add(nativefile, samstatsfile, cmd)
        allnatives.append(nativefile)
        allsamstats.append(samstatsfile)

    # Step 2 - call SNP discovery
    if supercat:
        nativeconverted = nativedir + "-converted"
        mkdir(nativeconverted)
        allnativesc = [op.join(nativeconverted, op.basename(x))
                       for x in allnatives]
        cmd = "tGBS-Convert_Pseudo_Genome_NATIVE_Coordinates.pl"
        cmd += " -i {0}/*.native -o {1}".format(nativedir, nativeconverted)
        cmd += " -c {0}".format(coordsfile)
        cmds = ["rm -rf {0}".format(nativeconverted), cmd]
        mm.add(allnatives + [coordsfile], allnativesc, cmds)

        runfile = "speedup.sh"
        write_file(runfile, speedupsh.format(nativeconverted, opts.cpus))
        nativedir = nativeconverted
        allsnps = [op.join(nativedir, "{0}.SNPs_Het.txt".format(x))
                   for x in samples]
        mm.add(allnativesc, allsnps, "./{0}".format(runfile))
    else:
        for s in samples:
            snpfile = op.join(nativedir, "{0}.SNPs_Het.txt".format(s))
            cmd = "SNP_Discovery-short.pl"
            cmd += " -native {0}/{1}.*unique.native".format(nativedir, s)
            cmd += " -o {0} -a 2 -ac 0.3 -c 0.8".format(snpfile)
            flist = [x for x in allnatives
                     if op.basename(x).split(".")[0] == s]
            mm.add(flist, snpfile, cmd)

    # Step 3 - generate equal file
    allsnps = [op.join(nativedir, "{0}.SNPs_Het.txt".format(x))
               for x in samples]
    for s in samples:
        equalfile = op.join(nativedir, "{0}.equal".format(s))
        cmd = "extract_reference_alleles.pl"
        cmd += " --native {0}/{1}.*unique.native".format(nativedir, s)
        cmd += " --genotype {0}/{1}.SNPs_Het.txt".format(nativedir, s)
        cmd += " --allgenotypes {0}/*.SNPs_Het.txt".format(nativedir)
        cmd += " --fasta {0} --output {1}".format(ref, equalfile)
        mm.add(allsnps, equalfile, cmd)

    # Step 4 - generate snp matrix
    allequals = [op.join(nativedir, "{0}.equal".format(x)) for x in samples]
    matrix = "snps.matrix.txt"
    cmd = "generate_matrix.pl"
    cmd += " --tables {0}/*SNPs_Het.txt --equal {0}/*equal".format(nativedir)
    cmd += " --fasta {0} --output {1}".format(ref, matrix)
    mm.add(allsnps + allequals, matrix, cmd)

    # Step 5 - generate allele counts
    allcounts = []
    for s in samples:
        allele_counts = op.join(countsdir,
                                "{0}.SNPs_Het.allele_counts".format(s))
        cmd = "count_reads_per_allele.pl -m snps.matrix.txt"
        cmd += " -s {0} --native {1}/{0}.*unique.native".format(s, nativedir)
        cmd += " -o {0}".format(allele_counts)
        mm.add(matrix, allele_counts, cmd)
        allcounts.append(allele_counts)

    # Step 6 - generate raw snps
    rawsnps = "Genotyping.H3.txt"
    cmd = "/home/shared/scripts/delin/SamplesGenotyping.pl --homo 3"
    cmd += " -pf allele_counts -f {0} --outfile {1}".format(countsdir, rawsnps)
    cmds = ["rm -f {0}".format(rawsnps), cmd]
    mm.add(allcounts, rawsnps, cmds)

    # Step 7 - generate alignment report
    sam_summary = "sam.summary"
    cmd = "/home/shared/scripts/eddyyeh/alignment_stats.pl"
    cmd += " -f {0} -o {1}".format(" ".join(allsamstats), sam_summary)
    mm.add(allsamstats, sam_summary, cmd)

    native_summary = "native.summary"
    cmd = "/home/shared/scripts/eddyyeh/alignment_stats.pl"
    cmd += " -n {0} -o {1}".format(" ".join(allnatives), native_summary)
    mm.add(allnatives, native_summary, cmd)

    mm.write()
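
# Rough shape of the Makefile that snpflow() emits (illustrative only; the
# real target names depend on the sample prefixes and on whether the supercat
# speedup path is taken):
#
#   native/<sample>.unique.native                   <- jcvi.apps.gmap align
#   native/<sample>.SNPs_Het.txt                    <- SNP_Discovery-short.pl
#                                                      (or ./speedup.sh)
#   native/<sample>.equal                           <- extract_reference_alleles.pl
#   snps.matrix.txt                                 <- generate_matrix.pl
#   allele_counts/<sample>.SNPs_Het.allele_counts   <- count_reads_per_allele.pl
#   Genotyping.H3.txt, sam.summary, native.summary  <- final reports
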
def novo2(args):
    """
    %prog novo2 trimmed projectname

    Reference-free tGBS pipeline v2.
    """
    p = OptionParser(novo2.__doc__)
    p.set_fastq_names()
    p.set_align(pctid=94)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trimmed, pf = args
    pctid = opts.pctid
    reads, samples = scan_read_files(trimmed, opts.names)

    # Set up directory structure
    clustdir = "uclust"
    acdir = "allele_counts"
    for d in (clustdir, acdir):
        mkdir(d)

    mm = MakeManager()
    clustfiles = []
    # Step 0 - clustering within sample
    for s in samples:
        flist = [x for x in reads if op.basename(x).split(".")[0] == s]
        outfile = s + ".P{0}.clustS".format(pctid)
        outfile = op.join(clustdir, outfile)
        cmd = "python -m jcvi.apps.uclust cluster --cpus=8"
        cmd += " {0} {1}".format(s, " ".join(flist))
        cmd += " --outdir={0}".format(clustdir)
        cmd += " --pctid={0}".format(pctid)
        mm.add(flist, outfile, cmd)
        clustfiles.append(outfile)

    # Step 1 - make consensus within sample
    allcons = []
    for s, clustfile in zip(samples, clustfiles):
        outfile = s + ".P{0}.consensus".format(pctid)
        outfile = op.join(clustdir, outfile)
        cmd = "python -m jcvi.apps.uclust consensus"
        cmd += " {0}".format(clustfile)
        mm.add(clustfile, outfile, cmd)
        allcons.append(outfile)

    # Step 2 - clustering across samples
    clustSfile = pf + ".P{0}.clustS".format(pctid)
    cmd = "python -m jcvi.apps.uclust mcluster {0}".format(" ".join(allcons))
    cmd += " --prefix={0}".format(pf)
    mm.add(allcons, clustSfile, cmd)

    # Step 3 - make consensus across samples
    locifile = pf + ".P{0}.loci".format(pctid)
    cmd = "python -m jcvi.apps.uclust mconsensus {0}".format(" ".join(allcons))
    cmd += " --prefix={0}".format(pf)
    mm.add(allcons + [clustSfile], locifile, cmd)

    mm.write()
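
# Illustrative invocation of novo2() (module path and folder names are
# assumptions; the --pctid flag is inferred from p.set_align(pctid=94)):
#
#   python -m jcvi.projects.tgbs novo2 trimmed/ myproject --pctid=94
#   make -j 8
#
# The resulting Makefile chains, per sample, within-sample clustering and
# consensus calling, then the cross-sample mcluster/mconsensus steps that
# finally produce myproject.P94.loci.
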
def prepare(args):
    """
    %prog prepare [--options] folder [--bam rnaseq.coordSorted.bam]

    Run Trinity on a folder of reads. When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain the patterns
    ("_1_" and "_2_") or (".1." and ".2.") or ("_1." and "_2.").

    By default, prepare a script for DN-Trinity.

    If a coord-sorted BAM is provided, prepare a script for GG-Trinity, using
    the BAM as starting point.

    Newer versions of trinity can take multiple fastq files as input.
    If "--merge" is specified, the fastq files are merged together before
    assembling.
    """
    p = OptionParser(prepare.__doc__)
    p.add_option("--paired", default=False, action="store_true",
                 help="Paired-end mode [default: %default]")
    p.add_option("--merge", default=False, action="store_true",
                 help="Merge individual input fastq's into left/right/single"
                      " file(s) [default: %default]")
    p.set_trinity_opts()
    p.set_fastq_names()
    p.set_grid()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    inparam, = args[:1]
    paired = opts.paired
    merge = opts.merge
    trinity_home = opts.trinity_home
    hpc_grid_runner_home = opts.hpcgridrunner_home

    method = "DN"
    bam = opts.bam
    if bam and op.exists(bam):
        bam = op.abspath(bam)
        method = "GG"

    pf = inparam.split(".")[0]
    tfolder = "{0}_{1}".format(pf, method)

    cwd = os.getcwd()
    mkdir(tfolder)
    os.chdir(tfolder)

    cmds = []

    # set TRINITY_HOME env variable when preparing shell script
    env_cmd = 'export TRINITY_HOME="{0}"'.format(trinity_home)
    cmds.append(env_cmd)

    if method == "DN":
        assert op.exists("../" + inparam)

        flist = iglob("../" + inparam, opts.names)
        if paired:
            f1 = [x for x in flist
                  if "_1_" in x or ".1." in x or "_1." in x or "_R1" in x]
            f2 = [x for x in flist
                  if "_2_" in x or ".2." in x or "_2." in x or "_R2" in x]
            assert len(f1) == len(f2)
            if merge:
                r1, r2 = "left.fastq", "right.fastq"
                reads = ((f1, r1), (f2, r2))
        else:
            if merge:
                r = "single.fastq"
                reads = ((flist, r), )

        if merge:
            for fl, r in reads:
                fm = FileMerger(fl, r)
                fm.merge(checkexists=True)

    cmd = op.join(trinity_home, "Trinity")
    cmd += " --seqType fq --max_memory {0} --CPU {1}".format(
        opts.max_memory, opts.cpus)
    cmd += " --min_contig_length {0}".format(opts.min_contig_length)
    if opts.bflyGCThreads:
        cmd += " --bflyGCThreads {0}".format(opts.bflyGCThreads)

    if method == "GG":
        cmd += " --genome_guided_bam {0}".format(bam)
        cmd += " --genome_guided_max_intron {0}".format(opts.max_intron)
    else:
        if paired:
            if merge:
                cmd += " --left {0} --right {1}".format(reads[0][-1],
                                                        reads[1][-1])
            else:
                cmd += " --left {0}".format(",".join(f1))
                cmd += " --right {0}".format(",".join(f2))
        else:
            if merge:
                cmd += " --single {0}".format(reads[0][-1])
            else:
                for f in flist:
                    cmd += " --single {0}".format(f)

    if opts.grid and opts.grid_conf_file:
        hpc_grid_runner = op.join(hpc_grid_runner_home,
                                  "hpc_cmds_GridRunner.pl")
        hpc_grid_conf_file = op.join(hpc_grid_runner_home, "hpc_conf",
                                     opts.grid_conf_file)
        assert op.exists(hpc_grid_conf_file), \
            "HpcGridRunner conf file does not exist: {0}".format(
                hpc_grid_conf_file)
        cmd += ' --grid_exec "{0} --grid_conf {1} -c"'.format(
            hpc_grid_runner, hpc_grid_conf_file)

    if opts.extra:
        cmd += " {0}".format(opts.extra)

    cmds.append(cmd)

    if opts.cleanup:
        cleanup_cmd = 'rm -rf !("Trinity.fasta"|"Trinity.gene_trans_map"|"Trinity.timing")' \
            if method == "DN" else \
            'rm -rf !("Trinity-GG.fasta"|"Trinity-GG.gene_trans_map"|"Trinity.timing")'
        # append to the command list, not the command string
        cmds.append(cleanup_cmd)

    runfile = "run.sh"
    write_file(runfile, "\n".join(cmds))
    os.chdir(cwd)
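
# Sketch of the run.sh that prepare() writes for a DN (de novo) run with
# --paired --merge; the Trinity path and resource values are placeholders:
#
#   export TRINITY_HOME="/path/to/trinityrnaseq"
#   /path/to/trinityrnaseq/Trinity --seqType fq --max_memory 32G --CPU 16 \
#       --min_contig_length 200 --left left.fastq --right right.fastq
#
# The script is written inside the <prefix>_DN (or <prefix>_GG) folder; this
# function only prepares it and does not execute it.
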