def batchseeds(args):
    """
    %prog batchseeds folder

    Extract seed metrics for each image in a directory.
    """
    # NOTE: the docstring is handed to OptionParser below, so maintainer
    # notes live in comments to keep the help output unchanged.
    from jcvi.formats.pdf import cat

    # Everything after the first argument is forwarded verbatim to each
    # per-image `seeds` call.
    xargs = args[1:]
    p = OptionParser(batchseeds.__doc__)
    opts, args, iopts = add_seeds_options(p, args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    folder = folder.rstrip('/')
    outdir = folder + "-debug"
    outfile = folder + "-output.tsv"
    assert op.isdir(folder)

    # Use the explicit calibration file, else `calibrate.json` in the folder,
    # else run without calibration.
    jsonfile = opts.calibrate or op.join(folder, "calibrate.json")
    if not op.exists(jsonfile):
        jsonfile = None

    # Suffixes skipped below (presumably by-products of earlier runs).
    derived = (".resize.jpg", ".main.jpg", ".label.jpg")
    images = []
    for im in iglob(folder, "*.jpg", "*.JPG", "*.png"):
        if im.endswith(derived):
            continue
        if op.basename(im).startswith("calibrate"):
            continue
        images.append(im)

    fw = must_open(outfile, 'w')
    nseeds = 0
    try:
        # fw.write() works on both Python 2 and 3, unlike `print >> fw`.
        fw.write(str(Seed.header(calibrate=jsonfile)) + "\n")
        for im in images:
            imargs = [im, "--noheader", "--outdir={0}".format(outdir)] + xargs
            if jsonfile:
                imargs += ["--calibrate={0}".format(jsonfile)]
            objects = seeds(imargs)
            for o in objects:
                fw.write(str(o) + "\n")
            nseeds += len(objects)
    finally:
        # Close the table even when one of the images fails to process.
        fw.close()

    logging.debug("Processed {0} images.".format(len(images)))
    logging.debug("A total of {0} objects written to `{1}`.".
                  format(nseeds, outfile))

    # Concatenate the per-image debug PDFs into a single document.
    pdfs = iglob(outdir, "*.pdf")
    outpdf = folder + "-output.pdf"
    cat(pdfs + ["--outfile={0}".format(outpdf)])

    logging.debug("Debugging information written to `{0}`.".format(outpdf))
    return outfile
def batchseeds(args):
    """
    %prog batchseeds folder

    Extract seed metrics for each image in a directory.
    """
    # NOTE: the docstring is handed to OptionParser below, so maintainer
    # notes live in comments to keep the help output unchanged.
    from jcvi.formats.pdf import cat

    # Everything after the first argument is forwarded verbatim to each
    # per-image `seeds` call.
    xargs = args[1:]
    p = OptionParser(batchseeds.__doc__)
    opts, args, iopts = add_seeds_options(p, args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    folder = folder.rstrip('/')
    outdir = folder + "-debug"
    outfile = folder + "-output.tsv"
    assert op.isdir(folder)

    # Use the explicit calibration file, else `calibrate.json` in the folder,
    # else run without calibration.
    jsonfile = opts.calibrate or op.join(folder, "calibrate.json")
    if not op.exists(jsonfile):
        jsonfile = None

    # Suffixes skipped below (presumably by-products of earlier runs).
    derived = (".resize.jpg", ".main.jpg", ".label.jpg")
    images = []
    for im in iglob(folder, "*.jpg,*.JPG,*.png"):
        if im.endswith(derived):
            continue
        if op.basename(im).startswith("calibrate"):
            continue
        images.append(im)

    fw = must_open(outfile, 'w')
    nseeds = 0
    try:
        # fw.write() works on both Python 2 and 3, unlike `print >> fw`.
        fw.write(str(Seed.header(calibrate=jsonfile)) + "\n")
        for im in images:
            imargs = [im, "--noheader", "--outdir={0}".format(outdir)] + xargs
            if jsonfile:
                imargs += ["--calibrate={0}".format(jsonfile)]
            objects = seeds(imargs)
            for o in objects:
                fw.write(str(o) + "\n")
            nseeds += len(objects)
    finally:
        # Close the table even when one of the images fails to process.
        fw.close()

    logging.debug("Processed {0} images.".format(len(images)))
    logging.debug("A total of {0} objects written to `{1}`.".
                  format(nseeds, outfile))

    # Concatenate the per-image debug PDFs into a single document.
    pdfs = iglob(outdir, "*.pdf")
    outpdf = folder + "-output.pdf"
    cat(pdfs + ["--outfile={0}".format(outpdf)])

    logging.debug("Debugging information written to `{0}`.".format(outpdf))
    return outfile
def compile(args):
    """
    %prog compile directory

    Extract telomere length and ccn.
    """
    # NOTE: the docstring is handed to OptionParser below; keep notes in
    # comments so the help output is unchanged.
    p = OptionParser(compile.__doc__)
    p.set_outfile(outfile="age.tsv")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    dfs = []
    for folder in args:
        ofolder = os.listdir(folder)

        # telomeres: first row of <folder>/telomeres*/tel_lengths.txt
        # (IndexError if no such sub-directory exists, as before).
        subdir = [x for x in ofolder if x.startswith("telomeres")][0]
        subdir = op.join(folder, subdir)
        filename = op.join(subdir, "tel_lengths.txt")
        df = pd.read_csv(filename, sep="\t")
        # `.ix` was removed from pandas; with the default integer index that
        # read_csv creates, `.iloc[0]` selects the same first row.
        d1 = df.iloc[0].to_dict()

        # ccn: merge the keys of the first *.ccn.json file into the same row
        subdir = [x for x in ofolder if x.startswith("ccn")][0]
        subdir = op.join(folder, subdir)
        filename = iglob(subdir, "*.ccn.json")[0]
        with open(filename) as fp:
            js = json.load(fp)
        d1.update(js)

        df = pd.DataFrame(d1, index=[0])
        dfs.append(df)

    # One row per input folder.
    df = pd.concat(dfs, ignore_index=True)
    df.to_csv(opts.outfile, sep="\t", index=False)
def agp(args):
    """
    %prog agp main_results/ contigs.fasta

    Generate AGP file based on LACHESIS output.
    """
    parser = OptionParser(agp.__doc__)
    parser.set_outfile()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    odir, contigsfasta = args
    fwagp = must_open(opts.outfile, 'w')
    orderingfiles = natsorted(iglob(odir, "*.ordering"))
    sizes = Sizes(contigsfasta).mapping
    contigs = set(sizes.keys())

    # Write one AGP object per ordering file and remember which contigs
    # were placed.
    anchored = set()
    for ordering_path in orderingfiles:
        ordering = ContigOrdering(ordering_path)
        anchored.update([entry.contig_name for entry in ordering])
        object_name = op.basename(ordering_path).split('.')[0]
        ordering.write_agp(object_name, sizes, fwagp)

    # Contigs missing from every ordering are emitted on their own with
    # unknown ("?") orientation.
    singletons = contigs.difference(anchored)
    message = 'Anchored: {}, Singletons: {}'
    logging.debug(message.format(len(anchored), len(singletons)))

    for contig in natsorted(singletons):
        order_to_agp(contig, [(contig, "?")], sizes, fwagp)
def traits(args):
    """
    %prog traits directory

    Make HTML page that reports eye and skin color.
    """
    # NOTE: the docstring is handed to OptionParser below; keep notes in
    # comments so the help output is unchanged.
    p = OptionParser(traits.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    samples = []
    for folder in args:
        targets = iglob(folder, "*-traits.json")
        if not targets:
            continue
        filename = targets[0]
        with open(filename) as fp:
            js = json.load(fp)
        # Attach an RGB value for each trait, built from its L/A/B entries.
        for trait, key in (("skin-color", "skin_rgb"), ("eye-color", "eye_rgb")):
            lab = js["traits"][trait]
            js[key] = make_rgb(lab["L"], lab["A"], lab["B"])
        samples.append(js)

    # Render before opening the report so a template failure does not leave
    # a truncated report.html behind.
    template = Template(traits_template)
    html = template.render(samples=samples)
    with open("report.html", "w") as fw:
        # fw.write() works on both Python 2 and 3, unlike `print >> fw`.
        fw.write(html + "\n")
        logging.debug("Report written to `{}`".format(fw.name))
def compilevcf(args):
    """
    %prog compilevcf dir

    Compile vcf outputs into lists.
    """
    # NOTE: the docstring is handed to OptionParser below; keep notes in
    # comments so the help output is unchanged.
    from jcvi.variation.str import LobSTRvcf

    p = OptionParser(compilevcf.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    vcf_files = iglob(folder, "*.vcf,*.vcf.gz")
    for vcf_file in vcf_files:
        try:
            calls = LobSTRvcf(columnidsfile=None)
            calls.parse(vcf_file, filtered=False)
            # list() keeps the first-item lookup working on Python 3, where
            # items() is a non-indexable view; indexing it raised TypeError,
            # which the handler below swallowed silently.
            items = list(calls.items())
            if items:
                k, v = items[0]
                res = v.replace(',', '/')
            else:
                res = "-1/-1"
            num = op.basename(vcf_file).split(".")[0]
            # Single-argument print() behaves the same on Python 2 and 3.
            print("{0} {1}".format(num, res))
        except (TypeError, AttributeError):
            # NOTE(review): the original built a TREDPARSEvcf here and threw
            # the result away; the call is kept in case construction has side
            # effects - confirm whether its output should be reported.
            TREDPARSEvcf(vcf_file)
            continue
def agp(args):
    """
    %prog agp main_results/ contigs.fasta

    Generate AGP file based on LACHESIS output.
    """
    parser = OptionParser(agp.__doc__)
    parser.set_outfile()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    odir, contigsfasta = args
    fwagp = must_open(opts.outfile, 'w')
    orderingfiles = natsorted(iglob(odir, "*.ordering"))
    sizes = Sizes(contigsfasta).mapping
    contigs = set(sizes.keys())

    # Write one AGP object per ordering file and remember which contigs
    # were placed.
    anchored = set()
    for ordering_path in orderingfiles:
        ordering = ContigOrdering(ordering_path)
        anchored.update([entry.contig_name for entry in ordering])
        object_name = op.basename(ordering_path).split('.')[0]
        ordering.write_agp(object_name, sizes, fwagp)

    # Contigs missing from every ordering are emitted on their own with
    # unknown ("?") orientation.
    singletons = contigs.difference(anchored)
    message = 'Anchored: {}, Singletons: {}'
    logging.debug(message.format(len(anchored), len(singletons)))

    for contig in natsorted(singletons):
        order_to_agp(contig, [(contig, "?")], sizes, fwagp)
def scan_read_files(trimmed, patterns):
    """Collect read files under `trimmed` and the sample names they imply.

    A sample name is the part of a file's basename before the first dot.
    Returns `(reads, samples)`: the `iglob` result and the sorted unique
    sample names.
    """
    reads = iglob(trimmed, patterns)
    names = {op.basename(path).split(".")[0] for path in reads}
    samples = sorted(names)
    message = "Total {0} read files from {1} samples"
    logging.debug(message.format(len(reads), len(samples)))
    return reads, samples
def traits(args):
    """
    %prog traits directory

    Make HTML page that reports eye and skin color.
    """
    # NOTE: the docstring is handed to OptionParser below; keep notes in
    # comments so the help output is unchanged.
    p = OptionParser(traits.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    samples = []
    for folder in args:
        targets = iglob(folder, "*-traits.json")
        if not targets:
            continue
        filename = targets[0]
        with open(filename) as fp:
            js = json.load(fp)
        # Attach an RGB value for each trait, built from its L/A/B entries.
        for trait, key in (("skin-color", "skin_rgb"), ("eye-color", "eye_rgb")):
            lab = js["traits"][trait]
            js[key] = make_rgb(lab["L"], lab["A"], lab["B"])
        samples.append(js)

    # Render before opening the report so a template failure does not leave
    # a truncated report.html behind; `with` closes the file on any error.
    template = Template(traits_template)
    html = template.render(samples=samples)
    with open("report.html", "w") as fw:
        print(html, file=fw)
        logging.debug("Report written to `{}`".format(fw.name))
def mergebam(args):
    """
    %prog mergebam dir1 dir2 homo_outdir
    or
    %prog mergebam dir1 dir2/20.bam het_outdir

    Merge sets of BAMs to make diploid. Two modes:
    - Homozygous mode: pair-up the bams in the two folders and merge
    - Heterozygous mode: pair the bams in first folder with a particular bam
    """
    # NOTE: the docstring is handed to OptionParser below; keep notes in
    # comments so the help output is unchanged.
    p = OptionParser(mergebam.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    idir1, idir2, outdir = args
    # Each input is either a single .bam file or a folder searched for *.bam.
    dir1 = [idir1] if idir1.endswith(".bam") else iglob(idir1, "*.bam")
    dir2 = [idir2] if idir2.endswith(".bam") else iglob(idir2, "*.bam")

    # Make sure more or the same number of bams in first pile.  The counts
    # are taken after the swap; previously they went stale, so a smaller
    # first pile always failed the final assertion.
    if len(dir1) < len(dir2):
        dir1, dir2 = dir2, dir1
    nbams1 = len(dir1)
    nbams2 = len(dir2)

    if nbams1 == nbams2:
        logging.debug("Homozygous mode")
    else:
        assert nbams2 == 1, "Second pile must contain a single bam"
        # Replicate the single bam itself; the original repeated the
        # command-line argument, which is wrong when it names a folder.
        dir2 = dir2 * nbams1

    assert len(dir1) == len(dir2), "Two piles must contain same number of bams"
    cmd = "samtools merge {} {} {} && samtools index {}"
    cmds = []
    mkdir(outdir)
    for a, b in zip(dir1, dir2):
        ia = op.basename(a).split(".")[0]
        ib = op.basename(b).split(".")[0]
        outfile = op.join(outdir, "{}_{}.bam".format(ia, ib))
        cmds.append(cmd.format(outfile, a, b, outfile))

    jobs = Parallel(cmds, cpus=opts.cpus)
    jobs.run()
def iter_project(folder, pattern, n=2):
    """Yield `(files, prefix)` for each complete group of `n` read files.

    Files matching `pattern` under `folder` are taken `n` at a time, in the
    order `iglob` returns them.  `prefix` is what `pairspf` derives from the
    basenames of the group.
    """
    # Check for paired reads and extract project id
    filelist = list(iglob(folder, pattern))
    for p in grouper(filelist, n):
        # A trailing short group may be padded with None by `grouper`;
        # skip it instead of crashing in op.basename(None).
        if len(p) != n or None in p:
            continue

        pp = [op.basename(x) for x in p]
        pf = pairspf(pp)
        yield list(p), pf
def iter_project(folder, pattern="*.fq,*.fq.gz,*.fastq,*.fastq.gz", n=2):
    """Yield `(files, prefix)` for each complete group of `n` read files.

    Files matching `pattern` under `folder` are taken `n` at a time; groups
    that are short or contain a None filler are skipped.  `prefix` is what
    `pairspf` derives from the basenames of the group.
    """
    matched = list(iglob(folder, pattern))
    for group in grouper(matched, n):
        is_complete = len(group) == n and None not in group
        if not is_complete:
            continue

        basenames = [op.basename(path) for path in group]
        prefix = pairspf(basenames)
        yield list(group), prefix
def trf(args):
    """
    %prog trf outdir

    Run TRF on FASTA files.
    """
    from jcvi.apps.base import iglob

    parser = OptionParser(trf.__doc__)
    # (flag, default, help) for the integer-valued options.
    int_options = (
        ("--mismatch", 31, "Mismatch and gap penalty"),
        ("--minscore", MINSCORE, "Minimum score to report"),
        ("--period", 6, "Maximum period to report"),
    )
    for flag, default, text in int_options:
        parser.add_option(flag, default=default, type="int", help=text)
    parser.add_option("--telomeres", default=False, action="store_true",
                      help="Run telomere search: minscore=140 period=7")
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    outdir, = args
    mm = MakeManager()
    if opts.telomeres:
        opts.minscore, opts.period = 140, 7

    template = "2 {0} {0} 80 10 {1} {2}"
    params = template.format(opts.mismatch, opts.minscore, opts.period).split()
    # The same parameter list appears space-separated on the trf command
    # line and dot-separated in the name of the .dat file.
    param_args = " ".join(params)
    param_tag = ".".join(params)

    bedfiles = []
    for fastafile in natsorted(iglob(outdir, "*.fa,*.fasta")):
        base = op.basename(fastafile)
        pf = base.split(".")[0]
        cmd1 = "trf {0} {1} -d -h".format(fastafile, param_args)
        datfile = base + "." + param_tag + ".dat"
        bedfile = "{0}.trf.bed".format(pf)
        # Filter rows, turn spaces into tabs, prefix each row with the name.
        stages = [
            "cat {} | awk '($8 <= {} && $9 >= 0)'".format(datfile, READLEN),
            " | sed 's/ /\\t/g'",
            " | awk '{{print \"{0}\\t\" $0}}' > {1}".format(pf, bedfile),
        ]
        cmd2 = "".join(stages)
        mm.add(fastafile, datfile, cmd1)
        mm.add(datfile, bedfile, cmd2)
        bedfiles.append(bedfile)

    # Final target: concatenate the per-file BED outputs.
    merged = "trf.bed"
    cmd = "cat {0} > {1}".format(" ".join(natsorted(bedfiles)), merged)
    mm.add(bedfiles, merged, cmd)

    mm.write()
def iter_project(folder, pattern="*.fq,*.fq.gz,*.fastq,*.fastq.gz", n=2,
                 commonprefix=True):
    """Yield `(files, prefix)` for each complete group of `n` read files.

    Files matching `pattern` under `folder` are taken `n` at a time; groups
    that are short or contain a None filler are skipped.  `files` is the
    sorted group and `prefix` is what `pairspf` derives from the basenames,
    with `commonprefix` forwarded to it.
    """
    matched = list(iglob(folder, pattern))
    for group in grouper(matched, n):
        is_complete = len(group) == n and None not in group
        if not is_complete:
            continue

        basenames = [op.basename(path) for path in group]
        prefix = pairspf(basenames, commonprefix=commonprefix)
        yield sorted(group), prefix
def cufflinks(args):
    """
    %prog cufflinks folder reference

    Run cufflinks on a folder containing tophat results.
    """
    parser = OptionParser(cufflinks.__doc__)
    parser.add_option("--gtf", help="Reference annotation [default: %default]")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    folder, reference = args
    cpus, gtf = opts.cpus, opts.gtf
    transcripts = "transcripts.gtf"
    # Optional annotation flag, shared by the cufflinks and cuffmerge commands.
    guide = [" -g {0}".format(gtf)] if gtf else []

    mm = MakeManager()
    gtfs = []
    # One cufflinks job per BAM, each writing into <prefix>_cufflinks/.
    for bam in iglob(folder, "*.bam"):
        pf = op.basename(bam).split(".")[0]
        outdir = pf + "_cufflinks"
        pieces = ["cufflinks", " -o {0}".format(outdir), " -p {0}".format(cpus)]
        pieces.extend(guide)
        pieces.append(" --frag-bias-correct {0}".format(reference))
        pieces.append(" --multi-read-correct")
        pieces.append(" {0}".format(bam))
        cgtf = op.join(outdir, transcripts)
        mm.add(bam, cgtf, "".join(pieces))
        gtfs.append(cgtf)

    # List every transcripts.gtf for cuffmerge.
    assemblylist = "assembly_list.txt"
    find_cmd = 'find . -name "{0}" > {1}'.format(transcripts, assemblylist)
    mm.add(gtfs, assemblylist, find_cmd)

    mergedgtf = "merged/merged.gtf"
    pieces = ["cuffmerge", " -o merged", " -p {0}".format(cpus)]
    pieces.extend(guide)
    pieces.append(" -s {0}".format(reference))
    pieces.append(" {0}".format(assemblylist))
    mm.add(assemblylist, mergedgtf, "".join(pieces))

    mm.write()
def trf(args):
    """
    %prog trf outdir

    Run TRF on FASTA files.
    """
    # NOTE: the docstring is handed to OptionParser below; keep notes in
    # comments so the help output is unchanged.
    from jcvi.apps.base import iglob

    p = OptionParser(trf.__doc__)
    p.add_option("--mismatch", default=31, type="int",
                 help="Mismatch and gap penalty")
    p.add_option("--minscore", default=MINSCORE, type="int",
                 help="Minimum score to report")
    p.add_option("--period", default=6, type="int",
                 help="Maximum period to report")
    # Floor division keeps the default an int on Python 3; optparse does not
    # coerce non-string defaults, so `/` would leak a float into the awk
    # filter below.
    p.add_option("--minlength", default=MINSCORE // 2, type="int",
                 help="Minimum length of repeat tract")
    p.add_option("--telomeres", default=False, action="store_true",
                 help="Run telomere search: minscore=140 period=7")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    outdir, = args
    minlength = opts.minlength
    mm = MakeManager()
    if opts.telomeres:
        opts.minscore, opts.period = 140, 7

    params = "2 {0} {0} 80 10 {1} {2}".\
        format(opts.mismatch, opts.minscore, opts.period).split()
    # The same parameter list appears space-separated on the trf command
    # line and dot-separated in the name of the .dat file.
    param_args = " ".join(params)
    param_tag = ".".join(params)

    bedfiles = []
    for fastafile in natsorted(iglob(outdir, "*.fa,*.fasta")):
        pf = op.basename(fastafile).split(".")[0]
        cmd1 = "trf {0} {1} -d -h".format(fastafile, param_args)
        datfile = op.basename(fastafile) + "." + param_tag + ".dat"
        bedfile = "{0}.trf.bed".format(pf)
        # Keep rows whose column 8 lies in [minlength, READLEN - minlength],
        # turn spaces into tabs, and prefix each row with the file name.
        cmd2 = "cat {} | awk '($8 >= {} && $8 <= {})'".\
            format(datfile, minlength, READLEN - minlength)
        cmd2 += " | sed 's/ /\\t/g'"
        cmd2 += " | awk '{{print \"{0}\\t\" $0}}' > {1}".format(pf, bedfile)
        mm.add(fastafile, datfile, cmd1)
        mm.add(datfile, bedfile, cmd2)
        bedfiles.append(bedfile)

    # Final target: concatenate the per-file BED outputs.
    bedfile = "trf.bed"
    cmd = "cat {0} > {1}".format(" ".join(natsorted(bedfiles)), bedfile)
    mm.add(bedfiles, bedfile, cmd)

    mm.write()
def gallery(args):
    """
    %prog gallery folder link_prefix

    Convert a folder of figures to a HTML table. For example:

    $ python -m jcvi.formats.html gallery Paper-figures/
    https://dl.dropboxusercontent.com/u/15937715/Data/Paper-figures/

    Maps the images from local to remote.
    """
    from more_itertools import grouper
    from jcvi.apps.base import iglob

    parser = OptionParser(gallery.__doc__)
    parser.add_option("--columns", default=3, type="int",
                      help="How many cells per row")
    parser.add_option("--width", default=200, type="int", help="Image width")
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    folder, link_prefix = args
    width = opts.width
    images = iglob(folder, "*.jpg,*.JPG,*.png")
    cell = '<td>{0}<br><a href="{1}"><img src="{1}" width="{2}"></a></td>'
    row_open = '<tr height="{0}" valign="top">'.format(width + 5)
    # Remote location with any trailing slashes removed.
    remote = link_prefix.rstrip("/")

    print("<table>")
    for row_images in grouper(images, opts.columns):
        print(row_open)
        # Falsy entries (fillers in a short last row) produce no cell.
        for path in row_images:
            if path:
                name = op.basename(path)
                caption = name.split(".")[0].replace("_", "-")
                print(cell.format(caption, remote + "/" + name, width))
        print("</tr>")
    print("</table>")
def __init__(self, filename, delimiter=','):
    """Load the layout from `filename`, creating a default one if missing.

    When `filename` does not exist, a template is written first with one
    entry per `*.ks` file found by `iglob(".", "*.ks")`.  Every
    non-comment row is then parsed into a `LayoutLine` using `delimiter`,
    and colors and markers are assigned.
    """
    super(Layout, self).__init__(filename)
    if not op.exists(filename):
        ksfiles = iglob(".", "*.ks")
        header = "Ks file|ncomponents|label|color|marker".split("|")
        contents = []
        for ksfile in ksfiles:
            # Label is the basename without its extension; "a.b" is shown
            # as "a *vs.* b".
            leg = op.basename(ksfile).rsplit(".", 1)[0]
            if leg.count(".") == 1:
                leg = leg.replace(".", " *vs.* ")
            contents.append((ksfile, "1", leg, "", ""))
        write_csv(header, contents, comment=True, filename=filename)

    # `with` guarantees the layout file is closed (it previously leaked).
    with open(filename) as fp:
        for row in fp:
            if row[0] == '#':
                continue
            self.append(LayoutLine(row, delimiter=delimiter))

    self.assign_colors()
    self.assign_markers()
def gallery(args):
    """
    %prog gallery folder link_prefix

    Convert a folder of figures to a HTML table. For example:

    $ python -m jcvi.formats.html gallery Paper-figures/
    https://dl.dropboxusercontent.com/u/15937715/Data/Paper-figures/

    Maps the images from local to remote.
    """
    from jcvi.apps.base import iglob
    from jcvi.utils.iter import grouper

    parser = OptionParser(gallery.__doc__)
    parser.add_option("--columns", default=3, type="int",
                      help="How many cells per row")
    parser.add_option("--width", default=200, type="int", help="Image width")
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    folder, link_prefix = args
    width = opts.width
    images = iglob(folder, "*.jpg,*.JPG,*.png")
    cell = '<td>{0}<br><a href="{1}"><img src="{1}" width="{2}"></a></td>'
    row_open = '<tr height="{0}" valign="top">'.format(width + 5)
    # Remote location with any trailing slashes removed.
    remote = link_prefix.rstrip("/")

    print("<table>")
    for row_images in grouper(images, opts.columns):
        print(row_open)
        # Falsy entries (fillers in a short last row) produce no cell.
        for path in row_images:
            if path:
                name = op.basename(path)
                caption = name.split('.')[0].replace('_', '-')
                print(cell.format(caption, remote + "/" + name, width))
        print("</tr>")
    print("</table>")
def stats(args):
    """
    %prog stats folder

    Generate table summarizing .stats files.
    """
    # NOTE: the docstring is handed to OptionParser below; keep notes in
    # comments so the help output is unchanged.
    import logging

    p = OptionParser(stats.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    statsfiles = iglob(folder, "*.stats")

    def after_equal(field):
        # "key=value" -> "value"
        return field.split("=")[-1]

    header = "Library Assembled_reads Contigs".split()
    contents = []
    # label=M0096 total=7443 cnts=948 mean=7.851 std=35.96
    for statsfile in statsfiles:
        with open(statsfile) as fp:
            row = next((r for r in fp if r.startswith("label=")), None)
        if row is None:
            # Previously this reused a stale row from another file or crashed.
            logging.warning("No `label=` line in `%s`, skipped", statsfile)
            continue

        label, total, cnts = row.split()[:3]
        label = after_equal(label)
        reads = int(after_equal(total))
        contigs = int(after_equal(cnts))
        contents.append((label, reads, contigs))

    # Summary rows only make sense with at least one library; with none,
    # just the header is written instead of failing on the unpacking.
    if contents:
        _, all_reads, all_contigs = zip(*contents)
        contents.append(("SUM", sum(all_reads), sum(all_contigs)))
        contents.append(("AVERAGE (per sample)",
                         int(np.mean(all_reads)), int(np.mean(all_contigs))))
        contents.append(("MEDIAN (per sample)",
                         int(np.median(all_reads)), int(np.median(all_contigs))))

    write_csv(header, contents, filename=opts.outfile)
def prepare(args):
    """
    %prog prepare [--options] folder [genome.fasta]

    Run Trinity on a folder of reads. When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain the patterns
    ("_1_" and "_2_") or (".1." and ".2.") or ("_1." and "_2.").

    By default, prepare script for DN

    If genome.fasta is provided, prepare script for GG-Trinity.
    If coord-sorted BAM is provided, then it will use it as starting point.

    Since GG-Trinity jobs are partitioned DN-Trinity jobs run on relatively small
    regions, lesser amount of CPU can be specified for each DN job using `--gg_cpu`
    In such cases, the `--cpu` should be set to a larger value to help speedup
    upstream steps such as GSNAP read mapping or coordinate sorting of BAM files.

    Newer versions of trinity can take multiple fastq files as input.
    If "--merge" is specified, the fastq files are merged together before assembling
    """
    # NOTE: the docstring is handed to OptionParser below; keep notes in
    # comments so the help output is unchanged.
    p = OptionParser(prepare.__doc__)
    p.add_option("--paired", default=False, action="store_true",
                 help="Paired-end mode [default: %default]")
    p.add_option("--merge", default=False, action="store_true",
                 help="Merge individual input fastq's into left/right/single" +
                      " file(s) [default: %default]")
    p.set_trinity_opts()
    p.set_grid()

    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    inparam, = args[:1]
    assert op.exists(inparam)

    # A genome argument switches from de novo (DN) to genome-guided (GG).
    genome = args[1] if len(args) == 2 else None
    method = "GG" if genome is not None else "DN"

    paired = opts.paired
    merge = opts.merge
    thome = opts.trinity_home
    use_bam = opts.use_bam
    gg_cpu = opts.gg_cpu

    pf = inparam.split(".")[0]
    tfolder = "{0}_{1}".format(pf, method)

    cwd = os.getcwd()
    mkdir(tfolder)
    os.chdir(tfolder)
    try:
        # Inputs are addressed relative to the run folder just entered.
        flist = iglob("../" + inparam, opts.names)
        if paired:
            f1 = [x for x in flist if "_1_" in x or ".1." in x or "_1." in x]
            f2 = [x for x in flist if "_2_" in x or ".2." in x or "_2." in x]
            assert len(f1) == len(f2)
            if merge:
                r1, r2 = "left.fastq", "right.fastq"
                reads = ((f1, r1), (f2, r2))
        else:
            if merge:
                r = "single.fastq"
                reads = ((flist, r), )

        if merge:
            for fl, r in reads:
                fm = FileMerger(fl, r)
                fm.merge(checkexists=True)

        cmd = op.join(thome, "Trinity")
        cmd += " --seqType fq --max_memory {0} --CPU {1}".format(
            opts.max_memory, opts.cpus)
        cmd += " --min_contig_length {0}".format(opts.min_contig_length)
        if opts.bflyGCThreads:
            cmd += " --bflyGCThreads {0}".format(opts.bflyGCThreads)

        if method == "GG":
            cmd += " --genome {0} --genome_guided_max_intron {1}".format(
                genome, opts.max_intron)
            if use_bam:
                cmd += " --genome_guided_use_bam {0}".format(use_bam)
            if gg_cpu:
                cmd += " --genome_guided_CPU {0}".format(gg_cpu)
        if opts.grid and opts.grid_conf_file:
            cmd += " --grid_conf_file={0}".format(opts.grid_conf_file)

        if paired:
            if merge:
                cmd += " --left {0} --right {1}".format(reads[0][-1], reads[1][-1])
            else:
                for lf, rf in zip(f1, f2):
                    cmd += " --left {0}".format(lf)
                    cmd += " --right {0}".format(rf)
        else:
            if merge:
                cmd += " --single {0}".format(reads[0][-1])
            else:
                for f in flist:
                    cmd += " --single {0}".format(f)
        if opts.extra:
            cmd += " {0}".format(opts.extra)

        cmd += " --bypass_java_version_check"

        runfile = "run.sh"
        write_file(runfile, cmd)
    finally:
        # Always return to the starting directory, even when a step above
        # raises (e.g. unbalanced pairs).
        os.chdir(cwd)
def trf(args):
    """
    %prog trf outdir

    Run TRF on FASTA files.
    """
    # NOTE: the docstring is handed to OptionParser below; keep notes in
    # comments so the help output is unchanged.
    from jcvi.apps.base import iglob

    cparams = "1 1 2 80 5 200 2000"

    p = OptionParser(trf.__doc__)
    p.add_option("--mismatch", default=31, type="int",
                 help="Mismatch and gap penalty")
    p.add_option("--minscore", default=MINSCORE, type="int",
                 help="Minimum score to report")
    p.add_option("--period", default=6, type="int",
                 help="Maximum period to report")
    p.add_option("--lobstr", default=False, action="store_true",
                 help="Generate output for lobSTR")
    p.add_option("--telomeres", default=False, action="store_true",
                 help="Run telomere search: minscore=140 period=7")
    p.add_option("--centromeres", default=False, action="store_true",
                 help="Run centromere search: {}".format(cparams))
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    outdir, = args
    # Floor division keeps this an int on Python 3, so the awk filter gets
    # "15" rather than "15.0".  It is computed before --telomeres overrides
    # the minimum score, as in the original.
    minlength = opts.minscore // 2
    mm = MakeManager()
    if opts.telomeres:
        opts.minscore, opts.period = 140, 7

    params = "2 {0} {0} 80 10 {1} {2}".\
        format(opts.mismatch, opts.minscore, opts.period).split()
    if opts.centromeres:
        params = cparams.split()
    # The same parameter list appears space-separated on the trf command
    # line and dot-separated in the name of the .dat file.
    param_args = " ".join(params)
    param_tag = ".".join(params)

    bedfiles = []
    for fastafile in natsorted(iglob(outdir, "*.fa,*.fasta")):
        pf = op.basename(fastafile).rsplit(".", 1)[0]
        # Commands starting with trf ignores errors
        cmd1 = "-trf {0} {1} -d -h".format(fastafile, param_args)
        datfile = op.basename(fastafile) + "." + param_tag + ".dat"
        bedfile = "{0}.trf.bed".format(pf)
        cmd2 = "cat {} | grep -v ^Parameters".format(datfile)
        if opts.lobstr:
            cmd2 += " | awk '($8 >= {} && $8 <= {})'".\
                format(minlength, READLEN - minlength)
        else:
            cmd2 += " | awk '($8 >= 0)'"
        cmd2 += " | sed 's/ /\\t/g'"
        cmd2 += " | awk '{{print \"{0}\\t\" $0}}' > {1}".format(pf, bedfile)
        mm.add(fastafile, datfile, cmd1)
        mm.add(datfile, bedfile, cmd2)
        bedfiles.append(bedfile)

    # Final target: concatenate the per-file BED outputs.
    bedfile = "trf.bed"
    cmd = "cat {0} > {1}".format(" ".join(natsorted(bedfiles)), bedfile)
    mm.add(bedfiles, bedfile, cmd)

    mm.write()
def resolve(args):
    """
    %prog resolve matrixfile fastafile bamfolder

    Separate repeats along collapsed contigs. First scan the matrixfile for
    largely heterozygous sites. For each heterozygous site, we scan each bam to
    retrieve distinct haplotypes. The frequency of each haplotype is then
    computed, the haplotype with the highest frequency, assumed to be
    paralogous, is removed.
    """
    # NOTE: the docstring is handed to OptionParser below; keep notes in
    # comments so the help output is unchanged.
    import pysam
    from collections import defaultdict
    from itertools import groupby

    p = OptionParser(resolve.__doc__)
    p.add_option("--missing", default=.5, type="float",
                 help="Max level of missing data")
    p.add_option("--het", default=.5, type="float",
                 help="Min level of heterozygous calls")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    # `fastafile` is accepted for interface compatibility but not read here.
    matrixfile, fastafile, bamfolder = args

    with open(matrixfile) as fp:
        # The first non-comment row is the header; the genotype columns
        # start after the four leading fields.
        for row in fp:
            if row[0] != '#':
                break

        header = row.split()
        ngenotypes = len(header) - 4
        nmissing = int(round(opts.missing * ngenotypes))
        logging.debug("A total of {0} individuals scanned".format(ngenotypes))
        logging.debug("Look for markers with < {0} missing and > {1} het".
                      format(opts.missing, opts.het))
        bamfiles = iglob(bamfolder, "*.bam")
        logging.debug("Folder `{0}` contained {1} bam files".
                      format(bamfolder, len(bamfiles)))

        data = []
        for row in fp:
            if row[0] == '#':
                continue
            atoms = row.split()
            seqid, pos, ref, alt = atoms[:4]
            genotypes = atoms[4:]
            c = Counter(genotypes)
            # Code '0' is counted as missing and '3' as heterozygous.
            c0 = c.get('0', 0)
            c3 = c.get('3', 0)
            if c0 >= nmissing:
                continue
            hetratio = c3 * 1. / (ngenotypes - c0)
            if hetratio <= opts.het:
                continue
            pos = int(pos)
            data.append((seqid, pos, ref, alt, c, hetratio))

    # Sort on the plain fields only, so ties never fall through to comparing
    # Counter objects (a TypeError on Python 3).
    data.sort(key=lambda x: x[:4])
    logging.debug("A total of {0} target markers in {1} contigs.".
                  format(len(data), len(set(x[0] for x in data))))

    # Name each BAM by the part of its basename before the first dot, taken
    # from the path itself rather than from the opened file object.
    samfiles = [(op.basename(x).split(".")[0], pysam.AlignmentFile(x, "rb"))
                for x in bamfiles]
    # Sort by name only, so BAMs sharing a name never get compared.
    samfiles.sort(key=lambda x: x[0])
    logging.debug("BAM files grouped to {0} individuals".
                  format(len(set(x[0] for x in samfiles))))

    fw = must_open(opts.outfile, "w")
    for seqid, d in groupby(data, lambda x: x[0]):
        d = list(d)
        nmarkers = len(d)
        logging.debug("Process contig {0} ({1} markers)".format(seqid, nmarkers))
        # Marker positions in order; a haplotype string has one slot each.
        # NOTE(review): the source listing had lost its indentation; this
        # assumes one recorded position per marker - confirm.
        positions = [marker[1] for marker in d]
        haplotype_set = []
        for pf, sf in groupby(samfiles, key=lambda x: x[0]):
            haplotypes = []
            for pfi, samfile in sf:
                reads = defaultdict(list)
                for pos in positions:
                    for column in samfile.pileup(seqid):
                        if column.reference_pos != pos - 1:
                            continue
                        for r in column.pileups:
                            rname = r.alignment.query_name
                            rbase = r.alignment.query_sequence[r.query_position]
                            reads[rname].append((pos, rbase))
                for read in reads.values():
                    hap = ['-'] * nmarkers
                    for mpos, rbase in read:
                        hap[positions.index(mpos)] = rbase
                    hap = "".join(hap)
                    # Keep only reads that cover every marker.
                    if "-" in hap:
                        continue
                    haplotypes.append(hap)
            haplotype_set.append(set(haplotypes))
        hr = HaplotypeResolver(haplotype_set)
        # fw.write() works on both Python 2 and 3, unlike `print >> fw`.
        fw.write("{0} {1}\n".format(seqid, str(hr)))
        hr.solve(fw)
def resolve(args):
    """
    %prog resolve matrixfile fastafile bamfolder

    Separate repeats along collapsed contigs. First scan the matrixfile for
    largely heterozygous sites. For each heterozygous site, we scan each bam to
    retrieve distinct haplotypes. The frequency of each haplotype is then
    computed, the haplotype with the highest frequency, assumed to be
    paralogous, is removed.
    """
    # NOTE: the docstring is handed to OptionParser below; keep notes in
    # comments so the help output is unchanged.
    import pysam
    from collections import defaultdict
    from itertools import groupby

    p = OptionParser(resolve.__doc__)
    # type="float" is required: without it a value given on the command line
    # arrives as a string and breaks the arithmetic below.
    p.add_option("--missing", default=.5, type="float",
                 help="Maximum level of missing data")
    # Markers are kept only when their het ratio exceeds this value, so the
    # option is a minimum (the help text used to say "Maximum").
    p.add_option("--het", default=.5, type="float",
                 help="Minimum level of heterozygous calls")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    # `fastafile` is accepted for interface compatibility but not read here.
    matrixfile, fastafile, bamfolder = args

    with open(matrixfile) as fp:
        # The first non-comment row is the header; the genotype columns
        # start after the four leading fields.
        for row in fp:
            if row[0] != '#':
                break

        header = row.split()
        ngenotypes = len(header) - 4
        nmissing = int(round(opts.missing * ngenotypes))
        logging.debug("A total of {0} individuals scanned".format(ngenotypes))
        logging.debug("Look for markers with < {0} missing and > {1} het".
                      format(opts.missing, opts.het))
        bamfiles = iglob(bamfolder, "*.bam")
        logging.debug("Folder `{0}` contained {1} bam files".
                      format(bamfolder, len(bamfiles)))

        data = []
        for row in fp:
            if row[0] == '#':
                continue
            atoms = row.split()
            seqid, pos, ref, alt = atoms[:4]
            genotypes = atoms[4:]
            c = Counter(genotypes)
            # Code '0' is counted as missing and '3' as heterozygous.
            c0 = c.get('0', 0)
            c3 = c.get('3', 0)
            if c0 >= nmissing:
                continue
            hetratio = c3 * 1. / (ngenotypes - c0)
            if hetratio <= opts.het:
                continue
            pos = int(pos)
            data.append((seqid, pos, ref, alt, c, hetratio))

    # Sort on the plain fields only, so ties never fall through to comparing
    # Counter objects (a TypeError on Python 3).
    data.sort(key=lambda x: x[:4])
    logging.debug("A total of {0} target markers in {1} contigs.".
                  format(len(data), len(set(x[0] for x in data))))

    # Name each BAM by the part of its basename before the first dot, taken
    # from the path itself rather than from the opened file object.
    samfiles = [(op.basename(x).split(".")[0], pysam.AlignmentFile(x, "rb"))
                for x in bamfiles]
    # Sort by name only, so BAMs sharing a name never get compared.
    samfiles.sort(key=lambda x: x[0])
    logging.debug("BAM files grouped to {0} individuals".
                  format(len(set(x[0] for x in samfiles))))

    fw = must_open(opts.outfile, "w")
    for seqid, d in groupby(data, lambda x: x[0]):
        d = list(d)
        nmarkers = len(d)
        logging.debug("Process contig {0} ({1} markers)".format(
            seqid, nmarkers))
        # Marker positions in order; a haplotype string has one slot each.
        # NOTE(review): the source listing had lost its indentation; this
        # assumes one recorded position per marker - confirm.
        positions = [marker[1] for marker in d]
        haplotype_set = []
        for pf, sf in groupby(samfiles, key=lambda x: x[0]):
            haplotypes = []
            for pfi, samfile in sf:
                reads = defaultdict(list)
                for pos in positions:
                    for column in samfile.pileup(seqid):
                        if column.reference_pos != pos - 1:
                            continue
                        for r in column.pileups:
                            rname = r.alignment.query_name
                            rbase = r.alignment.query_sequence[
                                r.query_position]
                            reads[rname].append((pos, rbase))
                for read in reads.values():
                    hap = ['-'] * nmarkers
                    for mpos, rbase in read:
                        hap[positions.index(mpos)] = rbase
                    hap = "".join(hap)
                    # Keep only reads that cover every marker.
                    if "-" in hap:
                        continue
                    haplotypes.append(hap)
            haplotype_set.append(set(haplotypes))
        hr = HaplotypeResolver(haplotype_set)
        # fw.write() works on both Python 2 and 3, unlike `print >> fw`.
        fw.write("{0} {1}\n".format(seqid, str(hr)))
        hr.solve(fw)
def score(args):
    """
    %prog score main_results/ cached_data/ contigsfasta

    Score the current LACHESIS CLM.
    """
    # NOTE: the docstring is handed to OptionParser below; keep notes in
    # comments so the help output is unchanged.
    p = OptionParser(score.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    mdir, cdir, contigsfasta = args
    orderingfiles = natsorted(iglob(mdir, "*.ordering"))
    sizes = Sizes(contigsfasta)
    contig_names = list(sizes.iter_names())
    contig_ids = dict((name, i) for (i, name) in enumerate(contig_names))

    oo = []
    # Load contact matrix
    glm = op.join(cdir, "all.GLM")
    N = len(contig_ids)
    M = np.zeros((N, N), dtype=int)
    with open(glm) as fp:
        for row in fp:
            if row[0] == '#':
                continue
            x, y, z = row.split()
            # Rows whose first field is 'X' are skipped (presumably the
            # column-header row - confirm).
            if x == 'X':
                continue
            M[int(x), int(y)] = int(z)

    # `with` closes the tour log even if the GA run raises.
    with open("tour", "w") as fwtour:

        def callback(tour, gen, oo):
            # Log the tour of generation `gen`, labelled with the first
            # component of its fitness when one is available.
            fitness = tour.fitness if hasattr(tour, "fitness") else None
            label = "GA-{0}".format(gen)
            if fitness:
                fitness = "{0}".format(fitness).split(",")[0].replace("(", "")
                label += "-" + fitness
            print_tour(fwtour, tour, label, contig_names, oo)
            return tour

        for ofile in orderingfiles:
            co = ContigOrdering(ofile)
            for x in co:
                contig_id = contig_ids[x.contig_name]
                oo.append(contig_id)
            pf = op.basename(ofile).split(".")[0]
            # Single-argument print() behaves the same on Python 2 and 3.
            print(pf)
            print(oo)

            tour, tour_sizes, tour_M = prepare_ec(oo, sizes, M)
            # Store INIT tour
            print_tour(fwtour, tour, "INIT", contig_names, oo)

            # Faster Cython version for evaluation
            from .chic import score_evaluate_M
            callbacki = partial(callback, oo=oo)
            toolbox = GA_setup(tour)
            toolbox.register("evaluate", score_evaluate_M,
                             tour_sizes=tour_sizes, tour_M=tour_M)
            tour, tour.fitness = GA_run(toolbox, npop=100, cpus=opts.cpus,
                                        callback=callbacki)
            print("{0} {1}".format(str(tour), str(tour.fitness)))
            # Only the first ordering file is scored.
            break
def scan_read_files(trimmed, patterns):
    """Collect read files under `trimmed` and the sample names they imply.

    A sample name is the part of a file's basename before the first dot.
    Returns `(reads, samples)`: the `iglob` result and the sorted unique
    sample names.
    """
    reads = iglob(trimmed, patterns)
    names = {op.basename(path).split(".")[0] for path in reads}
    samples = sorted(names)
    message = "Total {0} read files from {1} samples"
    logging.debug(message.format(len(reads), len(samples)))
    return reads, samples
def prepare(args):
    """
    %prog prepare [--options] folder [genome.fasta]

    Run Trinity on a folder of reads.  When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain the patterns
    ("_1_" and "_2_") or (".1." and ".2.") or ("_1." and "_2.").

    By default, prepare script for DN-Trinity.

    If genome.fasta is provided, prepare script for GG-Trinity.
    If coord-sorted BAM is provided, then it will use it as starting point.

    Since GG-Trinity jobs are partitioned DN-Trinity jobs run on relatively small
    regions, lesser amount of CPU can be specified for each DN job using `--gg_cpu`
    In such cases, the `--cpu` should be set to a larger value to help speedup
    upstream steps such as GSNAP read mapping or coordinate sorting of BAM files.

    Newer versions of trinity can take multiple fastq files as input.
    If "--merge" is specified, the fastq files are merged together before assembling
    """
    # NOTE: the docstring above is handed to OptionParser and doubles as the
    # command-line usage text, so it is kept in usage form.
    p = OptionParser(prepare.__doc__)
    p.add_option("--paired", default=False, action="store_true",
                 help="Paired-end mode [default: %default]")
    p.add_option("--merge", default=False, action="store_true",
                 help="Merge individual input fastq's into left/right/single" +
                      " file(s) [default: %default]")
    p.set_trinity_opts()
    p.set_grid()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    inparam = args[0]
    # A second positional argument (the genome) switches to genome-guided mode
    genome = args[1] if len(args) == 2 else None
    method = "GG" if genome is not None else "DN"

    paired = opts.paired
    merge = opts.merge
    thome = opts.trinity_home
    use_bam = opts.use_bam
    gg_cpu = opts.gg_cpu

    # Work folder is named "<input prefix>_<method>"
    pf = inparam.split(".")[0]
    tfolder = "{0}_{1}".format(pf, method)

    cwd = os.getcwd()
    mkdir(tfolder)
    os.chdir(tfolder)
    try:
        # Paths are relative to the work folder we just entered
        flist = iglob("../" + inparam, "*.fq", "*.fastq", "*.fq.gz", "*.fastq.gz")
        if paired:
            f1 = [x for x in flist if "_1_" in x or ".1." in x or "_1." in x]
            f2 = [x for x in flist if "_2_" in x or ".2." in x or "_2." in x]
            assert len(f1) == len(f2), \
                "Unequal number of left ({0}) and right ({1}) read files".\
                format(len(f1), len(f2))
            if merge:
                reads = ((f1, "left.fastq"), (f2, "right.fastq"))
        elif merge:
            reads = ((flist, "single.fastq"), )

        if merge:
            # Concatenate the inputs into one file per read direction
            for fl, r in reads:
                fm = FileMerger(fl, r)
                fm.merge(checkexists=True)

        # Assemble the Trinity command; pieces are joined with single spaces
        parts = [op.join(thome, "Trinity")]
        parts.append("--seqType fq --JM {0} --CPU {1}".format(opts.JM, opts.cpus))
        parts.append("--min_contig_length {0}".format(opts.min_contig_length))
        if opts.bflyGCThreads:
            parts.append("--bflyGCThreads {0}".format(opts.bflyGCThreads))

        if method == "GG":
            parts.append("--genome {0} --genome_guided_max_intron {1}".format(
                genome, opts.max_intron))
            if use_bam:
                parts.append("--genome_guided_use_bam {0}".format(use_bam))
            if gg_cpu:
                parts.append("--genome_guided_CPU {0}".format(gg_cpu))

        if opts.grid and opts.grid_conf_file:
            parts.append("--grid_conf_file={0}".format(opts.grid_conf_file))

        if paired:
            if merge:
                parts.append("--left {0} --right {1}".format(
                    reads[0][-1], reads[1][-1]))
            else:
                # One --left/--right pair per input file pair
                for lf, rf in zip(f1, f2):
                    parts.append("--left {0}".format(lf))
                    parts.append("--right {0}".format(rf))
        else:
            if merge:
                parts.append("--single {0}".format(reads[0][-1]))
            else:
                for f in flist:
                    parts.append("--single {0}".format(f))

        if opts.extra:
            parts.append("{0}".format(opts.extra))

        cmd = " ".join(parts)
        runfile = "run.sh"
        write_file(runfile, cmd)
    finally:
        # Always return to the starting directory, even when a step above fails
        os.chdir(cwd)
def prepare(args):
    """
    %prog prepare [--options] folder [--bam rnaseq.coordSorted.bam]

    Run Trinity on a folder of reads.  When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain the patterns
    ("_1_" and "_2_") or (".1." and ".2.") or ("_1." and "_2.") or
    ("_R1" and "_R2").

    By default, prepare script for DN-Trinity.

    If coord-sorted BAM is provided, prepare script for GG-Trinity, using BAM
    as starting point.

    Newer versions of trinity can take multiple fastq files as input.
    If "--merge" is specified, the fastq files are merged together before assembling
    """
    # NOTE: the docstring above is handed to OptionParser and doubles as the
    # command-line usage text, so it is kept in usage form.
    p = OptionParser(prepare.__doc__)
    p.add_option("--paired", default=False, action="store_true",
                 help="Paired-end mode [default: %default]")
    p.add_option("--merge", default=False, action="store_true",
                 help="Merge individual input fastq's into left/right/single" +
                      " file(s) [default: %default]")
    p.set_trinity_opts()
    p.set_fastq_names()
    p.set_grid()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    inparam = args[0]

    paired = opts.paired
    merge = opts.merge
    trinity_home = opts.trinity_home
    hpc_grid_runner_home = opts.hpcgridrunner_home

    # Genome-guided mode is used only when --bam points to an existing file;
    # the path is made absolute because we change directory below.
    method = "DN"
    bam = opts.bam
    if bam and op.exists(bam):
        bam = op.abspath(bam)
        method = "GG"

    # Work folder is named "<input prefix>_<method>"
    pf = inparam.split(".")[0]
    tfolder = "{0}_{1}".format(pf, method)

    cwd = os.getcwd()
    mkdir(tfolder)
    os.chdir(tfolder)
    try:
        cmds = []

        # set TRINITY_HOME env variable when preparing shell script
        env_cmd = 'export TRINITY_HOME="{0}"'.format(trinity_home)
        cmds.append(env_cmd)

        if method == "DN":
            # Paths are relative to the work folder we just entered
            assert op.exists("../" + inparam)

            flist = iglob("../" + inparam, opts.names)
            if paired:
                f1 = [x for x in flist
                      if "_1_" in x or ".1." in x or "_1." in x or "_R1" in x]
                f2 = [x for x in flist
                      if "_2_" in x or ".2." in x or "_2." in x or "_R2" in x]
                assert len(f1) == len(f2), \
                    "Unequal number of left ({0}) and right ({1}) read files".\
                    format(len(f1), len(f2))
                if merge:
                    reads = ((f1, "left.fastq"), (f2, "right.fastq"))
            elif merge:
                reads = ((flist, "single.fastq"), )

            if merge:
                # Concatenate the inputs into one file per read direction
                for fl, r in reads:
                    fm = FileMerger(fl, r)
                    fm.merge(checkexists=True)

        # Assemble the Trinity command; pieces are joined with single spaces
        parts = [op.join(trinity_home, "Trinity")]
        parts.append("--seqType fq --max_memory {0} --CPU {1}".format(
            opts.max_memory, opts.cpus))
        parts.append("--min_contig_length {0}".format(opts.min_contig_length))
        if opts.bflyGCThreads:
            parts.append("--bflyGCThreads {0}".format(opts.bflyGCThreads))

        if method == "GG":
            parts.append("--genome_guided_bam {0}".format(bam))
            parts.append("--genome_guided_max_intron {0}".format(opts.max_intron))
        elif paired:
            if merge:
                parts.append("--left {0} --right {1}".format(
                    reads[0][-1], reads[1][-1]))
            else:
                parts.append("--left {0}".format(",".join(f1)))
                parts.append("--right {0}".format(",".join(f2)))
        else:
            if merge:
                parts.append("--single {0}".format(reads[0][-1]))
            else:
                for f in flist:
                    parts.append("--single {0}".format(f))

        if opts.grid and opts.grid_conf_file:
            hpc_grid_runner = op.join(hpc_grid_runner_home,
                                      "hpc_cmds_GridRunner.pl")
            hpc_grid_conf_file = op.join(hpc_grid_runner_home, "hpc_conf",
                                         opts.grid_conf_file)
            assert op.exists(hpc_grid_conf_file), \
                "HpcGridRunner conf file does not exist: {0}".format(hpc_grid_conf_file)
            parts.append('--grid_exec "{0} --grid_conf {1} -c"'.format(
                hpc_grid_runner, hpc_grid_conf_file))

        if opts.extra:
            parts.append("{0}".format(opts.extra))

        cmds.append(" ".join(parts))

        if opts.cleanup:
            # NOTE(review): `!(...)` looks like a bash extended-glob pattern;
            # presumably run.sh must be executed with extglob on -- confirm.
            cleanup_cmd = 'rm -rf !("Trinity.fasta"|"Trinity.gene_trans_map"|"Trinity.timing")' \
                if method == "DN" else \
                'rm -rf !("Trinity-GG.fasta"|"Trinity-GG.gene_trans_map"|"Trinity.timing")'
            # Append to the list of script lines (`cmds`), not to the Trinity
            # command string, which has no `.append`.
            cmds.append(cleanup_cmd)

        runfile = "run.sh"
        write_file(runfile, "\n".join(cmds))
    finally:
        # Always return to the starting directory, even when a step above fails
        os.chdir(cwd)