def prepare_synteny(tourfile, lastfile, odir, p, opts):
    """
    Prepare synteny plots for movie().

    Builds the per-movie working directory, writes an .anchors file derived
    from the LAST hits, and symlinks the subject BED file into the directory.

    Note: this chdir()s into `odir` and does not chdir back — the caller
    (movie) is responsible for restoring the working directory.

    Returns:
        (anchorsfile, qbedfile, contig_to_beds) where contig_to_beds maps
        each query contig name to its list of BED lines.
    """
    qbedfile, sbedfile = get_bed_filenames(lastfile, p, opts)
    qbedfile = op.abspath(qbedfile)
    sbedfile = op.abspath(sbedfile)

    qbed = Bed(qbedfile, sorted=False)
    contig_to_beds = dict(qbed.sub_beds())

    # Create a separate directory for the subplots and movie
    mkdir(odir, overwrite=True)
    os.chdir(odir)
    logging.debug("Change into subdir `{}`".format(odir))

    # Make anchorsfile: keep the first two dot-separated fields of the LAST
    # filename as the anchors prefix
    anchorsfile = ".".join(op.basename(lastfile).split(".", 2)[:2]) + ".anchors"
    # Fixed: original used the Python-2-only `print >> fw` statement, which is
    # a SyntaxError under Python 3 and inconsistent with the
    # print(..., file=...) calls used elsewhere in this file. Also use a
    # context manager so the file is closed even if Blast() parsing raises.
    with open(anchorsfile, "w") as fw:
        for b in Blast(lastfile):
            print("\t".join((gene_name(b.query), gene_name(b.subject),
                             str(int(b.score)))), file=fw)

    # Symlink sbed into the movie directory
    symlink(sbedfile, op.basename(sbedfile))

    return anchorsfile, qbedfile, contig_to_beds
def prepare_synteny(tourfile, lastfile, odir, p, opts):
    """
    Prepare synteny plots for movie().

    Creates the movie working directory, converts the LAST file into an
    .anchors file, and symlinks the subject BED file alongside it.

    Note: chdir()s into `odir` without restoring the previous directory;
    the caller must chdir back when done.

    Returns:
        (anchorsfile, qbedfile, contig_to_beds), with contig_to_beds mapping
        query contig names to their BED lines.
    """
    qbedfile, sbedfile = get_bed_filenames(lastfile, p, opts)
    qbedfile = op.abspath(qbedfile)
    sbedfile = op.abspath(sbedfile)

    qbed = Bed(qbedfile, sorted=False)
    contig_to_beds = dict(qbed.sub_beds())

    # Create a separate directory for the subplots and movie
    mkdir(odir, overwrite=True)
    os.chdir(odir)
    logging.debug("Change into subdir `{}`".format(odir))

    # Make anchorsfile
    anchorsfile = ".".join(op.basename(lastfile).split(".", 2)[:2]) \
        + ".anchors"
    # Fixed: `print >> fw` is Python-2-only syntax (SyntaxError on Python 3)
    # and inconsistent with the print(..., file=...) style used elsewhere in
    # this file; the with-block also guarantees the handle is closed.
    with open(anchorsfile, "w") as fw:
        for b in Blast(lastfile):
            print("\t".join((gene_name(b.query), gene_name(b.subject),
                             str(int(b.score)))), file=fw)

    # Symlink sbed
    symlink(sbedfile, op.basename(sbedfile))

    return anchorsfile, qbedfile, contig_to_beds
def mcscanx(args):
    """
    %prog mcscanx athaliana.athaliana.last athaliana.bed

    Wrap around MCScanX.

    Symlinks the BLAST/LAST file to `<prefix>.blast` and concatenates all BED
    files into a single `<prefix>.gff`, tagging each genome with a letter
    (A, B, C, ...) as MCScanX expects.
    """
    p = OptionParser(mcscanx.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    blastfile = args[0]
    bedfiles = args[1:]
    # Prefix is built from the first two characters of each BED basename
    prefix = "_".join(op.basename(x)[:2] for x in bedfiles)
    symlink(blastfile, prefix + ".blast")
    allbedfile = prefix + ".gff"

    # Fixed: the original reused `prefix` as the loop variable for the
    # per-genome letter tag, clobbering the file prefix computed above —
    # harmless here but an accident waiting to happen. A context manager
    # also guarantees the GFF handle is closed even if make_gff raises.
    with open(allbedfile, "w") as fw:
        for i, bedfile in enumerate(bedfiles):
            genome_tag = chr(ord('A') + i)
            make_gff(bedfile, genome_tag, fw)
def assemble(args):
    """
    %prog assemble pasa_db_name genome.fasta transcripts-dn.fasta [transcript-gg.fasta]

    Run the PASA alignment assembly pipeline

    If two transcript fasta files (Trinity denovo and genome guided) are provided
    and the `--compreh` param is enabled, the PASA Comprehensive Transcriptome DB
    protocol is followed <http://pasa.sourceforge.net/#A_ComprehensiveTranscriptome>

    Using the `--prepare` option creates a shell script with the run commands without
    executing the pipeline
    """
    p = OptionParser(assemble.__doc__)
    p.set_pasa_opts()
    p.add_option("--prepare", default=False, action="store_true",
            help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)
    if len(args) not in (3, 4):
        sys.exit(not p.print_help())

    # Fourth positional arg (genome-guided transcripts) is optional
    pasa_db, genome, dnfasta, = args[:3]
    ggfasta = args[3] if len(args) == 4 else None

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error("PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    # Validate every requested aligner before doing any work
    aligners = opts.aligners.split(",")
    for aligner in aligners:
        if aligner not in ALLOWED_ALIGNERS:
            logging.error("Error: Unknown aligner `{0}`".format(aligner))
            logging.error("Can be any of {0}, ".format("|".join(ALLOWED_ALIGNERS)) + \
                    "combine multiple aligners in list separated by comma")
            sys.exit()

    clean = opts.clean
    seqclean = op.join(opts.tgi_home, "seqclean")
    # Resolve the PASA helper scripts on disk
    accn_extract = which(op.join(PASA_HOME, "misc_utilities", \
            "accession_extractor.pl"))
    launch_pasa = which(op.join(PASA_HOME, "scripts", \
            "Launch_PASA_pipeline.pl"))
    build_compreh_trans = which(op.join(PASA_HOME, "scripts", \
            "build_comprehensive_transcriptome.dbi"))

    fl_accs = opts.fl_accs
    cpus = opts.cpus
    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"
    pctcov, pctid = opts.pctcov, opts.pctid
    compreh_pctid = opts.compreh_pctid
    compreh_pctcov, bpsplice = opts.compreh_pctcov, opts.bpsplice

    # Commands are either executed immediately or (with --prepare) collected
    # here and written to run.sh at the end
    cmds = []

    # set PASAHOME env variable if preparing shell script
    if prepare:
        env_cmd = 'export PASAHOME="{0}"'.format(PASA_HOME)
        cmds.append(env_cmd)

    if ggfasta:
        # Merge denovo + genome-guided transcripts; extract denovo accessions
        # so PASA can tell the two sets apart (--TDN below).
        # NOTE(review): tfasta/tdn look like module-level filename constants —
        # confirm they are defined at module scope.
        transcripts = FileMerger([dnfasta, ggfasta], tfasta).merge()
        accn_extract_cmd = "cat {0} | {1} > {2}".format(dnfasta, accn_extract, tdn)
        cmds.append(accn_extract_cmd)
        if not prepare:
            sh(accn_extract_cmd)
    else:
        symlink(dnfasta, tfasta)
        transcripts = tfasta

    if opts.grid and not opts.threaded:
        opts.threaded = opts.cpus

    # prjobid chains grid jobs: each later step holds on the previous job id
    prjobid = None
    if clean:
        # seqclean is capped at 16 CPUs
        ccpus = 16 if cpus >= 16 else cpus
        cleancmd = "{0} {1} -c {2} -l 60".format(seqclean, transcripts, ccpus)
        if prepare:
            cmds.append(cleancmd)
        else:
            prjobid = sh(cleancmd, grid=grid, grid_opts=opts)

    # Write the alignment-assembly config file from the template
    # NOTE(review): aaconf/alignAssembly_conf/gfasta presumably module-level —
    # verify.
    aafw = must_open(aaconf, "w")
    print(alignAssembly_conf.format("{0}_pasa".format(pasa_db), \
            pctcov, pctid, bpsplice), file=aafw)
    aafw.close()

    symlink(genome, gfasta)

    # Build the Launch_PASA_pipeline.pl command line
    aacmd = "{0} -c {1} -C -R -g {2}".format(launch_pasa, aaconf, gfasta)
    # If seqclean ran, point PASA at the cleaned transcripts (-T -u untrimmed)
    aacmd += " -t {0}.clean -T -u {0}".format(transcripts) if clean else \
             " -t {0}".format(transcripts)
    if fl_accs:
        symlink(fl_accs, flaccs)
        aacmd += " -f {0}".format(flaccs)
    if ggfasta:
        aacmd += " --TDN {0}".format(tdn)
    aacmd += " --ALIGNERS {0} -I {1} --CPU {2}".format(",".join(aligners), \
            opts.intron, cpus)

    if prepare:
        cmds.append(aacmd)
    else:
        opts.hold_jid = prjobid
        prjobid = sh(aacmd, grid=grid, grid_opts=opts)

    # Optional Comprehensive Transcriptome DB step (needs both fasta inputs)
    if opts.compreh and ggfasta:
        comprehcmd = "{0} -c {1} -t {2}".format(build_compreh_trans, aaconf, transcripts)
        comprehcmd += " --min_per_ID {0} --min_per_aligned {1}".format(compreh_pctid, compreh_pctcov)
        if prepare:
            cmds.append(comprehcmd)
        else:
            opts.hold_jid = prjobid
            prjobid = sh(comprehcmd, grid=grid, grid_opts=opts)

    if prepare:
        write_file(runfile, "\n".join(cmds))  # initialize run script
def compare(args):
    """
    %prog compare pasa_db_name [--annots_gff3=annotation.gff3]

    Run the PASA annotation comparison pipeline

    This assumes that PASA alignment assembly has already been completed and
    run directory contains `genome.fasta` and `transcript.fasta` files.

    If `--annots_gff3` is specified, the PASA database is loaded with the annotations
    first before starting annotation comparison. Otherwise, it uses previously
    loaded annotation data.

    Using the `--prepare` option creates a shell script with the run commands without
    executing the pipeline
    """
    p = OptionParser(compare.__doc__)
    p.set_pasa_opts(action="compare")
    p.add_option("--prepare", default=False, action="store_true",
            help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)
    if len(args) < 1:
        sys.exit(not p.print_help())

    pasa_db, = args

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error("PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    launch_pasa = which(op.join(PASA_HOME, "scripts", \
            "Launch_PASA_pipeline.pl"))

    annots_gff3 = opts.annots_gff3
    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"

    # All subsequent paths are relative to the PASA run directory
    os.chdir(pasa_db)

    if prepare:
        write_file(runfile, "", append=True, skipcheck=True)  # initialize run script

    # Write the annotation-comparison config file from the template
    # NOTE(review): acconf/annotCompare_conf appear to be module-level
    # constants — confirm.
    acfw = must_open(acconf, "w")
    print(annotCompare_conf.format("{0}_pasa".format(pasa_db), \
            opts.pctovl, opts.pct_coding, opts.pctid_prot, opts.pctlen_FL, \
            opts.pctlen_nonFL, opts.orf_size, opts.pct_aln, opts.pctovl_gene, \
            opts.stompovl, opts.trust_FL, opts.utr_exons), file=acfw)
    acfw.close()

    # Sanity-check the inputs produced by the earlier assemble() run
    if not op.exists(gfasta):
        sys.exit("Genome fasta file `{0}` does not exist".format(gfasta))

    transcripts = tfasta
    if not op.exists(transcripts):
        sys.exit("Transcript fasta file `{0}` does not exist".format(transcripts))

    # Prefer the seqclean-ed transcripts if present
    if op.exists("{0}.clean".format(transcripts)):
        transcripts = "{0}.clean".format(transcripts)

    accmd = "{0} -c {1} -A -g {2} -t {3} --GENETIC_CODE {4}".format(launch_pasa, \
            acconf, gfasta, transcripts, opts.genetic_code)
    if annots_gff3:
        if not op.exists(annots_gff3):
            sys.exit("Annotation gff3 file `{0}` does not exist".format(annots_gff3))
        # -L loads the supplied annotations into the PASA DB before comparing
        symlink(annots_gff3, annotation)
        accmd += " -L --annots_gff3 {0}".format(annotation)

    if prepare:
        write_file(runfile, accmd, append=True)
    else:
        sh(accmd, grid=grid, grid_opts=opts)
def movie(args):
    """
    %prog movie test.tour test.clm ref.contigs.last

    Plot optimization history.
    """
    p = OptionParser(movie.__doc__)
    p.add_option("--frames", default=500, type="int",
            help="Only plot every N frames")
    p.add_option("--engine", default="ffmpeg", choices=("ffmpeg", "gifsicle"),
            help="Movie engine, output MP4 or GIF")
    p.set_beds()
    opts, args, iopts = p.set_image_options(args, figsize="16x8",
                                            style="white", cmap="coolwarm",
                                            format="png", dpi=300)

    if len(args) != 3:
        sys.exit(not p.print_help())

    tourfile, clmfile, lastfile = args
    tourfile = op.abspath(tourfile)
    clmfile = op.abspath(clmfile)
    lastfile = op.abspath(lastfile)
    # Remember the current directory; prepare_synteny() chdirs into odir
    cwd = os.getcwd()
    odir = op.basename(tourfile).rsplit(".", 1)[0] + "-movie"
    anchorsfile, qbedfile, contig_to_beds = \
        prepare_synteny(tourfile, lastfile, odir, p, opts)

    # Collect one movieframe job per sampled tour iteration
    args = []
    for i, label, tour, tour_o in iter_tours(tourfile, frames=opts.frames):
        padi = "{:06d}".format(i)
        # Make sure the anchorsfile and bedfile has the serial number in,
        # otherwise parallelization may fail
        a, b = op.basename(anchorsfile).split(".", 1)
        ianchorsfile = a + "_" + padi + "." + b
        symlink(anchorsfile, ianchorsfile)

        # Make BED file with new order
        qb = Bed()
        for contig, o in zip(tour, tour_o):
            if contig not in contig_to_beds:
                continue
            # Copy before reversing so contig_to_beds stays unmodified
            bedlines = contig_to_beds[contig][:]
            if o == '-':
                bedlines.reverse()
            for x in bedlines:
                qb.append(x)
        a, b = op.basename(qbedfile).split(".", 1)
        ibedfile = a + "_" + padi + "." + b
        qb.print_to_file(ibedfile)
        # Plot dot plot, but do not sort contigs by name (otherwise losing
        # order)
        image_name = padi + "." + iopts.format

        tour = ",".join(tour)
        args.append([[tour, clmfile, ianchorsfile,
                "--outfile", image_name, "--label", label]])

    # Render all frames in parallel, then assemble them into the movie
    Jobs(movieframe, args).run()

    os.chdir(cwd)
    make_movie(odir, odir, engine=opts.engine, format=iopts.format)
def movie(args):
    """
    %prog movie test.tour test.clm ref.contigs.last

    Plot optimization history.
    """
    p = OptionParser(movie.__doc__)
    p.add_option("--frames", default=500, type="int",
            help="Only plot every N frames")
    p.add_option("--engine", default="ffmpeg", choices=("ffmpeg", "gifsicle"),
            help="Movie engine, output MP4 or GIF")
    p.set_beds()
    opts, args, iopts = p.set_image_options(args, figsize="16x8",
                                            style="white", cmap="coolwarm",
                                            format="png", dpi=300)

    if len(args) != 3:
        sys.exit(not p.print_help())

    tourfile, clmfile, lastfile = [op.abspath(x) for x in args]
    # prepare_synteny() chdirs into the movie directory; remember where we were
    cwd = os.getcwd()
    odir = op.basename(tourfile).rsplit(".", 1)[0] + "-movie"
    anchorsfile, qbedfile, contig_to_beds = \
        prepare_synteny(tourfile, lastfile, odir, p, opts)

    # One movieframe job per sampled tour iteration
    job_args = []
    for i, label, tour, tour_o in iter_tours(tourfile, frames=opts.frames):
        serial = "{:06d}".format(i)
        # Embed the serial number in the per-frame anchors/bed filenames so
        # parallel jobs never collide on the same file
        stem, ext = op.basename(anchorsfile).split(".", 1)
        ianchorsfile = stem + "_" + serial + "." + ext
        symlink(anchorsfile, ianchorsfile)

        # Write a BED file reflecting the contig order of this tour
        frame_bed = Bed()
        for contig, orientation in zip(tour, tour_o):
            if contig not in contig_to_beds:
                continue
            source = contig_to_beds[contig]
            # Slice-copy (reversed for '-' contigs) so the cache is untouched
            ordered = source[::-1] if orientation == '-' else source[:]
            for line in ordered:
                frame_bed.append(line)
        stem, ext = op.basename(qbedfile).split(".", 1)
        ibedfile = stem + "_" + serial + "." + ext
        frame_bed.print_to_file(ibedfile)
        # Plot dot plot, but do not sort contigs by name (otherwise losing
        # order)
        image_name = serial + "." + iopts.format

        job_args.append([[",".join(tour), clmfile, ianchorsfile,
                "--outfile", image_name, "--label", label]])

    # Render every frame in parallel, then stitch them into the movie
    Jobs(movieframe, job_args).run()

    os.chdir(cwd)
    make_movie(odir, odir, engine=opts.engine, format=iopts.format)