def add_plot_options(p):
    """Register the histogram plotting options on the option parser `p`.

    Three options (`--vmin`, `--vmax`, `--bins`) are attached directly to the
    parser; three more (`--legendp`, `--fill`, `--title`) are attached to an
    "Image options" group that is then added to the parser.  Nothing is
    returned; `p` is modified in place.
    """
    # (flag, keyword arguments) pairs, in the order they show up in --help.
    histogram_specs = (
        ("--vmin", dict(default=0., type="float",
                        help="Minimum value, inclusive [default: %default]")),
        ("--vmax", dict(default=2., type="float",
                        help="Maximum value, inclusive [default: %default]")),
        ("--bins", dict(default=20, type="int",
                        help="Number of bins to plot in the histogram [default: %default]")),
    )
    image_specs = (
        ("--legendp", dict(default="upper right",
                           help="Place of the legend [default: %default]")),
        ("--fill", dict(default=False, action="store_true",
                        help="Fill the histogram area [default: %default]")),
        ("--title", dict(default="Ks distribution",
                         help="Title of the plot [default: %default]")),
    )

    for flag, settings in histogram_specs:
        p.add_option(flag, **settings)

    image_opts = OptionGroup(p, "Image options")
    for flag, settings in image_specs:
        image_opts.add_option(flag, **settings)
    p.add_option_group(image_opts)
if sqlite: c.execute("create index q on synteny (query)") conn.commit() c.close() else: fw.close() if __name__ == '__main__': p = OptionParser(__doc__) p.set_beds() p.set_stripnames() p.set_outfile() coge_group = OptionGroup(p, "CoGe-specific options") coge_group.add_option("--sqlite", help="Write sqlite database") coge_group.add_option("--qnote", default="null", help="Query dataset group id") coge_group.add_option("--snote", default="null", help="Subject dataset group id") params_group = OptionGroup(p, "Synteny parameters") params_group.add_option("--window", type="int", default=40, help="Synteny window size") params_group.add_option("--cutoff", type="float", default=.1, help="Minimum number of anchors to call synteny") supported_scoring = ("collinear", "density") params_group.add_option("--scoring", choices=supported_scoring, default="collinear", help="Scoring scheme")
def annotate(args):
    r"""
    %prog annotate new.bed old.bed 2> log

    Annotate the `new.bed` with features from `old.bed` for the purpose of
    gene numbering.

    Ambiguity in ID assignment can be resolved by either of the following 2 methods:
    - `alignment`: make use of global sequence alignment score (calculated by `needle`)
    - `overlap`: make use of overlap length (calculated by `intersectBed`)

    Transfer over as many identifiers as possible while following guidelines:
    http://www.arabidopsis.org/portals/nomenclature/guidelines.jsp#editing

    Note: Following RegExp pattern describes the structure of the identifier
    assigned to features in the `new.bed` file.

    new_id_pat = re.compile(r"^\d+\.[cemtx]+\S+")

    Examples: 23231.m312389, 23231.t004898, 23231.tRNA.144
    Adjust the value of `new_id_pat` manually as per your ID naming conventions.
    """
    # Developer notes (kept out of the docstring, which doubles as the CLI
    # usage text and is raw because it contains regex backslashes):
    #   args    -- list of command-line tokens; exactly two positionals
    #              (new.bed, old.bed) must remain after option parsing.
    #   returns -- None.  Writes `<new prefix>.annotated.bed` and passes it
    #              to `sort` with "-i".
    #   exits   -- via sys.exit() on wrong argument count, or when the needle
    #              scores file is missing in `alignment` mode.
    valid_resolve_choices = ["alignment", "overlap"]

    p = OptionParser(annotate.__doc__)
    p.add_option("--resolve", default="alignment", choices=valid_resolve_choices,
                 help="Resolve ID assignment based on a certain metric"
                      " [default: %default]")
    p.add_option("--atg_name", default=False, action="store_true",
                 help="Specify is locus IDs in `new.bed` file follow ATG nomenclature"
                      " [default: %default]")

    g1 = OptionGroup(p, "Optional parameters (alignment):\n"
                        "Use if resolving ambiguities based on sequence `alignment`")
    g1.add_option("--pid", dest="pid", default=35., type="float",
                  help="Percent identity cutoff [default: %default]")
    g1.add_option("--score", dest="score", default=250., type="float",
                  help="Alignment score cutoff [default: %default]")
    p.add_option_group(g1)

    g2 = OptionGroup(p, "Optional parameters (overlap):\n"
                        "Use if resolving ambiguities based on `overlap` length\n"
                        "Parameters equivalent to `intersectBed`")
    g2.add_option("-f", dest="f", default=0.5, type="float",
                  help="Minimum overlap fraction (0.0 - 1.0) [default: %default]")
    g2.add_option("-r", dest="r", default=False, action="store_true",
                  help="Require fraction overlap to be reciprocal [default: %default]")
    # NOTE(review): default=True combined with action="store_true" means
    # passing -s never changes anything; same-strandedness is always on.
    g2.add_option("-s", dest="s", default=True, action="store_true",
                  help="Require same strandedness [default: %default]")
    p.add_option_group(g2)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None, so this exits with a non-zero status.
        sys.exit(not p.print_help())

    # Deferred until the arguments are known to be usable, so that usage
    # errors do not pay for (or depend on) the project import.
    from jcvi.utils.grouper import Grouper

    nbedfile, obedfile = args
    npf = nbedfile.rsplit(".", 1)[0]

    # Make consolidated.bed (reused if it is already on disk)
    cbedfile = "consolidated.bed"
    if not os.path.isfile(cbedfile):
        consolidate(nbedfile, obedfile, cbedfile)
    else:
        logging.warning("`{0}` already exists. Skipping step".format(cbedfile))

    logging.warning("Resolving ID assignment ambiguity based on `{0}`".
                    format(opts.resolve))

    if opts.resolve == "alignment":
        # Get pairs and prompt to run needle
        pairsfile = "nw.pairs"
        scoresfile = "nw.scores"
        if not os.path.isfile(pairsfile):
            get_pairs(cbedfile, pairsfile)
        else:
            logging.warning("`{0}` already exists. Checking for needle output".
                            format(pairsfile))

        # If needle scores do not exist, prompt user to run needle
        if not os.path.isfile(scoresfile):
            logging.error("`{0}` does not exist. Please process {1} using `needle`".
                          format(scoresfile, pairsfile))
            sys.exit()
    else:
        scoresfile = "ovl.scores"
        # Calculate overlap length using intersectBed
        calculate_ovl(nbedfile, obedfile, opts, scoresfile)

    logging.warning("`{0}' exists. Storing scores in memory".
                    format(scoresfile))
    scores = read_scores(scoresfile, opts)

    # Iterate through consolidated bed and
    # filter piles based on score
    abedline = {}

    # Each consolidated accession is a ";"-separated list of ids; joining
    # them puts all members of a pile into the same group.
    cbed = Bed(cbedfile)
    g = Grouper()
    for c in cbed:
        g.join(*c.accn.split(";"))

    # Index the new bed lines by accession.
    nbed = Bed(nbedfile)
    nbedline = {}
    for line in nbed:
        nbedline[line.accn] = line

    splits = set()
    for seqid, chrbed in nbed.sub_beds():
        abedline, splits = annotate_chr(seqid, chrbed, g, scores,
                                        nbedline, abedline, opts, splits)

    if splits is not None:
        abedline = process_splits(splits, scores, nbedline, abedline)

    abedfile = npf + ".annotated.bed"
    # `with` guarantees the handle is flushed and closed before sorting,
    # even if formatting one of the records raises.
    with open(abedfile, "w") as afh:
        for accn in abedline:
            afh.write("{0}\n".format(abedline[accn]))

    sort([abedfile, "-i"])
def prepare(args):
    """
    %prog prepare --rearray_lib=<rearraylibrary> --orig_lib_file=<origlibfile>

    Inferred file names
    ---------------------------------------------
    `lookuptblfile` : rearraylibrary.lookup
    `rearraylibfile`: rearraylibrary.fasta

    Pick sequences from the original library file and the rearrayed library file
    based on the mapping information provided in the `lookuptblfile`.

    # lookuptblfile format: column number (index)
    # 1 (0)          2 (1)          3 (2)         4 (3)        5 (4)        6 (5)
    # source_clone   source_plate   source_well   dest_clone   dest_plate   dest_well

    The 1st and 4th column in the `lookuptblfile` form the pair of clones which
    constitute the elements used for the per-clone assembly.
    """
    # Developer notes (kept out of the docstring, which doubles as the CLI
    # usage text):
    #   args    -- list of command-line tokens.
    #   returns -- None.  Writes one `<orig>_<rearray>.fasta` per lookup row
    #              into the output folder, plus `<rearray_lib>.log` listing
    #              the files written.
    #   exits   -- via sys.exit() when a required option or input file is
    #              missing.
    from operator import itemgetter

    p = OptionParser(prepare.__doc__)
    p.add_option("--rearray_lib", default=None,
                 help="name of the rearrayed library [default: %default]")
    p.add_option("--orig_lib_file",
                 help="fasta file containing reads from the original libraries [default: %default]")

    g = OptionGroup(p, "Optional parameters")
    g.add_option("--output_folder", default="to_assemble",
                 help="output folder to write the FASTA files to [default: %default]")
    p.add_option_group(g)

    opts, args = p.parse_args(args)

    if not opts.rearray_lib or not opts.orig_lib_file:
        logging.error("Please specify the required parameters")
        # print_help() returns None, so this exits with a non-zero status.
        sys.exit(not p.print_help())

    rearraylib, origlibfile = opts.rearray_lib, opts.orig_lib_file

    if not op.isfile(origlibfile):
        logging.error("Original library reads file `{0}` does not exist!".format(origlibfile))
        sys.exit()

    lookuptblfile = rearraylib + '.lookup'
    logging.debug(lookuptblfile)
    if not op.isfile(lookuptblfile):
        logging.error("Lookup table file `{0}` does not exist!".format(lookuptblfile))
        sys.exit()

    rearraylibfile = rearraylib + '.fasta'
    logging.debug(rearraylibfile)
    if not op.isfile(rearraylibfile):
        logging.error("Rearrayed library reads file `{0}` does not exist!".format(rearraylibfile))
        sys.exit()

    # Deferred until all inputs are validated, so usage errors do not pay
    # for (or depend on) the project import.
    from jcvi.formats.fasta import Fasta, SeqIO

    origlibFasta = Fasta(origlibfile)
    rearraylibFasta = Fasta(rearraylibfile)

    origlibids = list(origlibFasta.iterkeys_ordered())
    rearraylibids = list(rearraylibFasta.iterkeys_ordered())

    if not op.isdir(opts.output_folder):
        logging.warning("Output directory `{0}` missing. Creating it now...".format(opts.output_folder))
        os.makedirs(opts.output_folder)

    logfile = rearraylib + '.log'
    # Context managers close the log, the lookup table and every per-pair
    # FASTA file even if a write fails part-way through.
    with open(logfile, 'w') as log:
        with open(lookuptblfile, 'r') as fp:
            for row in fp:
                if not row.strip():
                    # tolerate blank lines (e.g. a trailing one at EOF)
                    continue
                # Drop the line terminator first so it cannot end up inside
                # the 4th column when a row has exactly four fields.
                fields = row.rstrip('\r\n').split('\t')
                origprefix, rearrayprefix = itemgetter(0, 3)(fields)

                # NOTE(review): the clone names are used as regular
                # expressions anchored at the start of each read id, so regex
                # metacharacters in a clone name are interpreted, not matched
                # literally.  Compiled once per row instead of per id.
                orig_match = re.compile(origprefix).match
                rearray_match = re.compile(rearrayprefix).match

                libpair = origprefix + '_' + rearrayprefix
                outfile = opts.output_folder + '/' + libpair + '.fasta'
                with open(outfile, 'w') as ofp:
                    for o in origlibids:
                        if orig_match(o):
                            SeqIO.write(origlibFasta[o], ofp, 'fasta')
                    for r in rearraylibids:
                        if rearray_match(r):
                            SeqIO.write(rearraylibFasta[r], ofp, 'fasta')
                log.write(outfile + '\n')

    logging.debug('Wrote log file `{0}`'.format(logfile))
def assemble(args):
    """
    Run `cap3` on a single multi FASTA file containing reads or a folder containing several
    multi FASTA files. Allows for tweaking of `cap3` parameters max_gap_len, ovl_pct_id, etc.
    """
    # Developer notes (kept out of the docstring, which doubles as the CLI
    # usage text):
    #   args    -- list of command-line tokens.
    #   returns -- None.  Runs one `cap3` command per existing input file via
    #              `sh`, sending its output to `<file>.<prefix>.log`.
    #   exits   -- via sys.exit() on out-of-range parameters, a missing
    #              list file/folder, no input option, or an empty file list.
    p = OptionParser(assemble.__doc__)
    g1 = OptionGroup(p, "Input file options (required)",
                     "Note: Please choose from and provide values for one of the following parameters")
    g1.add_option("--input_file", default=None,
                  help="input file of reads [default: %default]")
    g1.add_option("--input_folder", default=None,
                  help="input folder containing multi FASTA files of reads [default: %default]")
    g1.add_option("--input_file_list", default=None,
                  help="list file containing paths to multi FASTA files of reads [default: %default]")
    p.add_option_group(g1)

    g2 = OptionGroup(p, "Optional parameters",
                     "Note: If not specified, `cap3` defaults will be used")
    g2.add_option("-f", "--max_gap_len", default=20, type="int",
                  help="maximum gap length in any overlap [default: %default]\n"
                       "Same as cap3 `-f` parameter.")
    g2.add_option("-p", "--ovl_pct_id", default=90, type="int",
                  help="overlap percent identity cutoff [default: %default]\n"
                       "Same as cap3 `-p` parameter.")
    g2.add_option("-s", "--ovl_sim_score", default=900, type="int",
                  help="overlap similarity score cutoff [default: %default]\n"
                       "Same as cap3 `-s` parameter.")
    g2.add_option("-x", "--prefix", dest="prefix", default="cap3",
                  help="prefix string for output file name [default: %default]")
    p.add_option_group(g2)

    # presumably registers the `--extra` option read below -- confirm
    # against the project's OptionParser.
    p.set_params()

    opts, args = p.parse_args(args)

    # The three options always hold ints (int defaults, type="int"), so they
    # are compared directly.  A truthiness guard here would let an explicit
    # 0 slip past the range check.
    if opts.max_gap_len <= 1:
        logging.error("--max_gap_len should be > 1")
        sys.exit()
    elif opts.ovl_pct_id <= 65:
        logging.error("--ovl_pct_id should be > 65")
        sys.exit()
    elif opts.ovl_sim_score <= 250:
        logging.error("--ovl_sim_score should be > 250")
        sys.exit()

    # Input precedence: list file, then folder, then single file.
    file_list = []
    if opts.input_file_list:
        if not op.isfile(opts.input_file_list):
            logging.error("Input file list {0} does not exist".format(opts.input_file_list))
            sys.exit()
        with open(opts.input_file_list, 'r') as f:
            file_list = f.read().splitlines()
    elif opts.input_folder:
        if not op.isdir(opts.input_folder):
            logging.error("Input folder {0} does not exist".format(opts.input_folder))
            sys.exit()
        folder = opts.input_folder.rstrip('/')
        # Keep only FASTA-looking names (case-insensitive) and prefix each
        # with the folder.
        file_list = [folder + "/" + name
                     for name in os.listdir(opts.input_folder)
                     if name.lower().endswith(('.fa', '.fasta'))]
    elif opts.input_file:
        file_list.append(opts.input_file)
    else:
        logging.error("Please specify one of the options for input files")
        # print_help() returns None, so this exits with a non-zero status.
        sys.exit(not p.print_help())

    if not file_list:
        logging.warning("List of files to process is empty. Please check your input!")
        sys.exit()

    for fastafile in file_list:
        if not op.isfile(fastafile):
            # A missing entry is reported and skipped, not fatal.
            logging.warning("Input file {0} does not exist".format(fastafile))
            continue

        cmd = "cap3 {0} -f {1} -p {2} -s {3} -x {4}".format(
            fastafile, opts.max_gap_len, opts.ovl_pct_id,
            opts.ovl_sim_score, opts.prefix)
        if opts.extra:
            cmd += " {0}".format(opts.extra)
        logfile = "{0}.{1}.log".format(fastafile, opts.prefix)
        sh(cmd, outfile=logfile)
def add_seeds_options(p, args):
    """Register the seed-imaging options on `p` and parse `args`.

    Four option groups are added to the parser, in this order: "Image
    manipulation", "Object recognition", "De-noise" and "Output".  Parsing is
    delegated to `p.set_image_options(args, figsize='12x6', style="white")`.

    Args:
        p: option parser providing `add_option_group` and `set_image_options`.
        args: list of command-line tokens to parse.

    Returns:
        The `(opts, args, iopts)` triple returned by `p.set_image_options`.
    """
    g1 = OptionGroup(p, "Image manipulation")
    g1.add_option("--rotate", default=0, type="int",
                  help="Rotate degrees clockwise")
    g1.add_option("--rows", default=':',
                  help="Crop rows e.g. `:800` from first 800 rows")
    g1.add_option("--cols", default=':',
                  help="Crop cols e.g. `-800:` from last 800 cols")
    g1.add_option("--labelrows",
                  help="Label rows e.g. `:800` from first 800 rows")
    # Help text fixed: balanced backticks, and "cols" (not "rows") for the
    # column option, matching the wording used for --cols.
    g1.add_option("--labelcols",
                  help="Label cols e.g. `-800:` from last 800 cols")
    p.add_option_group(g1)

    g2 = OptionGroup(p, "Object recognition")
    g2.add_option("--minsize", default=.05, type="float",
                  help="Min percentage of object to image")
    g2.add_option("--maxsize", default=50, type="float",
                  help="Max percentage of object to image")
    g2.add_option("--count", default=100, type="int",
                  help="Report max number of objects")
    g2.add_option("--watershed", default=False, action="store_true",
                  help="Run watershed to segment touching objects")
    p.add_option_group(g2)

    g3 = OptionGroup(p, "De-noise")
    valid_filters = ("canny", "roberts", "sobel")
    g3.add_option("--filter", default="canny", choices=valid_filters,
                  help="Edge detection algorithm")
    g3.add_option("--sigma", default=1, type="int",
                  help="Canny edge detection sigma, higher for noisy image")
    g3.add_option("--kernel", default=2, type="int",
                  help="Edge closure, higher if the object edges are dull")
    g3.add_option("--border", default=5, type="int",
                  help="Remove image border of certain pixels")
    p.add_option_group(g3)

    g4 = OptionGroup(p, "Output")
    g4.add_option("--calibrate",
                  help="JSON file to correct distance and color")
    g4.add_option("--edges", default=False, action="store_true",
                  help="Visualize edges in middle PDF panel")
    g4.add_option("--outdir", default=".",
                  help="Store intermediate images and PDF in folder")
    g4.add_option("--prefix", help="Output prefix")
    g4.add_option("--noheader", default=False, action="store_true",
                  help="Do not print header")
    p.add_option_group(g4)

    opts, args, iopts = p.set_image_options(args, figsize='12x6', style="white")
    return opts, args, iopts
def main():
    """
    %prog bedfile id_mappings

    Takes a bedfile that contains the coordinates of features to plot on the
    chromosomes, and `id_mappings` file that map the ids to certain class. Each
    class will get assigned a unique color. `id_mappings` file is optional (if
    omitted, will not paint the chromosome features, except the centromere).

    The extent of the chromosomes are given by --sizes, which contains
    chr<tab>size, one per line. If not specified, the extent of the chromosomes
    are assumed to be the end for the last feature, which might be an underestimate.
    """
    # Developer notes (kept out of the docstring, which doubles as the CLI
    # usage text):
    #   returns -- None.  Draws onto a full-figure axes and saves the image
    #              as `<bedfile prefix>.<opts.format>`.
    #   exits   -- via sys.exit() when the positional argument count is not
    #              1 or 2.
    p = OptionParser(main.__doc__)
    p.add_option(
        "--sizes",
        help="FASTA sizes file, which contains chr<tab>size, one per line")

    g = OptionGroup(p, "Display accessories")
    g.add_option(
        "--title",
        help="title of the image",
    )
    g.add_option(
        "--gauge",
        default=False,
        action="store_true",
        help="draw a gauge with size label",
    )
    p.add_option_group(g)

    g = OptionGroup(p, "HTML image map")
    g.add_option(
        "--imagemap",
        default=False,
        action="store_true",
        help="generate an HTML image map associated with the image",
    )
    g.add_option(
        "--winsize",
        default=50000,
        type="int",
        help="if drawing an imagemap, specify the window size (bases) of each map element ",
    )
    p.add_option_group(g)

    g = OptionGroup(p, "Color legend")
    # --nolegend flips `opts.legend` from its default of True to False.
    g.add_option(
        "--nolegend",
        dest="legend",
        default=True,
        action="store_false",
        help="Do not generate color legend",
    )
    g.add_option("--mergedist", default=0, type="int",
                 help="Merge regions closer than ")
    g.add_option("--empty", help="Write legend for unpainted region")
    p.add_option_group(g)

    opts, args, iopts = p.set_image_options(figsize="6x6", dpi=300)

    if len(args) not in (1, 2):
        # print_help() returns None; negating it yields a non-zero exit
        # status for the usage error, as elsewhere in this file.
        sys.exit(not p.print_help())

    bedfile = args[0]
    # The id mapping file is optional.
    mappingfile = args[1] if len(args) == 2 else None

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])  # a single axes covering the figure

    draw_chromosomes(
        root,
        bedfile,
        sizes=opts.sizes,
        iopts=iopts,
        mergedist=opts.mergedist,
        winsize=opts.winsize,
        imagemap=opts.imagemap,
        mappingfile=mappingfile,
        gauge=opts.gauge,
        legend=opts.legend,
        empty=opts.empty,
        title=opts.title,
    )

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    # Name the image after the bed file, swapping in the chosen format.
    prefix = bedfile.rsplit(".", 1)[0]
    figname = prefix + "." + opts.format
    savefig(figname, dpi=iopts.dpi, iopts=iopts)
def prepare(args):
    """
    %prog prepare --rearray_lib=<rearraylibrary> --orig_lib_file=<origlibfile>

    Inferred file names
    ---------------------------------------------
    `lookuptblfile` : rearraylibrary.lookup
    `rearraylibfile`: rearraylibrary.fasta

    Pick sequences from the original library file and the rearrayed library file
    based on the mapping information provided in the `lookuptblfile`.

    # lookuptblfile format: column number (index)
    # 1 (0)          2 (1)          3 (2)         4 (3)        5 (4)        6 (5)
    # source_clone   source_plate   source_well   dest_clone   dest_plate   dest_well

    The 1st and 4th column in the `lookuptblfile` form the pair of clones which
    constitute the elements used for the per-clone assembly.
    """
    # Developer notes (kept out of the docstring, which doubles as the CLI
    # usage text):
    #   args    -- list of command-line tokens.
    #   returns -- None.  Writes one `<orig>_<rearray>.fasta` per lookup row
    #              into the output folder, plus `<rearray_lib>.log` listing
    #              the files written.
    #   exits   -- via sys.exit() when a required option or input file is
    #              missing.
    from operator import itemgetter

    p = OptionParser(prepare.__doc__)
    p.add_option("--rearray_lib", default=None,
                 help="name of the rearrayed library [default: %default]")
    p.add_option(
        "--orig_lib_file",
        help="fasta file containing reads from the original libraries [default: %default]")

    g = OptionGroup(p, "Optional parameters")
    g.add_option(
        "--output_folder", default="to_assemble",
        help="output folder to write the FASTA files to [default: %default]")
    p.add_option_group(g)

    opts, args = p.parse_args(args)

    if not opts.rearray_lib or not opts.orig_lib_file:
        logging.error("Please specify the required parameters")
        # print_help() returns None, so this exits with a non-zero status.
        sys.exit(not p.print_help())

    rearraylib, origlibfile = opts.rearray_lib, opts.orig_lib_file

    if not op.isfile(origlibfile):
        logging.error(
            "Original library reads file `{0}` does not exist!".format(
                origlibfile))
        sys.exit()

    lookuptblfile = rearraylib + '.lookup'
    logging.debug(lookuptblfile)
    if not op.isfile(lookuptblfile):
        logging.error(
            "Lookup table file `{0}` does not exist!".format(lookuptblfile))
        sys.exit()

    rearraylibfile = rearraylib + '.fasta'
    logging.debug(rearraylibfile)
    if not op.isfile(rearraylibfile):
        logging.error(
            "Rearrayed library reads file `{0}` does not exist!".format(
                rearraylibfile))
        sys.exit()

    # Deferred until all inputs are validated, so usage errors do not pay
    # for (or depend on) the project import.
    from jcvi.formats.fasta import Fasta, SeqIO

    origlibFasta = Fasta(origlibfile)
    rearraylibFasta = Fasta(rearraylibfile)

    origlibids = list(origlibFasta.iterkeys_ordered())
    rearraylibids = list(rearraylibFasta.iterkeys_ordered())

    if not op.isdir(opts.output_folder):
        logging.warning(
            "Output directory `{0}` missing. Creating it now...".format(
                opts.output_folder))
        os.makedirs(opts.output_folder)

    logfile = rearraylib + '.log'
    # Context managers close the log, the lookup table and every per-pair
    # FASTA file even if a write fails part-way through.
    with open(logfile, 'w') as log:
        with open(lookuptblfile, 'r') as fp:
            for row in fp:
                if not row.strip():
                    # tolerate blank lines (e.g. a trailing one at EOF)
                    continue
                # Drop the line terminator first so it cannot end up inside
                # the 4th column when a row has exactly four fields.
                fields = row.rstrip('\r\n').split('\t')
                origprefix, rearrayprefix = itemgetter(0, 3)(fields)

                # NOTE(review): the clone names are used as regular
                # expressions anchored at the start of each read id, so regex
                # metacharacters in a clone name are interpreted, not matched
                # literally.  Compiled once per row instead of per id.
                orig_match = re.compile(origprefix).match
                rearray_match = re.compile(rearrayprefix).match

                libpair = origprefix + '_' + rearrayprefix
                outfile = opts.output_folder + '/' + libpair + '.fasta'
                with open(outfile, 'w') as ofp:
                    for o in origlibids:
                        if orig_match(o):
                            SeqIO.write(origlibFasta[o], ofp, 'fasta')
                    for r in rearraylibids:
                        if rearray_match(r):
                            SeqIO.write(rearraylibFasta[r], ofp, 'fasta')
                log.write(outfile + '\n')

    logging.debug('Wrote log file `{0}`'.format(logfile))
def assemble(args):
    """
    Run `cap3` on a single multi FASTA file containing reads or a folder containing several
    multi FASTA files. Allows for tweaking of `cap3` parameters max_gap_len, ovl_pct_id, etc.
    """
    # Developer notes (kept out of the docstring, which doubles as the CLI
    # usage text):
    #   args    -- list of command-line tokens.
    #   returns -- None.  Runs one `cap3` command per existing input file via
    #              `sh`, sending its output to `<file>.<prefix>.log`.
    #   exits   -- via sys.exit() on out-of-range parameters, a missing
    #              list file/folder, no input option, or an empty file list.
    p = OptionParser(assemble.__doc__)
    g1 = OptionGroup(
        p, "Input file options (required)",
        "Note: Please choose from and provide values for one of the following parameters")
    g1.add_option("--input_file", default=None,
                  help="input file of reads [default: %default]")
    g1.add_option(
        "--input_folder", default=None,
        help="input folder containing multi FASTA files of reads [default: %default]")
    g1.add_option(
        "--input_file_list", default=None,
        help="list file containing paths to multi FASTA files of reads [default: %default]")
    p.add_option_group(g1)

    g2 = OptionGroup(p, "Optional parameters",
                     "Note: If not specified, `cap3` defaults will be used")
    g2.add_option("-f", "--max_gap_len", default=20, type="int",
                  help="maximum gap length in any overlap [default: %default]\n"
                       "Same as cap3 `-f` parameter.")
    g2.add_option("-p", "--ovl_pct_id", default=90, type="int",
                  help="overlap percent identity cutoff [default: %default]\n"
                       "Same as cap3 `-p` parameter.")
    g2.add_option("-s", "--ovl_sim_score", default=900, type="int",
                  help="overlap similarity score cutoff [default: %default]\n"
                       "Same as cap3 `-s` parameter.")
    g2.add_option(
        "-x", "--prefix", dest="prefix", default="cap3",
        help="prefix string for output file name [default: %default]")
    p.add_option_group(g2)

    # presumably registers the `--extra` option read below -- confirm
    # against the project's OptionParser.
    p.set_params()

    opts, args = p.parse_args(args)

    # The three options always hold ints (int defaults, type="int"), so they
    # are compared directly.  A truthiness guard here would let an explicit
    # 0 slip past the range check.
    if opts.max_gap_len <= 1:
        logging.error("--max_gap_len should be > 1")
        sys.exit()
    elif opts.ovl_pct_id <= 65:
        logging.error("--ovl_pct_id should be > 65")
        sys.exit()
    elif opts.ovl_sim_score <= 250:
        logging.error("--ovl_sim_score should be > 250")
        sys.exit()

    # Input precedence: list file, then folder, then single file.
    file_list = []
    if opts.input_file_list:
        if not op.isfile(opts.input_file_list):
            logging.error("Input file list {0} does not exist".format(
                opts.input_file_list))
            sys.exit()
        with open(opts.input_file_list, 'r') as f:
            file_list = f.read().splitlines()
    elif opts.input_folder:
        if not op.isdir(opts.input_folder):
            logging.error("Input folder {0} does not exist".format(
                opts.input_folder))
            sys.exit()
        folder = opts.input_folder.rstrip('/')
        # Keep only FASTA-looking names (case-insensitive) and prefix each
        # with the folder.
        file_list = [folder + "/" + name
                     for name in os.listdir(opts.input_folder)
                     if name.lower().endswith(('.fa', '.fasta'))]
    elif opts.input_file:
        file_list.append(opts.input_file)
    else:
        logging.error("Please specify one of the options for input files")
        # print_help() returns None, so this exits with a non-zero status.
        sys.exit(not p.print_help())

    if not file_list:
        logging.warning(
            "List of files to process is empty. Please check your input!")
        sys.exit()

    for fastafile in file_list:
        if not op.isfile(fastafile):
            # A missing entry is reported and skipped, not fatal.
            logging.warning("Input file {0} does not exist".format(fastafile))
            continue

        cmd = "cap3 {0} -f {1} -p {2} -s {3} -x {4}".format(
            fastafile, opts.max_gap_len, opts.ovl_pct_id,
            opts.ovl_sim_score, opts.prefix)
        if opts.extra:
            cmd += " {0}".format(opts.extra)
        logfile = "{0}.{1}.log".format(fastafile, opts.prefix)
        sh(cmd, outfile=logfile)