def A50(args):
    """
    %prog A50 contigs_A.fasta contigs_B.fasta ...

    Plots A50 graphics, see blog post (http://blog.malde.org/index.php/a50/)
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is user-facing output; maintainer notes live in comments instead.
    #
    # args: list of command-line tokens (options followed by FASTA paths).
    # Side effects: (re)writes "A50.rplot" in the current directory when it is
    # missing or --overwrite is given, prints a per-file summary table to
    # stderr in that case, and always finishes by calling generate_plot().
    p = OptionParser(A50.__doc__)
    p.add_option("--overwrite", default=False, action="store_true",
                 help="overwrite .rplot file if exists [default: %default]")
    p.add_option("--cutoff", default=0, type="int", dest="cutoff",
                 help="use contigs above certain size [default: %default]")
    p.add_option("--stepsize", default=10, type="int", dest="stepsize",
                 help="stepsize for the distribution [default: %default]")
    opts, args = p.parse_args(args)

    if not args:
        # Same form as count(): exits with a non-zero status on missing input
        # (assuming print_help() returns None, as optparse's does).
        sys.exit(not p.print_help())

    import numpy as np
    from jcvi.utils.table import loadtable

    stepsize = opts.stepsize  # use stepsize to speed up drawing
    rplot = "A50.rplot"
    # NOTE(review): an existing A50.rplot is reused as-is, so a partial file
    # left behind by a failed run is only replaced when --overwrite is given.
    if not op.exists(rplot) or opts.overwrite:
        header = "\t".join(("index", "cumsize", "fasta"))
        statsheader = ("Fasta", "L50", "N50", "Min", "Max", "Average", "Sum",
                       "Counts")
        statsrows = []

        # `with` guarantees the handle is closed even if reading a FASTA file
        # or computing its statistics raises part-way through.
        with open(rplot, "w") as fw:
            print >> fw, header
            for fastafile in args:
                f = Fasta(fastafile, index=False)
                ctgsizes = np.array([length for k, length in f.itersizes()])

                a50, l50, n50 = calculate_A50(ctgsizes, cutoff=opts.cutoff)
                cmin, cmax = min(ctgsizes), max(ctgsizes)
                cmean = int(round(np.mean(ctgsizes)))
                csum, counts = np.sum(ctgsizes), len(ctgsizes)
                statsrows.append((fastafile, l50, n50, cmin, cmax, cmean,
                                  csum, counts))

                logging.debug("`{0}` ctgsizes: {1}".format(fastafile, ctgsizes))

                # Series label: file name without directory or last extension.
                tag = "{0} (L50={1})".format(
                    op.basename(fastafile).rsplit(".", 1)[0], l50)
                logging.debug(tag)

                # Emit only every `stepsize`-th point, paired with its index;
                # values are divided by 1e6 (presumably bases -> Mb).
                for i, s in zip(xrange(0, len(a50), stepsize),
                                a50[::stepsize]):
                    print >> fw, "\t".join((str(i), str(s / 1000000.), tag))

        table = loadtable(statsheader, statsrows)
        print >> sys.stderr, table

    generate_plot(rplot)
def count(args):
    """
    %prog count *.gz

    Count reads based on FASTQC results. FASTQC needs to be run on all the input
    data given before running this command.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is user-facing output; maintainer notes live in comments instead.
    #
    # args: list of command-line tokens (options followed by input file names).
    # For each input the matching "<name>_fastqc/fastqc_data.txt" is loaded
    # through FastQCdata; the collected rows are printed as a table on stderr
    # and passed to write_csv() together with the --sep/--outfile/--align
    # style options read from `opts`.
    from jcvi.utils.table import loadtable, write_csv

    p = OptionParser(count.__doc__)
    p.add_option("--dir",
                 help="Sub-directory where FASTQC was run [default: %default]")
    p.add_option("--human", default=False, action="store_true",
                 help="Human friendly numbers [default: %default]")
    p.set_table()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if not args:
        sys.exit(not p.print_help())

    filenames = args
    subdir = opts.dir
    header = "Filename|Total Sequences|Sequence length|Total Bases".split("|")
    human = opts.human
    gz = ".gz"

    rows = []
    for f in filenames:
        # Strip ".gz" only as a trailing suffix; a plain str.replace() would
        # also eat ".gz" inside directory names or the middle of a file name.
        stem = f[:-len(gz)] if f.endswith(gz) else f
        folder = stem.rsplit(".", 1)[0] + "_fastqc"
        if subdir:
            folder = op.join(subdir, folder)
        summaryfile = op.join(folder, "fastqc_data.txt")

        fqcdata = FastQCdata(summaryfile, human=human)
        # One value per header column, looked up by column title.
        rows.append([fqcdata[x] for x in header])

    print >> sys.stderr, loadtable(header, rows)
    write_csv(header, rows, sep=opts.sep,
              filename=opts.outfile, align=opts.align)
def count(args):
    """
    %prog count *.gz

    Count reads based on FASTQC results. FASTQC needs to be run on all the input
    data given before running this command.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is user-facing output; maintainer notes live in comments instead.
    #
    # args: list of command-line tokens (options followed by input file names).
    # For each input the matching "<name>_fastqc/fastqc_data.txt" is loaded
    # through FastQCdata; the collected rows are printed as a table on stderr
    # and written as comma-separated values to the --outfile destination.
    import csv
    from jcvi.utils.table import loadtable

    p = OptionParser(count.__doc__)
    p.add_option("--dir",
                 help="Sub-directory where FASTQC was run [default: %default]")
    p.add_option("--human", default=False, action="store_true",
                 help="Human friendly numbers [default: %default]")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if not args:
        sys.exit(not p.print_help())

    filenames = args
    subdir = opts.dir
    header = "Filename|Total Sequences|Sequence length|Total Bases".split("|")
    human = opts.human
    gz = ".gz"

    rows = []
    for f in filenames:
        # Strip ".gz" only as a trailing suffix; a plain str.replace() would
        # also eat ".gz" inside directory names or the middle of a file name.
        stem = f[:-len(gz)] if f.endswith(gz) else f
        folder = stem.rsplit(".", 1)[0] + "_fastqc"
        if subdir:
            folder = op.join(subdir, folder)
        summaryfile = op.join(folder, "fastqc_data.txt")

        fqcdata = FastQCdata(summaryfile, human=human)
        # One value per header column, looked up by column title.
        rows.append([fqcdata[x] for x in header])

    print >> sys.stderr, loadtable(header, rows)

    # NOTE(review): fw is left open, as before; must_open is defined elsewhere
    # and may hand back a shared stream -- confirm before adding a close().
    fw = must_open(opts.outfile, "w")
    # csv.writer quotes fields containing commas/quotes (e.g. odd file names),
    # which a bare ",".join() would turn into a malformed row. Values are
    # stringified with str() first so ordinary fields are written unchanged.
    writer = csv.writer(fw, lineterminator="\n")
    for d in [header] + rows:
        writer.writerow([str(x) for x in d])