def filterdata(args):
    """
    %prog filterdata data.bin samples.ids STR.ids allele_freq remove.ids final.ids

    Filter subset of data after dropping remove.ids.
    """
    # Outputs written to the current directory: `final.mask.tsv`,
    # `filtered.bin` and `final.data.tsv`.
    p = OptionParser(filterdata.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 6:
        sys.exit(not p.print_help())

    binfile, sampleids, strids, af, remove, final = args
    df, m, samples, loci = read_binfile(binfile, sampleids, strids)

    # Read both id lists eagerly so the file handles are released right away
    with open(remove) as fp:
        remove = [x.strip() for x in fp]
    removes = set(remove)
    with open(final) as fp:
        final = [x.strip() for x in fp]
    # Every locus has to be either dropped or kept
    assert len(loci) == len(remove) + len(final)

    # Percentile lookup keyed by locus name; each row of the allele frequency
    # file must split into exactly two whitespace-separated fields
    percentiles = {}
    with open(af) as fp:
        for row in fp:
            sname, counts = row.split()
            countsd = af_to_counts(counts)
            percentiles[sname] = counts_to_percentile(countsd)

    # One job per retained locus: (column index, column values, percentile)
    run_args = [(i, m[:, i], percentiles[sname])
                for i, sname in enumerate(loci) if sname not in removes]

    cpus = min(opts.cpus, len(run_args))
    pool = Pool(processes=cpus)
    try:
        res = list(pool.map_async(convert_to_percentile, run_args).get())
    finally:
        # Always release the worker processes, even if a job failed
        pool.close()
        pool.join()
    res.sort()

    # Write mask (P-value) matrix
    _, pvalues = zip(*res)
    m = np.vstack(pvalues).T
    write_csv("final.mask.tsv", m, samples, final)

    df.drop(remove, inplace=True, axis=1)
    df.columns = final

    # Save a copy of the raw numpy array
    filtered_bin = "filtered.bin"
    # `.values` is the drop-in replacement for `DataFrame.as_matrix()`, which
    # newer pandas releases no longer provide
    m = df.values
    m[m < 0] = -1
    m.tofile(filtered_bin)
    logging.debug("Binary matrix written to `{}`".format(filtered_bin))

    # Write data output
    df.to_csv("final.data.tsv", sep="\t", index_label="SampleKey")
def write_mask(cpus, samples, final_columns, run_args, filename="mask.tsv"):
    """
    Compute the mask (P-value) matrix in parallel and write it to `filename`.

    `convert_to_percentile` is mapped over `run_args` using a pool of `cpus`
    worker processes. Each result is unpacked as a pair whose second element
    is stacked into the output matrix; the transposed stack is handed to
    `write_csv` together with `samples` and `final_columns`.

    Any exception raised inside a worker is re-raised here.
    """
    pool = Pool(processes=cpus)
    try:
        # `.get()` returns the flat list of per-job results, in the same order
        # as `run_args`, and propagates worker failures. The former
        # `callback=res.append` + `wait()` combination received that whole list
        # as a single element (hence the old un-nesting step) and hid errors.
        res = pool.map_async(convert_to_percentile, run_args).get()
    finally:
        pool.close()
        pool.join()

    # Write mask (P-value) matrix
    _, pvalues = zip(*res)
    m = np.vstack(pvalues).T
    write_csv(filename, m, samples, final_columns)
def count(args):
    """
    %prog count *.gz

    Count reads based on FASTQC results. FASTQC needs to be run on all the input
    data given before running this command.
    """
    from jcvi.utils.table import loadtable, write_csv

    p = OptionParser(count.__doc__)
    p.add_option("--dir",
                 help="Sub-directory where FASTQC was run [default: %default]")
    p.add_option("--human", default=False, action="store_true",
                 help="Human friendly numbers [default: %default]")
    p.set_table()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    filenames = args
    subdir = opts.dir
    # These labels double as the keys looked up in each FastQCdata record
    header = "Filename|Total Sequences|Sequence length|Total Bases".split("|")
    human = opts.human
    rows = []
    for f in filenames:
        # Drop `.gz` and the last remaining extension, then append `_fastqc`
        # to get the folder holding `fastqc_data.txt`
        folder = f.replace(".gz", "").rsplit(".", 1)[0] + "_fastqc"
        if subdir:
            folder = op.join(subdir, folder)
        summaryfile = op.join(folder, "fastqc_data.txt")

        fqcdata = FastQCdata(summaryfile, human=human)
        rows.append([fqcdata[x] for x in header])

    # Preview table goes to stderr; written with `write()` so it behaves the
    # same under Python 2 and Python 3 (`print >> stream` is Python 2 only)
    sys.stderr.write(str(loadtable(header, rows)) + "\n")
    write_csv(header, rows, sep=opts.sep,
              filename=opts.outfile, align=opts.align)
def __init__(self, filename, delimiter=','):
    """
    Load the layout stored in `filename`, creating a template first if needed.

    When `filename` does not exist, a layout is written with one row per
    `*.ks` file that `iglob` finds under ".", each with ncomponents "1" and
    empty color/marker fields. The file is then read: rows starting with `#`
    are skipped and every other row is appended as a
    `LayoutLine(row, delimiter=delimiter)`. Finally colors and markers are
    assigned.
    """
    super(Layout, self).__init__(filename)
    if not op.exists(filename):
        ksfiles = iglob(".", "*.ks")
        header = "Ks file|ncomponents|label|color|marker".split("|")
        contents = []
        for ksfile in ksfiles:
            # Default label is the file name without its extension; a name
            # with exactly one dot left is shown as a "*vs.*" comparison
            leg = op.basename(ksfile).rsplit(".", 1)[0]
            if leg.count(".") == 1:
                leg = leg.replace(".", " *vs.* ")
            contents.append((ksfile, "1", leg, "", ""))
        write_csv(header, contents, comment=True, filename=filename)

    # Context manager so the handle is closed once parsing is done
    with open(filename) as fp:
        for row in fp:
            if row[0] == '#':
                continue
            self.append(LayoutLine(row, delimiter=delimiter))

    self.assign_colors()
    self.assign_markers()
def summary(args):
    """
    %prog summary agpfile

    print a table of scaffold statistics, number of BACs, no of scaffolds,
    scaffold N50, scaffold L50, actual sequence, PSMOL NNNs, PSMOL-length, % of
    PSMOL sequenced.
    """
    from jcvi.utils.table import write_csv

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # `print_help()` returns None, so `not ...` yields a non-zero exit
        # status on a usage error, matching the other commands
        sys.exit(not p.print_help())

    agpfile, = args
    header = "Chromosome #_Distinct #_Components #_Scaffolds " \
             "Scaff_N50 Scaff_L50 Length".split()

    agp = AGP(agpfile)
    # Materialize the per-object summaries before handing them to the writer
    data = list(agp.summary_all())
    write_csv(header, data, sep=" ")
def stats(args):
    """
    %prog stats folder

    Generate table summarizing .stats files.
    """
    import logging

    p = OptionParser(stats.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    statsfiles = iglob(folder, "*.stats")
    after_equal = lambda x: x.split("=")[-1]
    header = "Library Assembled_reads Contigs".split()
    contents = []
    # label=M0096 total=7443 cnts=948 mean=7.851 std=35.96
    for statsfile in statsfiles:
        with open(statsfile) as fp:
            for row in fp:
                if row.startswith("label="):
                    break
            else:
                # No summary line: skip the file instead of re-using whatever
                # `row` was left over from a previous iteration
                logging.warning("No `label=` line found in `{0}`, skipped".
                                format(statsfile))
                continue

        # Only the first three key=value fields of the summary line are used
        label, total, cnts = row.split()[:3]
        label = after_equal(label)
        reads = int(after_equal(total))
        contigs = int(after_equal(cnts))
        contents.append((label, reads, contigs))

    if not contents:
        # Nothing to summarize; the SUM/AVERAGE/MEDIAN rows below need data
        logging.error("No usable `.stats` files found in `{0}`".format(folder))
        sys.exit(1)

    all_labels, all_reads, all_contigs = zip(*contents)
    contents.append(("SUM", sum(all_reads), sum(all_contigs)))
    contents.append(("AVERAGE (per sample)",
                     int(np.mean(all_reads)), int(np.mean(all_contigs))))
    contents.append(("MEDIAN (per sample)",
                     int(np.median(all_reads)), int(np.median(all_contigs))))
    write_csv(header, contents, filename=opts.outfile)
def ystr(args):
    """
    %prog ystr chrY.vcf

    Print out Y-STR info given VCF. Marker name extracted from tabfile.
    """
    from jcvi.utils.table import write_csv

    p = OptionParser(ystr.__doc__)
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    vcffile, = args
    # (chromosome, position) -> marker name
    register = STRFile(opts.lobstr_home, db="hg38-named").register

    header = "Marker|Reads|Ref|Genotype|Motif".split("|")
    contents = []
    reader = vcf.Reader(must_open(vcffile))
    # marker name -> genotype of the last record seen for that marker
    simple_register = {}
    for record in reader:
        name = register[(record.CHROM, record.POS)]
        info = record.INFO
        ref = int(float(info["REF"]))
        # Fall back to the reference value when RPA is absent
        rpa = info.get("RPA", ref)
        if isinstance(rpa, list):
            rpa = "|".join(str(int(float(x))) for x in rpa)
        ru = info["RU"]
        simple_register[name] = rpa
        contents.extend((name, sample["ALLREADS"], ref, rpa, ru)
                        for sample in record.samples)

    # Multi-part markers
    part_a, part_b, combined = "DYS389I", "DYS389B.1", "DYS389B"
    if part_a in simple_register and part_b in simple_register:
        simple_register[combined] = \
            int(simple_register[part_a]) + int(simple_register[part_b])

    # Multi-copy markers
    for marker in ("DYS385", "DYS413", "YCAII"):
        first, second = marker + 'a', marker + 'b'
        if first in simple_register and second in simple_register:
            # Keep the pair ordered so that the `a` copy is not the larger one
            if simple_register[first] > simple_register[second]:
                simple_register[first], simple_register[second] = \
                    simple_register[second], simple_register[first]
        else:
            # An incomplete pair is dropped altogether
            simple_register.pop(first, None)
            simple_register.pop(second, None)

    write_csv(header, contents, sep=" ")
    print("[YSEARCH]")
    build_ysearch_link(simple_register)
    yhrd_panels = (("[YFILER]", YHRD_YFILER),
                   ("[YFILERPLUS]", YHRD_YFILERPLUS),
                   ("[YSTR-ALL]", USYSTR_ALL))
    for title, panel in yhrd_panels:
        print(title)
        build_yhrd_link(simple_register, panel=panel)
def prepare(args):
    """
    %prog prepare "B. oleracea" *.fastq

    Scan input fastq files (see below) and create `in_groups.csv` and
    `in_libs.csv`. The species name does not really matter.
    """
    from jcvi.utils.table import write_csv
    from jcvi.formats.base import write_file
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("--corr", default=False, action="store_true",
                 help="Extra parameters for corrected data [default: %default]")
    p.add_option("--norun", default=False, action="store_true",
                 help="Don't write `run.sh` script [default: %default]")
    p.add_option("--ploidy", default="2", choices=("1", "2"),
                 help="Ploidy [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    organism_name = args[0]
    # Project name is the upper-cased initials of the organism name
    project_name = "".join(x[0] for x in organism_name.split()).upper()
    # With only the organism given, pick up fastq files from the current folder
    fnames = sorted(glob("*.fastq*") if len(args) == 1 else args[1:])
    if not fnames:
        # Nothing to scan; stop before indexing into an empty list below
        logging.error("No fastq files found.")
        sys.exit(1)
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    # All inputs must share the quality offset guessed from the first file
    offset = guessoffset([fnames[0]])
    phred64 = offset == 64
    assert all(guessoffset([x]) == offset for x in fnames[1:]), \
        "Input files do not share the same quality offset."

    groupheader = "group_name library_name file_name".split()
    libheader = "library_name project_name organism_name type paired "\
        "frag_size frag_stddev insert_size insert_stddev read_orientation "\
        "genomic_start genomic_end".split()
    groupcontents = []
    libs = []
    for file_name in fnames:
        group_name = op.basename(file_name).split(".")[0]
        library_name = "-".join(group_name.split("-")[:2])

        # Handle paired files and convert to wildcard
        if ".1." in file_name:
            file_name = file_name.replace(".1.", ".?.")
        elif ".2." in file_name:
            continue

        groupcontents.append((group_name, library_name, file_name))
        if library_name not in libs:
            libs.append(library_name)

    libcontents = []
    for library_name in libs:
        L = Library(library_name)
        size = L.size or ""
        stddev = L.stddev or ""
        libtype = L.type
        paired = L.paired
        read_orientation = L.read_orientation

        # Size/stddev fill the frag_* columns for fragment libraries and the
        # insert_* columns for everything else
        is_fragment = libtype == "fragment"
        frag_size = size if is_fragment else ""
        frag_stddev = stddev if is_fragment else ""
        insert_size = size if not is_fragment else ""
        insert_stddev = stddev if not is_fragment else ""
        genomic_start, genomic_end = "", ""
        libcontents.append((library_name, project_name, organism_name,
                            libtype, paired, frag_size, frag_stddev,
                            insert_size, insert_stddev, read_orientation,
                            genomic_start, genomic_end))

    write_csv(groupheader, groupcontents, filename="in_groups.csv", tee=True)
    logging.debug("`in_groups.csv` created (# of groups = {0}).".
                  format(len(groupcontents)))

    write_csv(libheader, libcontents, filename="in_libs.csv", tee=True)
    logging.debug("`in_libs.csv` created (# of libs = {0}).".
                  format(len(libcontents)))

    runfile = "run.sh"

    # Additional settings passed into the run script for corrected data
    extra = ""
    if opts.corr:
        extra += "FE_NUM_CYCLES=1 EC_K=28 FE_QUAL_CEIL_RADIUS=0"
        extra += " REMOVE_DODGY_READS_FRAG=False FE_MAX_KMER_FREQ_TO_MARK=1"

    if not opts.norun:
        contents = ALLPATHSRUN.format(opts.ploidy, opts.cpus, phred64, extra)
        write_file(runfile, contents)
def plot(args):
    """
    %prog plot tagged.new.bed chr1

    Plot gene identifiers along a particular chromosome, often to illustrate the
    gene id assignment procedure.
    """
    from jcvi.graphics.base import plt, savefig
    from jcvi.graphics.chromosome import ChromosomeMap

    p = OptionParser(plot.__doc__)
    p.add_option("--firstn", type="int", help="Only plot the first N genes")
    p.add_option("--ymax", type="int", help="Y-axis max value")
    p.add_option("--log", action="store_true",
                 help="Write plotting data [default: %default]")
    opts, args, iopts = p.set_image_options(args, figsize="6x4")

    if len(args) != 2:
        sys.exit(not p.print_help())

    taggedbed, seqid = args
    features = list(Bed(taggedbed).sub_bed(seqid))

    # (running index, rank) pairs, split by whether the id is newly assigned
    existing, fresh = [], []
    counter = 0
    for feature in features:
        accn = feature.extra[0]
        if "te" in accn:
            continue
        accn, tag = accn.split("|")
        if tag == "OVERLAP":
            continue

        _, rank = atg_name(accn)
        bucket = fresh if tag == "NEW" else existing
        bucket.append((counter, rank))
        counter += 1

    ngenes = counter
    assert ngenes == len(fresh) + len(existing)

    logging.debug("Imported {0} ranks on {1}.".format(ngenes, seqid))
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])

    xstart, xend = .2, .8
    ystart, yend = .2, .8
    pad = .02

    # Command-line overrides fall back to the full gene count / a fixed y-limit
    ngenes = opts.firstn or ngenes
    ymax = opts.ymax or 500000

    title = "Assignment of Medtr identifiers"
    if not opts.ymax:
        subtitle = "{0}, {1} genes ({2} new)".format(seqid, ngenes, len(fresh))
    else:
        subtitle = "{0}, first {1} genes".format(seqid, ngenes)

    chr_map = ChromosomeMap(fig, root, xstart, xend, ystart, yend, pad, 0,
                            ymax, 5, title, subtitle)

    ax = chr_map.axes

    if opts.log:
        from jcvi.utils.table import write_csv
        header = ["x", "y"]
        write_csv(header, fresh, filename=seqid + ".new")
        write_csv(header, existing, filename=seqid + ".old")

    # New ids in blue, pre-existing ids in red
    xs, ys = zip(*fresh)
    ax.plot(xs, ys, "b,")

    xs, ys = zip(*existing)
    ax.plot(xs, ys, "r,")

    # Legends
    ymid = (ystart + yend) / 2
    legend_y = ymid + pad
    root.plot([.2], [legend_y], "r.", lw=2)
    root.text(.2 + pad, legend_y, "Existing Medtr ids", va="center", size=10)
    legend_y = ymid - pad
    root.plot([.2], [legend_y], "b.", lw=2)
    root.text(.2 + pad, legend_y, "Newly instantiated ids", va="center",
              size=10)

    ax.set_xlim(0, ngenes)
    ax.set_ylim(0, ymax)
    ax.set_axis_off()

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    image_name = seqid + ".identifiers." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def ystr(args):
    """
    %prog ystr chrY.vcf

    Print out Y-STR info given VCF. Marker name extracted from tabfile.
    """
    from jcvi.utils.table import write_csv

    p = OptionParser(ystr.__doc__)
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    vcffile, = args
    si = STRFile(opts.lobstr_home, db="hg38-named")
    # Lookup from (chromosome, position) to marker name
    register = si.register

    header = "Marker|Reads|Ref|Genotype|Motif".split("|")
    contents = []
    fp = must_open(vcffile)
    reader = vcf.Reader(fp)
    # Marker name -> genotype of the last record seen for that marker
    simple_register = {}
    for record in reader:
        name = register[(record.CHROM, record.POS)]
        info = record.INFO
        ref = int(float(info["REF"]))
        # Without an RPA entry the reference value stands in for the genotype
        rpa = info.get("RPA", ref)
        if isinstance(rpa, list):
            rpa = "|".join(str(int(float(x))) for x in rpa)
        ru = info["RU"]
        simple_register[name] = rpa
        for sample in record.samples:
            contents.append((name, sample["ALLREADS"], ref, rpa, ru))

    # Multi-part markers
    a, b, c = "DYS389I", "DYS389B.1", "DYS389B"
    if a in simple_register and b in simple_register:
        simple_register[c] = int(simple_register[a]) + int(simple_register[b])

    # Multi-copy markers
    mm = ["DYS385", "DYS413", "YCAII"]
    for m in mm:
        ma, mb = m + 'a', m + 'b'
        if ma not in simple_register or mb not in simple_register:
            # Incomplete pair: make sure neither copy is reported
            simple_register.pop(ma, None)
            simple_register.pop(mb, None)
            continue
        # Keep the pair ordered so that the `a` copy is not the larger one
        if simple_register[ma] > simple_register[mb]:
            simple_register[ma], simple_register[mb] = \
                simple_register[mb], simple_register[ma]

    write_csv(header, contents, sep=" ")
    # Single-argument print calls give the same output on Python 2 and 3
    print("[YSEARCH]")
    build_ysearch_link(simple_register)
    print("[YFILER]")
    build_yhrd_link(simple_register, panel=YHRD_YFILER)
    print("[YFILERPLUS]")
    build_yhrd_link(simple_register, panel=YHRD_YFILERPLUS)
    print("[YSTR-ALL]")
    build_yhrd_link(simple_register, panel=USYSTR_ALL)
def prepare(args):
    """
    %prog prepare "B. oleracea" *.fastq

    Scan input fastq files (see below) and create `in_groups.csv` and
    `in_libs.csv`. The species name does not really matter.
    """
    from jcvi.utils.table import write_csv
    from jcvi.formats.base import check_exists

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("--norun", default=False, action="store_true",
                 help="Don't write `run.sh` script [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    organism_name = args[0]
    # Project name is the upper-cased initials of the organism name
    project_name = "".join(x[0] for x in organism_name.split()).upper()
    # With only the organism given, pick up fastq files from the current folder
    fnames = sorted(glob("*.fastq*") if len(args) == 1 else args[1:])
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    groupheader = "group_name library_name file_name".split()
    libheader = "library_name project_name organism_name type paired "\
        "frag_size frag_stddev insert_size insert_stddev read_orientation "\
        "genomic_start genomic_end".split()
    groupcontents = []
    libs = []
    for file_name in fnames:
        group_name = op.basename(file_name).split(".")[0]
        library_name = "-".join(group_name.split("-")[:2])

        # Handle paired files and convert to wildcard
        if ".1." in file_name:
            file_name = file_name.replace(".1.", ".?.")
        elif ".2." in file_name:
            continue

        groupcontents.append((group_name, library_name, file_name))
        if library_name not in libs:
            libs.append(library_name)

    libcontents = []
    for library_name in libs:
        L = Library(library_name)
        size = L.size or ""
        stddev = L.stddev or ""
        libtype = L.type
        paired = L.paired
        read_orientation = L.read_orientation

        # Size/stddev fill the frag_* columns for fragment libraries and the
        # insert_* columns for everything else
        is_fragment = libtype == "fragment"
        frag_size = size if is_fragment else ""
        frag_stddev = stddev if is_fragment else ""
        insert_size = size if not is_fragment else ""
        insert_stddev = stddev if not is_fragment else ""
        genomic_start, genomic_end = "", ""
        libcontents.append((library_name, project_name, organism_name,
                            libtype, paired, frag_size, frag_stddev,
                            insert_size, insert_stddev, read_orientation,
                            genomic_start, genomic_end))

    write_csv(groupheader, groupcontents, filename="in_groups.csv", tee=True)
    logging.debug("`in_groups.csv` created (# of groups = {0}).".
                  format(len(groupcontents)))

    write_csv(libheader, libcontents, filename="in_libs.csv", tee=True)
    logging.debug("`in_libs.csv` created (# of libs = {0}).".
                  format(len(libcontents)))

    runfile = "run.sh"
    if not opts.norun and check_exists(runfile):
        # Template followed by a newline, as the former `print >> fw` wrote it
        # (assumes ALLPATHSRUN is a plain string); the context manager makes
        # sure the script is flushed and closed
        with open(runfile, "w") as fw:
            fw.write(ALLPATHSRUN)
            fw.write("\n")
        logging.debug("Run script written to `{0}`.".format(runfile))