def genestatus(args):
    """
    %prog genestatus diploid.gff3.exon.ids

    Tag genes based on translation from GMAP models, using fasta.translate()
    --ids.
    """
    p = OptionParser(genestatus.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    idsfile, = args
    data = get_tags(idsfile)
    # Gene ID is the prefix before the first "." of the transcript ID.
    key = lambda x: x[0].split(".")[0]
    for gene, cc in groupby(data, key=key):
        cc = list(cc)
        tags = [x[-1] for x in cc]
        # A gene is as complete as its best-translated transcript.
        if "complete" in tags:
            tag = "complete"
        elif "partial" in tags:
            tag = "partial"
        else:
            tag = "pseudogene"
        # Fixed: py2 print statement -> py3 print() (consistent with the
        # print() calls used elsewhere in this file).
        print("\t".join((gene, tag)))
def links(args):
    """
    %prog links url

    Extract all the links "<a href=''>" from web page.
    """
    p = OptionParser(links.__doc__)
    p.add_option("--img", default=False, action="store_true",
                 help="Extract <img> tags [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    url, = args
    # Choose the tag/attribute pair depending on whether we scrape images.
    if opts.img:
        tag, attribute = 'img', 'src'
    else:
        tag, attribute = 'a', 'href'

    htmlfile = download(url)
    contents = open(htmlfile).read()
    soup = BeautifulSoup(contents)
    for node in soup.findAll(tag):
        target = node.get(attribute)
        # Resolve relative links against the page URL.
        print(urljoin(url, target))
def traits(args):
    """
    %prog traits directory

    Make HTML page that reports eye and skin color.
    """
    p = OptionParser(traits.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    samples = []
    for folder in args:
        targets = iglob(folder, "*-traits.json")
        if not targets:
            continue
        filename = targets[0]
        js = json.load(open(filename))
        # Convert L*a*b* color coordinates into displayable RGB strings.
        js["skin_rgb"] = make_rgb(
            js["traits"]["skin-color"]["L"],
            js["traits"]["skin-color"]["A"],
            js["traits"]["skin-color"]["B"])
        js["eye_rgb"] = make_rgb(
            js["traits"]["eye-color"]["L"],
            js["traits"]["eye-color"]["A"],
            js["traits"]["eye-color"]["B"])
        samples.append(js)

    template = Template(traits_template)
    fw = open("report.html", "w")
    # Fixed: py2 `print >> fw` -> py3 print(..., file=fw).
    print(template.render(samples=samples), file=fw)
    logging.debug("Report written to `{}`".format(fw.name))
    fw.close()
def diff(args):
    """
    %prog diff simplefile

    Calculate difference of pairwise syntenic regions.
    """
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(diff.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    simplefile, = args
    fp = open(simplefile)
    data = [x.split() for x in fp]
    spans = []
    # Skip the header row; each block contributes an (A-span, B-span) pair.
    for block_id, ab in groupby(data[1:], key=lambda x: x[0]):
        a, b = list(ab)
        aspan, bspan = a[4], b[4]
        aspan, bspan = int(aspan), int(bspan)
        spans.append((aspan, bspan))
    aspans, bspans = zip(*spans)
    # Fixed: removed stray trailing comma in the tuple unpack.
    dspans = [b - a for a, b in spans]
    s = SummaryStats(dspans)
    # Fixed: py2 `print >> sys.stderr` -> py3 print(..., file=sys.stderr).
    print("For a total of {0} blocks:".format(len(dspans)), file=sys.stderr)
    print("Sum of A: {0}".format(sum(aspans)), file=sys.stderr)
    print("Sum of B: {0}".format(sum(bspans)), file=sys.stderr)
    print("Sum of Delta: {0} ({1})".format(sum(dspans), s), file=sys.stderr)
def compile(args):
    """
    %prog compile directory

    Extract telomere length and ccn.
    """
    p = OptionParser(compile.__doc__)
    p.set_outfile(outfile="age.tsv")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    dfs = []
    for folder in args:
        ofolder = os.listdir(folder)

        # telomeres
        subdir = [x for x in ofolder if x.startswith("telomeres")][0]
        subdir = op.join(folder, subdir)
        filename = op.join(subdir, "tel_lengths.txt")
        df = pd.read_csv(filename, sep="\t")
        # Fixed: DataFrame.ix was removed from pandas; .iloc[0] is the
        # positional equivalent for the first row.
        d1 = df.iloc[0].to_dict()

        # ccn
        subdir = [x for x in ofolder if x.startswith("ccn")][0]
        subdir = op.join(folder, subdir)
        filename = iglob(subdir, "*.ccn.json")[0]
        js = json.load(open(filename))
        d1.update(js)
        df = pd.DataFrame(d1, index=[0])
        dfs.append(df)

    df = pd.concat(dfs, ignore_index=True)
    df.to_csv(opts.outfile, sep="\t", index=False)
def flip(args):
    """
    %prog flip fastafile

    Go through each FASTA record, check against Genbank file and determines
    whether or not to flip the sequence. This is useful before updates of the
    sequences to make sure the same orientation is used.
    """
    p = OptionParser(flip.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    outfastafile = fastafile.rsplit(".", 1)[0] + ".flipped.fasta"
    fo = open(outfastafile, "w")
    f = Fasta(fastafile, lazy=True)
    for name, rec in f.iteritems_ordered():
        # Write the single record to a scratch file for overlap().
        tmpfasta = "a.fasta"
        fw = open(tmpfasta, "w")
        SeqIO.write([rec], fw, "fasta")
        fw.close()

        o = overlap([tmpfasta, name])
        if o.orientation == '-':
            rec.seq = rec.seq.reverse_complement()

        SeqIO.write([rec], fo, "fasta")
        os.remove(tmpfasta)
    # Fixed: close the output handle so the final records are flushed to
    # disk (missing in the original).
    fo.close()
def batchoverlap(args):
    """
    %prog batchoverlap pairs.txt outdir

    Check overlaps between pairs of sequences.
    """
    p = OptionParser(batchoverlap.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    pairsfile, outdir = args
    fp = open(pairsfile)
    cmds = []
    mkdir("overlaps")
    for row in fp:
        a, b = row.split()[:2]
        oa = op.join(outdir, a + ".fa")
        ob = op.join(outdir, b + ".fa")
        cmd = "python -m jcvi.assembly.goldenpath overlap {0} {1}".format(oa, ob)
        cmd += " -o overlaps/{0}_{1}.ov".format(a, b)
        cmds.append(cmd)
    # Fixed: py2 print statement -> py3 print().
    print("\n".join(cmds))
def summary(args):
    """
    %prog summary *.gff

    Print gene statistics table.
    """
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    gff_files = args
    for metric in metrics:
        logging.debug("Parsing files in `{0}`..".format(metric))

        table = {}
        for x in gff_files:
            pf = op.basename(x).split(".")[0]
            numberfile = op.join(metric, pf + ".txt")
            ar = [int(line.strip()) for line in open(numberfile)]
            # Renamed from `sum`, which shadowed the builtin.
            stats = SummaryStats(ar).todict().items()
            keys, vals = zip(*stats)
            # Key each stat by (file prefix, stat name) for the table.
            keys = [(pf, k) for k in keys]
            table.update(dict(zip(keys, vals)))

        # Fixed: py2 `print >> sys.stderr` -> py3 print(..., file=...).
        print(tabulate(table), file=sys.stderr)
def histogram(args):
    """
    %prog histogram *.gff

    Plot gene statistics based on output of stats. For each gff file, look to
    see if the metrics folder (i.e. Exon_Length) contains the data and plot
    them.
    """
    from jcvi.graphics.histogram import histogram_multiple

    p = OptionParser(histogram.__doc__)
    p.add_option("--bins", dest="bins", default=40, type="int",
            help="number of bins to plot in the histogram [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    gff_files = args
    # One (color, y-max, x-label) configuration per metric in `metrics`.
    plot_configs = zip(metrics,
                       ("red", "green", "blue", "black"),
                       (1000, 1000, 4000, 20),
                       ("bp", "bp", "bp", "number"))
    for metric, color, vmax, xlabel in plot_configs:
        logging.debug("Parsing files in `{0}`..".format(metric))
        prefixes = (op.basename(g).split(".")[0] for g in gff_files)
        numberfiles = [op.join(metric, pf + ".txt") for pf in prefixes]

        histogram_multiple(numberfiles, 0, vmax, xlabel, metric,
                       bins=opts.bins, facet=True, fill=color,
                       prefix=metric + ".")
def unitigs(args):
    """
    %prog unitigs best.edges

    Reads Celera Assembler's "best.edges" and extract all unitigs.
    """
    p = OptionParser(unitigs.__doc__)
    p.add_option("--maxerr", default=2, type="int", help="Maximum error rate")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bestedges, = args
    G = read_graph(bestedges, maxerr=opts.maxerr, directed=True)
    H = nx.Graph()
    intconv = lambda x: int(x.split("-")[0])
    # Fixed: dict.iteritems() is py2-only; items() works on py2 and py3.
    for k, v in G.items():
        # Only keep mutual best edges (k's best is v and v's best is k).
        if k == G.get(v, None):
            H.add_edge(intconv(k), intconv(v))

    nunitigs = nreads = 0
    # Fixed: nx.connected_component_subgraphs() was removed in networkx 2.4;
    # iterate components and take subgraph views instead.
    for component in nx.connected_components(H):
        h = H.subgraph(component)
        # A simple path component has exactly two degree-1 endpoints.
        st = [x for x in h if h.degree(x) == 1]
        if len(st) != 2:
            continue
        src, target = st
        path = list(nx.all_simple_paths(h, src, target))
        assert len(path) == 1
        path, = path
        # Fixed: py2 print statement -> py3 print().
        print("|".join(str(x) for x in path))
        nunitigs += 1
        nreads += len(path)
    logging.debug("A total of {0} unitigs built from {1} reads.".
                  format(nunitigs, nreads))
def tracedb(args):
    """
    %prog tracedb <xml|lib|frg>

    Run `tracedb-to-frg.pl` within current folder.
    """
    p = OptionParser(tracedb.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(p.print_help())

    action, = args
    assert action in ("xml", "lib", "frg")

    script = "tracedb-to-frg.pl"
    xmlfiles = glob("xml*")

    if action == "xml":
        # One background conversion per xml file, output silenced.
        for xmlfile in xmlfiles:
            sh("{0} -xml {1}".format(script, xmlfile),
               outfile="/dev/null", errfile="/dev/null", background=True)
    elif action == "lib":
        # Single foreground run over all xml files at once.
        sh("{0} -lib {1}".format(script, " ".join(xmlfiles)))
    elif action == "frg":
        for xmlfile in xmlfiles:
            sh("{0} -frg {1}".format(script, xmlfile), background=True)
def ids(args):
    """
    %prog ids cdhit.clstr

    Get the representative ids from clstr file.
    """
    p = OptionParser(ids.__doc__)
    p.add_option("--prefix", type="int",
                 help="Find rep id for prefix of len [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    clstrfile, = args
    clusters = ClstrFile(clstrfile)
    # Use prefix-based representatives when --prefix is given.
    rep_iter = (clusters.iter_reps_prefix(prefix=opts.prefix)
                if opts.prefix else clusters.iter_reps())
    reads = list(rep_iter)

    idsfile = clstrfile.replace(".clstr", ".ids")
    fw = open(idsfile, "w")
    for index, name in reads:
        print("{0}\t{1}".format(index, name), file=fw)

    logging.debug("A total of {0} unique reads written to `{1}`.".
                  format(len(reads), idsfile))
    fw.close()

    return idsfile
def csv(args):
    """
    %prog csv excelfile

    Convert EXCEL to csv file.
    """
    from xlrd import open_workbook

    p = OptionParser(csv.__doc__)
    p.set_sep(sep=',')
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    excelfile, = args
    sep = opts.sep
    csvfile = excelfile.rsplit(".", 1)[0] + ".csv"
    wb = open_workbook(excelfile)
    fw = open(csvfile, "w")
    for s in wb.sheets():
        # Fixed: py2 `print >> sys.stderr, 'Sheet:', s.name`.
        print('Sheet:', s.name, file=sys.stderr)
        for row in range(s.nrows):
            values = [s.cell(row, col).value for col in range(s.ncols)]
            # NOTE(review): naive join — cells containing the separator are
            # not quoted/escaped; confirm inputs are separator-free.
            print(sep.join(str(x) for x in values), file=fw)
    # Fixed: close the output file (was left open in the original).
    fw.close()
def main():
    """
    %prog numbers1.txt number2.txt ...

    Print histogram of the data files. The data files contain one number per
    line. If more than one file is inputted, the program will combine the
    histograms into the same plot.
    """
    # Output formats supported by the matplotlib backends.
    allowed_format = ("emf", "eps", "pdf", "png", "ps", \
                      "raw", "rgba", "svg", "svgz")
    p = OptionParser(main.__doc__)
    p.add_option("--skip", default=0, type="int",
                 help="skip the first several lines [default: %default]")
    p.set_histogram()
    p.add_option("--tags", dest="tags", default=None,
                 help="tags for data if multiple input files, comma sep")
    p.add_option("--ascii", default=False, action="store_true",
                 help="print ASCII text stem-leaf plot [default: %default]")
    p.add_option("--base", default="0", choices=("0", "2", "10"),
                 help="use logarithm axis with base, 0 to disable [default: %default]")
    p.add_option("--facet", default=False, action="store_true",
                 help="place multiple histograms side-by-side [default: %default]")
    p.add_option("--fill", default="white",
                 help="color of the bin [default: %default]")
    p.add_option("--format", default="pdf", choices=allowed_format,
                 help="Generate image of format [default: %default]")
    p.add_option("--quick", default=False, action="store_true",
                 help="Use quick plot, assuming bins are already counted")
    p.add_option("--noprintstats", default=False, action="store_true",
                 help="Write basic stats when using --quick")
    opts, args = p.parse_args()

    if len(args) < 1:
        sys.exit(not p.print_help())

    skip = opts.skip
    vmin, vmax = opts.vmin, opts.vmax
    bins = opts.bins
    xlabel, title = opts.xlabel, opts.title
    # Default the plot title to the first filename.
    title = title or args[0]
    base = int(opts.base)
    fileno = len(args)

    # --quick: input is already a "value<TAB>count" table; plot directly.
    # NOTE(review): figname is hard-coded to .pdf here, ignoring --format.
    if opts.quick:
        assert fileno == 1, "Single input file expected using --quick"
        filename = args[0]
        figname = filename.rsplit(".", 1)[0] + ".pdf"
        data = DictFile(filename, keycast=int, cast=int)
        quickplot(data, vmin, vmax, xlabel, title, figname=figname,
                  print_stats=(not opts.noprintstats))
        return

    # One file -> single histogram; several files -> combined plot.
    if fileno == 1:
        histogram(args[0], vmin, vmax, xlabel, title,
                  outfmt=opts.format, bins=bins, skip=skip, ascii=opts.ascii,
                  base=base, fill=opts.fill)
    else:
        histogram_multiple(args, vmin, vmax, xlabel, title,
                  outfmt=opts.format, tags=opts.tags, bins=bins, skip=skip,
                  ascii=opts.ascii, facet=opts.facet, fill=opts.fill)
def blast(args):
    """
    %prog blast ref.fasta query.fasta

    Calls blast and then filter the BLAST hits. Default is megablast.
    """
    task_choices = ("blastn", "blastn-short", "dc-megablast", \
                    "megablast", "vecscreen")
    p = OptionParser(blast.__doc__)
    p.set_align(pctid=None, evalue=.01)
    p.add_option("--wordsize", type="int", help="Word size [default: %default]")
    p.add_option("--best", default=1, type="int",
            help="Only look for best N hits [default: %default]")
    p.add_option("--task", default="megablast", choices=task_choices,
            help="Task of the blastn [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    reffasta, queryfasta = args
    # Output name is "<query prefix>.<ref prefix>.blast".
    query_pf = op.basename(queryfasta).split(".")[0]
    ref_pf = op.basename(reffasta).split(".")[0]
    blastfile = "{0}.{1}.blast".format(query_pf, ref_pf)

    run_megablast(infile=queryfasta, outfile=blastfile, db=reffasta,
                  wordsize=opts.wordsize, pctid=opts.pctid,
                  evalue=opts.evalue, hitlen=None, best=opts.best,
                  task=opts.task, cpus=opts.cpus)

    return blastfile
def passthrough(args):
    """
    %prog passthrough chrY.vcf chrY.new.vcf

    Pass through Y and MT vcf.
    """
    p = OptionParser(passthrough.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    vcffile, newvcffile = args
    fp = open(vcffile)
    fw = open(newvcffile, "w")
    genotypes = ["0/0", "0/1", "1/1"]
    for row in fp:
        if row[0] == "#":
            # Header lines pass through untouched.
            print(row.strip(), file=fw)
            continue
        v = VcfLine(row)
        v.filter = "PASS"
        v.format = "GT:GP"
        # One-hot genotype probabilities, ordered 0/0, 0/1, 1/1.
        probs = [0.0] * 3
        probs[genotypes.index(v.genotype)] = 1.0
        gp = ",".join("{0:.3f}".format(x) for x in probs)
        v.genotype = "{0}:{1}".format(v.genotype.replace("/", "|"), gp)
        print(v, file=fw)
    fw.close()
def agp(args):
    """
    %prog agp <fastafile|sizesfile>

    Convert the sizes file to a trivial AGP file.
    """
    from jcvi.formats.agp import OO

    p = OptionParser(agp.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    sizesfile, = args
    sizes = Sizes(sizesfile)
    agpfile = sizes.filename.rsplit(".", 1)[0] + ".agp"
    with open(agpfile, "w") as fw:
        # Trivial layout: every contig becomes its own AGP object.
        layout = OO()
        for ctg, size in sizes.iter_sizes():
            layout.add(ctg, ctg, size)
        layout.write_AGP(fw)
    logging.debug("AGP file written to `{0}`.".format(agpfile))

    return agpfile
def nucmer(args):
    """
    %prog nucmer mappings.bed MTR.fasta assembly.fasta chr1 3

    Select specific chromosome region based on MTR mapping. The above command
    will extract chr1:2,000,001-3,000,000.
    """
    p = OptionParser(nucmer.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 5:
        sys.exit(not p.print_help())

    mapbed, mtrfasta, asmfasta, seqid, idx = args
    idx = int(idx)
    mb = 1000000
    # Build a one-interval BED for the requested 1 Mb window.
    bedfile = "sample.bed"
    bed = Bed()
    interval = (seqid, (idx - 1) * mb, idx * mb)
    bed.add("\t".join(str(x) for x in interval))
    bed.print_to_file(bedfile)

    # Collect the IDs of assembly contigs mapped into the window.
    idsfile = "query.ids"
    cmd = "intersectBed -a {0} -b {1} -nonamecheck -sorted | cut -f4".format(
        mapbed, bedfile)
    sh(cmd, outfile=idsfile)

    sfasta = fastaFromBed(bedfile, mtrfasta)
    qfasta = "query.fasta"
    sh("faSomeRecords {0} {1} {2}".format(asmfasta, idsfile, qfasta))

    sh("nucmer {0} {1}".format(sfasta, qfasta))

    mummerplot_main(["out.delta", "--refcov=0"])
    sh("mv out.pdf {0}.{1}.pdf".format(seqid, idx))
def beagle(args):
    """
    %prog beagle input.vcf 1

    Use BEAGLE4.1 to impute vcf on chromosome 1.
    """
    p = OptionParser(beagle.__doc__)
    p.set_home("beagle")
    p.set_ref()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    vcffile, chr = args
    pf = vcffile.rsplit(".", 1)[0]
    outpf = pf + ".beagle"
    outfile = outpf + ".vcf.gz"

    mm = MakeManager()
    beagle_cmd = opts.beagle_home
    kg = op.join(opts.ref, "1000GP_Phase3")
    cmd = beagle_cmd + " gt={0}".format(vcffile)
    cmd += " ref={0}/chr{1}.1kg.phase3.v5a.bref".format(kg, chr)
    cmd += " map={0}/plink.chr{1}.GRCh37.map".format(kg, chr)
    cmd += " out={0}".format(outpf)
    # Fixed: honor --cpus (p.set_cpus() above) instead of the hard-coded
    # nthreads=16 that ignored the option.
    cmd += " nthreads={0} gprobs=true".format(opts.cpus)
    mm.add(vcffile, outfile, cmd)

    mm.write()
def snp(args):
    """
    %prog snp input.gsnap

    Run SNP calling on GSNAP output after apps.gsnap.align().
    """
    p = OptionParser(snp.__doc__)
    p.set_home("eddyyeh")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gsnapfile, = args
    eyhome = opts.eddyyeh_home
    pf = gsnapfile.rsplit(".", 1)[0]
    nativefile = pf + ".native"
    snpfile = pf + ".snp"

    # Step 1: convert GSNAP output to the native format.
    if need_update(gsnapfile, nativefile):
        convert = op.join(eyhome, "convert2native.pl")
        sh("{0} --gsnap {1} -o {2} -proc {3}".format(
            convert, gsnapfile, nativefile, opts.cpus))

    # Step 2: run SNP discovery on the native file.
    if need_update(nativefile, snpfile):
        discover = op.join(eyhome, "SNPs/SNP_Discovery-short.pl")
        sh("{0} --native {1} -o {2} -a 2 -ac 0.3 -c 0.8".format(
            discover, nativefile, snpfile))
def group(args):
    """
    %prog group anchorfiles

    Group the anchors into ortho-groups. Can input multiple anchor files.
    """
    p = OptionParser(group.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    anchorfiles = args
    groups = Grouper()
    for anchorfile in anchorfiles:
        ac = AnchorFile(anchorfile)
        for a, b, idx in ac.iter_pairs():
            # Union-find join: anchored pairs end up in the same group.
            groups.join(a, b)

    logging.debug("Created {0} groups with {1} members.".
                  format(len(groups), groups.num_members))

    outfile = opts.outfile
    fw = must_open(outfile, "w")
    for g in groups:
        # Fixed: py2 `print >> fw` -> py3 print(..., file=fw).
        print(",".join(sorted(g)), file=fw)
    fw.close()

    return outfile
def filter(args):
    """
    %prog filter consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=10, type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    records = Fasta(fastafile, lazy=True)
    fw = must_open(opts.outfile, "w")
    for description, rec in records.iterdescriptions_ordered():
        if description.startswith("singleton"):
            continue
        # Description looks like: "consensus_for_cluster_0 with 63 sequences"
        name, keyword, size, seqs = description.split()
        assert keyword == "with"
        if int(size) < opts.minsize:
            continue
        SeqIO.write(rec, fw, "fasta")
def fromimpute2(args):
    """
    %prog fromimpute2 impute2file fastafile 1

    Convert impute2 output to vcf file. Imputed file looks like:

    --- 1:10177:A:AC 10177 A AC 0.451 0.547 0.002
    """
    p = OptionParser(fromimpute2.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    # Renamed from `chr`, which shadowed the builtin.
    impute2file, fastafile, seqid = args
    fasta = Fasta(fastafile)
    # Fixed: py2 print statements -> py3 print() throughout.
    print(get_vcfstanza(fastafile, fasta))

    fp = open(impute2file)
    seen = set()
    for row in fp:
        snp_id, rsid, pos, ref, alt, aa, ab, bb = row.split()
        pos = int(pos)
        # First record at a position wins; later duplicates are dropped.
        if pos in seen:
            continue
        seen.add(pos)
        # Hard-call the genotype with the highest posterior probability.
        code = max((float(aa), "0/0"),
                   (float(ab), "0/1"),
                   (float(bb), "1/1"))[-1]
        tag = "PR" if snp_id == seqid else "IM"
        print("\t".join(str(x) for x in
              (seqid, pos, rsid, ref, alt, ".", ".", tag,
               "GT:GP", code + ":" + ",".join((aa, ab, bb)))))
def uniq(args):
    """
    %prog uniq vcffile

    Retain only the first entry in vcf file.
    """
    # Fixed: `urlparse` is py2-only; parse_qs lives in urllib.parse on py3.
    from urllib.parse import parse_qs

    p = OptionParser(uniq.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    vcffile, = args
    fp = must_open(vcffile)
    data = []
    for row in fp:
        if row[0] == '#':
            # Fixed: py2 print statements -> py3 print() throughout.
            print(row.strip())
            continue
        v = VcfLine(row)
        data.append(v)

    for pos, vv in groupby(data, lambda x: x.pos):
        vv = list(vv)
        if len(vv) == 1:
            print(vv[0])
            continue
        # Among duplicates at a position, keep the highest imputation R2.
        bestv = max(vv, key=lambda x: float(parse_qs(x.info)["R2"][0]))
        print(bestv)
def sample(args):
    """
    %prog sample vcffile 0.9

    Sample subset of vcf file.
    """
    from random import random

    p = OptionParser(sample.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    vcffile, ratio = args
    ratio = float(ratio)
    fp = open(vcffile)
    pf = vcffile.rsplit(".", 1)[0]
    kept = pf + ".kept.vcf"
    withheld = pf + ".withheld.vcf"
    fwk = open(kept, "w")
    fww = open(withheld, "w")
    nkept = nwithheld = 0
    for row in fp:
        if row[0] == '#':
            # Header lines always go to the kept file.
            # Fixed: py2 `print >> fwk` -> py3 print(..., file=fwk).
            print(row.strip(), file=fwk)
            continue
        if random() < ratio:
            nkept += 1
            print(row.strip(), file=fwk)
        else:
            nwithheld += 1
            print(row.strip(), file=fww)
    # Fixed: close both outputs so records are flushed before reporting.
    fwk.close()
    fww.close()

    logging.debug("{0} records kept to `{1}`".format(nkept, kept))
    logging.debug("{0} records withheld to `{1}`".format(nwithheld, withheld))
def blat(args):
    """
    %prog blat old.fasta new.fasta

    Generate psl file using blat.
    """
    p = OptionParser(blat.__doc__)
    p.add_option("--minscore", default=100, type="int",
                 help="Matches minus mismatches gap penalty [default: %default]")
    p.add_option("--minid", default=98, type="int",
                 help="Minimum sequence identity [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    oldfasta, newfasta = args
    # Convert both inputs to .2bit; blat runs against the old genome.
    oldtwobit, newtwobit = [faToTwoBit(x) for x in args]

    # Prefer the parallel pblat when available.
    engine = "pblat -threads={0}".format(opts.cpus) if which("pblat") else "blat"
    pslfile = "{0}.{1}.psl".format(*(op.basename(x).split('.')[0] \
                for x in (newfasta, oldfasta)))

    cmd = " ".join((engine, oldtwobit, newfasta))
    cmd += " -tileSize=12 -minScore={0} -minIdentity={1} ".\
                format(opts.minscore, opts.minid)
    cmd += pslfile
    sh(cmd)
def summary(args):
    """
    %prog summary old.new.chain old.fasta new.fasta

    Provide stats of the chain file.
    """
    from jcvi.formats.fasta import summary as fsummary
    from jcvi.utils.cbook import percentage, human_size

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    chainfile, oldfasta, newfasta = args
    chain = Chain(chainfile)
    ungapped, dt, dq = chain.ungapped, chain.dt, chain.dq
    # Fixed: py2 `print >> sys.stderr` -> py3 print(..., file=sys.stderr).
    print("File `{0}` contains {1} chains.".
          format(chainfile, len(chain)), file=sys.stderr)
    print("ungapped={0} dt={1} dq={2}".
          format(human_size(ungapped), human_size(dt), human_size(dq)),
          file=sys.stderr)

    oldreal, oldnn, oldlen = fsummary([oldfasta, "--outfile=/dev/null"])
    print("Old fasta (`{0}`) mapped: {1}".
          format(oldfasta, percentage(ungapped, oldreal)), file=sys.stderr)

    newreal, newnn, newlen = fsummary([newfasta, "--outfile=/dev/null"])
    print("New fasta (`{0}`) mapped: {1}".
          format(newfasta, percentage(ungapped, newreal)), file=sys.stderr)
def uclust(args):
    """
    %prog uclust fastafile

    Use `usearch` to remove duplicate reads.
    """
    p = OptionParser(uclust.__doc__)
    p.set_align(pctid=98)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    # usearch expects identity as a fraction, not a percentage.
    identity = opts.pctid / 100.

    pf, sf = fastafile.rsplit(".", 1)
    sortedfastafile = pf + ".sorted.fasta"
    if need_update(fastafile, sortedfastafile):
        sh("usearch -sortbylength {0} -fastaout {1}".
           format(fastafile, sortedfastafile))

    pf = fastafile + ".P{0}.uclust".format(opts.pctid)
    clstrfile = pf + ".clstr"
    centroidsfastafile = pf + ".centroids.fasta"
    if need_update(sortedfastafile, centroidsfastafile):
        cluster_cmd = "usearch -cluster_smallmem {0}".format(sortedfastafile)
        cluster_cmd += " -id {0}".format(identity)
        cluster_cmd += " -uc {0} -centroids {1}".format(clstrfile,
                                                        centroidsfastafile)
        sh(cluster_cmd)
def fromagp(args):
    """
    %prog fromagp agpfile componentfasta objectfasta

    Generate chain file from AGP format. The components represent the old
    genome (target) and the objects represent new genome (query).
    """
    from jcvi.formats.agp import AGP
    from jcvi.formats.sizes import Sizes

    p = OptionParser(fromagp.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    agpfile, componentfasta, objectfasta = args
    chainfile = agpfile.rsplit(".", 1)[0] + ".chain"
    fw = open(chainfile, "w")
    agp = AGP(agpfile)
    componentsizes = Sizes(componentfasta).mapping
    objectsizes = Sizes(objectfasta).mapping
    chain = "chain"
    score = 1000
    tStrand = "+"
    # Renamed from `id`, which shadowed the builtin.
    chain_id = 0
    for a in agp:
        if a.is_gap:
            continue

        tName = a.component_id
        tSize = componentsizes[tName]
        tStart = a.component_beg
        tEnd = a.component_end
        # Chain coordinates are 0-based half-open.
        tStart -= 1

        qName = a.object
        qSize = objectsizes[qName]
        qStrand = "-" if a.orientation == "-" else "+"
        qStart = a.object_beg
        qEnd = a.object_end
        if qStrand == '-':
            # Chain format stores minus-strand query coords from the
            # reverse-complemented sequence start.
            _qStart = qSize - qEnd + 1
            _qEnd = qSize - qStart + 1
            qStart, qEnd = _qStart, _qEnd
        qStart -= 1

        chain_id += 1
        size = a.object_span
        headerline = "\t".join(str(x) for x in (
             chain, score, tName, tSize, tStrand, tStart,
             tEnd, qName, qSize, qStrand, qStart, qEnd, chain_id
        ))
        alignmentline = size
        # Fixed: py2 `print >> fw` -> py3 print(..., file=fw); blank line
        # terminates each chain record.
        print(headerline, file=fw)
        print(alignmentline, file=fw)
        print(file=fw)

    fw.close()
    logging.debug("File written to `{0}`.".format(chainfile))
def bam(args):
    """
    %prog snp input.gsnap ref.fasta

    Convert GSNAP output to BAM.
    """
    from jcvi.formats.sizes import Sizes
    from jcvi.formats.sam import index

    p = OptionParser(bam.__doc__)
    p.set_home("eddyyeh")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gsnapfile, fastafile = args
    eyhome = opts.eddyyeh_home
    pf = gsnapfile.rsplit(".", 1)[0]
    uniqsam = pf + ".unique.sam"
    if need_update((gsnapfile, fastafile), uniqsam):
        # Keep only uniquely-mapped reads (-u) in SAM format.
        sizesfile = Sizes(fastafile).filename
        converter = op.join(eyhome, "gsnap2gff3.pl")
        converter += " --format sam -i {0} -o {1}".format(gsnapfile, uniqsam)
        converter += " -u -l {0} -p {1}".format(sizesfile, opts.cpus)
        sh(converter)

    index([uniqsam])
def trf(args):
    """
    %prog trf outdir

    Run TRF on FASTA files.
    """
    from jcvi.apps.base import iglob

    # TRF parameter presets for centromere search (match, mismatch, delta,
    # PM, PI, minscore, maxperiod) — see --centromeres.
    cparams = "1 1 2 80 5 200 2000"

    p = OptionParser(trf.__doc__)
    p.add_option("--mismatch", default=31, type="int",
                 help="Mismatch and gap penalty")
    p.add_option("--minscore", default=MINSCORE, type="int",
                 help="Minimum score to report")
    p.add_option("--period", default=6, type="int",
                 help="Maximum period to report")
    p.add_option("--lobstr", default=False, action="store_true",
                 help="Generate output for lobSTR")
    p.add_option("--telomeres", default=False, action="store_true",
                 help="Run telomere search: minscore=140 period=7")
    p.add_option("--centromeres", default=False, action="store_true",
                 help="Run centromere search: {}".format(cparams))
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    outdir, = args
    # Used only by the --lobstr awk filter below to bound repeat lengths.
    minlength = opts.minscore / 2
    mm = MakeManager()
    if opts.telomeres:
        opts.minscore, opts.period = 140, 7

    params = "2 {0} {0} 80 10 {1} {2}".\
            format(opts.mismatch, opts.minscore, opts.period).split()
    if opts.centromeres:
        params = cparams.split()

    bedfiles = []
    for fastafile in natsorted(iglob(outdir, "*.fa,*.fasta")):
        pf = op.basename(fastafile).split(".")[0]
        # cmd1: run TRF; it names its .dat output after file + parameters.
        cmd1 = "trf {0} {1} -d -h".format(fastafile, " ".join(params))
        datfile = op.basename(fastafile) + "." + ".".join(params) + ".dat"
        bedfile = "{0}.trf.bed".format(pf)
        # cmd2: convert .dat to BED — filter on column 8 (repeat length),
        # convert spaces to tabs, and prepend the sequence name.
        cmd2 = "cat {} | grep -v ^Parameters".format(datfile)
        if opts.lobstr:
            cmd2 += " | awk '($8 >= {} && $8 <= {})'".\
                    format(minlength, READLEN - minlength)
        else:
            cmd2 += " | awk '($8 >= 0)'"
        cmd2 += " | sed 's/ /\\t/g'"
        cmd2 += " | awk '{{print \"{0}\\t\" $0}}' > {1}".format(pf, bedfile)
        mm.add(fastafile, datfile, cmd1)
        mm.add(datfile, bedfile, cmd2)
        bedfiles.append(bedfile)

    # Final target: concatenate all per-sequence BEDs into trf.bed.
    bedfile = "trf.bed"
    cmd = "cat {0} > {1}".format(" ".join(natsorted(bedfiles)), bedfile)
    mm.add(bedfiles, bedfile, cmd)
    mm.write()
def from23andme(args):
    """
    %prog from23andme txtfile 1

    Convert from23andme file to vcf file.

    --ref points to the folder that contains chr1.rsids

    $ zcat 1000GP_Phase3/1000GP_Phase3_chr1.legend.gz \\
            | cut -d" " -f1 | grep ":" > chr1.rsids
    """
    p = OptionParser(from23andme.__doc__)
    p.set_ref()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    txtfile, seqid = args
    ref_dir = opts.ref
    fastafile = op.join(ref_dir, "hs37d5.fa")
    fasta = Fasta(fastafile)

    pf = txtfile.rsplit(".", 1)[0]
    px = CM[seqid]
    chrvcf = pf + ".{0}.vcf".format(px)
    legend = op.join(ref_dir, "1000GP_Phase3/{0}.rsids".format(px))
    # Reference-panel lookup keyed by rsid and by position; empty for
    # chromosomes without a panel (Y/MT), handled separately below.
    register = read_rsid(seqid, legend)

    fw = open(chrvcf, "w")
    print(get_vcfstanza(fastafile, txtfile), file=fw)

    fp = open(txtfile)
    seen = set()
    duplicates = skipped = missing = 0
    for row in fp:
        if row[0] == "#":
            continue
        rsid, chr, pos, genotype = row.split()
        if chr != seqid:
            continue
        pos = int(pos)
        # Only keep the first call at each position.
        if (chr, pos) in seen:
            duplicates += 1
            continue
        seen.add((chr, pos))
        genotype = list(genotype)
        if "-" in genotype:  # missing data
            missing += 1
            continue

        # Y or MT: haploid calls, no reference panel — compare directly
        # against the reference base.
        if not register:
            assert len(genotype) == 1
            ref = fasta[chr][pos - 1].seq.upper()
            if "D" in genotype or "I" in genotype:
                skipped += 1
                continue
            genotype = genotype[0]
            code = "0/0" if ref == genotype else "1/1"
            alt = "." if ref == genotype else genotype
            print(
                "\t".join(
                    str(x) for x in
                    (chr, pos, rsid, ref, alt, ".", ".", "PR", "GT", code)),
                file=fw,
            )
            continue

        # If rsid is seen in the db, use that
        if rsid in register:
            pos, ref, alt = register[rsid]
        elif pos in register:
            pos, ref, alt = register[pos]
        else:
            skipped += 1  # Not in reference panel
            continue

        # Panel REF must match the reference genome at this locus.
        assert fasta[chr][pos - 1:pos + len(ref) - 1].seq.upper() == ref

        # Keep it bi-allelic: drop panel ALT alleles not present in the
        # observed genotype, then truncate to a single ALT.
        not_seen = [x for x in alt if x not in genotype]
        while len(alt) > 1 and not_seen:
            alt.remove(not_seen.pop())
        if len(alt) > 1:
            alt = [alt[0]]
        alleles = [ref] + alt

        # Haploid call reported on a diploid chromosome -> homozygous.
        if len(genotype) == 1:
            genotype = [genotype[0]] * 2

        alt = ",".join(alt) or "."
        if "D" in genotype or "I" in genotype:
            # 23andMe encodes indels as D/I; map the longer allele to I
            # and the shorter to D so indexing below works.
            max_allele = max((len(x), x) for x in alleles)[1]
            alleles = [("I" if x == max_allele else "D") for x in alleles]
            assert "I" in alleles and "D" in alleles
        a, b = genotype
        try:
            ia, ib = alleles.index(a), alleles.index(b)
        except ValueError:  # alleles not seen
            logging.error("{0}: alleles={1}, genotype={2}".format(
                rsid, alleles, genotype))
            skipped += 1
            continue
        code = "/".join(str(x) for x in sorted((ia, ib)))
        print(
            "\t".join(
                str(x) for x in
                (chr, pos, rsid, ref, alt, ".", ".", "PR", "GT", code)),
            file=fw,
        )

    logging.debug("duplicates={0} skipped={1} missing={2}".format(
        duplicates, skipped, missing))
logging.error("Term `{0}` does not exist".format(term)) sys.exit(1) if oterm != term: logging.debug("Resolved term `{0}` to `{1}`".format(oterm, term)) return term if __name__ == "__main__": p = OptionParser(__doc__) p.add_option( "--term", help="Write the parents and children of this query term", ) opts, args = p.parse_args() if len(args) != 1: sys.exit(p.print_help()) (obo_file, ) = args def description(record): level = "level-{:>02}".format(record.level) desc = "{} [{}]".format(record.name, record.namespace) if record.is_obsolete: desc += " obsolete" alt_ids = ",".join(record.alt_ids) return "\t".join((record.item_id, level, desc, alt_ids)) g = GODag(obo_file, prt=None)
def mito(args):
    """
    %prog mito chrM.fa input.bam

    Identify mitochondrial deletions.
    """
    p = OptionParser(mito.__doc__)
    p.set_aws_opts(store="hli-mv-data-science/htang/mito-deletions")
    p.add_option("--realignonly", default=False, action="store_true",
                 help="Realign only")
    p.add_option(
        "--svonly",
        default=False,
        action="store_true",
        help="Run Realign => SV calls only",
    )
    p.add_option("--support", default=1, type="int",
                 help="Minimum number of supporting reads")
    p.set_home("speedseq", default="/mnt/software/speedseq/bin")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    chrMfa, bamfile = args
    store = opts.output_path
    cleanup = not opts.nocleanup

    if not op.exists(chrMfa):
        logging.debug("File `{}` missing. Exiting.".format(chrMfa))
        return

    chrMfai = chrMfa + ".fai"
    if not op.exists(chrMfai):
        # Fixed: `samtools index` indexes BAM files; a FASTA .fai is
        # produced by `samtools faidx`.
        cmd = "samtools faidx {}".format(chrMfa)
        sh(cmd)

    # Argument may be a single BAM, or a text file listing one BAM per line.
    if not bamfile.endswith(".bam"):
        bamfiles = [x.strip() for x in open(bamfile)]
    else:
        bamfiles = [bamfile]

    if store:
        # Skip samples whose .depth output already exists in the S3 store.
        computed = ls_s3(store)
        computed = [
            op.basename(x).split(".")[0] for x in computed
            if x.endswith(".depth")
        ]
        remaining_samples = [
            x for x in bamfiles
            if op.basename(x).split(".")[0] not in computed
        ]
        logging.debug("Already computed on `{}`: {}".format(
            store, len(bamfiles) - len(remaining_samples)))
        bamfiles = remaining_samples

    logging.debug("Total samples: {}".format(len(bamfiles)))

    for bamfile in bamfiles:
        run_mito(
            chrMfa,
            bamfile,
            opts,
            realignonly=opts.realignonly,
            svonly=opts.svonly,
            store=store,
            cleanup=cleanup,
        )
def ystr(args):
    """
    %prog ystr chrY.vcf

    Print out Y-STR info given VCF. Marker name extracted from tabfile.
    """
    from jcvi.utils.table import write_csv

    p = OptionParser(ystr.__doc__)
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    vcffile, = args
    si = STRFile(opts.lobstr_home, db="hg38-named")
    register = si.register

    header = "Marker|Reads|Ref|Genotype|Motif".split("|")
    contents = []

    fp = must_open(vcffile)
    reader = vcf.Reader(fp)
    simple_register = {}
    for record in reader:
        name = register[(record.CHROM, record.POS)]
        info = record.INFO
        ref = int(float(info["REF"]))
        rpa = info.get("RPA", ref)
        if isinstance(rpa, list):
            rpa = "|".join(str(int(float(x))) for x in rpa)
        ru = info["RU"]
        simple_register[name] = rpa
        for sample in record.samples:
            contents.append((name, sample["ALLREADS"], ref, rpa, ru))

    # Multi-part markers: DYS389B is reported as the sum of its two parts.
    a, b, c = "DYS389I", "DYS389B.1", "DYS389B"
    if a in simple_register and b in simple_register:
        simple_register[c] = int(simple_register[a]) + int(simple_register[b])

    # Multi-copy markers: need both copies; report the pair in sorted order.
    mm = ["DYS385", "DYS413", "YCAII"]
    for m in mm:
        ma, mb = m + 'a', m + 'b'
        if ma not in simple_register or mb not in simple_register:
            # Drop the pair entirely if either copy is missing.
            simple_register.pop(ma, None)
            simple_register.pop(mb, None)
            continue
        if simple_register[ma] > simple_register[mb]:
            simple_register[ma], simple_register[mb] = \
                simple_register[mb], simple_register[ma]

    write_csv(header, contents, sep=" ")

    # Fixed: py2 print statements -> py3 print().
    print("[YSEARCH]")
    build_ysearch_link(simple_register)
    print("[YFILER]")
    build_yhrd_link(simple_register, panel=YHRD_YFILER)
    print("[YFILERPLUS]")
    build_yhrd_link(simple_register, panel=YHRD_YFILERPLUS)
    print("[YSTR-ALL]")
    build_yhrd_link(simple_register, panel=USYSTR_ALL)
def meta(args):
    """
    %prog meta data.bin samples STR.ids STR-exons.wo.bed

    Compute allele frequencies and prune sites based on missingness.

    Filter subset of loci that satisfy:
    1. no redundancy (unique chr:pos)
    2. variable (n_alleles > 1)
    3. low level of missing data (>= 50% autosomal + X, > 25% for Y)

    Write meta file with the following infor:
    1. id
    2. title
    3. gene_name
    4. variant_type
    5. motif
    6. allele_frequency

    `STR-exons.wo.bed` can be generated like this:
    $ tail -n 694105 /mnt/software/lobSTR/hg38/index.tab | cut -f1-3 > all-STR.bed
    $ intersectBed -a all-STR.bed -b all-exons.bed -wo > STR-exons.wo.bed
    """
    p = OptionParser(meta.__doc__)
    p.add_option("--cutoff", default=.5, type="float",
                 help="Percent observed required (chrY half cutoff)")
    p.set_cpus()
    opts, args = p.parse_args(args)
    if len(args) != 4:
        sys.exit(not p.print_help())

    binfile, sampleids, strids, wobed = args
    cutoff = opts.cutoff

    # Stage 1: compute per-locus allele frequencies (cached in af_file).
    af_file = "allele_freq"
    if need_update(binfile, af_file):
        df, m, samples, loci = read_binfile(binfile, sampleids, strids)
        nalleles = len(samples)
        fw = must_open(af_file, "w")
        for i, locus in enumerate(loci):
            a = m[:, i]
            counts = alleles_to_counts(a)
            af = counts_to_af(counts)
            seqid = locus.split("_")[0]
            remove = counts_filter(counts, nalleles, seqid, cutoff=cutoff)
            # FIX: was the py2 statement `print >> fw, ...`
            print("\t".join((locus, af, remove)), file=fw)
        fw.close()

    # Stage 2: map each STR (chr, start) to overlapping gene names.
    logging.debug("Load gene intersections from `{}`".format(wobed))
    fp = open(wobed)
    gene_map = defaultdict(set)
    for row in fp:
        chr1, start1, end1, chr2, start2, end2, name, ov = row.split()
        gene_map[(chr1, start1)] |= set(name.split(","))
    for k, v in gene_map.items():
        # Only keep non-ENST symbols for display.
        non_enst = sorted(x for x in v if not x.startswith("ENST"))
        #enst = sorted(x.rsplit(".", 1)[0] for x in v if x.startswith("ENST"))
        gene_map[k] = ",".join(non_enst)

    tredsfile = op.join(datadir, "TREDs.meta.hg38.csv")
    TREDS = read_treds(tredsfile)

    metafile = "STRs_{}_SEARCH.meta.tsv".format(timestamp())
    write_meta(af_file, gene_map, TREDS, filename=metafile)
    logging.debug("File `{}` written.".format(metafile))
def lobstr(args):
    """
    %prog lobstr lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices. In
    addition, bamfile can be S3 location and --lobstr_home can be S3 location
    (e.g. s3://hli-mv-data-science/htang/str-build/lobSTR/)
    """
    p = OptionParser(lobstr.__doc__)
    p.add_option("--haploid", default="chrY,chrM",
                 help="Use haploid model for these chromosomes")
    p.add_option("--chr", help="Run only this chromosome")
    p.set_home("lobstr", default="s3://hli-mv-data-science/htang/str-build/lobSTR/")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)
    bamfile = opts.input_bam_path

    if len(args) < 1 or bamfile is None:
        sys.exit(not p.print_help())

    lbindices = args
    if lbindices[0] == "TOY":  # Simulation mode: single toy chromosome
        cmd, vcf_file = allelotype_on_chr(bamfile, "CHR4",
                                          "/mnt/software/lobSTR/", "TOY",
                                          haploid=opts.haploid)
        stats_file = vcf_file.rsplit(".", 1)[0] + ".allelotype.stats"
        results_dir = "lobstr_results"
        mkdir(results_dir)
        sh(cmd)
        sh("mv {} {}/ && rm {}".format(vcf_file, results_dir, stats_file))
        return

    s3mode = bamfile.startswith("s3")
    store = opts.output_path
    cleanup = not opts.nocleanup
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)

    lhome = opts.lobstr_home
    if lhome.startswith("s3://"):
        lhome = pull_from_s3(lhome, overwrite=False)

    # Output prefix: workflow/sample ids when given, else BAM basename.
    exec_id, sample_id = opts.workflow_execution_id, opts.sample_id
    prefix = [x for x in (exec_id, sample_id) if x]
    if prefix:
        pf = "_".join(prefix)
    else:
        pf = bamfile.split("/")[-1].split(".")[0]

    if s3mode:
        # Short-circuit if the final gzipped VCF already exists in S3.
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug("Object `{0}` exists. Computation skipped."\
                          .format(remotegzfile))
            return
        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)
        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            # Try <name>.bam.bai, then <name>.bai, then index locally.
            remotebaifile = bamfile + ".bai"
            if check_exists_s3(remotebaifile):
                pull_from_s3(remotebaifile, localbaifile)
            else:
                remotebaifile = bamfile.rsplit(".")[0] + ".bai"
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                else:
                    logging.debug("BAM index cannot be found in S3!")
                    sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    # FIX: `range(1, 23) + ["X", "Y"]` raises TypeError on py3; range must be
    # materialized before concatenation.
    chrs = [opts.chr] if opts.chr else (list(range(1, 23)) + ["X", "Y"])
    for lbidx in lbindices:
        makefile = "makefile.{0}".format(lbidx)
        mm = MakeManager(filename=makefile)
        vcffiles = []
        for chr in chrs:
            cmd, vcffile = allelotype_on_chr(bamfile, chr, lhome, lbidx,
                                             haploid=opts.haploid)
            mm.add(bamfile, vcffile, cmd)
            filteredvcffile = vcffile.replace(".vcf", ".filtered.vcf")
            cmd = "python -m jcvi.variation.str filtervcf {}".format(vcffile)
            cmd += " --lobstr_home {}".format(lhome)
            mm.add(vcffile, filteredvcffile, cmd)
            vcffiles.append(filteredvcffile)

        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)

        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

    if cleanup:
        mm.clean()
        sh("rm -f {} {} *.bai *.stats".format(bamfile, mm.makefile))
def filterloci(args):
    """
    %prog filterloci allele_freq STR-exons.wo.bed SAMPLES

    Filter subset of loci that satisfy:
    1. no redundancy (unique chr:pos)
    2. variable (n_alleles > 1)
    3. low level of missing data (>= 50% autosomal + X, > 25% for Y)

    Write meta file with the following infor:
    1. id
    2. title
    3. gene_name
    4. variant_type
    5. motif
    6. allele_frequency

    `STR-exons.wo.bed` can be generated like this:
    $ tail -n 854476 /mnt/software/lobSTR-4.0.0/hg38/index.tab | cut -f1-3 > all-STR.bed
    $ intersectBed -a all-STR.bed -b all-exons.bed -wo > STR-exons.wo.bed
    """
    p = OptionParser(filterloci.__doc__)
    opts, args = p.parse_args(args)
    if len(args) != 3:
        sys.exit(not p.print_help())

    af, wobed, samples = args
    nsamples = len([x.strip() for x in open(samples)])
    nalleles = nsamples * 2  # diploid: two alleles per sample
    logging.debug("Load {} samples ({} alleles) from `{}`".\
                  format(nsamples, nalleles, samples))

    # Map (chr, start) -> comma-joined overlapping gene names; gene symbols
    # first, then version-stripped ENST ids.
    logging.debug("Load gene intersections from `{}`".format(wobed))
    fp = open(wobed)
    gene_map = defaultdict(set)
    for row in fp:
        chr1, start1, end1, chr2, start2, end2, name, ov = row.split()
        gene_map[(chr1, start1)] |= set(name.split(","))
    for k, v in gene_map.items():
        non_enst = sorted(x for x in v if not x.startswith("ENST"))
        enst = sorted(x.rsplit(".", 1)[0] for x in v if x.startswith("ENST"))
        gene_map[k] = ",".join(non_enst + enst)

    logging.debug("Filtering loci from `{}`".format(af))
    fp = open(af)
    # Known TRED loci are pre-seeded into `seen` so they are excluded here.
    treds = """chr19_45770205_CAG chr6_170561926_CAG chrX_67545318_CAG
               chr9_69037287_GAA chrX_147912051_CGG chr4_3074877_CAG
               chrX_148500639_CCG chr12_6936729_CAG chr13_70139384_CTG
               chr6_16327636_CTG chr14_92071011_CTG chr12_111598951_CTG
               chr3_63912686_CAG chr19_13207859_CTG""".split()
    seen = set(treds)
    remove = []
    fw = open("meta.tsv", "w")
    header = "id title gene_name variant_type motif allele_frequency".\
             replace(" ", "\t")
    # FIX: all `print >> fw` py2 statements converted to py3 print().
    print(header, file=fw)
    variant_type = "short tandem repeats"
    title = "Short tandem repeats ({})n"
    for row in fp:
        sname, counts = row.split()
        name = sname.rsplit("_", 1)[0]
        seqid, pos, motif = name.split("_")
        # counts look like "{allele:count,...}"; parse into an int dict.
        countst = [x for x in counts.strip("{}").split(",") if x]
        countsd = {}
        for x in countst:
            a, b = x.split(":")
            countsd[int(a)] = int(b)

        if counts_filter(countsd, nalleles, seqid):
            remove.append(sname)
            continue

        if name in seen:  # redundant chr:pos (or known TRED)
            remove.append(sname)
            continue
        seen.add(name)

        gene_name = gene_map.get((seqid, pos), "")
        print("\t".join((name, title.format(motif), gene_name,
                         variant_type, motif, counts)), file=fw)
    fw.close()

    removeidsfile = "remove.ids"
    fw = open(removeidsfile, "w")
    print("\n".join(remove), file=fw)
    fw.close()
    logging.debug("A total of {} filtered loci written to `{}`".\
                  format(len(remove), removeidsfile))
def A50(args):
    """
    %prog A50 contigs_A.fasta contigs_B.fasta ...

    Plots A50 graphics, see blog post
    (http://blog.malde.org/index.php/a50/)
    """
    p = OptionParser(A50.__doc__)
    p.add_option(
        "--overwrite",
        default=False,
        action="store_true",
        help="overwrite .rplot file if exists",
    )
    p.add_option(
        "--cutoff",
        default=0,
        type="int",
        dest="cutoff",
        help="use contigs above certain size",
    )
    p.add_option(
        "--stepsize",
        default=10,
        type="int",
        dest="stepsize",
        help="stepsize for the distribution",
    )
    opts, args = p.parse_args(args)
    if not args:
        sys.exit(p.print_help())

    import numpy as np
    from jcvi.utils.table import loadtable

    stepsize = opts.stepsize  # use stepsize to speed up drawing

    # Intermediate data file consumed by generate_plot(); reuse it unless
    # --overwrite is given.
    rplot = "A50.rplot"
    if not op.exists(rplot) or opts.overwrite:
        fw = open(rplot, "w")
        header = "\t".join(("index", "cumsize", "fasta"))
        statsheader = ("Fasta", "L50", "N50", "Min", "Max", "Average", "Sum",
                       "Counts")
        statsrows = []
        print(header, file=fw)
        for fastafile in args:
            f = Fasta(fastafile, index=False)
            ctgsizes = [length for k, length in f.itersizes()]
            ctgsizes = np.array(ctgsizes)

            a50, l50, n50 = calculate_A50(ctgsizes, cutoff=opts.cutoff)
            cmin, cmax, cmean = min(ctgsizes), max(ctgsizes), np.mean(ctgsizes)
            csum, counts = np.sum(ctgsizes), len(ctgsizes)
            cmean = int(round(cmean))
            statsrows.append(
                (fastafile, l50, n50, cmin, cmax, cmean, csum, counts))

            logging.debug("`{0}` ctgsizes: {1}".format(fastafile, ctgsizes))

            tag = "{0} (L50={1})".format(
                op.basename(fastafile).rsplit(".", 1)[0], l50)
            logging.debug(tag)

            # Sample every `stepsize`-th point of the cumulative-size curve;
            # cumsize is reported in Mb.
            for i, s in zip(range(0, len(a50), stepsize), a50[::stepsize]):
                print("\t".join((str(i), str(s / 1000000.0), tag)), file=fw)
        fw.close()

        table = loadtable(statsheader, statsrows)
        print(table, file=sys.stderr)

    generate_plot(rplot)
def lobstr(args):
    """
    %prog lobstr bamfile lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices.

    NOTE(review): this module defines `lobstr` twice; this later definition
    shadows the earlier one at import time — confirm which is intended.
    """
    p = OptionParser(lobstr.__doc__)
    p.add_option("--chr", help="Run only this chromosome")
    p.add_option("--prefix", help="Use prefix file name")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    opts, args = p.parse_args(args)
    if len(args) < 2:
        sys.exit(not p.print_help())

    bamfile = args[0]
    lbindices = args[1:]
    s3mode = bamfile.startswith("s3")
    store = opts.store
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)

    pf = opts.prefix or bamfile.split("/")[-1].split(".")[0]
    if s3mode:
        # Short-circuit if the final gzipped VCF already exists in S3.
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "s3://{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug("Object `{0}` exists. Computation skipped."\
                          .format(remotegzfile))
            return
        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)
        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            # Try <name>.bam.bai, then <name>.bai, then index locally.
            remotebaifile = bamfile + ".bai"
            if check_exists_s3(remotebaifile):
                pull_from_s3(remotebaifile, localbaifile)
            else:
                remotebaifile = bamfile.rsplit(".")[0] + ".bai"
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                else:
                    logging.debug("BAM index cannot be found in S3!")
                    sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    lhome = opts.lobstr_home
    # FIX: `range(1, 23) + ["X", "Y"]` raises TypeError on py3.
    chrs = [opts.chr] if opts.chr else (list(range(1, 23)) + ["X", "Y"])
    for lbidx in lbindices:
        mm = MakeManager(filename="makefile.{0}".format(lbidx))
        vcffiles = []
        for chr in chrs:
            cmd, vcffile = allelotype_on_chr(bamfile, chr, lhome, lbidx)
            mm.add(bamfile, vcffile, cmd)
            vcffiles.append(vcffile)

        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)

        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

    if opts.cleanup:
        sh("rm -f *")
def scaffold(args):
    """
    %prog scaffold ctgfasta reads1.fasta mapping1.bed
                   reads2.fasta mapping2.bed ...

    Run BAMBUS on set of contigs, reads and read mappings.
    """
    from more_itertools import grouper

    from jcvi.formats.base import FileMerger
    from jcvi.formats.bed import mates
    from jcvi.formats.contig import frombed
    from jcvi.formats.fasta import join

    p = OptionParser(scaffold.__doc__)
    p.set_rclip(rclip=1)
    p.add_option("--conf", help="BAMBUS configuration file [default: %default]")
    p.add_option(
        "--prefix",
        default=False,
        action="store_true",
        help="Only keep links between IDs with same prefix [default: %default]",
    )
    opts, args = p.parse_args(args)

    # Expect ctgfasta plus one or more (reads.fasta, mapping.bed) pairs.
    nargs = len(args)
    if nargs < 3 or nargs % 2 != 1:
        sys.exit(not p.print_help())

    rclip = opts.rclip
    ctgfasta = args[0]
    duos = list(grouper(args[1:], 2))
    trios = []
    for fastafile, bedfile in duos:
        prefix = bedfile.rsplit(".", 1)[0]
        matefile = prefix + ".mates"
        matebedfile = matefile + ".bed"
        # Build the mate-pair files only when the BED is newer than them.
        # NOTE(review): --cutoff is presumably added by set_rclip() — verify.
        if need_update(bedfile, [matefile, matebedfile]):
            matesopt = [
                bedfile,
                "--lib",
                "--nointra",
                "--rclip={0}".format(rclip),
                "--cutoff={0}".format(opts.cutoff),
            ]
            if opts.prefix:
                matesopt += ["--prefix"]
            matefile, matebedfile = mates(matesopt)
        trios.append((fastafile, matebedfile, matefile))

    # Merge the readfasta, bedfile and matefile
    bbfasta, bbbed, bbmate = "bambus.reads.fasta", "bambus.bed", "bambus.mates"
    for files, outfile in zip(zip(*trios), (bbfasta, bbbed, bbmate)):
        FileMerger(files, outfile=outfile).merge(checkexists=True)

    ctgfile = "bambus.contig"
    idsfile = "bambus.ids"
    frombedInputs = [bbbed, ctgfasta, bbfasta]
    if need_update(frombedInputs, ctgfile):
        frombed(frombedInputs)

    # Split contigs into those with read links (input) and singletons.
    inputfasta = "bambus.contigs.fasta"
    singletonfasta = "bambus.singletons.fasta"
    cmd = "faSomeRecords {0} {1} ".format(ctgfasta, idsfile)
    sh(cmd + inputfasta)
    sh(cmd + singletonfasta + " -exclude")

    # Run bambus
    prefix = "bambus"
    cmd = "goBambus -c {0} -m {1} -o {2}".format(ctgfile, bbmate, prefix)
    if opts.conf:
        cmd += " -C {0}".format(opts.conf)
    sh(cmd)

    cmd = "untangle -e {0}.evidence.xml -s {0}.out.xml -o {0}.untangle.xml".format(
        prefix)
    sh(cmd)

    final = "final"
    cmd = ("printScaff -e {0}.evidence.xml -s {0}.untangle.xml -l {0}.lib "
           "-merge -detail -oo -sum -o {1}".format(prefix, final))
    sh(cmd)

    # Stitch the scaffolded order (.oo) back into a FASTA.
    oofile = final + ".oo"
    join([inputfasta, "--oo={0}".format(oofile)])
def filterdata(args):
    """
    %prog filterdata data.bin samples.ids STR.ids allele_freq remove.ids final.ids

    Filter subset of data after dropping remove.ids.

    Writes `final.mask.tsv` (per-sample P-value mask), `filtered.bin`
    (raw numpy matrix) and `final.data.tsv`.
    """
    p = OptionParser(filterdata.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)
    if len(args) != 6:
        sys.exit(not p.print_help())

    binfile, sampleids, strids, af, remove, final = args
    df, m, samples, loci = read_binfile(binfile, sampleids, strids)
    remove = [x.strip() for x in open(remove)]
    removes = set(remove)
    final = [x.strip() for x in open(final)]
    # Sanity check: remove + final must partition the loci exactly.
    assert len(loci) == len(remove) + len(final)

    # Precompute allele percentiles per locus from the allele_freq file.
    fp = open(af)
    percentiles = {}
    for row in fp:
        sname, counts = row.split()
        countst = [x for x in counts.strip("{}").split(",") if x]
        countsd = {}
        for x in countst:
            a, b = x.split(":")
            countsd[int(a)] = int(b)
        percentile = counts_to_percentile(countsd)
        percentiles[sname] = percentile

    # FIX: renamed the Pool from `p`, which shadowed the OptionParser above.
    pool = Pool(processes=opts.cpus)
    run_args = []
    for i, sname in enumerate(loci):
        if sname in removes:
            continue
        a = m[:, i]
        percentile = percentiles[sname]
        run_args.append((i, a, percentile))

    res = []
    for r in pool.map_async(convert_to_percentile, run_args).get():
        res.append(r)
    res.sort()

    # Write mask (P-value) matrix
    ii, pvalues = zip(*res)
    m = np.vstack(pvalues).T
    write_csv("final.mask.tsv", m, samples, final)

    df.drop(remove, inplace=True, axis=1)
    df.columns = final

    # Save a copy of the raw numpy array
    filtered_bin = "filtered.bin"
    # FIX: DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # long-standing equivalent.
    m = df.values
    m[m < 0] = -1
    m.tofile(filtered_bin)
    logging.debug("Binary matrix written to `{}`".format(filtered_bin))

    # Write data output
    df.to_csv("final.data.tsv", sep="\t", index_label="SampleKey")
def subset(args):
    """
    %prog subset pairsfile ksfile1 ksfile2 ... -o pairs.ks

    Subset some pre-calculated ks ka values (in ksfile) according to pairs
    in tab delimited pairsfile/anchorfile.
    """
    p = OptionParser(subset.__doc__)
    p.add_option(
        "--noheader", action="store_true", help="don't write ksfile header line"
    )
    p.add_option(
        "--block", action="store_true", help="preserve block structure in input"
    )
    p.set_stripnames()
    p.set_outfile()

    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    pairsfile, ksfiles = args[0], args[1:]
    block = opts.block
    # Block mode always suppresses the header.
    noheader = True if block else opts.noheader
    outfile = opts.outfile

    # Pool every ks record from all input files, keyed by "a;b" pair name.
    ksvals = {}
    for ksfile in ksfiles:
        ksvals.update(
            {rec.name: rec
             for rec in KsFile(ksfile, strip_names=opts.strip_names)}
        )

    fw = must_open(outfile, "w")
    if not noheader:
        print(fields, file=fw)

    written = missing = 0
    for row in open(pairsfile):
        if row[0] == "#":
            # Comment lines delimit blocks; echo them only in block mode.
            if block:
                print(row.strip(), file=fw)
            continue
        a, b = row.split()[:2]
        key = ";".join((a, b))
        if key not in ksvals:
            # Try the reversed orientation before giving up on this pair.
            key = ";".join((b, a))
            if key not in ksvals:
                missing += 1
                print("\t".join((a, b, ".", ".")), file=fw)
                continue
        rec = ksvals[key]
        if block:
            print("\t".join(str(x) for x in (a, b, rec.ks)), file=fw)
        else:
            rec.name = ";".join((a, b))
            print(rec, file=fw)
        written += 1
    fw.close()

    logging.debug("{0} pairs not found in ksfiles".format(missing))
    logging.debug("{0} ks records written to `{1}`".format(written, outfile))
    return outfile
def qc(args):
    """
    %prog qc prefix

    Expects data files including:
    1. `prefix.bedpe` draws Bezier curve between paired reads
    2. `prefix.sizes` draws length of the contig/scaffold
    3. `prefix.gaps.bed` mark the position of the gaps in sequence
    4. `prefix.bed.coverage` plots the base coverage
    5. `prefix.pairs.bed.coverage` plots the clone coverage

    See assembly.coverage.posmap() for the generation of these files.
    """
    from jcvi.graphics.glyph import Bezier

    p = OptionParser(qc.__doc__)
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(p.print_help())

    (prefix, ) = args
    scf = prefix

    # All these files *must* be present in the current folder
    bedpefile = prefix + ".bedpe"
    fastafile = prefix + ".fasta"
    sizesfile = prefix + ".sizes"
    gapsbedfile = prefix + ".gaps.bed"
    bedfile = prefix + ".bed"
    bedpefile = prefix + ".bedpe"
    pairsbedfile = prefix + ".pairs.bed"

    sizes = Sizes(fastafile).mapping
    size = sizes[scf]

    fig = plt.figure(1, (8, 5))
    root = fig.add_axes([0, 0, 1, 1])

    # the scaffold
    root.add_patch(Rectangle((0.1, 0.15), 0.8, 0.03, fc="k"))

    # basecoverage and matecoverage
    ax = fig.add_axes([0.1, 0.45, 0.8, 0.45])

    bins = 200  # Smooth the curve
    basecoverage = Coverage(bedfile, sizesfile)
    matecoverage = Coverage(pairsbedfile, sizesfile)

    x, y = basecoverage.get_plot_data(scf, bins=bins)
    (baseline, ) = ax.plot(x, y, "g-")
    x, y = matecoverage.get_plot_data(scf, bins=bins)
    (mateline, ) = ax.plot(x, y, "r-")
    legends = ("Base coverage", "Mate coverage")
    leg = ax.legend((baseline, mateline), legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)
    ax.set_xlim(0, size)

    # draw the read pairs
    fp = open(bedpefile)
    pairs = []
    for row in fp:
        scf, astart, aend, scf, bstart, bend, clonename = row.split()
        astart, bstart = int(astart), int(bstart)
        aend, bend = int(aend), int(bend)
        start = min(astart, bstart) + 1
        end = max(aend, bend)
        pairs.append((start, end))

    bpratio = 0.8 / size
    cutoff = 1000  # inserts smaller than this are not plotted
    # this convert from base => x-coordinate
    pos = lambda x: (0.1 + x * bpratio)
    ypos = 0.15 + 0.03
    for start, end in pairs:
        dist = end - start
        if dist < cutoff:
            continue
        dist = min(dist, 10000)
        # 10Kb == .25 canvas height
        height = 0.25 * dist / 10000
        xstart = pos(start)
        xend = pos(end)
        p0 = (xstart, ypos)
        p1 = (xstart, ypos + height)
        p2 = (xend, ypos + height)
        p3 = (xend, ypos)
        Bezier(root, p0, p1, p2, p3)

    # gaps on the scaffold
    fp = open(gapsbedfile)
    for row in fp:
        b = BedLine(row)
        start, end = b.start, b.end
        xstart = pos(start)
        xend = pos(end)
        root.add_patch(Rectangle((xstart, 0.15), xend - xstart, 0.03, fc="w"))

    # FIX: the scaffold label was drawn twice at the same position; one call
    # is enough.
    root.text(0.5, 0.1, scf, color="b", ha="center")
    warn_msg = "Only the inserts > {0}bp are shown".format(cutoff)
    root.text(0.5, 0.05, warn_msg, color="gray", ha="center")

    # clean up and output
    set_human_base_axis(ax)
    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()
    figname = prefix + ".pdf"
    savefig(figname, dpi=300)
def assemble(args):
    """
    %prog assemble pasa_db_name genome.fasta transcripts-dn.fasta [transcript-gg.fasta]

    Run the PASA alignment assembly pipeline

    If two transcript fasta files (Trinity denovo and genome guided) are provided
    and the `--compreh` param is enabled, the PASA Comprehensive Transcriptome DB
    protocol is followed <http://pasa.sourceforge.net/#A_ComprehensiveTranscriptome>

    Using the `--prepare` option creates a shell script with the run commands without
    executing the pipeline
    """
    p = OptionParser(assemble.__doc__)
    p.set_pasa_opts()
    p.add_option(
        "--prepare",
        default=False,
        action="store_true",
        help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)
    if len(args) not in (3, 4):
        sys.exit(not p.print_help())

    pasa_db, genome, dnfasta, = args[:3]
    ggfasta = args[3] if len(args) == 4 else None

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error(
            "PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    aligners = opts.aligners.split(",")
    for aligner in aligners:
        if aligner not in ALLOWED_ALIGNERS:
            logging.error("Error: Unknown aligner `{0}`".format(aligner))
            logging.error("Can be any of {0}, ".format("|".join(ALLOWED_ALIGNERS)) + \
                          "combine multiple aligners in list separated by comma")
            sys.exit()

    clean = opts.clean
    seqclean = op.join(opts.tgi_home, "seqclean")

    accn_extract = which(op.join(PASA_HOME, "misc_utilities", \
                                 "accession_extractor.pl"))
    launch_pasa = which(op.join(PASA_HOME, "scripts", \
                                "Launch_PASA_pipeline.pl"))
    build_compreh_trans = which(op.join(PASA_HOME, "scripts", \
                                        "build_comprehensive_transcriptome.dbi"))

    cpus = opts.cpus
    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"
    pctcov, pctid = opts.pctcov, opts.pctid
    # FIX: compreh_pctid was referenced below but never assigned (NameError in
    # the comprehensive-transcriptome branch); read it from opts like its
    # siblings — set_pasa_opts() presumably defines it, verify.
    compreh_pctid = opts.compreh_pctid
    compreh_pctcov, bpsplice = opts.compreh_pctcov, opts.bpsplice

    mkdir(pasa_db)
    os.chdir(pasa_db)

    if prepare:
        write_file(runfile, "")  # initialize run script

    if ggfasta:
        transcripts = FileMerger([dnfasta, ggfasta], tfasta).merge()
        accn_extract_cmd = "cat {0} | {1} > {2}".format(
            dnfasta, accn_extract, tdn)
        write_file(runfile, accn_extract_cmd, append=True) \
            if prepare else sh(accn_extract_cmd)
    else:
        transcripts = dnfasta

    if opts.grid and not opts.threaded:
        opts.threaded = opts.cpus

    prjobid = None
    if clean:
        cleancmd = "{0} {1} -c {2} -l 60".format(seqclean, transcripts, cpus)
        if prepare:
            write_file(runfile, cleancmd, append=True)
        else:
            prjobid = sh(cleancmd, grid=grid, grid_opts=opts)

    aafw = must_open(aaconf, "w")
    # FIX: was the py2 statement `print >> aafw, ...`
    print(alignAssembly_conf.format("{0}_pasa".format(pasa_db), \
                                    pctcov, pctid, bpsplice), file=aafw)
    aafw.close()

    aacmd = "{0} -c {1} -C -R -g {2}".format(launch_pasa, aaconf, genome)
    aacmd += " -t {0}.clean -T -u {0} ".format(transcripts) if clean else \
             " -t {0} ".format(transcripts)
    if ggfasta:
        aacmd += " --TDN {0} ".format(tdn)
    aacmd += " --ALIGNERS {0} -I {1} --CPU {2}".format(",".join(aligners), \
                                                       opts.intron, cpus)

    if prepare:
        write_file(runfile, aacmd, append=True)
    else:
        opts.hold_jid = prjobid
        prjobid = sh(aacmd, grid=grid, grid_opts=opts)

    if opts.compreh and ggfasta:
        comprehcmd = "{0} -c {1} -t {2}".format(build_compreh_trans, aaconf,
                                                transcripts)
        comprehcmd += " --min_per_ID {0} --min_per_aligned {1}".format(
            compreh_pctid, compreh_pctcov)

        if prepare:
            write_file(runfile, comprehcmd, append=True)
        else:
            opts.hold_jid = prjobid
            prjobid = sh(comprehcmd, grid=grid, grid_opts=opts)
def last(args, dbtype=None):
    """
    %prog database.fasta query.fasta

    Run LAST by calling LASTDB and LASTAL. LAST program available:
    <http://last.cbrc.jp>

    Works with LAST-719.
    """
    p = OptionParser(last.__doc__)
    p.add_option(
        "--dbtype",
        default="nucl",
        choices=("nucl", "prot"),
        help="Molecule type of subject database",
    )
    p.add_option("--path", help="Specify LAST path")
    p.add_option(
        "--mask", default=False, action="store_true", help="Invoke -c in lastdb"
    )
    p.add_option(
        "--format",
        default="BlastTab",
        choices=("TAB", "MAF", "BlastTab", "BlastTab+"),
        help="Output format",
    )
    p.add_option(
        "--minlen",
        default=0,
        type="int",
        help="Filter alignments by how many bases match",
    )
    p.add_option("--minid", default=0, type="int", help="Minimum sequence identity")
    p.set_cpus()
    p.set_outdir()
    p.set_params()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    path = opts.path
    cpus = opts.cpus
    # The dbtype parameter (callers) takes precedence over the CLI option.
    if not dbtype:
        dbtype = opts.dbtype
    getpath = lambda x: op.join(path, x) if path else x
    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")

    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(
        infile=subject,
        outfile=subjectdb + ".prj",
        mask=opts.mask,
        lastdb_bin=lastdb_bin,
        dbtype=dbtype,
    )

    # -u2: subtle repeat masking only when the db was built with -c.
    u = 2 if opts.mask else 0
    cmd = "{0} -u {1}".format(lastal_bin, u)
    cmd += " -P {0} -i3G".format(cpus)
    cmd += " -f {0}".format(opts.format)
    cmd += " {0} {1}".format(subjectdb, query)

    minlen = opts.minlen
    minid = opts.minid
    extra = opts.extra
    assert minid != 100, "Perfect match not yet supported"
    # Translate percent identity into LAST match/mismatch score ratio.
    mm = minid / (100 - minid)

    if minlen:
        extra += " -e{0}".format(minlen)
    if minid:
        extra += " -r1 -q{0} -a{0} -b{0}".format(mm)
    if extra:
        cmd += " " + extra.strip()

    lastfile = get_outfile(subject, query, suffix="last", outdir=opts.outdir)
    sh(cmd, outfile=lastfile)

    return lastfile
def longest(args):
    """
    %prog longest pasa.fasta output.subclusters.out

    Find the longest PASA assembly and label it as full-length. Also removes
    transcripts shorter than half the length of the longest, or shorter than
    200bp. The assemblies for the same locus is found in
    `output.subclusters.out`. In particular the lines that look like:

    sub-cluster: asmbl_25 asmbl_26 asmbl_27
    """
    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.sizes import Sizes

    p = OptionParser(longest.__doc__)
    p.add_option("--prefix", default="pasa",
                 help="Replace asmbl_ with prefix [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, subclusters = args
    prefix = fastafile.rsplit(".", 1)[0]

    idsfile = prefix + ".fl.ids"
    fw = open(idsfile, "w")
    sizes = Sizes(fastafile).mapping

    name_convert = lambda x: x.replace("asmbl", opts.prefix)

    keep = set()  # List of IDs to write
    fp = open(subclusters)
    nrecs = 0
    for row in fp:
        if not row.startswith("sub-cluster:"):
            continue
        asmbls = row.split()[1:]
        longest_asmbl = max(asmbls, key=lambda x: sizes[x])
        longest_size = sizes[longest_asmbl]
        # FIX: was py2 `print >> fw, ...`
        print(name_convert(longest_asmbl), file=fw)
        nrecs += 1
        # FIX: `/ 2` float-divides under py3; `// 2` keeps the original
        # integer cutoff semantics.
        cutoff = max(longest_size // 2, 200)
        keep.update(set(x for x in asmbls if sizes[x] >= cutoff))

    fw.close()
    logging.debug("{0} fl-cDNA records written to `{1}`.".format(
        nrecs, idsfile))

    f = Fasta(fastafile, lazy=True)
    newfastafile = prefix + ".clean.fasta"
    fw = open(newfastafile, "w")
    nrecs = 0
    for name, rec in f.iteritems_ordered():
        if name not in keep:
            continue

        rec.id = name_convert(name)
        rec.description = ""
        SeqIO.write([rec], fw, "fasta")
        nrecs += 1

    fw.close()
    logging.debug("{0} valid records written to `{1}`.".format(
        nrecs, newfastafile))
def calc(args):
    """
    %prog calc [prot.fasta] cds.fasta > out.ks

    Protein file is optional. If only one file is given, it is assumed to
    be CDS sequences with correct frame (frame 0). Results will be written to
    stdout. Both protein file and nucleotide file are assumed to be Fasta format,
    with adjacent records as the pairs to compare.

    Author: Haibao Tang <*****@*****.**>, Brad Chapman, Jingping Li
    Calculate synonymous mutation rates for gene pairs

    This does the following:
    1. Fetches a protein pair.
    2. Aligns the protein pair with clustalw (default) or muscle.
    3. Convert the output to Fasta format.
    4. Use this alignment info to align gene sequences using PAL2NAL
    5. Run PAML yn00 to calculate synonymous mutation rates.
    """
    from jcvi.formats.fasta import translate

    p = OptionParser(calc.__doc__)
    p.add_option(
        "--longest",
        action="store_true",
        help="Get longest ORF, only works if no pep file, e.g. ESTs",
    )
    p.add_option(
        "--msa",
        default="clustalw",
        choices=("clustalw", "muscle"),
        help="software used to align the proteins",
    )
    p.add_option("--workdir", default=os.getcwd(), help="Work directory")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) == 1:
        protein_file, dna_file = None, args[0]
    elif len(args) == 2:
        protein_file, dna_file = args
    else:
        print("Incorrect arguments", file=sys.stderr)
        sys.exit(not p.print_help())

    output_h = must_open(opts.outfile, "w")
    print(fields, file=output_h)
    work_dir = op.join(opts.workdir, "syn_analysis")
    mkdir(work_dir)

    # No protein file given: translate the CDS file ourselves.
    if not protein_file:
        protein_file = dna_file + ".pep"
        translate_args = [dna_file, "--outfile=" + protein_file]
        if opts.longest:
            translate_args += ["--longest"]
        dna_file, protein_file = translate(translate_args)

    # Records are consumed two at a time: adjacent records form a pair.
    prot_iterator = SeqIO.parse(open(protein_file), "fasta")
    dna_iterator = SeqIO.parse(open(dna_file), "fasta")
    for p_rec_1, p_rec_2, n_rec_1, n_rec_2 in zip(
        prot_iterator, prot_iterator, dna_iterator, dna_iterator
    ):
        print("--------", p_rec_1.name, p_rec_2.name, file=sys.stderr)
        if opts.msa == "clustalw":
            align_fasta = clustal_align_protein((p_rec_1, p_rec_2), work_dir)
        elif opts.msa == "muscle":
            align_fasta = muscle_align_protein((p_rec_1, p_rec_2), work_dir)
        # PAL2NAL: back-translate the protein alignment onto the CDS pair.
        mrtrans_fasta = run_mrtrans(align_fasta, (n_rec_1, n_rec_2), work_dir)
        if mrtrans_fasta:
            ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng = find_synonymous(
                mrtrans_fasta, work_dir
            )
            if ds_subs_yn is not None:
                pair_name = "%s;%s" % (p_rec_1.name, p_rec_2.name)
                output_h.write(
                    "%s\n"
                    % (
                        ",".join(
                            str(x)
                            for x in (
                                pair_name,
                                ds_subs_yn,
                                dn_subs_yn,
                                ds_subs_ng,
                                dn_subs_ng,
                            )
                        )
                    )
                )
                output_h.flush()

    # Clean-up
    sh("rm -rf 2YN.t 2YN.dN 2YN.dS rst rub rst1 syn_analysis")
def gss(args):
    """
    %prog gss fastafile plateMapping

    Generate sequence files and metadata templates suited for gss submission.
    The FASTA file is assumed to be exported from the JCVI data delivery
    folder which looks like:

    >1127963806024 /library_name=SIL1T054-B-01-120KB /clear_start=0
    /clear_end=839 /primer_id=1049000104196 /trace_id=1064147620169
    /trace_file_id=1127963805941 /clone_insert_id=1061064364776
    /direction=reverse /sequencer_run_id=1064147620155
    /sequencer_plate_barcode=B906423 /sequencer_plate_well_coordinates=C3
    /sequencer_plate_96well_quadrant=1 /sequencer_plate_96well_coordinates=B02
    /template_plate_barcode=CC0251602AB /growth_plate_barcode=BB0273005AB
    AGCTTTAGTTTCAAGGATACCTTCATTGTCATTCCCGGTTATGATGATATCATCAAGATAAACAAGAATG
    ACAATGATACCTGTTTGGTTCTGAAGTGTAAAGAGGGTATGTTCAGCTTCAGATCTTCTAAACCCTTTGT
    CTAGTAAGCTGGCACTTAGCTTCCTATACCAAACCCTTTGTGATTGCTTCAGTCCATAAATTGCCTTTTT

    Plate mapping file maps the JTC `sequencer_plate_barcode` to external IDs.
    For example:
    B906423 SIL-001
    """
    p = OptionParser(gss.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(p.print_help())

    fastafile, mappingfile = args
    seen = defaultdict(int)
    clone = defaultdict(set)

    plateMapping = DictFile(mappingfile)

    fw = open("MetaData.txt", "w")
    # FIX: all `print >> fw` / `print >> fw_log` py2 statements in this
    # function converted to py3 print(..., file=...).
    # NOTE(review): `vars` here must be a module-level dict (it is updated
    # with locals() below), shadowing the builtin — confirm.
    print(PublicationTemplate.format(**vars), file=fw)
    print(LibraryTemplate.format(**vars), file=fw)
    print(ContactTemplate.format(**vars), file=fw)
    logging.debug("Meta data written to `{0}`".format(fw.name))

    fw = open("GSS.txt", "w")
    fw_log = open("GSS.log", "w")
    for rec in SeqIO.parse(fastafile, "fasta"):
        # First pass just check well number matchings and populate sequences in
        # the same clone
        description = rec.description
        a = parse_description(description)
        direction = a["direction"][0]
        sequencer_plate_barcode = a["sequencer_plate_barcode"][0]
        sequencer_plate_well_coordinates = \
            a["sequencer_plate_well_coordinates"][0]
        sequencer_plate_96well_quadrant = \
            a["sequencer_plate_96well_quadrant"][0]
        sequencer_plate_96well_coordinates = \
            a["sequencer_plate_96well_coordinates"][0]

        # Check the 96-well ID is correctly converted to 384-well ID
        w96 = sequencer_plate_96well_coordinates
        w96quad = int(sequencer_plate_96well_quadrant)
        w384 = sequencer_plate_well_coordinates
        assert convert_96_to_384(w96, w96quad) == w384

        plate = sequencer_plate_barcode
        assert plate in plateMapping, \
            "{0} not found in `{1}` !".format(plate, mappingfile)

        plate = plateMapping[plate]
        d = Directions[direction]

        cloneID = "{0}{1}".format(plate, w384)
        gssID = "{0}{1}".format(cloneID, d)
        seen[gssID] += 1

        if seen[gssID] > 1:
            gssID = "{0}{1}".format(gssID, seen[gssID])

        seen[gssID] += 1
        clone[cloneID].add(gssID)

    seen = defaultdict(int)
    for rec in SeqIO.parse(fastafile, "fasta"):
        # need to populate gssID, mateID, cloneID, seq, plate, row, column
        description = rec.description
        a = parse_description(description)
        direction = a["direction"][0]
        sequencer_plate_barcode = a["sequencer_plate_barcode"][0]
        sequencer_plate_well_coordinates = \
            a["sequencer_plate_well_coordinates"][0]
        w384 = sequencer_plate_well_coordinates

        plate = sequencer_plate_barcode
        plate = plateMapping[plate]
        d = Directions[direction]

        cloneID = "{0}{1}".format(plate, w384)
        gssID = "{0}{1}".format(cloneID, d)
        seen[gssID] += 1

        if seen[gssID] > 1:
            logging.error("duplicate key {0} found".format(gssID))
            gssID = "{0}{1}".format(gssID, seen[gssID])

        othergss = clone[cloneID] - set([gssID])
        othergss = ", ".join(sorted(othergss))
        vars.update(locals())

        print(GSSTemplate.format(**vars), file=fw)

        # Write conversion logs to log file
        print("{0}\t{1}".format(gssID, description), file=fw_log)
        print("=" * 60, file=fw_log)

    logging.debug("A total of {0} seqs written to `{1}`".\
                  format(len(seen), fw.name))
    fw.close()
    fw_log.close()
def consolidate(args):
    """
    %prog consolidate gffile1 gffile2 ... > consolidated.out

    Given 2 or more gff files generated by pasa annotation comparison,
    iterate through every gene locus and identify all cases of same and
    different isoforms across the different input datasets.
    """
    from jcvi.formats.base import longest_unique_prefix
    from jcvi.formats.gff import make_index
    from jcvi.utils.cbook import AutoVivification
    from jcvi.utils.grouper import Grouper
    from itertools import combinations, product

    p = OptionParser(consolidate.__doc__)
    p.add_option("--slop", default=False, action="store_true",
            help="allow minor variation in terminal 5'/3' UTR" + \
                 " start/stop position [default: %default]")
    p.set_outfile()
    opts, args = p.parse_args(args)
    slop = opts.slop

    if len(args) < 2:
        sys.exit(not p.print_help())

    # One gffutils index per input file, keyed by its shortest unique prefix
    gffdbx = {}
    # gene id -> flat list of start/stop coords observed across all datasets
    gene_coords = {}
    # mrna[gene id][dataset] -> list of mRNA features
    mrna = AutoVivification()
    for gffile in args:
        dbn = longest_unique_prefix(gffile, args)
        gffdbx[dbn] = make_index(gffile)
        for gene in gffdbx[dbn].features_of_type('gene', order_by=('seqid', 'start')):
            if gene.id not in gene_coords:
                gene_coords[gene.id] = []
            gene_coords[gene.id].extend([gene.start, gene.stop])

            c = list(gffdbx[dbn].children(gene, featuretype='mRNA', order_by='start'))
            if len(c) > 0:
                mrna[gene.id][dbn] = c

    fw = must_open(opts.outfile, "w")
    print >> fw, "##gff-version 3"
    summary = ["id"]
    summary.extend(gffdbx.keys())
    print >> sys.stderr, "\t".join(str(x) for x in summary)

    for gene in mrna:
        # Group identical isoforms across datasets; each (dataset, mRNA id)
        # pair is a node, joined when exon structure (and optionally UTRs) match
        g = Grouper()
        dbns = list(combinations(mrna[gene], 2))
        if len(dbns) > 0:
            for dbn1, dbn2 in dbns:
                for mrna1, mrna2 in product(mrna[gene][dbn1], mrna[gene][dbn2]):
                    g.join((dbn1, mrna1.id))
                    g.join((dbn2, mrna2.id))

                    # Exons must match before UTRs are even compared
                    fUTR, tUTR = None, None
                    if match_subfeats(mrna1, mrna2, gffdbx[dbn1], gffdbx[dbn2]):
                        fUTR = match_subfeats(mrna1, mrna2, gffdbx[dbn1], gffdbx[dbn2], \
                                featuretype='five_prime_UTR', slop=slop)
                        tUTR = match_subfeats(mrna1, mrna2, gffdbx[dbn1], gffdbx[dbn2], \
                                featuretype='three_prime_UTR', slop=slop)

                    if fUTR and tUTR:
                        g.join((dbn1, mrna1.id), (dbn2, mrna2.id))
        else:
            # Gene present in a single dataset: every isoform is its own group
            for dbn1 in mrna[gene]:
                for mrna1 in mrna[gene][dbn1]:
                    g.join((dbn1, mrna1.id))

        # Emit the gene with the widest span observed across datasets
        dbn = mrna[gene].keys()[0]
        gene_coords[gene].sort()
        _gene = gffdbx[dbn][gene]
        _gene.start, _gene.stop = gene_coords[gene][0], gene_coords[gene][-1]
        print >> fw, _gene

        logging.debug(list(g))
        for group in g:
            dbs, mrnas = [el[0] for el in group], [el[1] for el in group]
            d, m = dbs[0], mrnas[0]

            if slop:
                # With slop, prefer the longest representative of the group
                mlen = 0
                for D, M in zip(dbs, mrnas):
                    _mrna = gffdbx[D][M]
                    _mlen = (_mrna.stop - _mrna.start) + 1
                    if _mlen > mlen:
                        d, m, mlen = D, M, _mlen

            dbid = "".join(str(x) for x in set(dbs))
            # Deduplicate mRNA ids while preserving order. (The previous
            # one-liner tested membership against a list that was never
            # updated, so nothing was ever removed.)
            _mrnaid = []
            for x in mrnas:
                if x not in _mrnaid:
                    _mrnaid.append(x)
            mrnaid = "{0}:{1}".format(dbid, "-".join(_mrnaid))

            _mrna = gffdbx[d][m]
            _mrna.attributes['ID'] = [mrnaid]
            children = gffdbx[d].children(m, order_by='start')
            print >> fw, _mrna
            for child in children:
                child.attributes['ID'] = ["{0}:{1}".format(dbid, child.id)]
                child.attributes['Parent'] = [mrnaid]
                print >> fw, child

            # One Y/N column per dataset indicating membership of this group
            summary = [mrnaid]
            summary.extend(['Y' if db in set(dbs) else 'N' for db in gffdbx])
            print >> sys.stderr, "\t".join(str(x) for x in summary)

    fw.close()
def htgnew(args): """ %prog htgnew fastafile phasefile template.sbt Prepare sqnfiles for submitting new Genbank HTG records. `fastafile` contains the sequences. `phasefile` contains the phase information, it is a two column file: mth2-45h12 3 `template.sbt` is the Genbank submission template. This function is simpler than htg, since the record names have not be assigned yet (so less bookkeeping). """ from jcvi.formats.fasta import sequin p = OptionParser(htgnew.__doc__) p.add_option("--comment", default="", help="Comments for this submission [default: %default]") opts, args = p.parse_args(args) if len(args) != 3: sys.exit(not p.print_help()) fastafile, phasefile, sbtfile = args comment = opts.comment fastadir = "fasta" sqndir = "sqn" mkdir(fastadir) mkdir(sqndir) cmd = "faSplit byname {0} {1}/".format(fastafile, fastadir) sh(cmd, outfile="/dev/null", errfile="/dev/null") acmd = 'tbl2asn -a z -p fasta -r {sqndir}' acmd += ' -i {splitfile} -t {sbtfile} -C tigr' acmd += ' -j "[tech=htgs {phase}] [organism=Medicago truncatula] [strain=A17]"' acmd += ' -o {sqndir}/{accession_nv}.sqn -V Vbr' acmd += ' -y "{comment}" -W T -T T' nupdated = 0 for row in open(phasefile): name, phase = row.split()[:2] fafile = op.join(fastadir, name + ".fa") cloneopt = "--clone={0}".format(name) splitfile, gaps = sequin([fafile, cloneopt]) splitfile = op.basename(splitfile) accession = accession_nv = name phase = int(phase) assert phase in (1, 2, 3) cmd = acmd.format(accession_nv=accession_nv, sqndir=sqndir, sbtfile=sbtfile, splitfile=splitfile, phase=phase, comment=comment) sh(cmd) verify_sqn(sqndir, accession) nupdated += 1 print >> sys.stderr, "A total of {0} records updated.".format(nupdated)
def compare(args): """ %prog compare pasa_db_name genome.fasta transcripts.fasta [annotation.gff] Run the PASA annotation comparison pipeline If annotation.gff file is provided, the PASA database is loaded with the annotations first before starting annotation comparison. Otherwise, it uses previously loaded annotation data. Using the `--prepare` option creates a shell script with the run commands without executing the pipeline """ p = OptionParser(compare.__doc__) p.set_pasa_opts(action="compare") p.add_option( "--prepare", default=False, action="store_true", help="Prepare PASA run script with commands [default: %default]") p.set_grid() p.set_grid_opts() opts, args = p.parse_args(args) if len(args) not in (3, 4): sys.exit(not p.print_help()) pasa_db, genome, transcripts, = args[:3] annotation = args[3] if len(args) == 4 else None PASA_HOME = opts.pasa_home if not op.isdir(PASA_HOME): logging.error( "PASA_HOME={0} directory does not exist".format(PASA_HOME)) sys.exit() launch_pasa = which(op.join(PASA_HOME, "scripts", \ "Launch_PASA_pipeline.pl")) grid = opts.grid prepare, runfile = opts.prepare, "run.sh" os.chdir(pasa_db) if prepare: write_file(runfile, "") # initialize run script if opts.grid and not opts.threaded: opts.threaded = opts.cpus acfw = must_open(acconf, "w") print >> acfw, annotCompare_conf.format("{0}_pasa".format(pasa_db), \ opts.pctovl, opts.pct_coding, opts.pctid_prot, opts.pctlen_FL, \ opts.pctlen_nonFL, opts.orf_size, opts.pct_aln, opts.pctovl_gene, \ opts.stompovl, opts.trust_FL, opts.utr_exons) acfw.close() if op.exists("{0}.clean".format(transcripts)): transcripts = "{0}.clean".format(transcripts) accmd = "{0} -c {1} -A -g {2} -t {3} --GENETIC_CODE {4}".format(launch_pasa, \ acconf, genome, transcripts, opts.genetic_code) if annotation: accmd += " -L --annots_gff3 {0}".format(annotation) if prepare: write_file(runfile, accmd, append=True) else: sh(accmd, grid=grid, grid_opts=opts)
def pairinplace(args): """ %prog pairinplace bulk.fastq Pair up the records in bulk.fastq by comparing the names for adjancent records. If they match, print to bulk.pairs.fastq, else print to bulk.frags.fastq. """ from jcvi.utils.iter import pairwise p = OptionParser(pairinplace.__doc__) p.set_rclip() p.set_tag() p.add_option("--base", help="Base name for the output files [default: %default]") opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) fastqfile, = args base = opts.base or op.basename(fastqfile).split(".")[0] frags = base + ".frags.fastq" pairs = base + ".pairs.fastq" if fastqfile.endswith(".gz"): frags += ".gz" pairs += ".gz" fragsfw = must_open(frags, "w") pairsfw = must_open(pairs, "w") N = opts.rclip tag = opts.tag strip_name = (lambda x: x[:-N]) if N else None fh_iter = iter_fastq(fastqfile, key=strip_name) skipflag = False # controls the iterator skip for a, b in pairwise(fh_iter): if b is None: # hit the eof break if skipflag: skipflag = False continue if a.name == b.name: if tag: a.name += "/1" b.name += "/2" print >> pairsfw, a print >> pairsfw, b skipflag = True else: print >> fragsfw, a # don't forget the last one, when b is None if not skipflag: print >> fragsfw, a logging.debug("Reads paired into `%s` and `%s`" % (pairs, frags)) return pairs
def htg(args):
    """
    %prog htg fastafile template.sbt

    Prepare sqnfiles for Genbank HTG submission to update existing records.

    `fastafile` contains the records to update, multiple records are allowed
    (with each one generating separate sqn file in the sqn/ folder). The record
    defline has the accession ID. For example,
    >AC148290.3

    Internally, this generates two additional files (phasefile and namesfile)
    and download records from Genbank. Below is implementation details:

    `phasefile` contains, for each accession, phase information. For example:
    AC148290.3      3       HTG     2       mth2-45h12

    which means this is a Phase-3 BAC. Record with only a single contig will be
    labeled as Phase-3 regardless of the info in the `phasefile`. Template file
    is the Genbank sbt template. See jcvi.formats.sbt for generation of such
    files.

    Another problem is that Genbank requires the name of the sequence to stay
    the same when updating and will kick back with a table of name conflicts.
    For example:

    We are unable to process the updates for these entries
    for the following reason:

    Seqname has changed

    Accession Old seq_name New seq_name
    --------- ------------ ------------
    AC239792 mtg2_29457 AC239792.1

    To prepare a submission, this script downloads genbank and asn.1 format,
    and generate the phase file and the names file (use formats.agp.phase() and
    apps.gbsubmit.asn(), respectively). These get automatically run.

    However, use --phases if the genbank files contain outdated information.
    For example, the clone name changes or phase upgrades. In this case, run
    formats.agp.phase() manually, modify the phasefile and use --phases to
    override.
    """
    from jcvi.formats.fasta import sequin, ids
    from jcvi.formats.agp import phase
    from jcvi.apps.entrez import fetch

    p = OptionParser(htg.__doc__)
    p.add_option("--phases", default=None,
            help="Use another phasefile to override [default: %default]")
    p.add_option("--comment", default="",
            help="Comments for this update [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, sbtfile = args
    pf = fastafile.rsplit(".", 1)[0]

    # Derived bookkeeping files, all sharing the fastafile prefix
    idsfile = pf + ".ids"
    phasefile = pf + ".phases"
    namesfile = pf + ".names"

    # Extract accession IDs from the FASTA deflines
    ids([fastafile, "--outfile={0}".format(idsfile)])

    # Download asn.1 records and derive the accession -> seq_name mapping
    asndir = "asn.1"
    mkdir(asndir)
    fetch([idsfile, "--format=asn.1", "--outdir={0}".format(asndir)])
    asn(glob("{0}/*".format(asndir)) + \
            ["--outfile={0}".format(namesfile)])

    if opts.phases is None:
        # Download genbank records and derive phase info automatically
        gbdir = "gb"
        mkdir(gbdir)
        fetch([idsfile, "--format=gb", "--outdir={0}".format(gbdir)])
        phase(glob("{0}/*".format(gbdir)) + \
                ["--outfile={0}".format(phasefile)])
    else:
        # User-supplied phasefile overrides the downloaded information
        phasefile = opts.phases

    assert op.exists(namesfile) and op.exists(phasefile)

    newphasefile = phasefile + ".new"
    newphasefw = open(newphasefile, "w")
    comment = opts.comment

    fastadir = "fasta"
    sqndir = "sqn"
    mkdir(fastadir)
    mkdir(sqndir)

    from jcvi.graphics.histogram import stem_leaf_plot

    names = DictFile(namesfile)
    # The accession -> seq_name mapping must be one-to-one
    assert len(set(names.keys())) == len(set(names.values()))

    phases = DictFile(phasefile)
    ph = [int(x) for x in phases.values()]
    # vmin 1, vmax 4, bins 3
    stem_leaf_plot(ph, 1, 4, 3, title="Counts of phases before updates")
    logging.debug("Information loaded for {0} records.".format(len(phases)))
    assert len(names) == len(phases)

    newph = []

    # Split the multi-record FASTA into one file per record under fasta/
    cmd = "faSplit byname {0} {1}/".format(fastafile, fastadir)
    sh(cmd, outfile="/dev/null", errfile="/dev/null")

    # tbl2asn command template, filled per record below
    acmd = 'tbl2asn -a z -p fasta -r {sqndir}'
    acmd += ' -i {splitfile} -t {sbtfile} -C tigr'
    acmd += ' -j "{qualifiers}"'
    acmd += ' -A {accession_nv} -o {sqndir}/{accession_nv}.sqn -V Vbr'
    acmd += ' -y "{comment}" -W T -T T'

    qq = "[tech=htgs {phase}] [organism=Medicago truncatula] [strain=A17]"

    nupdated = 0
    for row in open(phasefile):
        atoms = row.rstrip().split("\t")
        # see formats.agp.phase() for column contents
        accession, phase, clone = atoms[0], atoms[1], atoms[-1]
        fafile = op.join(fastadir, accession + ".fa")
        # Accession without the trailing version (.N)
        accession_nv = accession.split(".", 1)[0]

        newid = names[accession_nv]
        newidopt = "--newid={0}".format(newid)
        cloneopt = "--clone={0}".format(clone)
        splitfile, gaps = sequin([fafile, newidopt, cloneopt])
        splitfile = op.basename(splitfile)
        phase = int(phase)
        assert phase in (1, 2, 3)

        # Reconcile declared phase with the actual gap count:
        # gapless records are Phase-3; gapped records cannot be Phase-3
        oldphase = phase
        if gaps == 0 and phase != 3:
            phase = 3

        if gaps != 0 and phase == 3:
            phase = 2

        print >> newphasefw, "{0}\t{1}\t{2}".\
                format(accession_nv, oldphase, phase)
        newph.append(phase)

        qualifiers = qq.format(phase=phase)
        # Semicolon in the clone field marks a pooled multi-clone record
        if ";" in clone:
            qualifiers += " [keyword=HTGS_POOLED_MULTICLONE]"

        cmd = acmd.format(accession=accession, accession_nv=accession_nv,
                sqndir=sqndir, sbtfile=sbtfile, splitfile=splitfile,
                qualifiers=qualifiers, comment=comment)
        sh(cmd)

        verify_sqn(sqndir, accession)
        nupdated += 1

    stem_leaf_plot(newph, 1, 4, 3, title="Counts of phases after updates")
    print >> sys.stderr, "A total of {0} records updated.".format(nupdated)
def phytozome(args): """ %prog phytozome species Retrieve genomes and annotations from phytozome using Globus API. Available species listed below. Use comma to give a list of species to download. For example: $ %prog phytozome Athaliana,Vvinifera,Osativa,Sbicolor,Slycopersicum The downloader will prompt you to enter Phytozome user name and password during downloading. Please register for a login at: https://phytozome.jgi.doe.gov/pz/portal.html. """ from jcvi.apps.biomart import GlobusXMLParser p = OptionParser(phytozome.__doc__) p.add_option( "--version", default="12", choices=("9", "10", "11", "12", "12_unrestricted"), help="Phytozome version", ) p.add_option( "--assembly", default=False, action="store_true", help="Download assembly [default: %default]", ) p.add_option( "--format", default=False, action="store_true", help="Format to CDS and BED for synteny inference", ) opts, args = p.parse_args(args) cookies = get_cookies() directory_listing = ".phytozome_directory_V{}.xml".format(opts.version) # Get directory listing base_url = "http://genome.jgi.doe.gov" dlist = "{}/ext-api/downloads/get-directory?organism=PhytozomeV{}".format( base_url, opts.version) d = download(dlist, filename=directory_listing, cookies=cookies) g = GlobusXMLParser(directory_listing) genomes = g.get_genomes() valid_species = genomes.keys() species_tile = tile(valid_species) p.set_usage("\n".join((phytozome.__doc__, species_tile))) if len(args) != 1: sys.exit(not p.print_help()) species, = args if species == "all": species = ",".join(valid_species) species = species.split(",") for s in species: res = download_species_phytozome(genomes, s, valid_species, base_url, cookies, assembly=opts.assembly) if not res: logging.error("No files downloaded") gff, fa = res.get("gff"), res.get("cds") if opts.format: format_bed_and_cds(s, gff, fa)
def names(args):
    """
    %prog names namelist templatefile

    Generate name blocks from the `namelist` file. The `namelist` file is
    tab-delimited that contains >=4 columns of data. Three columns are mandatory.
    First name, middle initial and last name. First row is table header. For the
    extra columns, the first column will go in the `$N0` field in the template
    file, second to the `$N1` field, etc.

    In the alternative mode, the namelist just contains several sections. First
    row will go in the `$N0` in the template file, second to the `$N1` field.
    The namelist may look like:

    [Sequence]
    Bruce A. Roe,  Frederic Debelle, Giles Oldroyd, Rene Geurts
    [Manuscript]
    Haibao Tang1, Vivek Krishnakumar1, Shelby Bidwell1, Benjamin Rosen1

    Then in this example Sequence section goes into N0, Manuscript goes into N1.

    Useful hints for constructing the template file can be found in:
    <http://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/asn_spec/seq.asn.html>

    Often the template file can be retrieved from web form:
    <http://www.ncbi.nlm.nih.gov/WebSub/template.cgi>
    """
    p = OptionParser(names.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(p.print_help())

    namelist, templatefile = args

    # First check the alternative (sectioned) format; startswith also handles
    # an empty file gracefully where read()[0] would raise IndexError
    if open(namelist).read().startswith('['):
        out = parse_names(namelist)
        make_template(templatefile, out)
        return

    reader = csv.reader(open(namelist), delimiter="\t")
    header = reader.next()
    ncols = len(header)
    assert ncols > 3
    nextras = ncols - 3

    blocks = []   # one formatted name block per person
    bools = []    # per person: Y/N flags for each extra column
    for row in reader:
        first, middle, last = row[:3]
        extras = row[3:]
        bools.append([(x.upper() == 'Y') for x in extras])
        middle = middle.strip()
        if middle != "":
            middle = middle.rstrip('.') + '.'
        initials = "{0}.{1}".format(first[0], middle)
        suffix = ""
        nameblock = NameTemplate.format(last=last, first=first,
                initials=initials, suffix=suffix)
        blocks.append(nameblock)

    # Transpose: one tuple of per-person flags per extra column
    selected_idx = zip(*bools)
    # Was `out = [] * nextras`, which is always just [] (list repetition of an
    # empty list is a no-op); it only worked because append() follows
    out = []
    for i, sbools in enumerate(selected_idx):
        selected = []
        for b, ss in zip(blocks, sbools):
            if ss:
                selected.append(b)
        bigblock = ",\n".join(selected)
        out.append(bigblock)
        logging.debug("List N{0} contains a total of {1} names.".format(
            i, len(selected)))

    make_template(templatefile, out)
def astat(args): """ %prog astat coverage.log Create coverage-rho scatter plot. """ p = OptionParser(astat.__doc__) p.add_option("--cutoff", default=1000, type="int", help="Length cutoff [default: %default]") p.add_option("--genome", default="", help="Genome name [default: %default]") p.add_option("--arrDist", default=False, action="store_true", help="Use arrDist instead [default: %default]") opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) covfile, = args cutoff = opts.cutoff genome = opts.genome plot_arrDist = opts.arrDist suffix = ".{0}".format(cutoff) small_covfile = covfile + suffix update_covfile = need_update(covfile, small_covfile) if update_covfile: fw = open(small_covfile, "w") else: logging.debug("Found `{0}`, will use this one".format(small_covfile)) covfile = small_covfile fp = open(covfile) header = fp.next() if update_covfile: fw.write(header) data = [] msg = "{0} tigs scanned ..." for row in fp: tigID, rho, covStat, arrDist = row.split() tigID = int(tigID) if tigID % 1000000 == 0: sys.stderr.write(msg.format(tigID) + "\r") rho, covStat, arrDist = [float(x) for x in (rho, covStat, arrDist)] if rho < cutoff: continue if update_covfile: fw.write(row) data.append((tigID, rho, covStat, arrDist)) print >> sys.stderr, msg.format(tigID) from jcvi.graphics.base import plt, savefig logging.debug("Plotting {0} data points.".format(len(data))) tigID, rho, covStat, arrDist = zip(*data) y = arrDist if plot_arrDist else covStat ytag = "arrDist" if plot_arrDist else "covStat" fig = plt.figure(1, (7, 7)) ax = fig.add_axes([.12, .1, .8, .8]) ax.plot(rho, y, ".", color="lightslategrey") xtag = "rho" info = (genome, xtag, ytag) title = "{0} {1} vs. {2}".format(*info) ax.set_title(title) ax.set_xlabel(xtag) ax.set_ylabel(ytag) if plot_arrDist: ax.set_yscale('log') imagename = "{0}.png".format(".".join(info)) savefig(imagename, dpi=150)
def entrez(args):
    """
    %prog entrez <filename|term>

    `filename` contains a list of terms to search. Or just one term. If the
    results are small in size, e.g. "--format=acc", use "--batchsize=100" to speed
    the download.
    """
    p = OptionParser(entrez.__doc__)

    # Which Entrez databases may serve each output format
    allowed_databases = {
        "fasta": ["genome", "nuccore", "nucgss", "protein", "nucest"],
        "asn.1": ["genome", "nuccore", "nucgss", "protein", "gene"],
        "xml": ["genome", "nuccore", "nucgss", "nucest", "gene"],
        "gb": ["genome", "nuccore", "nucgss"],
        "est": ["nucest"],
        "gss": ["nucgss"],
        "acc": ["nuccore"],
    }

    valid_formats = tuple(allowed_databases.keys())
    valid_databases = ("genome", "nuccore", "nucest", "nucgss", "protein", "gene")

    p.add_option(
        "--noversion",
        dest="noversion",
        default=False,
        action="store_true",
        help="Remove trailing accession versions",
    )
    p.add_option(
        "--format",
        default="fasta",
        choices=valid_formats,
        help="download format [default: %default]",
    )
    p.add_option(
        "--database",
        default="nuccore",
        choices=valid_databases,
        help="search database [default: %default]",
    )
    p.add_option(
        "--retmax",
        default=1000000,
        type="int",
        help="how many results to return [default: %default]",
    )
    p.add_option(
        "--skipcheck",
        default=False,
        action="store_true",
        help="turn off prompt to check file existence [default: %default]",
    )
    p.add_option(
        "--batchsize",
        default=500,
        type="int",
        help="download the results in batch for speed-up [default: %default]",
    )
    p.set_outdir(outdir=None)
    p.add_option("--outprefix", default="out",
                 help="output file name prefix [default: %default]")
    p.set_email()

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(p.print_help())

    filename, = args
    if op.exists(filename):
        # A file of search terms, one per line
        pf = filename.rsplit(".", 1)[0]
        list_of_terms = [row.strip() for row in open(filename)]
        if opts.noversion:
            list_of_terms = [x.rsplit(".", 1)[0] for x in list_of_terms]
    else:
        pf = filename
        # the filename is the search term
        list_of_terms = [filename.strip()]

    fmt = opts.format
    database = opts.database
    batchsize = opts.batchsize

    assert (database in allowed_databases[fmt]
            ), "For output format '{0}', allowed databases are: {1}".format(
                fmt, allowed_databases[fmt])
    assert batchsize >= 1, "batchsize must >= 1"

    if " " in pf:
        pf = opts.outprefix

    outfile = "{0}.{1}".format(pf, fmt)

    outdir = opts.outdir
    if outdir:
        mkdir(outdir)

    # If noprompt, will not check file existence
    if not outdir:
        fw = must_open(outfile, "w", checkexists=True,
                       skipcheck=opts.skipcheck)
        if fw is None:
            return

    seen = set()
    totalsize = 0
    for id, size, term, handle in batch_entrez(
            list_of_terms,
            retmax=opts.retmax,
            rettype=fmt,
            db=database,
            batchsize=batchsize,
            email=opts.email,
    ):
        if outdir:
            # One output file per term when an outdir is given
            outfile = urljoin(outdir, "{0}.{1}".format(term, fmt))
            fw = must_open(outfile, "w", checkexists=True,
                           skipcheck=opts.skipcheck)
            if fw is None:
                continue

        rec = handle.read()
        if id in seen:
            # Log the duplicated key, not the whole record body (previously
            # formatted `rec`, dumping the entire download into the log)
            logging.error("Duplicate key ({0}) found".format(id))
            continue

        totalsize += size
        print(rec, file=fw)
        print(file=fw)

        seen.add(id)

    if seen:
        print(
            "A total of {0} {1} records downloaded.".format(
                totalsize, fmt.upper()),
            file=sys.stderr,
        )

    return outfile
def fasta(args):
    """
    %prog fasta fastafile

    Convert reads formatted as FASTA file, and convert to CA frg file. If .qual
    file is found, then use it, otherwise just make a fake qual file. Mates are
    assumed as adjacent sequence records (i.e. /1, /2, /1, /2 ...) unless a
    matefile is given.
    """
    from jcvi.formats.fasta import clean, make_qual

    p = OptionParser(fasta.__doc__)
    p.add_option("--clean", default=False, action="store_true",
            help="Clean up irregular chars in seq")
    p.add_option("--matefile", help="Matepairs file")
    p.add_option("--maxreadlen", default=0, type="int",
            help="Maximum read length allowed")
    p.add_option("--minreadlen", default=1000, type="int",
            help="Minimum read length allowed")
    p.add_option("--readname", default=False, action="store_true",
            help="Keep read name (e.g. long Pacbio name)")
    p.set_size()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    maxreadlen = opts.maxreadlen
    minreadlen = opts.minreadlen
    if maxreadlen > 0:
        # If any sequence exceeds maxreadlen, split the file and recurse on
        # each piece with --maxreadlen=0 to avoid re-splitting
        split = False
        f = Fasta(fastafile, lazy=True)
        for id, size in f.itersizes_ordered():
            if size > maxreadlen:
                logging.debug("Sequence {0} (size={1}) longer than max read len {2}".\
                        format(id, size, maxreadlen))
                split = True
                break

        if split:
            for f in split_fastafile(fastafile, maxreadlen=maxreadlen):
                fasta([f, "--maxreadlen=0"])
            return

    plate = op.basename(fastafile).split(".")[0]

    # --size=0 means unmated (fragment) reads
    mated = (opts.size != 0)
    mean, sv = get_mean_sv(opts.size)

    if mated:
        libname = "Sanger{0}Kb-".format(opts.size / 1000) + plate
    else:
        libname = plate[:2].upper()

    frgfile = libname + ".frg"

    if opts.clean:
        cleanfasta = fastafile.rsplit(".", 1)[0] + ".clean.fasta"
        if need_update(fastafile, cleanfasta):
            clean([fastafile, "--canonical", "-o", cleanfasta])
        fastafile = cleanfasta

    if mated:
        # Mated path: delegate conversion to convert-fasta-to-v2.pl with a
        # qual file (fake score 21) and a matepairs file
        qualfile = make_qual(fastafile, score=21)
        if opts.matefile:
            matefile = opts.matefile
            assert op.exists(matefile)
        else:
            matefile = make_matepairs(fastafile)

        cmd = "convert-fasta-to-v2.pl"
        cmd += " -l {0} -s {1} -q {2} ".format(libname,
                fastafile, qualfile)
        if mated:
            cmd += "-mean {0} -stddev {1} -m {2} ".format(mean, sv, matefile)

        sh(cmd, outfile=frgfile)
        return

    # Unmated path: write the frg file directly, one fragment per sequence
    fw = must_open(frgfile, "w")
    print >> fw, headerTemplate.format(libID=libname)

    # By default rename reads sequentially (libname + 100000001, ...);
    # --readname preserves the original IDs
    sequential = not opts.readname
    f = Fasta(fastafile, lazy=True)
    i = j = 0  # i: written, j: discarded (shorter than minreadlen)
    for fragID, seq in parse_fasta(fastafile):
        if len(seq) < minreadlen:
            j += 1
            continue
        i += 1
        if sequential:
            fragID = libname + str(100000000 + i)
        emitFragment(fw, fragID, libname, seq)
    fw.close()

    logging.debug("A total of {0} fragments written to `{1}` ({2} discarded).".\
            format(i, frgfile, j))
def wgsim(args): """ %prog wgsim fastafile Run dwgsim on fastafile. """ p = OptionParser(wgsim.__doc__) p.add_option("--erate", default=.02, type="float", help="Base error rate of the read [default: %default]") p.add_option( "--distance", default=500, type="int", help="Outer distance between the two ends [default: %default]") p.add_option("--genomesize", type="int", help="Genome size in Mb [default: estimate from data]") p.add_option("--readlen", default=100, type="int", help="Length of the read [default: %default]") p.add_option("--noerrors", default=False, action="store_true", help="Simulate reads with no errors [default: %default]") p.set_depth(depth=10) opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) fastafile, = args pf = fastafile.split(".")[0] genomesize = opts.genomesize size = genomesize * 1000000 if genomesize else Fasta(fastafile).totalsize depth = opts.depth readlen = opts.readlen readnum = size * depth / (2 * readlen) distance = opts.distance stdev = distance / 5 outpf = "{0}.{1}bp.{2}x".format(pf, distance, depth) distance -= 2 * readlen # Outer distance => Inner distance assert distance >= 0, "Outer distance must be >= 2 * readlen" logging.debug("Total genome size: {0} bp".format(size)) logging.debug("Target depth: {0}x".format(depth)) logging.debug("Number of read pairs (2x{0}): {1}".format(readlen, readnum)) if opts.noerrors: opts.erate = 0 cmd = "dwgsim -e {0} -E {0}".format(opts.erate) if opts.noerrors: cmd += " -r 0 -R 0 -X 0 -y 0" cmd += " -d {0} -s {1}".format(distance, stdev) cmd += " -N {0} -1 {1} -2 {1}".format(readnum, readlen) cmd += " {0} {1}".format(fastafile, outpf) sh(cmd)