def phase(accession):
    """
    Return `(phase, length)` for the GenBank record of `accession`.

    The record is read from gb/<accession>.gb; it is fetched through entrez()
    only when that file does not exist yet. Only the first record in the file
    is used. `phase` is whatever get_phase() reports for that record and
    `length` is len() of the record.
    """
    gbdir = "gb"
    gbfile = op.join(gbdir, accession + ".gb")
    if not op.exists(gbfile):  # Check to avoid redownload
        entrez([accession, "--skipcheck", "--outdir=" + gbdir, "--format=gb"])
    # Use the builtin next(): the iterator `.next()` method only exists on
    # Python 2, while next() works on both Python 2.6+ and Python 3.
    rec = next(SeqIO.parse(gbfile, "gb"))
    # get_phase() returns a (phase, keywords) pair; keywords are not needed.
    ph, keywords = get_phase(rec)
    return ph, len(rec)
def phase(accession):
    """
    Look up the phase and the sequence length of a GenBank accession.

    Reads gb/<accession>.gb (downloading it with entrez() when the file is
    absent), takes the first record and returns the tuple
    `(phase reported by get_phase(), len(record))`.
    """
    outdir = "gb"
    record_path = op.join(outdir, accession + ".gb")

    already_fetched = op.exists(record_path)
    if not already_fetched:
        fetch_args = [accession, "--skipcheck", "--outdir=" + outdir, "--format=gb"]
        entrez(fetch_args)

    records = SeqIO.parse(record_path, "gb")
    first_record = next(records)

    # get_phase() yields a pair; only the first member is returned to callers.
    record_phase, _keywords = get_phase(first_record)
    return record_phase, len(first_record)
def _get_records(self):
    """
    Download the GenBank records listed in `self.idfile` into a fresh gb/
    folder and return the folder name.

    When mkdir() reports that the folder was not created (presumably because
    it already exists), the current gb/ is moved aside to gb_old, replacing
    any previous gb_old, and gb/ is created again.
    """
    gbdir = "gb"
    dirmade = mkdir(gbdir)
    if not dirmade:
        sh("rm -rf {0}_old; mv -f {0} {0}_old".format(gbdir))
        # Run mkdir() outside of the assert statement: asserts are removed
        # under `python -O`, which would silently skip creating the folder.
        dirmade = mkdir(gbdir)
        assert dirmade, "Cannot create folder {0}".format(gbdir)

    entrez(
        [
            self.idfile,
            "--format=gb",
            "--database=nuccore",
            "--outdir={0}".format(gbdir),
        ]
    )

    logging.debug("GenBank records written to {0}.".format(gbdir))
    return gbdir
def bes(args):
    """
    %prog bes bacfasta clonename

    Use the clone name to download BES gss sequences from Genbank, map and then
    visualize.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.apps.align import run_blat

    p = OptionParser(bes.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bacfasta, clonename = args

    # Download goes to <clonename>.fasta, which is then aligned to the BAC
    entrez([clonename, "--database=nucgss", "--skipcheck"])

    besfasta = clonename + ".fasta"
    blatfile = clonename + ".bes.blat"
    run_blat(
        infile=besfasta,
        outfile=blatfile,
        db=bacfasta,
        pctid=95,
        hitlen=100,
        cpus=opts.cpus,
    )

    # Only the first record of the BAC fasta is drawn
    aid, asize = next(Fasta(bacfasta).itersizes())

    # Header line: the BAC drawn as a bar of `width` characters
    width = 50
    msg = "=" * width
    msg += " " + aid
    print(msg, file=sys.stderr)

    ratio = width * 1.0 / asize

    def scale(x):
        # Convert a base-pair coordinate on the BAC to a text column
        return int(round(x * ratio, 0))

    # Close the BLAT output once parsed instead of leaking the handle
    with open(blatfile) as fp:
        blasts = [BlastLine(x) for x in fp]

    for b in blasts:
        forward = b.orientation == "+"
        if forward:
            msg = " " * scale(b.sstart) + "->"
        else:
            msg = " " * (scale(b.sstop) - 2) + "<-"
        # Pad so that the read names line up right after the bar
        msg += " " * (width - len(msg) + 2)
        msg += b.query
        # hang: number of BAC bases lying beyond the end of the hit
        if forward:
            msg += " (hang={0})".format(b.sstart - 1)
        else:
            msg += " (hang={0})".format(asize - b.sstop)

        print(msg, file=sys.stderr)
def blast(args):
    """
    %prog blast allfasta clonename

    Insert a component into agpfile by aligning to the best hit in pool and see
    if they have good overlaps.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.apps.align import run_megablast

    p = OptionParser(blast.__doc__)
    p.add_option(
        "-n", type="int", default=2, help="Take best N hits [default: %default]"
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    allfasta, clonename = args
    fastadir = "fasta"
    infile = op.join(fastadir, clonename + ".fasta")
    if not op.exists(infile):  # Check to avoid redownload
        entrez([clonename, "--skipcheck", "--outdir=" + fastadir])

    outfile = "{0}.{1}.blast".format(clonename, allfasta.split(".")[0])
    run_megablast(
        infile=infile,
        outfile=outfile,
        db=allfasta,
        pctid=GoodPct,
        hitlen=GoodOverlap,
    )

    # Close the BLAST output once parsed instead of leaking the handle
    with open(outfile) as fp:
        blasts = [BlastLine(x) for x in fp]

    # Collect up to N distinct subjects, in the order they appear in the file
    besthits = []
    for b in blasts:
        # For pipe-delimited ids keep the 4th field, then drop the trailing
        # ".<suffix>" so that query and subject can be compared by name
        if b.query.count("|") >= 3:
            b.query = b.query.split("|")[3]

        if b.subject.count("|") >= 3:
            b.subject = b.subject.split("|")[3]

        b.query = b.query.rsplit(".", 1)[0]
        b.subject = b.subject.rsplit(".", 1)[0]

        if b.query == b.subject:  # skip self hits
            continue

        if b.subject not in besthits:
            besthits.append(b.subject)
        if len(besthits) == opts.n:
            break

    for b in besthits:
        overlap([clonename, b, "--dir=" + fastadir])
def bes(args):
    """
    %prog bes bacfasta clonename

    Use the clone name to download BES gss sequences from Genbank, map and then
    visualize.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.apps.align import run_blat

    p = OptionParser(bes.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bacfasta, clonename = args

    # Download goes to <clonename>.fasta, which is then aligned to the BAC
    entrez([clonename, "--database=nucgss", "--skipcheck"])

    besfasta = clonename + ".fasta"
    blatfile = clonename + ".bes.blat"
    run_blat(
        infile=besfasta,
        outfile=blatfile,
        db=bacfasta,
        pctid=95,
        hitlen=100,
        cpus=opts.cpus,
    )

    # Builtin next() replaces the Python-2-only `.next()` iterator method.
    # Only the first record of the BAC fasta is drawn.
    aid, asize = next(Fasta(bacfasta).itersizes())

    # Header line: the BAC drawn as a bar of `width` characters.
    # print(..., file=...) replaces the Python 2 `print >> stream` statement.
    width = 50
    msg = "=" * width
    msg += " " + aid
    print(msg, file=sys.stderr)

    ratio = width * 1.0 / asize

    def to_column(x):
        # Convert a base-pair coordinate on the BAC to a text column
        return int(round(x * ratio, 0))

    # Close the BLAT output once parsed instead of leaking the handle
    with open(blatfile) as fp:
        blasts = [BlastLine(x) for x in fp]

    for b in blasts:
        if b.orientation == "+":
            msg = " " * to_column(b.sstart) + "->"
        else:
            msg = " " * (to_column(b.sstop) - 2) + "<-"
        # Pad so that the read names line up right after the bar
        msg += " " * (width - len(msg) + 2)
        msg += b.query
        # hang: number of BAC bases lying beyond the end of the hit
        if b.orientation == "+":
            msg += " (hang={0})".format(b.sstart - 1)
        else:
            msg += " (hang={0})".format(asize - b.sstop)

        print(msg, file=sys.stderr)
def certificate(args):
    """
    %prog certificate tpffile certificatefile

    Generate certificate file for all overlaps in tpffile. tpffile can be
    generated by jcvi.formats.agp.tpf().

    North  chr1  2  0  AC229737.8  telomere     58443
    South  chr1  2  1  AC229737.8  AC202463.29  58443  37835  58443  + Non-terminal

    Each line describes a relationship between the current BAC and the
    north/south BAC. First, "North/South" tag, then the chromosome, phases of
    the two BACs, ids of the two BACs, the size and the overlap start-stop of
    the CURRENT BAC, and orientation. Each BAC will have two lines in the
    certificate file.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(certificate.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    tpffile, certificatefile = args
    fastadir = "fasta"

    tpf = TPF(tpffile)

    # Load the lines of an existing certificate BEFORE the file is reopened
    # for writing, so that already computed overlaps can be reused.
    data = check_certificate(certificatefile)
    fw = must_open(certificatefile, "w")
    for i, a in enumerate(tpf):
        if a.is_gap:
            continue

        aid = a.component_id

        af = op.join(fastadir, aid + ".fasta")
        if not op.exists(af):  # Check to avoid redownload
            entrez([aid, "--skipcheck", "--outdir=" + fastadir])

        north, south = tpf.getNorthSouthClone(i)
        aphase, asize = phase(aid)

        # `neighbor` (not `p`) so that the option parser is not shadowed
        for tag, neighbor in (("North", north), ("South", south)):
            if not neighbor:  # end of the chromosome
                # The neighbor phase was previously left unassigned here
                # (NameError, or a stale value from an earlier iteration).
                # The usage example shows "0" in that column for telomeres.
                bphase = "0"
                ov = "telomere\t{0}".format(asize)
            elif neighbor.isCloneGap:
                bphase = "0"
                ov = "{0}\t{1}".format(neighbor.gap_type, asize)
            else:
                bid = neighbor.component_id
                bphase, bsize = phase(bid)
                key = (tag, aid, bid)
                if key in data:
                    # Reuse the line from the previous certificate verbatim
                    print(data[key], file=fw)
                    continue

                ar = [aid, bid, "--dir=" + fastadir]
                o = overlap(ar)
                if o:
                    ov = o.certificateline
                else:
                    ov = "{0}\t{1}\tNone".format(bid, asize)

            print(
                "\t".join(str(x) for x in (tag, a.object, aphase, bphase, aid, ov)),
                file=fw,
            )
            fw.flush()
def overlap(args):
    """
    %prog overlap <a|a.fasta> <b|b.fasta>

    Check overlaps between two fasta records. The arguments can be genBank IDs
    instead of FASTA files. In case of IDs, the sequences will be downloaded
    first.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    # Returns the Overlap object built from the first HSP, or None when no
    # HSP passes the filters.
    from jcvi.formats.blast import chain_HSPs

    p = OptionParser(overlap.__doc__)
    p.add_option(
        "--dir",
        default=os.getcwd(),
        help="Download sequences to dir [default: %default]",
    )
    p.add_option(
        "--suffix",
        default="fasta",
        help="Suffix of the sequence file in dir [default: %default]",
    )
    p.add_option(
        "--qreverse",
        default=False,
        action="store_true",
        help="Reverse seq a [default: %default]",
    )
    p.add_option(
        "--nochain",
        default=False,
        action="store_true",
        help="Do not chain adjacent HSPs [default: chain HSPs]",
    )
    p.set_align(pctid=GoodPct, hitlen=GoodOverlap, evalue=0.01)
    p.set_outfile(outfile=None)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    afasta, bfasta = args
    seqdir = opts.dir  # named `seqdir` so that builtin dir() is not shadowed
    chain = not opts.nochain
    suffix = opts.suffix
    evalue = opts.evalue
    pctid = opts.pctid
    hitlen = opts.hitlen
    cutoff = Cutoff(pctid, hitlen)

    def locate(name):
        # Check first whether it is file or accession name. An accession is
        # mapped to <dir>/<name>.<suffix> and downloaded when missing.
        if op.exists(name):
            return name
        path = op.join(seqdir, ".".join((name, suffix)))
        if not op.exists(path):  # Check to avoid redownload
            entrez([name, "--skipcheck", "--outdir=" + seqdir])
        return path

    afasta = locate(afasta)
    bfasta = locate(bfasta)

    assert op.exists(afasta) and op.exists(bfasta)

    cmd = "blastn -dust no"
    cmd += " -query {0} -subject {1}".format(afasta, bfasta)
    cmd += " -evalue {0} -outfmt 6 -perc_identity {1}".format(evalue, pctid)
    fp = popen(cmd)
    try:
        hsps = fp.readlines()
    finally:
        fp.close()  # release the pipe once the output has been read

    hsps = [BlastLine(x) for x in hsps]
    hsps = [x for x in hsps if x.hitlen >= hitlen]
    if chain:
        logging.debug("Chain HSPs in the Blast output.")
        dist = 2 * hitlen  # Distance to chain the HSPs
        hsps = chain_HSPs(hsps, xdist=dist, ydist=dist)

    if not hsps:
        # print(..., file=...) replaces the Python 2 `print >> stream` form
        print("No match found.", file=sys.stderr)
        return None

    besthsp = hsps[0]

    # Builtin next() replaces the Python-2-only `.next()` iterator method;
    # the size of the first record of each file is used.
    aid, asize = next(Fasta(afasta).itersizes())
    bid, bsize = next(Fasta(bfasta).itersizes())
    o = Overlap(besthsp, asize, bsize, cutoff, qreverse=opts.qreverse)
    o.print_graphic()

    if opts.outfile:
        fw = must_open(opts.outfile, "w")
        print(str(o), file=fw)
        fw.close()

    return o
def htg(args):
    """
    %prog htg fastafile template.sbt

    Prepare sqnfiles for Genbank HTG submission to update existing records.

    `fastafile` contains the records to update, multiple records are allowed
    (with each one generating separate sqn file in the sqn/ folder). The record
    defline has the accession ID. For example,
    >AC148290.3

    Internally, this generates two additional files (phasefile and namesfile)
    and download records from Genbank. Below is implementation details:

    `phasefile` contains, for each accession, phase information. For example:
    AC148290.3      3       HTG     2       mth2-45h12

    which means this is a Phase-3 BAC. Record with only a single contig will be
    labeled as Phase-3 regardless of the info in the `phasefile`. Template file
    is the Genbank sbt template. See jcvi.formats.sbt for generation of such
    files.

    Another problem is that Genbank requires the name of the sequence to stay
    the same when updating and will kick back with a table of name conflicts.
    For example:

    We are unable to process the updates for these entries
    for the following reason:

    Seqname has changed

    Accession Old seq_name New seq_name
    --------- ------------ ------------
    AC239792 mtg2_29457 AC239792.1

    To prepare a submission, this script downloads genbank and asn.1 format,
    and generate the phase file and the names file (use formats.agp.phase() and
    apps.gbsubmit.asn(), respectively). These get automatically run. However,
    use --phases if the genbank files contain outdated information. For example,
    the clone name changes or phase upgrades. In this case, run
    formats.agp.phase() manually, modify the phasefile and use --phases to
    override.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.formats.fasta import sequin, ids
    from jcvi.formats.agp import phase
    from jcvi.apps.fetch import entrez

    p = OptionParser(htg.__doc__)
    p.add_option(
        "--phases",
        default=None,
        help="Use another phasefile to override [default: %default]",
    )
    p.add_option(
        "--comment", default="", help="Comments for this update [default: %default]"
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, sbtfile = args
    pf = fastafile.rsplit(".", 1)[0]

    idsfile = pf + ".ids"
    phasefile = pf + ".phases"
    namesfile = pf + ".names"

    ids([fastafile, "--outfile={0}".format(idsfile)])

    # Names file: built from the asn.1 records downloaded for every id
    asndir = "asn.1"
    mkdir(asndir)
    entrez([idsfile, "--format=asn.1", "--outdir={0}".format(asndir)])
    asn(glob("{0}/*".format(asndir)) + ["--outfile={0}".format(namesfile)])

    # Phase file: built from the GenBank records unless --phases overrides it
    if opts.phases is None:
        gbdir = "gb"
        mkdir(gbdir)
        entrez([idsfile, "--format=gb", "--outdir={0}".format(gbdir)])
        phase(glob("{0}/*".format(gbdir)) + ["--outfile={0}".format(phasefile)])
    else:
        phasefile = opts.phases

    assert op.exists(namesfile) and op.exists(phasefile)

    newphasefile = phasefile + ".new"
    comment = opts.comment

    fastadir = "fasta"
    sqndir = "sqn"
    mkdir(fastadir)
    mkdir(sqndir)

    from jcvi.graphics.histogram import stem_leaf_plot

    names = DictFile(namesfile)
    # Names must map one-to-one
    assert len(set(names.keys())) == len(set(names.values()))

    phases = DictFile(phasefile)
    ph = [int(x) for x in phases.values()]
    # vmin 1, vmax 4, bins 3
    stem_leaf_plot(ph, 1, 4, 3, title="Counts of phases before updates")
    logging.debug("Information loaded for {0} records.".format(len(phases)))
    assert len(names) == len(phases)

    newph = []

    # One fasta file per record under fasta/
    cmd = "faSplit byname {0} {1}/".format(fastafile, fastadir)
    sh(cmd, outfile="/dev/null", errfile="/dev/null")

    acmd = "tbl2asn -a z -p fasta -r {sqndir}"
    acmd += " -i {splitfile} -t {sbtfile} -C tigr"
    acmd += ' -j "{qualifiers}"'
    acmd += " -A {accession_nv} -o {sqndir}/{accession_nv}.sqn -V Vbr"
    acmd += ' -y "{comment}" -W T -T T'

    qq = "[tech=htgs {phase}] [organism=Medicago truncatula] [strain=A17]"

    nupdated = 0
    # Both files are managed by `with` so that the handles are closed (and the
    # new phase file is fully written) even when a record fails midway.
    with open(newphasefile, "w") as newphasefw, open(phasefile) as phasefp:
        for row in phasefp:
            atoms = row.rstrip().split("\t")
            # see formats.agp.phase() for column contents
            accession, clone = atoms[0], atoms[-1]
            # `oldphase`/`newphase` avoid shadowing the imported phase()
            oldphase = int(atoms[1])

            fafile = op.join(fastadir, accession + ".fa")
            accession_nv = accession.split(".", 1)[0]  # accession, no version

            newid = names[accession_nv]
            newidopt = "--newid={0}".format(newid)
            cloneopt = "--clone={0}".format(clone)
            splitfile, gaps = sequin([fafile, newidopt, cloneopt])
            splitfile = op.basename(splitfile)

            assert oldphase in (1, 2, 3)

            # The gap count decides between phase 3 and a lower phase:
            # no gaps -> phase 3; gaps in a phase 3 record -> phase 2.
            newphase = oldphase
            if gaps == 0 and newphase != 3:
                newphase = 3

            if gaps != 0 and newphase == 3:
                newphase = 2

            print(
                "{0}\t{1}\t{2}".format(accession_nv, oldphase, newphase),
                file=newphasefw,
            )
            newph.append(newphase)

            qualifiers = qq.format(phase=newphase)
            if ";" in clone:  # several clone names in one record
                qualifiers += " [keyword=HTGS_POOLED_MULTICLONE]"

            cmd = acmd.format(
                accession=accession,
                accession_nv=accession_nv,
                sqndir=sqndir,
                sbtfile=sbtfile,
                splitfile=splitfile,
                qualifiers=qualifiers,
                comment=comment,
            )
            sh(cmd)

            verify_sqn(sqndir, accession)
            nupdated += 1

    stem_leaf_plot(newph, 1, 4, 3, title="Counts of phases after updates")
    print("A total of {0} records updated.".format(nupdated), file=sys.stderr)
def htg(args):
    """
    %prog htg fastafile template.sbt

    Prepare sqnfiles for Genbank HTG submission to update existing records.

    `fastafile` contains the records to update, multiple records are allowed
    (with each one generating separate sqn file in the sqn/ folder). The record
    defline has the accession ID. For example,
    >AC148290.3

    Internally, this generates two additional files (phasefile and namesfile)
    and download records from Genbank. Below is implementation details:

    `phasefile` contains, for each accession, phase information. For example:
    AC148290.3      3       HTG     2       mth2-45h12

    which means this is a Phase-3 BAC. Record with only a single contig will be
    labeled as Phase-3 regardless of the info in the `phasefile`. Template file
    is the Genbank sbt template. See jcvi.formats.sbt for generation of such
    files.

    Another problem is that Genbank requires the name of the sequence to stay
    the same when updating and will kick back with a table of name conflicts.
    For example:

    We are unable to process the updates for these entries
    for the following reason:

    Seqname has changed

    Accession Old seq_name New seq_name
    --------- ------------ ------------
    AC239792 mtg2_29457 AC239792.1

    To prepare a submission, this script downloads genbank and asn.1 format,
    and generate the phase file and the names file (use formats.agp.phase() and
    apps.gbsubmit.asn(), respectively). These get automatically run. However,
    use --phases if the genbank files contain outdated information. For example,
    the clone name changes or phase upgrades. In this case, run
    formats.agp.phase() manually, modify the phasefile and use --phases to
    override.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.formats.fasta import sequin, ids
    from jcvi.formats.agp import phase
    from jcvi.apps.fetch import entrez
    from jcvi.graphics.histogram import stem_leaf_plot

    p = OptionParser(htg.__doc__)
    p.add_option(
        "--phases",
        default=None,
        help="Use another phasefile to override",
    )
    p.add_option("--comment", default="", help="Comments for this update")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, sbtfile = args
    pf = fastafile.rsplit(".", 1)[0]

    idsfile = pf + ".ids"
    phasefile = pf + ".phases"
    namesfile = pf + ".names"

    ids([fastafile, "--outfile={0}".format(idsfile)])

    # Step 1: asn.1 records -> names file
    asndir = "asn.1"
    mkdir(asndir)
    entrez([idsfile, "--format=asn.1", "--outdir={0}".format(asndir)])
    asn(glob("{0}/*".format(asndir)) + ["--outfile={0}".format(namesfile)])

    # Step 2: GenBank records -> phase file, unless --phases supplies one
    if opts.phases is None:
        gbdir = "gb"
        mkdir(gbdir)
        entrez([idsfile, "--format=gb", "--outdir={0}".format(gbdir)])
        phase(glob("{0}/*".format(gbdir)) + ["--outfile={0}".format(phasefile)])
    else:
        phasefile = opts.phases

    assert op.exists(namesfile) and op.exists(phasefile)

    newphasefile = phasefile + ".new"
    comment = opts.comment

    fastadir = "fasta"
    sqndir = "sqn"
    mkdir(fastadir)
    mkdir(sqndir)

    names = DictFile(namesfile)
    # Names must map one-to-one
    assert len(set(names.keys())) == len(set(names.values()))

    phases = DictFile(phasefile)
    ph = [int(x) for x in phases.values()]
    # vmin 1, vmax 4, bins 3
    stem_leaf_plot(ph, 1, 4, 3, title="Counts of phases before updates")
    logging.debug("Information loaded for {0} records.".format(len(phases)))
    assert len(names) == len(phases)

    # Step 3: split the input into one fasta file per record under fasta/
    cmd = "faSplit byname {0} {1}/".format(fastafile, fastadir)
    sh(cmd, outfile="/dev/null", errfile="/dev/null")

    acmd = "tbl2asn -a z -p fasta -r {sqndir}"
    acmd += " -i {splitfile} -t {sbtfile} -C tigr"
    acmd += ' -j "{qualifiers}"'
    acmd += " -A {accession_nv} -o {sqndir}/{accession_nv}.sqn -V Vbr"
    acmd += ' -y "{comment}" -W T -T T'

    qq = "[tech=htgs {phase}] [organism=Medicago truncatula] [strain=A17]"

    # Step 4: one tbl2asn run per row of the phase file.
    # The handles used to be left open; try/finally and `with` now guarantee
    # that they are closed, which also flushes the new phase file.
    newph = []
    nupdated = 0
    newphasefw = open(newphasefile, "w")
    try:
        with open(phasefile) as fp:
            for row in fp:
                atoms = row.rstrip().split("\t")
                # see formats.agp.phase() for column contents
                accession, phase_field, clone = atoms[0], atoms[1], atoms[-1]
                fafile = op.join(fastadir, accession + ".fa")
                accession_nv = accession.split(".", 1)[0]  # version removed

                newid = names[accession_nv]
                newidopt = "--newid={0}".format(newid)
                cloneopt = "--clone={0}".format(clone)
                splitfile, gaps = sequin([fafile, newidopt, cloneopt])
                splitfile = op.basename(splitfile)

                # Separate names so the imported phase() is not shadowed
                oldphase = int(phase_field)
                assert oldphase in (1, 2, 3)

                # Reconcile the recorded phase with the gap count: a record
                # without gaps is phase 3, a phase 3 record with gaps drops
                # to phase 2.
                updated = oldphase
                if gaps == 0 and updated != 3:
                    updated = 3

                if gaps != 0 and updated == 3:
                    updated = 2

                print(
                    "{0}\t{1}\t{2}".format(accession_nv, oldphase, updated),
                    file=newphasefw,
                )
                newph.append(updated)

                qualifiers = qq.format(phase=updated)
                if ";" in clone:  # several clone names in one record
                    qualifiers += " [keyword=HTGS_POOLED_MULTICLONE]"

                cmd = acmd.format(
                    accession=accession,
                    accession_nv=accession_nv,
                    sqndir=sqndir,
                    sbtfile=sbtfile,
                    splitfile=splitfile,
                    qualifiers=qualifiers,
                    comment=comment,
                )
                sh(cmd)

                verify_sqn(sqndir, accession)
                nupdated += 1
    finally:
        newphasefw.close()

    stem_leaf_plot(newph, 1, 4, 3, title="Counts of phases after updates")
    print("A total of {0} records updated.".format(nupdated), file=sys.stderr)