def omgprepare(args):
    """
    %prog omgprepare ploidy anchorsfile blastfile

    Prepare to run Sankoff's OMG algorithm to get orthologs.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so additional documentation is kept in comments.
    #
    # args: list of command-line tokens (options followed by three
    #       positionals: ploidy file, anchors file, BLAST file).
    # Side effects: calls cscore() with "-o <blast prefix>.cscore" and writes
    #       "<blast prefix>.weights", one "a<TAB>b<TAB>weight" line per
    #       retained pair.
    # Exits through sys.exit() after printing help unless exactly three
    #       positional arguments are given.
    from jcvi.formats.blast import cscore
    from jcvi.formats.base import DictFile

    p = OptionParser(omgprepare.__doc__)
    p.add_option("--norbh", action="store_true",
                 help="Disable RBH hits [default: %default]")
    p.add_option("--pctid", default=0, type="int",
                 help="Percent id cutoff for RBH hits [default: %default]")
    p.add_option("--cscore", default=90, type="int",
                 help="C-score cutoff for RBH hits [default: %default]")
    p.set_stripnames()
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ploidy, anchorfile, blastfile = args
    norbh = opts.norbh
    pctid = opts.pctid
    cs = opts.cscore
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    # Map the first whitespace-separated token of each ploidy line to its
    # zero-based line index.
    with open(ploidy) as fp:
        genomeidx = dict((x.split()[0], i) for i, x in enumerate(fp))

    ploidy = DictFile(ploidy)

    geneinfo(qbed, qorder, genomeidx, ploidy)
    geneinfo(sbed, sorder, genomeidx, ploidy)

    pf = blastfile.rsplit(".", 1)[0]
    cscorefile = pf + ".cscore"
    cscore([blastfile, "-o", cscorefile, "--cutoff=0", "--pct"])
    ac = AnchorFile(anchorfile)
    pairs = set((a, b) for a, b, i in ac.iter_pairs())
    logging.debug("Imported {0} pairs from `{1}`.".format(len(pairs), anchorfile))

    weightsfile = pf + ".weights"
    npairs = 0
    # Context managers close both handles even if a row fails to parse.
    with open(cscorefile) as fp:
        with open(weightsfile, "w") as fw:
            for row in fp:
                a, b, c, pct = row.split()
                c, pct = float(c), float(pct)
                c = int(c * 100)
                if (a, b) not in pairs:
                    # Pairs absent from the anchors file must pass the
                    # RBH filters before they are written.
                    if norbh:
                        continue
                    if c < cs:
                        continue
                    if pct < pctid:
                        continue
                    # Floor division keeps the weight an integer on both
                    # Python 2 and Python 3.
                    c //= 10  # This severely penalizes RBH against synteny

                # fw.write() works on Python 2 and 3, unlike `print >> fw`.
                fw.write("\t".join((a, b, str(c))) + "\n")
                npairs += 1

    logging.debug("Write {0} pairs to `{1}`.".format(npairs, weightsfile))
def ortholog(args):
    """
    %prog ortholog species_a species_b

    Run a sensitive pipeline to find orthologs between two species a and b.
    The pipeline runs LAST and generate .lifted.anchors.

    `--full` mode would assume 1-to-1 quota synteny blocks as the backbone of
    such predictions. Extra orthologs will be recruited from reciprocal best
    match (RBH).
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    #
    # args: command-line tokens; two positionals (species prefixes) are
    #       required. Input file names are derived as <species>.bed and
    #       <species>.cds. Each step is guarded by need_update(), which
    #       presumably skips work whose outputs are already current.
    from jcvi.apps.align import last as run_last
    from jcvi.compara.blastfilter import main as run_blastfilter
    from jcvi.compara.quota import main as run_quota
    from jcvi.compara.synteny import liftover, mcscan, scan
    from jcvi.formats.blast import cscore, filter as blast_filter

    parser = OptionParser(ortholog.__doc__)
    parser.add_option(
        "--full",
        default=False,
        action="store_true",
        help="Run in full mode, including blocks and RBH",
    )
    parser.add_option(
        "--cscore",
        default=0.7,
        type="float",
        help="C-score cutoff [default: %default]",
    )
    parser.add_option(
        "--dist",
        default=20,
        type="int",
        help="Extent of flanking regions to search",
    )
    parser.add_option("--quota", help="Quota align parameter")
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    species_a, species_b = args
    bed_a, fasta_a = species_a + ".bed", species_a + ".cds"
    bed_b, fasta_b = species_b + ".bed", species_b + ".cds"
    cscore_flag = "--cscore={0}".format(opts.cscore)
    dist_flag = "--dist={0}".format(opts.dist)

    # Output names are keyed on the part of each FASTA name before the
    # first dot, in both a.b and b.a orientation.
    prefix_a = fasta_a.split(".")[0]
    prefix_b = fasta_b.split(".")[0]
    ab = prefix_a + "." + prefix_b
    ba = prefix_b + "." + prefix_a

    lastfile = ab + ".last"
    if need_update((fasta_a, fasta_b), lastfile):
        run_last([fasta_b, fasta_a])

    # Self-comparison: continue with whatever the filter call returns.
    if species_a == species_b:
        lastfile = blast_filter(
            [lastfile, "--hitlen=0", "--pctid=98", "--inverse", "--noself"]
        )

    filtered = lastfile + ".filtered"
    if need_update(lastfile, filtered):
        run_blastfilter([lastfile, cscore_flag])

    anchors = ab + ".anchors"

    if not opts.full:
        # Quick mode: scan with liftover, optional quota screen, then stop.
        lifted = ab + ".lifted.anchors"
        if need_update(filtered, lifted):
            scan([filtered, anchors, dist_flag, "--liftover={0}".format(lastfile)])
        if opts.quota:
            run_quota([lifted, "--quota={0}".format(opts.quota), "--screen"])
        return

    if need_update(filtered, anchors):
        scan([filtered, anchors, dist_flag])

    one_to_one = ab + ".1x1.anchors"
    if need_update(anchors, one_to_one):
        run_quota([anchors, "--quota=1:1", "--screen"])

    lifted = ab + ".1x1.lifted.anchors"
    if need_update((lastfile, one_to_one), lifted):
        liftover([lastfile, one_to_one, dist_flag])

    blocks_ab = ab + ".1x1.blocks"
    blocks_ba = ba + ".1x1.blocks"
    if need_update(lifted, [blocks_ab, blocks_ba]):
        for bedfile, blocksfile in ((bed_a, blocks_ab), (bed_b, blocks_ba)):
            mcscan([bedfile, lifted, "--iter=1", "-o", blocksfile])

    rbh = ab + ".rbh"
    if need_update(lastfile, rbh):
        cscore([lastfile, "-o", rbh])

    ortho_ab = ab + ".ortholog"
    ortho_ba = ba + ".ortholog"
    if need_update([blocks_ab, blocks_ba, rbh], [ortho_ab, ortho_ba]):
        make_ortholog(blocks_ab, rbh, ortho_ab)
        make_ortholog(blocks_ba, rbh, ortho_ba)
def ortholog(args):
    """
    %prog ortholog species_a species_b

    Run a sensitive pipeline to find orthologs between two species a and b.
    The pipeline runs LAST and generate .lifted.anchors.

    `--full` mode would assume 1-to-1 quota synteny blocks as the backbone of
    such predictions. Extra orthologs will be recruited from reciprocal best
    match (RBH).
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    #
    # args: command-line tokens; two positionals (species prefixes) are
    #       required. Input file names are derived as <species>.bed and
    #       <species>.cds. Each step is guarded by need_update(), which
    #       presumably skips work whose outputs are already current.
    from jcvi.apps.last import main as run_last
    from jcvi.compara.blastfilter import main as run_blastfilter
    from jcvi.compara.quota import main as run_quota
    from jcvi.compara.synteny import liftover, mcscan, scan
    from jcvi.formats.blast import cscore, filter as blast_filter

    parser = OptionParser(ortholog.__doc__)
    parser.add_option(
        "--full",
        default=False,
        action="store_true",
        help="Run in full mode, including blocks and RBH",
    )
    parser.add_option(
        "--cscore",
        default=0.7,
        type="float",
        help="C-score cutoff [default: %default]",
    )
    parser.add_option(
        "--dist",
        default=20,
        type="int",
        help="Extent of flanking regions to search",
    )
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    species_a, species_b = args
    bed_a, fasta_a = species_a + ".bed", species_a + ".cds"
    bed_b, fasta_b = species_b + ".bed", species_b + ".cds"
    cscore_flag = "--cscore={0}".format(opts.cscore)
    dist_flag = "--dist={0}".format(opts.dist)

    # Output names are keyed on the part of each FASTA name before the
    # first dot, in both a.b and b.a orientation.
    prefix_a = fasta_a.split(".")[0]
    prefix_b = fasta_b.split(".")[0]
    ab = prefix_a + "." + prefix_b
    ba = prefix_b + "." + prefix_a

    lastfile = ab + ".last"
    if need_update((fasta_a, fasta_b), lastfile):
        run_last([fasta_b, fasta_a, "-o", lastfile])

    # Self-comparison: continue with whatever the filter call returns.
    if species_a == species_b:
        lastfile = blast_filter([lastfile, "--hitlen=0", "--pctid=98", "--inverse"])

    filtered = lastfile + ".filtered"
    if need_update(lastfile, filtered):
        run_blastfilter([lastfile, cscore_flag])

    anchors = ab + ".anchors"

    if not opts.full:
        # Quick mode: a single scan with liftover, then stop.
        lifted = ab + ".lifted.anchors"
        if need_update(filtered, lifted):
            scan([filtered, anchors, dist_flag, "--liftover={0}".format(lastfile)])
        return

    if need_update(filtered, anchors):
        scan([filtered, anchors, dist_flag])

    one_to_one = ab + ".1x1.anchors"
    if need_update(anchors, one_to_one):
        run_quota([anchors, "--quota=1:1", "--screen"])

    lifted = ab + ".1x1.lifted.anchors"
    if need_update((lastfile, one_to_one), lifted):
        liftover([lastfile, one_to_one, dist_flag])

    blocks_ab = ab + ".1x1.blocks"
    blocks_ba = ba + ".1x1.blocks"
    if need_update(lifted, [blocks_ab, blocks_ba]):
        for bedfile, blocksfile in ((bed_a, blocks_ab), (bed_b, blocks_ba)):
            mcscan([bedfile, lifted, "--iter=1", "-o", blocksfile])

    rbh = ab + ".rbh"
    if need_update(lastfile, rbh):
        cscore([lastfile, "-o", rbh])

    ortho_ab = ab + ".ortholog"
    ortho_ba = ba + ".ortholog"
    if need_update([blocks_ab, blocks_ba, rbh], [ortho_ab, ortho_ba]):
        make_ortholog(blocks_ab, rbh, ortho_ab)
        make_ortholog(blocks_ba, rbh, ortho_ba)
def omgprepare(args):
    """
    %prog omgprepare ploidy anchorsfile blastfile

    Prepare to run Sankoff's OMG algorithm to get orthologs.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so additional documentation is kept in comments.
    #
    # args: list of command-line tokens (options followed by three
    #       positionals: ploidy file, anchors file, BLAST file).
    # Side effects: calls cscore() with "-o <blast prefix>.cscore" and writes
    #       "<blast prefix>.weights", one "a<TAB>b<TAB>weight" line per
    #       retained pair.
    # Exits through sys.exit() after printing help unless exactly three
    #       positional arguments are given.
    from jcvi.formats.blast import cscore
    from jcvi.formats.base import DictFile

    p = OptionParser(omgprepare.__doc__)
    p.add_option("--norbh", action="store_true",
                 help="Disable RBH hits [default: %default]")
    p.add_option("--pctid", default=0, type="int",
                 help="Percent id cutoff for RBH hits [default: %default]")
    p.add_option("--cscore", default=90, type="int",
                 help="C-score cutoff for RBH hits [default: %default]")
    p.set_stripnames()
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ploidy, anchorfile, blastfile = args
    norbh = opts.norbh
    pctid = opts.pctid
    cs = opts.cscore
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    # Map the first whitespace-separated token of each ploidy line to its
    # zero-based line index.
    with open(ploidy) as fp:
        genomeidx = dict((x.split()[0], i) for i, x in enumerate(fp))

    ploidy = DictFile(ploidy)

    geneinfo(qbed, qorder, genomeidx, ploidy)
    geneinfo(sbed, sorder, genomeidx, ploidy)

    pf = blastfile.rsplit(".", 1)[0]
    cscorefile = pf + ".cscore"
    cscore([blastfile, "-o", cscorefile, "--cutoff=0", "--pct"])
    ac = AnchorFile(anchorfile)
    pairs = set((a, b) for a, b, i in ac.iter_pairs())
    logging.debug("Imported {0} pairs from `{1}`.".format(
        len(pairs), anchorfile))

    weightsfile = pf + ".weights"
    npairs = 0
    # Context managers close both handles even if a row fails to parse.
    with open(cscorefile) as fp:
        with open(weightsfile, "w") as fw:
            for row in fp:
                a, b, c, pct = row.split()
                c, pct = float(c), float(pct)
                c = int(c * 100)
                if (a, b) not in pairs:
                    # Pairs absent from the anchors file must pass the
                    # RBH filters before they are written.
                    if norbh:
                        continue
                    if c < cs:
                        continue
                    if pct < pctid:
                        continue
                    # Floor division keeps the weight an integer on both
                    # Python 2 and Python 3.
                    c //= 10  # This severely penalizes RBH against synteny

                # fw.write() works on Python 2 and 3, unlike `print >> fw`.
                fw.write("\t".join((a, b, str(c))) + "\n")
                npairs += 1

    logging.debug("Write {0} pairs to `{1}`.".format(npairs, weightsfile))
def ortholog(args):
    """
    %prog ortholog species_a species_b

    Run a sensitive pipeline to find orthologs between two species a and b.
    The pipeline runs LAST and generate .lifted.anchors.

    `--full` mode would assume 1-to-1 quota synteny blocks as the backbone of
    such predictions. Extra orthologs will be recruited from reciprocal best
    match (RBH).
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    #
    # args: command-line tokens; two positionals (species prefixes) are
    #       required. Input file names are derived as <species>.bed and
    #       <species>.cds (or .pep when --dbtype is not "nucl"). Each step is
    #       guarded by need_update(), which presumably skips work whose
    #       outputs are already current.
    from jcvi.apps.align import last as run_last
    from jcvi.compara.blastfilter import main as run_blastfilter
    from jcvi.compara.quota import main as run_quota
    from jcvi.compara.synteny import liftover, mcscan, scan
    from jcvi.formats.blast import cscore, filter as blast_filter

    parser = OptionParser(ortholog.__doc__)
    parser.add_option("--dbtype", default="nucl", choices=("nucl", "prot"),
                      help="Molecule type of subject database")
    parser.add_option("--full", default=False, action="store_true",
                      help="Run in full 1x1 mode, including blocks and RBH")
    parser.add_option("--cscore", default=0.7, type="float",
                      help="C-score cutoff")
    parser.add_option("--dist", default=20, type="int",
                      help="Extent of flanking regions to search")
    parser.add_option("-n", "--min_size", dest="n", type="int", default=4,
                      help="minimum number of anchors in a cluster")
    parser.add_option("--quota", help="Quota align parameter")
    parser.add_option("--no_strip_names", default=False, action="store_true",
                      help="Do not strip alternative splicing (e.g. At5g06540.1 -> At5g06540)")
    parser.set_cpus()
    parser.set_dotplot_opts()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    species_a, species_b = args
    dbtype = opts.dbtype
    if dbtype == "nucl":
        suffix = ".cds"
    else:
        suffix = ".pep"
    bed_a, fasta_a = species_a + ".bed", species_a + suffix
    bed_b, fasta_b = species_b + ".bed", species_b + suffix

    cscore_flag = "--cscore={0}".format(opts.cscore)
    dist_flag = "--dist={0}".format(opts.dist)
    minsize_flag = "--min_size={}".format(opts.n)
    cpus_flag = "--cpus={}".format(opts.cpus)
    # Trailing flag appended to every call that accepts --no_strip_names.
    strip_extra = ["--no_strip_names"] if opts.no_strip_names else []

    # Output names are keyed on the part of each FASTA name before the
    # first dot, in both a.b and b.a orientation.
    prefix_a = fasta_a.split(".")[0]
    prefix_b = fasta_b.split(".")[0]
    ab = prefix_a + "." + prefix_b
    ba = prefix_b + "." + prefix_a

    lastfile = ab + ".last"
    if need_update((fasta_a, fasta_b), lastfile):
        run_last([fasta_b, fasta_a, cpus_flag], dbtype)

    # Self-comparison: continue with the ".P98L0.inverse" file, refreshing it
    # through the filter call when need_update() asks for it.
    if species_a == species_b:
        selffile = lastfile + ".P98L0.inverse"
        if need_update(lastfile, selffile):
            blast_filter(
                [lastfile, "--hitlen=0", "--pctid=98", "--inverse", "--noself"]
            )
        lastfile = selffile

    filtered = lastfile + ".filtered"
    if need_update(lastfile, filtered):
        run_blastfilter([lastfile, cscore_flag] + strip_extra)

    anchors = ab + ".anchors"

    if not opts.full:
        # Quick mode: scan with liftover, optional quota screen, dotplot.
        lifted = ab + ".lifted.anchors"
        if need_update(filtered, lifted):
            scan_args = [
                filtered,
                anchors,
                minsize_flag,
                dist_flag,
                "--liftover={0}".format(lastfile),
            ]
            scan(scan_args + strip_extra)
        if opts.quota:
            run_quota([lifted, "--quota={0}".format(opts.quota), "--screen"])
        if need_update(anchors, ab + ".pdf"):
            from jcvi.graphics.dotplot import dotplot_main

            plot_args = [anchors]
            # Forward each boolean dotplot switch that was set.
            for switch in ("nostdpf", "nochpf", "skipempty"):
                if getattr(opts, switch):
                    plot_args.append("--" + switch)
            if opts.genomenames:
                plot_args.extend(["--genomenames", opts.genomenames])
            if opts.theme:
                plot_args.extend(["--theme", opts.theme])
            dotplot_main(plot_args)
        return

    # Full mode: this scan does not receive the min_size flag.
    if need_update(filtered, anchors):
        scan([filtered, anchors, dist_flag] + strip_extra)

    one_to_one = ab + ".1x1.anchors"
    if need_update(anchors, one_to_one):
        run_quota([anchors, "--quota=1:1", "--screen"])

    lifted = ab + ".1x1.lifted.anchors"
    if need_update((lastfile, one_to_one), lifted):
        liftover([lastfile, one_to_one, dist_flag] + strip_extra)

    blocks_ab = ab + ".1x1.blocks"
    blocks_ba = ba + ".1x1.blocks"
    if need_update(lifted, [blocks_ab, blocks_ba]):
        for bedfile, blocksfile in ((bed_a, blocks_ab), (bed_b, blocks_ba)):
            mcscan([bedfile, lifted, "--iter=1", "-o", blocksfile])

    rbh = ab + ".rbh"
    if need_update(lastfile, rbh):
        cscore([lastfile, "-o", rbh])

    ortho_ab = ab + ".ortholog"
    ortho_ba = ba + ".ortholog"
    if need_update([blocks_ab, blocks_ba, rbh], [ortho_ab, ortho_ba]):
        make_ortholog(blocks_ab, rbh, ortho_ab)
        make_ortholog(blocks_ba, rbh, ortho_ba)
def ortholog(args):
    """
    %prog ortholog a.bed a.cds b.bed b.cds

    Run a sensitive pipeline to find orthologs between two species a and b.
    The pipeline runs LAST and 1-to-1 quota synteny blocks as the backbone of
    such predictions. Extra orthologs will be recruited from reciprocal best
    match (RBH).
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    #
    # args: command-line tokens; four positionals are required, in the order
    #       a.bed, a.cds, b.bed, b.cds. Each step is guarded by need_update(),
    #       which presumably skips work whose outputs are already current.
    from jcvi.apps.last import main as run_last
    from jcvi.compara.blastfilter import main as run_blastfilter
    from jcvi.compara.quota import main as run_quota
    from jcvi.compara.synteny import liftover, mcscan, scan
    from jcvi.formats.blast import cscore

    parser = OptionParser(ortholog.__doc__)
    parser.add_option(
        "--cscore",
        default=0.99,
        type="float",
        help="C-score cutoff [default: %default]",
    )
    opts, args = parser.parse_args(args)

    if len(args) != 4:
        sys.exit(not parser.print_help())

    bed_a, fasta_a, bed_b, fasta_b = args
    cscore_flag = "--cscore={0}".format(opts.cscore)

    # Output names are keyed on the part of each FASTA name before the
    # first dot, in both a.b and b.a orientation.
    prefix_a = fasta_a.split(".")[0]
    prefix_b = fasta_b.split(".")[0]
    ab = prefix_a + "." + prefix_b
    ba = prefix_b + "." + prefix_a

    lastfile = ab + ".last"
    if need_update((fasta_a, fasta_b), lastfile):
        run_last([fasta_b, fasta_a, "-o", lastfile])

    # BED locations are forwarded explicitly to the downstream tools.
    bed_flags = ["--qbed=" + bed_a, "--sbed=" + bed_b]

    filtered = lastfile + ".filtered"
    if need_update(lastfile, filtered):
        run_blastfilter([lastfile, cscore_flag, "--tandem_Nmax=10"] + bed_flags)

    anchors = ab + ".anchors"
    # NOTE(review): freshness is checked against ".lifted.anchors" while scan
    # is handed ".anchors" as its second argument -- confirm this is intended.
    if need_update(filtered, ab + ".lifted.anchors"):
        scan([filtered, anchors] + bed_flags)

    one_to_one = ab + ".1x1.anchors"
    if need_update(anchors, one_to_one):
        run_quota([anchors, "--quota=1:1", "--screen"] + bed_flags)

    lifted = ab + ".1x1.lifted.anchors"
    if need_update((lastfile, one_to_one), lifted):
        liftover([lastfile, one_to_one] + bed_flags)

    blocks_ab = ab + ".1x1.blocks"
    blocks_ba = ba + ".1x1.blocks"
    if need_update(lifted, [blocks_ab, blocks_ba]):
        for bedfile, blocksfile in ((bed_a, blocks_ab), (bed_b, blocks_ba)):
            mcscan([bedfile, lifted, "--iter=1", "-o", blocksfile])

    rbh = ab + ".rbh"
    if need_update(lastfile, rbh):
        cscore([lastfile, "-o", rbh])

    ortho_ab = ab + ".ortholog"
    ortho_ba = ba + ".ortholog"
    if need_update([blocks_ab, blocks_ba, rbh], [ortho_ab, ortho_ba]):
        make_ortholog(blocks_ab, rbh, ortho_ab)
        make_ortholog(blocks_ba, rbh, ortho_ba)
def ortholog(args):
    """
    %prog ortholog species_a species_b

    Run a sensitive pipeline to find orthologs between two species a and b.
    The pipeline runs LAST and generate .lifted.anchors.

    `--full` mode would assume 1-to-1 quota synteny blocks as the backbone of
    such predictions. Extra orthologs will be recruited from reciprocal best
    match (RBH).
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    #
    # args: command-line tokens; two positionals (species prefixes) are
    #       required. Input file names are derived as <species>.bed and
    #       <species>.cds (or .pep when --dbtype is not "nucl"). Each step is
    #       guarded by need_update(), which presumably skips work whose
    #       outputs are already current.
    from jcvi.apps.align import last as run_last
    from jcvi.compara.blastfilter import main as run_blastfilter
    from jcvi.compara.quota import main as run_quota
    from jcvi.compara.synteny import liftover, mcscan, scan
    from jcvi.formats.blast import cscore, filter as blast_filter

    parser = OptionParser(ortholog.__doc__)
    parser.add_option(
        "--dbtype",
        default="nucl",
        choices=("nucl", "prot"),
        help="Molecule type of subject database",
    )
    parser.add_option(
        "--full",
        default=False,
        action="store_true",
        help="Run in full mode, including blocks and RBH",
    )
    parser.add_option(
        "--cscore",
        default=0.7,
        type="float",
        help="C-score cutoff [default: %default]",
    )
    parser.add_option(
        "--dist",
        default=20,
        type="int",
        help="Extent of flanking regions to search",
    )
    parser.add_option("--quota", help="Quota align parameter")
    parser.add_option(
        "--nostdpf",
        default=False,
        action="store_true",
        help="Do not standardize contig names",
    )
    parser.add_option(
        "--no_strip_names",
        default=False,
        action="store_true",
        help="Do not strip alternative splicing (e.g. At5g06540.1 -> At5g06540)",
    )
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    species_a, species_b = args
    dbtype = opts.dbtype
    if dbtype == "nucl":
        suffix = ".cds"
    else:
        suffix = ".pep"
    bed_a, fasta_a = species_a + ".bed", species_a + suffix
    bed_b, fasta_b = species_b + ".bed", species_b + suffix

    cscore_flag = "--cscore={0}".format(opts.cscore)
    dist_flag = "--dist={0}".format(opts.dist)
    # Trailing flag appended to every call that accepts --no_strip_names.
    strip_extra = ["--no_strip_names"] if opts.no_strip_names else []

    # Output names are keyed on the part of each FASTA name before the
    # first dot, in both a.b and b.a orientation.
    prefix_a = fasta_a.split(".")[0]
    prefix_b = fasta_b.split(".")[0]
    ab = prefix_a + "." + prefix_b
    ba = prefix_b + "." + prefix_a

    lastfile = ab + ".last"
    if need_update((fasta_a, fasta_b), lastfile):
        run_last([fasta_b, fasta_a], dbtype)

    # Self-comparison: continue with the ".P98L0.inverse" file, refreshing it
    # through the filter call when need_update() asks for it.
    if species_a == species_b:
        selffile = lastfile + ".P98L0.inverse"
        if need_update(lastfile, selffile):
            blast_filter(
                [lastfile, "--hitlen=0", "--pctid=98", "--inverse", "--noself"]
            )
        lastfile = selffile

    filtered = lastfile + ".filtered"
    if need_update(lastfile, filtered):
        run_blastfilter([lastfile, cscore_flag] + strip_extra)

    anchors = ab + ".anchors"

    if not opts.full:
        # Quick mode: scan with liftover, optional quota screen, dotplot.
        lifted = ab + ".lifted.anchors"
        if need_update(filtered, lifted):
            scan_args = [
                filtered,
                anchors,
                dist_flag,
                "--liftover={0}".format(lastfile),
            ]
            scan(scan_args + strip_extra)
        if opts.quota:
            run_quota([lifted, "--quota={0}".format(opts.quota), "--screen"])
        if need_update(anchors, ab + ".pdf"):
            from jcvi.graphics.dotplot import dotplot_main

            plot_args = [anchors]
            # --nostdpf is forwarded together with --skipempty.
            if opts.nostdpf:
                plot_args.extend(["--nostdpf", "--skipempty"])
            dotplot_main(plot_args)
        return

    if need_update(filtered, anchors):
        scan([filtered, anchors, dist_flag] + strip_extra)

    one_to_one = ab + ".1x1.anchors"
    if need_update(anchors, one_to_one):
        run_quota([anchors, "--quota=1:1", "--screen"])

    lifted = ab + ".1x1.lifted.anchors"
    if need_update((lastfile, one_to_one), lifted):
        liftover([lastfile, one_to_one, dist_flag] + strip_extra)

    blocks_ab = ab + ".1x1.blocks"
    blocks_ba = ba + ".1x1.blocks"
    if need_update(lifted, [blocks_ab, blocks_ba]):
        for bedfile, blocksfile in ((bed_a, blocks_ab), (bed_b, blocks_ba)):
            mcscan([bedfile, lifted, "--iter=1", "-o", blocksfile])

    rbh = ab + ".rbh"
    if need_update(lastfile, rbh):
        cscore([lastfile, "-o", rbh])

    ortho_ab = ab + ".ortholog"
    ortho_ba = ba + ".ortholog"
    if need_update([blocks_ab, blocks_ba, rbh], [ortho_ab, ortho_ba]):
        make_ortholog(blocks_ab, rbh, ortho_ab)
        make_ortholog(blocks_ba, rbh, ortho_ba)