Пример #1
0
def soap_trios(p, pf, tag, extra):
    """
    Take one pair of reads and 'widow' reads after correction and run SOAP.

    Args:
        p: Input read files; joined with "," for the log line and handed to
            `need_update` and `slink`.
        pf: Prefix. Names both the folder that is entered for the run and the
            output file `<pf>.closed.scafSeq`.
        tag: Passed through to `slink` unchanged.
        extra: Passed through to `slink` unchanged.

    Returns:
        None. Returns early, without running anything, when `need_update`
        reports that the assembly does not need to be rebuilt.
    """
    from jcvi.assembly.soap import prepare

    logging.debug("Work on {0} ({1})".format(pf, ",".join(p)))
    asm = "{0}.closed.scafSeq".format(pf)
    if not need_update(p, asm):
        logging.debug("Assembly found: {0}. Skipped.".format(asm))
        return

    # Presumably populates folder `pf` with links to the inputs; verify
    # against slink().
    slink(p, pf, tag, extra)

    cwd = os.getcwd()
    os.chdir(pf)
    try:
        reads = sorted(glob("*.fastq") + glob("*.fastq.gz"))
        prepare(reads + ["--assemble_1st_rank_only", "-K 31"])
        sh("./run.sh")
        # Copy the result next to the working folder under its final name.
        sh("cp asm31.closed.scafSeq ../{0}".format(asm))
        logging.debug("Assembly finished: {0}".format(asm))
    finally:
        # Always return to the starting directory, even when a step fails, so
        # that the caller's relative paths keep working.
        os.chdir(cwd)
Пример #2
0
def correct_pairs(p, pf, tag):
    """
    Take one pair of reads and correct to generate *.corr.fastq.

    Args:
        p: Input read files; joined with "," for the log line and handed to
            `need_update` and `slink`.
        pf: Prefix. Names the folder that is entered for the run and prefixes
            the three output files.
        tag: Indexable; only `tag[0]` is used here to name the outputs. The
            whole value is passed to `slink`.

    Returns:
        None. Returns early when `need_update` reports that the three
        corrected files do not need to be rebuilt.
    """
    from jcvi.assembly.preprocess import correct as cr

    logging.debug("Work on {0} ({1})".format(pf, ','.join(p)))
    itag = tag[0]
    cm = ".".join((pf, itag))
    targets = (
        cm + ".1.corr.fastq",
        cm + ".2.corr.fastq",
        pf + ".PE-0.corr.fastq",
    )
    if not need_update(p, targets):
        logging.debug("Corrected reads found: {0}. Skipped.".format(targets))
        return

    slink(p, pf, tag)

    cwd = os.getcwd()
    os.chdir(pf)
    try:
        cr(sorted(glob("*.fastq") + glob("*.fastq.gz")) + ["--nofragsdedup"])
        # Move the corrected files one level up under their final names.
        sh("mv {0}.1.corr.fastq ../{1}".format(itag, targets[0]))
        sh("mv {0}.2.corr.fastq ../{1}".format(itag, targets[1]))
        sh("mv frag_reads_corr.corr.fastq ../{0}".format(targets[2]))
        logging.debug("Correction finished: {0}".format(targets))
    finally:
        # Restore the starting directory even if correction or a move fails.
        os.chdir(cwd)
Пример #3
0
def correct_pairs(p, pf, tag):
    """
    Take one pair of reads and correct to generate *.corr.fastq.

    Args:
        p: Input read files (logged comma-joined; given to `need_update` and
            `slink`).
        pf: Prefix; the working folder name and the prefix of all outputs.
        tag: Indexable; `tag[0]` becomes part of the paired output names.

    Returns:
        None. Nothing is run when `need_update` says the outputs are current.
    """
    from jcvi.assembly.preprocess import correct as cr

    logging.debug("Work on {0} ({1})".format(pf, ','.join(p)))
    itag = tag[0]
    cm = ".".join((pf, itag))
    targets = (
        cm + ".1.corr.fastq",
        cm + ".2.corr.fastq",
        pf + ".PE-0.corr.fastq",
    )
    if not need_update(p, targets):
        logging.debug("Corrected reads found: {0}. Skipped.".format(targets))
        return

    slink(p, pf, tag)

    cwd = os.getcwd()
    os.chdir(pf)
    try:
        inputs = sorted(glob("*.fastq") + glob("*.fastq.gz"))
        cr(inputs + ["--nofragsdedup"])
        # (file produced inside the folder, final name one level up)
        sh("mv {0}.1.corr.fastq ../{1}".format(itag, targets[0]))
        sh("mv {0}.2.corr.fastq ../{1}".format(itag, targets[1]))
        sh("mv frag_reads_corr.corr.fastq ../{0}".format(targets[2]))
        logging.debug("Correction finished: {0}".format(targets))
    finally:
        # A failure above must not leave the process inside `pf`.
        os.chdir(cwd)
Пример #4
0
def get_info():
    """
    Index the rows of all *.info files in the current directory.

    Returns:
        dict mapping the first whitespace-separated field of each row to the
        whole row with trailing whitespace removed. A later row with the same
        first field replaces an earlier one.
    """
    info_paths = glob("*.info")
    # must_open() is handed the whole list of paths and iterated row by row.
    return dict(
        (line.split()[0], line.rstrip()) for line in must_open(info_paths)
    )
Пример #5
0
def merge(args):
    """
    %prog merge folder1 ...

    Consolidate split contents in the folders. The folders can be generated by
    the split() process and several samples may be in separate fastq files. This
    program merges them.
    """
    # The docstring above is also the usage text given to OptionParser.
    p = OptionParser(merge.__doc__)
    p.add_option("--outdir",
                 default="outdir",
                 help="Output final reads in [default: %default]")
    opts, args = p.parse_args(args)

    if not args:
        sys.exit(not p.print_help())

    outdir = opts.outdir
    mkdir(outdir)

    def sample_of(path):
        # The sample id is the part of the file name before the first dot.
        return op.basename(path).split(".")[0]

    # Gather every "<something>.<something>.fastq" from all folders and order
    # them by sample id so that groupby() sees each sample as one run.
    fastqs = sorted(
        flatten(glob("{0}/*.*.fastq".format(folder)) for folder in args),
        key=sample_of
    )
    for sample, members in groupby(fastqs, key=sample_of):
        merged = op.join(outdir, "{0}.fastq".format(sample))
        FileMerger(list(members), outfile=merged).merge(checkexists=True)
Пример #6
0
def trace(args):
    """
    %prog trace unitig{version}.{partID}.{unitigID}

    Call `grep` to get the erroneous fragment placement.
    """
    # The docstring above is also the usage text given to OptionParser.
    #
    # Scans the single ../5-consensus/*_<partID>.err file and prints to stderr
    # the rows from the last row that contains both `working` and the unitig
    # ID up to the next row that contains `failed`. A capture longer than 20
    # rows is shortened to its first and last 10 rows.
    p = OptionParser(trace.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # print_help() is expected to return None (as optparse's does);
        # negating it makes a usage error exit with a non-zero status.
        sys.exit(not p.print_help())

    s, = args
    version, partID, unitigID = get_ID(s)

    flist = glob("../5-consensus/*_{0:03d}.err".format(int(partID)))
    assert len(flist) == 1

    # Start with an empty capture so that a file without any matching row
    # prints an empty line instead of raising NameError.
    rows = []
    instate = False
    with open(flist[0]) as fp:
        for row in fp:
            if working in row and unitigID in row:
                rows = []
                instate = True
            if instate:
                rows.append(row)
            # Only an open capture can be closed; a `failed` row that belongs
            # to some other unitig must not shorten the capture a second time.
            if instate and failed in row:
                instate = False
                if len(rows) > 20:
                    ignore_line = "... ({0} lines skipped)\n".format(
                        len(rows) - 20)
                    rows = rows[:10] + [ignore_line] + rows[-10:]

    # write() behaves the same on Python 2 and 3, unlike the print statement.
    sys.stderr.write("".join(rows) + "\n")
Пример #7
0
def tracedb(args):
    """
    %prog tracedb <xml|lib|frg>

    Run `tracedb-to-frg.pl` within current folder.
    """
    # The docstring above is also the usage text given to OptionParser.
    p = OptionParser(tracedb.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        # print_help() is expected to return None (as optparse's does);
        # negating it makes a usage error exit with a non-zero status instead
        # of reporting success.
        sys.exit(not p.print_help())

    action, = args
    assert action in ("xml", "lib", "frg")

    CMD = "tracedb-to-frg.pl"
    # Every entry in the current directory whose name starts with "xml".
    xmls = glob("xml*")

    if action == "xml":
        # One background job per file, with all output discarded.
        for xml in xmls:
            cmd = CMD + " -xml {0}".format(xml)
            sh(cmd, outfile="/dev/null", errfile="/dev/null", background=True)

    elif action == "lib":
        # A single foreground job that receives all files at once.
        cmd = CMD + " -lib {0}".format(" ".join(xmls))
        sh(cmd)

    elif action == "frg":
        # One background job per file.
        for xml in xmls:
            cmd = CMD + " -frg {0}".format(xml)
            sh(cmd, background=True)
Пример #8
0
def cufflinks(args):
    """
    %prog cufflinks folder reference

    Run cufflinks on a folder containing tophat results.
    """
    # The docstring above is also the usage text given to OptionParser.
    p = OptionParser(cufflinks.__doc__)
    p.add_option("--gtf", help="Reference annotation [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    folder, reference = args
    # NOTE: the process stays inside `folder` after this function returns.
    os.chdir(folder)
    for bam in glob("*tophat/accepted_hits.bam"):
        # Results go into a "cufflinks" folder next to the BAM file; an
        # existing folder means this sample is skipped.
        outdir = op.join(op.dirname(bam), "cufflinks")
        if op.exists(outdir):
            logging.debug("Directory {0} found. Skipping.".format(outdir))
            continue

        pieces = [
            "cufflinks",
            "-o {0}".format(outdir),
            "-p {0}".format(opts.cpus),
        ]
        if opts.gtf:
            pieces.append("-g {0}".format(opts.gtf))
        pieces.append("--frag-bias-correct {0}".format(reference))
        pieces.append("--multi-read-correct")
        pieces.append("{0}".format(bam))
        sh(" ".join(pieces))
Пример #9
0
def tracedb(args):
    """
    %prog tracedb <xml|lib|frg>

    Run `tracedb-to-frg.pl` within current folder.
    """
    # The docstring above is also the usage text given to OptionParser.
    p = OptionParser(tracedb.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        # Exit non-zero on a usage error: print_help() is expected to return
        # None (as optparse's does), and sys.exit(None) would report success.
        sys.exit(not p.print_help())

    action, = args
    assert action in ("xml", "lib", "frg")

    CMD = "tracedb-to-frg.pl"
    # Inputs are whatever matches "xml*" in the current directory.
    xmls = glob("xml*")

    if action == "xml":
        # Per-file background jobs; stdout and stderr go to /dev/null.
        for xml in xmls:
            cmd = CMD + " -xml {0}".format(xml)
            sh(cmd, outfile="/dev/null", errfile="/dev/null", background=True)

    elif action == "lib":
        # One foreground job over all files together.
        cmd = CMD + " -lib {0}".format(" ".join(xmls))
        sh(cmd)

    elif action == "frg":
        # Per-file background jobs.
        for xml in xmls:
            cmd = CMD + " -frg {0}".format(xml)
            sh(cmd, background=True)
Пример #10
0
def get_info():
    """
    Read every *.info file in the current directory into one lookup table.

    Returns:
        dict keyed by the first whitespace-separated field of a row; the
        value is that row without trailing whitespace. For repeated keys the
        last row read is kept.
    """
    table = {}
    for line in must_open(glob("*.info")):
        table[line.split()[0]] = line.rstrip()
    return table
Пример #11
0
def trace(args):
    """
    %prog trace unitig{version}.{partID}.{unitigID}

    Call `grep` to get the erroneous fragment placement.
    """
    # The docstring above is also the usage text given to OptionParser.
    #
    # Prints to stderr the block of the partition's .err file that starts at
    # the last row containing both `working` and the unitig ID and ends at the
    # next row containing `failed`; blocks over 20 rows keep only their first
    # and last 10 rows.
    p = OptionParser(trace.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # Exit non-zero on a usage error: print_help() is expected to return
        # None (as optparse's does), and sys.exit(None) would report success.
        sys.exit(not p.print_help())

    s, = args
    version, partID, unitigID = get_ID(s)

    flist = glob("../5-consensus/*_{0:03d}.err".format(int(partID)))
    assert len(flist) == 1

    # Defined up front so that "no match" prints an empty line rather than
    # failing with NameError.
    rows = []
    instate = False
    with open(flist[0]) as fp:
        for row in fp:
            if working in row and unitigID in row:
                rows = []
                instate = True
            if instate:
                rows.append(row)
            # Close (and possibly shorten) only a capture that is open, so
            # that failures of other unitigs leave the result untouched.
            if instate and failed in row:
                instate = False
                if len(rows) > 20:
                    ignore_line = "... ({0} lines skipped)\n".format(
                        len(rows) - 20)
                    rows = rows[:10] + [ignore_line] + rows[-10:]

    # Same output as the old print statement, but valid on Python 2 and 3.
    sys.stderr.write("".join(rows) + "\n")
Пример #12
0
def merge(args):
    """
    %prog merge folder1 ...

    Consolidate split contents in the folders. The folders can be generated by
    the split() process and several samples may be in separate fastq files. This
    program merges them.
    """
    # The docstring above is also the usage text given to OptionParser.
    p = OptionParser(merge.__doc__)
    p.set_outdir(outdir="outdir")
    opts, args = p.parse_args(args)

    if not args:
        sys.exit(not p.print_help())

    outdir = opts.outdir
    mkdir(outdir)

    def sample_of(path):
        # Files are grouped by the part of their name before the first dot.
        return op.basename(path).split(".")[0]

    fastqs = list(
        flatten(glob("{0}/*.*.fastq".format(folder)) for folder in args)
    )
    # groupby() only merges adjacent items, hence the sort on the same key.
    fastqs.sort(key=sample_of)
    for sample, members in groupby(fastqs, key=sample_of):
        parts = list(members)
        destination = op.join(outdir, "{0}.fastq".format(sample))
        FileMerger(parts, outfile=destination).merge(checkexists=True)
Пример #13
0
def cufflinks(args):
    """
    %prog cufflinks folder reference

    Run cufflinks on a folder containing tophat results.
    """
    # The docstring above is also the usage text given to OptionParser.
    p = OptionParser(cufflinks.__doc__)
    p.add_option("--gtf", help="Reference annotation [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    folder, reference = args
    # NOTE: the working directory is changed and not changed back.
    os.chdir(folder)

    for bam in glob("*tophat/accepted_hits.bam"):
        tophat_dir = op.dirname(bam)
        outdir = op.join(tophat_dir, "cufflinks")
        if op.exists(outdir):
            # An existing output folder marks the sample as already done.
            logging.debug("Directory {0} found. Skipping.".format(outdir))
            continue

        gtf_part = " -g {0}".format(opts.gtf) if opts.gtf else ""
        cmd = (
            "cufflinks"
            + " -o {0}".format(outdir)
            + " -p {0}".format(opts.cpus)
            + gtf_part
            + " --frag-bias-correct {0}".format(reference)
            + " --multi-read-correct"
            + " {0}".format(bam)
        )
        sh(cmd)
Пример #14
0
def batch(args):
    """
    %prog batch splits output

    The arguments are two folders.
    Input FASTA sequences are in splits/.
    Output csv files are in output/.

    Must have folders swissprot/, tair/, trembl/ that contains the respective
    BLAST output. Once finished, you can run, for example:

    $ parallel java -Xmx2g -jar ~/code/AHRD/dist/ahrd.jar {} ::: output/*.yml
    """
    # The docstring above is also the usage text given to OptionParser.
    p = OptionParser(batch.__doc__)

    # Three weights per BLAST program, unpacked below as
    # (bit_score, db_score, ovl_score).
    ahrd_weights = {"blastp": [0.5, 0.3, 0.2], "blastx": [0.6, 0.4, 0.0]}
    blast_progs = tuple(ahrd_weights.keys())

    p.add_option("--path",
                 default="~/code/AHRD/",
                 help="Path where AHRD is installed [default: %default]")
    p.add_option("--blastprog", default="blastp", choices=blast_progs,
                 help=("Specify the blast program being run. Based on this option,"
                       " the AHRD parameters (score_weights) will be modified."
                       " [default: %default]"))
    p.add_option("--iprscan", default=None,
                 help=("Specify path to InterProScan results file if available."
                       " If specified, the yml conf file will be modified"
                       " appropriately. [default: %default]"))

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    splits, output = args
    mkdir(output)

    bit_score, db_score, ovl_score = ahrd_weights[opts.blastprog]

    # These do not depend on the input file, so compute them once.
    resources = op.join(op.expanduser(opts.path), "test/resources")
    interpro = iprscanTemplate.format(opts.iprscan) if opts.iprscan else ""

    for f in glob("{0}/*.fa*".format(splits)):
        fb = op.basename(f).rsplit(".", 1)[0]
        outfile = op.join(output, fb + ".csv")

        # One .yml per input file; the context manager guarantees the file is
        # flushed and closed before the next one is opened.
        with open(op.join(output, fb + ".yml"), "w") as fw:
            print(Template.format(resources, fb, f, outfile, bit_score,
                                  db_score, ovl_score, interpro),
                  file=fw)

    if opts.iprscan:
        # Link the InterPro support files into the current directory unless
        # something with that name (even a dangling link) already exists.
        if not op.lexists("interpro.xml"):
            symlink(op.join(iprscan_datadir, "interpro.xml"), "interpro.xml")

        if not op.lexists("interpro.dtd"):
            symlink(op.join(iprscan_datadir, "interpro.dtd"), "interpro.dtd")
Пример #15
0
def get_prefix(dir="../"):
    """
    Return the name of the first `*.gkpStore` entry found under `dir`, with
    the final extension removed.

    `dir` is used as a plain string prefix of the glob pattern, so it should
    end with a path separator. Raises IndexError when nothing matches.
    """
    stores = glob(dir + "*.gkpStore")
    store_name = op.basename(stores[0])
    return store_name.rsplit(".", 1)[0]
Пример #16
0
def get_prefix(dir="../"):
    """
    Look for prefix.gkpStore in `dir` (the upper directory by default) and
    return `prefix`.

    The pattern is built as `dir + "*.gkpStore"`; only the first match is
    used, and an IndexError is raised when there is none.
    """
    first_match = glob(dir + "*.gkpStore")[0]
    pieces = op.basename(first_match).rsplit(".", 1)
    return pieces[0]
Пример #17
0
def batch(args):
    """
    %prog batch splits output

    The arguments are two folders.
    Input FASTA sequences are in splits/.
    Output csv files are in output/.

    Must have folders swissprot/, tair/, trembl/ that contains the respective
    BLAST output. Once finished, you can run, for example:

    $ parallel java -Xmx2g -jar ~/code/AHRD/dist/ahrd.jar {} ::: output/*.yml
    """
    # The docstring above is also the usage text given to OptionParser.
    p = OptionParser(batch.__doc__)

    # Three weights per BLAST program, unpacked below as
    # (bit_score, db_score, ovl_score).
    ahrd_weights = {
        "blastp": [0.5, 0.3, 0.2],
        "blastx": [0.6, 0.4, 0.0],
    }
    blast_progs = tuple(ahrd_weights.keys())

    p.add_option("--path", default="~/code/AHRD/",
                 help="Path where AHRD is installed [default: %default]")
    p.add_option("--blastprog", default="blastp", choices=blast_progs,
                 help=("Specify the blast program being run. Based on this option,"
                       " the AHRD parameters (score_weights) will be modified."
                       " [default: %default]"))
    p.add_option("--iprscan", default=None,
                 help=("Specify path to InterProScan results file if available."
                       " If specified, the yml conf file will be modified"
                       " appropriately. [default: %default]"))

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    splits, output = args
    mkdir(output)

    bit_score, db_score, ovl_score = ahrd_weights[opts.blastprog]

    # These do not depend on the input file, so compute them once.
    resources = op.join(op.expanduser(opts.path), "test/resources")
    interpro = iprscanTemplate.format(opts.iprscan) if opts.iprscan else ""

    for f in glob("{0}/*.fasta".format(splits)):
        fb = op.basename(f).rsplit(".", 1)[0]
        outfile = op.join(output, fb + ".csv")
        conf = Template.format(resources, fb, f, outfile, bit_score,
                               db_score, ovl_score, interpro)

        # write() plus an explicit newline matches what the print statement
        # produced and works on Python 2 and 3; the context manager closes
        # each .yml as soon as it is written.
        with open(op.join(output, fb + ".yml"), "w") as fw:
            fw.write(conf + "\n")

    if opts.iprscan:
        # Link the InterPro support files into the current directory unless
        # something with that name (even a dangling link) already exists.
        if not op.lexists("interpro.xml"):
            symlink(op.join(iprscan_datadir, "interpro.xml"), "interpro.xml")

        if not op.lexists("interpro.dtd"):
            symlink(op.join(iprscan_datadir, "interpro.dtd"), "interpro.dtd")
Пример #18
0
def get_weights(weightsfiles=None):
    """
    Group the rows of the weights files by their first field.

    Args:
        weightsfiles: Files to read; defaults to every `*.weights` file in
            the current directory.

    Returns:
        defaultdict(list) mapping the first field to a list of
        `(first, second, third)` string tuples, in reading order. Each row
        must split into exactly three fields, otherwise the unpacking raises
        ValueError.
    """
    if weightsfiles is None:
        weightsfiles = glob("*.weights")

    grouped = defaultdict(list)
    for line in must_open(weightsfiles):
        first, second, third = line.split()
        record = (first, second, third)
        grouped[first].append(record)
    return grouped
Пример #19
0
def get_weights(weightsfiles=None):
    """
    Read three-column weight rows and bucket them by their first column.

    Args:
        weightsfiles: Input files; when None, all `*.weights` files in the
            current directory are used.

    Returns:
        defaultdict(list): first column -> list of 3-tuples of the row's
        fields (kept as strings). Rows that do not have exactly three fields
        raise ValueError.
    """
    sources = glob("*.weights") if weightsfiles is None else weightsfiles

    buckets = defaultdict(list)
    for line in must_open(sources):
        head, partner, score = line.split()
        buckets[head].append((head, partner, score))
    return buckets
Пример #20
0
    def __init__(self, fig, root, canvas, chr, xlim, datadir,
                 order=None, hlsuffix=None, palette=None, cap=50,
                 gauge="bottom", plot_label=True, plot_chr_label=True,
                 gauge_step=5000000, vlines=None):
        """
        Draw a framed stack of XY tracks, one per data file of `chr`.

        Args:
            fig: Object whose `add_axes` creates the gauge and track axes.
            root: Axes-like object that receives the frame and text labels.
            canvas: `(x, y, w, h)` of the framed area.
            chr: Data files are those matching `<datadir>/<chr>*`; also the
                text of the chromosome label.
            xlim: `(start, end)`; used for the gauge and each track's x-range.
            datadir: Folder with the data files (and highlight files).
            order: Track labels. When given, data files are sorted by the
                position of their second dot-separated path piece in it.
            hlsuffix: When set, `<datadir>/<label>.<hlsuffix>` is imported
                into each track.
            palette: One color for all tracks; when None the brewer2mpl
                'Set2' qualitative map is used.
            cap: Passed to each track's `cap(ymax=...)`.
            gauge: "top" or "bottom" placement of the gauge axis.
            plot_label: Whether to write each track's label to its left.
            plot_chr_label: Whether to write `chr` next to the gauge.
            gauge_step: Gauge step; steps under 1000000 turn on
                `float_formatter`.
            vlines: When truthy, passed to each track's `vlines`.
        """
        # NOTE(review): `order` is zipped with the data files below, so
        # leaving it at its default None raises TypeError; any `gauge` other
        # than "top"/"bottom" leaves `gauge_ax` undefined; a directory without
        # matching files divides by zero.
        left, bottom, width, height = canvas
        pad = .01
        frame = Rectangle((left - pad, bottom - pad), width + 2 * pad,
                          height + 2 * pad, lw=1, fill=False,
                          ec="darkslategray", zorder=10)
        root.add_patch(frame)

        datafiles = glob(op.join(datadir, chr + "*"))
        ntracks = len(datafiles)
        track_height = height / ntracks
        # Top edge of the next track to draw; moves down by one track height
        # per iteration.
        track_y = bottom + height

        if palette is not None:
            colors = [palette] * ntracks
        else:
            # Get the palette
            import brewer2mpl
            colors = brewer2mpl.get_map('Set2', 'qualitative',
                                        ntracks).mpl_colors

        if order:
            datafiles.sort(key=lambda path: order.index(path.split(".")[1]))

        if gauge == "top":
            gauge_ax = fig.add_axes([left, track_y + pad, width, .0001])
            adjust_spines(gauge_ax, ["top"])
            tpos = track_y + .07
        elif gauge == "bottom":
            gauge_ax = fig.add_axes([left, bottom - pad, width, .0001])
            adjust_spines(gauge_ax, ["bottom"])
            tpos = bottom - .07

        start, end = xlim
        use_float = gauge_step < 1000000
        setup_gauge_ax(gauge_ax, start, end, gauge_step,
                       float_formatter=use_float)

        if plot_chr_label:
            root.text(left + width / 2, tpos, chr, ha="center", va="center",
                      color="darkslategray", size=16)

        for label, datafile, color in zip(order, datafiles, colors):
            track_y -= track_height
            # Each track uses 90% of its slot, leaving a gap above it.
            ax = fig.add_axes([left, track_y, width, track_height * .9])
            track = XYtrack(ax, datafile, color=color)
            track.interpolate(end)
            track.cap(ymax=cap)
            if vlines:
                track.vlines(vlines)
            if hlsuffix:
                hlfile = op.join(datadir, ".".join((label, hlsuffix)))
                track.import_hlfile(hlfile, chr)
            if plot_label:
                root.text(left - .035, track_y + track_height / 2, label,
                          ha="center", va="center", color=color)
            track.draw()
            ax.set_xlim(*xlim)
Пример #21
0
def iter_project(folder, n=2):
    """
    Yield groups of `n` read files found in `folder` with a project prefix.

    Files are those matching `<folder>/*.*` whose text after the last dot is
    one of fq, fastq, txt or gz, taken in the order glob() returns them.

    Yields:
        (files, pf): `files` is a list of `n` paths and `pf` is the result of
        `pairspf` on their base names. Groups from `grouper` whose length is
        not `n` are skipped.
    """
    # Check for paired reads and extract project id
    read_suffixes = ("fq", "fastq", "txt", "gz")
    reads = []
    for path in glob(folder + "/*.*"):
        if path.rsplit(".", 1)[-1] in read_suffixes:
            reads.append(path)

    for group in grouper(reads, n):
        if len(group) == n:
            names = [op.basename(x) for x in group]
            pf = pairspf(names)
            yield list(group), pf
Пример #22
0
def assemble_dir(pf, target, ploidy="1"):
    """
    Run the ALLPATHS pipeline inside folder `pf` and copy out its results.

    Args:
        pf: Prefix; the folder that is entered, the first argument given to
            `prepare`, and the replacement for "final" in the output names.
        target: File names to copy from `allpaths/ASSEMBLIES/run/`; each one
            is copied to the parent folder with "final" replaced by `pf`.
        ploidy: Value for the `--ploidy=` option given to `prepare`.

    Returns:
        None. Returns early when `need_update` reports that the renamed
        outputs do not need to be rebuilt.
    """
    from jcvi.assembly.allpaths import prepare

    logging.debug("Work on {0}".format(pf))
    asm = [x.replace("final", pf) for x in target]
    if not need_update(pf, asm):
        logging.debug("Assembly found: {0}. Skipped.".format(asm))
        return

    cwd = os.getcwd()
    os.chdir(pf)
    try:
        reads = sorted(glob("*.fastq") + glob("*.fastq.gz"))
        prepare([pf] + reads + ["--ploidy={0}".format(ploidy)])
        sh("./run.sh")

        for a, t in zip(asm, target):
            sh("cp allpaths/ASSEMBLIES/run/{0} ../{1}".format(t, a))

        logging.debug("Assembly finished: {0}".format(asm))
    finally:
        # Return to the starting directory even when a step raises, so the
        # caller is not left inside `pf`.
        os.chdir(cwd)
Пример #23
0
def iter_project(folder, n=2):
    """
    Walk the read files of `folder` in groups of `n`.

    Only `<folder>/*.*` entries ending in .fq, .fastq, .txt or .gz (judged by
    the text after the last dot) are considered; they are not sorted.

    Yields:
        A `(list_of_n_paths, prefix)` pair per complete group, where the
        prefix comes from `pairspf` applied to the base names. Groups whose
        length differs from `n` are dropped.
    """
    # Check for paired reads and extract project id
    def is_read_file(path):
        return path.rsplit(".", 1)[-1] in ("fq", "fastq", "txt", "gz")

    filelist = [x for x in glob(folder + "/*.*") if is_read_file(x)]
    for members in grouper(filelist, n):
        if len(members) != n:
            continue

        basenames = [op.basename(x) for x in members]
        prefix = pairspf(basenames)
        yield list(members), prefix
Пример #24
0
def prepare(args):
    """
    %prog prepare countfolder families

    Parse list of count files and group per family into families folder.
    """
    # The docstring above is also the usage text given to OptionParser.
    p = OptionParser(prepare.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    counts, families = args
    countfiles = glob(op.join(counts, "*.count"))
    # (tissue, individual) -> list of samples
    countsdb = defaultdict(list)
    for c in countfiles:
        rs = RiceSample(c)
        countsdb[(rs.tissue, rs.ind)].append(rs)

    # Merge duplicates - data sequenced in different batches
    key = lambda x: (x.label, x.rep)
    for (tissue, ind), rs in sorted(countsdb.items()):
        rs.sort(key=key)
        for i, ri in enumerate(rs):
            if not ri.working:
                continue
            # Fold every later sample with the same (label, rep) into `ri`
            # and retire it.
            for rj in rs[i + 1:]:
                if key(ri) != key(rj):
                    continue
                ri.merge(rj)
                rj.working = False
        countsdb[(tissue, ind)] = [x for x in rs if x.working]

    # Group into families
    # Create the folder named on the command line: every output below is
    # written under `families`, not under a fixed "families" directory.
    mkdir(families)
    for (tissue, ind), r in sorted(countsdb.items()):
        r = list(r)
        if r[0].label != "F1":
            continue
        # Look up both parents of the F1 within the same tissue.
        P1, P2 = r[0].P1, r[0].P2
        P1, P2 = countsdb[(tissue, P1)], countsdb[(tissue, P2)]
        rs = P1 + P2 + r
        # Group codes: 1 = first parent, 2 = second parent, 3 = F1.
        groups = [1] * len(P1) + [2] * len(P2) + [3] * len(r)
        assert len(rs) == len(groups)

        outfile = "-".join((tissue, ind))
        merge_counts(rs, op.join(families, outfile))
        groupsfile = outfile + ".groups"
        # write() with an explicit newline matches the old print statement
        # and works on Python 2 and 3.
        with open(op.join(families, groupsfile), "w") as fw:
            fw.write(",".join(str(x) for x in groups) + "\n")
Пример #25
0
def prepare(args):
    """
    %prog prepare countfolder families

    Parse list of count files and group per family into families folder.
    """
    # The docstring above is also the usage text given to OptionParser.
    p = OptionParser(prepare.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    counts, families = args
    countfiles = glob(op.join(counts, "*.count"))
    # Samples keyed by (tissue, individual).
    countsdb = defaultdict(list)
    for c in countfiles:
        rs = RiceSample(c)
        countsdb[(rs.tissue, rs.ind)].append(rs)

    # Merge duplicates - data sequenced in different batches
    key = lambda x: (x.label, x.rep)
    for (tissue, ind), rs in sorted(countsdb.items()):
        rs.sort(key=key)
        for i, ri in enumerate(rs):
            if not ri.working:
                continue
            # Later samples sharing (label, rep) are merged into `ri` and
            # marked as no longer working.
            for rj in rs[i + 1:]:
                if key(ri) != key(rj):
                    continue
                ri.merge(rj)
                rj.working = False
        countsdb[(tissue, ind)] = [x for x in rs if x.working]

    # Group into families
    # The output folder is the `families` argument; creating a literal
    # "families" directory would leave the real target missing.
    mkdir(families)
    for (tissue, ind), r in sorted(countsdb.items()):
        r = list(r)
        if r[0].label != "F1":
            continue
        # Parents are looked up by name within the same tissue.
        P1, P2 = r[0].P1, r[0].P2
        P1, P2 = countsdb[(tissue, P1)], countsdb[(tissue, P2)]
        rs = P1 + P2 + r
        # 1 = first parent, 2 = second parent, 3 = F1.
        groups = [1] * len(P1) + [2] * len(P2) + [3] * len(r)
        assert len(rs) == len(groups)

        outfile = "-".join((tissue, ind))
        merge_counts(rs, op.join(families, outfile))
        groupsfile = outfile + ".groups"
        # Portable across Python 2 and 3, and closed even if writing fails.
        with open(op.join(families, groupsfile), "w") as fw:
            fw.write(",".join(str(x) for x in groups) + "\n")
Пример #26
0
def assemble_dir(pf, target, ploidy="1"):
    """
    Assemble the reads in folder `pf` with ALLPATHS and collect the outputs.

    Args:
        pf: Prefix; names the working folder, is passed first to `prepare`,
            and replaces "final" in the names of the copied outputs.
        target: Names of the files to fetch from `allpaths/ASSEMBLIES/run/`.
        ploidy: Forwarded to `prepare` as `--ploidy=<ploidy>`.

    Returns:
        None. Nothing is run when `need_update` says the outputs are current.
    """
    from jcvi.assembly.allpaths import prepare

    logging.debug("Work on {0}".format(pf))
    asm = [x.replace("final", pf) for x in target]
    if not need_update(pf, asm):
        logging.debug("Assembly found: {0}. Skipped.".format(asm))
        return

    cwd = os.getcwd()
    os.chdir(pf)
    try:
        prepare([pf] + sorted(glob("*.fastq") + glob("*.fastq.gz")) +
                ["--ploidy={0}".format(ploidy)])
        sh("./run.sh")

        # Copy each result one level up under its prefixed name.
        for a, t in zip(asm, target):
            sh("cp allpaths/ASSEMBLIES/run/{0} ../{1}".format(t, a))

        logging.debug("Assembly finished: {0}".format(asm))
    finally:
        # A failing step must not leave the process inside `pf`.
        os.chdir(cwd)
Пример #27
0
def get_edges(weightsfiles=None):
    """
    Build a symmetric edge-weight lookup from three-column weight rows.

    Args:
        weightsfiles: Files to read; defaults to every `*.weights` file in
            the current directory.

    Returns:
        dict with both `(a, b)` and `(b, a)` mapped to the integer weight of
        each row `a b weight`. Rows read later replace earlier ones.
    """
    if weightsfiles is None:
        weightsfiles = glob("*.weights")

    edges = {}
    for line in must_open(weightsfiles):
        left, right, raw_weight = line.split()
        weight = int(raw_weight)
        # Store both directions so lookups do not depend on node order.
        edges[(left, right)] = edges[(right, left)] = weight
    return edges
Пример #28
0
def get_edges(weightsfiles=None):
    """
    Load pair weights, retrievable in either node order.

    Args:
        weightsfiles: Input files; when None, all `*.weights` files in the
            current directory are read.

    Returns:
        dict: `(a, b)` and `(b, a)` -> int weight, for every row that splits
        into the three fields `a`, `b` and the weight. A row with another
        number of fields raises ValueError.
    """
    sources = glob("*.weights") if weightsfiles is None else weightsfiles

    lookup = {}
    for line in must_open(sources):
        node_a, node_b, value = line.split()
        value = int(value)
        for pair in ((node_a, node_b), (node_b, node_a)):
            lookup[pair] = value
    return lookup
Пример #29
0
def preparegb(p, args):
    """
    Add the GenBank input options to parser `p`, parse `args`, and collect
    the requested inputs.

    Args:
        p: Option parser; must provide `add_option`, `parse_args` and
            `print_help`.
        args: Command-line arguments to parse.

    Returns:
        (filenames, accessions, idfile, opts, args):
        `filenames` is the list of `<gb_dir>/*.gb` files, or None without
        --gb_dir; `accessions` is the list of IDs from --id or --simple
        (--simple wins when both are given), or None; `idfile` is the name of
        the written ID list, or None when no IDs were given; `opts` and `args`
        are the parse results.

    Exits via `sys.exit` after printing help when none of --gb_dir, --id and
    --simple is given.
    """
    p.add_option("--gb_dir",
                 default=None,
                 help="path to dir containing GanBank files (.gb)")
    p.add_option(
        "--id",
        default=None,
        help="GenBank accession IDs in a file. One ID per row, or all IDs"
        " in one row comma separated.",
    )
    p.add_option(
        "--simple",
        default=None,
        type="string",
        help="GenBank accession IDs comma separated "
        "(for lots of IDs please use --id instead).",
    )
    p.add_option(
        "--individual",
        default=False,
        action="store_true",
        help="parse gb accessions individually",
    )
    opts, args = p.parse_args(args)
    accessions = opts.id
    filenames = opts.gb_dir

    if not (opts.gb_dir or opts.id or opts.simple):
        sys.exit(not p.print_help())

    if opts.gb_dir:
        filenames = glob(opts.gb_dir + "/*.gb")

    if opts.id:
        accessions = []
        # Close the ID file promptly, and drop the empty entries that blank
        # lines or stray commas would otherwise turn into empty accessions.
        with open(opts.id) as fp:
            for row in fp:
                accessions += [x.strip() for x in row.split(",") if x.strip()]

    if opts.simple:
        # Same clean-up as for --id, so "A, B" and "A,B," both give A and B.
        accessions = [x.strip() for x in opts.simple.split(",") if x.strip()]

    if opts.id or opts.simple:
        # Persist the final ID list, one per line.
        fw = must_open("GenBank_accession_IDs.txt", "w")
        for atom in accessions:
            print(atom, file=fw)
        fw.close()
        idfile = fw.name
    else:
        idfile = None

    return filenames, accessions, idfile, opts, args
Пример #30
0
def soap_trios(p, pf, tag, extra):
    """
    Take one pair of reads and 'widow' reads after correction and run SOAP.

    Args:
        p: Input read files (logged comma-joined; given to `need_update` and
            `slink`).
        pf: Prefix; the working folder name and the stem of the output
            `<pf>.closed.scafSeq`.
        tag: Forwarded to `slink`.
        extra: Forwarded to `slink`.

    Returns:
        None. Nothing is run when `need_update` says the assembly is current.
    """
    from jcvi.assembly.soap import prepare

    logging.debug("Work on {0} ({1})".format(pf, ",".join(p)))
    asm = "{0}.closed.scafSeq".format(pf)
    if not need_update(p, asm):
        logging.debug("Assembly found: {0}. Skipped.".format(asm))
        return

    slink(p, pf, tag, extra)

    cwd = os.getcwd()
    os.chdir(pf)
    try:
        prepare(sorted(glob("*.fastq") + glob("*.fastq.gz")) +
                ["--assemble_1st_rank_only", "-K 31"])
        sh("./run.sh")
        # The result is copied one level up under its prefixed name.
        sh("cp asm31.closed.scafSeq ../{0}".format(asm))
        logging.debug("Assembly finished: {0}".format(asm))
    finally:
        # Leave the working folder even when a step fails, so later relative
        # paths in the caller still resolve.
        os.chdir(cwd)
Пример #31
0
Файл: base.py Проект: rrane/jcvi
def get_libs(args):
    """
    Group read files into libraries and order the libraries by size.

    Args:
        args: File names; when empty, every `*.fastq*` entry in the current
            directory is used instead. Each file must exist.

    Returns:
        List of `(Library, sorted_files)` pairs sorted by `Library.size`.
        The library name is the first two "-"-separated pieces of the file's
        base name up to its first dot.
    """
    from itertools import groupby

    fnames = sorted(args or glob("*.fastq*"))
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    def library_name(path):
        stem = op.basename(path).split(".")[0]
        return "-".join(stem.split("-")[:2])

    # NOTE: files are sorted by full name, not by library name, so groupby()
    # only joins files of one library when they end up adjacent.
    libs = []
    for name, members in groupby(fnames, key=library_name):
        libs.append((Library(name), sorted(members)))

    libs.sort(key=lambda entry: entry[0].size)
    return libs
Пример #32
0
def dn(args):
    """
    %prog dn folder

    Run Trinity-DN on a folder of reads. When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain "_1_" and "_2_".
    """
    # The docstring above is also the usage text given to OptionParser.
    # Besides "_1_"/"_2_", paired mode also accepts ".1."/".2." in the names.
    p = OptionParser(dn.__doc__)
    p.add_option("--paired", default=False, action="store_true",
                 help="Paired-end mode [default: %default]")
    p.set_home("trinity")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    paired = opts.paired
    thome = opts.trinity_home
    tfolder = folder + "_DN"

    cwd = os.getcwd()
    mkdir(tfolder)
    os.chdir(tfolder)
    try:
        # glob() returns names in arbitrary order; sorting keeps the left and
        # right files in the same sample order, so that the merged files stay
        # in step with each other.
        flist = sorted(glob("../" + folder + "/*"))
        if paired:
            f1 = [x for x in flist if "_1_" in x or ".1." in x]
            f2 = [x for x in flist if "_2_" in x or ".2." in x]
            assert len(f1) == len(f2)
            r1, r2 = "left.fastq", "right.fastq"
            reads = ((f1, r1), (f2, r2))
        else:
            r = "single.fastq"
            reads = ((flist, r), )

        # Concatenate each group of inputs into its single merged file.
        for fl, r in reads:
            fm = FileMerger(fl, r)
            fm.merge(checkexists=True)

        cmd = op.join(thome, "Trinity.pl")
        cmd += " --seqType fq --JM 100G --CPU {0}".format(opts.cpus)
        if paired:
            cmd += " --left {0} --right {1}".format(reads[0][-1], reads[1][-1])
        else:
            cmd += " --single {0}".format(reads[0][-1])

        # The command is only written to run.sh here, not executed.
        runfile = "run.sh"
        write_file(runfile, cmd, meta="run script")
    finally:
        # Return to the starting directory even when merging or writing fails.
        os.chdir(cwd)
Пример #33
0
def error(args):
    """
    %prog error version backup_folder

    Find all errors in ../5-consensus/*.err and pull the error unitigs into
    backup/ folder.
    """
    # The docstring above is also the usage text given to OptionParser.
    p = OptionParser(error.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    version, backup_folder = args
    mkdir(backup_folder)

    seen = set()
    failed_marker = failed.upper()
    # Both the log and every .err file are closed deterministically, even
    # when pull() or sh() raises.
    with open("errors.log", "w") as fw:
        for g in glob("../5-consensus/*.err"):
            if "partitioned" in g:
                continue

            # The partition number is the last "_"-separated piece of the
            # file name before ".err".
            partID = op.basename(g).rsplit(".err", 1)[0]
            partID = int(partID.split("_")[-1])

            # Forget the unitig of the previous file: a failure reported
            # before any `working` row has no unitig to be attributed to.
            unitigID = None
            with open(g) as fp:
                for row in fp:
                    if row.startswith(working):
                        unitigID = row.split("(")[0].split()[-1]
                        continue

                    if unitigID is None or failed_marker not in row.upper():
                        continue

                    uu = (version, partID, unitigID)
                    if uu in seen:
                        continue
                    seen.add(uu)

                    # write() with an explicit newline matches the old print
                    # statement and works on Python 2 and 3.
                    fw.write("\t".join(str(x) for x in (partID, unitigID))
                             + "\n")

                    s = [str(x) for x in uu]
                    unitigfile = pull(s)
                    cmd = "mv {0} {1}".format(unitigfile, backup_folder)
                    sh(cmd)

    logging.debug("A total of {0} unitigs saved to {1}.".format(
        len(seen), backup_folder))
Пример #34
0
def error(args):
    """
    %prog error version backup_folder

    Find all errors in ../5-consensus/*.err and pull the error unitigs into
    backup/ folder.
    """
    # The docstring above is also the usage text given to OptionParser.
    p = OptionParser(error.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    version, backup_folder = args
    mkdir(backup_folder)

    seen = set()
    failed_marker = failed.upper()
    # Context managers close the log and each .err file even when pull() or
    # sh() raises part-way through.
    with open("errors.log", "w") as fw:
        for g in glob("../5-consensus/*.err"):
            if "partitioned" in g:
                continue

            # Partition number: last "_"-separated piece before ".err".
            partID = op.basename(g).rsplit(".err", 1)[0]
            partID = int(partID.split("_")[-1])

            # Reset per file so that a failure seen before any `working` row
            # is neither a NameError nor blamed on the previous file's unitig.
            unitigID = None
            with open(g) as fp:
                for row in fp:
                    if row.startswith(working):
                        unitigID = row.split("(")[0].split()[-1]
                        continue

                    if unitigID is None or failed_marker not in row.upper():
                        continue

                    uu = (version, partID, unitigID)
                    if uu in seen:
                        continue
                    seen.add(uu)

                    print("\t".join(str(x) for x in (partID, unitigID)),
                          file=fw)

                    # Pull the unitig and move the resulting file away.
                    s = [str(x) for x in uu]
                    unitigfile = pull(s)
                    cmd = "mv {0} {1}".format(unitigfile, backup_folder)
                    sh(cmd)

    logging.debug("A total of {0} unitigs saved to {1}.".format(
        len(seen), backup_folder))
Пример #35
0
def get_libs(args):
    """
    Collect read files per library, smallest library first.

    Args:
        args: File names to use; falls back to `*.fastq*` in the current
            directory when empty. A missing file fails the assertion.

    Returns:
        List of `(Library(name), sorted file names)` tuples ordered by the
        library's `size`. `name` joins, with "-", the first two
        "-"-separated pieces of the base name before its first dot.
    """
    from itertools import groupby

    fnames = args or glob("*.fastq*")
    fnames = sorted(fnames)
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    def library_name(path):
        before_dot = op.basename(path).split(".")[0]
        leading = before_dot.split("-")[:2]
        return "-".join(leading)

    # groupby() merges adjacent files only; the input is ordered by file
    # name, not by library name.
    grouped = groupby(fnames, key=library_name)
    libs = [(Library(name), sorted(files)) for name, files in grouped]

    libs.sort(key=lambda pair: pair[0].size)
    return libs
Пример #36
0
Файл: sam.py Проект: rrane/jcvi
def merge(args):
    """
    %prog merge merged_bams bams1_dir bams2_dir ...

    Merge BAM files. Treat the bams with the same prefix as a set.
    Output the commands first.
    """
    # NOTE: the docstring above doubles as the usage text for OptionParser.
    from jcvi.apps.softlink import get_abs_path
    from jcvi.apps.grid import MakeManager

    p = OptionParser(merge.__doc__)
    p.add_option("--sep", default="_",
                 help="Separator to group per prefix")
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    merged_bams = args[0]
    bamdirs = args[1:]

    mkdir(merged_bams)
    bams = []
    for x in bamdirs:
        bams += glob(op.join(x, "*.bam"))
    # Leave out every file whose path contains "nsorted"
    bams = [x for x in bams if "nsorted" not in x]

    logging.debug("Found a total of {0} BAM files.".format(len(bams)))

    # Files sharing the part of the basename before the first separator are
    # merged together; groupby() needs the list sorted on that same key.
    sep = opts.sep
    key = lambda x: op.basename(x).split(sep)[0]
    bams.sort(key=key)
    mm = MakeManager()
    for prefix, files in groupby(bams, key=key):
        files = sorted(files)
        nfiles = len(files)
        source = " ".join(files)
        # The merged file is named after the first member of the group
        target = op.join(merged_bams, op.basename(files[0]))
        if nfiles == 1:
            # Nothing to merge: link to the only input instead
            source = get_abs_path(source)
            cmd = "ln -s {0} {1}".format(source, target)
            mm.add("", target, cmd)
        else:
            cmds = []
            # -f: the target normally does not exist on the first run, and a
            # plain `rm` would then exit non-zero before the merge is reached
            cmds.append("rm -f {0}".format(target))
            cmds.append("samtools merge {0} {1}".format(target, source))
            mm.add(files, target, cmds)
    mm.write()
Пример #37
0
def merge(args):
    """
    %prog merge merged_bams bams1_dir bams2_dir ...

    Merge BAM files. Treat the bams with the same prefix as a set.
    Output the commands first.
    """
    # NOTE: the docstring above doubles as the usage text for OptionParser.
    from jcvi.apps.grid import MakeManager

    p = OptionParser(merge.__doc__)
    p.set_sep(sep="_", help="Separator to group per prefix")
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    merged_bams, bamdirs = args[0], args[1:]
    mkdir(merged_bams)

    # Gather the BAM files of every input folder, leaving out any file whose
    # path contains "nsorted"
    bams = [
        bam
        for bamdir in bamdirs
        for bam in glob(op.join(bamdir, "*.bam"))
        if "nsorted" not in bam
    ]

    logging.debug("Found a total of {0} BAM files.".format(len(bams)))

    sep = opts.sep

    def prefix_of(path):
        # Grouping key: the basename up to the first separator
        return op.basename(path).split(sep)[0]

    # groupby() only groups adjacent items, hence the sort on the same key
    bams.sort(key=prefix_of)
    mm = MakeManager()
    for _, group in groupby(bams, key=prefix_of):
        members = sorted(group)
        # The output is named after the first member of the group
        target = op.join(merged_bams, op.basename(members[0]))
        if len(members) == 1:
            # A lone file is simply linked into the output folder
            linked = get_abs_path(members[0])
            mm.add("", target, "ln -s {0} {1}".format(linked, target))
            continue

        merge_cmd = "samtools merge -@ 8 {0} {1}".format(
            target, " ".join(members)
        )
        mm.add(members, target, merge_cmd, remove=True)
    mm.write()
Пример #38
0
def merge(args):
    """
    %prog merge outdir output.gff

    Follow-up command after grid jobs are completed after parallel().
    """
    # NOTE: the docstring above doubles as the usage text for OptionParser.
    from jcvi.formats.gff import merge as gmerge

    p = OptionParser(merge.__doc__)
    p.set_home("maker")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    outdir, outputgff = args
    fsnames, suffix = get_fsnames(outdir)
    nfs = len(fsnames)
    cmd = op.join(opts.maker_home, "bin/gff3_merge")

    # Write the helper script from the module-level `mergesh` template
    outfile = "merge.sh"
    write_file(outfile, mergesh.format(suffix, cmd))

    # Generate per split directory
    # Note that gff3_merge write to /tmp, so I limit processes here to avoid
    # filling up disk space
    sh("parallel -j 8 merge.sh {} ::: " + " ".join(fsnames))

    # One final output
    gffnames = glob("*.all.gff")
    assert len(gffnames) == nfs

    # Again, DO NOT USE gff3_merge to merge with a smallish /tmp/ area
    gfflist = "gfflist"
    with open(gfflist, "w") as fw:
        print("\n".join(gffnames), file=fw)

    # Count the lines back through a context manager so that the read handle
    # is closed as well
    with open(gfflist) as fp:
        nlines = sum(1 for _ in fp)
    assert nlines == nfs  # Be extra, extra careful to include all results
    gmerge([gfflist, "-o", outputgff])
    logging.debug("Merged GFF file written to `{0}`".format(outputgff))
Пример #39
0
def assemble(args):
    """
    %prog assemble sffdir

    Assemble each BAC separately using newbler.
    """
    # NOTE: the docstring above doubles as the usage text for OptionParser.
    from jcvi.formats.fasta import join

    p = OptionParser(assemble.__doc__)
    p.add_option("--overwrite", default=False, action="store_true",
            help="Overwrite the separate BAC assembly [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    sffdir = args[0]
    asmdir, fastadir = "newbler", "fasta"
    for folder in (asmdir, fastadir):
        mkdir(folder, overwrite=opts.overwrite)

    for sffile in glob("{0}/*.sff".format(sffdir)):
        # The second dot-separated field of the file name, lower-cased, names
        # both the assembly folder and the output FASTA
        bac = op.basename(sffile).split(".")[1].lower()
        outdir = op.join(asmdir, bac)
        if op.exists(outdir):
            logging.debug("`{0}` exists. Ignored.".format(outdir))
            continue

        sh("runAssembly -cpu 8 -o {0} {1}".format(outdir, sffile))

        ctgfile = op.join(outdir, "454LargeContigs.fna")
        if not op.exists(ctgfile):  # newbler failure
            logging.error("File `{0}` not found (newbler failure).".\
                    format(ctgfile))
            continue

        # Join the contigs into one record per BAC
        join([
            ctgfile,
            op.join(fastadir, "{0}.fasta".format(bac)),
            "--newid={0}".format(bac),
            "--minctgsize=200",
        ])
Пример #40
0
def merge(args):
    """
    %prog merge outdir output.gff

    Follow-up command after grid jobs are completed after parallel().
    """
    # NOTE: the docstring above doubles as the usage text for OptionParser.
    from jcvi.formats.gff import merge as gmerge

    p = OptionParser(merge.__doc__)
    p.set_home("maker")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    outdir, outputgff = args
    fsnames, suffix = get_fsnames(outdir)
    nfs = len(fsnames)
    cmd = op.join(opts.maker_home, "bin/gff3_merge")

    # Write the helper script from the module-level `mergesh` template
    outfile = "merge.sh"
    write_file(outfile, mergesh.format(suffix, cmd))

    # Generate per split directory
    # Note that gff3_merge write to /tmp, so I limit processes here to avoid
    # filling up disk space
    sh("parallel -j 8 merge.sh {} ::: " + " ".join(fsnames))

    # One final output
    gffnames = glob("*.all.gff")
    assert len(gffnames) == nfs

    # Again, DO NOT USE gff3_merge to merge with a smallish /tmp/ area
    gfflist = "gfflist"
    with open(gfflist, "w") as fw:
        print("\n".join(gffnames), file=fw)

    # Count the lines back through a context manager so that the read handle
    # is closed as well
    with open(gfflist) as fp:
        nlines = sum(1 for _ in fp)
    assert nlines == nfs  # Be extra, extra careful to include all results
    gmerge([gfflist, "-o", outputgff])
    logging.debug("Merged GFF file written to `{0}`".format(outputgff))
Пример #41
0
def omgparse(args):
    """
    %prog omgparse work

    Parse the OMG outputs to get gene lists.
    """
    # NOTE: the docstring above doubles as the usage text for OptionParser.
    p = OptionParser(omgparse.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    work, = args
    omgfiles = glob(op.join(work, "gf*.out"))
    for omgfile in omgfiles:
        omg = OMGFile(omgfile)
        best = omg.best()
        for bb in best:
            # Each entry is a sequence of (gene, taxon) pairs; print the genes
            # and the taxa as two comma-joined, tab-separated columns
            genes, taxa = zip(*bb)
            # Single-argument call form: valid both as a Python 2 print
            # statement and as the Python 3 print function
            print("\t".join((",".join(genes), ",".join(taxa))))
Пример #42
0
def omgparse(args):
    """
    %prog omgparse work

    Parse the OMG outputs to get gene lists.
    """
    # NOTE: the docstring above doubles as the usage text for OptionParser.
    p = OptionParser(omgparse.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    work, = args
    omgfiles = glob(op.join(work, "gf*.out"))
    for omgfile in omgfiles:
        omg = OMGFile(omgfile)
        best = omg.best()
        for bb in best:
            # Each entry is a sequence of (gene, taxon) pairs; print the genes
            # and the taxa as two comma-joined, tab-separated columns
            genes, taxa = zip(*bb)
            # Single-argument call form: valid both as a Python 2 print
            # statement and as the Python 3 print function
            print("\t".join((",".join(genes), ",".join(taxa))))
Пример #43
0
    def __init__(self, filenames=None, accessions=None, idfile=None):
        """
        Load GenBank records into this mapping, keyed by the record ID with
        everything after the first dot removed.

        filenames: list of GenBank (.gb) files to parse; when given, the
            accessions are taken from the file basenames instead.
        accessions: accession IDs, stored as-is on the instance.
        idfile: accession ID file; when `filenames` is None the records are
            read from the `*.gb` files of the directory returned by
            self._get_records().

        Exits through sys.exit() when neither `filenames` nor `idfile` is
        given.
        """
        self.accessions = accessions
        self.idfile = idfile

        if filenames is not None:
            self.accessions = [op.basename(f).split(".")[0] for f in filenames]
            gbfiles = filenames
        elif idfile is not None:
            gbdir = self._get_records()
            gbfiles = glob(gbdir+"/*.gb")
        else:
            sys.exit("GenBank object is initiated from either gb files or "\
                "accession IDs.")

        # One record is kept per file (the first item of the parsed dict).
        # list(...) is required because dict views cannot be indexed on
        # Python 3; .items() replaces the Python 2-only .iteritems().
        d = dict(list(SeqIO.to_dict(SeqIO.parse(f, "gb")).items())[0] \
            for f in gbfiles)
        for (k, v) in d.items():
            self[k.split(".")[0]] = v
Пример #44
0
    def __init__(self, filenames=None, accessions=None, idfile=None):
        """
        Load GenBank records into this mapping, keyed by the record ID with
        everything after the first dot removed.

        filenames: list of GenBank (.gb) files to parse; when given, the
            accessions are taken from the file basenames instead.
        accessions: accession IDs, stored as-is on the instance.
        idfile: accession ID file; when `filenames` is None the records are
            read from the `*.gb` files of the directory returned by
            self._get_records().

        Exits through sys.exit() when neither `filenames` nor `idfile` is
        given.
        """
        self.accessions = accessions
        self.idfile = idfile

        if filenames is not None:
            self.accessions = [op.basename(f).split(".")[0] for f in filenames]
            gbfiles = filenames
        elif idfile is not None:
            gbdir = self._get_records()
            gbfiles = glob(gbdir+"/*.gb")
        else:
            sys.exit("GenBank object is initiated from either gb files or "\
                "accession IDs.")

        # One record is kept per file (the first item of the parsed dict).
        # list(...) is required because dict views cannot be indexed on
        # Python 3; .items() replaces the Python 2-only .iteritems().
        d = dict(list(SeqIO.to_dict(SeqIO.parse(f, "gb")).items())[0] \
            for f in gbfiles)
        for (k, v) in d.items():
            self[k.split(".")[0]] = v
Пример #45
0
def preparegb(p, args):
    """
    Add the GenBank input options to parser `p`, parse `args` and resolve
    which records were requested.

    Returns a tuple `(filenames, accessions, idfile, opts, args)`:
    filenames is the list of `*.gb` files under --gb_dir (None without that
    option), accessions is the list of IDs given through --id or --simple
    (None without them; --simple wins when both are given), idfile is the
    name of the file the accessions were written to (None when no accession
    was given), and opts/args are the parser results.

    Prints the help and exits when none of --gb_dir, --id, --simple is given.
    """
    p.add_option("--gb_dir", default=None,
            help="path to dir containing GenBank files (.gb)")
    p.add_option("--id", default=None,
            help="GenBank accession IDs in a file. One ID per row, or all IDs" \
            " in one row comma separated.")
    p.add_option("--simple", default=None, type="string",
            help="GenBank accession IDs comma separated " \
            "(for lots of IDs please use --id instead).")
    p.add_option("--individual", default=False, action="store_true",
            help="parse gb accessions individually [default: %default]")
    opts, args = p.parse_args(args)
    accessions = opts.id
    filenames = opts.gb_dir

    if not (opts.gb_dir or opts.id or opts.simple):
        sys.exit(not p.print_help())

    if opts.gb_dir:
        filenames = glob(opts.gb_dir+"/*.gb")

    if opts.id:
        # open() within a context manager replaces the Python 2-only file()
        # builtin and makes sure the handle is closed
        accessions = []
        with open(opts.id) as fp:
            for row in fp:
                accessions += [x.strip() for x in row.strip().split(",")]

    if opts.simple:
        accessions = opts.simple.split(",")

    if opts.id or opts.simple:
        # Save the accessions, one per line, so they can be passed on as a
        # file; write() behaves the same on Python 2 and 3, unlike `print >>`
        fw = must_open("GenBank_accession_IDs.txt", "w")
        for atom in accessions:
            fw.write("{0}\n".format(atom))
        fw.close()
        idfile = fw.name
    else:
        idfile = None

    return (filenames, accessions, idfile, opts, args)
Пример #46
0
def draw_tree(
    ax,
    t,
    hpd=None,
    margin=0.1,
    rmargin=0.2,
    tip=0.01,
    treecolor="k",
    supportcolor="k",
    internal=True,
    outgroup=None,
    dashedoutgroup=False,
    reroot=True,
    gffdir=None,
    sizes=None,
    trunc_name=None,
    SH=None,
    scutoff=0,
    leafcolor="k",
    leaffont=12,
    leafinfo=None,
    wgdinfo=None,
    geoscale=False,
):
    """
    main function for drawing phylogenetic tree

    The tree is laid out on a unit canvas: x grows with the distance from the
    root, leaves are stacked bottom-up at equal y intervals.

    ax: axes that receive all text() / plot() calls.
    t: tree object; it must provide the methods used below
        (get_common_ancestor, set_outgroup, traverse, get_farthest_leaf, ...).
    hpd: optional mapping of node name to an (a, b) pair of divergence
        values, drawn as a horizontal bar at that node.
    margin, rmargin: space left free on the left/bottom and on the right.
    tip: small offset used for leaf labels and scale-bar ticks.
    treecolor, supportcolor, leafcolor: colors of branches, support values
        and leaf labels; a leaf color may be given as an "R,G,B" string. A
        false-y `supportcolor` turns support values off.
    internal: label named internal nodes with a TextCircle.
    outgroup: leaf names used for rooting; midpoint rooting when empty.
    dashedoutgroup: draw the root's branches dashed and move 90% of the
        root-to-children distance onto the first child.
    reroot: re-root the tree before drawing.
    gffdir: folder of GFF files used to draw exon structures next to leaves.
    sizes: passed to Sizes() to annotate leaves with a protein length.
    trunc_name: rule handed to truncate_name() for leaf labels.
    SH: when not None, printed as the "SH test against ref tree" value.
    scutoff: only support values above this percentage are printed.
    leaffont: font size of labels.
    leafinfo: optional mapping of leaf name to an object with `color` and
        `new_name` attributes overriding color and label.
    wgdinfo: passed through to draw_wgd().
    geoscale: draw a geological time scale instead of the plain scale bar.
    """

    if reroot:
        if outgroup:
            R = t.get_common_ancestor(*outgroup)
        else:
            # Calculate the midpoint node
            R = t.get_midpoint_outgroup()

        if R is not t:
            t.set_outgroup(R)

        # By default, the distance to outgroup and non-outgroup is the same
        # we re-adjust the distances so that the outgroups will appear
        # farthest from everything else
        if dashedoutgroup:
            a, b = t.children
            # Avoid even split
            total = a.dist + b.dist
            a.dist = 0.9 * total
            b.dist = total - a.dist

    farthest, max_dist = t.get_farthest_leaf()
    print("max_dist = {}".format(max_dist), file=sys.stderr)

    xstart = margin
    ystart = 2 * margin
    # scale the tree
    scale = (1 - margin - rmargin) / max_dist

    def rescale(dist):
        return xstart + scale * dist

    def rescale_divergence(divergence):
        # Divergence is measured back from the farthest leaf
        return rescale(max_dist - divergence)

    num_leaves = len(t.get_leaf_names())
    yinterval = (1 - ystart) / num_leaves

    # get exons structures, if any
    structures = {}
    if gffdir:
        gffiles = glob("{0}/*.gff*".format(gffdir))
        setups, ratio = get_setups(gffiles, canvas=rmargin / 2, noUTR=True)
        structures = dict((a, (b, c)) for a, b, c in setups)

    if sizes:
        sizes = Sizes(sizes).mapping

    # Postorder guarantees the children of a node are placed before the node
    coords = {}
    i = 0
    for n in t.traverse("postorder"):
        dist = n.get_distance(t)
        xx = rescale(dist)

        if n.is_leaf():
            yy = ystart + i * yinterval
            i += 1

            if trunc_name:
                name = truncate_name(n.name, rule=trunc_name)
            else:
                name = n.name

            if leafinfo and n.name in leafinfo:
                line = leafinfo[n.name]
                lc = line.color
                sname = line.new_name
            else:
                lc = leafcolor
                sname = None
            lc = lc or "k"
            sname = sname or name.replace("_", "-")
            # if color is given as "R,G,B"
            if "," in lc:
                lc = [float(x) for x in lc.split(",")]

            ax.text(
                xx + tip,
                yy,
                markup(sname),
                va="center",
                fontstyle="italic",
                size=leaffont,
                color=lc,
            )

            gname = n.name.split("_")[0]
            if gname in structures:
                mrnabed, cdsbeds = structures[gname]
                ExonGlyph(
                    ax,
                    1 - rmargin / 2,
                    yy,
                    mrnabed,
                    cdsbeds,
                    align="right",
                    ratio=ratio,
                )
            if sizes and gname in sizes:
                size = sizes[gname]
                # base pair converted to amino acid; floor division keeps the
                # label an integer count under Python 3 true division
                size = size // 3 - 1
                size = "{0}aa".format(size)
                ax.text(1 - rmargin / 2 + tip, yy, size, size=leaffont)

        else:
            linestyle = "--" if (dashedoutgroup and n is t) else "-"
            children = [coords[x] for x in n.get_children()]
            children_x, children_y = zip(*children)
            min_y, max_y = min(children_y), max(children_y)
            # plot the vertical bar
            ax.plot((xx, xx), (min_y, max_y), linestyle, color=treecolor)
            # plot the horizontal bar
            for cx, cy in children:
                ax.plot((xx, cx), (cy, cy), linestyle, color=treecolor)
            # An internal node sits at the mean height of its children
            yy = sum(children_y) * 1.0 / len(children_y)
            # plot HPD if exists
            if hpd and n.name in hpd:
                a, b = hpd[n.name]
                ax.plot(
                    (rescale_divergence(a), rescale_divergence(b)),
                    (yy, yy),
                    "-",
                    color="darkslategray",
                    alpha=0.4,
                    lw=2,
                )
            # Support values above 1 are taken as percentages
            support = n.support
            if support > 1:
                support = support / 100.0
            if not n.is_root() and supportcolor:
                if support > scutoff / 100.0:
                    ax.text(
                        xx,
                        yy + 0.005,
                        "{0:d}".format(int(abs(support * 100))),
                        ha="right",
                        size=leaffont,
                        color=supportcolor,
                    )
            if internal and n.name:
                TextCircle(ax, xx, yy, n.name, size=9)

        coords[n] = (xx, yy)
        # WGD info
        draw_wgd(ax, yy, rescale_divergence, n.name, wgdinfo)

    # scale bar
    if geoscale:
        draw_geoscale(ax,
                      margin=margin,
                      rmargin=rmargin,
                      yy=margin,
                      max_dist=max_dist)
    else:
        br = 0.1
        x1 = xstart + 0.1
        x2 = x1 + br * scale
        yy = margin
        ax.plot([x1, x1], [yy - tip, yy + tip], "-", color=treecolor)
        ax.plot([x2, x2], [yy - tip, yy + tip], "-", color=treecolor)
        ax.plot([x1, x2], [yy, yy], "-", color=treecolor)
        ax.text(
            (x1 + x2) / 2,
            yy - tip,
            "{0:g}".format(br),
            va="top",
            ha="center",
            size=leaffont,
            color=treecolor,
        )

    if SH is not None:
        # Same anchor as the left end of the plain scale bar, computed here
        # directly so that the label also works when geoscale is on (the
        # scale-bar variables do not exist in that case)
        xs = xstart + 0.1
        ys = margin
        ax.text(
            xs,
            ys,
            "SH test against ref tree: {0}".format(SH),
            ha="left",
            size=leaffont,
            color="g",
        )

    normalize_axes(ax)
Пример #47
0
def draw_tree(ax, tx, rmargin=.3, leafcolor="k", supportcolor="k",
              outgroup=None, reroot=True, gffdir=None, sizes=None,
              trunc_name=None, SH=None, scutoff=0, barcodefile=None,
              leafcolorfile=None, leaffont=12):
    """
    main function for drawing phylogenetic tree

    The tree is laid out on a unit canvas: x grows with the distance from the
    root, leaves are stacked top-down at equal y intervals and a scale bar is
    drawn below the last leaf.

    ax: axes that receive all text() / plot() calls.
    tx: tree description handed to Tree().
    rmargin: space left free on the right of the tree.
    leafcolor, supportcolor: default colors of leaf labels and support values.
    outgroup: leaf names used for rooting; midpoint rooting when empty.
    reroot: re-root the tree before drawing.
    gffdir: folder of GFF files used to draw exon structures next to leaves.
    sizes: passed to Sizes() to annotate leaves with a protein length.
    trunc_name: rule handed to truncate_name() for leaf labels.
    SH: when not None, printed as the "SH test against ref tree" value.
    scutoff: only support values above this percentage are printed.
    barcodefile: tab-delimited file used by decode_name() on leaf names.
    leafcolorfile: tab-delimited file mapping leaf name to a color, which may
        be given as an "R,G,B" string.
    leaffont: font size of labels.
    """

    t = Tree(tx)
    if reroot:
        if outgroup:
            R = t.get_common_ancestor(*outgroup)
        else:
            # Calculate the midpoint node
            R = t.get_midpoint_outgroup()

        if R != t:
            t.set_outgroup(R)

    farthest, max_dist = t.get_farthest_leaf()

    margin = .05
    xstart = margin
    ystart = 1 - margin
    canvas = 1 - rmargin - 2 * margin
    tip = .005
    # scale the tree
    scale = canvas / max_dist

    # One extra interval is reserved for the scale bar below the leaves
    num_leaves = len(t.get_leaf_names())
    yinterval = canvas / (num_leaves + 1)

    # get exons structures, if any
    structures = {}
    if gffdir:
        gffiles = glob("{0}/*.gff*".format(gffdir))
        setups, ratio = get_setups(gffiles, canvas=rmargin / 2, noUTR=True)
        structures = dict((a, (b, c)) for a, b, c in setups)

    if sizes:
        sizes = Sizes(sizes).mapping

    if barcodefile:
        barcodemap = DictFile(barcodefile, delimiter="\t")

    # Always bind the lookup table so that a missing color file simply means
    # "no per-leaf colors" rather than a NameError swallowed further down
    leafcolors = {}
    if leafcolorfile:
        leafcolors = DictFile(leafcolorfile, delimiter="\t")

    # Postorder guarantees the children of a node are placed before the node
    coords = {}
    i = 0
    for n in t.traverse("postorder"):
        dist = n.get_distance(t)
        xx = xstart + scale * dist

        if n.is_leaf():
            yy = ystart - i * yinterval
            i += 1

            if trunc_name:
                name = truncate_name(n.name, rule=trunc_name)
            else:
                name = n.name

            if barcodefile:
                name = decode_name(name, barcodemap)

            sname = name.replace("_", "-")

            try:
                lc = leafcolors[n.name]
            except KeyError:
                lc = leafcolor
            else:
                # if color is given as "R,G,B"; build a real list because
                # map() returns a one-shot iterator on Python 3
                if "," in lc:
                    lc = [float(x) for x in lc.split(",")]

            ax.text(xx + tip, yy, sname, va="center",
                    fontstyle="italic", size=leaffont, color=lc)

            gname = n.name.split("_")[0]
            if gname in structures:
                mrnabed, cdsbeds = structures[gname]
                ExonGlyph(ax, 1 - rmargin / 2, yy, mrnabed, cdsbeds,
                          align="right", ratio=ratio)
            if sizes and gname in sizes:
                size = sizes[gname]
                # base pair converted to amino acid; floor division gives the
                # same integer label on Python 2 and Python 3
                size = size // 3 - 1
                size = "{0}aa".format(size)
                ax.text(1 - rmargin / 2 + tip, yy, size, size=leaffont)

        else:
            children = [coords[x] for x in n.get_children()]
            children_x, children_y = zip(*children)
            min_y, max_y = min(children_y), max(children_y)
            # plot the vertical bar
            ax.plot((xx, xx), (min_y, max_y), "k-")
            # plot the horizontal bar
            for cx, cy in children:
                ax.plot((xx, cx), (cy, cy), "k-")
            # An internal node sits at the mean height of its children
            yy = sum(children_y) * 1. / len(children_y)
            # Support values above 1 are taken as percentages
            support = n.support
            if support > 1:
                support = support / 100.
            if not n.is_root():
                if support > scutoff / 100.:
                    ax.text(xx, yy+.005, "{0:d}".format(int(abs(support * 100))),
                        ha="right", size=leaffont, color=supportcolor)

        coords[n] = (xx, yy)

    # scale bar
    br = .1
    x1 = xstart + .1
    x2 = x1 + br * scale
    yy = ystart - i * yinterval
    ax.plot([x1, x1], [yy - tip, yy + tip], "k-")
    ax.plot([x2, x2], [yy - tip, yy + tip], "k-")
    ax.plot([x1, x2], [yy, yy], "k-")
    ax.text((x1 + x2) / 2, yy - tip, "{0:g}".format(br),
            va="top", ha="center", size=leaffont)

    if SH is not None:
        # Placed halfway between the bottom margin and the scale bar
        xs = x1
        ys = (margin + yy) / 2.
        ax.text(xs, ys, "SH test against ref tree: {0}"\
                .format(SH), ha="left", size=leaffont, color="g")
Пример #48
0
    def __init__(self,
                 fig,
                 root,
                 canvas,
                 chr,
                 xlim,
                 datadir,
                 order=None,
                 hlsuffix=None,
                 palette=None,
                 cap=50,
                 gauge="bottom",
                 plot_label=True,
                 plot_chr_label=True,
                 gauge_step=5000000,
                 vlines=None,
                 labels_dict={},
                 diverge=('r', 'g')):
        """
        Draw one stacked panel of XY tracks for chromosome `chr`.

        fig, root: figure that receives the new axes, and the axes used for
            the frame and the text labels.
        canvas: (x, y, w, h) of the panel.
        chr: chromosome name; data files are those in `datadir` whose name
            starts with it.
        xlim: (start, end) of the x axis.
        order: optional list of track labels (second dot-separated field of
            the data file path) used to select and order the tracks.
        hlsuffix: when given, `<label>.<hlsuffix>` in `datadir` is imported
            as highlight file of each track.
        palette: single color for all tracks; a 'Set2' palette otherwise.
        cap: upper limit passed to each track's cap().
        gauge: "top" or "bottom" position of the coordinate gauge.
        plot_label, plot_chr_label: draw the track / chromosome labels.
        gauge_step: tick spacing of the gauge.
        vlines: positions passed to each track's vlines().
        labels_dict: display names per label (only read, never modified).
        diverge: color pair passed on to import_hlfile().

        The y position of every track is kept in `self.yys`.
        """
        x, y, w, h = canvas
        p = .01
        # Frame around the whole panel
        root.add_patch(
            Rectangle((x - p, y - p),
                      w + 2 * p,
                      h + 2 * p,
                      lw=1,
                      fill=False,
                      ec="darkslategray",
                      zorder=10))
        datafiles = glob(op.join(datadir, chr + "*"))

        if order:
            datafiles = [z for z in datafiles if z.split(".")[1] in order]
            datafiles.sort(key=lambda x: order.index(x.split(".")[1]))

        # Take each label from its own data file instead of pairing the files
        # positionally with `order`: the pairing goes out of step as soon as
        # an entry of `order` has no data file, and fails when order is None
        labels = [z.split(".")[1] for z in datafiles]

        ntracks = len(datafiles)
        yinterval = h / ntracks
        yy = y + h

        if palette is None:
            # Get the palette
            set2 = get_map('Set2', 'qualitative', ntracks).mpl_colors
        else:
            set2 = [palette] * ntracks

        if gauge == "top":
            gauge_ax = fig.add_axes([x, yy + p, w, .0001])
            adjust_spines(gauge_ax, ["top"])
            tpos = yy + .07
        elif gauge == "bottom":
            gauge_ax = fig.add_axes([x, y - p, w, .0001])
            adjust_spines(gauge_ax, ["bottom"])
            tpos = y - .07

        start, end = xlim
        if gauge:
            # Steps below 1,000,000 switch on the float formatter
            fs = gauge_step < 1000000
            setup_gauge_ax(gauge_ax,
                           start,
                           end,
                           gauge_step,
                           float_formatter=fs)

        if plot_chr_label:
            root.text(x + w / 2,
                      tpos,
                      chr,
                      ha="center",
                      va="center",
                      color="darkslategray",
                      size=16)

        # Tracks are stacked from the top of the canvas downwards
        yys = []
        for label, datafile, c in zip(labels, datafiles, set2):
            yy -= yinterval
            yys.append(yy)
            ax = fig.add_axes([x, yy, w, yinterval * .9])
            xy = XYtrack(ax, datafile, color=c)
            xy.interpolate(end)
            xy.cap(ymax=cap)
            if vlines:
                xy.vlines(vlines)
            if hlsuffix:
                hlfile = op.join(datadir, ".".join((label, hlsuffix)))
                xy.import_hlfile(hlfile, chr, diverge=diverge)
            if plot_label:
                label = labels_dict.get(label, label.capitalize())
                label = r"\textit{{{0}}}".format(label)
                root.text(x - .015,
                          yy + yinterval / 2,
                          label,
                          ha="right",
                          va="center")
            xy.draw()
            ax.set_xlim(*xlim)

        self.yys = yys
Пример #49
0
def prepare(args):
    """
    %prog prepare "B. oleracea" *.fastq

    Scan input fastq files (see below) and create `in_groups.csv` and
    `in_libs.csv`. The species name does not really matter.
    """
    # NOTE: the docstring above (plus FastqNamings) is the usage text shown
    # by OptionParser, so it is kept verbatim.
    from jcvi.utils.table import write_csv
    from jcvi.formats.base import write_file
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("--corr", default=False, action="store_true",
                 help="Extra parameters for corrected data [default: %default]")
    p.add_option("--norun", default=False, action="store_true",
                 help="Don't write `run.sh` script [default: %default]")
    p.add_option("--ploidy", default="2", choices=("1", "2"),
                 help="Ploidy [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    organism_name = args[0]
    # Project name is made of the upper-cased initials of the organism name
    project_name = "".join(x[0] for x in organism_name.split()).upper()
    # Without explicit files, use every *.fastq* in the current directory
    fnames = sorted(glob("*.fastq*") if len(args) == 1 else args[1:])
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    # All files must share the quality offset guessed from the first one
    offset = guessoffset([fnames[0]])
    phred64 = offset == 64

    assert all(guessoffset([x]) == offset for x in fnames[1:])

    groupheader = "group_name library_name file_name".split()
    libheader = "library_name project_name organism_name type paired "\
        "frag_size frag_stddev insert_size insert_stddev read_orientation "\
        "genomic_start genomic_end".split()
    groupcontents = []
    libs = []
    for file_name in fnames:
        group_name = op.basename(file_name).split(".")[0]
        library_name = "-".join(group_name.split("-")[:2])

        # Handle paired files and convert to wildcard
        if ".1." in file_name:
            file_name = file_name.replace(".1.", ".?.")
        elif ".2." in file_name:
            continue

        groupcontents.append((group_name, library_name, file_name))
        if library_name not in libs:
            libs.append(library_name)

    libcontents = []
    for library_name in libs:
        L = Library(library_name)
        size = L.size
        stddev = L.stddev
        libtype = L.type
        paired = L.paired
        read_orientation = L.read_orientation

        # Size and stddev go into the frag_* columns for fragment libraries
        # and into the insert_* columns for every other type
        size = size or ""
        stddev = stddev or ""
        frag_size = size if libtype == "fragment" else ""
        frag_stddev = stddev if libtype == "fragment" else ""
        insert_size = size if libtype != "fragment" else ""
        insert_stddev = stddev if libtype != "fragment" else ""
        genomic_start, genomic_end = "", ""
        libcontents.append((library_name, project_name, organism_name, libtype, \
            paired, frag_size, frag_stddev, insert_size, insert_stddev, \
            read_orientation, genomic_start, genomic_end))

    write_csv(groupheader, groupcontents, filename="in_groups.csv", tee=True)
    # The message names the file that was actually written (in_groups.csv)
    logging.debug("`in_groups.csv` created (# of groups = {0}).".\
        format(len(groupcontents)))

    write_csv(libheader, libcontents, filename="in_libs.csv", tee=True)
    logging.debug("`in_libs.csv` created (# of libs = {0}).".\
        format(len(libcontents)))

    runfile = "run.sh"

    extra = ""
    if opts.corr:
        extra += "FE_NUM_CYCLES=1 EC_K=28 FE_QUAL_CEIL_RADIUS=0"
        extra += " REMOVE_DODGY_READS_FRAG=False FE_MAX_KMER_FREQ_TO_MARK=1"

    if not opts.norun:
        contents = ALLPATHSRUN.format(opts.ploidy, opts.cpus, phred64, extra)
        write_file(runfile, contents)
Пример #50
0
def htg(args):
    """
    %prog htg fastafile template.sbt

    Prepare sqnfiles for Genbank HTG submission to update existing records.

    `fastafile` contains the records to update, multiple records are allowed
    (with each one generating separate sqn file in the sqn/ folder). The record
    defline has the accession ID. For example,
    >AC148290.3

    Internally, this generates two additional files (phasefile and namesfile)
    and download records from Genbank. Below is implementation details:

    `phasefile` contains, for each accession, phase information. For example:
    AC148290.3      3       HTG     2       mth2-45h12

    which means this is a Phase-3 BAC. Record with only a single contig will be
    labeled as Phase-3 regardless of the info in the `phasefile`. Template file
    is the Genbank sbt template. See jcvi.formats.sbt for generation of such
    files.

    Another problem is that Genbank requires the name of the sequence to stay
    the same when updating and will kick back with a table of name conflicts.
    For example:

    We are unable to process the updates for these entries
    for the following reason:

    Seqname has changed

    Accession Old seq_name New seq_name
    --------- ------------ ------------
    AC239792 mtg2_29457 AC239792.1

    To prepare a submission, this script downloads genbank and asn.1 format,
    and generate the phase file and the names file (use formats.agp.phase() and
    apps.gbsubmit.asn(), respectively). These get automatically run.

    However, use --phases if the genbank files contain outdated information.
    For example, the clone name changes or phase upgrades. In this case, run
    formats.agp.phase() manually, modify the phasefile and use --phases to override.
    """
    # NOTE: the docstring above doubles as the usage text for OptionParser.
    from jcvi.formats.fasta import sequin, ids
    from jcvi.formats.agp import phase
    from jcvi.apps.fetch import entrez

    p = OptionParser(htg.__doc__)
    p.add_option("--phases", default=None,
            help="Use another phasefile to override [default: %default]")
    p.add_option("--comment", default="",
            help="Comments for this update [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, sbtfile = args
    pf = fastafile.rsplit(".", 1)[0]

    idsfile = pf + ".ids"
    phasefile = pf + ".phases"
    namesfile = pf + ".names"

    ids([fastafile, "--outfile={0}".format(idsfile)])

    # Download the ASN.1 records and derive the names file from them
    asndir = "asn.1"
    mkdir(asndir)
    entrez([idsfile, "--format=asn.1", "--outdir={0}".format(asndir)])
    asn(glob("{0}/*".format(asndir)) + \
            ["--outfile={0}".format(namesfile)])

    if opts.phases is None:
        # Download the GenBank records and derive the phase file from them
        gbdir = "gb"
        mkdir(gbdir)
        entrez([idsfile, "--format=gb", "--outdir={0}".format(gbdir)])
        phase(glob("{0}/*".format(gbdir)) + \
                ["--outfile={0}".format(phasefile)])
    else:
        phasefile = opts.phases

    assert op.exists(namesfile) and op.exists(phasefile)

    newphasefile = phasefile + ".new"
    comment = opts.comment

    fastadir = "fasta"
    sqndir = "sqn"
    mkdir(fastadir)
    mkdir(sqndir)

    from jcvi.graphics.histogram import stem_leaf_plot

    # Names must be unique in both directions
    names = DictFile(namesfile)
    assert len(set(names.keys())) == len(set(names.values()))

    phases = DictFile(phasefile)
    ph = [int(x) for x in phases.values()]
    # vmin 1, vmax 4, bins 3
    stem_leaf_plot(ph, 1, 4, 3, title="Counts of phases before updates")
    logging.debug("Information loaded for {0} records.".format(len(phases)))
    assert len(names) == len(phases)

    newph = []

    # Split the input into one FASTA file per record
    cmd = "faSplit byname {0} {1}/".format(fastafile, fastadir)
    sh(cmd, outfile="/dev/null", errfile="/dev/null")

    acmd = 'tbl2asn -a z -p fasta -r {sqndir}'
    acmd += ' -i {splitfile} -t {sbtfile} -C tigr'
    acmd += ' -j "{qualifiers}"'
    acmd += ' -A {accession_nv} -o {sqndir}/{accession_nv}.sqn -V Vbr'
    acmd += ' -y "{comment}" -W T -T T'

    qq = "[tech=htgs {phase}] [organism=Medicago truncatula] [strain=A17]"

    nupdated = 0
    # Both handles are managed so that the phase file is released and the
    # .new file is flushed and closed even if a record fails midway
    with open(phasefile) as phasefp, open(newphasefile, "w") as newphasefw:
        for row in phasefp:
            atoms = row.rstrip().split("\t")
            # see formats.agp.phase() for column contents
            accession, phasestr, clone = atoms[0], atoms[1], atoms[-1]
            fafile = op.join(fastadir, accession + ".fa")
            accession_nv = accession.split(".", 1)[0]

            newid = names[accession_nv]
            newidopt = "--newid={0}".format(newid)
            cloneopt = "--clone={0}".format(clone)
            splitfile, gaps = sequin([fafile, newidopt, cloneopt])
            splitfile = op.basename(splitfile)
            oldphase = int(phasestr)
            assert oldphase in (1, 2, 3)

            # The gap count decides the phase: a record without gaps becomes
            # phase 3, a phase-3 record with gaps is taken back to phase 2
            newphase = oldphase
            if gaps == 0 and newphase != 3:
                newphase = 3

            if gaps != 0 and newphase == 3:
                newphase = 2

            print("{0}\t{1}\t{2}".\
                    format(accession_nv, oldphase, newphase), file=newphasefw)
            newph.append(newphase)

            qualifiers = qq.format(phase=newphase)
            # Several clones in one record are separated by ";"
            if ";" in clone:
                qualifiers += " [keyword=HTGS_POOLED_MULTICLONE]"

            cmd = acmd.format(accession=accession, accession_nv=accession_nv,
                    sqndir=sqndir, sbtfile=sbtfile, splitfile=splitfile,
                    qualifiers=qualifiers, comment=comment)
            sh(cmd)

            verify_sqn(sqndir, accession)
            nupdated += 1

    stem_leaf_plot(newph, 1, 4, 3, title="Counts of phases after updates")
    print("A total of {0} records updated.".format(nupdated), file=sys.stderr)
Пример #51
0
def draw_tree(ax,
              tx,
              rmargin=.3,
              treecolor="k",
              leafcolor="k",
              supportcolor="k",
              outgroup=None,
              reroot=True,
              gffdir=None,
              sizes=None,
              trunc_name=None,
              SH=None,
              scutoff=0,
              barcodefile=None,
              leafcolorfile=None,
              leaffont=12):
    """
    Draw a phylogenetic tree on `ax`.

    The tree is laid out inside the unit square: branches grow rightwards from
    a left margin, leaves are stacked top to bottom, and a strip of width
    `rmargin` on the right is kept free for exon glyphs and size labels.

    ax: axes-like object providing `plot()` and `text()`.
    tx: tree description handed to `Tree()`.
    rmargin: fraction of the width reserved on the right of the tree.
    treecolor, leafcolor, supportcolor: colors for branches, leaf labels and
        support values respectively.
    outgroup: leaf names whose common ancestor becomes the outgroup; when
        empty the midpoint outgroup is used instead. Only used if `reroot`.
    reroot: whether to re-root the tree before drawing.
    gffdir: directory searched for `*.gff*` files to draw exon structures.
    sizes: argument for `Sizes()`; sizes are printed as amino acid counts.
    trunc_name: rule passed to `truncate_name()` for leaf labels.
    SH: if not None, printed as the "SH test against ref tree" annotation.
    scutoff: support cutoff in percent; only larger supports are printed.
    barcodefile: tab-delimited file used by `decode_name()` on leaf labels.
    leafcolorfile: tab-delimited file mapping leaf name to a color, either a
        color name or an "R,G,B" triple.
    leaffont: font size for all text.
    """

    t = Tree(tx)
    if reroot:
        if outgroup:
            R = t.get_common_ancestor(*outgroup)
        else:
            # Calculate the midpoint node
            R = t.get_midpoint_outgroup()

        if R != t:
            t.set_outgroup(R)

    farthest, max_dist = t.get_farthest_leaf()

    margin = .05
    xstart = margin
    ystart = 1 - margin
    canvas = 1 - rmargin - 2 * margin
    tip = .005
    # scale the tree
    scale = canvas / max_dist

    num_leaves = len(t.get_leaf_names())
    yinterval = canvas / (num_leaves + 1)

    # get exons structures, if any
    structures = {}
    ratio = None  # only meaningful when structures is non-empty
    if gffdir:
        gffiles = glob("{0}/*.gff*".format(gffdir))
        setups, ratio = get_setups(gffiles, canvas=rmargin / 2, noUTR=True)
        structures = dict((a, (b, c)) for a, b, c in setups)

    if sizes:
        sizes = Sizes(sizes).mapping

    if barcodefile:
        barcodemap = DictFile(barcodefile, delimiter="\t")

    # Always bind the mapping so the per-leaf lookup below does not have to
    # rely on catching a NameError when no color file was given.
    leafcolors = {}
    if leafcolorfile:
        leafcolors = DictFile(leafcolorfile, delimiter="\t")

    coords = {}
    i = 0
    # Postorder guarantees that children are placed before their parent, so
    # an internal node can be centered on its children's coordinates.
    for n in t.traverse("postorder"):
        dist = n.get_distance(t)
        xx = xstart + scale * dist

        if n.is_leaf():
            yy = ystart - i * yinterval
            i += 1

            if trunc_name:
                name = truncate_name(n.name, rule=trunc_name)
            else:
                name = n.name

            if barcodefile:
                name = decode_name(name, barcodemap)

            sname = name.replace("_", "-")

            if n.name in leafcolors:
                lc = leafcolors[n.name]
                # if color is given as "R,G,B"
                if "," in lc:
                    # Materialize as a tuple: a lazy map object is not a
                    # valid color specification.
                    lc = tuple(float(c) for c in lc.split(","))
            else:
                lc = leafcolor

            ax.text(xx + tip,
                    yy,
                    sname,
                    va="center",
                    fontstyle="italic",
                    size=leaffont,
                    color=lc)

            gname = n.name.split("_")[0]
            if gname in structures:
                mrnabed, cdsbeds = structures[gname]
                ExonGlyph(ax,
                          1 - rmargin / 2,
                          yy,
                          mrnabed,
                          cdsbeds,
                          align="right",
                          ratio=ratio)
            if sizes and gname in sizes:
                size = sizes[gname]
                # base pair converted to amino acid; floor division keeps the
                # label an integer count instead of e.g. "332.0aa"
                size = size // 3 - 1
                size = "{0}aa".format(size)
                ax.text(1 - rmargin / 2 + tip, yy, size, size=leaffont)

        else:
            children = [coords[x] for x in n.get_children()]
            children_x, children_y = zip(*children)
            min_y, max_y = min(children_y), max(children_y)
            # plot the vertical bar
            ax.plot((xx, xx), (min_y, max_y), "-", color=treecolor)
            # plot the horizontal bar
            for cx, cy in children:
                ax.plot((xx, cx), (cy, cy), "-", color=treecolor)
            yy = sum(children_y) * 1. / len(children_y)
            support = n.support
            # supports above 1 are taken to be percentages
            if support > 1:
                support = support / 100.
            if not n.is_root():
                if support > scutoff / 100.:
                    ax.text(xx,
                            yy + .005,
                            "{0:d}".format(int(abs(support * 100))),
                            ha="right",
                            size=leaffont,
                            color=supportcolor)

        coords[n] = (xx, yy)

    # scale bar, drawn one row below the last leaf
    br = .1
    x1 = xstart + .1
    x2 = x1 + br * scale
    yy = ystart - i * yinterval
    ax.plot([x1, x1], [yy - tip, yy + tip], "-", color=treecolor)
    ax.plot([x2, x2], [yy - tip, yy + tip], "-", color=treecolor)
    ax.plot([x1, x2], [yy, yy], "-", color=treecolor)
    ax.text((x1 + x2) / 2,
            yy - tip,
            "{0:g}".format(br),
            va="top",
            ha="center",
            size=leaffont,
            color=treecolor)

    if SH is not None:
        xs = x1
        ys = (margin + yy) / 2.
        ax.text(xs,
                ys,
                "SH test against ref tree: {0}".format(SH),
                ha="left",
                size=leaffont,
                color="g")

    normalize_axes(ax)
Пример #52
0
def htg(args):
    """
    %prog htg fastafile template.sbt

    Prepare sqnfiles for Genbank HTG submission to update existing records.

    `fastafile` contains the records to update, multiple records are allowed
    (with each one generating separate sqn file in the sqn/ folder). The record
    defline has the accession ID. For example,
    >AC148290.3

    Internally, this generates two additional files (phasefile and namesfile)
    and download records from Genbank. Below is implementation details:

    `phasefile` contains, for each accession, phase information. For example:
    AC148290.3      3       HTG     2       mth2-45h12

    which means this is a Phase-3 BAC. Record with only a single contig will be
    labeled as Phase-3 regardless of the info in the `phasefile`. Template file
    is the Genbank sbt template. See jcvi.formats.sbt for generation of such
    files.

    Another problem is that Genbank requires the name of the sequence to stay
    the same when updating and will kick back with a table of name conflicts.
    For example:

    We are unable to process the updates for these entries
    for the following reason:

    Seqname has changed

    Accession Old seq_name New seq_name
    --------- ------------ ------------
    AC239792 mtg2_29457 AC239792.1

    To prepare a submission, this script downloads genbank and asn.1 format,
    and generate the phase file and the names file (use formats.agp.phase() and
    apps.gbsubmit.asn(), respectively). These get automatically run.

    However, use --phases if the genbank files contain outdated information.
    For example, the clone name changes or phase upgrades. In this case, run
    formats.agp.phase() manually, modify the phasefile and use --phases to override.
    """
    # NOTE: the docstring above doubles as the command-line usage text (it is
    # handed to OptionParser below), so it is user-facing output.
    from jcvi.formats.fasta import sequin, ids
    from jcvi.formats.agp import phase
    from jcvi.apps.fetch import entrez
    from jcvi.graphics.histogram import stem_leaf_plot

    p = OptionParser(htg.__doc__)
    p.add_option(
        "--phases",
        default=None,
        help="Use another phasefile to override",
    )
    p.add_option("--comment", default="", help="Comments for this update")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, sbtfile = args
    pf = fastafile.rsplit(".", 1)[0]

    idsfile = pf + ".ids"
    phasefile = pf + ".phases"
    namesfile = pf + ".names"

    ids([fastafile, "--outfile={0}".format(idsfile)])

    # Fetch the asn.1 records and derive the names file from them.
    asndir = "asn.1"
    mkdir(asndir)
    entrez([idsfile, "--format=asn.1", "--outdir={0}".format(asndir)])
    asn(glob("{0}/*".format(asndir)) + ["--outfile={0}".format(namesfile)])

    if opts.phases is None:
        # No override given: fetch the genbank records and derive the phases.
        gbdir = "gb"
        mkdir(gbdir)
        entrez([idsfile, "--format=gb", "--outdir={0}".format(gbdir)])
        phase(
            glob("{0}/*".format(gbdir)) + ["--outfile={0}".format(phasefile)])
    else:
        phasefile = opts.phases

    assert op.exists(namesfile) and op.exists(phasefile)

    newphasefile = phasefile + ".new"
    comment = opts.comment

    fastadir = "fasta"
    sqndir = "sqn"
    mkdir(fastadir)
    mkdir(sqndir)

    names = DictFile(namesfile)
    # names must map one-to-one, otherwise two records would share a seqname
    assert len(set(names.keys())) == len(set(names.values()))

    phases = DictFile(phasefile)
    ph = [int(x) for x in phases.values()]
    # vmin 1, vmax 4, bins 3
    stem_leaf_plot(ph, 1, 4, 3, title="Counts of phases before updates")
    logging.debug("Information loaded for {0} records.".format(len(phases)))
    assert len(names) == len(phases)

    newph = []

    # One FASTA file per record, named after the record, in fastadir.
    cmd = "faSplit byname {0} {1}/".format(fastafile, fastadir)
    sh(cmd, outfile="/dev/null", errfile="/dev/null")

    acmd = "tbl2asn -a z -p fasta -r {sqndir}"
    acmd += " -i {splitfile} -t {sbtfile} -C tigr"
    acmd += ' -j "{qualifiers}"'
    acmd += " -A {accession_nv} -o {sqndir}/{accession_nv}.sqn -V Vbr"
    acmd += ' -y "{comment}" -W T -T T'

    qq = "[tech=htgs {phase}] [organism=Medicago truncatula] [strain=A17]"

    nupdated = 0
    # The context manager closes both handles (flushing the new phase table)
    # even if a record fails part-way through.
    with open(phasefile) as phasefh, open(newphasefile, "w") as newphasefw:
        for row in phasefh:
            atoms = row.rstrip().split("\t")
            # see formats.agp.phase() for column contents
            accession, rawphase, clone = atoms[0], atoms[1], atoms[-1]
            fafile = op.join(fastadir, accession + ".fa")
            accession_nv = accession.split(".", 1)[0]  # strip the version

            newid = names[accession_nv]
            newidopt = "--newid={0}".format(newid)
            cloneopt = "--clone={0}".format(clone)
            splitfile, gaps = sequin([fafile, newidopt, cloneopt])
            splitfile = op.basename(splitfile)

            # Kept under separate names so the imported phase() function is
            # not rebound by the loop.
            oldphase = int(rawphase)
            assert oldphase in (1, 2, 3)

            if gaps == 0:
                # gap-free record is always Phase-3
                newphase = 3
            elif oldphase == 3:
                # a record that still has gaps cannot stay Phase-3
                newphase = 2
            else:
                newphase = oldphase

            print("{0}\t{1}\t{2}".format(accession_nv, oldphase, newphase),
                  file=newphasefw)
            newph.append(newphase)

            qualifiers = qq.format(phase=newphase)
            if ";" in clone:
                qualifiers += " [keyword=HTGS_POOLED_MULTICLONE]"

            cmd = acmd.format(
                accession=accession,
                accession_nv=accession_nv,
                sqndir=sqndir,
                sbtfile=sbtfile,
                splitfile=splitfile,
                qualifiers=qualifiers,
                comment=comment,
            )
            sh(cmd)

            verify_sqn(sqndir, accession)
            nupdated += 1

    stem_leaf_plot(newph, 1, 4, 3, title="Counts of phases after updates")
    print("A total of {0} records updated.".format(nupdated), file=sys.stderr)
Пример #53
0
def get_fsnames(outdir):
    """
    List the `*.fa*` files in `outdir`.

    Returns a tuple `(fsnames, suffix)`: `fsnames` holds each file's basename
    with its last extension removed, and `suffix` is the extension (with the
    leading dot) of the first file matched. Raises IndexError when nothing
    matches.
    """
    pattern = op.join(outdir, "*.fa*")
    matches = glob(pattern)

    # The suffix is taken from the first match only.
    _head, _sep, ext = matches[0].rpartition(".")
    suffix = "." + ext

    fsnames = []
    for path in matches:
        stem = op.basename(path).rsplit(".", 1)[0]
        fsnames.append(stem)

    return fsnames, suffix
Пример #54
0
Файл: base.py Проект: rrane/jcvi
# Tick formatters. Each callback takes the two arguments FuncFormatter passes
# and only uses the first one (the tick value).
mb_float_formatter = ticker.FuncFormatter(
    lambda value, tick_pos: "{0:.1f}M".format(value / 1e6)
)
kb_formatter = ticker.FuncFormatter(
    lambda value, tick_pos: "{0}K".format(int(value / 1000))
)
# The tex variants additionally pass the formatted number through `_`.
tex_1digit_formatter = ticker.FuncFormatter(
    lambda value, tick_pos: _("{0:.1f}".format(value))
)
tex_2digit_formatter = ticker.FuncFormatter(
    lambda value, tick_pos: _("{0:.2f}".format(value))
)


def set_tex_axis(ax, formatter=tex_formatter):
    """Install `formatter` as the major tick formatter of both axes of `ax`."""
    # x first, then y
    for axis_name in ("xaxis", "yaxis"):
        getattr(ax, axis_name).set_major_formatter(formatter)


# Variants of set_tex_axis with a different default formatter pre-bound; the
# `formatter` keyword can still be overridden by the caller.
set_human_axis = partial(set_tex_axis, formatter=human_formatter)
set_human_base_axis = partial(set_tex_axis, formatter=human_base_formatter)

# Fonts are looked up in the "fonts" directory next to this module;
# available_fonts holds the bare file names found there.
font_dir = op.join(op.dirname(__file__), "fonts")
available_fonts = list(map(op.basename, glob(font_dir + "/*")))


def fontprop(ax, name, size=12):
    """
    Apply the font file `name` (one of `available_fonts`, resolved inside
    `font_dir`) at the given `size` to every text object in `ax.texts`.
    """
    assert name in available_fonts, "Font must be one of {0}.".format(available_fonts)

    # Imported here, after the name check, rather than at module level.
    from matplotlib.font_manager import FontProperties

    font = FontProperties(fname=op.join(font_dir, name), size=size)
    logging.debug("Set font to `{0}` (`{1}`).".format(name, font.get_file()))

    for label in ax.texts:
        label.set_fontproperties(font)
Пример #55
0
def get_fsnames(outdir):
    """
    Collect the files matching `*.fa*` inside `outdir`.

    Returns `(fsnames, suffix)` where `fsnames` are the basenames stripped of
    their final extension and `suffix` is the dot-prefixed extension of the
    first matching file. An IndexError is raised if no file matches.
    """
    fasta_paths = glob(op.join(outdir, "*.fa*"))

    # Only the first match decides the reported suffix.
    first_ext = fasta_paths[0].rsplit(".", 1)[-1]
    stems = [op.basename(path).rsplit(".", 1)[0] for path in fasta_paths]

    return stems, "." + first_ext
Пример #56
0
# human_readable with base=True pre-bound
human_readable_base = partial(human_readable, base=True)

# Tick formatters built on the two human_readable variants.
human_formatter = ticker.FuncFormatter(human_readable)
human_base_formatter = ticker.FuncFormatter(human_readable_base)

# Fixed-unit tick formatters; only the first callback argument (the tick
# value) is used.
mb_formatter = ticker.FuncFormatter(
    lambda value, tick_pos: "{0}M".format(int(value / 1000000))
)
mb_float_formatter = ticker.FuncFormatter(
    lambda value, tick_pos: "{0:.1f}M".format(value / 1e6)
)
kb_formatter = ticker.FuncFormatter(
    lambda value, tick_pos: "{0}K".format(int(value / 1000))
)


def set_human_axis(ax, formatter=human_formatter):
    """Use `formatter` for the major ticks on the x axis and the y axis of `ax`."""
    # x first, then y
    for axis_name in ("xaxis", "yaxis"):
        axis = getattr(ax, axis_name)
        axis.set_major_formatter(formatter)


# Same as set_human_axis, but defaulting to human_base_formatter.
set_human_base_axis = partial(set_human_axis, formatter=human_base_formatter)

# Bare file names of the .ttf files found directly under datadir.
available_fonts = list(map(op.basename, glob(datadir + "/*.ttf")))


def fontprop(ax, name, size=12):
    """
    Set every text in `ax.texts` to the font file `name` at `size`.

    `name` must be listed in `available_fonts`; the file is loaded from
    `datadir`.
    """
    assert name in available_fonts, "Font must be one of {0}.".format(
        available_fonts
    )

    # Imported here, after the name check, rather than at module level.
    from matplotlib.font_manager import FontProperties

    font_path = op.join(datadir, name)
    font = FontProperties(fname=font_path, size=size)
    logging.debug("Set font to `{0}` (`{1}`).".format(name, font.get_file()))

    for text_obj in ax.texts:
        text_obj.set_fontproperties(font)
Пример #57
0

def set_human_axis(ax, formatter=human_formatter):
    """Set `formatter` as the major tick formatter for x, then y, on `ax`."""
    x_axis = ax.xaxis
    x_axis.set_major_formatter(formatter)

    y_axis = ax.yaxis
    y_axis.set_major_formatter(formatter)


# set_human_axis with human_base_formatter pre-bound as the default formatter.
set_human_base_axis = partial(set_human_axis, formatter=human_base_formatter)


def set_helvetica_axis(ax):
    """
    Relabel the current x and y ticks of `ax` with their values truncated to
    integers, using the "Helvetica" font family.
    """
    x_labels = [int(tick) for tick in ax.get_xticks()]
    ax.set_xticklabels(x_labels, family="Helvetica")

    y_labels = [int(tick) for tick in ax.get_yticks()]
    ax.set_yticklabels(y_labels, family="Helvetica")


# File names (without directory) of the .ttf files found directly in datadir.
available_fonts = list(map(op.basename, glob(datadir + "/*.ttf")))


def fontprop(ax, name, size=12):
    """
    Switch all texts of `ax` to the font stored in `datadir` under `name`.

    `name` has to be one of `available_fonts`; `size` is the font size.
    """
    assert name in available_fonts, "Font must be one of {0}.".format(available_fonts)

    # Imported here, after the name check, rather than at module level.
    from matplotlib import font_manager

    properties = font_manager.FontProperties(fname=op.join(datadir, name), size=size)
    logging.debug(
        "Set font to `{0}` (`{1}`).".format(name, properties.get_file())
    )

    for entry in ax.texts:
        entry.set_fontproperties(properties)