Exemplo n.º 1
0
def loghistogram(data, base=2, ascii=True, title="Counts", summary=False):
    """
    Bin `data` on a logarithmic scale and draw the bin counts with asciiplot().

    A value d is counted in the bin [base ** k, base ** (k + 1)), where k is
    the floor of log(d, base). Internally, bins is a dictionary with
    key: k, value: counts.

    data: iterable of positive numbers; log() raises ValueError for d <= 0.
    base: base of the logarithm, i.e. the ratio between two bin edges.
    ascii: accepted for backward compatibility, not used by this function.
    title: title handed to asciiplot().
    summary: when True, also write a "Unique: ..." line to stderr, built from
        len(data) and sum(data); `data` must then support both.
    """
    from math import floor

    if summary:
        # Only the summary line needs this helper, so import it on demand
        from jcvi.utils.cbook import percentage

        unique = len(data)
        total = sum(data)

        # Print out a distribution
        sys.stderr.write("Unique: {0}\n".format(percentage(unique, total)))

    bins = defaultdict(int)
    for d in data:
        # floor() rather than int(): int() truncates toward zero, which would
        # put values below 1 into the wrong bin
        logd = int(floor(log(d, base)))
        if base > 1:
            # log() is inexact (e.g. log(1000, 10) == 2.9999999999999996), so
            # verify the bin against the real edges and nudge it if needed
            if base ** (logd + 1) <= d:
                logd += 1
            elif base ** logd > d:
                logd -= 1
        bins[logd] += 1

    x, y = [], []
    for size, number in sorted(bins.items()):
        lb, ub = base ** size, base ** (size + 1)
        x.append((lb, ub))
        y.append(number)

    asciiplot(x, y, title=title)
Exemplo n.º 2
0
def loghistogram(data, base=2, ascii=True, title="Counts", summary=False):
    """
    Bin `data` on a logarithmic scale and draw the bin counts with asciiplot().

    A value d is counted in the bin [base ** k, base ** (k + 1)), where k is
    the floor of log(d, base). Internally, bins is a dictionary with
    key: k, value: counts.

    data: iterable of positive numbers; log() raises ValueError for d <= 0.
    base: base of the logarithm, i.e. the ratio between two bin edges.
    ascii: accepted for backward compatibility, not used by this function.
    title: title handed to asciiplot().
    summary: when True, also write a "Unique: ..." line to stderr, built from
        len(data) and sum(data); `data` must then support both.
    """
    from math import floor

    if summary:
        # Only the summary line needs this helper, so import it on demand
        from jcvi.utils.cbook import percentage

        unique = len(data)
        total = sum(data)

        # Print out a distribution
        sys.stderr.write("Unique: {0}\n".format(percentage(unique, total)))

    bins = defaultdict(int)
    for d in data:
        # floor() rather than int(): int() truncates toward zero, which would
        # put values below 1 into the wrong bin
        logd = int(floor(log(d, base)))
        if base > 1:
            # log() is inexact (e.g. log(1000, 10) == 2.9999999999999996), so
            # verify the bin against the real edges and nudge it if needed
            if base ** (logd + 1) <= d:
                logd += 1
            elif base ** logd > d:
                logd -= 1
        bins[logd] += 1

    x, y = [], []
    for size, number in sorted(bins.items()):
        lb, ub = base ** size, base ** (size + 1)
        x.append((lb, ub))
        y.append(number)

    asciiplot(x, y, title=title)
Exemplo n.º 3
0
def stem_leaf_plot(data, vmin, vmax, bins, digit=1, title=None):
    """
    Generate stem and leaf plot given a collection of numbers

    data: the numbers to histogram.
    vmin, vmax: lower and upper end of the plotted range.
    bins: requested number of bins, must be > 0.
    digit, title: passed through to asciiplot().

    Returns (bin_edges, hist): the left edge of every bin and its count.
    """
    assert bins > 0
    # `span` rather than `range`, so the builtin is not shadowed
    span = vmax - vmin
    step = span * 1. / bins
    if isinstance(span, int):
        # Keep integer edges when the range itself is an integer
        step = int(ceil(step))

    # A zero-width range would give a zero step; fall back to 1
    step = step or 1

    edges = np.arange(vmin, vmax + step, step)
    hist, bin_edges = np.histogram(data, bins=edges)
    # By default, len(bin_edges) = len(hist) + 1; keep one edge per count so
    # asciiplot() receives two sequences of equal length
    bin_edges = bin_edges[:len(hist)]
    asciiplot(bin_edges, hist, digit=digit, title=title)
    sys.stderr.write("Last bin ends in {0}, inclusive.\n".format(vmax))

    return bin_edges, hist
Exemplo n.º 4
0
def stem_leaf_plot(data, vmin, vmax, bins, digit=1, title=None):
    """
    Generate stem and leaf plot given a collection of numbers

    data: the numbers to histogram.
    vmin, vmax: lower and upper end of the plotted range.
    bins: requested number of bins, must be > 0.
    digit, title: passed through to asciiplot().

    Returns (bin_edges, hist): the left edge of every bin and its count.
    """
    assert bins > 0
    # `span` rather than `range`, so the builtin is not shadowed
    span = vmax - vmin
    step = span * 1. / bins
    if isinstance(span, int):
        # Keep integer edges when the range itself is an integer
        step = int(ceil(step))

    # A zero-width range would give a zero step; fall back to 1
    step = step or 1

    edges = np.arange(vmin, vmax + step, step)
    hist, bin_edges = np.histogram(data, bins=edges)
    # By default, len(bin_edges) = len(hist) + 1; keep one edge per count so
    # asciiplot() receives two sequences of equal length
    bin_edges = bin_edges[:len(hist)]
    asciiplot(bin_edges, hist, digit=digit, title=title)
    sys.stderr.write("Last bin ends in {0}, inclusive.\n".format(vmax))

    return bin_edges, hist
Exemplo n.º 5
0
def stem_leaf_plot(data, vmin, vmax, bins, digit=1, title=None):
    """
    Draw a stem and leaf plot for a collection of numbers.

    The interval from vmin to vmax is cut into `bins` steps, `data` is
    histogrammed over the resulting edges and the counts are rendered with
    asciiplot(). Returns the left bin edges together with the counts.
    """
    assert bins > 0
    span = vmax - vmin
    if isinstance(span, int):
        # Integer range: round the step up to a whole number
        width = int(ceil(span * 1.0 / bins))
    else:
        width = span * 1.0 / bins
    if not width:
        # Degenerate (zero) step, use a unit step instead
        width = 1

    edges = np.arange(vmin, vmax + width, width)
    counts, lefts = np.histogram(data, bins=edges)
    # np.histogram hands back one more edge than there are counts
    lefts = lefts[: len(counts)]
    asciiplot(lefts, counts, digit=digit, title=title)
    print("Last bin ends in {0}, inclusive.".format(vmax), file=sys.stderr)

    return lefts, counts
Exemplo n.º 6
0
def model(args):
    """
    %prog model erate

    Model kmer distribution given error rate. See derivation in FIONA paper:
    <http://bioinformatics.oxfordjournals.org/content/30/17/i356.full>
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim; implementation notes are comments instead.
    #
    # args: command-line tokens; exactly one positional value (the error
    #       rate) is required, -k and --cov are optional.
    # Draws P(coverage == c) for c = 0 .. 2 * cov with asciiplot().
    from scipy.stats import binom, poisson

    p = OptionParser(model.__doc__)
    p.add_option("-k", default=23, type="int", help="Kmer size")
    p.add_option("--cov", default=50, type="int", help="Expected coverage")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (erate, ) = args
    erate = float(erate)
    cov = opts.cov
    k = opts.k

    # Neither quantity depends on the coverage c, so evaluate them once per
    # error count i instead of once per (c, i) pair
    error_counts = range(k + 1)
    # Probability of having exactly i errors
    pi = [binom.pmf(i, k, erate) for i in error_counts]
    # Expected coverage of kmer with exactly i errors
    mu = [cov * (erate / 3)**i * (1 - erate)**(k - i) for i in error_counts]

    xy = []
    # Range include c although it is unclear what it means to have c=0
    for c in range(0, cov * 2 + 1):
        Prob_Yk = 0
        # Sum i over 0, 1, ... up to k errors
        for pi_i, mu_i in zip(pi, mu):
            # Probability of seeing coverage of c, weighted by P(i errors)
            Prob_Yk += pi_i * poisson.pmf(c, mu_i)
        xy.append((c, Prob_Yk))

    x, y = zip(*xy)
    asciiplot(x, y, title="Model")
Exemplo n.º 7
0
def model(args):
    """
    %prog model erate

    Model kmer distribution given error rate. See derivation in FIONA paper:
    <http://bioinformatics.oxfordjournals.org/content/30/17/i356.full>
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim; implementation notes are comments instead.
    #
    # args: command-line tokens; exactly one positional value (the error
    #       rate) is required, -k and --cov are optional.
    # Draws P(coverage == c) for c = 0 .. 2 * cov with asciiplot().
    from scipy.stats import binom, poisson

    p = OptionParser(model.__doc__)
    p.add_option("-k", default=23, type="int", help="Kmer size")
    p.add_option("--cov", default=50, type="int", help="Expected coverage")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    erate, = args
    erate = float(erate)
    cov = opts.cov
    k = opts.k

    # Neither quantity depends on the coverage c, so evaluate them once per
    # error count i instead of once per (c, i) pair.
    # range() instead of xrange(): it exists on Python 2 and 3 alike and the
    # sequences here are tiny.
    error_counts = range(k + 1)
    # Probability of having exactly i errors
    pi = [binom.pmf(i, k, erate) for i in error_counts]
    # Expected coverage of kmer with exactly i errors
    mu = [cov * (erate / 3) ** i * (1 - erate) ** (k - i) for i in error_counts]

    xy = []
    # Range include c although it is unclear what it means to have c=0
    for c in range(0, cov * 2 + 1):
        Prob_Yk = 0
        # Sum i over 0, 1, ... up to k errors
        for pi_i, mu_i in zip(pi, mu):
            # Probability of seeing coverage of c, weighted by P(i errors)
            Prob_Yk += pi_i * poisson.pmf(c, mu_i)
        xy.append((c, Prob_Yk))

    x, y = zip(*xy)
    asciiplot(x, y, title="Model")
Exemplo n.º 8
0
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim; implementation notes are comments instead.
    #
    # args: command-line tokens; exactly three positional values are required
    #       (histogram file, species name, K-mer size).
    # Returns the estimated genome size, i.e. total K-mers divided by the
    # K-mer coverage, rounded to an int. Without --pdf the spectrum is drawn
    # with asciiplot(); with --pdf a figure is saved beside the input file.
    import os.path

    p = OptionParser(histogram.__doc__)
    p.add_option(
        "--vmin",
        dest="vmin",
        default=1,
        type="int",
        help="minimum value, inclusive",
    )
    p.add_option(
        "--vmax",
        dest="vmax",
        default=100,
        type="int",
        help="maximum value, inclusive",
    )
    p.add_option(
        "--pdf",
        default=False,
        action="store_true",
        help="Print PDF instead of ASCII plot",
    )
    p.add_option(
        "--method",
        choices=("nbinom", "allpaths"),
        default="nbinom",
        help=
        "'nbinom' - slow but more accurate for het or polyploid genome; 'allpaths' - fast and works for homozygous genomes",
    )
    p.add_option(
        "--maxiter",
        default=100,
        type="int",
        help="Max iterations for optimization. Only used with --method nbinom",
    )
    p.add_option("--coverage",
                 default=0,
                 type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option(
        "--nopeaks",
        default=False,
        action="store_true",
        help="Do not annotate K-mer peaks",
    )
    opts, args, iopts = p.set_image_options(args, figsize="7x7")

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    method = opts.method
    vmin, vmax = opts.vmin, opts.vmax
    ascii = not opts.pdf
    peaks = not opts.nopeaks and method == "allpaths"
    N = int(N)

    # A meryl index is converted into a histogram file first
    if histfile.rsplit(".", 1)[-1] in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = merylhistogram(histfile)

    ks = KmerSpectrum(histfile)
    method_info = ks.analyze(K=N, maxiter=opts.maxiter, method=method)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    # An explicit --coverage overrides the value found by the analysis
    Kmer_coverage = ks.lambda_ if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1.0 / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1:.1f}x".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f} Mb".format(Genome_size /
                                                                 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print(msg, file=sys.stderr)

    x, y = ks.get_xy(vmin, vmax)
    title = "{0} {1}-mer histogram".format(species, N)

    if ascii:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (iopts.w, iopts.h))
    plt.bar(x, y, fc="#b2df8a", lw=0)
    # Plot the negative binomial fit
    if method == "nbinom":
        generative_model = method_info["generative_model"]
        GG = method_info["Gbins"]
        ll = method_info["lambda"]
        rr = method_info["rho"]
        kf_range = method_info["kf_range"]
        stacked = generative_model(GG, ll, rr)
        plt.plot(
            kf_range,
            stacked,
            ":",
            color="#6a3d9a",
            lw=2,
        )

    ax = plt.gca()

    if peaks:  # Only works for method 'allpaths'
        t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        tcounts = [(cov, cnt) for cov, cnt in ks.counts if cov in t]
        if tcounts:
            x, y = zip(*tcounts)
            tcounts = dict(tcounts)
            plt.plot(x, y, "ko", lw=3, mec="k", mfc="w")
            # Label a peak only when its position is among the points found,
            # otherwise the lookup would raise KeyError
            if ks.max1 in tcounts:
                ax.text(ks.max1, tcounts[ks.max1], "SNP peak")
            if ks.max2 in tcounts:
                ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    ymin, ymax = ax.get_ylim()
    # Leave headroom above the tallest bar
    ymax = ymax * 7 / 6
    if method == "nbinom":
        # Plot multiple CN locations, CN1, CN2, ... up to ploidy
        cn_color = "#a6cee3"
        for i in range(1, ks.ploidy + 1):
            x = i * ks.lambda_
            plt.plot((x, x), (0, ymax), "-.", color=cn_color)
            plt.text(
                x,
                ymax * 0.95,
                "CN{}".format(i),
                ha="right",
                va="center",
                color=cn_color,
                rotation=90,
            )

    messages = [
        Total_Kmers_msg,
        Kmer_coverage_msg,
        Genome_size_msg,
        Repetitive_msg,
        SNPrate_msg,
    ]
    if method == "nbinom":
        messages += [ks.ploidy_message] + ks.copy_messages
    write_messages(ax, messages)

    ax.set_title(markup(title))
    ax.set_xlim((0, vmax))
    ax.set_ylim((0, ymax))
    adjust_spines(ax, ["left", "bottom"], outward=True)
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    set_human_axis(ax)

    # Strip the extensions from the file name only: splitting the whole path
    # on "." would truncate it at a dot inside a directory name (for example
    # "./reads.histogram" used to become ".pdf")
    folder, filename = os.path.split(histfile)
    imagename = os.path.join(folder, filename.split(".")[0]) + "." + iopts.format
    savefig(imagename, dpi=100)

    return Genome_size
Exemplo n.º 9
0
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim; implementation notes are comments instead.
    #
    # args: command-line tokens; exactly three positional values are required
    #       (histogram file, species name, K-mer size).
    # Returns the estimated genome size, i.e. total K-mers divided by the
    # K-mer coverage, rounded to an int. Without --pdf the spectrum is drawn
    # with asciiplot(); with --pdf a PDF is saved beside the input file.
    import os.path

    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin",
                 dest="vmin",
                 default=1,
                 type="int",
                 help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax",
                 dest="vmax",
                 default=100,
                 type="int",
                 help="maximum value, inclusive [default: %default]")
    p.add_option("--pdf",
                 default=False,
                 action="store_true",
                 help="Print PDF instead of ASCII plot [default: %default]")
    p.add_option("--coverage",
                 default=0,
                 type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks",
                 default=False,
                 action="store_true",
                 help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    ascii = not opts.pdf
    peaks = not opts.nopeaks
    N = int(N)

    ks = KmerSpectrum(histfile)
    ks.analyze(K=N)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    # An explicit --coverage overrides the value found by the analysis
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1. / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".\
                        format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    # sys.stderr.write() behaves the same on Python 2 and Python 3, unlike
    # the `print >>` statement
    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        sys.stderr.write(msg + "\n")

    x, y = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if ascii:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)
    ax = plt.gca()

    if peaks:
        t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        tcounts = [(cov, cnt) for cov, cnt in ks.counts if cov in t]
        # zip(*[]) cannot be unpacked into two names, so skip the markers
        # altogether when none of the positions is present
        if tcounts:
            x, y = zip(*tcounts)
            tcounts = dict(tcounts)
            plt.plot(x, y, 'ko', lw=2, mec='k', mfc='w')
            # Label a peak only when its position is among the points found,
            # otherwise the lookup would raise KeyError
            if ks.max1 in tcounts:
                ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
            if ks.max2 in tcounts:
                ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    # Summary lines, stacked in the upper right corner in axes coordinates
    tc = "gray"
    axt = ax.transAxes
    ax.text(.95, .95, Total_Kmers_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .9, Kmer_coverage_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .85, Genome_size_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .8, Repetitive_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .75, SNPrate_msg, color=tc, transform=axt, ha="right")

    ymin, ymax = ax.get_ylim()
    # Leave headroom above the curve
    ymax = ymax * 7 / 6

    ax.set_title(markup(title), color='r')
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel, color='r')
    ax.set_ylabel(ylabel, color='r')
    set_human_axis(ax)

    # Strip the extensions from the file name only: splitting the whole path
    # on "." would truncate it at a dot inside a directory name (for example
    # "./reads.histogram" used to become ".pdf")
    folder, filename = os.path.split(histfile)
    imagename = os.path.join(folder, filename.split(".")[0]) + ".pdf"
    savefig(imagename, dpi=100)

    return Genome_size
Exemplo n.º 10
0
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim; implementation notes are comments instead.
    #
    # args: command-line tokens; exactly three positional values are required
    #       (histogram file, species name, K-mer size).
    # Returns the estimated genome size, i.e. total K-mers divided by the
    # K-mer coverage, rounded to an int. Without --pdf the spectrum is drawn
    # with asciiplot(); with --pdf a PDF is saved beside the input file.
    import os.path

    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin", dest="vmin", default=1, type="int", help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax", dest="vmax", default=100, type="int", help="maximum value, inclusive [default: %default]")
    p.add_option(
        "--pdf", default=False, action="store_true", help="Print PDF instead of ASCII plot [default: %default]"
    )
    p.add_option("--coverage", default=0, type="int", help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks", default=False, action="store_true", help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    ascii = not opts.pdf
    peaks = not opts.nopeaks
    N = int(N)

    # A meryl index is converted into a histogram file first
    if histfile.rsplit(".", 1)[-1] in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = meryl([histfile])

    ks = KmerSpectrum(histfile)
    ks.analyze(K=N)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    # An explicit --coverage overrides the value found by the analysis
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1.0 / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    # sys.stderr.write() behaves the same on Python 2 and Python 3, unlike
    # the `print >>` statement
    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        sys.stderr.write(msg + "\n")

    x, y = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} {1}-mer histogram".format(species, N)

    if ascii:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, "g-", lw=2, alpha=0.5)
    ax = plt.gca()

    if peaks:
        t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        tcounts = [(cov, cnt) for cov, cnt in ks.counts if cov in t]
        if tcounts:
            x, y = zip(*tcounts)
            tcounts = dict(tcounts)
            plt.plot(x, y, "ko", lw=2, mec="k", mfc="w")
            # Label a peak only when its position is among the points found,
            # otherwise the lookup would raise KeyError
            if ks.max1 in tcounts:
                ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
            if ks.max2 in tcounts:
                ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    messages = [Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg, Repetitive_msg, SNPrate_msg]
    write_messages(ax, messages)

    ymin, ymax = ax.get_ylim()
    # Leave headroom above the curve
    ymax = ymax * 7 / 6

    ax.set_title(markup(title))
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    set_human_axis(ax)

    # Strip the extensions from the file name only: splitting the whole path
    # on "." would truncate it at a dot inside a directory name (for example
    # "./reads.histogram" used to become ".pdf")
    folder, filename = os.path.split(histfile)
    imagename = os.path.join(folder, filename.split(".")[0]) + ".pdf"
    savefig(imagename, dpi=100)

    return Genome_size
Exemplo n.º 11
0
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim; implementation notes are comments instead.
    #
    # args: command-line tokens; exactly three positional values are required
    #       (histogram file, species name, K-mer size).
    # Returns the estimated genome size, i.e. total K-mers divided by the
    # K-mer coverage, rounded to an int. Without --pdf the spectrum is drawn
    # with asciiplot(); with --pdf a PDF is saved beside the input file.
    import os.path

    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin", dest="vmin", default=1, type="int",
            help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax", dest="vmax", default=100, type="int",
            help="maximum value, inclusive [default: %default]")
    p.add_option("--pdf", default=False, action="store_true",
            help="Print PDF instead of ASCII plot [default: %default]")
    p.add_option("--coverage", default=0, type="int",
            help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks", default=False, action="store_true",
            help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    N = int(N)
    KMERYL, KSOAP, KALLPATHS = range(3)
    kformats = ("Meryl", "Soap", "AllPaths")
    kformat = KMERYL

    ascii = not opts.pdf
    peaks = not opts.nopeaks
    hist = {}
    totalKmers = 0
    data = []

    # The context manager closes the file even when a row fails to parse
    with open(histfile) as fp:
        # Guess the format of the Kmer histogram
        for row in fp:
            if row.startswith("# 1:"):
                kformat = KALLPATHS
                break
            if len(row.split()) == 1:
                kformat = KSOAP
                break
        fp.seek(0)

        logging.debug("Guessed format: {0}".format(kformats[kformat]))

        for rowno, row in enumerate(fp):
            if row[0] == '#':
                continue
            if kformat == KSOAP:
                # One count per line; the multiplicity is the line number
                K = rowno + 1
                counts = int(row.strip())
            else:  # meryl histogram
                K, counts = row.split()[:2]
                K, counts = int(K), int(counts)

            Kcounts = K * counts
            totalKmers += Kcounts
            hist[K] = Kcounts
            data.append((K, counts))

    covmax = 1000000
    ks = KmerSpectrum(data)
    ks.analyze(K=N, covmax=covmax)

    Total_Kmers = int(totalKmers)
    coverage = opts.coverage
    # An explicit --coverage overrides the value found by the analysis
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1. / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, Total_Kmers)
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".\
                        format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    # sys.stderr.write() behaves the same on Python 2 and Python 3, unlike
    # the `print >>` statement
    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        sys.stderr.write(msg + "\n")

    # Only the multiplicities inside [vmin, vmax] are plotted
    counts = sorted((a, b) for a, b in hist.items() \
                    if opts.vmin <= a <= opts.vmax)
    x, y = zip(*counts)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if ascii:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)
    ax = plt.gca()

    t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
    tcounts = [(a, b) for a, b in counts if a in t]
    # zip(*[]) cannot be unpacked into two names, so only draw the markers
    # when at least one of the positions lies inside the plotted range
    if tcounts:
        x, y = zip(*tcounts)
        plt.plot(x, y, 'ko', lw=2, mec='k', mfc='w')
    tcounts = dict(tcounts)

    if peaks:
        # A peak outside [vmin, vmax] is absent from tcounts; skip its label
        # instead of failing with KeyError
        if ks.max1 in tcounts:
            ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
        if ks.max2 in tcounts:
            ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    # Summary lines, stacked in the upper right corner in axes coordinates
    tc = "gray"
    axt = ax.transAxes
    ax.text(.95, .95, Total_Kmers_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .9, Kmer_coverage_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .85, Genome_size_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .8, Repetitive_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .75, SNPrate_msg, color=tc, transform=axt, ha="right")

    ymin, ymax = ax.get_ylim()
    # Leave headroom above the curve
    ymax = ymax * 7 / 6

    ax.set_title(markup(title), color='r')
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel, color='r')
    ax.set_ylabel(ylabel, color='r')
    set_human_axis(ax)

    # Strip the extensions from the file name only: splitting the whole path
    # on "." would truncate it at a dot inside a directory name (for example
    # "./reads.histogram" used to become ".pdf")
    folder, filename = os.path.split(histfile)
    imagename = os.path.join(folder, filename.split(".")[0]) + ".pdf"
    savefig(imagename, dpi=100)

    return Genome_size
Exemplo n.º 12
0
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim; implementation notes are comments instead.
    #
    # args: command-line tokens; exactly three positional values are required
    #       (histogram file, species name, K-mer size).
    # Returns the estimated genome size, i.e. total K-mers divided by the
    # K-mer coverage, rounded to an int. Without --pdf the spectrum is drawn
    # with asciiplot(); with --pdf a PDF is saved beside the input file.
    import os.path

    p = OptionParser(histogram.__doc__)
    p.add_option(
        "--vmin",
        dest="vmin",
        default=1,
        type="int",
        help="minimum value, inclusive",
    )
    p.add_option(
        "--vmax",
        dest="vmax",
        default=100,
        type="int",
        help="maximum value, inclusive",
    )
    p.add_option(
        "--pdf",
        default=False,
        action="store_true",
        help="Print PDF instead of ASCII plot",
    )
    p.add_option("--coverage",
                 default=0,
                 type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option(
        "--nopeaks",
        default=False,
        action="store_true",
        help="Do not annotate K-mer peaks",
    )
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    ascii = not opts.pdf
    peaks = not opts.nopeaks
    N = int(N)

    # A meryl index is converted into a histogram file first
    if histfile.rsplit(".", 1)[-1] in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = merylhistogram(histfile)

    ks = KmerSpectrum(histfile)
    ks.analyze(K=N)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    # An explicit --coverage overrides the value found by the analysis
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1.0 / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size /
                                                                1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print(msg, file=sys.stderr)

    x, y = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} {1}-mer histogram".format(species, N)

    if ascii:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, "g-", lw=2, alpha=0.5)
    ax = plt.gca()

    if peaks:
        t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        tcounts = [(cov, cnt) for cov, cnt in ks.counts if cov in t]
        if tcounts:
            x, y = zip(*tcounts)
            tcounts = dict(tcounts)
            plt.plot(x, y, "ko", lw=2, mec="k", mfc="w")
            # Label a peak only when its position is among the points found,
            # otherwise the lookup would raise KeyError
            if ks.max1 in tcounts:
                ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
            if ks.max2 in tcounts:
                ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    messages = [
        Total_Kmers_msg,
        Kmer_coverage_msg,
        Genome_size_msg,
        Repetitive_msg,
        SNPrate_msg,
    ]
    write_messages(ax, messages)

    ymin, ymax = ax.get_ylim()
    # Leave headroom above the curve
    ymax = ymax * 7 / 6

    ax.set_title(markup(title))
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    set_human_axis(ax)

    # Strip the extensions from the file name only: splitting the whole path
    # on "." would truncate it at a dot inside a directory name (for example
    # "./reads.histogram" used to become ".pdf")
    folder, filename = os.path.split(histfile)
    imagename = os.path.join(folder, filename.split(".")[0]) + ".pdf"
    savefig(imagename, dpi=100)

    return Genome_size
Exemplo n.º 13
0
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim; implementation notes are comments instead.
    #
    # args: command-line tokens; exactly three positional values are required
    #       (histogram file, species name, K-mer size).
    # Without --pdf the spectrum is drawn with asciiplot() and its result is
    # returned; with --pdf a PDF is saved beside the input file and nothing
    # is returned.
    import os.path

    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin",
                 dest="vmin",
                 default=1,
                 type="int",
                 help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax",
                 dest="vmax",
                 default=100,
                 type="int",
                 help="maximum value, inclusive [default: %default]")
    p.add_option("--pdf",
                 default=False,
                 action="store_true",
                 help="Print PDF instead of ASCII plot [default: %default]")
    p.add_option("--coverage",
                 default=0,
                 type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks",
                 default=False,
                 action="store_true",
                 help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    N = int(N)
    KMERYL, KSOAP, KALLPATHS = range(3)
    kformats = ("Meryl", "Soap", "AllPaths")
    kformat = KMERYL

    ascii = not opts.pdf
    peaks = not opts.nopeaks
    hist = {}
    totalKmers = 0
    data = []

    # The context manager closes the file even when a row fails to parse
    with open(histfile) as fp:
        # Guess the format of the Kmer histogram
        for row in fp:
            if row.startswith("# 1:"):
                kformat = KALLPATHS
                break
            if len(row.split()) == 1:
                kformat = KSOAP
                break
        fp.seek(0)

        logging.debug("Guessed format: {0}".format(kformats[kformat]))

        for rowno, row in enumerate(fp):
            if row[0] == '#':
                continue
            if kformat == KSOAP:
                # One count per line; the multiplicity is the line number
                K = rowno + 1
                counts = int(row.strip())
            else:  # meryl histogram
                K, counts = row.split()[:2]
                K, counts = int(K), int(counts)

            Kcounts = K * counts
            totalKmers += Kcounts
            hist[K] = Kcounts
            data.append((K, counts))

    covmax = 1000000
    ks = KmerSpectrum(data)
    ks.analyze(K=N, covmax=covmax)

    Total_Kmers = int(totalKmers)
    coverage = opts.coverage
    # An explicit --coverage overrides the value found by the analysis
    Kmer_coverage = ks.max2 if not coverage else coverage
    # Expressed in megabases
    Genome_size = Total_Kmers * 1. / Kmer_coverage / 1e6

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, Total_Kmers)
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    # sys.stderr.write() behaves the same on Python 2 and Python 3, unlike
    # the `print >>` statement
    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        sys.stderr.write(msg + "\n")

    # Only the multiplicities inside [vmin, vmax] are plotted
    counts = sorted((a, b) for a, b in hist.items() \
                    if opts.vmin <= a <= opts.vmax)
    x, y = zip(*counts)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if ascii:
        return asciiplot(x, y, title=title)

    plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)
    ax = plt.gca()

    t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
    tcounts = [(a, b) for a, b in counts if a in t]
    # zip(*[]) cannot be unpacked into two names, so only draw the markers
    # when at least one of the positions lies inside the plotted range
    if tcounts:
        x, y = zip(*tcounts)
        plt.plot(x, y, 'ko', lw=2, mec='k', mfc='w')
    tcounts = dict(tcounts)

    if peaks:
        # A peak outside [vmin, vmax] is absent from tcounts; skip its label
        # instead of failing with KeyError
        if ks.max1 in tcounts:
            ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
        if ks.max2 in tcounts:
            ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    # Summary lines, stacked in the upper right corner in axes coordinates
    tc = "gray"
    axt = ax.transAxes
    ax.text(.95, .95, Total_Kmers_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .9, Kmer_coverage_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .85, Genome_size_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .8, Repetitive_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .75, SNPrate_msg, color=tc, transform=axt, ha="right")

    ymin, ymax = ax.get_ylim()
    # Leave headroom above the curve
    ymax = ymax * 7 / 6

    ax.set_title(markup(title), color='r')
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel, color='r')
    ax.set_ylabel(ylabel, color='r')
    set_human_axis(ax)

    # Strip the extensions from the file name only: splitting the whole path
    # on "." would truncate it at a dot inside a directory name (for example
    # "./reads.histogram" used to become ".pdf")
    folder, filename = os.path.split(histfile)
    imagename = os.path.join(folder, filename.split(".")[0]) + ".pdf"
    savefig(imagename, dpi=100)
Exemplo n.º 14
0
Arquivo: kmer.py Projeto: bennyyu/jcvi
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim; implementation notes are comments instead.
    #
    # args: command-line tokens; exactly three positional values are required
    #       (histogram file, species name, K-mer size).
    # Without --pdf the spectrum is drawn with asciiplot() and its result is
    # returned; with --pdf a PDF is saved beside the input file and nothing
    # is returned.
    # Raises ValueError when the histogram has fewer than two rows.
    import os.path

    p = OptionParser(histogram.__doc__)
    p.add_option("--pdf", default=False, action="store_true",
            help="Print PDF instead of ASCII plot [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    ascii = not opts.pdf
    hist = {}
    totalKmers = 0

    # Read everything up front so the handle is closed right away
    with open(histfile) as fp:
        rows = fp.readlines()

    # Guess the format of the Kmer histogram
    soap = any(len(row.split()) == 1 for row in rows)

    for rowno, row in enumerate(rows):
        if soap:
            # One count per line; the multiplicity is the line number
            K = rowno + 1
            counts = int(row.strip())
        else:  # meryl histogram
            K, counts = row.split()[:2]
            K, counts = int(K), int(counts)

        Kcounts = K * counts
        totalKmers += Kcounts
        hist[K] = counts

    # The scan below walks consecutive pairs; with fewer than two rows it
    # would never assign Ka and the code after it would fail with NameError
    if len(hist) < 2:
        raise ValueError(
            "Need at least two rows in `{0}` to find the coverage peak".format(
                histfile))

    # Walk the spectrum until the counts have dropped, risen and dropped
    # again; Ka is then the multiplicity where the second drop starts
    history = ["drop"]
    for a, b in pairwise(sorted(hist.items())):
        Ka, ca = a
        Kb, cb = b
        if ca <= cb:
            status = "rise"
        else:
            status = "drop"
        if history[-1] != status:
            history.append(status)
        if history == ["drop", "rise", "drop"]:
            break

    Total_Kmers = int(totalKmers)
    Kmer_coverage = Ka
    # Expressed in megabases
    Genome_size = Total_Kmers * 1. / Ka / 1e6

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, Total_Kmers)
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size)

    # sys.stderr.write() behaves the same on Python 2 and Python 3, unlike
    # the `print >>` statement
    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        sys.stderr.write(msg + "\n")

    # Only multiplicities up to 100 are plotted
    counts = sorted((a, b) for a, b in hist.items() if a <= 100)
    x, y = zip(*counts)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if ascii:
        return asciiplot(x, y, title=title)

    plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)

    ax = plt.gca()
    ax.text(.5, .9, _(Total_Kmers_msg),
            ha="center", color='b', transform=ax.transAxes)
    ax.text(.5, .8, _(Kmer_coverage_msg),
            ha="center", color='b', transform=ax.transAxes)
    ax.text(.5, .7, _(Genome_size_msg),
            ha="center", color='b', transform=ax.transAxes)

    ax.set_title(_(title), color='r')
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(_(xlabel), color='r')
    ax.set_ylabel(_(ylabel), color='r')
    set_human_axis(ax)

    # Strip the extensions from the file name only: splitting the whole path
    # on "." would truncate it at a dot inside a directory name (for example
    # "./reads.histogram" used to become ".pdf")
    folder, filename = os.path.split(histfile)
    imagename = os.path.join(folder, filename.split(".")[0]) + ".pdf"
    plt.savefig(imagename, dpi=100)
    sys.stderr.write("Image saved to `{0}`.\n".format(imagename))
Exemplo n.º 15
0
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim; implementation notes are comments instead.
    #
    # args: command-line tokens; exactly three positional values are required
    #       (histogram file, species name, K-mer size).
    # Without --pdf the spectrum is drawn with asciiplot() and its result is
    # returned; with --pdf a PDF is saved beside the input file and nothing
    # is returned.
    # Raises ValueError when the histogram has fewer than two rows.
    import os.path

    p = OptionParser(histogram.__doc__)
    p.add_option("--pdf",
                 default=False,
                 action="store_true",
                 help="Print PDF instead of ASCII plot [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    ascii = not opts.pdf
    hist = {}
    totalKmers = 0

    # Read everything up front so the handle is closed right away
    with open(histfile) as fp:
        rows = fp.readlines()

    # Guess the format of the Kmer histogram
    soap = any(len(row.split()) == 1 for row in rows)

    for rowno, row in enumerate(rows):
        if soap:
            # One count per line; the multiplicity is the line number
            K = rowno + 1
            counts = int(row.strip())
        else:  # meryl histogram
            K, counts = row.split()[:2]
            K, counts = int(K), int(counts)

        Kcounts = K * counts
        totalKmers += Kcounts
        hist[K] = counts

    # The scan below walks consecutive pairs; with fewer than two rows it
    # would never assign Ka and the code after it would fail with NameError
    if len(hist) < 2:
        raise ValueError(
            "Need at least two rows in `{0}` to find the coverage peak".format(
                histfile))

    # Walk the spectrum until the counts have dropped, risen and dropped
    # again; Ka is then the multiplicity where the second drop starts
    history = ["drop"]
    for a, b in pairwise(sorted(hist.items())):
        Ka, ca = a
        Kb, cb = b
        if ca <= cb:
            status = "rise"
        else:
            status = "drop"
        if history[-1] != status:
            history.append(status)
        if history == ["drop", "rise", "drop"]:
            break

    Total_Kmers = int(totalKmers)
    Kmer_coverage = Ka
    # Expressed in megabases
    Genome_size = Total_Kmers * 1. / Ka / 1e6

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, Total_Kmers)
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size)

    # sys.stderr.write() behaves the same on Python 2 and Python 3, unlike
    # the `print >>` statement
    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        sys.stderr.write(msg + "\n")

    # Only multiplicities up to 100 are plotted
    counts = sorted((a, b) for a, b in hist.items() if a <= 100)
    x, y = zip(*counts)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if ascii:
        return asciiplot(x, y, title=title)

    plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)

    ax = plt.gca()
    ax.text(.5,
            .9,
            _(Total_Kmers_msg),
            ha="center",
            color='b',
            transform=ax.transAxes)
    ax.text(.5,
            .8,
            _(Kmer_coverage_msg),
            ha="center",
            color='b',
            transform=ax.transAxes)
    ax.text(.5,
            .7,
            _(Genome_size_msg),
            ha="center",
            color='b',
            transform=ax.transAxes)

    ax.set_title(_(title), color='r')
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(_(xlabel), color='r')
    ax.set_ylabel(_(ylabel), color='r')
    set_human_axis(ax)

    # Strip the extensions from the file name only: splitting the whole path
    # on "." would truncate it at a dot inside a directory name (for example
    # "./reads.histogram" used to become ".pdf")
    folder, filename = os.path.split(histfile)
    imagename = os.path.join(folder, filename.split(".")[0]) + ".pdf"
    plt.savefig(imagename, dpi=100)
    sys.stderr.write("Image saved to `{0}`.\n".format(imagename))