예제 #1
0
파일: coords.py 프로젝트: linlifeng/jcvi
def print_stats(qrycovered, refcovered, id_pct):
    """Write a three-line coverage/identity summary to stderr.

    Args:
        qrycovered: Value reported as "Query coverage", in bp.
        refcovered: Value reported as "Reference coverage", in bp.
        id_pct: Identity percentage; printed with one decimal place.
    """
    from jcvi.utils.cbook import thousands

    lines = (
        "Reference coverage: {0} bp".format(thousands(refcovered)),
        "Query coverage: {0} bp".format(thousands(qrycovered)),
        "Identity: {0:.1f}%".format(id_pct),
    )
    # `print >> stream, ...` only works as a Python 2 statement; on Python 3
    # it raises TypeError. A plain write behaves the same on both.
    sys.stderr.write("\n".join(lines) + "\n")
예제 #2
0
def print_stats(qrycovered, refcovered, id_pct):
    """Report reference coverage, query coverage and identity on stderr.

    Args:
        qrycovered: Value reported as "Query coverage", in bp.
        refcovered: Value reported as "Reference coverage", in bp.
        id_pct: Identity percentage; printed with one decimal place.
    """
    from jcvi.utils.cbook import thousands

    m1 = "Reference coverage: {0} bp".format(thousands(refcovered))
    m2 = "Query coverage: {0} bp".format(thousands(qrycovered))
    m3 = "Identity: {0:.1f}%".format(id_pct)
    # Use write() rather than the Python 2-only `print >> stream` statement,
    # which fails with TypeError when run under Python 3.
    sys.stderr.write("\n".join((m1, m2, m3)) + "\n")
예제 #3
0
def summary(args):
    """
    %prog summary bedfile

    Sum the total lengths of the intervals.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim; developer notes live in comments instead.
    #
    # Writes to stderr: number of seqids and ranges, unique and total bases,
    # the total/unique ratio as coverage, and SummaryStats of feature spans.
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # print_help() returns None; negating it yields a non-zero exit
        # status for a usage error (sys.exit(None) would report success).
        sys.exit(not p.print_help())

    bedfile, = args
    bed = Bed(bedfile)
    stats = SummaryStats([x.span for x in bed])

    # write() works on Python 2 and 3 alike; the `print >> stream` statement
    # raises TypeError on Python 3.
    report = sys.stderr.write
    report("Total seqids: {0}\n".format(len(bed.seqids)))
    report("Total ranges: {0}\n".format(len(bed)))

    total_bases = bed.sum(unique=False)
    unique_bases = bed.sum()

    report("Total unique bases: {0} bp\n".format(thousands(unique_bases)))
    report("Total bases: {0} bp\n".format(thousands(total_bases)))
    # The `* 1.` forces float division on Python 2.
    report("Estimated coverage: {0:.1f}x\n".format(
        total_bases * 1. / unique_bases))

    report(str(stats) + "\n")
예제 #4
0
파일: bed.py 프로젝트: linlifeng/jcvi
def summary(args):
    """
    %prog summary bedfile

    Sum the total lengths of the intervals.
    """
    # NOTE: the docstring above is used as the command-line usage text and
    # is therefore left untouched.
    #
    # Output (stderr): seqid count, range count, unique bases, total bases,
    # estimated coverage (total / unique) and the span SummaryStats.
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # Exit non-zero on a usage error: print_help() returns None, so
        # `not None` -> True -> exit status 1.
        sys.exit(not p.print_help())

    bedfile, = args
    bed = Bed(bedfile)
    stats = SummaryStats([x.span for x in bed])

    def emit(text):
        # Portable replacement for the Python 2-only `print >> sys.stderr`.
        sys.stderr.write(text + "\n")

    emit("Total seqids: {0}".format(len(bed.seqids)))
    emit("Total ranges: {0}".format(len(bed)))

    total_bases = bed.sum(unique=False)
    unique_bases = bed.sum()

    emit("Total unique bases: {0} bp".format(thousands(unique_bases)))
    emit("Total bases: {0} bp".format(thousands(total_bases)))
    # The `* 1.` forces float division on Python 2.
    emit("Estimated coverage: {0:.1f}x".format(
        total_bases * 1. / unique_bases))

    emit(str(stats))
예제 #5
0
파일: bed.py 프로젝트: bennyyu/jcvi
def summary(args):
    """
    %prog summary bedfile

    Sum the total lengths of the intervals.
    """
    # NOTE: the docstring above is used as the command-line usage text and
    # is therefore left untouched.
    #
    # Output (stderr): seqid count, range count, unique bases, total bases,
    # estimated coverage (total / unique), and the integer-truncated mean and
    # standard deviation of the feature spans.
    import numpy as np

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # Exit non-zero on a usage error: print_help() returns None, so
        # `not None` -> True -> exit status 1.
        sys.exit(not p.print_help())

    bedfile, = args
    bed = Bed(bedfile)
    spans = np.array([x.span for x in bed])
    avg = int(np.average(spans))
    std = int(np.std(spans))

    # write() is portable; `print >> stream` raises TypeError on Python 3.
    report = sys.stderr.write
    report("Total seqids: {0}\n".format(len(bed.seqids)))
    report("Total ranges: {0}\n".format(len(bed)))

    total_bases = bed.sum(unique=False)
    unique_bases = bed.sum()

    report("Total unique bases: {0} bp\n".format(thousands(unique_bases)))
    report("Total bases: {0} bp\n".format(thousands(total_bases)))
    # The `* 1.0` forces float division on Python 2.
    report("Estimated coverage: {0:.1f}x\n".format(
        total_bases * 1.0 / unique_bases))

    report("Average spans: {0}, stdev: {1}\n".format(avg, std))
예제 #6
0
파일: bed.py 프로젝트: yangjl/jcvi
 def report(self):
     """Write the summary held on this object to stderr.

     Reads ``nseqids``, ``nfeats``, ``unique_bases``, ``total_bases``,
     ``coverage``, ``stats`` and ``mspans`` from ``self``. ``mspans`` is
     unpacked as (span, accn) pairs, so max()/min() pick the longest and
     shortest span (ties broken by accn).
     """
     # write() is used instead of the Python 2-only `print >> stream`
     # statement, which raises TypeError on Python 3.
     out = sys.stderr.write
     out("Total seqids: {0}\n".format(self.nseqids))
     out("Total ranges: {0}\n".format(self.nfeats))
     out("Total unique bases: {0} bp\n".format(thousands(self.unique_bases)))
     out("Total bases: {0} bp\n".format(thousands(self.total_bases)))
     out("Estimated coverage: {0:.1f}x\n".format(self.coverage))
     out(str(self.stats) + "\n")
     maxspan, maxaccn = max(self.mspans)
     minspan, minaccn = min(self.mspans)
     out("Longest: {0} ({1})\n".format(maxaccn, maxspan))
     out("Shortest: {0} ({1})\n".format(minaccn, minspan))
예제 #7
0
def fillstats(args):
    """
    %prog fillstats genome.fill

    Build stats on .fill file from GapCloser.
    """
    # NOTE: the docstring above is used as the command-line usage text and
    # is therefore left untouched.
    #
    # Counts ">" header rows as scaffolds, parses every other row with
    # FillLine, then reports closed vs. remaining gaps on stderr.
    from jcvi.utils.cbook import SummaryStats, percentage, thousands

    p = OptionParser(fillstats.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fillfile, ) = args
    scaffolds = 0
    gaps = []
    # The context manager closes the handle even if FillLine() raises.
    with open(fillfile) as fp:
        for row in fp:
            if row[0] == ">":
                scaffolds += 1
                continue
            gaps.append(FillLine(row))

    print("{0} scaffolds in total".format(scaffolds), file=sys.stderr)

    closed = [x for x in gaps if x.closed]
    closedbp = sum(x.before for x in closed)
    notClosed = [x for x in gaps if not x.closed]
    notClosedbp = sum(x.before for x in notClosed)

    totalgaps = len(closed) + len(notClosed)

    print(
        "Closed gaps: {0} size: {1} bp".format(
            percentage(len(closed), totalgaps), thousands(closedbp)),
        file=sys.stderr,
    )
    ss = SummaryStats([x.after for x in closed])
    print(ss, file=sys.stderr)

    ss = SummaryStats([x.delta for x in closed])
    print("Delta:", ss, file=sys.stderr)

    print(
        "Remaining gaps: {0} size: {1} bp".format(
            percentage(len(notClosed), totalgaps), thousands(notClosedbp)),
        file=sys.stderr,
    )
    ss = SummaryStats([x.after for x in notClosed])
    print(ss, file=sys.stderr)
예제 #8
0
파일: bed.py 프로젝트: radaniba/jcvi
 def report(self):
     """Print the summary stored on this object to stderr.

     Uses ``self.nseqids``, ``self.nfeats``, ``self.unique_bases``,
     ``self.total_bases``, ``self.coverage``, ``self.stats`` and
     ``self.mspans``; the latter is unpacked as (span, accn) pairs to
     name the longest and shortest features.
     """
     def emit(text):
         # Portable stand-in for the Python 2-only `print >> sys.stderr`,
         # which raises TypeError on Python 3.
         sys.stderr.write(text + "\n")

     emit("Total seqids: {0}".format(self.nseqids))
     emit("Total ranges: {0}".format(self.nfeats))
     emit("Total unique bases: {0} bp".format(
         thousands(self.unique_bases)))
     emit("Total bases: {0} bp".format(
         thousands(self.total_bases)))
     emit("Estimated coverage: {0:.1f}x".format(
         self.coverage))
     emit(str(self.stats))
     maxspan, maxaccn = max(self.mspans)
     minspan, minaccn = min(self.mspans)
     emit("Longest: {0} ({1})".format(maxaccn, maxspan))
     emit("Shortest: {0} ({1})".format(minaccn, minspan))
예제 #9
0
파일: soap.py 프로젝트: linlifeng/jcvi
def fillstats(args):
    """
    %prog fillstats genome.fill

    Build stats on .fill file from GapCloser.
    """
    # NOTE: the docstring above is used as the command-line usage text and
    # is therefore left untouched.
    #
    # Counts ">" header rows as scaffolds, parses every other row with
    # FillLine, then reports closed vs. remaining gaps on stderr.
    from jcvi.utils.cbook import SummaryStats, percentage, thousands

    p = OptionParser(fillstats.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fillfile, = args
    scaffolds = 0
    gaps = []
    # The context manager closes the handle even if FillLine() raises.
    with open(fillfile) as fp:
        for row in fp:
            if row[0] == ">":
                scaffolds += 1
                continue
            gaps.append(FillLine(row))

    def emit(text):
        # Portable stand-in for the Python 2-only `print >> sys.stderr`,
        # which raises TypeError on Python 3.
        sys.stderr.write(text + "\n")

    emit("{0} scaffolds in total".format(scaffolds))

    closed = [x for x in gaps if x.closed]
    closedbp = sum(x.before for x in closed)
    notClosed = [x for x in gaps if not x.closed]
    notClosedbp = sum(x.before for x in notClosed)

    totalgaps = len(closed) + len(notClosed)

    emit("Closed gaps: {0} size: {1} bp".format(
        percentage(len(closed), totalgaps), thousands(closedbp)))
    emit(str(SummaryStats([x.after for x in closed])))

    emit("Delta: " + str(SummaryStats([x.delta for x in closed])))

    emit("Remaining gaps: {0} size: {1} bp".format(
        percentage(len(notClosed), totalgaps), thousands(notClosedbp)))
    emit(str(SummaryStats([x.after for x in notClosed])))
예제 #10
0
파일: bed.py 프로젝트: rrane/jcvi
def summary(args):
    """
    %prog summary bedfile

    Sum the total lengths of the intervals.
    """
    # NOTE: the docstring above is used as the command-line usage text and
    # is therefore left untouched.
    #
    # Output (stderr): seqid count, range count, unique bases, total bases,
    # estimated coverage, span SummaryStats, and the longest/shortest
    # feature. With --sizes, one span per line is also written to
    # `<bedfile>.sizes`.
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(summary.__doc__)
    p.add_option("--sizes", default=False, action="store_true",
                 help="Write .sizes file")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    bed = Bed(bedfile)
    mspans = [(x.span, x.accn) for x in bed]
    if opts.sizes:
        sizesfile = bedfile + ".sizes"
        # The context manager closes the file even if a write fails.
        with open(sizesfile, "w") as fw:
            for span, _ in mspans:
                fw.write("{0}\n".format(span))
        logging.debug("Spans written to `{0}`.".format(sizesfile))

    spans, _ = zip(*mspans)
    stats = SummaryStats(spans)

    def emit(text):
        # Portable stand-in for the Python 2-only `print >> sys.stderr`,
        # which raises TypeError on Python 3.
        sys.stderr.write(text + "\n")

    emit("Total seqids: {0}".format(len(bed.seqids)))
    emit("Total ranges: {0}".format(len(bed)))

    total_bases = bed.sum(unique=False)
    unique_bases = bed.sum()

    emit("Total unique bases: {0} bp".format(thousands(unique_bases)))
    emit("Total bases: {0} bp".format(thousands(total_bases)))
    # The `* 1.` forces float division on Python 2.
    emit("Estimated coverage: {0:.1f}x".format(
        total_bases * 1. / unique_bases))

    emit(str(stats))
    # Tuples compare by span first, so max/min give longest/shortest.
    maxspan, maxaccn = max(mspans)
    minspan, minaccn = min(mspans)
    emit("Longest: {0} ({1})".format(maxaccn, maxspan))
    emit("Shortest: {0} ({1})".format(minaccn, minspan))
예제 #11
0
def scaffold(args):
    """
    %prog scaffold scaffold.fasta synteny.blast synteny.sizes synteny.bed
                         physicalmap.blast physicalmap.sizes physicalmap.bed

    As evaluation of scaffolding, visualize external line of evidences:
    * Plot synteny to an external genome
    * Plot alignments to physical map
    * Plot alignments to genetic map (TODO)

    Each trio defines one panel to be plotted. blastfile defines the matchings
    between the evidences vs scaffolds. Then the evidence sizes, and evidence
    bed to plot dot plots.

    This script will plot a dot in the dot plot in the corresponding location
    the plots are one contig/scaffold per plot.
    """
    # NOTE: the docstring above is used as the command-line usage text and
    # is therefore left untouched.
    from jcvi.graphics.base import set_image_options
    from jcvi.utils.iter import grouper

    p = OptionParser(scaffold.__doc__)
    p.add_option("--cutoff", type="int", default=1000000,
            help="Plot scaffolds with size larger than [default: %default]")
    p.add_option("--highlights",
            help="A set of regions in BED format to highlight [default: %default]")
    opts, args, iopts = set_image_options(p, args, figsize="14x8", dpi=150)

    # One scaffold file followed by one or more (blast, sizes, bed) trios.
    if len(args) < 4 or len(args) % 3 != 1:
        sys.exit(not p.print_help())

    highlights = opts.highlights
    scafsizes = Sizes(args[0])
    trios = list(grouper(3, args[1:]))
    trios = [(a, Sizes(b), Bed(c)) for a, b, c in trios]
    hlbed = Bed(highlights) if highlights else None

    for scaffoldID, scafsize in scafsizes.iter_sizes():
        if scafsize < opts.cutoff:
            continue
        logging.debug("Loading {0} (size={1})".format(scaffoldID,
            thousands(scafsize)))

        # Single-scaffold sizes file, loaded into a Sizes object below.
        tmpname = scaffoldID + ".sizes"
        with open(tmpname, "w") as tmp:
            tmp.write("{0}\t{1}".format(scaffoldID, scafsize))

        tmpsizes = Sizes(tmpname)
        tmpsizes.close(clean=True)

        # Always bind subhighlights: previously it was only assigned when
        # --highlights was given, so the call below raised
        # UnboundLocalError without that option. None matches the default
        # of plot_one_scaffold(highlights=None).
        if hlbed is not None:
            subhighlights = list(hlbed.sub_bed(scaffoldID))
        else:
            subhighlights = None

        imagename = ".".join((scaffoldID, opts.format))
        plot_one_scaffold(scaffoldID, tmpsizes, None, trios, imagename, iopts,
                          highlights=subhighlights)
예제 #12
0
def plot_one_scaffold(scaffoldID, ssizes, sbed, trios, imagename, iopts,
                      highlights=None):
    """Draw one figure for a scaffold, one subplot per trio, and save it.

    Args:
        scaffoldID: Scaffold identifier; looked up in `ssizes` and shown in
            the figure title.
        ssizes: Object providing get_size(scaffoldID).
        sbed: Passed through to scaffolding().
        trios: Sequence of (blastf, qsizes, qbed) triples, one per panel.
        imagename: Output image path handed to savefig().
        iopts: Image options; `dpi` is read here.
        highlights: Passed through to scaffolding().
    """
    npanels = len(trios)
    fig = plt.figure(1, (14, 8))
    plt.cla()
    plt.clf()
    root = fig.add_axes([0, 0, 1, 1])

    # Create every subplot first, then fill them in order.
    panels = []
    for column in range(1, npanels + 1):
        panels.append(fig.add_subplot(1, npanels, column))
    scafsize = ssizes.get_size(scaffoldID)

    for (blastf, qsizes, qbed), ax in zip(trios, panels):
        scaffolding(ax, scaffoldID, blastf, qsizes, ssizes, qbed, sbed,
                    highlights=highlights)

    # Title drawn on the full-figure axes, which stay otherwise invisible.
    title = "{0}   (size={1})".format(scaffoldID, thousands(scafsize))
    root.text(.5, .95, title, size=18, ha="center", color='b')
    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    savefig(imagename, dpi=iopts.dpi, iopts=iopts)
예제 #13
0
def plot_one_scaffold(scaffoldID,
                      ssizes,
                      sbed,
                      trios,
                      imagename,
                      iopts,
                      highlights=None):
    """Plot all evidence panels for a single scaffold and save the image.

    Args:
        scaffoldID: Scaffold identifier; looked up in `ssizes` and shown in
            the figure title.
        ssizes: Object providing get_size(scaffoldID).
        sbed: Passed through to scaffolding().
        trios: Sequence of (blastf, qsizes, qbed) triples, one per panel.
        imagename: Output image path handed to savefig().
        iopts: Image options; `dpi` is read here.
        highlights: Passed through to scaffolding().
    """
    panel_count = len(trios)
    fig = plt.figure(1, (14, 8))
    plt.cla()
    plt.clf()
    root = fig.add_axes([0, 0, 1, 1])

    # Subplots are numbered from 1; all are created before any is drawn on.
    subplots = [
        fig.add_subplot(1, panel_count, position)
        for position in range(1, panel_count + 1)
    ]
    scafsize = ssizes.get_size(scaffoldID)

    for (blastf, qsizes, qbed), panel in zip(trios, subplots):
        scaffolding(panel, scaffoldID, blastf, qsizes, ssizes, qbed, sbed,
                    highlights=highlights)

    # Title drawn on the full-figure axes, which stay otherwise invisible.
    label = "{0}   (size={1})".format(scaffoldID, thousands(scafsize))
    root.text(0.5, 0.95, label, size=18, ha="center", color="b")
    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    savefig(imagename, dpi=iopts.dpi, iopts=iopts)
예제 #14
0
    def analyze_nbinom(self, K=23, maxiter=100):
        """Analyze the K-mer histogram using negative binomial distribution.

        Fits a stack of negative binomial components, one per copy-number
        (CN) bin, to the histogram in ``self.data`` by optimizing one
        variable at a time, then writes a genome summary to stderr.

        Sets ``self.ploidy``, ``self.ploidy_message``, ``self.copy_messages``,
        ``self.repetitive``, ``self.snprate`` and ``self.lambda_``. Reads
        ``self.data`` as (coverage, count) pairs and ``self.totalKmers``.

        Args:
            K (int, optional): K-mer size used when generating the histogram. Defaults to 23.
            maxiter (int, optional): Maximum number of optimization rounds.
                Defaults to 100.

        Returns:
            dict: "generative_model" (callable), "Gbins", "lambda", "rho"
            and "kf_range", for use when plotting the fit.
        """
        from scipy.stats import nbinom
        from scipy.optimize import minimize_scalar
        from functools import lru_cache

        method, xopt = "bounded", "xatol"
        MAX_1CN_SIZE = 1e10
        MAX_OPTIMIZED_SIZE = 9.9e9

        # Generate bins for the decomposed negative binomial distributions
        bins = [(i, i) for i in range(1, 9)
                ]  # The first 8 CN are critical often determins ploidy
        for i in (8, 16, 32, 64, 128, 256, 512):  # 14 geometricly sized bins
            a, b = i + 1, int(round(i * 2**0.5))
            bins.append((a, b))
            a, b = b + 1, i * 2
            bins.append((a, b))

        # Convert histogram to np array so we can index by CN
        kf_ceil = max([cov for cov, _ in self.data])
        N = kf_ceil + 1
        # `np.int` was removed in NumPy 1.24; it was an alias of builtin int.
        hist = np.zeros(N, dtype=int)
        for cov, count in self.data:
            hist[cov] = count

        # min1: find first minimum
        _kf_min1 = 10
        while _kf_min1 - 1 >= 2 and hist[_kf_min1 - 1] < hist[_kf_min1]:
            _kf_min1 -= 1
        # Strict `<` keeps hist[_kf_min1 + 1] inside the array (last valid
        # index is kf_ceil).
        while _kf_min1 < kf_ceil and hist[_kf_min1 + 1] < hist[_kf_min1]:
            _kf_min1 += 1

        # max2: find absolute maximum mx2 above first minimum min1
        _kf_max2 = _kf_min1
        for kf in range(_kf_min1 + 1, int(0.8 * kf_ceil)):
            if hist[kf] > hist[_kf_max2]:
                _kf_max2 = kf

        # Discard the last entry as that is usually an inflated number
        hist = hist[:-1]
        kf_range = np.arange(_kf_min1, len(hist), dtype=int)
        P = hist[kf_range] * kf_range  # Target distribution
        print("==> Start nbinom method on range ({}, {})".format(
            _kf_min1, len(hist)))

        # Below is the optimization schemes, we optimize one variable at a time
        # lambda_ and rho arrive as integers scaled by 100 so they are
        # hashable, discrete cache keys.
        @lru_cache(maxsize=None)
        def nbinom_pmf_range(lambda_: int, rho: int, bin_id: int):
            stacked = np.zeros(len(kf_range), dtype=np.float64)
            lambda_ /= 100  # 2-digit precision
            rho /= 100  # 2-digit precision
            n = lambda_ / (rho - 1)
            p = 1 / rho
            start, end = bins[bin_id]
            for i in range(start, end + 1):
                stacked += nbinom.pmf(kf_range, n * i, p)
            return stacked

        def generative_model(G, lambda_, rho):
            stacked = np.zeros(len(kf_range), dtype=np.float64)
            lambda_ = int(round(lambda_ * 100))
            rho = int(round(rho * 100))
            for bin_id, g in enumerate(G):
                stacked += g * nbinom_pmf_range(lambda_, rho, bin_id)
            stacked *= kf_range
            return stacked

        def func(lambda_, rho, G):
            stacked = generative_model(G, lambda_, rho)
            return np.sum((P - stacked)**2)  # L2 norm

        def optimize_func(lambda_, rho, G):
            # Iterate over all G
            for i, g in enumerate(G):
                G_i = optimize_func_Gi(lambda_, rho, G, i)
                if (not 1 < G_i < MAX_OPTIMIZED_SIZE
                    ):  # Optimizer did not optimize this G_i
                    break
            # Also remove the last bin since it is subject to marginal effect
            G[i - 1] = 0
            lambda_ = optimize_func_lambda_(lambda_, rho, G)
            rho = optimize_func_rho(lambda_, rho, G)
            score = func(lambda_, rho, G)
            return lambda_, rho, G, score

        def optimize_func_lambda_(lambda_, rho, G):
            def f(arg):
                return func(arg, rho, G)

            res = minimize_scalar(f,
                                  bounds=(_kf_min1, 100),
                                  method=method,
                                  options={xopt: 0.01})
            return res.x

        def optimize_func_rho(lambda_, rho, G):
            def f(arg):
                return func(lambda_, arg, G)

            res = minimize_scalar(f,
                                  bounds=(1.001, 5),
                                  method=method,
                                  options={xopt: 0.01})
            return res.x

        def optimize_func_Gi(lambda_, rho, G, i):
            # Iterate a single G_i
            def f(arg):
                # G is updated in place, so the last value tried by the
                # optimizer stays in G[i].
                G[i] = arg
                return func(lambda_, rho, G)

            res = minimize_scalar(f,
                                  bounds=(0, MAX_1CN_SIZE),
                                  method=method,
                                  options={xopt: 100})
            return res.x

        def run_optimization(termination=0.999, maxiter=100):
            ll, rr, GG = l0, r0, G0
            prev_score = np.inf
            for i in range(maxiter):
                print("Iteration", i + 1, file=sys.stderr)
                ll, rr, GG, score = optimize_func(ll, rr, GG)
                # Stop once a round improves the score by less than
                # (1 - termination).
                if score / prev_score > termination:
                    break
                prev_score = score
                if i % 10 == 0:
                    print(ll, rr, GG, score, file=sys.stderr)
            print("Success!", file=sys.stderr)
            # Remove bogus values that are close to the bounds
            final_GG = [g for g in GG if 1 < g < MAX_OPTIMIZED_SIZE]
            return ll, rr, final_GG

        # Optimization - very slow
        G0 = np.zeros(len(bins))
        l0 = _kf_max2
        r0 = 1.5
        print(l0, r0, G0, file=sys.stderr)
        ll, rr, GG = run_optimization(maxiter=maxiter)
        print(ll, rr, GG, file=sys.stderr)

        # Ready for genome summary
        m = "\n==> Kmer (K={0}) Spectrum Analysis\n".format(K)

        genome_size = int(round(self.totalKmers / ll))
        inferred_genome_size = 0
        # `end` is read after the loops below; give it a value so an empty
        # GG does not leave it unbound.
        end = 0
        for i, g in enumerate(GG):
            start, end = bins[i]
            mid = (start + end) / 2
            inferred_genome_size += g * mid * (end - start + 1)
        inferred_genome_size = int(round(inferred_genome_size))
        genome_size = max(genome_size, inferred_genome_size)
        m += "Genome size estimate = {0}\n".format(thousands(genome_size))
        copy_series = []
        copy_messages = []
        for i, g in enumerate(GG):
            start, end = bins[i]
            mid = (start + end) / 2
            copy_num = start if start == end else "{}-{}".format(start, end)
            g_copies = int(round(g * mid * (end - start + 1)))
            copy_series.append((mid, copy_num, g_copies, g))
            copy_message = "CN {}: {:.1f} Mb ({:.1f} percent)".format(
                copy_num, g_copies / 1e6, g_copies * 100 / genome_size)
            copy_messages.append(copy_message)
            m += copy_message + "\n"

        # Whatever the fitted bins do not explain goes to an open-ended bin.
        if genome_size > inferred_genome_size:
            g_copies = genome_size - inferred_genome_size
            copy_num = "{}+".format(end + 1)
            copy_series.append(
                (end + 1, copy_num, g_copies, g_copies / (end + 1)))
            m += "CN {}: {:.1f} Mb ({:.1f} percent)\n".format(
                copy_num, g_copies / 1e6, g_copies * 100 / genome_size)

        # Determine ploidy
        def determine_ploidy(copy_series, threshold=0.15):
            counts_so_far = 1
            ploidy_so_far = 0
            for mid, copy_num, g_copies, g in copy_series:
                if g_copies / counts_so_far < threshold:
                    break
                counts_so_far += g_copies
                ploidy_so_far = mid
            return int(ploidy_so_far)

        ploidy = determine_ploidy(copy_series)
        self.ploidy = ploidy
        self.ploidy_message = "Ploidy: {}".format(ploidy)
        m += self.ploidy_message + "\n"
        self.copy_messages = copy_messages[:ploidy]

        # Repeat content
        def calc_repeats(copy_series, ploidy, genome_size):
            unique = 0
            for mid, copy_num, g_copies, g in copy_series:
                if mid <= ploidy:
                    unique += g_copies
                else:
                    break
            return 1 - unique / genome_size

        repeats = calc_repeats(copy_series, ploidy, genome_size)
        self.repetitive = "Repeats: {:.1f} percent".format(repeats * 100)
        m += self.repetitive + "\n"

        # SNP rate
        def calc_snp_rate(copy_series, ploidy, genome_size, K):
            # We can calculate the SNP rate s, assuming K-mer of length K:
            # s = 1-(1-L/G)^(1/K)
            # L: # of unique K-mers under 'het' peak
            # G: genome size
            # K: K-mer length
            L = 0
            for mid, copy_num, g_copies, g in copy_series:
                if mid < ploidy:
                    L += g
                else:
                    break
            return 1 - (1 - L / genome_size)**(1 / K)

        snp_rate = calc_snp_rate(copy_series, ploidy, genome_size, K)
        self.snprate = "SNP rate: {:.2f} percent".format(snp_rate * 100)
        m += self.snprate + "\n"
        print(m, file=sys.stderr)

        self.lambda_ = ll
        return {
            "generative_model": generative_model,
            "Gbins": GG,
            "lambda": ll,
            "rho": rr,
            "kf_range": kf_range,
        }
예제 #15
0
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic.
    """
    # NOTE: the docstring above is used as the command-line usage text and
    # is therefore left untouched. The third positional argument (shown as
    # "K" in the usage line) is the value the code calls N.
    #
    # Returns the estimated genome size (int) in both ASCII and image mode.
    import os.path

    p = OptionParser(histogram.__doc__)
    p.add_option(
        "--vmin",
        dest="vmin",
        default=1,
        type="int",
        help="minimum value, inclusive",
    )
    p.add_option(
        "--vmax",
        dest="vmax",
        default=100,
        type="int",
        help="maximum value, inclusive",
    )
    p.add_option(
        "--pdf",
        default=False,
        action="store_true",
        help="Print PDF instead of ASCII plot",
    )
    p.add_option(
        "--method",
        choices=("nbinom", "allpaths"),
        default="nbinom",
        help=
        "'nbinom' - slow but more accurate for het or polyploid genome; 'allpaths' - fast and works for homozygous genomes",
    )
    p.add_option(
        "--maxiter",
        default=100,
        type="int",
        help="Max iterations for optimization. Only used with --method nbinom",
    )
    p.add_option("--coverage",
                 default=0,
                 type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option(
        "--nopeaks",
        default=False,
        action="store_true",
        help="Do not annotate K-mer peaks",
    )
    opts, args, iopts = p.set_image_options(args, figsize="7x7")

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    method = opts.method
    vmin, vmax = opts.vmin, opts.vmax
    use_ascii = not opts.pdf  # avoid shadowing the builtin ascii()
    peaks = not opts.nopeaks and method == "allpaths"
    N = int(N)

    if histfile.rsplit(".", 1)[-1] in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = merylhistogram(histfile)

    ks = KmerSpectrum(histfile)
    method_info = ks.analyze(K=N, maxiter=opts.maxiter, method=method)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    # --coverage overrides the coverage found by the analysis.
    Kmer_coverage = ks.lambda_ if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1.0 / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1:.1f}x".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f} Mb".format(Genome_size /
                                                                 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print(msg, file=sys.stderr)

    x, y = ks.get_xy(vmin, vmax)
    title = "{0} {1}-mer histogram".format(species, N)

    if use_ascii:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (iopts.w, iopts.h))
    plt.bar(x, y, fc="#b2df8a", lw=0)
    # Plot the negative binomial fit
    if method == "nbinom":
        generative_model = method_info["generative_model"]
        GG = method_info["Gbins"]
        ll = method_info["lambda"]
        rr = method_info["rho"]
        kf_range = method_info["kf_range"]
        stacked = generative_model(GG, ll, rr)
        plt.plot(
            kf_range,
            stacked,
            ":",
            color="#6a3d9a",
            lw=2,
        )

    ax = plt.gca()

    if peaks:  # Only works for method 'allpaths'
        t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        tcounts = [(cov, cnt) for cov, cnt in ks.counts if cov in t]
        if tcounts:
            peak_x, peak_y = zip(*tcounts)
            tcounts = dict(tcounts)
            plt.plot(peak_x, peak_y, "ko", lw=3, mec="k", mfc="w")
            ax.text(ks.max1, tcounts[ks.max1], "SNP peak")
            ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6  # headroom above the bars
    if method == "nbinom":
        # Plot multiple CN locations, CN1, CN2, ... up to ploidy
        cn_color = "#a6cee3"
        for i in range(1, ks.ploidy + 1):
            cn_x = i * ks.lambda_
            plt.plot((cn_x, cn_x), (0, ymax), "-.", color=cn_color)
            plt.text(
                cn_x,
                ymax * 0.95,
                "CN{}".format(i),
                ha="right",
                va="center",
                color=cn_color,
                rotation=90,
            )

    messages = [
        Total_Kmers_msg,
        Kmer_coverage_msg,
        Genome_size_msg,
        Repetitive_msg,
        SNPrate_msg,
    ]
    if method == "nbinom":
        messages += [ks.ploidy_message] + ks.copy_messages
    write_messages(ax, messages)

    ax.set_title(markup(title))
    ax.set_xlim((0, vmax))
    ax.set_ylim((0, ymax))
    adjust_spines(ax, ["left", "bottom"], outward=True)
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    set_human_axis(ax)

    # Strip extensions from the file name only. Splitting the whole path on
    # "." truncated paths such as "./x.histogram" (-> ".pdf") or any path
    # with a dot in a directory name.
    dirname, basename = os.path.split(histfile)
    imagename = os.path.join(dirname,
                             basename.split(".")[0] + "." + iopts.format)
    savefig(imagename, dpi=100)

    return Genome_size
예제 #16
0
    def analyze_allpaths(self, ploidy=2, K=23, covmax=1000000):
        """
        Analyze Kmer spectrum, calculations derived from
        allpathslg/src/kmers/KmerSpectra.cc

        Reads ``self.data`` as (kmer frequency, count) pairs and
        ``self.totalKmers``. Sets ``self.min1``, ``self.max1``,
        ``self.min2``, ``self.max2``, ``self.min3``, ``self.lambda_``,
        ``self.repetitive``, ``self.snprate`` and ``self.genomesize``, and
        writes the report to stderr.

        Args:
            ploidy: The SNP rate and the max2 doubling check are only
                computed when this is 2.
            K: K-mer size, used in the SNP rate estimate and the report.
            covmax: Frequencies above this value are ignored.

        Returns:
            dict: always empty.
        """
        from math import sqrt

        data = self.data
        # Loop variable named `freq` so it does not shadow parameter K.
        kf_ceil = max(freq for (freq, c) in data)
        if kf_ceil > covmax:
            exceeds = sum(1 for (freq, c) in data if freq > covmax)
            logging.debug("A total of {0} distinct K-mers appear > "
                          "{1} times. Ignored ...".format(exceeds, covmax))
            kf_ceil = covmax

        nkf = kf_ceil + 1
        a = [0] * nkf
        for kf, c in data:
            if kf > kf_ceil:
                continue
            a[kf] = c

        ndk = a  # number of distinct kmers
        nk = [k * c for k, c in enumerate(a)]  # number of kmers
        cndk = [0] * nkf  # cumulative number of distinct kmers
        cnk = [0] * nkf  # cumulative number of kmers
        for kf in range(1, nkf):
            # Trapezoidal accumulation between consecutive frequencies.
            cndk[kf] = cndk[kf - 1] + 0.5 * (ndk[kf - 1] + ndk[kf])
            cnk[kf] = cnk[kf - 1] + 0.5 * (nk[kf - 1] + nk[kf])

        # Separate kmer spectrum in 5 regions based on the kf
        # 1        ... kf_min1    : bad kmers with low frequency
        # kf_min1  ... kf_min2    : good kmers CN = 1/2 (SNPs)
        # kf_min2  ... kf_min3    : good kmers CN = 1
        # kf_min3  ... kf_hi      : good kmers CN > 1 (repetitive)
        # kf_hi    ... inf        : bad kmers with high frequency

        # min1: find first minimum
        _kf_min1 = 10
        while _kf_min1 - 1 >= 2 and nk[_kf_min1 - 1] < nk[_kf_min1]:
            _kf_min1 -= 1
        # Strict `<` keeps nk[_kf_min1 + 1] inside the list (last valid
        # index is kf_ceil).
        while _kf_min1 < kf_ceil and nk[_kf_min1 + 1] < nk[_kf_min1]:
            _kf_min1 += 1

        # max2: find absolute maximum mx2 above first minimum min1
        _kf_max2 = _kf_min1
        for kf in range(_kf_min1 + 1, int(0.8 * kf_ceil)):
            if nk[kf] > nk[_kf_max2]:
                _kf_max2 = kf

        # max2: resetting max2 for cases of very high polymorphism
        if ploidy == 2:
            ndk_half = ndk[_kf_max2 // 2]
            # 2 * max2 may lie beyond the histogram; count that as no data.
            ndk_double = ndk[_kf_max2 * 2] if _kf_max2 * 2 < nkf else 0
            if ndk_double > ndk_half:
                _kf_max2 *= 2

        # max1: SNPs local maximum max1 as half global maximum max2
        _kf_max1 = _kf_max2 // 2

        # min2: SNPs local minimum min2 between max1 and max2
        _kf_min2 = (_kf_max1 * (2 * ndk[_kf_max1] + ndk[_kf_max2]) //
                    (ndk[_kf_max1] + ndk[_kf_max2]))

        # min1: refine between min1 and max2/2
        for kf in range(_kf_min1 + 1, _kf_max1):
            if nk[kf] < nk[_kf_min1]:
                _kf_min1 = kf

        # min3: not a minimum, really. upper edge of main peak
        # Clamped to kf_ceil because it is used to index the cumulative
        # arrays below.
        _kf_min3 = min(_kf_max2 * 3 // 2, kf_ceil)

        print("kfs:",
              _kf_min1,
              _kf_max1,
              _kf_min2,
              _kf_max2,
              _kf_min3,
              file=sys.stderr)
        self.min1 = _kf_min1
        self.max1 = _kf_max1
        self.min2 = _kf_min2
        self.max2 = _kf_max2
        self.min3 = _kf_min3
        self.lambda_ = self.max2  # Main peak

        # Define maximum kf above which we neglect data
        _kf_hi = (_kf_max2 * sqrt(4 * ndk[2 * _kf_max2] * _kf_max2)
                  if 2 * _kf_max2 < len(ndk) else _kf_max2 *
                  sqrt(4 * ndk[len(ndk) - 1] * _kf_max2))
        _kf_hi = int(_kf_hi)

        if _kf_hi > kf_ceil:
            _kf_hi = kf_ceil

        _nk_total = cnk[len(cnk) - 1]
        _nk_bad_low_kf = cnk[_kf_min1]
        _nk_good_uniq = cnk[_kf_min3] - cnk[_kf_min2]
        _nk_bad_high_kf = _nk_total - cnk[_kf_hi]
        _ndk_good_snp = cndk[_kf_min2] - cndk[_kf_min1]
        _ndk_good_uniq = cndk[_kf_min3] - cndk[_kf_min2]

        # kmer coverage C_k
        _kf_ave_uniq = _nk_good_uniq * 1.0 / _ndk_good_uniq
        _genome_size = (_nk_total - _nk_bad_low_kf -
                        _nk_bad_high_kf) / _kf_ave_uniq
        _genome_size_unique = _ndk_good_uniq + _ndk_good_snp / 2
        _genome_size_repetitive = _genome_size - _genome_size_unique
        _coverage = _nk_total / _genome_size if _genome_size else 0

        # SNP rate estimation, assumes uniform distribution of SNPs over the
        # genome and accounts for the reduction in SNP kmer counts when
        # polymorphism is very high
        if ploidy == 2:
            _d_SNP = (1.0 /
                      (1.0 -
                       (1.0 - 0.5 * _ndk_good_snp / _genome_size)**(1.0 / K))
                      if _ndk_good_snp > 0 else 1000000)

        G = int(_genome_size)
        G1 = int(_genome_size_unique)
        GR = int(_genome_size_repetitive)
        coverage = int(_coverage)

        m = "Kmer (K={0}) Spectrum Analysis\n".format(K)
        m += "Genome size estimate = {0}\n".format(thousands(G))
        m += "Genome size estimate CN = 1 = {0} ({1})\n".format(
            thousands(G1), percentage(G1, G))
        m += "Genome size estimate CN > 1 = {0} ({1})\n".format(
            thousands(GR), percentage(GR, G))
        m += "Coverage estimate: {0} x\n".format(coverage)
        self.repetitive = "Repeats: {0} percent".format(GR * 100 // G)

        if ploidy == 2:
            d_SNP = int(_d_SNP)
            self.snprate = "SNP rate ~= 1/{0}".format(d_SNP)
        else:
            self.snprate = "SNP rate not computed (Ploidy = {0})".format(
                ploidy)
        m += self.snprate + "\n"

        self.genomesize = int(round(self.totalKmers * 1.0 / self.max2))

        print(m, file=sys.stderr)
        return {}
예제 #17
0
파일: sizes.py 프로젝트: wroldwiedbwe/jcvi
def histogram(args):
    """
    %prog histogram [reads.fasta|reads.fastq]

    Plot read length distribution for reads. The plot would be similar to the
    one generated by SMRT-portal, for example:

    http://blog.pacificbiosciences.com/2013/10/data-release-long-read-shotgun.html

    Plot has two axes - corresponding to pdf and cdf, respectively.  Also adding
    number of reads, average/median, N50, and total length.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is user-facing output and is kept verbatim.
    from jcvi.utils.cbook import SUFFIXES, human_size, thousands
    from jcvi.formats.fastq import fasta
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.graphics.base import (
        human_base_formatter,
        human_formatter,
        markup,
        plt,
        savefig,
        set2,
        set_ticklabels_helvetica,
    )

    parser = OptionParser(histogram.__doc__)
    parser.set_histogram(
        vmax=50000,
        bins=100,
        xlabel="Read length",
        title="Read length distribution",
    )
    parser.add_option(
        "--ylabel1", default="Counts", help="Label of y-axis on the left"
    )
    parser.add_option(
        "--color",
        default="0",
        choices=[str(idx) for idx in range(8)],
        help="Color of bars, which is an index 0-7 in brewer set2",
    )
    opts, args, iopts = parser.set_image_options(
        args, figsize="6x6", style="dark"
    )

    if len(args) != 1:
        sys.exit(not parser.print_help())

    # fasta() hands back a pair of file names; only the first one is used.
    fastafile, _ = fasta([args[0], "--seqtk"])
    sizes = Sizes(fastafile)
    read_lengths = sorted(sizes.sizes)
    xmin, xmax, bins = opts.vmin, opts.vmax, opts.bins
    left, height = stem_leaf_plot(read_lengths, xmin, xmax, bins)

    plt.figure(1, (iopts.w, iopts.h))
    ax1 = plt.gca()

    # Left axis: bar chart of read-length counts.
    bar_width = (xmax - xmin) * 0.5 / bins
    ax1.bar(
        left,
        height,
        width=bar_width,
        linewidth=0,
        fc=set2[int(opts.color)],
        align="center",
    )
    ax1.set_xlabel(markup(opts.xlabel))
    ax1.set_ylabel(opts.ylabel1)

    # Right axis: amount of sequence held in reads at least this long.
    ax2 = ax1.twinx()
    total_size, l50, _ = sizes.summary
    hsize = human_size(total_size)
    # Last two characters of the human-readable size; presumably the unit
    # suffix that SUFFIXES[1000] lists -- TODO confirm against human_size().
    tag = hsize[-2:]
    unit = 1000 ** SUFFIXES[1000].index(tag)

    cdf = {}
    consumed = 0
    for length in read_lengths:
        # Only the first occurrence of each length records a point.
        cdf.setdefault(length, (total_size - consumed) * 1.0 / unit)
        consumed += length
    lengths, remaining = zip(*sorted(cdf.items()))
    ax2.plot(lengths, remaining, "-", color="darkslategray")
    ax2.set_ylabel("{0} above read length".format(tag))

    xlim = (xmin - bar_width / 2, xmax + bar_width / 2)
    for axis in (ax1, ax2):
        set_ticklabels_helvetica(axis)
        axis.set_xlim(xlim)

    # Summary lines: echoed to stderr and stacked in the top-right corner.
    annotations = (
        "Total bases: {0}".format(hsize),
        "Total reads: {0}".format(thousands(len(sizes))),
        "Average read length: {0}bp".format(thousands(np.mean(read_lengths))),
        "Median read length: {0}bp".format(thousands(np.median(read_lengths))),
        "N50 read length: {0}bp".format(thousands(l50)),
    )
    axes_transform = ax1.transAxes
    text_x, text_y = 0.95, 0.95
    for line in annotations:
        print(line, file=sys.stderr)
        ax1.text(
            text_x,
            text_y,
            line,
            color="gray",
            transform=axes_transform,
            ha="right",
        )
        text_y -= 0.05

    ax1.set_title(markup(opts.title))
    # Seaborn removes ticks for all styles except 'ticks'. Now add them back:
    ax1.tick_params(
        axis="x",
        direction="out",
        length=3,
        left=False,
        right=False,
        top=False,
        bottom=True,
    )
    ax1.xaxis.set_major_formatter(human_base_formatter)
    ax1.yaxis.set_major_formatter(human_formatter)
    savefig(sizes.filename + ".pdf")
예제 #18
0
파일: sizes.py 프로젝트: xuanblo/jcvi
def histogram(args):
    """
    %prog histogram [reads.fasta|reads.fastq]

    Plot read length distribution for reads. The plot would be similar to the
    one generated by SMRT-portal, for example:

    http://blog.pacificbiosciences.com/2013/10/data-release-long-read-shotgun.html

    Plot has two axes - corresponding to pdf and cdf, respectively.  Also adding
    number of reads, average/median, N50, and total length.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is user-facing output and is kept verbatim.
    from jcvi.utils.cbook import human_size, thousands, SUFFIXES
    from jcvi.formats.fastq import fasta
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.graphics.base import plt, markup, human_formatter, \
                human_base_formatter, savefig, set2, set_ticklabels_helvetica

    p = OptionParser(histogram.__doc__)
    p.set_histogram(vmax=50000, bins=100, xlabel="Read length",
                    title="Read length distribution")
    p.add_option("--ylabel1", default="Counts",
                 help="Label of y-axis on the left")
    p.add_option("--color", default='0', choices=[str(x) for x in range(8)],
                 help="Color of bars, which is an index 0-7 in brewer set2")
    opts, args, iopts = p.set_image_options(args, figsize="6x6", style="dark")

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    # fasta() returns a pair of file names; only the first one is used here.
    fastafile, _ = fasta([fastafile, "--seqtk"])
    sizes = Sizes(fastafile)
    all_sizes = sorted(sizes.sizes)
    xmin, xmax, bins = opts.vmin, opts.vmax, opts.bins
    left, height = stem_leaf_plot(all_sizes, xmin, xmax, bins)

    plt.figure(1, (iopts.w, iopts.h))
    ax1 = plt.gca()

    # Left axis: bar chart of read-length counts.
    width = (xmax - xmin) * .5 / bins
    color = set2[int(opts.color)]
    ax1.bar(left, height, width=width, linewidth=0, fc=color, align="center")
    ax1.set_xlabel(markup(opts.xlabel))
    ax1.set_ylabel(opts.ylabel1)

    # Right axis: amount of sequence held in reads at least this long.
    ax2 = ax1.twinx()
    cur_size = 0
    total_size, l50, _ = sizes.summary
    cdf = {}
    hsize = human_size(total_size)
    # Last two characters of the human-readable size; presumably the unit
    # suffix that SUFFIXES[1000] lists -- TODO confirm against human_size().
    tag = hsize[-2:]
    unit = 1000 ** SUFFIXES[1000].index(tag)

    for x in all_sizes:
        # Only the first occurrence of each length records a point.
        if x not in cdf:
            cdf[x] = (total_size - cur_size) * 1. / unit
        cur_size += x
    x, y = zip(*sorted(cdf.items()))
    ax2.plot(x, y, '-', color="darkslategray")
    ylabel2 = "{0} above read length".format(tag)
    ax2.set_ylabel(ylabel2)

    for ax in (ax1, ax2):
        set_ticklabels_helvetica(ax)
        ax.set_xlim((xmin - width / 2, xmax + width / 2))

    tc = "gray"
    axt = ax1.transAxes
    xx, yy = .95, .95
    ma = "Total bases: {0}".format(hsize)
    mb = "Total reads: {0}".format(thousands(len(sizes)))
    mc = "Average read length: {0}bp".format(thousands(np.mean(all_sizes)))
    md = "Median read length: {0}bp".format(thousands(np.median(all_sizes)))
    me = "N50 read length: {0}bp".format(thousands(l50))
    for t in (ma, mb, mc, md, me):
        # sys.stderr.write() emits the same bytes as the Python 2 print
        # statement and, unlike it, also works on Python 3.
        sys.stderr.write(t + "\n")
        ax1.text(xx, yy, t, color=tc, transform=axt, ha="right")
        yy -= .05

    ax1.set_title(markup(opts.title))
    # Seaborn removes ticks for all styles except 'ticks'. Now add them back:
    ax1.tick_params(axis="x", direction="out", length=3,
                    left=False, right=False, top=False, bottom=True)
    ax1.xaxis.set_major_formatter(human_base_formatter)
    ax1.yaxis.set_major_formatter(human_formatter)
    figname = sizes.filename + ".pdf"
    savefig(figname)
예제 #19
0
def dotplot(anchorfile, qbed, sbed, fig, root, ax, vmin=0, vmax=1,
        is_self=False, synteny=False, cmap_text=None, cmap="copper",
        genomenames=None, sample_number=10000, minfont=5, palette=None,
        chrlw=.1, title=None, sep=True, sepcolor="g", stdpf=True):
    """
    Draw a dot plot of the anchor pairs listed in `anchorfile` onto `ax`.

    Each data line supplies a query name, a subject name and, as its last
    column, a value. A line starting with "#" opens a new block; when a
    `palette` is given, every pair of that block is drawn in the colour
    `palette.get(block_id, "k")` instead of its value. Pairs whose names are
    missing from `qbed.order` / `sbed.order` are skipped.

    With `cmap_text` set, values outside [vmin, vmax] are dropped and
    non-numeric values count as `vmax`; without it every value is 0.
    `is_self` mirrors each pair across the diagonal. `genomenames`, when
    given, is split on "_" into the x and y genome labels. Axis decorations
    are delegated to `plot_breaks_and_labels`, and a default title is
    generated when `title` is None.
    """
    # add genome names
    if genomenames:
        gx, gy = genomenames.split("_")
    else:
        to_ax_label = lambda fname: op.basename(fname).split(".")[0]
        gx, gy = [to_ax_label(x.filename) for x in (qbed, sbed)]
    gx, gy = markup(gx), markup(gy)

    qorder = qbed.order
    sorder = sbed.order

    data = []
    if cmap_text:
        logging.debug("Capping values within [{0:.1f}, {1:.1f}]"\
                        .format(vmin, vmax))

    block_id = 0
    # The block colour must survive across rows: it is set by a "#" header
    # and applies to every pair until the next header. (It used to be reset
    # on each row, so palette colours were never applied.)
    block_color = None
    with open(anchorfile) as fp:
        for row in fp:
            atoms = row.split()
            if row[0] == "#":
                block_id += 1
                block_color = palette.get(block_id, "k") if palette else None
                continue

            # first two columns are query and subject, and an optional third
            # column
            if len(atoms) < 2:
                continue

            query, subject = atoms[:2]
            value = atoms[-1]

            if cmap_text:
                try:
                    value = float(value)
                except ValueError:
                    value = vmax

                if value < vmin or value > vmax:
                    continue
            else:
                value = 0

            if query not in qorder or subject not in sorder:
                continue

            qi, q = qorder[query]
            si, s = sorder[subject]

            nv = value if block_color is None else block_color
            data.append((qi, si, nv))
            if is_self:  # Mirror image
                data.append((si, qi, nv))

    npairs = downsample(data, sample_number=sample_number)
    x, y, c = zip(*data)

    if palette:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0)
    else:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0, cmap=cmap,
                vmin=vmin, vmax=vmax)

    if synteny:
        clusters = batch_scan(data, qbed, sbed)
        draw_box(clusters, ax)

    if cmap_text:
        draw_cmap(root, cmap_text, vmin, vmax, cmap=cmap)

    xsize, ysize = len(qbed), len(sbed)
    logging.debug("xsize=%d ysize=%d" % (xsize, ysize))
    qbreaks = qbed.get_breaks()
    sbreaks = sbed.get_breaks()
    xlim, ylim = plot_breaks_and_labels(fig, root, ax, gx, gy, xsize, ysize,
                           qbreaks, sbreaks, sep=sep, chrlw=chrlw,
                           sepcolor=sepcolor, minfont=minfont, stdpf=stdpf)

    # create a diagonal to separate mirror image for self comparison
    if is_self:
        ax.plot(xlim, (0, ysize), 'm-', alpha=.5, lw=2)

    if palette:  # bottom-left has the palette, if available
        colors = palette.colors
        xstart, ystart = .1, .05
        for category, c in sorted(colors.items()):
            root.add_patch(Rectangle((xstart, ystart), .03, .02, lw=0, fc=c))
            root.text(xstart + .04, ystart, category, color=c)
            xstart += .1

    if title is None:
        title = "Inter-genomic comparison: {0} vs {1}".format(gx, gy)
        if is_self:
            title = "Intra-genomic comparison within {0}".format(gx)
            # Every pair was added twice (mirror image); floor division
            # keeps the count an integer on Python 3 as well.
            npairs //= 2
        title += " ({0} gene pairs)".format(thousands(npairs))
    root.set_title(title, x=.5, y=.96, color="k")
    if title:
        logging.debug("Dot plot title: {}".format(title))
    normalize_axes(root)
예제 #20
0
파일: vcf.py 프로젝트: Hensonmw/jcvi
def summary(args):
    """
    %prog summary txtfile fastafile

    The txtfile can be generated by: %prog mstmap --noheader --freq=0

    Tabulate on all possible combinations of genotypes and provide results
    in a nicely-formatted table. Give a fastafile for SNP rate (average
    # of SNPs per Kb).

    Only three-column file is supported:
    locus_id    intra- genotype    inter- genotype
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is user-facing output and is kept verbatim.
    from jcvi.utils.cbook import thousands
    from jcvi.utils.table import tabulate

    p = OptionParser(summary.__doc__)
    p.add_option("--counts",
                 help="Print SNP counts in a txt file [default: %default]")
    p.add_option("--bed",
                 help="Print SNPs locations in a bed file [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    txtfile, fastafile = args
    bedfw = open(opts.bed, "w") if opts.bed else None

    snps = defaultdict(list)  # contig => list of loci
    combinations = defaultdict(int)
    intraSNPs = interSNPs = 0
    distinctSet = set()  # set of genes that show A-B pattern
    snpcounts, goodsnpcounts = defaultdict(int), defaultdict(int)
    # try/finally + with: both the input and the optional BED output are
    # closed even if a malformed row aborts the loop.
    try:
        with open(txtfile) as fp:
            # next(fp) works on Python 2.6+ and 3; fp.next() is Python 2 only
            header = next(fp).split()  # Header
            ref, alt = header[1:3]
            for row in fp:
                atoms = row.split()
                assert len(atoms) == 3, \
                        "Only three-column file is supported"
                locus, intra, inter = atoms
                # locus looks like "<contig>.<position>"
                ctg, pos = locus.rsplit(".", 1)
                pos = int(pos)
                snps[ctg].append(pos)
                snpcounts[ctg] += 1

                if intra == 'X':
                    intraSNPs += 1
                if inter in ('B', 'X'):
                    interSNPs += 1
                if intra == 'A' and inter == 'B':
                    distinctSet.add(ctg)
                    goodsnpcounts[ctg] += 1
                # Tabulate all possible combinations
                intra = ref + "-" + intra
                inter = alt + "-" + inter
                combinations[(intra, inter)] += 1

                if bedfw:
                    bedfw.write("\t".join(str(x) for x in
                                (ctg, pos - 1, pos, locus)) + "\n")
    finally:
        if bedfw:
            bedfw.close()

    if bedfw:
        logging.debug("SNP locations written to `{0}`.".format(opts.bed))

    nsites = sum(len(x) for x in snps.values())
    sizes = Sizes(fastafile)
    bpsize = sizes.totalsize
    snprate = lambda a: a * 1000. / bpsize  # SNPs per Kb of total sequence
    m = "Dataset `{0}` contains {1} contigs ({2} bp).\n".\
                format(fastafile, len(sizes), thousands(bpsize))
    m += "A total of {0} SNPs within {1} contigs ({2} bp).\n".\
                format(nsites, len(snps),
                       thousands(sum(sizes.mapping[x] for x in snps)))
    m += "SNP rate: {0:.1f}/Kb, ".format(snprate(nsites))
    m += "IntraSNPs: {0} ({1:.1f}/Kb), InterSNPs: {2} ({3:.1f}/Kb)".\
                format(intraSNPs, snprate(intraSNPs), interSNPs, snprate(interSNPs))
    # sys.stderr.write() emits the same text as the Python 2 print statement
    # and, unlike it, also works on Python 3.
    sys.stderr.write(m + "\n")
    sys.stderr.write(str(tabulate(combinations)) + "\n")

    leg = "Legend: A - homozygous same, B - homozygous different, X - heterozygous"
    sys.stderr.write(leg + "\n")

    tag = (ref + "-A", alt + "-B")
    distinctSNPs = combinations[tag]
    tag = str(tag).replace("'", "")
    sys.stderr.write("A total of {0} disparate {1} SNPs in {2} contigs.".\
                format(distinctSNPs, tag, len(distinctSet)) + "\n")

    if not opts.counts:
        return

    # Sanity-check the tallies before creating the counts file, so a failed
    # check does not leave a header-only file behind.
    assert sum(snpcounts.values()) == nsites
    assert sum(goodsnpcounts.values()) == distinctSNPs

    snpcountsfile = opts.counts
    with open(snpcountsfile, "w") as fw:
        fw.write("\t".join(("Contig", "#_SNPs", "#_AB_SNP")) + "\n")
        for ctg in sorted(snps):
            snpcount = snpcounts[ctg]
            goodsnpcount = goodsnpcounts[ctg]
            fw.write("\t".join(str(x) for x in
                     (ctg, snpcount, goodsnpcount)) + "\n")

    logging.debug("SNP counts per contig is written to `{0}`.".\
                  format(snpcountsfile))
예제 #21
0
def coverage(args):
    """
    %prog coverage fastafile ctg bedfile1 bedfile2 ..

    Plot coverage from a set of BED files that contain the read mappings. The
    paired read span will be converted to a new bedfile that contain the happy
    mates. ctg is the chr/scf/ctg that you want to plot the histogram on.

    If the bedfiles already contain the clone spans, turn on --spans.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is user-facing output and is kept verbatim.
    from itertools import cycle

    from jcvi.formats.bed import mates, bedpe

    p = OptionParser(coverage.__doc__)
    p.add_option("--ymax",
                 default=None,
                 type="int",
                 help="Limit ymax [default: %default]")
    p.add_option(
        "--spans",
        default=False,
        action="store_true",
        help="BED files already contain clone spans [default: %default]")
    opts, args, iopts = p.set_image_options(args, figsize="8x5")

    if len(args) < 3:
        sys.exit(not p.print_help())

    fastafile, ctg = args[0:2]
    bedfiles = args[2:]

    sizes = Sizes(fastafile)
    size = sizes.mapping[ctg]

    plt.figure(1, (iopts.w, iopts.h))
    ax = plt.gca()

    bins = 100  # smooth the curve
    lines = []
    legends = []
    yy = .9
    # Cycle through the colour codes so that more than six BED files are all
    # plotted (colours repeat) instead of the extra files being silently
    # dropped by zip().
    for bedfile, c in zip(bedfiles, cycle("rgbcky")):
        if not opts.spans:
            # Derive <prefix>.mates and <prefix>.spans.bed from the read
            # mappings, regenerating each only when need_update() says so.
            pf = bedfile.rsplit(".", 1)[0]
            matesfile = pf + ".mates"
            if need_update(bedfile, matesfile):
                matesfile, matesbedfile = mates([bedfile, "--lib"])

            bedspanfile = pf + ".spans.bed"
            if need_update(matesfile, bedspanfile):
                bedpefile, bedspanfile = bedpe(
                    [bedfile, "--span", "--mates={0}".format(matesfile)])
            bedfile = bedspanfile

        bedsum = Bed(bedfile).sum(seqid=ctg)
        notcoveredbases = size - bedsum

        legend = bedfile.split(".")[0]
        msg = "{0}: {1} bp not covered".format(legend,
                                               thousands(notcoveredbases))
        # sys.stderr.write() emits the same text as the Python 2 print
        # statement and, unlike it, also works on Python 3.
        sys.stderr.write(msg + "\n")
        ax.text(.1, yy, msg, color=c, size=9, transform=ax.transAxes)
        yy -= .08

        cov = Coverage(bedfile, sizes.filename)
        x, y = cov.get_plot_data(ctg, bins=bins)
        line, = ax.plot(x, y, '-', color=c, lw=2, alpha=.5)
        lines.append(line)
        legends.append(legend)

    leg = ax.legend(lines, legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(.5)

    # Floor division keeps the label a whole number of Kb on Python 3 too
    # (plain "/" would turn it into a long float there).
    ylabel = "Average depth per {0}Kb".format(size // bins // 1000)
    ax.set_xlim(0, size)
    ax.set_ylim(0, opts.ymax)
    ax.set_xlabel(ctg)
    ax.set_ylabel(ylabel)
    set_human_base_axis(ax)

    figname = "{0}.{1}.pdf".format(fastafile, ctg)
    savefig(figname, dpi=iopts.dpi, iopts=iopts)
예제 #22
0
파일: dotplot.py 프로젝트: JinfengChen/jcvi
def dotplot(anchorfile, qbed, sbed, fig, root, ax, vmin=0, vmax=1,
        is_self=False, synteny=False, cmap_text=None, cmap="copper",
        genomenames=None, sample_number=10000, minfont=5, palette=None,
        chrlw=.01, title=None, sepcolor="gainsboro"):
    """
    Draw a dot plot of the anchor pairs listed in `anchorfile` onto `ax`.

    Each data line supplies a query name, a subject name and, as its last
    column, a value. A line starting with "#" opens a new block; when a
    `palette` is given, every pair of that block is drawn in the colour
    `palette.get(block_id, "k")` instead of its value. Pairs whose names are
    missing from `qbed.order` / `sbed.order` are skipped.

    With `cmap_text` set, values outside [vmin, vmax] are dropped and
    non-numeric values count as `vmax`; without it every value is 0.
    `is_self` mirrors each pair across the diagonal. When more than
    `sample_number` points were collected, only `sample_number` of them are
    drawn. Chromosome breaks and labels come from `qbed.get_breaks()` and
    `sbed.get_breaks()`; labels whose selected font size is below `minfont`
    are omitted. A default title is generated when `title` is empty.
    """
    qorder = qbed.order
    sorder = sbed.order

    data = []
    if cmap_text:
        logging.debug("Capping values within [{0:.1f}, {1:.1f}]"\
                        .format(vmin, vmax))

    block_id = 0
    # The block colour must survive across rows: it is set by a "#" header
    # and applies to every pair until the next header. (It used to be reset
    # on each row, so palette colours were never applied.)
    block_color = None
    with open(anchorfile) as fp:
        for row in fp:
            atoms = row.split()
            if row[0] == "#":
                block_id += 1
                block_color = palette.get(block_id, "k") if palette else None
                continue

            # first two columns are query and subject, and an optional third
            # column
            if len(atoms) < 2:
                continue

            query, subject = atoms[:2]
            value = atoms[-1]

            if cmap_text:
                try:
                    value = float(value)
                except ValueError:
                    value = vmax

                if value < vmin or value > vmax:
                    continue
            else:
                value = 0

            if query not in qorder or subject not in sorder:
                continue

            qi, q = qorder[query]
            si, s = sorder[subject]

            nv = value if block_color is None else block_color
            data.append((qi, si, nv))
            if is_self:  # Mirror image
                data.append((si, qi, nv))

    npairs = len(data)
    # Only show random subset
    if npairs > sample_number:
        logging.debug("Showing a random subset of {0} data points (total {1}) " \
                      "for clarity.".format(sample_number, npairs))
        data = sample(data, sample_number)

    # Points are drawn in file/sample order; no value-based sorting is done.
    x, y, c = zip(*data)

    if palette:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0)
    else:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0, cmap=cmap,
                vmin=vmin, vmax=vmax)

    if synteny:
        clusters = batch_scan(data, qbed, sbed)
        draw_box(clusters, ax)

    if cmap_text:
        draw_cmap(root, cmap_text, vmin, vmax, cmap=cmap)

    xsize, ysize = len(qbed), len(sbed)
    logging.debug("xsize=%d ysize=%d" % (xsize, ysize))
    xlim = (0, xsize)
    ylim = (ysize, 0)  # invert the y-axis

    # Tag to mark whether to plot chr name (skip small ones)
    xchr_labels, ychr_labels = [], []
    th = TextHandler(fig)

    # plot the chromosome breaks
    for (seqid, beg, end) in qbed.get_breaks():
        xsize_ratio = abs(end - beg) * .8 / xsize
        fontsize = th.select_fontsize(xsize_ratio)
        seqid = "".join(seqid_parse(seqid)[:2])

        xchr_labels.append((seqid, (beg + end) / 2, fontsize))
        ax.plot([beg, beg], ylim, "-", lw=chrlw, color=sepcolor)

    for (seqid, beg, end) in sbed.get_breaks():
        ysize_ratio = abs(end - beg) * .8 / ysize
        fontsize = th.select_fontsize(ysize_ratio)
        seqid = "".join(seqid_parse(seqid)[:2])

        ychr_labels.append((seqid, (beg + end) / 2, fontsize))
        ax.plot(xlim, [beg, beg], "-", lw=chrlw, color=sepcolor)

    # plot the chromosome labels; positions are mapped into the .1-.9 band
    # of the `root` canvas
    for label, pos, fontsize in xchr_labels:
        pos = .1 + pos * .8 / xsize
        if fontsize >= minfont:
            root.text(pos, .91, latex(label), size=fontsize,
                ha="center", va="bottom", rotation=45, color="grey")

    # remember y labels are inverted
    for label, pos, fontsize in ychr_labels:
        pos = .9 - pos * .8 / ysize
        if fontsize >= minfont:
            root.text(.91, pos, latex(label), size=fontsize,
                va="center", color="grey")

    # create a diagonal to separate mirror image for self comparison
    if is_self:
        ax.plot(xlim, (0, ysize), 'm-', alpha=.5, lw=2)

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # add genome names
    if genomenames:
        gx, gy = genomenames.split("_")
    else:
        to_ax_label = lambda fname: op.basename(fname).split(".")[0]
        gx, gy = [to_ax_label(x.filename) for x in (qbed, sbed)]
    ax.set_xlabel(markup(gx), size=16)
    ax.set_ylabel(markup(gy), size=16)

    # beautify the numeric axis
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)

    set_human_axis(ax)

    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(),
            color='gray', size=10)

    if palette:  # bottom-left has the palette, if available
        colors = palette.colors
        xstart, ystart = .1, .05
        for category, c in sorted(colors.items()):
            root.add_patch(Rectangle((xstart, ystart), .03, .02, lw=0, fc=c))
            root.text(xstart + .04, ystart, category, color=c)
            xstart += .1

    if not title:
        title = "Inter-genomic comparison: {0} vs {1}".format(gx, gy)
        if is_self:
            title = "Intra-genomic comparison within {0}".format(gx)
            # Every pair was added twice (mirror image); floor division
            # keeps the count an integer on Python 3 as well.
            npairs //= 2
        title += " ({0} gene pairs)".format(thousands(npairs))
    root.set_title(markup(title), x=.5, y=.96, color="k")
    logging.debug(title)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()
예제 #23
0
파일: kmer.py 프로젝트: arvin580/jcvi
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is user-facing output and is kept verbatim. Returns the estimated
    # genome size (total K-mers divided by K-mer coverage, rounded).
    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin", dest="vmin", default=1, type="int", help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax", dest="vmax", default=100, type="int", help="maximum value, inclusive [default: %default]")
    p.add_option(
        "--pdf", default=False, action="store_true", help="Print PDF instead of ASCII plot [default: %default]"
    )
    p.add_option("--coverage", default=0, type="int", help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks", default=False, action="store_true", help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    # Named use_ascii rather than `ascii` to avoid shadowing the builtin.
    use_ascii = not opts.pdf
    peaks = not opts.nopeaks
    N = int(N)

    # A meryl index is given instead of a histogram: build the histogram.
    if histfile.rsplit(".", 1)[-1] in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = meryl([histfile])

    ks = KmerSpectrum(histfile)
    ks.analyze(K=N)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    # --coverage=0 (the default) means "use the main peak found by analyze()"
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1.0 / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        # sys.stderr.write() emits the same text as the Python 2 print
        # statement and, unlike it, also works on Python 3.
        sys.stderr.write(msg + "\n")

    x, y = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} {1}-mer histogram".format(species, N)

    if use_ascii:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, "g-", lw=2, alpha=0.5)
    ax = plt.gca()

    if peaks:
        # Mark the spectrum landmarks that are present in the raw counts.
        t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        tcounts = [(x, y) for x, y in ks.counts if x in t]
        if tcounts:
            x, y = zip(*tcounts)
            tcounts = dict(tcounts)
            plt.plot(x, y, "ko", lw=2, mec="k", mfc="w")
            ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
            ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    messages = [Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg, Repetitive_msg, SNPrate_msg]
    write_messages(ax, messages)

    # Extend the y-range by one sixth above the current upper limit.
    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    ax.set_title(markup(title))
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    set_human_axis(ax)

    imagename = histfile.split(".")[0] + ".pdf"
    savefig(imagename, dpi=100)

    return Genome_size
예제 #24
0
파일: kmer.py 프로젝트: biologyguy/jcvi
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is user-facing output and is kept verbatim. Returns the estimated
    # genome size (total K-mers divided by K-mer coverage, rounded).
    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin",
                 dest="vmin",
                 default=1,
                 type="int",
                 help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax",
                 dest="vmax",
                 default=100,
                 type="int",
                 help="maximum value, inclusive [default: %default]")
    p.add_option("--pdf",
                 default=False,
                 action="store_true",
                 help="Print PDF instead of ASCII plot [default: %default]")
    p.add_option("--coverage",
                 default=0,
                 type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks",
                 default=False,
                 action="store_true",
                 help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    # Named use_ascii rather than `ascii` to avoid shadowing the builtin.
    use_ascii = not opts.pdf
    peaks = not opts.nopeaks
    N = int(N)

    ks = KmerSpectrum(histfile)
    ks.analyze(K=N)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    # --coverage=0 (the default) means "use the main peak found by analyze()"
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1. / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".\
                        format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        # sys.stderr.write() emits the same text as the Python 2 print
        # statement and, unlike it, also works on Python 3.
        sys.stderr.write(msg + "\n")

    x, y = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if use_ascii:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)
    ax = plt.gca()

    if peaks:
        # Mark the spectrum landmarks that are present in the raw counts.
        t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        tcounts = [(x, y) for x, y in ks.counts if x in t]
        # Without this guard zip(*[]) cannot be unpacked into x, y and the
        # plot aborts with ValueError when no landmark is found.
        if tcounts:
            x, y = zip(*tcounts)
            tcounts = dict(tcounts)
            plt.plot(x, y, 'ko', lw=2, mec='k', mfc='w')
            ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
            ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    # Stack the summary lines in the top-right corner of the axes.
    tc = "gray"
    axt = ax.transAxes
    ax.text(.95, .95, Total_Kmers_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .9, Kmer_coverage_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .85, Genome_size_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .8, Repetitive_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .75, SNPrate_msg, color=tc, transform=axt, ha="right")

    # Extend the y-range by one sixth above the current upper limit.
    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    ax.set_title(markup(title), color='r')
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel, color='r')
    ax.set_ylabel(ylabel, color='r')
    set_human_axis(ax)

    imagename = histfile.split(".")[0] + ".pdf"
    savefig(imagename, dpi=100)

    return Genome_size
예제 #25
0
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is user-facing output and is kept verbatim. Returns the estimated
    # genome size (total K-mers divided by K-mer coverage, rounded).
    parser = OptionParser(histogram.__doc__)
    parser.add_option("--vmin", dest="vmin", default=1, type="int",
                      help="minimum value, inclusive")
    parser.add_option("--vmax", dest="vmax", default=100, type="int",
                      help="maximum value, inclusive")
    parser.add_option("--pdf", default=False, action="store_true",
                      help="Print PDF instead of ASCII plot")
    parser.add_option("--coverage", default=0, type="int",
                      help="Kmer coverage [default: auto]")
    parser.add_option("--nopeaks", default=False, action="store_true",
                      help="Do not annotate K-mer peaks")
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    histfile, species, kmer_arg = args
    N = int(kmer_arg)

    # A meryl index is given instead of a histogram: build the histogram.
    extension = histfile.rsplit(".", 1)[-1]
    if extension in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = merylhistogram(histfile)

    ks = KmerSpectrum(histfile)
    ks.analyze(K=N)

    total_kmers = int(ks.totalKmers)
    # --coverage=0 (the default) falls back to the main peak from analyze()
    kmer_coverage = opts.coverage or ks.max2
    genome_size = int(round(total_kmers * 1.0 / kmer_coverage))

    total_kmers_msg = "Total {0}-mers: {1}".format(N, thousands(total_kmers))
    kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, kmer_coverage)
    genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(
        genome_size / 1e6)
    repetitive_msg = ks.repetitive
    snprate_msg = ks.snprate

    print(total_kmers_msg, file=sys.stderr)
    print(kmer_coverage_msg, file=sys.stderr)
    print(genome_size_msg, file=sys.stderr)

    xs, ys = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} {1}-mer histogram".format(species, N)

    # Default output is an ASCII plot; --pdf switches to matplotlib.
    if not opts.pdf:
        asciiplot(xs, ys, title=title)
        return genome_size

    plt.figure(1, (6, 6))
    plt.plot(xs, ys, "g-", lw=2, alpha=0.5)
    ax = plt.gca()

    if not opts.nopeaks:
        # Mark the spectrum landmarks that are present in the raw counts.
        landmarks = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        hits = [(kf, cnt) for kf, cnt in ks.counts if kf in landmarks]
        if hits:
            peak_x, peak_y = zip(*hits)
            count_at = dict(hits)
            plt.plot(peak_x, peak_y, "ko", lw=2, mec="k", mfc="w")
            ax.text(ks.max1, count_at[ks.max1], "SNP peak", va="top")
            ax.text(ks.max2, count_at[ks.max2], "Main peak")

    write_messages(
        ax,
        [
            total_kmers_msg,
            kmer_coverage_msg,
            genome_size_msg,
            repetitive_msg,
            snprate_msg,
        ],
    )

    # Extend the y-range by one sixth above the current upper limit.
    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    ax.set_title(markup(title))
    ax.set_ylim((ymin, ymax))
    ax.set_xlabel("Coverage (X)")
    ax.set_ylabel("Counts")
    set_human_axis(ax)

    savefig(histfile.split(".")[0] + ".pdf", dpi=100)

    return genome_size
예제 #26
0
파일: dotplot.py 프로젝트: radaniba/jcvi
def dotplot(anchorfile,
            qbed,
            sbed,
            fig,
            root,
            ax,
            vmin=0,
            vmax=1,
            is_self=False,
            synteny=False,
            cmap_text=None,
            genomenames=None,
            sample_number=10000,
            minfont=5,
            palette=None,
            chrlw=.01,
            title=None,
            sepcolor="gainsboro"):
    """
    Draw a dot plot of the anchor pairs listed in `anchorfile` onto `ax`.

    Lines of `anchorfile` starting with "#" open a new block; every other
    line must carry the query and subject names in its first two columns.
    The last column is parsed as a number (falling back to `vmax` when it is
    not numeric) and clipped to [vmin, vmax].

    qbed, sbed   -- query / subject BED objects; this function uses their
                    `.order` lookup, `.get_breaks()`, `len()` and `.filename`.
    fig          -- figure handed to TextHandler for font size selection.
    root         -- axes (0-1 coordinates) that receives the chromosome
                    labels, the palette legend and the title.
    ax           -- axes that receives the scatter plot and separators.
    is_self      -- also plot the mirror image of every pair, draw the
                    diagonal and report half of the pairs in the title.
    synteny      -- box the clusters returned by batch_scan().
    cmap_text    -- when set, draw a colormap legend with this text.
    genomenames  -- "xname_yname" axis labels; defaults to the basenames of
                    the two BED files.
    sample_number-- plot at most this many points (random subset).
    minfont      -- chromosome labels with a smaller font size are skipped.
    palette      -- optional object with `.get(block_id, default)` returning
                    a block color and `.colors` mapping category => color.
    chrlw, sepcolor -- line width and color of the chromosome separators.
    title        -- overrides the automatically generated title.
    """
    qorder = qbed.order
    sorder = sbed.order

    if cmap_text:
        logging.debug("Normalize values to [%.1f, %.1f]" % (vmin, vmax))

    data = []
    block_id = 0
    # The block color has to survive from a "#" header line to the data rows
    # that follow it, so it is tracked outside of the per-row scope.
    block_color = palette.get(block_id, "k") if palette else None
    with open(anchorfile) as fp:
        for row in fp:
            if row[0] == "#":
                block_id += 1
                if palette:
                    block_color = palette.get(block_id, "k")
                continue

            atoms = row.split()
            # first two columns are query and subject, and an optional third
            # column
            if len(atoms) < 2:
                continue

            query, subject = atoms[:2]
            value = atoms[-1]

            try:
                value = float(value)
            except ValueError:
                value = vmax

            if value < vmin:
                value = vmin
            if value > vmax:
                value = vmax

            if query not in qorder:
                continue
            if subject not in sorder:
                continue

            qi, q = qorder[query]
            si, s = sorder[subject]

            nv = vmax - value if block_color is None else block_color
            data.append((qi, si, nv))
            if is_self:  # Mirror image
                data.append((si, qi, nv))

    npairs = len(data)
    # Only show random subset
    if npairs > sample_number:
        logging.debug("Showing a random subset of {0} data points (total {1}) " \
                      "for clarity.".format(sample_number, npairs))
        data = sample(data, sample_number)

    # the data are plotted in this order, the least value are plotted
    # last for aesthetics
    if not palette:
        data.sort(key=lambda x: -x[2])

    default_cm = cm.copper

    if not data:
        # zip(*[]) cannot be unpacked into three sequences; draw an empty plot
        logging.debug("No anchor pairs to plot from `{0}`.".format(anchorfile))
    else:
        x, y, c = zip(*data)
        if palette:
            ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0)
        else:
            ax.scatter(x,
                       y,
                       c=c,
                       edgecolors="none",
                       s=2,
                       lw=0,
                       cmap=default_cm,
                       vmin=vmin,
                       vmax=vmax)

    if synteny:
        clusters = batch_scan(data, qbed, sbed)
        draw_box(clusters, ax)

    if cmap_text:
        draw_cmap(root, cmap_text, vmin, vmax, cmap=default_cm, reverse=True)

    xsize, ysize = len(qbed), len(sbed)
    logging.debug("xsize=%d ysize=%d" % (xsize, ysize))
    xlim = (0, xsize)
    ylim = (ysize, 0)  # invert the y-axis

    # Tag to mark whether to plot chr name (skip small ones)
    xchr_labels, ychr_labels = [], []
    th = TextHandler(fig)

    # plot the chromosome breaks
    for (seqid, beg, end) in qbed.get_breaks():
        xsize_ratio = abs(end - beg) * .8 / xsize
        fontsize = th.select_fontsize(xsize_ratio)
        seqid = "".join(seqid_parse(seqid)[:2])

        xchr_labels.append((seqid, (beg + end) / 2, fontsize))
        ax.plot([beg, beg], ylim, "-", lw=chrlw, color=sepcolor)

    for (seqid, beg, end) in sbed.get_breaks():
        ysize_ratio = abs(end - beg) * .8 / ysize
        fontsize = th.select_fontsize(ysize_ratio)
        seqid = "".join(seqid_parse(seqid)[:2])

        ychr_labels.append((seqid, (beg + end) / 2, fontsize))
        ax.plot(xlim, [beg, beg], "-", lw=chrlw, color=sepcolor)

    # plot the chromosome labels
    for label, pos, fontsize in xchr_labels:
        pos = .1 + pos * .8 / xsize
        if fontsize >= minfont:
            root.text(pos,
                      .91,
                      latex(label),
                      size=fontsize,
                      ha="center",
                      va="bottom",
                      rotation=45,
                      color="grey")

    # remember y labels are inverted
    for label, pos, fontsize in ychr_labels:
        pos = .9 - pos * .8 / ysize
        if fontsize >= minfont:
            root.text(.91,
                      pos,
                      latex(label),
                      size=fontsize,
                      va="center",
                      color="grey")

    # create a diagonal to separate mirror image for self comparison
    if is_self:
        ax.plot(xlim, (0, ysize), 'm-', alpha=.5, lw=2)

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # add genome names
    if genomenames:
        gx, gy = genomenames.split("_")
    else:
        to_ax_label = lambda fname: op.basename(fname).split(".")[0]
        gx, gy = [to_ax_label(x.filename) for x in (qbed, sbed)]
    ax.set_xlabel(gx, size=16)
    ax.set_ylabel(gy, size=16)

    # beautify the numeric axis
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)

    set_human_axis(ax)

    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(),
             color='gray',
             size=10)

    if palette:  # bottom-left has the palette, if available
        colors = palette.colors
        xstart, ystart = .1, .05
        for category, c in sorted(colors.items()):
            root.add_patch(Rectangle((xstart, ystart), .03, .02, lw=0, fc=c))
            root.text(xstart + .04, ystart, category, color=c)
            xstart += .1

    if not title:
        title = "Inter-genomic comparison: {0} vs {1}".format(gx, gy)
        if is_self:
            title = "Intra-genomic comparison within {0}".format(gx)
            # every pair was stored twice (mirror image); keep an integer
            # count regardless of the interpreter's division semantics
            npairs //= 2
        title += " ({0} gene pairs)".format(thousands(npairs))
    root.set_title(title, x=.5, y=.96, color="k")
    logging.debug(title)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()
예제 #27
0
파일: vcf.py 프로젝트: radaniba/jcvi
def summary(args):
    """
    %prog summary txtfile fastafile

    The txtfile can be generated by: %prog mstmap --noheader --freq=0

    Tabulate on all possible combinations of genotypes and provide results
    in a nicely-formatted table. Give a fastafile for SNP rate (average
    # of SNPs per Kb).

    Only three-column file is supported:
    locus_id    intra- genotype    inter- genotype
    """
    # NOTE: the docstring above doubles as the command line usage text, so it
    # must stay in sync with the options defined below.
    from jcvi.utils.cbook import thousands
    from jcvi.utils.table import tabulate

    p = OptionParser(summary.__doc__)
    p.add_option("--counts",
                 help="Print SNP counts in a txt file [default: %default]")
    p.add_option("--bed",
                 help="Print SNPs locations in a bed file [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    txtfile, fastafile = args

    snps = defaultdict(list)  # contig => list of loci
    combinations = defaultdict(int)
    intraSNPs = interSNPs = 0
    distinctSet = set()  # set of genes that show A-B pattern
    snpcounts, goodsnpcounts = defaultdict(int), defaultdict(int)

    # File output goes through .write() rather than the Python 2-only
    # `print >> handle` statement so the function runs on either interpreter.
    bedfw = open(opts.bed, "w") if opts.bed else None
    try:
        with open(txtfile) as fp:
            header = next(fp).split()  # Header
            ref, alt = header[1:3]
            for row in fp:
                atoms = row.split()
                assert len(atoms) == 3, \
                        "Only three-column file is supported"
                locus, intra, inter = atoms
                # locus is formatted as <contig>.<position>
                ctg, pos = locus.rsplit(".", 1)
                pos = int(pos)
                snps[ctg].append(pos)
                snpcounts[ctg] += 1

                if intra == 'X':
                    intraSNPs += 1
                if inter in ('B', 'X'):
                    interSNPs += 1
                if intra == 'A' and inter == 'B':
                    distinctSet.add(ctg)
                    goodsnpcounts[ctg] += 1
                # Tabulate all possible combinations
                intra = ref + "-" + intra
                inter = alt + "-" + inter
                combinations[(intra, inter)] += 1

                if bedfw:
                    bedfw.write("\t".join(str(x) for x in \
                                (ctg, pos - 1, pos, locus)) + "\n")
    finally:
        # close the bed file even when the input turns out to be malformed
        if bedfw:
            bedfw.close()

    if bedfw:
        logging.debug("SNP locations written to `{0}`.".format(opts.bed))

    nsites = sum(len(x) for x in snps.values())
    sizes = Sizes(fastafile)
    bpsize = sizes.totalsize

    def snprate(a):
        # SNPs per Kb; an empty FASTA would otherwise divide by zero
        return a * 1000. / bpsize if bpsize else 0.

    m = "Dataset `{0}` contains {1} contigs ({2} bp).\n".\
                format(fastafile, len(sizes), thousands(bpsize))
    m += "A total of {0} SNPs within {1} contigs ({2} bp).\n".\
                format(nsites, len(snps),
                       thousands(sum(sizes.mapping[x] for x in snps)))
    m += "SNP rate: {0:.1f}/Kb, ".format(snprate(nsites))
    m += "IntraSNPs: {0} ({1:.1f}/Kb), InterSNPs: {2} ({3:.1f}/Kb)".\
                format(intraSNPs, snprate(intraSNPs), interSNPs, snprate(interSNPs))
    sys.stderr.write(m + "\n")
    sys.stderr.write("{0}\n".format(tabulate(combinations)))

    leg = "Legend: A - homozygous same, B - homozygous different, X - heterozygous"
    sys.stderr.write(leg + "\n")

    tag = (ref + "-A", alt + "-B")
    distinctSNPs = combinations[tag]
    tag = str(tag).replace("'", "")
    sys.stderr.write("A total of {0} disparate {1} SNPs in {2} contigs.\n".\
                format(distinctSNPs, tag, len(distinctSet)))

    if not opts.counts:
        return

    # Sanity checks on the tallies before anything is written to disk
    assert sum(snpcounts.values()) == nsites
    assert sum(goodsnpcounts.values()) == distinctSNPs

    snpcountsfile = opts.counts
    header = "\t".join(("Contig", "#_SNPs", "#_AB_SNP"))
    with open(snpcountsfile, "w") as fw:
        fw.write(header + "\n")
        for ctg in sorted(snps):
            snpcount = snpcounts[ctg]
            goodsnpcount = goodsnpcounts[ctg]
            fw.write("\t".join(str(x) for x in \
                     (ctg, snpcount, goodsnpcount)) + "\n")

    logging.debug("SNP counts per contig is written to `{0}`.".\
                  format(snpcountsfile))
예제 #28
0
파일: kmer.py 프로젝트: arvin580/jcvi
    def analyze(self, ploidy=2, K=23, covmax=1000000):
        """
        Analyze Kmer spectrum, calculations derived from
        allpathslg/src/kmers/KmerSpectra.cc

        Reads `self.data`, a re-iterable collection of
        (kmer frequency, number of distinct kmers) pairs, and
        `self.totalKmers`. Frequencies above `covmax` are ignored.

        Sets `self.min1`, `self.max1`, `self.min2`, `self.max2`, `self.min3`
        (the frequencies delimiting the regions of the spectrum),
        `self.repetitive` and `self.snprate` (message strings) and
        `self.genomesize`, and writes a report to stderr. The SNP rate is
        only computed when `ploidy` is 2. `K` is the kmer size.

        All frequencies are used as list indices, so they are computed with
        floor division (`//`) to stay integers on Python 2 and Python 3.
        """
        from math import sqrt

        data = self.data
        kf_ceil = max(kf for (kf, c) in data)
        if kf_ceil > covmax:
            exceeds = sum(1 for (kf, c) in data if kf > covmax)
            logging.debug("A total of {0} distinct K-mers appear > {1} times. Ignored ...".format(exceeds, covmax))
            kf_ceil = covmax

        nkf = kf_ceil + 1
        a = [0] * nkf
        for kf, c in data:
            if kf > kf_ceil:
                continue
            a[kf] = c

        ndk = a  # number of distinct kmers
        nk = [k * c for k, c in enumerate(a)]  # number of kmers
        cndk = [0] * nkf  # cumulative number of distinct kmers
        cnk = [0] * nkf  # cumulative number of kmers
        # trapezoidal accumulation over the frequency axis
        for kf in range(1, nkf):
            cndk[kf] = cndk[kf - 1] + 0.5 * (ndk[kf - 1] + ndk[kf])
            cnk[kf] = cnk[kf - 1] + 0.5 * (nk[kf - 1] + nk[kf])

        # Separate kmer spectrum in 5 regions based on the kf
        # 1        ... kf_min1    : bad kmers with low frequency
        # kf_min1  ... kf_min2    : good kmers CN = 1/2 (SNPs)
        # kf_min2  ... kf_min3    : good kmers CN = 1
        # kf_min3  ... kf_hi      : good kmers CN > 1 (repetitive)
        # kf_hi    ... inf        : bad kmers with high frequency

        # min1: find first minimum, starting the search at kf = 10 (or at the
        # last available frequency for very short spectra)
        _kf_min1 = min(10, kf_ceil)
        while _kf_min1 - 1 >= 2 and nk[_kf_min1 - 1] < nk[_kf_min1]:
            _kf_min1 -= 1
        # strict bound: nk[_kf_min1 + 1] must remain a valid index
        while _kf_min1 < kf_ceil and nk[_kf_min1 + 1] < nk[_kf_min1]:
            _kf_min1 += 1

        # max2: find absolute maximum mx2 above first minimum min1
        _kf_max2 = _kf_min1
        for kf in range(_kf_min1 + 1, int(0.8 * kf_ceil)):
            if nk[kf] > nk[_kf_max2]:
                _kf_max2 = kf

        # max2: resetting max2 for cases of very high polymorphism
        if ploidy == 2:
            ndk_half = ndk[_kf_max2 // 2]
            # nothing was observed beyond the end of the spectrum
            ndk_double = ndk[_kf_max2 * 2] if _kf_max2 * 2 < nkf else 0
            if ndk_double > ndk_half:
                _kf_max2 *= 2

        # max1: SNPs local maximum max1 as half global maximum max2
        _kf_max1 = _kf_max2 // 2

        # min2: SNPs local minimum min2 between max1 and max2
        _kf_min2 = _kf_max1 * (2 * ndk[_kf_max1] + ndk[_kf_max2]) // (ndk[_kf_max1] + ndk[_kf_max2])

        # min1: refine between min1 and max2/2
        for kf in range(_kf_min1 + 1, _kf_max1):
            if nk[kf] < nk[_kf_min1]:
                _kf_min1 = kf

        # min3: not a minimum, really. upper edge of main peak, capped at the
        # last frequency so that it can index the cumulative arrays
        _kf_min3 = min(_kf_max2 * 3 // 2, kf_ceil)

        sys.stderr.write(
            "kfs: {0} {1} {2} {3} {4}\n".format(_kf_min1, _kf_max1, _kf_min2, _kf_max2, _kf_min3)
        )
        self.min1 = _kf_min1
        self.max1 = _kf_max1
        self.min2 = _kf_min2
        self.max2 = _kf_max2
        self.min3 = _kf_min3

        # Define maximum kf above which we neglect data
        _kf_hi = (
            _kf_max2 * sqrt(4 * ndk[2 * _kf_max2] * _kf_max2)
            if 2 * _kf_max2 < len(ndk)
            else _kf_max2 * sqrt(4 * ndk[len(ndk) - 1] * _kf_max2)
        )
        _kf_hi = int(_kf_hi)

        if _kf_hi > kf_ceil:
            _kf_hi = kf_ceil

        _nk_total = cnk[len(cnk) - 1]
        _nk_bad_low_kf = cnk[_kf_min1]
        _nk_good_uniq = cnk[_kf_min3] - cnk[_kf_min2]
        _nk_bad_high_kf = _nk_total - cnk[_kf_hi]
        _ndk_good_snp = cndk[_kf_min2] - cndk[_kf_min1]
        _ndk_good_uniq = cndk[_kf_min3] - cndk[_kf_min2]

        # kmer coverage C_k
        _kf_ave_uniq = _nk_good_uniq * 1.0 / _ndk_good_uniq
        _genome_size = (_nk_total - _nk_bad_low_kf - _nk_bad_high_kf) / _kf_ave_uniq
        _genome_size_unique = _ndk_good_uniq + _ndk_good_snp / 2
        _genome_size_repetitive = _genome_size - _genome_size_unique
        _coverage = _nk_total / _genome_size if _genome_size else 0

        # SNP rate estimation, assumes uniform distribution of SNPs over the
        # genome and accounts for the reduction in SNP kmer counts when
        # polymorphism is very high
        if ploidy == 2:
            _d_SNP = (
                1.0 / (1.0 - (1.0 - 0.5 * _ndk_good_snp / _genome_size) ** (1.0 / K)) if _ndk_good_snp > 0 else 1000000
            )

        G = int(_genome_size)
        G1 = int(_genome_size_unique)
        GR = int(_genome_size_repetitive)
        coverage = int(_coverage)

        m = "Kmer (K={0}) Spectrum Analysis\n".format(K)
        m += "Genome size estimate = {0}\n".format(thousands(G))
        m += "Genome size estimate CN = 1 = {0} ({1})\n".format(thousands(G1), percentage(G1, G))
        m += "Genome size estimate CN > 1 = {0} ({1})\n".format(thousands(GR), percentage(GR, G))
        m += "Coverage estimate: {0} x\n".format(coverage)
        # whole percent; guarded because G is truncated and may be 0
        self.repetitive = "Repeats: {0} percent".format(GR * 100 // G if G else 0)

        if ploidy == 2:
            d_SNP = int(_d_SNP)
            self.snprate = "SNP rate ~= 1/{0}".format(d_SNP)
        else:
            self.snprate = "SNP rate not computed (Ploidy = {0})".format(ploidy)
        m += self.snprate + "\n"

        self.genomesize = int(round(self.totalKmers * 1.0 / self.max2))

        sys.stderr.write(m + "\n")
예제 #29
0
def coverage(args):
    """
    %prog coverage fastafile ctg bedfile1 bedfile2 ..

    Plot coverage from a set of BED files that contain the read mappings. The
    paired read span will be converted to a new bedfile that contain the happy
    mates. ctg is the chr/scf/ctg that you want to plot the histogram on.

    If the bedfiles already contain the clone spans, turn on --spans.
    """
    # NOTE: the docstring above doubles as the command line usage text.
    from jcvi.formats.bed import mates, bedpe

    p = OptionParser(coverage.__doc__)
    p.add_option("--ymax", default=None, type="int",
                 help="Limit ymax [default: %default]")
    p.add_option("--spans", default=False, action="store_true",
                 help="BED files already contain clone spans [default: %default]")
    opts, args, iopts = p.set_image_options(args, figsize="8x5")

    if len(args) < 3:
        sys.exit(not p.print_help())

    fastafile, ctg = args[0:2]
    bedfiles = args[2:]

    sizes = Sizes(fastafile)
    size = sizes.mapping[ctg]

    plt.figure(1, (iopts.w, iopts.h))
    ax = plt.gca()

    bins = 100  # smooth the curve
    lines = []
    legends = []
    yy = .9
    # Colors are reused in a cycle so that every BED file gets plotted, even
    # when more files than colors are given.
    colors = "rgbcky"
    for i, bedfile in enumerate(bedfiles):
        c = colors[i % len(colors)]
        if not opts.spans:
            # derive (and build if out of date) the mates and clone span files
            pf = bedfile.rsplit(".", 1)[0]
            matesfile = pf + ".mates"
            if need_update(bedfile, matesfile):
                matesfile, matesbedfile = mates([bedfile, "--lib"])

            bedspanfile = pf + ".spans.bed"
            if need_update(matesfile, bedspanfile):
                bedpefile, bedspanfile = bedpe([bedfile, "--span",
                    "--mates={0}".format(matesfile)])
            bedfile = bedspanfile

        bedsum = Bed(bedfile).sum(seqid=ctg)
        notcoveredbases = size - bedsum

        legend = bedfile.split(".")[0]
        msg = "{0}: {1} bp not covered".format(legend, thousands(notcoveredbases))
        # .write() instead of the Python 2-only `print >> sys.stderr`
        sys.stderr.write(msg + "\n")
        ax.text(.1, yy, msg, color=c, size=9, transform=ax.transAxes)
        yy -= .08

        cov = Coverage(bedfile, sizes.filename)
        x, y = cov.get_plot_data(ctg, bins=bins)
        line, = ax.plot(x, y, '-', color=c, lw=2, alpha=.5)
        lines.append(line)
        legends.append(legend)

    leg = ax.legend(lines, legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(.5)

    # floor division keeps the bin size a whole number of Kb in the label
    ylabel = "Average depth per {0}Kb".format(size // bins // 1000)
    ax.set_xlim(0, size)
    ax.set_ylim(0, opts.ymax)
    ax.set_xlabel(ctg)
    ax.set_ylabel(ylabel)
    set_human_base_axis(ax)

    figname ="{0}.{1}.pdf".format(fastafile, ctg)
    savefig(figname, dpi=iopts.dpi, iopts=iopts)