Exemplo n.º 1
0
def ld(args):
    """
    %prog ld map

    Calculate pairwise linkage disequilibrium given MSTmap.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is runtime output and kept verbatim; developer notes go here.
    #
    # Steps:
    #   1. Load the MSTmap file and optionally take a random, order-preserving
    #      subsample of its markers (--subsample, default 1000).
    #   2. Write the marker set to `<map>.subsample.bed` and the pairwise score
    #      matrix to `<map>.subsample.matrix`, or reload both if
    #      need_update() reports nothing to do.
    #   3. Draw the matrix as a heatmap with chromosome break lines and labels
    #      and save it as `<prefix>.subsample.<format>`.
    import numpy as np
    from random import sample

    from jcvi.algorithms.matrix import symmetrize

    p = OptionParser(ld.__doc__)
    p.add_option("--subsample", default=1000, type="int",
                 help="Subsample markers to speed up [default: %default]")
    opts, args, iopts = p.set_image_options(args, figsize="8x8")

    if len(args) != 1:
        sys.exit(not p.print_help())

    mstmap, = args
    subsample = opts.subsample
    data = MSTMap(mstmap)

    markerbedfile = mstmap + ".subsample.bed"
    ldmatrix = mstmap + ".subsample.matrix"
    # Take random subsample while keeping marker order
    if subsample < data.nmarkers:
        # range() rather than the Python-2-only xrange(): it is a valid
        # population for random.sample() on both Python 2 and Python 3.
        picked = sorted(sample(range(len(data)), subsample))
        data = [data[x] for x in picked]
    else:
        logging.debug("Use all markers, --subsample ignored")

    nmarkers = len(data)
    if need_update(mstmap, (ldmatrix, markerbedfile)):
        # Context manager guarantees the handle is closed even if building or
        # writing the BED lines raises.
        with open(markerbedfile, "w") as fw:
            print("\n".join(x.bedline for x in data), file=fw)
        logging.debug("Write marker set of size {0} to file `{1}`."\
                        .format(nmarkers, markerbedfile))

        # Only the i < j cells are computed; symmetrize() is then applied to
        # the result (presumably mirroring it across the diagonal -- confirm
        # against jcvi.algorithms.matrix).
        M = np.zeros((nmarkers, nmarkers), dtype=float)
        for i, j in combinations(range(nmarkers), 2):
            a = data[i]
            b = data[j]
            M[i, j] = calc_ldscore(a.genotype, b.genotype)

        M = symmetrize(M)

        logging.debug("Write LD matrix to file `{0}`.".format(ldmatrix))
        M.tofile(ldmatrix)
    else:
        # The cached files describe an earlier subsample, so the matrix
        # dimension is taken from the cached BED file, not from this run's
        # freshly drawn `data`.
        nmarkers = len(Bed(markerbedfile))
        M = np.fromfile(ldmatrix, dtype="float").reshape(nmarkers, nmarkers)
        logging.debug("LD matrix `{0}` exists ({1}x{1})."\
                        .format(ldmatrix, nmarkers))

    from jcvi.graphics.base import plt, savefig, Rectangle, draw_cmap

    plt.rcParams["axes.linewidth"] = 0

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])  # the whole canvas
    ax = fig.add_axes([.1, .1, .8, .8])  # the heatmap

    ax.matshow(M, cmap=iopts.cmap)

    # Plot chromosomes breaks
    bed = Bed(markerbedfile)
    xsize = len(bed)
    extent = (0, nmarkers)
    chr_labels = []
    ignore_size = 20  # breaks spanning fewer than this get no line or label

    for (seqid, beg, end) in bed.get_breaks():
        ignore = abs(end - beg) < ignore_size
        pos = (beg + end) / 2
        chr_labels.append((seqid, pos, ignore))
        if ignore:
            continue
        # White separator lines at the end of each drawn chromosome
        ax.plot((end, end), extent, "w-", lw=1)
        ax.plot(extent, (end, end), "w-", lw=1)

    # Plot chromosome labels
    for label, pos, ignore in chr_labels:
        # Map the data coordinate into canvas coordinates: the heatmap axes
        # start at .1 and span .8 of the canvas.
        pos = .1 + pos * .8 / xsize
        if not ignore:
            root.text(pos, .91, label,
                ha="center", va="bottom", rotation=45, color="grey")
            root.text(.09, pos, label,
                ha="right", va="center", color="grey")

    ax.set_xlim(extent)
    ax.set_ylim(extent)
    ax.set_axis_off()

    draw_cmap(root, "Pairwise LD (r2)", 0, 1, cmap=iopts.cmap)

    root.add_patch(Rectangle((.1, .1), .8, .8, fill=False, ec="k", lw=2))
    # Prefix of the map path up to its first "." names the plot and the image
    m = mstmap.split(".")[0]
    root.text(.5, .06, "Linkage Disequilibrium between {0} markers".format(m), ha="center")

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    image_name = m + ".subsample" + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
Exemplo n.º 2
0
def ld(args):
    """
    %prog ld map

    Calculate pairwise linkage disequilibrium given MSTmap.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is runtime output and kept verbatim; developer notes go here.
    #
    # Steps:
    #   1. Load the MSTmap file and optionally take a random, order-preserving
    #      subsample of its markers (--subsample, default 1000).
    #   2. Write the marker set to `<map>.subsample.bed` and the pairwise score
    #      matrix to `<map>.subsample.matrix`, or reload both if
    #      need_update() reports nothing to do.
    #   3. Draw the matrix as a heatmap with chromosome break lines and labels
    #      and save it as `<prefix>.subsample.<format>`.
    import numpy as np
    from random import sample

    from jcvi.algorithms.matrix import symmetrize

    p = OptionParser(ld.__doc__)
    p.add_option("--subsample",
                 default=1000,
                 type="int",
                 help="Subsample markers to speed up [default: %default]")
    opts, args, iopts = p.set_image_options(args, figsize="8x8")

    if len(args) != 1:
        sys.exit(not p.print_help())

    mstmap, = args
    subsample = opts.subsample
    data = MSTMap(mstmap)

    markerbedfile = mstmap + ".subsample.bed"
    ldmatrix = mstmap + ".subsample.matrix"
    # Take random subsample while keeping marker order
    if subsample < data.nmarkers:
        # range() rather than the Python-2-only xrange(): it is a valid
        # population for random.sample() on both Python 2 and Python 3.
        kept_indices = sorted(sample(range(len(data)), subsample))
        data = [data[x] for x in kept_indices]
    else:
        logging.debug("Use all markers, --subsample ignored")

    nmarkers = len(data)
    if need_update(mstmap, (ldmatrix, markerbedfile)):
        # Context manager guarantees the handle is closed even if building or
        # writing the BED lines raises.
        with open(markerbedfile, "w") as fw:
            print("\n".join(x.bedline for x in data), file=fw)
        logging.debug("Write marker set of size {0} to file `{1}`."\
                        .format(nmarkers, markerbedfile))

        # Only the i < j cells are computed; symmetrize() is then applied to
        # the result (presumably mirroring it across the diagonal -- confirm
        # against jcvi.algorithms.matrix).
        M = np.zeros((nmarkers, nmarkers), dtype=float)
        for i, j in combinations(range(nmarkers), 2):
            a = data[i]
            b = data[j]
            M[i, j] = calc_ldscore(a.genotype, b.genotype)

        M = symmetrize(M)

        logging.debug("Write LD matrix to file `{0}`.".format(ldmatrix))
        M.tofile(ldmatrix)
    else:
        # The cached files describe an earlier subsample, so the matrix
        # dimension is taken from the cached BED file, not from this run's
        # freshly drawn `data`.
        nmarkers = len(Bed(markerbedfile))
        M = np.fromfile(ldmatrix, dtype="float").reshape(nmarkers, nmarkers)
        logging.debug("LD matrix `{0}` exists ({1}x{1})."\
                        .format(ldmatrix, nmarkers))

    from jcvi.graphics.base import plt, savefig, Rectangle, draw_cmap

    plt.rcParams["axes.linewidth"] = 0

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])  # the whole canvas
    ax = fig.add_axes([.1, .1, .8, .8])  # the heatmap

    ax.matshow(M, cmap=iopts.cmap)

    # Plot chromosomes breaks
    bed = Bed(markerbedfile)
    xsize = len(bed)
    extent = (0, nmarkers)
    chr_labels = []
    ignore_size = 20  # breaks spanning fewer than this get no line or label

    for (seqid, beg, end) in bed.get_breaks():
        ignore = abs(end - beg) < ignore_size
        pos = (beg + end) / 2
        chr_labels.append((seqid, pos, ignore))
        if ignore:
            continue
        # White separator lines at the end of each drawn chromosome
        ax.plot((end, end), extent, "w-", lw=1)
        ax.plot(extent, (end, end), "w-", lw=1)

    # Plot chromosome labels
    for label, pos, ignore in chr_labels:
        # Map the data coordinate into canvas coordinates: the heatmap axes
        # start at .1 and span .8 of the canvas.
        pos = .1 + pos * .8 / xsize
        if not ignore:
            root.text(pos,
                      .91,
                      label,
                      ha="center",
                      va="bottom",
                      rotation=45,
                      color="grey")
            root.text(.09, pos, label, ha="right", va="center", color="grey")

    ax.set_xlim(extent)
    ax.set_ylim(extent)
    ax.set_axis_off()

    draw_cmap(root, "Pairwise LD (r2)", 0, 1, cmap=iopts.cmap)

    root.add_patch(Rectangle((.1, .1), .8, .8, fill=False, ec="k", lw=2))
    # Prefix of the map path up to its first "." names the plot and the image
    m = mstmap.split(".")[0]
    root.text(.5,
              .06,
              "Linkage Disequilibrium between {0} markers".format(m),
              ha="center")

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    image_name = m + ".subsample" + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
Exemplo n.º 3
0
def dotplot(args):
    """
    %prog dotplot map.csv ref.fasta

    Make dotplot between chromosomes and linkage maps.
    The input map is csv formatted, for example:

    ScaffoldID,ScaffoldPosition,LinkageGroup,GeneticPosition
    scaffold_2707,11508,1,0
    scaffold_2707,11525,1,1.2
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is runtime output and kept verbatim; developer notes go here.
    #
    # Each map row becomes one dot: X is the marker position offset by the
    # cumulative size of the preceding sequences, Y is the genetic position
    # offset by the cumulative size of the preceding linkage groups.
    #
    # Raises ValueError when the map has no rows, or when none of its markers
    # sits on a sequence present in the reference.
    from jcvi.assembly.allmaps import CSVMapLine
    from jcvi.formats.sizes import Sizes
    from jcvi.utils.natsort import natsorted
    from jcvi.graphics.base import shorten
    from jcvi.graphics.dotplot import plt, savefig, markup, normalize_axes, \
                    downsample, plot_breaks_and_labels, thousands

    p = OptionParser(dotplot.__doc__)
    p.set_outfile(outfile=None)
    opts, args, iopts = p.set_image_options(args, figsize="8x8",
                                            style="dark", dpi=90, cmap="copper")

    if len(args) != 2:
        sys.exit(not p.print_help())

    csvfile, fastafile = args
    sizes = natsorted(Sizes(fastafile).mapping.items())

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])  # the whole canvas
    ax = fig.add_axes([.1, .1, .8, .8])  # the dot plot

    fp = must_open(csvfile)
    raw_data = [CSVMapLine(row) for row in fp]
    if not raw_data:
        # Fail with a readable message instead of the tuple-unpacking
        # ValueError that zip(*...) would produce further down.
        raise ValueError("No map entries found in `{}`".format(csvfile))

    # X-axis is the genome assembly
    ctgs, ctg_sizes = zip(*sizes)
    xsize = sum(ctg_sizes)
    qb = list(np.cumsum(ctg_sizes))
    # (name, start, end) per sequence, in cumulative coordinates
    qbreaks = list(zip(ctgs, [0] + qb, qb))
    qstarts = dict(zip(ctgs, [0] + qb))

    # Y-axis is the map
    def lg_of(entry):
        return entry.lg

    # groupby() only merges adjacent items, hence the sort on the same key
    raw_data.sort(key=lg_of)
    ssizes = {}
    for lg, members in groupby(raw_data, key=lg_of):
        # The size of a linkage group is its largest genetic position
        ssizes[lg] = max(x.cm for x in members)
    ssizes = natsorted(ssizes.items())
    lgs, lg_sizes = zip(*ssizes)
    ysize = sum(lg_sizes)
    sb = list(np.cumsum(lg_sizes))
    sbreaks = list(zip([("LG" + x) for x in lgs], [0] + sb, sb))
    sstarts = dict(zip(lgs, [0] + sb))

    # Re-code all the scatter dots; markers on sequences that are absent from
    # the reference are dropped.
    data = [(qstarts[x.seqid] + x.pos, sstarts[x.lg] + x.cm, 'g') \
                for x in raw_data if (x.seqid in qstarts)]
    if not data:
        raise ValueError("No markers in `{}` fall on sequences in `{}`"
                         .format(csvfile, fastafile))
    npairs = downsample(data)

    x, y, c = zip(*data)
    ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0)

    # Flip X-Y label: the first two dot-separated fields of the csv file name
    # supply the Y and X axis names, in that order.
    gy, gx = op.basename(csvfile).split(".")[:2]
    gx, gy = shorten(gx, maxchar=30), shorten(gy, maxchar=30)
    xlim, ylim = plot_breaks_and_labels(fig, root, ax, gx, gy,
                                xsize, ysize, qbreaks, sbreaks)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    title = "Alignment: {} vs {}".format(gx, gy)
    title += " ({} markers)".format(thousands(npairs))
    root.set_title(markup(title), x=.5, y=.96, color="k")
    logging.debug(title)
    normalize_axes(root)

    # An explicit --outfile wins; otherwise swap the csv extension for the
    # image format.
    image_name = opts.outfile or \
                (csvfile.rsplit(".", 1)[0] + "." + iopts.format)
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    fig.clear()
Exemplo n.º 4
0
def dotplot(args):
    """
    %prog dotplot map.csv ref.fasta

    Make dotplot between chromosomes and linkage maps.
    The input map is csv formatted, for example:

    ScaffoldID,ScaffoldPosition,LinkageGroup,GeneticPosition
    scaffold_2707,11508,1,0
    scaffold_2707,11525,1,1.2
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is runtime output and kept verbatim; developer notes go here.
    #
    # Each map row becomes one dot: X is the marker position offset by the
    # cumulative size of the preceding sequences, Y is the genetic position
    # offset by the cumulative size of the preceding linkage groups.
    #
    # Raises ValueError when the map has no rows, or when none of its markers
    # sits on a sequence present in the reference.
    from jcvi.assembly.allmaps import CSVMapLine
    from jcvi.formats.sizes import Sizes
    from jcvi.utils.natsort import natsorted
    from jcvi.graphics.base import shorten
    from jcvi.graphics.dotplot import plt, savefig, markup, normalize_axes, \
                    downsample, plot_breaks_and_labels, thousands

    p = OptionParser(dotplot.__doc__)
    p.set_outfile(outfile=None)
    opts, args, iopts = p.set_image_options(args,
                                            figsize="8x8",
                                            style="dark",
                                            dpi=90,
                                            cmap="copper")

    if len(args) != 2:
        sys.exit(not p.print_help())

    csvfile, fastafile = args
    sizes = natsorted(Sizes(fastafile).mapping.items())

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])  # the whole canvas
    ax = fig.add_axes([.1, .1, .8, .8])  # the dot plot

    fp = must_open(csvfile)
    raw_data = [CSVMapLine(row) for row in fp]
    if not raw_data:
        # Fail with a readable message instead of the tuple-unpacking
        # ValueError that zip(*...) would produce further down.
        raise ValueError("No map entries found in `{}`".format(csvfile))

    # X-axis is the genome assembly
    ctgs, ctg_sizes = zip(*sizes)
    xsize = sum(ctg_sizes)
    qb = list(np.cumsum(ctg_sizes))
    # (name, start, end) per sequence, in cumulative coordinates
    qbreaks = list(zip(ctgs, [0] + qb, qb))
    qstarts = dict(zip(ctgs, [0] + qb))

    # Y-axis is the map
    def lg_of(entry):
        return entry.lg

    # groupby() only merges adjacent items, hence the sort on the same key
    raw_data.sort(key=lg_of)
    ssizes = {}
    for lg, members in groupby(raw_data, key=lg_of):
        # The size of a linkage group is its largest genetic position
        ssizes[lg] = max(x.cm for x in members)
    ssizes = natsorted(ssizes.items())
    lgs, lg_sizes = zip(*ssizes)
    ysize = sum(lg_sizes)
    sb = list(np.cumsum(lg_sizes))
    sbreaks = list(zip([("LG" + x) for x in lgs], [0] + sb, sb))
    sstarts = dict(zip(lgs, [0] + sb))

    # Re-code all the scatter dots; markers on sequences that are absent from
    # the reference are dropped.
    data = [(qstarts[x.seqid] + x.pos, sstarts[x.lg] + x.cm, 'g') \
                for x in raw_data if (x.seqid in qstarts)]
    if not data:
        raise ValueError("No markers in `{}` fall on sequences in `{}`".format(
            csvfile, fastafile))
    npairs = downsample(data)

    x, y, c = zip(*data)
    ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0)

    # Flip X-Y label: the first two dot-separated fields of the csv file name
    # supply the Y and X axis names, in that order.
    gy, gx = op.basename(csvfile).split(".")[:2]
    gx, gy = shorten(gx, maxchar=30), shorten(gy, maxchar=30)
    xlim, ylim = plot_breaks_and_labels(fig, root, ax, gx, gy, xsize, ysize,
                                        qbreaks, sbreaks)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    title = "Alignment: {} vs {}".format(gx, gy)
    title += " ({} markers)".format(thousands(npairs))
    root.set_title(markup(title), x=.5, y=.96, color="k")
    logging.debug(title)
    normalize_axes(root)

    # An explicit --outfile wins; otherwise swap the csv extension for the
    # image format.
    image_name = opts.outfile or \
                (csvfile.rsplit(".", 1)[0] + "." + iopts.format)
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    fig.clear()