Пример #1
0
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_filename):
        """Write the distinct (chromosome, position) pairs of a VCF-style file.

        Reads the file named by ``in_data.identifier`` and writes one
        line per distinct pair to ``out_filename``, formatted as
        <chrom><TAB><pos>, in order of first appearance.  If the input
        file is empty, an empty output file is created instead.

        network, out_attributes, user_options, and num_cores are not
        used by this method.

        Raises AssertionError if the first two headers of the input are
        not "#CHROM" and "POS".
        """
        import os

        # If the file is empty, then just create an empty positions file.
        if os.path.getsize(in_data.identifier) == 0:
            # Use a context manager so the handle is closed right away
            # instead of whenever the garbage collector gets to it.
            with open(out_filename, 'w'):
                pass
            return

        # Imported only once there is something to parse, so the
        # empty-file path does not depend on the parser.
        from genomicode import AnnotationMatrix

        M = AnnotationMatrix.read(in_data.identifier, header_char="##")

        # Headers are:
        # #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT [Samples...]
        # Pull out the #CHROM and POS columns.
        assert M.num_headers()
        assert M.headers[0] == "#CHROM"
        assert M.headers[1] == "POS"
        chrom_annots = M["#CHROM"]
        pos_annots = M["POS"]

        lines = []
        seen = set()
        for chrom, pos in zip(chrom_annots, pos_annots):
            key = chrom.strip(), pos.strip()
            # Keep only the first occurrence of each position.
            if key in seen:
                continue
            seen.add(key)
            lines.append("\t".join(key) + "\n")
        with open(out_filename, 'w') as handle:
            handle.writelines(lines)
Пример #2
0
def main():
    """Plot a histogram of one column of a tab-delimited file using R.

    Command-line entry point.  Reads ``datafile``, converts the values
    under ``header`` to floats, and draws them with R's hist() into
    ``plot_file`` (a PDF if the name ends with ".pdf", otherwise a PNG).
    If --prism_file is given, the histogram object returned by R is
    passed to write_prism_file.

    Invalid option combinations or out-of-range values raise
    AssertionError; a missing data file is reported via parser.error.
    """
    import os
    import argparse

    from genomicode import jmath
    from genomicode import AnnotationMatrix

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("datafile", help="Tab-delimited data file.")
    parser.add_argument("header", help="Which column contains data to plot.")
    parser.add_argument(
        "plot_file", help="Name of image file, e.g. outfile.png.  "
        "Will generate PNG format by default.  If this file name ends with "
        ".pdf, will generate a PDF file instead.")
    parser.add_argument(
        "--prism_file", help="Write Prism-formatted results to this file.")
    parser.add_argument(
        "--ignore_missing_values", action="store_true",
        help="Ignore missing values in the file.")

    group = parser.add_argument_group(title="Calculations")
    group.add_argument(
        "--breaks_seq",
        help="Set the breakpoints.  Format: <start>,<stop>,<skip>.")
    group.add_argument(
        "--num_breaks", type=int, help="Number of breakpoints.")
    group.add_argument(
        "--ymax", type=int,
        help="Set the maximum value for the Y axis.")

    group = parser.add_argument_group(title="Plot Labels")
    group.add_argument("--title", help="Put a title on the plot.")
    group.add_argument("--xlab", help="Label the X-axis.")
    group.add_argument(
        "--xlabel_size", default=1.0, type=float,
        help="Scale the size of the labels on X-axis.  Default 1.0.")
    group.add_argument(
        "--xlabel_off", action="store_true", help="Do not label the X axis.")
    group.add_argument(
        "--ylabel_off", action="store_true", help="Do not label the Y axis.")
    group.add_argument(
        "--xtick_label_off", action="store_true",
        help="Do not draw the tick labels on the X axis.")

    group = parser.add_argument_group(title="Colors")
    group.add_argument(
        "--bar_color",  help="Set the color of the bars.  Default #FFFFFF")
    x = _fmt_palettes()
    group.add_argument(
        "--bar_palette", help="Color the bars according to a palette: %s." % x)
    group.add_argument(
        "--symmetric_palette", action="store_true",
        help="Make the color symmetric.")

    group = parser.add_argument_group(title="Appearance")
    group.add_argument(
        "--height", type=int, help="Height (in pixels) of the plot.")
    group.add_argument(
        "--width", type=int, help="Width (in pixels) of the plot.")
    group.add_argument(
        "--mar_left", default=1.0, type=float,
        help="Scale margin at left of plot.  Default 1.0 (no scaling).")
    group.add_argument(
        "--mar_bottom", default=1.0, type=float,
        help="Scale margin at bottom of plot.  Default 1.0.")
    group.add_argument(
        "--xaxis_off", action="store_true", help="Do not show the X axis.")
    group.add_argument(
        "--yaxis_off", action="store_true", help="Do not show the Y axis.")

    # Parse the input arguments.
    args = parser.parse_args()
    if not os.path.exists(args.datafile):
        parser.error("File not found: %s" % args.datafile)
    assert not (args.breaks_seq and args.num_breaks)
    if args.num_breaks:
        assert args.num_breaks >= 2 and args.num_breaks <= 1000
    if args.width is not None:
        assert args.width > 10, "too small"
        assert args.width < 4096*16, "width too big"
    if args.height is not None:
        assert args.height > 10, "too small"
        assert args.height < 4096*16, "height too big"
    assert args.mar_bottom > 0 and args.mar_bottom < 10
    assert args.mar_left > 0 and args.mar_left < 10
    assert args.xlabel_size > 0 and args.xlabel_size < 10
    assert not (args.bar_color and args.bar_palette)
    assert not args.symmetric_palette or args.bar_palette
    assert args.ymax is None or args.ymax > 0

    height = args.height or 2400
    width = args.width or 3200

    MATRIX = AnnotationMatrix.read(args.datafile, False)
    assert MATRIX.num_headers() and MATRIX.num_annots(), "Empty matrix."
    assert args.header in MATRIX.headers, "header not found: %s" % args.header

    # Pull out the values for the histogram.
    raw_values = MATRIX[args.header]
    if args.ignore_missing_values:
        raw_values = [v for v in raw_values if v.strip()]
    # Build a real list (not a lazy map object) so that the emptiness
    # check and the filtering below behave the same on any Python version.
    values = [float(v) for v in raw_values]

    value_min = value_max = None

    # Start R and set up the environment.
    R = jmath.start_R()

    main = jmath.R_var("NA")
    if args.title:
        main = args.title
    sub = ""
    xlab = ""
    if args.xlab:
        xlab = args.xlab
    ylab = "Frequency"
    xtick_labels = jmath.R_var("TRUE")
    ytick_labels = jmath.R_var("TRUE")

    if args.xlabel_off:
        xlab = ""
    if args.ylabel_off:
        ylab = ""
    if args.xtick_label_off:
        xtick_labels = jmath.R_var("FALSE")

    breaks = "Sturges"
    if args.breaks_seq:
        breaks = _parse_breaks_seq(args.breaks_seq)
        value_min, value_max = min(breaks), max(breaks)
        jmath.R_equals(breaks, "breaks")
        breaks = jmath.R_var("breaks")
    if args.num_breaks:
        breaks = args.num_breaks

    # With explicit breakpoints, drop the values outside [min, max).
    if value_min is not None:
        values = [v for v in values if v >= value_min]
    if value_max is not None:
        values = [v for v in values if v < value_max]

    lwd = 2
    cex_lab = 1.5
    cex_main = 2.0
    cex_sub = 1.5
    ylim = jmath.R_var("NULL")
    if args.ymax is not None:
        ylim = [0, args.ymax]

    assert values
    jmath.R_equals(values, "X")

    # Figure out the colors.  Do it after X is assigned.
    col = jmath.R_var("NULL")
    if args.bar_color:
        assert args.bar_color.startswith("#")
        col = args.bar_color
    elif args.bar_palette:
        # Run hist() once without plotting to learn the breakpoints.
        jmath.R_fn(
            "hist", jmath.R_var("X"), breaks=breaks, plot=jmath.R_var("FALSE"),
            RETVAL="x")
        breaks = list(R["x"].rx2("breaks"))
        # hist() reports the cell boundaries, so n bars have n + 1
        # breaks.  Ask for exactly one color per bar; otherwise the end
        # of the palette is never drawn and a symmetric palette is
        # off-center.
        num_bars = len(breaks) - 1
        col = _make_col_palette(
            args.bar_palette, num_bars, args.symmetric_palette)

    bm_type = "png16m"
    if args.plot_file.lower().endswith(".pdf"):
        bm_type = "pdfwrite"
    jmath.R_fn(
        "bitmap", args.plot_file, type=bm_type,
        height=height, width=width, units="px", res=300)

    # Set the margins.
    margins = 5*1.2*args.mar_bottom, 4*1.2*args.mar_left, 4, 2
    mar = [m+0.1 for m in margins]
    jmath.R_fn("par", mar=mar, RETVAL="op")

    # Draw the bars only.  Axes and labels are added separately below so
    # that each can be switched off individually.
    jmath.R_fn(
        "hist", jmath.R_var("X"), breaks=breaks, main="", xlab="", ylab="",
        ylim=ylim, axes=jmath.R_var("FALSE"), col=col, RETVAL="x")

    # x-axis
    if not args.xaxis_off:
        jmath.R_fn(
            "axis", 1, lwd=lwd, labels=xtick_labels, **{ "cex.axis" : 1.5 })
    # y-axis
    if not args.yaxis_off:
        jmath.R_fn(
            "axis", 2, lwd=lwd, labels=ytick_labels, **{ "cex.axis" : 1.5 })
    jmath.R_fn(
        "title", main=main, sub=sub, xlab=xlab, ylab=ylab,
        **{ "cex.lab" : cex_lab, "cex.main" : cex_main, "cex.sub" : cex_sub })
    # Restore the graphical parameters saved before changing the margins.
    R("par(op)")
    jmath.R_fn("dev.off")

    if args.prism_file:
        write_prism_file(args.prism_file, R["x"])
Пример #3
0
def main():
    """Align the samples of several expression and annotation files.

    Command-line entry point.  Each --express_file / --annot_file is
    paired, in command-line order, with one positional ``outfile``
    (use "_" to skip writing that output).  A --header option applies
    to the input file that most recently preceded it on the command
    line.  The matrices are read, their sample names located, the set
    of samples to keep chosen (inner join by default, or --left_join /
    --outer_join), and each aligned matrix is written to its outfile.

    Invalid arguments raise AssertionError; a --header given for an
    --express_file raises NotImplementedError.
    """
    import os
    import sys
    import argparse
    from genomicode import AnnotationMatrix as AM

    SKIP_OUTFILE = "_"

    parser = argparse.ArgumentParser(
        description="Align a set of matrices.  Preserve the order of the "
        "first file given.")
    parser.add_argument("outfile", nargs="+")

    parser.add_argument(
        "--express_file", default=[], action="append", help="")
    parser.add_argument(
        "--annot_file", default=[], action="append", help="")
    parser.add_argument(
        "--header", default=[], action="append",
        help="Specify the header for an annotation file.  Should come "
        "after the --annot_file that it refers to.")
    parser.add_argument(
        "--annot_path",
        help="Align all the annotation files in a path.  "
        "If using this argument, no --annot_file or --express_file should "
        "be given.  "
        "--header is still required, and should apply to at least one file.  "
        'Only one "outfile" should be given, and it should refer to a path '
        "in which to store the aligned files.")

    parser.add_argument(
        "--clobber", default=False, action="store_true",
        help="Overwrite output files, if they already exist.")

    group = parser.add_argument_group(title="Comparisons")
    group.add_argument(
        "--case_insensitive", default=False, action="store_true",
        help="Do a case insensitive search of sample names.")
    group.add_argument(
        "--hash", default=False, action="store_true",
        help="Hash the sample names to [a-zA-Z0-9_] before comparison.")
    group.add_argument(
        "--ignore_nonalnum", default=False, action="store_true",
        help="Ignore non-alphanumeric characters in the IDs.")
    group.add_argument(
        "--ignore_blank", default=False, action="store_true",
        help="Ignore IDs that are blank (don't align them.")

    group = parser.add_argument_group(title="Joins")
    group.add_argument(
        "--strict", default=False, action="store_true",
        help="Complain if a file is missing a sample.")
    group.add_argument(
        "--left_join", default=False, action="store_true",
        help='By default, does an "inner join" and keeps only the '
        'records that are present in all files.  A "left join" will '
        'keep all records that occur in the first file.')
    group.add_argument(
        "--outer_join", default=False, action="store_true",
        help='By default, does an "inner join" and keeps only the '
        'records that are present in all files.  An "outer join" will '
        'also keep records that occur in any file.')

    group = parser.add_argument_group(title="Output")
    group.add_argument(
        "--null_string", default="",
        help='For left_join or outer_join, what to give the missing values.')
    group.add_argument(
        "--unaligned_only", action="store_true",
        help="Show only the rows that are not aligned.")
    group.add_argument(
        "--dont_add_missing_samples", action="store_true",
        help="If a matrix does not have a sample, don't fill in the value "
        "from another matrix.")

    group = parser.add_argument_group(title="Debug")
    group.add_argument(
        "--debug_nrows", type=int,
        help="Debugging: Only read this many rows from the annotation files.")

    args = parser.parse_args()
    # If the user specified an --annot_path, revise args to
    # contain --annot_files instead.
    sys.argv, args = _handle_annot_path(sys.argv, args)

    ni, no = len(args.express_file)+len(args.annot_file), len(args.outfile)
    assert ni == no, "Mismatch: %d inputs and %d outputs" % (ni, no)

    for x in args.express_file + args.annot_file:
        assert os.path.exists(x), "I could not find file: %s" % x
    for x in args.outfile:
        if x == SKIP_OUTFILE:
            continue
        assert args.clobber or not os.path.exists(x), "File exists: %s" % x
    assert not (args.left_join and args.outer_join)
    if args.null_string:
        assert args.outer_join or args.left_join, \
               "null_string given, but only used for left_join or outer_join"

    # Align the outfiles to the expression and annotation files.
    # argparse collects each option into its own list, so sys.argv is
    # walked to recover the order in which the input files were given.
    express_file = args.express_file[:]
    annot_file = args.annot_file[:]
    outfile = args.outfile[:]
    matrix_data = []  # list of (infile, outfile, is_express_file)
    for arg in sys.argv:
        if arg not in ["--express_file", "--annot_file"]:
            continue
        assert outfile
        if arg == "--express_file":
            assert express_file
            x = express_file.pop(0), outfile.pop(0), True
        else:
            assert annot_file
            x = annot_file.pop(0), outfile.pop(0), False
        matrix_data.append(x)
    assert not express_file
    assert not annot_file
    assert not outfile

    # Align the --header arguments to the annotation files.
    headers = [None] * len(matrix_data)
    header_i = -1  # index of the input file most recently seen
    for i, arg in enumerate(sys.argv):
        if arg == "--header":
            assert header_i >= 0, \
                   "--header given before an --express_file or --annot_file."
            assert headers[header_i] is None, "Two --header for one file."
            headers[header_i] = sys.argv[i+1]
        elif arg in ["--express_file", "--annot_file"]:
            header_i += 1

    # Add the headers to the matrix_data.
    new_matrix_data = []  # list of (infile, outfile, is_express_file, header)
    for (infile, outfile, is_express_file), header in zip(
            matrix_data, headers):
        if is_express_file and header:
            raise NotImplementedError("No headers for --express_file.")
        x = infile, outfile, is_express_file, header
        new_matrix_data.append(x)
    matrix_data = new_matrix_data

    # Read each of the files.
    new_matrix_data = []  # list of (infile, outfile, matrix, header)
    for x in matrix_data:
        infile, outfile, is_express_file, header = x
        if is_express_file:
            data = read_express(infile)
        else:
            data = AM.read(infile, nrows=args.debug_nrows)
        x = infile, outfile, data, header
        new_matrix_data.append(x)
    matrix_data = new_matrix_data

    # Find the samples in each matrix.
    new_matrix_data = []  # list of (infile, outfile, matrix, header, samples)
    samples_hint = peek_samples_hint(matrix_data)
    # The same for every matrix, so compute it once.
    headers_hint = [h for h in headers if h]
    for x in matrix_data:
        infile, outfile, matrix, header = x
        x = get_samples(
            matrix, header, samples_hint, headers_hint,
            args.case_insensitive, args.hash, args.ignore_nonalnum)
        assert x, "I could not find the samples for %s" % infile
        header, samples = x
        x = infile, outfile, matrix, header, samples
        new_matrix_data.append(x)
    matrix_data = new_matrix_data

    if args.left_join:
        assert not args.strict, "Can't do a strict left join."
        # No duplicates.
        samples = list_all_samples(
            matrix_data[:1], args.case_insensitive, args.hash,
            args.ignore_nonalnum)
        assert samples, "No samples."
    elif args.outer_join:
        assert not args.strict, "Can't do a strict outer join."
        samples = list_all_samples(
            matrix_data, args.case_insensitive, args.hash,
            args.ignore_nonalnum)
        assert samples, "No samples."
    else:  # inner join
        samples = list_common_samples(
            matrix_data, args.case_insensitive, args.hash,
            args.ignore_nonalnum)
        assert samples, "No common samples found."

    if args.strict:
        all_samples = list_all_samples(
            matrix_data, args.case_insensitive, args.hash,
            args.ignore_nonalnum)
        common_samples = list_common_samples(
            matrix_data, args.case_insensitive, args.hash,
            args.ignore_nonalnum)
        if sorted(all_samples) != sorted(common_samples):
            missing_samples = []
            for x in all_samples:
                i = find_sample(
                    common_samples, x, args.case_insensitive, args.hash,
                    args.ignore_nonalnum, args.ignore_blank)
                if i >= 0:
                    continue
                missing_samples.append(x)
            # Show at most the first 10 missing samples in the message.
            short = missing_samples
            if len(short) > 10:
                short = short[:10] + ["..."]
            short = "\n".join(short)
            raise AssertionError(
                "%d samples not in all data sets.\n%s" %
                (len(missing_samples), short))

    # Align each of the matrices.
    matrix_data = align_matrices(
        matrix_data, samples, args.case_insensitive, args.hash,
        args.ignore_nonalnum, args.ignore_blank,
        args.left_join, args.outer_join, args.unaligned_only,
        args.null_string)

    # Add the missing samples back to the matrix.
    if not args.dont_add_missing_samples:
        matrix_data = add_missing_samples(matrix_data, args.null_string)

    # Write out each of the matrices.
    for x in matrix_data:
        infile, outfile, matrix, header, samples = x
        if outfile == SKIP_OUTFILE:
            continue
        write_matrix(outfile, matrix)
Пример #4
0
def main():
    """Draw a scatter plot of columns of a tab-delimited file using R.

    Command-line entry point.  Each --series names an X and a Y column
    of ``datafile``; rows with an empty value in either column are
    dropped and the rest are converted to floats.  The points of all
    series are plotted together into ``plot_file`` (a PDF if the name
    ends with ".pdf", otherwise a PNG), optionally with connecting
    lines, an identity line, point labels, a regression line, and a
    legend.  The correlation of X and Y and its p-value are printed to
    stdout.

    Invalid option values raise AssertionError; a missing data file is
    reported via parser.error.
    """
    import os
    import argparse

    from genomicode import jmath
    from genomicode import AnnotationMatrix
    from genomicode import colorlib
    from genomicode import pcalib

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("datafile", help="Tab-delimited data file.")
    parser.add_argument(
        "plot_file",
        help="Name of image file, e.g. outfile.png.  "
        "Will generate PNG format by default.  If this file name ends with "
        ".pdf, will generate a PDF file instead.")

    group = parser.add_argument_group(title="Data Series")
    group.add_argument(
        "--series",
        action="append",
        help="Add a data series to the plot.  At least one series must be "
        "plotted.  Format: <x_header>;<y_header>")

    group = parser.add_argument_group(title="General Appearance")
    group.add_argument("--no_box",
                       action="store_true",
                       help="Turn off the box around the plot.")
    group.add_argument("--height",
                       type=int,
                       help="Height (in pixels) of the plot.")
    group.add_argument("--width",
                       type=int,
                       help="Width (in pixels) of the plot.")
    group.add_argument(
        "--mar_left",
        default=1.0,
        type=float,
        help="Scale margin at left of plot.  Default 1.0 (no scaling).")
    group.add_argument("--mar_bottom",
                       default=1.0,
                       type=float,
                       help="Scale margin at bottom of plot.  Default 1.0.")
    group.add_argument("--log_x",
                       action="store_true",
                       help="Plot the X-axis on a log scale.")
    group.add_argument("--log_y",
                       action="store_true",
                       help="Plot the Y-axis on a log scale.")
    group.add_argument(
        "--qq",
        action="store_true",
        help="Make a QQ-plot.  Will sort the values to be plotted.")

    group = parser.add_argument_group(title="Plot Labels")
    group.add_argument("--title", help="Put a title on the plot.")
    group.add_argument("--xlab", help="Label the X-axis.")
    group.add_argument("--ylab", help="Label the Y-axis.")
    group.add_argument("--add_regression",
                       action="store_true",
                       help="Put a regression line on the plot.")

    group = parser.add_argument_group(title="Legend")
    group.add_argument("--add_legend",
                       action="store_true",
                       help="Add a legend to the plot.")
    group.add_argument("--legend_inset", type=float, default=0.05, help="")
    LEGEND_LOCATIONS = [
        "bottomright",
        "bottom",
        "bottomleft",
        "left",
        "topleft",
        "top",
        "topright",
        "right",
        "center",
    ]
    group.add_argument("--legend_loc",
                       choices=LEGEND_LOCATIONS,
                       help="Where to draw the legend.")

    group = parser.add_argument_group(title="Point Appearance")
    group.add_argument("--scale_points",
                       default=1.0,
                       type=float,
                       help="Scale the size of the points.  Default 1.0")
    group.add_argument("--label_header",
                       help="Label each point with the values in this column.")
    group.add_argument("--label_size",
                       type=float,
                       help="Scale the size of the labels by this value.")
    group.add_argument("--label_pos",
                       default="top",
                       choices=["top", "bottom", "left", "right"],
                       help="Where to label the points.")

    group = parser.add_argument_group(title="Line Appearance")
    group.add_argument("--add_lines",
                       action="store_true",
                       help="Add lines that connect the points.")
    group.add_argument("--scale_lines",
                       default=1.0,
                       type=float,
                       help="Scale the thickness of the lines.  Default 1.0")

    group = parser.add_argument_group(title="Identity Line")
    group.add_argument("--add_identity_line",
                       action="store_true",
                       help="Add an identity line to the plot.")

    group = parser.add_argument_group(title="Colors")
    group.add_argument(
        "-c",
        "--cluster",
        action="append",
        help="Group samples into a cluster (e.g. -c 1-5); 1-based, inclusive.")
    group.add_argument(
        "--indexes_include_headers",
        "--iih",
        action="store_true",
        help="If not given (default), then index 1 is the first row "
        "with data.  If given, then index 1 is the very first row "
        "in the file, including the headers.")
    group.add_argument("--default_color",
                       help="Default color of points.  Format: #000000.")

    # Parse the input arguments.
    args = parser.parse_args()
    if not os.path.exists(args.datafile):
        parser.error("File not found: %s" % args.datafile)
    if args.width is not None:
        assert args.width > 10, "too small"
        assert args.width < 4096 * 16, "width too big"
    if args.height is not None:
        assert args.height > 10, "too small"
        assert args.height < 4096 * 16, "height too big"
    assert args.mar_bottom > 0 and args.mar_bottom < 10
    assert args.mar_left > 0 and args.mar_left < 10

    assert args.legend_inset >= 0 and args.legend_inset < 10
    if args.legend_loc is None:
        args.legend_loc = "bottomright"

    if args.default_color:
        assert len(args.default_color) == 7
        assert args.default_color[0] == "#"

    MATRIX = AnnotationMatrix.read(args.datafile, False)
    assert MATRIX.num_headers() and MATRIX.num_annots(), "Empty matrix."
    assert args.series, "Need to add a data --series to plot."
    if args.label_header:
        assert args.label_header in MATRIX.headers, \
               "header not found: %s" % args.label_header
    if args.label_size is not None:
        assert args.label_size > 0 and args.label_size <= 20
    assert args.scale_points > 0 and args.scale_points < 20
    assert args.scale_lines > 0 and args.scale_lines < 20

    series = _parse_series(MATRIX, args.series)
    cluster = None
    if args.cluster:
        cluster = _parse_cluster(args.cluster, args.indexes_include_headers,
                                 MATRIX)

    if len(series) > 1:
        assert not cluster, "Series and cluster not implemented."

    height = args.height or 2400
    width = args.width or 3200

    # Pull out the values and colors for the plot.
    default_color = "#000000"
    if args.default_color:
        default_color = args.default_color

    assert len(series) < len(colorlib.BREWER_QUALITATIVE_SET1)
    series_data = []  # list of (x_values, y_values, col) for each series
    series_colors = []  # one color per series, used for the legend
    for i, (x_header, y_header) in enumerate(series):
        x = MATRIX[x_header]
        y = MATRIX[y_header]
        # Keep only the rows that have a value in both columns.  A set
        # keeps the membership test from being quadratic in the rows.
        has_y = set(j for (j, a) in enumerate(y) if a)
        I = [j for (j, a) in enumerate(x) if a and j in has_y]
        x = [float(x[j]) for j in I]
        y = [float(y[j]) for j in I]
        assert len(x) == len(y)
        c = default_color
        if len(series) > 1:
            rgb = colorlib.BREWER_QUALITATIVE_SET1[i]
            c = colorlib.rgb2hex(rgb, prefix="#")
        series_colors.append(c)
        # One color entry per point in the series.
        series_data.append((x, y, [c] * len(x)))

    # Merge all the data point for each series.
    x_values = []
    y_values = []
    col = []
    for (x, y, c) in series_data:
        x_values.extend(x)
        y_values.extend(y)
        col.extend(c)
    assert len(x_values) == len(y_values)
    assert len(x_values) == len(col)

    if args.qq:
        # Sort the points by X, keeping Y and the colors in step.
        O = jmath.order(x_values)
        x_values = [x_values[i] for i in O]
        y_values = [y_values[i] for i in O]
        col = [col[i] for i in O]

    if cluster is not None:
        col_rgb = pcalib.choose_colors(cluster)
        col = [default_color] * len(col_rgb)
        for i in range(len(col_rgb)):
            if col_rgb[i] is None:
                continue
            col[i] = colorlib.rgb2hex(col_rgb[i], prefix="#")
        assert len(col) == len(x_values)

    # Start R and set up the environment.
    R = jmath.start_R()

    main = jmath.R_var("NA")
    if args.title:
        main = args.title
    sub = ""
    # With a single series, the axis labels default to its headers.
    xlab = ""
    ylab = ""
    if len(series) == 1:
        xlab, ylab = series[0]
    if args.xlab:
        xlab = args.xlab
    # The Y label must be controlled by --ylab, not by --xlab.
    if args.ylab:
        ylab = args.ylab

    lwd_box = 2
    lwd_axis = 2
    lwd_regr = 3
    cex = 1.0 * args.scale_points
    cex_lab = 1.5
    cex_main = 2.0
    cex_sub = 1.0

    plot_log = ""
    if args.log_x:
        plot_log += "x"
    if args.log_y:
        plot_log += "y"

    assert x_values
    assert y_values
    jmath.R_equals(x_values, "X")
    jmath.R_equals(y_values, "Y")

    bm_type = "png16m"
    if args.plot_file.lower().endswith(".pdf"):
        bm_type = "pdfwrite"
    jmath.R_fn("bitmap",
               args.plot_file,
               type=bm_type,
               height=height,
               width=width,
               units="px",
               res=300)

    # Set the margins.
    margins = 5 * 1.2 * args.mar_bottom, 4 * 1.2 * args.mar_left, 4, 2
    mar = [m + 0.1 for m in margins]
    jmath.R_fn("par", mar=mar, RETVAL="op")

    # Draw the points only.  The box, axes, and labels are added below.
    jmath.R_fn("plot",
               jmath.R_var("X"),
               jmath.R_var("Y"),
               main="",
               xlab="",
               ylab="",
               pch=19,
               cex=cex,
               log=plot_log,
               col=col,
               axes=jmath.R_var("FALSE"),
               RETVAL="x")

    if args.add_lines:
        lwd = 4 * args.scale_lines
        i = 0
        for (x, y, c) in series_data:
            # Cannot use c for the color.  It might've been changed by
            # --cluster.
            assert col and i < len(col)
            c = col[i:i + len(x)]
            i += len(x)

            # The "lines" function takes a scalar for col (except for
            # type=h, histogram vertical lines).  If there are
            # multiple colors, then split up the points based on the
            # colors.
            l_x, l_y, l_c = [], [], None
            for j in range(len(x)):
                if c[j] != l_c and l_x:
                    jmath.R_fn("lines", l_x, l_y, lwd=lwd, col=l_c)
                    # Add the previous point so that the points will
                    # connect.
                    l_x = [l_x[-1]]
                    l_y = [l_y[-1]]
                l_x.append(x[j])
                l_y.append(y[j])
                l_c = c[j]
            if l_x:
                jmath.R_fn("lines", l_x, l_y, lwd=lwd, col=l_c)

    if args.add_identity_line:
        lwd = 4

        x_min, x_max = min(x_values), max(x_values)
        y_min, y_max = min(y_values), max(y_values)

        # Draw y = x only over the range covered by both axes.
        iden_min = max(x_min, y_min)
        iden_max = min(x_max, y_max)

        l_x = [iden_min, iden_max]
        l_y = l_x
        l_c = "#FF0000"
        jmath.R_fn("lines", l_x, l_y, lwd=lwd, col=l_c)

    if args.label_header:
        cex = 1
        if args.label_size is not None:
            cex = args.label_size
        # R's text() "pos" codes: 1=below, 2=left, 3=above, 4=right.
        pos2specifier = {
            "top": 3,
            "bottom": 1,
            "left": 2,
            "right": 4,
        }
        pos = pos2specifier[args.label_pos]
        # NOTE(review): the labels are taken from every row of the
        # matrix, while X and Y exclude rows with missing values (and
        # are reordered by --qq), so labels may not line up with the
        # points in those cases -- confirm.
        point_labels = MATRIX[args.label_header]
        jmath.R_fn("text",
                   jmath.R_var("X"),
                   jmath.R_var("Y"),
                   labels=point_labels,
                   cex=cex,
                   pos=pos)

    # Calculate correlation, and other statistics.
    # TODO: Should calculate this for each series.
    r = jmath.R("cor(X, Y)")
    p_value = jmath.R("cor.test(X, Y)$p.value")
    r = r[0]
    p_value = p_value[0]
    # Single-argument print(...) behaves the same as a statement or a
    # function call.
    print("R = %.2f" % r)
    print("p = %.2g" % p_value)

    # Add a regression line.
    if args.add_regression:
        jmath.R("fit <- lm(Y ~ X)")
        coef = jmath.R("fit$coefficients")
        assert len(coef) == 2
        b, m = coef  # intercept, slope
        x1 = min(x_values)
        y1 = x1 * m + b
        x2 = max(x_values)
        y2 = x2 * m + b
        jmath.R_fn("lines", [x1, x2], [y1, y2],
                   lwd=lwd_regr,
                   lty=2,
                   col="#C63F31")
        sub = "R=%.2f (p=%.2g)" % (r, p_value)
        header = "X", "Y", "R", "p"
        print("\t".join(header))
        row = xlab, ylab, r, p_value
        print("\t".join(map(str, row)))

    if args.add_legend:
        # Label each series by its Y header, with one fill color per
        # series (not the per-point color lists stored in series_data).
        leg = [y_header for (x_header, y_header) in series]
        fill = series_colors
        # alpha does not seem to be supported here.
        jmath.R_fn("legend",
                   args.legend_loc,
                   legend=leg,
                   fill=fill,
                   inset=args.legend_inset)

    if not args.no_box:
        jmath.R_fn("box", lwd=lwd_box)
    jmath.R_fn("axis", 1, lwd=lwd_axis, **{"cex.axis": 1.5})
    jmath.R_fn("axis", 2, lwd=lwd_axis, **{"cex.axis": 1.5})
    jmath.R_fn("title",
               main=main,
               sub=sub,
               xlab=xlab,
               ylab=ylab,
               **{
                   "cex.lab": cex_lab,
                   "cex.main": cex_main,
                   "cex.sub": cex_sub
               })
    # Restore the graphical parameters saved before changing the margins.
    R("par(op)")
    jmath.R_fn("dev.off")
Пример #5
0
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        out_filename):
        """Combine per-sample RSEM results into one expression matrix file.

        Locates the RSEM result files under the directory named by
        in_data.identifier, reads the gene-level or isoform-level file
        for every sample, and writes a tab-delimited table to
        out_filename.  The table has a header row followed by one row
        per gene/isoform; the columns are gene_id, the transcript ID
        column, and then one expression column per sample (samples in
        sorted order).

        Arguments:
        in_data         Object whose .identifier is the path of a
                        directory containing RSEM results.
        out_attributes  Mapping.  "preprocess" must be "tpm" or "fpkm"
                        and selects the TPM or FPKM column.
                        "expression_of" must be "gene" or "isoform" and
                        selects which result file is read per sample.
        out_filename    Path of the file to write.
        network, user_options, num_cores
                        Not used by this method.

        Returns a metadata dictionary (currently always empty).

        Raises AssertionError if the input directory or a result file
        is missing, an attribute has an unexpected value, a required
        column is absent, or the samples do not all list the same gene
        and transcript IDs in the same order.

        """
        import os
        from genomicode import jmath
        from genomicode import AnnotationMatrix
        from genomicode import alignlib

        rsem_path = in_data.identifier
        assert os.path.exists(rsem_path)
        assert os.path.isdir(rsem_path)
        result_files = alignlib.find_rsem_result_files(rsem_path)
        assert result_files, "No .results files found."
        metadata = {}

        preprocess = out_attributes.get("preprocess")
        assert preprocess in ["tpm", "fpkm"]

        # Figure out whether to report gene-level or isoform-level
        # expression.
        expression_of = out_attributes["expression_of"]
        assert expression_of in ["gene", "isoform"]
        get_genes = expression_of == "gene"

        # The gene results and the isoform results name the transcript
        # column differently.
        transcript_header = "transcript_id(s)"
        if not get_genes:
            transcript_header = "transcript_id"

        # For each of the samples, read the expression data.
        sample2matrix = {}  # sample -> AnnotationMatrix
        for sample, gene_filename, isoform_filename in result_files:
            filename = gene_filename
            if not get_genes:
                filename = isoform_filename
            # filename is None here, so report the sample instead.
            assert filename is not None, \
                "Missing %s results file for sample: %s" % (
                    expression_of, sample)
            assert os.path.exists(filename), \
                "File not found: %s" % filename
            matrix = AnnotationMatrix.read(filename)
            # Do some checking on the matrix.
            assert "gene_id" in matrix.headers
            assert transcript_header in matrix.headers
            assert "TPM" in matrix.headers
            assert "FPKM" in matrix.headers
            sample2matrix[sample] = matrix
        assert sample2matrix, "No samples"

        # Pull out the gene and transcript IDs.  The first matrix seen
        # provides the reference; every other matrix must agree with it
        # exactly so that the rows line up across samples.
        gene_id = transcript_id = None
        for matrix in sample2matrix.values():
            sample_gene_id = matrix["gene_id"]
            sample_transcript_id = matrix[transcript_header]
            if gene_id is None:
                gene_id = sample_gene_id
            if transcript_id is None:
                transcript_id = sample_transcript_id
            assert sample_gene_id == gene_id
            assert sample_transcript_id == transcript_id
        assert gene_id
        assert transcript_id
        assert len(gene_id) == len(transcript_id)

        # Assemble into a gene expression matrix.
        value_header = "TPM"
        if preprocess == "fpkm":
            value_header = "FPKM"
        # Build the table column by column, then transpose it so that
        # each row is a gene/isoform.
        t_data = [gene_id, transcript_id]
        samples = []
        for sample in sorted(sample2matrix):
            exp = sample2matrix[sample][value_header]
            assert len(exp) == len(gene_id)
            t_data.append(exp)
            samples.append(sample)

        data = jmath.transpose(t_data)
        column_names = ["gene_id", transcript_header] + samples
        data = [column_names] + data

        # Write out the data file.  The with block makes sure the file
        # is flushed and closed even if a row fails to format.
        with open(out_filename, 'w') as handle:
            for row in data:
                handle.write("\t".join(map(str, row)) + "\n")

        return metadata