def _make_col_palette(palette_name, num_bars, symmetric):
    """Format the ``col`` variable for the R hist function.

    palette_name  Name handed to get_palette_fn to look up the palette.
    num_bars      Number of colors to return (one per histogram bar).
    symmetric     If false, the palette is used as-is from left to right.
                  If true, the palette is mirrored around the center so
                  that its first color sits in the middle bar(s).

    Returns a list of num_bars color strings of the form "#rrggbb"
    (the "0x" prefix produced by colorlib.rgb2hex is replaced by "#").
    """
    from genomicode import colorlib

    color_fn = get_palette_fn(palette_name)
    if not symmetric:
        colors = color_fn(num_bars)
    else:
        # Color order goes from left to right.  The colors that were on
        # the left should now be in the middle.
        if num_bars % 2:
            # Odd: the first palette color is the single center bar, so it
            # must not be repeated when the mirrored half is appended.
            half = list(color_fn(num_bars // 2 + 1))
            colors = list(reversed(half)) + half[1:]
        else:
            # Even: the two center bars share the first palette color.
            half = list(color_fn(num_bars // 2))
            colors = list(reversed(half)) + half
    hex_colors = [colorlib.rgb2hex(rgb).replace("0x", "#") for rgb in colors]
    assert len(hex_colors) == num_bars
    return hex_colors
def main():
    """Draw a line plot of gene expression values, one line per gene.

    Parses the command line, reads the expression matrix, keeps the
    requested genes (fewer than 50), optionally writes them to a
    Prism-formatted file, and then drives R to render the plot to
    args.plot_file (PNG by default, PDF if the name ends with ".pdf").
    """
    import argparse
    import math

    import arrayio
    from genomicode import config
    from genomicode import colorlib
    from genomicode import jmath
    from genomicode.jmath import R_fn, R_var, R_equals

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("expression_file", help="Gene expression file.")
    parser.add_argument(
        "plot_file",
        help="Name of image file, e.g. outfile.png. "
        "Will generate PNG format by default. If this file name ends with "
        ".pdf, will generate a PDF file instead.")
    parser.add_argument("-v", "--verbose", action="store_true", help="")
    parser.add_argument(
        "--prism_file", help="Save result in Prism-formatted file.")

    gene_group = parser.add_argument_group(title="Genes")
    gene_group.add_argument(
        "--gene_names", default=[], action="append",
        help="Comma-separated list of IDs (e.g. probes, gene names) "
        "to include.")
    gene_group.add_argument(
        "--all_genes", default=False, action="store_true",
        help="Plot all genes in the file.")

    plot_group = parser.add_argument_group(title="Plot")
    plot_group.add_argument(
        "--title", default=None, help="Put a title on the plot.")
    plot_group.add_argument(
        "--height", default=None, type=int,
        help="Height (in pixels) of the plot.")
    plot_group.add_argument(
        "--width", default=None, type=int,
        help="Width (in pixels) of the plot.")
    plot_group.add_argument(
        "--mar_left", default=1.0, type=float,
        help="Scale margin at left of plot. Default 1.0 (no scaling).")
    plot_group.add_argument(
        "--mar_bottom", default=1.0, type=float,
        help="Scale margin at bottom of plot. Default 1.0 (no scaling).")
    plot_group.add_argument(
        "--xlabel_size", default=1.0, type=float,
        help="Scale the size of the labels on X-axis. Default 1.0.")
    plot_group.add_argument(
        "--xlabel_off", default=False, action="store_true",
        help="Turn off the X labels.")
    plot_group.add_argument("--ylabel", help="Label the Y axis.")
    plot_group.add_argument(
        "--gene_name_header",
        help="Header for gene names to be used in the legend.")
    plot_group.add_argument(
        "--yaxis_starts_at_0", action="store_true",
        help="Y-axis should start at 0.")
    plot_group.add_argument(
        "--legend_off", action="store_true", help="Do not draw legend.")
    plot_group.add_argument(
        "--horizontal_lines", action="store_true",
        help="Draw horizontal lines.")

    # Parse and validate the command line.
    args = parser.parse_args()
    if not os.path.exists(args.expression_file):
        parser.error("I could not find file %s." % args.expression_file)
    if args.width is not None:
        assert args.width > 10, "too small"
        assert args.width < 4096 * 16, "width too big"
    if args.height is not None:
        assert args.height > 10, "too small"
        assert args.height < 4096 * 16, "height too big"
    assert args.gene_names or args.all_genes, \
        "Please specify some genes to plot."
    assert 0 < args.mar_bottom < 10
    assert 0 < args.mar_left < 10
    assert 0 < args.xlabel_size < 10

    plot_height = args.height if args.height else 1600
    plot_width = args.width if args.width else 1600

    # Load the matrix and keep only the rows that will be plotted.
    MATRIX = arrayio.read(args.expression_file)
    assert MATRIX.nrow() and MATRIX.ncol(), "Empty matrix."
    row_indexes = None
    if args.gene_names:
        row_indexes = find_gene_names(MATRIX, args.gene_names)
    elif args.all_genes:
        row_indexes = range(MATRIX.nrow())
    assert row_indexes, "No genes found."
    assert len(row_indexes) < 50, "Too many genes."
    MATRIX = MATRIX.matrix(row_indexes, None)

    # Names shown in the legend: either a user-chosen annotation column
    # or a name derived for each row.
    if not args.gene_name_header:
        gene_names = [
            get_pretty_gene_name(MATRIX, row) for row in range(MATRIX.nrow())
        ]
    else:
        header = args.gene_name_header
        assert header in MATRIX.row_names(), "Missing header: %s" % header
        gene_names = MATRIX.row_names(header)
    assert len(gene_names) == MATRIX.nrow()

    if args.prism_file:
        write_prism_file(args.prism_file, MATRIX, gene_names)

    # Start R and load the plotting library.
    jmath.start_R()
    plotlib = os.path.join(config.changlab_Rlib, "plotlib.R")
    assert os.path.exists(plotlib), "I cannot find: %s" % plotlib
    R_fn("source", plotlib)

    # Text that goes on the plot.
    plot_title = args.title if args.title else R_var("NA")
    subtitle = ""
    x_axis_label = ""
    y_axis_label = args.ylabel if args.ylabel else ""

    # Axis limits.  One unit of padding is left around the data.
    xlim = [1, MATRIX.ncol() + 1]
    y_max = jmath.max(jmath.max(MATRIX._X))
    y_min = jmath.min(jmath.min(MATRIX._X))
    ylim = [y_min - 1, y_max + 1]
    if args.yaxis_starts_at_0:
        assert y_max > 0
        ylim[0] = 0

    # Tick labels for the X-axis (R FALSE turns them off).
    labels = jmath.R_var("FALSE")
    if not args.xlabel_off:
        labels = MATRIX.col_names(arrayio.COL_ID)
    at = R_var("NULL")
    if labels != jmath.R_var("FALSE"):
        at = range(1, len(labels) + 1)

    lwd = 2
    las = 3  # vertical labels
    cex_labels = 1 * args.xlabel_size
    cex_lab = 1.5
    cex_sub = 1.5

    # One color per gene.
    col = [
        colorlib.rgb2hex(rgb).replace("0x", "#")
        for rgb in colorlib.bild_colors(len(gene_names))
    ]

    R_equals(MATRIX._X, "X")
    R_equals(labels, "labels")
    R_equals(at, "at")

    if args.plot_file.lower().endswith(".pdf"):
        bm_type = "pdfwrite"
    else:
        bm_type = "png16m"
    R_fn("bitmap", args.plot_file, type=bm_type,
         height=plot_height, width=plot_width, units="px", res=300)

    # Set the margins.
    base_margins = (5 * 1.2 * args.mar_bottom, 4 * 1.2 * args.mar_left, 4, 2)
    mar = [margin + 0.1 for margin in base_margins]
    R_fn("par", mar=mar, RETVAL="op")

    # Empty plot with a solid white background, then the decorations.
    R_fn("plot", R_var("NA"), type="n", axes=R_var("FALSE"),
         xlab="", ylab="", xlim=xlim, ylim=ylim)
    jmath.R('usr <- par("usr")')
    jmath.R('rect(usr[1], usr[3], usr[2], usr[4], col="#FFFFFF")')
    jmath.R_fn("box", lwd=lwd)
    jmath.R_fn("axis", 1, lwd=lwd, labels=R_var("labels"), at=R_var("at"),
               las=las, **{"cex.axis": cex_labels})
    jmath.R_fn("axis", 2, lwd=lwd, **{"cex.axis": 1.5})
    title_sizes = {"cex.lab": cex_lab, "cex.main": 2.0, "cex.sub": cex_sub}
    jmath.R_fn("title", main=plot_title, sub=subtitle,
               xlab=x_axis_label, ylab=y_axis_label, **title_sizes)

    # One line (plus point markers) per gene.
    for row in range(MATRIX.nrow()):
        y_values = MATRIX._X[row]
        x_values = range(1, len(y_values) + 1)
        R_fn("lines", x_values, y_values, lwd=lwd, col=col[row])
        R_fn("points", x_values, y_values, pch=19, cex=1, col=col[row])

    if args.horizontal_lines:
        # Dotted grey line at every integer inside the Y range.
        lowest = int(math.ceil(ylim[0]))
        highest = int(math.floor(ylim[1]))
        for level in range(lowest, highest + 1):
            R_fn("lines", (1, MATRIX.ncol() + 1), (level, level),
                 lty=3, col="#A0A0A0")

    if not args.legend_off:
        R_fn("legend", "bottomleft", legend=gene_names, fill=col, cex=1,
             inset=0.05, **{"box.lwd": 1.5})

    R_fn("par", R_var("op"))
    R_fn("dev.off")
def main():
    """Draw a scatter plot of one or more X/Y column pairs from a table.

    Parses the command line, reads the tab-delimited annotation file,
    pulls out the numeric values of every requested --series (rows where
    either value is blank are dropped), and drives R to render the plot
    to args.plot_file (PNG by default, PDF if the name ends with ".pdf").
    The correlation between the plotted X and Y values and its p-value
    are printed to stdout.
    """
    import os
    import argparse
    from genomicode import jmath
    from genomicode import AnnotationMatrix
    from genomicode import colorlib
    from genomicode import pcalib

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("datafile", help="Tab-delimited data file.")
    parser.add_argument(
        "plot_file",
        help="Name of image file, e.g. outfile.png. "
        "Will generate PNG format by default. If this file name ends with "
        ".pdf, will generate a PDF file instead.")

    group = parser.add_argument_group(title="Data Series")
    group.add_argument(
        "--series", action="append",
        help="Add a data series to the plot. At least one series must be "
        "plotted. Format: <x_header>;<y_header>")

    group = parser.add_argument_group(title="General Appearance")
    group.add_argument(
        "--no_box", action="store_true",
        help="Turn off the box around the plot.")
    group.add_argument(
        "--height", type=int, help="Height (in pixels) of the plot.")
    group.add_argument(
        "--width", type=int, help="Width (in pixels) of the plot.")
    group.add_argument(
        "--mar_left", default=1.0, type=float,
        help="Scale margin at left of plot. Default 1.0 (no scaling).")
    group.add_argument(
        "--mar_bottom", default=1.0, type=float,
        help="Scale margin at bottom of plot. Default 1.0.")
    group.add_argument(
        "--log_x", action="store_true",
        help="Plot the X-axis on a log scale.")
    group.add_argument(
        "--log_y", action="store_true",
        help="Plot the Y-axis on a log scale.")
    group.add_argument(
        "--qq", action="store_true",
        help="Make a QQ-plot. Will sort the values to be plotted.")

    group = parser.add_argument_group(title="Plot Labels")
    group.add_argument("--title", help="Put a title on the plot.")
    group.add_argument("--xlab", help="Label the X-axis.")
    group.add_argument("--ylab", help="Label the Y-axis.")
    group.add_argument(
        "--add_regression", action="store_true",
        help="Put a regression line on the plot.")

    group = parser.add_argument_group(title="Legend")
    group.add_argument(
        "--add_legend", action="store_true",
        help="Add a legend to the plot.")
    group.add_argument("--legend_inset", type=float, default=0.05, help="")
    LEGEND_LOCATIONS = [
        "bottomright", "bottom", "bottomleft", "left", "topleft", "top",
        "topright", "right", "center",
    ]
    group.add_argument(
        "--legend_loc", choices=LEGEND_LOCATIONS,
        help="Where to draw the legend.")

    group = parser.add_argument_group(title="Point Appearance")
    group.add_argument(
        "--scale_points", default=1.0, type=float,
        help="Scale the size of the points. Default 1.0")
    group.add_argument(
        "--label_header",
        help="Label each point with the values in this column.")
    group.add_argument(
        "--label_size", type=float,
        help="Scale the size of the labels by this value.")
    group.add_argument(
        "--label_pos", default="top",
        choices=["top", "bottom", "left", "right"],
        help="Where to label the points.")

    group = parser.add_argument_group(title="Line Appearance")
    group.add_argument(
        "--add_lines", action="store_true",
        help="Add lines that connect the points.")
    group.add_argument(
        "--scale_lines", default=1.0, type=float,
        help="Scale the thickness of the lines. Default 1.0")

    group = parser.add_argument_group(title="Identity Line")
    group.add_argument(
        "--add_identity_line", action="store_true",
        help="Add an identity line to the plot.")

    group = parser.add_argument_group(title="Colors")
    group.add_argument(
        "-c", "--cluster", action="append",
        help="Group samples into a cluster (e.g. -c 1-5); 1-based, "
        "inclusive.")
    group.add_argument(
        "--indexes_include_headers", "--iih", action="store_true",
        help="If not given (default), then index 1 is the first row "
        "with data. If given, then index 1 is the very first row "
        "in the file, including the headers.")
    group.add_argument(
        "--default_color", help="Default color of points. Format: #000000.")

    # Parse the input arguments.
    args = parser.parse_args()
    if not os.path.exists(args.datafile):
        parser.error("File not found: %s" % args.datafile)
    if args.width is not None:
        assert args.width > 10, "too small"
        assert args.width < 4096 * 16, "width too big"
    if args.height is not None:
        assert args.height > 10, "too small"
        assert args.height < 4096 * 16, "height too big"
    assert args.mar_bottom > 0 and args.mar_bottom < 10
    assert args.mar_left > 0 and args.mar_left < 10
    assert args.legend_inset >= 0 and args.legend_inset < 10
    if args.legend_loc is None:
        args.legend_loc = "bottomright"
    if args.default_color:
        assert len(args.default_color) == 7
        assert args.default_color[0] == "#"

    MATRIX = AnnotationMatrix.read(args.datafile, False)
    assert MATRIX.num_headers() and MATRIX.num_annots(), "Empty matrix."
    assert args.series, "Need to add a data --series to plot."
    if args.label_header:
        assert args.label_header in MATRIX.headers, \
            "header not found: %s" % args.label_header
    if args.label_size is not None:
        assert args.label_size > 0 and args.label_size <= 20
    assert args.scale_points > 0 and args.scale_points < 20
    assert args.scale_lines > 0 and args.scale_lines < 20

    series = _parse_series(MATRIX, args.series)
    cluster = None
    if args.cluster:
        cluster = _parse_cluster(
            args.cluster, args.indexes_include_headers, MATRIX)
    if len(series) > 1:
        assert not cluster, "Series and cluster not implemented."

    height = args.height or 2400
    width = args.width or 3200

    # Pull out the values and colors for the plot.
    default_color = "#000000"
    if args.default_color:
        default_color = args.default_color

    assert len(series) < len(colorlib.BREWER_QUALITATIVE_SET1)
    series_data = []  # list of (x_values, y_values, colors) for each series
    for i, (x_header, y_header) in enumerate(series):
        x = MATRIX[x_header]
        y = MATRIX[y_header]
        # Keep only the rows where both the X and the Y value are given.
        has_y = set(j for (j, a) in enumerate(y) if a)
        I = [j for (j, a) in enumerate(x) if a and j in has_y]
        x = [float(x[j]) for j in I]
        y = [float(y[j]) for j in I]
        assert len(x) == len(y)
        c = default_color
        if len(series) > 1:
            rgb = colorlib.BREWER_QUALITATIVE_SET1[i]
            c = colorlib.rgb2hex(rgb, prefix="#")
        series_data.append((x, y, [c] * len(x)))

    # Merge all the data points for each series.
    x_values = []
    y_values = []
    col = []
    for (x, y, c) in series_data:
        x_values.extend(x)
        y_values.extend(y)
        col.extend(c)
    assert len(x_values) == len(y_values)
    assert len(x_values) == len(col)

    if args.qq:
        O = jmath.order(x_values)
        x_values = [x_values[i] for i in O]
        y_values = [y_values[i] for i in O]
        col = [col[i] for i in O]

    if cluster is not None:
        # Points that are not in any cluster keep the default color.
        col_rgb = pcalib.choose_colors(cluster)
        col = [default_color] * len(col_rgb)
        for i, rgb in enumerate(col_rgb):
            if rgb is None:
                continue
            col[i] = colorlib.rgb2hex(rgb, prefix="#")
        assert len(col) == len(x_values)

    # Start R and set up the environment.
    R = jmath.start_R()

    main = jmath.R_var("NA")
    if args.title:
        main = args.title
    sub = ""
    # With a single series, the column headers are the default axis labels.
    xlab = ""
    ylab = ""
    if len(series) == 1:
        xlab, ylab = series[0]
    if args.xlab:
        xlab = args.xlab
    # Check --ylab itself.  (This used to test args.xlab, so --ylab was
    # ignored unless --xlab was also given.)
    if args.ylab:
        ylab = args.ylab

    lwd_box = 2
    lwd_axis = 2
    lwd_regr = 3
    cex = 1.0 * args.scale_points
    cex_lab = 1.5
    cex_main = 2.0
    cex_sub = 1.0
    plot_log = ""
    if args.log_x:
        plot_log += "x"
    if args.log_y:
        plot_log += "y"

    assert x_values
    assert y_values
    jmath.R_equals(x_values, "X")
    jmath.R_equals(y_values, "Y")

    bm_type = "png16m"
    if args.plot_file.lower().endswith(".pdf"):
        bm_type = "pdfwrite"
    jmath.R_fn(
        "bitmap", args.plot_file, type=bm_type,
        height=height, width=width, units="px", res=300)

    # Set the margins.
    base_margins = 5 * 1.2 * args.mar_bottom, 4 * 1.2 * args.mar_left, 4, 2
    mar = [margin + 0.1 for margin in base_margins]
    jmath.R_fn("par", mar=mar, RETVAL="op")

    jmath.R_fn(
        "plot", jmath.R_var("X"), jmath.R_var("Y"),
        main="", xlab="", ylab="", pch=19, cex=cex, log=plot_log,
        col=col, axes=jmath.R_var("FALSE"), RETVAL="x")

    if args.add_lines:
        lwd = 4 * args.scale_lines
        i = 0
        for (x, y, c) in series_data:
            # Cannot use c for the color.  It might've been changed by
            # --cluster.
            assert col and i < len(col)
            c = col[i:i + len(x)]
            i += len(x)

            # The "lines" function takes a scalar for col (except for
            # type=h, histogram vertical lines).  If there are
            # multiple colors, then split up the points based on the
            # colors.
            l_x, l_y, l_c = [], [], None
            for j in range(len(x)):
                if c[j] != l_c and l_x:
                    jmath.R_fn("lines", l_x, l_y, lwd=lwd, col=l_c)
                    # Start the next segment at the previous point so
                    # that the segments connect.
                    l_x = [l_x[-1]]
                    l_y = [l_y[-1]]
                l_x.append(x[j])
                l_y.append(y[j])
                l_c = c[j]
            if l_x:
                jmath.R_fn("lines", l_x, l_y, lwd=lwd, col=l_c)

    if args.add_identity_line:
        lwd = 4
        # Draw y=x over the range that X and Y have in common.
        x_min, x_max = min(x_values), max(x_values)
        y_min, y_max = min(y_values), max(y_values)
        iden_min = max(x_min, y_min)
        iden_max = min(x_max, y_max)
        l_x = [iden_min, iden_max]
        l_y = l_x
        l_c = "#FF0000"
        jmath.R_fn("lines", l_x, l_y, lwd=lwd, col=l_c)

    if args.label_header:
        label_cex = 1
        if args.label_size is not None:
            label_cex = args.label_size
        pos2specifier = {
            "top": 3,
            "bottom": 1,
            "left": 2,
            "right": 4,
        }
        pos = pos2specifier[args.label_pos]
        # NOTE(review): the labels come from every row of the file, while
        # X and Y drop rows with blank values (and are reordered by --qq),
        # so labels and points may not line up in those cases -- confirm.
        point_labels = MATRIX[args.label_header]
        jmath.R_fn(
            "text", jmath.R_var("X"), jmath.R_var("Y"),
            labels=point_labels, cex=label_cex, pos=pos)

    # Calculate correlation, and other statistics.
    # TODO: Should calculate this for each series.
    r = jmath.R("cor(X, Y)")
    p_value = jmath.R("cor.test(X, Y)$p.value")
    r = r[0]
    p_value = p_value[0]
    print("R = %.2f" % r)
    print("p = %.2g" % p_value)

    # Add a regression line.
    if args.add_regression:
        jmath.R("fit <- lm(Y ~ X)")
        coef = jmath.R("fit$coefficients")
        assert len(coef) == 2
        b, m = coef  # intercept, slope
        x1 = min(x_values)
        y1 = x1 * m + b
        x2 = max(x_values)
        y2 = x2 * m + b
        jmath.R_fn(
            "lines", [x1, x2], [y1, y2], lwd=lwd_regr, lty=2, col="#C63F31")
        sub = "R=%.2f (p=%.2g)" % (r, p_value)
        header = "X", "Y", "R", "p"
        print("\t".join(header))
        row = xlab, ylab, r, p_value
        print("\t".join(map(str, row)))

    if args.add_legend:
        leg = [y_header for (x_header, y_header) in series]
        # The legend needs exactly one fill color per series, so take the
        # color of the first point of each series (the default color if a
        # series has no points).
        fill = [
            c[0] if c else default_color for (x, y, c) in series_data
        ]
        # alpha does not seem to be supported here.
        jmath.R_fn(
            "legend", args.legend_loc, legend=leg, fill=fill,
            inset=args.legend_inset)

    if not args.no_box:
        jmath.R_fn("box", lwd=lwd_box)
    jmath.R_fn("axis", 1, lwd=lwd_axis, **{"cex.axis": 1.5})
    jmath.R_fn("axis", 2, lwd=lwd_axis, **{"cex.axis": 1.5})
    jmath.R_fn(
        "title", main=main, sub=sub, xlab=xlab, ylab=ylab,
        **{"cex.lab": cex_lab, "cex.main": cex_main, "cex.sub": cex_sub})
    R("par(op)")
    jmath.R_fn("dev.off")
def plot_boxplot(filename, group_names, group2values, height=None,
                 width=None, cluster=None, title="", subtitle="", sub="",
                 xlab="", ylab="", subtitle_size=1.0, subtitle_line=0.5,
                 subtitle_col="#000000", xlabel_size=1.0, xlabel_off=False,
                 mar_bottom=1.0, mar_left=1.0, mar_top=1.0):
    """Draw a boxplot with R and write it to filename.

    filename      Output image; PDF if it ends with ".pdf", else PNG.
    group_names   List of the names for each group (X-axis labels).
    group2values  Dictionary of group_name -> list of values (None values
                  are dropped).  Can also be a list, which is handed to R
                  unchanged as a matrix (values x groups).
    cluster       Optional; passed to pcalib.choose_colors to pick the
                  box colors.
    title         Main title.  subtitle goes under the title; sub goes
                  under the plot.
    xlabel_off    If true, no X-axis labels are drawn and the bottom
                  margin is shrunk.
    mar_*         Scaling factors for the plot margins.
    """
    from genomicode import config
    from genomicode import jmath
    from genomicode import colorlib
    from genomicode import pcalib

    # Start R and load the plotting library.
    R = jmath.start_R()
    plotlib = os.path.join(config.changlab_Rlib, "plotlib.R")
    assert os.path.exists(plotlib), "I cannot find: %s" % plotlib
    jmath.R_fn("source", plotlib)

    main = title if title else jmath.R_var("NA")

    xlabel = jmath.R_var("FALSE") if xlabel_off else group_names

    # Box colors; R NULL means use the default.
    col = jmath.R_var("NULL")
    if cluster is not None:
        col = [
            colorlib.rgb2hex(rgb).replace("0x", "#")
            for rgb in pcalib.choose_colors(cluster)
        ]

    lwd = 2
    las = 3  # vertical labels
    at = jmath.R_var("NULL")
    if xlabel != jmath.R_var("FALSE"):
        at = range(1, len(xlabel) + 1)
    cex_labels = 1.25 * xlabel_size
    cex_xlab = 1.5
    cex_sub = 1.5

    if type(group2values) is list:
        # Is matrix.  Should do more checking here.
        jmath.R_equals(group2values, "X")
    else:
        # Build an R list with one vector of values per group, in the
        # order of group_names.
        R("X <- list()")
        for position, name in enumerate(group_names, 1):
            values = [
                v for v in group2values.get(name, []) if v is not None
            ]
            jmath.R_equals(values, "s")
            R("X[[%d]] <- s" % position)

    jmath.R_equals(xlabel, "labels")
    jmath.R_equals(at, "at")

    if filename.lower().endswith(".pdf"):
        bm_type = "pdfwrite"
    else:
        bm_type = "png16m"
    jmath.R_fn("bitmap", filename, type=bm_type, height=height, width=width,
               units="px", res=300)

    # Set the margins.
    # default is 5.1, 4.1, 4.1, 2.1
    label_adjust = 1.0
    if xlabel == jmath.R_var("FALSE"):
        label_adjust = 0.2
    base_margins = (
        5 * 2.0 * mar_bottom * label_adjust,
        4 * 1.2 * mar_left,
        4 * mar_top,
        2,
    )
    mar = [margin + 0.1 for margin in base_margins]
    jmath.R_fn("par", mar=mar, RETVAL="op")

    # Draw once to set up the coordinates, paint the plot area solid
    # white, then draw the boxes again on top of it.
    jmath.R_fn("boxplot", jmath.R_var("X"), col=col, main="", xlab="",
               ylab="", axes=jmath.R_var("FALSE"), pch=19, cex=1,
               ylim=jmath.R_var("NULL"))
    jmath.R('usr <- par("usr")')
    jmath.R('rect(usr[1], usr[3], usr[2], usr[4], col="#FFFFFF")')
    jmath.R_fn("boxplot", jmath.R_var("X"), col=col, main="", xlab="",
               ylab="", axes=jmath.R_var("FALSE"), pch=19, cex=1,
               ylim=jmath.R_var("NULL"), add=jmath.R_var("TRUE"))

    jmath.R_fn("box", lwd=lwd)
    jmath.R_fn("axis", 1, lwd=lwd, labels=jmath.R_var("labels"),
               at=jmath.R_var("at"), las=las, **{"cex.axis": cex_labels})
    jmath.R_fn("axis", 2, lwd=lwd, **{"cex.axis": 1.5})
    title_style = {
        "cex.lab": cex_xlab,
        "cex.main": 2.0,
        "cex.sub": cex_sub,
        "col.sub": "#A60400",
    }
    jmath.R_fn("title", main=main, sub=sub, xlab=xlab, ylab=ylab,
               **title_style)
    if subtitle:
        jmath.R_fn("mtext", subtitle, cex=1.0 * subtitle_size,
                   line=subtitle_line, col=subtitle_col)
    R("par(op)")
    jmath.R_fn("dev.off")
def main():
    """Do a PCA on the columns (samples) of a matrix and plot PC1 vs PC2.

    Usage: prog [options] filename outfile.png

    Reads the matrix, optionally keeps only the --genes rows chosen by
    pcalib.select_genes_var, projects the columns onto their principal
    components, and draws the scatter plot with pcalib.plot_scatter.
    Optionally also writes the plotted points in Prism format and the
    principal components of the columns and of the rows to
    tab-delimited files.
    """
    from optparse import OptionParser, OptionGroup

    import numpy
    import arrayio
    from genomicode import jmath
    from genomicode import pcalib
    from genomicode import colorlib
    from genomicode import prismlib

    # Does a PCA on the columns.
    usage = "usage: %prog [options] filename outfile.png"
    parser = OptionParser(usage=usage, version="%prog 01")

    parser.add_option(
        "--num_header_cols", type=int,
        help="This number of columns are headers. If not given, will guess.")
    parser.add_option(
        "-g", "--genes", default=None, type="int",
        help="Number of genes to use.")
    parser.add_option(
        "--prism_file",
        help="Write the column principal components to a prism-formatted "
        "file.")
    parser.add_option(
        "--row_pc_file",
        help="Write the principal components of the rows to this file.")
    parser.add_option(
        "--col_pc_file",
        help="Write the principal components of the cols to this file.")

    group = OptionGroup(parser, "Clustering")
    parser.add_option_group(group)
    group.add_option(
        "-c", "--cluster", default=[], action="append",
        help="Group samples into a cluster (e.g. -c 1-5); 1-based.")
    group.add_option(
        "--indexes_include_headers", "--iih", action="store_true",
        help="If not given (default), then index 1 is the first column "
        "with data. If given, then index 1 is the very first column "
        "in the file, including the headers.")
    group.add_option(
        "--cluster_file",
        help="A KGG format file of the clusters for the samples. "
        "Clusters in this file can be 0-based or 1-based.")

    group = OptionGroup(parser, "Visualization")
    parser.add_option_group(group)
    group.add_option("--title", help="Put a title on the plot.")
    group.add_option(
        "--width", default=None, type="int",
        help="Width (in pixels) of the plot.")
    group.add_option(
        "--label", default=False, action="store_true",
        help="Label the samples.")
    group.add_option(
        "--label_axes", default=False, action="store_true",
        help="Label the axes.")
    group.add_option(
        "--scale_label", type=float, default=1.0,
        help="Scale the size of the labels.")

    # Parse the input arguments.
    options, args = parser.parse_args()
    if len(args) < 2:
        parser.error("Please specify an infile and an outfile.")
    elif len(args) > 2:
        parser.error("Too many input parameters (%d)." % len(args))
    filename, outfile = args
    if not os.path.exists(filename):
        parser.error("I could not find file %s." % filename)
    if options.num_header_cols is not None:
        assert options.num_header_cols > 0 and options.num_header_cols < 100
    if options.width is not None:
        assert options.width > 10, "too small"
        assert options.width < 4096 * 16, "width too big"
    assert options.scale_label > 0.01 and options.scale_label < 100
    # The --log_transform option is disabled, so this is always off.
    options.log_transform = False

    num_genes = options.genes

    MATRIX = read_matrix(filename, options.num_header_cols)
    if options.log_transform:
        MATRIX._X = jmath.log(MATRIX._X, base=2, safe=1)
    assert MATRIX.nrow() and MATRIX.ncol(), "Empty matrix."

    cluster = None
    if options.cluster and options.cluster_file:
        parser.error("Cannot specify clusters and a cluster file.")
    if options.cluster:
        cluster = _parse_cluster(
            options.cluster, options.indexes_include_headers, MATRIX)
    if options.cluster_file:
        if not os.path.exists(options.cluster_file):
            parser.error(
                "I could not find cluster file: %s" % options.cluster_file)
        cluster = _parse_cluster_file(options.cluster_file, MATRIX)

    # Select a subset of the genes.
    if num_genes:
        assert MATRIX.ncol() > 1, "Not enough samples to select genes."
        I = pcalib.select_genes_var(MATRIX._X, num_genes)
        MATRIX = MATRIX.matrix(I, None)

    # Calculate the principal components and plot them.
    K = min(MATRIX.nrow(), MATRIX.ncol())
    principal_components, perc_var = pcalib.svd_project_cols(MATRIX._X, K)
    X = [pc[0] for pc in principal_components]
    Y = [pc[1] for pc in principal_components]
    color = None
    if cluster is not None:
        color = pcalib.choose_colors(cluster)
    LABEL = None
    if options.label:
        LABEL = MATRIX.col_names(arrayio.COL_ID)
    assert not LABEL or len(LABEL) == len(X), "%d %d" % (len(X), len(LABEL))
    height = width = None
    if options.width is not None:
        # The plot is drawn at a 4:3 aspect ratio.
        height, width = int(options.width * 0.75), options.width
    pcalib.plot_scatter(
        X, Y, outfile, group=cluster, color=color, title=options.title,
        label=LABEL, xlabel=options.label_axes, ylabel=options.label_axes,
        scale_label=options.scale_label, height=height, width=width)

    # Write out the scatter plot in Prism format.
    if options.prism_file:
        # One series per cluster; a single series if there are no clusters.
        num_series = 1
        if cluster:
            num_series = max(cluster) + 1
        names = ["CLUSTER-%d" % (i + 1) for i in range(num_series)]
        sample_names = MATRIX.col_names(arrayio.COL_ID)
        DATA = {}
        rownames = {}
        for i in range(num_series):
            xy = []
            n = []
            for j in range(len(principal_components)):
                if cluster and cluster[j] != i:
                    continue
                x = principal_components[j][0]
                y = principal_components[j][1]
                xy.append([x, y])
                n.append(sample_names[j])
            if xy:
                DATA[names[i]] = xy
                rownames[names[i]] = n
        prismlib.write_scatterplot(options.prism_file, DATA, rownames)

    if options.col_pc_file:
        # Write out the principal components.
        assert cluster is None or len(cluster) == len(principal_components)
        pc_headers = [
            "PC%02d (%.2f%%)" % (i, 100 * perc_var[i]) for i in range(K)
        ]
        header = ["Index", "Sample", "Cluster", "Color"] + pc_headers
        sample_names = MATRIX.col_names(arrayio.COL_ID)
        # "with" closes the file even if one of the checks below fails.
        with open(options.col_pc_file, 'w') as handle:
            handle.write("\t".join(header) + "\n")
            for i in range(len(principal_components)):
                c = ""
                if color and color[i] is not None:
                    c = colorlib.rgb2hex(color[i])
                clust = ""
                if cluster is not None and cluster[i] is not None:
                    clust = cluster[i]
                row = [i + 1, sample_names[i], clust, c] + \
                    principal_components[i]
                assert len(row) == len(header)
                handle.write("\t".join(map(str, row)) + "\n")

    # Look at the principal components on the rows.
    if options.row_pc_file:
        row_names = MATRIX.row_names()
        pc_headers = [
            "PC%02d (%.2f%%)" % (i, 100 * perc_var[i]) for i in range(K)
        ]
        header = ["Index"] + row_names + pc_headers
        # Look up each annotation column once instead of once per row.
        annotations = [MATRIX.row_names(name) for name in row_names]
        # U  nrow x k  columns are principal components
        # V  k x ncol  rows are principal components
        U, s, V = numpy.linalg.svd(MATRIX._X, full_matrices=False)
        with open(options.row_pc_file, 'w') as handle:
            handle.write("\t".join(header) + "\n")
            for i in range(len(U)):
                # Three values need three placeholders; with only two,
                # a failing check raised TypeError instead of showing
                # the message.
                assert len(U[i]) == K, \
                    "%d %d %d" % (len(U), len(U[i]), K)
                n = [annot[i] for annot in annotations]
                row = [i + 1] + n + list(U[i])
                assert len(row) == len(header)
                handle.write("\t".join(map(str, row)) + "\n")