def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_filename):
    """Write the unique (chromosome, position) pairs found in a file.

    Reads the file at in_data.identifier as an annotation matrix whose
    first two headers must be "#CHROM" and "POS", and writes one
    tab-delimited "chrom<TAB>pos" line per distinct pair to
    out_filename, in order of first appearance.  An empty input file
    produces an empty output file.

    network, out_attributes, user_options and num_cores are not used
    here.  Returns None.
    """
    import os

    # If the file is empty, then just create an empty positions file.
    if os.path.getsize(in_data.identifier) == 0:
        with open(out_filename, 'w'):
            pass
        return

    # Only needed once there is something to parse.
    from genomicode import AnnotationMatrix

    M = AnnotationMatrix.read(in_data.identifier, header_char="##")

    # Headers are:
    # #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT [Samples...]
    # Pull out the #CHROM and POS columns.
    assert M.num_headers()
    assert M.headers[0] == "#CHROM"
    assert M.headers[1] == "POS"
    chrom_annots = M["#CHROM"]
    pos_annots = M["POS"]

    lines = []
    seen = set()
    for chrom, pos in zip(chrom_annots, pos_annots):
        key = chrom.strip(), pos.strip()
        # Keep only the first occurrence of each position.
        if key in seen:
            continue
        seen.add(key)
        lines.append("\t".join(key) + "\n")

    # Close the handle explicitly so the data is flushed before returning.
    with open(out_filename, 'w') as handle:
        handle.writelines(lines)
def main():
    """Plot a histogram of one column of a tab-delimited file using R.

    Parses the command line, reads the requested column from the data
    file, and draws the histogram to plot_file through the R "bitmap"
    device (PNG, or PDF if the file name ends with ".pdf").  If
    --prism_file is given, the R histogram object is passed to
    write_prism_file.

    Invalid option combinations are reported with assertions.
    """
    import os
    import argparse
    from genomicode import jmath
    from genomicode import AnnotationMatrix

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("datafile", help="Tab-delimited data file.")
    parser.add_argument("header", help="Which column contains data to plot.")
    parser.add_argument(
        "plot_file", help="Name of image file, e.g. outfile.png. "
        "Will generate PNG format by default. If this file name ends with "
        ".pdf, will generate a PDF file instead.")
    parser.add_argument(
        "--prism_file", help="Write Prism-formatted results to this file.")
    parser.add_argument(
        "--ignore_missing_values", action="store_true",
        help="Ignore missing values in the file.")

    group = parser.add_argument_group(title="Calculations")
    group.add_argument(
        "--breaks_seq",
        help="Set the breakpoints. Format: <start>,<stop>,<skip>.")
    group.add_argument(
        "--num_breaks", type=int, help="Number of breakpoints.")
    group.add_argument(
        "--ymax", type=int, help="Set the maximum value for the Y axis.")

    group = parser.add_argument_group(title="Plot Labels")
    group.add_argument("--title", help="Put a title on the plot.")
    group.add_argument("--xlab", help="Label the X-axis.")
    group.add_argument(
        "--xlabel_size", default=1.0, type=float,
        help="Scale the size of the labels on X-axis. Default 1.0.")
    group.add_argument(
        "--xlabel_off", action="store_true", help="Do not label the X axis.")
    group.add_argument(
        "--ylabel_off", action="store_true", help="Do not label the Y axis.")
    group.add_argument(
        "--xtick_label_off", action="store_true",
        help="Do not draw the tick labels on the X axis.")

    group = parser.add_argument_group(title="Colors")
    group.add_argument(
        "--bar_color", help="Set the color of the bars. Default #FFFFFF")
    palettes = _fmt_palettes()
    group.add_argument(
        "--bar_palette",
        help="Color the bars according to a palette: %s." % palettes)
    group.add_argument(
        "--symmetric_palette", action="store_true",
        help="Make the color symmetric.")

    group = parser.add_argument_group(title="Appearance")
    group.add_argument(
        "--height", type=int, help="Height (in pixels) of the plot.")
    group.add_argument(
        "--width", type=int, help="Width (in pixels) of the plot.")
    group.add_argument(
        "--mar_left", default=1.0, type=float,
        help="Scale margin at left of plot. Default 1.0 (no scaling).")
    group.add_argument(
        "--mar_bottom", default=1.0, type=float,
        help="Scale margin at bottom of plot. Default 1.0.")
    group.add_argument(
        "--xaxis_off", action="store_true", help="Do not show the X axis.")
    group.add_argument(
        "--yaxis_off", action="store_true", help="Do not show the Y axis.")

    # Parse the input arguments.
    args = parser.parse_args()
    if not os.path.exists(args.datafile):
        parser.error("File not found: %s" % args.datafile)
    assert not (args.breaks_seq and args.num_breaks)
    if args.num_breaks:
        assert args.num_breaks >= 2 and args.num_breaks <= 1000
    if args.width is not None:
        assert args.width > 10, "too small"
        assert args.width < 4096*16, "width too big"
    if args.height is not None:
        assert args.height > 10, "too small"
        assert args.height < 4096*16, "height too big"
    assert args.mar_bottom > 0 and args.mar_bottom < 10
    assert args.mar_left > 0 and args.mar_left < 10
    assert args.xlabel_size > 0 and args.xlabel_size < 10
    assert not (args.bar_color and args.bar_palette)
    assert not args.symmetric_palette or args.bar_palette
    assert args.ymax is None or args.ymax > 0

    height = args.height or 2400
    width = args.width or 3200

    MATRIX = AnnotationMatrix.read(args.datafile, False)
    assert MATRIX.num_headers() and MATRIX.num_annots(), "Empty matrix."
    assert args.header in MATRIX.headers, "header not found: %s" % args.header

    # Pull out the values for the histogram.
    raw_values = MATRIX[args.header]
    if args.ignore_missing_values:
        raw_values = [v for v in raw_values if v.strip()]
    # Build a real list: on Python 3, map() is a lazy iterator that is
    # always truthy, which would defeat the "assert values" check below.
    values = [float(v) for v in raw_values]

    value_min = value_max = None

    # Start R and set up the environment.
    R = jmath.start_R()

    main_title = jmath.R_var("NA")
    if args.title:
        main_title = args.title
    sub = ""
    xlab = ""
    if args.xlab:
        xlab = args.xlab
    ylab = "Frequency"
    xtick_labels = jmath.R_var("TRUE")
    ytick_labels = jmath.R_var("TRUE")

    if args.xlabel_off:
        xlab = ""
    if args.ylabel_off:
        ylab = ""
    if args.xtick_label_off:
        xtick_labels = jmath.R_var("FALSE")

    breaks = "Sturges"
    if args.breaks_seq:
        breaks = _parse_breaks_seq(args.breaks_seq)
        value_min, value_max = min(breaks), max(breaks)
        jmath.R_equals(breaks, "breaks")
        breaks = jmath.R_var("breaks")
    if args.num_breaks:
        breaks = args.num_breaks

    # Drop the values that fall outside explicitly given breakpoints.
    if value_min is not None:
        values = [v for v in values if v >= value_min]
    if value_max is not None:
        values = [v for v in values if v < value_max]

    lwd = 2
    cex_lab = 1.5
    cex_main = 2.0
    cex_sub = 1.5

    ylim = jmath.R_var("NULL")
    if args.ymax is not None:
        ylim = [0, args.ymax]

    assert values
    jmath.R_equals(values, "X")

    # Figure out the colors.  Do it after X is assigned.
    col = jmath.R_var("NULL")
    if args.bar_color:
        assert args.bar_color.startswith("#")
        col = args.bar_color
    elif args.bar_palette:
        # Run hist() without plotting to learn where the breaks fall.
        jmath.R_fn(
            "hist", jmath.R_var("X"), breaks=breaks,
            plot=jmath.R_var("FALSE"), RETVAL="x")
        breaks = [b for b in R["x"].rx2("breaks")]
        # NOTE(review): R's hist() returns one more break than it draws
        # bars, so this asks for two more colors than there are bars.
        # Left unchanged because _make_col_palette is not visible here
        # -- confirm whether len(breaks) - 1 is what was intended.
        num_bars = len(breaks) + 1
        col = _make_col_palette(
            args.bar_palette, num_bars, args.symmetric_palette)

    bm_type = "png16m"
    if args.plot_file.lower().endswith(".pdf"):
        bm_type = "pdfwrite"
    jmath.R_fn(
        "bitmap", args.plot_file, type=bm_type,
        height=height, width=width, units="px", res=300)

    # Set the margins.
    base_mar = 5*1.2*args.mar_bottom, 4*1.2*args.mar_left, 4, 2
    mar = [m+0.1 for m in base_mar]
    jmath.R_fn("par", mar=mar, RETVAL="op")

    jmath.R_fn(
        "hist", jmath.R_var("X"), breaks=breaks, main="", xlab="", ylab="",
        ylim=ylim, axes=jmath.R_var("FALSE"), col=col, RETVAL="x")
    # Make plot area solid white.
    #jmath.R('usr <- par("usr")')
    #jmath.R('rect(usr[1], usr[3], usr[2], usr[4], col="#FFFFFF")')
    #jmath.R_fn(
    #    "hist", jmath.R_var("X"), plot=jmath.R_var("FALSE"),
    #    main=main, xlab="", ylab="", axes=jmath.R_var("FALSE"),
    #    add=jmath.R_var("TRUE"))

    #jmath.R_fn("box", lwd=lwd)
    # x-axis
    if not args.xaxis_off:
        jmath.R_fn(
            "axis", 1, lwd=lwd, labels=xtick_labels, **{ "cex.axis" : 1.5 })
    # y-axis
    if not args.yaxis_off:
        jmath.R_fn(
            "axis", 2, lwd=lwd, labels=ytick_labels, **{ "cex.axis" : 1.5 })
    jmath.R_fn(
        "title", main=main_title, sub=sub, xlab=xlab, ylab=ylab,
        **{ "cex.lab" : cex_lab, "cex.main" : cex_main, "cex.sub" : cex_sub })
    R("par(op)")
    jmath.R_fn("dev.off")

    # R["x"] holds the result of the last hist() call made above.
    if args.prism_file:
        write_prism_file(args.prism_file, R["x"])
def main():
    """Align a set of expression and annotation matrices by sample.

    Each --express_file / --annot_file on the command line is paired,
    in order, with one positional outfile ("_" means "do not write this
    one").  A --header applies to the input file that precedes it.  By
    default only the samples common to every file are kept (inner
    join); --left_join and --outer_join change that.

    Relies on the module-level "os" and "sys" imports and on helper
    functions defined elsewhere in this file.  Invalid input is
    reported with assertions.
    """
    import argparse
    from genomicode import AnnotationMatrix as AM

    SKIP_OUTFILE = "_"

    parser = argparse.ArgumentParser(
        description="Align a set of matrices. Preserve the order of the "
        "first file given.")
    parser.add_argument("outfile", nargs="+")

    parser.add_argument(
        "--express_file", default=[], action="append", help="")
    parser.add_argument(
        "--annot_file", default=[], action="append", help="")
    parser.add_argument(
        "--header", default=[], action="append",
        help="Specify the header for an annotation file. Should come "
        "after the --annot_file that it refers to.")
    parser.add_argument(
        "--annot_path", help="Align all the annotation files in a path. "
        "If using this argument, no --annot_file or --express_file should "
        "be given. "
        "--header is still required, and should apply to at least one file. "
        'Only one "outfile" should be given, and it should refer to a path '
        "in which to store the aligned files.")
    #parser.add_argument(
    #    "--first_annot_header", help="If only aligning annotation files, "
    #    "find the samples to be matched under this header in the first "
    #    "annotation file.")
    parser.add_argument(
        "--clobber", default=False, action="store_true",
        help="Overwrite output files, if they already exist.")

    group = parser.add_argument_group(title="Comparisons")
    group.add_argument(
        "--case_insensitive", default=False, action="store_true",
        help="Do a case insensitive search of sample names.")
    group.add_argument(
        "--hash", default=False, action="store_true",
        help="Hash the sample names to [a-zA-Z0-9_] before comparison.")
    group.add_argument(
        "--ignore_nonalnum", default=False, action="store_true",
        help="Ignore non-alphanumeric characters in the IDs.")
    group.add_argument(
        "--ignore_blank", default=False, action="store_true",
        help="Ignore IDs that are blank (don't align them.")

    group = parser.add_argument_group(title="Joins")
    group.add_argument(
        "--strict", default=False, action="store_true",
        help="Complain if a file is missing a sample.")
    group.add_argument(
        "--left_join", default=False, action="store_true",
        help='By default, does an "inner join" and keeps only the '
        'records that are present in all files. A "left join" will '
        'keep all records that occur in the first file.')
    group.add_argument(
        "--outer_join", default=False, action="store_true",
        help='By default, does an "inner join" and keeps only the '
        'records that are present in all files. An "outer join" will '
        'also keep records that occur in any file.')

    group = parser.add_argument_group(title="Output")
    group.add_argument(
        "--null_string", default="",
        help='For left_join or outer_join, what to give the missing values.')
    group.add_argument(
        "--unaligned_only", action="store_true",
        help="Show only the rows that are not aligned.")
    group.add_argument(
        "--dont_add_missing_samples", action="store_true",
        help="If a matrix does not have a sample, don't fill in the value "
        "from another matrix.")

    group = parser.add_argument_group(title="Debug")
    group.add_argument(
        "--debug_nrows", type=int,
        help="Debugging: Only read this many rows from the annotation files.")

    args = parser.parse_args()

    # If the user specified an --annot_path, revise args to
    # contain --annot_files instead.
    sys.argv, args = _handle_annot_path(sys.argv, args)

    ni, no = len(args.express_file)+len(args.annot_file), len(args.outfile)
    assert ni == no, "Mismatch: %d inputs and %d outputs" % (ni, no)

    for filename in args.express_file + args.annot_file:
        assert os.path.exists(filename), \
            "I could not find file: %s" % filename
    for filename in args.outfile:
        if filename == SKIP_OUTFILE:
            continue
        assert args.clobber or not os.path.exists(filename), \
            "File exists: %s" % filename
    assert not (args.left_join and args.outer_join)
    if args.null_string:
        assert args.outer_join or args.left_join, \
            "null_string given, but only used for outer_join"

    # Align the outfiles to the expression and annotation files.  The
    # order in which the two kinds of input were interleaved is only
    # recoverable from sys.argv, so walk it and consume the parsed
    # lists from the front.
    express_files = args.express_file[:]
    annot_files = args.annot_file[:]
    outfiles = args.outfile[:]
    matrix_data = []  # list of (infile, outfile, is_express_file)
    for arg in sys.argv:
        if arg not in ["--express_file", "--annot_file"]:
            continue
        assert outfiles
        if arg == "--express_file":
            assert express_files
            x = express_files.pop(0), outfiles.pop(0), True
        else:
            assert annot_files
            x = annot_files.pop(0), outfiles.pop(0), False
        matrix_data.append(x)
    assert not express_files
    assert not annot_files
    assert not outfiles

    # Align the --header arguments to the annotation files.
    headers = [None] * len(matrix_data)
    header_i = -1
    for i, arg in enumerate(sys.argv):
        if arg == "--header":
            assert header_i >= 0, \
                "--header given before an --express_file or --annot_file."
            assert headers[header_i] is None, "Two --header for one file."
            headers[header_i] = sys.argv[i+1]
        elif arg in ["--express_file", "--annot_file"]:
            header_i += 1

    # Add the headers to the matrix_data.
    new_matrix_data = []  # list of (infile, outfile, is_express_file, header)
    for x, header in zip(matrix_data, headers):
        infile, outfile, is_express_file = x
        if is_express_file and header:
            raise NotImplementedError("No headers for --express_file.")
        x = infile, outfile, is_express_file, header
        new_matrix_data.append(x)
    matrix_data = new_matrix_data

    # Read each of the files.
    new_matrix_data = []  # list of (infile, outfile, matrix, header)
    for x in matrix_data:
        infile, outfile, is_express_file, header = x
        if is_express_file:
            data = read_express(infile)
        else:
            data = AM.read(infile, nrows=args.debug_nrows)
        x = infile, outfile, data, header
        new_matrix_data.append(x)
    matrix_data = new_matrix_data

    # Find the samples in each matrix.
    new_matrix_data = []  # list of (infile, outfile, matrix, header, samples)
    samples_hint = peek_samples_hint(matrix_data)
    headers_hint = [h for h in headers if h]
    for x in matrix_data:
        infile, outfile, matrix, header = x
        x = get_samples(
            matrix, header, samples_hint, headers_hint,
            args.case_insensitive, args.hash, args.ignore_nonalnum)
        assert x, "I could not find the samples for %s" % infile
        header, samples = x
        x = infile, outfile, matrix, header, samples
        new_matrix_data.append(x)
    matrix_data = new_matrix_data

    if args.left_join:
        assert not args.strict, "Can't do a strict left join."
        # No duplicates.
        samples = list_all_samples(
            matrix_data[:1], args.case_insensitive, args.hash,
            args.ignore_nonalnum)
        assert samples, "No samples."
    elif args.outer_join:
        assert not args.strict, "Can't do a strict outer join."
        samples = list_all_samples(
            matrix_data, args.case_insensitive, args.hash,
            args.ignore_nonalnum)
        assert samples, "No samples."
    else:
        # inner join
        samples = list_common_samples(
            matrix_data, args.case_insensitive, args.hash,
            args.ignore_nonalnum)
        assert samples, "No common samples found."

    # --strict is only reachable for the inner join; the other two
    # branches above reject it.
    if args.strict:
        all_samples = list_all_samples(
            matrix_data, args.case_insensitive, args.hash,
            args.ignore_nonalnum)
        common_samples = list_common_samples(
            matrix_data, args.case_insensitive, args.hash,
            args.ignore_nonalnum)
        if sorted(all_samples) != sorted(common_samples):
            missing_samples = []
            for sample in all_samples:
                i = find_sample(
                    common_samples, sample, args.case_insensitive, args.hash,
                    args.ignore_nonalnum, args.ignore_blank)
                if i >= 0:
                    continue
                missing_samples.append(sample)
            # Only show the first few names in the error message.
            short = missing_samples
            if len(short) > 10:
                short = short[:10] + ["..."]
            short = "\n".join(short)
            raise AssertionError(
                "%d samples not in all data sets.\n%s" %
                (len(missing_samples), short))

    # Align each of the matrices.
    matrix_data = align_matrices(
        matrix_data, samples, args.case_insensitive, args.hash,
        args.ignore_nonalnum, args.ignore_blank, args.left_join,
        args.outer_join, args.unaligned_only, args.null_string)

    # Add the missing samples back to the matrix.
    if not args.dont_add_missing_samples:
        matrix_data = add_missing_samples(matrix_data, args.null_string)

    # Write out each of the matrices.
    for x in matrix_data:
        infile, outfile, matrix, header, samples = x
        if outfile == SKIP_OUTFILE:
            continue
        write_matrix(outfile, matrix)
def main():
    """Draw a scatter plot of one or more column pairs using R.

    Parses the command line, reads the <x_header>;<y_header> columns
    named by each --series from the data file, and draws them to
    plot_file through the R "bitmap" device (PNG, or PDF if the file
    name ends with ".pdf").  Optionally adds connecting lines, an
    identity line, point labels, a regression line and a legend.  The
    correlation of X and Y and its p-value are printed to stdout.

    Invalid option combinations are reported with assertions.
    """
    import os
    import argparse
    from genomicode import jmath
    from genomicode import AnnotationMatrix
    from genomicode import colorlib
    from genomicode import pcalib

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("datafile", help="Tab-delimited data file.")
    #parser.add_argument("x_header", help="Which column for X values.")
    #parser.add_argument("y_header", help="Which column for Y values.")
    parser.add_argument(
        "plot_file", help="Name of image file, e.g. outfile.png. "
        "Will generate PNG format by default. If this file name ends with "
        ".pdf, will generate a PDF file instead.")

    group = parser.add_argument_group(title="Data Series")
    group.add_argument(
        "--series", action="append",
        help="Add a data series to the plot. At least one series must be "
        "plotted. Format: <x_header>;<y_header>")

    group = parser.add_argument_group(title="General Appearance")
    group.add_argument("--no_box", action="store_true",
                       help="Turn off the box around the plot.")
    group.add_argument("--height", type=int,
                       help="Height (in pixels) of the plot.")
    group.add_argument("--width", type=int,
                       help="Width (in pixels) of the plot.")
    group.add_argument(
        "--mar_left", default=1.0, type=float,
        help="Scale margin at left of plot. Default 1.0 (no scaling).")
    group.add_argument("--mar_bottom", default=1.0, type=float,
                       help="Scale margin at bottom of plot. Default 1.0.")
    #group.add_argument(
    #    "--xlabel_size", default=1.0, type=float,
    #    help="Scale the size of the labels on X-axis. Default 1.0.")
    group.add_argument("--log_x", action="store_true",
                       help="Plot the X-axis on a log scale.")
    group.add_argument("--log_y", action="store_true",
                       help="Plot the Y-axis on a log scale.")
    group.add_argument(
        "--qq", action="store_true",
        help="Make a QQ-plot. Will sort the values to be plotted.")

    group = parser.add_argument_group(title="Plot Labels")
    group.add_argument("--title", help="Put a title on the plot.")
    group.add_argument("--xlab", help="Label the X-axis.")
    group.add_argument("--ylab", help="Label the Y-axis.")
    group.add_argument("--add_regression", action="store_true",
                       help="Put a regression line on the plot.")

    group = parser.add_argument_group(title="Legend")
    group.add_argument("--add_legend", action="store_true",
                       help="Add a legend to the plot.")
    group.add_argument("--legend_inset", type=float, default=0.05, help="")
    LEGEND_LOCATIONS = [
        "bottomright", "bottom", "bottomleft", "left", "topleft", "top",
        "topright", "right", "center",
    ]
    group.add_argument("--legend_loc", choices=LEGEND_LOCATIONS,
                       help="Where to draw the legend.")

    group = parser.add_argument_group(title="Point Appearance")
    group.add_argument("--scale_points", default=1.0, type=float,
                       help="Scale the size of the points. Default 1.0")
    group.add_argument("--label_header",
                       help="Label each point with the values in this column.")
    group.add_argument("--label_size", type=float,
                       help="Scale the size of the labels by this value.")
    group.add_argument("--label_pos", default="top",
                       choices=["top", "bottom", "left", "right"],
                       help="Where to label the points.")

    group = parser.add_argument_group(title="Line Appearance")
    group.add_argument("--add_lines", action="store_true",
                       help="Add lines that connect the points.")
    group.add_argument("--scale_lines", default=1.0, type=float,
                       help="Scale the thickness of the lines. Default 1.0")

    group = parser.add_argument_group(title="Identity Line")
    group.add_argument("--add_identity_line", action="store_true",
                       help="Add an identity line to the plot.")

    group = parser.add_argument_group(title="Colors")
    group.add_argument(
        "-c", "--cluster", action="append",
        help="Group samples into a cluster (e.g. -c 1-5); 1-based, inclusive.")
    group.add_argument(
        "--indexes_include_headers", "--iih", action="store_true",
        help="If not given (default), then index 1 is the first row "
        "with data. If given, then index 1 is the very first row "
        "in the file, including the headers.")
    group.add_argument("--default_color",
                       help="Default color of points. Format: #000000.")

    # Parse the input arguments.
    args = parser.parse_args()
    if not os.path.exists(args.datafile):
        parser.error("File not found: %s" % args.datafile)
    if args.width is not None:
        assert args.width > 10, "too small"
        assert args.width < 4096 * 16, "width too big"
    if args.height is not None:
        assert args.height > 10, "too small"
        assert args.height < 4096 * 16, "height too big"
    assert args.mar_bottom > 0 and args.mar_bottom < 10
    assert args.mar_left > 0 and args.mar_left < 10
    #assert args.xlabel_size > 0 and args.xlabel_size < 10
    assert args.legend_inset >= 0 and args.legend_inset < 10
    if args.legend_loc is None:
        args.legend_loc = "bottomright"
    if args.default_color:
        assert len(args.default_color) == 7
        assert args.default_color[0] == "#"

    MATRIX = AnnotationMatrix.read(args.datafile, False)
    assert MATRIX.num_headers() and MATRIX.num_annots(), "Empty matrix."
    assert args.series, "Need to add a data --series to plot."
    #assert len(args.series) <= 1, "Not implemented."
    #assert args.x_header in MATRIX.headers, \
    #    "header not found: %s" % args.x_header
    #assert args.y_header in MATRIX.headers, \
    #    "header not found: %s" % args.y_header
    if args.label_header:
        assert args.label_header in MATRIX.headers, \
            "header not found: %s" % args.label_header
    if args.label_size is not None:
        assert args.label_size > 0 and args.label_size <= 20
    assert args.scale_points > 0 and args.scale_points < 20
    assert args.scale_lines > 0 and args.scale_lines < 20

    series = _parse_series(MATRIX, args.series)
    cluster = None
    if args.cluster:
        cluster = _parse_cluster(args.cluster, args.indexes_include_headers,
                                 MATRIX)
    if len(series) > 1:
        assert not cluster, "Series and cluster not implemented."

    height = args.height or 2400
    width = args.width or 3200

    # Pull out the values and colors for the plot.
    default_color = "#000000"
    if args.default_color:
        default_color = args.default_color

    assert len(series) < len(colorlib.BREWER_QUALITATIVE_SET1)
    series_data = []  # list of (x_values, y_values, col) for each series
    for i, (x_header, y_header) in enumerate(series):
        x_raw = MATRIX[x_header]
        y_raw = MATRIX[y_header]
        # Keep only the rows where both X and Y are present.  The set
        # makes the membership test constant time.
        has_y = set(j for (j, a) in enumerate(y_raw) if a)
        I = [j for (j, a) in enumerate(x_raw) if a and j in has_y]
        # Real lists, so that len() also works on Python 3.
        x = [float(x_raw[j]) for j in I]
        y = [float(y_raw[j]) for j in I]
        assert len(x) == len(y)
        c = default_color
        if len(series) > 1:
            rgb = colorlib.BREWER_QUALITATIVE_SET1[i]
            c = colorlib.rgb2hex(rgb, prefix="#")
        c = [c] * len(x)
        series_data.append((x, y, c))

    # Merge all the data point for each series.
    x_values = []
    y_values = []
    col = []
    for (x, y, c) in series_data:
        x_values.extend(x)
        y_values.extend(y)
        col.extend(c)
    assert len(x_values) == len(y_values)
    assert len(x_values) == len(col)

    if args.qq:
        O = jmath.order(x_values)
        x_values = [x_values[i] for i in O]
        y_values = [y_values[i] for i in O]
        col = [col[i] for i in O]

    if cluster is not None:
        col_rgb = pcalib.choose_colors(cluster)
        col = [default_color] * len(col_rgb)
        for i, rgb in enumerate(col_rgb):
            if rgb is None:
                continue
            col[i] = colorlib.rgb2hex(rgb, prefix="#")
    assert len(col) == len(x_values)

    #for i in range(len(x_values)):
    #    x = x_values[i], y_values[i], cluster[i], col[i]
    #    print "\t".join(map(str, x))

    # Start R and set up the environment.
    R = jmath.start_R()

    main_title = jmath.R_var("NA")
    if args.title:
        main_title = args.title
    sub = ""
    # With a single series, default the axis labels to its headers.
    xlab = ""
    if len(series) == 1:
        xlab = series[0][0]
    if args.xlab:
        xlab = args.xlab
    ylab = ""
    if len(series) == 1:
        ylab = series[0][1]
    # This used to test args.xlab, which set the Y label to None when
    # only --xlab was given and ignored --ylab when --xlab was absent.
    if args.ylab:
        ylab = args.ylab

    lwd_box = 2
    lwd_axis = 2
    lwd_regr = 3
    cex = 1.0 * args.scale_points
    cex_lab = 1.5
    cex_main = 2.0
    cex_sub = 1.0

    plot_log = ""
    if args.log_x:
        plot_log += "x"
    if args.log_y:
        plot_log += "y"

    assert x_values
    assert y_values
    jmath.R_equals(x_values, "X")
    jmath.R_equals(y_values, "Y")

    bm_type = "png16m"
    if args.plot_file.lower().endswith(".pdf"):
        bm_type = "pdfwrite"
    jmath.R_fn("bitmap", args.plot_file, type=bm_type,
               height=height, width=width, units="px", res=300)

    # Set the margins.
    base_mar = 5 * 1.2 * args.mar_bottom, 4 * 1.2 * args.mar_left, 4, 2
    mar = [m + 0.1 for m in base_mar]
    jmath.R_fn("par", mar=mar, RETVAL="op")

    jmath.R_fn("plot", jmath.R_var("X"), jmath.R_var("Y"),
               main="", xlab="", ylab="", pch=19, cex=cex, log=plot_log,
               col=col, axes=jmath.R_var("FALSE"), RETVAL="x")
    # Make plot area solid white.
    #jmath.R('usr <- par("usr")')
    #jmath.R('rect(usr[1], usr[3], usr[2], usr[4], col="#FFFFFF")')
    #jmath.R_fn(
    #    "hist", jmath.R_var("X"), plot=jmath.R_var("FALSE"),
    #    main=main, xlab="", ylab="", axes=jmath.R_var("FALSE"),
    #    add=jmath.R_var("TRUE"))

    if args.add_lines:
        lwd = 4 * args.scale_lines
        start = 0
        for (xs, ys, _) in series_data:
            # Cannot use the series color.  It might've been changed by
            # --cluster.
            assert col and start < len(col)
            colors = col[start:start + len(xs)]
            start += len(xs)

            # The "lines" function takes a scalar for col (except for
            # type=h, histogram vertical lines).  If there are
            # multiple colors, then split up the points based on the
            # colors.
            seg_x, seg_y, seg_col = [], [], None
            for j in range(len(xs)):
                if colors[j] != seg_col and seg_x:
                    jmath.R_fn("lines", seg_x, seg_y, lwd=lwd, col=seg_col)
                    # Add the previous point so that the points will
                    # connect.
                    seg_x = [seg_x[-1]]
                    seg_y = [seg_y[-1]]
                seg_x.append(xs[j])
                seg_y.append(ys[j])
                seg_col = colors[j]
            if seg_x:
                jmath.R_fn("lines", seg_x, seg_y, lwd=lwd, col=seg_col)

    if args.add_identity_line:
        lwd = 4
        x_min, x_max = min(x_values), max(x_values)
        y_min, y_max = min(y_values), max(y_values)
        # Draw only over the range that X and Y have in common.
        iden_min = max(x_min, y_min)
        iden_max = min(x_max, y_max)
        l_x = [iden_min, iden_max]
        l_y = l_x
        l_c = "#FF0000"
        jmath.R_fn("lines", l_x, l_y, lwd=lwd, col=l_c)

    if args.label_header:
        label_cex = 1
        if args.label_size is not None:
            label_cex = args.label_size
        # Values of the "pos" argument to R's text().
        pos2specifier = {
            "top": 3,
            "bottom": 1,
            "left": 2,
            "right": 4,
        }
        pos = pos2specifier[args.label_pos]
        point_labels = MATRIX[args.label_header]
        jmath.R_fn("text", jmath.R_var("X"), jmath.R_var("Y"),
                   labels=point_labels, cex=label_cex, pos=pos)

    # Calculate correlation, and other statistics.
    # TODO: Should calculate this for each series.
    r = jmath.R("cor(X, Y)")
    p_value = jmath.R("cor.test(X, Y)$p.value")
    r = r[0]
    p_value = p_value[0]
    print("R = %.2f" % r)
    print("p = %.2g" % p_value)

    # Add a regression line.
    # NOTE(review): the source this was restored from had lost its
    # indentation; the subtitle and the table below are assumed to
    # belong to this branch -- confirm.
    if args.add_regression:
        jmath.R("fit <- lm(Y ~ X)")
        coef = jmath.R("fit$coefficients")
        assert len(coef) == 2
        b, m = coef
        x1 = min(x_values)
        y1 = x1 * m + b
        x2 = max(x_values)
        y2 = x2 * m + b
        jmath.R_fn("lines", [x1, x2], [y1, y2],
                   lwd=lwd_regr, lty=2, col="#C63F31")
        sub = "R=%.2f (p=%.2g)" % (r, p_value)
        header = "X", "Y", "R", "p"
        print("\t".join(header))
        row = xlab, ylab, r, p_value
        print("\t".join(map(str, row)))

    if args.add_legend:
        leg = [s[1] for s in series]
        # NOTE(review): each entry here is the whole per-point color
        # list of a series, not a single color -- confirm this is what
        # legend(fill=...) should receive.
        fill = [s[-1] for s in series_data]
        #jmath.R("x <- rgb(0.5, 0.5, 0.5, 0.5)")
        # alpha does not seem to be supported here.
        jmath.R_fn("legend", args.legend_loc, legend=leg, fill=fill,
                   inset=args.legend_inset)

    if not args.no_box:
        jmath.R_fn("box", lwd=lwd_box)
    jmath.R_fn("axis", 1, lwd=lwd_axis, **{"cex.axis": 1.5})
    jmath.R_fn("axis", 2, lwd=lwd_axis, **{"cex.axis": 1.5})
    jmath.R_fn("title", main=main_title, sub=sub, xlab=xlab, ylab=ylab,
               **{
                   "cex.lab": cex_lab,
                   "cex.main": cex_main,
                   "cex.sub": cex_sub
               })
    R("par(op)")
    jmath.R_fn("dev.off")
def run(
    self, network, in_data, out_attributes, user_options, num_cores,
    out_filename):
    """Combine per-sample RSEM result files into one expression matrix.

    in_data.identifier must be a directory containing RSEM .results
    files.  out_attributes["expression_of"] ("gene" or "isoform")
    selects which result file of each sample is read, and
    out_attributes["preprocess"] ("tpm" or "fpkm") selects the column
    that is extracted.  The output is a tab-delimited file with the
    gene IDs, the transcript IDs, and one column per sample (sorted by
    sample name).

    Every sample must list the same gene and transcript IDs in the
    same order; this is checked with assertions.

    network, user_options and num_cores are not used here.  Returns an
    empty metadata dictionary.
    """
    import os
    from genomicode import jmath
    from genomicode import AnnotationMatrix
    from genomicode import alignlib
    #from Betsy import module_utils as mlib

    rsem_path = in_data.identifier
    assert os.path.exists(rsem_path)
    assert os.path.isdir(rsem_path)
    result_files = alignlib.find_rsem_result_files(rsem_path)
    assert result_files, "No .results files found."
    metadata = {}

    preprocess = out_attributes.get("preprocess")
    assert preprocess in ["tpm", "fpkm"]

    #x = mlib.get_user_option(
    #    user_options, "genes_or_isoforms", not_empty=True,
    #    allowed_values=["genes", "isoforms"])
    #get_genes = x == "genes"

    # Figure out whether to align to genome or transcriptome.
    expression_of = out_attributes["expression_of"]
    assert expression_of in ["gene", "isoform"]
    get_genes = expression_of == "gene"

    transcript_header = "transcript_id(s)"
    if not get_genes:
        transcript_header = "transcript_id"

    # For each of the gene files, get the expression data.
    sample2matrix = {}  # sample -> AnnotationMatrix
    for sample, gene_filename, isoform_filename in result_files:
        # Get the gene results.
        # TODO: Implement isoforms.
        filename = gene_filename
        if not get_genes:
            filename = isoform_filename
        # Name the sample: the file name itself is None when this fails.
        assert filename is not None, \
            "Missing results file for sample: %s" % sample
        #if filename is None:
        #    continue
        assert os.path.exists(filename)
        matrix = AnnotationMatrix.read(filename)
        # Do some checking on the matrix.
        assert "gene_id" in matrix.headers
        assert transcript_header in matrix.headers
        assert "TPM" in matrix.headers
        assert "FPKM" in matrix.headers
        sample2matrix[sample] = matrix
    assert sample2matrix, "No samples"

    gene_id = transcript_id = None
    # Pull out the gene and transcript IDs.  Use the first matrix seen
    # as the reference and make sure all the others agree with it.
    for matrix in sample2matrix.values():
        x1 = matrix["gene_id"]
        x2 = matrix[transcript_header]
        if gene_id is None:
            gene_id = x1
        if transcript_id is None:
            transcript_id = x2
        assert x1 == gene_id
        assert x2 == transcript_id
    assert gene_id
    assert transcript_id
    assert len(gene_id) == len(transcript_id)

    # Assemble into a gene expression matrix.
    header = "TPM"
    if preprocess == "fpkm":
        header = "FPKM"
    t_data = []  # matrix, where each row is a sample.
    t_data.append(gene_id)
    t_data.append(transcript_id)
    samples = []
    for sample in sorted(sample2matrix):
        matrix = sample2matrix[sample]
        exp = matrix[header]
        assert len(exp) == len(gene_id)
        t_data.append(exp)
        samples.append(sample)

    # Flip it so that each row is a gene and each column is a sample.
    data = jmath.transpose(t_data)
    header = ["gene_id", transcript_header] + samples
    data = [header] + data

    # Write out the data file.  The with-block makes sure the handle is
    # flushed and closed before returning.
    with open(out_filename, 'w') as handle:
        for row in data:
            handle.write("\t".join(map(str, row)) + "\n")

    return metadata