def write_matrix(outfile, matrix):
    """Write ``matrix`` to ``outfile`` with the writer matching its type.

    AnnotationMatrix instances are written with ``AnnotationMatrix.write``;
    every other object is handed to ``write_express``.
    """
    from genomicode import AnnotationMatrix as AM

    if not isinstance(matrix, AM.AnnotationMatrix):
        write_express(outfile, matrix)
        return
    AM.write(outfile, matrix)
def make_matrix(samples, callers, annot_header, annot_data, named_data,
                call_data):
    """Assemble a SimpleVariantMatrix from raw annotation and call data.

    annot_header  list of headers for annot_data.  Must start with
                  "Chrom", "Pos", "Ref", "Alt".
    annot_data    list of tuples: chrom, pos, ref, alt[, more]
    named_data    list of (name, headers, all_annots)
    call_data     list of tuples:
                  chrom, pos, ref, alt, sample, caller, call
                      chrom   string
                      pos     int
                      ref     string
                      alt     string
                      sample  string
                      caller  string
                      call    Call object

    Raises AssertionError if the first four headers are wrong, if two
    rows of annot_data share the same first four fields, or if a row's
    length differs from the number of headers.
    """
    from genomicode import AnnotationMatrix

    assert annot_header[:4] == ["Chrom", "Pos", "Ref", "Alt"]

    # No two rows may share the same (chrom, pos, ref, alt).
    coords_seen = set()
    for row in annot_data:
        coord = tuple(row[:4])
        assert coord not in coords_seen, "Duplicate"
        coords_seen.add(coord)

    # Every row must supply exactly one value per header.
    for row in annot_data:
        assert len(row) == len(annot_header)

    # Pivot the rows into one list of values per header.
    columns = [
        [row[j] for row in annot_data] for j in range(len(annot_header))]
    annot_matrix = AnnotationMatrix.create_from_annotations(
        annot_header, columns)

    # One (name, AnnotationMatrix) pair per entry of named_data.
    named_matrices = []
    for name, headers, all_annots in named_data:
        named = AnnotationMatrix.create_from_annotations(headers, all_annots)
        named_matrices.append((name, named))

    call_matrix = SparseCallMatrix(call_data)
    return SimpleVariantMatrix(
        samples, callers, annot_matrix, named_matrices, call_matrix)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_filename):
    """Write the unique (chrom, pos) pairs of the input to ``out_filename``.

    ``in_data.identifier`` is the path of the input file, whose headers
    are expected to be:
        #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT [Samples...]
    An empty input file yields an empty output file.  Otherwise the
    "#CHROM" and "POS" values are stripped, de-duplicated (the first
    occurrence is kept, order is preserved) and written one pair per
    line, separated by a tab.

    Returns None.  Raises AssertionError if the matrix has no headers or
    its first two headers are not "#CHROM" and "POS".
    """
    import os

    # If the file is empty, then just create an empty positions file.
    if os.path.getsize(in_data.identifier) == 0:
        # A context manager closes the handle right away instead of
        # leaving it to the garbage collector.
        with open(out_filename, 'w'):
            pass
        return

    # Only needed once there is something to parse.
    from genomicode import AnnotationMatrix

    M = AnnotationMatrix.read(in_data.identifier, header_char="##")

    # Pull out the #CHROM and POS columns.
    assert M.num_headers()
    assert M.headers[0] == "#CHROM"
    assert M.headers[1] == "POS"
    chrom_annots = M["#CHROM"]
    pos_annots = M["POS"]

    lines = []
    seen = set()
    for chrom, pos in zip(chrom_annots, pos_annots):
        coord = chrom.strip(), pos.strip()
        if coord in seen:
            continue
        seen.add(coord)
        lines.append("\t".join(coord) + "\n")

    # Close (and therefore flush) the output deterministically.
    with open(out_filename, 'w') as handle:
        handle.writelines(lines)
def filter_min_gene_expression_in_every_sample(MATRIX, gxp):
    """Keep rows whose gene expression is at least ``gxp`` in every sample.

    Every header starting with "Gene Expression" is one sample column.
    A cell may hold one number ("5.3") or several separated by commas
    ("0,0.379"); the largest number in the cell is the one compared.
    A row with an empty cell in any of those columns is dropped.

    ``gxp`` must be exactly a float.  Returns the result of
    ``AnnotationMatrix.rowslice`` on the kept row indexes.
    """
    from genomicode import AnnotationMatrix

    assert type(gxp) is float
    expr_headers = [
        h for h in MATRIX.headers if h.startswith("Gene Expression")]
    assert expr_headers, 'Missing: "Gene Expression" columns'

    def row_passes(i):
        # A row passes only if no sample column rejects it.
        for h in expr_headers:
            cell = MATRIX[h][i]
            if not cell:
                return False
            highest = max([float(v) for v in cell.split(",")])
            if highest < gxp:
                return False
        return True

    I_keep = [i for i in range(MATRIX.num_annots()) if row_passes(i)]
    return AnnotationMatrix.rowslice(MATRIX, I_keep)
def filter_min_coverage_in_every_sample(MATRIX, coverage):
    """Keep rows whose coverage is at least ``coverage`` in every sample.

    Every header starting with "Coverage" is one sample column whose
    cells look like Ref/Alt/VAF.  The coverage of a cell is Ref + Alt.
    A row with an empty cell in any of those columns is dropped.

    ``coverage`` must be exactly an int.  Raises AssertionError if no
    "Coverage" column exists or a non-empty cell does not have three
    "/"-separated fields.
    """
    from genomicode import AnnotationMatrix

    assert type(coverage) is int
    cov_headers = [h for h in MATRIX.headers if h.startswith("Coverage")]
    assert cov_headers, 'Missing: "Coverage" columns'

    def row_passes(i):
        for h in cov_headers:
            cell = MATRIX[h][i]
            if not cell:
                return False
            # Ref/Alt/VAF
            fields = cell.split("/")
            assert len(fields) == 3
            if int(fields[0]) + int(fields[1]) < coverage:
                return False
        return True

    I_keep = [i for i in range(MATRIX.num_annots()) if row_passes(i)]
    return AnnotationMatrix.rowslice(MATRIX, I_keep)
def filter_min_callers_in_any_sample(MATRIX, num_callers):
    """Keep rows called by at least ``num_callers`` callers in any sample.

    Every header starting with "Num Callers" is one sample column.  A
    row is kept as soon as one non-empty cell in those columns, read as
    an int, is >= ``num_callers``.  Empty cells are ignored.

    ``num_callers`` must be exactly an int.  Raises AssertionError if it
    is not, or if the matrix has no "Num Callers" column.  Returns the
    result of ``AnnotationMatrix.rowslice`` on the kept row indexes.
    """
    assert type(num_callers) is type(0)
    callers_h = [h for h in MATRIX.headers if h.startswith("Num Callers")]
    # The message used to name "Gene Expression" (copy-paste slip); it
    # now names the columns that are actually being looked for.
    assert callers_h, 'Missing: "Num Callers" columns'

    # Imported after validation so bad input fails before any loading.
    from genomicode import AnnotationMatrix

    I_keep = []
    for i in range(MATRIX.num_annots()):
        for h in callers_h:
            cell = MATRIX[h][i]
            if not cell:
                continue
            if int(cell) >= num_callers:
                I_keep.append(i)
                break
    return AnnotationMatrix.rowslice(MATRIX, I_keep)
def sort_vcf_file(filename):
    """Rewrite the VCF file at ``filename`` sorted by chromosome and position.

    The file is left untouched when no two adjacent records on the same
    chromosome have a decreasing position.  Otherwise the rows are
    reordered by a natural sort of "<chrom>:<pos>" and the file is
    written back in place.
    """
    from genomicode import vcflib
    from genomicode import jmath
    from genomicode import AnnotationMatrix

    vcf = vcflib.read(filename)
    chroms = vcf.matrix["#CHROM"]
    positions = [int(p) for p in vcf.matrix["POS"]]

    # Only neighbours on the same chromosome are compared; if none of
    # them is out of order there is nothing to do.
    out_of_order = any(
        chroms[i] == chroms[i + 1] and positions[i + 1] < positions[i]
        for i in range(len(chroms) - 1))
    if not out_of_order:
        return

    # Sort by CHROM and POS.
    sort_keys = ["%s:%d" % (c, positions[i]) for i, c in enumerate(chroms)]
    order = jmath.order_list(sort_keys, natural=True)
    vcf.matrix = AnnotationMatrix.rowslice(vcf.matrix, order)
    vcflib.write(filename, vcf)
def add_snpeff_to_svm(svm_file, snpeff_file, outfile):
    """Insert the SnpEff annotations into an SVM file, writing ``outfile``.

    If ``snpeff_file`` fails ``filelib.exists_nz``, ``svm_file`` is just
    copied to ``outfile``.  Otherwise every SnpEff column after the
    first four (Chrom, Pos, Ref, Alt) is added, under the header
    "SnpEff______<name>", right after the first four SVM columns.
    Variants without a SnpEff record get empty strings.
    """
    import shutil
    from genomicode import filelib
    from genomicode import SimpleVariantMatrix
    from genomicode import AnnotationMatrix

    if not filelib.exists_nz(snpeff_file):
        shutil.copy2(svm_file, outfile)
        return

    # Index the SnpEff records by (chrom, pos, ref, alt).
    # NOTE(review): if the file yields no rows, header stays None and
    # the slice below fails -- confirm whether that can happen.
    header = None   # includes Chrom, Pos, Ref, Alt
    coord2record = {}
    for d in filelib.read_row(snpeff_file, header=1):
        if header is None:
            header = d._header
        coord2record[(d.Chrom, d.Pos, d.Ref, d.Alt)] = d

    svm = SimpleVariantMatrix.read_as_am(svm_file)
    CHROM = svm.header2annots["______Chrom"]
    POS = svm.header2annots["______Pos"]
    REF = svm.header2annots["______Ref"]
    ALT = svm.header2annots["______Alt"]

    snpeff_header = header[4:]
    num_cols = len(snpeff_header)

    # One row of SnpEff values per SVM variant (row major).
    rows = []
    for i in range(len(CHROM)):
        d = coord2record.get((CHROM[i], POS[i], REF[i], ALT[i]))
        if d:
            row = d._cols[4:]
        else:
            row = [""] * num_cols
        assert len(row) == num_cols
        rows.append(row)
    assert len(rows) == len(CHROM)

    # AnnotationMatrix is column major.
    snpeff_annots = [[row[j] for row in rows] for j in range(num_cols)]

    # Convert the headers to SVM format.
    snpeff_header = ["SnpEff______%s" % name for name in snpeff_header]

    # Splice the new columns in after Chrom, Pos, Ref, Alt.
    headers = svm.headers[:4] + snpeff_header + svm.headers[4:]
    svm_annots = [svm.header2annots[h] for h in svm.headers_h]
    all_annots = svm_annots[:4] + snpeff_annots + svm_annots[4:]
    merged = AnnotationMatrix.create_from_annotations(
        headers, all_annots, headerlines=svm.headerlines)
    SimpleVariantMatrix.write_from_am(outfile, merged)
def align_annot(matrix, indexes, null_string):
    """Return a copy of ``matrix`` whose rows follow ``indexes``.

    ``indexes`` has one entry per output row: an index into the original
    annotations, or None when the output row has no source row, in which
    case every column gets ``null_string`` at that position.

    The headers (``matrix.headers`` and ``matrix.headers_h``) are passed
    through to the new AnnotationMatrix unchanged.
    """
    from genomicode import AnnotationMatrix as AM

    name2annots_new = {}
    # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    for name, annots in matrix.header2annots.items():
        # Identity test: a valid index of 0 must not be confused with
        # "missing", and None should be compared with "is".
        name2annots_new[name] = [
            annots[i] if i is not None else null_string for i in indexes]
    return AM.AnnotationMatrix(
        matrix.headers, matrix.headers_h, name2annots_new)
def check_matrix(X):
    """Validate the gene IDs and sample names of ``X`` and return a matrix.

    Gene IDs (the "NAME" row names) must be non-blank and unique.  If
    any sample name is empty or contains a character outside
    [a-zA-Z0-9_-], a deep copy of ``X`` is made and all of its sample
    names are replaced by hashed, uniquified versions; otherwise ``X``
    itself is returned.  Raises AssertionError when a check fails.
    """
    import re
    import arrayio
    import copy
    from genomicode import hashlib
    from genomicode import AnnotationMatrix

    assert arrayio.gct_format.is_matrix(X)

    # Make sure gene IDs (NAME) is unique and non-empty.
    assert X.row_names()[0].upper() == "NAME", \
        "Header of first column should be: NAME"
    gene_ids = set()
    for rownum, gene_id in enumerate(X.row_names("NAME"), 1):
        assert gene_id.strip(), "Empty gene ID in row %d." % rownum
        assert gene_id not in gene_ids, "Duplicate gene ID: %s" % gene_id
        gene_ids.add(gene_id)

    # Make sure sample names don't contain spaces or other
    # punctuation.  GSEA seems to be sensitive to these things.
    sample_names = X.col_names(arrayio.tdf.SAMPLE_NAME)
    has_bad_name = any(
        (not name) or re.search("[^a-zA-Z0-9_-]", name)
        for name in sample_names)

    # If there are bad names, try to fix them (on a copy of X).
    if has_bad_name:
        X = copy.deepcopy(X)
        sample_names = AnnotationMatrix.uniquify_headers(
            [hashlib.hash_var(name) for name in sample_names])
        header = X._resolve_synonym(
            arrayio.tdf.SAMPLE_NAME, X.col_names, X._synonyms)
        X._col_names[header] = sample_names

    # Make sure sample names are unique.
    names_seen = set()
    for name in sample_names:
        assert name not in names_seen, "Duplicate sample name: %s" % name
        names_seen.add(name)
    return X
def filter_sift_polyphen_damaging(MATRIX):
    """Keep rows predicted damaging by SIFT and both Polyphen2 models.

    A row is kept when the SIFT_pred value is "D" and both the
    Polyphen2_HDIV_pred and Polyphen2_HVAR_pred values are "D" or "P".
    Each of the three columns is located by header suffix and must match
    exactly one header (AssertionError otherwise).
    """
    from genomicode import AnnotationMatrix

    def only_column(suffix):
        # Exactly one header may end with the suffix.
        matches = [h for h in MATRIX.headers if h.endswith(suffix)]
        assert len(matches) == 1
        return MATRIX[matches[0]]

    sift_pred = only_column("SIFT_pred")
    hdiv_pred = only_column("Polyphen2_HDIV_pred")
    hvar_pred = only_column("Polyphen2_HVAR_pred")

    polyphen_hits = ("D", "P")
    predictions = zip(sift_pred, hdiv_pred, hvar_pred)
    I_keep = [
        i for i, (sift, hdiv, hvar) in enumerate(predictions)
        if sift == "D" and hdiv in polyphen_hits and hvar in polyphen_hits]
    return AnnotationMatrix.rowslice(MATRIX, I_keep)
def filter_linked_perc(MATRIX, args):
    """Keep rows whose "Perc Linked" value is at most ``args``.

    ``args`` is the cutoff (anything ``float()`` accepts) and must lie
    in [0, 100]; None disables the filter and returns ``MATRIX`` as is.
    Rows with an empty "Linkage______Perc Linked" value are always kept.
    """
    if args is None:
        return MATRIX
    from genomicode import AnnotationMatrix

    filter_perc = float(args)
    assert 0 <= filter_perc <= 100

    perc_linked = MATRIX["Linkage______Perc Linked"]
    I = [
        i for i, perc in enumerate(perc_linked)
        if perc == "" or float(perc) <= filter_perc]
    return AnnotationMatrix.rowslice(MATRIX, I)
def filter_min_callers(MATRIX, args, germline):
    """Keep rows with at least ``args`` callers in a non-germline sample.

    ``args`` is the minimum number of callers (1 <= args < 20); None
    disables the filter and returns ``MATRIX`` as is.  The
    "Num Callers" columns are found by header prefix; any of them whose
    hashed header ends with one of the strings in ``germline`` is
    ignored.  A row is kept when some remaining column holds a
    non-blank value that, read as an int, is >= ``args``.
    """
    if args is None:
        return MATRIX
    from genomicode import AnnotationMatrix

    num_callers = args
    assert num_callers >= 1 and num_callers < 20

    # Hashed headers of the "Num Callers" columns.
    candidates = [
        MATRIX.headers_h[i] for (i, h) in enumerate(MATRIX.headers)
        if h.startswith("Num Callers")]
    # Leave out the germline samples.
    headers_nc = [
        h for h in candidates
        if not any(h.endswith(g) for g in germline) and h]

    def has_sample(i):
        # True as soon as one sample reaches the caller threshold.
        for h in headers_nc:
            cell = MATRIX.header2annots[h][i]
            if cell.strip() and int(cell) >= num_callers:
                return True
        return False

    I_keep = [i for i in range(MATRIX.num_annots()) if has_sample(i)]
    return AnnotationMatrix.rowslice(MATRIX, I_keep)
def exonic_only(MATRIX, args):
    """Keep only the rows annotated as "exonic" when ``args`` is true.

    A falsy ``args`` returns ``MATRIX`` as is.  Otherwise the
    "Annovar______Func.refGene" value of each row is split on ";" and
    the row is kept only if one of the parts is exactly "exonic", e.g.
        exonic                  kept
        exonic;splicing         kept
        ncRNA_exonic;splicing   dropped
    """
    if not args:
        return MATRIX
    from genomicode import AnnotationMatrix

    header = "Annovar______Func.refGene"
    assert header in MATRIX.headers_h

    func = MATRIX.header2annots[header]
    I_keep = [i for i, f in enumerate(func) if "exonic" in f.split(";")]
    return AnnotationMatrix.rowslice(MATRIX, I_keep)
def filter_nonsynonymous(MATRIX):
    """Filter out synonymous variants.

    Requires the "Annovar______ExonicFunc.refGene" column.  Rows are
    kept when its value is "nonsynonymous SNV", "stopgain", "stoploss"
    or "frameshift substitution"; all other recognised values
    (including the empty string) are dropped.  An unrecognised value
    raises AssertionError.
    """
    from genomicode import AnnotationMatrix

    # Make sure annotated with Annovar.
    HEADER = "Annovar______ExonicFunc.refGene"
    assert HEADER in MATRIX.headers, "Missing: ExonicFunc.refGene"

    recognised = (
        "", "nonsynonymous SNV", "synonymous SNV",
        "stopgain", "stoploss",
        "frameshift substitution", "nonframeshift substitution",
        "unknown")
    wanted = (
        "nonsynonymous SNV", "stopgain", "stoploss",
        "frameshift substitution")

    I_keep = []
    for i, efunc in enumerate(MATRIX[HEADER]):
        assert efunc in recognised, "Unknown exonic_func: %s" % efunc
        if efunc in wanted:
            I_keep.append(i)
    return AnnotationMatrix.rowslice(MATRIX, I_keep)
def annotate_linked_variants(MATRIX, args):
    """Add a "Linkage______Score" column read from the file ``args``.

    A falsy ``args`` returns ``MATRIX`` as is.  Otherwise ``args`` is
    the name of a file with Chrom, Pos and Perc_Linked columns.  Each
    variant gets the Perc_Linked value (as a float) recorded for its
    (chrom, pos), or "" if the file has none.  The new column is placed
    right after the first four columns (Chrom, Pos, Ref, Alt).
    """
    if not args:
        return MATRIX
    from genomicode import filelib
    from genomicode import AnnotationMatrix

    link_file = args
    filelib.assert_exists_nz(link_file)

    # (chrom, pos) -> percent linked
    coord2perc = {}
    for d in filelib.read_row(link_file, header=1):
        link_chrom = d.Chrom
        link_pos = int(d.Pos)
        link_perc = float(d.Perc_Linked)
        coord2perc[(link_chrom, link_pos)] = link_perc

    chroms = MATRIX.header2annots["______Chrom"]
    positions = [int(p) for p in MATRIX.header2annots["______Pos"]]
    link_score = [
        coord2perc.get((chroms[i], positions[i]), "")
        for i in range(len(chroms))]

    # Add after:
    # Chrom, Pos, Ref, Alt
    header = "Linkage______Score"
    assert header not in MATRIX.headers
    headers = MATRIX.headers[:4] + [header] + MATRIX.headers[4:]
    all_annots = [link_score if h == header else MATRIX[h] for h in headers]
    return AnnotationMatrix.create_from_annotations(
        headers, all_annots, MATRIX.headerlines)
def main():
    """Command-line entry point: draw a histogram of one column with R.

    Reads the tab-delimited ``datafile``, converts the values under
    ``header`` to floats, and plots them with R's ``hist`` onto a bitmap
    device writing ``plot_file`` (type "pdfwrite" if the name ends with
    ".pdf", otherwise "png16m").  With --prism_file, the object returned
    by the final ``hist`` call is passed to ``write_prism_file``.

    Most option validation is done with ``assert``, so invalid
    combinations surface as AssertionError.
    """
    import os
    import argparse
    from genomicode import jmath
    from genomicode import AnnotationMatrix

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("datafile", help="Tab-delimited data file.")
    parser.add_argument("header", help="Which column contains data to plot.")
    parser.add_argument(
        "plot_file", help="Name of image file, e.g. outfile.png. "
        "Will generate PNG format by default. If this file name ends with "
        ".pdf, will generate a PDF file instead.")
    parser.add_argument(
        "--prism_file", help="Write Prism-formatted results to this file.")
    parser.add_argument(
        "--ignore_missing_values", action="store_true",
        help="Ignore missing values in the file.")

    group = parser.add_argument_group(title="Calculations")
    group.add_argument(
        "--breaks_seq",
        help="Set the breakpoints. Format: <start>,<stop>,<skip>.")
    group.add_argument(
        "--num_breaks", type=int, help="Number of breakpoints.")
    group.add_argument(
        "--ymax", type=int, help="Set the maximum value for the Y axis.")

    group = parser.add_argument_group(title="Plot Labels")
    group.add_argument("--title", help="Put a title on the plot.")
    group.add_argument("--xlab", help="Label the X-axis.")
    group.add_argument(
        "--xlabel_size", default=1.0, type=float,
        help="Scale the size of the labels on X-axis. Default 1.0.")
    group.add_argument(
        "--xlabel_off", action="store_true", help="Do not label the X axis.")
    group.add_argument(
        "--ylabel_off", action="store_true", help="Do not label the Y axis.")
    group.add_argument(
        "--xtick_label_off", action="store_true",
        help="Do not draw the tick labels on the X axis.")

    group = parser.add_argument_group(title="Colors")
    group.add_argument(
        "--bar_color", help="Set the color of the bars. Default #FFFFFF")
    x = _fmt_palettes()
    group.add_argument(
        "--bar_palette",
        help="Color the bars according to a palette: %s." % x)
    group.add_argument(
        "--symmetric_palette", action="store_true",
        help="Make the color symmetric.")

    group = parser.add_argument_group(title="Appearance")
    group.add_argument(
        "--height", type=int, help="Height (in pixels) of the plot.")
    group.add_argument(
        "--width", type=int, help="Width (in pixels) of the plot.")
    group.add_argument(
        "--mar_left", default=1.0, type=float,
        help="Scale margin at left of plot. Default 1.0 (no scaling).")
    group.add_argument(
        "--mar_bottom", default=1.0, type=float,
        help="Scale margin at bottom of plot. Default 1.0.")
    group.add_argument(
        "--xaxis_off", action="store_true", help="Do not show the X axis.")
    group.add_argument(
        "--yaxis_off", action="store_true", help="Do not show the Y axis.")

    # Parse the input arguments.
    args = parser.parse_args()
    if not os.path.exists(args.datafile):
        parser.error("File not found: %s" % args.datafile)
    # --breaks_seq and --num_breaks are mutually exclusive.
    assert not (args.breaks_seq and args.num_breaks)
    if args.num_breaks:
        assert args.num_breaks >= 2 and args.num_breaks <= 1000
    if args.width is not None:
        assert args.width > 10, "too small"
        assert args.width < 4096*16, "width too big"
    if args.height is not None:
        assert args.height > 10, "too small"
        assert args.height < 4096*16, "height too big"
    assert args.mar_bottom > 0 and args.mar_bottom < 10
    assert args.mar_left > 0 and args.mar_left < 10
    # NOTE(review): --xlabel_size is validated here but is not referenced
    # anywhere else in this function.
    assert args.xlabel_size > 0 and args.xlabel_size < 10
    # A fixed color and a palette are mutually exclusive, and
    # --symmetric_palette only makes sense together with a palette.
    assert not (args.bar_color and args.bar_palette)
    assert not args.symmetric_palette or args.bar_palette
    assert args.ymax is None or args.ymax > 0

    # Default plot size (pixels) when not given on the command line.
    height = args.height or 2400
    width = args.width or 3200

    MATRIX = AnnotationMatrix.read(args.datafile, False)
    assert MATRIX.num_headers() and MATRIX.num_annots(), "Empty matrix."
    assert args.header in MATRIX.headers, "header not found: %s" % args.header

    # Pull out the values for the histogram.
    x = MATRIX[args.header]
    if args.ignore_missing_values:
        x = [x for x in x if x.strip()]
    # NOTE(review): relies on map() returning a list (Python 2); the
    # "assert values" below would always pass on a Python 3 map object.
    values = map(float, x)

    value_min = value_max = None

    # Start R and set up the environment.
    R = jmath.start_R()

    # The local name "main" (the plot title) shadows this function
    # inside its own body.
    main = jmath.R_var("NA")
    if args.title:
        main = args.title
    sub = ""
    xlab = ""
    if args.xlab:
        xlab = args.xlab
    ylab = "Frequency"
    xtick_labels = jmath.R_var("TRUE")
    ytick_labels = jmath.R_var("TRUE")

    if args.xlabel_off:
        xlab = ""
    if args.ylabel_off:
        ylab = ""
    if args.xtick_label_off:
        xtick_labels = jmath.R_var("FALSE")

    breaks = "Sturges"
    if args.breaks_seq:
        breaks = _parse_breaks_seq(args.breaks_seq)
        # With explicit breakpoints, values outside them are dropped below.
        value_min, value_max = min(breaks), max(breaks)
        jmath.R_equals(breaks, "breaks")
        breaks = jmath.R_var("breaks")
    if args.num_breaks:
        breaks = args.num_breaks

    # Keep value_min <= value < value_max (the upper bound is exclusive).
    if value_min is not None:
        values = [x for x in values if x >= value_min]
    if value_max is not None:
        values = [x for x in values if x < value_max]

    lwd = 2
    cex_lab = 1.5
    cex_main = 2.0
    cex_sub = 1.5
    ylim = jmath.R_var("NULL")
    if args.ymax is not None:
        ylim = [0, args.ymax]

    assert values
    jmath.R_equals(values, "X")

    # Figure out the colors.  Do it after X is assigned.
    col = jmath.R_var("NULL")
    if args.bar_color:
        assert args.bar_color.startswith("#")
        col = args.bar_color
    elif args.bar_palette:
        # Figure out how many breaks there are.  Number of bars is num
        # breaks + 1.
        # NOTE(review): R's hist reports one more breakpoint than bars,
        # so len(breaks) - 1 may have been intended -- confirm.
        jmath.R_fn(
            "hist", jmath.R_var("X"), breaks=breaks,
            plot=jmath.R_var("FALSE"), RETVAL="x")
        # From here on "breaks" holds the breakpoints reported by R and
        # is what the final hist call below receives.
        breaks = [x for x in R["x"].rx2("breaks")]
        num_bars = len(breaks) + 1
        col = _make_col_palette(
            args.bar_palette, num_bars, args.symmetric_palette)

    bm_type = "png16m"
    if args.plot_file.lower().endswith(".pdf"):
        bm_type = "pdfwrite"
    jmath.R_fn(
        "bitmap", args.plot_file, type=bm_type,
        height=height, width=width, units="px", res=300)

    # Set the margins.
    x = 5*1.2*args.mar_bottom, 4*1.2*args.mar_left, 4, 2
    mar = [x+0.1 for x in x]
    jmath.R_fn("par", mar=mar, RETVAL="op")

    # Draw the bars only; axes and titles are added separately below.
    jmath.R_fn(
        "hist", jmath.R_var("X"), breaks=breaks, main="", xlab="", ylab="",
        ylim=ylim, axes=jmath.R_var("FALSE"), col=col, RETVAL="x")
    # Make plot area solid white.
    #jmath.R('usr <- par("usr")')
    #jmath.R('rect(usr[1], usr[3], usr[2], usr[4], col="#FFFFFF")')
    #jmath.R_fn(
    #    "hist", jmath.R_var("X"), plot=jmath.R_var("FALSE"),
    #    main=main, xlab="", ylab="", axes=jmath.R_var("FALSE"),
    #    add=jmath.R_var("TRUE"))

    #jmath.R_fn("box", lwd=lwd)
    # x-axis
    if not args.xaxis_off:
        jmath.R_fn(
            "axis", 1, lwd=lwd, labels=xtick_labels, **{ "cex.axis" : 1.5 })
    # y-axis
    if not args.yaxis_off:
        jmath.R_fn(
            "axis", 2, lwd=lwd, labels=ytick_labels, **{ "cex.axis" : 1.5 })
    jmath.R_fn(
        "title", main=main, sub=sub, xlab=xlab, ylab=ylab,
        **{ "cex.lab" : cex_lab, "cex.main" : cex_main, "cex.sub" : cex_sub })
    # Restore the saved graphics parameters and close the device.
    R("par(op)")
    jmath.R_fn("dev.off")

    if args.prism_file:
        write_prism_file(args.prism_file, R["x"])
def read_as_am(filename, is_csv=False):
    """Read a file in SVM format.  Return an AnnotationMatrix object.

    Does no special processing on any columns (i.e. no parsing as
    integers or Call objects).  Everything is a string.

    The file starts with three header rows.  Each column header of the
    result has the format
        <header0>___<header1>___<header2>
    where "blanks" in header0 and header1 are filled in from the column
    to the left.  E.g. "Annovar" occurs in each Annovar column in
    header0.

    Headers:
        ______Chrom
        ______Pos
        ______Ref
        ______Alt
        Num Callers______<Sample>
        ...

    The returned matrix also gets ``samples`` and ``callers``
    attributes: the distinct non-blank header0 and header1 values, in
    order of first appearance, of the columns whose header2 is
    "Ref/Alt/VAF".  ``is_csv`` selects "," instead of tab as delimiter.
    """
    from genomicode import filelib
    from genomicode import AnnotationMatrix

    delimiter = "," if is_csv else "\t"
    matrix = list(filelib.read_cols(filename, delimiter=delimiter))

    assert len(matrix) >= 3     # at least 3 rows for the header
    for i in range(1, len(matrix)):
        assert len(matrix[i]) == len(matrix[0])
    assert len(matrix[0]) >= 4  # Chrom, Pos, Ref, Alt
    assert len(matrix[0]) >= 5, "No calls"
    header0, header1, header2 = matrix[0], matrix[1], matrix[2]
    assert header2[:4] == ["Chrom", "Pos", "Ref", "Alt"]

    # Fill in the blanks for header1.  A blank header1 under a
    # non-blank header0 starts a new "block" and is left blank; this
    # has to run before header0 itself is filled in.
    for i in range(1, len(header1)):
        if not header1[i] and not header0[i]:
            header1[i] = header1[i - 1]
    # Fill in the blanks for header0.
    for i in range(1, len(header0)):
        if not header0[i]:
            header0[i] = header0[i - 1]

    def unique_nonblank(values):
        # Drop blanks and duplicates, preserving order.
        kept = []
        for v in values:
            if v and v not in kept:
                kept.append(v)
        return kept

    # The samples and callers are read off the Ref/Alt/VAF columns.
    I = [i for (i, h) in enumerate(header2) if h == "Ref/Alt/VAF"]
    assert I
    samples = unique_nonblank([header0[i] for i in I])
    callers = unique_nonblank([header1[i] for i in I])

    headers = ["___".join(parts) for parts in zip(header0, header1, header2)]
    data_rows = matrix[3:]
    all_annots = [[row[j] for row in data_rows] for j in range(len(headers))]

    matrix = AnnotationMatrix.create_from_annotations(headers, all_annots)
    matrix.samples = samples
    matrix.callers = callers
    return matrix
def main():
    """Command-line entry point: align the samples of a set of matrices.

    Each --express_file / --annot_file is paired, in command-line order,
    with one positional ``outfile``; an outfile of "_" means "do not
    write this one".  A --header applies to the most recent
    --express_file / --annot_file before it on the command line.  The
    samples to keep come from an inner join (default), a left join or an
    outer join; every matrix is aligned to them and written out.

    NOTE(review): ``sys`` and ``os`` are used below but not imported in
    this function -- presumably they are imported at module level.
    """
    import argparse
    from genomicode import AnnotationMatrix as AM

    # An outfile with this name is skipped when writing.
    SKIP_OUTFILE = "_"

    parser = argparse.ArgumentParser(
        description="Align a set of matrices. Preserve the order of the "
        "first file given.")
    parser.add_argument("outfile", nargs="+")
    parser.add_argument(
        "--express_file", default=[], action="append", help="")
    parser.add_argument(
        "--annot_file", default=[], action="append", help="")
    parser.add_argument(
        "--header", default=[], action="append",
        help="Specify the header for an annotation file. Should come "
        "after the --annot_file that it refers to.")
    parser.add_argument(
        "--annot_path", help="Align all the annotation files in a path. "
        "If using this argument, no --annot_file or --express_file should "
        "be given. "
        "--header is still required, and should apply to at least one file. "
        'Only one "outfile" should be given, and it should refer to a path '
        "in which to store the aligned files.")
    #parser.add_argument(
    #    "--first_annot_header", help="If only aligning annotation files, "
    #    "find the samples to be matched under this header in the first "
    #    "annotation file.")
    parser.add_argument(
        "--clobber", default=False, action="store_true",
        help="Overwrite output files, if they already exist.")

    group = parser.add_argument_group(title="Comparisons")
    group.add_argument(
        "--case_insensitive", default=False, action="store_true",
        help="Do a case insensitive search of sample names.")
    group.add_argument(
        "--hash", default=False, action="store_true",
        help="Hash the sample names to [a-zA-Z0-9_] before comparison.")
    group.add_argument(
        "--ignore_nonalnum", default=False, action="store_true",
        help="Ignore non-alphanumeric characters in the IDs.")
    group.add_argument(
        "--ignore_blank", default=False, action="store_true",
        help="Ignore IDs that are blank (don't align them.")

    group = parser.add_argument_group(title="Joins")
    group.add_argument(
        "--strict", default=False, action="store_true",
        help="Complain if a file is missing a sample.")
    group.add_argument(
        "--left_join", default=False, action="store_true",
        help='By default, does an "inner join" and keeps only the '
        'records that are present in all files. A "left join" will '
        'keep all records that occur in the first file.')
    group.add_argument(
        "--outer_join", default=False, action="store_true",
        help='By default, does an "inner join" and keeps only the '
        'records that are present in all files. An "outer join" will '
        'also keep records that occur in any file.')

    group = parser.add_argument_group(title="Output")
    group.add_argument(
        "--null_string", default="",
        help='For left_join or outer_join, what to give the missing values.')
    group.add_argument(
        "--unaligned_only", action="store_true",
        help="Show only the rows that are not aligned.")
    group.add_argument(
        "--dont_add_missing_samples", action="store_true",
        help="If a matrix does not have a sample, don't fill in the value "
        "from another matrix.")

    group = parser.add_argument_group(title="Debug")
    group.add_argument(
        "--debug_nrows", type=int,
        help="Debugging: Only read this many rows from the annotation files.")

    args = parser.parse_args()
    # If the user specified an --annot_path, revise args to
    # contain --annot_files instead.
    # (sys.argv is replaced as well, because the pairing of files,
    # outfiles and headers below is done by scanning sys.argv.)
    sys.argv, args = _handle_annot_path(sys.argv, args)

    # There must be exactly one outfile per input file.
    ni, no = len(args.express_file)+len(args.annot_file), len(args.outfile)
    assert ni == no, "Mismatch: %d inputs and %d outputs" % (ni, no)
    for x in args.express_file + args.annot_file:
        assert os.path.exists(x), "I could not find file: %s" % x
    for x in args.outfile:
        if x == SKIP_OUTFILE:
            continue
        assert args.clobber or not os.path.exists(x), "File exists: %s" % x
    assert not (args.left_join and args.outer_join)
    if args.null_string:
        assert args.outer_join or args.left_join, \
            "null_string given, but only used for outer_join"

    # Align the outfiles to the expression and annotation files.
    # argparse stores each kind of file in its own list, so the relative
    # order of --express_file and --annot_file is recovered from sys.argv.
    express_file = args.express_file[:]
    annot_file = args.annot_file[:]
    outfile = args.outfile[:]
    matrix_data = []  # list of (infile, outfile, is_express_file)
    for arg in sys.argv:
        if arg not in ["--express_file", "--annot_file"]:
            continue
        assert outfile
        if arg == "--express_file":
            assert express_file
            x = express_file.pop(0), outfile.pop(0), True
        else:
            assert annot_file
            x = annot_file.pop(0), outfile.pop(0), False
        matrix_data.append(x)
    # Everything must have been paired up.
    assert not express_file
    assert not annot_file
    assert not outfile

    # Align the --header arguments to the annotation files.
    # header_i is the index (into matrix_data) of the most recent file
    # option seen so far.
    headers = [None] * len(matrix_data)
    header_i = -1
    for i, arg in enumerate(sys.argv):
        if arg == "--header":
            assert header_i >= 0, \
                "--header given before an --express_file or --annot_file."
            assert headers[header_i] is None, "Two --header for one file."
            headers[header_i] = sys.argv[i+1]
        elif arg in ["--express_file", "--annot_file"]:
            header_i += 1

    # Add the headers to the matrix_data.
    new_matrix_data = []  # list of (infile, outfile, is_express_file, header)
    for i in range(len(matrix_data)):
        infile, outfile, is_express_file = matrix_data[i]
        if is_express_file and headers[i]:
            raise NotImplementedError, "No headers for --express_file."
        x = infile, outfile, is_express_file, headers[i]
        new_matrix_data.append(x)
    matrix_data = new_matrix_data

    # Read each of the files.
    new_matrix_data = []  # list of (infile, outfile, matrix, header)
    for x in matrix_data:
        infile, outfile, is_express_file, header = x
        if is_express_file:
            data = read_express(infile)
        else:
            data = AM.read(infile, nrows=args.debug_nrows)
        x = infile, outfile, data, header
        new_matrix_data.append(x)
    matrix_data = new_matrix_data

    # Find the samples in each matrix.
    new_matrix_data = []  # list of (infile, outfile, matrix, header, samples)
    samples_hint = peek_samples_hint(matrix_data)
    for x in matrix_data:
        infile, outfile, matrix, header = x
        # All the headers the user gave, for any file.
        headers_hint = [x for x in headers if x]
        x = get_samples(
            matrix, header, samples_hint, headers_hint,
            args.case_insensitive, args.hash, args.ignore_nonalnum)
        assert x, "I could not find the samples for %s" % infile
        header, samples = x
        x = infile, outfile, matrix, header, samples
        new_matrix_data.append(x)
    matrix_data = new_matrix_data

    # Choose the samples to align to, according to the join type.
    if args.left_join:
        assert not args.strict, "Can't do a strict left join."
        # No duplicates.
        samples = list_all_samples(
            matrix_data[:1], args.case_insensitive, args.hash,
            args.ignore_nonalnum)
        assert samples, "No samples."
    elif args.outer_join:
        assert not args.strict, "Can't do a strict outer join."
        samples = list_all_samples(
            matrix_data, args.case_insensitive, args.hash,
            args.ignore_nonalnum)
        assert samples, "No samples."
    else:
        # inner join
        samples = list_common_samples(
            matrix_data, args.case_insensitive, args.hash,
            args.ignore_nonalnum)
        assert samples, "No common samples found."

    # NOTE(review): the original nesting of this check (inside the
    # inner-join branch or at this level) could not be recovered.  With
    # the asserts above active, --strict only gets here for an inner
    # join either way.
    if args.strict:
        all_samples = list_all_samples(
            matrix_data, args.case_insensitive, args.hash,
            args.ignore_nonalnum)
        common_samples = list_common_samples(
            matrix_data, args.case_insensitive, args.hash,
            args.ignore_nonalnum)
        if sorted(all_samples) != sorted(common_samples):
            missing_samples = []
            for x in all_samples:
                i = find_sample(
                    common_samples, x, args.case_insensitive, args.hash,
                    args.ignore_nonalnum, args.ignore_blank)
                if i >= 0:
                    continue
                missing_samples.append(x)
            # Show at most 10 of the missing samples in the message.
            short = missing_samples
            if len(short) > 10:
                short = short[:10] + ["..."]
            short = "\n".join(short)
            raise AssertionError, "%d samples not in all data sets.\n%s" % \
                  (len(missing_samples), short)

    # Align each of the matrices.
    matrix_data = align_matrices(
        matrix_data, samples, args.case_insensitive, args.hash,
        args.ignore_nonalnum, args.ignore_blank, args.left_join,
        args.outer_join, args.unaligned_only, args.null_string)

    # Add the missing samples back to the matrix.
    if not args.dont_add_missing_samples:
        matrix_data = add_missing_samples(matrix_data, args.null_string)

    # Write out each of the matrices.
    for x in matrix_data:
        infile, outfile, matrix, header, samples = x
        if outfile == SKIP_OUTFILE:
            continue
        write_matrix(outfile, matrix)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    """Add linkage annotations to a SimpleVariantMatrix file.

    Reads the SVM file at ``in_data.identifier`` and the file named by
    the "linked_variants_file" user option (columns Chrom, Pos,
    Perc_Linked, p).  Two columns are inserted after the first four SVM
    columns:
        Linkage______Perc Linked   the Perc_Linked value, as read
        Linkage______Score         -10 * log10(p), or 1000 when p is
                                   below 10**-100
    Variants without linkage information get empty strings.  The merged
    matrix is written to ``outfile``.
    """
    import math
    from genomicode import filelib
    from genomicode import jmath
    from genomicode import AnnotationMatrix
    from genomicode import SimpleVariantMatrix
    from Betsy import module_utils as mlib

    svm_node = in_data
    filelib.assert_exists_nz(svm_node.identifier)

    linked_file = mlib.get_user_option(
        user_options, "linked_variants_file", not_empty=True,
        check_file=True)

    # Read the variant file.
    SVM = SimpleVariantMatrix.read_as_am(svm_node.identifier)
    CHROM = SVM["______Chrom"]
    POS = [int(p) for p in SVM["______Pos"]]
    svm_coords = set(zip(CHROM, POS))

    # Read the linked variant file, keeping only the variants that are
    # also in the SVM.
    # Chrom  Pos  Perc Linked  p
    coord2info = {}   # (chrom, pos) -> d
    for d in filelib.read_row(linked_file, header=1):
        pos = int(d.Pos)
        coord = (d.Chrom, pos)
        if coord in svm_coords:
            coord2info[coord] = d

    # Align the linked annotations to the matrix.
    MAX_SCORE = 1000
    min_p = 10**-(MAX_SCORE / 10)   # p-values below this get MAX_SCORE
    linked_headers = ["Perc Linked", "Score"]
    annotations = []
    for coord in zip(CHROM, POS):
        if coord not in coord2info:
            annotations.append([""] * len(linked_headers))
            continue
        d = coord2info[coord]
        p_value = float(d.p)
        if p_value >= min_p:
            score = -10 * math.log(p_value, 10)
        else:
            score = MAX_SCORE
        row = d.Perc_Linked, score
        assert len(row) == len(linked_headers)
        annotations.append(row)

    # Convert the headers and annotations to SVM format.
    linked_headers = ["Linkage______%s" % h for h in linked_headers]
    linked_annotations = jmath.transpose(annotations)

    # Make the new SimpleVariantMatrix.  The new columns go right after
    # the first four columns.
    INDEX = 4
    headers = SVM.headers[:INDEX] + linked_headers + SVM.headers[INDEX:]
    svm_annots = [SVM.header2annots[h] for h in SVM.headers_h]
    all_annots = (
        svm_annots[:INDEX] + linked_annotations + svm_annots[INDEX:])
    merged = AnnotationMatrix.create_from_annotations(
        headers, all_annots, headerlines=SVM.headerlines)
    SimpleVariantMatrix.write_from_am(outfile, merged)
def run(
    self, network, in_data, out_attributes, user_options, num_cores,
    outfile):
    """Add COSMIC annotations to a SimpleVariantMatrix file.

    Reads the variant matrix at in_data.identifier and the file given
    by the "cosmic_variants_file" user option, then writes a copy of
    the matrix to outfile with "COSMIC______*" columns inserted after
    any Annovar / SnpEff columns (or after the first four columns if
    neither is present).

    A COSMIC record covers the inclusive range [Start, End] on its
    chromosome.  Every matrix row whose (chrom, pos) falls inside that
    range receives the record's annotations; if several records cover
    the same coordinate, the last one in the file wins.  Rows that no
    record covers get empty strings.
    """
    from genomicode import filelib
    from genomicode import jmath
    from genomicode import AnnotationMatrix
    from genomicode import SimpleVariantMatrix
    from Betsy import module_utils as mlib

    svm_node = in_data
    filelib.assert_exists_nz(svm_node.identifier)

    cosmic_file = mlib.get_user_option(
        user_options, "cosmic_variants_file", not_empty=True,
        check_file=True)

    # Read the variant file.
    SVM = SimpleVariantMatrix.read_as_am(svm_node.identifier)
    CHROM = SVM["______Chrom"]
    POS = SVM["______Pos"]
    POS = [int(x) for x in POS]
    row_coords = list(zip(CHROM, POS))
    all_coords = set(row_coords)

    # Read the COSMIC variant file.
    # Chrom  Start  End  GRCh  Count  SNP
    # Mutation CDS  Mutation AA
    # FATHMM prediction  FATHMM score  Mutation somatic status
    coord2info = {}   # (chrom, pos) -> d
    for d in filelib.read_row(cosmic_file, header=1):
        start, end = int(d.Start), int(d.End)
        # Record this entry under EVERY matrix coordinate it covers.
        # Stopping at the first hit would leave other rows inside the
        # same range without annotations.
        for pos in range(start, end+1):
            coord = d.Chrom, pos
            if coord in all_coords:
                coord2info[coord] = d

    # Align the COSMIC annotations to the matrix.
    cosmic_headers = [
        "SNP", "Num Tumors", "Mutation CDS", "Mutation AA",
        "FATHMM prediction", "FATHMM score", "Mutation somatic status"]
    annotations = []
    for coord in row_coords:
        if coord not in coord2info:
            x = [""] * len(cosmic_headers)
            annotations.append(x)
            continue
        d = coord2info[coord]
        # "Num Tumors" is filled from the Count column.
        x = d.SNP, d.Count, d.Mutation_CDS, d.Mutation_AA, \
            d.FATHMM_prediction, d.FATHMM_score, \
            d.Mutation_somatic_status
        assert len(x) == len(cosmic_headers)
        annotations.append(x)
    # Convert the headers and annotations to SVM format.
    cosmic_headers = ["COSMIC______%s" % x for x in cosmic_headers]
    cosmic_annotations = jmath.transpose(annotations)

    # Make the new SimpleVariantMatrix.
    # Figure out where to put these annotations: after the first four
    # columns, and after the last Annovar or SnpEff column if present.
    INDEX = 4
    for prefix in ("ANNOVAR", "SNPEFF"):
        I = [i for (i, x) in enumerate(SVM.headers)
             if x.upper().startswith(prefix)]
        if I:
            INDEX = max(INDEX, max(I)+1)
    headers = SVM.headers[:INDEX] + cosmic_headers + SVM.headers[INDEX:]
    x = [SVM.header2annots[h] for h in SVM.headers_h]
    all_annots = x[:INDEX] + cosmic_annotations + x[INDEX:]
    merged = AnnotationMatrix.create_from_annotations(
        headers, all_annots, headerlines=SVM.headerlines)

    SimpleVariantMatrix.write_from_am(outfile, merged)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_filename):
    """Convert a per-call "simple" variant file to a SimpleVariantMatrix.

    Reads every row of in_data.identifier (fields used: Chrom, Pos,
    Ref, Alt, Sample, Caller, Source, Num_Ref, Num_Alt, VAF) and
    writes a matrix to out_filename with one row per
    (Chrom, Pos, Ref, Alt) and these columns:

        ______<Chrom|Pos|Ref|Alt>
        Num Callers______<sample>         number of distinct callers
        <sample>___<caller>___Ref/Alt/VAF formatted call

    Only rows whose Source is "DNA" contribute calls.  Returns an
    empty metadata dict.
    """
    from genomicode import filelib
    from genomicode import SimpleVariantMatrix
    from genomicode import AnnotationMatrix

    simple_file = in_data.identifier
    metadata = {}

    # Read all in memory.  Hopefully, not too big.
    ds = list(filelib.read_row(simple_file, header=-1))

    # MuSE sometimes has alternates.
    # Alt       A,C
    # Num_Alt  13,0
    # VAF      0.19,0.0
    # Detect this and fix it.  Take the Num_Alt and VAF of the
    # alternate with the highest VAF (the first one on ties).
    for d in ds:
        if d.Num_Alt.find(",") < 0:
            continue
        x1 = d.Num_Alt.split(",")
        x2 = d.VAF.split(",")
        assert len(x1) == len(x2)
        # Use real lists (not map()) so they can be indexed on
        # Python 3 as well.
        num_alts = [int(x) for x in x1]
        vafs = [float(x) for x in x2]
        max_i = 0
        for i, vaf in enumerate(vafs):
            if vaf > vafs[max_i]:
                max_i = i
        d.Num_Alt = str(num_alts[max_i])
        d.VAF = str(vafs[max_i])

    # Sorted, de-duplicated callers, samples and coordinates.
    callers = sorted(set([d.Caller for d in ds]))
    samples = sorted(set([d.Sample for d in ds]))
    coord_data = sorted(
        set([(d.Chrom, int(d.Pos), d.Ref, d.Alt) for d in ds]))

    # Make a list of all DNA calls.
    call_data = []
    for d in ds:
        assert d.Source in ["DNA", "RNA"]
        if d.Source != "DNA":
            continue
        num_ref = num_alt = vaf = None
        if d.Num_Ref:
            num_ref = int(d.Num_Ref)
        if d.Num_Alt:
            num_alt = int(d.Num_Alt)
        if d.VAF:
            vaf = float(d.VAF)
        if num_ref is None and num_alt is None and vaf is None:
            continue
        call = SimpleVariantMatrix.Call(num_ref, num_alt, vaf)
        x = d.Chrom, int(d.Pos), d.Ref, d.Alt, d.Sample, d.Caller, call
        call_data.append(x)

    # sample -> caller -> chrom, pos, ref, alt -> call
    samp2caller2coord2call = {}
    # sample -> chrom, pos, ref, alt -> caller -> 1
    # Keyed by caller so each caller is counted at most once per
    # coordinate, even if it reports the same sample several times.
    samp2coord2caller = {}
    for x in call_data:
        chrom, pos, ref, alt, sample, caller, call = x
        coord = chrom, pos, ref, alt

        samp2coord2caller.setdefault(
            sample, {}).setdefault(coord, {})[caller] = 1

        coord2call = samp2caller2coord2call.setdefault(
            sample, {}).setdefault(caller, {})
        # A (sample, caller, coord) may have multiple calls.  For
        # example, for germline samples that are called with each
        # tumor sample.  If this is the case, then take the call
        # with the highest coverage.
        if coord in coord2call:
            old_call = coord2call[coord]
            cov = old_cov = None
            if call.num_ref is not None and call.num_alt is not None:
                cov = call.num_ref + call.num_alt
            if old_call.num_ref is not None and \
                   old_call.num_alt is not None:
                old_cov = old_call.num_ref + old_call.num_alt
            # Keep the old call when it has a known coverage that
            # the new call cannot beat.
            if old_cov is not None and (cov is None or cov < old_cov):
                call = old_call
        coord2call[coord] = call

    # Count the number of callers that called a variant at each
    # position for each sample.
    samp2coord2nc = {}  # sample -> chrom, pos, ref, alt -> num_callers
    for sample in samp2coord2caller:
        coord2nc = {}
        for coord in samp2coord2caller[sample]:
            coord2nc[coord] = len(samp2coord2caller[sample][coord])
        samp2coord2nc[sample] = coord2nc

    # Format everything into an annotation matrix.  Each column has a
    # three-part header that is joined with "___" below.
    headers0 = []
    headers1 = []
    headers2 = []
    all_annots = []

    # Add the positions.
    headers0 += ["", "", "", ""]
    headers1 += ["", "", "", ""]
    headers2 += ["Chrom", "Pos", "Ref", "Alt"]
    for i in range(4):
        all_annots.append([str(coord[i]) for coord in coord_data])

    # Add the number of callers information.
    headers0 += ["Num Callers"] * len(samples)
    headers1 += [""] * len(samples)
    headers2 += samples
    for sample in samples:
        coord2nc = samp2coord2nc.get(sample, {})
        all_annots.append([coord2nc.get(coord, "") for coord in coord_data])

    # Add information about calls.
    for sample in samples:
        caller2coord2call = samp2caller2coord2call.get(sample, {})
        for i, caller in enumerate(callers):
            # Only the first caller column of a sample carries the
            # sample name.
            h0 = ""
            if not i:
                h0 = sample
            headers0.append(h0)
            headers1.append(caller)
            headers2.append("Ref/Alt/VAF")

            coord2call = caller2coord2call.get(caller, {})
            annots = []
            for coord in coord_data:
                x = ""
                call = coord2call.get(coord)
                if call:
                    x = SimpleVariantMatrix._format_call(call)
                annots.append(x)
            all_annots.append(annots)

    # Set the headers.
    assert len(headers0) == len(headers1)
    assert len(headers0) == len(headers2)
    assert len(headers0) == len(all_annots)
    headers = ["___".join(x) for x in zip(headers0, headers1, headers2)]
    matrix = AnnotationMatrix.create_from_annotations(headers, all_annots)
    SimpleVariantMatrix.write_from_am(out_filename, matrix)

    return metadata
def run(
    self, network, in_data, out_attributes, user_options, num_cores,
    outfile):
    """Add cancer gene membership flags to a SimpleVariantMatrix file.

    Reads the variant matrix at in_data.identifier and the file given
    by the "cancer_genes_file" user option.  Every column of the gene
    file other than "Gene ID" and "Gene Symbol" becomes a
    "Cancer Genes______<column>" column in the output.  A row gets a 1
    in a column if any gene listed in its Annovar______Gene.refGene
    annotation has "1" in that column of the gene file; otherwise the
    cell is an empty string.  The result is written to outfile.
    """
    from genomicode import filelib
    from genomicode import hashlib
    from genomicode import jmath
    from genomicode import AnnotationMatrix
    from genomicode import SimpleVariantMatrix
    from Betsy import module_utils as mlib

    simple_node = in_data
    filelib.assert_exists_nz(simple_node.identifier)

    gene_file = mlib.get_user_option(
        user_options, "cancer_genes_file", not_empty=True, check_file=True)

    # Load the cancer genes file.
    # <Gene ID>  <Gene Symbol>  <Dataset> ...
    symbol2info = {}  # symbol -> d
    header = None     # dataset columns, taken from the first row
    for d in filelib.read_row(gene_file, header=1):
        assert "Gene Symbol" in d._header
        if header is None:
            header = [
                name for name in d._header
                if name not in ["Gene ID", "Gene Symbol"]]
        if d.Gene_Symbol:
            symbol2info[d.Gene_Symbol] = d

    # Load the variant file.
    SVM = SimpleVariantMatrix.read_as_am(simple_node.identifier)

    GENE_H = "Annovar______Gene.refGene"
    assert GENE_H in SVM.headers, "Missing annotation: %s" % GENE_H
    GENES = SVM[GENE_H]

    # Build one row of flags per row of the variant matrix.
    gene_headers = header
    gene_annotations = []
    for gene_str in GENES:
        # Format of genes:
        # PFN1P2
        # PMS2P2,PMS2P7
        values = [""] * len(gene_headers)
        known = [
            symbol2info[g] for g in gene_str.split(",") if g in symbol2info]
        for d in known:
            for j, name in enumerate(gene_headers):
                # hash_var gives the attribute name under which the
                # row object exposes this column.
                attr = hashlib.hash_var(name)
                assert hasattr(d, attr)
                flag = getattr(d, attr)
                assert flag in ["", "1"]
                if flag == "1":
                    values[j] = 1
        gene_annotations.append(values)
    # Switch to SVM naming and column-major layout.
    gene_headers = ["Cancer Genes______%s" % name for name in gene_headers]
    gene_annotations = jmath.transpose(gene_annotations)

    # Decide where the new columns go: after the first four columns,
    # and after the last Annovar, SnpEff or COSMIC column if present.
    INDEX = 4
    for prefix in ("ANNOVAR", "SNPEFF", "COSMIC"):
        hits = [i for (i, h) in enumerate(SVM.headers)
                if h.upper().startswith(prefix)]
        if hits:
            INDEX = max(INDEX, max(hits)+1)
    headers = SVM.headers[:INDEX] + gene_headers + SVM.headers[INDEX:]
    columns = [SVM.header2annots[h] for h in SVM.headers_h]
    all_annots = columns[:INDEX] + gene_annotations + columns[INDEX:]
    merged = AnnotationMatrix.create_from_annotations(
        headers, all_annots, headerlines=SVM.headerlines)

    SimpleVariantMatrix.write_from_am(outfile, merged)
def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    out_filename):
    """Add a "Gene Expression" named matrix to a SimpleVariantMatrix.

    antecedents is (simple_node, signal_node): a variant matrix file
    and a gene expression file.  For every variant row and every
    sample in the expression file, the output cell holds the
    formatted expression of each gene in the row's Gene.refGene
    annotation, comma separated.  Genes that are not in the
    expression file are skipped; if a gene has several rows, the
    maximum value is used.  The result is written to out_filename.

    Side effect: the expression sub-matrix is also written to
    "expression.txt" for debugging.
    """
    import arrayio
    from genomicode import filelib
    from genomicode import AnnotationMatrix
    from genomicode import SimpleVariantMatrix

    simple_node, signal_node = antecedents
    filelib.assert_exists_nz(simple_node.identifier)
    filelib.assert_exists_nz(signal_node.identifier)

    # Load the variant file and the gene expression file.
    SVM = SimpleVariantMatrix.read(simple_node.identifier)
    GXP = arrayio.read(signal_node.identifier)

    # At least one variant sample must be in the expression matrix.
    # Not all of them need to be: for example, a gene expression
    # profile might not have been calculated for the germline sample.
    GXP_samples = GXP.col_names(arrayio.COL_ID)
    missing = [s for s in SVM.samples if s not in GXP_samples]
    assert len(missing) < len(SVM.samples), (
        "SimpleVariantMatrix and gene expression file have "
        "no common samples.")

    # Report all the samples from the gene expression file.
    SAMPLES = GXP_samples

    # Locate the named matrix that carries the gene annotation.
    GENE_H = "Gene.refGene"
    annovar_matrix = None
    for (name, candidate) in SVM.named_matrices:
        if GENE_H in candidate.headers:
            annovar_matrix = candidate
            break
    assert annovar_matrix, "Missing annotation: %s" % GENE_H
    GENES = annovar_matrix[GENE_H]

    # Collect every gene mentioned in any row.
    # Format of genes:
    # PFN1P2
    # PMS2P2,PMS2P7
    gene_set = set()
    for gene_str in GENES:
        gene_set.update(gene_str.split(","))
    all_genes = sorted(gene_set)

    # Pull the expression values for these genes, all samples.
    GXP_a = GXP.matrix(all_genes, None)

    # Write out the expression matrix for debugging purposes.
    arrayio.write(GXP_a, "expression.txt")

    # Look up the rows of the expression matrix for each gene.
    gene2I = {}   # gene -> list of row indexes
    for gene in all_genes:
        I_row, i_col = GXP_a._index(row=gene)
        if I_row:
            gene2I[gene] = I_row

    # Align the gene expression matrix to the simple variant matrix.
    # Format of genes:      Format of output
    # PFN1P2                5.2
    # PMS2P2,PMS2P7         2.2,8.6
    matrix = []   # one row per variant, one column per sample
    for gene_str in GENES:
        # If a gene is missing, then skip it.
        found = [g for g in gene_str.split(",") if g in gene2I]
        row = []
        for j in range(len(SAMPLES)):
            # If there are multiple instances of a gene, then pick
            # the one with the maximum expression.
            values = [
                max([GXP_a._X[l][j] for l in gene2I[g]]) for g in found]
            values = [_pretty_gxp(v) for v in values]
            row.append(",".join(values))
        matrix.append(row)

    # Add the matrix back to the simple variant matrix, one list of
    # annotations per sample.
    headers = SAMPLES
    all_annots = [
        [row[j] for row in matrix] for j in range(len(headers))]
    gxp_matrix = AnnotationMatrix.create_from_annotations(
        headers, all_annots)
    SVM.named_matrices.append(("Gene Expression", gxp_matrix))

    # Write to file.
    SimpleVariantMatrix.write(out_filename, SVM)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_filename):
    """Filter the calls of a SimpleVariantMatrix by read depth and VAF.

    User options (all required):
        filter_by_min_alt_reads    int, 0 <= x < 10000; 0 disables
        filter_by_min_total_reads  int, 0 <= x < 10000; 0 disables
        filter_by_min_vaf          float, 0 <= x < 1; < 1E-6 disables
    At least one of the filters must be enabled.

    A call is blanked out if it fails any enabled filter; a call whose
    relevant value is missing (None) also fails.  Rows left without
    any call are removed.  The kept rows are written to out_filename,
    and the removed rows (with their original calls) are written to
    "discarded.txt".  Returns an empty metadata dict.
    """
    import itertools
    from genomicode import SimpleVariantMatrix
    from genomicode import AnnotationMatrix
    from Betsy import module_utils as mlib

    summary_file = in_data.identifier
    metadata = {}

    min_alt_reads = mlib.get_user_option(
        user_options, "filter_by_min_alt_reads", not_empty=True, type=int)
    assert min_alt_reads >= 0 and min_alt_reads < 10000

    min_total_reads = mlib.get_user_option(
        user_options, "filter_by_min_total_reads", not_empty=True, type=int)
    assert min_total_reads >= 0 and min_total_reads < 10000

    min_vaf = mlib.get_user_option(
        user_options, "filter_by_min_vaf", not_empty=True, type=float)
    assert min_vaf >= 0.0 and min_vaf < 1.0

    # The VAF filter counts as a filter too: the loop below applies it
    # whenever min_vaf >= 1E-6.
    filter_by_vaf = min_vaf >= 1E-6
    assert min_total_reads or min_alt_reads or filter_by_vaf, "No filter"

    matrix = SimpleVariantMatrix.read_as_am(summary_file)

    # copy.deepcopy is very slow.  Try to avoid it.
    # Strategy:
    # 1.  Make a list of the changes to be made.
    # 2.  Save the filtered rows.
    # 3.  Make the changes.
    # 4.  Save the non-filtered rows.
    I_remove = {}     # i -> 1
    call_remove = {}  # i -> (sample, caller) -> 1

    # Optimization: build the (sample, caller) pairs and normalize
    # their headers once, rather than for every row.
    sample_callers = list(itertools.product(matrix.samples, matrix.callers))
    sc2header = {}  # (sample, caller) -> header_h
    for sc in sample_callers:
        sample, caller = sc
        header = "%s___%s___Ref/Alt/VAF" % (sample, caller)
        header_h = matrix.normalize_header(header)
        assert header_h
        sc2header[sc] = header_h

    for i in range(matrix.num_annots()):
        has_calls = False   # whether this row keeps any call.
        for sc in sample_callers:
            call_str = matrix.header2annots[sc2header[sc]][i]
            if not call_str:
                continue
            call = SimpleVariantMatrix._parse_call(call_str)

            filt = False
            # filter_by_min_alt_reads
            if min_alt_reads > 0 and \
                   (call.num_alt is None or call.num_alt < min_alt_reads):
                filt = True
            # filter_by_min_total_reads
            if min_total_reads > 0 and (
                    call.total is None or call.total < min_total_reads):
                filt = True
            # filter_by_min_vaf
            if filter_by_vaf and (call.vaf is None or call.vaf < min_vaf):
                filt = True

            if filt:
                call_remove.setdefault(i, {})[sc] = 1
            else:
                has_calls = True
        # If this coordinate has no more calls, then remove the
        # whole row.
        if not has_calls:
            I_remove[i] = 1
    I_remove = sorted(I_remove)

    # Write out a matrix of the discarded rows.  This happens before
    # the calls are blanked so the discarded rows keep their calls.
    filtered_matrix = AnnotationMatrix.rowslice(matrix, I_remove)
    SimpleVariantMatrix.write_from_am("discarded.txt", filtered_matrix)

    # Remove the calls.
    for i in call_remove:
        for sc in call_remove[i]:
            header_h = sc2header[sc]
            call_str = matrix.header2annots[header_h][i]
            assert call_str
            matrix.header2annots[header_h][i] = ""

    # Which rows to keep.
    I_remove_dict = {}.fromkeys(I_remove)
    I_keep = [
        i for i in range(matrix.num_annots()) if i not in I_remove_dict]
    filtered_matrix = AnnotationMatrix.rowslice(matrix, I_keep)
    SimpleVariantMatrix.write_from_am(out_filename, filtered_matrix)

    return metadata
def run(
    self, network, in_data, out_attributes, user_options, num_cores,
    out_filename):
    """Merge per-sample RSEM result files into one expression table.

    in_data.identifier is a directory that is searched for RSEM
    .results files.  out_attributes selects what is extracted:
        preprocess      "tpm" or "fpkm"      which value column
        expression_of   "gene" or "isoform"  which result file

    Writes a tab-delimited file to out_filename with the columns
    gene_id, transcript_id(s) (or transcript_id for isoforms), and
    one column per sample in sorted order.  All result files must
    list the same gene and transcript IDs in the same order.
    Returns an empty metadata dict.
    """
    import os
    from genomicode import jmath
    from genomicode import AnnotationMatrix
    from genomicode import alignlib

    rsem_path = in_data.identifier
    assert os.path.exists(rsem_path)
    assert os.path.isdir(rsem_path)
    result_files = alignlib.find_rsem_result_files(rsem_path)
    assert result_files, "No .results files found."
    metadata = {}

    preprocess = out_attributes.get("preprocess")
    assert preprocess in ["tpm", "fpkm"]

    # Figure out whether to report genes or isoforms.
    expression_of = out_attributes["expression_of"]
    assert expression_of in ["gene", "isoform"]
    get_genes = expression_of == "gene"

    transcript_header = "transcript_id(s)"
    if not get_genes:
        transcript_header = "transcript_id"

    # For each of the result files, get the expression data.
    sample2matrix = {}  # sample -> AnnotationMatrix
    for x in result_files:
        sample, gene_filename, isoform_filename = x
        filename = gene_filename
        if not get_genes:
            filename = isoform_filename
        assert filename is not None, \
            "Missing %s results file for sample: %s" % (
                expression_of, sample)
        assert os.path.exists(filename)
        matrix = AnnotationMatrix.read(filename)
        # Do some checking on the matrix.
        assert "gene_id" in matrix.headers
        assert transcript_header in matrix.headers
        assert "TPM" in matrix.headers
        assert "FPKM" in matrix.headers
        sample2matrix[sample] = matrix
    assert sample2matrix, "No samples"

    # Pull out the gene and transcript IDs.  The first matrix seen
    # provides the reference; every other matrix must match it.
    gene_id = transcript_id = None
    for matrix in sample2matrix.values():
        x1 = matrix["gene_id"]
        x2 = matrix[transcript_header]
        if gene_id is None:
            gene_id = x1
        if transcript_id is None:
            transcript_id = x2
        assert x1 == gene_id
        assert x2 == transcript_id
    assert gene_id
    assert transcript_id
    assert len(gene_id) == len(transcript_id)

    # Assemble into a gene expression matrix.
    header = "TPM"
    if preprocess == "fpkm":
        header = "FPKM"
    t_data = []  # transposed: the ID columns, then one row per sample.
    t_data.append(gene_id)
    t_data.append(transcript_id)
    samples = []
    for sample in sorted(sample2matrix):
        matrix = sample2matrix[sample]
        exp = matrix[header]
        assert len(exp) == len(gene_id)
        t_data.append(exp)
        samples.append(sample)

    data = jmath.transpose(t_data)
    header = ["gene_id", transcript_header] + samples
    data = [header] + data

    # Write out the data file.  The with block makes sure the file
    # is flushed and closed.
    with open(out_filename, 'w') as handle:
        for x in data:
            handle.write("\t".join(map(str, x)) + "\n")

    return metadata
def add_coverage_to_svm(svm_file, coverage_file, outfile, is_rna_cov):
    """Add a coverage named matrix to a SimpleVariantMatrix file.

    svm_file       SimpleVariantMatrix file to annotate.
    coverage_file  Table with columns: Chrom  Pos  <Sample> [<Sample> ...]
                   Pos is 1-based.
    outfile        Where to write the annotated matrix.
    is_rna_cov     If true, the new matrix is named "RNA Coverage"
                   rather than "Coverage".

    The new matrix has one column for each variant sample that also
    has at least one non-empty value in the coverage file.  Cells
    without a coverage value are empty strings.
    """
    from genomicode import jmath
    from genomicode import filelib
    from genomicode import AnnotationMatrix
    from genomicode import SimpleVariantMatrix

    # Load the variant file and the coordinate of every row.
    SVM = SimpleVariantMatrix.read(svm_file)
    AM = SVM.annot_matrix
    assert "Chrom" in AM.headers
    assert "Pos" in AM.headers
    CHROM = AM["Chrom"]
    POS = [int(p) for p in AM["Pos"]]

    # Load the coverage matrix.  Values are kept as the strings read
    # from the file.
    coord2sample2cov = {}  # (chrom, pos) -> sample -> coverage string
    cov_samples = {}       # sample -> 1, samples with any coverage
    for d in filelib.read_row(coverage_file, header=1):
        coord = d.Chrom, int(d.Pos)
        sample2cov = coord2sample2cov.setdefault(coord, {})
        for i, sample in enumerate(d._header):
            # The first two columns are Chrom and Pos.
            if i < 2:
                continue
            cov = d._cols[i]
            if cov:
                sample2cov[sample] = cov
                cov_samples[sample] = 1

    # At least one variant sample must be in the coverage matrix.
    # If the samples aren't sequenced at high coverage, it's possible
    # they just don't have reads at these positions.  Be a little
    # lenient here, and accept the file if some of the samples
    # overlap.
    missing = [s for s in SVM.samples if s not in cov_samples]
    assert len(missing) < len(SVM.samples), (
        "SimpleVariantMatrix and coverage file have "
        "no common samples.")

    # Report the coverage for the samples at the intersection.
    SAMPLES = [s for s in SVM.samples if s in cov_samples]

    # Build one row of coverage values per row of the variant matrix.
    matrix = []
    for i in range(AM.num_annots()):
        sample2cov = coord2sample2cov.get((CHROM[i], POS[i]), {})
        matrix.append([sample2cov.get(s, "") for s in SAMPLES])

    # Add the matrix back to the simple variant matrix.
    headers = SAMPLES
    all_annots = jmath.transpose(matrix)
    if is_rna_cov:
        name = "RNA Coverage"
    else:
        name = "Coverage"
    cov_matrix = AnnotationMatrix.create_from_annotations(
        headers, all_annots)
    SVM.named_matrices.append((name, cov_matrix))

    # Write to file.
    SimpleVariantMatrix.write(outfile, SVM)
def main(): import os import argparse from genomicode import jmath from genomicode import AnnotationMatrix from genomicode import colorlib from genomicode import pcalib parser = argparse.ArgumentParser(description="") parser.add_argument("datafile", help="Tab-delimited data file.") #parser.add_argument("x_header", help="Which column for X values.") #parser.add_argument("y_header", help="Which column for Y values.") parser.add_argument( "plot_file", help="Name of image file, e.g. outfile.png. " "Will generate PNG format by default. If this file name ends with " ".pdf, will generate a PDF file instead.") group = parser.add_argument_group(title="Data Series") group.add_argument( "--series", action="append", help="Add a data series to the plot. At least one series must be " "plotted. Format: <x_header>;<y_header>") group = parser.add_argument_group(title="General Appearance") group.add_argument("--no_box", action="store_true", help="Turn off the box around the plot.") group.add_argument("--height", type=int, help="Height (in pixels) of the plot.") group.add_argument("--width", type=int, help="Width (in pixels) of the plot.") group.add_argument( "--mar_left", default=1.0, type=float, help="Scale margin at left of plot. Default 1.0 (no scaling).") group.add_argument("--mar_bottom", default=1.0, type=float, help="Scale margin at bottom of plot. Default 1.0.") #group.add_argument( # "--xlabel_size", default=1.0, type=float, # help="Scale the size of the labels on X-axis. Default 1.0.") group.add_argument("--log_x", action="store_true", help="Plot the X-axis on a log scale.") group.add_argument("--log_y", action="store_true", help="Plot the Y-axis on a log scale.") group.add_argument( "--qq", action="store_true", help="Make a QQ-plot. 
Will sort the values to be plotted.") group = parser.add_argument_group(title="Plot Labels") group.add_argument("--title", help="Put a title on the plot.") group.add_argument("--xlab", help="Label the X-axis.") group.add_argument("--ylab", help="Label the Y-axis.") group.add_argument("--add_regression", action="store_true", help="Put a regression line on the plot.") group = parser.add_argument_group(title="Legend") group.add_argument("--add_legend", action="store_true", help="Add a legend to the plot.") group.add_argument("--legend_inset", type=float, default=0.05, help="") LEGEND_LOCATIONS = [ "bottomright", "bottom", "bottomleft", "left", "topleft", "top", "topright", "right", "center", ] group.add_argument("--legend_loc", choices=LEGEND_LOCATIONS, help="Where to draw the legend.") group = parser.add_argument_group(title="Point Appearance") group.add_argument("--scale_points", default=1.0, type=float, help="Scale the size of the points. Default 1.0") group.add_argument("--label_header", help="Label each point with the values in this column.") group.add_argument("--label_size", type=float, help="Scale the size of the labels by this value.") group.add_argument("--label_pos", default="top", choices=["top", "bottom", "left", "right"], help="Where to label the points.") group = parser.add_argument_group(title="Line Appearance") group.add_argument("--add_lines", action="store_true", help="Add lines that connect the points.") group.add_argument("--scale_lines", default=1.0, type=float, help="Scale the thickness of the lines. Default 1.0") group = parser.add_argument_group(title="Identity Line") group.add_argument("--add_identity_line", action="store_true", help="Add an identity line to the plot.") group = parser.add_argument_group(title="Colors") group.add_argument( "-c", "--cluster", action="append", help="Group samples into a cluster (e.g. 
-c 1-5); 1-based, inclusive.") group.add_argument( "--indexes_include_headers", "--iih", action="store_true", help="If not given (default), then index 1 is the first row " "with data. If given, then index 1 is the very first row " "in the file, including the headers.") group.add_argument("--default_color", help="Default color of points. Format: #000000.") # Parse the input arguments. args = parser.parse_args() if not os.path.exists(args.datafile): parser.error("File not found: %s" % args.datafile) if args.width is not None: assert args.width > 10, "too small" assert args.width < 4096 * 16, "width too big" if args.height is not None: assert args.height > 10, "too small" assert args.height < 4096 * 16, "height too big" assert args.mar_bottom > 0 and args.mar_bottom < 10 assert args.mar_left > 0 and args.mar_left < 10 #assert args.xlabel_size > 0 and args.xlabel_size < 10 assert args.legend_inset >= 0 and args.legend_inset < 10 if args.legend_loc is None: args.legend_loc = "bottomright" if args.default_color: assert len(args.default_color) == 7 assert args.default_color[0] == "#" MATRIX = AnnotationMatrix.read(args.datafile, False) assert MATRIX.num_headers() and MATRIX.num_annots(), "Empty matrix." assert args.series, "Need to add a data --series to plot." #assert len(args.series) <= 1, "Not implemented." 
#assert args.x_header in MATRIX.headers, \ # "header not found: %s" % args.x_header #assert args.y_header in MATRIX.headers, \ # "header not found: %s" % args.y_header if args.label_header: assert args.label_header in MATRIX.headers, \ "header not found: %s" % args.label_header if args.label_size is not None: assert args.label_size > 0 and args.label_size <= 20 assert args.scale_points > 0 and args.scale_points < 20 assert args.scale_lines > 0 and args.scale_lines < 20 series = _parse_series(MATRIX, args.series) cluster = None if args.cluster: cluster = _parse_cluster(args.cluster, args.indexes_include_headers, MATRIX) if len(series) > 1: assert not cluster, "Series and cluster not implemented." height = args.height or 2400 width = args.width or 3200 # Pull out the values and colors for the plot. default_color = "#000000" if args.default_color: default_color = args.default_color assert len(series) < len(colorlib.BREWER_QUALITATIVE_SET1) series_data = [] # list of (x_values, y_values, col) for each series for i in range(len(series)): x_header, y_header = series[i] x = MATRIX[x_header] y = MATRIX[y_header] I1 = [j for (j, a) in enumerate(x) if a] I2 = [j for (j, a) in enumerate(y) if a] I = [j for j in I1 if j in I2] x = [x[j] for j in I] y = [y[j] for j in I] x = map(float, x) y = map(float, y) assert len(x) == len(y) c = default_color if len(series) > 1: rgb = colorlib.BREWER_QUALITATIVE_SET1[i] c = colorlib.rgb2hex(rgb, prefix="#") c = [c] * len(x) x = x, y, c series_data.append(x) # Merge all the data point for each series. 
x_values = [] y_values = [] col = [] for (x, y, c) in series_data: x_values.extend(x) y_values.extend(y) #c = [c] * len(x) col.extend(c) assert len(x_values) == len(y_values) assert len(x_values) == len(col) if args.qq: O = jmath.order(x_values) x_values = [x_values[i] for i in O] y_values = [y_values[i] for i in O] col = [col[i] for i in O] if cluster is not None: col_rgb = pcalib.choose_colors(cluster) col = [default_color] * len(col_rgb) for i in range(len(col_rgb)): if col_rgb[i] is None: continue col[i] = colorlib.rgb2hex(col_rgb[i], prefix="#") assert len(col) == len(x_values) #for i in range(len(x_values)): # x = x_values[i], y_values[i], cluster[i], col[i] # print "\t".join(map(str, x)) # Start R and set up the environment. R = jmath.start_R() main = jmath.R_var("NA") if args.title: main = args.title sub = "" xlab = "" if len(series) == 1: xlab = x_header if args.xlab: xlab = args.xlab ylab = "" if len(series) == 1: ylab = y_header if args.xlab: ylab = args.ylab lwd_box = 2 lwd_axis = 2 lwd_regr = 3 cex = 1.0 * args.scale_points cex_lab = 1.5 cex_main = 2.0 cex_sub = 1.0 plot_log = "" if args.log_x: plot_log += "x" if args.log_y: plot_log += "y" assert x_values assert y_values jmath.R_equals(x_values, "X") jmath.R_equals(y_values, "Y") bm_type = "png16m" if args.plot_file.lower().endswith(".pdf"): bm_type = "pdfwrite" jmath.R_fn("bitmap", args.plot_file, type=bm_type, height=height, width=width, units="px", res=300) # Set the margins. x = 5 * 1.2 * args.mar_bottom, 4 * 1.2 * args.mar_left, 4, 2 mar = [x + 0.1 for x in x] jmath.R_fn("par", mar=mar, RETVAL="op") jmath.R_fn("plot", jmath.R_var("X"), jmath.R_var("Y"), main="", xlab="", ylab="", pch=19, cex=cex, log=plot_log, col=col, axes=jmath.R_var("FALSE"), RETVAL="x") # Make plot area solid white. 
#jmath.R('usr <- par("usr")') #jmath.R('rect(usr[1], usr[3], usr[2], usr[4], col="#FFFFFF")') #jmath.R_fn( # "hist", jmath.R_var("X"), plot=jmath.R_var("FALSE"), # main=main, xlab="", ylab="", axes=jmath.R_var("FALSE"), # add=jmath.R_var("TRUE")) if args.add_lines: lwd = 4 * args.scale_lines i = 0 for (x, y, c) in series_data: # Cannot use c for the color. It might've been changed by # --cluster. assert col and i < len(col) c = col[i:i + len(x)] i += len(x) # The "lines" function takes a scalar for col (except for # type=h, histogram vertical lines). If there are # multiple colors, then split up the points based on the # colors. l_x, l_y, l_c = [], [], None for j in range(len(x)): if c[j] != l_c: if l_x: jmath.R_fn("lines", l_x, l_y, lwd=lwd, col=l_c) # Add the previous point so that the points will # connect. if l_x: l_x = [l_x[-1]] l_y = [l_y[-1]] else: l_x, l_y, l_c = [], [], None l_x.append(x[j]) l_y.append(y[j]) l_c = c[j] if l_x: jmath.R_fn("lines", l_x, l_y, lwd=lwd, col=l_c) if args.add_identity_line: lwd = 4 x_min, x_max = min(x_values), max(x_values) y_min, y_max = min(y_values), max(y_values) iden_min = max(x_min, y_min) iden_max = min(x_max, y_max) l_x = [iden_min, iden_max] l_y = l_x l_c = "#FF0000" jmath.R_fn("lines", l_x, l_y, lwd=lwd, col=l_c) if args.label_header: cex = 1 if args.label_size is not None: cex = args.label_size pos2specifier = { "top": 3, "bottom": 1, "left": 2, "right": 4, } pos = pos2specifier[args.label_pos] point_labels = MATRIX[args.label_header] jmath.R_fn("text", jmath.R_var("X"), jmath.R_var("Y"), labels=point_labels, cex=cex, pos=pos) # Calculate correlation, and other statistics. # TODO: Should calculate this for each series. r = jmath.R("cor(X, Y)") p_value = jmath.R("cor.test(X, Y)$p.value") r = r[0] p_value = p_value[0] print "R = %.2f" % r print "p = %.2g" % p_value # Add a regression line. 
if args.add_regression: jmath.R("fit <- lm(Y ~ X)") coef = jmath.R("fit$coefficients") assert len(coef) == 2 b, m = coef x1 = min(x_values) y1 = x1 * m + b x2 = max(x_values) y2 = x2 * m + b jmath.R_fn("lines", [x1, x2], [y1, y2], lwd=lwd_regr, lty=2, col="#C63F31") sub = "R=%.2f (p=%.2g)" % (r, p_value) header = "X", "Y", "R", "p" print "\t".join(header) x = xlab, ylab, r, p_value print "\t".join(map(str, x)) if args.add_legend: leg = [x[1] for x in series] fill = [x[-1] for x in series_data] #jmath.R("x <- rgb(0.5, 0.5, 0.5, 0.5)") # alpha does not seem to be supported here. jmath.R_fn("legend", args.legend_loc, legend=leg, fill=fill, inset=args.legend_inset) if not args.no_box: jmath.R_fn("box", lwd=lwd_box) jmath.R_fn("axis", 1, lwd=lwd_axis, **{"cex.axis": 1.5}) jmath.R_fn("axis", 2, lwd=lwd_axis, **{"cex.axis": 1.5}) jmath.R_fn("title", main=main, sub=sub, xlab=xlab, ylab=ylab, **{ "cex.lab": cex_lab, "cex.main": cex_main, "cex.sub": cex_sub }) R("par(op)") jmath.R_fn("dev.off")
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    """Write the input matrix to outfile, adding gene annotations if needed.

    The matrix at in_data.identifier is scored against the known array
    platforms.  For each category in CATEGORIES (gene ID, gene symbol)
    that none of the detected platforms provides, the first known
    platform of that category for the same organism is selected, and
    the external "annotate_matrix" script is run to add it.  If nothing
    is missing, the input file is used unchanged.  In both cases the
    header row is prettified and made unique before writing.

    Only in_data and outfile are used here; the remaining parameters
    are accepted but not read by this method.

    Returns a metadata dict.  It contains a "commands" list holding
    the shell command only when the annotation script was run.

    Raises AssertionError if no platform scores at least 0.75, or if
    the annotation script's output contains "Traceback".
    """
    import StringIO
    import arrayio
    from genomicode import arrayplatformlib
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import AnnotationMatrix
    from Betsy import module_utils as mlib

    M = arrayio.read(in_data.identifier)
    metadata = {}

    # Add GENE_ID, GENE_SYMBOL, and DESCRIPTION.  Figure out which
    # platforms provide each one of these.
    CATEGORIES = [
        arrayplatformlib.GENE_ID,
        arrayplatformlib.GENE_SYMBOL,
        # biomaRt doesn't convert description.  So just ignore it
        # for now.
        # TODO: implement DESCRIPTION.
        #arrayplatformlib.DESCRIPTION,
    ]

    # Keep only the platforms that match the matrix well enough.
    scores = arrayplatformlib.score_matrix(M)
    scores = [x for x in scores if x.max_score >= 0.75]
    assert scores, "I could not identify any platforms."

    # Find all the categories not already present in the matrix.
    platforms = [
        arrayplatformlib.find_platform_by_name(x.platform_name)
        for x in scores
    ]
    categories = [x.category for x in platforms]
    missing = [x for x in CATEGORIES if x not in categories]

    # The first passing score/platform is used as the reference for
    # the header to convert from and the organism to convert within.
    score = scores[0]
    platform = platforms[0]

    to_add = []  # list of platform names
    for category in missing:
        candidates = [
            x for x in arrayplatformlib.PLATFORMS
            if x.category == category
            and x.bm_organism == platform.bm_organism
            and x.name != score.platform_name
        ]
        # Take the first one, if any.
        if candidates:
            to_add.append(candidates[0].name)

    if to_add:
        annotate = mlib.get_config("annotate_matrix", which_assert_file=True)
        sq = parallel.quote
        cmd = [
            "python",
            sq(annotate),
            "--no_na",
            "--header", sq(score.header),
        ]
        for name in to_add:
            cmd.extend(["--platform", sq(name)])
        # Quote the input path like every other argument so that paths
        # containing spaces or shell metacharacters are passed intact.
        cmd.append(sq(in_data.identifier))
        cmd = " ".join(cmd)
        data = parallel.sshell(cmd)
        metadata["commands"] = [cmd]
        # The script's output is inspected for a Python traceback to
        # detect failure.
        assert data.find("Traceback") < 0, data
    else:
        # Nothing to add.  Pass the input through (headers still get
        # cleaned up below).
        with open(in_data.identifier) as in_handle:
            data = in_handle.read()

    # Clean up the headers.
    platform2pretty = {
        "Entrez_ID_human": "Gene ID",
        "Entrez_Symbol_human": "Gene Symbol",
        "Entrez_ID_mouse": "Gene ID",
        "Entrez_Symbol_mouse": "Gene Symbol",
    }
    # Use a context manager so the output is flushed and closed even
    # if a row fails to write.
    with open(outfile, 'w') as handle:
        header_written = False
        for cols in filelib.read_cols(StringIO.StringIO(data)):
            if not header_written:
                # Only the first row (the header) is renamed and
                # de-duplicated.
                cols = [platform2pretty.get(x, x) for x in cols]
                cols = AnnotationMatrix.uniquify_headers(cols)
                header_written = True
            handle.write("\t".join(cols) + "\n")

    return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_filename):
    """Annotate a simple variant matrix with Annovar and write the result.

    The variant coordinates are read from in_data.identifier and
    written to an Annovar input file ("pos.txt") in the working
    directory.  Annovar is run via the command built by
    alignlib.make_annovar_command, and its multianno output is joined
    back to the variants on (chrom, pos, ref, alt).  The Annovar
    columns (everything after Chr/Start/End/Ref/Alt) are inserted as
    the first named matrix, called "Annovar", and the matrix is written
    to out_filename.  Variants without an Annovar row get empty
    strings.

    Only in_data, user_options and out_filename are used here; the
    remaining parameters are accepted but not read by this method.

    Returns a metadata dict whose "commands" entry lists the Annovar
    command that was run.

    Raises AssertionError if the multianno file contains a duplicate
    (chrom, start, ref, alt) key, contains no rows, or does not start
    with the expected five coordinate columns.
    """
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from genomicode import SimpleVariantMatrix
    from genomicode import AnnotationMatrix
    from Betsy import module_utils as mlib

    summary_filename = in_data.identifier
    metadata = {}

    buildver = mlib.get_user_option(
        user_options, "annovar_buildver", allowed_values=["hg19"],
        not_empty=True)

    # Name files.
    annovar_infile = "pos.txt"
    log_filename = "annovar.log"
    # Annovar takes a filestem, without the ".vcf".
    annovar_outstem = "annotations"
    # Produces file:
    # <annovar_outstem>.hg19_multianno.txt
    multianno_file = "%s.hg19_multianno.txt" % annovar_outstem

    # Make the infile for Annovar.
    # <chrom> <start> <end> <ref> <alt>
    # The context manager guarantees the file is closed (and flushed)
    # before Annovar reads it, even if a row fails to write.
    with open(annovar_infile, 'w') as handle:
        for d in filelib.read_row(summary_filename, skip=2, header=1):
            # Start and end are both the variant position.
            x = d.Chrom, d.Pos, d.Pos, d.Ref, d.Alt
            handle.write("\t".join(x) + "\n")

    cmd = alignlib.make_annovar_command(
        annovar_infile, log_filename, annovar_outstem, buildver,
        vcf_input=False)
    parallel.sshell(cmd)
    metadata["commands"] = [cmd]

    filelib.assert_exists_nz(log_filename)
    filelib.assert_exists_nz(multianno_file)

    matrix = SimpleVariantMatrix.read(summary_filename)
    annot_matrix = matrix.annot_matrix
    chrom, pos = annot_matrix["Chrom"], annot_matrix["Pos"]
    ref, alt = annot_matrix["Ref"], annot_matrix["Alt"]
    # Positions are compared as ints, matching the int(d.Start) keys
    # built from the multianno file below.
    pos = [int(x) for x in pos]

    # Read in the multianno output file.
    pos2d = {}  # (chrom, start, ref, alt) -> d
    anno_header = None
    for d in filelib.read_row(multianno_file, header=1):
        key = d.Chr, int(d.Start), d.Ref, d.Alt
        assert key not in pos2d, "Duplicate pos: %s" % str(key)
        pos2d[key] = d
        if not anno_header:
            anno_header = d._header
    assert anno_header

    # Multianno starts with:
    # Chr Start End Ref Alt
    # Ignore these.
    assert anno_header[:5] == ["Chr", "Start", "End", "Ref", "Alt"]
    headers = anno_header[5:]

    # Look up the Annovar row for each variant once, rather than once
    # per annotation column.
    rows = [pos2d.get(coord) for coord in zip(chrom, pos, ref, alt)]

    # Build the annotations column by column.  Variants with no
    # Annovar row get an empty string.
    all_annots = []
    for i in range(5, len(anno_header)):
        annots = [d._cols[i] if d else "" for d in rows]
        all_annots.append(annots)

    x = AnnotationMatrix.create_from_annotations(headers, all_annots)
    matrix.named_matrices.insert(0, ("Annovar", x))

    SimpleVariantMatrix.write(out_filename, matrix)

    return metadata