def write_prism_file(filename, hist):
    """Write an R histogram as a tab-delimited table for a Prism XY plot.

    Arguments:
    filename  Path of the file to create (overwritten if it exists).
    hist      R list from the hist function.  Must provide "breaks",
              "counts", "density" and "mids" through rx2().

    The file has a header line followed by one line per bin, with the
    columns Mids, Left, Counts and Density.  Density is recomputed
    here as counts / sum(counts), so that the column sums to 1.

    """
    from genomicode import jmath

    # Get "breaks" out of histogram return value.  The last break is
    # dropped so that there is exactly one break (written in the "Left"
    # column) for each bin.
    breaks = list(hist.rx2("breaks"))[:-1]
    counts = list(hist.rx2("counts"))
    density = list(hist.rx2("density"))
    mids = list(hist.rx2("mids"))
    assert len(breaks) == len(counts)
    assert len(breaks) == len(density)
    assert len(breaks) == len(mids)

    # density from R doesn't sum up to 1.  (e.g. sum to 2).
    # Recalculate so that it sums to 1.
    total = float(sum(counts))
    if total:
        density = [count / total for count in counts]
    else:
        # No observations in any bin.  Report zeros rather than
        # dividing by zero.
        density = [0.0] * len(counts)

    header = ["Mids", "Left", "Counts", "Density"]
    rows = [header] + jmath.transpose([mids, breaks, counts, density])

    # The with-statement closes (and flushes) the file even if a row
    # fails to format.
    with open(filename, 'w') as handle:
        for row in rows:
            handle.write("\t".join(map(str, row)) + "\n")
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    """Extract the sample metadata from a GEO series matrix file.

    The metadata is pulled out with the geolib helpers, transposed so
    that each column is an annotation, and written to outfile as
    tab-delimited text.  If the "set_NA_to" user option is anything
    other than "NA", every cell that is exactly "NA" (except in the
    first row) is replaced with that value.

    """
    from genomicode import geolib
    from genomicode import jmath

    # Input should be a GEOSeriesMatrixFile.
    series_matrix_file = in_data.identifier
    na_replacement = user_options.get("set_NA_to")

    # Get the sample data.
    sample_meta = geolib._extract_sm_sample_meta(series_matrix_file)
    sample_meta = geolib._clean_sm_sample_meta(sample_meta)
    sample_meta = geolib._prettify_sm_sample_meta(sample_meta)
    # each column is an annotation
    sample_meta = jmath.transpose(sample_meta)

    if na_replacement != "NA":
        for i, row in enumerate(sample_meta):
            if i == 0:
                # The first row is left untouched.
                continue
            for j in range(len(row)):
                # Do case sensitive?
                if row[j] == "NA":
                    row[j] = na_replacement

    # Write it out.
    outhandle = open(outfile, 'w')
    for row in sample_meta:
        outhandle.write("\t".join(map(str, row)) + "\n")
    outhandle.close()
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Draw one bar plot per data row of a score file.

    The file named by antecedents.identifier is read by columns, the
    first entry of every column is dropped, and the result is
    transposed.  Row 0 of the transposed table supplies the sample
    labels; each following row supplies a y-axis label (its first
    entry) and the values to plot.  Bars are sorted by increasing
    value.  outfile is a directory (created if needed) that receives
    one <ylabel>.png per row.

    """
    from genomicode import filelib
    import os
    from genomicode import jmath

    in_data = antecedents
    columns = [col[1:] for col in filelib.read_cols(in_data.identifier)]
    table = jmath.transpose(columns)
    sample = table[0][1:]
    score_rows = table[1:]

    if not os.path.exists(outfile):
        os.mkdir(outfile)

    for one_data in score_rows:
        ylabel = one_data[0]
        scores = [float(v) for v in one_data[1:]]
        # Sort the (value, sample) pairs so the bars increase left to
        # right.
        ranked = sorted(
            [(score, sample[i]) for (i, score) in enumerate(scores)])
        gene_value = [pair[0] for pair in ranked]
        label = [pair[1] for pair in ranked]

        # Imported inside the loop, so the plotting library is only
        # loaded when there is at least one row to draw.
        from genomicode import mplgraph
        fig = mplgraph.barplot(
            gene_value, box_label=label, xtick_rotation=90,
            xlabel='sample', ylabel=ylabel)
        fig.savefig(os.path.join(outfile, ylabel) + '.png')

    assert filelib.exists_nz(outfile), (
        'the output file %s for plot_geneset_score_bar fails' % outfile)
def read_geneset_scores(filename):
    """Read the output from score_geneset.py and return a Matrix object.

    Only the score columns are kept.  Columns whose header ends with
    " direction", " pvalue" or " significant" are discarded.

    Columns of the file:
      SAMPLE
      FILE
      [Score ...]
      [Direction ...]    " direction"
      [p value ...]      " pvalue"
      [significant ...]  " significant"

    Returns a Matrix.InMemoryMatrix with one row per gene set (row
    name 'geneset') and one column per sample.  Raises AssertionError
    if the file is missing, empty, or has fewer than two columns left
    after the metadata columns are removed.

    """
    import os
    from genomicode import jmath
    from genomicode import filelib
    from genomicode import Matrix
    from arrayio import const
    from arrayio import tab_delimited_format as tdf

    metadata_suffixes = (" direction", " pvalue", " significant")

    assert os.path.exists(filename)
    matrix = [x for x in filelib.read_cols(filename)]
    # After transposing, each row of matrix is one column of the file.
    matrix = jmath.transpose(matrix)
    assert matrix
    for row in matrix:
        assert row

    # Only want the scores.  Get rid of the direction, pvalue, and
    # significance lines.  (A single filtering pass replaces repeated
    # del matrix[i], which shifts the whole list on every deletion.)
    matrix = [
        row for row in matrix if not row[0].endswith(metadata_suffixes)]

    # The first two rows are indexed explicitly below, so make sure
    # they exist instead of failing with a bare IndexError.
    assert len(matrix) >= 2, (
        "Expected at least 2 non-metadata columns in %s, found %d." % (
            filename, len(matrix)))

    # The sample names are in the second row if it is labeled SAMPLE,
    # otherwise in the first row.
    sample_row = 0
    if matrix[1][0].upper() == "SAMPLE":
        sample_row = 1
    col_names = {tdf.SAMPLE_NAME: matrix[sample_row][1:]}
    synonyms = {const.COL_ID: tdf.SAMPLE_NAME}

    # Everything after the first two rows is a gene set: the name
    # followed by one score per sample.
    data = []
    genesets = []
    for line in matrix[2:]:
        data.append([jmath.safe_float(value) for value in line[1:]])
        genesets.append(line[0])
    row_names = {'geneset': genesets}

    M = Matrix.InMemoryMatrix(
        data, row_names=row_names, col_names=col_names, synonyms=synonyms)
    return M
def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    outfile):
    """Classify samples with R's randomForest and write the predictions.

    antecedents is a pair (cls_node_train, data_node).  The label file
    gives the class of the first len(label_line) columns of the data
    matrix; those columns train the model and the remaining columns
    are predicted.  outfile receives a tab-delimited table with the
    columns Sample_name, Predicted_class and Confidence (Confidence is
    always written as an empty string).

    """
    import arrayio
    from Betsy import read_label_file
    from genomicode import jmath

    cls_node_train, data_node = antecedents
    # The first value returned by read() is not used here.
    _unused, label_line, second_line = read_label_file.read(
        cls_node_train.identifier)
    # Translate each label index into its class name.
    y = [second_line[int(i)] for i in label_line]

    R = jmath.start_R()
    M = arrayio.read(data_node.identifier)
    M_train = M.matrix(None, range(0, len(label_line)))
    M_test = M.matrix(None, range(len(label_line), M.dim()[1]))

    # Transpose before handing the data to R.
    train_data = jmath.transpose(M_train.slice())
    jmath.R_equals_matrix(train_data, 'data')
    test_data = jmath.transpose(M_test.slice())
    jmath.R_equals_matrix(test_data, 'test')
    jmath.R_equals(y, 'y')

    R('y<-as.factor(y)')
    R('require(randomForest, quietly=TRUE)')
    R('library(randomForest)')
    R('model <- randomForest(data,y=y,importance=TRUE)')
    R('predict_result <- predict(model, test)')
    predict_result = R['predict_result']

    # The predictions are indexes into levels, offset by one.
    levels = predict_result.levels
    predict_labels = [levels[i - 1] for i in predict_result[:]]

    # Take the sample names from the first set of column names.
    name = list(M_test._col_names.keys())[0]
    sample_name = M_test._col_names[name]

    result = [['Sample_name', 'Predicted_class', 'Confidence']]
    for i in range(len(sample_name)):
        result.append([str(sample_name[i]), predict_labels[i], ''])

    # open() replaces the Python-2-only file() builtin, and the
    # with-statement closes the file even if a write fails.
    with open(outfile, 'w') as f:
        for row in result:
            f.write('\t'.join(row))
            f.write('\n')
def write_selap_dataset(file_layout):
    """Write the data set in the layout given to SELAP.

    Reads the matrix at file_layout.DATASET, transposes its values
    (matrix._X), and writes them as tab-delimited text, with no
    header, to file_layout.SELAP_DATASET.  No other alignment is done
    here.

    """
    import arrayio
    from genomicode import jmath

    dataset = arrayio.read(file_layout.DATASET)

    # Make a matrix for SELAP.
    transposed = jmath.transpose(dataset._X)

    out = open(file_layout.SELAP_DATASET, 'w')
    for values in transposed:
        out.write("\t".join(map(str, values)) + "\n")
    out.close()
def write_from_am(handle_or_file, svm_matrix):
    """Write an annotation matrix in the three-header-row SVM layout.

    Arguments:
    handle_or_file  Either a path (str), which is opened for writing
                    and closed before returning, or an object with a
                    write method, which is left open for the caller.
    svm_matrix      Object providing headers, headers_h, header2annots
                    and headerlines.  Every header must have the form
                    <h0>___<h1>___<h2>.

    The three parts of each header are written as three header rows.
    Within a run of columns, a repeated <h0> (and a repeated <h1>
    inside the same <h0> block) is written only once and left blank in
    the following columns.

    """
    from genomicode import jmath

    headers0 = []
    headers1 = []
    headers2 = []
    for header in svm_matrix.headers:
        x = header.split("___")
        assert len(x) == 3, "Invalid header format: %s" % x
        headers0.append(x[0])
        headers1.append(x[1])
        headers2.append(x[2])

    # Walk backwards so each column is compared against the original
    # (not yet blanked) value of the column to its left.  Stop at
    # index 1: column 0 has no left neighbor, and comparing it against
    # index -1 would wrap around to the LAST column and could blank
    # the first header by mistake.
    for i in range(len(headers0) - 1, 0, -1):
        # If headers1[i] is the same as header1[i-1], then do not
        # write it out again.
        #
        # Exception: If headers0[i] != headers0[i-1], then we're
        # starting a new "block", and headers1[i] should still be
        # written out.
        # Example: If there's only one <Caller>, then the <Sample>
        # will not be blank, but the <Caller> should still be copied
        # over (because they are the same).
        # <Sample1>      <Sample2>
        # <Caller>       <Caller>
        # Ref/Alt/VAF    Ref/Alt/VAF
        same_block = headers0[i] == headers0[i - 1]
        if same_block and headers1[i] == headers1[i - 1]:
            headers1[i] = ""
        if same_block:
            headers0[i] = ""

    # Build one row per column, then transpose.
    matrix = []
    for i, header_h in enumerate(svm_matrix.headers_h):
        annots = svm_matrix.header2annots[header_h]
        matrix.append([headers0[i], headers1[i], headers2[i]] + annots)
    matrix = jmath.transpose(matrix)

    handle = handle_or_file
    opened_here = isinstance(handle_or_file, str)
    if opened_here:
        handle = open(handle_or_file, 'w')
    try:
        for line in svm_matrix.headerlines:
            handle.write("%s\n" % (line,))
        for row in matrix:
            handle.write("\t".join(map(str, row)) + "\n")
    finally:
        # Only close what was opened here.  Don't close someone else's
        # file handle.
        if opened_here:
            handle.close()
def write_prism_file(filename, MATRIX, gene_names):
    """Format in prism format for an XY plot.  Each gene is a series.

    Arguments:
    filename    Path of the file to create (overwritten if it exists).
    MATRIX      Matrix object; ncol(), col_names() and _X are used.
                Row i of MATRIX._X is written as the series for
                gene_names[i] (assumes the rows are in the same order
                as gene_names -- TODO confirm against the caller).
    gene_names  List of series names, used as the column headers.

    The output has the columns Sample, X, and one column per gene.
    The samples are repeated once for every gene, and each gene only
    has values in its own stretch of lines; the rest are left blank.

    """
    from genomicode import jmath

    num_samples = MATRIX.ncol()
    num_genes = len(gene_names)

    # Make a row-based matrix and transpose it.
    m = []

    # Write the sample name.
    sample_names = MATRIX.col_names(MATRIX.col_names()[0])
    m.append(list(sample_names) * num_genes)

    # Write the X-coordinate (1-based, restarting for each gene).
    m.append(list(range(1, num_samples + 1)) * num_genes)

    # Add each series.
    width = len(m[0])
    for i in range(num_genes):
        # Pre-pad blanks for the other series.
        x1 = [""] * (num_samples * i)
        x2 = list(MATRIX._X[i])
        # Post-pad blanks to fill in the matrix.
        x3 = [""] * (width - len(x1) - len(x2))
        m.append(x1 + x2 + x3)

    # Transpose to column-major format.
    m = jmath.transpose(m)

    # Add the gene names as the column headers.
    m = [["Sample", "X"] + gene_names] + m

    # Write the matrix to the file.  The with-statement makes sure the
    # file is flushed and closed.
    with open(filename, 'w') as handle:
        for row in m:
            handle.write("\t".join(map(str, row)) + "\n")
def write(handle_or_file, annot_matrix, delim=None):
    """Write an annotation matrix as delimited text.

    Arguments:
    handle_or_file  Either a path (str), which is opened for writing
                    and closed before returning, or an object with a
                    write method, which is left open for the caller.
    annot_matrix    Object providing headers, headers_h, header2annots
                    and headerlines.
    delim           Column delimiter.  Defaults to a tab.

    The headerlines are written first, one per line, followed by a
    line with the headers and then one line per annotation.

    """
    from genomicode import jmath

    if delim is None:
        delim = "\t"

    # Build one row per column (header followed by its annotations),
    # then transpose.
    matrix = []
    for i, header_h in enumerate(annot_matrix.headers_h):
        header = annot_matrix.headers[i]
        annots = annot_matrix.header2annots[header_h]
        matrix.append([header] + annots)
    matrix = jmath.transpose(matrix)

    handle = handle_or_file
    opened_here = isinstance(handle_or_file, str)
    if opened_here:
        handle = open(handle_or_file, 'w')
    try:
        for line in annot_matrix.headerlines:
            handle.write("%s\n" % (line,))
        for row in matrix:
            handle.write(delim.join(map(str, row)) + "\n")
    finally:
        # Only close what was opened here.  Don't close someone else's
        # file handle.
        if opened_here:
            handle.close()
def write_prism_file(filename, hist):
    """Write an R histogram as a tab-delimited table for a Prism XY plot.

    Arguments:
    filename  Path of the file to create (overwritten if it exists).
    hist      R list from the hist function.  Must provide "breaks",
              "counts", "density" and "mids" through rx2().

    The file has a header line followed by one line per bin, with the
    columns Mids, Left, Counts and Density.  The values are written as
    they come back from R.

    """
    from genomicode import jmath

    # Get "breaks" out of histogram return value.  The last break is
    # dropped so that there is exactly one break (written in the "Left"
    # column) for each bin.
    breaks = list(hist.rx2("breaks"))[:-1]
    counts = list(hist.rx2("counts"))
    density = list(hist.rx2("density"))
    mids = list(hist.rx2("mids"))
    assert len(breaks) == len(counts)
    assert len(breaks) == len(density)
    assert len(breaks) == len(mids)

    header = ["Mids", "Left", "Counts", "Density"]
    rows = [header] + jmath.transpose([mids, breaks, counts, density])

    # The with-statement closes (and flushes) the file even if a row
    # fails to format.
    with open(filename, 'w') as handle:
        for row in rows:
            handle.write("\t".join(map(str, row)) + "\n")
def run(
    self, network, in_data, out_attributes, user_options, num_cores,
    outfile):
    """Annotate a simple variant matrix with cancer gene membership.

    The file named by the "cancer_genes_file" user option has the
    columns <Gene ID>, <Gene Symbol> and one column per data set,
    whose values must be "" or "1".  For each variant, the genes in
    its "Annovar______Gene.refGene" annotation (comma-separated) are
    looked up, and a data set column is set to 1 if any of those genes
    is flagged in that data set.  The new "Cancer Genes______<Dataset>"
    columns are inserted after the Annovar, SnpEff and COSMIC columns
    (and never before column 4), and the result is written to outfile.

    """
    from genomicode import filelib
    from genomicode import hashlib
    from genomicode import jmath
    from genomicode import AnnotationMatrix
    from genomicode import SimpleVariantMatrix
    from Betsy import module_utils as mlib

    simple_node = in_data
    filelib.assert_exists_nz(simple_node.identifier)

    gene_file = mlib.get_user_option(
        user_options, "cancer_genes_file", not_empty=True, check_file=True)

    # Read the cancer genes file.
    # <Gene ID>  <Gene Symbol>  <Dataset>  ...
    symbol2info = {}  # symbol -> d
    header = None     # names of the <Dataset> columns
    for d in filelib.read_row(gene_file, header=1):
        assert "Gene Symbol" in d._header
        if header is None:
            header = [
                x for x in d._header
                if x not in ["Gene ID", "Gene Symbol"]]
        if d.Gene_Symbol:
            symbol2info[d.Gene_Symbol] = d

    # Read the variant file.
    SVM = SimpleVariantMatrix.read_as_am(simple_node.identifier)

    GENE_H = "Annovar______Gene.refGene"
    assert GENE_H in SVM.headers, "Missing annotation: %s" % GENE_H
    GENES = SVM[GENE_H]

    # Align the matrix to the simple variant matrix.
    gene_headers = header
    # Attribute name under which each data set column is stored on d.
    attr_names = [hashlib.hash_var(h) for h in gene_headers]
    gene_annotations = []
    for gene_str in GENES:
        # Format of genes:
        # PFN1P2
        # PMS2P2,PMS2P7
        values = [""] * len(gene_headers)
        for gene in gene_str.split(","):
            d = symbol2info.get(gene)
            if d is None:
                continue
            for j, attr in enumerate(attr_names):
                assert hasattr(d, attr)
                flag = getattr(d, attr)
                assert flag in ["", "1"]
                if flag == "1":
                    values[j] = 1
        gene_annotations.append(values)

    # Convert the headers and annotations to SVM format.
    gene_headers = ["Cancer Genes______%s" % x for x in gene_headers]
    gene_annotations = jmath.transpose(gene_annotations)

    # Make the new SimpleVariantMatrix.
    # Figure out where to put these annotations.  If Annovar, SnpEff,
    # or COSMIC exists, put after.
    INDEX = 4
    for prefix in ("ANNOVAR", "SNPEFF", "COSMIC"):
        I = [i for (i, x) in enumerate(SVM.headers)
             if x.upper().startswith(prefix)]
        if I:
            INDEX = max(INDEX, max(I)+1)

    headers = SVM.headers[:INDEX] + gene_headers + SVM.headers[INDEX:]
    old_annots = [SVM.header2annots[h] for h in SVM.headers_h]
    all_annots = old_annots[:INDEX] + gene_annotations + old_annots[INDEX:]
    merged = AnnotationMatrix.create_from_annotations(
        headers, all_annots, headerlines=SVM.headerlines)

    SimpleVariantMatrix.write_from_am(outfile, merged)
def run(
    self, network, in_data, out_attributes, user_options, num_cores,
    out_filename):
    """Merge per-sample RSEM results into one expression matrix.

    in_data.identifier is a directory with RSEM .results files.
    out_attributes["preprocess"] ("tpm" or "fpkm") selects the column
    of values to report, and out_attributes["expression_of"] ("gene"
    or "isoform") selects which results file of each sample is read.
    Every sample must list the same gene and transcript IDs in the
    same order.

    out_filename receives a tab-delimited table whose columns are
    gene_id, the transcript ID column, and one column per sample
    (sorted by sample name).  Returns a metadata dictionary, which is
    empty.

    """
    import os
    from genomicode import jmath
    from genomicode import AnnotationMatrix
    from genomicode import alignlib

    rsem_path = in_data.identifier
    assert os.path.exists(rsem_path)
    assert os.path.isdir(rsem_path)

    result_files = alignlib.find_rsem_result_files(rsem_path)
    assert result_files, "No .results files found."
    metadata = {}

    preprocess = out_attributes.get("preprocess")
    assert preprocess in ["tpm", "fpkm"]

    # Figure out whether to report genes or isoforms.
    expression_of = out_attributes["expression_of"]
    assert expression_of in ["gene", "isoform"]
    get_genes = expression_of == "gene"

    transcript_header = "transcript_id(s)"
    if not get_genes:
        transcript_header = "transcript_id"

    # For each of the results files, get the expression data.
    sample2matrix = {}  # sample -> AnnotationMatrix
    for x in result_files:
        sample, gene_filename, isoform_filename = x
        filename = gene_filename
        if not get_genes:
            filename = isoform_filename
        # Name the sample here.  Formatting the (None) filename into
        # the message would not say which file is missing.
        assert filename is not None, \
            "Missing %s results file for sample: %s" % (expression_of, sample)
        assert os.path.exists(filename)
        matrix = AnnotationMatrix.read(filename)
        # Do some checking on the matrix.
        assert "gene_id" in matrix.headers
        assert transcript_header in matrix.headers
        assert "TPM" in matrix.headers
        assert "FPKM" in matrix.headers
        sample2matrix[sample] = matrix
    assert sample2matrix, "No samples"

    # Pull out the gene and transcript IDs, and make sure they are the
    # same in every sample.
    gene_id = transcript_id = None
    for matrix in sample2matrix.values():
        x1 = matrix["gene_id"]
        x2 = matrix[transcript_header]
        if gene_id is None:
            gene_id = x1
        if transcript_id is None:
            transcript_id = x2
        assert x1 == gene_id
        assert x2 == transcript_id
    assert gene_id
    assert transcript_id
    assert len(gene_id) == len(transcript_id)

    # Assemble into a gene expression matrix.
    header = "TPM"
    if preprocess == "fpkm":
        header = "FPKM"
    t_data = []  # matrix, where each row is a sample.
    t_data.append(gene_id)
    t_data.append(transcript_id)
    samples = []
    for sample in sorted(sample2matrix):
        exp = sample2matrix[sample][header]
        assert len(exp) == len(gene_id)
        t_data.append(exp)
        samples.append(sample)

    data = jmath.transpose(t_data)
    header = ["gene_id", transcript_header] + samples
    data = [header] + data

    # Write out the data file.  The with-statement makes sure the file
    # is flushed and closed before returning.
    with open(out_filename, 'w') as handle:
        for row in data:
            handle.write("\t".join(map(str, row)) + "\n")

    return metadata
def add_coverage_to_svm(svm_file, coverage_file, outfile, is_rna_cov):
    """Add a named matrix of coverage values to a simple variant matrix.

    Arguments:
    svm_file       Simple variant matrix file to read.
    coverage_file  Tab-delimited file with the columns
                   Chrom  Pos  <Sample> [<Sample> ...].  Pos is 1-based.
    outfile        Where the updated simple variant matrix is written.
    is_rna_cov     If true, the new matrix is named "RNA Coverage"
                   instead of "Coverage".

    Only the samples found in both files are reported.  Positions or
    samples without a value in the coverage file are left as "".  It
    is an error if the two files have no samples in common.

    """
    from genomicode import jmath
    from genomicode import filelib
    from genomicode import AnnotationMatrix
    from genomicode import SimpleVariantMatrix

    # Read the variant file.
    SVM = SimpleVariantMatrix.read(svm_file)
    AM = SVM.annot_matrix
    assert "Chrom" in AM.headers
    assert "Pos" in AM.headers
    CHROM = AM["Chrom"]
    POS = [int(x) for x in AM["Pos"]]

    # Read the coverage matrix.
    # Chrom  Pos  <Sample>  [<Sample> ...]
    # Pos is 1-based.
    coord2sample2cov = {}  # (chrom, pos) -> sample -> ref/alt/vaf
    cov_samples = set()    # samples with at least one coverage value
    for d in filelib.read_row(coverage_file, header=1):
        coord = d.Chrom, int(d.Pos)
        sample2cov = coord2sample2cov.setdefault(coord, {})
        for i, sample in enumerate(d._header):
            if i < 2:
                # Skip the Chrom and Pos columns.
                continue
            cov = d._cols[i]
            if not cov:
                continue
            sample2cov[sample] = cov
            cov_samples.add(sample)

    # Make sure the samples from the variant matrix can be found in
    # the coverage matrix.
    #
    # If the samples aren't sequenced at high coverage, it's possible
    # they just don't have reads at these positions.  Be a little
    # lenient here, and accept the file if some of the samples
    # overlap.
    missing = [x for x in SVM.samples if x not in cov_samples]
    assert len(missing) < len(SVM.samples), (
        "SimpleVariantMatrix and coverage file have "
        "no common samples.")

    # Report the coverage for the samples at the intersection.
    SAMPLES = [x for x in SVM.samples if x in cov_samples]

    # Align the matrix to the simple variant matrix.
    matrix = []
    for i in range(AM.num_annots()):
        sample2cov = coord2sample2cov.get((CHROM[i], POS[i]), {})
        matrix.append([sample2cov.get(sample, "") for sample in SAMPLES])

    # Add the matrix back to the simple variant matrix.
    headers = SAMPLES
    all_annots = jmath.transpose(matrix)
    # If this is being used to add RNA coverage, use a different name.
    name = "RNA Coverage" if is_rna_cov else "Coverage"
    cov_matrix = AnnotationMatrix.create_from_annotations(
        headers, all_annots)
    SVM.named_matrices.append((name, cov_matrix))

    # Write to file.
    SimpleVariantMatrix.write(outfile, SVM)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    """Annotate a simple variant matrix with linkage information.

    The file named by the "linked_variants_file" user option has the
    columns Chrom, Pos, Perc Linked and p.  For each variant found in
    that file, two annotations are added: "Linkage______Perc Linked"
    and "Linkage______Score", where Score is -10*log10(p), capped at
    1000 for very small p.  Variants that are not in the file get
    empty strings.  The new columns are inserted at column 4 and the
    result is written to outfile.

    """
    import math
    from genomicode import filelib
    from genomicode import jmath
    from genomicode import AnnotationMatrix
    from genomicode import SimpleVariantMatrix
    from Betsy import module_utils as mlib

    svm_node = in_data
    filelib.assert_exists_nz(svm_node.identifier)

    linked_file = mlib.get_user_option(
        user_options, "linked_variants_file", not_empty=True,
        check_file=True)

    # Read the variant file.
    SVM = SimpleVariantMatrix.read_as_am(svm_node.identifier)
    CHROM = SVM["______Chrom"]
    POS = [int(x) for x in SVM["______Pos"]]
    all_coords = set(zip(CHROM, POS))  # (chrom, pos)

    # Read the linked variant file.  Only keep the variants that are
    # in the matrix.
    # Chrom  Pos  Perc Linked  p
    coord2info = {}  # (chrom, pos) -> d
    for d in filelib.read_row(linked_file, header=1):
        coord = d.Chrom, int(d.Pos)
        if coord in all_coords:
            coord2info[coord] = d

    # Align the linked annotations to the matrix.
    MAX_SCORE = 1000
    # Smallest p-value whose score does not exceed MAX_SCORE.
    min_p = 10 ** -(MAX_SCORE // 10)
    linked_headers = ["Perc Linked", "Score"]
    annotations = []
    for coord in zip(CHROM, POS):
        d = coord2info.get(coord)
        if d is None:
            annotations.append([""] * len(linked_headers))
            continue
        p_value = float(d.p)
        if p_value >= min_p:
            score = -10 * math.log(p_value, 10)
        else:
            score = MAX_SCORE
        x = d.Perc_Linked, score
        assert len(x) == len(linked_headers)
        annotations.append(x)

    # Convert the headers and annotations to SVM format.
    linked_headers = ["Linkage______%s" % x for x in linked_headers]
    linked_annotations = jmath.transpose(annotations)

    # Make the new SimpleVariantMatrix.
    # Figure out where to put these annotations.
    INDEX = 4
    ## If Annovar exists, put after.
    #I = [i for (i, x) in enumerate(SVM.headers)
    #     if x.upper().startswith("ANNOVAR")]
    #if I:
    #    INDEX = max(INDEX, max(I)+1)

    headers = SVM.headers[:INDEX] + linked_headers + SVM.headers[INDEX:]
    old_annots = [SVM.header2annots[h] for h in SVM.headers_h]
    all_annots = (
        old_annots[:INDEX] + linked_annotations + old_annots[INDEX:])
    merged = AnnotationMatrix.create_from_annotations(
        headers, all_annots, headerlines=SVM.headerlines)

    SimpleVariantMatrix.write_from_am(outfile, merged)
def find_diffexp_genes(
    outfile, gmt_file, algorithm, paired, MATRIX, geneid_header,
    genename_header, genename_delim, name1, name2, classes,
    filter_fold_change, fold_change, p_cutoff, fdr_cutoff, bonf_cutoff,
    sam_DELTA, sam_qq_file, edger_tagwise_dispersion, num_procs):
    """Find differentially expressed genes with one of the R routines.

    Arguments:
    outfile             Passed to _write_matrix with the header and
                        the sorted, filtered rows.
    gmt_file            If given, the significant genes are also
                        written to this path as gene sets in GMT
                        format.
    algorithm           "fold_change", "ttest", "sam", "ebayes",
                        "deseq2" or "edger".
    paired              If true, use the paired version of the
                        algorithm (only "ebayes" has one).
    MATRIX              Expression matrix, one column per entry in
                        classes.
    geneid_header       Row header with the gene IDs.  If empty, one
    genename_header     is chosen with choose_gene_names.  Same for
                        the gene names.
    genename_delim      Delimiter passed to genesetlib.clean_genes for
                        the gene names.
    name1, name2        Names of class 0 and class 1.
    classes             List of 0, 1, or None, one per column.
                        Columns whose class is not 0 or 1 are ignored.
    filter_fold_change  If not None, passed to R as FOLD.CHANGE.
    fold_change         If not None, only keep genes whose absolute
                        log_2 fold change is at least log_2 of this.
    p_cutoff            If not None, only keep genes with a p-value,
    fdr_cutoff          FDR, or Bonferroni value below the cutoff.
    bonf_cutoff
    sam_DELTA           DELTA parameter for "sam".
    sam_qq_file         If given (and algorithm is "sam"), a QQ plot
                        is written here as a PNG file.
    edger_tagwise_dispersion   Sets tagwise.dispersion for "edger".
    num_procs           Passed to R as NPROCS for "ttest" and
                        "deseq2".

    Rows are sorted by increasing p.value, or by decreasing
    "Log_2 Fold Change" if there is no p.value column.

    """
    # classes must be 0, 1, None.
    import os
    import sys
    import math
    import StringIO
    import warnings
    from rpy2 import rinterface
    from genomicode import config
    from genomicode import jmath
    from genomicode import genesetlib

    algorithm2function_unpaired = {
        "fold_change": "find.de.genes.fc",
        "ttest": "find.de.genes.ttest",
        "sam": "find.de.genes.sam",
        "ebayes": "find.de.genes.ebayes",
        "deseq2": "find.de.genes.deseq2",
        "edger": "find.de.genes.edgeR",
    }
    algorithm2function_paired = {
        "ebayes": "find.de.genes.paired.ebayes",
    }
    algorithm2function = algorithm2function_unpaired
    if paired:
        algorithm2function = algorithm2function_paired
        assert algorithm in algorithm2function_paired, \
            "No paired version of %s" % algorithm
    assert algorithm in algorithm2function, "Unknown algorithm: %s" % algorithm

    # Select the relevant columns from MATRIX.
    I = [i for (i, x) in enumerate(classes) if x in [0, 1]]
    assert len(I)
    MATRIX = MATRIX.matrix(None, I)
    classes = [classes[i] for i in I]

    # All algorithms except "fold_change" need at least 2 samples of
    # each class.
    counts = {}
    for x in classes:
        counts[x] = counts.get(x, 0) + 1
    assert sorted(counts) == [0, 1], "Only one class represented."
    if algorithm not in ["fold_change", "deseq2"]:
        assert counts[0] >= 2, "There must be at least 2 of each class."
        assert counts[1] >= 2, "There must be at least 2 of each class."

    names = [name1, name2]
    X = MATRIX._X
    Y = [names[x] for x in classes]
    sample_name = None
    if MATRIX.col_names():
        sample_name = MATRIX.col_names(MATRIX.col_names()[0])

    x = choose_gene_names(MATRIX)
    if not geneid_header:
        geneid_header = x[0]
    if not genename_header:
        genename_header = x[1]
    assert not geneid_header or geneid_header in MATRIX.row_names()
    assert not genename_header or genename_header in MATRIX.row_names()

    R = jmath.start_R()
    de_lib = os.path.join(config.changlab_Rlib, "diffexp.R")
    stat_lib = os.path.join(config.changlab_Rlib, "statlib.R")
    assert os.path.exists(de_lib), "I could not find file: %s" % de_lib
    assert os.path.exists(stat_lib), "I could not find file: %s" % stat_lib
    R('source("%s")' % de_lib)
    R('source("%s")' % stat_lib)

    jmath.R_equals(X, "X")
    jmath.R_equals(Y, "Y")
    if sample_name:
        jmath.R_equals(sample_name, "sample.name")
        jmath.R('colnames(X) <- sample.name')

    geneid = genenames = None
    if geneid_header:
        geneid = MATRIX.row_names(geneid_header)
        jmath.R_equals(geneid, "geneid")
    if genename_header:
        genenames = MATRIX.row_names(genename_header)
        jmath.R_equals(genenames, "genenames")

    # Set up the arguments.
    args = ["X", "Y"]
    if algorithm == "sam":
        args.append("%g" % sam_DELTA)
    if geneid:
        args.append("geneid=geneid")
    if genenames:
        args.append("genenames=genenames")
    # Pass the fold change to the algorithm, because it can affect the
    # multiple hypothesis correction.
    if filter_fold_change is not None:
        args.append("FOLD.CHANGE=%g" % filter_fold_change)
    if algorithm in ["ttest", "deseq2"]:
        args.append("NPROCS=%d" % num_procs)  # t-test only
    #if show_all_genes and algorithm != "sam":
    if algorithm not in ["sam", "fold_change"]:
        args.append("filter.p05=FALSE")
    if algorithm == "edger":
        if edger_tagwise_dispersion:
            args.append("tagwise.dispersion=TRUE")
        else:
            args.append("tagwise.dispersion=FALSE")

    # Prevent SAM from writing junk to the screen.
    handle = StringIO.StringIO()
    old_stdout = sys.stdout
    sys.stdout = handle
    try:
        # Call the proper R function.  DESeq2 throws off a lot of
        # warnings.  Turn them off temporarily.
        fn = algorithm2function[algorithm]
        x = ", ".join(args)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            R("x <- %s(%s)" % (fn, x))
            R("DATA <- x$DATA")
            DATA_R = R["DATA"]
    finally:
        # Always give stdout back, even if R raises an exception.
        # Otherwise, everything printed afterwards is swallowed.
        sys.stdout = old_stdout

    # Write out a QQ file for SAM.
    if algorithm == "sam" and sam_qq_file:
        R('S <- x$S')
        jmath.R_fn(
            "bitmap", sam_qq_file, type="png256",
            height=1600, width=1600, units="px", res=300)
        jmath.R_fn("samr.plot", jmath.R_var("S"), sam_DELTA)
        jmath.R_fn("dev.off")

    # Convert this DataFrame into a Python object.  Columns of floats
    # can be StrVector objects if there are NA embedded within them.
    # NA are special objects of either type
    # rpy2.rinterface.NACharacterType or type
    # rpy2.rinterface.NARealType.
    tDATA_py = []
    header = [DATA_R.colnames[i] for i in range(DATA_R.ncol)]
    for col_R in DATA_R:  # iterate over columns
        col_py = [col_R[i] for i in range(len(col_R))]
        if col_R.__class__.__name__ == "StrVector":
            pass
        elif col_R.__class__.__name__ == "FloatVector":
            col_py = [float(x) for x in col_py]
        elif col_R.__class__.__name__ == "IntVector":
            col_py = [int(x) for x in col_py]
        tDATA_py.append(col_py)
    DATA_py = jmath.transpose(tDATA_py)

    # Convert NA to None.
    na_types = [rinterface.NACharacterType, rinterface.NARealType]
    for row in DATA_py:
        for j in range(len(row)):
            if type(row[j]) in na_types:
                row[j] = None

    # Sort by increasing p-value.  If there are no p-values, sort by
    # decreasing fold change instead.
    name = "p.value"
    direction = 1
    if name not in header:
        name = "Log_2 Fold Change"
        direction = -1
    assert name in header, 'I could not find the "%s" column.' % name
    I = header.index(name)
    values = [x[I] for x in DATA_py]
    for i in range(len(values)):
        if values[i] is None:
            values[i] = direction * 1E10
        else:
            values[i] = direction * float(values[i])
    schwartz = sorted(zip(values, DATA_py))
    DATA_py = [x[-1] for x in schwartz]

    # Filter based on user criteria.
    if fold_change is not None:
        log_2_fc = math.log(fold_change, 2)
        name = "Log_2 Fold Change"
        assert name in header, 'I could not find the "%s" column.' % name
        I = header.index(name)
        DATA_py = [
            x for x in DATA_py
            if x[I] is not None and abs(x[I]) >= log_2_fc]
    if p_cutoff is not None:
        name = "p.value"
        assert name in header, 'I could not find the "%s" column.' % name
        I = header.index(name)
        DATA_py = [
            x for x in DATA_py
            if x[I] is not None and float(x[I]) < p_cutoff]
    if fdr_cutoff is not None:
        name = "FDR"
        # This might be missing if all the genes have already been
        # filtered.
        #assert name in header, 'I could not find the "%s" column.' % name
        if name in header:
            I = header.index(name)
            DATA_py = [
                x for x in DATA_py
                if x[I] is not None and float(x[I]) < fdr_cutoff]
    if bonf_cutoff is not None:
        name = "Bonf"
        assert name in header, 'I could not find the "%s" column.' % name
        I = header.index(name)
        DATA_py = [
            x for x in DATA_py
            if x[I] is not None and float(x[I]) < bonf_cutoff]

    ## If no significant genes, then don't produce any output.
    ##if not DATA_py:
    ##    return

    # Write to the outhandle.
    _write_matrix(outfile, header, DATA_py)
    # Don't close someone else's file handle.
    #outhandle.close()

    # Write out the gene sets in GMT format, if requested.
    if not gmt_file:
        return

    assert "Direction" in header, 'I could not find the "Direction" column.'
    assert "Gene ID" in header, 'I could not find the "Gene ID" column.'
    assert "Gene Name" in header, 'I could not find the "Gene Name" column.'
    I_direction = header.index("Direction")
    I_geneid = header.index("Gene ID")
    I_genename = header.index("Gene Name")

    # "Higher in <name1>"
    # "Higher in <name2>"
    # "SAME"
    possible_directions = [
        "Higher in %s" % name1, "Higher in %s" % name2, "SAME"]
    direction = [x[I_direction] for x in DATA_py]
    for x in direction:
        assert x.startswith("Higher in ") or x == "SAME"
        assert x in possible_directions
    samples = [x.replace("Higher in ", "") for x in direction]

    # One entry for each row of DATA_py: (<SAMPLE>, [UP|DN]), or None
    # if the gene is "SAME".  This list must stay parallel to DATA_py
    # because its indexes are used to look up the genes below.
    # Dropping the "SAME" rows would shift the indexes and assign the
    # wrong genes to the gene sets.
    genesets = []
    for s in samples:
        if s == "SAME":
            genesets.append(None)
            continue
        assert s in [name1, name2]
        # Make genesets relative to name2.  (Assume name1 is control).
        d = "UP"
        if s == name1:
            s, d = name2, "DN"
        genesets.append((s, d))
    genesets_all = sorted(set([gs for gs in genesets if gs is not None]))

    with open(gmt_file, 'w') as outhandle:
        for geneset in genesets_all:
            sample, direct = geneset
            I = [i for (i, gs) in enumerate(genesets) if gs == geneset]
            gid = [DATA_py[i][I_geneid] for i in I]
            gn = [DATA_py[i][I_genename] for i in I]
            # gn might be float.  genesetlib expects array of strings.
            gid = genesetlib.clean_genes(gid)
            gn = genesetlib.clean_genes(gn, delim=genename_delim)
            # <SAMPLE>_[ID|NAME]_[UP|DN]
            if gid:
                x = "%s_%s_%s" % (sample, "ID", direct)
                x = [x, "na"] + gid
                outhandle.write("\t".join(x) + "\n")
            if gn:
                x = "%s_%s_%s" % (sample, "NAME", direct)
                x = [x, "na"] + gn
                outhandle.write("\t".join(x) + "\n")
def main(): import argparse import glob import itertools DEF_PVALUE = 0.05 parser = argparse.ArgumentParser( description="Score a gene set on a gene expression data set.") parser.add_argument("expression_files", nargs="+", help="Data set(s) to score.") parser.add_argument("-o", dest="outfile", default=None, help="Name of file for results.") parser.add_argument("--transpose", action="store_true", help="Transpose the output matrix.") parser.add_argument( "--pvalue", type=float, default=DEF_PVALUE, help="p-value cutoff for determining significant changes " "(default %g)." % DEF_PVALUE) parser.add_argument("--libpath", dest="libpath", action="append", default=[], help="Add to the Python library search path.") parser.add_argument("-j", dest="num_procs", type=int, default=1, help="Number of jobs to run in parallel.") # Assumes that there are no commas in names of gene sets. group = parser.add_argument_group(title="Gene Set") group.add_argument( "--geneset_file", dest="geneset_files", action="append", default=[], help="File(s) with gene sets. Should be in gmx or gmt format.") group.add_argument( "-g", dest="gene_set", action="append", default=[], help="Name of the gene set to score. If you want to score both " "the positively and negatively correlated genes, specify both " "gene sets using the format: <positive_geneset>,<negative_geneset>. 
" "You can use this option multiple times to score more than one gene " "set.") group.add_argument("--all", dest="all_gene_sets", action="store_true", default=False, help="Score all gene sets in the files.") group.add_argument( "--any_matching", dest="any_matching_gene_sets", action="store_true", default=False, help="Score gene sets in the files that matches these genes.") group.add_argument("--automatch", action="store_true", default=False, help="Will match _UP with _DN (or _DOWN).") group = parser.add_argument_group( title="Genes", description="Add gene expression profiles to output.") group.add_argument( "--genes", default=[], action="append", help="Comma-separated list of IDs (e.g. probes, gene names) " "to include.") args = parser.parse_args() assert args.expression_files, \ "Please specify an expression data set to score." expression_files = [] for x in args.expression_files: xg = glob.glob(x) assert xg, "I could not find the expression file: %s" % x expression_files.extend(xg) for x in expression_files: assert os.path.exists(x), \ "I could not find the expression file: %s" % x assert args.outfile, "Please specify the name of an outfile." if args.num_procs < 1 or args.num_procs > 100: parser.error("Please specify between 1 and 100 processes.") assert args.pvalue > 0 and args.pvalue <= 1, \ "Invalid pvalue %g" % args.pvalue assert args.geneset_files, "Please specify one or more geneset files." for x in args.geneset_files: assert os.path.exists(x), "I could not find the gene set file: %s" % x assert args.all_gene_sets or args.gene_set or args.any_matching_gene_sets,\ "Please specify one or more gene sets to score." if args.all_gene_sets: assert not args.gene_set and not args.any_matching_gene_sets if args.any_matching_gene_sets: assert not args.gene_set and not args.all_gene_sets #if args.num_procs > 1: # raise NotImplementedError, "Doesn't work. Matrix class decorator." if args.libpath: sys.path = args.libpath + sys.path # Import after the library path is set. 
#import time import multiprocessing from genomicode import genesetlib from genomicode import genepattern from genomicode import jmath #start_time = time.time() genepattern.fix_environ_path() gene_names = _parse_gene_names(args.genes) msg = "Reading gene set file." if len(args.geneset_files) > 1: msg = "Reading gene set files." print msg sys.stdout.flush() geneset2genes = {} # name -> list of genes for filename in args.geneset_files: for x in genesetlib.read_genesets(filename): name, description, genes = x assert name not in geneset2genes, "Duplicate geneset: %s." % name geneset2genes[name] = genes genesets = args.gene_set if args.all_gene_sets or args.any_matching_gene_sets: genesets = sorted(geneset2genes) if args.automatch: genesets = match_gene_sets(genesets) #genesets = genesets[:10] matrix_names = [os.path.split(x)[1] for x in expression_files] print "Setting up jobs." sys.stdout.flush() ignore_gene_not_found = args.any_matching_gene_sets # list of gs_name, pos_genes, neg_genes, matrix_name, matrix_file # list of gene_name, None, None, matrix_name, matrix_file jobs = [] for geneset in genesets: pos_gs, neg_gs = _parse_geneset(geneset) assert pos_gs in geneset2genes, \ "I could not find gene set: %s" % pos_gs if neg_gs: assert neg_gs in geneset2genes, \ "I could not find gene set: %s" % neg_gs gs_name = pos_gs if neg_gs: gs_name = "%s/%s" % (pos_gs, neg_gs) pos_genes = geneset2genes[pos_gs] neg_genes = geneset2genes.get(neg_gs, []) if not pos_genes and not neg_genes: print "Empty gene set: %s. Skipping." % gs_name continue for matrix_name, matrix_file in zip(matrix_names, expression_files): x = gs_name, pos_genes, neg_genes, matrix_name, matrix_file, \ ignore_gene_not_found jobs.append(x) for name in gene_names: for matrix_name, matrix_file in zip(matrix_names, expression_files): x = name, None, None, matrix_name, matrix_file, None jobs.append(x) # Group the jobs into batches such that jobs that use the same # matrix are in the same batch. 
batched_jobs = {} # matrix_file -> list of jobs for i in range(len(jobs)): batch = jobs[i][4] if batch not in batched_jobs: batched_jobs[batch] = [] batched_jobs[batch].append(jobs[i]) batched_jobs = batched_jobs.values() # list of list of jobs # If there are too many gene sets to score for a file, split it up # into multiple batches. Don't know the tradeoff between reading # a file twice and calculating more gene sets. while len(batched_jobs) < args.num_procs: # Find the largest job and split it into two. largest = i_largest = None for i in range(len(batched_jobs)): nj = len(batched_jobs[i]) if nj > 1 and nj > largest: largest = nj i_largest = i if largest is None: break # Split i_largest in half. bj = batched_jobs[i_largest] i = len(bj) / 2 j1, j2 = bj[:i], bj[i:] batched_jobs[i_largest] = j1 batched_jobs.append(j2) job_str = "jobs" if len(jobs) == 1: job_str = "job" print "Scoring %d %s." % (len(jobs), job_str) sys.stdout.flush() manager = multiprocessing.Manager() lock = manager.Lock() pool = multiprocessing.Pool(args.num_procs) # (matrix, geneset, index, sample) -> GeneSetScore or GeneScore score_dict = {} results = [] # AsyncResults for batch in batched_jobs: fn_args = (batch, ) fn_keywds = {} fn_keywds["lock"] = lock if args.num_procs == 1: x = score_many(batch) score_dict.update(x) else: x = pool.apply_async(score_many, fn_args, fn_keywds) results.append(x) pool.close() pool.join() for x in results: x = x.get() score_dict.update(x) all_matrix_samples = [] all_genesets = [] all_genes = [] for (x, score) in score_dict.iteritems(): matrix_name, gene_name, index, sample = x x = matrix_name, index, sample all_matrix_samples.append(x) if isinstance(score, GeneSetScore): all_genesets.append(gene_name) elif isinstance(score, GeneScore): all_genes.append(gene_name) else: raise AssertionError all_matrix_samples = sorted({}.fromkeys(all_matrix_samples)) all_genesets = sorted({}.fromkeys(all_genesets)) all_genes = sorted({}.fromkeys(all_genes)) # Format the output. 
Columns should be in order: # <SAMPLE> <FILE> # <GS SCORES> ... <GS DIRECTION> ... <GS PVALUE> ... <GS SIGNIFICANT> ... # <GENES> ... header = ["SAMPLE", "FILE"] x = ["", "direction", "pvalue", "significant"] for x in itertools.product(x, all_genesets): suffix, name = x x = "%s %s" % (name, suffix) x = x.strip() header = header + [x] for g in all_genes: header = header + [g] output = [] output.append(header) for x in all_matrix_samples: matrix, index, sample = x #x = [scores[(matrix, x, index, sample)] for x in all_genesets] # Get the scores for the gene sets. keys = [(matrix, x, index, sample) for x in all_genesets] default = GeneSetScore("", "", "", "") scores = [score_dict.get(x, default).score for x in keys] directs = [score_dict.get(x, default).direction for x in keys] pvalues = [score_dict.get(x, default).pvalue for x in keys] signifs = [] for x in zip(directs, pvalues): direct, pvalue = x x = "" if type(pvalue) is type(0.0) and pvalue < args.pvalue: x = direct signifs.append(x) # Get the scores for the genes. keys = [(matrix, x, index, sample) for x in all_genes] default = GeneScore("") gene_scores = [score_dict.get(x, default).score for x in keys] x = [sample, matrix] + \ scores + directs + pvalues + signifs + gene_scores assert len(x) == len(header) output.append(x) if args.transpose: output = jmath.transpose(output) outhandle = open(args.outfile, 'w') for x in output: print >> outhandle, "\t".join(map(str, x)) outhandle.close() print "Done."
def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    """Add COSMIC annotation columns to a SimpleVariantMatrix file.

    Reads the variant matrix at in_data.identifier and the COSMIC
    variant table named by the "cosmic_variants_file" user option.
    Every variant whose (chrom, pos) falls inside the inclusive
    Start..End range of a COSMIC record receives that record's values;
    all other variants receive blank values.  The merged matrix is
    written to outfile.

    Arguments:
    in_data       Node whose identifier is the path to the input
                  matrix (checked with filelib.assert_exists_nz).
    user_options  Options containing "cosmic_variants_file".
    outfile       Path of the SimpleVariantMatrix file to write.
    network, out_attributes and num_cores are not used here.

    If several COSMIC records cover the same coordinate, the one read
    last is kept.
    """
    from genomicode import filelib
    from genomicode import jmath
    from genomicode import AnnotationMatrix
    from genomicode import SimpleVariantMatrix
    from Betsy import module_utils as mlib

    svm_node = in_data
    filelib.assert_exists_nz(svm_node.identifier)
    cosmic_file = mlib.get_user_option(
        user_options, "cosmic_variants_file", not_empty=True,
        check_file=True)

    # Read the variant file.
    SVM = SimpleVariantMatrix.read_as_am(svm_node.identifier)
    CHROM = SVM["______Chrom"]
    POS = [int(p) for p in SVM["______Pos"]]
    # Coordinates present in the matrix, for fast membership tests.
    all_coords = set(zip(CHROM, POS))

    # Read the COSMIC variant file.
    # Chrom  Start  End  GRCh  Count  SNP
    # Mutation CDS  Mutation AA
    # FATHMM prediction  FATHMM score  Mutation somatic status
    coord2info = {}  # (chrom, pos) -> d
    for d in filelib.read_row(cosmic_file, header=1):
        start, end = int(d.Start), int(d.End)
        # Record this COSMIC entry for EVERY matrix coordinate inside
        # its range.  Stopping at the first hit would leave any other
        # variant covered by the same record without an annotation.
        for pos in range(start, end + 1):
            coord = (d.Chrom, pos)
            if coord in all_coords:
                coord2info[coord] = d

    # Align the COSMIC annotations to the matrix (one row per variant).
    cosmic_headers = [
        "SNP", "Num Tumors", "Mutation CDS", "Mutation AA",
        "FATHMM prediction", "FATHMM score", "Mutation somatic status"]
    blank = [""] * len(cosmic_headers)
    annotations = []
    for coord in zip(CHROM, POS):
        d = coord2info.get(coord)
        if d is None:
            annotations.append(list(blank))
            continue
        row = (
            d.SNP, d.Count, d.Mutation_CDS, d.Mutation_AA,
            d.FATHMM_prediction, d.FATHMM_score,
            d.Mutation_somatic_status)
        annotations.append(row)

    # Convert the headers and annotations to SVM format (one list per
    # column).
    if annotations:
        cosmic_annotations = jmath.transpose(annotations)
    else:
        # With no variants there are no rows to transpose, so the
        # column count cannot be recovered from the data.  Emit one
        # empty column per COSMIC header to keep headers and columns
        # the same length.
        cosmic_annotations = [[] for _ in cosmic_headers]
    cosmic_headers = ["COSMIC______%s" % h for h in cosmic_headers]

    # Make the new SimpleVariantMatrix.
    # Figure out where to put these annotations.  Default is column 4
    # (presumably just after the leading coordinate columns -- confirm
    # against the SimpleVariantMatrix layout).  If Annovar or SnpEff
    # columns exist, put the COSMIC columns after the last of them.
    INDEX = 4
    for prefix in ("ANNOVAR", "SNPEFF"):
        hits = [
            i for (i, h) in enumerate(SVM.headers)
            if h.upper().startswith(prefix)]
        if hits:
            INDEX = max(INDEX, max(hits) + 1)

    headers = SVM.headers[:INDEX] + cosmic_headers + SVM.headers[INDEX:]
    columns = [SVM.header2annots[h] for h in SVM.headers_h]
    all_annots = columns[:INDEX] + cosmic_annotations + columns[INDEX:]
    merged = AnnotationMatrix.create_from_annotations(
        headers, all_annots, headerlines=SVM.headerlines)

    SimpleVariantMatrix.write_from_am(outfile, merged)
def summarize_factor_scores(file_layout, python, arrayplot, cluster, libpath):
    """Write the projected factor scores and draw them as a heatmap.

    Reads the data set (file_layout.DATASET), the cleaned BFRM model
    and its factor names (file_layout.BFRM_MODEL, a zip archive), and
    the sample x factor matrix (file_layout.BFRM_AF).  Keeps the last
    num_factors factors, reorders them by the model's "FACTOR_O", and
    writes them to file_layout.FACTOR_SCORES.  The heatmap is written
    to file_layout.FACTOR_SCORES_PNG.  Any cluster files
    (FACTOR_CDT / FACTOR_ATR / FACTOR_GTR) that exist afterwards are
    moved into file_layout.ATTIC.

    Arguments:
    file_layout  Object exposing the paths named above.
    python, arrayplot, cluster, libpath
                 Passed through unchanged to graphlib.plot_heatmap.
    """
    import zipfile
    import arrayio
    from genomicode import Matrix
    from genomicode import jmath
    from genomicode import archive
    from genomicode import graphlib
    from genomicode import bfrm

    DATA = arrayio.read(file_layout.DATASET)

    param_file = "parameters.txt"
    model = bfrm.read_clean_model(
        file_layout.BFRM_MODEL, param_file=param_file)
    num_factors = model["F"].nrow()

    # Load the factor names.
    assert zipfile.is_zipfile(file_layout.BFRM_MODEL)
    s2f = archive.unzip_dict(file_layout.BFRM_MODEL)
    assert "factorids.txt" in s2f, "Missing: factorids.txt"
    zfile = zipfile.ZipFile(file_layout.BFRM_MODEL)
    try:
        factor_names = [x.strip() for x in zfile.open(s2f["factorids.txt"])]
    finally:
        # Always release the archive handle, even if reading fails.
        zfile.close()
    assert len(factor_names) == num_factors

    # sample x factor matrix
    F = arrayio.read(file_layout.BFRM_AF)
    assert F.nrow() == DATA.ncol()
    F_X = jmath.transpose(F._X)
    # F_X contains all factors, including intercept and design.
    # Remove all but the latent factors.
    F_X = F_X[-num_factors:]
    # Sort the factors so they'll be in the same order as the clean
    # model.
    factor_order = model["FACTOR_O"]
    assert len(F_X) == len(factor_order)
    F_X = [F_X[i] for i in factor_order]
    factor_names = [factor_names[i] for i in factor_order]

    # Write out the projected factor scores.
    SAMPLE_NAME = arrayio.tdf.SAMPLE_NAME
    row_names = {"xID": factor_names}
    col_names = {SAMPLE_NAME: DATA.col_names(SAMPLE_NAME)}
    M = Matrix.InMemoryMatrix(F_X, row_names, col_names)
    arrayio.pcl_format.write(M, file_layout.FACTOR_SCORES)

    # Make the heatmap.
    xpix, ypix = graphlib.find_wide_heatmap_size(
        M.nrow(), M.ncol(), min_box_height=10, min_box_width=10,
        max_total_height=768, max_total_width=1024)
    # Cap the box height at four times the box width.
    ypix = min(ypix, xpix * 4)
    graphlib.plot_heatmap(
        file_layout.FACTOR_SCORES, file_layout.FACTOR_SCORES_PNG,
        xpix, ypix, color="bild", show_colorbar=True, show_grid=True,
        gene_center="mean", gene_normalize="var",
        gene_label=True, cluster_genes=True,
        array_label=True, cluster_arrays=True,
        python=python, arrayplot=arrayplot, cluster=cluster,
        libpath=libpath)

    # Clean up the cluster files by moving them into the attic.
    cluster_files = [
        file_layout.FACTOR_CDT,
        file_layout.FACTOR_ATR,
        file_layout.FACTOR_GTR,
    ]
    for src in cluster_files:
        if not os.path.exists(src):
            continue
        dst = os.path.join(file_layout.ATTIC, os.path.split(src)[1])
        os.rename(src, dst)