def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Call plot_sample_pca.plot_pca with one colour per predicted class.

    antecedents is unpacked as (data_node, classify_node).  The file at
    classify_node.identifier is read with genesetlib.read_tdf and the
    values of its 'Predicted_class' row become the legend.  Every
    distinct class gets a colour, and the per-entry colour list is
    passed to plot_pca together with data_node.identifier, outfile and
    the legend.

    network, out_attributes, user_options and num_cores are accepted
    but not used here.

    Raises AssertionError if the file has no 'Predicted_class' row.
    """
    from genomicode import genesetlib
    import plot_sample_pca

    data_node, classify_node = antecedents

    # record[0] is the row name and record[2] holds its values.  If the
    # row occurs more than once, the last occurrence is used.
    legend = None
    for record in genesetlib.read_tdf(
            classify_node.identifier, preserve_spaces=True,
            allow_duplicates=True):
        if record[0] == 'Predicted_class':
            legend = record[2]
    assert legend is not None, (
        "No 'Predicted_class' row in file: %s" % classify_node.identifier)

    # Colours are handed out in order of first appearance in the legend
    # so the mapping does not depend on dict iteration order.  With more
    # classes than colours the palette is reused from the start, which
    # means two classes can then share a colour.
    palette = ['r', 'b', 'g', 'y']
    class2color = {}
    for label in legend:
        if label not in class2color:
            class2color[label] = palette[len(class2color) % len(palette)]
    color = [class2color[label] for label in legend]

    plot_sample_pca.plot_pca(data_node.identifier, outfile, color, legend)
def read_annotation_descriptor(annotation_descriptor, annotations):
    """Return list of (header, annots), with annots aligned to annotations.

    annotation_descriptor is the name of a file that is read with
    genesetlib.read_tdf.  The column sharing the most values with
    annotations is taken as the key column, and every column is then
    reordered so that position k corresponds to annotations[k].
    Positions with no match hold "".

    Raises AssertionError if the file does not exist, yields no columns,
    or if the best column covers fewer than half of the annotations.
    """
    from genomicode import jmath
    from genomicode import genesetlib

    filename = annotation_descriptor
    assert os.path.exists(filename), "I could not find file: %s" % filename

    header2annots = []    # list of (header, annots)
    for x in genesetlib.read_tdf(
            filename, preserve_spaces=True, allow_duplicates=True):
        name, description, annots = x
        header2annots.append((name, annots))
    assert header2annots, "No annots."

    # Find the column that contains the annotations.  Columns are
    # compared by position rather than by header name: headers may be
    # duplicated, and scanning in file order makes ties resolve to the
    # first best column instead of depending on dict ordering.
    wanted = set(annotations)
    annot_annots = best_count = None
    for (header, annots) in header2annots:
        count = len(wanted.intersection(annots))
        if best_count is None or count > best_count:
            annot_annots, best_count = annots, count
    assert best_count >= len(annotations)/2.0, \
        "I could not find the annotations in the descriptor file."
    assert annot_annots

    # Align the annotation matrix to the annotations.
    # NOTE(review): relies on jmath.match returning, for each annotation,
    # its index in annot_annots or None when absent -- confirm.
    I = jmath.match(annotations, annot_annots)
    header2annots_aligned = []
    for header, annots in header2annots:
        annots_aligned = ["" if i is None else annots[i] for i in I]
        header2annots_aligned.append((header, annots_aligned))
    return header2annots_aligned
def read(filename, is_csv=False, header_char=None, nrows=None):
    """Read a delimited annotation file into an AnnotationMatrix.

    Everything is kept as strings; no numeric conversion is done.  The
    file is treated as comma-delimited when is_csv is true and as
    tab-delimited otherwise.  header_char and nrows are passed through
    to genesetlib.read_tdf as yield_lines_startswith and nrows.

    Raises AssertionError if no columns were read.
    """
    import re
    from genomicode import genesetlib

    sep = "," if is_csv else "\t"

    # re.sub takes a lot of time (25% of all running time!), so the
    # pattern is compiled once.
    naive_pattern = re.compile("na\\W+ve")

    # Hack: Some files contain special characters, which mess up
    # alignment, e.g.
    #   na\xc3\xafve-WIBR3.5 hESC
    #   na\xe2\x80\x9a\xc3\xa0\xc3\xb6\xe2\x88\x9a\xc3\xb2ve-C1.2 hiPSC
    # Rewriting them takes a long time, so it is switched off unless
    # necessary.
    normalize_naive = False

    headers, columns, comment_lines = [], [], []
    rows = genesetlib.read_tdf(
        filename, preserve_spaces=True, allow_duplicates=True,
        delimiter=sep, yield_lines_startswith=header_char, nrows=nrows)
    for row in rows:
        if type(row) is str:
            # Plain strings are kept aside and handed to the matrix as
            # header lines rather than parsed as columns.
            comment_lines.append(row)
        else:
            name, description, annots = row
            if normalize_naive:
                annots = [naive_pattern.sub("naive", a) for a in annots]
            headers.append(name)
            columns.append(annots)
    assert headers, "Empty file: %s" % filename

    unique_headers = uniquify_headers(headers)
    header2annots = dict(zip(unique_headers, columns))
    return AnnotationMatrix(
        headers, unique_headers, header2annots, headerlines=comment_lines)
def read_gene_descriptor(gene_descriptor, geneset):
    """Read pretty names for the genes.

    gene_descriptor is in the format of <filename>,<header>.  The file
    is read with genesetlib.read_tdf.  The column sharing the most
    values with geneset is taken as the gene column, and the column
    named <header> supplies the pretty names.

    Returns a dictionary of gene -> pretty name.  Rows where either the
    gene or the pretty name is empty are skipped.

    Raises AssertionError if gene_descriptor is malformed, the file is
    missing or yields no columns, the best column covers fewer than
    half of geneset, <header> is not a column, or the two columns differ
    in length.
    """
    from genomicode import genesetlib

    # Exactly two fields are required.  Extra commas used to get past
    # the length check and then fail in the unpacking with a bare
    # ValueError.
    parts = gene_descriptor.split(",")
    assert len(parts) == 2, (
        "gene_descriptor should be <filename>,<header>: %s" % gene_descriptor)
    filename, pretty_header = parts
    assert os.path.exists(filename), "I could not find file: %s" % filename

    columns = []          # list of (header, genes), in file order
    header2genes = {}     # for duplicated headers the last one wins
    for x in genesetlib.read_tdf(
            filename, preserve_spaces=True, allow_duplicates=True):
        name, description, genes = x
        columns.append((name, genes))
        header2genes[name] = genes
    assert header2genes, "No genes."

    # Find the column that contains the genes in the gene set.  Since
    # some of the genes may not be annotated, provide some leeway
    # here.  Columns are scanned in file order so ties resolve to the
    # first best column rather than depending on dict ordering.
    wanted = set(geneset)
    best_genes = best_count = None
    for (header, genes) in columns:
        count = len(wanted.intersection(genes))
        if best_count is None or count > best_count:
            best_genes, best_count = genes, count
    assert best_count >= len(geneset)/2.0, \
        "I could not find the genes in the descriptor file."

    assert pretty_header in header2genes, \
        "I could not find header: %s" % pretty_header
    genes = best_genes
    pretty = header2genes[pretty_header]
    assert len(genes) == len(pretty)

    gene2pretty = {}
    for g, p in zip(genes, pretty):
        if g and p:
            gene2pretty[g] = p
    return gene2pretty
def read_infile(filename):
    """Read filename with genesetlib.read_tdf into an AnnotationMatrix.

    Every column must hold as many annotations as the first one;
    otherwise an AssertionError is raised.  The matrix is built from a
    name -> annotations dictionary plus the list of names in the order
    they were read.
    """
    from genomicode import genesetlib

    columns = genesetlib.read_tdf(
        filename, preserve_spaces=True, allow_duplicates=True)

    order = []
    by_name = {}
    expected_len = None
    for name, description, annots in columns:
        # The first column fixes the length all others must match.
        if expected_len is None:
            expected_len = len(annots)
        assert len(annots) == expected_len
        order.append(name)
        by_name[name] = annots
    return AnnotationMatrix(by_name, order)
def read_clinical_annotations(M, filename):
    """Return a tuple of (Matrix, clinical annotations).

    The annotations are a dictionary of name -> list of values, read
    from filename with genesetlib.read_tdf.  Both the matrix and the
    annotations are passed through align_matrix_with_clinical_data, and
    its aligned results are what is returned.
    """
    from genomicode import genesetlib

    rows = genesetlib.read_tdf(
        filename, preserve_spaces=True, allow_duplicates=True)
    annots_by_name = {}
    for name, description, values in rows:
        annots_by_name[name] = values

    # Align the gene scores with the clinical annotations.
    aligned_matrix, aligned_annots = align_matrix_with_clinical_data(
        M, annots_by_name)
    return aligned_matrix, aligned_annots