def renderMutations(self, mutations, metadata=None, comments=None):
    """Write one tab-separated VCF body row per renderable mutation.

    The output file pointer (with the VCF header already handled) is obtained
    from ``self._createVcfHeaderFilePtr``.  Rows are produced by
    ``self._createMutRow``; a mutation for which it returns ``None`` is
    skipped and counted as unrenderable.  No column-header row is written by
    this method.

    :param mutations: iterable of mutations (a generator or any other iterable)
    :param metadata: object exposing ``asDict()``; only consulted to build the
        header when ``mutations`` yields nothing
    :param comments: list of comments handed to the header writer; defaults to
        an empty list
    """
    if comments is None:
        comments = []

    outputHeaders = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'NORMAL', 'PRIMARY']

    # Peek at the first mutation so the header and field mapping can be built
    # from it.  next(iter(...), None) works for generators and plain iterables
    # alike and yields None when no variants were specified.
    mutations = iter(mutations)
    m = next(mutations, None)

    if m is not None:
        fp = self._createVcfHeaderFilePtr(comments, m)
        fieldsUsed = self.alternativeDictionary.keys()
        annotations = MutUtils.getAllAttributeNames(m)
        self.fieldMap = MutUtils.createFieldsMapping(fieldsUsed, annotations, self.alternativeDictionary, True)
    else:
        # NOTE(review): metadata must be supplied by the caller when there are
        # no mutations; a None metadata raises AttributeError here.
        fp = self._createVcfHeaderFilePtr(comments, metadata.asDict())

    # Write each row:
    ctr = 0
    unrenderableRows = 0
    tsvWriter = csv.DictWriter(fp, outputHeaders, delimiter="\t", lineterminator="\n")

    # The first mutation was already pulled off the iterator above.  Only
    # render it if it exists, and account for it exactly like every later
    # mutation so the final totals are accurate.
    if m is not None:
        mutRow = self._createMutRow(m)
        if mutRow is not None:
            tsvWriter.writerow(mutRow)
        else:
            unrenderableRows += 1
        ctr += 1

    for m in mutations:
        if (ctr % 1000) == 0:
            self.logger.info("Processed " + str(ctr) + " mutations")
        mutRow = self._createMutRow(m)

        # We may not render all rows.
        if mutRow is not None:
            tsvWriter.writerow(mutRow)
        else:
            unrenderableRows += 1
        ctr += 1

    # NOTE(review): fp is not closed or flushed here; presumably its lifetime
    # is managed elsewhere -- confirm against _createVcfHeaderFilePtr.
    self.logger.info("Processed all " + str(ctr) + " mutations. Could not render: " + str(unrenderableRows))
def renderMutations(self, mutations, metadata=None, comments=None):
    """Write one tab-separated VCF body row per renderable mutation.

    The output file pointer (with the VCF header already handled) is obtained
    from ``self._createVcfHeaderFilePtr``.  Rows are produced by
    ``self._createMutRow``; a mutation for which it returns ``None`` is
    skipped and counted as unrenderable.  No column-header row is written by
    this method.

    :param mutations: iterable of mutations (a generator or any other iterable)
    :param metadata: object exposing ``asDict()``; only consulted to build the
        header when ``mutations`` yields nothing
    :param comments: list of comments handed to the header writer; defaults to
        an empty list
    """
    if comments is None:
        comments = []

    outputHeaders = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'NORMAL', 'PRIMARY']

    # Peek at the first mutation so the header and field mapping can be built
    # from it.  next(iter(...), None) works for generators and plain iterables
    # alike and yields None when no variants were specified.
    mutations = iter(mutations)
    m = next(mutations, None)

    if m is not None:
        fp = self._createVcfHeaderFilePtr(comments, m)
        fieldsUsed = self.alternativeDictionary.keys()
        annotations = MutUtils.getAllAttributeNames(m)
        self.fieldMap = MutUtils.createFieldsMapping(fieldsUsed, annotations, self.alternativeDictionary, True)
    else:
        # NOTE(review): metadata must be supplied by the caller when there are
        # no mutations; a None metadata raises AttributeError here.
        fp = self._createVcfHeaderFilePtr(comments, metadata.asDict())

    # Write each row:
    ctr = 0
    unrenderableRows = 0
    tsvWriter = csv.DictWriter(fp, outputHeaders, delimiter="\t", lineterminator="\n")

    # The first mutation was already pulled off the iterator above.  Only
    # render it if it exists, and account for it exactly like every later
    # mutation so the final totals are accurate.
    if m is not None:
        mutRow = self._createMutRow(m)
        if mutRow is not None:
            tsvWriter.writerow(mutRow)
        else:
            unrenderableRows += 1
        ctr += 1

    for m in mutations:
        if (ctr % 1000) == 0:
            self.logger.info("Processed " + str(ctr) + " mutations")
        mutRow = self._createMutRow(m)

        # We may not render all rows.
        if mutRow is not None:
            tsvWriter.writerow(mutRow)
        else:
            unrenderableRows += 1
        ctr += 1

    # NOTE(review): fp is not closed or flushed here; presumably its lifetime
    # is managed elsewhere -- confirm against _createVcfHeaderFilePtr.
    self.logger.info("Processed all " + str(ctr) + " mutations. Could not render: " + str(unrenderableRows))
def renderMutations(self, segments, metadata=None, comments=None):
    """Render segments into a gene list as described in the docs for this class.

    Every gene named in a segment's comma-separated ``genes`` annotation is
    mapped to that segment.  A gene equal to the segment's ``start_gene`` or
    ``end_gene`` is keyed as ``"<gene> <exon>"`` using ``start_exon`` or
    ``end_exon`` respectively.  When the same key is produced by several
    segments, the last segment wins.  One row per key, sorted by key, is
    written as tab-separated text to ``self._filename`` with the columns taken
    from the "alternatives" section of the config file.

    :param segments: iterable of MutationData
    :param metadata: not used by this method
    :param comments: list of strings, each written as a ``## `` line at the top
        of the file; defaults to an empty list
    """
    logger = logging.getLogger(__name__)

    config_parser = ConfigUtils.createConfigParser(self._config_file)
    logger.info("Building alternative keys dictionary...")
    self._alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(config_parser)

    if comments is None:
        comments = []

    # The context manager guarantees the output file is closed even when
    # rendering fails part-way through.
    with open(self._filename, 'w') as fp:
        for c in comments:
            fp.write("## " + c + "\n")

        # TODO: Define constant for "genes", and other annotations
        headers = config_parser.options("alternatives")
        gene_to_segment_dict = dict()
        annotations = None
        field_mapping = {}
        num_segments = 0

        for num_segments, seg in enumerate(segments, 1):
            # The field mapping is derived once, from the first segment's keys.
            if annotations is None:
                annotations = seg.keys()
                field_mapping = MutUtils.createFieldsMapping(headers, annotations, self._alternativeDictionary,
                                                             isRenderInternalFields=True, prepend="")

            for g in seg['genes'].split(","):
                if g == seg["start_gene"]:
                    gene_to_segment_dict[g + " " + seg["start_exon"]] = seg
                elif g == seg["end_gene"]:
                    gene_to_segment_dict[g + " " + seg["end_exon"]] = seg
                else:
                    gene_to_segment_dict[g] = seg

        # Count segments explicitly: the enumerate index alone cannot
        # distinguish "no segments" from "exactly one segment".
        if num_segments == 0:
            logger.info("No segments given. There will be no genes in the list.")

        writer = csv.DictWriter(fp, headers, delimiter="\t", lineterminator="\n", extrasaction="ignore")
        writer.writeheader()

        logger.info("Rendering gene list...")
        all_genes_seen = sorted(gene_to_segment_dict.keys())
        num_genes = len(all_genes_seen)
        for i, gene in enumerate(all_genes_seen):
            seg = gene_to_segment_dict[gene]
            line_dict = dict()
            for h in headers:
                # Fall back to the header name itself when no mapping exists.
                annotation_field = field_mapping.get(h, h)
                line_dict[h] = seg.get(annotation_field, "")

            # The "gene" column always holds the (possibly exon-suffixed) key.
            line_dict["gene"] = gene
            writer.writerow(line_dict)
            if i % 1000 == 0:
                logger.info("Rendered %d/%d genes ..." % ((i + 1), num_genes))
def renderMutations(self, mutations, metadata=None, comments=None):
    """Write the mutations to a TCGA MAF file and return its file name.

    Columns are the required and optional columns from the "general" section
    of the config, followed by any additional fields produced by
    ``MutUtils.createFieldsMapping`` (sorted).  The file starts with a
    ``#version`` line, then one ``## `` line per comment, then the column
    header, then one row per mutation.  Each mutation is given an
    ``ncbi_build`` annotation before it is written.

    :param mutations: iterable of mutations (a generator or any other iterable)
    :param metadata: mapping whose keys are used as annotation names when
        there are no mutations; defaults to an empty OrderedDict
    :param comments: list of strings written as ``## `` lines; defaults to an
        empty list
    :returns: ``self._filename``
    :raises Exception: any error raised while rendering rows is logged and
        re-raised; the output file is closed in all cases
    """
    from itertools import chain

    if metadata is None:
        metadata = OrderedDict()
    if comments is None:
        comments = []

    self.logger.info("TCGA MAF output file: " + self._filename)
    self.logger.info("Render starting...")

    requiredColumns = self.config.get("general", "requiredColumns").split(',')
    optionalColumns = self.config.get("general", "optionalColumns").split(',')

    # Create the header list, making sure to preserve order.
    headers = requiredColumns + optionalColumns

    # Create a list of annotation names from the first mutation.
    # next(iter(...), None) works for generators and plain iterables alike.
    mutations = iter(mutations)
    m = next(mutations, None)
    if m is not None:
        annotations = MutUtils.getAllAttributeNames(m)
    else:
        # There are no mutations, so use the config file and metadata to
        # determine what columns to output.
        annotations = set(headers).union(metadata.keys())

    # Create a mapping between column name and annotation name
    fieldMap = MutUtils.createFieldsMapping(headers, annotations, self.alternativeDictionary,
                                            self.config.getboolean("general", "displayAnnotations"),
                                            exposedFields=self.exposedColumns, prepend=self._prepend)
    fieldMapKeys = fieldMap.keys()
    internalFields = sorted(set(fieldMapKeys).difference(headers))
    headers.extend(internalFields)

    ctr = 0
    fp = open(self._filename, 'w')
    try:
        # Initialize the output file and write a header.
        fp.write("#version " + self.getTcgaMafVersion() + "\n")
        for c in comments:
            fp.write("## " + c + "\n")

        # Initialize a csv DictWriter
        dw = csv.DictWriter(fp, headers, delimiter="\t", lineterminator="\n")
        dw.writeheader()

        try:
            # Put the already-consumed first mutation back in front of the
            # remaining ones so every mutation takes the same code path.
            pending = mutations if m is None else chain([m], mutations)
            for m in pending:
                # Add the NCBI build
                m.createAnnotation('ncbi_build', self.lookupNCBI_Build(m.build), annotationSource="OUTPUT")
                self._writeMutationRow(dw, fieldMap, fieldMapKeys, m)

                # Update mutation count and log every 1000 mutations
                ctr += 1
                if (ctr % 1000) == 0:
                    self.logger.info("Rendered " + str(ctr) + " mutations.")
        except Exception:
            import traceback
            self.logger.error(traceback.format_exc())
            # m is None when the iterator failed before yielding anything;
            # do not let the error report itself raise and hide the cause.
            if m is not None:
                location = str([m.chr, m.start, m.end, m.ref_allele, m.alt_allele])
            else:
                location = "(no mutation read yet)"
            self.logger.error("Error at mutation " + str(ctr) + " " + location + ": ")
            self.logger.error("Incomplete: rendered %d mutations." % (ctr))
            # Bare raise preserves the original traceback.
            raise
    finally:
        fp.close()

    if self._is_entrez_id_message_logged:
        logging.getLogger(__name__).warning("Some Entrez_Gene_IDs may be missing for valid Hugo Symbols in this TCGA MAF.")

    self.logger.info("Rendered all " + str(ctr) + " mutations.")
    return self._filename