def renderMutations(self, mutations, metadata=None, comments=None):
    """
    Write the given mutations as tab-separated rows after the header.

    The output file object is obtained from ``self._createVcfHeaderFilePtr``;
    each mutation is converted to a row dict with ``self._createMutRow`` and
    rows that come back as ``None`` are counted as unrenderable and skipped.
    When at least one mutation exists, ``self.fieldMap`` is (re)built from it.

    :param mutations: iterator of mutations
    :param metadata: object exposing ``asDict()``; only consulted when there
        are no mutations.  If it is None an empty dict is handed to the header
        writer -- NOTE(review): confirm the header writer accepts that.
    :param comments: list of comments handed to the header writer
    """
    if comments is None:
        comments = []

    outputHeaders = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'NORMAL', 'PRIMARY']

    # Peek at the first mutation so that the case where there are no variants
    # at all can be handled.  next() works on both Python 2.6+ and Python 3.
    m = next(mutations, None)

    if m is not None:
        headerSource = m
    elif metadata is not None:
        headerSource = metadata.asDict()
    else:
        headerSource = {}
    fp = self._createVcfHeaderFilePtr(comments, headerSource)

    # This method obtained the file pointer, so it is responsible for closing
    # it, including when rendering fails part-way through.
    try:
        if m is not None:
            fieldsUsed = self.alternativeDictionary.keys()
            annotations = MutUtils.getAllAttributeNames(m)
            self.fieldMap = MutUtils.createFieldsMapping(fieldsUsed, annotations, self.alternativeDictionary, True)

        # Write each row:
        ctr = 0
        unrenderableRows = 0
        tsvWriter = csv.DictWriter(fp, outputHeaders, delimiter="\t", lineterminator="\n")

        # The first mutation was already pulled off the iterator above.  It is
        # counted exactly like every later mutation, rendered or not.
        if m is not None:
            mutRow = self._createMutRow(m)
            if mutRow is not None:
                tsvWriter.writerow(mutRow)
            else:
                unrenderableRows += 1
            ctr += 1

        for m in mutations:
            if (ctr % 1000) == 0:
                self.logger.info("Processed " + str(ctr) + " mutations")
            mutRow = self._createMutRow(m)

            # We may not render all rows.
            if mutRow is not None:
                tsvWriter.writerow(mutRow)
            else:
                unrenderableRows += 1
            ctr += 1
    finally:
        fp.close()

    self.logger.info("Processed all " + str(ctr) + " mutations. Could not render: " + str(unrenderableRows))
def _determineHeaders(self, mut, metadata): if mut is None: headers = [] else: headers = MutUtils.getAllAttributeNames(mut) if len(headers) == 0: headers = metadata.keys() # Remove headers that start with "_" for header in headers: if header.startswith("_"): headers.remove(header) return headers
def renderMutations(self, mutations, metadata=None, comments=None):
    """
    Write the given mutations as tab-separated rows after the header.

    The output file object is obtained from ``self._createVcfHeaderFilePtr``;
    each mutation is converted to a row dict with ``self._createMutRow`` and
    rows that come back as ``None`` are counted as unrenderable and skipped.
    When at least one mutation exists, ``self.fieldMap`` is (re)built from it.

    :param mutations: iterator of mutations
    :param metadata: object exposing ``asDict()``; only consulted when there
        are no mutations.  If it is None an empty dict is handed to the header
        writer -- NOTE(review): confirm the header writer accepts that.
    :param comments: list of comments handed to the header writer
    """
    if comments is None:
        comments = []

    outputHeaders = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'NORMAL', 'PRIMARY']

    # Peek at the first mutation so that the case where there are no variants
    # at all can be handled.  next() works on both Python 2.6+ and Python 3.
    m = next(mutations, None)

    if m is not None:
        headerSource = m
    elif metadata is not None:
        headerSource = metadata.asDict()
    else:
        headerSource = {}
    fp = self._createVcfHeaderFilePtr(comments, headerSource)

    # This method obtained the file pointer, so it is responsible for closing
    # it, including when rendering fails part-way through.
    try:
        if m is not None:
            fieldsUsed = self.alternativeDictionary.keys()
            annotations = MutUtils.getAllAttributeNames(m)
            self.fieldMap = MutUtils.createFieldsMapping(fieldsUsed, annotations, self.alternativeDictionary, True)

        # Write each row:
        ctr = 0
        unrenderableRows = 0
        tsvWriter = csv.DictWriter(fp, outputHeaders, delimiter="\t", lineterminator="\n")

        # The first mutation was already pulled off the iterator above.  It is
        # counted exactly like every later mutation, rendered or not.
        if m is not None:
            mutRow = self._createMutRow(m)
            if mutRow is not None:
                tsvWriter.writerow(mutRow)
            else:
                unrenderableRows += 1
            ctr += 1

        for m in mutations:
            if (ctr % 1000) == 0:
                self.logger.info("Processed " + str(ctr) + " mutations")
            mutRow = self._createMutRow(m)

            # We may not render all rows.
            if mutRow is not None:
                tsvWriter.writerow(mutRow)
            else:
                unrenderableRows += 1
            ctr += 1
    finally:
        fp.close()

    self.logger.info("Processed all " + str(ctr) + " mutations. Could not render: " + str(unrenderableRows))
def renderMutations(self, mutations, metadata=None, comments=None):
    """
    Render the mutations into the TCGA MAF file at ``self._filename``.

    The file starts with a ``#version`` line, then one ``## `` line per
    comment, then the column header row, then one row per mutation.  Columns
    are the configured required and optional columns (in that order), followed
    by the split allelic-depth columns when enabled, followed by the remaining
    field-map keys in sorted order.

    :param mutations: iterator of mutations
    :param metadata: mapping whose keys are used as annotation names when
        there are no mutations (defaults to an empty OrderedDict)
    :param comments: list of comment strings to write after the version line
    :return: file name pointing to the maf file that is generated
    :raises Exception: any error raised while writing rows is logged and then
        re-raised unchanged
    """
    if metadata is None:
        metadata = OrderedDict()
    if comments is None:
        comments = []

    self.logger.info("TCGA MAF output file: " + self._filename)
    self.logger.info("Render starting...")

    requiredColumns = self.config.get("general", "requiredColumns").split(',')
    optionalColumns = self.config.get("general", "optionalColumns").split(',')

    # Create the header list, making sure to preserve order.
    headers = requiredColumns
    headers.extend(optionalColumns)

    # Peek at the first mutation to learn the annotation names.  next() works
    # on both Python 2.6+ and Python 3.
    m = next(mutations, None)
    if m is not None:
        annotations = MutUtils.getAllAttributeNames(m)
    else:
        # There are no mutations, so use the config file and metadata to
        # determine what columns to output.
        annotations = set(headers).union(metadata.keys())

    # If we are splitting allelic_depth into two fields, add those to the
    # headers.  Note that the mutations will be annotated properly later.
    if self._is_splitting_allelic_depth and "allelic_depth" in annotations:
        depth_fields = [TcgaMafOutputRenderer.OUTPUT_T_ALT_COUNT, TcgaMafOutputRenderer.OUTPUT_T_REF_COUNT]
        headers.extend(depth_fields)

    # Only needed (and only computable) when there is at least one mutation.
    field_map = {}
    field_map_keys = []
    if m is not None:
        # Add columns for the new annotations created as part of collapsing cols
        additional_internal_columns = []
        if self._column_collapser is not None:
            additional_internal_columns = self._column_collapser.retrieve_new_annotations_added(
                m, self._column_collapser_suffix)

        # Create a mapping between column name and annotation name
        field_map = FieldMapCreator.create_field_map(
            headers, m, self.alternativeDictionary,
            self.config.getboolean("general", "displayAnnotations"),
            exposed_fields=self.exposedColumns, prepend=self._prepend,
            deprioritize_input_annotations=self._is_reannotating,
            additional_columns=additional_internal_columns)

        field_map_keys = field_map.keys()
        internal_fields = sorted(set(field_map_keys).difference(headers))
        headers.extend(internal_fields)

    ctr = 0
    # The context manager closes the file on every path, including failures
    # while the version/comment/header lines are being written.
    with open(self._filename, 'w') as fp:
        fp.write("#version " + self.getTcgaMafVersion() + "\n")
        for c in comments:
            fp.write("## " + c + "\n")

        # Initialize a csv DictWriter and write the column header row.
        dw = csv.DictWriter(fp, headers, delimiter="\t", lineterminator="\n")
        dw.writeheader()

        try:
            # The first mutation was already pulled off the iterator above.
            if m is not None:
                self._add_output_annotations(m)
                self._writeMutationRow(dw, field_map, field_map_keys, m)
                ctr += 1

            for m in mutations:
                self._add_output_annotations(m)
                self._writeMutationRow(dw, field_map, field_map_keys, m)

                # Update mutation count and log every 1000 mutations
                ctr += 1
                if (ctr % 1000) == 0:
                    self.logger.info("Rendered " + str(ctr) + " mutations.")
        except Exception:
            import traceback
            self.logger.error(traceback.format_exc())
            self.logger.error("Error at mutation " + str(ctr) + " " + str([m.chr,m.start,m.end,m.ref_allele,m.alt_allele]) + ": ")
            self.logger.error("Incomplete: rendered %d mutations." % (ctr))
            # Bare raise keeps the original traceback intact.
            raise

    if self._is_entrez_id_message_logged:
        logging.getLogger(__name__).warning("Some Entrez_Gene_IDs may be missing for valid Hugo Symbols in this TCGA MAF.")

    self.logger.info("Rendered all " + str(ctr) + " mutations.")
    return self._filename
def _writeMuts2Tsv(self, muts, path):
    """
    Given a mutation generator, this method writes a tab separated file for all mutations in the mutation generator.
    In addition, it annotates each mutation with the sample name chosen by the sample name selector when the
    mutation does not already carry that annotation, and it rewrites the start, ref_allele and alt_allele of every
    mutation with the values from MutUtils.retrieveMutCoordinatesForRendering.
    It also computes a list of all chromosomes and sample names contained within the generator.

    The columns are taken from the first mutation; fields starting with "_" are not rendered.

    :param muts: generator object with mutations
    :param path: directory in which the temporary file is created
    :return: tuple of (list of chromosomes, sorted list of sample names, name of the tsv file written)
    """
    sampleNames = set()
    chroms = set()

    writer = None

    # Create a temporary file to write the tab-separated file.  Only its name is needed: the handle is closed
    # right away (it used to stay open for the life of the process) and the file is reopened for writing below.
    tempTsvFile = tempfile.NamedTemporaryFile(dir=path, delete=False)
    tempTsvFilename = tempTsvFile.name
    tempTsvFile.close()
    self.logger.debug("Creating intermediate tsv file at %s" % tempTsvFilename)

    sampleNameSelector = SampleNameSelector(self.mutation,
                                            configFile=self.configTable.getConfigFilename(),
                                            section="OTHER")

    with open(tempTsvFilename, 'w') as fptr:
        ctr = 0
        sampleNameAnnotationName = sampleNameSelector.getOutputAnnotationName()
        sampleNameSource = sampleNameSelector.getAnnotationSource()
        for mut in muts:
            sampleName = sampleNameSelector.getSampleName(mut)
            if sampleName is not None:
                if mut.get(sampleNameAnnotationName, None) is None:
                    mut.createAnnotation(sampleNameAnnotationName, sampleName, sampleNameSource)
                sampleNames.add(sampleName)

            # Parse chromosome
            chroms.add(mut.chr)

            updated_start, updated_ref_allele, updated_alt_allele = MutUtils.retrieveMutCoordinatesForRendering(mut)
            mut.ref_allele = updated_ref_allele
            mut.alt_allele = updated_alt_allele
            mut.start = updated_start

            if ctr == 0:
                # Work on a copy so that the list returned by the helper is never modified.
                fieldnames2Render = list(MutUtils.getAllAttributeNames(mut))
                if sampleNameAnnotationName is not None:
                    fieldnames2Render.append(sampleNameAnnotationName)
                # Fieldnames that start with "_" aren't rendered.  Filtering into a new list avoids the
                # remove-while-iterating bug that let every second consecutive "_" field through.
                fieldnames2Render = [fieldname for fieldname in fieldnames2Render
                                     if not fieldname.startswith("_")]

                writer = csv.DictWriter(fptr, fieldnames2Render, extrasaction='ignore',
                                        delimiter=self.delimiter, lineterminator=self.lineterminator)
                writer.writeheader()

            writer.writerow(mut)

            ctr += 1
            if (ctr % 1000) == 0:
                self.logger.info("Wrote " + str(ctr) + " mutations to tsv.")

    sampleNames = list(sampleNames)
    sampleNames.sort()
    chroms = list(chroms)

    return chroms, sampleNames, tempTsvFilename
def _writeMuts2Tsv(self, muts, path): """ Given a mutation generator, this methods writes a tab separated file for all mutations in the mutation generator. In addition, this method computes the appropriate sample name in scenarios where the mutation is missing sample name annotation. It also computes a list of all chromosomes and sample names contained within the generator. :param filename: temporary filename :param muts: generator object with mutations """ sampleNames = set() chroms = set() writer = None # create a temporary file to write tab-separated file tempTsvFile = tempfile.NamedTemporaryFile(dir=path, delete=False) self.logger.info("Creating intermediate tsv file...") sampleNameAnnotationNames = self.getAnnotationNames("SAMPLE_NAME") tumorSampleNameAnnotationNames = self.getAnnotationNames("SAMPLE_TUMOR_NAME") normalSampleNameAnnotationNames = self.getAnnotationNames("SAMPLE_NORMAL_NAME") mutAttributeNames = [] with open(tempTsvFile.name, 'w') as fptr: ctr = 0 for mut in muts: sampleName = None sampleNameAnnotationName = None if len(mutAttributeNames) == 0: mutAttributeNames = mut.getAttributeNames() # Sample name annotation is present if len(sampleNameAnnotationNames) != 0: sampleNameAnnotationName = sampleNameAnnotationNames[0] sampleName = mut[sampleNameAnnotationName] # Both, tumor and normal sample name annotations are present elif len(tumorSampleNameAnnotationNames) != 0 and len(normalSampleNameAnnotationNames) != 0: tumorSampleNameAnnotationName = tumorSampleNameAnnotationNames[0] normalSampleNameAnnotationName = normalSampleNameAnnotationNames[0] sampleName = string.join([mut[normalSampleNameAnnotationName], mut[tumorSampleNameAnnotationName]], sep="-") sampleNameAnnotationName = MutUtils.SAMPLE_NAME_ANNOTATION_NAME mut.createAnnotation(sampleNameAnnotationName, sampleName, "OUTPUT") if ctr == 0: self.logger.info("Sample name is the concatenation of %s and %s columns." 
% (normalSampleNameAnnotationName, tumorSampleNameAnnotationName)) # Only tumor sample name is present elif len(tumorSampleNameAnnotationNames) != 0: tumorSampleNameAnnotationName = tumorSampleNameAnnotationNames[0] sampleName = mut[tumorSampleNameAnnotationName] sampleNameAnnotationName = MutUtils.SAMPLE_NAME_ANNOTATION_NAME mut.createAnnotation(sampleNameAnnotationName, sampleName, "INPUT") if ctr == 0: self.logger.info("Sample name is %s column." % tumorSampleNameAnnotationName) # Only normal sample name is present elif len(normalSampleNameAnnotationNames) != 0: normalSampleNameAnnotationName = normalSampleNameAnnotationNames[0] sampleName = mut[normalSampleNameAnnotationName] sampleNameAnnotationName = MutUtils.SAMPLE_NAME_ANNOTATION_NAME mut.createAnnotation(sampleNameAnnotationName, sampleName, "INPUT") if ctr == 0: self.logger.info("Sample name is %s column." % normalSampleNameAnnotationName) if sampleName is not None: sampleNames.add(sampleName) # Parse chromosome chroms.add(mut.chr) updated_start, updated_ref_allele, updated_alt_allele = MutUtils.retrieveMutCoordinatesForRendering(mut) mut.ref_allele = updated_ref_allele mut.alt_allele = updated_alt_allele mut.start = updated_start if ctr == 0: fieldnames2Render = MutUtils.getAllAttributeNames(mut) if sampleNameAnnotationName is not None: fieldnames2Render += [sampleNameAnnotationName] for fieldname in fieldnames2Render: # fieldnames that start "_" aren't rendered if fieldname.startswith("_"): fieldnames2Render.remove(fieldname) writer = csv.DictWriter(fptr, fieldnames2Render, extrasaction='ignore', delimiter=self.delimiter, lineterminator=self.lineterminator) writer.writeheader() writer.writerow(mut) ctr += 1 if (ctr % 1000) == 0: self.logger.info("Wrote " + str(ctr) + " mutations to tsv.") sampleNames = list(sampleNames) sampleNames.sort() chroms = list(chroms) return chroms, sampleNames, tempTsvFile.name
def renderMutations(self, mutations, metadata=None, comments=None):
    """
    Render the mutations into the TCGA MAF file at ``self._filename``.

    The file starts with a ``#version`` line, then one ``## `` line per
    comment, then the column header row, then one row per mutation.  Columns
    are the configured required and optional columns (in that order), followed
    by the remaining field-map keys in sorted order.  Every mutation gets an
    'ncbi_build' annotation before it is written.

    :param mutations: iterator of mutations
    :param metadata: mapping whose keys are used as annotation names when
        there are no mutations (defaults to an empty OrderedDict)
    :param comments: list of comment strings to write after the version line
    :return: file name pointing to the maf file that is generated
    :raises Exception: any error raised while writing rows is logged and then
        re-raised unchanged
    """
    if metadata is None:
        metadata = OrderedDict()
    if comments is None:
        comments = []

    self.logger.info("TCGA MAF output file: " + self._filename)
    self.logger.info("Render starting...")

    requiredColumns = self.config.get("general", "requiredColumns").split(',')
    optionalColumns = self.config.get("general", "optionalColumns").split(',')

    # Create the header list, making sure to preserve order.
    headers = requiredColumns
    headers.extend(optionalColumns)

    # Peek at the first mutation to learn the annotation names.  next() works
    # on both Python 2.6+ and Python 3.
    m = next(mutations, None)
    if m is not None:
        annotations = MutUtils.getAllAttributeNames(m)
    else:
        # There are no mutations, so use the config file and metadata to
        # determine what columns to output.
        annotations = set(headers).union(metadata.keys())

    # Create a mapping between column name and annotation name
    fieldMap = MutUtils.createFieldsMapping(
        headers, annotations, self.alternativeDictionary,
        self.config.getboolean("general", "displayAnnotations"),
        exposedFields=self.exposedColumns, prepend=self._prepend)
    fieldMapKeys = fieldMap.keys()
    internalFields = sorted(set(fieldMapKeys).difference(headers))
    headers.extend(internalFields)

    ctr = 0
    # The context manager closes the file on every path, including failures
    # while the version/comment/header lines are being written.
    with open(self._filename, 'w') as fp:
        fp.write("#version " + self.getTcgaMafVersion() + "\n")
        for c in comments:
            fp.write("## " + c + "\n")

        # Initialize a csv DictWriter and write the column header row.
        dw = csv.DictWriter(fp, headers, delimiter="\t", lineterminator="\n")
        dw.writeheader()

        try:
            # The first mutation was already pulled off the iterator above.
            if m is not None:
                # Add the NCBI build
                m.createAnnotation('ncbi_build', self.lookupNCBI_Build(m.build), annotationSource="OUTPUT")
                self._writeMutationRow(dw, fieldMap, fieldMapKeys, m)
                ctr += 1

            for m in mutations:
                # Add the NCBI build
                m.createAnnotation('ncbi_build', self.lookupNCBI_Build(m.build), annotationSource="OUTPUT")
                self._writeMutationRow(dw, fieldMap, fieldMapKeys, m)

                # Update mutation count and log every 1000 mutations
                ctr += 1
                if (ctr % 1000) == 0:
                    self.logger.info("Rendered " + str(ctr) + " mutations.")
        except Exception:
            import traceback
            self.logger.error(traceback.format_exc())
            self.logger.error(
                "Error at mutation " + str(ctr) + " " + str([m.chr, m.start, m.end, m.ref_allele, m.alt_allele]) + ": ")
            self.logger.error("Incomplete: rendered %d mutations." % (ctr))
            # Bare raise keeps the original traceback intact.
            raise

    if self._is_entrez_id_message_logged:
        logging.getLogger(__name__).warning(
            "Some Entrez_Gene_IDs may be missing for valid Hugo Symbols in this TCGA MAF."
        )

    self.logger.info("Rendered all " + str(ctr) + " mutations.")
    return self._filename