예제 #1
0
    def renderMutations(self, mutations, metadata=None, comments=None):
        """Write every renderable mutation as a tab-separated row.

        The first mutation is pulled off the iterator up front because both
        the header file pointer and ``self.fieldMap`` are derived from it.

        :param mutations: iterator of mutations.  It is advanced with
            ``next()``, so it must be an iterator rather than a plain list.
        :param metadata: only consulted when ``mutations`` is empty, in which
            case ``metadata.asDict()`` is handed to the header writer.  It
            must not be None in that situation.
        :param comments: comment lines forwarded to the header writer;
            defaults to an empty list.
        """
        if comments is None:
            comments = []

        outputHeaders = [
            'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
            'FORMAT', 'NORMAL', 'PRIMARY'
        ]

        # next() with a default works on both Python 2.6+ and Python 3 and
        # covers the case where there are no variants at all.
        first = next(mutations, None)

        if first is not None:
            fp = self._createVcfHeaderFilePtr(comments, first)

            # Create the mapping between output fields and annotation names.
            fieldsUsed = self.alternativeDictionary.keys()
            annotations = MutUtils.getAllAttributeNames(first)
            self.fieldMap = MutUtils.createFieldsMapping(
                fieldsUsed, annotations, self.alternativeDictionary, True)
        else:
            fp = self._createVcfHeaderFilePtr(comments, metadata.asDict())

        # NOTE(review): fp is neither flushed nor closed here; presumably its
        # owner takes care of that -- confirm against _createVcfHeaderFilePtr.
        tsvWriter = csv.DictWriter(fp,
                                   outputHeaders,
                                   delimiter="\t",
                                   lineterminator="\n")

        # Write each row.  The mutation already consumed above goes through
        # exactly the same path as the rest, so an unrenderable first
        # mutation is counted like any other.
        ctr = 0
        unrenderableRows = 0
        pending = [first] if first is not None else []
        for source in (pending, mutations):
            for m in source:
                mutRow = self._createMutRow(m)

                # We may not render all rows.
                if mutRow is not None:
                    tsvWriter.writerow(mutRow)
                else:
                    unrenderableRows += 1

                ctr += 1
                if (ctr % 1000) == 0:
                    self.logger.info("Processed " + str(ctr) + " mutations")

        self.logger.info("Processed all " + str(ctr) +
                         " mutations.  Could not render: " +
                         str(unrenderableRows))
예제 #2
0
    def _determineHeaders(self, mut, metadata):
        """Return the column headers to render, excluding private ones.

        Headers are the attribute names of ``mut``.  When ``mut`` is None or
        yields no attribute names, the keys of ``metadata`` are used instead.
        Any header starting with "_" is dropped.

        :param mut: mutation to take attribute names from, or None.
        :param metadata: mapping whose keys are the fallback headers.
        :return: new list of header names without "_"-prefixed entries.
        """
        headers = [] if mut is None else MutUtils.getAllAttributeNames(mut)

        if len(headers) == 0:
            headers = metadata.keys()

        # Build a new list rather than calling remove() during iteration:
        # removing while iterating skips the element after each removal, so
        # consecutive "_" headers would slip through.  This also works when
        # keys() returns a view that has no remove() method.
        return [header for header in headers if not header.startswith("_")]
예제 #3
0
    def _determineHeaders(self, mut, metadata):
        """Work out which column headers should be rendered.

        The attribute names of ``mut`` are used as headers.  If ``mut`` is
        None, or it has no attribute names, the keys of ``metadata`` are used
        instead.  Headers that start with "_" are never returned.

        :param mut: mutation to take attribute names from, or None.
        :param metadata: mapping whose keys are the fallback headers.
        :return: new list of header names without "_"-prefixed entries.
        """
        if mut is None:
            headers = []
        else:
            headers = MutUtils.getAllAttributeNames(mut)

        if len(headers) == 0:
            headers = metadata.keys()

        # Filter into a fresh list.  Removing items from a list while looping
        # over it skips the item following each removal (two adjacent "_"
        # headers would leave the second one behind), and a keys() view has
        # no remove() at all.
        return [name for name in headers if not name.startswith("_")]
예제 #4
0
    def renderMutations(self, mutations, metadata=None, comments=None):
        """Write every renderable mutation as a tab-separated row.

        The first mutation is taken from the iterator before the main loop
        because the header file pointer and ``self.fieldMap`` are built from
        it.

        :param mutations: iterator of mutations.  It is advanced with
            ``next()``, so it must be an iterator rather than a plain list.
        :param metadata: only used when ``mutations`` is empty; in that case
            ``metadata.asDict()`` is passed to the header writer, so it must
            not be None.
        :param comments: comment lines forwarded to the header writer;
            defaults to an empty list.
        """
        if comments is None:
            comments = []

        outputHeaders = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'NORMAL', 'PRIMARY']

        # Peek at the first mutation; None means there are no variants.
        # next() with a default is valid on Python 2.6+ and Python 3 alike.
        first = next(mutations, None)

        if first is not None:
            fp = self._createVcfHeaderFilePtr(comments, first)

            # Create the mapping between output fields and annotation names.
            fieldsUsed = self.alternativeDictionary.keys()
            annotations = MutUtils.getAllAttributeNames(first)
            self.fieldMap = MutUtils.createFieldsMapping(fieldsUsed, annotations, self.alternativeDictionary, True)
        else:
            fp = self._createVcfHeaderFilePtr(comments, metadata.asDict())

        # NOTE(review): fp is neither flushed nor closed here; presumably its
        # owner takes care of that -- confirm against _createVcfHeaderFilePtr.
        tsvWriter = csv.DictWriter(fp, outputHeaders, delimiter="\t", lineterminator="\n")

        # Write each row.  Feeding the already-consumed first mutation through
        # the same loop keeps the counters consistent: an unrenderable first
        # mutation is now counted instead of silently dropped.
        ctr = 0
        unrenderableRows = 0
        pending = [first] if first is not None else []
        for source in (pending, mutations):
            for m in source:
                mutRow = self._createMutRow(m)

                # We may not render all rows.
                if mutRow is not None:
                    tsvWriter.writerow(mutRow)
                else:
                    unrenderableRows += 1

                ctr += 1
                if (ctr % 1000) == 0:
                    self.logger.info("Processed " + str(ctr) + " mutations")

        self.logger.info("Processed all " + str(ctr) + " mutations.  Could not render: " + str(unrenderableRows))
예제 #5
0
    def renderMutations(self, mutations, metadata=None, comments=None):
        """Render the mutations to a TCGA MAF file and return its file name.

        The file starts with a ``#version`` line, followed by one ``##`` line
        per comment, the tab-separated column header and one row per mutation.

        :param mutations: iterator of mutations.  It is advanced with
            ``next()``, so it must be an iterator rather than a plain list.
        :param metadata: mapping whose keys contribute to the annotation names
            when there are no mutations; defaults to an empty OrderedDict.
        :param comments: strings written as ``## `` comment lines; defaults to
            an empty list.
        :return: ``self._filename``, the path of the file that was written.
        :raises Exception: anything raised while annotating or writing a
            mutation is logged and re-raised after the file is closed.
        """
        if metadata is None:
            metadata = OrderedDict()

        if comments is None:
            comments = []

        self.logger.info("TCGA MAF output file: " + self._filename)
        self.logger.info("Render starting...")

        requiredColumns = self.config.get("general", "requiredColumns").split(',')
        optionalColumns = self.config.get("general", "optionalColumns").split(',')

        # Create the header list, making sure to preserve order.
        headers = requiredColumns + optionalColumns

        # Peek at the first mutation to learn the annotation names.  next()
        # with a default is valid on Python 2.6+ and Python 3 alike.
        first = next(mutations, None)
        if first is not None:
            annotations = MutUtils.getAllAttributeNames(first)
        else:
            # There are no mutations, so use the config file and metadata to
            # determine what columns to output.
            annotations = set(headers).union(metadata.keys())

        # If we are splitting allelic_depth into two fields, add those to the
        # headers.  Note that the mutations will be annotated properly later.
        if self._is_splitting_allelic_depth and "allelic_depth" in annotations:
            headers.extend([TcgaMafOutputRenderer.OUTPUT_T_ALT_COUNT, TcgaMafOutputRenderer.OUTPUT_T_REF_COUNT])

        # Only needed (and only set) when there is at least one mutation.
        field_map = None
        field_map_keys = None
        if first is not None:

            # Add columns for the new annotations created as part of collapsing cols
            additional_internal_columns = []
            if self._column_collapser is not None:
                additional_internal_columns = self._column_collapser.retrieve_new_annotations_added(
                    first, self._column_collapser_suffix)

            # Create a mapping between column name and annotation name
            field_map = FieldMapCreator.create_field_map(headers, first, self.alternativeDictionary,
                                                         self.config.getboolean("general", "displayAnnotations"),
                                                         exposed_fields=self.exposedColumns, prepend=self._prepend,
                                                         deprioritize_input_annotations=self._is_reannotating,
                                                         additional_columns=additional_internal_columns)

            field_map_keys = field_map.keys()
            internal_fields = sorted(set(field_map_keys).difference(headers))
            headers.extend(internal_fields)

        ctr = 0
        m = first
        # The context manager closes the file on every exit path, including a
        # failure while writing the header lines.
        with open(self._filename, 'w') as fp:
            fp.write("#version " + self.getTcgaMafVersion() + "\n")

            for c in comments:
                fp.write("## " + c + "\n")

            dw = csv.DictWriter(fp, headers, delimiter="\t", lineterminator="\n")
            dw.writeheader()

            try:
                # The mutation consumed above is rendered through the same
                # loop as the remaining ones.
                pending = [first] if first is not None else []
                for source in (pending, mutations):
                    for m in source:
                        self._add_output_annotations(m)
                        self._writeMutationRow(dw, field_map, field_map_keys, m)

                        # Update mutation count and log every 1000 mutations
                        ctr += 1
                        if (ctr % 1000) == 0:
                            self.logger.info("Rendered " + str(ctr) + " mutations.")
            except Exception:
                import traceback
                self.logger.error(traceback.format_exc())
                # Guard against m being None so that building the log message
                # cannot mask the original error.
                location = None if m is None else [m.chr, m.start, m.end, m.ref_allele, m.alt_allele]
                self.logger.error("Error at mutation " + str(ctr) + " " + str(location) + ": ")
                self.logger.error("Incomplete: rendered %d mutations." % (ctr))
                # Bare raise keeps the original traceback intact.
                raise

        if self._is_entrez_id_message_logged:
            logging.getLogger(__name__).warning("Some Entrez_Gene_IDs may be missing for valid Hugo Symbols in this TCGA MAF.")
        self.logger.info("Rendered all " + str(ctr) + " mutations.")
        return self._filename
예제 #6
0
    def _writeMuts2Tsv(self, muts, path):
        """
        Given a mutation generator, this method writes a tab separated file for all mutations in the mutation
        generator. In addition, this method computes the appropriate sample name in scenarios where the mutation is
        missing sample name annotation. It also computes a list of all chromosomes and sample names contained within
        the generator.

        :param muts: generator object with mutations
        :param path: directory in which the temporary tsv file is created
        :return: tuple of (list of chromosomes, sorted list of sample names, name of the tsv file)
        """

        sampleNames = set()
        chroms = set()

        writer = None

        # Reserve a unique file name, then release the handle immediately: the file is reopened by name below, and
        # delete=False keeps it on disk after the handle is closed.
        tempTsvFile = tempfile.NamedTemporaryFile(dir=path, delete=False)
        tempTsvFile.close()
        self.logger.debug("Creating intermediate tsv file at %s" % tempTsvFile.name)

        sampleNameSelector = SampleNameSelector(self.mutation,
                                                configFile=self.configTable.getConfigFilename(),
                                                section="OTHER")

        with open(tempTsvFile.name, 'w') as fptr:
            ctr = 0
            sampleNameAnnotationName = sampleNameSelector.getOutputAnnotationName()
            sampleNameSource = sampleNameSelector.getAnnotationSource()

            for mut in muts:
                sampleName = sampleNameSelector.getSampleName(mut)
                if sampleName is not None:
                    # Only annotate mutations that do not already carry the sample name.
                    if mut.get(sampleNameAnnotationName, None) is None:
                        mut.createAnnotation(sampleNameAnnotationName, sampleName, sampleNameSource)
                    sampleNames.add(sampleName)

                # Parse chromosome
                chroms.add(mut.chr)

                updated_start, updated_ref_allele, updated_alt_allele = MutUtils.retrieveMutCoordinatesForRendering(mut)
                mut.ref_allele = updated_ref_allele
                mut.alt_allele = updated_alt_allele
                mut.start = updated_start

                if ctr == 0:
                    # The columns are fixed by the first mutation.  Work on a copy so the list handed back by
                    # MutUtils is left untouched.
                    fieldnames2Render = list(MutUtils.getAllAttributeNames(mut))
                    if sampleNameAnnotationName is not None:
                        fieldnames2Render.append(sampleNameAnnotationName)

                    # Fieldnames that start with "_" aren't rendered.  Filtering into a new list avoids the
                    # remove-while-iterating pitfall, which skips the entry after each removed one.
                    fieldnames2Render = [fieldname for fieldname in fieldnames2Render
                                         if not fieldname.startswith("_")]

                    writer = csv.DictWriter(fptr, fieldnames2Render, extrasaction='ignore', delimiter=self.delimiter,
                                            lineterminator=self.lineterminator)
                    writer.writeheader()

                writer.writerow(mut)

                ctr += 1
                if (ctr % 1000) == 0:
                    self.logger.info("Wrote " + str(ctr) + " mutations to tsv.")

        return list(chroms), sorted(sampleNames), tempTsvFile.name
예제 #7
0
    def _writeMuts2Tsv(self, muts, path):
        """
        Given a mutation generator, this method writes a tab separated file for all mutations in the mutation
        generator. In addition, this method computes the appropriate sample name in scenarios where the mutation is
        missing sample name annotation. It also computes a list of all chromosomes and sample names contained within
        the generator.

        :param muts: generator object with mutations
        :param path: directory in which the temporary tsv file is created
        :return: tuple of (list of chromosomes, sorted list of sample names, name of the tsv file)
        """

        sampleNames = set()
        chroms = set()

        writer = None

        # Reserve a unique file name, then release the handle immediately: the file is reopened by name below, and
        # delete=False keeps it on disk after the handle is closed.
        tempTsvFile = tempfile.NamedTemporaryFile(dir=path, delete=False)
        tempTsvFile.close()
        self.logger.info("Creating intermediate tsv file...")

        sampleNameAnnotationNames = self.getAnnotationNames("SAMPLE_NAME")
        tumorSampleNameAnnotationNames = self.getAnnotationNames("SAMPLE_TUMOR_NAME")
        normalSampleNameAnnotationNames = self.getAnnotationNames("SAMPLE_NORMAL_NAME")

        with open(tempTsvFile.name, 'w') as fptr:
            ctr = 0
            for mut in muts:

                sampleName = None
                sampleNameAnnotationName = None

                # Sample name annotation is present
                if len(sampleNameAnnotationNames) != 0:
                    sampleNameAnnotationName = sampleNameAnnotationNames[0]
                    sampleName = mut[sampleNameAnnotationName]
                # Both, tumor and normal sample name annotations are present
                elif len(tumorSampleNameAnnotationNames) != 0 and len(normalSampleNameAnnotationNames) != 0:
                    tumorSampleNameAnnotationName = tumorSampleNameAnnotationNames[0]
                    normalSampleNameAnnotationName = normalSampleNameAnnotationNames[0]
                    # str.join replaces string.join, which no longer exists in Python 3.
                    sampleName = "-".join([mut[normalSampleNameAnnotationName],
                                           mut[tumorSampleNameAnnotationName]])
                    sampleNameAnnotationName = MutUtils.SAMPLE_NAME_ANNOTATION_NAME
                    mut.createAnnotation(sampleNameAnnotationName, sampleName, "OUTPUT")
                    if ctr == 0:
                        self.logger.info("Sample name is the concatenation of %s and %s columns."
                                         % (normalSampleNameAnnotationName, tumorSampleNameAnnotationName))
                # Only tumor sample name is present
                elif len(tumorSampleNameAnnotationNames) != 0:
                    tumorSampleNameAnnotationName = tumorSampleNameAnnotationNames[0]
                    sampleName = mut[tumorSampleNameAnnotationName]
                    sampleNameAnnotationName = MutUtils.SAMPLE_NAME_ANNOTATION_NAME
                    mut.createAnnotation(sampleNameAnnotationName, sampleName, "INPUT")
                    if ctr == 0:
                        self.logger.info("Sample name is %s column." % tumorSampleNameAnnotationName)
                # Only normal sample name is present
                elif len(normalSampleNameAnnotationNames) != 0:
                    normalSampleNameAnnotationName = normalSampleNameAnnotationNames[0]
                    sampleName = mut[normalSampleNameAnnotationName]
                    sampleNameAnnotationName = MutUtils.SAMPLE_NAME_ANNOTATION_NAME
                    mut.createAnnotation(sampleNameAnnotationName, sampleName, "INPUT")
                    if ctr == 0:
                        self.logger.info("Sample name is %s column." % normalSampleNameAnnotationName)

                if sampleName is not None:
                    sampleNames.add(sampleName)

                # Parse chromosome
                chroms.add(mut.chr)

                updated_start, updated_ref_allele, updated_alt_allele = MutUtils.retrieveMutCoordinatesForRendering(mut)
                mut.ref_allele = updated_ref_allele
                mut.alt_allele = updated_alt_allele
                mut.start = updated_start

                if ctr == 0:
                    # The columns are fixed by the first mutation.  Work on a copy so the list handed back by
                    # MutUtils is left untouched.
                    fieldnames2Render = list(MutUtils.getAllAttributeNames(mut))
                    if sampleNameAnnotationName is not None:
                        fieldnames2Render.append(sampleNameAnnotationName)

                    # Fieldnames that start with "_" aren't rendered.  Filtering into a new list avoids the
                    # remove-while-iterating pitfall, which skips the entry after each removed one.
                    fieldnames2Render = [fieldname for fieldname in fieldnames2Render
                                         if not fieldname.startswith("_")]

                    writer = csv.DictWriter(fptr, fieldnames2Render, extrasaction='ignore', delimiter=self.delimiter,
                                            lineterminator=self.lineterminator)
                    writer.writeheader()

                writer.writerow(mut)

                ctr += 1
                if (ctr % 1000) == 0:
                    self.logger.info("Wrote " + str(ctr) + " mutations to tsv.")

        return list(chroms), sorted(sampleNames), tempTsvFile.name
예제 #8
0
    def renderMutations(self, mutations, metadata=None, comments=None):
        """Render the mutations to a TCGA MAF file and return its file name.

        The file starts with a ``#version`` line, followed by one ``##`` line
        per comment, the tab-separated column header and one row per mutation.
        Every mutation receives an ``ncbi_build`` annotation before it is
        written.

        :param mutations: iterator of mutations.  It is advanced with
            ``next()``, so it must be an iterator rather than a plain list.
        :param metadata: mapping whose keys contribute to the annotation names
            when there are no mutations; defaults to an empty OrderedDict.
        :param comments: strings written as ``## `` comment lines; defaults to
            an empty list.
        :return: ``self._filename``, the path of the file that was written.
        :raises Exception: anything raised while annotating or writing a
            mutation is logged and re-raised after the file is closed.
        """
        if metadata is None:
            metadata = OrderedDict()

        if comments is None:
            comments = []

        self.logger.info("TCGA MAF output file: " + self._filename)
        self.logger.info("Render starting...")

        requiredColumns = self.config.get("general",
                                          "requiredColumns").split(',')
        optionalColumns = self.config.get("general",
                                          "optionalColumns").split(',')

        # Create the header list, making sure to preserve order.
        headers = requiredColumns + optionalColumns

        # Peek at the first mutation to learn the annotation names.  next()
        # with a default is valid on Python 2.6+ and Python 3 alike.
        first = next(mutations, None)
        if first is not None:
            annotations = MutUtils.getAllAttributeNames(first)
        else:
            # There are no mutations, so use the config file and metadata to
            # determine what columns to output.
            annotations = set(headers).union(metadata.keys())

        # Create a mapping between column name and annotation name
        fieldMap = MutUtils.createFieldsMapping(
            headers,
            annotations,
            self.alternativeDictionary,
            self.config.getboolean("general", "displayAnnotations"),
            exposedFields=self.exposedColumns,
            prepend=self._prepend)
        fieldMapKeys = fieldMap.keys()
        internalFields = sorted(set(fieldMapKeys).difference(headers))
        headers.extend(internalFields)

        ctr = 0
        m = first
        # The context manager closes the file on every exit path, including a
        # failure while writing the header lines.
        with open(self._filename, 'w') as fp:
            fp.write("#version " + self.getTcgaMafVersion() + "\n")

            for c in comments:
                fp.write("## " + c + "\n")

            dw = csv.DictWriter(fp, headers, delimiter="\t", lineterminator="\n")
            dw.writeheader()

            try:
                # The mutation consumed above is rendered through the same
                # loop as the remaining ones.
                pending = [first] if first is not None else []
                for source in (pending, mutations):
                    for m in source:
                        # Add the NCBI build
                        m.createAnnotation('ncbi_build',
                                           self.lookupNCBI_Build(m.build),
                                           annotationSource="OUTPUT")
                        self._writeMutationRow(dw, fieldMap, fieldMapKeys, m)

                        # Update mutation count and log every 1000 mutations
                        ctr += 1
                        if (ctr % 1000) == 0:
                            self.logger.info("Rendered " + str(ctr) + " mutations.")
            except Exception:
                import traceback
                self.logger.error(traceback.format_exc())
                # Guard against m being None so that building the log message
                # cannot mask the original error.
                location = None if m is None else [m.chr, m.start, m.end, m.ref_allele, m.alt_allele]
                self.logger.error(
                    "Error at mutation " + str(ctr) + " " + str(location) +
                    ": ")
                self.logger.error("Incomplete: rendered %d mutations." % (ctr))
                # Bare raise keeps the original traceback intact.
                raise

        if self._is_entrez_id_message_logged:
            logging.getLogger(__name__).warning(
                "Some Entrez_Gene_IDs may be missing for valid Hugo Symbols in this TCGA MAF."
            )
        self.logger.info("Rendered all " + str(ctr) + " mutations.")
        return self._filename