def test_not_updating_annotation_source(self):
    """Collapsing without a new annotation source must keep the original datasource."""
    mutation = MutationDataFactory.default_create(chr="1", start="10000", end="10000")
    mutation.createAnnotation('ALT_F2R1', "|36", annotationSource="TEST")

    # update_mutation is called without new_annotation_source, so the
    # datasource recorded on the annotation is expected to stay "TEST".
    collapser = ColumnCollapser()
    collapser.update_mutation(mutation)

    datasource = mutation.getAnnotation("ALT_F2R1").getDatasource()
    self.assertEqual(datasource, "TEST")
def __init__(self, filename, configFile="tcgaMAF2.4_output.config", other_options=None):
    """
    Set up the renderer from an output path, a config file and an options dict.

    :param filename: path that is stored as the output file name.
    :param configFile: config file handed to ConfigUtils.createConfigParser.
    :param other_options: optional dict keyed by OptionConstants values; an empty dict is used when None.
    """
    options = other_options
    if options is None:
        options = dict()

    self._filename = filename
    self.logger = logging.getLogger(__name__)
    self.config = ConfigUtils.createConfigParser(configFile)

    self.logger.info("Building alternative keys dictionary...")
    self.alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)

    self.options = options

    # The configured prepend is always read; it is blanked out only when the
    # NO_PREPEND option is set.
    configured_prepend = self.config.get("general", "prepend")
    suppress_prepend = options.get(OptionConstants.NO_PREPEND, False)
    self._prepend = "" if suppress_prepend else configured_prepend

    # _is_reannotating is a flag to determine whether we should give precedence to annotations that were not
    #   annotated as part of the INPUT.
    self._is_reannotating = options.get(OptionConstants.REANNOTATE_TCGA_MAF_COLS, False)

    self._is_splitting_allelic_depth = options.get(OptionConstants.SPLIT_ALLELIC_DEPTH, True)

    exposed = self.config.get("general", "exposedColumns")
    self.exposedColumns = set(exposed.split(','))

    self._is_entrez_id_message_logged = False

    # The collapser and its backup suffix exist only when collapsing is requested.
    collapsing = options.get(OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS, False)
    self._is_collapsing_number_cols = collapsing
    if collapsing:
        self._column_collapser = ColumnCollapser()
        self._column_collapser_suffix = "_full"
    else:
        self._column_collapser = None
        self._column_collapser_suffix = None
def test_simple_collapse(self):
    """Check the collapsed values produced for simple pipe-delimited numeric annotations."""
    first_inputs = [
        ('ALT_F2R1', "34|36"),
        ('i_t_Foxog', ".509|.511"),
        ('i_tumor_f', ".200|.210"),
        ('hamilcar', "0|0"),
        ('donotcollapse', "1|45"),
    ]
    second_inputs = [
        ('ALT_F2R1', "36|38"),
        ('i_t_Foxog', ".500|.510"),
        ('i_tumor_f', ".100|.110"),
        ('hamilcar', "0.01|0"),
        ('barca', "0.02|0"),
        ('donotcollapse', "100|4500"),
    ]

    first = MutationDataFactory.default_create(chr="1", start="10000", end="10000")
    for name, raw in first_inputs:
        first.createAnnotation(name, raw)

    second = MutationDataFactory.default_create(chr="1", start="10000", end="10000")
    for name, raw in second_inputs:
        second.createAnnotation(name, raw)

    collapser = ColumnCollapser()

    collapser.update_mutation(first)
    self.assertEqual(first['ALT_F2R1'], "34")
    # Fractional annotations are compared as floats rather than as strings.
    for name, expected in [('i_t_Foxog', ".510"), ('i_tumor_f', ".205"), ('hamilcar', "0")]:
        self.assertEqual(float(first[name]), float(expected))
    self.assertEqual(first['donotcollapse'], "1|45")

    collapser.update_mutation(second)
    self.assertEqual(second['ALT_F2R1'], "36")
    for name, expected in [('i_t_Foxog', ".505"), ('i_tumor_f', ".105"),
                           ('hamilcar', "0.005"), ('barca', "0.01")]:
        self.assertEqual(float(second[name]), float(expected))
    self.assertEqual(second['donotcollapse'], "100|4500")
def test_annotation_copy(self):
    """A suffix passed to update_mutation should preserve the pre-collapse value in a backup annotation."""
    backup_name = "ALT_F2R1_full"
    mutation = MutationDataFactory.default_create(chr="1", start="10000", end="10000")
    mutation.createAnnotation('ALT_F2R1', "|36", annotationSource="TEST")

    collapser = ColumnCollapser()
    collapser.update_mutation(mutation, new_annotation_source="foo", copy_old_suffix="_full")

    # The backup carries the original value; the original name carries the collapsed one.
    self.assertEqual(mutation[backup_name], "|36")
    self.assertEqual(mutation["ALT_F2R1"], "36")

    # The backup keeps the old datasource, which must differ from the collapsed annotation's.
    self.assertEqual(mutation.getAnnotation(backup_name).getDatasource(), "TEST")
    collapsed_source = mutation.getAnnotation("ALT_F2R1").getDatasource()
    backup_source = mutation.getAnnotation(backup_name).getDatasource()
    self.assertTrue(collapsed_source != backup_source)
def test_annotation_copy_collision(self):
    """When the backup name collides with an existing annotation, the mutation's own overwrite behavior applies."""
    # Case 1: a default mutation is expected to raise on the duplicate backup name.
    strict = MutationDataFactory.default_create(chr="1", start="10000", end="10000")
    strict.createAnnotation('ALT_F2R1', "30|36", annotationSource="TEST")
    strict.createAnnotation('ALT_F2R1_full', "going_to_be_overwritten", annotationSource="TEST")

    saw_duplicate = False
    collapser = ColumnCollapser()
    try:
        collapser.update_mutation(strict, copy_old_suffix="_full")
    except DuplicateAnnotationException:
        saw_duplicate = True
    self.assertTrue(saw_duplicate, "Did not see duplicate annotation exception")

    # Case 2: a mutation created with allow_overwriting=True replaces the colliding annotation.
    lenient = MutationDataFactory.default_create(chr="1", start="10000", end="10000", allow_overwriting=True)
    lenient.createAnnotation('ALT_F2R1', "30|36", annotationSource="TEST")
    lenient.createAnnotation('ALT_F2R1_full', "going_to_be_overwritten", annotationSource="TEST")

    collapser = ColumnCollapser()
    collapser.update_mutation(lenient, copy_old_suffix="_full")
    self.assertEqual(lenient['ALT_F2R1_full'], "30|36")
    self.assertEqual(lenient['ALT_F2R1'], "30")
def test_annotation_copy(self):
    """Verify that collapsing with a copy suffix leaves a backup annotation holding the old value."""
    mut = MutationDataFactory.default_create(chr="1", start="10000", end="10000")
    mut.createAnnotation('ALT_F2R1', "|36", annotationSource="TEST")

    ColumnCollapser().update_mutation(mut, new_annotation_source="foo", copy_old_suffix="_full")

    # Values: the "_full" copy holds the raw string, the original name holds the collapsed result.
    self.assertEqual(mut["ALT_F2R1_full"], "|36")
    self.assertEqual(mut["ALT_F2R1"], "36")

    # Datasources: the copy retains "TEST" and the collapsed annotation reports something else.
    self.assertEqual(mut.getAnnotation("ALT_F2R1_full").getDatasource(), "TEST")
    sources_differ = (mut.getAnnotation("ALT_F2R1").getDatasource()
                      != mut.getAnnotation("ALT_F2R1_full").getDatasource())
    self.assertTrue(sources_differ)
def test_cannot_collapse(self):
    """Values that cannot be collapsed should not stop processing of the mutation."""
    # (annotation name, raw value, value expected after update_mutation)
    cases = [
        ('ALT_F2R1', "|36", "36"),
        ('i_t_Foxog', "|", ""),
        ('i_tumor_f', "", ""),
        ('hamilcar', "0|blah", "0|blah"),
        ('barca', "carthage_rules", "carthage_rules"),
        ('donotcollapse', "1|45", "1|45"),
    ]

    mutation = MutationDataFactory.default_create(chr="1", start="10000", end="10000")
    for name, raw, _ in cases:
        mutation.createAnnotation(name, raw)

    collapser = ColumnCollapser()
    collapser.update_mutation(mutation)

    for name, _, expected in cases:
        self.assertEqual(mutation[name], expected)
class TcgaMafOutputRenderer(OutputRenderer):
    """
    Render a generator or list of mutations into a TCGA MAF file.

    TCGA MAF specification can be found at: https://wiki.nci.nih.gov/display/TCGA/Mutation+Annotation+Format+%28MAF%29+Specification

    Version specified in the config file in the "general" section.
    """

    # Names of the two output columns produced when allelic_depth is split.
    OUTPUT_T_REF_COUNT = 't_ref_count'
    OUTPUT_T_ALT_COUNT = 't_alt_count'

    def getTcgaMafVersion(self):
        """Return the "version" option from the "general" section of the config."""
        return self.config.get("general", "version")

    def __init__(self, filename, configFile="tcgaMAF2.4_output.config", other_options=None):
        """
        Set up the renderer from an output path, a config file and an options dict.

        :param filename: path of the MAF file that renderMutations writes.
        :param configFile: config file handed to ConfigUtils.createConfigParser.
        :param other_options: optional dict keyed by OptionConstants values
            (NO_PREPEND, REANNOTATE_TCGA_MAF_COLS, SPLIT_ALLELIC_DEPTH,
            COLLAPSE_NUMBER_ANNOTATIONS are read here). An empty dict is used when None.
        """
        options = dict() if other_options is None else other_options
        self._filename = filename
        self.logger = logging.getLogger(__name__)
        self.config = ConfigUtils.createConfigParser(configFile)

        self.logger.info("Building alternative keys dictionary...")
        self.alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)

        self.options = options

        self._prepend = self.config.get("general", "prepend")
        if options.get(OptionConstants.NO_PREPEND, False):
            self._prepend = ""

        # _is_reannotating is a flag to determine whether we should give precedence to annotations that were not
        #   annotated as part of the INPUT.
        self._is_reannotating = options.get(OptionConstants.REANNOTATE_TCGA_MAF_COLS, False)

        self._is_splitting_allelic_depth = options.get(OptionConstants.SPLIT_ALLELIC_DEPTH, True)

        self.exposedColumns = set(self.config.get("general", "exposedColumns").split(','))

        # Ensures the "Entrez Gene ID was zero" warning is emitted at most once per renderer.
        self._is_entrez_id_message_logged = False

        self._is_collapsing_number_cols = options.get(OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS, False)
        self._column_collapser = None
        self._column_collapser_suffix = None
        if self._is_collapsing_number_cols:
            self._column_collapser = ColumnCollapser()
            self._column_collapser_suffix = "_full"

    def lookupNCBI_Build(self, build):
        """
        If a build number exists in the config file, use that. Otherwise, use the name specified.

        :param build: build name to look up in the "genomeBuild" config section.
        :return: the configured value for the build, or ``build`` itself when the
            section has no such option.
        """
        if not self.config.has_option("genomeBuild", build):
            return build
        # The original code performed this lookup but discarded the result, so the
        # method returned None whenever a mapping was configured.
        return self.config.get("genomeBuild", build, vars={"genomeBuild": build})

    @staticmethod
    def _split_allelic_depth(depth):
        """
        Split a comma-separated allelic depth string into (ref_count, alt_count).

        Only the first two comma-separated fields are used.

        :raises IndexError: if ``depth`` contains no comma (fewer than two fields).
        """
        vals = depth.split(",")
        return vals[0], vals[1]

    def _createMutationRow(self, m, headers, fieldMapping):
        """ Create a single mutation dictionary (i.e. render a line).  A dictionary as per the csv library.
        Headers will usually be the fieldMapping keys, but extra parameter is provided here in case subset is desired.
        Also, allows caching of the keys ahead of time.

        Annotations missing from the mutation are rendered as "__UNKNOWN__".
        """
        row = dict()
        for h in headers:
            annotation = fieldMapping[h]
            row[h] = m.get(annotation, "__UNKNOWN__")
        return row

    def _determine_new_allele_if_blank(self, d, allele_key, new_value):
        """

        :param d: dictionary of column names
        :param allele_key: key to replace if "" or does not exist.
        :param new_value: value to use if "" or does not exist
        :return: the existing value for allele_key, or new_value when that value is
            missing or blank after stripping whitespace.
        """
        result = d.get(allele_key, new_value)
        if result.strip() == "":
            result = new_value
        return result

    def _update_validation_values(self, row):
        """ If Validation_Status  == "Valid" then
                Tumor_Validation_Allele1, Tumor_Validation_Allele2, Match_Norm_Validation_Allele1, Match_Norm_Validation_Allele2 cannot  be null
            If Mutation_Status == "Somatic" and Validation_Status == "Valid", then
                Match_Norm_Validation_Allele1 == Match_Norm_Validation_Allele2 == Reference_Allele and (Tumor_Validation_Allele1 or Tumor_Validation_Allele2) != Reference_Allele

            If Validation_Status == "Invalid" then
                Tumor_Validation_Allele1, Tumor_Validation_Allele2, Match_Norm_Validation_Allele1, Match_Norm_Validation_Allele2 cannot be null AND Tumor_Validation_Allelle1 == Match_Norm_Validation_Allele1 AND Tumor_Validation_Allelle2 == Match_Norm_Validation_Allele2  (Added as a replacement for 8a as a result of breakdown)

        IMPORTANT: The input parameter is altered.

        :param row: dict with name value pairs that include the TCGA MAF columns.  This is usually not the mutation.
        """
        fill = self._determine_new_allele_if_blank
        reference = row['Reference_Allele']

        if row['Validation_Status'] == "Valid":
            # Only somatic valid mutations have their blank alleles filled in here.
            if row['Mutation_Status'] == "Somatic":
                row['Tumor_Validation_Allele1'] = fill(row, 'Tumor_Validation_Allele1', reference)
                row['Tumor_Validation_Allele2'] = fill(row, 'Tumor_Validation_Allele2', row['Tumor_Seq_Allele2'])
                row['Match_Norm_Validation_Allele1'] = fill(row, 'Match_Norm_Validation_Allele1', reference)
                row['Match_Norm_Validation_Allele2'] = fill(row, 'Match_Norm_Validation_Allele2', reference)

        elif row['Validation_Status'] == "Invalid":

            # Only valid mutation status value is None for an invalid mutation
            if row['Mutation_Status'] != "None":
                row['Mutation_Status'] = "None"

            # If the alleles are blank, populate properly for invalid mutation.  Basically, everything becomes reference
            row['Match_Norm_Validation_Allele1'] = fill(row, 'Match_Norm_Validation_Allele1', reference)
            row['Match_Norm_Validation_Allele2'] = fill(row, 'Match_Norm_Validation_Allele2', reference)
            # The tumor alleles default to the (possibly just filled) matched-normal alleles.
            row['Tumor_Validation_Allele1'] = fill(row, 'Tumor_Validation_Allele1', row['Match_Norm_Validation_Allele1'])
            row['Tumor_Validation_Allele2'] = fill(row, 'Tumor_Validation_Allele2', row['Match_Norm_Validation_Allele2'])

    def _writeMutationRow(self, dw, fieldMap, fieldMapKeys, m):
        """ If this row should be rendered, then write it to the given DictWriter

        Additionally, apply corrections needed to make this a valid TCGA MAF.

        This method must be called as a last step before writing the output, as it relies on the output row, as opposed
            to the annotated mutation.

        :param dw: DictWriter
        :param fieldMap: mapping from output column name to annotation name
        :param fieldMapKeys: output column names to render for this row
        :param m: the mutation to render
        :return:
        """
        row = self._createMutationRow(m, fieldMapKeys, fieldMap)

        # Use HGNC Entrez Gene ID, if available and nothing else has populated it.
        if row['Entrez_Gene_Id'] == "" and m.get('HGNC_Entrez Gene ID(supplied by NCBI)', "") != "":
            row['Entrez_Gene_Id'] = m.get('HGNC_Entrez Gene ID(supplied by NCBI)')

        if row['Entrez_Gene_Id'] == "":
            row['Entrez_Gene_Id'] = "0"

        if not self._is_entrez_id_message_logged and row['Entrez_Gene_Id'] == "0" and row['Hugo_Symbol'] != "Unknown":
            # Logger.warning replaces the deprecated Logger.warn alias.
            logging.getLogger(__name__).warning("Entrez Gene ID was zero, but Hugo Symbol was not Unknown.  Is the HGNC and/or Transcript datasource complete?")
            self._is_entrez_id_message_logged = True

        self._update_validation_values(row)

        # Handle the splitting of allelic depth
        depth = row.get('allelic_depth', "")
        if depth.strip() != "" and self._is_splitting_allelic_depth:
            ref_count, alt_count = self._split_allelic_depth(depth)
            row[TcgaMafOutputRenderer.OUTPUT_T_ALT_COUNT] = alt_count
            row[TcgaMafOutputRenderer.OUTPUT_T_REF_COUNT] = ref_count

        dw.writerow(row)

    def _add_output_annotations(self, m):
        """Add annotations specific to the TCGA MAF

        Creates 'ncbi_build', optionally the split allelic depth counts, and
        optionally runs the column collapser on the mutation.
        """
        m.createAnnotation('ncbi_build', self.lookupNCBI_Build(m.build), annotationSource="OUTPUT")

        depth = m.get('allelic_depth', "")
        if self._is_splitting_allelic_depth and depth.strip() != "":
            # Handle the splitting of allelic depth
            ref_count, alt_count = self._split_allelic_depth(depth)
            m.createAnnotation(TcgaMafOutputRenderer.OUTPUT_T_ALT_COUNT, alt_count, "OUTPUT")
            m.createAnnotation(TcgaMafOutputRenderer.OUTPUT_T_REF_COUNT, ref_count, "OUTPUT")

        if self._is_collapsing_number_cols:
            self._column_collapser.update_mutation(m, "OUTPUT", self._column_collapser_suffix)

    def renderMutations(self, mutations, metadata=None, comments=None):
        """ Returns a file name pointing to the maf file that is generated.

        :param mutations: iterable (generator, list, ...) of mutations to render.
        :param metadata: optional mapping whose keys are used as annotation names
            when there are no mutations; defaults to an empty OrderedDict.
        :param comments: optional list of strings written as "## " comment lines.
        :return: the output file name given at construction.
        :raises Exception: any error raised while rendering a mutation is logged
            and re-raised with its original traceback.
        """
        if metadata is None:
            metadata = OrderedDict()

        if comments is None:
            comments = []

        self.logger.info("TCGA MAF output file: " + self._filename)
        self.logger.info("Render starting...")

        requiredColumns = self.config.get("general", "requiredColumns").split(',')
        optionalColumns = self.config.get("general", "optionalColumns").split(',')

        # Create the header list, making sure to preserve order.
        headers = requiredColumns
        headers.extend(optionalColumns)

        # Accept any iterable (including a list), not only objects exposing a
        # Python 2 style .next() method.  iter() on a generator returns itself.
        mutations = iter(mutations)

        # Create a list of annotation names
        try:
            m = next(mutations)
            annotations = MutUtils.getAllAttributeNames(m)
        except StopIteration:
            # There are no mutations, so use the config file and metadata to determine what columns to output
            metadataAnnotations = metadata.keys()
            annotations = set(headers).union(metadataAnnotations)
            m = None

        # If we are splitting allelic_depth into two fields, add those to the headers.  Note that the mutations will
        #   be annotated properly later.
        if self._is_splitting_allelic_depth and "allelic_depth" in annotations:
            depth_fields = [TcgaMafOutputRenderer.OUTPUT_T_ALT_COUNT, TcgaMafOutputRenderer.OUTPUT_T_REF_COUNT]
            headers.extend(depth_fields)

        # Only populated (and only needed) when there is at least one mutation.
        field_map = dict()
        field_map_keys = []

        if m is not None:

            # Add columns for the new annotations created as part of collapsing cols
            additional_internal_columns = []
            if self._column_collapser is not None:
                additional_internal_columns = self._column_collapser.retrieve_new_annotations_added(m, self._column_collapser_suffix)

            # Create a mapping between column name and annotation name
            field_map = FieldMapCreator.create_field_map(headers, m, self.alternativeDictionary,
                                                         self.config.getboolean("general", "displayAnnotations"),
                                                         exposed_fields=self.exposedColumns, prepend=self._prepend,
                                                         deprioritize_input_annotations=self._is_reannotating,
                                                         additional_columns=additional_internal_columns)
            field_map_keys = field_map.keys()
            internal_fields = sorted(set(field_map_keys).difference(headers))
            headers.extend(internal_fields)

        ctr = 0
        # The context manager guarantees the file is closed even if writing the
        # header or a row fails.
        with open(self._filename, 'w') as fp:
            # Initialize the output file and write a header.
            fp.write("#version " + self.getTcgaMafVersion() + "\n")

            for c in comments:
                fp.write("## " + c + "\n")

            # Initialize a csv DictWriter
            # NOTE(review): an earlier comment here said headers starting with "_" are
            #   removed, but no such filtering is performed; all headers are written.
            dw = csv.DictWriter(fp, headers, delimiter="\t", lineterminator="\n")
            dw.writeheader()

            try:
                # The first mutation was already pulled off the iterator above.
                if m is not None:
                    # Add the NCBI build
                    self._add_output_annotations(m)
                    self._writeMutationRow(dw, field_map, field_map_keys, m)
                    ctr += 1

                for m in mutations:

                    # Add the NCBI build
                    self._add_output_annotations(m)
                    self._writeMutationRow(dw, field_map, field_map_keys, m)

                    # Update mutation count and log every 1000 mutations
                    ctr += 1
                    if (ctr % 1000) == 0:
                        self.logger.info("Rendered " + str(ctr) + " mutations.")
            except Exception:
                import traceback
                self.logger.error(traceback.format_exc())
                if m is not None:
                    self.logger.error("Error at mutation " + str(ctr) + " " + str([m.chr, m.start, m.end, m.ref_allele, m.alt_allele]) + ": ")
                self.logger.error("Incomplete: rendered %d mutations." % (ctr))
                # A bare raise keeps the original traceback (``raise e`` resets it on Python 2).
                raise

        if self._is_entrez_id_message_logged:
            logging.getLogger(__name__).warning("Some Entrez_Gene_IDs may be missing for valid Hugo Symbols in this TCGA MAF.")

        self.logger.info("Rendered all " + str(ctr) + " mutations.")
        return self._filename