def createConfigTableKeys(self, configParser, configTable):
    """Copy field IDs and split/not-split tag sets from the config into configTable.

    The sections are processed in this order: INFO, FORMAT, NOT_SPLIT_TAGS,
    SPLIT_TAGS.

    :param configParser: config parser handed to the ConfigUtils dictionary builders
    :param configTable: table object that receives every parsed entry
    """
    # INFO section: register each (ID, name) pair as an INFO field ID.
    info_ids = ConfigUtils.buildReverseAlternativeDictionaryFromConfig(configParser, "INFO")
    for field_id, field_name in info_ids.items():
        configTable.addInfoFieldID(field_id, field_name)

    # FORMAT section: register each (ID, name) pair as a FORMAT field ID.
    format_ids = ConfigUtils.buildReverseAlternativeDictionaryFromConfig(configParser, "FORMAT")
    for field_id, field_name in format_ids.items():
        configTable.addFormatFieldID(field_id, field_name)

    # NOT_SPLIT_TAGS section: IDs per field type that go into the not-split set.
    not_split_ids = ConfigUtils.buildAlternateKeyDictionaryFromConfig(configParser, "NOT_SPLIT_TAGS")
    for field_type, ids in not_split_ids.items():
        configTable.addFieldIDsToNotSplitSet(field_type, ids)

    # SPLIT_TAGS section: IDs per field type that go into the split set.
    split_ids = ConfigUtils.buildAlternateKeyDictionaryFromConfig(configParser, "SPLIT_TAGS")
    for field_type, ids in split_ids.items():
        configTable.addFieldIDsToSplitSet(field_type, ids)
def createConfigTableKeys(self, configParser, configTable):
    """Copy field names, descriptions and split/not-split tag sets into configTable.

    The sections are processed in this order: INFO, FORMAT, OTHER,
    INFO_DESCRIPTION, FORMAT_DESCRIPTION, FILTER_DESCRIPTION, SPLIT_TAGS,
    NOT_SPLIT_TAGS.

    :param configParser: config parser handed to the ConfigUtils dictionary builders
    :param configTable: table object that receives every parsed entry
    """
    # Parse fields from INFO section of the config file
    table = ConfigUtils.buildReverseAlternativeDictionaryFromConfig(configParser, "INFO")
    for name, ID in table.items():
        configTable.addInfoFieldName(name, ID)

    # Parse fields from FORMAT section of the config file
    table = ConfigUtils.buildReverseAlternativeDictionaryFromConfig(configParser, "FORMAT")
    for name, ID in table.items():
        configTable.addFormatFieldName(name, ID)

    # Parse fields from OTHER section of the config file
    table = ConfigUtils.buildReverseAlternativeDictionaryFromConfig(configParser, "OTHER")
    for name, ID in table.items():
        configTable.addOtherFieldName(name, ID)

    # Description values arrive as sequences of pieces and are re-joined with
    # commas. str.join replaces string.join(seq, sep), which produces the same
    # result but no longer exists in Python 3.
    table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(configParser, "INFO_DESCRIPTION")
    for name, description in table.items():
        configTable.addInfoFieldNameDescription(name, ",".join(description))

    table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(configParser, "FORMAT_DESCRIPTION")
    for name, description in table.items():
        configTable.addFormatFieldNameDescription(name, ",".join(description))

    table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(configParser, "FILTER_DESCRIPTION")
    for name, description in table.items():
        configTable.addFilterFieldNameDescription(name, ",".join(description))

    # Parse fields from SPLIT_TAGS section of the config file
    table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(configParser, "SPLIT_TAGS")
    for fieldType, names in table.items():
        configTable.addFieldNamesToSplitSet(fieldType, names)

    # Parse fields from NOT_SPLIT_TAGS section of the config file
    table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(configParser, "NOT_SPLIT_TAGS")
    for fieldType, names in table.items():
        configTable.addFieldNamesToNotSplitSet(fieldType, names)
def __init__(self, filename, configFile="tcgaMAF2.4_output.config", other_options=None):
    """Set up the renderer from its config file and the optional option dict.

    :param filename: stored on the instance as ``_filename``
    :param configFile: config file passed to ConfigUtils.createConfigParser
    :param other_options: optional dict of OptionConstants keys; an empty dict is used when None
    """
    if other_options is None:
        options = dict()
    else:
        options = other_options

    self._filename = filename
    self.logger = logging.getLogger(__name__)
    self.config = ConfigUtils.createConfigParser(configFile)

    self.logger.info("Building alternative keys dictionary...")
    self.alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)

    self.options = options

    # The configured prepend is always read; the NO_PREPEND option then blanks it.
    prepend = self.config.get("general", "prepend")
    if options.get(OptionConstants.NO_PREPEND, False):
        prepend = ""
    self._prepend = prepend

    # _is_reannotating is a flag to determine whether we should give precedence to annotations that were not
    #   annotated as part of the INPUT.
    self._is_reannotating = options.get(OptionConstants.REANNOTATE_TCGA_MAF_COLS, False)

    self._is_splitting_allelic_depth = options.get(OptionConstants.SPLIT_ALLELIC_DEPTH, True)

    exposed_raw = self.config.get("general", "exposedColumns")
    self.exposedColumns = set(exposed_raw.split(','))

    self._is_entrez_id_message_logged = False

    # The collapser and its suffix only exist when number-column collapsing is requested.
    collapsing = options.get(OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS, False)
    self._is_collapsing_number_cols = collapsing
    self._column_collapser = None
    self._column_collapser_suffix = None
    if collapsing:
        self._column_collapser = ColumnCollapser()
        self._column_collapser_suffix = "_full"
def __init__(self, filename, configFile="tcgaVCF1.1_output.config", otherOptions=None):
    """Load the output config and initialise empty bookkeeping containers.

    :param filename: stored on the instance as ``_filename``
    :param configFile: config file passed to ConfigUtils.createConfigParser
    :param otherOptions: accepted but not read by this constructor
    """
    self._filename = filename
    self.logger = logging.getLogger(__name__)

    parser = ConfigUtils.createConfigParser(configFile)
    self.config = parser
    self.alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(parser)

    # Both containers start out empty.
    self.seenDbSNPs = {}
    self.fieldMap = dict()
def __init__(self, filename, configFile="tcgaMAF2.4_output.config", other_options=None):
    """Set up the renderer from its config file and the optional option dict.

    TODO: Need functionality for not prepending the i_ on internal fields.

    :param filename: stored on the instance as ``_filename``
    :param configFile: config file passed to ConfigUtils.createConfigParser
    :param other_options: optional dict of OptionConstants keys; an empty dict is used when None
    """
    if other_options is None:
        options = dict()
    else:
        options = other_options

    self._filename = filename
    self.logger = logging.getLogger(__name__)
    self.config = ConfigUtils.createConfigParser(configFile)

    self.logger.info("Building alternative keys dictionary...")
    self.alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)

    #TODO: Read missing options from the config file or specify that error should be thrown.
    self.options = options

    # The configured prepend is always read; the NO_PREPEND option then blanks it.
    prepend = self.config.get("general", "prepend")
    if options.get(OptionConstants.NO_PREPEND, False):
        prepend = ""
    self._prepend = prepend

    exposed_raw = self.config.get("general", "exposedColumns")
    self.exposedColumns = set(exposed_raw.split(','))

    self._is_entrez_id_message_logged = False
def __init__(self, mut, configFile="sample_name_selection.config", section="SAMPLE_NAME"):
    """Work out which annotation(s) of ``mut`` provide the sample name.

    :param mut: object passed to ``_getAnnotationFromAliases`` together with each alias list
    :param configFile: config file passed to ConfigUtils.createConfigParser
    :param section: config section the alias dictionary is built from
    """
    config = ConfigUtils.createConfigParser(configFile)
    self.logger = logging.getLogger(__name__)
    aliases = ConfigUtils.buildAlternateKeyDictionaryFromConfig(config, section)
    self.configFile = configFile

    # Resolve the three annotations in the fixed order sample, tumor, normal.
    alias_keys = ("sample_name", "sample_tumor_name", "sample_normal_name")
    sampleAnnotation, tumorAnnotation, normalAnnotation = [
        self._getAnnotationFromAliases(mut, aliases[key]) for key in alias_keys
    ]
    found = (sampleAnnotation, tumorAnnotation, normalAnnotation)

    source_column = self._getSourceColumn(*found)
    self._logSampleNameColumnDescription(source_column, *found)

    self.sampleNameGrabber = self._getSampleNameGrabber(source_column, *found)
    self.outputAnnotationName = self._deriveOutputAnnotationName(sampleAnnotation)
    self.annotationSource = self._deriveAnnotationSource(source_column)
def __init__(self, filename, mutation_data_factory=None, configFile='maflite_input.config', genomeBuild="hg19", other_options=None):
    """Open the maflite file and verify that all required headers are present.

    :param filename: maflite file handed to GenericTsvReader
    :param mutation_data_factory: forwarded unchanged to the parent constructor
    :param configFile: config file passed to ConfigUtils.createConfigParser
    :param genomeBuild: stored on the instance as ``_build``
    :param other_options: forwarded unchanged to the parent constructor
    :raises MafliteMissingRequiredHeaderException: when a required header (other than
        "build") is found neither under its own name nor under any configured alternative
    """
    super(MafliteInputMutationCreator, self).__init__(filename, mutation_data_factory, configFile, genomeBuild, other_options)

    self.logger = logging.getLogger(__name__)
    self.config = ConfigUtils.createConfigParser(configFile)
    self._tsvReader = GenericTsvReader(filename)

    # Key is the required columns and the values are a list of valid alternative headers.
    # Key is column name to an alternative.
    self._alternativeDict = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)
    self._reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(self._alternativeDict)

    required_headers = self.config.get("general", "required_headers")
    required_columns = sorted(required_headers.split(","))
    self._build = genomeBuild

    self.logger.info("Initializing a maflite file with the following header: " + str(self._tsvReader.getFieldNames()))

    # The specified fields are those that were given in the input.
    self._specified_fields = self._tsvReader.getFieldNames()

    missingRequiredHeaders = []
    for col in required_columns:
        if col in self._specified_fields:
            continue
        alternatives = self._alternativeDict.get(col, [])
        if any(alt in self._specified_fields for alt in alternatives):
            continue
        # build is optional.
        if col != "build":
            missingRequiredHeaders.append(col)
    missingRequiredHeaders.sort()

    if missingRequiredHeaders:
        raise MafliteMissingRequiredHeaderException(
            "Specified maflite file (" + filename + ") missing required headers: " + ",".join(missingRequiredHeaders))
def __init__(self, filename, configFile="tcgaVCF1.1_output.config", otherOptions=None):
    """Load the output config and initialise empty bookkeeping containers.

    :param filename: stored on the instance as ``_filename``
    :param configFile: config file passed to ConfigUtils.createConfigParser
    :param otherOptions: accepted but not read by this constructor
    """
    self._filename = filename
    self.logger = logging.getLogger(__name__)

    parser = ConfigUtils.createConfigParser(configFile)
    self.config = parser
    self.alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(parser)

    # Both containers start out empty.
    self.seenDbSNPs = {}
    self.fieldMap = dict()
def createConfigTableKeys(self, configParser, configTable):
    """Copy field names, descriptions and split/not-split tag sets into configTable.

    The sections are processed in this order: INFO, FORMAT, OTHER,
    INFO_DESCRIPTION, FORMAT_DESCRIPTION, FILTER_DESCRIPTION, SPLIT_TAGS,
    NOT_SPLIT_TAGS.

    :param configParser: config parser handed to the ConfigUtils dictionary builders
    :param configTable: table object that receives every parsed entry
    """
    # Parse fields from INFO section of the config file
    table = ConfigUtils.buildReverseAlternativeDictionaryFromConfig(configParser, "INFO")
    for name, ID in table.items():
        configTable.addInfoFieldName(name, ID)

    # Parse fields from FORMAT section of the config file
    table = ConfigUtils.buildReverseAlternativeDictionaryFromConfig(configParser, "FORMAT")
    for name, ID in table.items():
        configTable.addFormatFieldName(name, ID)

    # Parse fields from OTHER section of the config file
    table = ConfigUtils.buildReverseAlternativeDictionaryFromConfig(configParser, "OTHER")
    for name, ID in table.items():
        configTable.addOtherFieldName(name, ID)

    # Description values arrive as sequences of pieces and are re-joined with
    # commas. str.join replaces string.join(seq, sep), which produces the same
    # result but no longer exists in Python 3.
    table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(configParser, "INFO_DESCRIPTION")
    for name, description in table.items():
        configTable.addInfoFieldNameDescription(name, ",".join(description))

    table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(configParser, "FORMAT_DESCRIPTION")
    for name, description in table.items():
        configTable.addFormatFieldNameDescription(name, ",".join(description))

    table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(configParser, "FILTER_DESCRIPTION")
    for name, description in table.items():
        configTable.addFilterFieldNameDescription(name, ",".join(description))

    # Parse fields from SPLIT_TAGS section of the config file
    table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(configParser, "SPLIT_TAGS")
    for fieldType, names in table.items():
        configTable.addFieldNamesToSplitSet(fieldType, names)

    # Parse fields from NOT_SPLIT_TAGS section of the config file
    table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(configParser, "NOT_SPLIT_TAGS")
    for fieldType, names in table.items():
        configTable.addFieldNamesToNotSplitSet(fieldType, names)
def __init__(self, mut, configFile="sample_name_selection.config", section="SAMPLE_NAME"):
    """Work out which annotation(s) of ``mut`` provide the sample name.

    :param mut: object passed to ``_getAnnotationFromAliases`` together with each alias list
    :param configFile: config file passed to ConfigUtils.createConfigParser
    :param section: config section the alias dictionary is built from
    """
    config = ConfigUtils.createConfigParser(configFile)
    self.logger = logging.getLogger(__name__)
    aliases = ConfigUtils.buildAlternateKeyDictionaryFromConfig(config, section)
    self.configFile = configFile

    # Resolve the three annotations in the fixed order sample, tumor, normal.
    lookup = self._getAnnotationFromAliases
    sampleAnnotation = lookup(mut, aliases["sample_name"])
    tumorAnnotation = lookup(mut, aliases["sample_tumor_name"])
    normalAnnotation = lookup(mut, aliases["sample_normal_name"])
    annotation_args = (sampleAnnotation, tumorAnnotation, normalAnnotation)

    source_column = self._getSourceColumn(*annotation_args)
    self._logSampleNameColumnDescription(source_column, *annotation_args)

    self.sampleNameGrabber = self._getSampleNameGrabber(source_column, *annotation_args)
    self.outputAnnotationName = self._deriveOutputAnnotationName(sampleAnnotation)
    self.annotationSource = self._deriveAnnotationSource(source_column)
def __init__(self, filename, configFile='maflite_input.config', genomeBuild="hg19", other_options=None):
    """Open the maflite file and verify that all required headers are present.

    Currently, this InputCreator does not support any other options. The parameter is ignored.

    :param filename: maflite file handed to GenericTsvReader
    :param configFile: config file passed to ConfigUtils.createConfigParser
    :param genomeBuild: stored on the instance as ``_build``
    :param other_options: ignored
    :raises MafliteMissingRequiredHeaderException: when a required header (other than
        "build") is found neither under its own name nor under any configured alternative
    """
    self.logger = logging.getLogger(__name__)
    self.config = ConfigUtils.createConfigParser(configFile)
    self._tsvReader = GenericTsvReader(filename)

    # Key is the required columns and the values are a list of valid alternative headers.
    # Key is column name to an alternative.
    self._alternativeDict = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)
    self._reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(self._alternativeDict)

    specifiedFields = self._tsvReader.getFieldNames()
    required_headers = self.config.get("general", "required_headers")
    required_columns = sorted(required_headers.split(","))
    self._build = genomeBuild

    missingRequiredHeaders = []
    for col in required_columns:
        if col in specifiedFields:
            continue
        alternatives = self._alternativeDict.get(col, [])
        if any(alt in specifiedFields for alt in alternatives):
            continue
        # build is optional.
        if col != "build":
            missingRequiredHeaders.append(col)
    missingRequiredHeaders.sort()

    # The header is logged before any missing-header error is raised.
    self.logger.info("Initializing a maflite file with the following header: " + str(self._tsvReader.getFieldNames()))

    if missingRequiredHeaders:
        raise MafliteMissingRequiredHeaderException(
            "Specified maflite file (" + filename + ") missing required headers: " + ",".join(missingRequiredHeaders))
def __init__(self, filename, mutation_data_factory=None, configFile='maflite_input.config', genomeBuild="hg19", other_options=None):
    """Open the maflite file and verify that all required headers are present.

    :param filename: maflite file handed to GenericTsvReader
    :param mutation_data_factory: forwarded unchanged to the parent constructor
    :param configFile: config file passed to ConfigUtils.createConfigParser
    :param genomeBuild: stored on the instance as ``_build``
    :param other_options: forwarded unchanged to the parent constructor
    :raises MafliteMissingRequiredHeaderException: when a required header (other than
        "build") is found neither under its own name nor under any configured alternative
    """
    super(MafliteInputMutationCreator, self).__init__(filename, mutation_data_factory, configFile, genomeBuild, other_options)

    self.logger = logging.getLogger(__name__)
    self.config = ConfigUtils.createConfigParser(configFile)
    self._tsvReader = GenericTsvReader(filename)

    # Key is the required columns and the values are a list of valid alternative headers.
    # Key is column name to an alternative.
    self._alternativeDict = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)
    self._reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(self._alternativeDict)

    required_headers = self.config.get("general", "required_headers")
    required_columns = sorted(required_headers.split(","))
    self._build = genomeBuild

    self.logger.info("Initializing a maflite file with the following header: " + str(self._tsvReader.getFieldNames()))

    # The specified fields are those that were given in the input.
    self._specified_fields = self._tsvReader.getFieldNames()

    missingRequiredHeaders = []
    for col in required_columns:
        if col in self._specified_fields:
            continue
        alternatives = self._alternativeDict.get(col, [])
        if any(alt in self._specified_fields for alt in alternatives):
            continue
        # build is optional.
        if col != "build":
            missingRequiredHeaders.append(col)
    missingRequiredHeaders.sort()

    if missingRequiredHeaders:
        raise MafliteMissingRequiredHeaderException(
            "Specified maflite file (" + filename + ") missing required headers: " + ",".join(missingRequiredHeaders))
def __init__(self, filename, configFile='maflite_input.config', genomeBuild="hg19", other_options=None):
    """Open the maflite file and verify that all required headers are present.

    Currently, this InputCreator does not support any other options. The parameter is ignored.

    :param filename: maflite file handed to GenericTsvReader
    :param configFile: config file passed to ConfigUtils.createConfigParser
    :param genomeBuild: stored on the instance as ``_build``
    :param other_options: ignored
    :raises MafliteMissingRequiredHeaderException: when a required header (other than
        "build") is found neither under its own name nor under any configured alternative
    """
    self.logger = logging.getLogger(__name__)
    self.config = ConfigUtils.createConfigParser(configFile)
    self._tsvReader = GenericTsvReader(filename)

    # Key is the required columns and the values are a list of valid alternative headers.
    # Key is column name to an alternative.
    self._alternativeDict = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)
    self._reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(self._alternativeDict)

    specifiedFields = self._tsvReader.getFieldNames()
    required_headers = self.config.get("general", "required_headers")
    required_columns = sorted(required_headers.split(","))
    self._build = genomeBuild

    missingRequiredHeaders = []
    for col in required_columns:
        if col in specifiedFields:
            continue
        alternatives = self._alternativeDict.get(col, [])
        if any(alt in specifiedFields for alt in alternatives):
            continue
        # build is optional.
        if col != "build":
            missingRequiredHeaders.append(col)
    missingRequiredHeaders.sort()

    # The header is logged before any missing-header error is raised.
    self.logger.info("Initializing a maflite file with the following header: " + str(self._tsvReader.getFieldNames()))

    if missingRequiredHeaders:
        raise MafliteMissingRequiredHeaderException(
            "Specified maflite file (" + filename + ") missing required headers: " + ",".join(missingRequiredHeaders))
def renderMutations(self, segments, metadata=None, comments=None):
    """Render segments into a gene list as described in the docs for this class.

    Every gene named in a segment's "genes" annotation becomes one output row.
    A gene equal to the segment's "start_gene" or "end_gene" is keyed as
    "<gene> <exon>", using "start_exon" or "end_exon" respectively. When the
    same key is produced by several segments, the last segment wins. Rows are
    written in sorted key order as tab-separated values.

    :param segments: iterable of MutationData
    :param metadata: accepted for interface compatibility; not used by this method
    :param comments: optional list of strings, each written as a "## " line before the header
    """
    logger = logging.getLogger(__name__)
    config_parser = ConfigUtils.createConfigParser(self._config_file)
    logger.info("Building alternative keys dictionary...")
    self._alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(config_parser)

    if comments is None:
        comments = []

    # The context manager closes the file even if rendering raises part-way through.
    with open(self._filename, 'w') as fp:
        for c in comments:
            fp.write("## " + c + "\n")

        # TODO: Define constant for "genes", and other annotations
        headers = config_parser.options("alternatives")
        gene_to_segment_dict = dict()
        field_mapping = None
        num_segments = 0
        for seg in segments:
            num_segments += 1
            if field_mapping is None:
                # The field map is derived from the first segment only.
                field_mapping = FieldMapCreator.create_field_map(headers, seg, self._alternativeDictionary,
                                                                 is_render_internal_fields=True, prepend="")

            gene_list = seg['genes'].split(",")
            for g in gene_list:
                if g == seg["start_gene"]:
                    gene_to_segment_dict[g + " " + seg["start_exon"]] = seg
                elif g == seg["end_gene"]:
                    gene_to_segment_dict[g + " " + seg["end_exon"]] = seg
                else:
                    gene_to_segment_dict[g] = seg

        # Count segments explicitly: the last enumerate() index is also 0 when
        # exactly one segment was supplied, which would trigger a false message.
        if num_segments == 0:
            logger.info("No segments given.  There will be no genes in the list.")

        writer = csv.DictWriter(fp, headers, delimiter="\t", lineterminator="\n", extrasaction="ignore")
        writer.writeheader()

        logger.info("Rendering gene list...")
        all_genes_seen = sorted(gene_to_segment_dict.keys())
        num_genes = len(all_genes_seen)
        for i, gene in enumerate(all_genes_seen):
            # This next line may be slow...
            line_dict = dict()
            seg = gene_to_segment_dict[gene]
            for h in headers:
                annotation_field = field_mapping.get(h, h)
                line_dict[h] = seg.get(annotation_field, "")
            # The "gene" column always carries the (possibly exon-suffixed) key.
            line_dict["gene"] = gene
            writer.writerow(line_dict)
            if i % 1000 == 0:
                logger.info("Rendered %d/%d genes ..." % ((i + 1), num_genes))
def renderMutations(self, segments, metadata=None, comments=None):
    """Render segments into a gene list as described in the docs for this class.

    Every gene named in a segment's "genes" annotation becomes one output row.
    A gene equal to the segment's "start_gene" or "end_gene" is keyed as
    "<gene> <exon>", using "start_exon" or "end_exon" respectively. When the
    same key is produced by several segments, the last segment wins. Rows are
    written in sorted key order as tab-separated values.

    :param segments: iterable of MutationData
    :param metadata: accepted for interface compatibility; not used by this method
    :param comments: optional list of strings, each written as a "## " line before the header
    """
    logger = logging.getLogger(__name__)
    config_parser = ConfigUtils.createConfigParser(self._config_file)
    logger.info("Building alternative keys dictionary...")
    self._alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(config_parser)

    if comments is None:
        comments = []

    # The context manager closes the file even if rendering raises part-way through.
    with open(self._filename, 'w') as fp:
        for c in comments:
            fp.write("## " + c + "\n")

        # TODO: Define constant for "genes", and other annotations
        headers = config_parser.options("alternatives")
        gene_to_segment_dict = dict()
        annotations = None
        field_mapping = None
        num_segments = 0
        for seg in segments:
            num_segments += 1
            if annotations is None:
                # The field map is derived from the first segment's annotation names only.
                annotations = seg.keys()
                field_mapping = MutUtils.createFieldsMapping(headers, annotations, self._alternativeDictionary,
                                                             isRenderInternalFields=True, prepend="")

            gene_list = seg['genes'].split(",")
            for g in gene_list:
                if g == seg["start_gene"]:
                    gene_to_segment_dict[g + " " + seg["start_exon"]] = seg
                elif g == seg["end_gene"]:
                    gene_to_segment_dict[g + " " + seg["end_exon"]] = seg
                else:
                    gene_to_segment_dict[g] = seg

        # Count segments explicitly: the last enumerate() index is also 0 when
        # exactly one segment was supplied, which would trigger a false message.
        if num_segments == 0:
            logger.info("No segments given.  There will be no genes in the list.")

        writer = csv.DictWriter(fp, headers, delimiter="\t", lineterminator="\n", extrasaction="ignore")
        writer.writeheader()

        logger.info("Rendering gene list...")
        all_genes_seen = sorted(gene_to_segment_dict.keys())
        num_genes = len(all_genes_seen)
        for i, gene in enumerate(all_genes_seen):
            # This next line may be slow...
            line_dict = dict()
            seg = gene_to_segment_dict[gene]
            for h in headers:
                annotation_field = field_mapping.get(h, h)
                line_dict[h] = seg.get(annotation_field, "")
            # The "gene" column always carries the (possibly exon-suffixed) key.
            line_dict["gene"] = gene
            writer.writerow(line_dict)
            if i % 1000 == 0:
                logger.info("Rendered %d/%d genes ..." % ((i + 1), num_genes))