def __init__(self, inifile, defaults=None):
    """Constructor.

    - 'inifile': path of the .ini file to be parsed later by readfp().
    - 'defaults': passed straight through to SafeConfigParser.

    NOTE(review): this is an orphan fragment — it duplicates
    INIParser.__init__ defined below; presumably left over from an
    editing/merge mistake — confirm and remove."""
    SafeConfigParser.__init__(self, defaults)
    self.inifile = inifile
    # Make the _sections list an ordered dict, so that the section
    # names occur in order.
    self._sections = OrderedDict()
    # maps section name -> list of option names in file order
    # (filled in by optionxform of the full class)
    self.orderedKeys = {}
class INIParser(SafeConfigParser):
    """An extended form of SafeConfigParser which stores the order
    of options for each section."""

    # only options whose name stem (trailing digits stripped) is in
    # this tuple are emitted by items_as_arguments()
    ALLOWED_STREAMS = ('st', )

    def __init__(self, inifile, defaults=None):
        """Constructor.

        - 'inifile': path of the .ini file read by readfp() when no
          file handle is supplied.
        - 'defaults': passed through to SafeConfigParser."""
        SafeConfigParser.__init__(self, defaults)
        self.inifile = inifile
        # Make the _sections list an ordered dict, so that the section
        # names occur in order.
        self._sections = OrderedDict()
        # section name -> ordered list of option names (see optionxform)
        self.orderedKeys = {}

    def readfp(self, filehandle=None):
        """Parse the .ini file.

        If 'filehandle' is None, opens self.inifile.  After parsing,
        strips embedded newlines from all option values (multi-line
        continuation values are flattened)."""
        # FIX: previously a handle opened here was never closed,
        # leaking the file descriptor; close it in all cases where we
        # were the ones to open it.
        openedHere = 0
        if filehandle is None:
            filehandle = open(self.inifile, 'r')
            openedHere = 1
        try:
            SafeConfigParser.readfp(self, filehandle)
        finally:
            if openedHere:
                filehandle.close()
        for section in self._sections.keys():
            for key, value in self._sections[section].items():
                if key and value:
                    self._sections[section][key] = value.replace('\n', '')

    def optionxform(self, optionstr):
        """
        Overrides SafeConfigParser.optionxform() method to prevent
        converting option names to lowercase, and to store defined
        keys in an ordered list.

        This works because the SafeConfigParser.readfp() method
        internally calls SafeConfigParser._read() to parse the .ini
        file, and each new option is passed to optionxform() to
        process (by default converting it to lowercase).  So each time
        an option string is passed to optionxform, it happens in the
        order those options are defined in the .ini file.
        """
        # Store the current stream in the correct ordered section
        # (relies on Python 2 keys() returning a list; the last key is
        # the section currently being parsed)
        currentSection = self._sections.keys()[-1]
        try:
            self.orderedKeys[currentSection].append(optionstr)
        except KeyError:
            # first option seen for this section
            self.orderedKeys[currentSection] = [optionstr, ]
        return optionstr

    def items_as_arguments(self, section):
        """
        This method returns the ordered list of options as a list of
        properly formatted strings as the user would enter on the
        command-line
        """
        arguments = []
        for orderedKey in self.orderedKeys[section]:
            for key, value in self.items(section):
                if key == orderedKey:
                    # strip trailing digits so e.g. 'st1', 'st2' both
                    # map to the stem 'st' for the whitelist check
                    generic_key = re.sub('[0-9]*$', '', key)
                    if generic_key in self.ALLOWED_STREAMS:
                        arguments.append('--%s=%s' % (key, value))
        return arguments
class INIParser(SafeConfigParser):
    """An extended form of SafeConfigParser which stores the order
    of options for each section

    NOTE(review): this is a duplicate of the INIParser class defined
    earlier in this file — confirm which copy is live and remove the
    other."""

    # only options whose name stem (trailing digits stripped) is in
    # this tuple are emitted by items_as_arguments()
    ALLOWED_STREAMS = ('st',)

    def __init__(self, inifile, defaults=None):
        # 'inifile': path read by readfp() when no handle is supplied
        # 'defaults': passed through to SafeConfigParser
        SafeConfigParser.__init__(self, defaults)
        self.inifile = inifile
        # Make the _sections list an ordered dict, so that the section
        # names occur in order.
        self._sections = OrderedDict()
        # section name -> ordered list of option names (see optionxform)
        self.orderedKeys = {}

    def readfp(self, filehandle=None):
        # Parse the .ini file; if no handle given, open self.inifile.
        # NOTE(review): a handle opened here is never closed (fd leak).
        if filehandle is None:
            filehandle = open(self.inifile, 'r')
        SafeConfigParser.readfp(self, filehandle)
        # flatten multi-line continuation values into single lines
        for section in self._sections.keys():
            for key, value in self._sections[section].items():
                if key and value:
                    self._sections[section][key] = value.replace('\n', '')

    def optionxform(self, optionstr):
        """
        Overrides SafeConfigParser.optionxform() method to prevent
        converting option names to lowercase, and to store defined
        keys in an ordered list.

        This works because the SafeConfigParser.readfp() method
        internally calls SafeConfigParser._read() to parse the .ini
        file, and each new option is passed to optionxform() to
        process (by default converting it to lowercase).  So each time
        an option string is passed to optionxform, it happens in the
        order those options are defined in the .ini file.
        """
        # Store the current stream in the correct ordered section
        # (Python 2 only: keys() is a list; the last key is the
        # section currently being parsed)
        currentSection = self._sections.keys()[-1]
        try:
            self.orderedKeys[currentSection].append(optionstr)
        except KeyError:
            # first option seen for this section
            self.orderedKeys[currentSection] = [optionstr,]
        return optionstr

    def items_as_arguments(self, section):
        """
        This method returns the ordered list of options as a list of
        properly formatted strings as the user would enter on the
        command-line
        """
        arguments = []
        for orderedKey in self.orderedKeys[section]:
            for key, value in self.items(section):
                if key == orderedKey:
                    # strip trailing digits so e.g. 'st1', 'st2' both
                    # map to the stem 'st' for the whitelist check
                    generic_key = re.sub('[0-9]*$', '', key)
                    if generic_key in self.ALLOWED_STREAMS:
                        arguments.append('--%s=%s' % (key, value))
        return arguments
def recordSummary(data, popName, lociList): global summaryTable pattLabel = re.compile(" Label.*") pattSep = re.compile("[=].*") pattTotal = re.compile(" Totals.*") startrecording = 0 mostsigsofar = "" for line in data: if re.search(pattLabel, line): startrecording = 1 elif re.search(pattSep, line): startrecording = 0 elif re.search(pattTotal, line): totalsig = (string.split(line)[6])[8:] elif startrecording: sig = (string.split(line)[6])[8:] if len(sig) > len(mostsigsofar): mostsigsofar = sig if mostsigsofar == '': mostsigsofar = '-' if totalsig == '': totalsig = '-' # parse popName into <pop><chrom>-cases pop = popName[:-(len("-cases") + 1)] chrom = popName[-(len("-cases") + 1)] print pop, chrom, mostsigsofar, totalsig datatuple = (totalsig, mostsigsofar) # generate haplotype locus name locus = lociListSuffix(lociList) if summaryTable.has_key(chrom): if summaryTable[chrom].has_key(locus): if summaryTable[chrom][locus].has_key(pop): print "has pop key" else: summaryTable[chrom][locus][pop] = datatuple else: summaryTable[chrom][locus] = OrderedDict([pop, datatuple]) else: summaryTable[chrom] = \ OrderedDict([locus, OrderedDict([pop, datatuple])])
def _genInternalMaps(self): """Returns dictionary containing 2-tuple of column position. It is keyed by locus names originally specified in sample metadata file, the locus names (keys) are made uppercase and don't contain the allele designator. Note that this is simply a transformed _subset_ of that returned by **getSampleMap()** *For internal use only.*""" # assume there is no population column popNameCol = None # create a map that only contains non-allele fields self.nonAlleleMap = OrderedDict() self.alleleMap = OrderedDict() for key in self.sampleMap.keys(): # do we have the allele designator? if key[0] == self.alleleDesignator: # remove allele designator, only necessary # for initial splitting out of locus keys from # other fields, and also make uppercase locusKey = string.upper(key[len(self.alleleDesignator):]) self.alleleMap[locusKey] = self.sampleMap[key] elif key[0] == self.popNameDesignator: popNameCol = self.sampleMap[key] self.nonAlleleMap[key[1:]] = self.sampleMap[key] else: self.nonAlleleMap[key] = self.sampleMap[key] if popNameCol == None: self.popName = None else: # save population name self.popName = string.split(self.fileData[self.sampleFirstLine], self.separator)[popNameCol]
def _mapPopHeaders(self): """Create associations for field names and input columns. Using the header information from the top of the file, creates a dictionary for the population-level data. Also validates the file information for the correct number of fields are present on each line *For internal use only*.""" # get population header metadata popHeaderLine = string.rstrip(self.fileData[0]) # parse it self.popMap, fieldCount = self._mapFields(popHeaderLine, self.popFields) # debugging only if self.debug: print "population header line: ", popHeaderLine print self.popMap # get population data popDataLine = string.rstrip(self.fileData[1]) # debugging only if self.debug: print "population data line: ", popDataLine # make sure pop data line matches number expected from metadata popDataFields = string.split(popDataLine, self.separator) if len(popDataFields) != fieldCount: print "error: found", len(popDataFields),\ "fields expected", fieldCount, "fields" # create a dictionary using the metadata field names as key # for the population data self.popData = OrderedDict() for popField in self.popMap.keys(): self.popData[popField] = popDataFields[self.popMap[popField]]
#! /usr/bin/env python import sys, os, string, glob, re from Haplo import HaploArlequin from Utils import OrderedDict # global summary table summaryTable = OrderedDict() def stripSuffix(filename): return string.split(os.path.basename(filename), '.')[0] def lociListSuffix(lociList): suffix = "" print lociList, len(lociList) for i in lociList: print "locus:", i extra = "%02d" % i print "formatting:", extra if suffix == "": suffix = extra else: suffix = suffix + "-" + extra print "new suffix:", suffix return suffix def genArpFilename(prefix, lociList): return prefix + "-" + lociListSuffix(lociList) + ".haplo"
class ParseFile:
    """*Abstract* class for parsing a datafile.

    *Not to be instantiated.*"""
    def __init__(self,
                 filename,
                 validPopFields=None,
                 validSampleFields=None,
                 separator='\t',
                 fieldPairDesignator='_1:_2',
                 alleleDesignator='*',
                 popNameDesignator='+',
                 debug=0):
        """Constructor for ParseFile object.

        - 'filename': filename for the file to be parsed.

        - 'validPopFields': a string consisting of valid headers (one
          per line) for overall population data (no default)

        - 'validSampleFields': a string consisting of valid headers
          (one per line) for lines of sample data.  (no default)

        - 'separator': separator for adjacent fields (default: a tab
          stop, '\\t').

        - 'fieldPairDesignator': a string which consists of additions
          to the allele `stem' for fields grouped in pairs (allele
          fields) [e.g. for `HLA-A', and `HLA-A(2)', then we use
          ':(2)', for `DQA1_1' and `DQA1_2', then use use '_1:_2', the
          latter case distinguishes both fields from the stem]
          (default: '_1:_2')

        - 'alleleDesignator': The first character of the key which
          determines whether this column contains allele data.
          Defaults to '*'

        - 'popNameDesignator': The first character of the key which
          determines whether this column contains the population name.
          Defaults to '+'

        - 'debug': Switches debugging on if set to '1' (default: no
          debugging, '0')"""
        self.filename = filename
        self.validPopFields = validPopFields
        self.validSampleFields = validSampleFields
        self.debug = debug
        self.separator = separator
        self.fieldPairDesignator = fieldPairDesignator
        self.alleleDesignator = alleleDesignator
        self.popNameDesignator = popNameDesignator

        # assume no population or sample data, until supplied
        self.popData = None
        self.sampleMap = None

        # Reads and parses a given filename.
        self._sampleFileRead(self.filename)
        if self.validPopFields == None:
            # skip parsing of metadata header
            self.sampleFirstLine = 1
        else:
            # parse metadata header
            self.sampleFirstLine = 3
            # gets the .ini file information for metadata
            self.popFields = ParseFile._dbFieldsRead(self, self.validPopFields)
            if self.debug:
                # debugging only
                print self.popFields
            # parse the metadata
            self._mapPopHeaders()

        # gets the .ini file information for samples
        self.sampleFields = ParseFile._dbFieldsRead(self, self.validSampleFields)
        if self.debug:
            print self.sampleFields
        # always parse the samples, they must always exist!
        self._mapSampleHeaders()

    def _dbFieldsRead(self, data):
        """Reads the valid key, value pairs.

        Takes a string that is expected to consist of database field
        names separated by newlines.

        Returns a tuple of field names.

        *For internal use only.*"""
        li = []
        for line in string.split(data, os.linesep):
            if self.debug:
                print string.rstrip(line)
            li.append(string.rstrip(line))
        return tuple(li)

    def _mapFields(self, line, fieldList):
        """Creates a list of valid database fields.

        From a separator delimited string, creates a list of valid
        fields and creates a dictionary of positions keyed by valid
        field names.

        - Complains if a field name is not valid.

        - Complains if the correct number of fields are not found for
          the metadata headers.

        Returns a 2-tuple:

        - a dictionary keyed by field name.

        - the total number of metadata fields.

        *For internal use only.*"""
        # split line
        fields = line.split(self.separator)

        # check to see if the correct number of fields found
        if len(fields) != len(fieldList) and self.debug:
            print "warning: found", len(fields), "fields expected", \
                  len(fieldList), "fields"

        i = 0
        assoc = OrderedDict()
        for field in fields:
            # strip the field of leading and trailing blanks because
            # column name may inadvertantly contain these due to
            # spreadsheet -> tab-delimited file format idiosyncrasies
            field = string.strip(field)

            # check to see whether field is a valid key, and generate
            # the appropriate identifier, this is done as method call
            # so it can overwritten in subclasses of this abstract
            # class (i.e. the subclass will have 'knowledge' about the
            # nature of fields, but not this abstract class)

            # If an asterisk character is given as the first item in
            # the valid fields list, then accept any field name (ie,
            # locus name) as valid.  This makes sense only in the
            # allelecount file context.
            if fieldList[0] == "*":
                isValidKey, key = (1, field)
            else:
                isValidKey, key = self.genValidKey(field, fieldList)

            if isValidKey:
                # if key is one of pair already in map, add it to make
                # a tuple at that key e.g. `HLA-A(2)' already exists
                # and inserting `HLA-A', or `DQB1_1' and `DQB1_2' should
                # both be inserted at `DQB1'
                if assoc.has_key(key):
                    assoc[key] = assoc[key], i
                else:
                    assoc[key] = i
            elif self.debug:
                print "warning: field name `%s' not valid" % field

            i = i + 1

        return assoc, i

    def _sampleFileRead(self, filename):
        """Reads filename into object.

        Takes a filename and reads the file data into an instance
        variable.

        *For internal use only*.
        """
        # NOTE(review): file handle is never explicitly closed here
        f = open(filename, 'r')
        self.fileData = f.readlines()

    def _mapPopHeaders(self):
        """Create associations for field names and input columns.

        Using the header information from the top of the file, creates
        a dictionary for the population-level data.

        Also validates the file information for the correct number of
        fields are present on each line

        *For internal use only*."""
        # get population header metadata (first line of the file)
        popHeaderLine = string.rstrip(self.fileData[0])

        # parse it
        self.popMap, fieldCount = self._mapFields(popHeaderLine, self.popFields)

        # debugging only
        if self.debug:
            print "population header line: ", popHeaderLine
            print self.popMap

        # get population data (second line of the file)
        popDataLine = string.rstrip(self.fileData[1])
        # debugging only
        if self.debug:
            print "population data line: ", popDataLine

        # make sure pop data line matches number expected from metadata
        popDataFields = string.split(popDataLine, self.separator)
        if len(popDataFields) != fieldCount:
            print "error: found", len(popDataFields),\
                  "fields expected", fieldCount, "fields"

        # create a dictionary using the metadata field names as key
        # for the population data
        self.popData = OrderedDict()
        for popField in self.popMap.keys():
            self.popData[popField] = popDataFields[self.popMap[popField]]

    def _mapSampleHeaders(self):
        """Create the associations between field names and input columns.

        Using the header information from the top of the file, creates
        associations for the sample data fields.

        Also validates the file information for the correct number of
        fields are present on each line

        *For internal use only*."""
        # get sample header metadata
        sampleHeaderLine = string.rstrip(self.fileData[self.sampleFirstLine - 1])

        # parse it
        self.sampleMap, fieldCount = self._mapFields(sampleHeaderLine,
                                                     self.sampleFields)
        # debugging only
        if self.debug:
            print "sample header line: ", sampleHeaderLine
            print self.sampleMap

        # check file data to see that correct number of fields are
        # present for each sample
        for lineCount in range(self.sampleFirstLine, len(self.fileData)):
            # retrieve and strip newline
            line = string.rstrip(self.fileData[lineCount])

            # restore the data with the newline stripped
            self.fileData[lineCount] = line

            fields = string.split(line, self.separator)
            if fieldCount != len(fields):
                print "error: incorrect number of fields:", len(fields), \
                      "found, should have:", fieldCount, \
                      "\noffending line is:\n", line

    def getPopData(self):
        """Returns a dictionary of population data.

        Dictionary is keyed by types specified in population metadata
        file"""
        return self.popData

    def getSampleMap(self):
        """Returns dictionary of sample data.

        Each dictionary position contains either a 2-tuple of column
        position or a single column position keyed by field originally
        specified in sample metadata file"""
        return self.sampleMap

    def getFileData(self):
        """Returns file data.

        Returns a 2-tuple `wrapper':

        - raw sample lines, *without* header metadata.

        - the field separator."""
        return self.fileData[self.sampleFirstLine:], self.separator

    def genSampleOutput(self, fieldList):
        """Prints the data specified in ordered field list.

        *Use is currently deprecated.*"""
        #for field in fieldList:
        #print string.strip(field) + self.separator,
        for lineCount in range(self.sampleFirstLine, len(self.fileData)):
            line = string.strip(self.fileData[lineCount])
            element = string.split(line, self.separator)
            for field in fieldList:
                if self.sampleMap.has_key(field):
                    print element[self.sampleMap[field]],
                else:
                    print "can't find this field"
            print "\n"

    def serializeMetadataTo(self, stream):
        """Serialize the population metadata to an XML output stream.

        Writes a <populationdata> element containing one tag per
        population metadata entry, then delegates to the subclass via
        serializeSubclassMetadataTo()."""
        # NOTE(review): 'type' shadows the builtin and is unused here;
        # getStreamType is defined elsewhere in the project
        type = getStreamType(stream)

        stream.opentag('populationdata')
        stream.writeln()

        if self.popData:
            for summary in self.popData.keys():
                # convert metadata name into a XML tag name
                tagname = string.lower(string.replace(summary, ' ', '-'))
                stream.tagContents(tagname, self.popData[summary])
                stream.writeln()

        # call subclass-specific metadata serialization
        self.serializeSubclassMetadataTo(stream)

        stream.closetag('populationdata')
        stream.writeln()
class ParseGenotypeFile(ParseFile):
    """Class to parse standard datafile in genotype form."""
    def __init__(self,
                 filename,
                 untypedAllele='****',
                 **kw):
        """Constructor for ParseGenotypeFile.

        - 'filename': filename for the file to be parsed.

        In addition to the arguments for the base class, this class
        accepts the following additional keywords:

        - 'untypedAllele': The designator for an untyped locus.
          Defaults to '****'.
        """
        self.untypedAllele = untypedAllele

        # base-class constructor reads and parses the file
        ParseFile.__init__(self, filename, **kw)

        self._genDataStructures()

    def _genInternalMaps(self):
        """Returns dictionary containing 2-tuple of column position.

        It is keyed by locus names originally specified in sample
        metadata file, the locus names (keys) are made uppercase and
        don't contain the allele designator.

        Note that this is simply a transformed _subset_ of that
        returned by **getSampleMap()**

        *For internal use only.*"""
        # assume there is no population column
        popNameCol = None

        # create a map that only contains non-allele fields
        self.nonAlleleMap = OrderedDict()
        self.alleleMap = OrderedDict()
        for key in self.sampleMap.keys():
            # do we have the allele designator?
            if key[0] == self.alleleDesignator:
                # remove allele designator, only necessary
                # for initial splitting out of locus keys from
                # other fields, and also make uppercase
                locusKey = string.upper(key[len(self.alleleDesignator):])
                self.alleleMap[locusKey] = self.sampleMap[key]
            elif key[0] == self.popNameDesignator:
                # population-name column: record its position and keep
                # it (designator stripped) in the non-allele map too
                popNameCol = self.sampleMap[key]
                self.nonAlleleMap[key[1:]] = self.sampleMap[key]
            else:
                self.nonAlleleMap[key] = self.sampleMap[key]

        if popNameCol == None:
            self.popName = None
        else:
            # save population name (taken from the first sample line)
            self.popName = string.split(self.fileData[self.sampleFirstLine],
                                        self.separator)[popNameCol]

    def _genDataStructures(self):
        """Generates matrix only

        *For internal use only.*"""
        # generate alleleMap and population field name
        self._genInternalMaps()

        sampleDataLines, separator = self.getFileData()

        if self.debug:
            print 'sampleMap keys:', self.sampleMap.keys()
            print 'sampleMap values:', self.sampleMap.values()
            print 'first line of data', sampleDataLines[0]

        # then total number of individuals in data file
        self.totalIndivCount = len(sampleDataLines)

        # total number of loci contained in original file
        self.totalLocusCount = len(self.alleleMap)

        # freeze the list of locusKeys in a particular order
        self.locusKeys = self.alleleMap.keys()

        # freeze list of non-allel data
        self.extraKeys = self.nonAlleleMap.keys()

        # create an empty-list of lists to store all the row data
        #self.individualsList = [[] for line in range(0, self.totalIndivCount)]

        # StringMatrix is a project class defined elsewhere
        self.matrix = StringMatrix(self.totalIndivCount,
                                   self.locusKeys,
                                   self.extraKeys,
                                   self.separator,
                                   self.fileData[:self.sampleFirstLine - 1])

        rowCount = 0
        # store all the non-allele meta-data
        for line in sampleDataLines:
            fields = string.split(line, self.separator)
            for key in self.nonAlleleMap.keys():
                self.matrix[rowCount, key] = fields[self.nonAlleleMap[key]]
            rowCount += 1

        if self.debug:
            print "before filling matrix with allele data"
            print self.matrix

        # fill the matrix with the allele pair for each locus
        for locus in self.locusKeys:
            if self.debug:
                print "locus name:", locus
                print "column tuple:", self.alleleMap[locus]

            # alleleMap[locus] is expected to be a 2-tuple of column
            # positions (built by _mapFields pairing)
            col1, col2 = self.alleleMap[locus]

            # re-initialise the row count on each iteration of the locus
            rowCount = 0
            for line in sampleDataLines:
                fields = string.split(line, self.separator)

                # create data structures

                allele1 = string.strip(fields[col1])
                allele2 = string.strip(fields[col2])

                # underlying NumPy array data type won't allow storage
                # of any sequence-type object (e.g. list or tuple) but
                # we can workaround this by overriding the __setitem__
                # method of the UserArray wrapper class used for
                # subtyping and storing tuple internally as two
                # separate columns in the underlying array.
                self.matrix[rowCount, locus] = (allele1, allele2)

                if self.debug:
                    print rowCount, self.matrix[rowCount, locus]

                # increment row count
                rowCount += 1

    def genValidKey(self, field, fieldList):
        """Check and validate key.

        - 'field': string with field name.

        - 'fieldList': a dictionary of valid fields.

        Check to see whether 'field' is a valid key, and generate the
        appropriate 'key'.  Returns a 2-tuple consisting of
        'isValidKey' boolean and the 'key'.

        *Note: this is explicitly done in the subclass of the abstract
        'ParseFile' class (i.e. since this subclass should have
        `knowledge' about the nature of fields, but the abstract class
        should not have)*"""
        # a field is valid if it appears literally, or with the
        # allele designator prefix, or with the pop-name prefix
        if (field in fieldList) or \
           (self.alleleDesignator + field in fieldList):
            isValidKey = 1
        else:
            if self.popNameDesignator + field in fieldList:
                isValidKey = 1
            else:
                isValidKey = 0

        # generate the key that matches the one in the data file
        # format

        # if this is an `allele'-type field
        if self.alleleDesignator + field in fieldList:
            li = string.split(self.fieldPairDesignator, ":")

            # if pair identifiers are both the same length and
            # non-zero (e.g. '_1' and '_2', then we can assume that
            # the underlying `stem' should be the field name with the
            # pair identifer stripped off, otherwise simply use the
            # field name
            if (len(li[0]) == len(li[1])) and (len(li[0]) != 0):
                key = self.alleleDesignator + field[:-len(li[0])]
            else:
                key = self.alleleDesignator + field
        else:
            # this is the population field name
            if self.popNameDesignator + field in fieldList:
                key = self.popNameDesignator + field
            else:
                # this is a regular (non-`allele' type field)
                key = field

        if self.debug:
            print "validKey: %d, key: %s" % (isValidKey, key)

        return isValidKey, key

    def getMatrix(self):
        """Returns the genotype data.

        Returns the genotype data in a 'StringMatrix' NumPy array.
        """
        return self.matrix

    def serializeSubclassMetadataTo(self, stream):
        """Serialize subclass-specific metadata."""
        if self.popName:
            # if present in input , print population name
            stream.tagContents('popname', self.popName)
            stream.writeln()
def _mapFields(self, line, fieldList): """Creates a list of valid database fields. From a separator delimited string, creates a list of valid fields and creates a dictionary of positions keyed by valid field names. - Complains if a field name is not valid. - Complains if the correct number of fields are not found for the metadata headers. Returns a 2-tuple: - a dictionary keyed by field name. - the total number of metadata fields. *For internal use only.*""" # split line fields = line.split(self.separator) # check to see if the correct number of fields found if len(fields) != len(fieldList) and self.debug: print "warning: found", len(fields), "fields expected", \ len(fieldList), "fields" i = 0 assoc = OrderedDict() for field in fields: # strip the field of leading and trailing blanks because # column name may inadvertantly contain these due to # spreadsheet -> tab-delimited file format idiosyncrasies field = string.strip(field) # check to see whether field is a valid key, and generate # the appropriate identifier, this is done as method call # so it can overwritten in subclasses of this abstract # class (i.e. the subclass will have 'knowledge' about the # nature of fields, but not this abstract class) # If an asterisk character is given as the first item in # the valid fields list, then accept any field name (ie, # locus name) as valid. This makes sense only in the # allelecount file context. if fieldList[0] == "*": isValidKey, key = (1, field) else: isValidKey, key = self.genValidKey(field, fieldList) if isValidKey: # if key is one of pair already in map, add it to make # a tuple at that key e.g. `HLA-A(2)' already exists # and inserting `HLA-A', or `DQB1_1' and `DQB1_2' should # both be inserted at `DQB1' if assoc.has_key(key): assoc[key] = assoc[key], i else: assoc[key] = i elif self.debug: print "warning: field name `%s' not valid" % field i = i + 1 return assoc, i
class ParseFile:
    """*Abstract* class for parsing a datafile.

    *Not to be instantiated.*

    NOTE(review): this is a duplicate of the ParseFile class defined
    earlier in this file — confirm which copy is live and remove the
    other."""
    def __init__(self,
                 filename,
                 validPopFields=None,
                 validSampleFields=None,
                 separator='\t',
                 fieldPairDesignator='_1:_2',
                 alleleDesignator='*',
                 popNameDesignator='+',
                 debug=0):
        """Constructor for ParseFile object.

        - 'filename': filename for the file to be parsed.

        - 'validPopFields': a string consisting of valid headers (one
          per line) for overall population data (no default)

        - 'validSampleFields': a string consisting of valid headers
          (one per line) for lines of sample data.  (no default)

        - 'separator': separator for adjacent fields (default: a tab
          stop, '\\t').

        - 'fieldPairDesignator': a string which consists of additions
          to the allele `stem' for fields grouped in pairs (allele
          fields) [e.g. for `HLA-A', and `HLA-A(2)', then we use
          ':(2)', for `DQA1_1' and `DQA1_2', then use use '_1:_2', the
          latter case distinguishes both fields from the stem]
          (default: '_1:_2')

        - 'alleleDesignator': The first character of the key which
          determines whether this column contains allele data.
          Defaults to '*'

        - 'popNameDesignator': The first character of the key which
          determines whether this column contains the population name.
          Defaults to '+'

        - 'debug': Switches debugging on if set to '1' (default: no
          debugging, '0')"""
        self.filename = filename
        self.validPopFields = validPopFields
        self.validSampleFields = validSampleFields
        self.debug = debug
        self.separator = separator
        self.fieldPairDesignator = fieldPairDesignator
        self.alleleDesignator = alleleDesignator
        self.popNameDesignator = popNameDesignator

        # assume no population or sample data, until supplied
        self.popData = None
        self.sampleMap = None

        # Reads and parses a given filename.
        self._sampleFileRead(self.filename)
        if self.validPopFields == None:
            # skip parsing of metadata header
            self.sampleFirstLine = 1
        else:
            # parse metadata header
            self.sampleFirstLine = 3
            # gets the .ini file information for metadata
            self.popFields = ParseFile._dbFieldsRead(self, self.validPopFields)
            if self.debug:
                # debugging only
                print self.popFields
            # parse the metadata
            self._mapPopHeaders()

        # gets the .ini file information for samples
        self.sampleFields = ParseFile._dbFieldsRead(self, self.validSampleFields)
        if self.debug:
            print self.sampleFields
        # always parse the samples, they must always exist!
        self._mapSampleHeaders()

    def _dbFieldsRead(self, data):
        """Reads the valid key, value pairs.

        Takes a string that is expected to consist of database field
        names separated by newlines.

        Returns a tuple of field names.

        *For internal use only.*"""
        li = []
        for line in string.split(data, os.linesep):
            if self.debug:
                print string.rstrip(line)
            li.append(string.rstrip(line))
        return tuple(li)

    def _mapFields(self, line, fieldList):
        """Creates a list of valid database fields.

        From a separator delimited string, creates a list of valid
        fields and creates a dictionary of positions keyed by valid
        field names.

        - Complains if a field name is not valid.

        - Complains if the correct number of fields are not found for
          the metadata headers.

        Returns a 2-tuple:

        - a dictionary keyed by field name.

        - the total number of metadata fields.

        *For internal use only.*"""
        # split line
        fields = line.split(self.separator)

        # check to see if the correct number of fields found
        if len(fields) != len(fieldList) and self.debug:
            print "warning: found", len(fields), "fields expected", \
                  len(fieldList), "fields"

        i = 0
        assoc = OrderedDict()
        for field in fields:
            # strip the field of leading and trailing blanks because
            # column name may inadvertantly contain these due to
            # spreadsheet -> tab-delimited file format idiosyncrasies
            field = string.strip(field)

            # check to see whether field is a valid key, and generate
            # the appropriate identifier, this is done as method call
            # so it can overwritten in subclasses of this abstract
            # class (i.e. the subclass will have 'knowledge' about the
            # nature of fields, but not this abstract class)

            # If an asterisk character is given as the first item in
            # the valid fields list, then accept any field name (ie,
            # locus name) as valid.  This makes sense only in the
            # allelecount file context.
            if fieldList[0] == "*":
                isValidKey, key = (1, field)
            else:
                isValidKey, key = self.genValidKey(field, fieldList)

            if isValidKey:
                # if key is one of pair already in map, add it to make
                # a tuple at that key e.g. `HLA-A(2)' already exists
                # and inserting `HLA-A', or `DQB1_1' and `DQB1_2' should
                # both be inserted at `DQB1'
                if assoc.has_key(key):
                    assoc[key] = assoc[key], i
                else:
                    assoc[key] = i
            elif self.debug:
                print "warning: field name `%s' not valid" % field

            i = i + 1

        return assoc, i

    def _sampleFileRead(self, filename):
        """Reads filename into object.

        Takes a filename and reads the file data into an instance
        variable.

        *For internal use only*.
        """
        # NOTE(review): file handle is never explicitly closed here
        f = open(filename, 'r')
        self.fileData = f.readlines()

    def _mapPopHeaders(self):
        """Create associations for field names and input columns.

        Using the header information from the top of the file, creates
        a dictionary for the population-level data.

        Also validates the file information for the correct number of
        fields are present on each line

        *For internal use only*."""
        # get population header metadata (first line of the file)
        popHeaderLine = string.rstrip(self.fileData[0])

        # parse it
        self.popMap, fieldCount = self._mapFields(popHeaderLine, self.popFields)

        # debugging only
        if self.debug:
            print "population header line: ", popHeaderLine
            print self.popMap

        # get population data (second line of the file)
        popDataLine = string.rstrip(self.fileData[1])
        # debugging only
        if self.debug:
            print "population data line: ", popDataLine

        # make sure pop data line matches number expected from metadata
        popDataFields = string.split(popDataLine, self.separator)
        if len(popDataFields) != fieldCount:
            print "error: found", len(popDataFields),\
                  "fields expected", fieldCount, "fields"

        # create a dictionary using the metadata field names as key
        # for the population data
        self.popData = OrderedDict()
        for popField in self.popMap.keys():
            self.popData[popField] = popDataFields[self.popMap[popField]]

    def _mapSampleHeaders(self):
        """Create the associations between field names and input columns.

        Using the header information from the top of the file, creates
        associations for the sample data fields.

        Also validates the file information for the correct number of
        fields are present on each line

        *For internal use only*."""
        # get sample header metadata
        sampleHeaderLine = string.rstrip(self.fileData[self.sampleFirstLine-1])

        # parse it
        self.sampleMap, fieldCount = self._mapFields(sampleHeaderLine,
                                                     self.sampleFields)
        # debugging only
        if self.debug:
            print "sample header line: ", sampleHeaderLine
            print self.sampleMap

        # check file data to see that correct number of fields are
        # present for each sample
        for lineCount in range(self.sampleFirstLine, len(self.fileData)):
            # retrieve and strip newline
            line = string.rstrip(self.fileData[lineCount])

            # restore the data with the newline stripped
            self.fileData[lineCount] = line

            fields = string.split(line, self.separator)
            if fieldCount != len(fields):
                print "error: incorrect number of fields:", len(fields), \
                      "found, should have:", fieldCount, \
                      "\noffending line is:\n", line

    def getPopData(self):
        """Returns a dictionary of population data.

        Dictionary is keyed by types specified in population metadata
        file"""
        return self.popData

    def getSampleMap(self):
        """Returns dictionary of sample data.

        Each dictionary position contains either a 2-tuple of column
        position or a single column position keyed by field originally
        specified in sample metadata file"""
        return self.sampleMap

    def getFileData(self):
        """Returns file data.

        Returns a 2-tuple `wrapper':

        - raw sample lines, *without* header metadata.

        - the field separator."""
        return self.fileData[self.sampleFirstLine:], self.separator

    def genSampleOutput(self, fieldList):
        """Prints the data specified in ordered field list.

        *Use is currently deprecated.*"""
        #for field in fieldList:
        #print string.strip(field) + self.separator,
        for lineCount in range(self.sampleFirstLine, len(self.fileData)):
            line = string.strip(self.fileData[lineCount])
            element = string.split(line, self.separator)
            for field in fieldList:
                if self.sampleMap.has_key(field):
                    print element[self.sampleMap[field]],
                else:
                    print "can't find this field"
            print "\n"

    def serializeMetadataTo(self, stream):
        """Serialize the population metadata to an XML output stream.

        Writes a <populationdata> element containing one tag per
        population metadata entry, then delegates to the subclass via
        serializeSubclassMetadataTo()."""
        # NOTE(review): 'type' shadows the builtin and is unused here;
        # getStreamType is defined elsewhere in the project
        type = getStreamType(stream)

        stream.opentag('populationdata')
        stream.writeln()

        if self.popData:
            for summary in self.popData.keys():
                # convert metadata name into a XML tag name
                tagname = string.lower(string.replace(summary,' ','-'))
                stream.tagContents(tagname, self.popData[summary])
                stream.writeln()

        # call subclass-specific metadata serialization
        self.serializeSubclassMetadataTo(stream)

        stream.closetag('populationdata')
        stream.writeln()
class ParseGenotypeFile(ParseFile):
    """Class to parse standard datafile in genotype form."""

    def __init__(self, filename, untypedAllele='****', **kw):
        """Constructor for ParseGenotypeFile.

        - 'filename': filename for the file to be parsed.

        In addition to the arguments for the base class, this class
        accepts the following additional keywords:

        - 'untypedAllele': The designator for an untyped locus.
          Defaults to '****'.
        """
        self.untypedAllele=untypedAllele

        # base-class constructor reads and validates the file
        ParseFile.__init__(self, filename, **kw)

        # build the allele maps and the genotype matrix
        self._genDataStructures()

    def _genInternalMaps(self):
        """Returns dictionary containing 2-tuple of column position.

        It is keyed by locus names originally specified in sample
        metadata file, the locus names (keys) are made uppercase and
        don't contain the allele designator.

        Note that this is simply a transformed _subset_ of that
        returned by **getSampleMap()**

        *For internal use only.*"""

        # assume there is no population column
        popNameCol = None

        # create a map that only contains non-allele fields
        self.nonAlleleMap = OrderedDict()
        self.alleleMap = OrderedDict()

        for key in self.sampleMap.keys():
            # do we have the allele designator?
            if key[0] == self.alleleDesignator:
                # remove allele designator, only necessary
                # for initial splitting out of locus keys from
                # other fields, and also make uppercase
                locusKey = string.upper(key[len(self.alleleDesignator):])
                self.alleleMap[locusKey] = self.sampleMap[key]
            elif key[0] == self.popNameDesignator:
                # population-name column: strip the designator for the
                # non-allele map but remember its column position
                popNameCol = self.sampleMap[key]
                self.nonAlleleMap[key[1:]] = self.sampleMap[key]
            else:
                self.nonAlleleMap[key] = self.sampleMap[key]

        # NOTE(review): '== None' works but 'is None' is the idiomatic test
        if popNameCol == None:
            self.popName = None
        else:
            # save population name, taken from the first data line
            self.popName = string.split(self.fileData[self.sampleFirstLine], self.separator)[popNameCol]

    def _genDataStructures(self):
        """Generates matrix only

        *For internal use only.*"""

        # generate alleleMap and population field name
        self._genInternalMaps()

        sampleDataLines, separator = self.getFileData()

        if self.debug:
            print 'sampleMap keys:', self.sampleMap.keys()
            print 'sampleMap values:', self.sampleMap.values()
            print 'first line of data', sampleDataLines[0]

        # then total number of individuals in data file
        self.totalIndivCount = len(sampleDataLines)

        # total number of loci contained in original file
        self.totalLocusCount = len(self.alleleMap)

        # freeze the list of locusKeys in a particular order
        self.locusKeys = self.alleleMap.keys()

        # freeze list of non-allel data
        self.extraKeys = self.nonAlleleMap.keys()

        # create an empty-list of lists to store all the row data
        #self.individualsList = [[] for line in range(0, self.totalIndivCount)]

        # one row per individual; header metadata lines are passed through
        self.matrix = StringMatrix(self.totalIndivCount, self.locusKeys, self.extraKeys, self.separator, self.fileData[:self.sampleFirstLine-1])

        rowCount = 0
        # store all the non-allele meta-data
        for line in sampleDataLines:
            fields = string.split(line, self.separator)
            for key in self.nonAlleleMap.keys():
                self.matrix[rowCount, key] = fields[self.nonAlleleMap[key]]
            rowCount += 1

        if self.debug:
            print "before filling matrix with allele data"
            print self.matrix

        # fill the matrix column-pair by column-pair, one locus at a time
        for locus in self.locusKeys:
            if self.debug:
                print "locus name:", locus
                print "column tuple:", self.alleleMap[locus]

            # each locus maps to a 2-tuple of column positions
            col1, col2 = self.alleleMap[locus]

            # re-initialise the row count on each iteration of the locus
            rowCount = 0
            for line in sampleDataLines:
                fields = string.split(line, self.separator)

                # create data structures
                allele1 = string.strip(fields[col1])
                allele2 = string.strip(fields[col2])

                # underlying NumPy array data type won't allow storage
                # of any sequence-type object (e.g. list or tuple) but
                # we can workaround this by overriding the __setitem__
                # method of the UserArray wrapper class used for
                # subtyping and storing tuple internally as two
                # separate columns in the underlying array.
                self.matrix[rowCount,locus] = (allele1, allele2)

                if self.debug:
                    print rowCount, self.matrix[rowCount,locus]

                # increment row count
                rowCount += 1

    def genValidKey(self, field, fieldList):
        """Check and validate key.

        - 'field': string with field name.

        - 'fieldList': a dictionary of valid fields.

        Check to see whether 'field' is a valid key, and generate the
        appropriate 'key'.  Returns a 2-tuple consisting of
        'isValidKey' boolean and the 'key'.

        *Note: this is explicitly done in the subclass of the abstract
        'ParseFile' class (i.e. since this subclass should have
        `knowledge' about the nature of fields, but the abstract class
        should not have)*"""

        # a field is valid either bare, or with the allele or
        # population-name designator prepended
        if (field in fieldList) or \
           (self.alleleDesignator + field in fieldList):
            isValidKey = 1
        else:
            if self.popNameDesignator + field in fieldList:
                isValidKey = 1
            else:
                isValidKey = 0

        # generate the key that matches the one in the data file
        # format

        # if this is an `allele'-type field
        if self.alleleDesignator + field in fieldList:
            # fieldPairDesignator holds the two pair suffixes joined by ':'
            li = string.split(self.fieldPairDesignator,":")

            # if pair identifiers are both the same length and
            # non-zero (e.g. '_1' and '_2', then we can assume that
            # the underlying `stem' should be the field name with the
            # pair identifer stripped off, otherwise simply use the
            # field name
            if (len(li[0]) == len(li[1])) and (len(li[0]) != 0):
                key = self.alleleDesignator + field[:-len(li[0])]
            else:
                key = self.alleleDesignator + field
        else:
            # this is the population field name
            if self.popNameDesignator + field in fieldList:
                key = self.popNameDesignator + field
            else:
                # this is a regular (non-`allele' type field)
                key = field

        if self.debug:
            print "validKey: %d, key: %s" % (isValidKey, key)

        return isValidKey, key

    def getMatrix(self):
        """Returns the genotype data.

        Returns the genotype data in a 'StringMatrix' NumPy array.
        """
        return self.matrix

    def serializeSubclassMetadataTo(self, stream):
        """Serialize subclass-specific metadata."""
        if self.popName:
            # if present in input , print population name
            stream.tagContents('popname', self.popName)
            stream.writeln()