Пример #1
0
 def __init__(self, inifile, defaults=None):
     """Remember the target .ini file and set up ordered storage."""
     # Record which file readfp() should open when none is supplied.
     self.inifile = inifile
     SafeConfigParser.__init__(self, defaults)
     # An ordered mapping keeps sections in the order they appear in
     # the file; orderedKeys will record option order per section.
     self._sections = OrderedDict()
     self.orderedKeys = {}
Пример #2
0
class INIParser(SafeConfigParser):
    """
    An extended form of SafeConfigParser which stores the order of options for
    each section
    """

    # Option-name stems (numeric suffix stripped) accepted by
    # items_as_arguments().
    ALLOWED_STREAMS = ('st', )

    def __init__(self, inifile, defaults=None):
        SafeConfigParser.__init__(self, defaults)
        self.inifile = inifile
        # Make the _sections list an ordered dict, so that the section
        # names occur in order.
        self._sections = OrderedDict()
        self.orderedKeys = {}

    def readfp(self, filehandle=None):
        """Parse the .ini data, stripping embedded newlines from values.

        If 'filehandle' is None, self.inifile is opened here and closed
        again once parsed.
        """
        # Track whether we own the handle so we can close it afterwards:
        # the original version leaked the descriptor it opened.
        openedHere = filehandle is None
        if openedHere:
            filehandle = open(self.inifile, 'r')
        try:
            SafeConfigParser.readfp(self, filehandle)
        finally:
            if openedHere:
                filehandle.close()
        for section in self._sections.keys():
            for key, value in self._sections[section].items():
                if key and value:
                    self._sections[section][key] = value.replace('\n', '')

    def optionxform(self, optionstr):
        """
        Overrides SafeConfigParser.optionxform() method to prevent converting
        option names to lowercase, and to store defined keys in an ordered list.
        This works because the SafeConfigParser.readfp() method internally
        calls SafeConfigParser._read() to parse the .ini file, and each new
        option is passed to optionxform() to process (by default converting it
        to lowercase).  So each time an option string is passed to optionxform,
        it happens in the order those options are defined in the .ini file.
        """
        # Store the current stream in the correct ordered section.
        # NOTE: keys()[-1] relies on Python 2 list-returning keys() and on
        # _sections preserving insertion order (the OrderedDict set above).
        currentSection = self._sections.keys()[-1]
        try:
            self.orderedKeys[currentSection].append(optionstr)
        except KeyError:
            self.orderedKeys[currentSection] = [
                optionstr,
            ]
        return optionstr

    def items_as_arguments(self, section):
        """
        This method returns the ordered list of options as a list of properly
        formatted strings as the user would enter on the command-line
        """
        arguments = []
        for orderedKey in self.orderedKeys[section]:
            for key, value in self.items(section):
                if key == orderedKey:
                    # Strip a trailing numeric suffix (e.g. 'st1' -> 'st')
                    # before checking against the allowed stream names.
                    generic_key = re.sub('[0-9]*$', '', key)
                    if generic_key in self.ALLOWED_STREAMS:
                        arguments.append('--%s=%s' % (key, value))
        return arguments
Пример #3
0
class INIParser(SafeConfigParser):
    """
    An extended form of SafeConfigParser which stores the order of options for
    each section
    """

    # Option-name stems (numeric suffix stripped) accepted by
    # items_as_arguments().
    ALLOWED_STREAMS = ('st',)

    def __init__(self, inifile, defaults=None):
        SafeConfigParser.__init__(self, defaults)
        self.inifile = inifile
        # Make the _sections list an ordered dict, so that the section
        # names occur in order.
        self._sections = OrderedDict()
        self.orderedKeys = {}

    def readfp(self, filehandle=None):
        """Parse the .ini data, stripping embedded newlines from values.

        If 'filehandle' is None, self.inifile is opened here and closed
        again once parsed.
        """
        # Remember whether we opened the handle ourselves so we can close
        # it when done: the original version leaked the descriptor.
        openedHere = filehandle is None
        if openedHere:
            filehandle = open(self.inifile, 'r')
        try:
            SafeConfigParser.readfp(self, filehandle)
        finally:
            if openedHere:
                filehandle.close()
        for section in self._sections.keys():
            for key, value in self._sections[section].items():
                if key and value:
                    self._sections[section][key] = value.replace('\n', '')

    def optionxform(self, optionstr):
        """
        Overrides SafeConfigParser.optionxform() method to prevent converting
        option names to lowercase, and to store defined keys in an ordered list.
        This works because the SafeConfigParser.readfp() method internally
        calls SafeConfigParser._read() to parse the .ini file, and each new
        option is passed to optionxform() to process (by default converting it
        to lowercase).  So each time an option string is passed to optionxform,
        it happens in the order those options are defined in the .ini file.
        """
        # Store the current stream in the correct ordered section.
        # NOTE: keys()[-1] relies on Python 2 list-returning keys() and on
        # _sections preserving insertion order (the OrderedDict set above).
        currentSection = self._sections.keys()[-1]
        try:
            self.orderedKeys[currentSection].append(optionstr)
        except KeyError:
            self.orderedKeys[currentSection] = [optionstr, ]
        return optionstr

    def items_as_arguments(self, section):
        """
        This method returns the ordered list of options as a list of properly
        formatted strings as the user would enter on the command-line
        """
        arguments = []
        for orderedKey in self.orderedKeys[section]:
            for key, value in self.items(section):
                if key == orderedKey:
                    # Strip a trailing numeric suffix (e.g. 'st1' -> 'st')
                    # before checking against the allowed stream names.
                    generic_key = re.sub('[0-9]*$', '', key)
                    if generic_key in self.ALLOWED_STREAMS:
                        arguments.append('--%s=%s' % (key, value))
        return arguments
Пример #4
0
 def __init__(self, inifile, defaults=None):
     """Set up the base parser plus ordered bookkeeping structures."""
     SafeConfigParser.__init__(self, defaults)
     # The file readfp() will open when no handle is passed in.
     self.inifile = inifile
     # Use an ordered mapping so section names keep their file order;
     # orderedKeys maps each section to its options in file order.
     self._sections = OrderedDict()
     self.orderedKeys = {}
Пример #5
0
def recordSummary(data, popName, lociList):
    global summaryTable
    pattLabel = re.compile("            Label.*")
    pattSep = re.compile("[=].*")
    pattTotal = re.compile("          Totals.*")
    startrecording = 0
    mostsigsofar = ""

    for line in data:
        if re.search(pattLabel, line):
            startrecording = 1
        elif re.search(pattSep, line):
            startrecording = 0
        elif re.search(pattTotal, line):
            totalsig = (string.split(line)[6])[8:]
        elif startrecording:
            sig = (string.split(line)[6])[8:]
            if len(sig) > len(mostsigsofar):
                mostsigsofar = sig

    if mostsigsofar == '':
        mostsigsofar = '-'
    if totalsig == '':
        totalsig = '-'

    # parse popName into <pop><chrom>-cases
    pop = popName[:-(len("-cases") + 1)]
    chrom = popName[-(len("-cases") + 1)]

    print pop, chrom, mostsigsofar, totalsig

    datatuple = (totalsig, mostsigsofar)
    # generate haplotype locus name
    locus = lociListSuffix(lociList)

    if summaryTable.has_key(chrom):
        if summaryTable[chrom].has_key(locus):
            if summaryTable[chrom][locus].has_key(pop):
                print "has pop key"
            else:
                summaryTable[chrom][locus][pop] = datatuple
        else:
            summaryTable[chrom][locus] = OrderedDict([pop, datatuple])
    else:
        summaryTable[chrom] = \
                            OrderedDict([locus, OrderedDict([pop, datatuple])])
Пример #6
0
    def _genInternalMaps(self):
        """Returns dictionary containing 2-tuple of column position.

        It is keyed by locus names originally specified in sample
        metadata file, the locus names (keys) are made uppercase and
        don't contain the allele designator.

        Note that this is simply a transformed _subset_ of that
        returned by **getSampleMap()**

        *For internal use only.*"""

        # assume there is no population column
        popNameCol = None

        # create a map that only contains non-allele fields
        self.nonAlleleMap = OrderedDict()

        self.alleleMap = OrderedDict()
        for key in self.sampleMap.keys():

            # do we have the allele designator?
            if key[0] == self.alleleDesignator:
                # remove allele designator, only necessary
                # for initial splitting out of locus keys from
                # other fields, and also make uppercase
                locusKey = string.upper(key[len(self.alleleDesignator):])
                self.alleleMap[locusKey] = self.sampleMap[key]
            elif key[0] == self.popNameDesignator:
                popNameCol = self.sampleMap[key]
                # strip the designator so callers see the bare field name
                self.nonAlleleMap[key[1:]] = self.sampleMap[key]
            else:
                self.nonAlleleMap[key] = self.sampleMap[key]

        # identity test ('is None') is the idiomatic None check
        if popNameCol is None:
            self.popName = None
        else:
            # save population name
            self.popName = string.split(self.fileData[self.sampleFirstLine],
                                        self.separator)[popNameCol]
Пример #7
0
    def _genInternalMaps(self):
        """Returns dictionary containing 2-tuple of column position.

        It is keyed by locus names originally specified in sample
        metadata file, the locus names (keys) are made uppercase and
        don't contain the allele designator.

        Note that this is simply a transformed _subset_ of that
        returned by **getSampleMap()**

        *For internal use only.*"""

        # assume there is no population column
        popNameCol = None

        # create a map that only contains non-allele fields
        self.nonAlleleMap = OrderedDict()

        self.alleleMap = OrderedDict()
        for key in self.sampleMap.keys():

            # do we have the allele designator?
            if key[0] == self.alleleDesignator:
                # remove allele designator, only necessary
                # for initial splitting out of locus keys from
                # other fields, and also make uppercase
                locusKey = string.upper(key[len(self.alleleDesignator):])
                self.alleleMap[locusKey] = self.sampleMap[key]
            elif key[0] == self.popNameDesignator:
                popNameCol = self.sampleMap[key]
                # strip the designator so callers see the bare field name
                self.nonAlleleMap[key[1:]] = self.sampleMap[key]
            else:
                self.nonAlleleMap[key] = self.sampleMap[key]

        # identity test ('is None') is the idiomatic None check
        if popNameCol is None:
            self.popName = None
        else:
            # save population name
            self.popName = string.split(self.fileData[self.sampleFirstLine],
                                        self.separator)[popNameCol]
Пример #8
0
    def _mapPopHeaders(self):
        """Create associations for field names and input columns.

        Using the header information from the top of the file, creates
        a dictionary (self.popMap) of column positions and a dictionary
        (self.popData) of population-level data values, both keyed by
        field name.

        Also validates the file information for the correct number of
        fields present on each line.

        *For internal use only*."""

        # get population header metadata (first line, newline stripped)
        popHeaderLine = string.rstrip(self.fileData[0])

        # parse it
        self.popMap, fieldCount = self._mapFields(popHeaderLine,
                                                  self.popFields)

        # debugging only
        if self.debug:
            print "population header line: ", popHeaderLine
            print self.popMap

        # get population data (second line of the file)
        popDataLine = string.rstrip(self.fileData[1])
        # debugging only
        if self.debug:
            print "population data line: ", popDataLine

        # make sure pop data line matches number expected from metadata;
        # a mismatch is reported but parsing continues regardless
        popDataFields = string.split(popDataLine, self.separator)
        if len(popDataFields) != fieldCount:
            print "error: found", len(popDataFields),\
                  "fields expected", fieldCount, "fields"

        # create a dictionary using the metadata field names as key
        # for the population data
        self.popData = OrderedDict()
        for popField in self.popMap.keys():
            self.popData[popField] = popDataFields[self.popMap[popField]]
Пример #9
0
    def _mapPopHeaders(self):

        """Create associations for field names and input columns.

        Using the header information from the top of the file, creates
        a dictionary (self.popMap) of column positions and a dictionary
        (self.popData) of population-level data values, both keyed by
        field name.

        Also validates the file information for the correct number of
        fields present on each line.

        *For internal use only*."""

        # get population header metadata (first line, newline stripped)
        popHeaderLine = string.rstrip(self.fileData[0])

        # parse it
        self.popMap, fieldCount = self._mapFields(popHeaderLine, self.popFields)

        # debugging only
        if self.debug:
            print "population header line: ", popHeaderLine
            print self.popMap

        # get population data (second line of the file)
        popDataLine = string.rstrip(self.fileData[1])
        # debugging only
        if self.debug:
            print "population data line: ", popDataLine

        # make sure pop data line matches number expected from metadata;
        # a mismatch is reported but parsing continues regardless
        popDataFields = string.split(popDataLine, self.separator)
        if len(popDataFields) != fieldCount:
            print "error: found", len(popDataFields),\
                  "fields expected", fieldCount, "fields"

        # create a dictionary using the metadata field names as key
        # for the population data
        self.popData = OrderedDict()
        for popField in self.popMap.keys():
            self.popData[popField] = popDataFields[self.popMap[popField]]
Пример #10
0
#! /usr/bin/env python

import sys, os, string, glob, re
from Haplo import HaploArlequin
from Utils import OrderedDict

# global summary table
summaryTable = OrderedDict()


def stripSuffix(filename):
    """Return the basename of 'filename' truncated at its first '.'."""
    # str.split replaces the deprecated string.split() and behaves
    # identically here (split on every '.', keep the first piece).
    return os.path.basename(filename).split('.')[0]


def lociListSuffix(lociList):
    """Return a '-'-joined string of zero-padded locus numbers.

    E.g. [1, 12] -> "01-12".  Prints verbose progress for debugging.
    """
    suffix = ""
    print lociList, len(lociList)
    for i in lociList:
        print "locus:", i
        # zero-pad each locus number to two digits
        extra = "%02d" % i
        print "formatting:", extra
        # join pieces with '-', omitting the leading separator
        if suffix == "":
            suffix = extra
        else:
            suffix = suffix + "-" + extra
        print "new suffix:", suffix
    return suffix


def genArpFilename(prefix, lociList):
    """Build the '<prefix>-<locus-suffix>.haplo' filename for a run."""
    suffix = lociListSuffix(lociList)
    return "%s-%s.haplo" % (prefix, suffix)
Пример #11
0
class ParseFile:
    """*Abstract* class for parsing a datafile.

    *Not to be instantiated.*"""
    def __init__(self,
                 filename,
                 validPopFields=None,
                 validSampleFields=None,
                 separator='\t',
                 fieldPairDesignator='_1:_2',
                 alleleDesignator='*',
                 popNameDesignator='+',
                 debug=0):
        """Constructor for ParseFile object.

        - 'filename': filename for the file to be parsed.

        - 'validPopFields': a string consisting of valid headers (one
           per line) for overall population data (no default)

        - 'validSampleFields': a string consisting of valid headers
           (one per line) for lines of sample data.  (no default)

        - 'separator': separator for adjacent fields (default: a tab
           stop, '\\t').

        - 'fieldPairDesignator': a string which consists of additions
          to the allele `stem' for fields grouped in pairs (allele
          fields) [e.g. for `HLA-A', and `HLA-A(2)', then we use
          ':(2)', for `DQA1_1' and `DQA1_2', then use use '_1:_2', the
          latter case distinguishes both fields from the stem]
          (default: ':(2)')

        - 'alleleDesignator': The first character of the key which
        determines whether this column contains allele data.  Defaults to
        '*'

        - 'popNameDesignator': The first character of the key which
        determines whether this column contains the population name.
        Defaults to '+'

        - 'debug': Switches debugging on if set to '1' (default: no
          debugging, '0')"""

        self.filename = filename
        self.validPopFields = validPopFields
        self.validSampleFields = validSampleFields
        self.debug = debug
        self.separator = separator
        self.fieldPairDesignator = fieldPairDesignator
        self.alleleDesignator = alleleDesignator
        self.popNameDesignator = popNameDesignator

        # assume no population or sample data, until supplied
        self.popData = None
        self.sampleMap = None

        # Reads and parses a given filename.

        self._sampleFileRead(self.filename)

        if self.validPopFields == None:
            # skip parsing of metadata header
            self.sampleFirstLine = 1
        else:
            # parse metadata header
            self.sampleFirstLine = 3

            # gets the .ini file information for metadata
            self.popFields = ParseFile._dbFieldsRead(self, self.validPopFields)
            if self.debug:
                # debugging only
                print self.popFields

            # parse the metadata
            self._mapPopHeaders()

        # gets the .ini file information for samples
        self.sampleFields = ParseFile._dbFieldsRead(self,
                                                    self.validSampleFields)
        if self.debug:
            print self.sampleFields

        # always parse the samples, they must always exist!
        self._mapSampleHeaders()

    def _dbFieldsRead(self, data):
        """Reads the valid key, value pairs.

        Takes a string that is expected to consist of database field
        names separated by newlines.

        Returns a tuple of field names.

        *For internal use only.*"""
        li = []
        for line in string.split(data, os.linesep):
            if self.debug:
                print string.rstrip(line)
            li.append(string.rstrip(line))
        return tuple(li)

    def _mapFields(self, line, fieldList):
        """Creates a list of valid database fields.

        From a separator delimited string, creates a list of valid
        fields and creates a dictionary of positions keyed by valid
        field names.

        - Complains if a field name is not valid.

        - Complains if the correct number of fields are not found for
        the metadata headers.
        
        Returns a 2-tuple:
        
        - a dictionary keyed by field name.

        - the total number of  metadata fields.

        *For internal use only.*"""

        # split line
        fields = line.split(self.separator)

        # check to see if the correct number of fields found
        if len(fields) != len(fieldList) and self.debug:
            print "warning: found", len(fields), "fields expected", \
                  len(fieldList), "fields"

        i = 0
        assoc = OrderedDict()
        for field in fields:

            # strip the field of leading and trailing blanks because
            # column name may inadvertantly contain these due to
            # spreadsheet -> tab-delimited file format idiosyncrasies

            field = string.strip(field)

            # check to see whether field is a valid key, and generate
            # the appropriate identifier, this is done as method call
            # so it can overwritten in subclasses of this abstract
            # class (i.e. the subclass will have 'knowledge' about the
            # nature of fields, but not this abstract class)

            # If an asterisk character is given as the first item in
            # the valid fields list, then accept any field name (ie,
            # locus name) as valid.  This makes sense only in the
            # allelecount file context.
            if fieldList[0] == "*":
                isValidKey, key = (1, field)
            else:
                isValidKey, key = self.genValidKey(field, fieldList)

            if isValidKey:

                # if key is one of pair already in map, add it to make
                # a tuple at that key e.g. `HLA-A(2)' already exists
                # and inserting `HLA-A', or `DQB1_1' and `DQB1_2' should
                # both be inserted at `DQB1'

                if assoc.has_key(key):
                    assoc[key] = assoc[key], i
                else:
                    assoc[key] = i

            elif self.debug:
                print "warning: field name `%s' not valid" % field

            i = i + 1

        return assoc, i

    def _sampleFileRead(self, filename):
        """Reads filename into object.

        Takes a filename and reads the file data into an instance variable.

        *For internal use only*.
        """
        f = open(filename, 'r')
        self.fileData = f.readlines()

    def _mapPopHeaders(self):
        """Create associations for field names and input columns.
        
        Using the header information from the top of the file, creates
        a dictionary for the population-level data.

        Also validates the file information for the correct number of fields
        are present on each line

        *For internal use only*."""

        # get population header metadata
        popHeaderLine = string.rstrip(self.fileData[0])

        # parse it
        self.popMap, fieldCount = self._mapFields(popHeaderLine,
                                                  self.popFields)

        # debugging only
        if self.debug:
            print "population header line: ", popHeaderLine
            print self.popMap

        # get population data
        popDataLine = string.rstrip(self.fileData[1])
        # debugging only
        if self.debug:
            print "population data line: ", popDataLine

        # make sure pop data line matches number expected from metadata
        popDataFields = string.split(popDataLine, self.separator)
        if len(popDataFields) != fieldCount:
            print "error: found", len(popDataFields),\
                  "fields expected", fieldCount, "fields"

        # create a dictionary using the metadata field names as key
        # for the population data
        self.popData = OrderedDict()
        for popField in self.popMap.keys():
            self.popData[popField] = popDataFields[self.popMap[popField]]

    def _mapSampleHeaders(self):
        """Create the associations between field names and input columns.

        Using the header information from the top of the file, creates
        associations for the sample data fields.

        Also validates the file information for the correct number of fields
        are present on each line

        *For internal use only*."""

        # get sample header metadata
        sampleHeaderLine = string.rstrip(self.fileData[self.sampleFirstLine -
                                                       1])

        # parse it
        self.sampleMap, fieldCount = self._mapFields(sampleHeaderLine,
                                                     self.sampleFields)
        # debugging only
        if self.debug:
            print "sample header line: ", sampleHeaderLine
            print self.sampleMap

        # check file data to see that correct number of fields are
        # present for each sample

        for lineCount in range(self.sampleFirstLine, len(self.fileData)):

            # retrieve and strip newline
            line = string.rstrip(self.fileData[lineCount])

            # restore the data with the newline stripped
            self.fileData[lineCount] = line

            fields = string.split(line, self.separator)
            if fieldCount != len(fields):
                print "error: incorrect number of fields:", len(fields), \
                      "found, should have:", fieldCount, \
                      "\noffending line is:\n", line

    def getPopData(self):
        """Returns a dictionary of population data.

        Dictionary is keyed by types specified in population metadata
        file"""
        return self.popData

    def getSampleMap(self):
        """Returns dictionary of sample data.

        Each dictionary position contains either a 2-tuple of column
        position or a single column position keyed by field originally
        specified in sample metadata file"""

        return self.sampleMap

    def getFileData(self):
        """Returns file data.

        Returns a 2-tuple `wrapper':

        - raw sample lines, *without*  header metadata.
        
        - the field separator."""
        return self.fileData[self.sampleFirstLine:], self.separator

    def genSampleOutput(self, fieldList):
        """Prints the data specified in ordered field list.

        *Use is currently deprecated.*"""

        #for field in fieldList:
        #print string.strip(field) + self.separator,
        for lineCount in range(self.sampleFirstLine, len(self.fileData)):
            line = string.strip(self.fileData[lineCount])
            element = string.split(line, self.separator)
            for field in fieldList:
                if self.sampleMap.has_key(field):
                    print element[self.sampleMap[field]],
                else:
                    print "can't find this field"
                    print "\n"

    def serializeMetadataTo(self, stream):
        type = getStreamType(stream)

        stream.opentag('populationdata')
        stream.writeln()

        if self.popData:

            for summary in self.popData.keys():
                # convert metadata name into a XML tag name
                tagname = string.lower(string.replace(summary, ' ', '-'))
                stream.tagContents(tagname, self.popData[summary])
                stream.writeln()

        # call subclass-specific metadata serialization
        self.serializeSubclassMetadataTo(stream)

        stream.closetag('populationdata')
        stream.writeln()
Пример #12
0
class ParseGenotypeFile(ParseFile):
    """Class to parse standard datafile in genotype form."""
    def __init__(self, filename, untypedAllele='****', **kw):
        """Constructor for ParseGenotypeFile.

        - 'filename': filename for the file to be parsed.
        
        In addition to the arguments for the base class, this class
        accepts the following additional keywords:

        - 'untypedAllele': The designator for an untyped locus.  Defaults
        to '****'.
        """
        self.untypedAllele = untypedAllele

        ParseFile.__init__(self, filename, **kw)

        self._genDataStructures()

    def _genInternalMaps(self):
        """Returns dictionary containing 2-tuple of column position.

        It is keyed by locus names originally specified in sample
        metadata file, the locus names (keys) are made uppercase and
        don't contain the allele designator.

        Note that this is simply a transformed _subset_ of that
        returned by **getSampleMap()**

        *For internal use only.*"""

        # assume there is no population column

        popNameCol = None

        # create a map that only contains non-allele fields
        self.nonAlleleMap = OrderedDict()

        self.alleleMap = OrderedDict()
        for key in self.sampleMap.keys():

            # do we have the allele designator?
            if key[0] == self.alleleDesignator:
                # remove allele designator, only necessary
                # for initial splitting out of locus keys from
                # other fields, and also make uppercase
                locusKey = string.upper(key[len(self.alleleDesignator):])
                self.alleleMap[locusKey] = self.sampleMap[key]
            elif key[0] == self.popNameDesignator:
                popNameCol = self.sampleMap[key]
                self.nonAlleleMap[key[1:]] = self.sampleMap[key]
            else:
                self.nonAlleleMap[key] = self.sampleMap[key]

        if popNameCol == None:
            self.popName = None
        else:
            # save population name
            self.popName = string.split(self.fileData[self.sampleFirstLine],
                                        self.separator)[popNameCol]

    def _genDataStructures(self):
        """Generates matrix only
        
        *For internal use only.*"""

        # generate alleleMap and population field name
        self._genInternalMaps()

        sampleDataLines, separator = self.getFileData()

        if self.debug:
            print 'sampleMap keys:', self.sampleMap.keys()
            print 'sampleMap values:', self.sampleMap.values()
            print 'first line of data', sampleDataLines[0]

        # then total number of individuals in data file
        self.totalIndivCount = len(sampleDataLines)

        # total number of loci contained in original file
        self.totalLocusCount = len(self.alleleMap)

        # freeze the list of locusKeys in a particular order
        self.locusKeys = self.alleleMap.keys()

        # freeze list of non-allel data
        self.extraKeys = self.nonAlleleMap.keys()

        # create an empty-list of lists to store all the row data
        #self.individualsList = [[] for line in range(0, self.totalIndivCount)]
        self.matrix = StringMatrix(self.totalIndivCount, self.locusKeys,
                                   self.extraKeys, self.separator,
                                   self.fileData[:self.sampleFirstLine - 1])

        rowCount = 0
        # store all the non-allele meta-data
        for line in sampleDataLines:
            fields = string.split(line, self.separator)
            for key in self.nonAlleleMap.keys():
                self.matrix[rowCount, key] = fields[self.nonAlleleMap[key]]

            rowCount += 1

        if self.debug:
            print "before filling matrix with allele data"
            print self.matrix

        for locus in self.locusKeys:
            if self.debug:
                print "locus name:", locus
                print "column tuple:", self.alleleMap[locus]

            col1, col2 = self.alleleMap[locus]

            # re-initialise the row count on each iteration of the locus
            rowCount = 0
            for line in sampleDataLines:
                fields = string.split(line, self.separator)

                # create data structures

                allele1 = string.strip(fields[col1])
                allele2 = string.strip(fields[col2])

                # underlying NumPy array data type won't allow storage
                # of any sequence-type object (e.g. list or tuple) but
                # we can workaround this by overriding the __setitem__
                # method of the UserArray wrapper class used for
                # subtyping and storing tuple internally as two
                # separate columns in the underlying array.

                self.matrix[rowCount, locus] = (allele1, allele2)

                if self.debug:
                    print rowCount, self.matrix[rowCount, locus]

                # increment row count
                rowCount += 1

    def genValidKey(self, field, fieldList):
        """Check and validate key.

        - 'field':  string with field name.

        - 'fieldList':  a dictionary of valid fields.
        
        Check to see whether 'field' is a valid key, and generate the
        appropriate 'key'.  Returns a 2-tuple consisting of
        'isValidKey' boolean and the 'key'.

        *Note: this is explicitly done in the subclass of the abstract
        'ParseFile' class (i.e. since this subclass should have
        `knowledge' about the nature of fields, but the abstract
        class should not have)*"""

        if (field in fieldList) or \
           (self.alleleDesignator + field in fieldList):
            isValidKey = 1
        else:
            if self.popNameDesignator + field in fieldList:
                isValidKey = 1
            else:
                isValidKey = 0

        # generate the key that matches the one in the data file
        # format

        # if this is an `allele'-type field
        if self.alleleDesignator + field in fieldList:

            li = string.split(self.fieldPairDesignator, ":")

            # if pair identifiers are both the same length and
            # non-zero (e.g. '_1' and '_2', then we can assume that
            # the underlying `stem' should be the field name with the
            # pair identifer stripped off, otherwise simply use the
            # field name

            if (len(li[0]) == len(li[1])) and (len(li[0]) != 0):
                key = self.alleleDesignator + field[:-len(li[0])]
            else:
                key = self.alleleDesignator + field

        else:
            # this is the population field name
            if self.popNameDesignator + field in fieldList:
                key = self.popNameDesignator + field
            else:
                # this is a regular (non-`allele' type field)
                key = field

        if self.debug:
            print "validKey: %d, key: %s" % (isValidKey, key)

        return isValidKey, key

    def getMatrix(self):
        """Returns the genotype data.

        Returns the genotype data in a 'StringMatrix' NumPy array.
        """
        return self.matrix

    def serializeSubclassMetadataTo(self, stream):
        """Serialize subclass-specific metadata."""

        if self.popName:
            # if present in input , print population name
            stream.tagContents('popname', self.popName)
            stream.writeln()
Пример #13
0
    def _mapFields(self, line, fieldList):
        """Creates a list of valid database fields.

        From a separator delimited string, creates a list of valid
        fields and creates a dictionary of positions keyed by valid
        field names.

        - Complains if a field name is not valid.

        - Complains if the correct number of fields are not found for
        the metadata headers.

        Returns a 2-tuple:

        - a dictionary keyed by field name, each value is either a
          single column position or a tuple of column positions for
          paired (allele) fields.

        - the total number of metadata fields.

        *For internal use only.*"""

        # split line
        fields = line.split(self.separator)

        # check to see if the correct number of fields found;
        # only a warning (and only in debug mode), parsing continues
        if len(fields) != len(fieldList) and self.debug:
            print "warning: found", len(fields), "fields expected", \
                  len(fieldList), "fields"

        i = 0
        assoc = OrderedDict()
        for field in fields:

            # strip the field of leading and trailing blanks because
            # column name may inadvertantly contain these due to
            # spreadsheet -> tab-delimited file format idiosyncrasies

            field = string.strip(field)

            # check to see whether field is a valid key, and generate
            # the appropriate identifier, this is done as method call
            # so it can overwritten in subclasses of this abstract
            # class (i.e. the subclass will have 'knowledge' about the
            # nature of fields, but not this abstract class)

            # If an asterisk character is given as the first item in
            # the valid fields list, then accept any field name (ie,
            # locus name) as valid.  This makes sense only in the
            # allelecount file context.
            if fieldList[0] == "*":
                isValidKey, key = (1, field)
            else:
                isValidKey, key = self.genValidKey(field, fieldList)

            if isValidKey:

                # if key is one of pair already in map, add it to make
                # a tuple at that key e.g. `HLA-A(2)' already exists
                # and inserting `HLA-A', or `DQB1_1' and `DQB1_2' should
                # both be inserted at `DQB1'

                if assoc.has_key(key):
                    assoc[key] = assoc[key], i
                else:
                    assoc[key] = i

            elif self.debug:
                print "warning: field name `%s' not valid" % field

            # i tracks the column position of the current field
            i = i + 1

        return assoc, i
Пример #14
0
class ParseFile:
    """*Abstract* class for parsing a datafile.

    *Not to be instantiated.*"""
    def __init__(self,
                 filename,
                 validPopFields=None,
                 validSampleFields=None,
                 separator='\t',
                 fieldPairDesignator='_1:_2',
                 alleleDesignator='*',
                 popNameDesignator='+',
                 debug=0):
        """Constructor for ParseFile object.

        - 'filename': filename for the file to be parsed.

        - 'validPopFields': a string consisting of valid headers (one
           per line) for overall population data (no default)

        - 'validSampleFields': a string consisting of valid headers
           (one per line) for lines of sample data.  (no default)

        - 'separator': separator for adjacent fields (default: a tab
           stop, '\\t').

        - 'fieldPairDesignator': a string which consists of additions
          to the allele `stem' for fields grouped in pairs (allele
          fields) [e.g. for `HLA-A', and `HLA-A(2)', then we use
          ':(2)', for `DQA1_1' and `DQA1_2', then use use '_1:_2', the
          latter case distinguishes both fields from the stem]
          (default: ':(2)')

        - 'alleleDesignator': The first character of the key which
        determines whether this column contains allele data.  Defaults to
        '*'

        - 'popNameDesignator': The first character of the key which
        determines whether this column contains the population name.
        Defaults to '+'

        - 'debug': Switches debugging on if set to '1' (default: no
          debugging, '0')"""

        self.filename = filename
        self.validPopFields=validPopFields
        self.validSampleFields=validSampleFields
        self.debug = debug
        self.separator = separator
        self.fieldPairDesignator = fieldPairDesignator
        self.alleleDesignator=alleleDesignator
        self.popNameDesignator = popNameDesignator

        # assume no population or sample data, until supplied
        self.popData = None
        self.sampleMap = None
    
        # Reads and parses a given filename.

        self._sampleFileRead(self.filename)


        if self.validPopFields == None:
            # skip parsing of metadata header
            self.sampleFirstLine = 1
        else:
            # parse metadata header
            self.sampleFirstLine = 3

            # gets the .ini file information for metadata
            self.popFields = ParseFile._dbFieldsRead(self,self.validPopFields)
            if self.debug:
                # debugging only
                print self.popFields

            # parse the metadata
            self._mapPopHeaders()


        # gets the .ini file information for samples
        self.sampleFields = ParseFile._dbFieldsRead(self,self.validSampleFields)
        if self.debug:
            print self.sampleFields

        # always parse the samples, they must always exist!
        self._mapSampleHeaders()

    def _dbFieldsRead(self, data):
        """Reads the valid key, value pairs.

        Takes a string that is expected to consist of database field
        names separated by newlines.

        Returns a tuple of field names.

        *For internal use only.*"""
        li = []
        for line in string.split(data, os.linesep):
            if self.debug:
                print string.rstrip(line)
            li.append(string.rstrip(line))
        return tuple(li)

    def _mapFields(self, line, fieldList):
        """Creates a list of valid database fields.

        From a separator delimited string, creates a list of valid
        fields and creates a dictionary of positions keyed by valid
        field names.

        - Complains if a field name is not valid.

        - Complains if the correct number of fields are not found for
        the metadata headers.
        
        Returns a 2-tuple:
        
        - a dictionary keyed by field name.

        - the total number of  metadata fields.

        *For internal use only.*"""

        # split line
        fields = line.split(self.separator)

        # check to see if the correct number of fields found
        if len(fields) != len(fieldList) and self.debug:
            print "warning: found", len(fields), "fields expected", \
                  len(fieldList), "fields"
        
        i = 0
        assoc = OrderedDict()
        for field in fields:

            # strip the field of leading and trailing blanks because
            # column name may inadvertantly contain these due to
            # spreadsheet -> tab-delimited file format idiosyncrasies
        
            field = string.strip(field)

            # check to see whether field is a valid key, and generate
            # the appropriate identifier, this is done as method call
            # so it can overwritten in subclasses of this abstract
            # class (i.e. the subclass will have 'knowledge' about the
            # nature of fields, but not this abstract class)
            
            # If an asterisk character is given as the first item in
            # the valid fields list, then accept any field name (ie,
            # locus name) as valid.  This makes sense only in the
            # allelecount file context.
            if fieldList[0] == "*":
                isValidKey, key = (1, field)
            else:
                isValidKey, key = self.genValidKey(field, fieldList)
                
            if isValidKey:

                # if key is one of pair already in map, add it to make
                # a tuple at that key e.g. `HLA-A(2)' already exists
                # and inserting `HLA-A', or `DQB1_1' and `DQB1_2' should
                # both be inserted at `DQB1'

                if assoc.has_key(key):
                    assoc[key] = assoc[key], i
                else:
                   assoc[key] = i
                    
            elif self.debug:
                print "warning: field name `%s' not valid" % field

            i = i + 1

        return assoc, i

    def _sampleFileRead(self, filename):
        """Reads filename into object.

        Takes a filename and reads the file data into an instance variable.

        *For internal use only*.
        """
        f = open(filename, 'r')
        self.fileData = f.readlines()

    def _mapPopHeaders(self):

        """Create associations for field names and input columns.
        
        Using the header information from the top of the file, creates
        a dictionary for the population-level data.

        Also validates the file information for the correct number of fields
        are present on each line

        *For internal use only*."""

        # get population header metadata
        popHeaderLine = string.rstrip(self.fileData[0])

        # parse it
        self.popMap, fieldCount = self._mapFields(popHeaderLine, self.popFields)

        # debugging only
        if self.debug:
            print "population header line: ", popHeaderLine
            print self.popMap

        # get population data
        popDataLine = string.rstrip(self.fileData[1])
        # debugging only
        if self.debug:
            print "population data line: ", popDataLine

        # make sure pop data line matches number expected from metadata
        popDataFields = string.split(popDataLine, self.separator)
        if len(popDataFields) != fieldCount:
            print "error: found", len(popDataFields),\
                  "fields expected", fieldCount, "fields"

        # create a dictionary using the metadata field names as key
        # for the population data
        self.popData = OrderedDict()
        for popField in self.popMap.keys():
            self.popData[popField] = popDataFields[self.popMap[popField]]

    def _mapSampleHeaders(self):
        """Create the associations between field names and input columns.

        Using the header information from the top of the file, creates
        associations for the sample data fields.

        Also validates the file information for the correct number of fields
        are present on each line

        *For internal use only*."""

        # get sample header metadata
        sampleHeaderLine = string.rstrip(self.fileData[self.sampleFirstLine-1])

        # parse it
        self.sampleMap, fieldCount = self._mapFields(sampleHeaderLine,
                                                     self.sampleFields)
        # debugging only
        if self.debug:
            print "sample header line: ", sampleHeaderLine
            print self.sampleMap

        # check file data to see that correct number of fields are
        # present for each sample

        for lineCount in range(self.sampleFirstLine, len(self.fileData)):

            # retrieve and strip newline
            line = string.rstrip(self.fileData[lineCount])

            # restore the data with the newline stripped
            self.fileData[lineCount] = line
            
            fields = string.split(line, self.separator)
            if fieldCount != len(fields):
                print "error: incorrect number of fields:", len(fields), \
                      "found, should have:", fieldCount, \
                      "\noffending line is:\n", line


    def getPopData(self):
        """Returns a dictionary of population data.

        Dictionary is keyed by types specified in population metadata
        file"""
        return self.popData

    def getSampleMap(self):
        """Returns dictionary of sample data.

        Each dictionary position contains either a 2-tuple of column
        position or a single column position keyed by field originally
        specified in sample metadata file"""

        return self.sampleMap

    def getFileData(self):
        """Returns file data.

        Returns a 2-tuple `wrapper':

        - raw sample lines, *without*  header metadata.
        
        - the field separator."""
        return self.fileData[self.sampleFirstLine:], self.separator
    
    def genSampleOutput(self, fieldList):
        """Prints the data specified in ordered field list.

        *Use is currently deprecated.*"""

        #for field in fieldList:
        #print string.strip(field) + self.separator,
        for lineCount in range(self.sampleFirstLine, len(self.fileData)):
            line = string.strip(self.fileData[lineCount])
            element = string.split(line, self.separator)
            for field in fieldList:
                if self.sampleMap.has_key(field):
                    print element[self.sampleMap[field]],
                else:
                    print "can't find this field"
                    print "\n"

    def serializeMetadataTo(self, stream):
        type = getStreamType(stream)

        stream.opentag('populationdata')
        stream.writeln()

        if self.popData:

            for summary in self.popData.keys():
                # convert metadata name into a XML tag name
                tagname = string.lower(string.replace(summary,' ','-'))
                stream.tagContents(tagname, self.popData[summary])
                stream.writeln()

        # call subclass-specific metadata serialization
        self.serializeSubclassMetadataTo(stream)
            
        stream.closetag('populationdata')
        stream.writeln()
Пример #15
0
class ParseGenotypeFile(ParseFile):
    """Class to parse standard datafile in genotype form."""
    
    def __init__(self,
                 filename,
                 untypedAllele='****',
                 **kw):
        """Constructor for ParseGenotypeFile.

        - 'filename': filename for the file to be parsed.
        
        In addition to the arguments for the base class, this class
        accepts the following additional keywords:

        - 'untypedAllele': The designator for an untyped locus.  Defaults
        to '****'.
        """
        self.untypedAllele=untypedAllele
        
        ParseFile.__init__(self, filename, **kw)

        self._genDataStructures()

    def _genInternalMaps(self):
        """Returns dictionary containing 2-tuple of column position.

        It is keyed by locus names originally specified in sample
        metadata file, the locus names (keys) are made uppercase and
        don't contain the allele designator.

        Note that this is simply a transformed _subset_ of that
        returned by **getSampleMap()**

        *For internal use only.*"""

        # assume there is no population column
        
        popNameCol = None

        # create a map that only contains non-allele fields
        self.nonAlleleMap = OrderedDict()

        self.alleleMap = OrderedDict()
        for key in self.sampleMap.keys():

            # do we have the allele designator?
            if key[0] == self.alleleDesignator:
                # remove allele designator, only necessary
                # for initial splitting out of locus keys from
                # other fields, and also make uppercase
                locusKey = string.upper(key[len(self.alleleDesignator):])
                self.alleleMap[locusKey] = self.sampleMap[key]
            elif key[0] == self.popNameDesignator:
                popNameCol = self.sampleMap[key]
                self.nonAlleleMap[key[1:]] = self.sampleMap[key]
            else:
                self.nonAlleleMap[key] = self.sampleMap[key]

        if popNameCol == None:
            self.popName = None
        else:
            # save population name
            self.popName = string.split(self.fileData[self.sampleFirstLine], self.separator)[popNameCol]


    def _genDataStructures(self):
        """Generates matrix only
        
        *For internal use only.*"""        

        # generate alleleMap and population field name
        self._genInternalMaps()

        sampleDataLines, separator = self.getFileData()

        if self.debug:
            print 'sampleMap keys:', self.sampleMap.keys()
            print 'sampleMap values:', self.sampleMap.values()
            print 'first line of data', sampleDataLines[0]


        # then total number of individuals in data file
        self.totalIndivCount = len(sampleDataLines)

        # total number of loci contained in original file
        self.totalLocusCount = len(self.alleleMap)

        # freeze the list of locusKeys in a particular order
        self.locusKeys = self.alleleMap.keys()

        # freeze list of non-allel data
        self.extraKeys = self.nonAlleleMap.keys()

        # create an empty-list of lists to store all the row data
        #self.individualsList = [[] for line in range(0, self.totalIndivCount)]
        self.matrix = StringMatrix(self.totalIndivCount,
                                   self.locusKeys,
                                   self.extraKeys,
                                   self.separator,
                                   self.fileData[:self.sampleFirstLine-1])

        rowCount = 0
        # store all the non-allele meta-data
        for line in sampleDataLines:
            fields = string.split(line, self.separator)
            for key in self.nonAlleleMap.keys():
                self.matrix[rowCount, key] = fields[self.nonAlleleMap[key]]

            rowCount += 1

        if self.debug:
            print "before filling matrix with allele data"
            print self.matrix

        for locus in self.locusKeys:
            if self.debug:
               print "locus name:", locus
               print "column tuple:", self.alleleMap[locus]

            col1, col2 = self.alleleMap[locus]

            # re-initialise the row count on each iteration of the locus
            rowCount = 0
            for line in sampleDataLines:
                fields = string.split(line, self.separator)

                # create data structures

                allele1 = string.strip(fields[col1])
                allele2 = string.strip(fields[col2])

                # underlying NumPy array data type won't allow storage
                # of any sequence-type object (e.g. list or tuple) but
                # we can workaround this by overriding the __setitem__
                # method of the UserArray wrapper class used for
                # subtyping and storing tuple internally as two
                # separate columns in the underlying array.

                self.matrix[rowCount,locus] = (allele1, allele2)
                
                if self.debug:
                    print rowCount, self.matrix[rowCount,locus]

                # increment row count
                rowCount += 1

    def genValidKey(self, field, fieldList):
        """Check and validate key.

        - 'field':  string with field name.

        - 'fieldList':  a dictionary of valid fields.
        
        Check to see whether 'field' is a valid key, and generate the
        appropriate 'key'.  Returns a 2-tuple consisting of
        'isValidKey' boolean and the 'key'.

        *Note: this is explicitly done in the subclass of the abstract
        'ParseFile' class (i.e. since this subclass should have
        `knowledge' about the nature of fields, but the abstract
        class should not have)*"""

        if (field in fieldList) or \
           (self.alleleDesignator + field in fieldList):
            isValidKey = 1
        else:
            if self.popNameDesignator + field in fieldList:
                isValidKey = 1
            else:
                isValidKey = 0

        # generate the key that matches the one in the data file
        # format

        # if this is an `allele'-type field
        if self.alleleDesignator + field in fieldList:

            li = string.split(self.fieldPairDesignator,":")

            # if pair identifiers are both the same length and
            # non-zero (e.g. '_1' and '_2', then we can assume that
            # the underlying `stem' should be the field name with the
            # pair identifer stripped off, otherwise simply use the
            # field name
            
            if (len(li[0]) == len(li[1])) and (len(li[0]) != 0):
                key = self.alleleDesignator + field[:-len(li[0])]
            else:
                key = self.alleleDesignator + field

        else:
            # this is the population field name
            if self.popNameDesignator + field in fieldList:
                key = self.popNameDesignator + field
            else:
                # this is a regular (non-`allele' type field)
                key = field

        if self.debug:
            print "validKey: %d, key: %s" % (isValidKey, key)
            
        return isValidKey, key

    def getMatrix(self):
        """Returns the genotype data.

        Returns the genotype data in a 'StringMatrix' NumPy array.
        """
        return self.matrix

    def serializeSubclassMetadataTo(self, stream):
        """Serialize subclass-specific metadata."""

        if self.popName:
            # if present in input , print population name
            stream.tagContents('popname', self.popName)
            stream.writeln()
Пример #16
0
    def _mapFields(self, line, fieldList):
        """Creates a list of valid database fields.

        From a separator delimited string, creates a list of valid
        fields and creates a dictionary of positions keyed by valid
        field names.

        - Complains if a field name is not valid.

        - Complains if the correct number of fields are not found for
        the metadata headers.
        
        Returns a 2-tuple:
        
        - a dictionary keyed by field name.

        - the total number of  metadata fields.

        *For internal use only.*"""

        # split line
        fields = line.split(self.separator)

        # check to see if the correct number of fields found
        if len(fields) != len(fieldList) and self.debug:
            print "warning: found", len(fields), "fields expected", \
                  len(fieldList), "fields"
        
        i = 0
        assoc = OrderedDict()
        for field in fields:

            # strip the field of leading and trailing blanks because
            # column name may inadvertantly contain these due to
            # spreadsheet -> tab-delimited file format idiosyncrasies
        
            field = string.strip(field)

            # check to see whether field is a valid key, and generate
            # the appropriate identifier, this is done as method call
            # so it can overwritten in subclasses of this abstract
            # class (i.e. the subclass will have 'knowledge' about the
            # nature of fields, but not this abstract class)
            
            # If an asterisk character is given as the first item in
            # the valid fields list, then accept any field name (ie,
            # locus name) as valid.  This makes sense only in the
            # allelecount file context.
            if fieldList[0] == "*":
                isValidKey, key = (1, field)
            else:
                isValidKey, key = self.genValidKey(field, fieldList)
                
            if isValidKey:

                # if key is one of pair already in map, add it to make
                # a tuple at that key e.g. `HLA-A(2)' already exists
                # and inserting `HLA-A', or `DQB1_1' and `DQB1_2' should
                # both be inserted at `DQB1'

                if assoc.has_key(key):
                    assoc[key] = assoc[key], i
                else:
                   assoc[key] = i
                    
            elif self.debug:
                print "warning: field name `%s' not valid" % field

            i = i + 1

        return assoc, i