Пример #1
0
    def __init__(self,locationConfig):
        """Populate the mapping from a location configuration document.

        The document is parsed with ``ElementTree().parse`` and one
        ``LocationInfo`` is built for every direct child of the root element.
        Each one is stored in this mapping under its ``id`` attribute.

        Args:
            locationConfig: Source passed straight to ``ElementTree().parse``.
        """
        dict.__init__(self)

        # Parse the location configuration and store for retrieval.
        # Iterate the root element directly: Element.getchildren() was
        # removed in Python 3.9 and iterating yields the same children.
        root = ElementTree().parse(locationConfig)
        for loc_elem in root:
            loc_info = LocationInfo(loc_elem)
            self[loc_info.id] = loc_info
Пример #2
0
def convert2naf_file(inputfile, nafobj, token_info, raw):
    """Feed the contents of one parsed XML input into a NAF object.

    Every direct child of the document root is inspected:

    * ``node`` children are handed to ``get_tokens``,
      ``create_token_and_term_layer``, ``find_propbank_rels`` and
      ``add_dependencies_to_naf``.
    * ``sentence`` children have their text appended to ``raw``, preceded
      by a single space.

    Args:
        inputfile: Source passed straight to ``ElementTree().parse``.
        nafobj: Object forwarded to the layer-building helpers.
        token_info: Indexable value; element ``2`` is used as the starting
            word count, and the whole value is forwarded to
            ``create_token_and_term_layer``.
        raw: Text accumulated so far.

    Returns:
        A ``(updated_word_count, raw)`` tuple. ``updated_word_count`` is the
        count reported by ``get_tokens`` for the last ``node`` child, or the
        starting word count when the input has no ``node`` child.
    """
    root = ElementTree().parse(inputfile)
    word_count = token_info[2]
    # Start from the incoming count so that input without any <node> child
    # returns a defined value instead of raising UnboundLocalError.
    updated_word_count = word_count

    # Iterate the element directly: Element.getchildren() was removed in
    # Python 3.9 and iterating yields the same children.
    for elem in root:
        if elem.tag == 'node':
            token_dict, updated_word_count = get_tokens(elem, word_count, {})
            create_token_and_term_layer(token_dict, token_info, nafobj)
            # code from Ruben to create constituent and dependency layers
            find_propbank_rels(elem, nafobj, word_count)
            add_dependencies_to_naf(elem, nafobj, word_count)
        elif elem.tag == 'sentence':
            # An empty <sentence/> has text None; treat it as empty text.
            raw += ' ' + (elem.text or '')

    return updated_word_count, raw
Пример #3
0
# XML provides a conveniently formatted and widely applicable tool.

# 11.4.3 The ElementTree interface
# Python's ElementTree module provides a convenient way to access data
# stored in XML files.
merchant_file = nltk.data.find('corpora/shakespeare/merchant.xml')
# Read through a context manager so the file handle is closed right away
# instead of being left for the garbage collector.
with open(merchant_file) as merchant_fp:
    raw = merchant_fp.read()
print(raw[:163])
print(raw[1789:2006])

from xml.etree.ElementTree import ElementTree

# ElementTree().parse returns the root element of the document.
merchant = ElementTree().parse(merchant_file)
# The bare expressions below are evaluated and discarded when run as a
# script; they only display something in an interactive session.
merchant
merchant[0]
merchant[0].text
# list(element) replaces Element.getchildren(), removed in Python 3.9.
list(merchant)
merchant[-2][0].text
merchant[-2][1]
merchant[-2][1][0].text
merchant[-2][1][54]
merchant[-2][1][54][0]
merchant[-2][1][54][0].text
merchant[-2][1][54][1]
merchant[-2][1][54][1].text

for i, act in enumerate(merchant.findall('ACT')):
    for j, scene in enumerate(act.findall('SCENE')):
        for k, speech in enumerate(scene.findall('SPEECH')):
            for line in speech.findall('LINE'):
                if 'music' in str(line.text):
                    print('Act %d Scene %d Speech %d: %s' %
Пример #4
0
# XML provides a conveniently formatted and widely applicable tool.

# 4.3 The ElementTree interface
# Python's ElementTree module provides a convenient way to access data
# stored in XML files.
merchant_file = nltk.data.find('corpora/shakespeare/merchant.xml')
# Read through a context manager so the file handle is closed right away
# instead of being left for the garbage collector.
with open(merchant_file) as merchant_fp:
    raw = merchant_fp.read()
print(raw[:163])
print(raw[1789:2006])

from xml.etree.ElementTree import ElementTree

# ElementTree().parse returns the root element of the document.
merchant = ElementTree().parse(merchant_file)
print(merchant)
print(merchant[0])
print(merchant[0].text)
# list(element) replaces Element.getchildren(), removed in Python 3.9.
print(list(merchant))
print(merchant[-2][0].text)
print(merchant[-2][1])
print(merchant[-2][1][0].text)
print(merchant[-2][1][54])
print(merchant[-2][1][54][0])
print(merchant[-2][1][54][0].text)
print(merchant[-2][1][54][1])
print(merchant[-2][1][54][1].text)

for i, act in enumerate(merchant.findall('ACT')):
    for j, scene in enumerate(act.findall('SCENE')):
        for k, speech in enumerate(scene.findall('SPEECH')):
            for line in speech.findall('LINE'):
                if 'music' in str(line.text):
                    print('Act %d Scene %d Speech %d: %s' %
Пример #5
0
class CEOSDB(object):
    """Decode one record of a CEOS data file using an XML field description.

    The XML document lists the fields of the record in order. Each ``rec``
    element carries a ``name``, a ``num`` (field size) and a ``type`` (an
    index into ``typeMap``). A ``struct`` element groups child fields that
    repeat; its ``loop`` attribute names an already-decoded metadata entry
    holding the repeat count. Decoded values are collected in
    ``self.metadata``.
    """

    # Field type names, indexed by the integer ``type`` attribute of a node.
    typeMap = ['Skip', 'An', 'In', 'B1', 'B4', 'Fn', 'B2', 'Debug']

    def __init__(self, xml=None, dataFile=None):
        """Remember the inputs and pre-load the XML field description.

        Args:
            xml: Path of the XML field description, or ``None``.
            dataFile: Open binary file positioned at the start of the record.
                Despite the ``None`` default it is required, because its
                current position is read immediately.
        """
        self.xml = xml
        self.dataFile = dataFile
        self.startPosition = dataFile.tell()
        self.recordLength = 0
        self.metadata = {}
        if xml is not None:
            self.xmlFP = open(self.xml, 'r')
            try:
                # list(element) replaces Element.getchildren(), which was
                # removed in Python 3.9.
                self.rootChildren = list(ET(file=self.xmlFP).getroot())
            except Exception:
                # Do not leak the handle when the description is malformed.
                self.xmlFP.close()
                raise
        else:
            self.xmlFP = None
            self.rootChildren = []

    def getMetadata(self):
        """Return the dictionary of decoded fields."""
        return self.metadata

    def getEndOfRecordPosition(self):
        """Return the data-file offset just past this record."""
        return self.startPosition + self.recordLength

    def finalizeParser(self):
        """Close the XML description opened by the constructor, if any."""
        if self.xmlFP is not None:
            self.xmlFP.close()

    def parseFast(self):
        """
            Use the xml definition of the field positions, names and lengths to
            parse a CEOS data file, reusing the nodes loaded by the constructor
        """
        self._parseNodes(self.rootChildren)
        self.recordLength = self.metadata['Record Length']

    def parse(self):
        """
            Use the xml definition of the field positions, names and lengths to
            parse a CEOS data file, re-reading the xml definition from disk
        """
        # The tree is fully built by the ET constructor, so the handle can be
        # released before decoding; this also closes it if decoding raises.
        with open(self.xml, 'r') as xmlFP:
            self.root = ET(file=xmlFP).getroot()
        self._parseNodes(self.root)
        self.recordLength = self.metadata['Record Length']

    def _parseNodes(self, nodes):
        """Decode every ``rec`` and ``struct`` node into ``self.metadata``."""
        for z in nodes:
            # If the tag name is 'rec', this is a plain old record
            if z.tag == 'rec':
                (key, data) = self.decodeNode(z)
                self.metadata[key] = data
            # If the tag name is 'struct', we need to loop over some other
            # records
            elif z.tag == 'struct':
                loopCount = self.metadata[z.attrib['loop']]
                key = z.attrib['name']
                self.metadata[key] = [None] * loopCount
                for i in range(loopCount):
                    entry = {}
                    for node in z:
                        (subkey, data) = self.decodeNode(node)
                        entry[subkey] = data
                    self.metadata[key][i] = entry

    def decodeNode(self, node):
        """
            Read the field described by an xml node.

            Returns a (name, value) pair taken from the node's 'name', 'num'
            and 'type' attributes.
        """
        key = node.attrib['name']
        size = int(node.attrib['num'])
        format = int(node.attrib['type'])
        data = self.readData(key, size, format)
        return key, data

    @staticmethod
    def _stripText(rawBytes):
        """Decode UTF-8 bytes, strip whitespace, then strip trailing NULs."""
        return rawBytes.decode('utf-8').strip().rstrip('\x00')

    def readData(self, key, size, format):
        """
            Read one field from the data file and return its value.

            key is the field name (only used for debug output)
            size is the field size from the xml definition; it is replaced by
                the fixed width for the B1/B2/B4 types
            format is an index into typeMap

            Returns None for the 'Skip' and 'Debug' types.
        """
        typeName = self.typeMap[format]

        if typeName == "Skip":
            self.dataFile.seek(size, os.SEEK_CUR)
            return None
        if typeName == "Debug":
            print(key, size, format, self.dataFile.tell())
            # Consume the field as before, but without pushing it through
            # struct.unpack with an empty format, which printed a spurious
            # struct error (or raised IndexError for a zero-size field).
            self.dataFile.read(size)
            return None

        if typeName == "An":
            formatString = "%ss" % size
            convertFunction = self._stripText
        elif typeName == "In":
            formatString = "%ss" % size
            convertFunction = int
        elif typeName == "Fn":
            formatString = "%ss" % size
            convertFunction = float
        elif typeName == "B4":
            formatString = ">I"
            convertFunction = int
            size = 4
        elif typeName == "B2":
            formatString = ">H"
            convertFunction = int
            size = 2
        elif typeName == "B1":
            formatString = ">B"
            convertFunction = int
            size = 1
        else:
            raise TypeError("Unknown format %s" % format)

        return self._readAndUnpackData(length=size,
                                       format=formatString,
                                       typefunc=convertFunction)

    def _readAndUnpackData(self,
                           length=None,
                           format=None,
                           typefunc=None,
                           numberOfFields=1):
        """
        Convenience method for reading and unpacking data.

        length is the length of the field in bytes [required]
        format is the format code to use in struct.unpack() [required]
        numberOfFields is the number of fields expected from the call to
            struct.unpack() [default = 1]
        typefunc is the function through which the output of struct.unpack will
            be passed [default = None]

        Returns None (after printing the error) when struct.unpack() fails,
        and 0 when typefunc raises ValueError.
        """
        line = self.dataFile.read(length)
        try:
            data = struct.unpack(format, line)
        except struct.error as strerr:
            print(strerr)
            return None
        if numberOfFields == 1:
            data = data[0]
            if typefunc is float:
                # Accept Fortran-style exponents such as 1.5D+01.
                data = data.decode('utf-8').replace('D', 'E')
            if typefunc:
                try:
                    data = typefunc(data)
                except ValueError:
                    data = 0

        return data