def __init__(self, locationConfig):
    """Build the mapping from a location configuration XML document.

    The document is parsed with ElementTree and every direct child of the
    root element is wrapped in a ``LocationInfo``; the resulting object is
    stored in this dict under its ``id`` attribute.

    Args:
        locationConfig: File name or file object accepted by
            ``ElementTree.parse``.
    """
    dict.__init__(self)

    # Parse the location configuration and store for retrieval.
    root = ElementTree().parse(locationConfig)

    # Iterate the element directly: Element.getchildren() was removed in
    # Python 3.9, while iteration yields the same direct children.
    for loc_elem in root:
        loc_info = LocationInfo(loc_elem)
        self[loc_info.id] = loc_info
def convert2naf_file(inputfile, nafobj, token_info, raw):
    """Convert one parsed XML input document into layers of ``nafobj``.

    Each direct child of the document root is inspected:

    * ``node`` elements are tokenised with ``get_tokens`` and then handed to
      ``create_token_and_term_layer``, ``find_propbank_rels`` and
      ``add_dependencies_to_naf``.
    * ``sentence`` elements have their text appended to ``raw``, separated
      by a single space.

    Args:
        inputfile: File name or file object accepted by
            ``ElementTree.parse``.
        nafobj: Object passed through to the layer-building helpers.
        token_info: Indexable; element ``2`` is used as the word count the
            tokens of this document start from.
        raw: Raw text accumulated so far.

    Returns:
        A ``(updated_word_count, raw)`` tuple. When the document contains no
        ``node`` element the incoming word count is returned unchanged.
    """
    myinput = ElementTree().parse(inputfile)
    word_count = token_info[2]
    # Default for documents without any 'node' element; previously the
    # return statement raised UnboundLocalError in that case.
    updated_word_count = word_count

    # Iterate the element directly: Element.getchildren() was removed in
    # Python 3.9.
    for elem in myinput:
        if elem.tag == 'node':
            # NOTE(review): every 'node' is processed with the same starting
            # word_count; only the last node's updated count is returned.
            token_dict, updated_word_count = get_tokens(
                elem, word_count, {})
            create_token_and_term_layer(token_dict, token_info, nafobj)
            # code from Ruben to create constituent and dependency layers
            find_propbank_rels(elem, nafobj, word_count)
            add_dependencies_to_naf(elem, nafobj, word_count)
        elif elem.tag == 'sentence':
            # An empty <sentence/> has text None; skip it instead of
            # failing on str + None.
            if elem.text is not None:
                raw += ' ' + elem.text

    return updated_word_count, raw
# XML提供了一个格式方便和用途广泛的工具 # 11.4.3 ElementTree接口 # Python的ElementTree模型提供了一种方便的方式用于访问存储在XML文件中的数据。 merchant_file = nltk.data.find('corpora/shakespeare/merchant.xml') raw = open(merchant_file).read() print(raw[:163]) print(raw[1789:2006]) from xml.etree.ElementTree import ElementTree merchant = ElementTree().parse(merchant_file) merchant merchant[0] merchant[0].text merchant.getchildren() merchant[-2][0].text merchant[-2][1] merchant[-2][1][0].text merchant[-2][1][54] merchant[-2][1][54][0] merchant[-2][1][54][0].text merchant[-2][1][54][1] merchant[-2][1][54][1].text for i, act in enumerate(merchant.findall('ACT')): for j, scene in enumerate(act.findall('SCENE')): for k, speech in enumerate(scene.findall('SPEECH')): for line in speech.findall('LINE'): if 'music' in str(line.text): print('Act %d Scene %d Speech %d: %s' %
# XML提供了一个格式方便和用途广泛的工具 # 4.3 ElementTree接口 # Python的ElementTree模型提供了一种方便的方式用于访问存储在XML文件中的数据。 merchant_file = nltk.data.find('corpora/shakespeare/merchant.xml') raw = open(merchant_file).read() print(raw[:163]) print(raw[1789:2006]) from xml.etree.ElementTree import ElementTree merchant = ElementTree().parse(merchant_file) print(merchant) print(merchant[0]) print(merchant[0].text) print(merchant.getchildren()) print(merchant[-2][0].text) print(merchant[-2][1]) print(merchant[-2][1][0].text) print(merchant[-2][1][54]) print(merchant[-2][1][54][0]) print(merchant[-2][1][54][0].text) print(merchant[-2][1][54][1]) print(merchant[-2][1][54][1].text) for i, act in enumerate(merchant.findall('ACT')): for j, scene in enumerate(act.findall('SCENE')): for k, speech in enumerate(scene.findall('SPEECH')): for line in speech.findall('LINE'): if 'music' in str(line.text): print('Act %d Scene %d Speech %d: %s' %
class CEOSDB(object):
    """Parse one record of a CEOS data file using an XML field description.

    The XML document lists the fields of the record in order. Each ``rec``
    element carries a ``name``, a ``num`` (field size in bytes) and a
    ``type`` (index into ``typeMap``). A ``struct`` element groups child
    fields that are repeated ``loop`` times, where ``loop`` names an
    already-decoded metadata entry holding the repeat count.

    Decoded values are collected in ``self.metadata``; the entry named
    ``'Record Length'`` is copied to ``self.recordLength`` after parsing.
    """

    # The numeric ``type`` attribute of a field is an index into this list.
    typeMap = ['Skip', 'An', 'In', 'B1', 'B4', 'Fn', 'B2', 'Debug']

    def __init__(self, xml=None, dataFile=None):
        """Remember the inputs and pre-load the XML field description.

        Args:
            xml: Path of the XML field description, or None.
            dataFile: Open binary file positioned at the start of the
                record; its current position is stored as the record start.
        """
        self.xml = xml
        self.dataFile = dataFile
        self.startPosition = dataFile.tell()
        self.recordLength = 0
        self.metadata = {}
        if xml is not None:
            self.xmlFP = open(self.xml, 'r')
            try:
                root = ET(file=self.xmlFP).getroot()
            except Exception:
                # Do not leak the handle when the description is malformed.
                self.xmlFP.close()
                self.xmlFP = None
                raise
            # list(element) yields the direct children; it replaces
            # Element.getchildren(), which was removed in Python 3.9.
            self.rootChildren = list(root)
        else:
            self.xmlFP = None
            self.rootChildren = []

    def getMetadata(self):
        """Return the dictionary of decoded fields."""
        return self.metadata

    def getEndOfRecordPosition(self):
        """Return the record start position plus the parsed record length."""
        return self.startPosition + self.recordLength

    def finalizeParser(self):
        """Close the XML file opened by the constructor, if there is one."""
        if self.xmlFP is not None:
            self.xmlFP.close()

    def parseFast(self):
        """
        Use the xml definition of the field positions, names and lengths to
        parse a CEOS data file, reusing the elements loaded by the
        constructor instead of re-reading the XML file.
        """
        self._parseElements(self.rootChildren)
        self.recordLength = self.metadata['Record Length']

    def parse(self):
        """
        Use the xml definition of the field positions, names and lengths to
        parse a CEOS data file. The XML file is re-read on every call and
        its root element is kept in ``self.root``.
        """
        # The context manager closes the file even if parsing fails.
        with open(self.xml, 'r') as xmlFP:
            self.root = ET(file=xmlFP).getroot()
        self._parseElements(self.root)
        self.recordLength = self.metadata['Record Length']

    def _parseElements(self, elements):
        """Decode every ``rec`` and ``struct`` element into the metadata."""
        for element in elements:
            if element.tag == 'rec':
                # A plain old record: one named field.
                key, data = self.decodeNode(element)
                self.metadata[key] = data
            elif element.tag == 'struct':
                # A repeated group: the repeat count was decoded earlier and
                # is looked up by the name given in the 'loop' attribute.
                loopCount = self.metadata[element.attrib['loop']]
                key = element.attrib['name']
                entries = []
                for _ in range(loopCount):
                    entry = {}
                    for node in element:
                        subkey, data = self.decodeNode(node)
                        entry[subkey] = data
                    entries.append(entry)
                self.metadata[key] = entries

    def decodeNode(self, node):
        """
        Read the field described by ``node`` from the data file.

        Returns:
            A ``(name, value)`` tuple for the metadata dictionary.
        """
        key = node.attrib['name']
        size = int(node.attrib['num'])
        format = int(node.attrib['type'])
        data = self.readData(key, size, format)
        return key, data

    def readData(self, key, size, format):
        """
        Read data from a node and return it.

        Args:
            key: Field name (only used for 'Debug' output).
            size: Field size in bytes; ignored for the fixed-width binary
                types 'B1', 'B2' and 'B4'.
            format: Index into ``typeMap``.

        Returns:
            The decoded value, or None for 'Skip' and 'Debug' fields.

        Raises:
            TypeError: If ``format`` does not name a known type.
        """
        try:
            typeName = self.typeMap[format]
        except IndexError:
            # Previously an out-of-range index escaped as IndexError and the
            # intended "Unknown format" error could never be raised.
            raise TypeError("Unknown format %s" % format)

        if typeName == "Skip":
            self.dataFile.seek(size, os.SEEK_CUR)
            return None
        if typeName == "Debug":
            print(key, size, format, self.dataFile.tell())
            # Consume the field so the file position still advances, but do
            # not push it through struct.unpack with an empty format string.
            self.dataFile.read(size)
            return None

        if typeName == "An":
            formatString = "%ss" % size
            convertFunction = self._decodeText
        elif typeName == "In":
            formatString = "%ss" % size
            convertFunction = int
        elif typeName == "Fn":
            formatString = "%ss" % size
            convertFunction = float
        elif typeName == "B4":
            formatString, size, convertFunction = ">I", 4, int
        elif typeName == "B2":
            formatString, size, convertFunction = ">H", 2, int
        elif typeName == "B1":
            formatString, size, convertFunction = ">B", 1, int
        else:
            raise TypeError("Unknown format %s" % format)

        return self._readAndUnpackData(length=size, format=formatString,
                                       typefunc=convertFunction)

    @staticmethod
    def _decodeText(raw):
        """Decode UTF-8 bytes, strip whitespace and trailing NUL bytes."""
        return raw.decode('utf-8').strip().rstrip('\x00')

    def _readAndUnpackData(self, length=None, format=None, typefunc=None,
                           numberOfFields=1):
        """
        Convenience method for reading and unpacking data.

        length is the length of the field in bytes [required]
        format is the format code to use in struct.unpack() [required]
        numberOfFields is the number of fields expected from the call to
            struct.unpack() [default = 1]
        typefunc is the function through which the output of struct.unpack
            will be passed [default = None]

        Returns None (after printing the error) when the bytes read do not
        match ``format``, and 0 when ``typefunc`` rejects the value.
        """
        line = self.dataFile.read(length)
        try:
            data = struct.unpack(format, line)
        except struct.error as strerr:
            print(strerr)
            return None

        if numberOfFields == 1:
            data = data[0]
        if typefunc:
            try:
                if typefunc is float:
                    # float() does not accept a 'D' exponent marker
                    # (presumably Fortran-style output), so rewrite it.
                    data = data.decode('utf-8').replace('D', 'E')
                data = typefunc(data)
            except ValueError:
                # Covers unparsable numbers and undecodable bytes alike.
                data = 0
        return data