def load_ace_file(textfile, fmt):
    """
    Read one ACE document and yield it as a single chunked tree.

    The entity annotations are read from C{textfile + '.tmx.rdc.xml'};
    only mentions whose C{TYPE} attribute is C{'NAME'} are kept, and
    each contributes a C{(start, end, entity_type)} span taken from its
    C{head/charseq} offsets (the end offset is made exclusive).  The
    text file is then read, its markup is removed in a way that keeps
    those character offsets valid, and the text is split into plain
    tokens and entity subtrees.

    As a side effect, the base name of C{textfile} is printed.

    @param textfile: path of the ACE source text file.
    @param fmt: C{'binary'} to label every named entity C{'NE'}, or
        C{'multiclass'} to label each one with its entity type.
    @return: a generator that yields exactly one C{nltk.Tree} rooted at
        C{'S'}, whose children are word tokens and entity subtrees.
    @raise ValueError: if C{fmt} is neither C{'binary'} nor
        C{'multiclass'}.  Because this is a generator, the error is
        raised when it is first advanced, after both files were read.
    """
    print(' - %s' % os.path.split(textfile)[1])
    annfile = textfile + '.tmx.rdc.xml'

    # Read the xml file, and get a list of entities.  The stream is
    # closed explicitly rather than left to the garbage collector, and
    # opened in binary mode so the XML parser applies the encoding that
    # the document itself declares.
    entities = []
    stream = open(annfile, 'rb')
    try:
        xml = ET.parse(stream).getroot()
    finally:
        stream.close()
    for entity in xml.findall('document/entity'):
        typ = entity.find('entity_type').text
        for mention in entity.findall('entity_mention'):
            if mention.get('TYPE') != 'NAME':
                continue  # only NEs
            s = int(mention.find('head/charseq/start').text)
            e = int(mention.find('head/charseq/end').text) + 1
            entities.append((s, e, typ))

    # Read the text file, and mark the entities.
    stream = open(textfile)
    try:
        text = stream.read()
    finally:
        stream.close()

    # Strip XML tags, since they don't count towards the indices
    text = re.sub(r'<(?!/?TEXT)[^>]+>', '', text)

    # Blank out anything before/after <TEXT>.  Everything up to and
    # including the opening tag becomes spaces, minus the 6 characters
    # of '<TEXT>' itself, so that the offsets still line up.
    def subfunc(m):
        return ' ' * (m.end() - m.start() - 6)
    text = re.sub(r'[\s\S]*<TEXT>', subfunc, text)
    text = re.sub(r'</TEXT>[\s\S]*', '', text)

    # Simplify quotes (each replacement is two characters wide, like
    # the text it replaces, so offsets are unaffected).
    text = text.replace("``", ' "')
    text = text.replace("''", '" ')

    if fmt not in ('binary', 'multiclass'):
        raise ValueError('bad fmt value')

    # Walk the entity spans in order, tokenizing the text between them.
    i = 0
    toks = nltk.Tree('S', [])
    for (s, e, typ) in sorted(entities):
        if s < i:
            s = i  # Overlapping! Deal with this better?
        if e <= s:
            continue
        toks.extend(nltk.word_tokenize(text[i:s]))
        if fmt == 'binary':
            label = 'NE'  # Binary distinction (NE or not NE)
        else:
            label = typ  # Multiclass distinction (NE type)
        toks.append(nltk.Tree(label, text[s:e].split()))
        i = e
    toks.extend(nltk.word_tokenize(text[i:]))
    yield toks
def roleset(self, roleset_id):
    """
    Look up a single roleset element by its identifier.

    The part of C{roleset_id} before the first C{'.'} names the frame
    file (C{'frames/<lemma>.xml'}) that is parsed and searched.

    @param roleset_id: the identifier to look for; it is compared with
        the C{id} attribute of each C{predicate/roleset} element.
    @return: the xml description for the given roleset.
    @raise ValueError: if the frame file is not listed in
        C{self._framefiles}, or if it holds no roleset with that id.
    """
    framefile = 'frames/%s.xml' % roleset_id.split('.', 1)[0]
    if framefile not in self._framefiles:
        raise ValueError('Frameset file for %s not found' % roleset_id)

    root = ElementTree.parse(self.abspath(framefile)).getroot()
    for candidate in root.findall('predicate/roleset'):
        if candidate.attrib['id'] == roleset_id:
            return candidate

    # Every roleset was inspected without finding a match.
    raise ValueError('Roleset %s not found in %s' % (roleset_id, framefile))
def roleset(self, roleset_id):
    """
    Look up a single roleset element by its identifier.

    The part of C{roleset_id} before the first C{'.'} names the frame
    file (C{'frames/<lemma>.xml'}) that is opened, parsed and searched.

    @param roleset_id: the identifier to look for; it is compared with
        the C{id} attribute of each C{predicate/roleset} element.
    @return: the xml description for the given roleset.
    @raise ValueError: if the frame file is not listed in
        C{self._framefiles}, or if it holds no roleset with that id.
    """
    lemma = roleset_id.split('.')[0]
    framefile = 'frames/%s.xml' % lemma
    if framefile not in self._framefiles:
        raise ValueError('Frameset file for %s not found' % roleset_id)

    # n.b.: The encoding for XML fileids is specified by the file
    # itself; so we ignore self._encoding here.
    #
    # ElementTree.parse() does not close a stream that it is handed, so
    # close it here once the whole document has been read.
    stream = self.abspath(framefile).open()
    try:
        etree = ElementTree.parse(stream).getroot()
    finally:
        stream.close()

    for roleset in etree.findall('predicate/roleset'):
        if roleset.attrib['id'] == roleset_id:
            return roleset
    raise ValueError('Roleset %s not found in %s' % (roleset_id, framefile))
def parse_from_file(self, filename):
    """
    Parse an XML file and process the resulting tree.

    The parsed document is stored in C{self._raw}, the source is
    recorded in C{self.filename_}, and the document is then passed to
    C{self.parse_tree}.

    @param filename: the source handed unchanged to
        C{ElementTree.parse}.
    """
    # The right-hand side is evaluated first, so nothing is stored on
    # the instance if parsing fails.
    self._raw, self.filename_ = ElementTree.parse(filename), filename
    self.parse_tree(self._raw)
# Unicode-handling walkthrough (flattened script fragment).
# NOTE(review): the line breaks and indentation of this fragment were lost,
# and `c` and `line` are bound before it begins -- presumably by an enclosing
# `for c in line:` loop, with only the first print belonging to the `if`;
# confirm against the full file.
# In order, the statements below:
#   * for a character `c` whose code point is above 127, print its UTF-8
#     bytes, its code point and its Unicode name;
#   * print the index of u'zosta\u0142y' in `line`, lower-case `line`, print
#     its unicode_escape encoding, and print the match of u'\u015b\w*';
#   * tokenize `line` with nltk's WordTokenizer and print the tokens;
#   * read samples/sinorama-gb.xml decoded as gb2312 and, for each line with
#     its last character dropped, print the repr of its UTF-8 encoding;
#   * parse samples/sinorama-utf8.xml, take the text found by
#     findtext('sent'), print the repr of the second line of its UTF-8
#     encoding, then print the text itself.
# NOTE(review): `f` is not closed within this fragment, and `m.group()` would
# fail if re.search() returned None (pattern absent from `line`).
if ord(c) > 127: print '%r U+%04x %s' % (c.encode('utf8'), ord(c), unicodedata.name(c)) print line.find(u'zosta\u0142y') line = line.lower() import re print line.encode('unicode_escape') m = re.search(u'\u015b\w*', line) print m.group() from nltk.tokenize import WordTokenizer tokenizer = WordTokenizer() print tokenizer.tokenize(line) path = nltk.data.find('samples/sinorama-gb.xml') f = codecs.open(path, encoding='gb2312') lines = f.readlines() for l in lines: l = l[:-1] utf_enc = l.encode('utf8') print repr(utf_enc) path = nltk.data.find('samples/sinorama-utf8.xml') from nltk.etree import ElementTree as ET tree = ET.parse(path) text = tree.findtext('sent') uni_text = text.encode('utf8') print repr(uni_text.splitlines()[1]) print "text=", text