OUTPUT Output xml file with tokenized text """ import xml.sax import sys from collections import namedtuple, defaultdict from Tokenizer import PTBTokenizer # data types for annotation aspect_term = namedtuple("aspect_term", "term polarity start end") aspect_category = namedtuple("aspect_category", "category polarity") # create Penn Treebank tokenizer tokenizer = PTBTokenizer() class AnnotationHandler(xml.sax.ContentHandler): def __init__(self): self.sentences = {} self.aspect_terms = defaultdict(list) self.aspect_categories = defaultdict(list) self.text = "" def startElement(self, name, attrs): if name == "sentence": self.id = int(attrs['id']) elif name == "text": self.text = "" elif name == "aspectTerm":