示例#1
0
 OUTPUT      Output xml file with tokenized text
                                                                                                                                                                                         
"""

import xml.sax
import sys
from collections import namedtuple, defaultdict

from Tokenizer import PTBTokenizer

# Record types used to hold annotations.
#   aspect_term     -> fields: term, polarity, start, end
#   aspect_category -> fields: category, polarity
aspect_term = namedtuple(
    typename="aspect_term",
    field_names="term polarity start end",
)
aspect_category = namedtuple(
    typename="aspect_category",
    field_names="category polarity",
)

# Module-level Penn Treebank tokenizer: a single PTBTokenizer instance
# (imported from the local Tokenizer module) created once when this module
# is loaded.
tokenizer = PTBTokenizer()


class AnnotationHandler(xml.sax.ContentHandler):
    def __init__(self):
        self.sentences = {}
        self.aspect_terms = defaultdict(list)
        self.aspect_categories = defaultdict(list)
        self.text = ""

    def startElement(self, name, attrs):
        if name == "sentence":
            self.id = int(attrs['id'])
        elif name == "text":
            self.text = ""
        elif name == "aspectTerm":