Exemplo n.º 1
0
    def __init__(self, encoding=None):
        """Reset every piece of parser state to its starting value.

        `encoding` is stored unchanged in `self._encoding`.
        """
        self.categories = {}
        self._encoding = encoding

        # Document-level state.
        self._state = self._STATE_OutsideAiml
        self._version = self._namespace = ""
        self._forwardCompatibleMode = False

        # Current pattern / that / topic strings.
        self._currentPattern = self._currentThat = self._currentTopic = ""
        self._insideTopic = False

        # The name of the current unknown element.
        self._currentUnknown = ""

        # Set to true when a parse error occurs in a category.
        self._skipCurrentCategory = False

        # Number of parse errors in a particular AIML document; query it with
        # getNumErrors().  If 0, the document is AIML-compliant.
        self._numParseErrors = 0

        # TODO: select the proper validInfo table based on the version number.
        self._validInfo = self._validationInfo101

        # Stack of bools used while parsing <li> elements inside <condition>
        # elements, tracking whether an attribute-less "default" <li> has
        # already been found.  Only one default <li> is allowed per
        # <condition>; a stack is needed to handle nested <condition> tags.
        self._foundDefaultLiStack = []

        # Stack of strings describing the current whitespace-handling
        # behavior; every entry is either "default" or "preserve".  When a new
        # AIML element is encountered a string is pushed, taken from the
        # element's "xml:space" attribute (if absent, the top of the stack is
        # pushed again).  When an element ends, one entry is popped.
        self._whitespaceBehaviorStack = ["default"]

        self._elemStack = []
        locator = Locator()
        self._locator = locator
        self.setDocumentLocator(locator)
Exemplo n.º 2
0
    def __init__(self, encoding="UTF-8"):
        """Put the handler into its initial, nothing-parsed-yet state.

        `encoding` is kept as `self._encoding`.
        """
        self.categories = {}
        self._encoding = encoding
        self._state = self._STATE_OutsideAiml
        self._forwardCompatibleMode = False
        self._insideTopic = False

        # Flag raised when a parse error occurs in a category.
        self._skipCurrentCategory = False

        # String-valued fields all start out empty.  _currentUnknown holds
        # the name of the current unknown element.
        self._version = self._namespace = ""
        self._currentPattern = self._currentThat = self._currentTopic = ""
        self._currentUnknown = ""

        # Running count of parse errors in a particular AIML document.
        # Query with getNumErrors().  If 0, the document is AIML-compliant.
        self._numParseErrors = 0

        # TODO: select the proper validInfo table based on the version number.
        self._validInfo = self._validationInfo101

        # One bool per open <condition> element: has an attribute-less
        # "default" <li> been seen in it yet?  Each <condition> may hold only
        # one default <li>, and nesting of <condition> tags is why this has
        # to be a stack.
        self._foundDefaultLiStack = []

        self._elemStack = []
        self._locator = Locator()
        self.setDocumentLocator(self._locator)
Exemplo n.º 3
0
    def __init__(self):
        """Initialise the locator, progress fields and output holders."""
        ContentHandler.__init__(self)

        # Dummy setDocumentLocator does the same!
        self._locator = Locator()
        self.setDocumentLocator(self._locator)

        # Input line number.
        self.m_lcnt = 0
        # XML path to the current node (names of the ancestors).
        self.m_ppath = []
        # -1: not started synset yet, 0: inside synset, 1: done with synset.
        self.m_done = -1

        # Output structures.
        self.m_syns = Synset()
        self.m_syns_list = []

        # Temp vars for Tuples (std::pair in C++): one "0"/"1" pair per
        # field group.
        self.m_ilrs0_temp = self.m_ilrs1_temp = ''
        self.m_sumolinks0_temp = self.m_sumolinks1_temp = ''
        self.m_elrs0_temp = self.m_elrs1_temp = ''
        self.m_elrs30_temp = self.m_elrs31_temp = ''
        self.m_ekszlinks0_temp = self.m_ekszlinks1_temp = ''
        self.m_vframelinks0_temp = self.m_vframelinks1_temp = ''

        # Was there a starting / an end root tag?
        self.m_startroot = self.m_endroot = False
Exemplo n.º 4
0
    def __init__(self, wn):
        """Create the handler around an existing WordNet query object.

        :param wn: an existing WNQuery object, that will be used for querying.
        @exception SemFeaturesException on file parsing errors
        """
        ContentHandler.__init__(self)

        # Dummy setDocumentLocator does the same!
        locator = Locator()
        self._locator = locator
        self.setDocumentLocator(locator)

        # WordNet (WNQuery).
        self.m_wn = wn
        # Input line number.
        self.m_lcnt = 0
        # XML path to the current node (names of the ancestors).
        self.m_ppath = []
        # Feature currently being processed.
        self.m_currfeat = ''
        # Semantic features to synset ids.
        self.m_featmap = defaultdict(list)
Exemplo n.º 5
0
    def __init__(self, encoding="UTF-8"):
        """Start with no categories and all parser state cleared.

        `encoding` is remembered in `self._encoding`.
        """
        self.categories = {}
        self._encoding = encoding
        self._state = self._STATE_OutsideAiml
        self._version = self._namespace = ""
        self._forwardCompatibleMode = False
        self._currentPattern = self._currentThat = self._currentTopic = ""
        self._insideTopic = False

        # The name of the current unknown element.
        self._currentUnknown = ""

        # Becomes true when a parse error occurs in a category.
        self._skipCurrentCategory = False

        # How many parse errors a particular AIML document produced; query
        # with getNumErrors().  If 0, the document is AIML-compliant.
        self._numParseErrors = 0

        # TODO: select the proper validInfo table based on the version number.
        self._validInfo = self._validationInfo101

        # Bool stack consulted while parsing <li> children of <condition>:
        # it records whether an attribute-less "default" <li> has been found
        # yet.  A <condition> may contain only one default <li>, and the
        # stack form is what makes nested <condition> tags work.
        self._foundDefaultLiStack = []

        self._elemStack = []
        locator = Locator()
        self._locator = locator
        self.setDocumentLocator(locator)
Exemplo n.º 6
0
 def check(self):
     """Parse `self.document` with a new SAX parser that reports to this object.

     A fresh Locator is stored on `self.locator` and registered through
     setDocumentLocator().  Any exception raised along the way is handed to
     `self.error` instead of propagating.
     """
     try:
         parser = make_parser()
         locator = Locator()
         self.locator = locator
         self.setDocumentLocator(locator)
         parser.setErrorHandler(self)
         parser.parse(self.document)
     except Exception as exc:
         self.error(exc)
Exemplo n.º 7
0
    def __init__(self, encoding="UTF-8"):
        """Initialise the base class, then clear all parser state.

        `encoding` is stored as `self._encoding`.
        """
        super().__init__()

        self.categories = {}
        self._encoding = encoding

        self._state = self._STATE_OutsideAiml
        self._version = self._namespace = ""
        self._forwardCompatibleMode = False

        self._currentPattern = self._currentThat = self._currentTopic = ""
        self._insideTopic = False

        # The name of the current unknown element.
        self._currentUnknown = ""

        # Set to true when a parse error occurs in a category.
        self._skipCurrentCategory = False

        # Parse-error counter for a particular AIML document; query with
        # getNumErrors().  If 0, the document is AIML-compliant.
        self._numParseErrors = 0

        # TODO: select the proper validInfo table based on the version number.
        self._validInfo = self._validationInfo101

        # While parsing <li> elements inside <condition> elements, this stack
        # of bools remembers whether an attribute-less "default" <li> has
        # already turned up.  Only one default <li> is allowed in each
        # <condition>; nested <condition> tags are the reason for a stack.
        self._foundDefaultLiStack = []

        # Current whitespace-handling behavior, as a stack of "default" /
        # "preserve" strings.  A new AIML element pushes the value of its
        # "xml:space" attribute (or repeats the top entry when the attribute
        # is absent); ending an element pops one entry.
        self._whitespaceBehaviorStack = ["default"]

        self._elemStack = []
        locator = Locator()
        self._locator = locator
        self.setDocumentLocator(locator)
Exemplo n.º 8
0
    def __init__(self, encoding="UTF-8"):
        """Set every parser field to its initial value; keep `encoding`."""
        self.categories = {}
        self._encoding = encoding
        self._state = self._STATE_OutsideAiml

        # Empty-string fields.
        self._version = self._namespace = ""
        self._currentPattern = self._currentThat = self._currentTopic = ""
        self._currentUnknown = ""

        # Boolean flags, all initially off.
        self._forwardCompatibleMode = False
        self._insideTopic = False
        self._skipCurrentCategory = False

        self._numParseErrors = 0
        self._validInfo = self._validationInfo101

        # Stacks; each attribute gets its own list object.
        self._foundDefaultLiStack = []
        self._whitespaceBehaviorStack = ["default"]
        self._elemStack = []

        locator = Locator()
        self._locator = locator
        self.setDocumentLocator(locator)
Exemplo n.º 9
0
    def __init__(self, encoding="UTF-8"):
        """Clear all parser state and select the extended validation table.

        `encoding` is stored as `self._encoding`.
        """
        self.categories = {}
        self._encoding = encoding
        self._state = self._STATE_OutsideAiml
        self._version = self._namespace = ""
        self._forwardCompatibleMode = False
        self._currentPattern = self._currentThat = self._currentTopic = ""
        self._insideTopic = False

        # The name of the current unknown element.
        self._currentUnknown = ""

        # Special treatment for <learn> to work with Badanswer.aiml
        # (http://www.alicebot.org/aiml/aaa/Badanswer.txt).
        self._insideLearn = False

        # Set to true when a parse error occurs in a category.
        self._skipCurrentCategory = False

        # Count of parse errors in a particular AIML document; query with
        # getNumErrors().  If 0, the document is AIML-compliant.
        self._numParseErrors = 0

        # TODO: select the proper validInfo table based on the version number.
        self._validInfo = self._validationInfo101_ext

        # Stack of bools for <li> elements inside <condition> elements: has
        # an attribute-less "default" <li> been found yet?  Each <condition>
        # allows only one default <li>, and a stack keeps nested <condition>
        # tags straight.
        self._foundDefaultLiStack = []

        # Stack of "default" / "preserve" strings giving the current
        # whitespace-handling behavior.  Each new AIML element pushes a
        # string based on its "xml:space" attribute (the top of the stack is
        # pushed again when the attribute is absent); each element end pops
        # one entry.
        self._whitespaceBehaviorStack = ["default"]

        self._elemStack = []
        locator = Locator()
        self._locator = locator
        self.setDocumentLocator(locator)
Exemplo n.º 10
0
 def __init__(self, encoding="UTF-8"):
     """Initialise the SAX base class and reset all parser state.

     `encoding` is stored as `self._encoding`.
     """
     ContentHandler.__init__(self)

     self.categories = {}
     self._encoding = encoding
     self._state = self._STATE_OutsideAiml

     # Empty-string fields; _currentUnknown is the name of the current
     # unknown element.
     self._version = self._namespace = ""
     self._currentPattern = self._currentThat = self._currentTopic = ""
     self._currentUnknown = ""

     # Boolean flags, all initially off.
     self._forwardCompatibleMode = False
     self._insideTopic = False
     self._skipCurrentCategory = False

     self._numParseErrors = 0
     self._validInfo = self._validationInfo101

     # Stacks; each attribute gets its own list object.
     self._foundDefaultLiStack = []
     self._whitespaceBehaviorStack = ["default"]
     self._elemStack = []

     locator = Locator()
     self._locator = locator
     self.setDocumentLocator(locator)
Exemplo n.º 11
0
class AimlHandler(ContentHandler):
    """SAX content handler that walks an AIML document as a state machine.

    The current position in the document is kept in ``self._state`` as one of
    the ``_STATE_*`` constants defined below.
    """

    # The legal states of the AIML parser.  The numeric order is significant:
    # the error-handling wrappers treat any state >= _STATE_inside_category
    # as "currently inside a category".
    _STATE_outside_aiml = 0
    _STATE_inside_aiml = 1
    _STATE_inside_category = 2
    _STATE_inside_pattern = 3
    _STATE_after_pattern = 4
    _STATE_inside_that = 5
    _STATE_after_that = 6
    _STATE_inside_template = 7
    _STATE_after_template = 8

    def __init__(self, encoding="UTF-8"):
        """Create a handler with no parsed categories and pristine state.

        Args:
            encoding: stored as ``self._encoding``; it can be changed later
                with set_encoding().
        """
        # Run the base-class initialiser first so that any state owned by
        # ContentHandler exists before this class adds its own.
        ContentHandler.__init__(self)

        self.log = LoggingUtils().log
        self.categories = {}
        self._encoding = encoding
        self._state = self._STATE_outside_aiml
        self._version = ""
        self._namespace = ""
        self._forward_compatible_mode = False
        self._current_pattern = ""
        self._current_that = ""
        self._current_topic = ""
        self._inside_topic = False
        self._current_unknown = ""  # the name of the current unknown element

        # This is set to true when a parse error occurs in a category.
        self._skip_current_category = False

        # Counts the number of parse errors in a particular AIML document.
        # query with get_num_errors().  If 0, the document is AIML-compliant.
        self._num_parse_errors = 0

        # TODO: select the proper validInfo table based on the version number.
        self._valid_info = self._validation_info101

        # This stack of bools is used when parsing <li> elements inside
        # <condition> elements, to keep track of whether or not an
        # attribute-less "default" <li> element has been found yet.  Only
        # one default <li> is allowed in each <condition> element.  We need
        # a stack in order to correctly handle nested <condition> tags.
        self._found_default_li_stack = []

        # This stack of strings indicates what the current whitespace-handling
        # behavior should be.  Each string in the stack is either "default" or
        # "preserve".  When a new AIML element is encountered, a new string is
        # pushed onto the stack, based on the value of the element's "xml:space"
        # attribute (if absent, the top of the stack is pushed again).  When
        # ending an element, pop an object off the stack.
        self._whitespace_behavior_stack = ["default"]

        self._elem_stack = []
        # Use our own Locator so _location() always has an object to query.
        self._locator = Locator()
        self.setDocumentLocator(self._locator)

    def get_num_errors(self):
        "Return the number of errors found while parsing the current document."
        return self._num_parse_errors

    def set_encoding(self, encoding):
        """Set the text encoding to use when encoding strings read from XML.

        Defaults to 'UTF-8'.

        """
        self._encoding = encoding

    def _location(self):
        "Return a string describing the current location in the source file."
        line = self._locator.getLineNumber()
        column = self._locator.getColumnNumber()
        return "(line {0}, column {1})".format(line, column)

    def _push_whitespace_behavior(self, attr):
        """Push a new string onto the whitespaceBehaviorStack.

        The string's value is taken from the "xml:space" attribute, if it exists
        and has a legal value ("default" or "preserve").  Otherwise, the previous
        stack element is duplicated.

        """
        assert len(self._whitespace_behavior_stack) > 0, "Whitespace behavior stack should never be empty!"
        try:
            if attr["xml:space"] == "default" or attr["xml:space"] == "preserve":
                self._whitespace_behavior_stack.append(attr["xml:space"])
            else:
                raise AimlParserError("Invalid value for xml:space attribute " + self._location())
        except KeyError:
            self._whitespace_behavior_stack.append(self._whitespace_behavior_stack[-1])

    def startElementNS(self, name, qname, attr):
        self.log.debug("QNAME: {0}".format(qname))
        self.log.debug("NAME: {0}".format(name))
        uri, elem = name
        if elem == "bot":
            self.log.debug("name:" + attr.getValueByQName("name") + "a'ite?")
        self.startElement(elem, attr)
        pass

    def startElement(self, name, attr):
        # Wrapper around _start_element, which catches errors in _start_element()
        # and keeps going.

        # If we're inside an unknown element, ignore everything until we're
        # out again.
        if self._current_unknown != "":
            return
        # If we're skipping the current category, ignore everything until
        # it's finished.
        if self._skip_current_category:
            return

        # process this start-element.
        try:
            self._start_element(name, attr)
        except AimlParserError as msg:
            # Print the error message
            sys.stderr.write("PARSE ERROR: {0}\n".format(msg))

            self._num_parse_errors += 1  # increment error count
            # In case of a parse error, if we're inside a category, skip it.
            if self._state >= self._STATE_inside_category:
                self._skip_current_category = True

    def _start_element(self, name, attr):
        if name == "aiml":
            # <aiml> tags are only legal in the OutsideAiml state
            if self._state != self._STATE_outside_aiml:
                raise AimlParserError("Unexpected <aiml> tag " + self._location())
            self._state = self._STATE_inside_aiml
            self._inside_topic = False
            self._current_topic = ""
            try:
                self._version = attr["version"]
            except KeyError:
                # This SHOULD be a syntax error, but so many AIML sets out there are missing
                # "version" attributes that it just seems nicer to let it slide.
                # raise AimlParserError, "Missing 'version' attribute in <aiml> tag "+self._location()
                # print "WARNING: Missing 'version' attribute in <aiml> tag "+self._location()
                # print "         Defaulting to version 1.0"
                self._version = "1.0"
            self._forward_compatible_mode = (self._version != "1.0.1")
            self._push_whitespace_behavior(attr)
        # Not a comments from @MissMaximass
        # Not sure about this namespace business yet...
        # try:
        # 	self._namespace = attr["xmlns"]
        # 	if self._version == "1.0.1" and self._namespace != "http://alicebot.org/2001/AIML-1.0.1":
        # 		raise AimlParserError, "Incorrect namespace for AIML v1.0.1 "+self._location()
        # except KeyError:
        # 	if self._version != "1.0":
        # 		raise AimlParserError, "Missing 'version' attribute(s) in <aiml> tag "+self._location()
        elif self._state == self._STATE_outside_aiml:
            # If we're outside of an AIML element, we ignore all tags.
            return
        elif name == "topic":
            # <topic> tags are only legal in the InsideAiml state, and only
            # if we're not already inside a topic.
            if (self._state != self._STATE_inside_aiml) or self._inside_topic:
                raise AimlParserError("Unexpected <topic> tag" + self._location())
            try:
                self._current_topic = attr['name']
            except KeyError:
                raise AimlParserError("Required \"name\" attribute missing in <topic> element " + self._location())
            self._inside_topic = True
        elif name == "category":
            # <category> tags are only legal in the InsideAiml state
            if self._state != self._STATE_inside_aiml:
                raise AimlParserError("Unexpected <category> tag " + self._location())
            self._state = self._STATE_inside_category
            self._current_pattern = ""
            self._current_that = ""
            # If we're not inside a topic, the topic is implicitly set to *
            if not self._inside_topic:
                self._current_topic = "*"
            self._elem_stack = []
            self._push_whitespace_behavior(attr)
        elif name == "pattern":
            # <pattern> tags are only legal in the InsideCategory state
            if self._state != self._STATE_inside_category:
                raise AimlParserError("Unexpected <pattern> tag " + self._location())
            self._state = self._STATE_inside_pattern
        elif name == "that" and self._state == self._STATE_after_pattern:
            # <that> are legal either inside a <template> element, or
            # inside a <category> element, between the <pattern> and the
            # <template> elements.  This clause handles the latter case.
            self._state = self._STATE_inside_that
        elif name == "template":
            # <template> tags are only legal in the AfterPattern and AfterThat
            # states
            if self._state not in [self._STATE_after_pattern, self._STATE_after_that]:
                raise AimlParserError("Unexpected <template> tag " + self._location())
            # if no <that> element was specified, it is implicitly set to *
            if self._state == self._STATE_after_pattern:
                self._current_that = "*"
            self._state = self._STATE_inside_template
            self._elem_stack.append(['template', {}])
            self._push_whitespace_behavior(attr)
        elif self._state == self._STATE_inside_pattern:
            # Certain tags are allowed inside <pattern> elements.
            if name == "bot" and attr.get("name", False) and attr["name"] == "name":
                # Insert a special character string that the PatternMgr will
                # replace with the bot's name.
                self._current_pattern += " BOT_NAME "
            else:
                raise AimlParserError(("Unexpected <{0}> tag ".format(name)) + self._location())
        elif self._state == self._STATE_inside_that:
            # Certain tags are allowed inside <that> elements.
            if name == "bot" and attr.get("name", False) and attr["name"] == "name":
                # Insert a special character string that the PatternMgr will
                # replace with the bot's name.
                self._current_that += " BOT_NAME "
            else:
                raise AimlParserError(("Unexpected <{0}> tag ".format(name)) + self._location())
        elif self._state == self._STATE_inside_template and self._valid_info.get(name, False):
            # Starting a new element inside the current pattern. First
            # we need to convert 'attr' into a native Python dictionary,
            # so it can later be marshaled.
            attrDict = {}
            for k, v in attr.items():
                # attrDict[k[1].encode(self._encoding)] = v.encode(self._encoding)
                attrDict[k] = v
            self._validate_elem_start(name, attrDict)
            # Push the current element onto the element stack.
            self._elem_stack.append([name, attrDict])
            self._push_whitespace_behavior(attr)
            # If this is a condition element, push a new entry onto the
            # foundDefaultLiStack
            if name == "condition":
                self._found_default_li_stack.append(False)
        else:
            # we're now inside an unknown element.
            if self._forward_compatible_mode:
                # In Forward Compatibility Mode, we ignore the element and its
                # contents.
                self._current_unknown = name
            else:
                # Otherwise, unknown elements are grounds for error!
                raise AimlParserError(("Unexpected <{0}> tag ".format(name)) + self._location())

    def characters(self, ch):
        # Wrapper around _characters which catches errors in _characters()
        # and keeps going.
        if self._state == self._STATE_outside_aiml:
            # If we're outside of an AIML element, we ignore all text
            return
        if self._current_unknown != "":
            # If we're inside an unknown element, ignore all text
            return
        if self._skip_current_category:
            # If we're skipping the current category, ignore all text.
            return
        try:
            self._characters(ch)
        except AimlParserError as msg:
            # Print the message
            sys.stderr.write("PARSE ERROR: {0}\n".format(msg))
            self._num_parse_errors += 1  # increment error count
            # In case of a parse error, if we're inside a category, skip it.
            if self._state >= self._STATE_inside_category:
                self._skip_current_category = True

    def _characters(self, ch):
        text = str(ch)
        if self._state == self._STATE_inside_pattern:
            # TODO: text inside patterns must be upper-case!
            self._current_pattern += text
        elif self._state == self._STATE_inside_that:
            self._current_that += text
        elif self._state == self._STATE_inside_template:
            # First, see whether the element at the top of the element stack
            # is permitted to contain text.
            try:
                parent = self._elem_stack[-1][0]
                parent_attr = self._elem_stack[-1][1]
                required, optional, can_be_parent = self._valid_info[parent]
                nonBlockStyleCondition = (
                    parent == "condition" and not (parent_attr.get("name", False) and parent_attr.get("value", False)))
                if not can_be_parent:
                    raise AimlParserError(("Unexpected text inside <{0}> element ".format(parent)) + self._location())
                elif parent == "random" or nonBlockStyleCondition:
                    # <random> elements can only contain <li> subelements. However,
                    # there's invariably some whitespace around the <li> that we need
                    # to ignore. Same for non-block-style <condition> elements (i.e.
                    # those which don't have both a "name" and a "value" attribute).
                    if len(text.strip()) == 0:
                        # ignore whitespace inside these elements.
                        return
                    else:
                        # non-whitespace text inside these elements is a syntax error.
                        raise AimlParserError(("Unexpected text inside <{0}> element ".format(parent)) +
                                              self._location())
            except IndexError:
                # the element stack is empty. This should never happen.
                raise AimlParserError("Element stack is empty while validating text " + self._location())

            # Add a new text element to the element at the top of the element
            # stack. If there's already a text element there, simply append the
            # new characters to its contents.
            try:
                text_elem_on_stack = (self._elem_stack[-1][-1][0] == "text")
            except IndexError:
                text_elem_on_stack = False
            except KeyError:
                text_elem_on_stack = False
            if text_elem_on_stack:
                self._elem_stack[-1][-1][2] += text
            else:
                self._elem_stack[-1].append(["text", {"xml:space": self._whitespace_behavior_stack[-1]}, text])
        else:
            # all other text is ignored
            pass

    def endElementNS(self, name, qname):
        uri, elem = name
        self.endElement(elem)

    def endElement(self, name):
        """Wrapper around _end_element which catches errors in _characters()
        and keeps going.

        """
        if self._state == self._STATE_outside_aiml:
            # If we're outside of an AIML element, ignore all tags
            return
        if self._current_unknown != "":
            # see if we're at the end of an unknown element.  If so, we can
            # stop ignoring everything.
            if name == self._current_unknown:
                self._current_unknown = ""
            return
        if self._skip_current_category:
            # If we're skipping the current category, see if it's ending. We
            # stop on ANY </category> tag, since we're not keeping track of
            # state in ignore-mode.
            if name == "category":
                self._skip_current_category = False
                self._state = self._STATE_inside_aiml
            return
        try:
            self._end_element(name)
        except AimlParserError as msg:
            # Print the message
            sys.stderr.write("PARSE ERROR: {0}\n".format(msg))
            self._num_parse_errors += 1  # increment error count
            # In case of a parse error, if we're inside a category, skip it.
            if self._state >= self._STATE_inside_category:
                self._skip_current_category = True

    def _end_element(self, name):
        """Verify that an AIML end element is valid in the current
        context.

        Raises an AimlParserError if an illegal end element is encountered.

        """
        if name == "aiml":
            # </aiml> tags are only legal in the InsideAiml state
            if self._state != self._STATE_inside_aiml:
                raise AimlParserError("Unexpected </aiml> tag " + self._location())
            self._state = self._STATE_outside_aiml
            self._whitespace_behavior_stack.pop()
        elif name == "topic":
            # </topic> tags are only legal in the InsideAiml state, and
            # only if _insideTopic is true.
            if self._state != self._STATE_inside_aiml or not self._inside_topic:
                raise AimlParserError("Unexpected </topic> tag " + self._location())
            self._inside_topic = False
            self._current_topic = ""
        elif name == "category":
            # </category> tags are only legal in the AfterTemplate state
            if self._state != self._STATE_after_template:
                raise AimlParserError("Unexpected </category> tag " + self._location())
            self._state = self._STATE_inside_aiml
            # End the current category.  Store the current pattern/that/topic and
            # element in the categories dictionary.
            key = (self._current_pattern.strip(), self._current_that.strip(), self._current_topic.strip())
            self.categories[key] = self._elem_stack[-1]
            self._whitespace_behavior_stack.pop()
        elif name == "pattern":
            # </pattern> tags are only legal in the InsidePattern state
            if self._state != self._STATE_inside_pattern:
                raise AimlParserError("Unexpected </pattern> tag " + self._location())
            self._state = self._STATE_after_pattern
        elif name == "that" and self._state == self._STATE_inside_that:
            # </that> tags are only allowed inside <template> elements or in
            # the InsideThat state.  This clause handles the latter case.
            self._state = self._STATE_after_that
        elif name == "template":
            # </template> tags are only allowed in the InsideTemplate state.
            if self._state != self._STATE_inside_template:
                raise AimlParserError("Unexpected </template> tag " + self._location())
            self._state = self._STATE_after_template
            self._whitespace_behavior_stack.pop()
        elif self._state == self._STATE_inside_pattern:
            # Certain tags are allowed inside <pattern> elements.
            if name not in ["bot"]:
                raise AimlParserError(("Unexpected </{0}> tag ".format(name)) + self._location())
        elif self._state == self._STATE_inside_that:
            # Certain tags are allowed inside <that> elements.
            if name not in ["bot"]:
                raise AimlParserError(("Unexpected </{0}> tag ".format(name)) + self._location())
        elif self._state == self._STATE_inside_template:
            # End of an element inside the current template.  Append the
            # element at the top of the stack onto the one beneath it.
            elem = self._elem_stack.pop()
            self._elem_stack[-1].append(elem)
            self._whitespace_behavior_stack.pop()
            # If the element was a condition, pop an item off the
            # foundDefaultLiStack as well.
            if elem[0] == "condition":
                self._found_default_li_stack.pop()
        else:
            # Unexpected closing tag
            raise AimlParserError(("Unexpected </{0}> tag ".format(name)) + self._location())

    # Validation table for AIML 1.0.1 template elements, keyed by element
    # name.  Each value is a 3-tuple:
    #   [0] list of REQUIRED attribute names,
    #   [1] list of OPTIONAL attribute names,
    #   [2] bool: whether the element may contain other elements and/or
    #       text (if False, the element can only appear in an atomic
    #       context, such as <date/>).
    # _validate_elem_start() unpacks these tuples both for the element being
    # opened and for its parent.
    _validation_info101 = {
        "bot": (["name"], [], False),
        "condition": ([], ["name", "value"], True),  # can only contain <li> elements
        "date": ([], [], False),
        "formal": ([], [], True),
        "gender": ([], [], True),
        "get": (["name"], [], False),
        "gossip": ([], [], True),
        "id": ([], [], False),
        "input": ([], ["index"], False),
        "javascript": ([], [], True),
        "learn": ([], [], True),
        "li": ([], ["name", "value"], True),
        "lowercase": ([], [], True),
        "person": ([], [], True),
        "person2": ([], [], True),
        "random": ([], [], True),  # can only contain <li> elements
        "sentence": ([], [], True),
        "set": (["name"], [], True),
        "size": ([], [], False),
        "sr": ([], [], False),
        "srai": ([], [], True),
        "star": ([], ["index"], False),
        "system": ([], [], True),
        "template": ([], [], True),  # needs to be in the list because it can be a parent.
        "that": ([], ["index"], False),
        "thatstar": ([], ["index"], False),
        "think": ([], [], True),
        "topicstar": ([], ["index"], False),
        "uppercase": ([], [], True),
        "version": ([], [], False),
    }

    def _validate_elem_start(self, name, attr):
        """Test the validity of an element starting inside a <template>
        element.

        The checks are performed in this order:

        1. attributes of <name> against the required/optional lists in
           ``self._valid_info`` (skipped in forward-compatible mode);
        2. the "index" attribute of <star>, <thatstar> and <topicstar>,
           which must parse as an integer >= 1;
        3. whether the parent element (top of ``self._elem_stack``) is
           allowed to have contents at all;
        4. the nesting rules tying <li> to <random> and to the
           non-block-style <condition> variants.

        ``name`` must be a key of ``self._valid_info`` (a KeyError
        propagates otherwise).  ``attr`` is the element's attribute mapping.

        Raises AimlParserError if the tag is invalid.  Otherwise returns
        True; no news is good news.

        """
        # Check the element's attributes.  Make sure that all required
        # attributes are present, and that any remaining attributes are
        # valid options.
        required, optional, _ = self._valid_info[name]
        for a in required:
            if a not in attr and not self._forward_compatible_mode:
                raise AimlParserError(("Required \"{0}\" attribute missing in <{1}> element "
                                       .format(a, name)) + self._location())
        for a in attr:
            if a in required:
                continue
            if a.startswith("xml:"):
                continue  # attributes in the "xml" namespace can appear anywhere
            if a not in optional and not self._forward_compatible_mode:
                raise AimlParserError(("Unexpected \"{0}\" attribute in <{1}> element "
                                       .format(a, name)) + self._location())

        # special-case: several tags contain an optional "index" attribute.
        # This attribute's value must be a positive integer.
        if name in ("star", "thatstar", "topicstar") and "index" in attr:
            raw_index = attr["index"]
            try:
                index = int(raw_index)
            except (TypeError, ValueError):
                # Only conversion failures mean "bad type"; anything else
                # (e.g. KeyboardInterrupt) must not be masked as a parse error.
                raise AimlParserError(("Bad type for \"{0}\" attribute (expected integer, found \"{1}\") "
                                       .format("index", raw_index)) + self._location())
            if index < 1:
                # Zero is rejected too, so the requirement is "positive".
                raise AimlParserError(("\"{0}\" attribute must have a positive value "
                                       .format("index")) + self._location())

        # See whether the containing element is permitted to contain
        # subelements. If not, this element is invalid no matter what it is.
        try:
            parent = self._elem_stack[-1][0]
            parent_attr = self._elem_stack[-1][1]
        except IndexError:
            # If the stack is empty, no parent is present.  This should never
            # happen.
            raise AimlParserError(("Element stack is empty while validating <{0}> ".format(name)) + self._location())
        _, _, can_be_parent = self._valid_info[parent]
        # A <condition> counts as block-style only when it carries both a
        # (truthy) "name" and a (truthy) "value" attribute.
        non_block_style_condition = (
            parent == "condition" and not (parent_attr.get("name", False) and parent_attr.get("value", False)))
        if not can_be_parent:
            raise AimlParserError(("<{0}> elements cannot have any contents ".format(parent)) + self._location())
        # Special-case test if the parent element is <condition> (the
        # non-block-style variant) or <random>: these elements can only
        # contain <li> subelements.
        elif (parent == "random" or non_block_style_condition) and name != "li":
            raise AimlParserError(("<{0}> elements can only contain <li> subelements "
                                   .format(parent)) + self._location())
        # Special-case test for <li> elements, which can only be contained
        # by non-block-style <condition> and <random> elements, and whose
        # required attributes are dependent upon which attributes are
        # present in the <condition> parent.
        elif name == "li":
            if not (parent == "random" or non_block_style_condition):
                raise AimlParserError(("Unexpected <li> element contained by <{0}> element "
                                       .format(parent)) + self._location())
            if non_block_style_condition:
                if parent_attr.get("name", False):
                    # Single-predicate condition.  Each <li> element except the
                    # last must have a "value" attribute.
                    if len(attr) == 0:
                        # This could be the default <li> element for this <condition>,
                        # unless we've already found one.
                        if self._found_default_li_stack[-1]:
                            raise AimlParserError(
                                "Unexpected default <li> element inside <condition> " + self._location())
                        self._found_default_li_stack[-1] = True
                    elif len(attr) == 1 and "value" in attr:
                        pass  # this is the valid case
                    else:
                        raise AimlParserError("Invalid <li> inside single-predicate <condition> " + self._location())
                elif len(parent_attr) == 0:
                    # Multi-predicate condition.  Each <li> element except the
                    # last must have a "name" and a "value" attribute.
                    if len(attr) == 0:
                        # This could be the default <li> element for this <condition>,
                        # unless we've already found one.
                        if self._found_default_li_stack[-1]:
                            raise AimlParserError(
                                "Unexpected default <li> element inside <condition> " + self._location())
                        self._found_default_li_stack[-1] = True
                    elif len(attr) == 2 and attr.get("value", False) and attr.get("name", False):
                        pass  # this is the valid case
                    else:
                        raise AimlParserError("Invalid <li> inside multi-predicate <condition> " + self._location())
        return True
Exemplo n.º 12
0
class AimlHandler(ContentHandler):
    """SAX content handler that tracks parser state while reading AIML.

    The handler keeps a small state machine (the ``_STATE_*`` constants),
    a count of parse errors, and several stacks used while validating
    nested elements.  Errors raised by ``_startElement`` are reported on
    stderr and counted instead of aborting the parse.

    The syntax used here is valid on both Python 2.6+ and Python 3.
    """

    # The legal states of the AIML parser
    _STATE_OutsideAiml = 0
    _STATE_InsideAiml = 1
    _STATE_InsideCategory = 2
    _STATE_InsidePattern = 3
    _STATE_AfterPattern = 4
    _STATE_InsideThat = 5
    _STATE_AfterThat = 6
    _STATE_InsideTemplate = 7
    _STATE_AfterTemplate = 8

    def __init__(self, encoding="UTF-8"):
        """Initialise parser state; *encoding* is stored for later use."""
        ContentHandler.__init__(self)
        self.categories = {}
        self._encoding = encoding
        self._state = self._STATE_OutsideAiml
        self._version = ""
        self._namespace = ""
        self._forwardCompatibleMode = False
        self._currentPattern = ""
        self._currentThat = ""
        self._currentTopic = ""
        self._insideTopic = False
        self._currentUnknown = ""  # the name of the current unknown element

        # This is set to true when a parse error occurs in a category.
        self._skipCurrentCategory = False

        # Counts the number of parse errors in a particular AIML document.
        # query with getNumErrors().  If 0, the document is AIML-compliant.
        self._numParseErrors = 0

        # TODO: select the proper validInfo table based on the version number.
        self._validInfo = self._validationInfo101

        # This stack of bools is used when parsing <li> elements inside
        # <condition> elements, to keep track of whether or not an
        # attribute-less "default" <li> element has been found yet.  Only
        # one default <li> is allowed in each <condition> element.  We need
        # a stack in order to correctly handle nested <condition> tags.
        self._foundDefaultLiStack = []

        # This stack of strings indicates what the current whitespace-handling
        # behavior should be.  Each string in the stack is either "default" or
        # "preserve".  When a new AIML element is encountered, a new string is
        # pushed onto the stack, based on the value of the element's "xml:space"
        # attribute (if absent, the top of the stack is pushed again).  When
        # ending an element, pop an object off the stack.
        self._whitespaceBehaviorStack = ["default"]

        self._elemStack = []
        self._locator = Locator()
        self.setDocumentLocator(self._locator)

    def getNumErrors(self):
        """Return the number of errors found while parsing the current document."""
        return self._numParseErrors

    def setEncoding(self, encoding):
        """Set the text encoding to use when encoding strings read from XML.

        Defaults to 'UTF-8'.

        """
        self._encoding = encoding

    def _location(self):
        """Return a string describing the current location in the source file."""
        line = self._locator.getLineNumber()
        column = self._locator.getColumnNumber()
        return "(line %d, column %d)" % (line, column)

    def _pushWhitespaceBehavior(self, attr):
        """Push a new string onto the whitespaceBehaviorStack.

        The string's value is taken from the "xml:space" attribute, if it exists
        and has a legal value ("default" or "preserve").  If the attribute is
        absent, the previous stack element is duplicated.  An illegal value
        raises AimlParserError and leaves the stack unchanged.

        """
        assert len(self._whitespaceBehaviorStack) > 0, "Whitespace behavior stack should never be empty!"
        try:
            behavior = attr["xml:space"]
        except KeyError:
            # No explicit setting: inherit the enclosing element's behavior.
            behavior = self._whitespaceBehaviorStack[-1]
        else:
            if behavior not in ("default", "preserve"):
                raise AimlParserError("Invalid value for xml:space attribute " + self._location())
        self._whitespaceBehaviorStack.append(behavior)

    def startElementNS(self, name, qname, attr):
        """Namespace-aware start-element hook; forwards to startElement().

        *name* is unpacked as a ``(uri, localname)`` pair.  The diagnostic
        output on stdout is kept from the original implementation.
        """
        print("QNAME: %s" % (qname,))
        print("NAME: %s" % (name,))
        uri, elem = name
        if elem == "bot":
            print("name: %s a'ite?" % (attr.getValueByQName("name"),))
        self.startElement(elem, attr)

    def startElement(self, name, attr):
        """Process a start tag, recording (not propagating) AIML parse errors.

        Wrapper around _startElement, which catches errors in
        _startElement() and keeps going.
        """
        # If we're inside an unknown element, ignore everything until we're
        # out again.
        if self._currentUnknown != "":
            return
        # If we're skipping the current category, ignore everything until
        # it's finished.
        if self._skipCurrentCategory:
            return

        # process this start-element.
        try:
            self._startElement(name, attr)
        except AimlParserError as msg:
            # Print the error message
            sys.stderr.write("PARSE ERROR: %s\n" % msg)

            self._numParseErrors += 1  # increment error count
            # In case of a parse error, if we're inside a category, skip it.
            if self._state >= self._STATE_InsideCategory:
                self._skipCurrentCategory = True
Exemplo n.º 13
0
        self._accumulator.append(text)


def _wrap_complete(method_name):
    """Attach a forwarding method named *method_name* to TextNormalizeFilter.

    The generated method first calls ``self._complete_text_node()`` and then
    invokes the method of the same name on ``self._downstream`` with the
    arguments it was given.  The downstream call's result is not returned.
    """
    def method(self, *args, **kwargs):
        self._complete_text_node()
        forward = getattr(self._downstream, method_name)
        forward(*args, **kwargs)

    # Make the generated function report the name it is installed under.
    method.__name__ = method_name
    setattr(TextNormalizeFilter, method_name, method)


# Install a forwarding method on TextNormalizeFilter (via _wrap_complete)
# for each of these three handler method names.  The loop variable `n` stays
# bound at module level after the loop finishes.
for n in '''startElement endElement endDocument'''.split():
    _wrap_complete(n)

# Module-level Locator instance.  NOTE(review): nothing in the visible
# lines uses it -- confirm where it is consumed.
documentLocator = Locator()


class WikiDumpHandler(handler.ContentHandler):
    """ContentHandler for wiki dump XML.

    This extends handler.ContentHandler.

    A ContentHandler designed to pull out page ids, titles and text from
    Wiki pages. These are assembled into WikiPage objects and sent off
    to the supplied callback.
    """
    def __init__(self, pageCallBack=None):
        # NOTE(review): pageCallBack is not referenced in the lines visible
        # here; presumably it is stored further down -- confirm.
        handler.ContentHandler.__init__(self)
        # Starts out empty; presumably tracks the element being processed.
        self.currentTag = ''
Exemplo n.º 14
0
class AimlHandler(ContentHandler):
    """SAX content handler holding the parsing state for an AIML document.

    State is tracked with the ``_STATE_*`` constants.  AimlParserError
    exceptions raised by ``_startElement`` are written to stderr and counted
    (see getNumErrors) rather than propagated.

    Only syntax accepted by both Python 2.6+ and Python 3 is used.
    """

    # The legal states of the AIML parser
    _STATE_OutsideAiml    = 0
    _STATE_InsideAiml     = 1
    _STATE_InsideCategory = 2
    _STATE_InsidePattern  = 3
    _STATE_AfterPattern   = 4
    _STATE_InsideThat     = 5
    _STATE_AfterThat      = 6
    _STATE_InsideTemplate = 7
    _STATE_AfterTemplate  = 8

    def __init__(self, encoding="UTF-8"):
        """Reset all parser state; *encoding* is remembered in _encoding."""
        ContentHandler.__init__(self)
        self.categories = {}
        self._encoding = encoding
        self._state = self._STATE_OutsideAiml
        self._version = ""
        self._namespace = ""
        self._forwardCompatibleMode = False
        self._currentPattern = ""
        self._currentThat = ""
        self._currentTopic = ""
        self._insideTopic = False
        self._currentUnknown = ""  # the name of the current unknown element
        # Set to True by startElement() after a parse error inside a category.
        self._skipCurrentCategory = False
        # Number of parse errors seen so far; read with getNumErrors().
        self._numParseErrors = 0
        self._validInfo = self._validationInfo101
        self._foundDefaultLiStack = []
        # Stack of "default"/"preserve" strings; see _pushWhitespaceBehavior().
        self._whitespaceBehaviorStack = ["default"]
        self._elemStack = []
        self._locator = Locator()
        self.setDocumentLocator(self._locator)

    def getNumErrors(self):
        """Return the number of errors found while parsing the current document."""
        return self._numParseErrors

    def setEncoding(self, encoding):
        """Set the text encoding to use when encoding strings read from XML.

        Defaults to 'UTF-8'.

        """
        self._encoding = encoding

    def _location(self):
        """Return a string describing the current location in the source file."""
        return "(line %d, column %d)" % (
            self._locator.getLineNumber(),
            self._locator.getColumnNumber(),
        )

    def _pushWhitespaceBehavior(self, attr):
        """Push a new string onto the whitespaceBehaviorStack.

        The string's value is taken from the "xml:space" attribute, if it exists
        and has a legal value ("default" or "preserve").  When the attribute
        is missing, the previous stack element is duplicated.  Any other
        value raises AimlParserError without modifying the stack.

        """
        assert len(self._whitespaceBehaviorStack) > 0, "Whitespace behavior stack should never be empty!"
        if "xml:space" not in attr:
            # Inherit the behavior of the enclosing element.
            self._whitespaceBehaviorStack.append(self._whitespaceBehaviorStack[-1])
            return
        requested = attr["xml:space"]
        if requested == "default" or requested == "preserve":
            self._whitespaceBehaviorStack.append(requested)
        else:
            raise AimlParserError("Invalid value for xml:space attribute " + self._location())

    def startElementNS(self, name, qname, attr):
        """Namespace-aware start hook: unpack ``(uri, localname)`` and delegate.

        The stdout diagnostics of the original implementation are retained.
        """
        print("QNAME: %s" % (qname,))
        print("NAME: %s" % (name,))
        uri, elem = name
        if elem == "bot":
            print("name: %s a'ite?" % (attr.getValueByQName("name"),))
        self.startElement(elem, attr)

    def startElement(self, name, attr):
        """Handle a start tag via _startElement(), surviving AIML parse errors.

        Nothing is done while inside an unknown element or while the current
        category is being skipped.
        """
        # If we're inside an unknown element, ignore everything until we're
        # out again.
        if self._currentUnknown != "":
            return
        # If we're skipping the current category, ignore everything until
        # it's finished.
        if self._skipCurrentCategory:
            return

        # process this start-element.
        try:
            self._startElement(name, attr)
        except AimlParserError as msg:
            # Print the error message
            sys.stderr.write("PARSE ERROR: %s\n" % msg)

            self._numParseErrors += 1  # increment error count
            # In case of a parse error, if we're inside a category, skip it.
            if self._state >= self._STATE_InsideCategory:
                self._skipCurrentCategory = True
Exemplo n.º 15
0
class WNXMLParserContentHandler(ContentHandler):
    """SAX content handler that turns <SYNSET> elements into Synset objects.

    While parsing, ``m_ppath`` holds the names of the currently open
    elements and ``m_syns`` is the Synset being filled.  When a </SYNSET>
    is seen, ``(synset, start_line)`` is appended to ``m_syns_list`` and a
    fresh Synset is started.  ``parse()`` runs a parser over an input and
    returns that list.
    """

    # Direct children of <SYNSET> whose character data is appended to a
    # string attribute of the current Synset: tag name -> attribute name.
    _SYNSET_TEXT_FIELDS = {
        'ID': 'wnid',
        'ID3': 'wnid3',
        'POS': 'pos',
        'DEF': 'definition',
        'BCS': 'bcs',
        'STAMP': 'stamp',
        'DOMAIN': 'domain',
        'NL': 'nl',
        'TNL': 'tnl',
    }

    # Link elements directly under <SYNSET>.  Their own text and the text of
    # their <TYPE> child are buffered in two handler attributes and stored
    # as a (target, type) tuple when the element closes:
    # tag name -> (Synset list attribute, target buffer, type buffer).
    _LINK_ELEMENTS = {
        'ILR': ('ilrs', 'm_ilrs0_temp', 'm_ilrs1_temp'),
        'SUMO': ('sumolinks', 'm_sumolinks0_temp', 'm_sumolinks1_temp'),
        'ELR': ('elrs', 'm_elrs0_temp', 'm_elrs1_temp'),
        'ELR3': ('elrs3', 'm_elrs30_temp', 'm_elrs31_temp'),
        'EKSZ': ('ekszlinks', 'm_ekszlinks0_temp', 'm_ekszlinks1_temp'),
        'VFRAME': ('vframelinks', 'm_vframelinks0_temp', 'm_vframelinks1_temp'),
    }

    # Shorthand external relations: tag name -> relation type stored in elrs.
    _EQ_RELATIONS = {
        'EQ_NEAR_SYNONYM': 'eq_near_synonym',
        'EQ_HYPERNYM': 'eq_has_hypernym',
        'EQ_HYPONYM': 'eq_has_hyponym',
    }

    def __init__(self):
        """Set up an empty element path, an empty Synset and empty buffers."""
        ContentHandler.__init__(self)
        self._locator = Locator()  # Dummy setDocumentLocator does the same!
        self.setDocumentLocator(self._locator)
        self.m_lcnt = 0  # input line number
        self.m_ppath = []  # contains the XML path to the current node (names of the ancestors)
        self.m_done = -1  # -1: not started synset yet, 0: inside synset, 1: done with synset
        self.m_syns = Synset()  # points to the output struct
        self.m_syns_list = []  # finished (Synset, start line) pairs

        # Temp vars for Tuples (std::pair in C++)
        self.m_ilrs0_temp = ''
        self.m_ilrs1_temp = ''

        self.m_sumolinks0_temp = ''
        self.m_sumolinks1_temp = ''

        self.m_elrs0_temp = ''
        self.m_elrs1_temp = ''

        self.m_elrs30_temp = ''
        self.m_elrs31_temp = ''

        self.m_ekszlinks0_temp = ''
        self.m_ekszlinks1_temp = ''

        self.m_vframelinks0_temp = ''
        self.m_vframelinks1_temp = ''

        self.m_startroot = False  # was there a starting root tag?
        self.m_endroot = False  # was there an end root tag?

    def _path_tail(self, count):
        """Return the last *count* names of m_ppath, innermost first.

        The result is padded with '' when fewer than *count* elements are open.
        """
        tail = self.m_ppath[-count:]
        tail.reverse()
        tail.extend([''] * (count - len(tail)))
        return tail

    def _reset_link_temps(self, tag):
        """Clear the two text buffers belonging to link element *tag*."""
        _, target_buf, type_buf = self._LINK_ELEMENTS[tag]
        setattr(self, target_buf, '')
        setattr(self, type_buf, '')

    def _debug_trace(self, label, value):
        """Print the locator position, the current path, *label* and *value*."""
        print('(',
              self._locator.getLineNumber(),
              ', ',
              self._locator.getColumnNumber(),
              '): /',
              '/'.join(self.m_ppath),
              label,
              value,
              sep='')

    def endDocument(self):
        """Raise WNXMLParserException unless the last SYNSET was closed."""
        if self.m_done != 1:  # reached eof before end of segment
            raise WNXMLParserException(
                'Warning: end of file reached before </SYNSET>, possibly corrupt input'
            )

    def startElement(self, name, attrs):
        """Record the opened element and prepare storage for its content.

        Raises WNXMLParserException if a <SYNSET> opens while another one is
        still being read (``m_done == 0``).
        """
        if DEBUG:
            self._debug_trace('/START: ', name)

        self.m_ppath.append(name)
        _, parent, gparent = self._path_tail(3)

        # VisDic XML format fault tolerance (no root tag)
        if name == 'WNXML':
            self.m_startroot = True

        elif name == 'SYNSET':
            if self.m_done == 0:
                # Both literals must be interpolated for the state value to
                # appear in the message.
                # NOTE(review): the wording says "not 0" although this guard
                # fires when m_done == 0 -- confirm the intended text.
                raise WNXMLParserException(
                    'WNXMLParser internal error: SYNSET should start now,'
                    f' but m_done is not 0 ({self.m_done})!')
            self.m_done = 0
            self.m_lcnt = self._locator.getLineNumber()

        elif name == 'LITERAL' and parent == 'SYNONYM' and gparent == 'SYNSET':
            self.m_syns.synonyms.append(Synonym('', ''))

        elif parent == 'SYNSET':
            if name in self._LINK_ELEMENTS:
                # The (target, type) tuple is appended in endElement().
                self._reset_link_temps(name)
            elif name == 'USAGE':
                self.m_syns.usages.append('')
            elif name == 'SNOTE':
                self.m_syns.snotes.append('')
            elif name in self._EQ_RELATIONS:
                # Placeholder whose target text is filled in by characters().
                self.m_syns.elrs.append(['', self._EQ_RELATIONS[name]])

    def characters(self, chrs):
        """Append character data to the field selected by the open elements.

        Text outside a <SYNSET> is ignored.  Raises WNXMLParserException if
        the list that should receive the text is unexpectedly empty.
        """
        if DEBUG:
            self._debug_trace('#PCDATA: ', chrs)

        if self.m_done == 1 or self.m_done == -1:
            return

        # The enclosing element and its two nearest ancestors.
        parent, gparent, ggparent = self._path_tail(3)
        syns = self.m_syns

        if gparent == 'SYNSET' and parent in self._SYNSET_TEXT_FIELDS:  # SYNSET/<field>
            field = self._SYNSET_TEXT_FIELDS[parent]
            setattr(syns, field, getattr(syns, field) + chrs)

        elif parent == 'LITERAL' and gparent == 'SYNONYM':  # SYNSET/SYNONYM/LITERAL
            if len(syns.synonyms) == 0:
                raise WNXMLParserException(
                    'WNXMLParser internal error: synonyms empty at LITERAL tag'
                )
            syns.synonyms[-1].literal += chrs

        elif parent == 'SENSE' and gparent == 'LITERAL' and ggparent == 'SYNONYM':  # SYNSET/SYNONYM/LITERAL/SENSE
            if len(syns.synonyms) == 0:
                raise WNXMLParserException(
                    'WNXMLParser internal error: synonyms empty at SENSE tag')
            syns.synonyms[-1].sense += chrs

        elif parent == 'LNOTE' and gparent == 'LITERAL' and ggparent == 'SYNONYM':  # SYNSET/SYNONYM/LITERAL/LNOTE
            if len(syns.synonyms) == 0:
                raise WNXMLParserException(
                    f'WNXMLParser internal error: synonyms empty({len(syns.synonyms)})'
                    ' at LNOTE tag')
            syns.synonyms[-1].lnote += chrs

        elif parent == 'NUCLEUS' and gparent == 'LITERAL' and ggparent == 'SYNONYM':  # SYNSET/SYNONYM/LITERAL/NUCLEUS
            if len(syns.synonyms) == 0:
                raise WNXMLParserException(
                    'WNXMLParser internal error: synonyms empty at NUCLEUS tag'
                )
            syns.synonyms[-1].nucleus += chrs

        elif parent == 'USAGE' and gparent == 'SYNSET':  # SYNSET/USAGE
            if len(syns.usages) == 0:
                raise WNXMLParserException(
                    'WNXMLParser internal error: usages empty at USAGE tag')
            syns.usages[-1] += chrs

        elif parent == 'SNOTE' and gparent == 'SYNSET':  # SYNSET/SNOTE
            if len(syns.snotes) == 0:
                raise WNXMLParserException(
                    'WNXMLParser internal error: snotes empty at SNOTE tag')
            syns.snotes[-1] += chrs

        elif parent in self._EQ_RELATIONS and gparent == 'SYNSET':  # SYNSET/EQ_*
            # startElement() pushed a ['', relation] placeholder for this
            # element; the text belongs in its first slot.  Buffering it in
            # m_elrs0_temp would lose it, because no end tag flushes that
            # buffer for EQ_* elements.
            if len(syns.elrs) == 0:
                raise WNXMLParserException(
                    'WNXMLParser internal error: elrs empty at {0} tag'.format(parent))
            syns.elrs[-1][0] += chrs

        elif parent in self._LINK_ELEMENTS and gparent == 'SYNSET':  # SYNSET/<link>
            target_buf = self._LINK_ELEMENTS[parent][1]
            setattr(self, target_buf, getattr(self, target_buf) + chrs)

        elif parent == 'TYPE' and gparent in self._LINK_ELEMENTS:  # SYNSET/<link>/TYPE
            type_buf = self._LINK_ELEMENTS[gparent][2]
            setattr(self, type_buf, getattr(self, type_buf) + chrs)

    def endElement(self, name):
        """Store completed data for the closing element and leave it.

        Raises WNXMLParserException if </SYNSET> arrives while no SYNSET is
        being read.
        """
        if DEBUG:
            self._debug_trace('/END: ', name)

        _, parent = self._path_tail(2)

        if name == 'WNXML':  # WNXML
            self.m_endroot = True

        elif name == 'SYNSET':  # SYNSET
            if self.m_done != 0:
                raise WNXMLParserException(
                    'This is impossible!\nThe parser should\'ve caught this error:'
                    ' \'SYNSET\' end tag without previous begin tag')
            self.m_done = 1
            self.m_syns_list.append((self.m_syns, self.m_lcnt))
            self.m_syns = Synset()

        elif parent == 'SYNSET' and name in self._LINK_ELEMENTS:
            # Flush the buffered (target, type) pair into the Synset.
            list_attr, target_buf, type_buf = self._LINK_ELEMENTS[name]
            getattr(self.m_syns, list_attr).append(
                (getattr(self, target_buf), getattr(self, type_buf)))
            self._reset_link_temps(name)

        self.m_ppath.pop()

    def parse(self, input_file):
        """Parse *input_file* and return the list of (Synset, line) pairs.

        The handler registers itself as the content handler of a new parser.
        Source of the approach: http://stackoverflow.com/a/12263340
        """
        # Make parser
        xml_reader = make_parser()
        # set self as ContentHandler
        xml_reader.setContentHandler(self)
        # Set ErrorHandler
        xml_reader.setErrorHandler(WNXMLParserErrorHandler())
        # Do the actual parsing
        xml_reader.parse(input_file)
        # Return the gathered result
        return self.m_syns_list
Exemplo n.º 16
0
class AimlHandler(ContentHandler):
    """SAX content handler that validates AIML and collects its categories.

    The handler is a state machine driven by the SAX callbacks.  Every
    successfully parsed <category> is stored in ``self.categories`` under the
    key ``(pattern, that, topic)`` (each stripped of surrounding whitespace).
    The value is the <template> as a nested list
    ``[name, attrDict, child, ...]``; text children have the form
    ``["text", {"xml:space": behavior}, string]``.

    Parse errors are written to stderr and counted (see getNumErrors()).  An
    error inside a category causes the rest of that category to be skipped;
    parsing then continues with the next one.
    """

    # The legal states of the AIML parser
    _STATE_OutsideAiml = 0     # before <aiml> or after </aiml>
    _STATE_InsideAiml = 1      # inside <aiml>, between categories
    _STATE_InsideCategory = 2  # after <category>, before <pattern>
    _STATE_InsidePattern = 3
    _STATE_AfterPattern = 4
    _STATE_InsideThat = 5      # pattern-side <that>, not the template tag
    _STATE_AfterThat = 6
    _STATE_InsideTemplate = 7
    _STATE_AfterTemplate = 8   # after </template>, before </category>

    def __init__(self, encoding="UTF-8"):
        """Create a handler positioned outside any <aiml> element.

        encoding -- stored on the instance and changeable via setEncoding().
            Element names, attributes and text are kept as ``str``, so the
            value is retained for backward compatibility only.
        """
        self.categories = {}
        self._encoding = encoding
        self._state = self._STATE_OutsideAiml
        self._version = ""
        self._namespace = ""
        self._forwardCompatibleMode = False
        self._currentPattern = ""
        self._currentThat = ""
        self._currentTopic = ""
        self._insideTopic = False
        self._currentUnknown = ""  # the name of the current unknown element

        # This is set to true when a parse error occurs in a category.
        self._skipCurrentCategory = False

        # Counts the number of parse errors in a particular AIML document.
        # query with getNumErrors().  If 0, the document is AIML-compliant.
        self._numParseErrors = 0

        # TODO: select the proper validInfo table based on the version number.
        self._validInfo = self._validationInfo101

        # This stack of bools is used when parsing <li> elements inside
        # <condition> elements, to keep track of whether or not an
        # attribute-less "default" <li> element has been found yet.  Only
        # one default <li> is allowed in each <condition> element.  We need
        # a stack in order to correctly handle nested <condition> tags.
        self._foundDefaultLiStack = []

        # This stack of strings indicates what the current whitespace-handling
        # behavior should be.  Each string in the stack is either "default" or
        # "preserve".  When a new AIML element is encountered, a new string is
        # pushed onto the stack, based on the value of the element's
        # "xml:space" attribute (if absent, the top of the stack is pushed
        # again).  When ending an element, pop an object off the stack.
        self._whitespaceBehaviorStack = ["default"]

        self._elemStack = []
        # Placeholder locator; a reader may replace it through
        # setDocumentLocator() when parsing starts.
        self._locator = Locator()
        self.setDocumentLocator(self._locator)

    def getNumErrors(self):
        "Return the number of errors found while parsing the current document."
        return self._numParseErrors

    def setEncoding(self, encoding):
        """Set the text encoding recorded on this handler.

        Defaults to 'UTF-8'.

        """
        self._encoding = encoding

    def _location(self):
        "Return a string describing the current location in the source file."
        line = self._locator.getLineNumber()
        column = self._locator.getColumnNumber()
        return "(line %d, column %d)" % (line, column)

    def _reportParseError(self, msg):
        """Record one parse error raised by a _startElement/_characters/_endElement call.

        The message is written to stderr and the error counter is bumped.  If
        the parser is anywhere inside a category, the rest of that category is
        flagged to be skipped.
        """
        sys.stderr.write("PARSE ERROR: %s\n" % msg)
        self._numParseErrors += 1
        if self._state >= self._STATE_InsideCategory:
            self._skipCurrentCategory = True

    def _pushWhitespaceBehavior(self, attr):
        """Push a new string onto the whitespaceBehaviorStack.

        The string's value is taken from the "xml:space" attribute, if it
        exists and has a legal value ("default" or "preserve").  If the
        attribute is absent, the previous stack element is duplicated.

        Raises AimlParserError if the attribute is present with any other
        value; nothing is pushed in that case.
        """
        assert len(self._whitespaceBehaviorStack) > 0, "Whitespace behavior stack should never be empty!"
        try:
            if attr["xml:space"] == "default" or attr["xml:space"] == "preserve":
                self._whitespaceBehaviorStack.append(attr["xml:space"])
            else:
                raise AimlParserError("Invalid value for xml:space attribute "+self._location())
        except KeyError:
            self._whitespaceBehaviorStack.append(self._whitespaceBehaviorStack[-1])

    def startElementNS(self, name, qname, attr):
        """Namespace-aware start tag: forward the local name to startElement().

        name is the ``(uri, localname)`` pair supplied by the reader; the URI
        and qname are not used.
        """
        # The former debug output here looked up the "name" attribute of every
        # <bot> element unconditionally, which raised KeyError when it was
        # absent; validation of attributes is left to _startElement().
        _uri, elem = name
        self.startElement(elem, attr)

    def startElement(self, name, attr):
        """Wrapper around _startElement() which reports errors and keeps going."""
        # If we're inside an unknown element, ignore everything until we're
        # out again.
        if self._currentUnknown != "":
            return
        # If we're skipping the current category, ignore everything until
        # it's finished.
        if self._skipCurrentCategory:
            return

        # process this start-element.
        try:
            self._startElement(name, attr)
        except AimlParserError as msg:
            self._reportParseError(msg)

    def _startElement(self, name, attr):
        """Validate a start tag against the current state and act on it.

        Raises AimlParserError if the tag is not legal in the current state
        (or, outside forward-compatible mode, if it is unknown).
        """
        if name == "aiml":
            # <aiml> tags are only legal in the OutsideAiml state
            if self._state != self._STATE_OutsideAiml:
                raise AimlParserError("Unexpected <aiml> tag "+self._location())
            self._state = self._STATE_InsideAiml
            self._insideTopic = False
            self._currentTopic = ""
            try:
                self._version = attr["version"]
            except KeyError:
                # This SHOULD be a syntax error, but so many AIML sets out
                # there are missing "version" attributes that it just seems
                # nicer to let it slide and default to version 1.0.
                self._version = "1.0"
            self._forwardCompatibleMode = (self._version != "1.0.1")
            self._pushWhitespaceBehavior(attr)
            # NOTE: the "xmlns" namespace attribute is not validated yet.
        elif self._state == self._STATE_OutsideAiml:
            # If we're outside of an AIML element, we ignore all tags.
            return
        elif name == "topic":
            # <topic> tags are only legal in the InsideAiml state, and only
            # if we're not already inside a topic.
            if (self._state != self._STATE_InsideAiml) or self._insideTopic:
                raise AimlParserError("Unexpected <topic> tag "+self._location())
            try:
                self._currentTopic = str(attr['name'])
            except KeyError:
                raise AimlParserError("Required \"name\" attribute missing in <topic> element "+self._location())
            self._insideTopic = True
        elif name == "category":
            # <category> tags are only legal in the InsideAiml state
            if self._state != self._STATE_InsideAiml:
                raise AimlParserError("Unexpected <category> tag "+self._location())
            self._state = self._STATE_InsideCategory
            self._currentPattern = ""
            self._currentThat = ""
            # If we're not inside a topic, the topic is implicitly set to *
            if not self._insideTopic:
                self._currentTopic = "*"
            self._elemStack = []
            self._pushWhitespaceBehavior(attr)
        elif name == "pattern":
            # <pattern> tags are only legal in the InsideCategory state
            if self._state != self._STATE_InsideCategory:
                raise AimlParserError("Unexpected <pattern> tag "+self._location())
            self._state = self._STATE_InsidePattern
        elif name == "that" and self._state == self._STATE_AfterPattern:
            # <that> are legal either inside a <template> element, or
            # inside a <category> element, between the <pattern> and the
            # <template> elements.  This clause handles the latter case.
            self._state = self._STATE_InsideThat
        elif name == "template":
            # <template> tags are only legal in the AfterPattern and AfterThat
            # states
            if self._state not in [self._STATE_AfterPattern, self._STATE_AfterThat]:
                raise AimlParserError("Unexpected <template> tag "+self._location())
            # if no <that> element was specified, it is implicitly set to *
            if self._state == self._STATE_AfterPattern:
                self._currentThat = "*"
            self._state = self._STATE_InsideTemplate
            self._elemStack.append(['template', {}])
            self._pushWhitespaceBehavior(attr)
        elif self._state == self._STATE_InsidePattern:
            # Certain tags are allowed inside <pattern> elements.
            if name == "bot" and "name" in attr and attr["name"] == "name":
                # Insert a special character string that the PatternMgr will
                # replace with the bot's name.
                self._currentPattern += " BOT_NAME "
            else:
                raise AimlParserError(("Unexpected <%s> tag " % name)+self._location())
        elif self._state == self._STATE_InsideThat:
            # Certain tags are allowed inside <that> elements.
            if name == "bot" and "name" in attr and attr["name"] == "name":
                # Insert a special character string that the PatternMgr will
                # replace with the bot's name.
                self._currentThat += " BOT_NAME "
            else:
                raise AimlParserError(("Unexpected <%s> tag " % name)+self._location())
        elif self._state == self._STATE_InsideTemplate and name in self._validInfo:
            # Starting a new element inside the current template.  First
            # convert 'attr' into a native Python dictionary, so it can later
            # be marshaled.  Keys and the element name must stay ``str``: the
            # validation table and every later comparison use str literals,
            # so encoding them to bytes would make all lookups fail.
            attrDict = {}
            for k, v in attr.items():
                # Namespace-aware readers key attributes by (uri, localname).
                if isinstance(k, tuple):
                    k = k[1]
                attrDict[str(k)] = str(v)
            self._validateElemStart(name, attrDict, self._version)
            # Push the current element onto the element stack.
            self._elemStack.append([str(name), attrDict])
            self._pushWhitespaceBehavior(attr)
            # If this is a condition element, push a new entry onto the
            # foundDefaultLiStack
            if name == "condition":
                self._foundDefaultLiStack.append(False)
        else:
            # we're now inside an unknown element.
            if self._forwardCompatibleMode:
                # In Forward Compatibility Mode, we ignore the element and its
                # contents.
                self._currentUnknown = name
            else:
                # Otherwise, unknown elements are grounds for error!
                raise AimlParserError(("Unexpected <%s> tag " % name)+self._location())

    def characters(self, ch):
        """Wrapper around _characters() which reports errors and keeps going."""
        if self._state == self._STATE_OutsideAiml:
            # If we're outside of an AIML element, we ignore all text
            return
        if self._currentUnknown != "":
            # If we're inside an unknown element, ignore all text
            return
        if self._skipCurrentCategory:
            # If we're skipping the current category, ignore all text.
            return
        try:
            self._characters(ch)
        except AimlParserError as msg:
            self._reportParseError(msg)

    def _characters(self, ch):
        """Append character data to the current pattern, that or template.

        Raises AimlParserError for text inside a template element that may not
        contain text.  Text in any other state is ignored.
        """
        text = str(ch)
        if self._state == self._STATE_InsidePattern:
            # TODO: text inside patterns must be upper-case!
            self._currentPattern += text
        elif self._state == self._STATE_InsideThat:
            self._currentThat += text
        elif self._state == self._STATE_InsideTemplate:
            # First, see whether the element at the top of the element stack
            # is permitted to contain text.
            try:
                parent = self._elemStack[-1][0]
                parentAttr = self._elemStack[-1][1]
                required, optional, canBeParent = self._validInfo[parent]
                nonBlockStyleCondition = (parent == "condition" and not ("name" in parentAttr and "value" in parentAttr))
                if not canBeParent:
                    raise AimlParserError(("Unexpected text inside <%s> element " % parent)+self._location())
                elif parent == "random" or nonBlockStyleCondition:
                    # <random> elements can only contain <li> subelements.
                    # However, there's invariably some whitespace around the
                    # <li> that we need to ignore.  Same for non-block-style
                    # <condition> elements (i.e. those which don't have both
                    # a "name" and a "value" attribute).
                    if len(text.strip()) == 0:
                        # ignore whitespace inside these elements.
                        return
                    else:
                        # non-whitespace text inside these elements is a
                        # syntax error.
                        raise AimlParserError(("Unexpected text inside <%s> element " % parent)+self._location())
            except IndexError:
                # the element stack is empty. This should never happen.
                raise AimlParserError("Element stack is empty while validating text "+self._location())

            # Add a new text element to the element at the top of the element
            # stack.  If there's already a text element there, simply append
            # the new characters to its contents.  When the top element has no
            # children yet, its last item is the attribute dict, and indexing
            # that with 0 raises KeyError.
            try:
                textElemOnStack = (self._elemStack[-1][-1][0] == "text")
            except IndexError:
                textElemOnStack = False
            except KeyError:
                textElemOnStack = False
            if textElemOnStack:
                self._elemStack[-1][-1][2] += text
            else:
                self._elemStack[-1].append(["text", {"xml:space": self._whitespaceBehaviorStack[-1]}, text])
        else:
            # all other text is ignored
            pass

    def endElementNS(self, name, qname):
        """Namespace-aware end tag: forward the local name to endElement()."""
        _uri, elem = name
        self.endElement(elem)

    def endElement(self, name):
        """Wrapper around _endElement() which reports errors and keeps going.

        Also ends the two "ignore" modes: an unknown element ends at its
        matching close tag, and a skipped category ends at </category>.
        """
        if self._state == self._STATE_OutsideAiml:
            # If we're outside of an AIML element, ignore all tags
            return
        if self._currentUnknown != "":
            # see if we're at the end of an unknown element.  If so, we can
            # stop ignoring everything.
            if name == self._currentUnknown:
                self._currentUnknown = ""
            return
        if self._skipCurrentCategory:
            # If we're skipping the current category, see if it's ending. We
            # stop on ANY </category> tag, since we're not keeping track of
            # state in ignore-mode.
            if name == "category":
                self._skipCurrentCategory = False
                self._state = self._STATE_InsideAiml
            return
        try:
            self._endElement(name)
        except AimlParserError as msg:
            self._reportParseError(msg)

    def _endElement(self, name):
        """Verify that an AIML end element is valid in the current
        context.

        Raises an AimlParserError if an illegal end element is encountered.

        """
        if name == "aiml":
            # </aiml> tags are only legal in the InsideAiml state
            if self._state != self._STATE_InsideAiml:
                raise AimlParserError("Unexpected </aiml> tag "+self._location())
            self._state = self._STATE_OutsideAiml
            self._whitespaceBehaviorStack.pop()
        elif name == "topic":
            # </topic> tags are only legal in the InsideAiml state, and
            # only if _insideTopic is true.
            if self._state != self._STATE_InsideAiml or not self._insideTopic:
                raise AimlParserError("Unexpected </topic> tag "+self._location())
            self._insideTopic = False
            self._currentTopic = ""
        elif name == "category":
            # </category> tags are only legal in the AfterTemplate state
            if self._state != self._STATE_AfterTemplate:
                raise AimlParserError("Unexpected </category> tag "+self._location())
            self._state = self._STATE_InsideAiml
            # End the current category.  Store the current pattern/that/topic
            # and element in the categories dictionary.
            key = (self._currentPattern.strip(), self._currentThat.strip(), self._currentTopic.strip())
            self.categories[key] = self._elemStack[-1]
            self._whitespaceBehaviorStack.pop()
        elif name == "pattern":
            # </pattern> tags are only legal in the InsidePattern state
            if self._state != self._STATE_InsidePattern:
                raise AimlParserError("Unexpected </pattern> tag "+self._location())
            self._state = self._STATE_AfterPattern
        elif name == "that" and self._state == self._STATE_InsideThat:
            # </that> tags are only allowed inside <template> elements or in
            # the InsideThat state.  This clause handles the latter case.
            self._state = self._STATE_AfterThat
        elif name == "template":
            # </template> tags are only allowed in the InsideTemplate state.
            if self._state != self._STATE_InsideTemplate:
                raise AimlParserError("Unexpected </template> tag "+self._location())
            self._state = self._STATE_AfterTemplate
            self._whitespaceBehaviorStack.pop()
        elif self._state == self._STATE_InsidePattern:
            # Certain tags are allowed inside <pattern> elements.
            if name not in ["bot"]:
                raise AimlParserError(("Unexpected </%s> tag " % name)+self._location())
        elif self._state == self._STATE_InsideThat:
            # Certain tags are allowed inside <that> elements.
            if name not in ["bot"]:
                raise AimlParserError(("Unexpected </%s> tag " % name)+self._location())
        elif self._state == self._STATE_InsideTemplate:
            # End of an element inside the current template.  Append the
            # element at the top of the stack onto the one beneath it.
            elem = self._elemStack.pop()
            self._elemStack[-1].append(elem)
            self._whitespaceBehaviorStack.pop()
            # If the element was a condition, pop an item off the
            # foundDefaultLiStack as well.
            if elem[0] == "condition":
                self._foundDefaultLiStack.pop()
        else:
            # Unexpected closing tag
            raise AimlParserError(("Unexpected </%s> tag " % name)+self._location())

    # A dictionary containing a validation information for each AIML
    # element. The keys are the names of the elements.  The values are a
    # tuple of three items. The first is a list containing the names of
    # REQUIRED attributes, the second is a list of OPTIONAL attributes,
    # and the third is a boolean value indicating whether or not the
    # element can contain other elements and/or text (if False, the
    # element can only appear in an atomic context, such as <date/>).
    _validationInfo101 = {
        "bot":          (["name"], [], False),
        "condition":    ([], ["name", "value"], True),  # can only contain <li> elements
        "date":         ([], [], False),
        "formal":       ([], [], True),
        "gender":       ([], [], True),
        "get":          (["name"], [], False),
        "gossip":       ([], [], True),
        "id":           ([], [], False),
        "input":        ([], ["index"], False),
        "javascript":   ([], [], True),
        "learn":        ([], [], True),
        "li":           ([], ["name", "value"], True),
        "lowercase":    ([], [], True),
        "person":       ([], [], True),
        "person2":      ([], [], True),
        "random":       ([], [], True),  # can only contain <li> elements
        "sentence":     ([], [], True),
        "set":          (["name"], [], True),
        "size":         ([], [], False),
        "sr":           ([], [], False),
        "srai":         ([], [], True),
        "star":         ([], ["index"], False),
        "system":       ([], [], True),
        "template":     ([], [], True),  # needs to be in the list because it can be a parent.
        "that":         ([], ["index"], False),
        "thatstar":     ([], ["index"], False),
        "think":        ([], [], True),
        "topicstar":    ([], ["index"], False),
        "uppercase":    ([], [], True),
        "version":      ([], [], False),
    }

    def _validateElemStart(self, name, attr, version):
        """Test the validity of an element starting inside a <template>
        element.

        name -- the element name; must be a key of the validation table.
        attr -- the element's attributes as a plain dict of str to str.
        version -- accepted for interface compatibility; not consulted.

        Raises an AimlParserError if the tag is invalid.  Otherwise returns
        True.

        """
        # Check the element's attributes.  Make sure that all required
        # attributes are present, and that any remaining attributes are
        # valid options.
        required, optional, canBeParent = self._validInfo[name]
        for a in required:
            if a not in attr and not self._forwardCompatibleMode:
                raise AimlParserError(("Required \"%s\" attribute missing in <%s> element " % (a, name))+self._location())
        for a in attr:
            if a in required:
                continue
            if a[0:4] == "xml:":
                continue  # attributes in the "xml" namespace can appear anywhere
            if a not in optional and not self._forwardCompatibleMode:
                raise AimlParserError(("Unexpected \"%s\" attribute in <%s> element " % (a, name))+self._location())

        # special-case: several tags contain an optional "index" attribute.
        # This attribute's value must be a positive integer.
        if name in ["star", "thatstar", "topicstar"] and "index" in attr:
            k, v = "index", attr["index"]
            try:
                temp = int(v)
            except (TypeError, ValueError):
                raise AimlParserError(("Bad type for \"%s\" attribute (expected integer, found \"%s\") " % (k, v))+self._location())
            if temp < 1:
                raise AimlParserError(("\"%s\" attribute must have a positive value " % (k))+self._location())

        # See whether the containing element is permitted to contain
        # subelements. If not, this element is invalid no matter what it is.
        try:
            parent = self._elemStack[-1][0]
            parentAttr = self._elemStack[-1][1]
        except IndexError:
            # If the stack is empty, no parent is present.  This should never
            # happen.
            raise AimlParserError(("Element stack is empty while validating <%s> " % name)+self._location())
        required, optional, canBeParent = self._validInfo[parent]
        nonBlockStyleCondition = (parent == "condition" and not ("name" in parentAttr and "value" in parentAttr))
        if not canBeParent:
            raise AimlParserError(("<%s> elements cannot have any contents " % parent)+self._location())
        # Special-case test if the parent element is <condition> (the
        # non-block-style variant) or <random>: these elements can only
        # contain <li> subelements.
        elif (parent == "random" or nonBlockStyleCondition) and name != "li":
            raise AimlParserError(("<%s> elements can only contain <li> subelements " % parent)+self._location())
        # Special-case test for <li> elements, which can only be contained
        # by non-block-style <condition> and <random> elements, and whose
        # required attributes are dependent upon which attributes are
        # present in the <condition> parent.
        elif name == "li":
            if not (parent == "random" or nonBlockStyleCondition):
                raise AimlParserError(("Unexpected <li> element contained by <%s> element " % parent)+self._location())
            if nonBlockStyleCondition:
                if "name" in parentAttr:
                    # Single-predicate condition.  Each <li> element except the
                    # last must have a "value" attribute.
                    if len(attr) == 0:
                        # This could be the default <li> element for this
                        # <condition>, unless we've already found one.
                        if self._foundDefaultLiStack[-1]:
                            raise AimlParserError("Unexpected default <li> element inside <condition> "+self._location())
                        else:
                            self._foundDefaultLiStack[-1] = True
                    elif len(attr) == 1 and "value" in attr:
                        pass  # this is the valid case
                    else:
                        raise AimlParserError("Invalid <li> inside single-predicate <condition> "+self._location())
                elif len(parentAttr) == 0:
                    # Multi-predicate condition.  Each <li> element except the
                    # last must have a "name" and a "value" attribute.
                    if len(attr) == 0:
                        # This could be the default <li> element for this
                        # <condition>, unless we've already found one.
                        if self._foundDefaultLiStack[-1]:
                            raise AimlParserError("Unexpected default <li> element inside <condition> "+self._location())
                        else:
                            self._foundDefaultLiStack[-1] = True
                    elif len(attr) == 2 and "value" in attr and "name" in attr:
                        pass  # this is the valid case
                    else:
                        raise AimlParserError("Invalid <li> inside multi-predicate <condition> "+self._location())
        # All is well!
        return True
Exemplo n.º 17
0
class AimlHandler(ContentHandler):
    """SAX content handler for AIML documents (start-element side only).

    NOTE(review): this class refers to ``self._validationInfo101`` and
    ``self._startElement`` but defines neither in this excerpt; they are
    presumably supplied elsewhere -- confirm before instantiating.
    """

    # The legal states of the AIML parser
    _STATE_OutsideAiml = 0
    _STATE_InsideAiml = 1
    _STATE_InsideCategory = 2
    _STATE_InsidePattern = 3
    _STATE_AfterPattern = 4
    _STATE_InsideThat = 5
    _STATE_AfterThat = 6
    _STATE_InsideTemplate = 7
    _STATE_AfterTemplate = 8

    def __init__(self, encoding="UTF-8"):
        """Create a handler positioned outside any <aiml> element.

        encoding -- stored on the instance; changeable via setEncoding().
        """
        self.categories = {}
        self._encoding = encoding
        self._state = self._STATE_OutsideAiml
        self._version = ""
        self._namespace = ""
        self._forwardCompatibleMode = False
        self._currentPattern = ""
        self._currentThat = ""
        self._currentTopic = ""
        self._insideTopic = False
        self._currentUnknown = ""  # the name of the current unknown element

        # This is set to true when a parse error occurs in a category.
        self._skipCurrentCategory = False

        # Counts the number of parse errors in a particular AIML document.
        # query with getNumErrors().  If 0, the document is AIML-compliant.
        self._numParseErrors = 0

        # TODO: select the proper validInfo table based on the version number.
        self._validInfo = self._validationInfo101

        # This stack of bools is used when parsing <li> elements inside
        # <condition> elements, to keep track of whether or not an
        # attribute-less "default" <li> element has been found yet.  Only
        # one default <li> is allowed in each <condition> element.  We need
        # a stack in order to correctly handle nested <condition> tags.
        self._foundDefaultLiStack = []

        # This stack of strings indicates what the current whitespace-handling
        # behavior should be.  Each string in the stack is either "default" or
        # "preserve".  When a new AIML element is encountered, a new string is
        # pushed onto the stack, based on the value of the element's "xml:space"
        # attribute (if absent, the top of the stack is pushed again).  When
        # ending an element, pop an object off the stack.
        self._whitespaceBehaviorStack = ["default"]

        self._elemStack = []
        self._locator = Locator()
        self.setDocumentLocator(self._locator)

    def getNumErrors(self):
        "Return the number of errors found while parsing the current document."
        return self._numParseErrors

    def setEncoding(self, encoding):
        """Set the text encoding to use when encoding strings read from XML.

        Defaults to 'UTF-8'.

        """
        self._encoding = encoding

    def _location(self):
        "Return a string describing the current location in the source file."
        line = self._locator.getLineNumber()
        column = self._locator.getColumnNumber()
        return "(line %d, column %d)" % (line, column)

    def _pushWhitespaceBehavior(self, attr):
        """Push a new string onto the whitespaceBehaviorStack.

        The string's value is taken from the "xml:space" attribute, if it
        exists and has a legal value ("default" or "preserve").  If the
        attribute is absent, the previous stack element is duplicated.

        Raises AimlParserError if the attribute is present with any other
        value; nothing is pushed in that case.
        """
        assert len(self._whitespaceBehaviorStack) > 0, \
            "Whitespace behavior stack should never be empty!"
        try:
            if attr["xml:space"] == "default" or attr["xml:space"] == "preserve":
                self._whitespaceBehaviorStack.append(attr["xml:space"])
            else:
                # Call form of raise: valid on both Python 2 and Python 3.
                raise AimlParserError(
                    "Invalid value for xml:space attribute " + self._location())
        except KeyError:
            self._whitespaceBehaviorStack.append(
                self._whitespaceBehaviorStack[-1])

    def startElementNS(self, name, qname, attr):
        """Namespace-aware start tag: forward the local name to startElement().

        name is the ``(uri, localname)`` pair supplied by the reader; the URI
        and qname are not used.
        """
        # The former debug print statements were Python-2-only syntax and
        # looked up the "name" attribute of every <bot> unconditionally,
        # which raised KeyError when it was absent.
        _uri, elem = name
        self.startElement(elem, attr)

    def startElement(self, name, attr):
        """Wrapper around _startElement() which reports errors and keeps going."""
        # If we're inside an unknown element, ignore everything until we're
        # out again.
        if self._currentUnknown != "":
            return
        # If we're skipping the current category, ignore everything until
        # it's finished.
        if self._skipCurrentCategory:
            return

        # process this start-element.
        try:
            self._startElement(name, attr)
        except AimlParserError as msg:
            # Print the error message
            sys.stderr.write("PARSE ERROR: %s\n" % msg)

            self._numParseErrors += 1  # increment error count
            # In case of a parse error, if we're inside a category, skip it.
            if self._state >= self._STATE_InsideCategory:
                self._skipCurrentCategory = True
Exemplo n.º 18
0
class AimlHandler(ContentHandler):
    '''A SAX handler for AIML files.'''

    # Legal states of the AIML parser
    _STATE_OutsideAiml = 0
    _STATE_InsideAiml = 1
    _STATE_InsideCategory = 2
    _STATE_InsidePattern = 3
    _STATE_AfterPattern = 4
    _STATE_InsideThat = 5
    _STATE_AfterThat = 6
    _STATE_InsideTemplate = 7
    _STATE_AfterTemplate = 8

    def __init__(self, encoding=None):
        self.categories = {}
        self._encoding = encoding
        self._state = self._STATE_OutsideAiml
        self._version = ""
        self._namespace = ""
        self._forwardCompatibleMode = False
        self._currentPattern = ""
        self._currentThat = ""
        self._currentTopic = ""
        self._insideTopic = False
        self._currentUnknown = ""  # the name of the current unknown element

        # 在类别中发生分析错误时,将其设置为true。
        self._skipCurrentCategory = False

        # 统计特定AIML文档中解析错误的数量。用getNumErrors()查询。 如果为0,则文档符合AIML。
        self._numParseErrors = 0

        # TODO: 根据版本号选择合适的validInfo表。
        self._validInfo = self._validationInfo101

        # 这个bool值栈在解析<condition>元素中的<li>元素时,用来跟踪是否已经找到了一个无属性的“default”<li>元素。
        # 每个<condition>元素中只允许有一个默认的<li>。  我们需要一个栈来正确处理嵌套的<condition>标签。
        self._foundDefaultLiStack = []

        # 这个字符串堆栈表示当前的空白处理行为应该是什么。堆栈中的每个字符串都是"default" 或"preserve"。
        # 当遇到一个新的AIML元素时,根据元素的“xml:space”属性的值(如果没有,堆栈顶部被再次push),一个新的字符串被压入栈中。
        # 一个元素结束时,从堆栈中弹出一个对象。
        self._whitespaceBehaviorStack = ["default"]

        self._elemStack = []
        self._locator = Locator()
        self.setDocumentLocator(self._locator)

    def getNumErrors(self):
        "返回解析当前文档时发现的错误数。"
        return self._numParseErrors

    def setEncoding(self, encoding):
        """设置在对从XML读取的字符串进行编码时使用的文本编码。  默认为'UTF-8'。        """
        self._encoding = encoding

    def _location(self):
        "返回描述源文件中当前位置的字符串。"
        line = self._locator.getLineNumber()
        column = self._locator.getColumnNumber()
        return "(line %d, column %d)" % (line, column)

    def _pushWhitespaceBehavior(self, attr):
        """将一个新的字符串推送到_whitespaceBehaviorStack。
     该字符串的值取自“xml:space”属性,如果它存在且具有合法值(“default”或“preserve”)。
         否则,以前的堆栈元素是重复的。         """
        assert len(self._whitespaceBehaviorStack
                   ) > 0, "Whitespace behavior stack should never be empty!"
        try:
            if attr["xml:space"] == "default" or attr[
                    "xml:space"] == "preserve":
                self._whitespaceBehaviorStack.append(attr["xml:space"])
            else:
                raise AimlParserError(
                    "Invalid value for xml:space attribute " +
                    self._location())
        except KeyError:
            self._whitespaceBehaviorStack.append(
                self._whitespaceBehaviorStack[-1])

    def startElementNS(self, name, qname, attr):
        """Handle a namespace-aware start tag by delegating to startElement().

        Args:
            name: ``(uri, localname)`` pair identifying the element.
            qname: raw qualified name of the element (not used).
            attr: attributes object, passed through unchanged.
        """
        # Deliberately no diagnostic printing here: looking up the "name"
        # attribute of a <bot> tag with attr.getValueByQName("name") just to
        # print it raises KeyError when the attribute is absent, which would
        # abort the parse before startElement() could report the problem.
        _uri, elem = name
        self.startElement(elem, attr)

    def startElement(self, name, attr):
        # 包装在_startElement周围,捕获_startElement()中的错误并继续前进。
        # 如果我们在一个未知的元素内,那么在再次出来之前,不要理会任何东西。
        if self._currentUnknown != "":
            return
        #  如果我们跳过当前类别,则在完成之前忽略所有内容。
        if self._skipCurrentCategory:
            return

        # 处理这个起始元素.
        try:
            self._startElement(name, attr)
        except AimlParserError as err:
            # 打印错误消息
            sys.stderr.write("PARSE ERROR: %s\n" % err)

            self._numParseErrors += 1  # increment error count
            # 当发生解析错误时,如果在一个category内,跳过它。
            if self._state >= self._STATE_InsideCategory:
                self._skipCurrentCategory = True

    def _startElement(self, name, attr):
        if name == "aiml":
            # <aiml> tags are only legal in the OutsideAiml state
            if self._state != self._STATE_OutsideAiml:
                raise AimlParserError("Unexpected <aiml> tag " +
                                      self._location())
            self._state = self._STATE_InsideAiml
            self._insideTopic = False
            self._currentTopic = u""
            try:
                self._version = attr["version"]
            except KeyError:
                # 这原本应该是一个语法错误,但是大量的AIML被指出缺少“版本”属性,让它溜走似乎更好。
                #raise AimlParserError( "Missing 'version' attribute in <aiml> tag "+self._location() )
                #print( "WARNING: Missing 'version' attribute in <aiml> tag "+self._location() )
                #print( "         Defaulting to version 1.0" )
                self._version = "1.0"
            self._forwardCompatibleMode = (self._version != "1.0.1")
            self._pushWhitespaceBehavior(attr)
            # 这个名字空间的业务尚不明确......
            #try:
            #   self._namespace = attr["xmlns"]
            #   if self._version == "1.0.1" and self._namespace != "http://alicebot.org/2001/AIML-1.0.1":
            #       raise AimlParserError( "Incorrect namespace for AIML v1.0.1 "+self._location() )
            #except KeyError:
            #   if self._version != "1.0":
            #       raise AimlParserError( "Missing 'version' attribute(s) in <aiml> tag "+self._location() )
        elif self._state == self._STATE_OutsideAiml:
            # 如果在AIML元素之外,我们会忽略所有的标签。
            return
        elif name == "topic":
            # <topic>标签只有在 InsideAiml 状态,而且不在在一个<topic>内。才是合法的,
            if (self._state != self._STATE_InsideAiml) or self._insideTopic:
                raise AimlParserError("Unexpected <topic> tag",
                                      self._location())
            try:
                self._currentTopic = unicode(attr['name'])
            except KeyError:
                raise AimlParserError(
                    "Required \"name\" attribute missing in <topic> element " +
                    self._location())
            self._insideTopic = True
        elif name == "category":
            # <category> 标签只有在 InsideAiml 状态才是合法的
            if self._state != self._STATE_InsideAiml:
                raise AimlParserError("Unexpected <category> tag " +
                                      self._location())
            self._state = self._STATE_InsideCategory
            self._currentPattern = u""
            self._currentThat = u""
            # 如果不在 topic 内部, topic 被隐式设置为 *
            if not self._insideTopic: self._currentTopic = u"*"
            self._elemStack = []
            self._pushWhitespaceBehavior(attr)
        elif name == "pattern":
            # <pattern> 标签只有在 InsideCategory 状态才是合法的
            if self._state != self._STATE_InsideCategory:
                raise AimlParserError("Unexpected <pattern> tag " +
                                      self._location())
            self._state = self._STATE_InsidePattern
        elif name == "that" and self._state == self._STATE_AfterPattern:
            # <that> 只有在 <template> 元素内部, 或者在 <category> 元素内部而且在 <pattern> 与<template> 元素之间
            # 才是合法的。  本条款处理后一种情况。
            self._state = self._STATE_InsideThat
        elif name == "template":
            # <template> 标签只有在 AfterPattern 和 AfterThat 状态才是合法的
            if self._state not in [
                    self._STATE_AfterPattern, self._STATE_AfterThat
            ]:
                raise AimlParserError("Unexpected <template> tag " +
                                      self._location())
            # 如果不指定 <that> 元素, 它被隐式设置为 *
            if self._state == self._STATE_AfterPattern:
                self._currentThat = u"*"
            self._state = self._STATE_InsideTemplate
            self._elemStack.append(['template', {}])
            self._pushWhitespaceBehavior(attr)
        elif self._state == self._STATE_InsidePattern:
            # 特定的一些标签在<pattern> 元素中是允许的。
            if name == "bot" and "name" in attr and attr["name"] == u"name":
                # 插入一个 PatternMgr 将会用 bot的名字替换掉的 特定的字符串 。
                self._currentPattern += u" BOT_NAME "
            else:
                raise AimlParserError(("Unexpected <%s> tag " % name) +
                                      self._location())
        elif self._state == self._STATE_InsideThat:
            # 特定的一些标签在<that>元素中是允许的。
            if name == "bot" and "name" in attr and attr["name"] == u"name":
                # 插入一个 PatternMgr 将会用 bot的名字替换掉的 特定的字符串 。
                self._currentThat += u" BOT_NAME "
            else:
                raise AimlParserError(("Unexpected <%s> tag " % name) +
                                      self._location())
        elif self._state == self._STATE_InsideTemplate and name in self._validInfo:
            # 在当前模式中开始一个新元素。 首先,需要将'attr'转换成一个本地的Python字典,以便将来可以编组。 marshaled. marshaled.
            it = ((unicode(k), unicode(v)) for k, v in attr.items())
            attrDict = dict(it)
            self._validateElemStart(name, attrDict, self._version)
            # 将当前元素推入元素堆栈。
            self._elemStack.append([unicode(name), attrDict])
            self._pushWhitespaceBehavior(attr)
            # 如果这是一个条件元素,则将新入口推送到foundDefaultLiStack
            if name == "condition":
                self._foundDefaultLiStack.append(False)
        else:
            #  现在我们处于一个未知元素的内部。
            if self._forwardCompatibleMode:
                # In Forward Compatibility Mode, 在向前兼容模式下,我们忽略元素及其内容。
                self._currentUnknown = name
            else:
                #  否则,不明的元素应该判断为错误!
                raise AimlParserError(("Unexpected <%s> tag " % name) +
                                      self._location())

    def characters(self, ch):
        # 包装在_characters()中捕获错误的_characters周围,并继续前进。
        if self._state == self._STATE_OutsideAiml:
            #  如果在AIML元素之外,则忽略所有文本
            return
        if self._currentUnknown != "":
            # 如果在一个未知元素内部,则忽略所有文本
            return
        if self._skipCurrentCategory:
            # 如果我们跳过目前的类别,则忽略所有文本
            return
        try:
            self._characters(ch)
        except AimlParserError as msg:
            # 打印消息
            sys.stderr.write("PARSE ERROR: %s\n" % msg)
            self._numParseErrors += 1  # 错误计数加1
            # 当发生解析错误时,如果在一个category类别内,跳过它。
            if self._state >= self._STATE_InsideCategory:
                self._skipCurrentCategory = True

    def _characters(self, ch):
        """Add character data to the pattern, the <that>, or the template tree.

        Inside a template the text is first checked against the innermost
        open element, then merged into that element's trailing text node or
        appended as a new one.

        Raises:
            AimlParserError: if the innermost template element may not
                contain this text, or the element stack is empty.
        """
        text = unicode(ch)
        state = self._state
        if state == self._STATE_InsidePattern:
            # TODO: text inside patterns must be upper-case!
            self._currentPattern += text
            return
        if state == self._STATE_InsideThat:
            self._currentThat += text
            return
        if state != self._STATE_InsideTemplate:
            # all other text is ignored
            return

        # First check whether the element on top of the stack may hold text.
        try:
            top = self._elemStack[-1]
            parent, parentAttr = top[0], top[1]
            _required, _optional, canBeParent = self._validInfo[parent]
            # <random> and the non-block <condition> variant (one lacking
            # either "name" or "value") may only contain <li> children.
            liOnly = parent == "random" or (
                parent == "condition"
                and not ("name" in parentAttr and "value" in parentAttr))
            if not canBeParent:
                raise AimlParserError(
                    ("Unexpected text inside <%s> element " % parent) +
                    self._location())
            if liOnly:
                if text.strip():
                    # Non-whitespace text here is a syntax error.
                    raise AimlParserError(
                        ("Unexpected text inside <%s> element " % parent) +
                        self._location())
                # Whitespace around the <li> children is ignored.
                return
        except IndexError:
            # The element stack is empty.  This should never happen.
            raise AimlParserError(
                "Element stack is empty while validating text " +
                self._location())

        # Extend the trailing text node of the top element if it has one;
        # otherwise append a fresh text node.
        try:
            endsWithText = top[-1][0] == "text"
        except (IndexError, KeyError):
            # The last item is not a child node (e.g. it is the attribute dict).
            endsWithText = False
        if endsWithText:
            top[-1][2] += text
        else:
            top.append([
                "text", {
                    "xml:space": self._whitespaceBehaviorStack[-1]
                }, text
            ])

    def endElementNS(self, name, qname):
        uri, elem = name
        self.endElement(elem)

    def endElement(self, name):
        """包装在_characters()中捕获错误的_endElement周围,并继续前进。       """
        if self._state == self._STATE_OutsideAiml:
            # 如果在AIML元素之外,则忽略所有文本
            return
        if self._currentUnknown != "":
            # 看看我们是否处在一个未知的元素的末尾。如果是,我们就可以停止忽视一切。
            if name == self._currentUnknown:
                self._currentUnknown = ""
            return
        if self._skipCurrentCategory:
            # 如果我们跳过当前类别,看看它是否结束。我们停在任何</ category>标签上,因为我们没有在忽略模式下跟踪状态。
            if name == "category":
                self._skipCurrentCategory = False
                self._state = self._STATE_InsideAiml
            return
        try:
            self._endElement(name)
        except AimlParserError as msg:
            # 打印错误消息
            sys.stderr.write("PARSE ERROR: %s\n" % msg)
            self._numParseErrors += 1  # increment error count
            # 当发生解析错误时,如果在一个category内,跳过它。
            if self._state >= self._STATE_InsideCategory:
                self._skipCurrentCategory = True

    def _endElement(self, name):
        """验证AIML结束元素在当前上下文中是否有效。 如果遇到非法的结束元素,则引发AimlParserError。        """
        if name == "aiml":
            # </aiml> 标签只有在 InsideAiml 状态才是合法的
            if self._state != self._STATE_InsideAiml:
                raise AimlParserError("Unexpected </aiml> tag " +
                                      self._location())
            self._state = self._STATE_OutsideAiml
            self._whitespaceBehaviorStack.pop()
        elif name == "topic":
            # </topic> 标签只有在InsideAiml 状态, 而且 _insideTopic 为 true才是合法的。
            if self._state != self._STATE_InsideAiml or not self._insideTopic:
                raise AimlParserError("Unexpected </topic> tag " +
                                      self._location())
            self._insideTopic = False
            self._currentTopic = u""
        elif name == "category":
            # </category> 标签只有在 AfterTemplate  状态才是合法的
            if self._state != self._STATE_AfterTemplate:
                raise AimlParserError("Unexpected </category> tag " +
                                      self._location())
            self._state = self._STATE_InsideAiml
            # 结束当前类别。 将当前 pattern/ that / topic和元素存储在类别字典中。
            #【注意:这里修改了当前模式,用中文分割结果做了替换。。】
            self._currentPattern = u' '.join(splitChinese(
                self._currentPattern))
            key = (self._currentPattern.strip(), self._currentThat.strip(),
                   self._currentTopic.strip())
            self.categories[key] = self._elemStack[-1]
            self._whitespaceBehaviorStack.pop()
        elif name == "pattern":
            # </pattern> 标签只有在 InsidePattern 状态才是合法的。
            if self._state != self._STATE_InsidePattern:
                raise AimlParserError("Unexpected </pattern> tag " +
                                      self._location())
            self._state = self._STATE_AfterPattern
        elif name == "that" and self._state == self._STATE_InsideThat:
            #  </ that>标签只允许在<template>元素内部,或InsideThat状态下。本条款处理后一种情况。
            self._state = self._STATE_AfterThat
        elif name == "template":
            # </template> 标签只允许在 InsideTemplate 状态出现。
            if self._state != self._STATE_InsideTemplate:
                raise AimlParserError("Unexpected </template> tag " +
                                      self._location())
            self._state = self._STATE_AfterTemplate
            self._whitespaceBehaviorStack.pop()
        elif self._state == self._STATE_InsidePattern:
            # 特定的标签允许在 <pattern> 元素内部出现。
            if name not in ["bot"]:
                raise AimlParserError(("Unexpected </%s> tag " % name) +
                                      self._location())
        elif self._state == self._STATE_InsideThat:
            # 特定的标签允许在 <that> 元素内部出现.
            if name not in ["bot"]:
                raise AimlParserError(("Unexpected </%s> tag " % name) +
                                      self._location())
        elif self._state == self._STATE_InsideTemplate:
            # 当前模板内的元素结束。 将堆栈顶部的元素追加到下面的元素上。
            elem = self._elemStack.pop()
            self._elemStack[-1].append(elem)
            self._whitespaceBehaviorStack.pop()
            #  如果元素是一个条件,那么也可以从foundDefaultLiStack中弹出一个条目。
            if elem[0] == "condition": self._foundDefaultLiStack.pop()
        else:
            # 意外的关闭标签
            raise AimlParserError(("Unexpected </%s> tag " % name) +
                                  self._location())

    # Validation information for each AIML element, keyed by element name.
    # Each value is a 3-tuple: a list of REQUIRED attribute names, a list of
    # OPTIONAL attribute names, and a bool telling whether the element may
    # contain other elements and/or text (if False, the element can only
    # appear in an atomic context, such as <date/>).
    _validationInfo101 = {
        "bot": (["name"], [], False),
        "condition": ([], ["name",
                           "value"], True),  # can only contain <li> elements
        "date": ([], [], False),
        "formal": ([], [], True),
        "gender": ([], [], True),
        "get": (["name"], [], False),
        "gossip": ([], [], True),
        "id": ([], [], False),
        "input": ([], ["index"], False),
        "javascript": ([], [], True),
        "learn": ([], [], True),
        "li": ([], ["name", "value"], True),
        "lowercase": ([], [], True),
        "person": ([], [], True),
        "person2": ([], [], True),
        "random": ([], [], True),  # can only contain <li> elements
        "sentence": ([], [], True),
        "set": (["name"], [], True),
        "size": ([], [], False),
        "sr": ([], [], False),
        "srai": ([], [], True),
        "star": ([], ["index"], False),
        "system": ([], [], True),
        "template":
        ([], [], True),  # needs to be in the list because it can be a parent.
        "that": ([], ["index"], False),
        "thatstar": ([], ["index"], False),
        "think": ([], [], True),
        "topicstar": ([], ["index"], False),
        "uppercase": ([], [], True),
        "version": ([], [], False),
    }

    def _validateElemStart(self, name, attr, version):
        """测试在<template>元素内开始元素的有效性。如果标签无效,此函数将引发AimlParserError异常。否则,没有消息是好消息。"""

        #检查元素的属性。 确保所有必需的属性都存在,并且其余的属性都是有效的选项。
        required, optional, canBeParent = self._validInfo[name]
        for a in required:
            if a not in attr and not self._forwardCompatibleMode:
                raise AimlParserError(
                    ("Required \"%s\" attribute missing in <%s> element " %
                     (a, name)) + self._location())
        for a in attr:
            if a in required: continue
            if a[0:4] == "xml:":
                continue  # attributes in the "xml" namespace can appear anywhere
            if a not in optional and not self._forwardCompatibleMode:
                raise AimlParserError(
                    ("Unexpected \"%s\" attribute in <%s> element " %
                     (a, name)) + self._location())

        # 特殊情况: several tags 包含一个可选的"index" 属性。 这个 attribute 的值 必须是一个正整数。
        if name in ["star", "thatstar", "topicstar"]:
            for k, v in attr.items():
                if k == "index":
                    temp = 0
                    try:
                        temp = int(v)
                    except:
                        raise AimlParserError((
                            "Bad type for \"%s\" attribute (expected integer, found \"%s\") "
                            % (k, v)) + self._location())
                    if temp < 1:
                        raise AimlParserError(
                            ("\"%s\" attribute must have non-negative value " %
                             (k)) + self._location())

        # 查看包含的元素是否被允许包含子元素。 如果不是,不管它是什么,这个元素都是无效的。
        try:
            parent = self._elemStack[-1][0]
            parentAttr = self._elemStack[-1][1]
        except IndexError:
            #  如果堆栈为空,则不存在父代。 这绝不应该发生。
            raise AimlParserError(
                ("Element stack is empty while validating <%s> " % name) +
                self._location())
        required, optional, canBeParent = self._validInfo[parent]
        nonBlockStyleCondition = (
            parent == "condition"
            and not ("name" in parentAttr and "value" in parentAttr))
        if not canBeParent:
            raise AimlParserError(("<%s> elements cannot have any contents " %
                                   parent) + self._location())
        # 特殊情况测试 :如果父元素是<condition>(非块式变体)或<random>,则这些元素只能包含<li>子元素。
        elif (parent == "random" or nonBlockStyleCondition) and name != "li":
            raise AimlParserError(
                ("<%s> elements can only contain <li> subelements " % parent) +
                self._location())
        # <li>元素的特殊情况测试,只能由非块式<condition>和<random>元素包含,其必需属性取决于<condition>父级中存在的属性。
        elif name == "li":
            if not (parent == "random" or nonBlockStyleCondition):
                raise AimlParserError(
                    ("Unexpected <li> element contained by <%s> element " %
                     parent) + self._location())
            if nonBlockStyleCondition:
                if "name" in parentAttr:
                    # 单谓词条件。 除了最后一个,每个<li>元素都必须有一个“value”属性。
                    if len(attr) == 0:
                        # 这可能是这个<condition>的默认<li>元素,除非我们已经找到一个。
                        if self._foundDefaultLiStack[-1]:
                            raise AimlParserError(
                                "Unexpected default <li> element inside <condition> "
                                + self._location())
                        else:
                            self._foundDefaultLiStack[-1] = True
                    elif len(attr) == 1 and "value" in attr:
                        pass  # 这是valid case
                    else:
                        raise AimlParserError(
                            "Invalid <li> inside single-predicate <condition> "
                            + self._location())
                elif len(parentAttr) == 0:
                    # 多谓词条件。 除了最后一个,每个<li>元素都必须有一个“value”属性。
                    if len(attr) == 0:
                        # 这可能是这个<condition>的默认<li>元素,除非我们已经找到一个。
                        if self._foundDefaultLiStack[-1]:
                            raise AimlParserError(
                                "Unexpected default <li> element inside <condition> "
                                + self._location())
                        else:
                            self._foundDefaultLiStack[-1] = True
                    elif len(attr) == 2 and "value" in attr and "name" in attr:
                        pass  # 这是 valid case
                    else:
                        raise AimlParserError(
                            "Invalid <li> inside multi-predicate <condition> "
                            + self._location())
        # All is well!
        return True
Exemplo n.º 19
0
class SemFeaturesParserContentHandler(ContentHandler):
    """SAX handler that collects a semantic-feature -> synset-id mapping.

    Each <semfeature name="..."> element selects the current feature, and
    each <synset id="..."> element seen afterwards adds its id to that
    feature's list.
    """

    def __init__(self, wn):
        """
        Constructor.
            :param wn: an existing WNQuery object, that will be used for querying.
            @exception SemFeaturesException on file parsing errors
        """

        ContentHandler.__init__(self)
        self._locator = Locator()  # Dummy setDocumentLocator does the same!
        self.setDocumentLocator(self._locator)
        self.m_lcnt = 0                     # input line number
        self.m_ppath = []                   # contains the XML path to the current node (names of the ancestors)
        self.m_currfeat = ''                # feature currently being processed
        self.m_wn = wn                      # WordNet (WNQuery)
        self.m_featmap = defaultdict(list)  # semantic features to synset ids

    def startElement(self, name, attrs):
        """Track the element path and record feature / synset attributes."""
        if DEBUG:
            print('(', self._locator.getLineNumber(), ', ', self._locator.getColumnNumber(), '): /',
                  '/'.join(self.m_ppath), '/START: ', name, sep='')
        # The path is maintained regardless of DEBUG: endElement() pops
        # unconditionally, so pushing only in debug mode made the first end
        # tag raise IndexError on an empty list.
        self.m_ppath.append(name)

        if name == 'semfeature' and 'name' in attrs:
            # save current attribute
            self.m_currfeat = attrs['name']
        elif name == 'synset' and 'id' in attrs:
            # save current attribute + synset pair
            self.m_featmap[self.m_currfeat].append(attrs['id'])

    def characters(self, chrs):
        """Character data is only echoed when DEBUG is set."""
        if DEBUG:
            print('(', self._locator.getLineNumber(), ', ', self._locator.getColumnNumber(), '): /',
                  '/'.join(self.m_ppath), '#PCDATA: ', chrs, sep='')

    def endElement(self, name):
        """Leave the current element: drop it from the tracked path."""
        if DEBUG:
            print('(', self._locator.getLineNumber(), ', ', self._locator.getColumnNumber(), '): /',
                  '/'.join(self.m_ppath), '/END: ', name, sep='')

        self.m_ppath.pop()

    def look_up_feature(self, feature):
        """
        Get synset ids mapped to a semantic feature.
        :param feature: name of semantic feature to look up
        :return: res result: synset ids pertaining to feature, or empty if feature was not found
        """
        # .get() neither inserts a key into the defaultdict nor raises
        # KeyError once read_xml() has set default_factory to None.
        return set(self.m_featmap.get(feature, ()))

    def is_literal_compatible_with_feature(self, literal, pos, feature):
        """
        Check whether a literal with given POS is compatible with the given semantic feature.
        Check if any sense of literal in WN is a (distant) hyponym of any of the synset ids corresponding
         to the semantic feature.
        :param literal: the literal to check
        :param pos: part-of-speech of literal (allowed values: n, v, a, b)
        :param feature: feature semantic feature to check
        :return: whatever ``m_wn.is_literal_connected_with(literal, pos, 'hypernym', ids)`` returns when the
                 feature has synset ids (presumably the matching sense synset id and feature synset id -
                 verify against WNQuery); ``(None, None)`` when the feature is unknown or has no ids
        """
        feat_ids = self.look_up_feature(feature)
        if feat_ids:
            return self.m_wn.is_literal_connected_with(literal, pos, 'hypernym', feat_ids)
        return None, None

    @classmethod
    def read_xml(cls, wn, semfeaturesfilename, os):
        """
        Read mapping (semantic features to synset ids) from XML file.
        :param wn: An Initialised WNQuery instance
        :param semfeaturesfilename: name of XML file
        :param os: output stream that receives the "<n> pairs read" summary line
        :return: Initialised instance
        """
        content_handler = cls(wn)
        # open file
        try:
            with open(semfeaturesfilename, encoding='UTF-8') as fh:
                # Source: http://stackoverflow.com/a/12263340
                xml_reader = make_parser()
                xml_reader.setContentHandler(content_handler)
                xml_reader.setErrorHandler(SemFeaturesParserErrorHandler())
                # Do the actual parsing
                xml_reader.parse(fh)
        except OSError as e:  # IOError is an alias of OSError on Python 3
            raise SemFeaturesParserException(f'Could not open file: {semfeaturesfilename} because: {e}') from e

        # Close defaultdict for safety
        content_handler.m_featmap.default_factory = None
        pair_count = sum(len(ids) for ids in content_handler.m_featmap.values())
        print(pair_count, 'pairs read', file=os)
        # Return the populated handler; returning the class itself would
        # discard everything that was just parsed.
        return content_handler
Exemplo n.º 20
0
class AimlHandler(ContentHandler):
    """SAX content handler that validates an AIML document and collects its categories.

    The handler is a state machine driven by the ``_STATE_*`` constants.
    Every ``<category>`` that parses without error is stored in
    ``self.categories`` under the key ``(pattern, that, topic)`` (each
    stripped of surrounding whitespace); the value is the nested-list tree
    built for its ``<template>``: ``[name, attrs, child, child, ...]``.

    Parse errors are written to stderr and counted (see ``getNumErrors``).
    An error raised while inside a category causes the rest of that
    category to be skipped.
    """

    # The legal states of the AIML parser
    _STATE_OutsideAiml = 0
    _STATE_InsideAiml = 1
    _STATE_InsideCategory = 2
    _STATE_InsidePattern = 3
    _STATE_AfterPattern = 4
    _STATE_InsideThat = 5
    _STATE_AfterThat = 6
    _STATE_InsideTemplate = 7
    _STATE_AfterTemplate = 8

    def __init__(self, encoding="UTF-8"):
        """Create a handler with an empty category table.

        :param encoding: stored in ``self._encoding``; nothing in this class
            reads it back.
        """
        self.categories = {}
        self._encoding = encoding
        self._state = self._STATE_OutsideAiml
        self._version = ""
        self._namespace = ""
        self._forwardCompatibleMode = False
        self._currentPattern = ""
        self._currentThat = ""
        self._currentTopic = ""
        self._insideTopic = False
        # Name of the unknown element currently being ignored ("" if none).
        self._currentUnknown = ""
        # Set to True when a parse error occurs inside a category; everything
        # up to the matching </category> is then ignored.
        self._skipCurrentCategory = False
        # Number of parse errors seen so far; query with getNumErrors().
        self._numParseErrors = 0
        # The only validation table defined in this class.
        self._validInfo = self._validationInfo101
        # One bool per open <condition>: has an attribute-less "default" <li>
        # been seen yet?  A stack is needed for nested <condition> elements.
        self._foundDefaultLiStack = []
        # Whitespace behaviour ("default" or "preserve") in effect for each
        # open element that takes part in xml:space tracking.
        self._whitespaceBehaviorStack = ["default"]

        self._elemStack = []
        self._locator = Locator()
        self.setDocumentLocator(self._locator)

    def getNumErrors(self):
        """Return the number of parse errors counted so far."""
        return self._numParseErrors

    def setEncoding(self, encoding):
        """Replace the stored text encoding."""
        self._encoding = encoding

    def _location(self):
        """Return a string describing the current location in the source."""
        line = self._locator.getLineNumber()
        column = self._locator.getColumnNumber()
        return "(line %d, column %d)" % (line, column)

    def _pushWhitespaceBehavior(self, attr):
        """Push the whitespace behaviour for a newly opened element.

        Uses the element's ``xml:space`` attribute when present; otherwise
        the behaviour currently on top of the stack is pushed again.

        :raises AimlParserError: if ``xml:space`` is neither "default" nor
            "preserve".
        """
        assert len(
            self._whitespaceBehaviorStack
        ) > 0, "Pilha de comportamento de espaço em branco nunca deve estar vazia!"
        try:
            behavior = attr["xml:space"]
        except KeyError:
            # No xml:space attribute: inherit from the enclosing element.
            behavior = self._whitespaceBehaviorStack[-1]
        else:
            if behavior not in ("default", "preserve"):
                raise AimlParserError(
                    "Invalid value for xml:space attribute " +
                    self._location())
        self._whitespaceBehaviorStack.append(behavior)

    def startElementNS(self, name, qname, attr):
        """Namespace-mode start-tag callback; forwards the local name to startElement()."""
        _uri, elem = name
        self.startElement(elem, attr)

    def startElement(self, name, attr):
        """Wrapper around _startElement() which reports parse errors and keeps going."""
        # While inside an unknown element, ignore everything until it closes.
        if self._currentUnknown != "":
            return
        # While skipping a broken category, ignore everything until it ends.
        if self._skipCurrentCategory:
            return
        try:
            self._startElement(name, attr)
        except AimlParserError as msg:
            sys.stderr.write("PARSE ERROR: %s\n" % msg)
            self._numParseErrors += 1
            # An error inside a category invalidates the whole category.
            if self._state >= self._STATE_InsideCategory:
                self._skipCurrentCategory = True

    def _startElement(self, name, attr):
        """Advance the state machine for an opening tag.

        :raises AimlParserError: if the tag is not legal in the current state
            or fails attribute validation.
        """
        if name == "aiml":
            if self._state != self._STATE_OutsideAiml:
                raise AimlParserError("Unexpected <aiml> tag " +
                                      self._location())
            self._state = self._STATE_InsideAiml
            self._insideTopic = False
            self._currentTopic = ""
            try:
                self._version = attr["version"]
            except KeyError:
                # No version attribute: fall back to "1.0".
                self._version = "1.0"
            # Any version other than exactly "1.0.1" relaxes validation.
            self._forwardCompatibleMode = (self._version != "1.0.1")
            self._pushWhitespaceBehavior(attr)
        elif self._state == self._STATE_OutsideAiml:
            # Ignore every tag that appears outside <aiml>.
            return
        elif name == "topic":
            # <topic> must be a direct child of <aiml> and cannot be nested.
            if (self._state != self._STATE_InsideAiml) or self._insideTopic:
                raise AimlParserError("Unexpected <topic> tag " +
                                      self._location())
            try:
                self._currentTopic = str(attr['name'])
            except KeyError:
                raise AimlParserError(
                    "Required \"name\" attribute missing in <topic> element " +
                    self._location())
            self._insideTopic = True
        elif name == "category":
            if self._state != self._STATE_InsideAiml:
                raise AimlParserError("Unexpected <category> tag " +
                                      self._location())
            self._state = self._STATE_InsideCategory
            self._currentPattern = ""
            self._currentThat = ""
            # Outside any <topic>, the category matches every topic.
            if not self._insideTopic:
                self._currentTopic = "*"
            self._elemStack = []
            self._pushWhitespaceBehavior(attr)
        elif name == "pattern":
            if self._state != self._STATE_InsideCategory:
                raise AimlParserError("Unexpected <pattern> tag " +
                                      self._location())
            self._state = self._STATE_InsidePattern
        elif name == "that" and self._state == self._STATE_AfterPattern:
            # A <that> right after </pattern> is the category-level <that>;
            # any other <that> falls through to the branches below.
            self._state = self._STATE_InsideThat
        elif name == "template":
            if self._state not in [
                    self._STATE_AfterPattern, self._STATE_AfterThat
            ]:
                raise AimlParserError("Unexpected <template> tag " +
                                      self._location())
            # No <that> was given: the category matches any <that>.
            if self._state == self._STATE_AfterPattern:
                self._currentThat = "*"
            self._state = self._STATE_InsideTemplate
            self._elemStack.append(['template', {}])
            self._pushWhitespaceBehavior(attr)
        elif self._state == self._STATE_InsidePattern:
            # The only tag allowed inside <pattern> is <bot name="name"/>.
            if name == "bot" and "name" in attr and attr["name"] == "name":
                self._currentPattern += " BOT_NAME "
            else:
                raise AimlParserError(("Unexpected <%s> tag " % name) +
                                      self._location())
        elif self._state == self._STATE_InsideThat:
            # Same restriction as for <pattern>.
            if name == "bot" and "name" in attr and attr["name"] == "name":
                self._currentThat += " BOT_NAME "
            else:
                raise AimlParserError(("Unexpected <%s> tag " % name) +
                                      self._location())
        elif self._state == self._STATE_InsideTemplate and name in self._validInfo:
            # Known template element: validate it, then push it on the
            # element stack with a plain-dict copy of its attributes.
            attrDict = {str(k): str(v) for k, v in attr.items()}
            self._validateElemStart(name, attrDict, self._version)
            self._elemStack.append([name, attrDict])
            self._pushWhitespaceBehavior(attr)
            if name == "condition":
                self._foundDefaultLiStack.append(False)
        else:
            # Unknown element: skipped in forward-compatible mode, otherwise
            # reported as an error.
            if self._forwardCompatibleMode:
                self._currentUnknown = name
            else:
                raise AimlParserError(("Unexpected <%s> tag " % name) +
                                      self._location())

    def characters(self, ch):
        """Wrapper around _characters() which reports parse errors and keeps going."""
        if self._state == self._STATE_OutsideAiml:
            return
        if self._currentUnknown != "":
            return
        if self._skipCurrentCategory:
            return
        try:
            self._characters(ch)
        except AimlParserError as msg:
            sys.stderr.write("PARSE ERROR: %s\n" % msg)
            self._numParseErrors += 1
            if self._state >= self._STATE_InsideCategory:
                self._skipCurrentCategory = True

    def _characters(self, ch):
        """Accumulate character data into the pattern, the that, or the template tree.

        :raises AimlParserError: if text appears inside a template element
            that may not contain text.
        """
        text = str(ch)
        if self._state == self._STATE_InsidePattern:
            self._currentPattern += text
        elif self._state == self._STATE_InsideThat:
            self._currentThat += text
        elif self._state == self._STATE_InsideTemplate:
            try:
                parent = self._elemStack[-1][0]
                parentAttr = self._elemStack[-1][1]
                required, optional, canBeParent = self._validInfo[parent]
                # A <condition> without both "name" and "value" may only
                # contain <li> children.
                nonBlockStyleCondition = (
                    parent == "condition"
                    and not ("name" in parentAttr and "value" in parentAttr))
                if not canBeParent:
                    raise AimlParserError(
                        ("Unexpected text inside <%s> element " % parent) +
                        self._location())
                elif parent == "random" or nonBlockStyleCondition:
                    # Whitespace between <li> elements is dropped; any other
                    # text is an error.
                    if len(text.strip()) == 0:
                        return
                    else:
                        raise AimlParserError(
                            ("Unexpected text inside <%s> element " % parent) +
                            self._location())
            except IndexError:
                raise AimlParserError(
                    "Element stack is empty while validating text " +
                    self._location())
            # Merge with a preceding text child if there is one.  When the
            # element has no children yet, [-1] is its attribute dict and the
            # [0] lookup raises KeyError.
            try:
                textElemOnStack = (self._elemStack[-1][-1][0] == "text")
            except IndexError:
                textElemOnStack = False
            except KeyError:
                textElemOnStack = False
            if textElemOnStack:
                self._elemStack[-1][-1][2] += text
            else:
                self._elemStack[-1].append([
                    "text", {
                        "xml:space": self._whitespaceBehaviorStack[-1]
                    }, text
                ])
        else:
            # Text in any other state is ignored.
            pass

    def endElementNS(self, name, qname):
        """Namespace-mode end-tag callback; forwards the local name to endElement()."""
        _uri, elem = name
        self.endElement(elem)

    def endElement(self, name):
        """Wrapper around _endElement() which reports parse errors and keeps going."""
        if self._state == self._STATE_OutsideAiml:
            return
        # Inside an unknown element: only its own end tag ends the skipping.
        if self._currentUnknown != "":
            if name == self._currentUnknown:
                self._currentUnknown = ""
            return
        # Skipping a broken category: </category> ends the skipping.
        # NOTE(review): whitespace-stack entries pushed by the skipped
        # category are not popped on this path.
        if self._skipCurrentCategory:
            if name == "category":
                self._skipCurrentCategory = False
                self._state = self._STATE_InsideAiml
            return
        try:
            self._endElement(name)
        except AimlParserError as msg:
            sys.stderr.write("PARSE ERROR: %s\n" % msg)
            self._numParseErrors += 1
            if self._state >= self._STATE_InsideCategory:
                self._skipCurrentCategory = True

    def _endElement(self, name):
        """Advance the state machine for a closing tag.

        :raises AimlParserError: if the closing tag is not legal in the
            current state.
        """
        if name == "aiml":
            if self._state != self._STATE_InsideAiml:
                raise AimlParserError("Unexpected </aiml> tag " +
                                      self._location())
            self._state = self._STATE_OutsideAiml
            self._whitespaceBehaviorStack.pop()
        elif name == "topic":
            if self._state != self._STATE_InsideAiml or not self._insideTopic:
                raise AimlParserError("Unexpected </topic> tag " +
                                      self._location())
            self._insideTopic = False
            self._currentTopic = ""
        elif name == "category":
            if self._state != self._STATE_AfterTemplate:
                raise AimlParserError("Unexpected </category> tag " +
                                      self._location())
            self._state = self._STATE_InsideAiml
            # Store the finished template tree under (pattern, that, topic).
            key = (self._currentPattern.strip(), self._currentThat.strip(),
                   self._currentTopic.strip())
            self.categories[key] = self._elemStack[-1]
            self._whitespaceBehaviorStack.pop()
        elif name == "pattern":
            if self._state != self._STATE_InsidePattern:
                raise AimlParserError("Unexpected </pattern> tag " +
                                      self._location())
            self._state = self._STATE_AfterPattern
        elif name == "that" and self._state == self._STATE_InsideThat:
            self._state = self._STATE_AfterThat
        elif name == "template":
            if self._state != self._STATE_InsideTemplate:
                raise AimlParserError("Unexpected </template> tag " +
                                      self._location())
            self._state = self._STATE_AfterTemplate
            self._whitespaceBehaviorStack.pop()
        elif self._state == self._STATE_InsidePattern:
            if name not in ["bot"]:
                raise AimlParserError(("Unexpected </%s> tag " % name) +
                                      self._location())
        elif self._state == self._STATE_InsideThat:
            if name not in ["bot"]:
                raise AimlParserError(("Unexpected </%s> tag " % name) +
                                      self._location())
        elif self._state == self._STATE_InsideTemplate:
            # A template child element is finished: attach it to its parent.
            elem = self._elemStack.pop()
            self._elemStack[-1].append(elem)
            self._whitespaceBehaviorStack.pop()
            if elem[0] == "condition":
                self._foundDefaultLiStack.pop()
        else:
            raise AimlParserError(("Unexpected </%s> tag " % name) +
                                  self._location())

    # Validation table: element name -> (required attributes,
    # optional attributes, whether the element may have contents).
    _validationInfo101 = {
        "bot": (["name"], [], False),
        "condition": ([], ["name", "value"], True),
        "date": ([], [], False),
        "formal": ([], [], True),
        "gender": ([], [], True),
        "get": (["name"], [], False),
        "gossip": ([], [], True),
        "id": ([], [], False),
        "input": ([], ["index"], False),
        "javascript": ([], [], True),
        "learn": ([], [], True),
        "li": ([], ["name", "value"], True),
        "lowercase": ([], [], True),
        "person": ([], [], True),
        "person2": ([], [], True),
        "random": ([], [], True),
        "sentence": ([], [], True),
        "set": (["name"], [], True),
        "size": ([], [], False),
        "sr": ([], [], False),
        "srai": ([], [], True),
        "star": ([], ["index"], False),
        "system": ([], [], True),
        "template": ([], [], True),
        "that": ([], ["index"], False),
        "thatstar": ([], ["index"], False),
        "think": ([], [], True),
        "topicstar": ([], ["index"], False),
        "uppercase": ([], [], True),
        "version": ([], [], False),
    }

    def _validateElemStart(self, name, attr, version):
        """Check that a template element is legal where it appears.

        :param name: element name; must be a key of ``self._validInfo``
        :param attr: plain dict of the element's attributes
        :param version: AIML version string (not consulted by the checks below)
        :return: True when every check passes
        :raises AimlParserError: on a missing or unexpected attribute, a bad
            ``index`` value, or an element that its parent may not contain
        """
        required, optional, canBeParent = self._validInfo[name]
        # Attribute checks are waived in forward-compatible mode.
        for a in required:
            if a not in attr and not self._forwardCompatibleMode:
                raise AimlParserError(
                    ("Required \"%s\" attribute missing in <%s> element " %
                     (a, name)) + self._location())
        for a in attr:
            if a in required:
                continue
            if a[0:4] == "xml:":
                continue  # attributes in the xml: namespace are always allowed
            if a not in optional and not self._forwardCompatibleMode:
                raise AimlParserError(
                    ("Unexpected \"%s\" attribute in <%s> element " %
                     (a, name)) + self._location())

        # The "index" of the star-like elements must be an integer >= 1.
        if name in ["star", "thatstar", "topicstar"] and "index" in attr:
            v = attr["index"]
            try:
                temp = int(v)
            except (TypeError, ValueError):
                raise AimlParserError((
                    "Bad type for \"%s\" attribute (expected integer, found \"%s\") "
                    % ("index", v)) + self._location())
            if temp < 1:
                raise AimlParserError(
                    ("\"%s\" attribute must have positive value " %
                     ("index")) + self._location())

        # Check the element against its parent (the top of the element stack).
        try:
            parent = self._elemStack[-1][0]
            parentAttr = self._elemStack[-1][1]
        except IndexError:
            raise AimlParserError(
                ("Element stack is empty while validating <%s> " % name) +
                self._location())
        required, optional, canBeParent = self._validInfo[parent]
        # A <condition> without both "name" and "value" may only hold <li>s.
        nonBlockStyleCondition = (
            parent == "condition"
            and not ("name" in parentAttr and "value" in parentAttr))
        if not canBeParent:
            raise AimlParserError(("<%s> elements cannot have any contents " %
                                   parent) + self._location())
        elif (parent == "random" or nonBlockStyleCondition) and name != "li":
            raise AimlParserError(
                ("<%s> elements can only contain <li> subelements " % parent) +
                self._location())
        elif name == "li":
            if not (parent == "random" or nonBlockStyleCondition):
                raise AimlParserError(
                    ("Unexpected <li> element contained by <%s> element " %
                     parent) + self._location())
            if nonBlockStyleCondition:
                if "name" in parentAttr:
                    # Single-predicate condition: <li> is either the single
                    # default (no attributes) or carries only "value".
                    if len(attr) == 0:
                        if self._foundDefaultLiStack[-1]:
                            raise AimlParserError(
                                "Unexpected default <li> element inside <condition> "
                                + self._location())
                        else:
                            self._foundDefaultLiStack[-1] = True
                    elif len(attr) == 1 and "value" in attr:
                        pass
                    else:
                        raise AimlParserError(
                            "Invalid <li> inside single-predicate <condition> "
                            + self._location())
                elif len(parentAttr) == 0:
                    # Multi-predicate condition: <li> is either the single
                    # default or carries exactly "name" and "value".
                    if len(attr) == 0:
                        if self._foundDefaultLiStack[-1]:
                            raise AimlParserError(
                                "Unexpected default <li> element inside <condition> "
                                + self._location())
                        else:
                            self._foundDefaultLiStack[-1] = True
                    elif len(attr) == 2 and "value" in attr and "name" in attr:
                        pass
                    else:
                        raise AimlParserError(
                            "Invalid <li> inside multi-predicate <condition> "
                            + self._location())
        return True
Exemplo n.º 21
0
class AimlHandler(ContentHandler):
    """SAX content handler for AIML documents (state-machine based).

    NOTE(review): this excerpt references ``self._validationInfo101`` and
    ``self._startElement`` without defining them; they are presumably
    defined in the remainder of the class -- confirm.
    """

    # The legal states of the AIML parser
    _STATE_OutsideAiml = 0
    _STATE_InsideAiml = 1
    _STATE_InsideCategory = 2
    _STATE_InsidePattern = 3
    _STATE_AfterPattern = 4
    _STATE_InsideThat = 5
    _STATE_AfterThat = 6
    _STATE_InsideTemplate = 7
    _STATE_AfterTemplate = 8

    def __init__(self, encoding="UTF-8"):
        """Create a handler with an empty category table.

        :param encoding: stored in ``self._encoding`` (see ``setEncoding``).
        """
        self.categories = {}
        self._encoding = encoding
        self._state = self._STATE_OutsideAiml
        self._version = ""
        self._namespace = ""
        self._forwardCompatibleMode = False
        self._currentPattern = ""
        self._currentThat = ""
        self._currentTopic = ""
        self._insideTopic = False
        self._currentUnknown = ""  # the name of the current unknown element

        # This is set to true when a parse error occurs in a category.
        self._skipCurrentCategory = False

        # Counts the number of parse errors in a particular AIML document.
        # query with getNumErrors().  If 0, the document is AIML-compliant.
        self._numParseErrors = 0

        # TODO: select the proper validInfo table based on the version number.
        self._validInfo = self._validationInfo101

        # This stack of bools is used when parsing <li> elements inside
        # <condition> elements, to keep track of whether or not an
        # attribute-less "default" <li> element has been found yet.  Only
        # one default <li> is allowed in each <condition> element.  We need
        # a stack in order to correctly handle nested <condition> tags.
        self._foundDefaultLiStack = []

        self._elemStack = []
        self._locator = Locator()
        self.setDocumentLocator(self._locator)

    def getNumErrors(self):
        """Return the number of errors found while parsing the current document."""
        return self._numParseErrors

    def setEncoding(self, encoding):
        """Set the text encoding to use when encoding strings read from XML.

        Defaults to 'UTF-8'.
        """
        self._encoding = encoding

    def _location(self):
        """Return a string describing the current location in the source file."""
        line = self._locator.getLineNumber()
        column = self._locator.getColumnNumber()
        return "(line %d, column %d)" % (line, column)

    def startElementNS(self, name, qname, attr):
        """Namespace-mode start-tag callback; forwards the local name to startElement()."""
        _uri, elem = name
        self.startElement(elem, attr)

    def startElement(self, name, attr):
        """Wrapper around _startElement() which catches parse errors and keeps going."""
        # If we're inside an unknown element, ignore everything until we're
        # out again.
        if self._currentUnknown != "":
            return
        # If we're skipping the current category, ignore everything until
        # it's finished.
        if self._skipCurrentCategory:
            return

        # process this start-element.
        try:
            self._startElement(name, attr)
        except AimlParserError as msg:
            # Print the error message
            sys.stderr.write("PARSE ERROR: %s\n" % msg)

            self._numParseErrors += 1  # increment error count
            # In case of a parse error, if we're inside a category, skip it.
            if self._state >= self._STATE_InsideCategory:
                self._skipCurrentCategory = True
Exemplo n.º 22
0
class AimlHandler(ContentHandler):
    """SAX content handler for AIML documents (state-machine based).

    NOTE(review): the portion of the class visible here references
    ``self._validationInfo101`` and ``self._startElement`` without defining
    them; they are presumably defined elsewhere in the class -- confirm.
    """

    # The legal states of the AIML parser
    _STATE_OutsideAiml = 0
    _STATE_InsideAiml = 1
    _STATE_InsideCategory = 2
    _STATE_InsidePattern = 3
    _STATE_AfterPattern = 4
    _STATE_InsideThat = 5
    _STATE_AfterThat = 6
    _STATE_InsideTemplate = 7
    _STATE_AfterTemplate = 8

    def __init__(self, encoding="UTF-8"):
        """Create a handler with an empty category table.

        :param encoding: stored in ``self._encoding`` (see ``setEncoding``).
        """
        self.categories = {}
        self._encoding = encoding
        self._state = self._STATE_OutsideAiml
        self._version = ""
        self._namespace = ""
        self._forwardCompatibleMode = False
        self._currentPattern = ""
        self._currentThat = ""
        self._currentTopic = ""
        self._insideTopic = False
        self._currentUnknown = ""  # the name of the current unknown element

        # This is set to true when a parse error occurs in a category.
        self._skipCurrentCategory = False

        # Counts the number of parse errors in a particular AIML document.
        # query with getNumErrors().  If 0, the document is AIML-compliant.
        self._numParseErrors = 0

        # TODO: select the proper validInfo table based on the version number.
        self._validInfo = self._validationInfo101

        # This stack of bools is used when parsing <li> elements inside
        # <condition> elements, to keep track of whether or not an
        # attribute-less "default" <li> element has been found yet.  Only
        # one default <li> is allowed in each <condition> element.  We need
        # a stack in order to correctly handle nested <condition> tags.
        self._foundDefaultLiStack = []

        self._elemStack = []
        self._locator = Locator()
        self.setDocumentLocator(self._locator)

    def getNumErrors(self):
        """Return the number of errors found while parsing the current document."""
        return self._numParseErrors

    def setEncoding(self, encoding):
        """Set the text encoding to use when encoding strings read from XML.

        Defaults to 'UTF-8'.
        """
        self._encoding = encoding

    def _location(self):
        """Return a string describing the current location in the source file."""
        line = self._locator.getLineNumber()
        column = self._locator.getColumnNumber()
        return "(line %d, column %d)" % (line, column)

    def startElementNS(self, name, qname, attr):
        """Namespace-mode start-tag callback; forwards the local name to startElement()."""
        _uri, elem = name
        self.startElement(elem, attr)

    def startElement(self, name, attr):
        """Wrapper around _startElement() which catches parse errors and keeps going."""
        # If we're inside an unknown element, ignore everything until we're
        # out again.
        if self._currentUnknown != "":
            return
        # If we're skipping the current category, ignore everything until
        # it's finished.
        if self._skipCurrentCategory:
            return

        # process this start-element.
        try:
            self._startElement(name, attr)
        except AimlParserError as msg:
            # Print the error message
            sys.stderr.write("PARSE ERROR: %s\n" % msg)

            self._numParseErrors += 1  # increment error count
            # In case of a parse error, if we're inside a category, skip it.
            if self._state >= self._STATE_InsideCategory:
                self._skipCurrentCategory = True