class HTMLTokenizer(object):
    """This class takes care of tokenizing HTML.

    * self.currentToken
      Holds the token that is currently being processed.

    * self.state
      Holds a reference to the method implementing the current state.

    * self.states
      Holds a mapping between state names and the methods that implement them.

    * self.stream
      Points to HTMLInputStream object.
    """

    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
                 lowercaseElementName=True, lowercaseAttrName=True):
        self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet)

        # Perform case conversions?
        self.lowercaseElementName = lowercaseElementName
        self.lowercaseAttrName = lowercaseAttrName

        self.states = {
            "data": self.dataState,
            "entityData": self.entityDataState,
            "tagOpen": self.tagOpenState,
            "closeTagOpen": self.closeTagOpenState,
            "tagName": self.tagNameState,
            "beforeAttributeName": self.beforeAttributeNameState,
            "attributeName": self.attributeNameState,
            "afterAttributeName": self.afterAttributeNameState,
            "beforeAttributeValue": self.beforeAttributeValueState,
            "attributeValueDoubleQuoted": self.attributeValueDoubleQuotedState,
            "attributeValueSingleQuoted": self.attributeValueSingleQuotedState,
            "attributeValueUnQuoted": self.attributeValueUnQuotedState,
            "afterAttributeValue": self.afterAttributeValueState,
            "bogusComment": self.bogusCommentState,
            "markupDeclarationOpen": self.markupDeclarationOpenState,
            "commentStart": self.commentStartState,
            "commentStartDash": self.commentStartDashState,
            "comment": self.commentState,
            "commentEndDash": self.commentEndDashState,
            "commentEnd": self.commentEndState,
            "doctype": self.doctypeState,
            "beforeDoctypeName": self.beforeDoctypeNameState,
            "doctypeName": self.doctypeNameState,
            "afterDoctypeName": self.afterDoctypeNameState,
            "beforeDoctypePublicIdentifier": self.beforeDoctypePublicIdentifierState,
            "doctypePublicIdentifierDoubleQuoted": self.doctypePublicIdentifierDoubleQuotedState,
            "doctypePublicIdentifierSingleQuoted": self.doctypePublicIdentifierSingleQuotedState,
            "afterDoctypePublicIdentifier": self.afterDoctypePublicIdentifierState,
            "beforeDoctypeSystemIdentifier": self.beforeDoctypeSystemIdentifierState,
            "doctypeSystemIdentifierDoubleQuoted": self.doctypeSystemIdentifierDoubleQuotedState,
            "doctypeSystemIdentifierSingleQuoted": self.doctypeSystemIdentifierSingleQuotedState,
            "afterDoctypeSystemIdentifier": self.afterDoctypeSystemIdentifierState,
            "bogusDoctype": self.bogusDoctypeState
        }

        # Setup the initial tokenizer state
        self.contentModelFlag = contentModelFlags["PCDATA"]
        self.escapeFlag = False
        self.lastFourChars = []
        self.state = self.states["data"]

        # The current token being created
        self.currentToken = None

    def __iter__(self):
        """This is where the magic happens.

        We do our usual processing through the states and when we have a
        token to return we yield the token, which pauses processing until the
        next token is requested.
        """
        self.tokenQueue = deque([])
        # Start processing. When EOF is reached self.state will return False
        # instead of True and the loop will terminate.
        while self.state():
            while self.stream.errors:
                yield {"type": "ParseError", "data": self.stream.errors.pop(0)}
            while self.tokenQueue:
                yield self.tokenQueue.popleft()
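    # A minimal usage sketch (the enclosing parser normally drives this
    # class, but the iterator protocol above can be exercised directly):
    #
    #     tokenizer = HTMLTokenizer("<p class=foo>Hi</p>")
    #     for token in tokenizer:
    #         print token["type"]
    #
    # Every yielded token is a dict with at least a "type" key; tag tokens
    # additionally carry "name" and "data" (a list of [name, value]
    # attribute pairs).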
    # Below are the various helper functions the tokenizer states use.

    def processSolidusInTag(self):
        """If the next character is a '>', convert the currentToken into
        an EmptyTag.
        """
        rv = False
        # We need to consume another character to make sure it's a ">"
        data = self.stream.char()
        if self.currentToken["type"] == "StartTag" and data == u">":
            self.currentToken["type"] = "EmptyTag"
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError",
                                    "data": "EOF following solidus"})
            self.state = self.states["data"]
            self.emitCurrentToken()
            rv = True
        else:
            self.tokenQueue.append({"type": "ParseError",
                                    "data": "incorrectly-placed-solidus"})
        # The character we just consumed needs to be put back on the stack so
        # it doesn't get lost...
        self.stream.unget(data)
        return rv

    def consumeNumberEntity(self, isHex):
        """This function returns either U+FFFD or the character based on the
        decimal or hexadecimal representation. It also discards ";" if
        present. If it is not present, a "numeric-entity-without-semicolon"
        ParseError is appended to the token queue.
        """
        allowed = digits
        radix = 10
        if isHex:
            allowed = hexDigits
            radix = 16

        charStack = []

        # Consume all the characters that are in range while making sure we
        # don't hit an EOF.
        c = self.stream.char()
        while c in allowed and c is not EOF:
            charStack.append(c)
            c = self.stream.char()

        # Convert the set of characters consumed to an int.
        charAsInt = int("".join(charStack), radix)

        if charAsInt == 13:
            self.tokenQueue.append({"type": "ParseError",
                                    "data": "incorrect-cr-newline-entity"})
            charAsInt = 10
        elif 127 < charAsInt < 160:
            # If the integer is between 127 and 160 (i.e., 128 to 159
            # inclusive) we need to do the "windows trick".
            self.tokenQueue.append({"type": "ParseError",
                                    "data": "illegal-windows-1252-entity"})
            charAsInt = entitiesWindows1252[charAsInt - 128]

        # 0 is not a good number, neither are illegal Unicode code points
        # (higher than 0x10FFFF) or surrogate characters (in the range 0xD800
        # to 0xDFFF).
        if 0 < charAsInt <= 1114111 and not (55296 <= charAsInt <= 57343):
            try:
                # XXX We should have a separate function that does "int" to
                # "unicodestring" conversion since this doesn't always work
                # according to hsivonen. Also, unichr has a limitation of 65535
                char = unichr(charAsInt)
            except:
                try:
                    char = eval("u'\\U%08x'" % charAsInt)
                except:
                    self.tokenQueue.append({"type": "ParseError",
                                            "data": "cant-convert-numeric-entity",
                                            "datavars": {"charAsInt": charAsInt}})
                    # Fall back to U+FFFD rather than leaving `char` unbound.
                    char = u"\uFFFD"
        else:
            char = u"\uFFFD"
            self.tokenQueue.append({"type": "ParseError",
                                    "data": "illegal-codepoint-for-numeric-entity",
                                    "datavars": {"charAsInt": charAsInt}})

        # Discard the ; if present. Otherwise, put it back on the queue and
        # invoke parseError on parser.
        if c != u";":
            self.tokenQueue.append({"type": "ParseError",
                                    "data": "numeric-entity-without-semicolon"})
            self.stream.unget(c)

        return char

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        char = None
        charStack = [self.stream.char()]
        if charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&")\
           or (allowedChar is not None and allowedChar == charStack[0]):
            self.stream.unget(charStack)
        elif charStack[0] == u"#":
            # We might have a number entity here.
            charStack.extend([self.stream.char(), self.stream.char()])
            if EOF in charStack[:2]:
                # If we reach the end of the file put everything up to EOF
                # back in the queue
                charStack = charStack[:charStack.index(EOF)]
                self.stream.unget(charStack)
                self.tokenQueue.append({"type": "ParseError",
                                        "data": "expected-numeric-entity-but-got-eof"})
            else:
                if charStack[1].lower() == u"x" \
                   and charStack[2] in hexDigits:
                    # Hexadecimal entity detected.
                    self.stream.unget(charStack[2])
                    char = self.consumeNumberEntity(True)
                elif charStack[1] in digits:
                    # Decimal entity detected.
                    self.stream.unget(charStack[1:])
                    char = self.consumeNumberEntity(False)
                else:
                    # No number entity detected.
                    self.stream.unget(charStack)
                    self.tokenQueue.append({"type": "ParseError",
                                            "data": "expected-numeric-entity"})
        else:
            # At this point in the process we might have a named entity.
            # Entities are stored in the global variable "entities".
            #
            # Consume characters and compare these to a substring of the
            # entity names in the list until the substring no longer matches.
            filteredEntityList = entitiesByFirstChar.get(charStack[0], [])

            def entitiesStartingWith(name):
                return [e for e in filteredEntityList if e.startswith(name)]

            while charStack[-1] != EOF and\
                  entitiesStartingWith("".join(charStack)):
                charStack.append(self.stream.char())

            # At this point we have a string that starts with some characters
            # that may match an entity.
            entityName = None

            # Try to find the longest entity the string will match to take
            # care of &noti; for instance.
            for entityLength in xrange(len(charStack) - 1, 1, -1):
                possibleEntityName = "".join(charStack[:entityLength])
                if possibleEntityName in entities:
                    entityName = possibleEntityName
                    break

            if entityName is not None:
                if entityName[-1] != ";":
                    self.tokenQueue.append({"type": "ParseError",
                                            "data": "named-entity-without-semicolon"})
                if entityName[-1] != ";" and fromAttribute and\
                   (charStack[entityLength] in asciiLetters
                    or charStack[entityLength] in digits):
                    self.stream.unget(charStack)
                else:
                    char = entities[entityName]
                    self.stream.unget(charStack[entityLength:])
            else:
                self.tokenQueue.append({"type": "ParseError",
                                        "data": "expected-named-entity"})
                self.stream.unget(charStack)
        return char

    def processEntityInAttribute(self, allowedChar):
        """This method replaces the need for "entityInAttributeValueState".
        """
        entity = self.consumeEntity(allowedChar=allowedChar,
                                    fromAttribute=True)
        if entity:
            self.currentToken["data"][-1][1] += entity
        else:
            self.currentToken["data"][-1][1] += u"&"

    def emitCurrentToken(self):
        """This method is a generic handler for emitting the tags. It also
        sets the state to "data" because that's what's needed after a token
        has been emitted.
        """
        token = self.currentToken
        # Add token to the queue to be yielded
        if token["type"] in ("StartTag", "EndTag", "EmptyTag"):
            if self.lowercaseElementName:
                token["name"] = token["name"].translate(asciiUpper2Lower)
            if token["type"] == "EndTag" and token["data"]:
                self.tokenQueue.append({"type": "ParseError",
                                        "data": "attributes-in-end-tag"})
        self.tokenQueue.append(token)
        self.state = self.states["data"]

    # Below are the various tokenizer states, worked out.

    # XXX AT Perhaps we should have Hixie run some evaluation on billions of
    # documents to figure out what the order of the various if and elif
    # statements should be.
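    # Note on the protocol shared by every *State method below: each one
    # consumes at least one character from self.stream, may append tokens to
    # self.tokenQueue, and returns True so the __iter__ loop keeps running.
    # Only dataState returns False, and only at EOF.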
    def dataState(self):
        data = self.stream.char()

        # Keep a charbuffer to handle the escapeFlag
        if self.contentModelFlag in\
           (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]):
            if len(self.lastFourChars) == 4:
                self.lastFourChars.pop(0)
            self.lastFourChars.append(data)

        # The rest of the logic
        if data == "&" and self.contentModelFlag in\
           (contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]) and not\
           self.escapeFlag:
            self.state = self.states["entityData"]
        elif data == "-" and self.contentModelFlag in\
             (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and not\
             self.escapeFlag and "".join(self.lastFourChars) == "<!--":
            self.escapeFlag = True
            self.tokenQueue.append({"type": "Characters", "data": data})
        elif (data == "<" and
              (self.contentModelFlag == contentModelFlags["PCDATA"]
               or (self.contentModelFlag in (contentModelFlags["CDATA"],
                                             contentModelFlags["RCDATA"])
                   and self.escapeFlag == False))):
            self.state = self.states["tagOpen"]
        elif data == ">" and self.contentModelFlag in\
             (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
             self.escapeFlag and "".join(self.lastFourChars)[1:] == "-->":
            self.escapeFlag = False
            self.tokenQueue.append({"type": "Characters", "data": data})
        elif data == EOF:
            # Tokenization ends.
            return False
        elif data in spaceCharacters:
            # Directly after emitting a token you switch back to the "data
            # state". At that point spaceCharacters are important so they are
            # emitted separately.
            self.tokenQueue.append({"type": "SpaceCharacters", "data":
                                    data + self.stream.charsUntil(spaceCharacters, True)})
            # No need to update lastFourChars here, since the first space will
            # have already broken any <!-- or --> sequences
        else:
            chars = self.stream.charsUntil(("&", "<", ">", "-"))
            self.tokenQueue.append({"type": "Characters",
                                    "data": data + chars})
            self.lastFourChars += chars[-4:]
            self.lastFourChars = self.lastFourChars[-4:]
        return True

    def entityDataState(self):
        entity = self.consumeEntity()
        if entity:
            self.tokenQueue.append({"type": "Characters", "data": entity})
        else:
            self.tokenQueue.append({"type": "Characters", "data": u"&"})
        self.state = self.states["data"]
        return True

    def tagOpenState(self):
        data = self.stream.char()
        if self.contentModelFlag == contentModelFlags["PCDATA"]:
            if data == u"!":
                self.state = self.states["markupDeclarationOpen"]
            elif data == u"/":
                self.state = self.states["closeTagOpen"]
            elif data in asciiLetters:
                self.currentToken = {"type": "StartTag",
                                     "name": data, "data": []}
                self.state = self.states["tagName"]
            elif data == u">":
                # XXX In theory it could be something besides a tag name. But
                # do we really care?
                self.tokenQueue.append({"type": "ParseError", "data":
                                        "expected-tag-name-but-got-right-bracket"})
                self.tokenQueue.append({"type": "Characters", "data": u"<>"})
                self.state = self.states["data"]
            elif data == u"?":
                # XXX In theory it could be something besides a tag name. But
                # do we really care?
                self.tokenQueue.append({"type": "ParseError", "data":
                                        "expected-tag-name-but-got-question-mark"})
                self.stream.unget(data)
                self.state = self.states["bogusComment"]
            else:
                # XXX
                self.tokenQueue.append({"type": "ParseError",
                                        "data": "expected-tag-name"})
                self.tokenQueue.append({"type": "Characters", "data": u"<"})
                self.stream.unget(data)
                self.state = self.states["data"]
        else:
            # We know the content model flag is set to either RCDATA or CDATA
            # now because this state can never be entered with the PLAINTEXT
            # flag.
if data == u"/": self.state = self.states["closeTagOpen"] else: self.tokenQueue.append({"type": "Characters", "data": u"<"}) self.stream.unget(data) self.state = self.states["data"] return True def closeTagOpenState(self): if (self.contentModelFlag in (contentModelFlags["RCDATA"], contentModelFlags["CDATA"])): if self.currentToken: charStack = [] # So far we know that "</" has been consumed. We now need to know # whether the next few characters match the name of last emitted # start tag which also happens to be the currentToken. We also need # to have the character directly after the characters that could # match the start tag name. for x in xrange(len(self.currentToken["name"]) + 1): charStack.append(self.stream.char()) # Make sure we don't get hit by EOF if charStack[-1] == EOF: break # Since this is just for checking. We put the characters back on # the stack. self.stream.unget(charStack) if self.currentToken \ and self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \ and charStack[-1] in (spaceCharacters | frozenset((u">", u"/", u"<", EOF))): # Because the characters are correct we can safely switch to # PCDATA mode now. This also means we don't have to do it when # emitting the end tag token. self.contentModelFlag = contentModelFlags["PCDATA"] else: self.tokenQueue.append({"type": "Characters", "data": u"</"}) self.state = self.states["data"] # Need to return here since we don't want the rest of the # method to be walked through. return True data = self.stream.char() if data in asciiLetters: self.currentToken = {"type": "EndTag", "name": data, "data": []} self.state = self.states["tagName"] elif data == u">": self.tokenQueue.append({ "type": "ParseError", "data": "expected-closing-tag-but-got-right-bracket" }) self.state = self.states["data"] elif data == EOF: self.tokenQueue.append({ "type": "ParseError", "data": "expected-closing-tag-but-got-eof" }) self.tokenQueue.append({"type": "Characters", "data": u"</"}) self.state = self.states["data"] else: # XXX data can be _'_... 
            self.tokenQueue.append({"type": "ParseError", "data":
                                    "expected-closing-tag-but-got-char",
                                    "datavars": {"data": data}})
            self.stream.unget(data)
            self.state = self.states["bogusComment"]
        return True

    def tagNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.states["beforeAttributeName"]
        elif data in asciiLetters:
            self.currentToken["name"] += data +\
                self.stream.charsUntil(asciiLetters, True)
        elif data == u">":
            self.emitCurrentToken()
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError",
                                    "data": "eof-in-tag-name"})
            self.emitCurrentToken()
        elif data == u"/":
            self.processSolidusInTag()
            self.state = self.states["beforeAttributeName"]
        else:
            self.currentToken["name"] += data
        return True

    def beforeAttributeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data in asciiLetters:
            self.currentToken["data"].append([data, ""])
            self.state = self.states["attributeName"]
        elif data == u">":
            self.emitCurrentToken()
        elif data == u"/":
            self.processSolidusInTag()
        elif data == u"'" or data == u'"' or data == u"=":
            self.tokenQueue.append({"type": "ParseError",
                                    "data": "invalid-character-in-attribute-name"})
            self.currentToken["data"].append([data, ""])
            self.state = self.states["attributeName"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError",
                                    "data": "expected-attribute-name-but-got-eof"})
            self.emitCurrentToken()
        else:
            self.currentToken["data"].append([data, ""])
            self.state = self.states["attributeName"]
        return True

    def attributeNameState(self):
        data = self.stream.char()
        leavingThisState = True
        emitToken = False
        if data == u"=":
            self.state = self.states["beforeAttributeValue"]
        elif data in asciiLetters:
            self.currentToken["data"][-1][0] += data +\
                self.stream.charsUntil(asciiLetters, True)
            leavingThisState = False
        elif data == u">":
            # XXX If we emit here the attributes are converted to a dict
            # without being checked and when the code below runs we error
            # because data is a dict not a list
            emitToken = True
        elif data in spaceCharacters:
            self.state = self.states["afterAttributeName"]
        elif data == u"/":
            if not self.processSolidusInTag():
                self.state = self.states["beforeAttributeName"]
        elif data == u"'" or data == u'"':
            self.tokenQueue.append({"type": "ParseError",
                                    "data": "invalid-character-in-attribute-name"})
            self.currentToken["data"][-1][0] += data
            leavingThisState = False
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError",
                                    "data": "eof-in-attribute-name"})
            self.state = self.states["data"]
            emitToken = True
        else:
            self.currentToken["data"][-1][0] += data
            leavingThisState = False

        if leavingThisState:
            # Attributes are not dropped at this stage. That happens when the
            # start tag token is emitted so values can still be safely
            # appended to attributes, but we do want to report the parse
            # error in time.
            if self.lowercaseAttrName:
                self.currentToken["data"][-1][0] = (
                    self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
            for name, value in self.currentToken["data"][:-1]:
                if self.currentToken["data"][-1][0] == name:
                    self.tokenQueue.append({"type": "ParseError",
                                            "data": "duplicate-attribute"})
                    break
            # XXX Fix for above XXX
            if emitToken:
                self.emitCurrentToken()
        return True

    def afterAttributeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data == u"=":
            self.state = self.states["beforeAttributeValue"]
        elif data == u">":
            self.emitCurrentToken()
        elif data in asciiLetters:
            self.currentToken["data"].append([data, ""])
            self.state = self.states["attributeName"]
        elif data == u"/":
            if not self.processSolidusInTag():
                self.state = self.states["beforeAttributeName"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError",
                                    "data": "expected-end-of-tag-but-got-eof"})
            self.emitCurrentToken()
        else:
            self.currentToken["data"].append([data, ""])
            self.state = self.states["attributeName"]
        return True

    def beforeAttributeValueState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data == u"\"":
            self.state = self.states["attributeValueDoubleQuoted"]
        elif data == u"&":
            self.state = self.states["attributeValueUnQuoted"]
            self.stream.unget(data)
        elif data == u"'":
            self.state = self.states["attributeValueSingleQuoted"]
        elif data == u">":
            self.emitCurrentToken()
        elif data == u"=":
            self.tokenQueue.append({"type": "ParseError",
                                    "data": "equals-in-unquoted-attribute-value"})
            self.currentToken["data"][-1][1] += data
            self.state = self.states["attributeValueUnQuoted"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError",
                                    "data": "expected-attribute-value-but-got-eof"})
            self.emitCurrentToken()
        else:
            self.currentToken["data"][-1][1] += data
            self.state = self.states["attributeValueUnQuoted"]
        return True

    def attributeValueDoubleQuotedState(self):
        data = self.stream.char()
        if data == "\"":
            self.state = self.states["afterAttributeValue"]
        elif data == u"&":
            self.processEntityInAttribute(u'"')
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError",
                                    "data": "eof-in-attribute-value-double-quote"})
            self.emitCurrentToken()
        else:
            self.currentToken["data"][-1][1] += data +\
                self.stream.charsUntil(("\"", u"&"))
        return True

    def attributeValueSingleQuotedState(self):
        data = self.stream.char()
        if data == "'":
            self.state = self.states["afterAttributeValue"]
        elif data == u"&":
            self.processEntityInAttribute(u"'")
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError",
                                    "data": "eof-in-attribute-value-single-quote"})
            self.emitCurrentToken()
        else:
            self.currentToken["data"][-1][1] += data +\
                self.stream.charsUntil(("'", u"&"))
        return True

    def attributeValueUnQuotedState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.states["beforeAttributeName"]
        elif data == u"&":
            self.processEntityInAttribute(None)
        elif data == u">":
            self.emitCurrentToken()
        elif data == u'"' or data == u"'" or data == u"=":
            self.tokenQueue.append({"type": "ParseError", "data":
                                    "unexpected-character-in-unquoted-attribute-value"})
            self.currentToken["data"][-1][1] += data
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError",
                                    "data": "eof-in-attribute-value-no-quotes"})
            self.emitCurrentToken()
        else:
            self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
                frozenset(("&", ">", "<", "=", "'", '"')) | spaceCharacters)
        return True
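    # Worked example for the unquoted-value handling above: tokenizing the
    # hypothetical input `<a href=foo=bar>` reaches the "=" after "foo" in
    # attributeValueUnQuotedState, which appends an
    # "unexpected-character-in-unquoted-attribute-value" ParseError but still
    # keeps the character, so the start tag is emitted with href="foo=bar".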
afterAttributeValueState(self): data = self.stream.char() if data in spaceCharacters: self.state = self.states["beforeAttributeName"] elif data == u">": self.emitCurrentToken() self.state = self.states["data"] elif data == u"/": if not self.processSolidusInTag(): self.state = self.states["beforeAttributeName"] elif data == EOF: self.tokenQueue.append({ "type": "ParseError", "data": "unexpected-EOF-after-attribute-value" }) self.emitCurrentToken() self.stream.unget(data) self.state = self.states["data"] else: self.tokenQueue.append({ "type": "ParseError", "data": "unexpected-character-after-attribute-value" }) self.stream.unget(data) self.state = self.states["beforeAttributeName"] return True def bogusCommentState(self): # Make a new comment token and give it as value all the characters # until the first > or EOF (charsUntil checks for EOF automatically) # and emit it. self.tokenQueue.append({ "type": "Comment", "data": self.stream.charsUntil((u">")) }) # Eat the character directly after the bogus comment which is either a # ">" or an EOF. self.stream.char() self.state = self.states["data"] return True def markupDeclarationOpenState(self): charStack = [self.stream.char(), self.stream.char()] if charStack == [u"-", u"-"]: self.currentToken = {"type": "Comment", "data": u""} self.state = self.states["commentStart"] else: for x in xrange(5): charStack.append(self.stream.char()) # Put in explicit EOF check if (not EOF in charStack and "".join(charStack).upper() == u"DOCTYPE"): self.currentToken = { "type": "Doctype", "name": u"", "publicId": None, "systemId": None, "correct": True } self.state = self.states["doctype"] else: self.tokenQueue.append({ "type": "ParseError", "data": "expected-dashes-or-doctype" }) self.stream.unget(charStack) self.state = self.states["bogusComment"] return True def commentStartState(self): data = self.stream.char() if data == "-": self.state = self.states["commentStartDash"] elif data == ">": self.tokenQueue.append({ "type": "ParseError", "data": "incorrect-comment" }) self.tokenQueue.append(self.currentToken) self.state = self.states["data"] elif data == EOF: self.tokenQueue.append({ "type": "ParseError", "data": "eof-in-comment" }) self.tokenQueue.append(self.currentToken) self.state = self.states["data"] else: self.currentToken["data"] += data + self.stream.charsUntil(u"-") self.state = self.states["comment"] return True def commentStartDashState(self): data = self.stream.char() if data == "-": self.state = self.states["commentEnd"] elif data == ">": self.tokenQueue.append({ "type": "ParseError", "data": "incorrect-comment" }) self.tokenQueue.append(self.currentToken) self.state = self.states["data"] elif data == EOF: self.tokenQueue.append({ "type": "ParseError", "data": "eof-in-comment" }) self.tokenQueue.append(self.currentToken) self.state = self.states["data"] else: self.currentToken["data"] += "-" + data + self.stream.charsUntil( u"-") self.state = self.states["comment"] return True def commentState(self): data = self.stream.char() if data == u"-": self.state = self.states["commentEndDash"] elif data == EOF: self.tokenQueue.append({ "type": "ParseError", "data": "eof-in-comment" }) self.tokenQueue.append(self.currentToken) self.state = self.states["data"] else: self.currentToken["data"] += data + self.stream.charsUntil(u"-") return True def commentEndDashState(self): data = self.stream.char() if data == u"-": self.state = self.states["commentEnd"] elif data == EOF: self.tokenQueue.append({ "type": "ParseError", "data": "eof-in-comment-end-dash" }) 
self.tokenQueue.append(self.currentToken) self.state = self.states["data"] else: self.currentToken["data"] += u"-" + data +\ self.stream.charsUntil(u"-") # Consume the next character which is either a "-" or an EOF as # well so if there's a "-" directly after the "-" we go nicely to # the "comment end state" without emitting a ParseError() there. self.stream.char() return True def commentEndState(self): data = self.stream.char() if data == u">": self.tokenQueue.append(self.currentToken) self.state = self.states["data"] elif data == u"-": self.tokenQueue.append({ "type": "ParseError", "data": "unexpected-dash-after-double-dash-in-comment" }) self.currentToken["data"] += data elif data == EOF: self.tokenQueue.append({ "type": "ParseError", "data": "eof-in-comment-double-dash" }) self.tokenQueue.append(self.currentToken) self.state = self.states["data"] else: # XXX self.tokenQueue.append({ "type": "ParseError", "data": "unexpected-char-in-comment" }) self.currentToken["data"] += u"--" + data self.state = self.states["comment"] return True def doctypeState(self): data = self.stream.char() if data in spaceCharacters: self.state = self.states["beforeDoctypeName"] else: self.tokenQueue.append({ "type": "ParseError", "data": "need-space-after-doctype" }) self.stream.unget(data) self.state = self.states["beforeDoctypeName"] return True def beforeDoctypeNameState(self): data = self.stream.char() if data in spaceCharacters: pass elif data == u">": self.tokenQueue.append({ "type": "ParseError", "data": "expected-doctype-name-but-got-right-bracket" }) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.states["data"] elif data == EOF: self.tokenQueue.append({ "type": "ParseError", "data": "expected-doctype-name-but-got-eof" }) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.states["data"] else: self.currentToken["name"] = data self.state = self.states["doctypeName"] return True def doctypeNameState(self): data = self.stream.char() if data in spaceCharacters: self.state = self.states["afterDoctypeName"] elif data == u">": self.tokenQueue.append(self.currentToken) self.state = self.states["data"] elif data == EOF: self.tokenQueue.append({ "type": "ParseError", "data": "eof-in-doctype-name" }) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.states["data"] else: self.currentToken["name"] += data return True def afterDoctypeNameState(self): data = self.stream.char() if data in spaceCharacters: pass elif data == u">": self.tokenQueue.append(self.currentToken) self.state = self.states["data"] elif data == EOF: self.currentToken["correct"] = False self.stream.unget(data) self.tokenQueue.append({ "type": "ParseError", "data": "eof-in-doctype" }) self.tokenQueue.append(self.currentToken) self.state = self.states["data"] else: charStack = [data] for x in xrange(5): charStack.append(self.stream.char()) if EOF not in charStack and\ "".join(charStack).translate(asciiUpper2Lower) == "public": self.state = self.states["beforeDoctypePublicIdentifier"] elif EOF not in charStack and\ "".join(charStack).translate(asciiUpper2Lower) == "system": self.state = self.states["beforeDoctypeSystemIdentifier"] else: self.stream.unget(charStack) self.tokenQueue.append({ "type": "ParseError", "data": "expected-space-or-right-bracket-in-doctype", "datavars": { "data": data } }) self.currentToken["correct"] = False self.state = self.states["bogusDoctype"] return True def 
beforeDoctypePublicIdentifierState(self): data = self.stream.char() if data in spaceCharacters: pass elif data == "\"": self.currentToken["publicId"] = u"" self.state = self.states["doctypePublicIdentifierDoubleQuoted"] elif data == "'": self.currentToken["publicId"] = u"" self.state = self.states["doctypePublicIdentifierSingleQuoted"] elif data == ">": self.tokenQueue.append({ "type": "ParseError", "data": "unexpected-end-of-doctype" }) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.states["data"] elif data == EOF: self.tokenQueue.append({ "type": "ParseError", "data": "eof-in-doctype" }) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.states["data"] else: self.tokenQueue.append({ "type": "ParseError", "data": "unexpected-char-in-doctype" }) self.currentToken["correct"] = False self.state = self.states["bogusDoctype"] return True def doctypePublicIdentifierDoubleQuotedState(self): data = self.stream.char() if data == "\"": self.state = self.states["afterDoctypePublicIdentifier"] elif data == ">": self.tokenQueue.append({ "type": "ParseError", "data": "unexpected-end-of-doctype" }) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.states["data"] elif data == EOF: self.tokenQueue.append({ "type": "ParseError", "data": "eof-in-doctype" }) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.states["data"] else: self.currentToken["publicId"] += data return True def doctypePublicIdentifierSingleQuotedState(self): data = self.stream.char() if data == "'": self.state = self.states["afterDoctypePublicIdentifier"] elif data == ">": self.tokenQueue.append({ "type": "ParseError", "data": "unexpected-end-of-doctype" }) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.states["data"] elif data == EOF: self.tokenQueue.append({ "type": "ParseError", "data": "eof-in-doctype" }) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.states["data"] else: self.currentToken["publicId"] += data return True def afterDoctypePublicIdentifierState(self): data = self.stream.char() if data in spaceCharacters: pass elif data == "\"": self.currentToken["systemId"] = u"" self.state = self.states["doctypeSystemIdentifierDoubleQuoted"] elif data == "'": self.currentToken["systemId"] = u"" self.state = self.states["doctypeSystemIdentifierSingleQuoted"] elif data == ">": self.tokenQueue.append(self.currentToken) self.state = self.states["data"] elif data == EOF: self.tokenQueue.append({ "type": "ParseError", "data": "eof-in-doctype" }) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.states["data"] else: self.tokenQueue.append({ "type": "ParseError", "data": "unexpected-char-in-doctype" }) self.currentToken["correct"] = False self.state = self.states["bogusDoctype"] return True def beforeDoctypeSystemIdentifierState(self): data = self.stream.char() if data in spaceCharacters: pass elif data == "\"": self.currentToken["systemId"] = u"" self.state = self.states["doctypeSystemIdentifierDoubleQuoted"] elif data == "'": self.currentToken["systemId"] = u"" self.state = self.states["doctypeSystemIdentifierSingleQuoted"] elif data == ">": self.tokenQueue.append({ "type": "ParseError", "data": "unexpected-char-in-doctype" }) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) 
self.state = self.states["data"] elif data == EOF: self.tokenQueue.append({ "type": "ParseError", "data": "eof-in-doctype" }) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.states["data"] else: self.tokenQueue.append({ "type": "ParseError", "data": "unexpected-char-in-doctype" }) self.currentToken["correct"] = False self.state = self.states["bogusDoctype"] return True def doctypeSystemIdentifierDoubleQuotedState(self): data = self.stream.char() if data == "\"": self.state = self.states["afterDoctypeSystemIdentifier"] elif data == ">": self.tokenQueue.append({ "type": "ParseError", "data": "unexpected-end-of-doctype" }) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.states["data"] elif data == EOF: self.tokenQueue.append({ "type": "ParseError", "data": "eof-in-doctype" }) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.states["data"] else: self.currentToken["systemId"] += data return True def doctypeSystemIdentifierSingleQuotedState(self): data = self.stream.char() if data == "'": self.state = self.states["afterDoctypeSystemIdentifier"] elif data == ">": self.tokenQueue.append({ "type": "ParseError", "data": "unexpected-end-of-doctype" }) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.states["data"] elif data == EOF: self.tokenQueue.append({ "type": "ParseError", "data": "eof-in-doctype" }) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.states["data"] else: self.currentToken["systemId"] += data return True def afterDoctypeSystemIdentifierState(self): data = self.stream.char() if data in spaceCharacters: pass elif data == ">": self.tokenQueue.append(self.currentToken) self.state = self.states["data"] elif data == EOF: self.tokenQueue.append({ "type": "ParseError", "data": "eof-in-doctype" }) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.states["data"] else: self.tokenQueue.append({ "type": "ParseError", "data": "unexpected-char-in-doctype" }) self.state = self.states["bogusDoctype"] return True def bogusDoctypeState(self): data = self.stream.char() if data == u">": self.tokenQueue.append(self.currentToken) self.state = self.states["data"] elif data == EOF: # XXX EMIT self.stream.unget(data) self.tokenQueue.append({ "type": "ParseError", "data": "eof-in-bogus-doctype" }) self.tokenQueue.append(self.currentToken) self.state = self.states["data"] else: pass return True
class HTMLTokenizer: """ This class takes care of tokenizing HTML. * self.currentToken Holds the token that is currently being processed. * self.state Holds a reference to the method to be invoked... XXX * self.stream Points to the HTMLInputStream object. """ # XXX need to fix documentation
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True, lowercaseElementName=True, lowercaseAttrName=True): self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet) #Perform case conversions? self.lowercaseElementName = lowercaseElementName self.lowercaseAttrName = lowercaseAttrName # Setup the initial tokenizer state self.contentModelFlag = contentModelFlags["PCDATA"] self.escapeFlag = False self.lastFourChars = [] self.state = self.dataState self.escape = False # The current token being created self.currentToken = None
def __iter__(self): """ This is where the magic happens. We do our usual processing through the states and when we have a token to return we yield the token, which pauses processing until the next token is requested. """ self.tokenQueue = deque([]) # Start processing. When EOF is reached self.state will return False # instead of True and the loop will terminate. while self.state(): while self.stream.errors: yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)} while self.tokenQueue: yield self.tokenQueue.popleft()
def consumeNumberEntity(self, isHex): """This function returns either U+FFFD or the character based on the decimal or hexadecimal representation. It also discards ";" if present. If it is not present, self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked. """ allowed = digits radix = 10 if isHex: allowed = hexDigits radix = 16 charStack = [] # Consume all the characters that are in range while making sure we # don't hit an EOF. c = self.stream.char() while c in allowed and c is not EOF: charStack.append(c) c = self.stream.char() # Convert the set of characters consumed to an int. charAsInt = int("".join(charStack), radix) # Certain characters get replaced with others if charAsInt in replacementCharacters: char = replacementCharacters[charAsInt] self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "illegal-codepoint-for-numeric-entity", "datavars": {"charAsInt": charAsInt}}) elif ((0xD800 <= charAsInt <= 0xDFFF) or (charAsInt > 0x10FFFF)): char = u"\uFFFD" self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "illegal-codepoint-for-numeric-entity", "datavars": {"charAsInt": charAsInt}}) else: #Should speed up this check somehow (e.g. move the set to a constant) if ((0x0001 <= charAsInt <= 0x0008) or (0x000E <= charAsInt <= 0x001F) or (0x007F <= charAsInt <= 0x009F) or (0xFDD0 <= charAsInt <= 0xFDEF) or charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF])): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "illegal-codepoint-for-numeric-entity", "datavars": {"charAsInt": charAsInt}}) try: # XXX We should have a separate function that does "int" to # "unicodestring" conversion since this doesn't always work # according to hsivonen.
Also, unichr has a limitation of 65535 char = unichr(charAsInt) except: try: char = eval("u'\\U%08x'" % charAsInt) except: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "cant-convert-numeric-entity", "datavars": {"charAsInt": charAsInt}}) # Discard the ; if present. Otherwise, put it back on the queue and # invoke parseError on parser. if c != u";": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "numeric-entity-without-semicolon"}) self.stream.unget(c) return char
def consumeEntity(self, allowedChar=None, fromAttribute=False): # Initialise to the default output for when no entity is matched output = u"&" charStack = [self.stream.char()] if (charStack[0] in spaceCharacters or charStack[0] in (EOF, u"<", u"&") or (allowedChar is not None and allowedChar == charStack[0])): self.stream.unget(charStack[0]) elif charStack[0] == u"#": # Read the next character to see if it's hex or decimal hex = False charStack.append(self.stream.char()) if charStack[-1] in (u"x", u"X"): hex = True charStack.append(self.stream.char()) # charStack[-1] should be the first digit if (hex and charStack[-1] in hexDigits) \ or (not hex and charStack[-1] in digits): # At least one digit found, so consume the whole number self.stream.unget(charStack[-1]) output = self.consumeNumberEntity(hex) else: # No digits found self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-numeric-entity"}) self.stream.unget(charStack.pop()) output = u"&" + u"".join(charStack) else: # At this point in the process we might have a named entity. Entities # are stored in the global variable "entities". # # Consume characters and compare these to a substring of the # entity names in the list until the substring no longer matches. filteredEntityList = entitiesByFirstChar.get(charStack[0], []) def entitiesStartingWith(name): return [e for e in filteredEntityList if e.startswith(name)] while charStack[-1] is not EOF and\ entitiesStartingWith("".join(charStack)): charStack.append(self.stream.char()) # At this point we have a string that starts with some characters # that may match an entity entityName = None # Try to find the longest entity the string will match to take care # of &noti for instance. for entityLength in xrange(len(charStack)-1, 1, -1): possibleEntityName = "".join(charStack[:entityLength]) if possibleEntityName in entities: entityName = possibleEntityName break if entityName is not None: if entityName[-1] != ";": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "named-entity-without-semicolon"}) if entityName[-1] != ";" and fromAttribute and \ (charStack[entityLength] in asciiLetters or charStack[entityLength] in digits): self.stream.unget(charStack.pop()) output = u"&" + u"".join(charStack) else: output = entities[entityName] self.stream.unget(charStack.pop()) output += u"".join(charStack[entityLength:]) else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-named-entity"}) self.stream.unget(charStack.pop()) output = u"&" + u"".join(charStack) if fromAttribute: self.currentToken["data"][-1][1] += output else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": output})
def processEntityInAttribute(self, allowedChar): """This method replaces the need for "entityInAttributeValueState". """ self.consumeEntity(allowedChar=allowedChar, fromAttribute=True)
def emitCurrentToken(self): """This method is a generic handler for emitting the tags.
It also sets the state to "data" because that's what's needed after a token has been emitted. """ token = self.currentToken # Add token to the queue to be yielded if (token["type"] in tagTokenTypes): if self.lowercaseElementName: token["name"] = token["name"].translate(asciiUpper2Lower) if token["type"] == tokenTypes["EndTag"]: if token["data"]: self.tokenQueue.append({"type":tokenTypes["ParseError"], "data":"attributes-in-end-tag"}) if token["selfClosing"]: self.tokenQueue.append({"type":tokenTypes["ParseError"], "data":"self-closing-flag-on-end-tag"}) self.tokenQueue.append(token) self.state = self.dataState # Below are the various tokenizer states worked out. def dataState(self): #XXX - consider splitting this state based on the content model flag data = self.stream.char() # Keep a charbuffer to handle the escapeFlag if (self.contentModelFlag in (contentModelFlags["CDATA"], contentModelFlags["RCDATA"])): if len(self.lastFourChars) == 4: self.lastFourChars.pop(0) self.lastFourChars.append(data) # The rest of the logic if (data == "&" and self.contentModelFlag in (contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]) and not self.escapeFlag): self.state = self.entityDataState elif (data == "-" and self.contentModelFlag in (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and not self.escapeFlag and "".join(self.lastFourChars) == "<!--"): self.escapeFlag = True self.tokenQueue.append({"type": tokenTypes["Characters"], "data":data}) elif (data == "<" and (self.contentModelFlag == contentModelFlags["PCDATA"] or (self.contentModelFlag in (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and self.escapeFlag == False))): self.state = self.tagOpenState elif (data == ">" and self.contentModelFlag in (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and self.escapeFlag and "".join(self.lastFourChars)[1:] == "-->"): self.escapeFlag = False self.tokenQueue.append({"type": tokenTypes["Characters"], "data":data}) elif data is EOF: # Tokenization ends. return False elif data in spaceCharacters: # Directly after emitting a token you switch back to the "data # state". At that point spaceCharacters are important so they are # emitted separately. self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": data + self.stream.charsUntil(spaceCharacters, True)}) # No need to update lastFourChars here, since the first space will # have already been appended to lastFourChars and will have broken # any <!-- or --> sequences else: if (self.contentModelFlag in (contentModelFlags["CDATA"], contentModelFlags["RCDATA"])): chars = self.stream.charsUntil((u"&", u"<", u">", u"-")) self.lastFourChars += chars[-4:] self.lastFourChars = self.lastFourChars[-4:] else: chars = self.stream.charsUntil((u"&", u"<")) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data + chars}) return True def entityDataState(self): self.consumeEntity() self.state = self.dataState return True def tagOpenState(self): data = self.stream.char() if self.contentModelFlag == contentModelFlags["PCDATA"]: if data == u"!": self.state = self.markupDeclarationOpenState elif data == u"/": self.state = self.closeTagOpenState elif data in asciiLetters: self.currentToken = {"type": tokenTypes["StartTag"], "name": data, "data": [], "selfClosing": False, "selfClosingAcknowledged": False} self.state = self.tagNameState elif data == u">": # XXX In theory it could be something besides a tag name. But # do we really care? 
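# (For example, the input u"<>" is reported just below as
# expected-tag-name-but-got-right-bracket and re-emitted literally as the
# character data u"<>".)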
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-tag-name-but-got-right-bracket"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<>"}) self.state = self.dataState elif data == u"?": # XXX In theory it could be something besides a tag name. But # do we really care? self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-tag-name-but-got-question-mark"}) self.stream.unget(data) self.state = self.bogusCommentState else: # XXX self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-tag-name"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"}) self.stream.unget(data) self.state = self.dataState else: # We know the content model flag is set to either RCDATA or CDATA # now because this state can never be entered with the PLAINTEXT # flag. if data == u"/": self.state = self.closeTagOpenState else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"}) self.stream.unget(data) self.state = self.dataState return True def closeTagOpenState(self): if (self.contentModelFlag in (contentModelFlags["RCDATA"], contentModelFlags["CDATA"])): charStack = [] if self.currentToken: # So far we know that "</" has been consumed. We now need to know # whether the next few characters match the name of last emitted # start tag which also happens to be the currentToken. matched = True for expected in self.currentToken["name"].lower(): charStack.append(self.stream.char()) if charStack[-1] not in (expected, expected.upper()): matched = False break # If the tag name prefix matched, we also need to check the # subsequent character if matched: charStack.append(self.stream.char()) if charStack[-1] in (spaceCharacters | frozenset((u">", u"/", EOF))): self.contentModelFlag = contentModelFlags["PCDATA"] # Unget the last character, so it can be re-processed # in the next state self.stream.unget(charStack.pop()) # The remaining characters in charStack are the tag name self.currentToken = {"type": tokenTypes["EndTag"], "name": u"".join(charStack), "data": [], "selfClosing":False} self.state = self.tagNameState return True # Didn't find the end tag. The last character in charStack could be # anything, so it has to be re-processed in the data state self.stream.unget(charStack.pop()) # The remaining characters are a prefix of the tag name, so they're # just letters and digits, so they can be output as character # tokens immediately self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</" + u"".join(charStack)}) self.state = self.dataState return True data = self.stream.char() if data in asciiLetters: self.currentToken = {"type": tokenTypes["EndTag"], "name": data, "data": [], "selfClosing":False} self.state = self.tagNameState elif data == u">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-closing-tag-but-got-right-bracket"}) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-closing-tag-but-got-eof"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"}) self.state = self.dataState else: # XXX data can be _'_... 
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-closing-tag-but-got-char", "datavars": {"data": data}}) self.stream.unget(data) self.state = self.bogusCommentState return True def tagNameState(self): data = self.stream.char() if data in spaceCharacters: self.state = self.beforeAttributeNameState elif data == u">": self.emitCurrentToken() elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-tag-name"}) self.state = self.dataState elif data == u"/": self.state = self.selfClosingStartTagState else: self.currentToken["name"] += data # (Don't use charsUntil here, because tag names are # very short and it's faster to not do anything fancy) return True def beforeAttributeNameState(self): data = self.stream.char() if data in spaceCharacters: self.stream.charsUntil(spaceCharacters, True) elif data in asciiLetters: self.currentToken["data"].append([data, ""]) self.state = self.attributeNameState elif data == u">": self.emitCurrentToken() elif data == u"/": self.state = self.selfClosingStartTagState elif data in (u"'", u'"', u"=", u"<"): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-character-in-attribute-name"}) self.currentToken["data"].append([data, ""]) self.state = self.attributeNameState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-attribute-name-but-got-eof"}) self.state = self.dataState else: self.currentToken["data"].append([data, ""]) self.state = self.attributeNameState return True def attributeNameState(self): data = self.stream.char() leavingThisState = True emitToken = False if data == u"=": self.state = self.beforeAttributeValueState elif data in asciiLetters: self.currentToken["data"][-1][0] += data +\ self.stream.charsUntil(asciiLetters, True) leavingThisState = False elif data == u">": # XXX If we emit here the attributes are converted to a dict # without being checked and when the code below runs we error # because data is a dict not a list emitToken = True elif data in spaceCharacters: self.state = self.afterAttributeNameState elif data == u"/": self.state = self.selfClosingStartTagState elif data in (u"'", u'"', u"<"): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-character-in-attribute-name"}) self.currentToken["data"][-1][0] += data leavingThisState = False elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-attribute-name"}) self.state = self.dataState emitToken = True else: self.currentToken["data"][-1][0] += data leavingThisState = False if leavingThisState: # Attributes are not dropped at this stage. That happens when the # start tag token is emitted so values can still be safely appended # to attributes, but we do want to report the parse error in time. 
if self.lowercaseAttrName: self.currentToken["data"][-1][0] = ( self.currentToken["data"][-1][0].translate(asciiUpper2Lower)) for name, value in self.currentToken["data"][:-1]: if self.currentToken["data"][-1][0] == name: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "duplicate-attribute"}) break # XXX Fix for above XXX if emitToken: self.emitCurrentToken() return True def afterAttributeNameState(self): data = self.stream.char() if data in spaceCharacters: self.stream.charsUntil(spaceCharacters, True) elif data == u"=": self.state = self.beforeAttributeValueState elif data == u">": self.emitCurrentToken() elif data in asciiLetters: self.currentToken["data"].append([data, ""]) self.state = self.attributeNameState elif data == u"/": self.state = self.selfClosingStartTagState elif data in (u"'", u'"', u"<"): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-character-after-attribute-name"}) self.currentToken["data"].append([data, ""]) self.state = self.attributeNameState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-end-of-tag-but-got-eof"}) self.emitCurrentToken() else: self.currentToken["data"].append([data, ""]) self.state = self.attributeNameState return True def beforeAttributeValueState(self): data = self.stream.char() if data in spaceCharacters: self.stream.charsUntil(spaceCharacters, True) elif data == u"\"": self.state = self.attributeValueDoubleQuotedState elif data == u"&": self.state = self.attributeValueUnQuotedState self.stream.unget(data); elif data == u"'": self.state = self.attributeValueSingleQuotedState elif data == u">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-attribute-value-but-got-right-bracket"}) self.emitCurrentToken() elif data in (u"=", u"<"): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "equals-in-unquoted-attribute-value"}) self.currentToken["data"][-1][1] += data self.state = self.attributeValueUnQuotedState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-attribute-value-but-got-eof"}) self.emitCurrentToken() else: self.currentToken["data"][-1][1] += data self.state = self.attributeValueUnQuotedState return True def attributeValueDoubleQuotedState(self): data = self.stream.char() if data == "\"": self.state = self.afterAttributeValueState elif data == u"&": self.processEntityInAttribute(u'"') elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-attribute-value-double-quote"}) self.emitCurrentToken() else: self.currentToken["data"][-1][1] += data +\ self.stream.charsUntil(("\"", u"&")) return True def attributeValueSingleQuotedState(self): data = self.stream.char() if data == "'": self.state = self.afterAttributeValueState elif data == u"&": self.processEntityInAttribute(u"'") elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-attribute-value-single-quote"}) self.emitCurrentToken() else: self.currentToken["data"][-1][1] += data +\ self.stream.charsUntil(("'", u"&")) return True def attributeValueUnQuotedState(self): data = self.stream.char() if data in spaceCharacters: self.state = self.beforeAttributeNameState elif data == u"&": self.processEntityInAttribute(">") elif data == u">": self.emitCurrentToken() elif data in (u'"', u"'", u"=", u"<"): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-character-in-unquoted-attribute-value"}) self.currentToken["data"][-1][1] 
+= data elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-attribute-value-no-quotes"}) self.emitCurrentToken() else: self.currentToken["data"][-1][1] += data + self.stream.charsUntil( \ frozenset(("&", ">", "<", "=", "'", '"')) | spaceCharacters) return True def afterAttributeValueState(self): data = self.stream.char() if data in spaceCharacters: self.state = self.beforeAttributeNameState elif data == u">": self.emitCurrentToken() elif data == u"/": self.state = self.selfClosingStartTagState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-EOF-after-attribute-value"}) self.emitCurrentToken() self.stream.unget(data) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-character-after-attribute-value"}) self.stream.unget(data) self.state = self.beforeAttributeNameState return True def selfClosingStartTagState(self): data = self.stream.char() if data == ">": self.currentToken["selfClosing"] = True self.emitCurrentToken() elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-EOF-after-solidus-in-tag"}) self.stream.unget(data) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-character-after-soldius-in-tag"}) self.stream.unget(data) self.state = self.beforeAttributeNameState return True def bogusCommentState(self): # Make a new comment token and give it as value all the characters # until the first > or EOF (charsUntil checks for EOF automatically) # and emit it. self.tokenQueue.append( {"type": tokenTypes["Comment"], "data": self.stream.charsUntil(u">")}) # Eat the character directly after the bogus comment which is either a # ">" or an EOF. self.stream.char() self.state = self.dataState return True def bogusCommentContinuationState(self): # Like bogusCommentState, but the caller must create the comment token # and this state just adds more characters to it self.currentToken["data"] += self.stream.charsUntil(u">") self.tokenQueue.append(self.currentToken) # Eat the character directly after the bogus comment which is either a # ">" or an EOF. 
self.stream.char() self.state = self.dataState return True def markupDeclarationOpenState(self): charStack = [self.stream.char()] if charStack[-1] == u"-": charStack.append(self.stream.char()) if charStack[-1] == u"-": self.currentToken = {"type": tokenTypes["Comment"], "data": u""} self.state = self.commentStartState return True elif charStack[-1] in (u'd', u'D'): matched = True for expected in ((u'o', u'O'), (u'c', u'C'), (u't', u'T'), (u'y', u'Y'), (u'p', u'P'), (u'e', u'E')): charStack.append(self.stream.char()) if charStack[-1] not in expected: matched = False break if matched: self.currentToken = {"type": tokenTypes["Doctype"], "name": u"", "publicId": None, "systemId": None, "correct": True} self.state = self.doctypeState return True self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-dashes-or-doctype"}) # charStack[:-2] consists of 'safe' characters ('-', 'd', 'o', etc) # so they can be copied directly into the bogus comment data, and only # the last character might be '>' or EOF and needs to be ungetted self.stream.unget(charStack.pop()) self.currentToken = {"type": tokenTypes["Comment"], "data": u"".join(charStack)} self.state = self.bogusCommentContinuationState return True def commentStartState(self): data = self.stream.char() if data == "-": self.state = self.commentStartDashState elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "incorrect-comment"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-comment"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["data"] += data + self.stream.charsUntil(u"-") self.state = self.commentState return True def commentStartDashState(self): data = self.stream.char() if data == "-": self.state = self.commentEndState elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "incorrect-comment"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-comment"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["data"] += "-" + data + self.stream.charsUntil(u"-") self.state = self.commentState return True def commentState(self): data = self.stream.char() if data == u"-": self.state = self.commentEndDashState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-comment"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["data"] += data + self.stream.charsUntil(u"-") return True def commentEndDashState(self): data = self.stream.char() if data == u"-": self.state = self.commentEndState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-comment-end-dash"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["data"] += u"-" + data +\ self.stream.charsUntil(u"-") # Consume the next character which is either a "-" or an EOF as # well so if there's a "-" directly after the "-" we go nicely to # the "comment end state" without emitting a ParseError() there. 
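# (Worked example: u"<!--a-b-->" reaches this branch with data == u"b"; the
# comment data becomes u"a-b", the following u"-" is consumed just below, and
# the next pass sees the second u"-" and moves on to commentEndState.)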
self.stream.char() return True def commentEndState(self): data = self.stream.char() if data == u">": self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data == u"-": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-dash-after-double-dash-in-comment"}) self.currentToken["data"] += data elif data in spaceCharacters: self.currentToken["data"] += "--" + data self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-space-after-double-dash-in-comment"}) self.state = self.commentEndSpaceState elif data == "!": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-bang-after-double-dash-in-comment"}) self.state = self.commentEndBangState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-comment-double-dash"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: # XXX self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-comment"}) self.currentToken["data"] += u"--" + data self.state = self.commentState return True def commentEndBangState(self): data = self.stream.char() if data == u">": self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data == u"-": self.currentToken["data"] += "--!" self.state = self.commentEndDashState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-comment-end-bang-state"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["data"] += u"--!" + data self.state = self.commentState return True def commentEndSpaceState(self): data = self.stream.char() if data == u">": self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data == u"-": self.state = self.commentEndDashState elif data in spaceCharacters: self.currentToken["data"] += data elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-comment-end-space-state"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["data"] += data self.state = self.commentState return True def doctypeState(self): data = self.stream.char() if data in spaceCharacters: self.state = self.beforeDoctypeNameState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-doctype-name-but-got-eof"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "need-space-after-doctype"}) self.stream.unget(data) self.state = self.beforeDoctypeNameState return True def beforeDoctypeNameState(self): data = self.stream.char() if data in spaceCharacters: pass elif data == u">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-doctype-name-but-got-right-bracket"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-doctype-name-but-got-eof"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["name"] = data self.state = self.doctypeNameState return True def doctypeNameState(self): data = self.stream.char() if data in spaceCharacters: self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) self.state = self.afterDoctypeNameState elif 
data == u">": self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype-name"}) self.currentToken["correct"] = False self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["name"] += data return True def afterDoctypeNameState(self): data = self.stream.char() if data in spaceCharacters: pass elif data == u">": self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.currentToken["correct"] = False self.stream.unget(data) self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: if data in (u"p", u"P"): matched = True for expected in ((u"u", u"U"), (u"b", u"B"), (u"l", u"L"), (u"i", u"I"), (u"c", u"C")): data = self.stream.char() if data not in expected: matched = False break if matched: self.state = self.beforeDoctypePublicIdentifierState return True elif data in (u"s", u"S"): matched = True for expected in ((u"y", u"Y"), (u"s", u"S"), (u"t", u"T"), (u"e", u"E"), (u"m", u"M")): data = self.stream.char() if data not in expected: matched = False break if matched: self.state = self.beforeDoctypeSystemIdentifierState return True # All the characters read before the current 'data' will be # [a-zA-Z], so they're garbage in the bogus doctype and can be # discarded; only the latest character might be '>' or EOF # and needs to be ungetted self.stream.unget(data) self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-space-or-right-bracket-in-doctype", "datavars": {"data": data}}) self.currentToken["correct"] = False self.state = self.bogusDoctypeState return True def beforeDoctypePublicIdentifierState(self): data = self.stream.char() if data in spaceCharacters: pass elif data == "\"": self.currentToken["publicId"] = u"" self.state = self.doctypePublicIdentifierDoubleQuotedState elif data == "'": self.currentToken["publicId"] = u"" self.state = self.doctypePublicIdentifierSingleQuotedState elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-end-of-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-doctype"}) self.currentToken["correct"] = False self.state = self.bogusDoctypeState return True def doctypePublicIdentifierDoubleQuotedState(self): data = self.stream.char() if data == "\"": self.state = self.afterDoctypePublicIdentifierState elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-end-of-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["publicId"] += data return True def 
doctypePublicIdentifierSingleQuotedState(self): data = self.stream.char() if data == "'": self.state = self.afterDoctypePublicIdentifierState elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-end-of-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["publicId"] += data return True def afterDoctypePublicIdentifierState(self): data = self.stream.char() if data in spaceCharacters: pass elif data == "\"": self.currentToken["systemId"] = u"" self.state = self.doctypeSystemIdentifierDoubleQuotedState elif data == "'": self.currentToken["systemId"] = u"" self.state = self.doctypeSystemIdentifierSingleQuotedState elif data == ">": self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-doctype"}) self.currentToken["correct"] = False self.state = self.bogusDoctypeState return True def beforeDoctypeSystemIdentifierState(self): data = self.stream.char() if data in spaceCharacters: pass elif data == "\"": self.currentToken["systemId"] = u"" self.state = self.doctypeSystemIdentifierDoubleQuotedState elif data == "'": self.currentToken["systemId"] = u"" self.state = self.doctypeSystemIdentifierSingleQuotedState elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-doctype"}) self.currentToken["correct"] = False self.state = self.bogusDoctypeState return True def doctypeSystemIdentifierDoubleQuotedState(self): data = self.stream.char() if data == "\"": self.state = self.afterDoctypeSystemIdentifierState elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-end-of-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["systemId"] += data return True def doctypeSystemIdentifierSingleQuotedState(self): data = self.stream.char() if data == "'": self.state = self.afterDoctypeSystemIdentifierState elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-end-of-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False 
self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["systemId"] += data return True def afterDoctypeSystemIdentifierState(self): data = self.stream.char() if data in spaceCharacters: pass elif data == ">": self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-doctype"}) self.state = self.bogusDoctypeState return True def bogusDoctypeState(self): data = self.stream.char() if data == u">": self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: # XXX EMIT self.stream.unget(data) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: pass return True
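# Editorial illustration (not html5lib code): a standalone sketch of the
# numeric character reference decoding performed by consumeNumberEntity
# above. "_replacements" stands in for the module's replacementCharacters
# table; only two illustrative entries are shown here.
_replacements = {0x0D: u"\u000D", 0x80: u"\u20AC"}

def _decodeNumericRef(digitString, isHex=False):
    errors = []
    charAsInt = int(digitString, 16 if isHex else 10)
    if charAsInt in _replacements:
        # Certain codepoints are mapped to replacements and flagged.
        errors.append("illegal-codepoint-for-numeric-entity")
        char = _replacements[charAsInt]
    elif 0xD800 <= charAsInt <= 0xDFFF or charAsInt > 0x10FFFF:
        # Surrogates and out-of-range codepoints become U+FFFD.
        errors.append("illegal-codepoint-for-numeric-entity")
        char = u"\uFFFD"
    else:
        try:
            char = unichr(charAsInt)
        except ValueError:
            # Narrow Python 2 builds cap unichr at 0xFFFF; fall back to a
            # surrogate pair, as the tokenizer does.
            char = eval("u'\\U%08x'" % charAsInt)
    return char, errors

# _decodeNumericRef("41", isHex=True) == (u"A", [])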
class HTMLTokenizer: """ This class takes care of tokenizing HTML. * self.currentToken Holds the token that is currently being processed. * self.state Holds a reference to the method to be invoked... XXX * self.states Holds a mapping between states and methods that implement the state. * self.stream Points to HTMLInputStream object. """ # XXX need to fix documentation def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True, lowercaseElementName=True, lowercaseAttrName=True): self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet) #Perform case conversions? self.lowercaseElementName = lowercaseElementName self.lowercaseAttrName = lowercaseAttrName self.states = { "data":self.dataState, "entityData":self.entityDataState, "tagOpen":self.tagOpenState, "closeTagOpen":self.closeTagOpenState, "tagName":self.tagNameState, "beforeAttributeName":self.beforeAttributeNameState, "attributeName":self.attributeNameState, "afterAttributeName":self.afterAttributeNameState, "beforeAttributeValue":self.beforeAttributeValueState, "attributeValueDoubleQuoted":self.attributeValueDoubleQuotedState, "attributeValueSingleQuoted":self.attributeValueSingleQuotedState, "attributeValueUnQuoted":self.attributeValueUnQuotedState, "afterAttributeValue":self.afterAttributeValueState, "bogusComment":self.bogusCommentState, "bogusCommentContinuation":self.bogusCommentContinuationState, "markupDeclarationOpen":self.markupDeclarationOpenState, "commentStart":self.commentStartState, "commentStartDash":self.commentStartDashState, "comment":self.commentState, "commentEndDash":self.commentEndDashState, "commentEnd":self.commentEndState, "doctype":self.doctypeState, "beforeDoctypeName":self.beforeDoctypeNameState, "doctypeName":self.doctypeNameState, "afterDoctypeName":self.afterDoctypeNameState, "beforeDoctypePublicIdentifier":self.beforeDoctypePublicIdentifierState, "doctypePublicIdentifierDoubleQuoted":self.doctypePublicIdentifierDoubleQuotedState, "doctypePublicIdentifierSingleQuoted":self.doctypePublicIdentifierSingleQuotedState, "afterDoctypePublicIdentifier":self.afterDoctypePublicIdentifierState, "beforeDoctypeSystemIdentifier":self.beforeDoctypeSystemIdentifierState, "doctypeSystemIdentifierDoubleQuoted":self.doctypeSystemIdentifierDoubleQuotedState, "doctypeSystemIdentifierSingleQuoted":self.doctypeSystemIdentifierSingleQuotedState, "afterDoctypeSystemIdentifier":self.afterDoctypeSystemIdentifierState, "bogusDoctype":self.bogusDoctypeState } # Setup the initial tokenizer state self.contentModelFlag = contentModelFlags["PCDATA"] self.escapeFlag = False self.lastFourChars = [] self.state = self.states["data"] # The current token being created self.currentToken = None def __iter__(self): """ This is where the magic happens. We do our usually processing through the states and when we have a token to return we yield the token which pauses processing until the next token is requested. """ self.tokenQueue = deque([]) # Start processing. When EOF is reached self.state will return False # instead of True and the loop will terminate. while self.state(): while self.stream.errors: yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)} while self.tokenQueue: yield self.tokenQueue.popleft() # Below are various helper functions the tokenizer states use worked out. 
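# (For instance, once tagNameState hands a u"/" to processSolidusInTag below,
# u"<br/>" ends up emitted as an EmptyTag token, while u"<br/ >" queues
# incorrectly-placed-solidus and simply keeps parsing the tag.)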
def processSolidusInTag(self): """If the next character is a '>', convert the currentToken into an EmptyTag """ rv = False # We need to consume another character to make sure it's a ">" data = self.stream.char() if self.currentToken["type"] == tokenTypes["StartTag"] and data == u">": self.currentToken["type"] = tokenTypes["EmptyTag"] elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "EOF following solidus"}) self.state = self.states["data"] self.emitCurrentToken() rv = True else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "incorrectly-placed-solidus"}) # The character we just consumed needs to be put back on the stack so it # doesn't get lost... self.stream.unget(data) return rv
def consumeNumberEntity(self, isHex): """This function returns either U+FFFD or the character based on the decimal or hexadecimal representation. It also discards ";" if present. If it is not present, self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked. """ allowed = digits radix = 10 if isHex: allowed = hexDigits radix = 16 charStack = [] # Consume all the characters that are in range while making sure we # don't hit an EOF. c = self.stream.char() while c in allowed and c is not EOF: charStack.append(c) c = self.stream.char() # Convert the set of characters consumed to an int. charAsInt = int("".join(charStack), radix) if charAsInt == 13: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "incorrect-cr-newline-entity"}) charAsInt = 10 elif 127 < charAsInt < 160: # If the integer is between 127 and 160 (i.e. 128 through 159 # inclusive) we need to do the "windows trick". self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "illegal-windows-1252-entity"}) charAsInt = entitiesWindows1252[charAsInt - 128] # Certain characters get replaced with U+FFFD if ((charAsInt <= 0x0008) or (charAsInt == 0x000B) or (0x000E <= charAsInt <= 0x001F) or (0x007F <= charAsInt <= 0x009F) or (0xD800 <= charAsInt <= 0xDFFF) or (0xFDD0 <= charAsInt <= 0xFDEF) or (charAsInt & 0xFFFE == 0xFFFE) # catch all U+?FFFE and U+?FFFF, where ? is 0..10 or (0x10FFFF < charAsInt)): char = u"\uFFFD" self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "illegal-codepoint-for-numeric-entity", "datavars": {"charAsInt": charAsInt}}) else: try: # XXX We should have a separate function that does "int" to # "unicodestring" conversion since this doesn't always work # according to hsivonen. Also, unichr has a limitation of 65535 char = unichr(charAsInt) except: try: char = eval("u'\\U%08x'" % charAsInt) except: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "cant-convert-numeric-entity", "datavars": {"charAsInt": charAsInt}}) # Discard the ; if present. Otherwise, put it back on the queue and # invoke parseError on parser.
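# (E.g. u"&#x41;" decodes to u"A" with the u";" consumed, while u"&#65 "
# decodes to u"A", queues numeric-entity-without-semicolon, and ungets the
# u" " for the next state.)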
if c != u";": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "numeric-entity-without-semicolon"}) self.stream.unget(c) return char def consumeEntity(self, allowedChar=None, fromAttribute=False): # Initialise to the default output for when no entity is matched output = u"&" charStack = [self.stream.char()] if charStack[0] in spaceCharacters or charStack[0] in (EOF, u"<", u"&") \ or (allowedChar is not None and allowedChar == charStack[0]): self.stream.unget(charStack[0]) elif charStack[0] == u"#": # Read the next character to see if it's hex or decimal hex = False charStack.append(self.stream.char()) if charStack[-1] in (u"x", u"X"): hex = True charStack.append(self.stream.char()) # charStack[-1] should be the first digit if (hex and charStack[-1] in hexDigits) \ or (not hex and charStack[-1] in digits): # At least one digit found, so consume the whole number self.stream.unget(charStack[-1]) output = self.consumeNumberEntity(hex) else: # No digits found self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-numeric-entity"}) self.stream.unget(charStack.pop()) output = u"&" + u"".join(charStack) else: # At this point in the process might have named entity. Entities # are stored in the global variable "entities". # # Consume characters and compare to these to a substring of the # entity names in the list until the substring no longer matches. filteredEntityList = entitiesByFirstChar.get(charStack[0], []) def entitiesStartingWith(name): return [e for e in filteredEntityList if e.startswith(name)] while charStack[-1] is not EOF and\ entitiesStartingWith("".join(charStack)): charStack.append(self.stream.char()) # At this point we have a string that starts with some characters # that may match an entity entityName = None # Try to find the longest entity the string will match to take care # of ¬i for instance. for entityLength in xrange(len(charStack)-1, 1, -1): possibleEntityName = "".join(charStack[:entityLength]) if possibleEntityName in entities: entityName = possibleEntityName break if entityName is not None: if entityName[-1] != ";": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "named-entity-without-semicolon"}) if entityName[-1] != ";" and fromAttribute and \ (charStack[entityLength] in asciiLetters or charStack[entityLength] in digits): self.stream.unget(charStack.pop()) output = u"&" + u"".join(charStack) else: output = entities[entityName] self.stream.unget(charStack.pop()) output += u"".join(charStack[entityLength:]) else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-named-entity"}) self.stream.unget(charStack.pop()) output = u"&" + u"".join(charStack) if fromAttribute: self.currentToken["data"][-1][1] += output else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": output}) def processEntityInAttribute(self, allowedChar): """This method replaces the need for "entityInAttributeValueState". """ self.consumeEntity(allowedChar=allowedChar, fromAttribute=True) def emitCurrentToken(self): """This method is a generic handler for emitting the tags. It also sets the state to "data" because that's what's needed after a token has been emitted. 
""" token = self.currentToken # Add token to the queue to be yielded if (token["type"] in (tokenTypes["StartTag"], tokenTypes["EndTag"], tokenTypes["EmptyTag"])): if self.lowercaseElementName: token["name"] = token["name"].translate(asciiUpper2Lower) if token["type"] == tokenTypes["EndTag"] and token["data"]: self.tokenQueue.append({"type":tokenTypes["ParseError"], "data":"attributes-in-end-tag"}) self.tokenQueue.append(token) self.state = self.states["data"] # Below are the various tokenizer states worked out. def dataState(self): data = self.stream.char() # Keep a charbuffer to handle the escapeFlag if (self.contentModelFlag in (contentModelFlags["CDATA"], contentModelFlags["RCDATA"])): if len(self.lastFourChars) == 4: self.lastFourChars.pop(0) self.lastFourChars.append(data) # The rest of the logic if (data == "&" and self.contentModelFlag in (contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]) and not self.escapeFlag): self.state = self.states["entityData"] elif (data == "-" and self.contentModelFlag in (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and not self.escapeFlag and "".join(self.lastFourChars) == "<!--"): self.escapeFlag = True self.tokenQueue.append({"type": tokenTypes["Characters"], "data":data}) elif (data == "<" and (self.contentModelFlag == contentModelFlags["PCDATA"] or (self.contentModelFlag in (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and self.escapeFlag == False))): self.state = self.states["tagOpen"] elif (data == ">" and self.contentModelFlag in (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and self.escapeFlag and "".join(self.lastFourChars)[1:] == "-->"): self.escapeFlag = False self.tokenQueue.append({"type": tokenTypes["Characters"], "data":data}) elif data is EOF: # Tokenization ends. return False elif data in spaceCharacters: # Directly after emitting a token you switch back to the "data # state". At that point spaceCharacters are important so they are # emitted separately. self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": data + self.stream.charsUntil(spaceCharacters, True)}) # No need to update lastFourChars here, since the first space will # have already been appended to lastFourChars and will have broken # any <!-- or --> sequences else: if (self.contentModelFlag in (contentModelFlags["CDATA"], contentModelFlags["RCDATA"])): chars = self.stream.charsUntil((u"&", u"<", u">", u"-")) self.lastFourChars += chars[-4:] self.lastFourChars = self.lastFourChars[-4:] else: chars = self.stream.charsUntil((u"&", u"<")) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data + chars}) return True def entityDataState(self): self.consumeEntity() self.state = self.states["data"] return True def tagOpenState(self): data = self.stream.char() if self.contentModelFlag == contentModelFlags["PCDATA"]: if data == u"!": self.state = self.states["markupDeclarationOpen"] elif data == u"/": self.state = self.states["closeTagOpen"] elif data in asciiLetters: self.currentToken =\ {"type": tokenTypes["StartTag"], "name": data, "data": []} self.state = self.states["tagName"] elif data == u">": # XXX In theory it could be something besides a tag name. But # do we really care? self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-tag-name-but-got-right-bracket"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<>"}) self.state = self.states["data"] elif data == u"?": # XXX In theory it could be something besides a tag name. But # do we really care? 
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-tag-name-but-got-question-mark"}) self.stream.unget(data) self.state = self.states["bogusComment"] else: # XXX self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-tag-name"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"}) self.stream.unget(data) self.state = self.states["data"] else: # We know the content model flag is set to either RCDATA or CDATA # now because this state can never be entered with the PLAINTEXT # flag. if data == u"/": self.state = self.states["closeTagOpen"] else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"}) self.stream.unget(data) self.state = self.states["data"] return True def closeTagOpenState(self): if (self.contentModelFlag in (contentModelFlags["RCDATA"], contentModelFlags["CDATA"])): charStack = [] if self.currentToken: # So far we know that "</" has been consumed. We now need to know # whether the next few characters match the name of last emitted # start tag which also happens to be the currentToken. matched = True for expected in self.currentToken["name"].lower(): charStack.append(self.stream.char()) if charStack[-1] not in (expected, expected.upper()): matched = False break # If the tag name prefix matched, we also need to check the # subsequent character if matched: charStack.append(self.stream.char()) if charStack[-1] in (spaceCharacters | frozenset((u">", u"/", EOF))): self.contentModelFlag = contentModelFlags["PCDATA"] # Unget the last character, so it can be re-processed # in the next state self.stream.unget(charStack.pop()) # The remaining characters in charStack are the tag name self.currentToken = {"type": tokenTypes["EndTag"], "name": u"".join(charStack), "data": []} self.state = self.states["tagName"] return True # Didn't find the end tag. The last character in charStack could be # anything, so it has to be re-processed in the data state self.stream.unget(charStack.pop()) # The remaining characters are a prefix of the tag name, so they're # just letters and digits, so they can be output as character # tokens immediately self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</" + u"".join(charStack)}) self.state = self.states["data"] return True data = self.stream.char() if data in asciiLetters: self.currentToken = {"type": tokenTypes["EndTag"], "name": data, "data": []} self.state = self.states["tagName"] elif data == u">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-closing-tag-but-got-right-bracket"}) self.state = self.states["data"] elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-closing-tag-but-got-eof"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"}) self.state = self.states["data"] else: # XXX data can be _'_... 
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-closing-tag-but-got-char", "datavars": {"data": data}}) self.stream.unget(data) self.state = self.states["bogusComment"] return True def tagNameState(self): data = self.stream.char() if data in spaceCharacters: self.state = self.states["beforeAttributeName"] elif data == u">": self.emitCurrentToken() elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-tag-name"}) self.emitCurrentToken() elif data == u"/": if not self.processSolidusInTag(): self.state = self.states["beforeAttributeName"] else: self.currentToken["name"] += data # (Don't use charsUntil here, because tag names are # very short and it's faster to not do anything fancy) return True def beforeAttributeNameState(self): data = self.stream.char() if data in spaceCharacters: self.stream.charsUntil(spaceCharacters, True) elif data in asciiLetters: self.currentToken["data"].append([data, ""]) self.state = self.states["attributeName"] elif data == u">": self.emitCurrentToken() elif data == u"/": self.processSolidusInTag() elif data == u"'" or data == u'"' or data == u"=": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-character-in-attribute-name"}) self.currentToken["data"].append([data, ""]) self.state = self.states["attributeName"] elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-attribute-name-but-got-eof"}) self.emitCurrentToken() else: self.currentToken["data"].append([data, ""]) self.state = self.states["attributeName"] return True def attributeNameState(self): data = self.stream.char() leavingThisState = True emitToken = False if data == u"=": self.state = self.states["beforeAttributeValue"] elif data in asciiLetters: self.currentToken["data"][-1][0] += data +\ self.stream.charsUntil(asciiLetters, True) leavingThisState = False elif data == u">": # XXX If we emit here the attributes are converted to a dict # without being checked and when the code below runs we error # because data is a dict not a list emitToken = True elif data in spaceCharacters: self.state = self.states["afterAttributeName"] elif data == u"/": if not self.processSolidusInTag(): self.state = self.states["beforeAttributeName"] elif data == u"'" or data == u'"': self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-character-in-attribute-name"}) self.currentToken["data"][-1][0] += data leavingThisState = False elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-attribute-name"}) self.state = self.states["data"] emitToken = True else: self.currentToken["data"][-1][0] += data leavingThisState = False if leavingThisState: # Attributes are not dropped at this stage. That happens when the # start tag token is emitted so values can still be safely appended # to attributes, but we do want to report the parse error in time. 
    def afterAttributeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data == u"=":
            self.state = self.states["beforeAttributeValue"]
        elif data == u">":
            self.emitCurrentToken()
        elif data in asciiLetters:
            self.currentToken["data"].append([data, ""])
            self.state = self.states["attributeName"]
        elif data == u"/":
            if not self.processSolidusInTag():
                self.state = self.states["beforeAttributeName"]
        elif data == u"'" or data == u'"':
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-character-after-attribute-name"})
            self.currentToken["data"].append([data, ""])
            self.state = self.states["attributeName"]
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "expected-end-of-tag-but-got-eof"})
            self.emitCurrentToken()
        else:
            self.currentToken["data"].append([data, ""])
            self.state = self.states["attributeName"]
        return True

    def beforeAttributeValueState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data == u"\"":
            self.state = self.states["attributeValueDoubleQuoted"]
        elif data == u"&":
            self.state = self.states["attributeValueUnQuoted"]
            self.stream.unget(data)
        elif data == u"'":
            self.state = self.states["attributeValueSingleQuoted"]
        elif data == u">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "expected-attribute-value-but-got-right-bracket"})
            self.emitCurrentToken()
        elif data == u"=":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "equals-in-unquoted-attribute-value"})
            self.currentToken["data"][-1][1] += data
            self.state = self.states["attributeValueUnQuoted"]
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "expected-attribute-value-but-got-eof"})
            self.emitCurrentToken()
        else:
            self.currentToken["data"][-1][1] += data
            self.state = self.states["attributeValueUnQuoted"]
        return True

    def attributeValueDoubleQuotedState(self):
        data = self.stream.char()
        if data == "\"":
            self.state = self.states["afterAttributeValue"]
        elif data == u"&":
            self.processEntityInAttribute(u'"')
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-attribute-value-double-quote"})
            self.emitCurrentToken()
        else:
            self.currentToken["data"][-1][1] += data +\
              self.stream.charsUntil(("\"", u"&"))
        return True

    def attributeValueSingleQuotedState(self):
        data = self.stream.char()
        if data == "'":
            self.state = self.states["afterAttributeValue"]
        elif data == u"&":
            self.processEntityInAttribute(u"'")
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-attribute-value-single-quote"})
            self.emitCurrentToken()
        else:
            self.currentToken["data"][-1][1] += data +\
              self.stream.charsUntil(("'", u"&"))
        return True

    def attributeValueUnQuotedState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.states["beforeAttributeName"]
        elif data == u"&":
            self.processEntityInAttribute(None)
        elif data == u">":
            self.emitCurrentToken()
        elif data == u'"' or data == u"'" or data == u"=":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-character-in-unquoted-attribute-value"})
            self.currentToken["data"][-1][1] += data
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-attribute-value-no-quotes"})
            self.emitCurrentToken()
        else:
            self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
                frozenset(("&", ">", "<", "=", "'", '"')) | spaceCharacters)
        return True
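    # Illustrative sketch (not part of the original module): the allowedChar
    # argument threaded through processEntityInAttribute stops entity
    # consumption at the value's own delimiter. In a double-quoted value,
    # consumeEntity runs with allowedChar=u'"', so in value="a&" the "&"
    # directly before the closing quote is kept as a literal ampersand
    # instead of being misread as the start of an entity.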
tokenTypes["ParseError"], "data": "unexpected-character-in-unquoted-attribute-value"}) self.currentToken["data"][-1][1] += data elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-attribute-value-no-quotes"}) self.emitCurrentToken() else: self.currentToken["data"][-1][1] += data + self.stream.charsUntil( \ frozenset(("&", ">", "<", "=", "'", '"')) | spaceCharacters) return True def afterAttributeValueState(self): data = self.stream.char() if data in spaceCharacters: self.state = self.states["beforeAttributeName"] elif data == u">": self.emitCurrentToken() self.state = self.states["data"] elif data == u"/": if not self.processSolidusInTag(): self.state = self.states["beforeAttributeName"] elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-EOF-after-attribute-value"}) self.emitCurrentToken() self.stream.unget(data) self.state = self.states["data"] else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-character-after-attribute-value"}) self.stream.unget(data) self.state = self.states["beforeAttributeName"] return True def bogusCommentState(self): # Make a new comment token and give it as value all the characters # until the first > or EOF (charsUntil checks for EOF automatically) # and emit it. self.tokenQueue.append( {"type": tokenTypes["Comment"], "data": self.stream.charsUntil(u">")}) # Eat the character directly after the bogus comment which is either a # ">" or an EOF. self.stream.char() self.state = self.states["data"] return True def bogusCommentContinuationState(self): # Like bogusCommentState, but the caller must create the comment token # and this state just adds more characters to it self.currentToken["data"] += self.stream.charsUntil(u">") self.tokenQueue.append(self.currentToken) # Eat the character directly after the bogus comment which is either a # ">" or an EOF. 
    def markupDeclarationOpenState(self):
        charStack = [self.stream.char()]
        if charStack[-1] == u"-":
            charStack.append(self.stream.char())
            if charStack[-1] == u"-":
                self.currentToken = {"type": tokenTypes["Comment"],
                                     "data": u""}
                self.state = self.states["commentStart"]
                return True
        elif charStack[-1] in (u'd', u'D'):
            matched = True
            for expected in ((u'o', u'O'), (u'c', u'C'), (u't', u'T'),
                             (u'y', u'Y'), (u'p', u'P'), (u'e', u'E')):
                charStack.append(self.stream.char())
                if charStack[-1] not in expected:
                    matched = False
                    break
            if matched:
                self.currentToken = {"type": tokenTypes["Doctype"],
                                     "name": u"",
                                     "publicId": None, "systemId": None,
                                     "correct": True}
                self.state = self.states["doctype"]
                return True

        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "expected-dashes-or-doctype"})
        # charStack[:-2] consists of 'safe' characters ('-', 'd', 'o', etc)
        # so they can be copied directly into the bogus comment data, and only
        # the last character might be '>' or EOF and needs to be ungetted
        self.stream.unget(charStack.pop())
        self.currentToken = {"type": tokenTypes["Comment"],
                             "data": u"".join(charStack)}
        self.state = self.states["bogusCommentContinuation"]
        return True

    def commentStartState(self):
        data = self.stream.char()
        if data == "-":
            self.state = self.states["commentStartDash"]
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "incorrect-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.currentToken["data"] += data + self.stream.charsUntil(u"-")
            self.state = self.states["comment"]
        return True

    def commentStartDashState(self):
        data = self.stream.char()
        if data == "-":
            self.state = self.states["commentEnd"]
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "incorrect-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.currentToken["data"] += "-" + data +\
              self.stream.charsUntil(u"-")
            self.state = self.states["comment"]
        return True

    def commentState(self):
        data = self.stream.char()
        if data == u"-":
            self.state = self.states["commentEndDash"]
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.currentToken["data"] += data + self.stream.charsUntil(u"-")
        return True

    def commentEndDashState(self):
        data = self.stream.char()
        if data == u"-":
            self.state = self.states["commentEnd"]
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-comment-end-dash"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.currentToken["data"] += u"-" + data +\
              self.stream.charsUntil(u"-")
            # Consume the next character which is either a "-" or an EOF as
            # well so if there's a "-" directly after the "-" we go nicely to
            # the "comment end state" without emitting a ParseError() there.
            self.stream.char()
        return True
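    # Illustrative sketch (not part of the original module): for the input
    # "<!-->", markupDeclarationOpenState consumes "--", and commentStartState
    # then sees ">" immediately, queuing an "incorrect-comment" ParseError and
    # emitting an empty Comment token. A well-formed "<!--x-->" instead flows
    # commentStart -> comment -> commentEndDash -> commentEnd before the
    # Comment token with data u"x" is emitted.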
    def commentEndState(self):
        data = self.stream.char()
        if data == u">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data == u"-":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-dash-after-double-dash-in-comment"})
            self.currentToken["data"] += data
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-comment-double-dash"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            # XXX
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-char-in-comment"})
            self.currentToken["data"] += u"--" + data
            self.state = self.states["comment"]
        return True

    def doctypeState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.states["beforeDoctypeName"]
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "need-space-after-doctype"})
            self.stream.unget(data)
            self.state = self.states["beforeDoctypeName"]
        return True

    def beforeDoctypeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == u">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "expected-doctype-name-but-got-right-bracket"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "expected-doctype-name-but-got-eof"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.currentToken["name"] = data
            self.state = self.states["doctypeName"]
        return True

    def doctypeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.currentToken["name"] =\
              self.currentToken["name"].translate(asciiUpper2Lower)
            self.state = self.states["afterDoctypeName"]
        elif data == u">":
            self.currentToken["name"] =\
              self.currentToken["name"].translate(asciiUpper2Lower)
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-doctype-name"})
            self.currentToken["correct"] = False
            self.currentToken["name"] =\
              self.currentToken["name"].translate(asciiUpper2Lower)
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.currentToken["name"] += data
        return True

    def afterDoctypeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == u">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data is EOF:
            self.currentToken["correct"] = False
            self.stream.unget(data)
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-doctype"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            if data in (u"p", u"P"):
                matched = True
                for expected in ((u"u", u"U"), (u"b", u"B"), (u"l", u"L"),
                                 (u"i", u"I"), (u"c", u"C")):
                    data = self.stream.char()
                    if data not in expected:
                        matched = False
                        break
                if matched:
                    self.state = self.states["beforeDoctypePublicIdentifier"]
                    return True
            elif data in (u"s", u"S"):
                matched = True
                for expected in ((u"y", u"Y"), (u"s", u"S"), (u"t", u"T"),
                                 (u"e", u"E"), (u"m", u"M")):
                    data = self.stream.char()
                    if data not in expected:
                        matched = False
                        break
                if matched:
                    self.state = self.states["beforeDoctypeSystemIdentifier"]
                    return True

            # All the characters read before the current 'data' will be
            # [a-zA-Z], so they're garbage in the bogus doctype and can be
            # discarded; only the latest character might be '>' or EOF
            # and needs to be ungetted
            self.stream.unget(data)
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "expected-space-or-right-bracket-in-doctype",
                                    "datavars": {"data": data}})
            self.currentToken["correct"] = False
            self.state = self.states["bogusDoctype"]
        return True
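    # Illustrative sketch (not part of the original module): "<!DOCTYPE html>"
    # walks markupDeclarationOpen -> doctype -> beforeDoctypeName ->
    # doctypeName, where ">" lowercases the name and emits
    # {"type": tokenTypes["Doctype"], "name": u"html",
    #  "publicId": None, "systemId": None, "correct": True}.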
    def beforeDoctypePublicIdentifierState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == "\"":
            self.currentToken["publicId"] = u""
            self.state = self.states["doctypePublicIdentifierDoubleQuoted"]
        elif data == "'":
            self.currentToken["publicId"] = u""
            self.state = self.states["doctypePublicIdentifierSingleQuoted"]
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.state = self.states["bogusDoctype"]
        return True

    def doctypePublicIdentifierDoubleQuotedState(self):
        data = self.stream.char()
        if data == "\"":
            self.state = self.states["afterDoctypePublicIdentifier"]
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.currentToken["publicId"] += data
        return True

    def doctypePublicIdentifierSingleQuotedState(self):
        data = self.stream.char()
        if data == "'":
            self.state = self.states["afterDoctypePublicIdentifier"]
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.currentToken["publicId"] += data
        return True

    def afterDoctypePublicIdentifierState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == "\"":
            self.currentToken["systemId"] = u""
            self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
        elif data == "'":
            self.currentToken["systemId"] = u""
            self.state = self.states["doctypeSystemIdentifierSingleQuoted"]
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.state = self.states["bogusDoctype"]
        return True

    def beforeDoctypeSystemIdentifierState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == "\"":
            self.currentToken["systemId"] = u""
            self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
        elif data == "'":
            self.currentToken["systemId"] = u""
            self.state = self.states["doctypeSystemIdentifierSingleQuoted"]
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.state = self.states["bogusDoctype"]
        return True
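    # Illustrative sketch (not part of the original module): in
    # '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">',
    # afterDoctypeNameState case-insensitively matches the keyword PUBLIC,
    # beforeDoctypePublicIdentifierState sees the opening quote, and the
    # double-quoted state accumulates the publicId one character at a time
    # until the closing quote hands control to afterDoctypePublicIdentifier.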
    def doctypeSystemIdentifierDoubleQuotedState(self):
        data = self.stream.char()
        if data == "\"":
            self.state = self.states["afterDoctypeSystemIdentifier"]
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.currentToken["systemId"] += data
        return True

    def doctypeSystemIdentifierSingleQuotedState(self):
        data = self.stream.char()
        if data == "'":
            self.state = self.states["afterDoctypeSystemIdentifier"]
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.currentToken["systemId"] += data
        return True

    def afterDoctypeSystemIdentifierState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-char-in-doctype"})
            self.state = self.states["bogusDoctype"]
        return True

    def bogusDoctypeState(self):
        data = self.stream.char()
        if data == u">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data is EOF:
            # XXX EMIT
            self.stream.unget(data)
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            pass
        return True
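    # Illustrative sketch (not part of the original module): bogusDoctypeState
    # discards everything up to ">" (or EOF). So "<!DOCTYPE html foo>" queues
    # an "expected-space-or-right-bracket-in-doctype" ParseError, marks the
    # token "correct": False, swallows "foo", and still emits the Doctype
    # token when the ">" arrives.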