def __init__(
    self,
    stream,
    encoding=None,
    parseMeta=True,
    useChardet=True,
    lowercaseElementName=False,
    lowercaseAttrName=False,
):
    """Set up the underlying tokenizer and reset the stripping state.

    Every argument is forwarded positionally, unchanged, to
    ``HTMLTokenizer.__init__``.  The two ``lowercase*`` flags default to
    ``False`` here: the case-matching defaults are overridden because only
    lowercase HTML is ever output.  (The original author noted that this
    approach does not seem ideal.)
    """
    HTMLTokenizer.__init__(
        self,
        stream,
        encoding,
        parseMeta,
        useChardet,
        lowercaseElementName,
        lowercaseAttrName,
    )

    # Stripping indicator; 0 means no stripping is in progress.
    self.stripping = 0
def __init__(
    self,
    stream,
    encoding=None,
    parseMeta=True,
    lowercaseElementName=False,
    lowercaseAttrName=False,
):
    """Set up the underlying tokenizer.

    Every argument is forwarded positionally, unchanged, to
    ``HTMLTokenizer.__init__``.  The two ``lowercase*`` flags default to
    ``False`` here: the case-matching defaults are overridden because only
    lowercase HTML is ever output.  (The original author noted that this
    approach does not seem ideal.)
    """
    HTMLTokenizer.__init__(
        self,
        stream,
        encoding,
        parseMeta,
        lowercaseElementName,
        lowercaseAttrName,
    )
def __iter__(self):
    """Yield sanitized tokens, dropping everything inside risky block elements.

    While ``self.strip_tokens`` is truthy, a start tag whose lowercased name
    is in ``HTMLSanitizerMixin.unacceptable_block_elements`` (e.g. script)
    switches the iterator into stripping mode, and the matching end tag
    switches it back.  ``self.stripping`` is a counter rather than a boolean
    so that nested risky elements are handled.

    Tokens seen while the counter is positive are discarded.  All other
    tokens are passed through ``self.sanitize_token`` and yielded when the
    result is truthy.
    """
    for token in HTMLTokenizer.__iter__(self):
        is_risky_tag = (
            self.strip_tokens
            and token["type"] in ("StartTag", "EndTag")
            and token["name"].lower()
            in HTMLSanitizerMixin.unacceptable_block_elements
        )
        if is_risky_tag:
            if token["type"] == "StartTag":
                self.stripping += 1
            elif self.stripping > 0:
                # Only close a region that was actually opened.  Letting a
                # stray end tag push the counter below zero would mean the
                # next risky start tag merely brings it back to 0, and the
                # contents of that element would be yielded instead of
                # stripped.
                self.stripping -= 1

        # Only yield tokens if we are not in stripping mode.
        if self.stripping < 1:
            token = self.sanitize_token(token, self.strip_tokens)
            if token:
                yield token
def parse(self, stream, output=True):
    """Tokenize ``stream`` with ``HTMLTokenizer`` and optionally print tokens.

    The stream is always tokenized to the end; ``output`` only controls
    whether each token is printed as it is produced.
    """
    for tok in HTMLTokenizer(stream):
        if not output:
            # Still consume the token, just do not echo it.
            continue
        print(tok)
def __iter__(self):
    """Yield each token from the base tokenizer after sanitizing it.

    Every token produced by ``HTMLTokenizer.__iter__`` is passed through
    ``self.sanitize_token``; falsy results are dropped.
    """
    for raw_token in HTMLTokenizer.__iter__(self):
        cleaned = self.sanitize_token(raw_token)
        if not cleaned:
            continue
        yield cleaned