def parse(self, markup_text):
    """Parse HTML markup into a fresh tag tree.

    Resets the parser state, repairs a known markup defect (``</>`` is
    rewritten to ``</a>``), feeds the text to the underlying HTMLParser,
    and finally hands the root tag to ``self._store_tag``.

    Args:
        markup_text: The HTML markup text to parse. ``None`` or an empty
            string is treated as an empty document.

    Returns:
        True if the markup was parsed successfully.

    Raises:
        Error: If the underlying parser raises an HTMLParseError.
    """
    self._logger.info("Parsing HTML markup")

    # Start with the root of the document
    self._parsed_data = []
    self._root = Tag(name='root', children=[])
    self._current_tag = self._root
    self._number_of_tags = 0
    self.reset()

    # Nothing is fed for None/empty input; feeding None would otherwise fail
    # inside the base parser with an unrelated TypeError.
    if markup_text:
        # Fix any issues before we call the parser. Only report the repair
        # when it actually happened so the debug log stays truthful.
        if '</>' in markup_text:
            markup_text = markup_text.replace('</>', '</a>')
            self._logger.debug("Replaced </> with </a>")

        try:
            # Feed our data to the parser
            self.feed(markup_text)
        except HTMLParser.HTMLParseError as excep:
            # Interpolate here: exception constructors do not apply
            # %-formatting to their arguments the way logging calls do.
            raise Error("Error parsing HTML: %s" % str(excep))

    # Store the root tag to the list of HTML tags
    # (presumably _store_tag fills self._parsed_data; it is counted below).
    self._store_tag(self._root)
    self._number_of_tags = len(self._parsed_data)

    self._logger.debug("finished parsing. Position - %s",
                       str(self.getpos()))
    self._logger.debug("%s tags processed", str(self._number_of_tags))
    return True
def __init__(self, log_handler=None):
    """Create and initialize the HTML parser.

    Prepare the parent class for processing, reset the root Tag, tag
    counter, and rate limiting timestamp.

    Args:
        log_handler: The log handler to use instead of the default. When
            omitted, a stdout StreamHandler at INFO level is installed,
            but only if the logger does not already have a handler.
    """
    self._logger = logging.getLogger(__name__)

    # The logger is shared by every instance created in this module, so a
    # handler must be attached at most once; otherwise each additional
    # parser instance duplicates every log line.
    if log_handler is not None:
        if log_handler not in self._logger.handlers:
            self._logger.addHandler(log_handler)
    elif not self._logger.handlers:
        formatter = logging.Formatter('%(asctime)s - %(levelname)s:'
                                      '%(name)s:%(message)s')
        handler = logging.StreamHandler(stream=sys.stdout)
        handler.setLevel(logging.INFO)
        handler.setFormatter(formatter)
        self._logger.addHandler(handler)

    # Start with an empty document: a synthetic root tag with no children.
    self._parsed_data = []
    self._root = Tag(name='root', children=[])
    self._current_tag = self._root
    self._number_of_tags = 0

    HTMLParser.HTMLParser.__init__(self)

    self._rate_limit_counter_time = datetime.datetime.now()
    # The value is a datetime, so it must be logged with %s; %i requires a
    # number and makes the logging call fail when debug output is enabled.
    self._logger.debug("rate_limit_counter_time = %s",
                       self._rate_limit_counter_time)
def handle_startendtag(self, tag, attrs):
    """Handle a tag that opens and closes in one step.

    The tag is recorded as a child of the current tag; the current tag
    itself is left unchanged. For a 'br' tag a single space is first
    appended to the current tag's string_concat_list.

    Args:
        tag: The HTML tag.
        attrs: A list of the tag's attributes.
    """
    self._logger.debug("start/end tag - %s / %s", tag, str(attrs))

    parent = self._current_tag
    if tag == 'br':
        parent.string_concat_list.append(" ")

    child = Tag(name=tag, attributes=attrs, parent=parent)

    # Make sure there is a list to append to before attaching the child.
    if not parent.children:
        parent.children = []
    parent.children.append(child)
def handle_starttag(self, tag, attrs):
    """Handle an opening HTML tag.

    A new Tag with empty data is attached as a child of the current tag
    and then becomes the current tag itself.

    Args:
        tag: The HTML tag.
        attrs: A list of the tag's attributes.
    """
    self._logger.debug("start tag - %s / %s", tag, str(attrs))

    parent = self._current_tag
    child = Tag(name=tag, attributes=attrs, parent=parent, data='')
    self._logger.debug("parent tag: %s", str(parent.name))

    # Make sure there is a list to append to before attaching the child.
    if not parent.children:
        parent.children = []
    parent.children.append(child)

    # Descend: later events are attached beneath the tag just opened.
    self._current_tag = child
def unknown_decl(self, data):
    """Handle an unknown declaration.

    Line breaks are removed from the declaration text; if anything is
    left, it is stored as an 'unknown_declaration' child of the current
    tag.

    Args:
        data: The declaration.
    """
    self._logger.debug("declaration - %s", data)

    if not data:
        return

    text = data.replace("\n", "").replace("\r", "")
    if not text:
        # The declaration consisted only of line breaks.
        return

    parent = self._current_tag
    child = Tag(name='unknown_declaration', parent=parent, data=text)
    if not parent.children:
        parent.children = []
    parent.children.append(child)
def handle_decl(self, decl):
    """Handle a declaration.

    Line breaks are removed from the declaration text; if anything is
    left, it is stored as a 'declaration' child of the current tag.

    Args:
        decl: The declaration.
    """
    self._logger.debug("declaration - %s", decl)

    if not decl:
        return

    text = decl.replace("\n", "").replace("\r", "")
    if not text:
        # The declaration consisted only of line breaks.
        return

    parent = self._current_tag
    child = Tag(name='declaration', parent=parent, data=text)
    if not parent.children:
        parent.children = []
    parent.children.append(child)
def handle_comment(self, data):
    """Handle a comment.

    Line breaks are removed from the comment text; if anything is left,
    it is stored as a 'comment' child of the current tag.

    Args:
        data: The comment.
    """
    self._logger.debug("comment - %s", data)

    if not data:
        return

    text = data.replace("\n", "").replace("\r", "")
    if not text:
        # The comment consisted only of line breaks.
        return

    parent = self._current_tag
    child = Tag(name='comment', parent=parent, data=text)
    if not parent.children:
        parent.children = []
    parent.children.append(child)