Exemplo n.º 1
0
    def parse(self, markup_text):
        """Parse HTML markup.

        Resets the parser state, feeds the markup to the underlying parser,
        then stores the root tag and records how many tags were collected.

        Args:
            markup_text: The HTML markup text to parse. A false value (None
                or an empty string) is not fed to the parser.

        Returns:
            True if the markup was parsed successfully.

        Raises:
            Error: If the underlying parser raises HTMLParseError.

        """
        self._logger.info("Parsing HTML markup")
        # Start with the root of the document
        self._parsed_data = []
        self._root = Tag(name='root', children=[])
        self._current_tag = self._root
        self._number_of_tags = 0
        self.reset()

        if markup_text:
            # Fix any issues before we call the parser: an empty closing tag
            # is rewritten as a closing anchor tag. Only report the fix when
            # it actually happened so the debug log is not misleading.
            if '</>' in markup_text:
                markup_text = markup_text.replace('</>', '</a>')
                self._logger.debug("Replaced </> with </a>")
            try:
                # Feed our data to the parser
                self.feed(markup_text)
            except HTMLParser.HTMLParseError as excep:
                # Interpolate here; exceptions do not apply %-formatting to
                # their arguments the way logging calls do.
                raise Error("Error parsing HTML: %s" % str(excep))

        # Store the root tag to the list of HTML tags
        self._store_tag(self._root)
        self._number_of_tags = len(self._parsed_data)
        self._logger.debug("finished parsing. Position "
                           "- %s", str(self.getpos()))
        self._logger.debug("%s tags processed", str(self._number_of_tags))

        return True
Exemplo n.º 2
0
    def __init__(self, log_handler=None):
        """Create and initialize the HTML parser.

        Prepare the parent class for processing, reset the root Tag, tag
        counter, and rate limiting.

        Args:
            log_handler: The log handler to use instead of the default. When
                omitted, a StreamHandler writing to stdout at INFO level is
                created.

        """
        self._logger = logging.getLogger(__name__)
        if log_handler:
            handler = log_handler
        else:
            formatter = logging.Formatter('%(asctime)s - %(levelname)s:'
                                          '%(name)s:%(message)s')
            handler = logging.StreamHandler(stream=sys.stdout)
            handler.setLevel(logging.INFO)
            handler.setFormatter(formatter)
        # NOTE(review): the logger is shared per module name, so every
        # instance created without log_handler attaches another stdout
        # handler to it - confirm whether duplicate output is intended.
        self._logger.addHandler(handler)

        self._parsed_data = []
        self._root = Tag(name='root', children=[])
        self._current_tag = self._root
        self._number_of_tags = 0
        HTMLParser.HTMLParser.__init__(self)
        self._rate_limit_counter_time = datetime.datetime.now()
        # The value is a datetime, not a number, so it must be formatted
        # with %s; %i makes the logging call fail when DEBUG is enabled.
        self._logger.debug("rate_limit_counter_time = %s",
                           self._rate_limit_counter_time)
Exemplo n.º 3
0
    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing HTML tag.

        Called by the parser for a tag that opens and closes in one step.
        The tag is attached as a child of the current tag; the current tag
        itself is left unchanged.

        Args:
            tag: The HTML tag.
            attrs: A list of the tag's attributes.

        """
        self._logger.debug("start/end tag - %s / %s", tag, str(attrs))
        parent = self._current_tag
        if tag == 'br':
            # A line break contributes a single space to the parent's text.
            parent.string_concat_list.append(" ")
        child = Tag(name=tag, attributes=attrs, parent=parent)
        if not parent.children:
            parent.children = []
        parent.children.append(child)
Exemplo n.º 4
0
    def handle_starttag(self, tag, attrs):
        """Handle an opening HTML tag.

        Called by the parser when an opening tag is found. The new tag is
        attached as a child of the current tag and then becomes the current
        tag itself.

        Args:
            tag: The HTML tag.
            attrs: A list of the tag's attributes.

        """
        self._logger.debug("start tag - %s / %s", tag, str(attrs))
        parent = self._current_tag
        child = Tag(name=tag, attributes=attrs, parent=parent, data='')
        self._logger.debug("parent tag: %s", str(parent.name))
        if not parent.children:
            parent.children = []
        parent.children.append(child)
        # Descend: subsequent events are attached beneath the new tag.
        self._current_tag = child
Exemplo n.º 5
0
    def unknown_decl(self, data):
        """Handle an unknown declaration.

        Called by the parser for a declaration it does not recognise. The
        text, with line breaks removed, is attached to the current tag as an
        'unknown_declaration' child. Empty declarations are ignored.

        Args:
            data: The declaration.

        """
        self._logger.debug("declaration - %s", data)
        if not data:
            return
        text = data.replace("\n", "").replace("\r", "")
        if not text:
            # Nothing left once the line breaks are stripped.
            return
        parent = self._current_tag
        node = Tag(name='unknown_declaration', parent=parent, data=text)
        if not parent.children:
            parent.children = []
        parent.children.append(node)
Exemplo n.º 6
0
    def handle_decl(self, decl):
        """Handle a declaration.

        Called by the parser when a declaration is found. The text, with
        line breaks removed, is attached to the current tag as a
        'declaration' child. Empty declarations are ignored.

        Args:
            decl: The declaration.

        """
        self._logger.debug("declaration - %s", decl)
        if not decl:
            return
        text = decl.replace("\n", "").replace("\r", "")
        if not text:
            # Nothing left once the line breaks are stripped.
            return
        parent = self._current_tag
        node = Tag(name='declaration', parent=parent, data=text)
        if not parent.children:
            parent.children = []
        parent.children.append(node)
Exemplo n.º 7
0
    def handle_comment(self, data):
        """Handle a comment.

        Called by the parser when a comment is found. The text, with line
        breaks removed, is attached to the current tag as a 'comment' child.
        Empty comments are ignored.

        Args:
            data: The comment.

        """
        self._logger.debug("comment - %s", data)
        if not data:
            return
        text = data.replace("\n", "").replace("\r", "")
        if not text:
            # Nothing left once the line breaks are stripped.
            return
        parent = self._current_tag
        node = Tag(name='comment', parent=parent, data=text)
        if not parent.children:
            parent.children = []
        parent.children.append(node)