示例#1
0
 def _restrict_content_type(self, curi):
     """
     Tell whether the Content-Type of the given `CrawlUri` is one we
     want to extract links from.

     Only the type part returned by `get_content_type_encoding` is looked
     at, the encoding part is ignored. The type has to be exactly equal to
     one of the names listed below.
     """
     media_type, _ignored_encoding = get_content_type_encoding(curi)
     # NOTE(review): "application/xhtml" and "application/vnd.wap.xhtm" look
     # like shortened type names -- confirm they match what
     # `get_content_type_encoding` actually returns.
     return media_type in (
         "text/html",
         "application/xhtml",
         "text/vnd.wap.wml",
         "application/vnd.wap.wml",
         "application/vnd.wap.xhtm",
     )
示例#2
0
    def __call__(self, curi):
        """
        Run the link extraction on the body of `curi` and return the curi.

        The curi is handed back untouched if `_restrict_content_type` rejects
        its content type, or if its optional vars already mark the extraction
        as finished. Otherwise each match of `self._tag_extractor` is passed
        on to `_process_meta` or `_process_generic_tag`; html comments, script
        tags and style tags are skipped.
        """
        if not self._restrict_content_type(curi):
            return curi

        already_extracted = (
            CURI_EXTRACTION_FINISHED in curi.optional_vars
            and curi.optional_vars[CURI_EXTRACTION_FINISHED]
            == CURI_OPTIONAL_TRUE
        )
        if already_extracted:
            return curi

        _ctype, charset = get_content_type_encoding(curi)

        try:
            text = curi.content_body.decode(charset)
        except Exception:
            # best effort: keep working on the undecoded body
            text = curi.content_body

        url_parts = urlparse.urlparse(curi.url)
        self._base_url = curi.url

        # walk over everything the tag expression finds
        for match in self._tag_extractor.finditer(text):

            if match.start(8) > 0:
                # group 8: a html comment, nothing to extract
                continue

            if match.start(7) > 0:
                # group 7: a meta tag
                curi = self._process_meta(
                    curi, url_parts, text, match.span(5))
                continue

            if match.start(5) > 0:
                # group 5: generic <whatever tag
                curi = self._process_generic_tag(
                    curi, url_parts, text, match.span(6), match.span(5))
                continue

            # TODO <script> (group 1) and <style> (group 3) tags are not
            # handled so far

        return curi