Exemplo n.º 1
0
    def preprocess(cls, dom):
        """Recursively remove useless/spam nodes from the subtree under ``dom``.

        A child of ``dom`` is dropped with ``drop_tree()`` when any of the
        following holds:

        * it is not an ``HtmlElement``;
        * its tag is in ``DomTreeHelper.filtered_preprocess_tags``;
        * ``misc.find_list`` reports a match for its ``style`` attribute
          against ``DomTreeHelper.filtered_styles``;
        * ``misc.find_list`` reports a match for its ``class`` or ``id``
          attribute against ``DomTreeHelper.filtered_classids``.

        Every child that survives is preprocessed in turn. The tree is
        modified in place and nothing is returned.

        :param dom: node whose children are filtered.
        """
        def matches(value, filters):
            # A missing attribute can never match a filter list.
            return value is not None and misc.find_list(value, filters)

        # Iterate over the child list captured before any node is dropped.
        for child in dom.getchildren():
            unwanted = (
                not isinstance(child, HtmlElement)
                or child.tag in DomTreeHelper.filtered_preprocess_tags
                # Look at the child's own style, like the class/id checks do;
                # inspecting the parent's style would leave a filtered element
                # in the tree and only strip its children.
                or matches(child.get("style"), DomTreeHelper.filtered_styles)
                or matches(child.get("class"), DomTreeHelper.filtered_classids)
                or matches(child.get("id"), DomTreeHelper.filtered_classids)
            )
            if unwanted:
                child.drop_tree()
            else:
                DomTreeHelper.preprocess(child)
Exemplo n.º 2
0
 def is_domain_url(self, url):
     """Return True when ``url`` has a root-like path and no query string.

     The path counts as root-like when it is empty, equals ``"/"``, or
     ``misc.find_list`` reports a hit for the "path starts with filename"
     predicate over ``domain_url_filenames``.

     NOTE(review): ``misc.find_list`` is handed a predicate here; confirm the
     helper accepts a callable as its first argument.

     :param url: URL string to classify.
     :return: ``True`` or ``False``.
     """
     parts = urlparse.urlparse(url)
     path = parts.path

     def path_starts_with(filename):
         return path.startswith(filename)

     # Short-circuit order matters: the helper is only consulted for paths
     # that are neither empty nor "/".
     root_like = (
         not path
         or path == "/"
         or misc.find_list(path_starts_with, domain_url_filenames)
     )
     return bool(root_like and not parts.query)
Exemplo n.º 3
0
    def validate(self, url, html, headers, extras = None):
        """Decide whether a fetched document should be accepted.

        Two checks are applied in order:

        1. Content type: if a ``Content-Type`` header is present and
           ``misc.find_list`` finds no match for its lower-cased value in
           ``self._settings["general_crawl_policies"]["supported_content_types"]``,
           the document is rejected.
        2. Emptiness: the document is rejected when the ``Content-Length``
           header is ``"0"`` (after stripping whitespace) or ``html`` has
           zero length.

        :param url: URL of the document (not used by the checks here).
        :param html: document body; its length is inspected.
        :param headers: mapping of response header names to values.
        :param extras: optional extra data (not used by the checks here).
        :return: ``(True, None)`` when accepted, otherwise
                 ``(False, reason)`` with a human-readable reason string.
        """
        # content type filtering
        content_type = headers.get('Content-Type', None)
        if content_type is not None:
            supported = self._settings["general_crawl_policies"]["supported_content_types"]
            if not misc.find_list(content_type.lower(), supported):
                # The rejection tuple must be returned; as a bare expression
                # it would be evaluated and thrown away.
                return False, "filtered by content_type %s" % content_type

        # doc check
        # `in` instead of has_key(): works on both Python 2 and 3 mappings.
        declared_empty = ('Content-Length' in headers
                          and headers['Content-Length'].strip() == "0")
        if declared_empty or len(html) == 0:
            return False, "doc is empty"

        return True, None