def _restrict_content_type(self, curi):
    """
    Decide whether the `CrawlUri` should be processed at all.

    Returns ``True`` iff the Content-Type extracted from *curi* is one of
    the (X)HTML / WML types this extractor knows how to parse.
    """
    (ctype, _enc) = get_content_type_encoding(curi)
    return ctype in ("text/html", "application/xhtml", "text/vnd.wap.wml",
                     "application/vnd.wap.wml", "application/vnd.wap.xhtm")
def __call__(self, curi):
    """
    Actually extract links from the html content if the content type matches.

    Returns the (possibly updated) *curi*: untouched when the content type
    is not one we handle or when link extraction was already marked as
    finished; otherwise after running every tag found by the
    ``_tag_extractor`` regex through the meta / generic-tag processors.
    """
    # Skip content types we cannot parse (see _restrict_content_type).
    if not self._restrict_content_type(curi):
        return curi
    # Skip URIs where an earlier stage already finished link extraction.
    if CURI_EXTRACTION_FINISHED in curi.optional_vars and \
            curi.optional_vars[CURI_EXTRACTION_FINISHED] == CURI_OPTIONAL_TRUE:
        return curi

    (_type, encoding) = get_content_type_encoding(curi)
    try:
        content = curi.content_body.decode(encoding)
    except Exception:
        # Best effort: on any decode failure fall back to the raw body
        # (extraction then operates on the undecoded bytes).
        content = curi.content_body

    parsed_url = urlparse.urlparse(curi.url)
    # Remember the page URL as base for resolving relative links.
    self._base_url = curi.url

    # iterate over all tags
    # NOTE(review): the group numbers below (1, 3, 5, 6, 7, 8) are tied to
    # the structure of the ``self._tag_extractor`` pattern, which is defined
    # elsewhere — presumably group 1 = <script>, 3 = <style>, 5 = tag body,
    # 6 = tag name, 7 = meta marker, 8 = comment; confirm against the regex.
    # ``match.start(n)`` is -1 for a non-participating group, so ``> 0``
    # also treats a group matching at offset 0 as absent — assumed to be
    # impossible here because every group sits after a leading '<'.
    for tag in self._tag_extractor.finditer(content):
        if tag.start(8) > 0:
            # a html comment, ignore
            continue
        elif tag.start(7) > 0:
            # a meta tag
            curi = self._process_meta(curi, parsed_url, content,
                    (tag.start(5), tag.end(5)))
        elif tag.start(5) > 0:
            # generic <whatever tag
            curi = self._process_generic_tag(curi, parsed_url, content,
                    (tag.start(6), tag.end(6)),
                    (tag.start(5), tag.end(5)))
        elif tag.start(1) > 0:
            # <script> tag
            # TODO no script handling so far
            pass
        elif tag.start(3) > 0:
            # <style> tag
            # TODO no tag handling so far
            pass

    return curi