def accept_reject_or_neither(self, url, parent_page=None):
    '''
    Evaluate `url` against the rules in `self.scope`.

    Returns `True` (accepted), `False` (rejected), or `None` (no decision).
    `None` usually means rejected, unless `max_hops_off` comes into play.

    Checks are applied in this order, first hit wins:

    1. scheme other than http/https -> `False`
    2. `parent_page.hops_from_seed` has reached `scope["max_hops"]`
       -> `False`
    3. any rule in `scope["blocks"]` applies -> `False`
    4. any rule in `scope["accepts"]` applies -> `True`

    Args:
        url: candidate url; anything that is not already a
            `urlcanon.ParsedUrl` is passed through `urlcanon.semantic`
        parent_page: optional page the url was found on; its `url`,
            `redirect_url` and `hops_from_seed` attributes are read
    '''
    if not isinstance(url, urlcanon.ParsedUrl):
        url = urlcanon.semantic(url)

    if url.scheme not in (b'http', b'https'):
        # XXX doesn't belong here maybe (where? worker ignores unknown
        # schemes?)
        return False

    # rules are tried against both the parent's url and the url it
    # redirected to, when there is one
    try_parent_urls = []
    if parent_page:
        try_parent_urls.append(urlcanon.semantic(parent_page.url))
        if parent_page.redirect_url:
            try_parent_urls.append(
                    urlcanon.semantic(parent_page.redirect_url))

    def rule_matches(rule_spec):
        # build the rule from its keyword spec and test it with each known
        # parent url, or with no parent url at all when there is none
        rule = urlcanon.MatchRule(**rule_spec)
        if try_parent_urls:
            return any(
                    rule.applies(url, parent_url)
                    for parent_url in try_parent_urls)
        return rule.applies(url)

    # enforce max_hops
    if (parent_page and "max_hops" in self.scope
            and parent_page.hops_from_seed >= self.scope["max_hops"]):
        return False

    # enforce reject rules
    if any(rule_matches(spec) for spec in self.scope.get("blocks") or ()):
        return False

    # honor accept rules; a scope without "accepts" simply has none, the
    # same way a scope without "blocks" has no reject rules
    if any(rule_matches(spec) for spec in self.scope.get("accepts") or ()):
        return True

    # no decision if we reach here
    return None
def test_match_rules():
    '''
    Exercise `urlcanon.MatchRule` construction and `applies()` for each
    kind of rule argument used here: `surt`, `ssurt`, `regex`, `substring`,
    `domain`, `parent_url_regex`, and the `url_match`/`value` pair.
    '''
    # surt rule, built from the surt of a canonicalized url
    rule = urlcanon.MatchRule(
            surt=urlcanon.semantic(b'http://example.com/foo/bar').surt())
    assert not rule.applies('hTTp://EXAmple.com.../FOo/Bar#zuh')
    assert rule.applies('http://example.com/foo/bar')
    assert not rule.applies('http://example.com/foo/baz')

    # ssurt rule given as bytes
    rule = urlcanon.MatchRule(
            ssurt=urlcanon.semantic(b'http://example.com/foo/bar').ssurt())
    assert not rule.applies('hTTp://EXAmple.com.../FOo/Bar#zuh')
    assert rule.applies(b'http://example.com/foo/bar')
    assert not rule.applies('http://example.com/foo/baz')

    # same ssurt rule given as str instead of bytes
    rule = urlcanon.MatchRule(
            ssurt=urlcanon.semantic(
                'http://example.com/foo/bar').ssurt().decode('ascii'))
    assert not rule.applies('hTTp://EXAmple.com.../FOo/Bar#zuh')
    assert rule.applies(b'http://example.com/foo/bar')
    assert not rule.applies('http://example.com/foo/baz')

    # url_match/value form; raw bytes literal so that `\.` reaches the
    # regex engine without being an invalid string escape
    rule = urlcanon.MatchRule(
            url_match='REGEX_MATCH', value=br'^.*/audio_file/.*\.mp3$')
    assert not rule.applies('http://foo.com/some.mp3')
    assert rule.applies('http://foo.com/blah/audio_file/some.mp3')

    rule = urlcanon.MatchRule(
            url_match='SURT_MATCH', value=b'http://(com,vimeocdn,')
    assert rule.applies('http://a.b.vimeocdn.com/blahblah')
    assert not rule.applies('https://a.b.vimeocdn.com/blahblah')

    # construction only: nothing is asserted about these two rules beyond
    # the constructor accepting the arguments
    urlcanon.MatchRule(
            url_match='STRING_MATCH', value=b'ec-media.soundcloud.com')
    urlcanon.MatchRule(
            regex=br'^https?://twitter\.com.*$')

    rule = urlcanon.MatchRule(substring=b'facebook.com')
    assert rule.applies('https://www.facebook.com/whatevz')

    # rule that also constrains the parent url: it does not apply unless a
    # matching parent_url is supplied
    rule = urlcanon.MatchRule(
            regex=b'^https?://(www.)?youtube.com/watch?.*$',
            parent_url_regex=b'^https?://(www.)?youtube.com/user/.*$')
    assert not rule.applies('https://www.youtube.com/watch?v=dUIn5OAPS5s')
    assert rule.applies(
            'https://www.youtube.com/watch?v=dUIn5OAPS5s',
            parent_url='https://www.youtube.com/user/SonoraSantaneraVEVO')

    # domain combined with a regex on the url
    rule = urlcanon.MatchRule(
            domain=b'twitter.com', url_match='REGEX_MATCH',
            value=b'^.*lang=(?!en).*$')
    assert not rule.applies('https://twitter.com/twit')
    assert not rule.applies('https://twitter.com/twit?lang=en')
    assert rule.applies('https://twitter.com/twit?lang=es')
def is_in_scope(self, url, parent_page=None):
    '''
    Return `True` if `url` is in scope according to `self.scope`, else
    `False`.

    A url with a scheme other than http/https is never in scope, and
    neither is one whose `parent_page.hops_from_seed` has reached
    `scope["max_hops"]`. Otherwise the url is a candidate when any of
    these holds:

    - its surt starts with `scope["surt"]`
    - `parent_page.hops_off_surt` is below `scope["max_hops_off_surt"]`
      (default 0)
    - a rule in `scope["accepts"]` applies

    A candidate is in scope unless a rule in `scope["blocks"]` applies.

    Args:
        url: candidate url; anything that is not already a
            `urlcanon.ParsedUrl` is passed through `urlcanon.semantic`
        parent_page: optional page the url was found on; its `url`,
            `redirect_url`, `hops_from_seed` and `hops_off_surt`
            attributes are read
    '''
    if not isinstance(url, urlcanon.ParsedUrl):
        url = urlcanon.semantic(url)

    # rules are tried against both the parent's url and the url it
    # redirected to, when there is one
    try_parent_urls = []
    if parent_page:
        try_parent_urls.append(urlcanon.semantic(parent_page.url))
        if parent_page.redirect_url:
            try_parent_urls.append(
                    urlcanon.semantic(parent_page.redirect_url))

    def rule_matches(rule_spec):
        # build the rule from its keyword spec and test it with each known
        # parent url, or with no parent url at all when there is none
        rule = urlcanon.MatchRule(**rule_spec)
        if try_parent_urls:
            return any(
                    rule.applies(url, parent_url)
                    for parent_url in try_parent_urls)
        return rule.applies(url)

    if url.scheme not in (b'http', b'https'):
        # XXX doesn't belong here maybe (where? worker ignores unknown
        # schemes?)
        return False

    # hop limit reached: out of scope no matter what the accept rules say
    if (parent_page and "max_hops" in self.scope
            and parent_page.hops_from_seed >= self.scope["max_hops"]):
        return False

    # evaluated left to right; the accept rules are only consulted when
    # neither the surt prefix nor the hops-off-surt allowance qualifies
    # the url, and scanning stops at the first accept rule that applies
    might_accept = (
            url.surt().startswith(self.scope["surt"].encode("utf-8"))
            or (parent_page and parent_page.hops_off_surt
                < self.scope.get("max_hops_off_surt", 0))
            or any(
                rule_matches(spec)
                for spec in self.scope.get("accepts") or ()))
    if not might_accept:
        return False

    # block rules veto an otherwise acceptable url
    return not any(
            rule_matches(spec) for spec in self.scope.get("blocks") or ())
def _enforce_blocks(self, warcprox_meta):
    """
    Reject the current request if a block rule in `warcprox_meta` applies.

    Each entry of `warcprox_meta["blocks"]` is turned into a
    `urlcanon.MatchRule` and tested against `self.url`. On the first rule
    that applies, a 403 response naming the rule is written to the client,
    the connection is closed, and `warcprox.RequestBlockedByRule` is
    raised. Returns normally when there are no block rules or none
    applies.
    """
    url = urlcanon.semantic(self.url)

    if not (warcprox_meta and "blocks" in warcprox_meta):
        return

    for rule in warcprox_meta["blocks"]:
        if not urlcanon.MatchRule(**rule).applies(url):
            continue

        message = ("request rejected by warcprox: blocked by "
                   "rule found in Warcprox-Meta header: %s" % rule)
        body = message.encode("utf-8")

        self.send_response(403, "Forbidden")
        self.send_header("Content-Type", "text/plain;charset=utf-8")
        self.send_header("Connection", "close")
        self.send_header("Content-Length", len(body))
        # echo the offending rule back in a Warcprox-Meta response header
        self.send_header(
                "Warcprox-Meta",
                json.dumps({"blocked-by-rule":rule}, separators=(",",":")))
        self.end_headers()

        # a HEAD response carries headers only
        if self.command != "HEAD":
            self.wfile.write(body)
        self.connection.close()

        raise warcprox.RequestBlockedByRule(
                "%s 403 %s %s -- blocked by rule in Warcprox-Meta "
                "request header %s" % (
                    self.client_address[0], self.command,
                    self.url, rule))