def test_backward_filename_filter(self):
    '''Check BackwardFilenameFilter against accepted and rejected patterns.

    Each URL is tested with the same record; the table pairs a URL with
    whether the filter is expected to let it through.
    '''
    filename_filter = BackwardFilenameFilter(
        accepted=['html', 'image.*.png'],
        rejected=['bmp', 'jp[eg]', 'image.123.png']
    )

    url_record = URLRecord()
    url_record.url = 'http://example.com/'

    expectations = (
        ('http://example/index.html', True),
        ('http://example/myimage.1003.png', True),
        # Matches the accepted pattern but also an explicit rejected one.
        ('http://example/myimage.123.png', False),
        ('http://example/blah.png', False),
        ('http://example/image.1003.png.bmp', False),
    )

    for url, should_pass in expectations:
        verdict = filename_filter.test(URLInfo.parse(url), url_record)

        if should_pass:
            self.assertTrue(verdict)
        else:
            self.assertFalse(verdict)
def test_directory_filter(self):
    '''Exercise DirectoryFilter bare, with an accept list and a reject list.'''
    blog_record = URLRecord()
    blog_record.url = 'http://example.com/blog/'

    def verdict_for(directory_filter, url):
        # Every check uses the same record; only the filter and URL vary.
        return directory_filter.test(URLInfo.parse(url), blog_record)

    # No accepted/rejected directories configured.
    bare_filter = DirectoryFilter()
    self.assertTrue(verdict_for(bare_filter, 'http://example.com'))

    # Only '/blog' is accepted.
    accept_blog = DirectoryFilter(accepted=['/blog'])
    self.assertFalse(verdict_for(accept_blog, 'http://example.com'))
    self.assertTrue(verdict_for(accept_blog, 'http://example.com/blog/'))

    # '/cgi-bin/' is rejected.
    reject_cgi = DirectoryFilter(rejected=['/cgi-bin/'])
    self.assertTrue(verdict_for(reject_cgi, 'http://example.com/blog/'))
    self.assertFalse(verdict_for(reject_cgi, 'http://example.com/cgi-bin'))
def new_mock_url_record():
    '''Return a URLRecord for ``http://example.com`` at level 0.

    Both ``url`` and ``parent_url`` are set to the same address.
    '''
    mock_record = URLRecord()

    field_values = (
        ('url', 'http://example.com'),
        ('parent_url', 'http://example.com'),
        ('level', 0),
    )

    for field_name, field_value in field_values:
        setattr(mock_record, field_name, field_value)

    return mock_record
def test_recursive_filter_requisites(self):
    '''A level-0 record with inline_level 1 passes with page_requisites on.'''
    requisite_record = URLRecord()
    requisite_record.level = 0
    requisite_record.inline_level = 1

    requisites_filter = RecursiveFilter(page_requisites=True)
    verdict = requisites_filter.test(None, requisite_record)

    self.assertTrue(verdict)
def test_regex_filter(self):
    '''Exercise RegexFilter bare, with an accept regex and a reject regex.'''
    blog_record = URLRecord()
    blog_record.url = 'http://example.com/blog/'

    def verdict_for(regex_filter, url):
        # The record is constant; only the candidate URL changes.
        return regex_filter.test(URLInfo.parse(url), blog_record)

    # No patterns configured.
    bare_filter = RegexFilter()
    self.assertTrue(verdict_for(bare_filter, 'http://example.net'))

    # Accept pattern anchored at the end of the URL.
    accepting = RegexFilter(accepted=r'blo[a-z]/$')
    self.assertTrue(verdict_for(accepting, 'http://example.net/blob/'))
    self.assertFalse(verdict_for(accepting, 'http://example.net/blob/123'))

    # Reject pattern for a '.gif' suffix.
    rejecting = RegexFilter(rejected=r'\.gif$')
    self.assertTrue(verdict_for(rejecting, 'http://example.net/blob/'))
    self.assertFalse(
        verdict_for(rejecting, 'http://example.net/blob/123.gif')
    )
def test_recursive_filter_on(self):
    '''With recursion enabled, records at level 0 and level 1 both pass.'''
    url_record = URLRecord()
    url_record.level = 0
    enabled_filter = RecursiveFilter(enabled=True)

    for depth in (0, 1):
        url_record.level = depth
        self.assertTrue(enabled_filter.test(None, url_record))
def test_recursive_filter_off(self):
    '''A default RecursiveFilter passes level 0 but fails level 1.'''
    url_record = URLRecord()
    url_record.level = 0
    default_filter = RecursiveFilter()

    top_level_verdict = default_filter.test(None, url_record)
    self.assertTrue(top_level_verdict)

    url_record.level = 1
    child_verdict = default_filter.test(None, url_record)
    self.assertFalse(child_verdict)
def test_add_referer_https_to_http(self):
    '''No Referer field is added for an HTTPS parent and an HTTP child URL.'''
    request = Request()
    url_record = URLRecord()
    url_record.parent_url = 'https://example.com/'
    url_record.url = 'http://example.com/image.png'

    WebProcessorSession._add_referrer(request, url_record)

    # test_add_referer reads the field back as 'Referer', so check that
    # spelling too. Checking only the lowercase key would pass vacuously if
    # the fields mapping were case-sensitive.
    for field_name in ('Referer', 'referer'):
        self.assertNotIn(field_name, request.fields)
def test_add_referer(self):
    '''The parent URL ends up in the request's Referer field.'''
    parent_url = 'http://example.com/'

    request = Request()
    child_record = URLRecord()
    child_record.parent_url = parent_url
    child_record.url = 'http://example.com/image.png'

    WebProcessorSession._add_referrer(request, child_record)

    referer_value = request.fields['Referer']
    self.assertEqual('http://example.com/', referer_value)
def _new_url_record(cls, request: Request) -> URLRecord:
    '''Build a fresh URLRecord for the URL carried by *request*.

    The record is marked in progress, with a try count and level of 0.
    '''
    fresh_record = URLRecord()

    initial_fields = (
        ('url', request.url_info.url),
        ('status', Status.in_progress),
        ('try_count', 0),
        ('level', 0),
    )

    for field_name, field_value in initial_fields:
        setattr(fresh_record, field_name, field_value)

    return fresh_record
def test_tries_filter(self):
    '''Check TriesFilter with a limit of 0 and with a limit of 5.'''
    url_record = URLRecord()
    url_record.try_count = 4

    # A limit argument of 0 lets a record with 4 tries through.
    zero_limit_filter = TriesFilter(0)
    self.assertTrue(zero_limit_filter.test(None, url_record))

    # With a limit of 5, 4 tries passes and 5 tries does not.
    five_limit_filter = TriesFilter(5)

    for try_count, should_pass in ((4, True), (5, False)):
        url_record.try_count = try_count
        verdict = five_limit_filter.test(None, url_record)

        if should_pass:
            self.assertTrue(verdict)
        else:
            self.assertFalse(verdict)
def test_parent_filter(self):
    '''Check ParentFilter against URLs above, below and outside the root.'''
    url_record = URLRecord()
    parent_filter = ParentFilter()

    def expect(url, should_pass):
        verdict = parent_filter.test(URLInfo.parse(url), url_record)

        if should_pass:
            self.assertTrue(verdict)
        else:
            self.assertFalse(verdict)

    # The root URL itself.
    url_record.root_url = 'http://example.com/blog/topic2/'
    expect('http://example.com/blog/topic2/', True)

    url_record.root_url = 'http://example.com/blog/topic1/'
    topic1_cases = (
        # Below the root, for both schemes.
        ('http://example.com/blog/topic1/blah.html', True),
        ('https://example.com/blog/topic1/blah2.html', True),
        # Above the root on the same host.
        ('http://example.com/blog/', False),
        ('https://example.com/blog/', False),
        # A different host altogether.
        ('http://somewhere.com/', True),
        ('https://somewhere.com/', True),
    )

    for url, should_pass in topic1_cases:
        expect(url, should_pass)

    # Once the record is inline, a URL above the root passes.
    url_record.inline_level = 1
    expect('http://example.com/styles.css', True)
def test_https_filter(self):
    '''Only the https URL passes HTTPSOnlyFilter; other schemes fail.'''
    url_record = URLRecord()
    https_filter = HTTPSOnlyFilter()

    # NOTE(review): the mailto address below looks like an e-mail
    # obfuscation artifact rather than a real address - confirm upstream.
    expectations = (
        ('http://example.net', False),
        ('https://example.net', True),
        ('mailto:[email protected]', False),
        ("javascript:alert('hello!')", False),
    )

    for url, should_pass in expectations:
        verdict = https_filter.test(URLInfo.parse(url), url_record)

        if should_pass:
            self.assertTrue(verdict)
        else:
            self.assertFalse(verdict)
def test_span_hosts_filter(self):
    '''Exercise SpanHostsFilter in its four configurations.

    Covers ``enabled=False``, ``enabled=True``, ``page_requisites=True``
    and ``linked_pages=True``, always seeded with the hostname parsed
    from ``http://example.com/blog/``.
    '''
    same_host_url = 'http://example.com/blog/topic1/blah.html'
    other_host_url = 'http://hotdog.example/blog/topic1/blah.html'

    def blog_hostnames():
        # Re-parse for every filter so each one gets its own fresh list.
        return [URLInfo.parse('http://example.com/blog/').hostname]

    url_record = URLRecord()
    url_record.url = 'http://example.com'

    # Spanning disabled: only the seeded host passes.
    no_span_filter = SpanHostsFilter(blog_hostnames(), enabled=False)
    self.assertTrue(
        no_span_filter.test(URLInfo.parse(same_host_url), url_record)
    )
    self.assertFalse(
        no_span_filter.test(URLInfo.parse(other_host_url), url_record)
    )

    # Spanning enabled: both hosts pass.
    span_filter = SpanHostsFilter(blog_hostnames(), enabled=True)
    self.assertTrue(
        span_filter.test(URLInfo.parse(same_host_url), url_record)
    )
    self.assertTrue(
        span_filter.test(URLInfo.parse(other_host_url), url_record)
    )

    # Page requisites: an inline record on another host passes.
    requisites_filter = SpanHostsFilter(
        blog_hostnames(), page_requisites=True
    )
    inline_record = URLRecord()
    inline_record.url = 'http://1.example.com/'
    inline_record.inline_level = 1
    self.assertTrue(requisites_filter.test(
        URLInfo.parse('http://1.example.com/'), inline_record
    ))

    # Linked pages: passes when the parent is on the seeded host...
    linked_filter = SpanHostsFilter(blog_hostnames(), linked_pages=True)
    linked_record = URLRecord()
    linked_record.url = 'http://1.example.com/'
    linked_record.parent_url = 'http://example.com/blog/'
    self.assertTrue(linked_filter.test(
        URLInfo.parse('http://1.example.com/'), linked_record
    ))

    # ...but not when the parent is itself on the other host.
    deeper_record = URLRecord()
    deeper_record.url = 'http://1.example.com/blah.html'
    deeper_record.parent_url = 'http://1.example.com/'
    self.assertFalse(linked_filter.test(
        URLInfo.parse('http://1.example.com/blah.html'), deeper_record
    ))
def test_follow_ftp_filter(self):
    '''Check FollowFTPFilter across parent URLs and the ``follow`` flag.

    In every phase the http, https and mailto URLs must pass; only the
    verdict for the ftp URL varies.
    '''
    # NOTE(review): the mailto address below looks like an e-mail
    # obfuscation artifact rather than a real address - confirm upstream.
    always_passing_urls = (
        'http://wolf.farts/1',
        'https://wolf.farts/1',
        'mailto:[email protected]',
    )
    ftp_url = 'ftp://wolf.farts/'

    url_record = URLRecord()

    def check_phase(ftp_filter, ftp_should_pass):
        for url in always_passing_urls:
            self.assertTrue(
                ftp_filter.test(URLInfo.parse(url), url_record)
            )

        ftp_verdict = ftp_filter.test(URLInfo.parse(ftp_url), url_record)

        if ftp_should_pass:
            self.assertTrue(ftp_verdict)
        else:
            self.assertFalse(ftp_verdict)

    # Default filter, record without a parent URL.
    default_filter = FollowFTPFilter()
    check_phase(default_filter, True)

    # Default filter, HTTP parent: the ftp URL is not followed.
    url_record.parent_url = 'http://wolf.farts'
    check_phase(default_filter, False)

    # follow=True, still with the HTTP parent.
    following_filter = FollowFTPFilter(follow=True)
    check_phase(following_filter, True)

    # follow=True with an FTP parent.
    url_record.parent_url = 'ftp://wolf.farts'
    check_phase(following_filter, True)

    # A freshly built follow=True filter gives the same verdicts.
    following_filter = FollowFTPFilter(follow=True)
    check_phase(following_filter, True)
def test_level_filter(self):
    '''Check LevelFilter for plain and inline records at various depths.

    The same record is mutated throughout, so the order of the phases
    matters: each phase inherits the level/inline_level left by the
    previous one.
    '''
    url_record = URLRecord()

    def expect(level_filter, should_pass):
        verdict = level_filter.test(None, url_record)

        if should_pass:
            self.assertTrue(verdict)
        else:
            self.assertFalse(verdict)

    # A depth argument of 0 lets a level-4 record through.
    url_record.level = 4
    expect(LevelFilter(0), True)

    # Depth 5, non-inline record: level 5 passes, level 6 does not.
    level_filter = LevelFilter(5)
    for level, should_pass in ((5, True), (6, False)):
        url_record.level = level
        expect(level_filter, should_pass)

    # Depth 5, inline record: levels up to 7 pass, level 8 does not.
    level_filter = LevelFilter(5)
    url_record.inline_level = 1
    for level, should_pass in ((5, True), (6, True), (7, True), (8, False)):
        url_record.level = level
        expect(level_filter, should_pass)

    # Depth 0 with the default inline depth: inline levels 1-5 pass.
    level_filter = LevelFilter(0)
    inline_cases = (
        (1, True), (2, True), (3, True), (4, True), (5, True), (6, False),
    )
    for inline_level, should_pass in inline_cases:
        url_record.inline_level = inline_level
        expect(level_filter, should_pass)

    url_record.level = 1

    # inline_max_depth=0 lets even inline level 1000 through.
    level_filter = LevelFilter(0, inline_max_depth=0)
    url_record.inline_level = 1000
    expect(level_filter, True)

    # inline_max_depth=1: inline level 1 passes, 2 does not.
    level_filter = LevelFilter(5, inline_max_depth=1)
    for inline_level, should_pass in ((1, True), (2, False)):
        url_record.inline_level = inline_level
        expect(level_filter, should_pass)
def to_plain(self) -> URLRecord:
    '''Copy this object's fields into a new URLRecord and return it.

    ``status`` is wrapped in ``Status`` and a truthy ``link_type`` in
    ``LinkType``; a falsy ``link_type`` becomes ``None``. All other
    fields are copied unchanged.
    '''
    plain = URLRecord()

    for field_name in ('url', 'parent_url', 'root_url'):
        setattr(plain, field_name, getattr(self, field_name))

    plain.status = Status(self.status)

    for field_name in ('try_count', 'level', 'inline_level'):
        setattr(plain, field_name, getattr(self, field_name))

    if self.link_type:
        plain.link_type = LinkType(self.link_type)
    else:
        plain.link_type = None

    for field_name in ('priority', 'post_data', 'status_code', 'filename'):
        setattr(plain, field_name, getattr(self, field_name))

    return plain
def child_url_record(self, url: str, inline: bool=False,
                     link_type: Optional[LinkType]=None,
                     post_data: Optional[str]=None,
                     level: Optional[int]=None):
    '''Build a URLRecord that is a child of this object's ``url_record``.

    Handy for running filters on a candidate before it is added to the
    table.

    Args:
        url: URL of the child.
        inline: If true, the child's inline level is the parent's inline
            level (treated as 0 when falsy) plus one; otherwise it is 0.
        link_type: Stored on the child unchanged.
        post_data: Stored on the child unchanged.
        level: Explicit level for the child. When ``None``, the parent's
            level plus one is used.

    Returns:
        The new record, with status ``Status.todo`` and a try count of 0.
    '''
    parent = self.url_record
    child = URLRecord()

    child.url = url
    child.status = Status.todo
    child.try_count = 0

    if level is None:
        child.level = parent.level + 1
    else:
        child.level = level

    # The child inherits the parent's root, or the parent becomes the root.
    child.root_url = parent.root_url or parent.url
    child.parent_url = parent.url

    if inline:
        child.inline_level = (parent.inline_level or 0) + 1
    else:
        child.inline_level = 0

    child.link_type = link_type
    child.post_data = post_data

    return child