Пример #1
0
def new_mock_url_record():
    '''Build a URLRecord fixture for ``http://example.com``.

    The returned record has its URL and parent URL both set to
    ``http://example.com`` and its level set to 0.
    '''
    record = URLRecord()

    for attribute, value in (
            ('url', 'http://example.com'),
            ('parent_url', 'http://example.com'),
            ('level', 0),
    ):
        setattr(record, attribute, value)

    return record
Пример #2
0
    def test_add_referer(self):
        '''An http parent URL is set as ``Referer`` on an http request.'''
        parent_url = 'http://example.com/'

        outgoing = Request()
        record = URLRecord()
        record.parent_url = parent_url
        record.url = 'http://example.com/image.png'

        WebProcessorSession._add_referrer(outgoing, record)

        # The field must carry exactly the parent URL.
        self.assertEqual(parent_url, outgoing.fields['Referer'])
Пример #3
0
    def test_add_referer_https_to_http(self):
        '''No referrer is added when an https page links to an http URL.

        The parent URL uses ``https`` while the requested URL is plain
        ``http``; after ``_add_referrer`` runs, the request must not
        carry a ``Referer`` field.
        '''
        request = Request()
        url_record = URLRecord()
        url_record.parent_url = 'https://example.com/'
        url_record.url = 'http://example.com/image.png'

        WebProcessorSession._add_referrer(request, url_record)

        # Use the canonical ``Referer`` spelling (the one the positive
        # test reads back) so this check cannot pass vacuously should
        # ``request.fields`` compare keys case-sensitively.
        self.assertNotIn('Referer', request.fields)
Пример #4
0
    def to_plain(self) -> URLRecord:
        '''Copy this object's fields into a new URLRecord.

        ``status`` is wrapped in ``Status``; a truthy ``link_type`` is
        wrapped in ``LinkType`` and a falsy one becomes ``None``. Every
        other field is copied across unchanged.
        '''
        plain = URLRecord()

        # Fields that transfer without any conversion.
        for field in (
                'url', 'parent_url', 'root_url', 'try_count', 'level',
                'inline_level', 'priority', 'post_data', 'status_code',
                'filename',
        ):
            setattr(plain, field, getattr(self, field))

        plain.status = Status(self.status)

        if self.link_type:
            plain.link_type = LinkType(self.link_type)
        else:
            plain.link_type = None

        return plain
Пример #5
0
    def to_plain(self) -> URLRecord:
        '''Return a URLRecord populated from this object's attributes.

        Most attributes are copied as they are. ``status`` is converted
        with ``Status`` and ``link_type`` with ``LinkType``, except that
        a falsy ``link_type`` is stored as ``None``.
        '''
        def as_link_type(raw):
            return LinkType(raw) if raw else None

        # (attribute name, converter) pairs in assignment order; a
        # converter of None means the value is copied unchanged.
        field_plan = (
            ('url', None),
            ('parent_url', None),
            ('root_url', None),
            ('status', Status),
            ('try_count', None),
            ('level', None),
            ('inline_level', None),
            ('link_type', as_link_type),
            ('priority', None),
            ('post_data', None),
            ('status_code', None),
            ('filename', None),
        )

        result = URLRecord()

        for name, convert in field_plan:
            raw = getattr(self, name)
            setattr(result, name, raw if convert is None else convert(raw))

        return result
Пример #6
0
    def child_url_record(self, url: str, inline: bool=False,
                         link_type: Optional[LinkType]=None,
                         post_data: Optional[str]=None,
                         level: Optional[int]=None):
        '''Build a URLRecord for a URL derived from the current record.

        The record is only constructed and returned, which makes this
        useful for testing filters before adding to the table.

        Args:
            url: URL of the child record.
            inline: When true, the child's inline level is the current
                record's inline level (or 0 if unset) plus one;
                otherwise it is 0.
            link_type: Stored on the child as ``link_type``.
            post_data: Stored on the child as ``post_data``.
            level: Explicit level for the child. When ``None``, the
                current record's level plus one is used.

        Returns:
            The new URLRecord with status ``Status.todo`` and a try
            count of 0.
        '''
        parent = self.url_record

        child = URLRecord()
        child.url = url
        child.status = Status.todo
        child.try_count = 0

        if level is None:
            child.level = parent.level + 1
        else:
            child.level = level

        # Fall back to the parent's own URL when it has no root URL.
        child.root_url = parent.root_url or parent.url
        child.parent_url = parent.url

        if inline:
            child.inline_level = (parent.inline_level or 0) + 1
        else:
            child.inline_level = 0

        child.link_type = link_type
        child.post_data = post_data

        return child
Пример #7
0
    def test_follow_ftp_filter(self):
        '''Exercise FollowFTPFilter across parent URLs and follow settings.

        Only one combination is expected to be rejected: an ``ftp`` URL
        tested against a record whose parent is ``http`` while the filter
        was built without ``follow``.
        '''
        # NOTE(review): the mailto literal looks like an e-mail
        # obfuscation placeholder left by a scrape; it is kept verbatim.
        # Confirm the intended address.
        passing_urls = (
            'http://wolf.farts/1',
            'https://wolf.farts/1',
            'mailto:[email protected]',
        )
        ftp_url = 'ftp://wolf.farts/'

        record = URLRecord()

        def check(candidate_filter, ftp_allowed=True):
            # http, https and mailto URLs are expected to pass every time.
            for url in passing_urls:
                self.assertTrue(candidate_filter.test(
                    URLInfo.parse(url), record
                ))

            ftp_verdict = candidate_filter.test(
                URLInfo.parse(ftp_url), record
            )

            if ftp_allowed:
                self.assertTrue(ftp_verdict)
            else:
                self.assertFalse(ftp_verdict)

        # Default filter, record without a parent URL.
        url_filter = FollowFTPFilter()
        check(url_filter)

        # Default filter, http parent: the ftp URL is rejected.
        record.parent_url = 'http://wolf.farts'
        check(url_filter, ftp_allowed=False)

        # follow=True, http parent.
        url_filter = FollowFTPFilter(follow=True)
        check(url_filter)

        # follow=True, ftp parent.
        record.parent_url = 'ftp://wolf.farts'
        check(url_filter)

        # A fresh follow=True filter with the ftp parent still in place.
        url_filter = FollowFTPFilter(follow=True)
        check(url_filter)
Пример #8
0
    def test_span_hosts_filter(self):
        '''Check SpanHostsFilter under each of its option flags.'''
        same_host_page = 'http://example.com/blog/topic1/blah.html'
        other_host_page = 'http://hotdog.example/blog/topic1/blah.html'
        subdomain_root = 'http://1.example.com/'
        subdomain_page = 'http://1.example.com/blah.html'

        def blog_hosts():
            # Fresh host list for each filter under test.
            return [URLInfo.parse('http://example.com/blog/').hostname]

        record = URLRecord()
        record.url = 'http://example.com'

        # enabled=False: the listed host passes, another host does not.
        url_filter = SpanHostsFilter(blog_hosts(), enabled=False)

        self.assertTrue(
            url_filter.test(URLInfo.parse(same_host_page), record)
        )
        self.assertFalse(
            url_filter.test(URLInfo.parse(other_host_page), record)
        )

        # enabled=True: both hosts pass.
        url_filter = SpanHostsFilter(blog_hosts(), enabled=True)

        self.assertTrue(
            url_filter.test(URLInfo.parse(same_host_page), record)
        )
        self.assertTrue(
            url_filter.test(URLInfo.parse(other_host_page), record)
        )

        # page_requisites=True: a record with inline_level 1 on another
        # host passes.
        url_filter = SpanHostsFilter(blog_hosts(), page_requisites=True)
        record = URLRecord()
        record.url = subdomain_root
        record.inline_level = 1

        self.assertTrue(
            url_filter.test(URLInfo.parse(subdomain_root), record)
        )

        # linked_pages=True: a URL on another host passes when its parent
        # is on the listed host...
        url_filter = SpanHostsFilter(blog_hosts(), linked_pages=True)
        record = URLRecord()
        record.url = subdomain_root
        record.parent_url = 'http://example.com/blog/'

        self.assertTrue(
            url_filter.test(URLInfo.parse(subdomain_root), record)
        )

        # ...but is rejected when the parent is itself on the other host.
        record = URLRecord()
        record.url = subdomain_page
        record.parent_url = subdomain_root

        self.assertFalse(
            url_filter.test(URLInfo.parse(subdomain_page), record)
        )