예제 #1
0
class Settings(object):
    """Configuration holder built around a single start URL.

    Instantiating it derives the domain and robots.txt URL from START_LINK,
    asks a RobotsCache to cache that robots.txt URL, and opens a MongoClient.
    (The names suggest a web-crawler configuration -- TODO confirm.)

    NOTE(review): the ``self.DATABASE`` line in ``__init__`` is damaged in
    this copy, so the class does not parse as written; see the note there.
    """
    # URL everything else is derived from; must keep its trailing '/'.
    START_LINK = 'http://example.com/'

    def __init__(self):
        """Populate settings, then set up the robots cache and the database."""
        # Third '/'-separated piece of the URL, i.e. 'example.com' here.
        self.DOMAIN = Settings.START_LINK.split('/')[2]
        # Plain concatenation: relies on START_LINK ending with '/'.
        self.ROBOTS_LINK = Settings.START_LINK + 'robots.txt'
        # Tunables; their consumers and units are not visible in this file
        # (TIMEOUT is presumably seconds -- TODO confirm).
        self.COUNT_URLS = 200000
        self.THREADS = 4
        self.RAND_NUM = 5
        self.TIMEOUT = 30
        self.AGENT = ''
        # NOTE(review): this line is damaged. The credential masking
        # ('*****:*****@') appears to have swallowed the rest of the MongoDB
        # URI, the assignment of self.DB_NAME (read further down), and the
        # opening of a dict of XPath expressions together with its first
        # entries. Restore it from the original source before using the class.
        self.DATABASE = 'mongodb://*****:*****@name="description"]/@content',
            u'keywords': u'//meta[@name="keywords"]/@content',
            u'robots': u'//meta[@name="robots"]/@content',
            u'canonical': u'//link[@rel="canonical"]/@href',
            u'h1': u'//h1//text()',
            u'h2': u'//h2//text()',
            u'h3': u'//h3//text()',
            u'text': u'''//body//*[not(self::script or self::a or self::h1 or
            self::h2 or self::h3)]/text()[normalize-space()]''',
            u'script': u'//script//text()',
            u'p': u'//p//text()',
            u'anchors': u'//a//text()',
            u'alt': u'//img/@alt',
            u'title2': u'//@title'
            }
        # Cache the site's robots.txt URL and keep whatever cache() returns.
        self.robots = RobotsCache()
        self.rules = self.robots.cache(self.ROBOTS_LINK)
        # Open the connection and select the database named by DB_NAME
        # (DB_NAME is not assigned anywhere in the visible code).
        self.client = MongoClient(self.DATABASE)
        self.db = self.client[self.DB_NAME]
예제 #2
0
파일: test_cache.py 프로젝트: jmcarp/reppy
class TestCache(unittest.TestCase):
    """Exercise the RobotsCache interface against local ``asis`` servers.

    Most tests start an ``asis.Server`` on port 8080 for a fixture path under
    ``tests/asis`` and then query the cache for ``http://localhost:8080/...``.
    """

    # URL and user agent shared by most of the tests below.
    _URL = 'http://localhost:8080/foo'
    _AGENT = 'rogerbot'

    def setUp(self):
        """Create a fresh RobotsCache so tests do not share cached entries."""
        self.robots = RobotsCache()

    def test_404(self):
        """When we get a 404, assume free range."""
        with asis.Server('tests/asis/test_404', port=8080):
            # Compare against True itself: `allowed` hands back a list for
            # list input (see test_disallowed), so truthiness is too weak.
            self.assertEqual(
                self.robots.allowed(self._URL, self._AGENT), True)

    def test_caching(self):
        """We should be able to cache results."""
        with asis.Server('tests/asis/test_caching', port=8080):
            self.assertIsNone(self.robots.find(self._URL))
            # The lookup itself is what is expected to fill the cache.
            self.robots.allowed(self._URL, self._AGENT)
            self.assertIsNotNone(self.robots.find(self._URL))

    def test_context_manager(self):
        """When using as a context manager, it should clear afterwards."""
        with asis.Server('tests/asis/test_context_manager', port=8080):
            with self.robots:
                self.assertIsNone(self.robots.find(self._URL))
                self.robots.allowed(self._URL, self._AGENT)
                self.assertIsNotNone(self.robots.find(self._URL))
            # And now, we should have it no longer cached
            self.assertIsNone(self.robots.find(self._URL))

    def test_expires(self):
        """Should be able to recognize expired rules."""
        with asis.Server('tests/asis/test_expires', port=8080):
            old_ttl = self.robots.min_ttl
            self.robots.min_ttl = 0
            try:
                self.assertIsNotNone(self.robots.find(self._URL, True))
                # Now, it shouldn't be cached, so when we find it again, it
                # should be missing (or at least, requiring a refetch)
                self.assertIsNone(self.robots.find(self._URL, False))
            finally:
                # Restore the TTL even when an assertion above fails.
                self.robots.min_ttl = old_ttl

    def test_clear(self):
        """Should be able to explicitly clear rules."""
        with asis.Server('tests/asis/test_clear', port=8080):
            self.assertIsNone(self.robots.find(self._URL))
            self.robots.allowed(self._URL, self._AGENT)
            self.assertIsNotNone(self.robots.find(self._URL))
            # Now if we clear the rules, we should not find it
            self.robots.clear()
            self.assertIsNone(self.robots.find(self._URL))

    def test_fetch(self):
        """Ensure that 'fetch' doesn't cache."""
        with asis.Server('tests/asis/test_fetch', port=8080):
            self.assertIsNotNone(self.robots.fetch(self._URL))
            self.assertIsNone(self.robots.find(self._URL))

    def test_cache(self):
        """Ensure we can ask it to cache a result."""
        with asis.Server('tests/asis/test_cache', port=8080):
            self.assertIsNone(self.robots.find(self._URL))
            self.assertIsNotNone(self.robots.cache(self._URL))
            self.assertIsNotNone(self.robots.find(self._URL))

    def test_add(self):
        """We should be able to add rules that we get."""
        with asis.Server('tests/asis/test_add', port=8080):
            self.assertIsNone(self.robots.find(self._URL))
            self.robots.add(self.robots.fetch(self._URL))
            self.assertIsNotNone(self.robots.find(self._URL))

    def test_server_error(self):
        """Make sure we can catch server errors."""
        # No asis server is started here, so the request to port 8080 is
        # expected to fail and surface as ServerError.
        self.assertRaises(ServerError, self.robots.allowed,
                          self._URL, self._AGENT)

    def test_disallowed(self):
        """Check the disallowed interface."""
        with asis.Server('tests/asis/test_disallowed', port=8080):
            self.assertFalse(self.robots.disallowed(self._URL, self._AGENT))
            urls = [
                'http://localhost:8080/foo',
                'http://localhost:8080/bar'
            ]
            # With a list of URLs both calls answer with a filtered list.
            self.assertEqual(self.robots.allowed(urls, self._AGENT), urls)
            self.assertEqual(self.robots.disallowed(urls, self._AGENT), [])

    def test_delay(self):
        """Check the delay interface."""
        with asis.Server('tests/asis/test_delay', port=8080):
            self.assertEqual(self.robots.delay(self._URL, self._AGENT), 5)

    def test_sitemaps(self):
        """Check the sitemaps interface."""
        with asis.Server('tests/asis/test_sitemaps', port=8080):
            self.assertEqual(
                self.robots.sitemaps(self._URL), [
                    'http://localhost:8080/a',
                    'http://localhost:8080/b',
                    'http://localhost:8080/c'
                ])
예제 #3
0
class TestCache(unittest.TestCase):
    """Check the RobotsCache API using ``asis`` fixture servers.

    Most tests start an ``asis.Server`` on port 8080 for a fixture path under
    ``tests/asis`` and then query the cache for ``http://localhost:8080/...``.
    """

    # URL and user agent shared by most of the tests below.
    _URL = 'http://localhost:8080/foo'
    _AGENT = 'rogerbot'

    def setUp(self):
        """Create a fresh RobotsCache so tests do not share cached entries."""
        self.robots = RobotsCache()

    def test_404(self):
        """When we get a 404, assume free range."""
        with asis.Server('tests/asis/test_404', port=8080):
            # Compare against True itself: `allowed` hands back a list for
            # list input (see test_disallowed), so truthiness is too weak.
            self.assertEqual(
                self.robots.allowed(self._URL, self._AGENT), True)

    def test_caching(self):
        """We should be able to cache results."""
        with asis.Server('tests/asis/test_caching', port=8080):
            self.assertIsNone(self.robots.find(self._URL))
            # The lookup itself is what is expected to fill the cache.
            self.robots.allowed(self._URL, self._AGENT)
            self.assertIsNotNone(self.robots.find(self._URL))

    def test_context_manager(self):
        """When using as a context manager, it should clear afterwards."""
        with asis.Server('tests/asis/test_context_manager', port=8080):
            with self.robots:
                self.assertIsNone(self.robots.find(self._URL))
                self.robots.allowed(self._URL, self._AGENT)
                self.assertIsNotNone(self.robots.find(self._URL))
            # And now, we should have it no longer cached
            self.assertIsNone(self.robots.find(self._URL))

    def test_expires(self):
        """Should be able to recognize expired rules."""
        with asis.Server('tests/asis/test_expires', port=8080):
            self.assertIsNotNone(self.robots.find(self._URL, True))
            # Now, it shouldn't be cached, so when we find it again, it should
            # be missing (or at least, requiring a refetch)
            self.assertIsNone(self.robots.find(self._URL, False))

    def test_clear(self):
        """Should be able to explicitly clear rules."""
        with asis.Server('tests/asis/test_clear', port=8080):
            self.assertIsNone(self.robots.find(self._URL))
            self.robots.allowed(self._URL, self._AGENT)
            self.assertIsNotNone(self.robots.find(self._URL))
            # Now if we clear the rules, we should not find it
            self.robots.clear()
            self.assertIsNone(self.robots.find(self._URL))

    def test_fetch(self):
        """Ensure that 'fetch' doesn't cache."""
        with asis.Server('tests/asis/test_fetch', port=8080):
            self.assertIsNotNone(self.robots.fetch(self._URL))
            self.assertIsNone(self.robots.find(self._URL))

    def test_cache(self):
        """Ensure we can ask it to cache a result."""
        with asis.Server('tests/asis/test_cache', port=8080):
            self.assertIsNone(self.robots.find(self._URL))
            self.assertIsNotNone(self.robots.cache(self._URL))
            self.assertIsNotNone(self.robots.find(self._URL))

    def test_add(self):
        """We should be able to add rules that we get."""
        with asis.Server('tests/asis/test_add', port=8080):
            self.assertIsNone(self.robots.find(self._URL))
            self.robots.add(self.robots.fetch(self._URL))
            self.assertIsNotNone(self.robots.find(self._URL))

    def test_server_error(self):
        """Make sure we can catch server errors."""
        # No asis server is started here, so the request to port 8080 is
        # expected to fail and surface as ServerError.
        self.assertRaises(ServerError, self.robots.allowed,
                          self._URL, self._AGENT)

    def test_disallowed(self):
        """Check the disallowed interface."""
        with asis.Server('tests/asis/test_disallowed', port=8080):
            self.assertFalse(self.robots.disallowed(self._URL, self._AGENT))
            urls = ['http://localhost:8080/foo', 'http://localhost:8080/bar']
            # With a list of URLs both calls answer with a filtered list.
            self.assertEqual(self.robots.allowed(urls, self._AGENT), urls)
            self.assertEqual(self.robots.disallowed(urls, self._AGENT), [])

    def test_delay(self):
        """Check the delay interface."""
        with asis.Server('tests/asis/test_delay', port=8080):
            self.assertEqual(self.robots.delay(self._URL, self._AGENT), 5)

    def test_sitemaps(self):
        """Check the sitemaps interface."""
        with asis.Server('tests/asis/test_sitemaps', port=8080):
            self.assertEqual(
                self.robots.sitemaps(self._URL), [
                    'http://localhost:8080/a', 'http://localhost:8080/b',
                    'http://localhost:8080/c'
                ])
예제 #4
0
class TestCache(unittest.TestCase):
    """Check the RobotsCache API, including its error reporting.

    Most tests start an ``asis.Server`` on port 8080 for a fixture path under
    ``tests/asis`` and then query the cache for ``http://localhost:8080/...``.
    The trailing tests check which exception type each failure mode raises.
    """

    # URL and user agent shared by most of the tests below.
    _URL = 'http://localhost:8080/foo'
    _AGENT = 'rogerbot'

    def setUp(self):
        """Create a fresh RobotsCache so tests do not share cached entries."""
        self.robots = RobotsCache()

    def test_404(self):
        """When we get a 404, assume free range."""
        with asis.Server('tests/asis/test_404', port=8080):
            # Compare against True itself: `allowed` hands back a list for
            # list input (see test_disallowed), so truthiness is too weak.
            self.assertEqual(
                self.robots.allowed(self._URL, self._AGENT), True)

    def test_caching(self):
        """We should be able to cache results."""
        with asis.Server('tests/asis/test_caching', port=8080):
            self.assertIsNone(self.robots.find(self._URL))
            # The lookup itself is what is expected to fill the cache.
            self.robots.allowed(self._URL, self._AGENT)
            self.assertIsNotNone(self.robots.find(self._URL))

    def test_context_manager(self):
        """When using as a context manager, it should clear afterwards."""
        with asis.Server('tests/asis/test_context_manager', port=8080):
            with self.robots:
                self.assertIsNone(self.robots.find(self._URL))
                self.robots.allowed(self._URL, self._AGENT)
                self.assertIsNotNone(self.robots.find(self._URL))
            # And now, we should have it no longer cached
            self.assertIsNone(self.robots.find(self._URL))

    def test_expires(self):
        """Should be able to recognize expired rules."""
        with asis.Server('tests/asis/test_expires', port=8080):
            old_ttl = self.robots.min_ttl
            self.robots.min_ttl = 0
            try:
                self.assertIsNotNone(
                    self.robots.find(self._URL, fetch_if_missing=True))
                # If we ignore the TTL, it should still be there.
                self.assertIsNotNone(
                    self.robots.find(self._URL,
                                     fetch_if_missing=False,
                                     honor_ttl=False))
                # However, if we honor the TTL, it should be missing in the
                # cache.
                self.assertIsNone(
                    self.robots.find(self._URL, fetch_if_missing=False))
            finally:
                # Restore the TTL even when an assertion above fails.
                self.robots.min_ttl = old_ttl

    def test_clear(self):
        """Should be able to explicitly clear rules."""
        with asis.Server('tests/asis/test_clear', port=8080):
            self.assertIsNone(self.robots.find(self._URL))
            self.robots.allowed(self._URL, self._AGENT)
            self.assertIsNotNone(self.robots.find(self._URL))
            # Now if we clear the rules, we should not find it
            self.robots.clear()
            self.assertIsNone(self.robots.find(self._URL))

    def test_fetch(self):
        """Ensure that 'fetch' doesn't cache."""
        with asis.Server('tests/asis/test_fetch', port=8080):
            self.assertIsNotNone(self.robots.fetch(self._URL))
            self.assertIsNone(self.robots.find(self._URL))

    def test_cache(self):
        """Ensure we can ask it to cache a result."""
        with asis.Server('tests/asis/test_cache', port=8080):
            self.assertIsNone(self.robots.find(self._URL))
            self.assertIsNotNone(self.robots.cache(self._URL))
            self.assertIsNotNone(self.robots.find(self._URL))

    def test_add(self):
        """We should be able to add rules that we get."""
        with asis.Server('tests/asis/test_add', port=8080):
            self.assertIsNone(self.robots.find(self._URL))
            self.robots.add(self.robots.fetch(self._URL))
            self.assertIsNotNone(self.robots.find(self._URL))

    def test_server_error(self):
        """Make sure we can catch server errors."""
        # Make the session's `get` raise an arbitrary exception (TypeError)
        # and check that it reaches the caller as ServerError.
        with mock.patch.object(self.robots.session,
                               'get',
                               side_effect=TypeError):
            self.assertRaises(ServerError, self.robots.allowed,
                              self._URL, self._AGENT)

    def test_disallowed(self):
        """Check the disallowed interface."""
        with asis.Server('tests/asis/test_disallowed', port=8080):
            self.assertFalse(self.robots.disallowed(self._URL, self._AGENT))
            urls = ['http://localhost:8080/foo', 'http://localhost:8080/bar']
            # With a list of URLs both calls answer with a filtered list.
            self.assertEqual(self.robots.allowed(urls, self._AGENT), urls)
            self.assertEqual(self.robots.disallowed(urls, self._AGENT), [])

    def test_delay(self):
        """Check the delay interface."""
        with asis.Server('tests/asis/test_delay', port=8080):
            self.assertEqual(self.robots.delay(self._URL, self._AGENT), 5)

    def test_sitemaps(self):
        """Check the sitemaps interface."""
        with asis.Server('tests/asis/test_sitemaps', port=8080):
            self.assertEqual(
                self.robots.sitemaps(self._URL), [
                    'http://localhost:8080/a', 'http://localhost:8080/b',
                    'http://localhost:8080/c'
                ])

    def test_dns_exception(self):
        """Raises an exception if url does not resolve."""
        self.assertRaises(ConnectionException, self.robots.allowed,
                          'http://does-not-resolve', self._AGENT)

    def test_malformed_url(self):
        """Raises an exception if the url is malformed."""
        # 'hhttp' is a deliberately invalid scheme.
        self.assertRaises(MalformedUrl, self.robots.allowed,
                          'hhttp://moz.com', self._AGENT)

    def test_ssl_exception(self):
        """Raises an exception if there is an ssl error."""
        with asis.Server('tests/asis/test_ssl_exception', port=8080):
            # Note the https scheme against the server started just above.
            self.assertRaises(SSLException, self.robots.allowed,
                              'https://localhost:8080', self._AGENT)

    def test_excessive_redirects(self):
        """Raises an exception if there are too many redirects."""
        with asis.Server('tests/asis/test_excessive_redirects', port=8080):
            self.assertRaises(ExcessiveRedirects, self.robots.allowed,
                              'http://localhost:8080/one', self._AGENT)

    def test_bad_status_codes(self):
        """Raises an exception if there is a 5xx status code."""
        with asis.Server('tests/asis/test_bad_status_codes', port=8080):
            self.assertRaises(BadStatusCode, self.robots.allowed,
                              'http://localhost:8080', self._AGENT)