Example #1
    def test_timeout(self):
        bot = build_spider(SimpleSpider, meta={'server': self.server})
        self.setup_cache(bot)
        bot.cache.clear()
        bot.setup_queue()
        bot.add_task(Task('one', self.server.get_url()))
        bot.add_task(Task('one', self.server.get_url(), delay=2))
        bot.run()
        self.assertEqual(2, bot.stat.counters['spider:request'])
        self.assertEqual(1, bot.stat.counters['spider:request-cache'])
        self.assertEqual([1, 1], bot.stat.collections['resp_counters'])

        bot = build_spider(SimpleSpider, meta={'server': self.server},
                           parser_pool_size=1)
        self.setup_cache(bot)
        # Do NOT clear the cache
        # bot.cache.clear()
        bot.setup_queue()
        bot.add_task(Task('one', self.server.get_url(), priority=1))
        bot.add_task(Task('one', self.server.get_url(),
                          priority=2, cache_timeout=0, delay=1))
        bot.add_task(Task('one', self.server.get_url(),
                          priority=3, cache_timeout=10, delay=3))
        bot.add_task(Task('one', self.server.get_url(),
                          priority=4, cache_timeout=0, delay=4))
        bot.run()
        self.assertEqual([1, 2, 2, 3], bot.stat.collections['resp_counters'])
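These tests rely on SimpleSpider and build_spider helpers defined elsewhere in the suite. A minimal sketch of the spider the cache tests appear to assume; the handler name task_one and the resp_counters collection come from the assertions above, while the incrementing response body is an assumption about the test server:

from grab.spider import Spider

class SimpleSpider(Spider):
    # Hypothetical stand-in for the SimpleSpider helper used above.
    def task_one(self, grab, task):
        # The test server is assumed to return an incrementing counter
        # in the response body, so cached responses repeat a value
        # while fresh network requests advance it.
        self.stat.collect('resp_counters',
                          int(grab.doc.body.decode('utf-8')))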
Example #2
    def test_setup_grab(self):
        # Simple test, one task
        bot = build_spider(SimpleSpider, thread_number=1)
        bot.setup_grab(proxy=PROXY1)
        bot.setup_queue()
        bot.add_task(Task('baz', 'http://yandex.ru'))
        bot.run()

        self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
        self.assertEqual(set(bot.stat.collections['ports']),
                         set([self.server.port]))
        self.assertEqual(1, len(set(bot.stat.collections['ports'])))

        content = '%s\n%s' % (PROXY1, PROXY2)
        with open('/tmp/__proxy.txt', 'w') as out:
            out.write(content)

        # If proxy is configured with both methods
        # (setup_grab and load_proxylist)
        # then proxylist has priority
        bot = build_spider(SimpleSpider, thread_number=1)
        bot.load_proxylist('/tmp/__proxy.txt', 'text_file')
        bot.setup_queue()
        for x in six.moves.range(10):
            bot.add_task(Task('baz', 'http://yandex.ru'))
        bot.setup_grab(proxy=PROXY3)
        bot.run()

        self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
        self.assertTrue(EXTRA_PORT2 not in bot.stat.collections['ports'])
        self.assertTrue(EXTRA_PORT2 not in set(bot.stat.collections['ports']))
Example #3
    def test_task_priority(self):
        # Automatic random priority
        grab.spider.base.RANDOM_TASK_PRIORITY_RANGE = (10, 20)
        bot = build_spider(SimpleSpider, priority_mode='random')
        bot.setup_queue()
        task = Task('baz', url='xxx')
        self.assertEqual(task.priority, None)
        bot.add_task(task)
        self.assertTrue(10 <= task.priority <= 20)

        # Automatic constant priority
        grab.spider.base.DEFAULT_TASK_PRIORITY = 33
        bot = build_spider(SimpleSpider, priority_mode='const')
        bot.setup_queue()
        task = Task('baz', url='xxx')
        self.assertEqual(task.priority, None)
        bot.add_task(task)
        self.assertEqual(33, task.priority)

        # Automatic priority does not override explicitly set priority
        grab.spider.base.DEFAULT_TASK_PRIORITY = 33
        bot = build_spider(SimpleSpider, priority_mode='const')
        bot.setup_queue()
        task = Task('baz', url='xxx', priority=1)
        self.assertEqual(1, task.priority)
        bot.add_task(task)
        self.assertEqual(1, task.priority)

        self.assertRaises(SpiderMisuseError,
                          lambda: SimpleSpider(priority_mode='foo'))
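The behaviour asserted above can be summarised: add_task fills in task.priority only when it is None, drawing either a random value from RANDOM_TASK_PRIORITY_RANGE or the constant DEFAULT_TASK_PRIORITY, and an explicit priority is never overridden. A standalone sketch of that logic, as an illustration rather than the library's actual implementation:

import random

RANDOM_TASK_PRIORITY_RANGE = (10, 20)
DEFAULT_TASK_PRIORITY = 33

def assign_priority(explicit_priority, priority_mode):
    # Illustrative re-implementation of the behaviour the test checks.
    if explicit_priority is not None:
        return explicit_priority  # an explicit priority always wins
    if priority_mode == 'random':
        return random.randint(*RANDOM_TASK_PRIORITY_RANGE)
    if priority_mode == 'const':
        return DEFAULT_TASK_PRIORITY
    # The spider itself raises SpiderMisuseError for unknown modes.
    raise ValueError('unknown priority_mode: %r' % priority_mode)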
Example #4
    def test_task_cache_timeout(self):
        class TestSpider(Spider):
            def task_page(self, grab, task):
                self.stat.collect('points', grab.doc.body)

        bot = build_spider(TestSpider, parser_pool_size=1)
        self.setup_cache(bot)
        bot.cache.clear()
        bot.setup_queue()
        # This task will receive first data from `get.data` iterator
        bot.add_task(Task('page', url=self.server.get_url()))
        # This task will be spawned in 1 second and will
        # receive cached data (cache_timeout=10 > 1 sec)
        bot.add_task(Task('page', url=self.server.get_url(),
                     delay=1, cache_timeout=10))
        # This task will be spawned in 2 seconds and will not
        # receive cached data (cache_timeout=0.5 < 2 sec)
        # So, this task will receive the next data from `get.data` iterator
        bot.add_task(Task('page', url=self.server.get_url(),
                     delay=2, cache_timeout=0.5))

        self.server.response['get.data'] = iter([b'a', b'b'])
        bot.run()
        self.assertEqual(bot.stat.collections['points'],
                         [b'a', b'a', b'b'])
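The per-task cache_timeout is effectively a maximum acceptable age for a cached document: a stored copy older than the timeout is refetched from the network. A tiny sketch of that check, assuming the cache records a save timestamp:

import time

def cache_is_fresh(saved_at, cache_timeout):
    # Serve from cache only while the stored copy is younger than
    # the task's cache_timeout (in seconds).
    return (time.time() - saved_at) < cache_timeout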
Example #5
    def test_delay_error(self):
        bot = build_spider(self.SimpleSpider)
        self.setup_queue(bot)
        bot.task_queue.clear()
        self.assertRaises(SpiderMisuseError,
                          bot.add_task,
                          Task('page', url=self.server.get_url(), delay=1))
Example #6
    def test_task_limit(self):
        self.server.response['get.data'] = 'Hello spider!'
        self.server.response['sleep'] = 1.1

        bot = build_spider(self.SimpleSpider, network_try_limit=1)
        bot.setup_grab(connect_timeout=1, timeout=1)
        bot.setup_queue()
        bot.add_task(Task('baz', self.server.get_url()))
        bot.run()
        self.assertEqual(bot.stat.counters['spider:task-baz'], 1)

        bot = build_spider(self.SimpleSpider, task_try_limit=2)
        bot.setup_queue()
        bot.add_task(Task('baz', self.server.get_url(), task_try_count=3))
        bot.run()
        self.assertEqual(bot.stat.counters['spider:request-network'], 0)
Example #7
    def test_spider_mp_changes(self):
        bot = build_spider(self.SimpleSpider)
        bot.setup_queue()
        bot.meta['url'] = self.server.get_url()
        bot.add_task(Task('page', self.server.get_url()))
        bot.run()
        self.assertEqual(2, bot.foo_count)
Example #8
    def test_task_callback(self):
        class TestSpider(Spider):
            def task_page(self, grab, task):
                self.meta['tokens'].append('0_handler')

        class FuncWithState(object):
            def __init__(self, tokens):
                self.tokens = tokens

            def __call__(self, grab, task):
                self.tokens.append('1_func')

        tokens = []
        func = FuncWithState(tokens)

        bot = build_spider(TestSpider)
        bot.meta['tokens'] = tokens
        bot.setup_queue()
        # classic handler
        bot.add_task(Task('page', url=self.server.get_url()))
        # the callback option overrides the classic handler
        bot.add_task(Task('page', url=self.server.get_url(), callback=func))
        # callback and null task name
        bot.add_task(Task(name=None, url=self.server.get_url(), callback=func))
        # callback and default task name
        bot.add_task(Task(url=self.server.get_url(), callback=func))
        bot.run()
        self.assertEqual(['0_handler', '1_func', '1_func', '1_func'],
                         sorted(tokens))
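Any callable accepting (grab, task) works as a callback, not only a stateful object like FuncWithState; when a callback is given, the task_<name> handler is skipped and the task name becomes optional. A minimal sketch, assuming grab is installed and the URL is reachable:

from grab.spider import Spider, Task

class CallbackSpider(Spider):
    pass  # no task_* handler is needed when every task carries a callback

def log_response(grab, task):
    # Receives the same (grab, task) pair a classic handler would.
    print(grab.response.code, task.url)

bot = CallbackSpider()
bot.setup_queue()
bot.add_task(Task(url='http://example.com/', callback=log_response))
bot.run()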
Example #9
    def test_counter(self):
        bot = build_spider(SimpleSpider, meta={'server': self.server})
        self.setup_cache(bot)
        bot.cache.clear()
        bot.setup_queue()
        bot.add_task(Task('one', self.server.get_url()))
        bot.run()
        self.assertEqual([1], bot.stat.collections['resp_counters'])
Example #10
    def test_generator_with_invalid_url(self):

        class SomeSpider(Spider):
            def task_generator(self):
                yield Task('page', url=INVALID_URL)

        bot = build_spider(SomeSpider)
        bot.run()
Example #11
    def test_task_nohandler_error(self):
        class TestSpider(Spider):
            pass

        bot = build_spider(TestSpider)
        bot.setup_queue()
        bot.add_task(Task('page', url=self.server.get_url()))
        self.assertRaises(NoTaskHandler, bot.run)
Example #12
    def test_task_retry(self):
        self.server.response['get.data'] = 'xxx'
        self.server.response_once['code'] = 403
        bot = build_spider(self.SimpleSpider)
        bot.setup_queue()
        bot.add_task(Task('baz', self.server.get_url()))
        bot.run()
        self.assertEqual(b'xxx', bot.stat.collections['SAVED_ITEM'][0])
Example #13
    def test_spider(self):
        self.server.response['get.data'] = 'Hello spider!'
        self.server.response['sleep'] = 0
        bot = build_spider(self.SimpleSpider)
        bot.setup_queue()
        bot.add_task(Task('baz', self.server.get_url()))
        bot.run()
        self.assertEqual(b'Hello spider!',
                         bot.stat.collections['SAVED_ITEM'][0])
Example #14
    def test_fatal_error(self):
        class TestSpider(Spider):
            def task_page(self, grab, task):
                raise FatalError

        bot = build_spider(TestSpider)
        bot.setup_queue()
        bot.add_task(Task('page', url=self.server.get_url()))
        self.assertRaises(FatalError, bot.run)
Example #15
    def test_setup_grab(self):
        # Multiple calls to `setup_grab` should accumulate
        # changes in the config object.
        bot = build_spider(self.SimpleSpider)
        bot.setup_grab(log_dir='/tmp')
        bot.setup_grab(timeout=30)
        grab = bot.create_grab_instance()
        self.assertEqual(grab.config['log_dir'], '/tmp')
        self.assertEqual(grab.config['timeout'], 30)
Example #16
    def test_spider_nonmp_changes(self):
        """Check that in non-multiprocess mode changes made
        inside a handler are applied to the main spider instance."""
        bot = build_spider(self.SimpleSpider)
        bot.setup_queue()
        bot.meta['url'] = self.server.get_url()
        bot.add_task(Task('page', self.server.get_url()))
        bot.run()
        self.assertEqual(4, bot.foo_count)
Example #17
    def test_data_nohandler_error(self):
        class TestSpider(Spider):
            def task_page(self, grab, task):
                yield Data('foo', num=1)

        bot = build_spider(TestSpider)
        bot.setup_queue()
        bot.add_task(Task('page', url=self.server.get_url()))
        self.assertRaises(NoDataHandler, bot.run)
Example #18
    def test_something(self):
        bot = build_spider(SimpleSpider, meta={'server': self.server},
                           parser_pool_size=1)
        self.setup_cache(bot)
        bot.cache.clear()
        bot.setup_queue()
        bot.add_task(Task('foo', self.server.get_url()))
        bot.run()
        self.assertEqual([1, 1, 1, 2], bot.stat.collections['resp_counters'])
Example #19
    def test_clear(self):
        bot = build_spider(self.SimpleSpider)
        self.setup_queue(bot)
        bot.task_queue.clear()

        for x in six.moves.range(5):
            bot.add_task(Task('page', url=self.server.get_url()))
        self.assertEqual(5, bot.task_queue.size())
        bot.task_queue.clear()
        self.assertEqual(0, bot.task_queue.size())
Example #20
    def test_connection_kwargs(self):
        class TestSpider(Spider):
            def task_page(self, grab, task):
                pass

        config = deepcopy(MONGODB_CONNECTION)
        # Set a port that will go as **kwargs into MongoClient()
        config.setdefault('port', 27017)
        bot = build_spider(TestSpider)
        bot.setup_cache(backend='mongo', **config)
Example #21
    def test_schedule_list_clear(self):
        bot = build_spider(self.SimpleSpider)
        self.setup_queue(bot)
        bot.task_queue.clear()

        for x in six.moves.range(5):
            bot.add_task(Task('page', url=self.server.get_url(), delay=x+1))

        self.assertEqual(5, len(bot.task_queue.schedule_list))
        bot.task_queue.clear()
        self.assertEqual(0, len(bot.task_queue.schedule_list))
Example #22
    def test_task_raw(self):
        class TestSpider(Spider):
            def task_page(self, grab, task):
                self.stat.collect('codes', grab.response.code)

        self.server.response['code'] = 502

        bot = build_spider(TestSpider, network_try_limit=1)
        bot.setup_queue()
        bot.add_task(Task('page', url=self.server.get_url()))
        bot.add_task(Task('page', url=self.server.get_url()))
        bot.run()
        self.assertEqual(0, len(bot.stat.collections['codes']))

        bot = build_spider(TestSpider, network_try_limit=1)
        bot.setup_queue()
        bot.add_task(Task('page', url=self.server.get_url(), raw=True))
        bot.add_task(Task('page', url=self.server.get_url(), raw=True))
        bot.run()
        self.assertEqual(2, len(bot.stat.collections['codes']))
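With raw=True the handler is invoked even for network results that failed (here HTTP 502 with network_try_limit=1) instead of the result being discarded, so the handler must inspect the response itself. A sketch of such a defensive handler:

from grab.spider import Spider

class RawAwareSpider(Spider):
    def task_page(self, grab, task):
        # With raw=True error responses are delivered too, so check
        # the status code before trying to parse the body.
        if grab.response.code >= 400:
            self.stat.inc('http-errors')
            return
        self.stat.collect('titles', grab.doc('//title').text())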
Example #23
    def test_add_task_invalid_url_raise_error(self):
        class TestSpider(Spider):
            pass

        bot = build_spider(TestSpider)
        bot.setup_queue()
        self.assertRaises(SpiderError, bot.add_task,
                          Task('page', url='zz://zz'), raise_error=True)
        self.assertEqual(0, bot.task_queue.size())
        bot.add_task(Task('page', url='http://example.com/'))
        self.assertEqual(1, bot.task_queue.size())
Example #24
    def test_task_useragent(self):
        bot = build_spider(SimpleSpider)
        bot.setup_queue()

        g = Grab()
        g.setup(url=self.server.get_url())
        g.setup(user_agent='Foo')

        task = Task('baz', grab=g)
        bot.add_task(task.clone())
        bot.run()
        self.assertEqual(self.server.request['headers']['User-Agent'], 'Foo')
Example #25
    def test_only_cache_task(self):
        class TestSpider(Spider):
            def task_page(self, grab, task):
                self.stat.collect('points', 1)

        bot = build_spider(TestSpider, only_cache=True)
        self.setup_cache(bot)
        bot.cache.clear()
        bot.setup_queue()
        bot.add_task(Task('page', self.server.get_url()))
        bot.run()
        self.assertEqual(bot.stat.collections['points'], [])
Example #26
    def test_handler_result_none(self):
        class TestSpider(Spider):
            def prepare(self):
                self.points = []

            def task_page(self, grab, task):
                yield None

        bot = build_spider(TestSpider)
        bot.setup_queue()
        bot.add_task(Task('page', url=self.server.get_url()))
        bot.run()
Example #27
    def test_cache_size(self):
        class TestSpider(Spider):
            def task_page(self, grab, task):
                pass

        bot = build_spider(TestSpider)
        self.setup_cache(bot)
        bot.cache.clear()
        bot.setup_queue()
        bot.add_task(Task('page', self.server.get_url()))
        bot.run()
        self.assertEqual(bot.cache.size(), 1)
Example #28
    def test_create_table(self):
        class TestSpider(Spider):
            def task_page(self, grab, task):
                pass

        bot = build_spider(TestSpider)
        self.setup_cache(bot)
        bot.cache.cursor.execute('begin')
        bot.cache.cursor.execute('DROP TABLE cache')
        bot.cache.cursor.execute('commit')
        self.setup_cache(bot)
        bot.cache.clear()
        self.assertEqual(0, bot.cache.size())
Example #29
    def test_check_task_limits_invalid_value(self):
        class TestSpider(Spider):
            def task_page(self, grab, task):
                pass

            def check_task_limits(self, task):
                return False, 'zz'

        bot = build_spider(TestSpider)
        bot.setup_queue()
        bot.add_task(Task('page', url=self.server.get_url(),
                          fallback_name='fallback_zz'))
        self.assertRaises(SpiderError, bot.run)
Example #30
    def test_exception_from_data_handler(self):
        class TestSpider(Spider):
            def task_page(self, grab, task):
                yield Data('foo', num=1)

            def data_foo(self, num):
                1/0

        bot = build_spider(TestSpider)
        bot.setup_queue()
        bot.add_task(Task('page', url=self.server.get_url()))
        bot.run()
        self.assertTrue('data_foo' in bot.stat.collections['fatal'][0])
Example #31
    def test_counters_and_collections(self):
        class TestSpider(Spider):
            def prepare(self):
                self.stat.logging_period = 0
                self.stat.inc('foo')

            def task_page_valid(self, grab, task):
                self.stat.inc('foo')

            def task_page_fail(self, grab, task):
                1 / 0

        bot = build_spider(TestSpider)
        bot.setup_queue()
        bot.add_task(Task('page_valid', url=self.server.get_url()))
        bot.add_task(Task('page_fail', url=self.server.get_url()))
        bot.run()
        self.assertEqual(2, bot.stat.counters['foo'])
        self.assertEqual(1, len(bot.stat.collections['fatal']))
Example #32
    def test_task_url_and_grab_options(self):
        class TestSpider(Spider):
            def setup(self):
                # pylint: disable=attribute-defined-outside-init
                self.done = False

            def task_page(self, dummy_grab, dummy_task):
                # pylint: disable=attribute-defined-outside-init
                self.done = True

        bot = build_spider(TestSpider)
        bot.setup_queue()
        grab = Grab()
        grab.setup(url=self.server.get_url())
        self.assertRaises(SpiderMisuseError,
                          Task,
                          'page',
                          grab=grab,
                          url=self.server.get_url())
Example #33
    def test_setup_proxylist(self):
        with temp_file() as proxy_file:
            content = '\n'.join(x['proxy']
                                for x in self.extra_servers.values())
            with open(proxy_file, 'w') as out:
                out.write(content)
            # Simple test, one task
            bot = build_spider(SimpleSpider, thread_number=1)
            bot.load_proxylist(proxy_file, 'text_file')
            bot.setup_queue()
            bot.add_task(
                Task('baz', grab=Grab(url='http://yandex.ru', debug=True)))
            bot.run()
            serv = [
                x['server'] for x in self.extra_servers.values()
                if x['server'].request['done']
            ][0]
            self.assertEqual(serv.request['headers']['host'], 'yandex.ru')
            self.assertEqual(1, len(set(bot.stat.collections['ports'])))
Example #34
    def test_setup_proxylist2(self):
        with temp_file() as proxy_file:
            content = '\n'.join(x['proxy'] for x in
                                self.extra_servers.values())
            with open(proxy_file, 'w') as out:
                out.write(content)

            # By default auto_change is True
            bot = build_spider(SimpleSpider, thread_number=1)
            bot.load_proxylist(proxy_file, 'text_file')
            bot.setup_queue()
            for _ in six.moves.range(10):
                bot.add_task(Task('baz', 'http://yandex.ru'))
            bot.run()

            servers = [x['server'] for x in self.extra_servers.values()
                       if x['server'].request['done']]
            for serv in servers:
                self.assertEqual(serv.request['headers']['host'], 'yandex.ru')
            self.assertTrue(len(set(bot.stat.collections['ports'])) > 1)
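The text_file proxy source reads one proxy per line. A sketch of preparing and loading such a list; the host:port(:user:password) line format is an assumption, while the auto_init/auto_change defaults are taken from the comments in these tests:

from grab.spider import Spider

class ProxiedSpider(Spider):
    def task_page(self, grab, task):
        pass

# Hypothetical proxy list, one entry per line.
with open('/tmp/__proxy.txt', 'w') as out:
    out.write('127.0.0.1:3128\n127.0.0.1:3129:user:secret')

bot = ProxiedSpider(thread_number=1)
# auto_init=True (default): a proxy is applied before the first request.
# auto_change=True (default): proxies are rotated between requests.
bot.load_proxylist('/tmp/__proxy.txt', 'text_file')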
Example #35
    def test_too_large_document(self):

        class TestSpider(Spider):
            def task_page(self, grab, task):
                pass

        # The maximum BSON document size is 16 megabytes.
        self.server.response['get.data'] = 'x' * (1024 * 1024 * 17)
        bot = build_spider(TestSpider)
        self.setup_cache(bot, use_compression=False)
        bot.cache_pipeline.cache.clear()
        bot.setup_queue()
        bot.add_task(Task('page', url=self.server.get_url()))
        patch = mock.Mock()
        with mock.patch('logging.error', patch):
            bot.run()
        self.assertEqual(bot.cache_pipeline.cache.size(), 0)
        self.assertTrue('Document too large' in patch.call_args[0][0])
Example #36
File: spider.py  Project: target-v/grab
    def test_fallback_handler_by_default_name(self):
        class TestSpider(Spider):
            def prepare(self):
                # pylint: disable=attribute-defined-outside-init
                self.points = []

            def task_page(self, grab, task):
                pass

            def task_page_fallback(self, dummy_task):
                self.points.append(1)

        self.server.response['code'] = 403

        bot = build_spider(TestSpider, network_try_limit=1)
        bot.setup_queue()
        bot.add_task(Task('page', url=self.server.get_url()))
        bot.run()
        self.assertEqual(bot.points, [1])
Example #37
    def test_complex_data(self):
        class TestSpider(Spider):
            def prepare(self):
                # pylint: disable=attribute-defined-outside-init
                self.data_processed = []

            def task_page(self, dummy_grab, dummy_task):
                yield Data('foo', one=1, two=2, bar='gaz')

            def data_foo(self, one, two, **kwargs):
                self.data_processed.append(one)
                self.data_processed.append(two)
                self.data_processed.append(kwargs)

        bot = build_spider(TestSpider)
        bot.setup_queue()
        bot.add_task(Task('page', url=self.server.get_url()))
        bot.run()
        self.assertEqual(bot.data_processed, [1, 2, {'bar': 'gaz'}])
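A Data('foo', ...) yielded from a task handler is routed to the spider's data_foo method, and the Data object's keyword fields become the handler's arguments, with **kwargs catching any extras as above. A minimal sketch, assuming Data is exported from grab.spider alongside Spider:

from grab.spider import Spider, Data

class ItemSpider(Spider):
    def task_page(self, grab, task):
        # Each Data item is dispatched to a data_<name> method.
        yield Data('item', title='example', price=10)

    def data_item(self, title, price):
        # The keyword fields of the Data object arrive as arguments.
        print(title, price)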
Example #38
    def test_stat_error_name_threaded_urllib3(self):

        server = self.server
        server.response['sleep'] = 2

        class SimpleSpider(Spider):
            def prepare(self):
                self.network_try_limit = 1

            def task_generator(self):
                grab = Grab(url=server.get_url(), timeout=1)
                yield Task('page', grab=grab)

            def task_page(self, grab, unused_task):
                pass

        bot = build_spider(SimpleSpider)
        bot.run()
        self.assertTrue('error:read-timeout-error' in bot.stat.counters)
Example #39
    def test_setup_proxylist5(self):
        content = '%s\n%s\n%s' % (PROXY1, PROXY2, PROXY3)
        with open('/tmp/__proxy.txt', 'w') as out:
            out.write(content)
        # Disable auto_change
        # Disable auto_init
        # Proxylist will not be used by default
        bot = build_spider(SimpleSpider, thread_number=1)
        bot.load_proxylist('/tmp/__proxy.txt',
                           'text_file',
                           auto_change=False,
                           auto_init=False)
        bot.setup_queue()
        for x in six.moves.range(10):
            bot.add_task(Task('baz', self.server.get_url()))
        bot.run()

        self.assertEqual(self.server.request['headers'].get('host'),
                         '%s:%s' % (ADDRESS, self.server.port))
        self.assertEqual(1, len(set(bot.stat.collections['ports'])))
        self.assertEqual(bot.stat.collections['ports'][0], self.server.port)
Example #40
    def test_update_grab_instance(self):
        class TestSpider(Spider):
            def update_grab_instance(self, grab):
                grab.setup(timeout=77)

            def task_generator(self):
                yield Task('page', url=self.meta['server'].get_url())
                yield Task('page', grab=Grab(url=self.meta['server'].get_url(),
                                             timeout=1))

            def task_page(self, grab, dummy_task):
                self.stat.collect('points', grab.config['timeout'])

        bot = build_spider(TestSpider, meta={'server': self.server})
        bot.setup_queue()
        bot.add_task(Task('page', url=self.server.get_url()))
        bot.add_task(Task('page', grab=Grab(url=self.server.get_url(),
                                            timeout=1)))
        bot.run()
        self.assertEqual(set([77]), set(bot.stat.collections['points']))
Example #41
    def test_redirect_with_invalid_url(self):

        server = self.server

        class TestSpider(Spider):
            def task_generator(self):
                # pylint: disable=attribute-defined-outside-init
                self.done_counter = 0
                # pylint: enable=attribute-defined-outside-init
                yield Task('page', url=server.get_url())

            def task_page(self, grab, task):
                pass

        self.server.response_once['code'] = 301
        self.server.response_once['headers'] = [
            ('Location', INVALID_URL),
        ]
        bot = build_spider(TestSpider, network_try_limit=1)
        bot.run()
Example #42
    def test_multiple_internal_worker_error(self):
        class TestSpider(Spider):
            """
            This class derived from Spider super-class
            contains fatal bug in overriden `process_network_result`
            method
            """
            # pylint: disable=unused-argument
            def process_network_result(self, *args, **kwargs):
                raise Exception('Shit happens!')
            # pylint: enable=unused-argument

            def task_page(self, dummy_grab, dummy_task):
                pass

        bot = build_spider(TestSpider)
        bot.setup_queue()
        for _ in range(5):
            bot.add_task(Task('page', url=self.server.get_url()))
        bot.run()
        self.assertTrue(bot.stat.counters['parser-pipeline-restore'] > 1)
Example #43
    def test_setup_proxylist5(self):
        with temp_file() as proxy_file:
            content = '\n'.join(x['proxy'] for x in
                                self.extra_servers.values())
            with open(proxy_file, 'w') as out:
                out.write(content)
            # Disable auto_change
            # Disable auto_init
            # Proxylist will not be used by default
            bot = build_spider(SimpleSpider, thread_number=1)
            bot.load_proxylist(proxy_file, 'text_file',
                               auto_change=False, auto_init=False)
            bot.setup_queue()
            for _ in six.moves.range(10):
                bot.add_task(Task('baz', self.server.get_url()))
            bot.run()

            self.assertEqual(self.server.request['headers'].get('host'),
                             '%s:%s' % (ADDRESS, self.server.port))
            self.assertEqual(1, len(set(bot.stat.collections['ports'])))
            self.assertEqual(bot.stat.collections['ports'][0],
                             self.server.port)
Example #44
    def test_null_grab_bug(self):
        # Test the following bug:
        # * create a task and process it
        # * in the task handler, spawn another task with the grab
        #   instance received in the current handler's arguments
        server = self.server

        class SimpleSpider(Spider):
            def task_generator(self):
                yield Task('one', url=server.get_url())

            def task_one(self, grab, task):
                self.stat.inc('page_count')
                yield Task('two', grab=grab)

            def task_two(self, grab, task):
                self.stat.inc('page_count')

        bot = build_spider(SimpleSpider, thread_number=1)
        bot.run()
        self.assertEqual(2, bot.stat.counters['page_count'])
Example #45
    def test_things_yielded_from_data_handler(self):
        server = self.server

        class TestSpider(Spider):
            def prepare(self):
                self.data_processed = []

            def task_page(self, grab, task):
                yield Data('foo', count=task.get('count', 1))

            def data_foo(self, count):
                self.data_processed.append(count)
                if count == 1:
                    yield Data('foo', count=666)
                    yield Task('page', url=server.get_url(), count=count + 1)

        bot = build_spider(TestSpider)
        bot.setup_queue()
        bot.add_task(Task('page', url=self.server.get_url()))
        bot.run()
        self.assertEqual(bot.data_processed, [1, 666, 2])
Example #46
    def test_schedule(self):
        """
        In this test I create a number of delayed task
        and then check the order in which they was executed
        """
        server = self.server

        class TestSpider(Spider):
            def task_generator(self):
                yield Task('page', url=server.get_url(), delay=1.5, num=3)
                yield Task('page', url=server.get_url(), delay=4.5, num=2)
                yield Task('page', url=server.get_url(), delay=3, num=4)
                yield Task('page', url=server.get_url(), num=1)

            def task_page(self, dummy_grab, task):
                self.stat.collect('numbers', task.num)

        bot = build_spider(TestSpider, thread_number=1)
        self.setup_queue(bot)
        bot.run()
        self.assertEqual(bot.stat.collections['numbers'], [1, 3, 4, 2])
Example #47
    def test_fallback_handler_by_fallback_name(self):
        class TestSpider(Spider):
            def prepare(self):
                self.points = []

            def task_page(self, grab, task):
                pass

            def fallback_zz(self, task):
                self.points.append(1)

        self.server.response['code'] = 403

        bot = build_spider(TestSpider, network_try_limit=1)
        bot.setup_queue()
        bot.add_task(
            Task('page',
                 url=self.server.get_url(),
                 fallback_name='fallback_zz'))
        bot.run()
        self.assertEqual(bot.points, [1])
Example #48
    def test_spider_custom_proxy_source(self):
        class TestSpider(Spider):
            def task_page(self, grab, task):
                self.stat.collect(
                    'ports', int(grab.response.headers.get('Listen-Port', 0)))

        class CustomProxySource(BaseProxySource):
            def load(self):
                return [
                    Proxy(ADDRESS, TEST_SERVER_PORT, None, None, 'http'),
                ]

        bot = build_spider(TestSpider)
        bot.setup_queue()
        bot.load_proxylist(CustomProxySource())
        bot.add_task(Task('page', url='http://yandex.ru/'))
        bot.run()

        self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
        self.assertEqual(set(bot.stat.collections['ports']),
                         set([TEST_SERVER_PORT]))
Example #49
    def test_task_queue_clear(self):
        class TestSpider(Spider):
            def task_page(self, grab, task):
                self.stop()

            def task_keyboard_interrupt_page(self, grab, task):
                raise KeyboardInterrupt

        bot = build_spider(TestSpider)
        bot.setup_queue()
        for _ in six.moves.range(5):
            bot.add_task(Task('page', url=self.server.get_url()))
        self.assertEqual(5, bot.task_queue.size())
        bot.run()
        self.assertEqual(0, bot.task_queue.size())

        for _ in six.moves.range(5):
            bot.add_task(
                Task('keyboard_interrupt_page', url=self.server.get_url()))
        self.assertEqual(5, bot.task_queue.size())
        bot.run()
        self.assertEqual(0, bot.task_queue.size())
Example #50
    def test_no_warning(self):
        """Simple spider should not generate
        any warnings (warning module sends messages to stderr)
        """
        out = StringIO()
        with mock.patch('sys.stderr', out):
            server = self.server
            server.response['data'] = b'<div>test</div>'

            class SimpleSpider(Spider):
                # pylint: disable=unused-argument
                initial_urls = [server.get_url()]

                def task_initial(self, grab, task):
                    yield Task('more', url=server.get_url())

                def task_more(self, grab, task):
                    grab.doc('//div').text()

            bot = build_spider(SimpleSpider)
            bot.run()
        self.assertEqual('', out.getvalue())
Example #51
    def test_has_item(self):
        self.server.response['get.data'] = ContentGenerator(self.server)

        class TestSpider(Spider):
            def task_page(self, grab, task):
                pass

        bot = build_spider(TestSpider)
        self.setup_cache(bot)
        bot.cache_pipeline.cache.clear()
        bot.setup_queue()
        bot.add_task(Task('page', url=self.server.get_url()))
        bot.add_task(Task('page', url=self.server.get_url('/foo')))
        bot.run()
        self.assertTrue(bot.cache_pipeline.cache
                        .has_item(self.server.get_url()))
        self.assertTrue(bot.cache_pipeline.cache
                        .has_item(self.server.get_url(), timeout=100))
        self.assertFalse(bot.cache_pipeline.cache
                         .has_item(self.server.get_url(), timeout=0))
        self.assertTrue(bot.cache_pipeline.cache
                        .has_item(self.server.get_url('/foo')))
        self.assertFalse(bot.cache_pipeline.cache
                         .has_item(self.server.get_url('/bar')))
Example #52
    def test_create_grab_instance(self):
        class TestSpider(Spider):
            def create_grab_instance(self, **kwargs):
                grab = super(TestSpider, self).create_grab_instance(**kwargs)
                grab.setup(timeout=77)
                return grab

            def task_generator(self):
                yield Task('page', url=self.meta['server'].get_url())
                yield Task('page', grab=Grab(url=self.meta['server'].get_url(),
                                             timeout=76))

            def task_page(self, grab, unused_task):
                self.stat.collect('points', grab.config['timeout'])

        bot = build_spider(TestSpider, meta={'server': self.server})
        bot.setup_queue()
        bot.add_task(Task('page', url=self.server.get_url()))
        bot.add_task(Task('page', grab=Grab(url=self.server.get_url(),
                                            timeout=75)))
        bot.run()
        self.assertEqual(set([77, 76, 75]),
                         set(bot.stat.collections['points']))
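Taken together with test_update_grab_instance (Example #40), the two hooks differ in reach: create_grab_instance only affects Grab objects the spider builds itself, so per-task grab= instances keep their own config (hence 76 and 75 survive above), while update_grab_instance is applied to every Grab and overrides per-task settings. A sketch combining both; the timeout and user_agent values are arbitrary:

from grab.spider import Spider

class HookedSpider(Spider):
    def create_grab_instance(self, **kwargs):
        # Runs only for Grab objects the spider creates itself; tasks
        # built with an explicit grab= keep their own configuration.
        grab = super(HookedSpider, self).create_grab_instance(**kwargs)
        grab.setup(timeout=10)
        return grab

    def update_grab_instance(self, grab):
        # Runs for every Grab, including ones passed via Task(grab=...),
        # so settings applied here win over per-task configuration.
        grab.setup(user_agent='hooked-spider')

    def task_page(self, grab, task):
        pass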
Example #53
    def test_bug1(self):
        # Test the bug:
        # * enable the cache
        # * fetch a document (it goes to the cache)
        # * request the same URL
        # * get an exception

        server = self.server

        class Bug1Spider(Spider):
            def task_foo(self, grab, task):
                grab.setup(url=server.get_url())
                yield Task('bar', grab=grab)

            def task_bar(self, grab, task):
                pass

        bot = build_spider(Bug1Spider)
        self.setup_cache(bot)
        bot.cache.clear()
        bot.setup_queue()
        bot.add_task(Task('foo', self.server.get_url()))
        bot.run()
Example #54
    def test_inline_task(self):
        def callback(self):
            self.write(self.request.uri)
            self.finish()

        self.server.response['get.callback'] = callback

        server = self.server

        class TestSpider(Spider):
            def add_response(self, grab):
                self.stat.collect('responses', grab.doc.unicode_body())

            def task_generator(self):
                url = server.get_url('/?foo=start')
                yield Task('inline', url=url)

            def subroutine_task(self, grab):
                for x in six.moves.range(2):
                    url = server.get_url('/?foo=subtask%s' % x)
                    grab.setup(url=url)
                    grab = yield Task(grab=grab)
                    self.add_response(grab)
                    self.stat.collect('calls', 'subinline%s' % x)

            @inline_task
            def task_inline(self, grab, task):
                self.add_response(grab)
                self.stat.collect('calls', 'generator')

                for x in six.moves.range(3):
                    url = server.get_url('/?foo=%s' % x)
                    grab.setup(url=url)
                    grab = yield Task(grab=grab)

                    self.add_response(grab)
                    self.stat.collect('calls', 'inline%s' % x)

                    grab = yield self.subroutine_task(grab)
                    # In this case the grab body will be the same
                    # as in the subroutine task: /?foo=subtask1
                    self.add_response(grab)

                url = server.get_url('/?foo=yield')
                self.add_task(Task('yield', url=url))

            def task_yield(self, grab, task):
                self.add_response(grab)
                self.stat.collect('calls', 'yield')

                url = server.get_url('/?foo=end')
                yield Task('end', url=url)

            def task_end(self, grab, task):
                self.add_response(grab)
                self.stat.collect('calls', 'end')

        bot = build_spider(TestSpider)
        bot.run()

        self.assertEqual([
            '/?foo=start', '/?foo=0', '/?foo=subtask0', '/?foo=subtask1',
            '/?foo=subtask1', '/?foo=1', '/?foo=subtask0', '/?foo=subtask1',
            '/?foo=subtask1', '/?foo=2', '/?foo=subtask0', '/?foo=subtask1',
            '/?foo=subtask1', '/?foo=yield', '/?foo=end'
        ], bot.stat.collections['responses'])
        self.assertEqual([
            'generator', 'inline0', 'subinline0', 'subinline1', 'inline1',
            'subinline0', 'subinline1', 'inline2', 'subinline0', 'subinline1',
            'yield', 'end'
        ], bot.stat.collections['calls'])
Example #56
    def test_integrity_decorator_in_mp_mode(self):
        bot = build_spider(self.SimpleSpider)
        bot.setup_queue()
        bot.add_task(Task('page2', self.server.get_url()))
        bot.run()
Example #57
    def test_setup_proxylist(self):
        content = '%s\n%s\n%s' % (PROXY1, PROXY2, PROXY3)
        with open('/tmp/__proxy.txt', 'w') as out:
            out.write(content)

        # Simple test, one task
        bot = build_spider(SimpleSpider, thread_number=1)
        bot.load_proxylist('/tmp/__proxy.txt', 'text_file')
        bot.setup_queue()
        bot.add_task(Task('baz', grab=Grab(url='http://yandex.ru',
                                           debug=True)))
        bot.run()

        self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
        self.assertEqual(1, len(set(bot.stat.collections['ports'])))

        # By default auto_change is True
        bot = build_spider(SimpleSpider, thread_number=1)
        bot.load_proxylist('/tmp/__proxy.txt', 'text_file')
        bot.setup_queue()
        for x in six.moves.range(10):
            bot.add_task(Task('baz', 'http://yandex.ru'))
        bot.run()

        self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
        self.assertTrue(len(set(bot.stat.collections['ports'])) > 1)

        # Do the same test with the load_proxylist method
        bot = build_spider(SimpleSpider, thread_number=1)
        bot.load_proxylist('/tmp/__proxy.txt', 'text_file')
        bot.setup_queue()
        for x in six.moves.range(10):
            bot.add_task(Task('baz', 'http://yandex.ru'))
        bot.run()

        self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
        self.assertTrue(len(set(bot.stat.collections['ports'])) > 1)

        # Disable auto_change
        # By default auto_init is True
        bot = build_spider(SimpleSpider, thread_number=1)
        bot.load_proxylist('/tmp/__proxy.txt', 'text_file', auto_change=False)
        bot.setup_queue()
        for x in six.moves.range(10):
            bot.add_task(Task('baz', 'http://yandex.ru'))
        bot.run()

        self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
        self.assertEqual(1, len(set(bot.stat.collections['ports'])))

        # Disable auto_change
        # Disable auto_init
        # Proxylist will not be used by default
        bot = build_spider(SimpleSpider, thread_number=1)
        bot.load_proxylist('/tmp/__proxy.txt',
                           'text_file',
                           auto_change=False,
                           auto_init=False)
        bot.setup_queue()
        for x in six.moves.range(10):
            bot.add_task(Task('baz', self.server.get_url()))
        bot.run()

        self.assertEqual(self.server.request['headers'].get('host'),
                         '%s:%s' % (ADDRESS, self.server.port))
        self.assertEqual(1, len(set(bot.stat.collections['ports'])))
        self.assertEqual(bot.stat.collections['ports'][0], self.server.port)
Example #58
    def test_clear_collection(self):
        bot = build_spider(self.SimpleSpider)
        self.setup_queue(bot)
        bot.task_queue.clear()
Example #59
    def test_task_queue_render_stats(self):
        bot = build_spider(self.SimpleSpider)
        bot.render_stats()
Example #60
    def test_stop_timer_invalid_input(self):
        class TestSpider(Spider):
            pass

        bot = build_spider(TestSpider)
        self.assertRaises(KeyError, bot.timer.stop, 'zzz')