Example #1
    def test_race_condition_with_timeout(self):
        urls = []
        search_string = 'success'

        path = '/timed_out_success'
        self.httpd.responses[path] = sleep_func(search_string, 4)
        urls.append(self.path_to_url(path))

        path = '/slow_success'
        self.httpd.responses[path] = sleep_func(search_string, 2)
        urls.append(self.path_to_url(path))

        for i in range(5):
            path = '/not_success_{0}'.format(i)
            self.httpd.responses[path] = 'nothing to see here'
            urls.append(self.path_to_url(path))

        path = '/fast_success'
        self.httpd.responses[path] = search_string
        expected_result = self.path_to_url(path)
        urls.append(expected_result)

        callback = contains_callback(search_string)

        result = solution.crawler(iter(urls), callback, 3)
        self.assertEqual(result, expected_result)
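
The helpers `sleep_func` and `contains_callback` are used throughout these examples but never shown. A minimal sketch of what they plausibly look like, inferred from how the tests use them (the handler shape mirrors the `with_sleep` helpers defined inline in Examples #2 and #3):

import time


def sleep_func(body, seconds):
    # Hypothetical factory: builds a request handler that waits `seconds`
    # before writing `body`, so tests can simulate slow responses.
    def handler(hndl):
        hndl.send_response(200)
        hndl.end_headers()
        time.sleep(seconds)
        hndl.wfile.write(bytes(body, 'UTF-8'))
    return handler


def contains_callback(needle):
    # Hypothetical factory: builds the predicate the crawler calls with
    # each fetched page body; it matches when `needle` occurs in the body.
    def callback(body):
        return needle in body
    return callback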
Example #2
    def test_if_concurrent_with_sleep_in_server(self):
        '''
        Every slow response takes ~0.5 seconds. A solution that is not
        concurrent and tries the URLs one at a time would run for more
        than 2 seconds, since there are 5 slow URLs in the work queue
        (5 x 0.5 s = 2.5 s). With 5 workers the fetches overlap, so the
        whole run should stay under 1 second.
        '''
        urls = []
        concurrency = 5
        
        def with_sleep(hndl):
            hndl.send_response(404)
            hndl.end_headers()
            time.sleep(0.5)
            hndl.wfile.write(bytes('not what you are searching for', 'UTF-8'))

        for i in range(concurrency):
            path = '/not-there-{0}'.format(i)
            self.httpd.responses[path] = with_sleep
            urls.append(self.path_to_url(path))

        success_path = '/bingo'
        success_url = self.path_to_url(success_path)

        self.httpd.responses[success_path] = 'I made it in less than 5 seconds'
        urls.append(success_url)

        callback = contains_callback('less than 5')

        start = time.time()
        result = solution.crawler(iter(urls), callback, concurrency)
        took = time.time() - start

        self.assertEqual(result, success_url)
        msg = 'This crawler is probably not a concurrent one.'
        self.assertTrue(took < 1, msg)
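
The timing bound only holds when the slow fetches overlap. A minimal sketch of the fan-out pattern this test rewards, using `concurrent.futures` (an illustration of the idea, not the graded `solution.crawler`):

from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.error import URLError
from urllib.request import urlopen


def fetch(url):
    # Failed fetches and non-2xx statuses count as "no match" here;
    # urlopen raises HTTPError (a URLError subclass) for 4xx/5xx codes.
    try:
        with urlopen(url, timeout=5) as resp:
            return resp.read().decode('utf-8', errors='replace')
    except (URLError, OSError, ValueError):
        return ''


def find_first_match(urls, callback, workers):
    # With `workers` parallel fetches, five 0.5 s responses cost ~0.5 s
    # of wall time instead of the ~2.5 s a serial loop would need.
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {pool.submit(fetch, url): url for url in urls}
        for done in as_completed(futures):
            if callback(done.result()):
                return futures[done]
    return None

Note that this eager version submits every URL up front, which the "crawling stops after successful callback" test in Example #13 forbids; a complete solution has to hand URLs to workers lazily.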
Example #3
    def test_timeout_value(self):
        urls = []
        concurrency = 2
        
        def with_sleep(hndl):
            hndl.send_response(200)
            hndl.end_headers()
            time.sleep(4)
            hndl.wfile.write(bytes('I made it in less than 5 seconds', 'UTF-8'))

        for i in range(concurrency):
            path = '/timeout-{0}'.format(i)
            self.httpd.responses[path] = with_sleep
            urls.append(self.path_to_url(path))

        success_path = '/bingo'
        success_url = self.path_to_url(success_path)

        self.httpd.responses[success_path] = 'I made it in less than 5 seconds'
        urls.append(success_url)

        callback = contains_callback('less than 5')

        start = time.time()
        result = solution.crawler(iter(urls), callback, concurrency)
        took = time.time() - start

        self.assertEqual(result, success_url, 'Crawler did not time out')
        msg = 'This crawler is probably not a concurrent one.'
        self.assertTrue(took < 5, msg)
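
The slow handlers above send their headers immediately but hold the body back for 4 seconds, and that body also contains the search string, so without a per-request timeout the crawler would match one of the `/timeout-*` URLs first and fail the `assertEqual`. A sketch of the idea; the 3-second value is an assumption (anything below 4 s that keeps the whole run under 5 s works):

import socket
from urllib.request import urlopen


def fetch_with_timeout(url, timeout=3):
    # The timeout applies to the underlying socket, so it also fires when
    # a server sends headers quickly but stalls while writing the body.
    try:
        with urlopen(url, timeout=timeout) as resp:
            return resp.read().decode('utf-8', errors='replace')
    except (socket.timeout, OSError, ValueError):
        return None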
Example #4
    def test_with_non_2xx_responses(self):

        search_for = 'some string'
        server_response = 'There should be some string in here'

        def create_err_handler(status_code):
            def error_handler(hndl):
                hndl.send_response(status_code)
                hndl.end_headers()
            return error_handler
        
        def return_409_and_the_string(hndl):
            hndl.send_response(409)
            hndl.end_headers()
            hndl.wfile.write(bytes(server_response, 'UTF-8'))

        self.httpd.responses = {
            '/500': create_err_handler(500),
            '/504': create_err_handler(504),
            '/401': create_err_handler(401),
            '/416': create_err_handler(416),
            '/409': return_409_and_the_string,
            '/bingo': server_response,
        }

        urls = [self.path_to_url(path) for path in self.httpd.responses]

        result = solution.crawler(iter(urls), contains_callback(search_for), 5)
        self.assertEqual(result, self.path_to_url('/bingo'))
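
`/409` serves the search string as well, yet `/bingo` is the only accepted result, so bodies that arrive with a non-2xx status evidently must not be handed to the callback. A sketch of that behavior (not necessarily how the graded solution does it):

from urllib.error import HTTPError
from urllib.request import urlopen


def fetch_if_ok(url, timeout=3):
    # HTTPError doubles as a file-like response, so its body *could* be
    # read; this test demands that the crawler does not do that.
    try:
        with urlopen(url, timeout=timeout) as resp:
            if 200 <= resp.status < 300:
                return resp.read().decode('utf-8', errors='replace')
    except HTTPError:
        pass            # 4xx/5xx: discard the body, even if it matches
    except (OSError, ValueError):
        pass            # unreachable hosts, malformed URLs, timeouts
    return None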
Example #5
    def test_wrong_arguments_passed(self):
        with self.assertRaises(Exception):
            solution.crawler(iter([]), always_false, -1)

        with self.assertRaises(Exception):
            solution.crawler(iter([]), always_false, 0)

        with self.assertRaises(Exception):
            solution.crawler(iter(['http://google.com']), None, 3)

        with self.assertRaises(Exception):
            solution.crawler(None, always_false, 3)
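
These calls only assert that *some* exception escapes, so plain guard clauses at the top of `crawler` are enough. A sketch of checks covering the four cases (the exception types are a free choice here):

def validate_arguments(urls, callback, workers_count):
    # Hypothetical guards mirroring the four assertRaises cases above.
    if urls is None:
        raise TypeError('urls must be an iterator, not None')
    if not callable(callback):
        raise TypeError('callback must be callable')
    if workers_count < 1:
        raise ValueError('workers_count must be a positive integer')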
Example #6
    def test_no_result(self):

        contains_luck = contains_callback('You are lucky, FMI python 2015')

        urls = map(self.path_to_url, ["/1", "/2", "wrong.url!", "/3", "/5",])

        result = solution.crawler(iter(urls), contains_luck, 4)

        self.assertIsNone(result, ('Did not expect the string "You are '
                                   'lucky" at any of the URLs'))
Example #7
    def test_with_many_wrong_urls(self):
        success_path = '/success'
        self.httpd.responses[success_path] = 'first not wrong url'

        urls = [self.path_to_url('/not-found-{0}'.format(i)) for i in range(15)]

        urls += ["wrong.url!", "python://almost-an-URL", "http://almost-a-domain"]

        success_url = self.path_to_url(success_path)
        urls.append(success_url)

        callback = contains_callback('not wrong')
        result = solution.crawler(iter(urls), callback, 5)
        self.assertEqual(result, success_url)
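
The three malformed entries fail in different ways: `urllib.request.urlopen` raises `ValueError` for a string with no scheme, `URLError` for an unknown scheme, and `URLError` (wrapping a DNS failure) for a host that does not resolve. A quick way to observe this, and the reason every fetch needs a broad per-URL `except`:

from urllib.error import URLError
from urllib.request import urlopen

for url in ["wrong.url!", "python://almost-an-URL", "http://almost-a-domain"]:
    try:
        urlopen(url, timeout=1)
    except (ValueError, URLError, OSError) as exc:
        # The crawler must swallow these and move on to the next URL.
        print('{0} -> {1}'.format(url, type(exc).__name__))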
Example #8
    def test_simple_operation(self):
        self.httpd.responses = {
            '/path1': 'Hello, lonely! How are you today?'
        }
        success_url = self.path_to_url('/path1')

        work_urls = [
            self.path_to_url('/path-not-here'),
            success_url,
            self.path_to_url('/ops'),
        ]

        callback = contains_callback('How are you today?')
        result = solution.crawler(iter(work_urls), callback, 3)
        self.assertEqual(result, success_url)
Example #9
    def test_crawler_starts_no_more_than_expected_workers(self):

        urls = []

        for i in range(5):
            path = '/not_success_{0}'.format(i)
            self.httpd.responses[path] = sleep_func('nothing to see here', 0.100)
            urls.append(self.path_to_url(path))

        start = time.time()
        result = solution.crawler(iter(urls), always_false, 4)
        took = time.time() - start

        self.assertIsNone(result)
        # If the run took less than 200 ms it means more than 4 workers were
        # fetching at once: 5 URLs at ~100 ms each with at most 4 workers
        # need at least two rounds, i.e. at least ~200 ms in total.
        self.assertTrue(took >= 0.200, 'crawler spawned more workers than expected')
Example #10
    def test_when_callback_never_returns_true(self):

        self.httpd.responses = {
            '/500': 'cuddly',
            '/504': 'little',
            '/401': 'cat',
            '/416': 'which',
            '/409': 'eats',
            '/bingo': 'mice',
            '/355': 'but',
            '/356': 'likes',
            '/357': 'fish',
            '/358': 'too',
        }

        urls = [self.path_to_url(path) for path in self.httpd.responses]
        result = solution.crawler(iter(urls), always_false, 3)
        self.assertIsNone(result)
Example #11
    def test_only_one_worker(self):

        self.httpd.responses = {
            '/500': 'cuddly',
            '/504': 'little',
            '/401': 'cat',
            '/416': 'which',
            '/409': 'eats',
            '/bingo': 'mice',
            '/355': 'but',
            '/356': 'likes',
            '/357': 'fish',
            '/358': 'too',
        }

        urls = [self.path_to_url(path) for path in self.httpd.responses]
        result = solution.crawler(iter(urls), contains_callback('mice'), 1)
        self.assertEqual(result, self.path_to_url('/bingo'))
Example #12
    def test_race_condition(self):
        urls = []
        search_string = 'success'

        path = '/failure'
        self.httpd.responses[path] = 'failure'
        urls.append(self.path_to_url(path))

        path = '/slow_success'
        self.httpd.responses[path] = sleep_func(search_string, 1)
        urls.append(self.path_to_url(path))

        path = '/fast_success'
        self.httpd.responses[path] = sleep_func(search_string, 0.5)
        expected_result = self.path_to_url(path)
        urls.append(expected_result)

        callback = contains_callback(search_string)

        result = solution.crawler(iter(urls), callback, 3)
        self.assertEqual(result, expected_result)
Example #13
    def test_crawling_stops_after_successful_callback(self):
        search_for = 'Ame-no-Murakumo-no-Tsurugi'
        server_response = 'Ame-no-Murakumo-no-Tsurugi was given to the warrior'
        success_path = '/kusanagi'

        touched = []
        not_touched = []
        urls = []

        for i in range(10):
            path = '/touched-{0}'.format(i)
            self.httpd.responses[path] = 'Nothing to see here'
            urls.append(self.path_to_url(path))
            touched.append(path)

        self.httpd.responses[success_path] = server_response
        urls.append(self.path_to_url(success_path))
        touched.append(success_path)

        for i in range(10):
            path = '/not-touched-{0}'.format(i)
            self.httpd.responses[path] = 'Nothing to see here'
            urls.append(self.path_to_url(path))
            not_touched.append(path)

        result = solution.crawler(iter(urls), contains_callback(search_for), 2)
        self.assertEqual(result, self.path_to_url(success_path))

        for path in touched:
            self.assertTrue(self.httpd.is_touched(path), 
                'crawler did not check {0}'.format(path))

        # Up to 2 (workers_count) of these URLs may still have been visited
        # before the crawler returned.
        for path in not_touched[2:]:
            self.assertFalse(self.httpd.is_touched(path),
                'crawler checked `{0}` when it was not supposed to'.format(path))
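
Honoring this constraint means workers must pull URLs from the shared iterator one at a time and re-check a stop flag before each pull. A minimal sketch of that coordination with `threading`, where `fetch` stands for any of the fetch helpers sketched earlier (an illustration, not the graded solution):

import threading


def crawl_lazily(urls, callback, workers_count, fetch):
    lock = threading.Lock()
    found = threading.Event()
    matches = []

    def worker():
        while not found.is_set():
            with lock:                      # one worker at a time pulls
                url = next(urls, None)      # the next URL, lazily
            if url is None:
                return
            body = fetch(url)
            if body is not None and callback(body):
                matches.append(url)
                found.set()                 # tell the other workers to stop
    threads = [threading.Thread(target=worker) for _ in range(workers_count)]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    return matches[0] if matches else None

As the comment above notes, up to `workers_count` URLs can already be in flight at the moment the flag is set, which is exactly the slack the `not_touched[2:]` slice allows for.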
Example #14
    def test_empty_iterator(self):
        result = solution.crawler(iter([]), always_true, 4)
        self.assertIsNone(result)