Exemplo n.º 1
0
 def test_directory_already_exists(self):
     """Check that `start_project.main` raises `GrabError` when a directory
     named after the project already exists in the working directory.
     """
     with temp_dir() as tmp_dir:
         old_dir = os.getcwd()
         os.chdir(tmp_dir)
         try:
             # Pre-create the directory the project would be generated into
             os.mkdir(os.path.join(tmp_dir, "test_project"))
             self.assertRaises(GrabError, start_project.main,
                               project_name="test_project", template=None)
         finally:
             # Restore the working directory even if the assertion fails,
             # so the chdir does not leak into the tests that run next.
             os.chdir(old_dir)
Exemplo n.º 2
0
 def test_explicit_template_name(self):
     """Run `start_project.main` with an explicit template path and check
     that `test_project/foo.py` shows up in the temporary directory.
     """
     with temp_dir() as tmp_dir:
         old_dir = os.getcwd()
         os.chdir(tmp_dir)
         try:
             test_dir = os.path.dirname(__file__)
             project_sample_path = os.path.join(test_dir,
                                                "files/project_sample")
             start_project.main(project_name="test_project",
                                template=project_sample_path)
             # The path must be checked for existence: asserting on the
             # bare result of os.path.join() is always true because it
             # is a non-empty string.
             self.assertTrue(os.path.exists(
                 os.path.join(tmp_dir, "test_project/foo.py")))
         finally:
             # Restore the working directory even if something above fails
             os.chdir(old_dir)
Exemplo n.º 3
0
    def test_save_hash(self):
        """Test `Response.save_hash` method."""
        with temp_dir() as tmp_dir:
            # Read the fixture through a context manager so the file
            # handle is closed deterministically.
            with open(IMG_FILE, 'rb') as inp:
                img_data = inp.read()
            self.server.response['get.data'] = img_data

            g = build_grab()
            g.go(self.server.get_url())
            # The value returned by `save_hash` is joined onto `tmp_dir`
            # below to locate the saved file.
            path = g.response.save_hash(self.server.get_url(), tmp_dir)
            with open(os.path.join(tmp_dir, path), 'rb') as inp:
                test_data = inp.read()
            self.assertEqual(test_data, img_data)
Exemplo n.º 4
0
    def test_save(self):
        """Test `Response.save` method."""
        with temp_dir() as tmp_dir:
            # Read the fixture through a context manager so the file
            # handle is closed deterministically.
            with open(IMG_FILE, 'rb') as inp:
                img_data = inp.read()
            tmp_file = os.path.join(tmp_dir, 'file.bin')
            self.server.response['get.data'] = img_data

            g = build_grab()
            g.go(self.server.get_url())
            g.response.save(tmp_file)
            # The saved file must hold exactly the bytes the server sent
            with open(tmp_file, 'rb') as inp:
                saved_data = inp.read()
            self.assertEqual(saved_data, img_data)
Exemplo n.º 5
0
 def test_start_project(self):
     """Run `start_project.main` without a template and check the
     generated layout: `var`, `var/log`, `var/run` and a `spider.py`
     that mentions `TestProjectSpider`.
     """
     with temp_dir() as tmp_dir:
         old_dir = os.getcwd()
         os.chdir(tmp_dir)
         dir_ = os.path.join(tmp_dir, "test_project")
         try:
             start_project.main(project_name="test_project", template=None)
         finally:
             # Restore the working directory even if generation fails
             os.chdir(old_dir)
         # All paths below are built from `tmp_dir`, so the checks do not
         # depend on the current working directory.
         self.assertTrue(os.path.exists(os.path.join(dir_, "var")))
         self.assertTrue(os.path.exists(os.path.join(dir_, "var/log")))
         self.assertTrue(os.path.exists(os.path.join(dir_, "var/run")))
         path = os.path.join(dir_, "spider.py")
         with open(path) as inp:
             spider_source = inp.read()
         self.assertTrue("TestProjectSpider" in spider_source)
Exemplo n.º 6
0
    def test_log_option(self):
        """Check the `log_file` option: after one request the directory
        holds only `log.html` and its content equals the served body.
        """
        with temp_dir() as tmp_dir:
            reset_request_counter()

            log_file_path = os.path.join(tmp_dir, 'log.html')
            g = build_grab()
            g.setup(log_file=log_file_path)
            self.server.response['get.data'] = 'omsk'

            # Nothing may be written before the request is made
            self.assertEqual(os.listdir(tmp_dir), [])
            g.go(self.server.get_url())
            self.assertEqual(os.listdir(tmp_dir), ['log.html'])
            # Close the log file explicitly instead of leaking the handle
            with open(log_file_path) as inp:
                log_content = inp.read()
            self.assertEqual(log_content, 'omsk')
Exemplo n.º 7
0
    def test_log_option(self):
        """Check the `log_file` option: after one request the directory
        holds only `log.html` and its content equals the served body.
        """
        with temp_dir() as tmp_dir:
            reset_request_counter()

            log_file_path = os.path.join(tmp_dir, 'log.html')
            g = build_grab()
            g.setup(log_file=log_file_path)
            self.server.response['get.data'] = 'omsk'

            # Nothing may be written before the request is made
            self.assertEqual(os.listdir(tmp_dir), [])
            g.go(self.server.get_url())
            self.assertEqual(os.listdir(tmp_dir), ['log.html'])
            # Close the log file explicitly instead of leaking the handle
            with open(log_file_path) as inp:
                log_content = inp.read()
            self.assertEqual(log_content, 'omsk')
Exemplo n.º 8
0
    def test_log_dir_request_content_is_empty(self):
        """Check that `01.log` contains neither the custom request header
        nor the POST data when `debug` is not passed to `setup`.
        """
        with temp_dir() as tmp_dir:
            reset_request_counter()

            g = build_grab()
            g.setup(log_dir=tmp_dir)
            g.setup(headers={'X-Name': 'spider'}, post='xxxPost')

            # Nothing may be written before the request is made
            self.assertEqual(os.listdir(tmp_dir), [])
            g.go(self.server.get_url())
            self.assertEqual(sorted(os.listdir(tmp_dir)), ['01.html', '01.log'])
            # Close the log file explicitly instead of leaking the handle
            with open(os.path.join(tmp_dir, '01.log')) as inp:
                log_file_content = inp.read()
            # assertNotIn prints the log content when the check fails
            self.assertNotIn('X-Name', log_file_content)
            self.assertNotIn('xxxPost', log_file_content)
Exemplo n.º 9
0
    def test_log_dir_response_content(self):
        """Check that `01.log` mentions the `X-Engine` header configured
        on the test server's response.
        """
        with temp_dir() as tmp_dir:
            reset_request_counter()

            g = build_grab()
            g.setup(log_dir=tmp_dir)
            self.server.response['get.data'] = 'omsk'
            self.server.response['headers'] = [('X-Engine', 'PHP')]

            # Nothing may be written before the request is made
            self.assertEqual(os.listdir(tmp_dir), [])
            g.go(self.server.get_url())
            self.assertEqual(sorted(os.listdir(tmp_dir)), ['01.html', '01.log'])
            # Close the log file explicitly instead of leaking the handle
            with open(os.path.join(tmp_dir, '01.log')) as inp:
                log_file_content = inp.read()
            # Compare lower-cased so header-name casing does not matter
            self.assertIn('x-engine', log_file_content.lower())
Exemplo n.º 10
0
 def test_lxml_security_bug(self):
     """Parse XML whose DOCTYPE declares an external entity pointing at a
     local file and assert that the text of <root> equals that file's
     content.
     """
     with temp_dir() as tmp_dir:
         target_path = os.path.join(tmp_dir, 'injection')
         with open(target_path, 'w') as target_file:
             target_file.write('Hey there!')
         # Build a file:// URL valid for both linux and windows: drop the
         # leading slashes, then turn backslashes into forward slashes
         url_path = target_path.lstrip('/').replace('\\', '/')
         target_url = 'file:///%s' % url_path
         xml_parts = [
             '<!DOCTYPE external [',
             '<!ENTITY ee SYSTEM "', target_url, '">',
             ']>',
             '<root>&ee;</root>',
         ]
         payload = ''.join(xml_parts).encode()
         tree = parse(BytesIO(payload))
         root_text = tree.xpath('//root/text()')[0]
         self.assertEqual(root_text, 'Hey there!')
Exemplo n.º 11
0
    def test_log_dir_request_content_headers_and_post(self):
        """Check that, with `debug=True`, `01.log` contains the custom
        request header and the url-encoded POST data.
        """
        with temp_dir() as tmp_dir:
            reset_request_counter()

            g = build_grab()
            g.setup(log_dir=tmp_dir, debug=True)
            g.setup(headers={'X-Name': 'spider'}, post={'xxx': 'Post'})

            # Nothing may be written before the request is made
            self.assertEqual(os.listdir(tmp_dir), [])
            g.go(self.server.get_url())
            self.assertEqual(sorted(os.listdir(tmp_dir)), ['01.html', '01.log'])
            # Close the log file explicitly instead of leaking the handle
            with open(os.path.join(tmp_dir, '01.log')) as inp:
                log_file_content = inp.read()
            # assertIn prints the log content on failure, which replaces
            # the ad-hoc debug printing that used to be commented out here
            lowered_content = log_file_content.lower()
            self.assertIn('x-name', lowered_content)
            self.assertIn('xxx=post', lowered_content)
Exemplo n.º 12
0
    def test_log_dir_option(self):
        """Check the `log_dir` option: two requests produce numbered
        `.html`/`.log` pairs and each `.html` holds the body served for
        that request.
        """
        with temp_dir() as tmp_dir:
            reset_request_counter()

            g = build_grab()
            g.setup(log_dir=tmp_dir)
            self.server.response_once['get.data'] = 'omsk1'
            self.server.response['get.data'] = 'omsk2'

            # Nothing may be written before the first request is made
            self.assertEqual(os.listdir(tmp_dir), [])
            g.go(self.server.get_url())
            g.go(self.server.get_url())
            self.assertEqual(sorted(os.listdir(tmp_dir)),
                             ['01.html', '01.log', '02.html', '02.log'])
            # Read each dump through a context manager so no handle leaks
            with open(os.path.join(tmp_dir, '01.html')) as inp:
                first_body = inp.read()
            with open(os.path.join(tmp_dir, '02.html')) as inp:
                second_body = inp.read()
            self.assertEqual(first_body, 'omsk1')
            self.assertEqual(second_body, 'omsk2')
Exemplo n.º 13
0
    def test_log_dir_option(self):
        """Check the `log_dir` option: two requests produce numbered
        `.html`/`.log` pairs and each `.html` holds the body served for
        that request.
        """
        with temp_dir() as tmp_dir:
            reset_request_counter()

            g = build_grab()
            g.setup(log_dir=tmp_dir)
            self.server.response_once['get.data'] = 'omsk1'
            self.server.response['get.data'] = 'omsk2'

            # Nothing may be written before the first request is made
            self.assertEqual(os.listdir(tmp_dir), [])
            g.go(self.server.get_url())
            g.go(self.server.get_url())
            self.assertEqual(sorted(os.listdir(tmp_dir)),
                             ['01.html', '01.log', '02.html', '02.log'])
            # Read each dump through a context manager so no handle leaks
            with open(os.path.join(tmp_dir, '01.html')) as inp:
                first_body = inp.read()
            with open(os.path.join(tmp_dir, '02.html')) as inp:
                second_body = inp.read()
            self.assertEqual(first_body, 'omsk1')
            self.assertEqual(second_body, 'omsk2')
Exemplo n.º 14
0
    def test_log_dir_response_network_error(self):
        """Check that `01.html` and `01.log` exist, and that `01.log`
        records the User-Agent header, when the server is configured to
        sleep longer than the request timeout.
        """
        with temp_dir() as tmp_dir:
            reset_request_counter()

            g = build_grab()
            g.setup(log_dir=tmp_dir, timeout=1, user_agent='Perl', debug=True)
            self.server.response['get.data'] = 'omsk'
            self.server.response['headers'] = [('X-Engine', 'PHP')]
            # Server delay (2) exceeds the client timeout (1) set above
            self.server.response['sleep'] = 2

            self.assertEqual(os.listdir(tmp_dir), [])
            try:
                g.go(self.server.get_url())
            except GrabTimeoutError:
                # A timeout is tolerated on purpose: the test is about
                # what gets logged, not about the error itself.
                pass

            self.assertEqual(sorted(os.listdir(tmp_dir)),
                             ['01.html', '01.log'])
            # Close the log file explicitly instead of leaking the handle
            with open(os.path.join(tmp_dir, '01.log')) as inp:
                log_file_content = inp.read()
            self.assertIn('user-agent: perl', log_file_content.lower())
Exemplo n.º 15
0
 def test_grab_parse_defensedxml(self):
     """Load an XML file that declares an external entity through Grab
     and assert that calling `grab.doc('//title')` raises
     `EntitiesForbidden`.
     """
     with temp_dir() as tmp_dir:
         target_path = os.path.join(tmp_dir, 'injection')
         with open(target_path, 'w') as target_file:
             target_file.write('Hey there!')
         # Build a file:// URL valid for both linux and windows: drop the
         # leading slashes, then turn backslashes into forward slashes
         url_path = target_path.lstrip('/').replace('\\', '/')
         target_url = 'file:///%s' % url_path
         xml_parts = [
             '<!DOCTYPE external [',
             '<!ENTITY ee SYSTEM "', target_url, '">',
             ']>',
             '<root>&ee;</root>',
         ]
         payload = ''.join(xml_parts).encode()
         xml_file = os.path.join(tmp_dir, 'bad.xml')
         with open(xml_file, 'wb') as xml_out:
             xml_out.write(payload)
         grab = build_grab(content_type='xml')
         document_url = 'file://%s' % xml_file
         grab.go(document_url)
         self.assertRaises(EntitiesForbidden, grab.doc, '//title')
Exemplo n.º 16
0
    def test_log_dir_response_content_thread(self):
        """Check `log_dir` output for a request made from a non-main
        thread: two files with `01-thread` in their names are created
        and the `.log` one mentions the `X-Engine` response header.
        """
        with temp_dir() as tmp_dir:
            reset_request_counter()

            grab = build_grab()
            grab.setup(log_dir=tmp_dir)
            self.server.response['get.data'] = 'omsk'
            self.server.response['headers'] = [('X-Engine', 'PHP')]

            self.assertEqual(os.listdir(tmp_dir), [])

            def func():
                grab.go(self.server.get_url())
            # Perform the request in a separate thread and wait for it
            thread = threading.Thread(target=func)
            thread.start()
            thread.join()

            files = os.listdir(tmp_dir)
            self.assertEqual(2, len([x for x in files if '01-thread' in x]))
            fname = [x for x in files if x.endswith('.log')][0]
            # Close the log file explicitly instead of leaking the handle
            with open(os.path.join(tmp_dir, fname)) as inp:
                log_file_content = inp.read()
            self.assertIn('x-engine', log_file_content.lower())