def test_directory_already_exists(self):
    """Check that `start_project.main` raises GrabError for an existing dir.

    A directory named "test_project" is created inside a temporary
    directory that is made the current working directory before
    `start_project.main` is invoked with the same project name.
    """
    with temp_dir() as tmp_dir:
        old_dir = os.getcwd()
        os.chdir(tmp_dir)
        try:
            os.mkdir(os.path.join(tmp_dir, "test_project"))
            self.assertRaises(GrabError, start_project.main,
                              project_name="test_project", template=None)
        finally:
            # Restore the working directory even when the assertion fails,
            # so later tests do not run inside a directory that is about
            # to be removed.
            os.chdir(old_dir)
def test_explicit_template_name(self):
    """Check project creation from an explicitly given template path.

    The template is the "files/project_sample" directory located next to
    this test module; the generated project is expected to contain
    "foo.py".
    """
    with temp_dir() as tmp_dir:
        old_dir = os.getcwd()
        os.chdir(tmp_dir)
        try:
            test_dir = os.path.dirname(__file__)
            project_sample_path = os.path.join(test_dir,
                                               "files/project_sample")
            start_project.main(project_name="test_project",
                               template=project_sample_path)
        finally:
            # Restore the working directory even if project creation fails.
            os.chdir(old_dir)
        # The path string itself is always truthy, so the existence of the
        # file has to be checked explicitly for the assertion to mean
        # anything.
        self.assertTrue(
            os.path.exists(os.path.join(tmp_dir, "test_project/foo.py")))
def test_save_hash(self):
    """Test `Response.save_hash` method.

    The test server is configured to respond with the bytes of IMG_FILE;
    the file written by `save_hash` (its path is returned relative to the
    temporary directory) must contain exactly those bytes.
    """
    with temp_dir() as tmp_dir:
        with open(IMG_FILE, 'rb') as inp:
            img_data = inp.read()
        self.server.response['get.data'] = img_data
        g = build_grab()
        g.go(self.server.get_url())
        path = g.response.save_hash(self.server.get_url(), tmp_dir)
        # Close the handle explicitly so the temporary directory can be
        # removed cleanly afterwards.
        with open(os.path.join(tmp_dir, path), 'rb') as inp:
            test_data = inp.read()
        self.assertEqual(test_data, img_data)
def test_save(self):
    """Test `Response.save` method.

    The test server is configured to respond with the bytes of IMG_FILE;
    after `save` the target file must contain exactly those bytes.
    """
    with temp_dir() as tmp_dir:
        with open(IMG_FILE, 'rb') as inp:
            img_data = inp.read()
        tmp_file = os.path.join(tmp_dir, 'file.bin')
        self.server.response['get.data'] = img_data
        g = build_grab()
        g.go(self.server.get_url())
        g.response.save(tmp_file)
        # Close the handle explicitly so the temporary directory can be
        # removed cleanly afterwards.
        with open(tmp_file, 'rb') as inp:
            saved_data = inp.read()
        self.assertEqual(saved_data, img_data)
def test_start_project(self):
    """Check the layout of a project created without an explicit template.

    The generated "test_project" directory must contain "var", "var/log"
    and "var/run", and its "spider.py" must mention "TestProjectSpider".
    """
    with temp_dir() as tmp_dir:
        old_dir = os.getcwd()
        os.chdir(tmp_dir)
        try:
            start_project.main(project_name="test_project", template=None)
        finally:
            # Restore the working directory even if project creation fails.
            os.chdir(old_dir)
        dir_ = os.path.join(tmp_dir, "test_project")
        self.assertTrue(os.path.exists(os.path.join(dir_, "var")))
        self.assertTrue(os.path.exists(os.path.join(dir_, "var/log")))
        self.assertTrue(os.path.exists(os.path.join(dir_, "var/run")))
        path = os.path.join(dir_, "spider.py")
        with open(path) as inp:
            self.assertIn("TestProjectSpider", inp.read())
def test_log_option(self):
    """Check that the `log_file` option stores the response body in a file.

    The directory must be empty before the request and contain only
    "log.html", holding the served body, after it.
    """
    with temp_dir() as tmp_dir:
        reset_request_counter()
        log_file_path = os.path.join(tmp_dir, 'log.html')
        g = build_grab()
        g.setup(log_file=log_file_path)
        self.server.response['get.data'] = 'omsk'

        self.assertEqual(os.listdir(tmp_dir), [])
        g.go(self.server.get_url())
        self.assertEqual(os.listdir(tmp_dir), ['log.html'])
        # Close the handle explicitly so the temporary directory can be
        # removed cleanly afterwards.
        with open(log_file_path) as inp:
            self.assertEqual(inp.read(), 'omsk')
def test_log_dir_request_content_is_empty(self):
    """Check that request details are not logged when `debug` is not set.

    With only `log_dir` configured, "01.log" must contain neither the
    custom request header name nor the POST payload.
    """
    with temp_dir() as tmp_dir:
        reset_request_counter()
        g = build_grab()
        g.setup(log_dir=tmp_dir)
        g.setup(headers={'X-Name': 'spider'}, post='xxxPost')

        self.assertEqual(os.listdir(tmp_dir), [])
        g.go(self.server.get_url())
        self.assertEqual(sorted(os.listdir(tmp_dir)),
                         ['01.html', '01.log'])
        with open(os.path.join(tmp_dir, '01.log')) as inp:
            log_file_content = inp.read()
        self.assertNotIn('X-Name', log_file_content)
        self.assertNotIn('xxxPost', log_file_content)
def test_log_dir_response_content(self):
    """Check that response headers are written to the `log_dir` log file.

    The server is configured to send an "X-Engine" header; its name must
    appear (case-insensitively) in "01.log".
    """
    with temp_dir() as tmp_dir:
        reset_request_counter()
        g = build_grab()
        g.setup(log_dir=tmp_dir)
        self.server.response['get.data'] = 'omsk'
        self.server.response['headers'] = [('X-Engine', 'PHP')]

        self.assertEqual(os.listdir(tmp_dir), [])
        g.go(self.server.get_url())
        self.assertEqual(sorted(os.listdir(tmp_dir)),
                         ['01.html', '01.log'])
        with open(os.path.join(tmp_dir, '01.log')) as inp:
            log_file_content = inp.read()
        self.assertIn('x-engine', log_file_content.lower())
def test_lxml_security_bug(self):
    """Check that `parse` expands an external entity pointing to a file.

    A document declaring the external entity `ee` (a file:// URL of a
    local file) and referencing it inside <root> is parsed; the text of
    <root> is expected to equal the content of that file.
    """
    with temp_dir() as tmp_dir:
        injection_path = os.path.join(tmp_dir, 'injection')
        with open(injection_path, 'w') as out:
            out.write('Hey there!')
        # Prepare file:// URL valid for both linux and windows
        injection_url = 'file:///%s' % (injection_path.lstrip('/')
                                        .replace('\\', '/'))
        bad_xml = (
            '<!DOCTYPE external ['
            '<!ENTITY ee SYSTEM "' + injection_url + '">'
            ']>'
            # Must be the XML entity reference `&ee;`: it is what pulls
            # the external file content into the document.
            '<root>&ee;</root>'
        ).encode()
        tree = parse(BytesIO(bad_xml))
        self.assertEqual(tree.xpath('//root/text()')[0], 'Hey there!')
def test_log_dir_request_content_headers_and_post(self):
    """Check that request headers and POST data are logged in debug mode.

    With `log_dir` and `debug=True` configured, "01.log" must contain
    (case-insensitively) the custom header name and the url-encoded POST
    pair "xxx=post".
    """
    with temp_dir() as tmp_dir:
        reset_request_counter()
        g = build_grab()
        g.setup(log_dir=tmp_dir, debug=True)
        g.setup(headers={'X-Name': 'spider'}, post={'xxx': 'Post'})

        self.assertEqual(os.listdir(tmp_dir), [])
        g.go(self.server.get_url())
        self.assertEqual(sorted(os.listdir(tmp_dir)),
                         ['01.html', '01.log'])
        with open(os.path.join(tmp_dir, '01.log')) as inp:
            log_file_content = inp.read().lower()
        # assertIn reports the searched content on failure, so no extra
        # debug output is needed.
        self.assertIn('x-name', log_file_content)
        self.assertIn('xxx=post', log_file_content)
def test_log_dir_option(self):
    """Check that `log_dir` stores numbered files for each request.

    Two requests are made: the first is answered with "omsk1" (configured
    via `response_once`), the second with "omsk2". The directory must then
    hold "01.html"/"01.log" and "02.html"/"02.log", with the .html files
    containing the respective bodies.
    """
    # NOTE(review): a test with this same name is defined more than once
    # in this file; if both live in one class the later one shadows the
    # earlier -- confirm and drop the duplicate.
    with temp_dir() as tmp_dir:
        reset_request_counter()
        g = build_grab()
        g.setup(log_dir=tmp_dir)
        self.server.response_once['get.data'] = 'omsk1'
        self.server.response['get.data'] = 'omsk2'

        self.assertEqual(os.listdir(tmp_dir), [])
        g.go(self.server.get_url())
        g.go(self.server.get_url())
        self.assertEqual(sorted(os.listdir(tmp_dir)),
                         ['01.html', '01.log', '02.html', '02.log'])
        with open(os.path.join(tmp_dir, '01.html')) as inp:
            self.assertEqual(inp.read(), 'omsk1')
        with open(os.path.join(tmp_dir, '02.html')) as inp:
            self.assertEqual(inp.read(), 'omsk2')
def test_log_dir_option(self):
    """Check that `log_dir` stores numbered files for each request.

    Two requests are made: the first is answered with "omsk1" (configured
    via `response_once`), the second with "omsk2". The directory must then
    hold "01.html"/"01.log" and "02.html"/"02.log", with the .html files
    containing the respective bodies.
    """
    # NOTE(review): a test with this same name is defined more than once
    # in this file; if both live in one class the later one shadows the
    # earlier -- confirm and drop the duplicate.
    with temp_dir() as tmp_dir:
        reset_request_counter()
        g = build_grab()
        g.setup(log_dir=tmp_dir)
        self.server.response_once['get.data'] = 'omsk1'
        self.server.response['get.data'] = 'omsk2'

        self.assertEqual(os.listdir(tmp_dir), [])
        g.go(self.server.get_url())
        g.go(self.server.get_url())
        self.assertEqual(sorted(os.listdir(tmp_dir)),
                         ['01.html', '01.log', '02.html', '02.log'])
        with open(os.path.join(tmp_dir, '01.html')) as inp:
            self.assertEqual(inp.read(), 'omsk1')
        with open(os.path.join(tmp_dir, '02.html')) as inp:
            self.assertEqual(inp.read(), 'omsk2')
def test_log_dir_response_network_error(self):
    """Check that log files are written even when the request times out.

    The client timeout is 1 while the server response is configured with
    `sleep` = 2. Regardless of the outcome of the request, "01.html" and
    "01.log" must exist and the log must contain the configured
    User-Agent header.
    """
    with temp_dir() as tmp_dir:
        reset_request_counter()
        g = build_grab()
        g.setup(log_dir=tmp_dir, timeout=1, user_agent='Perl', debug=True)
        self.server.response['get.data'] = 'omsk'
        self.server.response['headers'] = [('X-Engine', 'PHP')]
        self.server.response['sleep'] = 2

        self.assertEqual(os.listdir(tmp_dir), [])
        try:
            g.go(self.server.get_url())
        except GrabTimeoutError:
            # A timeout is tolerated here: the assertions below concern
            # only the files logged for the request.
            pass

        self.assertEqual(sorted(os.listdir(tmp_dir)),
                         ['01.html', '01.log'])
        with open(os.path.join(tmp_dir, '01.log')) as inp:
            log_file_content = inp.read()
        self.assertIn('user-agent: perl', log_file_content.lower())
def test_grab_parse_defensedxml(self):
    """Check that querying an XML document with entities is rejected.

    An XML file declaring and referencing an external entity is written
    to disk and loaded through a file:// URL with `content_type='xml'`;
    calling `grab.doc` on it must raise EntitiesForbidden.
    """
    with temp_dir() as tmp_dir:
        injection_path = os.path.join(tmp_dir, 'injection')
        with open(injection_path, 'w') as out:
            out.write('Hey there!')
        # Prepare file:// URL valid for both linux and windows
        injection_url = 'file:///%s' % (injection_path.lstrip('/')
                                        .replace('\\', '/'))
        bad_xml = (
            '<!DOCTYPE external ['
            '<!ENTITY ee SYSTEM "' + injection_url + '">'
            ']>'
            # Reference the declared entity with the XML syntax `&ee;`.
            '<root>&ee;</root>'
        ).encode()
        xml_file = os.path.join(tmp_dir, 'bad.xml')
        with open(xml_file, 'wb') as out:
            out.write(bad_xml)
        grab = build_grab(content_type='xml')
        grab.go('file://%s' % xml_file)
        self.assertRaises(EntitiesForbidden, grab.doc, '//title')
def test_log_dir_response_content_thread(self):
    """Check `log_dir` file naming and content for a request in a thread.

    The request is made from a separate thread; exactly two files whose
    names contain "01-thread" must appear, and the ".log" file must
    contain (case-insensitively) the "X-Engine" response header name.
    """
    with temp_dir() as tmp_dir:
        reset_request_counter()
        grab = build_grab()
        grab.setup(log_dir=tmp_dir)
        self.server.response['get.data'] = 'omsk'
        self.server.response['headers'] = [('X-Engine', 'PHP')]

        self.assertEqual(os.listdir(tmp_dir), [])

        def func():
            grab.go(self.server.get_url())

        thread = threading.Thread(target=func)
        thread.start()
        thread.join()

        files = os.listdir(tmp_dir)
        self.assertEqual(2, len([x for x in files if '01-thread' in x]))
        fname = [x for x in files if x.endswith('.log')][0]
        with open(os.path.join(tmp_dir, fname)) as inp:
            log_file_content = inp.read()
        self.assertIn('x-engine', log_file_content.lower())