def test_a200_robots_txt(self):
    """Expect 200 from /deny with robots_txt off and 403 with it on."""
    task = copy.deepcopy(self.sample_task_http)
    task['url'] = self.httpbin+'/deny'
    # The same task is fetched twice; only the robots_txt flag changes.
    for robots_flag, expected_status in ((False, 200), (True, 403)):
        task['fetch']['robots_txt'] = robots_flag
        fetched = self.fetcher.sync_fetch(task)
        resp = rebuild_response(fetched)
        self.assertEqual(resp.status_code, expected_status, fetched)
def test_a200_robots_txt(self):
    """Fetch /deny twice: robots_txt disabled must give 200, enabled must give 403."""
    task = copy.deepcopy(self.sample_task_http)
    fetch_opts = task["fetch"]

    # First pass: robots.txt handling switched off.
    fetch_opts["robots_txt"] = False
    task["url"] = self.httpbin + "/deny"
    first = self.fetcher.sync_fetch(task)
    self.assertEqual(rebuild_response(first).status_code, 200, first)

    # Second pass: identical task, robots.txt handling switched on.
    fetch_opts["robots_txt"] = True
    second = self.fetcher.sync_fetch(task)
    self.assertEqual(rebuild_response(second).status_code, 403, second)
def test_a200_robots_txt(self):
    """The robots_txt fetch flag decides whether /deny is served (200) or refused (403)."""
    deny_url = self.httpbin + '/deny'
    task = copy.deepcopy(self.sample_task_http)

    task['fetch']['robots_txt'] = False
    task['url'] = deny_url
    raw = self.fetcher.sync_fetch(task)
    resp = rebuild_response(raw)
    self.assertEqual(resp.status_code, 200, raw)

    task['fetch']['robots_txt'] = True
    raw = self.fetcher.sync_fetch(task)
    resp = rebuild_response(raw)
    self.assertEqual(resp.status_code, 403, raw)
def run(project):
    """Fetch the submitted task and run the submitted script against it in debug mode.

    Reads ``task`` (JSON) and ``script`` from ``request.form``. If fetching,
    building the module or running it raises, the formatted traceback is
    stored in a ``result`` dict together with whatever was fetched.

    NOTE(review): no success-path ``result`` and no ``return`` are visible in
    this block; it looks truncated, so none were added.

    Args:
        project: project name, stored under ``'name'`` in the project info.
    """
    task = json.loads(request.form['task'])
    project_info = {
        'name': project,
        'status': 'DEBUG',
        'script': request.form['script'],
    }

    fetch_result = {}
    start_time = time.time()
    try:
        fetch_result = app.config['fetch'](task)
        response = rebuild_response(fetch_result)
        module = build_module(project_info, {
            'debugger': True
        })
        ret = module['instance'].run(module['module'], task, response)
    except Exception:
        # `except Exception, e` only parses on Python 2 and `e` was unused;
        # this form is valid on both Python 2 and 3.
        # Locals are named exc_* so the builtin `type` is not shadowed.
        exc_type, exc_value, tb = sys.exc_info()
        tb = hide_me(tb, globals())
        logs = ''.join(traceback.format_exception(exc_type, exc_value, tb))
        result = {
            'fetch_result': fetch_result,
            'logs': logs,
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
def test_e020_too_much_redirect(self):
    """A /redirect/10 chain with default settings should end as 599 'redirects followed'."""
    task = copy.deepcopy(self.sample_task_http)
    target = self.httpbin + '/redirect/10'
    task['url'] = target
    fetched = self.fetcher.sync_fetch(task)
    resp = rebuild_response(fetched)
    self.assertEqual(resp.status_code, 599, fetched)
    self.assertIn('redirects followed', resp.error)
def test_20_dataurl_get(self):
    """A ``data:`` URL is fetched synchronously and comes back as 200 with its text."""
    task = copy.deepcopy(self.sample_task_http)
    task['url'] = 'data:,hello'
    # No assertion message needs the raw result, so it is rebuilt directly.
    resp = rebuild_response(self.fetcher.sync_fetch(task))
    self.assertEqual(resp.status_code, 200)
    self.assertEqual(resp.text, 'hello')
def test_40_with_rpc(self):
    """Fetch a ``data:`` URL through the RPC interface and check the unpacked reply."""
    task = copy.deepcopy(self.sample_task_http)
    task['url'] = 'data:,hello'
    rpc_reply = self.rpc.fetch(task)
    # The reply's ``data`` attribute is msgpack-encoded; unpack it before rebuilding.
    unpacked = umsgpack.unpackb(rpc_reply.data)
    resp = rebuild_response(unpacked)
    self.assertEqual(resp.status_code, 200)
    self.assertEqual(resp.text, 'hello')
def test_a180_max_redirects(self):
    """With max_redirects set to 10, /redirect/10 is expected to finish with 200."""
    task = copy.deepcopy(self.sample_task_http)
    fetch_opts = task["fetch"]
    fetch_opts["max_redirects"] = 10
    task["url"] = self.httpbin + "/redirect/10"
    fetched = self.fetcher.sync_fetch(task)
    self.assertEqual(rebuild_response(fetched).status_code, 200, fetched)
def test_a170_validate_cert(self):
    """A fetch with validate_cert turned off should still return 200 for /get."""
    task = copy.deepcopy(self.sample_task_http)
    fetch_opts = task["fetch"]
    fetch_opts["validate_cert"] = False
    task["url"] = self.httpbin + "/get"
    fetched = self.fetcher.sync_fetch(task)
    self.assertEqual(rebuild_response(fetched).status_code, 200, fetched)
def test_a160_cookie(self):
    """Cookies set via /cookies/set must appear in the response cookie dict."""
    # Expected dict: the k1/k2 pairs set by the URL plus the a/b and c/d pairs.
    expected_cookies = {"a": "b", "k1": "v1", "k2": "v2", "c": "d"}
    task = copy.deepcopy(self.sample_task_http)
    task["url"] = self.httpbin + "/cookies/set?k1=v1&k2=v2"
    fetched = self.fetcher.sync_fetch(task)
    resp = rebuild_response(fetched)
    self.assertEqual(resp.status_code, 200, fetched)
    self.assertEqual(resp.cookies, expected_cookies, fetched)
def test_a150_too_much_redirect(self):
    """Following /redirect/10 with default settings must fail with a 599 redirect error."""
    task = copy.deepcopy(self.sample_task_http)
    task['url'] = self.httpbin+'/redirect/10'
    fetched = self.fetcher.sync_fetch(task)
    resp = rebuild_response(fetched)
    status, error_text = resp.status_code, resp.error
    self.assertEqual(status, 599, fetched)
    self.assertIn('redirects followed', error_text)
def test_a170_validate_cert(self):
    """Disable certificate validation and confirm /get is fetched with status 200."""
    task = copy.deepcopy(self.sample_task_http)
    task['fetch']['validate_cert'] = False
    get_url = self.httpbin+'/get'
    task['url'] = get_url
    raw = self.fetcher.sync_fetch(task)
    resp = rebuild_response(raw)
    self.assertEqual(resp.status_code, 200, raw)
def test_a160_cookie(self):
    """Fetch /cookies/set and verify both status and the resulting cookie dict."""
    task = copy.deepcopy(self.sample_task_http)
    task['url'] = self.httpbin+'/cookies/set?k1=v1&k2=v2'
    raw = self.fetcher.sync_fetch(task)
    resp = rebuild_response(raw)
    self.assertEqual(resp.status_code, 200, raw)
    # k1/k2 come from the URL above; the test also expects a/b and c/d.
    wanted = {'a': 'b', 'k1': 'v1', 'k2': 'v2', 'c': 'd'}
    self.assertEqual(resp.cookies, wanted, raw)
def test_a170_validate_cert(self):
    """validate_cert=False must not prevent a plain /get fetch from returning 200."""
    task = copy.deepcopy(self.sample_task_http)
    options = task['fetch']
    options['validate_cert'] = False
    task['url'] = self.httpbin + '/get'
    outcome = self.fetcher.sync_fetch(task)
    status = rebuild_response(outcome).status_code
    self.assertEqual(status, 200, outcome)
def test_a180_max_redirects(self):
    """Raise max_redirects to 10 so that /redirect/10 completes with 200."""
    task = copy.deepcopy(self.sample_task_http)
    task['fetch']['max_redirects'] = 10
    redirect_url = self.httpbin+'/redirect/10'
    task['url'] = redirect_url
    raw = self.fetcher.sync_fetch(task)
    resp = rebuild_response(raw)
    self.assertEqual(resp.status_code, 200, raw)
def test_a180_max_redirects(self):
    """/redirect/10 should succeed (200) once the task sets max_redirects to 10."""
    task = copy.deepcopy(self.sample_task_http)
    options = task['fetch']
    options['max_redirects'] = 10
    task['url'] = self.httpbin + '/redirect/10'
    outcome = self.fetcher.sync_fetch(task)
    status = rebuild_response(outcome).status_code
    self.assertEqual(status, 200, outcome)
def test_30_with_queue(self):
    """Push a ``data:`` URL task onto the in-queue and check what the out-queue yields."""
    task = copy.deepcopy(self.sample_task_http)
    task['url'] = 'data:,hello'
    self.inqueue.put(task)
    # The out-queue yields a (task, result) pair; only the result is inspected.
    _, fetched = self.outqueue.get()
    resp = rebuild_response(fetched)
    self.assertEqual(resp.status_code, 200)
    self.assertEqual(resp.text, 'hello')
def run(project):
    """Debug-run the submitted script against the submitted task; reply with JSON.

    Reads ``task`` (JSON) and ``script`` from ``request.form``, fetches the
    task via ``app.config['fetch']``, builds the project module in debugger
    mode and runs it. Failures are reported inside the JSON payload as a
    formatted traceback, not raised.

    Args:
        project: project name, stored under ``'name'`` in the project info.

    Returns:
        A ``(body, 200, headers)`` tuple where ``body`` is the JSON-encoded
        result dict (keys: fetch_result, logs, follows, messages, result, time).
    """
    start_time = time.time()

    def _failure_payload(fetched):
        # Build the error-shaped result for the exception currently being
        # handled. Previously duplicated in both except branches; locals are
        # named exc_* so the builtin `type` is no longer shadowed.
        exc_type, exc_value, tb = sys.exc_info()
        tb = utils.hide_me(tb, globals())
        return {
            'fetch_result': fetched,
            'logs': ''.join(traceback.format_exception(exc_type, exc_value, tb)),
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }

    task = utils.decode_unicode_obj(json.loads(request.form['task']))
    project_info = {
        'name': project,
        'status': 'DEBUG',
        'script': request.form['script'],
    }

    fetch_result = {}
    try:
        fetch_result = app.config['fetch'](task)
        response = rebuild_response(fetch_result)
        module = build_module(project_info, {
            'debugger': True
        })
        ret = module['instance'].run(module['module'], task, response)
    except Exception:
        result = _failure_payload(fetch_result)
    else:
        result = {
            'fetch_result': fetch_result,
            'logs': ret.logstr(),
            'follows': ret.follows,
            'messages': ret.messages,
            'result': ret.result,
            'time': time.time() - start_time,
        }
        # `response` is only guaranteed to be bound on the success path.
        result['fetch_result']['content'] = response.text

    headers = {'Content-Type': 'application/json'}
    try:
        # binary data can't encode to JSON, encode result as unicode obj
        # before send it to frontend
        return json.dumps(utils.unicode_obj(result)), 200, headers
    except Exception:
        # Serialisation failed: report that failure and drop the fetch result.
        result = _failure_payload("")
        return json.dumps(utils.unicode_obj(result)), 200, headers
def get(self, url, **kwargs):
    """Fetch ``url`` with ``self.fetcher`` and return the rebuilt response.

    Args:
        url: an absolute ``http://``/``https://`` URL, or a path that is
            appended to ``self.httpbin``.
        **kwargs: extra top-level task fields, merged over the sample task
            (applied after ``url`` is set, so they can override it).

    Returns:
        Whatever ``rebuild_response`` returns for the fetch result.
    """
    # Previously only 'http://' counted as absolute, so an https URL was
    # glued onto self.httpbin and became an invalid address.
    if not url.startswith(('http://', 'https://')):
        url = self.httpbin + url
    request = copy.deepcopy(self.sample_task_http)
    request['url'] = url
    request.update(kwargs)
    result = self.fetcher.fetch(request)
    return rebuild_response(result)
def test_65_418(self):
    """A /status/418 fetch through the queues keeps its 418 status and 'teapot' body."""
    task = copy.deepcopy(self.sample_task_http)
    task['url'] = self.httpbin+'/status/418'
    self.inqueue.put(task)
    # The out-queue yields a (task, result) pair; only the result is inspected.
    _, fetched = self.outqueue.get()
    resp = rebuild_response(fetched)
    self.assertEqual(resp.status_code, 418)
    self.assertIn('teapot', resp.text)
def test_65_418(self):
    """Queue a /status/418 task and expect status 418 with 'teapot' in the text."""
    teapot_url = self.httpbin + '/status/418'
    task = copy.deepcopy(self.sample_task_http)
    task['url'] = teapot_url
    self.inqueue.put(task)
    queued = self.outqueue.get()
    # queued is a (task, result) pair; the result is the second element
    _task, raw = queued
    resp = rebuild_response(raw)
    self.assertEqual(resp.status_code, 418)
    self.assertIn('teapot', resp.text)
def test_75_splash_robots(self):
    """A splash fetch of /deny with robots_txt enabled is expected to return 403."""
    import copy  # local import keeps this block self-contained

    # Work on a private copy: the original mutated self.sample_task_http in
    # place, leaking fetch_type='splash' and robots_txt=True into every test
    # that copies the fixture afterwards.
    request = copy.deepcopy(self.sample_task_http)
    request['url'] = self.httpbin + '/deny'
    request['fetch']['fetch_type'] = 'splash'
    request['fetch']['robots_txt'] = True
    result = self.fetcher.sync_fetch(request)
    response = rebuild_response(result)
    self.assertEqual(response.status_code, 403, result)
def test_a120_http_get_with_proxy_fail(self):
    """Fetching through ``self.proxy`` without credentials is expected to give 403."""
    self.fetcher.proxy = self.proxy
    try:
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 403, result)
    finally:
        # Reset even when the fetch or assertion fails; otherwise every later
        # test would silently run through the proxy.
        self.fetcher.proxy = None
def test_a120_http_get_with_proxy_fail_1(self):
    """Fetching through ``self.proxy`` without credentials is expected to give 403."""
    self.fetcher.proxy = self.proxy
    try:
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 403, result)
    finally:
        # Reset even when the fetch or assertion fails; otherwise every later
        # test would silently run through the proxy.
        self.fetcher.proxy = None
def test_30_with_queue(self):
    """Round-trip a ``data:`` URL task through the in/out queues and check the reply."""
    task = copy.deepcopy(self.sample_task_http)
    task['url'] = 'data:,hello'
    self.inqueue.put(task)
    queued = self.outqueue.get()
    # queued is a (task, result) pair; the result is the second element
    _task, raw = queued
    resp = rebuild_response(raw)
    self.assertEqual(resp.status_code, 200)
    self.assertEqual(resp.text, 'hello')
def test_a140_redirect(self):
    """After /redirect-to, orig_url keeps the requested URL and url is the target."""
    task = copy.deepcopy(self.sample_task_http)
    task['url'] = self.httpbin+'/redirect-to?url=/get'
    fetched = self.fetcher.sync_fetch(task)
    resp = rebuild_response(fetched)
    self.assertEqual(resp.status_code, 200, fetched)
    # orig_url is the URL that was asked for; url is where the redirect ended.
    self.assertEqual(resp.orig_url, task['url'])
    final_url = self.httpbin+'/get'
    self.assertEqual(resp.url, final_url)
def test_e010_redirect(self):
    """A single /redirect-to hop yields 200, the original orig_url and the /get url."""
    task = copy.deepcopy(self.sample_task_http)
    task['url'] = self.httpbin + '/redirect-to?url=/get'
    raw = self.fetcher.sync_fetch(task)
    resp = rebuild_response(raw)
    self.assertEqual(resp.status_code, 200, raw)
    # Compare both ends of the redirect: what was requested and where it landed.
    self.assertEqual(resp.orig_url, task['url'])
    self.assertEqual(resp.url, self.httpbin + '/get')
def test_75_phantomjs_robots(self):
    """A phantomjs fetch of /deny with robots_txt enabled is expected to return 403.

    Skipped when ``self.phantomjs`` is falsy.
    """
    if not self.phantomjs:
        raise unittest.SkipTest('no phantomjs')
    task = copy.deepcopy(self.sample_task_http)
    task['url'] = self.httpbin + '/deny'
    fetch_opts = task['fetch']
    fetch_opts['fetch_type'] = 'phantomjs'
    fetch_opts['robots_txt'] = True
    fetched = self.fetcher.sync_fetch(task)
    self.assertEqual(rebuild_response(fetched).status_code, 403, fetched)
def test_55_base64_data(self):
    """POST a [BASE64-DATA]-wrapped body through the queues; expect 200 and a JSON reply."""
    task = copy.deepcopy(self.sample_task_http)
    task["url"] = self.httpbin + "/post"
    fetch_opts = task["fetch"]
    fetch_opts["method"] = "POST"
    # payload: base64 of a short Chinese string encoded as GBK
    fetch_opts["data"] = "[BASE64-DATA]1tDOxA==[/BASE64-DATA]"
    self.inqueue.put(task)
    _, fetched = self.outqueue.get()
    resp = rebuild_response(fetched)
    self.assertEqual(resp.status_code, 200, resp.error)
    self.assertIsNotNone(resp.json, resp.content)
def test_55_base64_data(self):
    """Queue a POST whose body uses the [BASE64-DATA] wrapper and check the reply."""
    task = copy.deepcopy(self.sample_task_http)
    task['url'] = self.httpbin + '/post'
    options = task['fetch']
    options['method'] = 'POST'
    # payload: base64 of a short Chinese string encoded as GBK
    options['data'] = "[BASE64-DATA]1tDOxA==[/BASE64-DATA]"
    self.inqueue.put(task)
    queued = self.outqueue.get()
    # queued is a (task, result) pair; the result is the second element
    _task, raw = queued
    resp = rebuild_response(raw)
    self.assertEqual(resp.status_code, 200, resp.error)
    self.assertIsNotNone(resp.json, resp.content)
def test_55_base64_data(self):
    """Send base64-wrapped binary POST data via the queues; expect 200 with JSON."""
    post_url = self.httpbin+'/post'
    task = copy.deepcopy(self.sample_task_http)
    task['url'] = post_url
    task['fetch']['method'] = 'POST'
    # payload: base64 of a short Chinese string encoded as GBK
    task['fetch']['data'] = "[BASE64-DATA]1tDOxA==[/BASE64-DATA]"
    self.inqueue.put(task)
    _, outcome = self.outqueue.get()
    resp = rebuild_response(outcome)
    self.assertEqual(resp.status_code, 200, resp.error)
    self.assertIsNotNone(resp.json, resp.content)
def test_10_http_get(self):
    """Plain GET of /get: check status, bookkeeping fields and the echoed headers."""
    task = copy.deepcopy(self.sample_task_http)
    task["url"] = self.httpbin + "/get"
    fetched = self.fetcher.sync_fetch(task)
    resp = rebuild_response(fetched)

    self.assertEqual(resp.status_code, 200, fetched)
    self.assertEqual(resp.orig_url, task["url"])
    self.assertEqual(resp.save, task["fetch"]["save"])
    self.assertIsNotNone(resp.json, resp.content)

    # The JSON body's "headers" entry is checked for the A header and both cookies.
    self.assertEqual(resp.json["headers"].get("A"), "b", resp.json)
    for cookie_pair in ("c=d", "a=b"):
        self.assertIn(cookie_pair, resp.json["headers"].get("Cookie"), resp.json)
def _run(self, task, response):
    """Dispatch ``response`` to the callback named in ``task['process']``.

    A dict ``response`` is first passed through ``rebuild_response``. The
    callback name defaults to ``'__call__'``. Unless the callback carries a
    truthy ``_catch_status_code_error`` attribute,
    ``response.raise_for_status()`` is called before the callback runs.

    Raises:
        NotImplementedError: if ``self`` has no attribute with the callback name.
    """
    self._reset()
    if isinstance(response, dict):
        response = rebuild_response(response)

    callback_name = task.get('process', {}).get('callback', '__call__')
    if not hasattr(self, callback_name):
        raise NotImplementedError("self.%s() not implemented!" % callback_name)
    handler = getattr(self, callback_name)

    handles_bad_status = getattr(handler, '_catch_status_code_error', False)
    if not handles_bad_status:
        response.raise_for_status()
    return self._run_func(handler, response, task)
def test_10_http_get(self):
    """GET /get and verify status, orig_url, save and the echoed A / Cookie headers."""
    task = copy.deepcopy(self.sample_task_http)
    task['url'] = self.httpbin + '/get'
    raw = self.fetcher.sync_fetch(task)
    resp = rebuild_response(raw)

    self.assertEqual(resp.status_code, 200, raw)
    self.assertEqual(resp.orig_url, task['url'])
    self.assertEqual(resp.save, task['fetch']['save'])
    self.assertIsNotNone(resp.json, resp.content)

    # Expected header values, checked in the same order as before.
    for header_name, header_value in (('A', 'b'), ('Cookie', 'c=d')):
        self.assertEqual(resp.json['headers'].get(header_name), header_value, resp.json)
def test_69_no_splash(self):
    """With no splash endpoint configured, a splash fetch is expected to return 501."""
    import copy  # local import keeps this block self-contained

    splash_endpoint = self.fetcher.splash_endpoint
    self.fetcher.splash_endpoint = None
    try:
        # Work on a private copy: the original mutated self.sample_task_http in
        # place, leaking fetch_type='splash' into later tests.
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'splash'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 501, result)
    finally:
        # Restore the endpoint even if the fetch or assertion fails.
        self.fetcher.splash_endpoint = splash_endpoint
def test_10_http_get(self):
    """GET /get; the echoed headers must carry A=b and both the c=d and a=b cookies."""
    get_url = self.httpbin+'/get'
    task = copy.deepcopy(self.sample_task_http)
    task['url'] = get_url
    outcome = self.fetcher.sync_fetch(task)
    resp = rebuild_response(outcome)

    self.assertEqual(resp.status_code, 200, outcome)
    self.assertEqual(resp.orig_url, task['url'])
    self.assertEqual(resp.save, task['fetch']['save'])
    self.assertIsNotNone(resp.json, resp.content)

    self.assertEqual(resp.json['headers'].get('A'), 'b', resp.json)
    # Only membership is asserted for cookies, not the exact header string.
    for cookie_pair in ('c=d', 'a=b'):
        self.assertIn(cookie_pair, resp.json['headers'].get('Cookie'), resp.json)
def run_task(self, module, task, response): """ 处理task,捕捉错误,返回ProcessorResult object Processing the task, catching exceptions and logs, return a `ProcessorResult` object """ # TODO module是啥 self.logger = logger = module.logger result = None exception = None stdout = sys.stdout self.task = task # deep copy response if isinstance(response, dict): response = rebuild_response(response) self.response = response self.save = (task.get('track') or {}).get('save', {}) try: if self.__env__.get('enable_stdout_capture', True): # 一个具有writelines等方法的基于list的obj sys.stdout = ListO(module.log_buffer) self._reset() # 运行task result = self._run_task(task, response) # TODO # on_result 用于处理结果 if inspect.isgenerator(result): for r in result: self._run_func(self.on_result, r, response, task) else: self._run_func(self.on_result, result, response, task) except Exception as e: logger.exception(e) exception = e finally: follows = self._follows messages = self._messages logs = list(module.log_buffer) extinfo = self._extinfo save = self.save sys.stdout = stdout self.task = None self.response = None self.save = None # 清空buffer module.log_buffer[:] = [] return ProcessorResult(result, follows, messages, logs, exception, extinfo, save)
def test_60_timeout(self):
    """A 3s timeout against /delay/5 must come back after 1.5s but before 4.5s."""
    task = copy.deepcopy(self.sample_task_http)
    task['url'] = self.httpbin + '/delay/5'
    task['fetch']['timeout'] = 3

    # Time the full queue round trip.
    started = time.time()
    self.inqueue.put(task)
    _, fetched = self.outqueue.get()
    elapsed = time.time() - started

    self.assertGreater(elapsed, 1.5)
    self.assertLess(elapsed, 4.5)

    resp = rebuild_response(fetched)
    self.assertEqual(resp.orig_url, task['url'])
    self.assertEqual(resp.save, task['fetch']['save'])
def test_60_timeout(self):
    """Queue a /delay/5 fetch with timeout=3 and bound the elapsed time to (1.5, 4.5)."""
    slow_url = self.httpbin+'/delay/5'
    task = copy.deepcopy(self.sample_task_http)
    task['url'] = slow_url
    task['fetch']['timeout'] = 3

    t0 = time.time()
    self.inqueue.put(task)
    queued = self.outqueue.get()
    t1 = time.time()
    # queued is a (task, result) pair; the result is the second element
    _task, raw = queued

    duration = t1 - t0
    self.assertGreater(duration, 1.5)
    self.assertLess(duration, 4.5)

    resp = rebuild_response(raw)
    self.assertEqual(resp.orig_url, task['url'])
    self.assertEqual(resp.save, task['fetch']['save'])
def on_task(self, task, response):
    """Run the handler of the task's project against a fetched response.

    The response is rebuilt with ``rebuild_response``, the project is looked
    up in ``self.projects`` and its ``instance.run`` is called. Any exception
    (missing ``taskid``, unknown project, handler failure) is logged.

    NOTE(review): ``start_time`` and ``ret`` are unused in the visible part
    and no success-path return is visible; the block looks truncated, so
    both names are kept.

    Returns:
        False when an exception was raised and logged.
    """
    start_time = time.time()
    try:
        response = rebuild_response(response)
        assert 'taskid' in task, 'need taskid in task'
        project = task['project']
        if project not in self.projects:
            raise LookupError("no such project: %s" % project)
        project_data = self.projects[project]
        ret = project_data['instance'].run(
            project_data['module'], task, response)
    except Exception as e:
        # `except Exception, e` only parses on Python 2; `as` is valid on
        # Python 2.6+ and Python 3.
        logger.exception(e)
        return False
def test_70_splash_url(self):
    """Fetch /get through splash and check status, bookkeeping and echoed headers."""
    import copy  # local import keeps this block self-contained

    # Work on a private copy: the original mutated self.sample_task_http in
    # place, leaking fetch_type='splash' into later tests.
    request = copy.deepcopy(self.sample_task_http)
    request['url'] = self.httpbin + '/get'
    request['fetch']['fetch_type'] = 'splash'
    result = self.fetcher.sync_fetch(request)
    response = rebuild_response(result)

    self.assertEqual(response.status_code, 200, result)
    self.assertEqual(response.orig_url, request['url'])
    self.assertEqual(response.save, request['fetch']['save'])

    # The JSON payload is read from the text of the document's <pre> element.
    data = json.loads(response.doc('pre').text())
    self.assertIsNotNone(data, response.content)
    self.assertEqual(data['headers'].get('A'), 'b', response.json)
    self.assertEqual(data['headers'].get('Cookie'), 'c=d', response.json)
def on_task(self, task, response):
    """Run the handler of the task's project against a fetched response.

    The response is rebuilt with ``rebuild_response``, the project is looked
    up in ``self.projects`` and its ``instance.run`` is called. Any exception
    (missing ``taskid``, unknown project, handler failure) is logged.

    NOTE(review): ``start_time`` and ``ret`` are unused in the visible part
    and no success-path return is visible; the block looks truncated, so
    both names are kept.

    Returns:
        False when an exception was raised and logged.
    """
    start_time = time.time()
    try:
        response = rebuild_response(response)
        assert 'taskid' in task, 'need taskid in task'
        project = task['project']
        if project not in self.projects:
            raise LookupError("no such project: %s" % project)
        project_data = self.projects[project]
        ret = project_data['instance'].run(project_data['module'], task, response)
    except Exception as e:
        # `except Exception, e` only parses on Python 2; `as` is valid on
        # Python 2.6+ and Python 3.
        logger.exception(e)
        return False
def test_a130_http_get_with_proxy_ok_1(self):
    """Fetching /get through the proxy with credentials is expected to return 200."""
    self.fetcher.proxy = 'http://*****:*****@%s/' % self.proxy
    try:
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        self.assertIsNotNone(response.json, response.content)
        self.assertEqual(response.json['headers'].get('A'), 'b', response.json)
        self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json)
        self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json)
    finally:
        # Reset even when an assertion fails; otherwise every later test
        # would silently run through the proxy.
        self.fetcher.proxy = None
def test_zzzz_issue375(self):
    """A phantomjs fetch via the proxy address 127.0.0.1:20000 is expected to give 599.

    Skipped when ``self.phantomjs`` is falsy.
    """
    # Check the skip condition first: the original overwrote phantomjs_proxy
    # before raising SkipTest and so never restored it on a skip.
    if not self.phantomjs:
        raise unittest.SkipTest('no phantomjs')

    phantomjs_proxy = self.fetcher.phantomjs_proxy
    self.fetcher.phantomjs_proxy = '127.0.0.1:20000'
    try:
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'phantomjs'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 599, result)
    finally:
        # Restore the original proxy even if the fetch or assertion fails.
        self.fetcher.phantomjs_proxy = phantomjs_proxy
def test_69_no_phantomjs(self):
    """With phantomjs_proxy unset, a 'js' fetch is expected to return 501.

    Skipped when ``self.phantomjs`` is falsy.
    """
    # Check the skip condition first: the original cleared phantomjs_proxy
    # before raising SkipTest and so never restored it on a skip.
    if not self.phantomjs:
        raise unittest.SkipTest('no phantomjs')

    phantomjs_proxy = self.fetcher.phantomjs_proxy
    self.fetcher.phantomjs_proxy = None
    try:
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'js'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 501, result)
    finally:
        # Restore the original proxy even if the fetch or assertion fails.
        self.fetcher.phantomjs_proxy = phantomjs_proxy
def test_70_phantomjs_url(self):
    """Fetch /get with fetch_type 'js' and check status, bookkeeping and echoed headers.

    Skipped when ``self.phantomjs`` is falsy.
    """
    if not self.phantomjs:
        raise unittest.SkipTest('no phantomjs')
    task = copy.deepcopy(self.sample_task_http)
    task['url'] = self.httpbin + '/get'
    task['fetch']['fetch_type'] = 'js'
    fetched = self.fetcher.sync_fetch(task)
    resp = rebuild_response(fetched)

    self.assertEqual(resp.status_code, 200, fetched)
    self.assertEqual(resp.orig_url, task['url'])
    self.assertEqual(resp.save, task['fetch']['save'])

    # The JSON payload is read from the text of the document's <pre> element.
    pre_text = resp.doc('pre').text()
    payload = json.loads(pre_text)
    self.assertIsNotNone(payload, resp.content)
    self.assertEqual(payload['headers'].get('A'), 'b', resp.json)
    self.assertEqual(payload['headers'].get('Cookie'), 'c=d', resp.json)
def test_a130_http_get_with_proxy_ok(self):
    """A splash fetch of /get through the proxy with credentials should return 200."""
    self.fetcher.proxy = 'http://*****:*****@%s/' % self.proxy
    try:
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/get'
        request['fetch']['fetch_type'] = 'splash'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])

        # The JSON payload is read from the text of the document's <pre> element.
        data = json.loads(response.doc('pre').text())
        self.assertEqual(data['headers'].get('A'), 'b', response.content)
        self.assertIn('c=d', data['headers'].get('Cookie'), response.content)
        self.assertIn('a=b', data['headers'].get('Cookie'), response.content)
    finally:
        # Reset even when an assertion fails; otherwise every later test
        # would silently run through the proxy.
        self.fetcher.proxy = None
def test_15_http_post(self):
    """POST 'binux' with an explicit cookie to /post and check the echoed request."""
    task = copy.deepcopy(self.sample_task_http)
    task['url'] = self.httpbin + '/post'
    fetch_opts = task['fetch']
    fetch_opts['method'] = 'POST'
    fetch_opts['data'] = 'binux'
    fetch_opts['cookies'] = {'c': 'd'}
    fetched = self.fetcher.sync_fetch(task)
    resp = rebuild_response(fetched)

    self.assertEqual(resp.status_code, 200)
    self.assertEqual(resp.orig_url, task['url'])
    self.assertEqual(resp.save, fetch_opts['save'])
    self.assertIsNotNone(resp.json, resp.content)

    # The body 'binux' is expected back as a form key with an empty value.
    self.assertEqual(resp.json['form'].get('binux'), '')
    self.assertEqual(resp.json['headers'].get('A'), 'b', resp.json)
    self.assertEqual(resp.json['headers'].get('Cookie'), 'c=d', resp.json)
def run_task(self, module, task, response):
    """
    Processing the task, catching exceptions and logs, return a `ProcessorResult` object

    `module` must provide `logger` and `log_buffer`; a dict `response` is
    converted with `rebuild_response` before use. Exceptions raised while
    running the task or `on_result` are logged and handed back inside the
    `ProcessorResult` instead of propagating.
    """
    logger = module.logger
    result = None
    exception = None
    # Remember the real stdout so it can be put back after the run.
    stdout = sys.stdout
    self.task = task
    if isinstance(response, dict):
        response = rebuild_response(response)
    self.response = response
    self.save = (task.get('track') or {}).get('save', {})

    try:
        if self.__env__.get('enable_stdout_capture', True):
            # Route printed output into module.log_buffer while the task runs.
            sys.stdout = ListO(module.log_buffer)
        self._reset()
        # Execute the script.
        result = self._run_task(task, response)
        if inspect.isgenerator(result):
            for r in result:
                self._run_func(self.on_result, r, response, task)
        else:
            # Call on_result with the single result.
            self._run_func(self.on_result, result, response, task)
    except Exception as e:
        logger.exception(e)
        exception = e
    finally:
        # Snapshot per-run state before it is cleared below.
        follows = self._follows
        messages = self._messages
        logs = list(module.log_buffer)
        extinfo = self._extinfo
        save = self.save

        sys.stdout = stdout
        self.task = None
        self.response = None
        self.save = None

    # Empty the shared log buffer in place.
    module.log_buffer[:] = []
    return ProcessorResult(result, follows, messages, logs, exception, extinfo, save)