def __init__(self, result, follows, messages, logs, exception, extinfo):
    """Capture the outcome of one processor run.

    All fields except ``logs`` are normalized through ``unicode_obj()``
    so they can be serialized safely later; ``logs`` is stored verbatim.
    """
    # normalized fields
    self.result = unicode_obj(result)
    self.follows = unicode_obj(follows)
    self.messages = unicode_obj(messages)
    self.exception = unicode_obj(exception)
    self.extinfo = unicode_obj(extinfo)
    # raw field — kept as-is
    self.logs = logs
def run(project):
    """Debug-run one task against the posted script.

    Fetches the task, runs it through a freshly built debugger module and
    returns a JSON payload with fetch result, logs, follows, messages,
    result and elapsed time. Always responds 200; errors are reported in
    the ``logs`` field.
    """
    task = utils.decode_unicode_obj(json.loads(request.form['task']))
    project_info = {
        'name': project,
        'status': 'DEBUG',
        'script': request.form['script'],
    }
    fetch_result = {}
    start_time = time.time()
    try:
        fetch_result = app.config['fetch'](task)
        response = rebuild_response(fetch_result)
        module = build_module(project_info, {'debugger': True})
        ret = module['instance'].run(module['module'], task, response)
    except Exception:
        # renamed from `type, value, tb` — the original shadowed the
        # `type` builtin
        exc_type, exc_value, tb = sys.exc_info()
        tb = utils.hide_me(tb, globals())
        logs = ''.join(traceback.format_exception(exc_type, exc_value, tb))
        result = {
            'fetch_result': fetch_result,
            'logs': logs,
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
    else:
        result = {
            'fetch_result': fetch_result,
            'logs': ret.logstr(),
            'follows': ret.follows,
            'messages': ret.messages,
            'result': ret.result,
            'time': time.time() - start_time,
        }
        # only valid on the success path: `response` is unbound if the
        # fetch itself raised
        result['fetch_result']['content'] = response.text

    try:
        # binary data can't encode to JSON, encode result as unicode obj
        # before send it to frontend
        return json.dumps(utils.unicode_obj(result)), 200, {'Content-Type': 'application/json'}
    except Exception:
        exc_type, exc_value, tb = sys.exc_info()
        tb = utils.hide_me(tb, globals())
        logs = ''.join(traceback.format_exception(exc_type, exc_value, tb))
        result = {
            'fetch_result': "",
            'logs': logs,
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
        return json.dumps(utils.unicode_obj(result)), 200, {'Content-Type': 'application/json'}
def get_active_tasks(project=None, limit=100):
    """Merge the newest active tasks across projects (or one project).

    Performs an N-way merge over per-project ``active_tasks`` iterators,
    keeping only whitelisted keys and a slimmed-down ``track`` dict.
    Returns at most ``limit`` (updatetime, task) pairs, newest first.
    """
    allowed_keys = set((
        'taskid',
        'project',
        'status',
        'url',
        'lastcrawltime',
        'updatetime',
        'track',
    ))

    iters = [iter(x['active_tasks']) for k, x in iteritems(self.projects)
             if x and (k == project if project else True)]
    tasks = [next(x, None) for x in iters]
    result = []

    while len(result) < limit and tasks and not all(x is None for x in tasks):
        # FIX: exhausted iterators leave None in `tasks`; `max(tasks)`
        # raises TypeError on Python 3 (None is not orderable against a
        # tuple) — compare only the live entries.
        updatetime, task = t = max(x for x in tasks if x is not None)
        i = tasks.index(t)
        tasks[i] = next(iters[i], None)
        for key in list(task):
            if key == 'track':
                # keep only the ok-flags from the track detail
                track = {}
                if 'fetch' in task['track'] and 'ok' in task['track']['fetch']:
                    track['fetch'] = {'ok': task['track']['fetch']['ok']}
                if 'process' in task['track'] and 'ok' in task['track']['process']:
                    track['process'] = {'ok': task['track']['process']['ok']}
                task['track'] = track
            elif key in allowed_keys:
                continue
            else:
                del task[key]
        result.append(t)
    # fix for "<type 'exceptions.TypeError'>:dictionary key must be string"
    # have no idea why
    return utils.unicode_obj(json.loads(json.dumps(result)))
def get_script(project):
    """Return the project's name and script as a JSON response.

    Rejects invalid project names with a 400.
    """
    projectdb = app.config['projectdb']
    if not projectdb.verify_project_name(project):
        return 'project name is not allowed!', 400
    info = projectdb.get(project, fields=['name', 'script'])
    return (json.dumps(utils.unicode_obj(info)),
            200,
            {'Content-Type': 'application/json'})
def spiderweb_get_data(project):
    """Return crawled rows for the posted project id (`url` form field)
    from the `shuiliting` MySQL table, JSON-encoded.

    NOTE(review): DB host/credentials are hard-coded here — they belong
    in configuration.
    """
    projectdb = app.config['projectdb']
    url1 = request.form.get('url')
    conn = pymysql.connect(host='127.0.0.1', port=3306, user='******',
                           password='******', db='repository', charset='utf8')
    # FIX: the original leaked the connection and cursor and left debug
    # print() calls in; close both deterministically.
    try:
        cur = conn.cursor()
        try:
            cur.execute(
                "select url,title,content,publish_time,crawl_time "
                "from shuiliting where project_id = %s", url1)
            result = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()
    return json.dumps(utils.unicode_obj(result)), 200, {'Content-Type': 'application/json'}
def save(project):
    """Create or update a project's script.

    Locked projects require an authenticated user. Editing a DEBUG or
    RUNNING project flips its status back to CHECKING; unknown projects
    are inserted with default rate/burst. Notifies the scheduler via RPC.
    """
    projectdb = app.config['projectdb']
    if not projectdb.verify_project_name(project):
        return 'project name is not allowed!', 400

    script = request.form['script']
    project_info = projectdb.get(project, fields=['name', 'status', 'group'])

    # locked projects may only be edited by a logged-in user
    if project_info and 'lock' in projectdb.split_group(project_info.get('group')) \
            and not login.current_user.is_active():
        return app.login_response

    if project_info:
        info = {'script': script}
        if project_info.get('status') in ('DEBUG', 'RUNNING', ):
            # a live project's new script must be re-checked
            info['status'] = 'CHECKING'
        projectdb.update(project, utils.unicode_obj(info))
    else:
        info = {
            'name': project,
            'script': script,
            'status': 'TODO',
            'rate': app.config.get('max_rate', 1),
            'burst': app.config.get('max_burst', 3),
        }
        projectdb.insert(project, utils.unicode_obj(info))

    rpc = app.config['scheduler_rpc']
    if rpc is not None:
        try:
            rpc.update_project()
        except socket.error as e:
            app.logger.warning('connect to scheduler rpc error: %r', e)
            return 'rpc error', 200
    return 'ok', 200
def restart(self, project, status):
    """Re-queue every task of `project` that is in `status`, forcing a
    re-crawl.

    Tasks already waiting in the in-memory queue and `data:`/`curl:`
    pseudo-URLs are skipped. Tasks are pushed in chunks of 1000.
    """
    restart_tasks = []
    for task in self.taskdb.load_tasks(status, project):
        if task['taskid'] in self.task_queue[task['project']]:
            continue
        # FIX: `url = task['url']` was assigned twice in the original
        url = task['url']
        if url.startswith(('data', 'curl')):
            continue
        task.setdefault('schedule', self.default_schedule)
        task['schedule']['force_update'] = True
        restart_tasks.append(task)

    for each in (restart_tasks[x:x + 1000]
                 for x in range(0, len(restart_tasks), 1000)):
        self.newtask_queue.put(
            [utils.unicode_obj(newtask) for newtask in each])
    logger.info("restart tasks of project %s success!" % project)
def get_active_tasks(project=None, limit=100):
    """Merge newest active tasks across projects (or a single project).

    N-way merge over per-project ``active_tasks`` iterators; keeps only
    whitelisted keys plus a reduced ``track``. Returns up to ``limit``
    (updatetime, task) pairs, newest first.
    """
    allowed_keys = set((
        'taskid',
        'project',
        'status',
        'url',
        'lastcrawltime',
        'updatetime',
        'track',
    ))

    iters = [
        iter(x['active_tasks']) for k, x in iteritems(self.projects)
        if x and (k == project if project else True)
    ]
    tasks = [next(x, None) for x in iters]
    result = []

    while len(result) < limit and tasks and not all(x is None for x in tasks):
        # FIX: `max(tasks)` breaks on Python 3 once any iterator is
        # exhausted (None cannot be ordered against a tuple) — only
        # consider non-None entries.
        updatetime, task = t = max(x for x in tasks if x is not None)
        i = tasks.index(t)
        tasks[i] = next(iters[i], None)
        for key in list(task):
            if key == 'track':
                # keep only the ok-flags of the fetch/process track
                track = {}
                if 'fetch' in task['track'] and 'ok' in task['track']['fetch']:
                    track['fetch'] = {'ok': task['track']['fetch']['ok']}
                if 'process' in task['track'] and 'ok' in task['track']['process']:
                    track['process'] = {'ok': task['track']['process']['ok']}
                task['track'] = track
            elif key in allowed_keys:
                continue
            else:
                del task[key]
        result.append(t)
    # fix for "<type 'exceptions.TypeError'>:dictionary key must be string"
    # have no idea why
    return utils.unicode_obj(json.loads(json.dumps(result)))
def add_seed(self, seed_path, project, callback):
    """Read seed URLs (one per line, UTF-8) from `seed_path` and queue
    them as new tasks for `project`, each invoking `callback`.

    Tasks are pushed to the newtask queue in chunks of 1000.
    """
    from pyspider.libs.utils import md5string
    new_tasks = []
    # FIX: open in binary mode so `.decode('utf-8')` works on both
    # Python 2 and 3 — in text mode, Python 3 strings have no .decode()
    with open(seed_path, 'rb') as fi:
        for line in fi:
            url = line.strip().decode('utf-8')
            if not url:
                # FIX: blank lines used to become tasks with an empty url
                continue
            task = {
                'url': url,
                'project': project,
                'status': 1,
                'schedule': {'force_update': True},
                'taskid': md5string(url),
                'fetch': {},
                'process': {'callback': callback},
                'depth': 0,
            }
            new_tasks.append(task)

    for each in (new_tasks[x:x + 1000]
                 for x in range(0, len(new_tasks), 1000)):
        self.newtask_queue.put(
            [utils.unicode_obj(newtask) for newtask in each])
    logger.info("add seed success for project %s!" % project)
def migrate(pool, from_connection, to_connection):
    """
    Migrate tool for pyspider
    """
    src = connect_database(from_connection)
    dst = connect_database(to_connection)

    if isinstance(src, ProjectDB):
        # projects are few: copy them in-process, one by one
        for each in src.get_all():
            each = unicode_obj(each)
            logging.info("projectdb: %s", each['name'])
            dst.drop(each['name'])
            dst.insert(each['name'], each)
    elif isinstance(src, TaskDB):
        # fan out per-project migration over a worker pool; connection
        # strings are bound as lambda defaults for the workers
        workers = Pool(pool)
        workers.map(
            lambda x, f=from_connection, t=to_connection: taskdb_migrating(x, f, t),
            src.projects)
    elif isinstance(src, ResultDB):
        workers = Pool(pool)
        workers.map(
            lambda x, f=from_connection, t=to_connection: resultdb_migrating(x, f, t),
            src.projects)
def migrate(pool, from_connection, to_connection):
    """
    Migrate tool for pyspider
    """
    f = connect_database(from_connection)
    t = connect_database(to_connection)

    if isinstance(f, ProjectDB):
        # project records: straight copy with drop-then-insert
        for each in f.get_all():
            each = unicode_obj(each)
            logging.info("projectdb: %s", each['name'])
            t.drop(each['name'])
            t.insert(each['name'], each)
        return

    # task/result stores migrate per project on a worker pool; the
    # connection strings are captured as lambda defaults
    if isinstance(f, TaskDB):
        migrating = taskdb_migrating
    elif isinstance(f, ResultDB):
        migrating = resultdb_migrating
    else:
        return
    pool = Pool(pool)
    pool.map(
        lambda x, f=from_connection, t=to_connection: migrating(x, f, t),
        f.projects)
def on_task(self, task, response):
    '''Deal one task'''
    start_time = time.time()
    response = rebuild_response(response)

    # Run the project's handler; any failure (bad project, script error)
    # is folded into a ProcessorResult carrying the traceback.
    try:
        assert 'taskid' in task, 'need taskid in task'
        project = task['project']
        updatetime = task.get('project_updatetime', None)
        md5sum = task.get('project_md5sum', None)
        project_data = self.project_manager.get(project, updatetime, md5sum)
        assert project_data, "no such project!"
        if project_data.get('exception'):
            ret = ProcessorResult(logs=(project_data.get('exception_log'), ),
                                  exception=project_data['exception'])
        else:
            ret = project_data['instance'].run_task(
                project_data['module'], task, response)
    except Exception as e:
        logstr = traceback.format_exc()
        ret = ProcessorResult(logs=(logstr, ), exception=e)
    process_time = time.time() - start_time

    if not ret.extinfo.get('not_send_status', False):
        # full headers only on failure; otherwise just the cache headers
        if ret.exception:
            track_headers = dict(response.headers)
        else:
            track_headers = {}
            for name in ('etag', 'last-modified'):
                if name not in response.headers:
                    continue
                track_headers[name] = response.headers[name]

        status_pack = {
            'taskid': task['taskid'],
            'project': task['project'],
            'url': task.get('url'),
            'track': {
                'fetch': {
                    'ok': response.isok(),
                    'redirect_url': response.url if response.url != response.orig_url else None,
                    'time': response.time,
                    'error': response.error,
                    'status_code': response.status_code,
                    'encoding': getattr(response, '_encoding', None),
                    'headers': track_headers,
                    'content': response.text[:500] if ret.exception else None,
                },
                'process': {
                    'ok': not ret.exception,
                    'time': process_time,
                    'follows': len(ret.follows),
                    'result': (None if ret.result is None
                               else utils.text(ret.result)[:self.RESULT_RESULT_LIMIT]),
                    'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                    'exception': ret.exception,
                },
                'save': ret.save,
            },
        }
        if 'schedule' in task:
            status_pack['schedule'] = task['schedule']

        # FIXME: unicode_obj should used in scheduler before store to database
        # it's used here for performance.
        self.status_queue.put(utils.unicode_obj(status_pack))

    # FIXME: unicode_obj should used in scheduler before store to database
    # it's used here for performance.
    if ret.follows:
        # push follow-up tasks in chunks of 1000
        for each in (ret.follows[x:x + 1000]
                     for x in range(0, len(ret.follows), 1000)):
            self.newtask_queue.put(
                [utils.unicode_obj(newtask) for newtask in each])

    # messages are re-dispatched as synthetic _on_message tasks
    for project, msg, url in ret.messages:
        try:
            self.on_task({
                'taskid': utils.md5string(url),
                'project': project,
                'url': url,
                'process': {
                    'callback': '_on_message',
                }
            }, {
                'status_code': 200,
                'url': url,
                'save': (task['project'], msg),
            })
        except Exception as e:
            logger.exception('Sending message error.')
            continue

    logger_func = logger.error if ret.exception else logger.info
    logger_func(
        'process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r'
        % (task['project'], task['taskid'], task.get('url'),
           response.status_code, len(response.content),
           ret.result, len(ret.follows), len(ret.messages), ret.exception))

    return True
return False process_time = time.time() - start_time if not ret.extinfo.get('not_send_status', False): status_pack = utils.unicode_obj({ 'taskid': task['taskid'], 'project': task['project'], 'url': task.get('url'), 'track': { 'fetch': { 'ok': response.isok(), 'time': response.time, 'status_code': response.status_code, 'headers': dict(response.headers), 'encoding': response.encoding, 'content': response.content[:500] \ if not response.isok() or ret.exception else None, }, 'process': { 'ok': not ret.exception, 'time': process_time, 'follows': len(ret.follows), 'result': unicode(ret.result)[:100], 'logs': ret.logstr()[-200:], 'exception': ret.exception, }, }, }) self.status_queue.put(status_pack) for newtask in ret.follows: self.newtask_queue.put(newtask)
def run(project):
    """Debug-run one task against the posted (or stored) script.

    In webdav mode the script is loaded from projectdb instead of the
    form. Always responds 200 with a JSON payload: fetch result, logs,
    follows, messages, result and elapsed time; errors are reported via
    the ``logs`` field.
    """
    start_time = time.time()
    try:
        task = utils.decode_unicode_obj(json.loads(request.form['task']))
    except Exception:
        result = {
            'fetch_result': "",
            'logs': u'task json error',
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
        return json.dumps(utils.unicode_obj(result)), \
            200, {'Content-Type': 'application/json'}

    project_info = {
        'name': project,
        'status': 'DEBUG',
        'script': request.form['script'],
    }

    if request.form.get('webdav_mode') == 'true':
        projectdb = app.config['projectdb']
        info = projectdb.get(project, fields=['name', 'script'])
        if not info:
            result = {
                'fetch_result': "",
                'logs': u' in wevdav mode, cannot load script',
                'follows': [],
                'messages': [],
                'result': None,
                'time': time.time() - start_time,
            }
            return json.dumps(utils.unicode_obj(result)), \
                200, {'Content-Type': 'application/json'}
        project_info['script'] = info['script']

    fetch_result = {}
    try:
        fetch_result = app.config['fetch'](task)
        response = rebuild_response(fetch_result)
        module = ProjectManager.build_module(project_info, {
            'debugger': True
        })
        ret = module['instance'].run_task(module['module'], task, response)
    except Exception:
        # renamed from `type, value, tb` — the original shadowed the
        # `type` builtin
        exc_type, exc_value, tb = sys.exc_info()
        tb = utils.hide_me(tb, globals())
        logs = ''.join(traceback.format_exception(exc_type, exc_value, tb))
        result = {
            'fetch_result': fetch_result,
            'logs': logs,
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
    else:
        result = {
            'fetch_result': fetch_result,
            'logs': ret.logstr(),
            'follows': ret.follows,
            'messages': ret.messages,
            'result': ret.result,
            'time': time.time() - start_time,
        }
        result['fetch_result']['content'] = response.text
        # inline a data URL for image responses so the UI can preview them
        if (response.headers.get('content-type', '').startswith('image')):
            result['fetch_result']['dataurl'] = dataurl.encode(
                response.content, response.headers['content-type'])

    try:
        # binary data can't encode to JSON, encode result as unicode obj
        # before send it to frontend
        return json.dumps(utils.unicode_obj(result)), 200, {'Content-Type': 'application/json'}
    except Exception:
        exc_type, exc_value, tb = sys.exc_info()
        tb = utils.hide_me(tb, globals())
        logs = ''.join(traceback.format_exception(exc_type, exc_value, tb))
        result = {
            'fetch_result': "",
            'logs': logs,
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
        return json.dumps(utils.unicode_obj(result)), 200, {'Content-Type': 'application/json'}
return False process_time = time.time() - start_time if not ret.extinfo.get('not_send_status', False): status_pack = utils.unicode_obj({ 'taskid': task['taskid'], 'project': task['project'], 'url': task.get('url'), 'track': { 'fetch': { 'ok': response.isok(), 'time': response.time, 'status_code': response.status_code, 'headers': dict(response.headers), 'encoding': response.encoding, 'content': response.content[:500] \ if not response.isok() or ret.exception else None, }, 'process': { 'ok': not ret.exception, 'time': process_time, 'follows': len(ret.follows), 'result': unicode(ret.result)[:self.RESULT_RESULT_LIMIT], 'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:], 'exception': ret.exception, }, }, }) self.status_queue.put(status_pack) for newtask in ret.follows: self.newtask_queue.put(newtask)
def on_task(self, task, response):
    """Process one fetched task; returns True on success, False when the
    project is unknown or the handler raises."""
    start_time = time.time()
    try:
        response = rebuild_response(response)
        assert 'taskid' in task, 'need taskid in task'
        project = task['project']
        updatetime = task.get('updatetime', None)
        project_data = self.project_manager.get(project, updatetime)
        if not project_data:
            logger.error("no such project: %s", project)
            return False
        ret = project_data['instance'].run(
            project_data['module'], task, response)
    except Exception as e:
        logger.exception(e)
        return False
    process_time = time.time() - start_time

    if not ret.extinfo.get('not_send_status', False):
        # keep all headers on failure, only cache-validation headers
        # on success
        if ret.exception:
            track_headers = dict(response.headers)
        else:
            track_headers = {}
            for name in ('etag', 'last-modified'):
                if name not in response.headers:
                    continue
                track_headers[name] = response.headers[name]

        status_pack = {
            'taskid': task['taskid'],
            'project': task['project'],
            'url': task.get('url'),
            'track': {
                'fetch': {
                    'ok': response.isok(),
                    'redirect_url': response.url if response.url != response.orig_url else None,
                    'time': response.time,
                    'error': response.error,
                    'status_code': response.status_code,
                    'encoding': response.encoding,
                    'headers': track_headers,
                    'content': response.content[:500] if ret.exception else None,
                },
                'process': {
                    'ok': not ret.exception,
                    'time': process_time,
                    'follows': len(ret.follows),
                    'result': (None if ret.result is None
                               else utils.text(ret.result)[:self.RESULT_RESULT_LIMIT]),
                    'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                    'exception': ret.exception,
                },
            },
        }
        # FIXME: unicode_obj should used in scheduler before store to database
        # it's used here for performance.
        self.status_queue.put(utils.unicode_obj(status_pack))

    # FIXME: unicode_obj should used in scheduler before store to database
    # it's used here for performance.
    if ret.follows:
        self.newtask_queue.put(
            [utils.unicode_obj(newtask) for newtask in ret.follows])

    # re-dispatch messages to ourselves as synthetic _on_message tasks
    for project, msg, url in ret.messages:
        self.inqueue.put(({
            'taskid': utils.md5string(url),
            'project': project,
            'url': url,
            'process': {
                'callback': '_on_message',
            }
        }, {
            'status_code': 200,
            'url': url,
            'save': (task['project'], msg),
        }))

    logger_func = logger.error if (response.error or ret.exception) else logger.info
    logger_func('process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r' % (
        task['project'], task['taskid'], task.get('url'),
        response.status_code, len(response.content),
        ret.result, len(ret.follows), len(ret.messages), ret.exception))

    return True
if not response.isok() or ret.exception else None, }, 'process': { 'ok': not ret.exception, 'time': process_time, 'follows': len(ret.follows), 'result': unicode(ret.result)[:self.RESULT_RESULT_LIMIT], 'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:], 'exception': ret.exception, }, }, } # FIXME: unicode_obj should used in scheduler before store to database # it's used here for performance. self.status_queue.put(utils.unicode_obj(status_pack)) for newtask in ret.follows: # FIXME: unicode_obj should used in scheduler before store to database # it's used here for performance. self.newtask_queue.put(utils.unicode_obj(newtask)) for project, msg, url in ret.messages: self.inqueue.put(({ 'taskid': utils.md5string(url), 'project': project, 'url': url, 'process': { 'callback': '_on_message', } }, {
def run(project):
    """Debug-run one task against the posted (or webdav-stored) script
    and return the outcome as JSON.

    Always responds 200; parse, fetch and script errors are reported in
    the ``logs`` field of the payload.
    """
    start_time = time.time()
    try:
        task = utils.decode_unicode_obj(json.loads(request.form['task']))
    except Exception:
        result = {
            'fetch_result': "",
            'logs': u'task json error',
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
        return json.dumps(utils.unicode_obj(result)), \
            200, {'Content-Type': 'application/json'}

    project_info = {
        'name': project,
        'status': 'DEBUG',
        'script': request.form['script'],
    }

    if request.form.get('webdav_mode') == 'true':
        projectdb = app.config['projectdb']
        info = projectdb.get(project, fields=['name', 'script'])
        if not info:
            result = {
                'fetch_result': "",
                'logs': u' in wevdav mode, cannot load script',
                'follows': [],
                'messages': [],
                'result': None,
                'time': time.time() - start_time,
            }
            return json.dumps(utils.unicode_obj(result)), \
                200, {'Content-Type': 'application/json'}
        project_info['script'] = info['script']

    fetch_result = {}
    try:
        fetch_result = app.config['fetch'](task)
        response = rebuild_response(fetch_result)
        module = ProjectManager.build_module(project_info, {'debugger': True})
        ret = module['instance'].run_task(module['module'], task, response)
    except Exception:
        # renamed from `type, value, tb` — the original shadowed the
        # `type` builtin
        exc_type, exc_value, tb = sys.exc_info()
        tb = utils.hide_me(tb, globals())
        logs = ''.join(traceback.format_exception(exc_type, exc_value, tb))
        result = {
            'fetch_result': fetch_result,
            'logs': logs,
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
    else:
        result = {
            'fetch_result': fetch_result,
            'logs': ret.logstr(),
            'follows': ret.follows,
            'messages': ret.messages,
            'result': ret.result,
            'time': time.time() - start_time,
        }
        result['fetch_result']['content'] = response.text
        # attach a data URL for images so the frontend can preview them
        if (response.headers.get('content-type', '').startswith('image')):
            result['fetch_result']['dataurl'] = dataurl.encode(
                response.content, response.headers['content-type'])

    try:
        # binary data can't encode to JSON, encode result as unicode obj
        # before send it to frontend
        return json.dumps(utils.unicode_obj(result)), 200, {
            'Content-Type': 'application/json'
        }
    except Exception:
        exc_type, exc_value, tb = sys.exc_info()
        tb = utils.hide_me(tb, globals())
        logs = ''.join(traceback.format_exception(exc_type, exc_value, tb))
        result = {
            'fetch_result': "",
            'logs': logs,
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
        return json.dumps(utils.unicode_obj(result)), 200, {
            'Content-Type': 'application/json'
        }
def on_task(self, task, response):
    '''Deal one task'''
    start_time = time.time()
    response = rebuild_response(response)

    # Any failure while locating the project or running its handler is
    # converted into a ProcessorResult carrying the traceback.
    try:
        assert 'taskid' in task, 'need taskid in task'
        project = task['project']
        updatetime = task.get('project_updatetime', None)
        md5sum = task.get('project_md5sum', None)
        project_data = self.project_manager.get(project, updatetime, md5sum)
        assert project_data, "no such project!"
        if project_data.get('exception'):
            ret = ProcessorResult(logs=(project_data.get('exception_log'), ),
                                  exception=project_data['exception'])
        else:
            ret = project_data['instance'].run_task(
                project_data['module'], task, response)
    except Exception as e:
        logstr = traceback.format_exc()
        ret = ProcessorResult(logs=(logstr, ), exception=e)
    process_time = time.time() - start_time

    if not ret.extinfo.get('not_send_status', False):
        # on failure keep every header; otherwise only cache validators
        if ret.exception:
            track_headers = dict(response.headers)
        else:
            track_headers = {}
            for name in ('etag', 'last-modified'):
                if name not in response.headers:
                    continue
                track_headers[name] = response.headers[name]

        status_pack = {
            'taskid': task['taskid'],
            'project': task['project'],
            'url': task.get('url'),
            'track': {
                'fetch': {
                    'ok': response.isok(),
                    'redirect_url': response.url if response.url != response.orig_url else None,
                    'time': response.time,
                    'error': response.error,
                    'status_code': response.status_code,
                    'encoding': response.encoding,
                    'headers': track_headers,
                    'content': response.text[:500] if ret.exception else None,
                },
                'process': {
                    'ok': not ret.exception,
                    'time': process_time,
                    'follows': len(ret.follows),
                    'result': (None if ret.result is None
                               else utils.text(ret.result)[:self.RESULT_RESULT_LIMIT]),
                    'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                    'exception': ret.exception,
                },
                'save': ret.save,
            },
        }
        if 'schedule' in task:
            status_pack['schedule'] = task['schedule']

        # FIXME: unicode_obj should used in scheduler before store to database
        # it's used here for performance.
        self.status_queue.put(utils.unicode_obj(status_pack))

    # FIXME: unicode_obj should used in scheduler before store to database
    # it's used here for performance.
    if ret.follows:
        # follow-ups go out in chunks of 1000
        for each in (ret.follows[x:x + 1000]
                     for x in range(0, len(ret.follows), 1000)):
            self.newtask_queue.put(
                [utils.unicode_obj(newtask) for newtask in each])

    # messages are re-processed as synthetic _on_message tasks
    for project, msg, url in ret.messages:
        try:
            self.on_task({
                'taskid': utils.md5string(url),
                'project': project,
                'url': url,
                'process': {
                    'callback': '_on_message',
                }
            }, {
                'status_code': 200,
                'url': url,
                'save': (task['project'], msg),
            })
        except Exception as e:
            logger.exception('Sending message error.')
            continue

    logger_func = logger.error if ret.exception else logger.info
    logger_func('process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r' % (
        task['project'], task['taskid'], task.get('url'),
        response.status_code, len(response.content),
        ret.result, len(ret.follows), len(ret.messages), ret.exception))

    return True
def api_debug_run(project):
    """Debug-run one task via the HTTP API.

    Like the webui debugger's `run`, but wraps the payload in a
    ``{'Status': 1, 'Result': ...}`` envelope and replies with CORS
    headers. Always responds 200; errors are reported in ``logs``.
    """
    start_time = time.time()
    try:
        task = utils.decode_unicode_obj(json.loads(request.form['task']))
    except Exception:
        result = {
            'fetch_result': "",
            'logs': u'task json error',
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
        return json.dumps({
            'Status': 1,
            'Result': utils.unicode_obj(result)
        }), 200, cors_resp_header

    project_info = {
        'name': project,
        'status': 'DEBUG',
        'script': request.form['script'],
    }

    if request.form.get('webdav_mode') == 'true':
        projectdb = app.config['projectdb']
        info = projectdb.get(project, fields=['name', 'script'])
        if not info:
            result = {
                'fetch_result': "",
                'logs': u' in wevdav mode, cannot load script',
                'follows': [],
                'messages': [],
                'result': None,
                'time': time.time() - start_time,
            }
            return json.dumps({
                'Status': 1,
                'Result': utils.unicode_obj(result)
            }), 200, cors_resp_header
        project_info['script'] = info['script']

    fetch_result = {}
    try:
        module = ProjectManager.build_module(
            project_info, {
                'debugger': True,
                'process_time_limit': app.config['process_time_limit'],
            })
        # The code below is to mock the behavior that crawl_config been
        # joined when selected by scheduler.
        # but to have a better view of joined tasks, it has been done in
        # BaseHandler.crawl when `is_debugger is True`
        # crawl_config = module['instance'].crawl_config
        # task = module['instance'].task_join_crawl_config(task, crawl_config)

        fetch_result = app.config['fetch'](task)
        response = rebuild_response(fetch_result)
        ret = module['instance'].run_task(module['module'], task, response)
    except Exception:
        # renamed from `type, value, tb` — the original shadowed the
        # `type` builtin
        exc_type, exc_value, tb = sys.exc_info()
        tb = utils.hide_me(tb, globals())
        logs = ''.join(traceback.format_exception(exc_type, exc_value, tb))
        result = {
            'fetch_result': fetch_result,
            'logs': logs,
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
    else:
        result = {
            'fetch_result': fetch_result,
            'logs': ret.logstr(),
            'follows': ret.follows,
            'messages': ret.messages,
            'result': ret.result,
            'time': time.time() - start_time,
        }
        result['fetch_result']['content'] = response.text
        # inline image responses as data URLs for previewing
        if (response.headers.get('content-type', '').startswith('image')):
            result['fetch_result']['dataurl'] = dataurl.encode(
                response.content, response.headers['content-type'])

    try:
        # binary data can't encode to JSON, encode result as unicode obj
        # before send it to frontend
        return json.dumps({
            'Status': 1,
            'Result': utils.unicode_obj(result)
        }), 200, cors_resp_header
    except Exception:
        exc_type, exc_value, tb = sys.exc_info()
        tb = utils.hide_me(tb, globals())
        logs = ''.join(traceback.format_exception(exc_type, exc_value, tb))
        result = {
            'fetch_result': "",
            'logs': logs,
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
        return json.dumps({
            'Status': 1,
            'Result': utils.unicode_obj(result)
        }), 200, cors_resp_header
def run(project):
    """Debug-run one task against the posted script and return the
    outcome (fetch result, logs, follows, messages) as JSON.

    Always responds 200; parse, fetch and script errors are reported in
    the ``logs`` field.
    """
    start_time = time.time()
    try:
        task = utils.decode_unicode_obj(json.loads(request.form['task']))
    except Exception:
        result = {
            'fetch_result': "",
            'logs': u'task json error',
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
        return json.dumps(utils.unicode_obj(result)), 200, {
            'Content-Type': 'application/json'
        }

    project_info = {
        'name': project,
        'status': 'DEBUG',
        'script': request.form['script'],
    }
    fetch_result = {}
    try:
        fetch_result = app.config['fetch'](task)
        response = rebuild_response(fetch_result)
        module = ProjectManager.build_module(project_info, {'debugger': True})
        ret = module['instance'].run(module['module'], task, response)
    except Exception:
        # renamed from `type, value, tb` — the original shadowed the
        # `type` builtin
        exc_type, exc_value, tb = sys.exc_info()
        tb = utils.hide_me(tb, globals())
        logs = ''.join(traceback.format_exception(exc_type, exc_value, tb))
        result = {
            'fetch_result': fetch_result,
            'logs': logs,
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
    else:
        result = {
            'fetch_result': fetch_result,
            'logs': ret.logstr(),
            'follows': ret.follows,
            'messages': ret.messages,
            'result': ret.result,
            'time': time.time() - start_time,
        }
        # only valid on the success path: `response` is unbound when the
        # fetch raised
        result['fetch_result']['content'] = response.text

    try:
        # binary data can't encode to JSON, encode result as unicode obj
        # before send it to frontend
        return json.dumps(utils.unicode_obj(result)), 200, {
            'Content-Type': 'application/json'
        }
    except Exception:
        exc_type, exc_value, tb = sys.exc_info()
        tb = utils.hide_me(tb, globals())
        logs = ''.join(traceback.format_exception(exc_type, exc_value, tb))
        result = {
            'fetch_result': "",
            'logs': logs,
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
        return json.dumps(utils.unicode_obj(result)), 200, {
            'Content-Type': 'application/json'
        }
def on_task(self, task, response):
    """Process one fetched task; returns True on success, False when the
    project is unknown or the handler raises."""
    start_time = time.time()
    try:
        response = rebuild_response(response)
        assert 'taskid' in task, 'need taskid in task'
        project = task['project']
        updatetime = task.get('updatetime', None)
        project_data = self.project_manager.get(project, updatetime)
        if not project_data:
            logger.error("no such project: %s", project)
            return False
        ret = project_data['instance'].run(project_data['module'], task,
                                           response)
    except Exception as e:
        logger.exception(e)
        return False
    process_time = time.time() - start_time

    if not ret.extinfo.get('not_send_status', False):
        status_pack = {
            'taskid': task['taskid'],
            'project': task['project'],
            'url': task.get('url'),
            'track': {
                'fetch': {
                    'ok': response.isok(),
                    'time': response.time,
                    'status_code': response.status_code,
                    'headers': dict(response.headers),
                    'encoding': response.encoding,
                    # keep a content snippet only when processing failed
                    'content': (response.content[:500]
                                if ret.exception else None),
                },
                'process': {
                    'ok': not ret.exception,
                    'time': process_time,
                    'follows': len(ret.follows),
                    'result': utils.text(ret.result)[:self.RESULT_RESULT_LIMIT],
                    'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                    'exception': ret.exception,
                },
            },
        }
        # FIXME: unicode_obj should used in scheduler before store to database
        # it's used here for performance.
        self.status_queue.put(utils.unicode_obj(status_pack))

    # FIXME: unicode_obj should used in scheduler before store to database
    # it's used here for performance.
    self.newtask_queue.put(
        [utils.unicode_obj(newtask) for newtask in ret.follows])

    # re-dispatch messages to ourselves as synthetic _on_message tasks
    for project, msg, url in ret.messages:
        self.inqueue.put(({
            'taskid': utils.md5string(url),
            'project': project,
            'url': url,
            'process': {
                'callback': '_on_message',
            }
        }, {
            'status_code': 200,
            'url': url,
            'save': (task['project'], msg),
        }))

    logger_func = logger.error if (response.error or ret.exception) else logger.info
    logger_func(
        'process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r'
        % (task['project'], task['taskid'], task.get('url'),
           response.status_code, len(response.content),
           ret.result, len(ret.follows), len(ret.messages), ret.exception))

    return True
def on_task(self, task, response):
    """Process one fetched task; returns True on success, False when the
    project is unknown or the handler raises.

    NOTE(review): this variant calls `unicode(...)`, which exists only on
    Python 2 — confirm this module still targets py2.
    """
    start_time = time.time()
    try:
        response = rebuild_response(response)
        assert 'taskid' in task, 'need taskid in task'
        project = task['project']
        if project not in self.projects:
            raise LookupError("no such project: %s" % project)
        project_data = self.projects[project]
        ret = project_data['instance'].run(
            project_data['module'], task, response)
    except Exception as e:
        logger.exception(e)
        return False
    process_time = time.time() - start_time

    if not ret.extinfo.get('not_send_status', False):
        status_pack = {
            'taskid': task['taskid'],
            'project': task['project'],
            'url': task.get('url'),
            'track': {
                'fetch': {
                    'ok': response.isok(),
                    'time': response.time,
                    'status_code': response.status_code,
                    'headers': dict(response.headers),
                    'encoding': response.encoding,
                    # keep a content snippet only for failed fetch/process
                    'content': (response.content[:500]
                                if not response.isok() or ret.exception
                                else None),
                },
                'process': {
                    'ok': not ret.exception,
                    'time': process_time,
                    'follows': len(ret.follows),
                    'result': unicode(ret.result)[:self.RESULT_RESULT_LIMIT],
                    'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                    'exception': ret.exception,
                },
            },
        }
        # FIXME: unicode_obj should used in scheduler before store to database
        # it's used here for performance.
        self.status_queue.put(utils.unicode_obj(status_pack))

    for newtask in ret.follows:
        # FIXME: unicode_obj should used in scheduler before store to database
        # it's used here for performance.
        self.newtask_queue.put(utils.unicode_obj(newtask))

    # re-dispatch messages to ourselves as synthetic _on_message tasks
    for project, msg, url in ret.messages:
        self.inqueue.put(({
            'taskid': utils.md5string(url),
            'project': project,
            'url': url,
            'process': {
                'callback': '_on_message',
            }
        }, {
            'status_code': 200,
            'url': url,
            'save': (task['project'], msg),
        }))

    logger_func = logger.error if (response.error or ret.exception) else logger.info
    logger_func('process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r' % (
        task['project'], task['taskid'], task.get('url'),
        response.status_code, len(response.content),
        ret.result, len(ret.follows), len(ret.messages), ret.exception))

    return True