def _update_project(self, project):
    '''Refresh the cached state of one project and sync its task queue.

    The project dict is merged into ``self.projects`` under its name, the
    md5 of its script is stored as ``md5sum`` and an ``active_tasks`` deque
    is created when the entry has none (or an empty one).

    For a project whose status is RUNNING or DEBUG the task queue is loaded
    if it is missing, its rate/burst are copied from the project and an
    ``_on_get_info`` task is dispatched. For any other status the queue is
    throttled to zero and dropped.
    '''
    name = project['name']
    if name not in self.projects:
        self.projects[name] = {}
    entry = self.projects[name]
    entry.update(project)
    entry['md5sum'] = utils.md5string(project['script'])
    if not entry.get('active_tasks', None):
        entry['active_tasks'] = deque(maxlen=self.ACTIVE_TASKS)

    # Stopped project: zero the limits, then forget its queue.
    if project['status'] not in ('RUNNING', 'DEBUG'):
        if name in self.task_queue:
            self.task_queue[name].rate = 0
            self.task_queue[name].burst = 0
            del self.task_queue[name]
        return

    # Running project: make sure a queue exists before applying the limits.
    if name not in self.task_queue:
        self._load_tasks(name)
    self.task_queue[name].rate = project['rate']
    self.task_queue[name].burst = project['burst']

    # Ask the processor for the project's runtime info by sending an
    # _on_get_info request; the result is in status_page.track.save.
    info_request = {
        'taskid': '_on_get_info',
        'project': name,
        'url': 'data:,_on_get_info',
        'status': self.taskdb.SUCCESS,
        'fetch': {
            'save': ['min_tick', ],
        },
        'process': {
            'callback': '_on_get_info',
        },
    }
    self.on_select_task(info_request)
def test_put(n):
    """Put ``n`` generated tasks on the message queue and log the throughput.

    The shared ``task`` dict from the enclosing scope is re-used: its url and
    taskid are overwritten for every item before it is queued.
    """
    logger.info("message queue put %d", n)
    began = time.time()
    for index in range(n):
        url = 'http://bench.spider.org/?l=%d' % index
        task['url'] = url
        task['taskid'] = md5string(url)
        queue.put(task, block=True, timeout=1)
    cost_time = time.time() - began
    # total seconds, items per second, milliseconds per item
    logger.info(
        "cost %.2fs, %.2f/s %.2fms",
        cost_time,
        n * 1.0 / cost_time,
        cost_time / n * 1000,
    )
def test_update(n, start=0):
    """Update ``n`` generated tasks in taskdb and log the throughput.

    Urls are numbered from ``start`` to ``start + n - 1``. The shared ``task``
    dict from the enclosing scope is mutated for every item and handed to
    ``taskdb.update`` together with its project and taskid.
    """
    logger.info("taskdb update %d" % n)
    began = time.time()
    for offset in range(n):
        url = 'http://bench.spider.org/?l=%d' % (offset + start)
        task['url'] = url
        task['taskid'] = md5string(url)
        task['track'] = track
        taskdb.update(task['project'], task['taskid'], task)
    cost_time = time.time() - began
    # total seconds, items per second, milliseconds per item
    logger.info(
        "cost %.2fs, %.2f/s %.2fms",
        cost_time,
        n * 1.0 / cost_time,
        cost_time / n * 1000,
    )
def test_get(n, start=0, random=True, fields=request_task_fields):
    """Fetch ``n`` generated tasks from taskdb and log the throughput.

    Urls are numbered from ``start``; when ``random`` is truthy the lookup
    order is shuffled before timing begins. ``fields`` is forwarded to
    ``taskdb.get_task``.
    """
    mode = "randomly" if random else ""
    logger.info("taskdb get %d %s" % (n, mode))

    order = list(range(n))
    if random:
        # The parameter shadows the module name, so import the helper locally.
        from random import shuffle
        shuffle(order)

    began = time.time()
    for offset in order:
        url = 'http://bench.spider.org/?l=%d' % (offset + start)
        task['url'] = url
        task['taskid'] = md5string(url)
        task['track'] = track
        taskdb.get_task(task['project'], task['taskid'], fields=fields)
    cost_time = time.time() - began
    # total seconds, items per second, milliseconds per item
    logger.info(
        "cost %.2fs, %.2f/s %.2fms",
        cost_time,
        n * 1.0 / cost_time,
        cost_time / n * 1000,
    )
def _load_project(self, project):
    """Build a project from its info dict and store it in self.projects.

    On success the result of ``self.build_module`` is stored under the
    project's name and True is returned. If anything raises, a placeholder
    entry carrying the exception, its formatted traceback, the original info
    dict and the load time is stored instead, and False is returned.
    """
    try:
        project["md5sum"] = utils.md5string(project["script"])
        built = self.build_module(project, self.env)
        self.projects[project["name"]] = built
    except Exception as err:
        logger.exception("load project %s error", project.get("name", None))
        trace_text = traceback.format_exc()
        # Keep a stub so callers can see why the project is unusable.
        failed = {
            "loader": None,
            "module": None,
            "class": None,
            "instance": None,
            "exception": err,
            "exception_log": trace_text,
            "info": project,
            "load_time": time.time(),
        }
        self.projects[project["name"]] = failed
        return False
    else:
        logger.debug("project: %s updated.", project.get("name", None))
        return True
def send_message(ctx, scheduler_rpc, project, message):
    """
    Send Message to project from command line
    """
    # NOTE(review): docstring kept verbatim in case it is surfaced as CLI help
    # text (the `ctx` parameter suggests a CLI command) - confirm.
    #
    # scheduler_rpc: an RPC client, an address string to connect to, or None.
    # project:       name of the project that receives the message.
    # message:       payload delivered to the project's `_on_message` callback.
    # Returns whatever `scheduler_rpc.send_task` returns.

    # A string is treated as the scheduler address and turned into a client.
    if isinstance(scheduler_rpc, six.string_types):
        scheduler_rpc = connect_rpc(ctx, None, scheduler_rpc)

    if scheduler_rpc is None and os.environ.get('SCHEDULER_NAME'):
        # SCHEDULER_NAME being set does not guarantee that
        # SCHEDULER_PORT_23333_TCP is set too; indexing os.environ directly
        # raised KeyError in that case. Read it with .get() and fall through
        # to the localhost default when it is missing or empty.
        linked_addr = os.environ.get('SCHEDULER_PORT_23333_TCP')
        if linked_addr:
            # Strip the scheme only when it is actually there instead of
            # blindly cutting off the first six characters.
            if linked_addr.startswith('tcp://'):
                linked_addr = linked_addr[len('tcp://'):]
            scheduler_rpc = connect_rpc(ctx, None, 'http://%s/' % linked_addr)

    # Last resort: the scheduler on the local machine.
    if scheduler_rpc is None:
        scheduler_rpc = connect_rpc(ctx, None, 'http://127.0.0.1:23333/')

    return scheduler_rpc.send_task({
        'taskid': utils.md5string('data:,on_message'),
        'project': project,
        'url': 'data:,on_message',
        'fetch': {
            'save': ('__command__', message),
        },
        'process': {
            'callback': '_on_message',
        }
    })
def get_taskid(self, task):
    '''Return the id for a task.

    The default is the md5 of the task's url; override to use another scheme.
    '''
    url = task['url']
    return md5string(url)
def on_task(self, task, response):
    """Process one fetched task and publish everything it produced.

    The response is rebuilt, the task's project is looked up through
    ``self.project_manager`` and its instance runs the task. Any exception
    raised on the way is captured into a ProcessorResult instead of
    propagating. Afterwards a status pack goes to ``self.status_queue``
    (unless the result asks not to send it), follow-up tasks go to
    ``self.newtask_queue``, messages are delivered by calling ``on_task``
    again, and a summary line is logged. Always returns True.
    """
    began = time.time()
    response = rebuild_response(response)
    try:
        assert "taskid" in task, "need taskid in task"
        project_name = task["project"]
        wanted_updatetime = task.get("project_updatetime", None)
        wanted_md5sum = task.get("project_md5sum", None)
        loaded = self.project_manager.get(project_name, wanted_updatetime, wanted_md5sum)
        assert loaded, "no such project!"
        if loaded.get("exception"):
            # The project failed to load: report that failure as the result.
            outcome = ProcessorResult(
                logs=(loaded.get("exception_log"),),
                exception=loaded["exception"],
            )
        else:
            outcome = loaded["instance"].run_task(loaded["module"], task, response)
    except Exception as err:
        trace_text = traceback.format_exc()
        outcome = ProcessorResult(logs=(trace_text,), exception=err)
    process_time = time.time() - began

    if not outcome.extinfo.get("not_send_status", False):
        if outcome.exception:
            # On failure keep every response header for diagnosis.
            track_headers = dict(response.headers)
        else:
            # On success only the cache validators are tracked.
            track_headers = {
                header: response.headers[header]
                for header in ("etag", "last-modified")
                if header in response.headers
            }

        status_pack = {
            "taskid": task["taskid"],
            "project": task["project"],
            "url": task.get("url"),
        }
        status_pack["track"] = {
            "fetch": {
                "ok": response.isok(),
                "redirect_url": response.url if response.url != response.orig_url else None,
                "time": response.time,
                "error": response.error,
                "status_code": response.status_code,
                "encoding": response.encoding,
                "headers": track_headers,
                "content": response.text[:500] if outcome.exception else None,
            },
            "process": {
                "ok": not outcome.exception,
                "time": process_time,
                "follows": len(outcome.follows),
                "result": (
                    None
                    if outcome.result is None
                    else utils.text(outcome.result)[: self.RESULT_RESULT_LIMIT]
                ),
                "logs": outcome.logstr()[-self.RESULT_LOGS_LIMIT:],
                "exception": outcome.exception,
            },
            "save": outcome.save,
        }
        if "schedule" in task:
            status_pack["schedule"] = task["schedule"]

        # FIXME: unicode_obj should be applied in the scheduler before the
        # data is stored in the database; it is done here for performance.
        self.status_queue.put(utils.unicode_obj(status_pack))

    # FIXME: unicode_obj should be applied in the scheduler before the data
    # is stored in the database; it is done here for performance.
    if outcome.follows:
        self.newtask_queue.put([utils.unicode_obj(followup) for followup in outcome.follows])

    # Deliver messages by processing a synthetic task for the target project.
    for target_project, msg, msg_url in outcome.messages:
        try:
            self.on_task(
                {
                    "taskid": utils.md5string(msg_url),
                    "project": target_project,
                    "url": msg_url,
                    "process": {"callback": "_on_message"},
                },
                {"status_code": 200, "url": msg_url, "save": (task["project"], msg)},
            )
        except Exception:
            # A failed delivery must not stop the remaining messages.
            logger.exception("Sending message error.")

    logger_func = logger.error if outcome.exception else logger.info
    logger_func(
        "process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r"
        % (
            task["project"],
            task["taskid"],
            task.get("url"),
            response.status_code,
            len(response.content),
            outcome.result,
            len(outcome.follows),
            len(outcome.messages),
            outcome.exception,
        )
    )
    return True