def _update_project(self, project):
    '''update one project'''
    # Merge the incoming project record into the scheduler's cache,
    # creating the cache entry on first sight of this project name.
    if project['name'] not in self.projects:
        self.projects[project['name']] = {}
    self.projects[project['name']].update(project)
    # md5 of the script is used elsewhere to detect script changes.
    self.projects[project['name']]['md5sum'] = utils.md5string(project['script'])
    # Ring buffer of recently processed tasks, capped at ACTIVE_TASKS.
    if not self.projects[project['name']].get('active_tasks', None):
        self.projects[project['name']]['active_tasks'] = deque(maxlen=self.ACTIVE_TASKS)

    # load task queue when project is running and delete task_queue when project is stopped
    if project['status'] in ('RUNNING', 'DEBUG'):
        if project['name'] not in self.task_queue:
            self._load_tasks(project['name'])
        # Keep the per-project rate limiter in sync with the project record.
        self.task_queue[project['name']].rate = project['rate']
        self.task_queue[project['name']].burst = project['burst']

        # update project runtime info from processor by sending a _on_get_info
        # request, result is in status_page.track.save
        self.on_select_task({
            'taskid': '_on_get_info',
            'project': project['name'],
            'url': 'data:,_on_get_info',
            'status': self.taskdb.SUCCESS,
            'fetch': {
                'save': ['min_tick', ],
            },
            'process': {
                'callback': '_on_get_info',
            },
        })
    else:
        # Project is not running: zero the limiter before dropping the queue
        # so any concurrent reader sees the project as throttled off.
        if project['name'] in self.task_queue:
            self.task_queue[project['name']].rate = 0
            self.task_queue[project['name']].burst = 0
            del self.task_queue[project['name']]
def _crawl(self, url, **kwargs):
    # Build a task dict from a crawl request and append it to self._follows.
    # NOTE(review): Python-2 idioms (`basestring`, `iteritems`, `im_self`) —
    # this block predates py3 support; do not call under Python 3 as-is.
    task = {}

    if kwargs.get('callback'):
        callback = kwargs['callback']
        # Accept either the name of a method on self, or a bound method of self;
        # anything else is rejected.
        if isinstance(callback, basestring) and hasattr(self, callback):
            func = getattr(self, callback)
        elif hasattr(callback, 'im_self') and callback.im_self is self:
            func = callback
            # Tasks are serialized, so store the callback by name, not by object.
            kwargs['callback'] = func.__name__
        else:
            raise NotImplementedError("self.%s() not implemented!" % callback)
        # Per-callback defaults (set via the @config decorator, presumably)
        # fill in any kwargs the caller did not provide.
        if hasattr(func, '_config'):
            for k, v in func._config.iteritems():
                kwargs.setdefault(k, v)

    # Project-wide defaults apply after per-callback defaults (lower priority).
    if hasattr(self, 'crawl_config'):
        for k, v in self.crawl_config.iteritems():
            kwargs.setdefault(k, v)

    # Normalize the URL: strip whitespace, merge query params, escape CJK chars.
    url = quote_chinese(_build_url(url.strip(), kwargs.get('params')))
    if kwargs.get('files'):
        assert isinstance(kwargs.get('data', {}), dict), "data must be a dict when using with files!"
        # File uploads force a multipart body; data is folded into it.
        content_type, data = _encode_multipart_formdata(kwargs.get('data', {}),
                                                        kwargs.get('files', {}))
        kwargs.setdefault('headers', {})
        kwargs['headers']['Content-Type'] = content_type
        kwargs['data'] = data
    if kwargs.get('data'):
        kwargs['data'] = _encode_params(kwargs['data'])
    # A request with a body defaults to POST.
    if kwargs.get('data'):
        kwargs.setdefault('method', 'POST')

    # Split recognized kwargs into the task's 'schedule' section.
    schedule = {}
    for key in ('priority', 'retries', 'exetime', 'age', 'itag', 'force_update'):
        if key in kwargs and kwargs[key] is not None:
            schedule[key] = kwargs[key]
    if schedule:
        task['schedule'] = schedule

    # ... and the 'fetch' section.  'last_modifed' is (sic) the key the rest
    # of the pipeline expects — do not rename without updating the consumer.
    fetch = {}
    for key in ('method', 'headers', 'data', 'timeout', 'allow_redirects', 'cookies',
                'proxy', 'etag', 'last_modifed', 'save', 'js_run_at', 'js_script',
                'load_images', 'fetch_type'):
        if key in kwargs and kwargs[key] is not None:
            fetch[key] = kwargs[key]
    if fetch:
        task['fetch'] = fetch

    # ... and the 'process' section (currently only the callback name).
    process = {}
    for key in ('callback', ):
        if key in kwargs and kwargs[key] is not None:
            process[key] = kwargs[key]
    if process:
        task['process'] = process

    task['project'] = self.project_name
    task['url'] = url
    # taskid defaults to md5 of the normalized URL (see get_taskid).
    task['taskid'] = task.get('taskid') or md5string(url)

    self._follows.append(task)
    return task
def followeeUser(info):
    """
    Information of people concerned, this function can control the rate of
    send task to the detail page

    :param info: dict with at least 'url_token' and 'following_count' keys
    :return: None (results are persisted through the SQLAlchemy session)
    """
    url_token, following_count = info['url_token'], info['following_count']
    # The API pages followees 20 at a time.
    for page in range(0, following_count, 20):
        url = FOLLOWEES_URL.format(url_token=url_token, offset=page)
        try:
            r = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
        except requests.RequestException:
            # Was a bare `except: pass`, which also swallowed KeyboardInterrupt /
            # SystemExit.  Catch only network-level failures and skip this page.
            time.sleep(TIME_DELAY)
            continue

        content = json.loads(r.content.decode('utf-8'))
        page_datas = content['data']
        user_objs, relation_objs = [], []
        for data in page_datas:
            user_objs.append(UserInfo(user_id=data['id'], url_token=data['url_token']))
            relation_objs.append(Relation(
                parent_url_token=data['url_token'],
                children_url_token=url_token,
                md5=md5string((data['url_token'] + url_token).encode('utf-8'))
            ))
        # Commit users and relations independently so a duplicate-key failure
        # in one batch does not discard the other; rollback keeps the session usable.
        try:
            session.add_all(user_objs)
            session.commit()
        except Exception:
            session.rollback()
        try:
            session.add_all(relation_objs)
            session.commit()
        except Exception:
            session.rollback()
        # Throttle between pages to control the request rate.
        time.sleep(TIME_DELAY)
def test_put(n):
    # Benchmark: push n synthetic tasks through the message queue and
    # report total time, throughput, and per-item latency.
    logger.info("message queue put %d", n)
    started = time.time()
    for seq in range(n):
        task['url'] = 'http://bench.pyspider.org/?l=%d' % seq
        task['taskid'] = md5string(task['url'])
        queue.put(task, block=True, timeout=1)
    elapsed = time.time() - started
    logger.info("cost %.2fs, %.2f/s %.2fms",
                elapsed, n * 1.0 / elapsed, elapsed / n * 1000)
def test_update(n, start=0):
    # Benchmark: write n synthetic task records into the taskdb and report
    # total time, throughput, and per-item latency.
    #
    # :param n: number of tasks to update
    # :param start: offset added to the synthetic URL index, so successive
    #               runs can target distinct task ids
    #
    # Fix: use logging's lazy argument style instead of eager `%` formatting,
    # consistent with the sibling benchmark helpers (see test_put).
    logger.info("taskdb update %d", n)
    start_time = time.time()
    for i in range(n):
        task['url'] = 'http://bench.pyspider.org/?l=%d' % (i + start)
        task['taskid'] = md5string(task['url'])
        task['track'] = track
        taskdb.update(task['project'], task['taskid'], task)
    end_time = time.time()
    cost_time = end_time - start_time
    logger.info("cost %.2fs, %.2f/s %.2fms",
                cost_time, n * 1.0 / cost_time, cost_time / n * 1000)
def test_get(n, start=0, random=True, fields=request_task_fields):
    # Benchmark: read n task records back from the taskdb (optionally in a
    # shuffled order to defeat sequential-read caching) and report timing.
    #
    # :param n: number of tasks to fetch
    # :param start: offset added to the synthetic URL index
    # :param random: shuffle access order when True (note: parameter shadows
    #                the stdlib `random` module, hence the local import below)
    # :param fields: column subset passed through to taskdb.get_task
    #
    # Fix: use logging's lazy argument style instead of eager `%` formatting,
    # consistent with the sibling benchmark helpers (see test_put).
    logger.info("taskdb get %d %s", n, "randomly" if random else "")
    range_n = list(range(n))
    if random:
        from random import shuffle
        shuffle(range_n)
    start_time = time.time()
    for i in range_n:
        task['url'] = 'http://bench.pyspider.org/?l=%d' % (i + start)
        task['taskid'] = md5string(task['url'])
        task['track'] = track
        taskdb.get_task(task['project'], task['taskid'], fields=fields)
    end_time = time.time()
    cost_time = end_time - start_time
    logger.info("cost %.2fs, %.2f/s %.2fms",
                cost_time, n * 1.0 / cost_time, cost_time / n * 1000)
def _load_project(self, project):
    '''Load project into self.projects from project info dict'''
    project_name = project.get('name', None)
    try:
        # Stamp the script checksum, then compile the project module.
        project['md5sum'] = utils.md5string(project['script'])
        built = self.build_module(project, self.env)
        self.projects[project['name']] = built
    except Exception as e:
        # Building failed: cache a stub entry carrying the error details so
        # status pages can surface the exception, and report failure.
        logger.exception("load project %s error", project_name)
        self.projects[project['name']] = {
            'loader': None,
            'module': None,
            'class': None,
            'instance': None,
            'exception': e,
            'exception_log': traceback.format_exc(),
            'info': project,
            'load_time': time.time(),
        }
        return False
    logger.debug('project: %s updated.', project_name)
    return True
def send_message(ctx, scheduler_rpc, project, message):
    """
    Send Message to project from command line
    """
    # Resolve the scheduler RPC endpoint: explicit address string, then the
    # docker-link environment, then the local default.
    if isinstance(scheduler_rpc, six.string_types):
        scheduler_rpc = connect_rpc(ctx, None, scheduler_rpc)
    if scheduler_rpc is None and os.environ.get('SCHEDULER_NAME'):
        linked = os.environ['SCHEDULER_PORT_23333_TCP'][len('tcp://'):]
        scheduler_rpc = connect_rpc(ctx, None, 'http://%s/' % (linked))
    if scheduler_rpc is None:
        scheduler_rpc = connect_rpc(ctx, None, 'http://127.0.0.1:23333/')

    # Deliver the message as a synthetic data: task routed to _on_message.
    message_task = {
        'taskid': utils.md5string('data:,on_message'),
        'project': project,
        'url': 'data:,on_message',
        'fetch': {
            'save': ('__command__', message),
        },
        'process': {
            'callback': '_on_message',
        }
    }
    return scheduler_rpc.send_task(message_task)
'time': process_time, 'follows': len(ret.follows), 'result': unicode(ret.result)[:100], 'logs': ret.logstr()[:200], 'exception': ret.exception, }, }, }) self.status_queue.put(status_pack) for newtask in ret.follows: self.newtask_queue.put(newtask) for project, msg, url in ret.messages: self.inqueue.put(({ 'taskid': utils.md5string(url), 'project': project, 'url': url, 'process': { 'callback': '_on_message', } }, { 'status_code': 200, 'url': url, 'save': (task['project'], msg), })) if response.error or ret.exception: logger_func = logger.error else: logger_func = logger.info
def get_taskid(self, task):
    '''Generate taskid by information of task md5(url) by default, override me'''
    # Default identity of a task is the md5 of its URL; subclasses may
    # override to deduplicate on other task fields.
    url = task['url']
    return md5string(url)
'time': process_time, 'follows': len(ret.follows), 'result': unicode(ret.result)[:100], 'logs': ret.logstr()[-200:], 'exception': ret.exception, }, }, }) self.status_queue.put(status_pack) for newtask in ret.follows: self.newtask_queue.put(newtask) for project, msg, url in ret.messages: self.inqueue.put(({ 'taskid': utils.md5string(url), 'project': project, 'url': url, 'process': { 'callback': '_on_message', } }, { 'status_code': 200, 'url': url, 'save': (task['project'], msg), })) if response.error or ret.exception: logger_func = logger.error else: logger_func = logger.info