def initialize(self):
    super(SaveLogTask, self).initialize()
    # self.logger = get_task_logger(self.name)
    pagedb = get_connection('page')
    logdb = get_connection('log')
    taskdb = get_connection('task')
    self._task_coll = taskdb['task_infos']
    self._page_coll = pagedb['page_infos']
    self._log_coll = logdb['task_logs']
    self._log_coll.create_index([('created', -1), ('task_id', 1)])
    self._page_coll.create_index([('task_id', 1)])
    self._task_coll.create_index([('received', -1), ('task_id', 1)])
def on_worker_ready(sender, signal, **kwargs):
    # redis_conf = redis.from_url(app.conf.redis['conf'])
    from yunduo.resource import get_connection
    redis_run = get_connection('run')
    kw = {
        'exchange': 'xcrawl',
        'reply': True,
        'binding_key': None,
        'exchange_type': 'direct',
        'queue_arguments': {'x-max-priority': 10},
        'consumer_arguments': {'x-priority': 8},
    }
    # for p, s in redis_run.zscan_iter('queue:running'):
    running = redis_run.hgetall('queue:running')
    for q, s in running.items():
        if bytes_to_str(s) == 'pause':
            continue
        logger.info('regain consuming %s %s', q, s)
        q1 = bytes_to_str(q)
        kkw = kw.copy()
        kkw['binding_key'] = q1.split(':')[3]
        sender.add_task_queue(q1, **kkw)  # pass the per-queue copy, not the shared template
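# Hedged example of the queue-name convention assumed by split(':')[3] above:
# four colon-separated segments with the page name last. The real layout is
# produced by gen_queue_name elsewhere and may differ.
assert 'xcrawl:news:daily_job:list_page'.split(':')[3] == 'list_page'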
def initialize(self):
    super(CrawlTask, self).initialize()
    # self.logger = get_task_logger(self.name)
    self.df = Dupefilter()
    # self.exchange = Exchange('xcrawl', type='direct')
    # self.HTTP_MAX_RETRIES = conf.get_http('max_retries')
    self.influx_stat = get_connection('stat')
    self.http_max_retries = xconf.get_http('max_retries')
    self.http_retry_codes = xconf.get_http('retry_codes')
def default_save_result(self, data, kwargs):
    project = kwargs['project']
    page = kwargs['page']
    _meta = {
        'project': project,
        'job': kwargs.get('job'),
        'page': page,
        'batch_id': kwargs.get('batch_id'),
        'created': kwargs.get('created', datetime.now()),
    }
    db = get_connection('result')
    col = db['%s_%s' % (project, page)]
    if isinstance(data, dict):
        data['_meta'] = _meta
        col.insert_one(data)
        return 1
    else:
        for it in data:
            it['_meta'] = _meta
        col.insert_many(data)
        return len(data)
import cgi

from wtforms import form
from flask import (request, redirect, flash)
from flask_admin import expose
from flask_admin.model import template
from flask_admin.model.helpers import get_mdict_item_or_list
from flask_admin.helpers import get_redirect_target
from flask_admin.babel import gettext
from flask_admin.contrib.pymongo import ModelView, filters
from markupsafe import Markup
# from connections import mongo_log
from yunduo.resource import get_connection
from xadmin.utils.format import date_format, map_format
from xadmin.view.base import MongoView

mongo_logs = get_connection('log')
PageDB = get_connection('page')


def message_format(view, context, model, name):
    if 'exception' in model:
        exc = model['exception']
        s = '<div data-toggle="popover" data-trigger="hover" title="%s" data-content="%s">%s</div>' \
            % (cgi.escape(exc['message'], True),
               cgi.escape('<pre>%s</pre>' % exc['stackTrace'], True),
               model[name])
        return Markup(s)
    else:
        return model[name]


class TaskView(MongoView):
from yunduo.parser.htmlextractor import Link, Extractor, ItemResult
# from conf.common import HTTP_MAX_RETRIES, HTTP_RETRY_CODE
from yunduo.conf import xconf
# from connections import redis_conf, influx_stat, mongo_page
from yunduo.utils import merge, arg_to_iter
from yunduo.code import compile, get_function, get_script
from yunduo.downloader import download, proxy as proxy_mod
# from connections import get_connection
from yunduo.resource import get_connection
from xspider.jobaction import JobAction
from xspider.app import app
# from xspider.log import get_task_logger
# from xspider.job import gen_queue_name
from .base import StrategyTask

redis_run = get_connection('run')

# 'queue', 'routing_key', 'exchange', 'priority', 'expires',
# 'serializer', 'delivery_mode', 'compression', 'time_limit',


class CrawlTask(StrategyTask):
    # define this task's own strategy
    # Strategy = 'xspider.strategy:default'
    # rate_limit = True
    # counter = True
    # store_info = True
    # custom_queue = True

    # def __init__(self):
    #     super(CrawlTask, self).__init__()
    #     self.df = Dupefilter()
    #     self._exchange = Exchange('xcrawl', type='direct')
def initialize(self):
    # super(SaveResultTask, self).initialize()
    # self.logger = get_task_logger(self.name)
    self.logger = get_task_logger(self.name, save=True)
    self.df = Dupefilter()
    self.influx_stat = get_connection('stat')
class Store(object):
    _redis = get_connection('conf')
    entries_key = 'beat_entries'
    schedule_key = 'schedule_entries'

    def __init__(self, lock_ttl=None):
        self.lock_ttl = lock_ttl
        self.lock = self._redis.lock('beat_lock', lock_ttl)

    def __getitem__(self, key):
        data = self._redis.hget(self.entries_key, key)
        if not data:
            raise KeyError()
        return deserialize_entry(json.loads(data), ScheduleEntry)

    def __setitem__(self, key, entry):
        # print('__setitem__', key, entry)
        if entry:
            self.add(entry)
        else:
            self.remove(key)
        # next_time = entry.next_time()
        # if next_time is None:
        #     score = -1
        # else:
        #     score = next_time.timestamp()
        # self._redis.zadd(self.schedule_key, entry.name, score)
        # self._redis.hset(self.entries_key, entry.name, serialize_entry(entry))

    def __iter__(self):
        if self.lock.acquire(False):
            try:
                max_score = time.time()
                keys = self._redis.zrangebyscore(self.schedule_key, 0, max_score)
                for key in keys:
                    yield self[key]
                key = self._redis.zrange(self.schedule_key, 0, 1)
                if key:
                    yield self[key[0]]
            finally:
                try:
                    self.lock.release()
                except Exception as e:
                    logger.exception('release lock')
        else:
            yield

    def get(self, key, default=None):
        try:
            return self[key]
        except KeyError:
            return default

    def update(self, key, value=None):
        if isinstance(key, dict) and value is None:
            for k in key:
                self[k] = key[k]
        else:
            self[key] = value

    @classmethod
    def add(cls, entry):
        if not isinstance(entry, ScheduleEntry):
            entry = ScheduleEntry(**entry)
        next_time = entry.next_time()
        if next_time is None:
            score = float("inf")
        else:
            score = next_time.timestamp()
        with cls._redis.pipeline() as pipe:
            pipe.zadd(cls.schedule_key, entry.name, score)
            pipe.hset(cls.entries_key, entry.name, json.dumps(serialize_entry(entry)))
            pipe.execute()

    @classmethod
    def remove(cls, entry):
        if isinstance(entry, ScheduleEntry):
            key = entry.name
        else:
            key = entry
        with cls._redis.pipeline() as pipe:
            pipe.zrem(cls.schedule_key, key)
            pipe.hdel(cls.entries_key, key)
            pipe.execute()
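# Hedged usage sketch for the Store above, assuming ScheduleEntry accepts
# name/task/schedule keyword arguments (its constructor lives elsewhere and is
# not shown here); the entry values are illustrative only.
store = Store(lock_ttl=60)
store.add({
    'name': 'crawl-news-hourly',   # hypothetical entry name
    'task': 'xspider.crawl',       # hypothetical task name
    'schedule': 3600,              # hypothetical interval in seconds
})
for entry in store:                # yields entries whose score is already due
    print(entry)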
m = self._resolve_symbol(context, 'row_actions.link')
get_url = self._resolve_symbol(context, 'get_url')
meta = row.get('meta')
if not meta or self.id_field not in row:
    return ''
kwargs = dict(self.url_args) if self.url_args else {}
kwargs[self.id_arg] = row[self.id_field]
view = context['admin_view']
url = get_url(self.endpoint, **kwargs)
return m(self, url)


page_db = get_connection('page')
log_db = get_connection('log')
task_db = get_connection('task')
log_task_coll = task_db['task_infos']
log_page_coll = page_db['page_infos']
log_log_coll = log_db['task_logs']


class LogView(MongoView):
    collection = log_log_coll
    column_list = (
        'meta.project',
        'meta.job',
        'meta.page',
def __init__(self, *args, **kwargs):
    super(StatView, self).__init__(*args, **kwargs)
    self.influx_stat = get_connection('stat')
def default(task, app, consumer, info=logger.info, error=logger.error,
            task_reserved=task_reserved, to_system_tz=timezone.to_system,
            bytes=bytes, buffer_t=buffer_t):
    """Default task execution strategy.

    Note:
        Strategies are here as an optimization, so sadly
        it's not very easy to override.
    """
    redis_run = get_connection('run')
    hostname = consumer.hostname
    connection_errors = consumer.connection_errors
    _does_info = logger.isEnabledFor(logging.INFO)

    # task event related
    # (optimized to avoid calling request.send_event)
    eventer = consumer.event_dispatcher
    events = eventer and eventer.enabled
    send_event = eventer.send
    task_sends_events = events and task.send_events

    call_at = consumer.timer.call_at
    apply_eta_task = consumer.apply_eta_task
    rate_limits_enabled = not consumer.disable_rate_limits
    get_bucket = consumer.task_buckets.__getitem__
    handle = consumer.on_task_request
    limit_task = consumer._limit_task
    body_can_be_buffer = consumer.pool.body_can_be_buffer
    # Req = create_request_cls(Request, task, consumer.pool, hostname, eventer)

    # def create_request_cls(base, task, pool, hostname, eventer,
    #                        ref=ref, revoked_tasks=revoked_tasks,
    #                        task_ready=task_ready, trace=trace_task_ret):
    #     default_time_limit = task.time_limit
    #     default_soft_time_limit = task.soft_time_limit
    #     apply_async = pool.apply_async
    #     acks_late = task.acks_late
    #     events = eventer and eventer.enabled
    #     task_ready = state.task_ready
    #     task_accepted = state.task_accepted
    task_ready = state.task_ready
    revoked_tasks = state.revoked
    default_time_limit = task.time_limit
    default_soft_time_limit = task.soft_time_limit
    apply_async = consumer.pool.apply_async
    # print '=======-----', consumer, consumer.pool, apply_async
    acks_late = task.acks_late
    events = eventer and eventer.enabled
    # == END == Request var

    controller_revoked_tasks = consumer.controller.state.revoked

    task_name = task.name
    # celery_app = task._get_app()
    # task_send_task = task.send_task
    # log_exception = task.logger.exception
    # _logger = get_task_logger(task_name, save=True)
    # _info = _logger.info
    # _error = _logger.error
    _info = task.logger.info
    _error = task.logger.error
    # task_store_info = task.store_info
    # task_rate_limit = task.rate_limit
    # task_counter = task.counter
    get_task_info = task.brief
    # task_counter_key = task.counter_key
    # task_on_all_finished = task.on_all_finished
    # taskstore = TaskStore(task_name)
    task_save = app.tasks['xspider.save_log']

    def save_task_status(type_, task_id, data):
        type_, _, subject = type_.partition('-')
        if type_ != 'task' or not data:
            return
        data[subject] = datetime.now()
        data['task_id'] = task_id
        task_save.apply_async(('task', task_name, {task_id: data}))

    # dispatcher = consumer.event_dispatcher
    # if dispatcher.groups and 'project' not in dispatcher.groups:
    #     dispatcher.groups.add('project')
    #     info('Events of group {project} enabled by local.')

    class BaseReq(Request):
        def __init__(self, *args, **kwargs):
            super(BaseReq, self).__init__(*args, **kwargs)
            self._args, self._kwargs, self._embed = self._payload

        def execute_using_pool(self, pool, **kwargs):
            task_id = self.id
            if (self.expires or task_id in revoked_tasks) and self.revoked():
                raise TaskRevokedError(task_id)

            time_limit, soft_time_limit = self.time_limits
            result = pool.apply_async(
                trace_task_ret,
                args=(self.type, task_id, self.request_dict, self.body,
                      self.content_type, self.content_encoding),
                accept_callback=self.on_accepted,
                timeout_callback=self.on_timeout,
                callback=self.on_success,
                error_callback=self.on_failure,
                soft_timeout=soft_time_limit or default_soft_time_limit,
                timeout=time_limit or default_time_limit,
                correlation_id=task_id,
            )
            # cannot create weakref to None
            # pylint: disable=attribute-defined-outside-init
            self._apply_result = maybe(ref, result)
            return result

        def on_success(self, failed__retval__runtime, **kwargs):
            failed, retval, runtime = failed__retval__runtime
            if failed:
                if isinstance(retval.exception, (SystemExit, KeyboardInterrupt)):
                    raise retval.exception
                return self.on_failure(retval, return_ok=True)
            task_ready(self)

            if acks_late:
                self.acknowledge()

            if events:
                self.send_event(
                    'task-succeeded', result=retval, runtime=runtime,
                )

        def send_event(self, type_, **fields):
            super(BaseReq, self).send_event(type_, **fields)
            if type_ == 'task-succeeded':
                try:
                    if 'result' in fields:
                        fields['result'] = json.dumps(fields['result'])
                except Exception:
                    pass
            # taskstore.save(type_, self.id, fields)
            save_task_status(type_, self.id, fields)

        def task_info(self):
            info = get_task_info(self._args, self._kwargs)
            info['task_id'] = self.id
            info['task_name'] = task_name
            info['worker'] = self.hostname
            return info

    def task_message_handler(message, body, ack, reject, callbacks,
                             to_timestamp=to_timestamp):
        # print('crawl_task_message_handler %s %s' % (task_name, repr(body)))
        body, headers, decoded, utc = (
            message.body, message.headers, False, True,
        )
        if not body_can_be_buffer:
            body = bytes(body) if isinstance(body, buffer_t) else body
        req = BaseReq(
            message,
            on_ack=ack, on_reject=reject, app=app, hostname=hostname,
            eventer=eventer, task=task, connection_errors=connection_errors,
            body=body, headers=headers, decoded=decoded, utc=utc,
        )
        # if _does_info:
        meta = req.task_info()
        taskinfo = {'meta': meta}
        _info(u'task received', extra=taskinfo)
        if (req.expires or req.id in controller_revoked_tasks) and req.revoked():
            return

        # req_args, req_kwargs, req_embed = req._payload
        if task_sends_events:
            send_event(
                'task-received',
                uuid=req.id, name=req.name,
                args=req.argsrepr, kwargs=req.kwargsrepr,
                root_id=req.root_id, parent_id=req.parent_id,
                retries=req.request_dict.get('retries', 0),
                eta=req.eta and req.eta.isoformat(),
                expires=req.expires and req.expires.isoformat(),
            )

        # save the received-task record
        # ti = get_task_info(req._args, req._kwargs)
        fields = dict(
            name=req.name,
            # project=req._project, page=req._page, url=req._url,
            kwargs=json.dumps(req._kwargs),
            # args=req_args, kwargs=req_kwargs,
            root_id=req.root_id, parent_id=req.parent_id,
            retries=req.request_dict.get('retries', 0),
            eta=req.eta and req.eta.isoformat(),
            expires=req.expires and req.expires.isoformat(),
            meta=meta)
        save_task_status('task-received', req.id, fields)

        # rate limiting
        if req._kwargs.get('__limit__'):
            try:
                key = 'rate:%s' % meta['project']
                pending = get_expected_time(key)
                # print '----Rate limit pending: %s %r' % (req.id, pending)
                if pending > 0:
                    req.eta = maybe_make_aware(datetime.utcnow() + timedelta(seconds=pending))
                    info('Rate Limit [%s.%s] %s', meta['project'], meta['page'], pending)
            except Exception:
                error('Rate limit. Task: %r', req.info(safe=True), exc_info=True)

        if req.eta:
            try:
                if req.utc:
                    eta = to_timestamp(to_system_tz(req.eta))
                else:
                    eta = to_timestamp(req.eta, timezone.local)
            except (OverflowError, ValueError):
                error("Couldn't convert ETA %r to timestamp. Task: %r",
                      req.eta, req.info(safe=True), exc_info=True)
                req.reject(requeue=False)
            else:
                consumer.qos.increment_eventually()
                call_at(eta, apply_eta_task, (req, ), priority=6)
        else:
            if rate_limits_enabled:
                bucket = get_bucket(task.name)
                if bucket:
                    return limit_task(req, bucket, 1)
            task_reserved(req)
            if callbacks:
                [callback(req) for callback in callbacks]
            handle(req)

    return task_message_handler
def initialize(self):
    self.redis_run = get_connection('run')
    self.influx_stat = get_connection('stat')
# coding=utf8
import time

import six
from hashlib import sha1
from redis.exceptions import NoScriptError

from yunduo.resource import get_connection

redis_conf = get_connection('conf')

RATE_LIMIT_SCRIPT = '''\
local key, now, token = KEYS[1], tonumber(ARGV[1]), tonumber(ARGV[2])
local timestamp, fill_rate, capacity, tokens, rhold = 0, 0, 0, 0, 0
local vals = redis.call("hgetall", key)
for i = 1, #vals, 2 do
    if vals[i] == "timestamp" then
        timestamp = tonumber(vals[i+1])
    elseif vals[i] == "fill_rate" then
        fill_rate = tonumber(vals[i+1])
    elseif vals[i] == "capacity" then
        capacity = tonumber(vals[i+1])
    elseif vals[i] == "tokens" then
        tokens = tonumber(vals[i+1])
    elseif vals[i] == "rhold" then
        rhold = tonumber(vals[i+1])
    end
end
if fill_rate == 0 then
    return 0
end
local delta = fill_rate * (now - timestamp)
rhold = rhold - delta
if rhold < 0 then
    rhold = 0
end
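# Hedged sketch of how the script above is typically invoked: register its SHA
# once, call EVALSHA, and fall back to EVAL when the script cache was flushed
# (this is what the sha1/NoScriptError imports suggest). The key name, ARGV
# layout, and the meaning of the return value are assumptions; the script body
# is truncated above.
RATE_LIMIT_SHA = sha1(RATE_LIMIT_SCRIPT.encode('utf-8')).hexdigest()

def run_rate_limit(key, now, tokens=1):
    args = (1, key, now, tokens)   # 1 = number of KEYS
    try:
        return redis_conf.evalsha(RATE_LIMIT_SHA, *args)
    except NoScriptError:
        return redis_conf.eval(RATE_LIMIT_SCRIPT, *args)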
def __init__(self):
    self.expire = xconf.get('df_expire', 1296000)
    self.redisobj = get_connection('df')
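# Hedged sketch of how the fields above could back a duplicate check; the
# method name, key layout, and use of SET NX with an expiry are assumptions,
# not taken from the original Dupefilter implementation.
def seen(self, fingerprint):
    key = 'df:%s' % fingerprint                                # hypothetical key layout
    added = self.redisobj.set(key, 1, ex=self.expire, nx=True)
    return not added                                           # None when the key already existed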
def __init__(self, project, job, batch_id):
    super(JobAction, self).__init__()
    self.project = project
    self.job = job
    self.batch_id = batch_id
    self.redis_run = get_connection('run')