def execute(cls, stage, state, data, next_allowed_exec_time=None):
    """Execute the operation, rate limiting allowing.

    Args:
        stage: Stage identifier handed to ``Context.from_state``.
        state: State handed to ``Context.from_state`` to rebuild the context.
        data: Payload passed on to ``context.execute``.
        next_allowed_exec_time: Optional datetime before which the task must
            not run. It is compared against ``datetime.utcnow()``.
            NOTE(review): presumably a naive UTC datetime -- confirm.

    Any exception raised while running the task is logged and swallowed.
    The pending-task bookkeeping in the ``finally`` clause runs whenever a
    context could be built.
    """
    # Bind the name up front: if Context.from_state() raises, the cleanup
    # below must not fail with UnboundLocalError on top of the real error.
    context = None
    try:
        context = Context.from_state(state, stage)
        now = datetime.utcnow()
        if next_allowed_exec_time and now < next_allowed_exec_time:
            # Task not allowed to run yet; put it back in the queue.
            Queue.queue(stage, state, data, delay=next_allowed_exec_time)
        elif context.crawler.disabled:
            # Disabled crawlers drop the task, but still fall through to
            # the pending-count bookkeeping below.
            pass
        elif context.stage.rate_limit:
            try:
                with rate_limiter(context):
                    context.execute(data)
            except RateLimitException:
                # Back off for a random number of whole seconds between 1
                # and max(1, 1 / rate_limit), then re-queue the task.
                delay = max(1, 1.0 / context.stage.rate_limit)
                delay = random.randint(1, int(delay))
                context.log.info("Rate limit exceeded, delaying %d sec.", delay)
                Queue.queue(stage, state, data, delay=delay)
        else:
            context.execute(data)
    except Exception:
        log.exception("Task failed to execute:")
    finally:
        # Without a context there is no crawler to update.
        if context is not None:
            # Decrease the pending task count after executing a task.
            Queue.decr_pending(context.crawler)
            # If there are no more tasks to execute, it is time to clean up.
            if not context.crawler.is_running:
                context.crawler.aggregate(context)
def execute(cls, stage, state, data, next_allowed_exec_time=None):
    """Execute the operation, rate limiting allowing.

    Args:
        stage: Stage identifier handed to ``Context.from_state``.
        state: State handed to ``Context.from_state`` to rebuild the context.
        data: Payload passed on to ``context.execute``.
        next_allowed_exec_time: Optional datetime before which the task must
            not run. It is compared against ``datetime.utcnow()``.
            NOTE(review): presumably a naive UTC datetime -- confirm.

    The task is executed at most once per call. When the rate limiter
    rejects it, the task is re-queued with a delay and NOT executed.
    """
    now = datetime.utcnow()
    if next_allowed_exec_time and now < next_allowed_exec_time:
        # Task not allowed to run yet; put it back in the queue.
        Queue.queue(stage, state, data, delay=next_allowed_exec_time)
        return

    context = Context.from_state(state, stage)
    if context.crawler.disabled:
        return

    if not context.stage.rate_limit:
        # No rate limit configured for this stage: run straight away.
        context.execute(data)
        return

    try:
        with rate_limiter(context):
            context.execute(data)
    except RateLimitException:
        # Back off for a random number of whole seconds between 1 and
        # max(1, 1 / rate_limit) and re-queue. Do not fall through to an
        # unthrottled execute: the queued copy will run the task later.
        delay = max(1, 1.0 / context.stage.rate_limit)
        delay = random.randint(1, int(delay))
        context.log.info("Rate limit exceeded, delaying %d sec.", delay)
        Queue.queue(stage, state, data, delay=delay)
def after_task(self, task):
    """Run post-task housekeeping.

    When the task's job reports that it is done, rebuild the context from
    the task and let the crawler aggregate it. Afterwards run the timeout
    expiration check.
    """
    job_finished = task.job.is_done()
    if job_finished:
        stage_name = CrawlerStage.detach_namespace(task.stage.stage)
        run_state = task.context
        ctx = Context.from_state(run_state, stage_name)
        ctx.crawler.aggregate(ctx)
    # NOTE(review): the original indentation was lost; the timeout check is
    # assumed to run after every task, not only when the job is done.
    self.timeout_expiration_check()
def handle(self, task):
    """Apply the task context, rebuild the Context and execute the payload."""
    apply_task_context(task)
    payload = task.payload
    # The stage name stored on the task is passed through detach_namespace
    # before it is used to rebuild the context.
    stage_name = CrawlerStage.detach_namespace(task.stage.stage)
    run_state = task.context
    Context.from_state(run_state, stage_name).execute(payload)
def handle(self, task):
    """Rebuild the Context for a task and execute its payload.

    Nothing is executed when the context's crawler is disabled.
    """
    payload = task.payload
    stage_name = task.stage.stage
    run_state = task.context
    ctx = Context.from_state(run_state, stage_name)
    if not ctx.crawler.disabled:
        ctx.execute(payload)
def test_dump_load_state(self, context, crawler, stage):
    """A context dumped with dump_state() can be rebuilt with from_state()."""
    serialized = context.dump_state()
    restored = Context.from_state(serialized, stage.name)

    assert isinstance(restored, Context)
    assert restored.run_id == context.run_id
    assert restored.crawler.name == crawler.name
    assert restored.stage.name == stage.name
    # Every entry of the original state must be present after the round trip.
    for entry in context.state.items():
        assert entry in restored.state.items()
def context():
    """Return a Context built from crawler() and stage() with sample state."""
    fake_context = Context(
        crawler(),
        stage(),
        {"foo": "bar"},
    )
    # Give it a made-up operation id so the DB does not complain about a
    # NOT NULL constraint while saving events etc.
    fake_operation_id = randint(1, 99999)
    fake_context.operation_id = fake_operation_id
    return fake_context
def get_context():
    """Return a Context with sample state and a freshly generated run id."""
    fresh_context = Context(
        get_crawler(),
        get_stage(),
        {"foo": "bar"},
    )
    # The run id is a random UUID4 rendered as a string.
    generated_id = uuid.uuid4()
    fresh_context.run_id = str(generated_id)
    return fresh_context
def after_task(self, task):
    """Aggregate the crawler once the task's job reports it is done."""
    if not task.job.is_done():
        return
    stage_name = task.stage.stage
    run_state = task.context
    ctx = Context.from_state(run_state, stage_name)
    ctx.crawler.aggregate(ctx)
def handle(self, task):
    """Rebuild the Context for a task and execute its payload."""
    payload = task.payload
    stage_name = task.stage.stage
    run_state = task.context
    Context.from_state(run_state, stage_name).execute(payload)