def queue(cls, stage, state, data):
    """Queue a task for the given stage of the crawler's current run.

    Refuses to enqueue when the stage's backlog of pending tasks exceeds
    MAX_QUEUE_LENGTH, raising QueueTooBigError instead.

    :param stage: the pipeline stage to queue the task on.
    :param state: run context; must contain 'run_id', may contain 'crawler'.
    :param data: the task payload to enqueue.
    :raises QueueTooBigError: when the pending queue is already too long.
    """
    crawler = state.get('crawler')
    job = Job(conn, str(crawler), state['run_id'])
    job_stage = job.get_stage(stage)
    # get_status() may not report a 'pending' count; treat a missing or
    # None value as zero so the comparison below cannot raise TypeError
    # (None > int is invalid in Python 3).
    queue_length = job_stage.get_status().get('pending') or 0
    if queue_length > MAX_QUEUE_LENGTH:
        msg = "queue for %s:%s too big."
        raise QueueTooBigError(msg % (str(crawler), stage))
    job_stage.queue(payload=data, context=state)
def queue(cls, stage, state, data):
    """Queue a task for the given stage of the crawler's current run.

    Syncs the stage before checking backlog, then refuses to enqueue when
    the stage's pending count exceeds MAX_QUEUE_LENGTH, raising
    QueueTooBigError instead.

    :param stage: the pipeline stage; queued under its namespaced name.
    :param state: run context; must contain 'run_id', may contain 'crawler'.
    :param data: the task payload to enqueue.
    :raises QueueTooBigError: when the pending queue is already too long.
    """
    crawler = state.get("crawler")
    job = Job(conn, str(crawler), state["run_id"])
    job_stage = job.get_stage(stage.namespaced_name)
    job_stage.sync()
    # get_status() may not report a 'pending' count; treat a missing or
    # None value as zero so the comparison below cannot raise TypeError
    # (None > int is invalid in Python 3).
    queue_length = job_stage.get_status().get("pending") or 0
    if queue_length > MAX_QUEUE_LENGTH:
        msg = "queue for %s:%s too big."
        raise QueueTooBigError(msg % (str(crawler), stage))
    job_stage.queue(payload=data, context=state)
def handle(self, status, operation=None, exception=None, task=None, **payload):
    """Report a processing event that may be related to a task."""
    # Reporting is optional; bail out early when disabled.
    if not WORKER_REPORTING:
        return
    task = task or self.task
    if task is None:
        stage = self.stage
    else:
        payload["task"] = task.serialize()
        stage = task.stage
    dataset = stage.job.dataset.name
    job_id = stage.job.id
    now = datetime.utcnow()
    payload["dataset"] = dataset
    payload["operation"] = operation or stage.stage
    payload["job"] = job_id
    payload["status"] = status
    payload["updated_at"] = now
    # Timestamp keyed by the *reported* status (e.g. "start_at"), even if
    # an exception below overrides the status field itself.
    payload["%s_at" % status] = now
    payload["has_error"] = False
    if exception is not None:
        # An exception always wins: mark the event as errored and attach
        # the exception's name and stringified message.
        payload["status"] = Status.ERROR
        payload["has_error"] = True
        payload["error_name"] = exception.__class__.__name__
        payload["error_msg"] = stringify(exception)
    report_job = Job(stage.conn, dataset, job_id)
    report_job.get_stage(OP_REPORT).queue(payload)
def get_stage(collection, stage, job_id=None):
    """Resolve a job stage for the collection, minting a random job id
    when the caller does not supply one."""
    run_id = job_id or Job.random_id()
    return Job(kv, collection.foreign_id, run_id).get_stage(stage)
def get_stage(collection, stage, job_id=None):
    """Resolve a job stage for the collection's dataset, minting a random
    job id when the caller does not supply one."""
    run_id = job_id or Job.random_id()
    dataset = dataset_from_collection(collection)
    return Job(kv, dataset, run_id).get_stage(stage)