Пример #1
0
def _enqueue_message(redis, logger, msg, message_key, message_hash_key,
                     queue_name):
    timestamp = to_unix(datetime.utcnow())
    pipe = redis.pipeline()
    pipe.set(message_key, msg.serialize())
    pipe.zadd(message_hash_key, {msg.id: timestamp})
    pipe.lpush(queue_name, msg.id)

    if msg.cron_str is not None:
        next_dt = get_next_cron_timestamp(msg.cron_str)
        timestamp = to_unix(next_dt)

        _enqueue_at_timestamp(
            pipe,
            logger,
            msg.queue,
            Queue.get_message_name,
            timestamp,
            msg.category,
            msg.cron_str,
            *msg.args,
            **msg.kwargs,
        )

    return pipe.execute()
Пример #2
0
    def enqueue_in(self, incr, category, *args, **kw):
        start_in = parse_time(incr)
        future_date = datetime.now(tz=timezone.utc) + start_in
        timestamp = to_unix(future_date)

        return self.__enqueue_at_timestamp(timestamp, category, None, *args,
                                           **kw)
Пример #3
0
def test_move_jobs(client):
    """Test moving jobs from scheduled to enqueued"""

    redis = client.application.redis
    jobs_queue = client.application.jobs_queue
    enqueued_ids = []

    for offset in range(-5, 6):
        if offset == 0:
            continue
        timestamp = to_unix(datetime.utcnow()) + offset * 2
        enqueued_ids.append(
            jobs_queue.enqueue_at(
                timestamp,
                Categories.Notify,
                f"something {offset}",
                other=f"öther {offset}",
            ))

        expect(redis.exists(Queue.get_message_name(
            enqueued_ids[-1]))).to_be_true()

    expect(redis.zcard(Queue.SCHEDULED_QUEUE_NAME)).to_equal(10)

    moved = client.application.queue_group.move_jobs()
    expect(moved).to_length(5)

    expect(redis.llen(jobs_queue.queue_name)).to_equal(5)
    expect(redis.zcard(Queue.SCHEDULED_QUEUE_NAME)).to_equal(5)

    for enqueued_id in enqueued_ids:
        expect(redis.exists(Queue.get_message_name(enqueued_id))).to_be_true()
Пример #4
0
def validate_expiration(job, ex, logger):
    now = datetime.utcnow()
    unixtime = to_unix(now)

    if (
        job.metadata.get("expiration") is not None
        and job.metadata["expiration"] < unixtime
    ):
        expiration_utc = from_unix(job.metadata["expiration"])
        ex.status = JobExecution.Status.expired
        ex.error = (
            f"Job was supposed to be done before {expiration_utc.isoformat()}, "
            f"but was started at {from_unix(unixtime).isoformat()}."
        )
        ex.finished_at = now
        job.save()
        logger.info(
            "Job execution canceled due to being expired.",
            job_expiration=job.metadata["expiration"],
            current_ts=unixtime,
        )

        return False

    return True
Пример #5
0
def reenqueue_job_due_to_break(task_id, job_id, execution_id, image, command):
    args = [task_id, job_id, execution_id, image, command]
    delta = timedelta(seconds=1.0)

    future_date = to_unix(datetime.utcnow() + delta)
    enqueued = current_app.jobs_queue.enqueue_at(future_date, Categories.Job, *args)

    return enqueued
Пример #6
0
    def enqueue_message(self, message):
        timestamp = to_unix(datetime.utcnow())

        pipe = self.redis.pipeline()
        pipe.set(message.message_key(), message.serialize())
        pipe.zadd(message.message_hash_key(), {message.id: timestamp})
        pipe.lpush(message.queue, message.id)

        if message.cron_str:
            next_dt = get_next_cron_timestamp(message.cron_str)
            timestamp = to_unix(next_dt)
            msg_schedule = Message(message.queue, message.category,
                                   message.cron_str, *message.args,
                                   **message.kwargs)

            self._pipe_at_timestamp(pipe, msg_schedule, timestamp)

        return pipe.execute()
Пример #7
0
    def move_jobs(self):
        logger = self.logger.bind(operation="move_jobs")

        lock = self.redis.lock(
            Queue.MOVE_JOBS_LOCK_NAME,
            timeout=5,
            sleep=0.1,
            blocking_timeout=500,
            thread_local=False,
        )

        if not lock.acquire():
            logger.info(
                "Lock could not be acquired. Trying to move jobs again later.")

            return None

        moved_items = []

        try:
            timestamp = to_unix(datetime.utcnow())

            pipe = self.redis.pipeline()
            pipe.zrangebyscore(Queue.SCHEDULED_QUEUE_NAME, "-inf", timestamp)
            pipe.zremrangebyscore(Queue.SCHEDULED_QUEUE_NAME, "-inf",
                                  timestamp)
            items, _ = pipe.execute()

            if not items:
                logger.info("No jobs in need of moving right now.")

                return moved_items

            logger.info("Found jobs in need of moving.", job_count=len(items))

            for message_id in items:
                key = Queue.get_message_name(message_id.decode("utf-8"))
                data = self.redis.get(key)
                msg = Message.deserialize(data.decode("utf-8"))
                logger.info(
                    "Moving job to queue.",
                    queue_name=msg.queue,
                    job_id=msg.id,
                    msg=msg.serialize(),
                )
                queue_hash_name = f"{Queue.QUEUE_HASH_NAME}:{msg.queue}"
                _enqueue_message(self.redis, self.logger, msg, key,
                                 queue_hash_name, msg.queue)
                moved_items.append(msg)

            logger.info("Moved jobs successfully.")

            return moved_items
        finally:
            lock.release()
Пример #8
0
def test_schedule3(client):
    """Test Scheduling a job with cron"""

    queue = client.application.jobs_queue

    cron_str = "* * * * *"
    timestamp = to_unix(get_next_cron_timestamp(cron_str))

    enqueued_id = queue.enqueue_cron(cron_str,
                                     Categories.Monitor,
                                     "something",
                                     other="öther")

    redis = client.application.redis
    expect(redis.llen(queue.queue_name)).to_equal(0)

    expect(redis.zcard(Queue.SCHEDULED_QUEUE_NAME)).to_equal(1)

    items = redis.zrange(Queue.SCHEDULED_QUEUE_NAME, 0, -1, withscores=True)
    expect(items).to_length(1)

    message_id, actual_ts = items[0]
    expect(actual_ts).to_equal(timestamp)
    expect(message_id).to_equal(enqueued_id)

    message_key = queue.get_message_name(enqueued_id)
    expect(redis.exists(message_key)).to_be_true()

    data = redis.get(message_key)

    job = Message.deserialize(data)
    job = Message.deserialize(data)

    expect(job.id).to_equal(enqueued_id)
    expect(job.category).to_equal(Categories.Monitor)
    expect(job.args).to_length(1)
    expect(job.args[0]).to_equal("something")
    expect(job.kwargs).to_include("other")
    expect(job.kwargs["other"]).to_equal("öther")
Пример #9
0
def test_schedule2(client):
    """Test Scheduling a job in a few seconds"""

    queue = client.application.jobs_queue

    timestamp = to_unix(datetime.utcnow()) + 5

    enqueued_id = queue.enqueue_in("5s",
                                   Categories.Monitor,
                                   "something",
                                   other="öther")

    redis = client.application.redis
    expect(redis.llen(queue.queue_name)).to_equal(0)

    expect(redis.zcard(Queue.SCHEDULED_QUEUE_NAME)).to_equal(1)

    items = redis.zrange(Queue.SCHEDULED_QUEUE_NAME, 0, -1, withscores=True)
    expect(items).to_length(1)

    message_id, actual_ts = items[0]
    expect(actual_ts).to_equal(timestamp)
    expect(message_id).to_equal(enqueued_id)

    message_key = Message.generate_message_key(enqueued_id)
    expect(redis.exists(message_key)).to_be_true()

    data = redis.get(message_key)

    job = Message.deserialize(data)

    expect(job.id).to_equal(enqueued_id)
    expect(job.category).to_equal(Categories.Monitor)
    expect(job.args).to_length(1)
    expect(job.args[0]).to_equal("something")
    expect(job.kwargs).to_include("other")
    expect(job.kwargs["other"]).to_equal("öther")
Пример #10
0
    def enqueue_cron(self, cron, category, *args, **kw):
        next_dt = get_next_cron_timestamp(cron)
        timestamp = to_unix(next_dt)

        return self.__enqueue_at_timestamp(timestamp, category, cron, *args,
                                           **kw)
Пример #11
0
def send_webhook(
    task_id, job_id, execution_id, method, url, headers, retries, retry_count
):
    app = current_app

    job = Job.get_by_id(task_id, job_id)
    logger = app.logger.bind(
        operation="send_webhook",
        task_id=task_id,
        job_id=job_id,
        execution_id=execution_id,
        method=method,
        url=url,
        headers=headers,
        retries=retries,
        retry_count=retry_count,
    )

    if job is None:
        logger.error("Failed to retrieve task or job.")

        return False

    execution = job.get_execution_by_id(execution_id)
    logger.info("Execution loaded successfully")

    data = execution.to_dict(include_log=True, include_error=True)
    data = loads(dumps(data))
    if "webhookDispatch" in data["metadata"]:
        del data["metadata"]["webhookDispatch"]
    data["metadata"]["custom"] = job.metadata.get("custom", {})
    data["job_id"] = job_id
    data = dumps(data)
    try:
        dispatcher = WebhooksDispatcher()
        response = dispatcher.dispatch(method, url, data, headers)

        execution.metadata.setdefault("webhookDispatch", [])
        execution.metadata["webhookDispatch"].append(
            {
                "timestamp": datetime.utcnow().isoformat(),
                "url": url,
                "statusCode": response.status_code,
                "body": response.body,
                "headers": response.headers,
            }
        )
        execution.save()
        job.save()
        logger.info("Webhook dispatched successfully.")
    except WebhooksDispatchError as err:
        error = traceback.format_exc()
        execution.metadata.setdefault("webhookDispatch", [])
        execution.metadata["webhookDispatch"].append(
            {
                "timestamp": datetime.utcnow().isoformat(),
                "url": url,
                "statusCode": err.status_code,
                "body": err.body,
                "headers": err.headers,
                "error": error,
            }
        )
        execution.metadata["webhookDispatch"] = execution.metadata["webhookDispatch"][
            -3:
        ]
        execution.save()
        job.save()

        logger.error("Failed to dispatch webhook.", err=error)
        if retry_count < retries:
            logger.debug("Retrying...")
            args = [
                task_id,
                job_id,
                execution_id,
                method,
                url,
                headers,
                retries,
                retry_count + 1,
            ]

            factor = app.config["WEBHOOKS_EXPONENTIAL_BACKOFF_FACTOR"]
            min_backoff = app.config["WEBHOOKS_EXPONENTIAL_BACKOFF_MIN_MS"] / 1000.0
            delta = to_unix(
                datetime.utcnow()
                + timedelta(seconds=math.pow(factor, retry_count) * min_backoff)
            )
            current_app.webhooks_queue.enqueue_at(delta, Categories.Webhook, *args)
            logger.info("Webhook dispatch retry scheduled.", date=delta)

    return True
Пример #12
0
def monitor_job(task_id, job_id, execution_id):
    try:
        app = current_app
        executor = app.executor

        job = Job.get_by_id(task_id, job_id)
        logger = app.logger.bind(
            operation="monitor_job",
            task_id=task_id,
            job_id=job_id,
            execution_id=execution_id,
        )

        if job is None:
            logger.error("Failed to retrieve task or job.")

            return False

        execution = job.get_execution_by_id(execution_id)

        if execution.status not in (JobExecution.Status.running,):
            logger.error("Execution result already retrieved. Skipping monitoring...")

            return False

        try:
            result = executor.get_result(job.task, job, execution)
        except HostUnavailableError as err:
            error = traceback.format_exc()
            logger.error("Failed to get results.", error=error)
            current_app.report_error(
                err,
                metadata=dict(
                    operation="Monitoring Job",
                    task_id=task_id,
                    job_id=job_id,
                    execution_id=execution_id,
                ),
            )

            reenqueue_monitor_due_to_break(task_id, job_id, execution_id)

            logger.warn("Job monitor re-enqueued successfully.")

            return False

        if result is None:
            execution.finished_at = datetime.utcnow()
            execution.exit_code = result.exit_code
            execution.status = JobExecution.Status.failed
            execution.log = ""
            execution.error = (
                "Job failed since container could not be found in docker host."
            )

            logger.debug(
                "Job failed, since container could not be found in host.",
                status="failed",
            )
            execution.save()
            job.save()

            send_webhooks(job.task, job, execution, logger)
            notify_users(job.task, job, execution, logger)

            return False

        logger.info(
            "Container result obtained.",
            container_status=result.status,
            container_exit_code=result.exit_code,
        )

        if result.status in (
            ExecutionResult.Status.created,
            ExecutionResult.Status.running,
        ):
            ellapsed = (datetime.utcnow() - execution.started_at).total_seconds()

            if ellapsed > job.metadata["timeout"]:
                execution.finished_at = datetime.utcnow()
                execution.status = JobExecution.Status.timedout
                execution.error = "Job execution timed out after %d seconds." % ellapsed

                try:
                    executor.stop_job(job.task, job, execution)
                except HostUnavailableError as err:
                    error = traceback.format_exc()
                    logger.error("Failed to stop job.", error=error)
                    current_app.report_error(
                        err,
                        metadata=dict(
                            operation="Monitoring Job",
                            task_id=task_id,
                            job_id=job_id,
                            execution_id=execution_id,
                        ),
                    )

                    reenqueue_monitor_due_to_break(task_id, job_id, execution_id)

                    logger.warn("Job monitor re-enqueued successfully.")

                    return False

                logger.debug(
                    "Job execution timed out. Storing job details in mongo db.",
                    status=execution.status,
                    ellapsed=ellapsed,
                    error=result.error,
                )
                execution.save()
                job.save()
                logger.info("Job execution timed out.", status=execution.status)

                send_webhooks(job.task, job, execution, logger)
                notify_users(job.task, job, execution, logger)

                return False

            logger.info(
                "Job has not finished. Retrying monitoring in the future.",
                container_status=result.status,
                seconds=1,
            )

            current_app.monitor_queue.enqueue_in(
                "5s", Categories.Monitor, task_id, job_id, execution_id
            )

            return True

        if (
            result.exit_code != 0
            and "retry_count" in job.metadata
            and job.metadata["retry_count"] < job.metadata["retries"]
        ):
            retry_logger = logger.bind(
                exit_code=result.exit_code,
                retry_count=job.metadata["retry_count"],
                retries=job.metadata["retries"],
            )
            retry_logger.debug("Job failed. Enqueuing job retry...")
            job.metadata["retry_count"] += 1

            new_exec = job.create_execution(execution.image, execution.command)
            new_exec.status = JobExecution.Status.enqueued

            args = [
                task_id,
                job_id,
                new_exec.execution_id,
                execution.image,
                execution.command,
            ]
            factor = app.config["EXPONENTIAL_BACKOFF_FACTOR"]
            min_backoff = app.config["EXPONENTIAL_BACKOFF_MIN_MS"] / 1000.0
            delta = timedelta(seconds=min_backoff)

            if job.metadata["retries"] > 0:
                delta = timedelta(
                    seconds=math.pow(factor, job.metadata["retry_count"]) * min_backoff
                )
            future_date = datetime.utcnow() + delta
            enqueued_id = current_app.jobs_queue.enqueue_at(
                to_unix(future_date), Categories.Job, *args
            )

            job.metadata["enqueued_id"] = enqueued_id
            job.save()

            retry_logger.info("Job execution enqueued successfully.")

            # still need to finish current execution as the retry
            # will be a new execution

        execution.finished_at = datetime.utcnow()
        execution.exit_code = result.exit_code
        execution.status = (
            JobExecution.Status.done

            if execution.exit_code == 0
            else JobExecution.Status.failed
        )
        execution.log = result.log.decode("utf-8")
        execution.error = result.error.decode("utf-8")

        logger.debug(
            "Job finished. Storing job details in mongo db.",
            status=execution.status,
            log=result.log,
            error=result.error,
        )
        execution.save()
        job.save()
        logger.info("Job details stored in mongo db.", status=execution.status)

        try:
            executor.mark_as_done(job.task, job, execution)
        except HostUnavailableError:
            error = traceback.format_exc()
            logger.error("Failed to mark job as done.", error=error)
            reenqueue_monitor_due_to_break(task_id, job_id, execution_id)

            logger.warn("Job monitor re-enqueued successfully.")

            return False

        send_webhooks(job.task, job, execution, logger)
        notify_users(job.task, job, execution, logger)

        return True
    except Exception as err:
        error = traceback.format_exc()
        logger.error("Failed to monitor job", error=error)
        current_app.report_error(
            err,
            metadata=dict(
                operation="Monitoring Job",
                task_id=task_id,
                job_id=job_id,
                execution_id=execution_id,
            ),
        )

        raise err