def write(self, preferred_queue, body):
    """
    Deliver an inbound message to an idle worker of this pool.

    If the message carries a ``guid`` it is installed via ``GuidMiddleware``
    first.  A cluster heartbeat message triggers ``self.cleanup()``, and the
    pool is scaled up through ``self.up()`` whenever ``self.should_grow`` is
    truthy.  The message is then given to a randomly chosen non-busy worker;
    when every worker is busy the parent class' ``write`` is used instead.

    Any exception raised along the way is logged (not re-raised) after
    unusable/obsolete database connections have been closed.

    :param preferred_queue: only forwarded to the parent ``write`` fallback.
    :param body: the inbound message.
    :return: the parent ``write`` result when falling back, otherwise None.
    """
    if 'guid' in body:
        GuidMiddleware.set_guid(body['guid'])
    try:
        # the cluster heartbeat doubles as the signal to clean up internally
        is_heartbeat = (
            isinstance(body, dict)
            and 'cluster_node_heartbeat' in body['task']
        )
        if is_heartbeat:
            self.cleanup()
        if self.should_grow:
            self.up()
        # "preferred queue" round robin distribution is irrelevant here:
        # shuffle a copy of the workers and claim the first idle one
        candidates = self.workers[:]
        random.shuffle(candidates)
        idle_worker = next(
            (candidate for candidate in candidates if not candidate.busy),
            None,
        )
        if idle_worker is None:
            return super(AutoscalePool, self).write(preferred_queue, body)
        idle_worker.put(body)
    except Exception:
        # A hiccup in a database connection should not stick around; drop
        # broken connections so that new ones get established
        for db_conn in connections.all():
            db_conn.close_if_unusable_or_obsolete()
        logger.exception('failed to write inbound message')
def delete_guid(sender: Optional[dict], **kwargs: dict) -> None:
    """
    Drop the stored GUID once a request has finished.

    Intended as the receiver of the `request_finished` signal; removing the
    request's _guid reference at that point prevents memory leaks.

    :param sender: The sender of the signal. Unused, but a receiver has to
        accept it.
    :param kwargs: Unused. `request_finished` currently sends no extra
        arguments, yet Django raises an error for receivers that do not
        accept them, since arguments may be added to a signal at any time.
    :return: None
    """
    logger.debug('Received signal `request_finished`')
    GuidMiddleware.delete_guid()
def apply_async(cls, args=None, kwargs=None, queue=None, uuid=None, **kw):
    """
    Build the message for this task and publish it to its queue.

    :param args: positional arguments for the task (defaults to ``[]``).
    :param kwargs: keyword arguments for the task (defaults to ``{}``).
    :param queue: queue name, or a callable returning one; falls back to
        ``cls.queue`` when not given.
    :param uuid: task id to use; a fresh ``uuid4`` string is generated when
        omitted.
    :param kw: extra keys merged into the message (they may override the
        standard ones).
    :return: ``(message, queue)`` tuple.
    :raises ValueError: when no queue could be determined.
    """
    task_id = uuid if uuid else str(uuid4())
    if not args:
        args = []
    if not kwargs:
        kwargs = {}
    if not queue:
        queue = getattr(cls.queue, 'im_func', cls.queue)
    if not queue:
        msg = f'{cls.name}: Queue value required and may not be None'
        logger.error(msg)
        raise ValueError(msg)

    payload = dict(uuid=task_id, args=args, kwargs=kwargs, task=cls.name)
    # propagate the current GUID (if any) along with the message
    current_guid = GuidMiddleware.get_guid()
    if current_guid:
        payload['guid'] = current_guid
    payload.update(**kw)

    destination = queue() if callable(queue) else queue
    if not settings.IS_TESTING(sys.argv):
        with pg_bus_conn() as conn:
            conn.notify(destination, json.dumps(payload))
    return (payload, destination)
def run_callable(self, body):
    '''
    Resolve the Python callable named by a message and invoke it.

    ``body['task']`` names the callable; ``args`` and ``kwargs`` (optional)
    are passed through to it.  A ``guid`` key, when present, is removed from
    the message and installed via ``GuidMiddleware`` before the call.
    Returns whatever the callable returns.
    '''
    task_name = body['task']
    task_uuid = body.get('uuid', '<unknown>')
    positional = body.get('args', [])
    keyword = body.get('kwargs', {})
    if 'guid' in body:
        GuidMiddleware.set_guid(body.pop('guid'))

    resolved = TaskWorker.resolve_callable(task_name)
    # a class (e.g., RunJob) is instantiated and its `run()` method is what
    # actually gets called
    target = resolved().run if inspect.isclass(resolved) else resolved

    # kwargs are deliberately left out of the log line, they often contain
    # launch-time secrets
    logger.debug(
        'task {} starting {}(*{})'.format(task_uuid, task_name, positional))
    return target(*positional, **keyword)
def _enqueue_with_reservation(
    func, resources, args=None, kwargs=None, options=None, task_group=None
):
    """
    Create a waiting Task with its resource reservations and queue its dispatch.

    A Task row (state WAITING) plus one reservation record per resource are
    created in a single transaction, then a ``_queue_reserved_task`` job is
    put on the "resource-manager" queue.  If Redis is unreachable the Task is
    marked failed; a Job handle is returned either way.

    Args:
        func (callable): The function the task will eventually run.
        resources (iterable): Resource URLs (str) and/or Model instances.
        args (tuple): The positional arguments to pass on to the task.
        kwargs (dict): The keyword arguments to pass on to the task.
        options (dict): The options to be passed on to the task.
        task_group: Value stored as the created Task's ``task_group``.

    Returns:
        A Job built from the new task id and the Redis connection.

    Raises:
        ValueError: When a resource is neither a str nor a Model.
    """
    args = args or tuple()
    kwargs = kwargs or dict()
    options = options or dict()

    def resource_url(candidate):
        # strings are taken as-is, model instances are turned into their URL
        if isinstance(candidate, str):
            return candidate
        if not isinstance(candidate, Model):
            raise ValueError(_("Must be (str|Model)"))
        return util.get_url(candidate)

    resources = {resource_url(candidate) for candidate in resources}
    inner_task_id = str(uuid.uuid4())
    resource_task_id = str(uuid.uuid4())
    redis_conn = connection.get_redis_connection()
    current_job = get_current_job(connection=redis_conn)

    # Results are discarded; presumably the encoder reports values that are
    # not JSON serializable (NonJSONWarningEncoder is defined elsewhere).
    json.dumps(args, cls=NonJSONWarningEncoder)
    json.dumps(kwargs, cls=NonJSONWarningEncoder)

    # the spawned task's parent is the currently running task, whose ID is
    # the same as the rq Job ID
    parent_kwarg = (
        {"parent_task": Task.objects.get(pk=current_job.id)} if current_job else {}
    )

    with transaction.atomic():
        task = Task.objects.create(
            pk=inner_task_id,
            _resource_job_id=resource_task_id,
            state=TASK_STATES.WAITING,
            logging_cid=(GuidMiddleware.get_guid() or ""),
            task_group=task_group,
            name=f"{func.__module__}.{func.__name__}",
            **parent_kwarg,
        )
        for resource in resources:
            reservation_record, _created = ReservedResourceRecord.objects.get_or_create(
                resource=resource
            )
            TaskReservedResourceRecord.objects.create(resource=reservation_record, task=task)

    task_args = (func, inner_task_id, list(resources), args, kwargs, options)
    try:
        Queue("resource-manager", connection=redis_conn).enqueue(
            _queue_reserved_task,
            job_id=resource_task_id,
            args=task_args,
            job_timeout=TASK_TIMEOUT,
        )
    except RedisConnectionError as redis_error:
        task.set_failed(redis_error, None)

    return Job(id=inner_task_id, connection=redis_conn)
def __init__(self, model=None):
    """
    Set up the initial per-instance state.

    :param model: stored unchanged as ``self.model``.
    """
    # plain bookkeeping state
    self.model = model
    self.parent_workflow_job_id = None
    self.job_created = None
    self.event_ct = 0
    self.host_map = {}
    self.safe_env = {}

    # GUID that is current at construction time
    self.guid = GuidMiddleware.get_guid()
    # bounded history; its length comes from MAX_WEBSOCKET_EVENT_RATE
    self.recent_event_timings = deque(
        maxlen=settings.MAX_WEBSOCKET_EVENT_RATE)
    self.dispatcher = CallbackQueueDispatcher()
def run():
    """
    Periodic-beat loop: run pending work until the parent process goes away.

    Each iteration closes unusable/obsolete database connections, installs a
    freshly generated GUID, calls ``self.run_pending()`` and then sleeps for
    ``idle_seconds``.  Errors raised by an iteration are logged and the loop
    carries on.

    NOTE(review): ``self`` and ``idle_seconds`` are not parameters of this
    function; presumably it is a closure defined inside a method that
    provides both - confirm against the enclosing scope.

    :raises SystemExit: when the parent PID changes, i.e. this process has
        been orphaned.
    """
    ppid = os.getppid()
    # Logger.warn() is a deprecated alias; warning() is the supported name
    logger.warning('periodic beat started')
    while True:
        if os.getppid() != ppid:
            # if the parent PID changes, this process has been orphaned
            # via e.g., segfault or sigkill, we should exit too
            pid = os.getpid()
            logger.warning(f'periodic beat exiting gracefully pid:{pid}')
            raise SystemExit()
        try:
            for conn in connections.all():
                # If the database connection has a hiccup, re-establish a new
                # connection
                conn.close_if_unusable_or_obsolete()
            # every scheduling pass gets its own GUID
            GuidMiddleware.set_guid(GuidMiddleware._generate_guid())
            self.run_pending()
        except Exception:
            # best effort: log and keep the beat alive
            logger.exception(
                'encountered an error while scheduling periodic tasks')
        time.sleep(idle_seconds)
def filter(self, record: LogRecord) -> bool:
    """
    Attach the current GUID to a log record and let the record through.

    The record is modified in-place: its ``correlation_id`` attribute is set
    to whatever ``GuidMiddleware.get_guid()`` returns.  No record is ever
    rejected by this filter.

    :param record: Log record
    :return: True
    """
    current_guid = GuidMiddleware.get_guid()
    record.correlation_id = current_guid
    return True
def perform_job(self, job, queue):
    """
    Mark the job's Task as running, prime the logging context, then run the job.

    Called by the worker's work horse thread (the forked child) just before the task begins
    executing. When no :class:`pulpcore.app.models.Task` matches the job id, the bookkeeping
    is skipped and the job is performed as-is.

    Args:
        job (rq.job.Job): The job to perform
        queue (rq.queue.Queue): The Queue associated with the job

    Returns:
        Whatever the parent class' ``perform_job`` returns.
    """
    try:
        task = Task.objects.get(pk=job.get_id())
    except Task.DoesNotExist:
        task = None

    if task is not None:
        task.set_running()
        # the first user holding permissions on the task becomes the current user
        _set_current_user(get_users_with_perms(task).first())
        GuidMiddleware.set_guid(task.logging_cid)

    with TaskWorkingDirectory(job):
        return super().perform_job(job, queue)
def perform_work(self, body):
    """
    Process one callback message: buffer the event it describes and flush.

    * ``event == 'FLUSH'``: nothing is buffered, ``self.last_event`` is
      cleared and ``self.flush`` is called with ``force=True``.
    * ``event == 'EOF'``: nothing is persisted; a ``jobs-summary`` channel
      notification is emitted, success/failure notifications are scheduled
      for unified jobs that are not ``Job`` instances, the
      ``callback_receiver_events_in_memory`` metric is decremented, the
      GUID is reset to ``''`` and the method returns.
    * anything else: an event object is created from the message, appended
      to ``self.buff`` and the buffer is flushed.

    Flushing is retried on connection-level database errors, at most
    ``self.MAX_RETRIES`` times, sleeping ``60 * retries`` seconds in between
    (so the first retry is immediate).  Any other exception is logged here
    and not propagated.

    :param body: the message (dict); note that ``skip_websocket_message``
        is popped from it.
    :return: None
    """
    try:
        flush = body.get('event') == 'FLUSH'
        if flush:
            self.last_event = ''
        if not flush:
            # the first of these keys found in the message decides both the
            # event class and the job identifier
            event_map = {
                'job_id': JobEvent,
                'ad_hoc_command_id': AdHocCommandEvent,
                'project_update_id': ProjectUpdateEvent,
                'inventory_update_id': InventoryUpdateEvent,
                'system_job_id': SystemJobEvent,
            }

            job_identifier = 'unknown job'
            # NOTE(review): when none of the keys is present the loop runs to
            # completion and `cls` stays bound to the last class in
            # `event_map` - confirm that this fallback is intended.
            for key, cls in event_map.items():
                if key in body:
                    job_identifier = body[key]
                    break

            # remembered on the instance; nothing in this method reads it back
            self.last_event = f'\n\t- {cls.__name__} for #{job_identifier} ({body.get("event", "")} {body.get("uuid", "")})'  # noqa

            if body.get('event') == 'EOF':
                try:
                    if 'guid' in body:
                        GuidMiddleware.set_guid(body['guid'])
                    final_counter = body.get('final_counter', 0)
                    logger.info(
                        'Event processing is finished for Job {}, sending notifications'
                        .format(job_identifier))
                    # EOF events are sent when stdout for the running task is
                    # closed. don't actually persist them to the database; we
                    # just use them to report `summary` websocket events as an
                    # approximation for when a job is "done"
                    emit_channel_notification(
                        'jobs-summary',
                        dict(group_name='jobs',
                             unified_job_id=job_identifier,
                             final_counter=final_counter))
                    # Additionally, when we've processed all events, we should
                    # have all the data we need to send out success/failure
                    # notification templates
                    uj = UnifiedJob.objects.get(pk=job_identifier)

                    if isinstance(uj, Job):
                        # *actual playbooks* send their success/failure
                        # notifications in response to the playbook_on_stats
                        # event handling code in main.models.events
                        pass
                    elif hasattr(uj, 'send_notification_templates'):
                        handle_success_and_failure_notifications.apply_async(
                            [uj.id])
                except Exception:
                    logger.exception(
                        'Worker failed to emit notifications: Job {}'.
                        format(job_identifier))
                finally:
                    # runs whether or not the notifications succeeded
                    self.subsystem_metrics.inc(
                        'callback_receiver_events_in_memory', -1)
                    GuidMiddleware.set_guid('')
                # EOF messages are never buffered or flushed
                return

            # removed from the message before it is turned into event fields
            skip_websocket_message = body.pop('skip_websocket_message', False)

            event = cls.create_from_data(**body)

            if skip_websocket_message:
                event._skip_websocket_message = True

            self.buff.setdefault(cls, []).append(event)

        retries = 0
        while retries <= self.MAX_RETRIES:
            try:
                self.flush(force=flush)
                break
            except (OperationalError, InterfaceError, InternalError):
                if retries >= self.MAX_RETRIES:
                    logger.exception(
                        'Worker could not re-establish database connectivity, giving up on one or more events.'
                    )
                    return
                # linear back-off: 0s, 60s, 120s, ...
                delay = 60 * retries
                logger.exception(
                    'Database Error Saving Job Event, retry #{i} in {delay} seconds:'
                    .format(i=retries + 1, delay=delay))
                # close the connection before the next attempt
                django_connection.close()
                time.sleep(delay)
                retries += 1
            except DatabaseError:
                # other database errors are not retried
                logger.exception('Database Error Saving Job Event')
                break
    except Exception as exc:
        # last line of defence: log with traceback, never propagate
        tb = traceback.format_exc()
        logger.error('Callback Task Processor Raised Exception: %r', exc)
        logger.error('Detail: {}'.format(tb))
def filter(self, record):
    """
    Stamp the log record with the current GUID and always let it through.

    ``record.guid`` becomes the current GUID, or ``'-'`` when there is none;
    in development mode only its first 8 characters are kept.

    :param record: the log record, modified in-place.
    :return: True
    """
    full_guid = GuidMiddleware.get_guid() or '-'
    record.guid = full_guid[:8] if MODE == 'development' else full_guid
    return True
def _queue_reserved_task(func, inner_task_id, resources, inner_args, inner_kwargs, options):
    """
    A task that encapsulates another task to be dispatched later.

    The encapsulated task is called the "inner" task. It is described by the function to call,
    the UUID to run it under, and the positional and keyword arguments to call it with
    (``inner_args`` and ``inner_kwargs``).

    This function loops until a worker could be acquired and every requested resource is
    reserved for it, then enqueues the inner task on the queue named after that worker. Which
    worker is picked is decided by ``_acquire_worker``.

    The task named "pulpcore.app.tasks.orphan.orphan_cleanup" is handled differently: it waits
    until no ReservedResource exists at all and is then enqueued on the "resource-manager"
    queue with ``is_async=False`` (presumably executing it in place - see the RQ docs),
    attributed to the current worker.

    A RedisConnectionError while enqueueing marks the task as failed instead of propagating.

    Args:
        func (callable): The function to be called. Its ``__module__`` and ``__name__`` form
            the task name.
        inner_task_id (str): The task_id to be set on the task being called. By providing
            the UUID, the caller can have an asynchronous reference to the inner task
            that will be dispatched.
        resources (iterable of str): The urls of the resources you wish to reserve for your
            task. The system will ensure that no other tasks that want that same reservation
            will run concurrently with yours.
        inner_args (tuple): The positional arguments to pass on to the task.
        inner_kwargs (dict): The keyword arguments to pass on to the task.
        options (dict): For all options accepted by enqueue see the RQ docs
    """
    redis_conn = connection.get_redis_connection()
    task_status = Task.objects.get(pk=inner_task_id)
    # continue logging under the correlation id recorded on the task
    GuidMiddleware.set_guid(task_status.logging_cid)
    task_name = func.__module__ + "." + func.__name__

    while True:
        if task_name == "pulpcore.app.tasks.orphan.orphan_cleanup":
            if ReservedResource.objects.exists():
                # wait until there are no reservations
                time.sleep(0.25)
                continue
            else:
                # no reservations left: handle the cleanup right here,
                # attributed to the worker running this function
                rq_worker = util.get_current_worker()
                worker = Worker.objects.get(name=rq_worker.name)
                task_status.worker = worker
                task_status.set_running()
                q = Queue("resource-manager", connection=redis_conn, is_async=False)
                try:
                    q.enqueue(
                        func,
                        args=inner_args,
                        kwargs=inner_kwargs,
                        job_id=inner_task_id,
                        job_timeout=TASK_TIMEOUT,
                        **options,
                    )
                    task_status.set_completed()
                except RedisConnectionError as e:
                    task_status.set_failed(e, None)
                return

        try:
            with transaction.atomic():
                # lock the worker - there is a similar lock in mark_worker_offline()
                worker = _acquire_worker(resources)

                # Attempt to lock all resources by their urls. Must be atomic to prevent deadlocks.
                for resource in resources:
                    # reuse a reservation this worker already holds, otherwise create one
                    if worker.reservations.filter(resource=resource).exists():
                        reservation = worker.reservations.get(resource=resource)
                    else:
                        reservation = ReservedResource.objects.create(
                            worker=worker, resource=resource
                        )
                    TaskReservedResource.objects.create(resource=reservation, task=task_status)
        except (Worker.DoesNotExist, IntegrityError):
            # no worker could be acquired, or we have a worker but could not create the
            # reservations: wait and try again
            time.sleep(0.25)
        else:
            # we have a worker with the locks
            break

    task_status.worker = worker
    task_status.save()

    try:
        # the inner task goes to the queue named after the chosen worker
        q = Queue(worker.name, connection=redis_conn)
        q.enqueue(
            func,
            args=inner_args,
            kwargs=inner_kwargs,
            job_id=inner_task_id,
            job_timeout=TASK_TIMEOUT,
            **options,
        )
    except RedisConnectionError as e:
        task_status.set_failed(e, None)
def enqueue_with_reservation(
    func, resources, args=None, kwargs=None, options=None, task_group=None
):
    """
    Enqueue a message to Pulp workers with a reservation.

    Besides the normal enqueue functionality, the locks needed for the given resources are
    requested, so that no two tasks claiming the same resource execute concurrently. Resources
    are first transformed into urls (one for each resource).

    The task is not dispatched directly. Dispatching is promised for later by wrapping the
    desired task in a :func:`_queue_reserved_task` task; see the docblock on
    :func:`_queue_reserved_task` for more information on this.

    A :class:`pulpcore.app.models.Task` object is created before returning, because Pulp
    expects to be able to poll on the task right after calling this method.

    Args:
        func (callable): The function to be run by RQ when the necessary locks are acquired.
        resources (list): A list of resources to reserve guaranteeing that only one task
            reserves these resources. Each resource can be either a (str) resource URL or a
            (django.models.Model) resource instance.
        args (tuple): The positional arguments to pass on to the task.
        kwargs (dict): The keyword arguments to pass on to the task.
        options (dict): The options to be passed on to the task.
        task_group (pulpcore.app.models.TaskGroup): A TaskGroup to add the created Task to.

    Returns (rq.job.job): An RQ Job instance as returned by RQ's enqueue function

    Raises:
        ValueError: When `resources` is an unsupported type.
    """
    args = args or tuple()
    kwargs = kwargs or dict()
    options = options or dict()

    def resource_url(candidate):
        # strings are taken as-is, model instances are turned into their URL
        if isinstance(candidate, str):
            return candidate
        if not isinstance(candidate, Model):
            raise ValueError(_("Must be (str|Model)"))
        return util.get_url(candidate)

    resources = {resource_url(candidate) for candidate in resources}
    inner_task_id = str(uuid.uuid4())
    resource_task_id = str(uuid.uuid4())
    redis_conn = connection.get_redis_connection()
    current_job = get_current_job(connection=redis_conn)

    # the spawned task's parent is the currently running task, whose ID is
    # the same as the rq Job ID
    parent_kwarg = (
        {"parent_task": Task.objects.get(pk=current_job.id)} if current_job else {}
    )

    with transaction.atomic():
        task = Task.objects.create(
            pk=inner_task_id,
            _resource_job_id=resource_task_id,
            state=TASK_STATES.WAITING,
            logging_cid=(GuidMiddleware.get_guid() or ""),
            task_group=task_group,
            name=f"{func.__module__}.{func.__name__}",
            **parent_kwarg,
        )
        for resource in resources:
            reservation_record, _created = ReservedResourceRecord.objects.get_or_create(
                resource=resource
            )
            TaskReservedResourceRecord.objects.create(resource=reservation_record, task=task)

    task_args = (func, inner_task_id, list(resources), args, kwargs, options)
    try:
        Queue("resource-manager", connection=redis_conn).enqueue(
            _queue_reserved_task,
            job_id=resource_task_id,
            args=task_args,
            job_timeout=TASK_TIMEOUT,
        )
    except RedisConnectionError as redis_error:
        # the Task exists already; record the failure on it
        task.set_failed(redis_error, None)

    return Job(id=inner_task_id, connection=redis_conn)
def test_is_valid_dashed_guid():
    """A dash-separated GUID string is accepted by the validator."""
    dashed_guid = '07742cab-407e-4e80-89eb-fd191acbb752'
    outcome = GuidMiddleware._validate_guid(dashed_guid)
    assert outcome is True
def test_valid_guid():
    """A GUID string written without dashes is accepted by the validator."""
    undashed_guid = '07742cab407e4e8089ebfd191acbb752'
    outcome = GuidMiddleware._validate_guid(undashed_guid)
    assert outcome is True