Example #1
    def __init__(self, model=None):
        self.parent_workflow_job_id = None
        self.host_map = {}
        self.guid = GuidMiddleware.get_guid()
        self.job_created = None
        self.recent_event_timings = deque(
            maxlen=settings.MAX_WEBSOCKET_EVENT_RATE)
        self.dispatcher = CallbackQueueDispatcher()
        self.safe_env = {}
        self.event_ct = 0
        self.model = model
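
The bounded deque built here is worth pausing on: with `maxlen=settings.MAX_WEBSOCKET_EVENT_RATE` it holds only the timestamps of the most recent websocket emissions, so the oldest entry reveals whether the last N messages all fit inside one second. Example #8 below uses exactly this to throttle websocket traffic. A minimal standalone sketch of the pattern (names are illustrative, not AWX API):

    import time
    from collections import deque

    MAX_RATE = 30  # stand-in for settings.MAX_WEBSOCKET_EVENT_RATE
    window = deque(maxlen=MAX_RATE)

    def should_emit_now():
        # allow at most MAX_RATE emissions per rolling second
        now = time.time()
        if len(window) < window.maxlen or now - window[0] > 1.0:
            window.append(now)
            return True
        return False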
Example #2
    def __init__(self, model=None):
        self.parent_workflow_job_id = None
        self.host_map = {}
        self.guid = get_guid()
        self.job_created = None
        self.recent_event_timings = deque(maxlen=settings.MAX_WEBSOCKET_EVENT_RATE)
        self.dispatcher = CallbackQueueDispatcher()
        self.safe_env = {}
        self.event_ct = 0
        self.model = model
        self.update_attempts = int(settings.DISPATCHER_DB_DOWNTOWN_TOLLERANCE / 5)  # sic: setting name is misspelled in AWX itself
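
The extra line at the end converts a database downtime tolerance, expressed in seconds, into a retry budget, on the assumption that each attempt costs roughly 5 seconds. A hypothetical sketch of how such a budget might be spent (`save_with_retries` is illustrative, not AWX code):

    import time

    def save_with_retries(save, attempts, delay=5):
        # `attempts` retries at `delay`-second intervals covers roughly
        # attempts * delay seconds of database downtime
        for i in range(attempts):
            try:
                return save()
            except Exception:  # in AWX this would be a specific DB error
                if i == attempts - 1:
                    raise
                time.sleep(delay)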
Example #3
    def run(self, instance, private_data_dir, playbook, module, module_args,
            event_data_key, ident=None):
        """
        Run a job on an isolated host.

        :param instance:         a `model.Job` instance
        :param private_data_dir: an absolute path on the local file system
                                 where job-specific data should be written
                                 (e.g., `/tmp/awx_N_xyz/`)
        :param playbook:         the playbook to run
        :param module:           the module to run
        :param module_args:      the module args to use
        :param event_data_key:   e.g., job_id, inventory_id, ...

        For a completed job run, this function returns (status, rc),
        representing the status and return code of the isolated
        `ansible-playbook` run.
        """
        self.ident = ident
        self.event_data_key = event_data_key
        self.instance = instance
        self.private_data_dir = private_data_dir
        self.runner_params = self.build_runner_params(
            [instance.execution_node],
            verbosity=min(5, self.instance.verbosity)
        )
        status, rc = self.dispatch(playbook, module, module_args)
        if status == 'successful':
            status, rc = self.check()
        else:
            # emit an EOF event
            event_data = {'event': 'EOF', 'final_counter': 0}
            event_data.setdefault(self.event_data_key, self.instance.id)
            CallbackQueueDispatcher().dispatch(event_data)
        return status, rc
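
A hypothetical invocation matching the signature above; `manager` and `job` are assumed to exist, and the path is illustrative:

    status, rc = manager.run(
        instance=job,                         # a model.Job
        private_data_dir='/tmp/awx_42_xyz/',
        playbook='run.yml',
        module=None,                          # playbook runs pass no module
        module_args=None,
        event_data_key='job_id',
        ident=str(job.pk),
    )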
Example #4
    def wrap_stdout_handle(instance,
                           private_data_dir,
                           stdout_handle,
                           event_data_key='job_id'):
        dispatcher = CallbackQueueDispatcher()

        def job_event_callback(event_data):
            event_data.setdefault(event_data_key, instance.id)
            if 'uuid' in event_data:
                filename = '{}-partial.json'.format(event_data['uuid'])
                partial_filename = os.path.join(private_data_dir, 'artifacts',
                                                'job_events', filename)
                try:
                    with codecs.open(partial_filename, 'r',
                                     encoding='utf-8') as f:
                        partial_event_data = json.load(f)
                    event_data.update(partial_event_data)
                except IOError:
                    if event_data.get('event', '') != 'verbose':
                        logger.error(
                            'Missing callback data for event type `{}`, uuid {}, job {}.\nevent_data: {}'.format(
                                event_data.get('event', ''),
                                event_data['uuid'], instance.id, event_data))
            dispatcher.dispatch(event_data)

        return OutputEventFilter(stdout_handle, job_event_callback)
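
A hypothetical usage of the wrapper, assuming `OutputEventFilter` exposes a file-like `write()` (the `job` object, paths, and variables are illustrative): every chunk written to the returned handle is scanned for event markers, and each completed event is merged with its `{uuid}-partial.json` sidecar before being dispatched to the callback queue.

    with open('/tmp/awx_42_xyz/stdout', 'w') as raw:
        handle = wrap_stdout_handle(job, '/tmp/awx_42_xyz', raw,
                                    event_data_key='job_id')
        handle.write(chunk_of_ansible_stdout)  # illustrative variable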
Example #5
    def dispatch(self, playbook=None, module=None, module_args=None):
        '''
        Ship the runner payload to a remote host for isolated execution.
        '''
        self.handled_events = set()
        self.started_at = time.time()

        self.build_isolated_job_data()
        extra_vars = {
            'src': self.private_data_dir,
            'dest': settings.AWX_PROOT_BASE_PATH,
            'ident': self.ident
        }
        if playbook:
            extra_vars['playbook'] = playbook
        if module and module_args:
            extra_vars['module'] = module
            extra_vars['module_args'] = module_args

        # Run ansible-playbook to launch a job on the isolated host.  This:
        #
        # - sets up a temporary directory for proot/bwrap (if necessary)
        # - copies encrypted job data from the controlling host to the isolated host (with rsync)
        # - writes the encryption secret to a named pipe on the isolated host
        # - launches ansible-runner
        args = self._build_args('run_isolated.yml', '%s,' % self.host,
                                extra_vars)
        if self.instance.verbosity:
            args.append('-%s' % ('v' * min(5, self.instance.verbosity)))
        buff = StringIO()
        logger.debug(
            'Starting job {} on isolated host with `run_isolated.yml` playbook.'
            .format(self.instance.id))
        status, rc = IsolatedManager.run_pexpect(
            args,
            self.awx_playbook_path(),
            self.management_env,
            buff,
            idle_timeout=self.idle_timeout,
            job_timeout=settings.AWX_ISOLATED_LAUNCH_TIMEOUT,
            pexpect_timeout=5)
        output = buff.getvalue()
        playbook_logger.info('Isolated job {} dispatch:\n{}'.format(
            self.instance.id, output))
        if status != 'successful':
            for event_data in [
                {
                    'event': 'verbose',
                    'stdout': output
                },
                {
                    'event': 'EOF',
                    'final_counter': 1
                },
            ]:
                event_data.setdefault(self.event_data_key, self.instance.id)
                CallbackQueueDispatcher().dispatch(event_data)
        return status, rc
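
`_build_args` is not shown in these excerpts, but the call sites pin down what it must return: an argv list for `ansible-playbook` with an inline inventory (the trailing comma in `'%s,' % self.host`) and the extra vars. A plausible sketch under those assumptions, not the AWX source:

    import json

    def _build_args(playbook, hosts, extra_vars):
        return [
            'ansible-playbook', playbook,
            '-i', hosts,                   # e.g. 'isolated-host,' -- inline inventory
            '-e', json.dumps(extra_vars),  # src/dest/ident plus optional playbook/module
        ]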
Example #6
    def check(self, interval=None):
        """
        Repeatedly poll the isolated node to determine if the job has run.

        On success, copy job artifacts to the controlling node.
        On failure, continue to poll the isolated node (until the job timeout
        is exceeded).

        For a completed job run, this function returns (status, rc),
        representing the status and return code of the isolated
        `ansible-playbook` run.

        :param interval: an interval (in seconds) to wait between status polls
        """
        interval = interval if interval is not None else settings.AWX_ISOLATED_CHECK_INTERVAL
        extravars = {'src': self.private_data_dir}
        status = 'failed'
        rc = None
        last_check = time.time()
        dispatcher = CallbackQueueDispatcher()
        while status == 'failed':
            canceled = self.cancelled_callback() if self.cancelled_callback else False
            if not canceled and time.time() - last_check < interval:
                # If the job isn't cancelled, but we haven't waited `interval` seconds, wait longer
                time.sleep(1)
                continue

            if canceled:
                logger.warning('Isolated job {} was manually cancelled.'.format(self.instance.id))

            logger.debug('Checking on isolated job {} with `check_isolated.yml`.'.format(self.instance.id))
            runner_obj = self.run_management_playbook('check_isolated.yml',
                                                      self.private_data_dir,
                                                      extravars=extravars)
            status, rc = runner_obj.status, runner_obj.rc

            if self.check_callback is not None and not self.captured_command_artifact:
                command_path = self.path_to('artifacts', self.ident, 'command')
                # If the configuration artifact has been synced back, update the model
                if os.path.exists(command_path):
                    try:
                        with open(command_path, 'r') as f:
                            data = json.load(f)
                        self.check_callback(data)
                        self.captured_command_artifact = True
                    except json.decoder.JSONDecodeError:  # Just in case it's not fully here yet.
                        pass

            self.consume_events(dispatcher)

            last_check = time.time()

        if status == 'successful':
            status_path = self.path_to('artifacts', self.ident, 'status')
            rc_path = self.path_to('artifacts', self.ident, 'rc')
            if os.path.exists(status_path):
                with open(status_path, 'r') as f:
                    status = f.readline()
                with open(rc_path, 'r') as f:
                    rc = int(f.readline())
            else:
                # if there's no status file, it means that runner _probably_
                # exited with a traceback (which should be logged to
                # daemon.log). Record it so we can see how runner failed.
                daemon_path = self.path_to('daemon.log')
                if os.path.exists(daemon_path):
                    with open(daemon_path, 'r') as f:
                        self.instance.result_traceback = f.read()
                        self.instance.save(update_fields=['result_traceback'])
                else:
                    logger.error('Failed to rsync daemon.log (is ansible-runner installed on the isolated host?)')
                status = 'failed'
                rc = 1

        # consume events one last time just to be sure we didn't miss anything
        # in the final sync
        self.consume_events(dispatcher)

        # emit an EOF event
        event_data = {
            'event': 'EOF',
            'final_counter': len(self.handled_events)
        }
        event_data.setdefault(self.event_data_key, self.instance.id)
        dispatcher.dispatch(event_data)

        return status, rc
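
`consume_events` is not shown in this excerpt, but Example #7 below inlines an equivalent loop; a sketch distilled from it (not the actual implementation):

    import json
    import os

    def consume_events(self, dispatcher):
        events_path = self.path_to('artifacts', self.ident, 'job_events')
        if not os.path.exists(events_path):
            return  # runner may not have rsynced any events back yet
        for event in set(os.listdir(events_path)) - self.handled_events:
            with open(os.path.join(events_path, event), 'r') as f:
                event_data = json.load(f)
            event_data.setdefault(self.event_data_key, self.instance.id)
            dispatcher.dispatch(event_data)
            self.handled_events.add(event)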
Example #7
    def check(self, interval=None):
        """
        Repeatedly poll the isolated node to determine if the job has run.

        On success, copy job artifacts to the controlling node.
        On failure, continue to poll the isolated node (until the job timeout
        is exceeded).

        For a completed job run, this function returns (status, rc),
        representing the status and return code of the isolated
        `ansible-playbook` run.

        :param interval: an interval (in seconds) to wait between status polls
        """
        interval = interval if interval is not None else settings.AWX_ISOLATED_CHECK_INTERVAL
        extra_vars = {'src': self.private_data_dir}
        args = self._build_args('check_isolated.yml', '%s,' % self.host,
                                extra_vars)
        if self.instance.verbosity:
            args.append('-%s' % ('v' * min(5, self.instance.verbosity)))

        status = 'failed'
        output = ''
        rc = None
        buff = StringIO()
        last_check = time.time()
        job_timeout = remaining = self.job_timeout
        dispatcher = CallbackQueueDispatcher()
        while status == 'failed':
            if job_timeout != 0:
                remaining = max(0, job_timeout - (time.time() - self.started_at))
                if remaining == 0:
                    # if it takes longer than $REMAINING_JOB_TIMEOUT to retrieve
                    # job artifacts from the host, consider the job failed
                    if isinstance(self.extra_update_fields, dict):
                        self.extra_update_fields['job_explanation'] = "Job terminated due to timeout"
                    status = 'failed'
                    break

            canceled = self.cancelled_callback() if self.cancelled_callback else False
            if not canceled and time.time() - last_check < interval:
                # If the job isn't cancelled, but we haven't waited `interval` seconds, wait longer
                time.sleep(1)
                continue

            buff = StringIO()
            logger.debug('Checking on isolated job {} with `check_isolated.yml`.'.format(self.instance.id))
            status, rc = IsolatedManager.run_pexpect(
                args,
                self.awx_playbook_path(),
                self.management_env,
                buff,
                cancelled_callback=self.cancelled_callback,
                idle_timeout=remaining,
                job_timeout=remaining,
                pexpect_timeout=5,
                proot_cmd=self.proot_cmd)
            output = buff.getvalue()
            playbook_logger.info('Isolated job {} check:\n{}'.format(
                self.instance.id, output))

            # discover new events and ingest them
            events_path = self.path_to('artifacts', self.ident, 'job_events')

            # it's possible that `events_path` doesn't exist *yet*, because runner
            # hasn't actually written any events yet (if you ran e.g., a sleep 30)
            # only attempt to consume events if any were rsynced back
            if os.path.exists(events_path):
                for event in set(os.listdir(events_path)) - self.handled_events:
                    path = os.path.join(events_path, event)
                    if os.path.exists(path):
                        with open(path, 'r') as f:
                            event_data = json.load(f)
                        event_data.setdefault(self.event_data_key,
                                              self.instance.id)
                        dispatcher.dispatch(event_data)
                        self.handled_events.add(event)

                        # handle artifacts
                        if event_data.get('event_data', {}).get('artifact_data', {}):
                            self.instance.artifacts = event_data['event_data']['artifact_data']
                            self.instance.save(update_fields=['artifacts'])

            last_check = time.time()

        if status == 'successful':
            status_path = self.path_to('artifacts', self.ident, 'status')
            rc_path = self.path_to('artifacts', self.ident, 'rc')
            with open(status_path, 'r') as f:
                status = f.readline()
            with open(rc_path, 'r') as f:
                rc = int(f.readline())

        # emit an EOF event
        event_data = {
            'event': 'EOF',
            'final_counter': len(self.handled_events)
        }
        event_data.setdefault(self.event_data_key, self.instance.id)
        dispatcher.dispatch(event_data)

        return status, rc
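
The timeout bookkeeping at the top of the loop is easy to misread, so here is the arithmetic worked through with illustrative numbers: with `job_timeout=600` and a job started 570 seconds ago, each management-playbook run gets a 30-second budget, and once elapsed time crosses 600 the `remaining == 0` branch marks the job failed.

    import time

    started_at = time.time() - 570   # pretend the job began 570s ago
    job_timeout = 600
    remaining = max(0, job_timeout - (time.time() - started_at))
    print(round(remaining))          # ~30 -- the budget for the next poll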
Example #8
class RunnerCallback:
    event_data_key = 'job_id'

    def __init__(self, model=None):
        self.parent_workflow_job_id = None
        self.host_map = {}
        self.guid = GuidMiddleware.get_guid()
        self.job_created = None
        self.recent_event_timings = deque(
            maxlen=settings.MAX_WEBSOCKET_EVENT_RATE)
        self.dispatcher = CallbackQueueDispatcher()
        self.safe_env = {}
        self.event_ct = 0
        self.model = model

    def update_model(self, pk, _attempt=0, **updates):
        return update_model(self.model, pk, _attempt=_attempt, **updates)

    def event_handler(self, event_data):
        #
        # ⚠️  D-D-D-DANGER ZONE ⚠️
        # This method is called once for *every event* emitted by Ansible
        # Runner as a playbook runs.  That means that changes to the code in
        # this method are _very_ likely to introduce performance regressions.
        #
        # Even if this function is made on average .05s slower, it can have
        # devastating performance implications for playbooks that emit
        # tens or hundreds of thousands of events.
        #
        # Proceed with caution!
        #
        """
        Ansible runner puts a parent_uuid on each event, no matter what the type.
        AWX only saves the parent_uuid if the event is for a Job.
        """
        # cache end_line locally for RunInventoryUpdate tasks
        # which generate job events from two 'streams':
        # ansible-inventory and the awx.main.commands.inventory_import
        # logger

        if event_data.get(self.event_data_key, None):
            if self.event_data_key != 'job_id':
                event_data.pop('parent_uuid', None)
        if self.parent_workflow_job_id:
            event_data['workflow_job_id'] = self.parent_workflow_job_id
        event_data['job_created'] = self.job_created
        if self.host_map:
            host = event_data.get('event_data', {}).get('host', '').strip()
            if host:
                event_data['host_name'] = host
                if host in self.host_map:
                    event_data['host_id'] = self.host_map[host]
            else:
                event_data['host_name'] = ''
                event_data['host_id'] = ''
            if event_data.get('event') == 'playbook_on_stats':
                event_data['host_map'] = self.host_map

        if isinstance(self, RunnerCallbackForProjectUpdate):
            # need a better way to have this check.
            # it's common for Ansible's SCM modules to print
            # error messages on failure that contain the plaintext
            # basic auth credentials (username + password)
            # it's also common for the nested event data itself (['res']['...'])
            # to contain unredacted text on failure
            # this is a _little_ expensive to filter
            # with regex, but project updates don't have many events,
            # so it *should* have a negligible performance impact
            task = event_data.get('event_data', {}).get('task_action')
            try:
                if task in ('git', 'svn'):
                    event_data_json = json.dumps(event_data)
                    event_data_json = UriCleaner.remove_sensitive(
                        event_data_json)
                    event_data = json.loads(event_data_json)
            except json.JSONDecodeError:
                pass

        if 'event_data' in event_data:
            event_data['event_data']['guid'] = self.guid

        # To prevent overwhelming the broadcast queue, skip some websocket messages
        if self.recent_event_timings:
            cpu_time = time.time()
            first_window_time = self.recent_event_timings[0]
            last_window_time = self.recent_event_timings[-1]

            if event_data.get('event') in MINIMAL_EVENTS:
                should_emit = True  # always send some types like playbook_on_stats
            elif event_data.get('stdout') == '' and event_data['start_line'] == event_data['end_line']:
                should_emit = False  # exclude events with no output
            else:
                should_emit = any([
                    # if the oldest message in the rate window went out over 1 second ago
                    cpu_time - first_window_time > 1.0,
                    # if the very last websocket message came in over 1/30 seconds ago
                    self.recent_event_timings.maxlen * (cpu_time - last_window_time) > 1.0,
                    # if the queue is not yet full
                    len(self.recent_event_timings) != self.recent_event_timings.maxlen,
                ])

            if should_emit:
                self.recent_event_timings.append(cpu_time)
            else:
                event_data.setdefault('event_data', {})
                event_data['skip_websocket_message'] = True

        elif self.recent_event_timings.maxlen:
            self.recent_event_timings.append(time.time())

        event_data.setdefault(self.event_data_key, self.instance.id)
        self.dispatcher.dispatch(event_data)
        self.event_ct += 1
        # handle artifacts
        if event_data.get('event_data', {}).get('artifact_data', {}):
            self.instance.artifacts = event_data['event_data']['artifact_data']
            self.instance.save(update_fields=['artifacts'])

        return False

    def cancel_callback(self):
        """
        Ansible runner callback to tell the job when/if it is canceled
        """
        unified_job_id = self.instance.pk
        try:
            self.instance.refresh_from_db()
        except self.instance.DoesNotExist:
            # refresh_from_db() raises rather than nulling self.instance
            logger.error(
                'unified job {} was deleted while running, canceling'.format(
                    unified_job_id))
            return True
        if self.instance.cancel_flag or self.instance.status == 'canceled':
            cancel_wait = (now() - self.instance.modified).seconds if self.instance.modified else 0
            if cancel_wait > 5:
                logger.warning(
                    'Request to cancel {} took {} seconds to complete.'.format(
                        self.instance.log_format, cancel_wait))
            return True
        return False

    def finished_callback(self, runner_obj):
        """
        Ansible runner callback triggered on finished run
        """
        event_data = {
            'event': 'EOF',
            'final_counter': self.event_ct,
            'guid': self.guid,
        }
        event_data.setdefault(self.event_data_key, self.instance.id)
        self.dispatcher.dispatch(event_data)

    def status_handler(self, status_data, runner_config):
        """
        Ansible runner callback triggered on status transition
        """
        if status_data['status'] == 'starting':
            job_env = dict(runner_config.env)
            # overwrite job_env with the safe (post-redaction) values
            for k, v in self.safe_env.items():
                if k in job_env:
                    job_env[k] = v
            from awx.main.signals import disable_activity_stream  # Circular import

            with disable_activity_stream():
                self.instance = self.update_model(self.instance.pk,
                                                  job_args=json.dumps(
                                                      runner_config.command),
                                                  job_cwd=runner_config.cwd,
                                                  job_env=job_env)
        elif status_data['status'] == 'failed':
            # For encrypted ssh_key_data, ansible-runner worker will open and write the
            # ssh_key_data to a named pipe. Then, once the podman container starts, ssh-agent will
            # read from this named pipe so that the key can be used in ansible-playbook.
            # Once the podman container exits, the named pipe is deleted.
            # However, if the podman container fails to start in the first place, e.g. the image
            # name is incorrect, then this pipe is not cleaned up. Eventually ansible-runner
            # processor will attempt to write artifacts to the private data dir via unstream_dir, requiring
            # that it open this named pipe. This leads to a hang. Thus, before any artifacts
            # are written by the processor, it's important to remove this ssh_key_data pipe.
            private_data_dir = self.instance.job_env.get(
                'AWX_PRIVATE_DATA_DIR', None)
            if private_data_dir:
                key_data_file = os.path.join(private_data_dir, 'artifacts',
                                             str(self.instance.id),
                                             'ssh_key_data')
                if os.path.exists(key_data_file) and stat.S_ISFIFO(
                        os.stat(key_data_file).st_mode):
                    os.remove(key_data_file)
        elif status_data['status'] == 'error':
            result_traceback = status_data.get('result_traceback', None)
            if result_traceback:
                from awx.main.signals import disable_activity_stream  # Circular import

                with disable_activity_stream():
                    self.instance = self.update_model(
                        self.instance.pk, result_traceback=result_traceback)
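
A minimal sketch of how a callback object like this plugs into ansible-runner. `ansible_runner.run()` does accept these four handler keyword arguments, but the `Job`/`job` objects and paths here are illustrative, and in AWX the task machinery (not user code) performs this wiring:

    import ansible_runner

    callback = RunnerCallback(model=Job)      # illustrative model class
    callback.instance = job                   # the running job, assumed available
    callback.job_created = str(job.created)

    res = ansible_runner.run(
        private_data_dir='/tmp/awx_42_xyz',   # illustrative path
        playbook='run.yml',
        event_handler=callback.event_handler,
        cancel_callback=callback.cancel_callback,
        finished_callback=callback.finished_callback,
        status_handler=callback.status_handler,
    )
    print(res.status, res.rc)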
Example #9
    def check(self, interval=None):
        """
        Repeatedly poll the isolated node to determine if the job has run.

        On success, copy job artifacts to the controlling node.
        On failure, continue to poll the isolated node (until the job timeout
        is exceeded).

        For a completed job run, this function returns (status, rc),
        representing the status and return code of the isolated
        `ansible-playbook` run.

        :param interval: an interval (in seconds) to wait between status polls
        """
        interval = interval if interval is not None else settings.AWX_ISOLATED_CHECK_INTERVAL
        extravars = {'src': self.private_data_dir}
        status = 'failed'
        rc = None
        last_check = time.time()
        dispatcher = CallbackQueueDispatcher()
        while status == 'failed':
            canceled = self.cancelled_callback() if self.cancelled_callback else False
            if not canceled and time.time() - last_check < interval:
                # If the job isn't cancelled, but we haven't waited `interval` seconds, wait longer
                time.sleep(1)
                continue

            if canceled:
                logger.warning('Isolated job {} was manually cancelled.'.format(self.instance.id))

            logger.debug('Checking on isolated job {} with `check_isolated.yml`.'.format(self.instance.id))
            runner_obj = self.run_management_playbook('check_isolated.yml',
                                                      self.private_data_dir,
                                                      extravars=extravars)
            status, rc = runner_obj.status, runner_obj.rc
            self.consume_events(dispatcher)

            last_check = time.time()

        if status == 'successful':
            status_path = self.path_to('artifacts', self.ident, 'status')
            rc_path = self.path_to('artifacts', self.ident, 'rc')
            with open(status_path, 'r') as f:
                status = f.readline()
            with open(rc_path, 'r') as f:
                rc = int(f.readline())

        # consume events one last time just to be sure we didn't miss anything
        # in the final sync
        self.consume_events(dispatcher)

        # emit an EOF event
        event_data = {
            'event': 'EOF',
            'final_counter': len(self.handled_events)
        }
        event_data.setdefault(self.event_data_key, self.instance.id)
        dispatcher.dispatch(event_data)

        return status, rc
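
Unlike Example #6, this variant reads the `status` and `rc` artifacts without first checking that they exist, so a final sync that never delivered them raises FileNotFoundError instead of falling back. A defensive read in the spirit of Example #6's guard (illustrative, not AWX source):

    import os

    def read_artifacts(self, default_status='failed', default_rc=1):
        status_path = self.path_to('artifacts', self.ident, 'status')
        rc_path = self.path_to('artifacts', self.ident, 'rc')
        if not (os.path.exists(status_path) and os.path.exists(rc_path)):
            return default_status, default_rc  # artifacts never synced back
        with open(status_path, 'r') as f:
            status = f.readline().strip()
        with open(rc_path, 'r') as f:
            rc = int(f.readline())
        return status, rc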