Пример #1
0
 def _command_stop(self, command):
     run_id = command['run_id']
     if run_id in self._run_processes:
         proc = self._run_processes[run_id]
         now = util.stopwatch_now()
         if proc.last_sigterm_time is None:
             proc.last_sigterm_time = now
             logger.info('Stop: %s', run_id)
             try:
                 proc.terminate()
             except OSError:  # if process is already dead
                 pass
         elif now > proc.last_sigterm_time + self._kill_delay:
             logger.info('Kill: %s', run_id)
             try:
                 proc.kill()
             except OSError:  # if process is already dead
                 pass
     else:
         logger.error('Run %s not running', run_id)
Пример #2
0
                    if sweep_command and isinstance(sweep_command, list):
                        self._sweep_command = sweep_command

        # TODO: include sweep ID
        agent = self._api.register_agent(socket.gethostname(),
                                         sweep_id=self._sweep_id)
        agent_id = agent["id"]

        try:
            while self._running:
                commands = util.read_many_from_queue(self._queue, 100,
                                                     self.POLL_INTERVAL)
                for command in commands:
                    command["resp_queue"].put(self._process_command(command))

                now = util.stopwatch_now()
                if self._last_report_time is None or (
                        self._report_interval != 0 and
                        now > self._last_report_time + self._report_interval):
                    logger.info("Running runs: %s",
                                list(self._run_processes.keys()))
                    self._last_report_time = now
                run_status = {}
                for run_id, run_process in list(
                        six.iteritems(self._run_processes)):
                    poll_result = run_process.poll()
                    if poll_result is None:
                        run_status[run_id] = True
                        continue
                    elif (not isinstance(poll_result, bool)
                          and isinstance(poll_result, int)
Пример #3
0
    def run(self):

        # TODO: catch exceptions, handle errors, show validation warnings, and make more generic
        sweep_obj = self._api.sweep(self._sweep_id, "{}")
        if sweep_obj:
            sweep_yaml = sweep_obj.get('config')
            if sweep_yaml:
                sweep_config = yaml.safe_load(sweep_yaml)
                if sweep_config:
                    sweep_command = sweep_config.get('command')
                    if sweep_command and isinstance(sweep_command, list):
                        self._sweep_command = sweep_command

        # TODO: include sweep ID
        agent = self._api.register_agent(socket.gethostname(),
                                         sweep_id=self._sweep_id)
        agent_id = agent['id']

        try:
            while self._running:
                commands = util.read_many_from_queue(self._queue, 100,
                                                     self.POLL_INTERVAL)
                for command in commands:
                    command['resp_queue'].put(self._process_command(command))

                now = util.stopwatch_now()
                if self._last_report_time is None or (
                        self._report_interval != 0 and
                        now > self._last_report_time + self._report_interval):
                    logger.info('Running runs: %s',
                                list(self._run_processes.keys()))
                    self._last_report_time = now
                run_status = {}
                for run_id, run_process in list(
                        six.iteritems(self._run_processes)):
                    poll_result = run_process.poll()
                    if poll_result is None:
                        run_status[run_id] = True
                        continue
                    elif not isinstance(poll_result, bool) and isinstance(
                            poll_result, int) and poll_result > 0:
                        self._failed += 1
                        if self.is_flapping():
                            logger.error(
                                "Detected %i failed runs in the first %i seconds, shutting down.",
                                self.FLAPPING_MAX_FAILURES,
                                self.FLAPPING_MAX_SECONDS)
                            logger.info(
                                "To disable this check set WANDB_AGENT_DISABLE_FLAPPING=true"
                            )
                            self._running = False
                            break
                    logger.info('Cleaning up finished run: %s', run_id)
                    del self._run_processes[run_id]
                    self._last_report_time = None
                    self._finished += 1

                if self._count and self._finished >= self._count or not self._running:
                    self._running = False
                    continue

                commands = self._api.agent_heartbeat(agent_id, {}, run_status)

                # TODO: send _server_responses
                self._server_responses = []
                for command in commands:
                    self._server_responses.append(
                        self._process_command(command))

        except KeyboardInterrupt:
            try:
                wandb.termlog(
                    'Ctrl-c pressed. Waiting for runs to end. Press ctrl-c again to terminate them.'
                )
                for run_id, run_process in six.iteritems(self._run_processes):
                    run_process.wait()
            except KeyboardInterrupt:
                pass
        finally:
            try:
                if not self._in_jupyter:
                    wandb.termlog(
                        'Terminating and syncing runs. Press ctrl-c to kill.')
                for run_id, run_process in six.iteritems(self._run_processes):
                    try:
                        run_process.terminate()
                    except OSError:
                        pass  # if process is already dead
                for run_id, run_process in six.iteritems(self._run_processes):
                    run_process.wait()
            except KeyboardInterrupt:
                wandb.termlog('Killing runs and quitting.')
                for run_id, run_process in six.iteritems(self._run_processes):
                    try:
                        run_process.kill()
                    except OSError:
                        pass  # if process is already dead
Пример #4
0
    def run(self):
        # TODO: include sweep ID
        agent = self._api.register_agent(
            socket.gethostname(), sweep_id=self._sweep_id)
        agent_id = agent['id']

        try:
            while self._running:
                commands = util.read_many_from_queue(
                    self._queue, 100, self.POLL_INTERVAL)
                for command in commands:
                    command['resp_queue'].put(self._process_command(command))

                now = util.stopwatch_now()
                if self._last_report_time is None or (self._report_interval != 0 and
                                                      now > self._last_report_time + self._report_interval):
                    logger.info('Running runs: %s', list(
                        self._run_processes.keys()))
                    self._last_report_time = now
                run_status = {}
                for run_id, run_process in list(six.iteritems(self._run_processes)):
                    if run_process.poll() is None:
                        run_status[run_id] = True
                    else:
                        logger.info('Cleaning up dead run: %s', run_id)
                        del self._run_processes[run_id]
                        self._last_report_time = None

                commands = self._api.agent_heartbeat(agent_id, {}, run_status)

                # TODO: send _server_responses
                self._server_responses = []
                for command in commands:
                    self._server_responses.append(
                        self._process_command(command))
        except KeyboardInterrupt:
            try:
                wandb.termlog(
                    'Ctrl-c pressed. Waiting for runs to end. Press ctrl-c again to terminate them.')
                for run_id, run_process in six.iteritems(self._run_processes):
                    run_process.wait()
            except KeyboardInterrupt:
                pass
        finally:
            try:
                if not self._in_jupyter:
                    wandb.termlog(
                        'Terminating and syncing runs. Press ctrl-c to kill.')
                for run_id, run_process in six.iteritems(self._run_processes):
                    try:
                        run_process.terminate()
                    except OSError:
                        pass  # if process is already dead
                for run_id, run_process in six.iteritems(self._run_processes):
                    run_process.wait()
            except KeyboardInterrupt:
                wandb.termlog('Killing runs and quitting.')
                try:
                    run_process.kill()
                except OSError:
                    pass  # if process is already dead