def __init__(self):
    """Bring the agent up: logging, task pool, workspace cleanup, docker and server connection.

    NOTE(review): this looks like an older variant of ``Agent.__init__`` (a newer
    one with an explicit log-thread executor and docker timeout exists in this
    file) — confirm which one is actually in use.
    """
    # Structured logger tagged as an AGENT service emitting LOGJ-format events.
    self.logger = sly.get_task_logger('agent')
    sly.change_formatters_default_values(self.logger, 'service_type', sly.ServiceType.AGENT)
    sly.change_formatters_default_values(self.logger, 'event_type', sly.EventType.LOGJ)
    # Queue + handler pair: log records are buffered here and shipped to the
    # server by a separate submitter; a copy also goes to a file on disk.
    self.log_queue = LogQueue()
    add_task_handler(self.logger, self.log_queue)
    sly.add_default_logging_into_file(self.logger, constants.AGENT_LOG_DIR())
    self.logger.info('Agent comes back...')
    # Task bookkeeping: the pool maps task_id -> task process object and is
    # guarded by task_pool_lock because several worker threads mutate it.
    self.task_pool_lock = threading.Lock()
    self.task_pool = {}  # task_id -> task_manager (process_id)
    self.thread_pool = ThreadPoolExecutor(max_workers=10)
    self.thread_list = []
    self.daemons_list = []
    # Clean leftovers from a previous run before accepting new work.
    sly.fs.clean_dir(constants.AGENT_TMP_DIR())
    self._stop_missed_containers()
    self.docker_api = docker.from_env(version='auto')
    self._docker_login()
    self.logger.info('Agent is ready to get tasks.')
    # Server RPC channel; established last so everything above is ready
    # before the server starts sending tasks.
    self.api = sly.AgentAPI(constants.TOKEN(), constants.SERVER_ADDRESS(),
                            self.logger, constants.TIMEOUT_CONFIG_PATH())
    self.agent_connect_initially()
    self.logger.info('Agent connected to server.')
def init_logger(self, loglevel=None):
    """Create the per-task logger and its delivery machinery.

    :param loglevel: optional level forwarded to ``sly.get_task_logger``;
                     ``None`` presumably means the library default — TODO confirm.
    """
    # Logger is named after the task id so records can be attributed server-side.
    self.logger = sly.get_task_logger(self.info['task_id'], loglevel=loglevel)
    sly.change_formatters_default_values(self.logger, 'service_type', sly.ServiceType.AGENT)
    sly.change_formatters_default_values(self.logger, 'event_type', sly.EventType.LOGJ)
    # Records are queued here and drained by a dedicated single-worker
    # executor; a file copy goes into the task's own logs directory.
    self.log_queue = LogQueue()
    add_task_handler(self.logger, self.log_queue)
    sly.add_default_logging_into_file(self.logger, self.dir_logs)
    # One worker only: log batches must be sent strictly in order.
    self.executor_log = concurrent.futures.ThreadPoolExecutor(max_workers=1)
def __init__(self):
    """Bring the agent up: logging, task pool, old-agent cleanup, docker and server connection.

    NOTE(review): this appears to be the newer of the two ``Agent.__init__``
    variants present in this file (adds log-thread fields, duplicate-agent
    validation and a docker API timeout) — confirm which one is in use.
    """
    # Structured logger tagged as an AGENT service emitting LOGJ-format events.
    self.logger = sly.get_task_logger('agent')
    sly.change_formatters_default_values(self.logger, 'service_type', sly.ServiceType.AGENT)
    sly.change_formatters_default_values(self.logger, 'event_type', sly.EventType.LOGJ)
    # Queue + handler pair: records buffered here are shipped to the server
    # by a dedicated single-worker log executor; a copy also goes to disk.
    self.log_queue = LogQueue()
    add_task_handler(self.logger, self.log_queue)
    sly.add_default_logging_into_file(self.logger, constants.AGENT_LOG_DIR())
    self._stop_log_event = threading.Event()
    self.executor_log = ThreadPoolExecutor(max_workers=1)
    self.future_log = None
    self.logger.info('Agent comes back...')
    # Task bookkeeping: pool maps task_id -> task process object, guarded by
    # a lock because several worker threads mutate it concurrently.
    self.task_pool_lock = threading.Lock()
    self.task_pool = {}  # task_id -> task_manager (process_id)
    self.thread_pool = ThreadPoolExecutor(max_workers=10)
    self.thread_list = []
    self.daemons_list = []
    # Replace any previous agent container and refuse to run two agents
    # with the same token.
    self._remove_old_agent()
    self._validate_duplicated_agents()
    sly.fs.clean_dir(constants.AGENT_TMP_DIR())
    self._stop_missed_containers(constants.TASKS_DOCKER_LABEL())
    # for compatibility with old plugins
    self._stop_missed_containers(constants.TASKS_DOCKER_LABEL_LEGACY())
    self.docker_api = docker.from_env(
        version='auto', timeout=constants.DOCKER_API_CALL_TIMEOUT())
    self._docker_login()
    self.logger.info('Agent is ready to get tasks.')
    # Server RPC channel; established last so everything above is ready
    # before the server starts sending tasks.
    self.api = sly.AgentAPI(constants.TOKEN(), constants.SERVER_ADDRESS(),
                            self.logger, constants.TIMEOUT_CONFIG_PATH())
    self.agent_connect_initially()
    self.logger.info('Agent connected to server.')
class TaskLogged(multiprocessing.Process):
    """Base class for a task that runs in its own process with server-streamed logging.

    Subclasses implement :meth:`task_main_func`. The process runs the task in a
    worker thread while the main thread of the process watches for a stop
    request; a second thread continuously drains the log queue to the server.
    The process exit code encodes the task outcome via ``exit_codes``.
    """

    def __init__(self, task_info):
        super().__init__()
        self.daemon = True
        self.info = deepcopy(task_info)
        # Move API key out of the task info message so that it does not get
        # into logs.
        self._user_api_key = self.info.pop('user_api_key', None)

        # Per-task working directories (agent-side and host-side paths).
        self.dir_task = osp.join(constants.AGENT_TASKS_DIR(), str(self.info['task_id']))
        self.dir_logs = osp.join(self.dir_task, 'logs')
        sly.fs.mkdir(self.dir_task)
        sly.fs.mkdir(self.dir_logs)
        self.dir_task_host = osp.join(constants.AGENT_TASKS_DIR_HOST(), str(self.info['task_id']))

        # _stop_log_event: asks the log-submitter thread to drain and exit.
        # _stop_event: cross-process flag asking the task itself to stop.
        self._stop_log_event = threading.Event()
        self._stop_event = multiprocessing.Event()

        # pre-init for static analysis; real values are created in the child
        # process inside run() -> init_logger()/init_api()/init_additional().
        self.logger = None
        self.log_queue = None
        self.executor_log = None
        self.future_log = None
        self.api = None
        self.data_mgr = None
        self.public_api = None
        self.public_api_context = None

    def init_logger(self):
        """Create the per-task logger, its queue handler, file sink and submit executor."""
        self.logger = sly.get_task_logger(self.info['task_id'])
        sly.change_formatters_default_values(self.logger, 'service_type', sly.ServiceType.AGENT)
        sly.change_formatters_default_values(self.logger, 'event_type', sly.EventType.LOGJ)
        self.log_queue = LogQueue()
        add_task_handler(self.logger, self.log_queue)
        sly.add_default_logging_into_file(self.logger, self.dir_logs)
        # Single worker: log batches must reach the server strictly in order.
        self.executor_log = concurrent.futures.ThreadPoolExecutor(max_workers=1)

    def init_api(self):
        """Create the agent RPC channel and, when a user key was supplied, a public API client."""
        self.api = sly.AgentAPI(constants.TOKEN(), constants.SERVER_ADDRESS(),
                                self.logger, constants.TIMEOUT_CONFIG_PATH())
        if self._user_api_key is not None:
            self.public_api = sly.Api(constants.SERVER_ADDRESS(), self._user_api_key)
            self.public_api.add_additional_field('taskId', self.info['task_id'])
            self.public_api_context = self.public_api.task.get_context(self.info['task_id'])
        # else -> TelemetryReporter

    def init_additional(self):
        """Hook for extra per-task setup; base version wires up the DataManager."""
        self.data_mgr = DataManager(self.logger, self.api,
                                    self.public_api, self.public_api_context)

    def submit_log(self):
        """Drain the log queue to the server; runs on the log executor thread.

        After the stop event is set, performs one more empty-poll cycle so that
        records logged just before shutdown are still delivered, then returns True.
        """
        break_flag = False
        while True:
            log_lines = self.log_queue.get_log_batch_nowait()
            if len(log_lines) > 0:
                self.api.simple_request('Log', sly.api_proto.Empty,
                                        sly.api_proto.LogLines(data=log_lines))
                break_flag = False
            else:
                if break_flag:
                    return True
                # was: isSet() — deprecated camelCase alias of Event.is_set()
                if self._stop_log_event.is_set():
                    break_flag = True  # exit after next loop without data
                time.sleep(0.3)

    def end_log_stop(self):
        """Log a user-initiated stop; returns the event type used as exit status."""
        self.logger.info('TASK_END', extra={
            'event_type': sly.EventType.TASK_STOPPED,
            'stopped': 'by_user'
        })
        return sly.EventType.TASK_STOPPED

    def end_log_crash(self, e):
        """Log a crash with traceback; returns the event type used as exit status."""
        self.logger.critical('TASK_END', exc_info=True, extra={
            'event_type': sly.EventType.TASK_CRASHED,
            'exc_str': str(e)
        })
        return sly.EventType.TASK_CRASHED

    def end_log_finish(self):
        """Log normal completion; returns the event type used as exit status."""
        self.logger.info('TASK_END',
                         extra={'event_type': sly.EventType.TASK_FINISHED})
        return sly.EventType.TASK_FINISHED

    # in new process
    def run(self):
        """Process entry point: set up logging/API, run the task, report outcome via exit code."""
        try:
            self.init_logger()
            self.init_api()
            self.future_log = self.executor_log.submit(self.submit_log)  # run log submitting
        except Exception as e:
            # Logging is not available if this failed, so fall back to stdout
            # plus a crash-dump file.
            print(e)
            # BUGFIX: the original passed the Exception object itself to
            # json.dump (TypeError: not JSON serializable, so the dump file was
            # never written) and leaked the file handle.
            with open(os.path.join(constants.AGENT_ROOT_DIR(), 'logger_fail.json'), 'w') as fout:
                json.dump({'error': repr(e)}, fout)
            os._exit(1)  # ok, documented

        try:
            self.report_start()
            self.init_additional()
            self.run_and_wait(self.task_main_func)
        except StopTaskException:
            exit_status = self.end_log_stop()
        except Exception as e:
            exit_status = self.end_log_crash(e)
        else:
            exit_status = self.end_log_finish()

        self.logger.info("WAIT_FOR_TASK_LOG")
        self.stop_log_thread()
        sys.exit(exit_codes[exit_status])

    def task_main_func(self):
        """Actual task body; must be provided by subclasses."""
        raise NotImplementedError()

    def run_and_wait(self, subtask_fn):
        """Run ``subtask_fn`` on a worker thread, polling for stop requests and log-thread health.

        :raises StopTaskException: when the cross-process stop event is set.
        :raises RuntimeError: when the log-submitter thread died (logs would be lost).
        """
        executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
        future = executor.submit(subtask_fn)
        while future.running():
            time.sleep(0.5)
            if self._stop_event.is_set():
                executor.shutdown(wait=False)
                raise StopTaskException()
            if not self.future_log.running():
                raise RuntimeError("SUBMIT_LOGS_ERROR")
        executor.shutdown(wait=True)
        return future.result()

    def stop_log_thread(self):
        """Ask the log submitter to drain and exit, then wait for it."""
        self._stop_log_event.set()
        self.executor_log.shutdown(wait=True)
        return self.future_log.result()  # crash if log thread crashed

    def join(self, timeout=None):
        """Signal the task to stop, then join the process (see Process.join)."""
        self._stop_event.set()
        super().join(timeout)

    def report_start(self):
        """Hook: notify the server that the task started; base version does nothing."""
        pass

    def clean_task_dir(self):
        """Hook: remove the task's temporary data; base version does nothing."""
        pass
class Agent:
    """Supervisely agent daemon: connects to the server, spawns task processes,
    streams logs, and supervises helper daemons (telemetry, image streamer)."""

    def __init__(self):
        # Structured logger tagged as an AGENT service emitting LOGJ-format events.
        self.logger = sly.get_task_logger('agent')
        sly.change_formatters_default_values(self.logger, 'service_type', sly.ServiceType.AGENT)
        sly.change_formatters_default_values(self.logger, 'event_type', sly.EventType.LOGJ)
        # Queue + handler pair: records buffered here are shipped to the server
        # by submit_log(); a copy also goes to a file on disk.
        self.log_queue = LogQueue()
        add_task_handler(self.logger, self.log_queue)
        sly.add_default_logging_into_file(self.logger, constants.AGENT_LOG_DIR())
        self.logger.info('Agent comes back...')
        # Task bookkeeping: pool maps task_id -> task process object, guarded
        # by task_pool_lock because several worker threads mutate it.
        self.task_pool_lock = threading.Lock()
        self.task_pool = {}  # task_id -> task_manager (process_id)
        self.thread_pool = ThreadPoolExecutor(max_workers=10)
        self.thread_list = []
        self.daemons_list = []
        # Replace any previous agent container and refuse duplicate agents.
        self._remove_old_agent()
        self._validate_duplicated_agents()
        sly.fs.clean_dir(constants.AGENT_TMP_DIR())
        self._stop_missed_containers(constants.TASKS_DOCKER_LABEL())
        # for compatibility with old plugins
        self._stop_missed_containers(constants.TASKS_DOCKER_LABEL_LEGACY())
        self.docker_api = docker.from_env(version='auto')
        self._docker_login()
        self.logger.info('Agent is ready to get tasks.')
        # Server RPC channel; established last so everything above is ready
        # before the server starts sending tasks.
        self.api = sly.AgentAPI(constants.TOKEN(), constants.SERVER_ADDRESS(),
                                self.logger, constants.TIMEOUT_CONFIG_PATH())
        self.agent_connect_initially()
        self.logger.info('Agent connected to server.')

    def _remove_old_agent(self):
        """During upgrade: remove the container named by REMOVE_OLD_AGENT and
        take over its canonical name. No-op when the env var is unset."""
        container_id = os.getenv('REMOVE_OLD_AGENT', None)
        if container_id is None:
            return
        dc = docker.from_env()
        olg_agent = dc.containers.get(container_id)
        olg_agent.remove(force=True)
        # After removal exactly one container with this token should remain —
        # the current one; rename it to the canonical agent name.
        agent_same_token = []
        for cont in dc.containers.list():
            if constants.TOKEN() in cont.name:
                agent_same_token.append(cont)
        if len(agent_same_token) > 1:
            raise RuntimeError(
                "Several agents with the same token are running. Please, kill them or contact support."
            )
        agent_same_token[0].rename('supervisely-agent-{}'.format(
            constants.TOKEN()))

    def _validate_duplicated_agents(self):
        """Fail fast when more than one running container carries this agent's token."""
        dc = docker.from_env()
        agent_same_token = []
        for cont in dc.containers.list():
            if constants.TOKEN() in cont.name:
                agent_same_token.append(cont)
        if len(agent_same_token) > 1:
            raise RuntimeError("Agent with the same token already exists.")

    def agent_connect_initially(self):
        """Collect hardware/image info and announce this agent to the server."""
        try:
            hw_info = get_hw_info()
        except Exception:
            hw_info = {}
            self.logger.debug('Hardware information can not be obtained')
        # Inspect our own container through the docker unix socket to learn
        # the image name/version this agent is running from.
        docker_inspect_cmd = "curl -s --unix-socket /var/run/docker.sock http:/containers/$(hostname)/json"
        docker_img_info = subprocess.Popen(
            [docker_inspect_cmd],
            shell=True,
            executable="/bin/bash",
            stdout=subprocess.PIPE).communicate()[0]
        self.agent_info = {
            'hardware_info': hw_info,
            'agent_image': json.loads(docker_img_info)["Config"]["Image"],
            'agent_version': json.loads(docker_img_info)["Config"]["Labels"]["VERSION"],
            'agent_image_digest': get_self_docker_image_digest()
        }
        self.api.simple_request(
            'AgentConnected', sly.api_proto.ServerInfo,
            sly.api_proto.AgentInfo(info=json.dumps(self.agent_info)))

    def send_connect_info(self):
        """Heartbeat loop: ping the server every 2 seconds forever."""
        while True:
            time.sleep(2)
            self.api.simple_request('AgentPing', sly.api_proto.Empty,
                                    sly.api_proto.Empty())

    def get_new_task(self):
        """Blocking stream of new tasks from the server; starts each one."""
        for task in self.api.get_endless_stream('GetNewTask',
                                                sly.api_proto.Task,
                                                sly.api_proto.Empty()):
            task_msg = json.loads(task.data)
            task_msg['agent_info'] = self.agent_info
            self.logger.debug('GET_NEW_TASK', extra={'task_msg': task_msg})
            self.start_task(task_msg)

    def get_stop_task(self):
        """Blocking stream of stop requests from the server; stops each task."""
        for task in self.api.get_endless_stream('GetStopTask',
                                                sly.api_proto.Id,
                                                sly.api_proto.Empty()):
            stop_task_id = task.id
            self.logger.debug('GET_STOP_TASK', extra={'task_id': stop_task_id})
            self.stop_task(stop_task_id)

    def stop_task(self, task_id):
        """Stop a running task process, clean its data, and drop it from the pool.

        When the task is unknown, a TASK_MISSED/TASK_STOPPED event is emitted so
        the server still sees the stop as handled.
        """
        self.task_pool_lock.acquire()
        try:
            if task_id in self.task_pool:
                # join() also sets the task's stop event (see TaskLogged.join);
                # terminate() is the fallback after the 20 s grace period.
                self.task_pool[task_id].join(timeout=20)
                self.task_pool[task_id].terminate()
                task_extra = {
                    'task_id': task_id,
                    # NOTE(review): 'exit_status' holds the process object
                    # itself, not a status value — looks suspicious but the
                    # same pattern appears in _forget_task; confirm intent.
                    'exit_status': self.task_pool[task_id],
                    'exit_code': self.task_pool[task_id].exitcode
                }
                self.logger.info('REMOVE_TASK_TEMP_DATA IF NECESSARY',
                                 extra=task_extra)
                self.task_pool[task_id].clean_task_dir()
                self.logger.info('TASK_STOPPED', extra=task_extra)
                del self.task_pool[task_id]
            else:
                self.logger.warning('Task could not be stopped. Not found',
                                    extra={'task_id': task_id})
                self.logger.info('TASK_MISSED',
                                 extra={
                                     'service_type': sly.ServiceType.TASK,
                                     'event_type': sly.EventType.TASK_STOPPED,
                                     'task_id': task_id
                                 })
        finally:
            self.task_pool_lock.release()

    def start_task(self, task):
        """Spawn a task process for the given task message, unless it is already running."""
        self.task_pool_lock.acquire()
        try:
            if task['task_id'] in self.task_pool:
                self.logger.warning('TASK_ID_ALREADY_STARTED',
                                    extra={'task_id': task['task_id']})
            else:
                task_id = task['task_id']
                task["agent_version"] = self.agent_info["agent_version"]
                self.task_pool[task_id] = create_task(task, self.docker_api)
                self.task_pool[task_id].start()
        finally:
            self.task_pool_lock.release()

    def tasks_health_check(self):
        """Every 3 seconds, forget task processes that are no longer alive."""
        while True:
            time.sleep(3)
            self.task_pool_lock.acquire()
            try:
                # Snapshot the keys: _forget_task mutates task_pool.
                all_tasks = list(self.task_pool.keys())
                for task_id in all_tasks:
                    val = self.task_pool[task_id]
                    if not val.is_alive():
                        self._forget_task(task_id)
            finally:
                self.task_pool_lock.release()

    # used only in healthcheck
    def _forget_task(self, task_id):
        """Clean up and drop a finished task. Caller must hold task_pool_lock."""
        task_extra = {
            'event_type': sly.EventType.TASK_REMOVED,
            'task_id': task_id,
            # NOTE(review): 'exit_status' is the process object itself — same
            # pattern as in stop_task; confirm intent.
            'exit_status': self.task_pool[task_id],
            'exit_code': self.task_pool[task_id].exitcode,
            'service_type': sly.ServiceType.TASK
        }
        self.logger.info('REMOVE_TASK_TEMP_DATA IF NECESSARY',
                         extra=task_extra)
        self.task_pool[task_id].clean_task_dir()
        del self.task_pool[task_id]
        self.logger.info('TASK_REMOVED', extra=task_extra)

    @staticmethod
    def _remove_containers(label_filter):
        """Force-remove all containers matching the docker label filter; returns them."""
        dc = docker.from_env()
        stop_list = dc.containers.list(all=True, filters=label_filter)
        for cont in stop_list:
            cont.remove(force=True)
        return stop_list

    def _stop_missed_containers(self, ecosystem_token):
        """Remove orphaned task containers from a previous agent run and log each as missed."""
        self.logger.info('Searching for missed containers...')
        label_filter = {'label': 'ecosystem_token={}'.format(ecosystem_token)}
        stopped_list = Agent._remove_containers(label_filter=label_filter)
        if len(stopped_list) == 0:
            self.logger.info('There are no missed containers.')
        for cont in stopped_list:
            self.logger.info('Container stopped',
                             extra={
                                 'cont_id': cont.id,
                                 'labels': cont.labels
                             })
            self.logger.info('TASK_MISSED',
                             extra={
                                 'service_type': sly.ServiceType.TASK,
                                 'event_type': sly.EventType.MISSED_TASK_FOUND,
                                 'task_id': int(cont.labels['task_id'])
                             })

    def _docker_login(self):
        """Log the docker client into each configured registry.

        Credentials come as comma-separated parallel lists; empty registry
        entries are skipped.
        """
        doc_logs = constants.DOCKER_LOGIN().split(',')
        doc_pasws = constants.DOCKER_PASSWORD().split(',')
        doc_regs = constants.DOCKER_REGISTRY().split(',')
        for login, password, registry in zip(doc_logs, doc_pasws, doc_regs):
            if registry:
                doc_login = self.docker_api.login(username=login,
                                                  password=password,
                                                  registry=registry)
                self.logger.info('DOCKER_CLIENT_LOGIN_SUCCESS',
                                 extra={
                                     **doc_login, 'registry': registry
                                 })

    def submit_log(self):
        """Forever: block for the next batch of agent log records and ship it to the server."""
        while True:
            log_lines = self.log_queue.get_log_batch_blocking()
            self.api.simple_request('Log', sly.api_proto.Empty,
                                    sly.api_proto.LogLines(data=log_lines))

    def follow_daemon(self, process_cls, name, sleep_sec=5):
        """Start a helper daemon process and watch it; raise if it dies.

        :param process_cls: zero-arg process class to instantiate and start.
        :param name: label used in the '<name>_CRASHED' error message.
        :param sleep_sec: polling interval between liveness checks.
        """
        proc = process_cls()
        self.daemons_list.append(proc)
        try:
            proc.start()
            while True:
                if not proc.is_alive():
                    err_msg = '{}_CRASHED'.format(name)
                    self.logger.error('Agent process is dead.',
                                      extra={'exc_str': err_msg})
                    time.sleep(1)  # an opportunity to send log
                    raise RuntimeError(err_msg)
                time.sleep(sleep_sec)
        except Exception as e:
            proc.terminate()
            proc.join(timeout=2)
            raise e

    def inf_loop(self):
        """Submit all long-running agent loops and daemon watchers to the thread pool."""
        self.thread_list.append(
            self.thread_pool.submit(sly.function_wrapper,
                                    self.tasks_health_check))
        self.thread_list.append(
            self.thread_pool.submit(sly.function_wrapper, self.submit_log))
        self.thread_list.append(
            self.thread_pool.submit(sly.function_wrapper, self.get_new_task))
        self.thread_list.append(
            self.thread_pool.submit(sly.function_wrapper, self.get_stop_task))
        self.thread_list.append(
            self.thread_pool.submit(sly.function_wrapper,
                                    self.send_connect_info))
        self.thread_list.append(
            self.thread_pool.submit(sly.function_wrapper, self.follow_daemon,
                                    TelemetryReporter, 'TELEMETRY_REPORTER'))
        self.thread_list.append(
            self.thread_pool.submit(sly.function_wrapper, self.follow_daemon,
                                    ImageStreamer, 'IMAGE_STREAMER'))

    def wait_all(self):
        """Block until one of the agent loops raises, then terminate the daemons.

        NOTE(review): the wait + drain loop below appears twice verbatim —
        looks like copy-paste duplication rather than an intentional retry;
        behavior is kept as-is pending confirmation.
        """
        def terminate_all_deamons():
            for process in self.daemons_list:
                process.terminate()
                process.join(timeout=2)
            return

        futures_statuses = wait(self.thread_list,
                                return_when='FIRST_EXCEPTION')
        for future in self.thread_list:
            if future.done():
                try:
                    future.result()
                except Exception:
                    terminate_all_deamons()
                    break
        futures_statuses = wait(self.thread_list,
                                return_when='FIRST_EXCEPTION')
        for future in self.thread_list:
            if future.done():
                try:
                    future.result()
                except Exception:
                    terminate_all_deamons()
                    break
        # Any still-pending future here means a base loop failed abnormally.
        if len(futures_statuses.not_done) != 0:
            raise RuntimeError("AGENT: EXCEPTION IN BASE FUTURE !!!")