def execute(config: str, debug: bool, params):
    check_statuses()
    _create_computer()
    _create_docker()

    # Fail all InProgress tasks
    logger = create_logger(_session, __name__)
    provider = TaskProvider(_session)
    step_provider = StepProvider(_session)

    for t in provider.by_status(TaskStatus.InProgress,
                                worker_index=WORKER_INDEX):
        step = step_provider.last_for_task(t.id)
        logger.error(
            f'Task Id = {t.id} was in InProgress state '
            f'when another task arrived at the same worker',
            ComponentType.Worker, t.computer_assigned, t.id, step)
        provider.change_status(t, TaskStatus.Failed)

    # Create dags
    dags = _dag(config, debug, params=params)
    for dag in dags:
        for ids in dag.values():
            for id in ids:
                task = provider.by_id(id)
                task.gpu_assigned = ','.join(
                    [str(i) for i in range(torch.cuda.device_count())])
                provider.commit()

                execute_by_id(id, exit=False)
def execute(config: str, debug: bool):
    _create_computer()

    # Fail all InProgress tasks
    logger = create_logger(_session, __name__)
    provider = TaskProvider(_session)
    step_provider = StepProvider(_session)

    for t in provider.by_status(TaskStatus.InProgress,
                                worker_index=WORKER_INDEX):
        step = step_provider.last_for_task(t.id)
        logger.error(
            f'Task Id = {t.id} was in InProgress state '
            f'when another task arrived at the same worker',
            ComponentType.Worker, t.computer_assigned, t.id, step)
        provider.change_status(t, TaskStatus.Failed)

    # Create dag
    created_dag = _dag(config, debug)
    for ids in created_dag.values():
        for id in ids:
            task = provider.by_id(id)
            task.gpu_assigned = ','.join(
                [str(i) for i, _ in enumerate(GPUtil.getGPUs())])
            provider.commit()

            execute_by_id(id, exit=False)
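# A minimal sketch (illustration only, not called anywhere in this module) of
# the Task.gpu_assigned convention built above: a comma-separated string of
# zero-based device indices. Both variants - range(torch.cuda.device_count())
# and enumerate(GPUtil.getGPUs()) - produce the same string for the same
# number of visible GPUs.
def _gpu_assigned_example(device_count: int = 3) -> str:
    # device_count stands in for torch.cuda.device_count(); on a 3-GPU
    # machine this returns '0,1,2'.
    return ','.join(str(i) for i in range(device_count))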
def stop(logger, session: Session, task: Task, dag: Dag):
    provider = TaskProvider(session)
    if task.status > TaskStatus.InProgress.value:
        return task.status

    status = TaskStatus.Stopped
    try:
        if task.status != TaskStatus.NotRan.value:
            app.control.revoke(task.celery_id, terminate=True)
        else:
            status = TaskStatus.Skipped
    except Exception as e:
        if Session.sqlalchemy_error(e):
            try:
                logger.error(traceback.format_exc(), ComponentType.API)
            except Exception:
                pass
            raise
        logger.error(traceback.format_exc(), ComponentType.API)
    finally:
        if task.pid:
            queue = f'{task.computer_assigned}_' \
                    f'{dag.docker_img or "default"}_supervisor'
            kill.apply_async((task.pid, ), queue=queue, retry=False)

            additional_info = yaml_load(task.additional_info)
            for p in additional_info.get('child_processes', []):
                kill.apply_async((p, ), queue=queue, retry=False)

        provider.change_status(task, status)

    return task.status
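# Usage sketch (assumptions: a configured DB session and an existing task;
# the id = 42 and key = 'api' below are illustrative, not part of this
# module):
#
#   session = Session.create_session(key='api')
#   logger = create_logger(session, __name__)
#   task = TaskProvider(session).by_id(42, joinedload(Task.dag_rel))
#   stop(logger, session, task, task.dag_rel)
#
# Kill requests are routed to the '{computer}_{docker_img}_supervisor' queue,
# e.g. 'server1_default_supervisor' when the dag has no docker image set.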
class ExecuteBuilder:
    def __init__(self, id: int, repeat_count: int = 1, exit=True):
        self.session = Session.create_session(key='ExecuteBuilder')
        self.id = id
        self.repeat_count = repeat_count
        self.logger = create_logger(self.session, 'ExecuteBuilder')
        self.logger_db = create_logger(self.session, 'ExecuteBuilder.db',
                                       console=False)
        self.exit = exit

        self.provider = None
        self.library_provider = None
        self.storage = None
        self.task = None
        self.dag = None
        self.executor = None
        self.hostname = None
        self.docker_img = None
        self.worker_index = None
        self.queue_personal = None
        self.config = None
        self.executor_type = None

    def info(self, msg: str, step=None):
        self.logger.info(msg, ComponentType.Worker, self.hostname, self.id,
                         step)

    def error(self, msg: str, step=None):
        self.logger.error(msg, ComponentType.Worker, self.hostname, self.id,
                          step)

    def warning(self, msg: str, step=None):
        self.logger.warning(msg, ComponentType.Worker, self.hostname,
                            self.id, step)

    def debug(self, msg: str, step=None):
        self.logger.debug(msg, ComponentType.Worker, self.hostname, self.id,
                          step)

    def create_base(self):
        self.info('create_base')

        if app.current_task:
            app.current_task.update_state(state=states.SUCCESS)
            app.control.revoke(app.current_task.request.id, terminate=True)

        self.provider = TaskProvider(self.session)
        self.library_provider = DagLibraryProvider(self.session)
        self.storage = Storage(self.session)

        self.task = self.provider.by_id(
            self.id, joinedload(Task.dag_rel, innerjoin=True))
        if not self.task:
            raise Exception(f'task with id = {self.id} is not found')

        self.dag = self.task.dag_rel
        self.executor = None
        self.hostname = socket.gethostname()

        self.docker_img = DOCKER_IMG
        self.worker_index = os.getenv('WORKER_INDEX', -1)

        self.queue_personal = f'{self.hostname}_{self.docker_img}_' \
                              f'{self.worker_index}'

        self.config = Config.from_yaml(self.dag.config)
        set_global_seed(self.config['info'].get('seed', 0))

        self.executor_type = self.config['executors'][
            self.task.executor]['type']

        executor = self.config['executors'][self.task.executor]

        cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', '')
        self.info(f'Env. before execution '
                  f'CUDA_VISIBLE_DEVICES={cuda_visible_devices}')

        if cuda_visible_devices.strip() != '':
            gpu_assigned = self.task.gpu_assigned or ''
            cuda_visible_devices = cuda_visible_devices.split(',')
            cuda_visible_devices = ','.join([
                cuda_visible_devices[int(g)]
                for g in gpu_assigned.split(',') if g.strip() != ''
            ])
        else:
            cuda_visible_devices = self.task.gpu_assigned

        cuda_visible_devices = cuda_visible_devices or ''

        env = {
            'MKL_NUM_THREADS': 1,
            'OMP_NUM_THREADS': 1,
            'CUDA_VISIBLE_DEVICES': cuda_visible_devices
        }
        env.update(executor.get('env', {}))

        for k, v in env.items():
            os.environ[k] = str(v)
            self.info(f'Set env. {k} = {v}')

    def check_status(self):
        self.info('check_status')

        assert self.dag is not None, 'You must fetch task with dag_rel'

        if self.task.status >= TaskStatus.InProgress.value:
            msg = f'Task = {self.task.id}. Status = {self.task.status}, ' \
                  f'before the execute_by_id invocation.'
            if app.current_task:
                msg += f' Request Id = {app.current_task.request.id}'
            self.error(msg)
            return True

    def change_status(self):
        self.info('change_status')

        self.task.computer_assigned = self.hostname
        self.task.pid = os.getpid()
        self.task.worker_index = self.worker_index
        self.task.docker_assigned = self.docker_img
        self.provider.change_status(self.task, TaskStatus.InProgress)

    def download(self):
        self.info('download')

        if not self.task.debug:
            folder = self.storage.download(task=self.id)
        else:
            folder = os.getcwd()

        os.chdir(folder)

        libraries = self.library_provider.dag(self.task.dag)
        executor_type = self.executor_type

        self.info('download. folder changed')

        mlcomp_executors_folder = join(dirname(abspath(__file__)),
                                       'executors')
        mlcomp_base_folder = os.path.abspath(
            join(mlcomp_executors_folder, '../../../'))

        imported, was_installation = self.storage.import_executor(
            mlcomp_executors_folder, mlcomp_base_folder, executor_type)
        if not imported:
            imported, was_installation = self.storage.import_executor(
                folder, folder, executor_type, libraries)
            if not imported:
                raise Exception(f'Executor = {executor_type} not found')

        self.info('download. executor imported')

        if was_installation and not self.task.debug:
            if self.repeat_count > 0:
                self.info('was installation. '
                          'Setting the task status to Queued '
                          'and resending the task to the queue')
                self.task.status = TaskStatus.Queued.value
                self.provider.commit()
                try:
                    execute.apply_async((self.id, self.repeat_count - 1),
                                        queue=self.queue_personal,
                                        retry=False)
                except Exception:
                    pass
                finally:
                    sys.exit()

        assert Executor.is_registered(executor_type), \
            f'Executor {executor_type} was not found'

    def create_executor(self):
        self.info('create_executor')

        additional_info = yaml_load(self.task.additional_info) \
            if self.task.additional_info else dict()
        self.executor = Executor.from_config(
            executor=self.task.executor, config=self.config,
            additional_info=additional_info, session=self.session,
            logger=self.logger, logger_db=self.logger_db)

    def execute(self):
        self.info('execute start')

        res = self.executor(task=self.task, task_provider=self.provider,
                            dag=self.dag)
        self.info('execute executor finished')

        res = res or {}
        self.task.result = yaml_dump(res)
        self.provider.commit()

        if 'stage' in res and 'stages' in res:
            index = res['stages'].index(res['stage'])
            if index < len(res['stages']) - 1:
                self.executor.info(f'stage = {res["stage"]} done. '
                                   f'Go to the stage = '
                                   f'{res["stages"][index + 1]}')

                time.sleep(3)

                self.executor.info(
                    f'sending {(self.id, self.repeat_count)} '
                    f'to {self.queue_personal}')

                self.task.status = TaskStatus.Queued.value
                self.provider.commit()

                execute.apply_async((self.id, self.repeat_count),
                                    queue=self.queue_personal, retry=False)
                return

        self.executor.step.finish()
        self.provider.change_status(self.task, TaskStatus.Success)

        self.info('execute end')

    def build(self):
        try:
            self.create_base()

            bad_status = self.check_status()
            if bad_status:
                return

            self.change_status()
            self.download()
            self.create_executor()
            self.execute()
        except Exception as e:
            step = self.executor.step.id if \
                (self.executor and self.executor.step) else None

            if Session.sqlalchemy_error(e):
                Session.cleanup(key='ExecuteBuilder')
                self.session = Session.create_session(key='ExecuteBuilder')
                self.logger.session = create_logger(self.session,
                                                    'ExecuteBuilder')

            self.error(traceback.format_exc(), step)
            if self.task.status <= TaskStatus.InProgress.value:
                self.provider.change_status(self.task, TaskStatus.Failed)
            raise e
        finally:
            if app.current_task:
                app.close()

            if self.exit:
                # noinspection PyProtectedMember
                os._exit(0)
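# A hedged sketch (illustration only, not used by the worker) of the
# CUDA_VISIBLE_DEVICES remapping in ExecuteBuilder.create_base above:
# task.gpu_assigned holds *logical* indices into the already-restricted
# CUDA_VISIBLE_DEVICES list, so with CUDA_VISIBLE_DEVICES='2,3,5' and
# gpu_assigned='0,2' the child process must see physical devices '2,5'.
def _remap_cuda_visible_devices_example() -> str:
    cuda_visible_devices = '2,3,5'.split(',')
    gpu_assigned = '0,2'
    remapped = ','.join(cuda_visible_devices[int(g)]
                        for g in gpu_assigned.split(',')
                        if g.strip() != '')
    assert remapped == '2,5'
    return remapped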
class SupervisorBuilder:
    def __init__(self):
        self.session = Session.create_session(key='SupervisorBuilder')
        self.logger = create_logger(self.session, 'SupervisorBuilder')
        self.provider = None
        self.computer_provider = None
        self.docker_provider = None
        self.auxiliary_provider = None
        self.dag_provider = None
        self.queues = None
        self.not_ran_tasks = None
        self.dep_status = None
        self.computers = None
        self.auxiliary = {}
        self.tasks = []
        self.tasks_stop = []
        self.dags_start = []
        self.sent_tasks = 0

    def create_base(self):
        self.session.commit()

        self.provider = TaskProvider(self.session)
        self.computer_provider = ComputerProvider(self.session)
        self.docker_provider = DockerProvider(self.session)
        self.auxiliary_provider = AuxiliaryProvider(self.session)
        self.dag_provider = DagProvider(self.session)

        self.queues = [
            f'{d.computer}_{d.name}' for d in self.docker_provider.all()
            if d.last_activity >= now() - datetime.timedelta(seconds=15)
        ]
        self.auxiliary['queues'] = self.queues

    def load_tasks(self):
        self.tasks = self.provider.by_status(TaskStatus.NotRan,
                                             TaskStatus.InProgress,
                                             TaskStatus.Queued)
        not_ran_tasks = [
            t for t in self.tasks if t.status == TaskStatus.NotRan.value
        ]
        self.not_ran_tasks = [task for task in not_ran_tasks
                              if not task.debug]
        self.not_ran_tasks = sorted(self.not_ran_tasks,
                                    key=lambda x: x.gpu or 0,
                                    reverse=True)

        self.logger.debug(f'Found {len(not_ran_tasks)} not ran tasks',
                          ComponentType.Supervisor)

        self.dep_status = self.provider.dependency_status(
            self.not_ran_tasks)
        self.auxiliary['not_ran_tasks'] = [
            {
                'id': t.id,
                'name': t.name,
                'dep_status': [
                    TaskStatus(s).name
                    for s in self.dep_status.get(t.id, set())
                ]
            } for t in not_ran_tasks[:5]
        ]

    def load_computers(self):
        computers = self.computer_provider.computers()
        for computer in computers.values():
            computer['gpu'] = [0] * computer['gpu']
            computer['ports'] = set()
            computer['cpu_total'] = computer['cpu']
            computer['memory_total'] = computer['memory']
            computer['gpu_total'] = len(computer['gpu'])
            computer['can_process_tasks'] = computer['can_process_tasks']

        tasks = [
            t for t in self.tasks
            if t.status in [TaskStatus.InProgress.value,
                            TaskStatus.Queued.value]
        ]

        for task in tasks:
            if task.computer_assigned is None:
                continue

            assigned = task.computer_assigned
            comp_assigned = computers[assigned]
            comp_assigned['cpu'] -= task.cpu

            if task.gpu_assigned is not None:
                for g in task.gpu_assigned.split(','):
                    comp_assigned['gpu'][int(g)] = task.id
            comp_assigned['memory'] -= task.memory * 1024

            info = yaml_load(task.additional_info)
            if 'distr_info' in info:
                dist_info = info['distr_info']
                if dist_info['rank'] == 0:
                    comp_assigned['ports'].add(dist_info['master_port'])

        self.computers = [
            {
                **value,
                'name': name
            } for name, value in computers.items()
        ]
        self.auxiliary['computers'] = self.computers

    def process_to_celery(self, task: Task, queue: str, computer: dict):
        r = execute.apply_async((task.id,), queue=queue, retry=False)
        task.status = TaskStatus.Queued.value
        task.computer_assigned = computer['name']
        task.celery_id = r.id

        if task.computer_assigned is not None:
            if task.gpu_assigned:
                for g in map(int, task.gpu_assigned.split(',')):
                    computer['gpu'][g] = task.id
            computer['cpu'] -= task.cpu
            computer['memory'] -= task.memory * 1024

        self.logger.info(f'Sent task={task.id} to celery. Queue = {queue} '
                         f'Task status = {task.status} '
                         f'Celery_id = {r.id}',
                         ComponentType.Supervisor)

        self.provider.update()

    def create_service_task(self, task: Task, gpu_assigned=None,
                            distr_info: dict = None, resume: dict = None):
        new_task = Task(
            name=task.name,
            computer=task.computer,
            executor=task.executor,
            status=TaskStatus.NotRan.value,
            type=TaskType.Service.value,
            gpu_assigned=gpu_assigned,
            parent=task.id,
            report=task.report,
            dag=task.dag
        )
        new_task.additional_info = task.additional_info

        if distr_info:
            additional_info = yaml_load(new_task.additional_info)
            additional_info['distr_info'] = distr_info
            new_task.additional_info = yaml_dump(additional_info)

        if resume:
            additional_info = yaml_load(new_task.additional_info)
            additional_info['resume'] = resume
            new_task.additional_info = yaml_dump(additional_info)

        return self.provider.add(new_task)

    def find_port(self, c: dict, docker_name: str):
        docker = self.docker_provider.get(c['name'], docker_name)
        ports = list(map(int, docker.ports.split('-')))
        for p in range(ports[0], ports[1] + 1):
            if p not in c['ports']:
                return p
        raise Exception(f'All ports in {c["name"]} are taken')

    def _process_task_valid_computer(self, task: Task, c: dict,
                                     single_node: bool):
        if not c['can_process_tasks']:
            return 'this computer can not process tasks'

        if task.computer is not None and task.computer != c['name']:
            return 'name set in the config != name of this computer'

        if task.cpu > c['cpu']:
            return f'task cpu = {task.cpu} > computer ' \
                   f'free cpu = {c["cpu"]}'

        if task.memory > c['memory']:
            return f'task memory = {task.memory} > computer ' \
                   f'free memory = {c["memory"]}'

        queue = f'{c["name"]}_{task.dag_rel.docker_img or "default"}'
        if queue not in self.queues:
            return f'required queue = {queue} not in queues'

        if task.gpu > 0 and not any(g == 0 for g in c['gpu']):
            return 'task requires gpu, but there is no free gpu'

        free_gpu = sum(g == 0 for g in c['gpu'])
        if single_node and task.gpu > free_gpu:
            return f'task requires {task.gpu} gpus, ' \
                   f'but there are only {free_gpu} free'

    def _process_task_get_computers(self, executor: dict, task: Task,
                                    auxiliary: dict):
        single_node = executor.get('single_node', True)

        computers = []
        for c in self.computers:
            error = self._process_task_valid_computer(task, c, single_node)
            auxiliary['computers'].append({'name': c['name'],
                                           'error': error})
            if not error:
                computers.append(c)

        if task.gpu > 0 and single_node and len(computers) > 0:
            # take the single computer with the most free gpus
            computers = sorted(computers,
                               key=lambda x: sum(g == 0 for g in x['gpu']),
                               reverse=True)[:1]

        free_gpu = sum(sum(g == 0 for g in c['gpu']) for c in computers)
        if task.gpu > free_gpu:
            auxiliary['not_valid'] = f'gpu required by the ' \
                                     f'task = {task.gpu}, ' \
                                     f'but there are only {free_gpu} ' \
                                     f'free gpus'
            return []

        return computers

    def _process_task_to_send(self, executor: dict, task: Task,
                              computers: List[dict]):
        distr = executor.get('distr', True)
        to_send = []

        for computer in computers:
            queue = f'{computer["name"]}_' \
                    f'{task.dag_rel.docker_img or "default"}'

            if task.gpu_max > 1 and distr:
                for index, task_taken_gpu in enumerate(computer['gpu']):
                    if task_taken_gpu:
                        continue
                    to_send.append([computer, queue, index])

                    if len(to_send) >= task.gpu_max:
                        break

                if len(to_send) >= task.gpu_max:
                    break
            elif task.gpu_max > 0:
                cuda_devices = []
                for index, task_taken_gpu in enumerate(computer['gpu']):
                    if task_taken_gpu:
                        continue
                    cuda_devices.append(index)

                    if len(cuda_devices) >= task.gpu_max:
                        break

                task.gpu_assigned = ','.join(map(str, cuda_devices))
                self.process_to_celery(task, queue, computer)
            else:
                self.process_to_celery(task, queue, computer)
                break

        return to_send
    def process_task(self, task: Task):
        auxiliary = self.auxiliary['process_tasks'][-1]
        auxiliary['computers'] = []

        config = yaml_load(task.dag_rel.config)
        executor = config['executors'][task.executor]

        computers = self._process_task_get_computers(executor, task,
                                                     auxiliary)
        if len(computers) == 0:
            return

        to_send = self._process_task_to_send(executor, task, computers)
        auxiliary['to_send'] = to_send[:5]

        additional_info = yaml_load(task.additional_info)

        rank = 0
        master_port = None
        if len(to_send) > 0:
            master_port = self.find_port(to_send[0][0],
                                         to_send[0][1].split('_')[1])

            computer_names = {c['name'] for c, _, __ in to_send}
            if len(computer_names) == 1:
                task.computer_assigned = list(computer_names)[0]

        for computer, queue, gpu_assigned in to_send:
            main_cmp = to_send[0][0]
            # noinspection PyTypeChecker
            ip = 'localhost' if computer['name'] == main_cmp['name'] \
                else main_cmp['ip']
            distr_info = {
                'master_addr': ip,
                'rank': rank,
                'local_rank': gpu_assigned,
                'master_port': master_port,
                'world_size': len(to_send),
                'master_computer': main_cmp['name']
            }

            service_task = self.create_service_task(
                task,
                distr_info=distr_info,
                gpu_assigned=gpu_assigned,
                resume=additional_info.get('resume')
            )
            self.process_to_celery(service_task, queue, computer)
            rank += 1
            main_cmp['ports'].add(master_port)

        if len(to_send) > 0:
            task.status = TaskStatus.Queued.value

        self.sent_tasks += len(to_send)

    def process_tasks(self):
        self.auxiliary['process_tasks'] = []

        for task in self.not_ran_tasks:
            auxiliary = {'id': task.id, 'name': task.name}
            self.auxiliary['process_tasks'].append(auxiliary)

            if task.dag_rel is None:
                task.dag_rel = self.dag_provider.by_id(task.dag)

            if TaskStatus.Stopped.value in self.dep_status[task.id] \
                    or TaskStatus.Failed.value in self.dep_status[task.id] \
                    or TaskStatus.Skipped.value in \
                    self.dep_status[task.id]:
                auxiliary['not_valid'] = 'stopped or failed in dep_status'
                self.provider.change_status(task, TaskStatus.Skipped)
                continue

            if len(self.dep_status[task.id]) != 0 \
                    and self.dep_status[task.id] != \
                    {TaskStatus.Success.value}:
                auxiliary['not_valid'] = 'not all dep tasks are finished'
                continue

            self.process_task(task)

        self.auxiliary['process_tasks'] = \
            self.auxiliary['process_tasks'][:5]

    def _stop_child_tasks(self, task: Task):
        self.provider.commit()

        children = self.provider.children(task.id, [Task.dag_rel])
        dags = [c.dag_rel for c in children]
        for c, d in zip(children, dags):
            celery_tasks.stop(self.logger, self.session, c, d)

    def process_parent_tasks(self):
        tasks = self.provider.parent_tasks_stats()

        was_change = False
        for task, started, finished, statuses in tasks:
            status = task.status
            if statuses[TaskStatus.Failed] > 0:
                status = TaskStatus.Failed.value
            elif statuses[TaskStatus.Skipped] > 0:
                status = TaskStatus.Skipped.value
            elif statuses[TaskStatus.Queued] > 0:
                status = TaskStatus.Queued.value
            elif statuses[TaskStatus.InProgress] > 0:
                status = TaskStatus.InProgress.value
            elif statuses[TaskStatus.Success] > 0:
                status = TaskStatus.Success.value

            if status != task.status:
                if status == TaskStatus.InProgress.value:
                    task.started = started
                elif status >= TaskStatus.Failed.value:
                    task.started = started
                    task.finished = finished

                    self._stop_child_tasks(task)

                was_change = True
                task.status = status

        if was_change:
            self.provider.commit()

        self.auxiliary['parent_tasks_stats'] = [
            {
                'name': task.name,
                'id': task.id,
                'started': task.started,
                'finished': finished,
                'statuses': [
                    {
                        'name': k.name,
                        'count': v
                    } for k, v in statuses.items()
                ],
            } for task, started, finished, statuses in tasks[:5]
        ]

    def write_auxiliary(self):
        self.auxiliary['duration'] = \
            (now() - self.auxiliary['time']).total_seconds()
        auxiliary = Auxiliary(name='supervisor',
                              data=yaml_dump(self.auxiliary))
        if len(auxiliary.data) > 16000:
            return

        self.auxiliary_provider.create_or_update(auxiliary, 'name')

    def stop_tasks(self, tasks: List[Task]):
        self.tasks_stop.extend([t.id for t in tasks])

    def process_stop_tasks(self):
        # Skip tasks that have not started yet and kill the running ones
        if len(self.tasks_stop) == 0:
            return

        tasks = self.provider.by_ids(self.tasks_stop)
        tasks_not_ran = [
            t.id for t in tasks
            if t.status in [TaskStatus.NotRan.value,
                            TaskStatus.Queued.value]
        ]
        tasks_started = [
            t for t in tasks if t.status in [TaskStatus.InProgress.value]
        ]
        tasks_started_ids = [t.id for t in tasks_started]

        self.provider.change_status_all(tasks=tasks_not_ran,
                                        status=TaskStatus.Skipped)

        pids = []
        for task in tasks_started:
            if task.pid:
                pids.append((task.computer_assigned, task.pid))

            additional_info = yaml_load(task.additional_info)
            for p in additional_info.get('child_processes', []):
                pids.append((task.computer_assigned, p))

        for computer, queue in self.docker_provider.queues_online():
            pids_computer = [p for c, p in pids if c == computer]
            if len(pids_computer) > 0:
                celery_tasks.kill_all.apply_async((pids_computer,),
                                                  queue=queue,
                                                  retry=False)

        self.provider.change_status_all(tasks=tasks_started_ids,
                                        status=TaskStatus.Stopped)
        self.tasks_stop = []

    def fast_check(self):
        if self.provider is None or self.computer_provider is None:
            return False
        if self.not_ran_tasks is None or self.queues is None:
            return False
        if len(self.tasks_stop) > 0:
            return False
        if len(self.dags_start) > 0:
            return False
        if len(self.auxiliary.get('to_send', [])) > 0:
            return False

        queues = {
            f'{d.computer}_{d.name}' for d in self.docker_provider.all()
            if d.last_activity >= now() - datetime.timedelta(seconds=15)
        }
        if queues != set(self.queues):
            return False

        tasks = self.provider.by_status(TaskStatus.NotRan,
                                        TaskStatus.Queued,
                                        TaskStatus.InProgress)

        tasks_set = {
            t.id for t in tasks
            if t.status == TaskStatus.NotRan.value and not t.debug
        }
        tasks_set2 = {
            t.id for t in self.tasks
            if t.status == TaskStatus.NotRan.value
        }
        if tasks_set != tasks_set2:
            return False

        tasks_set = {
            t.id for t in tasks
            if t.status == TaskStatus.InProgress.value
        }
        tasks_set2 = {
            t.id for t in self.tasks
            if t.status == TaskStatus.InProgress.value
        }
        if tasks_set != tasks_set2:
            return False

        tasks_set = {
            t.id for t in tasks if t.status == TaskStatus.Queued.value
        }
        tasks_set2 = {
            t.id for t in self.tasks
            if t.status == TaskStatus.Queued.value
        }
        if tasks_set != tasks_set2:
            return False

        return True

    def start_dag(self, id: int):
        self.dags_start.append(id)

    def process_start_dags(self):
        if len(self.dags_start) == 0:
            return

        for id in self.dags_start:
            can_start_statuses = [
                TaskStatus.Failed.value, TaskStatus.Skipped.value,
                TaskStatus.Stopped.value
            ]
            tasks = self.provider.by_dag(id)
            children_all = self.provider.children([t.id for t in tasks])

            def find_resume(task):
                children = [c for c in children_all
                            if c.parent == task.id]
                children = sorted(children, key=lambda x: x.id,
                                  reverse=True)
                if len(children) > 0:
                    for c in children:
                        if c.parent != task.id:
                            continue

                        info = yaml_load(c.additional_info)
                        if 'distr_info' not in info:
                            continue

                        if info['distr_info']['rank'] == 0:
                            return {
                                'master_computer': c.computer_assigned,
                                'master_task_id': c.id,
                                'load_last': True
                            }

                    raise Exception('Master task not found')
                else:
                    return {
                        'master_computer': task.computer_assigned,
                        'master_task_id': task.id,
                        'load_last': True
                    }
            for t in tasks:
                if t.status not in can_start_statuses:
                    continue
                if t.parent:
                    continue

                if t.type == TaskType.Train.value:
                    info = yaml_load(t.additional_info)
                    info['resume'] = find_resume(t)
                    t.additional_info = yaml_dump(info)

                t.status = TaskStatus.NotRan.value
                t.pid = None
                t.started = None
                t.finished = None
                t.computer_assigned = None
                t.celery_id = None
                t.worker_index = None
                t.docker_assigned = None

            self.provider.commit()

        self.dags_start = []

    def build(self):
        try:
            # if self.fast_check():
            #     return

            self.auxiliary = {'time': now()}

            self.create_base()
            self.process_stop_tasks()
            self.process_start_dags()
            self.process_parent_tasks()
            self.load_tasks()
            self.load_computers()
            self.process_tasks()
            self.write_auxiliary()
        except ObjectDeletedError:
            pass
        except Exception as e:
            if Session.sqlalchemy_error(e):
                Session.cleanup(key='SupervisorBuilder')
                self.session = Session.create_session(
                    key='SupervisorBuilder')
                self.logger = create_logger(self.session,
                                            'SupervisorBuilder')

            self.logger.error(traceback.format_exc(),
                              ComponentType.Supervisor)
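# Usage sketch (assumptions: a periodic scheduler - e.g. a celery beat task
# or a plain loop - drives the supervisor; the 1-second interval is
# illustrative):
#
#   builder = SupervisorBuilder()
#   while True:
#       builder.build()  # one cycle: stop/start dags, dispatch tasks
#       time.sleep(1)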
class ExecuteBuilder:
    def __init__(self, id: int, repeat_count: int = 1, exit=True):
        self.session = Session.create_session(key='ExecuteBuilder')
        self.id = id
        self.repeat_count = repeat_count
        self.logger = create_logger(self.session, 'ExecuteBuilder')
        self.exit = exit

        self.provider = None
        self.library_provider = None
        self.storage = None
        self.task = None
        self.dag = None
        self.executor = None
        self.hostname = None
        self.docker_img = None
        self.worker_index = None
        self.queue_personal = None
        self.config = None
        self.executor_type = None

    def info(self, msg: str, step=None):
        self.logger.info(msg, ComponentType.Worker, self.hostname, self.id,
                         step)

    def error(self, msg: str, step=None):
        self.logger.error(msg, ComponentType.Worker, self.hostname, self.id,
                          step)

    def warning(self, msg: str, step=None):
        self.logger.warning(msg, ComponentType.Worker, self.hostname,
                            self.id, step)

    def debug(self, msg: str, step=None):
        self.logger.debug(msg, ComponentType.Worker, self.hostname, self.id,
                          step)

    def create_base(self):
        self.info('create_base')

        self.provider = TaskProvider(self.session)
        self.library_provider = DagLibraryProvider(self.session)
        self.storage = Storage(self.session)

        self.task = self.provider.by_id(
            self.id, joinedload(Task.dag_rel, innerjoin=True))
        if not self.task:
            raise Exception(f'task with id = {self.id} is not found')

        self.dag = self.task.dag_rel
        self.executor = None
        self.hostname = socket.gethostname()

        self.docker_img = DOCKER_IMG
        self.worker_index = os.getenv('WORKER_INDEX', -1)

        self.queue_personal = f'{self.hostname}_{self.docker_img}_' \
                              f'{self.worker_index}'

        self.config = Config.from_yaml(self.dag.config)
        self.executor_type = self.config['executors'][
            self.task.executor]['type']

    def check_status(self):
        self.info('check_status')

        assert self.dag is not None, 'You must fetch task with dag_rel'

        if self.task.status > TaskStatus.InProgress.value:
            msg = f'Task = {self.task.id}. Status = {self.task.status}, ' \
                  f'before the execute_by_id invocation'
            self.error(msg)
            raise Exception(msg)

    def change_status(self):
        self.info('change_status')

        self.task.computer_assigned = self.hostname
        self.task.pid = os.getpid()
        self.task.worker_index = self.worker_index
        self.task.docker_assigned = self.docker_img
        self.provider.change_status(self.task, TaskStatus.InProgress)

    def download(self):
        self.info('download')

        if not self.task.debug:
            folder = self.storage.download(task=self.id)
        else:
            folder = os.getcwd()

        os.chdir(folder)

        libraries = self.library_provider.dag(self.task.dag)
        executor_type = self.executor_type

        mlcomp_executors_folder = join(dirname(abspath(__file__)),
                                       'executors')
        mlcomp_base_folder = os.path.abspath(
            join(mlcomp_executors_folder, '../../../'))

        imported, was_installation = self.storage.import_executor(
            mlcomp_executors_folder, mlcomp_base_folder, executor_type)
        if not imported:
            imported, was_installation = self.storage.import_executor(
                folder, folder, executor_type, libraries)
            if not imported:
                raise Exception(f'Executor = {executor_type} not found')

        if was_installation and not self.task.debug:
            if self.repeat_count > 0:
                try:
                    self.warning(traceback.format_exc())
                    execute.apply_async((self.id, self.repeat_count - 1),
                                        queue=self.queue_personal)
                except Exception:
                    pass
                finally:
                    sys.exit()

        assert Executor.is_registered(executor_type), \
            f'Executor {executor_type} was not found'

    def create_executor(self):
        self.info('create_executor')

        additional_info = yaml_load(self.task.additional_info) \
            if self.task.additional_info else dict()
        self.executor = Executor.from_config(
            executor=self.task.executor, config=self.config,
            additional_info=additional_info, session=self.session,
            logger=self.logger)

    def execute(self):
        self.info('execute start')

        res = self.executor(task=self.task, task_provider=self.provider,
                            dag=self.dag)
        self.info('execute executor finished')

        res = res or {}
        self.task.result = yaml_dump(res)
        self.provider.commit()

        if 'stage' in res and 'stages' in res:
            index = res['stages'].index(res['stage'])
            if index < len(res['stages']) - 1:
                self.executor.info(f'stage = {res["stage"]} done. '
                                   f'Go to the stage = '
                                   f'{res["stages"][index + 1]}')

                time.sleep(3)

                self.executor.info(
                    f'sending {(self.id, self.repeat_count)} '
                    f'to {self.queue_personal}')
                execute.apply_async((self.id, self.repeat_count),
                                    queue=self.queue_personal)
                return

        self.executor.step.finish()
        self.provider.change_status(self.task, TaskStatus.Success)

        self.info('execute end')

    def build(self):
        try:
            self.create_base()
            self.check_status()
            self.change_status()
            self.download()
            self.create_executor()
            self.execute()
        except Exception as e:
            if Session.sqlalchemy_error(e):
                Session.cleanup(key='ExecuteBuilder')
                self.session = Session.create_session(key='ExecuteBuilder')
                self.logger.session = create_logger(self.session,
                                                    'ExecuteBuilder')

            step = self.executor.step.id if \
                (self.executor and self.executor.step) else None
            self.error(traceback.format_exc(), step)
            self.provider.change_status(self.task, TaskStatus.Failed)
            raise e
        finally:
            if app.current_task:
                app.current_task.update_state(state=states.SUCCESS)
                app.close()

            if self.exit:
                # noinspection PyProtectedMember
                os._exit(0)
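# Usage sketch (assumptions: the task row already exists in the DB and the
# worker environment provides DOCKER_IMG / WORKER_INDEX; id = 42 is
# illustrative):
#
#   ExecuteBuilder(id=42, repeat_count=1, exit=False).build()
#
# With exit=True (the default) build() terminates the process via os._exit(0)
# in its finally block, so in-process callers should pass exit=False.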