def project_edit():
    data = request_data()
    provider = ProjectProvider(_write_session)
    res = provider.edit_project(
        data['name'],
        yaml_load(data['class_names']),
        yaml_load(data['ignore_folders'])
    )
    return res
def space_run():
    data = request_data()
    provider = SpaceProvider(_write_session)

    file_changes = data.get('file_changes', '\n')
    file_changes = yaml_load(file_changes)

    def merge(d: dict, d2: dict):
        # combine two dicts one level deep:
        # lists are concatenated, nested dicts are updated
        res = {}
        for k in set(d) | set(d2):
            if k in d and k in d2:
                v = d[k]
                v2 = d2[k]
                if isinstance(v, list) and isinstance(v2, list):
                    res[k] = v[:]
                    res[k].extend(v2)
                elif isinstance(v, dict) and isinstance(v2, dict):
                    res[k] = v.copy()
                    res[k].update(v2)
                else:
                    raise Exception(
                        f'Types are different: {type(v)}, {type(v2)}')
            elif k in d:
                res[k] = d[k]
            elif k in d2:
                res[k] = d2[k]
        return res

    suffix = []
    for space in data['spaces']:
        if space['logic'] == 'and':
            space = provider.by_id(space['value'], key_column='name')
            if space.content:
                d = yaml_load(space.content)
                file_changes = merge(file_changes, d)
            suffix.append(space.name)

    for space in data['spaces']:
        if space['logic'] != 'or':
            continue

        space = provider.by_id(space['value'], key_column='name')
        space_related = provider.related(space.name)
        if space.content:
            space_related += [space]

        for rel in space_related:
            content = rel.content
            d = yaml_load(content)
            d = merge(file_changes, d)
            dag_suffix = ' '.join(suffix + [rel.name])
            dag_copy(
                _write_session,
                data['dag'],
                file_changes=yaml_dump(d),
                dag_suffix=dag_suffix
            )

    if not any(s['logic'] == 'or' for s in data['spaces']):
        dag_copy(
            _write_session,
            data['dag'],
            file_changes=yaml_dump(file_changes),
            dag_suffix=' '.join(suffix)
        )
def create_service_task(
    self, task: Task, gpu_assigned=None, distr_info: dict = None,
    resume: dict = None
):
    new_task = Task(
        name=task.name,
        computer=task.computer,
        executor=task.executor,
        status=TaskStatus.NotRan.value,
        type=TaskType.Service.value,
        gpu_assigned=gpu_assigned,
        parent=task.id,
        report=task.report,
        dag=task.dag
    )
    new_task.additional_info = task.additional_info

    if distr_info:
        additional_info = yaml_load(new_task.additional_info)
        additional_info['distr_info'] = distr_info
        new_task.additional_info = yaml_dump(additional_info)

    if resume:
        additional_info = yaml_load(new_task.additional_info)
        additional_info['resume'] = resume
        new_task.additional_info = yaml_dump(additional_info)

    return self.provider.add(new_task)
def project_add():
    data = request_data()
    provider = ProjectProvider(_write_session)
    provider.add_project(
        data['name'],
        yaml_load(data['class_names']),
        yaml_load(data['ignore_folders'])
    )
def dag_model_start(session: Session, data: dict):
    provider = ModelProvider(session)
    model = provider.by_id(data['model_id'])
    dag_provider = DagProvider(session)
    dag = dag_provider.by_id(data['dag'], joined_load=[Dag.project_rel])
    project = dag.project_rel
    src_config = Config.from_yaml(dag.config)
    pipe = src_config['pipes'][data['pipe']['name']]

    equations = yaml_load(model.equations)
    versions = data['pipe']['versions']

    if len(versions) > 0:
        version = data['pipe']['version']
        pipe_equations = yaml_load(version['equations'])
        found_version = versions[0]
        for v in versions:
            if v['name'] == version['name']:
                found_version = v
                break

        found_version['used'] = now()

        if len(pipe) == 1:
            pipe[list(pipe)[0]].update(pipe_equations)
        else:
            pipe.update(pipe_equations)

    equations[data['pipe']['name']] = versions
    model.equations = yaml_dump(equations)

    for v in pipe.values():
        v['model_id'] = model.id
        v['model_name'] = model.name

    config = {
        'info': {
            'name': data['pipe']['name'],
            'project': project.name
        },
        'executors': pipe
    }

    if model.dag:
        old_dag = dag_provider.by_id(model.dag)
        if old_dag.name != dag.name:
            model.dag = dag.id
    else:
        model.dag = dag.id

    provider.commit()

    dag_standard(
        session=session,
        config=config,
        debug=False,
        upload_files=False,
        copy_files_from=data['dag']
    )
def set_space_fields(space: Space, data: dict):
    data['content'] = data.get('content', '')
    # parse the content once to make sure it is valid YAML (raises otherwise)
    yaml_load(data['content'])

    space.name = data['name']
    space.content = data['content']
    if not space.created:
        space.created = now()
    space.changed = now()
    return space
def process_task(self, task: Task):
    auxiliary = self.auxiliary['process_tasks'][-1]
    auxiliary['computers'] = []

    config = yaml_load(task.dag_rel.config)
    executor = config['executors'][task.executor]

    computers = self._process_task_get_computers(executor, task, auxiliary)
    if len(computers) == 0:
        return

    to_send = self._process_task_to_send(executor, task, computers)
    auxiliary['to_send'] = to_send[:5]

    additional_info = yaml_load(task.additional_info)

    rank = 0
    master_port = None
    if len(to_send) > 0:
        master_port = self.find_port(
            to_send[0][0], to_send[0][1].split('_')[1]
        )

    computer_names = {c['name'] for c, _, __ in to_send}
    if len(computer_names) == 1:
        task.computer_assigned = list(computer_names)[0]

    for computer, queue, gpu_assigned in to_send:
        main_cmp = to_send[0][0]
        # noinspection PyTypeChecker
        ip = 'localhost' if computer['name'] == main_cmp['name'] \
            else main_cmp['ip']
        distr_info = {
            'master_addr': ip,
            'rank': rank,
            'local_rank': gpu_assigned,
            'master_port': master_port,
            'world_size': len(to_send),
            'master_computer': main_cmp['name']
        }

        service_task = self.create_service_task(
            task,
            distr_info=distr_info,
            gpu_assigned=gpu_assigned,
            resume=additional_info.get('resume')
        )
        self.process_to_celery(service_task, queue, computer)
        rank += 1
        main_cmp['ports'].add(master_port)

    if len(to_send) > 0:
        task.status = TaskStatus.Queued.value

    self.sent_tasks += len(to_send)
def computer_sync_end():
    data = request_data()
    provider = ComputerProvider(_write_session)

    for computer in provider.all():
        if data.get('computer') and data['computer'] != computer.name:
            continue

        meta = yaml_load(computer.meta)
        meta['manual_sync'] = {
            'project': data['id'],
            'ignore_folders': yaml_load(data['ignore_folders'])
        }
        computer.meta = yaml_dump(meta)
        provider.update()
def work(self):
    project = ProjectProvider(self.session).by_id(self.project)
    self.info(f'Task = {self.train_task} child_task: {self.child_task}')

    model = Model(
        created=now(),
        name=self.name,
        project=self.project,
        equations='',
        fold=self.fold
    )
    provider = ModelProvider(self.session)

    if self.train_task:
        task_provider = TaskProvider(self.session)
        dag_provider = DagProvider(self.session)
        task = task_provider.by_id(self.train_task)
        dag = dag_provider.by_id(task.dag)

        task_dir = join(TASK_FOLDER, str(self.child_task or task.id))

        # get log directory
        config = yaml_load(dag.config)
        executor_config = config['executors'][task.executor]
        catalyst_config_file = executor_config['args']['config']
        catalyst_config_file = join(task_dir, catalyst_config_file)
        catalyst_config = yaml_load(file=catalyst_config_file)
        catalyst_logdir = catalyst_config['args']['logdir']

        model.score_local = task.score

        src_log = f'{task_dir}/{catalyst_logdir}'
        models_dir = join(MODEL_FOLDER, project.name)
        os.makedirs(models_dir, exist_ok=True)

        model_path_tmp = f'{src_log}/traced.pth'
        traced = trace_model_from_checkpoint(src_log, self, file=self.file)

        model_path = f'{models_dir}/{model.name}.pth'
        model_weight_path = f'{models_dir}/{model.name}_weight.pth'
        torch.jit.save(traced, model_path_tmp)
        shutil.copy(model_path_tmp, model_path)

        file = self.file = 'best_full'
        shutil.copy(f'{src_log}/checkpoints/{file}.pth', model_weight_path)

    provider.add(model)
def sync_manual(self, computer: Computer, provider: ComputerProvider):
    """
    Called when the sync button was clicked manually
    """
    if not computer.meta:
        return

    meta = yaml_load(computer.meta)
    if 'manual_sync' not in meta:
        return

    manual_sync = meta['manual_sync']
    project_provider = ProjectProvider(self.session)
    docker_provider = DockerProvider(self.session)

    dockers = docker_provider.get_online()
    project = project_provider.by_id(manual_sync['project'])

    for docker in dockers:
        if docker.computer == computer.name:
            continue

        source = provider.by_name(docker.computer)
        ignore_folders = [
            [join('models', project.name), []]
        ]
        sync_directed(
            self.session,
            target=computer,
            source=source,
            ignore_folders=ignore_folders
        )

    del meta['manual_sync']
    computer.meta = yaml_dump(meta)
    provider.update()
def detail(self, id: int):
    report_obj = self.by_id(id)
    tasks = self.query(ReportTasks.task). \
        filter(ReportTasks.report == id).all()
    tasks = [t[0] for t in tasks]

    config = yaml_load(report_obj.config)
    report = ReportLayoutInfo(config)

    series = self.query(ReportSeries). \
        filter(ReportSeries.task.in_(tasks)). \
        order_by(ReportSeries.epoch). \
        options(joinedload(ReportSeries.task_rel, innerjoin=True)).all()

    items = dict()
    for s in report.series:
        items[s.name] = self._detail_series(series, s)

    for element in report.precision_recall + report.f1:
        items[element.name] = self._detail_single_img(id, element)

    for element in report.img_classify:
        items[element.name] = self.detail_img_classify_descr(id, element)

    for element in report.img_segment:
        items[element.name] = self.detail_img_segment_descr(id, element)

    return {
        'data': items,
        'layout': report.layout,
        'metric': report.metric.serialize()
    }
def __init__(
    self,
    session: Session,
    task: Task,
    layout: str,
    part: str = 'valid',
    name: str = 'img_classify',
    max_img_size: Tuple[int, int] = None,
    main_metric: str = 'accuracy',
    plot_count: int = 0
):
    self.session = session
    self.task = task
    self.layout = layout
    self.part = part
    self.name = name or 'img_classify'
    self.max_img_size = max_img_size
    self.main_metric = main_metric
    self.plot_count = plot_count

    self.dag_provider = DagProvider(session)
    self.report_provider = ReportProvider(session)
    self.layout_provider = ReportLayoutProvider(session)
    self.task_provider = TaskProvider(session)
    self.report_img_provider = ReportImgProvider(session)
    self.report_task_provider = ReportTasksProvider(session)
    self.report_series_provider = ReportSeriesProvider(session)

    self.project = self.task_provider.project(task.id).id
    self.layout = self.layout_provider.by_name(layout)
    self.layout_dict = yaml_load(self.layout.content)
def _dag(config: str, debug: bool = False, control_reqs=True,
         params: Tuple[str] = ()):
    logger = create_logger(_session, name='_dag')
    logger.info('started', ComponentType.Client)

    config_text = open(config, 'r').read()
    config_parsed = yaml_load(config_text)
    params = dict_from_list_str(params)
    config_parsed = merge_dicts_smart(config_parsed, params)
    config_text = yaml_dump(config_parsed)

    logger.info('config parsed', ComponentType.Client)

    type_name = config_parsed['info'].get('type', 'standard')
    if type_name == DagType.Standard.name.lower():
        return dag_standard(
            session=_session,
            config=config_parsed,
            debug=debug,
            config_text=config_text,
            config_path=config,
            control_reqs=control_reqs,
            logger=logger,
            component=ComponentType.Client
        )

    return dag_pipe(
        session=_session,
        config=config_parsed,
        config_text=config_text
    )
def load_computers(self):
    computers = self.computer_provider.computers()
    for computer in computers.values():
        # expand the gpu count into a per-device slot list:
        # a slot holds 0 when free or the occupying task id
        computer['gpu'] = [0] * computer['gpu']
        computer['ports'] = set()
        computer['cpu_total'] = computer['cpu']
        computer['memory_total'] = computer['memory']
        computer['gpu_total'] = len(computer['gpu'])

    for task in self.provider.by_status(TaskStatus.Queued,
                                        TaskStatus.InProgress):
        if task.computer_assigned is None:
            continue

        assigned = task.computer_assigned
        comp_assigned = computers[assigned]
        comp_assigned['cpu'] -= task.cpu

        if task.gpu_assigned is not None:
            for g in task.gpu_assigned.split(','):
                comp_assigned['gpu'][int(g)] = task.id
        comp_assigned['memory'] -= task.memory * 1024

        info = yaml_load(task.additional_info)
        if 'distr_info' in info:
            dist_info = info['distr_info']
            if dist_info['rank'] == 0:
                comp_assigned['ports'].add(dist_info['master_port'])

    self.computers = [{
        **value,
        'name': name
    } for name, value in computers.items()]

    self.auxiliary['computers'] = self.computers
def get(self):
    query = self.query(self.model)
    res = dict()
    for r in query.all():
        res[r.name] = yaml_load(r.data)
        res[r.name] = self.serializer(res[r.name])
    return res
def __init__(
    self,
    session: Session,
    task: Task,
    layout: str,
    part: str = 'valid',
    name: str = 'img_segment',
    max_img_size: Tuple[int, int] = None,
    stack_type: str = 'vertical',
    main_metric: str = 'dice',
    plot_count: int = 0,
    colors: List[Tuple] = None
):
    self.session = session
    self.task = task
    self.layout = layout
    self.part = part
    self.name = name or 'img_segment'
    self.max_img_size = max_img_size
    self.stack_type = stack_type
    self.main_metric = main_metric
    self.colors = colors
    self.plot_count = plot_count

    self.dag_provider = DagProvider(session)
    self.report_provider = ReportProvider(session)
    self.layout_provider = ReportLayoutProvider(session)
    self.task_provider = TaskProvider(session)
    self.report_img_provider = ReportImgProvider(session)
    self.report_task_provider = ReportTasksProvider(session)
    self.report_series_provider = ReportSeriesProvider(session)

    self.project = self.task_provider.project(task.id).id
    self.layout = self.layout_provider.by_name(layout)
    self.layout_dict = yaml_load(self.layout.content)

    self.create_base()
def sync(project: str, computer: str, only_from: bool, only_to: bool):
    _create_computer()

    computer = computer or socket.gethostname()
    provider = ComputerProvider(_session)
    project_provider = ProjectProvider(_session)
    computer = provider.by_name(computer)
    computers = provider.all()

    folders_excluded = []
    p = project_provider.by_name(project)
    assert p, f'Project={project} is not found'

    ignore = yaml_load(p.ignore_folders)
    excluded = []
    for f in ignore:
        excluded.append(str(f))

    folders_excluded.append([join('data', p.name), excluded])
    folders_excluded.append([join('models', p.name), []])

    for c in computers:
        if c.name != computer.name:
            if not only_from:
                sync_directed(_session, computer, c, folders_excluded)
            if not only_to:
                sync_directed(_session, c, computer, folders_excluded)
def stop_processes_not_exist(session: Session, logger):
    provider = TaskProvider(session)
    hostname = socket.gethostname()
    tasks = provider.by_status(
        TaskStatus.InProgress,
        task_docker_assigned=DOCKER_IMG,
        computer_assigned=hostname
    )

    for t in tasks:
        if not psutil.pid_exists(t.pid):
            # tasks can retry the execution
            if (now() - t.last_activity).total_seconds() < 30:
                continue

            os.system(f'kill -9 {t.pid}')
            t.status = TaskStatus.Failed.value
            logger.error(
                f'process with pid = {t.pid} does not exist. '
                f'Set task to failed state',
                ComponentType.WorkerSupervisor, hostname, t.id
            )
            provider.commit()

            additional_info = yaml_load(t.additional_info)
            for p in additional_info.get('child_processes', []):
                logger.info(f'killing child process = {p}')
                os.system(f'kill -9 {p}')
def find_resume(task):
    children = task_provider.children(task.id)
    children = sorted(children, key=lambda x: x.id, reverse=True)

    if len(children) > 0:
        for c in children:
            if c.parent != task.id:
                continue

            info = yaml_load(c.additional_info)
            if 'distr_info' not in info:
                continue

            if info['distr_info']['rank'] == 0:
                return {
                    'master_computer': c.computer_assigned,
                    'master_task_id': c.id,
                    'load_last': True
                }

        raise Exception('Master task not found')
    else:
        return {
            'master_computer': task.computer_assigned,
            'master_task_id': task.id,
            'load_last': True
        }
def stop(logger, session: Session, task: Task, dag: Dag):
    provider = TaskProvider(session)
    if task.status > TaskStatus.InProgress.value:
        return task.status

    status = TaskStatus.Stopped
    try:
        if task.status != TaskStatus.NotRan.value:
            app.control.revoke(task.celery_id, terminate=True)
        else:
            status = TaskStatus.Skipped
    except Exception as e:
        if Session.sqlalchemy_error(e):
            try:
                logger.error(traceback.format_exc(), ComponentType.API)
            except Exception:
                pass
            raise

        logger.error(traceback.format_exc(), ComponentType.API)
    finally:
        if task.pid:
            queue = f'{task.computer_assigned}_' \
                    f'{dag.docker_img or "default"}_supervisor'
            kill.apply_async((task.pid, ), queue=queue, retry=False)

            additional_info = yaml_load(task.additional_info)
            for p in additional_info.get('child_processes', []):
                kill.apply_async((p, ), queue=queue, retry=False)

        provider.change_status(task, status)

    return task.status
def sync(self):
    hostname = socket.gethostname()
    try:
        provider = ComputerProvider(self.session)
        task_synced_provider = TaskSyncedProvider(self.session)

        computer = provider.by_name(hostname)
        sync_start = now()

        if FILE_SYNC_INTERVAL == 0:
            time.sleep(1)
        else:
            computers = provider.all_with_last_activtiy()
            computers = [
                c for c in computers
                if (now() - c.last_activity).total_seconds() < 10
            ]
            computers_names = {c.name for c in computers}

            for c, project, tasks in task_synced_provider.for_computer(
                    computer.name):
                if c.name not in computers_names:
                    self.logger.info(
                        f'Computer = {c.name} '
                        f'is offline. Can not sync',
                        ComponentType.WorkerSupervisor, hostname
                    )
                    continue

                if c.syncing_computer:
                    continue

                excluded = list(map(str, yaml_load(project.ignore_folders)))
                folders_excluded = [
                    [join('data', project.name), excluded],
                    [join('models', project.name), []]
                ]

                computer.syncing_computer = c.name
                provider.update()

                sync_directed(self.session, c, computer, folders_excluded)

                for t in tasks:
                    task_synced_provider.add(
                        TaskSynced(computer=computer.name, task=t.id)
                    )

            time.sleep(FILE_SYNC_INTERVAL)

        computer.last_synced = sync_start
        computer.syncing_computer = None
        provider.update()
    except Exception as e:
        if Session.sqlalchemy_error(e):
            Session.cleanup('FileSync')
            self.session = Session.create_session(key='FileSync')
            self.logger = create_logger(self.session, 'FileSync')

        self.logger.error(
            traceback.format_exc(),
            ComponentType.WorkerSupervisor, hostname
        )
def all(self):
    res = {
        s.name: yaml_load(s.content)
        for s in self.query(ReportLayout).all()
    }
    for k, v in res.items():
        res[k] = ReportLayoutInfo.union_layouts(k, res)
    return res
def grid_cells(grid: List):
    for i, row in enumerate(grid):
        row_type = type(row)
        if row_type == list:
            if len(row) == 0:
                raise Exception(f'Empty list at {i} position')
            if type(row[0]) != dict:
                raise Exception('List entries can be dicts only')
        elif row_type == dict:
            if len(row) != 1:
                raise Exception('Dict must contain only one element')

            key = list(row)[0]
            val_type = type(row[key])
            if val_type not in [list, str]:
                raise Exception('Dict value must be list or str')

            new_row = []
            if val_type == str:
                if '-' in row[key]:
                    # a 'start-end' string expands to the inclusive range
                    start, end = map(int, row[key].split('-'))
                    for p in range(start, end + 1):
                        new_row.append({key: p})
                else:
                    if key == '_folder':
                        # read every yml file from the given folder
                        for file in glob(join(row[key], '*.yml')):
                            new_row.append(yaml_load(file=file))
            else:
                for v in row[key]:
                    if key == '_file':
                        new_row.append(yaml_load(file=v))
                    else:
                        new_row.append({key: v})

            grid[i] = new_row
        else:
            raise Exception(f'Unknown type of row = {row_type}')

    res = list(product(*grid))
    for i, r in enumerate(res):
        d = {}
        for dd in r:
            d.update(dd)
        res[i] = d

    return [[r, cell_name(r)] for r in res]
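# A minimal usage sketch of grid_cells with hypothetical values; the exact
# cell-name strings depend on cell_name, defined elsewhere in this module:
#
#     grid = [
#         {'lr': '1-3'},             # a 'start-end' string expands to 1, 2, 3
#         {'batch_size': [32, 64]},  # list values are used as-is
#     ]
#     cells = grid_cells(grid)
#     # -> 6 cells, each like [{'lr': 1, 'batch_size': 32}, '<cell name>']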
def create_executor(self):
    self.info('create_executor')

    additional_info = yaml_load(self.task.additional_info) \
        if self.task.additional_info else dict()
    self.executor = Executor.from_config(
        executor=self.task.executor,
        config=self.config,
        additional_info=additional_info,
        session=self.session,
        logger=self.logger
    )
def submit():
    assert exists('submit.yml'), 'no file submit.yml'

    data = yaml_load(file='submit.yml')
    submit = Submit(
        competition=data['competition'],
        submit_type='kernel',
        max_size=data.get('max_size', 1),
        folders=data.get('folders', []),
        datasets=data.get('datasets', []),
        files=data.get('files', [])
    )
    submit.work()
def create_executor(self):
    self.info('create_executor')
    os.environ['CUDA_VISIBLE_DEVICES'] = self.task.gpu_assigned or ''

    additional_info = yaml_load(self.task.additional_info) \
        if self.task.additional_info else dict()
    self.executor = Executor.from_config(
        executor=self.task.executor,
        config=self.config,
        additional_info=additional_info,
        session=self.session,
        logger=self.logger
    )
def sync(project: str, computer: str, only_from: bool, only_to: bool,
         online: bool):
    """
    Syncs specified project on this computer with other computers
    """
    check_statuses()
    _create_computer()
    _create_docker()

    computer = computer or socket.gethostname()
    provider = ComputerProvider(_session)
    project_provider = ProjectProvider(_session)
    computer = provider.by_name(computer)
    computers = provider.all_with_last_activtiy()

    p = project_provider.by_name(project)
    assert p, f'Project={project} is not found'

    sync_folders = yaml_load(p.sync_folders)
    ignore_folders = yaml_load(p.ignore_folders)

    sync_folders = correct_folders(sync_folders, p.name)
    ignore_folders = correct_folders(ignore_folders, p.name)

    if not isinstance(sync_folders, list):
        sync_folders = []
    if not isinstance(ignore_folders, list):
        ignore_folders = []

    folders = [[s, ignore_folders] for s in sync_folders]

    for c in computers:
        if c.name != computer.name:
            if online and (now() - c.last_activity).total_seconds() > 100:
                continue

            if not only_from:
                sync_directed(_session, computer, c, folders)
            if not only_to:
                sync_directed(_session, c, computer, folders)
def model_start_begin(self, model_id: int):
    model = self.by_id(model_id)
    models_dags = self.query(Dag). \
        filter(Dag.type == DagType.Pipe.value). \
        filter(Dag.project == model.project). \
        order_by(Dag.id.desc()). \
        all()

    used_dag_names = set()
    versions = yaml_load(model.equations)

    res_dags = []
    res_dag = None
    for dag in models_dags:
        if dag.name in used_dag_names:
            continue

        config = Config.from_yaml(dag.config)
        d = {
            'name': dag.name,
            'id': dag.id,
            'pipes': [{
                'name': p
            } for p in config['pipes']]
        }
        for pipe in d['pipes']:
            pipe['versions'] = versions.get(pipe['name'], [])
            used = [
                v.get('used', datetime.datetime.min)
                for v in pipe['versions']
            ]
            pipe['used'] = datetime.datetime.min if len(used) == 0 \
                else max(used)

        d['pipes'] = sorted(d['pipes'], key=lambda x: x['used'],
                            reverse=True)
        for p in d['pipes']:
            del p['used']
            for v in p['versions']:
                if 'used' in v:
                    del v['used']

        used_dag_names.add(dag.name)
        res_dags.append(d)

        if d['id'] == model.dag:
            res_dag = d

    return {'dags': res_dags, 'dag': res_dag, 'model_id': model_id}
def sync_manual(self, computer: Computer, provider: ComputerProvider):
    """
    Called when the sync button was clicked manually
    """
    if not computer.meta:
        return

    meta = yaml_load(computer.meta)
    if 'manual_sync' not in meta:
        return

    manual_sync = meta['manual_sync']
    project_provider = ProjectProvider(self.session)
    docker_provider = DockerProvider(self.session)

    dockers = docker_provider.get_online()
    project = project_provider.by_id(manual_sync['project'])

    sync_folders = manual_sync['sync_folders']
    ignore_folders = manual_sync['ignore_folders']

    sync_folders = correct_folders(sync_folders, project.name)
    ignore_folders = correct_folders(ignore_folders, project.name)

    if not isinstance(sync_folders, list):
        sync_folders = []
    if not isinstance(ignore_folders, list):
        ignore_folders = []

    for docker in dockers:
        if docker.computer == computer.name:
            continue

        source = provider.by_name(docker.computer)
        folders = [[s, ignore_folders] for s in sync_folders]

        computer.syncing_computer = source.name
        provider.update()

        try:
            sync_directed(
                self.session,
                target=computer,
                source=source,
                folders=folders
            )
        except Exception as e:
            self.process_error(e)

    del meta['manual_sync']
    computer.meta = yaml_dump(meta)
    provider.update()
def report_layout_edit():
    data = request_data()
    provider = ReportLayoutProvider(_write_session)
    layout = provider.by_name(data['name'])
    layout.last_modified = now()

    if 'content' in data and data['content'] is not None:
        data_loaded = yaml_load(data['content'])
        # constructing ReportLayoutInfo validates the layout before saving
        ReportLayoutInfo(data_loaded)
        layout.content = data['content']

    if 'new_name' in data and data['new_name'] is not None:
        layout.name = data['new_name']

    provider.commit()