Example #1
0
    def __init__(self,
                 session: Session,
                 task: Task,
                 layout: str,
                 part: str = 'valid',
                 name: str = 'img_classify',
                 # NOTE: effectively Optional[Tuple[int, int]]; annotation
                 # kept as-is because a typing.Optional import is not
                 # visible in this file chunk
                 max_img_size: Tuple[int, int] = None,
                 main_metric: str = 'accuracy',
                 plot_count: int = 0):
        """Image-classification report callback.

        Args:
            session: active SQLAlchemy session shared by all providers.
            task: the task this report is attached to.
            layout: name of a report layout; resolved to its DB row below.
            part: dataset part the metrics refer to (e.g. 'valid').
            name: report name; falls back to 'img_classify' when falsy.
            max_img_size: optional (height, width) cap for stored images
                — TODO confirm axis order against callers.
            main_metric: metric used as the headline score.
            plot_count: number of images to plot.
        """
        self.session = session
        self.task = task
        self.part = part
        self.name = name or 'img_classify'
        self.max_img_size = max_img_size
        self.main_metric = main_metric
        self.plot_count = plot_count

        self.dag_provider = DagProvider(session)
        self.report_provider = ReportProvider(session)
        self.layout_provider = ReportLayoutProvider(session)
        self.task_provider = TaskProvider(session)
        self.report_img_provider = ReportImgProvider(session)
        self.report_task_provider = ReportTasksProvider(session)
        self.report_series_provider = ReportSeriesProvider(session)

        # id of the project the task belongs to
        self.project = self.task_provider.project(task.id).id
        # resolve the layout name to its DB row once; the raw name is not
        # stored because the original assigned it and immediately shadowed
        # it here (dead store removed)
        self.layout = self.layout_provider.by_name(layout)
        self.layout_dict = yaml_load(self.layout.content)
Example #2
0
    def create_providers(self):
        """Instantiate the DB providers this component works with."""
        self.log_info('create_providers')

        session = self.session
        self.dag_provider = DagProvider(session)
        self.task_provider = TaskProvider(session)
        self.file_provider = FileProvider(session)
        self.dag_storage_provider = DagStorageProvider(session)
Example #3
0
File: tasks.py  Project: jingmouren/mlcomp
    def create_base(self):
        """Load the task and its dag, and prepare the execution context.

        Raises:
            Exception: if no task with ``self.id`` exists.
        """
        self.info('create_base')

        self.provider = TaskProvider(self.session)
        self.library_provider = DagLibraryProvider(self.session)
        self.storage = Storage(self.session)

        # eager-join the dag relation so self.task.dag_rel is populated
        self.task = self.provider.by_id(
            self.id, joinedload(Task.dag_rel, innerjoin=True))
        if not self.task:
            raise Exception(f'task with id = {self.id} is not found')

        self.dag = self.task.dag_rel
        self.executor = None
        self.hostname = socket.gethostname()

        self.docker_img = DOCKER_IMG
        # NOTE(review): os.getenv returns a str when the variable is set,
        # but the default here is the int -1 — both render identically in
        # the f-string below; confirm no caller needs an int
        self.worker_index = os.getenv('WORKER_INDEX', -1)

        # per-worker queue name unique to host/docker image/worker slot
        self.queue_personal = f'{self.hostname}_{self.docker_img}_' \
                              f'{self.worker_index}'

        self.config = Config.from_yaml(self.dag.config)
        # executor type as declared in the dag's yaml config
        self.executor_type = self.config['executors'][
            self.task.executor]['type']
Example #4
0
File: describe.py  Project: xyuan/mlcomp
def describe_task_names(dag: int):
    """Return a DataFrame with the id and name of every task in *dag*.

    Side effect: widens global pandas display options so the resulting
    frame prints without truncation.
    """
    pd.set_option('display.max_columns', None)
    pd.set_option('display.expand_frame_repr', False)
    # -1 for max_colwidth was deprecated and removed in pandas >= 1.0;
    # None means "no limit", and the canonical key carries the
    # 'display.' prefix like the options above
    pd.set_option('display.max_colwidth', None)

    provider = TaskProvider()
    tasks = provider.by_dag(dag)
    return pd.DataFrame([{'id': t.id, 'name': t.name} for t in tasks])
Example #5
0
    def create_providers(self):
        """Create the DB providers and storage used by this component."""
        session = self.session

        self.provider = TaskProvider(session)
        self.report_provider = ReportProvider(session)
        self.report_tasks_provider = ReportTasksProvider(session)
        self.report_layout_provider = ReportLayoutProvider(session)
        self.project_provider = ProjectProvider(session)

        self.storage = Storage(session)
        self.dag_provider = DagProvider(session)
Example #6
0
File: standard.py  Project: kiminh/mlcomp
    def create_providers(self):
        """Create the DB providers and storage used by this executor."""
        self.log_info('create_providers')

        session = self.session
        self.provider = TaskProvider(session)
        self.report_provider = ReportProvider(session)
        self.report_tasks_provider = ReportTasksProvider(session)
        self.report_layout_provider = ReportLayoutProvider(session)
        self.project_provider = ProjectProvider(session)

        # storage here also receives the executor's logger and component
        self.storage = Storage(
            session, logger=self.logger, component=self.component)
        self.dag_provider = DagProvider(session)
Example #7
0
    def __init__(self, session: Session, logger=None,
                 component: ComponentType = None,
                 max_file_size: int = 10 ** 5, max_count: int = 10 ** 3):
        """Storage facade over dag/file/task/library providers.

        Args:
            session: active SQLAlchemy session shared by all providers.
            logger: optional logger used for progress/diagnostics.
            component: component identity attached to log records.
            max_file_size: size threshold, presumably in bytes
                — TODO confirm unit against callers.
            max_count: maximum number of files handled per operation
                — presumably; verify against usage sites.
        """
        self.file_provider = FileProvider(session)
        self.provider = DagStorageProvider(session)
        self.task_provider = TaskProvider(session)
        self.library_provider = DagLibraryProvider(session)
        self.dag_provider = DagProvider(session)

        self.logger = logger
        self.component = component
        self.max_file_size = max_file_size
        self.max_count = max_count
Example #8
0
File: signals.py  Project: xyuan/mlcomp
def task_before_update(mapper, connection, target):
    """SQLAlchemy before-update hook: refresh activity timestamps.

    Stamps the updated task's last_activity and propagates the same
    timestamp to its parent task, when one exists.
    """
    target.last_activity = now()

    if not target.parent:
        return

    provider = TaskProvider(_session)
    parent = provider.by_id(target.parent)
    if parent is None:
        return

    parent.last_activity = target.last_activity

    try:
        provider.commit()
    except StaleDataError:
        # a concurrent writer beat us to the parent row; best-effort only
        pass
Example #9
0
def stop_processes_not_exist(session: Session, logger):
    """Reconcile task records with the worker processes actually running.

    Two sweeps:
      1. Tasks marked InProgress whose pid no longer exists are set to
         Failed (after a 30s grace period since last activity, so a
         retrying task is not killed prematurely); their recorded child
         processes are killed as well.
      2. Worker processes that are still alive but whose task is already
         Stopped/Failed/Skipped are killed.
    """
    provider = TaskProvider(session)
    hostname = socket.gethostname()  # computed once; was duplicated below
    tasks = provider.by_status(
        TaskStatus.InProgress,
        task_docker_assigned=DOCKER_IMG,
        computer_assigned=hostname
    )

    # Sweep 1: kill task records whose process no longer exists
    for t in tasks:
        if psutil.pid_exists(t.pid):
            continue

        # tasks can retry the execution; give them a 30s grace period
        if (now() - t.last_activity).total_seconds() < 30:
            continue

        os.system(f'kill -9 {t.pid}')
        t.status = TaskStatus.Failed.value
        logger.error(
            f'process with pid = {t.pid} does not exist. '
            f'Set task to failed state',
            ComponentType.WorkerSupervisor, hostname, t.id
        )

        provider.commit()

        additional_info = yaml_load(t.additional_info)
        for p in additional_info.get('child_processes', []):
            logger.info(f'killing child process = {p}')
            os.system(f'kill -9 {p}')

    # Sweep 2: kill processes which exist but should not
    processes = get_pid('worker ')
    ids = [p['PID'] for p in processes]
    # NOTE(review): ids here are OS pids, but by_ids presumably queries by
    # task id while the lookup below keys on t.pid — confirm this is the
    # intended contract of TaskProvider.by_ids
    tasks = provider.by_ids(ids)
    tasks = {t.pid: t for t in tasks}

    for p in processes:
        pid = p['PID']
        if pid not in tasks:
            continue

        task = tasks[pid]
        if task.status in [TaskStatus.Stopped.value,
                           TaskStatus.Failed.value,
                           TaskStatus.Skipped.value]:
            logger.info(f'Kill processes which exist but should not. '
                        f'Pid = {pid}')
            os.system(f'kill -9 {pid}')
Example #10
0
    def work(self):
        """Register a Model row, tracing and copying weights when a
        training task is given.

        When ``self.train_task`` is set, locates the Catalyst log
        directory from the dag config, traces the checkpointed model to
        TorchScript, and copies both the traced model and the raw
        'best_full' weights into the project's model folder. Finally the
        Model row is persisted regardless.
        """
        project = ProjectProvider(self.session).by_id(self.project)

        self.info(f'Task = {self.train_task} child_task: {self.child_task}')

        model = Model(
            created=now(),
            name=self.name,
            project=self.project,
            equations='',
            fold=self.fold
        )

        provider = ModelProvider(self.session)
        if self.train_task:
            task_provider = TaskProvider(self.session)
            dag_provider = DagProvider(self.session)
            task = task_provider.by_id(self.train_task)
            dag = dag_provider.by_id(task.dag)

            # task working dir: child task's if given, else the train task's
            task_dir = join(TASK_FOLDER, str(self.child_task or task.id))

            # get log directory from the executor's catalyst config
            config = yaml_load(dag.config)
            executor_config = config['executors'][task.executor]
            catalyst_config_file = executor_config['args']['config']
            catalyst_config_file = join(task_dir, catalyst_config_file)
            catalyst_config = yaml_load(file=catalyst_config_file)
            catalyst_logdir = catalyst_config['args']['logdir']

            model.score_local = task.score

            src_log = f'{task_dir}/{catalyst_logdir}'
            models_dir = join(MODEL_FOLDER, project.name)
            os.makedirs(models_dir, exist_ok=True)

            model_path_tmp = f'{src_log}/traced.pth'
            # trace the model to TorchScript from the checkpoint dir
            traced = trace_model_from_checkpoint(src_log, self, file=self.file)

            model_path = f'{models_dir}/{model.name}.pth'
            model_weight_path = f'{models_dir}/{model.name}_weight.pth'
            torch.jit.save(traced, model_path_tmp)
            shutil.copy(model_path_tmp, model_path)
            # NOTE(review): self.file is overwritten to 'best_full' AFTER
            # being used for tracing above — presumably intentional so the
            # raw weights always come from the full checkpoint; confirm
            file = self.file = 'best_full'
            shutil.copy(f'{src_log}/checkpoints/{file}.pth',
                        model_weight_path)

        provider.add(model)
Example #11
0
    def create_base(self):
        """Load the task and its dag, prepare execution context, and
        export the executor's environment variables.

        Also marks the current celery task as succeeded and revokes it,
        detaching this worker from celery before long-running work.

        Raises:
            Exception: if no task with ``self.id`` exists.
        """
        self.info('create_base')

        if app.current_task:
            # detach from celery: report success and revoke our own request
            app.current_task.update_state(state=states.SUCCESS)
            app.control.revoke(app.current_task.request.id, terminate=True)

        self.provider = TaskProvider(self.session)
        self.library_provider = DagLibraryProvider(self.session)
        self.storage = Storage(self.session)

        # eager-join the dag relation so self.task.dag_rel is populated
        self.task = self.provider.by_id(
            self.id, joinedload(Task.dag_rel, innerjoin=True))
        if not self.task:
            raise Exception(f'task with id = {self.id} is not found')

        self.dag = self.task.dag_rel
        self.executor = None
        self.hostname = socket.gethostname()

        self.docker_img = DOCKER_IMG
        # NOTE(review): os.getenv returns a str when the variable is set,
        # but the default here is the int -1 — both render identically in
        # the f-string below; confirm no caller needs an int
        self.worker_index = os.getenv('WORKER_INDEX', -1)

        # per-worker queue name unique to host/docker image/worker slot
        self.queue_personal = f'{self.hostname}_{self.docker_img}_' \
                              f'{self.worker_index}'

        self.config = Config.from_yaml(self.dag.config)
        self.executor_type = self.config['executors'][
            self.task.executor]['type']

        executor = self.config['executors'][self.task.executor]
        # limit math-library threading by default; executor config may
        # override or extend these variables
        env = {'MKL_NUM_THREADS': 1, 'OMP_NUM_THREADS': 1}
        env.update(executor.get('env', {}))

        for k, v in env.items():
            os.environ[k] = str(v)
            self.info(f'Set env. {k} = {v}')