Example #1
File: storage.py Project: shlemph/mlcomp
    def download(self, task: int):
        task = self.task_provider.by_id(
            task, joinedload(Task.dag_rel, innerjoin=True))
        folder = join(TASK_FOLDER, str(task.id))
        self.download_dag(task.dag, folder)

        config = Config.from_yaml(task.dag_rel.config)
        info = config['info']
        try:
            data_folder = os.path.join(DATA_FOLDER, info['project'])
            os.makedirs(data_folder, exist_ok=True)

            os.symlink(data_folder,
                       os.path.join(folder, 'data'),
                       target_is_directory=True)
        except FileExistsError:
            pass

        try:
            model_folder = os.path.join(MODEL_FOLDER, info['project'])
            os.makedirs(model_folder, exist_ok=True)

            os.symlink(model_folder,
                       os.path.join(folder, 'models'),
                       target_is_directory=True)
        except FileExistsError:
            pass

        sys.path.insert(0, folder)
        return folder
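
The two try/except blocks above make the data and models symlinks idempotent: the target directory is created if missing, and an already-existing link is simply kept. A minimal stand-alone sketch of the same pattern (the paths and helper name are hypothetical, not from the repository):

import os

def ensure_symlink(src_dir: str, link_path: str) -> None:
    # Create the directory being linked to, and the link's parent folder.
    os.makedirs(src_dir, exist_ok=True)
    os.makedirs(os.path.dirname(link_path), exist_ok=True)
    try:
        os.symlink(src_dir, link_path, target_is_directory=True)
    except FileExistsError:
        # A previous run already created the link; keep it as-is.
        pass

# Hypothetical paths for illustration only.
ensure_symlink('/tmp/demo_data/project_a', '/tmp/demo_task/data')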
Example #2
    def download(self, task: int):
        task = self.task_provider.by_id(
            task, joinedload(Task.dag_rel, innerjoin=True))
        folder = join(TASK_FOLDER, str(task.id))
        os.makedirs(folder, exist_ok=True)
        items = self.provider.by_dag(task.dag)
        items = sorted(items, key=lambda x: x[1] is not None)
        for item, file in items:
            path = os.path.join(folder, item.path)
            if item.is_dir:
                os.makedirs(path, exist_ok=True)
            else:
                with open(path, 'wb') as f:
                    f.write(file.content)

        config = Config.from_yaml(task.dag_rel.config)
        info = config['info']
        try:
            data_folder = os.path.join(DATA_FOLDER, info['project'])
            os.makedirs(data_folder, exist_ok=True)

            os.symlink(data_folder, os.path.join(folder, 'data'))
        except FileExistsError:
            pass

        try:
            model_folder = os.path.join(MODEL_FOLDER, info['project'])
            os.makedirs(model_folder, exist_ok=True)

            os.symlink(model_folder, os.path.join(folder, 'models'))
        except FileExistsError:
            pass

        sys.path.insert(0, folder)
        return folder
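
Example #2 also materialises the DAG's stored files before linking data and models; sorting by `x[1] is not None` puts entries without content (directories) first, so folders exist before the files inside them are written. A minimal sketch of that ordering with hypothetical data:

import os

# Hypothetical (relative_path, content) pairs; None content marks a directory.
entries = [
    ('src/train.py', b'print("train")'),
    ('src', None),
    ('config.yml', b'info: {}'),
]

root = '/tmp/demo_dag'
os.makedirs(root, exist_ok=True)

# Entries whose content is None sort first, so parent folders are created
# before any file is written into them.
for path, content in sorted(entries, key=lambda e: e[1] is not None):
    full = os.path.join(root, path)
    if content is None:
        os.makedirs(full, exist_ok=True)
    else:
        with open(full, 'wb') as f:
            f.write(content)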
Example #3
    def create_base(self):
        self.info('create_base')

        self.provider = TaskProvider(self.session)
        self.library_provider = DagLibraryProvider(self.session)
        self.storage = Storage(self.session)

        self.task = self.provider.by_id(
            self.id, joinedload(Task.dag_rel, innerjoin=True))
        if not self.task:
            raise Exception(f'task with id = {self.id} is not found')

        self.dag = self.task.dag_rel
        self.executor = None
        self.hostname = socket.gethostname()

        self.docker_img = DOCKER_IMG
        self.worker_index = os.getenv('WORKER_INDEX', -1)

        self.queue_personal = f'{self.hostname}_{self.docker_img}_' \
                              f'{self.worker_index}'

        self.config = Config.from_yaml(self.dag.config)
        self.executor_type = self.config['executors'][
            self.task.executor]['type']
Example #4
File: tasks.py Project: kiminh/mlcomp
    def create_base(self):
        self.info('create_base')

        if app.current_task:
            app.current_task.update_state(state=states.SUCCESS)
            app.control.revoke(app.current_task.request.id, terminate=True)

        self.provider = TaskProvider(self.session)
        self.library_provider = DagLibraryProvider(self.session)
        self.storage = Storage(self.session)

        self.task = self.provider.by_id(
            self.id, joinedload(Task.dag_rel, innerjoin=True))
        if not self.task:
            raise Exception(f'task with id = {self.id} is not found')

        self.dag = self.task.dag_rel
        self.executor = None
        self.hostname = socket.gethostname()

        self.docker_img = DOCKER_IMG
        self.worker_index = os.getenv('WORKER_INDEX', -1)

        self.queue_personal = f'{self.hostname}_{self.docker_img}_' \
                              f'{self.worker_index}'

        self.config = Config.from_yaml(self.dag.config)

        set_global_seed(self.config['info'].get('seed', 0))

        self.executor_type = self.config['executors'][
            self.task.executor]['type']

        executor = self.config['executors'][self.task.executor]

        if os.getenv('CUDA_VISIBLE_DEVICES', '').strip() != '':
            cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES',
                                             '').split(',')
            self.task.gpu_assigned = ','.join([
                cuda_visible_devices[int(g)]
                for g in (self.task.gpu_assigned or '').split(',')
            ])
            cuda_visible_devices = self.task.gpu_assigned
        else:
            cuda_visible_devices = self.task.gpu_assigned

        cuda_visible_devices = cuda_visible_devices or ''

        env = {
            'MKL_NUM_THREADS': 1,
            'OMP_NUM_THREADS': 1,
            'CUDA_VISIBLE_DEVICES': cuda_visible_devices
        }
        env.update(executor.get('env', {}))

        for k, v in env.items():
            os.environ[k] = str(v)
            self.info(f'Set env. {k} = {v}')
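
The GPU handling in Example #4 remaps the task's logical GPU indices through an already-restricted CUDA_VISIBLE_DEVICES before exporting the environment. A stand-alone sketch of just that remapping, with hypothetical values:

# Suppose the worker was started with CUDA_VISIBLE_DEVICES=2,3
# and the scheduler assigned the task logical GPUs '0,1'.
cuda_visible_devices = '2,3'.split(',')
gpu_assigned = '0,1'

# Each task-local index selects a physical device from the visible list,
# so the task ends up pinned to the real devices 2 and 3.
remapped = ','.join(cuda_visible_devices[int(g)] for g in gpu_assigned.split(','))
assert remapped == '2,3'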
Example #5
def dag_model_start(session: Session, data: dict):
    provider = ModelProvider(session)
    model = provider.by_id(data['model_id'])
    dag_provider = DagProvider(session)
    dag = dag_provider.by_id(data['dag'], joined_load=[Dag.project_rel])

    project = dag.project_rel
    src_config = Config.from_yaml(dag.config)
    pipe = src_config['pipes'][data['pipe']['name']]

    equations = yaml_load(model.equations)
    versions = data['pipe']['versions']

    if len(versions) > 0:
        version = data['pipe']['version']
        pipe_equations = yaml_load(version['equations'])
        found_version = versions[0]
        for v in versions:
            if v['name'] == version['name']:
                found_version = v
                break

        found_version['used'] = now()

        if len(pipe) == 1:
            pipe[list(pipe)[0]].update(pipe_equations)
        else:
            pipe.update(pipe_equations)

    equations[data['pipe']['name']] = versions
    model.equations = yaml_dump(equations)

    for v in pipe.values():
        v['model_id'] = model.id
        v['model_name'] = model.name

    config = {
        'info': {
            'name': data['pipe']['name'],
            'project': project.name
        },
        'executors': pipe
    }

    if model.dag:
        old_dag = dag_provider.by_id(model.dag)
        if old_dag.name != dag.name:
            model.dag = dag.id
    else:
        model.dag = dag.id

    provider.commit()

    dag_standard(session=session,
                 config=config,
                 debug=False,
                 upload_files=False,
                 copy_files_from=data['dag'])
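
The version-merging step in Example #5 (the `len(pipe) == 1` branch) applies the stored equations either directly to a single executor's parameters or, for multi-executor pipes, at the pipe level keyed by executor name. A minimal sketch with hypothetical data:

pipe = {'infer': {'type': 'infer', 'batch_size': 32}}
pipe_equations = {'batch_size': 64, 'tta': 2}

if len(pipe) == 1:
    # Single executor: the overrides go straight into its parameters.
    pipe[list(pipe)[0]].update(pipe_equations)
else:
    # Several executors: the overrides are expected to be keyed by executor name.
    pipe.update(pipe_equations)

assert pipe == {'infer': {'type': 'infer', 'batch_size': 64, 'tta': 2}}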
Example #6
    def model_start_begin(self, model_id: int):
        model = self.by_id(model_id)

        models_dags = self.query(Dag). \
            filter(Dag.type == DagType.Pipe.value). \
            filter(Dag.project == model.project). \
            order_by(Dag.id.desc()). \
            all()

        used_dag_names = set()
        versions = yaml_load(model.equations)

        res_dags = []
        res_dag = None

        for dag in models_dags:
            if dag.name in used_dag_names:
                continue
            config = Config.from_yaml(dag.config)
            d = {
                'name': dag.name,
                'id': dag.id,
                'pipes': [{
                    'name': p
                } for p in config['pipes']]
            }
            for pipe in d['pipes']:
                pipe['versions'] = versions.get(pipe['name'], [])
                used = [
                    v.get('used', datetime.datetime.min)
                    for v in pipe['versions']
                ]
                pipe['used'] = datetime.datetime.min if len(
                    used) == 0 else max(used)

            d['pipes'] = sorted(d['pipes'],
                                key=lambda x: x['used'],
                                reverse=True)
            for p in d['pipes']:
                del p['used']
                for v in p['versions']:
                    if 'used' in v:
                        del v['used']

            used_dag_names.add(dag.name)
            res_dags.append(d)

            if d['id'] == model.dag:
                res_dag = d

        return {'dags': res_dags, 'dag': res_dag, 'model_id': model_id}
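
Example #6 sorts each DAG's pipes by the most recent `used` timestamp among their versions, falling back to `datetime.min` for pipes that have never run, and then strips the helper field again. A minimal sketch of that ordering with hypothetical data:

import datetime

pipes = [
    {'name': 'infer', 'versions': []},
    {'name': 'train', 'versions': [{'used': datetime.datetime(2020, 5, 1)}]},
]

for pipe in pipes:
    used = [v.get('used', datetime.datetime.min) for v in pipe['versions']]
    pipe['used'] = datetime.datetime.min if len(used) == 0 else max(used)

# Recently used pipes come first; never-used pipes sink to the end.
pipes = sorted(pipes, key=lambda x: x['used'], reverse=True)
assert [p['name'] for p in pipes] == ['train', 'infer']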
Example #7
def dag_model_start(session: Session, data: dict):
    provider = ModelProvider(session)
    model = provider.by_id(data['model_id'])
    dag = DagProvider(session
                      ).by_id(data['dag'], joined_load=[Dag.project_rel])

    project = dag.project_rel
    src_config = Config.from_yaml(dag.config)
    pipe = src_config['pipes'][data['pipe']]
    for k, v in pipe.items():
        if v.get('slot') != data['slot']:
            continue
        params = yaml_load(data['interface_params'])
        slot = {
            'interface': data['interface'],
            'interface_params': params,
            'slot': k,
            'name': model.name,
            'id': data['model_id']
        }
        v['slot'] = slot

    config = {
        'info': {
            'name': data['pipe'],
            'project': project.name
        },
        'executors': pipe
    }

    dag_standard(
        session=session,
        config=config,
        debug=False,
        upload_files=False,
        copy_files_from=data['dag']
    )

    model.dag = data['dag']
    model.interface = data['interface']
    model.interface_params = data['interface_params']
    model.slot = data['slot']

    provider.commit()
Example #8
    def create_base(self):
        self.info('create_base')

        if app.current_task:
            app.current_task.update_state(state=states.SUCCESS)
            app.control.revoke(app.current_task.request.id, terminate=True)

        self.provider = TaskProvider(self.session)
        self.library_provider = DagLibraryProvider(self.session)
        self.storage = Storage(self.session)

        self.task = self.provider.by_id(
            self.id, joinedload(Task.dag_rel, innerjoin=True))
        if not self.task:
            raise Exception(f'task with id = {self.id} is not found')

        self.dag = self.task.dag_rel
        self.executor = None
        self.hostname = socket.gethostname()

        self.docker_img = DOCKER_IMG
        self.worker_index = os.getenv('WORKER_INDEX', -1)

        self.queue_personal = f'{self.hostname}_{self.docker_img}_' \
                              f'{self.worker_index}'

        self.config = Config.from_yaml(self.dag.config)
        self.executor_type = self.config['executors'][
            self.task.executor]['type']

        executor = self.config['executors'][self.task.executor]
        env = {'MKL_NUM_THREADS': 1, 'OMP_NUM_THREADS': 1}
        env.update(executor.get('env', {}))

        for k, v in env.items():
            os.environ[k] = str(v)
            self.info(f'Set env. {k} = {v}')
Example #9
File: kaggle.py Project: shlemph/mlcomp
    def _from_config(cls, executor: dict, config: Config,
                     additional_info: dict):
        output = os.path.join(config.data_folder, config.get('output', '.'))
        return cls(output=output, competition=executor['competition'])
Example #10
    def get(self, filter, options: PaginatorOptions):
        query = self.query(Model). \
            options(joinedload(Model.dag_rel, innerjoin=True)). \
            options(joinedload(Model.project_rel, innerjoin=True))

        if filter.get('project'):
            query = query.filter(Model.project == filter['project'])
        if filter.get('name'):
            query = query.filter(Model.name.like(f'%{filter["name"]}%'))

        if filter.get('created_min'):
            created_min = parse_time(filter['created_min'])
            query = query.filter(Model.created >= created_min)
        if filter.get('created_max'):
            created_max = parse_time(filter['created_max'])
            query = query.filter(Model.created <= created_max)

        total = query.count()
        paginator = self.paginator(query, options) if options else query
        res = []
        models = paginator.all()
        models_projects = set()
        for model in models:
            row = self.to_dict(model, rules=('-project_rel.class_names', ))
            res.append(row)
            models_projects.add(model.project)

        models_dags = self.query(Dag). \
            filter(Dag.type == DagType.Pipe.value). \
            filter(Dag.project.in_(list(models_projects))). \
            order_by(Dag.id.desc()). \
            all()

        dags_by_project = defaultdict(list)
        used_dag_names = set()

        for dag in models_dags:
            if dag.name in used_dag_names:
                continue

            config = Config.from_yaml(dag.config)
            slots = []
            for pipe in config['pipes'].values():
                for k, v in pipe.items():
                    if 'slot' in v:
                        if v['slot'] not in slots:
                            slots.append(v['slot'])
                    elif 'slots' in v:
                        for slot in v['slots']:
                            if slot not in slots:
                                slots.append(slot)

            d = {
                'name': dag.name,
                'id': dag.id,
                'slots': slots,
                'interfaces': list(config['interfaces']),
                'pipes': list(config['pipes'])
            }

            dags_by_project[dag.project].append(d)
            used_dag_names.add(dag.name)

        for row in res:
            row['dags'] = dags_by_project[row['project']]

        projects = self.query(Project.name, Project.id). \
            order_by(Project.id.desc()). \
            limit(20). \
            all()
        projects = [{'name': name, 'id': id} for name, id in projects]
        return {'total': total, 'data': res, 'projects': projects}
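
Example #10 walks every pipe in each DAG config to collect the declared model slots, accepting either a single `slot` or a list under `slots`; the same pattern appears in Example #11 below. A minimal sketch of that collection over a hypothetical parsed config:

# Hypothetical parsed 'pipes' section: executors may declare 'slot' or 'slots'.
pipes = {
    'infer': {
        'predict': {'type': 'infer', 'slot': 'model'},
        'ensemble': {'type': 'stack', 'slots': ['model', 'meta']},
    }
}

slots = []
for pipe in pipes.values():
    for k, v in pipe.items():
        if 'slot' in v:
            if v['slot'] not in slots:
                slots.append(v['slot'])
        elif 'slots' in v:
            for slot in v['slots']:
                if slot not in slots:
                    slots.append(slot)

assert slots == ['model', 'meta']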
Example #11
    def get(self, filter: dict, options: PaginatorOptions):
        query = self.query(Task, Project.name).\
            join(Dag, Dag.id == Task.dag).\
            join(Project, Project.id == Dag.project).\
            options(joinedload(Task.dag_rel, innerjoin=True))

        query = self._get_filter(query, filter)

        total = query.count()
        paginator = self.paginator(query, options)
        res = []

        for p, project_name in paginator.all():
            if p.dag_rel is None:
                continue

            item = {**self.to_dict(p, rules=('-additional_info', ))}
            item['status'] = to_snake(TaskStatus(item['status']).name)
            item['type'] = to_snake(TaskType(item['type']).name)
            item['dag_rel']['project'] = {
                'id': item['dag_rel']['project'],
                'name': project_name
            }
            if p.started is None:
                delta = 0
            elif p.status == TaskStatus.InProgress.value:
                delta = (now() - p.started).total_seconds()
            else:
                finish = (p.finished or p.last_activity)
                delta = (finish - p.started).total_seconds()
            item['duration'] = duration_format(delta)
            if p.dag_rel is not None:
                res.append(item)

        if filter.get('report'):
            tasks_within_report = self.query(
                ReportTasks.task
            ).filter(ReportTasks.report == int(filter['report']))
            tasks_within_report = {t[0] for t in tasks_within_report}
            for r in res:
                r['report_full'] = r['id'] in tasks_within_report

        projects = self.query(Project.name, Project.id). \
            order_by(Project.id.desc()). \
            limit(20). \
            all()
        dags = self.query(Dag.name, Dag.id). \
            order_by(Dag.id.desc()). \
            limit(20). \
            all()
        projects = [{'name': name, 'id': id} for name, id in projects]
        dags = [{'name': name, 'id': id} for name, id in dags]

        dags_model = self.query(Dag.name, Dag.id, Dag.config). \
            filter(Dag.type == DagType.Pipe.value). \
            order_by(Dag.id.desc()). \
            all()

        dags_model_dict = []
        used_dag_names = set()

        for name, id, config in dags_model:
            if name in used_dag_names:
                continue

            config = Config.from_yaml(config)
            slots = []
            for pipe in config['pipes'].values():
                for k, v in pipe.items():
                    if 'slot' in v:
                        slots.append(v['slot'])
                    elif 'slots' in v:
                        slots.extend(v['slots'])

            dag = {
                'name': name,
                'id': id,
                'slots': slots,
                'interfaces': [
                    {
                        'name': k,
                        'params': yaml_dump(v)
                    } for k, v in config['interfaces'].items()
                ]
            }
            dags_model_dict.append(dag)
            used_dag_names.add(name)

        return {
            'total': total,
            'data': res,
            'projects': projects,
            'dags': dags,
            'dags_model': dags_model_dict
        }