Example #1
def _dag(config: str, debug: bool = False, control_reqs=True,
         params: Tuple[str, ...] = ()):
    logger = create_logger(_session, name='_dag')
    logger.info('started', ComponentType.Client)

    with open(config, 'r') as f:
        config_text = f.read()
    config_parsed = yaml_load(config_text)
    params = dict_from_list_str(params)
    config_parsed = merge_dicts_smart(config_parsed, params)
    config_text = yaml_dump(config_parsed)

    logger.info('config parsed', ComponentType.Client)

    type_name = config_parsed['info'].get('type', 'standard')
    if type_name == DagType.Standard.name.lower():
        return dag_standard(
            session=_session,
            config=config_parsed,
            debug=debug,
            config_text=config_text,
            config_path=config,
            control_reqs=control_reqs,
            logger=logger,
            component=ComponentType.Client
        )

    return dag_pipe(
        session=_session, config=config_parsed, config_text=config_text
    )
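A minimal usage sketch for the function above; the config path and the override value are hypothetical, and the exact key=value syntax accepted by dict_from_list_str is an assumption:

# Hypothetical invocation: read config.yml, merge one override into it,
# and build the DAG without debug mode.
created_dag = _dag('config.yml', debug=False, params=('epochs=10',))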
Example #2
    def build(self):
        try:
            self.create_base()

            self.check_status()

            self.change_status()

            self.download()

            self.create_executor()

            self.execute()

        except Exception as e:
            if Session.sqlalchemy_error(e):
                Session.cleanup(key='ExecuteBuilder')
                self.session = Session.create_session(key='ExecuteBuilder')
                self.logger.session = create_logger(self.session,
                                                    'ExecuteBuilder')

            step = self.executor.step.id if \
                (self.executor and self.executor.step) else None

            self.error(traceback.format_exc(), step)
            self.provider.change_status(self.task, TaskStatus.Failed)
            raise e
        finally:
            if app.current_task:
                app.current_task.update_state(state=states.SUCCESS)
                app.close()

            if self.exit:
                # noinspection PyProtectedMember
                os._exit(0)
Example #3
def execute(config: str, debug: bool):
    _create_computer()

    # Fail all InProgress Tasks
    logger = create_logger(_session, __name__)

    provider = TaskProvider(_session)
    step_provider = StepProvider(_session)

    for t in provider.by_status(TaskStatus.InProgress,
                                worker_index=WORKER_INDEX):
        step = step_provider.last_for_task(t.id)
        logger.error(
            f'Task Id = {t.id} was in InProgress state '
            f'when other tasks arrived at the same worker',
            ComponentType.Worker, t.computer_assigned, t.id, step)
        provider.change_status(t, TaskStatus.Failed)

    # Create dag
    created_dag = _dag(config, debug)
    for ids in created_dag.values():
        for task_id in ids:
            task = provider.by_id(task_id)
            # Assign every visible GPU index to the task.
            task.gpu_assigned = ','.join(
                str(i) for i, _ in enumerate(GPUtil.getGPUs()))

            provider.commit()
            execute_by_id(task_id, exit=False)
Example #4
def execute(config: str, debug: bool, params):
    check_statuses()

    _create_computer()
    _create_docker()

    # Fail all InProgress Tasks
    logger = create_logger(_session, __name__)

    provider = TaskProvider(_session)
    step_provider = StepProvider(_session)

    for t in provider.by_status(TaskStatus.InProgress,
                                worker_index=WORKER_INDEX):
        step = step_provider.last_for_task(t.id)
        logger.error(
            f'Task Id = {t.id} was in InProgress state '
            f'when other tasks arrived at the same worker',
            ComponentType.Worker, t.computer_assigned, t.id, step)
        provider.change_status(t, TaskStatus.Failed)

    # Create dags
    dags = _dag(config, debug, params=params)
    for dag in dags:
        for ids in dag.values():
            for task_id in ids:
                task = provider.by_id(task_id)
                # Assign every CUDA device index to the task.
                task.gpu_assigned = ','.join(
                    str(i) for i in range(torch.cuda.device_count()))

                provider.commit()
                execute_by_id(task_id, exit=False)
Example #5
def worker_supervisor():
    """
    Start worker supervisor.
    This program controls workers ran on the same machine.
    Also, it writes metric of resources consumption.
    """
    host = socket.gethostname()

    logger = create_logger(_session, 'worker_supervisor')
    logger.info('worker_supervisor start', ComponentType.WorkerSupervisor,
                host)

    _create_computer()
    _create_docker()

    start_schedule([(stop_processes_not_exist, 10)])

    if DOCKER_MAIN:
        syncer = FileSync()
        start_schedule([(worker_usage, 0)])
        start_schedule([(syncer.sync, 0)])

    name = f'{host}_{DOCKER_IMG}_supervisor'
    argv = [
        'worker', '--loglevel=INFO', '-P=solo', f'-n={name}', '-O fair',
        '-c=1', '--prefetch-multiplier=1', '-Q', f'{name}'
    ]

    logger.info('worker_supervisor run celery', ComponentType.WorkerSupervisor,
                host)

    app.worker_main(argv)
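For reference, the argv list corresponds roughly to this shell invocation (a sketch; the node and queue names depend on the host and docker image):

# Approximate CLI equivalent of app.worker_main(argv) above:
#   celery worker --loglevel=INFO -P solo -n <host>_<img>_supervisor \
#       -O fair -c 1 --prefetch-multiplier=1 -Q <host>_<img>_supervisor
# i.e. one solo-pool process consuming only its personal queue.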
Example #6
    def build(self):
        try:
            # if self.fast_check():
            #     return

            self.auxiliary = {'time': now()}

            self.create_base()

            self.process_stop_tasks()

            self.process_start_dags()

            self.process_parent_tasks()

            self.load_tasks()

            self.load_computers()

            self.process_tasks()

            self.write_auxiliary()

        except ObjectDeletedError:
            pass
        except Exception as e:
            if Session.sqlalchemy_error(e):
                Session.cleanup(key='SupervisorBuilder')
                self.session = Session.create_session(key='SupervisorBuilder')
                self.logger = create_logger(self.session, 'SupervisorBuilder')

            self.logger.error(traceback.format_exc(), ComponentType.Supervisor)
Example #7
def sync_directed(
        session: Session,
        source: Computer,
        target: Computer,
        folders: List
):
    current_computer = socket.gethostname()
    logger = create_logger(session, __name__)
    for folder, excluded in folders:
        end = ' --perms --chmod=777 --size-only'
        if len(excluded) > 0:
            parts = []
            folder_excluded = False
            for ex in excluded:
                # If the folder itself is excluded, skip it entirely.
                if ex == folder:
                    folder_excluded = True
                    break
                if not ex.startswith(folder):
                    continue

                # Turn an excluded path into an rsync --exclude pattern
                # relative to the synced folder.
                parts.append(f'--exclude {os.path.relpath(ex, folder)}')

            if folder_excluded:
                continue

            if len(parts) > 0:
                end += ' ' + ' '.join(parts)

        source_folder = join(source.root_folder, folder)
        target_folder = join(target.root_folder, folder)

        if current_computer == source.name:
            command = f'rsync -vhru -e ' \
                      f'"ssh -p {target.port} -o StrictHostKeyChecking=no" ' \
                      f'{source_folder}/ ' \
                      f'{target.user}@{target.ip}:{target_folder}/ {end}'
        elif current_computer == target.name:
            command = f'rsync -vhru -e ' \
                      f'"ssh -p {source.port} -o StrictHostKeyChecking=no" ' \
                      f'{source.user}@{source.ip}:{source_folder}/ ' \
                      f'{target_folder}/ {end}'
        else:
            command = f'rsync -vhru -e ' \
                      f'"ssh -p {target.port} -o StrictHostKeyChecking=no" ' \
                      f' {source_folder}/ ' \
                      f'{target.user}@{target.ip}:{target_folder}/ {end}'

            command = f'ssh -p {source.port} ' \
                      f'{source.user}@{source.ip} "{command}"'

        logger.info(command, ComponentType.WorkerSupervisor, current_computer)
        try:
            subprocess.check_output(command, shell=True,
                                    stderr=subprocess.STDOUT,
                                    universal_newlines=True)
        except subprocess.CalledProcessError as exc:
            raise Exception(exc.output) from exc
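For illustration, when the current machine is the source the generated command looks roughly like this (hosts, port, and paths are hypothetical):

# rsync -vhru -e "ssh -p 22 -o StrictHostKeyChecking=no" \
#     /mnt/mlcomp/data/proj/ user@10.0.0.2:/mnt/mlcomp/data/proj/ \
#     --perms --chmod=777 --size-only --exclude tmp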
Example #8
    def sync(self):
        hostname = socket.gethostname()
        try:
            provider = ComputerProvider(self.session)
            task_synced_provider = TaskSyncedProvider(self.session)

            computer = provider.by_name(hostname)
            sync_start = now()

            if FILE_SYNC_INTERVAL == 0:
                time.sleep(1)
            else:
                computers = provider.all_with_last_activtiy()
                computers = [
                    c for c in computers
                    if (now() - c.last_activity).total_seconds() < 10
                ]
                computers_names = {c.name for c in computers}

                for c, project, tasks in task_synced_provider.for_computer(
                        computer.name):
                    if c.name not in computers_names:
                        self.logger.info(
                            f'Computer = {c.name} '
                            f'is offline. Cannot sync',
                            ComponentType.WorkerSupervisor, hostname)
                        continue

                    if c.syncing_computer:
                        continue

                    excluded = list(map(str,
                                        yaml_load(project.ignore_folders)))
                    folders_excluded = [[join('data', project.name), excluded],
                                        [join('models', project.name), []]]

                    computer.syncing_computer = c.name
                    provider.update()
                    sync_directed(self.session, c, computer, folders_excluded)

                    for t in tasks:
                        task_synced_provider.add(
                            TaskSynced(computer=computer.name, task=t.id))

                    time.sleep(FILE_SYNC_INTERVAL)

            computer.last_synced = sync_start
            computer.syncing_computer = None
            provider.update()
        except Exception as e:
            if Session.sqlalchemy_error(e):
                Session.cleanup('FileSync')
                self.session = Session.create_session(key='FileSync')
                self.logger = create_logger(self.session, 'FileSync')

            self.logger.error(traceback.format_exc(),
                              ComponentType.WorkerSupervisor, hostname)
Example #9
    def process_error(self, e: Exception):
        if Session.sqlalchemy_error(e):
            Session.cleanup('FileSync')
            self.session = Session.create_session(key='FileSync')
            self.logger = create_logger(self.session, 'FileSync')

        hostname = socket.gethostname()
        self.logger.error(
            traceback.format_exc(), ComponentType.WorkerSupervisor,
            hostname
        )
Example #10
    def __init__(self, id: int, repeat_count: int = 1, exit=True):
        self.session = Session.create_session(key='ExecuteBuilder')
        self.id = id
        self.repeat_count = repeat_count
        self.logger = create_logger(self.session, 'ExecuteBuilder')
        self.logger_db = create_logger(self.session,
                                       'ExecuteBuilder.db',
                                       console=False)
        self.exit = exit

        self.provider = None
        self.library_provider = None
        self.storage = None
        self.task = None
        self.dag = None
        self.executor = None
        self.hostname = None
        self.docker_img = None
        self.worker_index = None
        self.queue_personal = None
        self.config = None
        self.executor_type = None
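A minimal usage sketch connecting this constructor with the build() method from Example #2; the task id is hypothetical:

# Assuming the ExecuteBuilder class above is in scope:
builder = ExecuteBuilder(id=42, exit=False)  # 42 is a hypothetical task id
builder.build()  # create_base -> ... -> execute, as in Example #2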
Example #11
    def __init__(self):
        self.session = Session.create_session(key='SupervisorBuilder')
        self.logger = create_logger(self.session, 'SupervisorBuilder')
        self.provider = None
        self.computer_provider = None
        self.docker_provider = None
        self.auxiliary_provider = None
        self.dag_provider = None
        self.queues = None
        self.not_ran_tasks = None
        self.dep_status = None
        self.computers = None
        self.auxiliary = {}
Example #12
    def wrapper():
        try:
            f(wrapper_vars['session'], wrapper_vars['logger'])
        except Exception as e:
            if Session.sqlalchemy_error(e):
                Session.cleanup(name)

                wrapper_vars['session'] = Session.create_session(key=name)
                wrapper_vars['logger'] = create_logger(wrapper_vars['session'],
                                                       name)

            wrapper_vars['logger'].error(traceback.format_exc(),
                                         ComponentType.WorkerSupervisor,
                                         hostname)
Example #13
def find_imports(path: str,
                 files: List[str] = None,
                 exclude_patterns: List[str] = None,
                 encoding='utf-8'):
    res = []
    raw_imports = []
    files = files if files is not None \
        else glob(os.path.join(path, '**', '*.py'), recursive=True)

    exclude_patterns = exclude_patterns \
        if exclude_patterns is not None else []
    spec = pathspec.PathSpec.from_lines(pathspec.patterns.GitWildMatchPattern,
                                        exclude_patterns)

    for file in files:
        if not file.endswith('.py'):
            continue
        file_rel = os.path.relpath(file, path)
        if spec.match_file(file_rel):
            continue

        with open(file, 'r', encoding=encoding) as f:
            content = f.read()
            try:
                tree = ast.parse(content)
                for node in ast.walk(tree):
                    if isinstance(node, ast.Import):
                        for subnode in node.names:
                            raw_imports.append((subnode.name, file_rel))
                    elif isinstance(node, ast.ImportFrom):
                        raw_imports.append((node.module, file_rel))
            except Exception:
                logger = create_logger(Session.create_session(), __name__)
                logger.error(f'Failed on file: {file_rel}')
                raise

    for lib, file in raw_imports:
        name = lib.split('.')[0]
        try:
            if name in _mapping:
                name = _mapping[name]

            version = pkg_resources.get_distribution(name).version
            res.append((name, version))
        except Exception:
            # Package is not installed or exposes no distribution info; skip.
            pass

    return res
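Usage sketch: collect (package, version) pairs for a source tree while skipping tests (the path and pattern here are hypothetical):

# Scan all *.py files under ./src, ignoring files matched by tests/*:
deps = find_imports('./src', exclude_patterns=['tests/*'])
for name, version in deps:
    print(f'{name}=={version}')  # output shape is illustrative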
Example #14
def _dag(config: str,
         debug: bool = False,
         control_reqs=True,
         params: Tuple[str, ...] = ()):
    logger = create_logger(_session, name='_dag')
    logger.info('started', ComponentType.Client)

    with open(config, 'r') as f:
        config_text = f.read()
    config_parsed = yaml_load(config_text)
    params = dict_from_list_str(params)
    config_parsed = merge_dicts_smart(config_parsed, params)
    config_text = yaml_dump(config_parsed)

    logger.info('config parsed', ComponentType.Client)

    try:
        commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).strip()
        config_parsed['info']['name'] += f'_{commit.decode("utf-8")[:6]}'
    except Exception:
        logger.info('commit not parsed')

    type_name = config_parsed['info'].get('type', 'standard')
    if type_name == DagType.Standard.name.lower():
        cells = grid_cells(
            config_parsed['grid']) if 'grid' in config_parsed else [None]
        dags = []
        for cell in cells:
            dag = dag_standard(session=_session,
                               config=config_parsed,
                               debug=debug,
                               config_text=config_text,
                               config_path=config,
                               control_reqs=control_reqs,
                               logger=logger,
                               component=ComponentType.Client,
                               grid_cell=cell)
            dags.append(dag)

        return dags

    return [
        dag_pipe(session=_session,
                 config=config_parsed,
                 config_text=config_text)
    ]
Example #15
def sync_directed(
        session: Session, source: Computer, target: Computer,
        ignore_folders: List
):
    current_computer = socket.gethostname()
    logger = create_logger(session, __name__)
    for folder, excluded in ignore_folders:
        # Rebuild the option string per folder so one folder's excludes
        # do not leak into the next folder's command.
        end = ' --perms --chmod=777 --size-only'
        if len(excluded) > 0:
            end += ' ' + ' '.join(f'--exclude {ex}' for ex in excluded)

        source_folder = join(source.root_folder, folder)
        target_folder = join(target.root_folder, folder)

        if current_computer == source.name:
            command = f'rsync -vhru -e ' \
                      f'"ssh -p {target.port} -o StrictHostKeyChecking=no" ' \
                      f'{source_folder}/ ' \
                      f'{target.user}@{target.ip}:{target_folder}/ {end}'
        elif current_computer == target.name:
            command = f'rsync -vhru -e ' \
                      f'"ssh -p {source.port} -o StrictHostKeyChecking=no" ' \
                      f'{source.user}@{source.ip}:{source_folder}/ ' \
                      f'{target_folder}/ {end}'
        else:
            command = f'rsync -vhru -e ' \
                      f'"ssh -p {target.port} -o StrictHostKeyChecking=no" ' \
                      f' {source_folder}/ ' \
                      f'{target.user}@{target.ip}:{target_folder}/ {end}'

            command = f'ssh -p {source.port} ' \
                      f'{source.user}@{source.ip} "{command}"'

        logger.info(command, ComponentType.WorkerSupervisor, current_computer)
        subprocess.check_output(command, shell=True)
Example #16
def error_handler(f):
    name = f.__name__
    wrapper_vars = {'session': Session.create_session(key=name)}
    wrapper_vars['logger'] = create_logger(wrapper_vars['session'], name)

    hostname = socket.gethostname()

    def wrapper():
        try:
            f(wrapper_vars['session'], wrapper_vars['logger'])
        except Exception as e:
            if Session.sqlalchemy_error(e):
                Session.cleanup(name)

                wrapper_vars['session'] = Session.create_session(key=name)
                wrapper_vars['logger'] = create_logger(wrapper_vars['session'],
                                                       name)

            wrapper_vars['logger'].error(traceback.format_exc(),
                                         ComponentType.WorkerSupervisor,
                                         hostname)

    return wrapper
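Usage sketch, mirroring the start_schedule calls from Example #5; it assumes worker_usage accepts the (session, logger) pair the wrapper passes in:

@error_handler
def worker_usage(session, logger):
    # write resource-consumption metrics (body elided)
    ...

# Scheduled as in Example #5; on failure the error is logged and a
# broken sqlalchemy session is recreated instead of crashing the loop.
start_schedule([(worker_usage, 0)])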
Example #17
    def decorated(*args, **kwargs):
        global _read_session, _write_session, logger

        success = True
        status = 200
        error = ''

        try:
            res = f(*args, **kwargs)
        except Exception as e:
            if Session.sqlalchemy_error(e):
                Session.cleanup('server.read')
                Session.cleanup('server.write')

                _read_session = Session.create_session(key='server.read')
                _write_session = Session.create_session(key='server.write')

                logger = create_logger(_write_session, __name__)

            logger.error(
                f'Requested Url: {request.path}\n\n{traceback.format_exc()}',
                ComponentType.API
            )

            error = traceback.format_exc()
            success = False
            status = 500
            res = None

        res = res or {}
        if isinstance(res, Response):
            return res

        res['success'] = success
        res['error'] = error

        return Response(json.dumps(res), status=status)
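This decorated body evidently lives inside a route-wrapping decorator; a hedged sketch of the enclosing shape (the outer name handle_errors and the endpoint are hypothetical):

from functools import wraps

def handle_errors(f):  # hypothetical name for the enclosing decorator
    @wraps(f)
    def decorated(*args, **kwargs):
        ...  # body as shown above
    return decorated

@app.route('/api/tasks', methods=['POST'])  # hypothetical endpoint
@handle_errors
def tasks():
    return {'tasks': []}  # turned into a JSON Response by the wrapper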
Example #18
class FileSync:
    session = Session.create_session(key='FileSync')
    logger = create_logger(session, 'FileSync')

    def sync_manual(self, computer: Computer, provider: ComputerProvider):
        """
        button sync was clicked manually
        """
        if not computer.meta:
            return

        meta = yaml_load(computer.meta)
        if 'manual_sync' not in meta:
            return

        manual_sync = meta['manual_sync']

        project_provider = ProjectProvider(self.session)
        docker_provider = DockerProvider(self.session)

        dockers = docker_provider.get_online()
        project = project_provider.by_id(manual_sync['project'])

        for docker in dockers:
            if docker.computer == computer.name:
                continue

            source = provider.by_name(docker.computer)
            ignore_folders = [
                [join('models', project.name), []]
            ]
            sync_directed(self.session, target=computer, source=source,
                          ignore_folders=ignore_folders)

        del meta['manual_sync']
        computer.meta = yaml_dump(meta)
        provider.update()

    def sync(self):
        hostname = socket.gethostname()
        try:
            provider = ComputerProvider(self.session)
            task_synced_provider = TaskSyncedProvider(self.session)

            computer = provider.by_name(hostname)
            sync_start = now()

            if FILE_SYNC_INTERVAL == 0:
                time.sleep(1)
            else:
                self.sync_manual(computer, provider)

                computers = provider.all_with_last_activtiy()
                computers = [
                    c for c in computers
                    if (now() - c.last_activity).total_seconds() < 10
                ]
                computers_names = {c.name for c in computers}

                for c, project, tasks in task_synced_provider.for_computer(
                        computer.name):
                    if c.sync_with_this_computer:
                        if c.name not in computers_names:
                            self.logger.info(f'Computer = {c.name} '
                                             f'is offline. Cannot sync',
                                             ComponentType.WorkerSupervisor,
                                             hostname)
                            continue

                        if c.syncing_computer:
                            continue

                        ignore_folders = [
                            [join('models', project.name), []]
                        ]

                        computer.syncing_computer = c.name
                        provider.update()

                        sync_directed(self.session, c, computer,
                                      ignore_folders)

                    for t in tasks:
                        task_synced_provider.add(
                            TaskSynced(computer=computer.name, task=t.id)
                        )

                    time.sleep(FILE_SYNC_INTERVAL)

            computer.last_synced = sync_start
            computer.syncing_computer = None
            provider.update()
        except Exception as e:
            if Session.sqlalchemy_error(e):
                Session.cleanup('FileSync')
                self.session = Session.create_session(key='FileSync')
                self.logger = create_logger(self.session, 'FileSync')

            self.logger.error(
                traceback.format_exc(), ComponentType.WorkerSupervisor,
                hostname
            )
Example #19
import socket
from enum import Enum

from kaggle.models import DatasetNewRequest

from mlcomp.db.core import Session
from mlcomp.db.enums import ComponentType
from mlcomp.db.providers import ModelProvider
from mlcomp.worker.executors.base.equation import Equation
from mlcomp.worker.executors.base.executor import Executor
from mlcomp.utils.logging import create_logger
from mlcomp.utils.config import Config

try:
    from kaggle import api
except OSError:
    logger = create_logger(Session.create_session(), __name__)
    logger.warning(
        'Could not find kaggle.json. '
        'Kaggle executors can not be used', ComponentType.Worker,
        socket.gethostname())


class DownloadType(Enum):
    Kaggle = 0
    Link = 1


@Executor.register
class Download(Executor):
    def __init__(self,
                 output: str,
Example #20
import json

from flask import Flask, Response, request, send_from_directory
from flask_cors import CORS

from mlcomp.db.core import Session
from mlcomp.server.back.supervisor import register_supervisor
from mlcomp.utils.logging import create_logger
from mlcomp.utils.io import from_module_path, zip_folder
from mlcomp.server.back.create_dags import dag_model_add, dag_model_start
from mlcomp.utils.misc import to_snake, now
from mlcomp.db.models import Model, Report, ReportLayout, Task
from mlcomp.utils.io import yaml_load, yaml_dump
from mlcomp.worker.storage import Storage

app = Flask(__name__)
CORS(app)

_read_session = Session.create_session(key='server.read')
_write_session = Session.create_session(key='server.write')

logger = create_logger(_write_session, __name__)


@app.route('/', defaults={'path': ''}, methods=['GET'])
@app.route('/<path:path>', methods=['GET'])
def send_static(path):
    file = 'index.html'
    if '.' in path:
        file = path

    module_path = from_module_path(__file__, '../front/dist/mlcomp/')
    return send_from_directory(module_path, file)


def request_data():
    return json.loads(request.data.decode('utf-8'))
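A hedged sketch of how request_data pairs with a POST route in this module (the endpoint is hypothetical):

@app.route('/api/echo', methods=['POST'])  # hypothetical endpoint
def echo():
    data = request_data()  # JSON body decoded to a dict
    return Response(json.dumps(data), status=200)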