Example #1
def _dag(config: str, debug: bool = False, control_reqs=True,
         params: Tuple[str, ...] = ()):
    logger = create_logger(_session, name='_dag')
    logger.info('started', ComponentType.Client)

    with open(config, 'r') as f:
        config_text = f.read()
    config_parsed = yaml_load(config_text)
    params = dict_from_list_str(params)
    config_parsed = merge_dicts_smart(config_parsed, params)
    config_text = yaml_dump(config_parsed)

    logger.info('config parsed', ComponentType.Client)

    type_name = config_parsed['info'].get('type', 'standard')
    if type_name == DagType.Standard.name.lower():
        return dag_standard(
            session=_session,
            config=config_parsed,
            debug=debug,
            config_text=config_text,
            config_path=config,
            control_reqs=control_reqs,
            logger=logger,
            component=ComponentType.Client
        )

    return dag_pipe(
        session=_session, config=config_parsed, config_text=config_text
    )
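
A minimal usage sketch for _dag above, assuming a config file on disk. The params tuple holds "key=value" strings that dict_from_list_str turns into a dict before merge_dicts_smart folds it into the parsed config; the dotted-key syntax shown here is an assumption about that helper, and the file name and key are illustrative:

_dag('configs/train.yml', debug=True,
     params=('info.name=my_experiment',))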
Example #2
    def build(self):
        try:
            self.create_base()

            self.check_status()

            self.change_status()

            self.download()

            self.create_executor()

            self.execute()

        except Exception as e:
            if Session.sqlalchemy_error(e):
                Session.cleanup(key='ExecuteBuilder')
                self.session = Session.create_session(key='ExecuteBuilder')
                self.logger.session = create_logger(self.session,
                                                    'ExecuteBuilder')

            step = self.executor.step.id if \
                (self.executor and self.executor.step) else None

            self.error(traceback.format_exc(), step)
            self.provider.change_status(self.task, TaskStatus.Failed)
            raise
        finally:
            if app.current_task:
                app.current_task.update_state(state=states.SUCCESS)
                app.close()

            if self.exit:
                # noinspection PyProtectedMember
                os._exit(0)
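
The except-branch above shows a pattern that recurs throughout these examples: on a SQLAlchemy error the broken session is discarded and both the session and its logger are rebuilt under the same key. A hypothetical helper distilling that pattern (recover_session is an illustrative name, not part of mlcomp):

def recover_session(e: Exception, key: str):
    """On a SQLAlchemy failure, rebuild the session and logger for `key`."""
    if not Session.sqlalchemy_error(e):
        return None, None
    Session.cleanup(key=key)
    session = Session.create_session(key=key)
    return session, create_logger(session, key)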
Example #3
def execute(config: str, debug: bool):
    _create_computer()

    # Fail all InProgress Tasks
    logger = create_logger(_session, __name__)

    provider = TaskProvider(_session)
    step_provider = StepProvider(_session)

    for t in provider.by_status(TaskStatus.InProgress,
                                worker_index=WORKER_INDEX):
        step = step_provider.last_for_task(t.id)
        logger.error(
            f'Task Id = {t.id} was in InProgress state '
            f'when another task arrived at the same worker',
            ComponentType.Worker, t.computer_assigned, t.id, step)
        provider.change_status(t, TaskStatus.Failed)

    # Create dag
    created_dag = _dag(config, debug)
    for ids in created_dag.values():
        for task_id in ids:
            task = provider.by_id(task_id)
            task.gpu_assigned = ','.join(
                [str(i) for i, _ in enumerate(GPUtil.getGPUs())])

            provider.commit()
            execute_by_id(task_id, exit=False)
Example #4
def execute(config: str, debug: bool, params):
    check_statuses()

    _create_computer()
    _create_docker()

    # Fail all InProgress Tasks
    logger = create_logger(_session, __name__)

    provider = TaskProvider(_session)
    step_provider = StepProvider(_session)

    for t in provider.by_status(TaskStatus.InProgress,
                                worker_index=WORKER_INDEX):
        step = step_provider.last_for_task(t.id)
        logger.error(
            f'Task Id = {t.id} was in InProgress state '
            f'when another task arrived at the same worker',
            ComponentType.Worker, t.computer_assigned, t.id, step)
        provider.change_status(t, TaskStatus.Failed)

    # Create dags
    dags = _dag(config, debug, params=params)
    for dag in dags:
        for ids in dag.values():
            for task_id in ids:
                task = provider.by_id(task_id)
                task.gpu_assigned = ','.join(
                    [str(i) for i in range(torch.cuda.device_count())])

                provider.commit()
                execute_by_id(task_id, exit=False)
Example #5
def worker_supervisor():
    """
    Start worker supervisor.
    This program controls workers ran on the same machine.
    Also, it writes metric of resources consumption.
    """
    host = socket.gethostname()

    logger = create_logger(_session, 'worker_supervisor')
    logger.info('worker_supervisor start', ComponentType.WorkerSupervisor,
                host)

    _create_computer()
    _create_docker()

    start_schedule([(stop_processes_not_exist, 10)])

    if DOCKER_MAIN:
        syncer = FileSync()
        start_schedule([(worker_usage, 0)])
        start_schedule([(syncer.sync, 0)])

    name = f'{host}_{DOCKER_IMG}_supervisor'
    argv = [
        'worker', '--loglevel=INFO', '-P=solo', f'-n={name}', '-O fair',
        '-c=1', '--prefetch-multiplier=1', '-Q', f'{name}'
    ]

    logger.info('worker_supervisor run celery', ComponentType.WorkerSupervisor,
                host)

    app.worker_main(argv)
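
start_schedule is called above with (function, interval_seconds) pairs. A hypothetical sketch of such a scheduler, purely to illustrate the calling convention; the real helper ships with mlcomp and may differ in details:

import threading
import time

def start_schedule(jobs):
    """Run each (func, interval_seconds) pair forever on a daemon thread."""
    def loop(func, interval):
        while True:
            func()
            time.sleep(interval or 1)  # assumption: interval=0 means a short pause
    for func, interval in jobs:
        threading.Thread(target=loop, args=(func, interval),
                         daemon=True).start()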
Example #6
    def build(self):
        try:
            # if self.fast_check():
            #     return

            self.auxiliary = {'time': now()}

            self.create_base()

            self.process_stop_tasks()

            self.process_start_dags()

            self.process_parent_tasks()

            self.load_tasks()

            self.load_computers()

            self.process_tasks()

            self.write_auxiliary()

        except ObjectDeletedError:
            pass
        except Exception as e:
            if Session.sqlalchemy_error(e):
                Session.cleanup(key='SupervisorBuilder')
                self.session = Session.create_session(key='SupervisorBuilder')
                self.logger = create_logger(self.session, 'SupervisorBuilder')

            self.logger.error(traceback.format_exc(), ComponentType.Supervisor)
Example #7
def sync_directed(
        session: Session,
        source: Computer,
        target: Computer,
        folders: List
):
    current_computer = socket.gethostname()
    logger = create_logger(session, __name__)
    for folder, excluded in folders:
        end = ' --perms --chmod=777 --size-only'
        if len(excluded) > 0:
            parts = []
            folder_excluded = False
            for ex in excluded:
                if ex == folder:
                    folder_excluded = True
                    break
                if not ex.startswith(folder):
                    continue

                part = os.path.relpath(ex, folder)
                parts.append(f'--exclude {part}')

            if folder_excluded:
                continue

            if len(parts) > 0:
                end += ' ' + ' '.join(parts)

        source_folder = join(source.root_folder, folder)
        target_folder = join(target.root_folder, folder)

        if current_computer == source.name:
            command = f'rsync -vhru -e ' \
                      f'"ssh -p {target.port} -o StrictHostKeyChecking=no" ' \
                      f'{source_folder}/ ' \
                      f'{target.user}@{target.ip}:{target_folder}/ {end}'
        elif current_computer == target.name:
            command = f'rsync -vhru -e ' \
                      f'"ssh -p {source.port} -o StrictHostKeyChecking=no" ' \
                      f'{source.user}@{source.ip}:{source_folder}/ ' \
                      f'{target_folder}/ {end}'
        else:
            command = f'rsync -vhru -e ' \
                      f'"ssh -p {target.port} -o StrictHostKeyChecking=no" ' \
                      f' {source_folder}/ ' \
                      f'{target.user}@{target.ip}:{target_folder}/ {end}'

            command = f'ssh -p {source.port} ' \
                      f'{source.user}@{source.ip} "{command}"'

        logger.info(command, ComponentType.WorkerSupervisor, current_computer)
        try:
            subprocess.check_output(command, shell=True,
                                    stderr=subprocess.STDOUT,
                                    universal_newlines=True)
        except subprocess.CalledProcessError as exc:
            raise Exception(exc.output)
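
A minimal usage sketch for sync_directed above: `folders` is a list of [folder, excluded_subfolders] pairs with paths relative to each computer's root_folder, mirroring how later examples build it from join('data', project.name). computer_a and computer_b stand in for Computer rows and are hypothetical names:

folders = [
    ['data/project1', ['data/project1/tmp']],  # sync data/, skip its tmp/
    ['models/project1', []],                   # sync all of models/
]
sync_directed(session, source=computer_a, target=computer_b, folders=folders)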
Example #8
File: sync.py Project: ASRlytics/mlcomp
    def sync(self):
        hostname = socket.gethostname()
        try:
            provider = ComputerProvider(self.session)
            task_synced_provider = TaskSyncedProvider(self.session)

            computer = provider.by_name(hostname)
            sync_start = now()

            if FILE_SYNC_INTERVAL == 0:
                time.sleep(1)
            else:
                computers = provider.all_with_last_activtiy()
                computers = [
                    c for c in computers
                    if (now() - c.last_activity).total_seconds() < 10
                ]
                computers_names = {c.name for c in computers}

                for c, project, tasks in task_synced_provider.for_computer(
                        computer.name):
                    if c.name not in computers_names:
                        self.logger.info(
                            f'Computer = {c.name} '
                            f'is offline. Cannot sync',
                            ComponentType.WorkerSupervisor, hostname)
                        continue

                    if c.syncing_computer:
                        continue

                    excluded = list(map(str,
                                        yaml_load(project.ignore_folders)))
                    folders_excluded = [[join('data', project.name), excluded],
                                        [join('models', project.name), []]]

                    computer.syncing_computer = c.name
                    provider.update()
                    sync_directed(self.session, c, computer, folders_excluded)

                    for t in tasks:
                        task_synced_provider.add(
                            TaskSynced(computer=computer.name, task=t.id))

                    time.sleep(FILE_SYNC_INTERVAL)

            computer.last_synced = sync_start
            computer.syncing_computer = None
            provider.update()
        except Exception as e:
            if Session.sqlalchemy_error(e):
                Session.cleanup('FileSync')
                self.session = Session.create_session(key='FileSync')
                self.logger = create_logger(self.session, 'FileSync')

            self.logger.error(traceback.format_exc(),
                              ComponentType.WorkerSupervisor, hostname)
Example #9
    def process_error(self, e: Exception):
        if Session.sqlalchemy_error(e):
            Session.cleanup('FileSync')
            self.session = Session.create_session(key='FileSync')
            self.logger = create_logger(self.session, 'FileSync')

        hostname = socket.gethostname()
        self.logger.error(
            traceback.format_exc(), ComponentType.WorkerSupervisor,
            hostname
        )
Example #10
    def __init__(self, id: int, repeat_count: int = 1, exit=True):
        self.session = Session.create_session(key='ExecuteBuilder')
        self.id = id
        self.repeat_count = repeat_count
        self.logger = create_logger(self.session, 'ExecuteBuilder')
        self.logger_db = create_logger(self.session,
                                       'ExecuteBuilder.db',
                                       console=False)
        self.exit = exit

        self.provider = None
        self.library_provider = None
        self.storage = None
        self.task = None
        self.dag = None
        self.executor = None
        self.hostname = None
        self.docker_img = None
        self.worker_index = None
        self.queue_personal = None
        self.config = None
        self.executor_type = None
Example #11
    def __init__(self):
        self.session = Session.create_session(key='SupervisorBuilder')
        self.logger = create_logger(self.session, 'SupervisorBuilder')
        self.provider = None
        self.computer_provider = None
        self.docker_provider = None
        self.auxiliary_provider = None
        self.dag_provider = None
        self.queues = None
        self.not_ran_tasks = None
        self.dep_status = None
        self.computers = None
        self.auxiliary = {}
Example #12
    def wrapper():
        try:
            f(wrapper_vars['session'], wrapper_vars['logger'])
        except Exception as e:
            if Session.sqlalchemy_error(e):
                Session.cleanup(name)

                wrapper_vars['session'] = Session.create_session(key=name)
                wrapper_vars['logger'] = create_logger(wrapper_vars['session'],
                                                       name)

            wrapper_vars['logger'].error(traceback.format_exc(),
                                         ComponentType.WorkerSupervisor,
                                         hostname)
Example #13
def find_imports(path: str,
                 files: List[str] = None,
                 exclude_patterns: List[str] = None,
                 encoding='utf-8'):
    res = []
    raw_imports = []
    files = files if files is not None \
        else glob(os.path.join(path, '**', '*.py'), recursive=True)

    exclude_patterns = exclude_patterns \
        if exclude_patterns is not None else []
    spec = pathspec.PathSpec.from_lines(pathspec.patterns.GitWildMatchPattern,
                                        exclude_patterns)

    for file in files:
        if not file.endswith('.py'):
            continue
        file_rel = os.path.relpath(file, path)
        if spec.match_file(file_rel):
            continue

        with open(file, 'r', encoding=encoding) as f:
            content = f.read()
            try:
                tree = ast.parse(content)
                for node in ast.walk(tree):
                    if isinstance(node, ast.Import):
                        for subnode in node.names:
                            raw_imports.append((subnode.name, file_rel))
                    elif isinstance(node, ast.ImportFrom):
                        raw_imports.append((node.module, file_rel))
            except Exception as exc:
                logger = create_logger(Session.create_session(), __name__)
                logger.error('Failed on file: %s' % file_rel)
                raise exc

    for lib, file in raw_imports:
        name = lib.split('.')[0]
        try:
            if name in _mapping:
                name = _mapping[name]

            version = pkg_resources.get_distribution(name).version
            res.append((name, version))
        except Exception:
            pass

    return res
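
A usage sketch for find_imports above: it returns (distribution, version) pairs for the top-level imports it can resolve, so a requirements list can be derived directly. The path and exclude pattern are illustrative; _mapping is the module-level dict translating import names to distribution names (e.g. a key like 'sklearn' mapping to 'scikit-learn'):

reqs = find_imports('.', exclude_patterns=['tests/*'])
for name, version in sorted(set(reqs)):
    print(f'{name}=={version}')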
Example #14
def _dag(config: str,
         debug: bool = False,
         control_reqs=True,
         params: Tuple[str, ...] = ()):
    logger = create_logger(_session, name='_dag')
    logger.info('started', ComponentType.Client)

    with open(config, 'r') as f:
        config_text = f.read()
    config_parsed = yaml_load(config_text)
    params = dict_from_list_str(params)
    config_parsed = merge_dicts_smart(config_parsed, params)
    config_text = yaml_dump(config_parsed)

    logger.info('config parsed', ComponentType.Client)

    try:
        commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).strip()
        config_parsed['info']['name'] += f'_{commit.decode("utf-8")[:6]}'
    except Exception:
        logger.info('commit not parsed')

    type_name = config_parsed['info'].get('type', 'standard')
    if type_name == DagType.Standard.name.lower():
        cells = grid_cells(
            config_parsed['grid']) if 'grid' in config_parsed else [None]
        dags = []
        for cell in cells:
            dag = dag_standard(session=_session,
                               config=config_parsed,
                               debug=debug,
                               config_text=config_text,
                               config_path=config,
                               control_reqs=control_reqs,
                               logger=logger,
                               component=ComponentType.Client,
                               grid_cell=cell)
            dags.append(dag)

        return dags

    return [
        dag_pipe(session=_session,
                 config=config_parsed,
                 config_text=config_text)
    ]
Example #15
File: sync.py Project: shlemph/mlcomp
def sync_directed(
        session: Session, source: Computer, target: Computer,
        ignore_folders: List
):
    current_computer = socket.gethostname()
    logger = create_logger(session, __name__)
    for folder, excluded in ignore_folders:
        # reset per folder so excludes do not leak into the next command
        end = ' --perms --chmod=777 --size-only'
        if len(excluded) > 0:
            end += ' ' + ' '.join(f'--exclude {e}' for e in excluded)

        source_folder = join(source.root_folder, folder)
        target_folder = join(target.root_folder, folder)

        if current_computer == source.name:
            command = f'rsync -vhru -e ' \
                      f'"ssh -p {target.port} -o StrictHostKeyChecking=no" ' \
                      f'{source_folder}/ ' \
                      f'{target.user}@{target.ip}:{target_folder}/ {end}'
        elif current_computer == target.name:
            command = f'rsync -vhru -e ' \
                      f'"ssh -p {source.port} -o StrictHostKeyChecking=no" ' \
                      f'{source.user}@{source.ip}:{source_folder}/ ' \
                      f'{target_folder}/ {end}'
        else:
            command = f'rsync -vhru -e ' \
                      f'"ssh -p {target.port} -o StrictHostKeyChecking=no" ' \
                      f' {source_folder}/ ' \
                      f'{target.user}@{target.ip}:{target_folder}/ {end}'

            command = f'ssh -p {source.port} ' \
                      f'{source.user}@{source.ip} "{command}"'

        logger.info(command, ComponentType.WorkerSupervisor, current_computer)
        subprocess.check_output(command, shell=True)
Example #16
def error_handler(f):
    name = f.__name__
    wrapper_vars = {'session': Session.create_session(key=name)}
    wrapper_vars['logger'] = create_logger(wrapper_vars['session'], name)

    hostname = socket.gethostname()

    def wrapper():
        try:
            f(wrapper_vars['session'], wrapper_vars['logger'])
        except Exception as e:
            if Session.sqlalchemy_error(e):
                Session.cleanup(name)

                wrapper_vars['session'] = Session.create_session(key=name)
                wrapper_vars['logger'] = create_logger(wrapper_vars['session'],
                                                       name)

            wrapper_vars['logger'].error(traceback.format_exc(),
                                         ComponentType.WorkerSupervisor,
                                         hostname)

    return wrapper
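
A usage sketch for error_handler above: the wrapped function must accept (session, logger), which the decorator supplies from wrapper_vars; heartbeat is a hypothetical name:

@error_handler
def heartbeat(session, logger):
    logger.info('alive', ComponentType.WorkerSupervisor)

heartbeat()  # exceptions are logged; SQLAlchemy failures rebuild the session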
Example #17
    def decorated(*args, **kwargs):
        global _read_session, _write_session, logger

        success = True
        status = 200
        error = ''

        try:
            res = f(*args, **kwargs)
        except Exception as e:
            if Session.sqlalchemy_error(e):
                Session.cleanup('server.read')
                Session.cleanup('server.write')

                _read_session = Session.create_session(key='server.read')
                _write_session = Session.create_session(key='server.write')

                logger = create_logger(_write_session, __name__)

            logger.error(
                f'Requested Url: {request.path}\n\n{traceback.format_exc()}',
                ComponentType.API
            )

            error = traceback.format_exc()
            success = False
            status = 500
            res = None

        res = res or {}
        if isinstance(res, Response):
            return res

        res['success'] = success
        res['error'] = error

        return Response(json.dumps(res), status=status)
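
decorated above is the inner function of a view decorator whose outer function is not shown in this excerpt. A hedged sketch of how a wrapped view behaves, with safe_call as a hypothetical name for that outer decorator:

@app.route('/ping', methods=['GET'])
@safe_call  # hypothetical name for the decorator built around `decorated`
def ping():
    # the wrapper adds 'success' and 'error' keys and wraps the dict in a
    # Response; an unhandled exception yields status 500 with the traceback
    return {'status': 'ok'}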
Example #18
File: sync.py Project: shlemph/mlcomp
class FileSync:
    session = Session.create_session(key='FileSync')
    logger = create_logger(session, 'FileSync')

    def sync_manual(self, computer: Computer, provider: ComputerProvider):
        """
        button sync was clicked manually
        """
        if not computer.meta:
            return

        meta = yaml_load(computer.meta)
        if 'manual_sync' not in meta:
            return

        manual_sync = meta['manual_sync']

        project_provider = ProjectProvider(self.session)
        docker_provider = DockerProvider(self.session)

        dockers = docker_provider.get_online()
        project = project_provider.by_id(manual_sync['project'])

        for docker in dockers:
            if docker.computer == computer.name:
                continue

            source = provider.by_name(docker.computer)
            ignore_folders = [
                [join('models', project.name), []]
            ]
            sync_directed(self.session, target=computer, source=source,
                          ignore_folders=ignore_folders)

        del meta['manual_sync']
        computer.meta = yaml_dump(meta)
        provider.update()

    def sync(self):
        hostname = socket.gethostname()
        try:
            provider = ComputerProvider(self.session)
            task_synced_provider = TaskSyncedProvider(self.session)

            computer = provider.by_name(hostname)
            sync_start = now()

            if FILE_SYNC_INTERVAL == 0:
                time.sleep(1)
            else:
                self.sync_manual(computer, provider)

                computers = provider.all_with_last_activtiy()
                computers = [
                    c for c in computers
                    if (now() - c.last_activity).total_seconds() < 10
                ]
                computers_names = {c.name for c in computers}

                for c, project, tasks in task_synced_provider.for_computer(
                        computer.name):
                    if c.sync_with_this_computer:
                        if c.name not in computers_names:
                            self.logger.info(f'Computer = {c.name} '
                                             f'is offline. Cannot sync',
                                             ComponentType.WorkerSupervisor,
                                             hostname)
                            continue

                        if c.syncing_computer:
                            continue

                        ignore_folders = [
                            [join('models', project.name), []]
                        ]

                        computer.syncing_computer = c.name
                        provider.update()

                        sync_directed(self.session, c, computer,
                                      ignore_folders)

                    for t in tasks:
                        task_synced_provider.add(
                            TaskSynced(computer=computer.name, task=t.id)
                        )

                    time.sleep(FILE_SYNC_INTERVAL)

            computer.last_synced = sync_start
            computer.syncing_computer = None
            provider.update()
        except Exception as e:
            if Session.sqlalchemy_error(e):
                Session.cleanup('FileSync')
                self.session = Session.create_session(key='FileSync')
                self.logger = create_logger(self.session, 'FileSync')

            self.logger.error(
                traceback.format_exc(), ComponentType.WorkerSupervisor,
                hostname
            )
Example #19
File: kaggle.py Project: shlemph/mlcomp
import socket
from enum import Enum

from kaggle.models import DatasetNewRequest

from mlcomp.db.core import Session
from mlcomp.db.enums import ComponentType
from mlcomp.db.providers import ModelProvider
from mlcomp.worker.executors.base.equation import Equation
from mlcomp.worker.executors.base.executor import Executor
from mlcomp.utils.logging import create_logger
from mlcomp.utils.config import Config

try:
    from kaggle import api
except OSError:
    logger = create_logger(Session.create_session(), __name__)
    logger.warning(
        'Could not find kaggle.json. '
        'Kaggle executors cannot be used', ComponentType.Worker,
        socket.gethostname())


class DownloadType(Enum):
    Kaggle = 0
    Link = 1


@Executor.register
class Download(Executor):
    def __init__(self,
                 output: str,
Example #20
File: app.py Project: xang1234/mlcomp
from mlcomp.server.back.supervisor import register_supervisor
from mlcomp.utils.logging import create_logger
from mlcomp.utils.io import from_module_path, zip_folder
from mlcomp.server.back.create_dags import dag_model_add, dag_model_start
from mlcomp.utils.misc import to_snake, now
from mlcomp.db.models import Model, Report, ReportLayout, Task
from mlcomp.utils.io import yaml_load, yaml_dump
from mlcomp.worker.storage import Storage

app = Flask(__name__)
CORS(app)

_read_session = Session.create_session(key='server.read')
_write_session = Session.create_session(key='server.write')

logger = create_logger(_write_session, __name__)


@app.route('/', defaults={'path': ''}, methods=['GET'])
@app.route('/<path:path>', methods=['GET'])
def send_static(path):
    file = 'index.html'
    if '.' in path:
        file = path

    module_path = from_module_path(__file__, '../front/dist/mlcomp/')
    return send_from_directory(module_path, file)


def request_data():
    return json.loads(request.data.decode('utf-8'))
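
A minimal usage sketch for request_data above, assuming json and flask's Response are imported as elsewhere in this file; the /echo route is illustrative:

@app.route('/echo', methods=['POST'])
def echo():
    data = request_data()  # JSON-decoded request body
    return Response(json.dumps(data), status=200)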