Example #1
    def main(self, args):

        import aetros.const
        from aetros.backend import JobBackend

        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawTextHelpFormatter,
            prog=aetros.const.__prog__ + ' upload-weights')
        parser.add_argument('id', help='model name or job id')
        parser.add_argument('weights', help="Weights path")
        parser.add_argument(
            '--api-key',
            help="Secure key. Alternatively use API_KEY environment variable.")
        parser.add_argument(
            '--kpi', help="You can overwrite or set the KPI for this job")
        parser.add_argument(
            '--latest',
            action="store_true",
            help="Instead of best epoch we upload latest weights.")
        parsed_args = parser.parse_args(args)

        if not parsed_args.id or not parsed_args.weights:
            parser.print_help()
            return

        job_backend = JobBackend(api_key=parsed_args.api_key)

        if '/' in parsed_args.id and '@' not in parsed_args.id:
            job_backend.create(parsed_args.id)

        job_backend.load(parsed_args.id)

        if job_backend.job is None:
            raise Exception("Job not found")

        weights_path = parsed_args.weights

        if not os.path.exists(weights_path):
            raise Exception('Weights file does not exist at ' + weights_path)

        print("Uploading weights to %s of %s ..." %
              (job_backend.job_id, job_backend.model_id))

        job_backend.upload_weights(
            'weights.hdf5', weights_path,
            float(parsed_args.kpi) if parsed_args.kpi else None)
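
A quick, self-contained sketch of how the argument definitions above resolve (the prog name and values are illustrative, not from the excerpt): the positional id and weights arguments are required, while --kpi stays optional and arrives as a string that the command later casts to float.

    import argparse

    parser = argparse.ArgumentParser(prog='aetros upload-weights')
    parser.add_argument('id', help='model name or job id')
    parser.add_argument('weights', help='Weights path')
    parser.add_argument('--kpi')
    parser.add_argument('--latest', action='store_true')

    args = parser.parse_args(['owner/mnist-cnn', './weights.hdf5', '--kpi', '0.93'])
    assert args.id == 'owner/mnist-cnn'
    assert float(args.kpi) == 0.93  # the cast mirrors the upload_weights() call above
    assert args.latest is False
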
Example #2
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
                                         prog=aetros.const.__prog__ + ' run')
        parser.add_argument('command', nargs='?', help="The command to run. Defaults to the value in the configuration file.")
        parser.add_argument('-i', '--image', help="Which Docker image to use for the command. Defaults to the value in the configuration file. If not specified, the command is executed on the host.")
        parser.add_argument('--no-image', action='store_true', help="Forces Docker not to be used, even when an image is defined in the configuration file.")

        parser.add_argument('-s', '--server', action='append', help="Limits the server pool to this server. Default is no limitation or the value in the configuration file. Multiple --server allowed.")
        parser.add_argument('-m', '--model', help="Under which model this job should be listed. Defaults to the value in the configuration file.")
        parser.add_argument('-l', '--local', action='store_true', help="Starts the job immediately on the current machine.")
        parser.add_argument('-c', '--config', help="Path to the configuration file. Default is aetros.yml in the current working directory.")
        parser.add_argument('--priority', help="Increases or decreases priority. Default is 0.")

        parser.add_argument('--cpu', help="How many CPU cores should be assigned to the job. Docker only.")
        parser.add_argument('--memory', help="How much memory should be assigned to the job. Docker only.")
        parser.add_argument('--gpu', help="How many GPU cards should be assigned to the job. Docker only.")
        parser.add_argument('--gpu_memory', help="Memory requirement for the GPU. Docker only.")

        parser.add_argument('--offline', '-o', action='store_true', help="Whether the execution should happen offline.")

        parser.add_argument('--rebuild-image', action='store_true', help="Makes sure the Docker image is rebuilt without cache.")

        parser.add_argument('--max-time', help="Limits execution time in seconds. Sends SIGINT to the process group when reached.")
        parser.add_argument('--max-epochs', help="Limits execution epochs. Sends SIGINT to the process group when reached.")

        parser.add_argument('--gpu-device', action='append', help="Which device id should be mapped into the NVIDIA Docker container. Only with --local.")

        parser.add_argument('--volume', '-v', action='append', help="Volume to mount into Docker. Only with --local.")
        parser.add_argument('-e', action='append', help="Sets additional environment variables. '-e name=value' to set a value, or '-e name' to read it from the current env.")

        parser.add_argument('-p', '--param', action='append', help="Sets a hyperparameter, e.g. '--param name=value'. Multiple --param allowed.")

        parsed_args = parser.parse_args(args)

        if parsed_args.config and not os.path.exists(parsed_args.config):
            self.logger.error("fatal: file %s does not exist." % (parsed_args.config,))
            sys.exit(2)

        config = find_config(parsed_args.config)
        home_config = read_home_config()

        if config['model'] and not parsed_args.model:
            parsed_args.model = config['model']

        if not parsed_args.model:
            print("fatal: no model defined. Use --model or switch into a directory where you executed 'aetros init model-name'.")
            sys.exit(2)

        if not parsed_args.local and parsed_args.volume:
            print("fatal: cannot use volumes with jobs on the cluster. Use datasets instead.")
            sys.exit(1)

        if parsed_args.local and parsed_args.priority:
            print("fatal: the priority can only be set for jobs in the cluster.")
            sys.exit(1)

        if config['image']:
            ensure_docker_installed(self.logger)

        env = {}
        if parsed_args.e:
            for item in parsed_args.e:
                if '=' in item:
                    # split only once, so values may themselves contain '='
                    k, v = item.split('=', 1)
                else:
                    k = item
                    v = os.getenv(k)
                env[k] = v

        if ('command' not in config or not config['command']) and not parsed_args.command:
            self.logger.error('No command given. Define the command in aetros.yml or pass it as an argument.')
            sys.exit(1)

        job_backend = JobBackend(parsed_args.model, self.logger)

        ignore = []
        if 'ignore' in config:
            ignore = config['ignore']
        job_backend.job = {'config': {'ignore': ignore}}

        adding_files = loading_text("- Adding job files to index ... ")
        files_added, size_added = job_backend.add_files(config['root'], report=False)
        adding_files("done with %d file%s added (%s)."
                     % (files_added, 's' if files_added != 1 else '', human_size(size_added, 2)))

        create_info = {
            'type': 'custom',
            'config': config
        }

        incoming_hyperparameter = {}
        if parsed_args.param:
            for param in parsed_args.param:
                if '=' not in param:
                    raise Exception('--param ' + param + ' does not contain a `=`. Please use "--param name=value"')

                name, value = param.split('=', 1)
                incoming_hyperparameter[name] = value

        # first transform the simple format into the full definition with parameter types
        # (string, number, group, choice_group, etc.)
        full_hyperparameters = lose_parameters_to_full(config['parameters'])

        # now extract hyperparameters from full definition, and overwrite stuff using
        # incoming_hyperparameter if available
        hyperparameter = extract_parameters(full_hyperparameters, incoming_hyperparameter)

        create_info['config']['parameters'] = hyperparameter

        if parsed_args.rebuild_image:
            create_info['config']['rebuild_image'] = True

        if parsed_args.max_epochs:
            create_info['config']['maxEpochs'] = int(parsed_args.max_epochs)

        create_info['config']['priority'] = 0
        if parsed_args.priority:
            create_info['config']['priority'] = float(parsed_args.priority)

        if parsed_args.max_time:
            create_info['config']['maxTime'] = float(parsed_args.max_time)

        if parsed_args.command:
            create_info['config']['command'] = parsed_args.command

        if parsed_args.image:
            # reset install options, since we can't be sure the base image still fits
            if 'image' in config and config['image'] and config['image'] != parsed_args.image:
                create_info['config']['install'] = None

            # reset dockerfile, since we manually specified an image
            create_info['config']['dockerfile'] = None
            create_info['config']['image'] = parsed_args.image

        if parsed_args.no_image:
            create_info['config']['image'] = None

        if parsed_args.server:
            create_info['config']['servers'] = []
            for name in parsed_args.server:
                create_info['config']['servers'].append(name)

        create_info['config']['resources'] = create_info['config'].get('resources', {})
        resources = create_info['config']['resources']

        default_cpu_and_memory = 1 if create_info['config']['image'] else 0
        resources['cpu'] = int(parsed_args.cpu or resources.get('cpu', default_cpu_and_memory))
        resources['memory'] = int(parsed_args.memory or resources.get('memory', default_cpu_and_memory))
        resources['gpu'] = int(parsed_args.gpu or resources.get('gpu', 0))
        resources['gpu_memory'] = int(parsed_args.gpu_memory or resources.get('gpu_memory', 0))

        if parsed_args.local:
            create_info['server'] = 'local'

            # make sure we do not limit the resources to something that is not available on the local machine
            warning = []
            cpu = cpuinfo.get_cpu_info()
            mem = psutil.virtual_memory().total
            gpu = 0
            try:
                gpu = len(get_ordered_devices())
            except CudaNotImplementedException:
                pass

            if not create_info['config']['image'] and not all([x == 0 for x in six.itervalues(resources)]):
                self.logger.warning("! No Docker virtualization since no `image` defined; resource limits ignored.")

            if create_info['config']['image'] and resources['gpu'] > 0:
                if not (sys.platform == "linux" or sys.platform == "linux2"):
                    self.logger.warning("! Your operating system does not support GPU allocation for "
                                        "Docker virtualization. "
                                        "NVIDIA-Docker2 is only supported on Linux.")

            local_max_resources = {'cpu': cpu['count'], 'memory': ceil(mem / 1024 / 1024 / 1024), 'gpu': gpu}

            if create_info['config']['image']:
                # read max hardware within Docker
                out = docker_call(['run', 'alpine', 'sh', '-c', 'nproc && cat /proc/meminfo | grep MemTotal'])
                cpus, memory = out.decode('utf-8').strip().split('\n')
                local_max_resources['cpu'] = int(cpus)

                memory = memory.replace('MemTotal:', '').replace('kB', '').strip()
                local_max_resources['memory'] = ceil(int(memory) / 1024 / 1024)

            if local_max_resources['cpu'] < resources['cpu']:
                warning.append('CPU cores %d -> %d' % (resources['cpu'], local_max_resources['cpu']))
                resources['cpu'] = local_max_resources['cpu']

            if local_max_resources['memory'] < resources['memory']:
                warning.append('memory %dGB -> %dGB' % (resources['memory'], local_max_resources['memory']))
                resources['memory'] = local_max_resources['memory']

            if local_max_resources['gpu'] < resources['gpu']:
                warning.append('GPU cards %d -> %d' % (resources['gpu'], local_max_resources['gpu']))
                resources['gpu'] = local_max_resources['gpu']

            if warning:
                self.logger.warning("! Resources downgrade due to missing hardware: %s." % (', '.join(warning),))

        if parsed_args.config and not create_info['config']['configPath']:
            create_info['config']['configPath'] = parsed_args.config

        create_info['config']['sourcesAttached'] = True

        creating_git_job = loading_text("- Create job in local Git ... ")
        if aetros.utils.git.get_current_commit_hash():
            create_info['origin_git_source'] = {
                'origin': aetros.utils.git.get_current_remote_url(),
                'author': aetros.utils.git.get_current_commit_author(),
                'message': aetros.utils.git.get_current_commit_message(),
                'branch': aetros.utils.git.get_current_branch(),
                'commit': aetros.utils.git.get_current_commit_hash(),
            }

        job_backend.create(create_info=create_info, server=None)
        creating_git_job("created %s in %s." % (job_backend.job_id[0:9], job_backend.model_name))

        summary = "➤ Summary: Job running "
        if parsed_args.local:
            summary += 'locally'
        else:
            summary += 'on the cluster'

        if create_info['config']['image']:
            summary += ' in Docker using image %s with %d CPU cores, %dGB memory and %d GPUs.' \
                       % (create_info['config']['image'], resources['cpu'], resources['memory'], resources['gpu'])
        else:
            summary += ' on host using all available resources.'

        print(summary)

        # tasks = []
        #
        # if 'tasks' in config:
        #     for name, task_config in six.iteritems(config['tasks']):
        #         replica = 1
        #         if 'replica' in task_config:
        #             replica = int(task_config['replica'])
        #         for index in range(0, replica):
        #             tasks.append(job_backend.create_task(job_id, task_config, name, index))

        if parsed_args.offline:
            if not parsed_args.local:
                self.logger.warning("Can not create a remote job in offline mode.")
                sys.exit(1)

            self.logger.warning("Execution started offline.")
        else:
            adding_files = loading_text("- Connecting to "+home_config['host']+" ... ")
            if job_backend.connect():
                adding_files("connected.")
            else:
                parsed_args.offline = True
                adding_files("failed. Continue in offline mode.")

        if not parsed_args.offline:
            sys.stdout.write("- Uploading job data ... ")
            job_backend.git.push()
            job_backend.client.wait_until_queue_empty(['files'], clear_end=False)

            sys.stdout.write(" done.\n")

            link = "%smodel/%s/job/%s" % (home_config['url'], job_backend.model_name, job_backend.job_id)
            sys.__stdout__.write(u"➤ Monitor job at %s\n" % (link))

        if parsed_args.local:
            job_backend.start(collect_system=False, offline=parsed_args.offline, push=False)

            if not parsed_args.offline:
                job_backend.git.start_push_sync()

            cpus = create_info['config']['resources']['cpu']
            memory = create_info['config']['resources']['memory']

            if not parsed_args.gpu_device and create_info['config']['resources']['gpu'] > 0:
                # if 2 GPUs were requested and we have 3 GPUs with ids [0, 1, 2], gpu_device should be [0, 1]
                parsed_args.gpu_device = []
                for i in range(0, create_info['config']['resources']['gpu']):
                    parsed_args.gpu_device.append(i)

            start_command(self.logger, job_backend, env, parsed_args.volume, cpus=cpus, memory=memory, gpu_devices=parsed_args.gpu_device,
                offline=parsed_args.offline)
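
The '-e' handling above can be summarized as a small standalone helper; this sketch (the function name is illustrative, not from the source) shows why splitting only on the first '=' matters when values themselves contain '='.

    import os

    def parse_env_flags(items):
        """Mimics the '-e' loop: 'name=value' sets a value, bare 'name' copies it from the current env."""
        env = {}
        for item in items or []:
            if '=' in item:
                k, v = item.split('=', 1)  # split once, so 'OPTS=a=b' keeps 'a=b' intact
            else:
                k, v = item, os.getenv(item)
            env[k] = v
        return env

    print(parse_env_flags(['MODE=debug', 'OPTS=a=b', 'HOME']))
    # {'MODE': 'debug', 'OPTS': 'a=b', 'HOME': '/home/...'}
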
Example #3
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
                                         prog=aetros.const.__prog__ + ' run')
        parser.add_argument('command', nargs='?', help="The command to run. Defaults to the value in aetros.yml.")
        parser.add_argument('-i', '--image', help="Which Docker image to use for the command. Defaults to the value in aetros.yml. If not specified, the command is executed on the host.")
        parser.add_argument('-s', '--server', action='append', help="Limits the server pool to this server. Default is no limitation or the value in aetros.yml. Multiple --server allowed.")
        parser.add_argument('-m', '--model', help="Under which model this job should be listed. Defaults to the value in aetros.yml.")
        parser.add_argument('-l', '--local', action='store_true', help="Starts the job immediately on the current machine.")
        parser.add_argument('-c', '--config', help="Path to the configuration file. Default is aetros.yml in the current working directory.")
        parser.add_argument('--priority', help="Increases or decreases priority. Default is 0.")

        parser.add_argument('--cpu', help="How many CPU cores should be assigned to the job. Docker only.")
        parser.add_argument('--memory', help="How much memory should be assigned to the job. Docker only.")
        parser.add_argument('--gpu', help="How many GPU cards should be assigned to the job. Docker only.")
        parser.add_argument('--gpu_memory', help="Memory requirement for the GPU. Docker only.")

        parser.add_argument('--max-time', help="Limits execution time in seconds. Sends SIGINT to the process group when reached.")
        parser.add_argument('--max-epochs', help="Limits execution epochs. Sends SIGINT to the process group when reached.")

        parser.add_argument('--gpu-device', action='append', help="Which device id should be mapped into the NVIDIA Docker container.")

        parser.add_argument('--volume', '-v', action='append', help="Volume to mount into Docker.")
        parser.add_argument('-e', action='append', help="Sets additional environment variables. '-e name=value' to set a value, or '-e name' to read it from the current env.")

        parser.add_argument('-p', '--param', action='append', help="Sets a hyperparameter, e.g. '--param name=value'. Multiple --param allowed.")

        parsed_args = parser.parse_args(args)

        config = read_config(parsed_args.config or 'aetros.yml')

        env = {}
        if parsed_args.e:
            for item in parsed_args.e:
                if '=' in item:
                    # split only once, so values may themselves contain '='
                    k, v = item.split('=', 1)
                else:
                    k = item
                    v = os.getenv(k)
                env[k] = v

        if 'command' not in config and not parsed_args.command:
            self.logger.error('No "command" given in aetros.yml or as argument.')
            sys.exit(1)

        job = JobBackend(parsed_args.model, self.logger, parsed_args.config or 'aetros.yml')
        ignore = []
        if 'ignore' in config:
            ignore = config['ignore']
        job.job = {'config': {'ignore': ignore}}

        files_added, size_added = job.add_files()

        print("%d files added (%s)" % (files_added, human_size(size_added, 2)))

        create_info = {
            'type': 'custom',
            'config': config
        }

        incoming_hyperparameter = {}
        if parsed_args.param:
            for param in parsed_args.param:
                if '=' not in param:
                    raise Exception('--param ' + param + ' does not contain a `=`. Please use "--param name=value"')

                name, value = param.split('=', 1)
                incoming_hyperparameter[name] = value

        # first transform the simple format into the full definition with parameter types
        # (string, number, group, choice_group, etc.)
        full_hyperparameters = lose_parameters_to_full(config['parameters'])

        # now extract hyperparameters from full definition, and overwrite stuff using
        # incoming_hyperparameter if available
        hyperparameter = extract_parameters(full_hyperparameters, incoming_hyperparameter)

        create_info['config']['parameters'] = hyperparameter

        if parsed_args.max_epochs:
            create_info['config']['maxEpochs'] = int(parsed_args.max_epochs)

        create_info['config']['priority'] = 0
        if parsed_args.priority:
            create_info['config']['priority'] = float(parsed_args.priority)

        if parsed_args.max_time:
            create_info['config']['maxTime'] = float(parsed_args.max_time)

        if parsed_args.command:
            create_info['config']['command'] = parsed_args.command

        if parsed_args.image:
            # reset install options, since we can't be sure the base image still fits
            if 'image' in config and config['image'] and config['image'] != parsed_args.image:
                create_info['config']['install'] = None

            # reset dockerfile, since we manually specified an image
            create_info['config']['dockerfile'] = None
            create_info['config']['image'] = parsed_args.image

        if parsed_args.server:
            create_info['config']['servers'] = []
            for name in parsed_args.server:
                create_info['config']['servers'].append(name)

        if 'resources' not in create_info['config']:
            create_info['config']['resources'] = {}

        if parsed_args.cpu or parsed_args.memory or parsed_args.gpu is not None or parsed_args.gpu_memory:
            if parsed_args.cpu: create_info['config']['resources']['cpu'] = float(parsed_args.cpu)
            if parsed_args.memory: create_info['config']['resources']['memory'] = float(parsed_args.memory)
            # the GPU count must be an integer, since it is expanded via [1] * gpu below
            if parsed_args.gpu is not None: create_info['config']['resources']['gpu'] = int(parsed_args.gpu)
            if parsed_args.gpu_memory: create_info['config']['resources']['gpu_memory'] = float(parsed_args.gpu_memory)

        if parsed_args.local:
            # usually, the aetros server would assign resources at the job root level from the assigned server,
            # but since it's started locally, we just use the requested ones. The user should know what they do.
            # start.py uses the 'config' values anyway for the Docker limits, so we should make sure they are
            # being displayed.

            if 'image' in create_info['config'] and create_info['config']['image']:
                resources = create_info['config']['resources']
                create_info['resources_assigned'] = {'cpus': 1, 'memory': 1, 'gpus': []}

                if 'gpu' in resources and resources['gpu'] > 0:
                    create_info['resources_assigned']['gpus'] = [1] * resources['gpu']
                if 'cpu' in resources:
                    create_info['resources_assigned']['cpus'] = resources['cpu']
                if 'memory' in resources:
                    create_info['resources_assigned']['memory'] = resources['memory']
            else:
                # since this runs on the host, extract the machine hardware and put it into
                # resources_assigned so we see it at the job.
                pass

        if parsed_args.local:
            create_info['server'] = 'local'

        create_info['config']['sourcesAttached'] = True

        if aetros.utils.git.get_current_commit_hash():
            create_info['origin_git_source'] = {
                'origin': aetros.utils.git.get_current_remote_url(),
                'author': aetros.utils.git.get_current_commit_author(),
                'message': aetros.utils.git.get_current_commit_message(),
                'branch': aetros.utils.git.get_current_branch(),
                'commit': aetros.utils.git.get_current_commit_hash(),
            }

        job.create(create_info=create_info, server=None)

        print("Job %s/%s created." % (job.model_name, job.job_id))

        if parsed_args.local:
            start(self.logger, job.model_name + '/' + job.job_id, fetch=False, env=env, volumes=parsed_args.volume, gpu_devices=parsed_args.gpu_device)
        else:
            if parsed_args.volume:
                print("Cannot use volumes with jobs on the cluster. Use datasets instead.")
                sys.exit(1)

            # TODO: make it visible
            job.git.push()
            print("Open http://%s/model/%s/job/%s to monitor it." % (job.host, job.model_name, job.job_id))
Example #4
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawTextHelpFormatter,
            prog=aetros.const.__prog__ + ' run')
        parser.add_argument(
            'command', nargs='?',
            help="The command to run. Defaults to the value in the configuration file.")
        parser.add_argument(
            '-i', '--image',
            help="Which Docker image to use for the command. Defaults to the value in the "
                 "configuration file. If not specified, the command is executed on the host.")
        parser.add_argument(
            '--no-image', action='store_true',
            help="Forces Docker not to be used, even when an image is defined in the configuration file.")

        parser.add_argument(
            '-s', '--server', action='append',
            help="Limits the server pool to this server. Default is no limitation or the value "
                 "in the configuration file. Multiple --server allowed.")
        parser.add_argument(
            '-m', '--model',
            help="Under which model this job should be listed. Defaults to the value in the configuration file.")
        parser.add_argument(
            '-l', '--local', action='store_true',
            help="Starts the job immediately on the current machine.")
        parser.add_argument(
            '-c', '--config',
            help="Path to the configuration file. Default is aetros.yml in the current working directory.")
        parser.add_argument(
            '--priority',
            help="Increases or decreases priority. Default is 0.")

        parser.add_argument(
            '--cpu', help="How many CPU cores should be assigned to the job. Docker only.")
        parser.add_argument(
            '--memory', help="How much memory should be assigned to the job. Docker only.")
        parser.add_argument(
            '--gpu', help="How many GPU cards should be assigned to the job. Docker only.")
        parser.add_argument(
            '--gpu_memory', help="Memory requirement for the GPU. Docker only.")

        parser.add_argument(
            '--offline', '-o', action='store_true',
            help="Whether the execution should happen offline.")

        parser.add_argument(
            '--rebuild-image', action='store_true',
            help="Makes sure the Docker image is rebuilt without cache.")

        parser.add_argument(
            '--max-time',
            help="Limits execution time in seconds. Sends SIGINT to the process group when reached.")
        parser.add_argument(
            '--max-epochs',
            help="Limits execution epochs. Sends SIGINT to the process group when reached.")

        parser.add_argument(
            '--gpu-device', action='append',
            help="Which device id should be mapped into the NVIDIA Docker container. Only with --local.")

        parser.add_argument(
            '--volume', '-v', action='append',
            help="Volume to mount into Docker. Only with --local.")
        parser.add_argument(
            '-e', action='append',
            help="Sets additional environment variables. '-e name=value' to set a value, "
                 "or '-e name' to read it from the current env.")

        parser.add_argument(
            '-p', '--param', action='append',
            help="Sets a hyperparameter, e.g. '--param name=value'. Multiple --param allowed.")

        parsed_args = parser.parse_args(args)

        if parsed_args.config and not os.path.exists(parsed_args.config):
            self.logger.error("fatal: file %s does not exist." %
                              (parsed_args.config, ))
            sys.exit(2)

        config = find_config(parsed_args.config)
        home_config = read_home_config()

        if config['model'] and not parsed_args.model:
            parsed_args.model = config['model']

        if not parsed_args.model:
            print("fatal: no model defined. Use --model or switch into a directory "
                  "where you executed 'aetros init model-name'.")
            sys.exit(2)

        if not parsed_args.local and parsed_args.volume:
            print("fatal: cannot use volumes with jobs on the cluster. Use datasets instead.")
            sys.exit(1)

        if parsed_args.local and parsed_args.priority:
            print("fatal: the priority can only be set for jobs in the cluster.")
            sys.exit(1)

        if config['image']:
            ensure_docker_installed(self.logger)

        env = {}
        if parsed_args.e:
            for item in parsed_args.e:
                if '=' in item:
                    # split only once, so values may themselves contain '='
                    k, v = item.split('=', 1)
                else:
                    k = item
                    v = os.getenv(k)
                env[k] = v

        if ('command' not in config or not config['command']) and not parsed_args.command:
            self.logger.error('No command given. Define the command in aetros.yml or pass it as an argument.')
            sys.exit(1)

        job_backend = JobBackend(parsed_args.model, self.logger)

        ignore = []
        if 'ignore' in config:
            ignore = config['ignore']
        job_backend.job = {'config': {'ignore': ignore}}

        adding_files = loading_text("- Adding job files to index ... ")
        files_added, size_added = job_backend.add_files(config['root'],
                                                        report=False)
        adding_files("done with %d file%s added (%s)." %
                     (files_added, 's' if files_added != 1 else '',
                      human_size(size_added, 2)))

        create_info = {'type': 'custom', 'config': config}

        incoming_hyperparameter = {}
        if parsed_args.param:
            for param in parsed_args.param:
                if '=' not in param:
                    raise Exception('--param ' + param +
                                    ' does not contain a `=`. Please use "--param name=value"')

                name, value = param.split('=', 1)
                incoming_hyperparameter[name] = value

        # first transform the simple format into the full definition with parameter types
        # (string, number, group, choice_group, etc.)
        full_hyperparameters = lose_parameters_to_full(config['parameters'])

        # now extract hyperparameters from full definition, and overwrite stuff using
        # incoming_hyperparameter if available
        hyperparameter = extract_parameters(full_hyperparameters,
                                            incoming_hyperparameter)

        create_info['config']['parameters'] = hyperparameter

        if parsed_args.rebuild_image:
            create_info['config']['rebuild_image'] = True

        if parsed_args.max_epochs:
            create_info['config']['maxEpochs'] = int(parsed_args.max_epochs)

        create_info['config']['priority'] = 0
        if parsed_args.priority:
            create_info['config']['priority'] = float(parsed_args.priority)

        if parsed_args.max_time:
            create_info['config']['maxTime'] = float(parsed_args.max_time)

        if parsed_args.command:
            create_info['config']['command'] = parsed_args.command

        if parsed_args.image:
            # reset install options, since we can't be sure the base image still fits
            if 'image' in config and config['image'] and config['image'] != parsed_args.image:
                create_info['config']['install'] = None

            # reset dockerfile, since we manually specified an image
            create_info['config']['dockerfile'] = None
            create_info['config']['image'] = parsed_args.image

        if parsed_args.no_image:
            create_info['config']['image'] = None

        if parsed_args.server:
            create_info['config']['servers'] = []
            for name in parsed_args.server:
                create_info['config']['servers'].append(name)

        create_info['config']['resources'] = create_info['config'].get('resources', {})
        resources = create_info['config']['resources']

        default_cpu_and_memory = 1 if create_info['config']['image'] else 0
        resources['cpu'] = int(parsed_args.cpu or resources.get('cpu', default_cpu_and_memory))
        resources['memory'] = int(parsed_args.memory or resources.get('memory', default_cpu_and_memory))
        resources['gpu'] = int(parsed_args.gpu or resources.get('gpu', 0))
        resources['gpu_memory'] = int(parsed_args.gpu_memory or resources.get('gpu_memory', 0))

        if parsed_args.local:
            create_info['server'] = 'local'

            # make sure we do not limit the resources to something that is not available on the local machine
            warning = []
            cpu = cpuinfo.get_cpu_info()
            mem = psutil.virtual_memory().total
            gpu = 0
            try:
                gpu = len(get_ordered_devices())
            except CudaNotImplementedException:
                pass

            if not create_info['config']['image'] and not all(
                    [x == 0 for x in six.itervalues(resources)]):
                self.logger.warning(
                    "! No Docker virtualization since no `image` defined; resource limits ignored.")

            if create_info['config']['image'] and resources['gpu'] > 0:
                if not (sys.platform == "linux" or sys.platform == "linux2"):
                    self.logger.warning(
                        "! Your operating system does not support GPU allocation for "
                        "Docker virtualization. "
                        "NVIDIA-Docker2 is only supported on Linux.")

            local_max_resources = {
                'cpu': cpu['count'],
                'memory': ceil(mem / 1024 / 1024 / 1024),
                'gpu': gpu
            }

            if create_info['config']['image']:
                # read max hardware within Docker
                out = docker_call([
                    'run', 'alpine', 'sh', '-c',
                    'nproc && cat /proc/meminfo | grep MemTotal'
                ])
                cpus, memory = out.decode('utf-8').strip().split('\n')
                local_max_resources['cpu'] = int(cpus)

                memory = memory.replace('MemTotal:', '').replace('kB', '').strip()
                local_max_resources['memory'] = ceil(int(memory) / 1024 / 1024)

            if local_max_resources['cpu'] < resources['cpu']:
                warning.append('CPU cores %d -> %d' %
                               (resources['cpu'], local_max_resources['cpu']))
                resources['cpu'] = local_max_resources['cpu']

            if local_max_resources['memory'] < resources['memory']:
                warning.append(
                    'memory %dGB -> %dGB' %
                    (resources['memory'], local_max_resources['memory']))
                resources['memory'] = local_max_resources['memory']

            if local_max_resources['gpu'] < resources['gpu']:
                warning.append('GPU cards %d -> %d' %
                               (resources['gpu'], local_max_resources['gpu']))
                resources['gpu'] = local_max_resources['gpu']

            if warning:
                self.logger.warning(
                    "! Resources downgraded due to missing hardware: %s." %
                    (', '.join(warning),))

        if parsed_args.config and not create_info['config']['configPath']:
            create_info['config']['configPath'] = parsed_args.config

        create_info['config']['sourcesAttached'] = True

        creating_git_job = loading_text("- Create job in local Git ... ")
        if aetros.utils.git.get_current_commit_hash():
            create_info['origin_git_source'] = {
                'origin': aetros.utils.git.get_current_remote_url(),
                'author': aetros.utils.git.get_current_commit_author(),
                'message': aetros.utils.git.get_current_commit_message(),
                'branch': aetros.utils.git.get_current_branch(),
                'commit': aetros.utils.git.get_current_commit_hash(),
            }

        job_backend.create(create_info=create_info, server=None)
        creating_git_job("created %s in %s." %
                         (job_backend.job_id[0:9], job_backend.model_name))

        summary = "➤ Summary: Job running "
        if parsed_args.local:
            summary += 'locally'
        else:
            summary += 'on the cluster'

        if create_info['config']['image']:
            summary += ' in Docker using image %s with %d CPU cores, %dGB memory and %d GPUs.' \
                       % (create_info['config']['image'], resources['cpu'], resources['memory'], resources['gpu'])
        else:
            summary += ' on host using all available resources.'

        print(summary)

        # tasks = []
        #
        # if 'tasks' in config:
        #     for name, task_config in six.iteritems(config['tasks']):
        #         replica = 1
        #         if 'replica' in task_config:
        #             replica = int(task_config['replica'])
        #         for index in range(0, replica):
        #             tasks.append(job_backend.create_task(job_id, task_config, name, index))

        if parsed_args.offline:
            if not parsed_args.local:
                self.logger.warning("Cannot create a remote job in offline mode.")
                sys.exit(1)

            self.logger.warning("Execution started offline.")
        else:
            adding_files = loading_text("- Connecting to " +
                                        home_config['host'] + " ... ")
            if job_backend.connect():
                adding_files("connected.")
            else:
                parsed_args.offline = True
                adding_files("failed. Continue in offline mode.")

        if not parsed_args.offline:
            sys.stdout.write("- Uploading job data ... ")
            job_backend.git.push()
            job_backend.client.wait_until_queue_empty(['files'],
                                                      clear_end=False)

            sys.stdout.write(" done.\n")

            link = "%s/model/%s/job/%s" % (
                home_config['url'], job_backend.model_name, job_backend.job_id)
            sys.__stdout__.write(u"➤ Monitor job at %s\n" % (link))

        if parsed_args.local:
            job_backend.start(collect_system=False,
                              offline=parsed_args.offline,
                              push=False)

            if not parsed_args.offline:
                job_backend.git.start_push_sync()

            cpus = create_info['config']['resources']['cpu']
            memory = create_info['config']['resources']['memory']

            if not parsed_args.gpu_device and create_info['config'][
                    'resources']['gpu'] > 0:
                # if 2 GPUs were requested and we have 3 GPUs with ids [0, 1, 2], gpu_device should be [0, 1]
                parsed_args.gpu_device = []
                for i in range(0, create_info['config']['resources']['gpu']):
                    parsed_args.gpu_device.append(i)

            start_command(self.logger,
                          job_backend,
                          env,
                          parsed_args.volume,
                          cpus=cpus,
                          memory=memory,
                          gpu_devices=parsed_args.gpu_device,
                          offline=parsed_args.offline)
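
The --local branch above clamps the requested resources to what the machine actually offers and collects a warning per downgrade; here is the same logic isolated as a self-contained sketch (clamp_resources is an illustrative name, not from the source).

    def clamp_resources(requested, local_max):
        """Caps each requested resource at the local maximum, recording downgrades."""
        downgrades = []
        for key in ('cpu', 'memory', 'gpu'):
            if local_max[key] < requested[key]:
                downgrades.append('%s %d -> %d' % (key, requested[key], local_max[key]))
                requested[key] = local_max[key]
        return downgrades

    requested = {'cpu': 8, 'memory': 32, 'gpu': 2}
    local_max = {'cpu': 4, 'memory': 16, 'gpu': 0}
    print(clamp_resources(requested, local_max))
    # ['cpu 8 -> 4', 'memory 32 -> 16', 'gpu 2 -> 0']; requested is clamped in place
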
Example #5
class KerasIntegration():
    def __init__(self,
                 id,
                 model,
                 api_key,
                 insights=False,
                 confusion_matrix=False,
                 insight_sample=None):
        """

        :type id: basestring The actual model name available in AETROS Trainer. Example peter/mnist-cnn
        :type insights: bool
        :type confusion_matrix: bool
        :type insight_sample: basestring|None A path to a sample which is being used for the insights. Default is first sample of data_validation.
        """
        self.confusion_matrix = confusion_matrix
        self.model = model

        if isinstance(model, Sequential) and not model.built:
            raise Exception('Sequential model is not built.')

        self.insight_sample = insight_sample
        self.id = id
        self.insights = insights
        self.model_type = 'custom'

        self.job_backend = JobBackend(api_token=api_key)

        copy = {
            'fit': self.model.fit,
            'fit_generator': self.model.fit_generator
        }

        def overwritten_fit(x,
                            y,
                            batch_size=32,
                            nb_epoch=10,
                            verbose=1,
                            callbacks=None,
                            validation_split=0.,
                            validation_data=None,
                            shuffle=True,
                            class_weight=None,
                            sample_weight=None,
                            **kwargs):

            # copy the list so the caller's callbacks (or a shared default) are never mutated
            callbacks = list(callbacks or [])
            callback = self.setup(x, nb_epoch, batch_size)
            callbacks.append(callback)
            copy['fit'](x, y, batch_size, nb_epoch, verbose, callbacks,
                        validation_split, validation_data, shuffle, class_weight,
                        sample_weight, **kwargs)

            self.end()

            self.end()

        def overwritten_fit_generator(generator,
                                      samples_per_epoch,
                                      nb_epoch,
                                      verbose=1,
                                      callbacks=None,
                                      validation_data=None,
                                      nb_val_samples=None,
                                      class_weight={},
                                      max_q_size=10,
                                      nb_worker=1,
                                      pickle_safe=False):

            # copy the list so the caller's callbacks (or a shared default) are never mutated
            callbacks = list(callbacks or [])
            callback = self.setup(generator, nb_epoch)
            self.trainer.nb_val_samples = nb_val_samples
            self.trainer.data_validation = validation_data
            callbacks.append(callback)

            copy['fit_generator'](generator, samples_per_epoch, nb_epoch,
                                  verbose, callbacks, validation_data,
                                  nb_val_samples, class_weight, max_q_size,
                                  nb_worker, pickle_safe)
            self.end()

        self.model.fit = overwritten_fit
        self.model.fit_generator = overwritten_fit_generator

    def setup(self, x=None, nb_epoch=1, batch_size=16):
        graph = self.model_to_graph(self.model)

        from keras.preprocessing.image import Iterator

        if isinstance(x, Iterator):
            batch_size = x.batch_size

        settings = {
            'epochs': nb_epoch,
            'batchSize': batch_size,
            'optimizer': type(self.model.optimizer).__name__
                         if hasattr(self.model, 'optimizer') else ''
        }

        self.job_backend.ensure_model(self.id,
                                      self.model.to_json(),
                                      settings=settings,
                                      type=self.model_type,
                                      graph=graph)

        job_id = self.job_backend.create(self.id, insights=self.insights)
        self.job_backend.start()

        print(
            "AETROS job '%s' created and started. Open http://%s/trainer/app#/job=%s to monitor the training."
            % (job_id, self.job_backend.host, job_id))

        job = self.job_backend.load_light_job()
        general_logger = GeneralLogger(job, job_backend=self.job_backend)
        self.trainer = Trainer(self.job_backend, general_logger)

        self.monitoringThread = MonitoringThread(self.job_backend,
                                                 self.trainer)
        self.monitoringThread.daemon = True
        self.monitoringThread.start()

        self.trainer.model = self.model
        self.trainer.data_train = {'x': x}

        self.callback = KerasLogger(self.trainer, self.job_backend,
                                    general_logger)
        self.callback.log_epoch = False
        self.callback.model = self.model
        self.callback.confusion_matrix = self.confusion_matrix

        return self.callback

    def publish(self):
        graph = self.model_to_graph(self.model)
        self.job_backend.ensure_model(self.id,
                                      self.model.to_json(),
                                      type=self.model_type,
                                      graph=graph)

    def start(self, nb_epoch=1, nb_sample=1, title="TRAINING"):
        """
        Starts $title
        :return:
        """

        self.setup(nb_epoch)
        self.callback.params['nb_epoch'] = nb_epoch
        self.callback.params['nb_sample'] = nb_sample
        self.callback.on_train_begin()

        return self.callback

    def batch_begin(self, batch, size):
        logs = {
            'batch': batch,
            'size': size,
        }
        self.callback.on_batch_begin(batch, logs)

    def batch_end(self, batch, size, loss=0, acc=0):

        logs = {
            'loss': loss,
            'acc': acc,
            'batch': batch,
            'size': size,
        }
        self.callback.on_batch_end(batch, logs)

    def epoch_end(self, epoch, loss=0, val_loss=0, acc=0, val_acc=0):
        """

        :type epoch: integer starting with 0
        """
        logs = {
            'loss': loss,
            'val_loss': val_loss,
            'acc': acc,
            'val_acc': val_acc,
            'epoch': epoch
        }
        self.callback.on_epoch_end(epoch, logs)

    def end(self):
        self.monitoringThread.stop()
        self.job_backend.sync_weights()
        self.job_backend.set_status('DONE')

    def model_to_graph(self, model):
        graph = {'nodes': [], 'links': [], 'groups': []}

        map = {'idx': {}, 'flatten': [], 'group_pointer': -1}

        def layer_to_dict(layer):
            info = {}

            if isinstance(layer, Dropout):
                info['dropout'] = layer.p

            if isinstance(layer, Dense):
                info['neurons'] = layer.output_dim
                info['activation'] = layer.activation.__name__

            if isinstance(layer, Convolution2D):
                info['receptiveField'] = [layer.nb_col, layer.nb_row]
                info['features'] = layer.nb_filter

            if isinstance(layer, MaxPooling2D):
                info['poolingArea'] = [layer.pool_size[0], layer.pool_size[1]]

            if isinstance(layer, Embedding):
                info['inputDim'] = layer.input_dim
                info['outputDim'] = layer.output_dim
                info['dropout'] = layer.dropout

            if isinstance(layer, Activation):
                info['activation'] = layer.activation.__name__

            if isinstance(layer, Merge):
                info['mode'] = layer.mode

            if isinstance(layer, RepeatVector):
                info['n'] = layer.n

            if isinstance(layer, InputLayer):
                info['inputShape'] = layer.input_shape

            info['outputShape'] = layer.output_shape

            return {
                'name': layer.name,
                'class': type(layer).__name__,
                'width': 60,
                'height': 40,
                'info': info
            }

        def add_layer(layer):
            graph['nodes'].append(layer_to_dict(layer))
            map['flatten'].append(layer)
            map['idx'][layer.name] = len(graph['nodes']) - 1
            # if map['group_pointer'] >= 0:
            #     graph['groups'][map['group_pointer']].append(len(graph['nodes'])-1)

        def get_idx(layer):
            return map['idx'][layer.name]

        def extract_layers(layers):
            for layer in layers:
                if layer not in map['flatten']:
                    add_layer(layer)
                    if hasattr(layer, 'layers') and isinstance(
                            layer.layers, list):
                        # graph['groups'].append([])
                        # map['group_pointer'] += 1
                        extract_layers(layer.layers)
                        # map['group_pointer'] -= 1
                    else:
                        for inbound_node in layer.inbound_nodes:
                            extract_layers(inbound_node.inbound_layers)

        extract_layers(model.layers)

        # build edges
        for layer in map['flatten']:

            for inbound_node in layer.inbound_nodes:
                for inbound_layer in inbound_node.inbound_layers:
                    graph['links'].append({
                        'source': get_idx(inbound_layer),
                        'target': get_idx(layer),
                    })

            if hasattr(layer, 'layers') and isinstance(layer.layers, list):
                graph['links'].append({
                    'source': get_idx(layer.layers[-1]),
                    'target': get_idx(layer),
                })

        return graph

    def model_to_layers(self, model):
        layers = []

        # from keras.models import Sequential
        # if isinstance(model, Sequential):
        #     for layer in model.layers:
        #         layers[]

        # 'fc': 'Dense',
        # 'conv': 'Convolutional2D',
        # 'pool': 'MaxPooling2D',
        # 'pool_average': 'AveragePooling2D',
        # 'zero_padding': 'ZeroPadding2D',
        # 'upsampling': 'UpSampling2D',
        # 'flatten': 'Flatten',
        # 'merge': 'Merge',

        layer_type_map = {
            'InputLayer': 'fc',
            'Dense': 'fc',
            'Convolution2D': 'conv',
            'MaxPooling2D': 'pool',
            'AveragePooling2D': 'pool_average',
            'ZeroPadding2D': 'zero_padding',
            'UpSampling2D': 'upsampling',
            'Flatten': 'flatten',
            'Merge': 'merge',
        }

        def get_input_layer(layer):
            if isinstance(layer, Activation) or isinstance(layer, Dropout):
                return get_input_layer(
                    layer.inbound_nodes[0].inbound_layers[0])

            return layer

        for keras_layer in model.layers:
            name = type(keras_layer).__name__

            if name in layer_type_map:
                typeStr = layer_type_map[name]
            else:
                typeStr = name

            layer = {
                'id': keras_layer.name,
                'name': keras_layer.name,
                'type': typeStr,
                'connectedTo': [],
                'receptiveField': {
                    'width': 0,
                    'height': 0
                },
                'poolingArea': {
                    'width': 0,
                    'height': 0
                },
                'padding': [],
                'features': 0,
            }

            if isinstance(keras_layer, Convolution2D):
                layer['receptiveField']['width'] = keras_layer.nb_col
                layer['receptiveField']['height'] = keras_layer.nb_row
                layer['features'] = keras_layer.nb_filter
            if isinstance(keras_layer, MaxPooling2D):
                layer['poolingArea']['width'] = keras_layer.pool_size[0]
                layer['poolingArea']['height'] = keras_layer.pool_size[1]

            if isinstance(keras_layer, InputLayer):
                if len(keras_layer.input_shape) == 4:

                    # grayscale
                    if keras_layer.input_shape[1] == 1:
                        layer['inputType'] = 'image'
                        layer['width'] = keras_layer.input_shape[2]
                        layer['height'] = keras_layer.input_shape[3]

                    elif keras_layer.input_shape[1] == 3:
                        layer['inputType'] = 'image_rgb'
                        layer['width'] = keras_layer.input_shape[2]
                        layer['height'] = keras_layer.input_shape[3]

                elif len(keras_layer.input_shape) == 2:
                    layer['inputType'] = 'list'
                    layer['width'] = keras_layer.input_shape[1]
                    layer['height'] = 1
                else:
                    layer['inputType'] = 'custom'
                    layer['shape'] = keras_layer.input_shape

            if isinstance(keras_layer, Dense):
                layer['weight'] = keras_layer.output_dim

            if isinstance(keras_layer, Dropout):
                layers[-1][0]['dropout'] = keras_layer.p

                continue

            if isinstance(keras_layer, Activation):
                activation_function = str(keras_layer.activation)
                layers[-1][0][
                    'activationFunction'] = activation_function.split(' ')[1]

                continue

            for inbound_node in keras_layer.inbound_nodes:
                for inbound_layer in inbound_node.inbound_layers:
                    inbound_layer = get_input_layer(inbound_layer)
                    layer['connectedTo'].append(inbound_layer.name)

            layers.append([layer])

        return layers
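
The constructor above monkey-patches model.fit by stashing the original bound method in a dict and delegating to it after injecting the AETROS callback. This toy sketch shows the pattern in isolation (FakeModel and the string callback are stand-ins, not AETROS classes):

    class FakeModel(object):
        def fit(self, x, callbacks=None):
            print('fit called with callbacks:', callbacks)

    model = FakeModel()
    copy = {'fit': model.fit}               # keep a reference to the original bound method

    def overwritten_fit(x, callbacks=None):
        callbacks = list(callbacks or [])   # copy, so the caller's list is never mutated
        callbacks.append('aetros-logger')   # stand-in for the KerasLogger callback
        copy['fit'](x, callbacks=callbacks) # delegate to the original fit

    model.fit = overwritten_fit
    model.fit([1, 2, 3])                    # fit called with callbacks: ['aetros-logger']
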
Example #6
    def main(self, args):

        from aetros import keras_model_utils

        import aetros.const
        from aetros.backend import JobBackend
        from aetros.logger import GeneralLogger
        from aetros.Trainer import Trainer

        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawTextHelpFormatter,
            prog=aetros.const.__prog__ + ' upload-weights')
        parser.add_argument('id', nargs='?', help='model name or job id')
        parser.add_argument(
            '--secure-key',
            help="Secure key. Alternatively use the API_KEY environment variable.")
        parser.add_argument(
            '--weights',
            help="Weights path. By default we try to find it in the ./weights/ folder.")
        parser.add_argument(
            '--accuracy',
            help="If you specified a model name, you should also specify the accuracy these weights achieved.")
        parser.add_argument(
            '--latest',
            action="store_true",
            help="Upload the latest weights instead of those from the best epoch.")

        parsed_args = parser.parse_args(args)
        job_backend = JobBackend(api_token=parsed_args.secure_key)

        if '/' in parsed_args.id and '@' not in parsed_args.id:
            job_backend.create(parsed_args.id)

        job_backend.load(parsed_args.id)

        if job_backend.job is None:
            raise Exception("Job not found")

        job_model = job_backend.get_job_model()

        weights_path = job_model.get_weights_filepath_best()

        if parsed_args.weights:
            weights_path = parsed_args.weights

        print(("Validate weights in %s ..." % (weights_path, )))

        keras_model_utils.job_prepare(job_model)

        general_logger = GeneralLogger()
        trainer = Trainer(job_backend, general_logger)

        job_model.set_input_shape(trainer)

        print("Loading model ...")
        model_provider = job_model.get_model_provider()
        model = model_provider.get_model(trainer)

        loss = model_provider.get_loss(trainer)
        optimizer = model_provider.get_optimizer(trainer)

        print("Compiling ...")
        model_provider.compile(trainer, model, loss, optimizer)

        print(("Validate weights %s ..." % (weights_path, )))
        job_model.load_weights(model, weights_path)
        print("Validated.")

        print("Uploading weights to %s of %s ..." %
              (job_backend.job_id, job_backend.model_id))

        job_backend.upload_weights(
            'best.hdf5', weights_path,
            float(parsed_args.accuracy) if parsed_args.accuracy else None)

        print("Done")