示例#1
0
def predict(args):
    """Submit a prediction job described by ``args``.

    Obtains an image name from ``build_and_push_docker_image``, assembles
    the argument list for ``elasticdl.python.master.main`` and passes the
    image name, ``args`` and that list to ``_submit_job``.

    Args:
        args: Parsed options object. The attributes read directly here are
            ``model_zoo``, ``image_base``, ``docker_image_prefix``,
            ``extra_pypi_index``, ``cluster_spec``, ``docker_base_url``,
            ``docker_tlscert`` and ``docker_tlskey``.
    """
    worker_image = build_and_push_docker_image(
        model_zoo=args.model_zoo,
        base_image=args.image_base,
        docker_image_prefix=args.docker_image_prefix,
        extra_pypi=args.extra_pypi_index,
        cluster_spec=args.cluster_spec,
        docker_base_url=args.docker_base_url,
        docker_tlscert=args.docker_tlscert,
        docker_tlskey=args.docker_tlskey,
    )

    # Options whose values are computed in this function are added first.
    master_args = ["-m", "elasticdl.python.master.main"]
    master_args += ["--worker_image", worker_image]
    master_args += ["--model_zoo", _model_zoo_in_docker(args.model_zoo)]
    master_args += [
        "--cluster_spec",
        _cluster_spec_def_in_docker(args.cluster_spec),
    ]

    # The names in filter_args match the options already added above
    # (presumably the helper leaves them out -- confirm in its definition).
    already_added = ["model_zoo", "cluster_spec", "worker_image"]
    master_args.extend(
        build_arguments_from_parsed_result(args, filter_args=already_added)
    )

    _submit_job(worker_image, args, master_args)
示例#2
0
def train(args):
    """Run training locally or submit it as a job, depending on ``args``.

    When ``args.distribution_strategy`` equals ``DistributionStrategy.LOCAL``
    a ``LocalExecutor`` is created from ``args`` and run. Otherwise an image
    name is obtained from ``build_and_push_docker_image``, the argument list
    for ``elasticdl.python.master.main`` is assembled, and both are passed
    to ``_submit_job``.

    Args:
        args: Parsed options object. ``args.model_zoo`` is normalized with
            ``os.path.normpath`` before it is used.
    """
    zoo_path = os.path.normpath(args.model_zoo)

    # Local strategy: nothing is built or submitted.
    if args.distribution_strategy == DistributionStrategy.LOCAL:
        LocalExecutor(args).run()
        return

    image_options = {
        "model_zoo": zoo_path,
        "base_image": args.image_base,
        "docker_image_repository": args.docker_image_repository,
        "extra_pypi": args.extra_pypi_index,
        "cluster_spec": args.cluster_spec,
        "docker_base_url": args.docker_base_url,
        "docker_tlscert": args.docker_tlscert,
        "docker_tlskey": args.docker_tlskey,
    }
    worker_image = build_and_push_docker_image(**image_options)

    master_args = ["-m", "elasticdl.python.master.main"]
    master_args += ["--worker_image", worker_image]
    master_args += ["--model_zoo", _model_zoo_in_docker(zoo_path)]
    master_args += [
        "--cluster_spec",
        _cluster_spec_def_in_docker(args.cluster_spec),
    ]

    # The names in filter_args match the options already added above
    # (presumably the helper leaves them out -- confirm in its definition).
    already_added = ["model_zoo", "cluster_spec", "worker_image"]
    master_args.extend(
        build_arguments_from_parsed_result(args, filter_args=already_added)
    )

    _submit_job(worker_image, args, master_args)
示例#3
0
def evaluate(args):
    """Submit an evaluation job described by ``args``.

    Obtains an image name from ``build_and_push_docker_image``, then builds
    the argument list for ``elasticdl.python.master.main`` as a flat
    sequence of ``flag, value`` entries and passes it to ``_submit_job``
    together with the image name and ``args``.

    Args:
        args: Parsed options object. Every option forwarded below is read
            from an attribute of the same name; ``num_workers``,
            ``records_per_task`` and ``minibatch_size`` are converted with
            ``str`` before being forwarded.
    """
    worker_image = build_and_push_docker_image(
        model_zoo=args.model_zoo,
        base_image=args.image_base,
        docker_image_prefix=args.docker_image_prefix,
        extra_pypi=args.extra_pypi_index,
        cluster_spec=args.cluster_spec,
        docker_base_url=args.docker_base_url,
        docker_tlscert=args.docker_tlscert,
        docker_tlskey=args.docker_tlskey,
    )

    # Ordered (flag, value) pairs; the order is the order on the final
    # argument list.
    flag_values = (
        ("--job_name", args.job_name),
        ("--worker_image", worker_image),
        ("--model_zoo", _model_zoo_in_docker(args.model_zoo)),
        ("--cluster_spec", _cluster_spec_def_in_docker(args.cluster_spec)),
        ("--num_workers", str(args.num_workers)),
        ("--worker_resource_request", args.worker_resource_request),
        ("--worker_resource_limit", args.worker_resource_limit),
        ("--envs", args.envs),
        ("--namespace", args.namespace),
        ("--records_per_task", str(args.records_per_task)),
        ("--minibatch_size", str(args.minibatch_size)),
        ("--evaluation_data_dir", args.evaluation_data_dir),
        ("--checkpoint_filename_for_init", args.checkpoint_filename_for_init),
        ("--dataset_fn", args.dataset_fn),
        ("--eval_metrics_fn", args.eval_metrics_fn),
        ("--model_def", args.model_def),
        ("--model_params", args.model_params),
        ("--image_pull_policy", args.image_pull_policy),
        ("--restart_policy", args.restart_policy),
        ("--volume", args.volume),
    )

    master_args = ["-m", "elasticdl.python.master.main"]
    for flag, value in flag_values:
        master_args.extend((flag, value))

    _submit_job(worker_image, args, master_args)
示例#4
0
def train(args):
    """Run training locally or submit it as a job, depending on ``args``.

    When ``args.distribution_strategy`` equals ``DistributionStrategy.LOCAL``
    a ``LocalExecutor`` is created from ``args`` and run, and nothing else
    happens. Otherwise the image name is taken from ``args.image_name`` when
    that is truthy, or obtained from ``build_and_push_docker_image`` when it
    is not; the argument list is then assembled and passed to
    ``_submit_job``.

    Args:
        args: Parsed options object. ``args.model_zoo`` is normalized with
            ``os.path.normpath`` before it is used.
    """
    zoo_path = os.path.normpath(args.model_zoo)

    if args.distribution_strategy == DistributionStrategy.LOCAL:
        LocalExecutor(args).run()
        return

    # A truthy image_name means the caller supplied an image, so none is
    # built here.
    use_given_image = bool(args.image_name)
    if use_given_image:
        worker_image = args.image_name
    else:
        worker_image = build_and_push_docker_image(
            model_zoo=zoo_path,
            base_image=args.image_base,
            docker_image_repository=args.docker_image_repository,
            extra_pypi=args.extra_pypi_index,
            cluster_spec=args.cluster_spec,
            docker_base_url=args.docker_base_url,
            docker_tlscert=args.docker_tlscert,
            docker_tlskey=args.docker_tlskey,
        )

    master_args = ["--worker_image", worker_image]
    master_args += [
        "--model_zoo",
        _model_zoo_in_docker(zoo_path, use_given_image),
    ]
    master_args += [
        "--cluster_spec",
        _cluster_spec_def_in_docker(args.cluster_spec),
    ]

    # Names handed to the helper as filter_args (presumably options it
    # should leave out -- confirm in its definition).
    filtered_names = [
        "model_zoo",
        "cluster_spec",
        "worker_image",
        "force_use_kube_config_file",
        "func",
    ]
    master_args.extend(
        build_arguments_from_parsed_result(args, filter_args=filtered_names)
    )

    _submit_job(worker_image, args, master_args)
示例#5
0
def evaluate(args):
    """Submit an evaluation job described by ``args``.

    The image name is taken from ``args.image_name`` when that is truthy,
    or obtained from ``build_and_push_docker_image`` when it is not. The
    argument list for ``elasticdl.python.master.main`` is then assembled
    and passed to ``_submit_job`` with the image name and ``args``.

    Args:
        args: Parsed options object. ``args.model_zoo`` is normalized with
            ``os.path.normpath`` before it is used.
    """
    zoo_path = os.path.normpath(args.model_zoo)

    # A truthy image_name means the caller supplied an image, so none is
    # built here.
    use_given_image = bool(args.image_name)
    if use_given_image:
        worker_image = args.image_name
    else:
        worker_image = build_and_push_docker_image(
            model_zoo=zoo_path,
            base_image=args.image_base,
            docker_image_repository=args.docker_image_repository,
            extra_pypi=args.extra_pypi_index,
            cluster_spec=args.cluster_spec,
            docker_base_url=args.docker_base_url,
            docker_tlscert=args.docker_tlscert,
            docker_tlskey=args.docker_tlskey,
        )

    master_args = ["-m", "elasticdl.python.master.main"]
    master_args += ["--worker_image", worker_image]
    master_args += [
        "--model_zoo",
        _model_zoo_in_docker(zoo_path, use_given_image),
    ]
    master_args += [
        "--cluster_spec",
        _cluster_spec_def_in_docker(args.cluster_spec),
    ]

    # Names handed to the helper as filter_args (presumably options it
    # should leave out -- confirm in its definition).
    filtered_names = [
        "model_zoo",
        "cluster_spec",
        "worker_image",
        "force_use_kube_config_file",
    ]
    master_args.extend(
        build_arguments_from_parsed_result(args, filter_args=filtered_names)
    )

    _submit_job(worker_image, args, master_args)
示例#6
0
文件: api.py 项目: yupbank/elasticdl
def train(args):
    """Submit a training job described by ``args``.

    Obtains an image name from ``build_and_push_docker_image``, then builds
    the argument list for ``elasticdl.python.master.main`` as a flat
    sequence of ``flag, value`` entries and passes it to ``_submit_job``
    together with the image name and ``args``.

    Args:
        args: Parsed options object. Every option forwarded below is read
            from an attribute of the same name; the values wrapped in
            ``str`` below are converted before being forwarded.
    """
    worker_image = build_and_push_docker_image(
        model_zoo=args.model_zoo,
        base_image=args.image_base,
        docker_image_prefix=args.docker_image_prefix,
        extra_pypi=args.extra_pypi_index,
        cluster_spec=args.cluster_spec,
    )

    # Ordered (flag, value) pairs; the order is the order on the final
    # argument list.
    flag_values = (
        ("--job_name", args.job_name),
        ("--worker_image", worker_image),
        ("--model_zoo", _model_zoo_in_docker(args.model_zoo)),
        ("--cluster_spec", _cluster_spec_def_in_docker(args.cluster_spec)),
        ("--num_workers", str(args.num_workers)),
        ("--master_resource_request", args.master_resource_request),
        ("--master_resource_limit", args.master_resource_limit),
        ("--worker_resource_request", args.worker_resource_request),
        ("--worker_resource_limit", args.worker_resource_limit),
        ("--envs", args.envs),
        ("--namespace", args.namespace),
        ("--tensorboard_log_dir", args.tensorboard_log_dir),
        ("--records_per_task", str(args.records_per_task)),
        ("--num_epochs", str(args.num_epochs)),
        ("--grads_to_wait", str(args.grads_to_wait)),
        ("--minibatch_size", str(args.minibatch_size)),
        ("--training_data_dir", args.training_data_dir),
        ("--evaluation_data_dir", args.evaluation_data_dir),
        ("--checkpoint_steps", str(args.checkpoint_steps)),
        ("--checkpoint_dir", args.checkpoint_dir),
        ("--keep_checkpoint_max", str(args.keep_checkpoint_max)),
        ("--evaluation_steps", str(args.evaluation_steps)),
        (
            "--evaluation_start_delay_secs",
            str(args.evaluation_start_delay_secs),
        ),
        ("--evaluation_throttle_secs", str(args.evaluation_throttle_secs)),
        ("--dataset_fn", args.dataset_fn),
        ("--loss", args.loss),
        ("--optimizer", args.optimizer),
        ("--eval_metrics_fn", args.eval_metrics_fn),
        ("--model_def", args.model_def),
        ("--model_params", args.model_params),
        ("--image_pull_policy", args.image_pull_policy),
        ("--restart_policy", args.restart_policy),
        ("--volume", args.volume),
    )

    master_args = ["-m", "elasticdl.python.master.main"]
    for flag, value in flag_values:
        master_args.extend((flag, value))

    _submit_job(worker_image, args, master_args)