Пример #1
0
def build_docker_container(torchserve_branch="master"):
    """Build each image listed in the docker YAML config and push it to ECR.

    The config is read from ``test/benchmark/tests/suite/docker/docker.yaml``
    relative to the current working directory and is validated before use.

    Args:
        torchserve_branch: Forwarded unchanged to ``DockerImageHandler`` as
            its third argument.
    """
    LOGGER.info("Setting up docker image to be used")

    yaml_path = os.path.join(
        os.getcwd(), "test", "benchmark", "tests", "suite", "docker",
        "docker.yaml")
    image_specs = YamlHandler.load_yaml(yaml_path)
    YamlHandler.validate_docker_yaml(image_specs)

    # The AWS account id is needed to address the ECR registry on push.
    identity_result = run(
        "aws sts get-caller-identity --query Account --output text")
    account_id = identity_result.stdout.strip()

    for processor, settings in image_specs.items():
        tag = settings.get("docker_tag")
        # A CUDA version is only honoured for the "gpu" entry.
        cuda = settings.get("cuda_version") if processor == "gpu" else None

        image_handler = DockerImageHandler(tag, cuda, torchserve_branch)
        image_handler.build_image()
        image_handler.push_docker_image_to_ecr(
            account_id,
            DEFAULT_REGION,
            f"{DEFAULT_DOCKER_DEV_ECR_REPO}:{tag}",
        )
Пример #2
0
def test_vgg16_benchmark(ec2_connection, ec2_instance_type,
                         vgg16_config_file_path, docker_dev_image_config_path,
                         benchmark_execution_id):
    """Run the VGG16 benchmark suite against TorchServe over ``ec2_connection``.

    Steps:
      1. Load and validate the benchmark YAML; the model name is the config
         file's base name up to the first dot.
      2. Pull the docker image whose tag contains "gpu" (when the first two
         characters of ``ec2_instance_type`` are in ``GPU_INSTANCES``) or
         "cpu" (otherwise) from ECR.
      3. For every mode and every batch size: start TorchServe, register the
         model, run Apache Bench, unregister, stop, generate the report and
         copy the artifacts to S3.

    Args:
        ec2_connection: Object exposing ``run()``; commands are executed
            through it (presumably a remote connection to the instance —
            confirm against the fixture).
        ec2_instance_type: Instance type string, e.g. "p3.8xlarge".
        vgg16_config_file_path: "/"-separated path to the benchmark YAML.
        docker_dev_image_config_path: Path to the docker image YAML.
        benchmark_execution_id: Identifier used in artifact folder and S3
            key names.
    """
    test_config = YamlHandler.load_yaml(vgg16_config_file_path)

    model_name = vgg16_config_file_path.split("/")[-1].split(".")[0]

    LOGGER.info("Validating yaml contents")
    LOGGER.info(YamlHandler.validate_benchmark_yaml(test_config))

    docker_config = YamlHandler.load_yaml(docker_dev_image_config_path)

    docker_repo_tag_for_current_instance = ""
    cuda_version_for_instance = ""
    account_id = run(
        "aws sts get-caller-identity --query Account --output text"
    ).stdout.strip()

    # Pick the image flavour that matches the instance family.
    # TODO: Improve logic that selectively pulls CPU image on CPU instances and likewise for GPU.
    wanted_flavor = "gpu" if ec2_instance_type[:2] in GPU_INSTANCES else "cpu"

    for processor, config in docker_config.items():
        docker_tag = config.get("docker_tag")
        # A CUDA version is only honoured for the "gpu" entry.
        cuda_version = config.get("cuda_version") if processor == "gpu" else None

        # Entries without a tag, or with the wrong flavour, are skipped
        # instead of raising on a substring test against None.
        if not docker_tag or wanted_flavor not in docker_tag:
            continue

        docker_repo_tag = f"{DEFAULT_DOCKER_DEV_ECR_REPO}:{docker_tag}"
        dockerImageHandler = DockerImageHandler(docker_tag, cuda_version)
        dockerImageHandler.pull_docker_image_from_ecr(
            account_id,
            DEFAULT_REGION,
            docker_repo_tag,
            connection=ec2_connection)
        docker_repo_tag_for_current_instance = docker_repo_tag
        cuda_version_for_instance = cuda_version
        break
    else:
        LOGGER.warning(
            f"No {wanted_flavor} docker image found in {docker_dev_image_config_path} "
            f"for instance type {ec2_instance_type}")

    apacheBenchHandler = ab_utils.ApacheBenchHandler(model_name=model_name,
                                                     connection=ec2_connection)

    for config in test_config.values():
        for mode, mode_config in config.items():
            benchmark_engine = mode_config.get("benchmark_engine")
            url = mode_config.get("url")
            workers = mode_config.get("workers")
            batch_delay = mode_config.get("batch_delay")
            batch_sizes = mode_config.get("batch_size")
            input_file = mode_config.get("input")
            requests = mode_config.get("requests")
            concurrency = mode_config.get("concurrency")
            backend_profiling = mode_config.get("backend_profiling")
            exec_env = mode_config.get("exec_env")
            processors = mode_config.get("processors")

            # The GPU settings live in the second processors entry, which is
            # only present when exactly two entries are configured. Index it
            # only in that case so CPU-only configs do not raise IndexError.
            gpu_processor = None
            gpus = None
            if processors and len(processors) == 2:
                gpu_processor = processors[1]
                gpus = gpu_processor.get("gpus")
            LOGGER.info(f"processors: {gpu_processor}")
            LOGGER.info(f"gpus: {gpus}")

            LOGGER.info(
                f"\n benchmark_engine: {benchmark_engine}\n url: {url}\n workers: {workers}\n batch_delay: {batch_delay}\n batch_size:{batch_sizes}\n input_file: {input_file}\n requests: {requests}\n concurrency: {concurrency}\n backend_profiling: {backend_profiling}\n exec_env: {exec_env}\n processors: {processors}"
            )

            torchserveHandler = ts_utils.TorchServeHandler(
                exec_env=exec_env,
                # Use the CUDA version of the image that was actually pulled,
                # not whatever the selection loop variable last held.
                cuda_version=cuda_version_for_instance,
                gpus=gpus,
                torchserve_docker_image=docker_repo_tag_for_current_instance,
                backend_profiling=backend_profiling,
                connection=ec2_connection,
            )

            for batch_size in batch_sizes:

                # Start torchserve
                torchserveHandler.start_torchserve_docker()

                # Register
                torchserveHandler.register_model(url=url,
                                                 workers=workers,
                                                 batch_delay=batch_delay,
                                                 batch_size=batch_size)

                # Run benchmark
                apacheBenchHandler.run_apache_bench(requests=requests,
                                                    concurrency=concurrency,
                                                    input_file=input_file)

                # Unregister
                torchserveHandler.unregister_model()

                # Stop torchserve
                torchserveHandler.stop_torchserve()

                # Generate report (note: needs to happen after torchserve has stopped)
                apacheBenchHandler.generate_report(requests=requests,
                                                   concurrency=concurrency,
                                                   connection=ec2_connection)

                # Move artifacts into a common folder.
                remote_artifact_folder = (
                    f"/home/ubuntu/{benchmark_execution_id}/{model_name}/{ec2_instance_type}/{mode}/{batch_size}"
                )

                ec2_connection.run(f"mkdir -p {remote_artifact_folder}")
                ec2_connection.run(
                    f"cp -R /home/ubuntu/benchmark/* {remote_artifact_folder}")

                # Upload artifacts to s3 bucket
                ec2_connection.run(
                    f"aws s3 cp --recursive /home/ubuntu/{benchmark_execution_id}/ {S3_BUCKET_BENCHMARK_ARTIFACTS}/{benchmark_execution_id}/"
                )

                time.sleep(3)

                # Artifacts under /tmp/<model_name> are uploaded with the
                # module-level run() rather than through ec2_connection.
                run(f"aws s3 cp --recursive /tmp/{model_name}/ {S3_BUCKET_BENCHMARK_ARTIFACTS}/{benchmark_execution_id}/{model_name}/{ec2_instance_type}/{mode}/{batch_size}"
                    )

                run(f"rm -rf /tmp/{model_name}")
                apacheBenchHandler.clean_up()
Пример #3
0
def ec2_instance(
    request,
    ec2_client,
    ec2_resource,
    ec2_instance_type,
    ec2_key_name,
    ec2_instance_role_name,
    ec2_instance_ami,
    region,
):
    """Provide an EC2 instance for a benchmark test, reusing or launching one.

    When ``--use-instances`` names a YAML file, the instance recorded there
    for this test (node name up to the first "[") and ``ec2_instance_type``
    is returned without launching anything. Otherwise a new instance is
    created with a generated SSH key pair. Unless ``--do-not-terminate`` is
    set, finalizers terminate the instance and delete the key pair; when it
    is set, the instance details are recorded in ``instances.yaml`` in the
    current working directory instead.

    Returns:
        Tuple ``(instance_id, key_filename)``.

    Raises:
        AssertionError: If the ``--use-instances`` file lacks the details
            needed for this test / instance type.
        ClientError: Re-raised from ``create_instances``; insufficient
            capacity is additionally logged as a warning.
    """
    # Parametrized node names look like "test_x[param]"; keep the base name.
    test_name = request.node.name.split("[")[0]

    instances_file = request.config.getoption("--use-instances")
    if instances_file:
        # touch guarantees the file exists; an empty file may load as a
        # falsy value, so fall back to an empty mapping before lookups.
        run(f"touch {instances_file}", warn=True)
        instances_dict = YamlHandler.load_yaml(instances_file) or {}
        LOGGER.info(f"instances_dict: {instances_dict}")
        instances = instances_dict.get(test_name, "")
        LOGGER.info(f"instances: {instances}")
        assert instances, f"Could not find instance details corresponding to test: {test_name}"
        instance_details = instances.get(ec2_instance_type, "")
        assert instance_details, f"Could not obtain details for instance type: {ec2_instance_type}"
        instance_id = instance_details.get("instance_id", "")
        assert instance_id, "Missing instance_id"
        key_filename = instance_details.get("key_filename", "")
        assert key_filename, "Missing key_filename"

        LOGGER.info(
            f"For test: {request.node.name}; Using instance_id: {instance_id} and key_filename: {key_filename}"
        )

        return instance_id, key_filename

    key_filename = ec2_utils.generate_ssh_keypair(ec2_client, ec2_key_name)

    params = {
        "KeyName": ec2_key_name,
        "ImageId": ec2_instance_ami,
        "InstanceType": ec2_instance_type,
        "IamInstanceProfile": {
            "Name": ec2_instance_role_name
        },
        "TagSpecifications": [
            {
                "ResourceType": "instance",
                "Tags": [{
                    "Key": "Name",
                    "Value": f"TS Benchmark {ec2_key_name}"
                }]
            },
        ],
        "MaxCount": 1,
        "MinCount": 1,
        "BlockDeviceMappings": [{
            "DeviceName": "/dev/sda1",
            "Ebs": {
                "VolumeSize": 220
            }
        }],
    }

    try:
        instances = ec2_resource.create_instances(**params)
    except ClientError as e:
        if e.response["Error"]["Code"] == "InsufficientInstanceCapacity":
            LOGGER.warning(
                f"Failed to launch {ec2_instance_type} in {region} because of insufficient capacity"
            )
        raise
    instance_id = instances[0].id

    LOGGER.info(f"Created instance: TS Benchmark {ec2_key_name}")

    # Define finalizer to terminate instance after this fixture completes
    def terminate_ec2_instance():
        ec2_client.terminate_instances(InstanceIds=[instance_id])

    def delete_ssh_keypair():
        ec2_utils.destroy_ssh_keypair(ec2_client, key_filename)

    do_not_terminate_flag = request.config.getoption("--do-not-terminate")

    LOGGER.info(f"do_not_terminate_flag: {do_not_terminate_flag}")

    instances_file = os.path.join(os.getcwd(), "instances.yaml")
    run(f"touch {instances_file}", warn=True)

    if not do_not_terminate_flag:
        request.addfinalizer(terminate_ec2_instance)
        request.addfinalizer(delete_ssh_keypair)
    else:
        # The reuse path returned above, so this is always a fresh launch
        # whose details must be recorded for a later --use-instances run.
        instances_dict = YamlHandler.load_yaml(instances_file) or {}

        # Merge into the existing per-test mapping so instance types recorded
        # earlier for the same test are kept rather than overwritten.
        per_test_instances = instances_dict.get(test_name) or {}
        per_test_instances[ec2_instance_type] = {
            "instance_id": instance_id,
            "key_filename": key_filename
        }
        instances_dict[test_name] = per_test_instances

        YamlHandler.write_yaml(instances_file, instances_dict)

    ec2_utils.check_instance_state(instance_id, state="running", region=region)
    ec2_utils.check_system_state(instance_id,
                                 system_status="ok",
                                 instance_status="ok",
                                 region=region)

    return instance_id, key_filename