Example #1
import time

# test_utils, ec2_utils, and get_tensorflow_inference_command_tf27_above are repo-local
# helpers from the same test suite; their exact import paths are omitted here.

def _run_instance_role_disabled(image_uri, ec2_client, ec2_instance, ec2_connection):
    """Verify that DLC telemetry cannot tag the instance when IMDS access is disabled."""
    expected_tag_key = "aws-dlc-autogenerated-tag-do-not-delete"

    ec2_instance_id, _ = ec2_instance
    account_id = test_utils.get_account_id_from_image_uri(image_uri)
    image_region = test_utils.get_region_from_image_uri(image_uri)
    repo_name, image_tag = test_utils.get_repository_and_tag_from_image_uri(
        image_uri)
    framework, _ = test_utils.get_framework_and_version_from_tag(image_uri)
    job_type = test_utils.get_job_type_from_image(image_uri)
    processor = test_utils.get_processor_from_image_uri(image_uri)

    container_name = f"{repo_name}-telemetry_bad_instance_role-ec2"

    docker_cmd = "nvidia-docker" if processor == "gpu" else "docker"

    test_utils.login_to_ecr_registry(ec2_connection, account_id, image_region)
    ec2_connection.run(f"{docker_cmd} pull -q {image_uri}")

    preexisting_ec2_instance_tags = ec2_utils.get_ec2_instance_tags(
        ec2_instance_id, ec2_client=ec2_client)
    if expected_tag_key in preexisting_ec2_instance_tags:
        # boto3's EC2 client exposes tag deletion as delete_tags (the DeleteTags API action).
        ec2_client.delete_tags(Resources=[ec2_instance_id],
                               Tags=[{"Key": expected_tag_key}])

    # Disable access to the EC2 instance metadata service (IMDS) by blackholing its IP
    # (a standalone sketch of this check follows the example below).
    ec2_connection.run("sudo route add -host 169.254.169.254 reject")

    if "tensorflow" in framework and job_type == "inference":
        model_name = "saved_model_half_plus_two"
        model_base_path = test_utils.get_tensorflow_model_base_path(image_uri)
        env_vars_list = test_utils.get_tensorflow_inference_environment_variables(
            model_name, model_base_path)
        env_vars = " ".join([
            f"-e {entry['name']}={entry['value']}" for entry in env_vars_list
        ])
        inference_command = get_tensorflow_inference_command_tf27_above(
            image_uri, model_name)
        ec2_connection.run(
            f"{docker_cmd} run {env_vars} --name {container_name} -id {image_uri} {inference_command}"
        )
        time.sleep(5)
    else:
        framework_to_import = framework.replace("huggingface_", "")
        framework_to_import = "torch" if framework_to_import == "pytorch" else framework_to_import
        ec2_connection.run(
            f"{docker_cmd} run --name {container_name} -id {image_uri} bash")
        output = ec2_connection.run(
            f"{docker_cmd} exec -i {container_name} python -c 'import {framework_to_import}; import time; time.sleep(5)'",
            warn=True)
        assert output.ok, f"'import {framework_to_import}' fails when credentials are not configured"

    ec2_instance_tags = ec2_utils.get_ec2_instance_tags(ec2_instance_id,
                                                        ec2_client=ec2_client)
    assert expected_tag_key not in ec2_instance_tags, (
        f"{expected_tag_key} was applied as an instance tag. "
        "EC2 create_tags went through even though it should not have.")
Example #2
import datetime

from packaging.specifiers import SpecifierSet
from packaging.version import Version

# DEFAULT_REGION, ec2_utils, ECSServiceCreationException, and the remaining
# get_*/register_*/create_*/check_* helpers used below are repo-local utilities
# from the same test suite; their import paths are omitted here.

def setup_ecs_inference_service(
    docker_image_uri,
    framework,
    cluster_arn,
    model_name,
    worker_instance_id,
    ei_accelerator_type=None,
    num_gpus=None,
    num_neurons=None,
    region=DEFAULT_REGION,
):
    """
    Function to setup Inference service on ECS
    :param docker_image_uri:
    :param framework:
    :param cluster_arn:
    :param model_name:
    :param worker_instance_id:
    :param num_gpus:
    :param num_neurons:
    :param region:
    :return: <tuple> service_name, task_family, revision if all steps passed else Exception
        Cleans up the resources if any step fails
    """
    datetime_suffix = datetime.datetime.now().strftime("%Y%m%d-%H-%M-%S")
    processor = "gpu" if "gpu" in docker_image_uri else "eia" if "eia" in docker_image_uri else "neuron" if "neuron" in docker_image_uri else "cpu"
    port_mappings = get_ecs_port_mappings(framework)
    log_group_name = f"/ecs/{framework}-inference-{processor}"
    num_cpus = ec2_utils.get_instance_num_cpus(worker_instance_id,
                                               region=region)
    # We assume that about 80% of RAM is free on the instance, since we are not directly querying it to find out
    # what the memory utilization is.
    memory = int(
        ec2_utils.get_instance_memory(worker_instance_id, region=region) * 0.8)
    cluster_name = get_ecs_cluster_name(cluster_arn, region=region)
    # The values below are sane baseline arguments for the task definition.
    arguments_dict = {
        "family_name": cluster_name,
        "image": docker_image_uri,
        "log_group_name": log_group_name,
        "log_stream_prefix": datetime_suffix,
        "port_mappings": port_mappings,
        "num_cpu": num_cpus,
        "memory": memory,
        "region": region,
    }

    if processor == "gpu" and num_gpus:
        arguments_dict["num_gpu"] = num_gpus
    if framework == "tensorflow":
        model_name = get_tensorflow_model_name(processor, model_name)
        model_base_path = get_tensorflow_model_base_path(docker_image_uri)
        _, image_framework_version = get_framework_and_version_from_tag(
            docker_image_uri)
        if Version(image_framework_version) in SpecifierSet(">=2.7"):
            arguments_dict["container_command"] = [
                build_tensorflow_inference_command_tf27_and_above(model_name)
            ]
            arguments_dict["entrypoint"] = ["sh", "-c"]

        arguments_dict["environment"] = get_tensorflow_inference_environment_variables(
            model_name, model_base_path)
        print(f"Added environment variables: {arguments_dict['environment']}")
    elif framework in ["mxnet", "pytorch"]:
        arguments_dict["container_command"] = [
            get_inference_run_command(docker_image_uri, model_name, processor)
        ]
    if processor == "eia":
        arguments_dict["health_check"] = {
            "retries":
            2,
            "command": [
                "CMD-SHELL",
                "LD_LIBRARY_PATH=/opt/ei_health_check/lib /opt/ei_health_check/bin/health_check"
            ],
            "timeout":
            5,
            "interval":
            30,
            "startPeriod":
            60,
        }
        arguments_dict["inference_accelerators"] = {
            "deviceName": "device_1",
            "deviceType": ei_accelerator_type
        }

    if processor == "neuron" and num_neurons:
        arguments_dict["num_neurons"] = num_neurons

    try:
        task_family, revision = register_ecs_task_definition(**arguments_dict)
        print(f"Created Task definition - {task_family}:{revision}")

        service_name = create_ecs_service(cluster_name,
                                          f"service-{cluster_name}",
                                          f"{task_family}:{revision}",
                                          region=region)
        print(
            f"Created ECS service - {service_name} with cloudwatch log group - {log_group_name} "
            f"log stream prefix - {datetime_suffix}/{cluster_name}")
        if check_running_task_for_ecs_service(cluster_name,
                                              service_name,
                                              region=region):
            print("Service status verified as running. Running inference ...")
        else:
            raise Exception(f"No task running in the service: {service_name}")
        return service_name, task_family, revision
    except Exception as e:
        # Chain the original exception so its traceback is preserved.
        raise ECSServiceCreationException(
            f"Setup Inference Service Exception - {e}") from e