def ec2_instance(request, ec2_client, ec2_resource, ec2_instance_type, ec2_key_name,
                 ec2_instance_role_name, ec2_instance_ami, region):
    """
    Pytest fixture: launch one tagged EC2 instance for a CI-CD test run.

    Generates a throwaway SSH keypair, launches the instance with the given
    AMI/type/IAM profile, registers a finalizer that terminates the instance
    (and destroys or records the keypair), then waits until the instance and
    its system checks report healthy.

    :param request: pytest request object (used for fixturenames and finalizer)
    :param ec2_client: boto3 EC2 client
    :param ec2_resource: boto3 EC2 resource
    :param ec2_instance_type: EC2 instance type string
    :param ec2_key_name: name used for both the SSH keypair and the Name tag
    :param ec2_instance_role_name: IAM instance profile name
    :param ec2_instance_ami: AMI id to launch
    :param region: AWS region for the health checks
    :return: <tuple> instance_id, key_filename
    """
    print(f"Creating instance: CI-CD {ec2_key_name}")
    key_filename = test_utils.generate_ssh_keypair(ec2_client, ec2_key_name)
    params = {
        "KeyName": ec2_key_name,
        "ImageId": ec2_instance_ami,
        "InstanceType": ec2_instance_type,
        "IamInstanceProfile": {"Name": ec2_instance_role_name},
        "TagSpecifications": [
            {
                "ResourceType": "instance",
                "Tags": [{"Key": "Name", "Value": f"CI-CD {ec2_key_name}"}],
            },
        ],
        "MaxCount": 1,
        "MinCount": 1,
    }
    extra_volume_size_mapping = [{
        "DeviceName": "/dev/sda1",
        "Ebs": {
            "VolumeSize": 300,
        },
    }]
    # BUGFIX: os.getenv("TEST_TYPE") returns None when the variable is unset,
    # and `"benchmark" in None` raises TypeError. Default to "" so the
    # membership test is always safe.
    test_type = os.getenv("TEST_TYPE", "")
    # GPU MXNet benchmark runs and TF Horovod GPU training need a larger root volume.
    if ("benchmark" in test_type and "mxnet_training" in request.fixturenames
            and "gpu_only" in request.fixturenames) or \
            ("tensorflow_training" in request.fixturenames
             and "gpu_only" in request.fixturenames and "horovod" in ec2_key_name):
        params["BlockDeviceMappings"] = extra_volume_size_mapping
    instances = ec2_resource.create_instances(**params)
    instance_id = instances[0].id

    # Define finalizer to terminate instance after this fixture completes
    def terminate_ec2_instance():
        ec2_client.terminate_instances(InstanceIds=[instance_id])
        if test_utils.is_pr_context():
            test_utils.destroy_ssh_keypair(ec2_client, key_filename)
        else:
            # Outside PR context, defer keypair destruction to a later cleanup job.
            with open(KEYS_TO_DESTROY_FILE, "a") as destroy_keys:
                destroy_keys.write(f"{key_filename}\n")

    request.addfinalizer(terminate_ec2_instance)
    ec2_utils.check_instance_state(instance_id, state="running", region=region)
    ec2_utils.check_system_state(instance_id, system_status="ok", instance_status="ok", region=region)
    return instance_id, key_filename
def attach_ecs_worker_node(worker_instance_type, ami_id, cluster_name, cluster_arn=None,
                           region=DEFAULT_REGION, worker_eia_capable=False):
    """
    Launch a worker instance and register it into an ECS cluster.

    :param worker_instance_type: EC2 instance type for the worker
    :param ami_id: AMI to launch the worker from
    :param cluster_name: ECS cluster the worker joins (written into ecs.config)
    :param cluster_arn: optional cluster ARN; defaults to cluster_name
    :param region: AWS region
    :param worker_eia_capable: whether to launch with Elastic Inference support
    :return: <tuple> instance_id, public_ip_address
    """
    # User data that points the ECS agent at the target cluster on boot.
    ecs_user_data = f"#!/bin/bash\necho ECS_CLUSTER={cluster_name} >> /etc/ecs/ecs.config"

    # Build the ecsInstanceRole instance-profile ARN for the current account.
    account_id = boto3.client('sts').get_caller_identity().get('Account')
    ecs_role_name = "ecsInstanceRole"
    instance_profile_arn = f"arn:aws:iam::{account_id}:instance-profile/{ecs_role_name}"

    launched = ec2_utils.launch_instance(
        ami_id,
        region=region,
        instance_type=worker_instance_type,
        user_data=ecs_user_data,
        iam_instance_profile_arn=instance_profile_arn,
        instance_name=f"ecs worker {cluster_name}",
        eia_capable=worker_eia_capable,
    )
    instance_id = launched["InstanceId"]
    public_ip_address = ec2_utils.get_public_ip(instance_id, region=region)

    # Block until the instance and its status checks are healthy.
    ec2_utils.check_instance_state(instance_id, state="running", region=region)
    ec2_utils.check_system_state(instance_id, system_status="ok", instance_status="ok", region=region)

    if cluster_arn is None:
        cluster_arn = cluster_name

    # Verify the ECS agent on this instance connected to the cluster.
    agent_connected_filter = (
        f"ec2InstanceId in ['{instance_id}'] and agentConnected==true")
    container_arns = list_ecs_container_instances(cluster_arn, agent_connected_filter,
                                                  "ACTIVE", region)
    if not container_arns:
        raise Exception(
            f"No ACTIVE container instance found on instance-id {instance_id} in cluster {cluster_arn}"
        )
    return instance_id, public_ip_address
def attach_ecs_worker_node(worker_instance_type, ami_id, cluster_name, cluster_arn=None,
                           region=DEFAULT_REGION):
    """
    Launch a worker instance and register it into an ECS cluster.

    :param worker_instance_type: EC2 instance type for the worker
    :param ami_id: AMI to launch the worker from
    :param cluster_name: ECS cluster the worker joins (written into ecs.config)
    :param cluster_arn: optional cluster ARN; defaults to cluster_name
    :param region: AWS region
    :return: <tuple> instance_id, public_ip_address
    """
    # Point the ECS agent at the target cluster via boot-time user data.
    ecs_user_data = f"#!/bin/bash\necho ECS_CLUSTER={cluster_name} >> /etc/ecs/ecs.config"

    launched = ec2_utils.launch_instance(
        ami_id,
        region=region,
        instance_type=worker_instance_type,
        user_data=ecs_user_data,
        iam_instance_profile_arn=ECS_INSTANCE_ROLE_ARN,
        instance_name=f"ecs worker {cluster_name}",
    )
    instance_id = launched["InstanceId"]
    public_ip_address = ec2_utils.get_public_ip(instance_id, region=region)

    # Wait for the instance and its status checks to become healthy.
    ec2_utils.check_instance_state(instance_id, state="running", region=region)
    ec2_utils.check_system_state(instance_id, system_status="ok", instance_status="ok", region=region)

    if cluster_arn is None:
        cluster_arn = cluster_name

    # Confirm the ECS agent on this instance actually joined the cluster.
    agent_connected_filter = (
        f"ec2InstanceId in ['{instance_id}'] and agentConnected==true")
    container_arns = list_ecs_container_instances(cluster_arn, agent_connected_filter,
                                                  "ACTIVE", region)
    if not container_arns:
        raise Exception(
            f"No ACTIVE container instance found on instance-id {instance_id} in cluster {cluster_arn}"
        )
    return instance_id, public_ip_address
def ec2_instance(request, ec2_client, ec2_resource, ec2_instance_type, ec2_key_name,
                 ec2_instance_role_name, ec2_instance_ami, region):
    """
    Pytest fixture: launch one tagged EC2 instance for a CI-CD test run.

    Creates a throwaway SSH keypair, launches the instance, registers a
    finalizer that terminates the instance and handles keypair cleanup,
    then waits for the instance to pass its health checks.

    :param request: pytest request object (used to register the finalizer)
    :param ec2_client: boto3 EC2 client
    :param ec2_resource: boto3 EC2 resource
    :param ec2_instance_type: EC2 instance type string
    :param ec2_key_name: name used for both the SSH keypair and the Name tag
    :param ec2_instance_role_name: IAM instance profile name
    :param ec2_instance_ami: AMI id to launch
    :param region: AWS region for the health checks
    :return: <tuple> instance_id, key_filename
    """
    print(f"Creating instance: CI-CD {ec2_key_name}")
    key_filename = test_utils.generate_ssh_keypair(ec2_client, ec2_key_name)

    launch_args = {
        "KeyName": ec2_key_name,
        "ImageId": ec2_instance_ami,
        "InstanceType": ec2_instance_type,
        "IamInstanceProfile": {"Name": ec2_instance_role_name},
        "TagSpecifications": [
            {
                "ResourceType": "instance",
                "Tags": [{"Key": "Name", "Value": f"CI-CD {ec2_key_name}"}],
            },
        ],
        "MaxCount": 1,
        "MinCount": 1,
    }
    instances = ec2_resource.create_instances(**launch_args)
    instance_id = instances[0].id

    def terminate_ec2_instance():
        # Terminate the instance; destroy the keypair immediately in PR
        # context, otherwise record it for a later cleanup job.
        ec2_client.terminate_instances(InstanceIds=[instance_id])
        if test_utils.is_pr_context():
            test_utils.destroy_ssh_keypair(ec2_client, key_filename)
        else:
            with open(KEYS_TO_DESTROY_FILE, 'a') as destroy_keys:
                destroy_keys.write(f"{key_filename}\n")

    request.addfinalizer(terminate_ec2_instance)

    ec2_utils.check_instance_state(instance_id, state="running", region=region)
    ec2_utils.check_system_state(instance_id, system_status="ok", instance_status="ok", region=region)
    return instance_id, key_filename
def ec2_instance(request, ec2_client, ec2_resource, ec2_instance_type, ec2_key_name,
                 ec2_instance_role_name, ec2_instance_ami, region, ei_accelerator_type):
    """
    Pytest fixture: launch one tagged EC2 instance, optionally with an
    Elastic Inference accelerator attached.

    p3dn.24xlarge launches are redirected to P3DN_REGION with fresh clients.
    EIA-attached launches are retried across availability zones because
    accelerator capacity varies per AZ. A finalizer terminates the instance;
    a separate finalizer cleans up the generated SSH keypair.

    :param request: pytest request object (fixturenames + finalizers)
    :param ec2_client: boto3 EC2 client (rebuilt for p3dn)
    :param ec2_resource: boto3 EC2 resource (rebuilt for p3dn)
    :param ec2_instance_type: EC2 instance type string
    :param ec2_key_name: name for the SSH keypair and Name tag
    :param ec2_instance_role_name: IAM instance profile name
    :param ec2_instance_ami: AMI id to launch
    :param region: AWS region
    :param ei_accelerator_type: EIA accelerator type, or falsy for none
    :return: <tuple> instance_id, key_filename
    """
    if ec2_instance_type == "p3dn.24xlarge":
        # p3dn capacity is only available in P3DN_REGION; rebuild the
        # clients and swap to an AMI that exists in that region.
        region = P3DN_REGION
        ec2_client = boto3.client("ec2", region_name=region,
                                  config=Config(retries={"max_attempts": 10}))
        ec2_resource = boto3.resource("ec2", region_name=region,
                                      config=Config(retries={"max_attempts": 10}))
        if ec2_instance_ami != PT_GPU_PY3_BENCHMARK_IMAGENET_AMI_US_EAST_1:
            ec2_instance_ami = UBUNTU_18_BASE_DLAMI_US_EAST_1
    print(f"Creating instance: CI-CD {ec2_key_name}")
    key_filename = test_utils.generate_ssh_keypair(ec2_client, ec2_key_name)

    def delete_ssh_keypair():
        if test_utils.is_pr_context():
            test_utils.destroy_ssh_keypair(ec2_client, key_filename)
        else:
            # Outside PR context, defer keypair destruction to a later cleanup job.
            with open(KEYS_TO_DESTROY_FILE, "a") as destroy_keys:
                destroy_keys.write(f"{key_filename}\n")

    request.addfinalizer(delete_ssh_keypair)
    params = {
        "KeyName": ec2_key_name,
        "ImageId": ec2_instance_ami,
        "InstanceType": ec2_instance_type,
        "IamInstanceProfile": {"Name": ec2_instance_role_name},
        "TagSpecifications": [
            {
                "ResourceType": "instance",
                "Tags": [{"Key": "Name", "Value": f"CI-CD {ec2_key_name}"}],
            },
        ],
        "MaxCount": 1,
        "MinCount": 1,
    }
    extra_volume_size_mapping = [{
        "DeviceName": "/dev/sda1",
        "Ebs": {
            "VolumeSize": 300,
        },
    }]
    # BUGFIX: os.getenv("TEST_TYPE") is None when the variable is unset and
    # `"benchmark" in None` raises TypeError; default to "" so the check is safe.
    if (("benchmark" in os.getenv("TEST_TYPE", "") or is_benchmark_dev_context())
            and (("mxnet_training" in request.fixturenames and "gpu_only" in request.fixturenames)
                 or "mxnet_inference" in request.fixturenames)) or (
            "tensorflow_training" in request.fixturenames
            and "gpu_only" in request.fixturenames
            and "horovod" in ec2_key_name):
        params["BlockDeviceMappings"] = extra_volume_size_mapping
    if ei_accelerator_type:
        params["ElasticInferenceAccelerators"] = [{
            'Type': ei_accelerator_type,
            'Count': 1
        }]
        # EIA capacity varies by AZ, so try each zone until one succeeds.
        # NOTE(review): only us-west-2/us-east-1 are mapped here — a launch in
        # any other region raises KeyError; confirm callers never pass others.
        availability_zones = {
            "us-west-2": ["us-west-2a", "us-west-2b", "us-west-2c"],
            "us-east-1": ["us-east-1a", "us-east-1b", "us-east-1c"]
        }
        for a_zone in availability_zones[region]:
            params["Placement"] = {'AvailabilityZone': a_zone}
            try:
                instances = ec2_resource.create_instances(**params)
                if instances:
                    break
            except ClientError as e:
                LOGGER.error(f"Failed to launch in {a_zone} with Error: {e}")
                continue
    else:
        try:
            instances = ec2_resource.create_instances(**params)
        except ClientError as e:
            if e.response['Error']['Code'] == "InsufficientInstanceCapacity":
                LOGGER.warning(
                    f"Failed to launch {ec2_instance_type} in {region} because of insufficient capacity"
                )
                # Some instance types are expected to be capacity-constrained;
                # skip the test instead of failing it.
                if ec2_instance_type in ec2_utils.ICE_SKIP_INSTANCE_LIST:
                    pytest.skip(
                        f"Skipping test because {ec2_instance_type} instance could not be launched."
                    )
            raise
    instance_id = instances[0].id

    # Define finalizer to terminate instance after this fixture completes
    def terminate_ec2_instance():
        ec2_client.terminate_instances(InstanceIds=[instance_id])

    request.addfinalizer(terminate_ec2_instance)
    ec2_utils.check_instance_state(instance_id, state="running", region=region)
    ec2_utils.check_system_state(instance_id, system_status="ok", instance_status="ok", region=region)
    return instance_id, key_filename
def ec2_instance(
    request,
    ec2_client,
    ec2_resource,
    ec2_instance_type,
    ec2_key_name,
    ec2_instance_role_name,
    ec2_instance_ami,
    region,
    ei_accelerator_type,
):
    """
    Pytest fixture: launch one tagged EC2 instance with a root-volume size
    chosen per workload (Habana 1000GB, benchmark/Neuron/Horovod/Graviton
    300GB, default 90GB), optionally attaching an EIA accelerator.

    p3dn.24xlarge launches are redirected to P3DN_REGION with fresh clients.
    EIA-attached launches are retried across availability zones since
    accelerator capacity varies per AZ. A finalizer terminates the instance;
    a separate finalizer cleans up the generated SSH keypair.

    :param request: pytest request object (fixturenames + finalizers)
    :param ec2_client: boto3 EC2 client (rebuilt for p3dn)
    :param ec2_resource: boto3 EC2 resource (rebuilt for p3dn)
    :param ec2_instance_type: EC2 instance type string
    :param ec2_key_name: name for the SSH keypair and Name tag
    :param ec2_instance_role_name: IAM instance profile name
    :param ec2_instance_ami: AMI id to launch
    :param region: AWS region
    :param ei_accelerator_type: EIA accelerator type, or falsy for none
    :return: <tuple> instance_id, key_filename
    """
    if ec2_instance_type == "p3dn.24xlarge":
        # p3dn capacity is only available in P3DN_REGION; rebuild the
        # clients and remap the AMI to a region-appropriate equivalent.
        region = P3DN_REGION
        ec2_client = boto3.client("ec2", region_name=region,
                                  config=Config(retries={"max_attempts": 10}))
        ec2_resource = boto3.resource("ec2", region_name=region,
                                      config=Config(retries={"max_attempts": 10}))
        if ec2_instance_ami != PT_GPU_PY3_BENCHMARK_IMAGENET_AMI_US_EAST_1:
            ec2_instance_ami = (AML2_GPU_DLAMI_US_EAST_1
                                if ec2_instance_ami == AML2_GPU_DLAMI_US_WEST_2
                                else UBUNTU_18_BASE_DLAMI_US_EAST_1)
    print(f"Creating instance: CI-CD {ec2_key_name}")
    key_filename = test_utils.generate_ssh_keypair(ec2_client, ec2_key_name)

    def delete_ssh_keypair():
        if test_utils.is_pr_context():
            test_utils.destroy_ssh_keypair(ec2_client, key_filename)
        else:
            # Outside PR context, defer keypair destruction to a later cleanup job.
            with open(KEYS_TO_DESTROY_FILE, "a") as destroy_keys:
                destroy_keys.write(f"{key_filename}\n")

    request.addfinalizer(delete_ssh_keypair)
    params = {
        "KeyName": ec2_key_name,
        "ImageId": ec2_instance_ami,
        "InstanceType": ec2_instance_type,
        "IamInstanceProfile": {"Name": ec2_instance_role_name},
        "TagSpecifications": [
            {
                "ResourceType": "instance",
                "Tags": [{"Key": "Name", "Value": f"CI-CD {ec2_key_name}"}],
            },
        ],
        "MaxCount": 1,
        "MinCount": 1,
    }
    # Ubuntu AMIs use /dev/sda1 as the root device; others use /dev/xvda.
    volume_name = "/dev/sda1" if ec2_instance_ami in test_utils.UL_AMI_LIST else "/dev/xvda"
    if ("pytorch_training_habana" in request.fixturenames
            or "tensorflow_training_habana" in request.fixturenames
            or "hpu" in request.fixturenames):
        # Habana jobs need the awscli installed at boot and a very large root volume.
        user_data = """#!/bin/bash
sudo apt-get update && sudo apt-get install -y awscli"""
        params["UserData"] = user_data
        params["BlockDeviceMappings"] = [{
            "DeviceName": volume_name,
            "Ebs": {
                "VolumeSize": 1000,
            },
        }]
    # BUGFIX: os.getenv("TEST_TYPE") is None when unset and `"benchmark" in None`
    # raises TypeError; default to "" so the membership test is safe.
    # NOTE(review): `is_neuron_image` is referenced as a value, not called —
    # confirm it is a boolean defined elsewhere in this module and not a
    # function reference that should be invoked.
    elif (
        (("benchmark" in os.getenv("TEST_TYPE", "") or is_benchmark_dev_context())
         and (("mxnet_training" in request.fixturenames and "gpu_only" in request.fixturenames)
              or "mxnet_inference" in request.fixturenames))
        or (is_neuron_image)
        or ("tensorflow_training" in request.fixturenames
            and "gpu_only" in request.fixturenames
            and "horovod" in ec2_key_name)
        or ("tensorflow_inference" in request.fixturenames
            and "graviton_compatible_only" in request.fixturenames)
        or ("graviton" in request.fixturenames)
    ):
        params["BlockDeviceMappings"] = [{
            "DeviceName": volume_name,
            "Ebs": {
                "VolumeSize": 300,
            },
        }]
    else:
        # Using private AMI, the EBS volume size is reduced to 28GB as opposed to 50GB from public AMI. This leads to space issues on test instances
        # TODO: Revert the configuration once DLAMI is public
        params["BlockDeviceMappings"] = [{
            "DeviceName": volume_name,
            "Ebs": {
                "VolumeSize": 90,
            },
        }]
    if ei_accelerator_type:
        params["ElasticInferenceAccelerators"] = [{
            "Type": ei_accelerator_type,
            "Count": 1
        }]
        # EIA capacity varies by AZ, so try each zone until one succeeds.
        # NOTE(review): only us-west-2/us-east-1 are mapped here — a launch in
        # any other region raises KeyError; confirm callers never pass others.
        availability_zones = {
            "us-west-2": ["us-west-2a", "us-west-2b", "us-west-2c"],
            "us-east-1": ["us-east-1a", "us-east-1b", "us-east-1c"],
        }
        for a_zone in availability_zones[region]:
            params["Placement"] = {"AvailabilityZone": a_zone}
            try:
                instances = ec2_resource.create_instances(**params)
                if instances:
                    break
            except ClientError as e:
                LOGGER.error(f"Failed to launch in {a_zone} due to {e}")
                continue
    else:
        try:
            instances = ec2_resource.create_instances(**params)
        except ClientError as e:
            if e.response["Error"]["Code"] == "InsufficientInstanceCapacity":
                LOGGER.warning(
                    f"Failed to launch {ec2_instance_type} in {region} because of insufficient capacity"
                )
                # Some instance types are expected to be capacity-constrained;
                # skip the test instead of failing it.
                if ec2_instance_type in ec2_utils.ICE_SKIP_INSTANCE_LIST:
                    pytest.skip(
                        f"Skipping test because {ec2_instance_type} instance could not be launched."
                    )
            raise
    instance_id = instances[0].id

    # Define finalizer to terminate instance after this fixture completes
    def terminate_ec2_instance():
        ec2_client.terminate_instances(InstanceIds=[instance_id])

    request.addfinalizer(terminate_ec2_instance)
    ec2_utils.check_instance_state(instance_id, state="running", region=region)
    ec2_utils.check_system_state(instance_id, system_status="ok", instance_status="ok", region=region)
    return instance_id, key_filename