def ec2_instance(request, ec2_client, ec2_resource, ec2_instance_type, ec2_key_name,
                 ec2_instance_role_name, ec2_instance_ami, region):
    """Launch a single tagged CI-CD EC2 instance and return (instance_id, key_filename).

    Generates an SSH keypair, launches one instance with the given AMI/type/role,
    and registers a pytest finalizer that terminates the instance and disposes of
    the keypair (destroyed immediately in PR context, otherwise appended to
    KEYS_TO_DESTROY_FILE for a later cleanup job). Blocks until the instance is
    "running" and both system and instance status checks report "ok".

    Returns:
        tuple(str, str): (EC2 instance id, path of the generated private key file)
    """
    print(f"Creating instance: CI-CD {ec2_key_name}")
    key_filename = test_utils.generate_ssh_keypair(ec2_client, ec2_key_name)
    params = {
        "KeyName": ec2_key_name,
        "ImageId": ec2_instance_ami,
        "InstanceType": ec2_instance_type,
        "IamInstanceProfile": {"Name": ec2_instance_role_name},
        "TagSpecifications": [
            {
                "ResourceType": "instance",
                "Tags": [{"Key": "Name", "Value": f"CI-CD {ec2_key_name}"}],
            },
        ],
        "MaxCount": 1,
        "MinCount": 1,
    }
    extra_volume_size_mapping = [{
        "DeviceName": "/dev/sda1",
        "Ebs": {
            "VolumeSize": 300,
        },
    }]
    # BUGFIX: os.getenv("TEST_TYPE") returns None when the variable is unset,
    # and `"benchmark" in None` raises TypeError. Default to "" so the
    # membership test is simply False in that case.
    test_type = os.getenv("TEST_TYPE", "")
    # MXNet GPU benchmark runs and TF Horovod GPU runs need a larger root volume.
    if ("benchmark" in test_type
            and "mxnet_training" in request.fixturenames
            and "gpu_only" in request.fixturenames) or \
            ("tensorflow_training" in request.fixturenames
             and "gpu_only" in request.fixturenames
             and "horovod" in ec2_key_name):
        params["BlockDeviceMappings"] = extra_volume_size_mapping
    instances = ec2_resource.create_instances(**params)
    instance_id = instances[0].id

    # Finalizer: terminate the instance after the dependent test(s) complete,
    # then dispose of the SSH keypair.
    def terminate_ec2_instance():
        ec2_client.terminate_instances(InstanceIds=[instance_id])
        if test_utils.is_pr_context():
            # PR builds clean up their keypair immediately.
            test_utils.destroy_ssh_keypair(ec2_client, key_filename)
        else:
            # Non-PR builds defer key destruction to a later cleanup job.
            with open(KEYS_TO_DESTROY_FILE, "a") as destroy_keys:
                destroy_keys.write(f"{key_filename}\n")

    request.addfinalizer(terminate_ec2_instance)
    ec2_utils.check_instance_state(instance_id, state="running", region=region)
    ec2_utils.check_system_state(instance_id, system_status="ok", instance_status="ok", region=region)
    return instance_id, key_filename
def ec2_instance(request, ec2_client, ec2_resource, ec2_instance_type, ec2_key_name,
                 ec2_instance_role_name, ec2_instance_ami, region):
    """Create a tagged CI-CD EC2 instance and return (instance_id, key_filename).

    A pytest finalizer terminates the instance when the fixture goes out of
    scope and removes the SSH keypair (immediately for PR runs, otherwise by
    appending it to KEYS_TO_DESTROY_FILE). The function only returns once the
    instance is running and both status checks report "ok".
    """
    print(f"Creating instance: CI-CD {ec2_key_name}")
    key_filename = test_utils.generate_ssh_keypair(ec2_client, ec2_key_name)
    launch_args = dict(
        KeyName=ec2_key_name,
        ImageId=ec2_instance_ami,
        InstanceType=ec2_instance_type,
        IamInstanceProfile={"Name": ec2_instance_role_name},
        TagSpecifications=[
            {
                "ResourceType": "instance",
                "Tags": [{"Key": "Name", "Value": f"CI-CD {ec2_key_name}"}],
            },
        ],
        MaxCount=1,
        MinCount=1,
    )
    created = ec2_resource.create_instances(**launch_args)
    instance_id = created[0].id

    def _cleanup():
        # Terminate the instance, then dispose of its SSH keypair.
        ec2_client.terminate_instances(InstanceIds=[instance_id])
        if test_utils.is_pr_context():
            test_utils.destroy_ssh_keypair(ec2_client, key_filename)
        else:
            with open(KEYS_TO_DESTROY_FILE, "a") as destroy_keys:
                destroy_keys.write(f"{key_filename}\n")

    request.addfinalizer(_cleanup)
    ec2_utils.check_instance_state(instance_id, state="running", region=region)
    ec2_utils.check_system_state(instance_id, system_status="ok", instance_status="ok", region=region)
    return instance_id, key_filename
def ec2_instance(request, ec2_client, ec2_resource, ec2_instance_type, ec2_key_name,
                 ec2_instance_role_name, ec2_instance_ami, region, ei_accelerator_type):
    """Launch a CI-CD EC2 instance, optionally with an Elastic Inference
    accelerator, and return (instance_id, key_filename).

    p3dn.24xlarge launches are pinned to P3DN_REGION with fresh boto3 clients;
    unless the PyTorch benchmark ImageNet AMI was requested, the AMI is swapped
    for the Ubuntu 18 base DLAMI in that region. SSH-key cleanup and instance
    termination are registered as separate pytest finalizers. EI launches are
    retried across the region's availability zones, since accelerators are not
    offered in every AZ.

    Returns:
        tuple(str, str): (EC2 instance id, path of the generated private key file)
    """
    if ec2_instance_type == "p3dn.24xlarge":
        region = P3DN_REGION
        ec2_client = boto3.client("ec2", region_name=region,
                                  config=Config(retries={"max_attempts": 10}))
        ec2_resource = boto3.resource("ec2", region_name=region,
                                      config=Config(retries={"max_attempts": 10}))
        if ec2_instance_ami != PT_GPU_PY3_BENCHMARK_IMAGENET_AMI_US_EAST_1:
            ec2_instance_ami = UBUNTU_18_BASE_DLAMI_US_EAST_1
    print(f"Creating instance: CI-CD {ec2_key_name}")
    key_filename = test_utils.generate_ssh_keypair(ec2_client, ec2_key_name)

    def delete_ssh_keypair():
        # PR builds destroy the keypair right away; other builds queue it
        # in KEYS_TO_DESTROY_FILE for a later cleanup job.
        if test_utils.is_pr_context():
            test_utils.destroy_ssh_keypair(ec2_client, key_filename)
        else:
            with open(KEYS_TO_DESTROY_FILE, "a") as destroy_keys:
                destroy_keys.write(f"{key_filename}\n")

    request.addfinalizer(delete_ssh_keypair)
    params = {
        "KeyName": ec2_key_name,
        "ImageId": ec2_instance_ami,
        "InstanceType": ec2_instance_type,
        "IamInstanceProfile": {"Name": ec2_instance_role_name},
        "TagSpecifications": [
            {
                "ResourceType": "instance",
                "Tags": [{"Key": "Name", "Value": f"CI-CD {ec2_key_name}"}],
            },
        ],
        "MaxCount": 1,
        "MinCount": 1,
    }
    extra_volume_size_mapping = [{
        "DeviceName": "/dev/sda1",
        "Ebs": {
            "VolumeSize": 300,
        },
    }]
    # BUGFIX: os.getenv("TEST_TYPE") is None when the variable is unset, and
    # `"benchmark" in None` raises TypeError. Default to "" so the check is
    # simply False in that case.
    test_type = os.getenv("TEST_TYPE", "")
    # Benchmark MXNet runs (GPU training or inference) and TF Horovod GPU runs
    # need a larger root volume.
    if ((("benchmark" in test_type or is_benchmark_dev_context())
         and (("mxnet_training" in request.fixturenames and "gpu_only" in request.fixturenames)
              or "mxnet_inference" in request.fixturenames))
            or ("tensorflow_training" in request.fixturenames
                and "gpu_only" in request.fixturenames
                and "horovod" in ec2_key_name)):
        params["BlockDeviceMappings"] = extra_volume_size_mapping
    instances = None
    if ei_accelerator_type:
        params["ElasticInferenceAccelerators"] = [{'Type': ei_accelerator_type, 'Count': 1}]
        availability_zones = {
            "us-west-2": ["us-west-2a", "us-west-2b", "us-west-2c"],
            "us-east-1": ["us-east-1a", "us-east-1b", "us-east-1c"]
        }
        # EI accelerators are not offered in every AZ, so retry zone by zone.
        for a_zone in availability_zones[region]:
            params["Placement"] = {'AvailabilityZone': a_zone}
            try:
                instances = ec2_resource.create_instances(**params)
                if instances:
                    break
            except ClientError as e:
                LOGGER.error(f"Failed to launch in {a_zone} with Error: {e}")
                continue
        # BUGFIX: previously fell through to `instances[0].id` with `instances`
        # unbound (NameError) when every AZ failed; raise a clear error instead.
        if not instances:
            raise RuntimeError(
                f"Failed to launch {ec2_instance_type} with EI accelerator in any availability zone of {region}"
            )
    else:
        try:
            instances = ec2_resource.create_instances(**params)
        except ClientError as e:
            if e.response['Error']['Code'] == "InsufficientInstanceCapacity":
                LOGGER.warning(
                    f"Failed to launch {ec2_instance_type} in {region} because of insufficient capacity"
                )
                # Some instance types are allowed to skip on capacity errors
                # (presumably ICE = Insufficient Capacity Error skip list).
                if ec2_instance_type in ec2_utils.ICE_SKIP_INSTANCE_LIST:
                    pytest.skip(
                        f"Skipping test because {ec2_instance_type} instance could not be launched."
                    )
            raise
    instance_id = instances[0].id

    # Finalizer: terminate the instance after the dependent test(s) complete.
    def terminate_ec2_instance():
        ec2_client.terminate_instances(InstanceIds=[instance_id])

    request.addfinalizer(terminate_ec2_instance)
    ec2_utils.check_instance_state(instance_id, state="running", region=region)
    ec2_utils.check_system_state(instance_id, system_status="ok", instance_status="ok", region=region)
    return instance_id, key_filename
def ec2_instance(
    request,
    ec2_client,
    ec2_resource,
    ec2_instance_type,
    ec2_key_name,
    ec2_instance_role_name,
    ec2_instance_ami,
    region,
    ei_accelerator_type,
):
    """Launch a CI-CD EC2 instance sized for the requested framework/device
    combination and return (instance_id, key_filename).

    p3dn.24xlarge launches are pinned to P3DN_REGION with fresh boto3 clients
    and remapped to a region-appropriate AMI. Root-volume size depends on the
    requesting fixtures: 1000 GB for Habana/HPU runs (with awscli installed via
    user data), 300 GB for benchmark MXNet / TF-Horovod-GPU / Neuron / Graviton
    runs, 90 GB otherwise. EI launches are retried across the region's
    availability zones. SSH-key cleanup and instance termination are registered
    as separate pytest finalizers.

    Returns:
        tuple(str, str): (EC2 instance id, path of the generated private key file)
    """
    if ec2_instance_type == "p3dn.24xlarge":
        region = P3DN_REGION
        ec2_client = boto3.client("ec2", region_name=region,
                                  config=Config(retries={"max_attempts": 10}))
        ec2_resource = boto3.resource("ec2", region_name=region,
                                      config=Config(retries={"max_attempts": 10}))
        if ec2_instance_ami != PT_GPU_PY3_BENCHMARK_IMAGENET_AMI_US_EAST_1:
            # Map the requested AMI to its us-east-1 equivalent.
            ec2_instance_ami = (AML2_GPU_DLAMI_US_EAST_1
                                if ec2_instance_ami == AML2_GPU_DLAMI_US_WEST_2
                                else UBUNTU_18_BASE_DLAMI_US_EAST_1)
    print(f"Creating instance: CI-CD {ec2_key_name}")
    key_filename = test_utils.generate_ssh_keypair(ec2_client, ec2_key_name)

    def delete_ssh_keypair():
        # PR builds destroy the keypair right away; other builds queue it
        # in KEYS_TO_DESTROY_FILE for a later cleanup job.
        if test_utils.is_pr_context():
            test_utils.destroy_ssh_keypair(ec2_client, key_filename)
        else:
            with open(KEYS_TO_DESTROY_FILE, "a") as destroy_keys:
                destroy_keys.write(f"{key_filename}\n")

    request.addfinalizer(delete_ssh_keypair)
    params = {
        "KeyName": ec2_key_name,
        "ImageId": ec2_instance_ami,
        "InstanceType": ec2_instance_type,
        "IamInstanceProfile": {"Name": ec2_instance_role_name},
        "TagSpecifications": [
            {
                "ResourceType": "instance",
                "Tags": [{"Key": "Name", "Value": f"CI-CD {ec2_key_name}"}],
            },
        ],
        "MaxCount": 1,
        "MinCount": 1,
    }
    # Ubuntu AMIs mount the root volume at /dev/sda1; others at /dev/xvda.
    volume_name = "/dev/sda1" if ec2_instance_ami in test_utils.UL_AMI_LIST else "/dev/xvda"
    # BUGFIX: os.getenv("TEST_TYPE") is None when the variable is unset, and
    # `"benchmark" in None` raises TypeError. Default to "" so the check is
    # simply False in that case.
    test_type = os.getenv("TEST_TYPE", "")
    if ("pytorch_training_habana" in request.fixturenames
            or "tensorflow_training_habana" in request.fixturenames
            or "hpu" in request.fixturenames):
        # Habana (HPU) hosts install awscli at boot and get a 1 TB root volume.
        user_data = """#!/bin/bash
sudo apt-get update && sudo apt-get install -y awscli"""
        params["UserData"] = user_data
        params["BlockDeviceMappings"] = [{
            "DeviceName": volume_name,
            "Ebs": {
                "VolumeSize": 1000,
            },
        }]
    elif (
        (("benchmark" in test_type or is_benchmark_dev_context())
         and (("mxnet_training" in request.fixturenames and "gpu_only" in request.fixturenames)
              or "mxnet_inference" in request.fixturenames))
        # NOTE(review): `is_neuron_image` is referenced, not called — if it is a
        # function object this branch is always truthy; verify it is a boolean
        # in the enclosing scope.
        or (is_neuron_image)
        or ("tensorflow_training" in request.fixturenames
            and "gpu_only" in request.fixturenames
            and "horovod" in ec2_key_name)
        or ("tensorflow_inference" in request.fixturenames
            and "graviton_compatible_only" in request.fixturenames)
        or ("graviton" in request.fixturenames)
    ):
        params["BlockDeviceMappings"] = [{
            "DeviceName": volume_name,
            "Ebs": {
                "VolumeSize": 300,
            },
        }]
    else:
        # Using private AMI, the EBS volume size is reduced to 28GB as opposed to 50GB from public AMI.
        # This leads to space issues on test instances.
        # TODO: Revert the configuration once DLAMI is public
        params["BlockDeviceMappings"] = [{
            "DeviceName": volume_name,
            "Ebs": {
                "VolumeSize": 90,
            },
        }]
    instances = None
    if ei_accelerator_type:
        params["ElasticInferenceAccelerators"] = [{"Type": ei_accelerator_type, "Count": 1}]
        availability_zones = {
            "us-west-2": ["us-west-2a", "us-west-2b", "us-west-2c"],
            "us-east-1": ["us-east-1a", "us-east-1b", "us-east-1c"],
        }
        # EI accelerators are not offered in every AZ, so retry zone by zone.
        for a_zone in availability_zones[region]:
            params["Placement"] = {"AvailabilityZone": a_zone}
            try:
                instances = ec2_resource.create_instances(**params)
                if instances:
                    break
            except ClientError as e:
                LOGGER.error(f"Failed to launch in {a_zone} due to {e}")
                continue
        # BUGFIX: previously fell through to `instances[0].id` with `instances`
        # unbound (NameError) when every AZ failed; raise a clear error instead.
        if not instances:
            raise RuntimeError(
                f"Failed to launch {ec2_instance_type} with EI accelerator in any availability zone of {region}"
            )
    else:
        try:
            instances = ec2_resource.create_instances(**params)
        except ClientError as e:
            if e.response["Error"]["Code"] == "InsufficientInstanceCapacity":
                LOGGER.warning(
                    f"Failed to launch {ec2_instance_type} in {region} because of insufficient capacity"
                )
                # Some instance types may skip on capacity errors (presumably
                # ICE = Insufficient Capacity Error skip list).
                if ec2_instance_type in ec2_utils.ICE_SKIP_INSTANCE_LIST:
                    pytest.skip(
                        f"Skipping test because {ec2_instance_type} instance could not be launched."
                    )
            raise
    instance_id = instances[0].id

    # Finalizer: terminate the instance after the dependent test(s) complete.
    def terminate_ec2_instance():
        ec2_client.terminate_instances(InstanceIds=[instance_id])

    request.addfinalizer(terminate_ec2_instance)
    ec2_utils.check_instance_state(instance_id, state="running", region=region)
    ec2_utils.check_system_state(instance_id, system_status="ok", instance_status="ok", region=region)
    return instance_id, key_filename