def setup_eks_cluster(framework_name, is_neuron):
    """
    Create an EKS cluster for one framework and return the cluster name.

    The name has the form
    ``dlc-<short framework name>-cluster-<first 7 chars of the commit id>-<random int in 1..10000>``.
    One node is requested when ``is_pr_context()`` is true; otherwise 4 nodes
    for pytorch and 3 for the other frameworks.

    If ``eks_utils.eks_setup`` or ``eks_utils.create_eks_cluster`` raises,
    ``eks_utils.delete_eks_cluster`` is called with the cluster name and the
    original exception is re-raised.

    :param framework_name: one of "tensorflow", "mxnet" or "pytorch"
    :param is_neuron: when truthy, request a "neuron" cluster on inf1.xlarge
        with a 500GB volume; otherwise a "gpu" cluster on p3.16xlarge with
        the default 80GB volume
    :return: name of the cluster that was requested
    :raises KeyError: if framework_name is not one of the supported frameworks
    :raises RuntimeError: if CODEBUILD_RESOLVED_SOURCE_VERSION is not set
    """
    short_names = {
        "tensorflow": "tf",
        "mxnet": "mx",
        "pytorch": "pt",
    }
    short_name = short_names[framework_name]

    # os.getenv returns None for an unset variable; fail with a clear message
    # instead of the opaque TypeError that slicing None would produce.
    commit_id = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    if commit_id is None:
        raise RuntimeError(
            "CODEBUILD_RESOLVED_SOURCE_VERSION must be set to build the EKS cluster name"
        )
    codebuild_version = commit_id[:7]

    if is_pr_context():
        num_nodes = 1
    elif framework_name == "pytorch":
        num_nodes = 4
    else:
        num_nodes = 3

    cluster_name = f"dlc-{short_name}-cluster-{codebuild_version}-{random.randint(1, 10000)}"

    if is_neuron:
        # TODO: the EKS AMI used for neuron has a snapshot size of 500GB; passing
        # the default 80GB makes cluster creation fail. Once the official EKS AMI
        # for neuron 1.1 is released, revert to the default volume size.
        processor_type, volume_size, instance_type = "neuron", 500, "inf1.xlarge"
    else:
        # default volume size
        processor_type, volume_size, instance_type = "gpu", 80, "p3.16xlarge"

    try:
        eks_utils.eks_setup()
        eks_utils.create_eks_cluster(
            cluster_name, processor_type, num_nodes, volume_size, instance_type, "pytest.pem"
        )
    except Exception:
        # Tear down whatever may have been created, then surface the failure.
        eks_utils.delete_eks_cluster(cluster_name)
        raise

    return cluster_name
def setup_eks_cluster(framework_name, is_neuron):
    """
    Create an EKS cluster for one framework and return the cluster name.

    The name has the form
    ``dlc-<short framework name>-cluster-<first 7 chars of the commit id>-<random int in 1..10000>``.
    One node is requested when ``is_pr_context()`` is true; otherwise 4 nodes
    for pytorch and 3 for the other frameworks.

    If ``eks_utils.eks_setup`` or ``eks_utils.create_eks_cluster`` raises,
    ``eks_utils.delete_eks_cluster`` is called with the cluster name and the
    original exception is re-raised.

    :param framework_name: one of "tensorflow", "mxnet" or "pytorch"
    :param is_neuron: when truthy, use the inf1.xlarge instance type;
        otherwise p3.16xlarge
    :return: name of the cluster that was requested
    :raises KeyError: if framework_name is not one of the supported frameworks
    :raises RuntimeError: if CODEBUILD_RESOLVED_SOURCE_VERSION is not set
    """
    short_names = {
        "tensorflow": "tf",
        "mxnet": "mx",
        "pytorch": "pt",
    }
    short_name = short_names[framework_name]

    # os.getenv returns None for an unset variable; fail with a clear message
    # instead of the opaque TypeError that slicing None would produce.
    commit_id = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    if commit_id is None:
        raise RuntimeError(
            "CODEBUILD_RESOLVED_SOURCE_VERSION must be set to build the EKS cluster name"
        )
    codebuild_version = commit_id[:7]

    if is_pr_context():
        num_nodes = 1
    elif framework_name == "pytorch":
        num_nodes = 4
    else:
        num_nodes = 3

    cluster_name = f"dlc-{short_name}-cluster-{codebuild_version}-{random.randint(1, 10000)}"

    # default volume size
    volume_size = 80
    instance_type = "inf1.xlarge" if is_neuron else "p3.16xlarge"

    try:
        eks_utils.eks_setup()
        # NOTE(review): arguments are passed positionally as (name, node count,
        # volume size, instance type, key file) -- confirm this matches the
        # signature of eks_utils.create_eks_cluster.
        eks_utils.create_eks_cluster(cluster_name, num_nodes, volume_size, instance_type, "pytest.pem")
    except Exception:
        # Tear down whatever may have been created, then surface the failure.
        eks_utils.delete_eks_cluster(cluster_name)
        raise

    return cluster_name
def setup_eks_clusters(dlc_images):
    """
    Run EKS setup for every known framework that appears in dlc_images.

    Outside of PR context (``is_pr_context()`` false) a new "gpu" cluster on
    p3.16xlarge is created per matching framework, with 4 nodes for pytorch
    and 3 otherwise, and its name is recorded. In PR context no cluster is
    created and ``eks_utils.eks_setup`` receives ``None`` as the cluster name.

    :param dlc_images: container tested with ``in`` for each framework name
    :return: list of names of the clusters created by this call
    """
    short_names = {"tensorflow": "tf", "pytorch": "pt", "mxnet": "mx"}
    created_clusters = []

    for framework, abbreviation in short_names.items():
        if framework not in dlc_images:
            continue

        name = None
        if not is_pr_context():
            node_count = 4 if framework == "pytorch" else 3
            commit_id = os.getenv('CODEBUILD_RESOLVED_SOURCE_VERSION')
            name = f"dlc-{abbreviation}-cluster-{commit_id}-{random.randint(1, 10000)}"
            eks_utils.create_eks_cluster(name, "gpu", node_count, "p3.16xlarge", "pytest.pem")
            created_clusters.append(name)

        # Setup runs for every matching framework, with or without a new cluster.
        eks_utils.eks_setup(framework, name)

    return created_clusters
def setup_eks_cluster(framework_name):
    """
    Create a "gpu" EKS cluster for one framework and return the cluster name.

    The name has the form
    ``dlc-<short framework name>-cluster-<first 7 chars of the commit id>-<random int in 1..10000>``.
    One node is requested when ``is_pr_context()`` is true; otherwise 4 nodes
    for pytorch and 3 for the other frameworks. The instance type is
    p3.16xlarge.

    If ``eks_utils.eks_setup`` or ``eks_utils.create_eks_cluster`` raises,
    ``eks_utils.delete_eks_cluster`` is called with the cluster name and the
    original exception is re-raised.

    :param framework_name: one of "tensorflow", "pytorch" or "mxnet"
    :return: name of the cluster that was requested
    :raises KeyError: if framework_name is not one of the supported frameworks
    :raises RuntimeError: if CODEBUILD_RESOLVED_SOURCE_VERSION is not set
    """
    short_names = {"tensorflow": "tf", "pytorch": "pt", "mxnet": "mx"}
    short_name = short_names[framework_name]

    # os.getenv returns None for an unset variable; fail with a clear message
    # instead of the opaque TypeError that slicing None would produce.
    commit_id = os.getenv('CODEBUILD_RESOLVED_SOURCE_VERSION')
    if commit_id is None:
        raise RuntimeError(
            "CODEBUILD_RESOLVED_SOURCE_VERSION must be set to build the EKS cluster name"
        )
    codebuild_version = commit_id[:7]

    if is_pr_context():
        num_nodes = 1
    elif framework_name == "pytorch":
        num_nodes = 4
    else:
        num_nodes = 3

    cluster_name = f"dlc-{short_name}-cluster-{codebuild_version}-{random.randint(1, 10000)}"

    try:
        eks_utils.eks_setup()
        eks_utils.create_eks_cluster(cluster_name, "gpu", num_nodes, "p3.16xlarge", "pytest.pem")
    except Exception:
        # Tear down whatever may have been created, then surface the failure.
        eks_utils.delete_eks_cluster(cluster_name)
        raise

    return cluster_name
def setup_eks_clusters(dlc_images):
    """
    Create one "gpu" EKS cluster for the single framework found in dlc_images.

    Exactly one of "tensorflow", "pytorch" or "mxnet" must appear in
    dlc_images. Two nodes are requested when ``is_pr_context()`` is true;
    otherwise 4 for pytorch and 3 for the other frameworks. After the cluster
    is created on p3.16xlarge, ``eks_utils.eks_setup`` is run for it.

    :param dlc_images: container tested with ``in`` for each framework name
    :return: name of the created cluster
    :raises ValueError: if zero or more than one framework is found
    """
    short_names = {"tensorflow": "tf", "pytorch": "pt", "mxnet": "mx"}

    matches = [name for name in short_names if name in dlc_images]
    if len(matches) != 1:
        raise ValueError(
            "All images in dlc_images must be of a single framework for EKS tests.\n"
            f"Instead seeing {matches} frameworks."
        )

    (framework,) = matches
    abbreviation = short_names[framework]

    if is_pr_context():
        node_count = 2
    elif framework == "pytorch":
        node_count = 4
    else:
        node_count = 3

    commit_id = os.getenv('CODEBUILD_RESOLVED_SOURCE_VERSION')
    cluster_name = f"dlc-{abbreviation}-cluster-{commit_id}-{random.randint(1, 10000)}"

    eks_utils.create_eks_cluster(cluster_name, "gpu", node_count, "p3.16xlarge", "pytest.pem")
    eks_utils.eks_setup(framework, cluster_name)
    return cluster_name