Exemplo n.º 1
0
def setup_eks_cluster(framework_name, is_neuron):
    """Set up EKS tooling, create a cluster for one framework, and return its name.

    The cluster name has the form ``dlc-<short>-cluster-<version>-<rand>``,
    where ``<short>`` is the framework abbreviation, ``<version>`` is the first
    seven characters of the ``CODEBUILD_RESOLVED_SOURCE_VERSION`` environment
    variable, and ``<rand>`` is a random integer between 1 and 10000.

    Args:
        framework_name: One of ``"tensorflow"``, ``"mxnet"`` or ``"pytorch"``.
        is_neuron: When truthy, a "neuron" cluster on ``inf1.xlarge`` is
            requested; otherwise a "gpu" cluster on ``p3.16xlarge``.

    Returns:
        The generated cluster name.

    Raises:
        KeyError: If ``framework_name`` is not a known framework.
        TypeError: If ``CODEBUILD_RESOLVED_SOURCE_VERSION`` is not set
            (``os.getenv`` returns ``None``, which cannot be sliced).
        Exception: Anything raised by the ``eks_utils`` calls is re-raised
            after ``eks_utils.delete_eks_cluster`` has been called.
    """
    short_names = {"tensorflow": "tf", "mxnet": "mx", "pytorch": "pt"}
    abbreviation = short_names[framework_name]
    version_prefix = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")[0:7]

    # One node when is_pr_context() is truthy; otherwise pytorch gets four
    # nodes and every other framework gets three.
    if is_pr_context():
        node_count = 1
    elif framework_name == "pytorch":
        node_count = 4
    else:
        node_count = 3

    random_suffix = random.randint(1, 10000)
    cluster_name = f"dlc-{abbreviation}-cluster-{version_prefix}-{random_suffix}"

    try:
        eks_utils.eks_setup()
        if is_neuron:
            # TODO: the EKS AMI used for neuron has a snapshot size of 500GB,
            # so cluster creation fails with the default 80GB volume. Revert
            # to the default once the official EKS AMI for neuron 1.1 is
            # released.
            processor, volume_size, instance_type = "neuron", 500, "inf1.xlarge"
        else:
            # 80 is the default volume size.
            processor, volume_size, instance_type = "gpu", 80, "p3.16xlarge"
        eks_utils.create_eks_cluster(
            cluster_name, processor, node_count, volume_size, instance_type, "pytest.pem"
        )
    except Exception:
        # Clean up via delete_eks_cluster before propagating the failure.
        eks_utils.delete_eks_cluster(cluster_name)
        raise

    return cluster_name
def setup_eks_cluster(framework_name, is_neuron):
    """Set up EKS tooling, create a cluster for one framework, and return its name.

    The cluster name has the form ``dlc-<short>-cluster-<version>-<rand>``,
    where ``<version>`` is the first seven characters of the
    ``CODEBUILD_RESOLVED_SOURCE_VERSION`` environment variable and ``<rand>``
    is a random integer between 1 and 10000.

    Args:
        framework_name: One of ``"tensorflow"``, ``"mxnet"`` or ``"pytorch"``.
        is_neuron: When truthy, the cluster is requested on ``inf1.xlarge``
            instances; otherwise on ``p3.16xlarge``.

    Returns:
        The generated cluster name.

    Raises:
        KeyError: If ``framework_name`` is not a known framework.
        TypeError: If ``CODEBUILD_RESOLVED_SOURCE_VERSION`` is not set.
        Exception: Anything raised by the ``eks_utils`` calls is re-raised
            after ``eks_utils.delete_eks_cluster`` has been called.
    """
    short_names = {"tensorflow": "tf", "mxnet": "mx", "pytorch": "pt"}
    abbreviation = short_names[framework_name]
    version_prefix = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")[0:7]

    # One node when is_pr_context() is truthy; otherwise pytorch gets four
    # nodes and every other framework gets three.
    if is_pr_context():
        node_count = 1
    elif framework_name == "pytorch":
        node_count = 4
    else:
        node_count = 3

    random_suffix = random.randint(1, 10000)
    cluster_name = f"dlc-{abbreviation}-cluster-{version_prefix}-{random_suffix}"

    # Default volume size, used for both instance types.
    volume_size = 80

    try:
        eks_utils.eks_setup()
        # Only the instance type differs between the neuron and default paths.
        instance_type = "inf1.xlarge" if is_neuron else "p3.16xlarge"
        eks_utils.create_eks_cluster(
            cluster_name, node_count, volume_size, instance_type, "pytest.pem"
        )
    except Exception:
        # Clean up via delete_eks_cluster before propagating the failure.
        eks_utils.delete_eks_cluster(cluster_name)
        raise

    return cluster_name
def setup_eks_clusters(dlc_images):
    """Run EKS setup for every known framework found in ``dlc_images``.

    Frameworks are checked in the order tensorflow, pytorch, mxnet using the
    ``in`` operator against ``dlc_images``. For each match:

    * when ``is_pr_context()`` is truthy, no cluster is created and
      ``eks_utils.eks_setup`` is called with ``None`` as the cluster name;
    * otherwise a "gpu" cluster on ``p3.16xlarge`` is created (four nodes for
      pytorch, three for the others), recorded, and passed to
      ``eks_utils.eks_setup``.

    Args:
        dlc_images: Any object supporting ``in`` membership tests with the
            framework names (e.g. a string or a collection of strings).

    Returns:
        A list with the names of the clusters created by this call.
    """
    known_frameworks = (("tensorflow", "tf"), ("pytorch", "pt"), ("mxnet", "mx"))
    created_clusters = []

    for framework, abbreviation in known_frameworks:
        if framework not in dlc_images:
            continue

        if is_pr_context():
            # No dedicated cluster here; setup receives None as the name.
            eks_utils.eks_setup(framework, None)
            continue

        node_count = 4 if framework == "pytorch" else 3
        source_version = os.getenv('CODEBUILD_RESOLVED_SOURCE_VERSION')
        new_cluster = f"dlc-{abbreviation}-cluster-{source_version}-{random.randint(1, 10000)}"
        eks_utils.create_eks_cluster(new_cluster, "gpu", node_count, "p3.16xlarge", "pytest.pem")
        created_clusters.append(new_cluster)
        eks_utils.eks_setup(framework, new_cluster)

    return created_clusters
Exemplo n.º 4
0
def setup_eks_cluster(framework_name):
    """Set up EKS tooling, create a GPU cluster for one framework, and return its name.

    The cluster name has the form ``dlc-<short>-cluster-<version>-<rand>``,
    where ``<version>`` is the first seven characters of the
    ``CODEBUILD_RESOLVED_SOURCE_VERSION`` environment variable and ``<rand>``
    is a random integer between 1 and 10000.

    Args:
        framework_name: One of ``"tensorflow"``, ``"pytorch"`` or ``"mxnet"``.

    Returns:
        The generated cluster name.

    Raises:
        KeyError: If ``framework_name`` is not a known framework.
        TypeError: If ``CODEBUILD_RESOLVED_SOURCE_VERSION`` is not set.
        Exception: Anything raised by the ``eks_utils`` calls is re-raised
            after ``eks_utils.delete_eks_cluster`` has been called.
    """
    short_names = {"tensorflow": "tf", "pytorch": "pt", "mxnet": "mx"}
    abbreviation = short_names[framework_name]
    version_prefix = os.getenv('CODEBUILD_RESOLVED_SOURCE_VERSION')[0:7]

    # One node when is_pr_context() is truthy; otherwise pytorch gets four
    # nodes and every other framework gets three.
    if is_pr_context():
        node_count = 1
    elif framework_name == "pytorch":
        node_count = 4
    else:
        node_count = 3

    random_suffix = random.randint(1, 10000)
    cluster_name = f"dlc-{abbreviation}-cluster-{version_prefix}-{random_suffix}"

    try:
        eks_utils.eks_setup()
        eks_utils.create_eks_cluster(
            cluster_name, "gpu", node_count, "p3.16xlarge", "pytest.pem"
        )
    except Exception:
        # Clean up via delete_eks_cluster before propagating the failure.
        eks_utils.delete_eks_cluster(cluster_name)
        raise

    return cluster_name
def setup_eks_clusters(dlc_images):
    """Create one GPU EKS cluster for the single framework found in ``dlc_images``.

    Each known framework name (tensorflow, pytorch, mxnet) is tested with the
    ``in`` operator against ``dlc_images``; exactly one must match. A "gpu"
    cluster on ``p3.16xlarge`` is then created and ``eks_utils.eks_setup`` is
    called with the framework and cluster name.

    Args:
        dlc_images: Any object supporting ``in`` membership tests with the
            framework names (e.g. a string or a collection of strings).

    Returns:
        The name of the created cluster.

    Raises:
        ValueError: If zero or more than one known framework is found in
            ``dlc_images``.
    """
    short_names = {"tensorflow": "tf", "pytorch": "pt", "mxnet": "mx"}
    matched = [name for name in short_names if name in dlc_images]

    if len(matched) != 1:
        message = (
            "All images in dlc_images must be of a single framework for EKS tests.\n"
            f"Instead seeing {matched} frameworks."
        )
        raise ValueError(message)

    (framework,) = matched
    abbreviation = short_names[framework]

    # Two nodes when is_pr_context() is truthy; otherwise pytorch gets four
    # nodes and every other framework gets three.
    if is_pr_context():
        node_count = 2
    elif framework == "pytorch":
        node_count = 4
    else:
        node_count = 3

    source_version = os.getenv('CODEBUILD_RESOLVED_SOURCE_VERSION')
    cluster_name = f"dlc-{abbreviation}-cluster-{source_version}-{random.randint(1, 10000)}"

    eks_utils.create_eks_cluster(cluster_name, "gpu", node_count, "p3.16xlarge", "pytest.pem")
    eks_utils.eks_setup(framework, cluster_name)
    return cluster_name