Exemplo n.º 1
0
def submit_images_remote(c, node_count=int(env_values["CLUSTER_MAX_NODES"])):
    """Template for submitting an image training job to a remote GPU cluster.

    This command is intentionally not implemented: it raises immediately and
    everything after the ``raise`` is an unreachable example meant to be
    adapted. To use it, remove the ``raise`` and replace the ``<YOUR-...>``
    placeholders.

    Notice the ``{datastore}`` token in the data paths. It tells the submit
    method to insert the location mapped by the datastore, so that at
    execution time the appropriate path is prepended to the training and
    validation data paths.

    Args:
        c: Not used by this function; presumably the task-runner context.
        node_count (int, optional): The number of nodes to use in cluster.
            Defaults to env_values['CLUSTER_MAX_NODES'].

    Raises:
        NotImplementedError: Always, until the template has been customised.
    """
    not_ready_message = "You need to modify this call before being able to use it"
    raise NotImplementedError(not_ready_message)

    # ---- Unreachable example below: adapt it, then delete the raise above ----
    from aml_compute import PyTorchExperimentCLI

    experiment = PyTorchExperimentCLI("<YOUR-EXPERIMENT-NAME>")
    source_dir = os.path.join(_BASE_PATH, "src")
    script_args = {
        "--training_data_path": "{datastore}/train",
        "--validation_data_path": "{datastore}/validation",
        "--epochs": "1",
        "--data_type": "images",
        "--data-format": "channels_first",
    }
    conda_file = os.path.join(_BASE_PATH, "environment_gpu.yml")
    submitted = experiment.submit(
        source_dir,
        "<YOUR-TRAINING-SCRIPT>",
        script_args,
        node_count=node_count,
        dependencies_file=conda_file,
        wait_for_completion=True,
    )
    print(submitted)
Exemplo n.º 2
0
def submit_images_local(c):
    """Template for submitting an image training job to run locally on a GPU.

    This command is intentionally not implemented: it raises immediately and
    everything after the ``raise`` is an unreachable example meant to be
    adapted. To use it, remove the ``raise`` and replace the ``<YOUR-...>``
    placeholders.

    The example maps a host volume into the docker container that executes
    the job locally; ``/data`` inside the container is where the script is
    told to look for its training and validation data. Feel free to adjust
    the other arguments as required by your training script.

    Args:
        c: Not used by this function; presumably the task-runner context.

    Raises:
        NotImplementedError: Always, until the template has been customised.
    """
    raise NotImplementedError(
        "You need to modify this call before being able to use it")

    # ---- Unreachable example below: adapt it, then delete the raise above ----
    from aml_compute import PyTorchExperimentCLI
    exp = PyTorchExperimentCLI("<YOUR-EXPERIMENT-NAME>")
    run = exp.submit_local(
        os.path.join(_BASE_PATH, "src"),
        "<YOUR-TRAINING-SCRIPT>",
        {
            "--training_data_path": "/data/train",
            "--validation_data_path": "/data/validation",
            "--epochs": "1",
            "--data_type": "images",
            "--data-format": "channels_first",
        },
        dependencies_file=os.path.join(_BASE_PATH, "environment_gpu.yml"),
        # Upper-case key, matching the other env_values lookups in this file
        # (e.g. "CLUSTER_MAX_NODES"); the lower-case 'data' key was inconsistent.
        docker_args=["-v", f"{env_values['DATA']}:/data"],
        wait_for_completion=True,
    )
    print(run)
def submit_images(c,
                  node_count=int(env_values["CLUSTER_MAX_NODES"]),
                  epochs=1):
    """Run the Horovod ImageNet training script on the remote cluster with real data.

    Submits ``imagenet_pytorch_horovod.py`` from the ``src`` directory under
    the experiment name ``pytorch_real_images_remote``, waits for completion
    and prints the object returned by the submission.

    Args:
        c: Not used by this function; presumably the task-runner context.
        node_count (int, optional): The number of nodes to use in cluster.
            Defaults to env_values['CLUSTER_MAX_NODES'].
        epochs (int, optional): Number of epochs to run training for.
            Defaults to 1.
    """
    from aml_compute import PyTorchExperimentCLI

    experiment = PyTorchExperimentCLI("pytorch_real_images_remote")

    source_dir = os.path.join(_BASE_PATH, "src")
    # "{datastore}" is left as a literal token for the submit call to handle.
    script_args = {
        "--use_gpu": True,
        "--epochs": epochs,
        "--training_data_path": "{datastore}/train",
        "--validation_data_path": "{datastore}/validation",
    }
    conda_file = os.path.join(_BASE_PATH, "environment_gpu.yml")

    submitted = experiment.submit(
        source_dir,
        "imagenet_pytorch_horovod.py",
        script_args,
        node_count=node_count,
        dependencies_file=conda_file,
        wait_for_completion=True,
    )
    print(submitted)
Exemplo n.º 4
0
def submit_local(c):
    """Template for submitting a job to execute locally on a GPU.

    This command is intentionally not implemented: it raises immediately and
    everything after the ``raise`` is an unreachable example meant to be
    adapted. To use it, remove the ``raise`` and replace the placeholder
    experiment name, training script and arguments.

    Args:
        c: Not used by this function; presumably the task-runner context.

    Raises:
        NotImplementedError: Always, until the template has been customised.
    """
    not_ready_message = "You need to modify this call before being able to use it"
    raise NotImplementedError(not_ready_message)

    # ---- Unreachable example below: adapt it, then delete the raise above ----
    from aml_compute import PyTorchExperimentCLI

    experiment = PyTorchExperimentCLI("<YOUR-EXPERIMENT-NAME>")
    source_dir = os.path.join(_BASE_PATH, "src")
    script_args = {"YOUR": "ARGS"}
    conda_file = os.path.join(_BASE_PATH, "environment_gpu.yml")
    submitted = experiment.submit_local(
        source_dir,
        "<YOUR-TRAINING-SCRIPT>",
        script_args,
        dependencies_file=conda_file,
        wait_for_completion=True,
    )
    print(submitted)
Exemplo n.º 5
0
def submit_remote(c, node_count=int(env_values["CLUSTER_MAX_NODES"])):
    """Template for submitting a job to execute on a remote cluster using GPUs.

    This command is intentionally not implemented: it raises immediately and
    everything after the ``raise`` is an unreachable example meant to be
    adapted. To use it, remove the ``raise`` and replace the placeholder
    experiment name, training script and arguments.

    Args:
        c: Not used by this function; presumably the task-runner context.
        node_count (int, optional): The number of nodes to use in cluster.
            Defaults to env_values['CLUSTER_MAX_NODES'].

    Raises:
        NotImplementedError: Always, until the template has been customised.
    """
    not_ready_message = "You need to modify this call before being able to use it"
    raise NotImplementedError(not_ready_message)

    # ---- Unreachable example below: adapt it, then delete the raise above ----
    from aml_compute import PyTorchExperimentCLI

    experiment = PyTorchExperimentCLI("<YOUR-EXPERIMENT-NAME>")
    source_dir = os.path.join(_BASE_PATH, "src")
    script_args = {"YOUR": "ARGS"}
    conda_file = os.path.join(_BASE_PATH, "environment_gpu.yml")
    submitted = experiment.submit(
        source_dir,
        "<YOUR-TRAINING-SCRIPT>",
        script_args,
        node_count=node_count,
        dependencies_file=conda_file,
        wait_for_completion=True,
    )
    print(submitted)
def submit_synthetic_local(c, epochs=1):
    """Run the Horovod ImageNet training script locally on synthetic data.

    Submits ``imagenet_pytorch_horovod.py`` from the ``src`` directory for
    local execution under the experiment name
    ``pytorch_synthetic_images_local``, waits for completion and prints the
    object returned by the submission. No data paths are passed to the script.

    Args:
        c: Not used by this function; presumably the task-runner context.
        epochs (int, optional): Number of epochs to run training for.
            Defaults to 1.
    """
    from aml_compute import PyTorchExperimentCLI

    experiment = PyTorchExperimentCLI("pytorch_synthetic_images_local")

    source_dir = os.path.join(_BASE_PATH, "src")
    script_args = {"--epochs": epochs, "--use_gpu": True}
    conda_file = os.path.join(_BASE_PATH, "environment_gpu.yml")

    submitted = experiment.submit_local(
        source_dir,
        "imagenet_pytorch_horovod.py",
        script_args,
        dependencies_file=conda_file,
        wait_for_completion=True,
    )
    print(submitted)
Exemplo n.º 7
0
def submit_benchmark_remote(c,
                            node_count=int(env_values["CLUSTER_MAX_NODES"])):
    """Run the synthetic-data PyTorch benchmark on the remote cluster.

    Submits ``pytorch_synthetic_benchmark.py`` from the ``src`` directory
    under the experiment name ``synthetic_benchmark_remote`` with a
    ``resnet50`` model and a batch size of 64, waits for completion and
    prints the object returned by the submission.

    Args:
        c: Not used by this function; presumably the task-runner context.
        node_count (int, optional): The number of nodes to use in cluster.
            Defaults to env_values['CLUSTER_MAX_NODES'].
    """
    from aml_compute import PyTorchExperimentCLI

    experiment = PyTorchExperimentCLI("synthetic_benchmark_remote")

    source_dir = os.path.join(_BASE_PATH, "src")
    script_args = {"--model": "resnet50", "--batch-size": 64}
    conda_file = os.path.join(_BASE_PATH, "environment_gpu.yml")

    submitted = experiment.submit(
        source_dir,
        "pytorch_synthetic_benchmark.py",
        script_args,
        node_count=node_count,
        dependencies_file=conda_file,
        wait_for_completion=True,
    )
    print(submitted)
def submit_images_local(c, epochs=1):
    """Run the Horovod ImageNet training script locally on real image data.

    Submits ``imagenet_pytorch_horovod.py`` from the ``src`` directory for
    local execution under the experiment name ``pytorch_real_images_local``,
    waits for completion and prints the object returned by the submission.
    The path stored in ``env_values['DATA']`` is passed to docker as a ``-v``
    volume mapped to ``/data``, which is where the script is told to find its
    training and validation sets.

    Args:
        c: Not used by this function; presumably the task-runner context.
        epochs (int, optional): Number of epochs to run training for.
            Defaults to 1.
    """
    from aml_compute import PyTorchExperimentCLI

    experiment = PyTorchExperimentCLI("pytorch_real_images_local")

    source_dir = os.path.join(_BASE_PATH, "src")
    script_args = {
        "--epochs": epochs,
        "--use_gpu": True,
        "--training_data_path": "/data/train",
        "--validation_data_path": "/data/validation",
    }
    conda_file = os.path.join(_BASE_PATH, "environment_gpu.yml")
    # Volume spec handed to docker: <host data dir>:/data
    volume_mapping = f"{env_values['DATA']}:/data"

    submitted = experiment.submit_local(
        source_dir,
        "imagenet_pytorch_horovod.py",
        script_args,
        dependencies_file=conda_file,
        docker_args=["-v", volume_mapping],
        wait_for_completion=True,
    )
    print(submitted)