def submit_tf_benchmark(c, node_count=int(env_values["CLUSTER_MAX_NODES"])):
    """Submit the TensorFlow benchmark job, using synthetic data, to the remote cluster.

    Args:
        c: Task context handed in by the caller; not used by this function.
        node_count (int, optional): The number of nodes to use in the cluster.
            Defaults to env_values['CLUSTER_MAX_NODES'], read once when this
            function is defined.

    Note:
        Runs the ResNet 50 model with a batch size of 256 and mixed precision.
        The call is made with wait_for_completion=True and the object returned
        by submit() is printed.
    """
    from aml_compute import TFExperimentCLI

    experiment = TFExperimentCLI("tf_benchmark")

    # Script folder and environment file both live next to this module.
    source_dir = os.path.join(_BASE_PATH, "src")
    benchmark_args = {
        "--model": "resnet50",
        "--batch_size": 256,
        "--variable_update": "horovod",
        "--use_fp16": "",  # flag without a value
    }
    environment_file = os.path.join(_BASE_PATH, "environment_gpu.yml")

    submitted_run = experiment.submit(
        source_dir,
        "tf_cnn_benchmarks.py",
        benchmark_args,
        node_count=node_count,
        dependencies_file=environment_file,
        wait_for_completion=True,
    )
    print(submitted_run)
def submit_images(c, node_count=int(env_values["CLUSTER_MAX_NODES"]), epochs=1):
    """Submit a TensorFlow training job on real imagenet data to the remote cluster.

    Args:
        c: Task context handed in by the caller; not used by this function.
        node_count (int, optional): The number of nodes to use in the cluster.
            Defaults to env_values['CLUSTER_MAX_NODES'], read once when this
            function is defined.
        epochs (int, optional): Number of epochs to run training for.
            Defaults to 1.

    Note:
        The data paths contain a literal "{datastore}" placeholder that is
        passed through to submit() unchanged; presumably submit() substitutes
        the mounted datastore location - confirm against TFExperimentCLI.
    """
    from aml_compute import TFExperimentCLI

    experiment = TFExperimentCLI("real_images_remote")

    source_dir = os.path.join(_BASE_PATH, "src")
    training_args = {
        "--training_data_path": "{datastore}/train",
        "--validation_data_path": "{datastore}/validation",
        "--epochs": epochs,
        "--data_type": "images",
        "--data-format": "channels_first",
    }
    environment_file = os.path.join(_BASE_PATH, "environment_gpu.yml")

    submitted_run = experiment.submit(
        source_dir,
        "resnet_main.py",
        training_args,
        node_count=node_count,
        dependencies_file=environment_file,
        wait_for_completion=True,
    )
    print(submitted_run)
def submit_images(c):
    """This command isn't implemented; please modify it before use.

    The call below the ``raise`` will work for submitting jobs to execute on a
    remote cluster using GPUs. Notice that a {datastore} parameter is passed
    in the data paths. This tells the submit method that we want the location
    as mapped by the datastore to be inserted there. Upon execution the
    appropriate path will be prepended to the training_data_path and
    validation_data_path.

    Args:
        c: Task context handed in by the caller; not used by this function.

    Raises:
        NotImplementedError: Always, until the ``raise`` statement is removed
            and the placeholders are filled in.
    """
    # NOTE(review): another definition named submit_images appears earlier in
    # this source; if both live in the same module, this later one replaces it
    # - confirm which one is intended.
    raise NotImplementedError(
        "You need to modify this call before being able to use it"
    )

    # Template code: unreachable until the raise above is deleted.
    from aml_compute import TFExperimentCLI

    experiment = TFExperimentCLI("<YOUR-EXPERIMENT-NAME>")
    source_dir = os.path.join(_BASE_PATH, "src")
    training_args = {
        "--training_data_path": "{datastore}/train",
        "--validation_data_path": "{datastore}/validation",
        "--epochs": "1",
        "--data_type": "images",
        "--data-format": "channels_first",
    }
    submitted_run = experiment.submit(
        source_dir,
        "<YOUR-TRAINING-SCRIPT>",
        training_args,
        node_count=4,
        dependencies_file="TensorFlow/environment_gpu.yml",
        wait_for_completion=True,
    )
    print(submitted_run)
def submit_remote(c):
    """This command isn't implemented; please modify it before use.

    The call below the ``raise`` will work for submitting jobs to execute on a
    remote cluster using GPUs once the experiment name, training script and
    arguments placeholders have been filled in.

    Args:
        c: Task context handed in by the caller; not used by this function.

    Raises:
        NotImplementedError: Always, until the ``raise`` statement is removed
            and the placeholders are filled in.
    """
    raise NotImplementedError(
        "You need to modify this call before being able to use it"
    )

    # Template code: unreachable until the raise above is deleted.
    from aml_compute import TFExperimentCLI

    experiment = TFExperimentCLI("<YOUR-EXPERIMENT-NAME>")
    source_dir = os.path.join(_BASE_PATH, "src")
    script_args = {"YOUR": "ARGS"}
    submitted_run = experiment.submit(
        source_dir,
        "<YOUR-TRAINING-SCRIPT>",
        script_args,
        node_count=4,
        dependencies_file="TensorFlow/environment_gpu.yml",
        wait_for_completion=True,
    )
    print(submitted_run)
def submit_synthetic(c, node_count=int(env_values["CLUSTER_MAX_NODES"]), epochs=1):
    """Submit a TensorFlow training job on synthetic imagenet data to the remote cluster.

    Args:
        c: Task context handed in by the caller; not used by this function.
        node_count (int, optional): The number of nodes to use in the cluster.
            Defaults to env_values['CLUSTER_MAX_NODES'], read once when this
            function is defined.
        epochs (int, optional): Number of epochs to run training for.
            Defaults to 1.

    Note:
        Only "--epochs" is passed to the training script; no data paths are
        supplied. The call is made with wait_for_completion=True and the
        object returned by submit() is printed.
    """
    from aml_compute import TFExperimentCLI

    exp = TFExperimentCLI("synthetic_images_remote")
    run = exp.submit(
        os.path.join(_BASE_PATH, "src"),
        "resnet_main.py",
        {"--epochs": epochs},
        node_count=node_count,
        # Resolve the environment file against _BASE_PATH, like the script
        # directory above, so the task does not depend on the directory it is
        # launched from.
        dependencies_file=os.path.join(_BASE_PATH, "environment_gpu.yml"),
        wait_for_completion=True,
    )
    print(run)