def test_cifar_resnet_distributed_1bitsgd(device_id):
    """Run ``script_under_test`` via ``mpiexec_test`` with ``-q 1`` on ``base_path`` data.

    NOTE(review): the trailing positional values (0.86, False, 3) are
    interpreted by ``mpiexec_test``, which is defined elsewhere -- presumably
    expected error, exact-match flag and a tolerance; confirm.
    """
    script_args = [
        "-e", "2",
        "-datadir", base_path,
        "-q", "1",
        "-es", "512",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, script_args, 0.86, False, 3
    )
def test_cifar_convnet_distributed(device_id):
    """Run the distributed ConvNet script and verify its TensorBoard output.

    A stale log directory next to this file is removed first; after the run
    the directory must hold exactly one entry whose name starts with
    ``events.out.tfevents``.
    """
    # Build the TensorBoard log directory path and make sure it does not exist yet.
    here = os.path.dirname(os.path.abspath(__file__))
    tb_logdir = os.path.join(here, 'ConvNet_CIFAR10_DataAug_Distributed_test_log')
    if os.path.exists(tb_logdir):
        shutil.rmtree(tb_logdir)

    script_args = [
        "-n", "2",
        "-m", "64",
        "-e", "3200",
        "-datadir", base_path,
        "-tensorboard_logdir", tb_logdir,
        "-q", "32",
        "-r",
        "-device", str(device_id),
    ]
    # False since different workers may have different #cores.
    mpiexec_test(
        device_id,
        script_under_test,
        mpiexec_params,
        script_args,
        0.75,
        False,
        per_minibatch_tolerance=1e-2,
    )

    # The log directory must now exist and contain exactly one event file.
    log_entries = os.listdir(tb_logdir)
    for entry in log_entries:
        assert entry.startswith("events.out.tfevents")
    assert len(log_entries) == 1
def test_cifar_resnet_distributed_block_momentum(device_id):
    """Run ``script_under_test`` via ``mpiexec_test`` with ``-b 3200`` on ``base_path`` data.

    NOTE(review): the trailing positional values (0.89, False, 5) are
    interpreted by ``mpiexec_test``, which is defined elsewhere -- presumably
    expected error, exact-match flag and a tolerance; confirm.
    """
    script_args = [
        "-e", "2",
        "-datadir", base_path,
        "-b", "3200",
        "-es", "512",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, script_args, 0.89, False, 5
    )
def test_htk_lstm_truncated_distributed_gpu(device_id):
    """Run ``script_under_test`` via ``mpiexec_test`` on the AN4 dataset directory with ``-q 1``.

    NOTE(review): 0.76 and True are interpreted by ``mpiexec_test`` (defined
    elsewhere) -- presumably expected error and exact-match flag; confirm.
    """
    data_dir = an4_dataset_directory()
    script_args = [
        "-n", "3",
        "-datadir", data_dir,
        "-q", "1",
        "-m", "640",
        "-e", "1000",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, script_args, 0.76, True
    )
def test_cifar_convnet_distributed_block_momentum(device_id):
    """Run ``script_under_test`` via ``mpiexec_test`` with ``-b 1600`` on ``base_path`` data.

    NOTE(review): the trailing positional values (0.78, False, 10) are
    interpreted by ``mpiexec_test``, which is defined elsewhere -- presumably
    expected error, exact-match flag and a tolerance; confirm.
    """
    script_args = [
        "-n", "2",
        "-m", "64",
        "-e", "3200",
        "-datadir", base_path,
        "-b", "1600",
        "-r",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, script_args, 0.78, False, 10
    )
def test_cifar_resnet_distributed(device_id):
    """Run ``script_under_test`` via ``mpiexec_test`` with ``-q 32``.

    The data directory is whatever ``prepare_CIFAR10_data()`` returns.

    NOTE(review): the trailing positional values (0.86, False, 3) are
    interpreted by ``mpiexec_test``, which is defined elsewhere -- presumably
    expected error, exact-match flag and a tolerance; confirm.
    """
    data_dir = prepare_CIFAR10_data()
    script_args = [
        "-e", "2",
        "-datadir", data_dir,
        "-q", "32",
        "-es", "512",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, script_args, 0.86, False, 3
    )
def test_cifar_resnet_distributed_block_momentum(device_id):
    """Run ``script_under_test`` via ``mpiexec_test`` with ``-b 3200`` and ``-r``.

    NOTE(review): the trailing positional values (0.89, False, 5) are
    interpreted by ``mpiexec_test``, which is defined elsewhere -- presumably
    expected error, exact-match flag and a tolerance; confirm.
    """
    script_args = [
        "-e", "2",
        "-datadir", base_path,
        "-b", "3200",
        "-es", "512",
        "-r",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, script_args, 0.89, False, 5
    )
def test_alexnet_imagenet_distributed_block_momentum(device_id):
    """Run ``script_under_test`` via ``mpiexec_test`` with ``-b 1600``.

    The data directory is whatever ``prepare_ImageNet_data()`` returns.

    NOTE(review): 0.99 and False are interpreted by ``mpiexec_test`` (defined
    elsewhere) -- presumably expected error and exact-match flag; confirm.
    """
    data_dir = prepare_ImageNet_data()
    script_args = [
        "-n", "2",
        "-m", "8",
        "-e", "16",
        "-datadir", data_dir,
        "-b", "1600",
        "-r",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, script_args, 0.99, False
    )
def test_sequence_to_sequence_distributed_block_momentum(device_id):
    """Run ``script_under_test`` via ``mpiexec_test`` on the CMUDict dataset directory with ``-b 3200``.

    NOTE(review): the trailing positional values (0.8612, False, 1, 2E-2) are
    interpreted by ``mpiexec_test``, which is defined elsewhere -- presumably
    expected error, exact-match flag and two tolerances; confirm.
    """
    data_dir = cmudict_dataset_directory()
    script_args = [
        "-e", "4",
        "-datadir", data_dir,
        "-ms", "100",
        "-es", "1000",
        "-b", "3200",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id,
        script_under_test,
        mpiexec_params,
        script_args,
        0.8612,
        False,
        1,
        2E-2,
    )
def test_sequence_to_sequence_distributed_gpu(device_id):
    """Run ``script_under_test`` via ``mpiexec_test`` on the CMUDict dataset directory with ``-q 1``.

    NOTE(review): the trailing positional values (0.8625, False, 0, 2E-2) are
    interpreted by ``mpiexec_test``, which is defined elsewhere -- presumably
    expected error, exact-match flag and two tolerances; confirm.
    """
    data_dir = cmudict_dataset_directory()
    script_args = [
        "-e", "2",
        "-datadir", data_dir,
        "-q", "1",
        "-ms", "100",
        "-es", "500",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id,
        script_under_test,
        mpiexec_params,
        script_args,
        0.8625,
        False,
        0,
        2E-2,
    )
def test_bn_inception_imagenet_distributed(device_id):
    """Run ``script_under_test`` via ``mpiexec_test`` with ``-q 32``.

    The data directory is whatever ``prepare_ImageNet_data()`` returns.

    NOTE(review): 0.99 and True are interpreted by ``mpiexec_test`` (defined
    elsewhere) -- presumably expected error and exact-match flag; confirm.
    """
    data_dir = prepare_ImageNet_data()
    script_args = [
        "-n", "4",
        "-datadir", data_dir,
        "-q", "32",
        "-e", "300",
        "-m", "2",
        "-r",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, script_args, 0.99, True
    )
def test_bn_inception_cifar_distributed(device_id):
    """Run ``script_under_test`` via ``mpiexec_test`` with ``-q 32``.

    The data directory is whatever ``prepare_CIFAR10_data()`` returns.

    NOTE(review): 0.88 and True are interpreted by ``mpiexec_test`` (defined
    elsewhere) -- presumably expected error and exact-match flag; confirm.
    """
    data_dir = prepare_CIFAR10_data()
    script_args = [
        "-n", "8",
        "-datadir", data_dir,
        "-q", "32",
        "-e", "500",
        "-m", "16",
        "-r",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, script_args, 0.88, True
    )
def test_cifar_convnet_distributed_1bitsgd(device_id):
    """Run ``script_under_test`` via ``mpiexec_test`` with ``-q 1``.

    The data directory is whatever ``prepare_CIFAR10_data()`` returns, and
    the run uses a per-minibatch tolerance of 1e-2.

    NOTE(review): 0.75 and False are interpreted by ``mpiexec_test`` (defined
    elsewhere) -- presumably expected error and exact-match flag; confirm.
    """
    data_dir = prepare_CIFAR10_data()
    script_args = [
        "-n", "2",
        "-m", "64",
        "-e", "3200",
        "-datadir", data_dir,
        "-q", "1",
        "-r",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id,
        script_under_test,
        mpiexec_params,
        script_args,
        0.75,
        False,
        per_minibatch_tolerance=1e-2,
    )
def test_alexnet_imagenet_distributed_1bitsgd(device_id):
    """Run ``script_under_test`` via ``mpiexec_test`` with ``-q 1``.

    The data directory is whatever ``prepare_ImageNet_data()`` returns.

    NOTE(review): 0.99 and True are interpreted by ``mpiexec_test`` (defined
    elsewhere) -- presumably expected error and exact-match flag; confirm.
    """
    data_dir = prepare_ImageNet_data()
    script_args = [
        "-n", "2",
        "-datadir", data_dir,
        "-q", "1",
        "-m", "8",
        "-e", "16",
        "-r",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, script_args, 0.99, True
    )
def disabled_test_alexnet_imagenet_distributed_block_momentum(device_id):
    """Run ``script_under_test`` via ``mpiexec_test`` with ``-b 1600``.

    The name does not start with ``test``, so pytest's default discovery
    rules will not collect this function.

    NOTE(review): 0.99 and False are interpreted by ``mpiexec_test`` (defined
    elsewhere) -- presumably expected error and exact-match flag; confirm.
    """
    data_dir = prepare_ImageNet_data()
    script_args = [
        "-n", "2",
        "-m", "8",
        "-e", "16",
        "-datadir", data_dir,
        "-b", "1600",
        "-r",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, script_args, 0.99, False
    )
def test_htk_lstm_truncated_distributed_block_momentum(device_id):
    """Run ``script_under_test`` via ``mpiexec_test`` on the AN4 dataset directory with ``-b 1600``.

    NOTE(review): the trailing positional values (0.76, False, 4) are
    interpreted by ``mpiexec_test``, which is defined elsewhere -- presumably
    expected error, exact-match flag and a tolerance; confirm.
    """
    data_dir = an4_dataset_directory()
    script_args = [
        "-n", "3",
        "-m", "640",
        "-e", "1000",
        "-datadir", data_dir,
        "-b", "1600",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, script_args, 0.76, False, 4
    )
def test_bn_inception_imagenet_distributed(device_id):
    """Run ``script_under_test`` via ``mpiexec_test`` with ``-q 32`` and a 400-second timeout.

    The data directory is whatever ``prepare_ImageNet_data()`` returns.

    NOTE(review): 0.99 and True are interpreted by ``mpiexec_test`` (defined
    elsewhere) -- presumably expected error and exact-match flag; confirm.
    """
    data_dir = prepare_ImageNet_data()
    script_args = [
        "-n", "4",
        "-datadir", data_dir,
        "-q", "32",
        "-e", "300",
        "-m", "2",
        "-r",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id,
        script_under_test,
        mpiexec_params,
        script_args,
        0.99,
        True,
        timeout_seconds=400,
    )
def test_cifar_convnet_distributed_gpu(device_id):
    """Run ``script_under_test`` via ``mpiexec_test`` with ``-q 1`` on ``base_path`` data.

    The run uses a per-minibatch tolerance of 1e-2.

    NOTE(review): 0.75 and False are interpreted by ``mpiexec_test`` (defined
    elsewhere) -- presumably expected error and exact-match flag; confirm.
    """
    script_args = [
        "-n", "2",
        "-m", "64",
        "-e", "3200",
        "-datadir", base_path,
        "-q", "1",
        "-r",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id,
        script_under_test,
        mpiexec_params,
        script_args,
        0.75,
        False,
        per_minibatch_tolerance=1e-2,
    )
def test_VGG16_imagenet_distributed(device_id):
    """Run ``script_under_test`` via ``mpiexec_test`` on CPU only, with a 500-second timeout.

    The data directory is whatever ``prepare_ImageNet_data()`` returns.

    NOTE(review): 0.99 and True are interpreted by ``mpiexec_test`` (defined
    elsewhere) -- presumably expected error and exact-match flag; confirm.
    """
    data_dir = prepare_ImageNet_data()
    script_args = [
        "-n", "2",
        "-m", "2",
        "-e", "2",
        "-datadir", data_dir,
        "-q", "32",
        "-device", str(device_id),
        "-r",
        "-testing",
    ]
    # Only the CPU is exercised for now because GPU memory usage is very high (~6 GB).
    mpiexec_test(
        device_id,
        script_under_test,
        mpiexec_params,
        script_args,
        0.99,
        True,
        timeout_seconds=500,
        use_only_cpu=True,
    )
def test_inception_v3_imagenet_distributed(device_id):
    """Inception-V3 distributed ImageNet test; currently skipped unconditionally.

    The statements after ``pytest.skip`` are kept so the test can be
    re-enabled by removing the skip.
    """
    # The Inception-V3 distributed ImageNet test needs a lot of memory, which
    # the test server may struggle to provide, so it is muted for now.
    pytest.skip('Mute Inception-V3 distributed test temporarily')

    data_dir = prepare_ImageNet_data()
    script_args = [
        "-n", "2",
        "-datadir", data_dir,
        "-q", "32",
        "-e", "200",
        "-m", "2",
        "-r",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id,
        script_under_test,
        mpiexec_params,
        script_args,
        0.99,
        True,
        timeout_seconds=400,
    )
def test_VGG19_imagenet_distributed(device_id):
    """Run ``script_under_test`` via ``mpiexec_test`` on CPU only, with a 500-second timeout.

    The data directory is whatever ``prepare_ImageNet_data()`` returns.

    NOTE(review): 0.99 and True are interpreted by ``mpiexec_test`` (defined
    elsewhere) -- presumably expected error and exact-match flag; confirm.
    """
    data_dir = prepare_ImageNet_data()
    script_args = [
        "-n", "2",
        "-m", "2",
        "-e", "2",
        "-datadir", data_dir,
        "-q", "32",
        "-device", str(device_id),
        "-r",
        "-testing",
    ]
    # Only the CPU is exercised for now because GPU memory usage is very high (~6 GB).
    mpiexec_test(
        device_id,
        script_under_test,
        mpiexec_params,
        script_args,
        0.99,
        True,
        timeout_seconds=500,
        use_only_cpu=True,
    )
def test_inception_v3_imagenet_distributed(device_id):
    """Inception-V3 distributed ImageNet test; currently skipped unconditionally.

    The statements after ``pytest.skip`` are kept so the test can be
    re-enabled by removing the skip.
    """
    # The Inception-V3 distributed ImageNet test needs a lot of memory, which
    # the test server may struggle to provide, so it is muted for now.
    pytest.skip('Mute Inception-V3 distributed test temporarily')

    data_dir = prepare_ImageNet_data()
    script_args = [
        "-n", "2",
        "-datadir", data_dir,
        "-q", "32",
        "-e", "200",
        "-m", "2",
        "-r",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id,
        script_under_test,
        mpiexec_params,
        script_args,
        0.99,
        True,
        timeout_seconds=400,
    )
def test_cifar_convnet_distributed(device_id):
    """Run the distributed ConvNet script and verify its TensorBoard output.

    A stale log directory next to this file is removed first; after the run
    the directory must hold exactly one entry whose name starts with
    ``events.out.tfevents``.
    """
    # Build the TensorBoard log directory path and make sure it does not exist yet.
    here = os.path.dirname(os.path.abspath(__file__))
    tb_logdir = os.path.join(here, 'ConvNet_CIFAR10_DataAug_Distributed_test_log')
    if os.path.exists(tb_logdir):
        shutil.rmtree(tb_logdir)

    script_args = [
        "-n", "2",
        "-m", "64",
        "-e", "3200",
        "-datadir", base_path,
        "-tensorboard_logdir", tb_logdir,
        "-q", "32",
        "-r",
        "-device", str(device_id),
    ]
    # False since different workers may have different #cores.
    mpiexec_test(
        device_id,
        script_under_test,
        mpiexec_params,
        script_args,
        0.75,
        False,
        per_minibatch_tolerance=1e-2,
    )

    # The log directory must now exist and contain exactly one event file.
    log_entries = os.listdir(tb_logdir)
    for entry in log_entries:
        assert entry.startswith("events.out.tfevents")
    assert len(log_entries) == 1