# NOTE: assumed module-level imports, following the surrounding test suite's conventions
# (`set_docker_image` is a helper defined elsewhere in this module):
import os

from tests import config as conf
from tests import experiment as exp


def test_epoch_sync(num_workers: int, global_batch_size: int, dataset_len: int) -> None:
    """
    Test that epoch_idx is synchronized across all workers regardless of whether the number
    of batches is evenly divisible by the number of workers.
    """
    config = conf.load_config(conf.fixtures_path("pytorch_no_op/const.yaml"))
    config = conf.set_slots_per_trial(config, num_workers)
    max_len_batches = 10
    config = conf.set_max_length(config, {"batches": max_len_batches})
    config = conf.set_hparam(config, "dataset_len", dataset_len)
    config = conf.set_global_batch_size(config, global_batch_size)

    e_id = exp.run_basic_test_with_temp_config(
        config, conf.fixtures_path("pytorch_no_op"), 1
    )
    t_id = exp.experiment_trials(e_id)[0].trial.id

    batches_per_epoch = (dataset_len + global_batch_size - 1) // global_batch_size  # ceil
    for batch_idx in range(max_len_batches):
        epoch_idx = batch_idx // batches_per_epoch
        for rank in range(config["resources"]["slots_per_trial"]):
            assert exp.check_if_string_present_in_trial_logs(
                t_id, f"rank {rank} finished batch {batch_idx} in epoch {epoch_idx}"
            )
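

# Worked example of the epoch arithmetic checked above, with illustrative values (not taken
# from any real parametrization): dataset_len=12 and global_batch_size=5 give
# batches_per_epoch = ceil(12 / 5) = 3, so batches 0-2 land in epoch 0 and batch 3 rolls
# over to epoch 1 even though 12 is not evenly divisible by 5. The helper name is
# hypothetical and exists only for this sketch.
def _expected_epoch_indices(dataset_len: int, global_batch_size: int, num_batches: int) -> list:
    batches_per_epoch = (dataset_len + global_batch_size - 1) // global_batch_size  # ceil
    return [batch_idx // batches_per_epoch for batch_idx in range(num_batches)]


# _expected_epoch_indices(12, 5, 6) == [0, 0, 0, 1, 1, 1]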


def test_text_classification_glue_amp() -> None:
    example_path = conf.model_hub_examples_path("huggingface/text-classification")
    config = conf.load_config(os.path.join(example_path, "glue_config.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_hparam(config, "use_apex_amp", True)
    config = set_docker_image(config)

    exp.run_basic_test_with_temp_config(config, example_path, 1)


def test_language_modeling_plm_amp() -> None:
    example_path = conf.model_hub_examples_path("huggingface/language-modeling")
    config = conf.load_config(os.path.join(example_path, "plm_config.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_hparam(config, "use_apex_amp", True)
    config = set_docker_image(config)

    exp.run_basic_test_with_temp_config(config, example_path, 1)


def test_token_classification_ner_amp() -> None:
    example_path = conf.model_hub_examples_path("huggingface/token-classification")
    config = conf.load_config(os.path.join(example_path, "ner_config.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_global_batch_size(config, 32)
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_hparam(config, "use_apex_amp", True)
    config = set_docker_image(config)

    exp.run_basic_test_with_temp_config(config, example_path, 1)


def test_squad_v2_with_beam_search_amp() -> None:
    example_path = conf.model_hub_examples_path("huggingface/question-answering")
    config = conf.load_config(os.path.join(example_path, "squad_v2_beam_search.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_global_batch_size(config, 16)
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_hparam(config, "use_apex_amp", True)
    config = set_docker_image(config)

    exp.run_basic_test_with_temp_config(config, example_path, 1)


def test_multiple_choice_swag_amp() -> None:
    example_path = conf.model_hub_examples_path("huggingface/multiple-choice")
    config = conf.load_config(os.path.join(example_path, "swag_config.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_global_batch_size(config, 64)
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_hparam(config, "use_apex_amp", True)
    config = set_docker_image(config)

    exp.run_basic_test_with_temp_config(config, example_path, 1)
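

# The `conf` helpers used throughout these tests are thin wrappers that mutate the loaded
# experiment config dict. A minimal sketch of their likely shape, assuming the standard
# Determined experiment-config layout (searcher.max_length, resources.slots_per_trial,
# hyperparameters.*); the real implementations live in the shared test utilities, so the
# underscore-prefixed names and bodies below are illustrations, not the suite's actual code.
from typing import Any, Dict


def _set_max_length_sketch(config: Dict[str, Any], max_length: Dict[str, int]) -> Dict[str, Any]:
    config["searcher"]["max_length"] = max_length  # e.g. {"batches": 200}
    return config


def _set_slots_per_trial_sketch(config: Dict[str, Any], slots: int) -> Dict[str, Any]:
    config["resources"]["slots_per_trial"] = slots  # accelerators per trial, e.g. 8
    return config


def _set_hparam_sketch(config: Dict[str, Any], name: str, value: Any) -> Dict[str, Any]:
    config["hyperparameters"][name] = value  # const-style hyperparameter, e.g. use_apex_amp=True
    return config


def _set_global_batch_size_sketch(config: Dict[str, Any], batch_size: int) -> Dict[str, Any]:
    config["hyperparameters"]["global_batch_size"] = batch_size
    return config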


def test_detr_distributed_fake() -> None:
    example_path = conf.fixtures_path("mmdetection")
    config = conf.load_config(os.path.join(example_path, "distributed_fake_data.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = set_docker_image(config)
    config = conf.set_hparam(
        config, "config_file", "/mmdetection/configs/detr/detr_r50_8x2_150e_coco.py"
    )

    exp.run_basic_test_with_temp_config(config, example_path, 1)
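

# `set_docker_image` is defined elsewhere in this module. A plausible minimal sketch,
# assuming it pins the trial's container via the experiment config's environment.image
# field; the helper name and image tag below are placeholders, not the suite's real values.
def _set_docker_image_sketch(
    config: Dict[str, Any], image: str = "example/model-hub:latest"
) -> Dict[str, Any]:
    config.setdefault("environment", {})["image"] = image
    return config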