def test_multi_value_matmul_prop():
    args = """
        --config unit_test
        --layers-per-ipu 3 7 7 7
        --num-hidden-layers 24
        --matmul-proportion 0.15 0.3 0.3 0.3
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))
    assert config.matmul_proportion == [0.15, 0.3, 0.3, 0.3]

    # Invalid inputs
    args = """
        --config unit_test
        --layers-per-ipu 3 7 7 7
        --num-hidden-layers 24
        --matmul-proportion 0.15 0.3 0.3
    """.split()
    with pytest.raises(SystemExit):
        config = BertConfig(**(vars(parse_bert_args(args))))

    args = """
        --config unit_test
        --layers-per-ipu 3 7 7 7
        --num-hidden-layers 24
        --matmul-proportion 0.15 0.3 0.3 0.3 0.3
    """.split()
    with pytest.raises(SystemExit):
        config = BertConfig(**(vars(parse_bert_args(args))))
def test_invalid_layers_per_ipu():
    args = """
        --config unit_test
        --layers-per-ipu 1 1 1 1
        --num-hidden-layers 3
    """.split()
    with pytest.raises(SystemExit):
        config = BertConfig(**(vars(parse_bert_args(args))))

    args = """
        --config unit_test
        --layers-per-ipu 4
        --num-hidden-layers 3
    """.split()
    with pytest.raises(SystemExit):
        config = BertConfig(**(vars(parse_bert_args(args))))

    args = """
        --config unit_test
        --layers-per-ipu 0 1 2 1
        --num-hidden-layers 3
    """.split()
    with pytest.raises(SystemExit):
        config = BertConfig(**(vars(parse_bert_args(args))))

    args = """
        --config unit_test
        --layers-per-ipu 0 1 1 1 1
        --num-hidden-layers 3
    """.split()
    with pytest.raises(SystemExit):
        config = BertConfig(**(vars(parse_bert_args(args))))
def test_get_layer_ipu():
    args = """
        --config unit_test
        --layers-per-ipu 2
        --num-hidden-layers 12
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))
    assert (_get_layer_ipu(config.layers_per_ipu)
            == [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5])

    args = """
        --config unit_test
        --layers-per-ipu 2 2 2 2 2 1
        --num-hidden-layers 11
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))
    assert (_get_layer_ipu(config.layers_per_ipu)
            == [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5])

    args = """
        --config unit_test
        --layers-per-ipu 0 1 1 1
        --num-hidden-layers 3
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))
    assert _get_layer_ipu(config.layers_per_ipu) == [1, 2, 3]
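# For reference, the assertions above imply that `_get_layer_ipu` expands a
# per-IPU layer count into a per-layer IPU index. A minimal sketch of that
# expansion, written here only for illustration (an assumption, not the
# project's actual implementation):
def _get_layer_ipu_sketch(layers_per_ipu):
    # e.g. [0, 1, 1, 1] -> [1, 2, 3]: IPU 0 hosts no encoder layers
    return [ipu for ipu, n_layers in enumerate(layers_per_ipu)
            for _ in range(n_layers)]

assert _get_layer_ipu_sketch([0, 1, 1, 1]) == [1, 2, 3]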
def test_multi_value_layers_per_ipu():
    args = """
        --config unit_test
        --layers-per-ipu 1 2 3 4
        --num-hidden-layers 10
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))
    assert config.layers_per_ipu == [1, 2, 3, 4]

    args = """
        --config unit_test
        --layers-per-ipu 0 3 3 4
        --num-hidden-layers 10
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))
    assert config.layers_per_ipu == [0, 3, 3, 4]
def main(arg_list=None): run_args, remaining_args = parse_args(arg_list) remaining_args += ["--checkpoint-dir", run_args.checkpoint_dir] # If no config is supplied, try to load the config that should have been saved with the ckpts. if "--config" not in remaining_args: config_path = find_checkpoint_config(run_args.checkpoint_dir) remaining_args += ["--config", config_path] bert_args = utils.parse_bert_args(remaining_args) if not run_args.no_logger_setup: setup_logger(logging.getLevelName('INFO')) # Force variable weights in inference mode - otherwise we can't override the model weights for # validating each new checkpoint. bert_args.variable_weights_inference = True # Required to allow squeezed models to fit. bert_args.max_copy_merge_size = 32000 logger.info("Program Start") # `parse_bert_args` will suffix the user-supplied checkpoint path with the current date/time. # To avoid modifying core Bert code, we'll just remove the suffix (we don't need the created # config). shutil.rmtree(bert_args.checkpoint_dir) bert_args.checkpoint_dir = os.path.dirname(bert_args.checkpoint_dir) logger.info( f"Validating over checkpoints in directory {bert_args.checkpoint_dir}") return validate_checkpoints(run_args, utils.get_validation_args(bert_args))
def test_single_value_layers_per_ipu():
    args = """
        --config unit_test
        --layers-per-ipu 1
        --num-hidden-layers 4
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))
    assert config.layers_per_ipu == [1, 1, 1, 1]
def test_single_value_matmul_prop():
    # Matmul proportion applies to all IPUs, not just the encoder IPUs
    args = """
        --config unit_test
        --layers-per-ipu 1
        --num-hidden-layers 4
        --matmul-proportion 0.2
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))
    assert config.matmul_proportion == [0.2, 0.2, 0.2, 0.2]
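# The single-value tests above rely on the argument parser broadcasting a
# scalar to one entry per IPU, and the multi-value tests expect a SystemExit
# when the count doesn't match the number of IPUs. A minimal sketch of that
# expansion, assuming the IPU count is already known (illustrative only, not
# the parser's actual code):
def expand_per_ipu_option(values, num_ipus):
    # e.g. [0.2] with 4 IPUs -> [0.2, 0.2, 0.2, 0.2]
    if len(values) == 1:
        return values * num_ipus
    if len(values) != num_ipus:
        raise SystemExit("Expected a single value, or one value per IPU")
    return values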
def test_bert_regression(custom_ops, output_path, bert_config_file, synthetic,
                         extra_args, uid, synthetic_steps=100):
    """
    Run a pretraining pass of BERT up to the specified number of epochs.

    This test gathers a number of statistics and asserts that performance
    hasn't dropped substantially (with 10% leeway in some cases).

    `utils.run_py` only carries out a single step; we need to run multiple
    epochs to check accuracy, so this is based on the training example.
    """
    # Create the output path straight away so we don't waste time if it errors
    os.makedirs(output_path, exist_ok=True)

    args_string = [
        "--config", bert_config_file, "--no-validation", "--no-model-save"
    ]
    if synthetic:
        args_string.append("--synthetic")
        args_string += ["--epochs", str(synthetic_steps)]
        args_string += ["--aggregate-metrics-over-steps", str(synthetic_steps)]
    if extra_args is not None:
        args_string += extra_args

    args = parse_bert_args(args_string)
    session, iteration = main(args)

    # Graph report statistics
    graph_report = json.loads(session.getGraphReport())
    max_tile_memory = max(graph_report["memory"]["byTile"]["total"])
    total_memory = int(np.sum(graph_report["memory"]["byTile"]["total"]))

    baseline_result = get_test_baseline(uid)
    accuracies = get_accuracy_stats(args, iteration)

    # TODO: Add epochs_to_full back in.
    result = RegressionResult(args.input_files, baseline_result, accuracies,
                              total_memory, max_tile_memory,
                              iteration.throughput)
    result.write(output_path, uid)

    # These could be rolled into a single success check, but keeping them
    # separate highlights the reason for a failure on its own line.
    assert not result.status & ResultStatus.FAILED_ACCURACY
    assert not result.status & ResultStatus.FAILED_MEM_USAGE
    assert not result.status & ResultStatus.FAILED_TILE_MEM
    assert not result.status & ResultStatus.FAILED_THROUGHPUT
def test_wikipedia_dataset():
    args = "--config demo_tiny_128".split()
    config = transformers.BertConfig(**(vars(parse_bert_args(args))))
    config.vocab_size = 30522
    config.input_files = ["data/wikipedia/128/wiki_000.tfrecord"]

    num_tokens = 0
    replacement_counts = Counter({"103": 0, "same": 0, "random": 0})

    dataset = get_dataset(config)
    opts = get_options(config)
    loader = DataLoader(opts,
                        dataset,
                        batch_size=config.batch_size,
                        num_workers=config.dataloader_workers)

    for datum in tqdm(loader):
        tokens, attn_mask, types, mask_lm_pos, labels, nsp = datum
        tokens = tokens.numpy()
        attn_mask = attn_mask.numpy()
        types = types.numpy()
        mask_lm_pos = mask_lm_pos.numpy()
        labels = labels.numpy()
        nsp = nsp.numpy()

        for b in range(config.batch_size):
            check_dimensions(config, tokens[b], attn_mask[b], types[b],
                             mask_lm_pos[b], labels[b], nsp[b])
            check_tokens(config, tokens[b], mask_lm_pos[b], labels[b])
            check_attention_mask(attn_mask[b], tokens[b])
            check_mask_lm_positions(config, mask_lm_pos[b])
            check_labels(config, tokens[b], mask_lm_pos[b], labels[b])
            check_token_type(types[b])
            check_nsp(nsp[b])
            replacement_counts += mask_type_count(tokens[b], mask_lm_pos[b],
                                                  labels[b])
            # Number of tokens, not including padding
            num_tokens += attn_mask[b, attn_mask[b] == 1].shape[0]

    # Test masked token proportions
    total = sum(replacement_counts.values())
    for k in replacement_counts:
        replacement_counts[k] /= total

    assert 0.79 < replacement_counts["103"] < 0.81
    assert 0.09 < replacement_counts["same"] < 0.11
    assert 0.09 < replacement_counts["random"] < 0.11
    assert 0.14 < total / num_tokens < 0.16  # should be ~0.15
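# `mask_type_count` is not shown here; the proportions asserted above follow
# the standard BERT masking recipe (80% of selected tokens replaced by [MASK],
# token id 103; 10% kept unchanged; 10% replaced by a random token). A minimal
# sketch of such a counter, assuming a mask position of 0 marks an unused slot
# (illustrative only, not the project's actual helper):
from collections import Counter

def mask_type_count_sketch(tokens, mask_lm_positions, labels):
    counts = Counter({"103": 0, "same": 0, "random": 0})
    for pos, label in zip(mask_lm_positions, labels):
        if pos == 0:                  # assumed padding / unused mask slot
            continue
        if tokens[pos] == 103:        # replaced by [MASK]
            counts["103"] += 1
        elif tokens[pos] == label:    # left unchanged
            counts["same"] += 1
        else:                         # replaced by a random token
            counts["random"] += 1
    return counts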
def test_host_embedding(): args_string = [ "--config", 'configs/squad_base_inference.json', '--host-embedding=ALL', '--synthetic-data=true' ] args = utils.parse_bert_args(args_string) args.shuffle = False args.host_embedding = "ALL" host_embedding_outputs = np.array(run_embedding_layer(args), dtype=float) args.host_embedding = "NONE" ipu_embedding_outputs = np.array(run_embedding_layer(args), dtype=float) if np.allclose(host_embedding_outputs, ipu_embedding_outputs, rtol=0.3): logger.info("Passed") else: logger.info("Failed") raise TestFailureError("outputs do not match")
def test_host_embedding(custom_ops):
    args_string = [
        "--config",
        os.path.join(bert_root_dir(), 'configs/mk1/squad_base_128_inference.json'),
        '--host-embedding=ALL', '--device-connection-type=ondemand',
        '--generated-data=true'
    ]
    args = utils.parse_bert_args(args_string)
    args.shuffle = False

    args.host_embedding = "ALL"
    host_embedding_outputs = np.array(run_embedding_layer(args), dtype=float)

    args.host_embedding = "NONE"
    ipu_embedding_outputs = np.array(run_embedding_layer(args), dtype=float)

    if np.allclose(host_embedding_outputs, ipu_embedding_outputs, rtol=0.3):
        logger.info("Passed")
    else:
        logger.info("Failed")
        raise TestFailureError("outputs do not match")
def main(arg_list=None): run_args, remaining_args = parse_args(arg_list) remaining_args += ["--checkpoint-dir", run_args.checkpoint_dir] bert_args = utils.parse_bert_args(remaining_args) print(bert_args) if not run_args.no_logger_setup: setup_logger(logging.getLevelName('INFO')) logger.info("Program Start") # `parse_bert_args` will suffix the user-supplied checkpoint path with the current date/time. # To avoid modifying core Bert code, we'll just remove the suffix (we don't need the created # config). shutil.rmtree(bert_args.checkpoint_dir) bert_args.checkpoint_dir = os.path.dirname(bert_args.checkpoint_dir) logger.info( f"Fine-Tuning over checkpoints in directory {bert_args.checkpoint_dir}" ) finetune_checkpoints(run_args, bert_args)
def test_constant_lrschedule():
    """
    Test that the "constant" LR schedule results in an unchanging learning rate.
    """
    import warnings
    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)

    args = """
        --config unit_test
        --lr-schedule constant
    """.split()
    config = transformers.BertConfig(**(vars(parse_bert_args(args))))
    opts = get_options(config)

    # IPU model and optimizer
    model = PipelinedBertWithLoss(config).half().train()
    optimizer = get_optimizer(config, model)
    scheduler = get_lr_scheduler(optimizer, "constant")
    poptorch_model = poptorch.trainingModel(model, opts, optimizer=optimizer)

    def mock_data():
        return get_generated_datum(config)

    # Compile the model
    poptorch_model.compile(*mock_data())

    # Starting LR should equal the configured learning rate
    assert (poptorch_model._dict_optimizer["groups"][0]["learningRate"][0]
            == config.learning_rate)

    # Run for some steps
    for _ in range(5):
        outputs = poptorch_model(*mock_data())
        scheduler.step()
        poptorch_model.setOptimizer(optimizer)

    # LR should be unchanged
    assert (poptorch_model._dict_new_optimizer["groups"][0]["learningRate"][0]
            == config.learning_rate)
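# A constant schedule is equivalent to a LambdaLR whose multiplier is always
# 1.0. A minimal sketch of the idea, only as an illustration of the behaviour
# asserted above (an assumption, not necessarily how get_lr_scheduler builds it):
from torch.optim.lr_scheduler import LambdaLR

def constant_lr_scheduler_sketch(optimizer):
    # The base LR set on the optimizer is returned unchanged at every step
    return LambdaLR(optimizer, lr_lambda=lambda step: 1.0)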
def test_checkpoint_not_in_ir():
    import warnings
    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)

    # Config
    args = """
        --config unit_test
        --lr-schedule constant
        --layers-per-ipu 0 3
        --vocab-size 30400
        --weight-decay 0.0
        --recompute-checkpoint-every-layer False
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))
    assert config.recompute_checkpoint_every_layer is False

    # Execution parameters
    opts = get_options(config)
    model = PipelinedBertWithLoss(config).half().train()
    optimizer = get_optimizer(config, model)
    poptorch_model = poptorch.trainingModel(model, opts, optimizer=optimizer)

    # Compile model
    datum = get_generated_datum(config)
    poptorch_model.compile(*datum)

    ir = json.loads(poptorch_model._debugGetPopartIR())
    assert not any("Checkpoint" in node["name"] for node in ir["maingraph"]), (
        "PopART IR should not contain any recompute checkpoints")

    # Stash: 5 inputs, and 1 stash for the transformer layers on IPU 1
    exp_num_stash = 5 + 1
    assert sum("Stash" in node["type"] for node in ir["maingraph"]) == exp_num_stash, (
        "Both the graph inputs and the checkpoint(s) should be stashed")
    print(sum("Stash" in node["type"] for node in ir["maingraph"]))
        level=log_level,
        format='%(asctime)s %(name)s %(levelname)s %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')

    # Define a specific handler for this file that removes the root name.
    console = logging.StreamHandler()
    console.setLevel(log_level)
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s',
                                  '%Y-%m-%d %H:%M:%S')
    console.setFormatter(formatter)
    logger.addHandler(console)
    logger.propagate = False


if __name__ == "__main__":
    args = utils.parse_bert_args()
    setup_logger(logging.getLevelName(args.log_level))

    logger.info("Program Start")
    logger.info("Hostname: " + socket.gethostname())
    logger.info("Command Executed: " + str(sys.argv))

    # Run the main inference/training session by default
    if args.inference or not args.no_training:
        main(args)

    # If this was a training session and validation isn't disabled, validate.
    if not args.inference and not args.no_validation and not args.no_model_save:
        logger.info("Doing Validation")
        main(utils.get_validation_args(args))
def test_lrschedule_changes_lr():
    """
    Test that the PyTorch LR scheduler correctly changes the learning rate
    in poptorch.
    """
    import warnings
    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)

    # Get args and put them in the config
    args = """
        --config unit_test
        --lr-warmup 0.25
        --lr-schedule linear
    """.split()
    config = transformers.BertConfig(**(vars(parse_bert_args(args))))
    opts = get_options(config)

    # IPU model and optimizer
    model = PipelinedBertWithLoss(config).half().train()
    optimizer = get_optimizer(config, model)
    scheduler = get_lr_scheduler(optimizer, config.lr_schedule,
                                 config.lr_warmup, config.training_steps)
    poptorch_model = poptorch.trainingModel(model, opts, optimizer=optimizer)

    def mock_data():
        return get_generated_datum(config)

    # Compile the model
    poptorch_model.compile(*mock_data())

    # Starting LR should be 0.0
    assert poptorch_model._dict_optimizer["groups"][0]["learningRate"][0] == 0.0

    # Run for warmup+1 steps to reach the peak
    warmup_steps = int(config.lr_warmup * config.training_steps)
    for _ in range(warmup_steps + 1):
        outputs = poptorch_model(*mock_data())
        scheduler.step()
        poptorch_model.setOptimizer(optimizer)

    # After warmup+1 steps the LR should equal the configured peak learning rate
    assert (poptorch_model._dict_optimizer["groups"][0]["learningRate"][0]
            == config.learning_rate)

    # Run the remaining steps
    for _ in range(warmup_steps + 1, config.training_steps):
        outputs = poptorch_model(*mock_data())
        scheduler.step()
        poptorch_model.setOptimizer(optimizer)

    # LR should have decreased from the peak but not yet reached zero
    assert (poptorch_model._dict_optimizer["groups"][0]["learningRate"][0]
            < config.learning_rate)
    assert poptorch_model._dict_optimizer["groups"][0]["learningRate"][0] > 0.0

    # Running beyond the schedule sets lr=0.0
    for _ in range(config.training_steps, config.training_steps + 1):
        outputs = poptorch_model(*mock_data())
        scheduler.step()
        poptorch_model.setOptimizer(optimizer)
    assert poptorch_model._dict_optimizer["groups"][0]["learningRate"][0] == 0.0
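# The behaviour asserted above matches a linear warmup followed by a linear
# decay to zero (as in transformers' get_linear_schedule_with_warmup). A
# minimal sketch of the per-step LR multiplier, for illustration only (an
# assumption about the shape of the schedule, not get_lr_scheduler itself):
def linear_lr_multiplier(step, warmup_steps, training_steps):
    if step < warmup_steps:
        # Ramp from 0.0 up to 1.0 over the warmup period
        return step / max(1, warmup_steps)
    # Decay linearly from 1.0 at the end of warmup down to 0.0 at training_steps
    remaining = training_steps - step
    return max(0.0, remaining / max(1, training_steps - warmup_steps))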
from poptorch.enums import DataLoaderMode
from bert_data import get_dataset, get_generated_datum
from bert_model import PipelinedBertWithLoss
from bert_ipu import get_options
from bert_optimization import get_lr_scheduler, get_optimizer
from bert_checkpoint import save_checkpoint, restore_checkpoint, checkpoints_exist
from utils import parse_bert_args, cycle, logger


if __name__ == "__main__":
    # Ignore known warnings
    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)
    logging.getLogger("poptorch::python").setLevel(logging.ERROR)

    # Build config from args
    config = transformers.BertConfig(**(vars(parse_bert_args())))

    # Checkpoints should be saved to a directory with no existing checkpoints
    if config.checkpoint_dir and checkpoints_exist(config):
        raise RuntimeError(
            "Found previously saved checkpoint(s) at checkpoint-dir. "
            "Overwriting checkpoints is not supported. "
            "Please specify a different checkpoint-dir to "
            "save checkpoints from this run.")

    # Restore from checkpoint if necessary
    checkpoint = restore_checkpoint(config) if config.checkpoint_file else None

    # Execution parameters
    opts = get_options(config)

    # W&B
def test_ipu_cpu_match(recompute_checkpoint, embedding_serialization):
    """
    Test that the BERT model run on the IPU approximately matches the same
    model run on the CPU.
    """
    import warnings
    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)

    # Config
    args = """
        --config unit_test
        --lr-schedule constant
        --layers-per-ipu 0 3
        --vocab-size 30400
        --batch-size 10
        --batches-per-step 1
        --gradient-accumulation 10
        --enable-half-partials False
        --optimizer AdamW
        --learning-rate 0.001
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))
    config.hidden_dropout_prob = 0.0
    config.attention_probs_dropout_prob = 0.0
    config.recompute_checkpoint_every_layer = recompute_checkpoint
    config.embedding_serialization = embedding_serialization

    # Models and options
    opts = get_options(config)
    opts.anchorMode(poptorch.AnchorMode.Final)
    model_cpu = PipelinedBertWithLoss(config).train()
    model_ipu = PipelinedBertWithLoss(config).train()
    model_ipu.load_state_dict(model_cpu.state_dict())

    # Check that the copy was successful
    assert model_ipu is not model_cpu
    assert all((a == b).all() for a, b in zip(model_cpu.parameters(),
                                              model_ipu.parameters()))

    optimizer_cpu = torch.optim.AdamW(model_cpu.parameters(), lr=0.001)
    optimizer_ipu = poptorch.optim.AdamW(model_ipu.parameters(), lr=0.001,
                                         loss_scaling=1.0)
    poptorch_model = poptorch.trainingModel(model_ipu, opts,
                                            optimizer=optimizer_ipu)

    # Input
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    inputs = tokenizer(
        "Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute yo"
        "Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute yo"
        "Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute yo"
        "Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute",
        return_tensors="pt")
    inputs['labels'] = torch.randint(0, config.vocab_size,
                                     [1, config.mask_tokens], dtype=torch.long)
    inputs['next_sentence_label'] = torch.randint(0, 1, [1], dtype=torch.long)
    inputs['masked_lm_positions'] = torch.randint(0, config.sequence_length,
                                                  [1, config.mask_tokens],
                                                  dtype=torch.long)

    batch_size = config.batch_size
    batch = (inputs['input_ids'].repeat(batch_size, 1),
             inputs['attention_mask'].repeat(batch_size, 1),
             inputs['token_type_ids'].repeat(batch_size, 1),
             inputs['masked_lm_positions'].repeat(batch_size, 1),
             inputs['labels'].repeat(batch_size, 1),
             inputs['next_sentence_label'].repeat(batch_size, 1))

    batch_cpu = (inputs['input_ids'].repeat(1, 1),
                 inputs['attention_mask'].repeat(1, 1),
                 inputs['token_type_ids'].repeat(1, 1),
                 inputs['masked_lm_positions'].repeat(1, 1),
                 inputs['labels'].repeat(1, 1),
                 inputs['next_sentence_label'].repeat(1, 1))

    # Training loop
    for step in range(10):
        # Step the CPU model, accumulating gradients over batch_size
        # micro-batches to mirror the IPU's gradient accumulation
        optimizer_cpu.zero_grad()
        for b in range(batch_size):
            cpu_output = model_cpu(*batch_cpu)
            cpu_loss = cpu_output[0]
            cpu_loss.div(batch_size).backward()
        optimizer_cpu.step()

        # Step the IPU model
        ipu_output = poptorch_model(*batch)
        ipu_loss = ipu_output[0]

        with torch.no_grad():
            print(f"CPU Loss: {cpu_loss}, IPU Loss: {ipu_loss}")

        # Check the losses are approximately equal
        assert np.allclose(cpu_loss.numpy(), ipu_loss.numpy(), atol=1e-6)