def testToyBERTModelGradientAccumulationLegacyExperimental(gradient_accumulation_steps):
    """Compare per-step training losses of the experimental and legacy
    ORTTrainer APIs on the toy BERT model with gradient accumulation enabled.

    Args:
        gradient_accumulation_steps: number of micro-batches accumulated
            before each optimizer update (parametrized by the test runner).
    """
    # Common setup
    total_steps = 10
    device = "cuda"
    seed = 1

    # EXPERIMENTAL IMPLEMENTATION
    # Seed both torch and onnxruntime so both runs are deterministic.
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)
    model_desc = bert_model_description()
    model = load_bert_onnx_model()
    optim_config = optim.LambConfig()
    opts = orttrainer.ORTTrainerOptions({
        'debug': {
            'deterministic_compute': True
        },
        'device': {
            'id': device,
        },
        'batch': {
            'gradient_accumulation_steps': gradient_accumulation_steps
        },
    })
    trainer = orttrainer.ORTTrainer(model, model_desc, optim_config, options=opts)
    experimental_losses = []
    for i in range(total_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        loss = trainer.train_step(*sample_input)
        experimental_losses.append(loss.cpu().item())

    # LEGACY IMPLEMENTATION
    # Re-seed so the legacy run sees the same random input sequence.
    # NOTE(review): unlike sibling tests, this reuses the same loaded ONNX
    # model object for the legacy trainer instead of reloading it — confirm
    # the experimental ORTTrainer does not mutate the model in place.
    device = torch.device(device)
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)
    legacy_model_desc, learning_rate_description, learning_rate = legacy_model_params(optim_config.lr)
    legacy_trainer = Legacy_ORTTrainer(model, None, legacy_model_desc, "LambOptimizer",
                                       None, learning_rate_description, device,
                                       _use_deterministic_compute=True,
                                       gradient_accumulation_steps=gradient_accumulation_steps)
    legacy_losses = []
    for i in range(total_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        leg_loss = legacy_trainer.train_step(*sample_input, learning_rate)
        legacy_losses.append(leg_loss.cpu().item())

    # Check results
    _test_helpers.assert_model_outputs(experimental_losses, legacy_losses, rtol=1e-6)
def testWrapModelLossFnStateDict(self):
    """Verify that a PyTorch model wrapped with an external loss function
    exposes only the model's own parameters through state_dict()."""
    torch.manual_seed(1)
    device = torch.device("cuda")

    class LinearModel(torch.nn.Module):
        # Minimal model: a single 2->4 linear layer with an optional additive input.
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(2, 4)

        def forward(self, y=None, x=None):
            # `y`, when given, is added to the linear output; otherwise a
            # constant ones tensor is added.
            if y is not None:
                return self.linear(x) + y
            else:
                return self.linear(x) + torch.ones(2, 4)

    pt_model = LinearModel()
    data = torch.randn(2, 2)
    label = torch.tensor([0, 1], dtype=torch.int64)
    # I/O descriptions for the ORTTrainer graph (input, label, output, loss).
    input_desc = IODescription('x', [2, 2], torch.float32)
    label_desc = IODescription('label', [2, ], torch.int64, num_classes=4)
    output_desc = IODescription('output', [2, 4], torch.float32)
    loss_desc = IODescription('loss', [], torch.float32)
    model_desc = ModelDescription([input_desc, label_desc], [loss_desc, output_desc])

    def loss_fn(x, label):
        # Standard NLL loss over log-softmax scores.
        return F.nll_loss(F.log_softmax(x, dim=1), label)

    def get_lr_this_step(global_step):
        # Constant learning rate for every step.
        learningRate = 0.02
        return torch.tensor([learningRate])

    ort_trainer = ORTTrainer(
        pt_model, loss_fn, model_desc, "SGDOptimizer", None,
        IODescription('Learning_Rate', [1, ], torch.float32), device,
        get_lr_this_step=get_lr_this_step)
    # One train step builds the training graph before state_dict is queried.
    ort_trainer.train_step(x=data, label=label)
    state_dict = ort_trainer.state_dict()
    assert state_dict.keys() == {'linear.bias', 'linear.weight'}
def get_onnx_model(self, model, model_desc, inputs, device,
                   _enable_internal_postprocess=True, _extra_postprocess=None):
    """Wrap `model` in an ORTTrainer, run one training step, and return the
    ONNX graph that the trainer materialized for it."""
    learning_rate_desc = IODescription('Learning_Rate', [1, ], torch.float32)
    trainer = ORTTrainer(model, None, model_desc, "LambOptimizer",
                         map_optimizer_attributes, learning_rate_desc, device,
                         world_rank=0, world_size=1, _opset_version=12,
                         _enable_internal_postprocess=_enable_internal_postprocess,
                         _extra_postprocess=_extra_postprocess)
    # The trainer builds its ONNX model lazily; one train step forces it.
    trainer.train_step(*inputs)
    return trainer.onnx_model_
def testToyBERTModelMixedPrecisionLossScalerLegacyExperimental(loss_scaler, legacy_loss_scaler):
    """Compare experimental vs legacy ORTTrainer losses on toy BERT with
    mixed precision and a loss scaler.

    Args:
        loss_scaler: loss scaler instance for the experimental API.
        legacy_loss_scaler: equivalent loss scaler for the legacy API.
    """
    # Common setup
    total_steps = 128
    device = "cuda"
    seed = 1

    # EXPERIMENTAL IMPLEMENTATION
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)
    model_desc = bert_model_description()
    model = load_bert_onnx_model()
    optim_config = optim.AdamConfig(lr=0.001)
    opts = orttrainer.ORTTrainerOptions({
        'debug': {
            'deterministic_compute': True
        },
        'device': {
            'id': device,
        },
        'mixed_precision': {
            'enabled': True,
            'loss_scaler': loss_scaler
        }
    })
    trainer = orttrainer.ORTTrainer(model, model_desc, optim_config, options=opts)
    experimental_losses = []
    for i in range(total_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        experimental_losses.append(trainer.train_step(*sample_input).cpu().item())

    # LEGACY IMPLEMENTATION
    # Re-seed and reload the model so the legacy run starts from the same state.
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)
    device = torch.device(device)
    model = load_bert_onnx_model()
    legacy_model_desc, learning_rate_description, learning_rate = legacy_model_params(optim_config.lr)
    legacy_trainer = Legacy_ORTTrainer(model, None, legacy_model_desc, "AdamOptimizer",
                                       None, learning_rate_description, device,
                                       _use_deterministic_compute=True,
                                       use_mixed_precision=True,
                                       loss_scaler=legacy_loss_scaler)
    legacy_losses = []
    for i in range(total_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        leg_loss = legacy_trainer.train_step(*sample_input, learning_rate)
        legacy_losses.append(leg_loss.cpu().item())

    # Check results
    _test_helpers.assert_model_outputs(experimental_losses, legacy_losses)
def testToyBERTModelLegacyExperimentalBasicTraining(optimizer_config):
    """Compare experimental vs legacy ORTTrainer basic training losses on
    toy BERT across the supported optimizer configurations.

    Args:
        optimizer_config: one of optim.AdamConfig / optim.LambConfig /
            optim.SGDConfig (class, not instance).
    """
    # Common setup
    train_steps = 512
    device = 'cuda'
    seed = 1
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)

    # EXPERIMENTAL API
    model_desc = bert_model_description()
    model = load_bert_onnx_model()
    opts = orttrainer.ORTTrainerOptions({
        'debug' : {
            'deterministic_compute': True
        },
        'device': {
            'id': device,
        },
    })
    optim_config = optimizer_config(lr=0.01)
    trainer = orttrainer.ORTTrainer(model, model_desc, optim_config, options=opts)
    experimental_losses = []
    for i in range(train_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        experimental_losses.append(trainer.train_step(*sample_input).cpu().item())

    # LEGACY IMPLEMENTATION
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)
    # Map the experimental optimizer config class to the legacy string name.
    if optimizer_config == optim.AdamConfig:
        legacy_optimizer = 'AdamOptimizer'
    elif optimizer_config == optim.LambConfig:
        legacy_optimizer = 'LambOptimizer'
    elif optimizer_config == optim.SGDConfig:
        legacy_optimizer = 'SGDOptimizer'
    else:
        raise RuntimeError("Invalid optimizer_config")
    device = torch.device(device)
    model = load_bert_onnx_model()
    legacy_model_desc, learning_rate_description, learning_rate = legacy_model_params(lr=optim_config.lr)
    legacy_trainer = Legacy_ORTTrainer(model, None, legacy_model_desc, legacy_optimizer, None,
                                       learning_rate_description, device,
                                       _use_deterministic_compute=True)
    legacy_losses = []
    for i in range(train_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        leg_loss = legacy_trainer.train_step(*sample_input, learning_rate)
        legacy_losses.append(leg_loss.cpu().item())

    # Check results
    _test_helpers.assert_model_outputs(experimental_losses, legacy_losses, True)
def testToyBERTModelLegacyExperimentalCustomOptimParameters(params, legacy_optim_map):
    """Compare experimental vs legacy ORTTrainer losses when per-parameter
    optimizer attributes are customized.

    Args:
        params: experimental-API parameter groups passed to optim.AdamConfig.
        legacy_optim_map: legacy-API callable mapping parameter names to
            optimizer attribute dicts (mirrors `params`).
    """
    # Common setup
    total_steps = 128
    device = "cuda"
    seed = 1

    # EXPERIMENTAL API
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)
    model_desc = bert_model_description()
    model = load_bert_onnx_model()
    optim_config = optim.AdamConfig(
        params, alpha=0.9, beta=0.999, lambda_coef=0.01, epsilon=1e-6, do_bias_correction=False
    )
    opts = orttrainer.ORTTrainerOptions(
        {
            "debug": {"deterministic_compute": True},
            "device": {
                "id": device,
            },
        }
    )
    trainer = orttrainer.ORTTrainer(model, model_desc, optim_config, options=opts)
    experimental_losses = []
    for i in range(total_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        experimental_losses.append(trainer.train_step(*sample_input).cpu().item())

    # LEGACY IMPLEMENTATION
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)
    device = torch.device(device)
    model = load_bert_onnx_model()
    legacy_model_desc, learning_rate_description, learning_rate = legacy_model_params(trainer.optim_config.lr)
    legacy_trainer = Legacy_ORTTrainer(
        model,
        None,
        legacy_model_desc,
        "AdamOptimizer",
        legacy_optim_map,
        learning_rate_description,
        device,
        _use_deterministic_compute=True,
    )
    legacy_losses = []
    for i in range(total_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        # NOTE(review): the legacy train_step here receives one *list* of
        # inputs (with learning rate appended) rather than unpacked args,
        # unlike sibling tests — presumably the legacy API accepts both forms;
        # verify against the legacy ORTTrainer signature.
        legacy_sample_input = [*sample_input, learning_rate]
        legacy_losses.append(legacy_trainer.train_step(legacy_sample_input).cpu().item())

    # Check results
    _test_helpers.assert_model_outputs(experimental_losses, legacy_losses)
def _training_step(self, model: ORTTrainer, inputs: Dict[str, torch.Tensor]) -> float:
    """Run a single training step on `model` and return the scalar loss.

    Note: mutates `inputs` in place by moving each tensor to the configured
    device before the forward call.
    """
    model.train()
    for k, v in inputs.items():
        inputs[k] = v.to(self.args.device)

    outputs = model(**inputs)
    loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

    return loss.item()
def testToyBERTModelLegacyExperimentalBasicTraining():
    """Compare experimental vs legacy ORTTrainer losses for a short Lamb
    training run on toy BERT."""
    # Common setup
    train_steps = 10
    device = 'cuda'
    seed = 1
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)

    # EXPERIMENTAL API
    model_desc = bert_model_description()
    model = load_bert_onnx_model()
    # NOTE(review): `params` is computed but never used below — looks like
    # leftover code; confirm optimizer_parameters() has no needed side effect.
    params = optimizer_parameters(model)
    optim_config = optim.LambConfig()
    opts = orttrainer.ORTTrainerOptions({
        'debug': {
            'deterministic_compute': True
        },
        'device': {
            'id': device,
        },
    })
    trainer = orttrainer.ORTTrainer(model, model_desc, optim_config, options=opts)
    experimental_losses = []
    for i in range(train_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        experimental_losses.append(trainer.train_step(*sample_input).cpu().item())

    # LEGACY IMPLEMENTATION
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)
    device = torch.device(device)
    legacy_model_desc, learning_rate_description, learning_rate = legacy_model_params(lr=0.001)
    legacy_trainer = Legacy_ORTTrainer(model, None, legacy_model_desc, "LambOptimizer",
                                       None, learning_rate_description, device)
    legacy_losses = []
    for i in range(train_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        leg_loss = legacy_trainer.train_step(*sample_input, learning_rate)
        legacy_losses.append(leg_loss.cpu().item())

    # Check results
    _test_helpers.assert_model_outputs(experimental_losses, legacy_losses, True, rtol=1e-5)
def testORTTrainerLegacyAndExperimentalPrecisionLossScaler(seed, device):
    """Compare experimental vs legacy ORTTrainer on a transformer model with
    mixed precision + dynamic loss scaling: losses, prediction dtypes, and
    final ONNX weights must match."""
    # Common data
    total_steps = 5
    bptt = 35

    # Setup experimental API
    torch.manual_seed(seed)
    set_seed(seed)
    loss_scaler = amp.DynamicLossScaler()
    options = orttrainer.ORTTrainerOptions({'device' : {'id' : device},
                                            'mixed_precision' : {
                                                'enabled' : True,
                                                'loss_scaler' : loss_scaler},
                                            'debug' : {'deterministic_compute' : True,}})
    model, model_desc, my_loss, batcher_fn, train_data, val_data, _ = _load_pytorch_transformer_model(device)
    optim_config = optim.LambConfig(lr=0.001)
    trainer = orttrainer.ORTTrainer(model, model_desc, optim_config, loss_fn=my_loss, options=options)
    # Training loop
    experimental_loss = []
    experimental_preds_dtype = []
    for i in range(total_steps):
        data, targets = batcher_fn(train_data, i)
        exp_loss, exp_preds = trainer.train_step(data, targets)
        experimental_loss.append(exp_loss.cpu())
        experimental_preds_dtype.append(exp_preds.dtype)

    # Setup legacy API
    torch.manual_seed(seed)
    set_seed(seed)
    model, (model_desc, lr_desc), _, _, _, _, _ = _load_pytorch_transformer_model(device, legacy_api=True)
    loss_scaler = Legacy_LossScaler('ort_test_input_loss_scalar', True)
    legacy_trainer = Legacy_ORTTrainer(model, my_loss, model_desc, "LambOptimizer", None,
                                       lr_desc, device=device, _use_deterministic_compute=True,
                                       use_mixed_precision=True, loss_scaler=loss_scaler)
    # Training loop
    legacy_loss = []
    legacy_preds_dtype = []
    for i in range(total_steps):
        data, targets = batcher_fn(train_data, i)
        leg_loss, leg_preds = legacy_trainer.train_step(data, targets, torch.tensor([optim_config.lr]))
        legacy_loss.append(leg_loss.cpu())
        legacy_preds_dtype.append(leg_preds.dtype)

    # Compare legacy vs experimental APIs
    assert experimental_preds_dtype == legacy_preds_dtype
    _test_helpers.assert_legacy_onnx_weights(trainer, legacy_trainer, rtol=1e-4, atol=1e-2)
    _test_helpers.assert_model_outputs(legacy_loss, experimental_loss, rtol=1e-4)
def create_ort_trainer(args, device, model):
    """Wrap `model` in a legacy ORTTrainer with a Lamb optimizer; also caps
    the CUDA memory arena and, for fp16, attaches a LossScaler to `args`."""
    # set GPU memory limitation to 1 GB
    from onnxruntime.capi._pybind_state import set_cuda_mem_limit
    ort_cuda_mem_limit_in_gbs = 1
    set_cuda_mem_limit(int(ort_cuda_mem_limit_in_gbs * 1024 * 1024 * 1024))

    # BertLAMB default initial settings: b1=0.9, b2=0.999, e=1e-6
    def map_optimizer_attributes(name):
        # Parameters matching any of these name fragments get no weight decay.
        no_decay_keys = ["bias", "gamma", "beta", "LayerNorm"]
        no_decay = False
        for no_decay_key in no_decay_keys:
            if no_decay_key in name:
                no_decay = True
                break
        if no_decay:
            return {"alpha": 0.9, "beta": 0.999, "lambda": 0.0, "epsilon": 1e-6}
        else:
            return {"alpha": 0.9, "beta": 0.999, "lambda": 0.01, "epsilon": 1e-6}

    # we request ORTTrainer to create a LambOptimizer with given optimizer_attributes.
    # train_step does forward, backward, and optimize step.
    model = ORTTrainer(model, None, bert_model_description(args), "LambOptimizer",
                       map_optimizer_attributes,
                       IODescription('Learning_Rate', [1,], torch.float32),
                       device, _opset_version = 10)

    # For fp16 runs, stash the loss scaler on args so the caller can use it.
    if args.fp16:
        setattr(args, 'ort_loss_scale', LossScaler(model.loss_scale_input_name, True, up_scale_window=2000))

    return model
def get_trainer(
    self,
    model,
    model_desc,
    device,
    onnx_opset_ver=12,
    frozen_weights=None,
    internal_loss_fn=False,
    get_lr_this_step=None,
    optimizer="SGDOptimizer",
):
    """Build an ORTTrainer for the MNIST model.

    Args:
        model: the PyTorch model to wrap.
        model_desc: ModelDescription of the model's inputs/outputs.
        device: target device for training.
        onnx_opset_ver: ONNX opset version for export (default 12).
        frozen_weights: optional list of weight names to exclude from
            training; defaults to no frozen weights.
        internal_loss_fn: when True the model computes its own loss and no
            external loss function is passed.
        get_lr_this_step: optional callable mapping global step -> LR tensor.
        optimizer: legacy optimizer name string (default "SGDOptimizer").

    Returns:
        A configured ORTTrainer instance.
    """
    # Fix: the original used a mutable default argument (frozen_weights=[]),
    # which is shared across calls and can leak state between tests.
    if frozen_weights is None:
        frozen_weights = []
    loss_fn = MNISTWrapper.my_loss if not internal_loss_fn else None
    return ORTTrainer(
        model,
        loss_fn,
        model_desc,
        optimizer,
        None,
        IODescription(
            "Learning_Rate",
            [
                1,
            ],
            torch.float32,
        ),
        device,
        _opset_version=onnx_opset_ver,
        frozen_weights=frozen_weights,
        get_lr_this_step=get_lr_this_step,
    )
def create_ort_trainer(gradient_accumulation_steps,
                       use_mixed_precision,
                       allreduce_post_accumulation,
                       use_simple_model_desc=True,
                       loss_scaler=None,
                       partition_optimizer=False):
    """Build a legacy ORTTrainer around the post-processed toy BERT ONNX model.

    Returns:
        (trainer, model_desc, device) — note model_desc is the full
        description, even when the trainer was built from the simplified one.
    """
    model_desc = bert_model_description()
    # Optionally strip extra metadata from the description before training.
    simple_model_desc = remove_extra_info(model_desc) if use_simple_model_desc else model_desc
    learning_rate_description = ort_trainer_learning_rate_description()
    device = torch.device("cuda", 0)

    onnx_model = onnx.load(get_name("bert_toy_postprocessed.onnx"))

    model = ORTTrainer(onnx_model, None, simple_model_desc, "LambOptimizer",
                       map_optimizer_attributes,
                       learning_rate_description,
                       device,
                       postprocess_model=None,
                       gradient_accumulation_steps=gradient_accumulation_steps,
                       world_rank=0, world_size=1,
                       loss_scaler=loss_scaler,
                       use_mixed_precision=use_mixed_precision,
                       allreduce_post_accumulation=allreduce_post_accumulation,
                       partition_optimizer = partition_optimizer)

    return model, model_desc, device
def create_ort_trainer(args, device, model):
    """Wrap `model` in a legacy ORTTrainer (Lamb) configured for distributed
    BERT pretraining; caps the per-card CUDA memory arena and, for fp16,
    attaches a LossScaler to `args`."""
    # set GPU memory limitation (per card!)
    from onnxruntime.capi._pybind_state import set_cuda_mem_limit
    ort_cuda_mem_limit_in_gbs = args.gpu_memory_limit_gb
    set_cuda_mem_limit(int(ort_cuda_mem_limit_in_gbs * 1024 * 1024 * 1024))

    # BertLAMB default initial settings: b1=0.9, b2=0.999, e=1e-6
    def map_optimizer_attributes(name):
        # Parameters matching any of these name fragments get no weight decay.
        no_decay_keys = ["bias", "gamma", "beta", "LayerNorm"]
        no_decay = False
        for no_decay_key in no_decay_keys:
            if no_decay_key in name:
                no_decay = True
                break
        if no_decay:
            return {
                "alpha": 0.9,
                "beta": 0.999,
                "lambda": 0.0,
                "epsilon": 1e-6
            }
        else:
            return {
                "alpha": 0.9,
                "beta": 0.999,
                "lambda": 0.01,
                "epsilon": 1e-6
            }

    # we request ORTTrainer to create a LambOptimizer with given optimizer_attributes.
    # train_step does forward, backward, and optimize step.
    model = ORTTrainer(
        model,
        None,
        bert_model_description(args),
        "LambOptimizer",
        map_optimizer_attributes,
        IODescription('Learning_Rate', [
            1,
        ], torch.float32),
        device,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        world_rank=args.world_rank,
        world_size=args.world_size,
        use_mixed_precision=True if args.fp16 else False,
        allreduce_post_accumulation=True if args.allreduce_post_accumulation else False,
        deepspeed_zero_stage=1 if args.deepspeed_zero_stage else 0,
        _opset_version=12)

    # For fp16 runs, stash the loss scaler on args so the caller can use it.
    if args.fp16:
        setattr(
            args, 'ort_loss_scale',
            LossScaler(model.loss_scale_input_name, True, up_scale_window=2000))

    return model
def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask,
                                sequence_labels, token_labels, choice_labels):
    """Wrap a BertModel in ORTTrainer and check the shapes of the outputs
    produced by inference-style calls through the trainer."""
    model = BertModel(config=config)
    model.to(input_ids.device)
    model.eval()

    sequence_output, pooled_output = model(input_ids, attention_mask=input_mask,
                                           token_type_ids=token_type_ids)

    # failed because there is not loss output
    model_desc = ModelDescription([
        self.input_ids_desc, self.attention_mask_desc, self.token_type_ids_desc
    ], [self.last_hidden_state_desc, self.pooler_output_desc])
    # Local stand-ins for command-line args used by the trainer below.
    args_gradient_accumulation_steps = 8
    args_local_rank = 0
    args_world_size = 1
    args_fp16 = True
    args_allreduce_post_accumulation = True

    model = ORTTrainer(
        model,
        None,
        model_desc,
        "LambOptimizer",
        map_optimizer_attributes=map_optimizer_attributes,
        learning_rate_description=IODescription('Learning_Rate', [
            1,
        ], torch.float32),
        device=self.device,
        postprocess_model=postprocess_model,
        gradient_accumulation_steps=args_gradient_accumulation_steps,
        world_rank=args_local_rank,
        world_size=args_world_size,
        use_mixed_precision=True if args_fp16 else False,
        allreduce_post_accumulation=True if args_allreduce_post_accumulation else False)

    # Call through the trainer with progressively fewer inputs; optional
    # inputs should be filled in by the graph defaults.
    sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
    sequence_output, pooled_output = model(input_ids)

    result = {
        "sequence_output": sequence_output,
        "pooled_output": pooled_output,
    }
    self.parent.assertListEqual(list(result["sequence_output"].size()),
                                [self.batch_size, self.seq_length, self.hidden_size])
    self.parent.assertListEqual(list(result["pooled_output"].size()),
                                [self.batch_size, self.hidden_size])
def get_trainer(self, model, model_desc, device, onnx_opset_ver=12):
    """Construct an ORTTrainer for the MNIST model using the wrapper's loss
    function and an SGD optimizer."""
    lr_desc = IODescription('Learning_Rate', [1, ], torch.float32)
    trainer = ORTTrainer(model, MNISTWrapper.my_loss, model_desc,
                         "SGDOptimizer", None, lr_desc, device,
                         _opset_version=onnx_opset_ver)
    return trainer
def to_ort_model(self, model, config, args):
    """Convert a GPT-2 PyTorch model into a legacy ORTTrainer (Adam) set up
    for distributed mixed-precision training."""
    model_desc = self.gpt2_model_description(config.n_head, config.vocab_size,
                                             config.n_embd, config.n_layer,
                                             config.n_ctx,
                                             args.per_gpu_train_batch_size)
    learning_rate_description = self.ort_trainer_learning_rate_description()

    def map_optimizer_attributes(name):
        # Parameters matching any of these name fragments get no weight decay.
        no_decay_keys = ["bias", "gamma", "beta", "LayerNorm"]
        no_decay = False
        for no_decay_key in no_decay_keys:
            if no_decay_key in name:
                no_decay = True
                break
        if no_decay:
            return {
                "alpha": 0.9,
                "beta": 0.999,
                "lambda": 0.0,
                "epsilon": args.adam_epsilon
            }
        else:
            return {
                "alpha": 0.9,
                "beta": 0.999,
                "lambda": args.weight_decay,
                "epsilon": args.adam_epsilon
            }

    # Configure ORT's CUDA arena and bind this process to its local GPU.
    from onnxruntime.capi._pybind_state import set_cuda_device_id, set_arena_extend_strategy, ArenaExtendStrategy
    set_arena_extend_strategy(ArenaExtendStrategy.kSameAsRequested)
    set_cuda_device_id(self.args.local_rank)
    model = ORTTrainer(
        model,
        None,
        model_desc,
        "AdamOptimizer",
        map_optimizer_attributes,
        learning_rate_description,
        args.device,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        world_rank=self.args.world_rank,
        world_size=self.args.world_size,
        use_mixed_precision=self.args.fp16,
        allreduce_post_accumulation=True,
        _opset_version=12)
    logger.info("****************************Model converted to ORT")
    return model
def testORTTrainerLegacyAndExperimentalWeightsCheck(seed, device):
    """Train the transformer model with both the experimental and legacy
    ORTTrainer APIs and verify the resulting ONNX weights match."""
    # Common data
    total_steps = 5
    bptt = 35

    # Setup for the experimental ORTTRainer run
    torch.manual_seed(seed)
    set_seed(seed)
    optim_config = optim.LambConfig()
    opts = orttrainer.ORTTrainerOptions({
        'device' : {
            'id' : device
        },
        'debug' : {
            'deterministic_compute': True
        },
    })
    model, model_desc, my_loss, batcher_fn, train_data, val_data, _ = _load_pytorch_transformer_model(device)
    trainer = orttrainer.ORTTrainer(model, model_desc, optim_config, loss_fn=my_loss, options=opts)
    # Training loop
    for i in range(total_steps):
        data, targets = batcher_fn(train_data, i)
        _ = trainer.train_step(data, targets)

    # Setup for the legacy ORTTrainer run
    torch.manual_seed(seed)
    set_seed(seed)
    model, (model_desc, lr_desc), _, _, _, _, _ = _load_pytorch_transformer_model(device, legacy_api=True)
    legacy_trainer = Legacy_ORTTrainer(model, my_loss, model_desc, "LambOptimizer",
                                       None, lr_desc, device,
                                       _use_deterministic_compute=True)
    # Training loop
    for i in range(total_steps):
        data, targets = batcher_fn(train_data, i)
        _, _ = legacy_trainer.train_step(data, targets, torch.tensor([optim_config.lr]))

    # Compare legacy vs experimental APIs
    _test_helpers.assert_legacy_onnx_weights(trainer, legacy_trainer, rtol=1e-4)
def testORTTrainerLegacyAndExperimentalGradientAccumulation(seed, device, gradient_accumulation_steps, total_steps):
    """Compare experimental vs legacy ORTTrainer losses on the transformer
    model when gradient accumulation is enabled."""
    # Common data
    torch.set_printoptions(precision=10)

    # Setup experimental API
    torch.manual_seed(seed)
    set_seed(seed)
    options = orttrainer.ORTTrainerOptions({'device' : {'id' : device},
                                            'batch' : {'gradient_accumulation_steps' : gradient_accumulation_steps},
                                            'debug' : {'deterministic_compute' : True}})
    model, model_desc, my_loss, batcher_fn, train_data, val_data, _ = _load_pytorch_transformer_model(device)
    optim_config = optim.LambConfig(lr=0.001)
    trainer = orttrainer.ORTTrainer(model, model_desc, optim_config, loss_fn=my_loss, options=options)
    # Training loop
    experimental_loss = []
    for i in range(total_steps):
        data, targets = batcher_fn(train_data, i)
        exp_loss, exp_preds = trainer.train_step(data, targets)
        experimental_loss.append(exp_loss.cpu())

    # Setup legacy API (re-seed so both runs consume identical batches)
    torch.manual_seed(seed)
    set_seed(seed)
    model, (model_desc, lr_desc), _, _, _, _, _ = _load_pytorch_transformer_model(device, legacy_api=True)
    legacy_trainer = Legacy_ORTTrainer(model, my_loss, model_desc, "LambOptimizer",
                                       None, lr_desc, device=device,
                                       _use_deterministic_compute=True,
                                       gradient_accumulation_steps=gradient_accumulation_steps)
    # Training loop
    legacy_loss = []
    for i in range(total_steps):
        data, targets = batcher_fn(train_data, i)
        leg_loss, leg_preds = legacy_trainer.train_step(data, targets, torch.tensor([optim_config.lr]))
        legacy_loss.append(leg_loss.cpu())

    # Compare legacy vs experimental APIs
    _test_helpers.assert_model_outputs(legacy_loss, experimental_loss, rtol=1e-6)
def testTrainingAndEvalDropout(self):
    """Check that dropout is active during train_step and disabled during
    eval_step (currently disabled — see comment below)."""
    # Temporarily disable this test.
    # The graph below will trigger ORT
    # to sort backward graph before forward graph which gives incorrect result.
    # TODO Re-enable when that is fixed.
    return

    class TwoDropoutNet(nn.Module):
        # Trainable zero vector followed by two stacked dropout layers.
        def __init__(self, drop_prb_1, drop_prb_2, dim_size):
            super(TwoDropoutNet, self).__init__()
            self.drop_1 = nn.Dropout(drop_prb_1)
            self.drop_2 = nn.Dropout(drop_prb_2)
            self.weight_1 = torch.nn.Parameter(torch.zeros(dim_size, dtype=torch.float32))

        def forward(self, x):
            x = x + self.weight_1
            x = self.drop_1(x)
            x = self.drop_2(x)
            output = x
            # Return a scalar (first element) so it can serve as the loss.
            return output[0]

    dim_size = 3
    device = torch.device("cuda", 0)
    # This will drop all values, therefore expecting all 0 in output tensor
    model = TwoDropoutNet(0.999, 0.999, dim_size)
    input_desc = IODescription('input', [dim_size], torch.float32)
    output_desc = IODescription('output', [], torch.float32)
    model_desc = ModelDescription([input_desc], [output_desc])
    lr_desc = ort_trainer_learning_rate_description()
    model = ORTTrainer(model, None, model_desc, "LambOptimizer",
                       map_optimizer_attributes,
                       lr_desc,
                       device,
                       postprocess_model=process_dropout,
                       world_rank=0, world_size=1)
    input = torch.ones(dim_size, dtype=torch.float32).to(device)
    expected_training_output = [0.0]
    expected_eval_output = [1.0]
    learning_rate = torch.tensor([1.0000000e+00]).to(device)
    input_args = [input, learning_rate]
    # Training: dropout p≈1 should zero the output.
    train_output = model.train_step(*input_args)

    rtol = 1e-04
    assert_allclose(expected_training_output, train_output.item(), rtol=rtol,
                    err_msg="dropout training loss mismatch")

    # Eval: dropout disabled, input passes through unchanged.
    eval_output = model.eval_step(input)
    assert_allclose(expected_eval_output, eval_output.item(), rtol=rtol,
                    err_msg="dropout eval loss mismatch")

    # Do another train step to make sure it's using original ratios
    train_output_2 = model.train_step(*input_args)
    assert_allclose(expected_training_output, train_output_2.item(), rtol=rtol,
                    err_msg="dropout training loss 2 mismatch")
def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask,
                                        sequence_labels, token_labels, choice_labels):
    """Wrap BertForMaskedLM (which emits a loss output) in ORTTrainer and
    run a forward call through the trainer."""
    model = BertForMaskedLM(config=config)
    model.eval()
    loss, prediction_scores = model(input_ids, attention_mask=input_mask,
                                    token_type_ids=token_type_ids,
                                    masked_lm_labels=token_labels)

    #####
    model_desc = ModelDescription([
        self.input_ids_desc, self.attention_mask_desc, self.token_type_ids_desc,
        self.masked_lm_labels_desc
    ], [self.loss_desc, self.prediction_scores_desc])
    # Local stand-ins for command-line args used by the trainer below.
    args_gradient_accumulation_steps = 8
    args_local_rank = 0
    args_world_size = 1
    args_fp16 = True
    args_allreduce_post_accumulation = True

    model = ORTTrainer(
        model,
        None,
        model_desc,
        "LambOptimizer",
        map_optimizer_attributes=map_optimizer_attributes,
        learning_rate_description=IODescription('Learning_Rate', [
            1,
        ], torch.float32),
        device=self.device,
        postprocess_model=postprocess_model,
        gradient_accumulation_steps=args_gradient_accumulation_steps,
        world_rank=args_local_rank,
        world_size=args_world_size,
        use_mixed_precision=True if args_fp16 else False,
        allreduce_post_accumulation=True if args_allreduce_post_accumulation else False)

    # Forward through the ORT-wrapped model with the full input set.
    model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
          masked_lm_labels=token_labels)
def testToyBERTModelLegacyExperimentalLRScheduler(initial_lr, lr_scheduler, legacy_lr_scheduler):
    """Compare experimental vs legacy ORTTrainer on toy BERT when a learning
    rate scheduler is used, also checking the LR produced at each step.

    Args:
        initial_lr: starting learning rate (must match the hard-coded values
            these schedulers were tuned against).
        lr_scheduler: experimental-API scheduler class.
        legacy_lr_scheduler: matching legacy scheduler function.
    """
    ############################################################################
    # These tests require hard-coded values for 'total_steps' and 'initial_lr' #
    ############################################################################

    # Common setup
    total_steps = 128
    device = 'cuda'
    seed = 1
    warmup = 0.05
    cycles = 0.5
    power = 1.
    lr_end = 1e-7

    # Setup both Experimental and Legacy LR Schedulers before the experimental loop
    # Bind the scheduler-specific hyperparameters via functools.partial.
    if legacy_lr_scheduler == _test_commons.legacy_constant_lr_scheduler or legacy_lr_scheduler == _test_commons.legacy_linear_lr_scheduler:
        legacy_lr_scheduler = partial(legacy_lr_scheduler, initial_lr=initial_lr,
                                      total_steps=total_steps, warmup=warmup)
    elif legacy_lr_scheduler == _test_commons.legacy_cosine_lr_scheduler:
        legacy_lr_scheduler = partial(legacy_lr_scheduler, initial_lr=initial_lr,
                                      total_steps=total_steps, warmup=warmup, cycles=cycles)
    elif legacy_lr_scheduler == _test_commons.legacy_poly_lr_scheduler:
        legacy_lr_scheduler = partial(legacy_lr_scheduler, initial_lr=initial_lr,
                                      total_steps=total_steps, warmup=warmup, power=power, lr_end=lr_end)
    else:
        raise RuntimeError("Invalid legacy_lr_scheduler")
    # Instantiate the experimental scheduler with the matching hyperparameters.
    if lr_scheduler == optim.lr_scheduler.ConstantWarmupLRScheduler or lr_scheduler == optim.lr_scheduler.LinearWarmupLRScheduler:
        lr_scheduler = lr_scheduler(total_steps=total_steps, warmup=warmup)
    elif lr_scheduler == optim.lr_scheduler.CosineWarmupLRScheduler:
        lr_scheduler = lr_scheduler(total_steps=total_steps, warmup=warmup, cycles=cycles)
    elif lr_scheduler == optim.lr_scheduler.PolyWarmupLRScheduler:
        lr_scheduler = lr_scheduler(total_steps=total_steps, warmup=warmup, power=power, lr_end=lr_end)
    else:
        raise RuntimeError("Invalid lr_scheduler")

    # EXPERIMENTAL API
    model_desc = bert_model_description()
    model = load_bert_onnx_model()
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)
    optim_config = optim.AdamConfig(lr=initial_lr)
    opts = orttrainer.ORTTrainerOptions({
        'debug': {
            'deterministic_compute': True
        },
        'device': {
            'id': device,
        },
        'lr_scheduler': lr_scheduler
    })
    trainer = orttrainer.ORTTrainer(model, model_desc, optim_config, options=opts)
    experimental_losses = []
    for i in range(total_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        experimental_losses.append(trainer.train_step(*sample_input).cpu().item())
        # The LR applied this step must equal the legacy scheduler's value.
        assert_allclose(trainer.options.lr_scheduler.get_last_lr()[0], legacy_lr_scheduler(i))

    # LEGACY IMPLEMENTATION
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)
    device = torch.device(device)
    model = load_bert_onnx_model()
    legacy_model_desc, learning_rate_description, learning_rate = legacy_model_params(initial_lr)
    legacy_trainer = Legacy_ORTTrainer(model, None, legacy_model_desc, "AdamOptimizer",
                                       None, learning_rate_description, device,
                                       _use_deterministic_compute=True,
                                       get_lr_this_step=legacy_lr_scheduler)
    legacy_losses = []
    for i in range(total_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        # No explicit LR arg: the legacy trainer pulls it from get_lr_this_step.
        leg_loss = legacy_trainer.train_step(*sample_input)
        legacy_losses.append(leg_loss.cpu().item())

    # Check results
    _test_helpers.assert_model_outputs(experimental_losses, legacy_losses)
def train(self):
    """
    Main training entry point.

    Builds an ORTTrainer (experimental or legacy API depending on
    `self.use_new_api`), then runs the epoch/step loop with gradient
    accumulation, periodic logging, and optional in-training evaluation.

    Returns:
        TrainOutput with the final global step and mean training loss.
    """
    train_dataloader = self.get_train_dataloader()

    # Derive total optimization steps and epochs from max_steps if given,
    # otherwise from the dataset size and accumulation factor.
    if self.args.max_steps > 0:
        t_total = self.args.max_steps
        num_train_epochs = (
            self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1
        )
    else:
        t_total = int(len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs)
        num_train_epochs = self.args.num_train_epochs

    if self.use_new_api:
        # Experimental orttrainer API: options object + AdamConfig.
        lr_scheduler = orttrainer.optim.LinearWarmupLRScheduler(t_total, self.args.warmup_steps/float(t_total))

        loss_scaler = amp.DynamicLossScaler() if self.args.fp16 else None
        device = self.args.device.type
        device = f'{device}:{self.args.device.index}' if self.args.device.index else f'{device}:0'
        options = orttrainer.ORTTrainerOptions({'batch' : {
                                                    'gradient_accumulation_steps' : self.args.gradient_accumulation_steps},
                                                'device': {'id': device},
                                                'mixed_precision': {
                                                    'enabled': self.args.fp16,
                                                    'loss_scaler': loss_scaler},
                                                'debug': {'deterministic_compute': True, },
                                                'utils': {
                                                    'grad_norm_clip': False},
                                                'distributed': {'allreduce_post_accumulation': True},
                                                'lr_scheduler': lr_scheduler
                                                })

        # Two parameter groups: bias/LayerNorm weights vs everything else.
        # NOTE(review): groups hold parameter *names* (`n`), not tensors —
        # presumably what the experimental param-group API expects; verify.
        param_optimizer = list(self.model.named_parameters())
        params = [{
            'params': [n for n, p in param_optimizer if "bias" in n or "LayerNorm.weight" in n],
            "weight_decay_mode": 1, }, {
            'params': [n for n, p in param_optimizer if not ("bias" in n or "LayerNorm.weight" in n)],
            "weight_decay_mode": 1, }
            ]

        optim_config = optim.AdamConfig(params=params, lr=2e-5, do_bias_correction=True)
        self.model = orttrainer.ORTTrainer(self.model, self.new_model_desc, optim_config, options=options)
    else:
        # Legacy ORTTrainer API.
        def map_optimizer_attributes(name):
            # Both branches currently return the same attributes.
            no_decay = "bias" in name or "LayerNorm.weight" in name
            if no_decay:
                return {"weight_decay_mode" : 1}
            else:
                return {"weight_decay_mode" : 1}
        get_lr_this_step = get_linear_schedule_with_warmup(self.args.warmup_steps, t_total, self.args.learning_rate)
        loss_scaler = LossScaler('loss_scale_input_name', True, up_scale_window=2000) if self.args.fp16 else None
        self.model = ORTTrainer(self.model, None,
                                self.model_desc,
                                "AdamOptimizer",
                                map_optimizer_attributes=map_optimizer_attributes,
                                learning_rate_description=IODescription('Learning_Rate', [1,], torch.float32),
                                device=self.args.device,
                                gradient_accumulation_steps=self.args.gradient_accumulation_steps,
                                use_mixed_precision=self.args.fp16,
                                allreduce_post_accumulation=True,
                                get_lr_this_step=get_lr_this_step,
                                loss_scaler=loss_scaler,
                                enable_grad_norm_clip=False,
                                _opset_version=12,
                                _use_deterministic_compute=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataloader.dataset))
    logger.info(" Num Epochs = %d", num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d", self.args.per_gpu_train_batch_size)
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        self.args.train_batch_size
        * self.args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1),
    )
    logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0

    tr_loss = 0.0
    logging_loss = 0.0
    train_iterator = trange(
        epochs_trained, int(num_train_epochs), desc="Epoch", disable=self.args.local_rank not in [-1, 0],
    )

    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=self.args.local_rank not in [-1, 0])
        for step, inputs in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            tr_loss += self._training_step(self.model, inputs)

            # An optimization step completes after `gradient_accumulation_steps`
            # micro-batches, or on the final short batch of the epoch.
            if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
                len(epoch_iterator) <= self.args.gradient_accumulation_steps
                and (step + 1) == len(epoch_iterator)
            ):
                global_step += 1

                if self.args.local_rank in [-1, 0]:
                    if (self.args.logging_steps > 0 and global_step % self.args.logging_steps == 0) or (
                        global_step == 1 and self.args.logging_first_step
                    ):
                        logs = {}
                        if self.args.evaluate_during_training:
                            results = self.evaluate()
                            for key, value in results.items():
                                eval_key = "eval_{}".format(key)
                                logs[eval_key] = value

                        loss_scalar = (tr_loss - logging_loss) / self.args.logging_steps
                        # The new API tracks LR internally; only the legacy
                        # path exposes get_lr_this_step for logging.
                        if not self.use_new_api:
                            learning_rate_scalar = get_lr_this_step(global_step)
                            logs["learning_rate"] = learning_rate_scalar
                        logs["loss"] = loss_scalar
                        logging_loss = tr_loss

                        epoch_iterator.write(json.dumps({**logs, **{"step": global_step}}))

            if self.args.max_steps > 0 and global_step > self.args.max_steps:
                epoch_iterator.close()
                break
        if self.args.max_steps > 0 and global_step > self.args.max_steps:
            train_iterator.close()
            break

    logger.info("\n\nTraining completed. \n\n")
    return TrainOutput(global_step, tr_loss / global_step)
def train(self):
    """
    Main training entry point.

    Builds an ORTTrainer around ``self.model`` and runs the full training
    loop (with gradient accumulation, optional in-loop evaluation, and
    periodic logging).  Returns a ``TrainOutput`` of the final global step
    and the average training loss per optimization step.
    NOTE(review): enclosing class is not visible in this chunk; assumes the
    usual HF-Trainer-style attributes (self.args, self.model_desc, ...).
    """
    train_dataloader = self.get_train_dataloader()
    # Derive the total number of optimization steps (t_total) either from an
    # explicit max_steps cap or from dataset length * epochs.
    if self.args.max_steps > 0:
        t_total = self.args.max_steps
        num_train_epochs = (self.args.max_steps //
                            (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1)
    else:
        t_total = int(len(train_dataloader) //
                      self.args.gradient_accumulation_steps * self.args.num_train_epochs)
        num_train_epochs = self.args.num_train_epochs

    # Linear warmup LR schedule over the whole run; dynamic loss scaling for fp16.
    get_lr_this_step = get_linear_schedule_with_warmup(
        self.args.warmup_steps, t_total, self.args.learning_rate)
    loss_scaler = LossScaler('loss_scale_input_name', True, up_scale_window=2000)

    def map_optimizer_attributes(name):
        # Per-parameter optimizer config: no weight decay for bias/LayerNorm.
        # no_decay_keys = ["bias", "LayerNorm.weight"]
        no_decay = "bias" in name or "LayerNorm.weight" in name
        if no_decay:
            return {"weight_decay": 0.0, "weight_decay_mode": 1}
        else:
            return {
                "weight_decay": self.args.weight_decay,
                "weight_decay_mode": 1
            }

    # Replace the PyTorch model with an ORTTrainer wrapper (legacy frontend).
    self.model = ORTTrainer(
        self.model,
        None,
        self.model_desc,
        "AdamOptimizer",
        map_optimizer_attributes=map_optimizer_attributes,
        learning_rate_description=IODescription('Learning_Rate', [1, ], torch.float32),
        device=self.args.device,
        gradient_accumulation_steps=self.args.gradient_accumulation_steps,
        world_rank=0,
        world_size=1,  # only support single GPU cases
        use_mixed_precision=self.args.fp16,
        allreduce_post_accumulation=True,
        get_lr_this_step=get_lr_this_step,
        loss_scaler=loss_scaler,
        enable_grad_norm_clip=False,
        _opset_version=12,
        _use_deterministic_compute=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataloader.dataset))
    logger.info(" Num Epochs = %d", num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d", self.args.per_gpu_train_batch_size)
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        self.args.train_batch_size * self.args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1),
    )
    logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    tr_loss = 0.0
    logging_loss = 0.0
    train_iterator = trange(
        epochs_trained,
        int(num_train_epochs),
        desc="Epoch",
        disable=self.args.local_rank not in [-1, 0],
    )
    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=self.args.local_rank not in [-1, 0])
        for step, inputs in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            tr_loss += self._training_step(self.model, inputs)
            # An optimization step happens every gradient_accumulation_steps
            # micro-batches, or on the final (short) batch of a small epoch.
            if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
                    len(epoch_iterator) <= self.args.gradient_accumulation_steps
                    and (step + 1) == len(epoch_iterator)):
                global_step += 1
                if self.args.local_rank in [-1, 0]:
                    # Log on the configured cadence (and optionally the first step).
                    if (self.args.logging_steps > 0
                            and global_step % self.args.logging_steps == 0) or (
                                global_step == 1 and self.args.logging_first_step):
                        logs = {}
                        if self.args.evaluate_during_training:
                            results = self.evaluate()
                            for key, value in results.items():
                                eval_key = "eval_{}".format(key)
                                logs[eval_key] = value
                        # Loss averaged over the logging window.
                        loss_scalar = (tr_loss - logging_loss) / self.args.logging_steps
                        learning_rate_scalar = get_lr_this_step(global_step)
                        logs["learning_rate"] = learning_rate_scalar
                        logs["loss"] = loss_scalar
                        logging_loss = tr_loss
                        epoch_iterator.write(json.dumps({**logs, **{"step": global_step}}))
            if self.args.max_steps > 0 and global_step > self.args.max_steps:
                epoch_iterator.close()
                break
        if self.args.max_steps > 0 and global_step > self.args.max_steps:
            train_iterator.close()
            break

    logger.info("\n\nTraining completed. \n\n")
    return TrainOutput(global_step, tr_loss / global_step)
lr, ], torch.float32) model_desc, lr_desc = transformer_model_description() def get_lr_this_step(global_step): return 1 trainer = ORTTrainer( model, my_loss, model_desc, "LambOptimizer", None, lr_desc, device, _use_deterministic_compute=True) #, get_lr_this_step=get_lr_this_step) second_trainer = ORTTrainer( model, my_loss, model_desc, "LambOptimizer", None, lr_desc, device, _use_deterministic_compute=True) #, get_lr_this_step=get_lr_this_step) import time
def run_test(model, model_desc, device, args, gradient_accumulation_steps, fp16,
             allreduce_post_accumulation, get_lr_this_step, use_internal_get_lr_this_step,
             loss_scaler, use_internal_loss_scaler, batch_args_option, dataset_len, epochs,
             use_new_api):
    """
    Train and then evaluate a BERT toy model through ORTTrainer, exercising
    either the new (orttrainer.ORTTrainer) or the legacy frontend API.

    Returns a generator of numpy arrays, one per output of the final
    eval_step call.  NOTE(review): ``eval_batch`` is captured but the eval
    section below feeds the *last training* ``batch`` — confirm intent.
    """
    dataloader = create_ort_test_dataloader(model_desc.inputs_, args.batch_size, args.seq_len,
                                            dataset_len, device)

    if use_new_api:
        # New frontend: options object + optimizer config instead of kwargs.
        assert use_internal_loss_scaler, 'new api should always use internal loss scaler'

        new_api_lr_scheduler = WrapLRScheduler(get_lr_this_step)
        new_api_loss_scaler = amp.DynamicLossScaler() if fp16 else None
        options = orttrainer.ORTTrainerOptions({
            'batch': {'gradient_accumulation_steps': gradient_accumulation_steps},
            'device': {'id': device},
            'mixed_precision': {'enabled': fp16, 'loss_scaler': new_api_loss_scaler},
            'debug': {'deterministic_compute': True, },
            'utils': {'grad_norm_clip': True},
            'distributed': {'allreduce_post_accumulation': True},
            'lr_scheduler': new_api_lr_scheduler
        })

        # Two parameter groups: no weight decay for bias/LayerNorm weights.
        param_optimizer = list(model.named_parameters())
        params = [{
            'params': [n for n, p in param_optimizer if "bias" in n or "LayerNorm.weight" in n],
            "alpha": 0.9,
            "beta": 0.999,
            "lambda": 0.0,
            "epsilon": 1e-6
        }, {
            'params': [n for n, p in param_optimizer if not ("bias" in n or "LayerNorm.weight" in n)],
            "alpha": 0.9,
            "beta": 0.999,
            "lambda": 0.0,
            "epsilon": 1e-6
        }]

        vocab_size = 99
        # Dict-style model description used by the new frontend.
        new_model_desc = {
            'inputs': [(
                'input_ids',
                ['batch', 'max_seq_len_in_batch'],
            ), (
                'attention_mask',
                ['batch', 'max_seq_len_in_batch'],
            ), (
                'token_type_ids',
                ['batch', 'max_seq_len_in_batch'],
            ), (
                'masked_lm_labels',
                ['batch', 'max_seq_len_in_batch'],
            ), ('next_sentence_label', ['batch', ])],
            'outputs': [('loss', [1, ], True),
                        ('prediction_scores', ['batch', 'max_seq_len_in_batch', vocab_size]),
                        ('seq_relationship_scores', ['batch', 2])]
        }

        optim_config = optim.LambConfig(params=params, lr=2e-5)
        model = orttrainer.ORTTrainer(model, new_model_desc, optim_config, options=options)
        print("running with new frontend API")
    else:
        # Legacy frontend: everything configured through constructor kwargs.
        model = ORTTrainer(
            model,
            None,
            model_desc,
            "LambOptimizer",
            map_optimizer_attributes=map_optimizer_attributes,
            learning_rate_description=IODescription('Learning_Rate', [1, ], torch.float32),
            device=device,
            _enable_internal_postprocess=True,
            gradient_accumulation_steps=gradient_accumulation_steps,
            # BertLAMB default initial settings: b1=0.9, b2=0.999, e=1e-6
            world_rank=args.local_rank,
            world_size=args.world_size,
            use_mixed_precision=fp16,
            allreduce_post_accumulation=allreduce_post_accumulation,
            get_lr_this_step=get_lr_this_step if use_internal_get_lr_this_step else None,
            loss_scaler=loss_scaler if use_internal_loss_scaler else None,
            _opset_version=14,
            _use_deterministic_compute=True)
        print("running with old frontend API")

    # training loop
    eval_batch = None
    if not use_new_api:
        model.train()
    for epoch in range(epochs):
        for step, batch in enumerate(dataloader):
            if eval_batch is None:
                eval_batch = batch

            # When not using the trainer-internal scheduler/scaler, the lr and
            # loss-scale tensors must be passed explicitly with each step.
            if not use_internal_get_lr_this_step:
                lr = get_lr_this_step(step)
                learning_rate = torch.tensor([lr])

            if not use_internal_loss_scaler and fp16:
                loss_scale = torch.tensor([loss_scaler.loss_scale_])

            # Exercise the three supported calling conventions.
            if batch_args_option == BatchArgsOption.List:
                if not use_internal_get_lr_this_step:
                    batch = batch + [learning_rate, ]
                if not use_internal_loss_scaler and fp16:
                    batch = batch + [loss_scale, ]
                outputs = model.train_step(*batch)
            elif batch_args_option == BatchArgsOption.Dict:
                args, kwargs = split_batch(batch, model_desc.inputs_, 0)
                if not use_internal_get_lr_this_step:
                    kwargs['Learning_Rate'] = learning_rate
                if not use_internal_loss_scaler and fp16:
                    kwargs[model.loss_scale_input_name] = loss_scale
                outputs = model.train_step(*args, **kwargs)
            else:
                args_count = int(len(model_desc.inputs_) / 2)  # approx half args, half kwargs
                args, kwargs = split_batch(batch, model_desc.inputs_, args_count)
                if not use_internal_get_lr_this_step:
                    kwargs['Learning_Rate'] = learning_rate
                if not use_internal_loss_scaler and fp16:
                    kwargs[model.loss_scale_input_name] = loss_scale
                outputs = model.train_step(*args, **kwargs)

    # eval (uses the last training batch — see docstring note)
    if batch_args_option == BatchArgsOption.List:
        outputs = model.eval_step(*batch)
    elif batch_args_option == BatchArgsOption.Dict:
        args, kwargs = split_batch(batch, model_desc.inputs_, 0)
        outputs = model.eval_step(*args, **kwargs)
    else:
        args_count = int(len(model_desc.inputs_) / 2)  # approx half args, half kwargs
        args, kwargs = split_batch(batch, model_desc.inputs_, args_count)
        outputs = model.eval_step(*args, **kwargs)

    return (output.cpu().numpy() for output in outputs)
def run_test(model, model_desc, device, args, gradient_accumulation_steps, fp16,
             allreduce_post_accumulation, get_lr_this_step, use_internal_get_lr_this_step,
             loss_scaler, use_internal_loss_scaler, batch_args_option):
    """
    Legacy-frontend variant: train a model through ORTTrainer for one pass
    over the dataloader, then run a single eval pass and return the outputs
    as a generator of numpy arrays.

    NOTE(review): ``eval_batch`` is captured but the eval section feeds the
    last training ``batch`` — confirm intent.
    """
    dataloader = create_ort_test_dataloader(model_desc.inputs_, args.batch_size, args.seq_len,
                                            device)

    model = ORTTrainer(
        model,
        None,
        model_desc,
        "LambOptimizer",
        map_optimizer_attributes=map_optimizer_attributes,
        learning_rate_description=IODescription('Learning_Rate', [1, ], torch.float32),
        device=device,
        postprocess_model=postprocess_model,
        gradient_accumulation_steps=gradient_accumulation_steps,
        # BertLAMB default initial settings: b1=0.9, b2=0.999, e=1e-6
        world_rank=args.local_rank,
        world_size=args.world_size,
        use_mixed_precision=fp16,
        allreduce_post_accumulation=allreduce_post_accumulation,
        get_lr_this_step=get_lr_this_step if use_internal_get_lr_this_step else None,
        loss_scaler=loss_scaler if use_internal_loss_scaler else None,
        _opset_version=12)

    # training loop
    eval_batch = None
    model.train()
    for step, batch in enumerate(dataloader):
        if eval_batch is None:
            eval_batch = batch

        # When not using the trainer-internal scheduler/scaler, lr and
        # loss-scale tensors are passed explicitly with every step.
        if not use_internal_get_lr_this_step:
            lr = get_lr_this_step(step)
            learning_rate = torch.tensor([lr])

        if not use_internal_loss_scaler and fp16:
            loss_scale = torch.tensor([loss_scaler.loss_scale_])

        # Exercise the three supported calling conventions.
        if batch_args_option == BatchArgsOption.List:
            if not use_internal_get_lr_this_step:
                batch = batch + [learning_rate, ]
            if not use_internal_loss_scaler and fp16:
                batch = batch + [loss_scale, ]
            outputs = model(*batch)
        elif batch_args_option == BatchArgsOption.Dict:
            args, kwargs = split_batch(batch, model_desc.inputs_, 0)
            if not use_internal_get_lr_this_step:
                kwargs['Learning_Rate'] = learning_rate
            if not use_internal_loss_scaler and fp16:
                kwargs[model.loss_scale_input_name] = loss_scale
            outputs = model(*args, **kwargs)
        else:
            args_count = int(len(model_desc.inputs_) / 2)  # approx half args, half kwargs
            args, kwargs = split_batch(batch, model_desc.inputs_, args_count)
            if not use_internal_get_lr_this_step:
                kwargs['Learning_Rate'] = learning_rate
            if not use_internal_loss_scaler and fp16:
                kwargs[model.loss_scale_input_name] = loss_scale
            outputs = model(*args, **kwargs)
        # Print the loss tensor for each step (debug output).
        # NOTE(review): placement relative to the loop is ambiguous in the
        # collapsed source — assumed per-step here; confirm against history.
        print(outputs[0])

    # eval (uses the last training batch — see docstring note)
    model.eval()
    if batch_args_option == BatchArgsOption.List:
        outputs = model(*batch)
    elif batch_args_option == BatchArgsOption.Dict:
        args, kwargs = split_batch(batch, model_desc.inputs_, 0)
        outputs = model(*args, **kwargs)
    else:
        args_count = int(len(model_desc.inputs_) / 2)  # approx half args, half kwargs
        args, kwargs = split_batch(batch, model_desc.inputs_, args_count)
        outputs = model(*args, **kwargs)

    return (output.cpu().numpy() for output in outputs)
def main():
    """Train and test an MNIST classifier with ORTTrainer under MPI.

    Parses CLI settings, derives distributed rank info from OpenMPI
    environment variables (falling back to single-process defaults),
    builds the NeuralNet model plus ORTTrainer, and runs one
    train/test pass per epoch.
    """
    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument("--batch-size",
                        type=int,
                        default=64,
                        metavar="N",
                        help="input batch size for training (default: 64)")
    parser.add_argument("--test-batch-size",
                        type=int,
                        default=1000,
                        metavar="N",
                        help="input batch size for testing (default: 1000)")
    parser.add_argument("--epochs",
                        type=int,
                        default=10,
                        metavar="N",
                        help="number of epochs to train (default: 10)")
    parser.add_argument("--lr",
                        type=float,
                        default=0.01,
                        metavar="LR",
                        help="learning rate (default: 0.01)")
    parser.add_argument("--no-cuda",
                        action="store_true",
                        default=False,
                        help="disables CUDA training")
    parser.add_argument("--seed",
                        type=int,
                        default=1,
                        metavar="S",
                        help="random seed (default: 1)")
    parser.add_argument(
        "--log-interval",
        type=int,
        default=10,
        metavar="N",
        help="how many batches to wait before logging training status",
    )
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    kwargs = {"num_workers": 0, "pin_memory": True}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(
            "../data",
            train=True,
            download=True,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, ))
            ]),
        ),
        batch_size=args.batch_size,
        shuffle=True,
        **kwargs,
    )
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(
            "../data",
            train=False,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, ))
            ]),
        ),
        batch_size=args.test_batch_size,
        shuffle=True,
        **kwargs,
    )

    comm = MPI.COMM_WORLD
    # Rank info comes from OpenMPI env vars; default to single-process values.
    args.local_rank = (int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"]) if
                       ("OMPI_COMM_WORLD_LOCAL_RANK" in os.environ) else 0)
    args.world_rank = int(os.environ["OMPI_COMM_WORLD_RANK"]) if (
        "OMPI_COMM_WORLD_RANK" in os.environ) else 0
    args.world_size = comm.Get_size()
    if use_cuda:
        # CUDA-only setup: bind this process to its local GPU.
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        args.n_gpu = 1
        set_cuda_device_id(args.local_rank)
    else:
        device = torch.device("cpu")

    input_size = 784
    hidden_size = 500
    num_classes = 10
    model = NeuralNet(input_size, hidden_size, num_classes)
    model_desc = mnist_model_description()
    # use log_interval as gradient accumulate steps
    trainer = ORTTrainer(
        model,
        my_loss,
        model_desc,
        "SGDOptimizer",
        None,
        IODescription(
            "Learning_Rate",
            [
                1,
            ],
            torch.float32,
        ),
        device,
        1,
        args.world_rank,
        args.world_size,
        use_mixed_precision=False,
        allreduce_post_accumulation=True,
    )
    print("\nBuild ort model done.")

    for epoch in range(1, args.epochs + 1):
        train_with_trainer(args, trainer, device, train_loader, epoch)
        # Removed stray debugging leftover: `import pdb` (never used).
        test_with_trainer(args, trainer, device, test_loader)
class ORTTransformerTrainer:
    """
    A HuggingFace-Trainer-style wrapper that trains a transformers
    ``PreTrainedModel`` through ONNX Runtime's legacy ``ORTTrainer``
    frontend.  Provides train/evaluate/predict/save entry points.
    """

    model: PreTrainedModel
    args: TrainingArguments
    train_dataset: Dataset
    eval_dataset: Dataset
    compute_metrics: Callable[[EvalPrediction], Dict]

    def __init__(
        self,
        model: PreTrainedModel,
        model_desc: ModelDescription,
        args: TrainingArguments,
        train_dataset: Dataset,
        eval_dataset: Dataset,
        compute_metrics: Callable[[EvalPrediction], Dict],
    ):
        """
        Store the model, its ORT model description, training arguments and
        datasets; seed RNGs and create the output directory on rank -1/0.
        """
        self.model = model
        self.model_desc = model_desc
        self.args = args
        self.data_collator = DefaultDataCollator()
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.compute_metrics = compute_metrics
        set_seed(self.args.seed)
        # Create output directory if needed
        if self.args.local_rank in [-1, 0]:
            os.makedirs(self.args.output_dir, exist_ok=True)

    def get_train_dataloader(self) -> DataLoader:
        # Sequential sampling when not distributed; DistributedSampler otherwise.
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")
        train_sampler = (SequentialSampler(self.train_dataset)
                         if self.args.local_rank == -1 else DistributedSampler(self.train_dataset))
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=train_sampler,
            collate_fn=self.data_collator.collate_batch,
        )

    def get_eval_dataloader(self) -> DataLoader:
        # Evaluation keeps dataset order (shuffle=False).
        return DataLoader(
            self.eval_dataset,
            batch_size=self.args.eval_batch_size,
            shuffle=False,
            collate_fn=self.data_collator.collate_batch,
        )

    def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
        # We use the same batch_size as for eval.
        return DataLoader(
            test_dataset,
            batch_size=self.args.eval_batch_size,
            shuffle=False,
            collate_fn=self.data_collator.collate_batch,
        )

    def train(self):
        """
        Main training entry point.

        Wraps ``self.model`` in an ORTTrainer (Adam, linear-warmup LR,
        dynamic loss scaling) and runs the full training loop with gradient
        accumulation, periodic logging, and optional in-loop evaluation.
        Returns a ``TrainOutput`` of the final global step and the average
        loss per optimization step.
        """
        train_dataloader = self.get_train_dataloader()
        # t_total = number of optimization steps, from max_steps or epochs.
        if self.args.max_steps > 0:
            t_total = self.args.max_steps
            num_train_epochs = (self.args.max_steps //
                                (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1)
        else:
            t_total = int(len(train_dataloader) //
                          self.args.gradient_accumulation_steps * self.args.num_train_epochs)
            num_train_epochs = self.args.num_train_epochs

        get_lr_this_step = get_linear_schedule_with_warmup(
            self.args.warmup_steps, t_total, self.args.learning_rate)
        loss_scaler = LossScaler('loss_scale_input_name', True, up_scale_window=2000)

        def map_optimizer_attributes(name):
            # Per-parameter optimizer config: no weight decay for bias/LayerNorm.
            # no_decay_keys = ["bias", "LayerNorm.weight"]
            no_decay = "bias" in name or "LayerNorm.weight" in name
            if no_decay:
                return {"weight_decay": 0.0, "weight_decay_mode": 1}
            else:
                return {
                    "weight_decay": self.args.weight_decay,
                    "weight_decay_mode": 1
                }

        self.model = ORTTrainer(
            self.model,
            None,
            self.model_desc,
            "AdamOptimizer",
            map_optimizer_attributes=map_optimizer_attributes,
            learning_rate_description=IODescription('Learning_Rate', [1, ], torch.float32),
            device=self.args.device,
            gradient_accumulation_steps=self.args.gradient_accumulation_steps,
            world_rank=0,
            world_size=1,  # only support single GPU cases
            use_mixed_precision=self.args.fp16,
            allreduce_post_accumulation=True,
            get_lr_this_step=get_lr_this_step,
            loss_scaler=loss_scaler,
            enable_grad_norm_clip=False,
            _opset_version=12,
            _use_deterministic_compute=True)

        # Train!
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_dataloader.dataset))
        logger.info(" Num Epochs = %d", num_train_epochs)
        logger.info(" Instantaneous batch size per GPU = %d", self.args.per_gpu_train_batch_size)
        logger.info(
            " Total train batch size (w. parallel, distributed & accumulation) = %d",
            self.args.train_batch_size * self.args.gradient_accumulation_steps *
            (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1),
        )
        logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
        logger.info(" Total optimization steps = %d", t_total)

        global_step = 0
        epochs_trained = 0
        steps_trained_in_current_epoch = 0
        tr_loss = 0.0
        logging_loss = 0.0
        train_iterator = trange(
            epochs_trained,
            int(num_train_epochs),
            desc="Epoch",
            disable=self.args.local_rank not in [-1, 0],
        )
        for epoch in train_iterator:
            epoch_iterator = tqdm(train_dataloader,
                                  desc="Iteration",
                                  disable=self.args.local_rank not in [-1, 0])
            for step, inputs in enumerate(epoch_iterator):
                # Skip past any already trained steps if resuming training
                if steps_trained_in_current_epoch > 0:
                    steps_trained_in_current_epoch -= 1
                    continue
                tr_loss += self._training_step(self.model, inputs)
                # An optimization step occurs every gradient_accumulation_steps
                # micro-batches, or on the final batch of a short epoch.
                if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
                        len(epoch_iterator) <= self.args.gradient_accumulation_steps
                        and (step + 1) == len(epoch_iterator)):
                    global_step += 1
                    if self.args.local_rank in [-1, 0]:
                        if (self.args.logging_steps > 0
                                and global_step % self.args.logging_steps == 0) or (
                                    global_step == 1 and self.args.logging_first_step):
                            logs = {}
                            if self.args.evaluate_during_training:
                                results = self.evaluate()
                                for key, value in results.items():
                                    eval_key = "eval_{}".format(key)
                                    logs[eval_key] = value
                            # Loss averaged over the logging window.
                            loss_scalar = (tr_loss - logging_loss) / self.args.logging_steps
                            learning_rate_scalar = get_lr_this_step(global_step)
                            logs["learning_rate"] = learning_rate_scalar
                            logs["loss"] = loss_scalar
                            logging_loss = tr_loss
                            epoch_iterator.write(json.dumps({**logs, **{"step": global_step}}))
                if self.args.max_steps > 0 and global_step > self.args.max_steps:
                    epoch_iterator.close()
                    break
            if self.args.max_steps > 0 and global_step > self.args.max_steps:
                train_iterator.close()
                break

        logger.info("\n\nTraining completed. \n\n")
        return TrainOutput(global_step, tr_loss / global_step)

    def _training_step(self, model: ORTTrainer, inputs: Dict[str, torch.Tensor]) -> float:
        # Move the batch to the target device, run one train step, return the
        # scalar loss.  Mutates `inputs` in place.
        for k, v in inputs.items():
            inputs[k] = v.to(self.args.device)

        outputs = model(**inputs)
        loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

        return loss.item()

    def save_model(self, output_dir: Optional[str] = None):
        # Export the trained model to ONNX under the output directory.
        output_dir = output_dir if output_dir is not None else self.args.output_dir
        os.makedirs(output_dir, exist_ok=True)
        self.model.save_as_onnx(os.path.join(output_dir, "transformer.onnx"))

    def evaluate(self) -> Dict[str, float]:
        """
        Run evaluation and return metrics.

        Returns:
            A dict containing:
            - the eval loss
            - the potential metrics computed from the predictions
        """
        eval_dataloader = self.get_eval_dataloader()

        output = self._prediction_loop(eval_dataloader, description="Evaluation")

        return output.metrics

    def predict(self, test_dataset: Dataset) -> PredictionOutput:
        """
        Run prediction and return predictions and potential metrics.

        Depending on the dataset and your use case, your test dataset may contain labels.
        In that case, this method will also return metrics, like in evaluate().
        """
        test_dataloader = self.get_test_dataloader(test_dataset)
        return self._prediction_loop(test_dataloader, description="Prediction")

    def _prediction_loop(self, dataloader: DataLoader, description: str) -> PredictionOutput:
        """
        Prediction/evaluation loop, shared by `evaluate()` and `predict()`.

        Works both with or without labels.
        """
        logger.info("***** Running %s *****", description)
        logger.info(" Num examples = %d", len(dataloader.dataset))
        logger.info(" Batch size = %d", dataloader.batch_size)
        eval_losses: List[float] = []
        preds: np.ndarray = None
        label_ids: np.ndarray = None
        self.model.eval()

        for inputs in tqdm(dataloader, desc=description):
            # A batch "has labels" if either key is present and non-None.
            has_labels = any(
                inputs.get(k) is not None for k in ["labels", "masked_lm_labels"])

            for k, v in inputs.items():
                inputs[k] = v.to(self.args.device)

            with torch.no_grad():
                outputs = self.model(**inputs)
                if has_labels:
                    # Convention: (loss, logits, ...) when labels are supplied.
                    step_eval_loss, logits = outputs[:2]
                    eval_losses += [step_eval_loss.mean().item()]
                else:
                    logits = outputs[0]

            # Accumulate predictions (and label ids) across batches.
            if preds is None:
                preds = logits.detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            if inputs.get("labels") is not None:
                if label_ids is None:
                    label_ids = inputs["labels"].detach().cpu().numpy()
                else:
                    label_ids = np.append(label_ids,
                                          inputs["labels"].detach().cpu().numpy(),
                                          axis=0)

        # Metrics only when a metric fn and full predictions+labels exist.
        if self.compute_metrics is not None and preds is not None and label_ids is not None:
            metrics = self.compute_metrics(
                EvalPrediction(predictions=preds, label_ids=label_ids))
        else:
            metrics = {}
        if len(eval_losses) > 0:
            metrics["loss"] = np.mean(eval_losses)

        return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics)
def main():
    """Train an MNIST classifier either via ORTTrainer or via ORTModel+SGD.

    Parses CLI settings, derives MPI rank info from OpenMPI environment
    variables, and runs the selected training backend for the requested
    number of epochs.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    parser.add_argument('--use-ort', action='store_true', default=False,
                        help='to use onnxruntime as training backend')
    parser.add_argument('--use-ort-trainer', action='store_true', default=False,
                        help='to use onnxruntime as training backend')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    kwargs = {'num_workers': 0, 'pin_memory': True}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))])),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)

    comm = MPI.COMM_WORLD
    # Rank info comes from OpenMPI env vars; default to single-process values.
    args.local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) if (
        'OMPI_COMM_WORLD_LOCAL_RANK' in os.environ) else 0
    args.world_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) if (
        'OMPI_COMM_WORLD_RANK' in os.environ) else 0
    args.world_size = comm.Get_size()

    # Fix: CUDA-specific setup (torch.cuda.set_device, set_cuda_device_id)
    # previously ran unconditionally and would fail on CPU-only hosts; it is
    # now guarded by use_cuda, matching the sibling MNIST main in this file.
    if use_cuda:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        args.n_gpu = 1
        set_cuda_device_id(args.local_rank)
    else:
        device = torch.device("cpu")

    input_size = 784
    hidden_size = 500
    num_classes = 10
    model = NeuralNet(input_size, hidden_size, num_classes)

    model_desc = mnist_model_description()
    if args.use_ort_trainer:
        # use log_interval as gradient accumulate steps
        trainer = ORTTrainer(model, my_loss, model_desc, "LambOptimizer", None,
                             IODescription('Learning_Rate', [1, ], torch.float32),
                             device, 1, None, args.world_rank, args.world_size,
                             use_mixed_precision=False,
                             allreduce_post_accumulation=True)
        print('\nBuild ort model done.')

        for epoch in range(1, args.epochs + 1):
            train_with_trainer(args, trainer, device, train_loader, epoch)
            # Removed stray debugging leftover: `import pdb` (never used).
            test_with_trainer(args, trainer, device, test_loader)
    else:
        model = ORTModel(model, my_loss, model_desc, device, None,
                         args.world_rank, args.world_size)
        print('\nBuild ort model done.')
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

        for epoch in range(1, args.epochs + 1):
            train_with_model(args, model, device, train_loader, optimizer, epoch)
model_desc = mnist_model_description() # use log_interval as gradient accumulate steps with open("models/mnist_ort_ONNX.pt", "rb") as f: bin_str = f.read() model = onnx.load_model_from_string(bin_str) #print(model) trainer = ORTTrainer(model, None, model_desc, "LambOptimizer", None, IODescription('Learning_Rate', [ 1, ], torch.float32), device, gradient_accumulation_steps=1, world_rank=world_rank, world_size=world_size, use_mixed_precision=False, allreduce_post_accumulation=True) print('\nBuild ort model done.') ort_sd = trainer.state_dict() #print(ort_sd) printSizes(torch_model, "PyTorch") printSizes(trainer, "ORT") compareModels(torch_model, trainer)