def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask,
                                           sequence_labels, token_labels, choice_labels):
    model = BertForPreTraining(config=config)
    model.to(torch_device)
    model.eval()
    loss, prediction_scores, seq_relationship_score = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        masked_lm_labels=token_labels,
        next_sentence_label=sequence_labels,
    )
    result = {
        "loss": loss,
        "prediction_scores": prediction_scores,
        "seq_relationship_score": seq_relationship_score,
    }
    self.parent.assertListEqual(
        list(result["prediction_scores"].size()),
        [self.batch_size, self.seq_length, self.vocab_size],
    )
    self.parent.assertListEqual(
        list(result["seq_relationship_score"].size()),
        [self.batch_size, 2],
    )
    self.check_loss_output(result)
def create_and_check_for_pretraining(
    self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
    model = BertForPreTraining(config=config)
    model.to(torch_device)
    model.eval()
    result = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        labels=token_labels,
        next_sentence_label=sequence_labels,
    )
    self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
    self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, 2))
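# A minimal, self-contained sketch (not part of the test helpers above) that exercises
# BertForPreTraining directly and checks the same output shapes the two helpers assert.
# The tiny config values and the helper name below are illustrative assumptions chosen
# only to keep the example fast; attribute access on the returned output assumes a
# recent transformers version (4.x), where return_dict defaults to True.
import torch
from transformers import BertConfig, BertForPreTraining


def _sketch_check_pretraining_shapes(batch_size=2, seq_length=7):
    config = BertConfig(
        vocab_size=99,
        hidden_size=32,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=37,
    )
    model = BertForPreTraining(config)
    model.eval()
    input_ids = torch.randint(0, config.vocab_size, (batch_size, seq_length))
    with torch.no_grad():
        outputs = model(input_ids)
    # Same shape checks as the test helpers: MLM logits over the vocab, NSP logits over 2 classes.
    assert outputs.prediction_logits.shape == (batch_size, seq_length, config.vocab_size)
    assert outputs.seq_relationship_logits.shape == (batch_size, 2)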
def load_bert(model_name):
    if model_name == 'robbert':
        # Dutch RoBERTa model.
        tokenizer = RobertaTokenizer.from_pretrained("pdelobelle/robBERT-base")
        model = RobertaModel.from_pretrained("pdelobelle/robBERT-base")
    elif model_name == 'bertje':
        # Dutch BERT model.
        tokenizer = BertTokenizer.from_pretrained("wietsedv/bert-base-dutch-cased")
        model = BertModel.from_pretrained("wietsedv/bert-base-dutch-cased")
    elif model_name == 'bert-nl':
        # Locally stored checkpoint; keep only the base encoder of the pre-training wrapper.
        tokenizer = BertTokenizer.from_pretrained("data/bert-nl")
        config = BertConfig.from_json_file("data/bert-nl/config.json")
        model = BertForPreTraining(config).bert
    else:
        raise ValueError('invalid model name')
    model.eval()
    return tokenizer, model
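# Hedged usage sketch for load_bert above: encode one Dutch sentence and return the token
# embeddings. The function name, the example sentence, and the default model choice are
# illustrative assumptions; indexing the output with [0] for the last hidden state assumes
# a transformers version where the base models return it as the first element.
import torch


def embed_sentence(sentence="Dit is een voorbeeldzin.", model_name="bertje"):
    tokenizer, model = load_bert(model_name)
    inputs = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    last_hidden_state = outputs[0]  # shape: (1, num_tokens, hidden_size)
    return last_hidden_state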
def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask,
                                           sequence_labels, token_labels, choice_labels):
    seed = 42
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    onnxruntime.set_seed(seed)

    model = BertForPreTraining(config=config)
    model.eval()
    loss, prediction_scores, seq_relationship_score = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        masked_lm_labels=token_labels,
        next_sentence_label=sequence_labels,
    )

    model_desc = ModelDescription(
        [
            self.input_ids_desc,
            self.attention_mask_desc,
            self.token_type_ids_desc,
            self.masked_lm_labels_desc,
            self.next_sentence_label_desc,
        ],
        [self.loss_desc, self.prediction_scores_desc, self.seq_relationship_scores_desc],
    )

    from collections import namedtuple

    MyArgs = namedtuple(
        "MyArgs", "local_rank world_size max_steps learning_rate warmup_proportion batch_size seq_len"
    )
    args = MyArgs(
        local_rank=0,
        world_size=1,
        max_steps=100,
        learning_rate=0.00001,
        warmup_proportion=0.01,
        batch_size=13,
        seq_len=7,
    )

    def get_lr_this_step(global_step):
        return get_lr(args, global_step)

    loss_scaler = LossScaler('loss_scale_input_name', True, up_scale_window=2000)

    # It would be better to test all combinations of mixed precision and
    # allreduce_post_accumulation. However, a stress test of all four cases is not stable,
    # at least on the test machine, so we only test mixed precision together with
    # allreduce_post_accumulation because that is the most useful configuration.
    option_fp16 = [True]
    option_allreduce_post_accumulation = [True]
    option_gradient_accumulation_steps = [1, 8]
    option_use_internal_get_lr_this_step = [True, False]
    option_use_internal_loss_scaler = [True, False]
    option_split_batch = [BatchArgsOption.ListAndDict]

    for fp16 in option_fp16:
        for allreduce_post_accumulation in option_allreduce_post_accumulation:
            for gradient_accumulation_steps in option_gradient_accumulation_steps:
                for use_internal_get_lr_this_step in option_use_internal_get_lr_this_step:
                    for use_internal_loss_scaler in option_use_internal_loss_scaler:
                        for split_batch in option_split_batch:
                            print("gradient_accumulation_steps:", gradient_accumulation_steps)
                            print("use_internal_loss_scaler:", use_internal_loss_scaler)
                            loss_ort, prediction_scores_ort, seq_relationship_score_ort = run_test(
                                model,
                                model_desc,
                                self.device,
                                args,
                                gradient_accumulation_steps,
                                fp16,
                                allreduce_post_accumulation,
                                get_lr_this_step,
                                use_internal_get_lr_this_step,
                                loss_scaler,
                                use_internal_loss_scaler,
                                split_batch,
                            )
                            print(loss_ort)
                            print(prediction_scores_ort)
                            print(seq_relationship_score_ort)
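# Design note on the nested option loops above: the same sweep over a small option grid can
# be written with itertools.product. This is only a sketch of that loop structure; run_config
# is a hypothetical stand-in for the run_test call and not a drop-in replacement for the test.
import itertools

option_grid = {
    "fp16": [True],
    "allreduce_post_accumulation": [True],
    "gradient_accumulation_steps": [1, 8],
    "use_internal_get_lr_this_step": [True, False],
    "use_internal_loss_scaler": [True, False],
}


def run_config(**kwargs):
    # Hypothetical placeholder: a real sweep would call run_test(...) with these options.
    print("running with:", kwargs)


for values in itertools.product(*option_grid.values()):
    run_config(**dict(zip(option_grid.keys(), values)))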
def create_and_check_bert_for_pretraining(
    self,
    config,
    input_ids,
    token_type_ids,
    input_mask,
    sequence_labels,
    token_labels,
    choice_labels,
    option_fp16,
    option_allreduce_post_accumulation,
    option_gradient_accumulation_steps,
    option_split_batch,
    option_use_internal_get_lr_this_step=[True],
    option_use_internal_loss_scaler=[True],
):
    seed = 42
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    onnxruntime.set_seed(seed)

    model = BertForPreTraining(config=config)
    model.eval()
    loss, prediction_scores, seq_relationship_score = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        masked_lm_labels=token_labels,
        next_sentence_label=sequence_labels,
    )
    model_desc = ModelDescription(
        [
            self.input_ids_desc,
            self.attention_mask_desc,
            self.token_type_ids_desc,
            self.masked_lm_labels_desc,
            self.next_sentence_label_desc,
        ],
        [self.loss_desc, self.prediction_scores_desc, self.seq_relationship_scores_desc],
    )

    from collections import namedtuple

    MyArgs = namedtuple(
        "MyArgs", "local_rank world_size max_steps learning_rate warmup_proportion batch_size seq_len"
    )

    dataset_len = 100
    epochs = 8
    max_steps = epochs * dataset_len
    args = MyArgs(
        local_rank=0,
        world_size=1,
        max_steps=max_steps,
        learning_rate=0.00001,
        warmup_proportion=0.01,
        batch_size=13,
        seq_len=7,
    )

    def get_lr_this_step(global_step):
        return get_lr(args, global_step)

    loss_scaler = LossScaler("loss_scale_input_name", True, up_scale_window=2000)

    # For each option combination, run training once with the old ORTTrainer frontend and,
    # where the new frontend supports the combination, run it again and check that both
    # produce the same loss and outputs.
    for fp16 in option_fp16:
        for allreduce_post_accumulation in option_allreduce_post_accumulation:
            for gradient_accumulation_steps in option_gradient_accumulation_steps:
                for use_internal_get_lr_this_step in option_use_internal_get_lr_this_step:
                    for use_internal_loss_scaler in option_use_internal_loss_scaler:
                        for split_batch in option_split_batch:
                            print("gradient_accumulation_steps:", gradient_accumulation_steps)
                            print("split_batch:", split_batch)

                            seed = 42
                            random.seed(seed)
                            np.random.seed(seed)
                            torch.manual_seed(seed)
                            torch.cuda.manual_seed_all(seed)
                            onnxruntime.set_seed(seed)

                            (
                                old_api_loss_ort,
                                old_api_prediction_scores_ort,
                                old_api_seq_relationship_score_ort,
                            ) = run_test(
                                model,
                                model_desc,
                                self.device,
                                args,
                                gradient_accumulation_steps,
                                fp16,
                                allreduce_post_accumulation,
                                get_lr_this_step,
                                use_internal_get_lr_this_step,
                                loss_scaler,
                                use_internal_loss_scaler,
                                split_batch,
                                dataset_len,
                                epochs,
                                use_new_api=False,
                            )

                            random.seed(seed)
                            np.random.seed(seed)
                            torch.manual_seed(seed)
                            torch.cuda.manual_seed_all(seed)
                            onnxruntime.set_seed(seed)

                            if use_internal_get_lr_this_step and use_internal_loss_scaler:
                                (
                                    new_api_loss_ort,
                                    new_api_prediction_scores_ort,
                                    new_api_seq_relationship_score_ort,
                                ) = run_test(
                                    model,
                                    model_desc,
                                    self.device,
                                    args,
                                    gradient_accumulation_steps,
                                    fp16,
                                    allreduce_post_accumulation,
                                    get_lr_this_step,
                                    use_internal_get_lr_this_step,
                                    loss_scaler,
                                    use_internal_loss_scaler,
                                    split_batch,
                                    dataset_len,
                                    epochs,
                                    use_new_api=True,
                                )

                                assert_allclose(old_api_loss_ort, new_api_loss_ort)
                                assert_allclose(old_api_prediction_scores_ort, new_api_prediction_scores_ort)
                                assert_allclose(old_api_seq_relationship_score_ort, new_api_seq_relationship_score_ort)
def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask,
                                           sequence_labels, token_labels, choice_labels):
    model = BertForPreTraining(config=config)
    model.eval()
    loss, prediction_scores, seq_relationship_score = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        masked_lm_labels=token_labels,
        next_sentence_label=sequence_labels,
    )
    model_desc = ModelDescription(
        [
            self.input_ids_desc,
            self.attention_mask_desc,
            self.token_type_ids_desc,
            self.masked_lm_labels_desc,
            self.next_sentence_label_desc,
        ],
        [self.loss_desc, self.prediction_scores_desc, self.seq_relationship_scores_desc],
    )

    import argparse
    args_ = argparse.Namespace(fp16=True, amp_opt_level='O1')

    from collections import namedtuple
    MyArgs = namedtuple(
        "MyArgs", "local_rank world_size max_steps learning_rate warmup_proportion batch_size seq_len"
    )
    args = MyArgs(local_rank=0, world_size=1, max_steps=100, learning_rate=0.00001,
                  warmup_proportion=0.01, batch_size=13, seq_len=7)

    from train_with_ort_trainer import get_lr

    def get_lr_this_step(global_step):
        return get_lr(args, global_step)

    loss_scaler = LossScaler('loss_scale_input_name', True, up_scale_window=2000)

    option_gradient_accumulation_steps = [8]
    option_fp16 = [True, False]
    option_allreduce_post_accumulation = True
    option_use_internal_get_lr_this_step = False
    option_use_internal_loss_scaler = False
    # TODO: test with fetches

    for gradient_accumulation_steps in option_gradient_accumulation_steps:
        for fp16 in option_fp16:
            for option_split_batch in BatchArgsOption:
                loss_ort, prediction_scores_ort, seq_relationship_score_ort = run_test(
                    model,
                    model_desc,
                    self.device,
                    args,
                    gradient_accumulation_steps,
                    fp16,
                    option_allreduce_post_accumulation,
                    get_lr_this_step,
                    option_use_internal_get_lr_this_step,
                    loss_scaler,
                    option_use_internal_loss_scaler,
                    option_split_batch,
                )
                print(loss_ort)
                print(prediction_scores_ort)
                print(seq_relationship_score_ort)