def __init__(
        self,
        pretrained_bert_model,
        language,
        name,
        prediction_type,
        output_hidden_states,
        output_attentions,
        attention_length_before=1,
        attention_length_after=1,
        config_path=None,
        max_length=512,
        number_of_sentence=1,
        number_of_sentence_before=0,
        number_of_sentence_after=0,
        seed=1111,
        hidden_dropout_prob=0.,
        attention_probs_dropout_prob=0.,
        stop_attention_at_sent_before=None,
        stop_attention_before_sent=0,
):
    super(BertExtractor, self).__init__()
    # Load pre-trained model tokenizer (vocabulary)
    # Crucially, do not do basic tokenization; PTB is tokenized. Just do wordpiece tokenization.
    if config_path is None:
        configuration = BertConfig()
        configuration.hidden_dropout_prob = hidden_dropout_prob
        configuration.attention_probs_dropout_prob = attention_probs_dropout_prob
        configuration.output_hidden_states = output_hidden_states
        configuration.output_attentions = output_attentions
        self.model = BertModel.from_pretrained(pretrained_bert_model, config=configuration)
    else:
        self.model = BertModel.from_pretrained(pretrained_bert_model)
    self.tokenizer = AutoTokenizer.from_pretrained(pretrained_bert_model)

    self.language = language
    self.attention_length_before = attention_length_before
    self.attention_length_after = attention_length_after
    self.pretrained_bert_model = pretrained_bert_model
    self.NUM_HIDDEN_LAYERS = self.model.config.num_hidden_layers
    self.FEATURE_COUNT = self.model.config.hidden_size
    self.NUM_ATTENTION_HEADS = self.model.config.num_attention_heads
    self.name = name

    self.config = {
        'max_length': max_length,
        'seed': seed,
        'number_of_sentence': number_of_sentence,
        'number_of_sentence_before': number_of_sentence_before,
        'number_of_sentence_after': number_of_sentence_after,
        'attention_length_before': attention_length_before,
        'attention_length_after': attention_length_after,
        'stop_attention_at_sent_before': stop_attention_at_sent_before,
        'stop_attention_before_sent': stop_attention_before_sent,
        'output_hidden_states': output_hidden_states,
        'output_attentions': output_attentions,
        'model_type': 'bert',
        'hidden_size': self.model.config.hidden_size,
        'hidden_act': self.model.config.hidden_act,
        'initializer_range': self.model.config.initializer_range,
        'vocab_size': self.model.config.vocab_size,
        'hidden_dropout_prob': self.model.config.hidden_dropout_prob,
        'num_attention_heads': self.model.config.num_attention_heads,
        'type_vocab_size': self.model.config.type_vocab_size,
        'max_position_embeddings': self.model.config.max_position_embeddings,
        'num_hidden_layers': self.model.config.num_hidden_layers,
        'intermediate_size': self.model.config.intermediate_size,
        'attention_probs_dropout_prob': self.model.config.attention_probs_dropout_prob,
    }
    if config_path is not None:
        with open(config_path, 'r') as f:
            self.config.update(json.load(f))

    self.prediction_type = prediction_type  # ['sentence', 'token-level']
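# --- Minimal usage sketch (not part of the source) --------------------------
# Shows the third-party imports the constructor above relies on and how the
# surrounding BertExtractor class might be instantiated. The checkpoint name
# and all argument values below are illustrative assumptions; only the
# parameter names come from the signature above.
import json

from transformers import AutoTokenizer, BertConfig, BertModel

extractor = BertExtractor(
    pretrained_bert_model='bert-base-cased',  # assumed checkpoint name
    language='english',
    name='bert-base-cased',
    prediction_type='sentence',               # per the trailing comment: 'sentence' or 'token-level'
    output_hidden_states=True,
    output_attentions=False,
)
num_layers = extractor.NUM_HIDDEN_LAYERS      # e.g. 12 for a bert-base checkpoint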
def test_ipu_cpu_match(recompute_checkpoint, embedding_serialization):
    """
    Test that the BERT model run on the IPU approximately matches the same model run on the CPU.
    """
    import warnings
    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)

    # Config
    args = """
    --config unit_test
    --lr-schedule constant
    --layers-per-ipu 0 3
    --vocab-size 30400
    --batch-size 10
    --batches-per-step 1
    --gradient-accumulation 10
    --enable-half-partials False
    --optimizer AdamW
    --learning-rate 0.001
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))
    config.hidden_dropout_prob = 0.0
    config.attention_probs_dropout_prob = 0.0
    config.recompute_checkpoint_every_layer = recompute_checkpoint
    config.embedding_serialization = embedding_serialization

    # Models and options
    opts = get_options(config)
    opts.anchorMode(poptorch.AnchorMode.Final)
    model_cpu = PipelinedBertWithLoss(config).train()
    model_ipu = PipelinedBertWithLoss(config).train()
    model_ipu.load_state_dict(model_cpu.state_dict())

    # Check that the copy was successful
    assert model_ipu is not model_cpu
    assert all((a == b).all() for a, b in zip(model_cpu.parameters(), model_ipu.parameters()))

    optimizer_cpu = torch.optim.AdamW(model_cpu.parameters(), lr=0.001)
    optimizer_ipu = poptorch.optim.AdamW(model_ipu.parameters(), lr=0.001, loss_scaling=1.0)
    poptorch_model = poptorch.trainingModel(model_ipu, opts, optimizer=optimizer_ipu)

    # Input
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    inputs = tokenizer(
        "Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute yo"
        "Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute yo"
        "Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute yo"
        "Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute",
        return_tensors="pt")
    inputs['labels'] = torch.randint(0, config.vocab_size, [1, config.mask_tokens], dtype=torch.long)
    # Random next-sentence-prediction label in {0, 1}
    inputs['next_sentence_label'] = torch.randint(0, 2, [1], dtype=torch.long)
    inputs['masked_lm_positions'] = torch.randint(0, config.sequence_length, [1, config.mask_tokens], dtype=torch.long)

    batch_size = config.batch_size
    batch = (inputs['input_ids'].repeat(batch_size, 1),
             inputs['attention_mask'].repeat(batch_size, 1),
             inputs['token_type_ids'].repeat(batch_size, 1),
             inputs['masked_lm_positions'].repeat(batch_size, 1),
             inputs['labels'].repeat(batch_size, 1),
             inputs['next_sentence_label'].repeat(batch_size, 1))

    batch_cpu = (inputs['input_ids'].repeat(1, 1),
                 inputs['attention_mask'].repeat(1, 1),
                 inputs['token_type_ids'].repeat(1, 1),
                 inputs['masked_lm_positions'].repeat(1, 1),
                 inputs['labels'].repeat(1, 1),
                 inputs['next_sentence_label'].repeat(1, 1))

    # Training loop
    for step in range(10):
        # Step the CPU model, accumulating gradients over batch_size micro-batches
        optimizer_cpu.zero_grad()
        for b in range(batch_size):
            cpu_output = model_cpu(*batch_cpu)
            cpu_loss = cpu_output[0]
            cpu_loss.div(batch_size).backward()
        optimizer_cpu.step()

        # Step the IPU model (accumulation is handled by poptorch via the options)
        ipu_output = poptorch_model(*batch)
        ipu_loss = ipu_output[0]

        with torch.no_grad():
            print(f"CPU Loss: {cpu_loss}, IPU Loss: {ipu_loss}")
            # Check the losses are approximately equal
            assert np.allclose(cpu_loss.numpy(), ipu_loss.numpy(), atol=1e-6)
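# --- Hedged usage sketch (not part of the source) ----------------------------
# The test above receives its two arguments from pytest parametrization in the
# original file; the decorator values below are assumptions, as is the exact
# set of third-party imports. parse_bert_args, get_options, PipelinedBertWithLoss
# and BertConfig are repo-specific helpers whose module paths are not shown here.
import numpy as np
import pytest
import torch
import poptorch
from transformers import BertTokenizer


@pytest.mark.parametrize("recompute_checkpoint", [False, True])
@pytest.mark.parametrize("embedding_serialization", [1, 5])
def test_ipu_cpu_match_sketch(recompute_checkpoint, embedding_serialization):
    # Delegates to the test defined above; the parameter grids are illustrative.
    test_ipu_cpu_match(recompute_checkpoint, embedding_serialization)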