def create_and_check_bert_for_pretraining(
    self,
    config,
    input_ids,
    token_type_ids,
    input_mask,
    sequence_labels,
    token_labels,
    choice_labels,
):
    model = BertForPreTraining(config=config)
    model.eval()
    loss, prediction_scores, seq_relationship_score = model(
        input_ids, token_type_ids, input_mask, token_labels, sequence_labels)
    result = {
        "loss": loss,
        "prediction_scores": prediction_scores,
        "seq_relationship_score": seq_relationship_score,
    }
    self.parent.assertListEqual(
        list(result["prediction_scores"].size()),
        [self.batch_size, self.seq_length, self.vocab_size],
    )
    self.parent.assertListEqual(
        list(result["seq_relationship_score"].size()), [self.batch_size, 2])
    self.check_loss_output(result)
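# A minimal sketch (not from the source) of the kind of config and dummy tensors the
# checker above expects. The concrete sizes are illustrative assumptions; the real
# test harness builds them with a prepare_config_and_inputs-style helper.
import torch
from pytorch_transformers import BertConfig

batch_size, seq_length, vocab_size = 2, 7, 99
config = BertConfig(vocab_size_or_config_json_file=vocab_size,
                    hidden_size=32,
                    num_hidden_layers=2,
                    num_attention_heads=4,
                    intermediate_size=37)
input_ids = torch.randint(0, vocab_size, (batch_size, seq_length))
token_type_ids = torch.zeros_like(input_ids)           # single-segment inputs
input_mask = torch.ones_like(input_ids)                # no padding in the dummy batch
token_labels = torch.randint(0, vocab_size, (batch_size, seq_length))
sequence_labels = torch.randint(0, 2, (batch_size,))   # next-sentence labels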
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
    # Initialise PyTorch model
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = BertForPreTraining(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_bert(model, config, tf_checkpoint_path)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
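# A hedged usage sketch (not part of the original snippet): wiring the converter above
# into a small CLI. The argument names and example help strings are illustrative
# assumptions.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--tf_checkpoint_path", type=str, required=True,
                        help="Path to the TensorFlow checkpoint (e.g. bert_model.ckpt).")
    parser.add_argument("--bert_config_file", type=str, required=True,
                        help="The bert_config.json describing the pre-trained architecture.")
    parser.add_argument("--pytorch_dump_path", type=str, required=True,
                        help="Where to write the converted PyTorch state dict.")
    cli_args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch(cli_args.tf_checkpoint_path,
                                     cli_args.bert_config_file,
                                     cli_args.pytorch_dump_path)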
def convert_ckpt_compatible(ckpt_path, config_path):
    ckpt = torch.load(ckpt_path, map_location='cpu')
    keys = list(ckpt.keys())
    for key in keys:
        if 'LayerNorm' in key:
            if 'gamma' in key:
                ckpt[key.replace('gamma', 'weight')] = ckpt.pop(key)
            else:
                ckpt[key.replace('beta', 'bias')] = ckpt.pop(key)
    model_config = BertConfig.from_json_file(config_path)
    model = BertForPreTraining(model_config)
    model.load_state_dict(ckpt)
    new_ckpt = model.bert.state_dict()
    return new_ckpt
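# A hedged usage sketch: older BERT checkpoints name LayerNorm parameters gamma/beta,
# while current PyTorch BERT implementations use weight/bias, which is what the
# renaming loop above handles. The file names below are illustrative assumptions.
compatible_sd = convert_ckpt_compatible('old_bert_checkpoint.bin', 'bert_config.json')
torch.save(compatible_sd, 'bert_encoder_compatible.bin')
# The returned dict holds only the encoder weights (model.bert.state_dict()), so it
# can be loaded straight into a plain BertModel built from the same config.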
def get_bert(bert_model, bert_do_lower_case, use_albert=False, use_sparse=False, use_electra=False):
    # Avoid a hard dependency on BERT by only importing it if it's being used
    from pytorch_transformers import BertTokenizer, BertModel, BertForPreTraining
    tokenizer = BertTokenizer.from_pretrained(bert_model,
                                              do_lower_case=bert_do_lower_case)
    bert = BertForPreTraining.from_pretrained(bert_model,
                                              use_albert=use_albert,
                                              use_sparse=use_sparse,
                                              use_electra=use_electra)
    return tokenizer, bert
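# A hedged usage sketch for get_bert above. 'bert-base-uncased' is only an example
# checkpoint, and the use_albert/use_sparse/use_electra keywords assume a fork of
# BertForPreTraining that understands them; stock pytorch_transformers does not.
import torch

tokenizer, bert = get_bert('bert-base-uncased', bert_do_lower_case=True)
bert.eval()
input_ids = torch.tensor([tokenizer.encode("Example sentence for the encoder.")])
with torch.no_grad():
    prediction_scores, seq_relationship_score = bert(input_ids)[:2]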
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_corpus",
                        default=None,
                        type=str,
                        required=True,
                        help="The input train corpus.")
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help="Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--on_memory",
                        action='store_true',
                        help="Whether to load train samples into memory or use disk")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError(
            "Training is currently the only implemented execution option. Please set `do_train`.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    #train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.train_corpus)
        train_dataset = BERTDataset(args.train_corpus,
                                    tokenizer,
                                    seq_len=args.max_seq_length,
                                    corpus_lines=None,
                                    on_memory=args.on_memory)
        num_train_optimization_steps = int(
            len(train_dataset) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    model = BertForPreTraining.from_pretrained(args.bert_model, from_tf=False)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        }, {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]

        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
            # NOTE: this follows the old pytorch_pretrained_bert-style schedule (no
            # optimizer argument); pytorch_transformers' WarmupLinearSchedule wraps an
            # optimizer instead, so the fp16 path still relies on that older helper.
            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
                                                 t_total=num_train_optimization_steps)
        else:
            # optimizer = BertAdam(optimizer_grouped_parameters,
            #                      lr=args.learning_rate,
            #                      warmup=args.warmup_proportion,
            #                      t_total=num_train_optimization_steps)
            optimizer = AdamW(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              eps=1e-8)
            scheduler = WarmupLinearSchedule(optimizer,
                                             warmup_steps=0,
                                             t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            # TODO: check if this works with current data generator from disk that relies on next(file)
            # (it doesn't return item back by index)
            train_sampler = DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next)[0]
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with the special warmup BERT uses;
                        # in the non-fp16 path the WarmupLinearSchedule below handles this
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    if not args.fp16:
                        # AdamW does not apply warmup/decay on its own (unlike the old
                        # BertAdam), so the schedule has to be stepped explicitly
                        scheduler.step()
                    optimizer.zero_grad()
                    global_step += 1

    # Save a trained model
    logger.info("** ** * Saving fine-tuned model ** ** * ")
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
    output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
    output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
    if args.do_train:
        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)
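# A hedged follow-up sketch (not from the source): the directory written by main()
# contains WEIGHTS_NAME, CONFIG_NAME and the saved vocabulary, so it can be reloaded
# directly with from_pretrained. The helper name below is hypothetical.
def load_pretrained_checkpoint(output_dir, do_lower_case=True):
    reloaded_model = BertForPreTraining.from_pretrained(output_dir)
    reloaded_tokenizer = BertTokenizer.from_pretrained(output_dir,
                                                       do_lower_case=do_lower_case)
    return reloaded_model, reloaded_tokenizer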
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Extract some layers of the full BertForPreTraining for Transfer Learned Distillation")
    parser.add_argument("--bert_model", default='bert-base-uncased', type=str)
    parser.add_argument(
        "--dump_checkpoint",
        default='serialization_dir/transfer_learning_checkpoint_0247911.pth',
        type=str)
    parser.add_argument("--vocab_transform", action='store_true')
    args = parser.parse_args()

    model = BertForPreTraining.from_pretrained(args.bert_model)
    state_dict = model.state_dict()
    compressed_sd = {}

    for w in ['word_embeddings', 'position_embeddings']:
        compressed_sd[f'distilbert.embeddings.{w}.weight'] = \
            state_dict[f'bert.embeddings.{w}.weight']
    for w in ['weight', 'bias']:
        compressed_sd[f'distilbert.embeddings.LayerNorm.{w}'] = \
            state_dict[f'bert.embeddings.LayerNorm.{w}']

    std_idx = 0
    for teacher_idx in [0, 2, 4, 7, 9, 11]:
        for w in ['weight', 'bias']:
            compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.q_lin.{w}'] = \
                state_dict[f'bert.encoder.layer.{teacher_idx}.attention.self.query.{w}']
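# Hedged sketch of how a mapping like compressed_sd is typically finished and dumped
# (the extraction loop above is truncated in this excerpt): the remaining attention,
# FFN and LayerNorm weights are copied the same way, std_idx is incremented per
# teacher layer, and the result is written with torch.save. The print is illustrative.
import torch

print(f'Number of compressed parameters: {len(compressed_sd.keys())}')
torch.save(compressed_sd, args.dump_checkpoint)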
import numpy as np
import torch
import torch.nn as nn
from pytorch_transformers import (
    WEIGHTS_NAME, AdamW, WarmupLinearSchedule, BertConfig, BertForMaskedLM,
    BertTokenizer, BertForPreTraining, GPT2Config, GPT2LMHeadModel,
    GPT2Tokenizer, OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
    RobertaConfig, RobertaForMaskedLM, RobertaTokenizer)

## extract last layer attention ??
config = BertConfig.from_pretrained('bert-base-uncased')
config.output_attentions = True
config.output_hidden_states = True
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForPreTraining(config)
model.eval()

input_ids1 = tokenizer.encode("Hello, my dog is cute")  # Batch size 1
input_ids2 = tokenizer.encode("Hello, my dog is one")
input_ids = torch.tensor([input_ids1, input_ids2])
outputs = model(input_ids)

word_dot_distance = torch.randn(2, 1, 4, 3)  ## 2 batch
word_word_relation = torch.LongTensor(
    np.round(np.random.uniform(size=(2, 1, 4, 4), low=0, high=2)))
out = torch.gather(word_dot_distance, dim=3, index=word_word_relation)

distance_type = nn.Embedding(3, 5, padding_idx=0)
distance_type.weight
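# Hedged note on the gather/embedding experiment above: the index values produced by
# rounding uniform(0, 2) fall in {0, 1, 2}, which are valid positions along the last
# dimension of word_dot_distance, and torch.gather returns a tensor shaped like the
# index tensor.
print(out.shape)                    # torch.Size([2, 1, 4, 4])
print(distance_type.weight.shape)   # torch.Size([3, 5]); row 0 is the padding row (zeros)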
import numpy as np
import torch
from pytorch_transformers import (
    WEIGHTS_NAME, AdamW, WarmupLinearSchedule, BertConfig, BertForMaskedLM,
    BertTokenizer, BertForPreTraining, GPT2Config, GPT2LMHeadModel,
    GPT2Tokenizer, OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
    RobertaConfig, RobertaForMaskedLM, RobertaTokenizer)

## extract last layer attention ??
config = BertConfig.from_pretrained('bert-base-uncased')
config.output_attentions = True
config.output_hidden_states = True
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForPreTraining(config)
model.eval()

model.resize_token_embeddings(60000)
# valid indices after resizing are 0..59999
model.bert.embeddings(torch.LongTensor(np.array([[0, 59999], [4, 50000]])))

input_ids1 = tokenizer.encode("Hello, my dog is cute")  # Batch size 1
input_ids2 = tokenizer.encode("Hello, my dog is one")
input_ids = torch.tensor([input_ids1, input_ids2])
outputs = model(input_ids)
# (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions)
hidden = outputs[-2]
layers = outputs[-1]  ## 12 layers
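# Hedged inspection sketch: with output_hidden_states and output_attentions enabled and
# no labels passed, the output tuple ends with the hidden states and attention maps.
# Shapes assume the bert-base configuration used above.
all_hidden_states = outputs[-2]     # tuple of 13 tensors: embedding output + 12 layers
all_attentions = outputs[-1]        # tuple of 12 tensors, one per layer
print(all_hidden_states[-1].shape)  # (batch_size, seq_len, 768)
print(all_attentions[-1].shape)     # (batch_size, 12 heads, seq_len, seq_len)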
        prediction_scores, seq_relationship_score = self.cls(
            sequence_output, pooled_output
        )
        mean_pooled_output = torch.mean(sequence_output, dim=1)
        mean_pooled_output = self.dropout(mean_pooled_output)
        logits = self.classifier(mean_pooled_output)
        outputs = (prediction_scores, seq_relationship_score, logits)
        return outputs


config = BertConfig(str(PATH_TO_CKPT_CONFIG / "config.json"))
model = BertPretrain(config, len(TARGETS))

# Prepare extended bert embedding
orig_bert = BertForPreTraining.from_pretrained("bert-base-cased")
orig_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
state_dict = orig_bert.state_dict()
del state_dict["cls.predictions.decoder.weight"], state_dict["cls.predictions.bias"]
orig_embedding = state_dict["bert.embeddings.word_embeddings.weight"]

# `tokenizer` is the extended-vocabulary tokenizer built elsewhere in this script
extra_tokens = list(tokenizer.vocab.keys())[len(orig_tokenizer.vocab):]
new_tokens_as_orig_indices = [[i] for i in range(len(orig_tokenizer.vocab))] + [
    orig_tokenizer.encode(t, add_special_tokens=False) for t in extra_tokens
]
new_embedding = torch.zeros(len(new_tokens_as_orig_indices), orig_embedding.shape[-1])
new_embedding.normal_(mean=0.0, std=0.02)
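# A hedged continuation sketch (an assumption about where the snippet is heading, not
# code from the source): a common way to fill new_embedding is to average the original
# wordpiece vectors listed in new_tokens_as_orig_indices, so extended-vocabulary tokens
# start close to the pieces they are built from.
for row, piece_ids in enumerate(new_tokens_as_orig_indices):
    if piece_ids:  # keep the random init for tokens that map to no original pieces
        new_embedding[row] = orig_embedding[piece_ids].mean(dim=0)
state_dict["bert.embeddings.word_embeddings.weight"] = new_embedding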