def __init__(self, bert_dir):
    super().__init__()
    self.bert_config = RobertaConfig.from_pretrained(bert_dir,
                                                     output_hidden_states=False)
    self.intermediate = RobertaModel.from_pretrained(bert_dir)
    self.span_info_collect = SICModel(self.bert_config.hidden_size)
    self.interpretation = InterpretationModel(self.bert_config.hidden_size)
    self.output = nn.Linear(self.bert_config.hidden_size,
                            self.bert_config.num_labels)
def load(cls, pretrained_model_name_or_path, language=None, **kwargs):
    """
    Load a language model either by supplying

    * the name of a remote model on s3 ("roberta-base" ...)
    * or a local path of a model trained via transformers ("some_dir/huggingface_model")
    * or a local path of a model trained via FARM ("some_dir/farm_model")

    :param pretrained_model_name_or_path: name or path of a model
    :param language: (Optional) Name of language the model was trained for (e.g. "german").
                     If not supplied, FARM will try to infer it from the model name.
    :return: Language Model
    """
    roberta = cls()
    if "farm_lm_name" in kwargs:
        roberta.name = kwargs["farm_lm_name"]
    else:
        roberta.name = pretrained_model_name_or_path
    # We need to differentiate between loading model using FARM format and Pytorch-Transformers format
    farm_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json"
    if os.path.exists(farm_lm_config):
        # FARM style
        config = RobertaConfig.from_pretrained(farm_lm_config)
        farm_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin"
        roberta.model = RobertaModel.from_pretrained(farm_lm_model,
                                                     config=config,
                                                     **kwargs)
        roberta.language = roberta.model.config.language
    else:
        # Huggingface transformers style
        roberta.model = RobertaModel.from_pretrained(
            str(pretrained_model_name_or_path), **kwargs)
        roberta.language = cls._get_or_infer_language_from_name(
            language, pretrained_model_name_or_path)
    return roberta
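# Hedged usage sketch for the `load` classmethod above. `Roberta` is a
# hypothetical name for the wrapper class that exposes this method; adjust it
# to the actual class in your codebase.
lm_remote = Roberta.load("roberta-base")        # remote Hugging Face model by name
lm_farm = Roberta.load("some_dir/farm_model")   # FARM-style checkpoint directory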
def init_data(self, use_cuda) -> None:
    torch.set_grad_enabled(False)
    torch.set_num_threads(4)
    turbo_transformers.set_num_threads(4)

    self.test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')

    self.cfg = RobertaConfig()
    self.torch_model = RobertaModel(self.cfg)
    self.torch_model.eval()
    if torch.cuda.is_available():
        self.torch_model.to(self.test_device)

    self.turbo_model = turbo_transformers.RobertaModel.from_torch(
        self.torch_model, self.test_device)
def test(use_cuda):
    torch.set_grad_enabled(False)
    torch.set_num_threads(4)
    turbo_transformers.set_num_threads(4)

    test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')

    cfg = RobertaConfig()
    torch_model = RobertaModel(cfg)
    torch_model.eval()
    if torch.cuda.is_available():
        torch_model.to(test_device)

    turbo_model = turbo_transformers.RobertaModel.from_torch(
        torch_model, test_device)

    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(1, 10),
                              dtype=torch.long,
                              device=test_device)

    torch_result = torch_model(input_ids)
    torch_result_final = torch_result[0][:, 0].cpu().numpy()

    turbo_result = turbo_model(input_ids)
    turbo_result_final = turbo_result[0].cpu().numpy()

    # See the differences
    # print(numpy.size(torch_result_final), numpy.size(turbo_result_final))
    # print(torch_result_final - turbo_result_final)
    assert (numpy.allclose(torch_result_final,
                           turbo_result_final,
                           atol=1e-3,
                           rtol=1e-3))
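# Minimal driver for the comparison test above (a sketch; the surrounding
# script may already define its own entry point):
if __name__ == "__main__":
    test(use_cuda=torch.cuda.is_available())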
def convert_pytorch_to_roberta_checkpoint(pytorch_checkpoint_path: str,
                                          roberta_dump_folder_path: str):
    """
    Copy/paste/tweak roberta's weights to our BERT structure.
    """
    import pickle
    model = RobertaForMaskedLM.from_pretrained(pytorch_checkpoint_path)
    config = RobertaConfig.from_pretrained(pytorch_checkpoint_path)
    from argparse import Namespace
    huggingface_train_args = Namespace(
        **vars(torch.load(f"{pytorch_checkpoint_path}/training_args.bin")))
    model.eval()  # disable dropout
    # tokenizer = RobertaTokenizer.from_pretrained(roberta_checkpoint_path)
    if config.num_hidden_layers == 12:
        roberta = FairseqRobertaModel.from_pretrained("roberta.base")
    elif config.num_hidden_layers == 24:
        roberta = FairseqRobertaModel.from_pretrained("roberta.large")
    else:
        raise Exception("Only roberta LM is supported!")
    roberta.eval()
    # roberta_sent_encoder = roberta.model.decoder.sentence_encoder

    # update config from huggingface and reuse lots of settings from fairseq pretrained
    roberta.args.warmup_updates = huggingface_train_args.warmup_steps
    roberta.args.weight_decay = huggingface_train_args.weight_decay
    roberta.args.adam_eps = huggingface_train_args.adam_epsilon
    roberta.args.clip_norm = huggingface_train_args.max_grad_norm
    roberta.args.max_update = huggingface_train_args.max_steps
    roberta.args.total_num_update = huggingface_train_args.max_steps
    roberta.args.save_interval_updates = huggingface_train_args.save_steps

    roberta.args.attention_dropout = config.attention_probs_dropout_prob
    roberta.args.encoder_embed_dim = config.hidden_size
    roberta.args.encoder_ffn_embed_dim = config.intermediate_size
    roberta.args.activation_fn = config.hidden_act
    roberta.args.activation_dropout = config.hidden_dropout_prob
    roberta.args.encoder_layers = config.num_hidden_layers
    roberta.args.encoder_attention_heads = config.num_attention_heads
    roberta.args.__dict__.update(huggingface_train_args.__dict__)

    # Embeddings
    roberta.model.decoder.sentence_encoder.embed_tokens.weight = model.roberta.embeddings.word_embeddings.weight
    roberta.model.decoder.sentence_encoder.embed_positions.weight = model.roberta.embeddings.position_embeddings.weight
    model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
        model.roberta.embeddings.token_type_embeddings.weight
    )  # just zero them out b/c RoBERTa doesn't use them.
    roberta.model.decoder.sentence_encoder.emb_layer_norm.weight = model.roberta.embeddings.LayerNorm.weight
    roberta.model.decoder.sentence_encoder.emb_layer_norm.bias = model.roberta.embeddings.LayerNorm.bias

    for i in range(config.num_hidden_layers):
        # Encoder: start of layer
        layer: BertLayer = model.roberta.encoder.layer[i]
        # roberta.model.decoder.sentence_encoder.layers[i]: TransformerSentenceEncoderLayer = roberta.model.decoder.sentence_encoder.layers[i]

        # self attention
        self_attn: BertSelfAttention = layer.attention.self
        assert (roberta.model.decoder.sentence_encoder.layers[i].self_attn.
                k_proj.weight.data.shape == roberta.model.decoder.
                sentence_encoder.layers[i].self_attn.q_proj.weight.data.shape
                == roberta.model.decoder.sentence_encoder.layers[i].self_attn.
                v_proj.weight.data.shape == torch.Size(
                    (config.hidden_size, config.hidden_size)))

        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.q_proj.weight = self_attn.query.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.q_proj.bias = self_attn.query.bias
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.k_proj.weight = self_attn.key.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.k_proj.bias = self_attn.key.bias
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.v_proj.weight = self_attn.value.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.v_proj.bias = self_attn.value.bias

        # self-attention output
        self_output: BertSelfOutput = layer.attention.output
        assert self_output.dense.weight.shape == roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.out_proj.weight.shape
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.out_proj.weight = self_output.dense.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.out_proj.bias = self_output.dense.bias
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn_layer_norm.weight = self_output.LayerNorm.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn_layer_norm.bias = self_output.LayerNorm.bias

        # intermediate
        intermediate: BertIntermediate = layer.intermediate
        assert intermediate.dense.weight.shape == roberta.model.decoder.sentence_encoder.layers[
            i].fc1.weight.shape
        roberta.model.decoder.sentence_encoder.layers[
            i].fc1.weight = intermediate.dense.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].fc1.bias = intermediate.dense.bias

        # output
        bert_output: BertOutput = layer.output
        assert bert_output.dense.weight.shape == roberta.model.decoder.sentence_encoder.layers[
            i].fc2.weight.shape
        roberta.model.decoder.sentence_encoder.layers[
            i].fc2.weight = bert_output.dense.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].fc2.bias = bert_output.dense.bias
        roberta.model.decoder.sentence_encoder.layers[
            i].final_layer_norm.weight = bert_output.LayerNorm.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].final_layer_norm.bias = bert_output.LayerNorm.bias

    # LM Head
    roberta.model.decoder.lm_head.dense.weight = model.lm_head.dense.weight
    roberta.model.decoder.lm_head.dense.bias = model.lm_head.dense.bias
    roberta.model.decoder.lm_head.layer_norm.weight = model.lm_head.layer_norm.weight
    roberta.model.decoder.lm_head.layer_norm.bias = model.lm_head.layer_norm.bias
    roberta.model.decoder.lm_head.weight = model.lm_head.decoder.weight
    roberta.model.decoder.lm_head.bias = model.lm_head.decoder.bias

    input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(
        0)  # batch of size 1

    their_output = model(input_ids)[0]
    our_output = roberta.model(input_ids)[0]

    print(our_output.shape, their_output.shape)
    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
    print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
    copy_success = torch.allclose(our_output, their_output, atol=1e-3)
    print("Do both models output the same tensors?",
          "🔥" if copy_success else "💩")
    if not copy_success:
        raise Exception("Something went wRoNg")

    pathlib.Path(roberta_dump_folder_path).mkdir(parents=True, exist_ok=True)
    print(f"Saving model to {roberta_dump_folder_path}")
    from fairseq import checkpoint_utils
    state_dict = {
        "args": roberta.args,
        "model": roberta.model.state_dict(),
        # these last two were copied from the fairseq pretrained checkpoint just to make the .from_pretrained() function work
        "extra_state": {
            'train_iterator': {
                'epoch': 0
            },
            'val_loss': 1.4955725940408326
        },
        "optimizer_history": [{
            'criterion_name': 'MaskedLmLoss',
            'optimizer_name': 'MemoryEfficientFP16Optimizer',
            'lr_scheduler_state': {
                'best': 1.495530066777925
            },
            'num_updates': 500000
        }]
    }
    # checkpoint_utils.save_state(f"{roberta_dump_folder_path}/model.pt", roberta.args, roberta.state_dict(), )
    # del model
    checkpoint_utils.torch_persistent_save(
        state_dict, f"{roberta_dump_folder_path}/model.pt")

    loaded_model = FairseqRobertaModel.from_pretrained(
        roberta_dump_folder_path)
    loaded_model.eval()
    # roberta.model(input_ids)
    # loaded_model.model(input_ids)
    del state_dict
    copied_dict = roberta.state_dict()
    loaded_dict = loaded_model.state_dict()
    assert loaded_model.state_dict().keys() == roberta.state_dict().keys()
    for k in roberta.state_dict().keys():
        loaded_val = loaded_dict[k]
        copied_val = copied_dict[k]
        if not torch.allclose(loaded_val, copied_val, atol=1e-3):
            print(k)
    loaded_output = loaded_model.model(input_ids)[0]
    save_success = torch.allclose(our_output, loaded_output, atol=1e-3)
    print("Do both models output the same tensors?",
          "🔥" if save_success else "💩")
    if not save_success:
        raise Exception("Something went wRoNg")
    # except:
    #     print("Fail to save")
    #     torch.save(roberta, f"{roberta_dump_folder_path}/model.pt")
    print("Done")
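# Hedged usage sketch for the converter above; the two paths are hypothetical
# placeholders and are normally wired up through argparse in the original script.
convert_pytorch_to_roberta_checkpoint(
    pytorch_checkpoint_path="path/to/huggingface_roberta",
    roberta_dump_folder_path="path/to/fairseq_dump")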
def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path: str,
                                          pytorch_dump_folder_path: str,
                                          classification_head: bool):
    """
    Copy/paste/tweak roberta's weights to our BERT structure.
    """
    roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
    roberta.eval()  # disable dropout
    roberta_sent_encoder = roberta.model.decoder.sentence_encoder
    config = RobertaConfig(
        vocab_size=roberta_sent_encoder.embed_tokens.num_embeddings,
        hidden_size=roberta.args.encoder_embed_dim,
        num_hidden_layers=roberta.args.encoder_layers,
        num_attention_heads=roberta.args.encoder_attention_heads,
        intermediate_size=roberta.args.encoder_ffn_embed_dim,
        max_position_embeddings=514,
        type_vocab_size=1,
        layer_norm_eps=1e-5,  # PyTorch default used in fairseq
    )
    if classification_head:
        config.num_labels = roberta.args.num_classes
    print("Our BERT config:", config)

    model = RobertaForSequenceClassification(
        config) if classification_head else RobertaForMaskedLM(config)
    model.eval()

    # Now let's copy all the weights.
    # Embeddings
    model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
    model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
    model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
        model.roberta.embeddings.token_type_embeddings.weight
    )  # just zero them out b/c RoBERTa doesn't use them.
    model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight
    model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias

    for i in range(config.num_hidden_layers):
        # Encoder: start of layer
        layer: BertLayer = model.roberta.encoder.layer[i]
        roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[
            i]

        # self attention
        self_attn: BertSelfAttention = layer.attention.self
        assert (roberta_layer.self_attn.k_proj.weight.data.shape ==
                roberta_layer.self_attn.q_proj.weight.data.shape ==
                roberta_layer.self_attn.v_proj.weight.data.shape == torch.Size(
                    (config.hidden_size, config.hidden_size)))

        self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight
        self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias
        self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight
        self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias
        self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight
        self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias

        # self-attention output
        self_output: BertSelfOutput = layer.attention.output
        assert self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape
        self_output.dense.weight = roberta_layer.self_attn.out_proj.weight
        self_output.dense.bias = roberta_layer.self_attn.out_proj.bias
        self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight
        self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias

        # intermediate
        intermediate: BertIntermediate = layer.intermediate
        assert intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape
        intermediate.dense.weight = roberta_layer.fc1.weight
        intermediate.dense.bias = roberta_layer.fc1.bias

        # output
        bert_output: BertOutput = layer.output
        assert bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape
        bert_output.dense.weight = roberta_layer.fc2.weight
        bert_output.dense.bias = roberta_layer.fc2.bias
        bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight
        bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias
        # end of layer

    if classification_head:
        model.classifier.dense.weight = roberta.model.classification_heads[
            "mnli"].dense.weight
        model.classifier.dense.bias = roberta.model.classification_heads[
            "mnli"].dense.bias
        model.classifier.out_proj.weight = roberta.model.classification_heads[
            "mnli"].out_proj.weight
        model.classifier.out_proj.bias = roberta.model.classification_heads[
            "mnli"].out_proj.bias
    else:
        # LM Head
        model.lm_head.dense.weight = roberta.model.decoder.lm_head.dense.weight
        model.lm_head.dense.bias = roberta.model.decoder.lm_head.dense.bias
        model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight
        model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias
        model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight
        model.lm_head.decoder.bias = roberta.model.decoder.lm_head.bias

    # Let's check that we get the same results.
    input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(
        0)  # batch of size 1

    our_output = model(input_ids)[0]
    if classification_head:
        their_output = roberta.model.classification_heads["mnli"](
            roberta.extract_features(input_ids))
    else:
        their_output = roberta.model(input_ids)[0]
    print(our_output.shape, their_output.shape)
    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
    print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
    success = torch.allclose(our_output, their_output, atol=1e-3)
    print("Do both models output the same tensors?",
          "🔥" if success else "💩")
    if not success:
        raise Exception("Something went wRoNg")

    pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
    print(f"Saving model to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
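# Hedged usage sketch for the converter above; the checkpoint paths are
# hypothetical placeholders (the original script supplies them via argparse).
convert_roberta_checkpoint_to_pytorch(
    roberta_checkpoint_path="path/to/fairseq_roberta",
    pytorch_dump_folder_path="path/to/pytorch_dump",
    classification_head=False)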
# Loading data
print('Loading data...')
dev_dataset, dev_features = load_roberta_data(feature_dir=args.dev_feat_dir,
                                              output_features=True,
                                              evaluate=True)
eval_dataloader = DataLoader(dev_dataset,
                             shuffle=False,
                             batch_size=args.eval_batch_size)
dev_steps_per_epoch = len(dev_features) // args.eval_batch_size
if len(dev_dataset) % args.eval_batch_size != 0:
    dev_steps_per_epoch += 1

for i, init_restore_dir in enumerate(args.init_restore_dir):
    bert_config = RobertaConfig.from_json_file(args.bert_config_file)
    model = RobertaJointForNQ2(RobertaModel(bert_config), bert_config)
    utils.torch_show_all_params(model)
    utils.torch_init_model(model, init_restore_dir)
    if args.float16:
        model.half()
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    roberta_evaluate(model, args, dev_features, device, i)

roberta_ensemble(args)

# STEP 5: predict short answers and merge the results
from albert_modeling import AlBertJointForShort, AlbertConfig
def main():
    parser = ArgumentParser()
    parser.add_argument('--pregenerated_neg_data', type=Path, required=True)
    parser.add_argument('--pregenerated_pos_data', type=Path, required=True)
    parser.add_argument('--validation_neg_data', type=Path, required=True)
    parser.add_argument('--validation_pos_data', type=Path, required=True)
    parser.add_argument('--pregenerated_data', type=Path, required=True)
    parser.add_argument('--output_dir', type=Path, required=True)
    parser.add_argument('--exp_group', type=str, required=True)
    parser.add_argument(
        "--bert_model",
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--method",
                        type=str,
                        choices=[
                            'neg_samebatch', 'distill_samebatch',
                            'distill_samebatch_lstm', 'distill', 'kl',
                            'unlikelihood'
                        ])
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument("--save_before", action='store_true')
    parser.add_argument(
        "--reduce_memory",
        action="store_true",
        help="Store training data as on-disc memmaps to massively reduce memory usage")
    parser.add_argument("--max_seq_len", default=512, type=int)
    parser.add_argument('--overwrite_cache',
                        action='store_true',
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--epochs",
                        type=int,
                        default=3,
                        help="Number of epochs to train for")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--port_idx", type=int)
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--valid_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for validation.")
    parser.add_argument("--kr_freq", default=0.0, type=float)
    parser.add_argument("--mlm_freq", default=0, type=float)
    parser.add_argument("--kl_w", default=1000, type=float)
    parser.add_argument("--ul_w", default=1, type=float)
    parser.add_argument("--gamma",
                        default=0.5,
                        type=float,
                        help="coeff of UL and 1-coeff of LL")
    parser.add_argument('--no_mlm',
                        action='store_true',
                        help="don't do any MLM training")
    parser.add_argument("--no_tie",
                        action='store_true',
                        help="don't tie weights")
    parser.add_argument('--no_ul',
                        action='store_true',
                        help="don't do any UL training")
    parser.add_argument('--no_ll',
                        action='store_true',
                        help="don't do any LL training")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--learning_rate",
                        default=1e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    args = parser.parse_args()

    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"

    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(
                f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})."
            )
            print(
                "This script will loop over the available data, but training diversity may be negatively impacted."
            )
            num_data_epochs = i
            break
    else:
        num_data_epochs = args.epochs

    if args.local_rank == -1 or args.no_cuda:
        print(torch.cuda.is_available())
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
        print("Num of gpus: ", n_gpu)
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        print("GPU Device: ", device)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        dist_comms.init_distributed_training(args.local_rank, args.port_idx)

    logging.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # if n_gpu > 0:
    torch.cuda.manual_seed_all(args.seed)

    pt_output = Path(getenv('PT_OUTPUT_DIR', ''))
    args.output_dir = Path(os.path.join(pt_output, args.output_dir))

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(
            f"Output directory ({args.output_dir}) already exists and is not empty!"
        )
    args.output_dir.mkdir(parents=True, exist_ok=True)

    if args.bert_model != "roberta-base":
        tokenizer = BertTokenizer.from_pretrained(
            args.bert_model, do_lower_case=args.do_lower_case)
    else:
        tokenizer = RobertaTokenizer.from_pretrained(
            args.bert_model, do_lower_case=args.do_lower_case)
        tokenizer.vocab = tokenizer.encoder

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(total_train_examples /
                                       args.train_batch_size /
                                       args.gradient_accumulation_steps)
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
        )

    # Prepare model
    if args.bert_model != "roberta-base":
        if args.method == "neg_samebatch":
            config = BertConfig.from_pretrained(args.bert_model)
            config.bert_model = args.bert_model
            core_model = BertForNegSameBatch.from_pretrained(args.bert_model,
                                                             args.gamma,
                                                             config=config)
            core_model.init_orig_bert()
        elif args.method == "unlikelihood":
            config = BertConfig.from_pretrained(args.bert_model)
            core_model = BertForNegPreTraining.from_pretrained(args.bert_model,
                                                               config=config)
        else:
            raise NotImplementedError(
                f"method {args.method} is not implemented")
    else:
        config = RobertaConfig.from_pretrained(args.bert_model)
        core_model = RobertaForNegPreTraining.from_pretrained(args.bert_model)

    core_model = core_model.to(device)

    # Prepare optimizer
    param_optimizer = list(core_model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=num_train_optimization_steps)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        core_model, optimizer = amp.initialize(core_model,
                                               optimizer,
                                               opt_level=args.fp16_opt_level)

    model = torch.nn.parallel.DistributedDataParallel(
        core_model,
        device_ids=[args.local_rank],
        output_device=args.local_rank,
        find_unused_parameters=True)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f" Num examples = {total_train_examples}")
    logging.info(" Batch size = %d", args.train_batch_size)
    logging.info(" Num steps = %d", num_train_optimization_steps)
    model.train()

    if args.local_rank == 0 or args.local_rank == -1:
        if args.save_before:
            before_train_path = Path(
                os.path.join(args.output_dir, "before_training"))
            print("Before training path: ", before_train_path)
            before_train_path.mkdir(parents=True, exist_ok=True)
            model.module.save_pretrained(
                os.path.join(args.output_dir, "before_training"))
            tokenizer.save_pretrained(
                os.path.join(args.output_dir, "before_training"))

        # writer = SummaryWriter(log_dir=args.output_dir)
        wandb.init(project="neg_v2",
                   name=str(args.output_dir).split("/")[-1],
                   group=args.exp_group,
                   entity='negation')

    mlm_averagemeter = AverageMeter()
    ul_averagemeter = AverageMeter()
    ll_averagemeter = AverageMeter()
    kl_averagemeter = AverageMeter()

    neg_epoch_dataset = PregeneratedDataset(
        epoch=0,
        training_path=args.pregenerated_neg_data,
        tokenizer=tokenizer,
        num_data_epochs=num_data_epochs,
        reduce_memory=args.reduce_memory)
    pos_epoch_dataset = PregeneratedDataset(
        epoch=0,
        training_path=args.pregenerated_pos_data,
        tokenizer=tokenizer,
        num_data_epochs=num_data_epochs,
        reduce_memory=args.reduce_memory)
    neg_validation_dataset = PregeneratedDataset(
        epoch=0,
        training_path=args.validation_neg_data,
        tokenizer=tokenizer,
        num_data_epochs=num_data_epochs,
        reduce_memory=args.reduce_memory)
    pos_validation_dataset = PregeneratedDataset(
        epoch=0,
        training_path=args.validation_pos_data,
        tokenizer=tokenizer,
        num_data_epochs=num_data_epochs,
        reduce_memory=args.reduce_memory)

    if args.local_rank == -1:
        neg_train_sampler = RandomSampler(neg_epoch_dataset)
        pos_train_sampler = RandomSampler(pos_epoch_dataset)
        neg_valid_sampler = RandomSampler(neg_validation_dataset)
        pos_valid_sampler = RandomSampler(pos_validation_dataset)
    else:
        neg_train_sampler = DistributedSampler(neg_epoch_dataset)
        pos_train_sampler = DistributedSampler(pos_epoch_dataset)
        neg_valid_sampler = DistributedSampler(neg_validation_dataset)
        pos_valid_sampler = DistributedSampler(pos_validation_dataset)

    neg_train_dataloader = DataLoader(neg_epoch_dataset,
                                      sampler=neg_train_sampler,
                                      batch_size=args.train_batch_size)
    pos_train_dataloader = DataLoader(pos_epoch_dataset,
                                      sampler=pos_train_sampler,
                                      batch_size=args.train_batch_size)
    neg_valid_dataloader = DataLoader(neg_validation_dataset,
                                      sampler=neg_valid_sampler,
                                      batch_size=args.valid_batch_size)
    pos_valid_dataloader = DataLoader(pos_validation_dataset,
                                      sampler=pos_valid_sampler,
                                      batch_size=args.valid_batch_size)

    def inf_train_gen():
        while True:
            for kr_step, kr_batch in enumerate(neg_train_dataloader):
                yield kr_step, kr_batch

    kr_gen = inf_train_gen()

    def pos_inf_train_gen():
        while True:
            for kr_step, kr_batch in enumerate(pos_train_dataloader):
                yield kr_step, kr_batch

    pos_kr_gen = pos_inf_train_gen()

    mlm_loss, neg_loss = 0, 0
    mlm_nb_it, neg_nb_it = 1, 1
    mlm_nb_ex, neg_nb_ex = 0, 0

    for epoch in range(args.epochs):
        epoch_dataset = PregeneratedDataset(
            epoch=epoch,
            training_path=args.pregenerated_data,
            tokenizer=tokenizer,
            num_data_epochs=num_data_epochs,
            reduce_memory=args.reduce_memory)
        if args.local_rank == -1:
            train_sampler = RandomSampler(epoch_dataset)
        else:
            train_sampler = DistributedSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        ul_tr_loss = 0
        nb_ul_tr_examples, nb_ul_tr_steps = 0, 1
        ll_tr_loss = 0
        nb_ll_tr_examples, nb_ll_tr_steps = 0, 1
        kl_tr_loss = 0
        nb_kl_tr_examples, nb_kl_tr_steps = 0, 1

        if n_gpu > 1 and args.local_rank == -1 or (n_gpu <= 1
                                                   and args.local_rank == 0):
            logging.info("** ** * Saving fine-tuned model ** ** * ")
            model.module.save_pretrained(args.output_dir)
            tokenizer.save_pretrained(args.output_dir)

        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for step, batch in enumerate(train_dataloader):
                if not args.no_mlm and (random.random() > args.mlm_freq):
                    model.train()
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, input_mask, segment_ids, lm_label_ids = batch

                    outputs = model(input_ids=input_ids,
                                    attention_mask=input_mask,
                                    token_type_ids=segment_ids,
                                    masked_lm_labels=lm_label_ids,
                                    negated=False)
                    loss = outputs[1]
                    loss_dict = outputs[0]

                    mlm_loss += loss_dict['mlm'].item()
                    mlm_nb_it += 1
                    mlm_nb_ex += input_ids.size(0)

                    if n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps
                    if args.fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), args.max_grad_norm)
                    else:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       args.max_grad_norm)

                    tr_loss += loss.item()

                    if args.local_rank == 0 or args.local_rank == -1:
                        mlm_averagemeter.update(loss_dict['mlm'].item())
                        # writer.add_scalar('MLM/train', loss_dict['mlm'].item(), mlm_nb_it)
                        wandb.log({'MLM/train': loss_dict['mlm'].item()})

                        nb_tr_steps += 1
                        nb_ll_tr_steps += 1
                        mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                        pbar.set_postfix_str(
                            f"MLM: {mlm_averagemeter:.6f}, UL: {ul_averagemeter:.6f}, LL: {ll_averagemeter:.6f}, KL: {kl_averagemeter:.6f}"
                        )

                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        scheduler.step()  # Update learning rate schedule
                        optimizer.step()
                        optimizer.zero_grad()
                        global_step += 1

                pbar.update(1)
                random_num = random.random()
                if random_num > args.kr_freq:
                    if args.method in ["neg_samebatch"]:
                        ul_step, ul_batch = next(kr_gen)
                        ul_batch = tuple(t.to(device) for t in ul_batch)
                        ul_input_ids, ul_input_mask, ul_segment_ids, ul_lm_label_ids = ul_batch

                        ll_step, ll_batch = next(pos_kr_gen)
                        ll_batch = tuple(t.to(device) for t in ll_batch)
                        ll_input_ids, ll_input_mask, ll_segment_ids, ll_lm_label_ids = ll_batch

                        batch_mask = torch.zeros(
                            (ul_input_ids.size(0) + ll_input_ids.size(0)),
                            dtype=ll_input_mask.dtype,
                            device=device)
                        batch_mask[:ul_input_ids.size(0)] = 1.
                        outputs = model(
                            input_ids=torch.cat([ul_input_ids, ll_input_ids],
                                                0),
                            attention_mask=torch.cat(
                                [ul_input_mask, ll_input_mask], 0),
                            token_type_ids=torch.cat(
                                [ul_segment_ids, ll_segment_ids], 0),
                            masked_lm_labels=torch.cat(
                                [ul_lm_label_ids, ll_lm_label_ids], 0),
                            negated=True,
                            batch_neg_mask=batch_mask)

                        loss = outputs[1] * args.ul_w
                        loss_dict = outputs[0]

                        if args.local_rank == 0 or args.local_rank == -1:
                            wandb.log({
                                'UL/train': loss_dict['neg'].item(),
                                'LL/train': loss_dict['pos'].item()
                            })
                            ul_averagemeter.update(loss_dict['neg'].item())
                            ll_averagemeter.update(loss_dict['pos'].item())
                        neg_nb_it += 1
                    elif random.random() > 0.5 and not args.no_ul:
                        kr_step, kr_batch = next(kr_gen)
                        kr_batch = tuple(t.to(device) for t in kr_batch)
                        input_ids, input_mask, segment_ids, lm_label_ids = kr_batch

                        outputs = model(input_ids=input_ids,
                                        attention_mask=input_mask,
                                        token_type_ids=segment_ids,
                                        masked_lm_labels=lm_label_ids,
                                        negated=True)
                        loss = outputs[1] * args.ul_w
                        loss_dict = outputs[0]

                        nb_ul_tr_steps += 1
                        neg_loss += loss_dict['neg'].item()
                        if args.local_rank == 0 or args.local_rank == -1:
                            wandb.log({
                                'UL/train': loss_dict['neg'].item(),
                                'KL/train': loss_dict['kl'].item() * args.kl_w
                            })
                            ul_averagemeter.update(loss_dict['neg'].item())
                            kl_averagemeter.update(loss_dict['kl'].item() *
                                                   args.kl_w)
                        neg_nb_it += 1
                    elif not args.no_ll:
                        kr_step, kr_batch = next(pos_kr_gen)
                        kr_batch = tuple(t.to(device) for t in kr_batch)
                        input_ids, input_mask, segment_ids, lm_label_ids = kr_batch

                        outputs = model(input_ids=input_ids,
                                        attention_mask=input_mask,
                                        token_type_ids=segment_ids,
                                        masked_lm_labels=lm_label_ids,
                                        negated=False)
                        loss = outputs[1]
                        loss_dict = outputs[0]

                        nb_ll_tr_steps += 1
                        mlm_loss += loss_dict['mlm'].item()
                        mlm_nb_it += 1
                        if args.local_rank == 0 or args.local_rank == -1:
                            wandb.log({'LL/train': loss_dict['mlm'].item()})
                            ll_averagemeter.update(loss_dict['mlm'].item())
                        mlm_nb_ex += input_ids.size(0)
                    else:
                        continue

                    if n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps
                    if args.fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), args.max_grad_norm)
                    else:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       args.max_grad_norm)

                    tr_loss += loss.item()

                    if args.local_rank == 0 or args.local_rank == -1:
                        nb_tr_steps += 1
                        mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                        pbar.set_postfix_str(
                            f"MLM: {mlm_averagemeter:.6f}, UL: {ul_averagemeter:.6f}, LL: {ll_averagemeter:.6f}, KL: {kl_averagemeter:.6f}"
                        )

                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        scheduler.step()  # Update learning rate schedule
                        optimizer.step()
                        optimizer.zero_grad()
                        global_step += 1

                    if n_gpu > 1 and args.local_rank == -1 or (
                            n_gpu <= 1 and args.local_rank == 0):
                        if False and (step + 1) % 100 == 0:
                            neg_valid_res = validate(
                                model=model,
                                dataloader=neg_valid_dataloader,
                                device=device,
                                negated=True)
                            pos_valid_res = validate(
                                model=model,
                                dataloader=pos_valid_dataloader,
                                device=device,
                                negated=False)
                            wandb.log({
                                'neg/valid/p@1': neg_valid_res % 100.,
                                'pos/valid/p@1': pos_valid_res % 100.
                            })

    # Save a trained model
    if n_gpu > 1 and args.local_rank == -1 or (n_gpu <= 1
                                               and args.local_rank == 0):
        print("Saving model")
        logging.info("** ** * Saving fine-tuned model ** ** * ")
        model.module.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        print(str(wandb.run.id))
        pickle.dump(
            str(wandb.run.id),
            open(os.path.join(args.output_dir, 'wandb_run_id.pkl'), 'wb'))
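# Likely entry point for the training script above (a sketch; the original
# file may already define its own __main__ guard):
if __name__ == "__main__":
    main()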
def __init__(self,
             vocab: Vocabulary,
             pretrained_model: str = None,
             requires_grad: bool = True,
             transformer_weights_model: str = None,
             reset_classifier: bool = False,
             binary_loss: bool = False,
             layer_freeze_regexes: List[str] = None,
             on_load: bool = False,
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super().__init__(vocab, regularizer)

    if on_load:
        logging.info(f"Skipping loading of initial Transformer weights")
        transformer_config = RobertaConfig.from_pretrained(pretrained_model)
        self._transformer_model = RobertaModel(transformer_config)
    elif transformer_weights_model:
        logging.info(
            f"Loading Transformer weights model from {transformer_weights_model}"
        )
        transformer_model_loaded = load_archive(transformer_weights_model)
        self._transformer_model = transformer_model_loaded.model._transformer_model
    else:
        self._transformer_model = RobertaModel.from_pretrained(
            pretrained_model)

    for name, param in self._transformer_model.named_parameters():
        grad = requires_grad
        if layer_freeze_regexes and grad:
            grad = not any(
                [bool(re.search(r, name)) for r in layer_freeze_regexes])
        param.requires_grad = grad

    transformer_config = self._transformer_model.config
    self._output_dim = transformer_config.hidden_size
    classifier_input_dim = self._output_dim
    classifier_output_dim = 1
    transformer_config.num_labels = classifier_output_dim
    self._classifier = None
    if not on_load and transformer_weights_model \
            and hasattr(transformer_model_loaded.model, "_classifier") \
            and not reset_classifier:
        self._classifier = transformer_model_loaded.model._classifier
        old_dims = (self._classifier.dense.in_features,
                    self._classifier.out_proj.out_features)
        new_dims = (classifier_input_dim, classifier_output_dim)
        if old_dims != new_dims:
            logging.info(
                f"NOT copying Transformer classifier weights, incompatible dims: {old_dims} vs {new_dims}"
            )
            self._classifier = None
    if self._classifier is None:
        self._classifier = RobertaClassificationHead(transformer_config)

    self._binary_loss = binary_loss

    self._accuracy = CategoricalAccuracy()
    self._sigmoid = torch.nn.Sigmoid()
    if self._binary_loss:
        self._loss = torch.nn.BCEWithLogitsLoss()
    else:
        self._loss = torch.nn.CrossEntropyLoss()
    self._debug = 2
    self._padding_value = 1  # The index of the RoBERTa padding token
def init_encoder(cls,
                 cfg_name: str,
                 projection_dim: int = 0,
                 dropout: float = 0.1,
                 **kwargs) -> BertModel:
    cfg = RobertaConfig.from_pretrained(cfg_name if cfg_name else 'roberta-base')
    if dropout != 0:
        cfg.attention_probs_dropout_prob = dropout
        cfg.hidden_dropout_prob = dropout
    return cls.from_pretrained(cfg_name,
                               config=cfg,
                               project_dim=projection_dim,
                               **kwargs)
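# Hedged usage sketch for `init_encoder` above, assuming it is a classmethod
# on a DPR-style encoder class (here called `HFRobertaEncoder`, a hypothetical
# name for the actual class in your codebase):
question_encoder = HFRobertaEncoder.init_encoder('roberta-base',
                                                 projection_dim=0,
                                                 dropout=0.1)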