def __init__(self, args, dictionary, left_pad=False):
    super().__init__(dictionary)
    self.dropout = args.dropout
    from pytorch_transformers import RobertaModel, BertModel
    from pytorch_transformers.file_utils import PYTORCH_TRANSFORMERS_CACHE
    from pytorch_transformers import RobertaConfig, RobertaTokenizer, BertConfig, BertTokenizer

    if args.pretrained_bert_model.startswith('roberta'):
        self.embed = RobertaModel.from_pretrained(
            args.pretrained_bert_model,
            cache_dir=PYTORCH_TRANSFORMERS_CACHE / 'distributed_{}'.format(args.distributed_rank))
        # self.context = RobertaModel.from_pretrained(args.pretrained_bert_model,
        #     cache_dir=PYTORCH_TRANSFORMERS_CACHE / 'distributed_{}'.format(args.distributed_rank))
        self.config = RobertaConfig.from_pretrained(args.pretrained_bert_model)
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    else:
        self.embed = BertModel.from_pretrained(
            args.pretrained_bert_model,
            cache_dir=PYTORCH_TRANSFORMERS_CACHE / 'distributed_{}'.format(args.distributed_rank))
        # self.context = BertModel.from_pretrained(args.pretrained_bert_model,
        #     cache_dir=PYTORCH_TRANSFORMERS_CACHE / 'distributed_{}'.format(args.distributed_rank))
        self.config = BertConfig.from_pretrained(args.pretrained_bert_model)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    self.padding_idx = self.tokenizer.convert_tokens_to_ids(self.tokenizer.pad_token)
def main():
    bert_base_config = BertConfig.from_pretrained('bert-base-uncased', num_labels=2)
    bert_base_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=bert_base_config)
    count = 0
    for name, param in bert_base_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in bert_base_uncased: ', count)

    roberta_config = RobertaConfig.from_pretrained('roberta-base', num_labels=2)
    roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', config=roberta_config)
    count = 0
    for name, param in roberta_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in roberta: ', count)

    albert_config = AlbertConfig.from_pretrained('albert-base-v2', num_labels=2)
    albert_model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', config=albert_config)
    count = 0
    for name, param in albert_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in albert: ', count)
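# A more concise equivalent (sketch, not part of the original script): the per-parameter
# size product computed in the loops above is exactly what torch.Tensor.numel() returns,
# so the same totals can be obtained with a one-line helper.
def count_trainable_parameters(model):
    """Return the number of trainable parameters in a torch.nn.Module."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Example: count_trainable_parameters(bert_base_model) yields the same number
# as the manual accumulation above.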
def __init__(self, args, dictionary, embed_tokens, left_pad=False):
    super().__init__(dictionary)
    self.dropout = args.dropout
    # from pytorch_transformers import RobertaModel
    from fairseq.modules.roberta_causal_mask import RobertaCasulMaskModel, BertCasulMaskModel
    from pytorch_transformers.file_utils import PYTORCH_TRANSFORMERS_CACHE
    from pytorch_transformers import RobertaConfig, RobertaTokenizer, BertConfig, BertTokenizer

    if args.roberta_model.startswith('roberta'):
        self.roberta = RobertaCasulMaskModel.from_pretrained(
            args.roberta_model,
            cache_dir=PYTORCH_TRANSFORMERS_CACHE / 'distributed_{}'.format(args.distributed_rank))
        self.config = RobertaConfig.from_pretrained(args.roberta_model)
        self.tokenizer = RobertaTokenizer.from_pretrained(args.roberta_model)
    else:
        self.roberta = BertCasulMaskModel.from_pretrained(
            args.roberta_model,
            cache_dir=PYTORCH_TRANSFORMERS_CACHE / 'distributed_{}'.format(args.distributed_rank))
        self.config = BertConfig.from_pretrained(args.roberta_model)
        self.tokenizer = BertTokenizer.from_pretrained(args.roberta_model)

    self.config.output_attentions = True
    # Freeze the pooler; it is not used by this encoder.
    self.roberta.pooler.dense.weight.requires_grad = False
    self.roberta.pooler.dense.bias.requires_grad = False

    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    # self.embed_tokens = embed_tokens
    # self.embed_scale = math.sqrt(embed_dim)
    self.args = args

    # if args.sentence_transformer_arch == 'fairseq':
    #     self.padding_idx = embed_tokens.padding_idx
    #     self.sent_embed_positions = PositionalEmbedding(
    #         1024, embed_dim, self.padding_idx,
    #         left_pad=False,
    #         learned=args.encoder_learned_pos,
    #     )
    #     self.doc_layers = nn.ModuleList([])
    #     self.doc_layers.extend([
    #         TransformerEncoderLayer(args)
    #         for i in range(args.encoder_layers)
    #     ])

    if args.sentence_transformer_arch == 'bert':
        # from pytorch_transformers import RobertaConfig, RobertaTokenizer
        # self.config = RobertaConfig.from_pretrained(args.roberta_model)
        # self.config.output_attentions = True
        # self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        embed_dim = self.config.hidden_size
        print('*** padding idx before ***', embed_tokens.padding_idx)
        self.padding_idx = self.tokenizer.convert_tokens_to_ids(self.tokenizer.pad_token)
        print('*** padding idx after ***', self.padding_idx)

        # Assume each document has at most 128 - self.padding_idx - 1 sentences;
        # in the case of RoBERTa that is 126.
        self.sent_position_embeddings = nn.Embedding(128, embed_dim)
        if args.encoder_layers:
            self.config.num_hidden_layers = args.encoder_layers
        if args.dropout:
            self.config.hidden_dropout_prob = args.dropout
        if args.attention_dropout:
            self.config.attention_probs_dropout_prob = args.attention_dropout

        if args.attn_type == 'attn_score':
            self.sent_encoder = AttnScoreBertEncoder(self.config)
        elif args.attn_type == 'attn_prob':
            self.sent_encoder = BertEncoder(self.config)
        else:
            raise Exception('--attn-type doesn\'t support {} yet!'.format(args.attn_type))
        self.sent_encoder.apply(self._init_weights)
        print('*** sentence encoder config ***')
        print(self.config)
    else:
        raise Exception('--sentence-transformer-arch doesn\'t support {} yet!'.format(
            args.sentence_transformer_arch))
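# Illustrative sketch (an assumption, not code from this repository): one plausible way the
# sentence-level position embeddings created above (nn.Embedding(128, embed_dim)) could be
# combined with per-sentence representations in the forward pass. The sent_repr tensor name
# and its [batch, n_sents, hidden] shape are assumptions; the real model may also offset
# position ids by padding_idx + 1, as the comment about "128 - padding_idx - 1" suggests.
import torch

def add_sentence_positions(sent_repr, sent_position_embeddings):
    """Add learned sentence-position embeddings to [batch, n_sents, hidden] representations."""
    bsz, n_sents, _ = sent_repr.size()
    position_ids = torch.arange(n_sents, dtype=torch.long, device=sent_repr.device)
    position_ids = position_ids.unsqueeze(0).expand(bsz, n_sents)
    return sent_repr + sent_position_embeddings(position_ids)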
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default='/hdd/lujunyu/dataset/multi_turn_corpus/ubuntu/', type=str, required=False,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--task_name", default='ubuntu', type=str, required=False,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default='/hdd/lujunyu/model/ubuntu_roberta_new/', type=str, required=False,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--init_checkpoint", default='/hdd/lujunyu/model/ubuntu_roberta_new/model.pt', type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")

    ## Other parameters
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_lower_case", default=False, action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--max_seq_length", default=256, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--eval_batch_size", default=750, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    bert_config = RobertaConfig.from_pretrained('roberta-base', num_labels=2, type_vocab_size=2)
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    test_dataset = UbuntuDatasetForRoberta(
        file_path=os.path.join(args.data_dir, "test.txt"),
        max_seq_length=args.max_seq_length,
        tokenizer=tokenizer
    )
    test_dataloader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=args.eval_batch_size,
                                                  sampler=SequentialSampler(test_dataset),
                                                  num_workers=8)

    # Load the fine-tuned weights saved by the training script (model.pt) on top of the
    # roberta-base architecture; from_pretrained accepts an explicit state_dict.
    state_dict = torch.load(args.init_checkpoint, map_location='cpu')
    model = RobertaForSequenceClassification.from_pretrained('roberta-base', config=bert_config,
                                                             state_dict=state_dict)
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    logger.info("***** Running testing *****")
    logger.info(" Num examples = %d", len(test_dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)

    f = open(os.path.join(args.output_dir, 'logits_test.txt'), 'w')

    model.eval()
    test_loss = 0
    nb_test_steps, nb_test_examples = 0, 0
    for input_ids, segment_ids, label_ids in tqdm(test_dataloader, desc="Step"):
        input_ids = input_ids.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)  # labels must live on the same device as the model

        with torch.no_grad():
            tmp_test_loss, logits = model(input_ids, token_type_ids=segment_ids, labels=label_ids)

        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()
        for logit, label in zip(logits, label_ids):
            logit = '{},{}'.format(logit[0], logit[1])
            f.write('_\t{}\t{}\n'.format(logit, label))

        test_loss += tmp_test_loss.mean().item()
        nb_test_examples += input_ids.size(0)
        nb_test_steps += 1
    f.close()

    test_loss = test_loss / nb_test_steps
    result = evaluate(os.path.join(args.output_dir, 'logits_test.txt'))
    result.update({'test_loss': test_loss})

    output_eval_file = os.path.join(args.output_dir, "results_test.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Test results *****")
        for key in sorted(result.keys()):
            logger.info(" %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
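# Illustrative sketch (an assumption, not the repository's evaluate()): each line written
# above has the form "_\t<logit_0>,<logit_1>\t<label>", so the per-candidate score and gold
# label can be recovered like this before computing ranking metrics.
def read_logits_file(path):
    """Parse the tab-separated logits file into parallel lists of scores and labels."""
    scores, labels = [], []
    with open(path) as fin:
        for line in fin:
            _, logit_str, label_str = line.rstrip('\n').split('\t')
            logit_0, logit_1 = (float(x) for x in logit_str.split(','))
            scores.append(logit_1)  # assumed: class 1 is the "relevant response" class
            labels.append(int(float(label_str)))
    return scores, labels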
def main():
    parser = ArgumentParser()
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--model_type", default='roberta', type=str, required=True,
                        help="Model type selected in the list")
    parser.add_argument("--model_name_or_path", default='roberta-large', type=str, required=True,
                        help="Path to pre-trained model or shortcut name selected in the list: ")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    parser.add_argument("--comment", default='', type=str,
                        help="The comment")
    parser.add_argument('--output_dir', type=Path, default="output")
    parser.add_argument("--restore", type=bool, default=True,
                        help="Whether to restore from the last checkpoint; if there are no checkpoints, start from scratch")
    parser.add_argument("--max_seq_length", type=int, default=256,
                        help="max length of token sequence")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--evaluate_during_training", type=bool, default=False,
                        help="Run evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--adapter_transformer_layers", default=2, type=int,
                        help="The transformer layers of adapter.")
    parser.add_argument("--adapter_size", default=768, type=int,
                        help="The hidden size of adapter.")
    parser.add_argument("--adapter_list", default="0,11,23", type=str,
                        help="The layers at which an adapter is added")
    parser.add_argument("--adapter_skip_layers", default=0, type=int,
                        help="The skip_layers of the adapter, counted in BERT layers")
    parser.add_argument('--meta_adapter_model', type=str,
                        help='the pretrained adapter model')
    parser.add_argument("--per_gpu_train_batch_size", default=32, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=64, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=3e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3, type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--logging_steps', type=int, default=10,
                        help="How often do we snapshot losses, for inclusion in the progress dump? (0 = disable)")
    parser.add_argument('--save_steps', type=int, default=1000,
                        help="Save checkpoint every X update steps.")
    parser.add_argument('--eval_steps', type=int, default=None,
                        help="Eval every X update steps.")
    parser.add_argument('--max_save_checkpoints', type=int, default=500,
                        help="The maximum number of checkpoints to keep; once exceeded, the oldest checkpoints are deleted")
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name and ending with a step number")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument('--overwrite_cache', action='store_true',
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip', type=str, default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="For distant debugging.")
    parser.add_argument('--negative_sample', type=int, default=0,
                        help='how many negative samples to select')

    # args
    args = parser.parse_args()
    args.adapter_list = args.adapter_list.split(',')
    args.adapter_list = [int(i) for i in args.adapter_list]

    name_prefix = ('maxlen-' + str(args.max_seq_length) + '_' + 'epoch-' + str(args.num_train_epochs) + '_'
                   + 'batch-' + str(args.per_gpu_train_batch_size) + '_' + 'lr-' + str(args.learning_rate) + '_'
                   + 'warmup-' + str(args.warmup_steps) + '_' + str(args.comment))
    args.my_model_name = args.task_name + '_' + name_prefix
    args.output_dir = os.path.join(args.output_dir, args.my_model_name)

    if args.eval_steps is None:
        args.eval_steps = args.save_steps * 2

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                   args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    set_seed(args)

    args.output_mode = output_modes[args.task_name]
    processor = processors[args.task_name]()
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
    config = RobertaConfig.from_pretrained('roberta-large', output_attentions=True)
    pretrained_model = PretrainedModel()
    adapter_model = AdapterModel(args, pretrained_model.config)

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    pretrained_model.to(args.device)
    adapter_model.to(args.device)
    model = (pretrained_model, adapter_model)
    logger.info("Training/evaluation parameters %s", args)

    val_dataset = load_and_cache_examples(args, args.task_name, tokenizer, 'dev', evaluate=True)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, 'train', evaluate=False)
        global_step, tr_loss = train(args, train_dataset, val_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Take care of distributed/parallel training: unwrap the adapter model before saving
        model_to_save = adapter_model.module if hasattr(adapter_model, 'module') else adapter_model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
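# Example invocation (illustrative only; the script name "run_finetune.py" and the task name
# are placeholders, not names confirmed by this repository). Single process:
#
#   python run_finetune.py --data_dir ./data --model_type roberta \
#       --model_name_or_path roberta-large --task_name <task> --do_train --do_eval
#
# Multi-GPU, matching the nccl/--local_rank setup above; torch.distributed.launch passes
# --local_rank to each worker automatically:
#
#   python -m torch.distributed.launch --nproc_per_node=4 run_finetune.py \
#       --data_dir ./data --model_type roberta --model_name_or_path roberta-large \
#       --task_name <task> --do_train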
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default='/hdd/lujunyu/dataset/multi_turn_corpus/ubuntu/', type=str, required=False,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--task_name", default='ubuntu', type=str, required=False,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default='/hdd/lujunyu/model/chatbert/check/', type=str, required=False,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--data_augmentation", default=False, action='store_true',
                        help="Whether to use augmentation")
    parser.add_argument("--max_seq_length", default=256, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", default=True, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_test", default=True, action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--train_batch_size", default=400, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=100, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=20.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_steps", default=0.0, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--weight_decay", default=1e-3, type=float,
                        help="weight_decay")
    parser.add_argument("--save_checkpoints_steps", default=3125, type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=5,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_test:
        raise ValueError("At least one of `do_train` or `do_test` must be True.")

    bert_config = RobertaConfig.from_pretrained('roberta-base', num_labels=2)
    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format(
                args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        if args.do_train:
            raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    else:
        os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    if args.data_augmentation:
        train_dataset = UbuntuDatasetForRoberta(
            file_path=os.path.join(args.data_dir, "train_augment_ubuntu.txt"),
            max_seq_length=args.max_seq_length,
            tokenizer=tokenizer
        )
    else:
        train_dataset = UbuntuDatasetForRoberta(
            file_path=os.path.join(args.data_dir, "train.txt"),
            max_seq_length=args.max_seq_length,
            tokenizer=tokenizer
        )
    eval_dataset = UbuntuDatasetForRoberta(
        file_path=os.path.join(args.data_dir, "valid.txt"),  ### TODO: change
        max_seq_length=args.max_seq_length,
        tokenizer=tokenizer
    )

    train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=args.train_batch_size,
                                                   sampler=RandomSampler(train_dataset),
                                                   num_workers=8)
    eval_dataloader = torch.utils.data.DataLoader(eval_dataset,
                                                  batch_size=args.eval_batch_size,
                                                  sampler=SequentialSampler(eval_dataset),
                                                  num_workers=8)

    model = RobertaForSequenceClassification.from_pretrained('roberta-base', config=bert_config)
    model.to(device)

    num_train_steps = None
    if args.do_train:
        num_train_steps = int(
            len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        # remove pooler, which is not used and thus produces None grads that break apex
        param_optimizer = [n for n in param_optimizer]
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': args.weight_decay},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_steps)
    else:
        optimizer = None
        scheduler = None

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name, param.data)

    global_step = 0
    best_metric = 0.0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_dataset))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_steps)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, label_ids = batch
                loss, _ = model(input_ids, labels=label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()  # We have accumulated enough gradients
                    scheduler.step()
                    model.zero_grad()
                    global_step += 1

                if (step + 1) % args.save_checkpoints_steps == 0:
                    model.eval()
                    f = open(os.path.join(args.output_dir, 'logits_dev.txt'), 'w')
                    eval_loss = 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    logits_all = []
                    for input_ids, label_ids in eval_dataloader:
                        input_ids = input_ids.to(device)
                        label_ids = label_ids.to(device)

                        with torch.no_grad():
                            tmp_eval_loss, logits = model(input_ids, labels=label_ids)

                        logits = logits.detach().cpu().numpy()
                        logits_all.append(logits)
                        label_ids = label_ids.cpu().numpy()
                        for logit, label in zip(logits, label_ids):
                            logit = '{},{}'.format(logit[0], logit[1])
                            f.write('_\t{}\t{}\n'.format(logit, label))

                        eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1
                    f.close()

                    logits_all = np.concatenate(logits_all, axis=0)
                    eval_loss = eval_loss / nb_eval_steps

                    result = evaluate(os.path.join(args.output_dir, 'logits_dev.txt'))
                    result.update({'eval_loss': eval_loss})

                    output_eval_file = os.path.join(args.output_dir, "eval_results_dev.txt")
                    with open(output_eval_file, "a") as writer:
                        logger.info("***** Eval results *****")
                        for key in sorted(result.keys()):
                            logger.info(" %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))

                    ### Save the best checkpoint
                    if best_metric < result['R10@1'] + result['R10@2']:
                        try:
                            ### Remove the 'module' prefix when using DataParallel
                            state_dict = model.module.state_dict()
                        except AttributeError:
                            state_dict = model.state_dict()
                        torch.save(state_dict, os.path.join(args.output_dir, "model.pt"))
                        best_metric = result['R10@1'] + result['R10@2']
                        logger.info('Saving the best model in {}'.format(os.path.join(args.output_dir, "model.pt")))

                        ### visualize bad cases of the best model
                        logger.info('Saving Bad cases...')
                        visualize_bad_cases(
                            logits=logits_all,
                            input_file_path=os.path.join(args.data_dir, 'valid.txt'),
                            output_file_path=os.path.join(args.output_dir, 'valid_bad_cases.txt')
                        )
                    model.train()
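# Illustrative sketch (an assumption, not the repository's evaluate()): the best checkpoint
# above is chosen by result['R10@1'] + result['R10@2'], i.e. recall@k over groups of 10
# candidate responses per context. Assuming the usual Ubuntu-corpus layout of one positive
# and nine negatives per group, a minimal version of that metric could look like this.
def recall_at_k(scores, labels, k, group_size=10):
    """Fraction of groups whose positive candidate is ranked within the top k by score."""
    hits, total = 0, 0
    for start in range(0, len(scores), group_size):
        group = list(zip(scores[start:start + group_size], labels[start:start + group_size]))
        ranked = sorted(group, key=lambda x: x[0], reverse=True)
        if any(label == 1 for _, label in ranked[:k]):
            hits += 1
        total += 1
    return hits / total if total else 0.0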