def __init__(self, args=None, labels=None, device='cuda', bert_model_path='bert-base-uncased',
             architecture="DocumentBertLSTM", batch_size=10, bert_batch_size=7,
             learning_rate=5e-5, weight_decay=0, use_tensorboard=False):
    if args is not None:
        self.args = vars(args)
    if not args:
        self.args = {}
        self.args['bert_model_path'] = bert_model_path
        self.args['device'] = device
        self.args['learning_rate'] = learning_rate
        self.args['weight_decay'] = weight_decay
        self.args['batch_size'] = batch_size
        self.args['labels'] = labels
        self.args['bert_batch_size'] = bert_batch_size
        self.args['architecture'] = architecture
        self.args['use_tensorboard'] = use_tensorboard
    if 'fold' not in self.args:
        self.args['fold'] = 0

    assert self.args['labels'] is not None, "Must specify all labels in prediction"

    self.log = logging.getLogger()
    self.bert_tokenizer = BertTokenizer.from_pretrained(self.args['bert_model_path'])

    # account for some random tensorflow naming scheme
    if os.path.exists(self.args['bert_model_path']):
        if os.path.exists(os.path.join(self.args['bert_model_path'], CONFIG_NAME)):
            config = BertConfig.from_json_file(os.path.join(self.args['bert_model_path'], CONFIG_NAME))
        elif os.path.exists(os.path.join(self.args['bert_model_path'], 'bert_config.json')):
            config = BertConfig.from_json_file(os.path.join(self.args['bert_model_path'], 'bert_config.json'))
        else:
            raise ValueError("Cannot find a configuration for the BERT-based model you are attempting to load.")
    else:
        config = BertConfig.from_pretrained(self.args['bert_model_path'])
    config.num_labels = len(self.args['labels'])
    config.bert_batch_size = self.args['bert_batch_size']

    if 'use_tensorboard' in self.args and self.args['use_tensorboard']:
        assert 'model_directory' in self.args, "Must have a logging and checkpoint directory set."
        from torch.utils.tensorboard import SummaryWriter
        self.tensorboard_writer = SummaryWriter(os.path.join(
            self.args['model_directory'], "..", "runs",
            self.args['model_directory'].split(os.path.sep)[-1] + '_' +
            self.args['architecture'] + '_' + str(self.args['fold'])))

    self.bert_doc_classification = document_bert_architectures[self.args['architecture']].from_pretrained(
        self.args['bert_model_path'], config=config)

    self.optimizer = torch.optim.Adam(
        self.bert_doc_classification.parameters(),
        weight_decay=self.args['weight_decay'],
        lr=self.args['learning_rate']
    )
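# A minimal sketch of the loading pattern above, assuming the standard transformers
# package: read a BertConfig, attach extra fields such as num_labels to it, and hand
# the customised config to from_pretrained. "bert-base-uncased" is only an
# illustrative model name, not a value required by the snippet.
import torch
from transformers import BertConfig, BertModel, BertTokenizer

model_path = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_path)
config = BertConfig.from_pretrained(model_path)
config.num_labels = 4          # extra assignments simply become attributes of the config
config.bert_batch_size = 7
model = BertModel.from_pretrained(model_path, config=config)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5, weight_decay=0.0)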
def __init__(self, debug, args, data_dir, data_process_output):
    self.eval_steps = args.eval_steps
    self.adam_epsilon = args.adam_epsilon
    self.warmup_steps = args.warmup_steps
    self.learning_rate = args.learning_rate
    self.weight_decay = args.weight_decay
    self.gradient_accumulation_steps = args.gradient_accumulation_steps
    self.device = torch.device('cuda')
    self.debug = debug
    self.seed = 2019
    self.args = args
    self.data_dir = args.data_dir
    self.max_seq_length = args.max_seq_length
    self.batch_size = args.per_gpu_train_batch_size
    self.train_steps = args.train_steps
    self.tokenizer = BertTokenizer.from_pretrained(
        args.model_name_or_path, do_lower_case=args.do_lower_case)
    self.config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=3)
    self.seed_everything()
    self.do_eval = True
    self.data_dir = data_dir
    self.data_process_output = data_process_output
    self.output_dir = './'
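# The constructor above calls self.seed_everything(), whose body is not shown here.
# A typical implementation of such a helper (an assumption, not the original code)
# seeds Python, NumPy and torch consistently:
import os
import random

import numpy as np
import torch

def seed_everything(seed=2019):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True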
def test_model_from_pretrained(self):
    logging.basicConfig(level=logging.INFO)
    for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        config = BertConfig.from_pretrained(model_name)
        self.assertIsNotNone(config)
        self.assertIsInstance(config, PretrainedConfig)

        model = BertModel.from_pretrained(model_name)
        model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
        self.assertIsNotNone(model)
        self.assertIsInstance(model, PreTrainedModel)
        for value in loading_info.values():
            self.assertEqual(len(value), 0)

        config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
        model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
        self.assertEqual(model.config.output_attentions, True)
        self.assertEqual(model.config.output_hidden_states, True)
        self.assertEqual(model.config, config)
def load_pretrain(configs, model_class, fine_tune_dir, processor, eval=False):
    """
    configs: configuration dict
    model_class: model class name
    fine_tune_dir: directory where the fine-tuned model is saved
    processor: DataProcessor
    eval: whether to load the fine-tuned model for evaluation
    """
    model_class_map = {
        'Bert': Bert,
        'BertCRF': BertCRF,
        'BertBiLSTMCRF': BertBiLSTMCRF,
        'BiLSTM': BiLSTM,
        'BiLSTMCRF': BiLSTMCRF
    }
    model_class_ = model_class_map[model_class]
    label_list = processor.get_labels()
    check_dir(fine_tune_dir)
    if eval:
        model_pretrained_path = fine_tune_dir
    else:
        model_pretrained_path = configs['pretrained_model_dir']
    tokenizer = BertTokenizer.from_pretrained(
        model_pretrained_path, do_lower_case=configs['lower_case'])
    if model_class in ['Bert', 'BertCRF', 'BertBiLSTMCRF']:
        bert_config = BertConfig.from_pretrained(model_pretrained_path,
                                                 num_labels=len(label_list),
                                                 finetuning_task="ner")
        model = model_class_.from_pretrained(model_pretrained_path,
                                             config=bert_config,
                                             model_configs=configs)
    elif model_class in ['BiLSTM', 'BiLSTMCRF']:
        configs['num_labels'] = len(label_list)
        if configs['use_pretrained_embedding']:
            pretrained_word_embed = build_word_embed(
                tokenizer,
                pretrain_embed_file=configs['pretrain_embed_file'],
                pretrain_embed_pkl=configs['pretrain_embed_pkl'])
            configs['word_vocab_size'] = pretrained_word_embed.shape[0]
            configs['word_embedding_dim'] = pretrained_word_embed.shape[1]
        else:
            pretrained_word_embed = None
        if eval:
            model_pretrained_path = fine_tune_dir
            model = model_class_.from_pretrained(model_pretrained_path, pretrained_word_embed)
        else:
            model = model_class_(configs, pretrained_word_embed)
    else:
        raise ValueError("Invalid Model Class")
    return model, tokenizer
def __init__(self, model_name_or_path, hidden_size=768, num_class=2):
    super(NeuralNet, self).__init__()

    self.config = BertConfig.from_pretrained(model_name_or_path, num_labels=4)
    self.config.output_hidden_states = True
    self.bert = BertModel.from_pretrained(model_name_or_path, config=self.config)
    for param in self.bert.parameters():
        param.requires_grad = True
    self.weights = torch.rand(13, 1).cuda()
    self.dropouts = nn.ModuleList([nn.Dropout(0.5) for _ in range(5)])
    self.fc = nn.Linear(hidden_size, num_class)
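# Hedged sketch of how the 13 hidden states enabled above (embeddings plus 12 layers
# for bert-base) can be combined with a (13, 1) weight tensor. It assumes a transformers
# version whose forward pass returns an output object with a .hidden_states tuple, and
# it illustrates the pattern rather than reproducing the original model's forward().
import torch
from transformers import BertConfig, BertModel, BertTokenizer

config = BertConfig.from_pretrained("bert-base-uncased", output_hidden_states=True)
model = BertModel.from_pretrained("bert-base-uncased", config=config)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

inputs = tokenizer("a short example sentence", return_tensors="pt")
outputs = model(**inputs)
hidden_states = torch.stack(outputs.hidden_states, dim=0)        # (13, B, L, H)
weights = torch.softmax(torch.rand(13, 1), dim=0)                # learnable in practice
mixed = (weights.view(13, 1, 1, 1) * hidden_states).sum(dim=0)   # (B, L, H)
print(mixed.shape)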
def load(cls, pretrained_model_name_or_path, language=None):
    bert = cls()
    # We need to differentiate between loading a model in FARM format
    # and in Pytorch-Transformers format
    farm_lm_config = os.path.join(pretrained_model_name_or_path, "language_model_config.json")
    if os.path.exists(farm_lm_config):
        # FARM style
        bert_config = BertConfig.from_pretrained(farm_lm_config)
        farm_lm_model = os.path.join(pretrained_model_name_or_path, "language_model.bin")
        bert.model = BertModel.from_pretrained(farm_lm_model, config=bert_config)
        bert.language = bert.model.config.language
    else:
        # Pytorch-Transformers style
        bert.model = BertModel.from_pretrained(pretrained_model_name_or_path)
        bert.language = cls._infer_language_from_name(pretrained_model_name_or_path)
    return bert
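# The FARM branch above relies on from_pretrained accepting a path to a config JSON
# file and, when an explicit config is supplied, a path to a raw state-dict file; this
# is behaviour of the transformers versions this code targets. A minimal hedged
# illustration with placeholder paths:
from transformers import BertConfig, BertModel

config = BertConfig.from_pretrained("saved_model/language_model_config.json")
model = BertModel.from_pretrained("saved_model/language_model.bin", config=config)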
def main(): parser = argparse.ArgumentParser() ## Required parameters(即required=True的参数必须在命令上出现) parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "数据集路径. The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--model_type", default=None, type=str, required=True, help="模型类型(这里为bert). Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help= "下载好的预训练模型. Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--meta_path", default=None, type=str, required=False, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "模型预测和断点文件的存放路径. The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--config_name", default="", type=str, help= "预训练的配置名字或路径. Pretrained config name or path if not the same as model_name" ) parser.add_argument( "--tokenizer_name", default="", type=str, help= "预训练分词器名字或路径. Pretrained tokenizer name or path if not the same as model_name" ) parser.add_argument( "--cache_dir", default="", type=str, help= "从亚马逊s3下载的预训练模型存放路径. Where do you want to store the pre-trained models downloaded from s3" ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "最长序列长度. The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.") parser.add_argument("--do_train", action='store_true', help="是否训练. Whether to run training.") parser.add_argument("--do_test", action='store_true', help="是否测试. Whether to run testing.") parser.add_argument("--predict_eval", action='store_true', help="是否预测验证集. Whether to predict eval set.") parser.add_argument("--do_eval", action='store_true', help="是否验证. Whether to run eval on the dev set.") parser.add_argument( "--evaluate_during_training", action='store_true', help="是否训练中跑验证. Run evaluation during training at each logging step.") parser.add_argument( "--do_lower_case", action='store_true', help="是否用小写模型. Set this flag if you are using an uncased model.") parser.add_argument( "--per_gpu_train_batch_size", default=8, type=int, help="训练时每个GPU/CPU上的batch size. Batch size per GPU/CPU for training.") parser.add_argument( "--per_gpu_eval_batch_size", default=8, type=int, help="验证时每个GPU/CPU上的batch size. Batch size per GPU/CPU for evaluation." ) parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "反向传播前梯度累计的次数. Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--learning_rate", default=5e-5, type=float, help="Adam的初始学习率. The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="权重衰减系数. Weight deay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Adam的Epsilon系数. Epsilon for Adam optimizer.") parser.add_argument( "--max_grad_norm", default=1.0, type=float, help= " 如果所有参数的gradient组成的向量的L2 norm大于max norm,那么需要根据L2 norm/max_norm进行缩放。从而使得L2 norm小于预设的clip_norm. Max gradient norm." ) parser.add_argument( "--num_train_epochs", default=3.0, type=float, help="训练epoch数. 
Total number of training epochs to perform.") parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. Override num_train_epochs." ) parser.add_argument("--eval_steps", default=-1, type=int, help="") parser.add_argument("--lstm_hidden_size", default=300, type=int, help="") parser.add_argument("--lstm_layers", default=2, type=int, help="") parser.add_argument("--lstm_dropout", default=0.5, type=float, help="") parser.add_argument("--train_steps", default=-1, type=int, help="") parser.add_argument("--report_steps", default=-1, type=int, help="") parser.add_argument( "--warmup_steps", default=0, type=int, help="线性warmup的steps. Linear warmup over warmup_steps.") parser.add_argument("--split_num", default=3, type=int, help="测试集划分. text split") parser.add_argument('--logging_steps', type=int, default=50, help="日志更新steps. Log every X updates steps.") parser.add_argument( '--save_steps', type=int, default=50, help="断点文件保存steps. Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action='store_true', help= "评估所有的断点. Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number" ) parser.add_argument("--no_cuda", action='store_true', help="不用cuda. Avoid using CUDA when available") parser.add_argument( '--overwrite_output_dir', action='store_true', help="重写输出路径. Overwrite the content of the output directory") parser.add_argument( '--overwrite_cache', action='store_true', help="重写训练和评估的缓存. Overwrite the cached training and evaluation sets") parser.add_argument('--seed', type=int, default=42, help="初始化用的随机种子. random seed for initialization") parser.add_argument( '--fp16', action='store_true', help= "是否用16位混合精度. Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit" ) parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "fp16的优化level. For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--local_rank", type=int, default=-1, help="为了分布式训练. For distributed training: local_rank") parser.add_argument('--server_ip', type=str, default='', help="远程debug用的ip. For distant debugging.") parser.add_argument('--server_port', type=str, default='', help="远程debug用的端口. For distant debugging.") parser.add_argument("--freeze", default=0, type=int, required=False, help="冻结BERT. 
freeze bert.") parser.add_argument("--not_do_eval_steps", default=0.35, type=float, help="not_do_eval_steps.") args = parser.parse_args() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: # 如果无指定GPU或允许使用CUDA,就使用当前所有GPU device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs # 指定使用哪个GPU(local_rank代表当前程序进程使用的GPU标号) torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl') args.n_gpu = 1 args.device = device # Setup logging 初始化日志 logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) # Set seed 设置种子数 set_seed(args) # 创建存放路径 try: os.makedirs(args.output_dir) except: pass # 载入预训练好的BERT分词器 tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case) # 载入预设好的BERT配置文件 config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=2) # Prepare model 载入并配置好基于BERT的序列分类模型 model = BertForSequenceClassification.from_pretrained( args.model_name_or_path, args, config=config) # 开启FP16 if args.fp16: model.half() model.to(device) # 如果是指定了单个GPU,用DistributedDataParallel进行GPU训练 if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) # 如果有多个GPU,就直接用torch.nn.DataParallel,会自动调用当前可用的多个GPU elif args.n_gpu > 1: model = torch.nn.DataParallel(model) # 总batch size = GPU数量 * 每个GPU上的mbatch size args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) if args.do_train: # Prepare data loader 导入数据并准备符合格式的输入 train_examples = read_examples(os.path.join(args.data_dir, 'train.csv'), is_training=True) train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, args.split_num, True) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # 如果无指定GPU就随机采样,如果指定了GPU就分布式采样 if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) # 准备dataloader train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size // args.gradient_accumulation_steps) # 训练steps num_train_optimization_steps = args.train_steps # Prepare optimizer 准备优化器 param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer] # no_dacay内的参数不参与权重衰减 # BN是固定C,[B,H,W]进行归一化处理(处理为均值0,方差1的正太分布上),适用于CNN # LN是固定N,[C,H,W]进行归一化处理,适用于RNN(BN适用于固定深度的前向神经网络,而RNN因输入序列长度不一致而深度不固定,因此BN不合适,而LN不依赖于batch的大小和输入sequence的深度,因此可以用于batchsize为1和RNN中对边长的输入sequence的normalize操作) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] # 配置优化器和warmup机制 optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=args.train_steps // args.gradient_accumulation_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_acc = 0 tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 bar = tqdm(range(num_train_optimization_steps), total=num_train_optimization_steps) train_dataloader = cycle(train_dataloader) # 循环遍历 # 先做一个eval for file in ['dev.csv']: inference_labels = [] gold_labels = [] inference_logits = [] eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=True) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) 
logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) # Run prediction for full data 准备验证集的dataloader eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) # 开启预测模式(不用dropout和BN) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: # 将数据放在GPU上 input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) # 禁止进行梯度更新 with torch.no_grad(): tmp_eval_loss, logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) # logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_logits = np.concatenate(inference_logits, 0) model.train() eval_loss = eval_loss / nb_eval_steps # 计算验证集的预测损失 eval_accuracy = accuracy(inference_logits, gold_labels) # 计算验证集的预测准确性 result = { 'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'global_step': global_step } # 将验证集的预测评价写入到evel_results.txt中 output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') # 如果当前训练的模型表现最佳,则保存该模型 if eval_accuracy > best_acc and 'dev' in file: print("=" * 80) print("Best F1", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80) model.train() # 分batch循环迭代训练模型 for step in bar: batch = next(train_dataloader) batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) nb_tr_examples += input_ids.size(0) del input_ids, input_mask, segment_ids, label_ids if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.fp16 and args.loss_scale != 1.0: loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() train_loss = round( tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4) bar.set_description("loss {}".format(train_loss)) nb_tr_steps += 1 # 用FP16去做反向传播 if args.fp16: optimizer.backward(loss) else: loss.backward() # 梯度累计后进行更新 if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() # 梯度更新 scheduler.step() # 梯度更新 optimizer.zero_grad() # 清空现有梯度,避免累计 global_step += 1 # 每隔args.eval_steps*args.gradient_accumulation_steps,打印训练过程中的结果 if (step + 1) % (args.eval_steps * args.gradient_accumulation_steps) == 0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) # 每隔args.eval_steps*args.gradient_accumulation_steps,预测验证集并评估结果 if args.do_eval and step > num_train_optimization_steps * args.not_do_eval_steps and ( step + 1) % (args.eval_steps * args.gradient_accumulation_steps) == 0: for file in ['dev.csv']: inference_labels = [] gold_labels = [] inference_logits = [] eval_examples = read_examples(os.path.join( args.data_dir, file), is_training=True) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field( eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field( eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader( eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) # logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_logits = np.concatenate(inference_logits, 0) model.train() eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracy(inference_logits, gold_labels) result = { 
'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'global_step': global_step, 'loss': train_loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy > best_acc and 'dev' in file: print("=" * 80) print("Best F1", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80) # 预测测试集 if args.do_test: del model gc.collect() # 清理内存 args.do_train = False # 停止训练 # 载入训练好的的最佳模型文件 model = BertForSequenceClassification.from_pretrained(os.path.join( args.output_dir, "pytorch_model.bin"), args, config=config) if args.fp16: # nn.Module中的half()方法将模型中的float32转化为float16 model.half() model.to(device) # 将模型放在GPU上 # 设置GPU训练方式 if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) # 预测验证集和测试集 for file, flag in [('dev.csv', 'dev'), ('CSC_test.csv', 'CSC_test'), ('NS_test.csv', 'NS_test')]: inference_labels = [] gold_labels = [] eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=False) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask).detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(logits) gold_labels.append(label_ids) gold_labels = np.concatenate(gold_labels, 0) logits = np.concatenate(inference_labels, 0) print(flag, accuracy(logits, gold_labels)) # 保存预测结果文件 if flag == 'CSC_test': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df[['qid', 'label_0', 'label_1']].to_csv(os.path.join(args.output_dir, "sub_CSC.csv"), index=False) if flag == 'NS_test': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df[['qid', 'label_0', 'label_1']].to_csv(os.path.join(args.output_dir, "sub_NS.csv"), 
index=False) if flag == 'dev': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df[['label_0', 'label_1']].to_csv(os.path.join(args.output_dir, "sub_dev.csv"), index=False) # 只预测验证集 if args.predict_eval: del model gc.collect() args.do_train = False model = BertForSequenceClassification.from_pretrained(os.path.join( args.output_dir, "pytorch_model.bin"), args, config=config) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) for file, flag in [('dev.csv', 'dev')]: inference_labels = [] gold_labels = [] eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=False) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask).detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(logits) gold_labels.append(label_ids) gold_labels = np.concatenate(gold_labels, 0) logits = np.concatenate(inference_labels, 0) print(flag, accuracy(logits, gold_labels)) if flag == 'dev': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df[['label_0', 'label_1']].to_csv(os.path.join(args.output_dir, "sub_dev.csv"), index=False)
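# Hedged sketch of the optimizer setup used in the training script above: bias and
# LayerNorm parameters are excluded from weight decay, and a linear warmup schedule
# wraps AdamW. It is written with current names (torch.optim.AdamW,
# get_linear_schedule_with_warmup) rather than the older WarmupLinearSchedule import
# the script uses, so treat it as an equivalent sketch, not the script's exact code.
import torch
from transformers import BertForSequenceClassification, get_linear_schedule_with_warmup

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=5e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=100,
                                            num_training_steps=1000)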
def test_eval(self): data = DATAMultiWOZ(debug=False, data_dir=self.data_dir) test_examples = data.read_examples( os.path.join(self.data_dir, 'test.json')) print('eval_examples的数量', len(test_examples)) ID = [x.guid for x in test_examples] test_features = data.convert_examples_to_features( test_examples, self.tokenizer, self.max_seq_length) all_input_ids = torch.tensor(data.select_field(test_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(data.select_field( test_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(data.select_field( test_features, 'segment_ids'), dtype=torch.long) eval_labels_domain = torch.tensor( [f.labels_domain for f in test_features], dtype=torch.long) eval_labels_dependcy = torch.tensor( [f.labels_dependcy for f in test_features], dtype=torch.long) test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, eval_labels_domain, eval_labels_dependcy) # Run prediction for full data test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=self.eval_batch_size) config = BertConfig.from_pretrained(self.model_name_or_path) model = BertForTokenClassification.from_pretrained(os.path.join( self.output_dir, "pytorch_model.bin"), self.args, config=config) model.to(self.device) model.eval() inference_labels = [] gold_labels_domain = [] gold_labels_dependcy = [] scores_domain = [] scores_dependcy = [] for input_ids, input_mask, segment_ids, label_domain, label_dependcy in test_dataloader: input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) label_domain = label_domain.to(self.device) label_dependcy = label_dependcy.to(self.device) with torch.no_grad(): logits_domain = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, ).view(-1, self.num_labels_domain).detach().cpu().numpy() logits_dependcy = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, ).view(-1, self.num_labels_dependcy).detach().cpu().numpy() label_domain = label_domain.view(-1).to('cpu').numpy() label_dependcy = label_dependcy.view(-1).to('cpu').numpy() scores_domain.append(logits_domain) scores_dependcy.append(logits_dependcy) gold_labels_domain.append(label_domain) gold_labels_dependcy.append(label_dependcy) gold_labels_domain = np.concatenate(gold_labels_domain, 0) gold_labels_depandcy = np.concatenate(gold_labels_dependcy, 0) scores_domain = np.concatenate(scores_domain, 0) scores_dependcy = np.concatenate(scores_dependcy, 0) # 计算评价指标 assert scores_domain.shape[0] == scores_dependcy.shape[ 0] == gold_labels_domain.shape[0] == gold_labels_depandcy.shape[0] eval_accuracy_domain = accuracyF1(scores_domain, gold_labels_domain, mode='domain', report=True) eval_accuracy_dependcy = accuracyF1(scores_dependcy, gold_labels_dependcy, mode='depency', report=True) print('eval_accuracy_domain', eval_accuracy_domain) print('eval_accuracy_dependcy', eval_accuracy_dependcy)
def train(self): if not os.path.exists(self.output_dir): os.makedirs(self.output_dir) # logger.info(f'Fold {split_index + 1}') train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader( ) num_train_optimization_steps = self.train_steps # Prepare model config = BertConfig.from_pretrained(self.model_name_or_path) model = BertForTokenClassification.from_pretrained( self.model_name_or_path, self.args, config=config) model.to(self.device) model.train() # Prepare optimizer param_optimizer = list(model.named_parameters()) param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': self.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=self.train_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", self.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_acc = 0 best_MRR = 0 tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 train_dataloader = cycle(train_dataloader) for step in range(num_train_optimization_steps): batch = next(train_dataloader) batch = tuple(t.to(self.device) for t in batch) input_ids, input_mask, segment_ids, label_domain, label_dependcy = batch loss_domain, loss_dependcy = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, label_domain=label_domain, label_dependcy=label_dependcy) loss = loss_domain + loss_dependcy tr_loss += loss.item() train_loss = round(tr_loss / (nb_tr_steps + 1), 4) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 loss.backward() if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() scheduler.step() global_step += 1 if (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) if self.do_eval and (step + 1) % ( self.eval_steps * self.gradient_accumulation_steps) == 0: for file in ['dev.csv']: inference_labels = [] gold_labels_domain = [] gold_labels_dependcy = [] inference_logits = [] scores_domain = [] scores_dependcy = [] ID = [x.guid for x in eval_examples] logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", self.eval_batch_size) model.eval() eval_loss_domain, eval_loss_dependcy, eval_accuracy_domain, eval_accuracy_dependcy = 0, 0, 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_domain, label_dependcy in eval_dataloader: input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) label_domain = label_domain.to(self.device) label_dependcy = label_dependcy.to(self.device) with torch.no_grad(): batch_eval_loss_domain, batch_eval_loss_dependcy = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, label_domain=label_domain, label_dependcy=label_dependcy) logits_domain, logits_dependcy 
= model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits_domain = logits_domain.view( -1, self.num_labels_domain).detach().cpu().numpy() logits_dependcy = logits_dependcy.view( -1, self.num_labels_dependcy).detach().cpu().numpy() label_domain = label_domain.view(-1).to('cpu').numpy() label_dependcy = label_dependcy.view(-1).to( 'cpu').numpy() scores_domain.append(logits_domain) scores_dependcy.append(logits_dependcy) gold_labels_domain.append(label_domain) gold_labels_dependcy.append(label_dependcy) eval_loss_domain += batch_eval_loss_domain.mean().item( ) eval_loss_dependcy += batch_eval_loss_dependcy.mean( ).item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels_domain = np.concatenate(gold_labels_domain, 0) gold_labels_dependcy = np.concatenate( gold_labels_dependcy, 0) scores_domain = np.concatenate(scores_domain, 0) scores_dependcy = np.concatenate(scores_dependcy, 0) model.train() eval_loss_domain = eval_loss_domain / nb_eval_steps eval_loss_dependcy = eval_loss_dependcy / nb_eval_steps eval_accuracy_domain = accuracyF1(scores_domain, gold_labels_domain, mode='domain') eval_accuracy_dependcy = accuracyF1(scores_dependcy, gold_labels_dependcy, mode='dependcy') print('eval_F1_domain', eval_accuracy_domain, 'eval_F1_dependcy', eval_accuracy_dependcy, 'global_step', global_step, 'loss', train_loss) result = { 'eval_loss_domain': eval_loss_domain, 'eval_loss_dependcy': eval_loss_dependcy, 'eval_F1_domain': eval_accuracy_domain, 'eval_F1_dependcy': eval_accuracy_dependcy, 'global_step': global_step, 'loss': train_loss } output_eval_file = os.path.join(self.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy_domain > best_acc: print("=" * 80) print("Best F1", eval_accuracy_domain) print("Saving Model......") # best_acc = eval_accuracy best_acc = eval_accuracy_domain # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model output_model_file = os.path.join( self.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80)
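# Hedged sketch of the checkpoint round trip used in train() above: unwrap a possible
# DataParallel module before saving a raw state_dict, then rebuild a model from the
# same config and load the weights back. Paths and the stock BertForTokenClassification
# class are illustrative; the original code uses its own subclass and extra arguments.
import os
import torch
from transformers import BertConfig, BertForTokenClassification

output_dir = "./output"    # hypothetical checkpoint directory
config = BertConfig.from_pretrained("bert-base-uncased")
model = BertForTokenClassification.from_pretrained("bert-base-uncased", config=config)

# save: strip the DataParallel wrapper if present
model_to_save = model.module if hasattr(model, 'module') else model
os.makedirs(output_dir, exist_ok=True)
torch.save(model_to_save.state_dict(), os.path.join(output_dir, "pytorch_model.bin"))

# reload: rebuild from the config, then load the saved weights
restored = BertForTokenClassification.from_pretrained("bert-base-uncased", config=config)
restored.load_state_dict(torch.load(os.path.join(output_dir, "pytorch_model.bin")))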
def test_eval(self): data = DATAMultiWOZ(debug=False, data_dir=self.data_dir) test_examples = data.read_examples( os.path.join(self.data_dir, 'test.json')) print('eval_examples的数量', len(test_examples)) dialogueID = [x.guid for x in test_examples] test_features = data.convert_examples_to_features( test_examples, self.tokenizer, self.max_seq_length) test_input_ids = torch.tensor(data.select_field( test_features, 'input_ids'), dtype=torch.long) test_input_mask = torch.tensor(data.select_field( test_features, 'input_mask'), dtype=torch.long) test_segment_ids = torch.tensor(data.select_field( test_features, 'segment_ids'), dtype=torch.long) test_utterance_mask = torch.tensor(data.select_field( test_features, 'utterance_mask'), dtype=torch.long) test_domainslot_mask = torch.tensor(data.select_field( test_features, 'domainslot_mask'), dtype=torch.long) test_label_tokens_start = torch.tensor( [f.label_tokens_start for f in test_features], dtype=torch.long) test_label_tokens_end = torch.tensor( [f.label_tokens_end for f in test_features], dtype=torch.long) test_label_sentence_domainslot = torch.tensor( [f.label_sentence_domainslot for f in test_features], dtype=torch.long) test_label_tokens_domainslot = torch.tensor( [f.label_tokens_domainslot for f in test_features], dtype=torch.long) test_hist_tokens = [f.hist_token for f in test_features] test_data = TensorDataset( test_input_ids, test_input_mask, test_segment_ids, test_utterance_mask, test_domainslot_mask, test_label_tokens_start, test_label_tokens_end, test_label_sentence_domainslot, test_label_tokens_domainslot) # Run prediction for full data test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=self.eval_batch_size) config = BertConfig.from_pretrained(self.model_name_or_path) model = BertForTokenClassification.from_pretrained(os.path.join( self.output_dir, "pytorch_model.bin"), self.args, config=config) model.to(self.device) model.eval() gold_labels_tokens_start = [] gold_labels_tokens_end = [] gold_label_sentence_domainslot = [] gold_label_tokens_domainslot = [] scores_tokens_start = [] scores_tokens_end = [] scores_sentence_domainslot = [] scores_tokens_domainslot = [] for input_ids, input_mask, segment_ids, \ utterance_mask, domainslot_mask, \ label_tokens_start, label_tokens_end, \ label_sentence_domainslot, label_tokens_domainslot in test_dataloader: input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) utterance_mask = utterance_mask.to(self.device) domainslot_mask = domainslot_mask.to(self.device) label_tokens_start = label_tokens_start.to(self.device) label_tokens_end = label_tokens_end.to(self.device) label_sentence_domainslot = label_sentence_domainslot.to( self.device) # print(label_sentence_domainslot.size()) # print(label_sentence_domainslot) label_tokens_domainslot = label_tokens_domainslot.to(self.device) with torch.no_grad(): batch_eval_loss_tokens_start, batch_eval_loss_tokens_end, batch_eval_loss_sentence_domainslot, batch_eval_loss_tokens_domainslot = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, utterance_mask=utterance_mask, domainslot_mask=domainslot_mask, label_tokens_start=label_tokens_start, label_tokens_end=label_tokens_end, label_sentence_domainslot=label_sentence_domainslot, label_tokens_domainslot=label_tokens_domainslot) logits_tokens_start, logits_tokens_end, logits_sentence_domainslot, logits_tokens_domainslot = model( input_ids=input_ids, 
token_type_ids=segment_ids, attention_mask=input_mask, utterance_mask=utterance_mask, domainslot_mask=domainslot_mask) logits_tokens_start = logits_tokens_start.view(-1, 2).cpu().numpy() logits_tokens_end = logits_tokens_end.view(-1, 2).cpu().numpy() logits_tokens_domainslot = logits_tokens_domainslot.view( -1, 2).detach().cpu().numpy() logits_sentence_domainslot = logits_sentence_domainslot.view( -1, 2).cpu().numpy() label_tokens_start = label_tokens_start.view(-1).to('cpu').numpy() label_tokens_end = label_tokens_end.view(-1).to('cpu').numpy() label_sentence_domainslot = label_sentence_domainslot.to( 'cpu').numpy() label_tokens_domainslot = label_tokens_domainslot.to('cpu').numpy() scores_tokens_start.append(logits_tokens_start) scores_tokens_end.append(logits_tokens_end) scores_sentence_domainslot.append(logits_sentence_domainslot) scores_tokens_domainslot.append(logits_tokens_domainslot) gold_labels_tokens_start.append(label_tokens_start) gold_labels_tokens_end.append(label_tokens_end) gold_label_sentence_domainslot.append(label_sentence_domainslot) gold_label_tokens_domainslot.append(label_tokens_domainslot) gold_labels_tokens_start = np.concatenate(gold_labels_tokens_start, 0) gold_labels_tokens_end = np.concatenate(gold_labels_tokens_end, 0) gold_label_sentence_domainslot = np.concatenate( gold_label_sentence_domainslot, 0) gold_label_tokens_domainslot = np.concatenate( gold_label_tokens_domainslot, 0) scores_tokens_start = np.concatenate(scores_tokens_start, 0) scores_tokens_end = np.concatenate(scores_tokens_end, 0) scores_sentence_domainslot = np.concatenate(scores_sentence_domainslot, 0) scores_tokens_domainslot = np.concatenate(scores_tokens_domainslot, 0) # 计算评价指标 # eval_accuracy_domain = accuracyF1(scores_domain, gold_labels_domain,mode='domain',report=True) # eval_accuracy_dependcy = accuracyF1(scores_dependcy, gold_labels_dependcy,mode='dependcy',report=True) eval_F1_tokenstart, eval_F1_tokenend, F1_sentence_domainslot, F1_token_domainslot = compute_jointGoal_domainslot_1_( dialogueID, test_hist_tokens, scores_tokens_start, scores_tokens_end, scores_sentence_domainslot, scores_tokens_domainslot, gold_labels_tokens_start, gold_labels_tokens_end, gold_label_sentence_domainslot, gold_label_tokens_domainslot) print('F1_token_domainslot', F1_token_domainslot, 'F1_sentence_domainslot', F1_sentence_domainslot, 'eval_F1_tokenstart', eval_F1_tokenstart, 'eval_F1_tokenend', eval_F1_tokenend)
aspect = self.aspect_bert(text_ids, attention_mask=text_att_mask)[0]
if self.tp == 'cdm':
    mask = torch.zeros_like(text_ids).float()  # B L
    for i, (s, e) in enumerate(pos):
        s = max(0, s.item() - self.SDR)
        e = min(e.item() + self.SDR + 1, len(aspect))
        mask[i, s:e] = torch.tensor([1.0] * (e - s))
    aspect = aspect * mask.unsqueeze(-1)
cat = torch.cat([text_asp, aspect], -1)  # B L 2H
cat = self.reduce2_bert_dim(cat)
x = self.aspect_self_att(cat)  # B L H
x = self.bert_pooler(cat)  # B H
out = self.reduce2_num_class_linear(x)
# , 'aspect_emphasize_att_acore_BhL': aspect_emphasize_att_score.squeeze().detach().cpu().numpy()}
return {'output': out}


if __name__ == "__main__":
    conf = BertConfig.from_pretrained('/mnt/sda1/bert/uncased_L-12_H-768_A-12')
    m = MY_BERT_LCF(conf)
    input_ids = torch.ones((16, 80)).long()
    attention_mask = torch.ones_like(input_ids)
    text_asp_ids, text_asp_att_mask = copy.deepcopy(input_ids), copy.deepcopy(attention_mask)
    pos = torch.LongTensor([[1, 4] for _ in range(16)])
    print(m.forward(text_asp_ids, text_asp_att_mask, input_ids, attention_mask,
                    pos)['output'].size())
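# Small standalone illustration of the CDM-style mask built in the forward fragment
# above: tokens within SDR positions of the aspect span keep their hidden states and
# everything else is zeroed. The tensor sizes and the SDR value are toy assumptions.
import torch

B, L, H, SDR = 2, 10, 4, 2
hidden = torch.randn(B, L, H)
pos = torch.tensor([[3, 4], [6, 6]])           # (start, end) token index of each aspect

mask = torch.zeros(B, L)
for i, (s, e) in enumerate(pos):
    s = max(0, s.item() - SDR)
    e = min(e.item() + SDR + 1, L)
    mask[i, s:e] = 1.0

masked_hidden = hidden * mask.unsqueeze(-1)    # (B, L, H); context outside the window is zeroed
print(mask)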
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument("--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--meta_path", default=None, type=str, required=False, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument( '--classifier', default='guoday', type=str, required=True, help='classifier type, guoday or MLP or GRU_MLP or ...') parser.add_argument('--optimizer', default='RAdam', type=str, required=True, help='optimizer we use, RAdam or ...') parser.add_argument("--do_label_smoothing", default='yes', type=str, required=True, help="Whether to do label smoothing. yes or no.") parser.add_argument('--draw_loss_steps', default=1, type=int, required=True, help='training steps to draw loss') parser.add_argument('--label_name', default='label', type=str, required=True, help='label name in original train set index') ## Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name") parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.") parser.add_argument("--do_train", default='yes', type=str, required=True, help="Whether to run training. yes or no.") parser.add_argument("--do_test", default='yes', type=str, required=True, help="Whether to run training. yes or no.") parser.add_argument("--do_eval", default='yes', type=str, required=True, help="Whether to run eval on the dev set. yes or no.") parser.add_argument( "--evaluate_during_training", action='store_true', help="Rul evaluation during training at each logging step.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." 
) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. Override num_train_epochs." ) parser.add_argument("--eval_steps", default=200, type=int, help="") parser.add_argument("--lstm_hidden_size", default=300, type=int, help="") parser.add_argument("--lstm_layers", default=2, type=int, help="") parser.add_argument("--dropout", default=0.5, type=float, help="") parser.add_argument("--train_steps", default=-1, type=int, help="") parser.add_argument("--report_steps", default=-1, type=int, help="") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--split_num", default=3, type=int, help="text split") parser.add_argument('--logging_steps', type=int, default=50, help="Log every X updates steps.") parser.add_argument('--save_steps', type=int, default=50, help="Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action='store_true', help= "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number" ) parser.add_argument("--no_cuda", action='store_true', help="Avoid using CUDA when available") parser.add_argument('--overwrite_output_dir', action='store_true', help="Overwrite the content of the output directory") parser.add_argument( '--overwrite_cache', action='store_true', help="Overwrite the cached training and evaluation sets") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--fp16', action='store_true', help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit" ) parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
"See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") args = parser.parse_args() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl') args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) # Set seed set_seed(args) try: os.makedirs(args.output_dir) except: pass tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case) # tensorboard_log_dir = args.output_dir # loss_now = tf.placeholder(dtype=tf.float32, name='loss_now') # loss_mean = tf.placeholder(dtype=tf.float32, name='loss_mean') # loss_now_variable = loss_now # loss_mean_variable = loss_mean # train_loss = tf.summary.scalar('train_loss', loss_now_variable) # dev_loss_mean = tf.summary.scalar('dev_loss_mean', loss_mean_variable) # merged = tf.summary.merge([train_loss, dev_loss_mean]) config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=3) config.hidden_dropout_prob = args.dropout # Prepare model if args.do_train == 'yes': model = BertForSequenceClassification.from_pretrained( args.model_name_or_path, args, config=config) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) if args.do_train == 'yes': print( '________________________now training______________________________' ) # Prepare data loader train_examples = read_examples(os.path.join(args.data_dir, 'train.csv'), is_training=True, label_name=args.label_name) train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, args.split_num, True) # print('train_feature_size=', train_features.__sizeof__()) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # print('train_data=',train_data[0]) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size // args.gradient_accumulation_steps) num_train_optimization_steps = args.train_steps # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.optimizer == 'RAdam': optimizer = RAdam(optimizer_grouped_parameters, lr=args.learning_rate) else: optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=args.train_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_acc = 0 model.train() tr_loss = 0 loss_batch = 0 nb_tr_examples, nb_tr_steps = 0, 0 bar = tqdm(range(num_train_optimization_steps), total=num_train_optimization_steps) train_dataloader = cycle(train_dataloader) # with tf.Session() as sess: # summary_writer = tf.summary.FileWriter(tensorboard_log_dir, sess.graph) # sess.run(tf.global_variables_initializer()) list_loss_mean = [] bx = [] eval_F1 = [] ax = [] for step in bar: batch = next(train_dataloader) batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() loss_batch += loss.item() train_loss = round( tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4) bar.set_description("loss {}".format(train_loss)) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: # optimizer.backward(loss) loss.backward() else: loss.backward() # draw loss every n docs if (step + 1) % int(args.draw_loss_steps / (args.train_batch_size / args.gradient_accumulation_steps)) == 0: list_loss_mean.append(round(loss_batch, 4)) bx.append(step + 1) plt.plot(bx, list_loss_mean, label='loss_mean', linewidth=1, color='b', marker='o', markerfacecolor='green', markersize=2) plt.savefig(args.output_dir + '/labeled.jpg') loss_batch = 0 # paras update every batch data. if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 # report results every 200 real batch. if step % (args.eval_steps * args.gradient_accumulation_steps) == 0 and step > 0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) # do evaluation totally 10 times during training stage. if args.do_eval == 'yes' and (step + 1) % int( num_train_optimization_steps / 10) == 0 and step > 450: for file in ['dev.csv']: inference_labels = [] gold_labels = [] inference_logits = [] eval_examples = read_examples(os.path.join( args.data_dir, file), is_training=True, label_name=args.label_name) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field( eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field( eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader( eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += 
tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_labels = np.concatenate(inference_labels, 0) inference_logits = np.concatenate(inference_logits, 0) model.train() ############################################### num_gold_0 = np.sum(gold_labels == 0) num_gold_1 = np.sum(gold_labels == 1) num_gold_2 = np.sum(gold_labels == 2) right_0 = 0 right_1 = 0 right_2 = 0 error_0 = 0 error_1 = 0 error_2 = 0 for gold_label, inference_label in zip( gold_labels, inference_labels): if gold_label == inference_label: if gold_label == 0: right_0 += 1 elif gold_label == 1: right_1 += 1 else: right_2 += 1 elif inference_label == 0: error_0 += 1 elif inference_label == 1: error_1 += 1 else: error_2 += 1 recall_0 = right_0 / (num_gold_0 + 1e-5) recall_1 = right_1 / (num_gold_1 + 1e-5) recall_2 = right_2 / (num_gold_2 + 1e-5) precision_0 = right_0 / (error_0 + right_0 + 1e-5) precision_1 = right_1 / (error_1 + right_1 + 1e-5) precision_2 = right_2 / (error_2 + right_2 + 1e-5) f10 = 2 * precision_0 * recall_0 / (precision_0 + recall_0 + 1e-5) f11 = 2 * precision_1 * recall_1 / (precision_1 + recall_1 + 1e-5) f12 = 2 * precision_2 * recall_2 / (precision_2 + recall_2 + 1e-5) output_dev_result_file = os.path.join( args.output_dir, "dev_results.txt") with open(output_dev_result_file, 'a', encoding='utf-8') as f: f.write('precision:' + str(precision_0) + ' ' + str(precision_1) + ' ' + str(precision_2) + '\n') f.write('recall:' + str(recall_0) + ' ' + str(recall_1) + ' ' + str(recall_2) + '\n') f.write('f1:' + str(f10) + ' ' + str(f11) + ' ' + str(f12) + '\n' + '\n') eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracy(inference_logits, gold_labels) # draw loss. eval_F1.append(round(eval_accuracy, 4)) ax.append(step) plt.plot(ax, eval_F1, label='eval_F1', linewidth=1, color='r', marker='o', markerfacecolor='blue', markersize=2) for a, b in zip(ax, eval_F1): plt.text(a, b, b, ha='center', va='bottom', fontsize=8) plt.savefig(args.output_dir + '/labeled.jpg') result = { 'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'global_step': global_step, 'loss': train_loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy > best_acc and 'dev' in file: print("=" * 80) print("more accurate model arises, now best F1 = ", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model, only save the model it-self model_to_save = model.module if hasattr( model, 'module') else model output_model_file = os.path.join( args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) ''' if (step+1) / int(num_train_optimization_steps/10) > 9.5: print("=" * 80) print("End of training. 
Saving Model......") # Save a trained model, only save the model it-self model_to_save = model.module if hasattr(model, 'module') else model output_model_file = os.path.join(args.output_dir, "pytorch_model_final_step.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) ''' if args.do_test == 'yes': start_time = time.time() print( '___________________now testing for best eval f1 model_________________________' ) try: del model except: pass gc.collect() args.do_train = 'no' model = BertForSequenceClassification.from_pretrained(os.path.join( args.output_dir, "pytorch_model.bin"), args, config=config) model.half() for layer in model.modules(): if isinstance(layer, torch.nn.modules.batchnorm._BatchNorm): layer.float() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from " "https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) for file, flag in [('test.csv', 'test')]: inference_labels = [] gold_labels = [] eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=False, label_name=args.label_name) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask).detach().cpu().numpy() # print('test_logits=', logits) label_ids = label_ids.to('cpu').numpy() inference_labels.append(logits) gold_labels.append(label_ids) gold_labels = np.concatenate(gold_labels, 0) logits = np.concatenate(inference_labels, 0) if flag == 'dev': print(flag, accuracy(logits, gold_labels)) elif flag == 'test': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df['label_2'] = logits[:, 2] df[['id', 'label_0', 'label_1', 'label_2']].to_csv(os.path.join(args.output_dir, "sub.csv"), index=False) # df[['id', 'label_0', 'label_1']].to_csv(os.path.join(args.output_dir, "sub.csv"), index=False) else: raise ValueError('flag not in [dev, test]') print('inference time usd = {}s'.format(time.time() - start_time)) '''
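The dev-set evaluation above computes per-class precision, recall and F1 with explicit counters (right_0, error_0, and so on). The same numbers can be obtained more compactly; below is a sketch of an equivalent vectorised computation in numpy, using the same 1e-5 smoothing term (the function name is illustrative, not part of the script):

import numpy as np

def per_class_prf(gold_labels, inference_labels, num_classes=3, eps=1e-5):
    # Per-class precision / recall / F1, mirroring the counter-based logic above.
    gold = np.asarray(gold_labels)
    pred = np.asarray(inference_labels)
    results = []
    for c in range(num_classes):
        tp = np.sum((pred == c) & (gold == c))   # right_c in the script
        fp = np.sum((pred == c) & (gold != c))   # error_c in the script
        fn = np.sum((pred != c) & (gold == c))
        precision = tp / (tp + fp + eps)
        recall = tp / (tp + fn + eps)
        f1 = 2 * precision * recall / (precision + recall + eps)
        results.append((precision, recall, f1))
    return results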
# Date: 2020/12/4
# Author: Qianqian Peng
from mention_detection.mention_detection import load_model
import torch
from transformers import BertTokenizer as BertTokenizer_new
from transformers import BertConfig as BertConfig_new
from transformers import BertModel as BertModel_new
import torch.nn as nn
from pytorch_transformers.modeling_bert import (
    BertPreTrainedModel,
    BertConfig,
    BertModel,
)

# Load the same checkpoint once with the current `transformers` package and once
# with the legacy `pytorch_transformers` package.
bert_new = BertModel_new.from_pretrained(
    './model/bert-large-uncased',
    config=BertConfig_new.from_pretrained('bert-large-uncased'))
bert_old = BertModel.from_pretrained(
    './model/bert-large-uncased',
    config=BertConfig.from_pretrained('bert-large-uncased'))
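The snippet above loads the same ./model/bert-large-uncased checkpoint through both the new and the legacy API, presumably to check that the migration preserves the weights. A minimal sketch of such a comparison (the helper name and tolerance are illustrative; whether every parameter name lines up exactly depends on the two library versions):

import torch

def checkpoints_match(model_a, model_b, atol=1e-7):
    # Compare the two models parameter by parameter.
    params_a = dict(model_a.named_parameters())
    params_b = dict(model_b.named_parameters())
    if params_a.keys() != params_b.keys():
        return False
    return all(torch.allclose(params_a[name], params_b[name], atol=atol)
               for name in params_a)

# print(checkpoints_match(bert_new, bert_old))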
# if os.path.exists(data_path) is not True:
#     os.mkdir(data_path)
# Make the data: build train/dev from data_path unless both files already exist.
# if (os.path.exists(train_path) and os.path.exists(dev_path)) is not True:
#     make_data(data_path, train_path, dev_path)
if args.limit_vocab:
    vocab_path = os.path.join(args.folder_path, args.limit_vocabulary_name)
else:
    vocab_path = os.path.join(args.folder_path, args.vocab_name)
# Create the tokenizer.
# Pretraining with the vocab file previously failed: the exported vocab came out garbled.
tokenizer = BertTokenizer.from_pretrained(vocab_path)
config_path = os.path.join(args.folder_path, args.config_name)
config = BertConfig.from_pretrained(config_path)
if args.limit_vocab:
    config.vocab_size = tokenizer.vocab_size
rouge = Rouge()
# if os.path.exists(save_path) is not True:
#     os.mkdir(save_path)
dev_data = pd.read_csv(dev_path, encoding='utf-8')
dev_texts = list(dev_data['text'].values)
dev_summaries = list(dev_data['summarization'].values)
summary_all = []
for summary in dev_summaries:
    # Insert a space after every character so ROUGE treats each character as a token.
    summary = ''.join([word + ' ' for words in summary for word in words])
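Spacing out the characters as above is the usual preparation for scoring Chinese summaries with the rouge package, which splits hypotheses and references on whitespace. A sketch of how the Rouge instance created above might then be used (generated summaries are not part of the snippet, so the references are reused here purely to demonstrate the call):

def char_spaced(text):
    # Mirror the preprocessing above: one space after every character.
    return ''.join(ch + ' ' for ch in text)

# `generated` would normally come from the summarization model; reusing the
# references here only demonstrates the call (and trivially yields perfect scores).
generated = dev_summaries
hyps = [char_spaced(h) for h in generated]
refs = [char_spaced(r) for r in dev_summaries]
scores = rouge.get_scores(hyps, refs, avg=True)
print(scores['rouge-l'])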
def main():
    batch_size = 32
    max_seq_len = 128
    n_epochs = 3
    bert_model = 'bert-base-uncased'
    learning_rate = 3e-5
    adam_epsilon = 1e-8
    warmup_steps = 0
    num_labels = 1
    output_dir = "fine_tuned--{0}--SEQ_LEN={1}--BATCH_SIZE={2}--HEAD={3}".format(
        bert_model, max_seq_len, batch_size, num_labels)
    dataset_dir = os.path.join("dataset", "custom_training_set.csv")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    config = BertConfig.from_pretrained(bert_model)
    config.num_labels = num_labels
    tokenizer = BertTokenizer.from_pretrained(bert_model)
    # Load the pretrained weights; constructing the model from the config alone
    # would start fine-tuning from a randomly initialised encoder.
    model = BertForSequenceClassification.from_pretrained(bert_model, config=config)
    model.to(device)

    train_dataset = Dataset(dataset_dir, tokenizer, max_seq_len)
    num_train_optimization_steps = int(len(train_dataset) / batch_size) * n_epochs

    # Apply weight decay to everything except biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
                                     t_total=num_train_optimization_steps)

    train_sampler = data.RandomSampler(train_dataset)
    train_dataloader = data.DataLoader(train_dataset, sampler=train_sampler,
                                       batch_size=batch_size)

    model.train()
    for _ in trange(n_epochs, desc="Epoch"):
        for batch in tqdm(train_dataloader, desc="Iteration"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, labels = batch
            # Pass the tensors by keyword so the mask, segment ids and labels
            # cannot be silently mapped onto the wrong positional arguments.
            outputs = model(input_ids,
                            attention_mask=input_mask,
                            token_type_ids=segment_ids,
                            labels=labels)
            loss = outputs[0]
            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
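Once save_pretrained has run, the fine-tuned classifier can be reloaded from output_dir for inference. A minimal sketch under the same pytorch_transformers-era API assumed above (the import path, helper names and example usage are illustrative; with num_labels=1 the head emits a single score):

import torch
from pytorch_transformers import BertTokenizer, BertForSequenceClassification

def load_finetuned(output_dir, device):
    tokenizer = BertTokenizer.from_pretrained(output_dir)
    model = BertForSequenceClassification.from_pretrained(output_dir)
    model.to(device)
    model.eval()
    return tokenizer, model

def score(text, tokenizer, model, device, max_seq_len=128):
    # Build [CLS] tokens [SEP] by hand so the sketch works with older tokenizer APIs.
    tokens = ['[CLS]'] + tokenizer.tokenize(text)[:max_seq_len - 2] + ['[SEP]']
    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)], device=device)
    with torch.no_grad():
        logits = model(input_ids)[0]
    return logits.squeeze().item()   # num_labels == 1, so a single value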
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument("--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--meta_path", default=None, type=str, required=False, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name") parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_test", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--evaluate_during_training", action='store_true', help="Rul evaluation during training at each logging step.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. Override num_train_epochs." 
) parser.add_argument("--eval_steps", default=-1, type=int, help="") parser.add_argument("--lstm_hidden_size", default=300, type=int, help="") parser.add_argument("--lstm_layers", default=2, type=int, help="") parser.add_argument("--lstm_dropout", default=0.5, type=float, help="") parser.add_argument("--train_steps", default=-1, type=int, help="") parser.add_argument("--report_steps", default=-1, type=int, help="") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--split_num", default=3, type=int, help="text split") parser.add_argument('--logging_steps', type=int, default=50, help="Log every X updates steps.") parser.add_argument('--save_steps', type=int, default=50, help="Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action='store_true', help= "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number" ) parser.add_argument("--no_cuda", action='store_true', help="Avoid using CUDA when available") parser.add_argument('--overwrite_output_dir', action='store_true', help="Overwrite the content of the output directory") parser.add_argument( '--overwrite_cache', action='store_true', help="Overwrite the cached training and evaluation sets") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--fp16', action='store_true', help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit" ) parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") args = parser.parse_args() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl') args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) # Set seed set_seed(args) try: os.makedirs(args.output_dir) except: pass tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case) config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=3) # Prepare model model = BertForSequenceClassification.from_pretrained( args.model_name_or_path, args, config=config) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 
training." ) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) if args.do_train: # Prepare data loader train_examples = read_examples(os.path.join(args.data_dir, 'train.csv'), is_training=True) train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, args.split_num, True) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size // args.gradient_accumulation_steps) num_train_optimization_steps = args.train_steps # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=args.train_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_acc = 0 model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 bar = tqdm(range(num_train_optimization_steps), total=num_train_optimization_steps) train_dataloader = cycle(train_dataloader) for step in bar: batch = next(train_dataloader) batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.fp16 and args.loss_scale != 1.0: loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() train_loss = round( tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4) bar.set_description("loss {}".format(train_loss)) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: optimizer.backward(loss) else: loss.backward() if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 if (step + 1) % (args.eval_steps * args.gradient_accumulation_steps) == 0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) if args.do_eval and (step + 1) % ( args.eval_steps * args.gradient_accumulation_steps) == 0: for file in ['dev.csv']: inference_labels = [] gold_labels = [] inference_logits = [] eval_examples = read_examples(os.path.join( args.data_dir, file), is_training=True) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field( eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field( eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader( eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_logits = np.concatenate(inference_logits, 0) model.train() eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracy(inference_logits, gold_labels) result = { 'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'global_step': global_step, 'loss': train_loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in 
sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy > best_acc and 'dev' in file: print("=" * 80) print("Best F1", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80) if args.do_test: del model gc.collect() args.do_train = False model = BertForSequenceClassification.from_pretrained(os.path.join( args.output_dir, "pytorch_model.bin"), args, config=config) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) for file, flag in [('dev.csv', 'dev'), ('test.csv', 'test')]: inference_labels = [] gold_labels = [] eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=False) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask).detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(logits) gold_labels.append(label_ids) gold_labels = np.concatenate(gold_labels, 0) logits = np.concatenate(inference_labels, 0) print(flag, accuracy(logits, gold_labels)) if flag == 'test': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df['label_2'] = logits[:, 2] df[['id', 'label_0', 'label_1', 'label_2']].to_csv(os.path.join(args.output_dir, "sub.csv"), index=False)
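Both versions of the script shape the learning rate with WarmupLinearSchedule(optimizer, warmup_steps=..., t_total=args.train_steps): the rate climbs linearly from zero during the warmup steps and then decays linearly back to zero at t_total. A sketch of the multiplier this schedule applies to the base learning rate (edge-case handling may differ slightly across pytorch_transformers versions):

def warmup_linear_multiplier(step, warmup_steps, t_total):
    # Factor applied to the base learning rate at a given optimizer step.
    if step < warmup_steps:
        return float(step) / float(max(1, warmup_steps))
    return max(0.0, float(t_total - step) / float(max(1.0, t_total - warmup_steps)))

# With warmup_steps=0 the rate simply decays linearly from learning_rate to zero over train_steps.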
def train(self): if not os.path.exists(self.output_dir): os.makedirs(self.output_dir) # logger.info(f'Fold {split_index + 1}') train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader() num_train_optimization_steps = self.train_steps # Prepare model config = BertConfig.from_pretrained(self.model_name_or_path) model = BertForTokenClassification.from_pretrained(self.model_name_or_path,self.args, config=config) model.to(self.device) model.train() # Prepare optimizer param_optimizer = list(model.named_parameters()) param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': self.weight_decay}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=self.train_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", self.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_acc = 0 best_MRR = 0 tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 train_dataloader = cycle(train_dataloader) for step in range(num_train_optimization_steps): batch = next(train_dataloader) batch = tuple(t.to(self.device) for t in batch) input_ids,input_mask,segment_ids,\ utterance_mask,domain_mask, \ slot_mask,hist_mask,\ label_value_start,label_value_end,\ label_domainslot = batch loss_tokenstart,loss_tokenend,loss_domainslot = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, utterance_mask = utterance_mask, domain_mask = domain_mask, slot_mask = slot_mask, hist_mask = hist_mask, label_value_start=label_value_start, label_value_end = label_value_end, label_domainslot = label_domainslot ) loss = loss_tokenstart + loss_tokenend + loss_domainslot # loss = loss_domainslot tr_loss += loss.item() train_loss = round(tr_loss / (nb_tr_steps + 1), 4) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 loss.backward() if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() scheduler.step() global_step += 1 if (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) if self.do_eval and (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0: for file in ['de.csv']: gold_value_start = [] gold_value_end = [] gold_domainslot = [] scores_value_start = [] scores_value_end = [] scores_domainslot = [] dialogueID = [x.guid for x in eval_examples] utterance_text = [x.text_eachturn for x in eval_examples] logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", self.eval_batch_size) model.eval() eval_loss_tokens_start,eval_loss_tokens_end,eval_loss_domainslot = 0,0,0 eval_F1_tokens_start,eval_F1_tokens_end = 0,0 eval_F1_sentence_domainslot,eval_F1_tokens_domainslot = 0,0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids,input_mask, segment_ids,\ utterance_mask,domain_mask, \ slot_mask,hist_mask,\ 
label_value_start,label_value_end,\ label_domainslot in eval_dataloader: input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) utterance_mask = utterance_mask.to(self.device) domain_mask = domain_mask.to(self.device) slot_mask = slot_mask.to(self.device) hist_mask = hist_mask.to(self.device) label_value_start = label_value_start.to(self.device) label_value_end = label_value_end.to(self.device) label_domainslot = label_domainslot.to(self.device) with torch.no_grad(): batch_eval_loss_value_start,batch_eval_loss_value_end,batch_eval_loss_domainslot = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, utterance_mask = utterance_mask, domain_mask = domain_mask, slot_mask = slot_mask, hist_mask = hist_mask, label_value_start = label_value_start, label_value_end=label_value_end, label_domainslot=label_domainslot ) logits_value_start,logits_value_end,logits_domainslot = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, utterance_mask = utterance_mask, domain_mask = domain_mask, slot_mask = slot_mask, hist_mask = hist_mask, ) logits_value_start = logits_value_start.cpu().numpy() logits_value_end = logits_value_end.cpu().numpy() logits_domainslot = logits_domainslot.cpu().numpy() label_value_start = label_value_start.to('cpu').numpy() label_value_end = label_value_end.to('cpu').numpy() label_domainslot = label_domainslot.to('cpu').numpy() scores_value_start.append(logits_value_start) scores_value_end.append(logits_value_end) scores_domainslot.append(logits_domainslot) gold_value_start.append(label_value_start) gold_value_end.append(label_value_end) gold_domainslot.append(label_domainslot) eval_loss_tokens_start += batch_eval_loss_value_start.mean().item() eval_loss_tokens_end += batch_eval_loss_value_end.mean().item() eval_loss_domainslot += batch_eval_loss_domainslot.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_value_start = np.concatenate(gold_value_start,0) gold_value_end = np.concatenate(gold_value_end,0) gold_domainslot = np.concatenate(gold_domainslot,0) scores_value_start = np.concatenate(scores_value_start, 0) scores_value_end = np.concatenate(scores_value_end, 0) scores_domainslot = np.concatenate(scores_domainslot,0) model.train() eval_loss_tokens_start = eval_loss_tokens_start/nb_eval_steps eval_loss_tokens_end = eval_loss_tokens_end / nb_eval_steps eval_loss_domainslot = eval_loss_domainslot /nb_eval_steps # print(scores_domainslot.shape) # print(gold_labels_domainslot.shape) # print(scores_domainslot) # print(gold_labels_domainslot) # exit() # eval_accuracy_token_start = accuracyF1(scores_domain, gold_labels_domain,mode='domain') # eval_accuracy_token_end = accuracyF1(scores_dependcy, gold_labels_dependcy ,mode= 'dependcy') eval_F1_valuestart,eval_F1_valueend,F1_domainslot = compute_jointGoal_domainslot( dialogueID, utterance_text, scores_value_start, scores_value_end, scores_domainslot, gold_value_start, gold_value_end, gold_domainslot ) print( 'F1_domainslot',F1_domainslot, 'eval_F1_valuestart',eval_F1_valuestart, 'eval_F1_valueend', eval_F1_valueend, 'global_step',global_step, 'loss',train_loss ) result = { 'eval_loss_tokens_start':eval_loss_tokens_start, 'eval_loss_tokens_end': eval_loss_tokens_end, 'eval_loss_domainslot':eval_loss_domainslot, 'F1_domainslot': F1_domainslot, 'eval_F1_valuestart': eval_F1_valuestart, 'eval_F1_valueend': eval_F1_valueend, 'global_step': global_step, 'loss': train_loss} output_eval_file = 
os.path.join(self.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_F1_valuestart > best_acc : print("=" * 80) print("Best jointGoal", eval_F1_valuestart) print("Saving Model......") # best_acc = eval_accuracy best_acc = eval_F1_valuestart # Save a trained model model_to_save = model.module if hasattr(model,'module') else model output_model_file = os.path.join(self.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80)
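The training loops in this section run for a fixed number of optimizer steps rather than whole epochs, which is why the DataLoader is wrapped in cycle(...). A self-contained sketch of that pattern (toy data and step count are placeholders); note that itertools.cycle caches the batches from the first pass, so later passes replay the same batches rather than reshuffling:

from itertools import cycle

import torch
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(torch.arange(10).unsqueeze(1))   # placeholder data
loader = cycle(DataLoader(dataset, batch_size=4, shuffle=True))

train_steps = 7   # placeholder; the scripts use args.train_steps / self.train_steps
for step in range(train_steps):
    (batch,) = next(loader)   # keeps yielding batches once the loader is exhausted
    # ... forward pass, loss.backward(), optimizer.step() would go here ...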
def test_eval(self):
    data = DATAMultiWOZ(
        debug=False,
        data_dir=self.data_dir
    )
    test_examples = data.read_examples(os.path.join(self.data_dir, 'test.tsv'))
    print('number of eval examples:', len(test_examples))
    ID = [x.guid for x in test_examples]
    test_features = data.convert_examples_to_features(test_examples, self.tokenizer, self.max_seq_length)
    all_input_ids = torch.tensor(data.select_field(test_features, 'input_ids'), dtype=torch.long)
    all_input_mask = torch.tensor(data.select_field(test_features, 'input_mask'), dtype=torch.long)
    all_segment_ids = torch.tensor(data.select_field(test_features, 'segment_ids'), dtype=torch.long)
    all_utterance_mask = torch.tensor(data.select_field(test_features, 'utterance_mask'), dtype=torch.long)
    all_response_mask = torch.tensor(data.select_field(test_features, 'response_mask'), dtype=torch.long)
    all_history_mask = torch.tensor(data.select_field(test_features, 'history_mask'), dtype=torch.long)
    all_label = torch.tensor([f.label for f in test_features], dtype=torch.long)
    test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                              all_utterance_mask, all_response_mask, all_history_mask,
                              all_label)

    # Run prediction for full data
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=self.eval_batch_size)

    config = BertConfig.from_pretrained(self.model_name_or_path, num_labels=self.num_labels)
    model = BertForSequenceClassification.from_pretrained(
        os.path.join(self.output_dir, "pytorch_model.bin"), self.args, config=config)
    model.to(self.device)
    model.eval()

    inference_labels = []
    gold_labels = []
    scores = []
    # The dataset yields seven tensors per example; the extra masks are not needed
    # for this forward pass, but they must still be unpacked from each batch.
    for input_ids, input_mask, segment_ids, utterance_mask, response_mask, history_mask, label_ids in test_dataloader:
        input_ids = input_ids.to(self.device)
        input_mask = input_mask.to(self.device)
        segment_ids = segment_ids.to(self.device)
        label_ids = label_ids.to(self.device)
        with torch.no_grad():
            logits = model(
                input_ids=input_ids,
                token_type_ids=segment_ids,
                attention_mask=input_mask,
            ).detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()
        scores.append(logits)
        inference_labels.append(np.argmax(logits, axis=1))
        gold_labels.append(label_ids)

    gold_labels = np.concatenate(gold_labels, 0)
    scores = np.concatenate(scores, 0)
    logits = np.concatenate(inference_labels, 0)
    # Compute evaluation metrics.
    assert len(ID) == scores.shape[0] == gold_labels.shape[0]
    eval_accuracy = accuracyF1(logits, gold_labels)
    # eval_DOUBAN_MRR, eval_DOUBAN_mrr, eval_DOUBAN_MAP, eval_Precision1 = compute_DOUBAN(ID, scores, gold_labels)
    # print(
    #     'eval_MRR', eval_DOUBAN_MRR, eval_DOUBAN_mrr,
    #     'eval_MAP', eval_DOUBAN_MAP,
    #     'eval_Precision1', eval_Precision1)
    print('F1', eval_accuracy)
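test_eval reports a single F1 number through the project's accuracyF1 helper, whose definition is not shown here. If it is a macro-averaged F1 over the predicted class indices (an assumption), an equivalent could be written with scikit-learn:

from sklearn.metrics import f1_score

def accuracy_f1(preds, labels, average='macro'):
    # Hypothetical stand-in for the project's accuracyF1 helper.
    return f1_score(labels, preds, average=average)

# Usage mirroring the method above, where `logits` already holds argmax-ed class indices:
# eval_accuracy = accuracy_f1(logits, gold_labels)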