def et(et_dataloader, max_et_unseen_acc, et_label_list,
       et_hypo_seen_str_indicator, et_hypo_2_type_index):
    """Evaluate the entailment classifier on one dataloader; track best unseen F1.

    Runs the (module-global) ``model`` over ``et_dataloader``, turns the
    entail/not-entail logits into binary decisions (a "harsh" variant that
    requires a 0.1 margin and a "loose" plain argmax variant), and scores
    them with ``evaluate_emotion_zeroshot_TwpPhasePred``.

    Args:
        et_dataloader: yields (input_ids, input_mask, segment_ids, label_ids).
        max_et_unseen_acc: best unseen-F1 observed so far.
        et_label_list, et_hypo_seen_str_indicator, et_hypo_2_type_index:
            metadata forwarded unchanged to the zero-shot scorer.

    Returns:
        The (possibly updated) best unseen-F1.
    """
    model.eval()
    et_loss, et_step = 0.0, 0
    batch_logits = []  # one numpy array per batch; concatenated once below
    for input_ids, input_mask, segment_ids, label_ids in et_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)
        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask, labels=None)[0]
            tmp_et_loss = CrossEntropyLoss()(logits.view(-1, num_labels),
                                             label_ids.view(-1))
        et_loss += tmp_et_loss.mean().item()
        et_step += 1
        # detach() drops autograd history; cpu() copies back to host memory.
        batch_logits.append(logits.detach().cpu().numpy())
    et_loss = et_loss / et_step
    # Single O(n) concatenation instead of the original quadratic
    # per-batch np.append growth.
    preds = np.concatenate(batch_logits, axis=0)
    # preds: size*2 (entail, not_entail); softmax turns each row into a
    # probability vector, column 0 is the "entail" probability.
    pred_probs = softmax(preds, axis=1)[:, 0]
    pred_binary_labels_harsh = []
    pred_binary_labels_loose = []
    for row in preds:
        # harsh: predict "entail" (0) only when it beats "not entail" by 0.1
        pred_binary_labels_harsh.append(0 if row[0] > row[1] + 0.1 else 1)
        pred_binary_labels_loose.append(0 if row[0] > row[1] else 1)
    seen_acc, unseen_acc = evaluate_emotion_zeroshot_TwpPhasePred(
        pred_probs, pred_binary_labels_harsh, pred_binary_labels_loose,
        et_label_list, et_hypo_seen_str_indicator, et_hypo_2_type_index,
        seen_types)
    # (the original's unused "loss = train_loss / train_step" was dead code
    # and has been removed)
    if unseen_acc > max_et_unseen_acc:
        max_et_unseen_acc = unseen_acc
    print('seen_f1:{} unseen_f1:{} max_unseen_f1:{}'.format(
        seen_acc, unseen_acc, max_et_unseen_acc))
    return max_et_unseen_acc
def do_eval(eval_features, eval_examples):
    """Do evaluation on the current model.

    Builds a sequential DataLoader from ``eval_features``, runs the
    (module-global) ``model`` over it and returns ``(result, preds)`` where
    ``result`` holds the task metrics plus the run settings and ``preds``
    are the argmax class predictions.
    """
    # Log some information.
    logger.info("***** Running evaluation *****")
    logger.info(" Num examples = %d", len(eval_examples))
    logger.info(" Batch size = %d", args.eval_batch_size)
    # Get the eval data and create a sequential dataloader.
    eval_data = create_tensor_dataset(eval_features)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)
    # Set the model to eval mode (disable dropout).
    model.eval()
    eval_loss = 0
    nb_eval_steps = 0
    batch_logits = []  # per-batch logits (numpy); concatenated once below
    batch_labels = []  # per-batch gold label ids (numpy)
    # Iterate over the evaluation data.
    for input_ids, input_mask, segment_ids, label_ids in tqdm(
            eval_dataloader, desc="Evaluating"):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)
        # Forward pass with deactivated autograd engine.
        with torch.no_grad():
            logits = model(input_ids,
                           token_type_ids=segment_ids,
                           attention_mask=input_mask)
        # Calculate eval loss.
        tmp_eval_loss = CrossEntropyLoss()(logits.view(-1, num_labels),
                                           label_ids.view(-1))
        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        batch_logits.append(logits.detach().cpu().numpy())
        batch_labels.append(label_ids.detach().cpu().numpy())
    # Mean loss over batches; a single O(n) concatenation replaces the
    # original quadratic per-batch np.append accumulation.
    eval_loss = eval_loss / nb_eval_steps
    loss = tr_loss / global_step if args.do_train else None
    preds = np.concatenate(batch_logits, axis=0)
    out_label_ids = np.concatenate(batch_labels, axis=0)
    preds = np.argmax(preds, axis=1)
    # Compute the metrics for the given task.
    result = compute_metrics(task_name, preds, out_label_ids)
    # Save additional information in the result dict.
    result['eval_loss'] = eval_loss
    result['global_step'] = global_step
    result['loss'] = loss
    # Save all settings for external evaluation.
    result['_task'] = task_name
    result['_input_mode'] = args.input_to_use
    result['_learning_rate'] = args.learning_rate
    result['_bert-model'] = args.bert_model
    result['_batch_size'] = args.train_batch_size
    result['_warmup'] = args.warmup_proportion
    result['_num_epochs'] = args.num_train_epochs
    result['_seq_len'] = args.max_seq_length
    result['_seed'] = args.seed
    result['_gradient_acc'] = args.gradient_accumulation_steps
    return result, preds
def compute_td_loss(current_model, target_model, batch_size, replay_buffer,
                    per, use_cpp_buffer, use_async_rb, optimizer, gamma,
                    memory_mgr, robust, **kwargs):
    """One DQN TD-loss training step, optionally with a robustness regularizer.

    Samples a batch from ``replay_buffer`` (prioritized when ``per``),
    computes the Huber or squared TD error between ``current_model`` and
    ``target_model``, optionally adds a certified-bound or PGD-based
    robustness regularizer, backpropagates, clips gradients and steps the
    optimizer.

    Returns a tuple
        (loss, grad_norm, weights_norm, td_loss,
         batch_cur_q_value, batch_exp_q_value)
    extended with (ori_margin, adv_margin) for the 'pgd' solver and with
    (reg_loss,) whenever ``robust`` is set.

    Fix vs. original: ``current_model.parameters()`` returns a one-shot
    generator; the original iterated it twice, so the second loop (the one
    actually scaling the gradients) saw an exhausted iterator and clipping
    silently never happened.  The parameters are now materialized once.
    """
    t = time.time()
    dtype = kwargs['dtype']
    # ---- sample a batch -------------------------------------------------
    if per:
        buffer_beta = kwargs['buffer_beta']
        if use_async_rb:
            if not replay_buffer.sample_available():
                replay_buffer.async_sample(batch_size, buffer_beta)
            res = replay_buffer.wait_sample()
            # kick off the next sample so it overlaps with training
            replay_buffer.async_sample(batch_size, buffer_beta)
        else:
            res = replay_buffer.sample(batch_size, buffer_beta)
        if use_cpp_buffer:
            state, action, reward, next_state, done, indices, weights = res[
                'obs'], res['act'], res['rew'], res['next_obs'], res[
                    'done'], res['indexes'], res['weights']
        else:
            state, action, reward, next_state, done, weights, indices = res[
                0], res[1], res[2], res[3], res[4], res[5], res[6]
    else:
        if use_async_rb:
            # NOTE(review): the prioritized branch above guards with
            # `if not sample_available()`; this branch uses the opposite
            # condition — looks inconsistent, confirm against the buffer API.
            if replay_buffer.sample_available():
                replay_buffer.async_sample(batch_size)
            res = replay_buffer.wait_sample()
            replay_buffer.async_sample(batch_size)
        else:
            res = replay_buffer.sample(batch_size)
        if use_cpp_buffer:
            state, action, reward, next_state, done = res['obs'], res[
                'act'], res['rew'], res['next_obs'], res['done']
        else:
            state, action, reward, next_state, done = res[0], res[1], res[
                2], res[3], res[4]
    if use_cpp_buffer and not use_async_rb:
        # the cpp buffer returns column vectors; flatten and cast
        action = action.transpose()[0].astype(int)
        reward = reward.transpose()[0].astype(int)
        done = done.transpose()[0].astype(int)
    log_time('sample_time', time.time() - t)

    t = time.time()
    # NOTE(review): `weights` is only assigned in the `per` branches above,
    # so the non-per path would raise NameError here; this function appears
    # to be exercised only with per=True — confirm.
    numpy_weights = weights
    if per:
        state, next_state, action, reward, done, weights = memory_mgr.get_cuda_tensors(
            state, next_state, action, reward, done, weights)
    else:
        state, next_state, action, reward, done = memory_mgr.get_cuda_tensors(
            state, next_state, action, reward, done)

    bound_solver = kwargs.get('bound_solver', 'cov')
    optimizer.zero_grad()

    state = state.to(torch.float)
    next_state = next_state.to(torch.float)
    # Normalize input pixel to 0-1
    if dtype in UINTS:
        state /= 255
        next_state /= 255
        state_max = 1.0
        state_min = 0.0
    else:
        state_max = float('inf')
        state_min = float('-inf')
    beta = kwargs.get('beta', 0)

    if robust and bound_solver != 'pgd':
        # the bound-propagation wrapper needs an explicit method selector
        cur_q_logits = current_model(state, method_opt="forward")
        tgt_next_q_logits = target_model(next_state, method_opt="forward")
    else:
        cur_q_logits = current_model(state)
        tgt_next_q_logits = target_model(next_state)
    if robust:
        eps = kwargs['eps']

    cur_q_value = cur_q_logits.gather(1, action.unsqueeze(1)).squeeze(1)
    tgt_next_q_value = tgt_next_q_logits.max(1)[0]
    expected_q_value = reward + gamma * tgt_next_q_value * (1 - done)
    '''
    # Merge two states into one batch
    state = state.to(torch.float)
    if dtype in UINTS:
        state /= 255
    state_and_next_state = torch.cat((state, next_state), 0)
    logits = current_model(state_and_next_state)
    cur_q_logits = logits[:state.size(0)]
    cur_next_q_logits = logits[state.size(0):]
    tgt_next_q_value = tgt_next_q_logits.gather(1, torch.max(cur_next_q_logits, 1)[1].unsqueeze(1)).squeeze(1)
    '''

    if kwargs['natural_loss_fn'] == 'huber':
        loss_fn = torch.nn.SmoothL1Loss(reduction='none')
        loss = loss_fn(cur_q_value, expected_q_value.detach())
    else:
        loss = (cur_q_value - expected_q_value.detach()).pow(2)
    if per:
        loss = loss * weights
        prios = loss + 1e-5  # new priorities for the sampled transitions
        weights_norm = np.linalg.norm(numpy_weights)

    batch_cur_q_value = torch.mean(cur_q_value)
    batch_exp_q_value = torch.mean(expected_q_value)
    loss = loss.mean()
    td_loss = loss.clone()

    if robust:
        if eps < np.finfo(np.float32).tiny:
            # eps == 0: no perturbation, regularizer is identically zero
            reg_loss = torch.zeros(state.size(0))
            if USE_CUDA:
                reg_loss = reg_loss.cuda()
            if bound_solver == 'pgd':
                labels = torch.argmax(cur_q_logits, dim=1).clone().detach()
                adv_margin = ori_margin = logits_margin(
                    current_model.forward(state), labels)
                optimizer.zero_grad()
        else:
            if bound_solver != 'pgd':
                # Certified-bound regularizer: build the margin
                # specification matrix c (one row per wrong action) and
                # lower-bound the margins via bound propagation.
                sa = kwargs.get('sa', None)
                pred = cur_q_logits
                labels = torch.argmax(pred, dim=1).clone().detach()
                c = torch.eye(current_model.num_actions).type_as(
                    state)[labels].unsqueeze(1) - torch.eye(
                        current_model.num_actions).type_as(state).unsqueeze(0)
                I = (~(labels.data.unsqueeze(1) == torch.arange(
                    current_model.num_actions).type_as(
                        labels.data).unsqueeze(0)))
                c = (c[I].view(state.size(0), current_model.num_actions - 1,
                               current_model.num_actions))
                sa_labels = sa[labels]
                lb_s = torch.zeros(state.size(0), current_model.num_actions)
                if USE_CUDA:
                    labels = labels.cuda()
                    c = c.cuda()
                    sa_labels = sa_labels.cuda()
                    lb_s = lb_s.cuda()
                env_id = kwargs.get('env_id', '')
                if env_id == 'Acrobot-v1':
                    # Acrobot uses a per-dimension epsilon vector
                    eps_v = get_acrobot_eps(eps)
                    if USE_CUDA:
                        eps_v = eps_v.cuda()
                else:
                    eps_v = eps
                state_ub = torch.clamp(state + eps_v, max=state_max)
                state_lb = torch.clamp(state - eps_v, min=state_min)
                lb = get_logits_lower_bound(current_model, state, state_ub,
                                            state_lb, eps_v, c, beta)
                hinge = kwargs.get('hinge', False)
                if hinge:
                    reg_loss, _ = torch.min(lb, dim=1)
                    hinge_c = kwargs.get('hinge_c', 1)
                    reg_loss = torch.clamp(reg_loss, max=hinge_c)
                    reg_loss = -reg_loss
                else:
                    lb = lb_s.scatter(1, sa_labels, lb)
                    reg_loss = CrossEntropyLoss()(-lb, labels)
            else:
                # PGD-based regularizer: attack the current states and
                # penalize the adversarial margin.
                labels = torch.argmax(cur_q_logits, dim=1).clone().detach()
                hinge_c = kwargs.get('hinge_c', 1)
                adv_state = attack(current_model, state,
                                   kwargs['attack_config'], logits_margin)
                optimizer.zero_grad()
                adv_margin = logits_margin(current_model.forward(adv_state),
                                           labels)
                ori_margin = logits_margin(current_model.forward(state),
                                           labels)
                reg_loss = torch.clamp(adv_margin, min=-hinge_c)
        if per:
            reg_loss = reg_loss * weights
        reg_loss = reg_loss.mean()
        kappa = kwargs['kappa']
        loss += kappa * reg_loss

    loss.backward()

    # Gradient clipping.
    grad_norm = 0.0
    max_norm = kwargs['grad_clip']
    if max_norm > 0:
        # FIX: materialize the parameters; .parameters() is a generator and
        # the original second loop iterated an already-exhausted iterator,
        # so gradients were never actually clipped.
        parameters = list(current_model.parameters())
        for p in parameters:
            grad_norm += p.grad.data.norm(2).item()**2
        grad_norm = np.sqrt(grad_norm)
        clip_coef = max_norm / (grad_norm + 1e-6)
        if clip_coef < 1:
            for p in parameters:
                p.grad.data.mul_(clip_coef)
    # update weights
    optimizer.step()

    nn_time = time.time() - t
    log_time('nn_time', time.time() - t)
    t = time.time()
    if per:
        # push the new TD-error-based priorities back into the buffer
        replay_buffer.update_priorities(indices, prios.data.cpu().numpy())
    log_time('reweight_time', time.time() - t)

    res = (loss, grad_norm, weights_norm, td_loss, batch_cur_q_value,
           batch_exp_q_value)
    if robust:
        if bound_solver == 'pgd':
            res += (ori_margin, adv_margin)
        res += (reg_loss, )
    return res
def distill(args, output_model_file, processor, label_list, tokenizer,
            device, n_gpu, tensorboard_logger, eval_data=None):
    """Knowledge-distillation training loop.

    Trains the student model held by ``args.kd_policy`` with cross-entropy
    against the hard labels plus whatever distillation terms the policy adds
    in ``before_backward_pass``.  When ``eval_data`` is given and
    ``args.eval_interval > 0``, the model is periodically evaluated and only
    the best-scoring checkpoint is written to ``output_model_file``;
    otherwise the final model is saved unconditionally.

    Returns:
        (global_step, mean training loss of the last epoch)
    """
    assert args.kd_policy is not None
    model = args.kd_policy.student   # the model being trained
    args.kd_policy.teacher.eval()    # teacher stays frozen during distillation
    num_labels = len(args.labels)
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    # Only track/save "best" checkpoints when periodic evaluation is possible.
    save_best_model = eval_data is not None and args.eval_interval > 0
    train_examples = processor.get_train_examples(args.data_dir)
    num_train_steps = int(
        len(train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps * args.num_train_epochs)
    optimizer, t_total = get_optimizer(args, model, num_train_steps)
    train_data = prepare(args, processor, label_list, tokenizer, 'train')
    logger.info("***** Running distillation *****")
    logger.info(" Num examples = %d", len(train_examples))
    logger.info(" Batch size = %d", args.train_batch_size)
    logger.info(" Num steps = %d", num_train_steps)
    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)
    train_steps = 0
    best_eval_accuracy = 0
    for epoch in trange(int(args.num_train_epochs),
                        desc="Epoch",
                        dynamic_ncols=True):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        args.kd_policy.on_epoch_begin(model, None, None)
        for step, batch in enumerate(
                tqdm(train_dataloader, desc="Iteration", dynamic_ncols=True)):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            model.train()
            # Forward through the policy so the teacher's outputs are
            # captured alongside the student's.
            logits = args.kd_policy.forward(input_ids, segment_ids,
                                            input_mask)
            loss = CrossEntropyLoss()(logits.view(-1, num_labels),
                                      label_ids.view(-1))
            # Let the distillation policy fold its own loss terms into the
            # hard-label loss.
            loss = args.kd_policy.before_backward_pass(
                model, epoch, None, None, loss, None).overall_loss
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                # FP16_Optimizer handles loss scaling internally.
                optimizer.backward(loss)
            else:
                loss.backward()
            train_steps += 1
            tensorboard_logger.add_scalar('distillation_train_loss',
                                          loss.item(), train_steps)
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                # modify learning rate with special warm up BERT uses
                lr_this_step = args.learning_rate * warmup_linear(
                    global_step / t_total, args.warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
            if save_best_model and train_steps % args.eval_interval == 0:
                # Periodic dev evaluation; keep only the best checkpoint.
                # NOTE(review): `eval` here is a project helper function,
                # not the Python builtin.
                eval_loss, eval_accuracy, _ = eval(args,
                                                   model,
                                                   eval_data,
                                                   device,
                                                   verbose=False)
                tensorboard_logger.add_scalar('distillation_dev_loss',
                                              eval_loss, train_steps)
                tensorboard_logger.add_scalar('distillation_dev_accuracy',
                                              eval_accuracy, train_steps)
                if eval_accuracy > best_eval_accuracy:
                    save_model(model, output_model_file)
                    best_eval_accuracy = eval_accuracy
        args.kd_policy.on_epoch_end(model, None, None)
    if save_best_model:
        # Final evaluation: save if the last state beats the best seen so far.
        eval_loss, eval_accuracy, _ = eval(args,
                                           model,
                                           eval_data,
                                           device,
                                           verbose=False)
        if eval_accuracy > best_eval_accuracy:
            save_model(model, output_model_file)
    else:
        save_model(model, output_model_file)
    return global_step, tr_loss / nb_tr_steps
def main():
    """Script entry point.

    Parses command-line arguments, sets up (optionally distributed / fp16)
    BERT sequence classification, then — under --do_train — trains on
    Wikipedia category data and periodically evaluates zero-shot emotion
    typing on the dev and test splits.
    """
    parser = argparse.ArgumentParser()
    # Required parameters (some help strings are Chinese runtime strings and
    # are intentionally left untouched).
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="输入数据dir。应该包含任务的.tsv文件(或其他数据文件)。")
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help=
        "Bert pre-trained model selected in the list: bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese."
    )
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="训练任务的名称")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="将写入模型预测和checkpoints的输出目录。 ")
    parser.add_argument("--cache_dir",
                        default="",
                        type=str,
                        help="您希望将从s3下载的预训练模型存储在何处")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="WordPiece tokenization 后输入序列的最大总长度,大于这个的序列将被截断,小于的padded")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="如果您使用的是uncased模型,请设置此标志。")
    parser.add_argument("--train_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=256,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    # Fraction of total training steps used for linear LR warmup.
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% of training."
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.0 (default value): dynamic loss scaling.Positive power of 2: static loss scaling value.\n"
    )
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    # if args.server_ip and args.server_port:
    #     # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
    #     import ptvsd
    #     print("Waiting for debugger attach")
    #     ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
    #     ptvsd.wait_for_attach()

    # Only RTE is enabled; the other GLUE processors are kept for reference.
    processors = {
        # "cola": ColaProcessor,
        # "mnli": MnliProcessor,
        # "mnli-mm": MnliMismatchedProcessor,
        # "mrpc": MrpcProcessor,
        # "sst-2": Sst2Processor,
        # "sts-b": StsbProcessor,
        # "qqp": QqpProcessor,
        # "qnli": QnliProcessor,
        "rte": RteProcessor
        # "wnli": WnliProcessor,
    }

    output_modes = {
        # "cola": "classification",
        # "mnli": "classification",
        # "mrpc": "classification",
        # "sst-2": "classification",
        # "sts-b": "regression",
        # "qqp": "classification",
        # "qnli": "classification",
        "rte": "classification"
        # "wnli": "classification",
    }

    if args.local_rank == -1 or args.no_cuda:
        # Single-process setup: no local rank given, or CUDA disabled.
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        # Distributed setup: one process per GPU.
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1  # each distributed process drives exactly one GPU
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    # Gradient accumulation trades memory for steps: e.g. with batch size 10
    # and 1000 examples there are 100 optimizer updates; with
    # gradient_accumulation_steps=2 each forward/backward uses batch size
    # 10/2=5, so 200 train steps run while the number of gradient updates
    # stays 100.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        # seed every GPU
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()  # RteProcessor
    output_mode = output_modes[task_name]  # "classification"

    label_list = processor.get_labels()  # ["entailment", "not_entailment"]
    num_labels = len(label_list)

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_TRANSFORMERS_CACHE), 'distributed_{}'.format(
            args.local_rank))
    # model = BertForSequenceClassification.from_pretrained(args.bert_model,
    #                                                       cache_dir=cache_dir,
    #                                                       num_labels=num_labels)
    # tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased', num_labels=num_labels)  # two labels
    if args.fp16:
        model.half()
    model.to(device)
    if n_gpu > 1:
        # multi-GPU data parallelism
        model = torch.nn.DataParallel(model)
    tokenizer = BertTokenizer.from_pretrained(
        'bert-base-uncased', do_lower_case=args.do_lower_case)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    # parameters whose name contains any of these get no weight decay
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # split parameters into decayed / non-decayed groups by name substring
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    if args.do_train:
        num_train_steps = None
        # train_examples = processor.get_train_examples_wenpeng('/home/wyin3/Datasets/glue_data/RTE/train.tsv')
        train_examples, seen_types = processor.get_examples_Wikipedia_train(
            '/home/zut_csi/tomding/zs/BenchmarkingZeroShotData/tokenized_wiki2categories.txt',
            100000)
        # /export/home/Dataset/wikipedia/parsed_output/tokenized_wiki/tokenized_wiki2categories.txt', 100000) #train_pu_half_v1.txt
        # seen_classes=[0,2,4,6,8]
        eval_examples, eval_label_list, eval_hypo_seen_str_indicator, eval_hypo_2_type_index = processor.get_examples_emotion_test(
            '/home/zut_csi/tomding/zs/BenchmarkingZeroShot/emotion/dev.txt',
            seen_types)
        # /export/home/Dataset/Stuttgart_Emotion/unify-emotion-datasets-master/zero-shot-split/dev.txt', seen_types)
        test_examples, test_label_list, test_hypo_seen_str_indicator, test_hypo_2_type_index = processor.get_examples_emotion_test(
            '/home/zut_csi/tomding/zs/BenchmarkingZeroShot/emotion/test.txt',
            seen_types)
        # /export/home/Dataset/Stuttgart_Emotion/unify-emotion-datasets-master/zero-shot-split/test.txt', seen_types)

        # Featurize all three splits in one tuple assignment.
        train_features, eval_features, test_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer,
            output_mode), convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer,
                output_mode), convert_examples_to_features(
                    test_examples, label_list, args.max_seq_length, tokenizer,
                    output_mode)

        all_input_ids, eval_all_input_ids, test_all_input_ids = torch.tensor(
            [f.input_ids for f in train_features],
            dtype=torch.long), torch.tensor(
                [f.input_ids for f in eval_features],
                dtype=torch.long), torch.tensor(
                    [f.input_ids for f in test_features], dtype=torch.long)
        all_input_mask, eval_all_input_mask, test_all_input_mask = torch.tensor(
            [f.input_mask for f in train_features],
            dtype=torch.long), torch.tensor(
                [f.input_mask for f in eval_features],
                dtype=torch.long), torch.tensor(
                    [f.input_mask for f in test_features], dtype=torch.long)
        all_segment_ids, eval_all_segment_ids, test_all_segment_ids = torch.tensor(
            [f.segment_ids for f in train_features],
            dtype=torch.long), torch.tensor(
                [f.segment_ids for f in eval_features],
                dtype=torch.long), torch.tensor(
                    [f.segment_ids for f in test_features], dtype=torch.long)
        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features],
                                         dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features],
                                         dtype=torch.float)
        eval_all_label_ids, test_all_label_ids = torch.tensor(
            [f.label_id for f in eval_features],
            dtype=torch.long), torch.tensor(
                [f.label_id for f in test_features], dtype=torch.long)

        train_data, eval_data, test_data = TensorDataset(
            all_input_ids, all_input_mask, all_segment_ids,
            all_label_ids), TensorDataset(
                eval_all_input_ids, eval_all_input_mask, eval_all_segment_ids,
                eval_all_label_ids), TensorDataset(test_all_input_ids,
                                                   test_all_input_mask,
                                                   test_all_segment_ids,
                                                   test_all_label_ids)
        train_sampler, eval_sampler, test_sampler = RandomSampler(
            train_data), SequentialSampler(eval_data), SequentialSampler(
                test_data)
        eval_dataloader, test_dataloader, train_dataloader = DataLoader(
            eval_data, sampler=eval_sampler,
            batch_size=args.eval_batch_size), DataLoader(
                test_data, sampler=test_sampler,
                batch_size=args.eval_batch_size), DataLoader(
                    train_data,
                    sampler=train_sampler,
                    batch_size=args.train_batch_size)

        # NOTE: train_batch_size was already divided by
        # gradient_accumulation_steps above, so this counts optimizer steps.
        num_train_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            # split the step budget across the total number of processes
            num_train_steps = num_train_steps // torch.distributed.get_world_size(
            )

        max_test_unseen_acc, max_dev_unseen_acc, max_dev_seen_acc, max_overall_acc = 0.0, 0.0, 0.0, 0.0
        logger.info(
            '****************************************************** Running_training ***************************************************'
        )
        logger.info("Num_examples:{} Batch_size:{} Num_steps:{}".format(
            len(train_examples), args.train_batch_size, num_train_steps))
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            train_loss = 0
            for train_step, batch_data in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                model.train()
                batch_data = tuple(b.to(device) for b in batch_data)
                input_ids, input_mask, segment_ids, label_ids = batch_data
                logits = model(input_ids, segment_ids, input_mask,
                               labels=None)[0]
                tmp_train_loss = CrossEntropyLoss()(logits.view(
                    -1, num_labels), label_ids.view(-1))
                if n_gpu > 1:
                    # mean() to average on multi-gpu
                    tmp_train_loss = tmp_train_loss.mean()
                if args.gradient_accumulation_steps > 1:
                    tmp_train_loss = tmp_train_loss / args.gradient_accumulation_steps
                tmp_train_loss.backward()
                train_loss += tmp_train_loss.item()
                optimizer.step()
                optimizer.zero_grad()

                if (train_step + 1
                    ) % 200 == 0:  # start evaluate on dev set after this epoch

                    def et(et_dataloader, max_et_unseen_acc, et_label_list,
                           et_hypo_seen_str_indicator, et_hypo_2_type_index):
                        # Evaluate on one dataloader and return the (possibly
                        # updated) best unseen F1.
                        model.eval()
                        et_loss, et_step, preds = 0, 0, []
                        for input_ids, input_mask, segment_ids, label_ids in et_dataloader:
                            input_ids, input_mask, segment_ids, label_ids = input_ids.to(
                                device), input_mask.to(device), segment_ids.to(
                                    device), label_ids.to(device)
                            with torch.no_grad():
                                logits = model(input_ids,
                                               segment_ids,
                                               input_mask,
                                               labels=None)[0]
                            tmp_et_loss = CrossEntropyLoss()(logits.view(
                                -1, num_labels), label_ids.view(-1))
                            et_loss += tmp_et_loss.mean().item()
                            et_step += 1
                            if len(preds) == 0:
                                preds.append(logits.detach().cpu().numpy())
                                # detach() cuts the tensor out of the autograd
                                # graph so backprop stops here.
                                # cpu() copies the data from GPU to host
                                # memory (counterpart of cuda()).
                            else:
                                preds[0] = np.append(
                                    preds[0],
                                    logits.detach().cpu().numpy(),
                                    axis=0)
                        et_loss = et_loss / et_step
                        preds = preds[0]
                        '''
                        preds: size*2 (entail, not_entail)
                        wenpeng added a softxmax so that each row is a prob vec
                        '''
                        pred_probs = softmax(preds, axis=1)[:, 0]
                        pred_binary_labels_harsh, pred_binary_labels_loose = [], []
                        for i in range(preds.shape[0]):
                            # harsh: call "entail" (0) only with a 0.1 margin;
                            # loose: plain comparison of the two logits
                            pred_binary_labels_harsh.append(
                                0) if preds[i][0] > preds[i][
                                    1] + 0.1 else pred_binary_labels_harsh.append(
                                        1)
                            pred_binary_labels_loose.append(
                                0) if preds[i][0] > preds[i][
                                    1] else pred_binary_labels_loose.append(1)
                        seen_acc, unseen_acc = evaluate_emotion_zeroshot_TwpPhasePred(
                            pred_probs, pred_binary_labels_harsh,
                            pred_binary_labels_loose, et_label_list,
                            et_hypo_seen_str_indicator, et_hypo_2_type_index,
                            seen_types)
                        # result = compute_metrics('F1', preds, all_label_ids.numpy())
                        # NOTE(review): computed but never used
                        loss = train_loss / train_step if args.do_train else None
                        # test_acc = mean_f1#result.get("f1")
                        if unseen_acc > max_et_unseen_acc:
                            max_et_unseen_acc = unseen_acc
                        print(
                            'seen_f1:{} unseen_f1:{} max_unseen_f1:{}'.format(
                                seen_acc, unseen_acc, max_et_unseen_acc))
                        return max_et_unseen_acc

                    # if seen_acc+unseen_acc > max_overall_acc:
                    #     max_overall_acc = seen_acc + unseen_acc
                    # if seen_acc > max_dev_seen_acc:
                    #     max_dev_seen_acc = seen_acc
                    logger.info(
                        '********************* Running evaluation *********************'
                    )
                    logger.info("Num_examples:{} Batch_size:{}".format(
                        len(eval_examples), args.eval_batch_size))
                    max_dev_unseen_acc = et(eval_dataloader,
                                            max_dev_unseen_acc,
                                            eval_label_list,
                                            eval_hypo_seen_str_indicator,
                                            eval_hypo_2_type_index)
                    logger.info(
                        '********************* Running testing *********************'
                    )
                    logger.info("Num_examples:{} Batch_size:{}".format(
                        len(test_examples), args.eval_batch_size))
                    max_test_unseen_acc = et(test_dataloader,
                                             max_test_unseen_acc,
                                             test_label_list,
                                             test_hypo_seen_str_indicator,
                                             test_hypo_2_type_index)